import urllib.request as req
import gzip, os, os.path
savepath = "./mnist"
baseurl = "http://yann.lecun.com/exdb/mnist"
files = [
"train-images-idx3-ubyte.gz",
"train-labels-idx1-ubyte.gz",
"t10k-images-idx3-ubyte.gz",
"t10k-labels-idx1-ubyte.gz"]
if not os.path.exists(savepath): os.mkdir(savepath)
for f in files:
url = baseurl + "/" + f
loc = savepath = "/" + f
print("download:", url)
if not os.path.exists(loc):
req.urlretrieve(url, loc)
for f in files:
gz_file = savepath + "/" + f
raw_file = savepath + "/" + f.replace(".gz", "")
print("gzip:", f)
with gzip.open(gz_file, "rb") as fp:
body = fp.read()
with open(raw_file, "wb") as w:
w.write(body)
print("ok")
[vagrant@localhost python]$ python3 app.py
download: http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Traceback (most recent call last):
File “app.py”, line 18, in
req.urlretrieve(url, loc)
File “/home/vagrant/.pyenv/versions/3.5.2/lib/python3.5/urllib/request.py”, line 198, in urlretrieve
tfp = open(filename, ‘wb’)
PermissionError: [Errno 13] Permission denied: ‘/train-images-idx3-ubyte.gz’
何故だ!?
from sklearn import svm, metrics
import glob, os.path, re, json
def check_freq(fname):
name = os.path.basename(fname)
lang = re.match(r’^[a-z]{2,}’, name).group()
wtih open(fname, “r”, encoding=”utf-8″) as f:
text = f.read()
text = text.lower()
cnt = [0 for n in range(0, 26)]
code_a = ord(“a”)
code_z = ord(“z”)
for ch in text:
n = ord(ch)
if code_a <= n <= code_z:
cnt[n - code_a] += 1
total = sum(cnt)
freq = list(map(lambda n: n / total, cnt))
return (freq, lang)
def load_files(path):
freqs = []
labels = []
file_list = glob.glob(path)
for fname in file_list:
r = check_freq(fname)
freqs.append(r[0])
labels.append(r[1])
return {"freqs":freqs, "labels":labels}
data = load_files("./lang/train/*.txt")
test = load_files("./lang/test/*.txt")
with open("./lang/freq.json", "w", encoding="utf-8") as fp:
json.dump([data, test], fp)
clf = svm.SVC()
clf.fit(data["freqs"], data["labels"])
predict = clf.predict(test["freqs"])
sc_score = metrics.accuracy_score(test["labels"], predict)
cl_report = metrics.classification_report(test["labels"], predict)
print("正解率=", ac_score)
print("レポート=")
print(cl_report)
[/python]
[python]
import matplotlib.pyplot as plt
import pandas as pd
import json
with open("./lang/freq.json", "r", encoding="utf-8") as fp:
freq = json.load(fp)
lang_dic = {}
for i, lbl in enumerate(freq[0]["labels"]):
fq = freq[0]["freqs"][i]
if not (lbl in lang_dic):
lang_dic[lbl] = fq
continue
for idx, v in enumerate(fq):
lang_dic[lbl][idx] = (lang_dic[lbl][idx] + v) / 2
asclist = [[chr(n) for n in range(97,97+26)]]
df = pd.DataFrame(lang_dic, index=asclist)
plt.style.use('ggplot')
df.plot(kind="bar", subplots=True, ylim=(0,0.15))
plt.savefig("lang-plot.png")
[/python]
[python]
from sklearn import svm
from sklearn.externals import joblib
import json
with open("./lang/freq.json", "r", encoding="utf-8") as fp:
d = json.load(fp)
data = d[0]
clf = svm.SVC()
clf.fit(data["freqs"], data["labels"])
joblib.dump(clf, "./cgi-bin/freq.pkl")
print("ok")
[/python]