[python]
import urllib.request as req

# Download the mushroom dataset from the UCI repository
local = "mushroom.csv"
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
req.urlretrieve(url, local)
print("ok")
[/python]
bmi
[python]
import random

# Generate 20,000 random (height, weight) pairs labeled by BMI
def calc_bmi(h, w):
    bmi = w / (h / 100) ** 2
    if bmi < 18.5: return "thin"
    if bmi < 25: return "normal"
    return "fat"

fp = open("bmi.csv", "w", encoding="utf-8")
fp.write("height,weight,label\r\n")

cnt = {"thin": 0, "normal": 0, "fat": 0}
for i in range(20000):
    h = random.randint(120, 200)
    w = random.randint(35, 80)
    label = calc_bmi(h, w)
    cnt[label] += 1
    fp.write("{0},{1},{2}\r\n".format(h, w, label))
fp.close()
print("ok", cnt)
[/python]
[python]
import pandas as pd
from sklearn import svm, metrics
from sklearn.model_selection import train_test_split

# Load the generated data and scale the columns into 0-1 ranges
tbl = pd.read_csv("bmi.csv")
label = tbl["label"]
w = tbl["weight"] / 100
h = tbl["height"] / 200
wh = pd.concat([w, h], axis=1)

# Split into train and test sets, then train an SVM classifier
data_train, data_test, label_train, label_test = \
    train_test_split(wh, label)
clf = svm.SVC()
clf.fit(data_train, label_train)

predict = clf.predict(data_test)
ac_score = metrics.accuracy_score(label_test, predict)
cl_report = metrics.classification_report(label_test, predict)
print("accuracy =", ac_score)
print("report =\n", cl_report)
[/python]
[python]
import matplotlib.pyplot as plt
import pandas as pd

# Index by label so each class can be selected with loc
tbl = pd.read_csv("bmi.csv", index_col=2)

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

def scatter(lbl, color):
    b = tbl.loc[lbl]
    ax.scatter(b["weight"], b["height"], c=color, label=lbl)

scatter("fat", "red")
scatter("normal", "yellow")
scatter("thin", "purple")
ax.legend()
plt.savefig("bmi-test.png")
[/python]
PermissionError: [Errno 13] Permission denied: '/train-images-idx3-ubyte.gz'
[python]
import urllib.request as req
import gzip, os, os.path

savepath = "./mnist"
baseurl = "http://yann.lecun.com/exdb/mnist"
files = [
    "train-images-idx3-ubyte.gz",
    "train-labels-idx1-ubyte.gz",
    "t10k-images-idx3-ubyte.gz",
    "t10k-labels-idx1-ubyte.gz"]

if not os.path.exists(savepath):
    os.mkdir(savepath)

for f in files:
    url = baseurl + "/" + f
    loc = savepath = "/" + f
    print("download:", url)
    if not os.path.exists(loc):
        req.urlretrieve(url, loc)

for f in files:
    gz_file = savepath + "/" + f
    raw_file = savepath + "/" + f.replace(".gz", "")
    print("gzip:", f)
    with gzip.open(gz_file, "rb") as fp:
        body = fp.read()
    with open(raw_file, "wb") as w:
        w.write(body)
print("ok")
[/python]
[vagrant@localhost python]$ python3 app.py
download: http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Traceback (most recent call last):
  File "app.py", line 18, in <module>
    req.urlretrieve(url, loc)
  File "/home/vagrant/.pyenv/versions/3.5.2/lib/python3.5/urllib/request.py", line 198, in urlretrieve
    tfp = open(filename, 'wb')
PermissionError: [Errno 13] Permission denied: '/train-images-idx3-ubyte.gz'
Why?! ... Looking at the code again, the culprit is the line `loc = savepath = "/" + f`: that is a chained assignment, so both loc and savepath get set to "/train-images-idx3-ubyte.gz", and urlretrieve then tries to write into the filesystem root, where a normal user has no permission. It should have been `loc = savepath + "/" + f`.
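With the chained assignment fixed, the download loop looks like this:
[python]
# Corrected: build the local path with +, not a chained assignment
for f in files:
    url = baseurl + "/" + f
    loc = savepath + "/" + f
    print("download:", url)
    if not os.path.exists(loc):
        req.urlretrieve(url, loc)
[/python]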
[python]
from sklearn import svm, metrics
import glob, os.path, re, json

# Count relative a-z letter frequencies; the language code is
# taken from the leading letters of the file name
def check_freq(fname):
    name = os.path.basename(fname)
    lang = re.match(r'^[a-z]{2,}', name).group()
    with open(fname, "r", encoding="utf-8") as f:
        text = f.read()
    text = text.lower()
    cnt = [0 for n in range(0, 26)]
    code_a = ord("a")
    code_z = ord("z")
    for ch in text:
        n = ord(ch)
        if code_a <= n <= code_z:
            cnt[n - code_a] += 1
    total = sum(cnt)
    freq = list(map(lambda n: n / total, cnt))
    return (freq, lang)

def load_files(path):
    freqs = []
    labels = []
    file_list = glob.glob(path)
    for fname in file_list:
        r = check_freq(fname)
        freqs.append(r[0])
        labels.append(r[1])
    return {"freqs": freqs, "labels": labels}

data = load_files("./lang/train/*.txt")
test = load_files("./lang/test/*.txt")
with open("./lang/freq.json", "w", encoding="utf-8") as fp:
    json.dump([data, test], fp)

clf = svm.SVC()
clf.fit(data["freqs"], data["labels"])
predict = clf.predict(test["freqs"])
ac_score = metrics.accuracy_score(test["labels"], predict)
cl_report = metrics.classification_report(test["labels"], predict)
print("accuracy =", ac_score)
print("report =")
print(cl_report)
[/python]
[python]
import matplotlib.pyplot as plt
import pandas as pd
import json

with open("./lang/freq.json", "r", encoding="utf-8") as fp:
    freq = json.load(fp)

# Average the frequency vectors per language
lang_dic = {}
for i, lbl in enumerate(freq[0]["labels"]):
    fq = freq[0]["freqs"][i]
    if not (lbl in lang_dic):
        lang_dic[lbl] = fq
        continue
    for idx, v in enumerate(fq):
        lang_dic[lbl][idx] = (lang_dic[lbl][idx] + v) / 2

# Use the letters a-z as the index
asclist = [chr(n) for n in range(97, 97 + 26)]
df = pd.DataFrame(lang_dic, index=asclist)
plt.style.use('ggplot')
df.plot(kind="bar", subplots=True, ylim=(0, 0.15))
plt.savefig("lang-plot.png")
[/python]
[python]
import json
import joblib  # sklearn.externals.joblib is deprecated; use joblib directly
from sklearn import svm

with open("./lang/freq.json", "r", encoding="utf-8") as fp:
    d = json.load(fp)
data = d[0]

# Train on the full training set and persist the model for the CGI
clf = svm.SVC()
clf.fit(data["freqs"], data["labels"])
joblib.dump(clf, "./cgi-bin/freq.pkl")
print("ok")
[/python]
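For reference, a minimal sketch of how the CGI side might load and use the saved model; the sample frequency vector here is hypothetical:
[python]
import joblib

# Load the classifier trained above
clf = joblib.load("./cgi-bin/freq.pkl")

# Hypothetical input: a 26-element letter-frequency vector
sample_freq = [[1 / 26.0] * 26]
print(clf.predict(sample_freq))
[/python]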
Iris species classification
Download the csv from GitHub:
https://github.com/pandas-dev/pandas/tree/master/pandas/tests/data
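Something like this should fetch the raw file; the raw.githubusercontent.com URL is my guess at the raw path for the linked tree:
[python]
import urllib.request as req

# Assumed raw URL for iris.csv in the linked repository tree
url = "https://raw.githubusercontent.com/pandas-dev/pandas/master/pandas/tests/data/iris.csv"
req.urlretrieve(url, "iris.csv")
print("ok")
[/python]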
[python]
from sklearn import svm, metrics
import random, re

# Read the csv, converting numeric-looking columns to float
csv = []
with open('iris.csv', 'r', encoding='utf-8') as fp:
    for line in fp:
        line = line.strip()
        cols = line.split(',')
        fn = lambda n: float(n) if re.match(r'^[0-9\.]+$', n) else n
        cols = list(map(fn, cols))
        csv.append(cols)

# Drop the header row and shuffle
del csv[0]
random.shuffle(csv)

# Use two thirds for training, the rest for testing
total_len = len(csv)
train_len = int(total_len * 2 / 3)
train_data = []
train_label = []
test_data = []
test_label = []
for i in range(total_len):
    data = csv[i][0:4]
    label = csv[i][4]
    if i < train_len:
        train_data.append(data)
        train_label.append(label)
    else:
        test_data.append(data)
        test_label.append(label)

clf = svm.SVC()
clf.fit(train_data, train_label)
pre = clf.predict(test_data)
ac_score = metrics.accuracy_score(test_label, pre)
print("accuracy =", ac_score)
[/python]
[vagrant@localhost python]$ python3 app.py
accuracy = 0.96
[python]
import pandas as pd
from sklearn import svm, metrics
from sklearn.model_selection import train_test_split

# pandas handles the csv parsing this time
csv = pd.read_csv('iris.csv')
csv_data = csv[["SepalLength", "SepalWidth", "PetalLength", "PetalWidth"]]
csv_label = csv["Name"]

train_data, test_data, train_label, test_label = \
    train_test_split(csv_data, csv_label)
clf = svm.SVC()
clf.fit(train_data, train_label)
pre = clf.predict(test_data)
ac_score = metrics.accuracy_score(test_label, pre)
print("accuracy =", ac_score)
[/python]
[vagrant@localhost python]$ python3 app.py
accuracy = 0.9736842105263158
ImportError: No module named '_tkinter'
[python]
from sklearn import datasets
import matplotlib.pyplot as plt

# Show the first digit image from the bundled digits dataset
digits = datasets.load_digits()
plt.matshow(digits.images[0], cmap="Greys")
plt.show()
[/python]
[vagrant@localhost python]$ python3 app.py
…
import _tkinter # If this fails your Python may not be configured for Tk
ImportError: No module named '_tkinter'
So I need to make tkinter usable in the Python 3.5.2 environment under pyenv.
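As I understand it, _tkinter only gets built when the Tk headers are present while pyenv compiles Python, so something like the following should do it (assuming a yum-based box like this Vagrant one; -f forces a rebuild):
[vagrant@localhost python]$ sudo yum install -y tk-devel
[vagrant@localhost python]$ pyenv install -f 3.5.2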
scikit-learn algorithms
Image recognition falls under this one.
Classification: Identifying to which category an object belongs to.
SVM, nearest neighbors, random forest, …
Spam detection, Image recognition.
Stock price prediction would be this one, I suppose (a tiny sketch follows below).
Regression: Predicting a continuous-valued attribute associated with an object.
SVR, ridge regression, Lasso, …
Drug response, Stock prices.
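A minimal SVR sketch on made-up one-feature data, just to see the shape of the API:
[python]
from sklearn import svm

# Hypothetical samples (one feature each) and continuous targets
X = [[1], [2], [3], [4]]
y = [1.5, 3.1, 4.4, 6.2]

reg = svm.SVR()
reg.fit(X, y)
print(reg.predict([[5]]))
[/python]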
Classifying attributes goes here; for grouping demographics in web analytics, this would be the one (sketch below).
Clustering: Automatic grouping of similar objects into sets.
k-Means, spectral clustering, mean-shift, …
Customer segmentation, Grouping experiment outcomes
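A minimal k-Means sketch on made-up 2-D points:
[python]
from sklearn.cluster import KMeans

# Hypothetical points forming two loose groups
X = [[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]]
km = KMeans(n_clusters=2)
km.fit(X)
print(km.labels_)           # cluster id assigned to each point
print(km.cluster_centers_)  # the two learned centroids
[/python]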
This one I don't really get. It says something about reducing the number of variables, and about visualization... (a tiny sketch below anyway).
Dimensionality reduction: Reducing the number of random variables to consider.
Algorithms: PCA, feature selection, non-negative matrix factorization.
Visualization, Increased efficiency
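A minimal PCA sketch, projecting the bundled 4-feature iris data down to 2 dimensions:
[python]
from sklearn import datasets
from sklearn.decomposition import PCA

iris = datasets.load_iris()
pca = PCA(n_components=2)
reduced = pca.fit_transform(iris.data)
print(reduced.shape)                  # (150, 2)
print(pca.explained_variance_ratio_)  # variance kept per component
[/python]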
This one isn't very familiar either, though I do understand it's about tuning models (sketch below).
Model selection: Comparing, validating and choosing parameters and models.
grid search, cross validation, metrics.
Improved accuracy via parameter tuning
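A minimal grid search sketch over a couple of SVC parameters, again on the bundled iris data:
[python]
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

iris = datasets.load_iris()

# Try every combination of these parameters with cross validation
params = {"C": [1, 10, 100], "gamma": [0.01, 0.1, 1]}
gs = GridSearchCV(svm.SVC(), params, cv=5)
gs.fit(iris.data, iris.target)
print(gs.best_params_, gs.best_score_)
[/python]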
It says you feed text into machine learning; what would that mean? Improving translation accuracy, maybe? Combining it with morphological analysis? It looks like an interesting area, anyway (sketch below).
Preprocessing: Feature extraction and normalization.
preprocessing, feature extraction.
Transforming input data such as text for use with machine learning algorithms.
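A minimal sketch of turning raw text into numeric features; the two sentences are made up:
[python]
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words: each document becomes a vector of word counts
docs = ["machine learning is fun", "learning python is fun too"]
vec = CountVectorizer()
X = vec.fit_transform(docs)
print(vec.get_feature_names())  # newer versions: get_feature_names_out()
print(X.toarray())
[/python]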
So then: Classification, Regression, and Clustering seem like fairly standard kinds of modeling.
The early days are rough.
Unless you just grind through a lot of this, you don't even know how to go about learning it.
sklearn import pandas
[python]
import pandas as pd
from sklearn import svm, metrics

# XOR truth table: two inputs and the expected output
xor_input = [
    [0, 0, 0],
    [0, 1, 1],
    [1, 0, 1],
    [1, 1, 0]
]
xor_df = pd.DataFrame(xor_input)
xor_data = xor_df.loc[:, 0:1]  # input columns (.ix is deprecated)
xor_label = xor_df.loc[:, 2]   # output column

clf = svm.SVC()
clf.fit(xor_data, xor_label)
pre = clf.predict(xor_data)
ac_score = metrics.accuracy_score(xor_label, pre)
print("accuracy =", ac_score)
[/python]
[vagrant@localhost python]$ python3 app.py
/home/vagrant/.pyenv/versions/3.5.2/lib/python3.5/importlib/_bootstrap.py:222: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88
return f(*args, **kwds)
accuracy = 1.0
This is hopeless; it's exactly like when I first started programming.
I can't understand why this works the way it does.
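One part I can at least verify: XOR is not linearly separable, and SVC's default kernel is RBF; with a linear kernel the same data cannot be fully separated. A quick check, as a sketch:
[python]
from sklearn import svm, metrics

data = [[0, 0], [0, 1], [1, 0], [1, 1]]
label = [0, 1, 1, 0]

# Default RBF kernel vs. a linear kernel on XOR
for kernel in ("rbf", "linear"):
    clf = svm.SVC(kernel=kernel)
    clf.fit(data, label)
    pre = clf.predict(data)
    print(kernel, metrics.accuracy_score(label, pre))
[/python]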
Using scikit-learn's fit() method
Let's start straight from the code.
[python]
from sklearn import svm

# XOR truth table again, this time without pandas
xor_data = [
    [0, 0, 0],
    [0, 1, 1],
    [1, 0, 1],
    [1, 1, 0]
]
data = []
label = []
for row in xor_data:
    p = row[0]
    q = row[1]
    r = row[2]
    data.append([p, q])
    label.append(r)

clf = svm.SVC()
clf.fit(data, label)
pre = clf.predict(data)
print("predictions:", pre)

# Compare the predictions with the expected labels by hand
ok = 0; total = 0
for idx, answer in enumerate(label):
    p = pre[idx]
    if p == answer:
        ok += 1
    total += 1
print("accuracy:", ok, "/", total, "=", ok / total)
[/python]
Next, the command line:
[vagrant@localhost python]$ python3 app.py
/home/vagrant/.pyenv/versions/3.5.2/lib/python3.5/importlib/_bootstrap.py:222: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88
return f(*args, **kwds)
predictions: [0 1 1 0]
accuracy: 4 / 4 = 1.0
What is this, did the difficulty just jump all at once?!
scikit-learn -> a machine learning library for Python
Installing scikit-learn
Here it is at last: machine learning in python ^^
We finally made it this far (congrats!). I'm excited.
First, let's install scikit-learn.
[vagrant@localhost python]$ pip3 install -U scikit-learn scipy matplotlib scikit-image
Then let's install pandas as well.
[vagrant@localhost python]$ pip3 install pandas
Collecting pandas
Downloading https://files.pythonhosted.org/packages/5d/d4/6e9c56a561f1d27407bf29318ca43f36ccaa289271b805a30034eb3a8ec4/pandas-0.23.4-cp35-cp35m-manylinux1_x86_64.whl (8.7MB)
100% |████████████████████████████████| 8.7MB 735kB/s
Using tinydb with python
First, install tinydb with pip3:
[vagrant@localhost python]$ pip3 install tinydb
Collecting tinydb
Downloading https://files.pythonhosted.org/packages/97/6c/fb150f2c09d8b6f23b8f080396673794f970fa7ca0d50900fbe5fe07b8b9/tinydb-3.10.0-py2.py3-none-any.whl
Installing collected packages: tinydb
Successfully installed tinydb-3.10.0
Next, the python:
[python]
from tinydb import TinyDB, Query

# Open (or create) the JSON-backed database file
filepath = "test-tynydb.json"
db = TinyDB(filepath)

# Start fresh, then insert a few records
db.purge_table('fruits')
table = db.table('fruits')
table.insert({'name': 'Banana', 'price': 600})
table.insert({'name': 'Orange', 'price': 1200})
table.insert({'name': 'Mango', 'price': 840})
print(table.all())

# Query by name, then by price
Item = Query()
res = table.search(Item.name == 'Orange')
print('Orange is', res[0]['price'])
print("Items costing 800 or more:")
res = table.search(Item.price >= 800)
for it in res:
    print("-", it['name'])
[/python]
Hmm, so it's json. It takes less code than doing insert/fetch against mongodb from php, so it may well be easier to write.
[vagrant@localhost python]$ python3 app.py
[{'name': 'Banana', 'price': 600}, {'name': 'Orange', 'price': 1200}, {'name': 'Mango', 'price': 840}]
Orange is 1200
Items costing 800 or more:
- Orange
- Mango