mushroom

import urllib.request as req
local = "mushroom.csv"
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
req.urlretrieve(url, local)
print("ok")

bmi

[python]
import random

# generate 20,000 random people and label them by BMI
def calc_bmi(h, w):
    bmi = w / (h / 100) ** 2
    if bmi < 18.5: return "thin"
    if bmi < 25: return "normal"
    return "fat"

fp = open("bmi.csv", "w", encoding="utf-8")
fp.write("height,weight,label\r\n")  # no spaces after the commas, or pandas sees " weight" etc.

cnt = {"thin": 0, "normal": 0, "fat": 0}
for i in range(20000):
    h = random.randint(120, 200)
    w = random.randint(35, 80)
    label = calc_bmi(h, w)
    cnt[label] += 1
    fp.write("{0},{1},{2}\r\n".format(h, w, label))
fp.close()
print("ok", cnt)
[/python]

[python]
from sklearn import svm, metrics
from sklearn.model_selection import train_test_split  # the cross_validation module is deprecated
import pandas as pd

tbl = pd.read_csv("bmi.csv")

# scale the columns so both features fall in roughly [0, 1]
label = tbl["label"]
w = tbl["weight"] / 100
h = tbl["height"] / 200
wh = pd.concat([w, h], axis=1)

data_train, data_test, label_train, label_test = train_test_split(wh, label)

clf = svm.SVC()
clf.fit(data_train, label_train)

predict = clf.predict(data_test)

ac_score = metrics.accuracy_score(label_test, predict)
cl_report = metrics.classification_report(label_test, predict)
print("正解率=", ac_score)
print("レポート=\n", cl_report)
[/python]

[python]
import matplotlib.pyplot as plt
import pandas as pd

# use the label column as the index so rows can be selected per label
tbl = pd.read_csv("bmi.csv", index_col=2)

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

def scatter(lbl, color):
    b = tbl.loc[lbl]
    ax.scatter(b["weight"], b["height"], c=color, label=lbl)

scatter("fat", "red")
scatter("normal", "yellow")
scatter("thin", "purple")
ax.legend()
plt.savefig("bmi-test.png")
[/python]

PermissionError: [Errno 13] Permission denied: '/train-images-idx3-ubyte.gz'

import urllib.request as req
import gzip, os, os.path

savepath = "./mnist"
baseurl = "http://yann.lecun.com/exdb/mnist"
files = [
	"train-images-idx3-ubyte.gz",
	"train-labels-idx1-ubyte.gz",
	"t10k-images-idx3-ubyte.gz",
	"t10k-labels-idx1-ubyte.gz"]

if not os.path.exists(savepath): os.mkdir(savepath)
for f in files:
	url = baseurl + "/" + f
	loc = savepath = "/" + f
	print("download:", url)
	if not os.path.exists(loc):
		req.urlretrieve(url, loc)

for f in files:
	gz_file = savepath + "/" + f
	raw_file = savepath + "/" + f.replace(".gz", "")
	print("gzip:", f)
	with gzip.open(gz_file, "rb") as fp:
		body = fp.read()
		with open(raw_file, "wb") as w:
			w.write(body)
print("ok")

[vagrant@localhost python]$ python3 app.py
download: http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Traceback (most recent call last):
  File "app.py", line 18, in <module>
    req.urlretrieve(url, loc)
  File "/home/vagrant/.pyenv/versions/3.5.2/lib/python3.5/urllib/request.py", line 198, in urlretrieve
    tfp = open(filename, 'wb')
PermissionError: [Errno 13] Permission denied: '/train-images-idx3-ubyte.gz'

Why!?

...On closer inspection, the culprit is the line loc = savepath = "/" + f: the chained assignment sets both loc and savepath to "/" + f, so urlretrieve tries to create /train-images-idx3-ubyte.gz at the filesystem root, where an ordinary user has no write permission. It should be loc = savepath + "/" + f.
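For the record, the corrected download loop (with + in place of the second =):

[python]
for f in files:
    url = baseurl + "/" + f
    loc = savepath + "/" + f  # concatenate; do not reassign savepath
    print("download:", url)
    if not os.path.exists(loc):
        req.urlretrieve(url, loc)
[/python]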

[python]
from sklearn import svm, metrics
import glob, os.path, re, json

def check_freq(fname):
    # the language label is the leading lowercase run of the file name (e.g. "en", "fr")
    name = os.path.basename(fname)
    lang = re.match(r'^[a-z]{2,}', name).group()
    with open(fname, "r", encoding="utf-8") as f:
        text = f.read()
    text = text.lower()

    # count occurrences of each letter a-z
    cnt = [0 for n in range(0, 26)]
    code_a = ord("a")
    code_z = ord("z")
    for ch in text:
        n = ord(ch)
        if code_a <= n <= code_z:
            cnt[n - code_a] += 1

    # normalize the counts into frequencies
    total = sum(cnt)
    freq = list(map(lambda n: n / total, cnt))
    return (freq, lang)

def load_files(path):
    freqs = []
    labels = []
    file_list = glob.glob(path)
    for fname in file_list:
        r = check_freq(fname)
        freqs.append(r[0])
        labels.append(r[1])
    return {"freqs": freqs, "labels": labels}

data = load_files("./lang/train/*.txt")
test = load_files("./lang/test/*.txt")

with open("./lang/freq.json", "w", encoding="utf-8") as fp:
    json.dump([data, test], fp)

clf = svm.SVC()
clf.fit(data["freqs"], data["labels"])
predict = clf.predict(test["freqs"])

ac_score = metrics.accuracy_score(test["labels"], predict)  # was sc_score, which broke the print below
cl_report = metrics.classification_report(test["labels"], predict)
print("正解率=", ac_score)
print("レポート=")
print(cl_report)
[/python]

[python]
import matplotlib.pyplot as plt
import pandas as pd
import json

with open("./lang/freq.json", "r", encoding="utf-8") as fp:
    freq = json.load(fp)

# average the letter frequencies per language
lang_dic = {}
for i, lbl in enumerate(freq[0]["labels"]):
    fq = freq[0]["freqs"][i]
    if not (lbl in lang_dic):
        lang_dic[lbl] = fq
        continue
    for idx, v in enumerate(fq):
        lang_dic[lbl][idx] = (lang_dic[lbl][idx] + v) / 2

asclist = [chr(n) for n in range(97, 97 + 26)]  # a single flat list "a".."z", not a nested one
df = pd.DataFrame(lang_dic, index=asclist)
plt.style.use('ggplot')
df.plot(kind="bar", subplots=True, ylim=(0, 0.15))
plt.savefig("lang-plot.png")
[/python]

[python]
from sklearn import svm
import joblib  # sklearn.externals.joblib is deprecated; use the joblib package directly
import json

with open("./lang/freq.json", "r", encoding="utf-8") as fp:
    d = json.load(fp)
data = d[0]

clf = svm.SVC()
clf.fit(data["freqs"], data["labels"])

joblib.dump(clf, "./cgi-bin/freq.pkl")
print("ok")
[/python]
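As a sketch of the other side (my addition, not from the original post): loading the pickled model back, e.g. in the CGI script, assuming the incoming text has already been run through the same a-z frequency counting as check_freq() above:

[python]
import joblib

clf = joblib.load("./cgi-bin/freq.pkl")  # path as used in the dump above

def detect_lang(freq):
    # freq: a 26-element list of a-z letter frequencies, built like check_freq()
    return clf.predict([freq])[0]
[/python]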

Iris species classification

Download the CSV from GitHub:
https://github.com/pandas-dev/pandas/tree/master/pandas/tests/data

[python]
from sklearn import svm, metrics
import random, re

# read the CSV, converting numeric-looking fields to float
csv = []
with open('iris.csv', 'r', encoding='utf-8') as fp:
    for line in fp:
        line = line.strip()
        cols = line.split(',')
        fn = lambda n: float(n) if re.match(r'^[0-9\.]+$', n) else n
        cols = list(map(fn, cols))
        csv.append(cols)

# drop the header row
del csv[0]

# shuffle, then use the first 2/3 for training and the rest for testing
random.shuffle(csv)

total_len = len(csv)
train_len = int(total_len * 2 / 3)
train_data = []
train_label = []
test_data = []
test_label = []
for i in range(total_len):
    data = csv[i][0:4]
    label = csv[i][4]
    if i < train_len:
        train_data.append(data)
        train_label.append(label)
    else:
        test_data.append(data)
        test_label.append(label)

clf = svm.SVC()
clf.fit(train_data, train_label)
pre = clf.predict(test_data)

ac_score = metrics.accuracy_score(test_label, pre)
print("正解率=", ac_score)
[/python]

[vagrant@localhost python]$ python3 app.py
正解率= 0.96

The same thing with pandas and scikit-learn's own splitter:

[python]
import pandas as pd
from sklearn import svm, metrics
from sklearn.model_selection import train_test_split  # the cross_validation module is deprecated

csv = pd.read_csv('iris.csv')

csv_data = csv[["SepalLength", "SepalWidth", "PetalLength", "PetalWidth"]]
csv_label = csv["Name"]

train_data, test_data, train_label, test_label = train_test_split(csv_data, csv_label)

clf = svm.SVC()
clf.fit(train_data, train_label)
pre = clf.predict(test_data)

ac_score = metrics.accuracy_score(test_label, pre)
print("正解率=", ac_score)
[/python]

[vagrant@localhost python]$ python3 app.py
正解率= 0.9736842105263158

ImportError: No module named '_tkinter'

from sklearn import datasets

digits = datasets.load_digits()

import matplotlib.pyplot as plt
plt.matshow(digits.images[0], cmap="Greys")
plt.show()

[vagrant@localhost python]$ python3 app.py

import _tkinter # If this fails your Python may not be configured for Tk
ImportError: No module named '_tkinter'

So the takeaway is that I need to make tkinter usable in the pyenv Python 3.5.2 environment.
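One workaround until then (my addition): matplotlib can render without any GUI toolkit if you select the Agg backend before importing pyplot, and save the figure to a file instead of calling show():

[python]
import matplotlib
matplotlib.use("Agg")           # non-GUI backend; no _tkinter needed
import matplotlib.pyplot as plt
from sklearn import datasets

digits = datasets.load_digits()
plt.matshow(digits.images[0], cmap="Greys")
plt.savefig("digit0.png")       # save instead of plt.show()
[/python]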

scikit-learn algorithms

Image recognition falls under this one:
Classification: Identifying to which category an object belongs.
SVM, nearest neighbors, random forest, …
Spam detection, Image recognition.

Stock price prediction would be this one, I suppose:
Regression: Predicting a continuous-valued attribute associated with an object.
SVR, ridge regression, Lasso, …
Drug response, Stock prices.

Sorting things into groups by attribute is this one. For grouping demographics in web analytics, this would be the pick:
Clustering: Automatic grouping of similar objects into sets.
k-Means, spectral clustering, mean-shift, …
Customer segmentation, Grouping experiment outcomes

This one I don't really understand yet. The description mentions reducing the number of variables, and visualization (see the sketch after this list):
Dimensionality reduction: Reducing the number of random variables to consider.
Algorithms: PCA, feature selection, non-negative matrix factorization.
Visualization, Increased efficiency

This one isn't very familiar either, though I do understand it's about tuning models:
Model selection: Comparing, validating and choosing parameters and models.
grid search, cross validation, metrics.
Improved accuracy via parameter tuning

It says this feeds text into machine learning. What does that mean in practice? Improving translation accuracy, perhaps? Presumably it gets combined with morphological analysis. It does look like an interesting area:
Preprocessing: Feature extraction and normalization.
preprocessing, feature extraction.
Transforming input data such as text for use with machine learning algorithms.

So, all told, Classification, Regression, and Clustering strike me as the fairly standard kinds of modeling.
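To make the dimensionality-reduction entry a bit more concrete, here is a minimal sketch (my own example, not from the scikit-learn page): PCA squeezes the four iris features down to two components, which is exactly the visualization use mentioned above.

[python]
from sklearn import datasets
from sklearn.decomposition import PCA

iris = datasets.load_iris()

# project the 4 features onto the 2 directions with the most variance
pca = PCA(n_components=2)
reduced = pca.fit_transform(iris.data)

print(reduced.shape)                  # (150, 2)
print(pca.explained_variance_ratio_)  # how much variance each component keeps
[/python]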

The early stage is rough.
Without simply grinding through a lot of material, I can't even tell how I should go about learning this.

sklearn import pandas

import pandas as pd
from sklearn import svm, metrics

xor_input = [
	[0, 0, 0],
	[0, 1, 1],
	[1, 0, 1],
	[1, 1, 0]
]

xor_df = pd.DataFrame(xor_input)
xor_data = xor_df.loc[:, 0:1]  # .ix is deprecated; .loc slices labels inclusively (columns 0-1)
xor_label = xor_df.loc[:, 2]

clf = svm.SVC()
clf.fit(xor_data, xor_label)
pre = clf.predict(xor_data)

ac_score = metrics.accuracy_score(xor_label, pre)
print(" 正解率=", ac_score)

[vagrant@localhost python]$ python3 app.py
/home/vagrant/.pyenv/versions/3.5.2/lib/python3.5/importlib/_bootstrap.py:222: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88
return f(*args, **kwds)
(the same RuntimeWarning repeats several more times)
正解率= 1.0

This is no good; it's exactly the same as when I first started programming.
I can't understand why this works the way it does.

Using scikit-learn's fit() method

Let's start straight from the code.

from sklearn import svm

xor_data = [
		[0, 0, 0],
		[0, 1, 1],
		[1, 0, 1],
		[1, 1, 0]
] 

# split each row into inputs (p, q) and the expected output r
data = []
label = []
for row in xor_data:
	p = row[0]
	q = row[1]
	r = row[2]
	data.append([p, q])
	label.append(r)

# learn the XOR mapping with a support vector classifier
clf = svm.SVC()
clf.fit(data, label)

pre = clf.predict(data)
print("予測結果:", pre)

# count how many predictions match the expected labels
ok = 0; total = 0
for idx, answer in enumerate(label):
	p = pre[idx]
	if p == answer: ok += 1
	total += 1
print("正解率:", ok, "/", total, "=", ok/total)

Next, the command line:
[vagrant@localhost python]$ python3 app.py
/home/vagrant/.pyenv/versions/3.5.2/lib/python3.5/importlib/_bootstrap.py:222: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88
return f(*args, **kwds)
(the same RuntimeWarning repeats several more times)
予測結果: [0 1 1 0]
正解率: 4 / 4 = 1.0
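Incidentally, the hand-rolled counting loop at the end is what scikit-learn's metrics helper does in one call; a minimal equivalent (my addition):

[python]
from sklearn import metrics

# same ok/total computation as the loop above
ac_score = metrics.accuracy_score(label, pre)
print("正解率:", ac_score)
[/python]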

What is this, did the difficulty level just spike all at once?!
scikit-learn -> a machine learning library for Python

Installing scikit-learn

Here it is at last: machine learning in Python ^^
We've finally made it this far (congratulations!). I'm excited.

First, install scikit-learn.
[vagrant@localhost python]$ pip3 install -U scikit-learn scipy matplotlib scikit-image

Then let's install pandas as well.
[vagrant@localhost python]$ pip3 install pandas
Collecting pandas
Downloading https://files.pythonhosted.org/packages/5d/d4/6e9c56a561f1d27407bf29318ca43f36ccaa289271b805a30034eb3a8ec4/pandas-0.23.4-cp35-cp35m-manylinux1_x86_64.whl (8.7MB)
100% |████████████████████████████████| 8.7MB 735kB/s

Using tinydb with Python

First, install tinydb with pip3:
[vagrant@localhost python]$ pip3 install tinydb
Collecting tinydb
Downloading https://files.pythonhosted.org/packages/97/6c/fb150f2c09d8b6f23b8f080396673794f970fa7ca0d50900fbe5fe07b8b9/tinydb-3.10.0-py2.py3-none-any.whl
Installing collected packages: tinydb
Successfully installed tinydb-3.10.0

Next, the Python:

from tinydb import TinyDB, Query

filepath = "test-tynydb.json"
db = TinyDB(filepath)

db.purge_table('fruits')

table = db.table('fruits')

table.insert({'name':'Banana', 'price':600})
table.insert({'name':'Orange', 'price':1200})
table.insert({'name':'Mango', 'price':840})

print(table.all())

Item = Query()
res = table.search(Item.name == 'Orange')
print('Orange is', res[0]['price'])

print("800円以上のもの:")
res = table.search(Item.price >= 800)
for it in res:
	print("-", it['name'])

Hmm, so it's JSON. It takes less code than doing inserts and fetches against MongoDB from PHP, so it may well be easier to write.
[vagrant@localhost python]$ python3 app.py
[{'name': 'Banana', 'price': 600}, {'name': 'Orange', 'price': 1200}, {'name': 'Mango', 'price': 840}]
Orange is 1200
800円以上のもの:
- Orange
- Mango
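Since the MongoDB comparison came up: updates and deletes are one-liners as well. A small sketch continuing from the code above (my addition, against the tinydb 3.x API used here):

[python]
# raise Banana's price, then remove Mango entirely
table.update({'price': 700}, Item.name == 'Banana')
table.remove(Item.name == 'Mango')
print(table.all())
[/python]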