クロスバリデーション

from sklearn import svm, metrics
import random, re

# Load iris.csv, convert numeric cells to float, drop the header row,
# shuffle the rows and deal them round-robin into K folds.
with open('iris.csv', 'r', encoding='utf-8') as fp:
    lines = fp.read().split("\n")


def f_tonum(n):
    """Return `n` as a float when it consists only of digits and dots, else unchanged."""
    return float(n) if re.match(r'^[0-9\.]+$', n) else n


def f_cols(li):
    """Split one CSV line on commas and convert each cell with `f_tonum`."""
    return list(map(f_tonum, li.strip().split(',')))


# Skip blank lines: a trailing newline in the file would otherwise yield a
# one-element row that breaks the later row[4] label lookup.
csv = [f_cols(li) for li in lines if li.strip()]
del csv[0]  # the first row is the header
random.shuffle(csv)

K = 5
csvk = [[] for _ in range(K)]
for i, row in enumerate(csv):
    csvk[i % K].append(row)

def split_data_label(rows):
    """Separate rows into feature vectors and labels.

    Each row contributes its first four items as the feature vector and its
    fifth item (index 4) as the label.

    Returns:
        A tuple ``(data, label)`` of two lists in the same order as `rows`.
    """
    pairs = [(record[:4], record[4]) for record in rows]
    features = [feat for feat, _ in pairs]
    labels = [lab for _, lab in pairs]
    return features, labels

def calc_score(test, train):
    """Fit an SVC on the `train` rows and return its accuracy on the `test` rows.

    Both arguments are lists of rows in the layout expected by
    `split_data_label`.
    """
    train_features, train_labels = split_data_label(train)
    test_features, test_labels = split_data_label(test)

    model = svm.SVC()
    model.fit(train_features, train_labels)
    predicted = model.predict(test_features)

    return metrics.accuracy_score(test_labels, predicted)

# K-fold cross-validation: each fold is used once as the test set while the
# remaining folds are concatenated into the training set.
score_list = []
for test_idx, testc in enumerate(csvk):
    trainc = []
    # Select training folds by position, not by list equality: comparing
    # list contents would also drop any other fold whose rows happen to be
    # equal to the test fold (for example several empty folds).
    for fold_idx, fold in enumerate(csvk):
        if fold_idx != test_idx:
            trainc += fold
    sc = calc_score(testc, trainc)
    score_list.append(sc)
print("各正解率=", score_list)
print("平均成果率=", sum(score_list) / len(score_list))

各正解率= [0.9666666666666667, 1.0, 1.0, 0.9333333333333333, 1.0]
平均成果率= 0.9800000000000001

import pandas as pd
from sklearn import svm, metrics
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search
# and sklearn.cross_validation modules were deprecated and later removed.
from sklearn.model_selection import GridSearchCV

# Load the training and test CSV files.
# NOTE(review): read_csv() treats the first line as a header by default; if
# these CSV files have no header row, header=None is needed -- confirm.
train_csv = pd.read_csv("./mnist/train.csv")
test_csv = pd.read_csv("./mnist/t10k.csv")

# Column 0 is taken as the label, columns 1..576 (by position) as features.
# .iloc replaces the removed .ix indexer.
# NOTE(review): a 28x28 image has 784 pixels, so this slice may be dropping
# trailing pixel columns -- confirm the intended upper bound.
train_label = train_csv.iloc[:, 0]
train_data = train_csv.iloc[:, 1:577]
test_label = test_csv.iloc[:, 0]
test_data = test_csv.iloc[:, 1:577]
print("学習データ数=", len(train_label))

# Parameter grid: a linear kernel over C, and an RBF kernel over C x gamma.
params = [
    {"C": [1, 10, 100, 1000], "kernel": ["linear"]},
    {"C": [1, 10, 100, 1000], "kernel": ["rbf"], "gamma": [0.001, 0.0001]},
]

# n_jobs=-1 lets the search use all available processors.
clf = GridSearchCV(svm.SVC(), params, n_jobs=-1)
clf.fit(train_data, train_label)
print("学習器=", clf.best_estimator_)

# Evaluate the best estimator on the test set.
pre = clf.predict(test_data)
ac_score = metrics.accuracy_score(test_label, pre)
print("正解率=", ac_score)

RandomForestClassifier

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was deprecated and later removed.
from sklearn.model_selection import train_test_split

mr = pd.read_csv("mushroom.csv", header=None)

# The first cell of every row is the label; every remaining cell is encoded
# as the code point of its symbol (ord() requires one-character strings).
label = []
data = []
attr_list = []
for row_index, row in mr.iterrows():
    # .iloc replaces the removed .ix indexer.
    label.append(row.iloc[0])
    data.append([ord(v) for v in row.iloc[1:]])

# Split into training and test subsets.
data_train, data_test, label_train, label_test = \
    train_test_split(data, label)

clf = RandomForestClassifier()
clf.fit(data_train, label_train)

predict = clf.predict(data_test)

ac_score = metrics.accuracy_score(label_test, predict)
cl_report = metrics.classification_report(label_test, predict)
print("正解率=", ac_score)
print("レポート=\n", cl_report)

[vagrant@localhost python]$ python3 app.py
正解率= 1.0
レポート=
precision recall f1-score support

e 1.00 1.00 1.00 1031
p 1.00 1.00 1.00 1000

avg / total 1.00 1.00 1.00 2031

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was deprecated and later removed.
from sklearn.model_selection import train_test_split

mr = pd.read_csv("mushroom.csv", header=None)

# Number of slots reserved for each attribute's one-hot vector. An attribute
# with more distinct symbols than this would raise IndexError below.
ONE_HOT_WIDTH = 12

label = []
data = []
attr_list = []  # per column: {"dic": symbol -> slot index, "cnt": slots used}
for row_index, row in mr.iterrows():
    label.append(row.iloc[0])
    exdata = []
    for col, v in enumerate(row.iloc[1:]):
        # The first row creates the bookkeeping entry for every column.
        if row_index == 0:
            attr = {"dic": {}, "cnt": 0}
            attr_list.append(attr)
        else:
            attr = attr_list[col]
        d = [0] * ONE_HOT_WIDTH
        if v in attr["dic"]:
            idx = attr["dic"][v]
        else:
            # First time this symbol is seen in this column: assign the
            # next free slot.
            idx = attr["cnt"]
            attr["dic"][v] = idx
            attr["cnt"] += 1
        # Set the slot for every cell (not only newly seen symbols) so that
        # each row has the same number of features.
        d[idx] = 1
        exdata += d
    # One feature vector per row, so that data and label stay aligned.
    data.append(exdata)

data_train, data_test, label_train, label_test = \
    train_test_split(data, label)

clf = RandomForestClassifier()
clf.fit(data_train, label_train)
predict = clf.predict(data_test)
ac_score = metrics.accuracy_score(label_test, predict)
print("正解率=", ac_score)

mushroom

import urllib.request as req
# Download the mushroom data set and store it under a local file name.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
local = "mushroom.csv"
req.urlretrieve(url, filename=local)
print("ok")

bmi

# --- Script 1 of 3: generate bmi.csv with random height/weight samples ---
import random


def calc_bmi(h, w):
    """Classify a body as "thin", "normal" or "fat" from its BMI.

    Args:
        h: height, divided by 100 before squaring (i.e. centimetres).
        w: weight used as the BMI numerator.

    Returns:
        "thin" for BMI < 18.5, "normal" for BMI < 25, otherwise "fat".
    """
    bmi = w / (h / 100) ** 2
    if bmi < 18.5:
        return "thin"
    if bmi < 25:
        return "normal"
    return "fat"


cnt = {"thin": 0, "normal": 0, "fat": 0}
with open("bmi.csv", "w", encoding="utf-8") as fp:
    # No spaces after the commas: pandas would otherwise name the columns
    # " weight" and " label", and the lookups in the next scripts would fail.
    fp.write("height,weight,label\r\n")
    for i in range(20000):
        h = random.randint(120, 200)
        w = random.randint(35, 80)
        label = calc_bmi(h, w)
        cnt[label] += 1
        fp.write("{0},{1},{2}\r\n".format(h, w, label))
print("ok", cnt)

# --- Script 2 of 3: train an SVM on bmi.csv and report its accuracy ---
from sklearn import svm, metrics
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was deprecated and later removed.
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd

tbl = pd.read_csv("bmi.csv")

label = tbl["label"]
# Scale both features by fixed divisors before training.
w = tbl["weight"] / 100
h = tbl["height"] / 200
wh = pd.concat([w, h], axis=1)

data_train, data_test, label_train, label_test = \
    train_test_split(wh, label)

clf = svm.SVC()
clf.fit(data_train, label_train)

predict = clf.predict(data_test)

ac_score = metrics.accuracy_score(label_test, predict)
cl_report = metrics.classification_report(label_test, predict)
print("正解率=", ac_score)
print("レポート=\n", cl_report)

# --- Script 3 of 3: scatter-plot bmi.csv coloured by label ---
import matplotlib.pyplot as plt
import pandas as pd

# Use the label column (position 2) as the index so rows can be picked by label.
tbl = pd.read_csv("bmi.csv", index_col=2)

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)


def scatter(lbl, color):
    """Plot all rows labelled `lbl` on the shared axes in the given colour."""
    b = tbl.loc[lbl]
    ax.scatter(b["weight"], b["height"], c=color, label=lbl)


scatter("fat", "red")
scatter("normal", "yellow")
scatter("thin", "purple")
ax.legend()
plt.savefig("bmi-test.png")

PermissionError: [Errno 13] Permission denied: ‘/train-images-idx3-ubyte.gz’

import urllib.request as req
import gzip, os, os.path

# Download the four MNIST archives into ./mnist and decompress them there.
savepath = "./mnist"
baseurl = "http://yann.lecun.com/exdb/mnist"
files = [
    "train-images-idx3-ubyte.gz",
    "train-labels-idx1-ubyte.gz",
    "t10k-images-idx3-ubyte.gz",
    "t10k-labels-idx1-ubyte.gz"]

# Download each archive unless it is already present.
os.makedirs(savepath, exist_ok=True)
for f in files:
    url = baseurl + "/" + f
    # Join the directory and file name. The previous `savepath = "/" + f`
    # rebound savepath and pointed the download at the filesystem root,
    # which is what raised PermissionError.
    loc = os.path.join(savepath, f)
    print("download:", url)
    if not os.path.exists(loc):
        req.urlretrieve(url, loc)

# Decompress every archive next to its .gz file.
for f in files:
    gz_file = os.path.join(savepath, f)
    raw_file = os.path.join(savepath, f.replace(".gz", ""))
    print("gzip:", f)
    with gzip.open(gz_file, "rb") as fp:
        body = fp.read()
    with open(raw_file, "wb") as w:
        w.write(body)
print("ok")

[vagrant@localhost python]$ python3 app.py
download: http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Traceback (most recent call last):
  File "app.py", line 18, in <module>
    req.urlretrieve(url, loc)
  File "/home/vagrant/.pyenv/versions/3.5.2/lib/python3.5/urllib/request.py", line 198, in urlretrieve
    tfp = open(filename, 'wb')
PermissionError: [Errno 13] Permission denied: '/train-images-idx3-ubyte.gz'

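何故だ!?
→ 原因は保存先パスの組み立てでした。`loc = savepath + "/" + f` と書くべきところを `loc = savepath = "/" + f` と書いていたため、保存先が `./mnist/` 配下ではなくルート直下の `/train-images-idx3-ubyte.gz` になり、書き込み権限がなくて PermissionError になっていました。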

from sklearn import svm, metrics
import glob, os.path, re, json

# --- Script 1 of 3: build letter-frequency features and train an SVM ---
def check_freq(fname):
    """Return the a-z letter frequencies of a text file and its language code.

    The language code is the run of two or more lowercase letters at the
    start of the file's base name.

    Returns:
        A tuple ``(freq, lang)`` where `freq` is a list of 26 relative
        frequencies (all zeros when the file contains no a-z letters).
    """
    name = os.path.basename(fname)
    lang = re.match(r'^[a-z]{2,}', name).group()
    with open(fname, "r", encoding="utf-8") as f:
        text = f.read()
    text = text.lower()
    # Count the occurrences of each letter a..z.
    cnt = [0 for n in range(0, 26)]
    code_a = ord("a")
    code_z = ord("z")
    for ch in text:
        n = ord(ch)
        if code_a <= n <= code_z:
            cnt[n - code_a] += 1
    # Normalise the counts; guard against a file without any a-z letter.
    total = sum(cnt)
    if total == 0:
        freq = [0.0 for n in cnt]
    else:
        freq = list(map(lambda n: n / total, cnt))
    return (freq, lang)


def load_files(path):
    """Run `check_freq` on every file matching the glob `path`.

    Returns:
        A dict with parallel lists under the keys "freqs" and "labels".
    """
    freqs = []
    labels = []
    file_list = glob.glob(path)
    for fname in file_list:
        r = check_freq(fname)
        freqs.append(r[0])
        labels.append(r[1])
    return {"freqs": freqs, "labels": labels}


data = load_files("./lang/train/*.txt")
test = load_files("./lang/test/*.txt")

# Keep the extracted features so the later scripts can reuse them.
with open("./lang/freq.json", "w", encoding="utf-8") as fp:
    json.dump([data, test], fp)

clf = svm.SVC()
clf.fit(data["freqs"], data["labels"])

predict = clf.predict(test["freqs"])

# The score is stored under the same name that is printed below.
ac_score = metrics.accuracy_score(test["labels"], predict)
cl_report = metrics.classification_report(test["labels"], predict)
print("正解率=", ac_score)
print("レポート=")
print(cl_report)

# --- Script 2 of 3: plot the per-language letter frequencies ---
import matplotlib.pyplot as plt
import pandas as pd
import json

with open("./lang/freq.json", "r", encoding="utf-8") as fp:
    freq = json.load(fp)

# Fold the training samples of each language into one frequency list: the
# first sample is stored as is, each later one is averaged with the stored value.
lang_dic = {}
for i, lbl in enumerate(freq[0]["labels"]):
    fq = freq[0]["freqs"][i]
    if not (lbl in lang_dic):
        lang_dic[lbl] = fq
        continue
    for idx, v in enumerate(fq):
        lang_dic[lbl][idx] = (lang_dic[lbl][idx] + v) / 2

# Index the frame by the letters a..z.
asclist = [[chr(n) for n in range(97, 97 + 26)]]
df = pd.DataFrame(lang_dic, index=asclist)

plt.style.use('ggplot')
df.plot(kind="bar", subplots=True, ylim=(0, 0.15))
plt.savefig("lang-plot.png")

# --- Script 3 of 3: train on the saved features and persist the model ---
from sklearn import svm
# NOTE(review): newer scikit-learn releases no longer ship
# sklearn.externals.joblib; there the standalone joblib package is needed.
from sklearn.externals import joblib
import json

with open("./lang/freq.json", "r", encoding="utf-8") as fp:
    d = json.load(fp)
    data = d[0]

clf = svm.SVC()
clf.fit(data["freqs"], data["labels"])

joblib.dump(clf, "./cgi-bin/freq.pkl")
print("ok")

アヤメの品種分類

GitHubからcsvをダウンロードします
https://github.com/pandas-dev/pandas/tree/master/pandas/tests/data

from sklearn import svm, metrics
import random, re

# --- Script 1 of 2: hand-rolled CSV loading and train/test split ---
def fn(n):
    """Return `n` as a float when it consists only of digits and dots, else unchanged."""
    return float(n) if re.match(r'^[0-9\.]+$', n) else n


csv = []
with open('iris.csv', 'r', encoding='utf-8') as fp:
    for line in fp:
        line = line.strip()
        # Skip blank lines so every stored row has the expected columns.
        if not line:
            continue
        cols = line.split(',')
        cols = list(map(fn, cols))
        csv.append(cols)

# Drop the header row.
del csv[0]

random.shuffle(csv)

# The first two thirds of the shuffled rows train the model, the rest test it.
total_len = len(csv)
train_len = int(total_len * 2 / 3)
train_data = []
train_label = []
test_data = []
test_label = []
for i in range(total_len):
    data = csv[i][0:4]
    label = csv[i][4]
    if i < train_len:
        train_data.append(data)
        train_label.append(label)
    else:
        test_data.append(data)
        test_label.append(label)

clf = svm.SVC()
clf.fit(train_data, train_label)
pre = clf.predict(test_data)

ac_score = metrics.accuracy_score(test_label, pre)
print("正解率=", ac_score)

# Console output recorded for script 1:
#   [vagrant@localhost python]$ python3 app.py
#   return f(*args, **kwds)
#   正解率= 0.96

# --- Script 2 of 2: the same experiment with pandas and train_test_split ---
import pandas as pd
from sklearn import svm, metrics
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was deprecated and later removed.
from sklearn.model_selection import train_test_split

csv = pd.read_csv('iris.csv')

csv_data = csv[["SepalLength", "SepalWidth", "PetalLength", "PetalWidth"]]
csv_label = csv["Name"]

train_data, test_data, train_label, test_label = \
    train_test_split(csv_data, csv_label)

clf = svm.SVC()
clf.fit(train_data, train_label)
pre = clf.predict(test_data)

ac_score = metrics.accuracy_score(test_label, pre)
print("正解率=", ac_score)

# Console output recorded for script 2:
#   [vagrant@localhost python]$ python3 app.py
#   正解率= 0.9736842105263158

手書きでイラスト

段々イラストレーターさんと仕事する機会も出てきましたね。何故かわかりませんが。

ある方に、イラストを描いてみたら、と薦められたので、
可愛い魔女

うううう、難しい。。。

ImportError: No module named ‘_tkinter’

import matplotlib.pyplot as plt
from sklearn import datasets

# Load the bundled digits data set and display its first image in greyscale.
digits = datasets.load_digits()
first_image = digits.images[0]

plt.matshow(first_image, cmap="Greys")
plt.show()

[vagrant@localhost python]$ python3 app.py

import _tkinter # If this fails your Python may not be configured for Tk
ImportError: No module named ‘_tkinter’

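pyenv上のpython3.5.2環境でtkinterを利用できるようにしないといけないということですね。あるいは、画面表示が不要であれば、pyplotをimportする前に matplotlib.use("Agg") で非GUIバックエンドを選び、plt.show() の代わりに plt.savefig() で画像ファイルに保存する方法もあります。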

scikit-learnのアルゴリズム

画像認識はこちらになりますね。
Classification : Identifying to which category an object belongs to.
SVM, nearest neighbors, random forest, …
Spam detection, Image recognition.

株価の予測はこちらでしょうか。
Regression:Predicting a continuous-valued attribute associated with an object.
SVR, ridge regression, Lasso, …
Drug response, Stock prices.

属性の分類はこちらです。Web解析でデモグラフィックをグルーピングするにはこちらでしょうね。
Clustering:Automatic grouping of similar objects into sets.
k-Means, spectral clustering, mean-shift, …
Customer segmentation, Grouping experiment outcomes

これは、よくわかりません。考慮する変数の数を減らしていくもので、用途としてビジュアリゼーションや効率化と記載がありますね。
Dimensionality reduction:Reducing the number of random variables to consider.
Algorithms: PCA, feature selection, non-negative matrix factorization.
Visualization, Increased efficiency

これもあまり馴染みがないですね。モデリングの調整ということは理解ができますが。
Model selection:Comparing, validating and choosing parameters and models.
grid search, cross validation, metrics.
Improved accuracy via parameter tuning

テキストを機械学習に組み込むと書いてますね。どういうことでしょうか。翻訳の精度を上げるとかでしょうか。これは、形態素解析と組み合わせるのでしょうか。面白そうな分野ではありますね。
Preprocessing:Feature extraction and normalization.
preprocessing, feature extraction.
Transforming input data such as text for use with machine learning algorithms.

ということで、Classification、 Regression、Clusteringは割と一般的なモデリングだと思います。

初期は辛いですね。
とにかく量をこなさないと、どうやって学習していったらいいかすらわかりません。

scikit-learnを学ぼう

ホームページから見ていきます
http://scikit-learn.org/stable/

-Simple and efficient tools for data mining and data analysis
-Accessible to everybody, and reusable in various contexts
-Built on NumPy, SciPy, and matplotlib
-Open source, commercially usable – BSD license

BSD license ってあまりみませんね。
BSD license:カリフォルニア大学によって策定され、同大学のバークレー校内の研究グループ、Computer Systems Research Groupが開発したソフトウェア群であるBSDなどで採用されている。「無保証」であることの明記と著作権およびライセンス条文自身の表示を再頒布の条件とするライセンス規定である。この条件さえ満たせば、BSDライセンスのソースコードを複製・改変して作成したオブジェクトコードをソースコードを公開せずに頒布できる。

著作権・免責の表示が必要ということですね。