sk-learnを使ってみよう

————— ——-
cycler 0.10.0
joblib 0.13.2
kiwisolver 1.1.0
matplotlib 3.0.3
numpy 1.17.1
pandas 0.25.1
pip 19.2.3
pyparsing 2.4.2
python-dateutil 2.8.0
pytz 2019.2
scikit-learn 0.21.3
scipy 1.3.1
setuptools 20.10.1
six 1.12.0

from sklearn import svm
xor_data = [
	[0,0,0],
	[0,1,0],
	[1,0,1],
	[1,1,0]
]
data = []
label = []
for row in xor_data:
	p = row[0]
	q = row[1]
	r = row[2]
	data.append([p,q])
	label.append(r)
clf = svm.SVC(gamma="auto")
clf.fit(data, label)
pre = clf.predict(data)
print("予想結果:", pre)
ok = 0; total = 0
for idx, answer in enumerate(label):
	p = pre[idx]
	if p == answer: ok += 1
	total += 1
print("正解率:", ok, "/", total, "=", ok/total)

[vagrant@localhost python]$ python myapp.py
予想結果: [0 0 0 0]
正解率: 3 / 4 = 0.75

OK、取り敢えず環境は整ってきた

python astモジュール

The ast module makes it easy to handle Python abstract syntax trees in Python applications. The abstract syntax itself can change with every release of Python. Using this module will help to learn the current grammer programmatically.

To create an abstract syntax tree, pass ast. PyCF_ONLY_AST as a flag for the built-in function compile() or use the helper function parse() provided by this module. The result is a tree of objects of classes that inherit from ast.AST. Abstract syntax trees can be compiled into Python code objects using the built-in function compile().

[vagrant@localhost test]$ python --version
Python 3.5.2
[vagrant@localhost test]$ cat << 'EOF' > helloworld.py
> !#/usr/bin/env python
> # -*- coding: utf-8 -*-
>
> def main():
>   print('hello, world!')
>
> if __name__ == '__main__':
>   main()
> EOF

[vagrant@localhost test]$ python
Python 3.5.2 (default, Jul 28 2018, 11:25:01)
[GCC 4.4.7 20120313 (Red Hat 4.4.7-23)] on linux
Type “help”, “copyright”, “credits” or “license” for more information.
>>> FILENAME = ‘helloworld.py’
>>> with open(FILENAME, ‘r’) as f:
… source = f.read()
File ““, line 2
source = f.read()
^
IndentationError: expected an indented block

あれ、うまくいかんな。。

ec2でpythonにpost

[ec2-user@ip-xxx-xx-xx-xx app]$ python -V
Python 2.7.14

[ec2-user@ip-xxx-xx-xx-xx app]$ python35
Python 3.5.5 (default, Apr 25 2018, 23:51:32)
[GCC 4.8.5 20150623 (Red Hat 4.8.5-11)] on linux
Type “help”, “copyright”, “credits” or “license” for more information.
>>>

python2系だからか?
usr/binを見ます。

うまくいきませんね。
他の方法を試してみましょうか。。

vagrantでajaxでpythonにpostする

index.php

<!DOCTYPE html>
<html lang="ja">
<head>
  <title>Ajax</title>
</head>

<body>
  <h1>Ajax</h1>
  <form id="form">
    <div><label>送信する数字</label><input type="number" id="number" value="0"></div>
    <div>
      <label>送信するテキスト</label>
      <textarea id="text"></textarea>
    </div>
    <button type="submit" class="btn btn-primary">Submit</button>
  </form>
  <div id="result"></div>

  <script src="https://code.jquery.com/jquery-3.2.1.min.js"></script>
  <script type="text/javascript">
    $(document).ready(function(){
      $('#form').submit(function(){
        event.preventDefault();
        var $form = $(this);
        $.ajax({
          url:'http://localhost:8000/cgi-bin/index.py',
          type: 'post',
          dataType: 'text',
          data: {
            number: $('#number').val(),
            text: $('#text').val()
          },
        })
        .done(function(response){
          $('#result').html(response);
        })
        .fail(function(){
          $('#result').html('Failed.');
        });
      });
    });
    </script>
  </body>
</html>

index.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import cgi, cgitb

cgitb.enable()

form = cgi.FieldStorage()
text = form.getFirst("text")
n = form.getFirst("number")
sequence_list = []

print('Content-type: text/html\nAccess-Control-Allow-Origin: *\n')
print("<p>送信された数字: {}</p>".format("None" if n is None else int(n)))
print("<p>送信されたテキスト: {}</p>".format(text))
python -m http.server --cgi

何故だ? 問題はHTML側ではないと思うので、AWSもしくはsakuraでやってみるか。

index.phpをindex.htmlに変更します。

192.168.35.1 – – [26/Aug/2018 10:19:03] code 403, message CGI script is not executable (‘/cgi-bin/index.py’)
192.168.35.1 – – [26/Aug/2018 10:19:03] “POST /cgi-bin/index.py HTTP/1.1” 403 –

なに?
[vagrant@localhost app]$ cd cgi-bin
[vagrant@localhost cgi-bin]$ chmod 755 index.py

192.168.35.1 – – [26/Aug/2018 10:23:35] “GET / HTTP/1.1” 200 –
192.168.35.1 – – [26/Aug/2018 10:23:43] “POST /cgi-bin/index.py HTTP/1.1” 200 –
: そのようなファイルやディレクトリはありません
192.168.35.1 – – [26/Aug/2018 10:23:43] CGI script exit status 0x7f00

う~ん、なんでだろう。
jsのdocument.titleで取得してphpファイルに送ることもできるが、後々のことを考えるとpythonでやりたいですね。

クロスバリデーション

from sklearn import svm, metrics
import random, re

lines = open('iris.csv', 'r', encoding='utf-8').read().split("\n")
f_tonum = lambda n : float(n) if re.match(r'^[0-9\.]+$', n) else n
f_cols = lambda li: list(map(f_tonum, li.strip().split(',')))
csv = list(map(f_cols, lines))
del csv[0]
random.shuffle(csv)

K = 5
csvk = [ [] for i in range(K) ]
for i in range(len(csv)):
	csvk[i % K].append(csv[i])

def split_data_label(rows):
	data = []; label = []
	for row in rows:
		data.append(row[0:4])
		label.append(row[4])
	return (data, label)

def calc_score(test, train):
	test_f, test_l = split_data_label(test)
	train_f, train_l = split_data_label(train)
	clf = svm.SVC()
	clf.fit(train_f, train_l)
	pre = clf.predict(test_f)
	return metrics.accuracy_score(test_l, pre)

score_list = []
for testc in csvk:
	trainc = []
	for i in csvk:
		if i != testc: trainc += i
	sc = calc_score(testc, trainc)
	score_list.append(sc)
print("各正解率=", score_list)
print("平均成果率=", sum(score_list) / len(score_list))

各正解率= [0.9666666666666667, 1.0, 1.0, 0.9333333333333333, 1.0]
平均成果率= 0.9800000000000001

import pandas as pd
from sklearn import cross_validation, svm, metrics
from sklearn.grid_search import GridSearchCV

train_csv = pd.read_csv("./mnist/train.csv")
test_csv = pd.read_csv("./mnist/t10k.csv")

train_label = train_csv.ix[:, 0]
train_data = train_csv.ix[:, 1:577]
test_label = test_csv.ix[:, 0]
test_data = test_csv.ix[:, 1:577]
print("学習データ数=", len(train_label))

params = [
	{"C": [1,10,100,1000], "kernel":["linear"]},
	{"C": [1,10,100,1000], "kernel":["rbf"], "gamma":[0.001, 0.0001]}
]

clf = GridSearchCV(svm.SVC(), params, n_jobs = -1)
clf.fit(train_data, train_label)
print("学習器=", clf.best_estimator_)

pre = clf.predict(test_data)
ac_score = metrics.accuracy_score(pre, test_label)
print("正解率=", ac_score)

RandomForestClassifier

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation, metrics

mr = pd.read_csv("mushroom.csv", header=None)

label = []
data = []
attr_list = []
for row_index, row in mr.iterrows():
	label.append(row.ix[0])
	row_data = []
	for v in row.ix[1:]:
		row_data.append(ord(v))
	data.append(row_data)

data_train, data_test, label_train, label_test = \
	cross_validation.train_test_split(data, label)

clf = RandomForestClassifier()
clf.fit(data_train, label_train)

predict = clf.predict(data_test)

ac_score = metrics.accuracy_score(label_test, predict)
cl_report = metrics.classification_report(label_test, predict)
print("正解率=", ac_score)
print("レポート=\n", cl_report)

[vagrant@localhost python]$ python3 app.py
正解率= 1.0
レポート=
precision recall f1-score support

e 1.00 1.00 1.00 1031
p 1.00 1.00 1.00 1000

avg / total 1.00 1.00 1.00 2031

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation, metrics

mr = pd.read_csv("mushroom.csv", header=None)

label = []
data = []
attr_list = []
for row_index, row in mr.iterrorws():
	label.append(row.ix[0])
	exdata = []
	for col, v in enumerate(row.ix[1:]):
		if row_index == 0:
			attr = {"dic": {}, "cnt":0}
			attr_list.append(attr)
		else:
			attr = attr_list[col]
		d = [0,0,0,0,0,0,0,0,0,0,0,0]
		if v in attr["dic"]:
			idx = attr["dic"][v]
		else:
			idx = attr["cnt"]
			attr["dic"][v] = idx
			attr["cnt"] += 1
			d[idx] = 1
			exdata += d
		data.append(exdata)

data_train, data_test, label_train, label_test = \
	cross_validation.train_test_split(data, label)

clf = RandomForestClassifier()
clf.fit(data_train, label_train)
predict = clf.predict(data_test)
ac_score = metrics.accuracy_score(label_test, predict)
print("正解率=", ac_score)

mushroom

import urllib.request as req
local = "mushroom.csv"
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
req.urlretrieve(url, local)
print("ok")

bmi

import random

def calc_bmi(h, w):
bmi = w / (h / 100) ** 2
if bmi < 18.5: return "thin" if bmi < 25: return "normal" return "fat" fp = open("bmi.csv", "w", encoding="utf-8") fp.write("height, weight, label\r\n") cnt = {"thin":0, "normal":0, "fat":0} for i in range(20000): h = random.randint(120, 200) w = random.randint(35, 80) label = calc_bmi(h, w) cnt[label] += 1 fp.write("{0},{1},{2}\r\n".format(h, w, label)) fp.close() print("ok", cnt) [/python] [python] from sklearn import cross_validation, svm, metrics import matplotlib.pyplot as plt import pandas as pd tbl = pd.read_csv("bmi.csv") label = tbl["label"] w = tbl["weight"] / 100 h = tbl["height"] / 200 wh = pd.concat([w, h], axis=1) data_train, data_test, label_train, label_test = \ cross_validation.train_test_split(wh, label) clf = svm.SVC() clf.fit(data_train, label_train) predict = clf.predict(data_test) ac_score = metrics.accuracy_score(label_test, predict) cl_report = metrics.classification_report(label_test, predict) print("正解率=", ac_score) print("レポート=\n", cl_report) [/python] [python] import matplotlib.pyplot as plt import pandas as pd tbl = pd.read_csv("bmi.csv", index_col=2) fig = plt.figure() ax = fig.add_subplot(1, 1, 1) def scatter(lbl, color): b = tbl.loc[lbl] ax.scatter(b["weight"],b["height"], c=color, label=lbl) scatter("fat", "red") scatter("normal", "yellow") scatter("thin", "purple") ax.legend() plt.savefig("bmi-test.png") [/python]

PermissionError: [Errno 13] Permission denied: ‘/train-images-idx3-ubyte.gz’

import urllib.request as req
import gzip, os, os.path

savepath = "./mnist"
baseurl = "http://yann.lecun.com/exdb/mnist"
files = [
	"train-images-idx3-ubyte.gz",
	"train-labels-idx1-ubyte.gz",
	"t10k-images-idx3-ubyte.gz",
	"t10k-labels-idx1-ubyte.gz"]

if not os.path.exists(savepath): os.mkdir(savepath)
for f in files:
	url = baseurl + "/" + f
	loc = savepath = "/" + f
	print("download:", url)
	if not os.path.exists(loc):
		req.urlretrieve(url, loc)

for f in files:
	gz_file = savepath + "/" + f
	raw_file = savepath + "/" + f.replace(".gz", "")
	print("gzip:", f)
	with gzip.open(gz_file, "rb") as fp:
		body = fp.read()
		with open(raw_file, "wb") as w:
			w.write(body)
print("ok")

[vagrant@localhost python]$ python3 app.py
download: http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Traceback (most recent call last):
File “app.py”, line 18, in
req.urlretrieve(url, loc)
File “/home/vagrant/.pyenv/versions/3.5.2/lib/python3.5/urllib/request.py”, line 198, in urlretrieve
tfp = open(filename, ‘wb’)
PermissionError: [Errno 13] Permission denied: ‘/train-images-idx3-ubyte.gz’

何故だ!?

from sklearn import svm, metrics
import glob, os.path, re, json

def check_freq(fname):
name = os.path.basename(fname)
lang = re.match(r’^[a-z]{2,}’, name).group()
wtih open(fname, “r”, encoding=”utf-8″) as f:
text = f.read()
text = text.lower()
cnt = [0 for n in range(0, 26)]
code_a = ord(“a”)
code_z = ord(“z”)
for ch in text:
n = ord(ch)
if code_a <= n <= code_z: cnt[n - code_a] += 1 total = sum(cnt) freq = list(map(lambda n: n / total, cnt)) return (freq, lang) def load_files(path): freqs = [] labels = [] file_list = glob.glob(path) for fname in file_list: r = check_freq(fname) freqs.append(r[0]) labels.append(r[1]) return {"freqs":freqs, "labels":labels} data = load_files("./lang/train/*.txt") test = load_files("./lang/test/*.txt") with open("./lang/freq.json", "w", encoding="utf-8") as fp: json.dump([data, test], fp) clf = svm.SVC() clf.fit(data["freqs"], data["labels"]) predict = clf.predict(test["freqs"]) sc_score = metrics.accuracy_score(test["labels"], predict) cl_report = metrics.classification_report(test["labels"], predict) print("正解率=", ac_score) print("レポート=") print(cl_report) [/python] [python] import matplotlib.pyplot as plt import pandas as pd import json with open("./lang/freq.json", "r", encoding="utf-8") as fp: freq = json.load(fp) lang_dic = {} for i, lbl in enumerate(freq[0]["labels"]): fq = freq[0]["freqs"][i] if not (lbl in lang_dic): lang_dic[lbl] = fq continue for idx, v in enumerate(fq): lang_dic[lbl][idx] = (lang_dic[lbl][idx] + v) / 2 asclist = [[chr(n) for n in range(97,97+26)]] df = pd.DataFrame(lang_dic, index=asclist) plt.style.use('ggplot') df.plot(kind="bar", subplots=True, ylim=(0,0.15)) plt.savefig("lang-plot.png") [/python] [python] from sklearn import svm from sklearn.externals import joblib import json with open("./lang/freq.json", "r", encoding="utf-8") as fp: d = json.load(fp) data = d[0] clf = svm.SVC() clf.fit(data["freqs"], data["labels"]) joblib.dump(clf, "./cgi-bin/freq.pkl") print("ok") [/python]

アヤメの品種分類

GitHubからcsvをダウンロードします
https://github.com/pandas-dev/pandas/tree/master/pandas/tests/data

from sklearn import svm, metrics
import random, re

csv = []
with open(‘iris.csv’, ‘r’, encoding=’utf-8′) as fp:
for line in fp:
line = line.strip()
cols = line.split(‘,’)
fn = lambda n : float(n) if re.match(r’^[0-9\.]+$’, n) else n
cols = list(map(fn, cols))
csv.append(cols)

del csv[0]

random.shuffle(csv)

total_len = len(csv)
train_len = int(total_len * 2 / 3)
train_data = []
train_label = []
test_data = []
test_label = []
for i in range(total_len):
data = csv[i][0:4]
label = csv[i][4]
if i < train_len: train_data.append(data) train_label.append(label) else: test_data.append(data) test_label.append(label) clf = svm.SVC() clf.fit(train_data, train_label) pre = clf.predict(test_data) ac_score = metrics.accuracy_score(test_label, pre) print("正解率=", ac_score) [/python] [vagrant@localhost python]$ python3 app.py return f(*args, **kwds) 正解率= 0.96 [python] import pandas as pd from sklearn import svm, metrics, cross_validation csv = pd.read_csv('iris.csv') csv_data = csv[["SepalLength","SepalWidth","PetalLength","PetalWidth"]] csv_label = csv["Name"] train_data, test_data, train_label, test_label = \ cross_validation.train_test_split(csv_data, csv_label) clf = svm.SVC() clf.fit(train_data, train_label) pre = clf.predict(test_data) ac_score = metrics.accuracy_score(test_label, pre) print("正解率=", ac_score) [/python] [vagrant@localhost python]$ python3 app.py 正解率= 0.9736842105263158