from sklearn import svm, metrics import random, re lines = open('iris.csv', 'r', encoding='utf-8').read().split("\n") f_tonum = lambda n : float(n) if re.match(r'^[0-9\.]+$', n) else n f_cols = lambda li: list(map(f_tonum, li.strip().split(','))) csv = list(map(f_cols, lines)) del csv[0] random.shuffle(csv) K = 5 csvk = [ [] for i in range(K) ] for i in range(len(csv)): csvk[i % K].append(csv[i]) def split_data_label(rows): data = []; label = [] for row in rows: data.append(row[0:4]) label.append(row[4]) return (data, label) def calc_score(test, train): test_f, test_l = split_data_label(test) train_f, train_l = split_data_label(train) clf = svm.SVC() clf.fit(train_f, train_l) pre = clf.predict(test_f) return metrics.accuracy_score(test_l, pre) score_list = [] for testc in csvk: trainc = [] for i in csvk: if i != testc: trainc += i sc = calc_score(testc, trainc) score_list.append(sc) print("各正解率=", score_list) print("平均成果率=", sum(score_list) / len(score_list))
各正解率= [0.9666666666666667, 1.0, 1.0, 0.9333333333333333, 1.0]
平均成果率= 0.9800000000000001
import pandas as pd from sklearn import cross_validation, svm, metrics from sklearn.grid_search import GridSearchCV train_csv = pd.read_csv("./mnist/train.csv") test_csv = pd.read_csv("./mnist/t10k.csv") train_label = train_csv.ix[:, 0] train_data = train_csv.ix[:, 1:577] test_label = test_csv.ix[:, 0] test_data = test_csv.ix[:, 1:577] print("学習データ数=", len(train_label)) params = [ {"C": [1,10,100,1000], "kernel":["linear"]}, {"C": [1,10,100,1000], "kernel":["rbf"], "gamma":[0.001, 0.0001]} ] clf = GridSearchCV(svm.SVC(), params, n_jobs = -1) clf.fit(train_data, train_label) print("学習器=", clf.best_estimator_) pre = clf.predict(test_data) ac_score = metrics.accuracy_score(pre, test_label) print("正解率=", ac_score)