Bumpiness: ranges from smooth to very bumpy
Slope (grade): ranges from flat to very steep
Is a new point more like the red ×'s or the blue circles? Answering that question is the heart of supervised classification in machine learning.
Decision surface: linear
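To make this concrete, here is a minimal sketch of a fake terrain dataset with a linear boundary. The 0.8 threshold and the noise-free rule are illustrative assumptions, not the course's actual data generator:

import numpy as np

rng = np.random.default_rng(42)
n = 100
bumpiness = rng.random(n)  # 0 = smooth ... 1 = very bumpy
grade = rng.random(n)      # 0 = flat  ... 1 = very steep
# A linear decision surface: drive fast only when the terrain is easy overall
# (the threshold 0.8 is made up for illustration)
labels = np.where(bumpiness + grade < 0.8, "fast", "slow")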
Naive Bayes
Zooming ahead to supervised classification with Python!
Goal: draw the decision boundary.
http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html
The example below comes straight from that page; just run it in a Python interpreter:
>>> import numpy as np
>>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
>>> Y = np.array([1, 1, 1, 2, 2, 2])
>>> from sklearn.naive_bayes import GaussianNB
>>> clf = GaussianNB()
>>> clf.fit(X, Y)
GaussianNB(priors=None)
>>> print(clf.predict([[-0.8, -1]]))
[1]
>>> clf_pf = GaussianNB()
>>> clf_pf.partial_fit(X, Y, np.unique(Y))
GaussianNB(priors=None)
>>> print(clf_pf.predict([[-0.8, -1]]))
[1]
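Note the two training styles above: fit trains on the whole dataset at once, while partial_fit trains incrementally (the np.unique(Y) argument tells the classifier all possible class labels up front, since a later batch might not contain every class). GaussianNB also has predict_proba for per-class posterior probabilities; a minimal sketch reusing the same toy data:

import numpy as np
from sklearn.naive_bayes import GaussianNB

X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
Y = np.array([1, 1, 1, 2, 2, 2])

clf = GaussianNB().fit(X, Y)
print(clf.classes_)                     # the class labels, here [1 2]
print(clf.predict_proba([[-0.8, -1]]))  # posterior per class; nearly all mass on class 1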
import matplotlib.pyplot as plt

# x_min, x_max, y_min, y_max and the grade_*/bumpy_* lists come from the
# course's terrain-data prep and plotting code
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.scatter(grade_sig, bumpy_sig, color="b", label="fast")
plt.scatter(grade_bkg, bumpy_bkg, color="r", label="slow")
plt.legend()
plt.xlabel("grade")
plt.ylabel("bumpiness")
plt.show()

from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
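The natural next step is to check how well the classifier does on held-out data. A minimal sketch, assuming labels_test comes from the same train/test split that provides features_test:

from sklearn.metrics import accuracy_score

accuracy = accuracy_score(labels_test, pred)
print(accuracy)

Equivalently, clf.score(features_test, labels_test) computes the same mean accuracy directly.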
For comparison with the scikit-learn version, here is a multinomial Naive Bayes classifier written from scratch (the example data is from Introduction to Information Retrieval, Section 13.2):

import math
from collections import defaultdict

class NaiveBayes:
    """Multinomial Naive Bayes"""

    def __init__(self):
        self.categories = set()
        self.vocabularies = set()
        self.wordcount = {}
        self.catcount = {}
        self.denominator = {}

    def train(self, data):
        """Train the Naive Bayes classifier."""
        # Extract the categories from the document set and initialize the dictionaries
        for d in data:
            cat = d[0]
            self.categories.add(cat)
        for cat in self.categories:
            self.wordcount[cat] = defaultdict(int)
            self.catcount[cat] = 0
        # Count categories and words over the document set
        for d in data:
            cat, doc = d[0], d[1:]
            self.catcount[cat] += 1
            for word in doc:
                self.vocabularies.add(word)
                self.wordcount[cat][word] += 1
        # Precompute the denominator of the conditional word probabilities (for speed)
        for cat in self.categories:
            self.denominator[cat] = sum(self.wordcount[cat].values()) + len(self.vocabularies)

    def classify(self, doc):
        """Return the category with the largest log posterior log(P(cat|doc))."""
        best = None
        best_score = float("-inf")
        for cat in self.catcount.keys():
            p = self.score(doc, cat)
            if p > best_score:
                best_score = p
                best = cat
        return best

    def wordProb(self, word, cat):
        """Conditional word probability P(word|cat)."""
        # Apply Laplace smoothing.
        # wordcount[cat] is a defaultdict(int), so unseen words default to 0.
        return (self.wordcount[cat][word] + 1) / self.denominator[cat]

    def score(self, doc, cat):
        """Log posterior log(P(cat|doc)) of a category given a document."""
        total = sum(self.catcount.values())  # total number of documents
        score = math.log(self.catcount[cat] / total)  # log P(cat)
        for word in doc:
            # In log space the product of probabilities becomes a sum
            score += math.log(self.wordProb(word, cat))  # log P(word|cat)
        return score

    def __str__(self):
        total = sum(self.catcount.values())  # total number of documents
        return "documents: %d, vocabularies: %d, categories: %d" % (
            total, len(self.vocabularies), len(self.categories))

if __name__ == "__main__":
    # Introduction to Information Retrieval, Example 13.2
    data = [["yes", "Chinese", "Beijing", "Chinese"],
            ["yes", "Chinese", "Chinese", "Shanghai"],
            ["yes", "Chinese", "Macao"],
            ["no", "Tokyo", "Japan", "Chinese"]]
    # Train the Naive Bayes classifier
    nb = NaiveBayes()
    nb.train(data)
    print(nb)
    print("P(Chinese|yes) =", nb.wordProb("Chinese", "yes"))
    print("P(Tokyo|yes) =", nb.wordProb("Tokyo", "yes"))
    print("P(Japan|yes) =", nb.wordProb("Japan", "yes"))
    print("P(Chinese|no) =", nb.wordProb("Chinese", "no"))
    print("P(Tokyo|no) =", nb.wordProb("Tokyo", "no"))
    print("P(Japan|no) =", nb.wordProb("Japan", "no"))
    test = ["Chinese", "Chinese", "Chinese", "Tokyo", "Japan"]
    print("log P(yes|test) =", nb.score(test, "yes"))
    print("log P(no|test) =", nb.score(test, "no"))
    print(nb.classify(test))
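As a sanity check, the expected values for the IIR 13.2 example: the yes class has 8 tokens and the vocabulary has 6 words, so with Laplace smoothing P(Chinese|yes) = (5+1)/(8+6) = 3/7 ≈ 0.429 and P(Tokyo|yes) = P(Japan|yes) = 1/14 ≈ 0.071. The no class has 3 tokens, so P(Chinese|no) = P(Tokyo|no) = P(Japan|no) = (1+1)/(3+6) = 2/9 ≈ 0.222. With priors P(yes) = 3/4 and P(no) = 1/4, the test document scores P(yes|test) ∝ 3/4 · (3/7)^3 · (1/14)^2 ≈ 0.0003 versus P(no|test) ∝ 1/4 · (2/9)^3 · (2/9)^2 ≈ 0.0001, so classify(test) should return "yes".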