Bumpiness: smooth – bumpy
Slope: flat – very steep
Classification question: is a new point more like the red X's or the blue circles? That question is the heart of supervised machine learning.
Decision surface: linear
Naive Bayes
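Core idea: pick the class that maximizes P(class|features) ∝ P(class) · Π P(feature_i|class), under the "naive" assumption that the features are independent given the class.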
Zooming ahead to supervised classification with Python!
Goal: draw the decision boundary.
http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html
The example below comes straight from that page; just run it in a Python interpreter.
>>> import numpy as np
>>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
>>> Y = np.array([1, 1, 1, 2, 2, 2])
>>> from sklearn.naive_bayes import GaussianNB
>>> clf = GaussianNB()
>>> clf.fit(X, Y)
GaussianNB(priors=None)
>>> print(clf.predict([[-0.8, -1]]))
[1]
>>> clf_pf = GaussianNB()
>>> clf_pf.partial_fit(X, Y, np.unique(Y))
GaussianNB(priors=None)
>>> print(clf_pf.predict([[-0.8, -1]]))
[1]
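GaussianNB can also report how confident it is: predict_proba returns the posterior probability of each class (continuing the session above; since predict chose class 1, the class-1 column should be close to 1 for this point).

>>> clf.predict_proba([[-0.8, -1]])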
import matplotlib.pyplot as plt

# Scatter the two classes (signal = fast, background = slow) within fixed axis limits;
# x_min/x_max/y_min/y_max and the bumpy_*/grade_* arrays come from the dataset.
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.scatter(bumpy_sig, grade_sig, color="b", label="fast!")
plt.scatter(bumpy_bkg, grade_bkg, color="r", label="slow")
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")
plt.show()
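To actually draw the decision boundary (the goal stated above), a common trick is to have the classifier predict every point on a fine grid and color the regions. A minimal sketch, assuming a fitted classifier clf (as in the next snippet) and the same axis limits:

import numpy as np
import matplotlib.pyplot as plt

# predict the class of every point on a fine grid covering the plot area
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01),
                     np.arange(y_min, y_max, 0.01))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
# shade the predicted regions; the decision boundary is where the color flips
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.3)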
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(features_train, labels_train)   # train on the training features/labels
pred = clf.predict(features_test)       # predict labels for the test features
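To see how well the classifier does on held-out data, a minimal check (assuming labels_test holds the ground-truth labels for features_test):

from sklearn.metrics import accuracy_score
acc = accuracy_score(labels_test, pred)  # fraction of test points classified correctly
print(acc)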
# coding: utf-8
import math
from collections import defaultdict

class NaiveBayes:
    """Multinomial Naive Bayes"""
    def __init__(self):
        self.categories = set()     # set of category labels
        self.vocabularies = set()   # set of all words seen in training
        self.wordcount = {}         # wordcount[cat][word]: count of word in category cat
        self.catcount = {}          # catcount[cat]: number of documents in category cat
        self.denominator = {}       # denominator[cat]: denominator of P(word|cat)
    def train(self, data):
        """Train the Naive Bayes classifier."""
        # Extract the categories from the document set and initialize the dictionaries
        for d in data:
            cat = d[0]
            self.categories.add(cat)
        for cat in self.categories:
            self.wordcount[cat] = defaultdict(int)
            self.catcount[cat] = 0
        # Count categories and words over the document set
        for d in data:
            cat, doc = d[0], d[1:]
            self.catcount[cat] += 1
            for word in doc:
                self.vocabularies.add(word)
                self.wordcount[cat][word] += 1
        # Precompute the denominator of each word's conditional probability (for speed)
        for cat in self.categories:
            self.denominator[cat] = sum(self.wordcount[cat].values()) + len(self.vocabularies)
    def classify(self, doc):
        """Return the category with the largest log posterior log(P(cat|doc))."""
        best = None
        max_score = -float("inf")
        for cat in self.catcount.keys():
            p = self.score(doc, cat)
            if p > max_score:
                max_score = p
                best = cat
        return best
    def wordProb(self, word, cat):
        """Compute the conditional word probability P(word|cat)."""
        # Apply Laplace smoothing; wordcount[cat] is a defaultdict(int),
        # so a word never seen in this category returns the default count 0.
        return float(self.wordcount[cat][word] + 1) / float(self.denominator[cat])
    def score(self, doc, cat):
        """Compute the log posterior log(P(cat|doc)) of a category given a document."""
        total = sum(self.catcount.values())  # total number of documents
        score = math.log(float(self.catcount[cat]) / total)  # log P(cat)
        for word in doc:
            # taking logs turns the product into a sum
            score += math.log(self.wordProb(word, cat))  # log P(word|cat)
        return score
    def __str__(self):
        total = sum(self.catcount.values())  # total number of documents
        return "documents: %d, vocabularies: %d, categories: %d" % (total, len(self.vocabularies), len(self.categories))
if __name__ == "__main__":
    # Training data from Introduction to Information Retrieval, Example 13.2
    data = [["yes", "Chinese", "Beijing", "Chinese"],
            ["yes", "Chinese", "Chinese", "Shanghai"],
            ["yes", "Chinese", "Macao"],
            ["no", "Tokyo", "Japan", "Chinese"]]
    # Train the Naive Bayes classifier
    nb = NaiveBayes()
    nb.train(data)
    print(nb)
    print("P(Chinese|yes) =", nb.wordProb("Chinese", "yes"))
    print("P(Tokyo|yes) =", nb.wordProb("Tokyo", "yes"))
    print("P(Japan|yes) =", nb.wordProb("Japan", "yes"))
    print("P(Chinese|no) =", nb.wordProb("Chinese", "no"))
    print("P(Tokyo|no) =", nb.wordProb("Tokyo", "no"))
    print("P(Japan|no) =", nb.wordProb("Japan", "no"))
    test = ["Chinese", "Chinese", "Chinese", "Tokyo", "Japan"]
    print("log P(yes|test) =", nb.score(test, "yes"))
    print("log P(no|test) =", nb.score(test, "no"))
    print(nb.classify(test))
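As a sanity check, these values should reproduce IIR Example 13.2: with Laplace smoothing, P(Chinese|yes) = (5+1)/(8+6) = 3/7 ≈ 0.429, P(Tokyo|yes) = P(Japan|yes) = (0+1)/(8+6) = 1/14 ≈ 0.071, and P(Chinese|no) = P(Tokyo|no) = P(Japan|no) = (1+1)/(3+6) = 2/9 ≈ 0.222. For the test document, log P(yes|test) = log(3/4) + 3·log(3/7) + 2·log(1/14) ≈ -8.11 beats log P(no|test) = log(1/4) + 5·log(2/9) ≈ -8.91, so classify(test) returns "yes".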