from sklearn import svm, grid_search, datasets

iris = datasets.load_iris()
parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
svr = svm.SVC()
clf = grid_search.GridSearchCV(svr, parameters)
clf.fit(iris.data, iris.target)
clf.best_params_
Validation
import numpy as np
from sklearn import cross_validation
from sklearn import datasets
from sklearn import svm

iris = datasets.load_iris()
iris.data.shape, iris.target.shape  # ((150, 4), (150,))

X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    iris.data, iris.target, test_size=0.4, random_state=0)

X_train.shape, y_train.shape  # ((90, 4), (90,))
X_test.shape, y_test.shape    # ((60, 4), (60,))

clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
clf.score(X_test, y_test)     # 0.96
Training, Transforms, Predicting
Train/test split -> PCA -> SVM
clf = GaussianNB()
t0 = time()

kf = KFold(len(authors), 2)
for train_indices, test_indices in kf:
    features_train = [word_data[ii] for ii in train_indices]
    features_test  = [word_data[ii] for ii in test_indices]
    authors_train  = [authors[ii] for ii in train_indices]
    authors_test   = [authors[ii] for ii in test_indices]

    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed  = vectorizer.transform(features_test)

    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(features_train_transformed, authors_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed  = selector.transform(features_test_transformed).toarray()

    clf.fit(features_train_transformed, authors_train)

print "training time:", round(time()-t0, 3), "s"

t0 = time()
pred = clf.predict(features_test_transformed)
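The train/test split -> PCA -> SVM chain mentioned above can be written out the same way; this is only a minimal sketch on the iris data (the dataset and parameter values here are illustrative assumptions, not part of the original example):

[python]
# Minimal sketch: train/test split -> PCA -> SVM (illustrative parameters)
from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import PCA
from sklearn.svm import SVC

iris = datasets.load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.4, random_state=0)

pca = PCA(n_components=2)                 # reduce to 2 latent features
X_train_pca = pca.fit_transform(X_train)  # fit PCA on training data only
X_test_pca = pca.transform(X_test)        # reuse the same projection on test data

clf = SVC(kernel='linear', C=1).fit(X_train_pca, y_train)
print "test accuracy:", clf.score(X_test_pca, y_test)
[/python]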
When to use PCA
-> latent features driving the patterns in data
-> dimensional reduction
-> visualize high-dimensional data, reduce noise
-> make other algorithms (regression, classification) work better with fewer inputs
PCA for facial recognition
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

n_components = 150
print "Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0])
t0 = time()
pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
print "done in %0.3fs" % (time() - t0)

eigenfaces = pca.components_.reshape((n_components, h, w))

print "Projecting the input data on the eigenfaces orthonormal basis"
t0 = time()
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print "done in %0.3fs" % (time() - t0)

print "Fitting the classifier to the training set"
http://scikit-learn.org/stable/auto_examples/applications/face_recognition.html
PCA
Principal Component Analysis – PCA
Dimensionality of data: 2
x = 2
y = 3
Δx = 1
Δy = 2
square footage + No.Rooms -> Size
How to determine the principal component
variance – the willingness/flexibility of an algorithm to learn (the machine-learning sense)
variance – technical term in statistics – roughly the "spread" of a data distribution (similar to standard deviation)
the principal component is the direction of maximum variance, which minimizes information loss
def doPCA():
    from sklearn.decomposition import PCA
    pca = PCA(n_components=2)
    pca.fit(data)
    return pca

pca = doPCA()
print pca.explained_variance_ratio_
first_pc = pca.components_[0]
second_pc = pca.components_[1]

transformed_data = pca.transform(data)
for ii, jj in zip(transformed_data, data):
    plt.scatter( first_pc[0]*ii[0], first_pc[1]*ii[0], color="r")
    plt.scatter( second_pc[0]*ii[1], second_pc[1]*ii[1], color="c")
    plt.scatter( jj[0], jj[1], color="b")

plt.xlabel("bonus")
plt.ylabel("long-term incentive")
plt.show()
Features != Information
There are two big univariate feature selection tools in sklearn: SelectPercentile and SelectKBest. The difference is pretty apparent by the names: SelectPercentile selects the X% of features that are most powerful (where X is a parameter) and SelectKBest selects the K features that are most powerful (where K is a parameter).
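A minimal sketch of both selectors on the iris data (the percentile and k values below are arbitrary choices for illustration):

[python]
# Univariate feature selection: SelectPercentile vs. SelectKBest
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectPercentile, SelectKBest, f_classif

iris = load_iris()

# keep the top 50% of features, scored with the ANOVA F-value
selector = SelectPercentile(f_classif, percentile=50)
X_half = selector.fit_transform(iris.data, iris.target)

# keep the 2 highest-scoring features
selector = SelectKBest(f_classif, k=2)
X_top2 = selector.fit_transform(iris.data, iris.target)

print X_half.shape, X_top2.shape   # (150, 2) (150, 2)
[/python]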
high bias
pays little attention to the data; oversimplified; high error on the training set
high variance
pays too much attention to the data (does not generalize well); overfits
Regularization in Regression
method for automatically penalizing extra features
-Lasso Regression: minimize SSE + γ|β|
m1 – m4: coefficients of regression
x1-x4: features
from sklearn.linear_model import Lasso

features, labels = GetMyData()
regression = Lasso()
regression.fit(features, labels)
regression.predict([[2, 4]])
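To see the penalty at work, inspect the fitted coefficients: features that do not earn their keep are pushed to (or near) zero. A small sketch with made-up data, where the second feature is pure noise:

[python]
# Sketch: Lasso shrinks the coefficient of an uninformative feature toward zero
# (the data below is made up for illustration)
import numpy as np
from sklearn.linear_model import Lasso

rng = np.random.RandomState(0)
x1 = rng.rand(100)                   # informative feature
x2 = rng.rand(100)                   # noise feature
y = 3.0 * x1 + 0.1 * rng.randn(100)  # target depends only on x1

features = np.column_stack([x1, x2])
regression = Lasso(alpha=0.01)
regression.fit(features, y)
print regression.coef_               # coefficient for x2 ends up at/near zero
[/python]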
Visualizing
import pickle
from get_data import getData

def computeFraction( poi_messages, all_messages ):
    # exercise stub: should return poi_messages / all_messages,
    # guarding against missing ("NaN") message counts
    fraction = 0.
    return fraction

data_dict = getData()

submit_dict = {}
for name in data_dict:
    data_point = data_dict[name]

    print
    from_poi_to_this_person = data_point["from_poi_to_this_person"]
    to_messages = data_point["to_messages"]
    fraction_from_poi = computeFraction( from_poi_to_this_person, to_messages )
    print fraction_from_poi
    data_point["fraction_from_poi"] = fraction_from_poi

    from_this_person_to_poi = data_point["from_this_person_to_poi"]
    from_messages = data_point["from_messages"]
    fraction_to_poi = computeFraction( from_this_person_to_poi, from_messages )
    print fraction_to_poi
    submit_dict[name] = {"from_poi_to_this_person": fraction_from_poi,
                         "from_this_person_to_poi": fraction_to_poi}
    data_point["fraction_to_poi"] = fraction_to_poi

def submitDict():
    return submit_dict
TfIdf Representation
Tf – term frequency
Idf – inverse document frequency
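A minimal TfidfVectorizer sketch on a couple of toy sentences (the documents are made up for illustration):

[python]
# Minimal TF-IDF sketch; the two documents are made up for illustration
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["nice day", "a very nice day"]
vectorizer = TfidfVectorizer(stop_words="english")
tfidf = vectorizer.fit_transform(docs)

print vectorizer.get_feature_names()  # vocabulary learned from the corpus
print tfidf.toarray()                 # each row: tf-idf weights for one document
[/python]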
make everything as simple as possible, but no simpler – Albert Einstein
[python]
#!/usr/bin/python

import sys
import reader
import poi_emails

def getToFromStrings(f):
    f.seek(0)
    to_string, from_string, cc_string = reader.getAddresses(f)
    to_emails = reader.parseAddresses( to_string )
    from_emails = reader.parseAddresses( from_string )
    cc_emails = reader.parseAddresses( cc_string )
    return to_emails, from_emails, cc_emails

def poiFlagEmail(f):
    to_emails, from_emails, cc_emails = getToFromStrings(f)
    poi_email_list = poi_emails.poiEmails()

    to_poi = False
    from_poi = False   # not determined in this snippet
    cc_poi = False

    if to_emails:
        ctr = 0
        while not to_poi and ctr < len(to_emails):
            if to_emails[ctr] in poi_email_list:
                to_poi = True
            ctr += 1
    if cc_emails:
        ctr = 0
        while not cc_poi and ctr < len(cc_emails):
            if cc_emails[ctr] in poi_email_list:
                cc_poi = True
            ctr += 1

    return to_poi, from_poi, cc_poi
[/python]
[python]
#!/usr/bin/python

import os
import sys
import zipfile

from poi_flag_email import poiFlagEmail, getToFromStrings

data_dict = {}

with zipfile.ZipFile('emails.zip', "r") as z:
    z.extractall()

for email_message in os.listdir("emails"):
    if email_message == ".DS_Store":
        continue
    message = open(os.getcwd() + "/emails/" + email_message, "r")
    to_addresses, from_addresses, cc_addresses = getToFromStrings(message)
    to_poi, from_poi, cc_poi = poiFlagEmail(message)

    for recipient in to_addresses:
        if recipient not in data_dict:
            data_dict[recipient] = {"from_poi_to_this_person": 0}
        if from_poi:
            data_dict[recipient]["from_poi_to_this_person"] += 1
    message.close()

for item in data_dict:
    print item, data_dict[item]

def submitData():
    return data_dict
[/python]
Text Learning
Learning from TEXT
– Nice day
– A very nice day
-> SVM -> {o, x}
input dimension for svm
BAG OF WORDS, just frequency count
Nice day -> nice:1, very:0, day:1, he:0, she:0, love:0
Mr day loves a nice day -> nice:1, very:0, day:2, he:0, she:0, love:1
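The same counts can be produced with sklearn's CountVectorizer; a minimal sketch using the example sentences above:

[python]
# Bag-of-words frequency counts with CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

docs = ["a very nice day", "Mr day loves a nice day"]
vectorizer = CountVectorizer()
bag_of_words = vectorizer.fit_transform(docs)

print vectorizer.vocabulary_   # word -> column index
print bag_of_words.toarray()   # per-document counts ("day" counted twice in the second document)
[/python]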
import nltk
nltk.download()                # fetch the corpora (only needed once)
from nltk.corpus import stopwords

sw = stopwords.words("english")
sw[0]
sw[10]
len(sw)
Vocabulary: Not all unique words are different
unresponsive, response, responsivity, responsiveness, respond
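Stemming collapses such variants onto a common root; a minimal sketch with nltk's SnowballStemmer (exactly which words end up with the same stem depends on the stemmer):

[python]
# Stemming: map word variants onto a common root
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")
for word in ["unresponsive", "response", "responsivity", "responsiveness", "respond"]:
    print word, "->", stemmer.stem(word)
[/python]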
Feature scaling
– try to determine Chris’s t-shirt size: 140 lbs, 6.1ft
– training set: Cameron (175 lbs, 5.9 ft), Sarah (115 lbs, 5.2 ft)
measure height + weight
-> who is Chris closer to in height + weight?
Cameron (large shirt), Sarah (small shirt)
Feature Scaling
X’ = (X – Xmin)/(Xmax – Xmin)
[115, 140, 175]
for X = 140: (140 - 115) / (175 - 115) = 25/60 ≈ 0.417
0<= X' <= 1
[python]
from sklearn.preprocessing import MinMaxScaler
import numpy

weights = numpy.array([[115], [140], [175]])     # integer input can trigger a warning/error in some sklearn versions
scaler = MinMaxScaler()
rescaled_weight = scaler.fit_transform(weights)

weights = numpy.array([[115.], [140.], [175.]])  # use floats instead
rescaled_weight = scaler.fit_transform(weights)
rescaled_weight                                  # ~ [[0.], [0.417], [1.]]
[/python]
Which algorithm would be affected by feature rescaling?
- SVM with RBF
- K-means clustering
Clustering
Unsupervised Learning
K-MEANS
how many clusters?
-> 2
two steps per iteration: assign (each point to its nearest centroid), optimize (move each centroid to minimize the total distance to its assigned points)
Visualizing K-Means Clustering
https://www.naftaliharris.com/blog/visualizing-k-means-clustering/
- uniform points
K-MEANS
will the output for a fixed training set always be the same?
No: k-means depends on its (random) initial cluster centers, so it can get stuck in a local minimum
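Because of this initialization sensitivity, sklearn's KMeans reruns the algorithm several times (n_init) and keeps the best result; a minimal sketch with made-up 2-D points:

[python]
# Sketch: k-means on made-up 2-D points; n_init restarts guard against bad local minima
import numpy as np
from sklearn.cluster import KMeans

points = np.array([[1, 1], [1.5, 2], [1, 0],   # one blob
                   [8, 8], [8, 9], [9, 8]])    # another blob

kmeans = KMeans(n_clusters=2, n_init=10, random_state=0)
labels = kmeans.fit_predict(points)

print labels                    # cluster assignment for each point
print kmeans.cluster_centers_   # the two optimized centroids
[/python]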