[python]
import pickle
from get_data import getData

def computeFraction( poi_messages, all_messages ):
    """ Fraction of this person's messages that involve a POI;
        guards against missing ("NaN") values in the Enron data. """
    fraction = 0.
    if poi_messages != "NaN" and all_messages != "NaN" and all_messages != 0:
        fraction = float(poi_messages) / float(all_messages)
    return fraction

data_dict = getData()

submit_dict = {}
for name in data_dict:
    data_point = data_dict[name]
    print

    from_poi_to_this_person = data_point["from_poi_to_this_person"]
    to_messages = data_point["to_messages"]
    fraction_from_poi = computeFraction( from_poi_to_this_person, to_messages )
    print fraction_from_poi
    data_point["fraction_from_poi"] = fraction_from_poi

    from_this_person_to_poi = data_point["from_this_person_to_poi"]
    from_messages = data_point["from_messages"]
    fraction_to_poi = computeFraction( from_this_person_to_poi, from_messages )
    print fraction_to_poi
    submit_dict[name] = {"from_poi_to_this_person": fraction_from_poi,
                         "from_this_person_to_poi": fraction_to_poi}
    data_point["fraction_to_poi"] = fraction_to_poi

def submitDict():
    return submit_dict
[/python]
TfIdf Representation
Tf – term frequency
Idf – inverse document frequency
make everything as simple as possible, but no simpler – Albert Einstein
[python]
#!/usr/bin/python
import sys
import reader
import poi_emails
def getToFromStrings(f):
    f.seek(0)
    to_string, from_string, cc_string = reader.getAddresses(f)
    to_emails   = reader.parseAddresses( to_string )
    from_emails = reader.parseAddresses( from_string )
    cc_emails   = reader.parseAddresses( cc_string )

    return to_emails, from_emails, cc_emails

def poiFlagEmail(f):
    to_emails, from_emails, cc_emails = getToFromStrings(f)

    poi_email_list = poi_emails.poiEmails()

    to_poi = False
    from_poi = False
    cc_poi = False

    if to_emails:
        ctr = 0
        while not to_poi and ctr < len(to_emails):
            if to_emails[ctr] in poi_email_list:
                to_poi = True
            ctr += 1
    if cc_emails:
        ctr = 0
        # loop on cc_poi here; the original checked to_poi, so the
        # scan never stopped early once a POI appeared in the cc list
        while not cc_poi and ctr < len(cc_emails):
            if cc_emails[ctr] in poi_email_list:
                cc_poi = True
            ctr += 1

    return to_poi, from_poi, cc_poi
[/python]
[python]
#!/usr/bin/python
import os
import sys
import zipfile
from poi_flag_email import poiFlagEmail, getToFromStrings
data_dict = {}
with zipfile.ZipFile('emails.zip', "r") as z:
    z.extractall()

for email_message in os.listdir("emails"):
    if email_message == ".DS_Store":
        continue
    message = open(os.getcwd() + "/emails/" + email_message, "r")
    to_addresses, from_addresses, cc_addresses = getToFromStrings(message)

    to_poi, from_poi, cc_poi = poiFlagEmail(message)

    for recipient in to_addresses:
        if recipient not in data_dict:
            data_dict[recipient] = {"from_poi_to_this_person": 0}
        if from_poi:
            data_dict[recipient]["from_poi_to_this_person"] += 1

    message.close()

for item in data_dict:
    print item, data_dict[item]

def submitData():
    return data_dict
[/python]
Text Learning
Learning from TEXT
– Nice day
– A very nice day
-> SVM -> {o, x}
problem: the two phrases have different lengths – what should the input dimension for the SVM be?
BAG OF WORDS, just frequency count
nice:1, very:0, day:1, he:0, she:0, love:0
Mr day loves a nice day
nice:1, very:0, day:2, he:0, she:0, love:1
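A bag-of-words sketch with scikit-learn's CountVectorizer, using the two phrases above (the vectorizer lowercases and builds its own vocabulary, so the features differ slightly from the hand-picked list):

[python]
from sklearn.feature_extraction.text import CountVectorizer

docs = ["nice day", "Mr day loves a nice day"]

vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(docs)

# get_feature_names_out() in newer scikit-learn versions
print(vectorizer.get_feature_names())  # ['day', 'loves', 'mr', 'nice']
print(counts.toarray())                # frequency counts; day:2 in the second row
[/python]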
[python]
import nltk
from nltk.corpus import stopwords

nltk.download()              # fetch the corpora (only needed once)
sw = stopwords.words("english")
sw[0]
sw[10]
len(sw)
[/python]
Vocabulary: Not all unique words are different
unresponsive, response, responsivity, responsiveness, respond
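A stemmer collapses these variants onto a shared root. A minimal sketch with nltk's SnowballStemmer, applied to the list above:

[python]
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")
words = ["unresponsive", "response", "responsivity",
         "responsiveness", "respond"]

# most of these variants map down to the shared stem "respons",
# shrinking the vocabulary the classifier has to deal with
for w in words:
    print(stemmer.stem(w))
[/python]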
Feature scaling
– try to determine Chris's t-shirt size: 140 lbs, 6.1 ft
– training set: Cameron (175 lbs, 5.9 ft, large shirt), Sarah (115 lbs, 5.2 ft, small shirt)
measure height + weight
-> who is Chris closer to in height + weight?
Cameron (large shirt) or Sarah (small shirt)?
Feature Scaling
X' = (X − X_min) / (X_max − X_min)
weights: [115, 140, 175]; rescaling X = 140:
(140 − 115) / (175 − 115) = 25 / 60 ≈ 0.417
0 ≤ X' ≤ 1
[python]
from sklearn.preprocessing import MinMaxScaler
import numpy

# note the floats: older sklearn versions complained when MinMaxScaler
# was handed an integer array like [[115], [140], [175]]
weights = numpy.array([[115.], [140.], [175.]])

scaler = MinMaxScaler()
rescaled_weight = scaler.fit_transform(weights)
rescaled_weight    # array([[ 0.        ], [ 0.41666667], [ 1.        ]])
[/python]
Which algorithms would be affected by feature rescaling?
- SVM with an RBF kernel
- k-means clustering
(both trade off distances along different dimensions; decision trees and linear regression are not affected)
Clustering
Unsupervised Learning
K-MEANS
how many clusters?
-> 2
assign, optimize: alternately assign each point to its nearest cluster center, then move each center to minimize the total distance to its assigned points
Visualizing K-Means Clustering
https://www.naftaliharris.com/blog/visualizing-k-means-clustering/
- uniform points (one of the demo's data layouts)
K-MEANS
will the output for any fixed training set always be the same? no – it depends on where the initial cluster centers are placed
local minimum: a bad initialization can make k-means converge to a suboptimal clustering
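A minimal k-means sketch with scikit-learn (the toy blobs are invented; n_init restarts the algorithm from several random initializations and keeps the best run, which is the standard guard against local minima):

[python]
import numpy
from sklearn.cluster import KMeans

# two well-separated blobs, invented for illustration
data = numpy.array([[1., 1.], [1.5, 2.], [1., 1.5],
                    [8., 8.], [8.5, 9.], [9., 8.]])

# n_clusters=2 as in the quiz above; n_init reruns with fresh
# random centers so one unlucky start doesn't decide the result
kmeans = KMeans(n_clusters=2, n_init=10)
kmeans.fit(data)

print(kmeans.labels_)           # cluster assignment of each point
print(kmeans.cluster_centers_)  # the two optimized centroids
[/python]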
Outliers
what causes outliers?
- sensor malfunction: ignore (remove these points)
- data entry errors: ignore
- freak event: pay attention – these can be exactly the points you care about
Outlier Detection
- train
- remove the points with the largest residual errors (e.g. the worst ~10%)
- train again (sketched below)
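A sketch of that train/remove/retrain loop; the 10% cutoff and the helper name removeOutliers are my own choices for illustration:

[python]
import numpy
from sklearn.linear_model import LinearRegression

def removeOutliers(features, targets, keep=0.9):
    """Fit once, then keep only the fraction of points with the
       smallest squared residuals (hypothetical helper)."""
    reg = LinearRegression()
    reg.fit(features, targets)
    errors = (reg.predict(features) - targets) ** 2
    order = numpy.argsort(errors.ravel())      # smallest residuals first
    keep_idx = order[:int(len(order) * keep)]
    return features[keep_idx], targets[keep_idx]

# features, targets = removeOutliers(features, targets)
# reg = LinearRegression().fit(features, targets)   # train again, cleaned
[/python]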
[python]
import matplotlib.pyplot

# data: list of (salary, bonus) pairs loaded beforehand
for point in data:
    salary = point[0]
    bonus = point[1]
    matplotlib.pyplot.scatter( salary, bonus )

matplotlib.pyplot.xlabel("salary")
matplotlib.pyplot.ylabel("bonus")
matplotlib.pyplot.show()
[/python]
r^2 of a regression
r^2: how much of the change in the output (y) is explained by the change in the input (x)
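In scikit-learn, a fitted regression reports r^2 through its score method. A minimal sketch with invented toy arrays:

[python]
import numpy
from sklearn.linear_model import LinearRegression

ages = numpy.array([[20.], [30.], [40.], [50.]])    # toy input
net_worths = numpy.array([100., 160., 205., 260.])  # toy output

reg = LinearRegression()
reg.fit(ages, net_worths)

# r^2 = 1.0 is a perfect fit; values near 0 explain little of y
print(reg.score(ages, net_worths))
[/python]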
0.0 ≤ r^2 ≤ 1.0

property                     | supervised classification | regression
output type                  | discrete (class labels)   | continuous (number)
what are you trying to find? | decision boundary         | best-fit line
evaluation                   | accuracy                  | sum of squared errors / r^2

Regression
multi-variate: age, IQ, education -> net worth
Multi-variate regression
y = 5x1 + 2.5x2 − 200
y = house price
y = x1 − 10x2 + 500
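Fitting a multi-variate regression in scikit-learn looks exactly like the single-feature case; a sketch with fabricated data in the spirit of the age/IQ/education -> net worth example (two features kept for brevity):

[python]
import numpy
from sklearn.linear_model import LinearRegression

# each row: [age, IQ]; numbers are fabricated for illustration
X = numpy.array([[25., 100.], [35., 110.], [45., 105.], [55., 120.]])
y = numpy.array([100., 220., 280., 400.])   # net worth

reg = LinearRegression()
reg.fit(X, y)

print(reg.coef_)       # one coefficient per input feature
print(reg.intercept_)  # the constant term, like the -200 above
[/python]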
[python]
import sys
import pickle
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit

dictionary = pickle.load( open("../final_project/final_project_dataset_modified.pkl", "r") )

features_list = ["bonus", "salary"]
data = featureFormat( dictionary, features_list, remove_any_zeroes=True )
target, features = targetFeatureSplit( data )

from sklearn.cross_validation import train_test_split
feature_train, feature_test, target_train, target_test = train_test_split(
    features, target, test_size=0.5, random_state=42)
train_color = "b"
test_color = "r"   # the notes had "b" for both; test points need their own color

# the step this exercise asks for: fit a regression on the training data
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(feature_train, target_train)

import matplotlib.pyplot as plt
for feature, target in zip(feature_test, target_test):
    plt.scatter( feature, target, color=test_color )
for feature, target in zip(feature_train, target_train):
    plt.scatter( feature, target, color=train_color )

# re-plot one point in each color just to get labeled legend entries
plt.scatter(feature_test[0], target_test[0], color=test_color, label="test")
plt.scatter(feature_test[0], target_test[0], color=train_color, label="train")

# draw the regression line over the test range, if reg was fit
try:
    plt.plot( feature_test, reg.predict(feature_test) )
except NameError:
    pass

plt.xlabel(features_list[1])
plt.ylabel(features_list[0])
plt.legend()
plt.show()
[/python]
Linear Regression Errors
error = actual net worth – predicted net worth
(taken from training data, predicted by regression line)
predicted net worth = 218.75
actual net worth = 200
error(distance) = -18.75
Σ |error| over all data points
Σ error^2 over all data points
the best regression minimizes Σ over all training points (actual − predicted)^2
several algorithms
-Ordinary least squares(OLS)
-> used in sklearn LinearRegression
-Gradient descent
SSE isn't perfect! it climbs as you add more data points, even when the fit is no worse, so SSE can't be compared across data sets of different sizes (r^2 doesn't have this problem)
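A quick numerical check of that point (made-up data; both data sets are noisy samples of the same underlying line, the second just has 10x more points):

[python]
import numpy
from sklearn.linear_model import LinearRegression

def sse_and_r2(n):
    """Fit y ~ x on n noisy points from the same underlying line."""
    rng = numpy.random.RandomState(42)
    x = numpy.linspace(0., 10., n).reshape(-1, 1)
    y = 3. * x.ravel() + rng.normal(0., 1., n)
    reg = LinearRegression().fit(x, y)
    sse = ((reg.predict(x) - y) ** 2).sum()
    return sse, reg.score(x, y)

print(sse_and_r2(50))    # modest SSE
print(sse_and_r2(500))   # SSE roughly 10x larger, r^2 about the same
[/python]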
Regression
Continuous supervised learning
discrete: labels like "fast", "slow"
continuous: a number, e.g. the actual speed
sklearn regression
http://scikit-learn.org/stable/modules/linear_model.html
[python]
>>> from sklearn import linear_model
>>> reg = linear_model.LinearRegression()
>>> reg.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2])
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
>>> reg.coef_
array([ 0.5,  0.5])
[/python]
[python]
#!/usr/bin/python

import numpy
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt

from studentRegression import studentReg
from class_vis import prettyPicture, output_image
from ages_net_worths import ageNetWorthData

ages_train, ages_test, net_worths_train, net_worths_test = ageNetWorthData()

reg = studentReg(ages_train, net_worths_train)

plt.clf()
plt.scatter(ages_train, net_worths_train, color="b", label="train data")
plt.scatter(ages_test, net_worths_test, color="r", label="test data")
plt.plot(ages_test, reg.predict(ages_test), color="black")
plt.legend(loc=2)
plt.xlabel("ages")
plt.ylabel("net worths")
[/python]
[python]
def studentReg( ages_train, net_worths_train ):
    # named studentReg to match the import in the plotting script above
    from sklearn.linear_model import LinearRegression

    reg = LinearRegression()
    reg.fit( ages_train, net_worths_train )

    return reg
[/python]
New algorithms
k nearest neighbors: classic, simple, easy to understand
random forest: an "ensemble method" – a meta-classifier built from (usually) decision trees
AdaBoost (boosted decision trees)
(previous algorithms:Naive Bayes, SVM, decision tree)
Process
1) do some research!
– get a general understanding
2) find sklearn documentation
3) deploy it!
4) use it to make predictions
What is a person of interest?
– indicted
– settled without admitting guilt
– testified in exchange for immunity
MORE DATA > fine-tuned algorithm
numerical – numerical values(numbers)
categorical – limited number of discrete values(category)
time series – temporal value(date, timestamp)
text – words