Be comfortable with
– making a simple mobile application
– basic command line
– calling a web or cloud API


/**
 * Mutable holder for one message: its text, a name and a photo URL.
 *
 * All three properties are plain strings with no validation; any of them may be {@code null}.
 */
public class FriendlyMessage {
	private String text;
	private String name;
	private String photoUrl;

	/**
	 * Creates a message with every field left {@code null}.
	 *
	 * NOTE(review): presumably kept so that frameworks can instantiate the class reflectively
	 * and populate it through the setters — confirm against the callers.
	 */
	public FriendlyMessage() {
	}

	/**
	 * Creates a fully populated message.
	 *
	 * @param text     the message text
	 * @param name     the name stored with the message
	 * @param photoUrl the photo URL stored with the message
	 */
	public FriendlyMessage(String text, String name, String photoUrl) {
		this.text = text;
		this.name = name;
		this.photoUrl = photoUrl;
	}

	/** @return the message text */
	public String getText() {
		return text;
	}

	/** @param text the new message text */
	public void setText(String text) {
		this.text = text;
	}

	/** @return the name stored with the message */
	public String getName() {
		return name;
	}

	/** @param name the new name */
	public void setName(String name) {
		this.name = name;
	}

	/** @return the photo URL stored with the message */
	public String getPhotoUrl() {
		return photoUrl;
	}

	/** @param photoUrl the new photo URL */
	public void setPhotoUrl(String photoUrl) {
		this.photoUrl = photoUrl;
	}
}

import android.app.Activity;
import android.os.Bundle;
import android.text.Editable;
import android.text.InputFilter;
import android.text.TextWatcher;
import android.view.Menu;
import android.view.MenuInflater;
import android.view.MenuItem;
import android.view.View;
import android.widget.Button;
import android.widget.EditText;
import android.widget.ImageButton;
import android.widget.ListView;
import android.widget.ProgressBar;

import java.util.ArrayList;
import java.util.List;

public class MainActivity extends(Bundle savedInstanceState){

	mUsername = ANONYMOUS;

	mProgressBar = (ProgressBar) findViewById(;
	mMessageListView = (ListView) findViewById(;
	mPhotoPickerButton = (ImageButton) findViewById(;
	mMessageEditText = (EditText) findViewById(;
	mSendButton = (Button) findViewById(;

	List<FriendlyMessage> friendlyMessages = new ArrayList<>();
	mMessageAdapter = new MessageAdapter(this, R.layout.item_message, friendlyMessages);


	mPhotoPickerButton.setOnClickListener(new View.OnClickListener(){
		public void onClick(View view){


	mMessageEditText.addTextChangedListener(new TextWatcher(){
		public void beforeTextChanged(CharSequence charSequence, int i, int i1, int i2){			

		public void onTestChanged(CharSequence charSequence, int i, int i1, int i2){
			if (charSequence.toString().trim().length() > 0){
			} else {

		public void afterTextChanged(Editable editable){
	mMessageEditText.setFilter(new InputFilter[]{new InputFilter.LengthFilter(DEFALT_MSG_LENGTH_LIMIT)});

	mSendButton.setOnClickListener(new View.OnClickListener(){
		public void onClick(View view){

	public boolean onCreateOptionsMenu(Menu menu){
		MenuInflater inflater = getMenuInflater();
		inflater.inflate(, menu);
		return true;

	public boolean onOptionsItemSelected(MenuItem item){
		return super.onOptionsItemSelected(item);

Evaluation Metrics

accuracy = no of items in a class labeled correctly / all items in that class

positive – negative
precision = true positives / (true positives + false positives)
recall = true positives / (true positives + false negatives)

predictions = [0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1]
true labels = [0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0]

data set, features, algorithms, evaluation


# Parameter grid for the search. The keys must match SVC's constructor argument
# names exactly ('kernel' and 'C'); misspelled or lower-cased keys are rejected.
parameters = {'kernel':('linear','rbf'),'C':[1,10]}
svr = svm.SVC()
# No trailing comma: it would make clf a 1-tuple instead of the search object.
# NOTE(review): the search still has to be fitted (clf.fit(X, y)) before it is used.
clf = grid_search.GridSearchCV(svr, parameters)

# Grid of SVC settings to try: two kernels crossed with two values of C.
parameters = {'kernel':('linear','rbf'),'C':[1,10]}

svr = svm.SVC()
# No trailing comma: it would make clf a 1-tuple instead of the search object.
# NOTE(review): the search still has to be fitted (clf.fit(X, y)) before it is used.
clf = grid_search.GridSearchCV(svr,parameters)


import numpy as np
from sklearn import cross_validation
from sklearn import datasets
from sklearn import svm

# Interactive-session transcript: the expected output of each expression is
# kept as a comment on the following line.
iris = datasets.load_iris()
iris.data.shape, iris.target.shape
# ((150, 4), (150,))

# Hold out 40% of the samples for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
	iris.data, iris.target, test_size=0.4, random_state=0)

X_train.shape, y_train.shape
# ((90, 4), (90,))
X_test.shape, y_test.shape
# ((60, 4), (60,))

# Fit on the training part only, then score on the held-out part.
clf = svm.SVC(kernel='linear',C=1).fit(X_train, y_train)
clf.score(X_test, y_test)
# 0.96

Training, Transforms, Predicting
Train/test split -> pca -> svm

clf = GaussianNB()
t0 = time()
kf = KFold(len(authors), 2)
for train_indices, test_indicies in kf:
	features_train = [word_data[ii] for ii in train_indices]
	features_test = [word_data[ii] for ii in test_indices]
	authors_train = [authors[ii] for ii in train_indices]
	authors_test = [authors[ii] for ii in test_indices]

	vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
	features_train_transformed = vectorizer.fit_transform(features_train)
	features_test_transformed = vectorizer.transform(features_test)
	selector = SelectPercentile(f_classif, percentile=10), authors_train)
	features_train_transformed = selector.transform(features_train_transformed).toarray()
	features_test_transformed = selector.transform(features_test_transformed).toarray(), authors_train)
	print "training time:", round(time()-t0, 3), "s"
	t0 = time()
	pred = clf.predict( features_test_transformed )

When to use PCA

When to use PCA
-> latent features driving the patterns in data
-> dimensional reduction
-> visualize high-dimensional data, reduce noise
-> make other algorithms (regression, classification) work better with fewer inputs

PCA for facial recognition

X_train, X_test, y_train, y_test = train_split(X, y, test_size=0.25)

n_components = 150

print "Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0])
t0 = time()
pca = RandomizePCA(n_components=n_components, whiten=True).fit(X_train)
print "done in %0.3fs" % (time() - t0)

eigenfaces = pca.components_.reshape((n_component, h, w))

print "Projecting the input data on the eigenfaces orthnormal basis"
t0 = time()
X_train_pca = pca.tranform(X_train)
X_test_pca = pca.transform(X_test)
print "done in %0.3fs" % (time() - t0)

print "Fitting the classifier to the training set"


Principal Component Analysis – PCA
Dimensionality of data: 2

x = 2
y = 3
Δx = 1
Δy = 2

square footage + No.Rooms -> Size

How to determine the principal component
variance – the willingness/flexibility of an algorithm to learn
technical term in statistics – roughly the “spread” of a data distribution (similar to standard deviation)

– maximum variance and information loss

def doPCA():
	"""Fit a two-component PCA on the module-level `data` and return it.

	The fit is required: attributes such as explained_variance_ratio_ and
	components_, and the transform method, only work on a fitted PCA.
	"""
	from sklearn.decomposition import PCA
	pca = PCA(n_components=2)
	pca.fit(data)
	return pca

pca = doPCA()
print pca.explained_variance_ratio_
first_pc = pca.component_[0]
second_pc = pca.components_[1]

transformed_data = pca.transform(data)
for ii, jj in zip(transofrmed_data, data):
	plt.scatter( first_pc[0]*ii[0],  first_pc[1]*ii[0], color="r")
	plt.scatter( second_pc[0]*ii[1], second_pc[1]*ii[1], color="c")
	plt.scatter( jj[0], jj[i], color="b")

plt.ylabel("long-term incentive")

Features != Information

There are two big univariate feature selection tools in sklearn: SelectPercentile and SelectKBest. The difference is pretty apparent by the names: SelectPercentile selects the X% of features that are most powerful (where X is a parameter) and SelectKBest selects the K features that are most powerful (where K is a parameter).

high bias
pays little attention to the data: oversimplified, high error on the training set
high variance
pays too much attention to the data (does not generalize well): overfits

Regularization in Regression
method for automatically penalizing extra features
-Lasso Regression: minimize SSE + γ|β|

m1 – m4: coefficients of regression
x1-x4: features

from sklearn.linear_model import Lasso

# GetMyData is a placeholder for whatever loads the features and their labels.
features, labels = GetMyData()
regression = Lasso()
# fit needs the targets as well as the features.
regression.fit(features, labels)
# predict takes a 2-D array: one row per sample, here a single sample with two features.
regression.predict([[2, 4]])


import pickle
from get_data import getData

def computeFraction( poi_messages, all_messages ):
	"""Return poi_messages / all_messages as a float.

	Returns 0. when either value is missing or non-numeric (None, a
	non-numeric string, NaN) or when all_messages is zero, so callers
	always receive a usable number.
	"""
	fraction = 0.
	try:
		poi_count = float(poi_messages)
		total = float(all_messages)
	except (TypeError, ValueError):
		return fraction

	# float("NaN") parses without error; NaN is the only value unequal to itself.
	if poi_count != poi_count or total != total or total == 0:
		return fraction

	fraction = poi_count / total
	return fraction

data_dict = getData()

submit_dict = {}
for name in data_dict

	data_point = data_dict[name]

	from_poi_to_this_person = data_point["from_poi_to_this_person"]
	to_messages = data_point["to_messages"]
	fraction_from_poi = computeFraction( from_poi_to_this_person, to_messages )
	print fraction_from_poi
	data_point["fraction_from_poi"] = fraction_from_poi

	from_this_person_to_poi = data_point["from_this_person_to_poi"]
	from_messages = data_point["from_messages"]
	fraction_to_poi = computeFraction( from_this_person_to_poi, from_messages )
	print fraction_to_poi
	submit_dict[name] = {"from_poi_to_this_person":fraction_from_poi, 
	data_point["fraction_to_poi"] = fraction_to_poi

def submitDict():
	"""Return the module-level submit_dict."""
	return submit_dict

TfIdf Representation

Tf – term frequency
Idf – inverse document frequency

make everything as simple as possible, but no simpler – Albert Einstein


import sys
import reader
import poi_emails

def getToFromStrings(f):
	"""Return the parsed To, From and Cc addresses of the e-mail `f`.

	`f` is passed unchanged to reader.getAddresses, which yields the three raw
	address strings; each one is then run through reader.parseAddresses.

	Returns a (to_emails, from_emails, cc_emails) tuple.
	"""
	to_string, from_string, cc_string = reader.getAddresses(f)
	to_emails = reader.parseAddresses( to_string )
	from_emails = reader.parseAddresses( from_string )
	cc_emails = reader.parseAddresses( cc_string )

	return to_emails, from_emails, cc_emails

def poiFlagEmail(f):
to_emails, from_emails, cc_emails = getToFromStrings(f)

poi_email_list = poi_emails.poiEmails()

to_poi = False
from_poi = False
cc_poi = False

if to_emails:
ctr = 0
while not to_poi and ctr < len(to_emails): if to_emails[ctr] in poi_email_list: to_poi = True ctr += 1 if cc_emails: ctr = 0 while not to_poi and ctr < len(cc_emails): if cc_emails[ctr] in poi_email_list: cc_poi = True ctr += 1 return to poi, from poi, cc poi [/python] [python] #!/usr/bin/python import os import sys import zipfile from poi_flag_email import poiFlagEmail, getToFromStrings data_dict = {} with zipfile.ZipFile('', "r") as z: z.extractall() for email_message in os.listdir("emails"): if email_message == ".DS_Store": continue message = open(os.getcwd()+"/emails/"+email_message, "r") to_addresses, from_addresses, cc_addresses = getToFromStrings(message) to_poi, from_poi, cc_poi = poiFlagEmail(message) for recipient in to_addresses: if recipient not in data_dict: data_dict[recipient] = {"from_poi_to_this_person":0} if from_poi: data_dict[recipient]["from_poi_to_this_person"] += 1 message.close() for item in data_dict: print item, data_dict[item] def submitData(): return data dict [/python]

Text Learning

Learning from TEXT

– Nice day
– A very nice day
-> SVM -> {o, x}

input dimension for svm

BAG OF WORDS, just frequency count
nice:1, very:0, day:1, he:0, she:0, love:0
Mr day loves a nice day
nice:1, very:0, day:2, he:0, she:0, love:1

from nltk.corpus import stopwords
# English stop-word list from NLTK's stopwords corpus.
sw = stopwords.words("english")

Vocabulary: Not all unique words are different
unresponsive, response, responsivity, responsiveness, respond