# Validation

from time import time

import numpy as np
from sklearn import cross_validation
from sklearn import datasets
from sklearn import svm
from sklearn.cross_validation import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.naive_bayes import GaussianNB

# Hold-out validation example: split the iris data into a training part and a
# test part, fit a classifier on the former and score it on the latter.
# Lines starting with "# ->" are the outputs recorded in the original notes.
iris = datasets.load_iris()
iris.data.shape, iris.target.shape
# -> ((150, 4), (150,))

# test_size=0.4 holds out 40% of the samples; random_state=0 is passed so the
# split uses a fixed seed.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    iris.data, iris.target, test_size=0.4, random_state=0)

X_train.shape, y_train.shape
# -> ((90, 4), (90,))
X_test.shape, y_test.shape
# -> ((60, 4), (60,))

# Fit a linear-kernel SVM on the training part only, then score it on the
# held-out part.
clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
clf.score(X_test, y_test)
# -> 0.96

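# Training, Transforms, Predicting
# General pipeline: train/test split -> PCA -> SVM
# The example below follows the same pattern with different pieces:
# k-fold split -> tf-idf -> SelectPercentile -> GaussianNB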

# K-fold evaluation of a GaussianNB text classifier. In every fold the tf-idf
# vectorizer and the percentile feature selector are fitted on that fold's
# training documents only and then merely applied to its test documents.
# NOTE(review): `word_data` and `authors` are not defined in this snippet;
# presumably parallel sequences of documents and their labels -- confirm.
clf = GaussianNB()
t0 = time()
kf = KFold(len(authors), 2)
# The loop target was previously misspelled `test_indicies`, so the body's
# references to `test_indices` raised NameError.
for train_indices, test_indices in kf:
	features_train = [word_data[ii] for ii in train_indices]
	features_test = [word_data[ii] for ii in test_indices]
	authors_train = [authors[ii] for ii in train_indices]
	authors_test = [authors[ii] for ii in test_indices]

	# Text -> tf-idf features: fit on the training fold, transform the test fold.
	vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
			stop_words='english')
	features_train_transformed = vectorizer.fit_transform(features_train)
	features_test_transformed = vectorizer.transform(features_test)
	# Feature selection with f_classif at percentile=10, again fitted on the
	# training fold only; .toarray() converts the result for GaussianNB.
	selector = SelectPercentile(f_classif, percentile=10)
	selector.fit(features_train_transformed, authors_train)
	features_train_transformed = selector.transform(features_train_transformed).toarray()
	features_test_transformed = selector.transform(features_test_transformed).toarray()

	clf.fit(features_train_transformed, authors_train)
	# Elapsed time since t0 was last set, which also covers the vectorizing
	# and selection steps above. Written as a single formatted string so the
	# line is valid on Python 2 and 3 with the same output.
	print("training time: %s s" % round(time() - t0, 3))
	t0 = time()
	# NOTE(review): `pred` is overwritten on every fold and not used within
	# the lines visible here -- confirm where it is consumed.
	pred = clf.predict( features_test_transformed )