# Hold-out validation on the iris dataset: keep 40% of the samples aside for
# testing, fit a linear-kernel SVM on the remainder, and score it on the
# held-out part. The "# ->" comments are the outputs recorded alongside the
# original snippet; they are notes, not code.
import numpy as np
from sklearn import cross_validation
from sklearn import datasets
from sklearn import svm

# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20.
# On newer releases train_test_split lives in sklearn.model_selection.

iris = datasets.load_iris()
iris.data.shape, iris.target.shape
# -> ((150, 4), (150,))

# test_size=0.4 holds out 40% of the rows; random_state=0 fixes the shuffle
# so the split is reproducible.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    iris.data, iris.target, test_size=0.4, random_state=0)

X_train.shape, y_train.shape
# -> ((90, 4), (90,))
X_test.shape, y_test.shape
# -> ((60, 4), (60,))

# Fit on the training portion only, then report accuracy on unseen data.
clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
clf.score(X_test, y_test)
# -> 0.96
Training, Transforms, Predicting
Train/test split -> PCA -> SVM
# Two-fold cross-validation of a Gaussian naive Bayes text classifier.
# For each fold: split the documents, vectorize them with TF-IDF, keep the
# top 10% of features by f_classif score, fit the classifier, and predict
# on the fold's test portion.
#
# Assumes word_data, authors, time, KFold, GaussianNB, TfidfVectorizer,
# SelectPercentile and f_classif are defined or imported earlier in the file.
# NOTE(review): the original was collapsed onto one line, so the extent of
# the loop body is inferred -- everything after the `for` header is treated
# as part of the loop. Confirm against the intended script.
# NOTE(review): KFold(n, n_folds) is the legacy sklearn.cross_validation
# call style; sklearn.model_selection.KFold has a different interface.
clf = GaussianNB()
t0 = time()
kf = KFold(len(authors), 2)

# The loop variable was spelled `test_indicies`, while the body reads
# `test_indices`; the mismatch raised NameError on the first iteration.
for train_indices, test_indices in kf:
    # Build this fold's training and testing subsets.
    features_train = [word_data[ii] for ii in train_indices]
    features_test = [word_data[ii] for ii in test_indices]
    authors_train = [authors[ii] for ii in train_indices]
    authors_test = [authors[ii] for ii in test_indices]

    # Fit the vectorizer on the training documents only; the test documents
    # are transformed with the already-fitted vocabulary.
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed = vectorizer.transform(features_test)

    # Feature selection is likewise fitted on the training data only.
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(features_train_transformed, authors_train)
    features_train_transformed = selector.transform(
        features_train_transformed).toarray()
    features_test_transformed = selector.transform(
        features_test_transformed).toarray()

    clf.fit(features_train_transformed, authors_train)
    # Single-argument print() emits the same text on Python 2 and Python 3.
    print("training time: %s s" % round(time() - t0, 3))

    t0 = time()
    pred = clf.predict(features_test_transformed)