Text Classification with Cornell Sentiment Data

### Dataset
We use the movie review dataset from the Cornell Natural Language Processing Group.
http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz
The negative and positive reviews are stored under the txt_sentoken folder.
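
For reference, a minimal sketch (assuming a standard Python environment with network access) for downloading and extracting the archive:

import urllib.request
import tarfile

url = "http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz"
# download the archive and unpack it; this should create the txt_sentoken folder
urllib.request.urlretrieve(url, "review_polarity.tar.gz")
with tarfile.open("review_polarity.tar.gz", "r:gz") as tar:
    tar.extractall(".")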

### Sentiment Analysis with Scikit-Learn
1. import libraries and dataset
2. text preprocessing
3. converting text to numbers
4. training and test sets
5. training text classification model and predicting sentiment
6. evaluating the model
7. saving and loading the model

import re
import pickle

import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

nltk.download('stopwords')
nltk.download('wordnet')

# load the dataset
movie_data = load_files("txt_sentoken")
X, y = movie_data.data, movie_data.target
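
As a quick sanity check (not part of the original script), load_files treats each subfolder of txt_sentoken as one class, so the returned Bunch can be inspected like this:

# inspect what load_files returned
print(len(movie_data.data))      # number of review documents
print(movie_data.target_names)   # subfolder names used as class labels (e.g. ['neg', 'pos'])
print(movie_data.target[:10])    # integer labels aligned with target_names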

# WordNetLemmatizer reduces each word to its dictionary form
lemmatizer = WordNetLemmatizer()

documents = []

for sen in range(0, len(X)):
    # remove all special characters
    document = re.sub(r'\W', ' ', str(X[sen]))
    # remove all single characters surrounded by whitespace
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    # remove single characters from the start of the document
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)
    # substitute multiple spaces with a single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    # remove the leading "b" left over from str() rendering the raw bytes
    document = re.sub(r'^b\s+', '', document)
    # convert to lowercase
    document = document.lower()
    # lemmatization (convert each word to its base form)
    document = document.split()

    document = [lemmatizer.lemmatize(word) for word in document]
    document = ' '.join(document)

    documents.append(document)
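
To see what the loop above actually does, it can help to compare one raw review with its cleaned form (a sketch; the exact output depends on the dataset):

print(str(X[0])[:200])     # raw bytes rendered as a string, starting with the b'...' prefix
print(documents[0][:200])  # lowercased, lemmatized, special characters removed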

# Both Bag of Words and word embeddings are options; here we use Bag of Words
# max_features keeps the 1500 most frequently occurring words, min_df requires a word to
# appear in at least 5 documents, and max_df drops words appearing in more than 70% of documents
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
# fit_transform converts the documents into numeric features
X = vectorizer.fit_transform(documents).toarray()
# rescale the counts with TF-IDF
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()
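
Incidentally, the two steps above can be collapsed into one with TfidfVectorizer, which combines CountVectorizer and TfidfTransformer; a minimal sketch with the same parameters:

from sklearn.feature_extraction.text import TfidfVectorizer

# equivalent one-step alternative to CountVectorizer + TfidfTransformer
tfidfvectorizer = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = tfidfvectorizer.fit_transform(documents).toarray()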

# training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# random forest algorithm, predicting sentiment
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

$ python3 model.py
[[180  28]
 [ 30 162]]
              precision    recall  f1-score   support

           0       0.86      0.87      0.86       208
           1       0.85      0.84      0.85       192

    accuracy                           0.85       400
   macro avg       0.85      0.85      0.85       400
weighted avg       0.85      0.85      0.85       400

0.855
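
The 0.855 above comes from a single 80/20 split. As a rough, less split-dependent check (not in the original script; the exact numbers will differ, and a smaller forest is used here to keep it fast), 5-fold cross-validation can be run like this:

from sklearn.model_selection import cross_val_score

# 5-fold cross-validation on the TF-IDF features
scores = cross_val_score(RandomForestClassifier(n_estimators=100, random_state=0), X, y, cv=5)
print(scores.mean(), scores.std())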

# save model
with open('text_classifier', 'wb') as picklefile:
    pickle.dump(classifier, picklefile)
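
pickle works fine here; scikit-learn's documentation also mentions joblib as an alternative that is more efficient for estimators holding large numpy arrays. A minimal sketch, assuming joblib is installed:

import joblib

# alternative to pickle for persisting the trained model
joblib.dump(classifier, 'text_classifier.joblib')
# later: model = joblib.load('text_classifier.joblib')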

import pickle
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

with open('text_classifier', 'rb') as training_model:
	model = pickle.load(training_model)

y_pred2 = model.predict(X_test)

print(confusion_matrix(y_test, y_pred2))
print(classification_report(y_test, y_pred2))
print(accuracy_score(y_test, y_pred2))
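
Note that the reloaded model only accepts the 1500-dimensional numeric features, so classifying a brand-new review also needs the fitted vectorizer and tfidfconverter (here they are still in memory; in a separate script they would have to be pickled too, and ideally the same preprocessing/lemmatization would be applied first). A sketch with a made-up review:

new_review = "a surprisingly touching film with great performances"
new_counts = vectorizer.transform([new_review])            # reuse the training vocabulary
new_tfidf = tfidfconverter.transform(new_counts).toarray()
print(model.predict(new_tfidf))                            # 0/1, in the order of movie_data.target_names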

So that's how you do it with Scikit-Learn, huh.