### Dataset
We use the movie review dataset from the Cornell Natural Language Processing Group.
http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz
The negative and positive reviews are stored under the txt_sentoken folder.
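To get the data locally, here is a minimal download-and-extract sketch (assuming the archive unpacks into the txt_sentoken folder mentioned above):

```python
import tarfile
import urllib.request

# Download the review_polarity archive and extract it into the current directory.
# After extraction, txt_sentoken/neg and txt_sentoken/pos hold the review files.
url = "http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz"
urllib.request.urlretrieve(url, "review_polarity.tar.gz")
with tarfile.open("review_polarity.tar.gz", "r:gz") as tar:
    tar.extractall(".")
```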
### Sentiment Analysis with Scikit-Learn
1. import libraries and dataset
2. text preprocessing
3. converting text to numbers
4. training and test sets
5. training text classification model and predicting sentiment
6. evaluating the model
7. saving and loading the model
```python
import re
import pickle

import numpy as np
import nltk
from sklearn.datasets import load_files
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset: load_files uses the subfolder names (neg/pos) as the labels
movie_data = load_files("txt_sentoken")
X, y = movie_data.data, movie_data.target

# Text preprocessing
lemmatizer = WordNetLemmatizer()
documents = []

for sen in range(0, len(X)):
    # Remove all special characters
    document = re.sub(r'\W', ' ', str(X[sen]))

    # Remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove a single character from the start of the document
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)

    # Substitute multiple spaces with a single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)

    # Remove the prefixed 'b' left over from the bytes literal
    document = re.sub(r'^b\s+', '', document)

    # Convert to lowercase
    document = document.lower()

    # Lemmatization (reduce each word to its dictionary form)
    document = document.split()
    document = [lemmatizer.lemmatize(word) for word in document]
    document = ' '.join(document)

    documents.append(document)

# Converting text to numbers: Bag of Words and word embeddings are the two main
# approaches; Bag of Words is used here.
# max_features keeps the 1500 most frequently occurring words, min_df is the
# minimum number of documents that must contain a feature, and max_df drops
# words that appear in more than 70% of the documents.
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7,
                             stop_words=stopwords.words('english'))
# fit_transform converts the documents into numeric features
X = vectorizer.fit_transform(documents).toarray()

# TF-IDF weighting
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()

# Training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Random forest classifier: train and predict sentiment
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Evaluate the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
```
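As a usage sketch (not part of the original script), the fitted vectorizer, tfidfconverter, and classifier above can score a new, made-up review. The same Bag of Words and TF-IDF transforms have to be applied before predicting; for best results the same preprocessing/lemmatization as above should also be applied.

```python
# Hypothetical example review, assuming vectorizer, tfidfconverter and
# classifier from the script above are still in scope.
new_review = "a surprisingly warm and well acted film that never drags"
new_counts = vectorizer.transform([new_review]).toarray()
new_tfidf = tfidfconverter.transform(new_counts).toarray()
# 0 = negative, 1 = positive (load_files assigns labels in alphabetical folder order)
print(classifier.predict(new_tfidf))
```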
```
$ python3 model.py
[[180  28]
 [ 30 162]]
              precision    recall  f1-score   support

           0       0.86      0.87      0.86       208
           1       0.85      0.84      0.85       192

    accuracy                           0.85       400
   macro avg       0.85      0.85      0.85       400
weighted avg       0.85      0.85      0.85       400

0.855
```
```python
# Save the trained model to disk
with open('text_classifier', 'wb') as picklefile:
    pickle.dump(classifier, picklefile)
```
```python
import pickle

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load the saved model and evaluate it again on the same test set
with open('text_classifier', 'rb') as training_model:
    model = pickle.load(training_model)

y_pred2 = model.predict(X_test)

print(confusion_matrix(y_test, y_pred2))
print(classification_report(y_test, y_pred2))
print(accuracy_score(y_test, y_pred2))
```
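One caveat: only the classifier is pickled above, so the loaded model can only score feature vectors that already exist in memory (like X_test). To classify raw review text in a fresh process, the fitted vectorizer and TF-IDF transformer would need to be saved as well; a sketch with made-up file names:

```python
# Sketch (file names are assumptions): pickle the fitted vectorizer and TF-IDF
# transformer too, so a fresh process can turn raw review text into the
# features the classifier expects.
with open('count_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)
with open('tfidf_transformer.pkl', 'wb') as f:
    pickle.dump(tfidfconverter, f)
```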
So this is how it's done with Scikit-Learn, then.