# Words excluded from the frequency analysis: English stopwords plus
# lowercased first names from the NLTK names corpus.
# Stored as a set because skip_unwanted() checks membership once per corpus
# token; on a list every check is a linear scan over thousands of entries.
unwanted = set(nltk.corpus.stopwords.words("english"))
unwanted.update(w.lower() for w in nltk.corpus.names.words())
def skip_unwanted(pos_tuple):
    """Filter predicate: return True when a tagged word should be kept.

    Despite the name, a True result means "keep this pair" -- the function
    is meant to be handed to ``filter()``.

    Args:
        pos_tuple: A ``(word, tag)`` pair, e.g. one item of the list
            produced by ``nltk.pos_tag``.

    Returns:
        False if the word is not purely alphabetic, is listed in the
        module-level ``unwanted`` collection, or carries a tag starting
        with "NN" (the noun tags in the default ``nltk.pos_tag`` tagset);
        True otherwise.
    """
    token, pos = pos_tuple
    # Evaluation order matters: the tag is only inspected once the word
    # itself has passed both word-level checks.
    word_is_usable = token.isalpha() and token not in unwanted
    return word_is_usable and not pos.startswith("NN")
# POS-tag every token of the positive reviews and keep only the words that
# skip_unwanted() accepts; the tag itself is discarded afterwards.
positive_words = [
    token
    for token, pos in nltk.pos_tag(
        nltk.corpus.movie_reviews.words(categories=["pos"])
    )
    if skip_unwanted((token, pos))
]

# Same pipeline for the negative reviews.
negative_words = [
    token
    for token, pos in nltk.pos_tag(
        nltk.corpus.movie_reviews.words(categories=["neg"])
    )
    if skip_unwanted((token, pos))
]
# Count how often each surviving word occurs in each category.
positive_fd = nltk.FreqDist(positive_words)
negative_fd = nltk.FreqDist(negative_words)

# Words that occur in both categories are dropped from both distributions,
# leaving only words seen exclusively in one category.
common_set = positive_fd.keys() & negative_fd.keys()
for word in common_set:
    del positive_fd[word]
    del negative_fd[word]

# The 100 most frequent category-exclusive words, as sets (counts dropped).
top_100_positive = {entry[0] for entry in positive_fd.most_common(100)}
top_100_negative = {entry[0] for entry in negative_fd.most_common(100)}
# Rebuild the exclusion list for the bigram step below: English stopwords
# followed by the lowercased entries of the NLTK names corpus.
unwanted = [
    *nltk.corpus.stopwords.words("english"),
    *(name.lower() for name in nltk.corpus.names.words()),
]
# Build the lookup set once up front: testing each corpus token against the
# raw `unwanted` list would rescan every stopword and name per token.
_unwanted_lookup = set(unwanted)

# Bigram collocation finder over the positive reviews, restricted to
# alphabetic tokens that are not in the exclusion collection.
positive_bigram_finder = nltk.collocations.BigramCollocationFinder.from_words([
    w for w in nltk.corpus.movie_reviews.words(categories=["pos"])
    if w.isalpha() and w not in _unwanted_lookup
])
negative_bigram_finder = nltk.collocations.BigramCollocationFinder.from_words([
w for w in nltk.corpus.movie_reviews.words(categories=["neg"])