unwanted = nltk.corpus.stopwords.words("english") unwanted.extend([w.lower() for w in nltk.corpus.names.words()]) def skip_unwanted(pos_tuple): word, tag = pos_tuple if not word.isalpha() or word in unwanted: return False if tag.startswith("NN"): return False return True positive_words = [word for word, tag in filter( skip_unwanted, nltk.pos_tag(nltk.corpus.movie_reviews.words(categories=["pos"])) )] negative_words = [word for word, tag in filter( skip_unwanted, nltk.pos_tag(nltk.corpus.movie_reviews.words(categories=["neg"])) )] positive_fd = nltk.FreqDist(positive_words) negative_fd = nltk.FreqDist(negative_words) common_set = set(positive_fd).intersection(negative_fd) for word in common_set: del positive_fd[word] del negative_fd[word] top_100_positive = {word for word, count in positive_fd.most_common(100)} top_100_negative = {word for word, count in negative_fd.most_common(100)} unwanted = nltk.corpus.stopwords.words("english") unwanted.extend([w.lower() for w in nltk.corpus.names.words()]) positive_bigram_finder = nltk.collocations.BigramCollocationFinder.from_words([ w for w in nltk.corpus.movie_reviews.words(categories=["pos"]) if w.isalpha() and w not in unwanted ]) negative_bigram_finder = nltk.collocations.BigramCollocationFinder.from_words([ w for w in nltk.corpus.movie_reviews.words(categories=["neg"])