$ git clone https://github.com/facebookresearch/fastText.git
$ cd fastText
$ make
$ pip3 install cython
$ pip3 install fasttext
$ install requests requests_oauthlib
Twitterの開発者向けのページから、consumer key, consumer secret, access token, access token secretを取得します。
tweet_get.py
import re import json import MeCab from requests_oauthlib import OAuth1Session CK = "" CS = "" AT = "" AS = "" API_URL = "https://api.twitter.com/1.1/search/tweets.json?tweet_mode=extended" KEYWORD = "芸能 OR アニメ OR 漫画 OR TV OR ゲーム" CLASS_LABEL = "__label__1" def main(): tweets = get_tweet() surfaces = get_surfaces(tweets) write_txt(surfaces) def get_tweet(): params = {'q' : KEYWORD, 'count' : 20} twitter = OAuth1Session(CK, CS, AT, AS) req = twitter.get(API_URL, params = params) results = [] if req.status_code == 200: tweets = json.loads(req.text) for tweet in tweets['statuses']: results.append(tweet['full_text']) return results else: print ("Error: %d" % req.status_code) def get_surfaces(contents): results = [] for row in contents: content = format_text(row) tagger = MeCab.Tagger('') tagger.parse('') surf = [] node = tagger.parseToNode(content) while node: surf.append(node.surface) node = node.next results.append(surf) return results def write_txt(contents): try: if(len(contents) > 0): fileName = CLASS_LABEL + ".txt" labelText = CLASS_LABEL + "," f = open(fileName,'a') for row in contents: spaceTokens = "".join(row); result = labelText + spaceTokens + "\n" f.write(result) f.close() print(str(len(contents))+"行を書き込み") except Exception as e: print("テキストの書き込みに失敗") print(e) def format_text(text): text=re.sub(r'https?://[\w/:%#\$&\?\(\)~\.=\+\-…]+', "", text) text=re.sub(r'@[\w/:%#\$&\?\(\)~\.=\+\-…]+', "", text) text=re.sub(r'&[\w/:%#\$&\?\(\)~\.=\+\-…]+', "", text) text=re.sub(';', "", text) text=re.sub('RT', "", text) text=re.sub('\n', " ", text) return text if __name__ == '__main__': main()
1.エンタメ、2.美容、3.住まい/暮らし のキーワードで集めます。
__label__1: エンタメ
-> 芸能 OR アニメ OR 漫画 OR TV OR ゲーム
__label__2: 美容
-> 肌 OR ヨガ OR 骨盤 OR ウィッグ OR シェイプ
__label__3: 住まい/暮らし
-> リフォーム OR 住宅 OR 家事 OR 収納 OR 食材
label1〜3のテキストを結合させます
$ cat __label__1.txt __label__2.txt __label__3.txt > model.txt
$ ls
__label__1.txt __label__3.txt model.txt
__label__2.txt fastText tweet_get.py
learning.py
import sys import fasttext as ft argvs = sys.argv input_file = argvs[1] output_file = argvs[2] classifier = ft.supervised(input_file, output_file)
$ python3 learning.py model.txt model
raise Exception(“`supervised` is not supported any more. Please use `train_supervised`. For more information please refer to https://fasttext.cc/blog/2019/06/25/blog-post.html#2-you-were-using-the-unofficial-fasttext-module”)
Exception: `supervised` is not supported any more. Please use `train_supervised`.
model = ft.train_supervised(input_file)
$ python3 learning.py model.txt ftmodel
Read 0M words
Number of words: 6
Number of labels: 135
Floating point exception
おかしいな、number of labelsが135ではなく3のはずなんだけど。。。