$ git clone https://github.com/facebookresearch/fastText.git
$ cd fastText
$ make
$ pip3 install cython
$ pip3 install fasttext
$ pip3 install requests requests_oauthlib
Twitterの開発者向けのページから、consumer key, consumer secret, access token, access token secretを取得します。
tweet_get.py
import re
import json
import MeCab
from requests_oauthlib import OAuth1Session
# Twitter API credentials, passed to OAuth1Session in this order.
# They are empty here and must be filled in before running the script.
CK = ""  # consumer key
CS = ""  # consumer secret
AT = ""  # access token
AS = ""  # access token secret
# Search endpoint requested by get_tweet().
API_URL = "https://api.twitter.com/1.1/search/tweets.json?tweet_mode=extended"
# Search query sent as the 'q' parameter.
KEYWORD = "芸能 OR アニメ OR 漫画 OR TV OR ゲーム"
# Label for this run: used as the output file name stem and as the prefix
# of every line written by write_txt().
CLASS_LABEL = "__label__1"
def main():
    """Fetch tweets, tokenize them, and append the result to the label file."""
    write_txt(get_surfaces(get_tweet()))
def get_tweet():
    """Search Twitter for KEYWORD and return the full text of each hit.

    Returns:
        A list of tweet texts. The list is empty when the API answers with
        a non-200 status, so callers can always iterate over the result.
    """
    params = {'q': KEYWORD, 'count': 20}
    twitter = OAuth1Session(CK, CS, AT, AS)
    req = twitter.get(API_URL, params=params)
    if req.status_code != 200:
        print("Error: %d" % req.status_code)
        # The error path used to fall through and return None, which made
        # the caller crash as soon as it iterated over the result.
        return []
    tweets = json.loads(req.text)
    return [tweet['full_text'] for tweet in tweets['statuses']]
def get_surfaces(contents):
    """Tokenize each text with MeCab and return the surface forms.

    Args:
        contents: Iterable of raw tweet texts.

    Returns:
        A list with one entry per input text. Each entry is the list of
        non-empty surface strings MeCab produced for the cleaned text.
    """
    # Building a Tagger does not depend on the row, so create it once
    # instead of once per tweet.
    tagger = MeCab.Tagger('')
    # NOTE(review): parsing an empty string first is a commonly used
    # workaround for node.surface coming back broken in some MeCab
    # bindings -- confirm it is still needed with the installed version.
    tagger.parse('')
    results = []
    for row in contents:
        node = tagger.parseToNode(format_text(row))
        surf = []
        while node:
            # Skip nodes with an empty surface (presumably the BOS/EOS
            # sentinels); they carry no token text.
            if node.surface:
                surf.append(node.surface)
            node = node.next
        results.append(surf)
    return results
def write_txt(contents):
    """Append tokenized texts to "<CLASS_LABEL>.txt" as fastText training lines.

    Each line is written as ``<CLASS_LABEL> token token ...``: the label and
    the tokens are separated by single spaces. The earlier
    ``label,tokens-without-spaces`` layout made every line one unique token
    starting with ``__label__``, so fastText counted one label per line.

    Args:
        contents: List of token lists, one per text. Nothing is written
            when it is empty.

    Failures are reported on stdout and not re-raised (best effort).
    """
    if not contents:
        return
    file_name = CLASS_LABEL + ".txt"
    try:
        # fastText reads UTF-8, so do not depend on the locale's default
        # encoding; the with-block also closes the file on errors.
        with open(file_name, 'a', encoding='utf-8') as f:
            for row in contents:
                # Drop empty tokens so they do not turn into stray spaces.
                tokens = " ".join(token for token in row if token)
                f.write(CLASS_LABEL + " " + tokens + "\n")
        print(str(len(contents)) + "行を書き込み")
    except Exception as e:
        print("テキストの書き込みに失敗")
        print(e)
# Patterns are compiled once at import time; format_text runs once per tweet.
_URL_RE = re.compile(r'https?://[\w/:%#\$&\?\(\)~\.=\+\-…]+')
_MENTION_RE = re.compile(r'@[\w/:%#\$&\?\(\)~\.=\+\-…]+')
_ENTITY_RE = re.compile(r'&[\w/:%#\$&\?\(\)~\.=\+\-…]+')
# Only a standalone "RT": not preceded or followed by an ASCII letter, so
# words such as "SMART" or "ARTIST" are left intact. Non-Latin neighbours
# (e.g. "RTアニメ") still count as standalone.
_RETWEET_RE = re.compile(r'(?<![A-Za-z])RT(?![A-Za-z])')


def format_text(text):
    """Strip tweet noise from *text* before tokenization.

    Removes, in order: http(s) URLs, @mentions, runs starting with ``&``
    (e.g. HTML entities), every ``;``, and the standalone retweet marker
    ``RT``. Newlines are then replaced by single spaces.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned text. Surrounding whitespace is not collapsed.
    """
    text = _URL_RE.sub("", text)
    text = _MENTION_RE.sub("", text)
    text = _ENTITY_RE.sub("", text)
    text = text.replace(";", "")
    text = _RETWEET_RE.sub("", text)
    return text.replace("\n", " ")
# Run the collection pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
1.エンタメ、2.美容、3.住まい/暮らし のキーワードで集めます。
__label__1: エンタメ
-> 芸能 OR アニメ OR 漫画 OR TV OR ゲーム
__label__2: 美容
-> 肌 OR ヨガ OR 骨盤 OR ウィッグ OR シェイプ
__label__3: 住まい/暮らし
-> リフォーム OR 住宅 OR 家事 OR 収納 OR 食材
label1〜3のテキストを結合させます
$ cat __label__1.txt __label__2.txt __label__3.txt > model.txt
$ ls
__label__1.txt __label__3.txt model.txt
__label__2.txt fastText tweet_get.py
learning.py
import sys
import fasttext as ft
# Command-line arguments: argv[1] is read as the training data file and
# argv[2] as the output name handed to fastText.
argvs = sys.argv
input_file = argvs[1]
output_file = argvs[2]
# NOTE: the run shown right after this script fails on this call with
# "`supervised` is not supported any more. Please use `train_supervised`."
classifier = ft.supervised(input_file, output_file)
$ python3 learning.py model.txt model
raise Exception("`supervised` is not supported any more. Please use `train_supervised`. For more information please refer to https://fasttext.cc/blog/2019/06/25/blog-post.html#2-you-were-using-the-unofficial-fasttext-module")
Exception: `supervised` is not supported any more. Please use `train_supervised`.
model = ft.train_supervised(input_file)
# train_supervised only returns the model object, so save it explicitly;
# otherwise the output name given on the command line is never used.
# NOTE(review): the ".bin" suffix is a naming choice here -- adjust if a
# different file name is expected.
model.save_model(output_file + ".bin")
$ python3 learning.py model.txt ftmodel
Read 0M words
Number of words: 6
Number of labels: 135
Floating point exception
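おかしいな、number of labelsが135ではなく3のはずなんだけど。。。(原因はおそらく学習データの書式です。tweet_get.pyは各行を「__label__1,本文」のようにカンマ区切りで、しかも単語を空白なしで連結して書き出しているため、fastTextは行頭のかたまり全体を1つのラベルとして数えてしまいます。fastTextの教師あり学習は「__label__1 単語 単語 …」のように、ラベルと各単語が空白で区切られた行を想定しています。)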