fastTextによるカテゴリ分類 – ソフトウェアエンジニアの技術ブログ：Software engineer tech blog

$ git clone https://github.com/facebookresearch/fastText.git
$ cd fastText
$ make
$ pip3 install cython
$ pip3 install fasttext

$ pip3 install requests requests_oauthlib

Twitterの開発者向けのページから、consumer key, consumer secret, access token, access token secretを取得します。

tweet_get.py

import re
import json
import MeCab
from requests_oauthlib import OAuth1Session

# Twitter API credentials, passed in this order to OAuth1Session in get_tweet().
# They are left empty here and must be filled in before running.
CK = ""  # consumer key
CS = ""  # consumer secret
AT = ""  # access token
AS = ""  # access token secret

# Search endpoint; tweet_mode=extended is requested and get_tweet() reads 'full_text'.
API_URL = "https://api.twitter.com/1.1/search/tweets.json?tweet_mode=extended"
# Search query sent as the 'q' parameter.
KEYWORD = "芸能 OR アニメ OR 漫画 OR TV OR ゲーム"
# Label written at the start of every output line; also used as the output file's base name.
CLASS_LABEL = "__label__1"

def main():
    """Fetch tweets, tokenize them, and append the result to the label file."""
    write_txt(get_surfaces(get_tweet()))

def get_tweet():
    """Search Twitter for KEYWORD and return the matching tweets' full text.

    Returns:
        A list with the ``full_text`` of every status in the response.
        If the API does not answer with HTTP 200, the status code is
        printed and an empty list is returned, so callers can always
        iterate over the result.
    """
    params = {'q': KEYWORD, 'count': 20}
    twitter = OAuth1Session(CK, CS, AT, AS)
    req = twitter.get(API_URL, params=params)

    if req.status_code != 200:
        print("Error: %d" % req.status_code)
        # Return an empty list rather than falling through to an implicit
        # None, which would make the caller fail when it iterates.
        return []

    tweets = json.loads(req.text)
    return [tweet['full_text'] for tweet in tweets['statuses']]

def get_surfaces(contents):
    """Tokenize each text with MeCab and return the surface forms.

    Args:
        contents: Iterable of raw tweet texts. ``None`` or an empty
            iterable yields an empty result.

    Returns:
        A list with one entry per input text; each entry is the list of
        ``node.surface`` strings MeCab produced for the cleaned text.
    """
    if not contents:
        return []

    # Build the tagger once; it does not depend on the individual row.
    tagger = MeCab.Tagger('')
    # NOTE(review): an empty parse before parseToNode is kept from the
    # original; presumably the usual workaround for garbled surfaces in
    # some MeCab binding versions -- confirm before removing.
    tagger.parse('')

    results = []
    for row in contents:
        content = format_text(row)
        surf = []
        node = tagger.parseToNode(content)
        while node:
            surf.append(node.surface)
            node = node.next
        results.append(surf)
    return results

def write_txt(contents):
    """Append tokenized rows to ``<CLASS_LABEL>.txt`` as fastText training lines.

    Every row becomes one line of the form ``<CLASS_LABEL> tok1 tok2 ...``:
    the label, a single space, then the row's tokens separated by single
    spaces. fastText splits lines on whitespace and treats tokens starting
    with ``__label__`` as labels, so the label must be its own
    space-delimited token.

    Args:
        contents: List of token lists, one per text.

    Any exception raised while writing is reported on stdout instead of
    being propagated (best-effort behavior kept from the original).
    """
    try:
        if contents:
            file_name = CLASS_LABEL + ".txt"
            # UTF-8 is pinned so the output does not depend on the
            # platform's default encoding; `with` closes the file even
            # if a write fails.
            with open(file_name, 'a', encoding='utf-8') as f:
                for row in contents:
                    # Skip empty tokens so they do not produce runs of
                    # spaces in the output line.
                    tokens = " ".join(tok for tok in row if tok)
                    f.write(CLASS_LABEL + " " + tokens + "\n")

        print(str(len(contents)) + "行を書き込み")

    except Exception as e:
        print("テキストの書き込みに失敗")
        print(e)

def format_text(text):
    """Strip URLs, mentions, HTML entities, retweet markers and newlines.

    Args:
        text: Raw tweet text.

    Returns:
        The text with URLs, ``@mentions`` and ``&entity`` runs removed,
        all semicolons removed, standalone ``RT`` markers removed, and
        newlines replaced by spaces.

    The patterns use ``re.ASCII`` so that ``\\w`` matches only ASCII word
    characters; otherwise Japanese text written directly after a URL or
    mention (with no space) would be deleted along with it.
    """
    text = re.sub(r'https?://[\w/:%#\$&\?\(\)~\.=\+\-…]+', "", text, flags=re.ASCII)
    text = re.sub(r'@[\w/:%#\$&\?\(\)~\.=\+\-…]+', "", text, flags=re.ASCII)
    text = re.sub(r'&[\w/:%#\$&\?\(\)~\.=\+\-…]+', "", text, flags=re.ASCII)
    text = text.replace(';', "")
    # Only drop "RT" when it stands alone, so words such as "SMART" or
    # "START" keep their letters.
    text = re.sub(r'\bRT\b', "", text, flags=re.ASCII)
    text = text.replace('\n', " ")
    return text

# Run the collection pipeline only when executed as a script.
if __name__ == "__main__":
    main()

1.エンタメ、2.美容、3.住まい/暮らしのキーワードで集めます。

__label__1: エンタメ
-> 芸能 OR アニメ OR 漫画 OR TV OR ゲーム
__label__2: 美容
-> 肌 OR ヨガ OR 骨盤 OR ウィッグ OR シェイプ
__label__3: 住まい/暮らし
-> リフォーム OR 住宅 OR 家事 OR 収納 OR 食材

label1〜3のテキストを結合させます
$ cat __label__1.txt __label__2.txt __label__3.txt > model.txt
$ ls
__label__1.txt __label__3.txt model.txt
__label__2.txt fastText tweet_get.py

learning.py

import sys
import fasttext as ft

# Command-line arguments: argv[1] is the training-data file; argv[2] is
# presumably the output model name (it is passed as the second argument below).
argvs = sys.argv
input_file = argvs[1]
output_file = argvs[2]

# NOTE(review): the error output recorded below reports that `supervised` is no
# longer supported by the fasttext module and that `train_supervised` should be
# used instead.
classifier = ft.supervised(input_file, output_file)

$ python3 learning.py model.txt model
raise Exception("`supervised` is not supported any more. Please use `train_supervised`. For more information please refer to https://fasttext.cc/blog/2019/06/25/blog-post.html#2-you-were-using-the-unofficial-fasttext-module")
Exception: `supervised` is not supported any more. Please use `train_supervised`.

# Replacement for the unsupported `ft.supervised` call: train a classifier from input_file.
model = ft.train_supervised(input_file)

$ python3 learning.py model.txt ftmodel
Read 0M words
Number of words: 6
Number of labels: 135
Floating point exception

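おかしいな、number of labelsが135ではなく3のはずなんだけど。。。

(補足) fastTextは空白で区切られたトークンのうち `__label__` で始まるものをラベルとして扱います。そのため、1行が `__label__1,本文` のように空白なしで書かれていると、行全体が1つのラベルと見なされ、行の種類の数だけラベルができてしまいます。学習データの各行は `__label__1 単語 単語 ...` のように、ラベルと各単語を半角スペースで区切る必要があります。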