$ sudo pip3 install pandas
# -*- coding: utf-8 -*- import requests from bs4 import BeautifulSoup import pandas as pd import time list_df = pd.DataFrame(columns=['歌詞']) for page in range(1, 3): base_url = 'https://hogehoge' url = 'https://hogehoge/' + str(page) + '/' response = requests.get(url) soup = BeautifulSoup(response.text, 'lxml') links = soup.find_all('td', class_='side td1') for link in links: a = base_url + (link.a.get('href')) response = requests.get(a) soup = BeautifulSoup(response.text, 'lxml') song_lyrics = soup.find('div', itemprop='lyrics') song_lyric = song_lyrics.text song_lyric = song_lyric.replace('\n', '') time.sleep tmp_se = pd.DataFrame([song_lyric], index=list_df.columns).T list_df = list_df.append(tmp_se) print(list_df) list_df.to_csv('list.csv', mode='a', encoding='cp932')
### janomeによる形態素解析
$ sudo pip3 install janome
# -*- coding: utf-8 -*- from janome.tokenizer import Tokenizer import pandas as pd import re df_file = pd.read_csv('list.csv', encoding='cp932') song_lyrics = df_file['歌詞'].tolist() t = Tokenizer() results = [] for s in song_lyrics: tokens = t.tokenize(s) r = [] for tok in tokens: if tok.base_form == '*': word = tok.surface else: word = tok.base_form ps = tok.part_of_speech hinshi = ps.split(',')[0] if hinshi in ['名詞','形容詞','動詞', '副詞']: r.append(word) rl = (' '.join(r)).strip() results.append(rl) result = [i.replace('\u3000', '') for i in results] print(result) text_file = 'wakati_list.txt' with open(text_file, 'w', encoding='utf-8') as fp: fp.write("\n".join(result))
### wordcloud
$ sudo pip3 install wordcloud
https://www.google.com/get/noto/
Noto Sans CJK JPをダウンロードしてNotoSansMonoCJKjp-Regular.otfを使います。
# -*- coding: utf-8 -*- from wordcloud import WordCloud text_file = open('wakati_list.txt', encoding='utf-8') text = text_file.read() fpath = 'NotoSansMonoCJKjp-Regular.otf' stop_words = ['そう', 'ない', 'いる', 'する', 'まま', 'よう', 'てる', 'なる', 'こと', 'もう', 'いい', 'ある', 'ゆく', 'れる', 'ん', 'の'] wordcloud = WordCloud(background_color='white', font_path=fpath, width=800, height=600, stopwords=set(stop_words)).generate(text) wordcloud.to_file('./wordcloud.png')
$ python3 word_cloud.py
LunaSeaでやった場合
lunaseaは「夜」が多いなwww
なるほど、面白いなこれ。
応用としては、、、
エンジニアだと技術情報だけど、ファッション、旅行、スポーツなどでやりたい。特に旅行は万人受けするからなー