最近のトレンドはtransformerを用いた自然言語処理とのことで、wav2vecを使いたい
### ライブラリのinstall
$ pip3 install transformers datasets librosa
main.py
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Transcribe a dummy LibriSpeech sample with wav2vec2 and plot its waveform."""
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import soundfile as sf
import torch
from datasets import load_dataset
from IPython.display import Audio, display
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Pretrained English ASR model (CTC head) and its matching feature processor.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")


def map_to_array(batch):
    """Load the audio file referenced by batch["file"] into the batch.

    Adds:
        batch["speech"]: 1-D waveform samples read by soundfile.
        batch["sr_db"]: sampling rate reported by soundfile.
    """
    speech, sr_db = sf.read(batch["file"])
    batch["speech"] = speech
    batch["sr_db"] = sr_db
    return batch


ds = load_dataset(
    "patrickvonplaten/librispeech_asr_dummy", "clean", split="validation"
)
ds = ds.map(map_to_array)

# NOTE: librosa 0.10 removed `waveplot`; `waveshow` is the current API.
librosa.display.waveshow(np.array(ds["speech"][0]), sr=ds["sr_db"][0])
plt.savefig("01")
display(Audio(np.array(ds["speech"][0]), rate=ds["sr_db"][0]))
# Convert the waveform to model input, run the CTC model without building an
# autograd graph (pure inference), then greedy-decode the per-frame argmax ids.
input_values = processor(
    ds["speech"][0],
    sampling_rate=ds["sr_db"][0],  # newer transformers requires the rate
    return_tensors="pt",
).input_values
with torch.no_grad():  # no gradients needed for inference; saves memory/time
    logits = model(input_values).logits
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.decode(predicted_ids[0])
print(transcription)
$ python3 main.py
// 省略
A MAN SAID TO THE UNIVERSE SIR I EXIST
なんだこれええええええええええええええええええ
音声をベクトル表現(embedding)にしているのはわかるが、、、