First, prepare an English wave file.
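If the source audio isn't already 16 kHz mono, it can be converted first. A minimal sketch using librosa and soundfile (the input file name is just a placeholder; decoding compressed formats like MP3 also requires ffmpeg to be installed):

import librosa
import soundfile as sf

# resample an arbitrary audio file to the 16 kHz mono WAV that wav2vec2 expects
audio, rate = librosa.load("input.mp3", sr=16000, mono=True)
sf.write("sample.wav", audio, 16000)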
app.py
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

# load the audio resampled to the 16 kHz the model expects
audio, rate = librosa.load("sample.wav", sr=16000)

tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

# tokenize the waveform, run CTC inference, and decode the argmax tokens
input_values = tokenizer(audio, return_tensors="pt").input_values
logits = model(input_values).logits
prediction = torch.argmax(logits, dim=-1)
transcription = tokenizer.batch_decode(prediction)[0]
print(transcription)
$ python3 app.py
// (omitted)
FOUR SCORE AND SEVEN YEARS AGO OUR FATHERS BROUGHT FORTH ON THIS CONTINENT A NEW NATION CONCEIVED IN LIBERTY AND DEDICATED TO THE PROPOSITION THAT ALL MEN ARE CREATED EQUAL
I see, this is impressive.
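As an aside, newer versions of transformers deprecate Wav2Vec2Tokenizer in favor of Wav2Vec2Processor for audio input. A sketch of the same pipeline with the processor API (same model and file names as above):

import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

audio, rate = librosa.load("sample.wav", sr=16000)

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

# sampling_rate must match the 16 kHz the model was trained on
input_values = processor(audio, sampling_rate=16000, return_tensors="pt").input_values
with torch.no_grad():
    logits = model(input_values).logits
prediction = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(prediction)[0]
print(transcription)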
Next, I want to do the same with a Japanese tokenizer and model.
from asrecognition import ASREngine

asr = ASREngine("ja", model_path="jonatasgrosman/wav2vec2-large-xlsr-53-japanese")
audio_paths = ["itou.wav"]
transcriptions = asr.transcribe(audio_paths)
print(transcriptions)  # show the transcription results
$ python3 app.py
/home/vagrant/.local/lib/python3.8/site-packages/transformers/configuration_utils.py:340: UserWarning: Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 Transformers. Using `model.gradient_checkpointing_enable()` instead, or if you are using the `Trainer` API, pass `gradient_checkpointing=True` in your `TrainingArguments`.
warnings.warn(
Killed
Hmm, it doesn't work yet, but it feels like I'm getting closer. ("Killed" almost always means the kernel's OOM killer terminated the process because it ran out of memory.)
It wouldn't run inside Vagrant, so I decided to use a VPS instead.
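For reference, asrecognition wraps transformers under the hood, so the Japanese model can also be driven directly; the model weights are large (on the order of a gigabyte), so plenty of RAM is needed either way. A sketch along the same lines as the English script:

import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# load the Japanese XLSR model directly instead of via the asrecognition wrapper
model_id = "jonatasgrosman/wav2vec2-large-xlsr-53-japanese"
processor = Wav2Vec2Processor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id)

audio, rate = librosa.load("itou.wav", sr=16000)
input_values = processor(audio, sampling_rate=16000, return_tensors="pt").input_values
with torch.no_grad():
    logits = model(input_values).logits
prediction = torch.argmax(logits, dim=-1)
print(processor.batch_decode(prediction)[0])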
Live recognition from a microphone can be done with this:
https://github.com/oliverguhr/wav2vec2-live
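I haven't tried it here, but the gist of such a live setup can be sketched with sounddevice: record a short chunk from the microphone and transcribe it with the same model. This is a simplified illustration, not the linked project's actual code; the chunk length and model are arbitrary choices.

import sounddevice as sd
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

model_id = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id)

rate = 16000
seconds = 5

# record a fixed-length chunk from the default microphone
print("recording...")
audio = sd.rec(int(seconds * rate), samplerate=rate, channels=1, dtype="float32")
sd.wait()  # block until recording finishes

# transcribe the recorded chunk
input_values = processor(audio.flatten(), sampling_rate=rate, return_tensors="pt").input_values
with torch.no_grad():
    logits = model(input_values).logits
prediction = torch.argmax(logits, dim=-1)
print(processor.batch_decode(prediction)[0])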