[音声認識] wav2vecを使いたい

最近のトレンドはTransformerを用いた自然言語処理とのことで、wav2vecを使いたい

### ライブラリのinstall
$ pip3 install transformers datasets librosa torch soundfile matplotlib ipython

main.py

#!/usr/bin/python3
# -*- coding: utf-8 -*-

import librosa
import matplotlib.pyplot as plt
from IPython.display import display, Audio
import librosa.display
import numpy as np 

import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from datasets import load_dataset
import soundfile as sf

# The processor and the CTC model are both restored from the same
# pretrained checkpoint name, so keep that name in one place.
PRETRAINED_CHECKPOINT = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(PRETRAINED_CHECKPOINT)
model = Wav2Vec2ForCTC.from_pretrained(PRETRAINED_CHECKPOINT)

def map_to_array(batch):
    """Decode the audio file referenced by a dataset row.

    Reads the file at ``batch["file"]`` with ``soundfile`` and stores the
    decoded samples under ``"speech"`` and the sampling rate under
    ``"sr_db"``. The same ``batch`` mapping is modified and returned.
    """
    batch["speech"], batch["sr_db"] = sf.read(batch["file"])
    return batch

# Load the small LibriSpeech validation sample and attach the decoded
# waveform ("speech") and its sampling rate ("sr_db") to every row.
ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
ds = ds.map(map_to_array)

# Read the first row once instead of going through whole-column access
# (ds['speech'][0]) every time a value from that row is needed.
sample = ds[0]
waveform = np.array(sample["speech"])
sample_rate = sample["sr_db"]

# librosa 0.10 removed waveplot in favour of waveshow; fall back to
# waveplot only on older releases that do not provide waveshow yet.
plot_waveform = getattr(librosa.display, "waveshow", None)
if plot_waveform is None:
    plot_waveform = librosa.display.waveplot
plot_waveform(waveform, sr=sample_rate)
plt.savefig('01')
display(Audio(waveform, rate=sample_rate))

# Pass the sampling rate explicitly so the processor can check it against
# the rate the checkpoint expects instead of silently assuming a match.
input_values = processor(
    sample["speech"], sampling_rate=sample_rate, return_tensors="pt"
).input_values

# Inference only: no gradients are needed, so skip autograd tracking.
with torch.no_grad():
    logits = model(input_values).logits

# Take the highest-scoring token id at each step; processor.decode turns
# the id sequence of the first (only) item into text.
predicted_ids = torch.argmax(logits, dim=-1)

transcription = processor.decode(predicted_ids[0])

print(transcription)

$ python3 main.py
// 省略
A MAN SAID TO THE UNIVERSE SIR I EXIST

なんだこれええええええええええええええええええ
音声をベクトル表現(特徴量)に変換しているのはわかるが、、、