# Dubliners ASR, v2
# wav2vec2 words and phones
!pip -q install --no-deps pyctcdecode kenlm pygtrie
import os
os.environ["TRANSFORMERS_NO_TORCHVISION"] = "1"
import json
from pathlib import Path
import torch
import librosa
from transformers import AutoProcessor, AutoModelForCTC, pipeline
# English word-level ASR checkpoint (XLSR-53 fine-tuned on English) and a
# phoneme-level checkpoint (XLS-R 300M fine-tuned on TIMIT phonemes).
EN_MODEL = "jonatasgrosman/wav2vec2-large-xlsr-53-english"
PHONE_MODEL = "jimregan/wav2vec2-xls-r-300m-phoneme-timit"
# Use the GPU when Kaggle provides one; HF pipelines take a device index
# (0 = first GPU, -1 = CPU), hence the translation below.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Kaggle's writable output directory — transcription JSONs land here.
OUT = Path("/kaggle/working")
# Two ASR pipelines over the same audio: one emitting English words with
# timestamps, one emitting phoneme symbols.
pipe_en = pipeline("automatic-speech-recognition", model=EN_MODEL, device=0 if DEVICE=="cuda" else -1)
pipe_phone = pipeline("automatic-speech-recognition", model=PHONE_MODEL, device=0 if DEVICE=="cuda" else -1)
# Raw processor/model handles for the English checkpoint. Not used anywhere
# in this chunk — presumably kept for custom logit/alignment work elsewhere
# in the notebook. NOTE(review): confirm these are actually needed.
en_proc = AutoProcessor.from_pretrained(EN_MODEL)
en_model = AutoModelForCTC.from_pretrained(EN_MODEL).to(DEVICE).eval()
# Input dataset: the v2 LibriVox Dubliners MP3s from an attached Kaggle dataset.
V2 = Path("/kaggle/input/download-dubliners/v2")
# Chapter stems already transcribed in a previous session; skipped below.
# A set gives O(1) membership tests (was a list).
DONE = {
    "dubliners_06_joyce_64kb",
    "dubliners_08_joyce_64kb",
    "dubliners_10_joyce_64kb",
    "dubliners_12_joyce_64kb",
    "dubliners_13_joyce_64kb",
    "dubliners_14_joyce_64kb",
}
# Transcribe every MP3 under each input root, writing two JSON files per
# chapter: <root>_<stem>_en.json (word timestamps) and
# <root>_<stem>_phone.json (phoneme output).
for base in [V2]:
    prefix = base.name
    # sorted() makes the processing order deterministic across sessions.
    for mp3 in sorted(base.rglob("*.mp3")):
        if mp3.stem in DONE:
            continue
        stem = f"{prefix}_{mp3.stem}"
        en_path = OUT / f"{stem}_en.json"
        phone_path = OUT / f"{stem}_phone.json"
        # Resume support: a restarted session skips chapters whose outputs
        # already exist instead of redoing hours of GPU transcription
        # (previously only the hand-maintained DONE list prevented this).
        if en_path.exists() and phone_path.exists():
            continue
        en_out = pipe_en(str(mp3), chunk_length_s=10, return_timestamps="word")
        phone_out = pipe_phone(str(mp3), chunk_length_s=10, return_timestamps="word")
        en_path.write_text(json.dumps(en_out, ensure_ascii=False), encoding="utf-8")
        phone_path.write_text(json.dumps(phone_out, ensure_ascii=False), encoding="utf-8")