# Original here, with outputs; based on this

!pip -q install --no-deps pyctcdecode kenlm pygtrie
import os
os.environ["TRANSFORMERS_NO_TORCHVISION"] = "1"  # must be set BEFORE the transformers import below takes effect
import json
from pathlib import Path

import torch
import librosa  # NOTE(review): unused in this chunk — verify it isn't needed by other cells before removing

from transformers import AutoProcessor, AutoModelForCTC, pipeline

# ASR checkpoints: English orthographic transcription and TIMIT phoneme recognition.
EN_MODEL = "jonatasgrosman/wav2vec2-large-xlsr-53-english"
PHONE_MODEL = "jimregan/wav2vec2-xls-r-300m-phoneme-timit"

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
OUT = Path("/kaggle/working")  # Kaggle's writable output directory

# HF pipelines take a CUDA device index (0) or -1 for CPU.
pipe_en = pipeline("automatic-speech-recognition", model=EN_MODEL, device=0 if DEVICE=="cuda" else -1)
pipe_phone = pipeline("automatic-speech-recognition", model=PHONE_MODEL, device=0 if DEVICE=="cuda" else -1)

# Bare processor/model for the English checkpoint — unused in this chunk;
# presumably kept for manual logits/alignment work elsewhere. TODO confirm.
en_proc = AutoProcessor.from_pretrained(EN_MODEL)
en_model = AutoModelForCTC.from_pretrained(EN_MODEL).to(DEVICE).eval()

V2 = Path("/kaggle/input/download-dubliners/v2")  # corpus root (Kaggle dataset mount)

# Stems already transcribed in a previous run; the main loop skips these.
# frozenset: O(1) membership tests and signals the collection is fixed.
DONE = frozenset({
    "dubliners_06_joyce_64kb",
    "dubliners_08_joyce_64kb",
    "dubliners_10_joyce_64kb",
    "dubliners_12_joyce_64kb",
    "dubliners_13_joyce_64kb",
    "dubliners_14_joyce_64kb",
})

# Transcribe every MP3 under each corpus root with both pipelines, writing one
# word-timestamped JSON per model per recording into OUT.
for base in [V2]:
    prefix = base.name  # folder name becomes the output-file prefix

    # sorted() makes the processing order deterministic across runs.
    for file in sorted(base.rglob("*.mp3")):
        if file.stem in DONE:
            continue
        stem = f"{prefix}_{file.stem}"
        en_path = OUT / f"{stem}_en.json"
        phone_path = OUT / f"{stem}_phone.json"
        # Resume support: if both outputs already exist, this file was finished
        # by an earlier (possibly interrupted) run — no need to hand-edit DONE.
        if en_path.exists() and phone_path.exists():
            continue

        # chunk_length_s=10 splits long audio into 10 s windows for chunked CTC
        # decoding; return_timestamps="word" yields per-word start/end times.
        # NOTE(review): "word" timestamps on the phoneme model assume its vocab
        # has a word-delimiter token — confirm the phoneme output is sensible.
        en_out = pipe_en(str(file), chunk_length_s=10, return_timestamps="word")
        phone_out = pipe_phone(str(file), chunk_length_s=10, return_timestamps="word")

        en_path.write_text(json.dumps(en_out, ensure_ascii=False), encoding="utf-8")
        phone_path.write_text(json.dumps(phone_out, ensure_ascii=False), encoding="utf-8")