# Original source (with partial output) is available in the accompanying notebook.

# Notebook shell magic: install CTC-decoding dependencies quietly;
# --no-deps avoids clobbering packages preinstalled in the Kaggle image.
!pip -q install --no-deps pyctcdecode kenlm pygtrie
import os
# Must be set before importing transformers: prevents it from importing
# torchvision, which is not needed for audio models.
os.environ["TRANSFORMERS_NO_TORCHVISION"] = "1"
import json
from pathlib import Path

import torch
import librosa  # NOTE(review): unused in the visible code — confirm before removing

from transformers import AutoProcessor, AutoModelForCTC, pipeline

# Hugging Face model IDs: English ASR and TIMIT phoneme recognition,
# both wav2vec2-family CTC models.
EN_MODEL = "jonatasgrosman/wav2vec2-large-xlsr-53-english"
PHONE_MODEL = "jimregan/wav2vec2-xls-r-300m-phoneme-timit"

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
OUT = Path("/kaggle/working")  # Kaggle's writable output directory

# transformers pipelines take a CUDA device index (0) or -1 for CPU.
pipe_en = pipeline("automatic-speech-recognition", model=EN_MODEL, device=0 if DEVICE=="cuda" else -1)
pipe_phone = pipeline("automatic-speech-recognition", model=PHONE_MODEL, device=0 if DEVICE=="cuda" else -1)

# Raw processor/model handles for the English model, set to eval mode.
# NOTE(review): not used by the transcription loop below — presumably kept
# for later logit-level work; confirm before removing.
en_proc = AutoProcessor.from_pretrained(EN_MODEL)
en_model = AutoModelForCTC.from_pretrained(EN_MODEL).to(DEVICE).eval()

# Input audio roots: two versions of the Dubliners audiobook dataset.
V1 = Path("/kaggle/input/download-dubliners/v1")
V2 = Path("/kaggle/input/download-dubliners/v2")

# Transcribe every MP3 under both dataset versions with the English ASR and
# phoneme pipelines, writing one JSON per (file, model) pair into OUT.
for base in (V1, V2):
    # Version prefix ("v1" / "v2") keeps output names unique when the same
    # chapter exists in both dataset versions.
    prefix = base.name

    # sorted() gives a deterministic processing order; bare rglob yields
    # files in filesystem order, which varies between runs/machines.
    # (Renamed from `file`, which shadowed the `file` builtin.)
    for audio_path in sorted(base.rglob("*.mp3")):
        stem = f"{prefix}_{audio_path.stem}"
        en_json = OUT / f"{stem}_en.json"
        phone_json = OUT / f"{stem}_phone.json"

        # Resume support: skip files whose outputs already exist so an
        # interrupted run does not redo expensive inference.
        if en_json.exists() and phone_json.exists():
            continue

        # Word-level timestamps; 10 s chunks bound memory on long audio.
        en_out = pipe_en(str(audio_path), chunk_length_s=10, return_timestamps="word")
        phone_out = pipe_phone(str(audio_path), chunk_length_s=10, return_timestamps="word")

        en_json.write_text(json.dumps(en_out, ensure_ascii=False), encoding="utf-8")
        phone_json.write_text(json.dumps(phone_out, ensure_ascii=False), encoding="utf-8")