Based on an earlier notebook

%%capture
# Install the Python packages used by this notebook: youtube-dl fetches the
# audio, phonemizer is installed alongside it. %%capture hides pip's output.
# NOTE(review): phonemizer is never imported directly in the visible cells —
# presumably the ASR model's tokenizer needs it; confirm.
!pip install youtube-dl
!pip install phonemizer

%%capture
# espeak-ng is a system package, so it is installed with apt rather than pip.
# NOTE(review): presumably this is the backend used by phonemizer — confirm.
!apt install espeak-ng

# Download only the audio track of the video (-x), keeping the source audio
# format ("best"), and name the file after the video id.
# The URL is quoted so the shell never treats the "?" in it as a glob pattern.
!youtube-dl -x --audio-format best -o '%(id)s.%(ext)s' 'https://www.youtube.com/watch?v=Kw5jkyLGFGc'

[youtube] Kw5jkyLGFGc: Downloading webpage
[youtube] Kw5jkyLGFGc: Downloading MPD manifest
[download] Destination: Kw5jkyLGFGc.m4a
[download] 100% of 10.98MiB in 04:05
[ffmpeg] Correcting container in "Kw5jkyLGFGc.m4a"
[ffmpeg] Post-process file Kw5jkyLGFGc.m4a exists, skipping

%%capture
# Convert the downloaded .m4a to WAV: 16-bit little-endian PCM (pcm_s16le),
# one channel (-ac 1), 16 kHz sample rate (-ar 16000).
# NOTE(review): presumably this is the input format the ASR model used below
# expects — confirm against the model card.
!ffmpeg -i Kw5jkyLGFGc.m4a -acodec pcm_s16le -ac 1 -ar 16000 Kw5jkyLGFGc.wav

The actual speech recognition (ASR) part starts here.

%%capture
# Install Hugging Face transformers, which provides the `pipeline` API used
# below. %%capture hides pip's output.
!pip install transformers

import json

from transformers import pipeline

# Hugging Face model id of the wav2vec2 checkpoint used for recognition.
_SWE_MODEL = "facebook/wav2vec2-lv-60-espeak-cv-ft"

# Build the pipeline for that model on device 0.
pipe = pipeline(model=_SWE_MODEL, device=0)

# Run the converted audio through the pipeline in 10-second chunks,
# requesting character-level timestamps.
output = pipe(
    "/content/Kw5jkyLGFGc.wav",
    chunk_length_s=10,
    return_timestamps="char",
)

# Store the pipeline result as JSON next to the audio file.
with open("/content/Kw5jkyLGFGc.json", "w") as out_file:
    json.dump(output, out_file)