%%capture
# Install the Python packages youtube-dl (used for the download command in
# this notebook) and phonemizer; %%capture hides the installer output.
!pip install youtube-dl
!pip install phonemizer
%%capture
# Install the espeak-ng system package.
# NOTE(review): presumably the backend that phonemizer relies on - confirm.
!apt install espeak-ng
# Download only the audio track (-x) of the video, keeping the best available
# audio format, and name the file after the video id (e.g. Kw5jkyLGFGc.m4a).
!youtube-dl -x --audio-format best -o '%(id)s.%(ext)s' https://www.youtube.com/watch?v=Kw5jkyLGFGc
[youtube] Kw5jkyLGFGc: Downloading webpage
[youtube] Kw5jkyLGFGc: Downloading MPD manifest
[download] Destination: Kw5jkyLGFGc.m4a
[download] 100% of 10.98MiB in 04:05
[ffmpeg] Correcting container in "Kw5jkyLGFGc.m4a"
[ffmpeg] Post-process file Kw5jkyLGFGc.m4a exists, skipping
%%capture
# Transcode the downloaded audio to a WAV file: 16-bit little-endian PCM
# (pcm_s16le), mono (-ac 1), 16 kHz sample rate (-ar 16000).
# NOTE(review): presumably the input format the ASR model expects - confirm.
!ffmpeg -i Kw5jkyLGFGc.m4a -acodec pcm_s16le -ac 1 -ar 16000 Kw5jkyLGFGc.wav

The actual automatic speech recognition (ASR) steps start here.

%%capture
# Install the transformers package, which provides the `pipeline` helper
# imported by the recognition code in this notebook.
!pip install transformers
import json

from transformers import pipeline

# Hugging Face Hub id of the checkpoint to load.
# NOTE(review): the id suggests an espeak-based phoneme model rather than a
# Swedish-specific one - confirm against the model card.
_SWE_MODEL = "facebook/wav2vec2-lv-60-espeak-cv-ft"

# Input audio and the file the recognition result is written to.
_WAV_PATH = "/content/Kw5jkyLGFGc.wav"
_JSON_PATH = "/content/Kw5jkyLGFGc.json"

# Build the pipeline for the checkpoint; device=0 is passed straight through
# (per the transformers docs this selects the first CUDA device).
pipe = pipeline(model=_SWE_MODEL, device=0)

# Run recognition on the WAV file, processing it in 10-second chunks and
# requesting character-level timestamps.
output = pipe(
    _WAV_PATH,
    chunk_length_s=10,
    return_timestamps="char",
)

# Persist the raw pipeline result as JSON.
with open(_JSON_PATH, "w") as json_file:
    json.dump(output, json_file)