import requests
from bs4 import BeautifulSoup

def get_page_text_and_audio(url, poetry=True):
    req = requests.get(url)
    if req.status_code != 200:
        return None
    soup = BeautifulSoup(req.text, 'html.parser')

    page_text = soup.find("div", {"class": "page-text"})

    audio_file = ""
    audio = page_text.find("audio")
    if audio is not None:
        source = audio.find("source")
        if source is not None:
            audio_file = "https://www.leighleat.com" + source["src"]
        # drop the <audio> element so it doesn't leak into the extracted text
        audio.decompose()

    if poetry:
        out_text = page_text.text.strip()
    else:
        # prose handling isn't implemented yet; fall back to the same extraction
        out_text = page_text.text.strip()

    return out_text, audio_file
page_text, audio_url = get_page_text_and_audio("https://www.leighleat.com/poems/26")
audio_file = audio_url.split("/")[-1]
!wget {audio_url} -O {audio_file}
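
Outside a notebook, the same download can be done in plain Python. A minimal sketch using requests (streaming and the chunk size are just sensible defaults, not anything the site requires):

import requests

def download(url, filename):
    # stream to disk instead of holding the whole file in memory
    with requests.get(url, stream=True) as resp:
        resp.raise_for_status()
        with open(filename, "wb") as outf:
            for chunk in resp.iter_content(chunk_size=8192):
                outf.write(chunk)

download(audio_url, audio_file)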
%%capture
wav_file = audio_file.replace(".mp3", ".wav")
!ffmpeg -i {audio_file} -acodec pcm_s16le -ac 1 -ar 16000 {wav_file}
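
If ffmpeg isn't available, the conversion can be done in Python instead; a sketch with librosa (this assumes the librosa install has a backend that can decode MP3):

import librosa
import soundfile as sf

# load as mono 16 kHz, matching the ffmpeg flags above, then write 16-bit PCM
samples, _ = librosa.load(audio_file, sr=16000, mono=True)
sf.write(wav_file, samples, 16000, subtype="PCM_16")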

This was for Colab, but the free GPU doesn't have enough RAM for CTCSegmentation.

#%pip install git+https://github.com/pyf98/espnet@owsm-ctc
#%pip install espnet_model_zoo flash-attn

Not a single word of this was correct.

import soundfile as sf
import numpy as np
import librosa
from espnet2.bin.s2t_inference_ctc import Speech2TextGreedySearch


s2t = Speech2TextGreedySearch.from_pretrained(
    "pyf98/owsm_ctc_v3.1_1B",
    device="cuda",
    generate_interctc_outputs=False,
    lang_sym='<gle>',  # ISO 639-3 code for Irish
    task_sym='<asr>',
)

speech, rate = sf.read(wav_file)

# OWSM expects fixed 30-second inputs, so pad (or trim) to 16000 * 30 samples
speech = librosa.util.fix_length(speech, size=(16000 * 30))

res = s2t(speech)[0]
print(res)
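
This clip fits in a single 30-second window; for longer recordings, the OWSM-CTC model card uses batch_decode for buffered long-form recognition (the batch size and context length below are the values from that example):

# long-form decoding in overlapping chunks; accepts a path or a 1-D array
res = s2t.batch_decode(
    wav_file,
    batch_size=16,
    context_len_in_secs=4,
)
print(res)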
# kaldi-style text for the aligner: one "utt-id text" line per poem line
utt_text = [f"utt{x} {y}" for x, y in enumerate(page_text.split("\n"), start=1)]

The rest was run locally: the model repo was cloned with Git LFS, and its exp/ directory symlinked so CTCSegmentation can find the checkpoint.

#!ln -sd owsm_ctc_v3.1_1B/exp/
import soundfile as sf
speech, rate = sf.read(wav_file)
from espnet2.bin.s2t_ctc_align import CTCSegmentation

aligner = CTCSegmentation(
    s2t_model_file="exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/valid.total_count.ave_5best.till45epoch.pth",
    fs=16000,
    ngpu=1,
    batch_size=16,    # batched parallel decoding; reduce it if your GPU memory is smaller
    kaldi_style_text=True,
    time_stamps="fixed",
    samples_to_frames_ratio=1280,   # 80ms time shift; don't change as it depends on the pre-trained model
    lang_sym="<gle>",
    task_sym="<asr>",
    context_len_in_secs=2,  # left and right context in buffered decoding
    frames_per_sec=12.5,    # 80ms time shift; don't change as it depends on the pre-trained model
)

print(f"speech duration: {len(speech) / rate : .2f} seconds")

segments = aligner(speech, utt_text)
for segment in str(segments).split("\n"):
    parts = segment.split(" ")
    print(" ".join(parts[0:5]))
utt1 utt 0.28 1.24 -1.2300
utt2 utt 3.18 4.04 -0.8518
utt3 utt 4.14 5.00 -1.3033
utt4 utt 5.18 6.12 -1.4109
utt5 utt 6.14 7.16 -1.6551
utt6 utt 7.50 8.68 -1.0598
utt7 utt 8.94 10.12 -0.9344
utt8 utt 10.46 11.96 -0.6786
utt9 utt 12.54 14.68 -0.8216
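
Audacity's label track format is just tab-separated start, end, and label text, so the segments string converts directly: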

def segments_to_audacity(segments, filename):
    txt_segments = str(segments).split("\n")
    with open(filename, "w") as outf:
        for segment in txt_segments:
            if segment == "":
                continue
            # segment fields: utt-id, name, start, end, score, text...
            parts = segment.split(" ")
            start = parts[2]
            end = parts[3]
            text = " ".join(parts[5:])
            outparts = "\t".join([start, end, text])
            outf.write(outparts + "\n")
segments_to_audacity(segments, wav_file.replace(".wav", ".tsv"))
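
The resulting file can be loaded in Audacity as a label track (File → Import → Labels...), nudged by hand, and exported again the same way.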

Labels adjusted with Audacity: the automatic timings aren't perfect.

!cat 'damhan%20alla.txt' | awk -F'\t' '{print $1 "\t" $2}'
0.280000	1.240000
3.398818	4.258818
4.324453	5.184453
5.264552	6.204552
6.140000	7.160000
7.694345	8.874345
9.068335	10.248335
10.630345	12.130345
12.540000	14.680000
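
To reuse the corrected boundaries downstream, the exported labels can be read back into Python; a minimal sketch (read_audacity_labels is a hypothetical helper; the filename matches the export above):

def read_audacity_labels(filename):
    labels = []
    with open(filename) as inf:
        for line in inf:
            line = line.rstrip("\n")
            if not line:
                continue
            # Audacity label rows are "start<TAB>end<TAB>text"
            start, end, text = line.split("\t")
            labels.append((float(start), float(end), text))
    return labels

labels = read_audacity_labels("damhan%20alla.txt")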