OWSM-CTC with CTCSegmentation for Irish
tl;dr: OWSM-CTC is good enough for forced alignment of Irish.
import requests
from bs4 import BeautifulSoup

def get_page_text_and_audio(url, poetry=True):
    """Scrape the text and audio URL from a leighleat.com page."""
    req = requests.get(url)
    if req.status_code != 200:
        return None
    soup = BeautifulSoup(req.text, 'html.parser')
    page_text = soup.find("div", {"class": "page-text"})
    audio_file = ""
    audio = page_text.find("audio")
    if audio is not None:
        source = audio.find("source")
        if source is not None:
            audio_file = "https://www.leighleat.com" + source["src"]
        # drop the <audio> element so it doesn't leak into the text
        audio.decompose()
    if poetry:
        # poems: keep the text with its line breaks
        out_text = page_text.text.strip()
    else:
        # prose pages aren't handled here
        out_text = ""
    return out_text, audio_file
page_text, audio_url = get_page_text_and_audio("https://www.leighleat.com/poems/26")
audio_file = audio_url.split("/")[-1]
!wget {audio_url} -O {audio_file}
%%capture
# convert the MP3 to 16 kHz mono WAV, which is what the model expects
wav_file = audio_file.replace(".mp3", ".wav")
!ffmpeg -i {audio_file} -acodec pcm_s16le -ac 1 -ar 16000 {wav_file}
This was written for Colab, but the free-tier GPU doesn't have enough memory for CTCSegmentation.
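A quick sanity check before loading anything (a sketch, assuming PyTorch with CUDA is available):

import torch

# report free and total GPU memory in GiB; CTCSegmentation with the 1B model
# needs more than the free Colab tier typically offers
free, total = torch.cuda.mem_get_info()
print(f"GPU memory: {free / 2**30:.1f} GiB free of {total / 2**30:.1f} GiB")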
#%pip install git+https://github.com/pyf98/espnet@owsm-ctc
#%pip install espnet_model_zoo flash-attn
Not a single word of the first-pass transcription below was correct.
import soundfile as sf
import numpy as np
import librosa
from espnet2.bin.s2t_inference_ctc import Speech2TextGreedySearch

s2t = Speech2TextGreedySearch.from_pretrained(
    "pyf98/owsm_ctc_v3.1_1B",
    device="cuda",
    generate_interctc_outputs=False,
    lang_sym='<gle>',   # Irish
    task_sym='<asr>',
)
speech, rate = sf.read(wav_file)
# OWSM works on fixed 30-second windows; pad (or trim) to 16000 * 30 samples
speech = librosa.util.fix_length(speech, size=(16000 * 30))
res = s2t(speech)[0]
print(res)
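To put a number on how bad it is, you could score the hypothesis against the scraped text (a sketch, assuming the jiwer package is installed and that the first element of the result tuple is the decoded text):

import jiwer

# rough WER of the first-pass transcription against the poem text
hyp = res[0]
ref = " ".join(page_text.split("\n"))
print(f"WER: {jiwer.wer(ref.lower(), hyp.lower()):.2f}")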
# kaldi-style text: one "uttid text" entry per poem line
utt_text = [f"utt{x} {y}" for x, y in enumerate(page_text.split("\n"), start=1)]
# the aligner loads the checkpoint from a local exp/ directory,
# so link the downloaded model snapshot into place if needed
#!ln -sd owsm_ctc_v3.1_1B/exp/
# re-read the full recording: `speech` was padded/trimmed to 30 seconds above
speech, rate = sf.read(wav_file)
from espnet2.bin.s2t_ctc_align import CTCSegmentation

aligner = CTCSegmentation(
    s2t_model_file="exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/valid.total_count.ave_5best.till45epoch.pth",
    fs=16000,
    ngpu=1,
    batch_size=16,  # batched parallel decoding; reduce it if your GPU memory is smaller
    kaldi_style_text=True,
    time_stamps="fixed",
    samples_to_frames_ratio=1280,  # 80ms time shift; don't change as it depends on the pre-trained model
    lang_sym="<gle>",
    task_sym="<asr>",
    context_len_in_secs=2,  # left and right context in buffered decoding
    frames_per_sec=12.5,  # 80ms time shift; don't change as it depends on the pre-trained model
)
print(f"speech duration: {len(speech) / rate : .2f} seconds")
segments = aligner(speech, utt_text)
for segment in str(segments).split("\n"):
parts = segment.split(" ")
print(" ".join(parts[0:5]))
def segments_to_audacity(segments, filename):
    """Write the segments as an Audacity label track (tab-separated start, end, label)."""
    txt_segments = str(segments).split("\n")
    with open(filename, "w") as outf:
        for segment in txt_segments:
            if segment == "":
                continue
            parts = segment.split(" ")
            start = parts[2]
            end = parts[3]
            text = " ".join(parts[5:])
            outparts = "\t".join([start, end, text])
            outf.write(outparts + "\n")
segments_to_audacity(segments, wav_file.replace(".wav", ".tsv"))
Labels adjusted with Audacity (File → Import → Labels): the timings aren't perfect. The start and end times of the adjusted label file:
!cat 'damhan%20alla.txt' | awk -F'\t' '{print $1 "\t" $2}'
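To reuse the corrected labels downstream, something like this should work (a sketch; read_audacity_labels is a hypothetical helper, and the filename is the adjusted label file above):

def read_audacity_labels(filename):
    """Read an Audacity label track back into (start, end, text) tuples."""
    labels = []
    with open(filename) as inf:
        for line in inf:
            line = line.rstrip("\n")
            if line == "":
                continue
            start, end, text = line.split("\t")
            labels.append((float(start), float(end), text))
    return labels

labels = read_audacity_labels("damhan%20alla.txt")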