Convert HSI data to fairseq
Maybe a fine-tuned wav2vec model will work better with WhisperX
# Directory holding the source recordings; each TextGrid is paired with "<stem>.wav" in here.
wav_files = "/Users/joregan/Playing/hsi/audio"
# Directory searched for the TextGrid annotation files.
textgrids = "/Users/joregan/Playing/hsi_ctmedit/textgrid/"
# Directory that receives the clipped audio plus train.tsv and train.ltr.
output = "/tmp/hsifairseq"
from pydub import AudioSegment
from praatio import textgrid
from pathlib import Path
# Path objects for the three configured directories.
tgpath = Path(textgrids)
wavpath = Path(wav_files)
outpath = Path(output)
# Passed as ``parameters=`` to every clip export. These look like ffmpeg flags
# (one channel, 16-bit PCM, 16 kHz) -- TODO confirm against pydub's export docs.
parameters = ["-ac", "1", "-acodec", "pcm_s16le", "-ar", "16000"]
def fsify(text):
    """Convert a transcript into a letter-target line for a fairseq ``.ltr`` file.

    Each word is upper-cased, stripped of surrounding punctuation, and spelled
    out letter by letter with single spaces; every word is followed by ``|``.

    Bracketed tags such as ``[laugh]`` or ``[background noise]`` are removed,
    including tags that contain spaces. Runs of whitespace count as a single
    separator, and tokens that are empty after punctuation stripping are
    dropped.

    Args:
        text: The raw transcript of one interval.

    Returns:
        The letter line, e.g. ``"H I | T H E R E |"``, or ``""`` when no word
        remains.
    """
    # Imported locally: the module-level ``import re`` sits later in the file
    # than the first call to this function.
    import re

    # Remove complete [...] tags up front so the tail of a multi-word tag
    # ("noise]") cannot survive the per-word filter below.
    text = re.sub(r"\[[^]]*\]", " ", text)
    # split() without arguments collapses repeated whitespace; the startswith
    # check still drops an unterminated tag such as "[laugh".
    words = [
        w.strip("\",.;:?!").upper()
        for w in text.split()
        if not w.startswith("[")
    ]
    # A token made only of punctuation is empty by now; it must not become a
    # word of its own.
    words = [w for w in words if w]
    if not words:
        return ""
    return " | ".join(" ".join(w) for w in words) + " |"
# Quick manual check of fsify; the return value is discarded when this runs as a plain script.
fsify("This is a tesst,")
import re
def _select_tier(tg):
    """Return the tier to cut clips from, or None if no expected tier exists.

    A TextGrid with a single tier uses that tier; otherwise the first match
    among "whisperx", "utterances" and "words" is used, in that order.
    """
    names = tg.tierNames
    if len(names) == 1:
        return tg.getTier(names[0])
    for candidate in ("whisperx", "utterances", "words"):
        if candidate in names:
            return tg.getTier(candidate)
    return None


# Matches a label that consists of exactly one bracketed tag, e.g. "[noise]".
# Compiled once, as a raw string, so the backslashes are not treated as
# (invalid) string escapes.
_BRACKET_ONLY = re.compile(r"^\[[^]]+\]$")

# Manifest rows ("<clip name>\t<frame count>") and the matching letter lines;
# the two lists stay index-aligned.
tsv_lines = []
ltr_lines = []

# Create the output directory, including missing parents; no error if it exists.
outpath.mkdir(parents=True, exist_ok=True)

for textgridfile in tgpath.glob("*.[Tt]ext[Gg]rid"):
    tg = textgrid.openTextgrid(textgridfile, includeEmptyIntervals=False)
    tier = _select_tier(tg)
    if tier is None:
        print("Be careful: file", textgridfile, "has none of the expected tier names")
        # Without a tier there is nothing to cut. Falling through would raise
        # NameError or silently reuse the previous file's tier.
        continue

    # The recording shares the TextGrid's stem.
    wavfile = wavpath / f"{textgridfile.stem}.wav"
    wav = AudioSegment.from_wav(str(wavfile))

    for interval in tier.entries:
        start, end, text = interval[0], interval[1], interval[2]
        # Skip intervals whose whole label is a bracketed tag.
        if _BRACKET_ONLY.match(text):
            continue
        # The audio segment is sliced in milliseconds.
        start_ms = int(start * 1000)
        end_ms = int(end * 1000)
        wavname = f"{textgridfile.stem}_{start_ms}_{end_ms}.wav"
        # set_frame_rate returns a new segment, so the result must be kept;
        # otherwise frame_count() below is taken at the source sample rate
        # while the exported file is 16 kHz.
        clip = wav[start_ms:end_ms].set_frame_rate(16000)
        clip.export(str(outpath / wavname), format="wav", parameters=parameters)
        tsv_lines.append(f"{wavname}\t{int(clip.frame_count())}")
        ltr_lines.append(fsify(text))

with open(outpath / "train.tsv", "w", encoding="utf-8") as tsvf:
    # fairseq's manifest reader consumes the first line as the root directory
    # that the per-clip paths are relative to.
    tsvf.write("\n".join([str(outpath.resolve())] + tsv_lines))
with open(outpath / "train.ltr", "w", encoding="utf-8") as ltrf:
    ltrf.write("\n".join(ltr_lines))