# Directory holding the source recordings; each TextGrid is paired with
# "<stem>.wav" from this directory.
wav_files = "/Users/joregan/Playing/hsi/audio"
# Directory scanned for *.TextGrid transcription files.
textgrids = "/Users/joregan/Playing/hsi_ctmedit/textgrid/"
# Directory that receives the clipped WAV files plus train.tsv / train.ltr.
output = "/tmp/hsifairseq"

from pydub import AudioSegment

from praatio import textgrid

from pathlib import Path

# Wrap the configured directory strings as Path objects for joining/globbing.
tgpath, wavpath, outpath = (Path(d) for d in (textgrids, wav_files, output))

# Extra ffmpeg arguments passed to pydub on export:
# one channel, 16-bit little-endian PCM, 16000 Hz sample rate.
parameters = "-ac 1 -acodec pcm_s16le -ar 16000".split()

def fsify(text):
    """Convert a transcript into upper-case letters with "|" word boundaries.

    The text is split on any whitespace. Tokens that start with "[" are
    dropped, and the remaining tokens have leading/trailing punctuation
    (double quote, comma, period, semicolon, colon, question mark,
    exclamation mark) stripped before being upper-cased. Tokens that are
    empty after stripping (e.g. a stand-alone ",") are skipped so they do
    not produce an empty word in the output.

    Args:
        text: Transcript of a single interval.

    Returns:
        The letters of each word separated by spaces, with every word
        followed by "|", e.g. "T H I S | I S |". An empty string is
        returned when no words remain.
    """
    # split() without an argument collapses runs of whitespace, so double
    # spaces never yield empty tokens.
    cleaned = (
        token.strip("\",.;:?!").upper()
        for token in text.split()
        if not token.startswith("[")
    )
    spelled = [" ".join(word) for word in cleaned if word]
    if not spelled:
        return ""
    return " | ".join(spelled) + " |"

# Sanity check carried over from the notebook: the call and its expected
# output used to be two separate no-op expressions; compare them for real.
assert fsify("This is a tesst,") == 'T H I S | I S | A | T E S S T |'

import re

# Manifest rows (tsv) and letter transcripts (ltr). The clipping loop appends
# to both in lockstep, so line i of one describes line i of the other.
tsv_lines = []
ltr_lines = []

# parents=True also creates missing ancestor directories; exist_ok=True
# replaces the separate exists() check, which could race with another process.
outpath.mkdir(parents=True, exist_ok=True)

# Matches labels that consist solely of one bracketed tag (e.g. "[noise]").
# Compiled once, and written as a raw string so the backslashes are not
# interpreted as (invalid) string escapes.
bracket_only = re.compile(r"^\[[^]]+\]$")

# Cut every usable interval of every TextGrid into its own WAV clip and record
# one manifest row plus one letter transcript per clip.
for textgridfile in tgpath.glob("*.[Tt]ext[Gg]rid"):
    tg = textgrid.openTextgrid(textgridfile, includeEmptyIntervals=False)
    # The recording is looked up by the TextGrid's stem in the WAV directory.
    wavfile = wavpath / f"{textgridfile.stem}.wav"
    wav = AudioSegment.from_wav(str(wavfile))

    # Choose the tier to read: the only tier when there is just one,
    # otherwise the first known tier name in order of preference.
    if len(tg.tierNames) == 1:
        tier = tg.getTier(tg.tierNames[0])
    elif "whisperx" in tg.tierNames:
        tier = tg.getTier("whisperx")
    elif "utterances" in tg.tierNames:
        tier = tg.getTier("utterances")
    elif "words" in tg.tierNames:
        tier = tg.getTier("words")
    else:
        print("Be careful: file", textgridfile, "has none of the expected tier names")
        # Skip this file: falling through would raise NameError on the first
        # file, or silently reuse the tier of the previously processed file.
        continue

    for start, end, text in tier.entries:
        # Intervals that are only a bracketed tag have nothing to transcribe.
        if bracket_only.match(text):
            continue
        start_ms = int(start * 1000)
        end_ms = int(end * 1000)

        wavname = f"{textgridfile.stem}_{start_ms}_{end_ms}.wav"
        # set_frame_rate() returns a new segment instead of resampling in
        # place; keep the result so the frame count written to the manifest
        # is measured at 16000 Hz, the rate of the exported file.
        clip = wav[start_ms:end_ms].set_frame_rate(16000)
        # export() returns the open output file; close it explicitly so
        # handles do not pile up over many clips.
        clip.export(str(outpath / wavname), format="wav", parameters=parameters).close()
        tsv_lines.append(f"{wavname}\t{int(clip.frame_count())}")
        ltr_lines.append(fsify(text))

# Write the manifest and the transcripts, one entry per line.
# The encoding is explicit so non-ASCII letters do not depend on the platform
# default, and every line (including the last) is newline-terminated so the
# files can be concatenated or line-counted safely.
# NOTE(review): fairseq audio manifests normally begin with a root-directory
# line; none is written here -- confirm whether it is added downstream.
with open(outpath / "train.tsv", "w", encoding="utf-8") as tsvf:
    tsvf.writelines(f"{line}\n" for line in tsv_lines)

with open(outpath / "train.ltr", "w", encoding="utf-8") as ltrf:
    ltrf.writelines(f"{line}\n" for line in ltr_lines)