# Directory holding the corpus: text.tsv plus one "<id>.wav" per utterance.
BASE = "/mnt/cloud/liepa-split2/"
# Transcript table. BASE already ends with "/", so this path contains a
# doubled slash.
RAW = BASE + "/text.tsv"
import soundfile as sf
# Load the transcript and the audio length of every utterance listed in RAW.
text = {}    # utterance id -> raw transcript (second TSV column, newline included)
frames = {}  # utterance id -> len() of the audio array returned by sf.read
with open(RAW) as inf:
    for line in inf:
        parts = line.split("\t")
        # Each row must be exactly "<id>\t<transcript>". Report and skip
        # anything else instead of indexing into a malformed row.
        if len(parts) != 2:
            print(line)
            continue
        utt_id, transcript = parts
        data, sr = sf.read(f"{BASE}/{utt_id}.wav")
        # The split budgets are counted in samples at 16 kHz, so audio with a
        # different sample rate is reported and left out.
        if sr != 16000:
            print(line)
            continue
        # Store both entries together so `text` and `frames` always share the
        # same set of keys.
        text[utt_id] = transcript
        frames[utt_id] = len(data)
import re
def cleantext(text):
    """Normalise a transcript to lowercase letters separated by single spaces.

    The text is lowercased and hyphens are turned into spaces. Every
    character that is neither in ALPHA nor a space is then dropped (not
    replaced), runs of spaces are collapsed to one, and leading/trailing
    spaces are removed.
    """
    ALPHA = "aąbcčdeęėfghiįyjklmnoprsštuųūvzžqx"
    allowed = set(ALPHA) | {" "}
    lowered = text.lower().replace("-", " ")
    kept = "".join(ch for ch in lowered if ch in allowed)
    return re.sub("  *", " ", kept).strip()
# Write one "<id>\t<frame count>\t<normalised transcript>" row per utterance.
with open(f"{BASE}/frames-normtext.tsv", "w") as outf:
    for id, raw in text.items():
        norm = cleantext(raw)
        outf.write(f"{id}\t{frames[id]}\t{norm}\n")
import IPython
def playwav(id):
    """Return an IPython Audio object for the file "<BASE>/<id>.wav".

    BASE is looked up when the function is called, so the path follows
    whatever value BASE holds at that moment.
    """
    wav_path = f"{BASE}/{id}.wav"
    return IPython.display.Audio(wav_path)

# Make the splits; we only want 100 hours of training data.

# Per-split budgets, counted in audio samples:
# hours * 3600 seconds per hour * 16000 samples per second.
TRAIN = 100 * 3600 * 16000
TEST = 5 * 3600 * 16000
VALID = 5 * 3600 * 16000

# Not entirely sure about the speaker IDs; in the worst case there are only 10 speakers: F1-F5 and M1-M5.

# Write the train/valid/test manifests from frames-normtext.tsv.
#
# Utterances whose id contains "_M4_" go to test, those containing "_M5_" go
# to valid, and everything else goes to train. Each split stops accepting
# utterances once its sample budget (TEST / VALID / TRAIN) would be used up.
#
# NOTE: this rebinds the module-level BASE to the output directory.
BASE = "/tmp/outp"
OBASE = "/mnt/cloud/liepa-split2/"

# Track the remaining budget separately so TRAIN/TEST/VALID keep their values
# and this section can be run again.
remaining = {"train": TRAIN, "valid": VALID, "test": TEST}

with open(f"{BASE}/train.tsv", "w") as traintsv,\
     open(f"{BASE}/train.ltr", "w") as trainltr,\
     open(f"{BASE}/valid.tsv", "w") as validtsv,\
     open(f"{BASE}/valid.ltr", "w") as validltr,\
     open(f"{BASE}/test.tsv", "w") as testtsv,\
     open(f"{BASE}/test.ltr", "w") as testltr,\
     open(f"{OBASE}/frames-normtext.tsv") as inf:
    # split name -> (manifest file, transcript file)
    outputs = {
        "train": (traintsv, trainltr),
        "valid": (validtsv, validltr),
        "test": (testtsv, testltr),
    }
    for line in inf:
        parts = line.strip().split("\t")
        # Rows must be "<id>\t<frames>\t<text>"; report and skip the rest.
        # A row with an empty transcript loses its trailing tab to strip()
        # and is therefore skipped here as well.
        if len(parts) != 3:
            print(line)
            continue
        # Loop-local names, so the module-level `text` and `frames` dicts
        # (and the builtin `id`) are not overwritten.
        utt_id = parts[0]
        n_frames = int(parts[1])
        transcript = parts[2]

        if "_M4_" in utt_id:
            split = "test"
        elif "_M5_" in utt_id:
            split = "valid"
        else:
            split = "train"

        # Skip utterances that would use up or exceed the split's budget.
        if remaining[split] - n_frames <= 0:
            continue

        tsv_out, ltr_out = outputs[split]
        tsv_out.write(f"{utt_id}\t{n_frames}\n")
        ltr_out.write(f"{transcript}\n")
        remaining[split] -= n_frames