Liepa to fairseq
Convert the liepa2 corpus to fairseq
import soundfile as sf

# Root of the split Liepa corpus: holds text.tsv plus one "<id>.wav" per id.
BASE = "/mnt/cloud/liepa-split2/"
RAW = f"{BASE}/text.tsv"

# Both dicts are keyed by utterance id and always share the same key set:
#   text   -> raw transcript (second column of text.tsv)
#   frames -> length of the matching wav file, in frames
text = {}
frames = {}

with open(RAW, encoding="utf-8") as inf:
    for line in inf:
        # Each valid row is exactly "<id>\t<transcript>".
        parts = line.rstrip("\n").split("\t")
        if len(parts) != 2:
            # Report the malformed row and skip it instead of crashing on
            # a missing column further down.
            print(line)
            continue
        utt_id, transcript = parts

        # Only the header is needed; this avoids decoding the whole file
        # just to count its frames.
        info = sf.info(f"{BASE}/{utt_id}.wav")
        if info.samplerate != 16000:
            # Report and skip audio that is not sampled at 16 kHz.
            print(line)
            continue

        # Record both values together so every id in `text` has `frames`.
        text[utt_id] = transcript
        frames[utt_id] = info.frames
import re
def cleantext(text):
    """Normalise a transcript to lowercase letters separated by single spaces.

    The text is lowercased, hyphens are turned into spaces, every character
    that is neither a space nor in the allowed alphabet is dropped, and runs
    of spaces are collapsed to one.

    Args:
        text: Raw transcript string.

    Returns:
        The cleaned transcript with no leading or trailing spaces; an empty
        string if nothing survives the filtering.
    """
    ALPHA = "aąbcčdeęėfghiįyjklmnoprsštuųūvzžqx"
    text = text.lower().replace("-", " ")
    kept = "".join(ch for ch in text if ch in ALPHA or ch == " ")
    # Only letters and spaces remain here, so split/join collapses space
    # runs. The previous pattern " *" also matched the empty string and
    # therefore inserted a space between every pair of characters.
    return " ".join(kept.split())
# Write one "<id>\t<frame count>\t<normalised transcript>" row per utterance.
# The encoding is explicit because the transcripts contain non-ASCII
# Lithuanian letters and the platform default is not guaranteed to be UTF-8.
with open(f"{BASE}/frames-normtext.tsv", "w", encoding="utf-8") as outf:
    for utt_id, raw_text in text.items():
        norm = cleantext(raw_text)
        outf.write(f"{utt_id}\t{frames[utt_id]}\t{norm}\n")
import IPython
def playwav(id):
    """Return an IPython audio widget for the wav file of utterance `id`.

    The file is looked up as "<id>.wav" under whatever BASE currently is.
    """
    wav_path = f"{BASE}/{id}.wav"
    player = IPython.display.Audio(wav_path)
    return player
Make the splits; we only want 100 hours of training data.
# Per-split audio budgets, expressed in frames:
# hours * minutes/hour * seconds/minute * 16000 frames/second.
TRAIN = 100 * 60 * 60 * 16000
TEST = 5 * 60 * 60 * 16000
VALID = 5 * 60 * 60 * 16000
Not entirely sure about the speaker IDs; in the worst case there are only 10 speakers: F1-F5 and M1-M5.
import contextlib
import os

# BASE is where the split files are written; OBASE is the corpus directory
# that holds frames-normtext.tsv.
BASE = "/tmp/outp"
OBASE = "/mnt/cloud/liepa-split2/"

# The output directory may not exist yet.
os.makedirs(BASE, exist_ok=True)

# Frames still available per split. Working on a copy keeps TRAIN/VALID/TEST
# intact, so re-running this step produces the same splits instead of
# starting from already-exhausted budgets.
remaining = {"train": TRAIN, "valid": VALID, "test": TEST}

# NOTE(review): rows are written as "<id>\t<frames>" and the transcript is
# written verbatim; confirm this is the layout the downstream fairseq
# tooling expects (root-directory header line, file extensions, letter
# tokenisation).
with contextlib.ExitStack() as stack:
    # Open the input first so a missing input does not leave behind
    # freshly truncated output files.
    inf = stack.enter_context(
        open(f"{OBASE}/frames-normtext.tsv", encoding="utf-8")
    )
    manifests = {}
    for split in remaining:
        tsv = stack.enter_context(
            open(f"{BASE}/{split}.tsv", "w", encoding="utf-8")
        )
        ltr = stack.enter_context(
            open(f"{BASE}/{split}.ltr", "w", encoding="utf-8")
        )
        manifests[split] = (tsv, ltr)

    for line in inf:
        parts = line.strip().split("\t")
        if len(parts) != 3:
            # Malformed row (this includes rows whose transcript is empty
            # after stripping): report it and move on.
            print(line)
            continue
        utt_id, n_frames, transcript = parts[0], int(parts[1]), parts[2]

        # Hold out whole speakers: M4 -> test, M5 -> valid, the rest -> train.
        if "_M4_" in utt_id:
            split = "test"
        elif "_M5_" in utt_id:
            split = "valid"
        else:
            split = "train"

        # Skip utterances that would use up the rest of the split's budget;
        # shorter ones later in the file may still fit.
        if remaining[split] - n_frames <= 0:
            continue

        tsv, ltr = manifests[split]
        tsv.write(f"{utt_id}\t{n_frames}\n")
        ltr.write(f"{transcript}\n")
        remaining[split] -= n_frames