Create AWB splits
Create fixed-duration training splits (5 to 60 minutes) of the CMU ARCTIC AWB dataset, for use in fine-tuning wav2vec 2.0 models.
Original on Kaggle.
# Parse the full AWB train manifest (fairseq-style .tsv: a root-directory header
# line, then "<wav path>\t<frame count>" per utterance) into id -> frame count.
train_frames = {}
total = 0
with open("../input/cmu-us-awb-arctic-fairseq-files/train.tsv") as f:
    for line in f.readlines():
        if "\t" not in line:
            continue  # skip the root-directory header line
        pieces = line.strip().split("\t")
        assert len(pieces) == 2
        total += int(pieces[1])
        id = pieces[0].replace(".wav", "")
        train_frames[id] = int(pieces[1])
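A quick sanity check on what was parsed above:
print(len(train_frames), "training utterances")
list(train_frames.items())[:3]  # a few (utterance id, frame count) pairs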
total / 16000  # total training audio in seconds (16 kHz sample rate)
4295 / 60  # the total from above, in minutes: roughly 71.6
MINS = [i * 5 for i in range(1, 13)]  # target split sizes: 5, 10, ..., 60 minutes
MINS
WAVDIR = "/kaggle/input/ljspeech-for-asr/wav16"
# For each target duration, greedily take utterances in manifest order until the
# frame budget is nearly used up, then top the split up with the longest leftover
# utterance that is no longer than the last one added.
for m in MINS:
    frames = m * 60 * 16000  # frame budget at 16 kHz
    idlist = list(train_frames.keys())
    outtsv = f"{m}mins.tsv"
    with open(outtsv, "w") as of:
        current = 0
        of.write(f"{WAVDIR}\n")  # fairseq manifests start with the audio root dir
        while frames > 0 and frames > current:
            id = idlist.pop(0)
            current = train_frames[id]
            of.write(f"{id}.wav\t{current}\n")
            frames = frames - current
        # Fill the remainder with the longest leftover utterance <= the last one.
        longest = 0
        longest_id = ""
        for id in idlist:
            time = train_frames[id]
            if time > current:
                continue
            if time > longest:
                longest = time
                longest_id = id
        of.write(f"{longest_id}.wav\t{longest}\n")
def fairseqify(text):
    """Convert a transcript to fairseq's letter-target (.ltr) format:
    letters separated by spaces, words separated by '|'."""
    text = text.strip().replace("  ", " ")  # collapse double spaces
    words = text.split(" ")
    spread = [" ".join(a) for a in words]
    return " | ".join(spread) + " |"
# Map utterance id -> letter-level transcript.
transcripts = {}
with open("../input/cmu-us-awb-arctic-fairseq-files/text.tsv") as tf:
    for line in tf.readlines():
        line = line.strip()
        if "\t" not in line:
            continue  # skip blank or malformed lines
        parts = line.split("\t")
        assert len(parts) == 2
        transcripts[parts[0]] = fairseqify(parts[1])
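Before writing the .ltr files, a quick check that every training utterance has a transcript (this assumes text.tsv is keyed by the same utterance ids as train.tsv):
missing = [u for u in train_frames if u not in transcripts]
assert not missing, f"{len(missing)} utterances have no transcript: {missing[:5]}"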
import glob

# For every generated manifest, write the matching .ltr transcript file
# (one line per audio entry, in the same order).
for tsv in glob.glob("*.tsv"):
    out = tsv.replace(".tsv", ".ltr")
    with open(tsv) as inf, open(out, "w") as outf:
        for line in inf.readlines()[1:]:  # skip the root-directory header
            id, _ = line.split("\t")
            id = id.replace(".wav", "")
            outf.write(f"{transcripts[id]}\n")
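Another quick check: each .ltr file should have exactly one line per audio entry in its manifest (the manifest has one extra line, the root-directory header):
for tsv in glob.glob("*mins.tsv"):
    ltr = tsv.replace(".tsv", ".ltr")
    with open(tsv) as a, open(ltr) as b:
        assert len(a.readlines()) - 1 == len(b.readlines()), ltr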
!wget https://dl.fbaipublicfiles.com/fairseq/wav2vec/dict.ltr.txt
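dict.ltr.txt is the letter vocabulary fairseq uses for CTC fine-tuning; a quick peek at its first few entries:
!head -5 dict.ltr.txt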
!for i in *mins.tsv; do b=$(basename $i ".tsv"); mkdir $b; mv $b.tsv $b/train.tsv; mv $b.ltr $b/train.ltr; cp dict.ltr.txt ../input/cmu-us-awb-arctic-fairseq-files/test.* $b/; cp ../input/cmu-us-awb-arctic-fairseq-files/dev.tsv $b/valid.tsv; cp ../input/cmu-us-awb-arctic-fairseq-files/dev.ltr $b/valid.ltr; done
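Each split directory should now contain the files fairseq's wav2vec 2.0 fine-tuning expects (train/valid/test .tsv and .ltr plus dict.ltr.txt); listing one of them to confirm:
!ls 5mins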