Adapt `cmu_us_awb_arctic` to fairseq
Writing the tsv/ltr files; from Kaggle
# Transcript file of the dataset, addressed relative to the working directory.
RAWTEXT = "../input/cmu-us-awb-arctic-tts-dataset/cmu_us_awb_arctic/etc/txt.done.data"

# Whole-token substitutions: a lower-cased, space-delimited token on the left
# is replaced by the spelled-out form on the right.
NORMS = {
    # Decimals and years.
    "0.75": "zero point seven five",
    "1880": "eighteen eighty",
    "1908": "nineteen oh eight",
    # Numbers read as ordinals.
    "16": "sixteenth",
    "17": "seventeenth",
    "18": "eighteenth",
    "29th": "twenty ninth",
    # Abbreviations and initials.
    "t.h": "t h",
    "mrs": "misses",
    "etc": "etcetera",
    "etc.": "etcetera",
    # Archaic hyphenated spellings.
    "to-day": "today",
    "to-day's": "today's",
    "to-morrow": "tomorrow",
}
def _check_apos(word):
if word.endswith("'s"):
return word
elif word.endswith("s'"):
return word
elif word.endswith("'d"):
return word
elif word.endswith("'ve"):
return word
elif word.endswith("'re"):
return word
elif word.endswith("'ll"):
return word
elif word.endswith("n't"):
return word
elif word.endswith("'ve"):
return word
elif word in ["i'm", "'em", "o'brien"]:
return word
else:
return word.replace("'", "")
def fix_apos(text):
    """Run ``_check_apos`` over every single-space-separated token of *text*.

    The tokens are re-joined with single spaces, so the number and position
    of the separators in *text* are preserved.
    """
    return " ".join(map(_check_apos, text.split(" ")))
def normalise(text):
    """Convert one raw transcript sentence to upper-case, punctuation-free text.

    Steps, in order: drop one sentence-final full stop, lower-case, remove
    commas, apply the ``NORMS`` token substitutions, remove the remaining
    ``. ? ! : ;`` characters, turn dashes into spaces, clean apostrophes with
    ``fix_apos``, and finally upper-case the result.

    Args:
        text: The raw sentence. May be empty.

    Returns:
        The normalised sentence in upper case, with words separated by exactly
        one space and no leading or trailing whitespace. An empty input gives
        an empty string.
    """
    # endswith() instead of text[-1] so an empty sentence does not raise.
    if text.endswith("."):
        text = text[:-1]
    text = text.lower().replace(",", "")

    # Substitutions run before the remaining punctuation is stripped so that
    # keys containing a dot ("0.75", "t.h", "etc.") can still match.
    text = " ".join(NORMS.get(word, word) for word in text.split(" "))

    for mark in (".", "?", "!", ":", ";"):
        text = text.replace(mark, "")
    text = text.replace("--", " ")

    # Collapse whitespace runs (e.g. those left by "--") so the two
    # space-sensitive patterns below match and no empty token survives.
    text = " ".join(text.split())
    text = text.replace(" - ", " ")
    text = text.replace("to- morrow", "tomorrow")

    text = fix_apos(text)

    # Any hyphen still present joins two words; split them apart.
    text = text.replace("-", " ")

    # Collapse once more: the replacements above can create new double spaces.
    return " ".join(text.split()).upper()
# Utterance id -> normalised transcript, in the order the file lists them.
data = {}
with open(RAWTEXT) as inf:
    # Iterate the file lazily instead of loading every line first.
    for line in inf:
        first_quote = line.find('"')
        last_quote = line.rfind('"')
        # Skip lines without a non-empty quoted transcript (blank lines, a
        # single stray quote, or ""); these would otherwise store a bogus
        # entry or fail inside normalise().
        if first_quote == -1 or last_quote - first_quote < 2:
            continue
        # The id sits between the first space and the opening quote.
        first_space = line.find(" ")
        utt_id = line[first_space + 1:first_quote].strip()
        data[utt_id] = normalise(line[first_quote + 1:last_quote])

# One "<id>\t<transcript>" row per utterance.
with open("text.tsv", "w") as of:
    for utt_id, transcript in data.items():
        of.write(f"{utt_id}\t{transcript}\n")
from pathlib import Path
import soundfile as sf
# Sample rate every recording is required to have.
EXPECTED_SR = 16000

# Running count of audio frames across all files.
total = 0
WAVPATH = Path("../input/cmu-us-awb-arctic-tts-dataset/cmu_us_awb_arctic/wav/")
# One "<file name>\t<frame count>" row per recording.
with open("frames.tsv", "w") as of:
    # sorted(): glob() yields files in a filesystem-dependent order, and
    # frames.tsv is later split by row position, so a fixed order is needed
    # for the split to be reproducible.
    for wav in sorted(WAVPATH.glob("*.wav")):
        frames, sr = sf.read(str(wav))
        assert sr == EXPECTED_SR, f"{wav}: sample rate {sr}, expected {EXPECTED_SR}"
        total += len(frames)
        of.write(f"{wav.stem}.wav\t{len(frames)}\n")
# Frames divided by frames-per-second: total audio length in seconds.
print("Total:", total / EXPECTED_SR)
# Positional split of frames.tsv: the last HELD_OUT rows are divided between
# test (first half) and dev (second half); every earlier row is train.
HELD_OUT = 114
HALF = HELD_OUT // 2

with open("frames.tsv") as manifest:
    rows = manifest.readlines()

# Row count as a one-item list of text, the same shape the shell capture
# `wc -l ... | awk '{print $1}'` produced.
lines = [str(len(rows))]

held_out = rows[-HELD_OUT:]
splits = {
    "test.tsv": held_out[:HALF],
    "dev.tsv": held_out[-HALF:],
    # Derived from the real row count rather than a hard-coded total, so the
    # train split never overlaps the held-out rows.
    "train.tsv": rows[:max(len(rows) - HELD_OUT, 0)],
}
for filename, chunk in splits.items():
    with open(filename, "w") as out:
        out.writelines(chunk)
def do_fairseq(text):
    """Spell *text* out letter by letter with ``|`` closing every word.

    Each word becomes its characters separated by single spaces; words are
    joined with ``" | "`` and a final ``" |"`` is appended, e.g.
    ``"HI YOU"`` -> ``"H I | Y O U |"``.

    Args:
        text: Words separated by whitespace.

    Returns:
        The letter-level string. Text with no words yields ``" |"``.
    """
    # split() with no argument ignores leading, trailing and repeated
    # whitespace, so no empty "word" produces a stray separator.
    spelled = [" ".join(word) for word in text.split()]
    return " | ".join(spelled) + " |"
# For every split, write a .ltr file whose rows line up one-to-one with the
# rows of the matching .tsv file.
for part in ["test", "train", "dev"]:
    with open(f"{part}.ltr", "w") as of, open(f"{part}.tsv") as inf:
        for line in inf:
            # Rows without a tab carry no "<file>\t<frames>" pair; skip them.
            if "\t" not in line:
                continue
            wav_name = line.strip().split("\t")[0]
            utt_id = wav_name.replace(".wav", "")
            # A file with no transcript in `data` raises KeyError here.
            of.write(do_fairseq(data[utt_id]) + "\n")