Fix LJSpeech text
LJSpeech comes with a normalised version of each transcript, but it needs some extra work before it can be used.
Updated here
# Location of the LJSpeech metadata file; it is opened for reading and split
# on "|" further down in this script.
PATH = "/home/joregan/ljspeech/LJSpeech-1.1/metadata.csv"
def fix_text(text):
    """Reduce a transcript to lowercase ASCII letters, single spaces and a
    small set of apostrophes.

    Steps, in order:

    1. Lowercase the text and strip surrounding whitespace (including the
       trailing newline left on lines read from a file).
    2. Apply a fixed list of literal substitutions (dash separators, "ü",
       "etc.", "i.e.", and common punctuation).
    3. Walk the result character by character, keeping letters and spaces,
       keeping an apostrophe only in ``'s``, ``'d``, ``'ve`` and in a plural
       possessive (``s'`` followed by a space or the end of the text), and
       turning a hyphen that directly follows a letter into a space.
       Every other character (digits, remaining punctuation, apostrophes in
       e.g. ``n't``) is dropped.
    4. Collapse runs of spaces into one and trim the ends.

    Args:
        text: The transcript to clean.

    Returns:
        The cleaned transcript as a string; empty input yields ``""``.
    """
    alpha = "abcdefghijklmnopqrstuvwxyz"
    # Order matters: "etc." and "i.e." must be expanded before ". " is
    # removed, otherwise their full stops would already be gone.
    replacements = (
        (" -- ", " "),
        ("ü", "u"),
        ("etc.", "etcetera"),
        ("i.e.", "i e "),
        (";", ""),
        (". ", " "),
        (",", ""),
        ("\"", ""),
    )

    # Stripping first lets a plural possessive at the very end of a line be
    # recognised even when the line still carries its newline.
    text = text.lower().strip()
    for old, new in replacements:
        text = text.replace(old, new)

    buf = []
    for i, ch in enumerate(text):
        if ch in alpha or ch == " ":
            buf.append(ch)
        elif ch == "'":
            # Contraction / singular possessive: 's, 'd, 've.
            has_kept_suffix = text.startswith(("s", "d", "ve"), i + 1)
            # Plural possessive: s' followed by a space or end of text.
            # The i > 0 guard avoids text[-1] wrapping round to the last char.
            is_plural_possessive = (
                i > 0
                and text[i - 1] == "s"
                and text[i + 1:i + 2] in ("", " ")
            )
            if has_kept_suffix or is_plural_possessive:
                buf.append(ch)
        elif ch == "-" and i > 0 and text[i - 1] in alpha:
            # A hyphen joining words ("well-known") becomes a word break.
            buf.append(" ")
        # Anything else is intentionally discarded.

    # Dropped characters and the "i.e." expansion can leave double spaces;
    # split/join collapses them and trims the ends in one step.
    return " ".join("".join(buf).split())
# Maps the first "|"-separated field of each metadata row (used as the
# utterance key) to the cleaned version of the third field.
# Presumably the row layout is id|raw text|normalised text -- confirm against
# the dataset documentation.
items = {}
# fix_text handles "ü", so the file contains non-ASCII text; read it as UTF-8
# explicitly instead of relying on the platform default encoding
# (assumes metadata.csv is UTF-8 -- TODO confirm).
with open(PATH, encoding="utf-8") as f:
    for line in f:
        arr = line.split("|")
        if len(arr) != 3:
            # Report rows that do not have exactly three fields.
            print(line)
            if len(arr) < 3:
                # There is no third field to clean (e.g. a blank line);
                # skip the row rather than crash with an IndexError.
                continue
        utt_id = arr[0]
        text = fix_text(arr[2])
        items[utt_id] = text
# Destination file: one "<key>\t<cleaned text>" row per entry in `items`,
# written in the dictionary's iteration order.
OUTPATH = "/home/joregan/ljspeech/LJSpeech-1.1/text.tsv"
with open(OUTPATH, "w") as outf:
    outf.writelines(
        f"{utt}\t{cleaned}\n" for utt, cleaned in items.items()
    )