LJSpeech for ASR
Wav files resampled to 16 kHz, with more thoroughly normalised text. From Kaggle.
# Create the output directory that will hold the resampled audio.
!mkdir wav16
%%capture
# Convert every LJSpeech wav to a 16 kHz copy in wav16/ under the same file
# name; the cell output (ffmpeg's log) is captured instead of displayed.
!for wav in ../input/the-lj-speech-dataset/LJSpeech-1.1/wavs/*wav; do ffmpeg -i $wav -ar 16000 wav16/$(basename $wav '.wav').wav;done
# Location of the LJSpeech metadata file; it is read below as "|"-separated
# lines with the utterance id in the first field and the transcript in the third.
PATH = "../input/the-lj-speech-dataset/LJSpeech-1.1/metadata.csv"
def fix_text(text):
    """Normalise a transcript to lowercase letters, spaces and a few apostrophes.

    The text is lowercased and a fixed sequence of literal substitutions is
    applied (" -- " becomes a space, "ü" becomes "u", "etc." becomes
    "etcetera", "i.e." becomes "i e ", ". " becomes a space; ";", "," and
    double quotes are removed).  Every remaining character is then filtered:

    * ``a``-``z`` and spaces are kept;
    * an apostrophe is kept only in ``'s``, ``'d``, ``'ve``, in ``s'``
      followed by a space, or in ``s'`` at the very end of the text;
    * a hyphen that directly follows a letter becomes a space;
    * everything else (digits, other punctuation, newlines) is dropped.

    Args:
        text: Raw transcript string.

    Returns:
        The normalised string.  Runs of spaces are not collapsed.
    """
    text = text.lower()
    # Order matters: "etc." and "i.e." must be expanded before ". " is
    # stripped, otherwise their trailing full stop would already be gone.
    substitutions = (
        (" -- ", " "),
        ("ü", "u"),
        ("etc.", "etcetera"),
        ("i.e.", "i e "),
        (";", ""),
        (". ", " "),
        (",", ""),
        ("\"", ""),
    )
    for old, new in substitutions:
        text = text.replace(old, new)
    # NOTE(review): as written this replaces a single space with a single
    # space, i.e. it does nothing.  It may have been meant to collapse double
    # spaces -- confirm before changing, since that would alter the output.
    text = text.replace(" ", " ")

    alpha = "abcdefghijklmnopqrstuvwxyz"
    last = len(text) - 1
    buf = []
    for i, ch in enumerate(text):
        if ch in alpha or ch == " ":
            buf.append(ch)
        elif ch == "'":
            keep = (
                # Possessive / contraction suffixes: 's, 'd, 've.
                text.startswith(("'s", "'d", "'ve"), i)
                # Plural possessive followed by a space: "boys' ".
                or (i > 0 and text[i - 1:i + 2] == "s' ")
                # Plural possessive at the very end of the text.
                or (i == last and text[-2:] == "s'")
            )
            if keep:
                buf.append(ch)
        elif ch == "-" and i > 0 and text[i - 1] in alpha:
            # A hyphen joining words becomes a space.  The i > 0 guard stops
            # text[i - 1] from wrapping around to the last character when the
            # text starts with a hyphen.
            buf.append(" ")
        # Any other character is dropped.
    return "".join(buf)
# Build a mapping from utterance id (first field) to the normalised
# transcript (third field) of every row in the "|"-separated metadata file.
items = {}
# The metadata contains non-ASCII characters (fix_text handles "ü"), so the
# encoding is given explicitly instead of relying on the platform default.
with open(PATH, encoding="utf-8") as f:
    for line in f:
        arr = line.split("|")
        if len(arr) != 3:
            # Show rows with an unexpected number of fields for inspection.
            print(line)
            if len(arr) < 3:
                # No third field means no transcript to normalise; skip the
                # row instead of failing with an IndexError below.
                continue
        utt_id = arr[0]
        items[utt_id] = fix_text(arr[2])
# Write the normalised transcripts as tab-separated "<id>\t<text>" rows,
# one utterance per line, in the order the ids were collected.
OUTPATH = "transcripts.tsv"
with open(OUTPATH, "w") as outf:
    outf.writelines(f"{key}\t{items[key]}\n" for key in items)
from pathlib import Path
import soundfile as sf
# Record the length, in frames, of every resampled wav keyed by file stem.
WAVPATH = Path("wav16")
times = {}
# Sorted so the mapping (and anything written from it) has a stable order
# rather than depending on directory listing order.
for wavfile in sorted(WAVPATH.glob("*.wav")):
    # sf.info reports the frame count without loading the samples, which
    # avoids decoding each file just to measure its length.
    times[wavfile.stem] = sf.info(str(wavfile)).frames
# Write the per-file frame counts as tab-separated "<stem>\t<frames>" rows.
with open("frames.tsv", "w") as framef:
    framef.writelines(f"{key}\t{times[key]}\n" for key in times)