Step 1: Convert the sampling rate to 16,000 Hz

# Create the output directory for the resampled audio.
# -p keeps the cell re-runnable: no error if wav16 already exists.
!mkdir -p wav16
%%capture
# Resample every LJSpeech WAV to a 16000 Hz sample rate, writing the result to
# wav16/ under the same file name. "$wav" is quoted so paths containing
# whitespace are passed to ffmpeg/basename as a single argument.
!for wav in ../input/the-lj-speech-dataset/LJSpeech-1.1/wavs/*.wav; do ffmpeg -i "$wav" -ar 16000 "wav16/$(basename "$wav" .wav).wav"; done

Step 2: (Further) normalise the transcripts

# Location of the LJSpeech metadata file that is read line by line further down.
PATH = "../input/the-lj-speech-dataset/LJSpeech-1.1/metadata.csv"
def fix_text(text):
    """Normalise a transcript to lowercase letters, spaces and a few apostrophes.

    The text is lowercased and a handful of fixed substitutions are applied
    (" -- " becomes a space, "ü" becomes "u", "etc." becomes "etcetera",
    "i.e." becomes "i e ", ". " becomes a space, and ";", "," and '"' are
    deleted). Every remaining character is then filtered:

    * letters a-z and spaces are kept;
    * an apostrophe is kept only in "'s", "'d", "'ve", and in a plural
      possessive "s'" that is followed by a space or ends the text
      (so "don't" becomes "dont");
    * a hyphen directly after a letter becomes a space;
    * everything else (digits, other punctuation, newlines) is dropped.

    Finally, runs of spaces are collapsed to one and leading/trailing
    spaces are removed, so characters dropped by the filter cannot leave
    double spaces behind.

    Args:
        text: The raw transcript string.

    Returns:
        The normalised transcript.
    """
    text = text.lower()
    # Order matters: "etc." and "i.e." must be expanded before the generic
    # ". " rule removes the periods they are matched on.
    for old, new in (
        (" -- ", " "),
        ("ü", "u"),
        ("etc.", "etcetera"),
        ("i.e.", "i e "),
        (";", ""),
        (". ", " "),
        (",", ""),
        ("\"", ""),
    ):
        text = text.replace(old, new)

    alpha = "abcdefghijklmnopqrstuvwxyz"
    last = len(text) - 1
    buf = []
    for i, ch in enumerate(text):
        if ch in alpha or ch == " ":
            buf.append(ch)
        elif ch == "'":
            is_contraction = text.startswith(("'s", "'d", "'ve"), i)
            # Plural possessive: "s'" followed by a space or ending the text.
            is_plural_possessive = (
                i > 0
                and text[i - 1] == "s"
                and (i == last or text[i + 1] == " ")
            )
            if is_contraction or is_plural_possessive:
                buf.append(ch)
        elif ch == "-" and i > 0 and text[i - 1] in alpha:
            # The i > 0 guard stops text[i - 1] from wrapping around to the
            # last character when the text starts with a hyphen.
            buf.append(" ")
        # Any other character is intentionally dropped.

    # Collapse space runs after filtering, since removing characters such as
    # a free-standing "-" can create new adjacent spaces.
    return " ".join("".join(buf).split())
# Build {utterance id: normalised transcript} from the "|"-separated metadata
# file, then write it out as a two-column TSV.
items = {}
# fix_text maps "ü", so the transcripts are presumably not pure ASCII; decode
# as UTF-8 explicitly instead of relying on the platform default encoding.
with open(PATH, encoding="utf-8") as f:
    for line in f:
        arr = line.split("|")
        if len(arr) != 3:
            # Surface lines that do not have exactly three fields.
            print(line)
            if len(arr) < 3:
                # There is no third field to normalise; skip the line
                # rather than failing with IndexError on arr[2].
                continue
        utt_id = arr[0]
        text = fix_text(arr[2])
        items[utt_id] = text
OUTPATH = "transcripts.tsv"
with open(OUTPATH, "w", encoding="utf-8") as outf:
    for key, transcript in items.items():
        outf.write(f"{key}\t{transcript}\n")

Step 3: Extract number of frames

The number of frames in each audio file is needed by fairseq.

from pathlib import Path
import soundfile as sf
# Record the number of frames of every resampled WAV file in frames.tsv,
# one "<file stem>\t<frame count>" row per file.
WAVPATH = Path("wav16")
times = {}
# glob yields files in arbitrary order; sorting makes frames.tsv reproducible.
for wavfile in sorted(WAVPATH.glob("*.wav")):
    # sf.info only inspects the file's metadata, so the audio samples are not
    # decoded into memory just to be counted.
    times[wavfile.stem] = sf.info(str(wavfile)).frames
with open("frames.tsv", "w", encoding="utf-8") as framef:
    for key, frames in times.items():
        framef.write(f"{key}\t{frames}\n")