# Converting fuaimeanna.ie data for fairseq,
# to train an Irish phonemic ASR system,
# using data from an earlier scraper.
import wave
from pathlib import Path
# Root of the earlier scraper's output; the rest of the script reads
# "all-fuaimeanna-data.tsv" and the "mp3" directory from here.
BASEPATH = Path("/Users/joregan/Playing/irish-gists/scrape-fuaimeanna-private/")
# Destination for the converted WAV files and the train/valid manifests.
OUTPATH = Path("/tmp/fuaimeanna-fairseq")
# exist_ok replaces the racy "check, then create" pair; parents=True also
# creates any missing ancestor directories. A non-directory at OUTPATH still
# raises FileExistsError, as before.
OUTPATH.mkdir(parents=True, exist_ok=True)
# MP3 file names excluded from conversion (per the name, recordings with no
# usable audio).
EMPTY_AUDIO = [
    "gob_i3_s3.mp3",
    "iioctha_i3_s3.mp3",
    "mo_shuiiochaan_i3_s3.mp3",
    "riail_i3_s3.mp3",
]

# Transcription file names whose stems are likewise excluded (per the name,
# phone files that have no matching audio).
PHONES_NO_AUDIO = [
    "d'fhaag_i1_s1.phones",
    "d'fhaag_i2_s2.phones",
    "d'fhaag_i3_s3.phones",
]

# Extra command-line style flags handed to the audio export call:
# one channel, signed 16-bit little-endian PCM, 16000 Hz sample rate.
parameters = [
    "-ac", "1",
    "-acodec", "pcm_s16le",
    "-ar", "16000",
]
from pydub import AudioSegment
def drop_accents(pron):
    """Return *pron* without its stand-alone ".", "ˈ" and "ˌ" tokens.

    The string is cut on single spaces; any piece that is exactly one of the
    three marks is discarded and the remaining pieces are re-joined with
    single spaces. A mark that is glued to another symbol is left untouched.
    """
    kept = []
    for token in pron.split(" "):
        if token in (".", "ˈ", "ˌ"):
            continue
        kept.append(token)
    return " ".join(kept)
# Pronunciation lookups keyed by audio file stem. Each table is filled from
# one (audio path, transcription) column pair of the TSV:
# columns 1/2 -> prons_gd, 3/4 -> prons_cr, 5/6 -> prons_cd.
# NOTE(review): gd/cr/cd presumably abbreviate the three recorded dialects;
# confirm against the scraper.
prons_gd = {}
prons_cr = {}
prons_cd = {}
# The encoding is explicit because the transcriptions contain non-ASCII
# symbols (e.g. the stress marks removed by drop_accents); the platform
# default encoding is not guaranteed to be UTF-8.
with open(str(BASEPATH / "all-fuaimeanna-data.tsv"), encoding="utf-8") as tsvf:
    for line in tsvf:
        # Skip the header row, and blank lines that would otherwise fail
        # with IndexError on the column accesses below.
        if line.startswith("Orth") or not line.strip():
            continue
        parts = line.split("\t")
        for table, col in ((prons_gd, 1), (prons_cr, 3), (prons_cd, 5)):
            # The audio path's third "/"-separated component, minus ".mp3",
            # is the key shared with the MP3 file stems.
            fname = parts[col].replace(".mp3", "").split("/")[2]
            table[fname] = drop_accents(parts[col + 1].strip())

# Merged lookup; on duplicate keys prons_gd wins over prons_cr over prons_cd.
prons_all = {**prons_cd, **prons_cr, **prons_gd}

# Validation split: the first 5% of keys (in file order) from each table.
validkeys = []
for table in (prons_cd, prons_cr, prons_gd):
    validkeys += list(table)[:int(len(table) * .05)]
# One dict per converted clip: "name" (file stem), "labels" (transcription
# string) and "frames" (frame count of the exported WAV).
alldata = []
# sorted() makes the processing (and therefore manifest) order reproducible;
# glob() yields files in arbitrary order.
for audio_file in sorted((BASEPATH / "mp3").glob("*.mp3")):
    if audio_file.name in EMPTY_AUDIO:
        continue
    stem = audio_file.stem
    if f"{stem}.phones" in PHONES_NO_AUDIO:
        continue
    # Look the transcription up first, so a clip without one raises KeyError
    # before any WAV is written for it. "#" in the transcription becomes "|"
    # (presumably the word-boundary symbol of the label files).
    labels = prons_all[stem].replace("#", "|")
    outfile = OUTPATH / f"{stem}.wav"
    spoken = AudioSegment.from_mp3(str(audio_file))
    spoken.export(str(outfile), format="wav", parameters=parameters)
    # Count frames in the file that was actually written: the export
    # parameters request 16000 Hz output, so the decoded MP3's own
    # frame_count() is only correct when the source is already 16 kHz.
    with wave.open(str(outfile), "rb") as wav:
        frames = wav.getnframes()
    alldata.append({"name": stem, "labels": labels, "frames": frames})
# Write the label (.ltr) and manifest (.tsv) files for the train and valid
# splits. Line i of a .ltr file corresponds to line i of the matching .tsv.
#
# NOTE(review): fairseq's wav2vec manifest reader appears to treat the first
# line of a .tsv as the audio root directory; no such line is written here.
# Confirm whether it is added downstream before training.

# Build the set once instead of scanning the validkeys list for every datum.
valid_names = set(validkeys)
# The encoding is explicit because the labels contain non-ASCII phonetic
# symbols and the platform default encoding is not guaranteed to be UTF-8.
with open(str(OUTPATH / "train.ltr"), "w", encoding="utf-8") as train_ltr, \
        open(str(OUTPATH / "train.tsv"), "w", encoding="utf-8") as train_tsv, \
        open(str(OUTPATH / "valid.ltr"), "w", encoding="utf-8") as valid_ltr, \
        open(str(OUTPATH / "valid.tsv"), "w", encoding="utf-8") as valid_tsv:
    for datum in alldata:
        if datum["name"] in valid_names:
            out_ltr, out_tsv = valid_ltr, valid_tsv
        else:
            out_ltr, out_tsv = train_ltr, train_tsv
        # Each label line ends with a trailing " |".
        out_ltr.write(f'{datum["labels"]} |\n')
        out_tsv.write(f'{datum["name"]}.wav\t{int(datum["frames"])}\n')