Expand MFA lexicon
MFA supports phonological rules, but its built-in implementation is not usable for our purposes. This notebook instead approximates the phonological rules relevant to our speakers, generating additional pronunciation variants from an existing dictionary.
# Input: pretrained MFA US-English ARPA dictionary whose entries we expand.
dictfile = "/Users/joregan/Documents/MFA/pretrained_models/dictionary/hsi_english_us_arpa.dict"
# Input: directory of TextGrid files; their words decide which entries get expanded.
tgfiles = "/Users/joregan/Playing/hsi_ctmedit/textgrid/"
# Output: file that receives only the newly generated dictionary entries.
outdict = "/tmp/added.dict"
# Single-phone alternative realisations: each occurrence of the key phone may
# also be pronounced as the value phone (e.g. JH -> Y, and V <-> B both ways).
EXP = {
"JH": "Y",
"W": "V",
"V": "B",
"B": "V"
}
# Consonants that, when following AE, trigger the TRAP/BATH alternation
# (AE may also be realised as AA before these).
TRAP_BATH = ["S", "F", "TH", "N", "V", "Z", "SH", "M", "L", "DH"]
# ARPABET vowels without stress digits — everything but the r-coloured 'ER'.
PURE_VOWELS = [
"AA",
"AE",
"AH",
"AO",
"AW",
"AY",
"EH",
"EY",
"IH",
"IY",
"OW",
"OY",
"UH",
"UW"
]
def is_vowel(phone, pure=True):
    """Return True if *phone* (ARPABET, possibly stress-marked) is a vowel.

    With pure=False the r-coloured vowel ER also counts as a vowel.
    """
    if not pure and phone.startswith("ER"):
        return True
    # Drop a trailing stress digit before looking the phone up.
    bare = phone[:-1] if phone[-1:] in ("0", "1", "2") else phone
    return bare in PURE_VOWELS
def expand_inner(text):
    """Apply the speaker-specific phonological rules to one pronunciation.

    *text* is a space-separated ARPABET phone string. Returns a list with one
    entry per output position; each entry is the list of alternative phones
    allowed at that position ("" marks an optional/deletable phone). Feeding
    the result to itertools.product yields every pronunciation variant.
    """
    phones = text.split(" ")
    outphones = []
    # Initial S + T/P/M cluster: allow an optional prothetic vowel
    # ("school" -> "eschool").
    if len(phones) >= 2 and phones[0] == "S" and phones[1] in ["T", "P", "M"]:
        outphones.append(["EH0", ""])
    next_opt = False
    for i, phone in enumerate(phones):
        cur = [phone]
        if next_opt:
            cur.append("")  # the previous rule marked this phone as optional
            next_opt = False
        if phone in EXP:
            cur.append(EXP[phone])
        # Final-cluster simplification: a T after S or N may be dropped.
        if i < len(phones) - 1 and phone in ["S", "N"] and phones[i + 1] == "T":
            next_opt = True
        # TRAP/BATH alternation in both directions around AE/AA.
        # phone[2:] keeps the stress digit if present and is "" otherwise
        # (the old phone[-1] produced garbage like "AAE" for unstressed phones).
        if i < len(phones) - 1 and phone.startswith("AE") and phones[i + 1] in TRAP_BATH:
            cur.append("AA" + phone[2:])
        if i < len(phones) - 1 and phone.startswith("AA") and phones[i + 1] == "R":
            cur.append("AE" + phone[2:])
        # ER may be realised as EH or AH with the same stress.
        if phone.startswith("ER"):
            stress = phone[2:]
            cur.append("EH" + stress)
            cur.append("AH" + stress)
        # Non-rhotic variant: R after a vowel may be dropped.
        if i < len(phones) - 1 and is_vowel(phone) and phones[i + 1] == "R":
            next_opt = True
        # Word-initial vowel (ER included): allow an optional prothetic HH.
        # insert() rather than clobbering outphones, so nothing already
        # queued is silently lost.
        if i == 0 and is_vowel(phone, False):
            outphones.insert(0, ["HH", ""])
        outphones.append(cur)
    return outphones
import itertools
def expand(text):
    """Return every pronunciation variant of *text* as a list of phone lists."""
    combinations = itertools.product(*expand_inner(text))
    return list(map(list, combinations))
# Load the source dictionary: headword -> list of pronunciation strings.
entries = {}
with open(dictfile) as df:
    # Stream the file line by line; readlines() would load it all at once.
    for line in df:
        parts = line.strip().split("\t")
        # The pronunciation is the last tab-separated field — presumably any
        # intermediate fields are MFA probability columns; verify against the
        # dictionary format if entries look wrong.
        entries.setdefault(parts[0], []).append(parts[-1])
from pathlib import Path
from praatio import textgrid
import re
def norm(text):
    """Split *text* on spaces, drop bracketed noise tags, and return the
    remaining tokens uppercased with edge punctuation trimmed."""
    cleaned = []
    for token in text.split(" "):
        if token.startswith("["):
            continue
        cleaned.append(token.strip("\",.;:?!").upper())
    return cleaned
# Walk every TextGrid, collect the words spoken, and generate expanded
# pronunciations for each word found in the dictionary.
seen_words = set()          # words already processed (set: O(1) membership)
missing = []                # words with no dictionary entry at all
new_entries = set()         # "word\tpronunciation" lines to be written out
# Whole-interval noise tags like "[cough]"; compiled once, raw string so the
# backslash escape is not a DeprecationWarning.
bracketed_re = re.compile(r"^\[[^]]+\]$")
for textgridfile in Path(tgfiles).glob("*.[Tt]ext[Gg]rid"):
    tg = textgrid.openTextgrid(textgridfile, includeEmptyIntervals=False)
    if len(tg.tierNames) == 1:
        tier = tg.getTier(tg.tierNames[0])
    elif "whisperx" in tg.tierNames:
        tier = tg.getTier("whisperx")
    elif "utterances" in tg.tierNames:
        tier = tg.getTier("utterances")
    elif "words" in tg.tierNames:
        tier = tg.getTier("words")
    else:
        print("Be careful: file", textgridfile, "has none of the expected tier names")
        # Without this continue the loop below would reuse the previous
        # file's tier (or crash with NameError on the first file).
        continue
    for interval in tier.entries:
        text = interval[2]
        if bracketed_re.match(text):
            continue
        for word in norm(text):
            word = word.lower()
            if word in seen_words:
                continue
            if word not in entries:
                missing.append(word)
                continue
            if word.endswith("-"):
                continue
            # Mark as processed so repeats across files are skipped; this was
            # the original intent but seen_words was never populated.
            seen_words.add(word)
            for pron in entries[word]:
                for expanded in expand(pron):
                    # Collapse the ""-slots left by optional phones, and keep
                    # only pronunciations the dictionary does not already have.
                    joined = re.sub(" +", " ", " ".join(expanded).strip())
                    if joined not in entries[word]:
                        new_entries.add(f"{word}\t{joined}")
[" ".join(x).strip() for x in expand("JH AH1 S T")]
with open(outdict, "w") as outf:
for i in sorted(new_entries):
outf.write(i + "\n")
set(missing)