Extract a dictionary from MFA-aligned TextGrids
We need to re-align the data but don't have the dictionary, so we rebuild it from the existing alignments.
%%capture
!pip install praatio
def irish_lc(word):
    """Lowercase an Irish word, keeping a prefixed n/t separate from a capital vowel.

    When the word starts with a lowercase ``n`` or ``t`` immediately followed
    by an uppercase vowel (e.g. ``nAthair``), the prefix is split off with a
    hyphen (``n-athair``).  Every other word is simply lowercased.

    Args:
        word: The word to normalise.

    Returns:
        The normalised, lowercased word.
    """
    prefix = word[:1]
    initial = word[1:2]
    # The empty string is a substring of every string, so an empty slice would
    # pass the ``in`` tests below; require both characters to actually exist
    # so that "" and single-letter words are not turned into "-" or "n-".
    if prefix and initial and prefix in "nt" and initial in "AEIOUÁÉÍÓÚ":
        return prefix + "-" + word[1:].lower()
    return word.lower()
# Sanity checks for irish_lc: a lowercase n/t prefix before a capital vowel is
# hyphenated, while the same prefix before a consonant is just lowercased.
assert irish_lc("nAthair") == "n-athair"
assert irish_lc("nDeas") == "ndeas"
def get_combined_words_and_phones(filename):
    """Group the phone intervals of a TextGrid under the word they belong to.

    The TextGrid must contain exactly the tiers ``Word`` and ``phones``, in
    that order; any other layout yields an empty list.

    Args:
        filename: Path of the TextGrid file to read.

    Returns:
        A list of dicts, one per aligned word, with the keys ``start``,
        ``end``, ``label`` and ``phones``.  ``phones`` is a list of dicts with
        the keys ``start``, ``end`` and ``label``, in tier order.  A word is
        only included when the last phone collected for it ends exactly at the
        word's end time; words that do not line up with a phone boundary are
        skipped.
    """
    from praatio import textgrid

    # NOTE(review): the second positional argument is presumably praatio's
    # includeEmptyIntervals flag, and the attribute names used below
    # (tierNameList / tierDict / entryList) appear to be the praatio 5.x
    # spellings -- confirm against the installed praatio version.
    tg = textgrid.openTextgrid(filename, False)
    if tg.tierNameList != ['Word', 'phones']:
        return []

    word_entries = tg.tierDict['Word'].entryList
    phone_entries = tg.tierDict['phones'].entryList

    def it_to_dict(it):
        return {'start': it.start, 'end': it.end, 'label': it.label}

    out = []
    j = 0
    for entry in word_entries:
        if j >= len(phone_entries):
            # No phones left to assign to this or any later word.
            break
        cur_word = it_to_dict(entry)
        cur_word['phones'] = []
        # Consume every phone that finishes no later than this word does.
        while j < len(phone_entries) and phone_entries[j].end <= cur_word['end']:
            cur_word['phones'].append(it_to_dict(phone_entries[j]))
            j += 1
        collected = cur_word['phones']
        # Always move on to the next word, even when this one failed to line
        # up with a phone boundary; waiting for an exact match that never
        # comes would otherwise loop forever.
        if collected and collected[-1]['end'] == cur_word['end']:
            out.append(cur_word)
    return out
def get_wordlist_from_combined(items, wordnorm=None):
    """Turn combined word/phone records into (word, pronunciation) pairs.

    Args:
        items: Iterable of dicts with a ``label`` key (the word) and a
            ``phones`` key (a list of dicts that each have a ``label``).
        wordnorm: Optional callable applied to each word label.  When
            omitted, the label is lowercased.

    Returns:
        A list of ``(word, phones)`` tuples, where ``phones`` is the
        space-joined phone labels.  Records whose pronunciation is exactly
        ``"sil"`` are left out.
    """
    if wordnorm is None:
        def normalise(text):
            return text.lower()
    else:
        normalise = wordnorm

    entries = []
    for record in items:
        headword = normalise(record['label'])
        pronunciation = " ".join(seg['label'] for seg in record['phones'])
        if pronunciation != "sil":
            entries.append((headword, pronunciation))
    return entries
from pathlib import Path
# Directory holding the aligned TextGrid files.
# NOTE(review): "PATH TO FILES" looks like a placeholder to edit before running.
wd = Path("PATH TO FILES")

# Map each TextGrid's stem to its list of (word, pronunciation) pairs.
tg_data = {}
for tg in wd.glob("*.TextGrid"):
    tg_data[tg.stem] = get_wordlist_from_combined(
        get_combined_words_and_phones(tg), wordnorm=irish_lc
    )

# Merge the per-file lists, dropping duplicate (word, pronunciation) pairs.
dictionary = set()
for tg_words in tg_data.values():
    dictionary.update(tg_words)

# One "word phone phone ..." line per entry, written in sorted order.
joined = [" ".join(a) for a in dictionary]
# Write UTF-8 explicitly: the entries contain accented Irish characters, and
# the platform default encoding is not guaranteed to represent them.
with open("output.dict", "w", encoding="utf-8") as outf:
    for word in sorted(joined):
        outf.write(word + "\n")