Extract phonemes from Merlin lab (label) files
🤷 It came up
def break_phones(string):
    """Split the leading ``a^b-c+d=e@...`` part of a label into its tokens.

    The separators ``^``, ``-``, ``+``, ``=`` and ``@`` are looked for in
    exactly that order, each search resuming right after the previous hit.
    The text between two consecutive hits (or between the start of the
    string and the first hit) becomes one token.

    Args:
        string: The label text to split.

    Returns:
        A list of at most five tokens.  If one of the separators cannot be
        found, scanning stops there and only the tokens collected so far
        are returned (an empty list when not even ``^`` is present).
    """
    separators = "^-+=@"
    final = len(separators) - 1

    toks = []
    mark = 0  # index where the token currently being collected starts
    for piece, separator in enumerate(separators):
        hit = string.find(separator, mark)
        if hit == -1:
            # Separator missing: give back whatever was collected so far.
            break
        if piece < final or string[hit + 1:hit + 2].isdigit():
            toks.append(string[mark:hit])
        else:
            # The closing '@' is not followed by a digit, so this '@' is
            # taken to be the fifth token itself (e.g. '...=@@3_2').
            toks.append("@")
        mark = hit + 1
    return toks
# Sanity checks for break_phones on three representative labels.
# Fifth token is a literal '@': the '@' separator is followed by another '@'.
assert ['nnj', 'ii', 'lj', 'sil', '@'] == break_phones('nnj^ii-lj+sil=@@3_2/')
# Ordinary case: the '@' separator is followed by a digit.
assert ['x', 'sil', 'nnj', 'ii', 'lj'] == break_phones("x^sil-nnj+ii=lj@1_4")
# Repeated 'x' tokens right before the '@' separator.
assert ['oo', 'r', 'sil', 'x', 'x'] == break_phones("oo^r-sil+x=x@1_1")
def read_phonemes_lab(filename):
    """Read a label file and split each line's label into tokens.

    Every non-blank line must hold exactly three whitespace-separated
    fields.  The first two are ignored; the third is handed to
    ``break_phones``.  Blank lines (for example a trailing empty line) are
    skipped.

    Args:
        filename: Path of the label file to read.

    Returns:
        A list with one entry per non-blank line, each entry being the
        token list returned by ``break_phones`` for that line.

    Raises:
        ValueError: If a non-blank line does not have exactly three fields.
    """
    phn_bits = []
    with open(filename, "r") as f:
        # Iterate the file object directly instead of loading every line
        # into memory first.
        for line in f:
            # Split on any run of whitespace so tabs, repeated spaces and
            # the trailing newline do not produce spurious fields.
            fields = line.split()
            if not fields:
                continue
            _, _, phones = fields
            phn_bits.append(break_phones(phones))
    return phn_bits
def check_len(phone_list):
    """Return True when ``phone_list`` has a multiple of five entries.

    An empty list counts as a multiple of five.

    Args:
        phone_list: Any sized collection.

    Returns:
        bool: ``True`` if ``len(phone_list)`` is divisible by 5.
    """
    # Direct arithmetic test; equivalent to checking that the decimal
    # representation of the length ends in 0 or 5.
    return len(phone_list) % 5 == 0
def prune_phones(phone_list):
    """Reduce parsed label lines to one token per group of five lines.

    Args:
        phone_list: List of token lists, one per label line, as produced
            by ``read_phonemes_lab``.

    Returns:
        The third token (index 2) of every fifth entry, starting with the
        first one.  An empty list is returned when ``check_len`` rejects
        the input, i.e. its length is not a multiple of five.
    """
    # NOTE(review): presumably each phone spans five consecutive label
    # lines, so one line per group is enough - confirm against the data.
    if check_len(phone_list):
        return [tokens[2] for tokens in phone_list[::5]]
    return []
# Parse every file listed in label_files, keeping the same keys.
lab_phonemes_raw = {
    name: read_phonemes_lab(path)
    for name, path in label_files.items()
}
# Collapse each parsed file to the token list returned by prune_phones.
lab_phonemes = {
    name: prune_phones(rows)
    for name, rows in lab_phonemes_raw.items()
}