Fix NST lexicon accents
Move each accent (stress) mark from the phone where its syllable begins onto the first vowel that follows it, in the OpenSLR version of the NST Swedish G2P lexicon.
# Phone symbols that count as vowels. The conversion loop attaches a pending
# stress mark to the first phone from this list that follows the mark.
_VOWELS = [
    "}:", "2:", "9",
    "a", "a*U", "A:",
    "e", "E", "E*U", "e:", "E:",
    "I", "i:",
    "O", "o:",
    "U", "u:", "u0",
    "Y", "y:",
]
# Sample lexicon entries, one per line: WORD<TAB>space-separated phones.
# The field separator is written as an explicit ``\t`` escape because the
# parsing loop splits each row on a tab and requires exactly two fields; a
# literal tab is easily turned into spaces by editors and copy/paste.
_SAMPLE = """
AFTENPOSTEN\t"a f t e n %p O s t e n
AFTONBLADET\t"a f t O n %b l A: d e t
AFTONBLADETS\t"a f t O n %b l A: d e t s
AFTONBRISVÄGEN\t"a f t O n b r i: s %v E: g e n
AFTONGATAN\t"a f t O N %g A: t a n
AFTONVÄGEN\t"a f t O n %v E: g e n
AFZELIIVÄGEN\ta f "s e: l I %v E: g e n
AFZELIUS\ta f "s e: l I u0 s
AGADIR\ta g a "d i: r
AGAMEMNON\ta g a "m E m n O n
AGARD\t"A: g a d`
AGARDH\t"A: g a d`
AGARDHSGATAN\t"A: g a d` s` %g A: t a n
AGARDSSON\t"A: g a d` s` O n
AGASSI\ta "g a s I
AGASSIS\ta "g a s I s
AGATA\ta "g A: t a
AGATAS\ta "g A: t a s
"""
def split_phone(inphone):
    """Split a leading stress mark off a phone token.

    Args:
        inphone: A single phone token, optionally prefixed with one of the
            marks ``""``, ``"`` or ``%``.

    Returns:
        A ``(mark, phone)`` tuple. ``mark`` is the prefix that was found, or
        ``''`` when the token starts with none of the known marks; ``phone``
        is the token with that prefix removed.
    """
    # Longest mark first: '""' also starts with '"', so the two-character
    # mark has to be tested (and win) before the one-character one.
    stress_marks = ('""', '"', '%')
    for mark in stress_marks:
        if inphone.startswith(mark):
            # Stop at the first match and strip only the prefix, leaving any
            # later occurrence of the same character in the token untouched.
            return (mark, inphone[len(mark):])
    return ('', inphone)
# Rewrite every sample entry so that each stress mark is attached to the
# first vowel that follows it rather than to the phone it was written on.
out_words = []
for line in _SAMPLE.split('\n'):
    if not line:
        # The sample text begins and ends with a newline, giving empty rows.
        continue
    parts = line.split('\t')
    assert len(parts) == 2
    word, transcription = parts

    pending_mark = ''  # mark seen but not yet attached to a vowel
    moved = []
    for token in transcription.split(' '):
        mark, bare = split_phone(token)
        if mark:
            # A new mark replaces any mark that is still waiting for a vowel.
            pending_mark = mark
        if pending_mark and bare in _VOWELS:
            moved.append(pending_mark + bare)
            pending_mark = ''
            continue
        # Non-vowels, and vowels with no pending mark, are emitted unmarked.
        moved.append(bare)

    out_words.append(word + '\t' + ' '.join(moved))
# Bare expression: displays the result when run as a notebook cell.
out_words