# Syllabify Phonetisaurus output.
# Part of the syllable assignment.
def is_schwa(phone, is_timit=True):
    """Tell whether *phone* is a schwa-like reduced vowel.

    With ``is_timit`` true the phone is checked against the labels
    ``ax``, ``axr`` and ``ix``; otherwise only ``AH0`` counts.
    """
    if not is_timit:
        return phone == "AH0"
    return phone in ("ax", "axr", "ix")
# CMUdict doesn't have syllabic consonants
def is_syllabic_consonant(phone, is_timit=True):
    """Tell whether *phone* is one of the syllabic-consonant labels.

    Always False when ``is_timit`` is false.
    """
    syllabic = ("el", "em", "en", "er", "er1", "er2")
    return bool(is_timit) and phone in syllabic
def is_vowel(phone):
    """Tell whether *phone* names a vowel.

    The comparison is case-insensitive, and a single trailing stress digit
    (``0``, ``1`` or ``2``) is ignored, so ``"aw"``, ``"AW"`` and ``"aw1"``
    are all vowels.  An empty phone is not a vowel.
    """
    vowels = (
        "aa", "ae", "ah", "ao", "aw", "ax", "axr", "ay", "eh",
        "ey", "ih", "ix", "iy", "ow", "oy", "uh", "uw",
    )
    if not phone:
        # Indexing phone[-1] below would raise IndexError on an empty phone.
        return False
    base = phone[:-1] if phone[-1] in "012" else phone
    return base.lower() in vowels
def is_vocalic(phone):
    """Tell whether *phone* is a vowel or a syllabic consonant."""
    if is_vowel(phone):
        return True
    return is_syllabic_consonant(phone)
# http://web.archive.org/web/20100614180508/http://semarch.linguistics.fas.nyu.edu/barker/Syllables/syllabify.pl
def sonority(phone):
    """Return the sonority rank of *phone* on a 1 (lowest) to 7 scale.

    Any phone that is not listed in the table below gets the top rank 7.
    """
    scale = (
        # 's' is special: it is ranked with the stops, not the fricatives.
        (1, ("s", "p", "b", "t", "d", "k", "g")),
        (2, ("ch", "jh")),
        (3, ("th", "dh", "f", "v", "z", "sh", "zh")),
        (4, ("m", "n", "ng")),
        (5, ("l", "r")),
        (6, ("hh", "w", "y")),
    )
    for rank, members in scale:
        if phone in members:
            return rank
    return 7
def last_phoneme(graphone):
    """Return the final phoneme of a ``graphemes}phonemes`` graphone.

    The phoneme side is a ``|``-separated list.  A graphone that does not
    contain exactly one ``}`` raises ValueError.
    """
    _graphemes, phonemes = graphone.split('}')
    return phonemes.rsplit('|', 1)[-1]
def first_phoneme(graphone):
    """Return the leading phoneme of a ``graphemes}phonemes`` graphone.

    The phoneme side is a ``|``-separated list.  A graphone that does not
    contain exactly one ``}`` raises ValueError.
    """
    _graphemes, phonemes = graphone.split('}')
    return phonemes.split('|', 1)[0]
# Import-time sanity checks: one grapheme aligned to three phonemes.
assert last_phoneme('x}e|k|s') == 's'
assert first_phoneme('x}e|k|s') == 'e'
def voicing_mismatch(phone1, phone2):
    """Tell whether one phone is a voiced obstruent and the other a voiceless one.

    The check is symmetric.  Phones outside the two obstruent lists never
    produce a mismatch.
    """
    voiced = ("b", "d", "g", "jh", "dh", "v", "z", "zh")
    voiceless = ("p", "t", "k", "ch", "th", "f", "s", "sh")
    voiced_then_voiceless = phone1 in voiced and phone2 in voiceless
    voiceless_then_voiced = phone2 in voiced and phone1 in voiceless
    return voiced_then_voiceless or voiceless_then_voiced
def merge_graphones(graphones):
    """Concatenate several ``graphemes}phonemes`` graphones into one.

    The grapheme sides and the phoneme sides are each joined with ``|``.
    On a side holding more than one symbol, ``_`` placeholders are dropped;
    if that would leave the side empty, a single ``_`` is kept.  Each input
    must contain exactly one ``}``, otherwise ValueError is raised.
    """
    def drop_placeholders(symbols):
        # A side with zero or one symbol is passed through untouched.
        if len(symbols) <= 1:
            return symbols
        kept = [symbol for symbol in symbols if symbol != '_']
        return kept or ['_']

    all_graphemes = []
    all_phonemes = []
    for item in graphones:
        grapheme_side, phoneme_side = item.split('}')
        all_graphemes.extend(grapheme_side.split('|'))
        all_phonemes.extend(phoneme_side.split('|'))

    merged_graphemes = '|'.join(drop_placeholders(all_graphemes))
    merged_phonemes = '|'.join(drop_placeholders(all_phonemes))
    return merged_graphemes + '}' + merged_phonemes
# Import-time sanity checks: '_' placeholders are dropped from a merged side
# unless nothing else remains on that side.
assert merge_graphones("a}a t|h}th x}k|s".split(' ')) == 'a|t|h|x}a|th|k|s'
assert merge_graphones("a}a t|h}th x}k|s e}_".split(' ')) == 'a|t|h|x|e}a|th|k|s'
assert merge_graphones("a}_ t|h}_ x}_ e}_".split(' ')) == 'a|t|h|x|e}_'
assert merge_graphones("_}a _}th _}k|s".split(' ')) == '_}a|th|k|s'
def syllabify(graphones):
    """Group a word's aligned graphones into syllables.

    The graphones are scanned from right to left.  For each one, only its
    first phoneme is inspected: its sonority rank is compared with that of
    the graphone to its right, and a chain of special-case rules may
    override the resulting rising/falling flags.  Graphones accumulate on a
    stack until a syllable boundary is detected, at which point the stack is
    merged into a single graphone string.

    Args:
        graphones: sequence of ``graphemes}phonemes`` strings for one word,
            in left-to-right order.

    Returns:
        A list with one merged graphone string per syllable, in left-to-right
        order.  An empty input yields an empty list.

    Raises:
        ValueError: if a graphone does not contain exactly one ``}``.
    """
    labials = ("p", "b", "m", "f", "v")
    s_sh = ("s", "sh")
    sibilants = ("s", "sh", "z", "zh")
    coronal_obstruents = ("d", "t", "dh", "th")

    output = []
    stack = []  # graphones of the syllable being built, in right-to-left order
    saw_vowel = False  # does the syllable being built already have a nucleus?
    last_sonority_up = True
    last_sonority = 0
    prev_phoneme = ""  # first phoneme of the graphone to the right

    for graphone in reversed(graphones):
        phoneme = first_phoneme(graphone)
        phone_sonority = sonority(phoneme)
        isvowel = is_vocalic(phoneme)
        sonority_up = last_sonority < phone_sonority

        # For timit
        # NOTE(review): first_phoneme() above already raises ValueError for a
        # bare '_' (it has no '}'), so this branch is currently unreachable.
        # Confirm the intended token format before relying on it.
        if graphone == '_':
            stack.append(graphone)
            continue

        # The overrides below are order-dependent: several of them only fire
        # while sonority_up is still False, and each may set it to True.
        if last_sonority == 3 and phone_sonority == 1:
            sonority_up = True
        if prev_phoneme == 'w' and phoneme in labials:
            last_sonority_up = False
            sonority_up = True
        if prev_phoneme == "m" and not sonority_up and phoneme not in s_sh:
            last_sonority_up = False
            sonority_up = True
        if phoneme == "m" and not sonority_up and last_sonority < 7:
            last_sonority_up = False
            sonority_up = True
        if phoneme == "n" and not sonority_up and last_sonority < 6:
            last_sonority_up = False
            sonority_up = True
        if not sonority_up and phoneme == "ng":
            last_sonority_up = False
            sonority_up = True
        if last_sonority == 7 and phone_sonority == 7:
            last_sonority_up = True
            sonority_up = True
        # avoid bs/ps onsets
        if prev_phoneme in sibilants and phoneme in ("b", "p"):
            last_sonority_up = False
            sonority_up = True
        if prev_phoneme == 'l' and phoneme in coronal_obstruents:
            last_sonority_up = False
            sonority_up = True

        # A boundary is only possible once the syllable being built has a
        # nucleus.  It is placed before a second nucleus, at a voicing clash
        # with the phoneme to the right, or where sonority turns upward
        # again after not rising on the previous step.
        split_here = saw_vowel and (
            isvowel
            or voicing_mismatch(phoneme, prev_phoneme)
            or (sonority_up and not last_sonority_up)
        )
        if split_here:
            output.append(merge_graphones(stack[::-1]))
            stack = []
            saw_vowel = False

        stack.append(graphone)
        last_sonority_up = sonority_up
        prev_phoneme = phoneme
        last_sonority = phone_sonority
        saw_vowel = saw_vowel or isvowel

    # Flush the leftmost syllable; the stack is empty only for empty input.
    if stack:
        output.append(merge_graphones(stack[::-1]))
    return output[::-1]
# Import-time sanity check: four graphones are grouped into two syllables.
assert syllabify('a}ax b}b o|u}aw1 t}t'.split(' ')) == ['a}ax', 'b|o|u|t}b|aw1|t']
# Syllabify the aligned corpus line by line: each input line holds the
# whitespace-separated graphones of one entry, and each output line holds
# the corresponding syllables separated by single spaces.
with open('TIMIT.clean.corpus', 'r') as f, open('TIMIT.syllable.corpus', 'w') as of:
    for line in f:
        # split() with no argument discards the trailing newline and any
        # repeated blanks; split(' ') would leave '\n' glued to the last
        # graphone and produce empty tokens.
        graphones = line.split()
        if not graphones:
            # Keep input and output line numbers aligned for blank lines.
            print(file=of)
            continue
        print(' '.join(syllabify(graphones)), file=of)