Polish phonetic comparison
Transcript matching for E2E ASR with phonetic post-processing
from difflib import SequenceMatcher
import icu
# Shared ICU transliterator created once from the 'pl-pl_FONIPA' transform ID;
# phonetic_check() calls its transliterate() method on both inputs.
plipa = icu.Transliterator.createInstance('pl-pl_FONIPA')
The errors in E2E models are quite often phonetic confusions, so we do the opposite of traditional ASR and generate the phonetic representation from the output as a basis for comparison.
def phonetic_check(word1, word2, ignore_spaces=False):
    """Report whether two strings have the same ICU transliteration.

    Both arguments are run through the module-level ``plipa``
    transliterator and the two results are compared for equality.

    Args:
        word1: First string to compare.
        word2: Second string to compare.
        ignore_spaces: When true, every ' ' character is removed from
            both strings before they are transliterated.

    Returns:
        True if the two transliterations are equal, otherwise False.
    """
    if ignore_spaces:
        word1 = word1.replace(' ', '')
        word2 = word2.replace(' ', '')
    return plipa.transliterate(word1) == plipa.transliterate(word2)
# Example call: compare two spellings with spaces left in place
# (ignore_spaces=False).
phonetic_check("jórz", "jusz", False)
The Polish y is phonetically a raised schwa; like the schwa in English, it is often deleted in fast speech. This function returns true if the only differences between the first word and the second are deletions of y, except at the end of the word (where y is typically the plural ending).
def no_igrek(word1, word2):
    """Check whether word2 is word1 with only non-final 'y's removed.

    The two strings are aligned with difflib.SequenceMatcher and the
    resulting opcodes are inspected one by one.

    Args:
        word1: The reference spelling.
        word2: The spelling that may be missing some 'y' characters.

    Returns:
        False as soon as an opcode is an insertion, a replacement, a
        deletion of anything other than a single 'y', or a deletion of a
        'y' that is the last character of word1. True otherwise, which
        includes the case where the alignment has no differences at all.
    """
    final_index = len(word1)
    alignment = SequenceMatcher(None, word1, word2).get_opcodes()
    for tag, start, end, _, _ in alignment:
        if tag == 'equal':
            continue
        # Anything that is not a deletion (insert/replace) disqualifies.
        if tag != 'delete':
            return False
        # Only a lone 'y' may be deleted, and not the word-final one.
        if word1[start:end] != 'y' or end == final_index:
            return False
    return True
# Example: the only difference is a 'y' dropped from the middle of the word.
no_igrek('uniwersytet', 'uniwerstet')
# Example: the only difference is the word-final 'y', which no_igrek rejects.
no_igrek('uniwerstety', 'uniwerstet')
# Two-item lists of alternative spellings (presumably each pair is pronounced
# the same in Polish; confirm before extending the table).
phonetic_alternatives = [ ['u', 'ó'], ['rz', 'ż'] ]
def reverse_alts(phonlist):
    """Return a new list with the first two items of each entry swapped.

    Args:
        phonlist: Iterable of indexable entries with at least two items,
            e.g. ``[['u', 'ó'], ['rz', 'ż']]``.

    Returns:
        A list of fresh two-item lists ``[entry[1], entry[0]]``, in the
        same order as the input. The input is not modified.
    """
    swapped = []
    for entry in phonlist:
        swapped.append([entry[1], entry[0]])
    return swapped
# Print each difflib opcode produced when aligning the two spellings
# "już" and "jurz", one opcode tuple per line.
sm = SequenceMatcher(None, "już", "jurz")
for oc in sm.get_opcodes():
    print(oc)
Reads a CTM-like file, returning a list of lists containing the filename, start time, end time, and word.
def read_ctmish(filename, encoding='utf-8'):
    """Read a CTM-like file into a flat list of per-word records.

    Each line is split on runs of whitespace, so repeated spaces or tabs
    between columns do not produce empty fields. Lines with four or fewer
    fields are skipped. Every field from the fifth onward is treated as a
    word and yields one record; all words on a line share the same first,
    third and fourth fields (described in the accompanying notes as the
    file name, start time and end time). The second field is discarded.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to
            UTF-8 so that Polish diacritics decode the same way on every
            platform instead of depending on the locale default.

    Returns:
        A list of four-item lists ``[field0, field2, field3, word]``; all
        values are kept as strings.
    """
    output = []
    with open(filename, 'r', encoding=encoding) as f:
        # Iterate the file lazily rather than loading every line up front.
        for line in f:
            pieces = line.split()
            if len(pieces) <= 4:
                continue
            name, start, end = pieces[0], pieces[2], pieces[3]
            output.extend([name, start, end, word] for word in pieces[4:])
    return output
Returns the contents of a plain text file as a list of lists containing the line number and the word, for use in locating mismatches
def read_text(filename, encoding='utf-8'):
    """Read a plain text file as a flat list of ``[line_number, word]``.

    Line numbers start at 1 and count every line of the file, including
    blank ones. Each line is split on runs of whitespace, so blank lines
    and repeated spaces contribute no empty-string entries.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to
            UTF-8 so that Polish diacritics decode the same way on every
            platform instead of depending on the locale default.

    Returns:
        A list of two-item lists ``[line_number, word]`` in file order.
    """
    output = []
    with open(filename, 'r', encoding=encoding) as f:
        for line_number, line in enumerate(f, start=1):
            output.extend([line_number, word] for word in line.split())
    return output
# Load the records from a CTM-like file; the path is specific to the
# author's machine.
ctmish = read_ctmish("/mnt/c/Users/Jim O\'Regan/git/notes/PlgU9JyTLPE.ctm")
# Keep only the fourth item of each record, which read_ctmish fills with the word.
rec_words = [i[3] for i in ctmish]