Polish phonetic comparison
Transcript matching for E2E ASR with phonetic post-processing
from difflib import SequenceMatcher
import icu
# Shared ICU transliterator, created once for the 'pl-pl_FONIPA' transform ID
# (Polish orthography -> IPA, going by the ID) and reused by phonetic_check().
plipa = icu.Transliterator.createInstance('pl-pl_FONIPA')
The errors in E2E models are quite often phonetic confusions, so we do the opposite of traditional ASR and generate the phonetic representation from the output as a basis for comparison.
def phonetic_check(word1, word2, ignore_spaces=False):
    """Return True if both strings have the same ICU IPA transliteration.

    Each argument is passed through the module-level ``plipa`` transliterator
    and the two results are compared for exact equality.

    Args:
        word1: First word or phrase.
        word2: Second word or phrase.
        ignore_spaces: When true, every ' ' character is removed from both
            inputs before transliteration, so the comparison is made on the
            space-free strings.

    Returns:
        bool: True if the two transliterations are identical.
    """
    if ignore_spaces:
        word1 = word1.replace(' ', '')
        word2 = word2.replace(' ', '')
    return plipa.transliterate(word1) == plipa.transliterate(word2)
# Demo call: compares the IPA transliterations of two spellings, keeping spaces.
phonetic_check("jórz", "jusz", False)
The Polish y is phonetically a raised schwa; like the schwa in English, it's often deleted in fast speech. This function returns true if the only differences between the first word and the second are deletions of y, except at the end of the word (which is typically the plural ending).
def no_igrek(word1, word2):
    """Check that word2 is word1 with only non-final 'y' letters dropped.

    The two strings are aligned with ``difflib.SequenceMatcher`` and every
    edit operation needed to turn ``word1`` into ``word2`` is inspected.

    Args:
        word1: The reference spelling.
        word2: The candidate spelling, possibly missing some 'y' letters.

    Returns:
        bool: True when every difference is the deletion of exactly one 'y'
        that is not the last character of ``word1``. Identical strings also
        return True, since there are no differences to reject. Any insertion,
        replacement, deletion of other text, or deletion of a word-final 'y'
        returns False.
    """
    final_index = len(word1)
    opcodes = SequenceMatcher(None, word1, word2).get_opcodes()
    for tag, start, end, _j1, _j2 in opcodes:
        if tag == 'equal':
            continue
        # 'insert' and 'replace' mean word2 contains material word1 lacks.
        if tag != 'delete':
            return False
        # Only a single 'y' may be dropped, and never the word-final one.
        if word1[start:end] != 'y' or end == final_index:
            return False
    return True
# Word-internal 'y' dropped: the only edit is a non-final 'y' deletion -> True.
no_igrek('uniwersytet', 'uniwerstet')
# Word-final 'y' dropped: final deletions are rejected -> False.
no_igrek('uniwerstety', 'uniwerstet')
# Pairs of alternative spellings; each inner list holds two interchangeable
# graphemes (presumably ones pronounced alike in Polish -- confirm).
phonetic_alternatives = [ ['u', 'ó'], ['rz', 'ż'] ]
def reverse_alts(phonlist):
    """Return a new list with the first two items of each entry swapped.

    Args:
        phonlist: Iterable of indexable entries with at least two items,
            e.g. ``[['u', 'ó'], ['rz', 'ż']]``.

    Returns:
        list: A list of new two-element lists ``[entry[1], entry[0]]``; the
        input is not modified.
    """
    swapped = []
    for pair in phonlist:
        swapped.append([pair[1], pair[0]])
    return swapped
# Demo: print the edit operations SequenceMatcher reports when aligning the
# two spellings, one opcode tuple (tag, i1, i2, j1, j2) per line.
sm = SequenceMatcher(None, "już", "jurz")
for oc in sm.get_opcodes():
    print(oc)
Reads a CTM-like file, returning a list of lists containing the filename, start time, end time, and word.
def read_ctmish(filename):
    """Read a CTM-like file into one row per word.

    Each line is split on whitespace. Lines with four or fewer fields carry
    no word and are skipped. Every field from the fifth onward is treated as
    a separate word and shares the line's first, third and fourth fields.

    Args:
        filename: Path of the file to read; it is decoded as UTF-8.

    Returns:
        list: Rows of the form ``[field0, field2, field3, word]``, all
        strings, in file order. The accompanying notes describe these as
        filename, start time, end time and word.
    """
    output = []
    # Explicit UTF-8 so Polish diacritics decode the same on every platform.
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            # split() with no argument collapses runs of spaces/tabs, so
            # irregular spacing cannot shift columns or yield empty words.
            pieces = line.split()
            if len(pieces) <= 4:
                continue
            name, start, end = pieces[0], pieces[2], pieces[3]
            output.extend([name, start, end, word] for word in pieces[4:])
    return output
Returns the contents of a plain text file as a list of lists containing the line number and the word, for use in locating mismatches
def read_text(filename):
    """Read a plain text file into ``[line_number, word]`` pairs.

    Args:
        filename: Path of the text file to read; it is decoded as UTF-8.

    Returns:
        list: One ``[line_number, word]`` entry per whitespace-separated
        word, in file order. Line numbers start at 1 and count every line,
        including blank ones, so they can be used to locate a mismatch in
        the source file. Blank lines contribute no entries.
    """
    output = []
    # Explicit UTF-8 so Polish diacritics decode the same on every platform.
    with open(filename, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            # split() with no argument ignores repeated spaces and blank
            # lines instead of producing empty-string "words".
            for word in line.split():
                output.append([line_number, word])
    return output
# Load rows from a hard-coded local CTM-like file, then keep only the word
# column (index 3 of each row); presumably these are the recognised words.
ctmish = read_ctmish("/mnt/c/Users/Jim O\'Regan/git/notes/PlgU9JyTLPE.ctm")
rec_words = [i[3] for i in ctmish]