from difflib import SequenceMatcher
import icu

# Module-level ICU transliterator created from the 'pl-pl_FONIPA' ID
# (presumably Polish orthography -> IPA; confirm against the ICU transform
# list). phonetic_check uses it to compare words by pronunciation.
plipa = icu.Transliterator.createInstance('pl-pl_FONIPA')

The errors in E2E models are quite often phonetic confusions, so we do the opposite of traditional ASR and generate the phonetic representation from the output as a basis for comparison.

def phonetic_check(word1, word2, ignore_spaces=False):
    """Report whether two strings transliterate to the same IPA string.

    Both inputs are run through the module-level ``plipa`` ICU transliterator
    and the results are compared for exact equality.

    Args:
        word1: First string to compare.
        word2: Second string to compare.
        ignore_spaces: When true, every ' ' character is removed from both
            inputs before transliteration.

    Returns:
        True if the two transliterations are identical, otherwise False.
    """
    if ignore_spaces:
        word1 = word1.replace(' ', '')
        word2 = word2.replace(' ', '')
    return plipa.transliterate(word1) == plipa.transliterate(word2)

# Example: two different (mis)spellings compared by their transliteration.
# The bare value below appears to be the result recorded for this call.
phonetic_check("jórz", "jusz", False)

True

The Polish y is phonetically a raised schwa; like the schwa in English, it's often deleted in fast speech. This function returns True if the only differences between the first word and the second are deletions of y, except at the end of the word (where y is typically the plural ending), in which case it returns False.

def no_igrek(word1, word2):
    """Check whether word2 is word1 with only non-final 'y' letters dropped.

    The two strings are aligned with ``difflib.SequenceMatcher`` and every
    non-matching region is inspected.

    Args:
        word1: The reference spelling.
        word2: The spelling to test against the reference.

    Returns:
        False as soon as an edit is found that is an insertion, a replacement,
        a deletion of anything other than exactly 'y', or a deletion of a 'y'
        that ends word1. True otherwise (including when the words are equal).
    """
    matcher = SequenceMatcher(None, word1, word2)
    for tag, start, end, _, _ in matcher.get_opcodes():
        if tag == 'equal':
            continue
        if tag != 'delete':
            # 'insert' or 'replace': word2 has material word1 lacks.
            return False
        removed = word1[start:end]
        # Only a single 'y' may be dropped, and not the word-final one.
        if removed != 'y' or end == len(word1):
            return False
    return True

# Example: a 'y' dropped from the middle of the word.
# The bare value after each call appears to be its recorded result.
no_igrek('uniwersytet', 'uniwerstet')

True

# Example: the dropped 'y' is the last letter of the first word.
no_igrek('uniwerstety', 'uniwerstet')

False

# Two-element lists of spellings treated as phonetic alternatives.
phonetic_alternatives = [ ['u', 'ó'], ['rz', 'ż'] ]


def reverse_alts(phonlist):
    """Return a new list with the two leading items of each entry swapped.

    Args:
        phonlist: A sequence of indexable entries, each with at least two
            items (e.g. ``phonetic_alternatives``).

    Returns:
        A list of two-element lists ``[entry[1], entry[0]]``, in input order.
    """
    swapped = []
    for pair in phonlist:
        swapped.append([pair[1], pair[0]])
    return swapped

# Show how SequenceMatcher aligns two spellings that differ only by ż / rz:
# per the printed opcodes below, the difference is one 'replace' region.
sm = SequenceMatcher(None, "już", "jurz")
for oc in sm.get_opcodes():
    print(oc)

('equal', 0, 2, 0, 2)
('replace', 2, 3, 2, 4)

Reads a CTM-like file, returning a list of lists containing the filename, start time, end time, and word.

def read_ctmish(filename):
    """Read a CTM-like file into a flat list of per-word entries.

    Each line is split on runs of whitespace. Lines with four or fewer fields
    carry no words and are skipped. Every field from the fifth onwards is
    treated as a word and produces its own entry, so a line with several
    words yields several entries that share the same leading fields.

    Args:
        filename: Path of the file to read. It is decoded as UTF-8 so that
            non-ASCII words do not depend on the platform's default encoding.

    Returns:
        A list of ``[field0, field2, field3, word]`` lists of strings, in file
        order. The second field of each line is not kept.
    """
    output = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            # split() with no argument collapses repeated spaces/tabs, so
            # fields cannot shift and no empty "words" are produced.
            pieces = line.split()
            if len(pieces) <= 4:
                continue
            output.extend(
                [pieces[0], pieces[2], pieces[3], word] for word in pieces[4:]
            )
    return output

Returns the contents of a plain text file as a list of [line number, word] lists, one per word, for use in locating mismatches.

def read_text(filename):
    """Read a plain text file into a flat list of ``[line_number, word]``.

    Args:
        filename: Path of the file to read. It is decoded as UTF-8 so that
            non-ASCII words do not depend on the platform's default encoding.

    Returns:
        A list of ``[line_number, word]`` lists in file order. Line numbers
        start at 1 and count every line, including blank ones; blank lines
        contribute no entries because words are split on runs of whitespace.
    """
    output = []
    with open(filename, 'r', encoding='utf-8') as f:
        for counter, line in enumerate(f, start=1):
            output.extend([counter, word] for word in line.split())
    return output

# Load the per-word entries from a CTM-like file at a fixed local path.
ctmish = read_ctmish("/mnt/c/Users/Jim O\'Regan/git/notes/PlgU9JyTLPE.ctm")

# Keep only the fourth element of each entry, which read_ctmish fills with
# the word itself.
rec_words = [i[3] for i in ctmish]