# Simple replacement for Kaldi's align-text
# Because life's too short to install Kaldi again
from difflib import SequenceMatcher
import copy
# Two example token sequences to align against each other.
a = "a bad time today etc".split()
b = "not really bad time now".split()

# Matcher over the two token lists; no junk predicate is supplied.
s = SequenceMatcher(isjunk=None, a=a, b=b)
def pad_replacements(a_in, b_in):
    """Pair up two sequences position by position, padding the shorter one.

    The shorter sequence is extended on the right with the literal
    string "<eps>" until both have the same length, then the two are
    zipped together.

    Args:
        a_in: First sequence of items (left element of each pair).
        b_in: Second sequence of items (right element of each pair).

    Returns:
        A list of ``(a_item, b_item)`` tuples whose length equals
        ``max(len(a_in), len(b_in))``. Neither input is modified.
    """
    width = max(len(a_in), len(b_in))
    # Multiplying a list by zero (or a negative count) yields an empty list,
    # so the longer side simply receives no padding.
    left = list(a_in) + ["<eps>"] * (width - len(a_in))
    right = list(b_in) + ["<eps>"] * (width - len(b_in))
    return list(zip(left, right))
# Inline sanity checks for pad_replacements: each row is
# (first sequence, second sequence, expected list of pairs).
for _left, _right, _expected in (
    (["a", "b"], ["a"], [("a", "a"), ("b", "<eps>")]),
    (["a", "b", "c"], ["a"], [("a", "a"), ("b", "<eps>"), ("c", "<eps>")]),
    (["a"], ["a", "b", "c"], [("a", "a"), ("<eps>", "b"), ("<eps>", "c")]),
):
    assert pad_replacements(_left, _right) == _expected
# Walk the matcher's opcodes and emit one "left right" pair per aligned
# position, using "<eps>" wherever one side has no counterpart.
outputs = []
for tag, i1, i2, j1, j2 in s.get_opcodes():
    if tag == "equal":
        # Both sides carry the same token; it is written twice.
        outputs.extend(f"{x} {x}" for x in a[i1:i2])
    elif tag == "insert":
        # Tokens present only in b.
        outputs.extend(f"<eps> {x}" for x in b[j1:j2])
    elif tag == "delete":
        # Tokens present only in a.
        outputs.extend(f"{x} <eps>" for x in a[i1:i2])
    elif tag == "replace":
        # The two spans may differ in length; pad_replacements evens them out.
        outputs.extend(
            f"{x} {y}" for x, y in pad_replacements(a[i1:i2], b[j1:j2])
        )

# Emit the alignment; as a bare expression the joined string was discarded
# when the file ran as a script.
print(" ; ".join(outputs))