# Map phonetic English transcription.
# Can't remember what this was doing.
# Token-to-id table for an IPA phone inventory (looks like a CTC/wav2vec2-style
# tokenizer vocab — TODO confirm): "|" is the word delimiter (id 0),
# "[UNK]"/"[PAD]" are the special tokens.
TOK_JSON = """
{"\u0251": 1, "\u00e6": 2, "\u028c": 3, "a\u028a": 4, "\u0259": 5, "\u025d": 6, "a\u026a": 7, "b": 8, "\u02a7": 9, "d": 10, "\u00f0": 11, "\u027e": 12, "\u025b": 13, "l\u0329": 14, "m\u0329": 15, "n\u0329": 16, "\u014b\u0329": 17, "\u025a": 18, "e\u026a": 19, "f": 20, "g": 21, " ": 22, "h": 24, "\u026a": 26, "i\u02d0": 27, "\u02a4": 28, "k": 29, "l": 30, "m": 31, "n": 32, "\u014b": 33, "\u027e\u0303": 34, "o\u028a": 35, "\u0254\u026a": 36, "p": 37, "ʔ": 38, "\u0279": 39, "s": 40, "\u0283": 41, "t": 42, "\u03b8": 43, "\u028a": 44, "u\u02d0": 45, "v": 46, "w": 47, "j": 48, "z": 49, "\u0292": 50, "|": 0, "[UNK]": 51, "[PAD]": 52}
"""
import json
import re

# Id lookup for every token, straight from the JSON table.
TOKENS = json.loads(TOK_JSON)

# Build an alternation regex over the phone tokens.  "[UNK]" is kept but must
# be regex-escaped ("[" / "]" are metacharacters); previously this was written
# as the invalid escape sequence "\[UNK\]" in a non-raw string.  The word
# delimiter "|", the bare space, and the other bracketed specials are skipped.
# re.escape() is a no-op for the plain IPA keys, so the regex is unchanged.
TOKLIST = [re.escape("[UNK]")]
for key in TOKENS:
    if "[" in key or key == "|" or key == " ":
        continue
    TOKLIST.append(re.escape(key))
# Longest alternatives first so multi-character phones (e.g. "aʊ", "iː")
# win over their single-character prefixes during matching.
TOKLIST.sort(key=len, reverse=True)
TOK_REGEX_INNER = "|".join(TOKLIST)
TOK_REGEX = fr"({TOK_REGEX_INNER})"
def tokenise(text):
    """Greedily split *text* into phone tokens using TOK_REGEX.

    Whitespace between tokens is discarded, and every matched "[UNK]"
    token is replaced by "ɪ" in the output.

    Raises:
        ValueError: when no known token matches the remaining text.
    """
    out = []
    remaining = text.strip()
    while remaining:
        hit = re.match(TOK_REGEX, remaining)
        if hit is None:
            raise ValueError(f"Could not match token in text: {remaining}")
        piece = hit.group(0)
        out.append("ɪ" if piece == "[UNK]" else piece)
        remaining = remaining[len(piece):].strip()
    return out
# Hypothesis phone string (presumably ASR output on a TIMIT utterance —
# TODO confirm): phones run together with few separators, "[UNK]" marks
# unrecognised spans.
SAMPLE_TIMIT = "doʊn t biːɪf ɹeɪʔ l[UNK]ɾl̩wʌn ð[UNK]biːs kæn t[UNK]ɚʔʧuːwɑlʔaɪmɚɹaʊn dɪnðɛnhiːt[UNK]tɪz kæp ʔoʊvɝhɪz lɛf tʔiːɝ ʔɪnʃʊk[UNK]z k lʌb ʔɪʔðɪp ɹ[UNK]n ʔ s"
# "replace" pairs to treat as equal when aligning: key is the left-side
# (reference) spelling, value the right-side (hypothesis) spelling of the
# same sound.
EQUIVALENT_TOKENS = {
    "əl": "l̩",
    "tʃ": "ʧ",
}
# Stop consonants; any of these may surface as a glottal stop "ʔ" in the
# hypothesis and still count as a match.
STOPS = ["b", "p", "d", "t", "g", "k"]
import difflib
def make_equivalent(a, b):
    """Align token sequences *a* (reference) and *b* (hypothesis), relabelling
    near-matches as equal.

    Runs difflib.SequenceMatcher and rewrites a "replace" opcode as "equal"
    when the two sides are:
      * known-equivalent spellings (EQUIVALENT_TOKENS), or
      * a stop consonant realised as a glottal stop "ʔ" (STOPS), or
      * interchangeable weak vowels (ɐ/ə on the left vs ə/ɪ on the right).

    Returns the rewritten opcode list, same 5-tuple shape as get_opcodes().
    """
    sm = difflib.SequenceMatcher(a=a, b=b, autojunk=False)
    out_ops = []
    for op in sm.get_opcodes():
        if op[0] == "replace":
            left = " ".join(a[op[1]:op[2]])
            right = " ".join(b[op[3]:op[4]])
            if (EQUIVALENT_TOKENS.get(left) == right
                    or (right == "ʔ" and left in STOPS)
                    or (left in ["ɐ", "ə"] and right in ["ə", "ɪ"])):
                op = ("equal", op[1], op[2], op[3], op[4])
        out_ops.append(op)
    # BUG FIX: previously returned the raw opcodes ("return ops"), silently
    # discarding every rewrite accumulated in out_ops.
    return out_ops
# Reference phone sequence for the utterance, already space-separated.
A = "d oʊ t b iː ɐ f ɹ eɪ d l ɪ ɾ əl w ʌ n ð ə b iː s t k æ n t h ɜː tʃ uː w ɑː l aɪ m ɐ ɹ aʊ n d æ n d ð ɛ n h iː t ɪ p t ɪ z k æ p oʊ v ɚ h ɪ z l ɛ f t ɪ æ n d ʃ ʊ k h ɪ z k l ʌ b æ t ð ə p ɹ ɪ n s".split(" ")
# Hypothesis phone sequence, tokenised from the raw sample string.
B = tokenise(SAMPLE_TIMIT)
def print_equivalents(A, B, ops):
    """Print one line per opcode: the tag plus both aligned slices."""
    for tag, a0, a1, b0, b1 in ops:
        print(f"{tag}: A[{a0}:{a1}] = {A[a0:a1]}, B[{b0}:{b1}] = {B[b0:b1]}")
# Align the reference (A) against the tokenised hypothesis (B) and print
# the opcode-by-opcode comparison.
ops = make_equivalent(A, B)
print_equivalents(A, B, ops)