from pathlib import Path
%cd /Users/joregan/Playing/cvpr-control
/Users/joregan/Playing/cvpr-control
# Load the reference sentences, one per line, with surrounding whitespace
# removed. The encoding is given explicitly because the text contains
# non-ASCII characters (typographic quotes, accented letters) and must not
# depend on the platform's default encoding.
with open("input_sentences.txt", encoding="utf-8") as f:
    sentences = [line.strip() for line in f]
# Map each recording identifier (the TSV file stem) to its recognised text
# ("rec") and the reference sentence it corresponds to ("original").
items = {}
for tsvfile in Path("tsv").glob("**/*.tsv"):
    ident = tsvfile.stem
    # Files whose path contains the ".interloctr." marker are not processed.
    if ".interloctr." in str(tsvfile):
        continue
    # Explicit UTF-8 so reading does not depend on the platform default.
    with open(tsvfile, encoding="utf-8") as inf:
        # Blank lines are skipped: they would otherwise contribute an empty
        # "word" and leave a doubled space in the joined recognised text.
        parts = [line.strip().split("\t") for line in inf if line.strip()]
    items[ident] = {
        # The last tab-separated column of every row is joined into the text.
        "rec": " ".join(x[-1] for x in parts),
        # The number after the final "_" in the stem indexes into `sentences`.
        "original": sentences[int(ident.split("_")[-1])],
    }
from string import punctuation

PUNCT = set(punctuation)

# string.punctuation is ASCII-only, so typographic quotes and dashes have to be
# normalised separately: curly quotes become their ASCII counterparts (and are
# then stripped like any other punctuation), dashes become word separators.
_TYPOGRAPHIC_MAP = str.maketrans({
    "\u2018": "'",   # left single quotation mark
    "\u2019": "'",   # right single quotation mark / typographic apostrophe
    "\u201c": '"',   # left double quotation mark
    "\u201d": '"',   # right double quotation mark
    "\u2014": " ",   # em dash
    "\u2013": " ",   # en dash
})
_STRIP_CHARS = "".join(sorted(PUNCT))


def clean_sentence(text):
    """Normalise a reference sentence for comparison with recognised text.

    The sentence is lower-cased, bracketed annotations such as ``[laughs]``
    are dropped, and punctuation is stripped from both ends of every word
    (word-internal punctuation such as apostrophes and hyphens is kept).
    Typographic quotes are converted to ASCII and em/en dashes are treated
    as word separators. Tokens that are empty after stripping are discarded,
    so the result never contains empty words or doubled spaces.

    Args:
        text: The sentence to normalise.

    Returns:
        The normalised words joined by single spaces.
    """
    words = []
    # split() without an argument collapses runs of whitespace, so repeated
    # spaces (including those introduced by dash replacement) yield no tokens.
    for word in text.translate(_TYPOGRAPHIC_MAP).split():
        if word.startswith("[") and word.endswith("]"):
            continue
        word = word.strip(_STRIP_CHARS)
        if word:
            words.append(word.lower())
    return " ".join(words)
def prune_fillers(text):
    """Return *text* with the filler words "uh" and "um" removed.

    The text is split on single spaces; every token that is exactly one of
    the fillers is dropped and the remaining tokens are re-joined with
    single spaces.
    """
    fillers = ("uh", "um")
    kept = []
    for token in text.split(" "):
        if token in fillers:
            continue
        kept.append(token)
    return " ".join(kept)
# Label every item with the weakest normalisation under which the reference
# sentence equals the recognised text. Items that match under none of the
# levels get no "match" key and are collected in `rest` instead.
rest = {}
for item, entry in items.items():
    cleaned = clean_sentence(entry["original"])
    recognised = entry["rec"]
    if entry["original"] == recognised:
        label = "exact"
    elif cleaned == recognised:
        label = "clean"
    elif prune_fillers(cleaned) == recognised:
        label = "clean,fillers"
    elif prune_fillers(cleaned) == prune_fillers(recognised):
        label = "clean,fillers_both"
    else:
        label = None

    if label is None:
        rest[item] = entry
    else:
        entry["match"] = label
%pip install jiwer
from jiwer import wer
# Compute a word error rate for every item that carries no "match" label,
# using the cleaned reference sentence against the recognised text.
for item, entry in items.items():
    if "match" in entry:
        continue
    s_wer = wer(clean_sentence(entry["original"]), entry["rec"])
    entry["wer"] = s_wer
from difflib import SequenceMatcher

# Word-level difference tallies for the items without a "match" label.
# Keys are (sentence id, reference words[, recognised words]) tuples and
# values are occurrence counts; `trace_replace` additionally records every
# replacement together with the full item identifier.
insertions = {}
deletions = {}
replacements = {}
trace_replace = []

for item, entry in items.items():
    if "match" in entry:
        continue
    sent_id = item.split("_")[-1]
    a = clean_sentence(entry["original"]).split(" ")
    b = entry["rec"].split(" ")
    s = SequenceMatcher(None, a, b)
    for tag, i1, i2, j1, j2 in s.get_opcodes():
        ref_span = " ".join(a[i1:i2])
        hyp_span = " ".join(b[j1:j2])
        # "equal" opcodes match none of the branches and are ignored.
        if tag == "replace":
            pair = (sent_id, ref_span, hyp_span)
            trace_replace.append((item, ref_span, hyp_span))
            replacements[pair] = replacements.get(pair, 0) + 1
        elif tag == "delete":
            pair = (sent_id, ref_span)
            deletions[pair] = deletions.get(pair, 0) + 1
        elif tag == "insert":
            pair = (sent_id, hyp_span)
            insertions[pair] = insertions.get(pair, 0) + 1
trace_replace
[('spkp264_243', 'really', 'real'),
 ('spkp237_605', '“you’d', "you'd"),
 ('spkp237_605', 'instrument,”', 'instrument'),
 ('spkp271_306', 'i i', 'it'),
 ('spkp271_306', 'feel', 'feels'),
 ('spkp282_186', 'gotta', 'got to'),
 ('spkp282_192', 'uh', 'ah'),
 ('spkp318_115', 'earbuds', '<unk>'),
 ('spkp364_156', 'unhear', '<unk>'),
 ('spkp374_156', 'unhear', '<unk>'),
 ('spkp318_129', 'gotta', 'got to'),
 ('spkp317_543', 'heartwarming', '<unk>'),
 ('spkp317_543', 'workouts', '<unk>'),
 ('spkp228_535', 'cause i', 'because'),
 ('spkp317_225', 'patience', 'patients'),
 ('spkp304_494', 'uh', 'ah'),
 ('spkp282_145', 'uh', 'err'),
 ('spkp282_151', 'off-putting', 'off putting'),
 ('spkp317_231', 'curveball', '<unk>'),
 ('spkp339_369', 'cause', 'because'),
 ('spkp339_341', 'panicking', '<unk>'),
 ('spkp317_219', 'can not', 'cannot'),
 ('spkp237_188', 'uh', 'ah'),
 ('spkp237_188', 'open-minded', 'open minded'),
 ('spkp339_433', 'sibling', 'siblings'),
 ('spkp339_355', "it's", 'it'),
 ('spkp237_361', 'panicking', '<unk>'),
 ('spkp232_556', 'replaying', '<unk>'),
 ('spkp232_556', 'heartwarming', '<unk>'),
 ('spkp271_57', 'crafting', '<unk>'),
 ('spkp314_127', 'there’s', "there's"),
 ('spkp318_465', 'right', 'write'),
 ('spkp226_166', 'scrunches', '<unk>'),
 ('spkp237_407', 'tv', '<unk>'),
 ('spkp304_127', 'there’s', "there's"),
 ('spkp282_390', 'cause', 'because'),
 ('spkp237_16', 'uh kinda', 'kind of'),
 ('spkp364_432', 'shake', 'shape'),
 ('spkp304_4', 'realize', 'realise'),
 ('spkp271_138', 'yuck', "<unk> i'm sorry"),
 ('spkp232_218', 'uh', '<unk>'),
 ('spkp232_218', 'uh', '<unk>'),
 ('spkp282_347', 'cause', 'because'),
 ('spk4_19', "should've", 'should have'),
 ('spkp232_595', '20/20', '<unk>'),
 ('spkp282_421', 'worst-case', 'worst case'),
 ('spkp282_353', 'health', 'hell'),
 ('spkp364_397', 'and', 'on'),
 ('spkp374_397', 'and', 'in'),
 ('spkp374_208', 'this', 'the'),
 ('spkp318_511', 'videos', '<unk>'),
 ('spkp282_596', 'gps', '<unk>'),
 ('spkp264_135', 'i am overthinking', "i'm <unk>"),
 ('spkp264_121', 'it’s', "it's"),
 ('spkp264_121', 'i’m', "i'm"),
 ('spkp264_121', 'i’m 16', "i'm <unk>"),
 ('spkp264_121', 'i’d', "i'd"),
 ('spkp318_505', 'bioluminescent', '<unk>'),
 ('spkp228_42', 'adrenaline', '<unk>'),
 ('spkp237_201', 'gotta', 'got to'),
 ('spkp282_97', 'just heartwarming', '<unk>'),
 ('spkp264_109', 'kinda', 'kind of'),
 ('spkp360_604', 'there’s', "there's"),
 ('spkp360_604', '“this', 'this'),
 ('spkp360_604', 'join,” i’m', "join i'm"),
 ('spkp232_378', 'panicking', '<unk>'),
 ('spkp226_6', 'uh', 'er'),
 ('spkp282_83', 'just heartwarming', '<unk>'),
 ('spkp232_26', "it's just mind-blowing", 'mind blowing'),
 ('spkp228_125', 'it’s', "it's"),
 ('spkp228_125', 'world’s', "world's"),
 ('spk1_15', 'uh', 'err'),
 ('spk1_15', 'uh', 'err'),
 ('spkp317_153', 'they', 'i'),
 ('spkp232_387', 'cause', 'because'),
 ('spkp282_541', 'veggie', '<unk>'),
 ('spkp225_106', 'kinda', 'kind of'),
 ('spkp374_98', 'replay', '<unk>'),
 ('spkp374_98', 'resonates', '<unk>'),
 ('spkp374_98', 'soundtrack', 'sound'),
 ('spkp228_119', 'vibe', '<unk>'),
 ('spkp318_288', 'mid-laugh', 'mid laugh'),
 ('spkp232_146', 'ugh', 'uh'),
 ('spkp225_16', 'uh kinda', 'kind of'),
 ('spkp226_210', 'realize', 'realise'),
 ('spkp226_210', "i'm", 'i am'),
 ('spkp226_210', 'uh', 'er'),
 ('spkp226_210', 'realization moving on', 'realisation'),
 ('spkp304_523', 'scrolling', '<unk>'),
 ('spkp304_523', 'emails', '<unk>'),
 ('spkp304_523', 'skeptical', 'sceptical'),
 ('spkp317_392', '20', '<unk>'),
 ('spkp304_537', 'energized', 'energised'),
 ('spkp226_562', 'cinematography', '<unk>'),
 ('spkp314_523', 'scrolling', '<unk>'),
 ('spkp314_523', 'emails', '<unk>'),
 ('spkp314_523', 'this', 'the'),
 ('spkp360_406', "at the spinning wheel waiting hoping and it's", "you're"),
 ('spkp360_406', 'just so incredibly', 'staring credibly'),
 ('spkp271_528', 'café', 'cafe'),
 ('spkp232_608', '“expand', 'expand'),
 ('spkp232_608', 'horizons”', 'horizons'),
 ('spk2_93', 'in', 'and'),
 ('spkp228_441', 'realize', 'realise'),
 ('spkp317_437', 'uh', 'ah'),
 ('spkp264_486', "i've", 'i'),
 ('spkp228_333', 'onto', 'on to'),
 ('spkp228_333', 'despair', 'to spare'),
 ('spkp228_455', 'and', 'in'),
 ('spkp317_423', 'backlash', '<unk>'),
 ('spkp317_423', 'the', 'a'),
 ('spkp339_553', 'heartwarming', '<unk>'),
 ('spkp339_547', 'ambiance', '<unk>'),
 ('spkp304_26', "it's just mind-blowing", 'mind blowing'),
 ('spkp317_378', 'panicking', '<unk>'),
 ('spkp225_339', 'look', 'looked'),
 ('spkp225_463', "i'm at", 'i met'),
 ('spkp225_463', "it's", 'its'),
 ('spkp232_190', 'uh', 'er'),
 ('spkp304_287', 'uh', 'ah'),
 ('spkp225_477', 'uh', 'ah'),
 ('spkp225_477', 'for getting', 'forgetting'),
 ('spkp264_493', 'um', 'and'),
 ('spkp264_478', 'staying', 'stay'),
 ('spk2_92', 'soundtrack', '<unk>'),
 ('spkp360_413', 'backlash', '<unk>'),
 ('spkp360_413', 'second-guess', 'second guess'),
 ('spkp360_413', 'negativity', '<unk>'),
 ('spkp360_407', 'tv', '<unk>'),
 ('spk2_86', 'uh', 'ah'),
 ('spk2_86', 'self-connection of introspection', 'self connection'),
 ('spkp225_488', 'telemarketer', '<unk>'),
 ('spkp225_488', 'cause i', 'because'),
 ('spkp271_529', "i'm", 'am'),
 ('spkp360_361', 'panicking', '<unk>'),
 ('spkp317_387', 'cause', 'cuz'),
 ('spkp314_522', 'paths', 'path'),
 ('spkp226_211', 'judgment', 'judgement'),
 ('spkp226_211', 'judgment', 'judgement'),
 ('spkp314_536', "i'm", 'am'),
 ('spkp282_41', 'colors', 'colours'),
 ('spkp282_41', 'peace', 'piece'),
 ('spkp282_41', 'peace', 'piece'),
 ('spkp317_608', '“expand', 'expand'),
 ('spkp317_608', 'horizons”', 'horizons'),
 ('spkp232_392', '20', '<unk>'),
 ('spkp225_107', "um it's kinda", 'kind of'),
 ('spkp282_540', 'game-changer', 'game changer'),
 ('spkp282_226', 'notices', 'noticed'),
 ('spkp228_80', 'awe-inspiring', 'awe inspiring'),
 ('spkp228_80', 'chaos and', "case isn't"),
 ('spkp360_188', 'open-minded', 'open minded'),
 ('spkp228_130', 'okay', 'ok'),
 ('spkp282_69', 'crafting', '<unk>'),
 ('spk1_14', 'kinda', 'kind of'),
 ('spkp228_124', 'it’s', "it's"),
 ('spkp228_124', 'non-stop', '<unk>'),
 ('spkp228_124', 'i’m', "i'm"),
 ('spkp228_124', '– kinda', 'kind of'),
 ('spkp282_232', 'open-minded', 'open minded'),
 ('spkp282_554', 'favorite', 'favourite'),
 ('spkp374_547', 'ambiance', '<unk>'),
 ('spkp364_553', 'heartwarming', '<unk>'),
 ('spkp374_235', 'just uh', '<unk>'),
 ('spkp374_553', 'so heartwarming', '<unk>'),
 ('spkp360_605', '“you’d', "you'd"),
 ('spkp360_605', 'instrument,”', 'instrument'),
 ('spkp364_221', 'uh a thousand', '<unk> <unk>'),
 ('spkp264_120', 'hadn’t', "hadn't"),
 ('spkp264_120', '90s', '<unk>'),
 ('spkp264_120', '‘why not?’', 'why not'),
 ('spkp228_57', 'uh envelop', 'envelope'),
 ('spkp228_57', 'crafting', '<unk>'),
 ('spkp317_191', 'the', 'this'),
 ('spkp232_423', 'backlash', '<unk>'),
 ('spkp264_134', 'overthinking', '<unk>'),
 ('spkp264_134', 'jewelry’s', "jewelry's"),
 ('spkp264_134', 'it’s', "it's"),
 ('spkp282_408', 'scrambling', '<unk>'),
 ('spkp339_156', 'could unhear', "couldn't hear"),
 ('spk4_18', 'colorful', 'colourful'),
 ('spk4_18', 'uhm', 'um'),
 ('spkp282_346', 'cause', 'because'),
 ('spkp364_341', 'panicking', '<unk>'),
 ('spkp374_433', 'sibling', 'siblings'),
 ('spkp237_348', 'right', 'write'),
 ('spkp304_5', 'uh', 'a'),
 ('spkp364_433', 'sibling', 'siblings'),
 ('spkp374_341', 'panicking', '<unk>'),
 ('spkp318_458', 'armrest just', '<unk>'),
 ('spkp304_126', 'spent', '<unk>'),
 ('spkp271_111', 'kinda', 'kind of'),
 ('spkp226_173', 'uh', 'err'),
 ('spkp232_225', 'patience', 'patients'),
 ('spkp314_132', 'tutorials', '<unk>'),
 ('spkp314_132', '‘how', 'how'),
 ('spkp314_132', 'be?’', 'be'),
 ('spkp232_543', 'heartwarming', '<unk>'),
 ('spkp232_543', 'workouts', '<unk>'),
 ('spkp318_464', 'break-ins', 'break ins'),
 ('spkp318_464', 'double-checking', 'double checking'),
 ('spkp364_369', 'cause', 'because'),
 ('spkp232_231', 'curveball', '<unk>'),
 ('spkp271_105', 'decide', 'decided'),
 ('spkp271_105', "sky's", 'sky is'),
 ('spkp304_132', 'tutorials', '<unk>'),
 ('spkp304_132', '‘how', 'how'),
 ('spkp304_132', 'be?’', 'be'),
 ('spkp225_517', "you'll make in", "you're making"),
 ('spkp225_517', 'impactful', '<unk>'),
 ('spkp304_481', "it's just", 'it'),
 ('spkp225_271', 'their', "they're"),
 ('spkp228_520', 'heartwarming', '<unk>'),
 ('spkp314_495', 'snorkeling', '<unk>'),
 ('spkp317_556', 'replaying', '<unk>'),
 ('spkp317_556', 'heartwarming', '<unk>'),
 ('spkp314_481', 'uh', 'ah'),
 ('spkp304_495', 'snorkeling', '<unk>'),
 ('spkp304_495', "we're", "you're"),
 ('spkp228_252', 'anew', 'and you'),
 ('spkp318_128', 'full-on', 'full on'),
 ('spkp318_128', 'kinda', 'kind of'),
 ('spkp264_518', 'like-minded', 'like minded'),
 ('spkp304_442', 'laundromat', '<unk>'),
 ('spkp314_330', 'uh', 'oh'),
 ('spkp317_595', '20/20', '<unk>'),
 ('spkp226_371', 'what-ifs', 'what ifs'),
 ('spkp304_324', 'timelines', '<unk>'),
 ('spkp271_313', 'onto', 'on to'),
 ('spkp271_313', 'i', "i'd"),
 ('spkp318_114', 'today’s', 'today is'),
 ('spkp314_442', 'laundromat', '<unk>'),
 ('spkp237_604', 'there’s', "there's"),
 ('spkp237_604', '“this', 'this'),
 ('spkp237_604', 'join,” i’m just', "join i'm"),
 ('spkp314_324', 'timelines', '<unk>'),
 ('spkp282_187', 'unsee unhear un-experience', '<unk> <unk> <unk>'),
 ('spkp226_403', 'checkout', '<unk>'),
 ('spkp318_116', 'wound', 'went'),
 ('spkp226_373', 'i have', 'just had'),
 ('spkp374_169', 'chalkboard', '<unk>'),
 ('spkp226_415', 'tv', '<unk>'),
 ('spkp304_440', 'uh', 'ah'),
 ('spkp271_463', "it's", 'its'),
 ('spkp364_169', 'chalkboard', '<unk>'),
 ('spkp237_606', 'there’s', "there's"),
 ('spkp237_606', '“come', 'come'),
 ('spkp237_606', 'it’ll', "it'll"),
 ('spkp237_606', 'you’ll', "you'll"),
 ('spkp237_606', 'it,”', 'it'),
 ('spkp237_606', 'i’m', "i'm"),
 ('spkp318_102', 'kinda', 'kind of'),
 ('spkp364_141', 'ew', 'you'),
 ('spkp360_565', 'theatre', 'theater'),
 ('spkp360_565', 'impactful', '<unk>'),
 ('spkp339_395', 'right', 'write'),
 ('spkp339_395', 'tv', '<unk>'),
 ('spkp339_395', 'glitch', '<unk>'),
 ('spkp374_141', 'ew', 'you'),
 ('spkp271_339', 'look', 'looked'),
 ('spkp339_381', 'just frozen', 'chosen'),
 ('spkp225_298', 'favorite', 'favourite'),
 ('spkp360_571', 'heartwarming', '<unk>'),
 ('spkp304_483', 'stop', 'stopped'),
 ('spkp304_483', 'cause', 'because'),
 ('spkp317_232', 'open-minded', 'open minded'),
 ('spkp339_418', 'what-ifs', 'what ifs'),
 ('spkp317_540', 'game-changer', 'game changer'),
 ('spkp228_536', "i'm", 'am'),
 ('spkp282_608', '“expand', 'expand'),
 ('spkp282_608', 'horizons”', 'horizons'),
 ('spkp271_488', 'telemarketer', '<unk>'),
 ('spkp271_488', 'cause i', 'because'),
 ('spkp225_529', 'had', 'have'),
 ('spkp364_196', 'times', 'time'),
 ('spkp339_342', 'grab', 'grabbed'),
 ('spkp339_342', 'armrests', '<unk>'),
 ('spkp232_541', 'veggie', '<unk>'),
 ('spkp314_130', 'clearer kinda', 'claire'),
 ('spkp314_130', 'okay', 'ok'),
 ('spkp314_130', "they're", 'are'),
 ('spkp226_171', 'too', 'to'),
 ('spkp304_124', 'it’s', "it's"),
 ('spkp304_124', 'non-stop', 'non stop'),
 ('spkp304_124', 'i’m', "i'm"),
 ('spkp304_124', '– kinda', 'kind of'),
 ('spkp304_130', 'clearer kinda', '<unk> kind of'),
 ('spkp304_130', 'okay', 'ok'),
 ('spkp318_300', 'uh', 'oh'),
 ('spkp314_124', 'it’s', "it's"),
 ('spkp314_124', 'non-stop', '<unk>'),
 ('spkp314_124', 'i’m', "i'm"),
 ('spkp314_124', '– kinda', 'kind of'),
 ('spkp282_387', 'cause', 'because'),
 ('spkp364_425', 'health', 'house'),
 ('spkp364_357', 'uh roller coaster', '<unk>'),
 ('spkp226_159', "must've", 'must have'),
 ('spkp232_569', 'recognized', 'recognised'),
 ('spkp339_168', 'uh', 'oh'),
 ('spkp339_168', 'uh', 'so i'),
 ('spkp318_35', 'this', 'the'),
 ('spkp282_422', 'as', 'is'),
 ('spkp232_596', 'recognize', 'recognise'),
 ('spkp232_596', 'gps', '<unk>'),
 ('spkp374_394', 'emails', '<unk>'),
 ('spkp364_394', 'emails', '<unk>'),
 ('spkp282_378', 'panicking', '<unk>'),
 ('spkp318_499', 'paddleboarding', '<unk>'),
 ('spkp318_499', 'adrenaline', '<unk>'),
 ('spkp232_19', "should've", 'should have'),
 ('spkp228_41', "you're", 'here'),
 ('spkp228_41', 'uh', 'oh'),
 ('spkp228_41', 'peace', 'piece'),
 ('spkp228_41', 'peace', 'piece'),
 ('spkp317_187', 'unsee unhear un-experience', '<unk> <unk> <unk>'),
 ('spkp264_122', 'there’s', "there's"),
 ('spkp282_595', '20/20', '<unk>'),
 ('spkp264_136', 'the', 'a'),
 ('spkp228_55', 'uh', 'oh'),
 ('spkp228_55', 'lostness', '<unk>'),
 ('spkp232_421', 'worst-case', 'worst case'),
 ('spkp237_216', 'uh', 'ah'),
 ('spkp237_216', 'unsee unknow', '<unk> oh no'),
 ('spkp364_579', 'a maze', 'domains'),
 ('spkp282_80', 'just awe-inspiring', 'awe inspiring'),
 ('spkp282_80', 'chaos and calm', 'cases come'),
 ('spkp374_545', 'impacting', '<unk>'),
 ('spkp237_558', 'oh', 'owe'),
 ('spkp360_607', '“you’re', "you're"),
 ('spkp360_607', 'don’t', "don't"),
 ('spkp360_607', 'once,”', 'once'),
 ('spkp360_607', 'i’m', "i'm"),
 ('spkp228_69', 'um', 'and'),
 ('spkp228_69', 'crafting', '<unk>'),
 ('spkp364_545', 'impacting', '<unk>'),
 ('spkp228_132', 'tutorials', '<unk>'),
 ('spkp228_132', '‘how', 'how'),
 ('spkp228_132', 'be?’', 'be'),
 ('spkp282_224', 'cause', 'because'),
 ('spkp225_105', "there's", "it's"),
 ('spkp225_105', 'sunrise', 'sun rise'),
 ('spkp225_105', "sky's", 'sky is'),
 ('spkp282_556', 'replaying', '<unk>'),
 ('spkp225_111', 'kinda', 'kind of'),
 ('spk1_16', 'uh kinda', 'a kind of'),
 ('spkp364_592', 'regroup', '<unk>'),
 ('spkp374_592', 'regroup', '<unk>'),
 ('spkp374_592', "a bundle of nerves and it's it's", "just <unk> i'm just"),
 ('spkp314_21', 'um', 'and'),
 ('spkp314_21', 'uh', 'ah'),
 ('spkp282_57', 'uh envelop', 'envelope'),
 ('spkp282_57', 'crafting', '<unk>'),
 ('spkp314_520', 'heartwarming', '<unk>'),
 ('spkp232_151', 'off-putting', 'off putting'),
 ('spkp228_495', 'snorkeling', '<unk>'),
 ('spkp304_534', 'uh', 'ah'),
 ('spkp304_534', 'reflective', 'reflected'),
 ('spkp226_561', 'uh the', 'a'),
 ('spkp271_517', 'impactful', '<unk>'),
 ('spkp304_520', 'heartwarming', '<unk>'),
 ('spkp226_575', 'exhilarating', 'ex so'),
 ('spkp314_252', 'anew', 'and you'),
 ('spkp304_508', "i'm just", 'i'),
 ('spkp364_61', 'awe-inspiring', 'awe inspiring'),
 ('spkp232_192', 'uh', 'oh'),
 ('spkp225_461', 'even', 'the'),
 ('spkp317_346', 'cause', 'because'),
 ('spkp360_388', 'gps', '<unk>'),
 ('spkp360_388', 'rerouting', '<unk>'),
 ('spkp360_388', 'stop-and-go', 'stop and go'),
 ('spkp264_485', 'cause', 'because'),
 ('spkp314_291', "it'll", 'it will'),
 ('spkp228_324', 'timelines', '<unk>'),
 ('spkp228_442', 'laundromat', '<unk>'),
 ('spkp225_448', 'glitch or', '<unk> all'),
 ('spkp225_448', "machine's", 'machines'),
 ('spkp339_545', 'impacting', '<unk>'),
 ('spkp317_353', "can't", 'just just'),
 ('spkp232_187', 'unsee unhear un-experience', '<unk> <unk> <unk>'),
 ('spkp317_71', 'uh', 'ah'),
 ('spkp317_71', "flame's", 'flames'),
 ('spkp317_71', 'mesmerizing', '<unk>'),
 ('spkp317_421', 'uh', 'ah'),
 ('spkp317_421', 'worst-case', 'worst case'),
 ('spkp304_19', "should've", 'should have'),
 ('spkp317_347', 'cause i', 'because'),
 ('spkp317_347', 'leaves me alone', 'just leo lom'),
 ('spkp339_592', 'regroup', '<unk>'),
 ('spkp364_60', "there's", 'it was'),
 ('spkp314_509', 'um and then', "i'm a nen"),
 ('spk2_85', 'just reveling', 'revelling'),
 ('spkp314_253', 'uh', 'ah'),
 ('spkp317_390', 'cause', 'because'),
 ('spkp304_521', 'um so', '<unk>'),
 ('spkp226_212', 'uh', 'err'),
 ('spkp225_14', 'kinda', 'kinder'),
 ('spkp264_335', 'not-funny', 'not funny'),
 ('spkp314_535', 'cause i', 'because'),
 ('spkp232_150', 'realizing', 'realising'),
 ('spkp304_253', 'uh', 'oh'),
 ('spkp304_535', 'cause i', 'because'),
 ('spkp282_56', "you're", 'you'),
 ('spkp317_179', 'i mean', '<unk>'),
 ('spkp282_219', 'can not', 'cannot'),
 ('spkp225_138', 'yuck', '<unk>'),
 ('spkp232_385', 'scared', 'dared'),
 ('spkp317_151', 'off-putting', 'off putting'),
 ('spkp228_97', 'heartwarming', '<unk>'),
 ('spkp228_127', 'there’s', "there's"),
 ('spkp282_231', 'curveball', 'curve ball'),
 ('spkp317_145', 'looked', 'looks'),
 ('spkp225_104', 'surreal', '<unk>'),
 ('spkp282_543', 'heartwarming', '<unk>'),
 ('spkp282_543', 'workouts', '<unk>'),
 ('spkp360_606', 'there’s', "there's"),
 ('spkp360_606', '“come', 'come'),
 ('spkp360_606', 'it’ll', "it'll"),
 ('spkp360_606', 'you’ll', "you'll"),
 ('spkp360_606', 'it,”', 'it'),
 ('spkp360_606', 'i’m', "i'm"),
 ('spkp228_68', 'just heartwarming', '<unk>'),
 ('spkp282_95', "you're", 'you are'),
 ('spkp282_95', 'just savoring', 'savouring'),
 ('spkp237_571', 'heartwarming', '<unk>'),
 ('spkp232_18', 'colorful', 'colourful'),
 ('spkp237_565', 'theatre', 'theater'),
 ('spkp237_565', 'so impactful', '<unk>'),
 ('spkp364_395', 'tv', '<unk>'),
 ('spkp364_395', 'glitch', '<unk>'),
 ('spkp339_141', 'ew', 'you'),
 ('spkp374_381', "can't", 'just look down and i i'),
 ('spkp237_388', 'gps', '<unk>'),
 ('spkp237_388', 'rerouting', '<unk>'),
 ('spkp237_388', 'stop-and-go', 'stop and go'),
 ('spkp374_395', 'tv', '<unk>'),
 ('spkp374_395', 'glitch', '<unk>'),
 ('spkp282_423', 'backlash', '<unk>'),
 ('spkp339_169', 'chalkboard', '<unk>'),
 ('spkp237_439', 'gonna', 'going to'),
 ('spkp304_6', 'uh', 'ah'),
 ('spkp364_342', 'armrests', '<unk>'),
 ('spkp304_119', 'vibe', '<unk>'),
 ('spkp339_196', 'times', 'time'),
 ('spkp374_430', 'scared of', 'dare to'),
 ('spkp237_405', 'um', "i'm"),
 ('spkp237_405', 'uh', 'ah'),
 ('spkp237_405', 'uh', 'ah'),
 ('spkp232_232', 'open-minded', 'open minded'),
 ('spkp232_232', 'uh realizing', 'er realising'),
 ('spkp374_418', 'what-ifs', 'what ifs'),
 ('spkp339_39', 'mesmerizing', '<unk> no <unk>'),
 ('spkp226_602', 'diy', '<unk>'),
 ('spkp314_125', 'it’s', "it's"),
 ('spkp314_125', 'world’s', "world's"),
 ('spkp232_554', 'and oh', 'i know'),
 ('spkp271_41', 'peace', 'piece'),
 ('spkp271_41', 'peace', 'piece'),
 ('spkp237_14', 'kinda', 'kind of'),
 ('spkp232_540', 'game-changer', 'game changer'),
 ('spkp232_540', 'weight', 'white'),
 ('spkp364_418', 'what-ifs', 'what ifs'),
 ('spkp282_392', '20', '<unk>'),
 ('spkp318_473', "it's daunting just", 'jaunting'),
 ('spkp304_125', 'pane', 'paine'),
 ('spkp304_125', 'it’s', "it's"),
 ('spkp304_125', 'world’s', "world's"),
 ('spkp271_112', 'it makes', "and it's"),
 ('spkp226_170', 'cause', 'because'),
 ('spkp339_357', 'roller coaster', '<unk>'),
 ('spkp225_528', 'café', 'cafe'),
 ('spkp225_528', 'cause i', 'because'),
 ('spkp228_251', 'old', 'all'),
 ('spkp317_541', 'veggie', '<unk>'),
 ('spkp228_523', 'scrolling', '<unk>'),
 ('spkp228_523', 'emails', '<unk>'),
 ('spkp317_555', 'uh', 'ah'),
 ('spkp304_469', "there's", 'there is'),
 ('spkp304_469', "can't help", "just it's not helped"),
 ('spkp360_216', 'unsee unknow', '<unk> <unk>'),
 ('spkp339_380', 'relax', 'feel relaxed'),
 ('spkp339_394', 'emails', '<unk>'),
 ('spkp360_564', 'theatre', 'theater'),
 ('spkp264_269', "can't", 'count'),
 ('spkp226_400', 'um to fantasize', '<unk>'),
 ('spkp304_333', 'onto', 'on to'),
 ('spkp318_103', 'kinda', 'kind of'),
 ('spkp237_607', '“you’re', "you're"),
 ('spkp237_607', 'don’t', "don't"),
 ('spkp237_607', 'once,”', 'once'),
 ('spkp237_607', 'i’m', "i'm"),
 ('spkp364_168', "couldn't", 'couldn t'),
 ('spkp271_310', 'uh', 'oh'),
 ('spkp318_117', 'texted', '<unk>'),
 ('spkp226_414', 'paralyzed', 'paralysed'),
 ('spkp264_533', 'cause i', 'because'),
 ('spkp360_560', 'theatre', 'theater'),
 ('spkp339_390', 'cause', 'because'),
 ('spkp314_479', 'uh', 'ah'),
 ('spkp237_159', "must've", 'must have'),
 ('spkp304_445', 'neighbor uh', 'neighbour'),
 ('spkp317_592', 'a', 'the'),
 ('spkp317_592', 'regroup', '<unk>'),
 ('spkp237_603', 'resonates', '<unk>'),
 ('spkp314_445', 'um', 'and'),
 ('spkp360_548', 'favorite', 'favourite'),
 ('spkp360_548', 'flavor', 'flavour'),
 ('spkp318_107', "um it's kinda", 'kind of'),
 ('spkp282_180', "they're explaining", 'there explain'),
 ('spkp264_523', 'scrolling', '<unk>'),
 ('spkp264_523', 'emails', '<unk>'),
 ('spkp226_389', 'service', 'serviced'),
 ('spkp339_353', "can't", "didn't chant didn't work"),
 ('spkp364_187', 'unsee unhear un-experience it', '<unk> <unk> or <unk> at'),
 ('spkp339_421', 'worst-case', 'worst case'),
 ('spkp374_187', 'unsee unhear un-experience', '<unk> <unk> <unk>'),
 ('spkp304_486', 'uh', 'ah'),
 ('spkp304_486', "i've", 'i'),
 ('spkp314_492', 'a bit mystified by the discovery', 'oh man'),
 ('spkp225_262', 'uh', 'oh'),
 ('spkp314_486', "i've", 'i'),
 ('spkp314_486', 'gasp', 'gasped'),
 ('spkp314_486', 'caught', 'coat'),
 ('spkp364_346', 'cause', 'because'),
 ('spkp237_429', 'googling', '<unk>'),
 ('spkp317_8', 'uh', 'ah'),
 ('spkp314_109', 'kinda', 'kind of'),
 ('spkp374_346', 'cause', 'because'),
 ('spkp237_415', 'tv', '<unk>'),
 ('spkp271_116', 'wound', 'went'),
 ('spkp304_121', 'it’s', "it's"),
 ('spkp304_121', 'i’m', "i'm"),
 ('spkp304_121', 'i’m 16', "i'm <unk>"),
 ('spkp304_121', 'i’d forgotten', "i'd forgot"),
 ('spkp237_373', "can't watch my heart just just racing too much", 'be like'),
 ('spkp314_135', 'i am overthinking', "i'm <unk>"),
 ('spkp237_367', "it's", 'it'),
 ('spkp364_408', 'um', 'and'),
 ('spkp314_121', 'it’s', "it's"),
 ('spkp314_121', 'i’m', "i'm"),
 ('spkp314_121', 'i’m 16', "i'm <unk>"),
 ('spkp314_121', 'i’d', "i'd"),
 ('spkp226_606', 'there’s', "there's"),
 ('spkp226_606', '“come', 'come'),
 ('spkp226_606', 'it’ll', "it'll"),
 ('spkp226_606', 'you’ll', "you'll"),
 ('spkp226_606', 'it,”', 'it'),
 ('spkp226_606', 'i’m', "i'm"),
 ('spkp304_135', 'overthinking', '<unk>'),
 ('spkp271_102', 'kinda', 'kind of'),
 ('spkp339_151', 'off-putting', 'off putting'),
 ('spkp282_369', 'cause', 'because'),
 ('spkp318_488', 'telemarketer', '<unk>'),
 ('spkp318_488', 'cause i', 'because'),
 ('spkp237_398', 'uh', "you're just"),
 ('spkp282_355', "it's", 'it'),
 ('spkp282_341', 'panicking', '<unk>'),
 ('spk3_16', 'uh kinda', 'kind of'),
 ('spkp339_179', 'try', 'tried'),
 ('spkp374_540', 'game-changer', 'game changer'),
 ('spkp364_232', 'open-minded', 'open minded'),
 ('spkp226_55', "same time and it's like", 'slight'),
 ('spkp226_55', 'lostness', '<unk>'),
 ('spkp374_232', 'open-minded', 'open minded'),
 ('spkp232_418', 'what-ifs', 'what ifs'),
 ('spkp364_540', 'game-changer', 'game changer'),
 ('spkp364_226', 'nope', 'no'),
 ('spkp360_602', 'diy', 'di'),
 ('spkp360_602', "i'm", 'and'),
 ('spkp264_127', 'there’s', "there's"),
 ('spkp226_69', 'um', "i'm"),
 ('spkp226_69', 'crafting', '<unk>'),
 ('spkp360_158', 'this', 'the'),
 ('spkp228_50', 'with', 'of'),
 ('spkp228_50', 'just perfect', 'purse puffed'),
 ('spkp232_342', 'armrests', '<unk>'),
 ('spkp282_52', 'freeing', 'free'),
 ('spkp317_169', 'chalkboard', '<unk>'),
 ('spkp282_209', 'uh', 'are'),
 ('spkp225_128', 'full-on', 'full on'),
 ('spkp225_128', 'kinda', 'kind of'),
 ('spkp232_395', 'uh', 'er'),
 ('spkp232_395', 'tv', '<unk>'),
 ('spkp232_395', 'glitch', '<unk>'),
 ('spkp314_18', 'uhm', 'um'),
 ('spkp225_114', 'today’s', "today's"),
 ('spkp225_114', 'made', 'make'),
 ('spkp232_168', 'like', 'light'),
 ('spkp237_8', 'uh', 'ah'),
 ('spkp360_400', 'fantasize', '<unk>'),
 ('spkp304_519', 'um', "i'm"),
 ('spkp304_519', 'right', 'write'),
 ('spkp304_519', 'just', 'so so'),
 ('spkp339_596', 'gps', '<unk>'),
 ('spkp226_202', 'uh', 'er'),
 ('spkp314_525', 'groundbreaking', '<unk>'),
 ('spkp228_484', 'get-together', 'get together'),
 ('spkp226_216', 'unsee unknow', '<unk> i know'),
 ('spkp317_394', 'emails', '<unk>'),
 ('spkp317_394', 'rechecking', 're checking'),
 ('spkp304_21', 'um', "i'm"),
 ('spk2_56', 'the sky is', 'this'),
 ('spkp339_555', 'uh', 'ah'),
 ('spkp225_464', 'break-ins', 'break ins'),
 ('spkp225_464', 'double-checking', 'double checking'),
 ('spkp228_335', 'not-funny', 'not funny'),
 ('spkp317_61', "it's", 'is'),
 ('spkp317_61', 'just awe-inspiring', 'awe inspiring'),
 ('spkp314_294', 'want to', 'wanna'),
 ('spkp317_357', "i'm", 'am'),
 ('spkp317_342', 'armrests', '<unk>'),
 ('spkp225_303', 'into', 'to'),
 ('spkp228_452', 'googling', '<unk>'),
 ('spkp228_452', "i can't calm", 'count'),
 ('spkp339_232', 'open-minded', 'open minded'),
 ('spkp339_540', 'game-changer', 'game changer'),
 ('spkp225_459', 'just becomes', 'comes'),
 ('spkp317_418', 'what-ifs', 'what ifs'),
 ('spk2_57', 'envelop', 'envelope'),
 ('spk2_57', 'crafting', '<unk>'),
 ('spkp271_275', 'uh', 'well'),
 ('spkp228_485', 'cause', 'because'),
 ('spkp317_395', 'right', 'write'),
 ('spkp317_395', 'tv', '<unk>'),
 ('spkp317_395', "there's", "it's"),
 ('spkp317_395', 'glitch', '<unk>'),
 ('spkp226_571', 'heartwarming seeing', '<unk> seen'),
 ('spkp226_571', 'realize', 'realise'),
 ('spkp304_524', 'cause i', 'because'),
 ('spkp271_513', "you've um you've fantasized", 'you <unk>'),
 ('spkp271_513', "happening and it's just just so surreal", 'all'),
 ('spkp226_565', 'impactful', '<unk>'),
 ('spkp264_59', 'um', 'and'),
 ('spkp264_442', 'uh', 'ah'),
 ('spkp264_442', 'laundromat', '<unk>'),
 ('spkp228_491', 'cause i', 'because'),
 ('spkp264_324', 'timelines', '<unk>'),
 ('spkp314_524', "i'm", 'am'),
 ('spkp314_524', 'cause i', 'because'),
 ('spkp271_261', 'do', 'did'),
 ('spkp226_203', 'apologizing', 'apologising'),
 ('spkp304_518', 'like-minded', 'like minded'),
 ('spkp364_71', "flame's", 'flames'),
 ('spkp314_518', 'like-minded', 'like minded'),
 ('spkp232_169', 'chalkboard', '<unk>'),
 ('spkp360_415', 'tv just', '<unk>'),
 ('spkp314_19', "should've", 'should have'),
 ('spkp228_122', 'there’s', "there's"),
 ('spkp225_115', 'earbuds', '<unk>'),
 ('spkp232_394', 'emails', '<unk>'),
 ('spkp282_546', "it's", 'is'),
 ('spkp364_596', 'gps', '<unk>'),
 ('spkp364_596', 'just puzzled', 'tousled'),
 ('spkp360_159', "must've", 'must have'),
 ('spkp264_132', 'tutorials', '<unk>'),
 ('spkp264_132', '‘how', 'how'),
 ('spkp264_132', 'be?’', 'be'),
 ('spkp374_569', 'it was just', 'is'),
 ('spkp232_357', 'roller coaster', '<unk>'),
 ('spkp364_541', 'this', 'the'),
 ('spkp364_541', 'veggie', '<unk>'),
 ('spkp374_233', 'trying to', 'try and'),
 ('spkp360_603', 'resonates', '<unk>'),
 ('spkp374_541', 'veggie', '<unk>'),
 ('spkp232_21', 'uh', 'ah'),
 ('spkp232_592', 'regroup', '<unk>'),
 ('spkp271_87', 'this', 'the'),
 ('spk4_22', 'how', 'her'),
 ('spkp237_399', 'um', "i'm"),
 ('spkp364_390', 'cause', 'because'),
 ('spkp318_19', "should've", 'should have'),
 ('spkp374_390', 'cause', 'because'),
 ('spkp314_120', 'hadn’t', "hadn't"),
 ('spkp314_120', '90s', '<unk>'),
 ('spkp314_120', '‘why not?’', 'why not'),
 ('spkp237_400', 'fantasize', '<unk>'),
 ('spkp271_103', 'kinda', 'kind of'),
 ('spkp304_134', 'overthinking', '<unk>'),
 ('spkp304_134', 'jewelry’s', "jewellery's"),
 ('spkp304_134', 'it’s', "it's"),
 ('spkp304_120', 'hadn’t', "hadn't"),
 ('spkp304_120', '90s', '<unk>'),
 ('spkp304_120', '‘why not?’', 'why not'),
 ('spkp271_117', 'texted', '<unk>'),
 ('spkp226_175', 'cause', 'because'),
 ('spkp232_545', 'impacting', '<unk>'),
 ('spkp314_134', 'overthinking', '<unk>'),
 ('spkp314_134', 'jewelry’s', "jewelry's"),
 ('spkp314_134', 'it’s', "it's"),
 ('spkp237_372', 'uh', 'ah'),
 ('spkp304_3', 'uh', 'ah'),
 ('spkp304_3', 'like', 'that'),
 ('spkp374_347', 'cause', 'because'),
 ('spkp318_338', 'this', 'the'),
 ('spkp339_187', 'unsee unhear un-experience', '<unk> <unk> <unk>'),
 ('spkp374_421', 'worst-case', 'worst case'),
 ('spkp364_347', 'cause', 'because'),
 ('spkp314_108', 'cause uh', 'because'),
 ('spkp314_108', 'kinda', 'kind of'),
 ('spkp364_421', 'worst-case', 'worst case'),
 ('spkp264_287', 'just feel', 'still'),
 ('spkp314_487', 'cause i', 'because'),
 ('spkp225_511', 'videos', '<unk>'),
 ('spkp282_156', 'unhear', '<unk>'),
 ('spkp339_434', 'about', 'that'),
 ('spkp226_388', 'gps', '<unk>'),
 ('spkp226_388', 'rerouting', '<unk>'),
 ('spkp226_388', 'stop-and-go', 'stop and go'),
 ('spkp318_106', 'kinda', 'kind of'),
 ('spkp314_444', 'squishy', '<unk>'),
 ('spkp264_244', 'out for', 'up from'),
 ('spkp264_244', "they're not there anymore and", 'then'),
 ('spkp237_602', 'diy', '<unk>'),
 ('spkp237_164', 'uh there', 'i'),
 ('spkp304_450', 'uh', 'a'),
 ('spkp226_405', 'um so', '<unk>'),
 ('spkp226_405',
  'drilling into your head making it impossible to uh to find any peace',
  'just'),
 ('spkp374_179', 'try', 'tried'),
 ('spkp374_179', 'uh', 'a'),
 ('spkp304_444', 'squishy', '<unk>'),
 ('spkp314_336', "i'm", 'am'),
 ('spkp271_315', 'uh', 'oh'),
 ('spkp364_151', 'off-putting', 'off putting'),
 ('spkp304_478', 'braving', 'breathing'),
 ('spkp374_151', 'off-putting', 'off putting'),
 ('spkp225_288', 'mid-laugh', 'mid laugh'),
 ('spkp271_329', "they're just", "there's"),
 ('spkp364_145', 'looked', 'looks'),
 ('spkp318_138', 'yuck', '<unk>'),
 ('spkp364_153', 'they', 'i'),
 ('spkp339_393', 'uh', 'ah'),
 ('spkp304_452', 'googling', '<unk>'),
 ('spkp264_520', 'this', 'the'),
 ('spkp264_520', 'heartwarming', '<unk>'),
 ('spkp237_166', 'scrunches', '<unk>'),
 ('spkp237_600', 'salsa', '<unk>'),
 ('spkp318_104', 'surreal', '<unk>'),
 ('spkp226_361', 'panicking', '<unk>'),
 ('spkp304_320', 'uh', 'ah'),
 ('spkp374_609', 'forms', 'forums'),
 ('spkp264_252', 'anew', 'and you'),
 ('spkp314_452', 'googling', '<unk>'),
 ('spkp304_446', 'um', "you're just"),
 ('spkp339_422', 'as', 'is'),
 ('spkp364_190', 'the', 'a'),
 ('spkp304_491', 'cause i', 'because'),
 ('spkp225_507', 'travelers', 'travellers'),
 ('spkp339_378', 'panicking', '<unk>'),
 ('spkp314_485', 'cause', 'because'),
 ('spkp228_524', 'cause i', 'cuz'),
 ('spkp225_275', 'uh', 'ah'),
 ('spkp304_485', 'cause', 'because'),
 ('spkp228_242', 'uh', 'oh'),
 ('spkp374_423', 'backlash', '<unk>'),
 ('spkp237_358', 'tossing', '<unk>'),
 ('spkp304_1', 'um', "i'm"),
 ('spkp364_423', 'backlash', '<unk>'),
 ('spkp318_448', 'glitch', '<unk>'),
 ('spkp339_16', 'uh kinda', 'kind of'),
 ('spkp237_364', 'second-guess', 'second guess'),
 ('spkp314_122', 'there’s', "there's"),
 ('spkp226_605', '“you’d', "you'd"),
 ('spkp226_605', 'instrument,”', 'instrument'),
 ('spkp282_395', 'uh', 'are'),
 ('spkp282_395', 'right', 'write'),
 ('spkp282_395', 'tv', '<unk>'),
 ('spkp282_395', 'glitch', '<unk>'),
 ('spkp237_370', 'but', 'it'),
 ('spkp232_547', 'favorite', 'favourite'),
 ('spkp232_547', 'ambiance', '<unk>'),
 ('spkp364_379', "there's this", 'there is the'),
 ('spkp237_416', 'right', 'write'),
 ('spkp271_115', 'earbuds', '<unk>'),
 ('spkp304_122', 'there’s', "there's"),
 ('spkp282_418', 'what-ifs', 'what ifs'),
 ('spkp364_392', '20', '<unk>'),
 ('spkp374_392', '20', '<unk>'),
 ('spkp226_188', 'open-minded', 'open minded'),
 ('spkp282_342', 'armrests', '<unk>'),
 ('spkp339_608', '“expand', 'expand'),
 ('spkp339_608', 'horizons”', 'horizons'),
 ('spkp364_225', 'patience', 'patients'),
 ('spkp374_231', 'curveball', 'curve ball'),
 ('spkp318_528', 'café', 'cafe'),
 ('spkp318_528', 'cause i', 'because'),
 ('spkp364_543', 'heartwarming', '<unk>'),
 ('spkp364_543', 'workouts', '<unk>'),
 ('spkp374_225', 'patience', 'patients'),
 ('spkp374_543', 'heartwarming', '<unk>'),
 ('spkp374_543', 'workouts', '<unk>'),
 ('spkp364_231', 'curveball', 'curve ball'),
 ('spkp282_593', 'daunting', 'so'),
 ('spkp232_341', 'panicking', '<unk>'),
 ('spkp232_427', 'i i', "i'd"),
 ('spkp228_53', 'uh', 'oh'),
 ('spkp374_219', 'can not', 'cannot'),
 ('spkp237_562', 'cinematography', '<unk>'),
 ('spkp264_124', 'it’s', "it's"),
 ('spkp264_124', 'non-stop', 'non stop'),
 ('spkp264_124', 'i’m', "i'm"),
 ('spkp264_124', '– kinda', 'kind of'),
 ('spkp374_594', "i'm just", 'not'),
 ('spkp364_580', "it's", 'is'),
 ('spkp228_108', 'danced', 'dance'),
 ('spk6_19', "should've", 'should have'),
 ('spkp225_117', 'texted', '<unk>'),
 ('spkp228_120', 'hadn’t', "hadn't"),
 ('spkp228_120', '90s', '<unk>'),
 ('spkp228_120', '‘why not?’', 'why not'),
 ('spkp317_156', 'unhear', '<unk>'),
 ('spkp228_134', 'overthinking', '<unk>'),
 ('spkp228_134', 'jewelry’s', "jewelry's"),
 ('spkp228_134', 'it’s', "it's"),
 ('spkp225_103', 'realize', 'realise'),
 ('spkp314_268', 'and i', 'an eye'),
 ('spkp339_595', '20/20', '<unk>'),
 ('spkp317_89', 'self-care', 'self care'),
 ('spkp360_417', 'this', 'the'),
 ('spkp339_581', "there's", 'there is'),
 ('spkp360_371', 'what-ifs', 'what ifs'),
 ('spkp271_511', 'um', "i'm"),
 ('spkp271_511', 'videos', '<unk>'),
 ('spkp304_526', 'cause i', 'because'),
 ('spkp228_487', 'cause', 'because'),
 ('spkp304_240', 'uh', 'ah'),
 ('spkp271_505', 'um', "i'm"),
 ('spkp271_505', 'bioluminescent', '<unk>'),
 ('spkp339_556', 'replaying', '<unk>'),
 ('spkp339_556', 'heartwarming', '<unk>'),
 ('spkp228_478', 'staying in', 'stay again'),
 ('spkp271_288', 'mid-laugh', 'mid laugh'),
 ('spkp339_224', 'cause', 'because'),
 ('spk2_55', "you're", 'you all'),
 ('spk2_55', 'uh', 'are'),
 ('spk2_55', 'lostness', '<unk>'),
 ('spkp364_98', 'replay', '<unk>'),
 ('spkp364_98', 'resonates', '<unk>'),
 ('spkp364_98', 'soundtrack', '<unk>'),
 ('spkp228_444', 'squishy', '<unk>'),
 ('spkp232_180', 'uh', 'ah'),
 ('spkp317_62', "you're", 'you are'),
 ('spk2_69', 'crafting', '<unk>'),
 ('spkp228_336', 'lift', 'live'),
 ('spkp339_219', 'i can', 'and'),
 ('spkp304_296', 'sadder', 'sad'),
 ('spkp317_63', 'uh', 'ah'),
 ('spkp225_466', "just can't", "don't"),
 ('spkp264_496', "um i'm just", 'uh'),
 ('spkp264_496', 'trek', 'trick'),
 ('spkp264_496', 'just thinking', '<unk>'),
 ('spkp317_433', 'sibling', 'siblings'),
 ('spkp364_99', 'colors', 'colours'),
 ('spkp317_77', 'past', 'pass'),
 ('spkp339_225', 'patience', 'patients'),
 ('spkp339_543', 'heartwarming', '<unk>'),
 ('spkp339_543', 'workouts', '<unk>'),
 ('spkp339_231', 'curveball', 'curve ball'),
 ('spkp232_156', 'unhear', '<unk>'),
 ('spkp226_200', 'uh', 'ah'),
 ('spkp271_504', "i'm just", 'i'),
 ('spkp317_382', 'jolts', '<unk>'),
 ('spkp317_382', 'but', 'i'),
 ('spkp360_358', 'tossing', '<unk>'),
 ('spkp228_486', "i've", 'i'),
 ('spkp314_533', 'cause i', 'because'),
 ('spkp304_269', 'realization', 'realisation'),
 ('spkp225_499', 'paddleboarding', '<unk>'),
 ('spkp225_499', 'adrenaline', '<unk>'),
 ('spkp360_364', 'second-guess', 'second guess'),
 ('spkp226_228', 'uh', 'a'),
 ('spkp228_135', 'i am overthinking', "i'm <unk>"),
 ('spkp282_545', 'impacting', '<unk>'),
 ('spkp225_102', 'kinda', 'kind of'),
 ('spkp225_116', 'wound', 'went'),
 ('spk6_18', 'colorful', 'colourful'),
 ('spk6_18', 'uhm', 'erm'),
 ('spkp228_121', 'it’s', "it's"),
 ('spkp228_121', 'i’m', "i'm"),
 ('spkp228_121', 'i’m 16', "i'm <unk>"),
 ('spkp228_121', 'i’d', "i'd"),
 ('spkp228_109', 'kinda', 'kind of'),
 ('spkp364_595', '20/20', '<unk>'),
 ('spkp374_595', '20/20', '<unk>'),
 ('spkp314_26', "it's just mind-blowing", 'mind blowing'),
 ('spkp364_581', "there's", 'there is'),
 ('spkp364_218', 'this', 'a'),
 ('spkp282_592', 'regroup', '<unk>'),
 ('spkp318_515', "you're", 'you were'),
 ('spkp317_194', 'uh', "i'm"),
 ('spkp374_224', 'cause', 'cuz'),
 ('spkp226_57', 'um', "i'm"),
 ('spkp226_57', 'envelop', 'envelope'),
 ('spkp226_57', 'crafting', '<unk>'),
 ('spkp226_57', 'act of creating of bringing joy through food', 'acts'),
 ('spkp364_556', 'replaying', '<unk>'),
 ('spkp364_556', 'so heartwarming', '<unk>'),
 ('spkp282_87', 'this', 'the'),
 ('spkp264_119', 'vibe', '<unk>'),
 ('spkp374_556', 'um', 'then'),
 ('spkp374_556', 'replaying', '<unk>'),
 ('spkp374_556', "brings a smile to your face it's like you're", '<unk>'),
 ('spkp374_556',
  "so thankful for those moments for that perfect date and it's it's just so so heartwarming",
  '<unk>'),
 ('spkp364_224', 'cause', 'because'),
 ('spkp360_166', 'scrunches', '<unk>'),
 ('spkp318_529', "i'm", 'am'),
 ('spkp318_529', 'cause i', 'because'),
 ('spkp282_357', 'uh roller coaster', '<unk>'),
 ('spkp282_357', "i'm", 'am'),
 ('spkp282_431', 'just imagining', 'imagine'),
 ('spkp318_32', 'um', 'and'),
 ('spkp339_153', 'tune out', 'two now'),
 ('spkp364_387', 'cause', 'because'),
 ('spkp364_393', 'uh', 'us'),
 ('spk4_21', 'um', "i'm"),
 ('spkp374_387', 'cause', 'because'),
 ('spkp237_371', 'what-ifs', 'what ifs'),
 ('spkp282_394', 'emails', '<unk>'),
 ('spkp282_394', 'rechecking', "we're checking"),
 ('spkp271_114', 'today’s', "today's"),
 ('spkp364_378', 'panicking', '<unk>'),
 ('spkp237_403', 'checkout', '<unk>'),
 ('spkp237_403', 'dragging', 'jogging'),
 ('spkp232_234', 'uh', '<unk>'),
 ('spkp318_307', 'this', 'the'),
 ('spkp226_604', 'there’s', "there's"),
 ('spkp226_604', '“this', 'this'),
 ('spkp226_604', 'join,” i’m just', "join i'm"),
 ('spkp374_378', 'panicking', '<unk>'),
 ('spkp237_365', 'obsessively', '<unk>'),
 ('spkp364_422',
  "take a break because it's it's too much just just too real",
  'ugh'),
 ('spkp271_128', 'full-on', 'full on'),
 ('spkp232_208', 'uh', 'ah'),
 ('spkp364_436', 'in the', 'a'),
 ('spkp228_525', 'groundbreaking', '<unk>'),
 ('spkp317_553', 'heartwarming', '<unk>'),
 ('spkp314_490', 'cause i', 'because like'),
 ('spkp282_155', 'the uh', 'that'),
 ('spkp304_484', 'get-together', 'get together'),
 ('spkp304_490', 'cause', 'because'),
 ('spkp317_221', 'uh a thousand', '<unk> <unk>'),
 ('spkp314_484', 'get-together', 'get together'),
 ('spkp314_484', 'just shocked', 'shock'),
 ('spkp282_169', 'chalkboard', '<unk>'),
 ('spkp339_423', 'backlash', '<unk>'),
 ('spkp339_423', 'the', 'a'),
 ('spkp374_608', '“expand', 'expand'),
 ('spkp374_608', 'horizons”', 'horizons'),
 ('spkp318_111', 'kinda', 'kind of'),
 ('spkp304_447', 'update', '<unk>'),
 ('spkp264_535',
  'surprised and excited cause i i never meet anyone',
  'surprise sighted one'),
 ('spkp314_335', 'not-funny', 'not funny'),
 ('spkp228_294', 'letting go', 'lingo'),
 ('spkp364_608', '“expand', 'expand'),
 ('spkp364_608', 'horizons”', 'horizons'),
 ('spkp271_464', 'break-ins', 'break ins'),
 ('spkp271_464', 'double-checking', 'double checking'),
 ('spkp226_406',
  "staring at the spinning wheel waiting hoping and it's",
  'just just'),
 ('spkp318_105', 'decide', 'decided'),
 ('spkp304_335', 'not-funny', 'not funny'),
 ('spkp339_392', '20', '<unk>'),
 ('spkp360_562', 'cinematography', '<unk>'),
 ('spkp271_458', 'armrest', '<unk>'),
 ('spkp271_458', 'praying', 'trying'),
 ('spkp374_146', 'ugh', 'ah'),
 ('spkp271_403', 'checkout', '<unk>'),
 ('spkp318_604', 'there’s', "there's"),
 ('spkp318_604', '“this', 'this'),
 ('spkp318_604', 'join,” i’m just', "join i'm"),
 ('spkp271_365', 'obsessively', '<unk>'),
 ('spkp364_109', 'kinda', 'kind of'),
 ('spkp228_595', '20/20', '<unk>'),
 ('spkp304_346', 'brace', 'braced'),
 ('spkp271_371', 'what-ifs', 'what ifs'),
 ('spkp237_114', 'today’s', "today's"),
 ('spkp374_121', 'it’s', "it's"),
 ('spkp374_121', 'i’m', "i'm"),
 ('spkp374_121', 'i’m 16', "i'm <unk>"),
 ('spkp374_121', 'i’d', "i'd"),
 ('spkp364_135', 'i am overthinking', "i'm <unk>"),
 ('spkp360_511', 'um so', '<unk>'),
 ('spkp360_511', 'videos', '<unk>'),
 ('spkp264_208', 'this', 'a'),
 ('spkp364_121', 'it’s', "it's"),
 ('spkp364_121', 'i’m', "i'm"),
 ('spkp364_121', 'i’m 16', "i'm <unk>"),
 ('spkp364_121', 'i’d', "i'd"),
 ('spkp360_505', 'bioluminescent', '<unk>'),
 ('spkp237_128', 'full-on', 'full on'),
 ('spkp237_128', 'but', 'that'),
 ('spkp339_493', 'um', "i'm"),
 ('spkp374_135', 'overthinking', '<unk>'),
 ('spkp360_288', 'mid-laugh', 'mid laugh'),
 ('spkp228_556', 'replaying', '<unk>'),
 ('spkp228_556', 'so heartwarming', '<unk>'),
 ('spkp314_391', 'halt', 'hold'),
 ('spkp282_132', 'tutorials', '<unk>'),
 ('spkp282_132', '‘how', 'how'),
 ('spkp282_132', 'be?’', 'be'),
 ('spkp339_444', 'squishy', '<unk>'),
 ('spkp237_464', 'break-ins', 'break ins'),
 ('spkp237_464', 'double-checking', 'double checking'),
 ...]