# Path of the CTM-edit style transcript that serves as side A of the alignment.
TEST_A = "/Users/joregan/Playing/rd_ctm_edit/H5C120171011va"
# Read the transcript, one whitespace-trimmed entry per line.  The encoding is
# given explicitly because the entries contain non-ASCII (Swedish) text and the
# platform default encoding is not guaranteed to be UTF-8.
with open(TEST_A, encoding="utf-8") as f:
    lines = [line.strip() for line in f]
def accept_all(lines):
    """Return the entries of *lines* with every substitution accepted.

    Each entry is split on single spaces and classified by its last field:

    * ``"cor"`` entries are kept unchanged;
    * ``"sub"`` entries are kept after field 4 is overwritten with field 6
      and the last field is relabelled ``"cor"``;
    * anything else is dropped.

    NOTE(review): the field positions look like the Kaldi ctm-edits layout
    (hypothesis word in field 4, reference word in field 6) -- confirm.
    """
    def _accepted(entry):
        fields = entry.split(" ")
        status = fields[-1]
        if status == "cor":
            return entry
        if status == "sub":
            fields[4] = fields[6]
            fields[-1] = "cor"
            return " ".join(fields)
        # Any other edit type is discarded.
        return None

    return [kept for kept in map(_accepted, lines) if kept is not None]
# Keep only the accepted entries, with substitutions resolved by accept_all().
l2 = accept_all(lines)
# Notebook-style preview of the first two entries; the list literal that
# follows appears to be the captured output of this preview.
l2[0:2]
['2442205210012872721 1 27.86 0.06 Herr 1.0 Herr cor',
 '2442205210012872721 1 28.0 0.48 talman! 1.0 talman! cor']
def ctm_to_timed(lines):
    """Convert space-separated transcript entries into timed segments.

    For every entry, field 2 is parsed as the start time, field 3 as the
    duration and field 6 is taken as the text.  Returns a list of dicts with
    the keys ``"start"``, ``"end"`` (start plus duration) and ``"text"``.
    """
    timed = []
    for entry in lines:
        fields = entry.split(" ")
        onset, length = float(fields[2]), float(fields[3])
        timed.append(dict(start=onset, end=onset + length, text=fields[6]))
    return timed
# Side A: timed words from the transcript, with substitutions accepted.
side_a = ctm_to_timed(accept_all(lines))
# JSON file holding the phonetic recognition output used as side B.
phonfile = "/Users/joregan/Playing/rd_phonetic/2442205210012872721_480p.json"
import json
# The chunk texts are IPA (non-ASCII), so decode explicitly as UTF-8 instead
# of relying on the platform default encoding.
with open(phonfile, encoding="utf-8") as f:
    pieces = json.load(f)
def hf_json_to_timed(data):
    """Convert a ``{"chunks": [...]}`` mapping into timed segments.

    Every chunk must provide a two-element ``"timestamp"`` and a ``"text"``.
    Returns a list of dicts with the keys ``"start"``, ``"end"`` and
    ``"text"``, in chunk order.
    """
    def _as_segment(chunk):
        span = chunk["timestamp"]
        return {"start": span[0], "end": span[1], "text": chunk["text"]}

    return [_as_segment(chunk) for chunk in data["chunks"]]
# Side B: timed segments built from the chunks of the loaded JSON.
side_b = hf_json_to_timed(pieces)
def prune_to_other(left, right, fudge=0.5):
    """Drop the items of *right* that lie outside the time span of *left*.

    An item is discarded when it starts more than *fudge* before the start of
    the first *left* item, or ends more than *fudge* after the end of the last
    *left* item.  Returns ``(left, kept)``; *left* is passed through as-is.
    """
    def _outside(item):
        too_early = item["start"] < left[0]["start"] - fudge
        return too_early or item["end"] > left[-1]["end"] + fudge

    kept = [item for item in right if not _outside(item)]
    return left, kept
# Restrict side B to the time span covered by side A (side A is unchanged).
new_a, new_b = prune_to_other(side_a, side_b)
def end_cost(a, b):
    """Absolute difference between the ``"end"`` values of *a* and *b*."""
    gap = a["end"] - b["end"]
    return abs(gap)


def start_cost(a, b):
    """Absolute difference between the ``"start"`` values of *a* and *b*."""
    gap = a["start"] - b["start"]
    return abs(gap)


def cost(a, b):
    """Total timing cost of pairing *a* with *b*: start cost plus end cost."""
    return start_cost(a, b) + end_cost(a, b)
def in_start_range(a, b, range=0.2):
    """True when the ``"start"`` values of *a* and *b* differ by at most *range*."""
    gap = abs(a["start"] - b["start"])
    return gap <= range


def in_end_range(a, b, range=0.2):
    """True when the ``"end"`` values of *a* and *b* differ by at most *range*."""
    gap = abs(a["end"] - b["end"])
    return gap <= range


def in_range(a, b, range=0.2):
    """True when either the starts or the ends of *a* and *b* are within *range*."""
    # Both checks are always evaluated, then combined.
    start_ok, end_ok = in_start_range(a, b, range), in_end_range(a, b, range)
    return start_ok if start_ok else end_ok
def falls_between(a1, a2, b):
    """True when *b* lies in the gap between *a1* and *a2*.

    That is, *b* ends no later than *a2* starts and starts no earlier than
    *a1* ends (both bounds inclusive).
    """
    ends_before_next = b["end"] <= a2["start"]
    return bool(ends_before_next and b["start"] >= a1["end"])
import numpy as np
def approx_eq(start1, start2, factor=0.04):
    """True when the two values are equal or differ by less than *factor*."""
    if start1 == start2:
        return True
    return abs(start1 - start2) < factor
def align_times(new_a, new_b, merge_end_flexibility=0.06):
    """Score every in-range pairing of two lists of timed segments.

    Args:
        new_a: list of dicts with numeric ``"start"`` and ``"end"`` keys
            (rows of the matrix).
        new_b: list of dicts with numeric ``"start"`` and ``"end"`` keys
            (columns of the matrix).
        merge_end_flexibility: tolerance passed to ``in_end_range`` when
            deciding whether to look ahead in *new_b* for a better end match.

    Returns:
        A tuple ``(dist_matrix, additionals, merges)``:

        * ``dist_matrix`` -- ``numpy.matrix`` of shape
          ``(len(new_a), len(new_b))``.  Cells start at 1.0 and are only
          rescored for pairs accepted by ``in_range``.
        * ``additionals`` -- list of ``(prev_a, next_a, b_index)`` tuples for
          B items that sit before the first A item (``prev_a == -1``),
          between two consecutive A items, or after the last A item
          (``next_a == -1``).
        * ``merges`` -- dict mapping an A index to the run of consecutive B
          indices chosen by the look-ahead.
    """
    s1 = len(new_a)
    s2 = len(new_b)

    additionals = []
    merges = {}

    # Every cell starts at 1.0; only pairs that pass in_range() are rescored.
    dist_matrix = np.matrix(np.ones((s1, s2)))
    # NOTE(review): pair_cost is carried over between iterations rather than
    # reset per pair; if it ever equals exactly 1.0 the guard near the end of
    # the loop body stops recomputing it.  Left as in the original.
    pair_cost = 0.0

    for i in range(s1):
        for j in range(s2):
            if not in_range(new_a[i], new_b[j]):
                continue

            # B items located in a gap of A are recorded as extras and keep
            # the 1.0 score.
            if i == 0 and new_b[j]["end"] < new_a[0]["start"]:
                additionals.append((-1, 0, j))
                dist_matrix[i, j] = 1.0
                continue
            elif i < (s1 - 1) and falls_between(new_a[i], new_a[i + 1], new_b[j]):
                additionals.append((i, i + 1, j))
                dist_matrix[i, j] = 1.0
                continue
            # Fix: the last row index is s1 - 1.  The original compared i with
            # s1, which range(s1) never yields, so this branch was unreachable
            # and trailing extras were never recorded.
            elif i == s1 - 1 and new_b[j]["start"] >= new_a[i]["end"]:
                additionals.append((i, -1, j))
                dist_matrix[i, j] = 1.0
                continue

            if approx_eq(new_a[i]["start"], new_b[j]["start"]):
                # Starts coincide: look ahead in B for the item whose end is
                # closest to the end of A[i].
                tmp_j = j
                fwd = []
                extent = new_b[tmp_j]
                if i < (s1 - 2) and new_b[tmp_j]["end"] < new_a[i + 1]["end"]:
                    extent = new_a[i + 1]
                # NOTE(review): extent is not updated inside this loop, so the
                # in_end_range() test is constant for a given (i, j); when it
                # fails, every remaining B item except the last is collected.
                while tmp_j < (s2 - 1) and not in_end_range(new_a[i], extent, merge_end_flexibility):
                    fwd.append((end_cost(new_a[i], new_b[tmp_j]), tmp_j))
                    tmp_j += 1
                if len(fwd) > 1:
                    sfwd = sorted(fwd)
                    new_j = sfwd[0][1]
                    if new_j != j:
                        pair_cost = sfwd[0][0]
                        merges[i] = [x for x in range(j, new_j + 1)]
                        # Rebinding j only affects the rest of this iteration;
                        # the for loop resumes from its own counter.
                        j = new_j
            if pair_cost != 1.:
                pair_cost = cost(new_a[i], new_b[j])
            dist_matrix[i, j] = pair_cost
    return dist_matrix, additionals, merges
# Run the alignment.  Note that walk_matrix() below reads `mrg` as a global.
dist_m, additions, mrg = align_times(new_a, new_b)
# Notebook-style inspection cells; the literal after each expression appears
# to be the captured output of that expression.
new_b[24]
{'start': 38.14, 'end': 38.32, 'text': 'fɪne'}
additions
[(4, 5, 8), (14, 15, 21)]
dist_m.shape
(18, 25)
def walk_matrix(dist_matrix, additions, merges=None):
    """Walk the cost matrix and return the chosen ``(a_index, b_index)`` pairs.

    Rows are consumed in order.  For each row, the column cursor either
    follows a recorded merge, skips a column recorded as an extra, or moves
    to the cheapest column in the run of scored (non-1.0) cells that starts
    at the cursor.

    Args:
        dist_matrix: 2-D cost matrix indexed ``[a_index, b_index]``.  A cell
            equal to exactly 1.0 ends the candidate scan for a row.
        additions: collection of ``(prev_a, next_a, b_index)`` tuples; a
            column recorded for the current row is skipped, not paired.
        merges: optional mapping ``a_index -> [b_index, ...]``.  A row found
            here is paired with all listed columns.  When omitted, the
            module-level ``mrg`` is used if it exists (the original
            implementation read that global unconditionally); otherwise no
            merges are applied.

    Returns:
        List of ``(a_index, b_index)`` tuples in walk order.  The walk stops
        as soon as either the rows or the columns are exhausted.
    """
    if merges is None:
        # Backward compatibility for callers that rely on the global `mrg`.
        merges = globals().get("mrg", {})

    s1 = dist_matrix.shape[0]
    s2 = dist_matrix.shape[1]

    path = []

    def is_addition(i, j):
        # Column j recorded as sitting just before row i (i == 0 gives the
        # leading-extra key (-1, 0, j)).
        if (i - 1, i, j) in additions:
            return True
        # Column j recorded as sitting between row i and row i + 1.
        if i + 1 < s1 and (i, i + 1, j) in additions:
            return True
        # Trailing extras are keyed on the last row, s1 - 1.  The original
        # compared against s1, which is one past the last valid row index.
        if i == s1 - 1 and (i, -1, j) in additions:
            return True
        return False

    i = 0
    j = 0
    # A single loop bounded on both cursors.  The original nested loops spun
    # forever when columns ran out before rows, and indexed row s1 (or added
    # a bogus pair) when rows ran out before columns.
    while i < s1 and j < s2:
        if i in merges:
            path += [(i, x) for x in merges[i]]
            j = merges[i][-1] + 1
            i += 1
            continue

        if is_addition(i, j):
            j += 1
            continue

        # Collect the run of scored cells starting at the cursor.
        # NOTE(review): as in the original, the last column is never scanned
        # as a candidate; it is only paired when the cursor is already on it.
        candidates = []
        tmpj = j
        while tmpj < s2 - 1 and dist_matrix[i, tmpj] != 1.0:
            candidates.append((dist_matrix[i, tmpj], tmpj))
            tmpj += 1
        if candidates:
            j = min(candidates)[1]
        path.append((i, j))
        i += 1
        j += 1
    return path
additions
[(4, 5, 8), (14, 15, 21)]
# Walk the cost matrix, then show which side-B text was paired with each
# side-A word.
path = walk_matrix(dist_m, additions)
for a_idx, b_idx in path:
    print(new_a[a_idx]["text"], new_b[b_idx]["text"])
Herr ɑː
talman! tɑːlman
Riksdagsledamöter! rɪksasleːda
Riksdagsledamöter! møːtœ̞
Allianspartierna al
Allianspartierna aspatiːæɳa
Moderaterna, mʊdɑːtœ̞ɔɳa
Centerpartiet, sentə
Centerpartiet, patiːət
Liberalerna lɪbɑːlɔɳa
och oː
Kristdemokraterna kɪs
Kristdemokraterna demɔkɑːtɔɳa
föreslår fœ̞ːesoː
som sɔm
riksdagens rɪksdɑːɡəns
förste fœ̞st
vice viːsə
talman tɑːlman
Ewa eva
Thalén tareːn
Finné. fɪne
mrg
{2: [2, 3], 3: [5, 6], 5: [9, 10], 8: [13, 14]}
# For every merged side-A item, print it next to each side-B piece assigned
# to it.
for aa, merged_columns in mrg.items():
    for bb in merged_columns:
        print(new_a[aa], new_b[bb])
{'start': 28.72, 'end': 29.68, 'text': 'Riksdagsledamöter!'} {'start': 28.7, 'end': 29.18, 'text': 'rɪksasleːda'}
{'start': 28.72, 'end': 29.68, 'text': 'Riksdagsledamöter!'} {'start': 29.24, 'end': 29.58, 'text': 'møːtœ̞'}
{'start': 30.16, 'end': 30.98, 'text': 'Allianspartierna'} {'start': 30.16, 'end': 30.26, 'text': 'al'}
{'start': 30.16, 'end': 30.98, 'text': 'Allianspartierna'} {'start': 30.32, 'end': 30.98, 'text': 'aspatiːæɳa'}
{'start': 32.08, 'end': 32.82, 'text': 'Centerpartiet,'} {'start': 32.06, 'end': 32.3, 'text': 'sentə'}
{'start': 32.08, 'end': 32.82, 'text': 'Centerpartiet,'} {'start': 32.36, 'end': 32.86, 'text': 'patiːət'}
{'start': 33.88, 'end': 34.639, 'text': 'Kristdemokraterna'} {'start': 33.84, 'end': 34.04, 'text': 'kɪs'}
{'start': 33.88, 'end': 34.639, 'text': 'Kristdemokraterna'} {'start': 34.08, 'end': 34.64, 'text': 'demɔkɑːtɔɳa'}
import pandas as pd
# Label the timing-cost matrix: rows are side-A texts, columns are side-B texts.
df = pd.DataFrame(data=dist_m,index=[x["text"] for x in new_a], columns=[x["text"] for x in new_b])
from phonemizer import phonemize
# Print each side-A word next to its phonemizer transcription (language 'sv').
for it_a in new_a:
    print(it_a["text"], phonemize(it_a["text"], language='sv'))
Herr hɛr 
talman! tɑːlman 
Riksdagsledamöter! rɪksdɑːɡsleːdamøːtər 
Allianspartierna aliːanspatiːərna 
Moderaterna, muːdeːratərna 
Centerpartiet, sɛntərpatiːət 
Liberalerna liːbəralərna 
och ɔk 
Kristdemokraterna kriːstdəmɔkrɑːtɛrna 
föreslår føːrəsloːr 
som sɔm 
riksdagens rɪksdɑːɡɛns 
förste fœʂtə 
vice viːsə 
talman tɑːlman 
Ewa eːva 
Thalén thɑːleːn 
Finné. fɪneː 
# String pairs to compare character by character.  The first of each pair
# looks like the concatenation of the merged side-B pieces, the second like
# the phonemizer output for the matching side-A word.
CHECK_MERGED = [
    ("rɪksasleːdamøːtœ̞", "rɪksdɑːɡsleːdamøːtər"),
    ("alaspatiːæɳa", "aliːanspatiːərna"),
    ("sentəpatiːət", "sɛntərpatiːət"),
    ("kɪsdemɔkɑːtɔɳa", "kriːstdəmɔkrɑːtɛrna")
]
from difflib import SequenceMatcher

# Print the edit opcodes for every pair, followed by the similarity ratio and
# a blank line.
for a, b in CHECK_MERGED:
    s = SequenceMatcher(None, a, b)
    for tag, i1, i2, j1, j2 in s.get_opcodes():
        row = '{:7}   a[{}:{}] --> b[{}:{}] {!r:>8} --> {!r}'.format(
            tag, i1, i2, j1, j2, a[i1:i2], b[j1:j2])
        print(row)
    print(s.ratio(), end="\n\n")
equal     a[0:4] --> b[0:4]   'rɪks' --> 'rɪks'
replace   a[4:5] --> b[4:8]      'a' --> 'dɑːɡ'
equal     a[5:15] --> b[8:18] 'sleːdamøːt' --> 'sleːdamøːt'
replace   a[15:17] --> b[18:20]     'œ̞' --> 'ər'
0.7567567567567568

equal     a[0:2] --> b[0:2]     'al' --> 'al'
insert    a[2:2] --> b[2:4]       '' --> 'iː'
equal     a[2:3] --> b[4:5]      'a' --> 'a'
insert    a[3:3] --> b[5:6]       '' --> 'n'
equal     a[3:9] --> b[6:12] 'spatiː' --> 'spatiː'
replace   a[9:11] --> b[12:15]     'æɳ' --> 'ərn'
equal     a[11:12] --> b[15:16]      'a' --> 'a'
0.7142857142857143

equal     a[0:1] --> b[0:1]      's' --> 's'
replace   a[1:2] --> b[1:2]      'e' --> 'ɛ'
equal     a[2:5] --> b[2:5]    'ntə' --> 'ntə'
insert    a[5:5] --> b[5:6]       '' --> 'r'
equal     a[5:12] --> b[6:13] 'patiːət' --> 'patiːət'
0.88

equal     a[0:1] --> b[0:1]      'k' --> 'k'
replace   a[1:2] --> b[1:4]      'ɪ' --> 'riː'
equal     a[2:3] --> b[4:5]      's' --> 's'
insert    a[3:3] --> b[5:6]       '' --> 't'
equal     a[3:4] --> b[6:7]      'd' --> 'd'
replace   a[4:5] --> b[7:8]      'e' --> 'ə'
equal     a[5:8] --> b[8:11]    'mɔk' --> 'mɔk'
insert    a[8:8] --> b[11:12]       '' --> 'r'
equal     a[8:11] --> b[12:15]    'ɑːt' --> 'ɑːt'
replace   a[11:13] --> b[15:18]     'ɔɳ' --> 'ɛrn'
equal     a[13:14] --> b[18:19]      'a' --> 'a'
0.6060606060606061

# Notebook-style display of the labelled timing-cost table.
df
ɑː tɑːlman rɪksasleːda møːtœ̞ <pa> al aspatiːæɳa mʊdɑːtœ̞ɔɳa <pa> sentə ... fœ̞ːesoː sɔm rɪksdɑːɡəns fœ̞st viːsə tɑːlman <pa> eva tareːn fɪne
Herr 0.02 0.82 1.0 1.00 1.0 1.0 1.00 1.00 1.0 1.0 ... 1.000 1.000 1.000 1.000 1.000 1.00 1.0 1.00 1.00 1.00
talman! 0.72 0.12 1.0 1.00 1.0 1.0 1.00 1.00 1.0 1.0 ... 1.000 1.000 1.000 1.000 1.000 1.00 1.0 1.00 1.00 1.00
Riksdagsledamöter! 1.00 1.00 1.0 0.62 1.0 1.0 1.00 1.00 1.0 1.0 ... 1.000 1.000 1.000 1.000 1.000 1.00 1.0 1.00 1.00 1.00
Allianspartierna 1.00 1.00 1.0 1.00 1.1 1.0 0.16 1.00 1.0 1.0 ... 1.000 1.000 1.000 1.000 1.000 1.00 1.0 1.00 1.00 1.00
Moderaterna, 1.00 1.00 1.0 1.00 1.0 1.0 1.00 0.02 1.0 1.0 ... 1.000 1.000 1.000 1.000 1.000 1.00 1.0 1.00 1.00 1.00
Centerpartiet, 1.00 1.00 1.0 1.00 1.0 1.0 1.00 1.00 1.0 1.0 ... 1.000 1.000 1.000 1.000 1.000 1.00 1.0 1.00 1.00 1.00
Liberalerna 1.00 1.00 1.0 1.00 1.0 1.0 1.00 1.00 1.0 1.0 ... 1.000 1.000 1.000 1.000 1.000 1.00 1.0 1.00 1.00 1.00
och 1.00 1.00 1.0 1.00 1.0 1.0 1.00 1.00 1.0 1.0 ... 1.000 1.000 1.000 1.000 1.000 1.00 1.0 1.00 1.00 1.00
Kristdemokraterna 1.00 1.00 1.0 1.00 1.0 1.0 1.00 1.00 1.0 1.0 ... 1.000 1.000 1.000 1.000 1.000 1.00 1.0 1.00 1.00 1.00
föreslår 1.00 1.00 1.0 1.00 1.0 1.0 1.00 1.00 1.0 1.0 ... 0.059 0.601 1.000 1.000 1.000 1.00 1.0 1.00 1.00 1.00
som 1.00 1.00 1.0 1.00 1.0 1.0 1.00 1.00 1.0 1.0 ... 0.659 0.041 0.781 1.000 1.000 1.00 1.0 1.00 1.00 1.00
riksdagens 1.00 1.00 1.0 1.00 1.0 1.0 1.00 1.00 1.0 1.0 ... 1.000 0.779 0.001 1.000 1.000 1.00 1.0 1.00 1.00 1.00
förste 1.00 1.00 1.0 1.00 1.0 1.0 1.00 1.00 1.0 1.0 ... 1.000 1.000 1.000 0.019 1.000 1.00 1.0 1.00 1.00 1.00
vice 1.00 1.00 1.0 1.00 1.0 1.0 1.00 1.00 1.0 1.0 ... 1.000 1.000 1.000 1.000 0.021 1.00 1.0 1.00 1.00 1.00
talman 1.00 1.00 1.0 1.00 1.0 1.0 1.00 1.00 1.0 1.0 ... 1.000 1.000 1.000 1.000 1.000 0.04 1.0 1.00 1.00 1.00
Ewa 1.00 1.00 1.0 1.00 1.0 1.0 1.00 1.00 1.0 1.0 ... 1.000 1.000 1.000 1.000 1.000 1.00 1.0 0.02 1.00 1.00
Thalén 1.00 1.00 1.0 1.00 1.0 1.0 1.00 1.00 1.0 1.0 ... 1.000 1.000 1.000 1.000 1.000 1.00 1.0 1.00 0.02 1.00
Finné. 1.00 1.00 1.0 1.00 1.0 1.0 1.00 1.00 1.0 1.0 ... 1.000 1.000 1.000 1.000 1.000 1.00 1.0 1.00 1.00 0.14

18 rows × 25 columns

# Phonetic similarity between every side-A word and every side-B piece.
# Despite the name, each cell holds a SequenceMatcher ratio, i.e. a
# similarity: higher means more alike, 1.0 means identical strings.
phon_dist = np.matrix(np.ones((len(new_a), len(new_b))))

for i in range(len(new_a)):
    # phonemize() returns the transcription with a trailing separator space
    # (visible in the printed transcriptions).  Left in place, that space
    # counts as a mismatching character and caps identical pronunciations
    # below 1.0, so it is stripped before comparing.
    a = phonemize(new_a[i]["text"], language='sv').strip()
    for j in range(len(new_b)):
        s = SequenceMatcher(None, a, new_b[j]["text"])
        phon_dist[i, j] = s.ratio()

df = pd.DataFrame(data=phon_dist,index=[x["text"] for x in new_a], columns=[x["text"] for x in new_b])
df
ɑː tɑːlman rɪksasleːda møːtœ̞ <pa> al aspatiːæɳa mʊdɑːtœ̞ɔɳa <pa> sentə ... fœ̞ːesoː sɔm rɪksdɑːɡəns fœ̞st viːsə tɑːlman <pa> eva tareːn fɪne
Herr 0.000000 0.000000 0.133333 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.133333 0.000000 0.000000 0.000000 0.000000 0.000000 0.200000 0.000000
talman! 0.400000 0.933333 0.210526 0.142857 0.166667 0.200000 0.333333 0.315789 0.166667 0.153846 ... 0.125000 0.181818 0.315789 0.153846 0.153846 0.933333 0.166667 0.181818 0.428571 0.166667
Riksdagsledamöter! 0.173913 0.285714 0.625000 0.296296 0.080000 0.086957 0.193548 0.250000 0.080000 0.307692 ... 0.137931 0.166667 0.562500 0.153846 0.153846 0.285714 0.080000 0.166667 0.222222 0.160000
Allianspartierna 0.105263 0.250000 0.285714 0.173913 0.190476 0.210526 0.592593 0.071429 0.190476 0.272727 ... 0.240000 0.100000 0.214286 0.181818 0.363636 0.250000 0.190476 0.100000 0.260870 0.095238
Moderaterna, 0.125000 0.285714 0.240000 0.300000 0.111111 0.125000 0.250000 0.240000 0.111111 0.315789 ... 0.272727 0.117647 0.240000 0.105263 0.210526 0.285714 0.111111 0.235294 0.300000 0.111111
Centerpartiet, 0.125000 0.095238 0.240000 0.100000 0.222222 0.125000 0.500000 0.160000 0.222222 0.421053 ... 0.181818 0.117647 0.160000 0.210526 0.315789 0.095238 0.222222 0.117647 0.100000 0.111111
Liberalerna 0.133333 0.300000 0.250000 0.105263 0.117647 0.266667 0.260870 0.166667 0.117647 0.111111 ... 0.095238 0.000000 0.250000 0.000000 0.333333 0.300000 0.117647 0.125000 0.210526 0.117647
och 0.000000 0.000000 0.142857 0.000000 0.000000 0.000000 0.000000 0.142857 0.000000 0.000000 ... 0.000000 0.333333 0.142857 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
Kristdemokraterna 0.181818 0.296296 0.258065 0.230769 0.083333 0.090909 0.200000 0.322581 0.083333 0.240000 ... 0.214286 0.173913 0.387097 0.160000 0.320000 0.296296 0.083333 0.086957 0.230769 0.083333
föreslår 0.153846 0.222222 0.363636 0.235294 0.000000 0.153846 0.095238 0.090909 0.000000 0.125000 ... 0.526316 0.142857 0.272727 0.250000 0.250000 0.222222 0.000000 0.000000 0.117647 0.133333
som 0.000000 0.181818 0.133333 0.200000 0.000000 0.000000 0.142857 0.133333 0.000000 0.222222 ... 0.166667 0.857143 0.133333 0.222222 0.222222 0.181818 0.000000 0.000000 0.000000 0.000000
riksdagens 0.285714 0.315789 0.434783 0.111111 0.000000 0.000000 0.181818 0.260870 0.000000 0.235294 ... 0.200000 0.133333 0.869565 0.117647 0.117647 0.315789 0.000000 0.000000 0.333333 0.250000
förste 0.000000 0.153846 0.000000 0.166667 0.000000 0.000000 0.125000 0.117647 0.000000 0.363636 ... 0.285714 0.000000 0.117647 0.545455 0.181818 0.153846 0.000000 0.000000 0.166667 0.200000
vice 0.250000 0.153846 0.117647 0.166667 0.000000 0.000000 0.250000 0.117647 0.000000 0.363636 ... 0.285714 0.222222 0.235294 0.181818 0.909091 0.153846 0.000000 0.222222 0.166667 0.000000
talman 0.400000 0.933333 0.210526 0.142857 0.166667 0.200000 0.333333 0.315789 0.166667 0.153846 ... 0.125000 0.181818 0.315789 0.153846 0.153846 0.933333 0.166667 0.181818 0.428571 0.166667
Ewa 0.285714 0.333333 0.375000 0.181818 0.222222 0.285714 0.266667 0.250000 0.222222 0.200000 ... 0.307692 0.000000 0.125000 0.000000 0.200000 0.333333 0.222222 0.750000 0.363636 0.222222
Thalén 0.363636 0.625000 0.300000 0.133333 0.000000 0.181818 0.210526 0.200000 0.000000 0.142857 ... 0.352941 0.000000 0.300000 0.142857 0.142857 0.625000 0.000000 0.166667 0.533333 0.153846
Finné. 0.250000 0.153846 0.352941 0.166667 0.000000 0.000000 0.125000 0.117647 0.000000 0.181818 ... 0.428571 0.000000 0.235294 0.181818 0.181818 0.153846 0.000000 0.222222 0.333333 0.800000

18 rows × 25 columns

# NOTE(review): `collected` is not used anywhere in the visible part of the
# file -- confirm whether later code needs it.
collected = set()
# Copy word pairs whose phonemizer transcriptions are identical to a TSV file.
# NOTE(review): neither file is opened with an explicit encoding -- confirm
# the platform default is suitable for the data.
with open("/tmp/lev-su") as f, open("/tmp/matched.tsv", "w") as of:
    for line in f.readlines():
        # Whitespace-separated fields; fields 1 and 2 are the two words being
        # compared (field 0 is not used here).  A line with fewer than three
        # fields raises IndexError.
        parts = line.strip().split()
        ph1 = phonemize(parts[1], language='sv')
        ph2 = phonemize(parts[2], language='sv')
        # Keep the pair only when both words phonemize to the same string.
        if ph1 == ph2:
            of.write(f"{parts[1]}\t{parts[2]}\n")