# First attempts at alignment (take 2).
# I wish I could find the first version.
TEST_A = "/Users/joregan/Playing/rd_ctm_edit/H5C120171011va"

# Load the input file into `lines`, one entry per line, with surrounding
# whitespace (including the trailing newline) removed.
lines = []
with open(TEST_A) as f:
    lines.extend(raw.strip() for raw in f)
def accept_all(lines):
    """Return the lines marked "cor", plus "sub" lines rewritten as "cor".

    Each line is split on single spaces and its last field is the label.
    Lines labelled "cor" are kept verbatim. For lines labelled "sub",
    field 6 is copied over field 4 and the label becomes "cor". Lines with
    any other label are dropped.

    NOTE(review): presumably fields 4 and 6 are the hypothesis and reference
    words of a ctm-edit file -- confirm against the producer of the input.
    """
    accepted = []
    for entry in lines:
        fields = entry.split(" ")
        label = fields[-1]
        if label == "cor":
            accepted.append(entry)
            continue
        if label != "sub":
            continue
        fields[4] = fields[6]
        fields[-1] = "cor"
        accepted.append(" ".join(fields))
    return accepted
# Bare expression: runs accept_all over the loaded lines without storing the
# result (presumably a notebook cell used to display the output -- confirm).
accept_all(lines)
def ctm_to_timed(lines):
    """Turn space-separated lines into a list of start/end/text dicts.

    For every line, field 2 is parsed as the start and field 3 as the
    duration (both as floats); "end" is start + duration and "text" is
    field 6.
    """
    timed = []
    for fields in (entry.split(" ") for entry in lines):
        begin = float(fields[2])
        length = float(fields[3])
        timed.append({"start": begin, "end": begin + length, "text": fields[6]})
    return timed
import json

# Side A: timed entries built from the accepted lines of the first input.
side_a = ctm_to_timed(accept_all(lines))

# Second input: a JSON document, parsed into `pieces`.
phonfile = "/Users/joregan/Playing/rd_phonetic/2442205210012872721_480p.json"
with open(phonfile) as f:
    pieces = json.load(f)
def hf_json_to_timed(data):
    """Convert the entries of data["chunks"] into start/end/text dicts.

    Each chunk contributes one dict: "start" and "end" come from the first
    two elements of chunk["timestamp"], "text" from chunk["text"].
    """
    return [
        {
            "start": chunk["timestamp"][0],
            "end": chunk["timestamp"][1],
            "text": chunk["text"],
        }
        for chunk in data["chunks"]
    ]
# Side B: the parsed JSON converted to start/end/text dicts, the same keys
# that ctm_to_timed produces for side_a.
side_b = hf_json_to_timed(pieces)
def prune_to_other(left, right, fudge=0.5):
    """Drop items of `right` that lie outside the time span covered by `left`.

    The span runs from the start of the first `left` item to the end of the
    last one, widened by `fudge` on both sides. An item of `right` is kept
    only if it neither starts before that span nor ends after it.

    Args:
        left: reference list of dicts with "start" and "end" keys; returned
            unchanged (same object).
        right: list of dicts with "start" and "end" keys to be filtered.
        fudge: tolerance added to both ends of the reference span.

    Returns:
        A tuple ``(left, kept)`` where ``kept`` is a new list holding the
        surviving items of `right` in their original order.
    """
    if not left:
        # With no reference span nothing can fall inside it; previously this
        # case raised IndexError whenever `right` was non-empty.
        return left, []

    # The bounds do not depend on the item being tested, so compute them once.
    earliest = left[0]["start"] - fudge
    latest = left[-1]["end"] + fudge

    kept = [
        item
        for item in right
        if not (item["start"] < earliest or item["end"] > latest)
    ]
    return left, kept
# Filter side_b against side_a's time span using the default fudge;
# new_a is side_a itself, new_b is the filtered copy of side_b.
new_a, new_b = prune_to_other(side_a, side_b)
# Bare expressions: first item of each side, values unused
# (presumably notebook cells for eyeballing the data -- confirm).
new_b[0]
new_a[0]
def cost(a, b):
    """Return the summed absolute differences of the start and end times.

    Both `a` and `b` are mappings with numeric "start" and "end" entries.
    """
    return abs(a["start"] - b["start"]) + abs(a["end"] - b["end"])
def in_start_range(a, b, range=0.2):
    """Tell whether the start times of `a` and `b` differ by at most `range`."""
    start_gap = abs(a["start"] - b["start"])
    return start_gap <= range
def in_end_range(a, b, range=0.2):
    """Tell whether the end times of `a` and `b` differ by at most `range`."""
    end_gap = abs(a["end"] - b["end"])
    return end_gap <= range
def in_range(a, b, range=0.2):
    """Tell whether the start times or the end times are within `range`.

    Both checks are always evaluated (no short-circuit), by delegating to
    in_start_range and in_end_range with the same tolerance.
    """
    checks = (in_start_range(a, b, range), in_end_range(a, b, range))
    return checks[0] or checks[1]
# Bare expressions: compare the first new_a item with the last and then the
# first new_b item; results are discarded (presumably notebook sanity
# checks -- confirm).
in_range(new_a[0], new_b[-1])
in_range(new_a[0], new_b[0])
import numpy as np
import pandas as pd

# Accumulated-cost matrix between new_a (rows) and new_b (columns), with one
# extra leading row and column acting as the border.
s1 = len(new_a) + 1
s2 = len(new_b) + 1
dist_matrix = np.zeros((s1, s2))

# Border cells hold their own index: 0, 1, 2, ...
dist_matrix[:, 0] = np.arange(s1)
dist_matrix[0, :] = np.arange(s2)

# Single pass over all pairs. The earlier version first stored the raw pair
# cost in every in-range cell and then recomputed the same cells in a second
# pass; those first-pass values were always overwritten before being read, so
# the two passes are merged here. Printed output and final matrix are the same.
for i, item_a in enumerate(new_a, start=1):
    for j, item_b in enumerate(new_b, start=1):
        if not in_range(item_a, item_b):
            # NOTE(review): skipped cells keep the 0.0 from np.zeros, which
            # makes them cheaper than any in-range neighbour when later cells
            # take the minimum below. If a windowed DTW is intended these
            # should probably be np.inf -- confirm before changing.
            continue
        pair_cost = cost(item_a, item_b)
        print(item_a, item_b, pair_cost)
        # Cheapest of the three predecessors (up, left, diagonal) plus the
        # cost of this pair.
        dist_matrix[i, j] = pair_cost + min(
            dist_matrix[i - 1, j],
            dist_matrix[i, j - 1],
            dist_matrix[i - 1, j - 1],
        )

# Label rows with new_a texts and columns with new_b texts; the empty label
# corresponds to the border row/column.
df = pd.DataFrame(
    data=dist_matrix,
    index=[""] + [x["text"] for x in new_a],
    columns=[""] + [x["text"] for x in new_b],
)
def falls_between(a1, a2, b):
    """Tell whether `b` sits in the gap between `a1` and `a2`.

    True when `b` ends no later than `a2` starts and starts no earlier than
    `a1` ends; False otherwise.
    """
    return bool(b["end"] <= a2["start"] and b["start"] >= a1["end"])
# Bare expression: has no effect when run as a script (presumably a notebook
# cell that displays the DataFrame -- confirm).
df