# Reassemble TSVs
# For MMConv
import json
from pathlib import Path
# Input and output locations used by this script.
#   JSONDIR: directory globbed for "*.json" transcript files.
#   TSVDIR:  directory the per-recording TSV files are written to.
#   TXTDIR:  directory holding the "<stem>.txt" reference text for each JSON.
JSONDIR = "/tmp/aded02f0c81b97c5c5d130453c6bdbb5"
TSVDIR = "/tmp/word-tsv"
TXTDIR = "/tmp/39b5958071804a33f4cd1780d029f602"
JSONPATH = Path(JSONDIR)
TSVPATH = Path(TSVDIR)
TXTPATH = Path(TXTDIR)

# Create the output directory if it is missing. exist_ok=True replaces the
# separate is_dir() check, so there is no window between checking and
# creating; it still raises FileExistsError when the path exists but is not
# a directory.
TSVPATH.mkdir(exist_ok=True)
def read_json(filename):
    """Load the timed segments from a JSON transcript file.

    Args:
        filename: Location of the JSON file, as a ``str`` or a path-like
            object (``open`` accepts both, so no conversion is needed).

    Returns:
        A list with one dict per entry of the top-level ``"segments"``
        array, in file order. Each dict has the keys ``"start"``, ``"end"``
        and ``"word"``; ``"word"`` is copied from the segment's ``"text"``.
        Any other keys of a segment are dropped.

    Raises:
        KeyError: If ``"segments"`` is missing, or a segment lacks
            ``"start"``, ``"end"`` or ``"text"``.
    """
    # Decode as UTF-8 explicitly: the transcripts contain non-ASCII text and
    # the platform default encoding is not guaranteed to be UTF-8.
    with open(filename, encoding="utf-8") as inf:
        data = json.load(inf)
    return [
        {"start": seg["start"], "end": seg["end"], "word": seg["text"]}
        for seg in data["segments"]
    ]
# Spelling normalisations applied to transcript tokens. Keys are matched
# against the whole token (including any attached punctuation); values are
# the replacement. The table is only used for membership tests and lookups.

# Tokens whose diacritics are missing.
_DIACRITIC_NORMS = {
    "Tromso": "Tromsø",
    "Skane.": "Skåne.",
    "Ostermalm.": "Östermalm.",
    "Timothee": "Timothée",
    "fatolj?": "fåtölj?",
    "fatolj.": "fåtölj.",
}

# Variant spellings of the hesitation marker, collapsed to "eh".
_HESITATION_NORMS = {
    "ehh,": "eh,",
    "ehh": "eh",
    "eehh": "eh",
    "ehh...": "eh...",
}

NORMS = {**_DIACRITIC_NORMS, **_HESITATION_NORMS}
def reinterpret_json(filename: Path):
    """Reconcile a JSON transcript's words with its reference text file.

    The segments are loaded with ``read_json`` and compared, position by
    position, with the space-separated tokens of ``TXTPATH/<stem>.txt``.
    The comparison only happens when both sides have the same number of
    tokens; otherwise the segments are returned untouched.

    For each aligned pair:

    * a JSON word found in ``NORMS`` is replaced by its normalised form;
    * a word identical to the reference token is left alone;
    * a word equal to the reference token with its hyphens removed is
      replaced by the (hyphenated) reference token;
    * anything else is reported on stdout with an ``Argh!`` line and kept.

    Args:
        filename: Path of the JSON transcript; its stem selects the
            reference ``.txt`` file.

    Returns:
        The list of ``{"start", "end", "word"}`` dicts from ``read_json``,
        with ``"word"`` values adjusted as described above.

    Raises:
        OSError: If the JSON or the reference text file cannot be opened.
    """
    data = read_json(filename)
    orig_text = TXTPATH / f"{filename.stem}.txt"
    # The reference text contains non-ASCII characters; decode it as UTF-8
    # rather than relying on the platform default encoding.
    with open(orig_text, encoding="utf-8") as inf:
        orig_words = inf.read().strip().split(" ")

    # Positional alignment is only meaningful when the token counts agree.
    if len(orig_words) != len(data):
        return data

    for entry, orig_word in zip(data, orig_words):
        word = entry["word"]
        if word in NORMS:
            entry["word"] = NORMS[word]
        elif word == orig_word:
            continue
        elif word == orig_word.replace("-", ""):
            # The JSON dropped the hyphen(s); restore the reference spelling.
            entry["word"] = orig_word
        else:
            print("Argh!", filename, word, orig_word)
    return data
# NOTE(review): this looks like a leftover manual check — the result is
# discarded and the path is outside JSONDIR. Confirm whether it is still
# needed; it raises if the file (or its reference .txt) is missing.
reinterpret_json(Path("/tmp/reassemble/hsi_4_0717_211_002_main_110516_112501.json"))

# Collect the words of every chunk, grouped by recording.
# Chunk file stems have eight "_"-separated fields starting with "hsi"; the
# first six identify the recording, field 7 is the chunk start and field 8
# the chunk end (presumably milliseconds, given the division by 1000 below
# — TODO confirm).
filepieces = {}
for json_file in JSONPATH.glob("*.json"):
    fields = json_file.stem.split("_")
    if len(fields) != 8 or fields[0] != "hsi":
        continue

    base = "_".join(fields[:6])
    bucket = filepieces.setdefault(base, [])

    startint = int(fields[6])
    endint = int(fields[7])  # parsed but not used further
    # Chunk start, added to every word time to make it relative to the
    # whole recording instead of the chunk.
    offset = float(startint) / 1000.0

    for entry in reinterpret_json(json_file):
        w = entry["word"].strip()
        # Skip bracketed tokens such as "[...]" annotations.
        if w.startswith("[") or w.endswith("]"):
            continue
        bucket.append((offset + entry["start"], offset + entry["end"], w))
# Write one TSV per recording: "<start>\t<end>\t<word>" per line, times with
# three decimals, rows ordered by start time.
for piece, pieces in filepieces.items():
    # Sort on the start time only; the sort is stable, so rows sharing a
    # start time keep their collection order.
    pieces_sorted = sorted(pieces, key=lambda x: x[0])
    outfile = TSVPATH / f"{piece}.tsv"
    # Words contain non-ASCII characters, so fix the output encoding instead
    # of depending on the platform default.
    with open(outfile, "w", encoding="utf-8") as of:
        of.writelines(
            f"{start:.03f}\t{end:.03f}\t{word}\n"
            for start, end, word in pieces_sorted
        )