import json
from pathlib import Path
# Working directories used by this script:
#   JSONDIR - searched for the input "*.json" files
#   TXTDIR  - holds the "<stem>.txt" reference text for each JSON file
#   TSVDIR  - receives the generated "<base>.tsv" files
JSONDIR = "/tmp/aded02f0c81b97c5c5d130453c6bdbb5"
TSVDIR = "/tmp/word-tsv"
TXTDIR = "/tmp/39b5958071804a33f4cd1780d029f602"

# Path counterparts of the directory strings above.
JSONPATH, TSVPATH, TXTPATH = (Path(d) for d in (JSONDIR, TSVDIR, TXTDIR))

# Create the output directory unless it is already there.
TSVPATH.mkdir(exist_ok=True)
def read_json(filename):
    """Load a JSON file and return its segments as word records.

    Args:
        filename: Location of the JSON file, as a ``str`` or any path-like
            object such as ``pathlib.Path``.

    Returns:
        A list with one dict per entry of the top-level ``"segments"`` list,
        in file order. Each dict has the keys ``"start"`` and ``"end"``
        (copied unchanged) and ``"word"`` (the segment's ``"text"`` value).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If ``"segments"`` or one of the per-segment keys is missing.
    """
    # open() takes path-like objects directly, so no str() conversion is
    # needed.  Decode as UTF-8 explicitly so non-ASCII words do not depend on
    # the platform's locale encoding.
    with open(filename, encoding="utf-8") as inf:
        data = json.load(inf)
    return [
        {"start": seg["start"], "end": seg["end"], "word": seg["text"]}
        for seg in data["segments"]
    ]
# Word replacement table consulted by reinterpret_json(): when a word from the
# JSON data is exactly equal to a key (punctuation included), it is replaced
# by the corresponding value.  The entries restore diacritics that are absent
# from the key spelling and collapse "ehh"/"eehh" variants to "eh".
NORMS = {
    "Tromso": "Tromsø",
    "Skane.": "Skåne.",
    "ehh,": "eh,",
    "ehh": "eh",
    "eehh": "eh",
    "ehh...": "eh...",
    "Ostermalm.": "Östermalm.",
    "Timothee": "Timothée",
    "fatolj?": "fåtölj?",
    "fatolj.": "fåtölj.",
}
def reinterpret_json(filename: Path):
    """Read word records from a JSON file and reconcile them with its text file.

    The reference text is read from ``TXTPATH / "<stem>.txt"``, where
    ``<stem>`` is the stem of ``filename``, and split on single spaces.  When
    it has exactly as many words as the JSON file has records, the two are
    compared position by position:

    * a record whose word is a key of ``NORMS`` gets the ``NORMS`` value;
    * a record whose word already equals the reference word is left alone;
    * a record whose word equals the reference word with every ``"-"``
      removed gets the reference word (i.e. the hyphens are restored);
    * any other difference is reported on stdout and the record is kept.

    When the counts differ, the records are returned unmodified.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The list of ``{"start", "end", "word"}`` dicts produced by
        ``read_json``, with ``"word"`` updated as described above.

    Raises:
        OSError: If the JSON file or its reference text file cannot be opened.
    """
    data = read_json(filename)
    orig_text = TXTPATH / f"{filename.stem}.txt"
    # Decode explicitly as UTF-8: the reference text contains non-ASCII
    # letters, and the locale default encoding is not reliable for them.
    with open(orig_text, encoding="utf-8") as inf:
        orig_words = inf.read().strip().split(" ")

    # Positional comparison is only meaningful when both sides line up 1:1.
    if len(orig_words) != len(data):
        return data

    for entry, orig_word in zip(data, orig_words):
        word = entry["word"]
        if word in NORMS:
            # The normalisation table takes precedence over the reference text.
            entry["word"] = NORMS[word]
        elif word == orig_word:
            continue
        elif word == orig_word.replace("-", ""):
            entry["word"] = orig_word
        else:
            print("Argh!", filename, word, orig_word)
    return data
# Ad-hoc spot check: runs reinterpret_json() on one hard-coded file every time
# the script executes; the return value is discarded.
reinterpret_json(Path("/tmp/reassemble/hsi_4_0717_211_002_main_110516_112501.json"))
# The bare list literal below is an expression statement: it is evaluated and
# thrown away, so it has no effect on the rest of the script.
# NOTE(review): presumably the pasted result of the call above, kept as an
# example of the returned structure - confirm before relying on it.
[{'start': 0.14, 'end': 0.54, 'word': 'Maybe'},
 {'start': 0.54, 'end': 0.78, 'word': "it's"},
 {'start': 0.78, 'end': 0.88, 'word': 'a'},
 {'start': 0.88, 'end': 1.5, 'word': 'well-known'},
 {'start': 1.5, 'end': 1.84, 'word': 'thing.'}]
# Gather (start, end, word) tuples from every matching JSON file, grouped
# under the first six "_"-separated fields of the file stem.
filepieces = {}
for file in JSONPATH.glob("*.json"):
    filestem = file.stem
    pieces = filestem.split("_")
    # Only stems that begin with "hsi_" and have exactly eight fields qualify.
    if not (filestem.startswith("hsi_") and len(pieces) == 8):
        continue
    base = "_".join(pieces[:6])
    collected = filepieces.setdefault(base, [])

    # The last two fields must both parse as integers; only the first of
    # them is used below.
    startint, endint = int(pieces[6]), int(pieces[7])
    # Added to every word time of this file.
    # NOTE(review): presumably startint is a millisecond offset of this piece
    # within the full recording - confirm.
    offset = float(startint) / 1000.0
    for word in reinterpret_json(file):
        w = word["word"].strip()
        # Words that open with "[" or close with "]" are left out.
        if not (w.startswith("[") or w.endswith("]")):
            collected.append((offset + word["start"], offset + word["end"], w))
# Write one "<base>.tsv" file per group in filepieces: one line per word with
# start time, end time (both formatted to three decimals) and the word itself,
# tab-separated and ordered by start time.
for piece in filepieces:
    pieces = filepieces[piece]
    # Sort on the start time only, so entries with equal start times keep
    # the order in which they were collected.
    pieces_sorted = sorted(pieces, key=lambda x: x[0])
    outfile = TSVPATH / f"{piece}.tsv"
    # Write UTF-8 explicitly: the words contain non-ASCII letters, and the
    # locale default encoding could mangle them or raise UnicodeEncodeError.
    with open(outfile, "w", encoding="utf-8") as of:
        for start, end, word in pieces_sorted:
            of.write(f"{start:.03f}\t{end:.03f}\t{word}\n")