# Configuration and imports for the transcript-conversion script below.
# Imports come first so every name is bound before the settings that use it.
import json
import uuid
from pathlib import Path

# Directory that is scanned for "*.json" transcript files.
input_dir = "/Users/joregan/Playing/hsi/audio/whisperx-json/"
# Directory that receives one converted JSON file per input file.
output_dir = "/tmp/label-studio"
# Value written to the "model_version" field of every generated entry.
model_version = "whisperx-large-v2"
# Prefix joined with "<input stem>.wav" to build each task's "audio" value.
data_path = "/data/local-files/?d=hsi/"

# Path objects for the two directories above.
input_path = Path(input_dir)
output_path = Path(output_dir)

def convert_json(filename, version=None):
    """Convert one transcript JSON file into a flat list of annotation entries.

    The file must hold a top-level ``"segments"`` list whose items carry
    ``"start"``, ``"end"`` and ``"text"``, and optionally a ``"words"`` list
    whose items may carry a numeric ``"score"``.

    For every segment two entries sharing one short random ``"id"`` are
    appended, in this order: a ``"labels"`` entry marking the time span as
    ``"Speech"`` and a ``"textarea"`` entry holding the stripped text.

    Args:
        filename: Path of the JSON file to read (decoded as UTF-8).
        version: Value stored in each entry's ``"model_version"``. When
            ``None`` (the default) the module-level ``model_version`` is used.

    Returns:
        A list of dicts, two per segment, in segment order.

    Raises:
        KeyError: If ``"segments"`` is missing, or a segment lacks
            ``"start"``, ``"end"`` or ``"text"``.
    """
    if version is None:
        version = model_version

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(filename, encoding="utf-8") as inf:
        data = json.load(inf)

    outputs = []
    for segment in data["segments"]:
        start = segment["start"]
        end = segment["end"]
        text = segment["text"].strip()

        # Average only over the words that actually carry a score: dividing by
        # the total word count would let unscored words pull the mean down.
        # Segments without any scored word (or without "words") score 0.0.
        scores = [w["score"] for w in segment.get("words", []) if "score" in w]
        score = sum(scores) / len(scores) if scores else 0.0

        # Both entries of a segment share this id so they refer to one region.
        gen_id = str(uuid.uuid4())[:6]

        label_entry = {
            "model_version": version,
            "score": score,
            "result": {
                "start": start,
                "end": end,
                "channel": 0,
                "labels": ["Speech"],
            },
            "from_name": "labels",
            "to_name": "audio",
            "type": "labels",
            "id": gen_id,
        }
        text_entry = {
            "model_version": version,
            "score": score,
            "result": {
                "start": start,
                "end": end,
                "channel": 0,
                "text": [text],
            },
            "from_name": "transcription",
            "to_name": "audio",
            "type": "textarea",
            "id": gen_id,
        }
        outputs.append(label_entry)
        outputs.append(text_entry)

    return outputs
# Make sure the output directory exists. parents/exist_ok avoids both the
# check-then-create race and a failure when intermediate directories are
# missing; a non-directory at that path still raises FileExistsError.
output_path.mkdir(parents=True, exist_ok=True)

# Convert every "*.json" file in the input directory into one output file of
# the same name, pairing the converted entries with the matching ".wav" path.
for jsonfile in input_path.glob("*.json"):
    annots = convert_json(str(jsonfile))
    # The audio reference is built from the input file's stem, so the audio
    # file is expected to share its base name with the transcript.
    filename = data_path + jsonfile.stem + ".wav"
    output = {
        "data": {"audio": filename},
        "predictions": annots,
    }
    out_json = output_path / jsonfile.name
    with open(out_json, "w", encoding="utf-8") as outf:
        json.dump(output, outf)