WhisperX JSON to Label Studio
Via the API
from pathlib import Path

# Directory holding the WhisperX JSON transcripts, and the directory that
# receives the converted files; each is kept both as a string and as a Path.
input_dir = "/Users/joregan/Playing/hsi/audio/whisperx-json/"
output_dir = "/tmp/label-studio"
input_path, output_path = Path(input_dir), Path(output_dir)

# Label recorded in the "model_version" field of every generated record.
model_version = "whisperx-large-v2"

# Prefix placed in front of each audio file name.
# NOTE(review): looks like a Label Studio local-files storage URL — confirm.
data_path = "/data/local-files/?d=hsi/"
import json
import uuid
def convert_json(filename, version=None):
    """Convert one WhisperX JSON file into a flat list of prediction records.

    Every entry of the file's "segments" list yields two records that share
    the same start/end times, score and id: a "labels" record tagged
    "Speech" and a "textarea" record holding the stripped segment text.

    Args:
        filename: Path of the WhisperX JSON file to read (decoded as UTF-8).
        version: Value stored under "model_version" in every record. When
            omitted, the module-level ``model_version`` is used.

    Returns:
        A list of dicts, two per segment, in segment order.

    Raises:
        KeyError: If the file has no "segments" key, or a segment lacks
            "start", "end" or "text".

    NOTE(review): Label Studio's documented prediction format nests region
    data under ``result[i]["value"]``; the flat shape produced here is kept
    unchanged — confirm it matches what the consumer expects.
    """
    if version is None:
        # Fall back to the file-level setting so existing callers are unaffected.
        version = model_version

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(filename, encoding="utf-8") as inf:
        data = json.load(inf)

    outputs = []
    for segment in data["segments"]:
        # A segment may have no "words" list, and individual words may have
        # no "score"; only words that carry a score take part in the mean.
        scores = [w["score"] for w in segment.get("words", []) if "score" in w]
        score = sum(scores) / len(scores) if scores else 0.0

        # Both records of a segment carry the same short id — presumably so
        # the transcription can be tied to its region; verify against consumer.
        gen_id = str(uuid.uuid4())[:6]

        span = {"start": segment["start"], "end": segment["end"], "channel": 0}

        region = {
            "model_version": version,
            "score": score,
            "result": {**span, "labels": ["Speech"]},
            "from_name": "labels",
            "to_name": "audio",
            "type": "labels",
            "id": gen_id,
        }
        transcription = {
            "model_version": version,
            "score": score,
            "result": {**span, "text": [segment["text"].strip()]},
            "from_name": "transcription",
            "to_name": "audio",
            "type": "textarea",
            "id": gen_id,
        }
        outputs.append(region)
        outputs.append(transcription)
    return outputs
# Create the output directory together with any missing parents; this is a
# no-op when the directory already exists.
output_path.mkdir(parents=True, exist_ok=True)

# Write one output file per WhisperX transcript, reusing the transcript's
# file name inside the output directory.
for jsonfile in input_path.glob("*.json"):
    annots = convert_json(str(jsonfile))
    # The audio reference is the transcript's stem with a .wav extension,
    # prefixed by data_path.
    filename = data_path + jsonfile.stem + ".wav"
    output = {
        "data": {"audio": filename},
        "predictions": annots,
    }
    out_json = output_path / jsonfile.name
    with open(out_json, "w", encoding="utf-8") as outf:
        json.dump(output, outf)