Reverb CTM to Label Studio
Because their import sucks
tl;dr: Reverb is quite good at getting the start times of words, but awful at end times (almost every duration is 0.1s). It's good at picking up on fillers, etc., but bad at context-based disambiguation (e.g., "waist" basket instead of "waste" basket).
def slurpfile(filename) -> str:
    """Return the full text of *filename* with surrounding whitespace removed."""
    with open(filename) as handle:
        contents = handle.read()
    return contents.strip()
# Base URL of the Label Studio REST API. Endpoint names are appended to it
# directly (e.g. f"{host}projects"), so it has to end with "/".
host = "http://130.237.3.107:8080/api/"
# API token, read (and whitespace-stripped) from the file "label_studio_reverb",
# a relative path resolved against the current working directory.
api_token: str = slurpfile("label_studio_reverb")
# Directory with the files to upload. File names are appended to it directly
# further down, so it has to end with "/".
input_dir = "/Users/joregan/Playing/hsi/inter_output/attention_rescoring/"
import requests
import json
from pathlib import Path
# Request headers shared by the API helpers below: token authentication
# using the value loaded into api_token.
headers = {
    "Authorization": f"Token {api_token}"
}
def get_projects():
    """Fetch the project listing from the API.

    Sends ``GET {host}projects`` with the module-level ``headers``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: if the server answers with any status other
            than 200.
    """
    req = requests.get(f"{host}projects", headers=headers)
    # Explicit check rather than ``assert``: assertions are stripped under
    # ``python -O`` and would not say what the server actually returned.
    if req.status_code != 200:
        raise requests.HTTPError(
            f"GET {req.url} returned {req.status_code}: {req.text[:200]}",
            response=req,
        )
    return json.loads(req.text)
def get_project_id_from_name(name):
    """Return the id of the first project whose title equals *name*.

    Titles and *name* are compared after stripping surrounding whitespace.
    Returns ``None`` when no project in the listing matches.
    """
    matching_ids = (
        project["id"]
        for project in get_projects()["results"]
        if project["title"].strip() == name.strip()
    )
    return next(matching_ids, None)
# Notebook-style lookup of the "Main 6" project id; the result is only
# evaluated, not stored.
get_project_id_from_name("Main 6")
def get_tasks(projectid):
    """Fetch the task listing of one project from the API.

    Sends ``GET {host}tasks`` with ``project=projectid`` as query parameter
    and the module-level ``headers``.

    Args:
        projectid: id of the project whose tasks are requested.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: if the server answers with any status other
            than 200.
    """
    # NOTE(review): only a single request is made; if the endpoint paginates,
    # later pages are not fetched — confirm against the API reference.
    req = requests.get(f"{host}tasks", headers=headers, params={"project": projectid})
    # Explicit check rather than ``assert``: assertions are stripped under
    # ``python -O`` and would not say what the server actually returned.
    if req.status_code != 200:
        raise requests.HTTPError(
            f"GET {req.url} returned {req.status_code}: {req.text[:200]}",
            response=req,
        )
    return json.loads(req.text)
def index_task_filestem_to_id(tasks_data):
    """Build a ``file name -> task id`` lookup from a task listing.

    For every entry of ``tasks_data["tasks"]`` the path is taken from
    ``storage_filename`` when that key is present, otherwise from
    ``data["audio"]``. Entries whose path is empty or ``None`` are skipped.
    The key is the last "/"-separated component of the path (extension
    included); a later task with the same file name replaces an earlier one.
    """
    index = {}
    for entry in tasks_data["tasks"]:
        entry_id = entry["id"]
        raw_path = (
            entry["storage_filename"]
            if "storage_filename" in entry
            else entry["data"]["audio"]
        )
        if raw_path:
            index[raw_path.split("/")[-1]] = entry_id
    return index
# Look up the id of the "Speaker 3" project (evaluated only, not stored).
get_project_id_from_name("Speaker 3")
# Fetch the tasks of project 8.
# NOTE(review): presumably 8 is the id returned by the lookup above — confirm.
tasks = get_tasks(8)
# file name -> task id for every task in that listing
mapping = index_task_filestem_to_id(tasks)
# Bare expression: displays the mapping when run as a notebook cell.
mapping
# Exploratory cell: split one CTM file into segments of words.
ctmfile = "/Users/joregan/Playing/hsi/inter_output/attention_rescoring/hsi_7_0719_209_002_inter.ctm"
outputs = []
segments = []
with open(ctmfile) as ctmf:
    def is_marker(item):
        """Rewrite a ``<tag>`` word to ``[tag]`` in place; True if it was one."""
        if item["word"].startswith("<") and item["word"].endswith(">"):
            item["word"] = f'[{item["word"][1:-1]}]'
            return True
        return False
    # End time of the previous word, and the segment currently being built.
    last = 0.0
    this_seg = []
    for line in ctmf.readlines():
        line = line.strip()
        parts = line.split(" ")
        # Fields used: 2 = start time, 3 = duration, 4 = word.
        cur = {
            "start": float(parts[2]),
            "dur": float(parts[3]),
            "word": parts[4],
            "end": float(parts[2]) + float(parts[3]),
        }
        # is_marker() must run first: it rewrites the word as a side effect,
        # and with the gap test in front `or` would short-circuit and leave a
        # marker that follows a long pause un-rewritten.
        if is_marker(cur) or (cur["end"] - last) > 1.0:
            segments.append(this_seg)
            this_seg = []
        this_seg.append(cur)
        last = cur["end"]
    segments.append(this_seg)
# Bare expression: displays the segments when run as a notebook cell.
segments
import json
import uuid
def ctm_to_result(ctmfile):
    """Convert a CTM word alignment file into Label Studio result regions.

    Every non-blank line is split on whitespace; field 2 is read as the word
    start time, field 3 as its duration and field 4 as the word. Words are
    grouped into segments: a new segment starts whenever a word ends more
    than one second after the previous word ended, or when the word is a
    ``<marker>``. Markers are rewritten to ``[marker]``.

    Args:
        ctmfile: path of the CTM file to read.

    Returns:
        A list of result dicts. Each non-empty segment yields one
        ``textarea`` region carrying the space-joined words. A segment whose
        text is exactly ``[laugh]`` or ``[inaudible]`` additionally yields,
        just before it, a ``labels`` region ("Laughter" / "Noise") with the
        same ``id``, start and end.

    Raises:
        IndexError: if a non-blank line has fewer than five fields.
        ValueError: if the start or duration field is not a number.
    """
    # Segment texts that get a label region in addition to the transcription.
    marker_labels = {"[laugh]": "Laughter", "[inaudible]": "Noise"}

    def is_marker(item):
        """Rewrite a ``<tag>`` word to ``[tag]`` in place; True if it was one."""
        word = item["word"]
        if word.startswith("<") and word.endswith(">"):
            item["word"] = f"[{word[1:-1]}]"
            return True
        return False

    segments = []
    this_seg = []
    last = 0.0  # end time of the previous word
    with open(ctmfile) as ctmf:
        for line in ctmf:
            # split() with no argument tolerates tabs and repeated spaces.
            parts = line.split()
            if not parts:
                # Blank line (e.g. trailing newline at end of file).
                continue
            start = float(parts[2])
            dur = float(parts[3])
            cur = {"start": start, "dur": dur, "word": parts[4], "end": start + dur}
            # is_marker() must be called for every word: it rewrites the word
            # as a side effect. Testing the gap first would let `or`
            # short-circuit past it, so a marker after a long pause would
            # keep its "<...>" form and never match marker_labels below.
            marker = is_marker(cur)
            # NOTE(review): the gap is measured end-to-end, not from the
            # previous end to this start — confirm that this is intended.
            if marker or (cur["end"] - last) > 1.0:
                segments.append(this_seg)
                this_seg = []
            this_seg.append(cur)
            last = cur["end"]
    segments.append(this_seg)

    outputs = []
    for words in segments:
        if not words:
            continue
        start = words[0]["start"]
        end = words[-1]["end"]
        text = " ".join(item["word"] for item in words)
        # The label region and the transcription region share one id.
        gen_id = str(uuid.uuid4())[:6]
        label = marker_labels.get(text)
        if label is not None:
            outputs.append({
                "value": {
                    "start": start,
                    "end": end,
                    "channel": 0,
                    "labels": [label],
                },
                "from_name": "labels",
                "to_name": "audio",
                "type": "labels",
                "id": gen_id,
            })
        outputs.append({
            "value": {
                "start": start,
                "end": end,
                "channel": 0,
                "text": [text],
            },
            "from_name": "transcription",
            "to_name": "audio",
            "type": "textarea",
            "id": gen_id,
        })
    return outputs
def post_results(id, task, project, results):
    """Send *results* as the content of annotation *id* via HTTP PATCH.

    Args:
        id: annotation id placed in the endpoint path.
        task: task id, passed as the ``taskID`` query parameter.
        project: project id, passed both as a query parameter and in the body.
        results: list of result regions stored under the ``result`` key.

    Returns:
        The ``requests`` response object; its status is not checked here.
    """
    url = f"{host}annotations/{id}/?taskID={task}&project={project}"
    # Copy of the shared headers with the JSON content type added.
    request_headers = {**headers, "Content-type": "application/json"}
    payload = json.dumps({
        "was_cancelled": False,
        "ground_truth": False,
        "project": project,
        "draft_id": 0,
        "parent_prediction": None,
        "parent_annotation": None,
        "result": results,
    })
    return requests.patch(url, data=payload, headers=request_headers)
# Convert one CTM file and write the result to annotation 300
# (task 55, project 4), then show the server's reply.
file = f"{input_dir}hsi_7_0719_209_002_inter.ctm"
data = ctm_to_result(file)
r = post_results(300, 55, 4, data)
print(r.text)
from pathlib import Path
# List the tasks (id and file name) for which a matching .TextGrid file
# exists in input_dir.
for fn in mapping:
    # Despite its name, this holds the TextGrid file name, not a CTM one.
    ctmfile = fn.replace(".wav", ".TextGrid")
    if (Path(input_dir) / ctmfile).exists():
        print(mapping[fn], fn)
# Upload every available TextGrid to project 8, using consecutive
# annotation ids starting at 230.
# NOTE(review): post_results() PATCHes, so annotations 230.. presumably have
# to exist already — confirm before running.
count = 230
for task in mapping:
    # Despite its name, this holds the TextGrid file name, not a JSON one.
    jsonfile = task.replace(".wav", ".TextGrid")
    file = f"{input_dir}{jsonfile}"
    if not (Path(input_dir) / jsonfile).exists():
        continue
    # NOTE(review): tg_to_result is not defined in the visible part of this
    # file — confirm it exists before running this cell.
    data = tg_to_result(file)
    r = post_results(count, mapping[task], 8, data)
    count += 1
    print(r.text)
# Assign consecutive numbers, starting at 99, to the file names in mapping
# (in the mapping's iteration order).
tmap = {}
count = 99
for task in mapping:
    tmap[task] = count
    count += 1
from pathlib import Path
# For every TextGrid in the hsi_ctmedit directory, print the matching .wav
# name and its task id when that .wav is a known task.
for file in Path("/Users/joregan/Playing/hsi_ctmedit/textgrid").glob("*.TextGrid"):
    wavfile = file.stem + ".wav"
    if wavfile in mapping:
        print(wavfile, mapping[wavfile])