tl;dr: Reverb is quite good at getting the start times of words, but awful at end times (almost every word is given a duration of 0.1s). It's good at picking up on fillers, etc., but bad at context-based disambiguation (e.g., ``waist'' basket for ``waste'' basket).

def slurpfile(filename) -> str:
    """Return the full text of *filename* with leading/trailing whitespace removed.

    Args:
        filename: Path of the text file to read.

    Returns:
        The stripped file contents.
    """
    with open(filename) as handle:
        contents = handle.read()
    return contents.strip()
# Base URL of the annotation server's REST API; endpoint names are appended
# to it directly, so the trailing slash is required.
host = "http://130.237.3.107:8080/api/"
# API token, read from the local file "label_studio_reverb" (resolved against
# the current working directory).
api_token: str = slurpfile("label_studio_reverb")
# Directory holding the input files; file names are appended by plain string
# concatenation, so the trailing slash is required here as well.
input_dir = "/Users/joregan/Playing/hsi/inter_output/attention_rescoring/"
import requests
import json
from pathlib import Path

# Default request headers: every API call authenticates with the token read above.
headers = dict(Authorization=f"Token {api_token}")
def get_projects():
    """Fetch the project listing from the API at ``host``.

    Sends ``GET {host}projects`` with the module-level ``headers``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        AssertionError: If the server does not answer with HTTP 200.  The
            exception type is kept from the original ``assert`` so existing
            callers behave the same, but the check is now explicit and is
            therefore not stripped when Python runs with ``-O``.
    """
    url = f"{host}projects"
    req = requests.get(url, headers=headers)
    if req.status_code != 200:
        raise AssertionError(
            f"GET {url} returned HTTP {req.status_code}: {req.text[:200]}"
        )
    return json.loads(req.text)
def get_project_id_from_name(name):
    """Return the id of the first project whose title equals *name*.

    Titles and *name* are compared after stripping surrounding whitespace.
    The project list is taken from the ``"results"`` key of ``get_projects()``.

    Args:
        name: Project title to look for.

    Returns:
        The ``"id"`` of the first matching project, or ``None`` when no
        project matches.
    """
    candidates = get_projects()["results"]
    matching_ids = (
        project["id"]
        for project in candidates
        if project["title"].strip() == name.strip()
    )
    return next(matching_ids, None)
# Look up the id of the project titled "Main 6"; as a bare expression the
# value is only displayed (notebook style), not stored.
get_project_id_from_name("Main 6")
7
def get_tasks(projectid):
    """Fetch the tasks of one project from the API at ``host``.

    Sends ``GET {host}tasks`` with ``project=<projectid>`` as a query
    parameter and the module-level ``headers``.

    Args:
        projectid: Id of the project whose tasks are requested.

    Returns:
        The decoded JSON body of the response.

    Raises:
        AssertionError: If the server does not answer with HTTP 200.  The
            exception type is kept from the original ``assert`` so existing
            callers behave the same, but the check is now explicit and is
            therefore not stripped when Python runs with ``-O``.
    """
    url = f"{host}tasks"
    req = requests.get(url, headers=headers, params={"project": projectid})
    if req.status_code != 200:
        raise AssertionError(
            f"GET {url} (project={projectid}) returned HTTP "
            f"{req.status_code}: {req.text[:200]}"
        )
    return json.loads(req.text)
def index_task_filestem_to_id(tasks_data):
    """Build a lookup from audio file name to task id.

    For each entry of ``tasks_data["tasks"]`` the path is taken from
    ``"storage_filename"`` when that key is present, otherwise from
    ``["data"]["audio"]``.  Entries whose path is empty or ``None`` are
    skipped.  Despite "filestem" in the name, the key is the last
    ``/``-separated component of the path, extension included.

    Args:
        tasks_data: Dict with a ``"tasks"`` list of task dicts.

    Returns:
        Dict mapping file name to task id; later tasks overwrite earlier
        ones that share a file name.
    """
    by_filename = {}
    for entry in tasks_data["tasks"]:
        identifier = entry["id"]
        raw_path = (
            entry["storage_filename"]
            if "storage_filename" in entry
            else entry["data"]["audio"]
        )
        if raw_path:
            by_filename[raw_path.rsplit("/", 1)[-1]] = identifier
    return by_filename
# Look up the id of the project titled "Speaker 3"; as a bare expression the
# value is only displayed (notebook style), not stored.
get_project_id_from_name("Speaker 3")
1
# Fetch the tasks of project 8 and index them by audio file name.
# NOTE(review): the project id 8 is hard-coded and matches neither of the
# lookup results displayed earlier (7 and 1) — confirm which project is meant.
tasks = get_tasks(8)
mapping = index_task_filestem_to_id(tasks)
# Bare expression: displays the mapping when run as a notebook cell.
mapping
{'hsi_7_0719_209_001_main.wav': 107,
 'hsi_7_0719_209_002_main.wav': 108,
 'hsi_7_0719_209_003_main.wav': 109,
 'hsi_7_0719_210_001_main.wav': 110,
 'hsi_7_0719_210_002_main.wav': 111,
 'hsi_7_0719_210_003_main.wav': 112,
 'hsi_7_0719_211_002_main.wav': 113,
 'hsi_7_0719_211_004_main.wav': 114,
 'hsi_7_0719_222_002_main.wav': 115,
 'hsi_7_0719_222_004_main.wav': 116,
 'hsi_7_0719_227_002_main.wav': 117,
 'hsi_7_0719_227_003_main.wav': 118}
# Exploratory run of the CTM segmentation on a single file; the grouped words
# are left in `segments` for inspection.
ctmfile = "/Users/joregan/Playing/hsi/inter_output/attention_rescoring/hsi_7_0719_209_002_inter.ctm"
outputs = []

segments = []

with open(ctmfile) as ctmf:
    def is_marker(item):
        """Rewrite a ``<tag>`` word to ``[tag]`` in place; return True if it was one."""
        if item["word"].startswith("<") and item["word"].endswith(">"):
            item["word"] = f'[{item["word"][1:-1]}]'
            return True
        return False

    last = 0.0  # end time of the previous word
    this_seg = []
    for line in ctmf:
        line = line.strip()
        if not line:
            # Blank lines carry no word; indexing their fields would raise.
            continue
        # Split on any whitespace so tabs or repeated spaces do not produce
        # empty fields.
        parts = line.split()
        cur = {
            "start": float(parts[2]),
            "dur": float(parts[3]),
            "word": parts[4],
            "end": float(parts[2]) + float(parts[3]),
        }
        # is_marker() must come first: it has the side effect of rewriting
        # <tag> to [tag], and with the gap test first `or` short-circuits and
        # leaves markers that follow a pause unconverted.
        if is_marker(cur) or (cur["end"] - last) > 1.0:
            segments.append(this_seg)
            this_seg = []
        this_seg.append(cur)
        last = cur["end"]
    segments.append(this_seg)
# Bare expression: displays the segments when run as a notebook cell.
segments
import json
import uuid


def ctm_to_result(ctmfile):
    """Convert a CTM word-alignment file into annotation ``result`` entries.

    Every non-blank, non-comment line of *ctmfile* is split on whitespace;
    field 2 is read as the word's start time, field 3 as its duration and
    field 4 as the word itself.  Words are grouped into segments: a new
    segment starts whenever the word is a ``<tag>`` marker or its end time
    lies more than 1.0 after the end of the previous word.  Markers are
    always rewritten to ``[tag]``.

    For each segment one ``textarea`` entry with the space-joined words is
    produced.  A segment whose text is exactly ``[laugh]`` or
    ``[inaudible]`` additionally gets a ``labels`` entry ("Laughter" or
    "Noise") placed before the text entry and sharing its id.

    Args:
        ctmfile: Path of the CTM file to read.

    Returns:
        List of result dicts in segment order.  Ids are the first six
        characters of a random UUID.

    Raises:
        IndexError, ValueError: If a data line has fewer than five fields or
            non-numeric time fields.
    """
    def is_marker(item):
        """Rewrite a ``<tag>`` word to ``[tag]`` in place; return True if it was one."""
        word = item["word"]
        if word.startswith("<") and word.endswith(">"):
            item["word"] = f"[{word[1:-1]}]"
            return True
        return False

    # Segment texts that get an extra region label besides their transcription.
    special_labels = {"[laugh]": "Laughter", "[inaudible]": "Noise"}

    segments = []
    with open(ctmfile) as ctmf:
        last = 0.0  # end time of the previous word
        this_seg = []
        for line in ctmf:
            # Split on any whitespace so tabs or repeated spaces do not
            # produce empty fields.
            parts = line.split()
            if not parts or parts[0].startswith(";;"):
                # Skip blank lines and ";;" comment lines (CTM convention).
                continue
            start = float(parts[2])
            dur = float(parts[3])
            cur = {
                "start": start,
                "dur": dur,
                "word": parts[4],
                "end": start + dur,
            }
            # is_marker() must come first: it has the side effect of
            # rewriting <tag> to [tag], and with the gap test first `or`
            # short-circuits and leaves markers after a pause unconverted
            # (and therefore unlabelled).
            if is_marker(cur) or (cur["end"] - last) > 1.0:
                if this_seg:
                    segments.append(this_seg)
                this_seg = []
            this_seg.append(cur)
            last = cur["end"]
        if this_seg:
            segments.append(this_seg)

    outputs = []
    for words in segments:
        start = words[0]["start"]
        end = words[-1]["end"]
        text = " ".join(w["word"] for w in words)

        # The label region and its transcription share one id.
        gen_id = str(uuid.uuid4())[:6]

        label = special_labels.get(text)
        if label is not None:
            outputs.append({
                "value": {
                    "start": start,
                    "end": end,
                    "channel": 0,
                    "labels": [label]
                },
                "from_name": "labels",
                "to_name": "audio",
                "type": "labels",
                "id": gen_id
            })
        outputs.append({
            "value": {
                "start": start,
                "end": end,
                "channel": 0,
                "text": [text]
            },
            "from_name": "transcription",
            "to_name": "audio",
            "type": "textarea",
            "id": gen_id
        })

    return outputs
def post_results(id, task, project, results):
    """Send *results* to an existing annotation and return the HTTP response.

    Despite the name, this issues an HTTP ``PATCH`` to
    ``{host}annotations/{id}/`` with ``taskID`` and ``project`` in the query
    string.  The body is a JSON document carrying *results* under
    ``"result"`` together with fixed bookkeeping fields.

    Args:
        id: Id of the annotation to update.
        task: Id of the task the annotation belongs to.
        project: Id of the project; sent both in the URL and in the body.
        results: List of result entries to store.

    Returns:
        The ``requests`` response object; the status is not checked here.
    """
    payload = {
        "was_cancelled": False,
        "ground_truth": False,
        "project": project,
        "draft_id": 0,
        "parent_prediction": None,
        "parent_annotation": None,
        "result": results
    }

    # Copy the shared headers so the module-level dict is left untouched.
    request_headers = dict(headers)
    request_headers["Content-type"] = "application/json"

    endpoint = f"{host}annotations/{id}/?taskID={task}&project={project}"
    return requests.patch(
        endpoint,
        data=json.dumps(payload),
        headers=request_headers,
    )
# Convert one CTM file and store it as the result of an existing annotation.
# NOTE(review): annotation id 300, task id 55 and project id 4 are hard-coded —
# confirm they belong together before re-running.
file = f"{input_dir}hsi_7_0719_209_002_inter.ctm"
data = ctm_to_result(file)
r = post_results(300, 55, 4, data)
# Print the response body so the stored annotation can be inspected.
print(r.text)
{"id":300,"result":[{"value":{"start":1.95,"end":4.6899999999999995,"channel":0,"text":["yeah but just let me know because it is an easy fix to"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"35e926"},{"value":{"start":6.15,"end":6.25,"channel":0,"text":["um"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"537955"},{"value":{"start":7.47,"end":12.049999999999999,"channel":0,"text":["but yeah i think it looks good i started it now so do you need to uh point that"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"d363c0"},{"value":{"start":12.47,"end":17.85,"channel":0,"text":["[inaudible] uh yeah i think that's good if you do that but maybe in this one because it's a landlord so maybe you can be very picky about stuff okay"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"5e6cdd"},{"value":{"start":18.79,"end":35.160000000000004,"channel":0,"text":["um but yeah was very nice living here for four months thanks yeah okay i hope you you have taken care of the things of course thoroughly so uh you you actually performed a clean yes okay that's great uh but you know i'm just uh"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"f1749b"},{"value":{"start":36.34,"end":36.440000000000005,"channel":0,"text":["but"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"3ca46d"},{"value":{"start":37.94,"end":39.0,"channel":0,"text":["there is a stain on"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"8b0b53"},{"value":{"start":40.14,"end":41.35,"channel":0,"text":["on the pillow oh"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"ce74ad"},{"value":{"start":42.49,"end":46.75,"channel":0,"text":["but i mean that that's not supposed to be like that yeah i can i can actually see 
it"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"092ecb"},{"value":{"start":48.17,"end":53.15,"channel":0,"text":["oh yeah you are right you are right i still need to clean that phone uh-huh okay okay yeah that's good that's good"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"a7be6a"},{"value":{"start":54.29,"end":55.03,"channel":0,"text":["what happened here"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"def9a5"},{"value":{"start":57.85,"end":59.870000000000005,"channel":0,"text":["where's there's a mark on the wall oh"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"38c491"},{"value":{"start":60.85,"end":63.22,"channel":0,"text":["i think it was there but i think we took photos we can check later"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"1cc9d7"},{"value":{"start":64.28,"end":66.33999999999999,"channel":0,"text":["yeah yeah let let- let's check that because"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"c14607"},{"value":{"start":67.88,"end":69.3,"channel":0,"text":["yeah i i don't think that was there"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"04eaf9"},{"value":{"start":74.84,"end":83.72999999999999,"channel":0,"text":["that what happened with the statue which one the black one because uh it was doing this with the finger oh no it's it had a finger no it doesn't have a finger"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"cff3f7"},{"value":{"start":84.83,"end":88.49,"channel":0,"text":["it it it had ah now it doesn't it doesn't no ah"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"a4cc6a"},{"value":{"start":89.67,"end":91.37,"channel":0,"text":["okay i think we should also- ares"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"a1c2ae"},{"value":{"start":92.67,"end":100.57,"channel":0,"text":["not 
really but i think we can check it later with the photos or two yeah yeah a hundred percent because yeah really it had a finger painted uh-huh yes yes"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"a65509"},{"value":{"start":102.66,"end":102.75999999999999,"channel":0,"text":["um"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"ebd182"},{"value":{"start":104.14,"end":105.0,"channel":0,"text":["i think uh"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"340e2a"},{"value":{"start":107.02,"end":115.96,"channel":0,"text":["yeah you have a lot of things from yours here right because i mean that trash bin was not from the apartment blue one ah yeah that's right i need to take that it was only only this one"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"279075"},{"value":{"start":117.06,"end":133.43,"channel":0,"text":["okay yeah yeah because uh the the gray one was the one that was here yeah yeah right but this one is it was not yeah i just put it in for recycling okay okay good so uh and that's also your computer i guess you mean the laptop yes yes that's mine but the other one is not okay"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"9abdb3"},{"value":{"start":134.65,"end":141.87,"channel":0,"text":["have you have you changed the table well which what do you mean which one the the central table here uh no i maybe i moved it a bit"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"cee5f4"},{"value":{"start":144.4,"end":146.01999999999998,"channel":0,"text":["yeah it could be yeah"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"aad211"},{"value":{"start":148.72,"end":154.7,"channel":0,"text":["yeah the problem is that you know whenever the weight goes on then move some mark oh on the on the 
carpet"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"4f20ee"},{"value":{"start":155.92,"end":157.1,"channel":0,"text":["that that doesn't good"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"50b37d"},{"value":{"start":160.84,"end":160.94,"channel":0,"text":["so"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"242aad"},{"value":{"start":162.24,"end":162.7,"channel":0,"text":["you move the"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"1897cb"},{"value":{"start":164.59,"end":166.13,"channel":0,"text":["um yeah i cleaned under it"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"40e088"},{"value":{"start":167.59,"end":171.48999999999998,"channel":0,"text":["i can see some dust in the back oh it could be maybe we do"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"d2407c"},{"value":{"start":172.71,"end":177.65,"channel":0,"text":["final cleaning due this afternoon yeah please please do that because i still see dirt mm-hmm"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"37c251"},{"value":{"start":177.69,"end":177.73,"channel":0,"text":["[affirmative]"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"e5d2f6"},{"value":{"start":178.67,"end":178.76999999999998,"channel":0,"text":["yeah"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"87ea20"},{"value":{"start":181.07,"end":183.69,"channel":0,"text":["i mean this this furniture look great i think you"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"4e90de"},{"value":{"start":184.66,"end":189.6,"channel":0,"text":["take care of it is the tv uh working right it's functional yes okay great"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"373d5f"},{"value":{"start":193.62,"end":197.51999999999998,"channel":0,"text":["yeah did you misplace this what do you mean 
the"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"4a4d18"},{"value":{"start":199.06,"end":204.79999999999998,"channel":0,"text":["container for the plants for water in the plants yeah i think it was by the fireplace"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"bfb404"},{"value":{"start":206.29,"end":221.63,"channel":0,"text":["yeah uh yeah if if you could just actually put it back to where it was i think it's important for for the next one so i-huh track of the changes so do you prefer it by where do you prefer it by the by the fireplace as it was okay that's good yeah"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"2a16aa"},{"value":{"start":223.01,"end":225.07,"channel":0,"text":["i think those little bit"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"60cc4e"},{"value":{"start":227.12,"end":241.26,"channel":0,"text":["yeah i think- you coming from inside and outside or just inside just inside but you're right i should yeah i think we need to tocheck them on the outside because i can see some from the uh-huh but doesn' look good yeah yeah it's not good"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"05feef"},{"value":{"start":244.24,"end":244.34,"channel":0,"text":["yeah"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"4ffa2b"},{"value":{"start":252.67,"end":252.76999999999998,"channel":0,"text":["here"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"bbd6e9"},{"value":{"start":254.23,"end":255.25,"channel":0,"text":["the door it"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"391ddf"},{"value":{"start":258.39,"end":258.49,"channel":0,"text":["mark"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"0715bc"},{"value":{"start":260.43,"end":268.44000000000005,"channel":0,"text":["uh i can actually see a mark yeah i think we should check it on the 
pictures also this one i can i i i don't if ca- uh recall anything"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"4288a6"},{"value":{"start":269.46,"end":270.72,"channel":0,"text":["happening what's the-"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"e5d8c1"},{"value":{"start":291.01,"end":300.95000000000005,"channel":0,"text":["<inaudible> hmm the sculpture yeah it has a scratch on the on the palm no i think it was there it was very distinctive i thought it was on purpose it's part of the art piece"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"ede52d"},{"value":{"start":301.77,"end":312.42,"channel":0,"text":["i'm not so sure about that i think it's it not there before mmhmm i think that's we can so check on the pictures yeah yeah we should check it on the pictures 'cause it'cause really it's not uh"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"0a48ea"},{"value":{"start":313.48,"end":325.66,"channel":0,"text":["i don't think it' was there and of course of course i will move this like here you see my moving boxes here yeah please please uh so so move it because uh yeah not of the the newspaper there mm-hmm"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"60d724"},{"value":{"start":325.7,"end":327.78000000000003,"channel":0,"text":["[affirmative] i mean try to move all"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"dc0cb6"},{"value":{"start":328.71,"end":328.89,"channel":0,"text":["mm-hmm"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"d5a86c"},{"value":{"start":328.93,"end":333.05,"channel":0,"text":["[affirmative] otherwise it's not good for the rest yes yeah"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"d5f624"},{"value":{"start":334.91,"end":335.49,"channel":0,"text":["so what 
else"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"fffa81"},{"value":{"start":341.07,"end":353.48,"channel":0,"text":["i also misplace the paintings i hope you i can take it back if you yeah i was actually gonna comment about about that that can actually i mean how did you clean them no the paintings no okay"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"749b1a"},{"value":{"start":354.58,"end":359.96,"channel":0,"text":["i can see a little bit of dust maybe you can just uh- no i- put a little bit mm-hmm"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"559740"},{"value":{"start":360.0,"end":360.04,"channel":0,"text":["[affirmative]"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"ef91c1"},{"value":{"start":361.02,"end":361.84000000000003,"channel":0,"text":["clean it mm-hmm"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"b306bb"},{"value":{"start":361.88,"end":363.32000000000005,"channel":0,"text":["[affirmative] yeah on the upper part especially"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"f3ad52"},{"value":{"start":365.62,"end":366.04,"channel":0,"text":["an actual"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"b0b1f6"},{"value":{"start":367.46,"end":368.24,"channel":0,"text":["you layer of"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"f144f6"},{"value":{"start":369.25,"end":386.31,"channel":0,"text":["of course i was just very careful with that one no of course of course but you you know when you have the frame so in the upper part and in the lower part there is a little bit of a line of uh-huh yeah and then it it's not good so uh if you could actually not wet flos uh-huh normal drycloth mm-hmm"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"6c924d"},{"value":{"start":386.35,"end":388.71,"channel":0,"text":["[affirmative] and then you 
put put it over uh-huh"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"b31bfc"},{"value":{"start":388.75,"end":391.42,"channel":0,"text":["[affirmative] like towards all the sites mm-hmm"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"c6ab6c"},{"value":{"start":391.46,"end":393.54,"channel":0,"text":["[affirmative] and i think that is uh that is good yes"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"9eb6b9"},{"value":{"start":395.04,"end":395.14000000000004,"channel":0,"text":["okay"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"e0b3d7"},{"value":{"start":397.08,"end":401.1,"channel":0,"text":["especially in the frame ones the ones that uh don't have a frame like those- mm-hmm"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"fcc036"},{"value":{"start":401.14,"end":404.14,"channel":0,"text":["[affirmative] and those ones are not that problematic mm-hmm"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"d38392"},{"value":{"start":404.18,"end":409.06,"channel":0,"text":["[affirmative] but these ones that have a frame are you don't have to take that much care mm-hmm"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"73dc78"},{"value":{"start":409.1,"end":415.41,"channel":0,"text":["[affirmative] in terms of uh damaging them 'cause they have like a glass so you can actually clean them normally mm-hmm"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"c4e8a1"},{"value":{"start":415.45,"end":419.05,"channel":0,"text":["[affirmative] the the the ones that you need to take more care about are those ones mm-hmm"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"f9412f"},{"value":{"start":419.09,"end":421.21000000000004,"channel":0,"text":["[affirmative] uh 'cause they don't have a frame 
mm-hmm"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"68ebaa"},{"value":{"start":421.25,"end":422.49,"channel":0,"text":["[affirmative] that makes sense mm-hmm"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"a7941f"},{"value":{"start":422.53,"end":422.57,"channel":0,"text":["[affirmative]"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"54100c"},{"value":{"start":423.55,"end":435.68,"channel":0,"text":["okay but then i think we are all if we finished almost no we can just do the afternoon cleaning and then we can do the inspection yeah let's check this those marks because otherwise i won't be able to give"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"b47070"},{"value":{"start":436.62,"end":437.92,"channel":0,"text":["back oh okay yeah"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"4abbae"},{"value":{"start":438.32,"end":438.36,"channel":0,"labels":["Laughter"]},"from_name":"labels","to_name":"audio","type":"labels","id":"3c2447"},{"value":{"start":438.32,"end":438.36,"channel":0,"text":["[laugh]"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"3c2447"},{"value":{"start":439.34,"end":439.44,"channel":0,"text":["okay"]},"from_name":"transcription","to_name":"audio","type":"textarea","id":"b1b15b"}],"created_username":" reverb@fake.com, 12","created_ago":"21 minutes","completed_by":12,"was_cancelled":false,"ground_truth":false,"created_at":"2024-10-16T12:51:19.552432Z","updated_at":"2024-10-16T13:12:23.081498Z","draft_created_at":null,"lead_time":1.733,"import_id":null,"last_action":null,"task":55,"project":4,"updated_by":12,"parent_prediction":null,"parent_annotation":null,"last_created_by":null}
from pathlib import Path

# List the tasks for which a TextGrid with the same base name exists in input_dir.
for fn in mapping:
    # NOTE: despite its name, `ctmfile` holds the TextGrid file name here.
    ctmfile = fn.replace(".wav", ".TextGrid")
    if not Path(input_dir, ctmfile).exists():
        continue
    print(mapping[fn], fn)
# Upload every TextGrid that has a matching task in project 8.  Annotation ids
# are taken from a running counter starting at 230 and advanced only when a
# file is actually uploaded.
# NOTE(review): `tg_to_result` is not defined in the visible part of this
# file — confirm it is available before running.
count = 230
for task in mapping:
    jsonfile = task.replace(".wav", ".TextGrid")
    file = f"{input_dir}{jsonfile}"
    if (Path(input_dir) / jsonfile).exists():
        data = tg_to_result(file)
        r = post_results(count, mapping[task], 8, data)
        count += 1
        print(r.text)
# Assign consecutive numbers, starting at 99, to the file names in `mapping`
# in iteration order.
# NOTE(review): `tmap` is not used in the visible part of this file; what the
# numbers stand for cannot be told from here.
tmap = {}
count = 99
for task in mapping:
    tmap[task] = count
    count += 1
from pathlib import Path

# Report which TextGrids in the ctmedit directory correspond to a known task:
# print the matching .wav name together with its task id.
for file in Path("/Users/joregan/Playing/hsi_ctmedit/textgrid").glob("*.TextGrid"):
    wavfile = f"{file.stem}.wav"
    if wavfile not in mapping:
        continue
    print(wavfile, mapping[wavfile])