import json

from pathlib import Path
# Directory that is globbed for the *.json input files processed below.
JSONPATH = Path("/Users/joregan/Playing/rd_phonetic")

def get_slice(a, n, window=2):
    """Return the elements around index ``n`` and ``n``'s offset among them.

    Args:
        a: Sequence to take the context window from.
        n: Index of the centre element; must satisfy ``0 <= n < len(a)``.
        window: Number of neighbours to include on each side of ``n``.

    Returns:
        A ``(context, centre)`` tuple. ``context`` is
        ``a[n - window:n + window + 1]`` clipped to the bounds of ``a``, and
        ``context[centre]`` is ``a[n]``.

    Raises:
        IndexError: If ``n`` is not a valid non-negative index into ``a``.
    """
    if not 0 <= n < len(a):
        raise IndexError(
            f"index {n} out of range for sequence of length {len(a)}"
        )
    start = max(n - window, 0)
    end = min(n + window + 1, len(a))
    # When the window is clipped at the left edge the centre element is no
    # longer ``window`` items in, so derive its offset from the real start.
    return a[start:end], n - start

def any_in(checks, text):
    """Report whether at least one entry of ``checks`` occurs in ``text``.

    Membership is tested with the ``in`` operator, and evaluation stops at
    the first entry that matches. An empty ``checks`` yields ``False``.
    """
    return any(needle in text for needle in checks)

from dataclasses import dataclass


@dataclass
class Found:
    """One matched context window taken from a JSON file.

    Instances are stored in sets, so the class supplies a ``__hash__`` to go
    with the field-wise ``__eq__`` that ``@dataclass`` generates.
    """

    # Stem (file name without extension) of the JSON file the match came from.
    stem: str
    # First timestamp value of the first chunk in the window.
    start: float
    # Second timestamp value of the last chunk in the window.
    end: float
    # Space-joined text of every chunk in the window.
    text: str

    def __hash__(self):
        # Hash the raw field values, not their string forms: the generated
        # __eq__ compares raw values, so 1 and 1.0 are equal and must hash
        # alike for set de-duplication to work.
        return hash((self.stem, self.start, self.end, self.text))

def inspect_file(filename, checks=("<hes>", "<ha>")):
    """Collect context windows around chunks whose text contains a marker.

    The file must hold a JSON object with a ``"chunks"`` list. Each chunk is
    read through its ``"text"`` value and its ``"timestamp"`` pair.

    Args:
        filename: Path of the JSON file to read.
        checks: Substrings to look for in each chunk's text.

    Returns:
        A set of ``Found`` objects, one per distinct window whose centre
        chunk contains any of ``checks``. Each spans from the first
        timestamp of the window's first chunk to the second timestamp of its
        last chunk.
    """
    stem = Path(filename).stem
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(filename, encoding="utf-8") as f:
        chunks = json.load(f)["chunks"]

    collected = set()
    for i, chunk in enumerate(chunks):
        # Test chunk ``i`` itself rather than a fixed offset into the window:
        # near the start of the file the window is clipped, so the chunk of
        # interest is not at a constant position inside it.
        if not any_in(checks, chunk["text"]):
            continue
        context, _ = get_slice(chunks, i)
        collected.add(
            Found(
                stem=stem,
                start=context[0]["timestamp"][0],
                end=context[-1]["timestamp"][1],
                text=" ".join(c["text"] for c in context),
            )
        )
    return collected

# Gather the matches from every *.json file directly under JSONPATH.
# NOTE: ``all`` shadows the builtin of the same name; the name is kept
# because later statements in this file refer to it.
all = set()
for json_path in JSONPATH.glob("*.json"):
    all |= inspect_file(json_path)

# Dump each match as a JSON object built from its instance attributes.
records = [vars(match) for match in all]
with open("/tmp/found.json", "w") as f:
    json.dump(records, f, indent=2)

# Notebook-style display cell: evaluates to the number of unique matches.
len(all)

# Output: 992335

import random

# random.sample() needs a sequence: passing a set was deprecated in Python 3.9
# and raises TypeError from 3.11 on, so materialise a list first. The sample
# size is capped so that fewer than 200 matches does not raise ValueError.
subset = random.sample(list(all), min(200, len(all)))

# Output (stderr):
# /var/folders/d9/cbkhg23x349_t692zq6yhcv00000gn/T/ipykernel_57884/3143163464.py:3: DeprecationWarning: Sampling from a set deprecated
# since Python 3.9 and will be removed in a subsequent version.
#   subset = random.sample(all, 200)

# Distinct file stems referenced by the sampled items.
files = {item.stem for item in subset}

# Write one ffmpeg command per stem, converting its .mp4 into a mono,
# 16 kHz, 16-bit PCM .wav under /tmp/samples-tmp.
with open("/tmp/write_ffmpeg.sh", "w") as of:
    of.writelines(
        f"ffmpeg -i /sbtal/riksdag-video/{f}.mp4 -acodec pcm_s16le -ac 1 -ar 16000 /tmp/samples-tmp/{f}.wav\n"
        for f in files
    )

# For every sampled item, write an ffmpeg command that cuts its time span out
# of the converted .wav, and a TSV row pairing the clip file name with its text.
with open("/tmp/write_ffmpeg_clips.sh", "w") as of:
    with open("/tmp/clips.tsv", "w") as of2:
        for item in subset:
            clipname = f"{item.stem}_{item.start}-{item.end}.wav"
            # ffmpeg's -t takes a duration, not an end time.
            duration = item.end - item.start
            command = f"ffmpeg -i /tmp/samples-tmp/{item.stem}.wav -ss {item.start} -t {duration} -c copy /home/joregan/hes-clips/{clipname}\n"
            of.write(command)
            of2.write(f"{clipname}\t{item.text}\n")