# Extract phonetic chunks
# For hesitations
import json
from pathlib import Path
# Directory whose *.json files are scanned for hesitation markers.
JSONPATH = Path("/Users/joregan/Playing/rd_phonetic")
def get_slice(a, n, window=2):
    """Return the context window around element ``n`` of ``a``.

    The window holds up to ``window`` elements on either side of ``a[n]``
    and is clipped at both ends of the sequence.

    Args:
        a: Sequence to take the window from.
        n: Index of the element of interest; a negative value counts from
            the end, as in ordinary indexing.
        window: Maximum number of neighbours kept on each side.

    Returns:
        A ``(context, centre)`` tuple in which ``context`` is the clipped
        slice and ``context[centre]`` is ``a[n]``.

    Raises:
        IndexError: If ``n`` does not refer to an element of ``a``.
    """
    size = len(a)
    if n < 0:
        n += size
    if not 0 <= n < size:
        raise IndexError("get_slice index out of range")
    start = max(n - window, 0)
    end = min(n + window + 1, size)
    # The centre is measured from the clipped start: close to the beginning
    # of the sequence fewer than `window` elements precede a[n], so the
    # offset is smaller than `window`.
    return a[start:end], n - start
def any_in(checks, text):
    """Report whether ``text`` contains at least one entry of ``checks``.

    Args:
        checks: Iterable of values to look for.
        text: Container searched with the ``in`` operator.

    Returns:
        ``True`` as soon as one entry is found in ``text``, otherwise
        ``False`` (also when ``checks`` is empty).
    """
    return any(marker in text for marker in checks)
from dataclasses import dataclass
@dataclass
class Found:
    """A marker hit together with the text of its surrounding chunks.

    Instances are hashable so that identical hits collapse into one entry
    when they are collected in a set.
    """

    # Stem (file name without extension) of the file the hit came from.
    stem: str
    # Start value of the first chunk in the context window.
    start: float
    # End value of the last chunk in the context window.
    end: float
    # Texts of all chunks in the context window, joined by spaces.
    text: str

    def __hash__(self):
        # Hash the raw field values rather than their string forms: the
        # generated __eq__ compares raw values, so 0 and 0.0 are equal and
        # must hash alike for set de-duplication to work.
        return hash((self.stem, self.start, self.end, self.text))
def inspect_file(filename, checks=("<hes>", "<ha>")):
    """Collect every chunk of a JSON file whose text contains a marker.

    The file must hold an object with a ``"chunks"`` list; each chunk is
    read through its ``"text"`` entry and its two-element ``"timestamp"``
    entry. Every chunk is examined together with the context window that
    ``get_slice`` returns for it.

    Args:
        filename: Path of the JSON file to read.
        checks: Marker strings to look for in a chunk's text.

    Returns:
        A set of ``Found`` records, one per distinct hit, spanning from the
        start of the first chunk of the window to the end of the last one.
    """
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(filename, encoding="utf-8") as f:
        chunks = json.load(f)["chunks"]

    stem = Path(filename).stem
    collected = set()
    for i in range(len(chunks)):
        context, centre = get_slice(chunks, i)
        if not any_in(checks, context[centre]["text"]):
            continue
        collected.add(
            Found(
                stem=stem,
                start=context[0]["timestamp"][0],
                end=context[-1]["timestamp"][1],
                text=" ".join(chunk["text"] for chunk in context),
            )
        )
    return collected
# Gather the hits of every JSON file under JSONPATH; the set removes
# duplicates. NOTE: the name `all` hides the builtin of the same name from
# this point on.
all = set()
for filename in JSONPATH.glob("*.json"):
    found = inspect_file(filename)
    all |= found

# Dump each hit as a plain dict of its fields.
with open("/tmp/found.json", "w") as f:
    json.dump([vars(hit) for hit in all], f, indent=2)

# Number of distinct hits (shown as the cell result in a notebook).
len(all)
import random
# Draw at most 200 hits at random. random.sample() requires a sequence (a set
# is rejected with TypeError since Python 3.11) and a sample size that does
# not exceed the population.
subset = random.sample(list(all), min(200, len(all)))
files = {x.stem for x in subset}

# Shell script that converts each sampled source video to a 16 kHz mono
# 16-bit PCM WAV file.
with open("/tmp/write_ffmpeg.sh", "w") as of:
    for stem in files:
        of.write(
            f"ffmpeg -i /sbtal/riksdag-video/{stem}.mp4 "
            f"-acodec pcm_s16le -ac 1 -ar 16000 /tmp/samples-tmp/{stem}.wav\n"
        )

# Shell script that cuts one clip per sampled hit out of the converted WAV
# files, plus a TSV file mapping each clip name to its text.
with open("/tmp/write_ffmpeg_clips.sh", "w") as of, open("/tmp/clips.tsv", "w") as of2:
    for item in subset:
        clipname = f"{item.stem}_{item.start}-{item.end}.wav"
        of.write(
            f"ffmpeg -i /tmp/samples-tmp/{item.stem}.wav "
            f"-ss {item.start} -t {item.end - item.start} "
            f"-c copy /home/joregan/hes-clips/{clipname}\n"
        )
        of2.write(f"{clipname}\t{item.text}\n")