Extract segments for ctc-forced-aligner
From TSV-formatted data: one segment per line, as tab-separated start time, end time and text
Built on an earlier notebook, using data extracted with this notebook
from pathlib import Path
def read_segments(tsv_path):
    """Parse one tab-separated segment file into ``(start, end, text)`` tuples.

    Each usable row has exactly three tab-separated fields: start time,
    end time and text.  Blank lines are ignored.  Rows that do not have
    three fields, or whose start/end fields are not numbers (for example a
    header row), are printed for inspection and skipped.

    Note that every line is stripped before splitting, so a row whose text
    field is empty loses its trailing tab and is reported as malformed.

    Args:
        tsv_path: Path (``str`` or ``pathlib.Path``) of the file to read.

    Returns:
        A list of ``(start, end, text)`` tuples in file order, with
        ``start`` and ``end`` converted to ``float``.
    """
    rows = []
    # Explicit encoding: transcripts may contain non-ASCII characters and the
    # platform default is not guaranteed to be UTF-8.
    with open(tsv_path, encoding="utf-8") as tsvf:
        for line in tsvf:
            line = line.strip()
            if not line:
                continue
            parts = line.split("\t")
            if len(parts) != 3:
                print(line)
                continue
            try:
                start = float(parts[0])
                end = float(parts[1])
            except ValueError:
                # Non-numeric timestamps: report like any other malformed row
                # instead of aborting the whole run.
                print(line)
                continue
            rows.append((start, end, parts[2]))
    return rows


# Maps each input file's stem to its list of (start, end, text) segments.
segments = {}
count = 0
for tsvfile in Path("/tmp/textgrid_cut").glob("*.csv"):
    segments[tsvfile.stem] = read_segments(tsvfile)
# Notebook cell output: number of files loaded.
len(segments)
from pydub import AudioSegment
# Cut every segment out of its scene recording and write it, together with a
# matching .txt transcript, into `outdir` for the forced aligner.

# Extra arguments handed to pydub's export(): mono, 16-bit PCM, 16 kHz.
parameters = ["-ac", "1", "-acodec", "pcm_s16le", "-ar", "16000"]
# NOTE(review): only ever seeded with empty lists in this cell; nothing is
# appended here -- presumably filled in elsewhere, confirm.
matcha_data = {}
outdir = Path("/tmp/audio_clips")
# Create in one step (including missing parents) rather than check-then-create.
outdir.mkdir(parents=True, exist_ok=True)
for scene in segments:
    parts = scene.split("_")
    # Speaker id is derived from the scene name: "..._main" scenes use the
    # number in the second field minus 2; other scenes map to 5 when that
    # field is "3" and to 6 otherwise.
    if parts[-1] == "main":
        speaker = int(parts[1]) - 2
    elif parts[1] == "3":
        speaker = 5
    else:
        speaker = 6
    matcha_data.setdefault(speaker, [])
    orig_audio = AudioSegment.from_file(f"/Users/joregan/Playing/hsi/audio/audio_cut/{scene}.wav")
    for seg in segments[scene]:
        # Segment times are multiplied by 1000 to get the integer offsets
        # used to slice the AudioSegment and to name the clip.
        start = int(seg[0] * 1000)
        end = int(seg[1] * 1000)
        text = seg[2]
        chunk = orig_audio[start:end]
        chunk_path = outdir / f"{scene}_{start}_{end}.wav"
        chunk.export(str(chunk_path), format="wav", parameters=parameters)
        # with_suffix swaps only the final extension, unlike str.replace,
        # which would rewrite every ".wav" occurring in the name.
        chunk_path.with_suffix(".txt").write_text(text, encoding="utf-8")
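The aligner was built with this Docker setup; the first command below starts the container and the second, run inside it, aligns each clip against its transcript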
# On the host: open an interactive shell in the aligner container with the clips mounted at /audio.
docker run -it --entrypoint /bin/bash -v'/home/joregan/audio_clips:/audio' --gpus all ctc-forced-aligner
# Inside the container: align each clip against the .txt transcript sharing its basename.
# "${i%.wav}.txt" swaps only the trailing extension (sed 's/wav/txt/' would rewrite the first "wav" anywhere in the path).
for i in /audio/*.wav; do ctc-forced-aligner --audio_path "$i" --text_path "${i%.wav}.txt" --language en; done