# Extract segments for Matcha
# from TSV-formatted data.
from pathlib import Path
def load_segments(tsv_path, min_duration=2.0, max_duration=30.0):
    """Read the usable (start, end, text) segments from one TSV file.

    Every non-blank line is expected to contain exactly three
    tab-separated fields: start time, end time and text. Lines with a
    different number of fields are printed and skipped. A segment is
    kept only when its duration (end - start) lies within
    [min_duration, max_duration], both bounds inclusive.

    Args:
        tsv_path: Path of the tab-separated file to read.
        min_duration: Shortest duration to keep, in the same unit as
            the timestamps in the file (presumably seconds).
        max_duration: Longest duration to keep, same unit.

    Returns:
        A list of (start, end, text) tuples in file order, with start
        and end converted to float.

    Raises:
        ValueError: If a start or end field cannot be parsed as a float.
    """
    kept = []
    # Decode explicitly as UTF-8 (the encoding used for the train/valid
    # lists written at the end of this file) instead of relying on the
    # platform's locale default.
    with open(tsv_path, encoding="utf-8") as tsvf:
        for line in tsvf:
            line = line.strip()
            if not line:
                continue
            parts = line.split("\t")
            if len(parts) != 3:
                # Show malformed lines so they can be inspected, then skip.
                print(line)
                continue
            start = float(parts[0])
            end = float(parts[1])
            if min_duration <= end - start <= max_duration:
                kept.append((start, end, parts[2]))
    return kept


# Maps each TSV file stem to the list of segments kept from that file.
segments = {}
count = 0
for tsvfile in Path("/tmp/textgrid_cut").glob("*.tsv"):
    segments[tsvfile.stem] = load_segments(tsvfile)
# Bare expression: shows the number of files in an interactive session
# and has no effect when run as a script.
len(segments)
from pydub import AudioSegment
def _speaker_id(scene):
    """Derive the integer speaker ID from an underscore-separated scene name.

    Scenes whose last field is "main" get ``int(second field) - 2``;
    any other scene gets 5 when the second field is "3" and 6 otherwise.

    Raises:
        IndexError: If the scene name has no second field.
        ValueError: If a "main" scene's second field is not an integer.
    """
    fields = scene.split("_")
    if fields[-1] == "main":
        return int(fields[1]) - 2
    return 5 if fields[1] == "3" else 6


# Extra ffmpeg output options handed to pydub's export():
# one channel, 16-bit PCM, 22050 Hz sample rate.
parameters = ["-ac", "1", "-acodec", "pcm_s16le", "-ar", "22050"]
# Maps speaker ID -> list of "path|speaker|text" filelist entries.
matcha_data = {}
outdir = Path("/tmp/audio_clips")
# Create the output directory if needed, without a check-then-create race.
outdir.mkdir(parents=True, exist_ok=True)
# Directory holding one "<scene>.wav" recording per TSV file stem.
audio_dir = Path("/Users/joregan/Playing/hsi/audio")
for scene, scene_segments in segments.items():
    speaker = _speaker_id(scene)
    entries = matcha_data.setdefault(speaker, [])
    orig_audio = AudioSegment.from_file(str(audio_dir / f"{scene}.wav"))
    for seg in scene_segments:
        # Segment times are scaled by 1000 because pydub slices
        # AudioSegment objects by millisecond.
        start = int(seg[0] * 1000)
        end = int(seg[1] * 1000)
        text = seg[2]
        chunk = orig_audio[start:end]
        chunk_basename = f"{scene}_{start}_{end}.wav"
        chunk_name = str(outdir / chunk_basename)
        chunk.export(chunk_name, format="wav", parameters=parameters)
        entries.append(f"data/mm-conv/{chunk_basename}|{speaker}|{text}")
import random
def decision(probability=0.98):
    """Return True with the given probability, False otherwise.

    Args:
        probability: Chance of returning True. Because the random draw
            lies in [0.0, 1.0), a value of 1.0 or more always yields
            True and a value of 0.0 or less always yields False.
            Defaults to 0.98.

    Returns:
        bool: The outcome of one random draw.
    """
    return random.random() < probability
# Write every collected entry to exactly one of the two lists: entries
# for which decision() is true go to train.txt, the rest to valid.txt.
with (
    open(outdir / "train.txt", "w", encoding="utf-8") as train_out,
    open(outdir / "valid.txt", "w", encoding="utf-8") as valid_out,
):
    for entries in matcha_data.values():
        for entry in entries:
            sink = train_out if decision() else valid_out
            sink.write(entry + "\n")