Extract text for F5TTS
Extracting text from our data for synthesis
# Inline example of an annotation record, kept as a JSON string for reference.
# The top-level key ("1") maps to an entry holding a "snippet" transcript, a
# list of "references" (each with a phrase, ids and phrase_start/phrase_end),
# "start"/"end" values and a "high_level" topic annotation.
# NOTE(review): presumably mirrors the layout of the *_with_timings.json files
# read further down, with times in seconds -- confirm against the real data.
SAMPLE = """
{
"1": {
"snippet": "Yes, I will try. Let's see here. So we have couch.",
"references": [
{
"ref_id": 0,
"phrase": "couch",
"resolved_ref": "Sofa_b5c02446",
"plurality": "single",
"original_label": "exact",
"object_id": "Sofa_b5c02446",
"phrase_start": 15.892,
"phrase_end": 17.032
}
],
"start": 10.192,
"end": 17.092000000000002,
"high_level": {
"current_topic": "Sofa_b5c02446",
"topic_duration_id": 0
}
}
}
"""
import json
# Parsed form of the example above (a dict keyed by the entry id string).
sample = json.loads(SAMPLE)
from pathlib import Path
# Directory containing the "<recording stem>_with_timings.json" annotation files.
JSONDIR = Path("/tmp/resolved_references_with_timings")
# Directory containing the prompt clips; a .txt transcript is written next to
# each .wav found here.
WAVDIR = Path("/tmp/audio_prompt_clips")

# For every clip, look up the snippet text of the phrase it was cut from and
# write it to "<clip stem>.txt" beside the audio.
#
# A clip stem is split on "_": the first five fields name the source recording
# (and therefore its JSON file), field 5 is the phrase id used as the JSON key,
# and fields 6 and 7 are parsed as floats.
#
# Several clips can come from the same recording, so each JSON file is parsed
# once and reused instead of being re-read for every clip.
_timings_by_file = {}
for wavfile in WAVDIR.glob("*.wav"):
    wavstem = wavfile.stem
    parts = wavstem.split("_")
    orig_stem = "_".join(parts[:5])
    jsonfile = JSONDIR / f"{orig_stem}_with_timings.json"
    if jsonfile not in _timings_by_file:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(jsonfile, "r", encoding="utf-8") as f:
            _timings_by_file[jsonfile] = json.load(f)
    data = _timings_by_file[jsonfile]
    phrase_id = parts[5]
    # Not used further in this section; float() still rejects clip names whose
    # sixth/seventh fields are not numeric.
    orig_start = float(parts[6])
    orig_end = float(parts[7])
    text = data[phrase_id]["snippet"]
    with open(WAVDIR / f"{wavstem}.txt", "w") as f:
        f.write(text)
from string import punctuation
punct = set(punctuation)


def _rewrite_bold_token(token):
    """Rewrite the markdown bold markers (``**``) on one space-separated token.

    The first matching rule wins:

    * token starts with ``**``: the leading marker is removed and the rest of
      the token is wrapped in single asterisks;
    * token ends with ``**``: the trailing marker is dropped;
    * ``**`` sits immediately before the last character: the marker is dropped
      and that last character is kept;
    * otherwise the token is returned unchanged.

    NOTE(review): a token that both starts and ends with ``**`` only hits the
    first rule, so ``**couch**`` becomes ``*couch***`` -- confirm this is the
    intended output.
    """
    if token.startswith("**"):
        return f"*{token[2:]}*"
    if token.endswith("**"):
        return f"{token[:-2]}"
    if token[-3:-1] == "**":
        return f"{token[:-3]}{token[-1:]}"
    return token


# Write each non-blank line of the markdown file, with its bold markers
# rewritten, to its own numbered file under /tmp/test_batch.
with open("/Users/joregan/Downloads/object_utterances_batch1_to_4.md") as inf:
    count = 1
    for raw_line in inf:
        line = raw_line.strip()
        if not line:
            # Blank lines produce no output file and do not consume a number.
            continue
        new_parts = [_rewrite_bold_token(token) for token in line.split(" ")]
        with open(f"/tmp/test_batch/test_batch_{count}.txt", "w") as of:
            of.write(" ".join(new_parts))
        count += 1
def read_reference_dir(ref_dir):
    """Map every ``.wav`` file in *ref_dir* to the text of its ``.txt`` companion.

    Args:
        ref_dir: Directory to scan, as a ``str`` or ``Path``. Only files
            directly inside it are considered (no recursion).

    Returns:
        A dict whose keys are the ``Path`` of each ``.wav`` file and whose
        values are the contents of the same-named ``.txt`` file with leading
        and trailing whitespace removed.

    Raises:
        FileNotFoundError: If a ``.wav`` file has no ``.txt`` file beside it.
    """
    ref_dir = Path(ref_dir)
    pairs = {}
    for wavfile in ref_dir.glob("*.wav"):
        # Swap only the final suffix; a plain string replace would also touch
        # any ".wav" occurring elsewhere in the path.
        with open(wavfile.with_suffix(".txt"), "r") as f:
            # read() gives one string; readlines() gives a list, which has no
            # .strip() and made the original raise AttributeError.
            pairs[wavfile] = f.read().strip()
    return pairs
def get_mmconv_speaker_room_pairs(ref_dir):
    """Group the ``.wav``/``.txt`` pairs in *ref_dir* by speaker and room.

    The speaker and room are taken from the file name: the stem is split on
    ``"_"``, the second field is used as the speaker and the fourth as the
    room. Only the file name is parsed, so underscores in the directory path
    do not shift the fields.

    Args:
        ref_dir: Directory to scan, as a ``str`` or ``Path``. Only files
            directly inside it are considered (no recursion).

    Returns:
        A nested dict ``{speaker: {room: {wav_path: text}}}`` where
        ``wav_path`` is the ``Path`` of the ``.wav`` file and ``text`` is the
        content of the same-named ``.txt`` file with surrounding whitespace
        removed.

    Raises:
        IndexError: If a file stem has fewer than four ``"_"``-separated fields.
        FileNotFoundError: If a ``.wav`` file has no ``.txt`` file beside it.
    """
    ref_dir = Path(ref_dir)
    pairs = {}
    for wavfile in ref_dir.glob("*.wav"):
        # Split the stem, not the full path string: a directory such as
        # "audio_prompt_clips" would otherwise contribute extra fields.
        parts = wavfile.stem.split("_")
        speaker = parts[1]
        room = parts[3]
        with open(wavfile.with_suffix(".txt"), "r") as f:
            # read() gives one string; readlines() gives a list, which has no
            # .strip() and made the original raise AttributeError.
            data = f.read().strip()
        pairs.setdefault(speaker, {}).setdefault(room, {})[wavfile] = data
    return pairs