# Get repeated utterances from merged TTS data
# For a project
# Load the duplicate-file report (presumably fdupes output, judging by the
# file name): one path per line, with blank lines separating the sets of
# identical files.
with open("/Users/joregan/Playing/work-2026/arctic/fdupes-all") as dups_file:
    lines = [line.strip() for line in dups_file]
all_lines = "\n".join(lines)
# Split into one newline-joined chunk per duplicate set.  A blank line after
# the final set (or several blank lines in a row) would otherwise leave a
# dangling "\n" on a chunk, which splits into an empty-string entry further
# down; trim every chunk and drop the ones that end up empty.
dups = [chunk.strip("\n") for chunk in all_lines.split("\n\n")]
dups = [chunk for chunk in dups if chunk]
# Sort every duplicate set into either the VCTK groups or the AWB mapping.
vctk = []  # one list of utterance IDs per duplicate set that mentions "vctk"
awb = {}   # awb file name -> file name of its duplicate outside awb
for dup in dups:
    # Sets that involve a txt.done.data file are ignored entirely.
    if "txt.done.data" in dup:
        continue
    # Drop blank entries (e.g. left by a trailing newline on the set) so they
    # are never treated as a path: an empty entry would add a "" ID to a VCTK
    # group, or overwrite the AWB duplicate's name with "".
    paths = [path for path in dup.split("\n") if path]
    if "vctk" in dup:
        # Utterance ID = last path component, with ".txt" removed.
        vctk.append([path.replace(".txt", "").split("/")[-1] for path in paths])
    elif "awb" in dup:
        # Unlike the VCTK branch, the names here keep their extension.
        awb_id = ""
        other_id = ""  # was `id`, which shadowed the builtin
        for path in paths:
            if "awb" in path:
                awb_id = path.split("/")[-1]
            else:
                other_id = path.split("/")[-1]
        # NOTE(review): a set made up only of awb files maps to "" here.
        awb[awb_id] = other_id
# Hand-written remap that is exported as-is under the "gbi" key.
GBI_REMAP = {"EN0628": "EN0470"}

# Map every ID in a VCTK duplicate group onto one canonical ID for the group.
vctk_mapping = {}
for group_number, group in enumerate(vctk):
    # The last ID starting with "s" is the canonical one; when the group has
    # none, a synthetic "vctk_<group number>" key is used instead.
    s_prefixed = [utt for utt in group if utt.startswith("s")]
    canonical = s_prefixed[-1] if s_prefixed else f"vctk_{group_number}"
    # The canonical ID itself is not mapped.
    vctk_mapping.update({utt: canonical for utt in group if utt != canonical})
import json

# First export of the utterance map: the VCTK, AWB and GBI tables only.
DATA = {"vctk": vctk_mapping, "awb": awb, "gbi": GBI_REMAP}
with open("utterance-map.json", "w") as out_file:
    out_file.write(json.dumps(DATA, indent=4))
import json

# Load the previously produced remap file.
with open("arctic-remapped-ids.json") as remap_file:
    remap = json.load(remap_file)

# Keep the AWB entries of the old remap whose key is absent from the
# duplicate-derived `awb` table.
remapped_awb = remap["awb"]
awb_different_norm = {
    name: remapped_awb[name] for name in remapped_awb if name not in awb
}

# Second export: same tables as before plus the extra AWB entries; this
# overwrites the utterance-map.json written earlier.
DATA = {
    "vctk": vctk_mapping,
    "awb": awb,
    "awb_different_norm": awb_different_norm,
    "gbi": GBI_REMAP,
}
with open("utterance-map.json", "w") as out_file:
    out_file.write(json.dumps(DATA, indent=4))