# Build merged TTS dataset.
# Early version; the final version lives in the dataset repo.
import datasets
# Load VCTK and keep only the mic2 recordings (each prompt exists as
# _mic1/_mic2 files; only mic2 is used here).  Two code paths, presumably
# because the audio example layout differs across `datasets` versions:
# either a plain dict with a 'path' key, or an object whose encoded path
# sits in the private '_hf_encoded' attribute -- TODO confirm.
vctk = datasets.load_dataset("kth-tmh/vctk", "default")
if "path" in vctk["train"][0]["audio"]:
    vctk = vctk.filter(lambda example: example["audio"]['path'].endswith("_mic2.flac"))
else:
    # NOTE(review): relies on a private HF internal ('_hf_encoded'); likely
    # fragile across `datasets` releases.
    vctk = vctk.filter(lambda example: example["audio"].__dict__['_hf_encoded']['path'].endswith("_mic2.flac"))
from datasets import Audio
# Resample to 16 kHz so all corpora share one sampling rate.
vctk = vctk.cast_column("audio", Audio(sampling_rate=16000))
# Load the Google Britain/Ireland corpus and resample to 16 kHz to match
# the other corpora.
gbi = datasets.load_dataset("kth-tmh/google-britain-ireland")
gbi = gbi.cast_column("audio", Audio(sampling_rate=16000))
# Sanity check: list the dialect labels present in the corpus.
# (set() over the column replaces the original manual add-loop.)
dialects = set(gbi['train']['dialect'])
print(dialects)
def remap_gbi_sentence_id(item):
    """Map GBI sentence id 'EN0628' onto 'EN0470'.

    NOTE(review): presumably the two ids carry the same prompt text --
    confirm against the corpus.  All other rows pass through unchanged.
    """
    sid = item['sentence_id']
    item['sentence_id'] = 'EN0470' if sid == 'EN0628' else sid
    return item
# BUG FIX: `Dataset.map` returns a NEW dataset; the original code discarded
# the result, so the sentence-id remap never took effect.  Assign it back.
gbi = gbi.map(remap_gbi_sentence_id)
# Speakers excluded because (per the constant's name) they have no mic2
# recordings -- TODO confirm against the VCTK release notes.
NO_MIC2 = ['p315', 'p280']
vctk = vctk.filter(lambda example: example['speaker_id'] not in NO_MIC2)
import json
# Utterance map: joins the same sentence across corpora/speakers onto one
# shared sentence id.
with open("utterance-map.json") as f:
    utterance_map = json.load(f)
def remap_vctk(item, vctk_map=None):
    """Normalize one VCTK row: derive 'dialect' and shared sentence ids.

    Args:
        item: one dataset row (mutable dict) with 'region', 'accent',
            'age', 'speaker_id' and 'text_id' keys.
        vctk_map: optional utterance map (defaults to the module-level
            ``utterance_map``); its 'vctk' entry maps "<speaker>_<text_id>"
            keys to corpus-wide sentence ids.

    Returns:
        The same dict with 'dialect', 'sentence_id' and 'utterance_id'
        set and 'region'/'accent'/'age' removed.
    """
    if vctk_map is None:
        vctk_map = utterance_map
    # Region -> broad dialect grouping.  Applied first; the accent-based
    # overrides below win when both match.
    region_dialects = {
        'Southern English': {'Southern England', 'Surrey', 'Essex', 'London',
                             'Oxford', 'Suffolk', 'SW England', 'SE England'},
        'Midlands English': {'Birmingham', 'Leicester', 'Nottingham',
                             'Staffordshire'},
        'Northern English': {'Manchester', 'Cumbria', 'Stockton-on-tees',
                             'Cheshire', 'Newcastle', 'York', 'Yorkshire'},
    }
    for dialect, regions in region_dialects.items():
        if item['region'] in regions:
            item['dialect'] = dialect
    # Accent-based dialects (non-English-region speakers).
    accent_dialects = {
        'NorthernIrish': 'Irish',
        'SouthAfrican': 'South African',
        'NewZealand': 'New Zealand',
        'Scottish': 'Scottish',
        'Irish': 'Irish',
        'American': 'American',
        'Canadian': 'Canadian',
        'Australian': 'Australian',
    }
    if item['accent'] in accent_dialects:
        item['dialect'] = accent_dialects[item['accent']]
    if 'dialect' not in item:
        item['dialect'] = None  # unmapped rows get filtered out afterwards
    del item['region']
    del item['accent']
    del item['age']
    # "<speaker>_<text_id>" uniquely identifies an utterance; the utterance
    # map collapses it onto a shared sentence id when one exists.
    sentence_key = f"{item['speaker_id']}_{item['text_id']}"
    item['sentence_id'] = vctk_map["vctk"].get(sentence_key, sentence_key)
    item['utterance_id'] = sentence_key
    del item['text_id']
    return item
# Apply the VCTK normalization and drop rows without a mapped dialect.
vctk = vctk.map(remap_vctk)
vctk = vctk.filter(lambda example: example['dialect'] is not None)
# Load CMU Arctic and resample to 16 kHz to match the other corpora.
cmu_arctic = datasets.load_dataset("kth-tmh/cmu_arctic")
cmu_arctic = cmu_arctic.cast_column("audio", Audio(sampling_rate=16000))
cmu_arctic  # notebook display residue; no effect when run as a script
# Per-speaker CMU Arctic prompts that are near-duplicates of other prompts.
# NOTE(review): SIMILAR is not referenced anywhere in the visible part of
# this file -- it looks like notebook scratch kept for reference.  Identity
# entries (e.g. "arctic_b0028": "arctic_b0028") presumably mark prompts
# deliberately left under their own id -- confirm.
SIMILAR = {
    "jmk": {
        "arctic_b0028": "arctic_b0028"
    },
    "awb": {
        "arctic_a0335": "arctic_a0333",
        "arctic_a0336": "arctic_a0334",
        "arctic_a0433": "arctic_c0036",
        "arctic_a0557": "arctic_a0554",
        "arctic_b0211": "arctic_b0211",
        "arctic_b0234": "arctic_b0234"
    }
}
utterance_map.keys()  # notebook display residue; no-op when run as a script
# Pasted output from a previous run -- CMU Arctic prompts missing from the
# utterance map (id, speaker, prompt text):
#   Missing: arctic_b0028 jmk lord fitzhugh is the key to this whole situation
#   Missing: arctic_a0335 awb this is no place fer you
#   Missing: arctic_a0336 awb he'll knock yeh off a few sticks in no time
#   Missing: arctic_a0433 awb her true course had been west 0.75 south
#   Missing: arctic_a0557 awb jack london waikiki beach honolulu oahu t.h
#   Missing: arctic_a0567 awb an hallucination began to trouble him
#   Missing: arctic_b0211 awb this is 1880
#   Missing: arctic_b0234 awb and watch out for wet feet,' was his parting advice
#   Missing: arctic_b0419 awb to begin with i read late
#   Missing: arctic_b0426 awb there were orange green gold green and a copper green
#   Missing: arctic_b0445 awb he sat beside the gendarme and beamed
vctk["train"][0]  # notebook display residue; no-op when run as a script
# awb sentence ids whose normalized counterpart was missed by the main
# utterance map: each key is remapped to the shared sentence id on the
# right.  Identity entries keep the prompt under its own id.
# FIX: removed a duplicated literal key ("arctic_a0260" appeared twice
# with the same value; Python dict literals silently keep the last one).
MISSED_NORMS = {
    "arctic_a0343": "arctic_a0341",
    "arctic_a0427": "arctic_a0425",
    "arctic_b0453": "arctic_b0451",
    "arctic_b0030": "arctic_b0030",
    "arctic_b0365": "arctic_b0365",
    "arctic_b0348": "arctic_b0348",
    "arctic_a0181": "arctic_a0179",
    "arctic_a0063": "arctic_a0063",
    "arctic_b0208": "arctic_b0208",
    "arctic_b0434": "arctic_b0433",
    "arctic_a0301": "arctic_a0299",
    "arctic_a0091": "arctic_a0089",
    "arctic_a0260": "arctic_a0258",
    "arctic_a0060": "arctic_a0060",
    "arctic_b0313": "arctic_b0313",
    "arctic_b0493": "arctic_b0491",
    "arctic_a0167": "arctic_a0165",
    "arctic_a0249": "arctic_a0247",
    "arctic_a0294": "arctic_a0292",
    "arctic_a0592": "arctic_a0588",
    "arctic_b0377": "arctic_b0377",
    "arctic_b0199": "arctic_b0199",
    "arctic_b0280": "arctic_b0280",
    "arctic_a0461": "arctic_a0458",
    "arctic_a0543": "arctic_a0540",
    "arctic_a0392": "arctic_a0390",
    "arctic_b0374": "arctic_b0374",
    "arctic_a0152": "arctic_a0150",
    "arctic_b0107": "arctic_b0107",
    "arctic_b0204": "arctic_b0204",
    "arctic_b0080": "arctic_b0080",
    "arctic_b0334": "arctic_b0334",
    "arctic_a0094": "arctic_a0092",
    "arctic_b0166": "arctic_b0166",
    "arctic_b0041": "arctic_b0041",
    "arctic_a0103": "arctic_a0101",
    "arctic_b0228": "arctic_b0228",
    "arctic_a0001": "arctic_a0001",
    "arctic_b0248": "arctic_b0248",
    "arctic_b0132": "arctic_b0132",
    "arctic_b0181": "arctic_b0181",
    "arctic_b0192": "arctic_b0192",
    "arctic_b0173": "arctic_b0173",
    "arctic_b0274": "arctic_b0274",
    "arctic_b0376": "arctic_b0376",
    "arctic_b0167": "arctic_b0167",
    "arctic_b0013": "arctic_b0013",
    "arctic_b0330": "arctic_b0330",
    "arctic_b0213": "arctic_b0213",
    "arctic_b0344": "arctic_b0344",
    "arctic_b0307": "arctic_b0307",
    "arctic_b0171": "arctic_b0171",
    "arctic_b0091": "arctic_b0091",
    "arctic_b0516": "arctic_b0514",
    "arctic_b0144": "arctic_b0144",
}
def remap_cmu_arctic(item, sentence_map=None, missed_norms=None):
    """Normalize one CMU Arctic row's sentence id onto the shared ids.

    Args:
        item: one dataset row (mutable dict) with 'speaker_id' and
            'sentence_id' keys.
        sentence_map: optional utterance map (defaults to the module-level
            ``utterance_map``) with 'awb' and 'awb_different_norm' entries.
        missed_norms: optional extra awb remap table (defaults to the
            module-level ``MISSED_NORMS``).

    Returns:
        The same dict with 'utterance_id' added and 'sentence_id'
        normalized.

    Raises:
        ValueError: for an awb sentence id not covered by any table.
    """
    if sentence_map is None:
        sentence_map = utterance_map
    if missed_norms is None:
        missed_norms = MISSED_NORMS
    item['utterance_id'] = f"cmu_us_{item['speaker_id']}_{item['sentence_id']}"
    if item['sentence_id'] == "arctic_c0035":
        item['sentence_id'] = "arctic_a0456"
    # jmk's b0028 text differs from the shared prompt; keep it distinct.
    if item['speaker_id'] == "jmk" and item['sentence_id'] == "arctic_b0028":
        item['sentence_id'] = "jmk_arctic_b0028"
    # Prompts unique to awb, plus prompts whose text differs enough from
    # the shared one to keep them separate.  (The original code handled
    # these as two branches with identical behavior; merged here.)
    awb_prefixed = {
        "arctic_b0445", "arctic_b0426", "arctic_b0419", "arctic_a0567",
        "arctic_a0074", "arctic_a0335", "arctic_a0557", "arctic_a0336",
    }
    # awb prompts deliberately kept under their shared id as-is.
    awb_keep = {"arctic_b0234", "arctic_a0433", "arctic_b0211"}
    if item['speaker_id'] == "awb":
        sid = item['sentence_id']
        if sid in sentence_map["awb"]:
            item['sentence_id'] = sentence_map["awb"][sid]
        elif sid in sentence_map["awb_different_norm"]:
            item['sentence_id'] = sentence_map["awb_different_norm"][sid]
        elif sid in missed_norms:
            item['sentence_id'] = missed_norms[sid]
        elif sid in awb_prefixed:
            item['sentence_id'] = f"awb_{sid}"
        elif sid in awb_keep:
            pass  # explicitly unchanged
        else:
            raise ValueError(f"Unexpected sentence_id {sid} for speaker awb")
    return item
# Normalize CMU Arctic sentence ids onto the shared-id scheme.
cmu_arctic = cmu_arctic.map(remap_cmu_arctic)
def fix_arctic(item):
    """Turn the CMU Arctic 'accent' column into a 'dialect' column.

    Accents of the form "<Variant> English" yield the variant name as the
    dialect; anything else maps to None (such rows are dropped later).
    The 'accent' key is removed either way.
    """
    accent = item.pop('accent')
    item['dialect'] = accent.split()[0] if "English" in accent else None
    return item
# Derive dialects for CMU Arctic, then drop rows with no dialect or an
# unknown gender.
cmu_arctic = cmu_arctic.map(fix_arctic)
cmu_arctic = cmu_arctic.filter(lambda example: example['dialect'] is not None)
cmu_arctic = cmu_arctic.filter(lambda example: example['gender'] != "unknown")
gbi  # notebook display residue; no effect when run as a script
cmu_arctic  # notebook display residue
# Merge all three corpora into one training split (VCTK-only columns are
# removed first so the schemas line up).
cmu_gbi = datasets.concatenate_datasets([cmu_arctic["train"], gbi["train"]])
cmu_gbi  # notebook display residue
vctk = vctk.remove_columns(["file", "comment"])
merged = datasets.concatenate_datasets([cmu_gbi, vctk["train"]])
def fix_gender(item):
    """Expand single-letter gender codes ('m'/'f', any case) to words.

    Values that are not a single-letter code pass through unchanged.
    """
    expanded = {"m": "male", "f": "female"}
    item["gender"] = expanded.get(item["gender"].lower(), item["gender"])
    return item
merged = merged.map(fix_gender)
# Rename to the column names the downstream training pipeline expects:
# the shared sentence id becomes the semantic grouping label, and the
# audio column becomes the model's anchor-input column.
merged = merged.rename_columns({
    "sentence_id": "semantic_label",
    "speaker_id": "speaker_id_label",
    "dialect": "dialect_label",
    "gender": "gender_label",
    "audio": "anchor_input_features",
})
from sentence_transformers import SentenceTransformer
# all-mpnet-base-v2 outputs 768-dim, matching wavlm-base-plus (ENCODER_DIM=768).
# If you switch to wavlm-large (1024-dim), use a different text model and
# update ENCODER_DIM accordingly.
# Text encoder used to produce the positive sentence embeddings.
text_model = SentenceTransformer("all-mpnet-base-v2")
# Encode each unique sentence once, keyed by semantic_label.
# semantic_label groups all utterances of the same sentence across speakers,
# so this avoids redundant encoding.
# NOTE(review): when several rows share a label, the comprehension keeps the
# LAST text seen for that label -- fine if texts are identical per label;
# verify.  Both full columns are materialized in memory here.
unique_sentences = {
    label: text
    for label, text in zip(merged["semantic_label"], merged["text"])
}
label_to_embedding = {
    label: text_model.encode(text).tolist()
    for label, text in unique_sentences.items()
}
# Attach the sentence embedding to every row via its label.
merged = merged.map(
    lambda row: {"semantic_pos_sentence_embedding": label_to_embedding[row["semantic_label"]]}
)
merged.save_to_disk("merged_dataset")
# Reload so subsequent operations work off the saved Arrow files.
merged = datasets.load_from_disk("merged_dataset")
def fix_gender(item):
    """Expand single-letter 'gender_label' codes to 'male'/'female'.

    Redefines the earlier ``fix_gender`` for the renamed column.
    NOTE(review): the column was already normalized before the rename, so
    this pass is presumably a no-op safeguard from notebook re-runs.
    """
    code = item["gender_label"].lower()
    if code == "m":
        item["gender_label"] = "male"
    elif code == "f":
        item["gender_label"] = "female"
    return item
merged = merged.map(fix_gender)
# NOTE(review): saving back into the same directory the dataset was loaded
# from can fail in some `datasets` versions ("a dataset can't overwrite
# itself") -- consider writing to a fresh path; verify.
merged.save_to_disk("merged_dataset")