Convert the CMU Arctic corpus to a Hugging Face dataset.
Retains the original sentence IDs, but note that these don't match for awb.
from pathlib import Path
# Root directory that is globbed for "cmu_us_*" speaker directories below;
# "." means the current working directory.
base = Path(".")
# Speaker ID -> gender label. Lookups elsewhere in this file go through
# GENDER.get(speaker_id, "unknown"), so speakers not listed here are labelled
# "unknown".
# NOTE(review): labels are reproduced as entered; they have not been checked
# against the CMU Arctic voice documentation here.
GENDER = {
    **dict.fromkeys(
        ("slt", "clb", "axb", "eey", "ljm", "lnh"),
        "female",
    ),
    **dict.fromkeys(
        (
            "bdl", "rms", "jmk", "awb", "ksp", "ahw",
            "aup", "fem", "gka", "rxr", "slp",
        ),
        "male",
    ),
}
# Speaker ID -> accent label. Lookups elsewhere in this file go through
# ACCENTS.get(speaker_id, "American English"), so only the speakers that
# should NOT get that default are listed here.
# NOTE(review): label styles are mixed ("Indian English" vs "Indian",
# "German", "Israeli"); confirm whether they should be normalised.
ACCENTS = dict(
    jmk="Canadian English",
    awb="Scottish English",
    ksp="Indian English",
    ahw="German",
    aup="Indian",
    axb="Indian",
    fem="German",
    gka="Indian",
    rxr="Israeli",
    slp="Indian",
)
# Transcripts keyed by speaker ID, then by utterance ID (the wav file stem).
# Presumably these cover recordings whose IDs are absent from the speaker's
# etc/txt.done.data prompt file -- TODO confirm against the corpus.
# NOTE(review): the captured run output in this file also reports
# arctic_a0456 for jmk, which has no entry here.
MISSING = {
"jmk": {
"arctic_a0564": "Then it is as I said, Womble announced",
"arctic_b0313": "The apron string loomed near",
"arctic_b0229": "I saw it all myself",
"arctic_a0512": "No, it is a palace",
"arctic_a0576": "And the big Persian knew of his existence",
"arctic_a0542": "Without a doubt",
"arctic_a0392": "There is that magnificent Bob",
"arctic_a0568": "He was just bursting with joy",
# NOTE(review): the next transcript ends mid-word; confirm it is intentional.
"arctic_a0578": "But we'll just pos-",
"arctic_a0341": "Why, doggone you all",
"arctic_a0208": "Youth had come back to her",
"arctic_a0130": "She was his now",
"arctic_a0108": "He waded into the edge of the water",
"arctic_b0223": "They likewise are disinclined",
"arctic_a0561": "Bill lingered, contemplating his work",
"arctic_a0575": "There weren't any missions",
"arctic_a0565": "With them were Indians"
},
"eey": {
"arctic_a0282": "If you mean to insinuate -- Brentwood began hotly."
},
"bdl": {
"arctic_a0507": "In short, my joyous individualism was dominated by the orthodox bourgeois ethic."
}
}
def read_text(file: Path) -> dict:
    """Parse a prompt file into a ``{utterance_id: text}`` mapping.

    Each useful line is expected to contain an identifier followed by a
    double-quoted transcript, e.g. ``( arctic_a0001 "Some sentence." )``.
    The transcript is everything between the first and the last double
    quote on the line; the identifier is whatever precedes the first quote,
    with surrounding whitespace and a leading ``(`` removed.

    Lines that do not contain a quoted string (blank lines, stray text) are
    skipped instead of producing a bogus empty-key entry.

    Args:
        file: Path of the prompt file to read.

    Returns:
        Dict mapping utterance ID to transcript text. If an ID occurs more
        than once, the last occurrence wins.

    Raises:
        OSError: If the file cannot be opened.
    """
    text_pairs = {}
    # Explicit encoding so the result does not depend on the platform locale.
    with open(file, encoding="utf-8") as f:
        # Iterate lazily; there is no need to load every line up front.
        for line in f:
            text_start = line.find('"')
            text_end = line.rfind('"')
            if text_start == -1 or text_end <= text_start:
                # No opening/closing quote pair on this line: nothing to parse.
                continue
            # Only strip a "(" that is really there, rather than blindly
            # dropping the first character of the line.
            text_id = line[:text_start].strip().lstrip("(").strip()
            text_pairs[text_id] = line[text_start + 1:text_end]
    return text_pairs
import soundfile as sf
from datasets import Dataset, Audio, Features, Value
# Build one example record per wav file found under each "cmu_us_*" speaker
# directory. Transcripts come from the speaker's etc/txt.done.data file and
# fall back to the MISSING table; wav files with no transcript in either
# place are reported and skipped.
examples = []
# sorted() makes the example order reproducible; glob order is unspecified.
for speaker_dir in sorted(base.glob("cmu_us_*")):
    if not speaker_dir.is_dir():
        # The pattern can also match plain files (e.g. downloaded archives).
        continue
    # The glob guarantees the "cmu_us_" prefix, so the speaker ID is always
    # the third "_"-separated field (e.g. "cmu_us_slt_arctic" -> "slt").
    speaker_id = speaker_dir.name.split("_")[2]
    accent = ACCENTS.get(speaker_id, "American English")
    gender = GENDER.get(speaker_id, "unknown")
    pairs = read_text(speaker_dir / "etc" / "txt.done.data")
    fallback_texts = MISSING.get(speaker_id, {})
    for wav_file in sorted(speaker_dir.glob("wav/*.wav")):
        wav_id = wav_file.stem
        # Prefer the prompt file; otherwise use the fallback transcript.
        transcript = pairs.get(wav_id, fallback_texts.get(wav_id))
        if transcript is None:
            print(f"Missing text for {wav_id} in {speaker_id}")
            continue
        examples.append({
            "audio": str(wav_file.resolve()),
            "text": transcript,
            "sentence_id": wav_id,
            "speaker_id": speaker_id,
            "gender": gender,
            "accent": accent,
        })
Missing text for arctic_a0564 in jmk
Missing text for arctic_b0313 in jmk
Missing text for arctic_b0229 in jmk
Missing text for arctic_a0512 in jmk
Missing text for arctic_a0576 in jmk
Missing text for arctic_a0456 in jmk
Missing text for arctic_a0542 in jmk
Missing text for arctic_a0392 in jmk
Missing text for arctic_b0223 in jmk
Missing text for arctic_a0130 in jmk
Missing text for arctic_a0108 in jmk
Missing text for arctic_a0578 in jmk
Missing text for arctic_a0568 in jmk
Missing text for arctic_a0341 in jmk
Missing text for arctic_a0561 in jmk
Missing text for arctic_a0575 in jmk
Missing text for arctic_a0208 in jmk
Missing text for arctic_a0565 in jmk
Missing text for arctic_a0507 in bdl
Missing text for arctic_a0282 in eey
import pandas as pd
# Turn the collected example records into a Hugging Face dataset and save it.
df = pd.DataFrame(examples)
dataset = Dataset.from_pandas(df)
# The "audio" column holds file paths; casting it to the Audio feature marks
# it as audio. Audio's first positional parameter is `sampling_rate`, so the
# previous Audio("wav") set sampling_rate="wav". Audio() leaves the sampling
# rate unset so each file keeps its native rate.
dataset = dataset.cast_column("audio", Audio())
# NOTE(review): this writes the dataset files into the current directory,
# the same directory that is scanned for the source corpus.
dataset.save_to_disk(".")