Create a Hugging Face dataset from Hungarian TTS data.
Mostly, it's the push_to_hub part that I'll forget.
# Root of the local checkout of the Hungarian single-speaker TTS corpus.
PATH = "/Users/joregan/Playing/hu-tts/hungarian-single-speaker-tts"
!pip install datasets librosa
from pathlib import Path
import datasets
# Corpus root as a Path object.
path = Path(PATH)
# Output directory for generated data (not used in the visible code — presumably
# used further on; verify before removing).
outdir = path / "data"
# Pipe-separated transcript file shipped with the corpus.
transcript = path / "transcript.txt"
def clean_text(text, norm=False):
    """Clean one transcript line from the Hungarian TTS corpus.

    With ``norm=False`` only light fixes are applied: a dangling " -"
    continuation marker is dropped from the end, and a corpus artifact
    where the space after a period is missing before "S" is repaired.

    With ``norm=True`` the text is additionally normalized for TTS:
    sentence-internal punctuation (followed by a space) is removed,
    one trailing punctuation mark is stripped, and the result is
    lowercased.
    """
    # Drop a dangling " -" continuation marker at the end of the line.
    if text.endswith(" -"):
        text = text[:-2]
    # Corpus artifact: restore the missing space after a period before "S".
    text = text.replace(".S ", ". S ")
    if norm:
        text = text.replace(". ", " ")
        # Strip a single trailing punctuation mark (but never empty the text).
        if len(text) > 1 and text[-1] in (".", "!", "?", ":"):
            text = text[:-1]
        text = text.replace(" -, ", " ")
        text = text.replace(" - ", " ")
        # NOTE: the original repeated `.replace(". ", " ")` here; it was a
        # no-op (no ". " can survive or be re-created by the steps above),
        # so it has been removed.
        text = text.replace(", ", " ")
        text = text.replace("? ", " ")
        text = text.replace(": ", " ")
        text = text.replace("! ", " ")
        text = text.lower()
    return text
# Load the pipe-separated transcript into a list of field lists.
# Each row is `line.split("|")`; fields 0 (wav path), 1 (text) and
# 3 (duration) are consumed by data_gen below — field 2's meaning is
# not visible here.
# `open` accepts a Path directly; iterate the file lazily instead of
# materializing it with readlines(), and be explicit about the encoding.
with open(transcript, encoding="utf-8") as ts:
    data = [line.strip().split("|") for line in ts]
def data_gen(rows=None, base_path=None):
    """Yield one dataset record per transcript row.

    Args:
        rows: iterable of transcript rows (lists of "|"-split fields);
            defaults to the module-level ``data``. Backward compatible:
            calling ``data_gen()`` behaves exactly as before.
        base_path: directory the relative wav paths are resolved against;
            defaults to the module-level ``path``.

    Yields:
        dicts with keys ``id``, ``audio`` (absolute wav path as str),
        ``original_text`` (lightly cleaned), ``text`` (fully normalized,
        lowercased) and ``duration`` (float seconds — presumably; taken
        verbatim from field 3 of the transcript).
    """
    rows = data if rows is None else rows
    base = path if base_path is None else Path(base_path)
    for row in rows:
        filepath = row[0]
        # e.g. "wavs/foo.wav" -> "foo"; assumes exactly one directory
        # component in the relative path — TODO confirm against transcript.
        fileid = filepath.split("/")[1].replace(".wav", "")
        text = row[1]
        lightly_cleaned = clean_text(text)
        fully_cleaned = clean_text(text, True)
        yield {
            "id": fileid,
            "audio": str(base / filepath),
            "original_text": lightly_cleaned,
            "text": fully_cleaned,
            "duration": float(row[3]),
        }
# Features: build the Dataset, attach audio decoding, and push to the Hub.
from datasets import Dataset, Audio

# Materialize the dataset from the generator, then attach audio decoding.
raw = Dataset.from_generator(data_gen)
# assumes the source wavs are 22.05 kHz — TODO confirm
dataset = raw.cast_column("audio", Audio(sampling_rate=22050))
# Publish to the Hub in shards of at most 500 MB each.
dataset.push_to_hub("KTH/hungarian-single-speaker-tts", max_shard_size="500MB")