Create a Hugging Face dataset from Hungarian TTS data.
Mostly, it's the push_to_hub part that I'll forget.
# Root of the local checkout of the Hungarian single-speaker TTS corpus.
PATH = "/Users/joregan/Playing/hu-tts/hungarian-single-speaker-tts"
!pip install datasets librosa
from pathlib import Path
import datasets
# Corpus root as a Path object.
path = Path(PATH)
# Output directory for generated data (not used in the visible code — presumably
# used further on; verify before removing).
outdir = path / "data"
# Pipe-separated transcript file shipped with the corpus.
transcript = path / "transcript.txt"
def clean_text(text, norm=False):
    """Clean one transcript line from the Hungarian TTS corpus.

    With ``norm=False`` only light fixes are applied: a dangling " -"
    continuation marker is dropped from the end, and a corpus artifact
    where the space after a period is missing before "S" is repaired.

    With ``norm=True`` the text is additionally normalized for TTS:
    sentence-internal punctuation (followed by a space) is removed,
    one trailing punctuation mark is stripped, and the result is
    lowercased.
    """
    # Drop a dangling " -" continuation marker at the end of the line.
    if text.endswith(" -"):
        text = text[:-2]
    # Corpus artifact: restore the missing space after a period before "S".
    text = text.replace(".S ", ". S ")
    if norm:
        text = text.replace(". ", " ")
        # Strip a single trailing punctuation mark (but never empty the text).
        if len(text) > 1 and text[-1] in (".", "!", "?", ":"):
            text = text[:-1]
        text = text.replace(" -, ", " ")
        text = text.replace(" - ", " ")
        # NOTE: the original repeated `.replace(". ", " ")` here; it was a
        # no-op (no ". " can survive or be re-created by the steps above),
        # so it has been removed.
        text = text.replace(", ", " ")
        text = text.replace("? ", " ")
        text = text.replace(": ", " ")
        text = text.replace("! ", " ")
        text = text.lower()
    return text
# Load the pipe-separated transcript into a list of field lists.
# Each row is `line.split("|")`; fields 0 (wav path), 1 (text) and
# 3 (duration) are consumed by data_gen below — field 2's meaning is
# not visible here.
# `open` accepts a Path directly; iterate the file lazily instead of
# materializing it with readlines(), and be explicit about the encoding.
with open(transcript, encoding="utf-8") as ts:
    data = [line.strip().split("|") for line in ts]
def data_gen(rows=None, base_path=None):
    """Yield one dataset record per transcript row.

    Args:
        rows: iterable of transcript rows (lists of "|"-split fields);
            defaults to the module-level ``data``. Backward compatible:
            calling ``data_gen()`` behaves exactly as before.
        base_path: directory the relative wav paths are resolved against;
            defaults to the module-level ``path``.

    Yields:
        dicts with keys ``id``, ``audio`` (absolute wav path as str),
        ``original_text`` (lightly cleaned), ``text`` (fully normalized,
        lowercased) and ``duration`` (float seconds — presumably; taken
        verbatim from field 3 of the transcript).
    """
    rows = data if rows is None else rows
    base = path if base_path is None else Path(base_path)
    for row in rows:
        filepath = row[0]
        # e.g. "wavs/foo.wav" -> "foo"; assumes exactly one directory
        # component in the relative path — TODO confirm against transcript.
        fileid = filepath.split("/")[1].replace(".wav", "")
        text = row[1]
        lightly_cleaned = clean_text(text)
        fully_cleaned = clean_text(text, True)
        yield {
            "id": fileid,
            "audio": str(base / filepath),
            "original_text": lightly_cleaned,
            "text": fully_cleaned,
            "duration": float(row[3]),
        }
# Features: build the Dataset, attach audio decoding, and push to the Hub.
from datasets import Dataset, Audio

# Materialize the dataset from the generator, then attach audio decoding.
raw = Dataset.from_generator(data_gen)
# assumes the source wavs are 22.05 kHz — TODO confirm
dataset = raw.cast_column("audio", Audio(sampling_rate=22050))
# Publish to the Hub in shards of at most 500 MB each.
dataset.push_to_hub("KTH/hungarian-single-speaker-tts", max_shard_size="500MB")