The datasets are available on the Hugging Face Hub as jimregan/clarinpl_studio and jimregan/clarinpl_sejmsenat.

!wget http://mowa.clarin-pl.eu/korpusy/audio.tar.gz
!tar zxvf audio.tar.gz
!cat /content/audio/SES0001/spk.txt
SPK0001
!cat /content/audio/SES0001/sent030.txt
gdy maluch już się wypluska wytrzyjcie go dokładnie ręcznikiem posmarujcie jeszcze raz kremem przeciwsłonecznym i ubierzcie w suche u branie
!head /content/SejmSenat/test/wav.scp.orig
!head /content/SejmSenat/test/spk2utt
!head /content/SejmSenat/train/text
!huggingface-cli login
!huggingface-cli repo create clarinpl_studio --type dataset
!rm -rf clarinpl_studio
!git clone https://huggingface.co/datasets/jimregan/clarinpl_studio
Cloning into 'clarinpl_studio'...
remote: Enumerating objects: 6, done.
remote: Counting objects: 100% (6/6), done.
remote: Compressing objects: 100% (5/5), done.
remote: Total 6 (delta 0), reused 0 (delta 0)
Unpacking objects: 100% (6/6), done.
!datasets-cli test clarinpl_studio --save_infos --all_configs
Testing builder 'clean' (1/1)
Downloading and preparing dataset clarin_pl_studio/clean (download: 4.59 GiB, generated: 4.50 MiB, post-processed: Unknown size, total: 4.60 GiB) to /root/.cache/huggingface/datasets/clarin_pl_studio/clean/2.1.0/733df40ff099ad45628c8c755782c0abb5554817218890a3d232ed359122252c...
0 examples [00:00, ? examples/s]2021-04-15 10:43:00.739700: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Dataset clarin_pl_studio downloaded and prepared to /root/.cache/huggingface/datasets/clarin_pl_studio/clean/2.1.0/733df40ff099ad45628c8c755782c0abb5554817218890a3d232ed359122252c. Subsequent calls will reuse this data.
100% 3/3 [00:00<00:00, 176.78it/s]
Dataset Infos file saved at clarinpl_studio/dataset_infos.json
Test successful.
# coding=utf-8
# Copyright 2021 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors.
# Copyright 2021 Jim O'Regan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Lint as: python3
"""ClarinPL Studio automatic speech recognition dataset."""

import glob
import os

import datasets


_CITATION = """\
@article{korvzinek2017polish,
  title={Polish read speech corpus for speech tools and services},
  author={Kor{\v{z}}inek, Danijel and Marasek, Krzysztof and Brocki, {\L}ukasz and Wo{\l}k, Krzysztof},
  journal={arXiv preprint arXiv:1706.00245},
  year={2017}
}
"""

# Human-readable summary of the corpus; passed to
# `DatasetInfo(description=...)` by the builder. It also embeds a short usage
# snippet showing how to turn the "file" path column into an audio array.
_DESCRIPTION = """\
The corpus consists of 317 speakers recorded in 554
sessions, where each session consists of 20 read sentences and 10 phonetically rich words. The size of
the audio portion of the corpus amounts to around 56 hours, with transcriptions containing 356674 words
from a vocabulary of size 46361.

Note that in order to limit the required storage for preparing this dataset, the audio
is stored in the .wav format and is not converted to a float32 array. To convert the audio
file to a float32 array, please make use of the `.map()` function as follows:

```python
import soundfile as sf

def map_to_array(batch):
    speech_array, _ = sf.read(batch["file"])
    batch["speech"] = speech_array
    return batch

dataset = dataset.map(map_to_array, remove_columns=["file"])
```
"""

# Project homepage, reported as `DatasetInfo.homepage`.
_URL = "https://mowa.clarin-pl.eu/"
# Audio + transcript archive; downloaded and extracted by the builder, which
# then reads from the "audio" directory inside it.
_DS_URL = "http://mowa.clarin-pl.eu/korpusy/audio.tar.gz"
# Plain-text session lists (one session directory name per line) that define
# which sessions go into the train / test / valid splits.
_TRAIN_URL = "https://raw.githubusercontent.com/danijel3/ClarinStudioKaldi/master/local_clarin/train.sessions"
_TEST_URL = "https://raw.githubusercontent.com/danijel3/ClarinStudioKaldi/master/local_clarin/test.sessions"
_VALID_URL = "https://raw.githubusercontent.com/danijel3/ClarinStudioKaldi/master/local_clarin/dev.sessions"

class ClarinPLStudioASRConfig(datasets.BuilderConfig):
    """Builder configuration for the ClarinPL Studio ASR dataset."""

    def __init__(self, **kwargs):
        """Create a configuration pinned to dataset version 2.1.0.

        Args:
          **kwargs: keyword arguments forwarded unchanged to
            `datasets.BuilderConfig` (e.g. `name`, `description`). The
            version is set here, so `version` must not be passed.
        """
        dataset_version = datasets.Version("2.1.0", "")
        super().__init__(version=dataset_version, **kwargs)


class ClarinPLStudio(datasets.GeneratorBasedBuilder):
    """ClarinPL Studio dataset.

    Each example pairs the path of a .wav recording with its one-line
    transcript, the speaker id read from the session's ``spk.txt``, and an
    id of the form ``<session>_<utterance>``.
    """

    BUILDER_CONFIGS = [
        ClarinPLStudioASRConfig(name="clean", description="'Clean' speech."),
    ]

    def _info(self):
        """Return the dataset metadata and the feature schema of one example."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "file": datasets.Value("string"),
                    "text": datasets.Value("string"),
                    "speaker_id": datasets.Value("string"),
                    "id": datasets.Value("string"),
                }
            ),
            supervised_keys=("file", "text"),
            homepage=_URL,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the archive and session lists and define the three splits.

        Args:
          dl_manager: download manager supplied by `datasets`; used to fetch
            and extract the audio archive and to fetch the session lists.

        Returns:
          A list of `datasets.SplitGenerator` for "train", "test" and "valid",
          each carrying the extracted audio directory and its session names.
        """

        def get_sessions(path):
            """Read session names from `path`, one per line."""
            with open(path, "r", encoding="utf-8") as f:
                stripped = (line.strip() for line in f)
                # Skip blank lines: an empty session name would resolve to
                # the archive root instead of a session directory.
                return [name for name in stripped if name]

        archive_path = dl_manager.download_and_extract(_DS_URL)
        train_sessions_path = dl_manager.download(_TRAIN_URL)
        test_sessions_path = dl_manager.download(_TEST_URL)
        valid_sessions_path = dl_manager.download(_VALID_URL)

        train_sessions = get_sessions(train_sessions_path)
        test_sessions = get_sessions(test_sessions_path)
        valid_sessions = get_sessions(valid_sessions_path)

        # The tarball unpacks into a top-level "audio" directory that holds
        # one sub-directory per session.
        archive_path = os.path.join(archive_path, "audio")
        return [
            datasets.SplitGenerator(
                name="train",
                gen_kwargs={"archive_path": archive_path, "sessions": train_sessions},
            ),
            datasets.SplitGenerator(
                name="test",
                gen_kwargs={"archive_path": archive_path, "sessions": test_sessions},
            ),
            datasets.SplitGenerator(
                name="valid",
                gen_kwargs={"archive_path": archive_path, "sessions": valid_sessions},
            ),
        ]

    def _generate_examples(self, archive_path, sessions):
        """Generate examples from a ClarinPL Studio archive_path.

        Args:
          archive_path: directory containing one sub-directory per session.
          sessions: iterable of session directory names to include.

        Yields:
          `(key, example)` pairs where `key` is ``<session>_<utterance>`` and
          `example` has the "id", "speaker_id", "file" and "text" fields.

        Raises:
          ValueError: if a speaker or transcript file does not contain
            exactly one line.
        """

        def get_single_line(path):
            """Return the only (stripped) line of the text file at `path`."""
            with open(path, "r", encoding="utf-8") as f:
                lines = [line.strip() for line in f]
            # Explicit check rather than `assert`, which is removed under -O.
            if len(lines) != 1:
                raise ValueError(
                    f"Expected exactly one line in {path}, found {len(lines)}"
                )
            return lines[0]

        for session in sessions:
            session_path = os.path.join(archive_path, session)
            speaker = get_single_line(os.path.join(session_path, "spk.txt"))
            # Escape the directory part so glob metacharacters in the cache
            # path are matched literally.
            text_glob = os.path.join(glob.escape(session_path), "*.txt")
            # Sorted so example order is deterministic across file systems.
            for text_file in sorted(glob.glob(text_glob)):
                if text_file.endswith("spk.txt"):
                    # The speaker file is metadata, not a transcript.
                    continue
                # Strip only the trailing extension; str.replace would also
                # rewrite any ".txt" occurring earlier in the path.
                stem = os.path.splitext(text_file)[0]
                key = f"{session}_{os.path.basename(stem)}"
                example = {
                    "id": key,
                    "speaker_id": speaker,
                    "file": stem + ".wav",
                    "text": get_single_line(text_file),
                }
                yield key, example
!pip install datasets
from datasets import load_dataset
dataset = load_dataset('clarinpl_studio.py')
Downloading and preparing dataset clarin_pl_studio/clean (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/clarin_pl_studio/clean/2.1.0/733df40ff099ad45628c8c755782c0abb5554817218890a3d232ed359122252c...
Dataset clarin_pl_studio downloaded and prepared to /root/.cache/huggingface/datasets/clarin_pl_studio/clean/2.1.0/733df40ff099ad45628c8c755782c0abb5554817218890a3d232ed359122252c. Subsequent calls will reuse this data.
dataset
DatasetDict({
    train: Dataset({
        features: ['file', 'text', 'speaker_id', 'id'],
        num_rows: 11222
    })
    test: Dataset({
        features: ['file', 'text', 'speaker_id', 'id'],
        num_rows: 1362
    })
    valid: Dataset({
        features: ['file', 'text', 'speaker_id', 'id'],
        num_rows: 1229
    })
})
import IPython
import IPython.display  # import the submodule explicitly instead of relying on the kernel having loaded it

# Fetch the single row first and then its "file" field; indexing the column
# first (dataset['train']['file']) builds a list of every path in the split.
IPython.display.Audio(dataset['train'][2184]['file'])
dataset
DatasetDict({
    train: Dataset({
        features: ['file', 'text', 'speaker_id', 'id'],
        num_rows: 6622
    })
    test: Dataset({
        features: ['file', 'text', 'speaker_id', 'id'],
        num_rows: 130
    })
})
dataset['train'][0]
{'file': '/root/.cache/huggingface/datasets/downloads/extracted/333ddc746f2df1e1d19b44986992d4cbe28710fde81d533a220e755ee6c5c519/audio/SES0001/rich001.wav',
 'id': 'SES0001_rich001',
 'speaker_id': 'SPK0001',
 'text': 'drożdże dżip gwożdżenie ozimina wędzarz rdzeń wędzonka ingerować kładzenie jutrzenka'}
!wc -l /root/.cache/huggingface/datasets/downloads/extracted/4143b1d75559b10028c1c7e8800c9ccc05934ca5a8ea15f8f9a92770576a1ee3/SejmSenat/*/text
    130 /root/.cache/huggingface/datasets/downloads/extracted/4143b1d75559b10028c1c7e8800c9ccc05934ca5a8ea15f8f9a92770576a1ee3/SejmSenat/test/text
   6622 /root/.cache/huggingface/datasets/downloads/extracted/4143b1d75559b10028c1c7e8800c9ccc05934ca5a8ea15f8f9a92770576a1ee3/SejmSenat/train/text
   6752 total
!find /root/.cache/huggingface/datasets/downloads/extracted/4143b1d75559b10028c1c7e8800c9ccc05934ca5a8ea15f8f9a92770576a1ee3/SejmSenat/audio/ -type f|wc
   6752    6752 1159384
!rm -rf SejmSenat/