"""Process test/validation data for the SBTal Riksdag dataset."""
from pathlib import Path
from pydub import AudioSegment
# Root directory; every input and output location below is derived from it.
BASE = Path("/home/joregan")

# Directory globbed for the per-recording "*.tsv" transcript files.
TSV_PATH = BASE / "train-valid-deliverable" / "round1" / "C1A1"
# Directory holding the source recordings, looked up as "<stem>.wav".
AUDIO_PATH = BASE.joinpath("train-valid-deliverable", "C1_audio")
# Directory the cut-out segment WAV files are written to.
OUT_PATH = BASE.joinpath("train-valid-deliverable", "split")

# Extra command-line arguments handed to pydub's export(): one channel,
# signed 16-bit little-endian PCM, 16000 Hz sample rate.
parameters = [
    "-ac", "1",
    "-acodec", "pcm_s16le",
    "-ar", "16000",
]

# A transcript row whose whole text is one of these markers is skipped.
MARKERS = [
    "#BREATH", "#COUGH", "#EH", "#INAUDIBLE", "#LIPSMACK",
    "#NOISE", "#OTHER", "#PAUSE", "#TRUNC", "#UNKNOWN",
]
# Held-out speakers, one per line: running number, speaker name, split label.
# Labels starting with "TEST" select the test set; every other label (the
# "VAL_*" rows) selects the validation set. The _M/_F suffix is not used.
TEST_VALID_SPEAKERS = """
01 Jörgen Hellman TEST_M
02 Agneta Gille VAL_F
03 Amir Adan TEST_M
04 Teresa Carvalho TEST_F
05 Kerstin Nilsson VAL_F
06 Niclas Malmberg VAL_M
07 Carina Ståhl Herrstedt TEST_F
08 Vasiliki Tsouplaki VAL_F
09 Cecilie Tenfjord Toftby VAL_F
10 Ann-Britt Åsebol TEST_F
11 Karin Nilsson TEST_F
12 Ingemar Nilsson TEST_M
13 Mats Nordberg TEST_M
14 Ulrika Jörgensen TEST_F
15 Aylin Fazelian VAL_F
16 Björn Wiechel VAL_M
17 Sedat Dogru VAL_M
18 Oskar Öholm TEST_M
19 Eva Lohman VAL_F
20 Karin Granbom Ellison TEST_F
21 Åsa Karlsson VAL_F
22 Yilmaz Kerimo VAL_M
23 Aphram Melki TEST_M
24 Yasmine Bladelius TEST_F
25 Désirée Liljevall VAL_F
26 Erik Slottner VAL_M
27 Gustav Nilsson VAL_M
28 Linda Wemmert TEST_F
29 Mats Sander VAL_M
30 Arin Karapet VAL_M
31 Daniel Andersson TEST_M
32 Daniel Andersson TEST_M
""".replace("32 Daniel Andersson", "32 David Josefsson")

# Speaker numbers (as the zero-padded strings used in the table) per split.
TEST = []
VALID = []
for line in TEST_VALID_SPEAKERS.splitlines():
    # Split on any whitespace instead of requiring tabs: if the table's tabs
    # are ever converted to spaces, a tab-only parser skips every row and
    # leaves both lists empty. Names contain spaces, so take the first field
    # as the number and the last field as the label.
    parts = line.split()
    if len(parts) < 3:
        # Blank or incomplete line (e.g. the empty first/last line).
        continue
    speaker_number, split_label = parts[0], parts[-1]
    if split_label.startswith("TEST"):
        TEST.append(speaker_number)
    else:
        VALID.append(speaker_number)
# NOTE(review): OUT_DIR has the same value as OUT_PATH and is not used by the
# code below; kept so that any external reference keeps working.
OUT_DIR = BASE / "train-valid-deliverable" / "split"


def _clean_transcript(text):
    """Return *text* with annotation tokens removed.

    Tokens that start with "#" or end with "*" are dropped. Splitting on any
    whitespace also collapses repeated spaces, so a row made up only of
    dropped tokens always yields "" rather than stray blanks.
    """
    return " ".join(
        word
        for word in text.split()
        if not (word.startswith("#") or word.endswith("*"))
    )


# Make sure the segment directory exists before anything is exported into it.
OUT_PATH.mkdir(parents=True, exist_ok=True)

# Both manifests are opened in one `with` so they are flushed and closed even
# if a recording fails part-way through. UTF-8 is set explicitly because the
# transcripts contain non-ASCII characters.
with open(str(BASE / "valid.tsv"), "w", encoding="utf-8") as valid_tsv, \
        open(str(BASE / "test.tsv"), "w", encoding="utf-8") as test_tsv:
    valid_tsv.write("path\tduration\ttext\n")
    test_tsv.write("path\tduration\ttext\n")

    for tsvfile in TSV_PATH.glob("*.tsv"):
        stem = tsvfile.stem
        # The recording shares the transcript's stem.
        wav_as = AudioSegment.from_wav(str(AUDIO_PATH / f"{stem}.wav"))

        # The speaker number is the second "-"-separated piece of the stem.
        speaker_id = stem.split("-")[1]
        # Anything that is not a listed validation speaker goes to test.
        manifest = valid_tsv if speaker_id in VALID else test_tsv

        # NOTE(review): assumes the transcript TSVs are UTF-8 — confirm.
        with open(str(tsvfile), encoding="utf-8") as tsv:
            for line in tsv:
                if line.startswith("Start"):
                    # Header row.
                    continue
                parts = line.strip().split("\t")
                if len(parts) < 3:
                    # Blank or truncated row: nothing to cut out.
                    continue
                raw_text = parts[2].strip()
                if raw_text in MARKERS:
                    # The whole row is a single non-speech marker.
                    continue

                text = _clean_transcript(raw_text)
                if text == "":
                    continue

                start = int(parts[0])
                end = int(parts[1])
                dur = end - start
                outname = OUT_PATH / f"{stem}_{start}_{end}.wav"

                manifest.write(f"{str(outname)}\t{dur}\t{text}\n")
                # pydub slices by milliseconds, so start/end are used as
                # millisecond offsets into the recording. Slice only once
                # the row is known to be kept.
                wav_as[start:end].export(
                    str(outname), format="wav", parameters=parameters
                )