# wav2vec-u Common Voice Swedish - prepare TSV manifests
# For use with wav2vec-u.
# Original here (the link target was not preserved in this copy).
import soundfile
# Output split name -> Common Voice TSV describing that split.
# (Named SPLIT_TSVS rather than `input` so the builtin input() is not shadowed.)
SPLIT_TSVS = {
    'train': '/kaggle/input/download-common-voice-swedish/cv-corpus-6.1-2020-12-11/sv-SE/train.tsv',
    'test': '/kaggle/input/download-common-voice-swedish/cv-corpus-6.1-2020-12-11/sv-SE/test.tsv',
    'valid': '/kaggle/input/download-common-voice-swedish/cv-corpus-6.1-2020-12-11/sv-SE/dev.tsv',
}

# Directory holding the converted .wav clips; written verbatim as the first
# line of every manifest and prefixed to each clip name to locate the audio.
AUDIO_ROOT = '/kaggle/input/common-voice-swedish-16bit-wav/'

# Directory the generated `<split>.tsv` manifests are written to.
OUTPUT_DIR = '/kaggle/working'


def write_manifest(source_tsv, manifest_path, audio_root=AUDIO_ROOT,
                   count_frames=None):
    """Write a `<root>` + `<file>\\t<frames>` manifest for one split.

    The first output line is ``audio_root``. Every following line is the
    clip's file name (second column of ``source_tsv`` with a trailing
    ``.mp3`` swapped for ``.wav``), a tab, and the clip's frame count.

    Args:
        source_tsv: Path of the tab-separated input file; the clip file
            name is read from its second column.
        manifest_path: Path of the manifest file to create (overwritten).
        audio_root: Prefix prepended to each clip name to find the audio.
        count_frames: Callable taking the audio path and returning its
            frame count. Defaults to ``soundfile.info(path).frames``.

    Returns:
        The number of clip entries written (the root line is not counted).
    """
    if count_frames is None:
        def count_frames(wav_path):
            return soundfile.info(wav_path).frames

    written = 0
    # Explicit UTF-8: the sentence column contains non-ASCII characters and
    # the platform default encoding is not guaranteed to be UTF-8.
    with open(source_tsv, 'r', encoding='utf-8') as tsv, \
            open(manifest_path, 'w', encoding='utf-8') as out:
        print(audio_root, file=out)
        for line in tsv:  # stream; no need to hold the whole file in memory
            fields = line.rstrip('\n').split('\t')
            # Skip blank/short rows (previously an IndexError), rows without
            # a clip name, and the header row.
            if len(fields) < 2 or not fields[1] or fields[1] == 'path':
                continue
            name = fields[1]
            # Swap only the extension; str.replace would also rewrite any
            # '.mp3' occurring earlier in the name.
            if name.endswith('.mp3'):
                name = name[:-len('.mp3')] + '.wav'
            frames = count_frames(f'{audio_root}{name}')
            print(f'{name}\t{frames}', file=out)
            written += 1
    return written


if __name__ == '__main__':
    for split, source_tsv in SPLIT_TSVS.items():
        write_manifest(source_tsv, f'{OUTPUT_DIR}/{split}.tsv')