Kashubian - extract text from tlog
tlog output from the Polish wav2vec2 model
import json
# Write one reference-text file per entry of the dataset JSON.
# Each entry supplies an 'audio' path and a 'text' transcript; the transcript
# is saved under /tmp/csb/ using the last component of the audio path with
# '.ogg' replaced by '.txt'.
# Files are opened explicitly as UTF-8 so the result does not depend on the
# platform's locale encoding (the text is presumably Kashubian and therefore
# expected to contain non-ASCII letters).
with open("/tmp/csb/kashubian-data.json", "r", encoding="utf-8") as read_file:
    data = json.load(read_file)

for datum in data:
    # Keep only the final path component of the audio reference.
    txt_name = datum['audio'].split('/')[-1].replace('.ogg', '.txt')
    # Trim every line and drop the ones that end up empty; the text is built
    # before the output file is opened so a malformed entry does not leave an
    # empty file behind.
    stripped = (line.strip() for line in datum['text'].split('\n'))
    text = '\n'.join(line for line in stripped if line)
    with open(f'/tmp/csb/{txt_name}', 'w', encoding="utf-8") as f:
        f.write(text)
import glob
# Convert every recognizer log (*.ogg.wav.tlog) in /tmp/csb/ into a plain-text
# file (*.rec.txt) holding one transcript per line.
# Each tlog is parsed as JSON and iterated; every item must provide a
# 'transcript' key.
# Both files are opened explicitly as UTF-8 so the output does not depend on
# the platform's locale encoding.
for tlog_path in glob.glob('/tmp/csb/*.ogg.wav.tlog'):
    outfile = tlog_path.replace('.ogg.wav.tlog', '.rec.txt')
    with open(tlog_path, "r", encoding="utf-8") as tlog:
        data = json.load(tlog)
    with open(outfile, "w", encoding="utf-8") as rectxt:
        # One buffered call instead of a separate write per transcript.
        rectxt.writelines(f"{datum['transcript']}\n" for datum in data)