WhisperX JSON to word-level TSV,
including basic number denormalisation (numerals spelled out as words)
from pathlib import Path
# Input directory: every "*.json" file found directly in it is converted.
BASE = Path("/Users/joregan/Playing/hsi/audio/whisperx-json/")
# Output directory for the word-level TSV files (one per input JSON file).
TSVBASE = Path("/Users/joregan/Playing/hsi/audio/whisperx-tsv/")
import json
# Make sure the output directory exists before any TSV file is written.
# exist_ok avoids the check-then-create race; parents creates any missing
# intermediate directories.  A FileExistsError is still raised if the path
# exists but is not a directory.
TSVBASE.mkdir(parents=True, exist_ok=True)
%pip install num2words
from num2words import num2words
import re
def get_words(word):
    """Spell out a bare numeric token as English words.

    The token must consist of the number only; surrounding punctuation is
    not stripped here.  Recognised forms, checked in this order:

    * decades        "20s" .. "90s"      -> "twenties" .. "nineties"
    * ordinals       "3rd", "21st"       -> ordinal words
    * grouped ints   "1,000"             -> cardinal words
    * whole dollars  "$5", "$1,000"      -> cardinal words + "dollar(s)"
    * years          "2010" .. "2099"    -> "twenty " + last two digits
    * plain ints     "42"                -> cardinal words

    Args:
        word: The token to convert.

    Returns:
        The spelled-out form, or ``word`` unchanged when it matches none of
        the forms above.
    """
    # Patterns are raw strings so that "\$" reaches the regex engine as
    # written instead of being an invalid string escape.
    if re.match(r"^[2-9]0s$", word):
        tens = num2words(int(word[:-1]), lang="en")
        # The pattern only admits 20..90, which all end in "y":
        # drop it and append "ies".
        return tens[:-1] + "ies"
    if re.match(r"^[0-9]+(?:th|st|nd|rd)$", word):
        # Drop the two-letter ordinal suffix before converting.
        return num2words(int(word[:-2]), lang="en", to="ordinal")
    if re.match(r"^[0-9]+(?:,[0-9][0-9][0-9])+$", word):
        return num2words(int(word.replace(",", "")), lang="en")
    if re.match(r"^\$[0-9]+(?:,[0-9][0-9][0-9])*$", word):
        amount = int(word[1:].replace(",", ""))
        # Use the singular unit for exactly one dollar.
        unit = "dollar" if amount == 1 else "dollars"
        return num2words(amount, lang="en") + " " + unit
    if re.match(r"^20[1-9][0-9]$", word):
        # Read as a year: "2015" -> "twenty fifteen".
        return "twenty " + num2words(int(word[2:]), lang="en")
    if re.match(r"^[0-9]+$", word):
        return num2words(int(word), lang="en")
    return word
# Write one word-level TSV per input JSON file: start <TAB> end <TAB> text.
for jsonfile in BASE.glob("*.json"):
    # End time of the previously written word; restarts at zero for each file.
    last_end = 0.0
    outtsv = TSVBASE / f"{jsonfile.stem}.tsv"
    # Open both files as UTF-8 explicitly rather than relying on the
    # platform's default encoding.
    with open(jsonfile, encoding="utf-8") as inf, \
            open(outtsv, "w", encoding="utf-8") as outf:
        data = json.load(inf)
        for segment in data["segments"]:
            for word in segment["words"]:
                if "start" not in word:
                    # No timestamp on this word (presumably it could not be
                    # aligned -- TODO confirm): give it a nominal 50 ms slot
                    # straight after the previous word and spell out the
                    # token if it is a number.
                    start = last_end
                    end = last_end + 0.05
                    text = get_words(word["word"])
                else:
                    start = word["start"]
                    end = word["end"]
                    text = word["word"]
                outf.write(f"{start}\t{end}\t{text}\n")
                last_end = end
# From here on TSVBASE points at the output directory for the
# sentence-level TSV files.
TSVBASE = Path("/Users/joregan/Playing/hsi/audio/whisperx-tsv-sentence/")
# exist_ok avoids the check-then-create race; parents creates any missing
# intermediate directories.  A FileExistsError is still raised if the path
# exists but is not a directory.
TSVBASE.mkdir(parents=True, exist_ok=True)
def get_words_punct(word):
    """Spell out a numeric token that may carry surrounding punctuation.

    The token is split into a prefix of non-alphanumeric characters, a core
    running from the first to the last ASCII letter or digit, and a suffix
    of non-alphanumeric characters.  Only the core is passed to
    ``get_words``; prefix and suffix are re-attached unchanged, so
    ``"(2015)."`` keeps its ``"("`` and ``")."``.

    A ``"$"`` immediately before a core that starts with a digit is kept
    with the core, so that ``get_words`` sees ``"$50"`` and can apply its
    dollar rule instead of receiving a bare ``"50"``.

    Args:
        word: A single whitespace-delimited token.

    Returns:
        The token with its core spelled out where ``get_words`` recognises
        it; otherwise the token unchanged.
    """
    # Greedy prefix, core bounded by alphanumerics on both sides, then
    # whatever punctuation is left.  Bounding the core this way lets the
    # suffix hold any number of characters and also admits a
    # single-character core such as "5".
    m = re.match(
        r"^([^A-Za-z0-9]*)([A-Za-z0-9](?:.*[A-Za-z0-9])?)([^A-Za-z0-9]*)$",
        word,
    )
    if m is None:
        # No ASCII letter or digit (or an embedded newline): nothing to split.
        return get_words(word)
    pre, core, post = m.groups()
    if pre.endswith("$") and core[0].isdigit():
        # Hand the currency sign to get_words together with the number.
        pre = pre[:-1]
        core = "$" + core
    return pre + get_words(core) + post
# Write one sentence-level TSV per input JSON file:
# segment start <TAB> segment end <TAB> segment text with numbers spelled out.
for jsonfile in BASE.glob("*.json"):
    outtsv = TSVBASE / f"{jsonfile.stem}.tsv"
    # Open both files as UTF-8 explicitly rather than relying on the
    # platform's default encoding.
    with open(jsonfile, encoding="utf-8") as inf, \
            open(outtsv, "w", encoding="utf-8") as outf:
        data = json.load(inf)
        for segment in data["segments"]:
            start = segment["start"]
            end = segment["end"]
            # Split on single spaces only, so that joining with " " below
            # reproduces the original spacing exactly.
            tokens = segment["text"].strip().split(" ")
            normed = " ".join(get_words_punct(tok) for tok in tokens)
            outf.write(f"{start}\t{end}\t{normed}\n")