Rewrite NST Whisper output
Local, incomplete
import json
# Source transcript dump and the destination for its numerically ordered copy.
in_path = "/Users/joregan/Playing/nst_swedish_tts/whisper-v3-aligned-to-sw_all.json"
out_path = "/Users/joregan/Playing/nst_swedish_tts/whisper-v3-aligned-to-sw_all_ordered.json"

with open(in_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Pair every entry with the integer value of its key and order numerically
# (string ordering would put "10" before "9").
items = sorted(((int(k), v) for k, v in data.items()), key=lambda pair: pair[0])

# Keys are re-rendered from the integers, zero-padded to the digit count of
# the smallest key (the first one after sorting). For non-negative keys this
# padding never adds anything, so the resulting keys are plain ``str(int)``.
width = len(str(items[0][0])) if items else 0
ordered = {}
for number, value in items:
    ordered[str(number).zfill(width)] = value

with open(out_path, "w", encoding="utf-8") as f:
    json.dump(ordered, f, ensure_ascii=False, indent=2)

print(f"✅ Ordered JSON written to {out_path}")
!pip install num2words
from num2words import num2words
def approx_num(raw_number, text):
    """Return True if a Swedish spelling of ``raw_number`` occurs in ``text``.

    Args:
        raw_number: The number as a string of digits (e.g. ``"1984"``).
        text: The text searched (by substring) for a spelled-out form.

    Returns:
        ``True`` when any generated spelling is a substring of ``text``;
        ``False`` otherwise, including when ``raw_number`` is not an integer.

    The candidate spellings are the cardinal and ordinal forms produced by
    ``num2words`` for Swedish, variants of those with spaces inserted around
    "tusen" / "hundra", and, for four-digit numbers, a "<first two digits>
    hundra <last two digits>" reading.
    """
    try:
        number = int(raw_number)
    except ValueError:
        return False

    forms = set()

    # Four-digit numbers: split into leading and trailing digit pairs. The
    # digits are sliced from the string (an int cannot be indexed), and both
    # trailing digits are used rather than only the third digit.
    if len(raw_number) == 4 and raw_number.isdecimal():
        head = num2words(int(raw_number[:2]), lang='sv')
        tail_value = int(raw_number[2:])
        if tail_value:
            tail = num2words(tail_value, lang='sv')
            forms.add(f"{head} hundra {tail}")
            forms.add(f"{head}hundra {tail}")
            forms.add(f"{head}hundra{tail}")
        else:
            # Nothing to say after "hundra" for round hundreds.
            forms.add(f"{head} hundra")
            forms.add(f"{head}hundra")

    cardinal = num2words(number, lang='sv', to='cardinal')
    ordinal = num2words(number, lang='sv', to='ordinal')
    forms.add(cardinal)
    forms.add(ordinal)

    # Accept spellings in which the compound is broken up around the
    # "tusen" / "hundra" element.
    for inner in ("tusen", "hundra"):
        for spelled in (cardinal, ordinal):
            if inner in spelled:
                forms.add(spelled.replace(inner, inner + " "))
                forms.add(spelled.replace(inner, " " + inner))
                forms.add(spelled.replace(inner, " " + inner + " "))

    return any(form in text for form in forms)
# Index shifts: starting at each listed key, the "orig" text is taken from the
# entry this many positions further on. A shift stays in force until the next
# listed key replaces it.
OFFSETS = {
    4155: 1,
    5144: 2,
}

rejigged = {}
offset = 0
for key, entry in ordered.items():
    index = int(key)
    # Entries past this key are not processed.
    if index > 5277:
        break
    offset = OFFSETS.get(index, offset)
    rejigged[key.zfill(4)] = {
        "res": entry["res"],
        "orig": ordered[str(index + offset)]["orig"],
    }
# First pass: accept the "res" text as the normalised form wherever it agrees
# with "orig", either exactly (underscores read as spaces) or once sentence
# punctuation and letter case are ignored.
for item, entry in rejigged.items():
    res = entry["res"]
    orig = entry["orig"]
    if res == orig.replace("_", " "):
        entry["norm"] = res
        continue
    norm_simple = res.translate(str.maketrans("", "", ".,?!")).lower()
    # NOTE(review): this comparison uses "orig" with its underscores intact,
    # unlike the check above — confirm that is intended.
    if orig == norm_simple:
        entry["norm"] = res
def simple_norm(res):
    """Return ``res`` lower-cased with every ".", ",", "?" and "!" removed."""
    without_punct = res.translate(str.maketrans("", "", ".,?!"))
    return without_punct.lower()
import re
def check_numbers_match(res, orig):
    """Return True if any all-digit word of ``res`` is spelled out in ``orig``.

    ``res`` is first passed through ``simple_norm`` and split on whitespace;
    each word consisting solely of digits is handed to ``approx_num`` together
    with ``orig``. The first successful match ends the search.
    """
    words = simple_norm(res).split()
    return any(
        re.fullmatch(r"\d+", word) and approx_num(word, orig)
        for word in words
    )
def capitalize_and_punctuate(text, punct="."):
    """Upper-case the first character of ``text`` and append ``punct``.

    The remainder of ``text`` is left untouched. A falsy ``text`` (such as
    the empty string) is returned unchanged, without ``punct``.
    """
    if text:
        return text[0].upper() + text[1:] + punct
    return text
# Hand-maintained override tables, keyed by four-character item key. They are
# consulted by the fallback loop that follows, for items that did not receive
# a "norm" value automatically.

# Items whose "res" text is used unchanged as "norm".
OK_WHISPER = [
"0004", "0111", "3956", "4811", "4699", "4676", "4665", "4651",
"0172", "0348", "0442", "0738", "0799", "0880", "0999", "1031",
"1075", "1098", "1177", "1279", "1320", "1441", "1493", "1501",
"1508", "1511", "1513", "1514", "1516", "1518", "1519", "1520",
"1524", "1090", "1917"
]
# Items whose "res" text only gets its first character upper-cased
# (no punctuation is appended).
UC_WHISPER = ["0079", "2537"]
# Items whose "res" text gets its first character upper-cased and a full stop
# appended.
PUNCT_WHISPER = ["0085", "4838", "0672", "1498", "1499", "1517", "1526", "1528", "1530", "1531",
"1533", "1535", "2150"]
# Items for which "orig" (underscores replaced by spaces) is used instead of
# "res", with its first character upper-cased and a full stop appended.
ORIG_UC_FS = [
"0001", "0011", "0045", "0052", "0064", "0065", "0063", "0070", "0080", "0084", "0113", "0123",
"0125", "0104", "0127", "2543", "3748", "0143", "0145", "5030", "5060", "5080", "5117", "5123",
"4858", "4717", "4707", "4703", "4695", "4576", "4484", "0158", "0129", "0151", "0165", "0171",
"0173", "1365", "1366", "2148", "2144", "2137", "2133", "2134"
]
# Items whose "norm" text is given verbatim here.
FULL_NORM = {
"0112": "Jag vill ta en titt i ert gevärsskåp. Det är allt.",
"5202": "Per Henricson.",
"5203": "Westergren presstödsnämnden.",
"5208": "Actinvest är ett seriöst bolag, säger en analytiker i Milano.",
"5185": "Microsoft-mus ingår.",
"5147": "Skalbarhet under Windows NT har förut varit en black om foten.",
"5138": "Palmquist som är vd på UB Networks i Sverige.",
"5143": "Henrik Lind har anställts som utvecklingschef på Living Questions.",
"3881": "I det nya EU är detta en omöjlighet.",
"4869": "ModelQuest Expert och kan hantera upp till 60 olika indata.",
"4728": "Norden Rolf Hallencreutz.",
"4726": "Lars Öquist, vd, TCM.",
"4706": "Lärarförbundets ordförande Christer Romilson.",
"4581": "Med vänliga hälsningar två Vilhelminabor.",
"5275": "Det gällde både konst, arkitektur, litteratur, musik, filosofi och vetenskap.",
"0128": "Han var fullblodspolitikern, rutinerad och oöm.",
}
# Second pass: items still lacking "norm" are resolved through the override
# tables. Anything not covered by a table is reported, together with a
# ready-made FULL_NORM-style line for it.
for item, entry in rejigged.items():
    if "norm" in entry:
        continue

    orig = entry["orig"].replace("_", " ")
    res = entry["res"]

    if item in OK_WHISPER:
        norm = res
    elif item in UC_WHISPER:
        norm = capitalize_and_punctuate(res, "")
    elif item in PUNCT_WHISPER:
        norm = capitalize_and_punctuate(res)
    elif item in ORIG_UC_FS:
        norm = capitalize_and_punctuate(orig)
    elif item in FULL_NORM:
        norm = FULL_NORM[item]
    else:
        print(f"⚠️ Mismatch at item {item}:")
        print(f" res: '{res}'")
        print(f" orig: '{orig}'")
        print(f". \"{item}\": \"{orig}\",")
        continue

    entry["norm"] = norm

with open("rejigged_whisper.json", "w", encoding="utf-8") as f:
    json.dump(rejigged, f, ensure_ascii=False, indent=2)
# Reload the saved result and make every "norm" text start with an upper-case
# letter and end with sentence-final punctuation, then save it again.
# The file was written as UTF-8 with non-ASCII characters left unescaped, so
# it must be read back as UTF-8 rather than with the platform default.
with open("rejigged_whisper.json", encoding="utf-8") as f:
    rejigged = json.load(f)

for item in rejigged:
    norm = rejigged[item].get("norm")
    # Skip items without a normalised text; an empty one has no first or last
    # character to inspect, so it is left as it is.
    if not norm:
        continue
    if not norm[0].isupper():
        norm = norm[0].upper() + norm[1:]
    if not norm.endswith((".", "!", "?")):
        norm += "."
    rejigged[item]["norm"] = norm

with open("rejigged_whisper.json", "w", encoding="utf-8") as f:
    json.dump(rejigged, f, ensure_ascii=False, indent=2)