Filter Braxen with Hunspell
Local, incomplete
from pathlib import Path

# Directory holding hunspell_results.tsv and the braxen-<lang>.txt lexicons;
# the filtered lexicons are written here as well.
LANG_LOC = "/tmp"
LANG_PATH = Path(LANG_LOC)

# Both accumulators are keyed by the first column of hunspell_results.tsv:
#   ok_words[key] -> list of words whose status is "OK"
#   norms[key]    -> list of (original word, normalised spelling) pairs
ok_words, norms = {}, {}
def check_nobdan(word, suggestions):
    """Find a respelling of *word* with æ/ø/å among the given suggestions.

    Each entry of the substitution table (ASCII digraphs such as "ae", "oe",
    "aa", and the letters "ä"/"ö") is first tried on its own. If none of
    those single substitutions yields a suggestion, the word with every
    substitution applied is tried too, so that words mixing several of these
    spellings (e.g. "Aeroe" -> "Ærø") can still be matched.

    Args:
        word: The word to respell.
        suggestions: Candidate spellings as one ", "-separated string.

    Returns:
        A ``(word, replacement)`` tuple where ``replacement`` is one of the
        suggestions and differs from ``word``, or ``None`` if nothing matches.
    """
    pairs = {
        "ae": "æ",
        "Ae": "Æ",
        "AE": "Æ",
        "oe": "ø",
        "Oe": "Ø",
        "OE": "Ø",
        "aa": "å",
        "Aa": "Å",
        "AA": "Å",
        "ä": "æ",
        "Ä": "Æ",
        "ö": "ø",
        "Ö": "Ø",
    }
    candidates = set(suggestions.split(", "))

    # Pass 1: one substitution at a time, in table order.
    for old, new in pairs.items():
        if old not in word:
            # A no-op replacement would only "match" the word itself, which
            # must not be reported as its own normalisation.
            continue
        replaced = word.replace(old, new)
        if replaced in candidates:
            return (word, replaced)

    # Pass 2: all substitutions together, for words that mix spellings.
    combined = word
    for old, new in pairs.items():
        combined = combined.replace(old, new)
    if combined != word and combined in candidates:
        return (word, combined)

    return None
# Read the spell-check results and sort every word into ok_words or norms.
# Columns (tab-separated): key (header name "file_code"), word, status and an
# optional ", "-separated suggestion list.
# The encoding is given explicitly: the data contains non-ASCII letters and
# the platform default encoding is not guaranteed to be UTF-8.
with open(LANG_PATH / "hunspell_results.tsv", "r", encoding="utf-8") as f:
    for line in f:
        if line.startswith("file_code"):
            # Header row.
            continue
        parts = line.strip().split("\t")
        if len(parts) < 3:
            # Blank or truncated line: there is no word/status to classify.
            continue
        lang, word, status = parts[:3]
        # strip() above removes a trailing empty suggestions column.
        suggestions = parts[3] if len(parts) > 3 else ""

        # Register the language in both maps even if nothing gets appended,
        # so later code can index either one by any language seen here.
        lang_ok = ok_words.setdefault(lang, [])
        lang_norms = norms.setdefault(lang, [])

        if status == "OK":
            lang_ok.append(word)
        elif lang in ("nob", "dan"):
            # Norwegian Bokmål / Danish: accept æ/ø/å respellings.
            ck = check_nobdan(word, suggestions)
            if ck:
                lang_norms.append(ck)
        else:
            # Other languages: accept a suggestion that differs from the word
            # only in letter case. With several such suggestions the last wins.
            by_lower = {s.lower(): s for s in suggestions.split(", ")}
            match = by_lower.get(word.lower())
            if match is not None:
                lang_norms.append((word, match))
# For every language seen in the results, copy braxen-<lang>.txt to
# braxen-<lang>-filtered.txt: entries whose word was accepted are kept as-is,
# entries with a known normalisation are written with the normalised spelling,
# and all other entries are dropped.
for lang in ok_words:
    # Build the lookups once per language so each lexicon line costs an O(1)
    # membership test instead of a scan over the whole word list.
    accepted = set(ok_words[lang])
    norm_dict = dict(norms[lang])
    # Explicit UTF-8 on both sides: the words contain non-ASCII letters and
    # the platform default encoding is not guaranteed to be UTF-8.
    with open(LANG_PATH / f"braxen-{lang}.txt", encoding="utf-8") as f, \
            open(LANG_PATH / f"braxen-{lang}-filtered.txt", "w",
                 encoding="utf-8") as out_f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines carry no entry and would break the unpacking.
                continue
            # Exactly two tab-separated fields are expected; anything else
            # still raises ValueError so malformed input is not hidden.
            word, pron = line.split("\t")
            if word in accepted:
                out_f.write(f"{word}\t{pron}\n")
            elif word in norm_dict:
                out_f.write(f"{norm_dict[word]}\t{pron}\n")