Spell-check the Braxen multilingual dictionary entries with Hunspell.
Some entries are known to be badly wrong; this flags them with suggestions.
# Install the Hunspell spell-checker, its C headers, and the Python binding.
!apt-get -qq update
!apt-get -qq install -y hunspell git libhunspell-dev
!pip install hunspell
# Print the path where the dictionaries will land (clone happens next).
!echo $PWD/dictionaries/dictionaries
# wooorm/dictionaries: one subdirectory per language, each with index.aff/index.dic.
!git clone https://github.com/wooorm/dictionaries
!pwd
import os, re, glob, pandas as pd, hunspell
# Directory holding the per-language input files (braxen-<code>.txt).
FILE_DIR = "/kaggle/input/split-braxen-by-language"
# Root of the cloned wooorm/dictionaries checkout (one dir per language code).
DICT_ROOT = "/kaggle/working/dictionaries/dictionaries" # as you showed
# Output path for the tab-separated results.
OUT_TSV = "hunspell_results.tsv"
# Files with fewer unique entries than this are skipped.
MIN_ENTRIES = 100
# File codes to skip: region/group buckets rather than single languages.
SKIP_CODES = {"afr","asi","aus","sla","mix","fisa"}
# Discover every (index.dic, index.aff) pair under DICT_ROOT, keyed by the
# language-directory name (e.g. "sv", "en-GB").
pairs = {}
for aff_path in glob.glob(os.path.join(DICT_ROOT, "*", "index.aff")):
    lang_dir = os.path.dirname(aff_path)
    dic_path = os.path.join(lang_dir, "index.dic")
    # Only keep complete pairs — an .aff without its .dic is unusable.
    if os.path.isfile(dic_path):
        pairs[os.path.basename(lang_dir)] = (dic_path, aff_path)
# Map each Braxen file code (mostly ISO 639-2/B three-letter codes) to an
# ordered list of candidate dictionary-directory names in the wooorm repo;
# load_hs() uses the first candidate that exists in `pairs`.
CODE2DICT = {
"lat":["la"],
"swe":["sv"],
"nob":["nb"],
"nno":["nn"],
"dan":["da"],
"isl":["is"],
"fin":["fi"],
"est":["et"],
"lav":["lv"],
"lit":["lt"],
"pol":["pl"],
"cze":["cs"],
"slk":["sk"],
"slv":["sl"],
"hrv":["hr"],
"srp":["sr-Latn"],
"bos":["bs"],
"mkd":["mk"],
"bul":["bg"],
"ukr":["uk"],
"rus":["ru"],
"deu":["de"],
"nld":["nl","dut"],
"eng":["en","en-GB","en-CA","en-AU","en-ZA"],
"fre":["fr"],
"ita":["it"],
"spa":["es","es-MX","es-AR","es-CL","es-ES"],
"por":["pt","pt-PT"],
# NOTE(review): in ISO 639-2, "rom" is Romani; Romanian is "ron"/"rum".
# Presumably the Braxen files use "rom" for Romanian — verify against the input.
"rom":["ro"],
"hun":["hu"],
"tur":["tr"],
"gre":["el"],
}
def read_text(p):
    """Read file *p* as UTF-8 text, silently dropping undecodable bytes.

    Fix: the original opened the file without ever closing it (leaked
    handle) and did a strict decode with an ``errors="ignore"`` fallback —
    which is exactly equivalent to a single ``errors="ignore"`` decode,
    since a strictly-decodable byte string is unchanged by "ignore".
    """
    with open(p, "rb") as fh:
        return fh.read().decode("utf-8", errors="ignore")
def get_words(text):
    """Return the first tab-separated field of every line in *text*.

    Blank lines (including a trailing newline) yield empty strings;
    the caller is responsible for filtering those out.
    """
    return [line.split("\t")[0] for line in text.split("\n")]
def file_code(p):
    """Extract the language code from a ``braxen-<code>.txt`` path.

    Paths whose basename lacks the ``braxen-`` prefix are returned
    as their bare basename, unchanged.
    """
    base = os.path.basename(p)
    if not base.startswith("braxen-"):
        return base
    # Strip the "braxen-" prefix and the 4-char ".txt" suffix.
    return base[len("braxen-"):-4]
def load_hs(dict_codes):
    """Return ``(HunSpell, code)`` for the first code found in ``pairs``.

    Tries each candidate dictionary code in order; returns ``(None, None)``
    when none of them has a discovered .dic/.aff pair.
    """
    for candidate in dict_codes:
        entry = pairs.get(candidate)
        if entry is not None:
            dic_path, aff_path = entry
            return hunspell.HunSpell(dic_path, aff_path), candidate
    return None, None
# Collect the braxen-*.txt inputs and load each one's unique word set.
files = [
    os.path.join(FILE_DIR, name)
    for name in os.listdir(FILE_DIR)
    if name.startswith("braxen-") and name.endswith(".txt")
]
file_words = {}
file_sizes = {}
for path in files:
    vocab = set(get_words(read_text(path)))
    file_words[path] = vocab
    file_sizes[path] = len(vocab)
# Keep only files that are big enough, not in a skip bucket, and mapped
# to at least one candidate dictionary.
candidates = []
for path, vocab_size in file_sizes.items():
    lang = file_code(path)
    eligible = (
        vocab_size >= MIN_ENTRIES
        and lang not in SKIP_CODES
        and lang in CODE2DICT
    )
    if eligible:
        candidates.append(path)
# Spell-check every eligible file's word list and collect one row per word:
# (file_code, word, OK|MISS, comma-joined suggestions for misses).
rows = []
for p in sorted(candidates):
    code = file_code(p)
    hs, used = load_hs(CODE2DICT[code])
    if not hs:
        print(f"skip {code}: dict not found for {CODE2DICT[code]}")
        continue
    words = sorted(file_words[p])
    print(f"{code}: {len(words)} tokens via {used}")
    for w in words:
        # Blank input lines produce empty tokens — skip them.
        if not w:
            continue
        # Skip single-character pure-ASCII tokens: too short to be meaningful.
        if all(ord(ch) < 128 for ch in w) and len(w) < 2:
            continue
        # Normalize Swedish ö/Ö to Danish/Norwegian ø/Ø before lookup.
        # BUG FIX: the set was {"nor","dan"}, but Norwegian files are keyed
        # "nob"/"nno" in CODE2DICT and "nor" can never reach this loop
        # (candidates require code in CODE2DICT), so Norwegian words were
        # never normalized. "nor" is kept for backward compatibility.
        if code in {"nor", "nob", "nno", "dan"}:
            wcheck = w.replace("ö", "ø").replace("Ö", "Ø")
        else:
            wcheck = w
        if hs.spell(wcheck):
            rows.append((code, w, "OK", ""))
        else:
            sugs = ", ".join(hs.suggest(wcheck))
            rows.append((code, w, "MISS", sugs))
# Assemble the results and write them as a tab-separated file.
df = pd.DataFrame(rows, columns=["file_code","word","status","suggestions"])
# sep="\t" makes to_csv emit TSV, matching the OUT_TSV extension.
df.to_csv(OUT_TSV, sep="\t", index=False, encoding="utf-8")
print(f"Wrote {OUT_TSV} with {len(df):,} rows")
# Preview the first rows in the notebook output.
df.head(20)