!apt-get -qq update
!apt-get -qq install -y hunspell git libhunspell-dev
!pip install hunspell
!echo $PWD/dictionaries/dictionaries
/kaggle/working/dictionaries/dictionaries
!git clone https://github.com/wooorm/dictionaries
!pwd
import os, re, glob, pandas as pd, hunspell

# Directory holding the per-language "braxen-<code>.txt" word lists.
FILE_DIR = "/kaggle/input/split-braxen-by-language"
# Directory whose immediate sub-directories each hold one Hunspell dictionary
# (index.aff + index.dic); here the inner "dictionaries" folder of the clone.
DICT_ROOT = "/kaggle/working/dictionaries/dictionaries"
# Tab-separated report written at the end of the run.
OUT_TSV = "hunspell_results.tsv"
# Files with fewer distinct first-column entries than this are not checked.
MIN_ENTRIES = 100
# File codes that are never checked, regardless of dictionary availability.
SKIP_CODES = {"afr","asi","aus","sla","mix","fisa"}

# Map each dictionary directory name to its (index.dic, index.aff) paths.
# A directory is only registered when both files are present.
pairs = {}
for aff_path in glob.glob(os.path.join(DICT_ROOT, "*", "index.aff")):
    lang_dir = os.path.dirname(aff_path)
    dic_path = os.path.join(lang_dir, "index.dic")
    if not os.path.isfile(dic_path):
        continue
    pairs[os.path.basename(lang_dir)] = (dic_path, aff_path)

# File code (the "<code>" in "braxen-<code>.txt") -> dictionary directory
# names to try, in order of preference; the first one present in `pairs` wins.
#
# NOTE(review): most keys use the older three-letter forms ("cze", "fre",
# "gre") while German and Dutch are keyed as "deu" / "nld", and "dut" is
# listed as a dictionary name rather than as a file code. Confirm these keys
# match the actual braxen-*.txt file names.
CODE2DICT = {
    # Latin
    "lat": ["la"],
    # Nordic
    "swe": ["sv"],
    "nob": ["nb"],
    "nno": ["nn"],
    "dan": ["da"],
    "isl": ["is"],
    # Finnic / Baltic
    "fin": ["fi"],
    "est": ["et"],
    "lav": ["lv"],
    "lit": ["lt"],
    # Slavic
    "pol": ["pl"],
    "cze": ["cs"],
    "slk": ["sk"],
    "slv": ["sl"],
    "hrv": ["hr"],
    "srp": ["sr-Latn"],
    "bos": ["bs"],
    "mkd": ["mk"],
    "bul": ["bg"],
    "ukr": ["uk"],
    "rus": ["ru"],
    # West Germanic
    "deu": ["de"],
    "nld": ["nl", "dut"],
    "eng": ["en", "en-GB", "en-CA", "en-AU", "en-ZA"],
    # Romance
    "fre": ["fr"],
    "ita": ["it"],
    "spa": ["es", "es-MX", "es-AR", "es-CL", "es-ES"],
    "por": ["pt", "pt-PT"],
    "rom": ["ro"],
    # Other
    "hun": ["hu"],
    "tur": ["tr"],
    "gre": ["el"],
}

def read_text(p):
    """Return the contents of the file at path *p* decoded as UTF-8.

    The file is read in binary mode and decoded strictly first; if that
    fails, it is decoded again with undecodable bytes dropped, so this
    function never raises UnicodeDecodeError.
    """
    # Use a context manager so the handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(p, "rb") as fh:
        raw = fh.read()
    try:
        return raw.decode("utf-8")
    except UnicodeDecodeError:
        return raw.decode("utf-8", errors="ignore")

def get_words(text):
    """Return the first tab-separated field of every line in *text*.

    Lines are split on "\\n" only, so a blank or trailing line yields an
    empty string and any "\\r" is kept as part of the field.
    """
    return [row.split("\t")[0] for row in text.split("\n")]

def file_code(p):
    """Return the code embedded in a "braxen-<code>.txt" path.

    The "braxen-" prefix and the last four characters of the base name are
    removed. Base names without that prefix are returned unchanged.
    """
    prefix = "braxen-"
    name = os.path.basename(p)
    if not name.startswith(prefix):
        return name
    return name[len(prefix):-4]

def load_hs(dict_codes):
    """Build a spell checker for the first usable dictionary in *dict_codes*.

    Returns a ``(HunSpell, code)`` tuple for the first code that has an entry
    in the module-level ``pairs`` mapping, or ``(None, None)`` when none of
    the codes has one.
    """
    chosen = next((c for c in dict_codes if c in pairs), None)
    if chosen is None:
        return None, None
    dic_path, aff_path = pairs[chosen]
    return hunspell.HunSpell(dic_path, aff_path), chosen

# Every "braxen-*.txt" word list found directly inside FILE_DIR.
files = [
    os.path.join(FILE_DIR, name)
    for name in os.listdir(FILE_DIR)
    if name.startswith("braxen-") and name.endswith(".txt")
]

# Distinct first-column entries per file, and how many there are.
file_words = {path: set(get_words(read_text(path))) for path in files}
file_sizes = {path: len(entries) for path, entries in file_words.items()}

# Keep only files that are large enough, not explicitly skipped, and whose
# code has a dictionary mapping.
candidates = [
    path
    for path, size in file_sizes.items()
    if size >= MIN_ENTRIES
    and file_code(path) not in SKIP_CODES
    and file_code(path) in CODE2DICT
]

# File codes whose words are spell-checked with ö/Ö mapped to ø/Ø.
# Candidates must be keys of CODE2DICT, where Norwegian is keyed as "nob" and
# "nno"; a bare "nor" never matches any key, so gating on it alone meant the
# mapping was silently never applied to Norwegian files. "nor" is kept so the
# previous condition is still honoured if such a key is ever added.
OSLASH_CODES = {"nob", "nno", "nor", "dan"}
OSLASH_TABLE = str.maketrans("öÖ", "øØ")

# One (file_code, word, status, suggestions) row per checked word.
rows = []
for p in sorted(candidates):
    code = file_code(p)
    hs, used = load_hs(CODE2DICT[code])
    if hs is None:
        print(f"skip {code}: dict not found for {CODE2DICT[code]}")
        continue
    words = sorted(file_words[p])
    print(f"{code}: {len(words)} tokens via {used}")
    map_oslash = code in OSLASH_CODES  # invariant for the whole file
    for w in words:
        if not w:
            continue
        # Single ASCII characters are not worth spell-checking.
        if len(w) < 2 and all(ord(ch) < 128 for ch in w):
            continue
        # The original spelling is what gets reported; only the form handed
        # to Hunspell is normalised.
        wcheck = w.translate(OSLASH_TABLE) if map_oslash else w
        if hs.spell(wcheck):
            rows.append((code, w, "OK", ""))
        else:
            sugs = ", ".join(hs.suggest(wcheck))
            rows.append((code, w, "MISS", sugs))

df = pd.DataFrame(rows, columns=["file_code","word","status","suggestions"])
df.to_csv(OUT_TSV, sep="\t", index=False, encoding="utf-8")
print(f"Wrote {OUT_TSV} with {len(df):,} rows")
df.head(20)
cze: 371 tokens via cs
dan: 1904 tokens via da
eng: 19700 tokens via en
skip fin: dict not found for ['fi']
fre: 5465 tokens via fr
gre: 817 tokens via el
hun: 361 tokens via hu
ita: 3051 tokens via it
lat: 3753 tokens via la
nob: 3763 tokens via nb
pol: 707 tokens via pl
por: 440 tokens via pt
rus: 1321 tokens via ru
spa: 2405 tokens via es
swe: 707415 tokens via sv
tur: 762 tokens via tr
ukr: 106 tokens via uk
Wrote hunspell_results.tsv with 752,322 rows
file_code word status suggestions
0 cze Adamkova OK
1 cze Allertova MISS Albertova, Gallertová, Albertov, Tolerovat
2 cze Babiš OK
3 cze Balazova MISS Balasova, Balažova, Balažová, Balážova, Balážo...
4 cze Banik OK
5 cze Banska MISS Bánská, Banka, Baska, Blanska, Banika, Baníka,...
6 cze Baranka OK
7 cze Bartecko MISS Bartečko, Barteckou, Bartesko, Bartecký, Barte...
8 cze Bartok OK
9 cze Bartosak MISS Bartošák, Barto sak, Barto-sak, Bartok
10 cze Batovska MISS Bátovská, Bítovska, Baťovská
11 cze Bejlek OK
12 cze Benice OK
13 cze Bezpecnostni MISS Bezpečnostní
14 cze Bittová OK
15 cze Blazíková MISS Blažíkova, Blažíková, Bazíková, Blaníková, Bla...
16 cze Boleslav OK
17 cze Boril MISS Bořil, Borli, Borik, Borl, Briol, Borel, Bortl...
18 cze Borivoj MISS Bořivoj, Borisov
19 cze Bouzkova OK