import os, re
from collections import defaultdict
import pandas as pd

FILE_DIR = "/kaggle/input/split-braxen-by-language"
MIN_ENTRIES = 1000
SKIP_CODES = {"afr","asi","aus","sla","mix","fisa"}
SAMPLES_PER_FILE = 50

def has_cyrillic(s):
    """Return True if any character of ``s`` falls in a Cyrillic code-point range.

    The ranges checked are U+0400-U+04FF, U+0500-U+052F, U+2DE0-U+2DFF and
    U+A640-U+A69F. An empty string yields False.
    """
    cyrillic_ranges = (
        (0x0400, 0x04FF),
        (0x0500, 0x052F),
        (0x2DE0, 0x2DFF),
        (0xA640, 0xA69F),
    )
    return any(
        low <= ord(char) <= high
        for char in s
        for low, high in cyrillic_ranges
    )

def has_greek(s):
    """Return True if any character of ``s`` falls in a Greek code-point range.

    The ranges checked are U+0370-U+03FF and U+1F00-U+1FFF. An empty string
    yields False.
    """
    code_points = map(ord, s)
    return any(
        0x0370 <= cp <= 0x03FF or 0x1F00 <= cp <= 0x1FFF
        for cp in code_points
    )

# Maps a language code to the set of accented letters treated as a hint for
# that language. The main loop compares these keys directly against the code
# taken from each ``braxen-<code>.txt`` file name (``code in langs``), so a key
# must be spelled exactly like the file code or the file's own letters get
# flagged. A file whose code has no entry here can never match, so any of its
# words that hit another language's set are reported.
DIACRITICS = {
    "pol": set("ąćęłńóśźżĄĆĘŁŃÓŚŹŻ"),
    "cze": set("áéíóúýčďěňřšťůžÁÉÍÓÚÝČĎĚŇŘŠŤŮŽ"),
    "slk": set("áäčďéíľĺňóôŕšťúýžÁÄČĎÉÍĽĹŇÓÔŔŠŤÚÝŽ"),
    # Was a mojibake string (it contained a CJK ideograph); Slovenian uses č, š, ž.
    "slv": set("čšžČŠŽ"),
    "hrv": set("čćđšžČĆĐŠŽ"),
    "srp": set("čćđšžČĆĐŠŽ"),
    "rom": set("ăâîșţșțĂÂÎȘŢȚ"),
    "hun": set("áéíóöőúüűÁÉÍÓÖŐÚÜŰ"),
    "tur": set("çğıöşüÇĞİÖŞÜ"),
    "lit": set("ąčęėįšųūžĄČĘĖĮŠŲŪŽ"),
    "lav": set("āčēģīķļņšūžĀČĒĢĪĶĻŅŠŪŽ"),
    # Keyed 'ger', not 'deu': the German file is braxen-ger.txt, and with the
    # old key its own umlauts were reported as foreign.
    "ger": set("äöüßÄÖÜ"),
    "fre": set("àâæçéèêëîïôœùûüÿÀÂÆÇÉÈÊËÎÏÔŒÙÛÜŸ"),  # <-- 'fre', not 'fra'
    "spa": set("áéíñóúüÁÉÍÑÓÚÜ"),
    "por": set("áâãàçéêíóôõúÁÂÃÀÇÉÊÍÓÔÕÚ"),
    # NOTE(review): 'slk', 'rom' and 'isl' may not match the dataset's file
    # codes either -- confirm against the actual file names.
    "isl": set("áéíóúýðþæöÁÉÍÓÚÝÐÞÆÖ"),
}

def file_code_from_name(fname):
    """Extract the code from a ``braxen-<code>.txt`` file name.

    Only the final path component is examined. When it does not carry both
    the ``braxen-`` prefix and the ``.txt`` suffix, that component is returned
    unchanged.
    """
    prefix, suffix = "braxen-", ".txt"
    name = os.path.basename(fname)
    matches = name.startswith(prefix) and name.endswith(suffix)
    return name[len(prefix):len(name) - len(suffix)] if matches else name

def read_text_safely(path):
    """Read ``path`` as UTF-8 text, tolerating invalid byte sequences.

    The file is read as raw bytes and decoded strictly first; if that fails,
    it is decoded again with undecodable bytes silently dropped.
    """
    with open(path, "rb") as handle:
        raw = handle.read()
    try:
        text = raw.decode("utf-8")
    except UnicodeDecodeError:
        # Best effort: keep everything that is valid UTF-8, discard the rest.
        text = raw.decode("utf-8", errors="ignore")
    return text

# A token starts with a word character that is neither a decimal digit nor an
# underscore, and continues with word characters, apostrophes (' or ’), or
# hyphen/dash characters (-, U+2011, U+2013, U+2014).
WORD_RE = re.compile(r"[^\W\d_][\w’'\-\u2011\u2013\u2014]*", flags=re.UNICODE)

def tokenize_words(text):
    """Return every WORD_RE match found in ``text``, in order of appearance."""
    return [match.group() for match in WORD_RE.finditer(text)]

def detect_diacritic_langs(word):
    """List the DIACRITICS codes whose character set overlaps ``word``.

    Codes are returned in DIACRITICS' own iteration order; the list is empty
    when ``word`` contains none of the listed characters.
    """
    return [
        lang
        for lang, marks in DIACRITICS.items()
        if not marks.isdisjoint(word)
    ]

def is_bulgarian_like_cyrillic(word):
    """Heuristically decide whether a Cyrillic word looks Bulgarian.

    The check is case-insensitive. A word qualifies if it contains "ъ", or if
    it ends in one of "ът", "та", "то", "те" and contains none of "ь", "ы",
    "ё".
    """
    lowered = word.lower()
    if "ъ" in lowered:
        return True
    if not lowered.endswith(("ът", "та", "то", "те")):
        return False
    # Any of these letters vetoes the suffix-based match.
    return not any(letter in lowered for letter in "ьыё")

# Czech marks are probed directly by the POL and CHI special cases below.
CZE_DIACS = DIACRITICS["cze"]

# os.listdir() returns names in arbitrary order; sort them so the files are
# always processed, and therefore reported, in the same sequence.
files = [os.path.join(FILE_DIR, f) for f in sorted(os.listdir(FILE_DIR))
         if f.startswith("braxen-") and f.endswith(".txt")]

# Unique tokens per file, plus the size of each set for the MIN_ENTRIES filter.
file_word_sets, file_sizes = {}, {}
for path in files:
    text = read_text_safely(path)
    words = tokenize_words(text)
    uniq = set(words)
    file_word_sets[path] = uniq
    file_sizes[path] = len(uniq)

# Keep only files with enough unique tokens whose code is not in SKIP_CODES.
candidate_paths = []
for p, n in file_sizes.items():
    code = file_code_from_name(p)
    if n >= MIN_ENTRIES and code not in SKIP_CODES:
        candidate_paths.append(p)

print(f"Found {len(candidate_paths)} large non-catch-all files (>= {MIN_ENTRIES} entries).")

# One (word, current_code, suggested, reason) tuple per flag raised.
records = []
# per_file_flags[file code][reason] -> number of flags raised.
per_file_flags = defaultdict(lambda: defaultdict(int))

def add_record(word, current_code, suggested, reason):
    """Store one flagged word and bump the matching per-file reason counter."""
    entry = (word, current_code, suggested, reason)
    records.append(entry)
    reason_counts = per_file_flags[current_code]
    reason_counts[reason] += 1

# Run every heuristic over each candidate file and record the words it flags.
for path in candidate_paths:
    code = file_code_from_name(path)
    # Iterating a set of strings follows hash order, which can differ between
    # interpreter runs. Sort once so the recorded rows, and the head() samples
    # taken from them when the reports are written, are reproducible.
    uniq_words = sorted(file_word_sets[path])

    # Pass 1: words containing Greek or Cyrillic characters in a file whose
    # code is not one of the codes expected to use that script.
    for w in uniq_words:
        if has_greek(w) and code != "gre":
            add_record(w, code, "gre", "Greek script in non-Greek file")
        elif has_cyrillic(w) and code not in {"rus","ukr","bul","mkd","srp"}:
            add_record(w, code, "cyrillic?", "Cyrillic characters in non-Cyrillic file")

    # Pass 2: accented Latin letters that match some DIACRITICS set but not
    # the set of the file's own code.
    for w in uniq_words:
        if all(ord(ch) < 128 for ch in w):
            continue  # pure ASCII cannot match any diacritic set
        langs = detect_diacritic_langs(w)
        if not langs:
            continue
        if code in langs:
            continue
        # suppress French diacritics flagged inside Arabic file
        if code == "ara" and "fre" in langs:
            continue
        suggested = langs[0] if len(langs) == 1 else "ambiguous(" + ",".join(sorted(langs)) + ")"
        add_record(w, code, suggested, f"Contains diacritics typical of {','.join(sorted(langs))}")

    # File-specific checks. These run in addition to pass 2, so a word may be
    # recorded under more than one reason.
    if code == "lat":
        for w in uniq_words:
            if any(ch in DIACRITICS["lit"] for ch in w):
                add_record(w, code, "lit", "Lithuanian diacritics in LAT")
            elif any(ch in DIACRITICS["lav"] for ch in w):
                add_record(w, code, "lav", "Latvian diacritics in LAT")

    if code == "rus":
        for w in uniq_words:
            if has_cyrillic(w) and is_bulgarian_like_cyrillic(w):
                add_record(w, code, "bul", "Bulgarian hard vowel/definite article in RUS")

    if code == "pol":
        for w in uniq_words:
            if any(ch in CZE_DIACS for ch in w):
                add_record(w, code, "cze", "Czech diacritics in POL")

    if code == "chi":
        for w in uniq_words:
            # "Zhéng" is explicitly exempted from the Czech check.
            if any(ch in CZE_DIACS for ch in w) and w not in {"Zhéng"}:
                add_record(w, code, "cze", "Czech diacritics in CHI")

# Detailed report: one row per flagged (word, file, suggestion, reason).
detailed_df = pd.DataFrame(records, columns=["word", "current_code", "suggested_code", "reason"]).drop_duplicates()
detailed_df.to_csv("braxen_obvious_misclassifications_filtered.csv", index=False, encoding="utf-8")

# Per-file summary: total flags plus one column per distinct reason.
summary_rows = []
for code, counters in per_file_flags.items():
    total = sum(counters.values())
    row = {"file_code": code, "total_flags": total}
    row.update(counters)
    summary_rows.append(row)
if summary_rows:
    summary_df = pd.DataFrame(summary_rows).sort_values("total_flags", ascending=False)
else:
    # Nothing was flagged. A frame built from an empty list has no
    # "total_flags" column, so sort_values() would raise KeyError; write a
    # header-only summary instead, as the samples file does below.
    summary_df = pd.DataFrame(columns=["file_code", "total_flags"])
summary_df.to_csv("braxen_obvious_misclassifications_per_file_summary.csv", index=False, encoding="utf-8")

# Samples: the first SAMPLES_PER_FILE detailed rows for each file code.
sample_rows = []
if not detailed_df.empty:
    for _code, sub in detailed_df.groupby("current_code"):
        sample_rows.append(sub.head(SAMPLES_PER_FILE))
    pd.concat(sample_rows, ignore_index=True).to_csv("braxen_obvious_misclassifications_samples.csv", index=False, encoding="utf-8")
else:
    pd.DataFrame(columns=["word","current_code","suggested_code","reason"]).to_csv(
        "braxen_obvious_misclassifications_samples.csv", index=False, encoding="utf-8"
    )

print("Wrote:")
print(" - braxen_obvious_misclassifications_filtered.csv")
print(" - braxen_obvious_misclassifications_per_file_summary.csv")
print(" - braxen_obvious_misclassifications_samples.csv")

# Notebook preview of the first rows of both frames.
from IPython.display import display
display(summary_df.head(20))
display(detailed_df.head(50))
Found 12 large non-catch-all files (>= 1000 entries).
Wrote:
 - braxen_obvious_misclassifications_filtered.csv
 - braxen_obvious_misclassifications_per_file_summary.csv
 - braxen_obvious_misclassifications_samples.csv
/usr/local/lib/python3.11/dist-packages/pandas/io/formats/format.py:1458: RuntimeWarning: invalid value encountered in greater
  has_large_values = (abs_vals > 1e6).any()
/usr/local/lib/python3.11/dist-packages/pandas/io/formats/format.py:1459: RuntimeWarning: invalid value encountered in less
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
/usr/local/lib/python3.11/dist-packages/pandas/io/formats/format.py:1459: RuntimeWarning: invalid value encountered in greater
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
/usr/local/lib/python3.11/dist-packages/pandas/io/formats/format.py:1458: RuntimeWarning: invalid value encountered in greater
  has_large_values = (abs_vals > 1e6).any()
/usr/local/lib/python3.11/dist-packages/pandas/io/formats/format.py:1459: RuntimeWarning: invalid value encountered in less
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
/usr/local/lib/python3.11/dist-packages/pandas/io/formats/format.py:1459: RuntimeWarning: invalid value encountered in greater
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
file_code total_flags Contains diacritics typical of lav Contains diacritics typical of deu,slk Contains diacritics typical of cze,hun,isl,pol,por,slk,spa Contains diacritics typical of cze,hun,isl,por,slk,spa Contains diacritics typical of deu,hun,isl,tur Contains diacritics typical of por Contains diacritics typical of fre,isl Contains diacritics typical of deu,hun,isl,slk,tur ... Contains diacritics typical of cze,deu,hun,isl,pol,por,slk,spa,tur Contains diacritics typical of pol Contains diacritics typical of deu Contains diacritics typical of deu,fre,hun,isl,por,slk,tur Contains diacritics typical of hrv,srp Contains diacritics typical of cze,hun,isl,lav,por,slk,spa Contains diacritics typical of cze,isl,slk Contains diacritics typical of hrv,pol,srp Contains diacritics typical of deu,fre,hun,por,rom,spa,tur Bulgarian hard vowel/definite article in RUS
4 swe 155799 2.0 81218 89.0 277.0 59317.0 3.0 155.0 9671.0 ... 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 NaN
10 ger 938 NaN 210 NaN 1.0 236.0 NaN NaN NaN ... NaN NaN 21.0 NaN NaN NaN NaN NaN NaN NaN
11 fin 678 NaN 556 NaN 1.0 63.0 NaN NaN 58.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 dan 312 NaN 14 NaN NaN 30.0 NaN 259.0 5.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
7 nob 268 NaN 28 1.0 2.0 74.0 NaN 160.0 1.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 lat 39 2.0 3 NaN 2.0 3.0 NaN 18.0 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
5 eng 38 NaN 8 1.0 3.0 8.0 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
9 spa 13 NaN 1 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 fre 12 NaN 2 3.0 3.0 3.0 1.0 NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
0 ara 11 1.0 1 1.0 8.0 NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
8 ita 9 NaN 1 2.0 1.0 NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
6 rus 8 NaN 1 NaN 1.0 NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN 1.0 NaN NaN 1.0

12 rows × 43 columns

word current_code suggested_code reason
0 Ḥayāh ara lav Contains diacritics typical of lav
1 ä ara ambiguous(deu,slk) Contains diacritics typical of deu,slk
2 Ómar ara ambiguous(cze,hun,isl,pol,por,slk,spa) Contains diacritics typical of cze,hun,isl,pol...
3 Bahá ara ambiguous(cze,hun,isl,por,slk,spa) Contains diacritics typical of cze,hun,isl,por...
4 Abdu'l-Bahá ara ambiguous(cze,hun,isl,por,slk,spa) Contains diacritics typical of cze,hun,isl,por...
5 Baháulláh ara ambiguous(cze,hun,isl,por,slk,spa) Contains diacritics typical of cze,hun,isl,por...
6 Bahá'u'lláh ara ambiguous(cze,hun,isl,por,slk,spa) Contains diacritics typical of cze,hun,isl,por...
7 Al-Qáda ara ambiguous(cze,hun,isl,por,slk,spa) Contains diacritics typical of cze,hun,isl,por...
8 Bahái ara ambiguous(cze,hun,isl,por,slk,spa) Contains diacritics typical of cze,hun,isl,por...
9 Nabíl-i ara ambiguous(cze,hun,isl,por,slk,spa) Contains diacritics typical of cze,hun,isl,por...
10 Suáad ara ambiguous(cze,hun,isl,por,slk,spa) Contains diacritics typical of cze,hun,isl,por...
11 Rósant fre ambiguous(cze,hun,isl,pol,por,slk,spa) Contains diacritics typical of cze,hun,isl,pol...
12 äppeltarte fre ambiguous(deu,slk) Contains diacritics typical of deu,slk
13 Falcón fre ambiguous(cze,hun,isl,pol,por,slk,spa) Contains diacritics typical of cze,hun,isl,pol...
14 Chát fre ambiguous(cze,hun,isl,por,slk,spa) Contains diacritics typical of cze,hun,isl,por...
15 ä fre ambiguous(deu,slk) Contains diacritics typical of deu,slk
16 USA-BÖRSERNA fre ambiguous(deu,hun,isl,tur) Contains diacritics typical of deu,hun,isl,tur
17 ö fre ambiguous(deu,hun,isl,tur) Contains diacritics typical of deu,hun,isl,tur
18 Guimón fre ambiguous(cze,hun,isl,pol,por,slk,spa) Contains diacritics typical of cze,hun,isl,pol...
19 Chà fre por Contains diacritics typical of por
20 fre ambiguous(cze,hun,isl,por,slk,spa) Contains diacritics typical of cze,hun,isl,por...
21 Ricör fre ambiguous(deu,hun,isl,tur) Contains diacritics typical of deu,hun,isl,tur
22 Alán fre ambiguous(cze,hun,isl,por,slk,spa) Contains diacritics typical of cze,hun,isl,por...
23 længere dan ambiguous(fre,isl) Contains diacritics typical of fre,isl
24 æteren dan ambiguous(fre,isl) Contains diacritics typical of fre,isl
25 Sætran dan ambiguous(fre,isl) Contains diacritics typical of fre,isl
26 Fölsgaards dan ambiguous(deu,hun,isl,tur) Contains diacritics typical of deu,hun,isl,tur
27 Grænsepatruljen dan ambiguous(fre,isl) Contains diacritics typical of fre,isl
28 Forældreløse dan ambiguous(fre,isl) Contains diacritics typical of fre,isl
29 sættes dan ambiguous(fre,isl) Contains diacritics typical of fre,isl
30 værløse dan ambiguous(fre,isl) Contains diacritics typical of fre,isl
31 fjernsynsfænomen dan ambiguous(fre,isl) Contains diacritics typical of fre,isl
32 undersögende dan ambiguous(deu,hun,isl,tur) Contains diacritics typical of deu,hun,isl,tur
33 faghæfte dan ambiguous(fre,isl) Contains diacritics typical of fre,isl
34 Rödby dan ambiguous(deu,hun,isl,tur) Contains diacritics typical of deu,hun,isl,tur
35 Karrebæk dan ambiguous(fre,isl) Contains diacritics typical of fre,isl
36 bibelfortællinger dan ambiguous(fre,isl) Contains diacritics typical of fre,isl
37 afhængig dan ambiguous(fre,isl) Contains diacritics typical of fre,isl
38 bæredygtig dan ambiguous(fre,isl) Contains diacritics typical of fre,isl
39 adækvanslæren dan ambiguous(fre,isl) Contains diacritics typical of fre,isl
40 Halkjær dan ambiguous(fre,isl) Contains diacritics typical of fre,isl
41 Matthæus dan ambiguous(fre,isl) Contains diacritics typical of fre,isl
42 næste dan ambiguous(fre,isl) Contains diacritics typical of fre,isl
43 trække dan ambiguous(fre,isl) Contains diacritics typical of fre,isl
44 bæredygtige dan ambiguous(fre,isl) Contains diacritics typical of fre,isl
45 fortæller dan ambiguous(fre,isl) Contains diacritics typical of fre,isl
46 tænkning dan ambiguous(fre,isl) Contains diacritics typical of fre,isl
47 grundværdier dan ambiguous(fre,isl) Contains diacritics typical of fre,isl
48 ildsjäle dan ambiguous(deu,slk) Contains diacritics typical of deu,slk
49 Skälskörs dan ambiguous(deu,hun,isl,slk,tur) Contains diacritics typical of deu,hun,isl,slk...