Check the Braxen multilingual entries for obvious language misclassifications.
Some entries are filed under a clearly wrong language.
import os, re
from collections import defaultdict
import pandas as pd
# Directory that is scanned for per-language "braxen-<code>.txt" files.
FILE_DIR = "/kaggle/input/split-braxen-by-language"

# A file is only checked when it has at least this many unique tokens.
MIN_ENTRIES = 1000

# File codes that are never checked (reported as "catch-all" files).
SKIP_CODES = {
    "afr",
    "asi",
    "aus",
    "sla",
    "mix",
    "fisa",
}

# Upper bound on the rows per file code written to the samples CSV.
SAMPLES_PER_FILE = 50
def has_cyrillic(s):
    """Return True if any character of *s* lies in a Cyrillic Unicode block.

    The blocks checked are Cyrillic (U+0400-U+04FF), Cyrillic Supplement
    (U+0500-U+052F), Cyrillic Extended-A (U+2DE0-U+2DFF) and Cyrillic
    Extended-B (U+A640-U+A69F). An empty string yields False.
    """
    blocks = (
        (0x0400, 0x04FF),
        (0x0500, 0x052F),
        (0x2DE0, 0x2DFF),
        (0xA640, 0xA69F),
    )
    return any(
        first <= ord(char) <= last
        for char in s
        for first, last in blocks
    )
def has_greek(s):
    """Return True if any character of *s* lies in a Greek Unicode block.

    The blocks checked are Greek and Coptic (U+0370-U+03FF) and Greek
    Extended (U+1F00-U+1FFF). An empty string yields False.
    """
    blocks = ((0x0370, 0x03FF), (0x1F00, 0x1FFF))
    return any(
        first <= ord(char) <= last
        for char in s
        for first, last in blocks
    )
# Characters treated as typical of each language, keyed by file code.
# A word containing any character from a language's set counts as a hit for
# that language; many characters are shared, so one word can hit several.
DIACRITICS = {
    "pol": set("ąćęłńóśźżĄĆĘŁŃÓŚŹŻ"),
    "cze": set("áéíóúýčďěňřšťůžÁÉÍÓÚÝČĎĚŇŘŠŤŮŽ"),
    "slk": set("áäčďéíľĺňóôŕšťúýžÁÄČĎÉÍĽĹŇÓÔŔŠŤÚÝŽ"),
    # Slovenian uses only č, š and ž. The previous value was a mis-encoded
    # string (it even contained a CJK character), so Slovenian words were
    # never recognised as such.
    "slv": set("čšžČŠŽ"),
    "hrv": set("čćđšžČĆĐŠŽ"),
    "srp": set("čćđšžČĆĐŠŽ"),
    # NOTE(review): the lowercase run appears to list 'ș' twice; if one of
    # them was meant to be the legacy cedilla form, confirm and correct it.
    "rom": set("ăâîșţșțĂÂÎȘŢȚ"),
    "hun": set("áéíóöőúüűÁÉÍÓÖŐÚÜŰ"),
    "tur": set("çğıöşüÇĞİÖŞÜ"),
    "lit": set("ąčęėįšųūžĄČĘĖĮŠŲŪŽ"),
    "lav": set("āčēģīķļņšūžĀČĒĢĪĶĻŅŠŪŽ"),
    "deu": set("äöüßÄÖÜ"),
    "fre": set("àâæçéèêëîïôœùûüÿÀÂÆÇÉÈÊËÎÏÔŒÙÛÜŸ"),  # <-- 'fre', not 'fra'
    "spa": set("áéíñóúüÁÉÍÑÓÚÜ"),
    "por": set("áâãàçéêíóôõúÁÂÃÀÇÉÊÍÓÔÕÚ"),
    "isl": set("áéíóúýðþæöÁÉÍÓÚÝÐÞÆÖ"),
}
def file_code_from_name(fname):
    """Extract the language code from a "braxen-<code>.txt" path.

    Only the final path component is considered. If it does not have both
    the "braxen-" prefix and the ".txt" suffix, the component is returned
    unchanged.
    """
    prefix, suffix = "braxen-", ".txt"
    name = os.path.basename(fname)
    if not (name.startswith(prefix) and name.endswith(suffix)):
        return name
    return name[len(prefix):len(name) - len(suffix)]
def read_text_safely(path):
    """Read the file at *path* and return its content decoded as UTF-8.

    The file is read as raw bytes. If strict UTF-8 decoding fails, the bytes
    are decoded again with undecodable sequences dropped, so this function
    does not raise on malformed input.
    """
    with open(path, "rb") as handle:
        raw = handle.read()

    try:
        decoded = raw.decode("utf-8")
    except UnicodeDecodeError:
        # Best effort: silently discard whatever is not valid UTF-8.
        decoded = raw.decode("utf-8", errors="ignore")
    return decoded
# A token starts with a word character that is neither a decimal digit nor an
# underscore (in practice: a letter of any script). It continues with any mix
# of word characters, apostrophes (' or ’), hyphen-minus, non-breaking hyphen
# (U+2011), en dash (U+2013) and em dash (U+2014).
WORD_RE = re.compile(r"[^\W\d_][\w’'\-\u2011\u2013\u2014]*", flags=re.UNICODE)


def tokenize_words(text):
    """Return all WORD_RE matches found in *text*, in order of appearance."""
    return [match.group(0) for match in WORD_RE.finditer(text)]
def detect_diacritic_langs(word):
    """Return the codes of all languages whose diacritic set intersects *word*.

    Codes come back in the iteration order of DIACRITICS. The list is empty
    when *word* contains none of the listed characters.
    """
    return [
        lang
        for lang, marks in DIACRITICS.items()
        if not marks.isdisjoint(word)
    ]
def is_bulgarian_like_cyrillic(word):
    """Heuristically decide whether a Cyrillic *word* looks Bulgarian.

    The comparison is case-insensitive. A word qualifies when either

    * it contains the letter "ъ", or
    * it ends in "ът", "та", "то" or "те" and contains none of the letters
      "ь", "ы" and "ё".

    This is a rough heuristic, not a language identifier.
    """
    lowered = word.lower()
    if "ъ" in lowered:
        return True

    has_article_ending = lowered.endswith(("ът", "та", "то", "те"))
    has_excluded_letter = any(letter in lowered for letter in ("ь", "ы", "ё"))
    return has_article_ending and not has_excluded_letter
# Czech diacritic set, kept under a short name for the per-file special checks.
CZE_DIACS = DIACRITICS["cze"]

# Collect every "braxen-*.txt" file in FILE_DIR. os.listdir() returns names in
# arbitrary order, so sort them: the order of the files determines the order of
# the flagged rows, which should be reproducible between runs and filesystems.
files = [
    os.path.join(FILE_DIR, f)
    for f in sorted(os.listdir(FILE_DIR))
    if f.startswith("braxen-") and f.endswith(".txt")
]

# For each file: the set of unique tokens and the size of that set.
file_word_sets, file_sizes = {}, {}
for path in files:
    text = read_text_safely(path)
    words = tokenize_words(text)
    uniq = set(words)
    file_word_sets[path] = uniq
    file_sizes[path] = len(uniq)

# Keep only files that are large enough and whose code is not in SKIP_CODES.
candidate_paths = []
for p, n in file_sizes.items():
    code = file_code_from_name(p)
    if n >= MIN_ENTRIES and code not in SKIP_CODES:
        candidate_paths.append(p)

print(f"Found {len(candidate_paths)} large non-catch-all files (>= {MIN_ENTRIES} entries).")
# Every flag raised so far, as (word, current_code, suggested, reason) tuples.
records = []

# Flag counters, indexed as per_file_flags[current_code][reason].
per_file_flags = defaultdict(lambda: defaultdict(int))


def add_record(word, current_code, suggested, reason):
    """Register one flagged word.

    Appends the (word, current_code, suggested, reason) tuple to ``records``
    and increments the counter for *reason* under *current_code* in
    ``per_file_flags``.
    """
    flag = (word, current_code, suggested, reason)
    records.append(flag)

    reason_counts = per_file_flags[current_code]
    reason_counts[reason] = reason_counts[reason] + 1
# Scan every candidate file and record the words that look misfiled.
for path in candidate_paths:
    code = file_code_from_name(path)
    # Set iteration order changes between interpreter runs (string hash
    # randomisation). Sort once so that the flagged rows, and therefore the
    # rows picked for the samples file, are reproducible.
    uniq_words = sorted(file_word_sets[path])

    # Pass 1: foreign scripts. A word is flagged at most once here; the Greek
    # check takes precedence over the Cyrillic check.
    for w in uniq_words:
        if has_greek(w) and code != "gre":
            add_record(w, code, "gre", "Greek script in non-Greek file")
        elif has_cyrillic(w) and code not in {"rus", "ukr", "bul", "mkd", "srp"}:
            add_record(w, code, "cyrillic?", "Cyrillic characters in non-Cyrillic file")

    # Pass 2: diacritics that belong to other languages' sets but not to the
    # set of the file's own language.
    for w in uniq_words:
        if all(ord(ch) < 128 for ch in w):
            continue  # pure ASCII cannot contain any listed diacritic
        langs = detect_diacritic_langs(w)
        if not langs:
            continue
        if code in langs:
            continue  # consistent with the file's own language
        # suppress French diacritics flagged inside Arabic file
        if code == "ara" and "fre" in langs:
            continue
        suggested = langs[0] if len(langs) == 1 else "ambiguous(" + ",".join(sorted(langs)) + ")"
        add_record(w, code, suggested, f"Contains diacritics typical of {','.join(sorted(langs))}")

    # Pass 3: targeted checks for specific files. A file has exactly one code,
    # so at most one branch runs. These flags are recorded in addition to any
    # flag pass 2 already raised for the same word.
    if code == "lat":
        for w in uniq_words:
            if any(ch in DIACRITICS["lit"] for ch in w):
                add_record(w, code, "lit", "Lithuanian diacritics in LAT")
            elif any(ch in DIACRITICS["lav"] for ch in w):
                add_record(w, code, "lav", "Latvian diacritics in LAT")
    elif code == "rus":
        for w in uniq_words:
            if has_cyrillic(w) and is_bulgarian_like_cyrillic(w):
                add_record(w, code, "bul", "Bulgarian hard vowel/definite article in RUS")
    elif code == "pol":
        for w in uniq_words:
            if any(ch in CZE_DIACS for ch in w):
                add_record(w, code, "cze", "Czech diacritics in POL")
    elif code == "chi":
        for w in uniq_words:
            # "Zhéng" is explicitly exempted from this check.
            if any(ch in CZE_DIACS for ch in w) and w not in {"Zhéng"}:
                add_record(w, code, "cze", "Czech diacritics in CHI")
# --- Detailed report: one row per flagged word --------------------------------
detailed_df = pd.DataFrame(
    records, columns=["word", "current_code", "suggested_code", "reason"]
).drop_duplicates()
detailed_df.to_csv("braxen_obvious_misclassifications_filtered.csv", index=False, encoding="utf-8")

# --- Per-file summary: total flags plus one column per reason -----------------
summary_rows = []
for code, counters in per_file_flags.items():
    total = sum(counters.values())
    row = {"file_code": code, "total_flags": total}
    row.update(counters)
    summary_rows.append(row)
if summary_rows:
    summary_df = pd.DataFrame(summary_rows).sort_values("total_flags", ascending=False)
else:
    # Nothing was flagged. A frame built from an empty list has no columns, so
    # sorting by "total_flags" would raise KeyError; write a header-only file.
    summary_df = pd.DataFrame(columns=["file_code", "total_flags"])
summary_df.to_csv("braxen_obvious_misclassifications_per_file_summary.csv", index=False, encoding="utf-8")

# --- Samples: at most SAMPLES_PER_FILE rows per file code ---------------------
sample_rows = []
if not detailed_df.empty:
    for code, sub in detailed_df.groupby("current_code"):
        sample_rows.append(sub.head(SAMPLES_PER_FILE))
    pd.concat(sample_rows, ignore_index=True).to_csv(
        "braxen_obvious_misclassifications_samples.csv", index=False, encoding="utf-8"
    )
else:
    pd.DataFrame(columns=["word", "current_code", "suggested_code", "reason"]).to_csv(
        "braxen_obvious_misclassifications_samples.csv", index=False, encoding="utf-8"
    )

print("Wrote:")
print(" - braxen_obvious_misclassifications_filtered.csv")
print(" - braxen_obvious_misclassifications_per_file_summary.csv")
print(" - braxen_obvious_misclassifications_samples.csv")

# Show a preview. Outside a notebook IPython may be missing; the CSV files are
# already written at this point, so fall back to plain printing.
try:
    from IPython.display import display
except ImportError:
    display = print
display(summary_df.head(20))
display(detailed_df.head(50))