import re
from typing import List, Dict, Optional

# Language-label abbreviations as they appear in the page text (always with a
# trailing period) mapped to English language names.  parse_etymology() builds
# its label regex from these keys and emits the mapped name as "language".
# NOTE(review): the abbreviations look like German-style labels
# ("air." ~ Old Irish, "kymr." ~ Welsh) -- confirm against the source edition.
LANG_MAP = {
    "air.": "Old Irish",
    "mir.": "Middle Irish",
    "nir.": "Modern Irish",
    "engl.": "English",
    "aengl.": "Old English",
    "mengl.": "Middle English",
    "anord.": "Old Norse",
    "aisl.": "Old Icelandic",
    "aschott.": "Old Scots",
    "lat.": "Latin",
    "kymr.": "Welsh",
    "korn.": "Cornish",
    "bret.": "Breton",
    "span.": "Spanish",
    "afranz.": "Old French",
}

# Cited-work abbreviations mapped to the canonical work name that
# parse_sources() emits as "work".  Several keys map to the same name so that
# both the typographic (’) and ASCII (') apostrophe spellings, and both the
# abbreviated and spelled-out "Book of Deer", are recognised.
WORK_MAP = {
    "Molloy": "Molloy",
    "Keat.": "Keating",
    "O’R.": "O’Reilly",
    "O'R.": "O’Reilly",
    "O’Clery": "O’Clery",
    "O'Clery": "O’Clery",
    "Atk.": "Atkinson",
    "Bk. of Deer": "Book of Deer",
    "Book of Deer": "Book of Deer",
    "Joyce": "Joyce",
}

# Regex alternation for the Roman numerals I through X.  It is applied with
# re.fullmatch() in parse_vgl_crossrefs() to recognise a volume token, so the
# order of the alternatives does not matter there.
ROMAN_RE = r"(?:I|II|III|IV|V|VI|VII|VIII|IX|X)"

def split_top_level_semicolons(text: str) -> List[str]:
    """Split *text* on semicolons that are not enclosed in parentheses.

    Parenthesis nesting is tracked character by character; an unmatched ')'
    never drives the nesting level below zero.  Each piece is stripped of
    surrounding whitespace and empty pieces are dropped.
    """
    pieces: List[str] = []
    nesting = 0
    start = 0  # index of the first character of the piece being collected
    for pos, char in enumerate(text):
        if char == "(":
            nesting += 1
        elif char == ")":
            if nesting > 0:
                nesting -= 1
        elif char == ";" and nesting == 0:
            segment = text[start:pos].strip()
            if segment:
                pieces.append(segment)
            start = pos + 1
    tail = text[start:].strip()
    if tail:
        pieces.append(tail)
    return pieces

def parse_neben(chunk: str) -> List[str]:
    """Return every ''...''-quoted form that directly follows the word 'neben'.

    The forms are returned in order of appearance; the list is empty when
    *chunk* has no such construction.
    """
    neben_re = re.compile(r"neben\s+''([^']+)''")
    return [hit.group(1) for hit in neben_re.finditer(chunk)]

def parse_phonetic_head(chunk: str) -> Optional[List[str]]:
    """Return the whitespace-separated tokens of a leading ''...'' span.

    The span must open the chunk (after leading whitespace is stripped).
    Returns None when the chunk does not start with such a span.
    """
    head = re.match(r"^''([^']+)''", chunk.strip())
    if head is None:
        return None
    # str.split() with no argument never produces empty tokens.
    return head.group(1).split()

def parse_gloss(chunk: str) -> Optional[str]:
    """Return the text of the first „...“ span in *chunk*, stripped.

    Returns None when no such span is present.
    """
    found = re.search(r"„([^“]+)“", chunk)
    if found is None:
        return None
    return found.group(1).strip()

def parse_gender(chunk: str) -> Optional[str]:
    """Return the grammatical gender marked by a standalone 'm.', 'f.' or 'n.'.

    Only the first marker in *chunk* is considered.  Returns "masculine",
    "feminine" or "neuter", or None when no marker is found.
    """
    # The letter must start a word (\b) and the period must not be followed by
    # a word character.  A plain trailing \b would be wrong here: after a
    # period it only matches when a word character follows, so the ordinary
    # "m. " / "m.," / end-of-string cases would never be recognised.
    m = re.search(r"\b([mfn])\.(?!\w)", chunk)
    if m is None:
        return None
    return {"m": "masculine", "f": "feminine", "n": "neuter"}[m.group(1)]

def parse_vgl_crossrefs(chunk: str) -> List[Dict[str, str]]:
    """Extract 'vgl.' cross-references from parenthesised notes in *chunk*.

    Handles: (vgl. II 251, 15), (vgl. II 251, 15. 266, 5), (vgl. I 263)

    Within one parenthetical, a Roman numeral (I-X) sets the current volume.
    Each following number is a page, and a number immediately after a page is
    taken as its line.  The volume carries over to later pages in the same
    parenthetical.  Pages seen before any volume are dropped.

    Returns a list of dicts with "volume" and "page", plus "line" when present.
    """
    out: List[Dict[str, str]] = []
    for par in re.findall(r"\(([^)]*vgl\.[^)]*)\)", chunk):
        # Take everything after the first "vgl.".  str.partition is used
        # instead of a `.*$` regex so that a line break inside the
        # parenthetical does not make the whole reference disappear.
        tail = par.partition("vgl.")[2]

        tokens = [t for t in re.split(r"[,\.\s]+", tail) if t]
        cur_vol: Optional[str] = None
        i = 0
        while i < len(tokens):
            tok = tokens[i]
            i += 1
            if re.fullmatch(ROMAN_RE, tok):
                cur_vol = tok
                continue
            if not tok.isdigit():
                # Filler such as "ff" is skipped.
                continue

            page = tok
            line = None
            # A number directly after the page is its line; consume it even
            # when the entry is later dropped for lack of a volume.
            if i < len(tokens) and tokens[i].isdigit():
                line = tokens[i]
                i += 1
            if cur_vol:
                entry = {"volume": cur_vol, "page": page}
                if line:
                    entry["line"] = line
                out.append(entry)
    return out

def parse_etymology(chunk: str) -> List[Dict[str, str]]:
    """Collect historical-stage forms introduced by a language label.

    Examples: air. bél; mir. bláth; aengl. bróc; anord. brókr
    Several forms after one label are split on whitespace
    (e.g. 'air. biaid bieid' gives two entries).

    The run of forms after a label ends at the first ';', ',', '(' or ')',
    or at an opening „ quote, so a gloss following the form
    (e.g. 'air. bróen „pluvia“') is not reported as a form.

    Returns a list of {"language": <name from LANG_MAP>, "form": <form>}.
    """
    out: List[Dict[str, str]] = []

    # Longest labels first so the alternation prefers e.g. "aengl." to "engl.".
    labels = sorted(LANG_MAP.keys(), key=len, reverse=True)
    pattern = r"\b(" + "|".join(map(re.escape, labels)) + r")\s+([^;,()„]+)"
    for abbr, forms_blob in re.findall(pattern, chunk):
        for raw_form in forms_blob.split():
            # Drop punctuation left at the end of a form.
            form = raw_form.rstrip(".,:;")
            if form:
                out.append({"language": LANG_MAP[abbr], "form": form})
    return out

def _split_forms(blob: str) -> List[str]:
    """Split a comma-separated run of cited forms, dropping empty pieces."""
    return [f.strip() for f in re.split(r",\s*", blob.strip()) if f.strip()]

def parse_sources(chunk: str) -> List[Dict[str, object]]:
    """Capture references to the works listed in WORK_MAP.

    Examples:
      - 'Molloy 49: áthúil' gives {work:"Molloy", page:"49", forms:["áthúil"]}
      - 'Keat.' or 'O’R.' gives {work:"Keating"} / {work:"O’Reilly"}
      - '(Bk. of Deer)' gives {work:"Book of Deer"}

    "Molloy" is only recognised when followed by a page number.  For every
    work, forms are captured only when they follow a colon; they run up to the
    next ')' or ';' and are split on commas.  Identical entries for the
    non-Molloy works are reported once.
    """
    out: List[Dict[str, object]] = []
    for m in re.finditer(r"\b(Molloy)\s+(\d+)(?::\s*([^);]+))?", chunk):
        entry: Dict[str, object] = {"work": WORK_MAP[m.group(1)], "page": m.group(2)}
        forms = _split_forms(m.group(3)) if m.group(3) else []
        if forms:
            entry["forms"] = forms
        out.append(entry)

    for key, label in WORK_MAP.items():
        if key == "Molloy":  # already handled
            continue

        # The key must not be glued to a word character on either side.
        # Lookarounds are used rather than \b because most keys end in a
        # period: \b after "." only matches when a word character follows, so
        # "Keat.," or "O’R.;" would never be found.  An optional ")" directly
        # after the key is consumed so "(Key): forms" still yields forms.
        pattern = r"(?<!\w)" + re.escape(key) + r"(?!\w)\)?(?::\s*([^);]+))?"
        for m in re.finditer(pattern, chunk):
            entry = {"work": label}
            if m.group(1):
                forms = _split_forms(m.group(1))
                if forms:
                    entry["forms"] = forms

            # Several keys map to the same label; avoid duplicate entries.
            if entry not in out:
                out.append(entry)
    return out

def extract_easy_entries(volume: str, page: str, section: str, page_text: str) -> List[Dict[str, object]]:
    """Split *page_text* into entries and run every field parser on each one.

    "&nbsp;" is replaced by a plain space, then the text is split on
    top-level semicolons.  Every resulting dict carries `volume`, `page`,
    `section` and `raw`; the keys `phonetic`, `alongside` (from 'neben'),
    `gloss`, `gender`, `see_section`, `etymology` and `source_refs` are added
    only when the corresponding parser finds something.  `alongside` is a
    single string when exactly one alternative was found, otherwise a list.
    """
    optional_fields = (
        ("gloss", parse_gloss),
        ("gender", parse_gender),
        ("see_section", parse_vgl_crossrefs),
        ("etymology", parse_etymology),
        ("source_refs", parse_sources),
    )

    cleaned = page_text.replace("&nbsp;", " ")
    collected: List[Dict[str, object]] = []
    for segment in split_top_level_semicolons(cleaned):
        record: Dict[str, object] = {
            "volume": volume,
            "page": page,
            "section": section,
            "raw": segment.strip(),
        }

        phonetic = parse_phonetic_head(segment)
        if phonetic:
            record["phonetic"] = phonetic

        neben_forms = parse_neben(segment)
        if len(neben_forms) == 1:
            record["alongside"] = neben_forms[0]
        elif neben_forms:
            record["alongside"] = neben_forms

        for field, parser in optional_fields:
            value = parser(segment)
            if value:
                record[field] = value

        collected.append(record)
    return collected
# Sample entry text using the same markup the parsers expect (''...'' for the
# phonetic head, „...“ for the gloss, ';' between entries).  It is not
# referenced anywhere in the visible part of this file.
P4 = """''pus'' „lippe“, mir. bus; ''gax'' „jeder“, gach, air. cach cech; r. práis prás „messing“, mengl. bras; nir. blaosc, mir. blaesc, kymr. blisc „schale“, nir. plaosg, manx. pleaysc, kymr. plisg"""
def enumerate_entries(entries: List[Dict[str, object]], start: int = 78) -> None:
    """Assign a running "id" to every entry, in place.

    The id has the form "<volume>_<section>_<number>", where the number
    begins at *start* and increases by one per entry.
    """
    for number, record in enumerate(entries, start):
        record["id"] = f"{record['volume']}_{record['section']}_{number}"

def write_json(
    path: str = "/private/tmp/irish-attested-pronunciations/finck/raw/",
    data: Optional[List[Dict[str, object]]] = None,
    section: str = "4",
) -> None:
    """Write *data* as pretty-printed UTF-8 JSON to <path>/section<section>.json.

    Args:
        path: Directory to write into; it must already exist.
        data: JSON-serialisable payload.  When omitted, an empty list is
            written.
        section: Suffix used in the output file name.

    Non-ASCII characters are written as-is (ensure_ascii=False).
    """
    import json
    import os

    # A None default replaces the former shared mutable `[]` default; the
    # file written for an omitted `data` is unchanged.
    payload = [] if data is None else data
    target = os.path.join(path, f"section{section}.json")
    with open(target, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)
# Page text fed to extract_easy_entries() below.  Entries are separated by
# semicolons; ''...'' marks phonetic forms and „...“ marks glosses.  The
# trailing .replace() turns every line break into a space, so the value the
# parsers receive is a single line.
TEXT = """''ǵē'' „gans“, mir. géd; ''ǵēg'' „äst“, mir. géc;
''ǵ<u>ǝ</u>r'' „scharf“, air. gér; ''ǵlē'' „eiweiss“, air. glé; ''ǵlēs'' „kleid, instrument“;
''ǵlēsĭm'' „bekleide, mache fertig“, mir. glés (vgl. 
II 275,25 ff.); ''ǵȴō'' „lärm“, mir. gleó; ''ǵlōvr̥'' von ''ǵlō''; ''ǵrēsī''
„schuhmacher“, zu mir. gréss; ''hŕēš'' „nach“, tar éis; ''hūkr̥'' 
(II 139 irrtümlich: ''hūḱr̥''), engl. hooker; ''īntəx'' (neben ''iəntəx'') 
„erstaunt“, mir. ingantach; ''īntəs'' (neben ''iəntəs'') „wunder“, mir. 
ingantus; ''ī'' „nacht“ (neben ''ihə iə īhə''), air. aidche; ''ī īhə'' „essen“
(neben ''īə iə'') (Molloy 81: ighthe), mir. ithe; ''īhŕ̥'' „ackerfeld“,
mir. ithir; ''īxtr̥'' „unterer teil“, air. íchtar; ''īm'' „esse“, air. ithim 
(aber ''īm'' „butter“ mit eingipfliger exspiration); ''kā'' „spreu“, air. 
cáith; ''kȧ kǡv'', verbalsubst. zu ''kȧhĭm'' „verzehre“ etc., mir. 
caithem; ''klai'' „steinumzäunung“ (vgl. II 153), mir. clad; ''kōtə'' 
„rock“, engl. coat; ''kū'' (neben ''kuə'') „kummer“, mir. cuma; ''kūŋ'' 
„eng“, air. cumang; ''kūŕ'' „gegenwart“, mir. comair; 
''kūŕt'' (neben ''kuəŕc'') „besuch“, air. cuairt (dagegen ''kūŕc'' „court“ in der regel 
eingipflig); ''ḱǡŕ'' „vier“ neben ''ḱȧhŕ̥'', air. cethir; ''ḱē'' „wer, was, 
obwohl“, air. cia; ''ḱēxtə'' (vgl. II 284, 1) (Molloy 33: céchta)
„pflug“, mir. cécht; ''ḱēsĭm'' „kreuzige“, air. céssaim; ''ḱēšc'' „frage“
(neben ''ḱešc'' II 284, 2); ''ḱēšcuə'' „examen“, mir. ceist, lat. quaestio;
''ḱēš'' „sau“, céis, O’R.; ''ḱēv'' „schiffslände“, engl. quay (vgl. II 
284, 5); ''ḱlēv'' gen. sing. und nom./acc. plur. zu ''ḱliəv'', cléibh;
''ḱō, ḱōvr̥, ḱōbr̥nəx'' (vgl. II 284, 32) „nebel, neblig, nebligkeit“,
mir. ceó (zu ''ḱōbr̥nəx'' nicht ''ḱobr̥nəx'' wie irrtümlich II 284, 32,
folgendes: ceobhraonach (O’R.) „mizzling, misty“, von ceobhráin
„heavy dew falling like rain“, ceobhráon ceobhrán „small rain, 
mizzling rain“; braon „tropfen“, Keat., air. bróen „pluvia“,
Z.-E. 31); ''ḱŕē'' „thon, erde“, air. cré; ''ḱŕīx'' „ende“, air. crích 
(Molloy 23: criach); ''ḱīx'' „weibliche brust“, mir. cích (Molloy
23: ciach); ''ḱŕīst'' „Christus“, air. Críst; ''l̄aibrərī'' „bibliothek“, 
engl. library; ''l̄ai l̄aiə'' „liegen“, luighe (Molloy 80: luigheadh),
air. lige; ''l̄aiĭm'' „liege“, mir. laigim; ''l̄aiəd'' „kleinheit“, mir. 
laiget; ''l̄aiəŕḱīn'' (vgl. II 285, 3), mir. ladar; ''l̄auə l̄au'' „verfaulen“, 
''l̄auĭm'' „verfaule“, air. lobad; ''l̄ā'' „tag“, air. láthe etc.; ''l̄āx'' (neben 
''l̄ahəx'') „schmutz“, mir. lathach; ''l̄uə l̄ū'' (vgl. 251, 16) „weniger“,
air. lugu;
""".replace("\n", " ")
# Parse TEXT as volume I, page 6, section 4; number the entries from 79 and
# write them out as section "4b".
entries = list(extract_easy_entries("I", page="6", section="4", page_text=TEXT))

enumerate_entries(entries, start=79)

write_json(data=entries, section="4b")
# Lowercase a-z written with styled (italic-looking) Unicode letters instead
# of ASCII.  Not referenced anywhere in the visible part of this file.
italics = "𝘢𝘣𝘤𝘥𝘦𝘧𝘨𝘩𝘪𝘫𝘬𝘭𝘮𝘯𝘰𝘱𝘲𝘳𝘴𝘵𝘶𝘷𝘸𝘹𝘺𝘻"