"""Process Finck.

Extract Finck data as JSON; partly generated.
"""
import re
from typing import List, Dict, Optional
# German language-abbreviation → English language name, as used in Finck's
# etymological notes (e.g. "air." = altirisch = Old Irish).  Overlapping
# keys ("engl." / "mengl." / "aengl.") are disambiguated by the consumer,
# which sorts keys longest-first before building its regex.
LANG_MAP = {
    "air.": "Old Irish",
    "mir.": "Middle Irish",
    "nir.": "Modern Irish",
    "engl.": "English",
    "aengl.": "Old English",
    "mengl.": "Middle English",
    "anord.": "Old Norse",
    "aisl.": "Old Icelandic",
    "aschott.": "Old Scots",
    "lat.": "Latin",
    "kymr.": "Welsh",
    "korn.": "Cornish",
    "bret.": "Breton",
    "span.": "Spanish",
    "afranz.": "Old French",
}
# Abbreviated work/author citations → canonical work names.  Both straight
# (') and typographic (’) apostrophe spellings are listed because the source
# text mixes them; values are deduplicated downstream.
WORK_MAP = {
    "Molloy": "Molloy",
    "Keat.": "Keating",
    "O’R.": "O’Reilly",
    "O'R.": "O’Reilly",
    "O’Clery": "O’Clery",
    "O'Clery": "O’Clery",
    "Atk.": "Atkinson",
    "Bk. of Deer": "Book of Deer",
    "Book of Deer": "Book of Deer",
    "Joyce": "Joyce",
}
# Roman numerals I–X for volume references.  NOTE(review): the alternation
# lists short forms first, so with re.search/re.match "III" would match as
# just "I"; the file only uses it with re.fullmatch, where backtracking
# makes the ordering harmless — keep it that way or reorder longest-first.
ROMAN_RE = r"(?:I|II|III|IV|V|VI|VII|VIII|IX|X)"
def split_top_level_semicolons(text: str) -> List[str]:
    """Split *text* on semicolons that sit outside any parentheses.

    Each fragment is stripped of surrounding whitespace; empty fragments
    are discarded.  An unmatched ')' never drives the depth below zero.
    """
    pieces: List[str] = []
    start = 0
    depth = 0
    for idx, ch in enumerate(text):
        if ch == '(':
            depth += 1
        elif ch == ')':
            depth = max(0, depth - 1)
        elif ch == ';' and depth == 0:
            piece = text[start:idx].strip()
            if piece:
                pieces.append(piece)
            start = idx + 1
    tail = text[start:].strip()
    if tail:
        pieces.append(tail)
    return pieces
def parse_neben(chunk: str) -> List[str]:
    """Return every alternative form introduced by German 'neben'
    ("alongside"), written between doubled apostrophes,
    e.g. "(neben ''iəntəx'')" → ["iəntəx"]."""
    return re.findall(r"neben\s+''([^']+)''", chunk)
def parse_phonetic_head(chunk: str) -> Optional[List[str]]:
    """Return the phonetic form(s) at the very start of the chunk.

    The head is the text between the leading ''…'' markers, split on
    whitespace.  Returns None when the chunk does not start with one.
    """
    head = re.match(r"''([^']+)''", chunk.strip())
    if head is None:
        return None
    return head.group(1).split()
def parse_gloss(chunk: str) -> Optional[str]:
    """Return the first German gloss between low/high quotes „…“, stripped,
    or None when the chunk carries no gloss."""
    found = re.search(r"„([^“]+)“", chunk)
    if found is None:
        return None
    return found.group(1).strip()
def parse_gender(chunk: str) -> Optional[str]:
    """Return the grammatical gender marked by a standalone 'm.', 'f.' or 'n.'.

    Returns "masculine" / "feminine" / "neuter" for the first marker found,
    or None when the chunk has no marker.  The letter must begin a word
    (so the 'm' in "mir." is not matched) and the period must not be
    followed by a word character.
    """
    # Bug fix: the old pattern ended in r"\.\b", which only matches when a
    # word character follows the period — so the normal case "m. " / "f.,"
    # (marker followed by space or punctuation) never matched at all.
    # (?!\w) accepts any non-word follower, including end of string.
    m = re.search(r"\b([mfn])\.(?!\w)", chunk)
    if not m:
        return None
    return {"m": "masculine", "f": "feminine", "n": "neuter"}[m.group(1)]
def parse_vgl_crossrefs(chunk: str) -> List[Dict[str, str]]:
    """
    Extract '(vgl. …)' cross references from a chunk.

    Handles: (vgl. II 251, 15), (vgl. II 251, 15. 266, 5), (vgl. I 263).
    Returns a list of {volume, page[, line]} dicts.  A later volume numeral
    resets the volume in scope; page numbers seen before any volume numeral
    are dropped.
    """
    # Exactly the ten numerals the original ROMAN_RE accepted under fullmatch.
    roman_volumes = {"I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X"}
    refs: List[Dict[str, str]] = []
    for inner in re.findall(r"\(([^)]*vgl\.[^)]*)\)", chunk):
        tail_match = re.search(r"vgl\.\s*(.*)$", inner)
        if tail_match is None:
            continue
        tokens = [t for t in re.split(r"[,\.\s]+", tail_match.group(1).strip()) if t]
        current_volume = None
        pos, count = 0, len(tokens)
        while pos < count:
            token = tokens[pos]
            pos += 1
            if token in roman_volumes:
                current_volume = token
            elif token.isdigit():
                # A number immediately after a page number is its line number.
                line = None
                if pos < count and tokens[pos].isdigit():
                    line = tokens[pos]
                    pos += 1
                if current_volume:
                    ref = {"volume": current_volume, "page": token}
                    if line:
                        ref["line"] = line
                    refs.append(ref)
            # anything else ("ff", stray words) is skipped
    return refs
def parse_etymology(chunk: str) -> List[Dict[str, str]]:
    """Collect historical-stage citations such as 'air. bél; mir. bláth'.

    A single label may be followed by several space-separated forms
    ('air. biaid bieid'); each form becomes its own entry.  Returns a list
    of {"language", "form"} dicts with trailing punctuation stripped.
    """
    # Longest abbreviations first so e.g. "mengl." is never matched as "engl.".
    abbrevs = sorted(LANG_MAP, key=len, reverse=True)
    label_re = re.compile(
        r"\b(" + "|".join(re.escape(a) for a in abbrevs) + r")\s+([^;,()]+)"
    )
    stages: List[Dict[str, str]] = []
    for abbrev, blob in label_re.findall(chunk):
        language = LANG_MAP[abbrev]
        for word in blob.split():
            word = word.rstrip(".,:;")
            if word:
                stages.append({"language": language, "form": word})
    return stages
def parse_sources(chunk: str, work_map: Optional[Dict[str, str]] = None) -> List[Dict[str, object]]:
    """
    Captures easy modern source references:
      - 'Molloy 49: áthúil' → {work:"Molloy", page:"49", forms:["áthúil"]}
      - 'Keat. breódhaim, breóghaim' → {work:"Keating", forms:[...]}
      - 'O’R.' → {work:"O’Reilly"} (form optional)
      - '(Bk. of Deer)' → {work:"Book of Deer"}

    Args:
        chunk: one semicolon-delimited entry chunk.
        work_map: abbreviation → canonical-name mapping; defaults to the
            module-level WORK_MAP (backward compatible).
    """
    mapping = WORK_MAP if work_map is None else work_map
    out: List[Dict[str, object]] = []
    # Molloy citations carry a page number, so they get a dedicated pattern.
    if "Molloy" in mapping:
        for m in re.finditer(r"\b(Molloy)\s+(\d+)(?::\s*([^);]+))?", chunk):
            entry: Dict[str, object] = {"work": mapping[m.group(1)], "page": m.group(2)}
            if m.group(3):
                forms = [f.strip() for f in re.split(r",\s*", m.group(3).strip()) if f.strip()]
                if forms:
                    entry["forms"] = forms
            out.append(entry)
    for key, label in mapping.items():
        if key == "Molloy":  # already handled with page numbers above
            continue
        # Bug fix: the old trailing alternative r"(?:\)|\b)" required a word
        # character after keys ending in '.' (e.g. "O’R.", "Keat."), so those
        # citations never matched when followed by a space, comma or end of
        # string.  (?!\w) accepts any non-word follower.
        pattern = r"(?:\(|\b)" + re.escape(key) + r"(?:\)|(?!\w))(?::\s*([^);]+))?"
        for m in re.finditer(pattern, chunk):
            entry = {"work": label}
            if m.group(1):
                forms = [f.strip() for f in re.split(r",\s*", m.group(1).strip()) if f.strip()]
                if forms:
                    entry["forms"] = forms
            # keys with straight/typographic apostrophes map to the same work
            if entry not in out:
                out.append(entry)
    return out
def extract_easy_entries(volume: str, page: str, section: str, page_text: str) -> List[Dict[str, object]]:
    """
    Minimal-but-useful pass:
      - returns a list of dicts with `volume`, `page`, `raw`
      - plus: phonetic, alongside (from 'neben'), gloss, gender, see_section, etymology, source_refs

    Each top-level semicolon-separated chunk of *page_text* becomes one
    entry; the optional keys are added only when the corresponding parser
    finds something in that chunk.
    """
    results = []
    # NOTE(review): the first argument below appears to be U+00A0 (no-break
    # space) being normalized to a plain space — the two characters are
    # visually identical; confirm the literal before editing this line.
    page_text = page_text.replace(" ", " ")
    for raw in split_top_level_semicolons(page_text):
        # every entry carries its provenance plus the untouched chunk text
        item = {
            "volume": volume,
            "page": page,
            "section": section,
            "raw": raw.strip()
        }
        head = parse_phonetic_head(raw)
        if head: item["phonetic"] = head
        alts = parse_neben(raw)
        if alts:
            # a single alternative is stored as a bare string, several as a list
            item["alongside"] = alts[0] if len(alts) == 1 else alts
        gloss = parse_gloss(raw)
        if gloss: item["gloss"] = gloss
        gender = parse_gender(raw)
        if gender: item["gender"] = gender
        refs = parse_vgl_crossrefs(raw)
        if refs: item["see_section"] = refs
        ety = parse_etymology(raw)
        if ety: item["etymology"] = ety
        src = parse_sources(raw)
        if src: item["source_refs"] = src
        results.append(item)
    return results
# Sample chunk from the source text kept for reference; not used elsewhere
# in the visible part of this file.
P4 = """''pus'' „lippe“, mir. bus; ''gax'' „jeder“, gach, air. cach cech; r. práis prás „messing“, mengl. bras; nir. blaosc, mir. blaesc, kymr. blisc „schale“, nir. plaosg, manx. pleaysc, kymr. plisg"""
def enumerate_entries(entries: List[Dict[str, object]], start: int = 78) -> None:
    """Assign every entry an "id" of the form "<volume>_<section>_<number>",
    numbering consecutively from *start*.  Mutates the dicts in place."""
    for number, record in enumerate(entries, start=start):
        record["id"] = f"{record['volume']}_{record['section']}_{number}"
def write_json(path = "/private/tmp/irish-attested-pronunciations/finck/raw/", data = None, section = "4"):
    """Write *data* to <path>/section<section>.json as pretty-printed UTF-8 JSON.

    Args:
        path: directory to write into (an extra '/' in the joined path is
            harmless on POSIX).
        data: JSON-serialisable payload; defaults to an empty list.
        section: suffix used in the output file name.
    """
    import json  # kept local, as in the original file

    # Bug fix: the old signature used a mutable default argument (data=[]);
    # use the None-sentinel idiom instead.  Behavior is unchanged for all
    # existing callers.
    if data is None:
        data = []
    with open(f"{path}/section{section}.json", "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
# Raw page text pasted from the source edition; newlines are folded to
# spaces so the whole page can be split on top-level semicolons below.
TEXT = """''ǵē'' „gans“, mir. géd; ''ǵēg'' „äst“, mir. géc;
''ǵ<u>ǝ</u>r'' „scharf“, air. gér; ''ǵlē'' „eiweiss“, air. glé; ''ǵlēs'' „kleid, instrument“;
''ǵlēsĭm'' „bekleide, mache fertig“, mir. glés (vgl.
II 275,25 ff.); ''ǵȴō'' „lärm“, mir. gleó; ''ǵlōvr̥'' von ''ǵlō''; ''ǵrēsī''
„schuhmacher“, zu mir. gréss; ''hŕēš'' „nach“, tar éis; ''hūkr̥''
(II 139 irrtümlich: ''hūḱr̥''), engl. hooker; ''īntəx'' (neben ''iəntəx'')
„erstaunt“, mir. ingantach; ''īntəs'' (neben ''iəntəs'') „wunder“, mir.
ingantus; ''ī'' „nacht“ (neben ''ihə iə īhə''), air. aidche; ''ī īhə'' „essen“
(neben ''īə iə'') (Molloy 81: ighthe), mir. ithe; ''īhŕ̥'' „ackerfeld“,
mir. ithir; ''īxtr̥'' „unterer teil“, air. íchtar; ''īm'' „esse“, air. ithim
(aber ''īm'' „butter“ mit eingipfliger exspiration); ''kā'' „spreu“, air.
cáith; ''kȧ kǡv'', verbalsubst. zu ''kȧhĭm'' „verzehre“ etc., mir.
caithem; ''klai'' „steinumzäunung“ (vgl. II 153), mir. clad; ''kōtə''
„rock“, engl. coat; ''kū'' (neben ''kuə'') „kummer“, mir. cuma; ''kūŋ''
„eng“, air. cumang; ''kūŕ'' „gegenwart“, mir. comair;
''kūŕt'' (neben ''kuəŕc'') „besuch“, air. cuairt (dagegen ''kūŕc'' „court“ in der regel
eingipflig); ''ḱǡŕ'' „vier“ neben ''ḱȧhŕ̥'', air. cethir; ''ḱē'' „wer, was,
obwohl“, air. cia; ''ḱēxtə'' (vgl. II 284, 1) (Molloy 33: céchta)
„pflug“, mir. cécht; ''ḱēsĭm'' „kreuzige“, air. céssaim; ''ḱēšc'' „frage“
(neben ''ḱešc'' II 284, 2); ''ḱēšcuə'' „examen“, mir. ceist, lat. quaestio;
''ḱēš'' „sau“, céis, O’R.; ''ḱēv'' „schiffslände“, engl. quay (vgl. II
284, 5); ''ḱlēv'' gen. sing. und nom./acc. plur. zu ''ḱliəv'', cléibh;
''ḱō, ḱōvr̥, ḱōbr̥nəx'' (vgl. II 284, 32) „nebel, neblig, nebligkeit“,
mir. ceó (zu ''ḱōbr̥nəx'' nicht ''ḱobr̥nəx'' wie irrtümlich II 284, 32,
folgendes: ceobhraonach (O’R.) „mizzling, misty“, von ceobhráin
„heavy dew falling like rain“, ceobhráon ceobhrán „small rain,
mizzling rain“; braon „tropfen“, Keat., air. bróen „pluvia“,
Z.-E. 31); ''ḱŕē'' „thon, erde“, air. cré; ''ḱŕīx'' „ende“, air. crích
(Molloy 23: criach); ''ḱīx'' „weibliche brust“, mir. cích (Molloy
23: ciach); ''ḱŕīst'' „Christus“, air. Críst; ''l̄aibrərī'' „bibliothek“,
engl. library; ''l̄ai l̄aiə'' „liegen“, luighe (Molloy 80: luigheadh),
air. lige; ''l̄aiĭm'' „liege“, mir. laigim; ''l̄aiəd'' „kleinheit“, mir.
laiget; ''l̄aiəŕḱīn'' (vgl. II 285, 3), mir. ladar; ''l̄auə l̄au'' „verfaulen“,
''l̄auĭm'' „verfaule“, air. lobad; ''l̄ā'' „tag“, air. láthe etc.; ''l̄āx'' (neben
''l̄ahəx'') „schmutz“, mir. lathach; ''l̄uə l̄ū'' (vgl. 251, 16) „weniger“,
air. lugu;
""".replace("\n", " ")
# Run the extraction pipeline over the pasted page and persist the result.
entries = extract_easy_entries("I", page="6", section="4", page_text=TEXT)
enumerate_entries(entries, start=79)
write_json(data=entries, section="4b")
# Unicode styled-italic lowercase alphabet; unused in the visible code —
# presumably kept for marking italicised forms in output.  TODO confirm.
italics = "𝘢𝘣𝘤𝘥𝘦𝘧𝘨𝘩𝘪𝘫𝘬𝘭𝘮𝘯𝘰𝘱𝘲𝘳𝘴𝘵𝘶𝘷𝘸𝘹𝘺𝘻"