# Parse pre-standard Irish: standardise the text with the Intergaelic API,
# run Stanza on the standardised tokens, and project the parse back onto the
# original tokens. (Colab notebook; incomplete.)
!pip install stanza
from __future__ import annotations
import json
import urllib.parse
import urllib.request
from dataclasses import dataclass
from typing import Dict, List, Optional, Sequence, Tuple
import stanza
STD_API = "https://cadhan.com/api/intergaelic/3.0"
# Pronoun forms Intergaelic may inject (only these are recognised)
PRON_FEATS: Dict[str, str] = {
"mé": "Person=1|Number=Sing",
"tú": "Person=2|Number=Sing",
"muid": "Person=1|Number=Plur",
"sinn": "Person=1|Number=Plur",
"sibh": "Person=2|Number=Plur",
"siad": "Person=3|Number=Plur",
}
PRON_FORMS = set(PRON_FEATS.keys())
# Stanza setup
stanza.download("ga", processors="tokenize,pos,lemma,depparse", verbose=False)
nlp = stanza.Pipeline(
    lang="ga",
    processors="tokenize,pos,lemma,depparse",
    tokenize_pretokenized=True,  # input is a List[List[str]], one inner list per sentence
    tokenize_no_ssplit=True,     # keep the given sentence boundaries as-is
    verbose=False,
)
def standardise(text: str, lang: str = "ga") -> List[Tuple[str, str]]:
"""Return list of (orig_chunk, std_chunk) rewrite units from Intergaelic."""
data = urllib.parse.urlencode({"foinse": lang, "teacs": text}).encode()
hdrs = {"Content-Type": "application/x-www-form-urlencoded", "Accept": "application/json"}
req = urllib.request.Request(STD_API, data, hdrs)
    with urllib.request.urlopen(req) as resp:
        # The API responds with a JSON list of [orig_chunk, std_chunk] pairs.
        return [tuple(x) for x in json.loads(resp.read())]
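
# Illustrative only (requires network access; the exact chunking is up to the
# API and may differ). A pre-standard synthetic verb form such as "bhíos"
# typically comes back as the analytic form plus an injected pronoun:
#   standardise("Bhíos ann")
#   # e.g. [('Bhíos', 'Bhí mé'), ('ann', 'ann')]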
@dataclass(frozen=True)
class Tok:
"""
A single OUTPUT token (orig_tok) aligned to a single STANZA token (std_tok).
If Intergaelic injected a pronoun (std has +1 token and last is PRON), it is stored
on the LAST token of the rewrite unit (inj_pron).
"""
orig_tok: str
std_tok: str
inj_pron: Optional[str] = None
def expand_and_align(pairs: Sequence[Tuple[str, str]]) -> List[Tok]:
"""
Expand Intergaelic rewrite units into a flat token stream with strict alignment.
Allowed:
1) N→N by whitespace splitting: align positionally.
2) Injection: len(std_parts) == len(orig_parts) + 1 AND std_parts[-1] in PRON_FORMS.
Align shared prefix positionally; attach inj_pron to the LAST aligned token.
Everything else: raise (no guessing).
"""
out: List[Tok] = []
for i, (orig_chunk, std_chunk) in enumerate(pairs):
orig_parts = (orig_chunk or "").split()
std_parts = (std_chunk or "").split()
        # Treat an empty std chunk as identity on the orig side (rare edge case).
if not std_parts and orig_parts:
out.extend(Tok(o, o) for o in orig_parts)
continue
if len(orig_parts) == len(std_parts):
out.extend(Tok(o, s) for o, s in zip(orig_parts, std_parts))
continue
if len(std_parts) == len(orig_parts) + 1 and std_parts[-1].lower() in PRON_FORMS:
inj = std_parts[-1].lower()
shared = std_parts[:-1]
if len(shared) != len(orig_parts):
raise ValueError(
f"Internal alignment error at pair {i}: orig={orig_chunk!r} std={std_chunk!r}"
)
for j, (o, s) in enumerate(zip(orig_parts, shared)):
out.append(Tok(o, s, inj_pron=inj if j == len(orig_parts) - 1 else None))
continue
raise ValueError(
f"Unsupported Intergaelic mapping at pair index {i}: "
f"orig={orig_chunk!r} ({len(orig_parts)} toks) std={std_chunk!r} ({len(std_parts)} toks)"
)
return out
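
# Quick sanity check of the two allowed mappings. The rewrite units below are
# shaped like (hypothetical) Intergaelic output: "Bhíos" -> "Bhí" + injected "mé".
assert expand_and_align([("Bhíos", "Bhí mé"), ("ann", "ann")]) == [
    Tok("Bhíos", "Bhí", inj_pron="mé"),
    Tok("ann", "ann"),
]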
def compute_spaceafter(raw_text: str, orig_tokens: List[str]) -> List[bool]:
"""
True => there WAS whitespace after this token in raw_text (so no SpaceAfter=No)
False => no whitespace after (emit SpaceAfter=No)
Monotonic substring alignment; raises if a token can't be located.
"""
flags: List[bool] = []
pos = 0
n = len(raw_text)
for i, tok in enumerate(orig_tokens):
# Skip whitespace before token
while pos < n and raw_text[pos].isspace():
pos += 1
# Prefer exact match at current position
if raw_text.startswith(tok, pos):
start = pos
else:
start = raw_text.find(tok, pos)
if start == -1:
raise ValueError(f"Could not align token {i} {tok!r} near pos {pos}")
end = start + len(tok)
pos = end
if pos >= n:
flags.append(True) # end-of-text
else:
flags.append(raw_text[pos].isspace())
return flags
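
# Sanity check: "ann" is immediately followed by "." in the raw text, so it
# gets False (which will surface as SpaceAfter=No in the CoNLL-U output).
assert compute_spaceafter("Bhíos ann.", ["Bhíos", "ann", "."]) == [True, False, True]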
def sentences_from_tokens(tokens: Sequence[Tok]) -> List[List[int]]:
"""
Sentence segmentation over the STANZA token stream:
end sentence at . ! ? on std_tok.
Returns sentences as lists of indices into `tokens`.
"""
sents: List[List[int]] = []
buf: List[int] = []
for i, t in enumerate(tokens):
buf.append(i)
if t.std_tok in {".", "!", "?"}:
sents.append(buf)
buf = []
if buf:
sents.append(buf)
return sents
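
# Sanity check: the split is driven by the standardised token, and trailing
# material with no final . ! ? still forms its own sentence.
_demo_toks = [Tok("Bhíos", "Bhí", inj_pron="mé"), Tok("ann", "ann"), Tok(".", ".")]
assert sentences_from_tokens(_demo_toks + [Tok("Maith", "Maith")]) == [[0, 1, 2], [3]]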
def feats_to_dict(feats: str) -> Dict[str, str]:
if not feats or feats == "_":
return {}
out: Dict[str, str] = {}
for part in feats.split("|"):
if "=" in part:
k, v = part.split("=", 1)
out[k] = v
return out
def dict_to_feats(d: Dict[str, str]) -> str:
if not d:
return "_"
return "|".join(f"{k}={v}" for k, v in sorted(d.items()))
def merge_feats_preserve(base: str, add: str) -> str:
"""
Merge without overwriting existing keys (so we don't stomp on Stanza if it already
provided Person/Number).
"""
bd = feats_to_dict(base)
ad = feats_to_dict(add)
for k, v in ad.items():
bd.setdefault(k, v)
return dict_to_feats(bd)
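
# Stanza's existing Person=3 is kept; only the missing Number is filled in.
assert merge_feats_preserve("Person=3", "Person=1|Number=Sing") == "Number=Sing|Person=3"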
def merge_misc(*items: str) -> str:
parts: List[str] = []
for it in items:
if it and it != "_":
parts.append(it)
return "_" if not parts else "|".join(parts)
def choose_rep_word(words, idxs: List[int]) -> int:
"""
Representative word for lemma/POS/feats/deprel/head among a group.
We only group when we *decide* to later; here each Tok is 1:1 with a stanza word.
Keep this for possible future extension; currently idxs will be length 1.
"""
for i in idxs:
if (words[i].upos or "") != "PRON":
return i
return idxs[0]
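
# Minimal check with stand-in word objects (only .upos is consulted):
# the first non-PRON word wins; an all-PRON group falls back to the first index.
from types import SimpleNamespace
_ws = [SimpleNamespace(upos="PRON"), SimpleNamespace(upos="VERB")]
assert choose_rep_word(_ws, [0, 1]) == 1
assert choose_rep_word([_ws[0]], [0]) == 0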
def project_with_stanza(raw_text: str, lang: str = "ga") -> str:
"""
Outputs CoNLL-U:
- Tokenization = aligned original tokens (after expanding orig chunks by whitespace)
- Stanza run on aligned standardized tokens
- Injected pronoun (Intergaelic-only) contributes Person/Number to the LAST token
in the rewrite unit (Tok.inj_pron), never creates a token.
- SpaceAfter=No derived from raw_text spacing.
"""
pairs = standardise(raw_text, lang)
toks = expand_and_align(pairs)
orig_tokens = [t.orig_tok for t in toks]
spaceafter = compute_spaceafter(raw_text, orig_tokens)
sents = sentences_from_tokens(toks)
pretok: List[List[str]] = [[toks[i].std_tok for i in sent] for sent in sents]
doc = nlp(pretok)
out: List[str] = []
for sid, (sent_idxs, sent_doc) in enumerate(zip(sents, doc.sentences), 1):
        # Rebuild "# text" from the original spacing so the comment line stays
        # consistent with the SpaceAfter=No annotations emitted below.
        raw_text_sent = "".join(
            toks[i].orig_tok + (" " if spaceafter[i] else "") for i in sent_idxs
        ).rstrip()
        std_slice = [toks[i].std_tok for i in sent_idxs]
        out += [
            f"# sent_id = {sid}",
            f"# text = {raw_text_sent}",
            f"# text_standard = {' '.join(std_slice)}",
        ]
words = sent_doc.words # 1 per pretokenized token in this sentence
# Map stanza word index (sentence-local) -> output token id (sentence-local)
# Here it's 1:1 by construction.
for widx, tok_i in enumerate(sent_idxs):
tid = widx + 1
t = toks[tok_i]
w = words[widx]
# Head remap: stanza head is 1-based within this sentence; 0=root
head_tid = w.head if (w.head is not None and w.head != 0) else 0
feats = w.feats or "_"
misc_parts: List[str] = []
if not spaceafter[tok_i]:
misc_parts.append("SpaceAfter=No")
if t.inj_pron is not None:
# Guaranteed to be in PRON_FEATS by expand_and_align()
feats = merge_feats_preserve(feats, PRON_FEATS[t.inj_pron])
misc_parts.append(f"InjPron={t.inj_pron}")
misc = merge_misc(*misc_parts)
out.append("\t".join([
str(tid),
t.orig_tok or "_",
w.lemma or "_",
w.upos or "_",
w.xpos or "_",
feats,
str(head_tid),
w.deprel or "_",
"_",
misc,
]))
out.append("")
return "\n".join(out)
conllu = project_with_stanza('Do leanadar ag "seasamh a gcirt" go dtí gur dhein Eoghan Rua Ó Néill, ag an mBeinn mBorb, gníomh díreach de shaghas an ghnímh a dhein driotháir a athar agus Aodh Rua Ó Dónaill ag Béal an Átha Buí deich mbliana agus daichead roimis sin.')
print(conllu)