Parse pre-standard Irish via standardiser
Status: partial, incomplete. Pre-standard text is sent to the Intergaelic standardiser, the standardised tokens are parsed with Stanza, and the analysis is projected back onto the original forms as CoNLL-U.
%%capture
!pip install stanza
import urllib.parse, urllib.request, json
import stanza
STD_API = "https://cadhan.com/api/intergaelic/3.0"
def standardise(text: str, lang: str = "ga"):
    """Return a list of (orig_tok, std_tok) pairs from Intergaelic."""
    data = urllib.parse.urlencode({"foinse": lang, "teacs": text}).encode()
    hdrs = {"Content-Type": "application/x-www-form-urlencoded",
            "Accept": "application/json"}
    req = urllib.request.Request(STD_API, data, hdrs)
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read())
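A quick shape check for the standardiser (illustrative; the exact pairs depend on the live Intergaelic service):
# Illustrative only: the response is a JSON array of
# [original_token, standardised_token] pairs.
for orig, std in standardise("a mbíonns acub"):
    print(f"{orig!r} -> {std!r}")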
stanza.download("ga", processors="tokenize,pos,lemma,depparse", verbose=False)
nlp = stanza.Pipeline(
    lang="ga",
    processors="tokenize,pos,lemma,depparse",
    # We supply the tokens and sentence splits ourselves,
    # so Stanza must not re-tokenise or re-split.
    tokenize_pretokenized=True,
    no_ssplit=True,
    verbose=False
)
from typing import List, Tuple
def _split_std(std: str, orig: str) -> List[str]:
    """Return the token(s) that should feed Stanza for this pair."""
    if not std.strip():
        return [orig]
    return std.split()
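Small sanity checks for the splitting rules (pure function, no API call; the pairs are hand-made examples, not real API output):
# Hand-made (std, orig) pairs illustrating the three cases.
assert _split_std("agus", "agus") == ["agus"]        # unchanged token
assert _split_std("", "Gaedhilge") == ["Gaedhilge"]  # empty standard form: keep original
assert _split_std("in a", "ina") == ["in", "a"]      # one original token -> two standard tokens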
def _sentences_from_pairs(pairs: List[Tuple[str, str]]):
    """Very light sentence splitter: keep everything up to . ! ?"""
    sent, buf = [], []
    for i, (orig, std) in enumerate(pairs):
        parts = _split_std(std, orig)
        for j, part in enumerate(parts):
            buf.append((i, j, len(parts), orig, part))
            if part in {".", "!", "?"}:
                sent.append(buf); buf = []
    if buf:
        sent.append(buf)
    return sent
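A toy run of the splitter on hand-made (orig, std) pairs (the standardisations here are illustrative, not API output):
demo_pairs = [("Tá", "Tá"), ("sé", "sé"), ("annso", "anseo"), (".", "."),
              ("Imthigh", "Imigh"), ("!", "!")]
for sent in _sentences_from_pairs(demo_pairs):
    print([m[4] for m in sent])   # standardised tokens, one list per sentence
# -> ['Tá', 'sé', 'anseo', '.'] then ['Imigh', '!']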
def project_with_stanza(raw_text: str, lang: str = "ga") -> str:
    pairs = standardise(raw_text, lang)
    sents = _sentences_from_pairs(pairs)
    pretok = [[m[4] for m in sent] for sent in sents]
    doc = nlp(pretok)
    conllu_lines = []
    for sid, (sent_map, sent_doc) in enumerate(zip(sents, doc.sentences), 1):
        raw_slice = [m[3] for m in sent_map if m[1] == 0]
        std_slice = [m[4] for m in sent_map]
        conllu_lines += [
            f"# sent_id = {sid}",
            f"# text = {' '.join(raw_slice)}",
            f"# text_standard = {' '.join(std_slice)}",
        ]
        # token lines
        widx = 0
        tid = 1
        for m in sent_map:
            orig_i, sub_i, n_sub, orig_tok, std_tok = m
            word = sent_doc.words[widx]
            # One original token split into several standard tokens:
            # emit a multiword-token range line carrying the original form.
            if sub_i == 0 and n_sub > 1:
                conllu_lines.append(f"{tid}-{tid+n_sub-1}\t{orig_tok}\t_\t_\t_\t_\t_\t_\t_\t_")
            # FORM is the original surface token where possible; the
            # analysis (lemma, POS, parse) comes from the standard token.
            form = orig_tok if n_sub == 1 else std_tok
            conllu_lines.append("\t".join([
                str(tid),
                form,
                word.lemma or "_",
                word.upos or "_",
                word.xpos or "_",
                word.feats or "_",
                # HEAD may legitimately be 0 (root), so test against None.
                str(word.head) if word.head is not None else "_",
                word.deprel or "_",
                "_",
                "_",
            ]))
            widx += 1
            tid += 1
        conllu_lines.append("")
    return "\n".join(conllu_lines)
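The result is plain CoNLL-U text, so it can go straight to disk; the helper and filename below are ad hoc, not part of the pipeline:
# Ad-hoc helper: write a projected parse to a .conllu file.
def save_conllu(raw_text: str, path: str = "projected.conllu"):
    with open(path, "w", encoding="utf-8") as f:
        f.write(project_with_stanza(raw_text) + "\n")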
# A second pipeline that tokenises raw text itself, for comparison.
nlp_tok = stanza.Pipeline(
    lang="ga",
    processors="tokenize,pos,lemma,depparse",
    tokenize_pretokenized=False,
    verbose=False
)
raw = "E-, ‘firing range’ a mbíonns acub agus é seo agus é siúd."
pp = project_with_stanza(raw)
print(pp)
# For comparison, parse the same raw text without standardisation.
lines = "{:C}".format(nlp_tok(raw)).split("\n")
print("\n".join(lines))
!pip install pytesseract opencv-python-headless
!sudo apt install -y tesseract-ocr tesseract-ocr-gle
import requests
import numpy as np
import cv2
def read_image_from_url(url):
    response = requests.get(url)
    response.raise_for_status()
    image_array = np.frombuffer(response.content, np.uint8)
    image = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
    return image
import pytesseract
def extract_text_from_bbox(image_path, bbox, lang="gle"):
    image = cv2.imread(image_path)
    # Extract the region of interest
    x1, y1, x2, y2 = bbox
    roi = image[y1:y2, x1:x2]
    # Convert the ROI to grayscale before OCR
    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    text = pytesseract.image_to_string(gray, lang=lang)
    return text
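Usage sketch; the filename and bbox coordinates below are placeholders for a local scan:
# Hypothetical usage: 'scan.png' and the coordinates are placeholders.
# text = extract_text_from_bbox("scan.png", (297, 681, 725, 742), lang="gle")
# print(text)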
from IPython.display import display, Image
import io
def extract_text_from_bbox_and_url(url, bbox, lang="gle"):
    image = read_image_from_url(url)
    # Extract the region of interest
    x1, y1, x2, y2 = bbox
    roi = image[y1:y2, x1:x2]
    # Convert the ROI to grayscale for OCR
    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    # Show the cropped region inline so the crop can be checked visually
    _, buffer = cv2.imencode('.png', roi)
    io_buf = io.BytesIO(buffer)
    display(Image(io_buf.getvalue()))
    text = pytesseract.image_to_string(gray, lang=lang)
    return text
import requests
from bs4 import BeautifulSoup
def get_image_selector_from_url(url, selector):
    req = requests.get(url)
    assert req.status_code == 200, f"Failed to fetch {url}"
    soup = BeautifulSoup(req.text, 'html.parser')
    element = soup.select_one(selector)
    if element:
        return element['src']
    else:
        return None
def get_image_from_data(url, selector, bbox_text, lang="gle"):
    bbox = [int(x) for x in bbox_text.split(" ")]
    img = get_image_selector_from_url(url, selector)
    assert img is not None, f"No element matched {selector!r}"
    return extract_text_from_bbox_and_url(img, bbox, lang=lang)
b = get_image_from_data("https://www.leighleat.com/pages/1803", "#ajax-page-container > div > div:nth-child(2) > img", "297 681 725 742")
lines = "{:C}".format(nlp_tok(b)).split("\n")
print("\n".join(lines))
cor = "Nuair a thagann Brídín abhaile, ordaíonn a mamaí di í féin a ghlanadh."
lines = "{:C}".format(nlp_tok(cor)).split("\n")
print("\n".join(lines))