Parse pre-standard Irish via standardiser
Status: partial, incomplete. Pre-standard text is sent to the Intergaelic standardiser, the standardised tokens are parsed with Stanza, and the analysis is projected back onto the original forms as CoNLL-U.
%%capture
!pip install stanza
import urllib.parse, urllib.request, json
import stanza
STD_API = "https://cadhan.com/api/intergaelic/3.0"
def standardise(text: str, lang: str = "ga"):
    """Return a list of (orig_tok, std_tok) pairs from Intergaelic."""
    data = urllib.parse.urlencode({"foinse": lang, "teacs": text}).encode()
    hdrs = {"Content-Type": "application/x-www-form-urlencoded",
            "Accept": "application/json"}
    req = urllib.request.Request(STD_API, data, hdrs)
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read())
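A quick shape check for the standardiser (illustrative; the exact pairs depend on the live Intergaelic service):
# Illustrative only: the response is a JSON array of
# [original_token, standardised_token] pairs.
for orig, std in standardise("a mbíonns acub"):
    print(f"{orig!r} -> {std!r}")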
stanza.download("ga", processors="tokenize,pos,lemma,depparse", verbose=False)
nlp = stanza.Pipeline(
    lang="ga",
    processors="tokenize,pos,lemma,depparse",
    # We supply the tokens and sentence splits ourselves,
    # so Stanza must not re-tokenise or re-split.
    tokenize_pretokenized=True,
    no_ssplit=True,
    verbose=False
)
from typing import List, Tuple
def _split_std(std: str, orig: str) -> List[str]:
    """Return the token(s) that should feed Stanza for this pair."""
    if not std.strip():
        return [orig]
    return std.split()
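Small sanity checks for the splitting rules (pure function, no API call; the pairs are hand-made examples, not real API output):
# Hand-made (std, orig) pairs illustrating the three cases.
assert _split_std("agus", "agus") == ["agus"]        # unchanged token
assert _split_std("", "Gaedhilge") == ["Gaedhilge"]  # empty standard form: keep original
assert _split_std("in a", "ina") == ["in", "a"]      # one original token -> two standard tokens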
def _sentences_from_pairs(pairs: List[Tuple[str, str]]):
    """Very light sentence splitter: keep everything up to . ! ?"""
    sent, buf = [], []
    for i, (orig, std) in enumerate(pairs):
        parts = _split_std(std, orig)
        for j, part in enumerate(parts):
            buf.append((i, j, len(parts), orig, part))
            if part in {".", "!", "?"}:
                sent.append(buf); buf = []
    if buf:
        sent.append(buf)
    return sent
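A toy run of the splitter on hand-made (orig, std) pairs (the standardisations here are illustrative, not API output):
demo_pairs = [("Tá", "Tá"), ("sé", "sé"), ("annso", "anseo"), (".", "."),
              ("Imthigh", "Imigh"), ("!", "!")]
for sent in _sentences_from_pairs(demo_pairs):
    print([m[4] for m in sent])   # standardised tokens, one list per sentence
# -> ['Tá', 'sé', 'anseo', '.'] then ['Imigh', '!']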
def project_with_stanza(raw_text: str, lang: str = "ga") -> str:
    pairs = standardise(raw_text, lang)
    sents = _sentences_from_pairs(pairs)
    pretok = [[m[4] for m in sent] for sent in sents]
    doc = nlp(pretok)
    conllu_lines = []
    for sid, (sent_map, sent_doc) in enumerate(zip(sents, doc.sentences), 1):
        raw_slice = [m[3] for m in sent_map if m[1] == 0]
        std_slice = [m[4] for m in sent_map]
        conllu_lines += [
            f"# sent_id = {sid}",
            f"# text = {' '.join(raw_slice)}",
            f"# text_standard = {' '.join(std_slice)}",
        ]
        # token lines
        widx = 0
        tid = 1
        for m in sent_map:
            orig_i, sub_i, n_sub, orig_tok, std_tok = m
            word = sent_doc.words[widx]
            # One original token split into several standard tokens:
            # emit a multiword-token range line carrying the original form.
            if sub_i == 0 and n_sub > 1:
                conllu_lines.append(f"{tid}-{tid+n_sub-1}\t{orig_tok}\t_\t_\t_\t_\t_\t_\t_\t_")
            # FORM is the original surface token where possible; the
            # analysis (lemma, POS, parse) comes from the standard token.
            form = orig_tok if n_sub == 1 else std_tok
            conllu_lines.append("\t".join([
                str(tid),
                form,
                word.lemma or "_",
                word.upos or "_",
                word.xpos or "_",
                word.feats or "_",
                # HEAD may legitimately be 0 (root), so test against None.
                str(word.head) if word.head is not None else "_",
                word.deprel or "_",
                "_",
                "_",
            ]))
            widx += 1
            tid += 1
        conllu_lines.append("")
    return "\n".join(conllu_lines)
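The result is plain CoNLL-U text, so it can go straight to disk; the helper and filename below are ad hoc, not part of the pipeline:
# Ad-hoc helper: write a projected parse to a .conllu file.
def save_conllu(raw_text: str, path: str = "projected.conllu"):
    with open(path, "w", encoding="utf-8") as f:
        f.write(project_with_stanza(raw_text) + "\n")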
# A second pipeline that tokenises raw text itself, for comparison.
nlp_tok = stanza.Pipeline(
    lang="ga",
    processors="tokenize,pos,lemma,depparse",
    tokenize_pretokenized=False,
    verbose=False
)
raw = "E-, ‘firing range’ a mbíonns acub agus é seo agus é siúd."
pp = project_with_stanza(raw)
print(pp)
# For comparison, parse the same raw text without standardisation.
lines = "{:C}".format(nlp_tok(raw)).split("\n")
print("\n".join(lines))
!pip install pytesseract opencv-python-headless
!sudo apt install -y tesseract-ocr tesseract-ocr-gle
import requests
import numpy as np
import cv2
def read_image_from_url(url):
    response = requests.get(url)
    response.raise_for_status()
    image_array = np.frombuffer(response.content, np.uint8)
    image = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
    return image
import pytesseract
def extract_text_from_bbox(image_path, bbox, lang="gle"):
    image = cv2.imread(image_path)
    # Extract the region of interest
    x1, y1, x2, y2 = bbox
    roi = image[y1:y2, x1:x2]
    # Convert the ROI to grayscale before OCR
    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    text = pytesseract.image_to_string(gray, lang=lang)
    return text
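Usage sketch; the filename and bbox coordinates below are placeholders for a local scan:
# Hypothetical usage: 'scan.png' and the coordinates are placeholders.
# text = extract_text_from_bbox("scan.png", (297, 681, 725, 742), lang="gle")
# print(text)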
from IPython.display import display, Image
import io
def extract_text_from_bbox_and_url(url, bbox, lang="gle"):
    image = read_image_from_url(url)
    # Extract the region of interest
    x1, y1, x2, y2 = bbox
    roi = image[y1:y2, x1:x2]
    # Convert the ROI to grayscale for OCR
    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    # Show the cropped region inline so the crop can be checked visually
    _, buffer = cv2.imencode('.png', roi)
    io_buf = io.BytesIO(buffer)
    display(Image(io_buf.getvalue()))
    text = pytesseract.image_to_string(gray, lang=lang)
    return text
import requests
from bs4 import BeautifulSoup
def get_image_selector_from_url(url, selector):
    req = requests.get(url)
    assert req.status_code == 200, f"Failed to fetch {url}"
    soup = BeautifulSoup(req.text, 'html.parser')
    element = soup.select_one(selector)
    if element:
        return element['src']
    else:
        return None
def get_image_from_data(url, selector, bbox_text, lang="gle"):
    bbox = [int(x) for x in bbox_text.split(" ")]
    img = get_image_selector_from_url(url, selector)
    assert img is not None, f"No element matched {selector!r}"
    return extract_text_from_bbox_and_url(img, bbox, lang=lang)
b = get_image_from_data("https://www.leighleat.com/pages/1803", "#ajax-page-container > div > div:nth-child(2) > img", "297 681 725 742")
lines = "{:C}".format(nlp_tok(b)).split("\n")
print("\n".join(lines))
cor = "Nuair a thagann Brídín abhaile, ordaíonn a mamaí di í féin a ghlanadh."
lines = "{:C}".format(nlp_tok(cor)).split("\n")
print("\n".join(lines))