%%capture
!pip install stanza
import json
import stanza
# Fetch the Irish ("ga") models for every processor the pipelines below use.
stanza.download("ga", processors="tokenize,pos,lemma,depparse", verbose=False)

# Pipeline for input that the caller has already tokenized.
nlp = stanza.Pipeline(
    lang="ga",
    processors="tokenize,pos,lemma,depparse",
    # Input is supplied pre-tokenized, so Stanza's own tokenizer is bypassed.
    # NOTE(review): Stanza documents the sentence-split switch as
    # `tokenize_no_ssplit`; confirm the bare `no_ssplit` keyword is honoured.
    tokenize_pretokenized=True,
    no_ssplit=True,
    verbose=False
)
# Split PAGE into paragraphs on blank lines and flatten the line breaks inside
# each paragraph into spaces.
# NOTE(review): PAGE is not defined in the visible part of the file —
# presumably assigned in an earlier notebook cell; confirm.
paras = [x.replace("\n", " ") for x in PAGE.split("\n\n")]
# Pipeline for raw text: pre-tokenization is switched off, so Stanza tokenizes
# the input itself.
nlp_tok = stanza.Pipeline(
    lang="ga",
    processors="tokenize,pos,lemma,depparse",
    tokenize_pretokenized=False,
    verbose=False
)
!pip install pytesseract opencv-python-headless
!sudo apt install tesseract-ocr tesseract-ocr-gle
import requests
import numpy as np
import os

def read_image_from_url(url):
    """Download an image and decode it into an OpenCV colour array.

    Args:
        url: Address of the image to fetch.

    Returns:
        The array produced by ``cv2.imdecode`` with ``cv2.IMREAD_COLOR``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the downloaded bytes cannot be decoded as an image.
    """
    # A timeout keeps a stalled server from hanging the notebook forever.
    response = requests.get(url, timeout=30)
    # Stop here rather than trying to decode an error page as pixels.
    response.raise_for_status()
    image_array = np.frombuffer(response.content, np.uint8)
    image = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
    # imdecode reports failure by returning None instead of raising.
    if image is None:
        raise ValueError(f"Could not decode image data from {url}")
    return image
import pytesseract
import cv2

def extract_text_from_bbox(image_path, bbox, lang="gle"):
    """Run Tesseract OCR on a rectangular region of an image file.

    Args:
        image_path: Path of the image to load with ``cv2.imread``.
        bbox: Four integers ``(x1, y1, x2, y2)`` delimiting the region;
            rows ``y1:y2`` and columns ``x1:x2`` are kept.
        lang: Tesseract language code passed to ``image_to_string``.

    Returns:
        The text recognised inside the region.

    Raises:
        ValueError: If the image cannot be loaded or the region is empty.
    """
    image = cv2.imread(image_path)
    # imread returns None (no exception) for a missing or unreadable file.
    if image is None:
        raise ValueError(f"Could not read image from {image_path}")

    # Extract the region of interest
    x1, y1, x2, y2 = bbox
    roi = image[y1:y2, x1:x2]
    # Slicing outside the image silently yields an empty array; report it
    # here instead of letting the colour conversion fail obscurely.
    if roi.size == 0:
        raise ValueError(f"Bounding box {bbox} selects an empty region")

    # Convert the ROI to grayscale
    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)

    return pytesseract.image_to_string(gray, lang=lang)
import pytesseract
import cv2
from IPython.display import display, Image
import io

def extract_text_from_bbox_and_url(url, bbox, lang="gle"):
    """Download an image, show a cropped region inline, and OCR that region.

    Args:
        url: Address of the image, fetched with ``read_image_from_url``.
        bbox: Four integers ``(x1, y1, x2, y2)`` delimiting the region;
            rows ``y1:y2`` and columns ``x1:x2`` are kept.
        lang: Tesseract language code passed to ``image_to_string``.

    Returns:
        The text recognised inside the region.

    Raises:
        ValueError: If no image could be obtained, the region is empty, or
            the preview cannot be encoded as PNG.
    """
    image = read_image_from_url(url)
    if image is None:
        raise ValueError(f"Could not obtain an image from {url}")

    # Extract the region of interest
    x1, y1, x2, y2 = bbox
    roi = image[y1:y2, x1:x2]
    # Slicing outside the image silently yields an empty array.
    if roi.size == 0:
        raise ValueError(f"Bounding box {bbox} selects an empty region")

    # Convert the ROI to grayscale
    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)

    # Preview the colour crop (not the grayscale copy) in the notebook output.
    encoded_ok, buffer = cv2.imencode('.png', roi)
    if not encoded_ok:
        raise ValueError("Could not encode the cropped region as PNG")
    display(Image(buffer.tobytes()))

    return pytesseract.image_to_string(gray, lang=lang)
import requests
from bs4 import BeautifulSoup

def get_image_selector_from_url(url, selector):
    """Return the image address of the first element matching a CSS selector.

    Args:
        url: Page to download and parse.
        selector: CSS selector handed to ``BeautifulSoup.select_one``.

    Returns:
        The matched element's ``src`` attribute, resolved against ``url`` so
        that relative addresses become absolute, or ``None`` when nothing
        matches or the match carries no ``src``.

    Raises:
        AssertionError: If the page does not come back with status 200.
    """
    from urllib.parse import urljoin

    # A timeout keeps a stalled server from hanging the notebook forever.
    req = requests.get(url, timeout=30)
    # Raised explicitly so the check is not stripped under ``python -O``;
    # the exception type and message match the former ``assert``.
    if req.status_code != 200:
        raise AssertionError(f"Failed to fetch {url}")
    soup = BeautifulSoup(req.text, 'html.parser')
    element = soup.select_one(selector)
    if element is None:
        return None
    # .get avoids a KeyError when the matched element has no src attribute.
    src = element.get('src')
    if not src:
        return None
    # Absolute addresses pass through unchanged; relative ones are resolved.
    return urljoin(url, src)
def get_image_from_data(url, selector, bbox_text):
    """OCR a region of the image that a CSS selector points to on a page.

    Args:
        url: Page containing the image element.
        selector: CSS selector identifying the image element.
        bbox_text: Four whitespace-separated integers ``"x1 y1 x2 y2"``.

    Returns:
        The text recognised inside the region.

    Raises:
        ValueError: If ``bbox_text`` does not hold exactly four integers or
            no image address is found for ``selector``.
    """
    # split() with no argument tolerates repeated, leading and trailing
    # whitespace, which split(" ") turns into empty strings.
    bbox = [int(x) for x in bbox_text.split()]
    # Validate before any network traffic is spent on a doomed request.
    if len(bbox) != 4:
        raise ValueError(
            f"Expected 4 bounding-box values, got {len(bbox)}: {bbox_text!r}"
        )
    img_url = get_image_selector_from_url(url, selector)
    if img_url is None:
        raise ValueError(f"No image found at {url} for selector {selector!r}")
    return extract_text_from_bbox_and_url(img_url, bbox)
# Demo: OCR one region of a page image, then print Stanza's analysis of the
# recognised text using the document's "C" format spec.
b = get_image_from_data(
    "https://www.leighleat.com/pages/1803",
    "#ajax-page-container > div > div:nth-child(2) > img",
    "297 681 725 742",
)
lines = f"{nlp_tok(b):C}".split("\n")
print(*lines, sep="\n")