Parse from Tesseract output
Using Stanza, for Irish
%%capture
!pip install stanza
import json
import stanza
# Fetch the Irish ("ga") models for every processor used below.
stanza.download("ga", processors="tokenize,pos,lemma,depparse", verbose=False)

# Pipeline for input that the caller has ALREADY split into tokens and
# sentences. With tokenize_pretokenized=True Stanza does not run its own
# tokenizer model: tokens are taken from whitespace and sentences from
# newlines (or from a list of token lists), so upstream code is responsible
# for segmentation.
nlp = stanza.Pipeline(
    lang="ga",
    processors="tokenize,pos,lemma,depparse",
    tokenize_pretokenized=True,
    # The documented option is `tokenize_no_ssplit`; processor options need
    # the processor-name prefix, which the former un-prefixed `no_ssplit`
    # spelling lacked.
    tokenize_no_ssplit=True,
    verbose=False,
)
# Break the page text into paragraphs at blank lines and flatten each
# paragraph onto one line.
# NOTE(review): PAGE is not defined in this part of the notebook; presumably
# it holds the text of one page - confirm against the defining cell.
paras = [" ".join(para.split("\n")) for para in PAGE.split("\n\n")]

# Second Irish pipeline, this one for raw text: tokenize_pretokenized=False
# leaves tokenization to Stanza's own tokenize processor.
nlp_tok = stanza.Pipeline(
    processors="tokenize,pos,lemma,depparse",
    lang="ga",
    verbose=False,
    tokenize_pretokenized=False,
)
!pip install pytesseract opencv-python-headless
!sudo apt install tesseract-ocr tesseract-ocr-gle
import requests
import numpy as np
import os
def read_image_from_url(url, timeout=30):
    """Download an image and decode it into an OpenCV colour array.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The array produced by ``cv2.imdecode(..., cv2.IMREAD_COLOR)``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the response body is empty or is not a decodable image.
    """
    response = requests.get(url, timeout=timeout)
    # Stop here instead of trying to decode an HTML error page as an image.
    response.raise_for_status()

    image_array = np.frombuffer(response.content, np.uint8)
    if image_array.size == 0:
        raise ValueError(f"Empty response body when fetching image {url}")

    image = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
    # cv2.imdecode reports an undecodable buffer by returning None rather
    # than raising, which would otherwise surface later as a confusing
    # "NoneType is not subscriptable" error in the caller.
    if image is None:
        raise ValueError(f"Could not decode an image from {url}")
    return image
import pytesseract
import cv2
def extract_text_from_bbox(image_path, bbox, lang="gle"):
    """Run Tesseract OCR on a rectangular region of an image file.

    Args:
        image_path: Path of the image file, loaded with ``cv2.imread``.
        bbox: Four integers ``(x1, y1, x2, y2)``; the region is the pixel
            slice ``image[y1:y2, x1:x2]``.
        lang: Tesseract language code passed to pytesseract
            (defaults to ``"gle"``).

    Returns:
        The string returned by ``pytesseract.image_to_string`` for the
        grayscale version of the region.

    Raises:
        OSError: If the image file cannot be read.
        ValueError: If the bounding box selects no pixels.
    """
    image = cv2.imread(image_path)
    # cv2.imread returns None (it does not raise) for a missing or
    # unreadable file.
    if image is None:
        raise OSError(f"Could not read image file: {image_path}")

    # Extract the region of interest
    x1, y1, x2, y2 = bbox
    roi = image[y1:y2, x1:x2]
    if roi.size == 0:
        raise ValueError(
            f"Bounding box {bbox} selects no pixels in an image of "
            f"shape {image.shape}"
        )

    # Convert the ROI to grayscale before handing it to Tesseract
    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    return pytesseract.image_to_string(gray, lang=lang)
import pytesseract
import cv2
from IPython.display import display, Image
import io
def extract_text_from_bbox_and_url(url, bbox, lang="gle", show=True):
    """Run Tesseract OCR on a rectangular region of a remote image.

    Args:
        url: Address of the image, fetched with ``read_image_from_url``.
        bbox: Four integers ``(x1, y1, x2, y2)``; the region is the pixel
            slice ``image[y1:y2, x1:x2]``.
        lang: Tesseract language code passed to pytesseract
            (defaults to ``"gle"``).
        show: When true (the default, matching the previous behaviour),
            render the cropped colour region in the notebook output.

    Returns:
        The string returned by ``pytesseract.image_to_string`` for the
        grayscale version of the region.

    Raises:
        ValueError: If no image could be obtained from ``url`` or the
            bounding box selects no pixels.
    """
    image = read_image_from_url(url)
    if image is None:
        raise ValueError(f"Could not load an image from {url}")

    # Extract the region of interest
    x1, y1, x2, y2 = bbox
    roi = image[y1:y2, x1:x2]
    if roi.size == 0:
        raise ValueError(
            f"Bounding box {bbox} selects no pixels in an image of "
            f"shape {image.shape}"
        )

    if show:
        # Encode the crop as PNG so IPython can render it inline. The
        # preview is a convenience only, so an encoding failure skips it
        # instead of aborting the OCR.
        encoded_ok, buffer = cv2.imencode('.png', roi)
        if encoded_ok:
            display(Image(buffer.tobytes()))

    # Convert the ROI to grayscale before handing it to Tesseract
    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    return pytesseract.image_to_string(gray, lang=lang)
import requests
from bs4 import BeautifulSoup
def get_image_selector_from_url(url, selector, timeout=30):
    """Fetch a page and return the ``src`` of the element matching a selector.

    Args:
        url: Address of the HTML page to download.
        selector: CSS selector; the first matching element is used.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The value of the element's ``src`` attribute exactly as written in
        the page, or ``None`` when nothing matches the selector or the
        matched element has no ``src`` attribute.

    Raises:
        requests.exceptions.HTTPError: If the response status is not 200.
            This replaces the earlier ``assert``, which is removed entirely
            when Python runs with ``-O``.
    """
    req = requests.get(url, timeout=timeout)
    if req.status_code != 200:
        raise requests.exceptions.HTTPError(
            f"Failed to fetch {url} (HTTP {req.status_code})", response=req
        )

    soup = BeautifulSoup(req.text, 'html.parser')
    element = soup.select_one(selector)
    if element is None:
        return None
    # .get() keeps the "not found -> None" contract for elements that match
    # the selector but carry no src attribute (indexing would raise KeyError).
    return element.get('src')
def get_image_from_data(url, selector, bbox_text):
    """OCR a region of the image that a CSS selector points to on a page.

    Args:
        url: Address of the HTML page containing the image element.
        selector: CSS selector identifying the image element.
        bbox_text: Four whitespace-separated integers ``"x1 y1 x2 y2"``.

    Returns:
        The text returned by ``extract_text_from_bbox_and_url`` for the
        selected region.

    Raises:
        ValueError: If ``bbox_text`` does not hold exactly four integers, or
            if no image source is found for ``selector`` on the page.
    """
    from urllib.parse import urljoin

    # split() with no argument tolerates repeated, leading and trailing
    # whitespace, where split(" ") would yield empty strings that int()
    # rejects.
    bbox = [int(part) for part in bbox_text.split()]
    # Validate before any network access so a bad box fails fast.
    if len(bbox) != 4:
        raise ValueError(
            f"Expected four integers 'x1 y1 x2 y2', got {bbox_text!r}"
        )

    img = get_image_selector_from_url(url, selector)
    if img is None:
        raise ValueError(f"No image source found for {selector!r} on {url}")

    # The src attribute may be relative to the page; urljoin resolves it
    # against the page address and returns absolute URLs unchanged.
    return extract_text_from_bbox_and_url(urljoin(url, img), bbox)
# OCR one region of the page image, then parse the recognised text with the
# raw-text pipeline and print the document using its "C" format spec.
b = get_image_from_data(
    "https://www.leighleat.com/pages/1803",
    "#ajax-page-container > div > div:nth-child(2) > img",
    "297 681 725 742",
)
lines = format(nlp_tok(b), "C").split("\n")
print(*lines, sep="\n")