# Tuairisc question page scraper
# First step towards question answering
import requests
from bs4 import BeautifulSoup
# Address of the Tuairisc page that the module-level code below downloads and parses.
url1 = 'https://tuairisc.ie/leamhthuiscint-faoiseamh-agus-saoirse-sa-snamh/'
def _get_url(url, timeout=30):
    """Download *url* and return the raw response body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely on a host that
            never answers.

    Returns:
        The response body (``Response.content``).

    Raises:
        Exception: If the server answers with any status other than 200.
        requests.exceptions.RequestException: On connection failures or
            when the timeout expires.
    """
    r = requests.get(url, timeout=timeout)
    if r.status_code != 200:
        raise Exception("Failed to open landing page")
    return r.content
def _stop_reading(elem):
    """Tell whether *elem* is the wrapper around the education banner.

    The wrapper is recognised as an element whose only child is an
    ``<h2>`` whose class attribute, joined with spaces, is exactly
    ``'heading-banner education__banner'``.

    Args:
        elem: A child node of the article body (tag or text node).

    Returns:
        True if *elem* is the banner wrapper, False otherwise.
    """
    from bs4.element import NavigableString

    # A bare text node cannot wrap the banner heading.
    if isinstance(elem, NavigableString):
        return False

    elems = list(elem.children)
    if len(elems) != 1:
        return False

    only_child = elems[0]
    # The single child may itself be a text node; treat that as "not the
    # banner" instead of attempting tag-only operations on it.
    if getattr(only_child, 'name', None) != 'h2':
        return False

    # .get() rather than ['class']: an <h2> without a class attribute must
    # yield False, not raise KeyError.
    classes = only_child.get('class') or []
    return ' '.join(classes) == 'heading-banner education__banner'
# Download the page and parse it with Python's built-in HTML parser backend.
t1 = _get_url(url1)
soup = BeautifulSoup(t1, 'html.parser')
# Read the page summary and title from the Open Graph <meta> tags.
# NOTE(review): find() gives None when nothing matches, so these subscripts
# would fail on a page lacking the tags — confirm all target pages have them.
desc = soup.find('meta', {'property': 'og:description'})['content']
title = soup.find('meta', {'property': 'og:title'})['content']
# Narrow down to the <div itemprop="articleBody"> inside the <article> element;
# the extraction helpers below operate on this node.
article_outer = soup.find('article')
article = article_outer.find('div', {'itemprop': 'articleBody'})
def _extract_text(article):
    """Collect the text of the article's child elements up to the banner.

    Walks the direct children of *article*, skipping bare text nodes, and
    gathers each element's text with non-breaking spaces (``\\xa0``)
    replaced by ordinary spaces. Collection ends at the first child for
    which ``_stop_reading`` returns True, or at the last child if no such
    element exists.

    Args:
        article: The parsed article-body element.

    Returns:
        A list of strings, one per collected child element.
    """
    from bs4.element import NavigableString

    paragraphs = []
    for child in article.children:
        if isinstance(child, NavigableString):
            continue
        if _stop_reading(child):
            break
        paragraphs.append(child.text.replace('\xa0', ' '))
    # Always return the list: previously the function fell off the end and
    # returned None whenever the stop banner was absent.
    return paragraphs
def _extract_questions(article):
    """Return the text of every ``<li>`` in the article's first ``<ol>``.

    Args:
        article: The parsed article-body element.

    Returns:
        A list with the text of each list item, in document order; an
        empty list when the article contains no ordered list.
    """
    ordered_list = article.find('ol')
    # find() returns None when there is no <ol>; report "no questions"
    # rather than failing with AttributeError on None.
    if ordered_list is None:
        return []
    # find_all is the current Beautiful Soup 4 name for the legacy findAll.
    return [item.text for item in ordered_list.find_all('li')]
# Extract the list-item questions from the parsed article body.
qs = _extract_questions(article)
# Bare expression: has no effect when run as a script (presumably left over
# from interactive use, where it would display the list).
qs
# NOTE(review): `x` is not defined anywhere in the visible source, so this
# line would raise NameError if executed as-is — looks like leftover scratch
# work; confirm before relying on or removing it.
x.findAll('li')