# Tuairisc scraper pieces
# URL of the site's top-level sitemap index, fetched by _read_main_sitemap().
_SITEMAP='https://tuairisc.ie/sitemap.xml'
import requests
from bs4 import BeautifulSoup
def _read_main_sitemap(timeout=30):
    """Fetch the top-level sitemap index and read the sub-sitemaps it lists.

    Only sub-sitemaps whose location contains ``'sitemap-pt'`` are followed.

    Args:
        timeout: Seconds to wait on the index request before giving up.

    Returns:
        A list with one entry per followed sub-sitemap. Each entry is
        whatever ``_read_sub_sitemap`` returned for that location, so the
        result is a list of lists rather than a flat list of URLs.

    Raises:
        Exception: If the index request does not return HTTP 200.
    """
    # Without an explicit timeout, requests can block forever on a stalled
    # connection.
    sm = requests.get(_SITEMAP, timeout=timeout)
    if sm.status_code != 200:
        raise Exception("Failed to read sitemap")

    base_soup = BeautifulSoup(sm.text, 'lxml')
    output = []
    for submap in base_soup.find_all('sitemap'):
        loc = submap.find('loc')
        if loc is None:
            # Entry without a <loc> child: there is nothing to follow.
            continue
        location = loc.text
        if 'sitemap-pt' in location:
            output.append(_read_sub_sitemap(location))
    return output
def _read_sub_sitemap(url, timeout=30):
    """Fetch one sub-sitemap and return the page URLs it lists.

    Args:
        url: Location of the sub-sitemap to download.
        timeout: Seconds to wait on the request before giving up.

    Returns:
        A list of the text of each ``<loc>`` found inside a ``<url>``
        element, in document order. ``<url>`` entries that have no
        ``<loc>`` child are skipped.

    Raises:
        Exception: If the request does not return HTTP 200.
    """
    # Without an explicit timeout, requests can block forever on a stalled
    # connection.
    sm = requests.get(url, timeout=timeout)
    if sm.status_code != 200:
        raise Exception("Failed to read sitemap")

    base_soup = BeautifulSoup(sm.text, "lxml")
    locations = (entry.find("loc") for entry in base_soup.find_all("url"))
    return [loc.text for loc in locations if loc is not None]
def _fetch_article(url, timeout=30):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait on the request before giving up.

    Returns:
        The response body decoded as text (``Response.text``).

    Raises:
        Exception: If the request does not return HTTP 200; the message
            includes the requested URL.
    """
    # Without an explicit timeout, requests can block forever on a stalled
    # connection.
    page = requests.get(url, timeout=timeout)
    if page.status_code != 200:
        raise Exception("Failed to read page: " + url)
    return page.text
def _get_article_text(content):
    """Extract the non-empty paragraphs of an article from its HTML.

    Args:
        content: HTML source of the page.

    Returns:
        A list of whitespace-stripped paragraph strings, one per ``<p>``
        inside the ``div`` with class ``article--full__content``;
        paragraphs that are empty after stripping are dropped. Returns an
        empty list when the page has no such ``div``.
    """
    base_soup = BeautifulSoup(content, "lxml")
    main = base_soup.find("div", {"class": "article--full__content"})
    if main is None:
        # No article container on this page: report "no paragraphs"
        # instead of failing with AttributeError on None.
        return []

    # Strip each paragraph once, then filter out the empty ones.
    stripped = (p.text.strip() for p in main.find_all("p"))
    return [text for text in stripped if text]
def _get_pub_date(content):
    """Extract the publication timestamp string from an article's HTML.

    Args:
        content: HTML source of the page.

    Returns:
        The value of the ``datetime`` attribute on the first ``<time>``
        element with ``itemprop="datePublished"``, exactly as it appears
        in the markup. Returns ``None`` when the element or the attribute
        is missing.
    """
    base_soup = BeautifulSoup(content, "lxml")
    date = base_soup.find("time", {"itemprop": "datePublished"})
    if date is None:
        # No publication marker on this page; avoid subscripting None.
        return None
    # Tag.get() yields None rather than raising KeyError when the
    # attribute is absent.
    return date.get("datetime")