Foinse scraper pieces
Scraping Foinse articles from the Wayback Machine, starting with a single archived page:
link = "http://web.archive.org/web/20130922081459/http://www.foinse.ie/nuacht/nuacht-is-deanai/6765-suil-go-gcruthofar-158-post-nua-le-tograi-ata-ceadaithe-ag-unag"
import requests
from bs4 import BeautifulSoup
page = requests.get(link)
assert page.status_code == 200
Working on the raw text alone, much of the surrounding junk can be discarded by splitting on these HTML comments:
if "<!-- CONTENT -->" in page.text:
trim = page.text.split("<!-- CONTENT -->")[1]
if trim and "<!-- //CONTENT -->" in trim:
trim = trim.split("<!-- //CONTENT -->")[0]
... but with BeautifulSoup it's easier to just extract <div class="item-page">:
soup = BeautifulSoup(page.text, "lxml")
content = soup.find("div", {"class": "item-page"})
title = content.find("h2").text.strip()
published_tag = content.find("dd", {"class": "published"})
if published_tag:
    published = published_tag.text.strip()
author_tag = content.find("dd", {"class": "createdby"})
if author_tag:
    author = author_tag.text.strip()
author  # notebook-style echo, to eyeball the extracted value
paragraphs_tags = content.find_all("p", {"class": "MsoNormal"})
paragraphs = [p.text.replace("\xa0", " ").strip() for p in paragraphs_tags]
vocab_list = []
for p in paragraphs_tags:
    # each glossary link pairs the Irish anchor text with an English gloss in its title attribute
    for vocab in p.find_all("a", {"class": "glossarylink"}):
        item = {}
        item["en"] = (vocab.get("title") or "").strip()  # guard: title attribute may be absent
        item["ga"] = vocab.text.strip()
        vocab_list.append(item)
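A quick peek at the collected pairs (a sanity-check sketch; the slice size is arbitrary):
for item in vocab_list[:3]:
    print(item["ga"], "->", item["en"])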
check = "http://web.archive.org/web/20171222073817/http://www.foinse.ie/nuacht/nuacht-is-deanai/6822-seanoiri-ag-dul-i-mbun-agoide-maidir-le-ciorruithe"
page2 = requests.get(check)
assert page2.status_code == 200
def get_content(url):
    """Fetch an archived Foinse article and extract its fields into a dict."""
    out = {}
    page = requests.get(url)
    if page.status_code != 200:
        return {}
    soup = BeautifulSoup(page.text, "lxml")
    content = soup.find("div", {"class": "item-page"})
    if not content:
        return {}
    title_tag = content.find("h2")  # guard first: .text on a missing tag would raise
    if title_tag:
        out["title"] = title_tag.text.strip()
    published_tag = content.find("dd", {"class": "published"})
    if published_tag:
        out["published"] = published_tag.text.strip()
    author_tag = content.find("dd", {"class": "createdby"})
    if author_tag:
        out["author"] = author_tag.text.strip()
    paragraphs_tags = content.find_all("p", {"class": "MsoNormal"})
    # normalise non-breaking spaces before stripping
    paragraphs = [p.text.replace("\xa0", " ").strip() for p in paragraphs_tags]
    out["text"] = paragraphs
    vocab_list = []
    for p in paragraphs_tags:
        for vocab in p.find_all("a", {"class": "glossarylink"}):
            item = {}
            item["en"] = (vocab.get("title") or "").strip()
            item["ga"] = vocab.text.strip()
            vocab_list.append(item)
    out["vocab"] = vocab_list
    return out
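A sketch of how get_content might be exercised on the two archived pages above (the printed fields match the keys built in the function):
for url in (link, check):
    article = get_content(url)
    if article:
        print(article.get("title", ""))
        print(len(article.get("text", [])), "paragraphs,",
              len(article.get("vocab", [])), "vocab items")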
def filter_para_list(inlist):
    """Drop empty paragraphs and cut the list at the trailing site slogan."""
    out = []
    for para in inlist:
        if para == "":
            continue
        elif para.strip() == "Foinse - News as Gaeilge":
            # everything from this footer line onwards is boilerplate
            return out
        else:
            out.append(para)
    return out
def extract_summary(inlist):
    """Return the English recap paragraph that follows a fixed prompt line, if present."""
    if len(inlist) > 2:
        if inlist[-2] == "Did you understand this story? Here are the main points:":
            return inlist[-1]
    return ""