Foinse scraper pieces, continued
Scraping Foinse, from the Wayback Machine
link = "http://web.archive.org/web/20171209002240/http://www.foinse.ie/sport/eile/6412-an-dornalai-john-joe-nevin-rangaithe-ag-uimhir-a-haon-anois"
import requests
from bs4 import BeautifulSoup
def extract_summary(inlist):
    # The English summary is the paragraph that follows the "Did you understand this story?" marker.
    if len(inlist) > 2:
        if inlist[-2] == "Did you understand this story? Here are the main points:":
            return inlist[-1]
    return ""
def filter_para_list(inlist):
    # Drop empty paragraphs and stop at the site footer or the summary marker.
    out = []
    for para in inlist:
        if para == "":
            continue
        elif para.strip() == "Foinse - News as Gaeilge":
            return out
        elif para.strip() == "Did you understand this story? Here are the main points:":
            return out
        else:
            out.append(para)
    return out
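A quick illustration of how the two helpers behave on a made-up paragraph list (the strings here are invented for the example):

sample = [
    "An chéad alt den scéal.",
    "",
    "Did you understand this story? Here are the main points:",
    "A one-line English summary of the story.",
]
print(extract_summary(sample))   # the English summary after the marker
print(filter_para_list(sample))  # just the Irish paragraphs, stopping at the marker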
def get_content(url, text=""):
    out = {}
    # If the caller already has the HTML (e.g. from a local dump), skip the request.
    if text:
        page_content = text
    else:
        page = requests.get(url)
        if page.status_code != 200:
            return {}
        page_content = page.text
    soup = BeautifulSoup(page_content, "lxml")
    # The article body lives in "item-page"; older pages use the "ja-main" div instead.
    content = soup.find("div", {"class": "item-page"})
    if not content:
        content = soup.find("div", {"id": "ja-main"})
    if not content:
        return {}
    # The breadcrumb trail gives the article's category ("ja-breadcrums" is the template's own spelling).
    breadcrumbs = soup.find("div", {"class": "ja-breadcrums"})
    if breadcrumbs:
        here = breadcrumbs.find("a", {"class": "pathway"})
        if not here:
            here = breadcrumbs.find("span", {"class": "pathway"})
        if here:
            out["category"] = here.text.strip()
    # Junk: strip the comment widget, pager links, and inline scripts.
    jc = content.find("div", {"id": "jc"})
    if jc:
        jc.extract()
    pagenav = content.find("ul", {"class": "pagenav"})
    if pagenav:
        pagenav.extract()
    for js in content.find_all("script", {"type": "text/javascript"}):
        js.extract()
    # Title and subcategory headings, removed from the tree once captured.
    h2 = content.find("h2")
    if h2:
        title = h2.text.strip()
        if title:
            out["title"] = title
        h2.extract()
    h1 = content.find("h1")
    if h1:
        heading = h1.text.strip()
        if heading:
            out["subcategory"] = heading
        h1.extract()
    # Publication date and author appear in either of two markups.
    published_tag = content.find("dd", {"class": "published"})
    if not published_tag:
        published_tag = content.find("span", {"class": "createdate"})
    if published_tag:
        out["published"] = published_tag.text.strip()
    author_tag = content.find("dd", {"class": "createdby"})
    if not author_tag:
        author_tag = content.find("span", {"class": "createby"})
    if author_tag:
        out["author"] = author_tag.text.strip()
    artinfo = content.find("dl", {"class": "article-info"})
    if not artinfo:
        artinfo = content.find("div", {"class": "article-meta"})
    if artinfo:
        artinfo.extract()
    # Prefer the <p> paragraphs; if they differ from the line-split raw text, trust the raw text.
    paragraphs_tags = content.find_all("p")
    paragraphs = [p.text.replace("\xa0", " ").strip() for p in paragraphs_tags]
    out["text"] = paragraphs
    raw_text = content.text
    raw_out = []
    for raw_line in raw_text.split("\n"):
        line = raw_line.replace("\xa0", " ").strip()
        if line == "":
            continue
        raw_out.append(line)
    if paragraphs != raw_out:
        out["text"] = raw_out
    summary = extract_summary(out["text"])
    if summary:
        out["summary"] = summary
    out["text"] = filter_para_list(out["text"])
    # Glossary links pair the Irish word (link text) with its English gloss (title attribute).
    vocab_list = []
    for vocab in content.find_all("a", {"class": "glossarylink"}):
        item = {}
        item["en"] = (vocab.get("title") or "").strip()
        item["ga"] = vocab.text.strip()
        vocab_list.append(item)
    out["vocab"] = vocab_list
    return out
page = requests.get(link)
soup = BeautifulSoup(page.text, "lxml")
content = soup.find("div", {"class": "item-page"})
if not content:
    print("Empty")
Oh, hey, I've already downloaded this stuff and made a list of likely-good articles. The paths below are local to my machine, so this might not work out well for anyone else.
BASE_DIR = "/home/jim/Playing/foinseunpacked"
# "attempt1" is the list of likely-good article paths collected earlier.
with open(f"{BASE_DIR}/attempt1", "r") as file:
    pages = [line.strip() for line in file.readlines()]
foinse_data = []
with open("/home/jim/foinse-bad.txt", "w") as bad_list:
    for page in pages:
        print(page)
        # Map the saved link onto the local directory layout (drop the first six characters).
        page_path = BASE_DIR + page.strip()[6:]
        with open(page_path, "r") as pagef:
            ptext = pagef.read()
        content = get_content(page_path, ptext)
        if content:
            foinse_data.append(content)
        else:
            bad_list.write(page + "\n")
import json

with open('foinse.json', 'w') as outfile:
    json.dump(foinse_data, outfile)
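To check that everything round-tripped, the dump can be read straight back; this is just a sketch against the file written above:

with open("foinse.json", "r") as infile:
    articles = json.load(infile)
print(len(articles), "articles scraped")
print(sum(1 for a in articles if "summary" in a), "with an English summary")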