Foinse scraper pieces, continued
Scraping Foinse, from the Wayback Machine
link = "http://web.archive.org/web/20171209002240/http://www.foinse.ie/sport/eile/6412-an-dornalai-john-joe-nevin-rangaithe-ag-uimhir-a-haon-anois"
import requests
from bs4 import BeautifulSoup
def extract_summary(inlist):
    # The English summary is the paragraph that follows the "Did you understand this story?" marker.
    if len(inlist) > 2:
        if inlist[-2] == "Did you understand this story? Here are the main points:":
            return inlist[-1]
    return ""
def filter_para_list(inlist):
    # Drop empty paragraphs and stop at the site footer or the summary marker.
    out = []
    for para in inlist:
        if para == "":
            continue
        elif para.strip() == "Foinse - News as Gaeilge":
            return out
        elif para.strip() == "Did you understand this story? Here are the main points:":
            return out
        else:
            out.append(para)
    return out
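A quick illustration of how the two helpers behave on a made-up paragraph list (the strings here are invented for the example):

sample = [
    "An chéad alt den scéal.",
    "",
    "Did you understand this story? Here are the main points:",
    "A one-line English summary of the story.",
]
print(extract_summary(sample))   # the English summary after the marker
print(filter_para_list(sample))  # just the Irish paragraphs, stopping at the marker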
def get_content(url, text=""):
    out = {}
    # If the caller already has the HTML (e.g. from a local dump), skip the request.
    if text:
        page_content = text
    else:
        page = requests.get(url)
        if page.status_code != 200:
            return {}
        page_content = page.text
    soup = BeautifulSoup(page_content, "lxml")
    # The article body lives in "item-page"; older pages use the "ja-main" div instead.
    content = soup.find("div", {"class": "item-page"})
    if not content:
        content = soup.find("div", {"id": "ja-main"})
    if not content:
        return {}
    # The breadcrumb trail gives the article's category ("ja-breadcrums" is the template's own spelling).
    breadcrumbs = soup.find("div", {"class": "ja-breadcrums"})
    if breadcrumbs:
        here = breadcrumbs.find("a", {"class": "pathway"})
        if not here:
            here = breadcrumbs.find("span", {"class": "pathway"})
        if here:
            out["category"] = here.text.strip()
    # Junk: strip the comment widget, pager links, and inline scripts.
    jc = content.find("div", {"id": "jc"})
    if jc:
        jc.extract()
    pagenav = content.find("ul", {"class": "pagenav"})
    if pagenav:
        pagenav.extract()
    for js in content.find_all("script", {"type": "text/javascript"}):
        js.extract()
    # Title and subcategory headings, removed from the tree once captured.
    h2 = content.find("h2")
    if h2:
        title = h2.text.strip()
        if title:
            out["title"] = title
        h2.extract()
    h1 = content.find("h1")
    if h1:
        heading = h1.text.strip()
        if heading:
            out["subcategory"] = heading
        h1.extract()
    # Publication date and author appear in either of two markups.
    published_tag = content.find("dd", {"class": "published"})
    if not published_tag:
        published_tag = content.find("span", {"class": "createdate"})
    if published_tag:
        out["published"] = published_tag.text.strip()
    author_tag = content.find("dd", {"class": "createdby"})
    if not author_tag:
        author_tag = content.find("span", {"class": "createby"})
    if author_tag:
        out["author"] = author_tag.text.strip()
    artinfo = content.find("dl", {"class": "article-info"})
    if not artinfo:
        artinfo = content.find("div", {"class": "article-meta"})
    if artinfo:
        artinfo.extract()
    # Prefer the <p> paragraphs; if they differ from the line-split raw text, trust the raw text.
    paragraphs_tags = content.find_all("p")
    paragraphs = [p.text.replace("\xa0", " ").strip() for p in paragraphs_tags]
    out["text"] = paragraphs
    raw_text = content.text
    raw_out = []
    for raw_line in raw_text.split("\n"):
        line = raw_line.replace("\xa0", " ").strip()
        if line == "":
            continue
        raw_out.append(line)
    if paragraphs != raw_out:
        out["text"] = raw_out
    summary = extract_summary(out["text"])
    if summary:
        out["summary"] = summary
    out["text"] = filter_para_list(out["text"])
    # Glossary links pair the Irish word (link text) with its English gloss (title attribute).
    vocab_list = []
    for vocab in content.find_all("a", {"class": "glossarylink"}):
        item = {}
        item["en"] = (vocab.get("title") or "").strip()
        item["ga"] = vocab.text.strip()
        vocab_list.append(item)
    out["vocab"] = vocab_list
    return out
page = requests.get(link)
soup = BeautifulSoup(page.text, "lxml")
content = soup.find("div", {"class": "item-page"})
if not content:
    print("Empty")
Oh, hey, I've already downloaded this stuff and made a list of likely-good articles. The paths below are local to my machine, so this might not work out well for anyone else.
BASE_DIR = "/home/jim/Playing/foinseunpacked"
# "attempt1" is the list of likely-good article paths collected earlier.
with open(f"{BASE_DIR}/attempt1", "r") as file:
    pages = [line.strip() for line in file.readlines()]
foinse_data = []
with open("/home/jim/foinse-bad.txt", "w") as bad_list:
    for page in pages:
        print(page)
        # Map the saved link onto the local directory layout (drop the first six characters).
        page_path = BASE_DIR + page.strip()[6:]
        with open(page_path, "r") as pagef:
            ptext = pagef.read()
        content = get_content(page_path, ptext)
        if content:
            foinse_data.append(content)
        else:
            bad_list.write(page + "\n")
import json

with open('foinse.json', 'w') as outfile:
    json.dump(foinse_data, outfile)
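To check that everything round-tripped, the dump can be read straight back; this is just a sketch against the file written above:

with open("foinse.json", "r") as infile:
    articles = json.load(infile)
print(len(articles), "articles scraped")
print(sum(1 for a in articles if "summary" in a), "with an English summary")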