Foinse scraper pieces
Scraping Foinse articles from the Wayback Machine, starting with a single archived page:
link = "http://web.archive.org/web/20130922081459/http://www.foinse.ie/nuacht/nuacht-is-deanai/6765-suil-go-gcruthofar-158-post-nua-le-tograi-ata-ceadaithe-ag-unag"
import requests
from bs4 import BeautifulSoup
page = requests.get(link)
assert page.status_code == 200
Working on the raw text alone, much of the surrounding junk can be discarded by splitting on these HTML comments:
if "<!-- CONTENT -->" in page.text:
trim = page.text.split("<!-- CONTENT -->")[1]
if trim and "<!-- //CONTENT -->" in trim:
trim = trim.split("<!-- //CONTENT -->")[0]
... but with BeautifulSoup it's easier to just extract <div class="item-page">:
soup = BeautifulSoup(page.text, "lxml")
content = soup.find("div", {"class": "item-page"})
title = content.find("h2").text.strip()
published_tag = content.find("dd", {"class": "published"})
if published_tag:
    published = published_tag.text.strip()
author_tag = content.find("dd", {"class": "createdby"})
if author_tag:
    author = author_tag.text.strip()
author  # notebook-style echo, to eyeball the extracted value
paragraphs_tags = content.find_all("p", {"class": "MsoNormal"})
paragraphs = [p.text.replace("\xa0", " ").strip() for p in paragraphs_tags]
vocab_list = []
for p in paragraphs_tags:
    # each glossary link pairs the Irish anchor text with an English gloss in its title attribute
    for vocab in p.find_all("a", {"class": "glossarylink"}):
        item = {}
        item["en"] = (vocab.get("title") or "").strip()  # guard: title attribute may be absent
        item["ga"] = vocab.text.strip()
        vocab_list.append(item)
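A quick peek at the collected pairs (a sanity-check sketch; the slice size is arbitrary):
for item in vocab_list[:3]:
    print(item["ga"], "->", item["en"])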
check = "http://web.archive.org/web/20171222073817/http://www.foinse.ie/nuacht/nuacht-is-deanai/6822-seanoiri-ag-dul-i-mbun-agoide-maidir-le-ciorruithe"
page2 = requests.get(check)
assert page2.status_code == 200
def get_content(url):
    """Fetch an archived Foinse article and extract its fields into a dict."""
    out = {}
    page = requests.get(url)
    if page.status_code != 200:
        return {}
    soup = BeautifulSoup(page.text, "lxml")
    content = soup.find("div", {"class": "item-page"})
    if not content:
        return {}
    title_tag = content.find("h2")  # guard first: .text on a missing tag would raise
    if title_tag:
        out["title"] = title_tag.text.strip()
    published_tag = content.find("dd", {"class": "published"})
    if published_tag:
        out["published"] = published_tag.text.strip()
    author_tag = content.find("dd", {"class": "createdby"})
    if author_tag:
        out["author"] = author_tag.text.strip()
    paragraphs_tags = content.find_all("p", {"class": "MsoNormal"})
    # normalise non-breaking spaces before stripping
    paragraphs = [p.text.replace("\xa0", " ").strip() for p in paragraphs_tags]
    out["text"] = paragraphs
    vocab_list = []
    for p in paragraphs_tags:
        for vocab in p.find_all("a", {"class": "glossarylink"}):
            item = {}
            item["en"] = (vocab.get("title") or "").strip()
            item["ga"] = vocab.text.strip()
            vocab_list.append(item)
    out["vocab"] = vocab_list
    return out
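A sketch of how get_content might be exercised on the two archived pages above (the printed fields match the keys built in the function):
for url in (link, check):
    article = get_content(url)
    if article:
        print(article.get("title", ""))
        print(len(article.get("text", [])), "paragraphs,",
              len(article.get("vocab", [])), "vocab items")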
def filter_para_list(inlist):
    """Drop empty paragraphs and cut the list at the trailing site slogan."""
    out = []
    for para in inlist:
        if para == "":
            continue
        elif para.strip() == "Foinse - News as Gaeilge":
            # everything from this footer line onwards is boilerplate
            return out
        else:
            out.append(para)
    return out
def extract_summary(inlist):
    """Return the English recap paragraph that follows a fixed prompt line, if present."""
    if len(inlist) > 2:
        if inlist[-2] == "Did you understand this story? Here are the main points:":
            return inlist[-1]
    return ""