# NOTE(review): this file appears to be a pasted REPL/notebook transcript —
# top-level statements are interleaved with the function definitions below.
# Sample article URL used for interactive testing of the scraper helpers.
sample = 'http://nos.ie/cultur/scannain/fisean-out-of-innocence-agallamh-le-heoin-o-dubhghaill/'
import requests
from bs4 import BeautifulSoup
page = requests.get(sample)
soup = BeautifulSoup(page.text, 'lxml')
# NOTE(review): {'id', 'video-wrapper'} is a *set*, not a dict — this does NOT
# filter on the id attribute as presumably intended (compare the corrected
# dict form inside _get_video below).
vid = soup.find('div', {'id', 'video-wrapper'})
# NOTE(review): _get_video is defined later in the file, so executing the file
# top-to-bottom raises NameError here — transcript residue, not a real call.
_get_video(soup)
# NOTE(review): bare string literal — captured REPL output; a no-op statement.
'https://www.youtube.com/embed/lXr1QZPY7aY'
def _get_video(soup):
    """Return the src URL of the embedded video iframe in *soup*, or ''.

    Looks for a <div id="video-wrapper"> containing an <iframe>; falls back
    to the empty string when either element is absent.
    """
    wrapper = soup.find('div', {'id': 'video-wrapper'})
    if not wrapper:
        return ''
    frame = wrapper.find('iframe')
    if not frame:
        return ''
    return frame.get('src', '')
def _get_details(soup):
    """Extract publication metadata from an article page.

    Returns a dict containing whichever of the keys 'author', 'date',
    'categories', and 'tags' could be found; missing items are simply
    omitted rather than raising.
    """
    details = {}
    pubdet = soup.find("div", {"id": "single-publish-details"})
    # Defect fixed: original dereferenced pubdet unconditionally and crashed
    # with AttributeError on pages lacking the publish-details block.
    if pubdet is None:
        return details
    ptags = pubdet.find_all('p')
    # First <p> holds the author inside a <b>; second holds the date.
    # Defect fixed: original indexed ptags[0]/ptags[1] and raised IndexError
    # when fewer than two <p> tags were present.
    if len(ptags) > 0 and ptags[0].b:
        details['author'] = ptags[0].b.get_text(strip=True)
    if len(ptags) > 1:
        details['date'] = ptags[1].get_text(strip=True)
    broll = pubdet.find("div", {"class": "blogroll-tag-category"})
    if broll is not None:
        cats = {a.get_text(strip=True)
                for a in broll.find_all("a", {"class": "featured-category"})
                if a.get_text(strip=True)}
        if cats:
            details['categories'] = list(cats)
        tags = {a.get_text(strip=True)
                for a in broll.find_all("a", {"class": "featured-tag"})
                if a.get_text(strip=True)}
        if tags:
            details['tags'] = list(tags)
    return details
# NOTE(review): transcript residue — _get_subhead is defined just below, so
# this call raises NameError when the file runs top-to-bottom; the following
# bare '' is captured REPL output (a no-op).
_get_subhead(soup)
''
def _get_subhead(soup):
    """Return the article subheading (the <span> inside the page <h1>), or ''."""
    content = soup.find("div", {"id": "single-area-center"})
    # Defect fixed: original dereferenced content.h1 unconditionally and
    # crashed with AttributeError on pages lacking the content div; the
    # unused local `out = []` has also been removed.
    if content and content.h1 and content.h1.span:
        return content.h1.span.get_text(strip=True)
    return ''
def _mksoup(url):
    """Fetch *url* over HTTP and return its body parsed with the lxml parser."""
    response = requests.get(url)
    return BeautifulSoup(response.text, 'lxml')
def _read_menu():
    """Return the set of category-page URLs linked from the nos.ie main menu.

    Performs a live HTTP request against http://nos.ie/.
    """
    # Consistency: reuse the _mksoup helper instead of duplicating the
    # requests + BeautifulSoup boilerplate.
    soup = _mksoup("http://nos.ie/")
    menu = soup.find("ul", {"id": "menu-main-menu"})
    cat_pages = set()
    # Defect fixed: original crashed with AttributeError if the menu <ul>
    # was missing, and with KeyError if an <a> lacked an href attribute.
    if menu is None:
        return cat_pages
    for li in menu.find_all("li"):
        if li.a and li.a.get('href'):
            cat_pages.add(li.a['href'])
    return cat_pages
# NOTE(review): transcript residue — _get_article_list is defined *below*
# this call, so executing the file raises NameError here; `len(a)` and the
# bare 296 are captured REPL output (no-op statements). Also note this
# performs live network I/O at import time.
links = _read_menu()
a = _get_article_list(links)
len(a)
296
def _get_article_list(urls):
    """Crawl each category URL in *urls* plus its pagination pages and return
    the de-duplicated list of article URLs found.

    Performs one HTTP request per category page and per pagination page.
    """
    articles = set()
    pending = set()
    for url in urls:
        # Consistency: reuse _mksoup instead of duplicating the
        # requests + BeautifulSoup boilerplate (twice, in the original).
        soup = _mksoup(url)
        # Queue the "page 2..N" pagination URLs for a second pass.
        pending |= set(_get_remainder(soup))
        articles |= set(_collect_articles(soup))
    for url in pending:
        soup = _mksoup(url)
        articles |= set(_collect_articles(soup))
    return list(articles)
def _get_remainder(soup):
    """Given page 1 of a paginated category listing, return the URLs of
    pages 2..last; return [] when *soup* is not such a first page.
    """
    import re
    pagination = soup.find("div", {"class": "pagination"})
    if not pagination:
        return []
    current = pagination.find("span", {"class": "current"})
    # Only expand from page "1" so each category's pagination is expanded once.
    if not (current and current.get_text(strip=True) == "1"):
        return []
    anchors = pagination.find_all('a')
    # Defect fixed: original indexed cats[-1] and crashed with IndexError
    # when the pagination block contained no <a> tags.
    if not anchors:
        return []
    last_url = anchors[-1].get('href', '')
    if not last_url:
        return []
    # Defect fixed: leftover debug print(last_url) removed; regex is now a
    # raw string. URLs are assumed to end ".../<N>/" with N the last page.
    m = re.match(r"(.*/)([0-9]+)/$", last_url)
    if not m:
        return []
    base = m.group(1)
    last_page = int(m.group(2))
    return [f'{base}{i}/' for i in range(2, last_page + 1)]
def _collect_articles(soup):
    """Return the unique article URLs linked from the blogroll posts in *soup*."""
    urls = set()
    for post in soup.find_all("article", {"class": "blogroll-post"}):
        link = post.find('a')
        # Defect fixed: original crashed with AttributeError when a post had
        # no <a> tag, and silently added None to the set when the <a> had
        # no href attribute.
        if link and link.get('href'):
            urls.add(link['href'])
    return list(urls)
# NOTE(review): final transcript residue — these live crawling calls run at
# import time, issuing network requests against nos.ie (hundreds of them via
# _get_article_list). The bare _collect_articles(soup) call discards its
# result (captured REPL output). Consider moving this under
# `if __name__ == "__main__":` or deleting it.
top = _read_menu()
page = requests.get("http://nos.ie/category/cultur/ceol/")
soup = BeautifulSoup(page.text, 'lxml')
_collect_articles(soup)
arts = _get_article_list(top)