# Nós scraper pieces
# In-progress pieces of the Nós (nos.ie) scraper.
sample = 'http://nos.ie/cultur/scannain/fisean-out-of-innocence-agallamh-le-heoin-o-dubhghaill/'
import requests
from bs4 import BeautifulSoup

# Ad-hoc smoke test: fetch the sample article and locate its video wrapper.
page = requests.get(sample)
soup = BeautifulSoup(page.text, 'lxml')
# BUG FIX: the original passed the set {'id', 'video-wrapper'} to find(), which
# matches on attribute *names* only; a dict mapping name -> value is intended.
vid = soup.find('div', {'id': 'video-wrapper'})
# NOTE: a stray `_get_video(soup)` call stood here, before the function is
# defined below — it raised NameError at import time, so it has been removed.
def _get_video(soup):
    """Return the src URL of the iframe inside the #video-wrapper div, or ''."""
    wrapper = soup.find('div', {'id': 'video-wrapper'})
    if not wrapper:
        return ''
    frame = wrapper.find('iframe')
    return frame.get('src', '') if frame else ''
def _get_details(soup):
    """Extract publication details from an article page.

    Returns a dict containing whichever of the keys 'author', 'date',
    'categories', 'tags' could be found; missing data is simply omitted.
    Returns {} when the publish-details block is absent entirely.
    """
    details = {}
    pubdet = soup.find("div", {"id": "single-publish-details"})
    if pubdet is None:
        # Different page layout (or failed fetch) — nothing to extract.
        return details
    ptags = pubdet.find_all('p')
    # First <p> carries the author inside <b>; second carries the date.
    # Guards fix IndexError the original raised when fewer <p> tags exist.
    if len(ptags) > 0 and ptags[0].b:
        details['author'] = ptags[0].b.get_text(strip=True)
    if len(ptags) > 1 and ptags[1]:
        details['date'] = ptags[1].get_text(strip=True)
    broll = pubdet.find("div", {"class": "blogroll-tag-category"})
    if broll is None:
        return details
    cats = {a.get_text(strip=True)
            for a in broll.find_all("a", {"class": "featured-category"})
            if a.get_text(strip=True)}
    if cats:
        details['categories'] = list(cats)
    tags = {a.get_text(strip=True)
            for a in broll.find_all("a", {"class": "featured-tag"})
            if a.get_text(strip=True)}
    if tags:
        details['tags'] = list(tags)
    return details
# Removed: a stray `_get_subhead(soup)` call that ran before _get_subhead was
# defined below — it raised NameError at import time and its result was unused.
def _get_subhead(soup):
    """Return the article subheading (the <span> inside the page's <h1>), or ''.

    Fixes from the original: the unused `out = []` local is dropped, and a
    missing #single-area-center div no longer raises AttributeError.
    """
    content = soup.find("div", {"id": "single-area-center"})
    if content and content.h1 and content.h1.span:
        return content.h1.span.get_text(strip=True)
    return ''
def _mksoup(url):
    """Fetch *url* over HTTP and return it parsed as a BeautifulSoup document."""
    response = requests.get(url)
    return BeautifulSoup(response.text, 'lxml')
def _read_menu():
    """Return the set of category-page URLs linked from the nos.ie main menu.

    Returns an empty set when the menu markup is absent (layout change or
    failed fetch) instead of raising AttributeError.
    """
    soup = _mksoup("http://nos.ie/")  # reuse the shared fetch/parse helper
    menu = soup.find("ul", {"id": "menu-main-menu"})
    cat_pages = set()
    if menu is None:
        return cat_pages
    for li in menu.find_all("li"):
        # Skip list items without a link, or links without an href.
        if li.a and li.a.get('href'):
            cat_pages.add(li.a['href'])
    return cat_pages
# Removed: stray scratch calls (`links = _read_menu()`,
# `a = _get_article_list(links)`, `len(a)`) that ran before
# _get_article_list was defined below — they raised NameError at import
# time, and the bare `len(a)` discarded its result anyway.
def _get_article_list(urls):
    """Collect article URLs from every category page in *urls*.

    Pass 1 scrapes each category front page and discovers its remaining
    pagination pages; pass 2 scrapes those. Returns a de-duplicated list
    of article URLs.
    """
    rest = set()
    articles = set()
    for url in urls:
        soup = _mksoup(url)  # reuse the shared fetch/parse helper
        rest |= set(_get_remainder(soup))
        articles |= set(_collect_articles(soup))
    for url in rest:
        articles |= set(_collect_articles(_mksoup(url)))
    return list(articles)
def _get_remainder(soup):
    """From a category front page, derive the URLs of pagination pages 2..N.

    Only acts when the pagination widget marks page 1 as current (so other
    pages don't duplicate the expansion). Returns [] when pagination is
    absent or the last page link cannot be parsed.
    """
    import re
    pagination = soup.find("div", {"class": "pagination"})
    if not pagination:
        return []
    current = pagination.find("span", {"class": "current"})
    if not (current and current.get_text(strip=True) == "1"):
        return []
    page_links = pagination.find_all('a')
    if not page_links:
        # Guard: the original indexed [-1] and raised IndexError here.
        return []
    last_url = page_links[-1].get('href', '')
    if not last_url:
        return []
    # Last link looks like ".../<base>/<N>/"; capture the base and N.
    # (raw string for the regex; debug print() removed)
    m = re.match(r"(.*/)([0-9]+)/$", last_url)
    if not m:
        return []
    base = m.group(1)
    last_page = int(m.group(2))
    return [f'{base}{i}/' for i in range(2, last_page + 1)]
def _collect_articles(soup):
    """Return the unique article URLs linked from blogroll posts in *soup*.

    Fixes from the original: a post without an <a> no longer raises
    AttributeError, and an <a> without an href no longer adds None.
    """
    out = set()
    for art in soup.find_all("article", {"class": "blogroll-post"}):
        link = art.find('a')
        if link and link.get('href'):
            out.add(link['href'])
    return list(out)
# Ad-hoc smoke tests run at import time (scratch/notebook residue).
# Fetch the set of category URLs from the site's main menu.
top = _read_menu()
# Scrape one known category page and list its article links.
page = requests.get("http://nos.ie/category/cultur/ceol/")
soup = BeautifulSoup(page.text, 'lxml')
_collect_articles(soup)
# Full crawl over every menu category — network-bound and slow.
arts = _get_article_list(top)