Scraper pieces for beo.ie
Incomplete; most of the pieces are done.
# URL of a single article page; it is fetched and parsed just below.
sample_url = 'http://beo.ie/alt-an-eaglais-fein-a-bheas-thios-leis-ma-chuirtear-ba.aspx'
import requests
from bs4 import BeautifulSoup
# NOTE: these two statements run at module level, so executing the file
# performs an HTTP GET against `sample_url` immediately.
page = requests.get(sample_url)
# Parse the response body with the stdlib-backed 'html.parser' backend.
soup = BeautifulSoup(page.text, 'html.parser')
def _get_translations(soup):
    """Collect the glossed terms found in a parsed page.

    Every ``<span class="gloss">`` that carries a ``title`` attribute and
    has non-empty text yields one entry.

    Args:
        soup: Parsed document (or any object exposing a compatible
            ``find_all``).

    Returns:
        A list of dicts, in document order, where ``'en'`` holds the span's
        ``title`` attribute and ``'ga'`` holds the span's text.
    """
    translations = []
    for gloss in soup.find_all('span', {'class': 'gloss'}):
        english = gloss.get('title')
        irish = gloss.text
        # Identity check: an empty title is still kept, a missing one is not.
        if english is not None and irish:
            translations.append({'en': english, 'ga': irish})
    return translations
def _get_captioned_images(soup):
    """Collect images that come with a caption.

    Looks at every ``<div class="pic">``; a picture is reported only when it
    contains both a ``<div class="title">`` (the caption) and an ``<img>``
    with a non-empty ``src``.

    Args:
        soup: Parsed document (or any object exposing a compatible
            ``find_all``).

    Returns:
        A list of ``{'image': <url>, 'caption': <text>}`` dicts in document
        order. The URL is the ``src`` value appended to ``http://beo.ie/``.
    """
    images = []
    for pic in soup.find_all('div', {'class': 'pic'}):
        caption = pic.find('div', {'class': 'title'})
        if caption is None:
            continue
        img = pic.find('img')
        # A captioned block without an <img> used to raise AttributeError.
        if img is None:
            continue
        src = img.get('src')
        # A missing src used to produce the bogus URL "http://beo.ie/None".
        if not src:
            continue
        images.append({'image': f"http://beo.ie/{src}", 'caption': caption.text})
    return images
def _get_title(soup):
    """Return the article title taken from the page's ``<title>`` tag.

    The site prefix ``'Beo! - '`` is stripped from the front of the text.

    Args:
        soup: Parsed document (or any object exposing a compatible ``find``).

    Returns:
        The title without the prefix, or ``None`` when the page has no
        ``<title>`` tag or its text does not start with the prefix.
    """
    prefix = 'Beo! - '
    title_tag = soup.find('title')
    # No <title> at all: report "no title" instead of raising AttributeError.
    if title_tag is None:
        return None
    title = title_tag.text
    if title and title.startswith(prefix):
        return title[len(prefix):]
    return None
def _get_blurb(soup):
    """Return the stripped text of the ``<div class="blurb">`` element.

    Args:
        soup: Parsed document (or any object exposing a compatible ``find``).

    Returns:
        The blurb text with surrounding whitespace removed, or ``None`` when
        the page has no blurb element.
    """
    # The attribute filter must be a dict; the previous set literal
    # {'class', 'blurb'} was a typo for {'class': 'blurb'}.
    blurb = soup.find('div', {'class': 'blurb'})
    if blurb is None:
        return None
    return blurb.text.strip()
def _get_author(soup):
    """Return the author name shown on an article page.

    The name is read from the ``<span class="smallscreenInline">`` nested
    inside ``<div class="author">``.

    Args:
        soup: Parsed document (or any object exposing a compatible ``find``).

    Returns:
        The stripped author text, or ``None`` when either the author div or
        the inner span is missing.
    """
    author_div = soup.find('div', {'class': 'author'})
    if author_div is None:
        return None
    name_span = author_div.find('span', {'class': 'smallscreenInline'})
    if name_span is None:
        return None
    return name_span.text.strip()
def _get_paragraphs(soup):
    """Return the non-empty paragraphs of the article body.

    Paragraphs are the ``<p>`` elements inside ``<div class="content">``;
    each one is whitespace-stripped and dropped if nothing remains.

    Args:
        soup: Parsed document (or any object exposing a compatible ``find``).

    Returns:
        A list of paragraph strings in document order; empty when the page
        has no content div.
    """
    content = soup.find('div', {'class': 'content'})
    # Pages without a content div used to raise AttributeError here.
    if content is None:
        return []
    stripped = (p.text.strip() for p in content.find_all('p'))
    return [text for text in stripped if text]
# Example URL of one edition page (the 2014-09 edition, going by the URL).
edition_sample = 'http://beo.ie/eagran-2014-09.aspx'
def _get_article_links(url, *, timeout=30):
    """Fetch an edition page and return the article URLs it lists.

    Every ``href`` found on an ``<a>`` inside a
    ``<div class="articleListing">`` is appended to ``http://beo.ie/``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without it a stalled connection
            would block indefinitely.

    Returns:
        A sorted list of unique article URLs.

    Raises:
        requests.exceptions.RequestException: propagated from
            ``requests.get`` (including timeouts).
    """
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.text, 'html.parser')
    links = set()
    for listing in soup.find_all('div', {'class': 'articleListing'}):
        for anchor in listing.find_all('a'):
            href = anchor.get('href')
            if href:
                links.add(f"http://beo.ie/{href}")
    # Sorting makes the result reproducible; set iteration order is arbitrary.
    return sorted(links)
def _get_edition_links(first_year=2001, last_year=2014, *, timeout=30):
    """Fetch the per-year edition indexes and return every edition URL.

    For each year in the inclusive range, downloads
    ``http://beo.ie/Editions.aspx?Year=<year>`` and collects the ``href`` of
    each ``<a>`` inside ``<ul class="editions">``.

    Args:
        first_year: First year to query. The default matches the range that
            was previously hard-coded (2001).
        last_year: Last year to query, inclusive (default 2014).
        timeout: Seconds to wait for each response; passed to
            ``requests.get`` so a stalled connection cannot block forever.

    Returns:
        A sorted list of unique edition URLs, each prefixed with
        ``http://beo.ie/``.

    Raises:
        requests.exceptions.RequestException: propagated from
            ``requests.get`` (including timeouts).
    """
    links = set()
    for year in range(first_year, last_year + 1):
        url = f"http://beo.ie/Editions.aspx?Year={year}"
        page = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(page.text, 'html.parser')
        editions = soup.find('ul', {'class': 'editions'})
        # A year page without an editions list used to raise AttributeError;
        # skip it so the remaining years are still collected.
        if editions is None:
            continue
        for anchor in editions.find_all('a'):
            href = anchor.get('href')
            if href:
                links.add(f"http://beo.ie/{href}")
    # Sorting makes the result reproducible; set iteration order is arbitrary.
    return sorted(links)