Seanchas Rann na Feirste scraper pieces
Incomplete pieces for a scraper
import requests
from bs4 import BeautifulSoup
BASE='http://www.rannnafeirste.com'
class Page:
    """One page of the Seanchas Rann na Feirste site, scraped on demand.

    A page is addressed by its URL slug (``id``) under ``BASE``.  Calling
    :meth:`scrape` downloads the page and populates ``audio``,
    ``fragments``, ``author``, ``em_para`` and ``paragraphs``.
    """

    # Seconds to wait on the server; without a timeout requests can block
    # forever on an unresponsive host.
    _TIMEOUT = 30

    # A paragraph starting with one of these marks the end of the body text.
    _NOTE_PREFIXES = ('Nóta', 'NÓTA')

    # Pages whose paragraph list gets the title prepended.
    _TITLE_IDS = (
        'mo-bhaile-dchais',
        'taiscidh-ghleann-domhain',
        'banron-an-uaignis',
        'non-an-r-agus-an-frog',
        'seanchaithe-agus-fil-rann-na-feirste',
        'an-ghaeltacht-bheo',
    )

    # Pages whose paragraph list gets "<title> le <em_para minus ' a chum'>"
    # prepended.
    _TITLE_LE_IDS = (
        'liontar-duinn-an-cruiscin',
        'oireachtas-na-ndise',
        'fidilir-ghleann-fhinne',
    )

    def __init__(self, id, title):
        """Remember the slug and title and derive the page URL from BASE.

        ``id`` shadows the builtin, but the parameter name is kept because
        callers may pass it by keyword.
        """
        self.id = id
        self.title = title
        self.url = '{}/{}'.format(BASE, id)

    # TODO: stop trying to make fetch happen
    def _fetch_text(self):
        """Download the page into ``self.content``.

        Raises:
            Exception: if the response status is not 200.
        """
        req = requests.get(self.url, timeout=self._TIMEOUT)
        if req.status_code != 200:
            raise Exception('Error fetching page ' + self.url)
        self.content = req.content

    def _soupynorman(self):
        """Parse ``self.content`` into ``self.soup`` with html.parser."""
        self.soup = BeautifulSoup(self.content, 'html.parser')

    def _fetch_audio(self):
        """Store the ``data-url`` of the audio embed div in ``self.audio``.

        Raises:
            Exception: if the div or its ``data-url`` attribute is missing.
        """
        audio_div = self.soup.find("div", class_='sqs-audio-embed')
        audio_url = audio_div.get("data-url") if audio_div is not None else None
        if audio_url is None:
            raise Exception('Error reading audio: ' + self.url)
        self.audio = audio_url

    def _fetch_fragments(self):
        """Store the children of the content block that opens with an <h1>.

        If several blocks qualify, the last one wins.

        Raises:
            Exception: if no content block starts with an <h1>.
        """
        found = None
        for block in self.soup.find_all("div", class_='sqs-block-content'):
            children = list(block.children)
            # An empty block has no first child to inspect.
            if children and children[0].name == "h1":
                found = children
        if found is None:
            raise Exception('Error reading fragments: ' + self.url)
        self.fragments = found

    ## don't actually need this, because the title comes from the landing page
    def _fetch_title(self):
        """Overwrite ``self.title`` with the text of the leading <h1>.

        Raises:
            Exception: if the first fragment is not an <h1>.
        """
        if self.fragments and self.fragments[0].name == "h1":
            self.title = self.fragments[0].text
        else:
            raise Exception('Error reading title: ' + self.url)

    def _fetch_author(self):
        """Store the text of the second fragment (an <h2>) in ``self.author``.

        Raises:
            Exception: if there are fewer than three fragments or the
                second one is not an <h2>.
        """
        if len(self.fragments) > 2 and self.fragments[1].name == "h2":
            self.author = self.fragments[1].text
        else:
            raise Exception('Error reading author: ' + self.url)

    def _fetch_paragraphs(self):
        """Extract the body paragraphs into ``self.paragraphs``.

        Every <br> inside a <p> fragment becomes a newline.  A first
        paragraph consisting of a single <em> is moved to ``self.em_para``
        (``None`` when there is none).  The body stops just before the
        first paragraph that starts with 'Nóta' or 'NÓTA'.
        """
        raw_paras = [n for n in self.fragments if n.name == "p"]
        for frag in raw_paras:
            for br in frag.find_all("br"):
                # Put a newline inside the <br>, then drop the tag itself so
                # only the newline is left in the text.
                br.insert(0, '\n')
                br.unwrap()

        self.em_para = None
        if raw_paras:
            first = list(raw_paras[0].children)
            if len(first) == 1 and first[0].name == 'em':
                self.em_para = raw_paras[0].text.strip()
                del raw_paras[0]

        # Cut at the FIRST note paragraph; later notes must not move the cut.
        extent = len(raw_paras)
        for index, para in enumerate(raw_paras):
            if para.text.strip().startswith(self._NOTE_PREFIXES):
                extent = index
                break
        self.paragraphs = [p.text for p in raw_paras[:extent]]

    def get_initials(self):
        """Return the author's initials, upper-cased and without the fada.

        The author string is split on single spaces; empty pieces (from
        repeated spaces) contribute nothing.  If ``self.author`` is missing
        or not a string, a hint is printed and ``None`` is returned.
        """
        fada = {
            'Á': 'A',
            'É': 'E',
            'Í': 'I',
            'Ó': 'O',
            'Ú': 'U',
        }

        def initial(s):
            if not s:
                return ''
            first = s.upper()[0]
            return fada.get(first, first)

        try:
            words = self.author.split(' ')
        except AttributeError:
            print('Author missing: did you run scrape()?')
            return None
        return "".join(initial(word) for word in words)

    def _specifics(self):
        """Prepend a heading paragraph for the pages that need one.

        Raises:
            Exception: if a page listed in ``_TITLE_LE_IDS`` has no
                ``em_para`` to build its heading from.
        """
        if self.id in self._TITLE_IDS:
            self.paragraphs.insert(0, self.title)
        if self.id in self._TITLE_LE_IDS:
            em_para = getattr(self, 'em_para', None)
            if em_para is None:
                raise Exception('Error reading attribution: ' + self.url)
            second = em_para.replace(' a chum', '')
            self.paragraphs.insert(0, '{} le {}'.format(self.title, second))

    def scrape(self):
        """Fetch the page and run every extraction step in order."""
        self._fetch_text()
        self._soupynorman()
        self._fetch_audio()
        self._fetch_fragments()
        self._fetch_author()
        self._fetch_paragraphs()
        self._specifics()
# Scratch run: scrape one page, then replay the paragraph clean-up from
# Page._fetch_paragraphs by hand on its fragments.
foo = Page('deorai-an-oileain', 'Mo Bhaile')
foo.scrape()
foo.paragraphs  # bare expression: only displays a value in an interactive session

# Replace every <br> in one hand-picked fragment with a newline.
para = foo.fragments[4]
#one = para.contents[0]
for br in para.find_all("br"):
    br.insert(0, '\n')
    br.unwrap()
para.contents

# Keep only the <p> fragments.
raw_paras = [n for n in foo.fragments if n.name == "p"]
#raw_paras

# Drop a leading paragraph that is nothing but a single <em>.
first = list(raw_paras[0].children)
if len(first) == 1 and first[0].name == 'em':
    del raw_paras[0]

# Find the index of the FIRST paragraph starting with 'Nóta' / 'NÓTA'.
# The prefix test is checked together with `extent > counter` so that a
# later note cannot move the cut-off forward again.
extent = len(raw_paras)
counter = 0
for i in raw_paras:
    print(i.text)
    if i.text.strip().startswith(('Nóta', 'NÓTA')) and extent > counter:
        extent = counter
    counter += 1
raw_paras[0:extent]
#counter