"""Doegen recordings scraper.

Only the scraping step lives here; processing of the scraped data
happens later, elsewhere.
"""
import requests
from bs4 import BeautifulSoup
import json
_BASE = 'https://doegen.ie/counties'
def do_get(url):
r = requests.get(url, headers = {'User-agent': 'Mozilla/5.0'})
if r.status_code != 200:
raise Exception("Failed to open landing page")
return r.content
soup = BeautifulSoup(do_get(_BASE), 'html.parser')
counties = soup.find('ul', {'class': 'vocabindex'}).find_all('li')
pages = []
for county in counties:
item = {}
anchor = county.find('a')
href = anchor['href']
item['link'] = f'https://doegen.ie{href}'
if anchor.find('span').text.strip() != '(0)':
item['county'] = anchor.text.split()[1]
pages.append(item)
def proc_page(url):
result = {}
html = do_get(url)
soup = BeautifulSoup(html, 'html.parser')
main = soup.find('div', {'id': 'main'})
content = main.find('div', {'class': 'content'})
source = content.find('source')
if source == None:
return {}
result['mp3'] = source['src']
result['transcript'] = content.find('div', id='transcript').text
if content.find('div', id='translation') != None:
result['translation'] = content.find('div', id='translation').text
if content.find('div', id='footnote') != None:
result['footnote'] = content.find('div', id='footnote').text
result['recording_metadata'] = content.find('div', id='recording_metadata').text
return result
def proc_county(item):
content = do_get(item['link'])
soup = BeautifulSoup(content, 'html.parser')
main = soup.find('div', id='main')
nodes = main.find_all('div', {'class': 'node'})
stories = []
for node in nodes:
story = {}
anchor = node.find('a')
story['link'] = f"https://doegen.ie{anchor['href']}"
story['content'] = proc_page(story['link'])
if story['content'] == {}:
continue
tags = node.find('div', {'class': 'terms'}).find_all('a', rel='tag')
text = anchor.text
if ' - ' in text:
tmp = text.split(' - ')
if len(tmp) == 2:
story['title'] = tmp[0]
story['speaker_name'] = tmp[1]
name_parts = tmp[1].split(' ')
first = name_parts[0]
for tag in tags:
if first in tag.text:
story['speaker_url'] = f"https://doegen.ie{tag['href']}"
else:
story['raw'] = text
else:
story['raw'] = text
stories.append(story)
item['stories'] = stories
for page in pages:
proc_county(page)
with open('doegen.json', 'w') as f:
json.dump(pages, f)