"""Doegen recordings scraper.

Only the scraping step lives here; processing of the scraped data
happens later, elsewhere.
"""
import requests
from bs4 import BeautifulSoup
import json
_BASE = 'https://doegen.ie/counties'
def do_get(url):
r = requests.get(url, headers = {'User-agent': 'Mozilla/5.0'})
if r.status_code != 200:
raise Exception("Failed to open landing page")
return r.content
soup = BeautifulSoup(do_get(_BASE), 'html.parser')
counties = soup.find('ul', {'class': 'vocabindex'}).find_all('li')
pages = []
for county in counties:
item = {}
anchor = county.find('a')
href = anchor['href']
item['link'] = f'https://doegen.ie{href}'
if anchor.find('span').text.strip() != '(0)':
item['county'] = anchor.text.split()[1]
pages.append(item)
def proc_page(url):
result = {}
html = do_get(url)
soup = BeautifulSoup(html, 'html.parser')
main = soup.find('div', {'id': 'main'})
content = main.find('div', {'class': 'content'})
source = content.find('source')
if source == None:
return {}
result['mp3'] = source['src']
result['transcript'] = content.find('div', id='transcript').text
if content.find('div', id='translation') != None:
result['translation'] = content.find('div', id='translation').text
if content.find('div', id='footnote') != None:
result['footnote'] = content.find('div', id='footnote').text
result['recording_metadata'] = content.find('div', id='recording_metadata').text
return result
def proc_county(item):
content = do_get(item['link'])
soup = BeautifulSoup(content, 'html.parser')
main = soup.find('div', id='main')
nodes = main.find_all('div', {'class': 'node'})
stories = []
for node in nodes:
story = {}
anchor = node.find('a')
story['link'] = f"https://doegen.ie{anchor['href']}"
story['content'] = proc_page(story['link'])
if story['content'] == {}:
continue
tags = node.find('div', {'class': 'terms'}).find_all('a', rel='tag')
text = anchor.text
if ' - ' in text:
tmp = text.split(' - ')
if len(tmp) == 2:
story['title'] = tmp[0]
story['speaker_name'] = tmp[1]
name_parts = tmp[1].split(' ')
first = name_parts[0]
for tag in tags:
if first in tag.text:
story['speaker_url'] = f"https://doegen.ie{tag['href']}"
else:
story['raw'] = text
else:
story['raw'] = text
stories.append(story)
item['stories'] = stories
for page in pages:
proc_county(page)
with open('doegen.json', 'w') as f:
json.dump(pages, f)