# Listing page queried with function=find&language=english; passed to
# get_accents() below to enumerate the per-speaker detail links.
LANDING = (
    "https://accent.gmu.edu/browse_language.php"
    "?function=find&language=english"
)
# One speaker detail page (speakerid=1217), the kind of URL that
# get_page_details() parses. Not referenced by the visible code.
SAMPLE = (
    "https://accent.gmu.edu/browse_language.php"
    "?function=detail&speakerid=1217"
)
import requests
from bs4 import BeautifulSoup

def get_accents(landing, timeout=30):
    """Collect the links listed on an accent.gmu.edu browse page.

    Fetches ``landing`` and looks inside
    ``<div id="maincontent"><div class="content">``. One entry is produced
    for every ``<p>`` that contains a link with an ``href``.

    Args:
        landing: URL of the listing page to fetch.
        timeout: Seconds ``requests`` waits on the server before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        A list of ``(label, url, rest)`` tuples, in page order. ``label`` is
        the stripped link text, ``url`` is the link target resolved against
        ``https://accent.gmu.edu/``, and ``rest`` is the paragraph text with
        the link text removed and surrounding whitespace stripped.

    Raises:
        AssertionError: If the response status is not 200.
        AttributeError: If the page lacks the expected ``maincontent`` /
            ``content`` containers.
        requests.exceptions.RequestException: On connection errors or when
            ``timeout`` expires.
    """
    from urllib.parse import urljoin

    # The request needs a user agent.
    response = requests.get(
        landing,
        headers={'User-Agent': 'Mozilla/5.0'},
        timeout=timeout,
    )
    # Raised explicitly (rather than via ``assert``) so the check is not
    # stripped under ``python -O``; the exception type is kept for callers.
    if response.status_code != 200:
        raise AssertionError(
            f"Failed to fetch {landing}: {response.status_code}"
        )
    soup = BeautifulSoup(response.text, 'html.parser')

    # <div id="maincontent"><div class="content">
    main_content = soup.find('div', id='maincontent')
    content = main_content.find('div', class_='content')

    tovisit = []
    for p in content.find_all('p'):
        a = p.find('a')
        if a is None:
            continue
        href = a.get("href")
        if not href:
            # An anchor without a target cannot be visited; previously this
            # produced the bogus URL "https://accent.gmu.edu/None".
            continue
        # urljoin gives the same result as plain concatenation for relative
        # hrefs and also copes with absolute or root-relative ones.
        url = urljoin('https://accent.gmu.edu/', href)
        label = a.text.strip()
        rest = p.text.replace(a.text, '').strip()
        tovisit.append((label, url, rest))

    return tovisit
# Fetch the listing once at module level; every entry is the 3-tuple that
# get_accents() builds for one linked paragraph.
english_native = get_accents(landing=LANDING)
import re

def get_page_details(sample, timeout=30):
    """Scrape the bio, transcript and audio link from one detail page.

    Args:
        sample: URL of the detail page to fetch.
        timeout: Seconds ``requests`` waits on the server before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        A dict with three keys:

        * ``"details"``: dict mapping each ``<em>`` label found in
          ``<ul class="bio">`` (colons and surrounding whitespace stripped)
          to the remaining text of its ``<li>``, with newlines turned into
          spaces. Empty when the list is missing; items without an ``<em>``
          label are skipped.
        * ``"elicitation"``: text of ``<p class="transtext">`` with
          non-breaking spaces replaced and whitespace runs collapsed to
          single spaces, or ``None`` when the element is missing.
        * ``"audio"``: the ``src`` of the
          ``<source type="audio/mpeg" alt="SpeechSample">`` element resolved
          against ``https://accent.gmu.edu``, or ``None`` when the element
          or its ``src`` is missing.

    Raises:
        AssertionError: If the response status is not 200.
        requests.exceptions.RequestException: On connection errors or when
            ``timeout`` expires.
    """
    from urllib.parse import urljoin

    response = requests.get(
        sample,
        headers={'User-Agent': 'Mozilla/5.0'},
        timeout=timeout,
    )
    # Raised explicitly (rather than via ``assert``) so the check is not
    # stripped under ``python -O``; the exception type is kept for callers.
    if response.status_code != 200:
        raise AssertionError(
            f"Failed to fetch {sample}: {response.status_code}"
        )
    soup = BeautifulSoup(response.text, 'html.parser')

    details = {}
    bio = soup.find('ul', class_='bio')
    # A missing bio list is tolerated the same way a missing transcript or
    # audio element is: the field is simply left empty.
    for li in (bio.find_all('li') if bio is not None else []):
        em = li.find('em')
        if em is None:
            # No label to use as a key.
            continue
        label = em.text
        key = label.strip(':').strip()
        value = li.text.replace(label, '').strip().replace("\n", " ")
        details[key] = value

    transtext = soup.find('p', {"class": "transtext"})
    if transtext is None:
        elicitation = None
    else:
        # Single pass: NBSP -> space, collapse every whitespace run
        # (including newlines) to one space, trim both ends.
        elicitation = re.sub(
            r'\s+', ' ', transtext.text.replace("\xa0", " ")
        ).strip()

    audio_raw = soup.find(
        "source", {"type": "audio/mpeg", "alt": "SpeechSample"}
    )
    src = audio_raw.get('src') if audio_raw is not None else None
    # Guard against a <source> without src, which previously produced
    # "https://accent.gmu.eduNone".
    audio = urljoin("https://accent.gmu.edu", src) if src else None

    return {
        "details": details,
        "elicitation": elicitation,
        "audio": audio
    }
# Visit every listed page and tag its scraped record with the listing
# fields. The list is filled incrementally, one page at a time.
data = []
for speaker_id, page_url, listing_text in english_native:
    record = get_page_details(page_url)
    record.update({"id": speaker_id, "url": page_url, "rest": listing_text})
    data.append(record)
import json

# Persist all scraped records as 2-space-indented JSON in the working directory.
with open("english_native.json", mode="w") as out_file:
    json.dump(obj=data, fp=out_file, indent=2)