# accents.gmu.edu scraper
# Fetches samples spoken by English native speakers from the Speech Accent Archive.
# Landing page listing all native-English speakers, and one example
# speaker-detail page (kept for manual testing of get_page_details).
LANDING = "https://accent.gmu.edu/browse_language.php?function=find&language=english"
SAMPLE = "https://accent.gmu.edu/browse_language.php?function=detail&speakerid=1217"
import requests
from bs4 import BeautifulSoup
def get_accents(landing):
    """Scrape a browse_language.php results page and collect speaker links.

    Args:
        landing: URL of the language-browse results page.

    Returns:
        A list of ``(speaker_id, detail_url, description)`` tuples, one per
        speaker entry found on the page.

    Raises:
        requests.HTTPError: if the page cannot be fetched.
    """
    # The site rejects requests without a browser-like User-Agent.
    response = requests.get(landing, headers={'User-Agent': 'Mozilla/5.0'})
    # Explicit raise instead of assert: asserts are stripped under `python -O`.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # Speaker entries live under <div id="maincontent"><div class="content">.
    main_content = soup.find('div', id='maincontent')
    content = main_content.find('div', class_='content')
    tovisit = []
    for p in content.find_all('p'):
        a = p.find('a')
        if not a:
            continue  # paragraph without a speaker link — skip
        url = f'https://accent.gmu.edu/{a.get("href")}'
        # Renamed from `id` to avoid shadowing the builtin.
        speaker_id = a.text.strip()
        # Paragraph text minus the anchor text is the speaker description.
        rest = p.text.replace(a.text, '').strip()
        tovisit.append((speaker_id, url, rest))
    return tovisit
english_native = get_accents(LANDING)
import re
def get_page_details(sample):
    """Fetch one speaker-detail page and extract its bio, elicitation text,
    and audio URL.

    Args:
        sample: URL of a browse_language.php?function=detail page.

    Returns:
        A dict with keys ``"details"`` (bio field name -> value),
        ``"elicitation"`` (whitespace-normalized paragraph text, or None),
        and ``"audio"`` (absolute MP3 URL, or None).

    Raises:
        requests.HTTPError: if the page cannot be fetched.
    """
    response = requests.get(sample, headers={'User-Agent': 'Mozilla/5.0'})
    # Explicit raise instead of assert: asserts are stripped under `python -O`.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    bio = soup.find('ul', class_='bio')
    details = {}
    for li in bio.find_all('li'):
        em = li.find('em')
        if em is None:
            continue  # malformed bio entry without a label — skip, don't crash
        key = em.text.strip(':').strip()
        # Remove only the FIRST occurrence of the label; an unbounded replace
        # would also delete the label text wherever it recurs in the value.
        value = li.text.replace(em.text, '', 1).strip()
        value = value.replace("\n", " ")
        details[key] = value
    elicitation = soup.find('p', {"class": "transtext"})
    elicitation = elicitation.text.strip() if elicitation else None
    if elicitation:
        # Normalize non-breaking spaces, then collapse all whitespace runs.
        elicitation = elicitation.replace("\xa0", " ")
        elicitation = " ".join(line.strip() for line in elicitation.split("\n"))
        elicitation = re.sub(r'\s+', ' ', elicitation)
    audio_raw = soup.find("source", {"type": "audio/mpeg", "alt": "SpeechSample"})
    # `src` on the page is site-root-relative — TODO confirm it always starts with '/'.
    audio = f"https://accent.gmu.edu{audio_raw.get('src')}" if audio_raw else None
    return {
        "details": details,
        "elicitation": elicitation,
        "audio": audio
    }
# Visit each speaker's detail page and merge the scraped record with the
# identifiers collected on the landing page.
data = []
for speaker_id, url, rest in english_native:
    record = get_page_details(url)
    record["id"] = speaker_id
    record["url"] = url
    record["rest"] = rest
    data.append(record)
import json
# Persist the scraped records. Open with an explicit UTF-8 encoding (the
# default is platform-dependent) and keep non-ASCII speaker names readable
# in the output instead of \uXXXX escapes.
with open("english_native.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)