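"""Sámi pronunciations scraper.

Courtesy of ChatGPT.
Scrapes the audio URL, the word text, and the IPA transcription of each
audio example from the North Sámi pronunciation guide page given by ``URL``.
"""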
import re
import csv
import json
from html import unescape
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
# Page to scrape: the North Sámi pronunciation guide post on
# oahpamuinna.wordpress.com.
URL = "https://oahpamuinna.wordpress.com/2021/12/28/guide-to-north-sami-pronunciation/"
# Matches a slash-delimited span and captures the text between the two
# slashes (one or more non-slash characters), e.g. the IPA in "/IPA/".
IPA_RE = re.compile(r"/([^/]+?)/") # capture IPA inside /.../
def fetch_soup(url: str) -> BeautifulSoup:
    """Download *url* and return its body parsed with ``html.parser``.

    The request is sent with a browser-like ``User-Agent`` header and a
    30-second timeout. ``raise_for_status()`` is called on the response, so
    an HTTP error status raises instead of being parsed as a page.
    """
    user_agent = "Mozilla/5.0 (compatible; bs4-scraper/1.0; +https://example.com/bot)"
    response = requests.get(url, headers={"User-Agent": user_agent}, timeout=30)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")
def parse_audio_entries(soup: BeautifulSoup, base_url: str):
    """Collect one record per audio figure whose caption contains an IPA span.

    Each audio example is typically marked up as::

        <figure class="wp-block-audio">
          <audio controls src="..."></audio>
          <figcaption> ... <strong>WORD</strong> /IPA/ <em>gloss</em> ... </figcaption>
        </figure>

    Args:
        soup: Parsed document searched for ``figure.wp-block-audio`` elements.
        base_url: URL against which each audio ``src`` value is resolved.

    Returns:
        A list of dicts with the keys ``audio_url``, ``word``, ``ipa``,
        ``ipas``, ``gloss`` and ``caption``. A figure is skipped when it has
        no ``<audio>`` with a ``src``, or when its caption yields no
        non-blank ``/.../`` match.
    """
    records = []
    for figure in soup.select("figure.wp-block-audio"):
        player = figure.find("audio")
        src = player.get("src") if player else None
        if not src:
            continue
        audio_url = urljoin(base_url, unescape(src).strip())

        # Without a caption there is no text to search for IPA, so the
        # figure cannot produce a record.
        caption = figure.find("figcaption")
        if not caption:
            continue
        caption_text = caption.get_text(" ", strip=True)

        # Keep every non-blank /.../ match; the first one is the primary IPA.
        stripped_matches = (match.strip() for match in IPA_RE.findall(caption_text))
        ipas = [match for match in stripped_matches if match]
        if not ipas:
            continue

        # The word is taken from the first <strong> in the caption and the
        # optional gloss from the first <em>; either may be missing.
        strong = caption.find("strong")
        emphasis = caption.find("em")

        records.append(
            {
                "audio_url": audio_url,
                "word": strong.get_text(strip=True) if strong else None,
                "ipa": ipas[0],
                "ipas": ipas,
                "gloss": emphasis.get_text(" ", strip=True) if emphasis else None,
                "caption": caption_text,
            }
        )
    return records
def main():
    """Scrape the page at ``URL``, print a preview, and save JSON and CSV files.

    Prints the number of entries found and the first three entries as JSON,
    then writes all entries to ``north_sami_pronunciation_audio.json`` and
    ``north_sami_pronunciation_audio.csv`` in the current working directory.
    """
    json_path = "north_sami_pronunciation_audio.json"
    csv_path = "north_sami_pronunciation_audio.csv"
    columns = ["audio_url", "word", "ipa", "ipas", "gloss", "caption"]

    records = parse_audio_entries(fetch_soup(URL), URL)

    print(f"Found {len(records)} audio entries")
    preview = records[:3]
    print(json.dumps(preview, ensure_ascii=False, indent=2))

    # JSON keeps the full structure, including the list of all IPA matches.
    with open(json_path, "w", encoding="utf-8") as json_file:
        json.dump(records, json_file, ensure_ascii=False, indent=2)

    # A CSV cell holds a single string, so the IPA list is joined with " | ".
    with open(csv_path, "w", encoding="utf-8", newline="") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=columns)
        writer.writeheader()
        for record in records:
            flat_ipas = " | ".join(record["ipas"] or [])
            writer.writerow({**record, "ipas": flat_ipas})
# Run the scraper only when this file is executed as a script, so that
# importing the module does not trigger a network request or write files.
if __name__ == "__main__":
    main()