"""Scrape audio URLs, word text, and IPA from the North Sami pronunciation guide."""

import re
import csv
import json
from html import unescape
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Page that main() downloads; also used as the base for resolving audio URLs.
URL = "https://oahpamuinna.wordpress.com/2021/12/28/guide-to-north-sami-pronunciation/"

# Matches one slash-delimited span and captures the slash-free text between
# the two slashes.
IPA_RE = re.compile(r"/([^/]+?)/")  # capture IPA inside /.../

def fetch_soup(url: str) -> BeautifulSoup:
    """Download *url* and return it parsed with the built-in ``html.parser``.

    Args:
        url: Address of the page to fetch.

    Returns:
        The parsed document.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: For connection problems or when the
            30-second timeout expires.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; bs4-scraper/1.0; +https://example.com/bot)"
    }
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()

    # When the Content-Type header names no charset, Requests falls back to
    # ISO-8859-1 for text responses, which would garble IPA and Sami
    # characters. In that case pass the raw bytes so BeautifulSoup can detect
    # the encoding from the document itself (e.g. a <meta charset> tag).
    content_type = response.headers.get("Content-Type", "")
    if "charset" in content_type.lower():
        markup = response.text
    else:
        markup = response.content
    return BeautifulSoup(markup, "html.parser")

def parse_audio_entries(soup: BeautifulSoup, base_url: str):
    """Collect one record per audio figure whose caption contains IPA.

    Each audio example is expected to look like::

        <figure class="wp-block-audio">
          <audio controls src="..."></audio>
          <figcaption> ... <strong>WORD</strong> /IPA/ <em>gloss</em> ... </figcaption>
        </figure>

    Figures without a usable ``src`` on their ``<audio>`` element, and figures
    whose caption holds no ``/.../`` span, are skipped.

    Args:
        soup: Parsed page to search.
        base_url: URL used to resolve relative audio ``src`` values.

    Returns:
        A list of dicts with the keys ``audio_url``, ``word`` (text of the
        first ``<strong>`` in the caption, or None), ``ipa`` (first IPA
        span), ``ipas`` (all IPA spans), ``gloss`` (text of the first
        ``<em>`` in the caption, or None) and ``caption`` (full caption text).
    """
    entries = []

    for fig in soup.select("figure.wp-block-audio"):
        audio = fig.find("audio")
        if audio is None:
            continue

        # Clean the attribute *before* testing it: a whitespace-only src would
        # otherwise pass the check and urljoin() would resolve it to the page
        # URL itself instead of an audio file.
        src = unescape(audio.get("src") or "").strip()
        if not src:
            continue

        cap = fig.find("figcaption")
        if cap is None:
            # No caption means no IPA to extract, so there is nothing to record.
            continue

        caption_text = cap.get_text(" ", strip=True)

        # IPA(s): every /.../ span in the caption, blanks dropped.
        ipas = [span.strip() for span in IPA_RE.findall(caption_text) if span.strip()]
        if not ipas:
            continue

        # Word: usually the first <strong> in the caption (often inside an <a>).
        strong = cap.find("strong")
        # Optional English gloss (often in <em>...</em>).
        em = cap.find("em")

        entries.append(
            {
                "audio_url": urljoin(base_url, src),
                "word": strong.get_text(strip=True) if strong is not None else None,
                "ipa": ipas[0],
                "ipas": ipas,
                "gloss": em.get_text(" ", strip=True) if em is not None else None,
                "caption": caption_text,
            }
        )

    return entries

def main():
    """Scrape the guide page, print a short preview, and save JSON and CSV files.

    Prints the number of entries found and the first three entries, then
    writes every entry to ``north_sami_pronunciation_audio.json`` and
    ``north_sami_pronunciation_audio.csv`` in the current directory.
    """
    entries = parse_audio_entries(fetch_soup(URL), URL)

    print(f"Found {len(entries)} audio entries")
    preview = entries[:3]
    print(json.dumps(preview, ensure_ascii=False, indent=2))

    # JSON output keeps "ipas" as a real list.
    with open("north_sami_pronunciation_audio.json", "w", encoding="utf-8") as json_file:
        json.dump(entries, json_file, ensure_ascii=False, indent=2)

    # CSV output has no list type, so "ipas" is joined into one " | "-separated cell.
    columns = ["audio_url", "word", "ipa", "ipas", "gloss", "caption"]
    with open("north_sami_pronunciation_audio.csv", "w", encoding="utf-8", newline="") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=columns)
        writer.writeheader()
        for entry in entries:
            flat = {**entry, "ipas": " | ".join(entry["ipas"] or [])}
            writer.writerow(flat)

# Run the scraper only when executed as a script, so importing this module
# (e.g. to reuse parse_audio_entries) does not trigger network and file I/O.
if __name__ == "__main__":
    main()