%pip install requests beautifulsoup4 genanki
Requirement already satisfied: requests in /usr/local/lib/python3.12/dist-packages (2.32.4)
Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.12/dist-packages (4.13.5)
Collecting genanki
  Downloading genanki-0.13.1-py3-none-any.whl.metadata (7.5 kB)
Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests) (3.4.4)
Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.12/dist-packages (from requests) (3.11)
Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests) (2.5.0)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.12/dist-packages (from requests) (2026.1.4)
Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4) (2.8.3)
Requirement already satisfied: typing-extensions>=4.0.0 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4) (4.15.0)
Collecting cached-property (from genanki)
  Downloading cached_property-2.0.1-py3-none-any.whl.metadata (10 kB)
Requirement already satisfied: frozendict in /usr/local/lib/python3.12/dist-packages (from genanki) (2.4.7)
Collecting chevron (from genanki)
  Downloading chevron-0.14.0-py3-none-any.whl.metadata (4.9 kB)
Requirement already satisfied: pyyaml in /usr/local/lib/python3.12/dist-packages (from genanki) (6.0.3)
Downloading genanki-0.13.1-py3-none-any.whl (16 kB)
Downloading cached_property-2.0.1-py3-none-any.whl (7.4 kB)
Downloading chevron-0.14.0-py3-none-any.whl (11 kB)
Installing collected packages: chevron, cached-property, genanki
Successfully installed cached-property-2.0.1 chevron-0.14.0 genanki-0.13.1
import requests
from bs4 import BeautifulSoup

URL = "https://duome.eu/vocabulary/en/hu"
# Without a timeout, requests waits indefinitely on an unresponsive server.
resp = requests.get(URL, timeout=30)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")

# Entries are looked up as <li> elements inside <div id="words">.
words = soup.find("div", {"id": "words"})
if words is None:
    # Fail with an explicit message instead of an AttributeError on None.
    raise RuntimeError(
        f"No <div id='words'> found at {URL}; the page layout may have changed"
    )

# One dict per list item, holding whichever of "audio", "hu" and "en"
# could be extracted from it.
collected = []

for word in words.find_all("li"):
    current = {}
    audio = word.find("div", {"class": "playback voice speak xs"})
    # Only record an audio URL when the element really carries one.
    if audio and audio.get("data-src"):
        current["audio"] = audio["data-src"]
    hu = word.find("span", {"class": "_blue wA"})
    if hu:
        current["hu"] = hu.text.strip()
    en = word.find("span", {"class": "cCCC wT"})
    if en:
        current["en"] = en.text.strip()
    # Skip list items that yielded none of the three fields.
    if current:
        collected.append(current)
import os
import time
from urllib.parse import urlparse
from pathlib import Path

# Local folder that holds the downloaded audio clips; created on first run
# and left untouched when it already exists.
AUDIO_DIR = Path("duolingo_audio")
os.makedirs(AUDIO_DIR, exist_ok=True)

def download_audio(url, timeout=30):
    """Download an audio file into ``AUDIO_DIR`` and return its local filename.

    The filename is the last path component of *url*. A file that already
    exists in ``AUDIO_DIR`` is reused without issuing a new request.

    Args:
        url: Address of the audio file; a falsy value means "no audio".
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The filename (relative to ``AUDIO_DIR``), or ``None`` when *url* is
        falsy, has no usable final path component, or the download failed.
    """
    if not url:
        return None
    filename = os.path.basename(urlparse(url).path)
    if not filename:
        # A URL such as "https://host/" has no final path component; without
        # this guard the target path would be AUDIO_DIR itself, which exists,
        # and an empty filename would be reported as a successful download.
        return None
    local_path = AUDIO_DIR / filename
    if not local_path.exists():
        # Write to a sibling temp file and rename it into place, so that an
        # interrupted run never leaves a truncated file that later looks cached.
        partial_path = local_path.with_name(filename + ".part")
        try:
            r = requests.get(url, timeout=timeout)
            r.raise_for_status()
            partial_path.write_bytes(r.content)
            partial_path.replace(local_path)
        except (requests.RequestException, OSError) as e:
            print(f"Failed to download {url}: {e}")
            return None
        time.sleep(0.1)  # be polite
    return filename

# Give every entry that has an audio URL an "audio_file" key: the local
# filename, or None when the download failed.
for item in collected:
    if "audio" not in item:
        continue
    item["audio_file"] = download_audio(item["audio"])

with_audio = len([item for item in collected if item.get("audio_file")])
print(f"Downloaded audio for {with_audio} / {len(collected)} entries")
# Bare expression: shown as the cell result when run in a notebook.
collected[:5]
Downloaded audio for 2402 / 2402 entries
[{'audio': 'https://d1vq87e9lcf771.cloudfront.net/hanna/bb0be378469529cf17546c335b73302f',
  'hu': 'a',
  'en': '- the',
  'audio_file': 'bb0be378469529cf17546c335b73302f'},
 {'audio': 'https://d1vq87e9lcf771.cloudfront.net/hanna/db01f31aa8d3df1b7ad0f8315a292f8b',
  'hu': 'a barátnőmmel',
  'en': '- with my girlfriend',
  'audio_file': 'db01f31aa8d3df1b7ad0f8315a292f8b'},
 {'audio': 'https://d1vq87e9lcf771.cloudfront.net/hanna/8caa276f6a2733c0564e4a4cfb61bbc7',
  'hu': 'abba',
  'en': '- (into) that',
  'audio_file': '8caa276f6a2733c0564e4a4cfb61bbc7'},
 {'audio': 'https://d1vq87e9lcf771.cloudfront.net/hanna/c0d04a48542395b98022abaedf35851d',
  'hu': 'abban',
  'en': '- in that',
  'audio_file': 'c0d04a48542395b98022abaedf35851d'},
 {'audio': 'https://d1vq87e9lcf771.cloudfront.net/hanna/03e55b58f5db8d136be7bd2996f56d9b',
  'hu': 'abból',
  'en': '- from the one (that), out of that (one), from the one, out of, from that',
  'audio_file': '03e55b58f5db8d136be7bd2996f56d9b'}]
import genanki

# Fixed identifiers passed to genanki for the note model and the deck.
MODEL_ID = 1607392319  # arbitrary stable id
DECK_ID = 2059400110   # arbitrary stable id

# Card shown with the Hungarian text and audio first, English on the back.
hu_to_en_card = {
    "name": "HU → EN",
    "qfmt": "{{Hungarian}}<br>{{Audio}}",
    "afmt": '{{FrontSide}}<hr id="answer">{{English}}',
}

# Card shown with the English text first, Hungarian and audio on the back.
en_to_hu_card = {
    "name": "EN → HU",
    "qfmt": "{{English}}",
    "afmt": '{{FrontSide}}<hr id="answer">{{Hungarian}}<br>{{Audio}}',
}

# Note model with three fields and the two card templates above.
model = genanki.Model(
    model_id=MODEL_ID,
    name="Duolingo HU-EN",
    fields=[{"name": field_name} for field_name in ("Hungarian", "English", "Audio")],
    templates=[hu_to_en_card, en_to_hu_card],
)

import html

deck = genanki.Deck(DECK_ID, "Duolingo Hungarian")
media_files = []
# Filenames already queued, so shared audio is packaged only once.
seen_media = set()

for entry in collected:
    hu = entry.get("hu", "")
    en = entry.get("en", "")
    if not (hu and en):
        # No note is created for incomplete entries, so their audio must not
        # be queued for packaging either.
        continue

    audio_file = entry.get("audio_file")
    if audio_file:
        audio_field = f"[sound:{audio_file}]"
        if audio_file not in seen_media:
            seen_media.add(audio_file)
            media_files.append(str(AUDIO_DIR / audio_file))
    else:
        audio_field = ""

    # The text fields come from scraped plain text but are inserted into the
    # card templates as HTML, so escape &, < and > first. The audio field is
    # the [sound:...] reference built above and is passed through unchanged.
    note = genanki.Note(
        model=model,
        fields=[
            html.escape(hu, quote=False),
            html.escape(en, quote=False),
            audio_field,
        ],
    )
    deck.add_note(note)

OUTPUT = "duolingo_hungarian.apkg"
package = genanki.Package(deck)
package.media_files = media_files
package.write_to_file(OUTPUT)

print(f"Wrote {len(deck.notes)} notes with {len(media_files)} audio files to {OUTPUT}")
Wrote 2402 notes with 2402 audio files to duolingo_hungarian.apkg