Duolingo Hungarian Vocabulary → Anki Deck
Scrapes vocabulary from duome.eu and writes an Anki `.apkg` file using genanki.
%pip install requests beautifulsoup4 genanki
import requests
from bs4 import BeautifulSoup
URL = "https://duome.eu/vocabulary/en/hu"

# Fetch the vocabulary page. The timeout keeps a stalled connection from
# hanging the run indefinitely.
resp = requests.get(URL, timeout=30)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")

# The entries are read from the <li> items inside <div id="words">.
words = soup.find("div", {"id": "words"})
if words is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise RuntimeError(
        f'No <div id="words"> found at {URL}; the page layout may have changed'
    )

# Each collected entry is a dict holding any of the keys "audio", "hu", "en".
collected = []
for word in words.find_all("li"):
    current = {}

    audio = word.find("div", {"class": "playback voice speak xs"})
    # .get() tolerates a playback element that lacks a data-src attribute.
    audio_url = audio.get("data-src") if audio else None
    if audio_url:
        current["audio"] = audio_url

    hu = word.find("span", {"class": "_blue wA"})
    if hu:
        current["hu"] = hu.text.strip()

    en = word.find("span", {"class": "cCCC wT"})
    if en:
        current["en"] = en.text.strip()

    # Skip list items that matched none of the three selectors.
    if current:
        collected.append(current)
import os
import time
from urllib.parse import urlparse
from pathlib import Path
# Local folder for the downloaded audio files; created on first run
AUDIO_DIR = Path("duolingo_audio")
os.makedirs(AUDIO_DIR, exist_ok=True)
def download_audio(url):
    """Download one audio file into AUDIO_DIR and return its local filename.

    The filename is the last path component of *url*. A file that already
    exists in AUDIO_DIR is not downloaded again.

    Args:
        url: Address of the audio file; may be empty or None.

    Returns:
        The bare filename (without the directory part), or None when *url*
        is empty, has no usable filename, or the download or write fails.
    """
    if not url:
        return None

    filename = os.path.basename(urlparse(url).path)
    if not filename:
        # A URL whose path is empty or ends in "/" has no basename;
        # AUDIO_DIR / "" would be the directory itself, which always exists.
        return None

    local_path = AUDIO_DIR / filename
    if local_path.exists():
        return filename

    try:
        # The timeout keeps one stalled request from blocking the whole batch.
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        local_path.write_bytes(r.content)
    except (requests.RequestException, OSError) as e:
        print(f"Failed to download {url}: {e}")
        return None

    time.sleep(0.1)  # be polite
    return filename
# Fetch the audio for every entry that carries an audio URL
for item in collected:
    if "audio" not in item:
        continue
    item["audio_file"] = download_audio(item["audio"])

# Entries whose download failed hold None and are not counted.
with_audio = [e for e in collected if e.get("audio_file")]
print(f"Downloaded audio for {len(with_audio)} / {len(collected)} entries")

# Notebook preview of the first few entries
collected[:5]
import genanki
MODEL_ID = 1607392319  # arbitrary stable id
DECK_ID = 2059400110  # arbitrary stable id

# Three fields per note, in the order the notes supply them.
note_fields = [{"name": label} for label in ("Hungarian", "English", "Audio")]

# Card 1: Hungarian (with audio) on the front, English on the back.
hu_to_en = {
    "name": "HU → EN",
    "qfmt": "{{Hungarian}}<br>{{Audio}}",
    "afmt": '{{FrontSide}}<hr id="answer">{{English}}',
}

# Card 2: English on the front, Hungarian (with audio) on the back.
en_to_hu = {
    "name": "EN → HU",
    "qfmt": "{{English}}",
    "afmt": '{{FrontSide}}<hr id="answer">{{Hungarian}}<br>{{Audio}}',
}

model = genanki.Model(
    MODEL_ID,
    "Duolingo HU-EN",
    fields=note_fields,
    templates=[hu_to_en, en_to_hu],
)

deck = genanki.Deck(DECK_ID, "Duolingo Hungarian")
# Build one note per complete entry and collect the audio files to package.
media_files = []
seen_media = set()  # filenames already queued, so each file is packaged once
for entry in collected:
    hu = entry.get("hu", "")
    en = entry.get("en", "")
    if not (hu and en):
        # Incomplete entries produce no note, so their audio is not
        # packaged either.
        continue

    audio_file = entry.get("audio_file")
    if audio_file:
        audio_field = f"[sound:{audio_file}]"
        if audio_file not in seen_media:
            seen_media.add(audio_file)
            media_files.append(str(AUDIO_DIR / audio_file))
    else:
        audio_field = ""

    # NOTE(review): hu/en are passed through as scraped; confirm whether they
    # should be HTML-escaped before being used as note fields.
    note = genanki.Note(model=model, fields=[hu, en, audio_field])
    deck.add_note(note)
# Bundle the deck and its audio into a single importable .apkg file
OUTPUT = "duolingo_hungarian.apkg"
package = genanki.Package(deck, media_files=media_files)
package.write_to_file(OUTPUT)

note_count = len(deck.notes)
audio_count = len(media_files)
print(f"Wrote {note_count} notes with {audio_count} audio files to {OUTPUT}")