Scrape SweDia
For a student project
import requests
from bs4 import BeautifulSoup
# IPython shell escape (not plain Python): install wget, which the download
# loop at the end of this file invokes for every collected media URL.
!apt install wget
def get_dialect_links(LANDING = "https://swedia.ling.gu.se/snabbmeny.html"):
    """Return absolute URLs of the dialect pages linked from the landing page.

    The landing page is fetched and every anchor inside ``<div id="Layer1">``
    is inspected. An href is kept only when, after dropping a trailing
    ``/index.html``, it consists of exactly three ``/``-separated parts
    (presumably region/province/locality, e.g. ``Gotaland/Bohuslan/Karna``).

    Args:
        LANDING: URL of the page that lists the dialects.

    Returns:
        A list of ``https://swedia.ling.gu.se/<href>`` strings. The list is
        empty when the page does not answer with HTTP 200 or does not contain
        the expected ``Layer1`` div.

    Raises:
        requests.RequestException: if the request itself fails or times out.
    """
    # Without a timeout a stalled connection would block the crawl forever.
    req = requests.get(LANDING, timeout=30)
    if req.status_code != 200:
        return []

    soup = BeautifulSoup(req.text, "html.parser")
    layer = soup.find("div", {"id": "Layer1"})
    if layer is None:
        # Page layout differs from what we expect: nothing to collect.
        return []

    links = []
    # href=True skips anchors that carry no href attribute at all.
    for anchor in layer.find_all("a", href=True):
        link = anchor["href"]
        parts = link.replace("/index.html", "").split("/")
        if len(parts) == 3:
            links.append(f"https://swedia.ling.gu.se/{link}")
    return links
# Fetch the dialect index once; the global `links` feeds the crawl loop at the
# end of the file.
links = get_dialect_links()
def get_dialect_speaker_links(dialect_link):
    """Return the URLs of the pages linked from a dialect page's ``Lyssna`` div.

    Args:
        dialect_link: URL of a dialect page, normally ending in
            ``index.html``.

    Returns:
        One URL per anchor found inside ``<div id="Lyssna">``, built by
        appending the anchor's href to ``dialect_link`` with ``index.html``
        removed. The list is empty when the page does not answer with
        HTTP 200 or has no ``Lyssna`` div.

    Raises:
        requests.RequestException: if the request itself fails or times out.
    """
    # Without a timeout a stalled connection would block the crawl forever.
    req = requests.get(dialect_link, timeout=30)
    if req.status_code != 200:
        return []

    soup = BeautifulSoup(req.text, "html.parser")
    audio = soup.find("div", {"id": "Lyssna"})
    if audio is None:
        # Some pages may lack the listening section; treat that as "no links".
        return []

    # hrefs are appended to the page's directory, i.e. the URL minus index.html.
    base = dialect_link.replace("index.html", "")
    # href=True skips anchors that carry no href attribute at all.
    return [f"{base}{anchor['href']}" for anchor in audio.find_all("a", href=True)]
# Ad-hoc check of the speaker-link scraper against a single dialect page; the
# return value is not stored (in a notebook it is shown as the cell output).
get_dialect_speaker_links("https://swedia.ling.gu.se/Gotaland/Bohuslan/Karna/index.html")
def _swedia_absolute(link, collapse_dot=False):
    """Rewrite a ``../../../``-relative link so it points at the site root."""
    if link.startswith("../../../"):
        link = link.replace("../../../", "https://swedia.ling.gu.se/")
        if collapse_dot:
            link = link.replace("/./", "/")
    return link


def _swedia_div_links(soup, div_id, tag, attr, suffixes, collapse_dot=False):
    """Collect ``attr`` values of ``tag`` elements in ``div#div_id`` that end in ``suffixes``."""
    container = soup.find("div", {"id": div_id})
    if container is None:
        # A page without this section simply contributes nothing.
        return []
    found = []
    for node in container.find_all(tag):
        target = node.get(attr)  # None when the attribute is missing
        if target and target.endswith(suffixes):
            found.append(_swedia_absolute(target, collapse_dot))
    return found


def get_dialect_media(link):
    """Return the media URLs referenced by one speaker page.

    Three sections of the page are scanned, in this order:

    * ``<div id="Oversattning">``: ``img`` sources ending in ``.gif``
    * ``<div id="Transkription">``: ``img`` sources ending in ``.gif``
    * ``<div id="Speaker">``: anchor hrefs ending in ``.mp3`` or ``.wav``

    Links that start with ``../../../`` are rewritten to start with
    ``https://swedia.ling.gu.se/``; for the audio links a ``/./`` segment is
    additionally collapsed to ``/``. Any other link is returned unchanged.

    Args:
        link: URL of the speaker page to scrape.

    Returns:
        A list of media URLs. The list is empty when the page does not answer
        with HTTP 200; a missing section is skipped rather than raising.

    Raises:
        requests.RequestException: if the request itself fails or times out.
    """
    # Without a timeout a stalled connection would block the crawl forever.
    req = requests.get(link, timeout=30)
    if req.status_code != 200:
        return []

    soup = BeautifulSoup(req.text, "html.parser")
    links = []
    links += _swedia_div_links(soup, "Oversattning", "img", "src", ".gif")
    links += _swedia_div_links(soup, "Transkription", "img", "src", ".gif")
    links += _swedia_div_links(
        soup, "Speaker", "a", "href", (".mp3", ".wav"), collapse_dot=True
    )
    return links
media = []
for link in links:
for dialect in get_dialect_speaker_links(link):
media += get_dialect_media(dialect)
for file in media:
!wget {file}