Scraper for the ISCA speech archive (Interspeech, ICSLP, etc.).
This version actually fetches and scrapes the live archive index page.
from bs4 import BeautifulSoup
import requests
# Download the ISCA archive index page and parse it into a BeautifulSoup tree.
# The timeout keeps the script from hanging forever on a stalled connection.
req = requests.get("https://www.isca-speech.org/archive/", timeout=30)
# Check the status explicitly instead of using `assert`: assertions are
# stripped under `python -O`, which would let an error page be parsed as if
# it were the real index.
if req.status_code != 200:
    raise RuntimeError(
        f"Unexpected HTTP status {req.status_code} while fetching {req.url}"
    )
soup = BeautifulSoup(req.text, 'html.parser')
# Find the container that holds the archive listing: a "w3-container" div
# whose stripped text ends with "1987".
# NOTE(review): presumably 1987 is the last (oldest) year shown in the
# listing -- confirm against the live page.
top = None  # reset so a stale value from an earlier run is never reused
for div in soup.find_all("div", {"class": "w3-container"}):
    if div.text.strip().endswith("1987"):
        # No break: when several divs match, the last one wins.
        top = div
if top is None:
    # Fail here with a clear message instead of a NameError further down.
    raise LookupError(
        'No <div class="w3-container"> ending in "1987" was found; '
        "the archive page layout may have changed."
    )
# Collect one [href, conf, year, doi] row per link inside the listing.
raw_data = []
# href=True skips anchors that have no href attribute, which would otherwise
# raise KeyError on the lookup below.
for a_tag in top.find_all("a", href=True):
    href = a_tag["href"]
    # Conference name: everything before the first underscore in the href.
    conf = href.split("_")[0]
    # Year: the last underscore-separated piece of the first path segment.
    # NOTE(review): assumes hrefs look like "<conf>_<year>/..." -- confirm.
    year = href.split("/")[0].split("_")[-1]
    if conf.endswith("speech"):
        # Names ending in "speech" get only their first letter capitalised.
        doi = f"10.21437/{conf[0].upper()}{conf[1:]}.{year}"
    elif conf == "icslp":
        # ICSLP is fully upper-cased in its DOI.
        doi = f"10.21437/{conf.upper()}.{year}"
    else:
        # No DOI pattern is defined here for any other conference.
        doi = ""
    raw_data.append([href, conf, year, doi])
# Build a Markdown table: header row, separator row, then one row per entry
# in raw_data (each entry is [href, conf, year, doi]).
output = [
    "| Year | Conference | Title | DOI |",
    "|------|------------|-------|-----|",
]
output.extend(
    f"| {year} | {conf} | [{conf.upper()} {year}](https://www.isca-speech.org/archive/{href}) | {doi} |"
    for href, conf, year, doi in raw_data
)

from IPython.display import display, Markdown

# Join the rows with newlines and render the table in the notebook output.
display(Markdown("\n".join(output)))