Scraper for ICASSP from IEEE
Works from a pre-extracted page, since the issue list seems to be fetched separately rather than served in the initial HTML.
from bs4 import BeautifulSoup
The temporary file below was extracted from the IEEE Xplore page.
with open("/tmp/icassp") as inf:
    html = inf.read()

soup = BeautifulSoup(html, 'html.parser')
top = soup.find("div", {"class": "issue-list-container"})
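A quick sanity check that the container was actually found, and how many issue entries it holds (same attribute selector as the loop below):

assert top is not None, "issue-list-container not found; the saved page may be incomplete"
print(len(top.find_all("li", {"_ngcontent-ftl-c305": ""})), "issue entries")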
import re
data = []
for li in top.find_all("li", {"_ngcontent-ftl-c305": ""}):
    raw = li.text.strip()
    a_tag = li.find("a")
    href = a_tag.attrs["href"]
    title = a_tag.text.strip()
    # The proceedings titles change format over the decades, so try a few
    # patterns in turn to pull out the year.
    m1 = re.search(r"^ICASSP (20\d\d).*", raw)
    m2 = re.search(r".*((?:20|19)\d\d) (?:IEEE )?International.*", raw)
    m3 = re.search(r".*ICASSP '(\d\d).*", raw)
    if m1:
        year = m1.group(1)
    elif m2:
        year = m2.group(1)
    elif m3:
        year = m3.group(1)
        # Why yes, in this, the Year of Our Lord, 2022,
        # writing this scraper has meant having to do
        # Y2K compensation
        if year.startswith("0"):
            year = "20" + year
        else:
            year = "19" + year
    elif raw == "International Conference on Acoustics, Speech, and Signal Processing":
        year = "1990"
    elif raw == "International Conference on Acoustics, Speech, and Signal Processing,":
        year = "1989"
    elif raw.startswith("ICASSP-88.,"):
        year = "1988"
    loc = ""
    loc_tag = li.find("span", {"_ngcontent-ftl-c305": ""})
    if loc_tag:
        loc = loc_tag.text.strip().replace("Location: ", "")
    data.append([href, title, year, loc])
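For reference, the three regexes are meant to cover title shapes roughly like the following (illustrative strings, not copied verbatim from the page):

re.search(r"^ICASSP (20\d\d).*", "ICASSP 2022 - ...").group(1)                                      # '2022'
re.search(r".*((?:20|19)\d\d) (?:IEEE )?International.*", "2016 IEEE International ...").group(1)   # '2016'
re.search(r".*ICASSP '(\d\d).*", "ICASSP '87. International ...").group(1)                          # '87', then Y2K-fixed to '1987'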
# Number the proceedings from oldest (ordinal 1) to newest, then restore
# the original order.
count = 1
data.reverse()
for item in data:
    item.append(str(count))
    count += 1
data.reverse()
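The same numbering can be done without reversing the list twice; an equivalent alternative (don't run both) using enumerate over a reversed view:

for i, item in enumerate(reversed(data), start=1):
    item.append(str(i))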
# Probe one of Xplore's REST metadata endpoints for a single publication id,
# to see whether the API route is viable.
template = "https://ieeexplore.ieee.org/rest/publication/home/metadata?pubid="
import requests
raw_json = requests.get(template + "9413349")
raw_json.text
Bah. OK, the API is a waste of time: the limit is too low, and registering for a key is a bit annoying for a one-time operation.
DOIs = """
10.1109/ICASSP43922.2022
10.1109/ICASSP39728.2021
DOI: 10.1109/ICASSP40776.2020
DOI: 10.1109/ICASSP35589.2019
DOI: 10.1109/ICASSP34228.2018
DOI: 10.1109/ICASSP31846.2017
DOI: 10.1109/ICASSP17257.2015
DOI: 10.1109/ICASSP18874.2014
DOI: 10.1109/ICASSP16080.2013
DOI: 10.1109/ICASSP15465.2012
DOI: 10.1109/ICASSP15948.2011
DOI: 10.1109/ICASSP15600.2010
DOI: 10.1109/ICASSP13629.2009
DOI: 10.1109/ICASSP12235.2008
DOI: 10.1109/ICASSP10710.2007
DOI: 10.1109/ICASSP10488.2006
DOI: 10.1109/ICASSP8829.2005
DOI: 10.1109/ICASSP.2004
DOI: 10.1109/ICASSP.2003
DOI: 10.1109/ICASSP.2002
DOI: 10.1109/ICASSP.2001
DOI: 10.1109/ICASSP.2000
DOI: 10.1109/ICASSP.1999
DOI: 10.1109/ICASSP.1998
DOI: 10.1109/ICASSP.1997
DOI: 10.1109/ICASSP.1996
DOI: 10.1109/ICASSP.1995
DOI: 10.1109/ICASSP.1994
DOI: 10.1109/ICASSP.1993
DOI: 10.1109/ICASSP.1992
DOI: 10.1109/ICASSP.1991
DOI: 10.1109/ICASSP.1990
DOI: 10.1109/ICASSP.1989
DOI: 10.1109/ICASSP.1988
DOI: 10.1109/ICASSP.1987
DOI: 10.1109/ICASSP.1986
DOI: 10.1109/ICASSP.1985
DOI: 10.1109/ICASSP.1984
DOI: 10.1109/ICASSP.1983
DOI: 10.1109/ICASSP.1982
DOI: 10.1109/ICASSP.1981
DOI: 10.1109/ICASSP.1980
DOI: 10.1109/ICASSP.1979
DOI: 10.1109/ICASSP.1978
DOI: 10.1109/ICASSP.1977
DOI: 10.1109/ICASSP.1976
"""
doi_dict = {}
for line in DOIs.replace("DOI: ", "").split("\n"):
    if line == "":
        continue
    # The year is the last dot-separated component of each DOI.
    parts = line.split(".")
    doi_dict[parts[-1]] = line
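A quick check that the mapping came out as expected; the values are taken straight from the DOI list above (note 2016 has no entry there, which the table loop below has to special-case):

assert doi_dict["2022"] == "10.1109/ICASSP43922.2022"
assert doi_dict["1976"] == "10.1109/ICASSP.1976"
assert "2016" not in doi_dict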
output = []
output.append("| Year | Ordinal | DOI | Issue | Location |")
output.append("|------|---------|-----|-------|----------|")
for item in data:
    # item is [href, title, year, loc, ordinal]
    if item[2] == "2016":
        # 2016 is missing from the DOI list above.
        doi = ""
    else:
        doi = doi_dict[item[2]]
    output.append(f"| {item[2]} | {item[4]} | {doi} | [{item[1]}](https://ieeexplore.ieee.org/{item[0]}) | {item[3]} |")
from IPython.display import display, Markdown
display(Markdown("\n".join(output)))
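If the table should also end up on disk rather than only rendered in the notebook, a minimal sketch (the filename is just a placeholder):

with open("icassp_proceedings.md", "w") as outf:
    outf.write("\n".join(output) + "\n")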