QPSR scraper
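Scrape the TMH quarterly reports (QPSR) index at https://www.speech.kth.se/qpsr/ and write the collected publication metadata to qpsr.json.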
import requests
from bs4 import BeautifulSoup
def get_years():
    """Return the year labels offered by the QPSR index page.

    Downloads the QPSR landing page and collects the text of every
    ``<option>`` nested in a ``<select name="year">`` element, in document
    order.

    Returns:
        A list of strings, one per option, exactly as they appear in the
        page markup.

    Raises:
        RuntimeError: If the server answers with anything other than
            HTTP 200.
        requests.RequestException: If the request itself fails or times out.
    """
    top_url = "https://www.speech.kth.se/qpsr/"
    # Without a timeout a stalled server would block the scrape forever.
    response = requests.get(top_url, timeout=30)
    # Explicit check rather than ``assert``: asserts are removed by ``python -O``.
    if response.status_code != 200:
        raise RuntimeError(f"GET {top_url} returned HTTP {response.status_code}")
    soup = BeautifulSoup(response.text, 'html.parser')
    year_selects = soup.find_all("select", {"name": "year"})
    return [
        option.text
        for year_select in year_selects
        for option in year_select.find_all("option")
    ]
# Hand-maintained metadata for entries that read_page() looks up by PDF URL.
# One record per line: "<pdf url>\t<title>\t<authors>".
_TITLES = """
http://www.speech.kth.se/prod/publications/files/3605.pdf\tProductive Vocabulary Size Development in Children Aged 18-24 Months – Gender Differences\tIda Andersson, Jenny Gauding, Anna Graca, Katarina Holm, Linda Öhlin, Ulrika Marklund, Anna Ericsson
http://www.speech.kth.se/prod/publications/files/3607.pdf\tChildren’s perception of their modified speech – preliminary findings\tSofia Strömbergsson
http://www.speech.kth.se/prod/publications/files/3579.pdf\tImitation of bird song in folklore – onomatopoeia or not?\tÅsa Abelin
http://www.speech.kth.se/prod/publications/files/3586.pdf\tAnticipatory lip rounding– a pilot study using The Wave Speech Research System\tGabrielsson, D., Kirchner, S., Nilsson, K., Norberg, A., Widlund, C.
http://www.speech.kth.se/prod/publications/files/qpsr/2007/2007_50_1_121-124.pdf\tSIMULEKT – modelling Swedish regional intonation\tGösta Bruce, Susanne Schötz, Björn Granström
http://www.speech.kth.se/prod/publications/files/qpsr/2007/2007_50_1_033-036.pdf\tFilibuster – a new Swedish text-to-speech system\tChristina Ericsson, Jesper Klein, Kåre Sjölander, Lars Sönnebo
http://www.speech.kth.se/prod/publications/files/qpsr/2007/2007_50_1_049-052.pdf\tSwedish word accents in a ‘confirmation’ context\tGilbert Ambrazaitis
http://www.speech.kth.se/prod/publications/files/qpsr/2007/2007_50_1_053-056.pdf\tEstimates of Infants’ Vocabulary Composition and the Role of Adult-instructions for Early Word-learning\tKlintfors E., Lacerda F., Sundberg U.
http://www.speech.kth.se/prod/publications/files/qpsr/2007/2007_50_1_061-064.pdf\tMushyPeek – an experiment framework for controlled investigation of human-human interaction control behaviour\tJens Edlund, Jonas Beskow, Mattias Heldner
http://www.speech.kth.se/prod/publications/files/qpsr/2007/2007_50_1_077-080.pdf\tWhat you Hear is what you See – a study of visual vs. auditive noise\tAnna Berg, Annelie Brandt
http://www.speech.kth.se/prod/publications/files/qpsr/2007/2007_50_1_093-096.pdf\tLinguistic challenges for bilingual schoolchildren in Rosengård\tPetra Bodén, Gudrun Svensson
http://www.speech.kth.se/prod/publications/files/qpsr/2007/2007_50_1_097-100.pdf\tVoxalys – a Pedagogical Praat Plugin for Voice Analysis\tJonas Lindh
http://www.speech.kth.se/prod/publications/files/qpsr/2006/2006_48_1_035-043.pdf\tMusical structure: A translation of István Ipolyi: Innføring i Musikkspråkets Opprinnelse og Struktur\tFalkenberg Hansen, K.
http://www.speech.kth.se/prod/publications/files/qpsr/2002/2002_44_1_085-088.pdf\tStød and Vowel Length: Acoustic and Cognitive Reality?\tGrønnum, N.
http://www.speech.kth.se/prod/publications/files/qpsr/2002/2002_44_1_145-148.pdf\tStress judgements by naïve listeners\tMolin, J.
http://www.speech.kth.se/prod/publications/files/qpsr/1995/1995_36_2-3_063-070.pdf\tMatching the rule parameters of PHRASE ARCH to performances of ”Träumerei”: a preliminary study\tFriberg, A.
http://www.speech.kth.se/prod/publications/files/qpsr/1975/1975_16_4_027-035.pdf\tLippenablesehilfe für Gehörlose: Visuelle oder taktile Darbietung von Ergänzungsinformation?\tTraunmuller, H.
"""

# Maps each PDF URL from _TITLES to the list of its remaining tab-separated
# fields, i.e. [title, authors].
MISSING = {}
for missed in _TITLES.split("\n"):
    # The literal starts and ends with a newline, so blank lines show up here;
    # anything without a tab carries no record.
    if "\t" not in missed:
        continue
    parts = missed.split("\t")
    MISSING[parts[0]] = parts[1:]
def read_page(page):
    """Scrape one QPSR listing page into a list of publication dicts.

    Args:
        page: Either a full URL starting with ``http`` (its last four
            characters are used as the year) or a year string, which is
            inserted into the ``show_by_year.php`` URL.

    Returns:
        A list with one dict per ``<p class="publications_apa_entry">``
        element. Every dict has ``year``, ``author``, ``publication_full``
        and ``title``; ``volume``, ``publication``, ``edition``, ``pages``,
        ``pdf``, ``abstract`` and ``author_full`` are present only when they
        could be extracted or are hard-coded for that entry.

    Raises:
        RuntimeError: If the server answers with anything other than
            HTTP 200.
        requests.RequestException: If the request itself fails or times out.
        Exception: If an entry's text, once the author is removed, does not
            start with ``(<year>).``.
    """
    if page.startswith("http"):
        url = page
        year = page[-4:]
    else:
        url = f"https://www.speech.kth.se/qpsr/show_by_year.php?year={page}"
        year = page

    # Without a timeout a stalled server would block the scrape forever.
    req = requests.get(url, timeout=30)
    # Explicit check rather than ``assert``: asserts are removed by ``python -O``.
    if req.status_code != 200:
        raise RuntimeError(f"GET {url} returned HTTP {req.status_code}")
    soup = BeautifulSoup(req.text, 'html.parser')

    pubs = []
    for pub in soup.find_all("p", class_="publications_apa_entry"):
        data = {"year": year}
        raw_text = pub.text

        # --- author -------------------------------------------------------
        author = pub.find("span", class_="publications_apa_author")
        # ", . (Ed.)." is treated as "no author" (presumably the placeholder
        # the site renders for an empty author field).
        if author.text == ", . (Ed.).":
            data["author"] = ""
        else:
            data["author"] = author.text
        raw_text = raw_text.replace(author.text, "").lstrip()
        if not raw_text.startswith(f"({year})."):
            raise Exception(f"Expected year {year}, but got {raw_text[1:5]} - " + pub.text)
        # Drop the "(<year>)." marker and the character after it
        # (8 characters for a four-character year).
        raw_text = raw_text[8:]

        # --- publication name / volume and article title ------------------
        pub_title = pub.find("span", class_="publications_apa_title")
        data["publication_full"] = pub_title.text
        pub_pieces = pub_title.text.split(", ")
        if pub_pieces[-1].isdigit():
            # A trailing all-digit piece is taken as the volume number.
            data["volume"] = pub_pieces[-1]
            data["publication"] = ", ".join(pub_pieces[:-1])
        pub_title_start = raw_text.find(pub_title.text)
        pub_title_end = pub_title_start + len(pub_title.text)
        # Whatever precedes the publication name is the article title.
        data["title"] = raw_text[:pub_title_start].strip()
        if data["title"].endswith(". In"):
            # Strip the trailing " In" but keep the full stop.
            data["title"] = data["title"][:-3]

        # --- links: PDF and abstract --------------------------------------
        for pdf_link in pub.find_all("a"):
            if not pdf_link.has_attr("href"):
                print("Missing link: " + pub.text)
                continue
            if pdf_link["href"].endswith("pdf"):
                data["pdf"] = pdf_link["href"]
            elif pdf_link.has_attr("onclick"):
                # The onclick handler names the element holding the abstract
                # as 'abstract_...'; take the text up to the closing quote.
                onclick = pdf_link["onclick"]
                abs_start = onclick.find("abstract_")
                abs_end = onclick[abs_start:].find("'")
                abs_id = onclick[abs_start:abs_start + abs_end]
                abs_soup = soup.find("p", {"id": abs_id})
                if abs_soup is None:
                    # Previously an AttributeError; skip the abstract instead
                    # of aborting the whole page.
                    print("Missing abstract: " + abs_id)
                    continue
                abs_text = abs_soup.text.strip()
                if abs_text.startswith("Abstract:"):
                    abs_text = abs_text[9:].strip()
                abs_text = abs_text.replace("\r\n", " ").replace("\r", " ").replace("\n", " ")
                data["abstract"] = abs_text

        # Entries without a PDF link use "" so the suffix checks below are
        # simply false instead of raising KeyError.
        pdf = data.get("pdf", "")

        # This entry is replaced wholesale with hand-entered metadata and
        # skips the remaining parsing.
        if pdf.endswith("/1937.pdf"):
            data["pages"] = "1-6"
            data["volume"] = "49"
            data["edition"] = "1"
            data["title"] = "Sopranos with a singer’s formant? Historical, Physiological, and Acoustical Aspects of Castrato Singing"
            data["author_full"] = "Johan Sundberg, Marianne Trovén, Bernhard Richter"
            data["author"] = "Sundberg, J., Trovén, M., Richter, B."
            pubs.append(data)
            continue

        # --- edition / pages from the text after the publication name -----
        raw_text = raw_text[pub_title_end:].strip()
        if raw_text.endswith(" [pdf]"):
            raw_text = raw_text[:-6]
        if raw_text[-1:] == ".":
            raw_text = raw_text[:-1]
        if ", " in raw_text:
            if raw_text.startswith("(pp."):
                # "(pp. <pages>). ..." form: pages follow the "(pp. " prefix.
                parts = raw_text.split("). ")
                data["pages"] = parts[0][5:]
                # manual fix
                if pdf.endswith("/3597.pdf"):
                    data["volume"] = "51"
                    data["edition"] = "1"
                elif pdf.endswith("/2002_44_1_153-156.pdf"):
                    data["volume"] = "44"
                    data["edition"] = "1"
            else:
                # "(<edition>), <pages>" form.
                parts = raw_text.split(", ")
                if parts[0].startswith("("):
                    to_mark = parts[0].find(")")
                    data["edition"] = parts[0][1:to_mark]
                if " [abstract]" in parts[1]:
                    data["pages"] = parts[1].replace(" [abstract]", "")
                    if data["pages"].endswith("."):
                        data["pages"] = data["pages"][:-1]
                else:
                    data["pages"] = parts[1]
        if "pages" in data and ". [html]" in data["pages"]:
            data["pages"] = data["pages"].replace(". [html]", "")

        # --- hand-entered corrections keyed by PDF name -------------------
        if "pdf" in data:
            if pdf.endswith("2007_50_1_065-068.pdf"):
                data["title"] = "The Parrot Effect – a study of the ability to imitate a foreign language"
                data["author_full"] = "Johanna Persson, Linda Westholm"
                data["edition"] = "1"
                data["pages"] = "065-068"
            elif pdf.endswith("2007_50_1_113-116.pdf"):
                data["title"] = "Automatic classification of 'front' and 'back' pronunciation variants of /r/ in the Götaland dialects of Swedish"
                data["author_full"] = "Johan Frid"
            elif pdf.endswith("2007_50_1_073-076.pdf"):
                data["title"] = "Emotional McGurk effect in Swedish"
                data["pages"] = "073-076"
                data["volume"] = "50"
                data["edition"] = "1"
                data["author_full"] = "Åsa Abelin"
                # Was a second write to "author_full", which discarded the
                # full name; the abbreviated form belongs in "author", as in
                # the /1937.pdf correction above.
                data["author"] = "Abelin, Å."
            elif pdf in MISSING:
                data["title"] = MISSING[pdf][0]
                data["author_full"] = MISSING[pdf][1]

        pubs.append(data)
    return pubs
import json
# Gather the entries of every year listed on the index page, then write them
# to qpsr.json as one JSON array.
# NOTE(review): ``all`` shadows the builtin of the same name; the name is kept
# because it is module-level state other code may read.
all = []
for year in get_years():
    all.extend(read_page(year))
serialized = json.dumps(all, indent=4)
with open("qpsr.json", "w") as json_file:
    json_file.write(serialized)