Get data from the Folkets lexikon Swedish-English dictionary.
Just the pronunciation information.
# Public XML dump of the Folkets lexikon Swedish-English dictionary.
DICT = "https://folkets-lexikon.csc.kth.se/folkets/folkets_sv_en_public.xml"
import requests
# The timeout bounds the connect and each read, so an unresponsive server
# cannot hang the script forever.
req = requests.get(DICT, timeout=60)
# Fail loudly on anything but a plain 200. An explicit raise is used instead
# of `assert` because assertions are stripped when Python runs with -O.
if req.status_code != 200:
    raise RuntimeError(f"Unexpected HTTP status {req.status_code} for {DICT}")
import xml.etree.ElementTree as ET
# Parse the raw bytes rather than req.text: req.text is decoded with the
# charset requests infers from the HTTP headers, which can be wrong, whereas
# the XML parser reads the encoding from the document's own declaration.
tree = ET.fromstring(req.content)
# Flatten every <word> element into a plain dict and collect them in `words`.
# Each dict always has "word"; "comment", "lang" and "class" are copied only
# when the element carries them; "soundfile" and "transcription" come from
# the <phonetic> child, when there is one.
words = []
for entry in tree.findall("word"):
    attrs = entry.attrib
    record = {"word": attrs["value"]}
    record.update(
        (key, attrs[key]) for key in ("comment", "lang", "class") if key in attrs
    )
    phonetic = entry.find("phonetic")
    if phonetic is not None:
        # The sound file is optional; the transcription is read from every
        # <phonetic> element.
        if "soundFile" in phonetic.attrib:
            record["soundfile"] = phonetic.attrib["soundFile"]
        record["transcription"] = phonetic.attrib["value"]
    words.append(record)
# Non-ASCII letters that may occur in a sound-file name, mapped to the text
# that replaces them when the audio URL is built: "0" followed by the
# character's code point in octal (e.g. "ä" is U+00E4, octal 344 -> "0344").
CHAR_REPLACE = {letter: f"0{ord(letter):o}" for letter in "àéÖöÄäÅåê"}
# The apostrophe maps to itself; having it as a key means the check for
# unmapped characters treats it as known.
CHAR_REPLACE["'"] = "'"
# Characters that need no replacement at all.
PLAINASCII = (
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "- "
)
# First pass over the sound-file names.
#  * Names that are already http:// URLs are moved straight to "url" (and
#    "soundfile" is removed so later passes skip them).
#  * Every other name is checked for characters that are neither plain ASCII
#    nor covered by CHAR_REPLACE; each offending name is printed once.
seen = []
for entry in words:
    if "soundfile" not in entry:
        continue
    sound = entry["soundfile"]
    if sound.startswith("http://"):
        # Some values contain a second, embedded URL after "sound/"; in that
        # case only the last "http://..." part is kept.
        start = sound.rfind("http://") if "sound/http://" in sound else 0
        entry["url"] = sound[start:]
        del entry["soundfile"]
        continue
    stem = sound.replace(".swf", "")
    has_unmapped = any(
        ch not in PLAINASCII and ch not in CHAR_REPLACE for ch in stem
    )
    if has_unmapped and stem not in seen:
        print(stem)
        seen.append(stem)
# Sound-file stems (after character replacement) for which no URL is
# generated -- presumably because the audio does not exist on the server.
MISSING = [
    "arbetsmarknadsinstitut",
    "becquerel",
    "inner-",
    "j0344mn-",
    "j0344tte-",
    "kanon-",
    "svin-",
]
# Second pass: turn each remaining .swf sound-file name into an .mp3 URL.
for entry in words:
    if "soundfile" not in entry or "url" in entry:
        continue
    stem = entry["soundfile"].replace(".swf", "")
    for original, replacement in CHAR_REPLACE.items():
        stem = stem.replace(original, replacement)
    # Drop a single leading space, if any.
    if stem.startswith(" "):
        stem = stem[1:]
    if stem not in MISSING:
        entry["url"] = f"http://lexin.nada.kth.se/sound/{stem}.mp3"
import json
# Dump the complete word list (entries with and without a URL) as one JSON
# array.
with open("/tmp/folkets-sv-en.json", "w") as json_file:
    json.dump(words, json_file)
# Write one "word<TAB>transcription<TAB>url" line per entry that has a URL,
# skipping exact duplicate lines and keeping first-occurrence order.
tsv_lines = (
    f"{word['word']}\t{word.get('transcription', '')}\t{word['url']}"
    for word in words
    if "url" in word
)
# dict.fromkeys de-duplicates in a single pass while preserving order; the
# previous `line in seen` test on a list was quadratic in the number of lines.
seen = list(dict.fromkeys(tsv_lines))
# The lines contain Swedish text, so the encoding is pinned to UTF-8 rather
# than left to the platform default.
with open("/tmp/folkets-sounds.tsv", "w", encoding="utf-8") as outf:
    outf.writelines(f"{line}\n" for line in seen)
Through sheer laziness, the audio was downloaded like this:
cat /tmp/folkets-sounds.tsv | awk -F'\t' '{print $3}' > /tmp/folkets-urls
wget -x -c -i /tmp/folkets-urls -o /tmp/folkets-urls.log