Convert the NST pronunciation lexicons (Swedish, Norwegian, Danish) to JSON Lines,
converting each pronunciation from the lexicons' X-SAMPA-style notation to IPA along the way.
Based on the NST lexicon documentation from the National Library of Norway (the original
reference link was lost in conversion).
Set up field reading:
import csv
!pip install pyicu
# Column names of the semicolon-separated NST lexicon format, in file order.
# The four transliteration slots and nine "available" slots are numbered
# variants of the same fields, so they are generated rather than spelled out.
field_names = (
    [
        "orthography",
        "extended_pos",
        "morphology",
        "decomp",
        "decpos",
        "source",
        "language_code",
        "garbage",
        "domain",
        "abbr_acr",
        "expansion",
    ]
    + [
        name
        for i in range(1, 5)
        for name in (
            f"transliteration{i}",
            f"certainty_trans_{i}",
            f"status_trans_{i}",
            f"language_code_trans_{i}",
        )
    ]
    + [
        "auto_gen_variants",
        "set_id",
        "set_name",
        "style_status",
        "inflector_role",
        "lemma",
        "inflection_rule",
        "morph_label",
        "compounder_code",
        "semantic_info",
    ]
    + [f"available_field{i}" for i in range(1, 10)]
    + [
        "frequency",
        "original_orthography",
        "comment_field",
        "update_info",
        "unique_id",
    ]
)
# Download the Swedish, Danish, and Norwegian NST lexicon archives from the
# National Library of Norway (nb.no).  These are IPython shell magics — they
# only work in a notebook environment.
!wget https://www.nb.no/sbfil/leksikalske_databaser/leksikon/sv.leksikon.tar.gz -O /tmp/sv.leksikon.tar.gz
!wget https://www.nb.no/sbfil/leksikalske_databaser/leksikon/da_leksikon.tar.gz -O /tmp/da_leksikon.tar.gz
!wget https://www.nb.no/sbfil/leksikalske_databaser/leksikon/no.leksikon.tar.gz -O /tmp/no.leksikon.tar.gz
import tarfile

# Pull the .pron lexicon file out of each downloaded archive.  The files are
# Latin-1 encoded, so decode them explicitly rather than relying on defaults.
data = {}
_archives = [
    ("sv", "/tmp/sv.leksikon.tar.gz",
     "NST svensk leksikon/swe030224NST.pron/swe030224NST.pron"),
    ("no", "/tmp/no.leksikon.tar.gz",
     "NSTs norske leksikon/nor030224NST.pron/nor030224NST.pron"),
    ("da", "/tmp/da_leksikon.tar.gz",
     "dan030224NST.pron/dan030224NST.pron"),
]
for _lang, _archive, _member in _archives:
    with tarfile.open(_archive) as tar:
        data[_lang] = tar.extractfile(_member).read().decode("latin1")
Set up the X-SAMPA → IPA transliterators:
# Hand-written ICU transliteration rules mapping NST's Swedish X-SAMPA-style
# notation to IPA (retroflexes, vowels, stress marks ²/ˈ/ˌ, length ː, and
# the ⁀ tie for diphthongs).
# NOTE(review): unlike NST_TRANSLIT/DA_TRANSLIT below, this is NOT a raw
# string, so Python resolves "\u2040" itself and passes unrecognized escapes
# like "\`" and "\%" through unchanged — the assertions further down the
# file confirm the rules behave as intended; keep this literal non-raw.
TRANSLIT_SV = """
n\` → ɳ ;
s\` → ʂ ;
l\` → ɭ ;
t\` → ʈ ;
d\` → ɖ ;
A → ɑ ;
O → ɔ ;
I → ɪ ;
E \* U → e \u2040 ʊ ;
E → ɛ ;
U → ʊ ;
Y → ʏ ;
2 → ø ;
9 → ø ;
u 0 → ɵ ;
N → ŋ ;
'""' → ² ;
'"' → ˈ ;
\% → ˌ ;
\: → ː ;
\$ → \. ;
g → ɡ ;
s \\\' → ɕ ;
x \\\\ → ɧ ;
\* → \u2040 ;
"""
# ICU rules for the Norwegian lexicon: run ICU's built-in X-SAMPA → IPA
# transform first, then map NST-specific markers — "$" (presumably a
# syllable boundary) to ".", "?" to ˀ (glottalization), and "*" to the
# ⁀ tie bar (U+2040).
NST_TRANSLIT = r"""
::XSampa-IPA;
\$ → \. ;
\? → ˀ;
\* → \u2040 ;
"""
# ICU rules for the Danish lexicon.  "?" (→ ˀ, the Danish stød/glottalization
# mark — TODO confirm) and "_" (→ space) are rewritten BEFORE the X-SAMPA
# transform, unlike the Norwegian rules; "*" and "$" are handled after.
DA_TRANSLIT = r"""
\? → ˀ;
\_ → ' ';
::XSampa-IPA;
\* → \u2040 ;
\$ → \. ;
"""
import icu
def transliterator_from_rules(name, rules):
    """Compile an ICU rule string and return a transliterator for it.

    The compiled rules are registered under *name* so ICU can resolve the
    transliterator by name; the instance returned is created through that
    registry, like a built-in ICU transform.
    """
    compiled = icu.Transliterator.createFromRules(name, rules)
    icu.Transliterator.registerInstance(compiled)
    return icu.Transliterator.createInstance(name)
# One transliterator for Swedish (fully hand-written rules) and one each for
# Norwegian and Danish (X-SAMPA transform plus fix-ups), keyed by language.
swelex_trans = transliterator_from_rules("swelex_trans", TRANSLIT_SV)
nstlex_trans = {
    "no": transliterator_from_rules("nst_trans", NST_TRANSLIT),
    "da": transliterator_from_rules("da_trans", DA_TRANSLIT),
}
# Sanity checks: known Swedish lexicon transcriptions and their expected IPA.
# These exercise the tricky rules — retroflexes, ²/ˈ/ˌ stress marks, the
# escaped s\' (ɕ) and x\\ (ɧ) consonants, the E*U diphthong tie, and u0 (ɵ).
assert swelex_trans.transliterate('""bA:n`s`$%ma$man') == "²bɑːɳʂ.ˌma.man"
assert swelex_trans.transliterate('"b9r$mIN$ham') == "ˈbør.mɪŋ.ham"
assert swelex_trans.transliterate('"bI$rU') == "ˈbɪ.rʊ"
assert swelex_trans.transliterate('""bIsp$%go:$d`en') == "²bɪsp.ˌɡoː.ɖen"
assert swelex_trans.transliterate('"x\\A:l') == "ˈɧɑːl"
assert swelex_trans.transliterate("\"s'u:$lens") == "ˈɕuː.lens"
assert swelex_trans.transliterate('a$"lE*U$te$n`a') == 'a.ˈle⁀ʊ.te.ɳa'
assert swelex_trans.transliterate('"fu0l') == 'ˈfɵl'
def collapse_available_fields(data):
    """Collapse available_field1..9 into a single "available_fields" list.

    Mutates *data* in place and returns it.  Non-empty slot values are
    collected in order; every numbered key is removed unconditionally so
    all output rows share one schema (previously empty slots kept their
    keys).  Falsy values (empty string, or None from a short CSV row)
    are skipped.
    """
    output = []
    for i in range(1, 10):
        value = data.pop(f"available_field{i}")
        if value:
            output.append(value)
    data["available_fields"] = output
    return data
def collapse_transliterations(data, transliterator):
    """Collapse the four numbered transliteration slots into one list.

    Each non-empty slot becomes a dict carrying the raw transliteration,
    its IPA rendering (via *transliterator*.transliterate), and the
    associated certainty/status/language_code metadata.  Mutates *data*
    in place and returns it.  All sixteen numbered keys are removed
    unconditionally so every output row has the same schema (previously
    empty slots kept their keys); falsy slot values (empty string, or
    None from a short CSV row) are skipped rather than transliterated.
    """
    output = []
    for i in range(1, 5):
        translit = data.pop(f"transliteration{i}")
        certainty = data.pop(f"certainty_trans_{i}")
        status = data.pop(f"status_trans_{i}")
        language = data.pop(f"language_code_trans_{i}")
        if translit:
            output.append({
                "transliteration": translit,
                "ipa": transliterator.transliterate(translit),
                "certainty": certainty,
                "status": status,
                "language_code": language,
            })
    data["transliterations"] = output
    return data
import json
import io

# The Danish dump uses CRLF line endings; strip the carriage returns so the
# CSV reader sees clean records.
data["da"] = data["da"].replace('\r', '')

# Map each language to the transliterator that renders its notation as IPA.
_transliterators = {
    "sv": swelex_trans,
    "no": nstlex_trans["no"],
    "da": nstlex_trans["da"],
}

# Write one JSON object per line ("JSON Lines") per language.  Previously the
# Swedish writer duplicated this loop body and omitted newline=''; unified
# here so all three outputs get identical handling.
for lang in ["sv", "no", "da"]:
    # newline='' keeps our explicit "\n" terminators untranslated on all
    # platforms.
    with open(f"{lang}lex.json", "w", newline='') as outf:
        reader = csv.DictReader(
            io.StringIO(data[lang]),
            delimiter=';',
            fieldnames=field_names,
            quoting=csv.QUOTE_NONE,
        )
        for row in reader:
            # The decomposition field joins compound parts with "+"; drop
            # empty parts (leading/trailing/double separators).
            row["decomp"] = [part for part in row["decomp"].split("+") if part != ""]
            row = collapse_available_fields(row)
            row = collapse_transliterations(row, _transliterators[lang])
            outf.write(json.dumps(row) + "\n")