# Convert the SALDO morphology lexicon (saldom.xml) to JSON
# in order to create a lemmatisation lookup dictionary.
import xml.etree.ElementTree as ET
from collections import defaultdict
import json
def _collect_feats(element):
    """Return the ``att`` -> ``val`` mapping of *element*'s direct ``feat`` children."""
    return {feat.get('att'): feat.get('val') for feat in element.findall('feat')}


def parse_lexical_resource(xml_file) -> dict:
    """Parse an XML lexical resource into a word-form lookup dictionary.

    Every ``LexicalEntry`` below the document root is inspected. The lemma
    attributes are read from the ``feat`` children of the entry's
    ``Lemma/FormRepresentation`` element, and one analysis is recorded for
    each ``WordForm`` of the entry that has a non-empty ``writtenForm``.

    Args:
        xml_file: Anything accepted by ``xml.etree.ElementTree.parse``
            (a file name or an open file object).

    Returns:
        A plain ``dict`` whose keys are written word forms and whose values
        are lists of analyses, in document order. Each analysis is a dict
        with the keys ``'lemma'``, ``'lemgram'``, ``'partOfSpeech'``,
        ``'paradigm'`` and ``'msd'``; a feature that is absent from the XML
        is stored as ``''``.

    Raises:
        xml.etree.ElementTree.ParseError: If the input is not well-formed XML.
    """
    root = ET.parse(xml_file).getroot()
    lexicon_lookup = defaultdict(list)

    for entry in root.findall('.//LexicalEntry'):
        lemma_elem = entry.find('.//Lemma/FormRepresentation')
        if lemma_elem is None:
            # Without a lemma there is nothing to map the word forms to;
            # skipping here also guarantees that lemma data from a previous
            # entry can never be attached to this entry's word forms.
            continue

        lemma_info = _collect_feats(lemma_elem)
        lemma = lemma_info.get('writtenForm', '')
        lemgram = lemma_info.get('lemgram', '')
        pos = lemma_info.get('partOfSpeech', '')
        paradigm = lemma_info.get('paradigm', '')

        for wordform in entry.findall('.//WordForm'):
            form_info = _collect_feats(wordform)
            written_form = form_info.get('writtenForm', '')
            if not written_form:
                # A form without a written representation cannot be a key.
                continue
            lexicon_lookup[written_form].append({
                'lemma': lemma,
                'lemgram': lemgram,
                'partOfSpeech': pos,
                'paradigm': paradigm,
                'msd': form_info.get('msd', ''),
            })

    # Hand back a plain dict so that looking up an unknown word form raises
    # KeyError instead of silently inserting an empty list.
    return dict(lexicon_lookup)
def save_lookup_dict(lookup_dict, output_file='saldom.json', *, indent=2):
    """Write the lookup dictionary to a UTF-8 encoded JSON file.

    Args:
        lookup_dict: JSON-serialisable object to store.
        output_file: Destination path; the file is opened in ``'w'`` mode,
            so an existing file is overwritten.
        indent: Forwarded to ``json.dump``. The default of 2 produces the
            pretty-printed layout; pass ``None`` for compact single-line
            output, which is considerably smaller for a large lexicon.

    A confirmation message is printed once the file has been written.
    """
    with open(output_file, 'w', encoding='utf-8') as out_file:
        # ensure_ascii=False writes non-ASCII characters as themselves
        # instead of \uXXXX escapes.
        json.dump(lookup_dict, out_file, ensure_ascii=False, indent=indent)
    print(f"Lookup dictionary saved to {output_file}")
# Location of the saldom.xml input file (a path under /kaggle/input).
xml_file = '/kaggle/input/saldos-morphology/saldom.xml'
# Build the word-form lookup from the XML and report how many distinct
# word forms (dictionary keys) it contains.
lookup_dict = parse_lexical_resource(xml_file)
print(f"Created lookup dictionary with {len(lookup_dict)} word forms\n")
# Persist the lookup as JSON; no path is given, so save_lookup_dict's
# default output file is used.
save_lookup_dict(lookup_dict)