import xml.etree.ElementTree as ET
from collections import defaultdict
import json

def parse_lexical_resource(xml_file):
    """
    Build a word-form lookup table from an XML lexical resource.

    Each ``LexicalEntry`` element that has a ``Lemma/FormRepresentation``
    descendant contributes one record per ``WordForm`` descendant whose
    ``writtenForm`` feature is non-empty. Entries without such a lemma
    representation contribute nothing.

    Args:
        xml_file: Source handed straight to ``ElementTree.parse``.

    Returns:
        A plain dict mapping each written word form to a list of dicts with
        the keys ``lemma``, ``lemgram``, ``partOfSpeech`` and ``paradigm``
        (read from the entry's lemma) plus ``msd`` (read from the word form).
        A feature that is absent defaults to the empty string.
    """
    def read_feats(element):
        # Collapse the direct <feat att="..." val="..."/> children into a dict;
        # a repeated attribute keeps the value of its last occurrence.
        return {f.get('att'): f.get('val') for f in element.findall('feat')}

    analyses_by_form = defaultdict(list)
    document_root = ET.parse(xml_file).getroot()

    for lexical_entry in document_root.findall('.//LexicalEntry'):
        representation = lexical_entry.find('.//Lemma/FormRepresentation')
        if representation is None:
            # No lemma information available: skip the whole entry.
            continue

        lemma_feats = read_feats(representation)
        # Fields shared by every inflected form of this entry.
        shared_fields = {
            'lemma': lemma_feats.get('writtenForm', ''),
            'lemgram': lemma_feats.get('lemgram', ''),
            'partOfSpeech': lemma_feats.get('partOfSpeech', ''),
            'paradigm': lemma_feats.get('paradigm', ''),
        }

        for word_form in lexical_entry.findall('.//WordForm'):
            form_feats = read_feats(word_form)
            surface = form_feats.get('writtenForm', '')
            if not surface:
                # A word form without a usable written form cannot be a key.
                continue
            # A fresh dict per record, so records never share mutable state.
            analyses_by_form[surface].append(
                {**shared_fields, 'msd': form_feats.get('msd', '')}
            )

    # Hand back a regular dict so missing keys raise instead of auto-creating.
    return dict(analyses_by_form)


def save_lookup_dict(lookup_dict, output_file='saldom.json'):
    """
    Write the lookup dictionary to disk as indented UTF-8 JSON.

    Non-ASCII characters are written verbatim rather than as ``\\uXXXX``
    escapes. A confirmation line naming the output file is printed once the
    file has been written.

    Args:
        lookup_dict: JSON-serializable mapping to store.
        output_file: Destination path; opened in write mode, so an existing
            file is overwritten.
    """
    json_options = {'ensure_ascii': False, 'indent': 2}
    with open(output_file, mode='w', encoding='utf-8') as handle:
        json.dump(lookup_dict, handle, **json_options)
    print(f"Lookup dictionary saved to {output_file}")
# Driver: parse the saldom.xml lexical resource and store the lookup as JSON.
xml_file = '/kaggle/input/saldos-morphology/saldom.xml'
lookup_dict = parse_lexical_resource(xml_file)

# The count is the number of distinct written forms (dictionary keys),
# not the total number of analyses stored in the value lists.
word_form_count = len(lookup_dict)
print(f"Created lookup dictionary with {word_form_count} word forms\n")

# Uses the default output path defined by save_lookup_dict.
save_lookup_dict(lookup_dict)
# Output of the run above:
# Created lookup dictionary with 1025276 word forms
#
# Lookup dictionary saved to saldom.json