!python -m spacy download sv_core_news_sm
Collecting sv-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/sv_core_news_sm-3.8.0/sv_core_news_sm-3.8.0-py3-none-any.whl (12.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.7/12.7 MB 72.3 MB/s eta 0:00:00
Installing collected packages: sv-core-news-sm
Successfully installed sv-core-news-sm-3.8.0
✔ Download and installation successful
You can now load the package via spacy.load('sv_core_news_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
import spacy
from typing import List, Dict, Any

# Shared spaCy pipeline for Swedish (the `sv_core_news_sm` package downloaded
# above); are_equivalent() calls it to parse each input sentence.
nlp = spacy.load("sv_core_news_sm")

# Example rule DSL.
#
# Each rule is a dict read by apply_rewrite_rules():
#   "name"      - human-readable label (not read by the code in this file).
#   "pattern"   - list of token specs; match_pattern() tries to satisfy each
#                 spec, in order, with a distinct token taken from the current
#                 token and its direct children. Spec keys understood by
#                 token_matches(): LEMMA, TEXT, POS, DEP, MORPH.
#   "action"    - "collapse_to" or "transform_verb".
#   "head"      - index into the list of matched tokens selecting the token
#                 the action operates on.
#   "transform" - optional; its "lemma_map" is consulted by "transform_verb".
#
# NOTE(review): in the sample output recorded for the example sentence, "Den"
# is lemmatised as "en", "katten" is attached as nsubj and "suttit" is the
# ROOT, so neither rule below appears to fire with sv_core_news_sm - confirm
# the LEMMA/DEP constraints against the parser's actual analyses.
dsl_rules = [
    {
        "name": "den-har + definite noun -> noun",
        "pattern": [
            {"LEMMA": {"in": ["den", "det"]}, "POS": "DET"},
            {"TEXT": "här", "POS": "ADV"},
            {"DEP": "nmod", "POS": "NOUN", "MORPH": {"Definite": "Def"}}
        ],
        "action": "collapse_to",
        # Index 2 is the noun spec: the match is replaced by the noun's tree.
        "head": 2
    },
    {
        "name": "har + supine -> past",
        "pattern": [
            {"LEMMA": "ha", "POS": "AUX"},
            {"DEP": "xcomp", "POS": "VERB", "MORPH": {"VerbForm": "Sup"}}
        ],
        "action": "transform_verb",
        # Index 1 is the supine verb spec.
        "head": 1,
        "transform": {
            # Looked up with the head token's `lemma_`.
            # NOTE(review): the keys look like supine surface forms rather
            # than lemmas - verify they match what the lemmatizer emits.
            "lemma_map": {
                "suttit": "satt",
                "gått": "gick",
                "kommit": "kom"
            }
        }
    }
]

def _rewritten_node(head, lemma, matched, rules):
    """Build a tree node for ``head`` whose matched tokens are left out.

    Children of ``head`` that are part of ``matched`` are skipped, and so are
    children for which ``build_tree`` returns ``None`` (punctuation).
    """
    children = [
        node
        for node in (
            build_tree(child, rules)
            for child in head.children
            if child not in matched
        )
        if node is not None
    ]
    children.sort(key=lambda node: (node['dep'], node['lemma']))
    return {
        'lemma': lemma,
        'dep': head.dep_,
        'pos': head.pos_,
        'children': children,
    }


def apply_rewrite_rules(token, rules):
    """Rewrite the subtree at ``token`` with the first rule that matches.

    Args:
        token: spaCy token to try the rules on.
        rules: iterable of rule dicts with ``pattern``, ``action`` and
            ``head`` keys (plus optional ``transform``).

    Returns:
        A tree node dict (``lemma``/``dep``/``pos``/``children``) when a rule
        with a known action matches, otherwise ``None``. An empty match list
        counts as "no match"; rules with an unknown action are skipped.
    """
    for rule in rules:
        match = match_pattern(token, rule['pattern'])
        if not match:
            continue

        head_token = match[rule['head']]
        action = rule['action']

        if action == 'collapse_to':
            if head_token != token:
                return build_tree(head_token, rules)
            # The head is the token being rewritten: going through
            # build_tree() would call back into this function, match the same
            # rule again and recurse forever. Build the node directly and
            # drop the other matched tokens instead.
            return _rewritten_node(
                head_token, head_token.lemma_.lower(), match, rules
            )

        if action == 'transform_verb':
            lemma_map = rule.get('transform', {}).get('lemma_map', {})
            raw_lemma = head_token.lemma_
            # Exact key first (original behaviour), then a lower-cased lookup;
            # the fallback is lower-cased so it agrees with build_tree().
            lemma = lemma_map.get(
                raw_lemma,
                lemma_map.get(raw_lemma.lower(), raw_lemma.lower()),
            )
            return _rewritten_node(head_token, lemma, match, rules)

    return None

def match_pattern(token, pattern):
    """Greedily bind every spec in ``pattern`` to a distinct nearby token.

    Candidates are ``token`` followed by its direct children. Specs are
    handled in order; each takes the first candidate that satisfies it and
    has not been taken by an earlier spec (no backtracking).

    Returns:
        The list of bound tokens, aligned with ``pattern``, or ``None`` as
        soon as one spec cannot be bound.
    """
    candidates = [token, *token.children]
    bound = []
    for spec in pattern:
        hit = next(
            (
                cand
                for cand in candidates
                if token_matches(cand, spec) and cand not in bound
            ),
            None,
        )
        if hit is None:
            return None
        bound.append(hit)
    return bound

def token_matches(token, rule):
    """Return True when ``token`` satisfies every constraint in ``rule``.

    Supported keys:
        LEMMA, TEXT, POS, DEP: compared through ``match_value``, so each
            accepts either a plain string or an ``{"in": [...]}`` condition.
            (POS and DEP previously only supported plain strings.)
        MORPH: dict of feature -> value; the token's value list for each
            feature must be exactly ``[value]``.

    Keys other than these are ignored.
    """
    for key, value in rule.items():
        if key == "LEMMA":
            satisfied = match_value(token.lemma_, value)
        elif key == "TEXT":
            satisfied = match_value(token.text, value)
        elif key == "POS":
            satisfied = match_value(token.pos_, value)
        elif key == "DEP":
            satisfied = match_value(token.dep_, value)
        elif key == "MORPH":
            satisfied = all(
                token.morph.get(feature) == [expected]
                for feature, expected in value.items()
            )
        else:
            # Unknown constraint keys never reject a token.
            satisfied = True
        if not satisfied:
            return False
    return True

def match_value(val, cond):
    """Test ``val`` against a rule condition and always return a bool.

    Args:
        val: the token attribute value to test.
        cond: either a plain value (equality test) or a dict condition.
            The only supported dict form is ``{"in": [...]}`` (membership).

    Returns:
        True if the condition holds, False otherwise. A dict condition with
        no supported operator yields False rather than an implicit ``None``.
    """
    if isinstance(cond, dict):
        if "in" in cond:
            return val in cond["in"]
        return False
    return val == cond

def build_tree(token, rules):
    """Convert ``token`` and its dependents into a nested dict tree.

    A rewrite produced by ``apply_rewrite_rules`` takes precedence. Failing
    that, punctuation tokens yield ``None``; any other token yields a node
    with its lower-cased lemma, dependency label, POS tag and the trees of
    its children ordered by ``(dep, lemma)``.
    """
    node = apply_rewrite_rules(token, rules)
    if node:
        return node

    if token.pos_ == "PUNCT":
        return None

    # Child subtrees that came back empty (punctuation) are dropped.
    subtrees = [
        sub
        for sub in (build_tree(kid, rules) for kid in token.children)
        if sub
    ]
    subtrees.sort(key=lambda sub: (sub['dep'], sub['lemma']))
    return {
        'lemma': token.lemma_.lower(),
        'dep': token.dep_,
        'pos': token.pos_,
        'children': subtrees,
    }

def get_root_trees(doc, rules=None):
    """Build one tree per non-punctuation root token of ``doc``.

    Args:
        doc: parsed spaCy document.
        rules: rewrite rules passed on to ``build_tree``. Defaults to the
            module-level ``dsl_rules`` when omitted, which is what the
            function always used before this parameter existed.

    Returns:
        List of tree dicts sorted by ``(dep, lemma)``.
    """
    if rules is None:
        rules = dsl_rules
    trees = [
        build_tree(tok, rules)
        for tok in doc
        # A root is a token that is its own head.
        if tok.head == tok and tok.pos_ != "PUNCT"
    ]
    trees.sort(key=lambda tree: (tree['dep'], tree['lemma']))
    return trees

def compare_trees(t1, t2):
    """Return True when two tree dicts are structurally identical.

    Nodes are compared on ``lemma``, ``dep`` and ``pos``; their ``children``
    lists must have the same length and match pairwise, in order.
    """
    # Explicit stack instead of recursion; pairs are pushed in reverse so
    # they are visited in the same depth-first, left-to-right order.
    pending = [(t1, t2)]
    while pending:
        left, right = pending.pop()
        for field in ('lemma', 'dep', 'pos'):
            if left[field] != right[field]:
                return False
        left_kids = left['children']
        right_kids = right['children']
        if len(left_kids) != len(right_kids):
            return False
        pending.extend(reversed(list(zip(left_kids, right_kids))))
    return True

def are_equivalent(sent1, sent2):
    """Parse two sentences and report whether their rewritten trees match.

    Both sentences are run through ``nlp`` and turned into root trees; the
    two tree lists are pretty-printed for inspection and then compared
    position by position with ``compare_trees``.

    Returns:
        True when both sentences yield the same number of root trees and
        every corresponding pair compares equal, False otherwise.
    """
    from pprint import pprint as show

    parsed = [nlp(sentence) for sentence in (sent1, sent2)]
    first_trees, second_trees = (get_root_trees(doc) for doc in parsed)

    # Debug output: dump both tree lists before comparing them.
    show(first_trees)
    show(second_trees)

    return len(first_trees) == len(second_trees) and all(
        compare_trees(left, right)
        for left, right in zip(first_trees, second_trees)
    )

# Example usage: run the equivalence check on two Swedish sentences and print
# the verdict. This executes at top level (there is no `__main__` guard), and
# are_equivalent() also pretty-prints both tree lists before returning.
s1 = "Katten satt på mattan."
s2 = "Den här katten har suttit på mattan."
print("Equivalent:", are_equivalent(s1, s2))
[{'children': [{'children': [{'children': [],
                              'dep': 'fixed',
                              'lemma': 'mattan',
                              'pos': 'NOUN'}],
                'dep': 'compound:prt',
                'lemma': 'på',
                'pos': 'ADP'},
               {'children': [],
                'dep': 'nsubj',
                'lemma': 'katt',
                'pos': 'NOUN'}],
  'dep': 'ROOT',
  'lemma': 'sitta',
  'pos': 'VERB'}]
[{'children': [{'children': [], 'dep': 'aux', 'lemma': 'ha', 'pos': 'AUX'},
               {'children': [{'children': [{'children': [],
                                            'dep': 'fixed',
                                            'lemma': 'här',
                                            'pos': 'ADV'}],
                              'dep': 'det',
                              'lemma': 'en',
                              'pos': 'DET'}],
                'dep': 'nsubj',
                'lemma': 'katt',
                'pos': 'NOUN'},
               {'children': [{'children': [],
                              'dep': 'case',
                              'lemma': 'på',
                              'pos': 'ADP'}],
                'dep': 'obl',
                'lemma': 'mattan',
                'pos': 'NOUN'}],
  'dep': 'ROOT',
  'lemma': 'sutti',
  'pos': 'VERB'}]
Equivalent: False
import pprint

# `trees1` / `trees2` only exist as locals inside are_equivalent(), so
# referencing them here raised NameError. Rebuild the trees from the example
# sentences before printing them.
trees1 = get_root_trees(nlp(s1))
trees2 = get_root_trees(nlp(s2))
pprint.pprint(trees1)
pprint.pprint(trees2)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-4-44d2d520f43c> in <cell line: 0>()
      1 import pprint
----> 2 pprint.pprint(trees1)
      3 pprint.pprint(trees2)

NameError: name 'trees1' is not defined