# Compare Swedish dependency trees.
# NOTE: this code was written by ChatGPT and does not work as-is.
!python -m spacy download sv_core_news_sm
import spacy
from typing import List, Dict, Any
# Load the "sv_core_news_sm" pipeline (the model fetched by the download
# command above). are_equivalent() parses both of its sentences with it.
nlp = spacy.load("sv_core_news_sm")
# Example rule DSL.
#
# Each rule is a dict with:
#   "name"      - human-readable label (not read by the matching code here)
#   "pattern"   - list of token specs; keys LEMMA / TEXT / POS / DEP / MORPH
#                 are the ones token_matches() understands
#   "action"    - "collapse_to" or "transform_verb" (see apply_rewrite_rules)
#   "head"      - index into the list of matched tokens, in pattern order
#   "transform" - optional extras; "lemma_map" is used by "transform_verb"
dsl_rules = [
    {
        "name": "den-har + definite noun -> noun",
        "pattern": [
            {"LEMMA": {"in": ["den", "det"]}, "POS": "DET"},
            {"TEXT": "här", "POS": "ADV"},
            {"DEP": "nmod", "POS": "NOUN", "MORPH": {"Definite": "Def"}},
        ],
        "action": "collapse_to",
        "head": 2,
    },
    {
        "name": "har + supine -> past",
        "pattern": [
            {"LEMMA": "ha", "POS": "AUX"},
            {"DEP": "xcomp", "POS": "VERB", "MORPH": {"VerbForm": "Sup"}},
        ],
        "action": "transform_verb",
        "head": 1,
        "transform": {
            # NOTE(review): apply_rewrite_rules() looks this map up by the
            # matched token's lemma_, but these keys look like inflected
            # surface forms - confirm they can ever match.
            "lemma_map": {
                "suttit": "satt",
                "gått": "gick",
                "kommit": "kom",
            },
        },
    },
]
def _build_rewritten_node(head_token, matched, rules, lemma):
    """Build the node dict for ``head_token``, skipping tokens the rule consumed."""
    built = [
        build_tree(child, rules)
        for child in head_token.children
        if child not in matched
    ]
    # build_tree() returns None for punctuation; sorting would otherwise fail
    # with a TypeError when the key function subscripts None.
    children = [node for node in built if node is not None]
    return {
        # Lower-cased so rewritten nodes compare equal to build_tree() nodes.
        'lemma': lemma.lower(),
        'dep': head_token.dep_,
        'pos': head_token.pos_,
        'children': sorted(children, key=lambda node: (node['dep'], node['lemma'])),
    }


def apply_rewrite_rules(token, rules):
    """Return a rewritten tree node for ``token``, or ``None`` if nothing applies.

    Rules are tried in order. The first rule whose pattern matches and whose
    action is recognised decides the result:

    * ``collapse_to``: the matched phrase is replaced by the subtree of the
      token at index ``rule['head']`` of the match.
    * ``transform_verb``: a node is built for the head token, with its lemma
      passed through ``rule['transform']['lemma_map']`` (looked up by
      ``lemma_``) and with the other matched tokens left out of its children.

    A matching rule with any other action is skipped.

    Args:
        token: token whose neighbourhood (itself plus direct children) is
            matched against each rule pattern.
        rules: iterable of rule dicts with ``pattern``, ``action`` and ``head``.

    Returns:
        A node dict with ``lemma``, ``dep``, ``pos`` and ``children`` keys, or
        ``None``.
    """
    for rule in rules:
        match = match_pattern(token, rule['pattern'])
        if not match:
            continue

        action = rule['action']
        if action == 'collapse_to':
            head_token = match[rule['head']]
            if head_token == token:
                # Recursing into build_tree(token) would match this same rule
                # again and never terminate, so build the node directly.
                return _build_rewritten_node(
                    head_token, match, rules, head_token.lemma_
                )
            return build_tree(head_token, rules)

        if action == 'transform_verb':
            head_token = match[rule['head']]
            lemma_map = rule.get('transform', {}).get('lemma_map', {})
            lemma = lemma_map.get(head_token.lemma_, head_token.lemma_)
            return _build_rewritten_node(head_token, match, rules, lemma)

    return None
def match_pattern(token, pattern):
    """Match ``pattern`` against ``token`` and its direct children.

    Each pattern entry is paired with the first not-yet-used candidate that
    satisfies it. Candidates are ``token`` itself followed by its children,
    in that order. Matching is greedy: an earlier choice is never revisited.

    Args:
        token: token whose ``children`` supply the remaining candidates.
        pattern: list of token specs as understood by ``token_matches``.

    Returns:
        The matched tokens in pattern order, or ``None`` as soon as one entry
        has no candidate. An empty pattern yields an empty list, which callers
        treat as "no match".
    """
    matched = []
    candidates = [token] + list(token.children)
    for spec in pattern:
        for candidate in candidates:
            if token_matches(candidate, spec) and candidate not in matched:
                matched.append(candidate)
                break
        else:
            # Inner loop finished without a break: nothing satisfied this spec.
            return None
    return matched
def token_matches(token, rule):
    """Return ``True`` if ``token`` satisfies every condition in ``rule``.

    Supported keys:

    * ``LEMMA``, ``TEXT``, ``POS``, ``DEP``: compared with ``token.lemma_``,
      ``token.text``, ``token.pos_`` and ``token.dep_`` through
      ``match_value``. All four accept a plain value or an ``{"in": [...]}``
      condition.
    * ``MORPH``: a dict of feature name to value. ``token.morph.get(name)``
      must equal the one-element list ``[value]``.

    Any other key is ignored.
    """
    attribute_for_key = {
        "LEMMA": "lemma_",
        "TEXT": "text",
        "POS": "pos_",
        "DEP": "dep_",
    }
    for key, value in rule.items():
        if key == "MORPH":
            for feature, expected in value.items():
                if token.morph.get(feature) != [expected]:
                    return False
        elif key in attribute_for_key:
            # The attribute is only read when the rule mentions its key.
            if not match_value(getattr(token, attribute_for_key[key]), value):
                return False
    return True
def match_value(val, cond):
    """Return whether ``val`` satisfies the condition ``cond``.

    Args:
        val: the value read from a token.
        cond: either a plain value (equality test) or a dict condition.
            The only supported dict form is ``{"in": container}``, a
            membership test.

    Returns:
        ``True`` or ``False``. A dict condition without an ``"in"`` key never
        matches; it used to fall through and return ``None`` implicitly.
    """
    if isinstance(cond, dict):
        if "in" in cond:
            return val in cond["in"]
        return False
    return val == cond
def build_tree(token, rules):
    """Convert ``token`` and its subtree into a nested, order-independent dict.

    Rewrite rules are tried first; if one applies, its result is returned
    as-is. Otherwise punctuation tokens are dropped (``None``) and every other
    token becomes a node with its lower-cased lemma, dependency label, POS tag
    and recursively built children.

    Children are sorted by ``(dep, lemma)`` so the result does not depend on
    the order in which the parser yields them.

    Returns:
        A node dict with ``lemma``, ``dep``, ``pos`` and ``children`` keys, or
        ``None`` for punctuation.
    """
    # Try applying rewrite rules
    rewritten = apply_rewrite_rules(token, rules)
    if rewritten is not None:
        return rewritten

    if token.pos_ == "PUNCT":
        return None

    child_nodes = [
        node
        for node in (build_tree(child, rules) for child in token.children)
        if node is not None  # punctuation children are dropped
    ]
    return {
        'lemma': token.lemma_.lower(),
        'dep': token.dep_,
        'pos': token.pos_,
        'children': sorted(child_nodes, key=lambda node: (node['dep'], node['lemma'])),
    }
def get_root_trees(doc, rules=None):
    """Build one tree per non-punctuation root token of ``doc``.

    A root is a token that is its own head.

    Args:
        doc: iterable of tokens (a parsed document).
        rules: rewrite rules passed to ``build_tree``. Defaults to the
            module-level ``dsl_rules`` when omitted.

    Returns:
        The root trees sorted by ``(dep, lemma)``.
    """
    if rules is None:
        rules = dsl_rules
    trees = [
        build_tree(token, rules)
        for token in doc
        if token.head == token and token.pos_ != "PUNCT"
    ]
    return sorted(trees, key=lambda tree: (tree['dep'], tree['lemma']))
def compare_trees(t1, t2):
    """Return ``True`` if two tree nodes are structurally identical.

    Two nodes are identical when their ``lemma``, ``dep`` and ``pos`` agree,
    they have the same number of children, and the children are pairwise
    identical in order. Both child lists are expected to be sorted already,
    as ``build_tree`` produces them.
    """
    same_label = all(t1[key] == t2[key] for key in ('lemma', 'dep', 'pos'))
    if not same_label:
        return False

    kids1, kids2 = t1['children'], t2['children']
    if len(kids1) != len(kids2):
        return False

    return all(compare_trees(c1, c2) for c1, c2 in zip(kids1, kids2))
def are_equivalent(sent1, sent2, *, verbose=True):
    """Return ``True`` if both sentences produce the same normalised trees.

    Each sentence is parsed with the module-level ``nlp`` pipeline and turned
    into root trees by ``get_root_trees``. The sentences are equivalent when
    they have the same number of root trees and the trees are pairwise equal
    according to ``compare_trees``.

    Args:
        sent1: first sentence.
        sent2: second sentence.
        verbose: when true (the default, as before), pretty-print both tree
            lists to stdout for debugging.
    """
    doc1 = nlp(sent1)
    doc2 = nlp(sent2)
    trees1 = get_root_trees(doc1)
    trees2 = get_root_trees(doc2)

    if verbose:
        import pprint
        pprint.pprint(trees1)
        pprint.pprint(trees2)

    if len(trees1) != len(trees2):
        return False
    return all(compare_trees(t1, t2) for t1, t2 in zip(trees1, trees2))
# Example usage
s1 = "Katten satt på mattan."
s2 = "Den här katten har suttit på mattan."
print("Equivalent:", are_equivalent(s1, s2))
# are_equivalent() pretty-prints both tree lists itself. Its ``trees1`` and
# ``trees2`` are local to that function, so printing them here raised a
# NameError; those two statements have been removed.
import pprint