Re-parse Swedish UD Talbanken with Stanza
The goal is to find out what kinds of errors Stanza makes.
%%capture
# Install (or upgrade) the packages this notebook uses: Stanza, the conllu
# reader (pinned to version 4.5) and tqdm.
!pip install -U stanza conllu==4.5 tqdm
# Fetch eval.py from the master branch of the UniversalDependencies/tools
# repository and make it executable.
!wget -O eval.py https://raw.githubusercontent.com/UniversalDependencies/tools/refs/heads/master/eval.py
!chmod +x eval.py
import stanza, pathlib
# Download Stanza's resources for Swedish ('sv').
stanza.download('sv')
# Download the r2.16 tagged archive of UD_Swedish-Talbanken from GitHub and
# unpack it in the current working directory.
!wget -O sv_talbanken.tar.gz https://github.com/UniversalDependencies/UD_Swedish-Talbanken/archive/refs/tags/r2.16.tar.gz
!tar -xf sv_talbanken.tar.gz
from stanza.utils.conll import CoNLL
from tqdm import tqdm
import stanza, pathlib
# Swedish Stanza pipeline that runs tokenization, POS tagging, lemmatization
# and dependency parsing on raw text, on the CPU.
# NOTE(review): the input is not treated as pre-tokenized, so Stanza does its
# own tokenization; it presumably may also split one input text into several
# sentences -- confirm whether that is intended for this experiment.
pipeline_options = {
    "lang": "sv",
    "processors": "tokenize,pos,lemma,depparse",
    "tokenize_pretokenized": False,
    "use_gpu": False,
}
nlp = stanza.Pipeline(**pipeline_options)
from conllu import parse_incr
# Re-parse every treebank split from the raw sentence text and write the
# pipeline's analysis to talbanken-stanza-<split>.conllu, keeping the gold
# sentence-level metadata (e.g. the text the sentence was parsed from).
for split in ["test", "dev", "train"]:
    INFILE = f"UD_Swedish-Talbanken-r2.16/sv_talbanken-ud-{split}.conllu"
    OUTFILE = f"talbanken-stanza-{split}.conllu"
    with open(INFILE, encoding="utf-8") as gold, \
            open(OUTFILE, "w", encoding="utf-8") as out:
        for tokenlist in tqdm(parse_incr(gold)):
            # Parse from the raw text only; the gold tokens are not used.
            doc = nlp(tokenlist.metadata["text"])
            # Drop the comment lines from the pipeline's CoNLL-U output; the
            # gold metadata written below takes their place.
            lines = [
                line
                for line in "{:C}".format(doc).split("\n")
                if not line.startswith("#")
            ]
            # CoNLL-U sentence metadata has the form "# key = value" (not
            # "# key: value"), so that readers can recover sent_id/text.
            # Entries without a value are written as a bare "# key" instead
            # of the literal string "None".
            mdlines = [
                f"# {key}" if value is None else f"# {key} = {value}"
                for key, value in tokenlist.metadata.items()
            ]
            lines = mdlines + lines
            out.write("\n".join(lines))
            # A blank line terminates the sentence block.
            out.write("\n\n")