!pip install jiwer
Collecting jiwer
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Requirement already satisfied: click>=8.1.8 in /usr/local/lib/python3.12/dist-packages (from jiwer) (8.3.0)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.14.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading jiwer-4.0.0-py3-none-any.whl (23 kB)
Downloading rapidfuzz-3.14.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.2/3.2 MB 31.8 MB/s eta 0:00:00
Installing collected packages: rapidfuzz, jiwer
Successfully installed jiwer-4.0.0 rapidfuzz-3.14.1
!unzip upload.zip
Archive:  upload.zip
  inflating: splits-orig/eng.test.retsv  
  inflating: splits-orig/ita.test.retsv  
  inflating: splits-orig/swe.test.retsv  
  inflating: splits-orig/spa.test.retsv  
  inflating: splits-orig/lat.test.retsv  
  inflating: splits-orig/dan.test.retsv  
  inflating: splits-orig/fre.test.retsv  
  inflating: output/spa.mwl.out      
  inflating: output/lat.wl.out       
  inflating: output/ita.mwl.out      
  inflating: output/dan.wl-merged.out  
  inflating: output/eng.mwl.out      
  inflating: output/fre.wl.out       
  inflating: output/ita.wl.out       
  inflating: output/swe.merged.out   
  inflating: output/fre.mwl.out      
  inflating: output/spa.wl.out       
  inflating: output/eng.wl-merged.out  
  inflating: output/spa.wl-merged.out  
  inflating: output/lat.mwl.out      
  inflating: output/dan.wl.out       
  inflating: output/swe.wl.out       
  inflating: output/ita.wl-merged.out  
  inflating: output/dan.mwl.out      
  inflating: output/eng.wl.out       
  inflating: output/lat.wl-merged.out  
  inflating: output/fre.wl-merged.out  
!mv /content/output/swe.merged.out /content/output/swe.mwl.out
from jiwer import wer

# Superscript digits that may prefix a line in the ``.mwl.out`` files; a single
# leading one is dropped before the line is split into columns.
_SUPERSCRIPT_PREFIXES = tuple("¹²³⁴⁵⁶")


def _read_pairs(path, drop_superscript=False):
    """Read a tab-separated file into a ``{first column: second column}`` dict.

    Args:
        path: Path of the file to read.
        drop_superscript: When true, one leading superscript digit (¹-⁶) is
            removed from each line before it is split.

    Returns:
        A dict mapping the first column of each line to its second column.
        Columns beyond the second are ignored; on duplicate keys the last
        line wins.

    Raises:
        ValueError: If a non-blank line has fewer than two columns.
    """
    pairs = {}
    # The data contains non-ASCII characters, so do not rely on the
    # platform's default encoding.
    with open(path, encoding="utf-8") as handle:
        for lineno, text in enumerate(handle, start=1):
            if drop_superscript and text.startswith(_SUPERSCRIPT_PREFIXES):
                text = text[1:]
            fields = text.strip().split("\t")
            if fields == [""]:
                # Blank (e.g. trailing) line: nothing to record.
                continue
            if len(fields) < 2:
                raise ValueError(
                    f"{path}:{lineno}: expected two tab-separated columns, "
                    f"got {text!r}"
                )
            pairs[fields[0]] = fields[1]
    return pairs


# Per-language lookup tables, each mapping first column -> second column:
#   test - splits-orig/<lang>.test.retsv
#   mwl  - output/<lang>.mwl.out
#   wl   - output/<lang>.wl.out
#   wlm  - output/<lang>.wl-merged.out
test = {}
mwl = {}
wl = {}
wlm = {}

for lang in ["swe", "dan", "eng", "spa", "lat", "fre", "ita"]:
    test[lang] = _read_pairs(f"splits-orig/{lang}.test.retsv")
    mwl[lang] = _read_pairs(f"output/{lang}.mwl.out", drop_superscript=True)
    wl[lang] = _read_pairs(f"output/{lang}.wl.out")
    if lang == "swe":
        # The archive ships no swe.wl-merged.out, so the Swedish entry
        # stays empty.
        wlm[lang] = {}
    else:
        wlm[lang] = _read_pairs(f"output/{lang}.wl-merged.out")
# Score the WL outputs against the test split, one jiwer `wer` value per language.
for lang in ["swe", "dan", "eng", "spa", "lat", "fre", "ita"]:
    gold = test[lang]
    system = wl[lang]
    # References and hypotheses are aligned through the keys of the test
    # split; a key missing from the system output raises KeyError.
    refs = list(gold.values())
    hyps = [system[word] for word in gold]
    print("WL", lang, wer(refs, hyps))
WL swe 0.009857072449482503
WL dan 0.10990712074303406
WL eng 0.1256150506512301
WL spa 0.1094017094017094
WL lat 0.11362126245847176
WL fre 0.1687116564417178
WL ita 0.10502793296089385
# Score the MWL outputs against the test split, one jiwer `wer` value per language.
for lang in ["swe", "dan", "eng", "spa", "lat", "fre", "ita"]:
    gold = test[lang]
    system = mwl[lang]
    # References and hypotheses are aligned through the keys of the test
    # split; a key missing from the system output raises KeyError.
    refs = list(gold.values())
    hyps = [system[word] for word in gold]
    print("MWL", lang, wer(refs, hyps))
MWL swe 0.009857072449482503
MWL dan 0.11222910216718267
MWL eng 0.15658465991316933
MWL spa 0.14188034188034188
MWL lat 0.132890365448505
MWL fre 0.24846625766871167
MWL ita 0.11955307262569832
# Score the WL-merged outputs against the test split. "swe" is left out of the
# language list: no wl-merged file is loaded for it, so wlm["swe"] is empty.
for lang in ["dan", "eng", "spa", "lat", "fre", "ita"]:
    gold = test[lang]
    system = wlm[lang]
    # References and hypotheses are aligned through the keys of the test
    # split; a key missing from the system output raises KeyError.
    refs = list(gold.values())
    hyps = [system[word] for word in gold]
    print("WLM", lang, wer(refs, hyps))
WLM dan 0.1393188854489164
WLM eng 0.2593342981186686
WLM spa 0.1641025641025641
WLM lat 0.15946843853820597
WLM fre 0.2983128834355828
WLM ita 0.1553072625698324
!unzip raw.zip
Archive:  raw.zip
  inflating: dan.wl-raw.out          
  inflating: eng.wl-raw.out          
  inflating: fre.wl-raw.out          
  inflating: ita.wl-raw.out          
  inflating: lat.wl-raw.out          
  inflating: spa.wl-raw.out          
  inflating: swe.wl-raw.out          
# Per-language lookup table built from <lang>.wl-raw.out: first tab-separated
# column -> second column. Columns beyond the second are ignored.
raw = {}

for lang in ["swe", "dan", "eng", "spa", "lat", "fre", "ita"]:
    raw[lang] = {}
    raw_path = f"{lang}.wl-raw.out"
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(raw_path, encoding="utf-8") as rtf:
        for lineno, line in enumerate(rtf, start=1):
            fields = line.strip().split("\t")
            if fields == [""]:
                # Blank (e.g. trailing) line: nothing to record.
                continue
            if len(fields) < 2:
                raise ValueError(
                    f"{raw_path}:{lineno}: expected two tab-separated "
                    f"columns, got {line!r}"
                )
            raw[lang][fields[0]] = fields[1]
# Score the raw WL outputs against the test split, one jiwer `wer` value per language.
for lang in ["swe", "dan", "eng", "spa", "lat", "fre", "ita"]:
    gold = test[lang]
    system = raw[lang]
    # References and hypotheses are aligned through the keys of the test
    # split; a key missing from the system output raises KeyError.
    refs = list(gold.values())
    hyps = [system[word] for word in gold]
    print("RAW", lang, wer(refs, hyps))
RAW swe 0.009561360275998028
RAW dan 0.12151702786377709
RAW eng 0.21128798842257598
RAW spa 0.1452991452991453
RAW lat 0.14219269102990034
RAW fre 0.294478527607362
RAW ita 0.1553072625698324