# Root directory of the formatted Waxholm scenes. It is wrapped in a Path
# further down and globbed one level deep for "*.mix" files.
WAXHOLM_DIR = "/Users/joregan/Playing/waxholm/scenes_formatted/"
# Print the contents of every *.mix file under the scenes directory, dropping
# blank lines, lines that start with FR, CT, WIZARD, AUTOLAB or SPEAKER, and
# lines that contain "CORRECTED:" or "DATA BANK MATERIAL".
# NOTE(review): the original command began with "$find", which a shell treats
# as a variable expansion rather than the find command; assumed to be a stray
# prompt character -- confirm.
find ~/Playing/waxholm/scenes_formatted/ -name '*.mix' | while IFS= read -r i; do
    # One extended-regex grep replaces the chain of eight "grep -v" filters;
    # quoting "$i" and "read -r" keep unusual file names intact.
    grep -Ev '^$|^FR|^CT|^WIZARD|^AUTOLAB|CORRECTED:|DATA BANK MATERIAL|^SPEAKER' "$i"
done
#!/usr/bin/perl

use warnings;
use strict;
use utf8;

# Apply the :utf8 layer to all three standard streams.
binmode($_, ":utf8") for (\*STDIN, \*STDOUT, \*STDERR);

# Parser state used by the main read loop: the id of the sample currently
# being gathered, the kind of field seen most recently, and the text, phoneme
# and label strings collected for that sample.  All start out empty.
my ($smp, $last, $text, $phone, $label) = ('') x 5;

# Normalise one transcription field.
#
# Every run of whitespace is squeezed to a single space, then the four ASCII
# stand-in characters are swapped for letters: { -> ä, } -> å, | -> ö, \ -> Ö.
# Leading and trailing spaces are not trimmed.
#
# Takes the raw string as its only argument and returns the cleaned copy.
sub clean_up {
        my ($field) = @_;

        # Lookup table for the single-pass substitution below.
        my %letter_for = (
                '{'  => 'ä',
                '}'  => 'å',
                '|'  => 'ö',
                '\\' => 'Ö',
        );

        (my $cleaned = $field) =~ s/\s+/ /g;
        $cleaned =~ s/([{}|\\])/$letter_for{$1}/g;
        return $cleaned;
}

# Write the sample gathered so far as one tab-separated record
#     <sample id> TAB <text> TAB <phonemes> TAB <labels>
# with every field passed through clean_up().  Does nothing until a first
# sample header has been seen.
my $emit_record = sub {
        return if $smp eq '';
        print join("\t", $smp, clean_up($text), clean_up($phone), clean_up($label)), "\n";
        return;
};

# Main read loop over the files named on the command line (or STDIN).
# A "Waxholm dialog." header starts a new sample; TEXT:, PHONEME: and Labels:
# lines start the matching field, and any other line is appended to whichever
# field was started most recently.
while (<>) {
        chomp;
        if (m!Waxholm dialog\. /u/wax/data/scenes/[^/]*/(.*)!) {
                # Save the capture first: clean_up() runs its own regexes.
                my $newsmp = $1;
                $last = 'smp';

                # A new header closes the previous sample.
                $emit_record->();

                $smp = $newsmp;
                $text = '';
                $phone = '';
                $label = '';
        } elsif (m!^PHONEME:!) {
                s/^PHONEME:\s+//;
                $phone = $_;
                $last = 'phone';
        } elsif (m!^Labels:!) {
                s/^Labels:\s+//;
                $label = $_;
                $last = 'label';
        } elsif (m!^TEXT:!) {
                s/^TEXT:\s*//;
                $text = $_;
                $last = 'text';
        } elsif (m!^S\{g f\|ljande mening:!) {
                # "{" and "|" are escaped so the prompt line is matched
                # literally; unescaped, "|" split the pattern into two
                # alternatives.
                print STDERR "$smp:$_\n";
        } else {
                if($last eq 'text') {
                        $text = "$text $_";
                } elsif($last eq 'label') {
                        $label = "$label $_";
                } elsif($last eq 'phone') {
                        $phone = "$phone $_";
                } else {
                        # Unclassifiable line: report it on STDERR, newline
                        # terminated, so it cannot fuse with a TSV record
                        # on STDOUT.
                        print STDERR "DAFUQ: $_\n";
                }
        }
}

# Flush the final sample: no further header arrives to trigger the emit above,
# so without this the last record of the input would be lost.
$emit_record->();
from pathlib import Path
import re

# Path form of WAXHOLM_DIR; globbed further down for "*/*.mix" files.
WAXHOLM_PATH = Path(WAXHOLM_DIR)

def skippable(text):
    """Tell whether a line from a .mix file should be ignored.

    A line is ignored when it begins (with no leading whitespace) with one of
    the fixed prefixes below, or when, after stripping surrounding whitespace,
    it is empty or begins with "Corrected".

    Args:
        text: one raw line, including any trailing newline.

    Returns:
        True if the line should be skipped, False otherwise.
    """
    ignored_prefixes = (
        "CT",
        "CORRECTED:",
        "AUTOLABEL:",
        "DATA BANK MATERIAL:",
        "WIZARD:",
        "SPEAKER",
        "Digital recording",
        "S{g f|ljande mening:",
        "Correction",
    )
    if text.startswith(ignored_prefixes):
        return True

    # Only these two checks tolerate leading whitespace.
    trimmed = text.strip()
    return trimmed == "" or trimmed.startswith("Corrected")


def fix_text(text: str) -> str:
    """Normalise one field extracted from a .mix file.

    Surrounding whitespace is removed, the ASCII stand-in characters
    ``{ } | \\ [ ]`` are mapped to ``ä å ö Ö Ä Å``, internal whitespace runs
    are collapsed to a single space, and one trailing "." is dropped.

    Args:
        text: the raw field value.

    Returns:
        The normalised string; "" for empty or whitespace-only input.
    """
    # Strip before the emptiness check: whitespace-only input used to pass
    # the check and then fail with IndexError on the last-character lookup.
    text = text.strip()
    if text == "":
        return ""
    replacements = str.maketrans("{}|\\[]", "äåöÖÄÅ")
    tr = text.translate(replacements)
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    spaced = re.sub(r"\s+", " ", tr)
    if spaced.endswith("."):
        spaced = spaced[:-1]
    return spaced.strip()


# Per-sample special cases: for each listed sample id, the string that the
# start of its phoneme line is expected to match. Kept at module level so the
# table is built once instead of on every line_kludge() call.
_KLUDGE_PHONEME_STARTS = {
    "fp2015/fp2015.1.03.smp": "J'A:+",
    "fp2015/fp2015.1.00.smp": "H'EJ",
    'fp2015/fp2015.1.01.smp': "J'A:",
    'fp2015/fp2015.1.04.smp': 'V"ILK\'A',
    'fp2015/fp2015.1.05.smp': "G']:R",
    'fp2015/fp2015.1.06.smp': "']:+",
    'fp2014/fp2014.1.02.smp': "M']NDA",
    'fp2014/fp2014.6.00.smp': 'V"ILK\'A',
    'fp2014/fp2014.8.05.smp': "T'AK",
    'fp2014/fp2014.8.04.smp': "F'INS",
    'fp2014/fp2014.6.01.smp': 'V"ILK\'A',
    'fp2014/fp2014.1.03.smp': "K'AN+",
    'fp2014/fp2014.1.01.smp': "FR']:N+",
    'fp2014/fp2014.1.00.smp': "G']:R",
    'fp2014/fp2014.6.02.smp': "T'AK",
    'fp2014/fp2014.8.02.smp': "T'AK",
    'fp2014/fp2014.1.05.smp': "D'E+",
    'fp2014/fp2014.1.04.smp': "G']:R",
    'fp2014/fp2014.8.03.smp': "K'AN+",
    'fp2014/fp2014.8.01.smp': 'V"ILK\'A',
    'fp2014/fp2014.8.00.smp': "K'AN+",
    'fp2007/fp2007.6.01.smp': "V'A:R+",
    'fp2007/fp2007.8.04.smp': "'EN+",
    'fp2007/fp2007.1.03.smp': "J'A:",
    'fp2007/fp2007.1.13.smp': "J'A:+",
    'fp2007/fp2007.1.12.smp': "J'A:+",
    'fp2007/fp2007.1.02.smp': "'I:+",
    'fp2007/fp2007.8.05.smp': "'EN+",
    'fp2007/fp2007.6.00.smp': "V'A:R+",
    'fp2007/fp2007.1.00.smp': "N'[3R+",
    'fp2007/fp2007.1.09.smp': "J'A:+",
    'fp2007/fp2007.8.07.smp': "SL'U:T",
    'fp2007/fp2007.6.02.smp': "SL'U:T",
    'fp2007/fp2007.1.08.smp': 'V"AKShyH\']LM',
    'fp2007/fp2007.8.06.smp': "V'A:R+",
    'fp2007/fp2007.1.11.smp': "FR'E:DA",
    'fp2007/fp2007.1.04.smp': "J'A:+",
    'fp2007/fp2007.1.14.smp': "P']:+",
    'fp2007/fp2007.8.03.smp': "V'A:R+",
    'fp2007/fp2007.8.02.smp': 'V"ILK\'A',
    'fp2007/fp2007.1.15.smp': "SL'U:T",
    'fp2007/fp2007.1.05.smp': "V'A:R+",
    'fp2007/fp2007.8.00.smp': "F'INS",
    'fp2007/fp2007.1.06.smp': "J'A:+",
    'fp2007/fp2007.8.01.smp': "J'A:+",
    'fp2009/fp2009.5.02.smp': "T'I:SDA",
    'fp2009/fp2009.1.05.smp': "'EFTER+",
    'fp2009/fp2009.2.00.smp': "J'A:+",
    'fp2009/fp2009.2.01.smp': "J'A:+",
    'fp2009/fp2009.5.03.smp': "J'A:+",
    'fp2009/fp2009.1.04.smp': "'EFTER+",
    'fp2009/fp2009.2.03.smp': "V'A:+",
    'fp2007/fp2007.1.10.smp': "FR']:N+",
    'fp2009/fp2009.5.01.smp': "IFR']:N",
    'fp2009/fp2009.1.06.smp': "F'INS",
    'fp2009/fp2009.5.00.smp': "J'A:+",
    'fp2009/fp2009.1.07.smp': "N'[3R+",
    'fp2009/fp2009.2.07.smp': '"ING\'ET+',
    'fp2009/fp2009.1.02.smp': "J'A:+",
    'fp2009/fp2009.1.03.smp': "N'[3R+",
    'fp2009/fp2009.5.04.smp': '"ING\'ET+',
    'fp2009/fp2009.2.06.smp': "V'A:+",
    'fp2009/fp2009.1.01.smp': "J'A:+",
    'fp2009/fp2009.1.08.smp': "P']:+",
    'fp2009/fp2009.2.04.smp': "J'A:+",
    'fp2009/fp2009.1.09.smp': "'EFTER+",
    'fp2009/fp2009.2.05.smp': "J'A:+",
    'fp2009/fp2009.1.00.smp': "N'[3R+",
    'fp2009/fp2009.1.10.smp': 'SL"U:T\'A',
    'fp2009/fp2009.2.02.smp': "FR']:N+",
    'fp2001/fp2001.1.02.smp': 'STR"\\MK\'AJEN',
    'fp2001/fp2001.6.00.smp': 'V"I:S\'A',
    'fp2001/fp2001.2.17.smp': "N'[3R+",
    'fp2001/fp2001.2.07.smp': "D'E+",
    'fp2001/fp2001.6.01.smp': "J'A:+",
    'fp2001/fp2001.2.06.smp': "FR']:N+",
    'fp2001/fp2001.2.16.smp': ",H'U:R+",
    'fp2001/fp2001.1.03.smp': "J'A:+",
    'fp2001/fp2001.6.03.smp': "V'A:R+",
    'fp2001/fp2001.2.14.smp': 'V"ILK\'A',
    'fp2001/fp2001.2.04.smp': "V'A:R+",
    'fp2001/fp2001.1.01.smp': "J'A:+",
    'fp2001/fp2001.1.00.smp': 'V"ILK\'A',
    'fp2001/fp2001.6.02.smp': "J'A:+",
    'fp2001/fp2001.2.15.smp': 'V"ILK\'A',
    'fp2001/fp2001.2.10.smp': 'V"ILK\'A',
    'fp2001/fp2001.2.00.smp': "K'AN+",
    'fp2001/fp2001.2.09.smp': "G']:R",
    'fp2001/fp2001.2.19.smp': "D'E+",
    'fp2001/fp2001.1.05.smp': 'V"ILK\'A',
    'fp2001/fp2001.2.18.smp': "J'A:+",
    'fp2001/fp2001.2.08.smp': "P']:+",
    'fp2001/fp2001.1.04.smp': "G']:R",
    'fp2001/fp2001.2.01.smp': "V'A:R+",
    'fp2001/fp2001.2.11.smp': "V'A:R+",
    'fp2001/fp2001.6.06.smp': "D']:+",
    'fp2001/fp2001.2.13.smp': 'V"ILK\'A',
    'fp2001/fp2001.2.03.smp': 'V"ILK\'A',
    'fp2001/fp2001.6.04.smp': 'V"ILK\'A',
    'fp2001/fp2001.2.02.smp': "JA:H'A:,']:+",
    'fp2001/fp2001.2.12.smp': "V'A:R+",
    'fp2001/fp2001.6.05.smp': "V'A:R+",
    'fp2006/fp2006.5.05.smp': "J'A:+",
    'fp2006/fp2006.1.02.smp': "J'A:+",
    'fp2006/fp2006.6.00.smp': 'V"ILK\'A',
    'fp2006/fp2006.6.01.smp': 'V"ILK\'A',
    'fp2006/fp2006.5.04.smp': "J'A:+",
    'fp2006/fp2006.1.03.smp': "SL'U:T",
    'fp2006/fp2006.6.03.smp': "J'A:+",
    'fp2006/fp2006.5.06.smp': "J'A:+",
    'fp2006/fp2006.1.01.smp': "J'A:+",
    'fp2006/fp2006.5.07.smp': 'V"ILK\'A',
    'fp2006/fp2006.1.00.smp': "J'A:+",
    'fp2006/fp2006.6.02.smp': "V'A:R+",
    'fp2006/fp2006.6.07.smp': "SL'U:T",
    'fp2006/fp2006.5.02.smp': "J'A:+",
    'fp2006/fp2006.5.03.smp': "P']:+",
    'fp2006/fp2006.6.06.smp': "V'A:R+",
    'fp2006/fp2006.5.01.smp': "J'A:+",
    'fp2006/fp2006.6.04.smp': "J'A:+",
    'fp2006/fp2006.5.08.smp': "J'A:+",
    'fp2006/fp2006.6.05.smp': "V'A:R+",
    'fp2006/fp2006.5.09.smp': "J'A:+",
    'fp2006/fp2006.5.00.smp': 'V"ILK\'A',
    'fp2006/fp2006.5.10.smp': "SL'U:T",
    'fp2011/fp2011.3.04.smp': "'EN+",
    'fp2011/fp2011.1.07.smp': "D']:+",
    'fp2011/fp2011.1.06.smp': "F'INS",
    'fp2011/fp2011.3.05.smp': "F'INS",
    'fp2011/fp2011.1.04.smp': "N'[3R+",
    'fp2011/fp2011.3.07.smp': 'ST"]KhyH\']LM',
    'fp2011/fp2011.1.05.smp': "VARIFR']:N",
    'fp2011/fp2011.1.00.smp': "J'A:+",
    'fp2011/fp2011.1.09.smp': "T'AK",
    'fp2011/fp2011.3.03.smp': "K'AN+",
    'fp2011/fp2011.1.08.smp': "T'AK",
    'fp2011/fp2011.3.02.smp': "K'AN+",
    'fp2011/fp2011.1.01.smp': "J'A:+",
    'fp2011/fp2011.3.00.smp': "J'A:+",
    'fp2011/fp2011.3.10.smp': "D'E+",
    'fp2011/fp2011.1.03.smp': "K'AN+",
    'fp2011/fp2011.3.09.smp': "J'A:+",
    'fp2011/fp2011.1.02.smp': "FR']:N+",
    'fp2011/fp2011.3.01.smp': "V'A:R+",
    'fp2011/fp2011.6.00.smp': "J'A:+",
    'fp2010/fp2010.1.06.smp': "J'A:+",
    'fp2010/fp2010.8.08.smp': 'V"ILK\'A',
    'fp2010/fp2010.6.04.smp': "H'U:R+",
    'fp2010/fp2010.8.01.smp': 'N"]:G]NhyST\'ANS',
    'fp2010/fp2010.8.11.smp': "F'INS",
    'fp2010/fp2010.8.10.smp': "F'INS",
    'fp2010/fp2010.6.05.smp': "'[3R+",
    'fp2010/fp2010.1.07.smp': "G']:R",
    'fp2010/fp2010.8.09.smp': "J'A:+",
    'fp2010/fp2010.8.02.smp': "J'A:+",
    'fp2010/fp2010.8.12.smp': "J'A:+",
    'fp2010/fp2010.1.05.smp': "J'A:+",
    'fp2010/fp2010.1.04.smp': "J'A:+",
    'fp2010/fp2010.6.06.smp': "SL'U:T",
    'fp2010/fp2010.8.13.smp': "V'A:+",
    'fp2010/fp2010.8.03.smp': "J'A:+",
    'fp2010/fp2010.6.03.smp': ",H'U:R+",
    'fp2010/fp2010.8.06.smp': 'V"ILK\'A',
    'fp2010/fp2010.8.16.smp': "SL'U:T",
    'fp2010/fp2010.1.08.smp': "F'INS",
    'fp2010/fp2010.1.11.smp': "J'A:+",
    'fp2010/fp2010.1.01.smp': "J'A:",
    'fp2010/fp2010.1.00.smp': "J'O:",
    'fp2010/fp2010.1.10.smp': "J'A:+",
    'fp2010/fp2010.8.07.smp': "N'[3R+",
    'fp2010/fp2010.6.02.smp': "J'A:+",
    'fp2010/fp2010.1.09.smp': "G']:R",
    'fp2010/fp2010.1.12.smp': "G']:R",
    'fp2010/fp2010.1.02.smp': "J'A:+",
    'fp2010/fp2010.8.05.smp': "J'A:+",
    'fp2010/fp2010.8.15.smp': "K'AN+",
    'fp2010/fp2010.6.01.smp': 'V"ILK\'A',
    'fp2010/fp2010.8.14.smp': "V'A:+",
    'fp2010/fp2010.8.04.smp': "J'A:+",
    'fp2010/fp2010.1.03.smp': "J'A:+",
    'fp2010/fp2010.1.13.smp': "SL'U:T",
    'fp2003/fp2003.8.00.smp': "J'A:+",
    'fp2003/fp2003.1.07.smp': '"ING\'ET+',
    'fp2003/fp2003.1.06.smp': "J'A:+",
    'fp2003/fp2003.8.01.smp': "F'INS",
    'fp2003/fp2003.1.04.smp': "J'A:+",
    'fp2003/fp2003.8.03.smp': "V'A:R+",
    'fp2003/fp2003.1.05.smp': "T'IL+",
    'fp2003/fp2003.1.00.smp': "J'A:+",
    'fp2003/fp2003.6.02.smp': 'T"AK\'AR',
    'fp2003/fp2003.1.08.smp': "N'U:",
    'fp2003/fp2003.1.01.smp': "J'A:+",
    'fp2003/fp2003.8.04.smp': "T'AK",
    'fp2003/fp2003.6.01.smp': "V'A:R+",
    'fp2003/fp2003.1.03.smp': "J'A:+",
    'fp2003/fp2003.1.02.smp': "T'IL+",
    'fp2003/fp2003.6.00.smp': "J'A:+",
    'fp2004/fp2004.6.05.smp': "F'INS",
    'fp2004/fp2004.6.04.smp': 'V"ILK\'A',
    'fp2004/fp2004.7.00.smp': "V'A:R+",
    'fp2004/fp2004.6.06.smp': "V'A:R+",
    'fp2004/fp2004.7.01.smp': ",H'U:R+",
    'fp2004/fp2004.6.07.smp': "T'AK",
    'fp2004/fp2004.1.00.smp': ",H'U:R+",
    'fp2004/fp2004.6.02.smp': "J'A:+",
    'fp2004/fp2004.6.03.smp': "V'A:+",
    'fp2004/fp2004.6.01.smp': "V'A:R+",
    'fp2004/fp2004.6.00.smp': 'H"EJS\'AN',
    'fp2005/fp2005.5.01.smp': "T'IL+",
    'fp2005/fp2005.2.03.smp': "FR']:N+",
    'fp2005/fp2005.2.12.smp': "SL'U:T",
    'fp2005/fp2005.2.02.smp': 'ST"]KhyH\']LM',
    'fp2005/fp2005.5.00.smp': "J'A:+",
    'fp2005/fp2005.2.00.smp': "V'A:R+",
    'fp2005/fp2005.2.10.smp': "J'A:+",
    'fp2005/fp2005.5.02.smp': "P']:+",
    'fp2005/fp2005.2.09.smp': "J'A:+",
    'fp2005/fp2005.5.03.smp': "SL'U:T",
    'fp2005/fp2005.2.08.smp': "J'A:+",
    'fp2005/fp2005.1.04.smp': "SL'U:T",
    'fp2005/fp2005.2.11.smp': "T'IL+",
    'fp2005/fp2005.2.01.smp': "J'A:+",
    'fp2005/fp2005.2.04.smp': "FR'E:DA",
    'fp2005/fp2005.1.01.smp': "FR']:N+",
    'fp2005/fp2005.1.00.smp': "J'A:+",
    'fp2005/fp2005.2.05.smp': "J'A:+",
    'fp2005/fp2005.1.02.smp': "P']:+",
    'fp2005/fp2005.2.07.smp': "J'A:+",
    'fp2005/fp2005.2.06.smp': "'EFTER+",
    'fp2005/fp2005.1.03.smp': "P']:+",
    'fp2024/fp2024.5.00.smp': "FR']:N+",
}


def line_kludge(smp, line):
    """Check a line against the hand-listed phoneme start for its sample.

    The parsing loop calls this for lines that did not match any of the
    "PHONEME:", "Labels:" or "TEXT:" prefixes; a True result makes it treat
    the line as the sample's phoneme field.

    Args:
        smp: sample id such as "fp2015/fp2015.1.00.smp".
        line: the raw line to test.

    Returns:
        True if ``smp`` is listed in the table and ``line`` starts with the
        string recorded for it; False otherwise (including unlisted samples).
    """
    expected_start = _KLUDGE_PHONEME_STARTS.get(smp)
    if expected_start is None:
        return False
    return line.startswith(expected_start)


# Parse every scene's .mix file into one dict per file with the keys
# "stem", "smp", "text", "phoneme" and "labels", collected in `mixfiles`.
# Files are visited in sorted order so the resulting list is reproducible.
mixfiles = []
for mixfile in sorted(WAXHOLM_PATH.glob("*/*.mix")):
    current = {"stem": mixfile.stem}
    with open(mixfile) as infile:
        # Name of the field that continuation lines are appended to.
        last = ''
        for line in infile:
            if skippable(line):
                continue
            elif line.startswith("Waxholm dialog."):
                # The sample id is whatever follows "/scenes/" on the header.
                SCENES = "/scenes/"
                scene_start = line.find(SCENES)
                smp = line[scene_start+len(SCENES):].strip()
                current["smp"] = smp
            elif line.startswith("PHONEME:"):
                # Slice by the prefix length; the old fixed offset of 9
                # dropped a character when no space followed the colon.
                current["phoneme"] = line[len("PHONEME:"):].strip()
                last = "phoneme"
            elif line.startswith("Labels:"):
                current["labels"] = line[7:].strip()
                last = "labels"
            elif line.startswith("TEXT:"):
                current["text"] = line[5:].strip()
                last = "text"
            elif line_kludge(current.get("smp"), line):
                # .get(): a line seen before the header must not raise
                # KeyError; line_kludge returns False for an unknown id.
                current["phoneme"] = line.strip()
                last = "phoneme"
            elif line.startswith("FR "):
                continue
            else:
                if last == "":
                    # Continuation line before any field was started:
                    # report it and move on instead of failing on
                    # current[""].
                    print(current.get("smp", mixfile.name), line)
                    continue
                current[last] = " ".join([current[last], line.strip()]).strip()
    if "phoneme" not in current:
        current["phoneme"] = ""
        # NOTE(review): as before, "text" is left un-normalised when no
        # phoneme line was found; the later "maybes" pass reads that raw
        # text, so confirm before normalising it here as well.
        fields_to_fix = ["labels"]
    else:
        fields_to_fix = ["text", "phoneme", "labels"]
    for key in fields_to_fix:
        current[key] = fix_text(current[key])
    mixfiles.append(current)
import json

# Save the parsed entries as a single JSON document.
RAW_LEXICON_JSON = "/tmp/waxholm_raw_lexicon.json"
with open(RAW_LEXICON_JSON, "w") as lexicon_out:
    lexicon_out.write(json.dumps(mixfiles))
# For entries that ended up without a phoneme field, record the first word
# that follows the first "." of the text, keyed by sample id.
maybes = {}
for entry in mixfiles:
    if entry["phoneme"] != "":
        continue
    segments = entry["text"].split(".")
    if len(segments) < 2:
        # No "." in the text, so there is no second segment to inspect;
        # indexing it unconditionally raised IndexError.
        continue
    maybes[entry["smp"]] = segments[1].strip().split(" ")[0]
# Reload the saved lexicon, tidy each entry, and write it back in place.
LEXICON_JSON = "/tmp/waxholm_raw_lexicon.json"

with open(LEXICON_JSON) as lexicon_in:
    data = json.load(lexicon_in)

for record in data:
    # Drop the ".smp" part left in the stem.
    record["stem"] = record["stem"].replace(".smp", "")
    # Entries with no phoneme string lose the key entirely.
    if record["phoneme"] == "":
        record.pop("phoneme")
    # Keep the untouched labels next to the copy with "p:" tokens removed.
    untouched_labels = record["labels"]
    record["labels_original"] = untouched_labels
    record["labels"] = " ".join(
        token for token in untouched_labels.split(" ") if token != "p:"
    )

with open(LEXICON_JSON, "w") as lexicon_out:
    json.dump(data, lexicon_out)