Phonetic rules, take 2
A second attempt at an earlier notebook; the earlier notebook is here.
_ALPHABET = {
"A": ["a"],
"B": ["be", "bé"],
"C": ["ce", "se", "sé", "cé", "si", "ci"],
"D": ["de", "dé", "di"],
"E": ["e", "é"],
"F": ["eff", "ef"],
"G": ["ge", "gé", "gi"],
"H": ["hå", "ho"],
"I": ["i"],
"J": ["ji", "gi"],
"K": ["kå", "ko"],
"L": ["ell", "el"],
"M": ["emm", "em"],
"N": ["enn", "en"],
"O": ["o"],
"P": ["pe", "pé", "pi"],
"Q": ["qu"],
"R": ["err", "er", "är", "ärr"],
"S": ["ess", "es"],
"T": ["te", "té", "ti"],
"U": ["u"],
"V": ["ve", "vé", "vi"],
"W": ["dubbelve"],
"X": ["ex", "ecz", "ecs", "eks"],
"Y": ["y"],
"Z": ["zäta", "säta", "seta", "zeta"],
"Å": ["å"],
"Ä": ["ä"],
"Ö": ["ö"]
}
DIGITS = {
"1": ["ett"],
"2": ["två"],
"3": ["tre"],
"4": ["fyra"],
"5": ["fem"],
"6": ["sex"],
"7": ["sju"],
"8": ["åtta"],
"9": ["nio", "ni"]
}
# Longest spellings first, so that regex alternation prefers them ("ess" before "es")
ALPHABET = {k: sorted(v, key=len, reverse=True) for k, v in _ALPHABET.items()}
Sample alignment output motivating these mappings (fields: utterance id, channel, start time, duration, hypothesis word, confidence, reference token, edit type). The ASR spells out "SfU5" as "essefu fem" and "2014/15:124" as "tjugohundrafjorton femton etthundratjugofyra":

2442206120015761721 1 487.78 0.32 essefu 1.0 <eps> ins
2442206120015761721 1 488.26 0.16 fem 1.0 SfU5 sub
2442206120015761721 1 490.6 0.519 tjugohundrafjorton 1.0 <eps> ins
2442206120015761721 1 491.2 0.379 femton 1.0 <eps> ins
2442206120015761721 1 491.74 0.779 etthundratjugofyra 1.0 2014/15:124 sub
ALNUM = {**ALPHABET, **DIGITS}
ALNUM_REGEX = {k: f"({'|'.join(v)})" for k,v in ALNUM.items()}
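A sketch of how these alternations might be used (token_pattern and the re.fullmatch check are my own illustration, not from the notebook): build one regex per reference token and test it against the concatenated ASR hypothesis words from the sample above.

import re

def token_pattern(token):
    # Characters with known spellings become alternations; anything else is matched literally
    return "".join(ALNUM_REGEX.get(ch, re.escape(ch)) for ch in token.upper())

# "essefu fem" is how the ASR spelled out the reference token "SfU5"
print(bool(re.fullmatch(token_pattern("SfU5"), "essefufem")))  # True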
from difflib import SequenceMatcher

# Substring replacements that should not count as mismatches
ACCEPT = [
    ("e", "ə")
]

def accept_match(a, b):
    """True if a and b differ only by replacements listed in ACCEPT."""
    s = SequenceMatcher(None, a, b)
    for tag, i1, i2, j1, j2 in s.get_opcodes():
        if tag == "equal":
            continue
        if tag == "replace" and (a[i1:i2], b[j1:j2]) in ACCEPT:
            continue
        # Any insert, delete, or unlisted replacement is a real mismatch
        return False
    return True

accept_match("kamaren", "kamarən")
from phonemizer import phonemize
phonemize("slut", language='sv')
%cd /Users/joregan/Playing/sync_asr
%pip install -e .
from sync_asr.utils.nst_lexicon import get_nst_lexicon
lexicon = get_nst_lexicon()

dictionary = {}
for entry in lexicon:
    # Skip entries the NST lexicon flags as garbage
    if entry.get('garbage') == 'GARB':
        continue
    word = entry['orthography'].replace("_", " ")
    if word not in dictionary:
        dictionary[word] = set()
    for translit in entry['transliterations']:
        ipa = translit['ipa']
        # Drop syllable dots, ¤ markers, and tie bars; underscores become spaces
        for old, new in ((".", ""), ("¤", ""), ("_", " "), ("\u0361", "")):
            ipa = ipa.replace(old, new)
        dictionary[word].add(ipa)
# Swedish phone inventory (NST IPA symbols) and the stress/accent marks
CONSONANTS = "bdfhjklmnprstvŋɕɖɡɧɭɳʂʈ"
VOWELS = "aeiouyøɪɑɔɛɵʉʊʏ"
ACCENTS = "²ˌˈ"
print(dictionary["inte"])
print(phonemize("inte", language='sv'))
characters = set()
for entry in dictionary:
for pron in dictionary[entry]:
for char in pron:
characters.add(char)
cons = set()
VS = VOWELS + ACCENTS + "_ "  # underscores were already mapped to spaces above
for entry in dictionary:
for pron in dictionary[entry]:
for char in pron:
if char not in VS:
cons.add(char)
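A quick sanity check (my own addition, not in the original): any residue in the derived set beyond the CONSONANTS constant, such as the length mark ː, shows up here.

print(sorted(cons - set(CONSONANTS)))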
def shift_accent(ipa):
    """Move each accent mark so it directly precedes the vowel of its
    syllable rather than the onset consonant(s)."""
    accent = ""
    output = ""
    for character in ipa:
        if character in ACCENTS:
            # Hold the accent until we reach the vowel
            accent = character
        elif character in CONSONANTS:
            output += character
        elif character in VOWELS:
            if accent != "":
                output += accent
                accent = ""
            output += character
        else:
            output += character
    return output

shift_accent("²kratʊʂ")  # → 'kr²atʊʂ'
import re
import itertools

class Rule:
    """An optional rewrite: apply() returns every variant of a word with the
    matched substring kept or replaced, at each site where it occurs."""
    def __init__(self, match, replacement, rulename, example,
                 on_accented=False, before_accented=False):
        self.match = match
        self.replacement = replacement
        self.rulename = rulename
        self.example = example
        # on_accented: only match where not directly preceded by an accent mark
        self.on_accented = on_accented
        # before_accented: only match where directly followed by an accent mark
        self.before_accented = before_accented

    def apply(self, word):
        if self.on_accented:
            # Lookbehind keeps the matched span to self.match itself,
            # and also allows matches at the start of the word
            pattern = fr"(?<![²ˌˈ]){self.match}"
        elif self.before_accented:
            pattern = fr"{self.match}(?=[²ˌˈ])"
        else:
            word = re.sub("[²ˌˈ]", "", word)
            pattern = self.match
        matches = [(m.start(), m.end()) for m in re.finditer(pattern, word)]
        # Split the word into fixed pieces and optional (original, replacement) pairs
        pieces = []
        prev_end = 0
        for start, end in matches:
            if start > prev_end:
                pieces.append([word[prev_end:start]])
            pieces.append([word[start:end], self.replacement])
            prev_end = end
        pieces.append([word[prev_end:]])
        # Every combination of keeping or replacing each match site
        return ["".join(parts) for parts in itertools.product(*pieces)]
rule = Rule("nh", "n", "n → ∅ / _ h", "Stenholm")
assert rule.apply("nhanhanha") == ['nhanhanha',
'nhanhana',
'nhananha',
'nhanana',
'nanhanha',
'nanhana',
'nananha',
'nanana']
# Rules apply to single words here, but cross-word contexts can be validated at a later processing stage
class AssimilationRule(Rule):
    """A rule whose applicability depends on neighbouring words: pre_context and
    post_context are patterns checked against the previous and next word later."""
    def __init__(self, match, replacement, rulename, example,
                 on_accented=False, before_accented=False, pre_context="", post_context=""):
        super().__init__(match, replacement, rulename, example, on_accented, before_accented)
        self.pre_context = pre_context
        self.post_context = post_context
GENERAL_STRESSED = [
    Rule("e", "ə", "e → ə / [-stressed]", "", True),
    Rule("ɛ", "ə", "ɛ → ə / [-stressed]", "", True)
]
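Applied to a pronunciation like "kamaren", the first rule yields exactly the kamaren/kamarən pair from the SequenceMatcher example above (a quick check, my own addition):

for rule in GENERAL_STRESSED:
    print(rule.rulename, rule.apply("kamaren"))
# e → ə / [-stressed] ['kamaren', 'kamarən']
# ɛ → ə / [-stressed] ['kamaren']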
AssimilationRule("r$", "", "h → ∅ / r # _", "har han", False, False, "", "h")
AssimilationRule("n$", "", "h → ∅ / n # _", "han har", False, False, "", "h")
AssimilationRule("r$", "", "r → ∅ / _ # [+consonant]", "där bilen", False, False, "", f"[{CONSONANTS}]")
Rule("[œɶeə]r", "r", "e → ∅ / _ r [+stressed]", "bero", False, True)
Rule("ɪntə", "ntə", "ɪ → ∅ / [+vowel] # _ n t ə", "ska inte", False, False)
Rule("ɪnte", "nte", "ɪ → ∅ / [+vowel] # _ n t e", "ska inte", False, False)
Rule("nh", "h", "n → ∅ / _ h", "Stenholm")