Phonetic rule processing, take 1
Or, the notebook I wrote, then lost
!wget https://www.openslr.org/resources/29/lexicon-sv.tgz
!tar zxvf lexicon-sv.tgz
# Build the pronunciation lexicon: maps each word to a list of its
# pronunciations, where every pronunciation is a list of phone strings.
# A word that occurs on several lines gets one entry per line.
lexicon = {}
# NOTE(review): assumes lexicon.txt is UTF-8 encoded - confirm against the
# downloaded archive. Passing the encoding explicitly keeps the result
# independent of the platform's default locale.
with open("lexicon.txt", encoding="utf-8") as lexf:
    for line in lexf:
        parts = line.split()
        if not parts:
            # Blank line: there is no word to index (parts[0] would raise).
            continue
        word, *phones = parts
        # Skip entries whose word starts with one of these marker characters.
        if word[0] in "!<&-":
            continue
        lexicon.setdefault(word, []).append(phones)
# Phone symbols used when processing transcriptions, grouped (going by the
# constant names) into vowels and consonants.
# NOTE(review): the symbols look SAMPA-like - confirm against the lexicon docs.
VOWELS = [
    "2:", "9", "A:", "E", "E*U", "E:", "I", "O", "U", "Y",
    "a", "a*U", "e", "e:", "i:", "o:", "u0", "u:", "y:", "}:",
]
CONS = [
    "N", "S", "b", "d", "d`", "f", "g", "h", "j", "k", "l", "l`",
    "m", "n", "n`", "p", "r", "s", "s'", "s`", "t", "t`", "v",
]
def move_stress(phones, vowels=None):
    """Move stress marks off the phone they prefix and onto the next vowel.

    A phone may be prefixed with a stress mark: ``"``, ``%`` or the
    combination ``%"`` (which is reduced to ``"``). The mark is removed from
    that phone and attached to the first vowel at or after it.

    Args:
        phones: Sequence of phone strings, possibly stress-prefixed.
        vowels: Optional collection of phone strings that may carry stress.
            Defaults to the module-level ``VOWELS``.

    Returns:
        A new list of phones with each stress mark prefixed to a vowel.

    If a stress mark is still pending after the last phone, an error line is
    printed and the mark is dropped from the output.
    """
    if vowels is None:
        vowels = VOWELS
    outphones = []
    stress = ''
    for phone in phones:
        if phone.startswith('%"'):
            # Combined mark: only the '"' part is carried forward.
            stress = '"'
            phone = phone[2:]
        elif phone.startswith(('%', '"')):
            # startswith() never matches an empty phone, so an empty string
            # can no longer wipe a stress mark that is still pending.
            stress = phone[0]
            phone = phone[1:]
        if stress and phone in vowels:
            phone = stress + phone
            stress = ''
        outphones.append(phone)
    if stress != '':
        print("Error: unplaced stress", phones)
    return outphones
# Sanity check: the combined %" mark on "j" lands on the following vowel as
# '"', and the "%" mark on "s" lands on the vowel after it.
test = '%"j Y t r a %s E j'.split()
assert move_stress(test) == 'j "Y t r a s %E j'.split()
import itertools
class BaseRule():
    """Common state and helpers for pronunciation rules.

    Attributes are set from the constructor arguments:
        rule: Label for the rule; shown by ``str()``.
        phone: The phone the rule targets.
        keep_stress: When true, stress marks are left on phones by
            ``clean_phones``; when false they are stripped.
    """

    def __init__(self, rule, phone, keep_stress=False):
        self.rule = rule
        self.phone = phone
        self.keep_stress = keep_stress

    def clean_phones(self, phones):
        """Return ``phones`` prepared for matching.

        With ``keep_stress`` set the input is returned unchanged. Otherwise a
        new list is returned with every ``"`` and ``%`` removed. The branches
        were previously the other way round, so the flag did the opposite of
        what its name says.
        """
        if self.keep_stress:
            return phones
        return [x.replace('"', '').replace('%', '') for x in phones]

    def applies(self, phones):
        """Placeholder for subclasses; the base implementation returns None."""
        pass

    def expand(self, phones, positions=None):
        """Placeholder for subclasses; the base implementation returns None."""
        pass

    def __str__(self):
        return f"[{self.rule}]"
class PhonologicalRule(BaseRule):
    """Optional, context-sensitive rewrite of a single phone.

    The rule targets ``phone`` wherever it is immediately preceded by
    ``left_context`` and immediately followed by ``right_context`` (an empty
    context always matches). ``transform`` is the replacement; an empty
    transform deletes the phone.
    """

    def __init__(self, rule, phone, transform=None, left_context=None,
                 right_context=None, keep_stress=False):
        """Store the rule definition.

        Args:
            rule: Label for the rule (shown by ``str()``).
            phone: The phone the rule targets.
            transform: Replacement phones; empty or omitted means deletion.
            left_context: Phones that must directly precede the target.
            right_context: Phones that must directly follow the target.
            keep_stress: Passed through to ``BaseRule``.
        """
        super().__init__(rule, phone, keep_stress)
        # Copy into fresh lists so no default or caller-owned list is shared
        # between rule instances.
        self.lctx = list(left_context) if left_context is not None else []
        self.rctx = list(right_context) if right_context is not None else []
        self.transform = list(transform) if transform is not None else []

    def lctx_ok(self, phones, pos):
        """Return True if the left context ends directly before ``pos``."""
        if not self.lctx:
            return True
        start = pos - len(self.lctx)
        if start < 0:
            # The context would extend past the start of the word.
            return False
        return phones[start:pos] == self.lctx

    def rctx_ok(self, phones, pos):
        """Return True if the right context starts directly after ``pos``."""
        if not self.rctx:
            return True
        start = pos + 1
        return phones[start:start + len(self.rctx)] == self.rctx

    def ctx_ok(self, phones, pos):
        """Return True if both contexts match around ``pos``."""
        return self.rctx_ok(phones, pos) and self.lctx_ok(phones, pos)

    def applies(self, phones):
        """Return the indices in ``phones`` where the rule can apply.

        The phones are first passed through ``clean_phones()``; matching of
        the target phone and its contexts is done on that result.
        """
        cleaned = self.clean_phones(phones)
        return [
            i for i, phone in enumerate(cleaned)
            if phone == self.phone and self.ctx_ok(cleaned, i)
        ]

    def expand(self, phones, positions=None):
        """Return every variant of ``phones`` with the rule optionally applied.

        Args:
            phones: List of phone strings.
            positions: Indices at which the rule may apply. When empty or
                omitted they are computed with ``applies()``.

        Returns:
            A list of phone lists without duplicates, in a deterministic
            order, starting with the variant where the rule is applied
            nowhere. Each position is independently kept or replaced, so up
            to ``2 ** len(positions)`` variants are produced.

        A multi-phone transform is inserted as one space-joined string, and
        empty strings are dropped from every variant.
        """
        if not positions:
            positions = self.applies(phones)
        targets = set(positions)
        replacement = " ".join(self.transform)
        options = [
            [phone, replacement] if i in targets else [phone]
            for i, phone in enumerate(phones)
        ]
        # A dict is used as an insertion-ordered set so the output order no
        # longer depends on hashing.
        variants = {}
        for combo in itertools.product(*options):
            variants[tuple(c for c in combo if c != '')] = None
        return [list(v) for v in variants]
# Demo: a rule that optionally deletes "k" when it is directly followed by "t".
rule = PhonologicalRule(
    "k → ∅ / _ t",
    "k",
    transform=[],
    left_context=[],
    right_context=["t"],
)
print(rule)
# Bare expression: presumably the last line of a notebook cell, so its value
# (all variants of the word) is what gets displayed.
rule.expand("v I k t I k t".split())
2D ɖ
2L ɭ
2N ɳ
2S ʂ
2T ʈ
A a
A: ɑː
B b
D d
E e
E0 ə
E: eː
F f
G ɡ
H h
I ɪ
I: iː
J j
K k
L l
M m
N n
NG ŋ
O ʊ
O: uː
P p
R r
S s
SJ ɧ
T t
TJ ɕ
U ɵ
U: ʉː
V v
Y ʏ
Y: yː
[ ɛ
[3 æː
[4 æ
[: ɛː
\ œ
\3 œ̞ː
\4 œ̞
\: øː
] ɔ
]: oː
gcl <gcl>
ha <ha>
hes <hes>
kl <kl>
pa <pa>
sm <sm>
v <v>
~H ~h
~L ~l
~N ~n
# Tab-separated entries, one per line: an upper-case word, its phones
# (space-separated) and, on some lines, a third column holding a rule-like
# note. The text starts and ends with a newline.
# NOTE(review): presumably extra pronunciations to merge into the lexicon -
# confirm against the code that consumes ADDITIONS.
ADDITIONS = "\n".join([
    "",
    "INTE\tn t e\tI -> 0 / VOWEL # _",
    "EN\tE N\tn -> N / _ # [+velar]",
    "EN\tE m\tn -> m / _ # [+labial]",
    'JA\t"A:',
    'JA\t"j a',
    'NEJ\t"n E',
    "",
])