Format Braxen
Convert transcriptions from Braxen's phoneme set to IPA (partly generated).
# Base directory from which the dictionary file path is derived further down
# (machine-specific absolute path).
BRAXEN_DIR = "/Users/joregan/Playing/braxen/"
# Braxen-to-IPA table: one "<Braxen symbol> <IPA>" pair per line, separated by
# whitespace.
# NOTE: "g" is listed twice; rows are applied in order, so the later row is the
# one that ends up in BRAXEN_IPA.
BRAXEN_IPA_RAW = """
p p
b b
t t
rt ʈ
d d
rd ɖ
k k
g ɡ
g g
f f
v v
s s
rs ʂ
sh ʃ
zh ʒ
z z
dh ð
th θ
h h
x ɧ
xx x
c ɕ
tc t͡ʃ
dj d͡ʒ
m m
n n
rn ɳ
ng ŋ
r r
l l
rl ɭ
j j
w w
rh ɾ
r0 r0
rx ʀ
i: iː
i ɪ
ih ɪ̯
y: yː
y ʏ
e: eː
e e
eh e̝
ex ə
ä: ɛː
ä ɛ
ae: æː
ae æ
ö: øː
ö ø
oe: ɶː
oe ɶ
u: uː
u u
oh o
o: oː
o ɔ
uu: ʉː
uu ɵ
uuh ʉ
uw: ʊː
uw ʊ
a: ɑː
a a
aa: aː
au aʊ
eu ɛʊ
ei eɪ
ai aɪ
oi ɔɪ
ou əʊ
eex eə
iex ɪə
uex ʊə
an ã
en ɛ̃
on õ
un œ̃
. .
"""

# Parse the raw table into a {Braxen symbol: IPA string} lookup.
BRAXEN_IPA = {}
for line in BRAXEN_IPA_RAW.splitlines():
    # Split on any run of whitespace so tab- and space-separated rows both
    # parse; splitting on "\t" alone silently drops every space-separated row.
    parts = line.split()
    if len(parts) != 2:
        # Blank lines and anything that is not exactly a pair are ignored.
        continue
    BRAXEN_IPA[parts[0]] = parts[1]
def braxen_encode(phoneme_string, phoneme_to_ipa, strictly_braxen=False, stress_type=None):
    """
    Convert a whitespace-separated string of Braxen phoneme symbols to IPA.

    The boundary markers ``-``, ``~`` and ``|`` are rewritten to ``.`` before
    the string is tokenised, so they are looked up as the ``.`` symbol. A
    leading ``'``, ``"`` or ``,`` on a token is a stress marker; its rendering
    is placed directly in front of that token's IPA symbol.

    Args:
        phoneme_string (str): e.g., "'a: . r ex n"
        phoneme_to_ipa (dict): mapping of phoneme symbols to IPA. It is only
            read; nothing is written to it or to any module-level table.
        strictly_braxen (bool): if True, bare ``r0`` tokens are dropped and
            ``'`` / ``"`` are rendered as two distinct accent-bearing variants
            of the primary stress mark. If False, ``r0`` is always rendered as
            ``ɹ``, whatever the mapping holds for it. Takes precedence over
            ``stress_type``.
        stress_type (str | None): ``'wiktionary'`` renders ``'`` / ``"`` as
            superscript ``¹`` / ``²``; any other value renders both as ``ˈ``.
            ``,`` is rendered as ``ˌ`` in every mode.

    Returns:
        str: IPA transcription. Tokens with no entry in the mapping are
        reported on stdout and left out of the result.
    """
    # One lookup table per output convention replaces three near-identical
    # if/elif ladders.
    if strictly_braxen:
        stress_marks = {"'": 'ˈ́', '"': 'ˈ̀', ',': 'ˌ'}
    elif stress_type == 'wiktionary':
        stress_marks = {"'": '¹', '"': '²', ',': 'ˌ'}
    else:
        stress_marks = {"'": 'ˈ', '"': 'ˈ', ',': 'ˌ'}

    # Replace morpheme/compound boundaries with syllable breaks
    phoneme_string = phoneme_string.replace('-', '.').replace('~', '.').replace('|', '.')

    ipa_output = []
    for p in phoneme_string.split():
        # In strict mode a bare r0 produces no output at all.
        if strictly_braxen and p == 'r0':
            continue

        # Handle stress markers (split() never yields an empty token, so
        # p[0] is always valid).
        stress = stress_marks.get(p[0])
        if stress is not None:
            p = p[1:]

        if p == 'r0' and not strictly_braxen:
            # Rendered locally rather than by overwriting an entry in a
            # shared dict, so the caller's mapping is left untouched.
            ipa = 'ɹ'
        elif p in phoneme_to_ipa:
            ipa = phoneme_to_ipa[p]
        else:
            print(f"[WARN] No match for phoneme: {p}")
            continue

        ipa_output.append(stress + ipa if stress else ipa)
    return ''.join(ipa_output)
# Smoke test of the default (non-strict) output: "~" becomes a syllable break,
# the primary-stress mark is emitted right before the marked symbol, and r0 is
# rendered as ɹ.
assert braxen_encode("""g rh 'ae n d ~ m a: . s t ex r0""", BRAXEN_IPA) == 'gɾˈænd.mɑː.stəɹ'
# Bare calls whose return values are discarded here (presumably shown as cell
# output when run in a notebook — TODO confirm).
braxen_encode("p au ex rh w o: k", BRAXEN_IPA)
braxen_encode("p ou k ex b ou l", BRAXEN_IPA)
from pathlib import Path
# Location of the tab-separated dictionary file under BRAXEN_DIR.
DICT_PATH = Path(BRAXEN_DIR) / "dict" / "braxen-sv.tsv"

# Print every entry whose language field is "pol", with its transcription
# converted to IPA.
with open(DICT_PATH, 'r', encoding='utf-8') as f:
    # Iterate the file lazily; readlines() would load every line into memory
    # first.
    for line in f:
        # Skip comment lines and blank lines.
        if line.startswith("#") or not line.strip():
            continue
        parts = line.strip().split("\t")
        if len(parts) < 3:
            # The three leading fields are required below; report a short row
            # and move on rather than aborting the run with IndexError.
            print(f"[WARN] Skipping malformed line: {line.rstrip()}")
            continue
        word, transcription, pos_tags = parts[:3]
        # A missing fourth field is treated as "swe".
        lang = parts[3] if len(parts) > 3 else "swe"
        # Every row is encoded, so unmapped-phoneme warnings are printed for
        # all languages, not only for the rows that get printed.
        ipa = braxen_encode(transcription, BRAXEN_IPA, strictly_braxen=False)
        if lang == "pol":
            print(f"{word}\t{ipa}\t{pos_tags}\t{lang}")