from pydub import AudioSegment

%%capture
%pip install pocketsphinx

# Phoneme conversion table, one "<IPA symbol> <CMUdict symbol>" pair per line,
# separated by a single space. The IPA side follows the transcription style
# used in EGPHON below (presumably eSpeak output, going by the name of the
# dictionary built from this table). Several IPA symbols may share one
# CMUdict symbol (e.g. the three rows that map to AH).
MAPPING = """
ɑː AA
æ AE
ə AH
ɐ AH
ʌ AH
ɔː AO
aʊ AW
aɪ AY
b B
tʃ CH
d D
ð DH
ɛ EH
ɚ ER
ɜː ER
eɪ EY
f F
ɡ G
h HH
ɪ IH
i IY
iː IY
dʒ JH
k K
l L
m M
n N
ŋ NG
oʊ OW
ɔɪ OY
p P
ɹ R
s S
ʃ SH
t T
θ TH
ʊ UH
uː UW
v V
w W
j Y
z Z
ʒ ZH
ɾ D
"""

# Build the IPA -> CMUdict lookup table from the rows of MAPPING.
espeak_to_cmudict = {}
for line in MAPPING.split("\n"):
    # Strip first so that whitespace-only rows are skipped like empty ones
    # instead of being reported as malformed.
    line = line.strip()
    if not line:
        continue
    # Split on any run of whitespace so stray double spaces or tabs do not
    # produce spurious empty fields.
    parts = line.split()

    if len(parts) != 2:
        # Surface malformed rows instead of dropping them silently.
        print(line)
        continue
    k, v = parts
    # The first occurrence of a symbol wins; later duplicates are ignored.
    if k not in espeak_to_cmudict:
        espeak_to_cmudict[k] = v

import re

# Try the longest symbols first so that multi-character phonemes (e.g. a
# diphthong or a long vowel) are matched before any shorter symbol that is
# a prefix of them.
cmudict_keys = sorted(espeak_to_cmudict, key=len, reverse=True)
# Escape every key: they are literal phoneme symbols, not regex fragments,
# so a key containing a metacharacter must not alter the pattern.
espeak_regex = re.compile("(" + "|".join(map(re.escape, cmudict_keys)) + ")")

def cmudictify(espeak):
    """Convert an IPA phoneme string into space-separated CMUdict symbols.

    Primary and secondary stress marks are removed first. The remaining
    text is scanned with ``espeak_regex`` and each matched symbol is looked
    up in ``espeak_to_cmudict``. Characters the regex does not match are
    skipped, because only the matches are collected.
    """
    unstressed = espeak.replace("ˈ", "").replace("ˌ", "")
    symbols = espeak_regex.findall(unstressed)
    return " ".join(espeak_to_cmudict[symbol] for symbol in symbols)

# Worked example used by the rest of the notebook.
# Orthographic transcript of the utterance.
EGTEXT = "Yeah, that's true. I mean, they are the same size and they are a little bit, but I think I I should go more for something that style."
# IPA transcription of the same utterance, wrapped in slashes, with one
# space-separated token per word of EGTEXT (make_lexicon relies on that).
EGPHON = "/jˈæ ðˈæs tɹˈuː ə mˈiːn ðˈeɪ ɚ ðə sˈeɪm sˈaɪz ən ðˈeɪ ˈɑːɹ ə lˈɪɾə bˈɪɾ bˈʌt ˈaɪ θˈɪŋk ˈaɪ ˈaɪ ʃˈʊ ɡˈoʊ mˈɔːɹ fɚ sˈʌmθɪŋ ðˈæt stˈaɪl./"
# Recording that contains the utterance (machine-specific absolute path).
EGFILE = "/Users/joregan/Playing/hsi/audio/hsi_7_0719_210_001_main.wav"
# Bounds of the utterance within EGFILE; presumably seconds, since they are
# multiplied by 1000 before being used to slice the audio below.
EGSTART = 70.028
EGEND = 75.441

def normword(text):
    """Return *text* lower-cased, without leading/trailing ``,.;:!?``.

    Punctuation inside the word (such as an apostrophe) is kept.
    """
    return text.strip(",.;:!?").lower()

def normphon(phon):
    """Return *phon* without leading/trailing ``,.;:!?`` characters.

    Unlike ``normword`` the case is left untouched.
    """
    return phon.strip(",.;:!?")

def make_lexicon(text, phon):
    """Pair each word of *text* with the pronunciation of its token in *phon*.

    *phon* may be wrapped in a leading and trailing ``/``; if both are
    present they are removed. Both strings are split on single spaces,
    words are normalised with ``normword`` and phoneme tokens are converted
    with ``cmudictify`` after ``normphon``. The two sequences must have the
    same length (checked with an ``assert``).

    Returns a list of unique ``(word, pronunciation)`` tuples. The list is
    built from a set, so its order is not defined.
    """
    slash_wrapped = phon.startswith("/") and phon.endswith("/")
    if slash_wrapped:
        phon = phon[1:-1]

    words = list(map(normword, text.split(" ")))
    phonwords = [cmudictify(normphon(token)) for token in phon.split(" ")]
    assert len(words) == len(phonwords)

    # A set collapses repeated (word, pronunciation) pairs.
    unique_pairs = set(zip(words, phonwords))
    return list(unique_pairs)

# Unique (word, pronunciation) pairs for the example utterance.
lex = make_lexicon(EGTEXT, EGPHON)

# Load the whole recording with pydub.
audio = AudioSegment.from_file(EGFILE)

# Resample to 16 kHz.
audio = audio.set_frame_rate(16000)

# Cut out the example utterance. The bounds are scaled by 1000 and truncated
# to int because pydub slices are indexed in milliseconds (per the pydub docs).
seg = audio[int(EGSTART * 1000):int(EGEND * 1000)]

def make_ps_dict(entries):
    """Format ``(word, pronunciation)`` entries as dictionary lines.

    The entries are sorted first. The first pronunciation of a word is
    written as ``"word PRON"``; every further pronunciation of the same word
    gets a running index in parentheses: ``"word(2) PRON"``,
    ``"word(3) PRON"`` and so on.

    Returns the list of formatted lines, without trailing newlines.
    """
    seen = {}
    lines = []
    for item in sorted(entries):
        word = item[0]
        # Running number of this pronunciation among those of the same word.
        nth = seen.get(word, 0) + 1
        seen[word] = nth
        suffix = "" if nth == 1 else f"({nth})"
        lines.append(f"{word}{suffix} {item[1]}")
    return lines

def make_fsg_transitions_from_text(text):
    """Build a linear chain of grammar transitions, one per word of *text*.

    *text* is split on single spaces and each token is normalised with
    ``normword``. The word at position ``i`` yields the tuple
    ``(i, i + 1, 1.0, word)``, i.e. from-state, to-state, the constant 1.0
    and the word itself.
    """
    return [
        (index, index + 1, 1.0, normword(token))
        for index, token in enumerate(text.split(" "))
    ]

# Word-by-word transition chain for the example transcript.
fsgt = make_fsg_transitions_from_text(EGTEXT)

# The chain is linear, so the grammar starts at the source state of the
# first transition and ends at the target state of the last one.
start_state = fsgt[0][0]
end_state = fsgt[-1][1]

import pocketsphinx

This was the first attempt. Passing `None` as the dictionary (as the docs suggested) didn't help: words can't be added to a dictionary that doesn't exist.

This approach may still be viable, because I can't be sure that the alignment against the grammar itself is what failed: the audio handling is unreliable, and I should perhaps have passed ffmpeg parameters when writing the audio.

import tempfile

# Pronunciation dictionary lines ("word PH ON ES") for the utterance.
entries = make_ps_dict(lex)

# process_raw() takes headerless PCM samples, so hand it the clip's raw
# sample data rather than the bytes of an exported WAV file (which begin
# with a RIFF header). The clip is also forced to mono, 16-bit first: the
# source recording is not guaranteed to be either, and stereo or 32-bit
# data read as mono 16-bit looks like an utterance twice as long as the
# grammar can account for.
pcm = seg.set_channels(1).set_sample_width(2).raw_data

# Write the dictionary through the temporary file's own handle and flush it
# so the decoder, which opens the file by name, sees the complete contents.
# The file has to stay alive while the decoder is being constructed.
with tempfile.NamedTemporaryFile(mode="w", suffix=".dict", encoding="utf-8") as dictf:
    dictf.writelines(entry + "\n" for entry in entries)
    dictf.flush()

    # No language model: the search is driven by the grammar added below.
    decoder = pocketsphinx.Decoder(lm=None, dict=dictf.name)
    fsg = decoder.create_fsg("dummy", start_state, end_state, fsgt)
    decoder.add_fsg("dummy", fsg)
    decoder.activate_search("dummy")
    decoder.start_utt()
    # The whole clip is supplied in a single call.
    decoder.process_raw(pcm, full_utt=True)
    decoder.end_utt()

decoder.seg()

ERROR: "fsg_search.c", line 944: Final result does not match the grammar in frame 1082

# Dump the grammar to a file so the generated transitions can be inspected.
fsg.writefile("/tmp/fsm")

!cat /tmp/fsm

FSG_BEGIN dummy
NUM_STATES 29
START_STATE 0
FINAL_STATE 28
TRANSITION 0 0 0.000000 [NOISE]
TRANSITION 0 0 0.005001 <sil>
TRANSITION 0 1 1.000000 yeah
TRANSITION 1 1 0.000000 [NOISE]
TRANSITION 1 1 0.005001 <sil>
TRANSITION 1 2 1.000000 that's
TRANSITION 2 2 0.000000 [NOISE]
TRANSITION 2 2 0.005001 <sil>
TRANSITION 2 3 1.000000 true
TRANSITION 3 3 0.000000 [NOISE]
TRANSITION 3 3 0.005001 <sil>
TRANSITION 3 4 1.000000 i(2)
TRANSITION 3 4 1.000000 i
TRANSITION 4 4 0.000000 [NOISE]
TRANSITION 4 4 0.005001 <sil>
TRANSITION 4 5 1.000000 mean
TRANSITION 5 5 0.000000 [NOISE]
TRANSITION 5 5 0.005001 <sil>
TRANSITION 5 6 1.000000 they
TRANSITION 6 6 0.000000 [NOISE]
TRANSITION 6 6 0.005001 <sil>
TRANSITION 6 7 1.000000 are(2)
TRANSITION 6 7 1.000000 are
TRANSITION 7 7 0.000000 [NOISE]
TRANSITION 7 7 0.005001 <sil>
TRANSITION 7 8 1.000000 the
TRANSITION 8 8 0.000000 [NOISE]
TRANSITION 8 8 0.005001 <sil>
TRANSITION 8 9 1.000000 same
TRANSITION 9 9 0.000000 [NOISE]
TRANSITION 9 9 0.005001 <sil>
TRANSITION 9 10 1.000000 size
TRANSITION 10 10 0.000000 [NOISE]
TRANSITION 10 10 0.005001 <sil>
TRANSITION 10 11 1.000000 and
TRANSITION 11 11 0.000000 [NOISE]
TRANSITION 11 11 0.005001 <sil>
TRANSITION 11 12 1.000000 they
TRANSITION 12 12 0.000000 [NOISE]
TRANSITION 12 12 0.005001 <sil>
TRANSITION 12 13 1.000000 are(2)
TRANSITION 12 13 1.000000 are
TRANSITION 13 13 0.000000 [NOISE]
TRANSITION 13 13 0.005001 <sil>
TRANSITION 13 14 1.000000 a
TRANSITION 14 14 0.000000 [NOISE]
TRANSITION 14 14 0.005001 <sil>
TRANSITION 14 15 1.000000 little
TRANSITION 15 15 0.000000 [NOISE]
TRANSITION 15 15 0.005001 <sil>
TRANSITION 15 16 1.000000 bit
TRANSITION 16 16 0.000000 [NOISE]
TRANSITION 16 16 0.005001 <sil>
TRANSITION 16 17 1.000000 but
TRANSITION 17 17 0.000000 [NOISE]
TRANSITION 17 17 0.005001 <sil>
TRANSITION 17 18 1.000000 i(2)
TRANSITION 17 18 1.000000 i
TRANSITION 18 18 0.000000 [NOISE]
TRANSITION 18 18 0.005001 <sil>
TRANSITION 18 19 1.000000 think
TRANSITION 19 20 1.000000 i(2)
TRANSITION 19 20 1.000000 i
TRANSITION 19 19 0.000000 [NOISE]
TRANSITION 19 19 0.005001 <sil>
TRANSITION 20 20 0.000000 [NOISE]
TRANSITION 20 20 0.005001 <sil>
TRANSITION 20 21 1.000000 i(2)
TRANSITION 20 21 1.000000 i
TRANSITION 21 21 0.000000 [NOISE]
TRANSITION 21 21 0.005001 <sil>
TRANSITION 21 22 1.000000 should
TRANSITION 22 22 0.000000 [NOISE]
TRANSITION 22 22 0.005001 <sil>
TRANSITION 22 23 1.000000 go
TRANSITION 23 23 0.000000 [NOISE]
TRANSITION 23 23 0.005001 <sil>
TRANSITION 23 24 1.000000 more
TRANSITION 24 24 0.000000 [NOISE]
TRANSITION 24 24 0.005001 <sil>
TRANSITION 24 25 1.000000 for
TRANSITION 25 25 0.000000 [NOISE]
TRANSITION 25 25 0.005001 <sil>
TRANSITION 25 26 1.000000 something
TRANSITION 26 26 0.000000 [NOISE]
TRANSITION 26 26 0.005001 <sil>
TRANSITION 26 27 1.000000 that
TRANSITION 27 27 0.000000 [NOISE]
TRANSITION 27 27 0.005001 <sil>
TRANSITION 27 28 1.000000 style
TRANSITION 28 28 0.000000 [NOISE]
TRANSITION 28 28 0.005001 <sil>
FSG_END

# Keep a copy of the pronunciation dictionary, one entry per line.
# NOTE(review): the `pocketsphinx align` command further down does not
# appear to reference this file - confirm whether it was meant to.
with open("/tmp/mytmp.dict", "w") as dictout:
    dictout.writelines(entry + "\n" for entry in entries)

# Save the clip as a WAV file for the command-line tools used below.
seg.export("/tmp/clip.wav", format="wav")

<_io.BufferedRandom name='/tmp/clip.wav'>

# Convert the clip with sox; the output of `pocketsphinx soxflags` is
# substituted in as the sox arguments (presumably selecting the raw format
# the aligner expects) and the result is redirected to /tmp/ps.raw.
!sox /tmp/clip.wav $(pocketsphinx soxflags) > /tmp/ps.raw
# Align the raw audio against the normalised transcript with the
# command-line tool and capture its output lines in `psjson`.
psjson=!pocketsphinx align /tmp/ps.raw "yeah that's true i mean they are the same size and they are a little bit but i think i i should go more for something that style"

psjson

['{"b":0.000,"d":5.410,"p":1.000,"t":"yeah that\'s true i mean they are the same size and they are a little bit but i think i i should go more for something that style","w":[{"b":0.000,"d":0.250,"p":0.964,"t":"yeah"},{"b":0.250,"d":0.150,"p":0.937,"t":"that\'s"},{"b":0.400,"d":0.200,"p":0.978,"t":"true"},{"b":0.600,"d":0.060,"p":0.974,"t":"i"},{"b":0.660,"d":0.180,"p":0.980,"t":"mean"},{"b":0.840,"d":0.140,"p":0.979,"t":"they"},{"b":0.980,"d":0.070,"p":0.983,"t":"are(2)"},{"b":1.050,"d":0.110,"p":0.987,"t":"the"},{"b":1.160,"d":0.260,"p":0.974,"t":"same"},{"b":1.420,"d":0.380,"p":0.956,"t":"size"},{"b":1.800,"d":0.120,"p":0.975,"t":"and"},{"b":1.920,"d":0.110,"p":0.986,"t":"they"},{"b":2.030,"d":0.140,"p":0.983,"t":"are(2)"},{"b":2.170,"d":0.030,"p":0.990,"t":"a"},{"b":2.200,"d":0.180,"p":0.914,"t":"little"},{"b":2.380,"d":0.140,"p":0.980,"t":"bit"},{"b":2.520,"d":0.220,"p":0.935,"t":"but"},{"b":2.740,"d":0.110,"p":0.983,"t":"i"},{"b":2.850,"d":0.260,"p":0.968,"t":"think"},{"b":3.110,"d":0.190,"p":0.975,"t":"i"},{"b":3.300,"d":0.090,"p":0.977,"t":"i"},{"b":3.390,"d":0.200,"p":0.976,"t":"should"},{"b":3.590,"d":0.150,"p":0.982,"t":"go"},{"b":3.740,"d":0.220,"p":0.979,"t":"more"},{"b":3.960,"d":0.200,"p":0.978,"t":"for(2)"},{"b":4.160,"d":0.320,"p":0.971,"t":"something"},{"b":4.480,"d":0.300,"p":0.961,"t":"that"},{"b":4.780,"d":0.510,"p":0.968,"t":"style"},{"b":5.290,"d":0.110,"p":0.955,"t":"<sil>"}]}']

import json
# The first captured output line holds the alignment as a JSON object.
data = json.loads(str(psjson[0]))

data

{'b': 0.0,
 'd': 5.41,
 'p': 1.0,
 't': "yeah that's true i mean they are the same size and they are a little bit but i think i i should go more for something that style",
 'w': [{'b': 0.0, 'd': 0.25, 'p': 0.964, 't': 'yeah'},
  {'b': 0.25, 'd': 0.15, 'p': 0.937, 't': "that's"},
  {'b': 0.4, 'd': 0.2, 'p': 0.978, 't': 'true'},
  {'b': 0.6, 'd': 0.06, 'p': 0.974, 't': 'i'},
  {'b': 0.66, 'd': 0.18, 'p': 0.98, 't': 'mean'},
  {'b': 0.84, 'd': 0.14, 'p': 0.979, 't': 'they'},
  {'b': 0.98, 'd': 0.07, 'p': 0.983, 't': 'are(2)'},
  {'b': 1.05, 'd': 0.11, 'p': 0.987, 't': 'the'},
  {'b': 1.16, 'd': 0.26, 'p': 0.974, 't': 'same'},
  {'b': 1.42, 'd': 0.38, 'p': 0.956, 't': 'size'},
  {'b': 1.8, 'd': 0.12, 'p': 0.975, 't': 'and'},
  {'b': 1.92, 'd': 0.11, 'p': 0.986, 't': 'they'},
  {'b': 2.03, 'd': 0.14, 'p': 0.983, 't': 'are(2)'},
  {'b': 2.17, 'd': 0.03, 'p': 0.99, 't': 'a'},
  {'b': 2.2, 'd': 0.18, 'p': 0.914, 't': 'little'},
  {'b': 2.38, 'd': 0.14, 'p': 0.98, 't': 'bit'},
  {'b': 2.52, 'd': 0.22, 'p': 0.935, 't': 'but'},
  {'b': 2.74, 'd': 0.11, 'p': 0.983, 't': 'i'},
  {'b': 2.85, 'd': 0.26, 'p': 0.968, 't': 'think'},
  {'b': 3.11, 'd': 0.19, 'p': 0.975, 't': 'i'},
  {'b': 3.3, 'd': 0.09, 'p': 0.977, 't': 'i'},
  {'b': 3.39, 'd': 0.2, 'p': 0.976, 't': 'should'},
  {'b': 3.59, 'd': 0.15, 'p': 0.982, 't': 'go'},
  {'b': 3.74, 'd': 0.22, 'p': 0.979, 't': 'more'},
  {'b': 3.96, 'd': 0.2, 'p': 0.978, 't': 'for(2)'},
  {'b': 4.16, 'd': 0.32, 'p': 0.971, 't': 'something'},
  {'b': 4.48, 'd': 0.3, 'p': 0.961, 't': 'that'},
  {'b': 4.78, 'd': 0.51, 'p': 0.968, 't': 'style'},
  {'b': 5.29, 'd': 0.11, 'p': 0.955, 't': '<sil>'}]}

# Write the word alignments as tab-separated "start<TAB>end<TAB>label" rows
# (intended for Audacity, going by the file name).
with open("/tmp/audacity.tsv", "w") as tsvf:
    for word in data["w"]:
        start = word["b"]
        # "d" is the duration, so the end time is start + duration.
        end = start + word["d"]
        # Fixed-point formatting is required here: a bare ".2" precision on
        # a float means two *significant digits* (1.42 -> "1.4",
        # 2.03 -> "2.0"), which truncated end times of one second or more.
        # Three decimals match the resolution of the aligner's output; the
        # start time is formatted the same way for consistency.
        tsvf.write(f"{start:.3f}\t{end:.3f}\t{word['t']}\n")