Trying to use PocketSphinx for word-level alignment.
Because timing accuracy in ASR is getting progressively worse, this looks backwards to an older tool.
from pydub import AudioSegment
%%capture
%pip install pocketsphinx
# eSpeak IPA symbol -> CMUdict phone, one "symbol phone" pair per line.
MAPPING = """
ɑː AA
æ AE
ə AH
ɐ AH
ʌ AH
ɔː AO
aʊ AW
aɪ AY
b B
tʃ CH
d D
ð DH
ɛ EH
ɚ ER
ɜː ER
eɪ EY
f F
ɡ G
h HH
ɪ IH
i IY
iː IY
dʒ JH
k K
l L
m M
n N
ŋ NG
oʊ OW
ɔɪ OY
p P
ɹ R
s S
ʃ SH
t T
θ TH
ʊ UH
uː UW
v V
w W
j Y
z Z
ʒ ZH
ɾ D
"""


def _parse_mapping(table):
    """Turn the newline-separated "symbol phone" table into a dict.

    Empty lines are skipped.  Any other line that does not consist of exactly
    two single-space-separated fields is printed and ignored.  When a symbol
    occurs more than once, the first occurrence wins.
    """
    lookup = {}
    for raw in table.split("\n"):
        if raw == "":
            continue
        entry = raw.strip()
        fields = entry.split(" ")
        if len(fields) != 2:
            # Surface malformed rows instead of failing on them.
            print(entry)
            continue
        symbol, phone = fields
        lookup.setdefault(symbol, phone)
    return lookup


espeak_to_cmudict = _parse_mapping(MAPPING)
import re
# Longest symbols first, so that a multi-character symbol (e.g. "tʃ") is tried
# before its single-character prefix ("t") in the alternation.
cmudict_keys = sorted(espeak_to_cmudict, key=len, reverse=True)
# Escape every key so that a symbol which happens to be a regex metacharacter
# is matched literally instead of changing the pattern.
espeak_regex = re.compile(
    "(" + "|".join(re.escape(key) for key in cmudict_keys) + ")"
)


def cmudictify(espeak):
    """Convert an eSpeak IPA string to a space-separated CMUdict phone string.

    Primary and secondary stress marks are removed before matching.  Only
    symbols present in ``espeak_to_cmudict`` are recognised; anything else in
    the input is silently skipped, so an input with no known symbols yields
    an empty string.
    """
    unstressed = espeak.replace("ˈ", "").replace("ˌ", "")
    return " ".join(
        espeak_to_cmudict[symbol] for symbol in espeak_regex.findall(unstressed)
    )
# Example utterance used throughout: the orthographic transcript.
EGTEXT = "Yeah, that's true. I mean, they are the same size and they are a little bit, but I think I I should go more for something that style."
# Phonemic transcription of the same utterance, wrapped in slashes, with one
# space-separated token per word of EGTEXT (make_lexicon asserts the counts match).
EGPHON = "/jˈæ ðˈæs tɹˈuː ə mˈiːn ðˈeɪ ɚ ðə sˈeɪm sˈaɪz ən ðˈeɪ ˈɑːɹ ə lˈɪɾə bˈɪɾ bˈʌt ˈaɪ θˈɪŋk ˈaɪ ˈaɪ ʃˈʊ ɡˈoʊ mˈɔːɹ fɚ sˈʌmθɪŋ ðˈæt stˈaɪl./"
# Recording that contains the utterance.
EGFILE = "/Users/joregan/Playing/hsi/audio/hsi_7_0719_210_001_main.wav"
# Clip boundaries within EGFILE; multiplied by 1000 before slicing the audio,
# so presumably in seconds.
EGSTART = 70.028
EGEND = 75.441
def normword(text):
    """Return *text* lower-cased, with leading/trailing ``,.;:!?`` removed."""
    return text.strip(",.;:!?").lower()
def normphon(phon):
    """Return *phon* with leading/trailing ``,.;:!?`` removed (case is kept)."""
    return phon.strip(",.;:!?")
def make_lexicon(text, phon):
    """Pair each word of *text* with the pronunciation of its token in *phon*.

    *phon* may be wrapped in a leading and trailing ``/``, which is removed.
    Both strings are split on single spaces; words are normalised with
    ``normword`` and phonemic tokens are converted with ``cmudictify`` after
    ``normphon``.  The two sequences must have the same length.

    Returns a list of unique ``(word, pronunciation)`` tuples in no
    particular order.
    """
    slashed = phon.startswith("/") and phon.endswith("/")
    transcription = phon[1:-1] if slashed else phon

    spellings = list(map(normword, text.split(" ")))
    pronunciations = [
        cmudictify(normphon(token)) for token in transcription.split(" ")
    ]
    assert len(spellings) == len(pronunciations)

    # Going through a set collapses repeated (word, pronunciation) pairs.
    return list(set(zip(spellings, pronunciations)))
# Pronunciation lexicon for the example utterance.
lex = make_lexicon(EGTEXT, EGPHON)
# Load the recording at a 16 kHz frame rate, then cut out the utterance;
# the boundaries are scaled by 1000 before being used as slice indices.
audio = AudioSegment.from_file(EGFILE).set_frame_rate(16000)
seg = audio[int(EGSTART * 1000):int(EGEND * 1000)]
def make_ps_dict(entries):
    """Format ``(word, pronunciation)`` pairs as dictionary lines.

    The pairs are sorted, and each is rendered as ``"word pronunciation"``.
    When a word occurs more than once, its second and later occurrences are
    written as ``word(2)``, ``word(3)``, and so on.

    Returns the list of formatted lines.
    """
    seen = {}
    lines = []
    for entry in sorted(entries):
        word, phones = entry[0], entry[1]
        seen[word] = seen.get(word, 0) + 1
        occurrence = seen[word]
        # Only alternates carry a numeric suffix; the first entry stays bare.
        suffix = "" if occurrence == 1 else f"({occurrence})"
        lines.append(f"{word}{suffix} {phones}")
    return lines
def make_fsg_transitions_from_text(text):
    """Build a linear chain of transitions, one per word of *text*.

    *text* is split on single spaces and each word is normalised with
    ``normword``.  Word number ``i`` (counting from 0) becomes the tuple
    ``(i, i + 1, 1.0, word)``.

    Returns the list of tuples in word order.
    """
    transitions = []
    for state, token in enumerate(text.split(" ")):
        transitions.append((state, state + 1, 1.0, normword(token)))
    return transitions
# Linear word chain for the example text; the source state of its first
# transition and the target state of its last one bound the grammar.
fsgt = make_fsg_transitions_from_text(EGTEXT)
start_state, end_state = fsgt[0][0], fsgt[-1][1]
import pocketsphinx
This was the first attempt. Passing `None`
for the dictionary (as the docs suggested) didn't help: words can't be added to a dictionary that doesn't exist.
This approach may yet work, because I can't be sure that it really failed to align using the grammar: the audio handling is fragile, and I should perhaps have passed ffmpeg parameters before writing the audio.
import tempfile
# Dictionary lines for the words of the example utterance.
entries = make_ps_dict(lex)

# process_raw() takes headerless 16-bit PCM, so pass the clip's sample bytes
# directly rather than the bytes of an exported WAV file (which would start
# with the RIFF header).  The clip is forced to mono, 16-bit samples first.
# NOTE(review): 16 kHz is assumed to match the acoustic model in use - confirm.
pcm = seg.set_frame_rate(16000).set_channels(1).set_sample_width(2).raw_data

# The decoder takes its dictionary as a file path, so write the entries to a
# temporary file that lives for the duration of the decoding.
with tempfile.NamedTemporaryFile(mode="w", suffix=".dict") as dictf:
    dictf.writelines(f"{entry}\n" for entry in entries)
    # Make sure the contents are on disk before the decoder opens the path.
    dictf.flush()

    decoder = pocketsphinx.Decoder(lm=None, dict=dictf.name)
    # Constrain the search to the linear word chain built from the transcript.
    fsg = decoder.create_fsg("dummy", start_state, end_state, fsgt)
    decoder.add_fsg("dummy", fsg)
    decoder.activate_search("dummy")

    decoder.start_utt()
    decoder.process_raw(pcm, full_utt=True)
    decoder.end_utt()

decoder.seg()
# Dump the grammar to a file and print it for inspection.
fsg.writefile("/tmp/fsm")
!cat /tmp/fsm
# Write the dictionary entries and the audio clip to fixed paths under /tmp.
with open("/tmp/mytmp.dict", "w") as dictout:
for entry in entries:
dictout.write(entry + "\n")
seg.export("/tmp/clip.wav", format="wav")
# Convert the clip with sox, using the flags printed by `pocketsphinx soxflags`,
# and redirect the result to /tmp/ps.raw.
!sox /tmp/clip.wav $(pocketsphinx soxflags) > /tmp/ps.raw
# Run the command-line aligner on the raw audio with the normalised transcript;
# IPython captures the command's output lines in psjson.
# NOTE(review): /tmp/mytmp.dict is written above but is not passed to this
# command - confirm whether that is intended.
psjson=!pocketsphinx align /tmp/ps.raw "yeah that's true i mean they are the same size and they are a little bit but i think i i should go more for something that style"
psjson
import json
# Parse the first captured output line as JSON.
data = json.loads(str(psjson[0]))
data
# Write one "start<TAB>end<TAB>label" line per entry of data["w"], where the
# end column is the entry's "b" value plus its "d" value.
with open("/tmp/audacity.tsv", "w") as tsvf:
    for word in data["w"]:
        # Use fixed-point formatting: a bare ".2" precision means two
        # significant digits (12.34 -> "12"), which mangles the timestamps.
        tsvf.write(
            f"{word['b']:.3f}\t{word['b'] + word['d']:.3f}\t{word['t']}\n"
        )