!wget https://ia600600.us.archive.org/23/items/multilingual_poetry_014_1002/hungarian_aborozo_petofi_dii.mp3
!ffmpeg -i ~/Downloads/hungarian_aborozo_petofi_dii.mp3 -acodec pcm_s16le -ac 1 -ar 16000 /tmp/samp.wav
!sox /tmp/samp.wav -r 16000 -c 1 -b 16 -e signed-integer /tmp/samp.raw
from pocketsphinx import Decoder, Config

# Phone-level decoding of the raw recording with PocketSphinx: configure the
# decoder, feed it the whole file as one utterance, then print one
# tab-separated row per segment (label, start s, end s, duration s).
MODEL = "/opt/homebrew/Cellar/cmu-pocketsphinx/5.0.4/share/pocketsphinx/model/en-us"
RAW = "/tmp/samp.raw"

# lm and dict are passed as None explicitly; only the acoustic model (hmm)
# and the phone language model (allphone) point at files.
decoder_options = {
    "hmm": f"{MODEL}/en-us",
    "allphone": f"{MODEL}/en-us-phone.lm.bin",
    "lm": None,
    "allphone_ci": True,
    "dict": None,
    "samprate": 16000,
}
config = Config(**decoder_options)
ps = Decoder(config)

# Read the whole file up front, then decode it as a single utterance.
with open(RAW, "rb") as f:
    pcm_bytes = f.read()

ps.start_utt()
ps.process_raw(pcm_bytes, no_search=False, full_utt=True)
ps.end_utt()

# Frames per second, used to turn frame indices into seconds.
frate = ps.config.get_int("-frate")
print(f"# frate={frate} frames/sec")

# NOTE(review): in the recorded output every segment starts one frame after
# the previous one ends, which suggests end_frame is inclusive. If so, the
# duration column is one frame short -- confirm before relying on it.
for seg in ps.seg():
    first, last = seg.start_frame, seg.end_frame
    print(f"{seg.word}\t{first / frate:.3f}\t{last / frate:.3f}\t{(last - first) / frate:.3f}")
# frate=100 frames/sec
SIL	0.000	0.560	0.560
OW	0.570	0.640	0.070
DH	0.650	0.700	0.050
L	0.710	0.820	0.110
AO	0.830	0.920	0.090
Z	0.930	1.050	0.120
OW	1.060	1.290	0.230
SIL	1.300	1.830	0.530
M	1.840	1.900	0.060
AY	1.910	2.050	0.140
T	2.060	2.200	0.140
AA	2.210	2.330	0.120
N	2.340	2.430	0.090
EH	2.440	2.580	0.140
DH	2.590	2.670	0.080
AH	2.680	2.740	0.060
T	2.750	2.840	0.090
IH	2.850	2.940	0.090
D	2.950	3.060	0.110
IY	3.070	3.320	0.250
SIL	3.330	3.660	0.330
R	3.670	3.750	0.080
EH	3.760	3.860	0.100
G	3.870	3.980	0.110
NG	3.990	4.060	0.070
AE	4.070	4.170	0.100
NG	4.180	4.270	0.090
IH	4.280	4.390	0.110
R	4.400	4.480	0.080
EY	4.490	4.650	0.160
N	4.660	4.760	0.100
F	4.770	4.850	0.080
AO	4.860	4.920	0.060
ER	4.930	4.980	0.050
IH	4.990	5.100	0.110
P	5.110	5.140	0.030
R	5.150	5.310	0.160
AO	5.320	5.390	0.070
K	5.400	5.460	0.060
TH	5.470	5.570	0.100
OW	5.580	5.680	0.100
B	5.690	5.740	0.050
AH	5.750	5.790	0.040
AY	5.800	5.950	0.150
R	5.960	6.010	0.050
P	6.020	6.130	0.110
AY	6.140	6.260	0.120
B	6.270	6.320	0.050
EH	6.330	6.450	0.120
N	6.460	6.510	0.050
AH	6.520	6.570	0.050
W	6.580	6.640	0.060
AY	6.650	6.770	0.120
NG	6.780	7.040	0.260
AE	7.050	7.200	0.150
D	7.210	7.300	0.090
SIL	7.310	10.060	2.750
L	10.070	10.210	0.140
NG	10.220	10.300	0.080
IH	10.310	10.420	0.110
Z	10.430	10.510	0.080
IH	10.520	10.630	0.110
B	10.640	10.760	0.120
R	10.770	10.810	0.040
AA	10.820	10.880	0.060
CH	10.890	11.000	0.110
K	11.010	11.060	0.050
AE	11.070	11.120	0.050
N	11.130	11.210	0.080
EH	11.220	11.490	0.270
TH	11.500	11.640	0.140
IY	11.650	11.860	0.210
R	11.870	11.940	0.070
N	11.950	12.010	0.060
IH	12.020	12.180	0.160
AA	12.190	12.290	0.100
N	12.300	12.340	0.040
IH	12.350	12.510	0.160
AE	12.520	12.620	0.100
DH	12.630	12.720	0.090
AW	12.730	12.810	0.080
N	12.820	12.980	0.160
SIL	12.990	13.680	0.690
L	13.690	13.810	0.120
N	13.820	13.890	0.070
D	13.900	13.930	0.030
IH	13.940	14.020	0.080
Z	14.030	14.110	0.080
IH	14.120	14.210	0.090
B	14.220	14.300	0.080
UH	14.310	14.480	0.170
T	14.490	14.620	0.130
K	14.630	14.700	0.070
AA	14.710	14.770	0.060
L	14.780	14.870	0.090
AW	14.880	15.020	0.140
EH	15.030	15.160	0.130
SIL	15.170	15.470	0.300
CH	15.480	15.620	0.140
AA	15.630	15.780	0.150
SH	15.790	15.940	0.150
K	15.950	16.070	0.120
AA	16.080	16.160	0.080
T	16.170	16.260	0.090
AA	16.270	16.390	0.120
L	16.400	16.530	0.130
AH	16.540	16.570	0.030
B	16.580	16.660	0.080
N	16.670	16.730	0.060
AE	16.740	16.810	0.070
AH	16.820	16.940	0.120
DH	16.950	17.040	0.090
AW	17.050	17.150	0.100
N	17.160	17.270	0.110
SIL	17.280	18.120	0.840
SH	18.130	18.250	0.120
IH	18.260	18.350	0.090
DH	18.360	18.440	0.080
AW	18.450	18.580	0.130
L	18.590	18.750	0.160
AH	18.760	18.790	0.030
T	18.800	18.880	0.080
L	18.890	18.960	0.070
K	18.970	19.090	0.120
ER	19.100	19.130	0.030
W	19.140	19.260	0.120
N	19.270	19.370	0.100
AH	19.380	19.450	0.070
M	19.460	19.540	0.080
L	19.550	19.590	0.040
EH	19.600	19.620	0.020
CH	19.630	19.760	0.130
ER	19.770	19.810	0.040
P	19.820	19.850	0.030
AW	19.860	19.920	0.060
B	19.930	20.010	0.080
UH	20.020	20.110	0.090
Y	20.120	20.180	0.060
S	20.190	20.350	0.160
AE	20.360	20.410	0.050
N	20.420	20.460	0.040
AA	20.470	20.550	0.080
K	20.560	20.640	0.080
IY	20.650	20.710	0.060
DH	20.720	20.780	0.060
EY	20.790	20.900	0.110
N	20.910	20.940	0.030
IY	20.950	21.010	0.060
M	21.020	21.090	0.070
AE	21.100	21.260	0.160
D	21.270	21.330	0.060
UH	21.340	21.410	0.070
K	21.420	21.480	0.060
ER	21.490	21.600	0.110
DH	21.610	21.760	0.150
AH	21.770	21.790	0.020
T	21.800	21.860	0.060
JH	21.870	21.910	0.040
EH	21.920	22.000	0.080
K	22.010	22.110	0.100
AE	22.120	22.180	0.060
B	22.190	22.260	0.070
AW	22.270	22.410	0.140
N	22.420	22.490	0.070
T	22.500	22.650	0.150
IH	22.660	22.710	0.050
NG	22.720	22.870	0.150
AE	22.880	22.980	0.100
M	22.990	23.030	0.040
+SPN+	23.040	23.120	0.080
+NSN+	23.130	23.240	0.110
SIL	23.250	24.350	1.100
SH	24.360	24.450	0.090
UH	24.460	24.510	0.050
B	24.520	24.580	0.060
AO	24.590	24.720	0.130
R	24.730	24.750	0.020
D	24.760	24.900	0.140
DH	24.910	24.950	0.040
AW	24.960	25.070	0.110
N	25.080	25.160	0.080
AE	25.170	25.240	0.070
M	25.250	25.300	0.050
IY	25.310	25.410	0.100
D	25.420	25.480	0.060
EH	25.490	25.560	0.070
N	25.570	25.630	0.060
F	25.640	25.720	0.080
IY	25.730	25.780	0.050
T	25.790	25.950	0.160
EH	25.960	26.050	0.090
N	26.060	26.090	0.030
P	26.100	26.170	0.070
AE	26.180	26.240	0.060
K	26.250	26.330	0.080
T	26.340	26.360	0.020
AE	26.370	26.510	0.140
D	26.520	26.630	0.110
AH	26.640	26.820	0.180
T	26.830	26.930	0.100
ER	26.940	27.010	0.070
AE	27.020	27.220	0.200
P	27.230	27.310	0.080
SIL	27.320	27.850	0.530
TH	27.860	27.970	0.110
IY	27.980	28.170	0.190
N	28.180	28.280	0.100
EH	28.290	28.360	0.070
T	28.370	28.440	0.070
AH	28.450	28.670	0.220
UW	28.680	28.860	0.180
T	28.870	28.970	0.100
IY	28.980	29.060	0.080
N	29.070	29.180	0.110
AH	29.190	29.260	0.070
CH	29.270	29.420	0.150
G	29.430	29.490	0.060
AO	29.500	29.610	0.110
D	29.620	29.650	0.030
T	29.660	29.740	0.080
IY	29.750	29.850	0.100
W	29.860	29.940	0.080
IY	29.950	30.050	0.100
S	30.060	30.160	0.100
AE	30.170	30.280	0.110
G	30.290	30.400	0.110
AH	30.410	30.470	0.060
T	30.480	30.550	0.070
AE	30.560	30.720	0.160
K	30.730	30.880	0.150
SIL	30.890	31.740	0.850
L	31.750	31.880	0.130
AH	31.890	31.970	0.080
B	31.980	32.050	0.070
AH	32.060	32.110	0.050
IY	32.120	32.280	0.160
DH	32.290	32.340	0.050
AH	32.350	32.420	0.070
W	32.430	32.580	0.150
UH	32.590	32.610	0.020
Y	32.620	32.720	0.100
EY	32.730	32.890	0.160
N	32.900	33.010	0.110
AH	33.020	33.070	0.050
CH	33.080	33.210	0.130
AY	33.220	33.430	0.210
NG	33.440	33.520	0.080
IH	33.530	33.570	0.040
AY	33.580	33.820	0.240
SH	33.830	33.920	0.090
IY	33.930	34.070	0.140
N	34.080	34.150	0.070
EH	34.160	34.210	0.050
P	34.220	34.300	0.080
EH	34.310	34.410	0.100
T	34.420	34.570	0.150
SIL	34.580	35.160	0.580
G	35.170	35.200	0.030
AO	35.210	35.330	0.120
D	35.340	35.380	0.040
T	35.390	35.440	0.050
AH	35.450	35.500	0.050
N	35.510	35.550	0.040
EY	35.560	35.670	0.110
AE	35.680	36.000	0.320
TH	36.010	36.140	0.130
AE	36.150	36.310	0.160
N	36.320	36.410	0.090
EY	36.420	36.580	0.160
CH	36.590	36.690	0.100
AA	36.700	36.830	0.130
HH	36.840	36.940	0.100
R	36.950	37.020	0.070
EH	37.030	37.240	0.210
N	37.250	37.300	0.050
IH	37.310	37.370	0.060
K	37.380	37.460	0.080
T	37.470	37.530	0.060
IY	37.540	37.600	0.060
DH	37.610	37.730	0.120
K	37.740	37.840	0.100
AE	37.850	37.930	0.080
TH	37.940	38.110	0.170
SIL	38.120	39.210	1.090
CH	39.220	39.300	0.080
G	39.310	39.390	0.080
AA	39.400	39.490	0.090
M	39.500	39.570	0.070
AY	39.580	39.660	0.080
D	39.670	39.750	0.080
UH	39.760	39.940	0.180
T	39.950	40.090	0.140
AA	40.100	40.170	0.070
N	40.180	40.250	0.070
AE	40.260	40.420	0.160
N	40.430	40.510	0.080
+SPN+	40.520	40.620	0.100
AH	40.630	40.730	0.100
AY	40.740	40.990	0.250
AW	41.000	41.180	0.180
HH	41.190	41.290	0.100
AH	41.300	41.340	0.040
DH	41.350	41.400	0.050
IY	41.410	41.550	0.140
S	41.560	41.620	0.060
IY	41.630	41.860	0.230
EY	41.870	42.120	0.250
SIL	42.130	42.840	0.710
N	42.850	42.870	0.020
EY	42.880	43.000	0.120
IH	43.010	43.120	0.110
SH	43.130	43.200	0.070
K	43.210	43.300	0.090
AA	43.310	43.420	0.110
R	43.430	43.480	0.050
D	43.490	43.740	0.250
SIL	43.750	44.150	0.400
S	44.160	44.300	0.140
N	44.310	44.360	0.050
AE	44.370	44.450	0.080
AH	44.460	44.590	0.130
T	44.600	44.710	0.110
EH	44.720	44.800	0.080
D	44.810	44.880	0.070
UW	44.890	45.070	0.180
L	45.080	45.150	0.070
K	45.160	45.290	0.130
IH	45.300	45.440	0.140
G	45.450	45.540	0.090
EH	45.550	45.610	0.060
AE	45.620	45.780	0.160
DH	45.790	45.940	0.150
TH	45.950	46.100	0.150
AH	46.110	46.150	0.040
N	46.160	46.230	0.070
AE	46.240	46.310	0.070
T	46.320	46.440	0.120
EY	46.450	46.610	0.160
SIL	46.620	48.570	1.950
N	48.580	48.760	0.180
AH	48.770	48.840	0.070
F	48.850	48.960	0.110
AO	48.970	49.220	0.250
N	49.230	49.440	0.210
SIL	49.450	49.860	0.410
DH	49.870	49.920	0.050
IH	49.930	49.990	0.060
T	50.000	50.140	0.140
EY	50.150	50.220	0.070
K	50.230	50.320	0.090
OW	50.330	50.420	0.090
R	50.430	50.500	0.070
G	50.510	50.570	0.060
NG	50.580	50.700	0.120
K	50.710	50.810	0.100
IY	50.820	50.850	0.030
S	50.860	50.980	0.120
N	50.990	51.080	0.090
AH	51.090	51.140	0.050
B	51.150	51.230	0.080
AH	51.240	51.310	0.070
DH	51.320	51.410	0.090
IH	51.420	51.520	0.100
T	51.530	51.650	0.120
AH	51.660	51.700	0.040
N	51.710	51.790	0.080
IY	51.800	51.990	0.190
NG	52.000	52.110	0.110
SIL	52.120	57.160	5.040

Text

A borozó by Sándor Petőfi. Read in Hungarian for librivox dot org by Diana Majlinger.

Gondüző borocska mellett Vígan illan életem; Gondüző borocska mellett, Sors, hatalmad nevetem.

És mit ámultok? ha mondom, Hogy csak a bor istene, Akit én imádok, aki E kebelnek mindene.

És a bor vidám hevében Füttyentek rád, zord világ! Szívemet hol annyi kínnak Skorpiói szaggaták.

Bor taníta húrjaimra Csalni nyájas éneket; Bor taníta elfeledni, Csalfa lyányok, titeket.

Egykor majd borocska mellől A halál ha űzni jő: Még egy korty - s nevetve dűlök Jégöledbe, temető!

End of poem. This recording is in the public domain.

# Token-level annotations for the spoken introduction, keyed by sentence.
# "languages" and "entities" are parallel to the sentence's whitespace-separated
# tokens (one label per token). Entity tags use B-/I-/E- prefixes for the
# first/inner/last token of a span and "O" for tokens outside any span.
# "normalized", where present, is the written form of the sentence.
DATA = {
    "A borozó by Sándor Petőfi.": {
        "languages": ["hu", "hu", "en", "hu", "hu"],
        "entities": ["B-WORK", "E-WORK", "O", "B-PERSON", "E-PERSON"],
    },
    "Read in Hungarian for librivox dot org by Diana Majlinger.": {
        "languages": ["en", "en", "en", "en", "en", "en", "en", "en", "hu", "hu"],
        # The URL span covers "librivox dot org" (tokens 4-6); "by" is outside it.
        "entities": ["O", "O", "O", "O", "B-URL", "I-URL", "E-URL", "O", "B-PERSON", "E-PERSON"],
        "normalized": "Read in Hungarian for librivox.org by Diana Majlinger.",
    },
}
# Hungarian reference text: title, author and reader name, followed by the
# poem with one verse line per row. Blank lines separate the stanzas.
TEXT = """
A borozó
SÁNDOR PETŐFI.
Diana Majlinger

Gondüző borocska mellett
Vígan illan életem;
Gondüző borocska mellett,
Sors, hatalmad nevetem.

És mit ámultok? ha mondom,
Hogy csak a bor istene,
Akit én imádok, aki
E kebelnek mindene.

És a bor vidám hevében
Füttyentek rád, zord világ!
Szívemet hol annyi kínnak
Skorpiói szaggaták.

Bor taníta húrjaimra
Csalni nyájas éneket;
Bor taníta elfeledni,
Csalfa lyányok, titeket.

Egykor majd borocska mellől
A halál ha űzni jő:
Még egy korty - s nevetve dűlök
Jégöledbe, temető!
"""
word_lists = {}
for line in TEXT.strip().split("\n"):
    if line.strip() == "":
        continue
    words = line.strip().split(" ")
    cleaned_words = [w.lower().strip("-:;,!?.") for w in words if w]
    only_cleaned_words = [w for w in cleaned_words if w]
    espeak_out = !/opt/homebrew/bin/espeak -v hu --ipa -q --stdout "{line}"
    stripped = [x.strip() for x in espeak_out if x.strip()]
    phone_line = " ".join(stripped)
    phone_words = phone_line.split(" ")
    if len(only_cleaned_words) != len(phone_words):
        print(f"WARNING: word count mismatch for line: {line}")
        print(f"  cleaned_words ({len(only_cleaned_words)}): {cleaned_words}")
        print(f"  phone_words   ({len(phone_words)}): {phone_words}")
    for a, b in zip(only_cleaned_words, phone_words):
        if a not in word_lists:
            word_lists[a] = set()
        word_lists[a].add(b)
    out_words = {w: list(word_lists[w]) for w in word_lists}
EN_TEXT = """
by
Read in Hungarian for librivox dot org by
End of poem. This recording is in the public domain.
"""
en_word_lists = {
    "read": ["ɹˈɛd"],
    "librivox": ["lˈɪbɹivˌɑːks"]
}
_EN_WORDS = EN_TEXT.strip().split()
for w in _EN_WORDS:
    cleaned = w.lower().strip("-:;,!?.")
    if cleaned == "":
        continue
    if cleaned in en_word_lists:
        continue
    espeak_out = !/opt/homebrew/bin/espeak -v en-us --ipa -q --stdout "{cleaned}"
    stripped = [x.strip() for x in espeak_out if x.strip()]
    phone_line = " ".join(stripped)
    if cleaned not in en_word_lists:
        en_word_lists[cleaned] = set()
    en_word_lists[cleaned].add(phone_line)
en_word_lists
{'by': {'bˈaɪ'},
 'read': {'ɹˈiːd'},
 'in': {'ˈɪn'},
 'hungarian': {'hʌŋɡˈɛɹiən'},
 'for': {'fˈɔːɹ'},
 'librivox': {'lˈaɪbɹɪvˌɑːks'},
 'dot': {'dˈɑːt'},
 'org': {'ˈɔːɹɡ'},
 'end': {'ˈɛnd'},
 'of': {'ˈʌv'},
 'poem': {'pˈoʊɪm'},
 'this': {'ðˈɪs'},
 'recording': {'ɹɪkˈoːɹdɪŋ'},
 'is': {'ˈɪz'},
 'the': {'ðˈə'},
 'public': {'pˈʌblɪk'},
 'domain': {'dəmˈeɪn'}}
# Display the Hungarian lexicon: each cleaned word mapped to its list of IPA strings.
out_words
{'a': ['ˌɑ'],
 'borozó': ['bˈorozoː'],
 'sándor': ['ʃˈaːndor'],
 'petőfi': ['pˈɛtøːfi'],
 'diana': ['dˈiɑnɑ'],
 'majlinger': ['mˈɑjlinɡɛr'],
 'gondüző': ['ɡˈondyzøː'],
 'borocska': ['bˈorotʃkɑ'],
 'mellett': ['mˌɛllɛtː'],
 'vígan': ['vˈiːɡɑn'],
 'illan': ['ˈillɑn'],
 'életem': ['ˈeːlɛtɛm'],
 'sors': ['ʃˈorʃ'],
 'hatalmad': ['hˈɑtɑlmɑd'],
 'nevetem': ['nˈɛvɛtɛm'],
 'és': ['ˌeːʃ'],
 'mit': ['mˈit'],
 'ámultok': ['ˈaːmultok'],
 'ha': ['hˌɑ'],
 'mondom': ['mˈondom'],
 'hogy': ['hˈoɟ'],
 'csak': ['tʃˈɑk'],
 'bor': ['bˈor'],
 'istene': ['ˈiʃtɛnɛ'],
 'akit': ['ˈɑkit'],
 'én': ['ˈeːn'],
 'imádok': ['ˈimaːdok'],
 'aki': ['ˈɑki'],
 'e': ['ˌɛ'],
 'kebelnek': ['kˈɛbɛlnɛk'],
 'mindene': ['mˈindɛnɛ'],
 'vidám': ['vˈidaːm'],
 'hevében': ['hˈɛveːbɛn'],
 'füttyentek': ['fˈycːɛntɛk'],
 'rád': ['rˌaːd'],
 'zord': ['zˈord'],
 'világ': ['vˈilaːɡ'],
 'szívemet': ['sˈivɛmɛt'],
 'hol': ['hˈol'],
 'annyi': ['ˈɑɲɲi'],
 'kínnak': ['kˈiːnnɑk'],
 'skorpiói': ['ʃkˈorpioːi'],
 'szaggaták': ['sˈɑɡːɑtaːk'],
 'taníta': ['tˈɑniːtɑ'],
 'húrjaimra': ['hˈuːrjɑimrɑ'],
 'csalni': ['tʃˈɑlni'],
 'nyájas': ['ɲˈaːjɑʃ'],
 'éneket': ['ˈeːnɛkɛt'],
 'elfeledni': ['ˈɛlfɛlɛdni'],
 'csalfa': ['tʃˈɑlfɑ'],
 'lyányok': ['jˈaːɲok'],
 'titeket': ['tˈitɛkɛt'],
 'egykor': ['ˈɛɟkor'],
 'majd': ['mˈɑjd'],
 'mellől': ['mˌɛlløːl'],
 'halál': ['hˈɑlaːl'],
 'űzni': ['ˈyːzni'],
 'jő': ['jˈøː'],
 'még': ['mˈeːɡ'],
 'egy': ['ˈɛɟ'],
 'korty': ['kˈorc'],
 's': ['ʃ'],
 'nevetve': ['nˈɛvɛtvɛ'],
 'dűlök': ['dˈyːløk'],
 'jégöledbe': ['jˈeːɡølɛdbɛ'],
 'temető': ['tˈɛmɛtøː']}