Irish number normalisation with Pynini
Is it worth doing?
!pip install -q condacolab
import condacolab
condacolab.install()
%%capture
!conda install -c conda-forge pynini
%%capture
!pip install pyicu
import pynini
import icu
# ICU rule-based number formatter using the SPELLOUT rule set for the
# Irish ('ga') locale; every table below is generated from it.
ga_locale = icu.Locale('ga')
formatter = icu.RuleBasedNumberFormat(icu.URBNFRuleSetTag.SPELLOUT, ga_locale)

# Quick look at what the formatter produces for the single digits 0-9.
for i in range(10):
    print(formatter.format(i))
# Scratch example: a two-entry transducer built by hand, mapping the digit
# strings "0" and "1" to their Irish spellings. The result is not stored.
pynini.union(pynini.cross("0", "a náid"), pynini.cross("1", "a haon"))
# Sanity check of the zero-padded, three-character key format used as the
# input side of the lookup tables.
i = 1
padded_key = f"{i:03d}"
print(padded_key)
def _spellout_table(scale=1):
    """Build the union of three-digit-key -> spelled-out-number transducers.

    For every n in 1..999 the zero-padded key f"{n:03d}" is mapped to
    formatter.format(n * scale).

    Args:
        scale: integer each key is multiplied by before it is spelled out.

    Returns:
        The pynini union of the 999 cross-product transducers.
    """
    entries = (
        pynini.cross(f"{n:03d}", formatter.format(n * scale))
        for n in range(1, 1000)
    )
    return pynini.union(*entries)


# Units group: "001".."999" -> spelling of 1..999.
count_1_999 = _spellout_table()
# Thousands group: "001".."999" -> spelling of 1000..999000.
count_1_999_x1000 = _spellout_table(1000)
# Probe the thousands table with its largest key (result is not stored).
("999" @ count_1_999_x1000).string()
# Millions group: "001".."999" -> spelling of 1000000..999000000.
count_1_999_x1000000 = _spellout_table(1000000)
# Consume an all-zero three-digit group without emitting anything.
drop_000 = pynini.cross("000", "")
# Emit a single space while consuming no input.
ins_space = pynini.cross("", " ")
# Emit either a plain space or " is " while consuming no input.
ins_space_or_is = pynini.union(
    pynini.cross("", " "),
    pynini.cross("", " is "),
)
# Probe the millions table with its largest key (result is not stored).
("999" @ count_1_999_x1000000).string()
# Six-digit input = a thousands group followed by a units group. There are
# three ways the pair can be read:
# 1. non-zero thousands, units group is "000" (dropped);
round_thousands = count_1_999_x1000 + drop_000
# 2. non-zero thousands, a space, then non-zero units;
thousands_then_units = count_1_999_x1000 + ins_space + count_1_999
# 3. thousands group is "000" (dropped), non-zero units.
units_only = drop_000 + count_1_999

count_1_999999 = round_thousands | thousands_then_units | units_only

# Probe with the smallest value; only the units_only branch can match,
# because the thousands table has no "000" key.
("000001" @ count_1_999999).string()
We want to cover a fairly large range of numbers for this to be worth it; unfortunately, memory limits get in the way, so building the grammar up in sections is the only way forward.
In other words, Pynini gives no advantage over Thrax here.
#count_0_1000000000000 = pynini.union(*[pynini.cross(f"{i:03d}", formatter.format(i)) for i in range(0, 1000000000000)])
I can still generate the individual parts of the list, though.
def _write_spellout_tsv(path, multiplier=1):
    """Write a two-column TSV of three-digit keys and spelled-out numbers.

    Each line is "<key>\\t<spelling>\\n", where key is n zero-padded to three
    digits (n in 1..999) and spelling is formatter.format(n * multiplier).

    Args:
        path: output file name; an existing file is overwritten.
        multiplier: integer each key is multiplied by before spelling out.
    """
    # The spell-out text is non-ASCII (e.g. "a náid"), so pin the encoding
    # instead of relying on the platform default.
    with open(path, "w", encoding="utf-8") as outf:
        outf.writelines(
            f"{n:03d}\t{formatter.format(n * multiplier)}\n"
            for n in range(1, 1000)
        )


# One table per three-digit group position.
# NOTE(review): only units, thousands and billions are written here; there
# is no millions table. Confirm whether that is intentional.
for _tsv_path, _tsv_multiplier in (
    ("count-1-999.tsv", 1),
    ("count-1-999-thousands.tsv", 1000),
    ("count-1-999-billions.tsv", 1000000000),
):
    _write_spellout_tsv(_tsv_path, _tsv_multiplier)