Clean Irish text
...preserving some punctuation, for silence alignment
def _ga_lc_word(text):
if text[0:1] in "nt" and text[1:2] in "AÁEÉIÍOÓUÚ":
return text[0:1] + "-" + text[1:].lower()
else:
return text.lower()
def ga_lower(text):
    """Lowercase Irish *text* token by token.

    The text is split on runs of whitespace, each token is passed through
    ``_ga_lc_word`` and the results are joined back with single spaces,
    so any original spacing or line breaks are collapsed.
    """
    return " ".join(map(_ga_lc_word, text.split()))
# Sanity check: words are lowercased and the prefixed "t" before a
# capitalised vowel is split off with a hyphen.
test = "Cuairt an tAthair"
assert ga_lower(test) == "cuairt an t-athair"
import re
def clean_text(text):
    """Normalise Irish text for silence alignment.

    Quotation marks and brackets are dropped, apostrophes are kept only
    inside words, the marks ``, ; . ! ?`` are split off as separate
    tokens (they can correspond to a silence), a spaced hyphen at the very
    end of the text is removed, and the result is lowercased with
    ``ga_lower``.

    Args:
        text: raw text.

    Returns:
        The cleaned text as space-separated lowercase tokens.
    """
    # Normalise the typographic apostrophe first, so the apostrophe rule
    # below sees one character only.
    text = text.replace("’", "'")
    # Drop quotes and brackets before looking at apostrophes: an apostrophe
    # hidden behind a closing quote or bracket is at a word edge too.
    text = re.sub(r'[‘“”"()\[\]{}]', "", text)
    # keep only word-internal apostrophes: remove any run that is not
    # both preceded and followed by a word character
    text = re.sub(r"(?<!\w)'+|'+(?!\w)", "", text)
    # keep punctuation that can correspond to silence
    text = re.sub(r"([,;.!?])", r" \1", text)
    # leave spaced hyphens, which also can be silences, except at EOS
    # (trailing whitespace after the hyphen still counts as EOS)
    text = re.sub(r"\s+-\s*$", "", text)
    return ga_lower(text)
# Quotes, brackets and apostrophes at the edges of the text are removed.
test = "'cuairt (an) “tAthair”''"
assert clean_text(test) == "cuairt an t-athair"
# A comma and an exclamation mark are split off as separate tokens.
test = "'cuairt, (an) “tAthair”!"
assert clean_text(test) == "cuairt , an t-athair !"
# A spaced hyphen at the end of the text is dropped.
test = "'cuairt, (an) “tAthair”! -"
assert clean_text(test) == "cuairt , an t-athair !"
Actually using it: clean every transcript in SRC that has a matching .wav file in OUT.
from pathlib import Path
OUT = Path("<SNIP>")
SRC = Path("<SNIP>")

# For every transcript in SRC that has a matching recording in OUT, write
# the cleaned text next to the recording as <stem>.txt.
for filename in SRC.glob("*.txt"):
    base = filename.stem
    wav = OUT / f"{base}.wav"
    if not wav.is_file():
        # No audio for this transcript, nothing to align against.
        continue
    out = OUT / f"{base}.txt"
    # Read and clean before touching the output: opening it for writing
    # first would truncate the transcript itself when SRC and OUT are the
    # same directory, and would leave an empty file behind if cleaning
    # failed.  The encoding is explicit because the text contains accented
    # characters and the platform default is not always UTF-8.
    text = filename.read_text(encoding="utf-8")
    clean = clean_text(text)
    out.write_text(clean, encoding="utf-8")