Process Hungarian text
Join hard-wrapped lines into sentence-final lines so the text can be aligned as one of a pair of books
# Re-flow the hard-wrapped Hungarian text: consecutive input lines are joined
# with single spaces until a line ends in sentence-final punctuation, so each
# output line ends at a sentence boundary (or is a standalone heading line).
BASE_DIR = "/Users/joregan/Playing/maligna/maligna-ui/maligna-3.0.2-SNAPSHOT/hp-tuz"
FILE = f"{BASE_DIR}/konyv_4___harry_potter_es_a_tuz_serlege_www.5mp.eu_.txt"
OUTFILE = f"{BASE_DIR}/hp-hu.txt"

# Text accumulated from input lines that have not yet reached a sentence end.
current = ""
# A stripped line whose last character is in this set closes the pending text.
final_punct = set(".!?…")

# The encoding is pinned on both files so that matching the non-ASCII "…" and
# round-tripping the accented text do not depend on the platform's default
# locale encoding.
# NOTE(review): assumes the input file is UTF-8 -- confirm against the source.
with open(FILE, "r", encoding="utf-8") as f, \
        open(OUTFILE, "w", encoding="utf-8") as out_f:
    # Iterate the file object directly; readlines() would hold the whole book
    # in memory just to walk it once.
    for line in f:
        sline = line.strip()
        if not sline:
            # Blank lines carry no text and do not end a pending sentence.
            continue

        if sline[-1] in final_punct:
            # Sentence end: emit the pending text joined with this line.
            if current:
                out_f.write(f"{current} {sline}\n")
                current = ""
            else:
                out_f.write(f"{sline}\n")
        elif sline.endswith("fejezet"):
            # Lines ending in "fejezet" (presumably chapter headings) are never
            # merged: flush any pending text first, then write the line alone.
            if current:
                out_f.write(f"{current}\n")
                current = ""
            out_f.write(f"{sline}\n")
        else:
            # Mid-sentence line: keep accumulating.
            if current:
                current += " " + sline
            else:
                current = sline

    # Flush text left over at end of input; without this, a final run of lines
    # that never reaches sentence-final punctuation would be silently dropped.
    if current:
        out_f.write(f"{current}\n")
        current = ""