Split the documents from the dataset into sentences; needed to count the total number of sentences.
!pip install mosestokenizer
from datasets import load_dataset
# Explicit import instead of `from mosestokenizer import *` — only one name is used.
from mosestokenizer import MosesSentenceSplitter

# Local HF `datasets` loader script for the corpus; the 'documents' config
# yields items with 'title' and 'text' fields (inferred from usage below —
# verify against the loader script).
script = "/home/jim/Playing/notes/_drafts/nos.py"
nos = load_dataset(script, 'documents')

sentences = 0
# 'ga' = Irish. Write one sentence per line; explicit UTF-8 so Irish text
# round-trips regardless of the platform's default encoding.
with MosesSentenceSplitter('ga') as splitsents, \
        open("/tmp/nos.txt", "w", encoding="utf-8") as outf:
    for item in nos['train']:
        # Each title is written on its own line and counted as one sentence.
        outf.write(item['title'] + "\n")
        sentences += 1
        # Some documents have an empty body; skip the splitter for those.
        if not item['text']:
            continue
        sents = splitsents([item['text']])
        sentences += len(sents)
        for sentence in sents:
            outf.write(sentence + "\n")

# Bare `sentences` only echoes in a notebook; print so the script reports it too.
print(sentences)