Split the documents from the dataset into sentences; needed to count the total number of sentences.
!pip install mosestokenizer
from datasets import load_dataset
# Explicit import instead of `from mosestokenizer import *` — only one name is used.
from mosestokenizer import MosesSentenceSplitter

# Local HF `datasets` loader script for the corpus; the 'documents' config
# yields items with 'title' and 'text' fields (inferred from usage below —
# verify against the loader script).
script = "/home/jim/Playing/notes/_drafts/nos.py"
nos = load_dataset(script, 'documents')

sentences = 0
# 'ga' = Irish. Write one sentence per line; explicit UTF-8 so Irish text
# round-trips regardless of the platform's default encoding.
with MosesSentenceSplitter('ga') as splitsents, \
        open("/tmp/nos.txt", "w", encoding="utf-8") as outf:
    for item in nos['train']:
        # Each title is written on its own line and counted as one sentence.
        outf.write(item['title'] + "\n")
        sentences += 1
        # Some documents have an empty body; skip the splitter for those.
        if not item['text']:
            continue
        sents = splitsents([item['text']])
        sentences += len(sents)
        for sentence in sents:
            outf.write(sentence + "\n")

# Bare `sentences` only echoes in a notebook; print so the script reports it too.
print(sentences)