Version on Kaggle

%pip install mosestokenizer
!pwd
/kaggle/working
HTML = "https://www.gutenberg.org/cache/epub/2814/pg2814-images.html"
import requests
from bs4 import BeautifulSoup
dubliners = requests.get(HTML)
assert dubliners.status_code == 200
soup = BeautifulSoup(dubliners.text, 'html.parser')
body = soup.find("body")
text = []
# walk every element inside each chapter div, skipping the
# Project Gutenberg header/footer boilerplate sections
for chapter in body.findAll("div", {"class": "chapter"}):
    for element in chapter.findChildren():
        if element.name == "section" and element.get("id") in ["pg-header", "pg-footer"]:
            continue
        stripped = element.text.strip()
        if stripped != "":
            text.append(stripped)
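A quick sanity check that the chapter divs were found and text was collected; the two prints below are a sketch added for illustration, not cells from the original notebook.
print(len(text))      # number of extracted blocks (headings and paragraphs)
print(text[0][:80])   # start of the first block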
from mosestokenizer import MosesSentenceSplitter
# split each paragraph into sentences; report any paragraph the
# splitter chokes on without aborting the whole pass
sents = []
with MosesSentenceSplitter('en') as splitsents:
    for para in text:
        if para == "":
            continue
        try:
            sents += splitsents([para.replace("\r\n", " ")])
        except Exception as ex:
            print(ex, para)
import re
def cleaner(text):
    # normalise curly quotes
    text = text.replace("“", "")
    text = text.replace("”", "")
    text = text.replace("’ ", " ")
    text = text.replace("’", "'")
    text = text.replace("‘", " ")
    # strip punctuation and whitespace characters the language model should not see
    text = text.replace("\t", " ")
    text = text.replace("!...", " ")
    text = text.replace("....", " ")
    text = text.replace("...", " ")
    text = text.replace(":", " ")
    text = text.replace(";", " ")
    text = text.replace("!", " ")
    text = text.replace(",", " ")
    text = text.replace("?", " ")
    text = text.replace("(", " ")
    text = text.replace(")", " ")
    text = text.replace("—", " ")
    text = text.replace("\r\n", " ")
    text = text.replace("\n", " ")
    text = text.replace("\xa0", " ")
    text = text.replace(".", " ")
    text = text.replace("&", " and ")
    # spell out the few numerals that occur in the book
    text = text.replace(" 57E ", " fifty seven e ")
    text = text.replace(" 1st ", " first ")
    text = text.replace(" 6th ", " sixth ")
    text = text.replace(" 1895 ", " eighteen ninety five ")
    text = text.replace(" 1891", " eighteen ninety one")
    text = text.replace("1891", " eighteen ninety one")
    text = text.replace("65 ", " sixty five ")
    # collapse the runs of spaces left behind by the replacements
    text = re.sub("  +", " ", text)
    return text.lower().strip()
clean = [cleaner(x) for x in sents]
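Comparing a raw sentence with its cleaned counterpart shows what the cleaner does; this pair of prints is just an illustrative check.
print(sents[0])
print(clean[0])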
with open("dubliners-clean.txt", "w") as outf:
    for line in clean:
        outf.write(line + "\n")
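Before training, it is worth confirming the file came out as expected, since lmplz treats each line as one whitespace-tokenized sentence:
!wc -l dubliners-clean.txt
!head -3 dubliners-clean.txt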
!apt install -y build-essential libboost-all-dev cmake zlib1g-dev libbz2-dev liblzma-dev
%cd /tmp
/tmp
!git clone https://github.com/kpu/kenlm
%cd kenlm
/tmp/kenlm
!mkdir build
%cd build
!cmake ..
!make -j 4
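If the build succeeded, lmplz and the other KenLM tools should now be under build/bin; a quick listing confirms that before moving on:
!ls bin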
%cd /kaggle/working
/kaggle/working
!/tmp/kenlm/build/bin/lmplz -o 5 < dubliners-clean.txt > dubliners.arpa
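With the ARPA file written, the 5-gram model can be queried from Python through the kenlm bindings. The snippet below is a sketch of how one might score sentences against it; it assumes the kenlm package is available (e.g. via %pip install kenlm), and the example sentence is arbitrary, lowercased and unpunctuated to match the cleaned training data.
%pip install kenlm
import kenlm
model = kenlm.Model("dubliners.arpa")
# total log10 probability of the sentence, with begin/end-of-sentence markers
print(model.score("he was a very charitable man", bos=True, eos=True))
# perplexity is easier to compare across sentences of different lengths
print(model.perplexity("he was a very charitable man"))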