Get the Harvard sentences as separate lists (one text file per list),
for use in later comparisons.
# Page that is downloaded and scanned for <ol>/<li> elements.
URL = "https://www.cs.columbia.edu/%7Ehgs/audio/harvard.html"
# Directory that receives the harvard-<n>.txt output files.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
output_dir = "/Users/joregan/Playing/spoken-sentence-transformers/harvard"
import requests
from bs4 import BeautifulSoup
# Download the page. The timeout stops a stalled connection from hanging the
# script forever, and raise_for_status() turns an HTTP error response into an
# exception so an error page is never parsed as if it held the sentences.
req = requests.get(URL, timeout=30)
req.raise_for_status()
soup = BeautifulSoup(req.text, "html.parser")

# One inner list of sentence strings per <ol> element, in document order.
# A comprehension keeps the loop variables local, so the builtin `list` is
# not rebound at module level (the previous `for list in ...` loop did that).
lists = [
    [item.text for item in ordered_list.find_all("li")]
    for ordered_list in soup.find_all("ol")
]
from pathlib import Path
BASE = Path(output_dir)
# Create the target directory up front; open() does not create missing parents.
BASE.mkdir(parents=True, exist_ok=True)

# Write list number i (1-based) to harvard-<i>.txt, one sentence per line.
for i, sentences in enumerate(lists, start=1):
    # Explicit UTF-8 so the output does not depend on the platform default.
    with open(BASE / f"harvard-{i}.txt", "w", encoding="utf-8") as f:
        f.writelines(sentence + "\n" for sentence in sentences)