Sgéilíní na Finne
Partial scraper for Sgéilíní na Finne.
The original site is mirrored on the Wayback Machine; the archived story pages are listed in URLS below.
# Wayback Machine (web.archive.org) captures of the story pages
# sceal01.html through sceal37.html, one URL per line.
# The literal starts and ends with a newline, so splitting it on "\n"
# yields an empty first and last item.
URLS = """
http://web.archive.org/web/20160720003620/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal01.html
http://web.archive.org/web/20160612133120/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal02.html
http://web.archive.org/web/20160612133013/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal03.html
http://web.archive.org/web/20160612133127/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal04.html
http://web.archive.org/web/20160612132904/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal05.html
http://web.archive.org/web/20160612133018/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal06.html
http://web.archive.org/web/20160612133132/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal07.html
http://web.archive.org/web/20160612133302/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal08.html
http://web.archive.org/web/20160612132911/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal09.html
http://web.archive.org/web/20160612133023/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal10.html
http://web.archive.org/web/20160612133308/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal11.html
http://web.archive.org/web/20160612133028/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal12.html
http://web.archive.org/web/20160612133137/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal13.html
http://web.archive.org/web/20160612133033/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal14.html
http://web.archive.org/web/20160612133313/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal15.html
http://web.archive.org/web/20160612132916/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal16.html
http://web.archive.org/web/20160612133144/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal17.html
http://web.archive.org/web/20160612133149/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal18.html
http://web.archive.org/web/20160612133154/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal19.html
http://web.archive.org/web/20160612132921/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal20.html
http://web.archive.org/web/20160612133039/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal21.html
http://web.archive.org/web/20160612133159/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal22.html
http://web.archive.org/web/20160612132926/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal23.html
http://web.archive.org/web/20160612133204/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal24.html
http://web.archive.org/web/20160612133044/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal25.html
http://web.archive.org/web/20160612133059/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal26.html
http://web.archive.org/web/20160612132931/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal27.html
http://web.archive.org/web/20160612133318/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal28.html
http://web.archive.org/web/20160612133323/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal29.html
http://web.archive.org/web/20160612132936/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal30.html
http://web.archive.org/web/20160612132941/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal31.html
http://web.archive.org/web/20160612132946/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal32.html
http://web.archive.org/web/20160612133328/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal33.html
http://web.archive.org/web/20160612132951/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal34.html
http://web.archive.org/web/20160612133209/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal35.html
http://web.archive.org/web/20160612133218/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal36.html
http://web.archive.org/web/20160608213843/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal37.html
"""
# Spelling table, one pair per line: the spelling as it appears in the
# text, then its modernised replacement. The replacement may itself
# contain a space (e.g. "cheann tuíadh"). The table is parsed into the
# `modern` dict further down in this file.
MODERN = """
ruadh rua
ruaidh ruaí
chomhnuidhe chónaí
cheann-tuigheadh cheann tuíadh
rabh raibh
da dá
muinteardha muinteartha
chroidhe chroí
saoghal saol
éirghe éirí
leath-mheasardha leathmheasartha
brúighte brúite
cráidhte cráite
báidhte báite
de'n den
"""
Most of the modernised forms are the standard forms. Others, such as 'ruaí', exist as dialectal forms. 'ceann tuíadh' is not an attested form, but it matches what was spoken.
# Source spellings, one per line, whose modernised form in MODERN is not
# an attested word. Not consumed in the visible part of this file.
UNATTESTED = """
cheann-tuigheadh
"""
# Second spelling table in the same one-pair-per-line layout as MODERN.
# NOTE(review): presumably maps a source spelling to its standard-language
# form where that differs from the modernised form -- confirm against the
# code that consumes it (not visible here).
STANDARD = """
ruaidh rua
caidé cad é
cheann-tuigheadh cheann tuí
"""
# Space-separated two-digit numbers in the same format as the NN in the
# scealNN.html page names above.
# NOTE(review): presumably each list selects the stories whose title needs
# a given word prepended ("sgéal", "ceacht", "uimhir") -- the consuming
# code is not visible here, so confirm before relying on this.
PREPEND_SCEAL = "02 03"
PREPEND_CEACHT = "04 05"
PREPEND_UIMHIR = "06 07 08 09 10 11 12 13 14 15 16 18 23"
def _split_entry(line):
    """Split one spelling-table line into its columns.

    A line containing a tab is split on tabs, exactly as before. A line
    without a tab is split on its first run of whitespace only, because
    the replacement column may itself contain a space
    (e.g. "cheann-tuigheadh cheann tuíadh").

    Returns a list with fewer than two items for blank or one-word lines.
    """
    if "\t" in line:
        return line.split("\t")
    return line.strip().split(None, 1)


# Parsed rows of MODERN; blank and malformed lines are dropped.
# Previously only tab-separated lines were kept, so a table whose columns
# are separated by spaces produced an empty mapping.
_tmp_mod = [
    fields
    for fields in (_split_entry(a) for a in MODERN.split("\n"))
    if len(fields) >= 2
]
# Lookup from the spelling in the text to its modernised replacement.
modern = {a[0]: a[1] for a in _tmp_mod}
# Marker that closes the toolbar markup the Wayback Machine inserts into
# archived pages.
_END_TB = "<!-- END WAYBACK TOOLBAR INSERT -->"
import requests
from bs4 import BeautifulSoup
# Exploratory scrape of a single archived story page (sceal27).
# The bare expressions below (req.status_code, text, extt.text, title, extt)
# are notebook-style inspection points; they have no effect in a script.
test = "http://web.archive.org/web/20160612132931/http://www.smo.uhi.ac.uk/~oduibhin/sf/sceal27.html"
# A timeout stops an unresponsive server from hanging the run forever.
req = requests.get(test, timeout=30)
# Fail loudly on an HTTP error rather than parsing an error page.
req.raise_for_status()
req.status_code
text = req.text
# Keep what follows the Wayback toolbar marker, when the marker is present.
if _END_TB in text:
    text = text.split(_END_TB)[1]
# Keep only what precedes the first <hr>.
if "<hr>" in text:
    text = text.split("<hr>")[0].strip()
text
extt = BeautifulSoup(text, "lxml")
extt.text
# The title is taken from the first <font size="5"> element, and only when
# that element holds exactly one <b>. Otherwise `title` is None, so the
# name is always bound (it used to be undefined in that case).
header = extt.find("font", {"size": 5})
titles = header.find_all("b") if header is not None else []
title = titles[0].text if len(titles) == 1 else None
title
# Remove every <font size="5"> element from the parsed tree.
for i in extt.find_all("font", {"size": 5}):
    i.decompose()
extt