TG4 Foghlaim scraper pieces
Partial scraper pieces
import requests
from bs4 import BeautifulSoup
# Landing page that links to every lesson.
landing = "https://www.tg4.ie/ga/brandai-eile/foghlaim/ceachtanna/"
# A timeout keeps the script from blocking forever on a stalled connection.
landing_page = requests.get(landing, timeout=30)
# Raised explicitly instead of via `assert` so the check still runs under
# `python -O`; the exception type is the same as before.
if landing_page.status_code != 200:
    raise AssertionError(
        "GET {} returned HTTP {}, expected 200".format(
            landing, landing_page.status_code
        )
    )
soup = BeautifulSoup(landing_page.text, "lxml")
# One entry per <a class="prog-panel"> anchor: its href attribute, or None
# when the anchor carries no href.
lessons = [
    lesson_item.get("href")
    for lesson_item in soup.find_all("a", {"class": "prog-panel"})
]
def _reamhobair_text(url, timeout=30):
    """Fetch a page and return the text of its first toggle-content div.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A list with one string per direct child of the first
        ``<div class="arconix-toggle-content">`` found on the page, in
        document order (whitespace-only text nodes are included as-is).
        The list is empty when the page contains no such div.

    Raises:
        AssertionError: If the server answers with a status other than 200.
    """
    page = requests.get(url, timeout=timeout)
    # Raised explicitly instead of via `assert` so the check still runs under
    # `python -O`; the exception type is the same as before.
    if page.status_code != 200:
        raise AssertionError(
            "GET {} returned HTTP {}, expected 200".format(url, page.status_code)
        )
    soup = BeautifulSoup(page.text, "lxml")
    container = soup.find("div", {"class": "arconix-toggle-content"})
    if container is None:
        # find() returns None on a miss; iterating None would raise TypeError.
        return []
    out = []
    for part in container.children:
        if isinstance(part, str):
            # Bare text nodes are str subclasses; convert directly, because
            # older bs4 releases do not give them a text accessor.
            out.append(str(part))
        else:
            out.append(part.get_text())
    return out
# Run _reamhobair_text against the an-scoil/reamhobair lesson page at module
# load; the return value is not kept.
_reamhobair_text("https://www.tg4.ie/ga/brandai-eile/foghlaim/ceachtanna/an-scoil/reamhobair/")
def _h5p_integration_settings(script_text):
    """Return the JSON value assigned to ``H5PIntegration``, or None.

    None is returned when *script_text* does not start (after optional
    whitespace and an optional ``var``) with an ``H5PIntegration =``
    assignment. Malformed JSON after the ``=`` raises
    ``json.JSONDecodeError``.
    """
    import json
    import re

    prefix = re.match(r"\s*(?:var\s+)?H5PIntegration\s*=\s*", script_text)
    if prefix is None:
        return None
    # raw_decode stops at the end of the JSON value, so a trailing ";",
    # whitespace or further statements do not break parsing.
    settings, _end = json.JSONDecoder().raw_decode(script_text, prefix.end())
    return settings


def _reamhobair_questions(url, timeout=30):
    """Fetch a page and collect the question sets embedded in its H5P data.

    Every ``<script>`` whose text is an ``H5PIntegration = {...}``
    assignment is parsed. For each entry under its ``"contents"`` mapping
    that has a ``"library"`` key and a string ``"jsonContent"`` whose
    decoded JSON object has a top-level ``"questions"`` key, one tuple is
    emitted.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A list of ``(content_key, library, questions)`` tuples, in the order
        the entries are encountered. Empty when nothing matches.

    Raises:
        AssertionError: If the server answers with a status other than 200.
        json.JSONDecodeError: If an ``H5PIntegration`` payload, or a
            ``jsonContent`` string mentioning ``questions``, is not valid
            JSON.
    """
    import json

    page = requests.get(url, timeout=timeout)
    # Raised explicitly instead of via `assert` so the check still runs under
    # `python -O`; the exception type is the same as before.
    if page.status_code != 200:
        raise AssertionError(
            "GET {} returned HTTP {}, expected 200".format(url, page.status_code)
        )
    soup = BeautifulSoup(page.text, "lxml")

    out = []
    for script_tag in soup.find_all("script"):
        settings = _h5p_integration_settings(script_tag.get_text())
        if not isinstance(settings, dict):
            continue
        contents = settings.get("contents")
        if not isinstance(contents, dict):
            # Missing, or not a mapping (e.g. an empty JSON array).
            continue
        for content_key, entry in contents.items():
            if not isinstance(entry, dict) or "library" not in entry:
                continue
            raw_content = entry.get("jsonContent")
            # Cheap substring test first, so content that never mentions
            # questions is not parsed at all.
            if not isinstance(raw_content, str) or "questions" not in raw_content:
                continue
            parsed = json.loads(raw_content)
            # The substring may occur only in nested data or plain text;
            # require a real top-level key before indexing.
            if isinstance(parsed, dict) and "questions" in parsed:
                out.append((content_key, entry["library"], parsed["questions"]))
    return out
# Run _reamhobair_questions against two lesson pages at module load
# (an-scoil/reamhobair and ras-na-bpointi/mir-a-haon); the return values are
# not kept.
_reamhobair_questions("https://www.tg4.ie/ga/brandai-eile/foghlaim/ceachtanna/an-scoil/reamhobair/")
_reamhobair_questions("https://www.tg4.ie/ga/brandai-eile/foghlaim/ceachtanna/ras-na-bpointi/mir-a-haon/")