# Multidict scraper
# (Work in progress; not yet complete.)
import requests
from bs4 import BeautifulSoup
def scrapepage(pageid):
    """Fetch one clilstore page and extract its iframe URL and paragraph text.

    Args:
        pageid: Value sent as the ``id`` query parameter of ``page.php``.

    Returns:
        A ``(src, paragraphs)`` tuple. ``src`` is the ``src`` attribute of the
        first ``<iframe>`` inside the ``div.body-indent`` element, or ``None``
        when that element contains no iframe (or the iframe has no ``src``).
        ``paragraphs`` is a list with the text of every ``<p>`` inside the
        same element.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within the timeout.
        ValueError: If the page has no ``div.body-indent`` element.
    """
    # Passing the id via ``params`` lets requests URL-encode it; the timeout
    # keeps a stalled server from hanging the scraper indefinitely.
    response = requests.get(
        'https://multidict.net/clilstore/page.php',
        params={'id': pageid},
        timeout=30,
    )
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    body = soup.find('body')
    content = None
    if body is not None:
        content = body.find('div', {'class': 'body-indent'})
    if content is None:
        raise ValueError(
            f'page {pageid!r} has no <div class="body-indent"> element'
        )

    paragraphs = [paragraph.text for paragraph in content.find_all('p')]

    # Not every page is guaranteed to embed a frame; report None instead of
    # failing with an IndexError on an empty result list.
    frame = content.find('iframe')
    src = frame.get('src') if frame is not None else None
    return src, paragraphs
# Quick manual check: scrape the page with id 8839 and print the result.
print(scrapepage(pageid='8839'))
# Default request headers for the listing session. The values identify the
# client as Chrome 90 (see "sec-ch-ua"), and "content-type" declares request
# bodies as URL-encoded form data.
headers = {
    'accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/avif,image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3;q=0.9'
    ),
    'accept-language': 'en-US,en;q=0.9,pl;q=0.8,ga;q=0.7,en-GB;q=0.6',
    'cache-control': 'max-age=0',
    'content-type': 'application/x-www-form-urlencoded',
    # Single quotes avoid escaping the double quotes that are part of the value.
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="90", "Google Chrome";v="90"',
    'sec-ch-ua-mobile': '?0',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
}
# Use one session for the whole exchange so the default headers (and any
# cookies the server sets) are reused across requests.
s = requests.Session()
s.headers.update(headers)

# Load the landing page before posting the filter form. The response body is
# not used; presumably this is done to pick up session cookies (TODO confirm).
# Timeouts keep a stalled server from hanging the script, and
# raise_for_status() stops early instead of parsing an error page.
s.get("https://multidict.net/clilstore/", timeout=30).raise_for_status()
s.headers.update({'referer': "https://multidict.net/clilstore/"})

# The form fields are sent as an already URL-encoded body, matching the
# "content-type" header configured on the session.
x = s.post(
    "https://multidict.net/clilstore/",
    data="sl=ga&filterForm=1&title=&text=&showAll=showAll",
    timeout=30,
)
x.raise_for_status()

listsoup = BeautifulSoup(x.text, 'html.parser')
table = listsoup.find('table', {'id': 'main'})
if table is None:
    # Fail with a clear message rather than an AttributeError on None.
    raise RuntimeError('listing response contains no <table id="main"> element')
links = table.find_all('a')
def attrstartswith(tag, attr, needle):
    """Return True if *tag* has attribute *attr* whose value starts with *needle*.

    Args:
        tag: Object exposing an ``attrs`` mapping of attribute names to values.
        attr: Name of the attribute to inspect.
        needle: Prefix the attribute value must start with.

    Returns:
        ``True`` when the attribute is present and its value starts with
        *needle*; ``False`` otherwise, including when ``tag.attrs`` is empty
        or ``None``. The result is always a real ``bool``, never the empty
        ``attrs`` object itself.
    """
    attrs = tag.attrs
    if not attrs or attr not in attrs:
        return False
    return bool(attrs[attr].startswith(needle))
def collectlinks(links):
    """Collect the tail of every ``/cs/`` href found in *links*.

    Args:
        links: Iterable of tag objects exposing an ``attrs`` mapping.

    Returns:
        A list, in input order, holding the part of each ``href`` that
        follows the leading ``/cs/``. Tags without an ``href`` starting with
        ``/cs/`` are skipped.
    """
    prefix = '/cs/'
    return [
        anchor.attrs['href'][len(prefix):]
        for anchor in links
        if attrstartswith(anchor, 'href', prefix)
    ]