Crawling all pages with Scrapy and FormRequest - python-3.x

I would like to scrape all the course links on this website: https://www.formatic-centre.fr/formation/
Apparently the next pages are loaded dynamically with AJAX, so I need to simulate those requests with Scrapy's FormRequest.
That's what I did: I looked up the parameters with the developer tools (screenshot: ajax1).
I put those parameters into FormRequest, but it didn't work, so I figured I also needed to include the headers, which is what I did (screenshot: ajax2).
But that didn't work either. I'm guessing I'm doing something wrong, but what?
Here's my script (sorry it's quite long, because I included all the parameters and headers):
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from lxml import html
from scrapy.http import FormRequest
class LinkSpider(scrapy.Spider):
    name = "link"
    #allow_domains = ['https://www.formatic-centre.fr/']
    start_urls = ['https://www.formatic-centre.fr/formation/']
    rules = (Rule(LinkExtractor(allow=r'formation'), callback="parse", follow=True),)

    def parse(self, response):
        card = response.xpath('//a[@class="title"]')
        for a in card:
            yield {'links': a.xpath('@href').get()}
        return [FormRequest(
            url="https://www.formatic-centre.fr/formation/",
            formdata={
                'action': "swlabscore",
                'module[0]': "top.Top_Controller",
                'module[1]': "ajax_get_course_pagination",
                'page': "2",
                'layout': "course",
                'limit_post': "",
                'offset_post': "0",
                'sort_by': "",
                'pagination': "yes",
                'location_slug': "",
                'columns': "2",
                'paged': "",
                'cur_limit': "",
                'rows': "0",
                'btn_content': "En+savoir+plus",
                'uniq_id': "block-13759488265f916bca45c89",
                'ZmfUNQ': "63y[Jt",
                'PmhpIuZ_cTnUxqg': "7v#IahmJNMplbCu",
                'cZWVDbSPzTXRe': "n9oa2k5u4GHWm",
                'eOBITfdGRuriQ': "hBPN5nObe.ktH",
                "Accept": "*/*",
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3",
                "Connection": "keep-alive",
                "Content-Length": "1010",
                "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
                "Cookie": "_ga=GA1.2.815964309.1603392091; _gid=GA1.2.1686929506.1603392091; jlFYkafUWiyJe=LGAWcXg_wUjFo; z-byDgTnkdcQJSNH=03d1yiqH%40h8uZNtw; YeAhrFumyo-HQwpn=5uOhD6viWy%5BYeq3o",
                "Host": "www.formatic-centre.fr",
                "Origin": "https://www.formatic-centre.fr",
                "Referer": "https://www.formatic-centre.fr/formation/",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0",
                "X-Requested-With": "XMLHttpRequest",
                "access-control-allow-credentials": "true",
                "access-control-allow-origin": "https://www.formatic-centre.fr",
                "cache-control": "no-cache, must-revalidate, max-age=0",
                "content-encoding": "gzip",
                "content-length": "2497",
                "content-type": "text/html; charset=UTF-8",
                "date": "Thu, 22 Oct 2020 18:42:54 GMT",
                "expires": "Wed, 11 Jan 1984 05:00:00 GMT",
                "referrer-policy": "strict-origin-when-cross-origin",
                "server": "Apache",
                "set-cookie": "jlFYkafUWiyJe=LGAWcXg_wUjFo; expires=Fri, 23-Oct-2020 18:42:54 GMT; Max-Age=86400; path=/; secure",
                "set-cookie": "z-byDgTnkdcQJSNH=03d1yiqH%40h8uZNtw; expires=Fri, 23-Oct-2020 18:42:54 GMT; Max-Age=86400; path=/; secure",
                "set-cookie": "YeAhrFumyo-HQwpn=5uOhD6viWy%5BYeq3o; expires=Fri, 23-Oct-2020 18:42:54 GMT; Max-Age=86400; path=/; secure",
                "strict-transport-security": "max-age=15552001; preload",
                "vary": "Accept-Encoding",
                "x-content-type-options": "nosniff",
                "X-Firefox-Spdy": "h2",
                "x-frame-options": "SAMEORIGIN",
                "x-robots-tag": "noindex"})]
The script works for the first page and I get the links, but when it comes to the FormRequest, nothing happens and I can't get the links from the next pages.
Any ideas?
EDIT: I hadn't noticed it at first, but the terminal shows this error:
2020-10-23 03:51:30 [scrapy.core.engine] DEBUG: Crawled (400) <POST https://www.formatic-centre.fr/formation/> (referer: https://www.formatic-centre.fr/formation/) ['partial']
2020-10-23 03:51:30 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response <400 https://www.formatic-centre.fr/formation/>: HTTP status code is not handled or not allowed
Maybe that helps?

You have some issues with how you format and send both your headers and the payload itself.
Also, you have to keep changing the page parameter, so the server knows where you're at and what response to send back.
I didn't want to set up a new Scrapy project, but here's how I got all the links, so hopefully this will nudge you in the right direction:
And if it feels like a hack, well, that's because it is one.
from urllib.parse import urlencode
import requests
from bs4 import BeautifulSoup
headers = {
    "accept": "*/*",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
    "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
    "origin": "https://www.formatic-centre.fr",
    "referer": "https://www.formatic-centre.fr/formation/",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.99 Safari/537.36",
    "x-requested-with": "XMLHttpRequest",
}
# the raw form payload as captured from the browser (kept for reference; not used below)
raw_string = "action=swlabscore&module%5B%5D=top.Top_Controller&module%5B%5D=ajax_get_course_pagination&params%5B0%5D%5Bpage%5D=2&params%5B0%5D%5Batts%5D%5Blayout%5D=course&params%5B0%5D%5Batts%5D%5Blimit_post%5D=&params%5B0%5D%5Batts%5D%5Boffset_post%5D=0&params%5B0%5D%5Batts%5D%5Bsort_by%5D=&params%5B0%5D%5Batts%5D%5Bpagination%5D=yes&params%5B0%5D%5Batts%5D%5Blocation_slug%5D=&params%5B0%5D%5Batts%5D%5Bcolumns%5D=2&params%5B0%5D%5Batts%5D%5Bpaged%5D=&params%5B0%5D%5Batts%5D%5Bcur_limit%5D=&params%5B0%5D%5Batts%5D%5Brows%5D=0&params%5B0%5D%5Batts%5D%5Bbtn_content%5D=En+savoir+plus&params%5B0%5D%5Batts%5D%5Buniq_id%5D=block-13759488265f916bca45c89&params%5B0%5D%5Batts%5D%5Bthumb-size%5D%5Blarge%5D=swedugate-thumb-300x225&params%5B0%5D%5Batts%5D%5Bthumb-size%5D%5Bno-image%5D=thumb-300x225.gif&params%5B0%5D%5Batts%5D%5Bthumb-size%5D%5Bsmall%5D=swedugate-thumb-300x225&params%5B0%5D%5Blayout_course%5D=style-grid&ZmfUNQ=63y[Jt&PmhpIuZ_cTnUxqg=7v#IahmJNMplbCu&cZWVDbSPzTXRe=n9oa2k5u4GHWm&eOBITfdGRuriQ=hBPN5nObe.ktH"
payloadd = [
    ('action', 'swlabscore'),
    ('module[]', 'top.Top_Controller'),
    ('module[]', 'ajax_get_course_pagination'),
    ('params[0][page]', '1'),
    ('params[0][atts][layout]', 'course'),
    ('params[0][atts][offset_post]', '0'),
    ('params[0][atts][pagination]', 'yes'),
    ('params[0][atts][columns]', '2'),
    ('params[0][atts][rows]', '0'),
    ('params[0][atts][btn_content]', 'En savoir plus'),
    ('params[0][atts][uniq_id]', 'block-13759488265f916bca45c89'),
    ('params[0][atts][thumb-size][large]', 'swedugate-thumb-300x225'),
    ('params[0][atts][thumb-size][no-image]', 'thumb-300x225.gif'),
    ('params[0][atts][thumb-size][small]', 'swedugate-thumb-300x225'),
    ('params[0][layout_course]', 'style-grid'),
    ('ZmfUNQ', '63y[Jt'),
    ('PmhpIuZ_cTnUxqg', '7v#IahmJNMplbCu'),
    ('cZWVDbSPzTXRe', 'n9oa2k5u4GHWm'),
    ('eOBITfdGRuriQ', 'hBPN5nObe.ktH'),
]
all_links = []

for page in range(1, 10):
    # swap in the current page number (index 3 is the params[0][page] entry)
    payloadd.pop(3)
    payloadd.insert(3, ('params[0][page]', str(page)))
    response = requests.post(
        "https://www.formatic-centre.fr/wp-admin/admin-ajax.php?",
        headers=headers,
        data=urlencode(payloadd),
    )
    print(f"Getting links from page {page}...")
    soup = BeautifulSoup(response.text, "html.parser").find_all("a", class_="btn btn-green")
    links = [i["href"] for i in soup]
    print('\n'.join(links))
    all_links.extend(links)

with open("formatic-center_links.txt", "w") as f:
    f.writelines("\n".join(all_links) + "\n")
This produces a file with all the links under the EN SAVOIR PLUS buttons.
https://www.formatic-centre.fr/formation/les-regles-juridiques-du-teletravail/
https://www.formatic-centre.fr/formation/mieux-gerer-son-stress-en-periode-du-covid-19/
https://www.formatic-centre.fr/formation/dynamiser-vos-equipes-special-post-confinement/
https://www.formatic-centre.fr/formation/conduire-ses-entretiens-specifique-post-confinement/
https://www.formatic-centre.fr/formation/cours-excel/
https://www.formatic-centre.fr/formation/autocad-3d-2/
https://www.formatic-centre.fr/formation/concevoir-et-developper-une-strategie-marketing/
https://www.formatic-centre.fr/formation/preparer-soutenance/
https://www.formatic-centre.fr/formation/mettre-en-place-une-campagne-adwords/
https://www.formatic-centre.fr/formation/utiliser-google-analytics/
and so on ...
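If you would rather keep everything inside Scrapy, the same idea could look roughly like the sketch below. This is only a guess at a port of the requests version above: the headers go into the headers= argument instead of formdata, the payload is passed as a list of tuples so the repeated module[] key survives URL-encoding, and I trimmed it for readability (the site may well insist on the full payload, including the token-looking keys). The a.btn.btn-green selector is borrowed from the example above.
import scrapy
from scrapy.http import FormRequest


class CourseLinkSpider(scrapy.Spider):
    name = "course_links"
    start_urls = ["https://www.formatic-centre.fr/formation/"]

    ajax_url = "https://www.formatic-centre.fr/wp-admin/admin-ajax.php"
    ajax_headers = {
        "X-Requested-With": "XMLHttpRequest",
        "Referer": "https://www.formatic-centre.fr/formation/",
    }

    def parse(self, response):
        # the first page is plain HTML; the AJAX endpoint serves every page of the listing
        for page in range(1, 10):
            payload = [
                ("action", "swlabscore"),
                ("module[]", "top.Top_Controller"),
                ("module[]", "ajax_get_course_pagination"),
                ("params[0][page]", str(page)),
                ("params[0][atts][layout]", "course"),
                ("params[0][atts][pagination]", "yes"),
                ("params[0][atts][columns]", "2"),
                ("params[0][atts][btn_content]", "En savoir plus"),
                ("params[0][atts][uniq_id]", "block-13759488265f916bca45c89"),
            ]
            yield FormRequest(
                self.ajax_url,
                formdata=payload,        # an iterable of tuples keeps module[] twice
                headers=self.ajax_headers,
                callback=self.parse_page,
            )

    def parse_page(self, response):
        for href in response.css("a.btn.btn-green::attr(href)").getall():
            yield {"link": href}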

Related

Find marked elements, using python

I need to get an element from this website: https://channelstore.roku.com/en-gb/details/38e7b84fe064cf927ad471ed632cc3d8/vlrpdd2
Task image: (screenshot omitted)
I tried this code:
import requests
from bs4 import BeautifulSoup
page = requests.get('https://channelstore.roku.com/en-gb/details/38e7b84fe064cf927ad471ed632cc3d8/vlrpdd2')
soup = BeautifulSoup(page.content, 'html.parser')
print(soup.prettify())
I got a document back, but I only see metadata, without the result I expected.
If you inspect your browser's network calls (press F12 to open the developer tools), you'll see that the data is loaded dynamically from:
https://channelstore.roku.com/api/v6/channels/detailsunion/38e7b84fe064cf927ad471ed632cc3d8
So, to mimic that request, you can send a GET request to that URL yourself.
Note, there's no need for BeautifulSoup:
import requests
headers = {
    'authority': 'channelstore.roku.com',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-language': 'en-US,en;q=0.9',
    'cache-control': 'max-age=0',
    'cookie': '_csrf=ZaFYG2W7HQA4xqKW3SUfuta0; ks.locale=j%3A%7B%22language%22%3A%22en%22%2C%22country%22%3A%22GB%22%7D; _usn=c2062c71-f89e-456f-9374-7c7767afc665; _uc=54c11aa8-597e-4155-bfa1-379add00fc85%3Aa044adeb2f02798a3c8335d874d49562; _ga=GA1.3.760826055.1671811832; _gid=GA1.3.2471563.1671811832; _ga=GA1.1.760826055.1671811832; _cs_c=0; roku_test; AWSELB=0DC9CDB91658555B919B869A2ED9157DFA13B446022D0100EBAAD7261A39D5A536AC0223E5570FAECF0099832FA9F5DB8028018FCCD9D0A49D8F2BDA087916BC1E51F73D1E; AWSELBCORS=0DC9CDB91658555B919B869A2ED9157DFA13B446022D0100EBAAD7261A39D5A536AC0223E5570FAECF0099832FA9F5DB8028018FCCD9D0A49D8F2BDA087916BC1E51F73D1E; _gat_UA-678051-1=1; _ga_ZZXW5ZLMQ5=GS1.1.1671811832.1.1.1671812598.0.0.0; _cs_id=8a1f1aec-e083-a585-e054-158b6714ab4a.1671811832.1.1671812598.1671811832.1.1705975832137; _cs_s=10.5.0.1671814398585',
    'sec-ch-ua': '"Not?A_Brand";v="8", "Chromium";v="108", "Google Chrome";v="108"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
}
params = {
    'country': 'GB',
    'language': 'en',
}
response = requests.get(
    'https://channelstore.roku.com/api/v6/channels/detailsunion/38e7b84fe064cf927ad471ed632cc3d8',
    params=params,
    headers=headers,
).json()
# Uncomment to print all the data
# from pprint import pprint
# pprint(response)
print(response.get("feedChannel").get("name"))
print("rating: ", response.get("feedChannel").get("starRatingCount"))
Prints:
The Silver Collection Comedies
rating: 3
You will need to use Selenium for Python, as you need to load the JavaScript.
BeautifulSoup can only really handle static websites.
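For completeness, here is a minimal Selenium sketch along those lines. It assumes Chrome and a matching chromedriver are available (Selenium 4 can fetch the driver itself), and the h1 locator is only a guess at the element from the task image.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

options = webdriver.ChromeOptions()
options.add_argument("--headless=new")
driver = webdriver.Chrome(options=options)
try:
    driver.get("https://channelstore.roku.com/en-gb/details/"
               "38e7b84fe064cf927ad471ed632cc3d8/vlrpdd2")
    # wait until the rendered page contains the element you marked
    element = WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.TAG_NAME, "h1"))
    )
    print(element.text)
finally:
    driver.quit()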

How to bypass Cloudflare security using httpx request?

OK, I'm trying to get the HTML body of a site that sits behind Cloudflare security.
I wrote the following code:
def reqja3():
    """Get request"""
    import ssl, httpx

    ssl_ctx = ssl.SSLContext(protocol=ssl.PROTOCOL_TLSv1_2)
    ssl_ctx.set_alpn_protocols(["h2", "http/1.1"])
    ssl_ctx.set_ecdh_curve("prime256v1")
    ssl_ctx.set_ciphers(
        "TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256:"
        "TLS_AES_128_GCM_SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:"
        "ECDHE-RSA-AES256-GCM-SHA384:DHE-RSA-AES256-GCM-SHA384:"
        "ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:"
        "DHE-RSA-CHACHA20-POLY1305:ECDHE-ECDSA-AES128-GCM-SHA256:"
        "ECDHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES128-GCM-SHA256:"
        "ECDHE-ECDSA-AES256-SHA384:ECDHE-RSA-AES256-SHA384:"
        "DHE-RSA-AES256-SHA256:ECDHE-ECDSA-AES128-SHA256:"
        "ECDHE-RSA-AES128-SHA256:DHE-RSA-AES128-SHA256:"
        "ECDHE-ECDSA-AES256-SHA:ECDHE-RSA-AES256-SHA:"
        "DHE-RSA-AES256-SHA:ECDHE-ECDSA-AES128-SHA:"
        "ECDHE-RSA-AES128-SHA:DHE-RSA-AES128-SHA:"
        "RSA-PSK-AES256-GCM-SHA384:DHE-PSK-AES256-GCM-SHA384:"
        "RSA-PSK-CHACHA20-POLY1305:DHE-PSK-CHACHA20-POLY1305:"
        "ECDHE-PSK-CHACHA20-POLY1305:AES256-GCM-SHA384:"
        "PSK-AES256-GCM-SHA384:PSK-CHACHA20-POLY1305:"
        "RSA-PSK-AES128-GCM-SHA256:DHE-PSK-AES128-GCM-SHA256:"
        "AES128-GCM-SHA256:PSK-AES128-GCM-SHA256:AES256-SHA256:"
        "AES128-SHA256:ECDHE-PSK-AES256-CBC-SHA384:"
        "ECDHE-PSK-AES256-CBC-SHA:SRP-RSA-AES-256-CBC-SHA:"
        "SRP-AES-256-CBC-SHA:RSA-PSK-AES256-CBC-SHA384:"
        "DHE-PSK-AES256-CBC-SHA384:RSA-PSK-AES256-CBC-SHA:"
        "DHE-PSK-AES256-CBC-SHA:AES256-SHA:PSK-AES256-CBC-SHA384:"
        "PSK-AES256-CBC-SHA:ECDHE-PSK-AES128-CBC-SHA256:ECDHE-PSK-AES128-CBC-SHA:"
        "SRP-RSA-AES-128-CBC-SHA:SRP-AES-128-CBC-SHA:RSA-PSK-AES128-CBC-SHA256:"
        "DHE-PSK-AES128-CBC-SHA256:RSA-PSK-AES128-CBC-SHA:"
        "DHE-PSK-AES128-CBC-SHA:AES128-SHA:PSK-AES128-CBC-SHA256:PSK-AES128-CBC-SHA"
    )
    client = httpx.Client(http2=True, verify=ssl_ctx)
    print(
        client.get(
            "https://betway.com/en/sports",
            headers={
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate",
                "Accept-Language": "en-US,en;q=0.5",
                "Connection": "keep-alive",
                "Host": "betway.com",
                "Upgrade-Insecure-Requests": "1",
                "Sec-Fetch-Dest": "document",
                "Sec-Fetch-Mode": "navigate",
                "Sec-Fetch-Site": "none",
                "Sec-Fetch-User": "?1",
                "TE": "trailers",
                "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:100.0) Gecko/20100101 Firefox/100.0",
                "Cookie": "bw_BrowserId=76501619282281851930349603208014741540; _ga=GA1.2.1003986219.1650841035; _fbp=fb.1.1650841035798.971215073; COOKIE_POLICY_ACCEPTED=true; TrackingVisitId=67e26f62-e357-443d-be0c-83223d7ab902; hash=67e26f62-e357-443d-be0c-83223d7ab902; bw_SessionId=47d27eaa-623f-4434-a03b-9716d4b829a0; StaticResourcesVersion=12.43.0.1; ssc_btag=67e26f62-e357-443d-be0c-83223d7ab902; SpinSportVisitId=d369d188-39c6-41c6-8058-5aa297dd50c0; userLanguage=en; TimezoneOffset=120; _gid=GA1.2.381640013.1652975492; _gat_UA-1515961-1=1; ens_firstPageView=true; _gat=1; AMCVS_74756B615BE2FD4A0A495EB8%40AdobeOrg=1",
            },
        ).text
    )


reqja3()
Cloudflare can be bypassed with the "right" request, so you don't need to use JS.
The main thing is to make the request the way a browser does.
I set the SSL parameters, the TLS protocol and HTTP/2, and it was working until today.
Now I'm trying to understand what I'm doing wrong.
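One way to narrow it down (a suggestion, not a known fix): print the status code and response headers before the body. A Cloudflare block usually comes back as a 403 or 503 with Cloudflare-specific headers such as cf-ray, which makes it easier to tell a challenge page from real content.
import httpx

with httpx.Client(http2=True) as client:
    resp = client.get("https://betway.com/en/sports")
    # a 403/503 plus "cloudflare" in the server header means the challenge fired
    print(resp.status_code, resp.http_version)
    print(resp.headers.get("server"), resp.headers.get("cf-ray"))
    print(resp.text[:300])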

web scraping trouble - Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER

I tried to scrape a website with urllib and BeautifulSoup (Python 3.9), but I keep getting the message "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER", and the output is full of special characters, as below:
��T�w?.��m����%�%z��%�H=S��$S�YYyi�ABD�x�!%��f36��\�Y�j�46f����I��9��!D��������������������b7�3�8��JnH�t���mړBm���<���,�zR�m��A�g��{�XF%��&)�6zy��'
�)a�Fo
�����N舅,���~?w�w� �7z�Y6N������Q��ƣA��,p�8��/��W��q�$
���#e�J7�#� 5�X�z�Ȥ�&q��8 ��H"����I0�����͂8ZY}J�m��c}&5e��?
"/>[�7X�?NF4r���[k��6�X?��VV��H�J$j�6h��e�C��]<�V��z D
����"d�nje��{���+YL��*�X?a���m�������MNn�+��1=b$�N�4p�0���/�h�'�?�,�[��V��$�D���Z��+�?�x�X�g����
I read some threads about this problem, but I couldn't find a solution for my case.
Below is my code:
import urllib.request
from bs4 import BeautifulSoup

url = "https://www.fnac.com"
hdr = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:89.0) Gecko/20100101 Firefox/89.0",
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3",
    "Connection": "keep-alive",
}
req = urllib.request.Request(url, headers=hdr)
page = urllib.request.urlopen(req)
if page.getcode() == 200:
    soup = BeautifulSoup(page, "html.parser", from_encoding="utf-8")
    # divs = soup.findAll('div')
    # href = [i['href'] for i in soup.findAll('a', href=True)]
    print(soup)
else:
    print("failed!")
I tried changing the encoding to ASCII or iso-8859-(1...9), but the problem is still the same.
Thanks for your help :)
Remove Accept-Encoding from the HTTP headers:
import urllib.request
from bs4 import BeautifulSoup

url = "https://www.fnac.com"
hdr = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:89.0) Gecko/20100101 Firefox/89.0",
    "Accept": "*/*",
    # "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3",
    "Connection": "keep-alive",
}
req = urllib.request.Request(url, headers=hdr)
page = urllib.request.urlopen(req)
if page.getcode() == 200:
    soup = BeautifulSoup(page, "html.parser", from_encoding="utf-8")
    # divs = soup.findAll('div')
    # href = [i['href'] for i in soup.findAll('a', href=True)]
    print(soup)
else:
    print("failed!")
Prints:
<!DOCTYPE html>
<html class="no-js" lang="fr-FR">
<head><meta charset="utf-8"/> <!-- entry: inline-kameleoon -->
...
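The underlying issue is that the Accept-Encoding header invites the server to send gzip- or Brotli-compressed bytes, which urllib does not decompress for you. As an alternative sketch (not required for the fix above), requests decodes gzip/deflate transparently, and br too if the brotli package is installed:
import requests
from bs4 import BeautifulSoup

url = "https://www.fnac.com"
hdr = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:89.0) Gecko/20100101 Firefox/89.0",
    "Accept": "*/*",
    "Accept-Language": "fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3",
}
page = requests.get(url, headers=hdr)
if page.status_code == 200:
    # requests has already decoded the body, so BeautifulSoup gets clean HTML
    soup = BeautifulSoup(page.text, "html.parser")
    print(soup.prettify()[:500])
else:
    print("failed!")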

Scraping ajax xmlhttprequest using python

I want to scrape the school name, address, phone and email for UK schools from this website, https://www.isc.co.uk/schools/, using the site's XMLHttpRequest. It returns error 500.
import requests

headers = {
    "Accept": "application/json, text/plain, */*",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
    "Content-Type": "application/json;charset=UTF-8",
    "Cookie": "_ga=GA1.3.1302518161.1584461820; _hjid=5f23e8d2-23c6-4c87-9cc0-ca216b587ae1; cookie_preference=false; iscFilterStates=%7B%22locationLatitude%22%3Anull%2C%22locationLongitude%22%3Anull%2C%22distanceInMiles%22%3A0%2C%22residencyTypes%22%3A%5B%5D%2C%22genderGroup%22%3Anull%2C%22ageRange%22%3Anull%2C%22religiousAffiliation%22%3Anull%2C%22financialAssistances%22%3A%5B%5D%2C%22examinations%22%3A%5B%5D%2C%22specialNeeds%22%3Afalse%2C%22scholarshipsAndBurseries%22%3Afalse%2C%22latitudeSW%22%3A47.823214345168694%2C%22longitudeSW%22%3A-18.049563984375%2C%22latitudeNE%22%3A59.385618287793505%2C%22longitudeNE%22%3A12.953853984375021%2C%22contactCountyID%22%3A0%2C%22contactCountryID%22%3A0%2C%22londonBoroughID%22%3A0%2C%22filterByBounds%22%3Atrue%2C%22savedBounds%22%3Atrue%2C%22zoom%22%3A5%2C%22center%22%3A%7B%22lat%22%3A54.00366%2C%22lng%22%3A-2.547855%7D%7D; _gid=GA1.3.1000954634.1584850972; _gat=1; __atuvc=11%7C12%2C4%7C13; __atuvs=5e773c3c593ef6aa000; __atssc=google%3B7",
    "Host": "www.isc.co.uk",
    "Origin": "https://www.isc.co.uk",
    "Referer": "https://www.isc.co.uk/schools/",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3927.0 Safari/537.36",
}
response = requests.post('https://www.isc.co.uk/Umbraco/Api/FindSchoolApi/FindSchoolListResults?skip=20&take=20', headers=headers)
response.status_code
The website uses JavaScript events to render its data dynamically once the page loads.
The requests library is not able to render JavaScript on the fly, so you could use selenium or requests_html; indeed there are a lot of modules that can do that.
But we do have another option on the table: track where the data is rendered from. I was able to locate the XHR request that retrieves the data from the back-end API and renders it on the user's side.
You can find the XHR request by opening the Developer Tools, checking the Network tab, and looking at the XHR/JS requests made, depending on the type of call (such as fetch).
import requests
import csv

data = {'locationLatitude': None, 'locationLongitude': None, 'distanceInMiles': 0,
        'residencyTypes': [], 'genderGroup': None, 'ageRange': None,
        'religiousAffiliation': None, 'financialAssistances': [], 'examinations': [],
        'specialNeeds': False, 'scholarshipsAndBurseries': False,
        'latitudeSW': 47.823214345168694, 'longitudeSW': -18.049563984375,
        'latitudeNE': 59.385618287793505, 'longitudeNE': 12.953853984375021,
        'contactCountyID': 0, 'contactCountryID': 0, 'londonBoroughID': 0,
        'filterByBounds': True, 'savedBounds': True, 'zoom': 5,
        'center': {'lat': 54.00366, 'lng': -2.547855}}

r = requests.post(
    "https://www.isc.co.uk/Umbraco/Api/FindSchoolApi/FindSchoolListResults?skip=0&take=20", json=data).json()

with open("data.csv", 'w', newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Name", "Address", "Phone", "Email"])
    for item in r:
        writer.writerow(
            [item["Name"], item["FullAddress"], item["TelephoneNumber"], item["EmailAddress"]])

print("Done")
Output: a data.csv file containing the name, full address, telephone number and email address of each school.
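If you need more than the first 20 schools, the skip/take query parameters look like plain pagination, so you could probably page through the endpoint as in the sketch below (the page size of 20, the empty-batch stop condition, and reusing the data payload from the snippet above are all assumptions):
import requests

url = "https://www.isc.co.uk/Umbraco/Api/FindSchoolApi/FindSchoolListResults"
all_schools = []
skip = 0
while True:
    # `data` is the same filter payload defined in the snippet above
    batch = requests.post(url, params={"skip": skip, "take": 20}, json=data).json()
    if not batch:
        break
    all_schools.extend(batch)
    skip += 20
print(f"Collected {len(all_schools)} schools")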

I want to crawl a website with Python, but I've hit a problem: the requests library works, but Scrapy gets a 400 - the code is below

I want to crawl a website with Python, but I've hit a problem: the requests library works fine, but Scrapy gets a 400 response. The code is below.
import requests

urls = "https://pan.baidu.com/s/1sj1JLJv"
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    "Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.5,en;q=0.3",
    "Accept-Encoding": "gzip, deflate",
    'Content-Length': '0',
    "Connection": "keep-alive"
}
print(str((requests.get(urls, headers=headers)).content, 'utf-8'))
from scrapy_redis.spiders import RedisCrawlSpider


class baiduuSpider(RedisCrawlSpider):
    ...
    ...
    ...
    urls = "https://pan.baidu.com/s/1sj1JLJv"
    yield scrapy.Request(url=urls, headers=headers, callback=self.first_parse)

    def first_parse(self, response):
        print(response.body.decode('utf-8'))
How do I fix this?
I'm sorry, but you won't succeed this way, because the page loads dynamically.
You need to execute the JavaScript on the fly - Selenium, Splash
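For example, with Splash the spider could look roughly like this sketch; it assumes a Splash instance is running on localhost:8050 and that the scrapy-splash middleware is enabled in settings.py:
import scrapy
from scrapy_splash import SplashRequest


class BaiduPanSpider(scrapy.Spider):
    name = "baidupan"

    def start_requests(self):
        # render the page in Splash so its JavaScript has a chance to run
        yield SplashRequest(
            "https://pan.baidu.com/s/1sj1JLJv",
            callback=self.first_parse,
            args={"wait": 2},
        )

    def first_parse(self, response):
        print(response.text[:500])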
