How to bypass Cloudflare security using httpx request? - python-3.x

OK, I'm trying to get the HTML body from a site that sits behind Cloudflare security.
I wrote the following code:
def reqja3():
    """Get request"""
    import ssl, httpx
    ssl_ctx = ssl.SSLContext(protocol=ssl.PROTOCOL_TLSv1_2)
    ssl_ctx.set_alpn_protocols(["h2", "http/1.1"])
    ssl_ctx.set_ecdh_curve("prime256v1")
    ssl_ctx.set_ciphers(
"TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256:"
"TLS_AES_128_GCM_SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:"
"ECDHE-RSA-AES256-GCM-SHA384:DHE-RSA-AES256-GCM-SHA384:"
"ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:"
"DHE-RSA-CHACHA20-POLY1305:ECDHE-ECDSA-AES128-GCM-SHA256:"
"ECDHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES128-GCM-SHA256:"
"ECDHE-ECDSA-AES256-SHA384:ECDHE-RSA-AES256-SHA384:"
"DHE-RSA-AES256-SHA256:ECDHE-ECDSA-AES128-SHA256:"
"ECDHE-RSA-AES128-SHA256:DHE-RSA-AES128-SHA256:"
"ECDHE-ECDSA-AES256-SHA:ECDHE-RSA-AES256-SHA:"
"DHE-RSA-AES256-SHA:ECDHE-ECDSA-AES128-SHA:"
"ECDHE-RSA-AES128-SHA:DHE-RSA-AES128-SHA:"
"RSA-PSK-AES256-GCM-SHA384:DHE-PSK-AES256-GCM-SHA384:"
"RSA-PSK-CHACHA20-POLY1305:DHE-PSK-CHACHA20-POLY1305:"
"ECDHE-PSK-CHACHA20-POLY1305:AES256-GCM-SHA384:"
"PSK-AES256-GCM-SHA384:PSK-CHACHA20-POLY1305:"
"RSA-PSK-AES128-GCM-SHA256:DHE-PSK-AES128-GCM-SHA256:"
"AES128-GCM-SHA256:PSK-AES128-GCM-SHA256:AES256-SHA256:"
"AES128-SHA256:ECDHE-PSK-AES256-CBC-SHA384:"
"ECDHE-PSK-AES256-CBC-SHA:SRP-RSA-AES-256-CBC-SHA:"
"SRP-AES-256-CBC-SHA:RSA-PSK-AES256-CBC-SHA384:"
"DHE-PSK-AES256-CBC-SHA384:RSA-PSK-AES256-CBC-SHA:"
"DHE-PSK-AES256-CBC-SHA:AES256-SHA:PSK-AES256-CBC-SHA384:"
"PSK-AES256-CBC-SHA:ECDHE-PSK-AES128-CBC-SHA256:ECDHE-PSK-AES128-CBC-SHA:"
"SRP-RSA-AES-128-CBC-SHA:SRP-AES-128-CBC-SHA:RSA-PSK-AES128-CBC-SHA256:"
"DHE-PSK-AES128-CBC-SHA256:RSA-PSK-AES128-CBC-SHA:"
"DHE-PSK-AES128-CBC-SHA:AES128-SHA:PSK-AES128-CBC-SHA256:PSK-AES128-CBC-SHA"
    )
    client = httpx.Client(http2=True, verify=ssl_ctx)
    print(
        client.get(
            "https://betway.com/en/sports",
            headers={
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "en-US,en;q=0.5",
"Connection": "keep-alive",
"Host": "betway.com",
"Upgrade-Insecure-Requests": "1",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none",
"Sec-Fetch-User": "?1",
"TE": "trailers",
"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:100.0) Gecko/20100101 Firefox/100.0",
"Cookie": "bw_BrowserId=76501619282281851930349603208014741540; _ga=GA1.2.1003986219.1650841035; _fbp=fb.1.1650841035798.971215073; COOKIE_POLICY_ACCEPTED=true; TrackingVisitId=67e26f62-e357-443d-be0c-83223d7ab902; hash=67e26f62-e357-443d-be0c-83223d7ab902; bw_SessionId=47d27eaa-623f-4434-a03b-9716d4b829a0; StaticResourcesVersion=12.43.0.1; ssc_btag=67e26f62-e357-443d-be0c-83223d7ab902; SpinSportVisitId=d369d188-39c6-41c6-8058-5aa297dd50c0; userLanguage=en; TimezoneOffset=120; _gid=GA1.2.381640013.1652975492; _gat_UA-1515961-1=1; ens_firstPageView=true; _gat=1; AMCVS_74756B615BE2FD4A0A495EB8%40AdobeOrg=1",
            },
        ).text
    )

reqja3()
Cloudflare can be bypassed with the "right request", so you don't need to execute JS.
The main thing is to make the request look like a browser does.
I set the SSL parameters, the TLS protocol, and HTTP/2, and it was working until today.
Now I'm trying to understand what I'm doing wrong.
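If it worked until today and now fails, it's worth checking whether Cloudflare is returning a challenge page instead of the real HTML before assuming the fingerprint is wrong. Here is a minimal diagnostic sketch, assuming you lift the SSL context and the header dict out of reqja3() into module-level names ssl_ctx and browser_headers (both names are mine, and the challenge markers are heuristics, not an official Cloudflare API):

import httpx

def looks_like_challenge(resp: httpx.Response) -> bool:
    # Heuristic only: Cloudflare interstitials usually arrive as 403/503 from the
    # "cloudflare" server, or contain the "Just a moment..." challenge markup.
    if resp.status_code in (403, 503) and resp.headers.get("server", "").lower() == "cloudflare":
        return True
    return "Just a moment" in resp.text or "cf-chl" in resp.text

# ssl_ctx and browser_headers are assumed to be the objects built in the snippet above.
with httpx.Client(http2=True, verify=ssl_ctx, headers=browser_headers) as client:
    resp = client.get("https://betway.com/en/sports")
    if looks_like_challenge(resp):
        print("Cloudflare challenge returned, status", resp.status_code)
    else:
        print(resp.text[:500])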

Related

return results via express with lighthouse npm library

https://googlechrome.github.io/lighthouse/viewer/?psiurl=https%3A%2F%2Fwww.zfcakademi.com%2F&strategy=mobile&category=performance&category=accessibility&category=best-practices&category=seo&category=pwa&utm_source=lh-chrome-ext&output=json
For the queries we make against this address:
https://www.googleapis.com/pagespeedonline/v5/runPagespeed?url=https://run-fix.com/&strategy=mobile&utm_source=lh-chrome-ext&category=performance&category=accessibility&category=best-practices&category=seo&category=pwa
the results are returned to us via the API. However, when using this API, a token is sent via the GitHub page, and access is granted by that token. To send a request without a token, you need to refresh the page using the cookies created after the page is opened; otherwise it returns a 403 permission error.
axios.get(`https://www.googleapis.com/pagespeedonline/v5/runPagespeed?url=https://www.zfcakademi.com/&strategy=desktop&utm_source=lh-chrome-ext&category=performance&category=accessibility&category=best-practices&category=seo&category=pwa`, {
"headers": {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36',
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"accept-language": "tr,en-US;q=0.9,en;q=0.8,tr-TR;q=0.7,ru;q=0.6",
"cache-control": "max-age=0",
"sec-ch-dpr": "1.5",
"sec-ch-ua": "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"101\", \"Google Chrome\";v=\"101\"",
"sec-ch-ua-arch": "\"x86\"",
"sec-ch-ua-bitness": "\"64\"",
"sec-ch-ua-full-version": "\"101.0.4951.64\"",
"sec-ch-ua-full-version-list": "\" Not A;Brand\";v=\"99.0.0.0\", \"Chromium\";v=\"101.0.4951.64\", \"Google Chrome\";v=\"101.0.4951.64\"",
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-model": "\"\"",
"sec-ch-ua-platform": "\"Windows\"",
"sec-ch-ua-platform-version": "\"10.0.0\"",
"sec-ch-ua-wow64": "?0",
"sec-ch-viewport-width": "853",
"sec-fetch-dest": "document",
"sec-fetch-mode": "navigate",
"sec-fetch-site": "same-origin",
"sec-fetch-user": "?1",
"upgrade-insecure-requests": "1",
"x-client-data": "CIS2yQEIorbJAQjEtskBCKmdygEI9eHKAQiTocsBCNvvywEInvnLAQjmhMwBCJmazAEI2qnMAQiJq8wBCOurzAEIwqzMARirqcoB"
},
"referrerPolicy": "origin",
"body": null,
"method": "GET",
"mode": "cors",
"credentials": "include"
})
.then(resp => {
resolve(resp.data)
})
When I send a request this way, I get a 403 error (as in the picture below) and cannot access the data. They have set up a cookie system. The Lighthouse npm library only documents getting data via commands. I'm trying to build an API with Express, but I couldn't find any resource on using Lighthouse with Express; everyone uses it from the command line. Can you please help? How can I combine Express and Lighthouse? This SEO data is very important for my client.
https://www.npmjs.com/package/lighthouse?activeTab=readme

How can I specify the exact http method with python requests?

In Burp Suite the first line of a captured request is usually GET / HTTP/1.1. However, I am currently practicing Host Header injection using the method of supplying an absolute URL, in order to send something like this:
GET https://vulnerable-website.com/ HTTP/1.1
Host: bad-stuff-here
In python I am using the requests library and am unable to specify the exact GET request I need.
import requests
burp0_url = "https://vulnerable-website.com:443/"
burp0_cookies = {[redacted]}
burp0_headers = {"Host": "bad-stuff-here", "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "en-US,en;q=0.5", "Accept-Encoding": "gzip, deflate", "Referer": "https://vulnerable-website.com/", "Connection": "close", "Upgrade-Insecure-Requests": "1"}
output = requests.get(burp0_url, headers=burp0_headers, cookies=burp0_cookies)
print(output, output.text)
I have tried specifying the GET request in the header dictionary (header = {"GET":" / HTTP/1.1", ...}), however this only results in a GET header (line 6 of the request below) being sent, not a changed request line:
GET / HTTP/1.1
User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0
Accept-Encoding: gzip, deflate
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
Connection: close
GET: /
Host: bad-stuff-here
Accept-Language: en-US,en;q=0.5
Referer: https://vulnerable-website.com/
Upgrade-Insecure-Requests: 1
Cookie: [redacted]
This is a very specific problem and I'm not sure if anyone has had the same issue, but any help is appreciated. Maybe there's a workaround with urllib, or something I'm missing. Thanks.
requests uses urllib3 under the hood.
You have to craft the request yourself, because none of the clients (urllib, requests, http.client) will allow you to insert a control character, by design.
You can use a plain socket for this:
from contextlib import closing
from functools import partial
import socket

msg = b'GET / HTTP/1.1\r\n\r\n'
s = socket.create_connection(("vulnerable-website.com", 80))
with closing(s):
    s.send(msg)
    buf = b''.join(iter(partial(s.recv, 4096), b''))
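Since the target in the question is HTTPS, the same socket trick needs a TLS wrapper. A rough sketch, assuming you still want the absolute URL in the request line and the spoofed Host header from the question (untested against any real target):

import socket, ssl

ctx = ssl.create_default_context()
raw = socket.create_connection(("vulnerable-website.com", 443))
with ctx.wrap_socket(raw, server_hostname="vulnerable-website.com") as s:
    # Absolute URL in the request line plus the injected Host header.
    request = (
        b"GET https://vulnerable-website.com/ HTTP/1.1\r\n"
        b"Host: bad-stuff-here\r\n"
        b"Connection: close\r\n"
        b"\r\n"
    )
    s.sendall(request)
    chunks = []
    while True:
        data = s.recv(4096)
        if not data:
            break
        chunks.append(data)
print(b"".join(chunks).decode("utf-8", "replace"))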

Crawling all page with scrapy and FormRequest

I would like to scrape all the course links on this website: https://www.formatic-centre.fr/formation/
Apparently the next pages are loaded dynamically with AJAX, so I need to simulate those requests using FormRequest from Scrapy.
That's what I did: I looked up the parameters with the developer tools (screenshot: ajax1).
I put those parameters into FormRequest, but it didn't work, so I also included the headers (screenshot: ajax2).
But that didn't work either. I'm guessing I'm doing something wrong, but what?
Here's my script (sorry, it's quite long, because I included all the parameters and headers):
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from lxml import html
from scrapy.http import FormRequest
class LinkSpider(scrapy.Spider):
    name = "link"
    #allow_domains = ['https://www.formatic-centre.fr/']
    start_urls = ['https://www.formatic-centre.fr/formation/']
    rules = (Rule(LinkExtractor(allow=r'formation'), callback="parse", follow=True),)

    def parse(self, response):
        card = response.xpath('//a[@class="title"]')
        for a in card:
            yield {'links': a.xpath('@href').get()}
        return [FormRequest(url="https://www.formatic-centre.fr/formation/",
formdata={'action' : "swlabscore",
'module[0]' : "top.Top_Controller",
'module[1]' : "ajax_get_course_pagination",
'page' : "2",
'layout' : "course",
'limit_post' : "",
'offset_post' : "0",
'sort_by' : "",
'pagination' : "yes",
'location_slug' : "",
'columns' : "2",
'paged' : "",
'cur_limit' : "",
'rows': "0",
'btn_content' : "En+savoir+plus",
'uniq_id' : "block-13759488265f916bca45c89",
'ZmfUNQ': "63y[Jt",
'PmhpIuZ_cTnUxqg' : "7v#IahmJNMplbCu",
'cZWVDbSPzTXRe' : "n9oa2k5u4GHWm",
'eOBITfdGRuriQ' : "hBPN5nObe.ktH",
"Accept" : "*/*",
"Accept-Encoding" : "gzip, deflate, br",
"Accept-Language" : "fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3",
"Connection" : "keep-alive",
"Content-Length" : "1010",
"Content-Type" : "application/x-www-form-urlencoded; charset=UTF-8",
"Cookie" : "_ga=GA1.2.815964309.1603392091; _gid=GA1.2.1686929506.1603392091; jlFYkafUWiyJe=LGAWcXg_wUjFo; z-byDgTnkdcQJSNH=03d1yiqH%40h8uZNtw; YeAhrFumyo-HQwpn=5uOhD6viWy%5BYeq3o",
"Host" : "www.formatic-centre.fr",
"Origin" : "https://www.formatic-centre.fr",
"Referer" : "https://www.formatic-centre.fr/formation/",
"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0",
"X-Requested-With" : "XMLHttpRequest",
"access-control-allow-credentials" : "true",
"access-control-allow-origin" : "https://www.formatic-centre.fr",
"cache-control" : "no-cache, must-revalidate, max-age=0",
"content-encoding": "gzip",
"content-length" :"2497",
"content-type" :"text/html; charset=UTF-8",
"date" :"Thu, 22 Oct 2020 18:42:54 GMT",
"expires" :"Wed, 11 Jan 1984 05:00:00 GMT",
"referrer-policy": "strict-origin-when-cross-origin",
"server": "Apache",
"set-cookie" : "jlFYkafUWiyJe=LGAWcXg_wUjFo; expires=Fri, 23-Oct-2020 18:42:54 GMT; Max-Age=86400; path=/; secure",
"set-cookie" : "z-byDgTnkdcQJSNH=03d1yiqH%40h8uZNtw; expires=Fri, 23-Oct-2020 18:42:54 GMT; Max-Age=86400; path=/; secure",
"set-cookie" : "YeAhrFumyo-HQwpn=5uOhD6viWy%5BYeq3o; expires=Fri, 23-Oct-2020 18:42:54 GMT; Max-Age=86400; path=/; secure",
"strict-transport-security" : "max-age=15552001; preload",
"vary" : "Accept-Encoding",
"x-content-type-options" : "nosniff",
"X-Firefox-Spdy" : "h2",
"x-frame-options" : "SAMEORIGIN",
"x-robots-tag" : "noindex"})]
The script works for the first page and I obtain the links, but when it needs to use FormRequest, nothing happens and I can't obtain the links from the next pages.
Any ideas?
EDIT: I hadn't noticed it before, but the terminal shows this error:
2020-10-23 03:51:30 [scrapy.core.engine] DEBUG: Crawled (400) <POST https://www.formatic-centre.fr/formation/> (referer: https://www.formatic-centre.fr/formation/) ['partial']
2020-10-23 03:51:30 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response <400 https://www.formatic-centre.fr/formation/>: HTTP status code is not handled or not allowed
Maybe that could help?
You have some issues with how you format and send both your headers and the payload itself.
Also, you have to keep changing the page, so the server knows where you're at and what response to send back.
I didn't want to set up a new scrapy project but here's how I got all the links, so hopefully this will nudge you in the right direction:
And if it feels like a hack, well, that's because it is one.
from urllib.parse import urlencode
import requests
from bs4 import BeautifulSoup
headers = {
"accept": "*/*",
"accept-encoding": "gzip, deflate, br",
"accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
"content-type": "application/x-www-form-urlencoded; charset=UTF-8",
"origin": "https://www.formatic-centre.fr",
"referer": "https://www.formatic-centre.fr/formation/",
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.99 Safari/537.36",
"x-requested-with": "XMLHttpRequest",
}
raw_string = "action=swlabscore&module%5B%5D=top.Top_Controller&module%5B%5D=ajax_get_course_pagination&params%5B0%5D%5Bpage%5D=2&params%5B0%5D%5Batts%5D%5Blayout%5D=course&params%5B0%5D%5Batts%5D%5Blimit_post%5D=&params%5B0%5D%5Batts%5D%5Boffset_post%5D=0&params%5B0%5D%5Batts%5D%5Bsort_by%5D=&params%5B0%5D%5Batts%5D%5Bpagination%5D=yes&params%5B0%5D%5Batts%5D%5Blocation_slug%5D=&params%5B0%5D%5Batts%5D%5Bcolumns%5D=2&params%5B0%5D%5Batts%5D%5Bpaged%5D=&params%5B0%5D%5Batts%5D%5Bcur_limit%5D=&params%5B0%5D%5Batts%5D%5Brows%5D=0&params%5B0%5D%5Batts%5D%5Bbtn_content%5D=En+savoir+plus&params%5B0%5D%5Batts%5D%5Buniq_id%5D=block-13759488265f916bca45c89&params%5B0%5D%5Batts%5D%5Bthumb-size%5D%5Blarge%5D=swedugate-thumb-300x225&params%5B0%5D%5Batts%5D%5Bthumb-size%5D%5Bno-image%5D=thumb-300x225.gif&params%5B0%5D%5Batts%5D%5Bthumb-size%5D%5Bsmall%5D=swedugate-thumb-300x225&params%5B0%5D%5Blayout_course%5D=style-grid&ZmfUNQ=63y[Jt&PmhpIuZ_cTnUxqg=7v#IahmJNMplbCu&cZWVDbSPzTXRe=n9oa2k5u4GHWm&eOBITfdGRuriQ=hBPN5nObe.ktH"
payloadd = [
('action', 'swlabscore'),
('module[]', 'top.Top_Controller'),
('module[]', 'ajax_get_course_pagination'),
('params[0][page]', '1'),
('params[0][atts][layout]', 'course'),
('params[0][atts][offset_post]', '0'),
('params[0][atts][pagination]', 'yes'),
('params[0][atts][columns]', '2'),
('params[0][atts][rows]', '0'),
('params[0][atts][btn_content]', 'En savoir plus'),
('params[0][atts][uniq_id]', 'block-13759488265f916bca45c89'),
('params[0][atts][thumb-size][large]', 'swedugate-thumb-300x225'),
('params[0][atts][thumb-size][no-image]', 'thumb-300x225.gif'),
('params[0][atts][thumb-size][small]', 'swedugate-thumb-300x225'),
('params[0][layout_course]', 'style-grid'),
('ZmfUNQ', '63y[Jt'),
('PmhpIuZ_cTnUxqg', '7v#IahmJNMplbCu'),
('cZWVDbSPzTXRe', 'n9oa2k5u4GHWm'),
('eOBITfdGRuriQ', 'hBPN5nObe.ktH'),
]
all_links = []
for page in range(1, 10):
    payloadd.pop(3)
    payloadd.insert(3, ('params[0][page]', str(page)))
    response = requests.post(
        "https://www.formatic-centre.fr/wp-admin/admin-ajax.php?",
        headers=headers,
        data=urlencode(payloadd)
    )
    print(f"Getting links from page {page}...")
    soup = BeautifulSoup(response.text, "html.parser").find_all("a", class_="btn btn-green")
    links = [i["href"] for i in soup]
    print('\n'.join(links))
    all_links.extend(links)

with open("formatic-center_links.txt", "w") as f:
    f.writelines("\n".join(all_links) + "\n")
This produces a file with all the links under the EN SAVOIR PLUS buttons.
https://www.formatic-centre.fr/formation/les-regles-juridiques-du-teletravail/
https://www.formatic-centre.fr/formation/mieux-gerer-son-stress-en-periode-du-covid-19/
https://www.formatic-centre.fr/formation/dynamiser-vos-equipes-special-post-confinement/
https://www.formatic-centre.fr/formation/conduire-ses-entretiens-specifique-post-confinement/
https://www.formatic-centre.fr/formation/cours-excel/
https://www.formatic-centre.fr/formation/autocad-3d-2/
https://www.formatic-centre.fr/formation/concevoir-et-developper-une-strategie-marketing/
https://www.formatic-centre.fr/formation/preparer-soutenance/
https://www.formatic-centre.fr/formation/mettre-en-place-une-campagne-adwords/
https://www.formatic-centre.fr/formation/utiliser-google-analytics/
and so on ...
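If you would rather stay inside Scrapy than switch to requests, roughly the same thing can be expressed with FormRequest, keeping the headers out of formdata and passing them separately. This is an untested sketch that reuses the endpoint and form fields from the answer above; the extra anti-bot fields (ZmfUNQ and friends) from the original question may also need to be added to the payload:

import scrapy

class FormaticLinksSpider(scrapy.Spider):
    name = "formatic_links"

    # Same AJAX endpoint and form fields as the requests-based answer above.
    ajax_url = "https://www.formatic-centre.fr/wp-admin/admin-ajax.php"

    def start_requests(self):
        for page in range(1, 10):
            payload = [
                ('action', 'swlabscore'),
                ('module[]', 'top.Top_Controller'),
                ('module[]', 'ajax_get_course_pagination'),
                ('params[0][page]', str(page)),
                ('params[0][atts][layout]', 'course'),
                ('params[0][atts][pagination]', 'yes'),
                ('params[0][atts][columns]', '2'),
                ('params[0][atts][btn_content]', 'En savoir plus'),
                ('params[0][atts][uniq_id]', 'block-13759488265f916bca45c89'),
            ]
            yield scrapy.FormRequest(
                self.ajax_url,
                formdata=payload,  # a list of tuples keeps the repeated module[] key
                headers={
                    "X-Requested-With": "XMLHttpRequest",
                    "Referer": "https://www.formatic-centre.fr/formation/",
                },
                callback=self.parse_page,
            )

    def parse_page(self, response):
        for href in response.css("a.btn.btn-green::attr(href)").getall():
            yield {"link": href}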

Scraping ajax xmlhttprequest using python

I want to scrape the school name, address, phone, and email for UK schools from this website, https://www.isc.co.uk/schools/, using its XMLHttpRequest endpoint. It returns error 500.
import requests
headers = {"Accept": "application/json, text/plain, */*","Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "en-US,en;q=0.9",
"Connection": "keep-alive",
"Content-Type": "application/json;charset=UTF-8",
"Cookie": "_ga=GA1.3.1302518161.1584461820; _hjid=5f23e8d2-23c6-4c87-9cc0-ca216b587ae1; cookie_preference=false; iscFilterStates=%7B%22locationLatitude%22%3Anull%2C%22locationLongitude%22%3Anull%2C%22distanceInMiles%22%3A0%2C%22residencyTypes%22%3A%5B%5D%2C%22genderGroup%22%3Anull%2C%22ageRange%22%3Anull%2C%22religiousAffiliation%22%3Anull%2C%22financialAssistances%22%3A%5B%5D%2C%22examinations%22%3A%5B%5D%2C%22specialNeeds%22%3Afalse%2C%22scholarshipsAndBurseries%22%3Afalse%2C%22latitudeSW%22%3A47.823214345168694%2C%22longitudeSW%22%3A-18.049563984375%2C%22latitudeNE%22%3A59.385618287793505%2C%22longitudeNE%22%3A12.953853984375021%2C%22contactCountyID%22%3A0%2C%22contactCountryID%22%3A0%2C%22londonBoroughID%22%3A0%2C%22filterByBounds%22%3Atrue%2C%22savedBounds%22%3Atrue%2C%22zoom%22%3A5%2C%22center%22%3A%7B%22lat%22%3A54.00366%2C%22lng%22%3A-2.547855%7D%7D; _gid=GA1.3.1000954634.1584850972; _gat=1; __atuvc=11%7C12%2C4%7C13; __atuvs=5e773c3c593ef6aa000; __atssc=google%3B7",
"Host": "www.isc.co.uk",
"Origin": "https://www.isc.co.uk",
"Referer": "https://www.isc.co.uk/schools/",
"Sec-Fetch-Mode": "cors",
"Sec-Fetch-Site": "same-origin",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3927.0 Safari/537.36"}
response = requests.post('https://www.isc.co.uk/Umbraco/Api/FindSchoolApi/FindSchoolListResults?skip=20&take=20', headers = headers)
response.status_code
The website uses JavaScript to render its data dynamically once the page loads.
The requests library cannot render JavaScript on the fly, so you could use selenium or requests_html; there are plenty of modules that can do that.
But we have another option on the table: track where the data is rendered from. I was able to locate the XHR request that retrieves the data from the back-end API and renders it on the user's side.
You can find the XHR request by opening the developer tools, checking the Network tab, and filtering the XHR/JS requests made, depending on the type of call (such as fetch).
import requests
import csv
data = {'locationLatitude': None, 'locationLongitude': None, 'distanceInMiles':
0, 'residencyTypes': [], 'genderGroup': None, 'ageRange': None, 'religiousAffiliation': None, 'financialAssistances': [], 'examinations': [], 'specialNeeds': False, 'scholarshipsAndBurseries': False, 'latitudeSW': 47.823214345168694, 'longitudeSW': -18.049563984375, 'latitudeNE': 59.385618287793505, 'longitudeNE': 12.953853984375021, 'contactCountyID': 0, 'contactCountryID': 0, 'londonBoroughID': 0, 'filterByBounds': True, 'savedBounds': True, 'zoom': 5, 'center': {'lat': 54.00366, 'lng': -2.547855}}
r = requests.post(
"https://www.isc.co.uk/Umbraco/Api/FindSchoolApi/FindSchoolListResults?skip=0&take=20", json=data).json()
with open("data.csv", 'w', newline="") as f:
writer = csv.writer(f)
writer.writerow(["Name", "Address", "Phone", "Email"])
for item in r:
writer.writerow(
[item["Name"], item["FullAddress"], item["TelephoneNumber"], item["EmailAddress"]])
print("Done")
Output : View-Online
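The endpoint above is called with skip=0&take=20, so it only returns the first 20 schools. A hedged extension, assuming skip/take behave like a normal offset/limit (which is only inferred from the query string), would page through the API until it returns an empty batch:

import requests

all_schools = []
skip, take = 0, 20
while True:
    url = ("https://www.isc.co.uk/Umbraco/Api/FindSchoolApi/FindSchoolListResults"
           f"?skip={skip}&take={take}")
    batch = requests.post(url, json=data).json()  # 'data' is the filter payload defined above
    if not batch:
        break
    all_schools.extend(batch)
    skip += take
print(f"Collected {len(all_schools)} schools")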

Crawling a website with Python: requests is OK, but Scrapy returns 400

I want to crawl a website with Python, but I've run into a problem: the requests library works fine, but Scrapy returns 400. The code is below.
import requests
urls = "https://pan.baidu.com/s/1sj1JLJv"
headers = {
'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
"Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.5,en;q=0.3",
"Accept-Encoding": "gzip, deflate",
'Content-Length': '0',
"Connection": "keep-alive"<br>
}
print(str((requests.get(urls, headers=header)).content, 'utf-8'))
from scrapy_redis.spiders import RedisCrawlSpider
class baiduuSpider(RedisCrawlSpider):
    ...
    ...
    ...
    urls = "https://pan.baidu.com/s/1sj1JLJv"
    yield scrapy.Request(url=urls, headers=headers, callback=self.first_parse)

    def first_parse(self, response):
        print(response.body.decode('utf-8'))
How do I fix this?
I'm sorry, but you won't succeed this way, because the page loads dynamically.
You need to execute the JavaScript on the fly, e.g. with Selenium or Splash.
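For completeness, a minimal Selenium sketch of that approach (assuming a recent Selenium 4, which resolves the Chrome driver automatically; the URL is the one from the question):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless=new")  # run without opening a browser window

driver = webdriver.Chrome(options=options)
try:
    driver.get("https://pan.baidu.com/s/1sj1JLJv")
    # page_source now contains the DOM after the page's JavaScript has run
    print(driver.page_source[:1000])
finally:
    driver.quit()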
