How to download binary files (images) using requests and open()? - python-3.x

When I try to download an image from one URL, the code works, but when I try another URL, it doesn't work.
The first snippet doesn't work: it only creates the file, with no usable image in it.
# This doesn't work.
import requests

url = 'https://ryanspressurewashing.com/wp-content/uploads/2017/06/metal-roof-after-pressure-washing.jpg'
r = requests.get(url, stream=True)

with open('image3.jpg', 'wb') as my_file:
    # Read in 4 KB chunks
    for byte_chunk in r.iter_content(chunk_size=4096):
        my_file.write(byte_chunk)
# This works.
import requests

url = 'http://www.webscrapingfordatascience.com/files/kitten.jpg'
r = requests.get(url, stream=True)

with open('image.jpg', 'wb') as my_file:
    # Read in 4 KB chunks
    for byte_chunk in r.iter_content(chunk_size=4096):
        my_file.write(byte_chunk)

Different portals may use different security systems to block scripts and bots.
When you open image3.jpg in a text editor, you will see:
<head>
<title>Not Acceptable!</title>
</head>
<body>
<h1>Not Acceptable!</h1>
<p>An appropriate representation of the requested resource could not be found on this server.
This error was generated by Mod_Security.</p>
</body>
</html>
Some servers need correct headers, cookies, session IDs, etc. before they give access to the data.
This portal needs a correct User-Agent header:
import requests

url = 'https://ryanspressurewashing.com/wp-content/uploads/2017/06/metal-roof-after-pressure-washing.jpg'
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0'
}
r = requests.get(url, stream=True, headers=headers)

with open('image3.jpg', 'wb') as my_file:
    # Read in 4 KB chunks
    for byte_chunk in r.iter_content(chunk_size=4096):
        my_file.write(byte_chunk)
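If you are not sure whether a portal blocked the download, it can help to check the status code and Content-Type before writing the file. A minimal sketch (the check itself is not part of the original answer):

import requests

url = 'https://ryanspressurewashing.com/wp-content/uploads/2017/06/metal-roof-after-pressure-washing.jpg'
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0'
}
r = requests.get(url, stream=True, headers=headers)
r.raise_for_status()  # raises on 4xx/5xx responses such as 406 Not Acceptable
content_type = r.headers.get('Content-Type', '')
if not content_type.startswith('image/'):
    raise ValueError(f'Expected an image, got {content_type!r}')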
By default, requests uses the User-Agent python-requests/2.21.0, so portals can easily recognize the script and block it.
You can see this header using https://httpbin.org/get
import requests
r = requests.get('https://httpbin.org/get')
print(r.text)
Result:
{
  "args": {},
  "headers": {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate",
    "Host": "httpbin.org",
    "User-Agent": "python-requests/2.21.0"
  },
  "origin": "83.23.39.165, 83.23.39.165",
  "url": "https://httpbin.org/get"
}
See more functions on httpbin.org
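For example, you can confirm that a custom User-Agent is actually being sent by checking what the server receives (a short sketch using the httpbin.org/headers endpoint):

import requests

headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0'}
r = requests.get('https://httpbin.org/headers', headers=headers)
# The reported User-Agent should now be the browser string, not python-requests/...
print(r.json()['headers']['User-Agent'])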

Related

web scraping trouble - Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER

I tried to scrape a website with urllib and BeautifulSoup (Python 3.9), but I keep getting the same message, "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER", along with special characters like these:
��T�w?.��m����%�%z��%�H=S��$S�YYyi�ABD�x�!%��f36��\�Y�j�46f����I��9��!D��������������������b7�3�8��JnH�t���mړBm���<���,�zR�m��A�g��{�XF%��&)�6zy��'
�)a�Fo
�����N舅,���~?w�w� �7z�Y6N������Q��ƣA��,p�8��/��W��q�$
���#e�J7�#� 5�X�z�Ȥ�&q��8 ��H"����I0�����͂8ZY}J�m��c}&5e��?
"/>[�7X�?NF4r���[k��6�X?��VV��H�J$j�6h��e�C��]<�V��z D
����"d�nje��{���+YL��*�X?a���m�������MNn�+��1=b$�N�4p�0���/�h�'�?�,�[��V��$�D���Z��+�?�x�X�g����
I have read some topics about this problem but could not find a solution for my case.
Below is my code:
url = "https://www.fnac.com"
hdr = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:89.0) Gecko/20100101 Firefox/89.0",
"Accept": "*/*",
"Accept-Encoding" : "gzip, deflate, br",
"Accept-Language": "fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3",
"Connection" : "keep-alive"}
req = urllib.request.Request(url, headers=hdr)
page = urllib.request.urlopen(req)
if page.getcode() == 200:
soup = BeautifulSoup(page, "html.parser", from_encoding="utf-8")
#divs = soup.findAll('div')
#href = [i['href'] for i in soup.findAll('a', href=True)]
print(soup)
else:
print("failed!")
I tried to change the encoding to ASCII or iso-8859-(1...9), but the problem is still the same.
Thanks for your help :)
Remove Accept-Encoding from the HTTP headers. With that header the server replies with gzip/brotli-compressed data, and urllib.request (unlike requests) does not decompress the body automatically, so BeautifulSoup sees raw compressed bytes:
import urllib.request
from bs4 import BeautifulSoup

url = "https://www.fnac.com"
hdr = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:89.0) Gecko/20100101 Firefox/89.0",
    "Accept": "*/*",
    # "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3",
    "Connection": "keep-alive",
}
req = urllib.request.Request(url, headers=hdr)
page = urllib.request.urlopen(req)
if page.getcode() == 200:
    soup = BeautifulSoup(page, "html.parser", from_encoding="utf-8")
    # divs = soup.findAll('div')
    # href = [i['href'] for i in soup.findAll('a', href=True)]
    print(soup)
else:
    print("failed!")
Prints:
<!DOCTYPE html>
<html class="no-js" lang="fr-FR">
<head><meta charset="utf-8"/> <!-- entry: inline-kameleoon -->
...
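If you do want to keep requesting compressed content, you have to decompress the body yourself before parsing. A minimal sketch, assuming the server honours a gzip-only Accept-Encoding (brotli would need the separate brotli package):

import gzip
import urllib.request
from bs4 import BeautifulSoup

url = "https://www.fnac.com"
hdr = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:89.0) Gecko/20100101 Firefox/89.0",
    "Accept-Encoding": "gzip",  # ask for gzip only
}
req = urllib.request.Request(url, headers=hdr)
with urllib.request.urlopen(req) as page:
    body = page.read()
    # Decompress manually, since urllib does not do it for us
    if page.headers.get("Content-Encoding") == "gzip":
        body = gzip.decompress(body)
soup = BeautifulSoup(body, "html.parser")
print(soup.title)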

How to get a proper result from web scraping (flight information on Kayak)?

from bs4 import BeautifulSoup
import requests
url = 'https://www.kayak.co.uk/flights/SEL-LON/2020-12-31?sort=bestflight_a'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
deptimes = soup.find_all('span', attrs={'class': 'depart-time base-time'})
deptimes
On Kayak, I have tried to get the departure time information for flights from Seoul to London.
The result is [], and when I apply the same approach to other information the results are always the same.
Thank you
If you look at what you get back in response.content, kayak.co.uk thinks (correctly) that you are a bot and sends this (among other things):
If you are seeing this page, it means that KAYAK thinks you are a "bot," and the page you were trying to get to is only useful for humans.
However, a slight modification of the code tricks the server enough to return what you're looking for.
Try this:
import requests
from bs4 import BeautifulSoup
url = 'https://www.kayak.co.uk/flights/SEL-LON/2020-12-31?sort=bestflight_a'
headers = {
    "accept": "application/json, text/javascript, */*; q=0.01",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "en-US,en;q=0.9,pl;q=0.8",
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
}
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')
departure_times = soup.find_all('span', {'class': 'depart-time base-time'})
for _time in departure_times:
    print(_time.text)
This outputs:
00:25
00:25
00:25
14:30
14:30
13:00
13:00
... and so on
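As a quick sanity check (a sketch, not from the original answer), you can detect KAYAK's bot-detection page before parsing, using the exact phrase quoted above:

import requests
from bs4 import BeautifulSoup

url = 'https://www.kayak.co.uk/flights/SEL-LON/2020-12-31?sort=bestflight_a'
headers = {
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
}
r = requests.get(url, headers=headers)
if 'KAYAK thinks you are a "bot"' in r.text:
    raise RuntimeError("Blocked by bot detection - adjust headers or slow down requests")
soup = BeautifulSoup(r.content, 'html.parser')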

Scraping an AJAX XMLHttpRequest using Python

I want to scrape the school name, address, phone, and email for UK schools from https://www.isc.co.uk/schools/ using an XMLHttpRequest. It returns error 500.
import requests
headers = {"Accept": "application/json, text/plain, */*","Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "en-US,en;q=0.9",
"Connection": "keep-alive",
"Content-Type": "application/json;charset=UTF-8",
"Cookie": "_ga=GA1.3.1302518161.1584461820; _hjid=5f23e8d2-23c6-4c87-9cc0-ca216b587ae1; cookie_preference=false; iscFilterStates=%7B%22locationLatitude%22%3Anull%2C%22locationLongitude%22%3Anull%2C%22distanceInMiles%22%3A0%2C%22residencyTypes%22%3A%5B%5D%2C%22genderGroup%22%3Anull%2C%22ageRange%22%3Anull%2C%22religiousAffiliation%22%3Anull%2C%22financialAssistances%22%3A%5B%5D%2C%22examinations%22%3A%5B%5D%2C%22specialNeeds%22%3Afalse%2C%22scholarshipsAndBurseries%22%3Afalse%2C%22latitudeSW%22%3A47.823214345168694%2C%22longitudeSW%22%3A-18.049563984375%2C%22latitudeNE%22%3A59.385618287793505%2C%22longitudeNE%22%3A12.953853984375021%2C%22contactCountyID%22%3A0%2C%22contactCountryID%22%3A0%2C%22londonBoroughID%22%3A0%2C%22filterByBounds%22%3Atrue%2C%22savedBounds%22%3Atrue%2C%22zoom%22%3A5%2C%22center%22%3A%7B%22lat%22%3A54.00366%2C%22lng%22%3A-2.547855%7D%7D; _gid=GA1.3.1000954634.1584850972; _gat=1; __atuvc=11%7C12%2C4%7C13; __atuvs=5e773c3c593ef6aa000; __atssc=google%3B7",
"Host": "www.isc.co.uk",
"Origin": "https://www.isc.co.uk",
"Referer": "https://www.isc.co.uk/schools/",
"Sec-Fetch-Mode": "cors",
"Sec-Fetch-Site": "same-origin",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3927.0 Safari/537.36"}
response = requests.post('https://www.isc.co.uk/Umbraco/Api/FindSchoolApi/FindSchoolListResults?skip=20&take=20', headers = headers)
response.status_code
The website is loaded by JavaScript, which renders its data dynamically once the page loads.
The requests library cannot render JavaScript on the fly, so you could use selenium or requests_html; there are plenty of modules that can do that.
However, there is another option on the table: track where the data is rendered from. I was able to locate the XHR request that retrieves the data from the back-end API and renders it on the user's side.
You can find the XHR request by opening Developer Tools, going to the Network tab, and checking the XHR/JS requests that are made, depending on the type of call (such as fetch).
import requests
import csv
data = {
    'locationLatitude': None, 'locationLongitude': None, 'distanceInMiles': 0,
    'residencyTypes': [], 'genderGroup': None, 'ageRange': None,
    'religiousAffiliation': None, 'financialAssistances': [], 'examinations': [],
    'specialNeeds': False, 'scholarshipsAndBurseries': False,
    'latitudeSW': 47.823214345168694, 'longitudeSW': -18.049563984375,
    'latitudeNE': 59.385618287793505, 'longitudeNE': 12.953853984375021,
    'contactCountyID': 0, 'contactCountryID': 0, 'londonBoroughID': 0,
    'filterByBounds': True, 'savedBounds': True, 'zoom': 5,
    'center': {'lat': 54.00366, 'lng': -2.547855},
}
r = requests.post(
    "https://www.isc.co.uk/Umbraco/Api/FindSchoolApi/FindSchoolListResults?skip=0&take=20",
    json=data).json()
with open("data.csv", 'w', newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Name", "Address", "Phone", "Email"])
    for item in r:
        writer.writerow(
            [item["Name"], item["FullAddress"], item["TelephoneNumber"], item["EmailAddress"]])
print("Done")
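The skip and take query parameters appear to control paging, so a hedged sketch for collecting more than the first 20 schools could loop, increasing skip, until the API returns an empty list (this stopping condition is an assumption; data is the filter payload defined in the answer above):

import requests

api = "https://www.isc.co.uk/Umbraco/Api/FindSchoolApi/FindSchoolListResults"
all_schools = []
skip = 0
while True:
    # 'data' is the same filter payload defined in the answer above
    batch = requests.post(f"{api}?skip={skip}&take=20", json=data).json()
    if not batch:
        break
    all_schools.extend(batch)
    skip += 20
print(len(all_schools), "schools collected")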

Understanding Bearer Authorization for web scraping using python 3.8 and requests

So I am looking to scrape the following site:
https://hyland.csod.com/ux/ats/careersite/4/home?c=hyland
What I am running into using the Python Requests library is that the page requires me to pass along an Authorization header bearing a token of some kind. While I can get this to work if I manually go to the page, copy the token, and then run my program, I am wondering how I could bypass this issue (after all, what is the point of running a scraper if I still have to visit the actual site manually and retrieve the authorization token?).
I am new to authorization/bearer headers and am hoping someone might be able to clarify how the browser generates a token to retrieve this information and how I can simulate this. Here is my code:
import requests
import json
import datetime
today = datetime.datetime.today()
url = "https://hyland.csod.com/services/x/career-site/v1/search"
# actual site: https://hyland.csod.com/ux/ats/careersite/4/home?c=hyland
headers = {
'authority': 'hyland.csod.com',
'origin': 'https://hyland.csod.com',
'authorization': 'Bearer eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCIsImNsaWQiOiI0bDhnbnFhbGk3NjgifQ.eyJzdWIiOi0xMDMsImF1ZCI6IjRxNTFzeG5oY25yazRhNXB1eXZ1eGh6eCIsImNvcnAiOiJoeWxhbmQiLCJjdWlkIjoxLCJ0emlkIjoxNCwibmJkIjoiMjAxOTEyMzEyMTE0MTU5MzQiLCJleHAiOiIyMDE5MTIzMTIyMTUxNTkzNCIsImlhdCI6IjIwMTkxMjMxMjExNDE1OTM0In0.PlNdWXtb1uNoMuGIhI093ZbheRN_DwENTlkNoVr0j7Zah6JHd5cukudVFnZEiQmgBZ_nlDU4C-9JO_2We380Vg',
'content-type': 'application/json',
'accept': 'application/json; q=1.0, text/*; q=0.8, */*; q=0.1',
'x-requested-with': 'XMLHttpRequest',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
'csod-accept-language': 'en-US',
'referer': 'https://hyland.csod.com/ux/ats/careersite/4/home?c=hyland',
'accept-encoding': 'gzip, deflate, br',
'cookie': 'CYBERU_lastculture=en-US; ASP.NET_SessionId=4q51sxnhcnrk4a5puyvuxhzx; cscx=hyland^|-103^|1^|14^|KumB4VhzYXML22MnMxjtTB9SKgHiWW0tFg0HbHnOek4=; c-s=expires=1577909201~access=/clientimg/hyland/*^!/content/hyland/*~md5=78cd5252d2efff6eb77d2e6bf0ce3127',
}
data = ['{"careerSiteId":4,"pageNumber":1,"pageSize":25,"cultureId":1,"searchText":"","cultureName":"en-US","states":["oh"],"countryCodes":[],"cities":[],"placeID":"","radius":null,"postingsWithinDays":null,"customFieldCheckboxKeys":[],"customFieldDropdowns":[],"customFieldRadios":[]}',
'{"careerSiteId":4,"pageNumber":2,"pageSize":25,"cultureId":1,"searchText":"","cultureName":"en-US","states":["oh"],"countryCodes":[],"cities":[],"placeID":"","radius":null,"postingsWithinDays":null,"customFieldCheckboxKeys":[],"customFieldDropdowns":[],"customFieldRadios":[]}']
def hyland(url, data):
    # for openings in data:
    dirty = requests.post(url, headers=headers, data=data).text
    if 'Unauthorized' in dirty:
        print(dirty)
        print("There was an error connecting. Check Info")
    # print(dirty)
    clean = json.loads(dirty)
    cleaner = json.dumps(clean, indent=4)
    print("Openings at Hyland Software in Westlake as of {}".format(today.strftime('%m-%d-%Y')))
    for i in range(0, 60):
        try:
            print(clean["data"]["requisitions"][i]["displayJobTitle"])
            print("")
            print("")
        except:
            print("{} Openings at Hyland".format(i))
            break

for datum in data:
    hyland(url, data=datum)
So basically what my code does is send a POST request to the URL above along with the headers and the data necessary to retrieve what I want. This scraper works for a short period of time, but if I leave and come back after a few hours it no longer works, due to authorization (at least that is what I have concluded).
Any help/clarification on how all this works would be greatly appreciated.
Your code has a few problems:
As you noted you have to get the bearer token
You have to send your requests using requests.session() (as this webpage seems to pay attention to the cookies you send)
Optional: your request included a lot of unnecessary headers that could be removed
All in all, here below is the working code:
import requests
import json
import datetime
today = datetime.datetime.today()
session = requests.session()
url = "https://hyland.csod.com:443/ux/ats/careersite/4/home?c=hyland"
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:71.0) Gecko/20100101 Firefox/71.0", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "en-US,en;q=0.5", "Accept-Encoding": "gzip, deflate", "DNT": "1", "Connection": "close", "Upgrade-Insecure-Requests": "1"}
raw = session.get(url, headers=headers).text
token = raw[raw.index("token")+8:]
token = token[:token.index("\"")]
bearer_token = f"Bearer {token}"
url = "https://hyland.csod.com/services/x/career-site/v1/search"
# actual site: https://hyland.csod.com/ux/ats/careersite/4/home?c=hyland
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:71.0) Gecko/20100101 Firefox/71.0", "Authorization": bearer_token}
data = ['{"careerSiteId":4,"pageNumber":1,"pageSize":25,"cultureId":1,"searchText":"","cultureName":"en-US","states":["oh"],"countryCodes":[],"cities":[],"placeID":"","radius":null,"postingsWithinDays":null,"customFieldCheckboxKeys":[],"customFieldDropdowns":[],"customFieldRadios":[]}',
'{"careerSiteId":4,"pageNumber":2,"pageSize":25,"cultureId":1,"searchText":"","cultureName":"en-US","states":["oh"],"countryCodes":[],"cities":[],"placeID":"","radius":null,"postingsWithinDays":null,"customFieldCheckboxKeys":[],"customFieldDropdowns":[],"customFieldRadios":[]}']
def hyland(url, data, session=session):
    # for openings in data:
    dirty = session.post(url, headers=headers, data=data).text
    if 'Unauthorized' in dirty:
        print(dirty)
        print("There was an error connecting. Check Info")
    # print(dirty)
    clean = json.loads(dirty)
    cleaner = json.dumps(clean, indent=4)
    print("Openings at Hyland Software in Westlake as of {}".format(today.strftime('%m-%d-%Y')))
    for i in range(0, 60):
        try:
            print(clean["data"]["requisitions"][i]["displayJobTitle"])
            print("")
            print("")
        except:
            print("{} Openings at Hyland".format(i))
            break

for datum in data:
    hyland(url, data=datum, session=session)
Hope this helps.
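One possible refinement (not part of the original answer): the string slicing used to pull out the token is fragile, so a regular expression may be more robust. This sketch assumes the landing page embeds the token as "token":"...", which is what the index-based slice above relies on.

import re
import requests

session = requests.session()
page_url = "https://hyland.csod.com/ux/ats/careersite/4/home?c=hyland"
raw = session.get(page_url, headers={"User-Agent": "Mozilla/5.0"}).text
# Look for a JSON-style "token":"..." pair anywhere in the page source.
match = re.search(r'"token"\s*:\s*"([^"]+)"', raw)
if not match:
    raise RuntimeError("Could not find a token in the page")
bearer_token = f"Bearer {match.group(1)}"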

I want to crawl a website with Python, but I ran into trouble: the requests library works, but Scrapy gets a 400 (code below)

import requests

urls = "https://pan.baidu.com/s/1sj1JLJv"
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    "Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.5,en;q=0.3",
    "Accept-Encoding": "gzip, deflate",
    'Content-Length': '0',
    "Connection": "keep-alive"
}
print(str((requests.get(urls, headers=headers)).content, 'utf-8'))
from scrapy_redis.spiders import RedisCrawlSpider

class baiduuSpider(RedisCrawlSpider):
    ...
    ...
    ...
    urls = "https://pan.baidu.com/s/1sj1JLJv"
    yield scrapy.Request(url=urls, headers=headers, callback=self.first_parse)

    def first_parse(self, response):
        print(response.body.decode('utf-8'))
How do I fix this?
I'm sorry, but you won't succeed this way, because the page loads its content dynamically.
It is necessary to execute JavaScript on the fly - Selenium, Splash
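A minimal Selenium sketch (an illustration only, assuming a local Chrome driver is installed) might look like this:

from selenium import webdriver

driver = webdriver.Chrome()  # assumes chromedriver is available on PATH
try:
    driver.get("https://pan.baidu.com/s/1sj1JLJv")
    html = driver.page_source  # HTML after the page's JavaScript has run
    print(html[:500])
finally:
    driver.quit()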
