Using Proxies in Python 3 - python-3.x

I am using proxies to build a crawler:
import requests
from bs4 import BeautifulSoup
import time
import random

time.sleep(3)

proxy_list = [
    '66.82.144.29:8080',
    '47.75.0.253:3129',
    '217.119.82.14:8080',
]
proxies = random.choice(proxy_list)

for i in range(20):
    url = "https://www.amazon.com/s/ref=sr_pg_{}".format(i) + "?fst=p90x%3A1%2Cas%3Aoff&rh=n%3A172282%2Cn%3A541966%2Cn%3A193870011%2Cn%3A172500%2Ck%3Acorsair+ddr4%2Cp_89%3ACorsair&page={}".format(i) + "&keywords=corsair+ddr4&ie=UTF8&qid=1522049082"
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0'}
    response = requests.get(url, verify=False, headers=headers, proxies=proxies)
    soup = BeautifulSoup(response.text.encode('utf-8'), 'html.parser')
but this error keeps popping up:
AttributeError Traceback (most recent call last)
<ipython-input-7-82c14c70f937> in <module>()
25 url = "https://www.amazon.com/s/ref=sr_pg_{}".format(i) + "?fst=p90x%3A1%2Cas%3Aoff&rh=n%3A172282%2Cn%3A541966%2Cn%3A193870011%2Cn%3A172500%2Ck%3Acorsair+ddr4%2Cp_89%3ACorsair&page={}".format(i) + "&keywords=corsair+ddr4&ie=UTF8&qid=1522049082"
26 headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0'}
---> 27 response = requests.get(url, verify=False, headers=headers, proxies=proxies)
28 soup = BeautifulSoup(response.text.encode('utf-8'), 'html.parser')
29 containers = soup.select('li.s-result-item.celwidget')
c:\users\terry\appdata\local\programs\python\python36-32\lib\site-packages\requests\api.py in get(url, params, **kwargs)
70
71 kwargs.setdefault('allow_redirects', True)
---> 72 return request('get', url, params=params, **kwargs)
73
74
c:\users\terry\appdata\local\programs\python\python36-32\lib\site-packages\requests\api.py in request(method, url, **kwargs)
56 # cases, and look like a memory leak in others.
57 with sessions.Session() as session:
---> 58 return session.request(method=method, url=url, **kwargs)
59
60
c:\users\terry\appdata\local\programs\python\python36-32\lib\site-packages\requests\sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
497
498 settings = self.merge_environment_settings(
--> 499 prep.url, proxies, stream, verify, cert
500 )
501
c:\users\terry\appdata\local\programs\python\python36-32\lib\site-packages\requests\sessions.py in merge_environment_settings(self, url, proxies, stream, verify, cert)
669 if self.trust_env:
670 # Set environment's proxies.
--> 671 no_proxy = proxies.get('no_proxy') if proxies is not None else None
672 env_proxies = get_environ_proxies(url, no_proxy=no_proxy)
673 for (k, v) in env_proxies.items():
AttributeError: 'str' object has no attribute 'get'
What is happening here?

When you pass proxies to the requests library, the object passed should be a mapping from protocol to proxy URL.
import requests
proxies = {
    'http': 'http://10.10.1.10:3128',
    'https': 'http://10.10.1.10:1080',
}
requests.get('http://example.org', proxies=proxies)
See http://docs.python-requests.org/en/master/user/advanced/#proxies
So in your case try:
proxy_list = [
    'http://66.82.144.29:8080',
    'http://47.75.0.253:3129',
    'http://217.119.82.14:8080',
]
selected = random.choice(proxy_list)
proxies = {protocol: selected for protocol in ('http', 'https')}
Note that you also have to include the protocol, as the proxy server itself could be using http, https, or indeed socks5.
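Putting that together with the loop from the question, a minimal sketch might look like the following (the headers are the ones from the question, the long query string is abbreviated to a comment, and picking a fresh proxy on every iteration is optional):

import random
import requests
from bs4 import BeautifulSoup

proxy_list = [
    'http://66.82.144.29:8080',
    'http://47.75.0.253:3129',
    'http://217.119.82.14:8080',
]
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0'}

for i in range(20):
    # Map the chosen proxy to both protocols so https requests are routed through it as well.
    selected = random.choice(proxy_list)
    proxies = {protocol: selected for protocol in ('http', 'https')}
    url = "https://www.amazon.com/s/ref=sr_pg_{}".format(i)  # plus the long query string from the question
    response = requests.get(url, headers=headers, proxies=proxies, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')

If you ever switch to SOCKS proxies, the proxy URLs would use the socks5:// scheme, which requires the optional dependency (pip install requests[socks]).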

Related

SSL error - Certificates Verify Failed when accessing website with Requests

I'm trying to connect to a website for web scraping, but I'm stuck on an SSL error.
import urllib, ssl
import certifi
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
}

with requests.Session() as s:
    url = "https://portal.librus.pl/rodzina/synergia/loguj"
    r = s.get(url, headers=headers)
    print(r.content)

which gives this error:
---------------------------------------------------------------------------
SSLCertVerificationError Traceback (most recent call last)
c:\Apps\Anaconda3\v3_8_5_x64\Local\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
669 # Make the request on the httplib connection object.
--> 670 httplib_response = self._make_request(
671 conn,
c:\Apps\Anaconda3\v3_8_5_x64\Local\lib\site-packages\urllib3\connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
380 try:
--> 381 self._validate_conn(conn)
382 except (SocketTimeout, BaseSSLError) as e:
c:\Apps\Anaconda3\v3_8_5_x64\Local\lib\site-packages\urllib3\connectionpool.py in _validate_conn(self, conn)
977 if not getattr(conn, "sock", None): # AppEngine might not have `.sock`
--> 978 conn.connect()
979
c:\Apps\Anaconda3\v3_8_5_x64\Local\lib\site-packages\urllib3\connection.py in connect(self)
361
--> 362 self.sock = ssl_wrap_socket(
363 sock=conn,
c:\Apps\Anaconda3\v3_8_5_x64\Local\lib\site-packages\urllib3\util\ssl_.py in ssl_wrap_socket(sock, keyfile, certfile, cert_reqs, ca_certs, server_hostname, ssl_version, ciphers, ssl_context, ca_cert_dir, key_password, ca_cert_data)
385 if HAS_SNI and server_hostname is not None:
--> 386 return context.wrap_socket(sock, server_hostname=server_hostname)
387
c:\Apps\Anaconda3\v3_8_5_x64\Local\lib\ssl.py in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, session)
499 # ctx._wrap_socket()
--> 500 return self.sslsocket_class._create(
501 sock=sock,
c:\Apps\Anaconda3\v3_8_5_x64\Local\lib\ssl.py in _create(cls, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, context, session)
1039 raise ValueError("do_handshake_on_connect should not be specified for non-blocking sockets")
-> 1040 self.do_handshake()
1041 except (OSError, ValueError):
c:\Apps\Anaconda3\v3_8_5_x64\Local\lib\ssl.py in do_handshake(self, block)
1308 self.settimeout(None)
-> 1309 self._sslobj.do_handshake()
1310 finally:
SSLCertVerificationError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1123)
During handling of the above exception, another exception occurred:
MaxRetryError Traceback (most recent call last)
c:\Apps\Anaconda3\v3_8_5_x64\Local\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
438 if not chunked:
--> 439 resp = conn.urlopen(
440 method=request.method,
c:\Apps\Anaconda3\v3_8_5_x64\Local\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
725
--> 726 retries = retries.increment(
727 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
c:\Apps\Anaconda3\v3_8_5_x64\Local\lib\site-packages\urllib3\util\retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
445 if new_retry.is_exhausted():
--> 446 raise MaxRetryError(_pool, url, error or ResponseError(cause))
447
MaxRetryError: HTTPSConnectionPool(host='portal.librus.pl', port=443): Max retries exceeded with url: /rodzina/synergia/loguj (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1123)')))
During handling of the above exception, another exception occurred:
SSLError Traceback (most recent call last)
<ipython-input-7-04d12114a1d8> in <module>
12 with requests.Session() as s:
13 url = "https://portal.librus.pl/rodzina/synergia/loguj"
---> 14 r = s.get(url, headers=headers)
15 soup = BeautifulSoup(r.content, "html5lib")
16 login_data["form_build_id"] = soup.find("input", attrs={"name": "form_build_id"})[
c:\Apps\Anaconda3\v3_8_5_x64\Local\lib\site-packages\requests\sessions.py in get(self, url, **kwargs)
541
542 kwargs.setdefault('allow_redirects', True)
--> 543 return self.request('GET', url, **kwargs)
544
545 def options(self, url, **kwargs):
c:\Apps\Anaconda3\v3_8_5_x64\Local\lib\site-packages\requests\sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
528 }
529 send_kwargs.update(settings)
--> 530 resp = self.send(prep, **send_kwargs)
531
532 return resp
c:\Apps\Anaconda3\v3_8_5_x64\Local\lib\site-packages\requests\sessions.py in send(self, request, **kwargs)
641
642 # Send the request
--> 643 r = adapter.send(request, **kwargs)
644
645 # Total elapsed time of the request (approximately)
c:\Apps\Anaconda3\v3_8_5_x64\Local\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
512 if isinstance(e.reason, _SSLError):
513 # This branch is for urllib3 v1.22 and later.
--> 514 raise SSLError(e, request=request)
515
516 raise ConnectionError(e, request=request)
SSLError: HTTPSConnectionPool(host='portal.librus.pl', port=443): Max retries exceeded with url: /rodzina/synergia/loguj (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1123)')))
Sorry for posting such a long error message, but maybe there are some useful details in it.
The website runs fine in the browser, but Python can't verify the certificates. Can you advise how to deal with this?
Thanks,
Paulina
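A minimal sketch of one common workaround, assuming the failure comes from the local CA store rather than the site itself: pass certifi's CA bundle (already imported in the question) explicitly via verify. If instead the server sends an incomplete certificate chain, this will not help and the missing intermediate certificate would have to be added to a custom bundle.

import certifi
import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
}

with requests.Session() as s:
    s.verify = certifi.where()  # use certifi's CA bundle instead of the default store
    r = s.get("https://portal.librus.pl/rodzina/synergia/loguj", headers=headers)
    print(r.status_code)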

Connection timeout on port 443 on linode instance

I have been trying to write a Python script and run it on one of my Linode instances. The OS on the instance is Debian 11. This is my code:
# Libraries
import requests
import json
import math
url_oc = "https://www.nseindia.com/option-chain"
url_bnf = 'https://www.nseindia.com/api/option-chain-indices?symbol=BANKNIFTY'
url_nf = 'https://www.nseindia.com/api/option-chain-indices?symbol=NIFTY'
url_indices = "https://www.nseindia.com/api/allIndices"
# Headers
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
           'accept-language': 'en,gu;q=0.9,hi;q=0.8',
           'accept-encoding': 'gzip, deflate, br'}
sess = requests.Session()
cookies = dict()
request = sess.get(url_oc, headers=headers, timeout=5)
cookies = dict(request.cookies)
When I run the program, it throws a read timeout error:
Traceback (most recent call last):
File "/home/python-ws/sniper1.py", line 19, in <module>
request = sess.get(url_oc, headers=headers, timeout=5)
File "/usr/local/lib/python3.9/dist-packages/requests/sessions.py", line 600, in get
return self.request("GET", url, **kwargs)
File "/usr/local/lib/python3.9/dist-packages/requests/sessions.py", line 587, in request
resp = self.send(prep, **send_kwargs)
File "/usr/local/lib/python3.9/dist-packages/requests/sessions.py", line 701, in send
r = adapter.send(request, **kwargs)
File "/usr/local/lib/python3.9/dist-packages/requests/adapters.py", line 578, in send
raise ReadTimeout(e, request=request)
requests.exceptions.ReadTimeout: HTTPSConnectionPool(host='www.nseindia.com', port=443): Read timed out. (read timeout=5)
I tried to:
increase the timeout, but the connection still times out
ping the URL directly and telnet to it on port 443, both of which succeeded
allow the port with ufw allow 443/tcp
But the connection is still timing out, even though the internet connection works. Given all of this, what could be the reason for the timeout?
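One way to narrow this down is a small diagnostic sketch (not a fix): requests accepts a separate connect and read timeout and raises a distinct exception for each, so you can confirm whether the TCP connection is established (matching the successful telnet) while the server simply never sends a response; in that case the problem is on the remote side rather than the local firewall.

import requests
from requests.exceptions import ConnectTimeout, ReadTimeout

url_oc = "https://www.nseindia.com/option-chain"
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
           'accept-language': 'en,gu;q=0.9,hi;q=0.8',
           'accept-encoding': 'gzip, deflate, br'}

try:
    # (connect timeout, read timeout): if the connect phase succeeds but the read
    # times out, the server accepted the connection and is simply not answering.
    r = requests.get(url_oc, headers=headers, timeout=(5, 30))
    print(r.status_code)
except ConnectTimeout:
    print("TCP connect timed out - points to a network or firewall problem")
except ReadTimeout:
    print("Connected, but the server never responded - the request is likely being ignored server-side")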

Connection aborted.', RemoteDisconnected('Remote end closed connection without response') while using python

Hello, I am attempting to reach https://api.louisvuitton.com/api/eng-us/catalog/availability/M80016 through a session using requests in Python. Currently I am unable to reach it and get the error "Remote end closed connection without response".
I have been trying to debug but haven't been successful. Below are my code and the output.
Code:
import requests
from requests.auth import HTTPBasicAuth
import json

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'}
s = requests.Session()
r = s.get("https://us.louisvuitton.com/eng-us/products/pocket-organizer-damier-graphite-nvprod2630093v#N60432", headers=headers)
if r:
    print("Requested Successfully")
else:
    print("Request Failed ==> " + str(r))
    exit()

url2 = "https://api.qubit.com/graphql"
payload = json.dumps({
    "query": "query ($trackingId: String!, $contextId: String!) {\n property(trackingId: $trackingId) {\n visitor(contextId: $contextId) {\n ipAddress\n ipLocation: location {\n city\n cityCode\n country\n countryCode\n latitude\n longitude\n area\n areaCode\n region\n regionCode\n }\n segment: segments {\n state\n }\n history {\n conversionCycleNumber: conversionCycle\n conversionNumber: conversions\n entranceNumber: entrances\n firstConversionTs: firstConversion\n firstViewTs: firstView\n lastConversionTs: lastConversion\n lastViewTs: lastView\n lifetimeValue\n sessionNumber: sessions\n viewNumber: views\n }\n }\n }\n}",
    "variables": {
        "trackingId": "louisvuitton_prod",
        "contextId": "o6vfrf9jm4g-0k999shdp-fiadwa4"
    }})
headers2 = {
    'Content-Type': 'application/json'
}
x = s.post(url2, headers=headers2, data=payload)
if x:
    print("Post Successfully")
else:
    print("Post Failed ==> " + str(x))
    exit()

headers3 = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)',
    'Accept': "*/*",
    'Cache-Control': "no-cache",
    'Host': "api.louisvuitton.com",
    'Accept-Encoding': "gzip, deflate",
    'Connection': "keep-alive",
    'cache-control': "no-cache",
    'Content-Type': 'application/json'
}
cookies = s.cookies
t = s.get("https://api.louisvuitton.com/api/eng-us/catalog/availability/M80016", headers=headers3, cookies=cookies)
if t:
    print("Get Successfully")
else:
    print("Get Failed ==> " + str(t))
    exit()
Output
Requested Successfully
Post Successfully
Traceback (most recent call last):
File "/usr/local/lib/python3.8/site-packages/urllib3-1.25.10-py3.8.egg/urllib3/connectionpool.py", line 670, in urlopen
httplib_response = self._make_request(
File "/usr/local/lib/python3.8/site-packages/urllib3-1.25.10-py3.8.egg/urllib3/connectionpool.py", line 426, in _make_request
six.raise_from(e, None)
File "<string>", line 3, in raise_from
File "/usr/local/lib/python3.8/site-packages/urllib3-1.25.10-py3.8.egg/urllib3/connectionpool.py", line 421, in _make_request
httplib_response = conn.getresponse()
File "/usr/local/Cellar/python#3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1347, in getresponse
response.begin()
File "/usr/local/Cellar/python#3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 307, in begin
version, status, reason = self._read_status()
File "/usr/local/Cellar/python#3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 276, in _read_status
raise RemoteDisconnected("Remote end closed connection without"
http.client.RemoteDisconnected: Remote end closed connection without response
Does anyone have a clue or an idea how to resolve this issue? I would appreciate any help.
If you inspect the cookies on the webpage in Chrome with Inspect Element -> Application -> Storage -> Cookies -> https://us.louisvuitton.com/, you see about 40 cookies. However, if you add import pprint to your code and, at line 50, pprint.pprint(s.cookies.get_dict()), you see only 4 cookies. So you are missing many cookies.
The response you get is actually an Access Denied message, as you can see if you use Inspect Element -> Network, copy as cURL on the https://api.louisvuitton.com/api/eng-us/catalog/availability/nvprod... URL, remove all the cookies except your 4, and run it; if you run it with all the cookies, it works fine.
So, as there are many XHR requests that can set cookies, I suggest you either go through all the requests, decoding them if needed, and read all the JavaScript files to see whether they set cookies, or, as a much easier solution, use Selenium, requests-html (https://pypi.org/project/requests-html/) or PyQt.
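For the Selenium route, a minimal sketch (assuming Chrome and chromedriver are installed, and that you may need to wait for the page's scripts to finish) would be to let a real browser collect the cookies and then copy them into the requests session before calling the availability endpoint:

import requests
from selenium import webdriver

PRODUCT_URL = "https://us.louisvuitton.com/eng-us/products/pocket-organizer-damier-graphite-nvprod2630093v#N60432"
API_URL = "https://api.louisvuitton.com/api/eng-us/catalog/availability/M80016"

driver = webdriver.Chrome()   # assumes chromedriver is available
driver.get(PRODUCT_URL)       # the page's JavaScript sets the ~40 cookies

s = requests.Session()
s.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'})
for cookie in driver.get_cookies():
    # Copy every browser cookie into the requests session.
    s.cookies.set(cookie['name'], cookie['value'], domain=cookie.get('domain'))
driver.quit()

t = s.get(API_URL, headers={'Accept': '*/*', 'Content-Type': 'application/json'})
print(t.status_code, t.text[:200])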

Error retrieving AppStore application's reviews using app-store-scraper?

Here is what I have tried and it didn't work:
First pip3 install app-store-scraper and then:
from app_store_scraper import AppStore
from pprint import pprint
appstore_app = AppStore(country="us", app_name="yazio-fasting-food-tracker", app_id=946099227)
appstore_app.review()
pprint(appstore_app.reviews)
pprint(appstore_app.reviews_count)
Error:
---------------------------------------------------------------------------
gaierror Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/urllib3/connection.py in _new_conn(self)
159 conn = connection.create_connection(
--> 160 (self._dns_host, self.port), self.timeout, **extra_kw
161 )
/usr/local/lib/python3.7/dist-packages/urllib3/util/connection.py in create_connection(address, timeout, source_address, socket_options)
60
---> 61 for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
62 af, socktype, proto, canonname, sa = res
/usr/lib/python3.7/socket.py in getaddrinfo(host, port, family, type, proto, flags)
747 addrlist = []
--> 748 for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
749 af, socktype, proto, canonname, sa = res
gaierror: [Errno -2] Name or service not known
During handling of the above exception, another exception occurred:
NewConnectionError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
676 headers=headers,
--> 677 chunked=chunked,
678 )
/usr/local/lib/python3.7/dist-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
380 try:
--> 381 self._validate_conn(conn)
382 except (SocketTimeout, BaseSSLError) as e:
/usr/local/lib/python3.7/dist-packages/urllib3/connectionpool.py in _validate_conn(self, conn)
975 if not getattr(conn, "sock", None): # AppEngine might not have `.sock`
--> 976 conn.connect()
977
/usr/local/lib/python3.7/dist-packages/urllib3/connection.py in connect(self)
307 # Add certificate verification
--> 308 conn = self._new_conn()
309 hostname = self.host
/usr/local/lib/python3.7/dist-packages/urllib3/connection.py in _new_conn(self)
171 raise NewConnectionError(
--> 172 self, "Failed to establish a new connection: %s" % e
173 )
NewConnectionError: <urllib3.connection.HTTPSConnection object at 0x7f1a3eb18390>: Failed to establish a new connection: [Errno -2] Name or service not known
During handling of the above exception, another exception occurred:
MaxRetryError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
448 retries=self.max_retries,
--> 449 timeout=timeout
450 )
/usr/local/lib/python3.7/dist-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
724 retries = retries.increment(
--> 725 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
726 )
/usr/local/lib/python3.7/dist-packages/urllib3/util/retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
438 if new_retry.is_exhausted():
--> 439 raise MaxRetryError(_pool, url, error or ResponseError(cause))
440
MaxRetryError: HTTPSConnectionPool(host='apps.apple.com', port=443): Max retries exceeded with url: /us/app/yazio-fasting-food-tracker/id946099227 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f1a3eb18390>: Failed to establish a new connection: [Errno -2] Name or service not known'))
During handling of the above exception, another exception occurred:
ConnectionError Traceback (most recent call last)
<ipython-input-4-c7ec3a01ece6> in <module>
2 from pprint import pprint
3
----> 4 appstore_app = AppStore(country="us", app_name="yazio-fasting-food-tracker", app_id=946099227)
5 appstore_app.review(how_many=1000)
6
~/.local/lib/python3.7/site-packages/app_store_scraper/app_store.py in __init__(self, country, app_name, app_id, log_format, log_level, log_interval)
27 log_format=log_format,
28 log_level=log_level,
---> 29 log_interval=log_interval,
30 )
31
~/.local/lib/python3.7/site-packages/app_store_scraper/base.py in __init__(self, country, app_name, app_id, log_format, log_level, log_interval)
62 self._request_headers = {
63 "Accept": "application/json",
---> 64 "Authorization": self._token(),
65 "Connection": "keep-alive",
66 "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
~/.local/lib/python3.7/site-packages/app_store_scraper/base.py in _token(self)
126
127 def _token(self):
--> 128 self._get(self.url)
129 tags = self._response.text.splitlines()
130 for tag in tags:
~/.local/lib/python3.7/site-packages/app_store_scraper/base.py in _get(self, url, headers, params, total, backoff_factor, status_forcelist)
123 s.mount(self._base_request_url, HTTPAdapter(max_retries=retries))
124 logger.debug(f"Making a GET request: {url}")
--> 125 self._response = s.get(url, headers=headers, params=params)
126
127 def _token(self):
/usr/local/lib/python3.7/dist-packages/requests/sessions.py in get(self, url, **kwargs)
541
542 kwargs.setdefault('allow_redirects', True)
--> 543 return self.request('GET', url, **kwargs)
544
545 def options(self, url, **kwargs):
/usr/local/lib/python3.7/dist-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
528 }
529 send_kwargs.update(settings)
--> 530 resp = self.send(prep, **send_kwargs)
531
532 return resp
/usr/local/lib/python3.7/dist-packages/requests/sessions.py in send(self, request, **kwargs)
641
642 # Send the request
--> 643 r = adapter.send(request, **kwargs)
644
645 # Total elapsed time of the request (approximately)
/usr/local/lib/python3.7/dist-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
514 raise SSLError(e, request=request)
515
--> 516 raise ConnectionError(e, request=request)
517
518 except ClosedPoolError as e:
ConnectionError: HTTPSConnectionPool(host='apps.apple.com', port=443): Max retries exceeded with url: /us/app/yazio-fasting-food-tracker/id946099227 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f1a3eb18390>: Failed to establish a new connection: [Errno -2] Name or service not known'))
Please advise how to deal with this error, or maybe there is another way to parse AppStore reviews.
I've played a bit with this API wrapper, and it seems the results you get back are either IP-based or there's something odd about the wrapper.
Anyhow, here's the code with which I got 533 reviews. The entire dump is on pastebin.
import json
from app_store_scraper import AppStore
from pprint import pprint
appstore_app = AppStore(country="us", app_name="yazio-fasting-food-tracker", app_id=946099227)
appstore_app.review()
reviews = appstore_app.reviews
pprint(appstore_app.reviews_count)
for review in reviews:
    review['date'] = review['date'].isoformat()

with open("data_dump.json", "w") as dd:
    json.dump(reviews, dd, indent=4, sort_keys=True)
This outputs:
2020-10-03 18:28:35,477 [INFO] Base - Initialised: AppStore('us', 'yazio-fasting-food-tracker', 946099227)
2020-10-03 18:28:35,477 [INFO] Base - Ready to fetch reviews from: https://apps.apple.com/us/app/yazio-fasting-food-tracker/id946099227
2020-10-03 18:28:40,681 [INFO] Base - [id:946099227] Fetched 260 reviews (260 fetched in total)
533
2020-10-03 18:28:46,415 [INFO] Base - [id:946099227] Fetched 533 reviews (533 fetched in total)

Python crawler encounters error.HTTPError: HTTP Error 403: Forbidden

My Python code already sets a User-Agent, but running it still produces the following error. What is the solution? The request headers obtained from the browser have been added, and it still doesn't help. PS: if I open the web page manually, I can access it normally, but when the code sends a request, it gets a 403:
import requests, time, os, urllib.request, socket
from bs4 import BeautifulSoup

def getimg():
    os.system("mkdir Pic")
    headers = {"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
               "Accept-Encoding": "gzip, deflate",
               "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7",
               "Cache-Control": "max-age=0",
               "Connection": "keep-alive",
               "Host": "cc.itbb.men",
               "Upgrade-Insecure-Requests": "1",
               "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
    r = requests.get("http://www.testowne.er/htm_data/8/1804/3099535.html", headers=headers)
    r.encoding = 'GBK'
    soup = BeautifulSoup(r.text, "html.parser")
    iname = 0
    for i in soup.find_all("input", type="image"):
        iname += 1
        i = i['src']
        print(i)
        urllib.request.urlretrieve(i, ".\\Pic\\%s" % str(iname))
========================output==============================================
Traceback (most recent call last):
File "getimg.py", line 70, in <module>
getimg()
File "getimg.py", line 41, in getimg
urllib.request.urlretrieve(i, ".\\Pic\\%s" % str(iname))
File "/usr/lib/python3.5/urllib/request.py", line 188, in urlretrieve
with contextlib.closing(urlopen(url, data)) as fp:
File "/usr/lib/python3.5/urllib/request.py", line 163, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python3.5/urllib/request.py", line 472, in open
response = meth(req, response)
File "/usr/lib/python3.5/urllib/request.py", line 582, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python3.5/urllib/request.py", line 510, in error
return self._call_chain(*args)
File "/usr/lib/python3.5/urllib/request.py", line 444, in _call_chain
result = func(*args)
File "/usr/lib/python3.5/urllib/request.py", line 590, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden
As explained in this answer:
This website is blocking the user-agent used by urllib, so you need to change it in your request. Unfortunately I don't think urlretrieve supports this directly.
However, using shutil.copyfileobj() to save the file didn't work for me. I used this instead:
r_img = requests.get(url, stream=True)
if r_img.status_code == 200:
    with open("img.jpg", 'wb') as f:
        f.write(r_img.content)
Full code:
import os
import requests
from bs4 import BeautifulSoup

def download_images(url: str) -> None:
    os.system('mkdir Pictures')
    r = requests.get(url)
    r.encoding = 'GBK'
    soup = BeautifulSoup(r.text, 'html.parser')
    for i, img in enumerate(soup.find_all('input', type='image')):
        img_url = img['src']
        print(i, img_url)
        r_img = requests.get(img_url, stream=True)
        if r_img.status_code == 200:
            with open(f'Pictures/pic{i}.jpg', 'wb') as f:
                f.write(r_img.content)

download_images('http://cc.itbb.men/htm_data/8/1804/3099535.html')
Notice the usage of an f-string to format the path. It is available in Python 3.6+; if you use an older version of Python, you can switch to either %-formatting or .format(). The type hints I added to the function signature are a Python 3.5+ feature; you can also omit them if you use an older Python.
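For completeness, if you would rather keep urllib for the download instead of switching to requests, a minimal sketch of setting the user-agent there (the point the quoted answer makes about urlretrieve not supporting it directly) could look like this; the header value is just an example:

import urllib.request

def urllib_download(img_url: str, path: str) -> None:
    # urlretrieve cannot set headers, but a Request object can carry a custom User-Agent.
    req = urllib.request.Request(
        img_url,
        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    )
    with urllib.request.urlopen(req) as resp, open(path, 'wb') as f:
        f.write(resp.read())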

Resources