Altair error in saving data into the dashboard - python-3.x
Hi all, I have a problem with Altair and Vega interacting with Elasticsearch. In detail: I have an index pattern generated from about 38.8 MB of data, on the order of 90k rows. I want to use this index pattern to build a visualization that lets me filter on two values. The idea is to represent the data over time, insert the visualization into a Kibana dashboard, and interact with it through the inserted filters. I managed to do that with the code below: first the saveVegaLiteVis / save-object helpers taken from the link, then the chart itself, which works as I want. The issue appears as the data grows: when the data is under 3 MB, on the order of 10k rows, I have no problem with the visualization.
import datetime
import json

def saveVegaLiteVis(client, index, visName, altairChart, resultSize=100, timeField=True):
    chart_json = json.loads(altairChart.to_json())
    # Replace the chart's inline data with an Elasticsearch query URL so Kibana fetches the data itself.
    chart_json['data']['url'] = {
        "%context%": True,
        "index": index,
        "body": {
            "size": resultSize
        }
    }
    if timeField:
        chart_json['data']['url']['%timefield%'] = "timestamp"

    visState = {
        "type": "vega",
        "aggs": [],
        "params": {
            "spec": json.dumps(chart_json, sort_keys=True, indent=4, separators=(',', ': ')),
        },
        "title": visName
    }

    visSavedObject = {
        "visualization": {
            "title": visName,
            "visState": json.dumps(visState, sort_keys=True, indent=4, separators=(',', ': ')),
            "uiStateJSON": "{}",
            "description": "",
            "version": 1,
            "kibanaSavedObjectMeta": {
                "searchSourceJSON": json.dumps({
                    "query": {
                        "language": "kuery",
                        "query": ""
                    },
                    "filter": []
                }),
            }
        },
        "type": "visualization",
        "references": [],
        "migrationVersion": {
            "visualization": "7.7.0"
        },
        "updated_at": datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.000Z")
    }

    # Write the saved object straight into the .kibana index.
    return client.index(index='.kibana', id='visualization:' + visName, body=visSavedObject)
def saveVegaVis(client, index, visName, altairChart, resultSize=100, timeField=True):
    chart_json = json.loads(altairChart.to_json())
    chart_json['spec']['data']['url'] = {
        "%context%": True,
        "index": index,
        "body": {
            "size": resultSize
        }
    }
    if timeField:
        chart_json['spec']['data']['url']['%timefield%'] = "timestamp"

    visState = {
        "type": "vega",
        "aggs": [],
        "params": {
            "spec": json.dumps(chart_json, sort_keys=True, indent=4, separators=(',', ': ')),
        },
        "title": visName
    }

    visSavedObject = {
        "visualization": {
            "title": visName,
            "visState": json.dumps(visState, sort_keys=True, indent=4, separators=(',', ': ')),
            "uiStateJSON": "{}",
            "description": "",
            "version": 1,
            "kibanaSavedObjectMeta": {
                "searchSourceJSON": json.dumps({
                    "query": {
                        "language": "kuery",
                        "query": ""
                    },
                    "filter": []
                }),
            }
        },
        "type": "visualization",
        "references": [],
        "migrationVersion": {
            "visualization": "7.7.0"
        },
        "updated_at": datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.000Z")
    }

    return client.index(index='.kibana', id='visualization:' + visName, body=visSavedObject)
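The two helpers differ only in where they patch the data URL: saveVegaLiteVis writes to chart_json['data']['url'], while saveVegaVis goes through chart_json['spec']['data']['url'] for charts whose exported JSON wraps everything in a top-level 'spec' key. As a rough illustration only (the saveChart name is mine, not part of the original code), a small dispatcher could pick the right helper by inspecting the exported JSON:

def saveChart(client, index, visName, altairChart, **kwargs):
    # Hypothetical convenience wrapper: look at the exported chart JSON and
    # delegate to whichever of the two helpers above matches its structure.
    chart_json = json.loads(altairChart.to_json())
    if 'spec' in chart_json:
        return saveVegaVis(client, index, visName, altairChart, **kwargs)
    return saveVegaLiteVis(client, index, visName, altairChart, **kwargs)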
def getDataBasedonIndex(index, idValue, es):
    es.indices.refresh(index=index)
    res = es.get(index=index, id=idValue)
    print(res['_source'])
import altair as alt

# df (the chart's source data) and the es client are defined elsewhere in the notebook.
input_dropdown = alt.binding_select(options=[[1,2,3,4,5],[4],[3],[1],[2],[5],[1,4,5],[3,4,5],[1,2,3,4],[1,3,5],[1,3,4],[1,2,5],[1,2,4],[1,2,3],[4,5],[4,2],[2,3],[1,2],[5,3],[3,4],[2,5],[1,4],[1,5],[1,2],[1,3]])
selection = alt.selection_single(fields=['NUMBER'], bind=input_dropdown, name='FIELD: ')
# dropdown
input_dropdown1 = alt.binding_select(options=[['M3','M4','M5','M6'],['M4'],['M3'],['M6'],['M5']])
selection1 = alt.selection_single(fields=['SHAPE TYPE'], bind=input_dropdown1, name='FIELD2: ')
# shape
selection_Operation = alt.selection_multi(fields=['NUMBER:N'], bind='legend')
shape_Operation = alt.condition(selection_Operation, alt.Shape('NUMBER:N'), alt.value('lightgray'))
color = alt.condition(selection, alt.Color('SHAPE TYPE:N'), alt.value('lightgray'))
# pan/zoom bound to the Alt key (both axes) and the Shift key (x axis only)
interaction1 = alt.selection_interval(bind='scales',
    on="[mousedown[event.altKey], mouseup] > mousemove",
    translate="[mousedown[event.altKey], mouseup] > mousemove!",
    zoom="wheel![event.altKey]")
interactionY = alt.selection_interval(bind='scales', encodings=['x'],
    on="[mousedown[event.shiftKey], mouseup] > mousemove",
    translate="[mousedown[event.shiftKey], mouseup] > mousemove!",
    zoom="wheel![event.shiftKey]")
ScatterLine = alt.Chart(df).mark_point(filled=True).encode(
    x=alt.X('#timestamp:T', title='TIMESTAMP'),
    y=alt.Y('value:Q', title='value'),
    color=color,
    shape=shape_Operation,
    tooltip=['value:N', 'NUMBER:N', 'SHAPE TYPE:N',
             alt.Tooltip('#timestamp:T', format='%Y-%m-%d %H:%M'), 'ID:N']
).add_selection(interaction1, interactionY, selection, selection1, selection_Operation
).resolve_scale(x='independent').transform_filter(selection & selection1)
ScatterLine
saveVegaLiteVis(es, 'index-pattern1', 'RapresentationPoint', ScatterLine, timeField=True)
As you can see below, there is an error generated only by the saveVegaLiteVis routine. I also tried calling the save routine with resultSize=1000000, but it generates the same error shown below. How can I solve this? I want to be sure that the problem is only in saving the object in order to insert it into the dashboard; the visualization itself works. (A short sketch for isolating the failing call is included after the traceback below.)
---------------------------------------------------------------------------
timeout Traceback (most recent call last)
/usr/lib/python3/dist-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
420 # Otherwise it looks like a bug in the code.
--> 421 six.raise_from(e, None)
422 except (SocketTimeout, BaseSSLError, SocketError) as e:
/usr/lib/python3/dist-packages/six.py in raise_from(value, from_value)
/usr/lib/python3/dist-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
415 try:
--> 416 httplib_response = conn.getresponse()
417 except BaseException as e:
/usr/lib/python3.8/http/client.py in getresponse(self)
1346 try:
-> 1347 response.begin()
1348 except ConnectionError:
/usr/lib/python3.8/http/client.py in begin(self)
306 while True:
--> 307 version, status, reason = self._read_status()
308 if status != CONTINUE:
/usr/lib/python3.8/http/client.py in _read_status(self)
267 def _read_status(self):
--> 268 line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
269 if len(line) > _MAXLINE:
/usr/lib/python3.8/socket.py in readinto(self, b)
668 try:
--> 669 return self._sock.recv_into(b)
670 except timeout:
timeout: timed out
During handling of the above exception, another exception occurred:
ReadTimeoutError Traceback (most recent call last)
~/.local/lib/python3.8/site-packages/elasticsearch/connection/http_urllib3.py in perform_request(self, method, url, params, body, timeout, ignore, headers)
250
--> 251 response = self.pool.urlopen(
252 method, url, body, retries=Retry(False), headers=request_headers, **kw
/usr/lib/python3/dist-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
718
--> 719 retries = retries.increment(
720 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
/usr/lib/python3/dist-packages/urllib3/util/retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
375 # Disabled, indicate to re-raise the error.
--> 376 raise six.reraise(type(error), error, _stacktrace)
377
/usr/lib/python3/dist-packages/six.py in reraise(tp, value, tb)
702 raise value.with_traceback(tb)
--> 703 raise value
704 finally:
/usr/lib/python3/dist-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
664 # Make the request on the httplib connection object.
--> 665 httplib_response = self._make_request(
666 conn,
/usr/lib/python3/dist-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
422 except (SocketTimeout, BaseSSLError, SocketError) as e:
--> 423 self._raise_timeout(err=e, url=url, timeout_value=read_timeout)
424 raise
/usr/lib/python3/dist-packages/urllib3/connectionpool.py in _raise_timeout(self, err, url, timeout_value)
329 if isinstance(err, SocketTimeout):
--> 330 raise ReadTimeoutError(
331 self, url, "Read timed out. (read timeout=%s)" % timeout_value
ReadTimeoutError: HTTPConnectionPool(host='localhost', port=9200): Read timed out. (read timeout=10)
During handling of the above exception, another exception occurred:
ConnectionTimeout Traceback (most recent call last)
<ipython-input-31-45635f0d8635> in <module>
----> 1 saveVegaLiteVis(es, 'index-pattern', 'RapresentationPoint', ScatterLine, timeField=True,resultSize=100000)
<ipython-input-3-984bfed0a208> in saveVegaLiteVis(client, index, visName, altairChart, resultSize, timeField)
46 }
47
---> 48 return client.index(index='.kibana',id='visualization:'+visName,body=visSavedObject)
49
50 def saveVegaVis(client, index, visName, altairChart, resultSize=100, timeField=True):
~/.local/lib/python3.8/site-packages/elasticsearch/client/utils.py in _wrapped(*args, **kwargs)
166 if p in kwargs:
167 params[p] = kwargs.pop(p)
--> 168 return func(*args, params=params, headers=headers, **kwargs)
169
170 return _wrapped
~/.local/lib/python3.8/site-packages/elasticsearch/client/__init__.py in index(self, index, body, doc_type, id, params, headers)
404 doc_type = "_doc"
405
--> 406 return self.transport.perform_request(
407 "POST" if id in SKIP_IN_PATH else "PUT",
408 _make_path(index, doc_type, id),
~/.local/lib/python3.8/site-packages/elasticsearch/transport.py in perform_request(self, method, url, headers, params, body)
413 raise e
414 else:
--> 415 raise e
416
417 else:
~/.local/lib/python3.8/site-packages/elasticsearch/transport.py in perform_request(self, method, url, headers, params, body)
379
380 try:
--> 381 status, headers_response, data = connection.perform_request(
382 method,
383 url,
~/.local/lib/python3.8/site-packages/elasticsearch/connection/http_urllib3.py in perform_request(self, method, url, params, body, timeout, ignore, headers)
261 raise SSLError("N/A", str(e), e)
262 if isinstance(e, ReadTimeoutError):
--> 263 raise ConnectionTimeout("TIMEOUT", str(e), e)
264 raise ConnectionError("N/A", str(e), e)
265
ConnectionTimeout: ConnectionTimeout caused by - ReadTimeoutError(HTTPConnectionPool(host='localhost', port=9200): Read timed out. (read timeout=10))
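To double-check that only the save step fails while the chart itself renders, here is a small sketch of my own (reusing es, ScatterLine and the helper defined above) that catches the client timeout explicitly:

from elasticsearch.exceptions import ConnectionTimeout

try:
    saveVegaLiteVis(es, 'index-pattern1', 'RapresentationPoint', ScatterLine, timeField=True)
except ConnectionTimeout as exc:
    # The chart already rendered, so a timeout here confirms that the failure
    # is in indexing the saved object, not in building the visualization.
    print('Saving the visualization timed out:', exc)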
I managed to solve the issue by setting a higher request timeout on the Elasticsearch client:
es = Elasticsearch([{'host': HOST_ADDRESS, 'port': THE_PORT}], timeout=30)
(This information is taken from another issue link.)
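If you prefer not to raise the timeout for every request, elasticsearch-py also accepts a per-call request_timeout. As a hedged variant (the 60-second value is only an example), the final line of the save helper could become:

return client.index(index='.kibana', id='visualization:' + visName,
                    body=visSavedObject, request_timeout=60)  # overrides the client-wide timeout for this call only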
To keep the system from crashing I also needed to increase the memory available to Kibana (the Node.js heap, not disk space): I added "--max-old-space-size=2048" in the environment section of my Kibana configuration (kibana.yml).