Connections aren't closing with Python3 asyncio concurrent HTTP get requests

I've just started using the asyncio library from Python 3.4 and wrote a small program that attempts to concurrently fetch 50 web pages at a time. The program blows up after a few hundred requests with a 'Too many open files' exception.
I thought my fetch method was closing the connections via the 'response.read_and_close()' call.
Any ideas what's going on here? Am I going about this problem the right way?
import asyncio
import aiohttp

@asyncio.coroutine
def fetch(url):
    response = yield from aiohttp.request('GET', url)
    response = yield from response.read_and_close()
    return response.decode('utf-8')

@asyncio.coroutine
def print_page(url):
    page = yield from fetch(url)
    # print(page)

@asyncio.coroutine
def process_batch_of_urls(round, urls):
    print("Round starting: %d" % round)
    coros = []
    for url in urls:
        coros.append(asyncio.Task(print_page(url)))
    yield from asyncio.gather(*coros)
    print("Round finished: %d" % round)

@asyncio.coroutine
def process_all():
    api_url = 'https://google.com'
    for i in range(10):
        urls = []
        for url in range(50):
            urls.append(api_url)
        yield from process_batch_of_urls(i, urls)

loop = asyncio.get_event_loop()
loop.run_until_complete(process_all())
The error I'm getting is:
Traceback (most recent call last):
File "/usr/local/lib/python3.4/site-packages/aiohttp/client.py", line 106, in request
File "/usr/local/lib/python3.4/site-packages/aiohttp/connector.py", line 135, in connect
File "/usr/local/lib/python3.4/site-packages/aiohttp/connector.py", line 242, in _create_connection
File "/usr/local/Cellar/python3/3.4.1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/asyncio/base_events.py", line 424, in create_connection
File "/usr/local/Cellar/python3/3.4.1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/asyncio/base_events.py", line 392, in create_connection
File "/usr/local/Cellar/python3/3.4.1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/socket.py", line 123, in __init__
OSError: [Errno 24] Too many open files
During handling of the above exception, another exception occurred:

Aha, I grok your problem.
An explicit connector can definitely solve the issue.
https://github.com/KeepSafe/aiohttp/pull/79 should fix it for implicit connectors too.
Thank you very much for finding the resource leak in aiohttp.
UPD.
aiohttp 0.8.2 does not have this problem.

OK, I finally got it to work.
It turns out I had to use a TCPConnector, which pools connections.
So I made this variable:
connector = aiohttp.TCPConnector(share_cookies=True, loop=loop)
and passed it to each GET request. My new fetch routine looks like this:
@asyncio.coroutine
def fetch(url):
    data = ""
    try:
        yield from asyncio.sleep(1)
        response = yield from aiohttp.request('GET', url, connector=connector)
    except Exception as exc:
        print('...', url, 'has error', repr(str(exc)))
    else:
        data = (yield from response.read()).decode('utf-8', 'replace')
        response.close()
    return data
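For anyone landing here on current aiohttp (3.x): the pooling that fixed this is now expressed through a shared ClientSession, and the connector's limit parameter caps how many sockets can be open at once. A minimal sketch of the same program on the modern API (the limit of 50 mirrors the question's batch size and is otherwise an arbitrary choice):

import asyncio
import aiohttp

async def fetch(session, url):
    # The shared session reuses pooled connections instead of opening
    # a fresh socket for every request.
    async with session.get(url) as response:
        return await response.text()

async def main(urls):
    # limit=50 caps simultaneously open connections, keeping the process
    # well under the OS file-descriptor limit.
    connector = aiohttp.TCPConnector(limit=50)
    async with aiohttp.ClientSession(connector=connector) as session:
        return await asyncio.gather(*(fetch(session, url) for url in urls))

asyncio.run(main(['https://google.com'] * 50))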

Related

asyncio.wait_for didn't cancel the task

After the asyncio.wait_for timeout, the task was not cancelled
The script below is a minimized reproduction. The TCP server just sends two characters 100 seconds after the client connects.
import sys
import asyncio
import socket

async def test_single_call():
    reader, writer = await asyncio.open_connection(host='127.0.0.1', port=8888)
    try:
        msg = await asyncio.wait_for(reader.read(1), timeout=3)
        print("Unexpected message received:", msg, file=sys.stderr)
        assert False
    except asyncio.TimeoutError:
        pass
    msg = await reader.read(1)

loop = asyncio.get_event_loop()
loop.run_until_complete(test_single_call())
loop.close()
The TCP client (code above) is expected to time out after 3 seconds and then read again, but it seems the task was not cancelled after the timeout. My Python version is 3.6.9.
Traceback (most recent call last):
File "tcpclient.py", line 17, in <module>
loop.run_until_complete(test_single_call())
File "/usr/lib/python3.6/asyncio/base_events.py", line 484, in run_until_complete
return future.result()
File "tcpclient.py", line 14, in test_single_call
msg = await reader.read(1)
File "/usr/lib/python3.6/asyncio/streams.py", line 634, in read
yield from self._wait_for_data('read')
File "/usr/lib/python3.6/asyncio/streams.py", line 452, in _wait_for_data
'already waiting for incoming data' % func_name)
RuntimeError: read() called while another coroutine is already waiting for incoming data
I also uploaded the tcp server here
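The linked server isn't reproduced in the post, but a minimal asyncio sketch matching the description (listens on 127.0.0.1:8888 and sends two characters 100 seconds after a client connects; the b'ok' payload and the 3.7+ serve_forever API are my assumptions) might look like:

import asyncio

async def handle_client(reader, writer):
    # Per the description above: wait 100 seconds, then send two chars.
    await asyncio.sleep(100)
    writer.write(b'ok')  # arbitrary two-byte payload
    await writer.drain()
    writer.close()

async def main():
    server = await asyncio.start_server(handle_client, '127.0.0.1', 8888)
    async with server:  # requires Python 3.7+
        await server.serve_forever()

asyncio.run(main())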
On Linux with Python 3.6, asyncio had this issue. Two options:
Upgrade to Python 3.8 or 3.9
Replace open_connection and StreamReader with loop.sock_connect, loop.sock_recv, and loop.sock_sendall
e.g.:
import sys
import asyncio
import socket

server_address = ('127.0.0.1', 8888)  # assumption: same server as in the snippet above

async def test_single_call(loop):
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.setblocking(False)  # the loop.sock_* APIs require a non-blocking socket
    await loop.sock_connect(sock, server_address)
    try:
        msg = await asyncio.wait_for(loop.sock_recv(sock, 1), timeout=3)
        print("Unexpected message received:", msg, file=sys.stderr)
        assert False
    except asyncio.TimeoutError:
        pass
    msg = await loop.sock_recv(sock, 1)

loop = asyncio.get_event_loop()
loop.run_until_complete(test_single_call(loop))
loop.close()
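As I understand the underlying bug: on 3.6, wait_for could raise TimeoutError while the cancelled reader.read(1) task was still registered as the stream's internal waiter, so the next read() collided with it; newer Pythons make wait_for wait for the cancellation to actually complete before raising, which is why upgrading also fixes it.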

OS Error while doing requests for images with multiple threads in python

I'm making a program that scrapes info about games from a website, including images. Since I'm trying to download info for every game on the site, using a single thread on a 1 Mbps connection would be very painful, so I decided to spawn one thread for each letter of the alphabet that a game title starts with (games can be filtered that way). Inside the function that downloads the image for a given game, while I have more than one thread running, sooner or later an error is raised; then inside the except block that handles it, another exception is raised, and so on, over and over. This quickly kills the threads, but when I'm left with only a single thread, that thread carries on without any trouble.
Question:
How to solve this, and why is it happening?
Deduction:
I think that when multiple threads hit the requests.get line inside the download_image function (the function where the problem must lie), it may fail because of the simultaneous requests; that is as far as I can guess.
I really have no idea how to solve this, so I would appreciate any help. Thanks in advance.
I got rid of all the functions that have nothing to do with the problem described above.
I spawn the threads at the end of the program, and each thread's target function is get_all_games_from_letter.
CODE
from bs4 import BeautifulSoup
from string import ascii_lowercase
from datetime import date
from vandal_constants import *
from PIL import Image
from requests.exceptions import ConnectionError
from exceptions import NoTitleException
from validator_collection import url as url_check
from rawgpy import RAWG
from io import BytesIO
import traceback
import requests
import threading
import sqlite3
import concurrent.futures

### GLOBALS #####
FROM_RAWG = False
INSERT_SQL = ''

# CONSTANTS ########
rawg = RAWG('A Collector')
#################

def download_image(tag=None, game=None, rawg_game=None):
    if tag:
        return sqlite3.Binary(requests.get(url).content) if (url := tag['data-src']) else None
    elif game:
        global FROM_RAWG
        img_tag = game.select_one(IMG_TAG_SELECTOR)
        if img_tag and img_tag.get('data-src', None):
            try:
                if url_check(img_tag['data-src']):
                    return sqlite3.Binary(requests.get(img_tag['data-src']).content)
                print(f"{img_tag['data-src']} is NOT a valid url")
            except ConnectionError:
                try:
                    print('Error While downloading from "Vandal.elespannol.com" website:')
                    traceback.print_exc()
                except Exception:
                    print('Another Exception Occurred')
                    traceback.print_exc()
            except OSError:
                print('Looks like an error in the handshake')
                traceback.print_exc()
        FROM_RAWG = True
        if rawg_game and getattr(rawg_game, 'background_image', None):
            try:
                print('Continue to download from RAWG')
                return sqlite3.Binary(requests.get(rawg_game.background_image).content)
            except ConnectionError:
                print('Error While downloading from RAWG:')
                traceback.print_exc()
    return None

def prepare_game_record(game, db_games_set):
    global INSERT_SQL
    title = getattr(game.select_one(TITLE_TAG_SELECTOR), 'text', None)
    if not title:
        raise NoTitleException()
    if title in db_games_set:
        print(f'Already Have {title} in database')
        return None
    description = game.select_one(DESCRIPTION_TAG_SELECTOR)
    rawg_game = None
    try:
        rawg_game = rawg.search(title)[0]
    except Exception as err:
        print('No rawg')
        traceback.print_exc()
    game_data = {
        'nombre': title,
        'descripcion': description.text if description else rawg_game.description if rawg_game else '',
        'genero': genres if (genres := translate_genres(game.select_one(GENRES_TAG_SELECTOR).contents[1].strip().split(' / '))) else '',
        'fondo': resize_image(img) if (img := download_image(game=game, rawg_game=rawg_game)) and not FROM_RAWG else img,
        'year': None,
    }
    if not INSERT_SQL:
        INSERT_SQL = construct_sql_insert(**game_data)
    if hasattr(rawg_game, 'released'):
        game_data['year'] = date.fromisoformat(rawg_game.released).year
    return tuple(game_data.values())

def get_all_games_from_letter(letter):
    global FROM_RAWG
    counter = 36
    hashes_set = set()
    with sqlite3.connect('/media/l0new0lf/LocalStorage/data.db') as connection:
        cursor = connection.cursor()
        cursor.execute(f'SELECT nombre FROM juegos where nombre like "{letter.upper()}%"')
        db_games_set = []
        for row in cursor:
            db_games_set.append(row[0])
        db_games_set = set(db_games_set)
        while True:
            try:
                prepared_games = []
                rq = requests.get(
                    f'https://vandal.elespanol.com/juegos/13/pc/letra/{letter}/inicio/{counter}')
                if rq:
                    print('Request GET: from ' +
                          f'https://vandal.elespanol.com/juegos/13/pc/letra/{letter}/inicio/{counter}' + ' Got Workable HTML !')
                else:
                    print('Request GET: from ' +
                          f'https://vandal.elespanol.com/juegos/13/pc/letra/{letter}/inicio/{counter}' + ' Not Working !!, getting next page!')
                    continue
                if rq.status_code in (301, 302, 303, 304):
                    print(f'No more games in letter {letter}\n**REDIRECTING TO **')
                    break
                counter += 1
                soup = BeautifulSoup(rq.content, 'lxml')
                main_table = soup.select_one(GAME_SEARCH_RESULTS_TABLE_SELECTOR)
                if hash(main_table.get_text()) not in hashes_set:
                    hashes_set.add(hash(main_table.get_text()))
                else:
                    print('Repeated page ! I\'m done with this letter.')
                    break
                game_tables = main_table.find_all(
                    'table', {'class': GAME_TABLES_CLASS})
                print('entering game_tables loop')
                for game in game_tables:
                    FROM_RAWG = False
                    try:
                        game_record = prepare_game_record(game, db_games_set)
                    except NoTitleException:
                        print('There is no title for this game, DISCARDING!')
                        continue
                    except Exception as err:
                        print('Unknown ERROR in prepare_games_record function')
                        traceback.print_exc()
                        continue
                    if not game_record:
                        continue
                    prepared_games.append(game_record)
                    print('Game successfully prepared !')
                if prepared_games:
                    print('Thread, Writing to Database')
                    try:
                        cursor.executemany(INSERT_SQL, prepared_games)
                        connection.commit()
                    except Exception as err:
                        print(err)
                print('done')
            except Exception as err:
                print('TRULY UNEXPECTED EXCEPTION')
                print(err)
                traceback.print_exc()
                continue

# get_all_games_from_letter('c')  # with a single thread, no trouble at all!
with concurrent.futures.ThreadPoolExecutor(len(ascii_lowercase)) as executor:
    for letter in ascii_lowercase:
        executor.submit(get_all_games_from_letter, letter)
Error Stack Trace:
Note: this is only part of the errors; the rest are the very same.
Game successfully prepared !
Error While downloading from "Vandal.elespannol.com" website:
Traceback (most recent call last):
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 665, in urlopen
httplib_response = self._make_request(
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 376, in _make_request
self._validate_conn(conn)
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 996, in _validate_conn
conn.connect()
File "/usr/lib/python3/dist-packages/urllib3/connection.py", line 366, in connect
self.sock = ssl_wrap_socket(
File "/usr/lib/python3/dist-packages/urllib3/util/ssl_.py", line 370, in ssl_wrap_socket
return context.wrap_socket(sock, server_hostname=server_hostname)
File "/usr/lib/python3.8/ssl.py", line 500, in wrap_socket
return self.sslsocket_class._create(
File "/usr/lib/python3.8/ssl.py", line 1040, in _create
self.do_handshake()
File "/usr/lib/python3.8/ssl.py", line 1309, in do_handshake
self._sslobj.do_handshake()
OSError: [Errno 0] Error
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/lib/python3/dist-packages/requests/adapters.py", line 439, in send
resp = conn.urlopen(
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 719, in urlopen
retries = retries.increment(
File "/usr/lib/python3/dist-packages/urllib3/util/retry.py", line 400, in increment
raise six.reraise(type(error), error, _stacktrace)
File "/usr/lib/python3/dist-packages/six.py", line 702, in reraise
raise value.with_traceback(tb)
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 665, in urlopen
httplib_response = self._make_request(
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 376, in _make_request
self._validate_conn(conn)
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 996, in _validate_conn
conn.connect()
File "/usr/lib/python3/dist-packages/urllib3/connection.py", line 366, in connect
self.sock = ssl_wrap_socket(
File "/usr/lib/python3/dist-packages/urllib3/util/ssl_.py", line 370, in ssl_wrap_socket
return context.wrap_socket(sock, server_hostname=server_hostname)
File "/usr/lib/python3.8/ssl.py", line 500, in wrap_socket
return self.sslsocket_class._create(
File "/usr/lib/python3.8/ssl.py", line 1040, in _create
self.do_handshake()
File "/usr/lib/python3.8/ssl.py", line 1309, in do_handshake
self._sslobj.do_handshake()
urllib3.exceptions.ProtocolError: ('Connection aborted.', OSError(0, 'Error'))
To solve the problem, all you need is a global lock, so that when a thread tries to requests.get an image, it first has to check whether another thread is already downloading one. In other words, image downloads are restricted to one thread at a time across all threads.
####### GLOBALS ####
lock = threading.Lock()  # add this to the global variables
##################

def download_image(tag=None, game=None, rawg_game=None):
    if tag:
        return sqlite3.Binary(requests.get(url).content) if (url := tag['data-src']) else None
    elif game:
        global FROM_RAWG
        img_tag = game.select_one(IMG_TAG_SELECTOR)
        if img_tag and img_tag.get('data-src', None):
            try:
                if url_check(img_tag['data-src']):
                    # Acquire the lock for downloading: other threads must wait
                    # until the one holding it finishes. The with-statement also
                    # releases the lock if requests.get raises.
                    with lock:
                        return sqlite3.Binary(requests.get(img_tag['data-src']).content)
                print(f"{img_tag['data-src']} is NOT a valid url")
            except ConnectionError:
                try:
                    print('Error While downloading from "Vandal.elespannol.com" website:')
                    traceback.print_exc()
                except Exception:
                    print('Another Exception Occurred')
                    traceback.print_exc()
            except OSError:
                print('Looks like an error in the handshake')
                traceback.print_exc()
        FROM_RAWG = True
        if rawg_game and getattr(rawg_game, 'background_image', None):
            try:
                print('Continue to download from RAWG')
                with lock:  # same deal: one download at a time
                    return sqlite3.Binary(requests.get(rawg_game.background_image).content)
            except ConnectionError:
                print('Error While downloading from RAWG:')
                traceback.print_exc()
    return None
And done, no more trouble downloading images from multiple threads... but I still don't actually know why I need to make sure only one requests.get runs at a time across all threads; I thought the OS handled this with queues or something.
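As a side note, if serializing every download turns out to be too slow, the same idea generalizes to a semaphore, which bounds how many downloads run at once instead of forcing exactly one. A rough sketch (the limit of 4 is an arbitrary assumption, and fetch_image is a hypothetical helper, not a function from the code above):

import threading
import sqlite3
import requests

MAX_CONCURRENT_DOWNLOADS = 4  # arbitrary; tune for your bandwidth and FD limits
download_slots = threading.BoundedSemaphore(MAX_CONCURRENT_DOWNLOADS)

def fetch_image(url):
    # The with-block releases the slot even if requests.get raises,
    # so a failed download can never starve the other threads.
    with download_slots:
        return sqlite3.Binary(requests.get(url).content)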

How to use timeout to stop blocking function subscribe.simple

I want to use a timeout to stop the blocking MQTT function subscribe.simple. I'm using the timeout_decorator module; it can stop an ordinary function, but it cannot stop the blocking function subscribe.simple.
The following code runs successfully
import time
import timeout_decorator

@timeout_decorator.timeout(5, timeout_exception=StopIteration)
def mytest():
    print("Start")
    for i in range(1, 10):
        time.sleep(1)
        print("{} seconds have passed".format(i))

if __name__ == '__main__':
    mytest()
The result is as follows:
Start
1 seconds have passed
2 seconds have passed
3 seconds have passed
4 seconds have passed
Traceback (most recent call last):
File "timeutTest.py", line 12, in <module>
mytest()
File "/home/gyf/.local/lib/python3.5/site-packages/timeout_decorator/timeout_decorator.py", line 81, in new_function
return function(*args, **kwargs)
File "timeutTest.py", line 8, in mytest
time.sleep(1)
File "/home/gyf/.local/lib/python3.5/site-packages/timeout_decorator/timeout_decorator.py", line 72, in handler
_raise_exception(timeout_exception, exception_message)
File "/home/gyf/.local/lib/python3.5/site-packages/timeout_decorator/timeout_decorator.py", line 45, in _raise_exception
raise exception()
timeout_decorator.timeout_decorator.TimeoutError: 'Timed Out'
But I failed with the subscribe.simple API:
import timeout_decorator
from paho.mqtt import subscribe, publish  # MQTT_IP and MQTT_PORT are defined elsewhere

@timeout_decorator.timeout(5)
def sub():
    # print(type(msg))
    print("----before simple")
    # threading.Timer(5, operateFail, args=)
    msg = subscribe.simple("paho/test/simple", hostname=MQTT_IP, port=MQTT_PORT)
    print("----after simple")
    return msg

publish.single("paho/test/single", "cloud to device", qos=2, hostname=MQTT_IP, port=MQTT_PORT)
try:
    print("pub")
    msg = sub()
    print(msg)
except StopIteration as identifier:
    print("error")
The result waits forever:
pub
----before simple
I want the function that wraps the subscribe.simple call to stop after 5 seconds.
Asyncio can't handle a blocking function in the same thread, which is why using asyncio.wait_for alone failed. However, inspired by this blog post, I used loop.run_in_executor to keep control over the blocking call.
from paho.mqtt import subscribe
import asyncio

MQTT_IP = "localhost"
MQTT_PORT = 1883
msg = None

def possibly_blocking_function():
    global msg
    print("listening for message")
    msg = subscribe.simple(
        "paho/test/simple",
        hostname=MQTT_IP,
        port=MQTT_PORT,
    )
    print("message received!")

async def main():
    print("----before simple")
    try:
        await asyncio.wait_for(
            loop.run_in_executor(None, possibly_blocking_function), timeout=5
        )
    except asyncio.TimeoutError:
        pass
    print("----after simple")

loop = asyncio.get_event_loop()
loop.run_until_complete(main())
Output :
----before simple
listening for message
----after simple
Please note this is not perfect: the program won't exit on its own, since the executor thread is still running. You can work around that in various ways, but that is out of scope here; I am still looking for a clean way to close that stuck thread.
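An alternative that avoids the stuck thread altogether is to skip subscribe.simple and drive a paho client loop yourself against a deadline. A rough sketch, assuming paho-mqtt 1.x and the same broker settings as above:

import time
import paho.mqtt.client as mqtt

def sub_with_timeout(topic, host, port, timeout=5.0):
    messages = []
    client = mqtt.Client()
    client.on_message = lambda c, userdata, message: messages.append(message)
    client.connect(host, port)
    client.subscribe(topic)
    deadline = time.monotonic() + timeout
    # loop() processes network events for at most one second per call,
    # so the deadline is checked frequently and nothing blocks forever.
    while not messages and time.monotonic() < deadline:
        client.loop(timeout=1.0)
    client.disconnect()
    return messages[0] if messages else None

msg = sub_with_timeout("paho/test/simple", MQTT_IP, MQTT_PORT)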

Python 3.5.2: socket.timeout exception causes typeerror

I'm a bit of a Python newbie and this is my first post to Stack Overflow, so please bear with me. :)
Before posting I searched Google and Stack Overflow but can't seem to find anything similar to my issue.
I have a script that polls a website and retrieves the content.
It works fine for hours, but if it encounters a socket timeout the script throws a TypeError even though I have an exception handler for it.
I'm sure I'm missing something obvious but can't put my finger on it.
Code:
timingout = 10

def get_url(url):
    try:
        sock = urllib.request.urlopen(url, timeout=timingout)
        orig_html = sock.read()
        html = orig_html.decode("utf-8", errors="ignore").encode('cp1252', errors='ignore')
        sock.close()
    except KeyboardInterrupt:
        # Kill program if Control-C is pressed
        sys.exit(0)
    except urllib.error.URLError as e:
        print("***Error= Page ", e.reason)
        return
    except timingout:
        print("socket timed out - URL: %s", url)
    else:
        # See if site is Down or errors eg: 404
        if html == None:
            print("page contains no content!?!")
            return ''
        # See if site is complaining
        elif html == site_overload:
            if _verbose:
                print('ERROR: Too many requests - SLEEPING 600 secs')
            time.sleep(600)
            return ''
        # If not, we are good
        elif html:
            return html
error:
return self._sock.recv_into(b)
**socket.timeout: timed out**
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "test.py", line 201, in <module>
main()
File "test.py", line 140, in main
http_text = get_text(site_id)
File "test.py", line 110, in get_text
return get_url(url)
File "test.py", line 59, in get_url
except timingout:
**TypeError: catching classes that do not inherit from BaseException is not allowed**
Thanks in advance for any suggestions & help!
It's caused by trying to use timingout to catch an exception. timingout is an integer object, whereas the except statement only accepts classes that inherit from BaseException.
Remove that except clause because it doesn't do anything, or catch socket.timeout instead. Also consider revising your try statement to include only a single operation; it will make troubleshooting easier and save you from having to break up your try statements later when a bug occurs.
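For illustration, a minimal corrected version of the handler: the traceback shows urllib raising socket.timeout on a read timeout, so that is the class to catch (this sketch drops the site_overload handling for brevity):

import socket
import sys
import urllib.request
import urllib.error

TIMEOUT_SECS = 10

def get_url(url):
    try:
        sock = urllib.request.urlopen(url, timeout=TIMEOUT_SECS)
        html = sock.read()
        sock.close()
    except KeyboardInterrupt:
        sys.exit(0)
    except socket.timeout:
        # A class, not an integer, goes in the except clause.
        print("socket timed out - URL: %s" % url)
        return ''
    except urllib.error.URLError as e:
        # Some timeouts also surface as URLError with e.reason set to a socket.timeout.
        print("***Error= Page ", e.reason)
        return ''
    return html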

AttributeError: 'module' object has no attribute 'ensure_future'

Hi, I am writing a network-bound server application using Python asyncio which can accept a POST request.
In the POST request I accept a symbols parameter.
Please tell me the best way to deal with a network-bound application where I collect the data from other web APIs by sending POST requests to them.
Following is the code:
import asyncio
import aiohttp
import json
import logging

# async def fetch_content(url, symbols):
#     yield from aiohttp.post(url, symbols=symbols)

@asyncio.coroutine
def fetch_page(writer, url, data):
    response = yield from aiohttp.post(url, data=data)
    resp = yield from response.read_and_close()
    print(resp)
    writer.write(resp)
    return

@asyncio.coroutine
def process_payload(writer, data, scale):
    tasks = []
    data = data.split('\r\n\r\n')[1]
    data = data.split('\n')
    data = [x.split(':') for x in data]
    print(data)
    data = {x[0]: x[1] for x in data}
    print(data)
    # data = data[0].split(':')[1]
    data = data['symbols']
    print(data)
    data = data.split(',')
    data_len = len(data)
    data_first = 0
    data_last = scale
    url = 'http://xxxxxx.xxxxxx.xxx/xxxx/xxxx'
    while data_last < data_len:
        tasks.append(asyncio.ensure_future(fetch_page(writer, url, {'symbols': ",".join(data[data_first:data_last])})))
        data_first += scale
        data_last += scale
    tasks.append(asyncio.ensure_future(fetch_page(writer, url, {'symbols': ",".join(data[data_first:data_last])})))
    loop.run_until_complete(tasks)
    return

@asyncio.coroutine
def process_url(url):
    pass

@asyncio.coroutine
def echo_server():
    yield from asyncio.start_server(handle_connection, 'xxxxxx.xxxx.xxx', 3000)

@asyncio.coroutine
def handle_connection(reader, writer):
    data = yield from reader.read(8192)
    if data:
        message = data.decode('utf-8')
        print(message)
        yield from process_payload(writer, message, 400)
        writer.write_eof()
        writer.close()

# url = 'http://XXXXXXX.xxxxx.xxx/xxxx/xxxxxx/xxx'
data = {'symbols': 'GD-US,14174T10,04523Y10,88739910,03209R10,46071F10,77543110,92847N10'}
loop = asyncio.get_event_loop()
loop.run_until_complete(echo_server())
try:
    loop.run_forever()
finally:
    loop.close()
But I am receiving the following error:
future: <Task finished coro=<handle_connection() done, defined at fql_server_async_v2.py:53> exception=AttributeError("'module' object has no attribute 'ensure_future'",)>
Traceback (most recent call last):
File "/home/user/anugupta/lib/python3.4/asyncio/tasks.py", line 234, in _step
result = coro.send(value)
File "fql_server_async_v2.py", line 60, in handle_connection
yield from process_payload(writer, message, 400)
File "/home/user/anugupta/lib/python3.4/asyncio/coroutines.py", line 141, in coro
res = func(*args, **kw)
File "fql_server_async_v2.py", line 41, in process_payload
tasks.append(asyncio.ensure_future(fetch_page(writer, url, {'symbols':",".join(data[data_first:data_last])})))
AttributeError: 'module' object has no attribute 'ensure_future'
^CTraceback (most recent call last):
File "fql_server_async_v2.py", line 72, in <module>
loop.run_forever()
File "/home/user/anugupta/lib/python3.4/asyncio/base_events.py", line 236, in run_forever
self._run_once()
File "/home/user/anugupta/lib/python3.4/asyncio/base_events.py", line 1017, in _run_once
event_list = self._selector.select(timeout)
File "/home/user/anugupta/lib/python3.4/selectors.py", line 424, in select
fd_event_list = self._epoll.poll(timeout, max_ev)
ensure_future was added in asyncio 3.4.4; use asyncio.async for earlier versions.
While async is deprecated now, it will be supported for the foreseeable future.
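A small compatibility shim along those lines, if you need to run on both old and new releases (some_coroutine is a hypothetical placeholder):

import asyncio

# asyncio.ensure_future appeared in Python 3.4.4; older releases spell it
# asyncio.async ('async' later became a keyword, hence the getattr lookup).
try:
    ensure_future = asyncio.ensure_future
except AttributeError:
    ensure_future = getattr(asyncio, 'async')

# Usage, e.g. with the fetch_page coroutine above:
# task = ensure_future(some_coroutine())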
