python3.6 start 1 million requests with aiohttp and asyncio

I'm trying to make 1 million requests with aiohttp and asyncio, sent continuously in batches of 10k at a time. When I print the start time of each request, I find that the requests do NOT all start at nearly the same time but are spread over several minutes. In my understanding, the 1 million requests would be sent without any wait (or within microseconds of each other?). I hope someone can suggest how to change the code; my code is below. Thanks in advance!
import asyncio
import requests
import json
import pymysql
from aiohttp import ClientSession
from datetime import datetime
import uvloop

# login config
URL_LOGIN = "https://test.com/user/login"
APP_ID = "sample_app_id"
APP_SECRET = "sample_secret"

async def login_user(phone, password, session, i):
    start_time = datetime.now()
    h = {
        "Content-Type": "application/json"
    }
    data = {
        "phone": phone,
        "password": password,
        "appid": APP_ID,
        "appsecret": APP_SECRET
    }
    try:
        async with session.post(url=URL_LOGIN, data=json.dumps(data), headers=h) as response:
            r = await response.read()
            end_time = datetime.now()
            cost = (end_time - start_time).seconds
            msg = "number %d request, start_time: %s, cost_time: %d, response: %s\n" % (i, start_time, cost, r.decode())
            print("running %d" % i, datetime.now())
    except Exception as e:
        print("running %d" % i)
        msg = "number %d request raise error: " % i + str(e) + "\n"
    with open("log", "a+") as f:
        f.write(msg)

async def bound_login(sem, phone, password, session, i):
    async with sem:
        await login_user(phone, password, session, i)

async def run_login(num):
    tasks = []
    sem = asyncio.Semaphore(10000)
    async with ClientSession() as session:
        for i in range(num):
            task = asyncio.ensure_future(bound_login(sem, str(18300000000 + i), "123456", session, i))
            tasks.append(task)
        responses = asyncio.gather(*tasks)
        await responses

start = datetime.now()
number = 100000
loop = uvloop.new_event_loop()
asyncio.set_event_loop(loop)
future = asyncio.ensure_future(run_login(number))
loop.run_until_complete(future)  # drive the event loop until all logins complete

When I print the start time of each request, I found that the requests do NOT all start at nearly the same time but are spread over several minutes.
Your code does issue a total of 1 million requests, but with the constraint that no more than 10 thousand of them run in parallel at any given time. This is like having 10k request slots at your disposal - the first 10,000 requests will be started immediately, but the 10,001st has to wait for a previous request to finish so it can get a free slot.
This is why the 1 million requests cannot start instantaneously or near-instantaneously: most of them have to wait for some earlier download to finish, and that takes time.
In my understanding, the 1 million requests would be sent without any wait
The current code explicitly makes the requests wait in order to prevent more than 10k of them running in parallel. If you really want to (try to) make a million parallel requests, remove the semaphore and create the ClientSession using a connector with limit set to None.
However, be aware that maintaining a million open connections will likely not work due to limits of the operating system and the hardware. (You should still be able to start the connections near-instantaneously, but I'd expect most of them to exit with an exception shortly afterwards.)
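For illustration, a minimal sketch of that change (it reuses login_user() and the phone-number scheme from the question and assumes aiohttp's TCPConnector; treat it as a sketch, not a drop-in fix):

import asyncio
from aiohttp import ClientSession, TCPConnector

async def run_login(num):
    # no semaphore, and no cap on the number of parallel connections
    connector = TCPConnector(limit=None)
    async with ClientSession(connector=connector) as session:
        tasks = [asyncio.ensure_future(
                     login_user(str(18300000000 + i), "123456", session, i))
                 for i in range(num)]
        await asyncio.gather(*tasks)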

Related

How to make multiple REST calls asynchronous in python3

I have the following code to make multiple REST calls. Basically, I have a dictionary where the key is a string and the value is JSON data that I need to use as the payload for a REST API POST method.
At the moment the dictionary contains 10 entries, so I need to make 10 REST calls.
I have implemented this using the requests package in python3, which is synchronous in nature. So after each REST call it waits for the response, and for 10 REST calls it waits 10 times for a response from the API.
def createCategories(BACKEND_URL, token, category):
    url = os.path.join(BACKEND_URL, 'api/v1/category-creation')
    category_dict = read_payloads(category)
    headers = {
        "token": f'{token}',
        "Content-Type": "application/json",
        "accept": "application/json"
    }
    for name, category_payload in category_dict.items():
        json_payload = json.dumps(category_payload)
        response = requests.request("POST", url, headers=headers, data=json_payload)
        ##########################
        ## Load as string and parse
        response_data = json.loads(response.text)
        print(response_data)
        category_id = response_data['id']
        message = 'The entity with id: ' + str(category_id) + ' is created successfully. '
        logging.info(message)
    return "categories created successfully."
I read that we need to use asyncio to make these asynchronous. What code changes do I need to make?
You can continue using the requests library. You need to use the threading or concurrent.futures modules to make several requests simultaneously.
Another option is to use an async library such as aiohttp.
import requests
from threading import current_thread
from concurrent.futures import ThreadPoolExecutor, Future
from time import sleep, monotonic

URL = "https://api.github.com/events"

def make_request(url: str) -> int:
    r = requests.get(url)
    sleep(2.0)  # wait n seconds
    return r.status_code

def done_callback(fut: Future):
    if fut.exception():
        res = fut.exception()
        print(f"{current_thread().name}. Error: {res}")
    elif fut.cancelled():
        print(f"Task was canceled")
    else:
        print(f"{current_thread().name}. Result: {fut.result()}")

if __name__ == '__main__':
    urls = [URL for i in range(20)]  # 20 tasks
    start = monotonic()
    with ThreadPoolExecutor(5) as pool:
        for i in urls:
            future_obj = pool.submit(make_request, i)
            future_obj.add_done_callback(done_callback)
    print(f"Time passed: {monotonic() - start}")

python concurrent.futures skip timeout processes

I am dealing with thousands of image urls and want to use concurrent.futures.ProcessPoolExecutor to speed things up.
Since some of the urls are broken or the images are large, the process function may hang or unexpectedly consume a lot of time. I want to add a timeout on the process function, e.g. 10 seconds, to get rid of these invalid images.
I tried setting the timeout param in futures.as_completed, and the timeout exception is raised successfully. However, it seems that the main process still waits until the timed-out child process completes. Is there any approach to immediately kill the timed-out child process and put the next url into the pool?
from concurrent import futures

def process(url):
    ### Some time-consuming operation
    return result

def main():
    urls = ['url1', 'url2', 'url3', ..., 'url100']
    with futures.ProcessPoolExecutor(max_workers=10) as executor:
        future_list = {executor.submit(process, url): url for url in urls}
        results = []
        try:
            for future in futures.as_completed(future_list, timeout=10):
                results.append(future.result())
        except futures.TimeoutError:
            print("timeout")
        print(results)

if __name__ == '__main__':
    main()
In the above example, suppose I have 100 urls and 10 of them are invalid and may take a lot of time; how do I get the list of processed results for the remaining 90 urls?
Not with the concurrent.futures library.
The pebble module has been developed to overcome such limitations.

from pebble import ProcessPool
from concurrent.futures import TimeoutError

with ProcessPool() as pool:
    future = pool.schedule(function, args=(1, 2), timeout=5)
    try:
        result = future.result()  # blocks until results are ready
    except TimeoutError as error:
        print("Function took longer than %d seconds" % error.args[1])

Python 2.7: is it possible to make a timer without threading.Timer?

So, basically I want to make a timer, but I don't want to use threading.Timer, for efficiency reasons.
From what I've read, the threads Python creates are not efficient and it is better not to use them.
I searched for articles on this and checked that using threads can be slow, e.g. when a single process's work was divided into N parts and run in threads, it was slower.
However, right now I need to use a Thread for this.
import json
import ssl
import time
import threading

import websocket  # websocket-client

import config  # the question's own config module

class Works(object):
    def __init__(self):
        self.symbol_dict = config.ws_api.get("ASSET_ABBR_LIST")
        self.dict = {}
        self.ohlcv1m = []

    def on_open(self, ws):
        ws.send(json.dumps(config.ws_api.get("SUBSCRIPTION_DICT")))

Every time I get a message from the web socket server, I store it in self.dict:

    def on_message(self, ws, message):
        message = json.loads(message)
        if len(message) > 2:
            ticker = message[2]
            pair = self.symbol_dict[(ticker[0])]
            baseVolume = ticker[5]
            timestamp = time.time()
            try:
                type(self.dict[pair])
            except KeyError as e:
                self.dict[pair] = []
            self.dict[pair].append({
                'pair': pair,
                'baseVolume': baseVolume,
            })

    def run(self):
        websocket.enableTrace(True)
        ws = websocket.WebSocketApp(
            url=config.ws_api.get("WEBSOCK_HOST"),
            on_message=self.on_message,
            on_open=self.on_open
        )
        ws.run_forever(sslopt={"cert_reqs": ssl.CERT_NONE})

Once every 60s this runs: it aggregates self.dict into self.ohlcv1m (which is then sent to the db), and self.dict and self.ohlcv1m are initialized again to store the next minute of data from the server:

    def every60s(self):
        threading.Timer(60, self.every60s).start()
        for symbol in self.dict:
            tickerLists = self.dict[symbol]
            self.ohlcv1m.append({
                "V": sum([float(ticker['baseVolume']) for ticker in tickerLists])
            })
        # self.ohlcv1m will go to the database every 1m
        self.ohlcv1m = []  # init again
        self.dict = {}  # init again

if __name__ == "__main__":
    work = Works()
    t1 = threading.Thread(target=work.run)
    t1.daemon = True
    t1.start()
    work.every60s()
(sorry for the indentation)
I am connecting to the socket by running run_forever() and receiving real-time data.
Every 60s I need to check and calculate over the data.
Is there any way to run something every 60s without a thread in Python 2.7?
I would really appreciate any advice.
Thank you
The answer comes down to whether you need the code to run exactly every 60 seconds, or whether you can just wait 60 seconds between runs (i.e. if the logic takes 5 seconds, it'll run every 65 seconds).
If you're happy with just a 60 second gap between runs, you could do

import time

while True:
    every60s()
    time.sleep(60)
If you're really set on not using threads but having it start every 60 seconds regardless of the last poll time, you could time the last execution and subtract that from 60 seconds to get the sleep time.
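A minimal sketch of that drift-corrected variant (plain time.sleep, no extra threads; every60s here stands for the question's method with its threading.Timer line removed):

import time

INTERVAL = 60.0

while True:
    started = time.time()
    work.every60s()  # the aggregation step from the question
    elapsed = time.time() - started
    # sleep only for whatever is left of the 60-second slot
    time.sleep(max(0.0, INTERVAL - elapsed))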
However, really, with the code you've got there you're not going to run into any of the issues with Python threads you might have read about. Those issues come in when you've got multiple threads all running at the same time and all CPU bound, which doesn't seem to be the case here unless there's some very slow, CPU intensive work that's not in your provided code.

How can I control traffic when I use "for" in Locust?

When I use "for" in locust, I do not know why the "req/s" is too high.
class UserBehavior(TaskSet):
    @task(1)
    def start_congche(self):
        filename = 'D:\测试\项目\精励评分\从车评分/阳光压力测试数据.csv'
        with open(filename) as f:
            reader = csv.DictReader(f)
            for test in reader:
                self.client.post("/DataPreFillServer/DataPreFillProductService",
                                 first + test["vin"] + vincode + test["vehicle_code"] + vehicleCode + end)

class WebsiteUser(HttpLocust):
    task_set = UserBehavior
    host = "http://10.10.6.12:8080"
    min_wait = 1000
    max_wait = 1000
But if I do not use "for", everything is OK:

class UserBehavior(TaskSet):
    @task(1)
    def start_congche(self):
        self.client.post("/DataPreFillServer/DataPreFillProductService", first + vincode + vehicleCode + end)
Use a queue: each task takes one value from the queue. First fill the queue from the CSV:

user_data_queue = queue.Queue()

filename = 'XXXXXXXX.csv'
with open(filename) as f:
    reader = csv.DictReader(f)
    for test in reader:
        data = {
            "vin": test["vin"],
            "vehicle_code": test["vehicle_code"],
        }
        user_data_queue.put_nowait(data)

Then, inside the task, take one entry per request:

try:
    data = self.locust.user_data_queue.get_nowait()
except queue.Empty:
    exit(0)
# payload is built from data, the same way the question builds its request body
self.client.post("/DataPreFillServer/DataPreFillProductService", payload)
I believe the reason you see 20 requests/second with the first approach is that the for loop executes multiple POST requests each time one of the five Locust users hits the system. Depending on how large the file is, say 20 iterations, each user likely executed around 20 posts in parallel and then the test ended.
Take a look at your start and end times: the first test finishes in about 8 seconds, while the other one takes around 30-something.
In the second test, the five Locust users execute a single POST request per user and have to go back and keep executing one request at a time until the 100 requests are satisfied.
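Putting the queue idea together with the question's (old-style) HttpLocust/TaskSet classes, a sketch might look like this; first, vincode, vehicleCode and end are placeholders standing in for the question's request-body fragments:

import csv
import queue

from locust import HttpLocust, TaskSet, task

# placeholders for the body fragments used in the question
first, vincode, vehicleCode, end = "", "", "", ""

class UserBehavior(TaskSet):
    @task(1)
    def start_congche(self):
        try:
            data = self.locust.user_data_queue.get_nowait()
        except queue.Empty:
            exit(0)  # stop once every row has been used
        payload = first + data["vin"] + vincode + data["vehicle_code"] + vehicleCode + end
        self.client.post("/DataPreFillServer/DataPreFillProductService", payload)

class WebsiteUser(HttpLocust):
    task_set = UserBehavior
    host = "http://10.10.6.12:8080"
    min_wait = 1000
    max_wait = 1000

    # fill the queue once, when the locustfile is loaded
    user_data_queue = queue.Queue()
    with open('XXXXXXXX.csv') as f:
        for row in csv.DictReader(f):
            user_data_queue.put_nowait({"vin": row["vin"],
                                        "vehicle_code": row["vehicle_code"]})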

Python Fire dynamic urls using multithreading

I'm new to Python threading, and I've gone through multiple posts, but I really did not understand how to use it. However, I tried to complete my task, and I want to check whether my approach is right.
The task is:
Read a big CSV containing around 20K records, fetch the id from each record, and fire an HTTP API call for each record of the CSV.
t1 = time.time()
file_data_obj = csv.DictReader(open(file_path, 'rU'))
threads = []
for record in file_data_obj:
    apiurl = "https://www.api-server.com?id=" + record.get("acc_id", "")
    thread = threading.Thread(target=requests.get, args=(apiurl,))
    thread.start()
    threads.append(thread)

t2 = time.time()
for thread in threads:
    thread.join()

print("Total time required to process a file - {} Secs".format(t2 - t1))
As there are 20K records, would it start 20K threads? Or will the OS/Python handle it? If yes, can we restrict it?
How can I collect the response returned by requests.get?
Would t2 - t1 really give me the time required to process the whole file?
As there are 20K records, would it start 20K threads? Or will the OS/Python handle it? If yes, can we restrict it?
Yes - it will start a thread for each iteration. The maximum number of threads is dependent on your OS.
How can I grab the response returned by requests.get?
If you want to use the threading module only, you'll have to use a Queue. Threads return None by design, so you'll have to implement a line of communication between the thread and your main loop yourself.
import time
import requests
from queue import Queue
from threading import Thread

# A queue the worker threads put their responses on
q = Queue()

def return_get(q, apiurl):
    q.put(requests.get(apiurl))

threads = []
for record in file_data_obj:
    apiurl = "https://www.api-server.com?id=" + record.get("acc_id", "")
    t = Thread(target=return_get, args=(q, apiurl))
    t.start()
    threads.append(t)

for thread in threads:
    thread.join()

while not q.empty():
    r = q.get()  # Fetches the first item on the queue
    print(r.text)
An alternative is to use a worker pool:

from concurrent.futures import ThreadPoolExecutor

threads = []
pool = ThreadPoolExecutor(10)

# Submit work to the pool
for record in file_data_obj:
    apiurl = "https://www.api-server.com?id=" + record.get("acc_id", "")
    # fetch_url is any function that downloads the given url (e.g. a wrapper around requests.get)
    t = pool.submit(fetch_url, apiurl)
    threads.append(t)

for t in threads:
    print(t.result())
You can use ThreadPoolExecutor.
Retrieve a single page and report the URL and contents:

def load_url(url, timeout):
    with urllib.request.urlopen(url, timeout=timeout) as conn:
        return conn.read()

Create the pool executor with N workers:

with concurrent.futures.ThreadPoolExecutor(max_workers=N_workers) as executor:
    # Start the load operations and mark each future with its URL
    future_to_url = {executor.submit(load_url, url, 60): url for url in URLS}
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            data = future.result()
        except Exception as exc:
            print('%r generated an exception: %s' % (url, exc))
        else:
            print('%r page is %d bytes' % (url, len(data)))
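Applied to the question's CSV of roughly 20K ids, the same pattern both caps the number of threads and collects every response. A sketch, assuming file_path and the api-server URL from the question:

import csv
import concurrent.futures
import requests

def fetch(acc_id):
    # one GET per record; the URL pattern is the question's
    return requests.get("https://www.api-server.com", params={"id": acc_id}, timeout=60)

with open(file_path) as f:  # file_path as in the question
    acc_ids = [row.get("acc_id", "") for row in csv.DictReader(f)]

# at most 50 requests in flight at any one time
with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
    future_to_id = {executor.submit(fetch, acc_id): acc_id for acc_id in acc_ids}
    for future in concurrent.futures.as_completed(future_to_id):
        acc_id = future_to_id[future]
        try:
            response = future.result()
            print(acc_id, response.status_code)
        except Exception as exc:
            print('%r generated an exception: %s' % (acc_id, exc))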
