Speed up using multithreading - python-3.x

I am creating a proxy checker, but the problem is that it takes a lot of time because there are a lot of proxies to check.
import requests

def proxy():
    lives = []

    def fetch_proxy():
        # proxy_api is assumed to be defined elsewhere (a URL returning one "host:port" per line)
        res = requests.get(proxy_api)
        raw_proxy = res.text.splitlines()
        return raw_proxy

    allproxy = fetch_proxy()
    for proxy in allproxy:
        try:
            proxyDictChk = {
                "https": "https://" + proxy,
                "http": "http://" + proxy,
            }
            res = requests.get("http://httpbin.org/ip", proxies=proxyDictChk, timeout=3)
            print("Proxy is Working")
            lives.append(proxy)
        except Exception as e:
            print("Proxy Dead")
    return lives

print(proxy())
I am curious how I can use multithreading here to make this faster.
PS: Thanks in advance.

The Python docs provide a pretty good example: https://docs.python.org/3/library/concurrent.futures.html
import concurrent.futures

# We can use a with statement to ensure threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # Start the check operations and mark each future with its proxy
    future_to_url = {executor.submit(check_proxy, url): url for url in allproxy}
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            is_valid = future.result()
        except Exception as exc:
            print('%r generated an exception: %s' % (url, exc))
        else:
            print('%s is valid: %s' % (url, is_valid))
So you would just need to define the function check_proxy.
def check_proxy(proxy):
    try:
        proxyDictChk = {
            "https": "https://" + proxy,
            "http": "http://" + proxy,
        }
        res = requests.get("http://httpbin.org/ip", proxies=proxyDictChk, timeout=3)
        print("Proxy is Working")
        return True
    except Exception as e:
        print("Proxies Dead!")
        return False
Essentially, use an executor and submit a function that does what you want. Then use the future to get the results of the functions as they're completed.
Also, since this lets the exception bubble up, you don't have to handle it in the function.
def check_proxy(proxy):
    proxyDictChk = {
        "https": "https://" + proxy,
        "http": "http://" + proxy,
    }
    res = requests.get("http://httpbin.org/ip", proxies=proxyDictChk, timeout=3)
    return True
Now the exception can be handled where the future's result is retrieved, and you could change the return type to something more meaningful.
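For completeness, here is a minimal self-contained sketch that ties the executor and check_proxy together. proxy_api is a placeholder for whatever endpoint returns one "host:port" proxy per line, and max_workers is an arbitrary choice:

import concurrent.futures
import requests

proxy_api = "https://example.com/proxies.txt"  # placeholder: returns one "host:port" per line

def check_proxy(proxy):
    proxyDictChk = {
        "https": "https://" + proxy,
        "http": "http://" + proxy,
    }
    # raises on connection errors/timeouts, which the caller treats as a dead proxy
    requests.get("http://httpbin.org/ip", proxies=proxyDictChk, timeout=3)
    return True

def proxy_checker():
    allproxy = requests.get(proxy_api).text.splitlines()
    lives = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
        future_to_proxy = {executor.submit(check_proxy, p): p for p in allproxy}
        for future in concurrent.futures.as_completed(future_to_proxy):
            p = future_to_proxy[future]
            try:
                future.result()
            except Exception:
                print("Proxy Dead:", p)
            else:
                print("Proxy is Working:", p)
                lives.append(p)
    return lives

print(proxy_checker())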

Related

Using httpx to send 100K GET requests

I'm using the httpx library and asyncio to try to send about 100K GET requests.
When I ran the code I received httpx.ConnectError, so I opened Wireshark and saw a lot of messages saying TCP Retransmission TCP Port numbers reused.
Based on what I saw in Wireshark and the httpx.ConnectError, I added limits = httpx.Limits(max_connections=10000) to limit the number of active connections to 10,000, but I still get that error.
My code:
import asyncio
import json

import httpx

SOME_URL = "some url"
ANOTHER_URL = "another url"
MAX = 10000

async def search():
    guids = [guid for guid in range(688001, 800000)]  # 688001 - 838611
    timeout = httpx.Timeout(None)
    limits = httpx.Limits(max_connections=MAX)

    async with httpx.AsyncClient(timeout=timeout, limits=limits) as client:
        tasks = [client.get(f"{SOME_URL}{guid}", timeout=timeout) for guid in guids]
        blob_list = await asyncio.gather(*tasks)  # <---- error from here !!!!!

    blob_list = [(res, guid) for res, guid in zip(blob_list, guids)]
    guids = [guid for res, guid in blob_list]
    blob_list = [json.loads(res.text)["blob_name"] for res, guid in blob_list]

    async with httpx.AsyncClient(timeout=timeout, limits=limits) as client:
        tasks = [client.get(f"{ANOTHER_URL}{blob}", timeout=timeout) for blob in blob_list]
        game_results = await asyncio.gather(*tasks)  # <---- error from here !!!!!

    game_results = [(res, guid) for res, guid in zip(game_results, guids)]
    game_results = [guid for res, guid in game_results]
    print(game_results)

def main():
    asyncio.run(search())

if __name__ == '__main__':
    main()
This is a minimal version of my code; there are some steps between the requests that I deleted, but I didn't touch the code that causes the trouble. There are comments on the lines where I receive the errors (# <---- error from here !!!!!).
Does anyone know how to solve this, or another way to send about 100K GET requests fast?
I managed to solve my problem with the following code (this is not the entire code, only the parts needed to send the requests; I have some stuff in between):
import asyncio

from aiohttp import ClientSession

SOME_URL = "some url"
ANOTHER_URL = "another url"
MAX_SIM_CONNS = 50
worker_responses = []

async def fetch(url, session):
    async with session.get(url) as response:
        return await response.read()

async def fetch_worker(url_queue: asyncio.Queue):
    global worker_responses
    async with ClientSession() as session:
        while True:
            url = await url_queue.get()
            try:
                if url is None:
                    return
                response = await fetch(url, session)
                worker_responses.append(response)
            finally:
                url_queue.task_done()
                # calling task_done() is necessary for the url_queue.join() to work correctly

async def fetch_all(base_url: str, range_: range):
    url_queue = asyncio.Queue(maxsize=10000)
    worker_tasks = []
    for i in range(MAX_SIM_CONNS):
        wt = asyncio.create_task(fetch_worker(url_queue))
        worker_tasks.append(wt)
    for i in range_:
        await url_queue.put(f"{base_url}{i}")
    for i in range(MAX_SIM_CONNS):
        # tell the workers that the work is done
        await url_queue.put(None)
    await url_queue.join()
    await asyncio.gather(*worker_tasks)

if __name__ == '__main__':
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    asyncio.run(fetch_all(SOME_URL, range(680_842, 840_423)))
    print(worker_responses)
I used aiohttp instead of httpx and used an asyncio.Queue to reduce RAM usage, and it worked for me.
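If you would rather keep httpx, another common way to bound the number of in-flight requests is an asyncio.Semaphore instead of a worker pool; a minimal sketch, assuming the same placeholder SOME_URL and guid range as in the question, with an arbitrary concurrency cap:

import asyncio

import httpx

SOME_URL = "some url"
MAX_CONCURRENT = 50  # arbitrary cap; keep it well below what the server tolerates

async def fetch(client, url, semaphore):
    # the semaphore ensures at most MAX_CONCURRENT requests are in flight at once
    async with semaphore:
        response = await client.get(url)
        return response.text

async def fetch_all():
    semaphore = asyncio.Semaphore(MAX_CONCURRENT)
    async with httpx.AsyncClient(timeout=httpx.Timeout(None)) as client:
        tasks = [fetch(client, f"{SOME_URL}{guid}", semaphore) for guid in range(688001, 800000)]
        return await asyncio.gather(*tasks)

if __name__ == '__main__':
    results = asyncio.run(fetch_all())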

Count successful and unsuccessful POST requests for asynchronous POST calls

I need help implementing the logic to count the number of successful asynchronous POST calls (status_code == 200) as well as failed calls (status_code != 200).
I am new to coroutines. I would appreciate it if someone could suggest a better way of making an asynchronous POST call that can be retried, polled for status, and that can also emit metrics for successful POST requests.
Following is my code:
asyncio.get_event_loop().run_in_executor(
    None,
    self.publish_actual,
    event_name,
    custom_payload,
    event_message_params,
)
which calls publish_actual:
def publish_actual(
    self,
    event_name: str,
    custom_payload={},
    event_message_params=[],
):
    """Submits a post request using the request library

    :param event_name: name of the event
    :type event_name: str
    :param key: key for a particular application
    :param custom_payload: custom_payload, defaults to {}
    :type custom_payload: dict, optional
    :param event_message_params: event_message_params, defaults to []
    :type event_message_params: list, optional
    """
    json_data = {}
    path = f"/some/path"
    self.request(path, "POST", json=json_data)
which calls the following request function:
def request(self, api_path, method="GET", **kwargs):
    try:
        self._validate_configuration()
        headers = {}
        api_endpoint = self.service_uri.to_url(api_path)
        logger.debug(api_endpoint)
        if "headers" in kwargs and kwargs["headers"]:
            headers.update(kwargs["headers"])
        headers = {"Content-Type": "application/json"}
        begin = datetime.now()

        def build_success_metrics(response, *args, **kwargs):
            tags = {
                "name": "success_metrics",
                "domain": api_endpoint,
                "status_code": 200,
            }
            build_metrics(tags)

        def check_for_errors(response, *args, **kwargs):
            response.raise_for_status()

        response = self.session.request(
            method=method,
            url=api_endpoint,
            headers=headers,
            timeout=self.timeout,
            hooks={"response": [build_success_metrics, check_for_errors]},
            **kwargs,
        )
        end = datetime.now()
        logger.debug(
            f"'{method}' request against endpoint '{api_endpoint}' took {round((end - begin).total_seconds() * 1000, 3)} ms"
        )
        logger.debug(f"response: {response}")
    except RequestException as e:
        tags = {
            "name": "error_metrics",
            "domain": api_endpoint,
            "exception_class": e.__class__.__name__,
        }
        build_metrics(tags)
        return f"Exception occurred: {e}"
Let me know if anything else is required from my end to explain what exactly I have done and what I am trying to achieve.
There is not much await and async in your example, so I've just addressed the counting part of your question in general asyncio terms. asyncio.Queue is good for this because it lets you separate the counting from the code that produces the results quite simply.
import asyncio

import aiohttp

class Count():
    def __init__(self, queue: asyncio.Queue):
        self.queue = queue
        self.good = 0
        self.bad = 0

    async def count(self):
        while True:
            result = await self.queue.get()
            if result == 'Exit':
                return
            if result == 200:
                self.good += 1
            else:
                self.bad += 1

async def request(q: asyncio.Queue):
    async with aiohttp.ClientSession() as session:
        for _ in range(5):  # just poll 5 times in this instance
            await asyncio.sleep(0.1)
            async with session.get(
                'https://httpbin.org/status/200%2C500', ssl=False
            ) as response:
                q.put_nowait(response.status)
        q.put_nowait('Exit')

async def main():
    q = asyncio.Queue()
    cnt = Count(q)
    tasks = [cnt.count(), request(q)]
    await asyncio.gather(*[asyncio.create_task(t) for t in tasks])
    print(cnt.good, cnt.bad)

if __name__ == "__main__":
    asyncio.run(main())
The output is random given httpbin's response; the two counts should add up to 5.
4 1
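Since the question dispatches a blocking requests call through run_in_executor, the same counting idea also works there by gathering the executor futures and tallying their results. A minimal sketch, assuming a simplified blocking publisher that returns the status code (the URL and payloads are placeholders):

import asyncio

import requests

def publish_actual(url, payload):
    # blocking call, executed in a worker thread by run_in_executor
    try:
        resp = requests.post(url, json=payload, timeout=10)
        return resp.status_code
    except requests.RequestException:
        return None

async def publish_many(url, payloads):
    loop = asyncio.get_running_loop()
    futures = [loop.run_in_executor(None, publish_actual, url, p) for p in payloads]
    statuses = await asyncio.gather(*futures)
    good = sum(1 for s in statuses if s == 200)
    bad = len(statuses) - good
    return good, bad

if __name__ == "__main__":
    good, bad = asyncio.run(publish_many("https://httpbin.org/post", [{"n": i} for i in range(5)]))
    print(f"successful: {good}, failed: {bad}")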

Confused with the use of __init__

I created a class that wraps the YouTube API. It looks like this:
class YouTube:
    def __init__(self, name, version, developerKey):
        self.service = build(name, version, developerKey)

    def get_video_info(self, vid_id):
        vid_request = self.service.videos().list(
            part='snippet,statistics',
            id=vid_id,
            fields='items(kind,id,statistics)')
        vid_response = vid_request.execute()
        return vid_response

if __name__ == "__main__":
    name = 'youtube'
    version = 'v3'
    api_token = 'xxxx'
    query = YouTube(name, version, api_token)
    vid_id = 'YYYYY_VID_ID'
    response = query.get_video_info(vid_id)
    pprint(response)
and it works fine, but then I tried the following in the __init__ method:
def __init__(self):
    self.name = 'youtube'
    self.version = 'v3'
    self.developerKey = 'xxxxxxx'
    self.service = build(self.name, self.version, self.developerKey)

if __name__ == "__main__":
    query = YouTube()
    response = query.get_video_info(vid_id)
    pprint(response)
I get the following error:
def get_video_info(self, vid_id):
    vid_request = self.service.videos().list(
        part='snippet,statistics',
        id=vid_id,
        fields='items(kind,id,statistics)')

Exception has occurred: AttributeError
'str' object has no attribute 'request'
    vid_response = vid_request.execute()
I searched online and I see that this exception occurs in a variety of situations, and I feel lost. Could someone point me in the direction I should search?
According to this documentation, this is how the build function is defined:
build(serviceName, version, http=None, discoveryServiceUrl=DISCOVERY_URI, developerKey=None, model=None, requestBuilder=HttpRequest, credentials=None, cache_discovery=True, cache=None, client_options=None, adc_cert_path=None, adc_key_path=None, num_retries=1)
Construct a Resource for interacting with an API.
Therefore, you should pass the developer key as a keyword argument in your second snippet:
self.service = build(
    self.name,
    self.version,
    developerKey=self.developerKey
)
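The AttributeError most likely arises because the third positional parameter of build is http, so a key passed positionally ends up being treated as the HTTP client object, and the library's later call to http.request(...) fails on the string. A minimal sketch of the corrected __init__, assuming build comes from googleapiclient.discovery:

from googleapiclient.discovery import build

class YouTube:
    def __init__(self):
        self.name = 'youtube'
        self.version = 'v3'
        self.developerKey = 'xxxxxxx'
        # developerKey must be passed by keyword; passed positionally it lands in
        # the http parameter and is later used where an HTTP client is expected
        self.service = build(self.name, self.version, developerKey=self.developerKey)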

Django: Unable to run async requests.get on Google. Exception: Response can't be used in 'await' expression

I am unable to understand how async works. I send a simple GET request to Google with a proxy, to check the validity of the proxy, in an async method. I get the error:
'''object Response can't be used in 'await' expression'''
Method to get proxies. Code for getting the list of proxies is copied from a tutorial:
def get_proxies(self, number_of_proxies=10):
    """Returns max 10 free https proxies by scraping
    free-proxy website.
    #arg number_of_proxies to be returned"""
    try:
        if number_of_proxies > 10: number_of_proxies = 10
        url = 'https://abc-list.net/'
        response = requests.get(url)
        response_text = response.text
        parser = fromstring(response_text)
        proxies = set()
        for i in parser.xpath('//tbody/tr'):
            if len(proxies) >= number_of_proxies:
                break
            if i.xpath('.//td[7][contains(text(),"yes")]'):
                # Grabbing IP and corresponding PORT
                proxy = ":".join([i.xpath('.//td[1]/text()')[0], i.xpath('.//td[2]/text()')[0]])
                proxies.add(proxy)
        return proxies
    except Exception as e:
        print('Exception while abc list from url: ', e)
        return None
Method to check the validity of a proxy:
async def is_valid_proxy(self, proxy):
    """Check the validity of a proxy by sending
    get request to google using the given proxy."""
    try:
        response = await requests.get("http://8.8.4.4", proxies={"http": proxy, "https": proxy}, timeout=10)
        if await response.status_code == requests.codes.ok:
            print('got a valid proxy')
            return True
    except Exception as e:
        print('Invalid proxy. Exception: ', e)
        return False
Method to get the valid proxies:
async def get_valid_proxies(self, number_of_proxies=10):
    proxies = self.get_proxies(number_of_proxies)
    print(len(proxies), proxies)
    valid_proxies = []
    valid_proxies = await asyncio.gather(*[proxy for proxy in proxies if await self.is_valid_proxy(proxy)])
    return valid_proxies
And a call to the above method:
proxies = asyncio.run(get_valid_proxies())
Now the best solution for me would be to check the validity of a proxy in def get_proxies(self, number_of_proxies=10): before adding it to the proxies list, but I have no clue how to achieve that in an async way. Therefore, I tried a workaround, but that is also not working. The method works without async, but I call it many times and it is very slow, so I would like to make it async.
Thank you
After changing the above code to use aiohttp, it still throws an exception and doesn't behave asynchronously: the requests seem to be sent one after another, and it is as slow as before.
New is_valid_proxy:
async with aiohttp.ClientSession() as session:
    session.proxies = {"http": proxy, "https": proxy}
    async with session.get('http://8.8.4.4',
                           timeout=10) as response:
        status_code = await response.status_code
        # response = await requests.get("https://www.google.com/", proxies={"http": proxy, "https": proxy}, timeout=10)
        # if await response.status_code == requests.codes.ok:
        if status_code == requests.codes.ok:
            print('got a valid proxy')
            return True
except Exception as e:
    print('Invalid proxy. Exception: ', e)
    return False
It won't even display the error or exception. Here is the message:
Invalid proxy. Exception:
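For reference, a minimal sketch of how the check could look with aiohttp, assuming the proxies are plain "host:port" strings and using http://httpbin.org/ip as a stand-in test URL: requests calls and response attributes are not awaitable, aiohttp takes the proxy per request via its proxy= argument (HTTP proxies only), and response.status is a plain integer, so nothing here needs await except the request itself.

import asyncio

import aiohttp

async def is_valid_proxy(proxy):
    try:
        timeout = aiohttp.ClientTimeout(total=10)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            # pass the proxy per request; aiohttp only supports HTTP proxies
            async with session.get("http://httpbin.org/ip", proxy=f"http://{proxy}") as response:
                return response.status == 200  # .status is an int, no await needed
    except Exception as e:
        print('Invalid proxy. Exception: ', e)
        return False

async def get_valid_proxies(proxies):
    results = await asyncio.gather(*(is_valid_proxy(p) for p in proxies))
    return [p for p, ok in zip(proxies, results) if ok]

# usage: valid = asyncio.run(get_valid_proxies({"1.2.3.4:8080", "5.6.7.8:3128"}))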

How to post a group of requests to 2 URLs with aiohttp

I have 2 URLs and 60k+ requests. Basically, I need to post every request to both URLs and then compare their responses, but without waiting for a response before posting the next request.
I've tried to do it with aiohttp and asyncio:
import asyncio
import time
import aiohttp
import os
from aiofile import AIOFile

testURL = ""
prodURL = ""
directoryWithRequests = ''
directoryToWrite = ''
headers = {'content-type': 'application/soap+xml'}
i = 1

async def fetch(session, url, request):
    global i
    async with session.post(url=url, data=request.encode('utf-8'), headers=headers) as response:
        if response.status != 200:
            async with AIOFile(directoryToWrite + str(i) + '.xml', 'w') as afp:
                await afp.write(request)
            i += 1
        return await response.text()

async def fetch_all(session, urls, request):
    results = await asyncio.gather(*[asyncio.create_task(fetch(session, url, request)) for url in urls])
    return results

async def asynchronousRequests(requestBody):
    urls = [testURL, prodURL]
    global i
    with open(requestBody) as my_file:
        body = my_file.read()
    async with aiohttp.ClientSession() as session:
        htmls = await fetch_all(session, urls, body)
        # some conditions

async def asynchronous():
    try:
        start = time.time()
        futures = [asynchronousRequests(directoryWithRequests + i) for i in os.listdir(directoryWithRequests)]
        for future in asyncio.as_completed(futures):
            result = await future
        print("Process took: {:.2f} seconds".format(time.time() - start))
    except Exception as e:
        print(str(e))

if __name__ == '__main__':
    try:
        # AsyncronTest
        ioloop = asyncio.ProactorEventLoop()
        ioloop.run_until_complete(asynchronous())
        ioloop.close()
        if i == 1:
            print('Regress is OK')
        else:
            print('Number of requests to check = {}'.format(i))
    except Exception as e:
        print(e)
I believe the code above works, but it creates N futures, where N equals the number of request files. This leads to a sort of DDoS, because the server can't respond to that many requests at the same time.
I found a suitable solution. Basically, it's just 2 async tasks:
tasks = [
    postRequest(testURL, client, body),
    postRequest(prodURL, client, body)
]
await asyncio.wait(tasks)
It doesn't match the performance of the code in the question with a comfortable number of requests, but at least it doesn't DDoS the server as much.
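If you want the original per-file concurrency back without flooding the server, one middle ground is to cap the number of request bodies in flight with an asyncio.Semaphore and share one ClientSession. A minimal sketch, reusing the question's testURL, prodURL and headers (the cap of 50 is an arbitrary choice):

import asyncio

import aiohttp

MAX_IN_FLIGHT = 50  # arbitrary cap; tune to what the server can handle

async def post_one(session, url, body):
    async with session.post(url, data=body.encode('utf-8'), headers=headers) as response:
        return await response.text()

async def post_both(session, body, semaphore):
    # the semaphore limits how many bodies are being posted at the same time
    async with semaphore:
        return await asyncio.gather(
            post_one(session, testURL, body),
            post_one(session, prodURL, body),
        )

async def run_all(bodies):
    semaphore = asyncio.Semaphore(MAX_IN_FLIGHT)
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(post_both(session, body, semaphore) for body in bodies))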
