Pymongo, Motor memory leak - python-3.x

Background: I use Tornado + Motor and noticed that memory usage keeps increasing, so I wrote the test.py below. The db.tasks collection has "size" : 12192854 (just over 10 MB). After one minute, MEM USAGE / LIMIT is 1.219GiB / 8GiB.
env:
python 3.7.5
motor 2.5.0 (2.1.0 before upgrade)
multidict 4.7.5
pymongo 3.12.0
Here is my code:
import os
import gc
import time
import logging
import asyncio
import uvloop
import pdb
import pymongo
import base64
from tornado.platform.asyncio import AsyncIOMainLoop
from guppy import hpy
from motor import motor_asyncio

mongo_auth = 'xxxxx='
runtime_mongos = arch_mongos = {
    "host": f"mongodb://{base64.b64decode(mongo_auth).decode()}#" + ','.join(
        [
            "1xxx:27024",
            "2xxx:27024",
            "3xxx:27024",
        ]),
    "readPreference": "secondaryPreferred"
}

table = motor_asyncio.AsyncIOMotorClient(**runtime_mongos)["db"]["tasks"]

async def get_data():
    return await table.find().sort([
        ("priority", pymongo.ASCENDING),
        ("start_uts", pymongo.ASCENDING),
    ]).to_list(None)

async def test():
    while True:
        a = await get_data()
        print(len(a))
        await asyncio.sleep(1)
        gc.collect()  # no use!

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(test())

Finally, I noticed that the Python process had a lot of threads, which pointed me to Motor's ThreadPoolExecutor.
Code in motor 2.1:

if 'MOTOR_MAX_WORKERS' in os.environ:
    max_workers = int(os.environ['MOTOR_MAX_WORKERS'])
else:
    max_workers = tornado.process.cpu_count() * 5

_EXECUTOR = ThreadPoolExecutor(max_workers=max_workers)
I set MOTOR_MAX_WORKERS=1 and memory usage stays low.
I deploy my project in Docker, but the container's CPUs are not exclusive, so I guess tornado.process.cpu_count() sees all of the host's CPUs and the computed max_workers ends up unreasonably large.
My fault...
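Since the executor in the motor 2.1 snippet above is created at module level, the MOTOR_MAX_WORKERS override has to be in the environment before motor is imported (or exported in the container environment). A minimal sketch of setting it from inside the program:

import os

# Cap Motor's thread pool; this must happen before the first motor import,
# because the ThreadPoolExecutor shown above is created at import time.
os.environ.setdefault('MOTOR_MAX_WORKERS', '1')

from motor import motor_asyncio  # imported only after the cap is set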

Related

Why does my prompt-toolkit Dialog flicker on terminal output with asyncio and patch_stdout?

When running a prompt-toolkit Dialog, it flickers whenever there is terminal output from a different asyncio task.
I am using the patch_stdout() context manager inside a task, as described in the docs, at least as far as I understand them. I read that starting with prompt-toolkit 3.0 the library uses the default asyncio event loop and no longer creates its own.
Since asyncio.run always creates a new event loop, closes it at the end, and the context manager runs inside it, I have no idea what could cause the flickering.
(Python 3.9, prompt-toolkit 3.0.36)
This is an MCVE:

import asyncio
from prompt_toolkit.patch_stdout import patch_stdout
from prompt_toolkit.shortcuts.dialogs import _create_app, _return_none
from prompt_toolkit.widgets import Button, Dialog, Label

dialog_align = Dialog(
    title='Please align',
    body=Label(text="init", dont_extend_height=True),
    buttons=[Button(text='Start measurement', width=21, handler=_return_none)],
    with_background=True,
)

async def prompt_align():
    return await _create_app(dialog_align, style=None).run_async()

async def main_datasource():
    while True:
        await asyncio.sleep(0.5)
        print("test")

async def main():
    with patch_stdout():
        task1 = asyncio.create_task(prompt_align())
        task2 = asyncio.create_task(main_datasource())
        await asyncio.gather(task1, task2)

if __name__ == "__main__":
    try:
        from asyncio import run
    except ImportError:
        # fallback for Python versions without asyncio.run
        asyncio.get_event_loop().run_until_complete(main())
    else:
        asyncio.run(main())

Multiprocessing script doesn't finish all tasks, and I also get 100% CPU?

I need to ask whether this part of my script is correct. It seems to work, but I think something is really wrong, because CPU usage stays at 100% and it often does not finish all tasks: after 50-100 tasks it seems frozen.
Any advice on how to fix it? Or maybe just tell me where the error is?
Thank you.
P.S. I have included all the modules the script requires, but only the part relevant to multiprocessing and only the first part of the script.
Many thanks.
from __future__ import print_function
import sys
import os
import easygui
import pyautogui as py
import datetime
import pwinput
import json
from collections import Counter
import random
import string
import threading
import subprocess
import multiprocessing
import queue
from multiprocessing import cpu_count
from multiprocessing import Value, Lock, Process, Queue, current_process
import numpy as np
import grequests
import requests
from requests.exceptions import ConnectionError
from requests.exceptions import HTTPError
import time
from time import sleep

number_of_processes = cpu_count()

class Counter(object):
    def __init__(self, initval=0):
        self.val = Value('i', initval)
        self.lock = Lock()
    def increment(self):
        with self.lock:
            self.val.value += 1
    def value(self):
        with self.lock:
            return self.val.value

def updateTitle(number_of_processes, number_of_task, counterhits, counterdone, countersl, countml, username):
    while True:
        hits = int(counterhits.value())
        done = int(counterdone.value())
        shtot = int(countersl.value())
        maitot = int(countml.value())
        remain_scan = number_of_task - hits
        elapsed = time.strftime('%H:%M:%S', time.gmtime(time.time() - start))
        ctypes.windll.kernel32.SetConsoleTitleW(f'Site Valid For: {number_of_task} | Started: {hits} | Complete: {done} | Remain: {remain_scan} | SL Found: {shtot} | ML Found: {maitot} | Threads: {number_of_processes} | Time elapsed: {elapsed} ! Licensed at: {username}')
        sleep(0.3)

def worker_main(tasks_to_do, tasks_finished, counterhits, counterdone, countersl, countml):
    while True:
        try:
            site = tasks_to_do.get_nowait()
            if site is None:
                break
        except Queue.Empty:
            break
        except Queue.Full:
            sleep(0.5)
            continue
        counterhits.increment()
        do_work(site, counterhits, counterdone, countersl, countml)
        tasks_finished.put(site + current_process().name)
        counterdone.increment()
    return True

def main():
    global username
    number_of_task = int(len(filter_data))
    counterhits = Counter(0)
    counterdone = Counter(0)
    countersl = Counter(0)
    countml = Counter(0)
    tasks_to_do = Queue()
    tasks_finished = Queue()
    processes1 = []
    prefix = ['http://']
    # creating processes
    for w in range(number_of_processes):
        p1 = Process(target=worker_main, args=(tasks_to_do, tasks_finished, counterhits, counterdone, countersl, countml))
        processes1.append(p1)
        p1.start()
    procs = [Process(target=updateTitle, args=(number_of_processes, number_of_task, counterhits, counterdone, countersl, countml, username), daemon=True) for i in range(1)]
    for p in procs: p.start()
    for site_il in filter_data:
        site_or = site_il.rstrip("\n")
        if site_or.startswith("http://"):
            site_or = site_or.replace("http://", "")
        elif site_or.startswith("https://"):
            site_or = site_or.replace("https://", "")
        site_or = site_or.rstrip()
        site_or = site_or.split('/')[0]
        if 'www.' in site_or:
            site_or = site_or.replace("www.", "")
        sitexx = [sub + site_or for sub in prefix]
        for site in sitexx:
            tasks_to_do.put(site)
    # completing process
    for p1 in processes1:
        p1.join()
    for p in procs: p.join()
    # print the output
    while not tasks_finished.empty():
        print(tasks_finished.get())
    os.system('pause>nul')
    return True

if __name__ == '__main__':
    if sys.platform.startswith('win'):
        # On Windows calling this function is necessary.
        multiprocessing.freeze_support()
    main()
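For comparison, the usual shape of this kind of worker pool is sketched below: enqueue all tasks first, add one None sentinel per worker, and let workers block on get() instead of spinning on get_nowait(). This is a minimal sketch, not the script above; worker and process_site are placeholder names.

from multiprocessing import Process, Queue, cpu_count

def process_site(site):
    # placeholder for the real per-site work
    print("processed", site)

def worker(tasks_to_do, tasks_finished):
    while True:
        site = tasks_to_do.get()      # blocking get: no busy loop, no Empty race
        if site is None:              # sentinel: queue is drained, shut down
            break
        process_site(site)
        tasks_finished.put(site)

if __name__ == '__main__':
    sites = ["http://example.com", "http://example.org"]
    tasks_to_do, tasks_finished = Queue(), Queue()
    for site in sites:
        tasks_to_do.put(site)
    workers = [Process(target=worker, args=(tasks_to_do, tasks_finished))
               for _ in range(cpu_count())]
    for p in workers:
        tasks_to_do.put(None)         # one sentinel per worker
        p.start()
    for p in workers:
        p.join()
    while not tasks_finished.empty():
        print(tasks_finished.get())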

Trying to get data using Websocket & run some parallel code on the value returned using Asyncio

I am just getting started with asyncio in Python. What I am trying to do is:
A websocket connects to a data provider and keeps listening for new data (runs forever).
In parallel, work on this data and maybe save it to a file, or make buy/sell decisions (any parallel operation is fine).
I am trying to use asyncio for this (would threads be better? They seem more complicated than asyncio).
I am using a Jupyter notebook, so an event loop is already running; maybe that's the problem?
Code 1 works, but it blocks my event loop and only keeps printing the data; all other code is blocked, I guess because it is always busy in the websocket.
import ssl
import websocket
import json
from IPython.display import display, clear_output
from threading import Thread

def on_message(ws, message):
    resp1 = json.loads(message)
    clear_output(wait=True)
    print(resp1['events'][0]['price'])

def run():
    ws = websocket.WebSocketApp("wss://api.gemini.com/v1/marketdata/BTCUSD", on_message=on_message)
    ws.run_forever(sslopt={"cert_reqs": ssl.CERT_NONE})

ws_run = Thread(target=run)
ws_run.start()
print("ok")  # this prints once, but then it gets blocked by the websocket code.
I tried Code 2, but it hangs forever and doesn't do anything:
import asyncio
import ssl
import websocket
import json
from IPython.display import display, clear_output

async def on_message(ws, message):
    resp1 = json.loads(message)
    clear_output(wait=True)
    #print(resp1, sort_keys=True, indent=4)
    #print(resp1['events'][0]['side'])
    print(resp1['events'][0]['price'])

async def count2(x):
    x = 0
    for i in range(5):
        await asyncio.sleep(0.01)
        print('second', x)
    y = x + 10
    return y

async def main():
    await asyncio.gather(count(x), on_message(ws, message))

if __name__ == "__main__":
    import time
    ws = websocket.WebSocketApp("wss://api.gemini.com/v1/marketdata/BTCUSD", on_message=on_message)
    asyncio.get_event_loop().run_forever(ws.run_forever(sslopt={"cert_reqs": ssl.CERT_NONE}))
    s = time.perf_counter()
    await main()
    elapsed = time.perf_counter() - s
    print(f" executed in {elapsed:0.2f} seconds.")
I tried this variant of the main block; still no response:
if __name__ == "__main__":
    import time
    ws = websocket.WebSocketApp("wss://api.gemini.com/v1/marketdata/BTCUSD", on_message=on_message)
    ws.run_forever(sslopt={"cert_reqs": ssl.CERT_NONE})
    s = time.perf_counter()
    await main()
    elapsed = time.perf_counter() - s
    print(f" executed in {elapsed:0.2f} seconds.")
Update: I got this to work without using run_forever(), but I don't know if it is the right way:
import ssl
import websocket
import asyncio
import time

async def MySock():
    while True:
        print(ws.recv())
        await asyncio.sleep(0.5)

async def MyPrint():
    while True:
        print("-------------------------------------------")
        await asyncio.sleep(0.5)

async def main():
    await asyncio.gather(MySock(), MyPrint())

if __name__ == "__main__":
    ws = websocket.WebSocket()
    ws.connect("wss://api.gemini.com/v1/marketdata/btcusd?top_of_book=true&offers=true")
    s = time.perf_counter()
    await main()
    elapsed = time.perf_counter() - s
    print(f" executed in {elapsed:0.2f} seconds.")

Why does aiohttp create threads on every iteration?

I am trying to use aiohttp to send some requests in a loop.
When I run this code, the process list shows 5 threads on the first iteration, 5 more on the next iteration (5 seconds later), and so on.
I tried to find an answer, but I only found information about aiodns; I tested with aiodns, but the situation did not change. Why does aiohttp start threads? What am I doing wrong?
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import asyncio
import aiohttp
import aiodns
import aiohttp.resolver

aiohttp.resolver.DefaultResolver = aiohttp.resolver.AsyncResolver

url = "http://localhost/script"

async def process_send():
    print("Run process_send")
    while 1:
        print(aiohttp.resolver.DefaultResolver)
        async with aiohttp.ClientSession() as session:
            async with session.post(url, data={'id': '13459393', 'data': 'somedate'}, timeout=15) as resp:
                print("Send result %s " % (str(resp.status)))
        await asyncio.sleep(5)

if __name__ == '__main__':
    ioloop = asyncio.get_event_loop()
    tasks = [ioloop.create_task(process_send()) for _ in range(5)]
    wait_tasks = asyncio.wait(tasks)
    ioloop.run_until_complete(wait_tasks)
    ioloop.close()
It turns out that only resolver.py creates threads: the default ThreadedResolver resolves hostnames in a thread pool. To actually use the aiodns-based resolver, pass an AsyncResolver to a TCPConnector:

from aiohttp.resolver import AsyncResolver

resolver = AsyncResolver()
tcp_conn = aiohttp.TCPConnector(resolver=resolver)
async with aiohttp.ClientSession(connector=tcp_conn) as session:
    ...
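For completeness, a minimal runnable sketch of that fix applied to the loop from the question (same URL and payload; the overall structure is an assumption, not the original script):

import asyncio
import aiohttp
from aiohttp.resolver import AsyncResolver

url = "http://localhost/script"

async def process_send():
    while True:
        # DNS lookups go through aiodns instead of the thread-pool based
        # ThreadedResolver, so no new threads appear per request.
        resolver = AsyncResolver()
        tcp_conn = aiohttp.TCPConnector(resolver=resolver)
        async with aiohttp.ClientSession(connector=tcp_conn) as session:
            async with session.post(url, data={'id': '13459393', 'data': 'somedate'}, timeout=15) as resp:
                print("Send result %s" % resp.status)
        await asyncio.sleep(5)

if __name__ == '__main__':
    asyncio.get_event_loop().run_until_complete(process_send())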

Does using Twisted + Cyclone + PyPy to handle POST requests cause a memory leak?

After a lot of investigating, I found out that after serving hundreds of thousands of HTTP POST requests, there's a memory leak. The strange part is that the memory leak only occurs when using PyPy.
Here's the example code:
from twisted.internet import reactor
import tornado.ioloop

do_tornado = False
port = 8888

if do_tornado:
    from tornado.web import RequestHandler, Application
else:
    from cyclone.web import RequestHandler, Application

class MainHandler(RequestHandler):
    def get(self):
        self.write("Hello, world")
    def post(self):
        self.write("Hello, world")

if __name__ == "__main__":
    routes = [(r"/", MainHandler)]
    application = Application(routes)
    print port
    if do_tornado:
        application.listen(port)
        tornado.ioloop.IOLoop.instance().start()
    else:
        reactor.listenTCP(port, application)
        reactor.run()
Here is the test code I am using to generate requests:
from twisted.internet import reactor, defer
from twisted.internet.task import LoopingCall
from twisted.web.client import Agent, HTTPConnectionPool
from twisted.web.iweb import IBodyProducer
from zope.interface import implements

pool = HTTPConnectionPool(reactor, persistent=True)
pool.retryAutomatically = False
pool.maxPersistentPerHost = 10
agent = Agent(reactor, pool=pool)

bid_url = 'http://localhost:8888'

class StringProducer(object):
    implements(IBodyProducer)

    def __init__(self, body):
        self.body = body
        self.length = len(body)

    def startProducing(self, consumer):
        consumer.write(self.body)
        return defer.succeed(None)

    def pauseProducing(self):
        pass

    def stopProducing(self):
        pass

def callback(a):
    pass

def error_callback(error):
    pass

def loop():
    d = agent.request('POST', bid_url, None, StringProducer("Hello, world"))
    #d = agent.request('GET', bid_url)
    d.addCallback(callback).addErrback(error_callback)

def main():
    exchange = LoopingCall(loop)
    exchange.start(0.02)
    #log.startLogging(sys.stdout)
    reactor.run()

main()
Note that this code does not leak with CPython, nor with Tornado on PyPy! It leaks only when using Twisted and PyPy together, and ONLY for POST requests.
To see the leak, you have to send hundreds of thousands of requests.
Also note that when PYPY_GC_MAX is set, the process eventually crashes.
What's going on?
It turns out that the cause of the leak is BytesIO.
Here's how to reproduce the leak on PyPy:

from io import BytesIO

while True:
    a = BytesIO()
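A small sketch for watching the growth while reproducing it, assuming a Unix-like system where resource.getrusage reports the peak resident set size (ru_maxrss is in kilobytes on Linux):

import resource
from io import BytesIO

for i in range(10000000):
    a = BytesIO()
    if i % 1000000 == 0:
        # peak RSS so far; it keeps climbing if the BytesIO objects leak
        rss_kb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        print("iteration %d: peak RSS %d kB" % (i, rss_kb))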
Here's the fix:
https://bitbucket.org/pypy/pypy/commits/40fa4f3a0740e3aac77862fe8a853259c07cb00b
