elasticsearch using multiprocessing in python - python-3.x

I am trying to read a huge volume of data, around 1 TB, and load it into Elasticsearch.
What are the possible ways to load that much volume?
While browsing coding options for this I thought of using Python multiprocessing.
So I split my large file into small chunks, then used the sample below to read my files and load them into Elasticsearch with multiprocessing. Is this the right kind of approach?
python code:
def read_sample(filename):
    '''my code to read from the file and yield the elements'''

def elasticinsert(filename):
    deque(helpers.parallel_bulk(es, read_sample(filename), index="sample", doc_type="samples"), maxlen=0)

def main():
    data = [filename for filename in list_of_sample_files]
    pool = multiprocessing.Pool(processes=2, maxtasksperchild=1)
    result = pool.map(elasticinsert, data)

if __name__ == "__main__":
    main()
Now I am getting some kind of SSL issue, and here is the traceback. How can I resolve this?
Traceback (most recent call last):
File "/usr/lib/python3.4/site-packages/elasticsearch/connection/http_requests.py", line 76, in perform_request
response = self.session.send(prepared_request, **send_kwargs)
File "/usr/lib/python3.4/site-packages/requests/sessions.py", line 576, in send
r = adapter.send(request, **kwargs)
File "/usr/lib/python3.4/site-packages/requests/adapters.py", line 447, in send
raise SSLError(e, request=request)
requests.exceptions.SSLError: [SSL: DECRYPTION_FAILED_OR_BAD_RECORD_MAC] decryption failed or bad record mac (_ssl.c:1769)
Any help is appreciated. Thanks for your time.

The simple solution here would be to use a threading interface instead of multiprocessing. The SSL error most likely comes from the forked worker processes reusing the parent's TLS connection to Elasticsearch; threads in a single process avoid that (and a per-process client, sketched after the code below, is another way around it).
import threading
import queue

q = queue.Queue()  # module-level so the worker threads can reach it

def read_sample(filename):
    '''my code to read from the file and yield the elements'''

def elasticinsert(filename):
    '''some operations'''
    q.put(filename)  # can be any data you want to put

def main():
    threads = []
    for i in list_of_sample_files:
        t = threading.Thread(target=elasticinsert, args=(i,))
        threads.append(t)
        t.start()
    [t.join() for t in threads]
    while not q.empty():
        q.get()
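As an alternative to switching to threads, a common way around this class of SSL error is to give each worker process its own Elasticsearch client instead of sharing one created in the parent. The sketch below is an assumption, not part of the original answer; it reuses read_sample and list_of_sample_files from the question, and the host URL is a placeholder.

import multiprocessing
from collections import deque
from elasticsearch import Elasticsearch, helpers

def elasticinsert(filename):
    # create the client inside the worker so no TLS connection is shared across forks
    es = Elasticsearch(["https://localhost:9200"])  # hypothetical host
    deque(helpers.parallel_bulk(es, read_sample(filename), index="sample", doc_type="samples"), maxlen=0)

def main():
    with multiprocessing.Pool(processes=2, maxtasksperchild=1) as pool:
        pool.map(elasticinsert, list_of_sample_files)

if __name__ == "__main__":
    main()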

Related

How to share the files (files object) between the various processes in python?

I'm using multiprocessing: there are a lot of JSON files that I want to read and write from various processes. I don't want a race condition, so the processes need to be synchronised.
I'm trying the following dummy code, but I don't know why it is not working. I'm using a multiprocessing Queue to share the open file object. Could you suggest what I'm doing wrong? I'm getting an error, and I'm new to multiprocessing.
Below is my code:
from multiprocessing import Queue, Process, Lock

def writeTofile(q, lock, i):
    print(f'some work by {i}')
    text = f" Process {i} -- "
    ans = ""
    for i in range(10000):
        ans += text
    # critical section
    lock.acquire()
    file = q.get()
    q.put(file)
    file.write(ans)
    lock.release()
    print(f'updated by process {i}')

def main():
    q = Queue()
    lock = Lock()
    jobs = []
    with open("test.txt", mode='a') as file:
        q.put(file)
        for i in range(4):
            process = Process(target=writeTofile, args=(q, lock, i))
            jobs.append(process)
            process.start()
        for j in jobs:
            j.join()
        print('completed')

if __name__ == "__main__":
    main()
This is the error I'm getting:
Traceback (most recent call last):
File "/Users/akshaysingh/Desktop/ipnb/multi-processing.py", line 42, in <module>
main()
File "/Users/akshaysingh/Desktop/ipnb/multi-processing.py", line 27, in main
q.put(file)
File "<string>", line 2, in put
File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/managers.py", line 808, in _callmethod
conn.send((self._id, methodname, args, kwds))
File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/connection.py", line 211, in send
self._send_bytes(_ForkingPickler.dumps(obj))
File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/reduction.py", line 51, in dumps
cls(buf, protocol).dump(obj)
TypeError: cannot pickle '_io.TextIOWrapper' object
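A minimal sketch of a common workaround (an assumption, not part of the question): open file handles cannot be pickled and sent over a Queue, so pass the filename instead and let each process open the file itself, guarding the append with a Lock.

from multiprocessing import Process, Lock

def write_to_file(lock, filename, i):
    text = f" Process {i} -- " * 10000
    with lock:                      # critical section: one writer at a time
        with open(filename, mode='a') as f:
            f.write(text)
    print(f'updated by process {i}')

def main():
    lock = Lock()
    jobs = [Process(target=write_to_file, args=(lock, "test.txt", i)) for i in range(4)]
    for p in jobs:
        p.start()
    for p in jobs:
        p.join()
    print('completed')

if __name__ == "__main__":
    main()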

python multiprocessing fastq function

I am a new user of the multiprocessing module in Python 3.
I have 2 fastq files (forward and reverse) and I want to process forward/reverse pairs of reads. For each forward read, I get the corresponding reverse read and apply a function with multiple arguments to the pair. So far I've done it sequentially on one thread, which takes quite long for huge files. Now I would like to improve speed by parallelising the function application, so I create chunks of the forward file and apply the function to each chunk using multiprocessing. Here is the code:
import os
import multiprocessing as mp
from Bio import SeqIO

def chunk_itr(iterator, chunk_size):
    """
    Function to split a fastq file into smaller files for faster processing.
    From biopython solutions
    """
    entry = True
    while entry:
        chunk = []
        while len(chunk) < chunk_size:
            try:
                entry = next(iterator)
            except StopIteration:
                entry = None
            if entry is None:
                break
            chunk.append(entry)
        if chunk:
            yield chunk

def chunk_fastq(f_fastq, chunkSize, path2out):
    rec_itr = SeqIO.parse(open(f_fastq), "fastq")
    os.mkdir(os.path.join(path2out, "chunk_files"))
    dir_out = os.path.join(path2out, "chunk_files")
    base = os.path.basename(f_fastq)
    fname = os.path.splitext(base)[0]
    for i, chunk in enumerate(chunk_itr(rec_itr, chunkSize)):
        out_chunk_name = os.path.join(dir_out, "{0}_chunk{1}.fastq".format(fname, i))
        with open(out_chunk_name, "w") as handle:
            SeqIO.write(chunk, handle, "fastq")

def testmulti(fwd_chunk, rev_idx):
    fwd_idx = SeqIO.index(fwd_chunk, "fastq")
    for i in fwd_idx:
        print(i, rev_idx[i])

pathfwd = "path/to/forward_file"
f_rev = "path/to/rev_fastq"

def main():
    rev_idx = SeqIO.index(f_rev, "fastq")
    chunk_fastq(pathfwd, 1000, path2chunk)
    files = [f for f in os.listdir(path2chunk)]
    # sequential
    for i in files:
        testmulti(i, rev_idx)
    # parallel process
    processes = []
    for i in files:
        proc = mp.Process(target=testmulti, args=(i, rev_idx,))
        processes.append(proc)
        proc.start()
    for p in processes:
        p.join()
The sequential approach works fine, but the parallel one crashes with the following error:
Process Process-2:
Traceback (most recent call last):
Traceback (most recent call last):
File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "test.py", line 28, in testmulti
print(i, rev_idx[i])
File "test.py", line 28, in testmulti
print(i, rev_idx[i])
File "/home/user/.local/lib/python3.6/site-packages/Bio/File.py", line 417, in __getitem__
record = self._proxy.get(self._offsets[key])
File "/home/user/.local/lib/python3.6/site-packages/Bio/File.py", line 417, in __getitem__
record = self._proxy.get(self._offsets[key])
File "/home/user/.local/lib/python3.6/site-packages/Bio/SeqIO/_index.py", line 69, in get
return self._parse(StringIO(_bytes_to_string(self.get_raw(offset))))
File "/home/user/.local/lib/python3.6/site-packages/Bio/SeqIO/_index.py", line 69, in get
return self._parse(StringIO(_bytes_to_string(self.get_raw(offset))))
File "/home/user/.local/lib/python3.6/site-packages/Bio/SeqIO/_index.py", line 664, in get_raw
raise ValueError("Problem with quality section")
File "/home/user/.local/lib/python3.6/site-packages/Bio/SeqIO/_index.py", line 642, in get_raw
raise ValueError("Premature end of file in seq section")
ValueError: Problem with quality section
ValueError: Premature end of file in seq section
From the Index class description in Biopython, it seems there is an issue with the file format/structure.
I double checked the input files and there are no errors (and it works with the sequential approach).
My guesses so far:
using Process like this is not a good option (I also tried pool.starmap, but without success)
since f_rev is indexed once and then each process tries to use it in parallel, there is a conflict
Any help would be appreciated.
Thank you!
OK, so I am still not 100% sure of the cause of the error, but after increasing the size of my fastq files I was able to replicate it.
It definitely has to do with the reverse index object created with SeqIO.index; however, I'm struggling to fully grasp what exactly that looks like from the source code, as there is a lot of inheritance going on. I suspect it is something to do with passing an open file-handle object to the child processes, but again I'm not well-versed enough in that side of things to guarantee it.
However, I can successfully prevent the error. The solution is to move the creation of the reverse index into the child process, and I don't see any good reason not to: the whole point of SeqIO.index is that it creates a low-memory index rather than reading the whole file into memory, so creating one per child process shouldn't be excessively expensive.
def testmulti(fwd_chunk, rev):
    rev_idx = SeqIO.index(rev, "fastq")
    fwd_idx = SeqIO.index(fwd_chunk, "fastq")
    for i in fwd_idx:
        print(i, rev_idx[i])

pathfwd = "path/to/forward_file"
f_rev = "path/to/rev_fastq"

def main():
    chunk_fastq(pathfwd, 1000, path2chunk)
    files = [f for f in os.listdir(path2chunk)]
    # sequential
    for i in files:
        testmulti(i, f_rev)
    # parallel process
    processes = []
    for i in files:
        proc = mp.Process(target=testmulti, args=(i, f_rev,))
        processes.append(proc)
        proc.start()
    for p in processes:
        p.join()
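A small variant of the same idea, sketched here as an assumption rather than part of the original answer: a Pool with starmap bounds the number of worker processes instead of starting one process per chunk, which matters when there are many chunk files. It reuses testmulti, path2chunk and f_rev as defined above.

import os
import multiprocessing as mp

def main():
    # build full paths to the chunk files and hand each (chunk, reverse-file) pair to a worker
    files = [os.path.join(path2chunk, f) for f in os.listdir(path2chunk)]
    with mp.Pool(processes=4) as pool:
        pool.starmap(testmulti, [(f, f_rev) for f in files])

if __name__ == "__main__":
    main()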

Why am I getting a ValueError: too many file descriptors in select()?

I load my proxies into the proxies variable and try to do async requests to get the IP. It's simple:
import time
import asyncio
import aiohttp

async def get_ip(proxy):
    timeout = aiohttp.ClientTimeout(connect=5)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        try:
            async with session.get('https://api.ipify.org?format=json', proxy=proxy, timeout=timeout) as response:
                json_response = await response.json()
                print(json_response)
        except:
            pass

if __name__ == "__main__":
    proxies = []
    start_time = time.time()
    loop = asyncio.get_event_loop()
    tasks = [asyncio.ensure_future(get_ip(proxy)) for proxy in proxies]
    loop.run_until_complete(asyncio.wait(tasks))
    print('time spent to work: {} sec --------------'.format(time.time() - start_time))
This code works fine when I try to do 100-200-300-400 requests, but when the count is more than 500 I always get this error:
Traceback (most recent call last):
File "async_get_ip.py", line 60, in <module>
loop.run_until_complete(asyncio.wait(tasks))
File "C:\Python37\lib\asyncio\base_events.py", line 571, in run_until_complete
self.run_forever()
File "C:\Python37\lib\asyncio\base_events.py", line 539, in run_forever
self._run_once()
File "C:\Python37\lib\asyncio\base_events.py", line 1739, in _run_once
event_list = self._selector.select(timeout)
File "C:\Python37\lib\selectors.py", line 323, in select
r, w, _ = self._select(self._readers, self._writers, [], timeout)
File "C:\Python37\lib\selectors.py", line 314, in _select
r, w, x = select.select(r, w, w, timeout)
ValueError: too many file descriptors in select()
I was looking for a solution, but all I found was a limitation at the OS level. Can I somehow get around this problem without using additional libraries?
It's not a good idea to start an unlimited number of requests simultaneously. Each started request consumes some resources, from CPU/RAM to the OS's select() capacity, which, as in your case, will sooner or later lead to problems.
To avoid this you should use asyncio.Semaphore, which allows you to limit the maximum number of simultaneous connections.
I believe only a few changes need to be made to your code:
sem = asyncio.Semaphore(50)

async def get_ip(proxy):
    async with sem:
        # ...
Here's a fuller, more complex example of how to use a semaphore in general.
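For context, here is a minimal sketch of how the semaphore might slot into the code from the question (my assumption of the wiring, not the linked example); the limit of 50 is arbitrary.

import asyncio
import aiohttp

sem = asyncio.Semaphore(50)  # at most 50 requests in flight at once

async def get_ip(proxy):
    timeout = aiohttp.ClientTimeout(connect=5)
    async with sem:  # wait here until a slot is free
        async with aiohttp.ClientSession(timeout=timeout) as session:
            try:
                async with session.get('https://api.ipify.org?format=json',
                                       proxy=proxy, timeout=timeout) as response:
                    print(await response.json())
            except Exception:
                pass

if __name__ == "__main__":
    proxies = []  # fill with proxy URLs
    loop = asyncio.get_event_loop()
    tasks = [asyncio.ensure_future(get_ip(p)) for p in proxies]
    loop.run_until_complete(asyncio.wait(tasks))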
P.S.
except:
    pass
You should never do such a thing; it'll just break the code sooner or later.
At the very least, use except Exception.

Python Multiprocessing( TypeError: cannot serialize '_io.BufferedReader' object )

I'm trying to run a dictionary attack on a zip file using Pool to increase speed.
But I get the following error in Python 3.6, while it works in Python 2.7:
Traceback (most recent call last):
File "zip_crack.py", line 42, in <module>
main()
File "zip_crack.py", line 28, in main
for result in results:
File "/usr/lib/python3.6/multiprocessing/pool.py", line 761, in next
raise value
File "/usr/lib/python3.6/multiprocessing/pool.py", line 450, in _ handle_tasks
put(task)
File "/usr/lib/python3.6/multiprocessing/connection.py", line 206, in send
self._send_bytes(_ForkingPickler.dumps(obj))
File "/usr/lib/python3.6/multiprocessing/reduction.py", line 51, in dumps
cls(buf, protocol).dump(obj)
TypeError: cannot serialize '_io.BufferedReader' object
I tried to search for the same error but couldn't find an answer that helps here.
The code looks like this:
import time
import zipfile
from functools import partial
from multiprocessing import Pool

def crack(pwd, f):
    try:
        key = pwd.strip()
        f.extractall(pwd=key)
        return True
    except:
        pass

z_file = zipfile.ZipFile("../folder.zip")
with open('words.dic', 'r') as passes:
    start = time.time()
    lines = passes.readlines()
    pool = Pool(50)
    results = pool.imap_unordered(partial(crack, f=z_file), lines)
    pool.close()
    for result in results:
        if result:
            pool.terminate()
            break
    pool.join()
I also tried another approach using map:
import contextlib
with contextlib.closing(Pool(50)) as pool:
    pool.map(partial(crack, f=z_file), lines)
which worked great and found passwords quickly in Python 2.7, but it throws the same exception in Python 3.6.
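A minimal sketch of a common workaround, offered here as an assumption rather than an accepted answer: the open ZipFile handle cannot be pickled in Python 3, so pass the archive path to the workers and open the archive inside each one.

import zipfile
from functools import partial
from multiprocessing import Pool

def crack(pwd, zip_path):
    try:
        # open the archive inside the worker so nothing unpicklable crosses process boundaries
        with zipfile.ZipFile(zip_path) as z:
            z.extractall(pwd=pwd.strip().encode())  # extractall expects bytes in Python 3
        return pwd.strip()
    except Exception:
        return None

if __name__ == "__main__":
    with open('words.dic', 'r') as passes:
        lines = passes.readlines()
    with Pool(8) as pool:
        for result in pool.imap_unordered(partial(crack, zip_path="../folder.zip"), lines):
            if result:
                print("found:", result)
                pool.terminate()
                break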

python hangs even with exception handling

I've got a Raspberry Pi attached to an MCP3008 ADC which is measuring an analog voltage across a thermistor. I'm using the gpiozero Python library for communication between the Pi and the ADC. My code below runs for several minutes, then spits out an error and hangs in the function get_temp_percent. That function returns the average of five measurements from the ADC. I'm using signal to raise an exception after 1 second of waiting to try to get past the hang, but it just throws an error and hangs anyway. It looks like nothing in my except block is being executed. Why am I not escaping the hang?
import time
from gpiozero import MCP3008
from math import log
import pymysql.cursors
from datetime import datetime as dt
import signal
import os

def handler(signum, frame):
    print('Signal handler called with signal', signum, frame)
    raise Exception("Something went wrong!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")

def get_temp_percent(pos=0):
    x = []
    for i in range(0, 5):
        while True:
            try:
                signal.signal(signal.SIGALRM, handler)
                signal.alarm(1)
                adc = MCP3008(pos)
                x.append(adc.value)
                #adc.close()
            except Exception as inst:
                print('get_temp_percent {}'.format(inst))
                signal.alarm(0)
                continue
            break
        signal.alarm(0)
        time.sleep(.1)
    return round(sum(x)/len(x), 5)

def write_date(temp0):
    # <writes temp0 to mysql db>
    # Connect to the database
    connection = pymysql.connect(host='', user='', password='', db='', cursorclass=pymysql.cursors.DictCursor)

while True:
    temp_percent = get_temp_percent()
    print('Temp Percent = {}'.format(temp_percent))
    # <some function that does some arithmetic to go from temp_percent to temp0>
    write_date(temp0)
    print('Data Written')
    time.sleep(1)
    print('Sleep time over')
    print('')
The function get_temp_percent causes the problem below:
Signal handler called with signal 14 <frame object at 0x76274800>
Exception ignored in: <bound method SharedMixin.__del__ of SPI(closed)>
Traceback (most recent call last):
File "/usr/lib/python3/dist-packages/gpiozero/mixins.py", line 137, in __del__
super(SharedMixin, self).__del__()
File "/usr/lib/python3/dist-packages/gpiozero/devices.py", line 122, in __del__
self.close()
File "/usr/lib/python3/dist-packages/gpiozero/devices.py", line 82, in close
old_close()
File "/usr/lib/python3/dist-packages/gpiozero/pins/local.py", line 102, in close
self.pin_factory.release_all(self)
File "/usr/lib/python3/dist-packages/gpiozero/pins/__init__.py", line 85, in release_all
with self._res_lock:
File "/home/pi/Desktop/testing exceptions.py", line 13, in handler
raise Exception("Something went wrong!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
Exception: Something went wrong!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
It looks like your call to gpiozero does a lot of work behind the scenes.
When your exception is processed, the library is trying to clean up and gets stuck.
I took a quick look at the docs for the library and it looks like you may be able to keep hold of the pins so you can re-use them.
e.g.
import ...

adcs = {}

def get_adc_value(pos):
    if pos not in adcs:
        adcs[pos] = MCP3008(pos)
    return adcs[pos].value

def get_temp_percent(pos=0):
    x = []
    for i in range(0, 5):
        x.append(get_adc_value(pos))
        time.sleep(.1)
    return round(sum(x)/len(x), 5)

while True:
    temp_percent = get_temp_percent()
    ...
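As a small follow-up to this approach (my own assumption, not part of the answer above), the cached devices can also be closed explicitly when the program exits so the SPI/GPIO resources are released cleanly:

import atexit
from gpiozero import MCP3008

adcs = {}

def get_adc_value(pos):
    if pos not in adcs:
        adcs[pos] = MCP3008(pos)   # create each channel's device once and reuse it
    return adcs[pos].value

@atexit.register
def close_adcs():
    # release the SPI/GPIO resources held by the cached devices on exit
    for adc in adcs.values():
        adc.close()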
