Preface: This is my first attempt at using these tools
Context: I have a very large file I am trying to process. So I am attempting to break the file up into smaller chunks. Then load those files into a queue for processing.
The goals was to speed up what is a very slow process.
Code:
import lifetimes
import os
import pandas
import Queue
import threading
import multiprocessing
import glob
import subprocess
#move master to processing dir
os.system("cp /data/ltv-testing1.csv /data/out")
#break master csv into 1 million row chunks
subprocess.call(['bash', '/home/ddewberry/LTV_CSV_Split.sh'])
#remove master file
os.remove("/data/out/ltv-testing1.csv")
os.chdir("/data/out")
# Create List of Files
worker_data = glob.glob('split_*')
#build queue with file list
q = Queue.Queue(worker_data)
#import tools for data processing
from lifetimes.utils import summary_data_from_transaction_data
#define worker for threads
def worker(outfile = '/data/in/Worker.csv'):
while True:
item = q.get()
data = pandas.read_csv(item)
summary = summary_data_from_transaction_data(data, data[[2]], data[[1]])
summary.to_csv(outfile%s % (item))
q.task_done()
cpus=multiprocessing.cpu_count() #detect number of cores
print("Creating %d threads" % cpus)
for i in range(cpus):
t = threading.Thread(target=worker)
t.daemon = True
t.start()
q.join()
#clean up
for row in worker_data:
os.remove(row)
Problem:
I don't get any error messages however it doesn't work at all. (It basically does nothing)
I am very confused at what I did wrong or what I need to fix.
import lifetimes
import os
import pandas
import Queue
import threading
import multiprocessing
import glob
import subprocess
#move master to processing dir
os.system("cp /data/ltv-testing1.csv /data/out")
#break master csv into 1 million row chunks
subprocess.call(['bash', '/home/ddewberry/LTV_CSV_Split.sh'])
#remove master file
os.remove("/data/out/ltv-testing1.csv")
os.chdir("/data/out")
# Create List of Files
worker_data = glob.glob('split_*')
# rename to csv
for row in worker_data:
os.rename(row, row+'.csv')
worker_data1 = glob.glob('split_*')
#build queue with file list
q = Queue.Queue()
for files in worker_data1:
q.put(files)
Related
Hello I am working on a simulation of a buffer where I need to use threads and locks. So I created two function one so the consumer gets his trail and the second one is once he gets his trail he can go to the next line to get his meal.
However my code never stops running and never goes to the second function were he could get his meal.
from concurrent.futures import thread
import random
import threading
import time
import concurrent.futures
import logging
import traceback
from numpy import number
#Creating the two queues with 50 students
consumers = [x+1 for x in range(50)]
trail = []
meal = []
#putting the locks for both queues
meal_lock = threading.Lock()
trail_lock = threading.Lock()
def trail(x):
global trail_lock
while True:
trail_lock.acquire()
trail.append(x)
if x in trail:
print(f"Consumer {x} Got his trail")
trail_lock.release()
def meal(x):
global meal_lock
while True:
meal_lock.acquire()
if x in trail:
trail.remove(x)
print("Got his meal")
meal.append(x)
meal_lock.release()
break
number_of_meals = 5
number_of_trails = 5
with concurrent.futures.ThreadPoolExecutor(max_workers=number_of_trails) as executor:
executor.map(trail, range(number_of_trails))
with concurrent.futures.ThreadPoolExecutor(max_workers=number_of_meals) as executor:
executor.map(meal, range(1+y,number_of_meals))
I created a code (using Tkinter, Python3 and matplotlid) that could read data from different serial ports, save them to csv, then create graphs and finally preview data in GUI. The code was splited in two different scripts. The main script contained reading data, save data to csv an priview of data and the other script contained the graph creation.
Today I rewrote the code using the answer of #user2464430 here. The code is working, but I can't update the GUI. Opens once and then no refresh with new data.
The following code is a part of total code.
My code is:
from PIL import ImageTk, Image
import tkinter as Tk
import multiprocessing
from queue import Empty, Full
from time import strftime
import serial
import numpy as np
import matplotlib.pyplot as plt
from drawnow import *
from pylab import *
import pandas as pd
from datetime import timedelta
from datetime import datetime
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import locale
import os
class GuiApp(object):
def __init__(self, image):
self.root = Tk.Tk()
self.root.resizable(width=False, height=False)
self.root.geometry("1600x800+0+0")
C = Canvas(self.root, bg="black", width=1600, height=800)
def BasicLabels():
....... # in this stage create multiple axis labels
Î¥AxisLabels()
BasicLabels()
def ValueLabels():
....... # Read and munipulate datas from CSV file and print in in labels
ValueLabels()
C.pack()
def GenerateData(q): #Read Serial Ports and store data to CSV file
file_exists = os.path.isfile("BigData.csv")
header = [["Daytime,T1"]]
if not file_exists:
with open("BigData.csv", "a+") as csvfile:
np.savetxt(csvfile, header, delimiter=",", fmt="%s", comments="")
while True:
try:
ser1 = serial.Serial(port="COM4", baudrate=9600)
read_ser1 = ser1.readline()
if read_ser1 == "":
read_ser1 = "Missing Value"
else:
read_ser1 = ser1.readline()
read_ser1 = str(read_ser1[0 : len(read_ser1)].decode("utf-8"))
# print("COM4:", read_ser1)
ser1.close()
except:
print("Failed 1")
read_ser1 = "9999,9999,9999,9999,9999"
daytime = strftime(" %d-%m-%Y %H:%M:%S")
rows = [
daytime
+ ","
+ read_ser1.strip()
]
with open("BigData.csv", "a+") as csvfile:
np.savetxt(csvfile, rows, delimiter=",", fmt="%s", comments="")
CreateGraphs()
def CreateGraphs():
#Code to generate graph. Called every time i have new line in CSV.
if __name__ == "__main__":
# Queue which will be used for storing Data
q = multiprocessing.Queue()
q.cancel_join_thread() # or else thread that puts data will not term
gui = GuiApp(q)
t1 = multiprocessing.Process(target=GenerateData, args=(q,))
t1.start()
gui.root.mainloop()
t1.join()
The graphs are generating after while True in GenerateData.
All datas for labels and graphs are coming from CSV file and not directly from serial port.
Is it possible to update GUI with latest datas from CSV and created graphs?
Thank for your time.
I have written a small programm that uses a pipe.
The parent takes care of camera connection while the child is processing the images.
The child process calls two functions FunctionA and FunctionB. Both times the image from the parent is processed.
I want to run the two functions as fast as possible since they are time consuming 0.1s, 0.12s. I tried multiprocessing.Processes and multiprocessing.pools. The bottle neck in the first case is the creation of the two processes for each image passed by the parent. Pools only make sense if I would have a list of images which I cant wait for since the speed up from the pipe will be wasted which is already quite good -3.123 seconds. Does somebody has a smart solution for this sort of problem?
Kind regards :)
Example code:
import multiprocessing
import cv2
import glob
import cv2
from multiprocessing import pool
from multiprocessing.dummy import Pool as ThreadPool
import glob
import time
from ProcA import FunctionA
from ProcB import FunctionB
import Lines
import Feature
#===============================================================================
# Test Pipe
#===============================================================================
def cam_loop(pipe_parent):
imagePathes = glob.glob("Images\*.jpg")
for path in imagePathes:
image = cv2.imread(path)
pipe_parent.send(image)
StringFromChild = pipe_parent.recv()
print("StringFromChild:",StringFromChild)
def show_loop(pipe_child):
#cv2.namedWindow('pepe')
proc = Preprocessor.Preprocessor()
line = Lines.Lines()
features = Feature.FeatureDetector()
imgIdx = 0
Q_Barcode = multiprocessing.Queue(10)
Q_CapFeatures = multiprocessing.Queue(100)
while True:
image = pipe_child.recv()
start = time.time()
#Calculating features
#Create processes
p1 = multiprocessing.Process(target = FunctionA, args = (proc, image, imgIdx, None, None, None,))
p1.start()
p2 = multiprocessing.Process(target = FunctionB, args = (line, proc, features, image, imgIdx,))
p2.start()
p1.join()
p2.join()
#send features calculated to parent
pipe_child.send("OK")
end = time.time()
print("TimeMultiProc",end - start)
start = time.time()
#Calculating feature
FunctionA(proc, image, imgIdx, None, None, None)
FunctionB(line, proc, features, image, imgIdx)
#send features calculated to parent
end = time.time()
print("TimeSerial",end - start)
if __name__ == '__main__':
logger = multiprocessing.log_to_stderr()
logger.setLevel(multiprocessing.SUBDEBUG)
pipe_parent, pipe_child = multiprocessing.Pipe()
cam_process = multiprocessing.Process(target=cam_loop,args=(pipe_parent, ))
cam_process.start()
show_process = multiprocessing.Process(target=show_loop,args=(pipe_child, ))
show_process.start()
cam_process.join()
show_loop.join()
I am following the principles laid down in this post to safely output the results which will eventually be written to a file. Unfortunately, the code only print 1 and 2, and not 3 to 6.
import os
import argparse
import pandas as pd
import multiprocessing
from multiprocessing import Process, Queue
from time import sleep
def feed(queue, parlist):
for par in parlist:
queue.put(par)
print("Queue size", queue.qsize())
def calc(queueIn, queueOut):
while True:
try:
par=queueIn.get(block=False)
res=doCalculation(par)
queueOut.put((res))
queueIn.task_done()
except:
break
def doCalculation(par):
return par
def write(queue):
while True:
try:
par=queue.get(block=False)
print("response:",par)
except:
break
if __name__ == "__main__":
nthreads = 2
workerQueue = Queue()
writerQueue = Queue()
considerperiod=[1,2,3,4,5,6]
feedProc = Process(target=feed, args=(workerQueue, considerperiod))
calcProc = [Process(target=calc, args=(workerQueue, writerQueue)) for i in range(nthreads)]
writProc = Process(target=write, args=(writerQueue,))
feedProc.start()
feedProc.join()
for p in calcProc:
p.start()
for p in calcProc:
p.join()
writProc.start()
writProc.join()
On running the code it prints,
$ python3 tst.py
Queue size 6
response: 1
response: 2
Also, is it possible to ensure that the write function always outputs 1,2,3,4,5,6 i.e. in the same order in which the data is fed into the feed queue?
The error is somehow with the task_done() call. If you remove that one, then it works, don't ask me why (IMO that's a bug). But the way it works then is that the queueIn.get(block=False) call throws an exception because the queue is empty. This might be just enough for your use case, a better way though would be to use sentinels (as suggested in the multiprocessing docs, see last example). Here's a little rewrite so your program uses sentinels:
import os
import argparse
import multiprocessing
from multiprocessing import Process, Queue
from time import sleep
def feed(queue, parlist, nthreads):
for par in parlist:
queue.put(par)
for i in range(nthreads):
queue.put(None)
print("Queue size", queue.qsize())
def calc(queueIn, queueOut):
while True:
par=queueIn.get()
if par is None:
break
res=doCalculation(par)
queueOut.put((res))
def doCalculation(par):
return par
def write(queue):
while not queue.empty():
par=queue.get()
print("response:",par)
if __name__ == "__main__":
nthreads = 2
workerQueue = Queue()
writerQueue = Queue()
considerperiod=[1,2,3,4,5,6]
feedProc = Process(target=feed, args=(workerQueue, considerperiod, nthreads))
calcProc = [Process(target=calc, args=(workerQueue, writerQueue)) for i in range(nthreads)]
writProc = Process(target=write, args=(writerQueue,))
feedProc.start()
feedProc.join()
for p in calcProc:
p.start()
for p in calcProc:
p.join()
writProc.start()
writProc.join()
A few things to note:
the sentinel is putting a None into the queue. Note that you need one sentinel for every worker process.
for the write function you don't need to do the sentinel handling as there's only one process and you don't need to handle concurrency (if you would do the empty() and then get() thingie in your calc function you would run into a problem if e.g. there's only one item left in the queue and both workers check empty() at the same time and then both want to do get() and then one of them is locked forever)
you don't need to put feed and write into processes, just put them into your main function as you don't want to run it in parallel anyway.
how can I have the same order in output as in input? [...] I guess multiprocessing.map can do this
Yes map keeps the order. Rewriting your program into something simpler (as you don't need the workerQueue and writerQueue and adding random sleeps to prove that the output is still in order:
from multiprocessing import Pool
import time
import random
def calc(val):
time.sleep(random.random())
return val
if __name__ == "__main__":
considerperiod=[1,2,3,4,5,6]
with Pool(processes=2) as pool:
print(pool.map(calc, considerperiod))
I have a function that yields lines from a huge CSV file lazily:
def get_next_line():
with open(sample_csv,'r') as f:
for line in f:
yield line
def do_long_operation(row):
print('Do some operation that takes a long time')
I need to use threads such that each record I get from the above function I can call do_long_operation.
Most places on Internet have examples like this, and I am not very sure if I am on the right path.
import threading
thread_list = []
for i in range(8):
t = threading.Thread(target=do_long_operation, args=(get_next_row from get_next_line))
thread_list.append(t)
for thread in thread_list:
thread.start()
for thread in thread_list:
thread.join()
My questions are:
How do I start only a finite number of threads, say 8?
How do I make sure that each of the threads will get a row from get_next_line?
You could use a thread pool from multiprocessing and map your tasks to a pool of workers:
from multiprocessing.pool import ThreadPool as Pool
# from multiprocessing import Pool
from random import randint
from time import sleep
def process_line(l):
print l, "started"
sleep(randint(0, 3))
print l, "done"
def get_next_line():
with open("sample.csv", 'r') as f:
for line in f:
yield line
f = get_next_line()
t = Pool(processes=8)
for i in f:
t.map(process_line, (i,))
t.close()
t.join()
This will create eight workers and submit your lines to them, one by one. As soon as a process is "free", it will be allocated a new task.
There is a commented out import statement, too. If you comment out the ThreadPool and import Pool from multiprocessing instead, you will get subprocesses instead of threads, which may be more efficient in your case.
Using a Pool/ThreadPool from multiprocessing to map tasks to a pool of workers and a Queue to control how many tasks are held in memory (so we don't read too far ahead into the huge CSV file if worker processes are slow):
from multiprocessing.pool import ThreadPool as Pool
# from multiprocessing import Pool
from random import randint
import time, os
from multiprocessing import Queue
def process_line(l):
print("{} started".format(l))
time.sleep(randint(0, 3))
print("{} done".format(l))
def get_next_line():
with open(sample_csv, 'r') as f:
for line in f:
yield line
# use for testing
# def get_next_line():
# for i in range(100):
# print('yielding {}'.format(i))
# yield i
def worker_main(queue):
print("{} working".format(os.getpid()))
while True:
# Get item from queue, block until one is available
item = queue.get(True)
if item == None:
# Shutdown this worker and requeue the item so other workers can shutdown as well
queue.put(None)
break
else:
# Process item
process_line(item)
print("{} done working".format(os.getpid()))
f = get_next_line()
# Use a multiprocessing queue with maxsize
q = Queue(maxsize=5)
# Start workers to process queue items
t = Pool(processes=8, initializer=worker_main, initargs=(q,))
# Enqueue items. This blocks if the queue is full.
for l in f:
q.put(l)
# Enqueue the shutdown message (i.e. None)
q.put(None)
# We need to first close the pool before joining
t.close()
t.join()
Hannu's answer is not the best method.
I ran the code on a 100M rows CSV file. It took me forever to perform the operation.
However, prior to reading his answer, I had written the following code:
def call_processing_rows_pickably(row):
process_row(row)
import csv
from multiprocessing import Pool
import time
import datetime
def process_row(row):
row_to_be_printed = str(row)+str("hola!")
print(row_to_be_printed)
class process_csv():
def __init__(self, file_name):
self.file_name = file_name
def get_row_count(self):
with open(self.file_name) as f:
for i, l in enumerate(f):
pass
self.row_count = i
def select_chunk_size(self):
if(self.row_count>10000000):
self.chunk_size = 100000
return
if(self.row_count>5000000):
self.chunk_size = 50000
return
self.chunk_size = 10000
return
def process_rows(self):
list_de_rows = []
count = 0
with open(self.file_name, 'rb') as file:
reader = csv.reader(file)
for row in reader:
print(count+1)
list_de_rows.append(row)
if(len(list_de_rows) == self.chunk_size):
p.map(call_processing_rows_pickably, list_de_rows)
del list_de_rows[:]
def start_process(self):
self.get_row_count()
self.select_chunk_size()
self.process_rows()
initial = datetime.datetime.now()
p = Pool(4)
ob = process_csv("100M_primes.csv")
ob.start_process()
final = datetime.datetime.now()
print(final-initial)
This took 22 minutes. Obviously, I need to have more improvements. For example, the Fred library in R takes 10 minutes maximum to do this task.
The difference is: I am creating a chunk of 100k rows first, and then I pass it to a function which is mapped by threadpool(here, 4 threads).