I'm pretty much brand new to python and I have been working on a script that parses the csv files in any given directory. After I implemented a queue and threads, I've been stuck on this issue of the threads not picking up new work, even though there are still items in the queue. For example, if I specify the max # of threads as 3, and there are 6 items in the queue, the threads pick up 3 files, process them, then hang, indefinitely. I may just be conceptually misunderstanding the multithreading process.
ETA:
Some of the code has been removed for security reasons.
q = Queue.Queue()
threads = []
for file in os.listdir(os.chdir(arguments.path)):
if (file.endswith('.csv')):
q.put(file)
for i in range(max_threads):
worker = threading.Thread(target=process, name='worker-{}'.format(thread_count))
worker.setDaemon(True)
worker.start()
threads.append(worker)
thread_count += 1
q.join()
def process():
with open(q.get()) as csvfile:
#do stuff
q.task_done()
You forgot a to loop over the Queue in your threads...
def process():
while True: #<---------------- keep getting stuff from the queue
with open(q.get()) as csvfile:
#do stuff
q.task_done()
That said, You are maybe re-inventing the wheel, try using a Thread Pool:
from concurrent.futures import ThreadPoolExecutor
l = [] # a list should do it ...
for file in os.listdir(arguments.path):
if (file.endswith('.csv')):
l.append(file)
def process(file):
return "this is the file i got %s" % file
with ThreadPoolExecutor(max_workers=4) as e:
results = list(e.map(process, l))
Related
Im trying to generate data in two threads and get that data in a separate thread that prints the data.
3 threads, 2 threads generate data , 1 thread consumes the data generated.
The Problem: not getting both generated data into the consumer thread
How can I pass data generated in 2 threads and deliver it in the consumer thread?
#from threading import Thread
import concurrent.futures
import time
# A thread that produces data
def producer(out_q):
while True:
# Produce some data
global data
data = data + 2
out_q.put(data)
# Another thread that produces data
def ac(out2_q):
while True:
global x
x = x + 898934567
out2_q.put(data)
# A thread that consumes data
def consumer(in_q):
while True:
# Get BOTH produced data from 2 threads
data = in_q.get()
# Process the data
time.sleep(.4)
print(data, end=' ', flush=True)
x=0
data = 0
q = Queue()
with concurrent.futures.ThreadPoolExecutor() as executor:
t1 = executor.submit(consumer, q)
t2 = executor.submit(producer,q)
t3 = executor.submit(ac, q)```
I recommend to go with threading.Thread in this case. Please see the code below and follow comments. Feel free to ask questions.
from threading import Thread, Event
from queue import Queue
import time
def producer_one(q: Queue, e: Event):
while not e.is_set():
q.put("one")
time.sleep(1)
print("Producer # one stopped")
def producer_two(q: Queue, e: Event):
while not e.is_set():
q.put("two")
time.sleep(2)
print("Producer # two stopped")
def consumer(q: Queue):
while True:
item = q.get()
print(item)
q.task_done() # is used to unblock queue - all tasks were done
time.sleep(2)
# will never be printed ! - since it is daemon thread
print("All work is done by consumer!")
if __name__ == '__main__':
_q = Queue() # "connects" threads
_e = Event() # is used to stop producers from the Main Thread
# create threads block
producer_th1 = Thread(target=producer_one, args=(_q, _e, ))
producer_th2 = Thread(target=producer_two, args=(_q, _e, ))
# daemon means that thread will be stopped when main thread stops
consumer_th = Thread(target=consumer, args=(_q, ), daemon=True)
try:
# starts block:
producer_th1.start()
producer_th2.start()
consumer_th.start()
time.sleep(20)
_e.set() # ask producers to stop
except KeyboardInterrupt:
_e.set() # ask producer threads to stop
print("Asked Producer Threads to stop")
finally:
producer_th1.join() # main thread is block until producer_th1 is not stopped
producer_th2.join() # main thread is block until producer_th2 is not stopped
_q.join() # now wait consumer to finish all tasks from queue
print("Queue is empty and program will be finished soon")
time.sleep(2) # just wait 2 seconds to show that consumer stops with main thread
print("All done!")
I am trying to scrape some websites using the python's threading and thread-safe queue module. I'm observing an increase in memory usage as I test on more URLs. Below is my code for your reference:
from collections import defaultdict
from queue import Queue
from threading import Thread
import itertools
from time import time
import newspaper
import requests
import pickle
data = defaultdict(list)
def get_links():
return (url for url in pickle.load(open('urls.pkl','rb')))
# for url in urls[:500]:
# yield url
def download_url(url):
try:
resp = requests.get(url)
article = newspaper.Article(resp.url)
article.download(input_html=resp.content)
article.parse()
data['url'].append(url)
data['result'].append(article.text)
except:
pass
class DownloadWorker(Thread):
def __init__(self, queue):
Thread.__init__(self)
self.queue = queue
def run(self):
while True:
# Get the work from the queue and expand the tuple
link = self.queue.get()
try:
download_url(link)
print(link,"done")
finally:
self.queue.task_done()
print(self.queue.qsize())
def main():
ts = time()
links = get_links()
# Create a queue to communicate with the worker threads
queue = Queue()
# Create worker threads
for x in range(4):
worker = DownloadWorker(queue)
# Setting daemon to True will let the main thread exit even though the workers are blocking
worker.daemon = True
worker.start()
# Put the tasks into the queue as a tuple
for link in itertools.islice(links,1000):
queue.put(link)
# Causes the main thread to wait for the queue to finish processing all the tasks
queue.join()
pickle.dump(data, open('scrapped_results.pkl','wb'))
print('Took %s mins' %((time() - ts)/60))
if __name__ == '__main__':
main()
If tested on 100 URLs the memory consumption stays constant at 0.1% but it increases as the more number of URLs are tested (0.2%,0.4%,0.5%). Max URLs I have tested are 1000. The mix of questions I have is below:
Why memory consumption increase?
Is memory increasing because the queue not getting emptied before it gets filled? My understanding of queue is that it empties itself as the data in the queue gets processed.
Is there a way to keep the memory usage constant by the threads?
Is it because of the data in the defaultdict is getting bigger?
Can timeout help here? Where can I declare a timeout?
Is it the newspaper and requests?
I am following the principles laid down in this post to safely output the results which will eventually be written to a file. Unfortunately, the code only print 1 and 2, and not 3 to 6.
import os
import argparse
import pandas as pd
import multiprocessing
from multiprocessing import Process, Queue
from time import sleep
def feed(queue, parlist):
for par in parlist:
queue.put(par)
print("Queue size", queue.qsize())
def calc(queueIn, queueOut):
while True:
try:
par=queueIn.get(block=False)
res=doCalculation(par)
queueOut.put((res))
queueIn.task_done()
except:
break
def doCalculation(par):
return par
def write(queue):
while True:
try:
par=queue.get(block=False)
print("response:",par)
except:
break
if __name__ == "__main__":
nthreads = 2
workerQueue = Queue()
writerQueue = Queue()
considerperiod=[1,2,3,4,5,6]
feedProc = Process(target=feed, args=(workerQueue, considerperiod))
calcProc = [Process(target=calc, args=(workerQueue, writerQueue)) for i in range(nthreads)]
writProc = Process(target=write, args=(writerQueue,))
feedProc.start()
feedProc.join()
for p in calcProc:
p.start()
for p in calcProc:
p.join()
writProc.start()
writProc.join()
On running the code it prints,
$ python3 tst.py
Queue size 6
response: 1
response: 2
Also, is it possible to ensure that the write function always outputs 1,2,3,4,5,6 i.e. in the same order in which the data is fed into the feed queue?
The error is somehow with the task_done() call. If you remove that one, then it works, don't ask me why (IMO that's a bug). But the way it works then is that the queueIn.get(block=False) call throws an exception because the queue is empty. This might be just enough for your use case, a better way though would be to use sentinels (as suggested in the multiprocessing docs, see last example). Here's a little rewrite so your program uses sentinels:
import os
import argparse
import multiprocessing
from multiprocessing import Process, Queue
from time import sleep
def feed(queue, parlist, nthreads):
for par in parlist:
queue.put(par)
for i in range(nthreads):
queue.put(None)
print("Queue size", queue.qsize())
def calc(queueIn, queueOut):
while True:
par=queueIn.get()
if par is None:
break
res=doCalculation(par)
queueOut.put((res))
def doCalculation(par):
return par
def write(queue):
while not queue.empty():
par=queue.get()
print("response:",par)
if __name__ == "__main__":
nthreads = 2
workerQueue = Queue()
writerQueue = Queue()
considerperiod=[1,2,3,4,5,6]
feedProc = Process(target=feed, args=(workerQueue, considerperiod, nthreads))
calcProc = [Process(target=calc, args=(workerQueue, writerQueue)) for i in range(nthreads)]
writProc = Process(target=write, args=(writerQueue,))
feedProc.start()
feedProc.join()
for p in calcProc:
p.start()
for p in calcProc:
p.join()
writProc.start()
writProc.join()
A few things to note:
the sentinel is putting a None into the queue. Note that you need one sentinel for every worker process.
for the write function you don't need to do the sentinel handling as there's only one process and you don't need to handle concurrency (if you would do the empty() and then get() thingie in your calc function you would run into a problem if e.g. there's only one item left in the queue and both workers check empty() at the same time and then both want to do get() and then one of them is locked forever)
you don't need to put feed and write into processes, just put them into your main function as you don't want to run it in parallel anyway.
how can I have the same order in output as in input? [...] I guess multiprocessing.map can do this
Yes map keeps the order. Rewriting your program into something simpler (as you don't need the workerQueue and writerQueue and adding random sleeps to prove that the output is still in order:
from multiprocessing import Pool
import time
import random
def calc(val):
time.sleep(random.random())
return val
if __name__ == "__main__":
considerperiod=[1,2,3,4,5,6]
with Pool(processes=2) as pool:
print(pool.map(calc, considerperiod))
I have a function that yields lines from a huge CSV file lazily:
def get_next_line():
with open(sample_csv,'r') as f:
for line in f:
yield line
def do_long_operation(row):
print('Do some operation that takes a long time')
I need to use threads such that each record I get from the above function I can call do_long_operation.
Most places on Internet have examples like this, and I am not very sure if I am on the right path.
import threading
thread_list = []
for i in range(8):
t = threading.Thread(target=do_long_operation, args=(get_next_row from get_next_line))
thread_list.append(t)
for thread in thread_list:
thread.start()
for thread in thread_list:
thread.join()
My questions are:
How do I start only a finite number of threads, say 8?
How do I make sure that each of the threads will get a row from get_next_line?
You could use a thread pool from multiprocessing and map your tasks to a pool of workers:
from multiprocessing.pool import ThreadPool as Pool
# from multiprocessing import Pool
from random import randint
from time import sleep
def process_line(l):
print l, "started"
sleep(randint(0, 3))
print l, "done"
def get_next_line():
with open("sample.csv", 'r') as f:
for line in f:
yield line
f = get_next_line()
t = Pool(processes=8)
for i in f:
t.map(process_line, (i,))
t.close()
t.join()
This will create eight workers and submit your lines to them, one by one. As soon as a process is "free", it will be allocated a new task.
There is a commented out import statement, too. If you comment out the ThreadPool and import Pool from multiprocessing instead, you will get subprocesses instead of threads, which may be more efficient in your case.
Using a Pool/ThreadPool from multiprocessing to map tasks to a pool of workers and a Queue to control how many tasks are held in memory (so we don't read too far ahead into the huge CSV file if worker processes are slow):
from multiprocessing.pool import ThreadPool as Pool
# from multiprocessing import Pool
from random import randint
import time, os
from multiprocessing import Queue
def process_line(l):
print("{} started".format(l))
time.sleep(randint(0, 3))
print("{} done".format(l))
def get_next_line():
with open(sample_csv, 'r') as f:
for line in f:
yield line
# use for testing
# def get_next_line():
# for i in range(100):
# print('yielding {}'.format(i))
# yield i
def worker_main(queue):
print("{} working".format(os.getpid()))
while True:
# Get item from queue, block until one is available
item = queue.get(True)
if item == None:
# Shutdown this worker and requeue the item so other workers can shutdown as well
queue.put(None)
break
else:
# Process item
process_line(item)
print("{} done working".format(os.getpid()))
f = get_next_line()
# Use a multiprocessing queue with maxsize
q = Queue(maxsize=5)
# Start workers to process queue items
t = Pool(processes=8, initializer=worker_main, initargs=(q,))
# Enqueue items. This blocks if the queue is full.
for l in f:
q.put(l)
# Enqueue the shutdown message (i.e. None)
q.put(None)
# We need to first close the pool before joining
t.close()
t.join()
Hannu's answer is not the best method.
I ran the code on a 100M rows CSV file. It took me forever to perform the operation.
However, prior to reading his answer, I had written the following code:
def call_processing_rows_pickably(row):
process_row(row)
import csv
from multiprocessing import Pool
import time
import datetime
def process_row(row):
row_to_be_printed = str(row)+str("hola!")
print(row_to_be_printed)
class process_csv():
def __init__(self, file_name):
self.file_name = file_name
def get_row_count(self):
with open(self.file_name) as f:
for i, l in enumerate(f):
pass
self.row_count = i
def select_chunk_size(self):
if(self.row_count>10000000):
self.chunk_size = 100000
return
if(self.row_count>5000000):
self.chunk_size = 50000
return
self.chunk_size = 10000
return
def process_rows(self):
list_de_rows = []
count = 0
with open(self.file_name, 'rb') as file:
reader = csv.reader(file)
for row in reader:
print(count+1)
list_de_rows.append(row)
if(len(list_de_rows) == self.chunk_size):
p.map(call_processing_rows_pickably, list_de_rows)
del list_de_rows[:]
def start_process(self):
self.get_row_count()
self.select_chunk_size()
self.process_rows()
initial = datetime.datetime.now()
p = Pool(4)
ob = process_csv("100M_primes.csv")
ob.start_process()
final = datetime.datetime.now()
print(final-initial)
This took 22 minutes. Obviously, I need to have more improvements. For example, the Fred library in R takes 10 minutes maximum to do this task.
The difference is: I am creating a chunk of 100k rows first, and then I pass it to a function which is mapped by threadpool(here, 4 threads).
I have created a program to generate data points of functions that I later plot. The program takes a class which defines the function, creates a data outputting object which when called generates the data to a text file. To make the whole process faster I put the jobs in threads, however when I do, the data generated is not always correct. I have attached a picture to show what I mean:
Here are some of the relevant bits of code:
from queue import Queue
import threading
import time
queueLock = threading.Lock()
workQueue = Queue(10)
def process_data(threadName, q, queue_window, done):
while not done.get():
queueLock.acquire() # check whether or not the queue is locked
if not workQueue.empty():
data = q.get()
# data is the Plot object to be run
queueLock.release()
data.parent_window = queue_window
data.process()
else:
queueLock.release()
time.sleep(1)
class WorkThread(threading.Thread):
def __init__(self, threadID, q, done):
threading.Thread.__init__(self)
self.ID = threadID
self.q = q
self.done = done
def get_qw(self, queue_window):
# gets the queue_window object
self.queue_window = queue_window
def run(self):
# this is called when thread.start() is called
print("Thread {0} started.".format(self.ID))
process_data(self.ID, self.q, self.queue_window, self.done)
print("Thread {0} finished.".format(self.ID))
class Application(Frame):
def __init__(self, etc):
self.threads = []
# does some things
def makeThreads(self):
for i in range(1, int(self.threadNum.get()) +1):
thread = WorkThread(i, workQueue, self.calcsDone)
self.threads.append(thread)
# more code which just processes the function etc, sorts out the gui stuff.
And in a separate class (as I'm using tkinter, so the actual code to get the threads to run is called in a different window) (self.parent is the Application class):
def run_jobs(self):
if self.running == False:
# threads are only initiated when jobs are to be run
self.running = True
self.parent.calcsDone.set(False)
self.parent.threads = [] # just to make sure that it is initially empty, we want new threads each time
self.parent.makeThreads()
self.threads = self.parent.threads
for thread in self.threads:
thread.get_qw(self)
thread.start()
# put the jobs in the workQueue
queueLock.acquire()
for job in self.job_queue:
workQueue.put(job)
queueLock.release()
else:
messagebox.showerror("Error", "Jobs already running")
This is all the code which relates to the threads.
I don't know why when I run the program with multiple threads some data points are incorrect, whilst running it with just 1 single thread the data is all perfect. I tried looking up "threadsafe" processes, but couldn't find anything.
Thanks in advance!