Python multithreading missing data

I am working on a Python script that checks whether a URL is working. The script writes the URL and response code to a log file.
To speed up the check, I am using threading and a queue.
The script works well when the number of URLs to check is small, but when it grows to hundreds, some URLs simply go missing from the log file.
Is there anything I need to fix?
My script is:
#!/usr/bin/env python
import Queue
import threading
import urllib2, urllib, sys, cx_Oracle, os
import time
from urllib2 import HTTPError, URLError

queue = Queue.Queue()
## print_queue = Queue.Queue()

class NoRedirectHandler(urllib2.HTTPRedirectHandler):
    def http_error_302(self, req, fp, code, msg, headers):
        infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        infourl.status = code
        infourl.code = code
        return infourl
    http_error_300 = http_error_302
    http_error_301 = http_error_302
    http_error_303 = http_error_302
    http_error_307 = http_error_302

class ThreadUrl(threading.Thread):
    #Threaded Url Grab
    ## def __init__(self, queue, print_queue):
    def __init__(self, queue, error_log):
        threading.Thread.__init__(self)
        self.queue = queue
        ## self.print_queue = print_queue
        self.error_log = error_log

    def do_something_with_exception(self, idx, url, error_log):
        exc_type, exc_value = sys.exc_info()[:2]
        ## self.print_queue.put([idx, url, exc_type.__name__])
        with open(error_log, 'a') as err_log_f:
            err_log_f.write("{0},{1},{2}\n".format(idx, url, exc_type.__name__))

    def openUrl(self, pair):
        try:
            idx = pair[1]
            url = 'http://' + pair[2]
            opener = urllib2.build_opener(NoRedirectHandler())
            urllib2.install_opener(opener)
            request = urllib2.Request(url)
            request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 5.1; rv:13.0) Gecko/20100101 Firefox/13.0.1')
            #open urls of hosts
            resp = urllib2.urlopen(request, timeout=10)
            ## self.print_queue.put([idx, url, resp.code])
            with open(self.error_log, 'a') as err_log_f:
                err_log_f.write("{0},{1},{2}\n".format(idx, url, resp.code))
        except:
            self.do_something_with_exception(idx, url, self.error_log)

    def run(self):
        while True:
            #grabs host from queue
            pair = self.queue.get()
            self.openUrl(pair)
            #signals to queue job is done
            self.queue.task_done()

def readUrlFromDB(queue, connect_string, column_name, table_name):
    try:
        connection = cx_Oracle.Connection(connect_string)
        cursor = cx_Oracle.Cursor(connection)
        query = 'select ' + column_name + ' from ' + table_name
        cursor.execute(query)
        #Count lines in the file
        rows = cursor.fetchall()
        total = cursor.rowcount
        #Loop through returned urls
        for row in rows:
            #print row[1], row[2]
            ## url = 'http://' + row[2]
            queue.put(row)
        cursor.close()
        connection.close()
        return total
    except cx_Oracle.DatabaseError, e:
        print e[0].context
        raise

def main():
    start = time.time()
    error_log = "D:\\chkWebsite_Error_Log.txt"
    #Check if error_log file exists
    #If exists then deletes it
    if os.path.isfile(error_log):
        os.remove(error_log)
    #spawn a pool of threads, and pass them queue instance
    for i in range(10):
        t = ThreadUrl(queue, error_log)
        t.setDaemon(True)
        t.start()
    connect_string, column_name, table_name = "user/pass#db", "*", "T_URL_TEST"
    tn = readUrlFromDB(queue, connect_string, column_name, table_name)
    #wait on the queue until everything has been processed
    queue.join()
    ## print_queue.join()
    print "Total retrived: {0}".format(tn)
    print "Elapsed Time: %s" % (time.time() - start)

main()

Python's threading module isn't truly parallel because of the Global Interpreter Lock (http://wiki.python.org/moin/GlobalInterpreterLock), so if you really want to take advantage of multiple cores you should use multiprocessing (http://docs.python.org/library/multiprocessing.html).
Also, you seem to be accessing a file from several threads at once:
with open( self.error_log, 'a') as err_log_f:
    err_log_f.write("{0},{1},{2}\n".format(idx,url,resp.code))
This is really bad, as far as I know: even though the threads aren't truly parallel, if two threads try to write to the same file at (almost) the same time the behavior tends to be undefined; imagine one thread writing while another has just closed the file.
Anyway, you would need a third queue to handle writing to the file.
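For illustration, here is a minimal sketch of that idea (the names are illustrative, not from the original script): the worker threads put their results on a second queue, and a single dedicated writer thread is the only code that touches the log file.

import threading
import Queue  # Python 2, matching the question's code

result_queue = Queue.Queue()

def writer_main(log_path):
    # Sole owner of the log file: drains the result queue and appends one line per result.
    with open(log_path, 'a') as log_f:
        while True:
            record = result_queue.get()
            if record is None:  # sentinel meaning "no more results"
                result_queue.task_done()
                break
            idx, url, status = record
            log_f.write("{0},{1},{2}\n".format(idx, url, status))
            result_queue.task_done()

writer = threading.Thread(target=writer_main, args=("chkWebsite_Error_Log.txt",))
writer.setDaemon(True)
writer.start()

# In each worker, instead of opening the file, do:
#     result_queue.put((idx, url, resp.code))
# and after queue.join() in main():
#     result_queue.put(None)
#     result_queue.join()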

At first glance this looks like a race condition, since many threads are trying to write to the log file at the same time. See this question for some pointers on how to lock a file for writing (so only one thread can access it at a time).
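A smaller hedged alternative along those lines is to share one threading.Lock (and a single open file handle) among the worker threads so only one of them writes at a time; the file name and helper function below are just for illustration.

import threading

log_lock = threading.Lock()
log_f = open("chkWebsite_Error_Log.txt", "a")  # opened once, shared by all threads

def log_result(idx, url, status):
    # Serialize writes: only one thread appends to the log at a time.
    with log_lock:
        log_f.write("{0},{1},{2}\n".format(idx, url, status))
        log_f.flush()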

Related

How do I create an IMAP checker that reads from a CSV file and loops over each line using multithreading?

The script only checks the first 10 lines of the CSV file; I want it to iterate over all the lines of the file using threads to speed up the process.
Code:
import time
import csv
import imaplib
from threading import Thread

combo = []
FileToOpen = open("emails.csv", "r")
csvDictReader = csv.DictReader(FileToOpen)
successEmail = open("SuccessEmails.txt", "a")
for email in csvDictReader:
    combo.append(email)
rows_count = len(list(csvDictReader))
t1 = time.perf_counter()
combo_new = combo

def ConnectorImap(combo_new):
    for Email in combo_new:
        login = Email['login']
        password = Email['password']
        imap_serv = "imap." + login.split('#')[-1]
        mail_serv = "mail." + login.split('#')[-1]
        try:
            print(login, password, imap_serv)
            print('logging in as %s' % login)
            # create an IMAP4 class with SSL
            imap_ssl = imaplib.IMAP4_SSL(imap_serv)
            resp_code, response = imap_ssl.login(login, password)
            print(resp_code)
            if resp_code == "OK":
                successEmail.write(login + ',' + password)
                successEmail.write("\n")
            imap_ssl.logout()
        except Exception as e:
            print(e)
            pass

threads = []
for idx, line in enumerate(rows_count):
    # We start one thread per url present.
    process = Thread(target=ConnectorImap, args=(combo_new))
    process.start()
    threads.append(process)
for process in threads:
    process.join()
t2 = time.perf_counter()
print(f'finished in {t2 - t1} seconds')
I am new to Python, so any help is appreciated! I want the script to iterate over all the lines of the file using threads to speed up the process.
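As a starting point, here is a hedged sketch (not a drop-in replacement for the code above) that splits the rows into one chunk per thread, so every line of the CSV is handled exactly once; it assumes the same 'login' and 'password' columns as the question.

import csv
import threading

NUM_THREADS = 10

def check_chunk(chunk):
    # Each thread gets its own slice of rows and works through it.
    for email in chunk:
        login, password = email['login'], email['password']
        print("checking", login)  # the IMAP login attempt would go here, as in ConnectorImap()

with open("emails.csv", newline="") as f:
    rows = list(csv.DictReader(f))

chunks = [rows[i::NUM_THREADS] for i in range(NUM_THREADS)]  # round-robin split
threads = [threading.Thread(target=check_chunk, args=(chunk,)) for chunk in chunks]
for t in threads:
    t.start()
for t in threads:
    t.join()

If all threads append to SuccessEmails.txt, that write should also be protected with a threading.Lock.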

Asyncio: big list of Tasks, each sequentially combining run_in_executor and a standard coroutine

I need to handle a list of 2500 IP addresses from a CSV file, so I need to create_task from a coroutine 2500 times. Inside every coroutine, I first need to quickly check whether IP:PORT is reachable via the synchronous "socket" module, which is why that call goes through loop.run_in_executor(). Second, if IP:PORT is open, I need to connect to it via asyncssh.connect() to run some bash commands, and that part is a standard asyncio coroutine. Then I need to collect the results of those bash commands into another CSV file.
Additionally, there is an issue on Linux: the system cannot open more than 1024 connections at the same time. I think it may be solved by splitting the work into lists of 1000 with asyncio.sleep(1) between them, or something like that.
I expected my tasks to be executed 1000 at a time per second, but only about 20 run per second. Why?
A small working code snippet with comments:
#!/usr/bin/env python3
import asyncio
import csv
import time
from pathlib import Path
import asyncssh
import socket
from concurrent.futures import ThreadPoolExecutor as Executor

PARALLEL_SESSIONS_COUNT = 1000
LEASES_ALL = Path("ip_list.csv")
PORT = 22
TIMEOUT = 1
USER = "testuser1"
PASSWORD = "123"

def is_open(ip, port, timeout):
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.settimeout(timeout)
    try:
        s.connect((ip, int(port)))
        s.shutdown(socket.SHUT_RDWR)
        return {"result": True, "error": "NoErr"}
    except Exception as ex:
        return {"result": False, "error": str(ex)}
    finally:
        s.close()

def get_leases_list():
    # Minimal csv content:
    # header must contain "IPAddress"
    # every other line is a concrete ip-address.
    result = []
    with open(LEASES_ALL, newline="") as csvfile_1:
        reader_1 = csv.DictReader(csvfile_1)
        result = list(reader_1)
    return result

def split_list(some_list, sublist_count):
    result = []
    while len(some_list) > sublist_count:
        result.append(some_list[:sublist_count])
        some_list = some_list[sublist_count:]
    result.append(some_list)
    return result

async def do_single_host(one_lease_dict):  # Function for each Task
    # Firstly
    IP = one_lease_dict["IPAddress"]
    loop = asyncio.get_event_loop()
    socket_check = await loop.run_in_executor(None, is_open, IP, PORT, TIMEOUT)
    print(socket_check, IP)
    # Secondly
    if socket_check["result"] == True:
        async with asyncssh.connect(host=IP, port=PORT, username=USER, password=PASSWORD, known_hosts=None) as conn:
            result = await conn.run("uname -r", check=True)
            print(result.stdout, end="")  # Just print without write in file at this point.

def aio_root():
    leases_list = get_leases_list()
    list_of_lists = split_list(leases_list, PARALLEL_SESSIONS_COUNT)
    r = []
    loop = asyncio.get_event_loop()
    for i in list_of_lists:
        for j in i:
            task = loop.create_task(do_single_host(j))
            r.append(task)
    group = asyncio.wait(r)
    loop.run_until_complete(group)  # At this line execute only by 20 in 1sec. Can't understand why :(
    loop.close()

def main():
    aio_root()

if __name__ == '__main__':
    main()
loop.run_in_executor signature:
awaitable loop.run_in_executor(executor, func, *args)
The default ThreadPoolExecutor is used if executor is None.
ThreadPoolExecutor document:
Changed in version 3.5: If max_workers is None or not given, it will default to the number of processors on the machine, multiplied by 5, assuming that ThreadPoolExecutor is often used to overlap I/O instead of CPU work and the number of workers should be higher than the number of workers for ProcessPoolExecutor.
Changed in version 3.8: Default value of max_workers is changed to min(32, os.cpu_count() + 4). This default value preserves at least 5 workers for I/O bound tasks. It utilizes at most 32 CPU cores for CPU bound tasks which release the GIL. And it avoids using very large resources implicitly on many-core machines.
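To illustrate the point about the default executor, here is a hedged sketch (not the asker's final code) that passes an explicitly sized ThreadPoolExecutor to run_in_executor, so the blocking socket checks are not capped by the small default worker pool; the host list, port, and worker count are placeholders.

import asyncio
import socket
from concurrent.futures import ThreadPoolExecutor

def is_open(ip, port, timeout=1):
    # Blocking reachability check, playing the same role as in the question.
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.settimeout(timeout)
    try:
        s.connect((ip, port))
        return True
    except OSError:
        return False
    finally:
        s.close()

async def check_host(loop, executor, ip):
    # The explicit executor controls how many blocking checks run concurrently.
    return ip, await loop.run_in_executor(executor, is_open, ip, 22)

async def check_all(ips, workers=1000):
    loop = asyncio.get_event_loop()
    with ThreadPoolExecutor(max_workers=workers) as executor:
        return await asyncio.gather(*(check_host(loop, executor, ip) for ip in ips))

# results = asyncio.get_event_loop().run_until_complete(check_all(["10.0.0.1", "10.0.0.2"]))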

Processing huge CSV file using Python and multithreading

I have a function that yields lines from a huge CSV file lazily:
def get_next_line():
    with open(sample_csv, 'r') as f:
        for line in f:
            yield line

def do_long_operation(row):
    print('Do some operation that takes a long time')
I need to use threads so that for each record I get from the above function, I can call do_long_operation.
Most places on the Internet have examples like the following, and I am not sure whether I am on the right path:
import threading

thread_list = []
for i in range(8):
    t = threading.Thread(target=do_long_operation, args=(get_next_row from get_next_line))
    thread_list.append(t)

for thread in thread_list:
    thread.start()

for thread in thread_list:
    thread.join()
My questions are:
How do I start only a finite number of threads, say 8?
How do I make sure that each of the threads will get a row from get_next_line?
You could use a thread pool from multiprocessing and map your tasks to a pool of workers:
from multiprocessing.pool import ThreadPool as Pool
# from multiprocessing import Pool
from random import randint
from time import sleep

def process_line(l):
    print l, "started"
    sleep(randint(0, 3))
    print l, "done"

def get_next_line():
    with open("sample.csv", 'r') as f:
        for line in f:
            yield line

f = get_next_line()
t = Pool(processes=8)
for i in f:
    t.map(process_line, (i,))
t.close()
t.join()
This will create eight workers and submit your lines to them, one by one. As soon as a process is "free", it will be allocated a new task.
There is a commented out import statement, too. If you comment out the ThreadPool and import Pool from multiprocessing instead, you will get subprocesses instead of threads, which may be more efficient in your case.
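If you would rather avoid calling map() once per line, a hedged variant of the same idea (not the answerer's exact code) is to hand the generator straight to imap_unordered and let the pool pull lines as workers become free:

from multiprocessing.pool import ThreadPool

def process_line(line):
    return line.strip().upper()  # placeholder for the real per-line work

def get_next_line():
    with open("sample.csv") as f:
        for line in f:
            yield line

pool = ThreadPool(processes=8)
for result in pool.imap_unordered(process_line, get_next_line(), chunksize=16):
    print(result)
pool.close()
pool.join()

Because imap_unordered consumes the generator lazily, the whole file never has to be held in memory.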
Using a Pool/ThreadPool from multiprocessing to map tasks to a pool of workers and a Queue to control how many tasks are held in memory (so we don't read too far ahead into the huge CSV file if worker processes are slow):
from multiprocessing.pool import ThreadPool as Pool
# from multiprocessing import Pool
from random import randint
import time, os
from multiprocessing import Queue

def process_line(l):
    print("{} started".format(l))
    time.sleep(randint(0, 3))
    print("{} done".format(l))

def get_next_line():
    with open(sample_csv, 'r') as f:
        for line in f:
            yield line

# use for testing
# def get_next_line():
#     for i in range(100):
#         print('yielding {}'.format(i))
#         yield i

def worker_main(queue):
    print("{} working".format(os.getpid()))
    while True:
        # Get item from queue, block until one is available
        item = queue.get(True)
        if item == None:
            # Shutdown this worker and requeue the item so other workers can shutdown as well
            queue.put(None)
            break
        else:
            # Process item
            process_line(item)
    print("{} done working".format(os.getpid()))

f = get_next_line()

# Use a multiprocessing queue with maxsize
q = Queue(maxsize=5)

# Start workers to process queue items
t = Pool(processes=8, initializer=worker_main, initargs=(q,))

# Enqueue items. This blocks if the queue is full.
for l in f:
    q.put(l)

# Enqueue the shutdown message (i.e. None)
q.put(None)

# We need to first close the pool before joining
t.close()
t.join()
Hannu's answer is not the best method.
I ran that code on a CSV file with 100M rows, and it took forever to complete.
However, prior to reading his answer, I had written the following code:
def call_processing_rows_pickably(row):
    process_row(row)

import csv
from multiprocessing import Pool
import time
import datetime

def process_row(row):
    row_to_be_printed = str(row) + str("hola!")
    print(row_to_be_printed)

class process_csv():
    def __init__(self, file_name):
        self.file_name = file_name

    def get_row_count(self):
        with open(self.file_name) as f:
            for i, l in enumerate(f):
                pass
        self.row_count = i

    def select_chunk_size(self):
        if(self.row_count > 10000000):
            self.chunk_size = 100000
            return
        if(self.row_count > 5000000):
            self.chunk_size = 50000
            return
        self.chunk_size = 10000
        return

    def process_rows(self):
        list_de_rows = []
        count = 0
        with open(self.file_name, 'rb') as file:
            reader = csv.reader(file)
            for row in reader:
                print(count + 1)
                list_de_rows.append(row)
                if(len(list_de_rows) == self.chunk_size):
                    p.map(call_processing_rows_pickably, list_de_rows)
                    del list_de_rows[:]

    def start_process(self):
        self.get_row_count()
        self.select_chunk_size()
        self.process_rows()

initial = datetime.datetime.now()
p = Pool(4)
ob = process_csv("100M_primes.csv")
ob.start_process()
final = datetime.datetime.now()
print(final - initial)
This took 22 minutes. Obviously, I still need to make improvements; for example, the Fred library in R takes at most 10 minutes for this task.
The difference is that I am building a chunk of 100k rows first and then passing each chunk to a function that is mapped by the thread pool (here, 4 threads). A rough sketch of that chunking idea follows.
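For comparison, here is a hedged sketch of the chunking approach that streams the file with itertools.islice instead of counting the rows first; the file name, chunk size, and per-row work are placeholders.

import csv
from itertools import islice
from multiprocessing import Pool

CHUNK_SIZE = 100000

def process_row(row):
    return len(row)  # placeholder for the real per-row work

def chunks(reader, size):
    # Yield successive lists of `size` rows until the file is exhausted.
    while True:
        chunk = list(islice(reader, size))
        if not chunk:
            break
        yield chunk

if __name__ == '__main__':
    with open("100M_primes.csv", newline="") as f:
        with Pool(4) as pool:
            for chunk in chunks(csv.reader(f), CHUNK_SIZE):
                pool.map(process_row, chunk)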

Python Tweepy streaming with multitasking

In Python 2.7, I am successfully using the following code to listen to a direct message stream on an account:
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy import API
from tweepy.streaming import StreamListener

# These values are appropriately filled in the code
consumer_key = '######'
consumer_secret = '######'
access_token = '######'
access_token_secret = '######'

class StdOutListener( StreamListener ):
    def __init__( self ):
        self.tweetCount = 0

    def on_connect( self ):
        print("Connection established!!")

    def on_disconnect( self, notice ):
        print("Connection lost!! : ", notice)

    def on_data( self, status ):
        print("Entered on_data()")
        print(status, flush = True)
        return True

    # I can add code here to execute when a message is received, such as slicing the message and activating something else

    def on_direct_message( self, status ):
        print("Entered on_direct_message()")
        try:
            print(status, flush = True)
            return True
        except BaseException as e:
            print("Failed on_direct_message()", str(e))

    def on_error( self, status ):
        print(status)

def main():
    try:
        auth = OAuthHandler(consumer_key, consumer_secret)
        auth.secure = True
        auth.set_access_token(access_token, access_token_secret)
        api = API(auth)

        # If the authentication was successful, you should
        # see the name of the account print out
        print(api.me().name)

        stream = Stream(auth, StdOutListener())
        stream.userstream()
    except BaseException as e:
        print("Error in main()", e)

if __name__ == '__main__':
    main()
This is great, and I can also execute code when I receive a message, but the jobs I'm adding to a work queue need to be able to stop after a certain amount of time. I'm using the usual start = time.time() and subtracting the current time to determine elapsed time, but this streaming code does not loop to check the time. It just waits for a new message, so the clock is never checked, so to speak.
My question is this: How can I get streaming to occur and still track time elapsed? Do I need to use multithreading as described in this article? http://www.tutorialspoint.com/python/python_multithreading.htm
I am new to Python and having fun playing around with hardware attached to a Raspberry Pi. I have learned so much from Stackoverflow, thank you all :)
I'm not sure exactly how you want to decide when to stop, but you can pass a timeout argument to the stream to give up after a certain delay.
stream = Stream(auth, StdOutListener(), timeout=30)
That will call your listener's on_timeout() method. If you return True, it will continue streaming; otherwise it will stop.
Between the stream's timeout argument and your listener's on_timeout(), you should be able to decide when to stop streaming.
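As a hedged illustration (assuming the older tweepy StreamListener API used in the question), a listener can combine the stream's timeout argument with its own wall-clock deadline; the class name and the 60-second limit are just examples.

import time
from tweepy import OAuthHandler, Stream
from tweepy.streaming import StreamListener

class TimedListener(StreamListener):
    def __init__(self, max_seconds=60):
        super(TimedListener, self).__init__()
        self.deadline = time.time() + max_seconds

    def on_data(self, data):
        print(data)
        # Returning False tells tweepy to disconnect the stream.
        return time.time() < self.deadline

    def on_timeout(self):
        # Called when the connection times out; keep streaming only before the deadline.
        return time.time() < self.deadline

# auth = OAuthHandler(consumer_key, consumer_secret)  # filled in as in the question
# stream = Stream(auth, TimedListener(max_seconds=60), timeout=30)
# stream.userstream()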
I found I was able to get some multithreading code working the way I wanted. Unlike the Tutorialspoint tutorial, which launches multiple instances of the same code with varying timing parameters, I was able to get two different blocks of code to run in their own threads:
One block of code constantly adds 10 to a global variable (var).
Another block checks when 5 seconds elapses then prints var's value.
This demonstrates 2 different tasks executing and sharing data using Python multithreading.
See code below
import threading
import time

exitFlag = 0
var = 10

class myThread1 (threading.Thread):
    def __init__(self, threadID, name, counter):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.counter = counter

    def run(self):
        #var counting block begins here
        print "addemup starting"
        global var
        while (var < 100000):
            if var > 90000:
                var = 0
            var = var + 10

class myThread2 (threading.Thread):
    def __init__(self, threadID, name, counter):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.counter = counter

    def run(self):
        #time checking block begins here and prints var every 5 secs
        print "checkem starting"
        global var
        start = time.time()
        elapsed = time.time() - start
        while (elapsed < 10):
            elapsed = time.time() - start
            if elapsed > 5:
                print "var = ", var
                start = time.time()
                elapsed = time.time() - start

# Create new threads
thread1 = myThread1(1, "Thread-1", 1)
thread2 = myThread2(2, "Thread-2", 2)

# Start new Threads
thread1.start()
thread2.start()

print "Exiting Main Thread"
My next task will be breaking my Twitter streaming out into its own thread and passing received direct messages as variables to a task-queueing program, while the first thread hopefully continues to listen for more direct messages; a rough sketch of that pattern follows.
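That pattern can be prototyped without tweepy at all. The sketch below (illustrative names only, Python 2 style to match the code above) runs a blocking listener in a daemon thread and pushes messages onto a Queue, while the main thread consumes them and still gets to watch the clock.

import Queue
import threading
import time

messages = Queue.Queue()

def listen_forever():
    # Stand-in for the blocking streaming loop; here it just emits one message per second.
    n = 0
    while True:
        time.sleep(1)
        n += 1
        messages.put("direct message %d" % n)

listener = threading.Thread(target=listen_forever)
listener.setDaemon(True)
listener.start()

start = time.time()
while time.time() - start < 10:  # the main thread keeps track of elapsed time
    try:
        msg = messages.get(timeout=1)  # wake up at least once a second to check the clock
    except Queue.Empty:
        continue
    print "got:", msg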

python 3 multithreading output to CSV file (blank)

I am new to Python; I got this multithreading approach working from a tutorial I ran across.
I'm unsure whether it is good practice or not.
What I want to achieve:
ping the list of hostnames and return up or down
write the results to a CSV file
What this file currently does:
pings the list of hostnames and returns up or down
The CSV file it creates is empty; no results appear to be written to it.
I have done some testing and found that for the pings, multithreading is roughly 16 times faster than serial code for me.
I am doing a massive number of pings (approximately 9000) and want the results back as soon as possible.
Can you please let me know where I have gone wrong with the CSV part?
import threading
from queue import Queue
import time
import subprocess as sp
import csv

# lock to serialize console output
lock = threading.Lock()

def do_work(item):
    #time.sleep(1) # pretend to do some lengthy work.
    # Make sure the whole print completes or threads can mix up output in one line.
    status, result = sp.getstatusoutput("ping -n 3 " + str(item))
    if status == 0:
        result = 'Up'
    else:
        result = 'Down'
    with lock:
        output.writerow({'hostname': item, 'status': result})
        array.append({'hostname': item, 'status': result})
        print(threading.current_thread().name, item, result)

# The worker thread pulls an item from the queue and processes it
def worker():
    while True:
        item = q.get()
        do_work(item)
        q.task_done()

# Create the queue and thread pool.
q = Queue()
for i in range(100):
    t = threading.Thread(target=worker)
    t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
    t.start()

array = []

# stuff work items on the queue (in this case, just a number).
start = time.perf_counter()
headers = ['status', 'hostname']
output = csv.DictWriter(open('host-export.csv', 'w'), delimiter=',', lineterminator='\n', fieldnames=headers)
output.writeheader()
txt = open("hosts.txt", 'r', encoding="utf8")
for line in txt:
    q.put(line, array)

q.join()  # block until all tasks are done

# "Work" took .1 seconds per task.
# 20 tasks serially would be 2 seconds.
# With 4 threads should be about .5 seconds (contrived because non-CPU intensive "work")
print(array)
print('time:', time.perf_counter() - start)
I also added bulk writing for the CSV, thinking maybe I just couldn't access the csv object inside the function, but that didn't work either, as shown below.
headers = ['status','hostname']
output = csv.DictWriter(open('host-export.csv','w'), delimiter=',', lineterminator='\n', fieldnames=headers)
output.writeheader()
output.writerows(array)
I figured out what I had done wrong.
I didn't close the file handle, so nothing was written to the file.
Here is the code I am using now to write my CSV file:
fieldnames = ['ip', 'dns', 'pings']  # headings
test_file = open('test2-p.csv', 'w', newline='')  # open file
csvwriter = csv.DictWriter(test_file, delimiter=',', fieldnames=fieldnames)  # set csv writing settings
csvwriter.writeheader()  # write csv headings
for row in rows:  # write to csv file
    csvwriter.writerow(row)
test_file.close()
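An equivalent hedged sketch using a with statement lets Python close the file for you, so the rows are always flushed to disk even if an exception is raised; the example data here is made up.

import csv

fieldnames = ['ip', 'dns', 'pings']
rows = [{'ip': '10.0.0.1', 'dns': 'host1', 'pings': 'Up'}]  # placeholder rows

with open('test2-p.csv', 'w', newline='') as test_file:
    csvwriter = csv.DictWriter(test_file, delimiter=',', fieldnames=fieldnames)
    csvwriter.writeheader()
    csvwriter.writerows(rows)
# the file is closed (and flushed) automatically here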
