loop over a python list - python-3.x

I have a Python list of IDs that I pass to my function. There are around 200 IDs. What is the best way to call these IDs in chunks, e.g. 10 or 20 IDs at a time, with each subsequent call taking the next 20 IDs, and so on? I have used multithreading here to make it faster, but it still seems to take a lot of time. Here is the code I have so far:
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import numpy as np
import datetime as dt

df = pd.ExcelFile('ids.xlsx').parse('Sheet1')
x = []
x.append(df['external_ids'].to_list())

def download():
    # client is my python sdk
    dtest_df = client.datapoints.retrieve_dataframe(external_id=x[0], start=0, end="now", granularity='1m')
    dtest_df = dtest_df.rename(columns={'index': 'timestamp'})
    client.datapoints.insert_dataframe(dtest_df, external_id_headers=True, dropna=True)
    print(dtest_df)

with ThreadPoolExecutor(max_workers=20) as executor:
    future = executor.submit(download)
    print(future.result())
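One way to process the IDs in chunks, as the question asks, is to split the list and submit one task per chunk so the pool actually runs several downloads at once. A minimal sketch, reusing the asker's client SDK object (not defined here) and the same column name:

from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd

df = pd.ExcelFile('ids.xlsx').parse('Sheet1')
ids = df['external_ids'].to_list()

def chunks(seq, size):
    # Yield successive slices of `size` items from the list.
    for i in range(0, len(seq), size):
        yield seq[i:i + size]

def download(id_chunk):
    # `client` is the asker's SDK object (assumed, not defined in this sketch).
    frame = client.datapoints.retrieve_dataframe(
        external_id=id_chunk, start=0, end="now", granularity='1m')
    frame = frame.rename(columns={'index': 'timestamp'})
    client.datapoints.insert_dataframe(frame, external_id_headers=True, dropna=True)
    return len(id_chunk)

with ThreadPoolExecutor(max_workers=20) as executor:
    futures = [executor.submit(download, chunk) for chunk in chunks(ids, 20)]
    for future in as_completed(futures):
        print(future.result())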

Related

How to simulate a buffet with threads and locks

Hello, I am working on a simulation of a buffet where I need to use threads and locks. I created two functions: one so the consumer gets his trail, and a second one so that, once he has his trail, he can move to the next line to get his meal.
However, my code never stops running and never reaches the second function where he could get his meal.
from concurrent.futures import thread
import random
import threading
import time
import concurrent.futures
import logging
import traceback
from numpy import number

# Creating the two queues with 50 students
consumers = [x+1 for x in range(50)]
trail = []
meal = []

# putting the locks for both queues
meal_lock = threading.Lock()
trail_lock = threading.Lock()

def trail(x):
    global trail_lock
    while True:
        trail_lock.acquire()
        trail.append(x)
        if x in trail:
            print(f"Consumer {x} Got his trail")
        trail_lock.release()

def meal(x):
    global meal_lock
    while True:
        meal_lock.acquire()
        if x in trail:
            trail.remove(x)
            print("Got his meal")
            meal.append(x)
            meal_lock.release()
            break

number_of_meals = 5
number_of_trails = 5

with concurrent.futures.ThreadPoolExecutor(max_workers=number_of_trails) as executor:
    executor.map(trail, range(number_of_trails))

with concurrent.futures.ThreadPoolExecutor(max_workers=number_of_meals) as executor:
    executor.map(meal, range(1+y, number_of_meals))
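For reference, a likely reason the code never stops is that the list trail and the function trail (and likewise meal) share the same name, so the later def statements replace the lists, and the while True loops in trail() never exit. A minimal sketch of the intended two-step flow under that assumption, with renamed lists and hypothetical function names:

import threading
import concurrent.futures

trail_queue = []
meal_queue = []
trail_lock = threading.Lock()
meal_lock = threading.Lock()

def get_trail(x):
    # Each consumer grabs a trail exactly once, under the trail lock.
    with trail_lock:
        trail_queue.append(x)
        print(f"Consumer {x} got his trail")

def get_meal(x):
    # Once the consumer has a trail, move him to the meal queue.
    with trail_lock, meal_lock:
        if x in trail_queue:
            trail_queue.remove(x)
            meal_queue.append(x)
            print(f"Consumer {x} got his meal")

number_of_consumers = 5

with concurrent.futures.ThreadPoolExecutor(max_workers=number_of_consumers) as executor:
    executor.map(get_trail, range(number_of_consumers))

with concurrent.futures.ThreadPoolExecutor(max_workers=number_of_consumers) as executor:
    executor.map(get_meal, range(number_of_consumers))

print(meal_queue)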

Python - Multiprocessing passing Pandas Dataframe

I'm trying to use multiprocessing to do some web scraping, add the results to a separate DataFrame for each process, and then merge all the DataFrames together at the end to avoid locks.
I tried the code below and it ran, but the DataFrame only holds the data inside the child process, not after the process has finished running.
Am I missing something?
import pandas as pd
import multiprocessing

def add_row_to_db(database):
    database.loc[len(database.index)] = ['Sample', 'Testing']
    print('Added')
    print(f'{database}\n')

if __name__ == '__main__':
    columns_name = ['name', 'power']
    db = pd.DataFrame(columns=columns_name)

    process = multiprocessing.Process(target=add_row_to_db, args=(db,))
    process.start()
    process.join()

    print(db)
Output:
Added
     name    power
0  Sample  Testing

Empty DataFrame
Columns: [name, power]
Index: []

Process finished with exit code 0
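The child process receives a pickled copy of the DataFrame, so rows added in the worker never reach the parent. One way to get the merged result the question describes is to have each worker build and return its own DataFrame and concatenate them in the parent. A minimal sketch, with the scraping replaced by a placeholder row:

import pandas as pd
import multiprocessing

def build_rows(worker_id):
    # Each worker builds and returns its own DataFrame instead of mutating a shared one.
    return pd.DataFrame([{'name': f'Sample {worker_id}', 'power': 'Testing'}])

if __name__ == '__main__':
    with multiprocessing.Pool(processes=4) as pool:
        frames = pool.map(build_rows, range(4))
    db = pd.concat(frames, ignore_index=True)
    print(db)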

Fastest iteration over a dataframe applying a URL function

I need to request some data from a URL, inserting variable=var for each row of my dataframe. I wrote a function that iterates over each row:
import requests
from pandas import json_normalize

def df_eval(data):
    data_eval = data.copy()
    df_price = []
    for i in data_eval.index:
        var = data_eval.at[i, 'var']
        url = "http://blablabla/params&cid={}".format(var)
        r_json = requests.get(url).json()
        df = json_normalize(r_json)
        df_price.append(df['price'])
        print(df_price)
    data_eval['price_eval'] = df_price
    return data_eval
Could you suggest a faster way to do this? It currently takes about 30 minutes for 23,000 rows.
You could parallelize your calls like this:
import pandas as pd
import numpy as np
from multiprocessing import Pool

n_cores = 4  # number of worker processes

# `data` is the original DataFrame from the question.
data_split = np.array_split(data, n_cores)
pool = Pool(n_cores)
data = pd.concat(pool.map(df_eval, data_split))
pool.close()
pool.join()
Source: https://towardsdatascience.com/make-your-own-super-pandas-using-multiproc-1c04f41944a1
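Since the bottleneck here is network I/O rather than CPU, a thread pool is another option and avoids splitting the DataFrame across processes. A minimal sketch that fetches one price per row concurrently, keeping the question's placeholder URL and column names (the helper function and worker count are illustrative):

from concurrent.futures import ThreadPoolExecutor
import requests

def fetch_price(var):
    # One request per row value; return the 'price' field of the JSON response.
    r_json = requests.get("http://blablabla/params&cid={}".format(var)).json()
    return r_json.get('price')

def df_eval_threaded(data, max_workers=20):
    data_eval = data.copy()
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        data_eval['price_eval'] = list(executor.map(fetch_price, data_eval['var']))
    return data_eval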

How do I process several lists at once?

I have a big list of numbers. I want to split that big list of numbers into x number of lists and process them in parallel.
Here's the code that I have so far:
from multiprocessing import Pool
import numpy

def processNumList(numList):
    for num in numList:
        outputList.append(num ** 2)

numThreads = 5
bigNumList = list(range(50))

splitNumLists = numpy.array_split(bigNumList, numThreads)
outputList = []

for numList in splitNumLists:
    processNumList(numList)

print(outputList)
The above code does the following:
Splits a big list of numbers into the specified number of smaller lists
Passes each of those lists to the processNumList function
Prints the result list afterwards
Everything there works as expected, but it only processes one list at a time. I want every list to be processed simultaneously.
What is the proper code to do that? I experimented with pool but could never seem to get it working.
You could try something like this:
import threading

class MyThread(threading.Thread):
    def __init__(self, arg, arg2):
        super().__init__()
        self.arg = arg
        self.arg2 = arg2

    def run(self):
        # your logic to process the list goes here, using self.arg and self.arg2
        pass

# split the list as you already did, then start one thread per chunk
for _ in range(numThreads):
    MyThread(arg, arg2).start()
Here's the code I ended up using.
I used threading.Thread() to process the lists asynchronously and then called thread.join() to ensure that all of the threads were finished before moving on.
I added time.sleep for demonstration purposes (to simulate a lengthy task), but obviously you wouldn't want to use that in production code.
import numpy
import threading
import time

def process_num_list(numList):
    for num in numList:
        output_list.append(num ** 2)
        time.sleep(1)

num_threads = 5
big_num_list = list(range(30))

split_num_lists = numpy.array_split(big_num_list, num_threads)
output_list = []

threads = []
for num_list in split_num_lists:
    thread = threading.Thread(target=process_num_list, args=[num_list])
    threads.append(thread)
    thread.start()

for thread in threads:
    thread.join()

print(output_list)
As a bonus, here's a working example of five Selenium windows:
from selenium import webdriver
import numpy
import threading
import time

def scrapeSites(siteList):
    print("Preparing to scrape " + str(len(siteList)) + " sites")
    driver = webdriver.Chrome(executable_path=r"..\chromedriver.exe")
    driver.set_window_size(700, 400)
    for site in siteList:
        print("\nNow scraping " + site)
        driver.get(site)
        pageTitles.append(driver.title)
    driver.quit()

numThreads = 5
fullWebsiteList = ["https://en.wikipedia.org/wiki/Special:Random"] * 30

splitWebsiteLists = numpy.array_split(fullWebsiteList, numThreads)
pageTitles = []

threads = []
for websiteList in splitWebsiteLists:
    thread = threading.Thread(target=scrapeSites, args=[websiteList])
    threads.append(thread)
    thread.start()

for thread in threads:
    thread.join()

print(pageTitles)
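Since the original question mentions trying multiprocessing.Pool without success, here is a minimal sketch of that approach for the same task. Pool.map returns each chunk's result to the parent process, so no shared output list is needed:

from multiprocessing import Pool
import numpy

def process_num_list(num_list):
    # Return the squares instead of appending to a shared list.
    return [num ** 2 for num in num_list]

if __name__ == '__main__':
    num_workers = 5
    big_num_list = list(range(50))
    split_num_lists = numpy.array_split(big_num_list, num_workers)

    with Pool(num_workers) as pool:
        chunk_results = pool.map(process_num_list, split_num_lists)

    # Flatten the per-chunk results back into one list.
    output_list = [num for chunk in chunk_results for num in chunk]
    print(output_list)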

Customize Python Script On Azure ML

I want to use fuzzywuzzy matching in a Python script on Azure ML. I implemented it this way, but I don't get anything back.
This is my Python script code:
import pandas as pd
from fuzzywuzzy import process

def azureml_main(dataframe1 = None):
    return dataframe1,

def get_matches(query, choice, limit = 6):
    result = process.extract(query, choice, limit = limit)
    return result,

get_matches("admissibility", dataframe1)
