I am feeding a long list of inputs to a function that calls an API to retrieve data. My list contains around 40,000 unique inputs. Currently, the function returns one output every 1-2 seconds or so; quick maths tells me it would take over 10 hours before my function is done. I therefore want to speed this process up, but I am struggling to find a solution. I am quite a beginner, so threading/pooling is quite difficult for me. I hope someone is able to help me out here.
The function:
import quandl
import datetime
import numpy as np

quandl.ApiConfig.api_key = 'API key here'

def get_data(issue_date, stock_ticker):
    # Prepare var
    stock_ticker = "EOD/" + stock_ticker
    # Volatility
    date_1 = datetime.datetime.strptime(issue_date, "%d/%m/%Y")
    pricing_date = date_1 + datetime.timedelta(days=-40)  # -40 days of issue date
    volatility_date = date_1 + datetime.timedelta(days=-240)  # -240 days of issue date (-40, -240 range)
    # Check if code exists: if not -> return empty array
    try:
        stock = quandl.get(stock_ticker, start_date=volatility_date, end_date=pricing_date)  # get pricing data
    except quandl.errors.quandl_error.NotFoundError:
        return []
    daily_close = stock['Adj_Close'].pct_change()  # returns using adj. close
    stock_vola = np.std(daily_close) * np.sqrt(252)  # annualized volatility
    # Average price
    stock_pricing_date = date_1 + datetime.timedelta(days=-2)  # -2 days of issue date
    stock_pricing_date2 = date_1 + datetime.timedelta(days=-12)  # -12 days of issue date
    stock_price = quandl.get(stock_ticker, start_date=stock_pricing_date2, end_date=stock_pricing_date)
    stock_price_average = np.mean(stock_price['Adj_Close'])  # get average price
    # Amihud's liquidity measure
    liquidity_pricing_date = date_1 + datetime.timedelta(days=-20)
    liquidity_pricing_date2 = date_1 + datetime.timedelta(days=-120)
    stock_data = quandl.get(stock_ticker, start_date=liquidity_pricing_date2, end_date=liquidity_pricing_date)
    p = np.array(stock_data['Adj_Close'])
    returns = np.array(stock_data['Adj_Close'].pct_change())
    dollar_volume = np.array(stock_data['Adj_Volume'] * p)
    illiq = np.divide(returns, dollar_volume)
    print(np.nanmean(illiq))
    illiquidity_measure = np.nanmean(illiq, dtype=float) * (10 ** 6)  # multiply by 10^6 for expositional purposes
    return [stock_vola, stock_price_average, illiquidity_measure]
I then use a separate script to select my CSV file, which contains one row per input, each row holding the issue_date and stock_ticker:
import function
import csv
import tkinter as tk
from tkinter import filedialog

# Open file dialog
root = tk.Tk()
root.withdraw()
file_path = filedialog.askopenfilename()

# Load spreadsheet data
f = open(file_path)
csv_f = csv.reader(f)
next(csv_f)
result_data = []

# Iterate
for row in csv_f:
    try:
        return_data = function.get_data(row[1], row[0])
        if len(return_data) != 0:
            # print(return_data)
            result_data_loc = [row[1], row[0]]
            result_data_loc.extend(return_data)
            result_data.append(result_data_loc)
    except AttributeError:
        print(row[0])
        print('\n\n')
        print(row[1])
        continue

if result_data:  # an empty list is falsy, so "No results found!" can actually trigger
    with open('results.csv', mode='w', newline='') as result_file:
        csv_writer = csv.writer(result_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for result in result_data:
            # print(result)
            csv_writer.writerow(result)
else:
    print("No results found!")
It is quite messy, but like I mentioned before, I am definitely a beginner. Speeding this up would greatly help me.
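Since each get_data call spends most of its time waiting on the network, a thread pool is a natural fit. Below is a minimal sketch using concurrent.futures, wired into the second script's loop; the max_workers value is an illustrative assumption, and Quandl's rate limits may force it lower:

from concurrent.futures import ThreadPoolExecutor, as_completed
import csv
import function

# file_path comes from the file dialog above
with open(file_path) as f:
    csv_f = csv.reader(f)
    next(csv_f)  # skip header
    rows = list(csv_f)

def worker(row):
    # row[0] = stock_ticker, row[1] = issue_date, as in the loop above
    return row, function.get_data(row[1], row[0])

result_data = []
with ThreadPoolExecutor(max_workers=10) as pool:  # assumed worker count
    futures = [pool.submit(worker, row) for row in rows]
    for fut in as_completed(futures):
        row, return_data = fut.result()
        if return_data:  # skip tickers that returned []
            result_data.append([row[1], row[0]] + return_data)

With 10 workers, the 10+ hour sequential run should shrink roughly tenfold, network and API limits permitting.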
So, I'm trying to generate some fake random data of a given dimension size. Essentially, I want a dataframe in which the data has a uniform random distribution. The data consists of both continuous and categorical values. I've written the following code, but it doesn't work the way I want it to.
import random
import pandas as pd
import time
from datetime import datetime

# declare global variables
adv_name = ['soft toys', 'kitchenware', 'electronics',
            'mobile phones', 'laptops']
adv_loc = ['location_1', 'location_2', 'location_3',
           'location_4', 'location_5']
adv_prod = ['baby product', 'kitchenware', 'electronics',
            'mobile phones', 'laptops']
adv_size = [1, 2, 3, 4, 10]
adv_layout = ['static', 'dynamic']  # advertisement layout type on website
# adv_date, start_time, end_time = []
num = 10  # the given dimension

# define function to generate random advert locations
def rand_shuf_loc(str_lst, num):
    lst = adv_loc
    # using list comprehension
    rand_shuf_str = [item for item in lst for i in range(num)]
    return rand_shuf_str

# define function to generate random advert names
def rand_shuf_prod(loc_list, num):
    rand_shuf_str = [item for item in loc_list for i in range(num)]
    random.shuffle(rand_shuf_str)
    return rand_shuf_str

# define function to generate random impression and click data
def rand_clic_impr(num):
    rand_impr_lst = []
    click_lst = []
    for i in range(num):
        rand_impr_lst.append(random.randint(0, 100))
        click_lst.append(random.randint(0, 100))
    return {'rand_impr_lst': rand_impr_lst, 'rand_click_lst': click_lst}

# define function to generate random product price and discount
def rand_prod_price_discount(num):
    prod_price_lst = []   # advertised product price
    prod_discnt_lst = []  # advertised product discount
    for i in range(num):
        prod_price_lst.append(random.randint(10, 100))
        prod_discnt_lst.append(random.randint(10, 100))
    return {'prod_price_lst': prod_price_lst, 'prod_discnt_lst': prod_discnt_lst}

def rand_prod_click_timestamp(stime, etime, num):
    prod_clik_tmstmp = []
    frmt = '%d-%m-%Y %H:%M:%S'
    for i in range(num):
        rtime = int(random.random() * 86400)
        hours = int(rtime / 3600)
        minutes = int((rtime - hours * 3600) / 60)
        seconds = rtime - hours * 3600 - minutes * 60
        time_string = '%02d:%02d:%02d' % (hours, minutes, seconds)
        prod_clik_tmstmp.append(time_string)
    time_stmp = [item for item in prod_clik_tmstmp for i in range(num)]
    return {'prod_clik_tmstmp_lst': time_stmp}

def main():
    print('generating data...')
    # print('generating random geographic coordinates...')
    # get the impressions and click data
    impression = rand_clic_impr(num)
    clicks = rand_clic_impr(num)
    product_price = rand_prod_price_discount(num)
    product_discount = rand_prod_price_discount(num)
    prod_clik_tmstmp = rand_prod_click_timestamp("20-01-2018 13:30:00",
                                                 "23-01-2018 04:50:34", num)
    lst_dict = {"ad_loc": rand_shuf_loc(adv_loc, num),
                "prod": rand_shuf_prod(adv_prod, num),
                "imprsn": impression['rand_impr_lst'],
                "cliks": clicks['rand_click_lst'],
                "prod_price": product_price['prod_price_lst'],
                "prod_discnt": product_discount['prod_discnt_lst'],
                "prod_clik_stmp": prod_clik_tmstmp['prod_clik_tmstmp_lst']}
    fake_data = pd.DataFrame.from_dict(lst_dict, orient="index")
    res = fake_data.apply(lambda x: x.fillna(0)
                          if x.dtype.kind in 'biufc'
                          # where 'biufc' means boolean, integer,
                          # unicode, float & complex data types
                          else x.fillna(random.randint(0, 100)))
    print(res.transpose())
    res.to_csv("fake_data.csv", sep=",")

# invoke the main function
if __name__ == "__main__":
    main()
Problem 1
When I execute the above code snippet, it prints fine, but when written to CSV the data is positioned horizontally; i.e., it looks like this... How do I position it vertically when writing to the CSV file? What I want is 7 columns (see the lst_dict variable above) with n number of rows.
Problem 2
I don't understand why the random date is generated for only the first 50 columns while the remaining columns are filled with numerical values.
To answer your first question, replace
print(res.transpose())
with
res = res.transpose()
print(res)
so that the transposed frame (7 columns, n rows) is also what res.to_csv writes.
To answer your second question, look at the length of the output of the method
rand_shuf_loc()
It, as well as the other helper functions, only produces a list of 50 items.
The creation of res using the method
fake_data.apply
replaces all NaN values with a random numeric, so it also applies a numeric to the columns without any predefined values.
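For Problem 1 there is also the option of building the frame column-wise from equal-length lists, which comes out vertical without any transpose. A minimal sketch with illustrative stand-in data (not the helper functions above, which return lists of differing lengths):

import random
import pandas as pd

num = 10
lst_dict = {"imprsn": [random.randint(0, 100) for _ in range(num)],
            "cliks": [random.randint(0, 100) for _ in range(num)],
            "prod_price": [random.randint(10, 100) for _ in range(num)]}
fake_data = pd.DataFrame(lst_dict)  # default orientation: keys become columns
fake_data.to_csv("fake_data.csv", sep=",", index=False)  # num rows, one column per key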
import pandas as pd
from datetime import datetime
import os

# get username
user = os.getlogin()

def file_process():
    data = pd.read_excel('C:\\Users\\' + user + '\\My Documents\\XINVST.xls')
    # Change the date and time formatting
    data["INVDAT"] = data["INVDAT"].apply(lambda x: datetime.combine(x, datetime.min.time()))
    data["INVDAT"] = data["INVDAT"].dt.strftime("%m-%d-%Y")
    print(data)
    # output to new file
    # new_data = data
    # new_data.to_excel('C:\\Users\\' + user + '\\Desktop\\XINVST.xls', index=None)

if __name__ == '__main__':
    file_process()
I'm trying to format the INVDAT column into a correct date format like 11/25/19. I've tried multiple solutions but keep running into errors like this one: TypeError: combine() argument 1 must be datetime.date, not int. I then tried to convert the integer to a date type, but that errors as well.
Or you can simply use df["INVDAT"] = pd.to_datetime(df["INVDAT"], format="%m/%d/%y"); in this case you don't need the datetime package. For further information you should look at the docs.
data['INVDAT'] = data['INVDAT'].astype('str')
data["INVDAT"] = pd.to_datetime(data["INVDAT"])
data["INVDAT"] = data["INVDAT"].dt.strftime("%m/%d/%Y")
This solution works, but if the date representation has a single-digit month, like 12519 (expected output 1/25/19), it fails. I tried using a conditional to add a 0 to the front if len() < 6, but it gives me an error that the dtype is int64.
import pandas as pd
import os

# get username
user = os.getlogin()

def file_process():
    data = pd.read_excel('C:\\Users\\' + user + '\\My Documents\\XINVST.xls')
    # Change the date and time formatting
    data['INVDAT'] = data['INVDAT'].astype('str')
    length = len(data['INVDAT'])
    data['INVDAT'].pop(length - 1)
    for i in data['INVDAT'].str.len():
        if i <= 5:
            data['INVDAT'] = data['INVDAT'].apply(lambda x: '{0:0>6}'.format(x))
            length = len(data['INVDAT'])
            data['INVDAT'].pop(length - 1)
            data["INVDAT"] = pd.to_datetime(data["INVDAT"])
            data["INVDAT"] = data["INVDAT"].dt.strftime("%m/%d/%Y")
        else:
            data["INVDAT"] = pd.to_datetime(data["INVDAT"])
            data["INVDAT"] = data["INVDAT"].dt.strftime("%m/%d/%Y")
    # output to new file
    new_data = data
    new_data.to_excel('C:\\Users\\' + user + '\\Desktop\\XINVST.xls', index=None)

if __name__ == '__main__':
    file_process()
This is the solution; it's sloppy, but it works.
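For reference, the same zero-padding idea can be written more compactly; a sketch, assuming INVDAT holds mddyy/mmddyy-style integers:

import pandas as pd

# Pad every value to six digits, then parse as month-day-year.
data['INVDAT'] = data['INVDAT'].astype(str).str.zfill(6)
data['INVDAT'] = pd.to_datetime(data['INVDAT'], format='%m%d%y').dt.strftime('%m/%d/%Y')

Here str.zfill(6) turns 12519 into "012519", which %m%d%y then reads as 01/25/19.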
I am trying to find parameter estimates using minimization. The code I wrote works, but there are two problems:
It finds only a local minimum. I tried to solve this by using basinhopping.
It takes very long until I get a result, and since I have to do this minimization around 1000 times, this becomes a big issue.
So my questions are:
Do you know how I could optimize my code so that it runs faster for the minimization?
Is there a way I can change the basinhopping part so that it runs faster? E.g. set niter lower, or a different method I'm not aware of. I tried running it like that, and after 10 hours I didn't get a result for even one of the 1000 individuals with basinhopping.
Is there another way to find a global minimum?
Please feel free to ask further questions.
My code:
import numpy as np
from scipy.optimize import minimize
from scipy.optimize import basinhopping
from scipy.integrate import odeint
import pickle
import os
import pandas as pd
import datetime
import numpy.random as npr
import csv

path = "C:\\Users\\Sebastian Gäumann\\OneDrive\\Dokumente\\FS 2017\\Bachelorarbeit\\Python"
os.chdir(path)

### IDs
df = pd.read_csv('1_Youtuber_SingleNrSheet_Comedy.csv', sep=";", skipinitialspace=True)  ###### Change name
YoutuberID = df["Channel_ID"].tolist()
## print(YoutuberID)

with open("9_p_q_m_Fun_ExtendedBass_VIEWS_Comedy_test.csv", "w", newline='', encoding='utf-8') as csv_file2:  ###### Change name
    csv_writer2 = csv.writer(csv_file2, delimiter=';')
    csv_writer2.writerow(["Type", "p", "q", "m", "Functionvalue"])
    count = 0
    for ID in YoutuberID[0:]:  ### Change
        try:
            path = "C:\\Users\\Sebastian Gäumann\\OneDrive\\Dokumente\\FS 2017\\Bachelorarbeit\\Python"
            os.chdir(path)
            ### ALL INFO
            Days = pd.read_csv('3_API_Call_ALL_info_Comedy_v2.csv', sep=";", skipinitialspace=True)
            views_path = "C:\\Users\\Sebastian Gäumann\\OneDrive\\Dokumente\\FS 2017\\Bachelorarbeit\\Python\\Daily_Views_Comedy"  ###### Change name
            os.chdir(views_path)
            SVR = pd.read_csv("4_COMEDY_DailyViews_Clean_" + str(count) + "_" + ID + ".csv", sep=";", parse_dates=True, dayfirst=True)  ###### Change name
            ## print(SVR[SVR.columns[0]])
            SVR = SVR[SVR[SVR.columns[0]] < "2018-05-01"]  #### CHANGE DATE FOR DIF CAT
            ## print(SVR)
            ##### SV input
            SV = np.array(SVR["Daily Views"])
            ## print(SV)
            Days = Days[Days["channelId"] == ID]
            ## print(Days)
            Days["publishedAt"] = pd.to_datetime(Days.publishedAt)
            Days = Days[Days["publishedAt"] > "2015-01-08"]  ## "2015-01-10"
            ## print(Days)
            ##### Timedelta #####
            start_date = pd.to_datetime("2015-06-08")
            ## print(start_date)
            video_upload_day = []
            for video_date in Days["publishedAt"]:
                TimeDelta = video_date - start_date
                video_upload_day.append(TimeDelta.days)
            ## print(video_upload_day)
            ## print(videoT)
            nvideos = len(video_upload_day)
            ndays = len(SV)
            videoT = np.array(video_upload_day)
            ## print(videoT, nvideos, ndays)

            def objective(x):
                p = x[0]
                q = x[1]
                m = x[2]
                estimateV = np.zeros((ndays, nvideos))
                for t in range(ndays):
                    for v in range(nvideos):
                        if videoT[v] <= t:
                            estimateV[t, v] = p*m + (q-p) * np.sum(estimateV[0:t, v], axis=0) - (q/m) * (np.sum(estimateV[0:t, v], axis=0)**2)
                estimateSV = np.sum(estimateV, axis=1)
                return np.sum((SV - estimateSV)**2)
This is the minimization part. I made one version for the normal minimization and one for basinhopping, and separated them with ##.
            ###### MINIMIZATION #######
            mguess = round(sum(SV) / (nvideos * 2), 0)
            print(sum(SV), mguess)
            x0 = np.array([0.001, 0.01, mguess])  #### Make it less volatile to first guess? Make bigger steps for m?
            b1 = (0.00001, 0.5)
            b2 = (10**4, 10**7)
            bnds = (b1, b1, b2)
            ## minimizer_kwargs = dict(method="L-BFGS-B", bounds=bnds)
            ## res = basinhopping(objective, x0, niter=20, minimizer_kwargs=minimizer_kwargs)
            res = minimize(objective, x0, bounds=bnds)
            print(res)
            csv_writer2.writerow(["COMEDY", res.x[0], res.x[1], res.x[2], res.fun])  ### CHANGE CAT
            print("CURRENT YOUTUBER IS:", count)
            count += 1
        except:
            print("PROBLEM", count)
            count += 1
        ## print(res, res.x[0], res.x[1], res.x[2], res.fun)
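As for the global-minimum question, one option worth sketching is scipy.optimize.differential_evolution, which searches the whole box defined by the bounds and needs no starting point x0. The maxiter and popsize values below are illustrative assumptions; smaller values run faster but search less thoroughly:

from scipy.optimize import differential_evolution

# bnds is the same ((1e-05, 0.5), (1e-05, 0.5), (1e4, 1e7)) tuple built above
res = differential_evolution(objective, bounds=bnds, maxiter=50, popsize=10, tol=0.01)
print(res.x, res.fun)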
from tkinter import *
from datetime import datetime, timedelta
import pickle
from tkinter import messagebox

filename = "file.pk"  # filename; data stored as dic = {time: [name, item]}
root = Tk()
t = IntVar()
i = StringVar()
n = StringVar()

def Notification_On_Completing(n, i, t):
    return messagebox.showinfo("Completed", "Name:- {}, Item:-{}, in Days:-{}".format(n, i, t))

def Check_If_Time_Is_Over():  # my actual problem is in this function
    with open(filename, "rb") as f:
        dic = pickle.load(f)
    now = datetime.now()
    for i, j in dic:  # trying to loop and check if the time == now()
        if i == now:
            Notification_On_Completing(j[0], j[1], i)  # if it's true, return the key which matched, with its value
        elif i != now:  # if it is not time yet, I am trying to show how much time is left
            print(i - now, "Time has left for name:-{}, item:-{}".format(j[0], j[1]))
        else:
            root.after(10000, Check_If_Time_Is_Over)

def SaveTheDaysToNotify():
    now = datetime.now()
    time = t.get()  # days to wait before notifying
    item = i.get()  # item name
    name = i.get()  # name
    end = now + timedelta(days=time)  # adding today with the number of days to notify
    with open(filename, "rb") as f:  # avoiding the override of the file
        dic = pickle.load(f)
    dic = {end: [name, item]}  # saving the days to notify as dic, which will also hold the name and item
    with open("file.pk", "wb") as f:  # keeping a record of the time to notify
        pickle.dump(dic, f)
    Check_If_Time_Is_Over()

# GUI starts from here
time1 = Entry(root, textvariable=t).pack()  # taking entries for time, name and item
name1 = Entry(root, textvariable=n).pack()
item1 = Entry(root, textvariable=i).pack()
ss = Button(root, text="done", command=SaveTheDaysToNotify).pack()  # adding to the pickle database, formatted as dic = {time: [name, item]}
root.mainloop()
Check_If_Time_Is_Over()
I am trying to make a program which takes an entry of time, item, and name. The time is taken as the number of days after which to show a notification. For example, the program should show the notification after x days, continuously checking whether the x days have passed, even if the program is closed and reopened several times over those days or hours.
You can store the date of the input as well as the number inside a file. You can use datetime.timedelta to add days to your time.
Every time your program is run, you check if this special file is present, read the date and the number, check if date + number_days is today or before today, and create your notification.
If your program is not started, it won't show anything. If your program is not closed in between, it also won't show anything; the latter case could be handled with some timers and periodic checks inside Tk's main loop (remember the hour; if it has changed, check the file again, or something similar).
Example:
import os
from datetime import datetime, timedelta

fn = "myfile.txt"
date = None
days = None

def delDateFile():
    """Remove date file after showing notification, so next time the user will
    be asked to input days again."""
    os.remove(fn)

def writeDate(d: int):
    """Adds d days to now(), writes it to file and returns the date."""
    date = datetime.now() + timedelta(days=d)
    with open(fn, "w") as f:  # overwriting existing file
        f.write(stringFromDate(date))
    return date.date()

def dateFromString(s):
    """Parses a string into a date. Format: '%Y-%m-%d'"""
    try:
        return datetime.strptime(s, '%Y-%m-%d').date()
    except ValueError:
        print("Malformed date")
        return writeDate(-1)

def stringFromDate(d):
    """Makes a string from a date. Format: '%Y-%m-%d'"""
    return datetime.strftime(d, '%Y-%m-%d')  # format the date passed in, not now()

def needToShowNotification():
    def checkIfFileExistsAndExtractDate():
        """Reads a date from file; returns the date, or None if no file exists."""
        if os.path.exists(fn):
            with open(fn, "r") as f:
                date = dateFromString(f.readline())
            return date
        return None

    def askForDaysAndWriteFile():
        """Asks for days and writes a file with the new date. Malformed input produces yesterday's date."""
        try:
            days = int(input("Days? "))
        except (ValueError, EOFError):  # malformed input: empty or no number
            days = -1
        return writeDate(days)

    # prefer date from file, else ask and write file
    date = checkIfFileExistsAndExtractDate() or askForDaysAndWriteFile()
    return date < datetime.now().date()

def Notif():
    print("------------------")
    print("-- NOTIFICATION --")
    print("------------------")
    delDateFile()

if needToShowNotification():
    Notif()
else:
    print("Go ahead, no need to notify")
Run it with a negative days input to see the notification (or delete the created file).
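And a small sketch of the periodic in-process check mentioned above, assuming the functions from the example are available; the one-hour interval is an illustrative choice:

import tkinter as tk

root = tk.Tk()

def periodic_check():
    # re-run the file check; Notif() also deletes the date file
    if needToShowNotification():
        Notif()
    root.after(3600 * 1000, periodic_check)  # schedule the next check in one hour

periodic_check()
root.mainloop()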
I am trying to learn how to use multiprocessing and have managed to get the code below to work. The goal is to work through every combination of the variables within CostlyFunction by setting n equal to some number (right now it is 100, so the first 100 combinations are tested). I was hoping I could manipulate w as each process returned its list (CostlyFunction returns a list of 7 values) and only keep the results in a given range. Right now, w holds all 100 lists and then lets me manipulate them, but when I use n = 10 million, w becomes huge and costly to hold. Is there a way to evaluate CostlyFunction's output as the workers return values and then throw out the values I don't need?
if __name__ == "__main__":
    import csv
    csvFile = open('C:\\Users\\bryan.j.weiner\\Desktop\\test.csv', 'w', newline='')
    #width = -36000000/1000
    #fronteir = [None]*1000
    currtime = time()
    n = 100
    po = Pool()
    res = po.map_async(CostlyFunction, ((i,) for i in range(n)))
    w = res.get()
    spamwriter = csv.writer(csvFile, delimiter=',')
    spamwriter.writerows(w)
    print(('2: parallel: time elapsed:', time() - currtime))
    csvFile.close()
Unfortunately, Pool doesn't have a 'filter' method; otherwise, you might've been able to prune your results before they're returned. Pool.imap is probably the best solution you'll find for dealing with your memory issue: it returns an iterator over the results from CostlyFunction.
For sorting through the results, I made a simple list-based class called TopList that stores a fixed number of items. All of its items are the highest-ranked according to a key function.
from collections import UserList

def keyfunc(a):
    return a[5]  # This would be the sixth item in a result from CostlyFunction

class TopList(UserList):
    def __init__(self, key, *args, cap=10):  # cap is the largest number of results you want to store
        super().__init__(*args)
        self.cap = cap
        self.key = key

    def add(self, item):
        self.data.append(item)
        self.data.sort(key=self.key, reverse=True)
        if len(self.data) > self.cap:  # only drop the lowest-ranked item once the cap is exceeded
            self.data.pop()
Here's how your code might look:
if __name__ == "__main__":
    import csv
    from multiprocessing import Pool
    from time import time

    csvFile = open('C:\\Users\\bryan.j.weiner\\Desktop\\test.csv', 'w', newline='')
    n = 100
    currtime = time()
    po = Pool()
    best = TopList(keyfunc)
    result_iter = po.imap(CostlyFunction, ((i,) for i in range(n)))
    for result in result_iter:
        best.add(result)
    spamwriter = csv.writer(csvFile, delimiter=',')
    spamwriter.writerows(best)  # write the retained top results, not the full list w
    print(('2: parallel: time elapsed:', time() - currtime))
    csvFile.close()
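For clarity, here is how TopList behaves on its own, with plain numbers standing in for CostlyFunction results:

# Toy illustration (hypothetical data): keep only the three largest values seen.
best = TopList(lambda x: x, cap=3)
for value in [5, 1, 9, 7, 3]:
    best.add(value)
print(best.data)  # [9, 7, 5]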