Estimating parameters using minimization in Python and speed up this process - python-3.x

I am trying to find parameter estimates using using minimization. The code I wrote works but there are two problems:
I finds only a local minimum. I tried to solve this by using basinhopping.
It takes very long until I get a result and since I have to do this minimization around 1000 times this becomes a big issue.
So my questions are:
Do you know how I could optimize my code so that it runs faster for the minimization.
Is there a way I can change the basinhopping part so that it runs faster? eg. set niter lower or a differnt method im not aware of. I tried running it like this and after 10 hour I didnt get a response for even one of the 1000 individuals for basinhopping.
Is there another way to find a global minimum?
Feel free to ask further questions please.
My code:
import numpy as np
from scipy.optimize import minimize
from scipy.optimize import basinhopping
from scipy.integrate import odeint
import pickle
import os
import pandas as pd
import datetime
import numpy.random as npr
import csv
path = "C:\\Users\Sebastian Gäumann\OneDrive\Dokumente\FS 2017\Bachelorarbeit\Python"
os.chdir(path)
###IDS
df = pd.read_csv('1_Youtuber_SingleNrSheet_Comedy.csv', sep = ";", skipinitialspace=True) ######Change Name
YoutuberID = df["Channel_ID"].tolist()
##print(YoutuberID)
with open("9_p_q_m_Fun_ExtendedBass_VIEWS_Comedy_test.csv", "w" ,newline='',encoding='utf-8') as csv_file2: ######Change Name
csv_writer2 = csv.writer(csv_file2, delimiter=';')
csv_writer2.writerow(["Type","p", "q", "m","Functionvalue"])
count = 0
for ID in YoutuberID[0:]: ###Change
try:
path = "C:\\Users\Sebastian Gäumann\OneDrive\Dokumente\FS 2017\Bachelorarbeit\Python"
os.chdir(path)
###ALL INFO
Days = pd.read_csv('3_API_Call_ALL_info_Comedy_v2.csv', sep = ";", skipinitialspace=True)
views_path = "C:\\Users\Sebastian Gäumann\OneDrive\Dokumente\FS 2017\Bachelorarbeit\Python\Daily_Views_Comedy" ######Change Name
os.chdir(views_path)
SVR = pd.read_csv("4_COMEDY_DailyViews_Clean_" + str(count) + "_" + ID + ".csv", sep = ";", parse_dates=True, dayfirst=True) ######Change Name
## print(SVR[SVR.columns[0]])
SVR = SVR[SVR[SVR.columns[0]]< "2018-05-01"] ####CHANGE DATE FOR DIF CAT
## print(SVR)
#####SV Input
SV = np.array(SVR["Daily Views"])
## print(SV)
Days = Days[Days["channelId"] == ID]
## print(Days)
Days["publishedAt"] = pd.to_datetime(Days.publishedAt)
Days = Days[Days["publishedAt"] > "2015-01-08"] ##"2015-01-10"
## print(Days)
##### Timedelta #####
start_date = pd.to_datetime("2015-06-08")
##print(start_date)
video_upload_day =[]
for video_date in Days["publishedAt"]:
TimeDelta = video_date - start_date
video_upload_day.append(TimeDelta.days)
##print(video_upload_day)
##print(videoT)
nvideos = len(video_upload_day)
ndays = len(SV)
videoT = np.array(video_upload_day)
## print(videoT,nvideos,ndays)
def objective(x):
p = x[0]
q = x[1]
m = x[2]
estimateV = np.zeros( (ndays, nvideos) )
for t in range( ndays ):
for v in range( nvideos ):
if videoT[v] <= t:
estimateV[ t,v ] = p*m + (q-p) * np.sum(estimateV[0:t,v],axis=0) - (q/m) * (np.sum(estimateV[0:t,v],axis=0)**2)
estimateSV = np.sum( estimateV, axis = 1 )
return np.sum( (SV - estimateSV)**2 )
This is the minimization part. I made one for the normal minimization and one for basinhopping and seperated it with ##.
###### MINIMIZATION #######
mguess = round(sum(SV)/(nvideos*2),0)
print(sum(SV),mguess)
x0 = np.array([0.001, 0.01, mguess]) ####Make it less volatile to first guess? Make bigger steps for m?
b1 = (0.00001,0.5)
b2 = (10**4,10**7)
bnds = (b1,b1,b2)
## minimizer_kwargs = dict(method="L-BFGS-B",bounds=bnds)
## res = basinhopping(objective, x0,niter=20, minimizer_kwargs=minimizer_kwargs)
res = minimize(objective, x0,bounds = bnds)
print(res)
csv_writer2.writerow(["COMEDY",res.x[0], res.x[1],res.x[2],res.fun]) ###CHANNGE CAT
print("CURRERNT YOUTUBER IS:",count)
count += 1
except:
print("PROBLEM",count)
count += 1
## print(res,res.x[0],res.x[1],res.x[2],res.fun)

Related

Reading in multiple files in Python and saving them one by one in a different directory

import glob
import pandas as pd
import seaborn as sns
import numpy as np
from scipy import signal
import matplotlib.pyplot as plt
files = glob.glob("Angular_position_*_*.csv")
output = pd.DataFrame()
for f in files:
df = pd.read_csv(f)
time = df.iloc[:,0]
time = time.to_numpy()
ynew = df.iloc[:,1:]
ynew = ynew.to_numpy()
lowPassCutoffFreq = 6.0 # Cut off frequency
Sample_freq = 150; #Target sample frequency
N = 2 # Order of the filter; In this case 2nd order
Wn = lowPassCutoffFreq/(Sample_freq/2) #Normalize frequency
b, a = signal.butter(5, Wn, btype='low',analog=False,output='ba')
#scipy.signal.butter(N, Wn, btype='low', analog=False, output='ba', fs=None)
output = signal.filtfilt(b, a, ynew, axis=0)
np.savetxt("enter directory path/Filtered_files/Filtered_Angular_position_*_*", output, delimiter = ', ', newline = "\n")
I am trying to read in all files in a directory, they are then low pass filtered. After that the results are saved one after the other but not in one file. The result gives each files with 3 columns and ideally I would like them to named with headers e.g. col1, col2, col3.
Without using glob, I can filter all my files individually but I have more than 100 such files.
Any help would be appreciated.
best wishes,
I have partially solved the issue apart from the header names:
import glob
import pandas as pd
from tnorma import tnorma
import seaborn as sns
import numpy as np
from scipy import signal
import matplotlib.pyplot as plt
path = r'location_of_dir'
all_files = glob.glob(path + '/*.csv')
# yn = np.zeros(shape = (101,1))
# tn = np.zeros(shape = (101,1))
#ynew = []
yn = np.zeros(shape = (101,1))
for filename in all_files:
df = pd.read_csv(filename, index_col=None, header=0)
print(filename)
foo = filename.split("/")[-1]
#df = pd.read_csv(f)
time = df.iloc[:,0]
time = time.to_numpy()
ynew = df.iloc[:,1:]
ynew = ynew.to_numpy()
#print(ynew)
lowPassCutoffFreq = 6.0 # Cut off frequency
Sample_freq = 150; #Target sample frequency
N = 2 # Order of the filter; In this case 2nd order
Wn = lowPassCutoffFreq/(Sample_freq/2) #Normalize frequency
b, a = signal.butter(5, Wn, btype='low',analog=False,output='ba')
#scipy.signal.butter(N, Wn, btype='low', analog=False, output='ba', fs=None)
output = signal.filtfilt(b, a, ynew, axis=0)
#print (output)
tn = np.linspace(0, 100, 101) # new time vector for the new time-normalized data
yn, tn, indie = tnorma(output, k=3, smooth =1, mask = None, show = False)
np.savetxt("path_name/foldername/file"+ foo, yn, delimiter = ', ', newline = "\n")
However, I am having difficulty in putting header names on the 3 columns per file.

How can I speed these API queries up?

I am feeding a long list of inputs in a function that calls an API to retrieve data. My list is around 40.000 unique inputs. Currently, the function returns output every 1-2 seconds or so. Quick maths tells me that it would take over 10+ hrs before my function will be done. I therefore want to speed this process up, but have struggles finding a solution. I am quite a beginner, so threading/pooling is quite difficult for me. I hope someone is able to help me out here.
The function:
import quandl
import datetime
import numpy as np
quandl.ApiConfig.api_key = 'API key here'
def get_data(issue_date, stock_ticker):
# Prepare var
stock_ticker = "EOD/" + stock_ticker
# Volatility
date_1 = datetime.datetime.strptime(issue_date, "%d/%m/%Y")
pricing_date = date_1 + datetime.timedelta(days=-40) # -40 days of issue date
volatility_date = date_1 + datetime.timedelta(days=-240) # -240 days of issue date (-40,-240 range)
# Check if code exists : if not -> return empty array
try:
stock = quandl.get(stock_ticker, start_date=volatility_date, end_date=pricing_date) # get pricing data
except quandl.errors.quandl_error.NotFoundError:
return []
daily_close = stock['Adj_Close'].pct_change() # returns using adj.close
stock_vola = np.std(daily_close) * np.sqrt(252) # annualized volatility
# Average price
stock_pricing_date = date_1 + datetime.timedelta(days=-2) # -2 days of issue date
stock_pricing_date2 = date_1 + datetime.timedelta(days=-12) # -12 days of issue date
stock_price = quandl.get(stock_ticker, start_date=stock_pricing_date2, end_date=stock_pricing_date)
stock_price_average = np.mean(stock_price['Adj_Close']) # get average price
# Amihuds Liquidity measure
liquidity_pricing_date = date_1 + datetime.timedelta(days=-20)
liquidity_pricing_date2 = date_1 + datetime.timedelta(days=-120)
stock_data = quandl.get(stock_ticker, start_date=liquidity_pricing_date2, end_date=liquidity_pricing_date)
p = np.array(stock_data['Adj_Close'])
returns = np.array(stock_data['Adj_Close'].pct_change())
dollar_volume = np.array(stock_data['Adj_Volume'] * p)
illiq = (np.divide(returns, dollar_volume))
print(np.nanmean(illiq))
illiquidity_measure = np.nanmean(illiq, dtype=float) * (10 ** 6) # multiply by 10^6 for expositional purposes
return [stock_vola, stock_price_average, illiquidity_measure]
I then use a seperate script to select my csv file with the list with rows, each row containing the issue_date, stock_ticker
import function
import csv
import tkinter as tk
from tkinter import filedialog
# Open File Dialog
root = tk.Tk()
root.withdraw()
file_path = filedialog.askopenfilename()
# Load Spreadsheet data
f = open(file_path)
csv_f = csv.reader(f)
next(csv_f)
result_data = []
# Iterate
for row in csv_f:
try:
return_data = function.get_data(row[1], row[0])
if len(return_data) != 0:
# print(return_data)
result_data_loc = [row[1], row[0]]
result_data_loc.extend(return_data)
result_data.append(result_data_loc)
except AttributeError:
print(row[0])
print('\n\n')
print(row[1])
continue
if result_data is not None:
with open('resuls.csv', mode='w', newline='') as result_file:
csv_writer = csv.writer(result_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
for result in result_data:
# print(result)
csv_writer.writerow(result)
else:
print("No results found!")
It is quite messy, but like I mentioned before, I am definitely a beginner. Speeding this up would greatly help me.

Series format pandas

import pandas as pd
from datetime import datetime
import os
# get username
user = os.getlogin()
def file_process():
data = pd.read_excel('C:\\Users\\' + user + '\\My Documents\\XINVST.xls')
# Change the date and time formatting
data["INVDAT"] = data["INVDAT"].apply(lambda x: datetime.combine(x, datetime.min.time()))
data["INVDAT"] = data["INVDAT"].dt.strftime("%m-%d-%Y")
print(data)
# output to new file
# new_data = data
# new_data.to_excel('C:\\Users\\' + user + '\\Desktop\\XINVST.xls', index=None)
if __name__ == '__main__':
file_process()
I'm trying to format the INVDAT column to correct date format like 11/25/19, I've tried multiple solutions but keep running into errors like this one: TypeError: combine() argument 1 must be datetime.date, not int, I then tried to convert the integer to date type but it errors also.
Or you can simply use df["INVDAT"] = pd.to_datetime(df["INVDAT"], format="%m/%d/%y"), in this case you don't need the datetime pakage. For further information you should look the docs.
data['INVDAT'] = data['INVDAT'].astype('str')
data["INVDAT"] = pd.to_datetime(data["INVDAT"])
data["INVDAT"] = data["INVDAT"].dt.strftime("%m/%d/%Y")
This solution works but if the date representation is a single month like 12519 ( expected output 1/25/19), it fails. I tried using a conditional to add a 0 to the front if len() < 6 but it gives me an error that the dtype is int64.
import pandas as pd
import os
# get username
user = os.getlogin()
def file_process():
data = pd.read_excel('C:\\Users\\' + user + '\\My Documents\\XINVST.xls')
# Change the date and time formatting
data['INVDAT'] = data['INVDAT'].astype('str')
length = len(data['INVDAT'])
data['INVDAT'].pop(length - 1)
for i in data['INVDAT'].str.len():
if i <= 5:
data['INVDAT'] = data['INVDAT'].apply(lambda x: '{0:0>6}'.format(x))
length = len(data['INVDAT'])
data['INVDAT'].pop(length - 1)
data["INVDAT"] = pd.to_datetime(data["INVDAT"])
data["INVDAT"] = data["INVDAT"].dt.strftime("%m/%d/%Y")
else:
data["INVDAT"] = pd.to_datetime(data["INVDAT"])
data["INVDAT"] = data["INVDAT"].dt.strftime("%m/%d/%Y")
# output to new file
new_data = data
new_data.to_excel('C:\\Users\\' + user + '\\Desktop\\XINVST.xls', index=None)
if __name__ == '__main__':
file_process()
This is the solution, it's sloppy but works

Goodness of fit always being zero despite taking random data?

I'm trying to write code that generates random data and computes goodness of fit but I'm not understanding why the chi-squared test is always zero, may I have a fix for this ? For an attempted fix I tried playing around with different types to see if I get any resulting changes in the initial output, also I've tried changing the parameters to the loop in question.
from scipy import stats
import math
import random
import numpy
import scipy
import numpy as np
def Linear_Chi2_Generate(observed_values = [], expected_values = []):
#===============================================================#
# !!!!!!! Generation of Data !!!!!!!!!! #
#===============================================================#
for i in range(0,12):
a = random.randint(-10,10)
b = random.randint(-10,10)
y = a * (b + i)
observed_values.append(y)
#######################################################################################
# !!! Array Setup !!!! #
# ***Had the Array types converted to floats before computing Chi2*** #
# #
#######################################################################################
t_s = 0
o_v = np.array(observed_values)
e_v = np.array(expected_values)
o_v_f = o_v.astype(float)
e_v_f = o_v.astype(float)
z_o_e_v_f = zip(o_v.astype(float), e_v.astype(float))
######################################################################################
for i in z_o_e_v_f:
t_s += [((o_v_f)-(e_v_f))]**2/(e_v_f) # Computs the Chi2 Stat !
######################################################################################
print("Observed Values ", o_v_f)
print("Expected Values" , e_v_f)
df=len(o_v_f)-1
print("Our goodness of fit for our linear function", stats.chi2.cdf(t_s,df))
return t_s
Linear_Chi2_Generate()
In your original code, e_v_f = o_v.astype(float) made o_v_f, e_v_f ending up the same. There was also some issue in the for loop. I have edited your code a bit. See what it does you are looking for:
from scipy import stats
import math
import random
import numpy
import scipy
import numpy as np
def Linear_Chi2_Generate(observed_values = [], expected_values = []):
#===============================================================#
# !!!!!!! Generation of Data !!!!!!!!!! #
#===============================================================#
for i in range(0,12):
a_o = random.randint(-10,10)
b_o = random.randint(-10,10)
y_o = a_o * (b_o + i)
observed_values.append(y_o)
# a_e = random.randint(-10,10)
# b_e = random.randint(-10,10)
# y_e = a_e * (b_e + i)
expected_values.append(y_o + 5)
#######################################################################################
# !!! Array Setup !!!! #
# ***Had the Array types converted to floats before computing Chi2*** #
# #
#######################################################################################
t_s = 0
o_v = np.array(observed_values)
e_v = np.array(expected_values)
o_v_f = o_v.astype(float)
e_v_f = e_v.astype(float)
z_o_e_v_f = zip(o_v.astype(float), e_v.astype(float))
######################################################################################
for o, e in z_o_e_v_f:
t_s += (o - e) **2 / e # Computs the Chi2 Stat !
######################################################################################
print("Observed Values ", o_v_f)
print("Expected Values" , e_v_f)
df=len(o_v_f)-1
print("Our goodness of fit for our linear function", stats.chi2.cdf(t_s,df))
return t_s
Linear_Chi2_Generate()

Subtraction between 'dict_values' and 'float'

I am getting the error "TypeError: unsupported operand type(s) for -: 'dict_values' and 'float'" from line 173 in the sample code. I have copied from a book that does not yet seem to be updated to Python 3 and other forum topics don't seem to cover this problem.
It is trying to calculate the error in an optimsation for the difference in market values and model values, but the data storage type is different across the two.
Thanks
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import calendar
# frame
from get_year_deltas import get_year_deltas
from constant_short_rate import constant_short_rate
from market_environment import market_environment
from plot_option_stats import plot_option_stats
# simulation
from sn_random_numbers import sn_random_numbers
from simulation_class import simulation_class
from geometric_brownian_motion import geometric_brownian_motion
from jump_diffusion import jump_diffusion
from square_root_diffusion import square_root_diffusion
# valuation
from valuation_class import valuation_class
from valuation_mcs_european import valuation_mcs_european
from valuation_mcs_american import valuation_mcs_american
from derivatives_position import derivatives_position
from derivatives_portfolio import derivatives_portfolio
#import os
#path = os.getcwd()
url = 'http://www.stoxx.com/download/historical_values/h_vstoxx.txt'
vstoxx_index = pd.read_csv(url, index_col=0, header=2,parse_dates=True, dayfirst=True)
vstoxx_index = vstoxx_index[('2013/12/31' < vstoxx_index.index) & (vstoxx_index.index < '2014/4/1')]
vstoxx_futures = pd.read_excel('./vstoxx_march_2014.xlsx', 'vstoxx_futures')
del vstoxx_futures['A_SETTLEMENT_PRICE_SCALED']
del vstoxx_futures['A_CALL_PUT_FLAG']
del vstoxx_futures['A_EXERCISE_PRICE']
del vstoxx_futures['A_PRODUCT_ID']
columns = ['DATE', 'EXP_YEAR', 'EXP_MONTH', 'PRICE']
vstoxx_futures.columns = columns
def third_friday(date):
day = 21 - (calendar.weekday(date.year, date.month, 1) + 2) % 7
return dt.datetime(date.year, date.month, day)
set(vstoxx_futures['EXP_MONTH'])
third_fridays = {}
for month in set(vstoxx_futures['EXP_MONTH']):
third_fridays[month] = third_friday(dt.datetime(2014, month, 1))
#third_fridays
tf = lambda x: third_fridays[x]
vstoxx_futures['MATURITY'] = vstoxx_futures['EXP_MONTH'].apply(tf)
#vstoxx_futures.tail()
vstoxx_options = pd.read_excel('./vstoxx_march_2014.xlsx', 'vstoxx_options')
#vstoxx_options.info()
del vstoxx_options['A_SETTLEMENT_PRICE_SCALED']
del vstoxx_options['A_PRODUCT_ID']
columns = ['DATE', 'EXP_YEAR', 'EXP_MONTH', 'TYPE', 'STRIKE', 'PRICE']
vstoxx_options.columns = columns
vstoxx_options['MATURITY'] = vstoxx_options['EXP_MONTH'].apply(tf)
#vstoxx_options.head()
vstoxx_options['STRIKE'] = vstoxx_options['STRIKE'] / 100.0
save = False
if save is True:
import warnings
warnings.simplefilter('ignore')
h5 = pd.HDFStore('./vstoxx_march_2014.h5', complevel=9, complib='blosc')
h5['vstoxx_index'] = vstoxx_index
h5['vstoxx_futures'] = vstoxx_futures
h5['vstoxx_options'] = vstoxx_options
h5.close()
pricing_date = dt.datetime(2014, 3, 31)
# last trading day in March 2014
maturity = third_fridays[10]
# October maturity
initial_value = vstoxx_index['V2TX'][pricing_date]
# VSTOXX on pricing_date
forward = vstoxx_futures[(vstoxx_futures.DATE == pricing_date) & (vstoxx_futures.MATURITY == maturity)]['PRICE'].values[0]
tol = 0.20
option_selection = vstoxx_options[(vstoxx_options.DATE == pricing_date)
& (vstoxx_options.MATURITY == maturity)
& (vstoxx_options.TYPE == 'C')
& (vstoxx_options.STRIKE > (1 - tol) * forward)
& (vstoxx_options.STRIKE < (1 + tol) * forward)]
me_vstoxx = market_environment('me_vstoxx', pricing_date)
me_vstoxx.add_constant('initial_value', initial_value)
me_vstoxx.add_constant('final_date', maturity)
me_vstoxx.add_constant('currency', 'EUR')
me_vstoxx.add_constant('frequency', 'B')
me_vstoxx.add_constant('paths', 10000)
csr = constant_short_rate('csr', 0.01)
# somewhat arbitrarily chosen here
me_vstoxx.add_curve('discount_curve', csr)
# parameters to be calibrated later
me_vstoxx.add_constant('kappa', 1.0)
me_vstoxx.add_constant('theta', 1.2 * initial_value)
vol_est = vstoxx_index['V2TX'].std() * np.sqrt(len(vstoxx_index['V2TX']) / 252.0)
me_vstoxx.add_constant('volatility', vol_est)
# vol_est
vstoxx_model = square_root_diffusion('vstoxx_model', me_vstoxx)
me_vstoxx.add_constant('strike', forward)
me_vstoxx.add_constant('maturity', maturity)
payoff_func = 'np.maximum(maturity_value - strike, 0)'
vstoxx_eur_call = valuation_mcs_european('vstoxx_eur_call',vstoxx_model, me_vstoxx, payoff_func)
option_models = {}
for option in option_selection.index:
strike = option_selection['STRIKE'].ix[option]
me_vstoxx.add_constant('strike', strike)
option_models[option] = valuation_mcs_european( 'eur_call_%d' % strike, vstoxx_model, me_vstoxx, payoff_func )
def calculate_model_values(p0):
'''
Returns all relevant option values.
Parameters
p0 : tuple/list, tuple of kappa, theta, volatility
Returns
model_values : dict, dictionary with model values
'''
kappa, theta, volatility = p0
vstoxx_model.update(kappa=kappa,
theta=theta,
volatility=volatility)
model_values = {}
for option in option_models:
model_values[option] = option_models[option].present_value(fixed_seed=True)
return model_values
# calculate_model_values((0.5, 27.5, vol_est))
i = 0
def mean_squared_error(p0):
'''
Returns the mean-squared error given the model and market values.
Parameters
p0 : tuple/list, tuple of kappa, theta, volatility
Returns
MSE : float, mean-squared error
'''
global i
model_values = np.array(calculate_model_values(p0).values())
market_values = option_selection['PRICE'].values
option_diffs = model_values - market_values
MSE = np.sum(option_diffs ** 2) / len(option_diffs)
# vectorized MSE calculation
if i % 20 == 0:
if i == 0:
print( '%4s' % i, '%6s' % "kappa", '%6s' % "theta", '%6s —>' % "vola", '%6s' % "MSE")
print( '%4d' % i, '%6.3f' % p0[0], '%6.3f' % p0[1], '%6.3f —>' % p0[2], '%6.3f' % MSE )
i += 1
return MSE
mean_squared_error((0.5, 27.5, vol_est))

Resources