Goodness of fit always being zero despite taking random data? - python-3.x

I'm trying to write code that generates random data and computes goodness of fit but I'm not understanding why the chi-squared test is always zero, may I have a fix for this ? For an attempted fix I tried playing around with different types to see if I get any resulting changes in the initial output, also I've tried changing the parameters to the loop in question.
from scipy import stats
import math
import random
import numpy
import scipy
import numpy as np
def Linear_Chi2_Generate(observed_values = [], expected_values = []):
#===============================================================#
# !!!!!!! Generation of Data !!!!!!!!!! #
#===============================================================#
for i in range(0,12):
a = random.randint(-10,10)
b = random.randint(-10,10)
y = a * (b + i)
observed_values.append(y)
#######################################################################################
# !!! Array Setup !!!! #
# ***Had the Array types converted to floats before computing Chi2*** #
# #
#######################################################################################
t_s = 0
o_v = np.array(observed_values)
e_v = np.array(expected_values)
o_v_f = o_v.astype(float)
e_v_f = o_v.astype(float)
z_o_e_v_f = zip(o_v.astype(float), e_v.astype(float))
######################################################################################
for i in z_o_e_v_f:
t_s += [((o_v_f)-(e_v_f))]**2/(e_v_f) # Computs the Chi2 Stat !
######################################################################################
print("Observed Values ", o_v_f)
print("Expected Values" , e_v_f)
df=len(o_v_f)-1
print("Our goodness of fit for our linear function", stats.chi2.cdf(t_s,df))
return t_s
Linear_Chi2_Generate()

In your original code, `e_v_f = o_v.astype(float)` made `o_v_f` and `e_v_f` identical, so the difference between them was always zero. There was also an issue in the `for` loop. I have edited your code a bit; see whether it does what you are looking for:
from scipy import stats
import math
import random
import numpy
import scipy
import numpy as np
def Linear_Chi2_Generate(observed_values=None, expected_values=None):
    """Generate random observed/expected data and return the chi-squared statistic.

    Twelve random samples ``y = a * (b + i)`` (``a`` and ``b`` drawn uniformly
    from the integers -10..10, ``i`` in 0..11) are appended to the observed
    data; for each of them ``y + 5`` is appended to the expected data.  The
    statistic ``sum((o - e) ** 2 / e)`` is then computed over the paired values.

    Parameters
    ----------
    observed_values : list, optional
        Existing observed values to start from.  The list is copied, so the
        caller's object is never modified.  Defaults to an empty list.
    expected_values : list, optional
        Existing expected values to start from (copied as above).

    Returns
    -------
    float
        The chi-squared test statistic.  The observed/expected arrays and the
        value of ``stats.chi2.cdf(t_s, df)`` are printed as a side effect.
    """
    # The defaults used to be ``[]``: a mutable default is created once and
    # shared by every call, so repeated calls kept accumulating old samples.
    observed = [] if observed_values is None else list(observed_values)
    expected = [] if expected_values is None else list(expected_values)

    # ---- Generation of data ----
    for i in range(12):
        a_o = random.randint(-10, 10)
        b_o = random.randint(-10, 10)
        y_o = a_o * (b_o + i)
        observed.append(y_o)
        expected.append(y_o + 5)

    # ---- Array setup: convert to floats before computing chi2 ----
    o_v_f = np.array(observed).astype(float)
    e_v_f = np.array(expected).astype(float)

    # ---- Chi2 statistic, accumulated pair by pair ----
    # NOTE: an expected value of 0 (i.e. y == -5) makes the term infinite.
    t_s = 0
    for o, e in zip(o_v_f, e_v_f):
        t_s += (o - e) ** 2 / e

    print("Observed Values ", o_v_f)
    print("Expected Values" , e_v_f)
    df = len(o_v_f) - 1
    # NOTE(review): cdf gives P(X <= t_s); the conventional goodness-of-fit
    # p-value is the upper tail, stats.chi2.sf(t_s, df) -- confirm intent.
    print("Our goodness of fit for our linear function", stats.chi2.cdf(t_s, df))
    return t_s
Linear_Chi2_Generate()

Related

How can I interpolate values from two lists (in Python)?

I am relatively new to coding in Python. I have mainly used MatLab in the past and am used to having vectors that can be referenced explicitly rather than appended lists. I have a script where I generate a list of x- and y- (z-, v-, etc) values. Later, I want to interpolate and then print a table of the values at specified points. Here is a MWE. The problem is at line 48:
yq = interp1d(x_list, y_list, xq(nn))#interp1(output1(:,1),output1(:,2),xq(nn))
I'm not sure I have the correct syntax for the last two lines either:
table[nn] = ('%.2f' %xq, '%.2f' %yq)
print(table)
Here is the full script for the MWE:
#This script was written to test how to interpolate after data was created in a loop and stored as a list. Can a list be accessed explicitly like a vector in matlab?
#
from scipy.interpolate import interp1d
from math import * #for ceil
from astropy.table import Table #for Table
import numpy as np
# define the initial conditions
x = 0 # initial x position
y = 0 # initial y position
Rmax = 10 # maxium range
""" initializing variables for plots"""
x_list = [x]
y_list = [y]
""" define functions"""
# not necessary for this MWE
"""create sample data for MWE"""
# x and y data are calculated using functions and appended to their respective lists
h = 1
t = 0
tf = 10
N=ceil(tf/h)
# Example of interpolation without a loop: https://docs.scipy.org/doc/scipy/tutorial/interpolate.html#d-interpolation-interp1d
#x = np.linspace(0, 10, num=11, endpoint=True)
#y = np.cos(-x**2/9.0)
#f = interp1d(x, y)
for i in range(N):
x = h*i
y = cos(-x**2/9.0)
""" appends selected data for ability to plot"""
x_list.append(x)
y_list.append(y)
## Interpolation after x- and y-lists are already created
intervals = 0.5
nfinal = ceil(Rmax/intervals)
NN = nfinal+1 # length of table
dtype = [('Range (units?)', 'f8'), ('Drop? (units)', 'f8')]
table = Table(data=np.zeros(N, dtype=dtype))
for nn in range(NN):#for nn = 1:NN
xq = 0.0 + (nn-1)*intervals #0.0 + (nn-1)*intervals
yq = interp1d(x_list, y_list, xq(nn))#interp1(output1(:,1),output1(:,2),xq(nn))
table[nn] = ('%.2f' %xq, '%.2f' %yq)
print(table)
Your help and patience will be greatly appreciated!
Best regards,
Alex
Your code has some glaring issues that made it really difficult to understand. Let's first take a look at some things I needed to fix:
for i in range(N):
x = h*1
y = cos(-x**2/9.0)
""" appends selected data for ability to plot"""
x_list.append(x)
y_list.append(y)
You are appending a single value without modifying it. What I presume you wanted is down below.
intervals = 0.5
nfinal = ceil(Rmax/intervals)
NN = nfinal+1 # length of table
dtype = [('Range (units?)', 'f8'), ('Drop? (units)', 'f8')]
table = Table(data=np.zeros(N, dtype=dtype))
for nn in range(NN):#for nn = 1:NN
xq = 0.0 + (nn-1)*intervals #0.0 + (nn-1)*intervals
yq = interp1d(x_list, y_list, xq(nn))#interp1(output1(:,1),output1(:,2),xq(nn))
table[nn] = ('%.2f' %xq, '%.2f' %yq)
This is where things get strange. First: use pandas tables, this is the more popular choice. Second: I have no idea what you are trying to loop over. What I presume you wanted was to vary the number of points for the interpolation, which I have done so below. Third: you are trying to interpolate a point, when you probably want to interpolate over a range of points (...interpolation). Lastly, you are using the interp1d function incorrectly. Please take a look at the code below or run it here; let me know what you exactly wanted (specifically: what should xq / xq(nn) be?), because the MRE you provided is quite confusing.
from scipy.interpolate import interp1d
from math import *
import numpy as np
# Build sample data on an integer grid, then evaluate a linear interpolant
# on progressively longer half-step grids and print every grid and result.
Rmax = 10
h = 1
t = 0
tf = 10
N = ceil(tf/h)
x = np.arange(0, N + 1)
y = np.cos(-x**2/9.0)
interval = 0.5
NN = ceil(Rmax/interval) + 1
# Upper end point of each evaluation grid: 1.0, 1.5, ..., up to (not
# including) interval*NN.
ip_list = np.arange(1, interval*NN, interval)
# The interpolant depends only on (x, y), so it is built once here instead
# of being rebuilt on every loop iteration.
f = interp1d(x, y)
xtable = []
ytable = []
for nn in ip_list:
    # Query points from 0 to nn (inclusive) in steps of `interval`.
    x_i = np.arange(0, nn + interval, interval)
    xtable.append(x_i)
    ytable.append(f(x_i))
# Plain loops rather than list comprehensions: printing is a side effect
# and the comprehension's result list was being thrown away.
for row in xtable:
    print(row)
for row in ytable:
    print(row)

Trying to rule out astrology but something is wrong

I am trying to rule out a possible astrology effect on populations as a statistically insignificant effect but to no avail. I am using Pearson's Chi Square test on two distributions of sun signs from two different populations one of astronaut pilots and the other one of celebrities. Something must be wrong but I failed to find it, probably on the statistics side.
import numpy as np
import pandas as pd
import ephem
from collections import Counter, namedtuple
import matplotlib.pyplot as plt
from scipy import stats
models = pd.read_csv('models.csv', delimiter=',')
astronauts = pd.read_csv('astronauts.csv', delimiter=',')
models = models.sample(229)
astronauts = astronauts.sample(229)
sun = ephem.Sun()
def get_planet_constellation(planet, dataset):
    """Return, for each birth date in *dataset*, the constellation of *planet*.

    For every entry of ``dataset['Birth Date']`` the body is recomputed with
    ``planet.compute(...)`` and element ``[1]`` of ``ephem.constellation(planet)``
    is collected (presumably the full constellation name -- confirm against
    the PyEphem documentation).

    The results are returned as a list in the same order as the dates.
    Note that *planet* is mutated: after the call it holds the state computed
    for the last date.
    """
    constellations = []
    for birth_date in dataset['Birth Date']:
        planet.compute(birth_date)
        located = ephem.constellation(planet)
        constellations.append(located[1])
    return constellations
def plot_bar_group(planet, data1, data2):
    """Draw two overlaid, semi-transparent bar charts and display the figure.

    *data1* and *data2* are mappings whose keys are used as bar positions and
    whose values are used as bar heights; *data1* is labelled 'astronauts'
    and *data2* 'models' in the legend.  ``planet.name`` is inserted into the
    y-axis label and the title.  Nothing is returned.
    """
    _fig, axes = plt.subplots()
    # Both series share the same axes; alpha=0.5 keeps the overlap visible.
    for counts in (data1, data2):
        axes.bar(counts.keys(), counts.values(), alpha=0.5)
    axes.legend(['astronauts', 'models'])
    body_name = planet.name
    axes.set_ylabel('Percentages of ' + body_name + ' in constellation')
    axes.set_title('Histogram of ' + body_name + ' in constellation by group')
    plt.show()
astronaut_sun_constellation = Counter(
get_planet_constellation(sun, astronauts))
model_sun_constellation = Counter(get_planet_constellation(sun, models))
plot_bar_group(sun, astronaut_sun_constellation, model_sun_constellation)
a = list(astronaut_sun_constellation.values())
b = list(model_sun_constellation.values())
s = np.array([a, b])
stat, p, dof, expected = stats.chi2_contingency(s)
print(stat, p, dof, expected)
prob = 0.95
critical = stats.chi2.ppf(prob, dof)
if abs(stat) >= critical:
print('Dependent (reject H0)')
else:
print('Independent (fail to reject H0)')
# interpret p-value
alpha = 1.0 - prob
if p <= alpha:
print('Dependent (reject H0)')
else:
print('Independent (fail to reject H0)')
https://www.dropbox.com/s/w7rye6m5lbihjlh/astronauts.csv
https://www.dropbox.com/s/xlxanr0pxqtxcvv/models.csv
I eventually found the bug: the counter values were being passed to the chi-square function as lists without a consistent ordering. Both counters must be sorted first so that the two rows line up category by category; otherwise the test sees a major difference between the counters' values. With that fixed, all astrology effects are now statistically insignificant at the 0.95 level, as expected.

Finding the minimum using fmin()

I am trying to minimize "function()" with respect to two parameters. I did so by creating mesh arrays and passing them to "function()", which returns a similarly shaped array of values. However, when I use "fmin()" to find the minimum, I get an error saying that the operands could not be broadcast together.
The code is pasted below:
import numpy as np
from scipy.optimize import fmin
import matplotlib.pyplot as plt
i=0
x_values = np.arange(-10,10,2)
y_values = np.arange(-10,10,2)
x_mesh = np.empty((0,len(x_values)))
y_mesh = np.empty((0,len(y_values)))
for i in range(len(x_values)):
y_mesh = np.vstack((y_mesh, y_values))
i=0
for i in range(len(y_values)):
x_mesh = np.vstack((x_mesh, x_values))
y_mesh = np.transpose(y_mesh)
def function(x_mesh, y_mesh):
return (2*x_mesh**2 + y_mesh**2)
''' Want to minimize function '''
x_start = np.zeros((len(x_values), len(y_values)))
y_start = x_start
y = fmin(lamda x_mesh: function(x_mesh, y_mesh), (x_start, y_start), full_output = True, disp = 0)
The output shown was:
File "C:/Users/User/Documents/Year2/Programming/elrter.py", line 42, in function
return (2*x_mesh**2 + y_mesh**2)
ValueError: operands could not be broadcast together with shapes (200,) (10,10)
But why does this happen? What is the solution?

Estimating parameters using minimization in Python and speed up this process

I am trying to find parameter estimates using minimization. The code I wrote works, but there are two problems:
It finds only a local minimum. I tried to solve this by using basinhopping.
It takes very long until I get a result and since I have to do this minimization around 1000 times this becomes a big issue.
So my questions are:
Do you know how I could optimize my code so that it runs faster for the minimization.
Is there a way I can change the basinhopping part so that it runs faster? eg. set niter lower or a differnt method im not aware of. I tried running it like this and after 10 hour I didnt get a response for even one of the 1000 individuals for basinhopping.
Is there another way to find a global minimum?
Feel free to ask further questions please.
My code:
import numpy as np
from scipy.optimize import minimize
from scipy.optimize import basinhopping
from scipy.integrate import odeint
import pickle
import os
import pandas as pd
import datetime
import numpy.random as npr
import csv
path = "C:\\Users\Sebastian Gäumann\OneDrive\Dokumente\FS 2017\Bachelorarbeit\Python"
os.chdir(path)
###IDS
df = pd.read_csv('1_Youtuber_SingleNrSheet_Comedy.csv', sep = ";", skipinitialspace=True) ######Change Name
YoutuberID = df["Channel_ID"].tolist()
##print(YoutuberID)
with open("9_p_q_m_Fun_ExtendedBass_VIEWS_Comedy_test.csv", "w" ,newline='',encoding='utf-8') as csv_file2: ######Change Name
csv_writer2 = csv.writer(csv_file2, delimiter=';')
csv_writer2.writerow(["Type","p", "q", "m","Functionvalue"])
count = 0
for ID in YoutuberID[0:]: ###Change
try:
path = "C:\\Users\Sebastian Gäumann\OneDrive\Dokumente\FS 2017\Bachelorarbeit\Python"
os.chdir(path)
###ALL INFO
Days = pd.read_csv('3_API_Call_ALL_info_Comedy_v2.csv', sep = ";", skipinitialspace=True)
views_path = "C:\\Users\Sebastian Gäumann\OneDrive\Dokumente\FS 2017\Bachelorarbeit\Python\Daily_Views_Comedy" ######Change Name
os.chdir(views_path)
SVR = pd.read_csv("4_COMEDY_DailyViews_Clean_" + str(count) + "_" + ID + ".csv", sep = ";", parse_dates=True, dayfirst=True) ######Change Name
## print(SVR[SVR.columns[0]])
SVR = SVR[SVR[SVR.columns[0]]< "2018-05-01"] ####CHANGE DATE FOR DIF CAT
## print(SVR)
#####SV Input
SV = np.array(SVR["Daily Views"])
## print(SV)
Days = Days[Days["channelId"] == ID]
## print(Days)
Days["publishedAt"] = pd.to_datetime(Days.publishedAt)
Days = Days[Days["publishedAt"] > "2015-01-08"] ##"2015-01-10"
## print(Days)
##### Timedelta #####
start_date = pd.to_datetime("2015-06-08")
##print(start_date)
video_upload_day =[]
for video_date in Days["publishedAt"]:
TimeDelta = video_date - start_date
video_upload_day.append(TimeDelta.days)
##print(video_upload_day)
##print(videoT)
nvideos = len(video_upload_day)
ndays = len(SV)
videoT = np.array(video_upload_day)
## print(videoT,nvideos,ndays)
def objective(x):
    """Return the sum of squared errors between ``SV`` and the modelled views.

    Parameters
    ----------
    x : sequence of three floats
        The parameters ``(p, q, m)``.

    Uses ``ndays``, ``nvideos``, ``videoT`` (NumPy array of upload-day
    indices, one per video) and ``SV`` (observed daily views) from the
    enclosing scope.

    For each day ``t`` and each video ``v`` with ``videoT[v] <= t`` the
    estimate is ``p*m + (q-p)*C - (q/m)*C**2``, where ``C`` is the sum of
    that video's estimates on all earlier days; videos not yet uploaded
    contribute 0.  The per-day totals over all videos are compared with
    ``SV``.
    """
    p, q, m = x[0], x[1], x[2]

    # Running per-video total of the estimates of all previous days.  Keeping
    # it incrementally avoids re-summing estimateV[0:t, v] for every (t, v),
    # which made each evaluation quadratic in the number of days.
    cumulative = np.zeros(nvideos)
    estimateSV = np.zeros(ndays)
    for t in range(ndays):
        uploaded = videoT <= t
        today = np.where(
            uploaded,
            p*m + (q - p)*cumulative - (q/m)*cumulative**2,
            0.0,
        )
        estimateSV[t] = today.sum()
        cumulative += today
    return np.sum((SV - estimateSV)**2)
This is the minimization part. I made one for the normal minimization and one for basinhopping and seperated it with ##.
###### MINIMIZATION #######
mguess = round(sum(SV)/(nvideos*2),0)
print(sum(SV),mguess)
x0 = np.array([0.001, 0.01, mguess]) ####Make it less volatile to first guess? Make bigger steps for m?
b1 = (0.00001,0.5)
b2 = (10**4,10**7)
bnds = (b1,b1,b2)
## minimizer_kwargs = dict(method="L-BFGS-B",bounds=bnds)
## res = basinhopping(objective, x0,niter=20, minimizer_kwargs=minimizer_kwargs)
res = minimize(objective, x0,bounds = bnds)
print(res)
csv_writer2.writerow(["COMEDY",res.x[0], res.x[1],res.x[2],res.fun]) ###CHANNGE CAT
print("CURRERNT YOUTUBER IS:",count)
count += 1
except:
print("PROBLEM",count)
count += 1
## print(res,res.x[0],res.x[1],res.x[2],res.fun)

Subtraction between 'dict_values' and 'float'

I am getting the error "TypeError: unsupported operand type(s) for -: 'dict_values' and 'float'" from line 173 in the sample code. I have copied from a book that does not yet seem to be updated to Python 3 and other forum topics don't seem to cover this problem.
It is trying to calculate the error in an optimsation for the difference in market values and model values, but the data storage type is different across the two.
Thanks
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import calendar
# frame
from get_year_deltas import get_year_deltas
from constant_short_rate import constant_short_rate
from market_environment import market_environment
from plot_option_stats import plot_option_stats
# simulation
from sn_random_numbers import sn_random_numbers
from simulation_class import simulation_class
from geometric_brownian_motion import geometric_brownian_motion
from jump_diffusion import jump_diffusion
from square_root_diffusion import square_root_diffusion
# valuation
from valuation_class import valuation_class
from valuation_mcs_european import valuation_mcs_european
from valuation_mcs_american import valuation_mcs_american
from derivatives_position import derivatives_position
from derivatives_portfolio import derivatives_portfolio
#import os
#path = os.getcwd()
url = 'http://www.stoxx.com/download/historical_values/h_vstoxx.txt'
vstoxx_index = pd.read_csv(url, index_col=0, header=2,parse_dates=True, dayfirst=True)
vstoxx_index = vstoxx_index[('2013/12/31' < vstoxx_index.index) & (vstoxx_index.index < '2014/4/1')]
vstoxx_futures = pd.read_excel('./vstoxx_march_2014.xlsx', 'vstoxx_futures')
del vstoxx_futures['A_SETTLEMENT_PRICE_SCALED']
del vstoxx_futures['A_CALL_PUT_FLAG']
del vstoxx_futures['A_EXERCISE_PRICE']
del vstoxx_futures['A_PRODUCT_ID']
columns = ['DATE', 'EXP_YEAR', 'EXP_MONTH', 'PRICE']
vstoxx_futures.columns = columns
def third_friday(date):
    """Return midnight on the third Friday of the month containing *date*.

    Only ``date.year`` and ``date.month`` are read; the result is a
    ``datetime.datetime`` for that year and month.
    """
    first_weekday = calendar.weekday(date.year, date.month, 1)
    # Days from the 1st of the month to its first Friday (0 if the 1st is one).
    days_to_first_friday = (calendar.FRIDAY - first_weekday) % 7
    # The third Friday falls exactly two weeks after the first.
    day_of_month = 1 + days_to_first_friday + 14
    return dt.datetime(date.year, date.month, day_of_month)
set(vstoxx_futures['EXP_MONTH'])
third_fridays = {}
for month in set(vstoxx_futures['EXP_MONTH']):
third_fridays[month] = third_friday(dt.datetime(2014, month, 1))
#third_fridays
tf = lambda x: third_fridays[x]
vstoxx_futures['MATURITY'] = vstoxx_futures['EXP_MONTH'].apply(tf)
#vstoxx_futures.tail()
vstoxx_options = pd.read_excel('./vstoxx_march_2014.xlsx', 'vstoxx_options')
#vstoxx_options.info()
del vstoxx_options['A_SETTLEMENT_PRICE_SCALED']
del vstoxx_options['A_PRODUCT_ID']
columns = ['DATE', 'EXP_YEAR', 'EXP_MONTH', 'TYPE', 'STRIKE', 'PRICE']
vstoxx_options.columns = columns
vstoxx_options['MATURITY'] = vstoxx_options['EXP_MONTH'].apply(tf)
#vstoxx_options.head()
vstoxx_options['STRIKE'] = vstoxx_options['STRIKE'] / 100.0
save = False
if save is True:
import warnings
warnings.simplefilter('ignore')
h5 = pd.HDFStore('./vstoxx_march_2014.h5', complevel=9, complib='blosc')
h5['vstoxx_index'] = vstoxx_index
h5['vstoxx_futures'] = vstoxx_futures
h5['vstoxx_options'] = vstoxx_options
h5.close()
pricing_date = dt.datetime(2014, 3, 31)
# last trading day in March 2014
maturity = third_fridays[10]
# October maturity
initial_value = vstoxx_index['V2TX'][pricing_date]
# VSTOXX on pricing_date
forward = vstoxx_futures[(vstoxx_futures.DATE == pricing_date) & (vstoxx_futures.MATURITY == maturity)]['PRICE'].values[0]
tol = 0.20
option_selection = vstoxx_options[(vstoxx_options.DATE == pricing_date)
& (vstoxx_options.MATURITY == maturity)
& (vstoxx_options.TYPE == 'C')
& (vstoxx_options.STRIKE > (1 - tol) * forward)
& (vstoxx_options.STRIKE < (1 + tol) * forward)]
me_vstoxx = market_environment('me_vstoxx', pricing_date)
me_vstoxx.add_constant('initial_value', initial_value)
me_vstoxx.add_constant('final_date', maturity)
me_vstoxx.add_constant('currency', 'EUR')
me_vstoxx.add_constant('frequency', 'B')
me_vstoxx.add_constant('paths', 10000)
csr = constant_short_rate('csr', 0.01)
# somewhat arbitrarily chosen here
me_vstoxx.add_curve('discount_curve', csr)
# parameters to be calibrated later
me_vstoxx.add_constant('kappa', 1.0)
me_vstoxx.add_constant('theta', 1.2 * initial_value)
vol_est = vstoxx_index['V2TX'].std() * np.sqrt(len(vstoxx_index['V2TX']) / 252.0)
me_vstoxx.add_constant('volatility', vol_est)
# vol_est
vstoxx_model = square_root_diffusion('vstoxx_model', me_vstoxx)
me_vstoxx.add_constant('strike', forward)
me_vstoxx.add_constant('maturity', maturity)
payoff_func = 'np.maximum(maturity_value - strike, 0)'
vstoxx_eur_call = valuation_mcs_european('vstoxx_eur_call',vstoxx_model, me_vstoxx, payoff_func)
# Build one European call valuation object per selected option, keyed by the
# option's index label in option_selection.
option_models = {}
for option in option_selection.index:
    # Series.ix has been removed from pandas; .loc is the label-based
    # replacement (option is an index label here).
    strike = option_selection['STRIKE'].loc[option]
    me_vstoxx.add_constant('strike', strike)
    option_models[option] = valuation_mcs_european( 'eur_call_%d' % strike, vstoxx_model, me_vstoxx, payoff_func )
def calculate_model_values(p0):
    '''
    Re-parameterise the shared model and value every option in option_models.

    Parameters
    p0 : tuple/list of exactly three items: kappa, theta, volatility

    Returns
    model_values : dict mapping each key of option_models to the result of
        that valuation object's present_value(fixed_seed=True)
    '''
    kappa, theta, volatility = p0
    # vstoxx_model is module-level state; it is updated here before the
    # valuation objects in option_models are evaluated.
    vstoxx_model.update(kappa=kappa, theta=theta, volatility=volatility)
    return {
        option: valuation.present_value(fixed_seed=True)
        for option, valuation in option_models.items()
    }
# calculate_model_values((0.5, 27.5, vol_est))
i = 0
def mean_squared_error(p0):
    '''
    Returns the mean-squared error given the model and market values.

    Parameters
    p0 : tuple/list, tuple of kappa, theta, volatility

    Returns
    MSE : float, mean-squared error

    Side effects
    Increments the module-level call counter i and prints a progress line on
    every 20th call (plus a header line on the very first call).
    '''
    global i
    # In Python 3 dict.values() returns a view object; np.array() on the view
    # wraps the view itself instead of building a numeric array, which caused
    # "unsupported operand type(s) for -: 'dict_values' and 'float'".
    # Materialise it as a list first.  Dicts preserve insertion order, so the
    # values follow the order in which option_models was filled.
    model_values = np.array(list(calculate_model_values(p0).values()))
    market_values = option_selection['PRICE'].values
    option_diffs = model_values - market_values
    MSE = np.sum(option_diffs ** 2) / len(option_diffs)
    # vectorized MSE calculation
    if i % 20 == 0:
        if i == 0:
            print( '%4s' % i, '%6s' % "kappa", '%6s' % "theta", '%6s —>' % "vola", '%6s' % "MSE")
        print( '%4d' % i, '%6.3f' % p0[0], '%6.3f' % p0[1], '%6.3f —>' % p0[2], '%6.3f' % MSE )
    i += 1
    return MSE
mean_squared_error((0.5, 27.5, vol_est))

Resources