PyMySQL - getting column data from the dictionary returned by fetchall() - python-3.x

Using PyMySQL with Python 3.6.3, I open a DictCursor and then call fetchall(). I get all the data, and .description says:
(('recdate', 12, None, 19, 19, 0, False), ('outdoorhumidity', 246, None, 9, 9, 3, True), ('outdoortemperature', 246, None, 9, 9, 3, True)).
Printing the rows I get, for example:
2005-12-31 23:12:00 89.000 -6.667
2005-12-31 23:13:00 89.000 -6.667
2005-12-31 23:15:00 89.000 -6.650
2005-12-31 23:16:00 89.000 -6.650
2005-12-31 23:17:00 89.000 -6.640
Note the missing minute, ...23:14:00. I am doing this as part of a bigger missing-data imputation project, so this is sample data around the missing values. Via the dictionary I want to get the incomplete time series, as well as for example the 3rd column, in the best way: simple, easily readable code. Do I have to know in each case how many rows there are?
import pymysql

dbServerName = "127.0.0.1"
dbUser = "root"
dbPassword = "mypwd"
dbName = "dbname"
charSet = "utf8"
cursorType = pymysql.cursors.DictCursor

connectionObject = pymysql.connect(host=dbServerName, user=dbUser, password=dbPassword,
                                   db=dbName, charset=charSet, cursorclass=cursorType)
try:
    cursorObject = connectionObject.cursor()
    sqlQuery = "SELECT recdate, outdoorhumidity, outdoortemperature FROM mytable WHERE recdate BETWEEN '2005-12-31 23:12:00' AND '2005-12-31 23:17:00';"
    cursorObject.execute(sqlQuery)
    # Fetch all the rows - within the cursor? Can this be done?
    rows = cursorObject.fetchall()
    print(cursorObject.description)
    for row in rows:
        print(row["recdate"], row["outdoorhumidity"], row["outdoortemperature"])
except Exception as e:
    print("Exception occurred: {}".format(e))
finally:
    cursorObject.close()
    connectionObject.close()
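With a DictCursor, fetchall() returns one dict per row, keyed by column name, so a single column can be pulled out with a list comprehension and the row count is simply len(rows). A minimal sketch using the column names from the query above:

# rows is the result of cursorObject.fetchall() with a DictCursor:
# a sequence of dicts, one per row, keyed by column name.
recdates = [row["recdate"] for row in rows]                 # 1st column
temperatures = [row["outdoortemperature"] for row in rows]  # 3rd column

# len(rows) gives the row count if it is needed afterwards.
print(len(rows), recdates[0], temperatures[0])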

What I wanted to achieve turned out to be much simpler to do with pandas, sqlalchemy, datetime and time. I have also added comments about different ways of referencing pandas DataFrames.
Some things are of course a bit specific to my case, but they may be interesting for others, especially for playing around with .iloc and .loc and with type conversions when handling time.
# Python version
# '3.6.4 |Anaconda, Inc.| (default, Jan 16 2018, 10:22:32) [MSC v.1900 64 bit (AMD64)]'
# Running Spyder IDE version 3.2.6
# PANDAS VERSION '0.22.0'
import pandas as pd
import time
from datetime import datetime
from sqlalchemy import create_engine
def GetSqlData(begdatetime, enddatetime):
    temp_df = pd.read_sql_query("""SELECT recdate, outdoortemperature
                                   FROM osterasen
                                   WHERE recdate BETWEEN %(c1)s AND %(c2)s""",
                                engine,
                                params={'c1': begdatetime, 'c2': enddatetime})
    temp_df['recdate'] = temp_df['recdate'].astype('datetime64[ns]')
    return temp_df
iso_datetimeformat = "%Y-%m-%d %H:%M:%S"
# BEGIN - CREATE MOCK MISSING DATES
# would have to come from the WeatherData.missings table
print("Step 0.0: ")
print(" Get missing data info from weatherdata.missings...")
num_of_missings=21
range_good_m = num_of_missings * 2
print(range_good_m)
range_good_s = range_good_m * 60
print(range_good_s)
a_complete_end_str = '2005-09-24 09:09:00'
b_complete_beg_str = '2005-09-24 09:31:00'
a_complete_end = time.mktime(datetime.strptime(a_complete_end_str,
                                               iso_datetimeformat).timetuple())
print("a_complete_end: ", a_complete_end)
b_complete_beg = time.mktime(datetime.strptime(b_complete_beg_str,
                                               iso_datetimeformat).timetuple())
print("b_complete_beg: ", b_complete_beg)
a_complete_beg = a_complete_end - range_good_s
print("a_complete_beg: ", a_complete_beg)
b_complete_end = b_complete_beg + range_good_s
print("b_complete_end: ", b_complete_end)
print("b_complete_end: type: ", type(b_complete_end))
m_missing_beg = a_complete_end + 60
m_missing_end = b_complete_beg - 60
m_missing_beg_str = str(datetime.fromtimestamp(m_missing_beg))
m_missing_end_str = str(datetime.fromtimestamp(m_missing_end))
a_complete_beg_str = str(datetime.fromtimestamp(a_complete_beg ))
b_complete_end_str = str(datetime.fromtimestamp(b_complete_end ))
# Print out the ranges
print(a_complete_beg_str)
print(a_complete_end_str)
print(m_missing_beg_str)
print(m_missing_end_str)
print(b_complete_beg_str)
print(b_complete_end_str)
# END - CREATE MOCK MISSING DATES
connection_str = 'mysql+pymysql://root:mypassword@127.0.0.1:3306/weatherdata'
engine = create_engine(connection_str,encoding='utf8')
print("Step 1.0: ")
df_a = GetSqlData( a_complete_beg_str, a_complete_end_str )
print(type(df_a)," ", type(df_a.head))
print(df_a)
x1 = df_a.loc[5, 'outdoortemperature']
y1 = df_a.loc[5, 'recdate']
print("x1 = ", x1, type(x1))
print("y1 = ", y1, type(y1))
print("Step 2.0: ")
df_b = GetSqlData( b_complete_beg_str, b_complete_end_str )
print(type(df_b)," ", type(df_b.head))
print(df_b)
x2 = df_b.loc[5, 'outdoortemperature']
y2 = df_b.loc[5, 'recdate']
print("x2 = ", x2, type(x2))
print("y2 = ", y2, type(y2))

Related

Trying to plot a rolling corr line chart but Matplotlib keeps saying to bring in only valid columns?

I'm trying to create a rolling corr plot using matplotlib, but I get the error "select only valid columns before calling the operation. Dropped columns were Index(['time'], dtype='object')".
I have dropped that field from my data frame, but the error keeps appearing.
Is it something to do with my .iloc argument?
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import seaborn as sns
import scipy.stats as stats
import json
from datetime import timezone
from datetime import datetime
from pycoingecko import CoinGeckoAPI
pd.options.display.width = 0
def datetime_to_unix(year, month, day):
    '''datetime_to_unix(2021, 6, 1) => 1622505600.0'''
    dt = datetime(year, month, day)
    timestamp = (dt - datetime(1970, 1, 1)).total_seconds()
    return timestamp

def unix_to_datetime(unix_time):
    '''unix_to_datetime(1622505700) => '2021-06-01 12:01am' '''
    ts = int(unix_time/1000 if len(str(unix_time)) > 10 else unix_time)  # /1000 handles milliseconds
    return datetime.utcfromtimestamp(ts).strftime('%Y-%m-%d %l:%M%p').lower()
# Initialize the client
cg = CoinGeckoAPI()
# Retrieve looksrare data in USD
result = cg.get_coin_market_chart_range_by_id(
    id='looksrare',
    vs_currency='usd',
    from_timestamp=datetime_to_unix(2022, 1, 11),
    to_timestamp=datetime_to_unix(2022, 4, 20)
)
time = [ unix_to_datetime(i[0]) for i in result['prices'] ]
p_array = np.array(result['prices'])
price = p_array[:,1]
v_array = np.array(result['total_volumes'])
volume = v_array[:,1]
df = pd.DataFrame({'time':time, 'price':price,})
df.head(100)
# Retrieve ETH data in USD
result = cg.get_coin_market_chart_range_by_id(
    id='ethereum',
    vs_currency='usd',
    from_timestamp=datetime_to_unix(2022, 1, 11),
    to_timestamp=datetime_to_unix(2022, 4, 20)
)
time = [ unix_to_datetime(i[0]) for i in result['prices'] ]
p_array = np.array(result['prices'])
price = p_array[:,1]
v_array = np.array(result['total_volumes'])
volume = v_array[:,1]
df2 = pd.DataFrame({'time':time, 'price':price,})
df2.head(100)
df_cd = pd.merge(df, df2, how='inner', on='time')
df_cd = df_cd.drop('time', axis=1)
output = df_cd.corr()
output1 = df_cd['price_x'].corr(df_cd['price_y'])
overall_pearson_r = df_cd.corr().iloc[0,1]
print(df_cd)
print(f"Pandas computed Pearson r: {overall_pearson_r}")
r, p = stats.pearsonr(df_cd.dropna()['price_x'], df_cd.dropna()['price_y'])
print(f"Scipy computed Pearson r: {r} and p-value: {p}")
# compute rolling window synchrony
f,ax=plt.subplots(figsize=(7,3))
df.rolling(window=30,center=True).median().plot(ax=ax)
ax.set(xlabel='Time',ylabel='Pearson r')
ax.set(title=f"Overall Pearson r = {np.round(overall_pearson_r,2)}");
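The error comes from calling .rolling(...).median() on a frame that still contains the non-numeric time column; note that the plot above rolls over df, which still holds 'time', rather than over the merged df_cd. A hedged sketch of computing the rolling correlation over the numeric price columns only (window size kept from the original):

# Rolling Pearson correlation between the two merged price series;
# 'time' was already dropped from df_cd, so only numeric columns remain.
rolling_r = df_cd['price_x'].rolling(window=30, center=True).corr(df_cd['price_y'])

f, ax = plt.subplots(figsize=(7, 3))
rolling_r.plot(ax=ax)
ax.set(xlabel='Time', ylabel='Pearson r',
       title=f"Overall Pearson r = {np.round(overall_pearson_r, 2)}")
plt.show()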

failed to execute script python exe

I'm trying to convert my Python script to an EXE.
The script does a basic analysis of an Excel file and generates a report as a PDF.
In the script I also create a PNG file and then load it into the PDF.
When I try to convert the py file to an EXE, it doesn't work.
The script (works great as a py file):
import pandas as pd
import os
import sys  # needed for sys.path[0] used in main()
from pandasql import sqldf
from datetime import datetime
import numpy as nu
from tkinter import *
import tkinter as tk
from fpdf import FPDF
import matplotlib.pyplot as plt
def start_gui(root):
    myLabel = Label(root, text='Hi! Here you can output the sessions report').grid(row=0, column=0)
    start_button = Button(root, text='Produce Report', padx=30, pady=20, command=main, fg='blue').grid(row=50, column=0)
    root.mainloop()
    pass

def print_full_results(df):
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    print(df)
    pd.reset_option('display.max_rows')
    pd.reset_option('display.max_columns')
    pass

def load_data(path):
    df = pd.read_csv(path)
    df = pd.DataFrame(df)
    return df

def clean_raw_data(raw_data):
    raw_data = raw_data.dropna(how='all')  # Drop the rows where all elements are missing.
    raw_data = raw_data.dropna(axis=1, how='all')  # Drop the columns where all elements are missing.
    raw_data = raw_data.reset_index()  # Reset the indexes after dropping rows
    raw_data = raw_data.drop(columns=['index'])
    raw_data = raw_data.rename(
        columns={'Meeting ID': 'Meeting_ID', 'User Name': 'Admin_User_Name', 'Uzer Eam1l': 'Admin_Email',
                 'Has Zoom Rooms?': 'Has_Zoom_Rooms', 'Creation Time': 'Meeting_Creation_Time',
                 'Start Time': 'Meeting_Start_Time', 'End Time': 'Meeting_End_Time',
                 'Duration (Minutes)': 'Meeting_Duration_min', 'Ncmf (prjgjncl Ncmf)': 'User_Name',
                 'Usfr fncil': 'User_Email', 'Join Time': 'User_Join_Time', 'Leave Time': 'User_Leave_Time',
                 'Duration (Minutes).1': 'User_Duration_min'})
    raw_data = convert_relevant_types(raw_data)
    raw_data = fill_null_emails(raw_data)
    return raw_data
def convert_relevant_types(db):
    pd.options.mode.chained_assignment = None  # default='warn'
    # relevant columns (Meeting_Creation_Time, Meeting_Start_Time, Meeting_End_Time, User_Join_Time, User_Leave_Time): convert string to date
    for i in range(len(db['Meeting_Start_Time'])):
        creation_date = datetime.strptime(db['Meeting_Creation_Time'][i], '%m/%d/%y %H:%M')
        start_date = datetime.strptime(db['Meeting_Start_Time'][i], '%m/%d/%y %H:%M')
        end_date = datetime.strptime(db['Meeting_End_Time'][i], '%m/%d/%y %H:%M')
        user_join_date = datetime.strptime(db['User_Join_Time'][i], '%m/%d/%y %H:%M')
        user_leave_date = datetime.strptime(db['User_Leave_Time'][i], '%m/%d/%y %H:%M')
        db['Meeting_Creation_Time'][i] = creation_date
        db['Meeting_Start_Time'][i] = start_date
        db['Meeting_End_Time'][i] = end_date
        db['User_Join_Time'][i] = user_join_date
        db['User_Leave_Time'][i] = user_leave_date
    # relevant columns (Meeting_Duration_min, User_Duration_min): convert string to int
    for i in range(len(db['Meeting_Duration_min'])):
        db['Meeting_Duration_min'][i] = int(db['Meeting_Duration_min'][i])
        db['User_Duration_min'][i] = int(db['User_Duration_min'][i])
    return db

def fill_null_emails(db):
    for i in range(len(db['User_Email'])):
        if pd.isnull(db['User_Email'][i]):
            db['User_Email'][i] = db['User_Name'][i] + ' Missing Mail'
    return db

def pdff_space_down(pdf):
    pdf.cell(0, 10, '', ln=1, align='L')
    return pdf

def pdff_write(pdf, text, space=5, align='L'):
    pdf.cell(0, space, text, ln=1, align='L')
    return pdf

def pdff_write_table(pdf, data, spacing=1.5):
    col_width = pdf.w / 4.5
    row_height = pdf.font_size
    for row in data:
        for item in row:
            pdf.cell(col_width, row_height * spacing,
                     txt=item, border=1)
        pdf.ln(row_height * spacing)
    return pdf
def create_pdf(today, min_date, max_date, sessions_num, total_cost, costs_table, num_of_users, avg_users_come):
    pdf = FPDF(orientation='p', unit='mm', format='A4')
    pdf.add_page()
    pdf.set_font('Arial', size=10)
    pdf.cell(0, 10, 'Date:{}'.format(today), ln=1, align='L')
    pdf.set_font('times', 'B', size=24)
    pdf.cell(0, 8, 'Home Assignment - Ziv Mor', ln=1, align='C')
    pdf.set_font('times', size=18)
    pdf.cell(0, 10, 'Zoom-Sessions Report (Automated by Python)', ln=1, align='C')
    pdf.cell(0, 10, '({}'.format(min_date) + ' To {})'.format(max_date), ln=1, align='C')
    pdf.set_font('times', 'U', size=15)
    pdf = pdff_write(pdf, 'Sessions Analysis', space=20)
    pdf.set_font('times', size=13)
    pdf = pdff_write(pdf, 'Total Number of Sessions: {} (Team meetings are not included)'.format(sessions_num), space=15)
    pdf.set_font('times', 'UB', size=13)
    pdf.cell(0, 10, 'Number Of Sessions By Dates', ln=1.5, align='C')
    pdf.image('sessions_by_day_plot.png', x=55, y=None, w=100, h=70, type='', link='')
    pdf = pdff_space_down(pdf)
    pdf.set_font('times', size=13)
    pdf = pdff_write(pdf, 'Sessions Participants Segmentation:', space=10)
    pdf = pdff_write_table(pdf, costs_table)
    pdf.set_font('times', 'UB', size=13)
    pdf.cell(0, 20, 'Sessions Total Cost: {} NIS'.format(total_cost), ln=1, align='C')
    pdf.set_font('times', 'U', size=15)
    pdf = pdff_write(pdf, 'Users Analysis', space=17)
    pdf.set_font('times', size=13)
    pdf = pdff_write(pdf, 'Total Number of Users Engaged: {}'.format(num_of_users), space=10)
    pdf = pdff_write(pdf, 'The Average Frequency of Arrival of Each User: {} Sessions'.format(avg_users_come),
                     space=10)
    pdf.output('Zoom Report_{}.pdf'.format(str(datetime.today()).replace(':', '.', 3)))
def main():
    path = os.path.join(sys.path[0], 'participant sessions data.csv')
    raw_data = load_data(path)
    zoom_db = clean_raw_data(raw_data)
    '''------------------------------SQL Queries---------------------------------'''
    # todo: assume פגישת צוות (team meetings) are not counted
    question_1_query = 'Select date(Meeting_Start_Time)date, count(distinct Meeting_Start_Time)Num_Of_Sessions From zoom_db where Topic <>"פגישת צוות" Group by date(Meeting_Start_Time)'
    answer_1_table = sqldf(question_1_query)
    num_of_sessions = nu.sum(list(answer_1_table['Num_Of_Sessions']))
    # count for each meeting the number of participants
    question_2_query = 'Select Topic, Meeting_Start_Time, count(Distinct User_Email)num_of_Users From zoom_db Group by Meeting_Start_Time, Meeting_ID'
    answer_2_table = sqldf(question_2_query)
    # count for each user the number of times the user arrived to a session
    # todo - mention I didn't include rows where the user got in for less than 1 minute + there are a lot of users without mail so I assume for
    question_3_query = 'select User_Email, count(*)num_of_arrivals from(Select User_Email, Meeting_Start_Time, Meeting_ID From zoom_db Where User_Duration_min <> 0 Group by User_Email, Meeting_ID , Meeting_Start_Time) group by User_Email Order by num_of_arrivals desc'
    answer_3_table = sqldf(question_3_query)
    # Calculate the avg number of arrivals per user (using the result of the 3rd question query)  # todo - assuming the host is not included
    participants_arrivals_list = list(answer_3_table['num_of_arrivals'])[1:]
    avg_users_come = round((nu.average(participants_arrivals_list)), 2)
    '''---------------------More Calculates for the report------------------------'''
    # Calculate the intervals of dates
    min_date_qu = sqldf('select min(date(Meeting_Start_Time)) from zoom_db')
    min_date_qu = list(min_date_qu['min(date(Meeting_Start_Time))'])[0]
    max_date_qu = sqldf('select max(date(Meeting_Start_Time)) from zoom_db')
    max_date_qu = list(max_date_qu['max(date(Meeting_Start_Time))'])[0]
    num_meetings0_5 = sqldf('select count(*) from answer_2_table where num_of_users<=5 and Topic <>"פגישת צוות"')
    num_meetings0_5 = list(num_meetings0_5['count(*)'])[0]
    num_meetings5_10 = sqldf(
        'select count(*) from answer_2_table where num_of_users>5 and num_of_users<=10 and Topic <>"פגישת צוות"')
    num_meetings5_10 = list(num_meetings5_10['count(*)'])[0]
    num_meetings10_15 = sqldf(
        'select count(*) from answer_2_table where num_of_users>10 and num_of_users<=15 and Topic <>"פגישת צוות"')
    num_meetings10_15 = list(num_meetings10_15['count(*)'])[0]
    num_meetings_15_plus = sqldf('select count(*) from answer_2_table where num_of_users>15 and Topic <>"פגישת צוות"')
    num_meetings_15_plus = list(num_meetings_15_plus['count(*)'])[0]
    total_cost = 50 * num_meetings0_5 + 100 * num_meetings5_10 + 150 * num_meetings10_15 + 200 * num_meetings_15_plus
    costs_table = [['Session type', 'Number of sessions', 'Cost'],
                   ['0-5 participants', str(num_meetings0_5), str(50 * num_meetings0_5)],
                   ['5-10 participants', str(num_meetings5_10), str(100 * num_meetings5_10)],
                   ['10-15 participants', str(num_meetings10_15), str(150 * num_meetings10_15)],
                   ['15+ participants', str(num_meetings_15_plus), str(200 * num_meetings_15_plus)]]
    sessions_by_day_plot = answer_1_table.plot.bar(x='date', y='Num_Of_Sessions', rot=80)
    plt.savefig('sessions_by_day_plot.png')
    num_of_users = sqldf('select count(*) From answer_3_table')
    num_of_users = list(num_of_users['count(*)'])[0]
    today = datetime.today().strftime("%b-%d-%Y")
    '''----------------------------------Out-Put Results------------------------'''
    create_pdf(today=today, max_date=max_date_qu, min_date=min_date_qu, sessions_num=num_of_sessions,
               total_cost=total_cost, costs_table=costs_table, num_of_users=num_of_users, avg_users_come=avg_users_come)
    writer = pd.ExcelWriter('Zoom Report_{}.xlsx'.format(str(datetime.today()).replace(':', '.', 3)))
    (answer_2_table).to_excel(writer, sheet_name='Sessions Number of Participants')
    (answer_3_table).to_excel(writer, sheet_name='Participants show-up')
    writer.save()
    '''---------------------Delete not relevant files------------------------'''
    plot1_path = os.path.join(sys.path[0], 'sessions_by_day_plot.png')
    os.remove(plot1_path)
    exit()
if __name__ == '__main__':
    root = Tk()
    start_gui(root)
    # main()
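One detail worth checking: main() builds its file path from sys.path[0] (sys had to be added to the imports above), and in a one-file frozen build that directory is usually not where the bundled CSV ends up. A hedged sketch of a more robust lookup; the helper name data_path is hypothetical, and sys.frozen is the flag set by common freezers such as PyInstaller:

import os
import sys

def data_path(filename):
    # When frozen, sys.frozen is set and the data file should be looked up
    # next to the executable instead of next to the .py source file.
    if getattr(sys, 'frozen', False):
        base = os.path.dirname(sys.executable)
    else:
        base = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(base, filename)

# e.g. inside main(): path = data_path('participant sessions data.csv')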

Series format pandas

import pandas as pd
from datetime import datetime
import os
# get username
user = os.getlogin()
def file_process():
    data = pd.read_excel('C:\\Users\\' + user + '\\My Documents\\XINVST.xls')
    # Change the date and time formatting
    data["INVDAT"] = data["INVDAT"].apply(lambda x: datetime.combine(x, datetime.min.time()))
    data["INVDAT"] = data["INVDAT"].dt.strftime("%m-%d-%Y")
    print(data)
    # output to new file
    # new_data = data
    # new_data.to_excel('C:\\Users\\' + user + '\\Desktop\\XINVST.xls', index=None)

if __name__ == '__main__':
    file_process()
I'm trying to format the INVDAT column to a correct date format like 11/25/19. I've tried multiple solutions but keep running into errors like this one: TypeError: combine() argument 1 must be datetime.date, not int. I then tried to convert the integer to a date type, but that errors as well.
Or you can simply use df["INVDAT"] = pd.to_datetime(df["INVDAT"], format="%m/%d/%y"); in that case you don't need the datetime package. For further information you should look at the docs.
data['INVDAT'] = data['INVDAT'].astype('str')
data["INVDAT"] = pd.to_datetime(data["INVDAT"])
data["INVDAT"] = data["INVDAT"].dt.strftime("%m/%d/%Y")
This solution works, but if the date representation has a single-digit month, like 12519 (expected output 1/25/19), it fails. I tried using a conditional to add a 0 to the front if len() < 6, but it gives me an error that the dtype is int64.
import pandas as pd
import os

# get username
user = os.getlogin()

def file_process():
    data = pd.read_excel('C:\\Users\\' + user + '\\My Documents\\XINVST.xls')
    # Change the date and time formatting
    data['INVDAT'] = data['INVDAT'].astype('str')
    length = len(data['INVDAT'])
    data['INVDAT'].pop(length - 1)
    for i in data['INVDAT'].str.len():
        if i <= 5:
            data['INVDAT'] = data['INVDAT'].apply(lambda x: '{0:0>6}'.format(x))
            length = len(data['INVDAT'])
            data['INVDAT'].pop(length - 1)
            data["INVDAT"] = pd.to_datetime(data["INVDAT"])
            data["INVDAT"] = data["INVDAT"].dt.strftime("%m/%d/%Y")
        else:
            data["INVDAT"] = pd.to_datetime(data["INVDAT"])
            data["INVDAT"] = data["INVDAT"].dt.strftime("%m/%d/%Y")
    # output to new file
    new_data = data
    new_data.to_excel('C:\\Users\\' + user + '\\Desktop\\XINVST.xls', index=None)

if __name__ == '__main__':
    file_process()
This is the solution; it's sloppy, but it works.
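A tidier variant of the same idea (an assumption, not taken from the post) is to zero-pad every value to six digits inside file_process and parse with an explicit format, which also handles single-digit months like 12519 without the loop:

# Hypothetical alternative: pad to MMDDYY before parsing,
# so 12519 becomes '012519' and parses as 01/25/19.
data['INVDAT'] = data['INVDAT'].astype(str).str.zfill(6)
data['INVDAT'] = pd.to_datetime(data['INVDAT'], format='%m%d%y').dt.strftime('%m/%d/%Y')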

Estimating parameters using minimization in Python and speed up this process

I am trying to find parameter estimates using minimization. The code I wrote works, but there are two problems:
It finds only a local minimum. I tried to solve this by using basinhopping.
It takes very long until I get a result, and since I have to do this minimization around 1000 times this becomes a big issue.
So my questions are:
Do you know how I could optimize my code so that it runs faster for the minimization?
Is there a way I can change the basinhopping part so that it runs faster? E.g. set niter lower, or a different method I am not aware of. I tried running it like this, and after 10 hours I didn't get a result for even one of the 1000 individuals with basinhopping.
Is there another way to find a global minimum?
Feel free to ask further questions.
My code:
import numpy as np
from scipy.optimize import minimize
from scipy.optimize import basinhopping
from scipy.integrate import odeint
import pickle
import os
import pandas as pd
import datetime
import numpy.random as npr
import csv
path = "C:\\Users\Sebastian Gäumann\OneDrive\Dokumente\FS 2017\Bachelorarbeit\Python"
os.chdir(path)
###IDS
df = pd.read_csv('1_Youtuber_SingleNrSheet_Comedy.csv', sep = ";", skipinitialspace=True) ######Change Name
YoutuberID = df["Channel_ID"].tolist()
##print(YoutuberID)
with open("9_p_q_m_Fun_ExtendedBass_VIEWS_Comedy_test.csv", "w" ,newline='',encoding='utf-8') as csv_file2: ######Change Name
csv_writer2 = csv.writer(csv_file2, delimiter=';')
csv_writer2.writerow(["Type","p", "q", "m","Functionvalue"])
    count = 0
    for ID in YoutuberID[0:]:  ### Change
        try:
            path = r"C:\Users\Sebastian Gäumann\OneDrive\Dokumente\FS 2017\Bachelorarbeit\Python"
            os.chdir(path)
            ### ALL INFO
            Days = pd.read_csv('3_API_Call_ALL_info_Comedy_v2.csv', sep=";", skipinitialspace=True)
            views_path = r"C:\Users\Sebastian Gäumann\OneDrive\Dokumente\FS 2017\Bachelorarbeit\Python\Daily_Views_Comedy"  ###### Change Name
            os.chdir(views_path)
            SVR = pd.read_csv("4_COMEDY_DailyViews_Clean_" + str(count) + "_" + ID + ".csv", sep=";", parse_dates=True, dayfirst=True)  ###### Change Name
            ## print(SVR[SVR.columns[0]])
            SVR = SVR[SVR[SVR.columns[0]] < "2018-05-01"]  #### CHANGE DATE FOR DIF CAT
            ## print(SVR)
            ##### SV Input
            SV = np.array(SVR["Daily Views"])
            ## print(SV)
            Days = Days[Days["channelId"] == ID]
            ## print(Days)
            Days["publishedAt"] = pd.to_datetime(Days.publishedAt)
            Days = Days[Days["publishedAt"] > "2015-01-08"]  ## "2015-01-10"
            ## print(Days)
            ##### Timedelta #####
            start_date = pd.to_datetime("2015-06-08")
            ## print(start_date)
            video_upload_day = []
            for video_date in Days["publishedAt"]:
                TimeDelta = video_date - start_date
                video_upload_day.append(TimeDelta.days)
            ## print(video_upload_day)
            ## print(videoT)
            nvideos = len(video_upload_day)
            ndays = len(SV)
            videoT = np.array(video_upload_day)
            ## print(videoT, nvideos, ndays)

            def objective(x):
                p = x[0]
                q = x[1]
                m = x[2]
                estimateV = np.zeros((ndays, nvideos))
                for t in range(ndays):
                    for v in range(nvideos):
                        if videoT[v] <= t:
                            estimateV[t, v] = p*m + (q-p) * np.sum(estimateV[0:t, v], axis=0) - (q/m) * (np.sum(estimateV[0:t, v], axis=0)**2)
                estimateSV = np.sum(estimateV, axis=1)
                return np.sum((SV - estimateSV)**2)
This is the minimization part. I made one version for the normal minimization and one for basinhopping and separated them with ##.
            ###### MINIMIZATION #######
            mguess = round(sum(SV)/(nvideos*2), 0)
            print(sum(SV), mguess)
            x0 = np.array([0.001, 0.01, mguess])  #### Make it less volatile to first guess? Make bigger steps for m?
            b1 = (0.00001, 0.5)
            b2 = (10**4, 10**7)
            bnds = (b1, b1, b2)
            ## minimizer_kwargs = dict(method="L-BFGS-B", bounds=bnds)
            ## res = basinhopping(objective, x0, niter=20, minimizer_kwargs=minimizer_kwargs)
            res = minimize(objective, x0, bounds=bnds)
            print(res)
            csv_writer2.writerow(["COMEDY", res.x[0], res.x[1], res.x[2], res.fun])  ### CHANGE CAT
            print("CURRENT YOUTUBER IS:", count)
            count += 1
        except:
            print("PROBLEM", count)
            count += 1
            ## print(res, res.x[0], res.x[1], res.x[2], res.fun)
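On the "another way to find a global minimum" question, SciPy also ships a population-based global optimizer, differential_evolution, which works directly from the bounds and needs no starting point. A minimal sketch reusing the names from the code above; maxiter and popsize are placeholder values to trade accuracy against runtime:

from scipy.optimize import differential_evolution

# Global search within the same bounds used for minimize() above.
res_global = differential_evolution(objective, bounds=[b1, b1, b2],
                                    maxiter=50, popsize=15, seed=1)
print(res_global.x, res_global.fun)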

Subtraction between 'dict_values' and 'float'

I am getting the error "TypeError: unsupported operand type(s) for -: 'dict_values' and 'float'" from line 173 in the sample code. I copied it from a book that does not yet seem to be updated to Python 3, and other forum topics don't seem to cover this problem.
It is trying to calculate the error in an optimisation as the difference between market values and model values, but the data storage type differs between the two.
Thanks
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import calendar
# frame
from get_year_deltas import get_year_deltas
from constant_short_rate import constant_short_rate
from market_environment import market_environment
from plot_option_stats import plot_option_stats
# simulation
from sn_random_numbers import sn_random_numbers
from simulation_class import simulation_class
from geometric_brownian_motion import geometric_brownian_motion
from jump_diffusion import jump_diffusion
from square_root_diffusion import square_root_diffusion
# valuation
from valuation_class import valuation_class
from valuation_mcs_european import valuation_mcs_european
from valuation_mcs_american import valuation_mcs_american
from derivatives_position import derivatives_position
from derivatives_portfolio import derivatives_portfolio
#import os
#path = os.getcwd()
url = 'http://www.stoxx.com/download/historical_values/h_vstoxx.txt'
vstoxx_index = pd.read_csv(url, index_col=0, header=2,parse_dates=True, dayfirst=True)
vstoxx_index = vstoxx_index[('2013/12/31' < vstoxx_index.index) & (vstoxx_index.index < '2014/4/1')]
vstoxx_futures = pd.read_excel('./vstoxx_march_2014.xlsx', 'vstoxx_futures')
del vstoxx_futures['A_SETTLEMENT_PRICE_SCALED']
del vstoxx_futures['A_CALL_PUT_FLAG']
del vstoxx_futures['A_EXERCISE_PRICE']
del vstoxx_futures['A_PRODUCT_ID']
columns = ['DATE', 'EXP_YEAR', 'EXP_MONTH', 'PRICE']
vstoxx_futures.columns = columns
def third_friday(date):
    day = 21 - (calendar.weekday(date.year, date.month, 1) + 2) % 7
    return dt.datetime(date.year, date.month, day)

set(vstoxx_futures['EXP_MONTH'])
third_fridays = {}
for month in set(vstoxx_futures['EXP_MONTH']):
    third_fridays[month] = third_friday(dt.datetime(2014, month, 1))
#third_fridays
tf = lambda x: third_fridays[x]
vstoxx_futures['MATURITY'] = vstoxx_futures['EXP_MONTH'].apply(tf)
#vstoxx_futures.tail()
vstoxx_options = pd.read_excel('./vstoxx_march_2014.xlsx', 'vstoxx_options')
#vstoxx_options.info()
del vstoxx_options['A_SETTLEMENT_PRICE_SCALED']
del vstoxx_options['A_PRODUCT_ID']
columns = ['DATE', 'EXP_YEAR', 'EXP_MONTH', 'TYPE', 'STRIKE', 'PRICE']
vstoxx_options.columns = columns
vstoxx_options['MATURITY'] = vstoxx_options['EXP_MONTH'].apply(tf)
#vstoxx_options.head()
vstoxx_options['STRIKE'] = vstoxx_options['STRIKE'] / 100.0
save = False
if save is True:
    import warnings
    warnings.simplefilter('ignore')
    h5 = pd.HDFStore('./vstoxx_march_2014.h5', complevel=9, complib='blosc')
    h5['vstoxx_index'] = vstoxx_index
    h5['vstoxx_futures'] = vstoxx_futures
    h5['vstoxx_options'] = vstoxx_options
    h5.close()
pricing_date = dt.datetime(2014, 3, 31)
# last trading day in March 2014
maturity = third_fridays[10]
# October maturity
initial_value = vstoxx_index['V2TX'][pricing_date]
# VSTOXX on pricing_date
forward = vstoxx_futures[(vstoxx_futures.DATE == pricing_date) & (vstoxx_futures.MATURITY == maturity)]['PRICE'].values[0]
tol = 0.20
option_selection = vstoxx_options[(vstoxx_options.DATE == pricing_date)
                                  & (vstoxx_options.MATURITY == maturity)
                                  & (vstoxx_options.TYPE == 'C')
                                  & (vstoxx_options.STRIKE > (1 - tol) * forward)
                                  & (vstoxx_options.STRIKE < (1 + tol) * forward)]
me_vstoxx = market_environment('me_vstoxx', pricing_date)
me_vstoxx.add_constant('initial_value', initial_value)
me_vstoxx.add_constant('final_date', maturity)
me_vstoxx.add_constant('currency', 'EUR')
me_vstoxx.add_constant('frequency', 'B')
me_vstoxx.add_constant('paths', 10000)
csr = constant_short_rate('csr', 0.01)
# somewhat arbitrarily chosen here
me_vstoxx.add_curve('discount_curve', csr)
# parameters to be calibrated later
me_vstoxx.add_constant('kappa', 1.0)
me_vstoxx.add_constant('theta', 1.2 * initial_value)
vol_est = vstoxx_index['V2TX'].std() * np.sqrt(len(vstoxx_index['V2TX']) / 252.0)
me_vstoxx.add_constant('volatility', vol_est)
# vol_est
vstoxx_model = square_root_diffusion('vstoxx_model', me_vstoxx)
me_vstoxx.add_constant('strike', forward)
me_vstoxx.add_constant('maturity', maturity)
payoff_func = 'np.maximum(maturity_value - strike, 0)'
vstoxx_eur_call = valuation_mcs_european('vstoxx_eur_call',vstoxx_model, me_vstoxx, payoff_func)
option_models = {}
for option in option_selection.index:
    strike = option_selection['STRIKE'].ix[option]
    me_vstoxx.add_constant('strike', strike)
    option_models[option] = valuation_mcs_european('eur_call_%d' % strike, vstoxx_model, me_vstoxx, payoff_func)
def calculate_model_values(p0):
    '''
    Returns all relevant option values.

    Parameters
    p0 : tuple/list, tuple of kappa, theta, volatility

    Returns
    model_values : dict, dictionary with model values
    '''
    kappa, theta, volatility = p0
    vstoxx_model.update(kappa=kappa,
                        theta=theta,
                        volatility=volatility)
    model_values = {}
    for option in option_models:
        model_values[option] = option_models[option].present_value(fixed_seed=True)
    return model_values
# calculate_model_values((0.5, 27.5, vol_est))
i = 0
def mean_squared_error(p0):
    '''
    Returns the mean-squared error given the model and market values.

    Parameters
    p0 : tuple/list, tuple of kappa, theta, volatility

    Returns
    MSE : float, mean-squared error
    '''
    global i
    model_values = np.array(calculate_model_values(p0).values())
    market_values = option_selection['PRICE'].values
    option_diffs = model_values - market_values
    MSE = np.sum(option_diffs ** 2) / len(option_diffs)
    # vectorized MSE calculation
    if i % 20 == 0:
        if i == 0:
            print('%4s' % i, '%6s' % "kappa", '%6s' % "theta", '%6s ->' % "vola", '%6s' % "MSE")
        print('%4d' % i, '%6.3f' % p0[0], '%6.3f' % p0[1], '%6.3f ->' % p0[2], '%6.3f' % MSE)
    i += 1
    return MSE
mean_squared_error((0.5, 27.5, vol_est))
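The failing line wraps the dict view calculate_model_values(p0).values() directly in np.array, which under Python 3 produces an object array holding the view itself, so the element-wise subtraction from the float market values raises exactly this TypeError. A minimal sketch of the Python 3 compatible version of those lines:

# Convert the dict view to a list before building the NumPy array,
# so the subtraction is done element-wise on floats.
model_values = np.array(list(calculate_model_values(p0).values()))
market_values = option_selection['PRICE'].values
option_diffs = model_values - market_values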
