Series format pandas - python-3.x

import pandas as pd
from datetime import datetime
import os
# get username
user = os.getlogin()
def file_process():
    """Load XINVST.xls from the user's documents folder and print it with INVDAT reformatted."""
    data = pd.read_excel('C:\\Users\\' + user + '\\My Documents\\XINVST.xls')
    # Change the date and time formatting
    # NOTE: datetime.combine() requires a datetime.date as its first argument.
    # Per the TypeError quoted in the question, INVDAT holds plain integers,
    # so this is the line that fails.
    data["INVDAT"] = data["INVDAT"].apply(lambda x: datetime.combine(x, datetime.min.time()))
    # Render the datetimes as MM-DD-YYYY strings.
    data["INVDAT"] = data["INVDAT"].dt.strftime("%m-%d-%Y")
    print(data)
    # output to new file
    # new_data = data
    # new_data.to_excel('C:\\Users\\' + user + '\\Desktop\\XINVST.xls', index=None)
if __name__ == '__main__':
file_process()
I'm trying to format the INVDAT column as a proper date, e.g. 11/25/19. I've tried multiple solutions but keep running into errors like this one: TypeError: combine() argument 1 must be datetime.date, not int. I then tried converting the integer to a date type, but that raises an error as well.

Alternatively, you can simply use df["INVDAT"] = pd.to_datetime(df["INVDAT"], format="%m/%d/%y"); in this case you don't need the datetime package. For further information, see the pandas documentation for to_datetime.

data['INVDAT'] = data['INVDAT'].astype('str')
data["INVDAT"] = pd.to_datetime(data["INVDAT"])
data["INVDAT"] = data["INVDAT"].dt.strftime("%m/%d/%Y")
This solution works, but it fails when the month has a single digit, e.g. 12519 (expected output 1/25/19). I tried using a conditional to prepend a 0 when len() < 6, but it raises an error because the dtype is int64.

import pandas as pd
import os
# get username
user = os.getlogin()
def file_process():
    """Reformat the INVDAT column of XINVST.xls as MM/DD/YYYY and save a copy.

    Reads the workbook from the current user's ``My Documents`` folder,
    converts the integer-encoded ``INVDAT`` values (``MMDDYY`` with the leading
    zero dropped, e.g. ``12519`` for 01/25/19) to ``MM/DD/YYYY`` strings and
    writes the result to the user's Desktop.
    """
    data = pd.read_excel('C:\\Users\\' + user + '\\My Documents\\XINVST.xls')

    # Change the date and time formatting.
    # Pad every value to six digits in one vectorized pass. The previous
    # per-row loop reassigned the whole column on each iteration, so only the
    # first row's length decided whether padding happened at all.
    padded = data['INVDAT'].astype('str').str.zfill(6)

    # Parse with an explicit format rather than relying on format inference,
    # which is ambiguous for bare digit strings such as '012519'.
    # NOTE(review): the previous version popped the last element before
    # parsing, presumably to skip a trailing non-date row -- TODO confirm.
    # errors='coerce' turns any unparseable value into NaT instead of raising.
    parsed = pd.to_datetime(padded, format='%m%d%y', errors='coerce')
    data['INVDAT'] = parsed.dt.strftime('%m/%d/%Y')

    # output to new file
    data.to_excel('C:\\Users\\' + user + '\\Desktop\\XINVST.xls', index=False)
if __name__ == '__main__':
file_process()
This is the solution, it's sloppy but works

Related

Automatically Extracting the Datetime Format from a Pandas Series [duplicate]

I am trying to format the column 'Data' to make a pattern with dates.
The formats I have are:
1/30/20 16:00
1/31/2020 23:59
2020-02-02T23:43:02
Here is the code for the dataframe.
import requests
import pandas as pd
import numpy as np
url = "https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_daily_reports"
csv_only = [i.split("=")[1][1:-1] for i in requests.get(url).text.split(" ") if '.csv' in i and 'title' in i]
combo = [pd.read_csv(url.replace("github","raw.githubusercontent").replace("/tree/","/")+"/"+f) for f in csv_only]
one_df = pd.concat(combo,ignore_index=True)
one_df["País"] = one_df["Country/Region"].fillna(one_df["Country_Region"])
one_df["Data"] = one_df["Last Update"].fillna(one_df["Last_Update"])
I tried adding the code below, but it doesn't produce the result I wanted:
pd.to_datetime(one_df['Data'])
one_df.style.format({"Data": lambda t: t.strftime("%m/%d/%Y")})
Any help?
UPDATE
This is the complete code, but it doesn't work. Many exceptions printed with different date formats.
import requests
import pandas as pd
import numpy as np
from datetime import datetime
url = "https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_daily_reports"
csv_only = [i.split("=")[1][1:-1] for i in requests.get(url).text.split(" ") if '.csv' in i and 'title' in i]
combo = [pd.read_csv(url.replace("github","raw.githubusercontent").replace("/tree/","/")+"/"+f) for f in csv_only]
one_df = pd.concat(combo,ignore_index=True)
df = pd.DataFrame()
DATE_FORMATS = ["%m/%d/%y %H:%M", "%m/%d/%Y %H:%M", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M:%S"]
df["Região"] = one_df["Province/State"].fillna(one_df["Admin2"])
df["País"] = one_df["Country/Region"].fillna(one_df["Country_Region"])
df["Data"] = one_df["Last Update"].fillna(one_df["Last_Update"])
df["Confirmados"] = one_df["Confirmed"]
df["Mortes"] = one_df["Deaths"]
df["Recuperados"] = one_df["Recovered"]
def parse(x_):
    # Try each known format in turn and return the first successful parse,
    # re-rendered as MM/DD/YYYY.
    for fmt in DATE_FORMATS :
        try:
            tmp = datetime.strptime(x_, fmt).strftime("%m/%d/%Y")
            return tmp
        except ValueError:
            # NOTE: this runs once per format that does not match, so a value
            # is printed even when a later format parses it successfully.
            print(x_)
    # Falls through and implicitly returns None when no format matches.
pd.to_datetime(df['Data'])
df['Data'] = df['Data'].apply(lambda x: parse(x))
#df['Data'].strftime('%m/%d/%Y')
#df['Data'] = df['Data'].map(lambda x: x.strftime('%m/%d/%Y') if x else '')
df.to_excel(r'C:\Users\guilh\Downloads\Covid2\Covid-19.xlsx', index=False, encoding="utf8")
print(df)
from datetime import datetime
import pandas as pd
You could save all possible formats in a list as -
DATE_FORMATS = ["%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S", "%m/%d/%y %H:%M", "%m/%d/%Y %H:%M"]
Define a function that loops through the formats and tries to parse it.
(Fixed a bug, where the print statement should have been outside the for loop)
# Values that could not be parsed with any of the known formats.
issues = set()


def parse(x_, formats=None):
    """Normalize a date/time string to ``MM/DD/YYYY``.

    Args:
        x_: The raw value to parse. Non-string values (e.g. NaN from a
            missing cell) are treated as unparseable instead of raising.
        formats: Iterable of ``strptime`` format strings to try in order.
            Defaults to the module-level ``DATE_FORMATS``.

    Returns:
        The date formatted as ``MM/DD/YYYY``, or ``None`` when no format
        matches; in that case ``x_`` is also recorded in ``issues``.
    """
    if formats is None:
        formats = DATE_FORMATS
    # strptime raises TypeError for non-strings, so skip straight to the
    # failure path for those.
    if isinstance(x_, str):
        for fmt in formats:
            try:
                return datetime.strptime(x_, fmt).strftime("%m/%d/%Y")
            except ValueError:
                continue
    issues.add(x_)
    return None
# Demonstrate parse() on one example of each format seen in the data.
sample = ["1/30/20 16:00", "1/31/2020 23:59", "2020-02-02T23:43:02"]
df = pd.DataFrame({'data': sample})
df['data'] = df['data'].apply(parse)
# The column is named 'data' (lower case); indexing 'Data' raised KeyError.
assert df['data'].isna().sum() == len(issues) == 0, "Issues observed, nulls observed in dataframe"
print("Done")
Output
data
0 01/30/2020
1 01/31/2020
2 02/02/2020
If df.apply() comes across a date format that hasn't been defined in the list, parse() returns nothing, so the corresponding cell simply becomes None and the offending value is recorded in issues.
also here, letting pd.to_datetime infer the format does the trick:
import pandas as pd
# Let pandas work out the format of each string instead of listing formats.
# NOTE(review): pandas >= 2.0 infers one format from the first element and
# applies it to all of them, so mixed inputs like these appear to need
# format="mixed" there -- confirm against the installed pandas version.
s = pd.to_datetime(["1/30/20 16:00", "1/31/2020 23:59", "2020-02-02T23:43:02"])
print(s)
# DatetimeIndex(['2020-01-30 16:00:00', '2020-01-31 23:59:00',
# '2020-02-02 23:43:02'],
# dtype='datetime64[ns]', freq=None)
Note that if your date/time format generally provides the day first (e.g. 30.1.2021 for Jan 30th 2021), set keyword dayfirst=True.

How can I speed these API queries up?

I am feeding a long list of inputs into a function that calls an API to retrieve data. My list contains around 40,000 unique inputs. Currently, the function returns output every 1-2 seconds or so. Quick maths tells me that it would take 10+ hours before my function is done. I therefore want to speed this process up, but I am struggling to find a solution. I am quite a beginner, so threading/pooling is quite difficult for me. I hope someone is able to help me out here.
The function:
import quandl
import datetime
import numpy as np
quandl.ApiConfig.api_key = 'API key here'
def get_data(issue_date, stock_ticker):
    """Compute volatility, average price and Amihud illiquidity for one issue.

    All three measures are derived from a single Quandl request covering the
    union of their date windows (240 to 2 days before the issue date), which
    replaces the three separate, overlapping requests made previously.

    Args:
        issue_date: Issue date as a ``DD/MM/YYYY`` string.
        stock_ticker: Ticker symbol; it is looked up as ``EOD/<ticker>``.

    Returns:
        ``[stock_vola, stock_price_average, illiquidity_measure]``, or ``[]``
        when Quandl reports the dataset as not found.
    """
    # Prepare var
    stock_ticker = "EOD/" + stock_ticker
    date_1 = datetime.datetime.strptime(issue_date, "%d/%m/%Y")

    def days_before(n):
        # Date n calendar days ahead of the issue date.
        return date_1 - datetime.timedelta(days=n)

    def window(frame, first, last):
        # Rows dated between `first` and `last` days before the issue date
        # (both ends inclusive, matching start_date/end_date on the API).
        # assumes the frame is indexed by date, as quandl.get returns -- TODO confirm
        in_range = (frame.index >= days_before(first)) & (frame.index <= days_before(last))
        return frame[in_range]

    # Check if code exists : if not -> return empty array
    try:
        history = quandl.get(stock_ticker, start_date=days_before(240), end_date=days_before(2))
    except quandl.errors.quandl_error.NotFoundError:
        return []

    # Volatility over the (-240, -40) day range
    stock = window(history, 240, 40)
    daily_close = stock['Adj_Close'].pct_change()  # returns using adj.close
    stock_vola = np.std(daily_close) * np.sqrt(252)  # annualized volatility

    # Average price over the (-12, -2) day range
    stock_price = window(history, 12, 2)
    stock_price_average = np.mean(stock_price['Adj_Close'])  # get average price

    # Amihuds Liquidity measure over the (-120, -20) day range
    stock_data = window(history, 120, 20)
    p = np.array(stock_data['Adj_Close'])
    returns = np.array(stock_data['Adj_Close'].pct_change())
    dollar_volume = np.array(stock_data['Adj_Volume'] * p)
    illiq = np.divide(returns, dollar_volume)
    # multiply by 10^6 for expositional purposes
    illiquidity_measure = np.nanmean(illiq, dtype=float) * (10 ** 6)

    return [stock_vola, stock_price_average, illiquidity_measure]
I then use a separate script to select my CSV file; each row of the file contains the issue_date and stock_ticker.
import function
import csv
import tkinter as tk
from tkinter import filedialog
# Open File Dialog
root = tk.Tk()
root.withdraw()
file_path = filedialog.askopenfilename()
if not file_path:
    # The dialog returns an empty string when it is cancelled.
    raise SystemExit("No file selected")

# Load Spreadsheet data
result_data = []
# newline='' is what the csv module expects for files it reads or writes;
# the with-block also guarantees the input file is closed.
with open(file_path, newline='') as f:
    csv_f = csv.reader(f)
    next(csv_f, None)  # skip the header row (tolerates an empty file)
    # Iterate
    for row in csv_f:
        try:
            return_data = function.get_data(row[1], row[0])
        except AttributeError:
            print(row[0])
            print('\n\n')
            print(row[1])
            continue
        if return_data:
            result_data.append([row[1], row[0], *return_data])

# result_data is always a list, so test for emptiness rather than None;
# otherwise the "No results found!" branch can never run.
if result_data:
    with open('resuls.csv', mode='w', newline='') as result_file:
        csv_writer = csv.writer(result_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        csv_writer.writerows(result_data)
else:
    print("No results found!")
It is quite messy, but like I mentioned before, I am definitely a beginner. Speeding this up would greatly help me.

Use of datetime timedelta with numpy 3d array

I have a 3D array holding the number of days elapsed since a benchmark date (e.g., 01.01.2000). I am interested in the actual day of year (DOY: 1-365/366) rather than the total number of days past a given date.
For a single value, the below syntax works. For e.g.,
import numpy as np
import datetime
data = 1595
date = datetime.datetime(2000,1,1,0,0) + datetime.timedelta(data -1)
date.timetuple().tm_yday
134
However, I am having issues with using a 3D array.
import numpy as np
import datetime
data = np.random.randint(5, size = (2,2,2))
data = data + 1595
data
array([[[1596, 1595],
[1599, 1599]],
[[1596, 1599],
[1595, 1595]]])
#Function
def Int_to_DOY(int_array):
    # Convert a day count relative to 2000-01-01 (count 1 == 1 Jan 2000)
    # into the day of the year.
    # NOTE: datetime.timedelta only accepts scalar numbers, so passing a
    # numpy array here raises the TypeError quoted in the question.
    date_ = datetime.datetime(2000,1,1,0,0) + datetime.timedelta(int_array - 1)
    return date_.timetuple().tm_yday
doy_data = data * 0 #Empty array
for i in range(2):
doy_data[:, :, i] = Int_to_DOY(data[:, :, i])
Here is the error message and I am not able to figure this out.
TypeError: unsupported type for timedelta days component: numpy.ndarray
Thanks for your help.
import numpy as np
import datetime
data = np.random.randint(5, size = (2,2,2))
data = data + 1595
#Function
def Int_to_DOY(int_array):
    """Convert day counts since 2000-01-01 into day-of-year values.

    A count of 1 corresponds to 1 January 2000. Non-integer counts are
    truncated to integers first.

    Args:
        int_array: A single day count or an array of day counts of any shape.

    Returns:
        A plain ``int`` for scalar input; otherwise an integer array with the
        same shape as the input.
    """
    days = np.asarray(int_array).astype("int64")
    # datetime64 arithmetic is vectorized, so no per-element loop is needed.
    dates = np.datetime64("2000-01-01", "D") + (days - 1).astype("timedelta64[D]")
    # Truncating to year precision yields 1 January of each date's year.
    year_start = dates.astype("datetime64[Y]")
    doy = (dates - year_start).astype("int64") + 1
    return int(doy) if days.ndim == 0 else doy
doy_data = data.flatten()
for i in range(len(doy_data)):
doy_data[i] = Int_to_DOY(doy_data[i])
doy_data = doy_data.reshape((2,2,2))
Since you tagged pandas:
data = np.array([[[1596, 1595],
[1599, 1599]],
[[1596, 1599],
[1595, 1595]]])
# A count of 1 means 2000-01-01, so shift the counts by one day *before*
# building the dates. Subtracting 1 from the day-of-year afterwards yields 0
# whenever the shifted date lands on 1 January.
s = pd.to_datetime('2000-01-01') + pd.to_timedelta(data.ravel() - 1, unit='D')
doy = s.dayofyear.values.reshape(data.shape)
doy
Output:
array([[[135, 134],
[138, 138]],
[[135, 138],
[134, 134]]], dtype=int64)

Estimating parameters using minimization in Python and speed up this process

I am trying to find parameter estimates using minimization. The code I wrote works, but there are two problems:
It finds only a local minimum. I tried to solve this by using basinhopping.
It takes very long until I get a result and since I have to do this minimization around 1000 times this becomes a big issue.
So my questions are:
Do you know how I could optimize my code so that it runs faster for the minimization.
Is there a way I can change the basinhopping part so that it runs faster, e.g. by setting niter lower or using a different method I'm not aware of? I tried running it like this, and after 10 hours I didn't get a result from basinhopping for even one of the 1000 individuals.
Is there another way to find a global minimum?
Feel free to ask further questions please.
My code:
import numpy as np
from scipy.optimize import minimize
from scipy.optimize import basinhopping
from scipy.integrate import odeint
import pickle
import os
import pandas as pd
import datetime
import numpy.random as npr
import csv
path = "C:\\Users\Sebastian Gäumann\OneDrive\Dokumente\FS 2017\Bachelorarbeit\Python"
os.chdir(path)
###IDS
df = pd.read_csv('1_Youtuber_SingleNrSheet_Comedy.csv', sep = ";", skipinitialspace=True) ######Change Name
YoutuberID = df["Channel_ID"].tolist()
##print(YoutuberID)
with open("9_p_q_m_Fun_ExtendedBass_VIEWS_Comedy_test.csv", "w" ,newline='',encoding='utf-8') as csv_file2: ######Change Name
csv_writer2 = csv.writer(csv_file2, delimiter=';')
csv_writer2.writerow(["Type","p", "q", "m","Functionvalue"])
count = 0
for ID in YoutuberID[0:]: ###Change
try:
path = "C:\\Users\Sebastian Gäumann\OneDrive\Dokumente\FS 2017\Bachelorarbeit\Python"
os.chdir(path)
###ALL INFO
Days = pd.read_csv('3_API_Call_ALL_info_Comedy_v2.csv', sep = ";", skipinitialspace=True)
views_path = "C:\\Users\Sebastian Gäumann\OneDrive\Dokumente\FS 2017\Bachelorarbeit\Python\Daily_Views_Comedy" ######Change Name
os.chdir(views_path)
SVR = pd.read_csv("4_COMEDY_DailyViews_Clean_" + str(count) + "_" + ID + ".csv", sep = ";", parse_dates=True, dayfirst=True) ######Change Name
## print(SVR[SVR.columns[0]])
SVR = SVR[SVR[SVR.columns[0]]< "2018-05-01"] ####CHANGE DATE FOR DIF CAT
## print(SVR)
#####SV Input
SV = np.array(SVR["Daily Views"])
## print(SV)
Days = Days[Days["channelId"] == ID]
## print(Days)
Days["publishedAt"] = pd.to_datetime(Days.publishedAt)
Days = Days[Days["publishedAt"] > "2015-01-08"] ##"2015-01-10"
## print(Days)
##### Timedelta #####
start_date = pd.to_datetime("2015-06-08")
##print(start_date)
video_upload_day =[]
for video_date in Days["publishedAt"]:
TimeDelta = video_date - start_date
video_upload_day.append(TimeDelta.days)
##print(video_upload_day)
##print(videoT)
nvideos = len(video_upload_day)
ndays = len(SV)
videoT = np.array(video_upload_day)
## print(videoT,nvideos,ndays)
def objective(x):
    """Sum of squared errors between observed and modelled daily views.

    Reads ``ndays``, ``nvideos``, ``videoT`` and ``SV`` from the enclosing
    scope. For every video ``v`` and day ``t`` with ``videoT[v] <= t`` the
    modelled views are ``p*m + (q-p)*S - (q/m)*S**2``, where ``S`` is that
    video's total modelled views on all earlier days; other entries are 0.

    Args:
        x: Sequence ``(p, q, m)`` of model parameters.

    Returns:
        ``sum((SV - estimateSV)**2)``, where ``estimateSV`` is the modelled
        views summed over all videos for each day.
    """
    p, q, m = x[0], x[1], x[2]
    estimateSV = np.zeros(ndays)
    # Running per-video total of the earlier days. Keeping it up to date
    # replaces re-summing estimateV[0:t, v] at every step, so the cost is
    # linear rather than quadratic in ndays, and the video loop is vectorized.
    # Results can differ from the re-summed version in the last floating-point
    # digits because the additions happen in a different order.
    cumulative = np.zeros(nvideos)
    for t in range(ndays):
        today = p*m + (q-p) * cumulative - (q/m) * cumulative**2
        # Videos not yet uploaded on day t contribute nothing.
        today = np.where(videoT <= t, today, 0.0)
        estimateSV[t] = today.sum()
        cumulative += today
    return np.sum( (SV - estimateSV)**2 )
This is the minimization part. I wrote one version for the normal minimization and one for basinhopping, and separated them with ## comments.
###### MINIMIZATION #######
mguess = round(sum(SV)/(nvideos*2),0)
print(sum(SV),mguess)
x0 = np.array([0.001, 0.01, mguess]) ####Make it less volatile to first guess? Make bigger steps for m?
b1 = (0.00001,0.5)
b2 = (10**4,10**7)
bnds = (b1,b1,b2)
## minimizer_kwargs = dict(method="L-BFGS-B",bounds=bnds)
## res = basinhopping(objective, x0,niter=20, minimizer_kwargs=minimizer_kwargs)
res = minimize(objective, x0,bounds = bnds)
print(res)
csv_writer2.writerow(["COMEDY",res.x[0], res.x[1],res.x[2],res.fun]) ###CHANNGE CAT
print("CURRERNT YOUTUBER IS:",count)
count += 1
except:
print("PROBLEM",count)
count += 1
## print(res,res.x[0],res.x[1],res.x[2],res.fun)

How do i sort a text file by column numerically?

from lxml import html
import operator
import discord
import yaml
import csv
raw_json =
requests.get('https://bittrex.com/api/v1.1/public/getmarketsummaries').text
json_dict = json.loads(raw_json)
stuff = json_dict["result"]
new = []
for i in range(0,197):
price = (stuff[i]['Last'])
name1 = (stuff[i]['MarketName'])
name = name1.replace("BTC-", "")
prev = (stuff[i]['PrevDay'])
diff = price - prev
change = round(((price - prev) / price) * 100, 2)
final = ('{0},{1}'.format(name,change))
new.append(final)
butFirst = new[0:]
this1 = ("\n".join(butFirst))
text_file = open("Sort.txt", "w")
text_file.write(this1)
text_file.close()
I'm having problems sorting this output by the second column.
I get "base 10" errors, integer errors, etc. I think the problem
is how the number is stored, but I can't figure it out.
The output looks like this:
1ST,-5.94
2GIVE,3.45
ABY,2.44
ADA,0.0
ADT,-4.87
ADX,-13.09
AEON,-2.86
AGRS,-2.0
You should avoid changing your data to text earlier than you need to. If you operate with a list of dictionaries it's very easy to sort the list.
import json
import csv
import requests
raw_json = requests.get('https://bittrex.com/api/v1.1/public/getmarketsummaries').text
json_dict = json.loads(raw_json)
stuff = json_dict["result"]
new = []
# Iterate over whatever the API returned instead of a hard-coded 197 entries,
# which raises IndexError on a shorter list and drops the tail of a longer one.
for market in stuff:
    last = market.get('Last')
    prev_day = market.get('PrevDay')
    # Skip markets with a missing or zero last price: the percentage change
    # divides by the price and is undefined for them.
    if not last or prev_day is None:
        continue
    price = float(last)
    prev = float(prev_day)
    # Use dictionary to hold the data
    new.append({
        'name': market['MarketName'].replace("BTC-", ""),
        'change': round(((price - prev) / price) * 100, 2),
    })
# The actual sorting part, sorting by change (numeric, ascending)
sorted_list = sorted(new, key=lambda k: k['change'])
# Writing the dictionaries to file; newline="" is required by the csv module
# so that no blank lines are inserted between rows on Windows.
with open("Sort.txt", "w", newline="") as text_file:
    # Explicit field names also work when the list is empty.
    dict_writer = csv.DictWriter(text_file, fieldnames=['name', 'change'])
    # include the line below, if you want headers
    # dict_writer.writeheader()
    dict_writer.writerows(sorted_list)

Resources