Automatically Extracting the Datetime Format from a Pandas Series [duplicate] - python-3.x

I am trying to format the column 'Data' to make a pattern with dates.
The formats I have are:
1/30/20 16:00
1/31/2020 23:59
2020-02-02T23:43:02
Here is the code for the dataframe.
import requests
import pandas as pd
import numpy as np
url = "https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_daily_reports"
csv_only = [i.split("=")[1][1:-1] for i in requests.get(url).text.split(" ") if '.csv' in i and 'title' in i]
combo = [pd.read_csv(url.replace("github","raw.githubusercontent").replace("/tree/","/")+"/"+f) for f in csv_only]
one_df = pd.concat(combo,ignore_index=True)
one_df["País"] = one_df["Country/Region"].fillna(one_df["Country_Region"])
one_df["Data"] = one_df["Last Update"].fillna(one_df["Last_Update"])
I tried adding the code bellow but it doesn't bring the result I wanted
pd.to_datetime(one_df['Data'])
one_df.style.format({"Data": lambda t: t.strftime("%m/%d/%Y")})
Any help?
UPDATE
This is the complete code, but it doesn't work. Many exceptions printed with different date formats.
import requests
import pandas as pd
import numpy as np
from datetime import datetime
url = "https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_daily_reports"
csv_only = [i.split("=")[1][1:-1] for i in requests.get(url).text.split(" ") if '.csv' in i and 'title' in i]
combo = [pd.read_csv(url.replace("github","raw.githubusercontent").replace("/tree/","/")+"/"+f) for f in csv_only]
one_df = pd.concat(combo,ignore_index=True)
df = pd.DataFrame()
DATE_FORMATS = ["%m/%d/%y %H:%M", "%m/%d/%Y %H:%M", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M:%S"]
df["Região"] = one_df["Province/State"].fillna(one_df["Admin2"])
df["País"] = one_df["Country/Region"].fillna(one_df["Country_Region"])
df["Data"] = one_df["Last Update"].fillna(one_df["Last_Update"])
df["Confirmados"] = one_df["Confirmed"]
df["Mortes"] = one_df["Deaths"]
df["Recuperados"] = one_df["Recovered"]
def parse(x_):
for fmt in DATE_FORMATS :
try:
tmp = datetime.strptime(x_, fmt).strftime("%m/%d/%Y")
return tmp
except ValueError:
print(x_)
pd.to_datetime(df['Data'])
df['Data'] = df['Data'].apply(lambda x: parse(x))
#df['Data'].strftime('%m/%d/%Y')
#df['Data'] = df['Data'].map(lambda x: x.strftime('%m/%d/%Y') if x else '')
df.to_excel(r'C:\Users\guilh\Downloads\Covid2\Covid-19.xlsx', index=False, encoding="utf8")
print(df)

from datetime import datetime
import pandas as pd
You could save all possible formats in a list as -
DATE_FORMATS = ["%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S", "%m/%d/%y %H:%M", "%m/%d/%Y %H:%M"]
Define a function that loops through the formats and tries to parse it.
(Fixed a bug, where the print statement should have been outside the for loop)
issues = set()
def parse(x_):
for fmt in DATE_FORMATS:
try:
return datetime.strptime(x_, fmt).strftime("%m/%d/%Y")
except ValueError:
pass
issues.add(x_)
sample = ["1/30/20 16:00", "1/31/2020 23:59", "2020-02-02T23:43:02"]
df = pd.DataFrame({'data': sample})
df['data'] = df['data'].apply(lambda x: parse(x))
assert df['Data'].isna().sum() == len(issues) == 0, "Issues observed, nulls observed in dataframe"
print("Done")
Output
data
0 01/30/2020
1 01/31/2020
2 02/02/2020
If df.apply() comes across a particular date format that hasn't been defined in the list, it would simply print None since nothing would be returned by the function parse()

also here, letting pd.to_datetime infer the format does the trick:
import pandas as pd
s = pd.to_datetime(["1/30/20 16:00", "1/31/2020 23:59", "2020-02-02T23:43:02"])
print(s)
# DatetimeIndex(['2020-01-30 16:00:00', '2020-01-31 23:59:00',
# '2020-02-02 23:43:02'],
# dtype='datetime64[ns]', freq=None)
Note that if your date/time format generally provides the day first (e.g. 30.1.2021 for Jan 30th 2021), set keyword dayfirst=True.

Related

Trying to plot a rolling corr line chart but Matplot keeps saying to bring in only valid columns?

Im trying to create a rolling corr using matplot but I get the error "select only valid columns before calling the operation. Dropped columns were Index(['time'], dtype='object')
I have dropped that field from my data frame but the error keeps on appearing ?
Is it something to do with my .iloc argument?
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import seaborn as sns
import scipy.stats as stats
import json
from datetime import timezone
from datetime import datetime
from pycoingecko import CoinGeckoAPI
pd.options.display.width = 0
def datetime_to_unix(year, month, day):
'''datetime_to_unix(2021, 6, 1) => 1622505600.0'''
dt = datetime(year, month, day)
timestamp = (dt - datetime(1970, 1, 1)).total_seconds()
return timestamp
def unix_to_datetime(unix_time):
'''unix_to_datetime(1622505700)=> ''2021-06-01 12:01am'''''
ts = int(unix_time/1000 if len(str(unix_time)) > 10 else unix_time) # /1000 handles milliseconds
return datetime.utcfromtimestamp(ts).strftime('%Y-%m-%d %l:%M%p').lower()
# Initialize the client
cg = CoinGeckoAPI()
# Retrieve looksrare data in USD
result = cg.get_coin_market_chart_range_by_id(
id='looksrare',
vs_currency='usd',
from_timestamp=datetime_to_unix(2022, 1, 11),
to_timestamp=datetime_to_unix(2022, 4, 20)
)
time = [ unix_to_datetime(i[0]) for i in result['prices'] ]
p_array = np.array(result['prices'])
price = p_array[:,1]
v_array = np.array(result['total_volumes'])
volume = v_array[:,1]
df = pd.DataFrame({'time':time, 'price':price,})
df.head(100)
# Retrieve ETH data in USD
result = cg.get_coin_market_chart_range_by_id(
id='ethereum',
vs_currency='usd',
from_timestamp=datetime_to_unix(2022, 1, 11),
to_timestamp=datetime_to_unix(2022, 4, 20)
)
time = [ unix_to_datetime(i[0]) for i in result['prices'] ]
p_array = np.array(result['prices'])
price = p_array[:,1]
v_array = np.array(result['total_volumes'])
volume = v_array[:,1]
df2 = pd.DataFrame({'time':time, 'price':price,})
df2.head(100)
df_cd = pd.merge(df, df2, how='inner', on='time')
df_cd = df_cd.drop('time', 1)
output = df_cd.corr()
output1 = df_cd['price_x'].corr(df_cd['price_y'])
overall_pearson_r = df_cd.corr().iloc[0,1]
print(df_cd)
print(f"Pandas computed Pearson r: {overall_pearson_r}")
r, p = stats.pearsonr(df_cd.dropna()['price_x'], df_cd.dropna()['price_y'])
print(f"Scipy computed Pearson r: {r} and p-value: {p}")
# compute rolling window synchrony
f,ax=plt.subplots(figsize=(7,3))
df.rolling(window=30,center=True).median().plot(ax=ax)
ax.set(xlabel='Time',ylabel='Pearson r')
ax.set(title=f"Overall Pearson r = {np.round(overall_pearson_r,2)}");

Series format pandas

import pandas as pd
from datetime import datetime
import os
# get username
user = os.getlogin()
def file_process():
data = pd.read_excel('C:\\Users\\' + user + '\\My Documents\\XINVST.xls')
# Change the date and time formatting
data["INVDAT"] = data["INVDAT"].apply(lambda x: datetime.combine(x, datetime.min.time()))
data["INVDAT"] = data["INVDAT"].dt.strftime("%m-%d-%Y")
print(data)
# output to new file
# new_data = data
# new_data.to_excel('C:\\Users\\' + user + '\\Desktop\\XINVST.xls', index=None)
if __name__ == '__main__':
file_process()
I'm trying to format the INVDAT column to correct date format like 11/25/19, I've tried multiple solutions but keep running into errors like this one: TypeError: combine() argument 1 must be datetime.date, not int, I then tried to convert the integer to date type but it errors also.
Or you can simply use df["INVDAT"] = pd.to_datetime(df["INVDAT"], format="%m/%d/%y"), in this case you don't need the datetime pakage. For further information you should look the docs.
data['INVDAT'] = data['INVDAT'].astype('str')
data["INVDAT"] = pd.to_datetime(data["INVDAT"])
data["INVDAT"] = data["INVDAT"].dt.strftime("%m/%d/%Y")
This solution works but if the date representation is a single month like 12519 ( expected output 1/25/19), it fails. I tried using a conditional to add a 0 to the front if len() < 6 but it gives me an error that the dtype is int64.
import pandas as pd
import os
# get username
user = os.getlogin()
def file_process():
data = pd.read_excel('C:\\Users\\' + user + '\\My Documents\\XINVST.xls')
# Change the date and time formatting
data['INVDAT'] = data['INVDAT'].astype('str')
length = len(data['INVDAT'])
data['INVDAT'].pop(length - 1)
for i in data['INVDAT'].str.len():
if i <= 5:
data['INVDAT'] = data['INVDAT'].apply(lambda x: '{0:0>6}'.format(x))
length = len(data['INVDAT'])
data['INVDAT'].pop(length - 1)
data["INVDAT"] = pd.to_datetime(data["INVDAT"])
data["INVDAT"] = data["INVDAT"].dt.strftime("%m/%d/%Y")
else:
data["INVDAT"] = pd.to_datetime(data["INVDAT"])
data["INVDAT"] = data["INVDAT"].dt.strftime("%m/%d/%Y")
# output to new file
new_data = data
new_data.to_excel('C:\\Users\\' + user + '\\Desktop\\XINVST.xls', index=None)
if __name__ == '__main__':
file_process()
This is the solution, it's sloppy but works

Use of datetime timedelta with numpy 3d array

I have a 3D array with the count of number of days past a benchmark date (e.g., 01.01.2000). I am interested in the actual day-of-year (DOY: 1-365/366)rather than the total number of days past a given date.
For a single value, the below syntax works. For e.g.,
import numpy as np
import datetime
data = 1595
date = datetime.datetime(2000,1,1,0,0) + datetime.timedelta(data -1)
date.timetuple().tm_yday
134
However, I am having issues with using a 3D array.
import numpy as np
import datetime
data = np.random.randint(5, size = (2,2,2))
data = data + 1595
data
array([[[1596, 1595],
[1599, 1599]],
[[1596, 1599],
[1595, 1595]]])
#Function
def Int_to_DOY(int_array):
date_ = datetime.datetime(2000,1,1,0,0) + datetime.timedelta(int_array - 1)
return date_.timetuple().tm_yday
doy_data = data * 0 #Empty array
for i in range(2):
doy_data[:, :, i] = Int_to_DOY(data[:, :, i])
Here is the error message and I am not able to figure this out.
TypeError: unsupported type for timedelta days component: numpy.ndarray
Thanks for your help.
import numpy as np
import datetime
data = np.random.randint(5, size = (2,2,2))
data = data + 1595
#Function
def Int_to_DOY(int_array):
date_ = datetime.datetime(2000,1,1,0,0) + datetime.timedelta(int(int_array) -1)
return date_.timetuple().tm_yday
doy_data = data.flatten()
for i in range(len(doy_data)):
doy_data[i] = Int_to_DOY(doy_data[i])
doy_data = doy_data.reshape((2,2,2))
Since you tagged pandas:
data = np.array([[[1596, 1595],
[1599, 1599]],
[[1596, 1599],
[1595, 1595]]])
s = pd.to_datetime('2000-01-01') + pd.to_timedelta(data.ravel(), unit='D')
s.dayofyear.values.reshape(data.shape) - 1
Output:
array([[[135, 134],
[138, 138]],
[[135, 138],
[134, 134]]], dtype=int64)

ValueError: year is out of range using matplotlib.pyplot

After I call candlestick_ohlc, I can't seem to convert the x axis dates to something matplotlib can understand.
I'm a noob Python programmer. I've tried turning the dataframe into a list, I've tried passing dates to candlestick_ohlc, nothing seems to work other than changing
df['time'] = (df['time'].astype('float'))
into
df['time'] = (df['time'].astype('float')\1000)
Although that renders the wrong datetime.
import requests
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdate
import matplotlib.style as style
import matplotlib.ticker as mticker
from matplotlib.dates import date2num
from mpl_finance import candlestick_ohlc
import datetime as dt
import numpy as np
import matplotlib.ticker as mticker
def get_data(date):
""" Query the API for 2000 days historical price data starting from "date". """
url = "https://min-api.cryptocompare.com/data/histoday?fsym=BTC&tsym=USD&limit=2000&toTs={}".format(date)
r = requests.get(url)
ipdata = r.json()
return ipdata
def get_df(from_date, to_date):
""" Get historical price data between two dates. """
date = to_date
holder = []
# While the earliest date returned is later than the earliest date requested, keep on querying the API
# and adding the results to a list.
while date > from_date:
data = get_data(date)
holder.append(pd.DataFrame(data['Data']))
date = data['TimeFrom']
# Join together all of the API queries in the list.
df = pd.concat(holder, axis = 0)
# Remove data points from before from_date
df = df[df['time']>from_date]
# Convert to timestamp to readable date format
# df['time'] = pd.to_datetime(df['time'], unit='s')
# Make the DataFrame index the time
df.set_index('time', inplace=True)
# And sort it so its in time order
df.sort_index(ascending=False, inplace=True)
return df
df = get_df(1528502400, 1560112385)
style.use('dark_background')
fig = plt.figure()
ax1 = plt.subplot2grid((1,1), (0,0))
df = df.reset_index()
cols = ['time', 'open', 'high', 'low', 'close', 'volumefrom', 'volumeto']
df = df[cols]
#IF YOU /1000 AFER ('float') IT WILL RUN BUT NOT CORRECT DATE
df['time'] = (df['time'].astype('float'))
print(df.dtypes)
ohlc = df.values.tolist()
candlestick_ohlc(ax1, ohlc, width=.4, colorup='g', colordown='r')
# IF YOU COMMENT NEXT 4 LINES IT WILL RUN, but NO DATES for XAXIS
date_fmt = "%d-%m-%Y"
date_formatter = mdate.DateFormatter(date_fmt)
ax1.xaxis.set_major_formatter(date_formatter)
fig.autofmt_xdate()
ax1.set_ylabel('BTC Price (USD)')
ax1.set_xlabel('Date')
plt.show()
Expected result would be date labels plotted as d-m-y. :)
Wish this had dates for xaxis labels not seconds since 1970
This is what I want it to look like, but with accurate dates
This is how to fix the code:
df['time'] = df['time'].apply(mdates.epoch2num)
It was definitely one of those lines of code that you spend hours on... now I know.

Converting string with nano seconds to timestamp

Im trying to convert String Datatype to Timestamp data type but Im getting NONE as a result
Sample Data and Code
20181016T192403.635918+02:00
date_format = "yyyyMMdd'T'HHmmss.SSSSSSZ”
data_frame = data_frame.withColumn('dob_ts', unix_timestamp('dob', date_format).cast(‘timestamp’)
Other formats (yyyyMMdd'T'HHmmss.SSS) works fine but not this one.
How to convert this format to timestamp?
You can using udf to define your function. Hence, in the user defined function you can handle this case by an if or what you want:
from pyspark.sql.functions import udf
from datetime import datetime
from pyspark.sql.types import TimestampType
def date_time_to_date(input_date_time):
split_ind = input_date_time.find('T')
new_date = input_date_time
if split_ind > -1:
new_date = input_date_time[:split_ind] + input_date_time[split_ind + 1:]
return datetime.strptime(input_date_time, '%Y%m%d %H%M%S.%f')
udf_date_time_to_date = udf(new_date, TimestampType())
data_frame = data_frame.withColumn('dob_ts', udf_date_time_to_date('dob'))

Resources