to_datetime() in pandas returns a Categorical type rather than a datetime object - python-3.x

Here is a sample of the code:
data.timestamp = pd.to_datetime(data.timestamp, infer_datetime_format = True, utc = True)
data.timestamp.dtype
CategoricalDtype(categories=['2016-01-10 06:00:00+00:00', '2016-01-10 07:00:00+00:00',
'2016-01-10 08:00:00+00:00', '2016-01-10 09:00:00+00:00',
'2016-01-10 10:00:00+00:00', '2016-01-10 11:00:00+00:00',
'2016-01-10 12:00:00+00:00', '2016-01-10 13:00:00+00:00',
'2016-01-10 14:00:00+00:00', '2016-01-10 15:00:00+00:00',
...
'2016-12-31 13:00:00+00:00', '2016-12-31 14:00:00+00:00',
'2016-12-31 15:00:00+00:00', '2016-12-31 16:00:00+00:00',
'2016-12-31 17:00:00+00:00', '2016-12-31 18:00:00+00:00',
'2016-12-31 19:00:00+00:00', '2016-12-31 20:00:00+00:00',
'2016-12-31 21:00:00+00:00', '2016-12-31 23:00:00+00:00'],
ordered=False)
How can I solve this issue?

data.timestamp = pd.to_datetime(data.timestamp, infer_datetime_format = True, utc = True).astype('datetime64[ns]')
This worked.

Related

Need to convert a date into target timezone date with DST (daylight savings time)

I have a datetime in the format "20200123114953". I am able to convert it to a datetime in the target timezone, "2020-01-23T00:19:53-0600", but the result does not respect daylight saving time.
I expect the time with offset value "-5:00", but I get "-6:00" for US/Eastern.
Could someone please help me out with the logic in Python that respects DST?
import datetime as dt
import pendulum
import pytz
def getDateTime(datetime_, is_timezone_required=None, input_format=None, output_format=None, default='False'):
    """Parse a timestamp string and render it in the US/Eastern timezone.

    Args:
        datetime_: Timestamp text matching ``input_format``.
        is_timezone_required: Whether the UTC offset (``%z``) appears in the
            result. May be a bool or a string; any string other than
            "false" (case-insensitive) counts as true. ``None`` falls back
            to ``default``.
        input_format: ``strptime`` format of ``datetime_``. Defaults to
            ``"%Y%m%d%H%M%S"``.
        output_format: ``strftime`` format of the result. Defaults to
            ``"%Y-%m-%dT%H:%M:%Sz"`` (the trailing ``z`` is a literal
            character, not a directive).
        default: Value used for ``is_timezone_required`` when it is ``None``.

    Returns:
        The formatted timestamp string.

    Raises:
        ValueError: If ``datetime_`` does not match ``input_format``.
    """
    # Previously a None flag crashed on ``.lower()``; fall back to ``default``.
    flag = default if is_timezone_required is None else is_timezone_required
    if isinstance(flag, str):
        timezone_wanted = flag.lower() != "false"
    else:
        timezone_wanted = bool(flag)

    # Only fill in the formats the caller did not supply.
    if input_format is None:
        input_format = "%Y%m%d%H%M%S"
    if output_format is None:
        output_format = "%Y-%m-%dT%H:%M:%Sz"
    timezone = "US/Eastern"

    if timezone_wanted:
        if "%z" not in output_format:
            output_format += "%z"
    else:
        # str.replace is a no-op when "%z" is absent.
        output_format = output_format.replace("%z", "")

    naive = dt.datetime.strptime(datetime_, input_format)
    # NOTE: astimezone() on a naive datetime treats it as being in the
    # machine's local timezone, so the result depends on where this runs.
    # The offset pytz applies follows the DST rules for the *date itself*
    # (US/Eastern is on standard time, UTC-5, in January).
    eastern = naive.astimezone(pytz.timezone(timezone))
    return eastern.strftime(output_format)
getDate = getDateTime('20200123114953', "True")
print(getDate)
"This gets an output: 2020-01-23T01:19:53z-0500
My expectation is : 2020-01-23T02:19:53z-0400"

return missing dates Python

I have a CSV file with 1600 dates and I'm trying to find all missing dates. For example:
03-10-2019
01-10-2019
29-09-2019
28-09-2019
should return : 02-10-2019,30-09-2019.
Here's what I've written:
with open('measurements.csv','r') as csvfile:
df = pd.read_csv(csvfile, delimiter=',')
timestamps = df['observation_time'] #Getting only the date
for line in timestamps:
date_str = line
try: # convert string to time
date = date_time_obj = datetime.datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S')
dates.append(date)
except:
print("Date parsing failed")
dates = pd.DataFrame(dates,columns =['actual_date'])
pd.date_range(start = dates.min(), end = dates.max()).difference(dates.index)
This returns an error that
"Cannot convert input [actual_date 2018-09-17 22:00:00 dtype:
datetime64[ns]] of type <class 'pandas.core.series.Series'> to
Timestamp"
The idea is to use DataFrame.asfreq to add all missing dates to the DatetimeIndex, so that they can then be filtered by boolean indexing with Series.isna:
df['observation_time'] = pd.to_datetime(df['observation_time'], dayfirst=True)
df1 = df.set_index(df['observation_time']).sort_index().asfreq('d')
print (df1)
observation_time
observation_time
2019-09-28 2019-09-28
2019-09-29 2019-09-29
2019-09-30 NaT
2019-10-01 2019-10-01
2019-10-02 NaT
2019-10-03 2019-10-03
dates = df1.index[df1['observation_time'].isna()]
print (dates )
DatetimeIndex(['2019-09-30', '2019-10-02'], dtype='datetime64[ns]',
name='observation_time', freq=None)

TypeError: NoneType is unsubscriptable - IF statement

I am trying to find fuzzy string matches for university names and print a certain score (10, 5, 3) to a csv each time depending on what list the closest match came from.
data = [["MIT"], ["Stanford"], ...]
Data1 = ['MASSACHUSETTS INSTITUTE OF TECHNOLOGY (MIT)'], ['STANFORD UNIVERSITY'],...
So far I have tried:
1 for uni in data:
2 hit = process.extractOne(str(uni[0]), data1, scorer = fuzz.token_set_ratio, score_cutoff = 90)
3 if float(hit[1]) < 100:
4 print("not found")
5 else:
print("Closest match for " + str(uni[0]) + " is " + str(hit[0]) " + "score: 10")
At this point I get the TypeError: NoneType is unsubscriptable for line 3
I have checked the type of my variable:
print(type(hit)) #I was getting tuple now NoneType...
print(len(hit)) # Was getting 2 now unsubscriptable
print(float(hit[1])) # 100
As I understood this error comes up when a variable is not the type one thinks it is. Any idea how to resolve this issue? Many thanks
Thanks to #inthevortex, I was able to complete the code as follows:
for uni in data:
hit = process.extractOne(str(uni[0]), data10, scorer = fuzz.token_set_ratio, score_cutoff = 90)
try:
if float(hit[1]) >= 94:
with open(filename, mode='a', newline="") as csv_file:
fieldnames = ['bwbnr', 'uni_name', 'match', 'points']
writer = csv.DictWriter(csv_file, fieldnames=fieldnames, delimiter=';')
writer.writerow({'bwbnr': str(uni[0]), 'uni_name': str(uni[0]), 'match': str(hit), 'points': 10})
except:
hit1 = process.extractOne(str(uni[0]), data11, scorer = fuzz.token_set_ratio, score_cutoff = 90)
try:
if float(hit1[1]) >= 94:
with open(filename, mode='a', newline="") as csv_file:
fieldnames = ['bwbnr', 'uni_name', 'match', 'points']
writer = csv.DictWriter(csv_file, fieldnames=fieldnames, delimiter=';')
writer.writerow({'bwbnr': str(uni[0]), 'uni_name': str(uni[0]), 'match': str(hit1), 'points': 5})
... and so on... until the last except.
Thanks to #inthevortex I completed the code using the try-except method:
for uni in data:
hit = process.extractOne(str(uni[0]), data10, scorer = fuzz.token_set_ratio, score_cutoff = 90)
try:
if float(hit[1]) >= 94:
with open(filename, mode='a', newline="") as csv_file:
fieldnames = ['bwbnr', 'uni_name', 'match', 'points']
writer = csv.DictWriter(csv_file, fieldnames=fieldnames, delimiter=';')
writer.writerow({'bwbnr': str(uni[0]), 'uni_name': str(uni[0]), 'match': str(hit), 'points': 10})
except:
hit1 = process.extractOne(str(uni[0]), data11, scorer = fuzz.token_set_ratio, score_cutoff = 90)
try:
if float(hit1[1]) >= 94:
with open(filename, mode='a', newline="") as csv_file:
fieldnames = ['bwbnr', 'uni_name', 'match', 'points']
writer = csv.DictWriter(csv_file, fieldnames=fieldnames, delimiter=';')
writer.writerow({'bwbnr': str(uni[0]), 'uni_name': str(uni[0]), 'match': str(hit1), 'points': 5})
All the way down to the last list I wanted to compare with, again with try-except!

Convert date string to datetime for whole column of df

df = pd.read_csv('bitcoin.csv')
print(df)
gives
Date Open High Low Close Volume
0 Apr 16, 2018 8337.57 8371.15 7925.73 8058.67 5,631,310,000
1 Apr 15, 2018 7999.33 8338.42 7999.33 8329.11 5,244,480,000 ....
I tried
pd.to_datetime(pd.Series(['Date']), format = '%b %d, %Y')
but got
TypeError: Unrecognized value type: <class 'str'> && ValueError: time data 'Date' does not match format '%b %d %Y' (match)
I also tried
df['Date'] = df['Date'].apply(lambda x: datetime.datetime.strptime(x, '%b %d, %Y')
but got SyntaxError: unexpected EOF while parsing
when running
print(df['Date'])
after printing it says
Name: Date, Length: 1567, dtype: object
Not sure what's going on here. Is it already a datetime object?
Seems like you're missing an extra parenthesis at the end here:
df['Date'] = df['Date'].apply(lambda x: datetime.datetime.strptime(x, '%b %d, %Y'))
I would suggest you do something like this, adding to #COLDSPEED comment:
df['Date'] = df['Date'].apply(lambda x: pd.to_datetime(x, format = '%b %d, %Y', errors = 'coerce'))
df['Date'] = pd.to_datetime(df.Date).dt.strftime('%b %d, %Y')
Output
0 Apr 16, Jan 01, 1970
1 Apr 15, Jan 01, 1970
Name: Date, dtype: object

Simplify large-data processing script

I am trying to do the following, but it takes too much time.
Can someone please suggest a quicker way of doing this?
f = open('answer.csv','w')
f.write('Datetime,0: Vm,0: Va,1: Vm,1: Va,2: Vm,2: Va,3: Vm,3: Va,4: Vm,4: Va,5: Vm,5: Va,6: Vm,6: Va,7: Vm,7: Va,8: Vm,8: Va,9: Vm,9: Va,10: Vm,10: Va,11: Vm,11: Va,12: Vm,12: Va,13: Vm,13: Va\n')
# 'n' is around 8000000
# 'PQ_data' is a pandas DataFrame with more than n rows
# 'class' is a python class object with some functions in it
for i in range(n):
p = []
q = []
for j in range(1,14):
if j<=10:
p.append(PQ_data['{} P'.format(j)][i])
q.append(PQ_data['{} Q'.format(j)][i])
else:
p.append(0)
q.append(0)
class.do_something(p,q)
vm = class.get_Vm().tolist()
va = class.get_Va().tolist()
# above methods return 14 length lists.
# PQ_data.index has datetime values
f.write('{}'.format(PQ_data.index[i]))
for j in range(len(vm)):
f.write(',{},{}'.format(vm[j],va[j]))
f.write('\n')
f.close()
Try this. If not, you might need to throw multiprocessing at it
import csv
import itertools
with open('answer.csv','w') as fout:
outfile = csv.writer(fout)
outfile.writerow(['Datetime', '0: Vm', '0: Va', '1: Vm', '1: Va', '2: Vm', '2: Va', '3: Vm', '3: Va', '4: Vm', '4: Va', '5: Vm', '5: Va', '6: Vm', '6: Va', '7: Vm', '7: Va', '8: Vm', '8: Va', '9: Vm', '9: Va', '10: Vm', '10: Va', '11: Vm', '11: Va', '12: Vm', '12: Va', '13: Vm', '13: Va'])
for i in range(n):
p = [PQ_data['{} P'.format(j)][i] for j in range(1,11)] + [0]*3
q = [PQ_data['{} Q'.format(j)][i] for j in range(1,11)] + [0]*3
class.do_something(p,q)
vm = class.get_Vm().tolist()
va = class.get_Va().tolist()
row = itertools.chain([PQ_data.index[i]], itertools.chain.from_iterable((vm[j],va[j]) for j in range(len(vm))))
outfile.writerow(row)

Resources