I am trying to get weather data for a time range.
I want to get daily OR hourly data for a whole year.
I tried the following code:
from forecastiopy import *
from datetime import date, datetime, timedelta
def daterange(start_date, end_date):
for n in range(int ((end_date - start_date).days)):
yield start_date + timedelta(n)
start_date = date(2015, 1, 1)
end_date = date(2015, 12, 31)
for single_date in daterange(start_date, end_date):
time = single_date.strftime("%Y-%m-%d")
print('DATE: ', time)
city = [40.730610, -73.935242]
fio = ForecastIO.ForecastIO(apikey,
units=ForecastIO.ForecastIO.UNITS_SI,
lang=ForecastIO.ForecastIO.LANG_ENGLISH,
latitude=city[0], longitude=city[1])
print('Latitude:', fio.latitude, 'Longitude:', fio.longitude)
print('Timezone', fio.timezone, 'Offset', fio.offset)
print(fio.get_url()) # You might want to see the request url
if fio.has_hourly() is True:
hourly = FIOHourly.FIOHourly(fio)
print('Hourly')
print('Summary:', hourly.summary)
print('Icon:', hourly.icon)
for hour in range(0, hourly.hours()):
print('Hour', hour+1)
for item in hourly.get_hour(hour).keys():
print(item + ' : ' + str(hourly.get_hour(hour)[item]))
# Or access attributes directly for a given hour.
print(hourly.hour_5_time)
else:
print('No Hourly data')
I get:
DATE: 2015-01-01
DATE: 2015-01-02
DATE: 2015-01-03
...
DATE: 2015-12-29
DATE: 2015-12-30
Latitude: 40.73061 Longitude: -73.935242
Timezone America/New_York Offset -4
Hourly
Summary: Light rain starting this afternoon.
Icon: rain
Hour 1
visibility : 16.09
humidity : 0.52
...
Hour 49
visibility : 16.09
humidity : 0.57
apparentTemperature : 23.52
icon : partly-cloudy-day
precipProbability : 0
windGust : 2.7
uvIndex : 2
time : 1498395600
precipIntensity : 0
windSpeed : 2.07
pressure : 1014.84
summary : Mostly Cloudy
windBearing : 37
temperature : 23.34
ozone : 308.33
cloudCover : 0.65
dewPoint : 14.43
1498237200
How can I pass each day of a specific year as the time parameter, to get 365 daily reports or 365 * 24 hourly reports? I am not a Python specialist.
This blog post provides some code to query between dates: https://nipunbatra.github.io/blog/2013/download_weather.html
import datetime

import forecastio
import pandas as pd

# api_key, lat, lng and the list `attributes` (the hourly fields you want,
# e.g. 'temperature', 'humidity') are defined earlier in the linked post.
times = []
data = {}
for attr in attributes:
    data[attr] = []

start = datetime.datetime(2015, 1, 1)
for offset in range(1, 60):
    forecast = forecastio.load_forecast(api_key, lat, lng,
                                        time=start + datetime.timedelta(offset),
                                        units="us")
    h = forecast.hourly()
    d = h.data
    for p in d:
        times.append(p.time)
        try:
            for i in attributes:
                data[i].append(p.d[i])
        except KeyError as e:
            print(e)

df = pd.DataFrame(data, index=times)
It works for me on Python 3.6. However, I was getting a KeyError: 'temperature' when I queried dates around March 2019 for my coordinates, so I added the try/except around the attribute loop inside for p in d.
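If you want the whole of 2015 rather than the first 60 days, the same loop just needs one Time Machine request per day. Here is a minimal sketch of that, untested, assuming your own api_key, the New York coordinates from the question, and an example attributes list; it builds roughly the 365 * 24 hourly rows you asked for:

import datetime

import forecastio
import pandas as pd

api_key = 'YOUR_DARKSKY_KEY'              # assumption: your own API key
lat, lng = 40.730610, -73.935242          # New York, as in the question
attributes = ['temperature', 'humidity']  # assumption: whichever fields you need

times = []
data = {attr: [] for attr in attributes}

start = datetime.datetime(2015, 1, 1)
for offset in range(365):                 # one Time Machine call per day of 2015
    forecast = forecastio.load_forecast(api_key, lat, lng,
                                        time=start + datetime.timedelta(days=offset),
                                        units='si')
    for p in forecast.hourly().data:      # up to 24 hourly points per request
        times.append(p.time)
        for attr in attributes:
            data[attr].append(p.d.get(attr))  # .get() avoids the KeyError on missing fields

df = pd.DataFrame(data, index=times)
print(df.head())

Note that this is 365 separate API calls, so keep an eye on your rate limit.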
Hope this helps
I am trying to resample 1 min bars to 5 min but I am getting incorrect results.
1 min data:
I am using this to resample:
df2.resample("5min").agg({'open':'first',
'high':'max',
'low':'min',
'close':'last'})
I get:
For the second row bar (00:00:00) the high should be 110.34 not 110.35, and the close should be 110.33.
How do I fix this?
EDIT 1: To create the data:
import datetime
import pandas as pd
idx = pd.date_range("2021-09-23 23:55", periods=11, freq="1min")
df = pd.DataFrame(index = idx)
data = [110.34,
110.33,110.34,110.33,110.33,110.33,
110.32,110.35,110.34,110.32,110.33,
]
df['open'] = data
df['high'] = data
df['low'] = data
df['close'] = data
df2 = df.resample("5min").agg({'open':'first',
'high':'max',
'low':'min',
'close':'last'})
print(df)
print("----")
print(df2)
We can specify the closed='right' and label='right' optional keyword arguments, so that each 5-minute bin includes its right edge (e.g. 23:56 through 00:00) instead of the default left-closed bin (00:00 through 00:04), which is what pulls the 00:02 print of 110.35 into the 00:00 bar:
d = {'open':'first','high':'max',
'low':'min','close':'last'}
df.resample("5min", closed='right', label='right').agg(d)
open high low close
2021-09-23 23:55:00 110.34 110.34 110.34 110.34
2021-09-24 00:00:00 110.33 110.34 110.33 110.33
2021-09-24 00:05:00 110.32 110.35 110.32 110.33
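For comparison, the defaults (closed='left', label='left') make the 00:00 bin cover 00:00 through 00:04, so it picks up the 110.35 print at 00:02; that is exactly the result reported in the question. Reusing df and d from above:

print(df.resample("5min").agg(d))
#                        open    high     low   close
# 2021-09-23 23:55:00  110.34  110.34  110.33  110.33
# 2021-09-24 00:00:00  110.33  110.35  110.32  110.32
# 2021-09-24 00:05:00  110.33  110.33  110.33  110.33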
I am getting input as start_date and end_date; now I want to fetch data grouped by the months between start_date and end_date. How can that be done in Python/Django?
I am getting the dates in this format:
start_date = '2021-5-5' #YYYY-MM-DD format
end_date = '2021-6-5'
Required results -
result = [
{
'month' : 'may',
'data' : data_for_may # from date 5th of may to 31st of may
},
{
'month' : 'june',
'data' : data_for_june # from date 1st of june to 5th of june
}
]
I think you're better off doing:
from datetime import datetime
from django.db.models import Count
from django.contrib.auth.models import User
start_date = datetime.strptime('2021-5-5' , '%Y-%m-%d')
month_end_date = datetime.strptime('2021-6-5' , '%Y-%m-%d')
# SELECT year(last_login), month(last_login), count(*)
# FROM auth_user
# GROUP BY year(last_login), month(last_login)
# ORDER BY year(last_login), month(last_login)
qs = (User.objects.values('last_login__month', 'last_login__year')
.annotate(data=Count('*'))
.order_by('last_login__year', 'last_login__month'))
# WHERE last_login ...
qs = qs.filter(last_login__range=[start_date, month_end_date])
result = []
for item in qs:
result.append({
# get pretty name i.e "January"
'month': datetime(1900, item['last_login__month'] , 1).strftime('%B'),
'data': item['data']
})
result # [{'month': 'May', 'data': 81}, {'month': 'June', 'data': 15}]
Why do I think this is better? (over the other answers provided)
You will only have 1 record PER month PER year, easy to quantify/predict, better on performance.
I wrote tests for you by the way ;)
https://gist.github.com/kingbuzzman/0197da03c52ae9a798c99d0cf58c758c#file-month_data-py-L82-L133
As a comment inside the gist, I provide examples on how to test it using docker
Depending on how much data you have, I would fetch all data in a single query, ordered by datetime, and then group them in Python. The following snippet illustrates that idea.
from itertools import groupby

from django.contrib.auth.models import User

data = User.objects.order_by('last_login')

result = []
for (year, month), data_per_month in groupby(data, key=lambda x: (x.last_login.year, x.last_login.month)):
    result.append({
        'year': year,
        'month': month,
        'data': list(data_per_month)  # materialise the group before groupby moves on
    })
This will probably be fast and fit easily in memory with tens of thousands of objects. When fetching millions of records, though, you might need to reconsider.
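If the table is much larger, a variation on the same idea (a sketch, not tested against a real schema) is to fetch only the timestamps with values_list and stream them through groupby; here data becomes a per-month count instead of the model instances:

from itertools import groupby

from django.contrib.auth.models import User

# Fetch only the column we group on, streamed from the database.
logins = (User.objects.exclude(last_login=None)
          .order_by('last_login')
          .values_list('last_login', flat=True)
          .iterator())

result = [
    {'year': year, 'month': month, 'data': len(list(group))}
    for (year, month), group in groupby(logins, key=lambda dt: (dt.year, dt.month))
]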
First, you need to convert your strings into dates:
start_date = datetime.strptime(start_date, '%Y-%m-%d')
end_date = datetime.strptime(end_date, '%Y-%m-%d')
Then you can iterate over each month to populate your results array:
from datetime import datetime

from dateutil.relativedelta import relativedelta

start_date = datetime.strptime(start_date, '%Y-%m-%d')
end_date = datetime.strptime(end_date, '%Y-%m-%d')

all_data = User.objects.filter(last_login__date__range=[start_date, end_date])

results = []
while start_date.year < end_date.year or start_date.month <= end_date.month:
    results.append({
        'month': start_date.strftime('%B'),
        'year': start_date.strftime('%Y'),
        'data': all_data.filter(last_login__date__month=start_date.month,
                                last_login__date__year=start_date.year)
    })
    # Increment start_date by one month using the dateutil library
    start_date = start_date + relativedelta(months=1)
NOTE: I edited my first idea for this solution and tested it in my Django project.
I have a df describing transactions like
transaction start_in_s_since_epoch duration_in_s charged_energy_in_wh
1 1.457423e+09 1821.0 1732
2 1.457389e+09 35577.0 18397
3 1.457425e+09 2.0 0
[...]
I assume the charged_energy is linear through the transaction. I would like to transform it to a time series with the granularity of a day. charged_energy within a day should be summed up as well as duration.
day sum_duration_in_s sum_charged_energy_in_wh
2016-03-16 00:00 123 456
2016-03-17 00:00 456 789
2016-03-18 00:00 789 012
[...]
Any idea? I am struggling with the borders between days. This transaction with
transaction start_in_s_since_epoch duration_in_s charged_energy_in_wh
500 1620777300 600 1000
should be divided equally between the two days:
day sum_duration_in_s sum_charged_energy_in_wh
2021-05-11 00:00 300 500
2021-05-12 00:00 300 500
This did it for me. Slow af but works:
import pandas as pd
from datetime_truncate import truncate

df_tmp = pd.DataFrame()
for index, row in df.iterrows():
day_in_s = 60*60*24
start = row.start_in_s_since_epoch
time = row.duration_in_s
energy_per_s = row.charged_energy_in_wh / row.duration_in_s
till_midnight_in_s = truncate(pd.to_datetime(start + day_in_s, unit='s'), 'day').timestamp() - start
rest_in_s = time - till_midnight_in_s
data = {'day':truncate(pd.to_datetime(start, unit='s'), 'day'),
'sum_duration_in_s':min(time, till_midnight_in_s),
'sum_charged_energy_in_wh':min(time, till_midnight_in_s) * energy_per_s}
df_tmp = df_tmp.append(data, ignore_index=True)
while rest_in_s > 0:
start += day_in_s
data = {'day':truncate(pd.to_datetime(start, unit='s'), 'day'),
'sum_duration_in_s':min(rest_in_s, day_in_s),
'sum_charged_energy_in_wh':min(rest_in_s, day_in_s) * energy_per_s}
df_tmp = df_tmp.append(data, ignore_index=True)
rest_in_s = rest_in_s - day_in_s
df_ts = df_tmp.groupby(['day']).agg({'sum_charged_energy_in_wh':sum,
                                     'sum_duration_in_s':sum}).sort_values('day')
df_ts = df_ts.asfreq('D', fill_value=0)
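Most of the slowness comes from calling DataFrame.append inside the loop (it was also removed in pandas 2.0). A sketch of the same logic, under the same column assumptions, that collects plain dicts in a list and builds the frame once at the end:

import pandas as pd
from datetime_truncate import truncate

day_in_s = 60 * 60 * 24
rows = []

for row in df.itertuples():
    start = row.start_in_s_since_epoch
    time_left = row.duration_in_s
    energy_per_s = row.charged_energy_in_wh / row.duration_in_s

    while time_left > 0:
        # Seconds from `start` until the next midnight (UTC).
        till_midnight_in_s = truncate(pd.to_datetime(start + day_in_s, unit='s'), 'day').timestamp() - start
        chunk = min(time_left, till_midnight_in_s)
        rows.append({'day': truncate(pd.to_datetime(start, unit='s'), 'day'),
                     'sum_duration_in_s': chunk,
                     'sum_charged_energy_in_wh': chunk * energy_per_s})
        start += chunk
        time_left -= chunk

df_ts = (pd.DataFrame(rows)
           .groupby('day')
           .sum()
           .sort_index()
           .asfreq('D', fill_value=0))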
I have two date columns and I want to subtract them based on conditions: first check the first column for blanks, then check the second column for blanks, and then check whether the difference between the dates is less than one day. If any of these conditions holds, the result should be null; otherwise carry out the subtraction of the two columns. Something like this:
'''if [Recommendation signed] = null or [Executed Date] = null or Duration.Days([Contract Executed Date]-[Recommendation signed]) < 1 then null else Duration.Days([Contract Executed Date]-[Recommendation signed])'''
You can do that using the apply function. For example, suppose you want to store the result in a new column called day difference.
Make sure these are datetime columns (if they're not, apply the to_datetime function first).
from datetime import timedelta

df['Recommendation signed'] = pd.to_datetime(df['Recommendation signed']).dt.date
df['Executed Date'] = pd.to_datetime(df['Executed Date']).dt.date
df['Contract Executed Date'] = pd.to_datetime(df['Contract Executed Date']).dt.date

def substract_columns(row):
    if (pd.isnull(row['Recommendation signed'])
            or pd.isnull(row['Executed Date'])
            or (row['Contract Executed Date'] - row['Recommendation signed']) < timedelta(days=1)):
        return None
    else:
        return row['Contract Executed Date'] - row['Recommendation signed']

df['day difference'] = df.apply(substract_columns, axis=1)
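To see the three branches on a tiny, made-up frame (hypothetical values, run through the conversion and apply above):

import pandas as pd

df = pd.DataFrame({
    'Recommendation signed':  ['2021-01-01', None,         '2021-03-01'],
    'Executed Date':          ['2021-01-05', '2021-02-05', '2021-03-01'],
    'Contract Executed Date': ['2021-01-05', '2021-02-05', '2021-03-01'],
})

# After running the code above:
# row 0 -> 4 days (all dates present, gap of at least one day)
# row 1 -> None   (blank 'Recommendation signed')
# row 2 -> None   (difference is less than one day)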
Hope this helps.
Here's one way to do it. Since no data was provided I created my own generator. The solution is contained within find_duration and how it is used in df.apply(find_duration, axis=1).
from datetime import datetime, timedelta
from itertools import islice
import numpy as np
import pandas as pd
RECOMMENDATION_IS_PENDING = "RECOMMENDATION_IS_PENDING"
EXECUTION_IS_PENDING = "EXECUTION_IS_PENDING"
COMPLETED_IN_LESS_THAN_ONE_DAY = "COMPLETED_IN_LESS_THAN_ONE_DAY"
COMPLETED_IN_MORE_THAN_ONE_DAY = "COMPLETED_IN_MORE_THAN_ONE_DAY"
MIN_YEAR = 1900
MAX_YEAR = 2020
NUM_YEARS = MAX_YEAR - MIN_YEAR + 1
START_DATE = datetime(MIN_YEAR, 1, 1, 00, 00, 00)
END_DATE = START_DATE + timedelta(days=365 * NUM_YEARS)
NUM_RECORDS = 20
def random_datetime(rng, dt):
return START_DATE + (END_DATE - START_DATE) * rng.uniform()
def less_than_one_day(rng, dt):
hours = int(np.round(23.0 * rng.uniform()))
return dt + timedelta(hours=hours)
def more_than_one_day(rng, dt):
days = 1 + int(np.round(100.0 * rng.uniform()))
return dt + timedelta(days=days)
def null_datetime(rng, dt):
return None
class RecordGenerator:
PROBABILITIES = {
RECOMMENDATION_IS_PENDING: 0.1,
EXECUTION_IS_PENDING: 0.2,
COMPLETED_IN_LESS_THAN_ONE_DAY: 0.2,
COMPLETED_IN_MORE_THAN_ONE_DAY: 0.5,
}
GENERATORS = {
RECOMMENDATION_IS_PENDING: (null_datetime, random_datetime),
EXECUTION_IS_PENDING: (random_datetime, null_datetime),
COMPLETED_IN_LESS_THAN_ONE_DAY: (random_datetime, less_than_one_day),
COMPLETED_IN_MORE_THAN_ONE_DAY: (random_datetime, more_than_one_day),
}
def __init__(self, seed=0):
self.rng = np.random.RandomState(seed)
def __iter__(self):
while True:
res = self.rng.uniform()
for kind, val in self.PROBABILITIES.items():
res -= val
if res <= 0.0:
break
recommendation_signed_fn, execution_date_fn = self.GENERATORS[kind]
recommendation_signed = recommendation_signed_fn(self.rng, None)
execution_date = execution_date_fn(self.rng, recommendation_signed)
yield recommendation_signed, execution_date
def find_duration(df):
duration = df["execution_date"] - df["recommendation_signed"]
if duration is pd.NaT or duration < pd.Timedelta(days=1):
return None
return duration
if __name__ == "__main__":
records = RecordGenerator()
recommendation_signed_dates, execution_dates = zip(*islice(records, NUM_RECORDS))
df = pd.DataFrame.from_dict({
"recommendation_signed": recommendation_signed_dates,
"execution_date": execution_dates,
})
print(f"`recommendation_signed` is null: [{df['recommendation_signed'].isnull().sum()}]")
print(f"`execution_date` is null: [{df['execution_date'].isnull().sum()}]")
print(f"`completed_in_less_than_one_day`: [{((df['execution_date'] - df['recommendation_signed']) < pd.Timedelta(days=1)).sum()}]")
print(f"`completed_in_more_than_one_day`: [{((df['execution_date'] - df['recommendation_signed']) >= pd.Timedelta(days=1)).sum()}]")
df["completion_time"] = df.apply(find_duration, axis=1)
print(df)
Output:
`recommendation_signed` is null: [2]
`execution_date` is null: [2]
`completed_in_less_than_one_day`: [4]
`completed_in_more_than_one_day`: [12]
recommendation_signed execution_date completion_time
0 1986-06-25 08:07:14.808395 1986-08-25 08:07:14.808395 61 days
1 1951-03-25 17:08:27.986156 1951-05-30 17:08:27.986156 66 days
2 2007-11-01 03:42:35.672304 2007-11-02 01:42:35.672304 NaT
3 1995-09-26 12:52:16.917964 1995-09-27 00:52:16.917964 NaT
4 2011-12-03 23:24:45.808880 2011-12-11 23:24:45.808880 8 days
5 NaT 1902-06-12 22:41:33.183052 NaT
6 1994-02-04 07:01:47.052493 1994-05-03 07:01:47.052493 88 days
7 1996-08-19 20:06:42.217770 1996-10-05 20:06:42.217770 47 days
8 1914-04-21 14:09:37.598524 1914-06-25 14:09:37.598524 65 days
9 2014-03-25 07:15:55.137157 NaT NaT
10 1950-02-21 13:04:11.684479 1950-03-20 13:04:11.684479 27 days
11 1955-02-27 21:06:22.090510 1955-04-26 21:06:22.090510 58 days
12 NaT 1974-09-07 20:55:17.329968 NaT
13 1974-08-07 21:21:33.578522 1974-11-10 21:21:33.578522 95 days
14 1943-06-22 15:59:39.451885 1943-08-06 15:59:39.451885 45 days
15 1907-04-14 20:35:27.269379 1907-06-21 20:35:27.269379 68 days
16 1925-06-10 13:05:57.968982 1925-06-24 13:05:57.968982 14 days
17 1943-12-25 06:52:07.566032 1943-12-25 19:52:07.566032 NaT
18 2019-07-07 12:44:00.201327 2019-07-07 14:44:00.201327 NaT
19 1919-07-05 05:38:11.678570 NaT NaT
You could try something like this:
import numpy as np
import pandas as pd
from datetime import timedelta

df['Recommendation signed'] = pd.to_datetime(df['Recommendation signed'], errors='coerce')
df['Contract Executed Date'] = pd.to_datetime(df['Contract Executed Date'], errors='coerce')
df['date_difference'] = np.where(df['Recommendation signed'].isnull() | df['Contract Executed Date'].isnull() | ((df['Contract Executed Date'] - df['Recommendation signed']) < timedelta(days=1)), np.timedelta64('NaT'), df['Contract Executed Date'] - df['Recommendation signed'])
I have a dataframe that records concentrations for several different locations in different years, with a high temporal frequency (<1 hour). I am trying to make a bar/multibar plot showing mean concentrations at different locations in different years.
To calculate mean concentration, I have to apply quality control filters to daily and monthly data.
My approach is to first apply filters and resample per year and then do the grouping by location and year.
Also, out of all the locations (in the column titled location) I have to choose only a few rows, so I am slicing the original dataframe and creating a new dataframe with the selected rows.
I am not able to achieve this using the following code:
date=df['date']
location = df['location']
df.date = pd.to_datetime(df.date)
year=df.date.dt.year
df=df.set_index(date)
df['Year'] = df['date'].map(lambda x: x.year )
#Location name selection/correction in each city:
#Changing all stations:
df['location'] = df['location'].map(lambda x: "M" if x == "mm" else x)
#New dataframe:
df_new = df[(df['location'].isin(['K', 'L', 'M']))]
#Data filtering:
df_new = df_new[df_new['value'] >= 0]
df_new.drop(df_new[df_new['value'] > 400].index, inplace = True)
df_new.drop(df_new[df_new['value'] <2].index, inplace = True)
diurnal = df_new[df_new['value']].resample('12h')
diurnal_mean = diurnal.mean()[diurnal.count() >= 9]
daily_mean=diurnal_mean.resample('d').mean()
df_month=daily_mean.resample('m').mean()
df_yearly=df_month[df_month['value']].resample('y')
#For plotting:
df_grouped = df_new.groupby(['location', 'Year']).agg({'value':'sum'}).reset_index()
sns.barplot(x='location',y='value',hue='Year',data= df_grouped)
This is one of the many errors that cropped up:
"None of [Float64Index([22.73, 64.81, 8.67, 19.98, 33.12, 37.81, 39.87, 42.29, 37.81,\n 36.51,\n ...\n 11.0, 40.0, 23.0, 80.0, 50.0, 60.0, 40.0, 80.0, 80.0,\n 17.0],\n dtype='float64', length=63846)] are in the [columns]"
ERROR:root:Invalid alias: The name clear can't be aliased because it is another magic command.
This is a sample dataframe showing what I need to plot; the value column should ideally hold the resampled values, after the quality-control operations.
                            Unnamed: 0 location  value
date
2017-10-21 08:45:00+05:30         8335        M  339.3
2017-08-18 17:45:00+05:30         8344        M   45.1
2017-11-08 13:15:00+05:30         8347        L  594.4
2017-10-21 13:15:00+05:30         8659        N  189.9
2017-08-18 15:45:00+05:30         8662        N   46.5
This is how a part of the actual data should look after selecting the chosen locations. I am a new user so I cannot attach a screenshot of the graph I need. This query is an extension of the query I posted earlier, with the additional requirement of plotting resampled data instead of simple value counts: Iteration over years to plot different group values as bar plot in pandas
Any help will be much appreciated.
Fundamentally, your errors come from this unclear indexing, where you pass continuous float values of one column for row-wise selection against an index that is currently a datetime type.
df_new[df_new['value']] # INDEXING DATETIME USING FLOAT VALUES
...
df_month[df_month['value']] # COLUMN value DOES NOT EXIST
Possibly, you meant to select the column value (out of the others) during resampling.
diurnal = df_new['value'].resample('12h')
diurnal_mean = diurnal.mean()[diurnal.count() >= 9]
daily_mean = diurnal_mean.resample('d').mean()
df_month = daily_mean.resample('m').mean()  # no ['value'] here; daily_mean is already a Series
df_yearly = df_month.resample('y')
However, nowhere above do you retain location for plotting. Hence, instead of resample, use groupby(pd.Grouper(...)):
# AGGREGATE TO KEEP LOCATION AND 12h
diurnal = (df_new.groupby(["location", pd.Grouper(freq='12h')])["value"]
.agg(["count", "mean"])
.reset_index().set_index(['date'])
)
# FILTER
diurnal_sub = diurnal[diurnal["count"] >= 9]
# MULTIPLE DATE TIME LEVEL MEANS
daily_mean = diurnal_sub.groupby(["location", pd.Grouper(freq='d')])["mean"].mean()
df_month = diurnal_sub.groupby(["location", pd.Grouper(freq='m')])["mean"].mean()
df_yearly = diurnal_sub.groupby(["location", pd.Grouper(freq='y')])["mean"].mean()
print(df_yearly)
To demonstrate with random, reproducible data:
Data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
np.random.seed(242020)
random_df = pd.DataFrame({'date': (np.random.choice(pd.date_range('2017-01-01', '2019-12-31'), 5000) +
pd.to_timedelta(np.random.randint(60*60, 60*60*24, 5000), unit='s')),
'location': np.random.choice(list("KLM"), 5000),
'value': np.random.uniform(10, 1000, 5000)
})
Aggregation
loc_list = list("KLM")
# NEW DATA FRAME WITH DATA FILTERING
df = (random_df.set_index(random_df['date'])
.assign(Year = lambda x: x['date'].dt.year,
location = lambda x: x['location'].where(x["location"] != "mm", "M"))
.query('(location == @loc_list) and (value >= 2 and value <= 400)')
)
# 12h AGGREGATION
diurnal = (df.groupby(["location", pd.Grouper(freq='12h')])["value"]
.agg(["count", "mean"])
.reset_index().set_index(['date'])
.query("count >= 2")
)
# d, m, y AGGREGATION
daily_mean = diurnal.groupby(["location", pd.Grouper(freq='d')])["mean"].mean()
df_month = diurnal.groupby(["location", pd.Grouper(freq='m')])["mean"].mean()
df_yearly = (diurnal.groupby(["location", pd.Grouper(freq='y')])["mean"].mean()
.reset_index()
.assign(Year = lambda x: x["date"].dt.year)
)
print(df_yearly)
# location date mean Year
# 0 K 2017-12-31 188.984592 2017
# 1 K 2018-12-31 199.521702 2018
# 2 K 2019-12-31 216.497268 2019
# 3 L 2017-12-31 214.347873 2017
# 4 L 2018-12-31 199.232711 2018
# 5 L 2019-12-31 177.689221 2019
# 6 M 2017-12-31 222.412711 2017
# 7 M 2018-12-31 241.597977 2018
# 8 M 2019-12-31 215.554228 2019
Plotting
sns.set()
fig, axs = plt.subplots(figsize=(12,5))
sns.barplot(x='location', y='mean', hue='Year', data= df_yearly, ax=axs)
plt.title("Location Value Yearly Aggregation", weight="bold", size=16)
plt.show()
plt.clf()
plt.close()