How to merge as-of for date but left-join for another column? - python-3.x

I have the following dataframes:
import datetime
import pandas as pd

assets = pd.DataFrame(columns=['date', 'asset'],
                      data=[[datetime.date(2022, 10, 21), 'SPY'],
                            [datetime.date(2022, 10, 21), 'FTSE'],
                            [datetime.date(2022, 11, 12), 'SPY'],
                            [datetime.date(2022, 11, 12), 'FTSE']])
prices = pd.DataFrame(columns=['date', 'asset', 'price'],
                      data=[[datetime.date(2022, 10, 11), 'SPY', 10],
                            [datetime.date(2022, 10, 11), 'FTSE', 5],
                            [datetime.date(2022, 11, 8), 'SPY', 100],
                            [datetime.date(2022, 11, 8), 'FTSE', 50]])
For each asset I want to get the as-of price (the price at the nearest date). How can I do that?
If I have only one asset it is easy:
assets_spy = assets  # .loc[assets['asset'] == 'SPY']
prices_spy = prices  # .loc[prices['asset'] == 'SPY']
assets_spy.index = pd.to_datetime(assets_spy['date'])
prices_spy.index = pd.to_datetime(prices_spy['date'])
merged = pd.merge_asof(assets_spy.sort_index(),
                       prices_spy.sort_index(),
                       direction='nearest',
                       right_index=True, left_index=True)
but if I follow the same logic for multiple assets, the rows are not matched per asset.

The function pandas.merge_asof has an optional parameter named by that matches on a column (or a list of columns) before performing the as-of merge. You could therefore adapt your code like this:
import pandas as pd
import datetime

assets = pd.DataFrame(columns=['date', 'asset'],
                      data=[[datetime.date(2022, 10, 21), 'SPY'],
                            [datetime.date(2022, 10, 21), 'FTSE'],
                            [datetime.date(2022, 11, 12), 'SPY'],
                            [datetime.date(2022, 11, 12), 'FTSE']])
prices = pd.DataFrame(columns=['date', 'asset', 'price'],
                      data=[[datetime.date(2022, 10, 11), 'SPY', 10],
                            [datetime.date(2022, 10, 11), 'FTSE', 5],
                            [datetime.date(2022, 11, 8), 'SPY', 100],
                            [datetime.date(2022, 11, 8), 'FTSE', 50]])

merged = pd.merge_asof(
    assets.astype({'date': 'datetime64[ns]'}).sort_values('date').convert_dtypes(),
    prices.astype({'date': 'datetime64[ns]'}).sort_values('date').convert_dtypes(),
    direction='nearest',
    on='date',
    by='asset',
)
merged

# Returns:
#
#         date asset  price
# 0 2022-10-21   SPY     10   <-- merged using the SPY price from 2022-10-11
# 1 2022-10-21  FTSE      5   <-- merged using the FTSE price from 2022-10-11
# 2 2022-11-12   SPY    100   <-- merged using the SPY price from 2022-11-08
# 3 2022-11-12  FTSE     50   <-- merged using the FTSE price from 2022-11-08

Related

How to display data across, by row, in pie chart in plotly/streamlit?

I have a pandas df that looks like this, which I want to display as a dashboard:
fname  col1  col2  col3  sum
A      2     3     3     10
B      1     2     3     12
C      6     6     3     13
If an fname is selected, I want to display the pie slices as that row's column values.
What is the best way to display the data for an fname, taken across its row, in a pie chart?
I am also not sure what to display when all the column values for fname are selected.
I tried creating a sunburst chart like so, but the chart is extremely convoluted:
px.sunburst(df, values='sum',
            path=['col3', 'col2', 'col1', 'fname'],
            title='pie')
Here is a basic example.
import plotly.express as px
import pandas as pd
import streamlit as st

data = {
    'ctry': ['USA', 'PHI', 'CHN'],
    'gold': [12, 1, 20],
    'silver': [4, 4, 12],
    'bronze': [8, 2, 30],
    'sum': [24, 7, 62],
}
df = pd.DataFrame(data)
st.dataframe(df)

cols = st.columns([1, 1])

with cols[0]:
    medal_type = st.selectbox('Medal Type', ['gold', 'silver', 'bronze'])
    fig = px.pie(df, values=medal_type, names='ctry',
                 title=f'number of {medal_type} medals',
                 height=300, width=200)
    fig.update_layout(margin=dict(l=20, r=20, t=30, b=0))
    st.plotly_chart(fig, use_container_width=True)

with cols[1]:
    st.text_input('sunburst', label_visibility='hidden', disabled=True)
    fig = px.sunburst(df, path=['ctry', 'gold', 'silver', 'bronze'],
                      values='sum', height=300, width=200)
    fig.update_layout(margin=dict(l=20, r=20, t=30, b=0))
    st.plotly_chart(fig, use_container_width=True)
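To try it out, save the snippet as a script (the file name app.py below is just a placeholder) and launch it with the Streamlit CLI:
streamlit run app.py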

Cannot get any Bokeh graphs to show up when the check boxes are ticked

I have tried the code below to generate Bokeh graphs for each element when their respective check boxes are ticked.
data is a df with columns like 'id', 'data_point', 'max', 'min'.
I expect that if data_point is empty or None, the graph should not appear; otherwise, when a check box beside an id is ticked, a graph should appear for whichever id is selected.
This should happen for every unique id.
data = df
if len(data) == 0:
    logging.debug("No data")
else:
    req_hover = HoverTool(
        tooltips=[('data_point: ', '@data_point'),
                  ('id: ', '@id')])
    req_figure = figure(title='Graph', x_axis_label="Index", y_axis_label="id/data_point [ms]",
                        plot_width=800, plot_height=400, output_backend="webgl")
    req_figure.add_tools(req_hover)
    id_values = data['id'].drop_duplicates()
    # 'columns', 'elements' and 'callback_file_content_mq' are defined elsewhere in the script
    column_view_data = data[['id', 'tag', 'sw', 'name']].drop_duplicates().dropna()
    column_view_data_source = ColumnDataSource(column_view_data)
    data_table = DataTable(selectable='checkbox', source=column_view_data_source, columns=columns,
                           width=1450, height=400, css_classes=['blueTable'])
    name_dict_req = {'name': [], 'legend': [], 'label': []}
    logging.info('START OF DRAWINGS')
    for ind, element in enumerate(elements):
        d = []
        for i, item in data.iterrows():
            it_color = Turbo256[random.randint(0, 255)]
            if element == item['id']:
                if item['data_point'] is None or str(item['data_point']) == '[]':
                    item['data_point'] = '0'
                else:
                    item['data_point'] = [float(x) for x in
                                          item['data_point'].strip('[').strip(']').split(',')]
                raw_data_element_count = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                          16, 17, 18, 19, 20]  # this is my x axis which will be 1 to 20
                d.append({'id': item['id'],
                          'number of raw data points': raw_data_element_count,
                          'data_point': item['data_point']})
                d_new = pd.DataFrame(d)
                name_glyph_req = req_figure.line(x='number of raw data points',
                                                 y='data_point',
                                                 line_width=2,
                                                 legend_label=str(item['id']),
                                                 source=d_new,
                                                 color=it_color)
                name_dict_req['name'].append(name_glyph_req)
                name_dict_req['label'].append(str(item['id']))
    logging.info('AFTER DRAWINGS LOOP')
    for label in range(len(data.id.unique())):
        name_dict_req['legend'].append(req_figure.legend.items[label])
    initial_value = []
    options = list(data.id.unique())
    for i, name in enumerate(options):
        options[i] = str(name)
    for i in range(len(options)):
        if name_dict_req['label'][i] in initial_value:
            name_dict_req['name'][i].visible = True
            name_dict_req['legend'][i].visible = True
        else:
            name_dict_req['name'][i].visible = False
            name_dict_req['legend'][i].visible = False
    #########################
    callback_datatable = CustomJS(args=dict(source=column_view_data_source), code=callback_file_content_mq)
    ###################
    column_view_data_source.selected.js_on_change('indices', callback_datatable)
    req_figure.legend.location = "top_left"
    req_figure.legend.click_policy = "hide"
    logging.info('END DRAWINGS END SETUP')
I think the problem was that I did not pass the name dict into the CustomJS call in my code, so even when I clicked the check boxes the graphs did not show.
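A minimal sketch of what that could look like, assuming the glyph and legend lists built above are passed into the callback (the JavaScript body here is only an illustration, not the original callback_file_content_mq):

# Hypothetical sketch: pass the glyph renderers and legend items into CustomJS
# so the JS side can toggle visibility based on the selected table rows.
callback_datatable = CustomJS(
    args=dict(source=column_view_data_source,
              glyphs=name_dict_req['name'],     # line renderers created in the loop
              legends=name_dict_req['legend'],  # matching legend items
              labels=name_dict_req['label']),   # id labels, same order as the glyphs
    code="""
    // Collect the ids of the rows currently selected in the DataTable,
    // then show only the glyphs (and legend entries) whose label is selected.
    const selected_ids = source.selected.indices.map(i => String(source.data['id'][i]));
    for (let i = 0; i < labels.length; i++) {
        const show = selected_ids.includes(labels[i]);
        glyphs[i].visible = show;
        legends[i].visible = show;
    }
    """)
column_view_data_source.selected.js_on_change('indices', callback_datatable)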

How to push Nested Dict Data to csv file in python?

I am trying to create a CSV file from the data below using Python 3.
I am receiving the data in this dict format from a for loop:
data = [{('Apr', 'Apurv'): 10, ('Mar', 'Apurv'): 58}, {('Apr', 'Smriti'): 12, ('Mar', 'Smriti'): 70}, {('Apr', 'Pankhuri'): 12, ('Mar', 'Pankhuri'): 73}, {('Apr', 'Parth'): 21, ('Mar', 'Parth'): 101}]
I need to fetch the header and rows from data so that I can push the data to CSV.
header = ["Name", 'Apr', 'March'] # This could be more depending on the months present in data
And the rows should look like:
row1 = ["Apurv", 10, 58]
row2 = ["Smriti", 12, 70]
row3 = ["Pankhuri", 12, 73]
row4 = ["Parth", 21, 101]
rows = [row1, row2, row3, row4]
with open("smriti.csv", 'w') as csvfile:
csvwriter = csv.writer(csvfile)
csvwriter.writerow(header)
csvwriter.writerow(rows)
I need to put each name and month in separate columns and insert one row per name with the count values corresponding to it.
I am trying to use the csv library in Python.
Can anybody suggest how I can do this?
Here is some code to experiment with:
import csv

data = [{('Apr', 'Apurv'): 10, ('Mar', 'Apurv'): 58},
        {('Apr', 'Smriti'): 12, ('Mar', 'Smriti'): 70},
        {('Apr', 'Pankhuri'): 12, ('Mar', 'Pankhuri'): 73},
        {('Apr', 'Parth'): 21, ('Mar', 'Parth'): 101}]

header = ["Name"]
result = []
for item in data:
    line = {}
    for index, element in enumerate(item):
        # Collect header elements
        if element[0] not in header:
            header.append(element[0])
        # Fill table content
        if index == 0:
            line["Name"] = element[1]
        line[element[0]] = item[element[0], element[1]]
        if index == 1:
            result.append(line)

# Double-check the data
print(header)
print(result)

# Create the csv file
with open('smriti.csv', 'w') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=header)
    writer.writeheader()
    writer.writerows(result)
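With the sample data above, the resulting smriti.csv should come out roughly like this (the month columns appear in the order they are first seen in the dicts):
Name,Apr,Mar
Apurv,10,58
Smriti,12,70
Pankhuri,12,73
Parth,21,101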

How to filter a time series so that data exists at least once every 6 hours?

I'd like to verify that there is data at least once every 6 hours per ID, and filter out the IDs that do not meet this criterion.
Essentially a filter: "if an ID does not have data at least every 6h, drop that ID from the dataframe".
I tried to use the same method I use for filtering once per day, but I am having trouble adapting the code.
# add day column from datetime index
df['1D'] = df.index.day
# reset index
daily = df.reset_index()
# count per ID per day; result is the non-zero count per ID
a = daily.groupby(['1D', 'id']).size()
# filter by right join
filtered = a.merge(df, on="id", how='right')
I cannot figure out how to adapt this for the following 6hr periods each day: 00:01-06:00, 06:01-12:00, 12:01-18:00, 18:01-24:00.
Group by id, integer-divide the hour by 6, and count the unique values. In your case the count should be greater than or equal to 4, because there are four 6-hour bins in 24 hours and each day has 4 unique bins, i.e.
Bins = 4
00:01-06:00
06:01-12:00
12:01-18:00
18:01-24:00
Code
mask = df.groupby('id')['date'].transform(lambda x: (x.dt.hour // 6).nunique() >= 4)
df = df[mask]
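A small self-check of this approach on assumed sample data (mirroring the test frame used in the next answer: one id covering all four bins, one id with a single observation):

import pandas as pd
from datetime import datetime

df = pd.DataFrame({
    'date': [datetime(2020, 1, 1, 1), datetime(2020, 1, 1, 7),
             datetime(2020, 1, 1, 13), datetime(2020, 1, 1, 19),
             datetime(2020, 1, 1, 1)],
    'id': [1, 1, 1, 1, 2],
})
# id 1 hits hour bins 0, 1, 2 and 3; id 2 only hits bin 0 and is dropped
mask = df.groupby('id')['date'].transform(lambda x: (x.dt.hour // 6).nunique() >= 4)
print(df[mask])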
I propose to use pivot_table with resample, which allows resampling to arbitrary frequencies. Please see the comments for further explanation.
import pandas as pd
from datetime import datetime

# build test data. I need a dummy column to use pivot_table later.
# Any column with numerical values will suffice.
data = [[datetime(2020, 1, 1, 1), 1, 1],
        [datetime(2020, 1, 1, 6), 1, 1],
        [datetime(2020, 1, 1, 12), 1, 1],
        [datetime(2020, 1, 1, 18), 1, 1],
        [datetime(2020, 1, 1, 1), 2, 1]]
df = pd.DataFrame.from_records(data=data, columns=['date', 'id', 'dummy'])
df = df.set_index('date')

# We need a helper dataframe df_tmp.
# Transform id entries to columns. Resample with 6h = 360 minutes = 360T.
# Take mean() because it will produce NaN values for missing periods.
# WARNING: it will only work if at least one id has observations for every 6h.
df_tmp = pd.pivot_table(df, columns='id', index=df.index).resample('360T').mean()

# Drop the column MultiIndex and drop all columns with NaN values
df_tmp.columns = df_tmp.columns.get_level_values(1)
df_tmp.dropna(axis=1, inplace=True)

# Keep only the rows of the original dataframe whose id survived
mask_id = df.id.isin(df_tmp.columns.to_list())
df = df[mask_id]
I kept your requirements on the timestamps, but I believe you actually want to use the commented-out lines in my solution.
import pandas as pd

period = pd.to_datetime(['2020-01-01 00:01:00', '2020-01-01 06:00:00'])
# period = pd.to_datetime(['2020-01-01 00:00:00', '2020-01-01 06:00:00'])
shift = pd.to_timedelta(['6H', '6H'])

id_with_data = set(df['ID'])
for k in range(4):  # for a day (00:01 --> 24:00)
    period_mask = (period[0] <= df.index) & (df.index <= period[1])
    # period_mask = (period[0] <= df.index) & (df.index < period[1])
    present_ids = set(df.loc[period_mask, 'ID'])
    id_with_data = id_with_data.intersection(present_ids)
    period += shift

df = df.loc[df['ID'].isin(list(id_with_data))]

How do I shorten this piece of code for the mean values of each row in Excel?

Currently this piece of code works, but it seems to repeat itself, so I don't think it's efficient.
First I write out the header names.
Then I save the file and open it as df. Is this necessary?
The df['Average'] part calculates the mean of each row. Without this part Python doesn't show the "Average" column.
However, without the summary_ave_data part the "Average" values are not written to Excel.
sheet1.write(0, 0, "Number")
sheet1.write(0, 1, "Value 1")
sheet1.write(0, 2, "Value 2")
sheet1.write(0, 3, "Average")
book.save('C://Users//user/Desktop/excel_trial1.xls')

df = pd.read_excel("C://Users//user/Desktop/excel_trial1.xls", header=0,
                   delim_whitespace=True)
df['Average'] = df.mean(axis=1)
df

summary_ave_data = df.copy()
summary_ave_data['Average'] = summary_ave_data.mean(numeric_only=True, axis=1)
print(summary_ave_data)
summary_ave_data.to_excel('C://Users//user/Desktop/excel_trial1.xls')
I will assume you already have the data in an Excel file. You can rename the columns
once you read the Excel file with pandas:
df = pd.read_excel("C://Users//user/Desktop/excel_trial1.xls")
df.columns = ['col_name_1', 'col_name_2', 'col_name_3']
If there are any missing values, replace them with 0:
df = df.fillna(0)
or you can do
df = df.replace(to_replace='Nan', value=0)
Then you can save the mean of the data in each row:
df['mean'] = df.mean(axis=1)
You can run the following test:
df_test = pd.DataFrame({'a': [1, 1], 'b': [3, 4]})
I can rename the columns, calculate the mean and add it as a new column by doing this:
df_test.columns = ['A', 'B']
df_test['mean'] = df_test.mean(axis=1)
df_test
Final output:
   A  B  mean
0  1  3   2.0
1  1  4   2.5
# you can export the data to Excel as follows
writer = pd.ExcelWriter('name_df.xlsx', engine='xlsxwriter')
df_test.to_excel(writer, sheet_name='Sheet1', na_rep='Nan')
writer.save()
