Apply a function to multiple rows of a pandas DataFrame - python-3.x

I'm trying to apply a function to different readings of each measure. Is it possible to do it without transforming the dataframe?
import random
import pandas as pd
df = pd.DataFrame({
'index': sorted(['A', 'B']*3),
'measure': [i for i in range(0,3)]*2,
'reading': [random.random() for i in range(0,6)]
})
index measure reading
0 A 0 0.260492
1 A 1 0.805028
2 A 2 0.548699
3 B 0 0.014042
4 B 1 0.719705
5 B 2 0.398824
How can I apply a function like basic difference to different readings for each index?
Here I assumed function applied to reading 0 and 1. It should be part of the call as I need to calculate it for different values of measure.
Desired output looks like this:
index applied
0 A 0.5445359999999999
1 B 0.705663

Try this
import random
import pandas as pd
import numpy as np
df = pd.DataFrame({
'index': sorted(['A', 'B']*3),
'measure': [i for i in range(0,3)]*2,
'reading': [random.random() for i in range(0,6)]
})
print(df)
# index measure reading
# 0 A 0 0.869707
# 1 A 1 0.120680
# 2 A 2 0.772035
# 3 B 0 0.565548
# 4 B 1 0.577074
# 5 B 2 0.290668
start = 0
stop = 1
# I decided to specify start and stop value separately, the absolute difference is
# calculated via np.sum(). If the difference between start and stop is always 1, you
# can omit the np.sum() call.
df = df.groupby('index').agg(applied=('reading', lambda x: np.sum(np.diff(x)
[start:stop])))
print(df)
# applied
# index
# A -0.749027
# B 0.011526

Related

Append column to data frame with text based on another column value

In this dataframe, how to go about appending a column named "class_name", with a text string, that is based on another column.
x
y
z
not used
Label
-3.8481877
-0.47685334
0.63422906
1.0396314
1
-2.320888
0.65347993
1.1519914
0.12997247
1
1.5827686
1.4119303
-1.7410104
-4.6962333
2
-0.1337152
0.13315737
-1.6648949
-1.4205348
2
-0.4028037
1.332986
1.3618442
0.3292255
1
-0.015517877
1.346349
1.4083523
0.87017965
0
-0.2669228
0.5478992
-0.06730786
-1.5959451
0
-0.03318152
0.3263167
-2.116833
-5.4616213
1
There are the values the new column will take based on the values in the 'Label' column:
0 == 'avocados'
1 == 'apples'
2 == ' grapes
This is my code so far:
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import pandas as pd
df = pd.read_csv('embed1_2.csv')
df.loc[df.y_train == 103, 'class_name'] = 'avocados'
df.loc[df.y_train == 103, 'class_name'] = 'apples'
df.loc[df.y_train == 103, 'class_name'] = 'grapes'
How to get the appended column to show up with the converted text?
Thanks for your help!
create a dictionary and then use map in creating a new columns
dict = {
0 : 'avocados',
1 : 'apples',
2 : 'grapes'
}
df['val']=df['Label'].map(dict)
df
x y z not used Label val
0 -3.848188 -0.476853 0.634229 1.039631 1 apples
1 -2.320888 0.653480 1.151991 0.129972 1 apples
2 1.582769 1.411930 -1.741010 -4.696233 2 grapes
3 -0.133715 0.133157 -1.664895 -1.420535 2 grapes
4 -0.402804 1.332986 1.361844 0.329226 1 apples
5 -0.015518 1.346349 1.408352 0.870180 0 avocados
6 -0.266923 0.547899 -0.067308 -1.595945 0 avocados
7 -0.033182 0.326317 -2.116833 -5.461621 1 apples

Horizontal grouped Barplot with Bokeh using dataframe

I have a Dataframe and I want to group by Type, and then Flag and plot a graph for count of ID and another graph grouped by Type , Flag and sum of Total column in Bokeh.
')
p.hbar(df,
plot_width=800,
plot_height=800,
label='Type',
values='ID',
bar_width=0.4,
group = ' Type', 'Flag'
legend='top_right')
[![Expected Graph ][2]][2]
If it's not possible with Bokeh what other package can I use to get a good looking graph( Vibrant colours with white background)
You can do this with the holoviews library, which uses bokeh as a backend.
import pandas as pd
import holoviews as hv
from holoviews import opts
hv.extension("bokeh")
df = pd.DataFrame({
"type": list("ABABCCAD"),
"flag": list("YYNNNYNY"),
"id": list("DEFGHIJK"),
"total": [40, 100, 20, 60, 77, 300, 60, 50]
})
# Duplicate the dataframe
df = pd.concat([df] * 2)
print(df)
type flag id total
0 A Y 1 40
1 B Y 2 100
2 A N 3 20
3 B N 4 60
4 C N 5 77
5 C Y 6 300
6 A N 7 60
7 D Y 8 50
Now that we have our data, lets work on plotting it:
def mainplot_hook(plot, element):
plot.state.text(
y="xoffsets",
x="total",
text="total",
source=plot.handles["source"],
text_align="left",
y_offset=9,
x_offset=5
)
def sideplot_hook(plot, element):
plot.state.text(
y="xoffsets",
x="count",
text="count",
source=plot.handles["source"],
text_align="left",
y_offset=9,
x_offset=5
)
# Create single bar plot for sum of the total column
total_sum = df.groupby(["type", "flag"])["total"].sum().reset_index()
total_sum_bars = hv.Bars(total_sum, kdims=["type", "flag"], vdims="total")
# Create our multi-dimensional bar plot
all_ids = sorted(df["id"].unique())
counts = df.groupby(["type", "flag"])["id"].value_counts().rename("count").reset_index()
id_counts_hmap = hv.Bars(counts, kdims=["type", "flag", "id"], vdims="count").groupby("type")
main_plot = (total_sum_bars
.opts(hooks=[mainplot_hook],
title="Total Sum",
invert_axes=True)
)
side_plots = (
id_counts_hmap
.redim.values(id=all_ids, flag=["Y", "N"])
.redim.range(count=(0, 3))
.opts(
opts.NdLayout(title="Counts of ID"),
opts.Bars(color="#1F77B4", height=250, width=250, invert_axes=True, hooks=[sideplot_hook]))
.layout("type")
.cols(2)
)
final_plot = main_plot + side_plots
# Save combined output as html
hv.save(final_plot, "my_plot.html")
# Save just the main_plot as html
hv.save(main_plot, "main_plot.html")
As you can see, the code to make plots in holoviews can be a little tricky but it's definitely a tool I would recommend you pick up. Especially if you deal with high dimensional data regularly, it makes plotting it a breeze once you get the syntax down.

Pandas grouping and resampling for a bar plot:

I have a dataframe that records concentrations for several different locations in different years, with a high temporal frequency (<1 hour). I am trying to make a bar/multibar plot showing mean concentrations, at different locations in different years
To calculate mean concentration, I have to apply quality control filters to daily and monthly data.
My approach is to first apply filters and resample per year and then do the grouping by location and year.
Also, out of all the locations (in the column titled locations) I have to choose only a few rows. So, I am slicing the original dataframe and creating a new dataframe with selected rows.
I am not able to achieve this using the following code:
date=df['date']
location = df['location']
df.date = pd.to_datetime(df.date)
year=df.date.dt.year
df=df.set_index(date)
df['Year'] = df['date'].map(lambda x: x.year )
#Location name selection/correction in each city:
#Changing all stations:
df['location'] = df['location'].map(lambda x: "M" if x == "mm" else x)
#New dataframe:
df_new = df[(df['location'].isin(['K', 'L', 'M']))]
#Data filtering:
df_new = df_new[df_new['value'] >= 0]
df_new.drop(df_new[df_new['value'] > 400].index, inplace = True)
df_new.drop(df_new[df_new['value'] <2].index, inplace = True)
diurnal = df_new[df_new['value']].resample('12h')
diurnal_mean = diurnal.mean()[diurnal.count() >= 9]
daily_mean=diurnal_mean.resample('d').mean()
df_month=daily_mean.resample('m').mean()
df_yearly=df_month[df_month['value']].resample('y')
#For plotting:
df_grouped = df_new.groupby(['location', 'Year']).agg({'value':'sum'}).reset_index()
sns.barplot(x='location',y='value',hue='Year',data= df_grouped)
This is one of the many errors that cropped up:
"None of [Float64Index([22.73, 64.81, 8.67, 19.98, 33.12, 37.81, 39.87, 42.29, 37.81,\n 36.51,\n ...\n 11.0, 40.0, 23.0, 80.0, 50.0, 60.0, 40.0, 80.0, 80.0,\n 17.0],\n dtype='float64', length=63846)] are in the [columns]"
ERROR:root:Invalid alias: The name clear can't be aliased because it is another magic command.
This is a sample dataframe, showing what I need to plot; value column should ideally represent resampled values, after performing the quality control operations and resampling.
Unnamed: 0 location value \
date location value
2017-10-21 08:45:00+05:30 8335 M 339.3
2017-08-18 17:45:00+05:30 8344 M 45.1
2017-11-08 13:15:00+05:30 8347 L 594.4
2017-10-21 13:15:00+05:30 8659 N 189.9
2017-08-18 15:45:00+05:30 8662 N 46.5
This is how the a part of the actual data should look like, after selecting the chosen locations. I am a new user so cannot attach a screenshot of the graph I require. This query is an extension of the query I had posted earlier , with the additional requirement of plotting resampled data instead of simple value counts. Iteration over years to plot different group values as bar plot in pandas
Any help will be much appreciated.
Fundamentally, your errors come with this unclear indexing where you are passing continuous, float values of one column for rowwise selection of index which currently is a datetime type.
df_new[df_new['value']] # INDEXING DATETIME USING FLOAT VALUES
...
df_month[df_month['value']] # COLUMN value DOES NOT EXIST
Possibly, you meant to select the column value (out of the others) during resampling.
diurnal = df_new['value'].resample('12h')
diurnal.mean()[diurnal.count() >= 9]
daily_mean = diurnal_mean.resample('d').mean()
df_month = daily_mean.resample('m').mean() # REMOVE value BEING UNDERLYING SERIES
df_yearly = df_month.resample('y')
However, no where above do you retain location for plotting. Hence, instead of resample, use groupby(pd.Grouper(...))
# AGGREGATE TO KEEP LOCATION AND 12h
diurnal = (df_new.groupby(["location", pd.Grouper(freq='12h')])["value"]
.agg(["count", "mean"])
.reset_index().set_index(['date'])
)
# FILTER
diurnal_sub = diurnal[diurnal["count"] >= 9]
# MULTIPLE DATE TIME LEVEL MEANS
daily_mean = diurnal_sub.groupby(["location", pd.Grouper(freq='d')])["mean"].mean()
df_month = diurnal_sub.groupby(["location", pd.Grouper(freq='m')])["mean"].mean()
df_yearly = diurnal_sub.groupby(["location", pd.Grouper(freq='y')])["mean"].mean()
print(df_yearly)
To demonstrate with random, reproducible data:
Data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
np.random.seed(242020)
random_df = pd.DataFrame({'date': (np.random.choice(pd.date_range('2017-01-01', '2019-12-31'), 5000) +
pd.to_timedelta(np.random.randint(60*60, 60*60*24, 5000), unit='s')),
'location': np.random.choice(list("KLM"), 5000),
'value': np.random.uniform(10, 1000, 5000)
})
Aggregation
loc_list = list("KLM")
# NEW DATA FRAME WITH DATA FILTERING
df = (random_df.set_index(random_df['date'])
.assign(Year = lambda x: x['date'].dt.year,
location = lambda x: x['location'].where(x["location"] != "mm", "M"))
.query('(location == #loc_list) and (value >= 2 and value <= 400)')
)
# 12h AGGREGATION
diurnal = (df_new.groupby(["location", pd.Grouper(freq='12h')])["value"]
.agg(["count", "mean"])
.reset_index().set_index(['date'])
.query("count >= 2")
)
# d, m, y AGGREGATION
daily_mean = diurnal.groupby(["location", pd.Grouper(freq='d')])["mean"].mean()
df_month = diurnal.groupby(["location", pd.Grouper(freq='m')])["mean"].mean()
df_yearly = (diurnal.groupby(["location", pd.Grouper(freq='y')])["mean"].mean()
.reset_index()
.assign(Year = lambda x: x["date"].dt.year)
)
print(df_yearly)
# location date mean Year
# 0 K 2017-12-31 188.984592 2017
# 1 K 2018-12-31 199.521702 2018
# 2 K 2019-12-31 216.497268 2019
# 3 L 2017-12-31 214.347873 2017
# 4 L 2018-12-31 199.232711 2018
# 5 L 2019-12-31 177.689221 2019
# 6 M 2017-12-31 222.412711 2017
# 7 M 2018-12-31 241.597977 2018
# 8 M 2019-12-31 215.554228 2019
Plotting
sns.set()
fig, axs = plt.subplots(figsize=(12,5))
sns.barplot(x='location', y='mean', hue='Year', data= df_yearly, ax=axs)
plt.title("Location Value Yearly Aggregation", weight="bold", size=16)
plt.show()
plt.clf()
plt.close()

Use if statement within .str.find()

I would like to know If I have an if statement that looks something like this:
if int(i) > 10:
return 0
else:
return -1
where i is equivalent to a row entry in df["price"] (df is a pandas dataframe) defined as follows:
import pandas as pd
df = pd.DataFrame(columns=["price", "Number"], data=[["10", "07367"], ["20", "08356"], ["9", "07745"]])
how can I use df["price"].str.find(...) together with the above if statement to filter the data by the true condition?
I would like output that looks like the following:
0 -1
1 0
2 -1
I have been struggling with how to implement it, please assist.
Generally its easiest to first convert to optimal dtypes. That way all operations will be quicker - of course, it depends on your application whether this matters. But if things are numbers, let them be numbers (explicit > implicit).
import pandas as pd
df = pd.DataFrame(columns=["price", "Number"], data=[["10", "07367"], ["20", "08356"], ["9", "07745"]])
df['price'] = df.price.astype(int) # or float
df['number'] = df.number.astype(int)
You can then add your criteria as a colum (or just use the output). Apply or map are not so quick, so its better to use the np.where suggested by others or any other comparison that will use numpy under the hood. For example:
df['criteria'] = -1 * (df.price <= 10).astype(int) # quicker to not use map or apply
df.criteria
You could use gt + map:
import pandas as pd
df = pd.DataFrame(columns=["price", "Number"], data=[["10", "07367"], ["20", "08356"], ["9", "07745"]])
result = df.price.astype(int).gt(10).map({False: -1, True: 0})
print(result)
Output
0 -1
1 0
2 -1
Name: price, dtype: int64
Or if you prefer, you could use np.where, as mentioned by #coldspeed in the comments.
import numpy as np
import pandas as pd
df = pd.DataFrame(columns=["price", "Number"], data=[["10", "07367"], ["20", "08356"], ["9", "07745"]])
result = np.where(df.price.astype(int) > 10, 0, -1)
print(result)
Output
[-1 0 -1]
You can use np.where:
df['price'] =df['price'].astype(int)
df['output'] = np.where(df['price']>10, 0, -1)
df
price Number output
0 10 07367 -1
1 20 08356 0
2 9 07745 -1
The syntax is: np.where(condition, valueIfTrue, valueIfFalse)
simply you can use lambda functions
df.price.apply(lambda x : 0 if int(x)>10 else -1)

Python Pandas: bootstrap confidence limits by row rather than entire dataframe

What I am trying to do is to get bootstrap confidence limits by row regardless of the number of rows and make a new dataframe from the output.I currently can do this for the entire dataframe, but not by row. The data I have in my actual program looks similar to what I have below:
0 1 2
0 1 2 3
1 4 1 4
2 1 2 3
3 4 1 4
I want the new dataframe to look something like this with the lower and upper confidence limits:
0 1
0 1 2
1 1 5.5
2 1 4.5
3 1 4.2
The current generated output looks like this:
0 1
0 2.0 2.75
The python 3 code below generates a mock dataframe and generates the bootstrap confidence limits for the entire dataframe. The result is a new dataframe with just 2 values, a upper and a lower confidence limit rather than 4 sets of 2(one for each row).
import pandas as pd
import numpy as np
import scikits.bootstrap as sci
zz = pd.DataFrame([[[1,2],[2,3],[3,6]],[[4,2],[1,4],[4,6]],
[[1,2],[2,3],[3,6]],[[4,2],[1,4],[4,6]]])
print(zz)
x= zz.dtypes
print(x)
a = pd.DataFrame(np.array(zz.values.tolist())[:, :, 0],zz.index, zz.columns)
print(a)
b = sci.ci(a)
b = pd.DataFrame(b)
b = b.T
print(b)
Thank you for any help.
scikits.bootstrap operates by assuming that data samples are arranged by row, not by column. If you want the opposite behavior, just use the transpose, and a statfunction that doesn't combine columns.
import pandas as pd
import numpy as np
import scikits.bootstrap as sci
zz = pd.DataFrame([[[1,2],[2,3],[3,6]],[[4,2],[1,4],[4,6]],
[[1,2],[2,3],[3,6]],[[4,2],[1,4],[4,6]]])
print(zz)
x= zz.dtypes
print(x)
a = pd.DataFrame(np.array(zz.values.tolist())[:, :, 0],zz.index, zz.columns)
print(a)
b = sci.ci(a.T, statfunction=lambda x: np.average(x, axis=0))
print(b.T)
Below is the answer I ended up figuring out to create bootstrap ci by row.
import pandas as pd
import numpy as np
import numpy.random as npr
zz = pd.DataFrame([[[1,2],[2,3],[3,6]],[[4,2],[1,4],[4,6]],
[[1,2],[2,3],[3,6]],[[4,2],[1,4],[4,6]]])
x= zz.dtypes
a = pd.DataFrame(np.array(zz.values.tolist())[:, :, 0],zz.index, zz.columns)
print(a)
def bootstrap(data, num_samples, statistic, alpha):
n = len(data)
idx = npr.randint(0, n, (num_samples, n))
samples = data[idx]
stat = np.sort(statistic(samples, 1))
return (stat[int((alpha/2.0)*num_samples)],
stat[int((1-alpha/2.0)*num_samples)])
cc = list(a.index.values) # informs generator of the number of rows
def bootbyrow(cc):
for xx in range(1):
xx = list(a.index.values)
for xx in range(len(cc)):
k = a.apply(lambda y: y[xx])
k = k.values
for xx in range(1):
kk = list(bootstrap(k,10000,np.mean,0.05))
yield list(kk)
abc = pd.DataFrame(list(bootbyrow(cc))) #bootstrap ci by row
# the next 4 just show that its working correctly
a0 = bootstrap((a.loc[0,].values),10000,np.mean,0.05)
a1 = bootstrap((a.loc[1,].values),10000,np.mean,0.05)
a2 = bootstrap((a.loc[2,].values),10000,np.mean,0.05)
a3 = bootstrap((a.loc[3,].values),10000,np.mean,0.05)
print(abc)
print(a0)
print(a1)
print(a2)
print(a3)

Resources