Python Pandas: bootstrap confidence limits by row rather than entire dataframe - python-3.x

What I am trying to do is to get bootstrap confidence limits by row regardless of the number of rows and make a new dataframe from the output.I currently can do this for the entire dataframe, but not by row. The data I have in my actual program looks similar to what I have below:
0 1 2
0 1 2 3
1 4 1 4
2 1 2 3
3 4 1 4
I want the new dataframe to look something like this with the lower and upper confidence limits:
0 1
0 1 2
1 1 5.5
2 1 4.5
3 1 4.2
The current generated output looks like this:
0 1
0 2.0 2.75
The python 3 code below generates a mock dataframe and generates the bootstrap confidence limits for the entire dataframe. The result is a new dataframe with just 2 values, a upper and a lower confidence limit rather than 4 sets of 2(one for each row).
import pandas as pd
import numpy as np
import scikits.bootstrap as sci
zz = pd.DataFrame([[[1,2],[2,3],[3,6]],[[4,2],[1,4],[4,6]],
[[1,2],[2,3],[3,6]],[[4,2],[1,4],[4,6]]])
print(zz)
x= zz.dtypes
print(x)
a = pd.DataFrame(np.array(zz.values.tolist())[:, :, 0],zz.index, zz.columns)
print(a)
b = sci.ci(a)
b = pd.DataFrame(b)
b = b.T
print(b)
Thank you for any help.

scikits.bootstrap operates by assuming that data samples are arranged by row, not by column. If you want the opposite behavior, just use the transpose, and a statfunction that doesn't combine columns.
import pandas as pd
import numpy as np
import scikits.bootstrap as sci
zz = pd.DataFrame([[[1,2],[2,3],[3,6]],[[4,2],[1,4],[4,6]],
[[1,2],[2,3],[3,6]],[[4,2],[1,4],[4,6]]])
print(zz)
x= zz.dtypes
print(x)
a = pd.DataFrame(np.array(zz.values.tolist())[:, :, 0],zz.index, zz.columns)
print(a)
b = sci.ci(a.T, statfunction=lambda x: np.average(x, axis=0))
print(b.T)

Below is the answer I ended up figuring out to create bootstrap ci by row.
import pandas as pd
import numpy as np
import numpy.random as npr
zz = pd.DataFrame([[[1,2],[2,3],[3,6]],[[4,2],[1,4],[4,6]],
[[1,2],[2,3],[3,6]],[[4,2],[1,4],[4,6]]])
x= zz.dtypes
a = pd.DataFrame(np.array(zz.values.tolist())[:, :, 0],zz.index, zz.columns)
print(a)
def bootstrap(data, num_samples, statistic, alpha):
n = len(data)
idx = npr.randint(0, n, (num_samples, n))
samples = data[idx]
stat = np.sort(statistic(samples, 1))
return (stat[int((alpha/2.0)*num_samples)],
stat[int((1-alpha/2.0)*num_samples)])
cc = list(a.index.values) # informs generator of the number of rows
def bootbyrow(cc):
for xx in range(1):
xx = list(a.index.values)
for xx in range(len(cc)):
k = a.apply(lambda y: y[xx])
k = k.values
for xx in range(1):
kk = list(bootstrap(k,10000,np.mean,0.05))
yield list(kk)
abc = pd.DataFrame(list(bootbyrow(cc))) #bootstrap ci by row
# the next 4 just show that its working correctly
a0 = bootstrap((a.loc[0,].values),10000,np.mean,0.05)
a1 = bootstrap((a.loc[1,].values),10000,np.mean,0.05)
a2 = bootstrap((a.loc[2,].values),10000,np.mean,0.05)
a3 = bootstrap((a.loc[3,].values),10000,np.mean,0.05)
print(abc)
print(a0)
print(a1)
print(a2)
print(a3)

Related

Append column to data frame with text based on another column value

In this dataframe, how to go about appending a column named "class_name", with a text string, that is based on another column.
x
y
z
not used
Label
-3.8481877
-0.47685334
0.63422906
1.0396314
1
-2.320888
0.65347993
1.1519914
0.12997247
1
1.5827686
1.4119303
-1.7410104
-4.6962333
2
-0.1337152
0.13315737
-1.6648949
-1.4205348
2
-0.4028037
1.332986
1.3618442
0.3292255
1
-0.015517877
1.346349
1.4083523
0.87017965
0
-0.2669228
0.5478992
-0.06730786
-1.5959451
0
-0.03318152
0.3263167
-2.116833
-5.4616213
1
There are the values the new column will take based on the values in the 'Label' column:
0 == 'avocados'
1 == 'apples'
2 == ' grapes
This is my code so far:
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import pandas as pd
df = pd.read_csv('embed1_2.csv')
df.loc[df.y_train == 103, 'class_name'] = 'avocados'
df.loc[df.y_train == 103, 'class_name'] = 'apples'
df.loc[df.y_train == 103, 'class_name'] = 'grapes'
How to get the appended column to show up with the converted text?
Thanks for your help!
create a dictionary and then use map in creating a new columns
dict = {
0 : 'avocados',
1 : 'apples',
2 : 'grapes'
}
df['val']=df['Label'].map(dict)
df
x y z not used Label val
0 -3.848188 -0.476853 0.634229 1.039631 1 apples
1 -2.320888 0.653480 1.151991 0.129972 1 apples
2 1.582769 1.411930 -1.741010 -4.696233 2 grapes
3 -0.133715 0.133157 -1.664895 -1.420535 2 grapes
4 -0.402804 1.332986 1.361844 0.329226 1 apples
5 -0.015518 1.346349 1.408352 0.870180 0 avocados
6 -0.266923 0.547899 -0.067308 -1.595945 0 avocados
7 -0.033182 0.326317 -2.116833 -5.461621 1 apples

Python Pandas sum() the difference between two values

In my python code, using pandas i have to resample a datetimedata series and calculate diffs between a column values (the sum of diffs between values), i write this piece of code:
import pandas as pd
import datetime
from .models import Results, VarsResults
start_date = datetime.date(2021, 6, 21)
end_date = datetime.date(2021, 6, 24)
def calc_q(start_d, end_d):
start_d = start_date
end_d = end_date
var_results = VarsResults.objects.filter(
id_res__read_date__range=(start_d, end_d)
).select_related(
"id_res"
).values(
"id_res__read_date",
"id_res__unit_id",
"id_res__device_id",
"id_res__proj_code",
"var_val",
)
df = pd.DataFrame(list(var_results))
df['id_res__read_date'] = pd.to_datetime(df['id_res__read_date'])
df = df.set_index('id_res__read_date')
df_15 = df.resample('15min').sum()
return df_15
but i get the sum of the values itself.
example
... | 5
... | 3
... | 1
i get 9
i would the sum of the difference between values not the sum of the values:
in this case 4 (5-3 = 2 + 3-1 = 2, 2+2)
Is there a method in pandas using resample for manage this kind of clcultion?
So many thanks in advance
Manuel
The sum of all the differences is equal to the difference between the first element and the last one: if you work it out, all the other elements cancel out. In your data for example the 3 cancels out:
(5-3) + (3-1)
= 5 - 3 + 3 - 1 # - 3 and + 3 cancel out
= 5 - 1
I don't know how Pandas works, but you can simply do the equivalent of first_value - last_value.

Apply a function to multiple rows of a pandas DataFrame

I'm trying to apply a function to different readings of each measure. Is it possible to do it without transforming the dataframe?
import random
import pandas as pd
df = pd.DataFrame({
'index': sorted(['A', 'B']*3),
'measure': [i for i in range(0,3)]*2,
'reading': [random.random() for i in range(0,6)]
})
index measure reading
0 A 0 0.260492
1 A 1 0.805028
2 A 2 0.548699
3 B 0 0.014042
4 B 1 0.719705
5 B 2 0.398824
How can I apply a function like basic difference to different readings for each index?
Here I assumed function applied to reading 0 and 1. It should be part of the call as I need to calculate it for different values of measure.
Desired output looks like this:
index applied
0 A 0.5445359999999999
1 B 0.705663
Try this
import random
import pandas as pd
import numpy as np
df = pd.DataFrame({
'index': sorted(['A', 'B']*3),
'measure': [i for i in range(0,3)]*2,
'reading': [random.random() for i in range(0,6)]
})
print(df)
# index measure reading
# 0 A 0 0.869707
# 1 A 1 0.120680
# 2 A 2 0.772035
# 3 B 0 0.565548
# 4 B 1 0.577074
# 5 B 2 0.290668
start = 0
stop = 1
# I decided to specify start and stop value separately, the absolute difference is
# calculated via np.sum(). If the difference between start and stop is always 1, you
# can omit the np.sum() call.
df = df.groupby('index').agg(applied=('reading', lambda x: np.sum(np.diff(x)
[start:stop])))
print(df)
# applied
# index
# A -0.749027
# B 0.011526

Apply function to pandas series given varying arguments

Initial question
I want to calculate the Levenshtein distance between multiple strings, one in a series, the other in a list. I tried my hands on map, zip, etc., but I only got the desired result using a for loop and apply. Is there a way to improve style and especially speed?
Here is what I tried and it does what it is supposed to do, but lacks of speed given a large series.
import stringdist
strings = ['Hello', 'my', 'Friend', 'I', 'am']
s = pd.Series(data=strings, index=strings)
c = ['me', 'mine', 'Friend']
df = pd.DataFrame()
for w in c:
df[w] = s.apply(lambda x: stringdist.levenshtein(x, w))
## Result: ##
me mine Friend
Hello 4 5 6
my 1 3 6
Friend 5 4 0
I 2 4 6
am 2 4 6
Solution
Thanks to #Dames and #molybdenum42, I can provide the solution I used, directly beneath the question. For more insights, please check their great answers below.
import stringdist
from itertools import product
strings = ['Hello', 'my', 'Friend', 'I', 'am']
s = pd.Series(data=strings, index=strings)
c = ['me', 'mine', 'Friend']
word_combinations = np.array(list(product(s.values, c)))
vectorized_levenshtein = np.vectorize(stringdist.levenshtein)
result = vectorized_levenshtein(word_combinations[:, 0],
word_combinations[:, 1])
result = result.reshape((len(s), len(c)))
df = pd.DataFrame(result, columns=c, index=s)
This results in the desired data frame.
Setup:
import stringdist
import pandas as pd
import numpy as np
import itertools
s = pd.Series(data=['Hello', 'my', 'Friend'],
index=['Hello', 'my', 'Friend'])
c = ['me', 'mine', 'Friend']
Options
option: an easy one-liner
df = pd.DataFrame([s.apply(lambda x: stringdist.levenshtein(x, w)) for w in c])
option: np.fromfunction (thanks to #baccandr)
#np.vectorize
def lavdist(a, b):
return stringdist.levenshtein(c[a], s[b])
df = pd.DataFrame(np.fromfunction(lavdist, (len(c), len(s)), dtype = int),
columns=c, index=s)
option: see #molybdenum42
word_combinations = np.array(list(itertools.product(s.values, c)))
vectorized_levenshtein = np.vectorize(stringdist.levenshtein)
result = vectorized_levenshtein(word_combinations[:,0], word_combinations[:,1])
df = pd.DataFrame([word_combinations[:,1], word_combinations[:,1], result])
df = df.set_index([0,1])[2].unstack()
(the best) option: modified option 3
word_combinations = np.array(list(itertools.product(s.values, c)))
vectorized_levenshtein = np.vectorize(distance)
result = vectorized_levenshtein(word_combinations[:,0], word_combinations[:,1])
result = result.reshape((len(s), len(c)))
df = pd.DataFrame(result, columns=c, index=s)
Performance testing:
import timeit
from Levenshtein import distance
import pandas as pd
import numpy as np
import itertools
s = pd.Series(data=['Hello', 'my', 'Friend'],
index=['Hello', 'my', 'Friend'])
c = ['me', 'mine', 'Friend']
test_code0 = """
df = pd.DataFrame()
for w in c:
df[w] = s.apply(lambda x: distance(x, w))
"""
test_code1 = """
df = pd.DataFrame({w:s.apply(lambda x: distance(x, w)) for w in c})
"""
test_code2 = """
#np.vectorize
def lavdist(a, b):
return distance(c[a], s[b])
df = pd.DataFrame(np.fromfunction(lavdist, (len(c), len(s)), dtype = int),
columns=c, index=s)
"""
test_code3 = """
word_combinations = np.array(list(itertools.product(s.values, c)))
vectorized_levenshtein = np.vectorize(distance)
result = vectorized_levenshtein(word_combinations[:,0], word_combinations[:,1])
df = pd.DataFrame([word_combinations[:,1], word_combinations[:,1], result])
df = df.set_index([0,1])[2] #.unstack() produces error
"""
test_code4 = """
word_combinations = np.array(list(itertools.product(s.values, c)))
vectorized_levenshtein = np.vectorize(distance)
result = vectorized_levenshtein(word_combinations[:,0], word_combinations[:,1])
result = result.reshape((len(s), len(c)))
df = pd.DataFrame(result, columns=c, index=s)
"""
test_setup = "from __main__ import distance, s, c, pd, np, itertools"
print("test0", timeit.timeit(test_code0, number = 1000, setup = test_setup))
print("test1", timeit.timeit(test_code1, number = 1000, setup = test_setup))
print("test2", timeit.timeit(test_code2, number = 1000, setup = test_setup))
print("test3", timeit.timeit(test_code3, number = 1000, setup = test_setup))
print("test4", timeit.timeit(test_code4, number = 1000, setup = test_setup))
Results
# results
# test0 1.3671939949999796
# test1 0.5982696900009614
# test2 0.3246431229999871
# test3 2.0100400850005826
# test4 0.23796007100099814
Using itertools, you can at least get all the required combinations. Using a vectorized version of stringcount.levenshtein (made using numpy.vectorize()) you can then get your desired result without looping at all, though I haven't tested the performance of the vectorized levenshtein function.
The code could look something like this:
import stringdist
import numpy as np
import pandas as pd
import itertools
s = pd.Series(["Hello", "my","Friend"])
c = ['me', 'mine', 'Friend']
word_combinations = np.array(list(itertools.product(s.values, c)))
vectorized_levenshtein = np.vectorize(stringdist.levenshtein)
result = vectorized_levenshtein(word_combinations[:,0], word_combinations[:,1])
At this point you have the results in a numpy array, each corresponding to one of all the possible combinations of your two intial arrays. If you want to get it into the shape you have in your example, there's some pandas trickery to be done:
df = pd.DataFrame([word_combinations[:,0], word_combinations[:,1], result]).T
### initially looks like: ###
# 0 1 2
# 0 Hello me 4
# 1 Hello mine 5
# 2 Hello Friend 6
# 3 my me 1
# 4 my mine 3
# 5 my Friend 6
# 6 Friend me 5
# 7 Friend mine 4
# 8 Friend Friend 0
df = df.set_index([0,1])[2].unstack()
### Now looks like: ###
# Friend Hello my
# Friend 0 6 6
# me 5 4 1
# mine 4 5 3
Again, I haven't tested the performance of this method, so I recommend checking that out - it should be faster than iteration though.
EDIT:
User #Dames has a better suggestion for making the result all pretty-like:
result = result.reshape(len(c), len(s))
df = pd.DataFrame(result, columns=c, index=s)

How to have a chart multiple columns continuously by iterating through a data-frame with matplotlib

BACKGROUND INFORMATION:
I have dataframe of x many stocks with y price sets (closing & 3 day SMA), (currently this is 5 and 2 respectively (one is closing price, the other is a 3 day Simple Moving Average SMA).
The current output is [2781 rows x 10 columns] with a ranging data set start_date = '2006-01-01' till end_date = '2016-12-31'. The output is as follows as a dataframe print(df):
CURRENT OUTPUT:
ANZ Price ANZ 3 day SMA CBA Price CBA 3 day SMA MQG Price MQG 3 day SMA NAB Price NAB 3 day SMA WBC Price WBC 3 day SMA
Date
2006-01-02 23.910000 NaN 42.569401 NaN 66.558502 NaN 30.792999 NaN 22.566401 NaN
2006-01-03 24.040001 NaN 42.619099 NaN 66.086403 NaN 30.935699 NaN 22.705400 NaN
2006-01-04 24.180000 24.043334 42.738400 42.642300 66.587997 66.410967 31.078400 30.935699 22.784901 22.685567
2006-01-05 24.219999 24.146667 42.708599 42.688699 66.558502 66.410967 30.964300 30.992800 22.794800 22.761700
... ... ... ... ... ... ... ... ... ... ...
2016-12-27 87.346667 30.670000 30.706666 32.869999 32.729999 87.346667 30.670000 30.706666 32.869999 32.729999
2016-12-28 87.456667 31.000000 30.773333 32.980000 32.829999 87.456667 31.000000 30.773333 32.980000 32.829999
2016-12-29 87.520002 30.670000 30.780000 32.599998 32.816666 87.520002 30.670000 30.780000 32.599998 32.816666
MY WORKING CODE:
#!/usr/bin/python3
from pandas_datareader import data
import pandas as pd
import itertools as it
import os
import numpy as np
import fix_yahoo_finance as yf
import matplotlib.pyplot as plt
yf.pdr_override()
stock_list = sorted(["ANZ.AX", "WBC.AX", "MQG.AX", "CBA.AX", "NAB.AX"])
number_of_decimal_places = 8
moving_average_period = 3
def get_moving_average(df, stock_name):
df2 = df.rolling(window=moving_average_period).mean()
df2.rename(columns={stock_name: stock_name.replace("Price", str(moving_average_period) + " day SMA")}, inplace=True)
df = pd.concat([df, df2], axis=1, join_axes=[df.index])
return df
# Function to get the closing price of the individual stocks
# from the stock_list list
def get_closing_price(stock_name, specific_close):
symbol = stock_name
start_date = '2006-01-01'
end_date = '2016-12-31'
df = data.get_data_yahoo(symbol, start_date, end_date)
sym = symbol + " "
print(sym * 10)
df = df.drop(['Open', 'High', 'Low', 'Adj Close', 'Volume'], axis=1)
df = df.rename(columns={'Close': specific_close})
# https://stackoverflow.com/questions/16729483/converting-strings-to-floats-in-a-dataframe
# df[specific_close] = df[specific_close].astype('float64')
# print(type(df[specific_close]))
return df
# Creates a big DataFrame with all the stock's Closing
# Price returns the DataFrame
def get_all_close_prices(directory):
count = 0
for stock_name in stock_list:
specific_close = stock_name.replace(".AX", "") + " Price"
if not count:
prev_df = get_closing_price(stock_name, specific_close)
prev_df = get_moving_average(prev_df, specific_close)
else:
new_df = get_closing_price(stock_name, specific_close)
new_df = get_moving_average(new_df, specific_close)
# https://stackoverflow.com/questions/11637384/pandas-join-merge-concat-two-dataframes
prev_df = prev_df.join(new_df)
count += 1
# prev_df.to_csv(directory)
df = pd.DataFrame(prev_df, columns=list(prev_df))
df = df.apply(pd.to_numeric)
convert_df_to_csv(df, directory)
return df
def convert_df_to_csv(df, directory):
df.to_csv(directory)
def main():
# FINDS THE CURRENT DIRECTORY AND CREATES THE CSV TO DUMP THE DF
csv_in_current_directory = os.getcwd() + "/stock_output.csv"
csv_in_current_directory_dow_distribution = os.getcwd() + "/dow_distribution.csv"
# FUNCTION THAT GETS ALL THE CLOSING PRICES OF THE STOCKS
# AND RETURNS IT AS ONE COMPLETE DATAFRAME
df = get_all_close_prices(csv_in_current_directory)
print(df)
# Main line of code
if __name__ == "__main__":
main()
QUESTION:
From this df I want to create x many lines graphs (one graph per stock) with y many lines (price, and SMAs). How can I do this with matplotlib? Could this be done with a for loop and save the individuals plots as the loop gets iterated? If so how?
First import import matplotlib.pyplot as plt.
Then it depends whether you want x many individual plots or one plot with x many subplots:
Individual plots
df.plot(y=[0,1])
df.plot(y=[2,3])
df.plot(y=[4,5])
df.plot(y=[6,7])
df.plot(y=[8,9])
plt.show()
You can also save the individual plots in a loop:
for i in range(0,9,2):
df.plot(y=[i,i+1])
plt.savefig('{}.png'.format(i))
Subplots
fig, axes = plt.subplots(nrows=2, ncols=3)
df.plot(ax=axes[0,0],y=[0,1])
df.plot(ax=axes[0,1],y=[2,3])
df.plot(ax=axes[0,2],y=[4,5])
df.plot(ax=axes[1,0],y=[6,7])
df.plot(ax=axes[1,1],y=[8,9])
plt.show()
See https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.plot.html for options to customize your plot(s).
The best approach is to make a function that is dependent on the size of your lists x and y. Thereby the function should be as follows:
def generate_SMA_graphs(df):
columnNames = list(df.head(0))
print("CN:\t", columnNames)
print(len(columnNames))
count = 0
for stock in stock_list:
stock_iter = count * (len(moving_average_period_list) + 1)
sma_iter = stock_iter + 1
for moving_average_period in moving_average_period_list:
fig = plt.figure()
df.plot(y=[columnNames[stock_iter], columnNames[sma_iter]])
plt.xlabel('Time')
plt.ylabel('Price ($)')
graph_title = columnNames[stock_iter] + " vs. " + columnNames[sma_iter]
plt.title(graph_title)
plt.grid(True)
plt.savefig(graph_title.replace(" ", "") + ".png")
print("\t\t\t\tCompleted: ", graph_title)
plt.close(fig)
sma_iter += 1
count += 1
With the code above, irrespective how ever long either list is (for x or y, stock list or SMA list) the above function will generate a graph comparing the original price with every SMA for that given stock.

Resources