Perform operations on a dataframe with map and reduce - python-3.x

I have a dataframe with the items sold by different stores every day:
        date  date_block_num  shop_id  item_id  item_price  item_cnt_day
0  02.01.2013               0       59    22154      999.00           1.0
1  03.01.2013               0       25     2552      899.00           1.0
2  05.01.2013               0       25     2552      899.00          -1.0
3  06.01.2013               0       25     2554     1709.05           1.0
4  15.01.2013               0       25     2555     1099.00           1.0
I would like to get the results for each store (designated by its shop_id) using filter, map and reduce.
So I tried:
each_shop = filter(lambda n: n==transactions.shop_id, transactions)
results = map(lambda n: transactions.item_price*transactions.item_cnt_day, each_shop)
for result in results:
    print(result)
But got:
/opt/conda/lib/python3.6/site-packages/pandas/core/ops.py:798: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
result = getattr(x, name)(y)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-21-4e86d47b9f0d> in <module>()
3 each_shop = filter(lambda n: n==transactions.shop_id, transactions)
4 results = map(lambda n: transactions.item_price*transactions.item_cnt_day, each_shop)
----> 5 for result in results:
6 print(result)
7
<ipython-input-21-4e86d47b9f0d> in <lambda>(n)
1 # YOUR CODE GOES HERE
2 # map
----> 3 each_shop = filter(lambda n: n==transactions.shop_id, transactions)
4 results = map(lambda n: transactions.item_price*transactions.item_cnt_day, each_shop)
5 for result in results:
/opt/conda/lib/python3.6/site-packages/pandas/core/ops.py in wrapper(self, other, axis)
859
860 with np.errstate(all='ignore'):
--> 861 res = na_op(values, other)
862 if is_scalar(res):
863 raise TypeError('Could not compare %s type with Series' %
/opt/conda/lib/python3.6/site-packages/pandas/core/ops.py in na_op(x, y)
798 result = getattr(x, name)(y)
799 if result is NotImplemented:
--> 800 raise TypeError("invalid type comparison")
801 except AttributeError:
802 result = op(x, y)
TypeError: invalid type comparison
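The immediate cause: iterating over a DataFrame yields its column names, so n in the filter lambda is a string such as 'date', and comparing that string with the shop_id Series is what produces the FutureWarning and then the invalid type comparison. As a reference point, here is a minimal sketch of one way to do this with filter, map and functools.reduce; the sample transactions frame is reconstructed from the output shown above, and the per-shop total of item_price * item_cnt_day is an assumption about the intended result:

from functools import reduce
import pandas as pd

transactions = pd.DataFrame({
    'shop_id': [59, 25, 25, 25, 25],
    'item_id': [22154, 2552, 2552, 2554, 2555],
    'item_price': [999.00, 899.00, 899.00, 1709.05, 1099.00],
    'item_cnt_day': [1.0, 1.0, -1.0, 1.0, 1.0],
})

for shop_id in transactions.shop_id.unique():
    # filter over the rows (itertuples), not over the frame itself,
    # which would only yield the column names
    rows = filter(lambda row: row.shop_id == shop_id, transactions.itertuples())
    # map each remaining row to its revenue for that day
    revenues = map(lambda row: row.item_price * row.item_cnt_day, rows)
    # reduce the per-row revenues to a single total per shop
    print(shop_id, reduce(lambda a, b: a + b, revenues, 0.0))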

Related

How to solve this shap.waterfall_plot error?

I'm trying to draw a waterfall plot from the SHAP library to represent one instance of the predictions of a model, like this:
ex = shap.Explanation(shap_values[0],
                      explainer.expected_value,
                      X.iloc[0],
                      columns)
ex
ex returns this:
.values =
array([-2.27243590e-01, 5.41666667e-02, 3.33333333e-03, 2.21153846e-02,
1.92307692e-04, -7.17948718e-02])
.base_values =
0.21923076923076923
.data =
BMI                                          18.716444
ROM-PADF-KE_D                                       33
Asym-ROM-PHIR(≥8)_discr                              1
Asym_SLCMJLanding-pVGRF(10percent)_discr             1
Asym_TJ_Valgus_FPPA(10percent)_discr                 1
DVJ_Valgus_KneeMedialDisplacement_D_discr            0
Name: 0, dtype: object
but when I try to draw the waterfall plot I receive this error:
shap.waterfall_plot(ex)
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
/tmp/ipykernel_4785/3628025354.py in <module>
----> 1 shap.waterfall_plot(ex)
/usr/local/lib/python3.8/dist-packages/shap/plots/_waterfall.py in waterfall(shap_values, max_display, show)
120 yticklabels[rng[i]] = feature_names[order[i]]
121 else:
--> 122 yticklabels[rng[i]] = format_value(features[order[i]], "%0.03f") + " = " + feature_names[order[i]]
123
124 # add a last grouped feature to represent the impact of all the features we didn't show
/usr/local/lib/python3.8/dist-packages/shap/utils/_general.py in format_value(s, format_str)
232 s = format_str % s
233 s = re.sub(r'\.?0+$', '', s)
--> 234 if s[0] == "-":
235 s = u"\u2212" + s[1:]
236 return s
IndexError: string index out of range
Edit, with a minimal reproducible example:
the explainer is a kernel explainer:
explainer_2 = shap.KernelExplainer(sci_Model_2.predict, X)
shap_values_2 = explainer.shap_values(X)
X and y are lists built from dataframes loaded like this:
y = data_modelo_1_2_csv_encoded['Soft-Tissue_injury_≥4days']
y_list = label_encoder.fit_transform(y)
X = data_modelo_1_2_csv_encoded.drop('Soft-Tissue_injury_≥4days',axis=1)
X_list = X.to_numpy()
and the model is a small Weka classifier wrapper for Python, written so that Python libraries such as SHAP can be used with Weka models:
class weka_classifier(BaseEstimator, ClassifierMixin):
    def __init__(self, classifier = None, dataset = None, index = None):
        if classifier is not None:
            self.classifier = classifier
        if dataset is not None:
            self.dataset = dataset
            self.dataset.class_is_last()
        if index is not None:
            self.index = index

    def fit(self, X, y):
        return self.fit2()

    def fit2(self):
        return self.classifier.build_classifier(self.dataset)

    def predict_instance(self, x):
        x.append(0.0)
        inst = Instance.create_instance(x, classname='weka.core.DenseInstance', weight=1.0)
        inst.dataset = self.dataset
        return self.classifier.classify_instance(inst)

    def predict_proba_instance(self, x):
        x.append(0.0)
        inst = Instance.create_instance(x, classname='weka.core.DenseInstance', weight=1.0)
        inst.dataset = self.dataset
        return self.classifier.distribution_for_instance(inst)

    def predict_proba(self, X):
        prediction = []
        for i in range(X.shape[0]):
            instance = []
            for j in range(X.shape[1]):
                instance.append(X[i][j])
            instance.append(0.0)
            instance = Instance.create_instance(instance, classname='weka.core.DenseInstance', weight=1.0)
            instance.dataset = self.dataset
            prediction.append(self.classifier.distribution_for_instance(instance))
        return np.asarray(prediction)

    def predict(self, X):
        prediction = []
        for i in range(X.shape[0]):
            instance = []
            for j in range(X.shape[1]):
                instance.append(X[i][j])
            instance.append(0.0)
            instance = Instance.create_instance(instance, classname='weka.core.DenseInstance', weight=1.0)
            instance.dataset = self.dataset
            prediction.append(self.classifier.classify_instance(instance))
        return np.asarray(prediction)

    def set_data(self, dataset):
        self.dataset = dataset
        self.dataset.class_is_last()
The dataset is an ARFF exported to CSV and loaded as a dataframe with these variables:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260 entries, 0 to 259
Data columns (total 7 columns):
 #   Column                                     Non-Null Count  Dtype
---  ------                                     --------------  -----
 0   BMI                                        260 non-null    float64
 1   ROM-PADF-KE_D                              260 non-null    int64
 2   Asym-ROM-PHIR(≥8)_discr                    260 non-null    int64
 3   Asym_SLCMJLanding-pVGRF(10percent)_discr   260 non-null    int64
 4   Asym_TJ_Valgus_FPPA(10percent)_discr       260 non-null    int64
 5   DVJ_Valgus_KneeMedialDisplacement_D_discr  260 non-null    int64
 6   Soft-Tissue_injury_≥4days                  260 non-null    category
dtypes: category(1), float64(1), int64(5)
Likely your issue is that the 0 in your .data field is a string instead of a number.
I can reproduce the same error with format_value('0', "%0.03f").
Looking at the current format_value we can see that it removes all trailing zeros from a string; in particular, format_value('100', "%0.03f") gives '1'.
This is a bug: the regex should be replaced (for example with the one suggested here: https://stackoverflow.com/a/26299205/4178189).
Note that when you supply a number (e.g. 100 or 0) it is first converted to a string (100.000 or 0.000), so the function does not show its bug when called with a number (int or float).
Also, the development version of shap (not yet released) does not suffer from this issue, because when called with a non-number value waterfall_plot does not call format_value; see: https://github.com/slundberg/shap/blob/8926cd0122d0a1b3cca0768f2c386de706090668/shap/plots/_waterfall.py#L127
Note: this question is also a GitHub issue, see https://github.com/slundberg/shap/issues/2581#issuecomment-1155134604
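To make the failure concrete, here is a minimal sketch, not shap's actual source: format_value_buggy is a simplified copy of the logic quoted in the traceback, and format_value_fixed is one possible repair along the lines of the linked answer; both names are hypothetical.

import re

def format_value_buggy(s, format_str="%0.03f"):
    if not isinstance(s, str):
        s = format_str % s            # numbers become e.g. '0.000'
    s = re.sub(r'\.?0+$', '', s)      # strips '0' to '' and '100' to '1'
    if s[0] == "-":                   # IndexError when s ended up empty
        s = u"\u2212" + s[1:]
    return s

def format_value_fixed(s, format_str="%0.03f"):
    if not isinstance(s, str):
        s = format_str % s
    s = re.sub(r'(\.\d*?)0+$', r'\1', s)  # strip zeros only after a '.'
    s = re.sub(r'\.$', '', s)             # drop a dangling decimal point
    if s and s[0] == "-":
        s = u"\u2212" + s[1:]
    return s

print(format_value_fixed('0'))     # '0'    (buggy version raises IndexError)
print(format_value_fixed('100'))   # '100'  (buggy version returns '1')
print(format_value_fixed(1.2500))  # '1.25'

On the caller's side, casting the row to a numeric dtype before building the Explanation (for example X.iloc[0].astype(float)) should also avoid the string code path entirely.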

Why Exception Handling doesn't print text?

My question is why Python doesn't execute the print statement in the exception-handling code below. I am trying to calculate the log of the volumes for a bunch of stocks; each stock has 1259 volume values. Python generates a RuntimeWarning, "divide by zero encountered in log", so I use exception handling to locate where the log input is zero, but Python doesn't execute the print statement under except. The print statement is supposed to print the name of the stock and the index in the array where the volume is zero. Why?
Here is the code:
for i, stock in enumerate(df.columns):
    volumes = df[stock].to_numpy()
    for r in range(len(volumes)):  # len(volumes) = 1259
        try:
            v = np.log(volumes[r])
        except:
            print(stock, r)
Here is the error that follows the RuntimeWarning:
LinAlgError Traceback (most recent call last)
<ipython-input-6-6aa283671e2c> in <module>
13 closes = df_close[stock].to_numpy()
14 volumes = df_vol[stock].to_numpy()
---> 15 indicator_values_all_stocks[i] = indicator.price_volume_fit(volumes, closes, histLength)
16
17 indicator_values_all_stocks_no_NaN = indicator_values_all_stocks[:, ~np.isnan(indicator_values_all_stocks).any(axis=0)]
~\Desktop\Python Projects Organized\Finance\Indicator Statistics\B.57. Price Volume Fit\indicator.py in price_volume_fit(volumes, closes, histLength)
1259 x = log_volumes[i - histLength:i]
1260 y = log_prices[i - histLength:i]
-> 1261 model = np.polyfit(x, y, 1, full = True)
1262 slope[i] = model[0][0]
1263
<__array_function__ internals> in polyfit(*args, **kwargs)
c:\users\donald seger\miniconda3\envs\tensorflow\lib\site-packages\numpy\lib\polynomial.py in polyfit(x, y, deg, rcond, full, w, cov)
629 scale = NX.sqrt((lhs*lhs).sum(axis=0))
630 lhs /= scale
--> 631 c, resids, rank, s = lstsq(lhs, rhs, rcond)
632 c = (c.T/scale).T # broadcast scale coefficients
633
<__array_function__ internals> in lstsq(*args, **kwargs)
c:\users\donald seger\miniconda3\envs\tensorflow\lib\site-packages\numpy\linalg\linalg.py in lstsq(a, b, rcond)
2257 # lapack can't handle n_rhs = 0 - so allocate the array one larger in that axis
2258 b = zeros(b.shape[:-2] + (m, n_rhs + 1), dtype=b.dtype)
-> 2259 x, resids, rank, s = gufunc(a, b, rcond, signature=signature, extobj=extobj)
2260 if m == 0:
2261 x[...] = 0
c:\users\donald seger\miniconda3\envs\tensorflow\lib\site-packages\numpy\linalg\linalg.py in _raise_linalgerror_lstsq(err, flag)
107
108 def _raise_linalgerror_lstsq(err, flag):
--> 109 raise LinAlgError("SVD did not converge in Linear Least Squares")
110
111 def get_linalg_error_extobj(callback):
LinAlgError: SVD did not converge in Linear Least Squares
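For what it's worth, this behaviour is by design: np.log(0) does not raise an exception, it emits a RuntimeWarning and returns -inf, so the except branch is never entered; the LinAlgError appears later, presumably when those -inf values reach np.polyfit. A minimal sketch of one way to actually trap the zeros, by promoting the warning to an error with np.errstate:

import numpy as np

volumes = np.array([10.0, 0.0, 5.0])  # hypothetical data with a zero volume

# by default this only warns and yields -inf; nothing is raised
with np.errstate(divide='ignore'):
    print(np.log(volumes))

# promoting the warning to an error makes the except branch reachable
with np.errstate(divide='raise'):
    for r in range(len(volumes)):
        try:
            v = np.log(volumes[r])
        except FloatingPointError:
            print('zero volume at index', r)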

Why do I get a view error when enumerating a Dataframe

Why do I get a "view" error:
ndf = pd.DataFrame()
ndf['Signals'] = [1,1,1,1,1,0,0,0,0,0]
signals_diff = ndf.Signals.diff()
ndf['Revals'] = [101,102,105,104,105,106,107,108,109,109]
ndf['Entry'] = 0

for i, element in enumerate(signals_diff):
    if (i == 0):
        ndf.iloc[i]['Entry'] = ndf.iloc[i]['Revals']
    elif (element == 0):
        ndf.iloc[i]['Entry'] = ndf.iloc[i - 1]['Entry']
    else:
        ndf.iloc[i]['Entry'] = ndf.iloc[i]['Revals']
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
ndf.iloc[i]['Entry'] = ndf.iloc[i]['Revals']
Instead of iloc, use loc:
ndf = pd.DataFrame()
ndf['Signals'] = [1,1,1,1,1,0,0,0,0,0]
signals_diff = ndf.Signals.diff()
ndf['Revals'] = [101,102,105,104,105,106,107,108,109,109]
ndf['Entry'] = 0

for i, element in enumerate(signals_diff):
    if (i == 0):
        ndf.loc[i, 'Entry'] = ndf.loc[i, 'Revals']
    elif (element == 0):
        ndf.loc[i, 'Entry'] = ndf.loc[i - 1, 'Entry']
    else:
        ndf.loc[i, 'Entry'] = ndf.loc[i, 'Revals']
This will solve the problem, but note that when assigning with loc the index labels must line up, so depending on the index you might not get the expected result.
Do not chain indexers like ndf.iloc[i]['Entry'] when trying to assign something; see the pandas documentation on returning a view versus a copy (linked in the warning above) for why that does not work.
That said, your code can be rewritten as a single vectorized statement: where keeps Revals only on the rows where the signal changed (the NaN diff on the first row also passes the != 0 test) and ffill carries the last entry forward:
ndf['Entry'] = ndf['Revals'].where(signals_diff != 0).ffill()
Output:
   Signals  Revals  Entry
0        1     101  101.0
1        1     102  101.0
2        1     105  101.0
3        1     104  101.0
4        1     105  101.0
5        0     106  106.0
6        0     107  106.0
7        0     108  106.0
8        0     109  106.0
9        0     109  106.0
To keep using positional (iloc) assignment, get the column positions with get_indexer:
for i, element in enumerate(signals_diff):
    if (i == 0):
        ndf.iloc[i, ndf.columns.get_indexer(['Entry'])] = ndf.iloc[i, ndf.columns.get_indexer(['Revals'])]
    elif (element == 0):
        ndf.iloc[i, ndf.columns.get_indexer(['Entry'])] = ndf.iloc[i - 1, ndf.columns.get_indexer(['Entry'])]
    else:
        ndf.iloc[i, ndf.columns.get_indexer(['Entry'])] = ndf.iloc[i, ndf.columns.get_indexer(['Revals'])]
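A small variation on the same idea, assuming only single columns are being set: columns.get_loc returns a plain integer position, which reads slightly more simply than the one-element array that get_indexer returns:

entry = ndf.columns.get_loc('Entry')    # integer position of the column
revals = ndf.columns.get_loc('Revals')

for i, element in enumerate(signals_diff):
    if (i == 0):
        ndf.iloc[i, entry] = ndf.iloc[i, revals]
    elif (element == 0):
        ndf.iloc[i, entry] = ndf.iloc[i - 1, entry]
    else:
        ndf.iloc[i, entry] = ndf.iloc[i, revals]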

Cannot cast ufunc subtract output from dtype('float64') to dtype('int64') with casting rule 'same_kind' despite forced conversion

I have a data Series ts:
0    2599.0
1    2599.0
2    3998.0
3    3998.0
4    1299.0
5    1499.0
6    1499.0
7    2997.5
8     749.5
Name: 0, dtype: float64
and I would like to predict the next period using ARIMA:
import statsmodels.tsa.api as smt

array = []
for i, row in test.iterrows():
    print("row['shop_id']: ", row['shop_id'], " row['item_id']: ", row['item_id'])
    ts = pd.DataFrame(sales_monthly.loc[pd.IndexSlice[:, [row['shop_id']], [row['item_id']]], :]['item_price'].values
                      * sales_monthly.loc[pd.IndexSlice[:, [row['shop_id']], [row['item_id']]], :]['item_cnt_day'].values).T.iloc[0]
    rng = range(5)
    for i in rng:
        for j in rng:
            try:
                tmp_mdl = smt.ARMA(ts, order=(i, j)).fit(method='mle', trand='nc')
                tmp_aic = tmp_mdl.aic
                if tmp_aic < best_aic:
                    best_aic = tmp_aic
                    best_order = (i, j)
                    best_mdl = tmp_mdl
            except:
                continue
    if best_mdl.predict() < 0:
        y_pred = 0
    else:
        y_pred = best_mdl.predict()
    d = {'id': row['ID'], 'item_cnt_month': y_pred}
    array.append(d)

df = pd.DataFrame(array)
df
But I get:
---------------------------------------------------------------------------
UFuncTypeError Traceback (most recent call last)
<ipython-input-104-85dfa2fa67c1> in <module>()
22 except:
23 continue
---> 24 if best_mdl.predict()<0:
25 y_pred = 0
26 else:
3 frames
/usr/local/lib/python3.6/dist-packages/statsmodels/tsa/arima_model.py in geterrors(self, params)
686 k = self.k_exog + self.k_trend
687 if k > 0:
--> 688 y -= dot(self.exog, params[:k])
689
690 k_ar = self.k_ar
UFuncTypeError: Cannot cast ufunc 'subtract' output from dtype('float64') to dtype('int64') with casting rule 'same_kind'
So I used best_mdl.predict().astype('float32'), but it didn't change anything.
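That is expected: the traceback shows the failing subtraction y -= dot(self.exog, params[:k]) happening inside statsmodels during the fit, so casting the prediction afterwards comes too late. A hedged sketch of the usual fix, assuming the series handed to smt.ARMA ends up with an integer dtype, is to cast it to float before fitting (note also that the keyword is trend, not trand):

# cast the endogenous series before fitting; the int64/float64 clash
# occurs inside fit(), not in the later predict() call
ts = ts.astype('float64')
tmp_mdl = smt.ARMA(ts, order=(i, j)).fit(method='mle', trend='nc')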

Why am I getting this error - KeyError: ('datetime64[ns]', 'left')?

When I run the code below this error is displayed:
KeyError: ('datetime64[ns]', 'left')
Does anyone have an answer? Please find the code below:
def transform_in_intervals(df, freq=to_offset('5t')):
    df = df.copy()
    time = df.index
    interval = pd.interval_range(start=time[0],
                                 periods=(time[-1] - time[0]) / pd.Timedelta(freq) + 1,
                                 freq=freq, closed="left")
    df.reset_index(False, inplace=True)
    i, c = 0, 0
    while i < len(df):
        if df.loc[i, "time"] in interval[c]:
            df.loc[i, "interval"] = interval[c]
            i += 1
        else:
            c += 1
    df_res = pd.DataFrame([])
    for iv, df_left in df.groupby("interval"):
        df_res = df_res.append(df_left.drop("time", axis=1).mean(), ignore_index=True)
    df_res.set_index(interval, inplace=True)
    return df_res

start = time.time()
trans_df = transform_in_intervals(merged_df)
end = time.time()
print("time:{:.3f} s".format(end - start))
KeyError Traceback (most recent call last)
<ipython-input-31-bfcffbdad696> in <module>()
1 start= time.time()
----> 2 trans_df = transform_in_intervals(merged_df)
3 end = time.time()
4 print("time:{:.3f} s".format(end-start))
<ipython-input-30-2d97dbfdb4d1> in transform_in_intervals(df, freq)
14 for iv, df_left in df.groupby("interval"):
15 df_res = df_res.append(df_left.drop("time",axis=1).mean(),ignore_index=True)
---> 16 df_res.set_index(interval,inplace=True)
17 return df_res
~\AppData\Local\Continuum\anaconda2\envs\py35\lib\site-packages\pandas\core\frame.py in set_index(self, keys, drop, append, inplace, verify_integrity)
3921
3922 # clear up memory usage
-> 3923 index._cleanup()
3924
3925 frame.index = index
~\AppData\Local\Continuum\anaconda2\envs\py35\lib\site-packages\pandas\core\indexes\base.py in _cleanup(self)
1912
1913 def _cleanup(self):
-> 1914 self._engine.clear_mapping()
1915
1916 @cache_readonly
pandas\_libs\properties.pyx in pandas._libs.properties.CachedProperty.__get__()
~\AppData\Local\Continuum\anaconda2\envs\py35\lib\site-packages\pandas\core\indexes\interval.py in _engine(self)
366 @cache_readonly
367 def _engine(self):
--> 368 return IntervalTree(self.left, self.right, closed=self.closed)
369
370 @property
pandas\_libs\intervaltree.pxi in pandas._libs.interval.IntervalTree.__init__()
KeyError: ('datetime64[ns]', 'left')
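The traceback itself points at the cause: IntervalTree(self.left, self.right, ...) looks up a node implementation keyed by dtype, and this pandas version has no entry for ('datetime64[ns]', 'left'), so a datetime-backed IntervalIndex cannot be set as the index here. Since the function computes per-5-minute means, one possible workaround (a sketch, assuming merged_df has a datetime index and numeric columns) is to let resample do the binning instead:

import numpy as np
import pandas as pd

# hypothetical stand-in for merged_df: a datetime index, one numeric column
idx = pd.date_range('2021-01-01', periods=12, freq='T')
merged_df = pd.DataFrame({'value': np.arange(12.0)}, index=idx)

# resample bins the rows into left-closed 5-minute intervals and averages
# each bin, which is what transform_in_intervals computes by hand
trans_df = merged_df.resample('5T').mean()
print(trans_df)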
