How to subset a xarray.Dataset according to lat/lon values taken from a SRTM DEM extents - python-3.x

I have a year wise (1980-2020) precipitation data set in netCDF format. I am importing them in xarray to have 40 years of merged precipitation values:
import netCDF4
import numpy
import xarray as xr
import pandas as pd
prcp=xr.open_mfdataset('/home/hrsa/Sayantan/HAR_V2/prcp/HARv2_d10km_d_2d_prcp_*.nc',combine = 'nested', concat_dim="time")
which renders:
Dimensions: (time: 14976, west_east: 381, south_north: 252)
1980-01-01 ... 2020-12-31
-1.675e+06 -1.665e+06 ... 2.125e+06
-7.45e+05 -7.35e+05 ... 1.765e+06
(south_north, west_east)
dask.array<chunksize=(252, 381), meta=np.ndarray>
(south_north, west_east)
dask.array<chunksize=(252, 381), meta=np.ndarray>
Data variables:
(time, south_north, west_east)
dask.array<chunksize=(366, 252, 381), meta=np.ndarray>
Attributes: (33)
This is a large dataset, hence I need to subset it according to an SRTM image whose extent (in EPSG:4326) is defined as:
# Extents of the SRTM DEM covering Panchi_B and the SASE AWS/Base Camp
min_lon = 77.0
min_lat = 32.0
max_lon = 78.0
max_lat = 33.0
In order to subset according to above coordinates I have tried the following:
prcp = prcp.sel(lat = slice(min_lat,max_lat), lon = slice(min_lon,max_lon))
the Error output:
KeyError Traceback (most recent call last)
File ~/.pyenv/versions/3.9.7/envs/v3.9.7/lib/python3.9/site-packages/xarray/core/, in group_indexers_by_index(data_obj, indexers, method, tolerance)
72 try:
---> 73 index = xindexes[key]
74 coord = data_obj.coords[key]
KeyError: 'lat'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
Input In [25], in <cell line: 1>()
----> 1 prcp = prcp.sel(lat = slice(min_lat,max_lat), lon = slice(min_lon,max_lon))
File ~/.pyenv/versions/3.9.7/envs/v3.9.7/lib/python3.9/site-packages/xarray/core/, in Dataset.sel(self, indexers, method, tolerance, drop, **indexers_kwargs)
2440 """Returns a new dataset with each array indexed by tick labels
2441 along the specified dimension(s).
2498 DataArray.sel
2499 """
2500 indexers = either_dict_or_kwargs(indexers, indexers_kwargs, "sel")
-> 2501 pos_indexers, new_indexes = remap_label_indexers(
2502 self, indexers=indexers, method=method, tolerance=tolerance
2503 )
2504 # TODO: benbovy - flexible indexes: also use variables returned by Index.query
2505 # (temporary dirty fix).
2506 new_indexes = {k: v[0] for k, v in new_indexes.items()}
File ~/.pyenv/versions/3.9.7/envs/v3.9.7/lib/python3.9/site-packages/xarray/core/, in remap_label_indexers(obj, indexers, method, tolerance, **indexers_kwargs)
414 indexers = either_dict_or_kwargs(indexers, indexers_kwargs, "remap_label_indexers")
416 v_indexers = {
417 k: if isinstance(v, DataArray) else v
418 for k, v in indexers.items()
419 }
--> 421 pos_indexers, new_indexes = indexing.remap_label_indexers(
422 obj, v_indexers, method=method, tolerance=tolerance
423 )
424 # attach indexer's coordinate to pos_indexers
425 for k, v in indexers.items():
File ~/.pyenv/versions/3.9.7/envs/v3.9.7/lib/python3.9/site-packages/xarray/core/, in remap_label_indexers(data_obj, indexers, method, tolerance)
107 pos_indexers = {}
108 new_indexes = {}
--> 110 indexes, grouped_indexers = group_indexers_by_index(
111 data_obj, indexers, method, tolerance
112 )
114 forward_pos_indexers = grouped_indexers.pop(None, None)
115 if forward_pos_indexers is not None:
File ~/.pyenv/versions/3.9.7/envs/v3.9.7/lib/python3.9/site-packages/xarray/core/, in group_indexers_by_index(data_obj, indexers, method, tolerance)
82 except KeyError:
83 if key in data_obj.coords:
---> 84 raise KeyError(f"no index found for coordinate {key}")
85 elif key not in data_obj.dims:
86 raise KeyError(f"{key} is not a valid dimension or coordinate")
KeyError: 'no index found for coordinate lat'
How can I resolve this issue? Any help will be appreciated, Thank you.
############# Edit (for #Robert Wilson) ##################
In order to find out the ranges, I did the following:
lon = prcp.lon.to_dataframe()
lat =


Unsupported operand types with df.copy() method

I'm uploading my dataset, and I'm copying my dataset, but an error is appearing.
import numpy as np
import pandas as pd
import mathplotlib.pyplot as plt
#we evaluate the price of a house for those cases where the information is missing, for each variable
def analyse_na_value(df, var):
df - df.copy()
# we indicate as a variable as 1 where the observation is missing
# we indicate as 0 where the observation has a real value
df[var] = np.where(df[var].isnull(), 1 , 0)
# we calculate the mean saleprice where the information is missing or present
for var in vars_with_na:
analyse_na_value(house_data, var)
The error is shown below. When I comment out this line of code, I don't get an error:
df - df.copy()
TypeError Traceback (most recent call last)
~/anaconda3/lib/python3.8/site-packages/pandas/core/ops/ in na_arithmetic_op(left, right, op, is_cmp)
142 try:
--> 143 result = expressions.evaluate(op, left, right)
144 except TypeError:
~/anaconda3/lib/python3.8/site-packages/pandas/core/computation/ in evaluate(op, a, b, use_numexpr)
232 if use_numexpr:
--> 233 return _evaluate(op, op_str, a, b) # type: ignore
234 return _evaluate_standard(op, op_str, a, b)
~/anaconda3/lib/python3.8/site-packages/pandas/core/computation/ in _evaluate_numexpr(op, op_str, a, b)
118 if result is None:
--> 119 result = _evaluate_standard(op, op_str, a, b)
~/anaconda3/lib/python3.8/site-packages/pandas/core/computation/ in _evaluate_standard(op, op_str, a, b)
67 with np.errstate(all="ignore"):
---> 68 return op(a, b)
TypeError: unsupported operand type(s) for -: 'str' and 'str'
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
<ipython-input-31-25d58bc46c86> in <module>
16 for var in vars_with_na:
---> 17 analyse_na_value(house_data, var)
<ipython-input-31-25d58bc46c86> in analyse_na_value(df, var)
1 #we evaluate the price of a house for those cases where the information is missing, for each variable
2 def analyse_na_value(df, var):
----> 3 df - df.copy()
5 # we indicate as a variable as 1 where the observation is missing
~/anaconda3/lib/python3.8/site-packages/pandas/core/ops/ in f(self, other, axis, level, fill_value)
649 if isinstance(other, ABCDataFrame):
650 # Another DataFrame
--> 651 new_data = self._combine_frame(other, na_op, fill_value)
653 elif isinstance(other, ABCSeries):
~/anaconda3/lib/python3.8/site-packages/pandas/core/ in _combine_frame(self, other, func, fill_value)
5864 return func(left, right)
-> 5866 new_data = ops.dispatch_to_series(self, other, _arith_op)
5867 return new_data
~/anaconda3/lib/python3.8/site-packages/pandas/core/ops/ in dispatch_to_series(left, right, func, axis)
273 # _frame_arith_method_with_reindex
--> 275 bm = left._mgr.operate_blockwise(right._mgr, array_op)
276 return type(left)(bm)
~/anaconda3/lib/python3.8/site-packages/pandas/core/internals/ in operate_blockwise(self, other, array_op)
362 Apply array_op blockwise with another (aligned) BlockManager.
363 """
--> 364 return operate_blockwise(self, other, array_op)
366 def apply(self: T, f, align_keys=None, **kwargs) -> T:
~/anaconda3/lib/python3.8/site-packages/pandas/core/internals/ in operate_blockwise(left, right, array_op)
36 lvals, rvals = _get_same_shape_values(blk, rblk, left_ea, right_ea)
---> 38 res_values = array_op(lvals, rvals)
39 if left_ea and not right_ea and hasattr(res_values, "reshape"):
40 res_values = res_values.reshape(1, -1)
~/anaconda3/lib/python3.8/site-packages/pandas/core/ops/ in arithmetic_op(left, right, op)
188 else:
189 with np.errstate(all="ignore"):
--> 190 res_values = na_arithmetic_op(lvalues, rvalues, op)
192 return res_values
~/anaconda3/lib/python3.8/site-packages/pandas/core/ops/ in na_arithmetic_op(left, right, op, is_cmp)
148 # will handle complex numbers incorrectly, see GH#32047
149 raise
--> 150 result = masked_arith_op(left, right, op)
152 if is_cmp and (is_scalar(result) or result is NotImplemented):
~/anaconda3/lib/python3.8/site-packages/pandas/core/ops/ in masked_arith_op(x, y, op)
90 if mask.any():
91 with np.errstate(all="ignore"):
---> 92 result[mask] = op(xrav[mask], yrav[mask])
94 else:
TypeError: unsupported operand type(s) for -: 'str' and 'str'
As far as I know, the copy() function works in Python 3,
but I don't know whether it works
with pandas in Python 3.
How can I get rid of this error without commenting that code line?
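I think you are supposed to do df = df.copy() (an assignment), not df - df.copy(). What you are doing is subtracting the data frame from a copy of itself, which raises this TypeError as soon as a column contains strings. I would recommend changing the variable name though. See the official pandas documentation for DataFrame.copy for details on this function.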

Specifying the columns using strings is only supported for pandas DataFrames

I want to one-hot encode several columns and have tried several solutions, including simple one-hot encoding, ColumnTransformer, make_column_transformer, Pipeline, and get_dummies, but each time I get a different error.
x = dataset.iloc[:, :11].values
y = dataset.iloc[:, 11].values
""" data encoding """
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# oe = OrdinalEncoder()
# x = oe.fit_transform(x)
non_cat = ["Make", "Model", "Vehicle", "Transmission", "Fuel"]
onehot_cat = ColumnTransformer([
("categorical", OrdinalEncoder(), non_cat),
("onehot_categorical", OneHotEncoder(), non_cat)],
remainder= "passthrough")
x = onehot_cat.fit_transform(x)
[['ACURA' 'ILX' 'COMPACT' ... 6.7 8.5 33]
['ACURA' 'ILX' 'COMPACT' ... 7.7 9.6 29]
['ACURA' 'ILX HYBRID' 'COMPACT' ... 5.8 5.9 48]
['VOLVO' 'XC60 T6 AWD' 'SUV - SMALL' ... 8.6 10.3 27]
['VOLVO' 'XC90 T5 AWD' 'SUV - STANDARD' ... 8.3 9.9 29]
['VOLVO' 'XC90 T6 AWD' 'SUV - STANDARD' ... 8.7 10.7 26]]
AttributeError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\sklearn\utils\ in _get_column_indices(X, key)
424 try:
--> 425 all_columns = X.columns
426 except AttributeError:
AttributeError: 'numpy.ndarray' object has no attribute 'columns'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-4-4008371c305f> in <module>
24 ("onehot_categorical", OneHotEncoder(), non_cat)],
25 remainder= "passthrough")
---> 26 x = onehot_cat.fit_transform(x)
28 print('OneHotEncode = ', x.shape)
~\Anaconda3\lib\site-packages\sklearn\compose\ in fit_transform(self, X, y)
527 self._validate_transformers()
528 self._validate_column_callables(X)
--> 529 self._validate_remainder(X)
531 result = self._fit_transform(X, y, _fit_transform_one)
~\Anaconda3\lib\site-packages\sklearn\compose\ in _validate_remainder(self, X)
325 cols = []
326 for columns in self._columns:
--> 327 cols.extend(_get_column_indices(X, columns))
329 remaining_idx = sorted(set(range(self._n_features)) - set(cols))
~\Anaconda3\lib\site-packages\sklearn\utils\ in _get_column_indices(X, key)
425 all_columns = X.columns
426 except AttributeError:
--> 427 raise ValueError("Specifying the columns using strings is only "
428 "supported for pandas DataFrames")
429 if isinstance(key, str):
ValueError: Specifying the columns using strings is only supported for pandas DataFrames
I got a similar error when trying to make a prediction using a model. It was expecting a DataFrame but I was sending a NumPy array instead. So I changed it from:
prediction = monitor_model.predict(s_df.to_numpy())
to:
prediction = monitor_model.predict(s_df)

Dask dataframe valueError

I have several parquet files (dataframes), which I load as one dask dataframe graph and sample.
Afterwards I perform some computations based on the original data in the dataframe and append the new columns to my dask dataframe.
Finally, I want to compute the mean() and std() for all columns, but I get a ValueError and I am not sure where it comes from or what I'm doing wrong.
import pandas as pd
import numpy as np
import tensorflow as tf
import os
from os.path import join
import dask
import dask.dataframe as dd
import dask.array as da
# read in the data
data_pq = dd.read_parquet(join(path_to_data,'filter_width_*_DNN_train.parquet'),chunksize='4GB')
print('Convert to single precission and sample')
data_pq = data_pq.astype(np.float32).sample(frac=0.1)
# ## compute the additional quantites (tensors)
# compute tensors R, S mag(U) etc.
mag_U = da.sqrt(data_pq['U_bar'].values**2 + data_pq['V_bar'].values**2 +data_pq['W_bar'].values**2)
mag_grad_c = da.sqrt(data_pq['grad_c_x_LES'].values**2 + data_pq['grad_c_y_LES'].values**2 +data_pq['grad_c_z_LES'].values**2)
sum_U = data_pq['U_bar'].values + data_pq['V_bar']+data_pq['W_bar'].values
sum_c = da.absolute(data_pq['grad_c_x_LES'].values) + da.absolute(data_pq['grad_c_y_LES'].values) +da.absolute(data_pq['grad_c_z_LES'].values)
grad_U = da.sqrt(data_pq['grad_U_x_LES'].values**2 + data_pq['grad_U_y_LES'].values**2 +data_pq['grad_U_z_LES'].values**2)
grad_V = da.sqrt(data_pq['grad_V_x_LES'].values**2 + data_pq['grad_V_y_LES'].values**2 +data_pq['grad_V_z_LES'].values**2)
grad_W = da.sqrt(data_pq['grad_W_x_LES'].values**2 + data_pq['grad_W_y_LES'].values**2 +data_pq['grad_W_z_LES'].values**2)
mag_grad_U = da.sqrt(grad_U**2 + grad_V**2 +grad_W**2)
sum_grad_U = da.absolute(grad_U) + da.absolute(grad_V) +da.absolute(grad_W)
print('Computing gradient_tensor')
gradient_tensor = da.array([
print('Computing S and R')
# symetric strain
Strain = 0.5*(gradient_tensor + da.transpose(gradient_tensor,(1,0,2)))
#anti symetric strain
Anti = 0.5*(gradient_tensor - da.transpose(gradient_tensor,(1,0,2)))
print('Computing lambdas')
lambda_1 = da.trace(Strain**2)
lambda_2 = da.trace(Anti**2)
lambda_3 = da.trace(Strain**3)
lambda_4 = da.trace(Anti**2 * Strain)
lambda_5 = da.trace(Anti**2 * Strain**2)
# Add to the dask dataframe
data_pq['mag_grad_c'] = mag_grad_c
data_pq['mag_U'] = mag_U
data_pq['sum_c'] = sum_c
data_pq['sum_U'] = sum_U
data_pq['sum_grad_U'] = sum_grad_U
data_pq['mag_grad_U'] = mag_grad_U
data_pq = data_pq.repartition(npartitions=lambda_1.npartitions)
data_pq['lambda_1'] = lambda_1
data_pq['lambda_2'] = lambda_2
data_pq['lambda_3'] = lambda_3
data_pq['lambda_4'] = lambda_4
data_pq['lambda_5'] = lambda_5
print('Done with feature computation')
# reindex and compute mean and std
data_pq = data_pq.reset_index().drop('index',axis=1)
# compute the mean and std
data_mean, data_std = dask.compute(data_pq.mean(),data_pq.std())
Not sure where it comes from. It says the indexes do not match.
This is the error message I get:
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) ~/Python/Data_driven_models/DASK_processing/ in <module>
119 data_pq = data_pq.reset_index().drop('index',axis=1)
--> 121 data_mean, data_std = dask.compute(data_pq.mean(),data_pq.std())
~/.local/lib/python3.6/site-packages/dask/ in compute(*args,
450 postcomputes.append(x.__dask_postcompute__())
--> 452 results = schedule(dsk, keys, **kwargs)
453 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
~/.local/lib/python3.6/site-packages/dask/ in get(dsk, result, cache, num_workers, pool, **kwargs)
82 get_id=_thread_get_id,
83 pack_exception=pack_exception,
---> 84 **kwargs
85 )
~/.local/lib/python3.6/site-packages/dask/ in get_async(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs)
484 _execute_task(task, data) # Re-execute locally
485 else:
--> 486 raise_exception(exc, tb)
487 res, worker_id = loads(res_info)
488 state["cache"][key] = res
~/.local/lib/python3.6/site-packages/dask/ in reraise(exc, tb)
314 if exc.__traceback__ is not tb:
315 raise exc.with_traceback(tb)
--> 316 raise exc
~/.local/lib/python3.6/site-packages/dask/ in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
220 try:
221 task, data = loads(task_info)
--> 222 result = _execute_task(task, data)
223 id = get_id()
224 result = dumps((result, id))
~/.local/lib/python3.6/site-packages/dask/ in
_execute_task(arg, cache, dsk)
119 # temporaries by their reference count and can execute certain
120 # operations in-place.
--> 121 return func(*(_execute_task(a, cache) for a in args))
122 elif not ishashable(arg):
123 return arg
~/.local/lib/python3.6/site-packages/pandas/core/ in
__init__(self, data, index, dtype, name, copy, fastpath)
312 if len(index) != len(data):
313 raise ValueError(
--> 314 f"Length of passed values is {len(data)}, "
315 f"index implies {len(index)}."
316 )
ValueError: Length of passed values is 3728270, index implies 2135992.

slice in xarray gives error 'float' object cannot be interpreted as an integer

I am trying to slice data by longitude using xarray.
The data is in a netcdf file I created from measurements I made.
The xarray.Dataset has the following attributes:
(lat: 1321, lon: 1321)
Data variables:
(lon) float64 '8.413 8.411 8.409 ... 4.904 4.905'
(lat) float64 '47.4 47.4 47.41 ... 52.37 52.37'
(data) float64 ... #dimension: 1321
my code is:
import xarray as xr
obs = xr.open_dataset('')
obs=obs['data'].sel(lon=slice(4.905, 8.413))
The error I get is TypeError: 'float' object cannot be interpreted as an integer
I could not find out whether it is an error in my code, or an error in xarray. I would expect such an error using isel instead of sel. Could not find any solution on here or over at the xarray documentation.
Full error message:
TypeError Traceback (most recent call last)
<ipython-input-434-5b37e4c5d0c6> in <module>
----> 1 obs=obs['data'].sel(lon=slice(4.905, 8.413))
~/opt/anaconda3/lib/python3.7/site-packages/xarray/core/ in sel(self, indexers, method, tolerance, drop, **indexers_kwargs)
1059 method=method,
1060 tolerance=tolerance,
-> 1061 **indexers_kwargs,
1062 )
1063 return self._from_temp_dataset(ds)
~/opt/anaconda3/lib/python3.7/site-packages/xarray/core/ in sel(self, indexers, method, tolerance, drop, **indexers_kwargs)
2066 self, indexers=indexers, method=method, tolerance=tolerance
2067 )
-> 2068 result = self.isel(indexers=pos_indexers, drop=drop)
2069 return result._overwrite_indexes(new_indexes)
~/opt/anaconda3/lib/python3.7/site-packages/xarray/core/ in isel(self, indexers, drop, **indexers_kwargs)
1933 var_indexers = {k: v for k, v in indexers.items() if k in var_value.dims}
1934 if var_indexers:
-> 1935 var_value = var_value.isel(var_indexers)
1936 if drop and var_value.ndim == 0 and var_name in coord_names:
1937 coord_names.remove(var_name)
~/opt/anaconda3/lib/python3.7/site-packages/xarray/core/ in isel(self, indexers, **indexers_kwargs)
1059 key = tuple(indexers.get(dim, slice(None)) for dim in self.dims)
-> 1060 return self[key]
1062 def squeeze(self, dim=None):
~/opt/anaconda3/lib/python3.7/site-packages/xarray/core/ in __getitem__(self, key)
701 array `x.values` directly.
702 """
--> 703 dims, indexer, new_order = self._broadcast_indexes(key)
704 data = as_indexable(self._data)[indexer]
705 if new_order:
~/opt/anaconda3/lib/python3.7/site-packages/xarray/core/ in _broadcast_indexes(self, key)
541 if all(isinstance(k, BASIC_INDEXING_TYPES) for k in key):
--> 542 return self._broadcast_indexes_basic(key)
544 self._validate_indexers(key)
~/opt/anaconda3/lib/python3.7/site-packages/xarray/core/ in _broadcast_indexes_basic(self, key)
568 dim for k, dim in zip(key, self.dims) if not isinstance(k, integer_types)
569 )
--> 570 return dims, BasicIndexer(key), None
572 def _validate_indexers(self, key):
~/opt/anaconda3/lib/python3.7/site-packages/xarray/core/ in __init__(self, key)
369 k = int(k)
370 elif isinstance(k, slice):
--> 371 k = as_integer_slice(k)
372 else:
373 raise TypeError(
~/opt/anaconda3/lib/python3.7/site-packages/xarray/core/ in as_integer_slice(value)
345 def as_integer_slice(value):
--> 346 start = as_integer_or_none(value.start)
347 stop = as_integer_or_none(value.stop)
348 step = as_integer_or_none(value.step)
~/opt/anaconda3/lib/python3.7/site-packages/xarray/core/ in as_integer_or_none(value)
341 def as_integer_or_none(value):
--> 342 return None if value is None else operator.index(value)
I want to select the entire data, because eventually I want to subtract the entire array from a bigger data base with a wider grid. This bigger data base is a NETCDF file as well. And for that one, I managed to slice the longitude with the exact same code I am trying on this smaller data set where I get the error. The only difference is, that the bigger NETCDF uses a float32 format. I don't suspect this could cause the error.
Any help is appreciated. Thank you.
I think I found the problem.
When I created the netcdf file for the observation, I made a mistake in the createDimension part, when I named the lon and lat data. Because of this, lat and lon showed up under 'Data variables' in the netcdf file, where they should show up under 'Coordinates'
wrong was something like:
#Specifying dimensions#
f.createDimension('longitude', len(lon_list))
f.createDimension('latitude', len(lat_list))
#Building variables
longitude = f.createVariable('lon', float, ('lon',), zlib=True)
latitude = f.createVariable('lat', float, ('lat',), zlib=True)
data = f.createVariable('data', float, ('lat','lon'), zlib=True)
correct was:
#Specifying dimensions#
f.createDimension('lon', len(lon_list))
f.createDimension('lat', len(lat_list))
#Building variables
longitude = f.createVariable('lon', float, ('lon',), zlib=True)
latitude = f.createVariable('lat', float, ('lat',), zlib=True)
data = f.createVariable('data', float, ('lat','lon'), zlib=True)
This is a little late but I just ran into a similar issue where I got a similar undecipherable error when trying to slice by a variable.
I think the problem is that if you are trying to slice by a variable that isn't a coordinate you get an error that isn't very informative.
data = data.assign_coords({"lat": data.lat, "lon": data.lon})
would have fixed this without rewriting the netcdf file.

Create linear model to check correlation tokenize error

I have data like the sample below, which has 4 continuous columns [x0 to x3] and a binary column y. y has two values, 1.0 and 0.0. I’m trying to check for correlation between the binary column y and one of the continuous columns, x0, using the CatConCor function below, but I’m getting the error message shown below. The function creates a linear regression model and calculates the p-value for the residuals with and without the categorical variable. If anyone can point out the issue or how to fix it, it would be very much appreciated.
x_r x0 x1 x2 x3 y
0 0 0.466726 0.030126 0.998330 0.892770 0.0
1 1 0.173168 0.525810 -0.079341 -0.112151 0.0
2 2 -0.854467 0.770712 0.929614 -0.224779 0.0
3 3 -0.370574 0.568183 -0.928269 0.843253 0.0
4 4 -0.659431 -0.948491 -0.091534 0.706157 0.0
import numpy as np
import pandas as pd
from time import time
import scipy.stats as stats
from IPython.display import display # Allows the use of display() for DataFrames
# Pretty display for notebooks
%matplotlib inline
# Suppress matplotlib user warnings
# Necessary for newer version of matplotlib
import warnings
warnings.filterwarnings("ignore", category = UserWarning, module = "matplotlib")
# Display inline matplotlib plots with IPython
from IPython import get_ipython
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import as cm
# correlation between categorical variable and continuous variable
def CatConCor(df,catVar,conVar):
import statsmodels.api as sm
from statsmodels.formula.api import ols
# subsetting data for one categorical column and one continuous column
mod = ols(conVar+'~'+catVar,
aov_table = sm.stats.anova_lm(mod, typ=2)
if aov_table['PR(>F)'][0] < 0.05:
print('Correlated p='+str(aov_table['PR(>F)'][0]))
print('Uncorrelated p='+str(aov_table['PR(>F)'][0]))
# checking for correlation between categorical and continuous variables
AssertionError Traceback (most recent call last)
<ipython-input-6-80f83b8c8e14> in <module>()
1 # checking for correlation between categorical and continuous variables
----> 3 CatConCor(df=train_df,catVar='y',conVar='x0')
<ipython-input-2-35404ba1d697> in CatConCor(df, catVar, conVar)
104 mod = ols(conVar+'~'+catVar,
--> 105 data=data2).fit()
107 aov_table = sm.stats.anova_lm(mod, typ=2)
~/anaconda2/envs/py36/lib/python3.6/site-packages/statsmodels/base/ in from_formula(cls, formula, data, subset, drop_cols, *args, **kwargs)
154 tmp = handle_formula_data(data, None, formula, depth=eval_env,
--> 155 missing=missing)
156 ((endog, exog), missing_idx, design_info) = tmp
~/anaconda2/envs/py36/lib/python3.6/site-packages/statsmodels/formula/ in handle_formula_data(Y, X, formula, depth, missing)
63 if data_util._is_using_pandas(Y, None):
64 result = dmatrices(formula, Y, depth, return_type='dataframe',
---> 65 NA_action=na_action)
66 else:
67 result = dmatrices(formula, Y, depth, return_type='dataframe',
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/ in dmatrices(formula_like, data, eval_env, NA_action, return_type)
308 eval_env = EvalEnvironment.capture(eval_env, reference=1)
309 (lhs, rhs) = _do_highlevel_design(formula_like, data, eval_env,
--> 310 NA_action, return_type)
311 if lhs.shape[1] == 0:
312 raise PatsyError("model is missing required outcome variables")
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/ in _do_highlevel_design(formula_like, data, eval_env, NA_action, return_type)
163 return iter([data])
164 design_infos = _try_incr_builders(formula_like, data_iter_maker, eval_env,
--> 165 NA_action)
166 if design_infos is not None:
167 return build_design_matrices(design_infos, data,
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/ in _try_incr_builders(formula_like, data_iter_maker, eval_env, NA_action)
60 "ascii-only, or else upgrade to Python 3.")
61 if isinstance(formula_like, str):
---> 62 formula_like = ModelDesc.from_formula(formula_like)
63 # fallthrough
64 if isinstance(formula_like, ModelDesc):
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/ in from_formula(cls, tree_or_string)
162 tree = tree_or_string
163 else:
--> 164 tree = parse_formula(tree_or_string)
165 value = Evaluator().eval(tree, require_evalexpr=False)
166 assert isinstance(value, cls)
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/ in parse_formula(code, extra_operators)
146 tree = infix_parse(_tokenize_formula(code, operator_strings),
147 operators,
--> 148 _atomic_token_types)
149 if not isinstance(tree, ParseNode) or tree.type != "~":
150 tree = ParseNode("~", None, [tree], tree.origin)
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/ in infix_parse(tokens, operators, atomic_types, trace)
209 want_noun = True
--> 210 for token in token_source:
211 if c.trace:
212 print("Reading next token (want_noun=%r)" % (want_noun,))
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/ in _tokenize_formula(code, operator_strings)
92 else:
93 it.push_back((pytype, token_string, origin))
---> 94 yield _read_python_expr(it, end_tokens)
96 def test__tokenize_formula():
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/ in _read_python_expr(it, end_tokens)
42 origins = []
43 bracket_level = 0
---> 44 for pytype, token_string, origin in it:
45 assert bracket_level >= 0
46 if bracket_level == 0 and token_string in end_tokens:
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/ in next(self)
330 else:
331 # May raise StopIteration
--> 332 return six.advance_iterator(self._it)
333 __next__ = next
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/ in python_tokenize(code)
33 break
34 origin = Origin(code, start, end)
---> 35 assert pytype not in (tokenize.NL, tokenize.NEWLINE)
36 if pytype == tokenize.ERRORTOKEN:
37 raise PatsyError("error tokenizing input "
Upgrading patsy to 0.5.1 fixed the issue. I found the tip here:
