package cvxpy new syntax error summing entries - python-3.x

I’m new to the cvxpy package. I’m trying to use it to work through an example from the following blog:
https://towardsdatascience.com/integer-programming-in-python-1cbdfa240df2
Where we’re trying to optimize the combination of marketing channels sent to a customer.
There’s been some recent changes to the cvxpy package and I’m getting the error below when I try to run the sum_entries step, (which has in the latest version been changed to cvxpy.sum)
I think the problem is coming from the dimensions of “selection” and “TRANSFORMER” being incompatible, but I’m not familiar enough with the cvxpy package to know. Any tips are greatly appreciated.
Code:
test_probs.shape
(200, 8)
Code:
# selection = cvxpy.Bool(*test_probs.shape) # syntax changed in latest version
selection = cvxpy.Variable(*test_probs.shape, boolean=True)
# constraints
# Constant matrix that counts how many of each
# material we sent to each customer
TRANSFORMER = np.array([[1,0,0],
[0,1,0],
[0,0,1],
[1,1,0],
[1,0,1],
[0,1,1],
[1,1,1],
[0,0,0]])
# can't send customer more promotion than there is supply
# note: sum_entries changed to sum in latest cvxpy version
supply_constraint = cvxpy.sum(selection * TRANSFORMER, axis=0) <= supply
Error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-47-f2ebf41a00af> in <module>()
18 # note: sum_entries changed to sum in latest cvxpy version
19
---> 20 supply_constraint = cvxpy.sum(selection * TRANSFORMER, axis=0) <= supply
21
22
~/anaconda2/envs/py36/lib/python3.6/site-packages/cvxpy/expressions/expression.py in cast_op(self, other)
47 """
48 other = self.cast_to_const(other)
---> 49 return binary_op(self, other)
50 return cast_op
51
~/anaconda2/envs/py36/lib/python3.6/site-packages/cvxpy/expressions/expression.py in __mul__(self, other)
385 return cvxtypes.multiply_expr()(self, other)
386 elif self.is_constant() or other.is_constant():
--> 387 return cvxtypes.mul_expr()(self, other)
388 else:
389 warnings.warn("Forming a nonconvex expression.")
~/anaconda2/envs/py36/lib/python3.6/site-packages/cvxpy/atoms/affine/binary_operators.py in __init__(self, lh_exp, rh_exp)
41
42 def __init__(self, lh_exp, rh_exp):
---> 43 super(BinaryOperator, self).__init__(lh_exp, rh_exp)
44
45 def name(self):
~/anaconda2/envs/py36/lib/python3.6/site-packages/cvxpy/atoms/atom.py in __init__(self, *args)
42 self.args = [Atom.cast_to_const(arg) for arg in args]
43 self.validate_arguments()
---> 44 self._shape = self.shape_from_args()
45 if len(self._shape) > 2:
46 raise ValueError("Atoms must be at most 2D.")
~/anaconda2/envs/py36/lib/python3.6/site-packages/cvxpy/atoms/affine/binary_operators.py in shape_from_args(self)
107 """Returns the (row, col) shape of the expression.
108 """
--> 109 return u.shape.mul_shapes(self.args[0].shape, self.args[1].shape)
110
111 def is_atom_convex(self):
~/anaconda2/envs/py36/lib/python3.6/site-packages/cvxpy/utilities/shape.py in mul_shapes(lh_shape, rh_shape)
140 lh_old = lh_shape
141 rh_old = rh_shape
--> 142 lh_shape, rh_shape, shape = mul_shapes_promote(lh_shape, rh_shape)
143 if lh_shape != lh_old:
144 shape = shape[1:]
~/anaconda2/envs/py36/lib/python3.6/site-packages/cvxpy/utilities/shape.py in mul_shapes_promote(lh_shape, rh_shape)
107 if lh_mat_shape[1] != rh_mat_shape[0]:
108 raise ValueError("Incompatible dimensions %s %s" % (
--> 109 lh_shape, rh_shape))
110 if lh_shape[:-2] != rh_shape[:-2]:
111 raise ValueError("Incompatible dimensions %s %s" % (
ValueError: Incompatible dimensions (1, 200) (8, 3)
Update:
I tried changing the selection shape as suggested in the comment below.
code:
selection = cvxpy.Variable(test_probs.shape, boolean=True)
and now I get the new error when I run the supply_constraint part of the code below.
code:
# constraints
# Constant matrix that counts how many of each
# material we sent to each customer
TRANSFORMER = np.array([[1,0,0],
[0,1,0],
[0,0,1],
[1,1,0],
[1,0,1],
[0,1,1],
[1,1,1],
[0,0,0]])
# can't send customer more promotion than there is supply
# note: sum_entries changed to sum in latest cvxpy version
supply_constraint = cvxpy.sum(selection * TRANSFORMER, axis=0) <= supply
Error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-10-6eb7a55ea896> in <module>()
18 # note: sum_entries changed to sum in latest cvxpy version
19
---> 20 supply_constraint = cvxpy.sum(selection * TRANSFORMER, axis=0) <= supply
21
22
~/anaconda2/envs/py36/lib/python3.6/site-packages/cvxpy/expressions/expression.py in cast_op(self, other)
47 """
48 other = self.cast_to_const(other)
---> 49 return binary_op(self, other)
50 return cast_op
51
~/anaconda2/envs/py36/lib/python3.6/site-packages/cvxpy/expressions/expression.py in __le__(self, other)
482 """NonPos : Creates an inequality constraint.
483 """
--> 484 return NonPos(self - other)
485
486 def __lt__(self, other):
~/anaconda2/envs/py36/lib/python3.6/site-packages/cvxpy/expressions/expression.py in cast_op(self, other)
47 """
48 other = self.cast_to_const(other)
---> 49 return binary_op(self, other)
50 return cast_op
51
~/anaconda2/envs/py36/lib/python3.6/site-packages/cvxpy/expressions/expression.py in __sub__(self, other)
370 """Expression : The difference of two expressions.
371 """
--> 372 return self + -other
373
374 #_cast_other
~/anaconda2/envs/py36/lib/python3.6/site-packages/cvxpy/expressions/expression.py in cast_op(self, other)
47 """
48 other = self.cast_to_const(other)
---> 49 return binary_op(self, other)
50 return cast_op
51
~/anaconda2/envs/py36/lib/python3.6/site-packages/cvxpy/expressions/expression.py in __add__(self, other)
358 """Expression : Sum two expressions.
359 """
--> 360 return cvxtypes.add_expr()([self, other])
361
362 #_cast_other
~/anaconda2/envs/py36/lib/python3.6/site-packages/cvxpy/atoms/affine/add_expr.py in __init__(self, arg_groups)
34 # For efficiency group args as sums.
35 self._arg_groups = arg_groups
---> 36 super(AddExpression, self).__init__(*arg_groups)
37 self.args = []
38 for group in arg_groups:
~/anaconda2/envs/py36/lib/python3.6/site-packages/cvxpy/atoms/atom.py in __init__(self, *args)
42 self.args = [Atom.cast_to_const(arg) for arg in args]
43 self.validate_arguments()
---> 44 self._shape = self.shape_from_args()
45 if len(self._shape) > 2:
46 raise ValueError("Atoms must be at most 2D.")
~/anaconda2/envs/py36/lib/python3.6/site-packages/cvxpy/atoms/affine/add_expr.py in shape_from_args(self)
42 """Returns the (row, col) shape of the expression.
43 """
---> 44 return u.shape.sum_shapes([arg.shape for arg in self.args])
45
46 def expand_args(self, expr):
~/anaconda2/envs/py36/lib/python3.6/site-packages/cvxpy/utilities/shape.py in sum_shapes(shapes)
50 raise ValueError(
51 "Cannot broadcast dimensions " +
---> 52 len(shapes)*" %s" % tuple(shapes))
53
54 longer = shape if len(shape) >= len(t) else t
ValueError: Cannot broadcast dimensions (3,) (1, 3)

Your issue is happening when you create the selection variable. You are unpacking the shape tuple into multiple arguments. The first argument to Variable should be a shape. So the correct construction is:
selection = cvxpy.Variable(test_probs.shape, boolean=True)
You can verify this is correct by inspecting the shape attribute:
selection.shape
Which should now give:
(200, 8)

Related

Unsupported operand types with df.copy() method

I'm uploading my dataset, and I'm copying my dataset, but an error is appearing.
import numpy as np
import pandas as pd
import mathplotlib.pyplot as plt
house_data=pd.read_csv("/home/houseprice.csv")
#we evaluate the price of a house for those cases where the information is missing, for each variable
def analyse_na_value(df, var):
df - df.copy()
# we indicate as a variable as 1 where the observation is missing
# we indicate as 0 where the observation has a real value
df[var] = np.where(df[var].isnull(), 1 , 0)
#print(df[var].isnull())
# we calculate the mean saleprice where the information is missing or present
df.groupby(var)['SalePrice'].median().plot.bar()
plt.title(var)
plt.show()
for var in vars_with_na:
analyse_na_value(house_data, var)
error,when I comment this code line, I don't get an error
df - df.copy()
TypeError Traceback (most recent call last)
~/anaconda3/lib/python3.8/site-packages/pandas/core/ops/array_ops.py in na_arithmetic_op(left, right, op, is_cmp)
142 try:
--> 143 result = expressions.evaluate(op, left, right)
144 except TypeError:
~/anaconda3/lib/python3.8/site-packages/pandas/core/computation/expressions.py in evaluate(op, a, b, use_numexpr)
232 if use_numexpr:
--> 233 return _evaluate(op, op_str, a, b) # type: ignore
234 return _evaluate_standard(op, op_str, a, b)
~/anaconda3/lib/python3.8/site-packages/pandas/core/computation/expressions.py in _evaluate_numexpr(op, op_str, a, b)
118 if result is None:
--> 119 result = _evaluate_standard(op, op_str, a, b)
120
~/anaconda3/lib/python3.8/site-packages/pandas/core/computation/expressions.py in _evaluate_standard(op, op_str, a, b)
67 with np.errstate(all="ignore"):
---> 68 return op(a, b)
69
TypeError: unsupported operand type(s) for -: 'str' and 'str'
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
<ipython-input-31-25d58bc46c86> in <module>
15
16 for var in vars_with_na:
---> 17 analyse_na_value(house_data, var)
<ipython-input-31-25d58bc46c86> in analyse_na_value(df, var)
1 #we evaluate the price of a house for those cases where the information is missing, for each variable
2 def analyse_na_value(df, var):
----> 3 df - df.copy()
4
5 # we indicate as a variable as 1 where the observation is missing
~/anaconda3/lib/python3.8/site-packages/pandas/core/ops/__init__.py in f(self, other, axis, level, fill_value)
649 if isinstance(other, ABCDataFrame):
650 # Another DataFrame
--> 651 new_data = self._combine_frame(other, na_op, fill_value)
652
653 elif isinstance(other, ABCSeries):
~/anaconda3/lib/python3.8/site-packages/pandas/core/frame.py in _combine_frame(self, other, func, fill_value)
5864 return func(left, right)
5865
-> 5866 new_data = ops.dispatch_to_series(self, other, _arith_op)
5867 return new_data
5868
~/anaconda3/lib/python3.8/site-packages/pandas/core/ops/__init__.py in dispatch_to_series(left, right, func, axis)
273 # _frame_arith_method_with_reindex
274
--> 275 bm = left._mgr.operate_blockwise(right._mgr, array_op)
276 return type(left)(bm)
277
~/anaconda3/lib/python3.8/site-packages/pandas/core/internals/managers.py in operate_blockwise(self, other, array_op)
362 Apply array_op blockwise with another (aligned) BlockManager.
363 """
--> 364 return operate_blockwise(self, other, array_op)
365
366 def apply(self: T, f, align_keys=None, **kwargs) -> T:
~/anaconda3/lib/python3.8/site-packages/pandas/core/internals/ops.py in operate_blockwise(left, right, array_op)
36 lvals, rvals = _get_same_shape_values(blk, rblk, left_ea, right_ea)
37
---> 38 res_values = array_op(lvals, rvals)
39 if left_ea and not right_ea and hasattr(res_values, "reshape"):
40 res_values = res_values.reshape(1, -1)
~/anaconda3/lib/python3.8/site-packages/pandas/core/ops/array_ops.py in arithmetic_op(left, right, op)
188 else:
189 with np.errstate(all="ignore"):
--> 190 res_values = na_arithmetic_op(lvalues, rvalues, op)
191
192 return res_values
~/anaconda3/lib/python3.8/site-packages/pandas/core/ops/array_ops.py in na_arithmetic_op(left, right, op, is_cmp)
148 # will handle complex numbers incorrectly, see GH#32047
149 raise
--> 150 result = masked_arith_op(left, right, op)
151
152 if is_cmp and (is_scalar(result) or result is NotImplemented):
~/anaconda3/lib/python3.8/site-packages/pandas/core/ops/array_ops.py in masked_arith_op(x, y, op)
90 if mask.any():
91 with np.errstate(all="ignore"):
---> 92 result[mask] = op(xrav[mask], yrav[mask])
93
94 else:
TypeError: unsupported operand type(s) for -: 'str' and 'str'
1
As far to what I know the copy() function works with python3,
but in pandas,
and python3 does it work I don't know.
How can I get rid of this error without commenting that code line?
I think you are supposed to do df = df.copy(). I would recommend changing the variable though. Here is an official Pandas documentation on this function. What you are doing is subtracting the data frame from itself...

eli5 explain_weights_xgboost KeyError: 'bias'

I am new to xgboost, I trained a model, that works pretty well. Now I am trying to use eli5 to see the weights and I get: KeyError: 'bias'
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
in
3 clf6 = model6.named_steps['clf']
4 vec6 = model6.named_steps['transformer']
----> 5 explain_weights_xgboost(clf6, vec=vec6)
~/dev/envs/env3.7/lib/python3.7/site-packages/eli5/xgboost.py in explain_weights_xgboost(xgb, vec, top, target_names, targets, feature_names, feature_re, feature_filter, importance_type)
80 description=DESCRIPTION_XGBOOST,
81 is_regression=is_regression,
---> 82 num_features=coef.shape[-1],
83 )
84
~/dev/envs/env3.7/lib/python3.7/site-packages/eli5/_feature_importances.py in get_feature_importance_explanation(estimator, vec, coef, feature_names, feature_filter, feature_re, top, description, is_regression, estimator_feature_names, num_features, coef_std)
35 feature_filter=feature_filter,
36 feature_re=feature_re,
---> 37 num_features=num_features,
38 )
39 feature_importances = get_feature_importances_filtered(
~/dev/envs/env3.7/lib/python3.7/site-packages/eli5/sklearn/utils.py in get_feature_names_filtered(clf, vec, bias_name, feature_names, num_features, feature_filter, feature_re, estimator_feature_names)
124 feature_names=feature_names,
125 num_features=num_features,
--> 126 estimator_feature_names=estimator_feature_names,
127 )
128 return feature_names.handle_filter(feature_filter, feature_re)
~/dev/envs/env3.7/lib/python3.7/site-packages/eli5/sklearn/utils.py in get_feature_names(clf, vec, bias_name, feature_names, num_features, estimator_feature_names)
77 features are named x0, x1, x2, etc.
78 """
---> 79 if not has_intercept(clf):
80 bias_name = None
81
~/dev/envs/env3.7/lib/python3.7/site-packages/eli5/sklearn/utils.py in has_intercept(estimator)
60 if hasattr(estimator, 'fit_intercept'):
61 return estimator.fit_intercept
---> 62 if hasattr(estimator, 'intercept_'):
63 if estimator.intercept_ is None:
64 return False
~/dev/envs/env3.7/lib/python3.7/site-packages/xgboost/sklearn.py in intercept_(self)
743 .format(self.booster))
744 b = self.get_booster()
--> 745 return np.array(json.loads(b.get_dump(dump_format='json')[0])['bias'])
746
747
KeyError: 'bias'
Thank you!
I had the same issue and fixed it by specifying explicitly the argument booster when creating the estimator:
clf = XGBClassifier(booster='gbtree')

AttributeError: 'MpoImageFile' object has no attribute 'shape'

images, labels = next(iter(self.loader))
grid = torchvision.utils.make_grid(images)
images, labels = next(iter(self.loader))
triggers the error.
I have a custom dataset class where I load each image (RGB) from an url :
image = Image.open(urllib.request.urlopen(URL))
and I apply some albumentations transforms.
The code works when I read an image for which I have a path using cv2.
However, it doesn't work when I read an image from the url.
Note that I verified that the urls aren't broken.
Here's the traceback:
/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py in __next__(self)
344 def __next__(self):
345 index = self._next_index() # may raise StopIteration
--> 346 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
347 if self._pin_memory:
348 data = _utils.pin_memory.pin_memory(data)
/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
42 def fetch(self, possibly_batched_index):
43 if self.auto_collation:
---> 44 data = [self.dataset[idx] for idx in possibly_batched_index]
45 else:
46 data = self.dataset[possibly_batched_index]
/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/fetch.py in <listcomp>(.0)
42 def fetch(self, possibly_batched_index):
43 if self.auto_collation:
---> 44 data = [self.dataset[idx] for idx in possibly_batched_index]
45 else:
46 data = self.dataset[possibly_batched_index]
/content/transform_dataset.py in __getitem__(self, idx)
49 labels = torch.from_numpy(item[2:].values.astype("float32"))
50 #print("self.root,item,self.image_transform,self.transform,self.size", self.root,item,self.image_transform,self.transform,self.size)
---> 51 image = load_image(self.root,item.ID,item.URL,self.image_transform)
52 return image, labels
53
/content/transform_dataset.py in load_image(root, ID, URL, image_transform)
81 print(image.shape)
82 image = cv2.cvtColor(image,cv2.COLOR_BGR2RGB)
---> 83 image = image_transform(image=image)["image"]
84 return image
/usr/local/lib/python3.6/dist-packages/albumentations/core/composition.py in __call__(self, **data)
169 convert_keypoints_to_albumentations, data)
170
--> 171 data = t(**data)
172
173 if dual_start_end is not None and idx == dual_start_end[1]:
/usr/local/lib/python3.6/dist-packages/albumentations/core/transforms_interface.py in __call__(self, **kwargs)
26 if (random.random() < self.p) or self.always_apply:
27 params = self.get_params()
---> 28 params = self.update_params(params, **kwargs)
29 if self.targets_as_params:
30 targets_as_params = {k: kwargs[k] for k in self.targets_as_params}
/usr/local/lib/python3.6/dist-packages/albumentations/core/transforms_interface.py in update_params(self, params, **kwargs)
66 if hasattr(self, 'interpolation'):
67 params['interpolation'] = self.interpolation
---> 68 params.update({'cols': kwargs['image'].shape[1], 'rows': kwargs['image'].shape[0]})
69 return params
70
AttributeError: 'MpoImageFile' object has no attribute 'shape'
In order to work with albumentations, you must pass a numpy array to the transforms not a PIL image. So:
image = Image.open(urllib.request.urlopen(URL))
image = np.array(image)

Graph coloring with CVXPY set up constraints

I am trying to code the constraints for the graph coloring problem using CVXPY. I am pretty new to mixed-integer programming (MIP), and I am having some difficulty specifying the constraints. I have input data like the “edges” list below. Where each tuple in the list is an edge that starts on one node and ends on the other. For example the first edge starts on node “0” and ends on node “1”.
The constraints I would like to specify are:
each node can have only one color.
vertices sharing one edge should have different colors.
each node must have at least one color.
The objective function would be the number of colors used.
For a selection variable I want to specify the color each node takes.
I’m using the colors array to multiply with the selection variable and then sum for each node, to select the color for the node and specify that each node can only have one color.
I’m getting the error below. Am I constructing the selection variable correctly?
Also this seems like a lot of work, does anyone see a simpler way to do this? Or does anyone know of an example of solving the graph coloring problem with CVXPY?
One more thing, if I transpose the colors array the error goes away.
Input data:
edges
[(0, 1), (1, 2), (1, 3)]
code:
# selection variable
# 4 is the number of nodes
selection = cvxpy.Variable(4, boolean=True)
# array for colors
colors = np.array([[1,0,0,0],
[0,1,0,0],
[0,0,1,0],
[0,0,0,1],
[1,1,0,0],
[1,1,1,0],
[1,1,1,1],
[0,1,1,1],
[0,0,1,1],
[1,0,1,0],
[0,1,0,1],
[0,1,1,0],
[1,1,0,0],
[0,0,0,0]])
# each node can only have one color constraint
one_color = cvxpy.sum(selection * colors, axis=0) == 1
error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-18-b22226c89a2f> in <module>()
1 # each node can only have one color constraint
2
----> 3 one_color = cvxpy.sum(selection * colors, axis=0) == 1
~/anaconda2/envs/py36/lib/python3.6/site-packages/cvxpy/expressions/expression.py in cast_op(self, other)
47 """
48 other = self.cast_to_const(other)
---> 49 return binary_op(self, other)
50 return cast_op
51
~/anaconda2/envs/py36/lib/python3.6/site-packages/cvxpy/expressions/expression.py in __mul__(self, other)
385 return cvxtypes.multiply_expr()(self, other)
386 elif self.is_constant() or other.is_constant():
--> 387 return cvxtypes.mul_expr()(self, other)
388 else:
389 warnings.warn("Forming a nonconvex expression.")
~/anaconda2/envs/py36/lib/python3.6/site-packages/cvxpy/atoms/affine/binary_operators.py in __init__(self, lh_exp, rh_exp)
41
42 def __init__(self, lh_exp, rh_exp):
---> 43 super(BinaryOperator, self).__init__(lh_exp, rh_exp)
44
45 def name(self):
~/anaconda2/envs/py36/lib/python3.6/site-packages/cvxpy/atoms/atom.py in __init__(self, *args)
42 self.args = [Atom.cast_to_const(arg) for arg in args]
43 self.validate_arguments()
---> 44 self._shape = self.shape_from_args()
45 if len(self._shape) > 2:
46 raise ValueError("Atoms must be at most 2D.")
~/anaconda2/envs/py36/lib/python3.6/site-packages/cvxpy/atoms/affine/binary_operators.py in shape_from_args(self)
107 """Returns the (row, col) shape of the expression.
108 """
--> 109 return u.shape.mul_shapes(self.args[0].shape, self.args[1].shape)
110
111 def is_atom_convex(self):
~/anaconda2/envs/py36/lib/python3.6/site-packages/cvxpy/utilities/shape.py in mul_shapes(lh_shape, rh_shape)
140 lh_old = lh_shape
141 rh_old = rh_shape
--> 142 lh_shape, rh_shape, shape = mul_shapes_promote(lh_shape, rh_shape)
143 if lh_shape != lh_old:
144 shape = shape[1:]
~/anaconda2/envs/py36/lib/python3.6/site-packages/cvxpy/utilities/shape.py in mul_shapes_promote(lh_shape, rh_shape)
107 if lh_mat_shape[1] != rh_mat_shape[0]:
108 raise ValueError("Incompatible dimensions %s %s" % (
--> 109 lh_shape, rh_shape))
110 if lh_shape[:-2] != rh_shape[:-2]:
111 raise ValueError("Incompatible dimensions %s %s" % (
ValueError: Incompatible dimensions (1, 4) (14, 4)

Create linear model to check correlation tokenize error

I have data like the sample below, which has 4 continuous columns [x0 to x3] and a binary column y. y has two values 1.0 and 0.0. I’m trying to check for correlation between the binary column y and one of the continuous columns x0, using the CatConCor function below, but I’m getting the error message below. The function creates a linear regression model and calcs the p value for the residuals with and without the categorical variable. If anyone can please point out the issue or how to fix it, it would be very much appreciated.
Data:
x_r x0 x1 x2 x3 y
0 0 0.466726 0.030126 0.998330 0.892770 0.0
1 1 0.173168 0.525810 -0.079341 -0.112151 0.0
2 2 -0.854467 0.770712 0.929614 -0.224779 0.0
3 3 -0.370574 0.568183 -0.928269 0.843253 0.0
4 4 -0.659431 -0.948491 -0.091534 0.706157 0.0
Code:
import numpy as np
import pandas as pd
from time import time
import scipy.stats as stats
from IPython.display import display # Allows the use of display() for DataFrames
# Pretty display for notebooks
%matplotlib inline
###########################################
# Suppress matplotlib user warnings
# Necessary for newer version of matplotlib
import warnings
warnings.filterwarnings("ignore", category = UserWarning, module = "matplotlib")
#
# Display inline matplotlib plots with IPython
from IPython import get_ipython
get_ipython().run_line_magic('matplotlib', 'inline')
###########################################
import matplotlib.pyplot as plt
import matplotlib.cm as cm
# correlation between categorical variable and continuous variable
def CatConCor(df,catVar,conVar):
import statsmodels.api as sm
from statsmodels.formula.api import ols
# subsetting data for one categorical column and one continuous column
data2=df.copy()[[catVar,conVar]]
data2[catVar]=data2[catVar].astype('category')
mod = ols(conVar+'~'+catVar,
data=data2).fit()
aov_table = sm.stats.anova_lm(mod, typ=2)
if aov_table['PR(>F)'][0] < 0.05:
print('Correlated p='+str(aov_table['PR(>F)'][0]))
else:
print('Uncorrelated p='+str(aov_table['PR(>F)'][0]))
# checking for correlation between categorical and continuous variables
CatConCor(df=train_df,catVar='y',conVar='x0')
Error:
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
<ipython-input-6-80f83b8c8e14> in <module>()
1 # checking for correlation between categorical and continuous variables
2
----> 3 CatConCor(df=train_df,catVar='y',conVar='x0')
<ipython-input-2-35404ba1d697> in CatConCor(df, catVar, conVar)
103
104 mod = ols(conVar+'~'+catVar,
--> 105 data=data2).fit()
106
107 aov_table = sm.stats.anova_lm(mod, typ=2)
~/anaconda2/envs/py36/lib/python3.6/site-packages/statsmodels/base/model.py in from_formula(cls, formula, data, subset, drop_cols, *args, **kwargs)
153
154 tmp = handle_formula_data(data, None, formula, depth=eval_env,
--> 155 missing=missing)
156 ((endog, exog), missing_idx, design_info) = tmp
157
~/anaconda2/envs/py36/lib/python3.6/site-packages/statsmodels/formula/formulatools.py in handle_formula_data(Y, X, formula, depth, missing)
63 if data_util._is_using_pandas(Y, None):
64 result = dmatrices(formula, Y, depth, return_type='dataframe',
---> 65 NA_action=na_action)
66 else:
67 result = dmatrices(formula, Y, depth, return_type='dataframe',
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/highlevel.py in dmatrices(formula_like, data, eval_env, NA_action, return_type)
308 eval_env = EvalEnvironment.capture(eval_env, reference=1)
309 (lhs, rhs) = _do_highlevel_design(formula_like, data, eval_env,
--> 310 NA_action, return_type)
311 if lhs.shape[1] == 0:
312 raise PatsyError("model is missing required outcome variables")
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/highlevel.py in _do_highlevel_design(formula_like, data, eval_env, NA_action, return_type)
163 return iter([data])
164 design_infos = _try_incr_builders(formula_like, data_iter_maker, eval_env,
--> 165 NA_action)
166 if design_infos is not None:
167 return build_design_matrices(design_infos, data,
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/highlevel.py in _try_incr_builders(formula_like, data_iter_maker, eval_env, NA_action)
60 "ascii-only, or else upgrade to Python 3.")
61 if isinstance(formula_like, str):
---> 62 formula_like = ModelDesc.from_formula(formula_like)
63 # fallthrough
64 if isinstance(formula_like, ModelDesc):
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/desc.py in from_formula(cls, tree_or_string)
162 tree = tree_or_string
163 else:
--> 164 tree = parse_formula(tree_or_string)
165 value = Evaluator().eval(tree, require_evalexpr=False)
166 assert isinstance(value, cls)
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/parse_formula.py in parse_formula(code, extra_operators)
146 tree = infix_parse(_tokenize_formula(code, operator_strings),
147 operators,
--> 148 _atomic_token_types)
149 if not isinstance(tree, ParseNode) or tree.type != "~":
150 tree = ParseNode("~", None, [tree], tree.origin)
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/infix_parser.py in infix_parse(tokens, operators, atomic_types, trace)
208
209 want_noun = True
--> 210 for token in token_source:
211 if c.trace:
212 print("Reading next token (want_noun=%r)" % (want_noun,))
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/parse_formula.py in _tokenize_formula(code, operator_strings)
92 else:
93 it.push_back((pytype, token_string, origin))
---> 94 yield _read_python_expr(it, end_tokens)
95
96 def test__tokenize_formula():
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/parse_formula.py in _read_python_expr(it, end_tokens)
42 origins = []
43 bracket_level = 0
---> 44 for pytype, token_string, origin in it:
45 assert bracket_level >= 0
46 if bracket_level == 0 and token_string in end_tokens:
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/util.py in next(self)
330 else:
331 # May raise StopIteration
--> 332 return six.advance_iterator(self._it)
333 __next__ = next
334
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/tokens.py in python_tokenize(code)
33 break
34 origin = Origin(code, start, end)
---> 35 assert pytype not in (tokenize.NL, tokenize.NEWLINE)
36 if pytype == tokenize.ERRORTOKEN:
37 raise PatsyError("error tokenizing input "
AssertionError:
Upgrading patsy to 0.5.1 fixed the issue. I found the tip here:
https://github.com/statsmodels/statsmodels/issues/5343

Resources