Adding band description to rioxarray to_raster() - python-3.x

I've seen that one can add band descriptions to a geotiff image using rasterio [1]. How would I do the same thing when saving an array to a raster with rioxarray?
I tried adding the names as coords, but when I save and re-open the raster, the bands are named [1, 2, 3, 4] instead of ['R', 'G', 'B', 'NIR'].
import numpy as np
import xarray as xa
import rioxarray as rioxa
# Band labels that should end up as band names in the GeoTIFF.
bands = ['R', 'G', 'B', 'NIR']
# Random integer image with one 400x400 layer per band label.
im_arr = np.random.randint(0, 255, size=(len(bands), 400, 400))
pixel_coords = {
    'x': np.arange(0, 400),
    'y': np.arange(0, 400),
    'band': bands,
}
im_save = xa.DataArray(im_arr, dims=('band', 'y', 'x'), coords=pixel_coords)

# Round trip: write the array to a GeoTIFF, read it back and inspect it.
path = 'test.tiff'
im_save.rio.to_raster(path)
im_load = rioxa.open_rasterio(path)
print(im_load)
<xarray.DataArray (band: 4, y: 400, x: 400)> [640000 values with dtype=int32]
Coordinates:
band (band) int32 1 2 3 4
y (y) float64 0.0 1.0 2.0 3.0 4.0 ... 396.0 397.0 398.0 399.0
x (x) float64 0.0 1.0 2.0 3.0 4.0 ... 396.0 397.0 398.0 399.0
spatial_ref int32 0
Attributes:
scale_factor: 1.0
add_offset: 0.0
grid_mapping: spatial_ref

You should consider switching from a 3d DataArray to a Dataset with 4 variables, each representing a separate band.
If you name the variables correctly, it should get written to the tiff:
import numpy as np
import xarray as xa
import rioxarray as rioxa
bands = ['R', 'G', 'B', 'NIR']
xa_dataset = xa.Dataset()
# One 2-D data variable per band: the variable name is what gets written
# to the GeoTIFF as the band description.
for band in bands:
    xa_dataset[band] = xa.DataArray(
        np.random.randint(0, 255, (400, 400), dtype="uint8"),
        dims=('y', 'x'),
        coords={'x': np.arange(0, 400), 'y': np.arange(0, 400)},
    )
# see the structure
print(xa_dataset)
# <xarray.Dataset>
# Dimensions: (x: 400, y: 400)
# Coordinates:
# * x (x) int64 0 1 2 3 4 5 6 7 8 ... 391 392 393 394 395 396 397 398 399
# * y (y) int64 0 1 2 3 4 5 6 7 8 ... 391 392 393 394 395 396 397 398 399
# Data variables:
# R (y, x) uint8 18 41 126 79 64 215 105 ... 29 137 243 23 150 23 224
# G (y, x) uint8 1 18 90 195 45 8 150 68 ... 96 194 22 58 118 210 198
# B (y, x) uint8 125 90 165 226 153 253 212 ... 162 217 221 162 18 17
# NIR (y, x) uint8 161 195 149 168 40 182 146 ... 18 114 38 119 23 110 26
# write to disk
xa_dataset.rio.to_raster("test.tiff")
# load
im_load = rioxa.open_rasterio('test.tiff')
print(im_load)
# <xarray.DataArray (band: 4, y: 400, x: 400)>
# [640000 values with dtype=uint8]
# Coordinates:
# * band (band) int64 1 2 3 4
# * y (y) float64 0.0 1.0 2.0 3.0 4.0 ... 396.0 397.0 398.0 399.0
# * x (x) float64 0.0 1.0 2.0 3.0 4.0 ... 396.0 397.0 398.0 399.0
# spatial_ref int64 0
# Attributes:
# scale_factor: 1.0
# add_offset: 0.0
# long_name: ('R', 'G', 'B', 'NIR')
# grid_mapping: spatial_ref
You can see the band names are now included in the attributes as long_name.
Running gdalinfo, you can see the band description has been set:
Driver: GTiff/GeoTIFF
Files: test.tiff
Size is 400, 400
Origin = (-0.500000000000000,-0.500000000000000)
Pixel Size = (1.000000000000000,1.000000000000000)
Image Structure Metadata:
INTERLEAVE=PIXEL
Corner Coordinates:
Upper Left ( -0.5000000, -0.5000000)
Lower Left ( -0.500, 399.500)
Upper Right ( 399.500, -0.500)
Lower Right ( 399.500, 399.500)
Center ( 199.500, 199.500)
Band 1 Block=400x5 Type=Byte, ColorInterp=Red
Description = R
Mask Flags: PER_DATASET ALPHA
Band 2 Block=400x5 Type=Byte, ColorInterp=Green
Description = G
Mask Flags: PER_DATASET ALPHA
Band 3 Block=400x5 Type=Byte, ColorInterp=Blue
Description = B
Mask Flags: PER_DATASET ALPHA
Band 4 Block=400x5 Type=Byte, ColorInterp=Alpha
Description = NIR

Related

TypeError: float() argument must be a string or a number, not 'Timestamp' to REGRESSION LINEAR

This is my code and it has an error that I don't know how to fix
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from time import time
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
Data
proof_df = pd.read_excel("WORK_NUM_3.xlsx")
Show information (only 10 of 1550)
proof_df.head(10)
results
ORDEN DATA N1 N2 N3 N4 N5
0 1 1994-03-13 25 45 60 76 79
1 2 1994-03-17 13 30 58 63 64
2 3 1994-03-20 5 15 32 33 48
3 4 1994-03-24 27 57 60 61 77
4 5 1994-03-27 19 44 53 54 71
5 6 1994-04-03 4 45 54 65 67
6 7 1994-04-07 9 21 37 42 68
7 8 1994-04-10 5 16 26 28 62
8 9 1994-04-14 4 15 44 64 73
9 10 1994-04-17 20 32 49 54 62
declare variables
# Targets: every column from position 2 onwards (N1..N5 in the preview above).
# The original sliced up to len(quina_df.columns), but quina_df is never
# defined in this snippet; proof_df is the frame that was loaded.
y = proof_df.iloc[:, 2:]
# Features: the draw number and the draw date.
X = proof_df[['ORDEN','DATA']]
# regression algorithm
regresor = SVR(kernel='linear')
hora_inicio = time()
# train of algorithm
# NOTE(review): X_train / y_train are not defined in this snippet; presumably
# they come from train_test_split(X, y) - confirm against the full notebook.
regresor.fit(X_train.values, y_train.values.ravel())
# f-string prefix added so the elapsed time is actually interpolated.
print(f'train finish in {time() - hora_inicio} segundos')
result:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Input In [37], in <cell line: 2>()
1 # Entrenamiento del algoritmo
----> 2 regresor.fit(X_train.values, y_train.values.ravel())
3 print('Entrenamiento finalizado en {time() - hora_inicio} segundos')
File ~\anaconda3\lib\site-packages\sklearn\svm\_base.py:190, in BaseLibSVM.fit(self, X, y, sample_weight)
188 check_consistent_length(X, y)
189 else:
--> 190 X, y = self._validate_data(
191 X,
192 y,
193 dtype=np.float64,
194 order="C",
195 accept_sparse="csr",
196 accept_large_sparse=False,
197 )
199 y = self._validate_targets(y)
201 sample_weight = np.asarray(
202 [] if sample_weight is None else sample_weight, dtype=np.float64
203 )
File ~\anaconda3\lib\site-packages\sklearn\base.py:581, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params)
579 y = check_array(y, **check_y_params)
580 else:
--> 581 X, y = check_X_y(X, y, **check_params)
582 out = X, y
584 if not no_val_X and check_params.get("ensure_2d", True):
File ~\anaconda3\lib\site-packages\sklearn\utils\validation.py:964, in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
961 if y is None:
962 raise ValueError("y cannot be None")
--> 964 X = check_array(
965 X,
966 accept_sparse=accept_sparse,
967 accept_large_sparse=accept_large_sparse,
968 dtype=dtype,
969 order=order,
970 copy=copy,
971 force_all_finite=force_all_finite,
972 ensure_2d=ensure_2d,
973 allow_nd=allow_nd,
974 ensure_min_samples=ensure_min_samples,
975 ensure_min_features=ensure_min_features,
976 estimator=estimator,
977 )
979 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric)
981 check_consistent_length(X, y)
File ~\anaconda3\lib\site-packages\sklearn\utils\validation.py:746, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
744 array = array.astype(dtype, casting="unsafe", copy=False)
745 else:
--> 746 array = np.asarray(array, order=order, dtype=dtype)
747 except ComplexWarning as complex_warning:
748 raise ValueError(
749 "Complex data not supported\n{}\n".format(array)
750 ) from complex_warning
TypeError: float() argument must be a string or a number, not 'Timestamp'
I am trying to use the dates as the variable (X) that the model receives, since the variable (y) holds the other results.
I would really appreciate any help understanding what is going on.

CRS error while clipping rioxarray to shapefile

I'm trying to clip a rioxarray dataset to a shapefile, but get the following error:
> data_clipped = data.rio.clip(shape.geometry.apply(mapping))
MissingCRS: CRS not found. Please set the CRS with 'set_crs()' or 'write_crs()'. Data variable: precip
This error seems straightforward, but I can't figure out which CRS needs to be set. Both the dataset and the shapefile have CRS values that rio can find:
> print(data.rio.crs)
EPSG:4326
> print(shape.crs)
epsg:4326
The dataarray within the dataset, called 'precip', does not have a CRS, but it also doesn't seem to respond to the set_crs() command:
> print(data.precip.rio.crs)
None
> data.precip.rio.set_crs(data.rio.crs)
> print(data.precip.rio.crs)
None
What am I missing here?
For reference, rioxarray set_crs() documentation - this shows set_crs() working on data arrays, unlike my experience with data.precip
My data, in case I have something unusual:
> print(data)
<xarray.Dataset>
Dimensions: (x: 541, y: 411)
Coordinates:
* y (y) float64 75.0 74.9 74.8 74.7 74.6 ... 34.3 34.2 34.1 34.0
* x (x) float64 -12.0 -11.9 -11.8 -11.7 ... 41.7 41.8 41.9 42.0
time object 2020-01-01 00:00:00
spatial_ref int64 0
Data variables:
precip (y, x) float64 nan nan nan ... 1.388e-17 1.388e-17 1.388e-17
Attributes:
Conventions: CF-1.6
history: 2021-01-05 01:36:52 GMT by grib_to_netcdf-2.16.0: /opt/ecmw...
> print(shape)
ID name orgn_name geometry
0 Albania Shqipëria MULTIPOLYGON (((19.50115 40.96230, 19.50563 40...
1 Andorra Andorra POLYGON ((1.43992 42.60649, 1.45041 42.60596, ...
2 Austria Österreich POLYGON ((16.00000 48.77775, 16.00000 48.78252...
This issue is resolved if the set_crs() is used in the same command as the clip operation:
data_clipped = data.precip.rio.set_crs('WGS84').rio.clip(shape.geometry.apply(mapping))

How do I change the colour for a range of values in the code based on the condition

So I have a dataset that looks as such
ACCx ACCy ACCz ECG RESP LABEL BINARY
0 0.9554 -0.2220 -0.5580 0.021423 -1.148987 0.0 0
1 0.9258 -0.2216 -0.5538 0.020325 -1.124573 0.0 0
2 0.9082 -0.2196 -0.5392 0.016525 -1.152039 0.0 0
3 0.8974 -0.2102 -0.5122 0.016708 -1.158142 0.0 0
4 0.8882 -0.2036 -0.4824 0.011673 -1.161194 0.0 0
... ... ... ... ... ... ... ...
695 0.9134 -0.1400 0.1074 0.003479 2.299500 7.0 0
696 0.9092 -0.1394 0.0994 0.000778 2.305603 7.0 0
697 0.9084 -0.1414 0.0934 -0.001694 2.297974 7.0 0
698 0.9116 -0.1416 0.0958 -0.003799 2.354431 7.0 0
699 0.9156 -0.1396 0.1022 -0.006546 2.355957 7.0 0
Now the value of BINARY is 1 if LABEL is 2, as shown below
ACCx ACCy ACCz ECG RESP LABEL BINARY
200 0.8776 -0.1030 -0.2968 -0.011673 -1.222229 2.0 1
201 0.8758 -0.1018 -0.2952 -0.001556 -1.202393 2.0 1
202 0.8760 -0.1030 -0.2918 0.022385 -1.222229 2.0 1
203 0.8786 -0.1038 -0.2950 0.049622 -1.228333 2.0 1
204 0.8798 -0.1050 -0.2930 0.084457 -1.210022 2.0 1
... ... ... ... ... ... ... ...
295 0.8756 -0.1052 -0.2694 -0.106430 -0.883484 2.0 1
296 0.8760 -0.1036 -0.2680 -0.108719 -0.880432 2.0 1
297 0.8760 -0.1056 -0.2638 -0.106750 -0.888062 2.0 1
298 0.8768 -0.1064 -0.2560 -0.099792 -0.889587 2.0 1
299 0.8792 -0.1064 -0.2510 -0.094894 -0.865173 2.0 1
I need to plot a scatter plot against the RESP values but the colour must be different for the values where binary is 1
I used the following code to plot the scatter plot
def plot_coloured(dataframe):
    """Scatter-plot the RESP column of *dataframe* against row position.

    Parameters:
        dataframe: Stress data DataFrame; must contain a "RESP" column.

    Output:
        None. A 12x6 figure is created and the scatter is drawn on it.
    """
    plt.figure(figsize=(12, 6))
    # Derive the x positions from the data instead of hard-coding 700 rows,
    # so frames of any length can be plotted.
    plt.scatter(x=range(len(dataframe)),
                y=dataframe["RESP"])
And got the following image
The image for scatterplot between resp and indices
I would like to know how I can change the colour of the points on the plot where the value of binary is 1
I have heard about the c argument in plt.scatter() but I do not know if it helps here
Use a Boolean mask to create separate dataframes based upon the desired condition, and then plot both dataframes with different colors
import pandas as pd
import matplotlib.pyplot as plt
data = {'ACCx': [0.9554, 0.9258, 0.9082, 0.8974, 0.8882, 0.9134, 0.9092, 0.9084, 0.9116, 0.9156, 0.8776, 0.8758, 0.876, 0.8786, 0.8798, 0.8756, 0.876, 0.876, 0.8768, 0.8792],
'ACCy': [-0.222, -0.2216, -0.2196, -0.2102, -0.2036, -0.14, -0.1394, -0.1414, -0.1416, -0.1396, -0.103, -0.1018, -0.103, -0.1038, -0.105, -0.1052, -0.1036, -0.1056, -0.1064, -0.1064],
'ACCz': [-0.558, -0.5538, -0.5392, -0.5122, -0.4824, 0.1074, 0.0994, 0.0934, 0.0958, 0.1022, -0.2968, -0.2952, -0.2918, -0.295, -0.293, -0.2694, -0.268, -0.2638, -0.256, -0.251],
'ECG': [0.021422999999999998, 0.020325, 0.016525, 0.016708, 0.011673000000000001, 0.003479, 0.000778, -0.0016940000000000002, -0.0037990000000000003, -0.006546, -0.011673000000000001, -0.001556, 0.022385, 0.049622, 0.084457, -0.10643, -0.10871900000000001, -0.10675, -0.09979199999999999, -0.094894],
'RESP': [-1.148987, -1.124573, -1.152039, -1.158142, -1.161194, 2.2995, 2.305603, 2.297974, 2.354431, 2.355957, -1.222229, -1.202393, -1.222229, -1.228333, -1.210022, -0.883484, -0.880432, -0.8880620000000001, -0.8895870000000001, -0.865173],
'LABEL': [0.0, 0.0, 0.0, 0.0, 0.0, 7.0, 7.0, 7.0, 7.0, 7.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0],
'BINARY': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
df = pd.DataFrame(data)
# Boolean selector: True for the rows whose BINARY flag equals 1.
is_flagged = df["BINARY"].eq(1)
flagged_rows = df.loc[is_flagged]
other_rows = df.loc[~is_flagged]
# One scatter call per subset, so each subset gets its own colour and
# legend entry; the original row index is used as the x position.
plt.figure(figsize=(12, 6))
plt.scatter(x=flagged_rows.index, y=flagged_rows["RESP"], color='g', label='BINARY=1')
plt.scatter(x=other_rows.index, y=other_rows["RESP"], label='BINARY!=1')
plt.legend()

hvplot histogram: DataError: None of the available storage backends were able to support the supplied data format

import pandas as pd
import numpy as np
import random
import copy
import feather
import plotly.graph_objects as go
import plotly.express as px
import panel as pn
import holoviews as hv
import geoviews as gv
import geoviews.feature as gf
import cartopy
import cartopy.feature as cf
from geoviews import opts
from cartopy import crs as ccrs
import hvplot.pandas # noqa
import colorcet as cc
from colorcet.plotting import swatch
hv.extension("bokeh","plotly")
I have a dataframe called test:
Out[5]:
age age_band car_ins_renew_month people_type
0 NaN NaN NaN sign_up_only
1 61.0 55-64 7.0 active_interest
2 NaN NaN NaN sign_up_only
3 55.0 55-64 8.0 previous_customer
4 NaN NaN NaN sign_up_only
... ... ... ... ...
107627 42.0 35-44 6.0 previous_customer
107628 73.0 65+ 7.0 previous_customer
107629 NaN NaN NaN sign_up_only
107630 NaN NaN NaN sign_up_only
107631 NaN NaN NaN sign_up_only
[107632 rows x 4 columns]
In [6]: test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107632 entries, 0 to 107631
Data columns (total 4 columns):
age 73289 non-null float32
age_band 73289 non-null category
car_ins_renew_month 64290 non-null float32
people_type 107632 non-null category
dtypes: category(2), float32(2)
memory usage: 1.0 MB
For the entire test dataframe, I can successfully produce histograms using hvplot:
age (with hover data for age_band):
In [7]: test.hvplot.hist(
...: y="age",by=["age_band"],
...: bins=[18,25,35,45,55,65,74],
...: xticks=[(21.5,"18-24"),(30,"25-34"),(40,"35-44"),(50,"45-54"),(60,"55-64"),(69.5,"65-74")],
...: color="teal",legend=False,
...: line_width=4,line_color="w",
...: width=650,height=280
...: )
car_ins_renew_month:
test.hvplot.hist(
...: y="car_ins_renew_month",
...: bins=[1,2,3,4,5,6,7,8,9,10,11,12,13],
...: xticks=[(1.5,"JAN"),(2.5,"FEB"),(3.5,"MAR"),(4.5,"APR"),(5.5,"MAY"),(6.5,"JUN"),(7.5,"JUL"),(8.5,"AUG"),(9.5,"SEP"),(10.5,"OCT"),(11.5,"NOV"),(12.5,"DEC")],
...: color="teal",legend=False,
...: line_width=4,line_color="w",
...: width=650,height=280
...: )
However, for the subset of test where people_type is equal to previous_customer:
In [11]: test_prev_cust = test.loc[test["people_type"]=="previous_customer"]
while I can successfully produce a histogram for the car_ins_renew_month attribute:
In [13]: test_prev_cust.hvplot.hist(
...: y="car_ins_renew_month",
...: bins=[1,2,3,4,5,6,7,8,9,10,11,12,13],
...: xticks=[(1.5,"JAN"),(2.5,"FEB"),(3.5,"MAR"),(4.5,"APR"),(5.5,"MAY"),(6.5,"JUN"),(7.5,"JUL"),(8.5,"AUG"),(9.5,"SEP"),(10.5,"OCT"),(11.5,"NOV"),(12.5,"DEC")],
...: color="teal",legend=False,
...: line_width=4,line_color="w",
...: width=650,height=280
...: )
when I try to produce a histogram for the age attribute then I get the following error:
In [14]: test_prev_cust.hvplot.hist(
...: y="age",by=["age_band"],
...: bins=[18,25,35,45,55,65,74],
...: xticks=[(21.5,"18-24"),(30,"25-34"),(40,"35-44"),(50,"45-54"),(60,"55-64"),(69.5,"65-74")],
...: color="teal",legend=False,
...: line_width=4,line_color="w",
...: width=650,height=280
...: )
---------------------------------------------------------------------------
DataError Traceback (most recent call last)
<ipython-input-100-b2108cee586d> in <module>
7 color="teal",legend=False,
8 line_width=4,line_color="w",
----> 9 width=650,height=280
10 )
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/hvplot/plotting/core.py in hist(self, y, by, **kwds)
399 The HoloViews representation of the plot.
400 """
--> 401 return self(kind='hist', x=None, y=y, by=by, **kwds)
402
403 def kde(self, y=None, by=None, **kwds):
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/hvplot/plotting/core.py in __call__(self, x, y, kind, **kwds)
70 return pn.panel(plot, **panel_dict)
71
---> 72 return self._get_converter(x, y, kind, **kwds)(kind, x, y)
73
74 def _get_converter(self, x=None, y=None, kind=None, **kwds):
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/hvplot/converter.py in __call__(self, kind, x, y)
942 obj = DynamicMap(cbcallable, streams=[self.stream])
943 else:
--> 944 obj = method(x, y)
945
946 if self.crs and self.project:
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/hvplot/converter.py in hist(self, x, y, data)
1383 if self.by:
1384 hist = hists = histogram(
-> 1385 ds.groupby(self.by), dimension=y, **hist_opts
1386 )
1387 hist = hists.last
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/param/parameterized.py in __new__(class_, *args, **params)
2810 inst = class_.instance()
2811 inst.param._set_name(class_.__name__)
-> 2812 return inst.__call__(*args,**params)
2813
2814 def __call__(self,*args,**kw):
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/holoviews/core/operation.py in __call__(self, element, **kwargs)
162 elif 'streams' not in kwargs:
163 kwargs['streams'] = self.p.streams
--> 164 return element.apply(self, **kwargs)
165
166
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/holoviews/core/accessors.py in __call__(self, function, streams, link_inputs, dynamic, **kwargs)
113 for k, v in self._obj.data.items():
114 new_val = v.apply(function, dynamic=dynamic, streams=streams,
--> 115 link_inputs=link_inputs, **kwargs)
116 if new_val is not None:
117 mapped.append((k, new_val))
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/holoviews/core/accessors.py in __call__(self, function, streams, link_inputs, dynamic, **kwargs)
108 if hasattr(function, 'dynamic'):
109 inner_kwargs['dynamic'] = False
--> 110 return function(self._obj, **inner_kwargs)
111 elif self._obj._deep_indexable:
112 mapped = []
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/holoviews/core/operation.py in __call__(self, element, **kwargs)
159 for k, el in element.items()])
160 elif isinstance(element, ViewableElement):
--> 161 return self._apply(element)
162 elif 'streams' not in kwargs:
163 kwargs['streams'] = self.p.streams
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/holoviews/core/operation.py in _apply(self, element, key)
119 for hook in self._preprocess_hooks:
120 kwargs.update(hook(self, element))
--> 121 ret = self._process(element, key)
122 for hook in self._postprocess_hooks:
123 ret = hook(self, ret, **kwargs)
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/holoviews/operation/element.py in _process(self, element, key)
657 hist *= edges[1]-edges[0]
658 return Histogram((edges, hist), kdims=[element.get_dimension(selected_dim)],
--> 659 label=element.label, **params)
660
661
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/holoviews/element/chart.py in __init__(self, data, edges, **params)
196 elif isinstance(data, tuple) and len(data) == 2 and len(data[0])+1 == len(data[1]):
197 data = data[::-1]
--> 198 super(Histogram, self).__init__(data, **params)
199
200
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/holoviews/core/data/__init__.py in __init__(self, data, kdims, vdims, **kwargs)
209 validate_vdims = kwargs.pop('_validate_vdims', True)
210 initialized = Interface.initialize(type(self), data, kdims, vdims,
--> 211 datatype=kwargs.get('datatype'))
212 (data, self.interface, dims, extra_kws) = initialized
213 super(Dataset, self).__init__(data, **dict(kwargs, **dict(dims, **extra_kws)))
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/holoviews/core/data/interface.py in initialize(cls, eltype, data, kdims, vdims, datatype)
252 % (intfc.__name__, e))
253 error = ' '.join([error, priority_error])
--> 254 raise DataError(error)
255
256 return data, interface, dims, extra_kws
DataError: None of the available storage backends were able to support the supplied data format.
I know that I can successfully produce histograms using hvplot for subsets of my test dataframe for both the car_ins_renew_month and age attributes as I was able to do this for people_type is equal to active_interest. I just can't for people_type is equal to previous_customer.
One thing that I did notice for my test_prev_cust dataframe was that there are no people in 2 of the categories for age_band:
In [18]: test_prev_cust["age_band"].value_counts()
Out[18]:
45-54 13457
55-64 10369
35-44 8760
65+ 7801
25-34 0
18-24 0
Name: age_band, dtype: int64
Could this be the cause of my issue? If so then is there a way to work around this and still include age_band as a hover data on my plot?
Thanks
Software versions:
bokeh 1.4.0 py37_0
cartopy 0.17.0 py37haea56ea_1
colorcet 2.0.2 py_0 pyviz
feather-format 0.4.0 py_1003 conda-forge
geoviews 1.6.5 py_0 pyviz
holoviews 1.12.6 py_0 pyviz
hvplot 0.5.2 py_0 pyviz
jupyter 1.0.0 py37_7
matplotlib 3.1.1 py37h54f8f79_0
notebook 6.0.2 py37_0
numpy 1.17.3 py37h4174a10_0
pandas 0.25.3 py37h0a44026_0
panel 0.7.0 py_0 pyviz
plotly 4.3.0 py_0 plotly
plotly_express 0.4.1 py_0 plotly
python 3.7.5 h359304d_0
seaborn 0.9.0 pyh91ea838_1
I'm on os x Catalina, using latest version of Firefox and I am working in a Jupyter notebook.
The problem is caused by your variable age_band being categorical, having 0 counts for some of the categories, and being used with the keyword by=['age_band'].
You could try converting age_band to a string, but in this case creating a barplot is nicer I think:
# Size of the 'age' values in each age_band group, with missing results
# replaced by 0, drawn as a bar chart.
rows_per_band = df.groupby(['age_band']).agg(count=('age', np.size))
age_band_group = rows_per_band.fillna(0)
age_band_group.hvplot.bar(color='teal')

Create linear model to check correlation tokenize error

I have data like the sample below, which has 4 continuous columns [x0 to x3] and a binary column y. y has two values 1.0 and 0.0. I’m trying to check for correlation between the binary column y and one of the continuous columns x0, using the CatConCor function below, but I’m getting the error message below. The function creates a linear regression model and calcs the p value for the residuals with and without the categorical variable. If anyone can please point out the issue or how to fix it, it would be very much appreciated.
Data:
x_r x0 x1 x2 x3 y
0 0 0.466726 0.030126 0.998330 0.892770 0.0
1 1 0.173168 0.525810 -0.079341 -0.112151 0.0
2 2 -0.854467 0.770712 0.929614 -0.224779 0.0
3 3 -0.370574 0.568183 -0.928269 0.843253 0.0
4 4 -0.659431 -0.948491 -0.091534 0.706157 0.0
Code:
import numpy as np
import pandas as pd
from time import time
import scipy.stats as stats
from IPython.display import display # Allows the use of display() for DataFrames
# Pretty display for notebooks
%matplotlib inline
###########################################
# Suppress matplotlib user warnings
# Necessary for newer version of matplotlib
import warnings
warnings.filterwarnings("ignore", category = UserWarning, module = "matplotlib")
#
# Display inline matplotlib plots with IPython
from IPython import get_ipython
get_ipython().run_line_magic('matplotlib', 'inline')
###########################################
import matplotlib.pyplot as plt
import matplotlib.cm as cm
# correlation between categorical variable and continuous variable
def CatConCor(df,catVar,conVar):
    """Report whether a categorical and a continuous column are correlated.

    Fits an ordinary-least-squares model ``conVar ~ catVar`` on the two
    columns, runs a type-2 ANOVA on the fit, and prints ``Correlated`` when
    the p-value in the first row of the ANOVA table is below 0.05,
    ``Uncorrelated`` otherwise.

    Parameters:
        df: DataFrame holding both columns; it is not modified.
        catVar: name of the categorical column (cast to ``category``).
        conVar: name of the continuous column.

    Returns:
        None; the verdict and the p-value are printed.
    """
    import statsmodels.api as sm
    from statsmodels.formula.api import ols
    # subsetting data for one categorical column and one continuous column;
    # copying after the selection copies only the two columns needed and
    # gives an independent frame that is safe to assign into.
    data2 = df[[catVar, conVar]].copy()
    data2[catVar] = data2[catVar].astype('category')
    mod = ols(conVar + '~' + catVar, data=data2).fit()
    aov_table = sm.stats.anova_lm(mod, typ=2)
    # Explicit positional access: a bare [0] on this column relies on the
    # deprecated integer fallback of Series indexing.
    p_value = aov_table['PR(>F)'].iloc[0]
    if p_value < 0.05:
        print('Correlated p='+str(p_value))
    else:
        print('Uncorrelated p='+str(p_value))
# checking for correlation between categorical and continuous variables
CatConCor(df=train_df,catVar='y',conVar='x0')
Error:
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
<ipython-input-6-80f83b8c8e14> in <module>()
1 # checking for correlation between categorical and continuous variables
2
----> 3 CatConCor(df=train_df,catVar='y',conVar='x0')
<ipython-input-2-35404ba1d697> in CatConCor(df, catVar, conVar)
103
104 mod = ols(conVar+'~'+catVar,
--> 105 data=data2).fit()
106
107 aov_table = sm.stats.anova_lm(mod, typ=2)
~/anaconda2/envs/py36/lib/python3.6/site-packages/statsmodels/base/model.py in from_formula(cls, formula, data, subset, drop_cols, *args, **kwargs)
153
154 tmp = handle_formula_data(data, None, formula, depth=eval_env,
--> 155 missing=missing)
156 ((endog, exog), missing_idx, design_info) = tmp
157
~/anaconda2/envs/py36/lib/python3.6/site-packages/statsmodels/formula/formulatools.py in handle_formula_data(Y, X, formula, depth, missing)
63 if data_util._is_using_pandas(Y, None):
64 result = dmatrices(formula, Y, depth, return_type='dataframe',
---> 65 NA_action=na_action)
66 else:
67 result = dmatrices(formula, Y, depth, return_type='dataframe',
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/highlevel.py in dmatrices(formula_like, data, eval_env, NA_action, return_type)
308 eval_env = EvalEnvironment.capture(eval_env, reference=1)
309 (lhs, rhs) = _do_highlevel_design(formula_like, data, eval_env,
--> 310 NA_action, return_type)
311 if lhs.shape[1] == 0:
312 raise PatsyError("model is missing required outcome variables")
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/highlevel.py in _do_highlevel_design(formula_like, data, eval_env, NA_action, return_type)
163 return iter([data])
164 design_infos = _try_incr_builders(formula_like, data_iter_maker, eval_env,
--> 165 NA_action)
166 if design_infos is not None:
167 return build_design_matrices(design_infos, data,
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/highlevel.py in _try_incr_builders(formula_like, data_iter_maker, eval_env, NA_action)
60 "ascii-only, or else upgrade to Python 3.")
61 if isinstance(formula_like, str):
---> 62 formula_like = ModelDesc.from_formula(formula_like)
63 # fallthrough
64 if isinstance(formula_like, ModelDesc):
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/desc.py in from_formula(cls, tree_or_string)
162 tree = tree_or_string
163 else:
--> 164 tree = parse_formula(tree_or_string)
165 value = Evaluator().eval(tree, require_evalexpr=False)
166 assert isinstance(value, cls)
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/parse_formula.py in parse_formula(code, extra_operators)
146 tree = infix_parse(_tokenize_formula(code, operator_strings),
147 operators,
--> 148 _atomic_token_types)
149 if not isinstance(tree, ParseNode) or tree.type != "~":
150 tree = ParseNode("~", None, [tree], tree.origin)
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/infix_parser.py in infix_parse(tokens, operators, atomic_types, trace)
208
209 want_noun = True
--> 210 for token in token_source:
211 if c.trace:
212 print("Reading next token (want_noun=%r)" % (want_noun,))
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/parse_formula.py in _tokenize_formula(code, operator_strings)
92 else:
93 it.push_back((pytype, token_string, origin))
---> 94 yield _read_python_expr(it, end_tokens)
95
96 def test__tokenize_formula():
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/parse_formula.py in _read_python_expr(it, end_tokens)
42 origins = []
43 bracket_level = 0
---> 44 for pytype, token_string, origin in it:
45 assert bracket_level >= 0
46 if bracket_level == 0 and token_string in end_tokens:
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/util.py in next(self)
330 else:
331 # May raise StopIteration
--> 332 return six.advance_iterator(self._it)
333 __next__ = next
334
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/tokens.py in python_tokenize(code)
33 break
34 origin = Origin(code, start, end)
---> 35 assert pytype not in (tokenize.NL, tokenize.NEWLINE)
36 if pytype == tokenize.ERRORTOKEN:
37 raise PatsyError("error tokenizing input "
AssertionError:
Upgrading patsy to 0.5.1 fixed the issue. I found the tip here:
https://github.com/statsmodels/statsmodels/issues/5343

Resources