Error: Format: "svg" not recognized. Use one of:

Hello, I am trying to create a decision tree from my CSV data sheet. To make the tree visible, I installed the Graphviz package for both Anaconda and Python with the following commands:
conda install graphviz
pip install graphviz
Here is the code I wrote in a Jupyter notebook:
import numpy as np
import pandas as pd
import graphviz
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split

file = 'automotive_data.csv'
COLS = np.arange(0, 22, 1).tolist()  # usecols later offers another way to address the columns
data = pd.read_csv(file, header=0, sep=",", index_col=0, usecols=COLS)
x = data.iloc[:, 1:]
x = x.to_numpy()
y = data[['Ausfall']]
y
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=0.3, random_state=1)
model = DecisionTreeClassifier(
    criterion='entropy',
    splitter='best',
    min_samples_split=0.3,
    max_features=10,
    max_depth=None
)
# Then the model is fitted
model.fit(xTrain, yTrain)
dot = export_graphviz(model, out_file=None, filled=True,
                      feature_names=data.columns[1:24],
                      class_names=['ja', 'nein'])
# Create the Graphviz graph from the dot source
graph = graphviz.Source(dot)
graph  # Here I get an error
On the last line I get the error:
Format: "svg" not recognized. Use one of:
CalledProcessError Traceback (most recent call last)
~\anaconda3\lib\site-packages\IPython\core\formatters.py in __call__(self, obj)
343 method = get_real_method(obj, self.print_method)
344 if method is not None:
--> 345 return method()
346 return None
347 else:
~\anaconda3\lib\site-packages\graphviz\files.py in _repr_svg_(self)
111
112 def _repr_svg_(self):
--> 113 return self.pipe(format='svg').decode(self._encoding)
114
115 def pipe(self, format=None, renderer=None, formatter=None, quiet=False):
~\anaconda3\lib\site-packages\graphviz\files.py in pipe(self, format, renderer, formatter, quiet)
136 out = backend.pipe(self._engine, format, data,
137 renderer=renderer, formatter=formatter,
--> 138 quiet=quiet)
139
140 return out
~\anaconda3\lib\site-packages\graphviz\backend.py in pipe(engine, format, data, renderer, formatter, quiet)
242 """
243 cmd, _ = command(engine, format, None, renderer, formatter)
--> 244 out, _ = run(cmd, input=data, capture_output=True, check=True, quiet=quiet)
245 return out
246
~\anaconda3\lib\site-packages\graphviz\backend.py in run(cmd, input, capture_output, check, encoding, quiet, **kwargs)
182 if check and proc.returncode:
183 raise CalledProcessError(proc.returncode, cmd,
--> 184 output=out, stderr=err)
185
186 return out, err
CalledProcessError: Command '['dot', '-Tsvg']' returned non-zero exit status 1. [stderr: b'Format: "svg" not recognized. Use one of:\r\n']
I also tried PNG as the format, but that did not work either. I have no idea how to solve this problem.

So apparently the issue is that you have to configure the Graphviz plugins first.
Open a terminal in administrator mode and run dot -c. (This assumes the Graphviz binaries are on your PATH.)
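As a quick check that the registration worked (a minimal sketch, assuming dot is on your PATH), you can pipe a trivial graph through dot from Python:
import subprocess
# Render a throwaway graph; exit code 0 means the SVG plugin is registered.
proc = subprocess.run(['dot', '-Tsvg'], input=b'digraph { a -> b }', capture_output=True)
print(proc.returncode)       # 0 once `dot -c` has configured the plugins
print(proc.stderr.decode())  # otherwise the 'Format: "svg" not recognized' message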

I had the same problem, I solved it by installing version 2.38 instead of 2.44
https://www2.graphviz.org/Packages/stable/windows/10/msbuild/Release/Win32/

Related

sklearn v1.2.1: ValueError: custom transformer class

I have scikit-learn 1.2.1 installed and I would like to create custom transformer classes to use with a ColumnTransformer instance. The problem is that when I set the output to a pandas DataFrame, I get the error message below. But first, the code:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

class TestClass(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X, y)

    def transform(self, X, y=None):
        return X

ct = ColumnTransformer(transformers=[('simple_imputer', SimpleImputer(strategy='most_frequent'), ['paymentmethod']),
                                     ('test_class', TestClass(), ['dependents', 'seniorcitizen', 'partner'])])
ct.set_output(transform='pandas')  # I really need 'result' as a pandas DataFrame
result = ct.fit_transform(X_train, y_train)
Error message:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[22], line 3
1 ct = ColumnTransformer(transformers=[('simple_imputer', SimpleImputer(strategy='most_frequent'), ['paymentmethod']),
2 ('test_class', TestClass(), ['dependents', 'seniorcitizen', 'partner'])])
----> 3 ct.set_output(transform='pandas')
4 result = ct.fit_transform(X_train, y_train)
File ~/.local/lib/python3.10/site-packages/sklearn/compose/_column_transformer.py:287, in ColumnTransformer.set_output(self, transform)
279 transformers = (
280 trans
281 for _, trans, _ in chain(
(...)
284 if trans not in {"passthrough", "drop"}
285 )
286 for trans in transformers:
--> 287 _safe_set_output(trans, transform=transform)
289 return self
File ~/.local/lib/python3.10/site-packages/sklearn/utils/_set_output.py:275, in _safe_set_output(estimator, transform)
272 return
274 if not hasattr(estimator, "set_output"):
--> 275 raise ValueError(
276 f"Unable to configure output for {estimator} because `set_output` "
277 "is not available."
278 )
279 return estimator.set_output(transform=transform)
ValueError: Unable to configure output for TestClass() because `set_output` is not available.
How can I fix this?
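This excerpt does not include an answer, but here is a hedged sketch: in scikit-learn 1.2, set_output is only exposed on transformers that define get_feature_names_out, so giving TestClass that method should make ct.set_output(transform='pandas') work:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class TestClass(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        # X arrives as a DataFrame here, because ColumnTransformer selects columns by name.
        self.feature_names_in_ = np.asarray(X.columns, dtype=object)
        return self

    def transform(self, X, y=None):
        return X

    def get_feature_names_out(self, input_features=None):
        # Pass-through transformer: the output columns equal the input columns.
        return self.feature_names_in_
The explicit fit_transform override can be dropped; TransformerMixin already supplies one.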

Specifying the columns using strings is only supported for pandas DataFrames

I want to one-hot encode several columns and have tried several approaches, including a plain OneHotEncoder, ColumnTransformer, make_column_transformer, Pipeline, and get_dummies, but every time I get a different error.
x = dataset.iloc[:, :11].values
y = dataset.iloc[:, 11].values

""" data encoding """
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# oe = OrdinalEncoder()
# x = oe.fit_transform(x)
non_cat = ["Make", "Model", "Vehicle", "Transmission", "Fuel"]
onehot_cat = ColumnTransformer([
    ("categorical", OrdinalEncoder(), non_cat),
    ("onehot_categorical", OneHotEncoder(), non_cat)],
    remainder="passthrough")
x = onehot_cat.fit_transform(x)
error:
[['ACURA' 'ILX' 'COMPACT' ... 6.7 8.5 33]
['ACURA' 'ILX' 'COMPACT' ... 7.7 9.6 29]
['ACURA' 'ILX HYBRID' 'COMPACT' ... 5.8 5.9 48]
...
['VOLVO' 'XC60 T6 AWD' 'SUV - SMALL' ... 8.6 10.3 27]
['VOLVO' 'XC90 T5 AWD' 'SUV - STANDARD' ... 8.3 9.9 29]
['VOLVO' 'XC90 T6 AWD' 'SUV - STANDARD' ... 8.7 10.7 26]]
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\sklearn\utils\__init__.py in _get_column_indices(X, key)
424 try:
--> 425 all_columns = X.columns
426 except AttributeError:
AttributeError: 'numpy.ndarray' object has no attribute 'columns'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-4-4008371c305f> in <module>
24 ("onehot_categorical", OneHotEncoder(), non_cat)],
25 remainder= "passthrough")
---> 26 x = onehot_cat.fit_transform(x)
27
28 print('OneHotEncode = ', x.shape)
~\Anaconda3\lib\site-packages\sklearn\compose\_column_transformer.py in fit_transform(self, X, y)
527 self._validate_transformers()
528 self._validate_column_callables(X)
--> 529 self._validate_remainder(X)
530
531 result = self._fit_transform(X, y, _fit_transform_one)
~\Anaconda3\lib\site-packages\sklearn\compose\_column_transformer.py in _validate_remainder(self, X)
325 cols = []
326 for columns in self._columns:
--> 327 cols.extend(_get_column_indices(X, columns))
328
329 remaining_idx = sorted(set(range(self._n_features)) - set(cols))
~\Anaconda3\lib\site-packages\sklearn\utils\__init__.py in _get_column_indices(X, key)
425 all_columns = X.columns
426 except AttributeError:
--> 427 raise ValueError("Specifying the columns using strings is only "
428 "supported for pandas DataFrames")
429 if isinstance(key, str):
ValueError: Specifying the columns using strings is only supported for pandas DataFrames
I got a similar error when trying to make a prediction with a model. It was expecting a DataFrame but I was passing a NumPy array instead. So I changed:
prediction = monitor_model.predict(s_df.to_numpy())
to:
prediction = monitor_model.predict(s_df)
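Applied to the question above, the same fix (a minimal sketch) is to keep the feature slice as a DataFrame, so the string column names in non_cat can be resolved:
# Keep x as a DataFrame: string column selectors only work on DataFrames.
x = dataset.iloc[:, :11]   # no .values here
y = dataset.iloc[:, 11]
x = onehot_cat.fit_transform(x)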

How to fix this ValueError?

I am trying to run Python code, mostly based on the NLTK book, for n-gram POS tagging of a Gujarati-language text from my GujaratiTextCorpus. I encountered a ValueError.
I am working with Python 3.7.3 on Windows 10 and use Jupyter Notebook through Anaconda. I am a beginner with Python. I studied the answers available on stackoverflow.com to fix my ValueError, but could not solve it.
import nltk
f = open('C:\\Users\\BHOGAYATA\\Documents\\GujaratiPosTagging\\cts260.txt', encoding = 'utf8')
raw = f.read()
train2_sents = nltk.sent_tokenize(raw)
text2 = nltk.Text(train2_sents)
train2_sents
import nltk
f = open('C:\\Users\\BHOGAYATA\\Documents\\GujaratiPosTagging\\txt42_sents.txt', encoding = 'utf8')
raw = f.read()
bs_sents = nltk.sent_tokenize(raw)
text3 = nltk.Text(bs_sents)
bs_sents
unigram_tagger = nltk.UnigramTagger(train2_sents)
unigram_tagger.tag(bs_sents)
I expected the words of the two Gujarati sentences to be POS-tagged. Instead I got the following error:
ValueError Traceback (most recent call last)
<ipython-input-3-5fae0b92393e> in <module>
11 text3 = nltk.Text(bs_sents)
12 bs_sents
---> 13 unigram_tagger = nltk.UnigramTagger(train2_sents)
14 unigram_tagger.tag(bs_sents)
15
~\Anaconda3\lib\site-packages\nltk\tag\sequential.py in __init__(self, train, model, backoff, cutoff, verbose)
344
345 def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False):
--> 346 NgramTagger.__init__(self, 1, train, model, backoff, cutoff, verbose)
347
348 def encode_json_obj(self):
~\Anaconda3\lib\site-packages\nltk\tag\sequential.py in __init__(self, n, train, model, backoff, cutoff, verbose)
293
294 if train:
--> 295 self._train(train, cutoff, verbose)
296
297 def encode_json_obj(self):
~\Anaconda3\lib\site-packages\nltk\tag\sequential.py in _train(self, tagged_corpus, cutoff, verbose)
181 fd = ConditionalFreqDist()
182 for sentence in tagged_corpus:
--> 183 tokens, tags = zip(*sentence)
184 for index, (token, tag) in enumerate(sentence):
185 # Record the event.
ValueError: not enough values to unpack (expected 2, got 1)
It says the value you are iterating over yields one item to unpack, but you are expecting two.
For example, this works:
for a, b in [("a", "b")]:
    print("a:", a, "b:", b)
This does not:
for a, b in [("a",)]:
    print("a:", a, "b:", b)
Edit:
Look at UnigramTagger. Its first argument must be a list of tagged sentences of type
list(list(tuple(str, str)))
but train2_sents, coming straight from sent_tokenize, is a list of plain sentence strings (list(str)), with no (word, tag) pairs to unpack.
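For illustration, a minimal (hypothetical English) example of the shape UnigramTagger expects:
import nltk
# Training data: a list of sentences, each a list of (token, tag) pairs.
train_sents = [[('I', 'PRON'), ('run', 'VERB')],
               [('You', 'PRON'), ('walk', 'VERB')]]
unigram_tagger = nltk.UnigramTagger(train_sents)
# tag() takes a list of tokens, not a raw sentence string.
print(unigram_tagger.tag(['I', 'walk']))  # [('I', 'PRON'), ('walk', 'VERB')]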

error: unpack requires a buffer of 16 bytes

The following code gives me an error:
"error: unpack requires a buffer of 16 bytes"
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import cartopy
import cartopy.crs as ccrs
from cartopy.io.shapereader import Reader

# Read the data from the .shp shapefile
basin = Reader('C:\\...\\BasinCOL2014.shp')

fig = plt.figure(figsize=(10, 5))
ax = fig.add_subplot(1, 1, 1, projection=ccrs.PlateCarree(central_longitude=0, globe=None))
ax.set_extent([-66.0, -80.0, -5.0, 13.0])
ax.gridlines(draw_labels=True)

# Additional elements to display on the map
ax.coastlines(resolution='10m')
ax.add_feature(cartopy.feature.RIVERS, linewidth=4)
ax.add_geometries(basin.geometries(), crs=ccrs.Geodetic(), edgecolor='t', facecolor='none')
I expected the output to be a map with all three elements (coastlines, rivers, and the basin), but I only get two of them (see image below).
(Image: the partial result produced by the code above.)
Currently installed via Anaconda Navigator 1.9.7:
- Jupyter Notebook 5.7.8
- Cartopy 0.17.0
---------------------------------------------------------------------------
error Traceback (most recent call last)
<ipython-input-6-628330e4110c> in <module>
12 ax.coastlines (resolution = '10m')
13 ax.add_feature (cartopy.feature.RIVERS, linewidth=4)
---> 14 ax.add_geometries (basin.geometries(), crs = ccrs.Geodetic(), edgecolor = 't', facecolor = 'none')
C:\ProgramData\Anaconda3\lib\site-packages\cartopy\mpl\geoaxes.py in add_geometries(self, geoms, crs, **kwargs)
586 """
587 styler = kwargs.pop('styler', None)
--> 588 feature = cartopy.feature.ShapelyFeature(geoms, crs, **kwargs)
589 return self.add_feature(feature, styler=styler)
590
C:\ProgramData\Anaconda3\lib\site-packages\cartopy\feature\__init__.py in __init__(self, geometries, crs, **kwargs)
229 """
230 super(ShapelyFeature, self).__init__(crs, **kwargs)
--> 231 self._geoms = tuple(geometries)
232
233 def geometries(self):
C:\ProgramData\Anaconda3\lib\site-packages\cartopy\io\shapereader.py in geometries(self)
234 geometry_factory = self._geometry_factory
235 for i in range(self._reader.numRecords):
--> 236 shape = self._reader.shape(i)
237 yield _make_geometry(geometry_factory, shape)
238
C:\ProgramData\Anaconda3\lib\site-packages\shapefile.py in shape(self, i)
811 return k
812 shp.seek(offset)
--> 813 return self.__shape()
814
815 def shapes(self):
C:\ProgramData\Anaconda3\lib\site-packages\shapefile.py in __shape(self)
749 # Read m extremes and values
750 if shapeType in (13,15,18,23,25,28,31):
--> 751 (mmin, mmax) = unpack("<2d", f.read(16))
752 # Measure values less than -10e38 are nodata values according to the spec
753 record.m = []
error: unpack requires a buffer of 16 bytes
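No answer is recorded in this excerpt. As a hedged diagnostic sketch, you can read the shapefile directly with pyshp (the shapefile module at the bottom of the traceback) to isolate the record that fails to parse; a 16-byte unpack failure in the M-value block usually points at a truncated or malformed .shp record:
import shapefile  # pyshp, the library raising the error above
r = shapefile.Reader('C:\\...\\BasinCOL2014.shp')
for i in range(r.numRecords):
    try:
        r.shape(i)
    except Exception as exc:
        print('record', i, 'failed:', exc)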

Create linear model to check correlation: tokenize error

I have data like the sample below, with four continuous columns [x0 to x3] and a binary column y taking the values 1.0 and 0.0. I am trying to check for correlation between the binary column y and the continuous column x0 using the CatConCor function below, but I get the error shown underneath. The function fits a linear regression model and computes the p-value for the residuals with and without the categorical variable. If anyone can point out the issue or how to fix it, it would be much appreciated.
Data:
x_r x0 x1 x2 x3 y
0 0 0.466726 0.030126 0.998330 0.892770 0.0
1 1 0.173168 0.525810 -0.079341 -0.112151 0.0
2 2 -0.854467 0.770712 0.929614 -0.224779 0.0
3 3 -0.370574 0.568183 -0.928269 0.843253 0.0
4 4 -0.659431 -0.948491 -0.091534 0.706157 0.0
Code:
import numpy as np
import pandas as pd
from time import time
import scipy.stats as stats
from IPython.display import display # Allows the use of display() for DataFrames
# Pretty display for notebooks
%matplotlib inline
###########################################
# Suppress matplotlib user warnings
# Necessary for newer version of matplotlib
import warnings
warnings.filterwarnings("ignore", category = UserWarning, module = "matplotlib")
#
# Display inline matplotlib plots with IPython
from IPython import get_ipython
get_ipython().run_line_magic('matplotlib', 'inline')
###########################################
import matplotlib.pyplot as plt
import matplotlib.cm as cm
# correlation between a categorical variable and a continuous variable
def CatConCor(df, catVar, conVar):
    import statsmodels.api as sm
    from statsmodels.formula.api import ols
    # subset the data to one categorical column and one continuous column
    data2 = df.copy()[[catVar, conVar]]
    data2[catVar] = data2[catVar].astype('category')
    mod = ols(conVar + '~' + catVar, data=data2).fit()
    aov_table = sm.stats.anova_lm(mod, typ=2)
    if aov_table['PR(>F)'][0] < 0.05:
        print('Correlated p=' + str(aov_table['PR(>F)'][0]))
    else:
        print('Uncorrelated p=' + str(aov_table['PR(>F)'][0]))

# checking for correlation between categorical and continuous variables
CatConCor(df=train_df, catVar='y', conVar='x0')
Error:
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
<ipython-input-6-80f83b8c8e14> in <module>()
1 # checking for correlation between categorical and continuous variables
2
----> 3 CatConCor(df=train_df,catVar='y',conVar='x0')
<ipython-input-2-35404ba1d697> in CatConCor(df, catVar, conVar)
103
104 mod = ols(conVar+'~'+catVar,
--> 105 data=data2).fit()
106
107 aov_table = sm.stats.anova_lm(mod, typ=2)
~/anaconda2/envs/py36/lib/python3.6/site-packages/statsmodels/base/model.py in from_formula(cls, formula, data, subset, drop_cols, *args, **kwargs)
153
154 tmp = handle_formula_data(data, None, formula, depth=eval_env,
--> 155 missing=missing)
156 ((endog, exog), missing_idx, design_info) = tmp
157
~/anaconda2/envs/py36/lib/python3.6/site-packages/statsmodels/formula/formulatools.py in handle_formula_data(Y, X, formula, depth, missing)
63 if data_util._is_using_pandas(Y, None):
64 result = dmatrices(formula, Y, depth, return_type='dataframe',
---> 65 NA_action=na_action)
66 else:
67 result = dmatrices(formula, Y, depth, return_type='dataframe',
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/highlevel.py in dmatrices(formula_like, data, eval_env, NA_action, return_type)
308 eval_env = EvalEnvironment.capture(eval_env, reference=1)
309 (lhs, rhs) = _do_highlevel_design(formula_like, data, eval_env,
--> 310 NA_action, return_type)
311 if lhs.shape[1] == 0:
312 raise PatsyError("model is missing required outcome variables")
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/highlevel.py in _do_highlevel_design(formula_like, data, eval_env, NA_action, return_type)
163 return iter([data])
164 design_infos = _try_incr_builders(formula_like, data_iter_maker, eval_env,
--> 165 NA_action)
166 if design_infos is not None:
167 return build_design_matrices(design_infos, data,
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/highlevel.py in _try_incr_builders(formula_like, data_iter_maker, eval_env, NA_action)
60 "ascii-only, or else upgrade to Python 3.")
61 if isinstance(formula_like, str):
---> 62 formula_like = ModelDesc.from_formula(formula_like)
63 # fallthrough
64 if isinstance(formula_like, ModelDesc):
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/desc.py in from_formula(cls, tree_or_string)
162 tree = tree_or_string
163 else:
--> 164 tree = parse_formula(tree_or_string)
165 value = Evaluator().eval(tree, require_evalexpr=False)
166 assert isinstance(value, cls)
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/parse_formula.py in parse_formula(code, extra_operators)
146 tree = infix_parse(_tokenize_formula(code, operator_strings),
147 operators,
--> 148 _atomic_token_types)
149 if not isinstance(tree, ParseNode) or tree.type != "~":
150 tree = ParseNode("~", None, [tree], tree.origin)
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/infix_parser.py in infix_parse(tokens, operators, atomic_types, trace)
208
209 want_noun = True
--> 210 for token in token_source:
211 if c.trace:
212 print("Reading next token (want_noun=%r)" % (want_noun,))
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/parse_formula.py in _tokenize_formula(code, operator_strings)
92 else:
93 it.push_back((pytype, token_string, origin))
---> 94 yield _read_python_expr(it, end_tokens)
95
96 def test__tokenize_formula():
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/parse_formula.py in _read_python_expr(it, end_tokens)
42 origins = []
43 bracket_level = 0
---> 44 for pytype, token_string, origin in it:
45 assert bracket_level >= 0
46 if bracket_level == 0 and token_string in end_tokens:
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/util.py in next(self)
330 else:
331 # May raise StopIteration
--> 332 return six.advance_iterator(self._it)
333 __next__ = next
334
~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/tokens.py in python_tokenize(code)
33 break
34 origin = Origin(code, start, end)
---> 35 assert pytype not in (tokenize.NL, tokenize.NEWLINE)
36 if pytype == tokenize.ERRORTOKEN:
37 raise PatsyError("error tokenizing input "
AssertionError:
Upgrading patsy to 0.5.1 fixed the issue. I found the tip here:
https://github.com/statsmodels/statsmodels/issues/5343
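For reference, the upgrade itself is a one-liner in the environment the notebook runs in:
pip install --upgrade patsy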
