AttributeError: lower not found in sklearn - python-3.x

I'm trying to build a news classifier using sklearn. I managed to generate the models, but when I try to predict the category of a new text I get this traceback:
AttributeError Traceback (most recent call last) Cell In [131], line 3
1 if isinstance(new_text, str):
2 new_text_tfidf = tfidf_vectorizer.transform([new_text])
----> 3 predicted_category = dt.predict(new_text_tfidf)[0]
4 else:
5 predicted_category = "Invalid input, please provide a string"
File ~\AppData\Roaming\Python\Python39\site-packages\sklearn\pipeline.py:457, in Pipeline.predict(self, X, **predict_params)
455 Xt = X
456 for _, name, transform in self._iter(with_final=False):
--> 457 Xt = transform.transform(Xt)
458 return self.steps[-1][1].predict(Xt, **predict_params)
File ~\AppData\Roaming\Python\Python39\site-packages\sklearn\feature_extraction\text.py:2103, in TfidfVectorizer.transform(self, raw_documents) 2086 """Transform documents to document-term matrix. 2087 2088 Uses the vocabulary and document frequencies (df) learned by fit (or (...) 2099 Tf-idf-weighted document-term matrix. 2100 """ 2101 check_is_fitted(self, msg="The TF-IDF vectorizer is not fitted")
-> 2103 X = super().transform(raw_documents) 2104 return self._tfidf.transform(X, copy=False)
File ~\AppData\Roaming\Python\Python39\site-packages\sklearn\feature_extraction\text.py:1387, in CountVectorizer.transform(self, raw_documents) 1384 self._check_vocabulary() 1386 # use the same matrix-building strategy as fit_transform
-> 1387 _, X = self._count_vocab(raw_documents, fixed_vocab=True) 1388 if self.binary: 1389 X.data.fill(1)
File ~\AppData\Roaming\Python\Python39\site-packages\sklearn\feature_extraction\text.py:1209, in CountVectorizer._count_vocab(self, raw_documents, fixed_vocab) 1207 for doc in raw_documents: 1208 feature_counter = {}
-> 1209 for feature in analyze(doc): 1210 try: 1211 feature_idx = vocabulary[feature]
File ~\AppData\Roaming\Python\Python39\site-packages\sklearn\feature_extraction\text.py:111, in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
109 else:
110 if preprocessor is not None:
--> 111 doc = preprocessor(doc)
112 if tokenizer is not None:
113 doc = tokenizer(doc)
File ~\AppData\Roaming\Python\Python39\site-packages\sklearn\feature_extraction\text.py:69, in _preprocess(doc, accent_function, lower)
50 """Chain together an optional series of text preprocessing steps to
51 apply to a document.
52 (...)
66 preprocessed string
67 """
68 if lower:
---> 69 doc = doc.lower()
70 if accent_function is not None:
71 doc = accent_function(doc)
File ~\AppData\Roaming\Python\Python39\site-packages\scipy\sparse\_base.py:771, in spmatrix.__getattr__(self, attr)
769 return self.getnnz()
770 else:
--> 771 raise AttributeError(attr + " not found")
AttributeError: lower not found
Below are some pieces of code from my notebook:
preprocess_text(s) method:
def preprocess_text(s):
    """Clean a pandas Series of raw texts using the hero package.

    Steps, in order: fill missing values, strip ``<br/>`` tags and curly
    quotes, lowercase, remove digits, punctuation, diacritics and surplus
    whitespace, then map any remaining E/C accents to plain letters.

    Args:
        s: pandas Series of texts (assumes ``hero`` is texthero, whose
            helpers operate on Series -- TODO confirm).

    Returns:
        The cleaned Series.
    """
    # Fill missing values first so the ``.str`` accessor below only sees
    # strings (presumably hero.fillna replaces NaN with "" -- verify).
    s = hero.fillna(s)

    # ``Series.replace("x", "")`` only swaps cells whose *entire* value is
    # "x"; ``.str.replace`` is needed to remove substrings inside each text.
    for fragment in ("<br/>", "’", "‘"):
        s = s.str.replace(fragment, "", regex=False)

    cleaning_steps = (
        hero.lowercase,
        hero.remove_digits,
        hero.remove_punctuation,
        hero.remove_diacritics,
        hero.remove_whitespace,
    )
    for step in cleaning_steps:
        s = step(s)

    # Safety net for accented letters that survived the steps above.
    for accented, plain in (("Ë", "E"), ("ë", "e"), ("Ç", "C"), ("ç", "c")):
        s = s.str.replace(accented, plain, regex=False)
    return s
text = dataset['Text']
category = dataset['Category']
print(category)
X_train, X_test, Y_train, Y_test = train_test_split(text,category, test_size = 0.3, random_state = 42,shuffle=True, stratify=category)
# Initialize a TfidfVectorizer object: tfidf_vectorizer
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=False, min_df=2, norm='l2', encoding='latin-1', ngram_range=(1,2))
# Transform the training data: tfidf_train
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
# Transform the test data: tfidf_test
tfidf_test = tfidf_vectorizer.transform(X_test)`
Train model with Random Forest algorithm:
#Random Forest Classifier
`rfc = Pipeline([('tfidf', TfidfVectorizer()),
('rfc', RandomForestClassifier(n_estimators=100)),
])
rfc.fit(X_train, Y_train)
test_predict = rfc.predict(X_test)
train_accuracy = round(rfc.score(X_train,Y_train)*100)
test_accuracy =round(accuracy_score(test_predict, Y_test)*100)
print("RandomForestClassifier Train Accuracy Score : {}% ".format(train_accuracy ))
print("RandomForestClassifier Test Accuracy Score : {}% ".format(test_accuracy ))
print()
print(classification_report(test_predict, Y_test, target_names=target_category))
import pickle
with open('model/random_fin.pkl', 'wb') as file:
pickle.dump(rfc, file)
with open('model/tfidf_vectorizer.pkl', 'wb') as file:
pickle.dump(rfc.named_steps['tfidf'], file)
new_text = "Berisha ka akuzuar Ramen ne lidhje me aferen e inceneratoreve"
new_text_tfidf = tfidf_vectorizer.transform([new_text])
predicted_category = dt.predict(new_text_tfidf)[0]
redicted_category = "Invalid input, please provide a string"
print(predicted_category)
I'm trying to resolve this issue, but until now no success...

Related

PyTorch DataLoader: Only one element tensors can be converted Python scalars

For Python 3.10 and torch version: 1.12.1, I am using MNIST dataset scaled in the range [0, 1] with one-hot encoded vectors for the target as:
batch_size = 256
# Define transformations for MNIST dataset-
# MNIST dataset statistics-
# mean = np.array([0.1307])
# std_dev = np.array([0.3081])
transforms_apply = transforms.Compose(
[
transforms.ToTensor(),
# transforms.Normalize(mean = mean, std = std_dev)
]
)
# Load MNIST dataset-
train_dataset = torchvision.datasets.MNIST(
root = 'data', train = True,
transform = transforms_apply, download = True
)
test_dataset = torchvision.datasets.MNIST(
root = 'data', train = False,
transform = transforms_apply
)
# Sanity check-
print(f"training dataset length/shape: {list(train_dataset.data.size())}")
# training dataset length/shape: [60000, 28, 28]
print(f"mean = {train_dataset.data.float().mean() / 255:.4f} &"
f" std dev = {train_dataset.data.float().std() / 255:.4f}"
)
# mean = 0.1307 & std dev = 0.3081
# Convert the targets to one-hot encoded vectors-
train_dataset.targets = F.one_hot(train_dataset.targets, num_classes = 10)
test_dataset.targets = F.one_hot(test_dataset.targets, num_classes = 10)
# Sanity checks-
print(f"Train dataset: min = {train_dataset.data.min()} & max = {train_dataset.data.max()};"
f" Test dataset: min = {test_dataset.data.min()} & max = {test_dataset.data.max()}"
)
# Train dataset: min = 0 & max = 255; Test dataset: min = 0 & max = 255
train_dataset.data.shape, train_dataset.targets.shape
# (torch.Size([60000, 28, 28]), torch.Size([60000, 10]))
test_dataset.data.shape, test_dataset.targets.shape
# (torch.Size([10000, 28, 28]), torch.Size([10000, 10]))
# Create training and testing dataloaders-
train_loader = torch.utils.data.DataLoader(
dataset = train_dataset, batch_size = batch_size,
shuffle = True
)
test_loader = torch.utils.data.DataLoader(
dataset = test_dataset, batch_size = batch_size,
shuffle = False
)
print(f"Sizes of train_dataset: {len(train_dataset)} and test_dataet: {len(test_dataset)}")
print(f"Sizes of train_loader: {len(train_loader)} and test_loader: {len(test_loader)}")
# Sizes of train_dataset: 60000 and test_dataet: 10000
# Sizes of train_loader: 235 and test_loader: 40
print(f"len(train_loader) = {len(train_loader)} & len(test_loader) = {len(test_loader)}")
# len(train_loader) = 235 & len(test_loader) = 40
# Sanity check-
len(train_dataset) / batch_size, len(test_dataset) / batch_size
# (234.375, 39.0625)
# Get some random batch of training images & labels-
x, y = next(iter(train_loader))
print(f"images.shape: {x.shape}, labels.shape: {y.shape}")
This generates the error:
--------------------------------------------------------------------------- ValueError Traceback (most recent call
last) Input In [25], in <cell line: 2>()
1 # Get some random batch of training images & labels-
----> 2 x, y = next(iter(train_loader))
3 print(f"images.shape: {x.shape}, labels.shape: {y.shape}")
File
~\anaconda3\envs\torch-gpu\lib\site-packages\torch\utils\data\dataloader.py:681,
in _BaseDataLoaderIter.next(self)
678 if self._sampler_iter is None:
679 # TODO(https://github.com/pytorch/pytorch/issues/76750)
680 self._reset() # type: ignore[call-arg]
--> 681 data = self._next_data()
682 self._num_yielded += 1
683 if self._dataset_kind == _DatasetKind.Iterable and
684 self._IterableDataset_len_called is not None and
685 self._num_yielded > self._IterableDataset_len_called:
File
~\anaconda3\envs\torch-gpu\lib\site-packages\torch\utils\data\dataloader.py:721,
in _SingleProcessDataLoaderIter._next_data(self)
719 def _next_data(self):
720 index = self._next_index() # may raise StopIteration
--> 721 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
722 if self._pin_memory:
723 data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)
File
~\anaconda3\envs\torch-gpu\lib\site-packages\torch\utils\data_utils\fetch.py:49,
in _MapDatasetFetcher.fetch(self, possibly_batched_index)
47 def fetch(self, possibly_batched_index):
48 if self.auto_collation:
---> 49 data = [self.dataset[idx] for idx in possibly_batched_index]
50 else:
51 data = self.dataset[possibly_batched_index]
File
~\anaconda3\envs\torch-gpu\lib\site-packages\torch\utils\data_utils\fetch.py:49,
in (.0)
47 def fetch(self, possibly_batched_index):
48 if self.auto_collation:
---> 49 data = [self.dataset[idx] for idx in possibly_batched_index]
50 else:
51 data = self.dataset[possibly_batched_index]
File
~\anaconda3\envs\torch-gpu\lib\site-packages\torchvision\datasets\mnist.py:138,
in MNIST.getitem(self, index)
130 def getitem(self, index: int) -> Tuple[Any, Any]:
131 """
132 Args:
133 index (int): Index (...)
136 tuple: (image, target) where target is index of the target class.
137 """
--> 138 img, target = self.data[index], int(self.targets[index])
140 # doing this so that it is consistent with all other datasets
141 # to return a PIL Image
142 img = Image.fromarray(img.numpy(), mode="L")
ValueError: only one element tensors can be converted to Python
scalars
I know that this is due to the one-hot encoding since when not using it, this error is absent. How to solve it?
You can add it to your transforms using the Lambda transform
# Image pipeline (unchanged from the question): the entries of a `transform`
# Compose are applied to the image only.
transforms_apply = transforms.Compose(
    [
        transforms.ToTensor(),
        # transforms.Normalize(mean = mean, std = std_dev)
    ]
)
# Label pipeline: the one-hot encoding has to be a *target* transform, not an
# entry of the image Compose above (there it would one-hot the pixel tensor).
# The traceback shows MNIST.__getitem__ producing the label as a Python int,
# hence the torch.tensor(...) wrapper before F.one_hot.
target_transforms_apply = transforms.Lambda(
    lambda label: F.one_hot(torch.tensor(label), num_classes=10)
)
# Pass both when building the datasets and leave `dataset.targets` as the raw
# class indices, so the int(self.targets[index]) call keeps working:
#   torchvision.datasets.MNIST(
#       root='data', train=True, download=True,
#       transform=transforms_apply, target_transform=target_transforms_apply
#   )

xgb.train(): TypeError: float() argument must be a string or a number, not 'DMatrix'

When I look at the documentation, the argument is supposed to be a 'DMatrix' (xgboost version 1.5.0).
https://xgboost.readthedocs.io/en/latest/python/python_api.html#:~:text=Customized%20objective%20function.-,Learning%20API,num_boost_round%20(int)%20%E2%80%93%20Number%20of%20boosting%20iterations,-.
Indicates pretty much the same thing for the version I'm using (goto subheading '1.2.2 Python' in document link below):
https://xgboost.readthedocs.io/_/downloads/en/release_1.3.0/pdf/
I don't understand why it is asking for a float argument when it is supposed to be a DMatrix.
I've looked at all the Stack posts that have the string 'TypeError: float() argument must be a string or a number, not...', but none of them include 'DMatrix' and I have not been able to find a solution that I could adapt to this particular issue.
The following is the bit of code that elicits this error (go to 'clf = xgb.train(...)'):
# grid_search: for every parameter combination yielded by get_grid_iterable(),
# cross-validate an XGBoost model with KFold, collect the per-fold scores and
# write the aggregated table to "grid-search.csv". A RuntimeError raised by the
# timeout(...) context ends the search early; results gathered so far are kept.
#   timeout_seconds -- passed to timeout(...)
#   cv_splits       -- n_splits for KFold
#   num_boost_round -- third positional argument of xgb.train
# NOTE(review): indentation was lost when this snippet was pasted, so the
# with/try/for nesting cannot be read off the text; the `**...**` wrappers on
# some lines are bold-markup residue, not Python.
def grid_search(timeout_seconds, cv_splits, num_boost_round):
# Read input data
X, y = preprocessing()
# Shift the labels 1..22 down to 0..21 in place (num_class is set to 22 below).
y.replace({1:0,2:1,3:2,4:3,5:4,6:5,7:6,8:7,9:8,10:9,11:10,12:11,13:12,14:13,
15:14,16:15,17:16,18:17,19:18,20:19,21:20,22:21}, inplace = True)
# Create dataframe to collect the results
tests_columns = ["test_nr", "cv_mean", "cv_min", "cv_max", "cv_median", "params"]
test_id = 0
tests = pd.DataFrame(columns=tests_columns)
# Cross validation number of splits
kf = KFold(n_splits=cv_splits)
# Execute until timeout occurs
with timeout(timeout_seconds, exception=RuntimeError):
# Get the grid
grid_iter, keys, length = get_grid_iterable()
try:
# For every element of the grid
for df_grid in grid_iter:
# Prepare a list to collect the scores
score = []
params = dict(zip(keys, df_grid))
# The objective function
params["objective"] = "multi:softprob"
params['num_class'] = 22
print('X.reason_action_converted: ', X.reason_action_converted)
# For each fold, train XGBoost and spit out the results
for train_index, test_index in kf.split(X.values):
# Get X train and X test
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
**# Get y train and y test**
y_train, y_test = y.iloc[train_index], y.iloc[test_index]
# Convert into DMatrix
d_train = xgb.DMatrix(X_train, label=y_train, missing=np.NaN)
d_valid = xgb.DMatrix(X_test, label=y_test, missing=np.NaN)
d_test = xgb.DMatrix(X_test, missing=np.NaN)
watchlist = [(d_train, 'train'), (d_valid, 'valid')]
# Create the classifier using the current grid params. Apply early stopping of 50 rounds
# The triple-quoted line below is a bare string expression (an earlier version
# of the call that used `boosting_rounds`); it is not executed as a call.
'''clf = xgb.train(params, d_train, boosting_rounds, watchlist, early_stopping_rounds=50, feval=log_loss, maximize=True, verbose_eval=10)'''
# NOTE(review): the traceback in this post shows xgboost invoking
# feval(predictions, dmatrix), while sklearn's log_loss is declared as
# log_loss(y_true, y_pred, ...). The DMatrix therefore arrives as y_pred, which
# is where the reported TypeError is raised. Presumably a small wrapper that
# reads the labels from the DMatrix is needed -- confirm against the xgboost docs.
# NOTE(review): maximize=True combined with a loss metric looks inverted -- confirm.
**clf = xgb.train(params, d_train, num_boost_round, watchlist, early_stopping_rounds=50, feval=log_loss, maximize=True, verbose_eval=10)**
y_hat = clf.predict(d_test)
# Append Scores on the fold kept out
# NOTE(review): with objective multi:softprob, y_hat is presumably a matrix of
# class probabilities rather than labels; verify that r2_score is the intended metric.
score.append(r2_score(y_test, y_hat))
# Store the result into a dataframe
score_df = pd.DataFrame(columns=tests_columns, data=[
[test_id, np.mean(score), np.min(score), np.max(score), np.median(score),
json.dumps(dict(zip(keys, [str(g) for g in df_grid])))]])
test_id += 1
tests = pd.concat([tests, score_df])
except RuntimeError:
# When timeout occurs an exception is raised and the main cycle is broken
pass
# Spit out the results
tests.to_csv("grid-search.csv", index=False)
print(tests)
**if __name__ == "__main__":
grid_search(timeout_seconds=3600, cv_splits=4, num_boost_round=500)**
The error message:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<command-3902447645915365> in <module>
106
107 if __name__ == "__main__":
--> 108 grid_search(timeout_seconds=3600,
cv_splits=4, num_boost_round=500)
<command-3902447645915365> in grid_search(timeout_seconds, cv_splits, num_boost_round)
84 # Create the classifier using the current grid params. Apply early stopping of 50 rounds
85 '''clf = xgb.train(params,
d_train, boosting_rounds, watchlist,
early_stopping_rounds=50, feval=log_loss,
maximize=True, verbose_eval=10)'''
---> 86 clf = xgb.train(params,
d_train, num_boost_round, watchlist,
early_stopping_rounds=50, feval=log_loss,
maximize=True, verbose_eval=10)
87 y_hat = clf.predict(d_test)
88
/databricks/python/lib/python3.8/site-
packages/xgboost/training.py in train(params, dtrain,
num_boost_round, evals, obj, feval, maximize,
early_stopping_rounds, evals_result, verbose_eval,
xgb_model, callbacks)
204 Booster : a trained booster model
205 """
--> 206 bst = _train_internal(params, dtrain,
207
num_boost_round=num_boost_round,
208 evals=evals,
/databricks/python/lib/python3.8/site-packages/xgboost/training.py in _train_internal(params, dtrain, num_boost_round, evals, obj, feval, xgb_model, callbacks, evals_result, maximize, verbose_eval, early_stopping_rounds)
107 nboost += 1
108 # check evaluation result.
--> 109 if callbacks.after_iteration(bst, i,
dtrain, evals):
110 break
111 # do checkpoint after evaluation, in
case evaluation also updates
/databricks/python/lib/python3.8/site-
packages/xgboost/callback.py in after_iteration(self,
model, epoch, dtrain, evals)
421 for _, name in evals:
422 assert name.find('-') == -1,
'Dataset name should not contain `-`'
--> 423 score = model.eval_set(evals,
epoch, self.metric)
424 score = score.split()[1:] # into
datasets
425 # split up `test-error:0.1234`
/databricks/python/lib/python3.8/site-
packages/xgboost/core.py in eval_set(self, evals,
iteration, feval)
1350 if feval is not None:
1351 for dmat, evname in evals:
-> 1352 feval_ret =
feval(self.predict(dmat, training=False,
1353
output_margin=True), dmat)
1354 if isinstance(feval_ret, list):
/databricks/python/lib/python3.8/site-
packages/sklearn/utils/validation.py in inner_f(*args,
**kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k, arg in
zip(sig.parameters, args)})
---> 72 return f(**kwargs)
73 return inner_f
74
/databricks/python/lib/python3.8/site-
packages/sklearn/metrics/_classification.py in
log_loss(y_true, y_pred, eps, normalize, sample_weight,
labels)
2184 The logarithm used is the natural logarithm
(base-e).
2185 """
-> 2186 y_pred = check_array(y_pred,
ensure_2d=False)
2187 check_consistent_length(y_pred, y_true,
sample_weight)
2188
/databricks/python/lib/python3.8/site-
packages/sklearn/utils/validation.py in inner_f(*args,
**kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k, arg in
zip(sig.parameters, args)})
---> 72 return f(**kwargs)
73 return inner_f
74
/databricks/python/lib/python3.8/site-
packages/sklearn/utils/validation.py in
check_array(array, accept_sparse, accept_large_sparse,
dtype, order, copy, force_all_finite, ensure_2d,
allow_nd, ensure_min_samples, ensure_min_features,
estimator)
636 # make sure we actually converted to
numeric:
637 if dtype_numeric and array.dtype.kind
== "O":
--> 638 array = array.astype(np.float64)
639 if not allow_nd and array.ndim >= 3:
640 raise ValueError("Found array with
dim %d. %s expected <= 2."
TypeError: float() argument must be a string or a number, not 'DMatrix'
I'm using Databricks, Python 3.8.8, and xgboost 1.3.1.
I am trying to adapt code from the following tutorial: Effortless Hyperparameters Tuning with Apache Spark.

During inference, how do I set the target parameter with EfficientDet-PyTorch?

I'm studying object detection code with efficientdet-pytorch for the first time.
I used a pretrained weight model, with the following as references:
https://www.kaggle.com/shonenkov/inference-efficientdet
https://github.com/toandaominh1997/EfficientDet.Pytorch
https://github.com/toandaominh1997/EfficientDet.Pytorch
I tried to check efficientdet model's output...
from effdet import get_efficientdet_config, EfficientDet, DetBenchTrain
from effdet.efficientdet import HeadNet
#load sample efficientdet code
config = get_efficientdet_config('tf_efficientdet_d0')
config.image_size = [512,512]
config.norm_kwargs=dict(eps=.001, momentum=.01)
net = EfficientDet(config, pretrained_backbone=False)
checkpoint = torch.load('efficientdet_d0-d92fd44f.pth')
net.load_state_dict(checkpoint)
#net.reset_head(num_classes=1)
#net.class_net = HeadNet(config, num_outputs=config.num_classes)
net=DetBenchTrain(net, config)
print("Loaded pretrained weights")
#>>>Loaded pretrained weights"
#img.shape:[3,512,512]
net.eval()
with torch.no_grad():
detected=net(img.unsqueeze(0))
#error of target is below.
TypeError: forward() missing 1 required positional argument: 'target'
Then, referring to https://www.kaggle.com/shonenkov/inference-efficientdet,
I tried the code below.
# make_predictions: run the global `net` on a batch of images with gradients
# disabled and keep, per image, the boxes and scores whose score is above
# score_threshold. Returns the per-image list wrapped in one more list.
# NOTE(review): indentation was lost when this snippet was pasted, so it is not
# certain which statements sit inside the `with` block and the `for` loop.
def make_predictions(images, score_threshold=0.22):
predictions = []
with torch.no_grad():
# NOTE(review): the traceback in this post shows the bench's forward() indexing
# target['bbox'] and target['cls'], so the 1-D float tensor passed as `target`
# here is what triggers "too many indices for tensor of dimension 1".
# Presumably a prediction-mode wrapper (not the training bench) is needed for
# inference -- confirm against the effdet package.
det = net(images, torch.tensor([1]*images.shape[0]).float())
print(det.shape)
for i in range(images.shape[0]):
# Columns 0..3 are read as the box, column 4 as the score.
boxes = det[i].detach().cpu().numpy()[:,:4]
scores = det[i].detach().cpu().numpy()[:,4]
indexes = np.where(scores > score_threshold)[0]
boxes = boxes[indexes]
# Adds columns 0/1 onto columns 2/3 -- presumably converting [x, y, w, h] into
# [x1, y1, x2, y2]; confirm the box format the model emits.
boxes[:, 2] = boxes[:, 2] + boxes[:, 0]
boxes[:, 3] = boxes[:, 3] + boxes[:, 1]
predictions.append({
# NOTE(review): `boxes` was already filtered with `indexes` above; indexing it
# with `indexes` a second time looks unintended -- verify.
'boxes': boxes[indexes],
'scores': scores[indexes],
})
return [predictions]
output=make_predictions(img.unsqueeze(0))
#error is below...
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-412-e81ace805280> in <module>
----> 1 output=make_predictions(img.unsqueeze(0))
<ipython-input-407-a6aab7dca874> in make_predictions(images, score_threshold)
3 predictions = []
4 with torch.no_grad():
----> 5 det = net(images, torch.tensor([1]*images.shape[0]).float())
6 print(det.shape)
7 for i in range(images.shape[0]):
/opt/anaconda3/envs/yohenv/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
720 result = self._slow_forward(*input, **kwargs)
721 else:
--> 722 result = self.forward(*input, **kwargs)
723 for hook in itertools.chain(
724 _global_forward_hooks.values(),
/opt/anaconda3/envs/yohenv/lib/python3.7/site-packages/effdet/bench.py in forward(self, x, target)
117 else:
118 cls_targets, box_targets, num_positives = self.anchor_labeler.batch_label_anchors(
--> 119 target['bbox'], target['cls'])
120
121 loss, class_loss, box_loss = self.loss_fn(class_out, box_out, cls_targets, box_targets, num_positives)
IndexError: too many indices for tensor of dimension 1
When running inference on test data, how do I set the target parameter?
Sorry for the inconvenience; can you give me some advice?

How to use SHAP with a linear SVC model from sklearn using Pipeline?

I am doing text classification using a linear SVC model from sklearn. Now I want to visualize which words/tokens have the highest impact on the classification decision by using SHAP (https://github.com/slundberg/shap).
Right now this does not work because I am getting an error that seems to originate from the vectorizer step in the pipeline I have defined - what's wrong here?
Is my general approach on how to use SHAP in this case correct?
x_Train, x_Test, y_Train, y_Test = train_test_split(df_all['PDFText'], df_all['class'], test_size = 0.2, random_state = 1234)
pipeline = Pipeline([
(
'tfidv',
TfidfVectorizer(
ngram_range=(1,3),
analyzer='word',
strip_accents = ascii,
use_idf = True,
sublinear_tf=True,
max_features=6000,
min_df=2,
max_df=1.0
)
),
(
'lin_svc',
svm.SVC(
C=1.0,
probability=True,
kernel='linear'
)
)
])
pipeline.fit(x_Train, y_Train)
shap.initjs()
explainer = shap.KernelExplainer(pipeline.predict_proba, x_Train)
shap_values = explainer.shap_values(x_Test, nsamples=100)
shap.force_plot(explainer.expected_value[0], shap_values[0][0,:], x_Test.iloc[0,:])
This is the error message I get:
Provided model function fails when applied to the provided data set.
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-81-4bca63616b3b> in <module>
3
4 # use Kernel SHAP to explain test set predictions
----> 5 explainer = shap.KernelExplainer(pipeline.predict_proba, x_Train)
6 shap_values = explainer.shap_values(x_Test, nsamples=100)
7
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\shap\explainers\kernel.py in __init__(self, model, data, link, **kwargs)
95 self.keep_index_ordered = kwargs.get("keep_index_ordered", False)
96 self.data = convert_to_data(data, keep_index=self.keep_index)
---> 97 model_null = match_model_to_data(self.model, self.data)
98
99 # enforce our current input type limitations
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\shap\common.py in match_model_to_data(model, data)
80 out_val = model.f(data.convert_to_df())
81 else:
---> 82 out_val = model.f(data.data)
83 except:
84 print("Provided model function fails when applied to the provided data set.")
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\utils\metaestimators.py in <lambda>(*args, **kwargs)
116
117 # lambda, but not partial, allows help() to work with update_wrapper
--> 118 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
119 # update the docstring of the returned function
120 update_wrapper(out, self.fn)
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\pipeline.py in predict_proba(self, X)
379 for name, transform in self.steps[:-1]:
380 if transform is not None:
--> 381 Xt = transform.transform(Xt)
382 return self.steps[-1][-1].predict_proba(Xt)
383
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\feature_extraction\text.py in transform(self, raw_documents, copy)
1631 check_is_fitted(self, '_tfidf', 'The tfidf vector is not fitted')
1632
-> 1633 X = super(TfidfVectorizer, self).transform(raw_documents)
1634 return self._tfidf.transform(X, copy=False)
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\feature_extraction\text.py in transform(self, raw_documents)
1084
1085 # use the same matrix-building strategy as fit_transform
-> 1086 _, X = self._count_vocab(raw_documents, fixed_vocab=True)
1087 if self.binary:
1088 X.data.fill(1)
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\feature_extraction\text.py in _count_vocab(self, raw_documents, fixed_vocab)
940 for doc in raw_documents:
941 feature_counter = {}
--> 942 for feature in analyze(doc):
943 try:
944 feature_idx = vocabulary[feature]
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\feature_extraction\text.py in <lambda>(doc)
326 tokenize)
327 return lambda doc: self._word_ngrams(
--> 328 tokenize(preprocess(self.decode(doc))), stop_words)
329
330 else:
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\feature_extraction\text.py in <lambda>(x)
254
255 if self.lowercase:
--> 256 return lambda x: strip_accents(x.lower())
257 else:
258 return strip_accents
AttributeError: 'numpy.ndarray' object has no attribute 'lower'
KernelExplainer expects to receive a classification model as the first argument. Please check the use of Pipeline with Shap following the link.
In your case, you can use the Pipeline as follows:
x_Train = pipeline.named_steps['tfidv'].fit_transform(x_Train)
explainer = shap.KernelExplainer(pipeline.named_steps['lin_svc'].predict_proba, x_Train)

Bag of words and length of notes for Classification (different types of features) using FeatureUnion()

I am trying to do a binary classification task, namely to classify notes into two classes. I have already implemented a bag-of-words classification approach. I begin with a pandas DataFrame with a 'text' column, which lists all the notes, and a 'label' column with either 0 or 1. After splitting the data into train/test, the X_train data is a pandas Series object with only the text in the rows. I constructed the pipeline below to classify the data based on a bag-of-words approach (of course with the imports needed to run the code).
pipeline = Pipeline([
('vect', CountVectorizer(preprocessor=preprocess, tokenizer=tokenizeText)),
('tfidf', TfidfTransformer()),
('clf', LogisticRegression())
])
parameters = {
'vect__ngram_range': [(1,1), (1,2), (1,3)], # unigrams or bigrams # PAS AAN
'clf__C': [0.001, 1, 10]
}
if __name__ == "__main__":
# find the best parameters for both the feature extraction and the
# classifier
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=5)
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(X_train, y_train) #PAS AAN
print("done in %0.3fs" % (time() - t0))
print()
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print("\t%s: %r" % (param_name, best_parameters[param_name]))
Now I also want to add the length of the notes (the text) as a feature. I looked into this post: How to add another feature (length of text) to current bag of words classification? Scikit-learn and followed the suggested links, but am not quite able to write the right code using scikit-learn's FeatureUnion(). The code I have, which results in an error, is below (I think it has something to do with the data format of the X_train that I submit?).
First i constructed a 'Transformer':
from sklearn.base import BaseEstimator, TransformerMixin
# LengthExtractor: scikit-learn style transformer (BaseEstimator +
# TransformerMixin) whose transform() runs average_word_length over its input
# through .apply(); fit() learns nothing.
# NOTE(review): the docstrings below talk about a road-name column and an
# *average word length*, but the code only counts whitespace-separated tokens
# via len(x.split()); the docstrings look inherited from another example.
# NOTE(review): indentation was lost when this snippet was pasted, so e.g.
# whether `return lenght_list` sits inside or after the loop cannot be read off
# the text.
class LengthExtractor(BaseEstimator, TransformerMixin):
"""Takes in dataframe, extracts road name column, outputs average word length"""
def __init__(self):
pass
def average_word_length(self, name):
"""Helper code to compute average word length of a name"""
lenght_list = []
# Walks `name` position by position and stores the token count of each item.
# NOTE(review): if transform() receives a pandas Series of strings (the post
# says X_train is one), .apply hands over one string at a time, so name[i]
# would be a single character rather than a whole note -- verify.
for i in range(len(name)):
lenght_list.append(len(name[i].split()))
return lenght_list
def transform(self, df, y=None):
"""The workhorse of this feature extractor"""
# NOTE(review): the traceback in this post shows FeatureUnion hstack-ing the
# transformer outputs and failing on "incompatible row dimensions"; presumably
# this must return a 2-D numeric array of shape (n_samples, 1) -- confirm.
return df.apply(self.average_word_length)
def fit(self, df, y=None):
"""Returns `self` unless something different happens in train and test"""
# Stateless: nothing is computed from df here.
return self
Then I tried to run the following pipeline:
from sklearn.pipeline import Pipeline, FeatureUnion
pipeline = Pipeline([
('features', FeatureUnion([
('ngram_tf_idf', Pipeline([
('vect', CountVectorizer(preprocessor=preprocess, tokenizer=tokenizeText)),
('tfidf', TfidfTransformer())
])),
('len', LengthExtractor()),
])),
('clf', LogisticRegression())
])
pipeline.fit(X_train, X_test)
y = pipeline.predict(y_train)
print(classification_report(y, y_test))
Probably I am doing something wrong when I create the new class, as I am new to OOP. Some help would be really appreciated.
*** Edit: Data Example (note that I created this; the original data is in the Dutch language)
X_train would look like:
11 patient has a visitor this morning everything went well
0 no problems occurd during the cisit of the patient
13 patient will exit his room this afternoon and will leave
12 patient is very agressive in his voice towards other patient
24 today miss hit a nurse when she was passing by
22 sir was in a conflict with another patient on the same ward
Name: Tekst, dtype: object
y_test would look like:
11 0
0 0
13 0
12 1
24 1
22 1
Name: Label, dtype: int64
The same data structure, format holds for the X_test and y_test. Please note that the 'tekst' data in X_train and X_test, is a string from which i already remove punctuation and lowercase.
The additional functions I created to preprocess (to get rid of numbers etc.) and tokenize the text are the following:
def preprocess(text):
    """Return *text* reduced to ASCII letters and spaces.

    ASCII letters and spaces are kept as they are; ``.``, ``/`` and the
    backslash are turned into a space; every other character (digits,
    other punctuation, non-ASCII letters) is dropped.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string.
    """
    kept = []
    for c in text:
        if ('a' <= c <= 'z') or ('A' <= c <= 'Z') or c == ' ':
            kept.append(c)
        elif c in ('.', '/', '\\'):
            # Separators become a space so neighbouring words do not merge.
            kept.append(' ')
    # Join once at the end instead of growing a string one character at a time.
    return ''.join(kept)
from nltk import tokenize
def tokenizeText(text):
    """Split *text* into word tokens with NLTK's Dutch tokenizer.

    Args:
        text: the string to tokenize.

    Returns:
        Whatever ``tokenize.word_tokenize`` yields for *text* (a list of
        token strings).
    """
    # The previous sent_tokenize() call produced a value that was never used,
    # so it has been dropped; only the word tokens are returned.
    return tokenize.word_tokenize(text, language='dutch')
EDIT-16-5-17: ERROR CODE:
ValueError Traceback (most recent call last)
<ipython-input-70-1e3e10829138> in <module>()
12 ])
13
---> 14 pipeline.fit(X_train, X_test)
15 y = pipeline.predict(y_train)
16 print(classification_report(y, y_test))
/Users/jj/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
266 This estimator
267 """
--> 268 Xt, fit_params = self._fit(X, y, **fit_params)
269 if self._final_estimator is not None:
270 self._final_estimator.fit(Xt, y, **fit_params)
/Users/jj/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
232 pass
233 elif hasattr(transform, "fit_transform"):
--> 234 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
235 else:
236 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \
/Users/jj/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
740 self._update_transformer_list(transformers)
741 if any(sparse.issparse(f) for f in Xs):
--> 742 Xs = sparse.hstack(Xs).tocsr()
743 else:
744 Xs = np.hstack(Xs)
/Users/jj/anaconda3/lib/python3.6/site-packages/scipy/sparse/construct.py in hstack(blocks, format, dtype)
462
463 """
--> 464 return bmat([blocks], format=format, dtype=dtype)
465
466
/Users/jj/anaconda3/lib/python3.6/site-packages/scipy/sparse/construct.py in bmat(blocks, format, dtype)
579 elif brow_lengths[i] != A.shape[0]:
580 raise ValueError('blocks[%d,:] has incompatible '
--> 581 'row dimensions' % i)
582
583 if bcol_lengths[j] == 0:
ValueError: blocks[0,:] has incompatible row dimensions

Resources