Related
When I look at the documentation, the argument is supposed to be a 'DMatrix' (xgboost version 1.5.0).
https://xgboost.readthedocs.io/en/latest/python/python_api.html#:~:text=Customized%20objective%20function.-,Learning%20API,num_boost_round%20(int)%20%E2%80%93%20Number%20of%20boosting%20iterations,-.
Indicates pretty much the same thing for the version I'm using (goto subheading '1.2.2 Python' in document link below):
https://xgboost.readthedocs.io/_/downloads/en/release_1.3.0/pdf/
I don't understand why it is asking for a float argument when it is supposed to be a DMatrix.
I've looked at all the Stack posts that have the string 'TypeError: float() argument must be a string or a number, not...', but none of them include 'DMatrix' and I have not been able to find a solution that I could adapt this particular issue.
The the following is the bit of code that elicits this error (go to 'clf - xgb.train(...)'):
def grid_search(timeout_seconds, cv_splits, num_boost_round):
# Read input data
X, y = preprocessing()
y.replace({1:0,2:1,3:2,4:3,5:4,6:5,7:6,8:7,9:8,10:9,11:10,12:11,13:12,14:13,
15:14,16:15,17:16,18:17,19:18,20:19,21:20,22:21}, inplace = True)
# Create dataframe to collect the results
tests_columns = ["test_nr", "cv_mean", "cv_min", "cv_max", "cv_median", "params"]
test_id = 0
tests = pd.DataFrame(columns=tests_columns)
# Cross validation number of splits
kf = KFold(n_splits=cv_splits)
# Execute until timeout occurs
with timeout(timeout_seconds, exception=RuntimeError):
# Get the grid
grid_iter, keys, length = get_grid_iterable()
try:
# For every element of the grid
for df_grid in grid_iter:
# Prepare a list to collect the scores
score = []
params = dict(zip(keys, df_grid))
# The objective function
params["objective"] = "multi:softprob"
params['num_class'] = 22
print('X.reason_action_converted: ', X.reason_action_converted)
# For each fold, train XGBoost and spit out the results
for train_index, test_index in kf.split(X.values):
# Get X train and X test
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
**# Get y train and y test**
y_train, y_test = y.iloc[train_index], y.iloc[test_index]
# Convert into DMatrix
d_train = xgb.DMatrix(X_train, label=y_train, missing=np.NaN)
d_valid = xgb.DMatrix(X_test, label=y_test, missing=np.NaN)
d_test = xgb.DMatrix(X_test, missing=np.NaN)
watchlist = [(d_train, 'train'), (d_valid, 'valid')]
# Create the classifier using the current grid params. Apply early stopping of 50 rounds
'''clf = xgb.train(params, d_train, boosting_rounds, watchlist, early_stopping_rounds=50, feval=log_loss, maximize=True, verbose_eval=10)'''
**clf = xgb.train(params, d_train, num_boost_round, watchlist, early_stopping_rounds=50, feval=log_loss, maximize=True, verbose_eval=10)**
y_hat = clf.predict(d_test)
# Append Scores on the fold kept out
score.append(r2_score(y_test, y_hat))
# Store the result into a dataframe
score_df = pd.DataFrame(columns=tests_columns, data=[
[test_id, np.mean(score), np.min(score), np.max(score), np.median(score),
json.dumps(dict(zip(keys, [str(g) for g in df_grid])))]])
test_id += 1
tests = pd.concat([tests, score_df])
except RuntimeError:
# When timeout occurs an exception is raised and the main cycle is broken
pass
# Spit out the results
tests.to_csv("grid-search.csv", index=False)
print(tests)
**if __name__ == "__main__":
grid_search(timeout_seconds=3600, cv_splits=4, num_boost_round=500)**
The error message:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<command-3902447645915365> in <module>
106
107 if __name__ == "__main__":
--> 108 grid_search(timeout_seconds=3600,
cv_splits=4, num_boost_round=500)
<command-3902447645915365> in grid_search(timeout_seconds, cv_splits, num_boost_round)
84 # Create the classifier using the current grid params. Apply early stopping of 50 rounds
85 '''clf = xgb.train(params,
d_train, boosting_rounds, watchlist,
early_stopping_rounds=50, feval=log_loss,
maximize=True, verbose_eval=10)'''
---> 86 clf = xgb.train(params,
d_train, num_boost_round, watchlist,
early_stopping_rounds=50, feval=log_loss,
maximize=True, verbose_eval=10)
87 y_hat = clf.predict(d_test)
88
/databricks/python/lib/python3.8/site-
packages/xgboost/training.py in train(params, dtrain,
num_boost_round, evals, obj, feval, maximize,
early_stopping_rounds, evals_result, verbose_eval,
xgb_model, callbacks)
204 Booster : a trained booster model
205 """
--> 206 bst = _train_internal(params, dtrain,
207
num_boost_round=num_boost_round,
208 evals=evals,
/databricks/python/lib/python3.8/site-packages/xgboost/training.py in _train_internal(params, dtrain, num_boost_round, evals, obj, feval, xgb_model, callbacks, evals_result, maximize, verbose_eval, early_stopping_rounds)
107 nboost += 1
108 # check evaluation result.
--> 109 if callbacks.after_iteration(bst, i,
dtrain, evals):
110 break
111 # do checkpoint after evaluation, in
case evaluation also updates
/databricks/python/lib/python3.8/site-
packages/xgboost/callback.py in after_iteration(self,
model, epoch, dtrain, evals)
421 for _, name in evals:
422 assert name.find('-') == -1,
'Dataset name should not contain `-`'
--> 423 score = model.eval_set(evals,
epoch, self.metric)
424 score = score.split()[1:] # into
datasets
425 # split up `test-error:0.1234`
/databricks/python/lib/python3.8/site-
packages/xgboost/core.py in eval_set(self, evals,
iteration, feval)
1350 if feval is not None:
1351 for dmat, evname in evals:
-> 1352 feval_ret =
feval(self.predict(dmat, training=False,
1353
output_margin=True), dmat)
1354 if isinstance(feval_ret, list):
/databricks/python/lib/python3.8/site-
packages/sklearn/utils/validation.py in inner_f(*args,
**kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k, arg in
zip(sig.parameters, args)})
---> 72 return f(**kwargs)
73 return inner_f
74
/databricks/python/lib/python3.8/site-
packages/sklearn/metrics/_classification.py in
log_loss(y_true, y_pred, eps, normalize, sample_weight,
labels)
2184 The logarithm used is the natural logarithm
(base-e).
2185 """
-> 2186 y_pred = check_array(y_pred,
ensure_2d=False)
2187 check_consistent_length(y_pred, y_true,
sample_weight)
2188
/databricks/python/lib/python3.8/site-
packages/sklearn/utils/validation.py in inner_f(*args,
**kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k, arg in
zip(sig.parameters, args)})
---> 72 return f(**kwargs)
73 return inner_f
74
/databricks/python/lib/python3.8/site-
packages/sklearn/utils/validation.py in
check_array(array, accept_sparse, accept_large_sparse,
dtype, order, copy, force_all_finite, ensure_2d,
allow_nd, ensure_min_samples, ensure_min_features,
estimator)
636 # make sure we actually converted to
numeric:
637 if dtype_numeric and array.dtype.kind
== "O":
--> 638 array = array.astype(np.float64)
639 if not allow_nd and array.ndim >= 3:
640 raise ValueError("Found array with
dim %d. %s expected <= 2."
TypeError: float() argument must be a string or a number, not 'DMatrix'
I'm using Databricks, Python 3.8.8, and xgboost 1.3.1.
I am trying to adapt code from the following tutorial: Effortless Hyperparameters Tuning with Apache Spark.
This is my first time working on text classification. I am working on binary text classification with CamemBert using fast-bert library which is mostly inspired from fastai.
When I run the code below
from fast_bert.data_cls import BertDataBunch
from fast_bert.learner_cls import BertLearner
databunch = BertDataBunch(DATA_PATH,LABEL_PATH,
tokenizer='camembert-base',
train_file='train.csv',
val_file='val.csv',
label_file='labels.csv',
text_col='text',
label_col='label',
batch_size_per_gpu=8,
max_seq_length=512,
multi_gpu=multi_gpu,
multi_label=False,
model_type='camembert-base')
learner = BertLearner.from_pretrained_model(
databunch,
pretrained_path='camembert-base', #'/content/drive/My Drive/model/model_out'
metrics=metrics,
device=device_cuda,
logger=logger,
output_dir=OUTPUT_DIR,
finetuned_wgts_path=None, #WGTS_PATH
warmup_steps=300,
multi_gpu=multi_gpu,
is_fp16=True,
multi_label=False,
logging_steps=50)
learner.fit(epochs=10,
lr=9e-5,
validate=True,
schedule_type="warmup_cosine",
optimizer_type="adamw")
Everything works fine until training.
I get this error message when I try to train my model:
RuntimeError Traceback (most recent call last)
<ipython-input-13-9b5c6ad7c8f0> in <module>()
3 validate=True,
4 schedule_type="warmup_cosine",
----> 5 optimizer_type="adamw")
2 frames
/usr/local/lib/python3.6/dist-packages/fast_bert/learner_cls.py in fit(self, epochs, lr, validate, return_results, schedule_type, optimizer_type)
421 # Evaluate the model against validation set after every epoch
422 if validate:
--> 423 results = self.validate()
424 for key, value in results.items():
425 self.logger.info(
/usr/local/lib/python3.6/dist-packages/fast_bert/learner_cls.py in validate(self, quiet, loss_only)
515 for metric in self.metrics:
516 validation_scores[metric["name"]] = metric["function"](
--> 517 all_logits, all_labels
518 )
519 results.update(validation_scores)
/usr/local/lib/python3.6/dist-packages/fast_bert/metrics.py in fbeta(y_pred, y_true, thresh, beta, eps, sigmoid)
56 y_pred = (y_pred > thresh).float()
57 y_true = y_true.float()
---> 58 TP = (y_pred * y_true).sum(dim=1)
59 prec = TP / (y_pred.sum(dim=1) + eps)
60 rec = TP / (y_true.sum(dim=1) + eps)
RuntimeError: The size of tensor a (2) must match the size of tensor b (39) at non-singleton dimension 1
How can I fix this ?
Thanks
fbeta doesn't work for binary classification. Using only accuracy solved this.
I am trying to perform hyperparameter tuning for Spatio-Temporal K-Means clustering by using it in a pipeline with a Decision Tree classifier. The idea is to use K-Means clustering algorithm to generate cluster-distance space matrix and clustered labels which will be then passed to Decision Tree classifier. For hyperparameter tuning, just use parameters for K-Means algorithm.
I am using Python 3.8 and sklearn 0.22.
The data I am interested is having 3 columns/attributes: 'time', 'x' and 'y' (x and y are spatial coordinates).
The code is:
class ST_KMeans(BaseEstimator, TransformerMixin):
# class ST_KMeans():
"""
Note that K-means clustering algorithm is designed for Euclidean distances.
It may stop converging with other distances, when the mean is no longer a
best estimation for the cluster 'center'.
The 'mean' minimizes squared differences (or, squared Euclidean distance).
If you want a different distance function, you need to replace the mean with
an appropriate center estimation.
Parameters:
k: number of clusters
eps1 : float, default=0.5
The spatial density threshold (maximum spatial distance) between
two points to be considered related.
eps2 : float, default=10
The temporal threshold (maximum temporal distance) between two
points to be considered related.
metric : string default='euclidean'
The used distance metric - more options are
‘braycurtis’, ‘canberra’, ‘chebyshev’, ‘cityblock’, ‘correlation’,
‘cosine’, ‘dice’, ‘euclidean’, ‘hamming’, ‘jaccard’, ‘jensenshannon’,
‘kulsinski’, ‘mahalanobis’, ‘matching’, ‘rogerstanimoto’, ‘sqeuclidean’,
‘russellrao’, ‘seuclidean’, ‘sokalmichener’, ‘sokalsneath’, ‘yule’.
n_jobs : int or None, default=-1
The number of processes to start; -1 means use all processors (BE AWARE)
Attributes:
labels : array, shape = [n_samples]
Cluster labels for the data - noise is defined as -1
"""
def __init__(self, k, eps1 = 0.5, eps2 = 10, metric = 'euclidean', n_jobs = 1):
self.k = k
self.eps1 = eps1
self.eps2 = eps2
# self.min_samples = min_samples
self.metric = metric
self.n_jobs = n_jobs
def fit(self, X, Y = None):
"""
Apply the ST K-Means algorithm
X : 2D numpy array. The first attribute of the array should be time attribute
as float. The following positions in the array are treated as spatial
coordinates.
The structure should look like this [[time_step1, x, y], [time_step2, x, y]..]
For example 2D dataset:
array([[0,0.45,0.43],
[0,0.54,0.34],...])
Returns:
self
"""
# check if input is correct
X = check_array(X)
# type(X)
# numpy.ndarray
# Check arguments for DBSCAN algo-
if not self.eps1 > 0.0 or not self.eps2 > 0.0:
raise ValueError('eps1, eps2, minPts must be positive')
# Get dimensions of 'X'-
# n - number of rows
# m - number of attributes/columns-
n, m = X.shape
# Compute sqaured form Euclidean Distance Matrix for 'time' and spatial attributes-
time_dist = squareform(pdist(X[:, 0].reshape(n, 1), metric = self.metric))
euc_dist = squareform(pdist(X[:, 1:], metric = self.metric))
'''
Filter the euclidean distance matrix using time distance matrix. The code snippet gets all the
indices of the 'time_dist' matrix in which the time distance is smaller than 'eps2'.
Afterward, for the same indices in the euclidean distance matrix the 'eps1' is doubled which results
in the fact that the indices are not considered during clustering - as they are bigger than 'eps1'.
'''
# filter 'euc_dist' matrix using 'time_dist' matrix-
dist = np.where(time_dist <= self.eps2, euc_dist, 2 * self.eps1)
# Initialize K-Means clustering model-
self.kmeans_clust_model = KMeans(
n_clusters = self.k, init = 'k-means++',
n_init = 10, max_iter = 300,
precompute_distances = 'auto', algorithm = 'auto')
# Train model-
self.kmeans_clust_model.fit(dist)
self.labels = self.kmeans_clust_model.labels_
self.X_transformed = self.kmeans_clust_model.fit_transform(X)
return self
def transform(self, X):
if not isinstance(X, np.ndarray):
# Convert to numpy array-
X = X.values
# Get dimensions of 'X'-
# n - number of rows
# m - number of attributes/columns-
n, m = X.shape
# Compute sqaured form Euclidean Distance Matrix for 'time' and spatial attributes-
time_dist = squareform(pdist(X[:, 0].reshape(n, 1), metric = self.metric))
euc_dist = squareform(pdist(X[:, 1:], metric = self.metric))
# filter 'euc_dist' matrix using 'time_dist' matrix-
dist = np.where(time_dist <= self.eps2, euc_dist, 2 * self.eps1)
# return self.kmeans_clust_model.transform(X)
return self.kmeans_clust_model.transform(dist)
# Initialize ST-K-Means object-
st_kmeans_algo = ST_KMeans(
k = 5, eps1=0.6,
eps2=9, metric='euclidean',
n_jobs=1
)
Y = np.zeros(shape = (501,))
# Train on a chunk of dataset-
st_kmeans_algo.fit(data.loc[:500, ['time', 'x', 'y']], Y)
# Get clustered data points labels-
kmeans_labels = st_kmeans_algo.labels
kmeans_labels.shape
# (501,)
# Get labels for points clustered using trained model-
# kmeans_transformed = st_kmeans_algo.X_transformed
kmeans_transformed = st_kmeans_algo.transform(data.loc[:500, ['time', 'x', 'y']])
kmeans_transformed.shape
# (501, 5)
dtc = DecisionTreeClassifier()
dtc.fit(kmeans_transformed, kmeans_labels)
y_pred = dtc.predict(kmeans_transformed)
# Get model performance metrics-
accuracy = accuracy_score(kmeans_labels, y_pred)
precision = precision_score(kmeans_labels, y_pred, average='macro')
recall = recall_score(kmeans_labels, y_pred, average='macro')
print("\nDT model metrics are:")
print("accuracy = {0:.4f}, precision = {1:.4f} & recall = {2:.4f}\n".format(
accuracy, precision, recall
))
# DT model metrics are:
# accuracy = 1.0000, precision = 1.0000 & recall = 1.0000
# Hyper-parameter Tuning:
# Define steps of pipeline-
pipeline_steps = [
('st_kmeans_algo' ,ST_KMeans(k = 5, eps1=0.6, eps2=9, metric='euclidean', n_jobs=1)),
('dtc', DecisionTreeClassifier())
]
# Instantiate a pipeline-
pipeline = Pipeline(pipeline_steps)
kmeans_transformed.shape, kmeans_labels.shape
# ((501, 5), (501,))
# Train pipeline-
pipeline.fit(kmeans_transformed, kmeans_labels)
# Specify parameters to be hyper-parameter tuned-
params = [
{
'st_kmeans_algo__k': [3, 5, 7]
}
]
# Initialize GridSearchCV object-
grid_cv = GridSearchCV(estimator=pipeline, param_grid=params, cv = 2)
# Train GridSearch on computed data from above-
grid_cv.fit(kmeans_transformed, kmeans_labels)
The 'grid_cv.fit()' call gives the following error:
ValueError Traceback (most recent call
last) in
5
6 # Train GridSearch on computed data from above-
----> 7 grid_cv.fit(kmeans_transformed, kmeans_labels)
~/.local/lib/python3.8/site-packages/sklearn/model_selection/_search.py
in fit(self, X, y, groups, **fit_params)
708 return results
709
--> 710 self._run_search(evaluate_candidates)
711
712 # For multi-metric evaluation, store the best_index_, best_params_ and
~/.local/lib/python3.8/site-packages/sklearn/model_selection/_search.py
in _run_search(self, evaluate_candidates) 1149 def
_run_search(self, evaluate_candidates): 1150 """Search all candidates in param_grid"""
-> 1151 evaluate_candidates(ParameterGrid(self.param_grid)) 1152 1153
~/.local/lib/python3.8/site-packages/sklearn/model_selection/_search.py
in evaluate_candidates(candidate_params)
680 n_splits, n_candidates, n_candidates * n_splits))
681
--> 682 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
683 X, y,
684 train=train, test=test,
~/.local/lib/python3.8/site-packages/joblib/parallel.py in
call(self, iterable) 1002 # remaining jobs. 1003 self._iterating = False
-> 1004 if self.dispatch_one_batch(iterator): 1005 self._iterating = self._original_iterator is not None 1006
~/.local/lib/python3.8/site-packages/joblib/parallel.py in
dispatch_one_batch(self, iterator)
833 return False
834 else:
--> 835 self._dispatch(tasks)
836 return True
837
~/.local/lib/python3.8/site-packages/joblib/parallel.py in
_dispatch(self, batch)
752 with self._lock:
753 job_idx = len(self._jobs)
--> 754 job = self._backend.apply_async(batch, callback=cb)
755 # A job can complete so quickly than its callback is
756 # called before we get here, causing self._jobs to
~/.local/lib/python3.8/site-packages/joblib/_parallel_backends.py in
apply_async(self, func, callback)
207 def apply_async(self, func, callback=None):
208 """Schedule a func to be run"""
--> 209 result = ImmediateResult(func)
210 if callback:
211 callback(result)
~/.local/lib/python3.8/site-packages/joblib/_parallel_backends.py in
init(self, batch)
588 # Don't delay the application, to avoid keeping the input
589 # arguments in memory
--> 590 self.results = batch()
591
592 def get(self):
~/.local/lib/python3.8/site-packages/joblib/parallel.py in
call(self)
253 # change the default number of processes to -1
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 255 return [func(*args, **kwargs)
256 for func, args, kwargs in self.items]
257
~/.local/lib/python3.8/site-packages/joblib/parallel.py in
(.0)
253 # change the default number of processes to -1
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 255 return [func(*args, **kwargs)
256 for func, args, kwargs in self.items]
257
~/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py
in _fit_and_score(estimator, X, y, scorer, train, test, verbose,
parameters, fit_params, return_train_score, return_parameters,
return_n_test_samples, return_times, return_estimator, error_score)
542 else:
543 fit_time = time.time() - start_time
--> 544 test_scores = _score(estimator, X_test, y_test, scorer)
545 score_time = time.time() - start_time - fit_time
546 if return_train_score:
~/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py
in _score(estimator, X_test, y_test, scorer)
589 scores = scorer(estimator, X_test)
590 else:
--> 591 scores = scorer(estimator, X_test, y_test)
592
593 error_msg = ("scoring must return a number, got %s (%s) "
~/.local/lib/python3.8/site-packages/sklearn/metrics/_scorer.py in
call(self, estimator, *args, **kwargs)
87 *args, **kwargs)
88 else:
---> 89 score = scorer(estimator, *args, **kwargs)
90 scores[name] = score
91 return scores
~/.local/lib/python3.8/site-packages/sklearn/metrics/_scorer.py in
_passthrough_scorer(estimator, *args, **kwargs)
369 def _passthrough_scorer(estimator, *args, **kwargs):
370 """Function that wraps estimator.score"""
--> 371 return estimator.score(*args, **kwargs)
372
373
~/.local/lib/python3.8/site-packages/sklearn/utils/metaestimators.py
in (*args, **kwargs)
114
115 # lambda, but not partial, allows help() to work with update_wrapper
--> 116 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
117 # update the docstring of the returned function
118 update_wrapper(out, self.fn)
~/.local/lib/python3.8/site-packages/sklearn/pipeline.py in
score(self, X, y, sample_weight)
617 if sample_weight is not None:
618 score_params['sample_weight'] = sample_weight
--> 619 return self.steps[-1][-1].score(Xt, y, **score_params)
620
621 #property
~/.local/lib/python3.8/site-packages/sklearn/base.py in score(self, X,
y, sample_weight)
367 """
368 from .metrics import accuracy_score
--> 369 return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
370
371
~/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py
in accuracy_score(y_true, y_pred, normalize, sample_weight)
183
184 # Compute accuracy for each possible representation
--> 185 y_type, y_true, y_pred = _check_targets(y_true, y_pred)
186 check_consistent_length(y_true, y_pred, sample_weight)
187 if y_type.startswith('multilabel'):
~/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py
in _check_targets(y_true, y_pred)
78 y_pred : array or indicator matrix
79 """
---> 80 check_consistent_length(y_true, y_pred)
81 type_true = type_of_target(y_true)
82 type_pred = type_of_target(y_pred)
~/.local/lib/python3.8/site-packages/sklearn/utils/validation.py in
check_consistent_length(*arrays)
209 uniques = np.unique(lengths)
210 if len(uniques) > 1:
--> 211 raise ValueError("Found input variables with inconsistent numbers of"
212 " samples: %r" % [int(l) for l in lengths])
213
ValueError: Found input variables with inconsistent numbers of
samples: [251, 250]
The different dimensions/shapes are:
kmeans_transformed.shape, kmeans_labels.shape, data.loc[:500, ['time', 'x', 'y']].shape
# ((501, 5), (501,), (501, 3))
I don't get it how the error arrives at the "samples: [251, 25]" ?
What's going wrong?
Thanks!
250 and 251 are respectively the shapes of your train and validation in GridSearchCV
look at your custom estimator...
def transform(self, X):
return self.X_transformed
the original transform method doesn't apply any sort of operation it simply returns the train data. we need an estimator that is able to transform the new data (in sour case the validation inside gridsearch) in a flexible way. change the transform method in this way
def transform(self, X):
return self.kmeans_clust_model.transform(X)
I'm training polynomial regressions over a series of dimensions, and attempting to use predict() for a list of inputs.
inputs = np.linspace(0,10,100).reshape(-1,1)
for i, deg in enumerate([1, 3, 6, 9]):
poly = PolynomialFeatures(degree=deg)
X_poly = poly.fit_transform(X_train.reshape(-1,1))
linreg = LinearRegression().fit(X_poly, y_train)
print(linreg.predict(inputs))
When I call predict(), I get the following traceback:
ValueError Traceback (most recent call last)
<ipython-input-5-4100ae3f3ba3> in <module>()
13 return
14
---> 15 answer_one()
<ipython-input-5-4100ae3f3ba3> in answer_one()
9 X_poly = PolynomialFeatures(degree=deg).fit_transform(X_train.reshape(-1,1))
10 linreg = LinearRegression().fit(X_poly, y_train)
---> 11 print(linreg.predict(inputs))
12 # print(linreg.score(X_poly, y_train))
13 return
/opt/conda/lib/python3.6/site-packages/sklearn/linear_model/base.py in predict(self, X)
266 Returns predicted values.
267 """
--> 268 return self._decision_function(X)
269
270 _preprocess_data = staticmethod(_preprocess_data)
/opt/conda/lib/python3.6/site-packages/sklearn/linear_model/base.py in _decision_function(self, X)
251 X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
252 return safe_sparse_dot(X, self.coef_.T,
--> 253 dense_output=True) + self.intercept_
254
255 def predict(self, X):
/opt/conda/lib/python3.6/site-packages/sklearn/utils/extmath.py in safe_sparse_dot(a, b, dense_output)
187 return ret
188 else:
--> 189 return fast_dot(a, b)
190
191
ValueError: shapes (100,1) and (2,) not aligned: 1 (dim 1) != 2 (dim 0)
The (100,1) shape is clearly for the inputs array, but I'm not sure what object's shape is (2,).
When you train a classifier with poly:
X_poly = poly.fit_transform(X_train.reshape(-1,1))
you need to make sure that the prediction is also using poly values:
print(linreg.predict(inputs))
in this case inputs have to be also polys:
inputs = poly.transform(inputs)
print(linreg.predict(inputs))
I am trying to re-execute a GitHub project on my computer for recommendation using embedding, the goal is to first embed the user and item present in the movieLens dataset, and then use the inner product to predict a rating, when I finished the integration of all components, I got an error in the training.
Code:
from lightfm.datasets import fetch_movielens
movielens = fetch_movielens()
ratings_train, ratings_test = movielens['train'], movielens['test']
def _binarize(dataset):
dataset = dataset.copy()
dataset.data = (dataset.data >= 0.0).astype(np.float32)
dataset = dataset.tocsr()
dataset.eliminate_zeros()
return dataset.tocoo()
train, test = _binarize(movielens['train']), _binarize(movielens['test'])
class ScaledEmbedding(nn.Embedding):
""" Change the scale from normal to [0,1/embedding_dim] """
def reset_parameters(self):
self.weight.data.normal_(0, 1.0 / self.embedding_dim)
if self.padding_idx is not None:
self.weight.data[self.padding_idx].fill_(0)
class ZeroEmbedding(nn.Embedding):
def reset_parameters(self):
self.weight.data.zero_()
if self.padding_idx is not None:
self.weight.data[self.padding_idx].fill_(0)
class BilinearNet(nn.Module):
def __init__(self, num_users, num_items, embedding_dim, sparse=False):
super().__init__()
self.embedding_dim = embedding_dim
self.user_embeddings = ScaledEmbedding(num_users, embedding_dim,
sparse=sparse)
self.item_embeddings = ScaledEmbedding(num_items, embedding_dim,
sparse=sparse)
self.user_biases = ZeroEmbedding(num_users, 1, sparse=sparse)
self.item_biases = ZeroEmbedding(num_items, 1, sparse=sparse)
def forward(self, user_ids, item_ids):
user_embedding = self.user_embeddings(user_ids)
item_embedding = self.item_embeddings(item_ids)
user_embedding = user_embedding.view(-1, self.embedding_dim)
item_embedding = item_embedding.view(-1, self.embedding_dim)
user_bias = self.user_biases(user_ids).view(-1, 1)
item_bias = self.item_biases(item_ids).view(-1, 1)
dot = (user_embedding * item_embedding).sum(1)
return dot + user_bias + item_bias
def pointwise_loss(net,users, items, ratings, num_items):
negatives = Variable(
torch.from_numpy(np.random.randint(0,
num_items,
len(users))).cuda()
)
positives_loss = (1.0 - torch.sigmoid(net(users, items)))
negatives_loss = torch.sigmoid(net(users, negatives))
return torch.cat([positives_loss, negatives_loss]).mean()
embedding_dim = 128
minibatch_size = 1024
n_iter = 10
l2=0.0
sparse = True
num_users, num_items = train.shape
net = BilinearNet(num_users,
num_items,
embedding_dim,
sparse=sparse).cuda()
optimizer = optim.Adagrad(net.parameters(),
weight_decay=l2)
for epoch_num in range(n_iter):
users, items, ratings = shuffle(train)
user_ids_tensor = torch.from_numpy(users).cuda()
item_ids_tensor = torch.from_numpy(items).cuda()
ratings_tensor = torch.from_numpy(ratings).cuda()
epoch_loss = 0.0
for (batch_user,
batch_item,
batch_ratings) in zip(_minibatch(user_ids_tensor,
minibatch_size),
_minibatch(item_ids_tensor,
minibatch_size),
_minibatch(ratings_tensor,
minibatch_size)):
user_var = Variable(batch_user)
item_var = Variable(batch_item)
ratings_var = Variable(batch_ratings)
optimizer.zero_grad()
loss = pointwise_loss(net,user_var, item_var, ratings_var, num_items)
epoch_loss += loss.data[0]
loss.backward()
optimizer.step()
print('Epoch {}: loss {}'.format(epoch_num, epoch_loss))
Error:
RuntimeError Traceback (most recent call last) <ipython-input-87-dcd04440363f> in <module>()
22 ratings_var = Variable(batch_ratings)
23 optimizer.zero_grad()
---> 24 loss = pointwise_loss(net,user_var, item_var, ratings_var, num_items)
25 epoch_loss += loss.data[0]
26 loss.backward()
<ipython-input-86-679e10f637a5> in pointwise_loss(net, users, items, ratings, num_items)
8
9 positives_loss = (1.0 - torch.sigmoid(net(users, items)))
---> 10 negatives_loss = torch.sigmoid(net(users, negatives))
11
12 return torch.cat([positives_loss, negatives_loss]).mean()
~\Anaconda3\lib\site-packages\torch\nn\modules\module.py in
__call__(self, *input, **kwargs)
491 result = self._slow_forward(*input, **kwargs)
492 else:
--> 493 result = self.forward(*input, **kwargs)
494 for hook in self._forward_hooks.values():
495 hook_result = hook(self, input, result)
<ipython-input-58-3946abf81d81> in forward(self, user_ids, item_ids)
16
17 user_embedding = self.user_embeddings(user_ids)
---> 18 item_embedding = self.item_embeddings(item_ids)
19
20 user_embedding = user_embedding.view(-1, self.embedding_dim)
~\Anaconda3\lib\site-packages\torch\nn\modules\module.py in
__call__(self, *input, **kwargs)
491 result = self._slow_forward(*input, **kwargs)
492 else:
--> 493 result = self.forward(*input, **kwargs)
494 for hook in self._forward_hooks.values():
495 hook_result = hook(self, input, result)
~\Anaconda3\lib\site-packages\torch\nn\modules\sparse.py in forward(self, input)
115 return F.embedding(
116 input, self.weight, self.padding_idx, self.max_norm,
--> 117 self.norm_type, self.scale_grad_by_freq, self.sparse)
118
119 def extra_repr(self):
~\Anaconda3\lib\site-packages\torch\nn\functional.py in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse) 1504 # remove once script supports set_grad_enabled 1505
_no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 1506 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse) 1507 1508
RuntimeError: Expected tensor for argument #1 'indices' to have scalar type Long; but got CUDAType instead (while checking arguments for embedding)
can anyone help me please ?
I would suggest you to check the input type
I had the same issue which solved by converting the input type from int32 to int64.(running on win10)
ex:
x = torch.tensor(train).to(torch.int64)
For Error like:
Runtime Error: Expected tensor for argument #1 'indices' to have scalar type Long; but got CUDAType instead (while checking arguments for embedding)
b_input_ids = torch.tensor(b_input_ids).to(device).long()
Above conversion works flawlessly across Oses
For me, the error was caused by type_as.
Changing .type_as(z) to .to(self.device) solved the issue.