Training a Random Forest on Tensorflow - python-3.x

I am trying to train a TensorFlow-based random forest regressor on numerical, continuous data.
When I try to fit my estimator, it begins with the message below:
INFO:tensorflow:Constructing forest with params =
INFO:tensorflow:{'num_trees': 10, 'max_nodes': 1000, 'bagging_fraction': 1.0, 'feature_bagging_fraction': 1.0, 'num_splits_to_consider': 10, 'max_fertile_nodes': 0, 'split_after_samples': 250, 'valid_leaf_threshold': 1, 'dominate_method': 'bootstrap', 'dominate_fraction': 0.99, 'model_name': 'all_dense', 'split_finish_name': 'basic', 'split_pruning_name': 'none', 'collate_examples': False, 'checkpoint_stats': False, 'use_running_stats_method': False, 'initialize_average_splits': False, 'inference_tree_paths': False, 'param_file': None, 'split_name': 'less_or_equal', 'early_finish_check_every_samples': 0, 'prune_every_samples': 0, 'feature_columns': [_NumericColumn(key='Average_Score', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), _NumericColumn(key='lat', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), _NumericColumn(key='lng', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)], 'num_classes': 1, 'num_features': 2, 'regression': True, 'bagged_num_features': 2, 'bagged_features': None, 'num_outputs': 1, 'num_output_columns': 2, 'base_random_seed': 0, 'leaf_model_type': 2, 'stats_model_type': 2, 'finish_type': 0, 'pruning_type': 0, 'split_type': 0}
Then the process breaks down and I get the ValueError below:
ValueError: Shape must be at least rank 2 but is rank 1 for 'concat' (op: 'ConcatV2') with input shapes: [?], [?], [?], [] and with computed input tensors: input[3] = <1>.
This is the code I am using:
import tensorflow as tf
from tensorflow.contrib.tensor_forest.python import tensor_forest
from tensorflow.python.ops import resources
import pandas as pd
from tensorflow.contrib.tensor_forest.client import random_forest
from tensorflow.python.estimator.inputs import numpy_io
import numpy as np
def getFeatures():
    Average_Score = tf.feature_column.numeric_column('Average_Score')
    lat = tf.feature_column.numeric_column('lat')
    lng = tf.feature_column.numeric_column('lng')
    return [Average_Score, lat, lng]
# Import hotel data
Hotel_Reviews = pd.read_csv("./DataMining/Hotel_Reviews.csv")
Hotel_Reviews_Filtered = Hotel_Reviews[(Hotel_Reviews.lat.notnull() |
                                        Hotel_Reviews.lng.notnull())]
Hotel_Reviews_Filtered_Target = Hotel_Reviews_Filtered[["Reviewer_Score"]]
Hotel_Reviews_Filtered_Features = Hotel_Reviews_Filtered[["Average_Score","lat","lng"]]
#Preprocess the data
x = Hotel_Reviews_Filtered_Features.to_dict('list')
for key in x:
    x[key] = np.array(x[key])
y = Hotel_Reviews_Filtered_Target.values
#specify params
params = tf.contrib.tensor_forest.python.tensor_forest.ForestHParams(
    feature_colums=getFeatures(),
    num_classes=1,
    num_features=2,
    regression=True,
    num_trees=10,
    max_nodes=1000)
#build the graph
graph_builder_class = tensor_forest.RandomForestGraphs
est = random_forest.TensorForestEstimator(
    params, graph_builder_class=graph_builder_class)
#define input function
train_input_fn = numpy_io.numpy_input_fn(
    x=x,
    y=y,
    batch_size=1000,
    num_epochs=1,
    shuffle=True)
est.fit(input_fn=train_input_fn, steps=500)
The variable x is a dict of numpy arrays, each of shape (512470,):
{'Average_Score': array([ 7.7, 7.7, 7.7, ..., 8.1, 8.1, 8.1]),
'lat': array([ 52.3605759, 52.3605759, 52.3605759, ..., 48.2037451,
48.2037451, 48.2037451]),
'lng': array([ 4.9159683, 4.9159683, 4.9159683, ..., 16.3356767,
16.3356767, 16.3356767])}
The variable y is a numpy array of shape (512470, 1):
array([[ 2.9],
[ 7.5],
[ 7.1],
...,
[ 2.5],
[ 8.8],
[ 8.3]])

Force each array in x to be 2-dimensional using ndmin=2. Then the shapes will match and concat will be able to operate.
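A minimal sketch of that fix (the transpose is my addition, so each feature ends up as a (512470, 1) column rather than a (1, 512470) row):
for key in x:
    x[key] = np.array(x[key], ndmin=2).T  # (512470,) -> (512470, 1)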

Related

Azure ML Endpoint error 'GradientBoostingRegressor' object has no attribute 'n_features_'

While testing the endpoint in Azure ML, I am getting an error related to reading the input data.
Steps followed:
1. Run the gradient boosting model.
2. Train and test the data and save the model to a .pkl file.
3. Register the model on Azure ML and deploy the configuration with the code.
4. Read score.py for the init() and run() functions.
Train.py code
%%writefile $script_folder/train.py
import argparse
import os
import numpy as np
import pandas as pd
from sklearn.svm import SVC
import joblib
import pickle
from azureml.core import Workspace, Dataset, Experiment
from azureml.core import Run
import re
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
import seaborn as sns
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import chi2
import math
import pickle
#ws = Workspace.from_config()
#az_dataset = Dataset.get_by_name(ws, 'pricing')
# let user feed in 2 parameters, the location of the data files (from datastore), and the regularization rate of the logistic regression model
#parser = argparse.ArgumentParser()
#parser.add_argument('--data-folder', type=str, dest='data_folder', help='data folder mounting point')
#parser.add_argument('--regularization', type=float, dest='reg', default=0.01, help='regularization rate')
#args = parser.parse_args()
train_data = pd.read_csv("C:\\Users\\abhay\\Downloads\\Projects_DataScience\\Ensemble_Machine_Learning\\dataset\\train_update.csv")
column_datatypes = train_data.dtypes
categorical_columns = list(column_datatypes[column_datatypes=="object"].index.values)
continuous_columns = list(column_datatypes[column_datatypes=="float64"].index.values)
continuous_columns.remove('loss')
total_rows = train_data.shape[0]
columns_with_blanks_cat = np.random.randint(1,116,2)
columns_with_blanks_cont = np.random.randint(117,130,3)
columns_with_blank = np.append(columns_with_blanks_cat,columns_with_blanks_cont)
#for every column insert 5 blanks at random locations
for col in columns_with_blank:
    rows_with_blanks = np.random.randint(1,total_rows,5)
    train_data.iloc[rows_with_blanks,col] = np.nan
class Data_preprocessing:
    def __init__(self,train_data):
        self.train_data = train_data
    def missing_value_continuous(self,column_names_with_specific_type,imputation_type="mean"): # null value imputation with mean/median value
        if imputation_type=="mean": # mean imputation
            mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
            mean_imputer.fit(self.train_data[column_names_with_specific_type])
            self.train_data[column_names_with_specific_type] = mean_imputer.transform(self.train_data[column_names_with_specific_type])
        if imputation_type=="median": # median imputation
            median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
            median_imputer.fit(self.train_data[column_names_with_specific_type])
            self.train_data[column_names_with_specific_type] = median_imputer.transform(self.train_data[column_names_with_specific_type])
        return self.train_data
    def missing_value_categorical(self,column_names_with_specific_type,imputation_type="most_frequent"): # impute missing categorical column values
        most_frequent = SimpleImputer(strategy="most_frequent")
        most_frequent.fit(self.train_data[column_names_with_specific_type])
        self.train_data[column_names_with_specific_type] = most_frequent.transform(self.train_data[column_names_with_specific_type])
        return self.train_data
    def outlier_treatment(self,Q1,Q3,IQR,columns_with_outlier,action): # outlier treatment
        if action=="median":
            for i in range(len(columns_with_outlier)):
                column_name = columns_with_outlier[i]
                median_outlier = np.median(self.train_data[column_name])
                self.train_data.loc[self.train_data[((self.train_data[column_name]<(Q1[column_name]-(1.5*IQR[column_name])))|(self.train_data[column_name]>(Q3[column_name]+(1.5*IQR[column_name]))))].index,column_name] = median_outlier
        if action=="mean":
            for i in range(len(columns_with_outlier)):
                column_name = columns_with_outlier[i]
                mean_outlier = np.mean(self.train_data[column_name])
                self.train_data.loc[self.train_data[((self.train_data[column_name]<(Q1[column_name]-(1.5*IQR[column_name])))|(self.train_data[column_name]>(Q3[column_name]+(1.5*IQR[column_name]))))].index,column_name] = mean_outlier
        if action=="remove":
            for i in range(len(columns_with_outlier)):
                column_name = columns_with_outlier[i]
                self.train_data = self.train_data[~((self.train_data[column_name]<(Q1[column_name]-(1.5*IQR[column_name])))|(self.train_data[column_name]>(Q3[column_name]+(1.5*IQR[column_name]))))]
        return self.train_data
column_names = np.array(train_data.columns)
Data_preprocessing_obj = Data_preprocessing(train_data)
train_data = Data_preprocessing_obj.missing_value_continuous(continuous_columns,"median")
train_data = Data_preprocessing_obj.missing_value_categorical(categorical_columns)
columns_with_outlier = ['cont7','cont9','cont10']
Q1 = train_data[continuous_columns].quantile(0.25)
Q3 = train_data[continuous_columns].quantile(0.75)
IQR = (Q3-Q1)
train_data = Data_preprocessing_obj.outlier_treatment(Q1,Q3,IQR,columns_with_outlier,"median")
def feature_selection_numerical_variables(train_data,qthreshold,corr_threshold,exclude_numerical_cols_list):
    num_columns = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numerical_columns = list(train_data.select_dtypes(include=num_columns).columns)
    numerical_columns = [column for column in numerical_columns if column not in exclude_numerical_cols_list]
    #remove variables with constant variance
    constant_filter = VarianceThreshold(threshold=0)
    constant_filter.fit(train_data[numerical_columns])
    constant_columns = [column for column in train_data[numerical_columns].columns
                        if column not in train_data[numerical_columns].columns[constant_filter.get_support()]]
    if len(constant_columns)>0:
        train_data.drop(labels=constant_columns, axis=1, inplace=True)
    #remove deleted columns from dataframe
    numerical_columns = [column for column in numerical_columns if column not in constant_columns]
    #remove quasi-constant variables
    qconstant_filter = VarianceThreshold(threshold=qthreshold)
    qconstant_filter.fit(train_data[numerical_columns])
    qconstant_columns = [column for column in train_data[numerical_columns].columns
                         if column not in train_data[numerical_columns].columns[qconstant_filter.get_support()]]
    if len(qconstant_columns)>0:
        train_data.drop(labels=qconstant_columns, axis=1, inplace=True)
    #remove deleted columns from dataframe
    numerical_columns = [column for column in numerical_columns if column not in qconstant_columns]
    #remove correlated variables
    correlated_features = set()
    correlation_matrix = train_data[numerical_columns].corr()
    ax = sns.heatmap(
        correlation_matrix,
        vmin=-1, vmax=1, center=0,
        cmap=sns.diverging_palette(20, 220, n=200),
        square=True)
    ax.set_xticklabels(
        ax.get_xticklabels(),
        rotation=45,
        horizontalalignment='right')
    #print(correlation_matrix)
    for i in range(len(correlation_matrix.columns)):
        for j in range(i):
            if abs(correlation_matrix.iloc[i, j]) > corr_threshold:
                colname = correlation_matrix.columns[i]
                colcompared = correlation_matrix.columns[j]
                #check if the column compared against is not in the columns excluded list
                if colcompared not in correlated_features:
                    correlated_features.add(colname)
    train_data.drop(labels=correlated_features, axis=1, inplace=True)
    return train_data,constant_columns,qconstant_columns,correlated_features
train_data,constant_columns,qconstant_columns,correlated_features = feature_selection_numerical_variables(train_data,0.01,0.75,['loss','id'])
for cf1 in categorical_columns:
    le = LabelEncoder()
    le.fit(train_data[cf1].unique())
    filename = cf1+".sav"
    pickle.dump(le, open(filename, 'wb'))
    train_data[cf1] = le.transform(train_data[cf1])
#snippet to calculate the unique value count within each categorical column
df = pd.DataFrame(columns=["Column_Name","Count"])
for cat in categorical_columns:
    unique_value_count = len(train_data[cat].unique())
    df = df.append({'Column_Name': cat, "Count":int(unique_value_count)}, ignore_index=True)
columns_unique_value = np.array(df.Count.value_counts().index)
#snippet to identify the dependent/correlated categorical variables and drop them
columns_to_drop_cat = set()
correlated_columns = dict()
for unique_value_count in columns_unique_value:
    if unique_value_count>1:
        categorical_columns = df.loc[df.Count==unique_value_count,'Column_Name']
        categorical_columns = categorical_columns.reset_index(drop=True)
        columns_length = len(categorical_columns)
        for col in range(columns_length-1):
            column_to_compare = categorical_columns[col]
            columns_compare_against = categorical_columns[(col+1):columns_length]
            chi_scores = chi2(train_data[columns_compare_against],train_data[column_to_compare])
            if column_to_compare not in columns_to_drop_cat:
                columns_to_be_dropped = [i for i in range(len(columns_compare_against)) if chi_scores[1][i]<=0.05]
                columns_to_drop_array = np.array(columns_compare_against)[columns_to_be_dropped]
                correlated_columns[column_to_compare] = columns_to_drop_array
                columns_to_drop_cat.update(columns_to_drop_array)
train_data = train_data.drop(columns_to_drop_cat,axis=1)
correlated_features = list(correlated_features)
columns_to_drop_cat = list(columns_to_drop_cat)
columns_to_drop_cat.extend(correlated_features)
columns_to_drop = columns_to_drop_cat.copy()
#output the columns_to_drop file to a csv
columns_to_drop_df=pd.DataFrame(columns_to_drop,columns=['colnames'])
#columns_to_drop_df.to_csv("/model/columns_to_drop.csv",index=False)
train_data['loss'] = np.log(train_data['loss'])
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
#convert the int64 columns categorical
Column_datatypes= train_data.dtypes
Integer_columns = list(Column_datatypes.where(lambda x: x =="int64").dropna().index.values)
train_data[Integer_columns] = train_data[Integer_columns].astype('category',copy=False)
X,y = train_data.drop(['id','loss'],axis=1),train_data['loss']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) # perform train test split
ref_cols=X_train.columns
from sklearn.ensemble import GradientBoostingRegressor #GBM algorithm
gbm_base = GradientBoostingRegressor(
    max_depth=2,
    n_estimators=3,
    learning_rate=1.0)
trained_model=gbm_base.fit(X_train,y_train)
# Predict the outcome using Test data - Score Model
Y_test_predict_tuned = gbm_base.predict(X_test)
# Get the probability score - Scored Probabilities
#Y_prob = gbm_base.predict_proba(X_test)[:, 1]
# Get Confusion matrix and the accuracy/score - Evaluate
score =np.sqrt(mean_squared_error(y_test, Y_test_predict_tuned))
#print('Export the model to model.pkl')
#f = open('fwrk2.pkl', 'wb')
#pickle.dump(trained_model, f)
#f.close()
#print('Import the model from model.pkl')
#f2 = open('fwrk2.pkl', 'rb')
#clf2 = pickle.load(f2)
#X_new = [[154, 54, 35]]
#print('New Sample:', X_new)
#print('Predicted class:', clf2.predict(X_new))
#os.makedirs('outputs', exist_ok=True)
# note file saved in the outputs folder is automatically uploaded into experiment record
#joblib.dump(value=trained_model, filename='outputs/fwrk2.pkl')
score.py code
%%writefile score.py
import json
import numpy as np
import os
import pickle
import pandas as pd
import joblib
from sklearn.ensemble import GradientBoostingRegressor
from inference_schema.schema_decorators import input_schema, output_schema
from inference_schema.parameter_types.numpy_parameter_type import NumpyParameterType
from inference_schema.parameter_types.pandas_parameter_type import PandasParameterType
from azureml.core.model import Model
def init():
    global model
    #model = joblib.load('recommender.pkl')
    model_path = Model.get_model_path('fwrk2')
    model = joblib.load(model_path)
input_sample = pd.DataFrame(data=[{"cat1":0, "cat4": 0, "cat14": 0, "cat15": 0, "cat18": 0, "cat19": 0, "cat20": 0, "cat21": 0
, "cat22": 0, "cat35": 0, "cat42":0, "cat47": 0, "cat48": 0, "cat55": 0
, "cat56": 0, "cat58": 0, "cat59": 0, "cat60": 0, "cat61": 0, "cat62": 0
, "cat63": 0, "cat64": 0, "cat68": 0, "cat70": 0, "cat76": 0, "cat77":0
, "cat78": 0, "cat82": 0, "cat85": 0, "cat86": 0, "cat89": 0, "cat91": 0
, "cat92": 0, "cat93": 0, "cat94":0, "cat96": 0, "cat97": 0, "cat99": 0
, "cat100": 0, "cat101": 0, "cat103": 0, "cat105": 0, "cat107": 0, "cat109":0
, "cat110": 0, "cat111": 0, "cat112": 0, "cat113": 0, "cat116": 0, "cont1": 0
, "cont2": 0, "cont3": 0, "cont4": 0, "cont5": 0
, "cont6": 0, "cont7": 0, "cont8": 0, "cont14": 0}])
output_sample = np.array([0]) # This is an integer-type sample. Use the data type that reflects the expected result
#input_schema('data', PandasParameterType(input_sample))
#output_schema(NumpyParameterType(output_sample))
def run(data):
    try:
        result = model.predict(data)
        # you can return any datatype as long as it is JSON-serializable
        return result.tolist()
    except Exception as e:
        error = str(e)
        return error
The endpoint publish succeeded, and I can see the test feature on the Azure portal to enter values. After entering the values:
[{"cat1":0, "cat4": 0, "cat14": 0, "cat15": 0, "cat18": 0, "cat19": 0, "cat20": 0, "cat21": 0
, "cat22": 0, "cat35": 0, "cat42":0, "cat47": 0, "cat48": 0, "cat55": 0
, "cat56": 0, "cat58": 0, "cat59": 0, "cat60": 0, "cat61": 0, "cat62": 0
, "cat63": 0, "cat64": 0, "cat68": 0, "cat70": 0, "cat76": 0, "cat77":0
, "cat78": 0, "cat82": 0, "cat85": 0, "cat86": 0, "cat89": 0, "cat91": 0
, "cat92": 0, "cat93": 0, "cat94":0, "cat96": 0, "cat97": 0, "cat99": 0
, "cat100": 0, "cat101": 0, "cat103": 0, "cat105": 0, "cat107": 0, "cat109":0
, "cat110": 0, "cat111": 0, "cat112": 0, "cat113": 0, "cat116": 0, "cont1": 0
, "cont2": 0, "cont3": 0, "cont4": 0, "cont5": 0
, "cont6": 0, "cont7": 0, "cont8": 0, "cont14": 0}])
Error: "'GradientBoostingRegressor' object has no attribute 'n_features"
Can someone please advise what could be the problem in executing the above input sample? Is it related to the version of the package, and if so, how do I update it and solve this?
GradientBoostingRegressor can take its hyperparameter values from a dictionary unpacked as keyword arguments. Replace the gradient boosting code with the code block below.
Below is the current existing block:
gbm_base = GradientBoostingRegressor(
    max_depth=2,
    n_estimators=3,
    learning_rate=1.0)
Replace the above code block with the below code block.
gbm_base = {
    "max_depth": 2,
    "n_estimators": 3,
    "learning_rate": 1.0,
}
gbm = GradientBoostingRegressor(**gbm_base)
Use the gbm variable from here on to access the features of GradientBoostingRegressor.
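For example, a brief usage sketch (X_train, X_test, and y_train are the variables already defined in the question's train.py):
gbm.fit(X_train, y_train)             # train with the unpacked hyperparameters
Y_test_predict = gbm.predict(X_test)  # then score as before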
Edit 1: Alternative Procedure
I have tried to reproduce the issue with my own sample, and no issue was recorded with GradientBoostingRegressor. Check out the code block below.
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
X, y = make_regression(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
reg = GradientBoostingRegressor(random_state=0)
reg.fit(X_train, y_train)
Output: GradientBoostingRegressor(random_state=0)
reg.predict(X_test[1:2])
output: array([-61...])
reg.score(X_test, y_test)
Output: 0.4...
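If the package-version suspicion from the question is right, one quick check (a hedged suggestion, not a confirmed fix) is to compare the scikit-learn version in the training environment with the one in the deployed scoring environment, since a model pickled under one version can lose or rename attributes when unpickled under another:
import sklearn
# run this both in the training notebook and inside score.py's init()
print(sklearn.__version__)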

pytorch lstm output not match hidden state

Here is the code:
import torch
from torch.nn.utils.rnn import pack_sequence, pad_packed_sequence
rnn = torch.nn.LSTM(2, 3, 1, bidirectional=True)
x1= torch.randn(1,2)
x2=torch.randn(2,2)
x=pack_sequence([x1,x2], enforce_sorted=False)
y, (hn, cn) = rnn(x)
y,lens=pad_packed_sequence(y)
print the hidden state result:
print(torch.cat([hn[-2], hn[-1]], dim=1))
get result:
tensor([[-0.0265, -0.1149, -0.0466, 0.1080, 0.0901, 0.0865],
[ 0.0736, -0.2503, -0.0634, 0.0801, 0.1401, -0.0032]],
grad_fn=<CatBackward>)
print the output:
for i,j in enumerate(lens):
    print(y[i][j-1])
get result:
tensor([-0.0265, -0.1149, -0.0466, 0.1080, 0.0901, 0.0865],
grad_fn=<SelectBackward>)
tensor([ 0.0736, -0.2503, -0.0634, 0.0932, 0.0962, -0.0915],
grad_fn=<SelectBackward>)
The second tensor is not the same as the hidden state!
Why?

Using DNNLinearCombinedEstimator in tensorflow for multilabel classification

I have a multilabel dataset that I would like to classify using a wide-n-deep neural network.
This is a very small example just to test:
import numpy as np
import pandas as pd
import tensorflow as tf
tf.enable_eager_execution()
training_df: pd.DataFrame = pd.DataFrame(
    data={
        'feature1': np.random.rand(10),
        'feature2': np.random.rand(10),
        'feature3': np.random.rand(10),
        'feature4': np.random.randint(0, 3, 10),
        'feature5': np.random.randint(0, 3, 10),
        'feature6': np.random.randint(0, 3, 10),
        'target1': np.random.randint(0, 2, 10),
        'target2': np.random.randint(0, 2, 10),
        'target3': np.random.randint(0, 2, 10)
    }
)
features = ['feature1', 'feature2', 'feature3','feature4', 'feature5', 'feature6']
targets = ['target1', 'target2', 'target3']
Categorical_Cols = ['feature4', 'feature5', 'feature6']
Numerical_Cols = ['feature1', 'feature2', 'feature3']
wide_columns = [tf.feature_column.categorical_column_with_vocabulary_list(key=x, vocabulary_list=[0, 1, -1])
                for x in Categorical_Cols]
deep_columns = [tf.feature_column.numeric_column(x) for x in Numerical_Cols]
def wrap_dataset(df, features, labels):
    dataset = (
        tf.data.Dataset.from_tensor_slices(
            (
                tf.cast(df[features].values, tf.float32),
                tf.cast(df[labels].values, tf.int32),
            )
        )
    )
    return dataset
input_fn_train = wrap_dataset(training_df, features, targets)
m = tf.contrib.estimator.DNNLinearCombinedEstimator(
    head=tf.contrib.estimator.multi_label_head(n_classes=2),
    # wide settings
    linear_feature_columns=wide_columns,
    # linear_optimizer=tf.train.FtrlOptimizer(...),
    # deep settings
    dnn_feature_columns=deep_columns,
    # dnn_optimizer=tf.train.ProximalAdagradOptimizer(...),
    dnn_hidden_units=[10, 30, 10])
m.train(input_fn=input_fn_train)
In this example, we have 6 features including:
3 numerical features: feature1, feature2, and feature3
3 categorical features: feature4, feature5, and feature6
where each sample has three labels and each label has a binary value: 0 or 1.
The error is about the input function, and I cannot figure out how to define the input function correctly.
Any help to correct the code is appreciated.
UPDATE: The error is:
TypeError: <TensorSliceDataset shapes: ((6,), (3,)), types: (tf.float32, tf.int32)> is not a callable object
Since it says it is not a callable object, you can simply add a lambda and it should work:
input_fn_train = lambda: wrap_dataset(training_df, features, targets)
Also, I think you need to sort out how you pass your data to the Estimator. It probably takes dictionaries, since you are using feature columns; right now you are passing plain Tensors, not a dictionary of Tensors. Check out this useful post.
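For instance, a hedged sketch of a dict-based input function (wrap_dataset_dict is my illustrative name; the dictionary keys must match the feature-column names):
def wrap_dataset_dict(df, features, labels):
    # one dictionary entry per feature, so each feature column can look up its key
    feature_dict = {name: df[name].values for name in features}
    return tf.data.Dataset.from_tensor_slices((feature_dict, df[labels].values))
input_fn_train = lambda: wrap_dataset_dict(training_df, features, targets)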
Finally, I figured out how to make the code work. I post it here to help people who would like to do multi-label classification using the built-in DNNLinearCombinedEstimator from the tensorflow package, version 1.13.
import numpy as np
import pandas as pd
import tensorflow as tf
# from tensorflow import contrib
tf.enable_eager_execution()
training_df: pd.DataFrame = pd.DataFrame(
    data={
        'feature1': np.random.rand(10),
        'feature2': np.random.rand(10),
        'feature3': np.random.rand(10),
        'feature4': np.random.randint(0, 3, 10),
        'feature5': np.random.randint(0, 3, 10),
        'feature6': np.random.randint(0, 3, 10),
        'target1': np.random.randint(0, 2, 10),
        'target2': np.random.randint(0, 2, 10),
        'target3': np.random.randint(0, 2, 10)
    }
)
features = ['feature1', 'feature2', 'feature3','feature4', 'feature5', 'feature6']
targets = ['target1', 'target2', 'target3']
Categorical_Cols = ['feature4', 'feature5', 'feature6']
Numerical_Cols = ['feature1', 'feature2', 'feature3']
wide_columns = [tf.feature_column.categorical_column_with_vocabulary_list(key=x, vocabulary_list=[0, 1, -1])
                for x in Categorical_Cols]
deep_columns = [tf.feature_column.numeric_column(x) for x in Numerical_Cols]
def input_fn(df):
    # Creates a dictionary mapping from each continuous feature column name (k) to
    # the values of that column stored in a constant Tensor.
    continuous_cols = {k: tf.constant(df[k].values)
                       for k in Numerical_Cols}
    # Creates a dictionary mapping from each categorical feature column name (k)
    # to the values of that column stored in a tf.SparseTensor.
    categorical_cols = {k: tf.SparseTensor(
        indices=[[i, 0] for i in range(df[k].size)],
        values=df[k].values,
        dense_shape=[df[k].size, 1])
        for k in Categorical_Cols}
    # Merges the two dictionaries into one.
    feature_cols = continuous_cols.copy()
    feature_cols.update(categorical_cols)
    labels = tf.convert_to_tensor(training_df.as_matrix(training_df[targets].columns.tolist()), dtype=tf.int32)
    return feature_cols, labels
def train_input_fn():
    return input_fn(training_df)
def eval_input_fn():
    return input_fn(training_df)
m = tf.contrib.learn.DNNLinearCombinedEstimator(
    head=tf.contrib.learn.multi_label_head(n_classes=3),
    # wide settings
    linear_feature_columns=wide_columns,
    # linear_optimizer=tf.train.FtrlOptimizer(...),
    # deep settings
    dnn_feature_columns=deep_columns,
    # dnn_optimizer=tf.train.ProximalAdagradOptimizer(...),
    dnn_hidden_units=[10, 10])
m.train(input_fn=train_input_fn, steps=20)
results = m.evaluate(input_fn=eval_input_fn, steps=1)
print("#########################################################")
for key in sorted(results):
    print("%s: %s" % (key, results[key]))

ValueError: setting an array element with a sequence error while cross validation

I am trying to do text sentiment analysis, but I always get this error.
My training data consists of two columns.
List of occurrences (X): a list of 0s and 1s based on the occurrence of words in the text document. Each array has 2115 values and looks like this: [0 0 1 ..., 0 0 0]. There are no missing values.
Label of the data (label): also a list of 0s and 1s, based on sentiment. It looks like this: 1. There is just one value per row for the label.
My training sample has 1440 observations.
Code:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    return plt
title = "Learning Curves (Naive Bayes)"
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
estimator = GaussianNB()
plot_learning_curve(estimator, title, data.X, data.label, ylim=(0.3, 1.01), cv=cv, n_jobs=4)
When I run the code, I get this error:
/anaconda/lib/python3.6/site-packages/sklearn/utils/validation.py in check_array(array=231 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 0, 0, ...
Name: arrr, Length: 129, dtype: object, accept_sparse=False, dtype=<class 'numpy.float64'>, order=None, copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, ensure_min_samples=1, ensure_min_features=1, warn_on_dtype=False, estimator=None)
397
398 if sp.issparse(array):
399 array = _ensure_sparse_format(array, accept_sparse, dtype, copy,
400 force_all_finite)
401 else:
--> 402 array = np.array(array, dtype=dtype, order=order, copy=copy)
array = 231 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 0, 0, ...
Name: arrr, Length: 129, dtype: object
dtype = <class 'numpy.float64'>
order = None
copy = False
403
404 if ensure_2d:
405 if array.ndim == 1:
406 raise ValueError(
ValueError: setting an array element with a sequence.
What should I do to solve this problem?
Thanks
I solved the problem. The problem was with the dimensions: going from a pandas DataFrame to a numpy array, there were "lists" inside each array. I also changed the label into an array. So I changed it as follows:
featurelists = data.X.values.tolist()
X = np.array(featurelists)
y = data.label.as_matrix()
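As a quick sanity check (the expected shapes are inferred from the question's 1440 observations with 2115 values each):
print(X.shape)  # should come out as (1440, 2115)
print(y.shape)  # should come out as (1440,)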
Now, it works.
Thanks all.

How to get all parameters of estimator in PySpark

I have a RandomForestRegressor and a GBTRegressor, and I'd like to get all of their parameters. The only way I found is with several get methods, like:
from pyspark.ml.regression import RandomForestRegressor, GBTRegressor
est = RandomForestRegressor()
est.getMaxDepth()
est.getSeed()
But RandomForestRegressor and GBTRegressor have different parameters, so it's not a good idea to hardcode all those methods.
A workaround could be something like this:
get_methods = [method for method in dir(est) if method.startswith('get')]
params_est = {}
for method in get_methods:
    try:
        key = method[3:]
        params_est[key] = getattr(est, method)()
    except TypeError:
        pass
Then the output will be like this:
params_est
{'CacheNodeIds': False,
'CheckpointInterval': 10,
'FeatureSubsetStrategy': 'auto',
'FeaturesCol': 'features',
'Impurity': 'variance',
'LabelCol': 'label',
'MaxBins': 32,
'MaxDepth': 5,
'MaxMemoryInMB': 256,
'MinInfoGain': 0.0,
'MinInstancesPerNode': 1,
'NumTrees': 20,
'PredictionCol': 'prediction',
'Seed': None,
'SubsamplingRate': 1.0}
But I think there should be a better way to do that.
extractParamMap can be used to get all params from every estimator, for example:
>>> est = RandomForestRegressor()
>>> {param[0].name: param[1] for param in est.extractParamMap().items()}
{'numTrees': 20, 'cacheNodeIds': False, 'impurity': 'variance', 'predictionCol': 'prediction', 'labelCol': 'label', 'featuresCol': 'features', 'minInstancesPerNode': 1, 'seed': -5851613654371098793, 'maxDepth': 5, 'featureSubsetStrategy': 'auto', 'minInfoGain': 0.0, 'checkpointInterval': 10, 'subsamplingRate': 1.0, 'maxMemoryInMB': 256, 'maxBins': 32}
>>> est = GBTRegressor()
>>> {param[0].name: param[1] for param in est.extractParamMap().items()}
{'cacheNodeIds': False, 'impurity': 'variance', 'predictionCol': 'prediction', 'labelCol': 'label', 'featuresCol': 'features', 'stepSize': 0.1, 'minInstancesPerNode': 1, 'seed': -6363326153609583521, 'maxDepth': 5, 'maxIter': 20, 'minInfoGain': 0.0, 'checkpointInterval': 10, 'subsamplingRate': 1.0, 'maxMemoryInMB': 256, 'lossType': 'squared', 'maxBins': 32}
As described in How to print best model params in pyspark pipeline, you can get any model parameter that is available in the original JVM object of any model using the following structure:
<yourModel>.stages[<yourModelStage>]._java_obj.<getYourParameter>()
All the get methods are listed here:
https://spark.apache.org/docs/latest/api/java/org/apache/spark/ml/classification/RandomForestClassificationModel.html
For example, if you want to get the MaxDepth of your RandomForest after cross-validation (getMaxDepth is not available in PySpark), you use:
cvModel.bestModel.stages[-1]._java_obj.getMaxDepth()
