Sklearn DecisionTreeClassifier.tree_.value output floats - python-3.x

I am using sklearn DecisionTreeClassifier to predict between two classes.
clf = DecisionTreeClassifier(class_weight='balanced', random_state=SEED)
params = {'criterion': ['gini', 'entropy'],
          'max_leaf_nodes': [100, 1000]}
grid = GridSearchCV(estimator=clf, param_grid=params, cv=SKF,
                    scoring=scorer,
                    n_jobs=-1, verbose=5)
trans_df = pipe.fit_transform(df.drop(["out"], axis=1))
grid.fit(trans_df, df['out'].fillna(0))
I need to output the tree for analysis.
No problem so far: I walk all the nodes and collect the rules, following more or less this answer.
from collections import OrderedDict
from sklearn.tree import _tree

def tree_to_flat(tree, feature_names):
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]
    positions = []

    def recurse(node, depth, position=OrderedDict()):
        indent = "  " * depth
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            # split node: record the rule for each branch and recurse
            name = feature_name[node]
            threshold = tree_.threshold[node]
            lname = name
            ldict = {key: value for (key, value) in position.items()}
            ldict[lname] = '<=' + str(threshold)
            rname = name
            rdict = {key: value for (key, value) in position.items()}
            rdict[rname] = '>' + str(threshold)
            recurse(tree_.children_left[node], depth + 1, ldict)
            recurse(tree_.children_right[node], depth + 1, rdict)
        else:
            # leaf node: store the class values reached along this path
            position['value'] = tree_.value[node]
            positions.append(position)
        return position

    recurse(0, 1)
    return positions
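For completeness, a hypothetical call on the fitted grid search above, assuming trans_df still has named columns (otherwise pass the transformed feature names from the pipeline):

best_tree = grid.best_estimator_
feature_names = list(trans_df.columns)  # assumption: trans_df is a DataFrame
rules = tree_to_flat(best_tree, feature_names)
print(rules[0])  # e.g. {'featA': '<=0.5', 'value': array([[296.7, 104.0]])}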
If I look at the different values, they are all non-integer, like [[296.727705967, 104.03070761]]. The 104.03 is close to the total number of instances in the node (104).
My understanding was that tree_.value[node] gives the number of instances in each of the two classes. How can I end up with non-integer numbers?
Thanks in advance
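A likely explanation, given the class_weight='balanced' in the snippet above: when class weights (or sample weights) are used, tree_.value stores weighted class counts rather than raw instance counts. A minimal sketch (synthetic data, not the original pipeline) to see the effect:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=200, weights=[0.8, 0.2], random_state=0)

plain = DecisionTreeClassifier(max_depth=2, random_state=0).fit(X, y)
weighted = DecisionTreeClassifier(max_depth=2, class_weight='balanced',
                                  random_state=0).fit(X, y)

print(plain.tree_.value[0])     # without class_weight, the values track the raw class counts
print(weighted.tree_.value[0])  # with class_weight='balanced', the counts are re-weighted, hence non-integer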

Related

K-means clustering model fitting in Android Studio with Chaquopy SDK is not working

I want to generate random K values based on the image. My code works as a plain Python script, but not when integrated with Chaquopy. When I debug, the code does not work and does not throw any errors either.
import numpy as np
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.metrics import pairwise_distances, davies_bouldin_score, silhouette_score
# dunn() is assumed to come from an external cluster-validity package, imported elsewhere

def Adaptive_K_Value(image):
    print(image)
    data = image / 255.0  # use 0...1 scale
    print(data)
    data = data.reshape(image.shape[0] * image.shape[1], 3)
    print(data.shape)
    data = np.array(data)
    print(data)
    max_Dunn = float("-inf")
    min_DB = float("inf")
    max_Silhoutte = float("-inf")
    max_Calinski = float("-inf")
    ideal_k = k = 2
    while True:
        print("k = ", k + 1)
        kmeans = KMeans(n_clusters=k)
        print("kmeans ", kmeans)
        print(data.shape)
        kmeans.fit(data)
        print("fitted model ", kmeans)
        kmeans_labels = kmeans.labels_
        print(kmeans.labels_)
        print(kmeans.cluster_centers_)
        distances = pairwise_distances(data)
        Dunn = dunn(distances, kmeans_labels)
        print("Dunn Index kmeans =", Dunn)
        DB = davies_bouldin_score(data, kmeans_labels)
        print("DB Index kmeans =", DB)
        Silhoutte = silhouette_score(data, kmeans_labels)
        print('Silhouette Score kmeans=', Silhoutte)
        Calinski = metrics.calinski_harabasz_score(data, kmeans_labels)
        print('Calinski-Harabasz Index kmeans =', Calinski)
        # count how many of the four indices improved this iteration
        count = 0
        if max_Dunn < Dunn:
            count += 1
        if min_DB > DB:
            count += 1
        if max_Silhoutte < Silhoutte:
            count += 1
        if max_Calinski < Calinski:
            count += 1
        if count < 2:
            break
        else:
            ideal_k = k
            k += 1
            max_Dunn = Dunn
            min_DB = DB
            max_Silhoutte = Silhoutte
            max_Calinski = Calinski
    print("\nOptimum K value =", ideal_k + 1)
    return ideal_k
I am trying to implement the Adaptive_K_Value() method to get a K value from the image.
The line kmeans.fit(data) is not working; the while loop terminates immediately. It only prints KMeans(n_clusters=3), which means only the line kmeans = KMeans(n_clusters=k) is executing.
Please help me.
Logs of Android Studio
Here the k-means model is just initialized, but the fitting part is not working. When I run the Python script on the command line, it returns the K value.
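One way to narrow this down (a suggestion added here, not part of the original post): if an exception inside the loop is being lost somewhere between Python and Chaquopy, wrapping the fit call and printing the traceback should make the real failure visible in logcat. A minimal helper sketch (fit_or_log is a made-up name):

import traceback

def fit_or_log(kmeans, data):
    # run fit() and print the full traceback if anything goes wrong,
    # so the error shows up in the Android Studio logcat
    try:
        return kmeans.fit(data)
    except Exception:
        print(traceback.format_exc())
        raise

Calling fit_or_log(kmeans, data) instead of kmeans.fit(data) inside the loop should then show whether fit() raises on the device.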

create set of randomized column names in pandas dataframe

I am trying to create a set of columns (within a pandas DataFrame) where the column names are randomized. This is because I want to generate filtered data from a larger data set in a randomized fashion.
How can I generate an N (= 4) * 3 set of column names as per below?
car_speed state_8 state_17 state_19 state_16 wd_8 wd_17 wd_19 wd_16 wu_8 wu_17 wu_19 wu_16
My potential code is below, but it doesn't really work. I need the 'state_' block first, then 'wd_', and then 'wu_'. My code below generates 'state_', 'wd_', 'wu_' individually in consecutive order, and when the columns are in that order I have problems further on with filling in the data from the larger data set.
import numpy as np
import pandas as pd

def iteration1(data, classes = 50, sigNum = 4):
    dataNN = pd.DataFrame(index = [0])
    dataNN['car_speed'] = np.zeros(1)
    while len(dataNN.columns) < sigNum + 1:
        state = int(np.random.uniform(0, 50))  # int() instead of np.int, which is removed in recent NumPy
        dataNN['state_' + str(state)] = np.zeros(1)  # this is the state value set-up
        dataNN['wd_' + str(state)] = np.zeros(1)     # this is the weight direction
        dataNN['wu_' + str(state)] = np.zeros(1)     # this is the weight magnitude
    count = 0  # initialize count row as zero
    while count < classes:
        dataNN.loc[count] = np.zeros(len(dataNN.columns))
        for state in dataNN.columns[1:10]:
            dataNN[state].loc[count] = data[state].loc[count]
        count = count + 1
        if count > classes:
            break
    return dataNN
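For reference, a quick illustration (with hypothetical state ids; the real ones are random) of the interleaved order the loop above produces, as opposed to the grouped layout wanted:

cols = ['car_speed']
for s in (8, 17, 19, 16):  # pretend these were drawn randomly
    cols += ['state_%d' % s, 'wd_%d' % s, 'wu_%d' % s]
print(cols)
# ['car_speed', 'state_8', 'wd_8', 'wu_8', 'state_17', 'wd_17', 'wu_17', ...]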
Assuming the problem you have is the lack of grouping of "state_*", "wd_*", and "wu_*", I suggest that you first select sigNum / 3 random ints and then use them to label the columns, like the following:
states = [int(np.random.uniform(0, 50)) for _ in range(sigNum // 3)]
i = 0
while len(dataNN.columns) <= sigNum:
    state = states[i]
    i += 1
    dataNN['state_' + str(state)] = np.zeros(1)  # this is the state value set-up
    dataNN['wd_' + str(state)] = np.zeros(1)     # this is the weight direction
    dataNN['wu_' + str(state)] = np.zeros(1)     # this is the weight magnitude
import random
import numpy as np
import pandas as pd

def iteration1(data, classes = 5, subNum = 15):
    dataNN = pd.DataFrame(index = [0])
    dataNN['car_speed'] = np.zeros(1)
    states = random.sample(range(50), subNum)
    for i in range(0, subNum, 1):
        dataNN['state_' + str(states[i])] = np.zeros(1)  # this is the state value set-up
    for i in range(0, subNum, 1):
        dataNN['wd_' + str(states[i])] = np.zeros(1)     # this is the weight direction
    for i in range(0, subNum, 1):
        dataNN['wu_' + str(states[i])] = np.zeros(1)     # this is the weight magnitude
    return dataNN
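A hypothetical call, just to show the resulting column layout (data is unused by this version, so None is passed):

df_cols = iteration1(data=None, subNum=4)
print(list(df_cols.columns))
# e.g. ['car_speed', 'state_8', 'state_17', 'state_19', 'state_16',
#       'wd_8', 'wd_17', 'wd_19', 'wd_16', 'wu_8', 'wu_17', 'wu_19', 'wu_16']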

Keras sign for if statement

I have a function that I define as follows:
def NewLoss(y_true, y_pred):
    p = 0
    for i in range(3074):
        if (y_pred[i+1] - y_pred[i]) < 0:
            p += (y_true[i] - y_pred[i])**2
        elif (y_pred[i+1] - y_pred[i]) > 0:
            p += (y_true[i] - y_pred[i])**2 + (y_true[i] - y_pred[i])*(y_pred[i+1] - y_pred[i])**2
        else:
            p += (y_true[i] - y_pred[i])**2 + 0.5*(y_true[i] - y_pred[i])*(y_pred[i+1] - y_pred[i])**2
    return p
My y_true and y_pred are vectors. When I try to run code that calls this function, I get the following error:
"Using a tf.Tensor as a Python bool is not allowed".
I would like to know how to check the sign of (y_true[i]-y_pred[i]) and avoid this error; I am using Keras.
Thank you very much for your help.
from keras import backend as K

def NewLoss(y_true, y_pred):
    true = y_true[:3074]
    pred = y_pred[:3074]
    predShifted = y_pred[1:3075]

    diff = true - pred
    diffShifted = predShifted - pred

    pLeftPart = K.square(diff)
    pRightPart = diff * K.square(diffShifted)

    # mask is 1 where diffShifted > 0, 0.5 where diffShifted == 0, and 0 otherwise
    greater = K.cast(K.greater(diffShifted, 0), K.floatx())
    equal = 0.5 * K.cast(K.equal(diffShifted, 0), K.floatx())
    mask = greater + equal

    return K.sum(pLeftPart + (mask * pRightPart))
Remarks:
1 - The first axis is the samples axis, perhaps you're trying to do this with the timesteps axis? If so, use:
true = y_true[:,:3074]
pred = y_pred[:,:3074]
predShifted = y_pred[:,1:3075]
2 - Having differences exactly equal to zero is so rare that maybe you don't need the last part of the if statement.
3 - If the max length of your tensors is 3075, you can simplify the selections:
true = y_true[:-1]
pred = y_pred[:-1]
predShifted = y_pred[1:]
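A quick hedged sanity check (random data, TensorFlow backend assumed) that the vectorized loss evaluates without the tensor-as-bool error:

import numpy as np
from keras import backend as K

y_true = K.constant(np.random.rand(3075))
y_pred = K.constant(np.random.rand(3075))
print(K.eval(NewLoss(y_true, y_pred)))  # a single scalar loss value

For training, the function can be passed directly, e.g. model.compile(optimizer='adam', loss=NewLoss).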

Tokenizer.word_index did not contain "START" or "END", rather contained "start" and "end"

I was trying to make an Image Captioning model in a similar fashion as shown here.
I used ResNet50 instead of VGG16 and also had to use progressive loading via the model.fit_generator() method.
I used ResNet50 from here, and when I imported it with include_top = False, it gave me the photo features in the shape {'key': [[[[value1, value2, .... value 2048]]]]}, where "key" is the image id.
Here is my caption-generator function:
def createCaptions(tokenizer, photoData, MaxLength, model):
    for key, feature in photoData.items():
        inSeq = "START"
        for i in range(MaxLength):
            sequence = tokenizer.texts_to_sequences([inSeq])[0]
            sequence = pad_sequences([sequence], maxlen = MaxLength)
            ID = model.predict([np.array(feature[0][0][0]), sequence])
            ID = np.argmax(ID)
            ID = word_for_id(ID, tokenizer)  # tokenizer argument added; it was missing in the call
            if ID is None:
                break
            inSeq += " " + ID
            if ID == "END":
                break
        print(inSeq)
The function word_for_id is:
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None
I generated photoData via:
features = {}
for images in os.listdir(args["image"]):
    filename = args["image"] + '/' + images
    image = load_img(filename, target_size = inputShape)
    image = img_to_array(image)
    image = np.expand_dims(image, axis = 0)
    image = preprocess(image)
    pred = resnet.predict(image)
    image_id = images.split('.')[0]
    features[image_id] = pred
    print('>{}'.format(images))
features is my photoData dictionary.
The problem is with the training photo descriptions, which I generate through:
def train_test_data(filename):
    DataFile = open(filename, 'r')
    Data = DataFile.read()
    DataFile.close()

    ImageID = []
    textDataFile = pickle.load(open('descriptions.pkl', 'rb'))

    for line in Data.split('\n'):
        if len(line) < 1:
            continue
        ImageID.append(line.split('.')[0])

    Data = {}
    for key in textDataFile:
        if key in ImageID:
            Data[key] = textDataFile[key]

    for ID in Data:
        for i in range(len(Data[ID])):
            l = Data[ID][i]
            l = "START " + " ".join(l) + " END"
            Data[ID][i] = l

    return Data
Here, I added "START" and "END" at the beginning and end of each description sentence, respectively. But in tokenizer.word_index, "START" and "END" are not found as keys. That is:
k = pickle.load(open('word_index.pkl', 'rb'))
print("START" in k)
This prints False.
Please explain to me why this is happening.
If I do:
k = pickle.load(open('word_index.pkl', 'rb'))
print("start" in k)
The answer comes out True.
That is because, by default, the Tokenizer lowercases words when fitting, controlled by the lower=True parameter. You can either use the lowercase tokens or pass lower=False when creating the tokenizer; see the documentation.
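A minimal sketch of the difference (standalone Keras import shown; with TF2 the same class lives under tensorflow.keras.preprocessing.text):

from keras.preprocessing.text import Tokenizer

texts = ["START a dog on a beach END"]

t_default = Tokenizer()            # lower=True by default
t_default.fit_on_texts(texts)
print("START" in t_default.word_index)  # False
print("start" in t_default.word_index)  # True

t_cased = Tokenizer(lower=False)   # keep the original case
t_cased.fit_on_texts(texts)
print("START" in t_cased.word_index)    # True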

select with bokeh not really working

I am using bokeh 0.12.2. I have a select widget with words. When I choose a word it should circle the corresponding data points. It seems to work, then stops. I am trying with 2 words, word1 and word2. lastidx is full of indices; xc and yc are the locations of the circles. Here is the code. It works with one word, but not really if I change the value in the select:
for j in range(0, 2):
    for i in range(0, len(lastidx[j])):
        xc.append(tsne_kmeans[lastidx[j][i], 0])
        yc.append(tsne_kmeans[lastidx[j][i], 1])

source = ColumnDataSource(data=dict(x=xc, y=yc, s=mstwrd))

def callback(source=source):
    dat = source.get('data')
    x, y, s = dat['x'], dat['y'], dat['s']
    val = cb_obj.get('value')
    if val == 'word1':
        for i in range(0, 75):
            x[i] = x[i]
            y[i] = y[i]
    elif val == 'word2':
        for i in range(76, 173):
            x[i-76] = x[i]
            y[i-76] = y[i]
    source.trigger('change')

slct = Select(title="Word:", value="word1", options=mstwrd, callback=CustomJS.from_py_func(callback))

# create the circle around the data where the word exists
r = plot_kmeans.circle('x', 'y', source=source)
glyph = r.glyph
glyph.size = 15
glyph.fill_alpha = 0.0
glyph.line_color = "black"
glyph.line_dash = [4, 2]
glyph.line_width = 1
x and y are loaded with all the data here, and I just pick the data for the word I select. It seems to work and then it does not.
Is it possible to do that as a standalone chart?
Thank you
I figured it out: the code here is just to see whether this works. It will be improved, of course. And maybe this is what was described here at the end:
https://github.com/bokeh/bokeh/issues/2618
for i in range(0, len(lastidx[0])):
    xc.append(tsne_kmeans[lastidx[0][i], 0])
    yc.append(tsne_kmeans[lastidx[0][i], 1])

addto = len(lastidx[1]) - len(lastidx[0])
# here I max out the data which has the least,
# so when you go from one option to the other it
# removes all the previous data circles
for i in range(0, addto):
    xc.append(-16)  # just send them somewhere off-screen
    yc.append(16)

for i in range(0, len(lastidx[1])):
    xf.append(tsne_kmeans[lastidx[1][i], 0])
    yf.append(tsne_kmeans[lastidx[1][i], 1])

x = xc
y = yc

source = ColumnDataSource(data=dict(x=x, y=y, xc=xc, yc=yc, xf=xf, yf=yf))
val = "word1"

def callback(source=source):
    dat = source.get('data')
    x, y, xc, yc, xf, yf = dat['x'], dat['y'], dat['xc'], dat['yc'], dat['xf'], dat['yf']
    # if slct.options['value'] == 'growth':
    val = cb_obj.get('value')
    if val == 'word1':
        for i in range(0, len(xc)):
            x[i] = xc[i]
            y[i] = yc[i]
    elif val == 'word2':
        for i in range(0, len(xf)):
            x[i] = xf[i]
            y[i] = yf[i]
    source.trigger('change')

slct = Select(title="Most Used Word:", value=val, options=mstwrd, callback=CustomJS.from_py_func(callback))

# create the circle around the data where the word exists
r = plot_kmeans.circle('x', 'y', source=source)
I will check if I can pass a matrix. Don't forget to keep the two data sets the same size; otherwise you will have multiple options circled at the same time.
Thank you
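The padding trick above can be factored into a small helper; a hedged sketch (pad_offscreen is a made-up name, not from the original code):

def pad_offscreen(xs, ys, target_len, off=(-16, 16)):
    # pad the shorter coordinate lists with off-screen points so that
    # switching the select value never leaves stale circles behind
    xs = xs + [off[0]] * (target_len - len(xs))
    ys = ys + [off[1]] * (target_len - len(ys))
    return xs, ys

n = max(len(xc), len(xf))
xc, yc = pad_offscreen(xc, yc, n)
xf, yf = pad_offscreen(xf, yf, n)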
