I am using Pandas for a sentiment analysis problem and want to convert my categories (e.g. ['Agree', 'Disagree', 'Neutral']) to a scale from 1 to 5 - python-3.x

# Asker's attempt: recode the Likert labels of a survey column as integers 1-5.
# NOTE(review): the indentation of the loop and `if` bodies is missing in this
# listing, so it cannot run exactly as shown.
recsys_file = pd.read_csv('full_chat_questions.csv')
# `elem` is the loop variable: one cell value of column "item3G_new" per pass.
for elem in recsys_file["item3G_new"]:
# NOTE(review): below, "elem" is used as a column label, although the loop reads
# column "item3G_new" - presumably that column was intended; confirm against the CSV.
# NOTE(review): each comparison is made on a whole column, not on the single
# value `elem`; an `if` on such a comparison is a likely source of the reported errors.
if recsys_file["elem"] == 'Strongly disagree':
# The value returned by replace() is not stored anywhere (here and in the calls below).
recsys_file['elem'].replace(to_replace = elem, value = 1)
if recsys_file["elem"] == 'Disagree':
recsys_file['elem'].replace(to_replace = elem, value = 2)
if recsys_file["elem"] == 'Neutral':
recsys_file['elem'].replace(to_replace = elem, value = 3)
if recsys_file["elem"] == 'Agree':
recsys_file['elem'].replace(to_replace = elem, value = 4)
if recsys_file["elem"] == 'Strongly agree':
recsys_file['elem'].replace(to_replace = elem, value = 5)
I am trying to change the categories to values from 1 to 5, but I am getting errors.

Try this: it uses np.select() to make your code more concise, and it assigns a default value (0 here) to any row that matches none of the labels (which might be part of the problem).
import pandas as pd
import numpy as np
# Load the survey answers and recode the Likert labels as integers 1-5.
recsys_file = pd.read_csv('full_chat_questions.csv')

# The answers are in column "item3G_new" (the column the question iterates
# over); "elem" was only the question's loop variable, not a column label.
likert_column = recsys_file["item3G_new"]
likert_labels = ['Strongly disagree', 'Disagree', 'Neutral', 'Agree', 'Strongly agree']

# One boolean condition per label, in the same order as the target scores.
condition_list = [likert_column == label for label in likert_labels]
choice_list = [1, 2, 3, 4, 5]

# Rows that match none of the labels receive the default value 0.
recsys_file["item3G_new"] = np.select(condition_list, choice_list, 0)

Related

How to alternate color of the graphs between blue and white?

I have a list D containing 50 sub-lists. The number of elements in these sub-lists is decreasing. I visualize the list D with
# Draw sub-list number i of D as a column of dots at x = i.
for i, array in enumerate(D):
    plt.scatter([i] * len(array), array)
I have 50 functions taking values from St_Sp, and Y is a list containing 50 elements, each of them is the output of each function. I visualize these functions
# Plot every element of Y as one curve over the common x-values St_Sp.
fig, ax = plt.subplots()
for curve in Y:
    ax.plot(St_Sp, curve)
I found that too many colors are hard on the eyes. How can I alternate the color of the graphs between blue and white? I mean that the colors of the functions and of the dots in D should go white > blue > white > blue ...
Could you please elaborate on how to do so?
##### Import packages
import numpy as np
import scipy.linalg as la
import time
import matplotlib
import matplotlib.pyplot as plt
##### Model parameters and state space
N, lamda, mu = 100, 7, 2
a = np.exp(-0.05)
# r is minus the logarithm of a (about 0.05 for this a).
r = -np.log(a)
# The states are the integers -N, ..., N; Card is how many there are.
St_Sp = np.arange(-N, N + 1)
Card = len(St_Sp)
##### Define infinitesimal generator
def LL(x, y):
    """Return the generator entry for the pair of states (x, y).

    Rows of the two boundary states ``N`` and ``-N`` are identically zero.
    For any other ``x`` the entry is ``lamda`` when ``y == x + 1``, ``mu``
    when ``y == x - 1``, ``-(mu + lamda)`` when ``y == x`` and 0 otherwise.
    Reads the module-level ``N``, ``lamda`` and ``mu``.
    """
    if x == N or x == -N:
        return 0
    step = x - y
    if step == -1:
        return lamda
    if step == 1:
        return mu
    if step == 0:
        return -(mu + lamda)
    return 0


def L(x):
    """Return minus the diagonal generator entry at ``x``, i.e. ``-LL(x, x)``."""
    return -LL(x, x)
##### Define function Phi
def Phi(x):
    """Return ``max(x, 0)`` element-wise.

    Accepts a scalar or an array.  ``np.maximum`` does the comparison for the
    whole input in one call, so no ``np.vectorize`` wrapper is needed, and an
    empty array is handled as well.
    """
    return np.maximum(x, 0)
##### Define vector b
# b holds Phi evaluated at every state.
b = np.array(Phi(St_Sp))


##### Define function Psi
def Psi(x):
    """Return ``L(x) / (L(x) + r)`` for a single state ``x``."""
    rate = L(x)
    return rate / (rate + r)


# Wrap Psi so that it also accepts arrays of states.
Psi = np.vectorize(Psi)
##### Generate a Boolean vector whose all elements are False
d = np.zeros(Card, dtype=bool)

##### Define matrix A
# Off-diagonal: LL(x_i, x_j) / L(x_i), or 0 for rows where L(x_i) is 0.
# Diagonal: -1 / Psi(x_i), or 1 where Psi(x_i) is 0.
A = np.zeros((Card, Card))
for i in range(Card):
    state = St_Sp[i]
    # Both values depend on the row only, so compute them once per row.
    rate = L(state)
    psi = Psi(state)
    for j in range(Card):
        if i == j:
            A[i, j] = -1 / psi if psi != 0 else 1
        elif rate != 0:
            A[i, j] = LL(state, St_Sp[j]) / rate
        else:
            A[i, j] = 0

##### Row names of A
rows = np.arange(Card)

##### Define matrix B
# B is the matrix of LL values with r subtracted on the diagonal.
B = np.zeros((Card, Card))
for i in range(Card):
    for j in range(Card):
        B[i, j] = LL(St_Sp[i], St_Sp[j])
    B[i, i] -= r
start = time.time()
##### Generate I_0
# I[0] and I[1] are boolean masks over the states.  I[0] starts all True and
# can only lose members; I[1] starts all False and can only gain members.
I = [np.ones(Card, dtype=bool), d.copy()]
Z = b.astype(float)
# D collects the states still in I[0] after each pass; Y collects each Z.
D = [St_Sp]
# index0 flags the states where (B @ Z) <= 0; index1 is its complement.
index0 = np.matmul(B, Z) <= 0
index1 = ~index0
Y = [b.copy()]
##### Iterations
for i in range(1, Card):
    I = [I[0] & index0, I[1] | index1]
    # Restart from b as floats; the entries selected by I[1] are overwritten below.
    Z = b.astype(float)
    A1 = A[np.ix_(rows[I[1]], rows[I[1]])]
    A2 = A[np.ix_(rows[I[1]], rows[I[0]])]
    # Solve A1 @ z = -(A2 @ Z[I[0]]) for the entries selected by I[1].
    Z[I[1]] = la.solve(A1, -np.matmul(A2, Z[I[0]]))
    Y = np.concatenate((Y, [Z]))
    D.append(St_Sp[I[0]])
    # Re-test only the states still in I[0], then scatter the result back
    # into full-length masks (all other positions stay False).
    index = np.matmul(B[I[0]], Z) <= 0
    index0, index1 = d.copy(), d.copy()
    index0[I[0]], index1[I[0]] = index, ~index
    # Stop as soon as the new index0 equals the current I[0].
    if (I[0] == index0).all():
        break
# Draw entry number i of D as a column of dots at x = i.
for i, array in enumerate(D):
    plt.scatter([i] * len(array), array)
# Plot every row of Y as one curve over the state space.
fig, ax = plt.subplots()
for curve in Y:
    ax.plot(St_Sp, curve)
The easiest approach is to set a custom color cycler. Instead of cycling between the 10 typical colors, the default colors for the plots will cycle through the given colors.
from cycler import cycler
# Two-colour cycle: successive artists alternate between white and blue.
custom_cycler = cycler(color=['white', 'blue'])
plt.gca().set_prop_cycle(custom_cycler)
# All entries of D except the last take their colour from the cycle ...
for i, array in enumerate(D[:-1]):
    plt.scatter([i] * len(array), array)
# ... and the last entry is highlighted in crimson.
plt.scatter([len(D) - 1] * len(D[-1]), D[-1], color='crimson')
fig, ax = plt.subplots()
ax.set_prop_cycle(custom_cycler)
# Same scheme for the curves: cycle colours for all but the last one.
for curve in Y[:-1]:
    ax.plot(St_Sp, curve)
ax.plot(St_Sp, Y[-1], color='crimson')
plt.show()

Why doesn't assigning new values to a numpy array work?

I have the following code. The beginning is quite long, but it only serves to generate data. The problem happens in a few lines at the end.
##### Import packages
import numpy as np
import scipy.linalg as la
##### Model parameters and state space
N, lamda, mu = 5, 7, 2
a = 0.5
# r is minus the logarithm of a (about 0.693 for this a).
r = -np.log(a)
# The states are the integers -N, ..., N; Card is how many there are.
St_Sp = np.arange(-N, N + 1)
Card = len(St_Sp)
##### Define infinitesimal generator
def LL(x, y):
    """Return the generator entry for the pair of states (x, y).

    Rows of the two boundary states ``N`` and ``-N`` are identically zero.
    For any other ``x`` the entry is ``lamda`` when ``y == x + 1``, ``mu``
    when ``y == x - 1``, ``-(mu + lamda)`` when ``y == x`` and 0 otherwise.
    Reads the module-level ``N``, ``lamda`` and ``mu``.
    """
    if x == N or x == -N:
        return 0
    step = x - y
    if step == -1:
        return lamda
    if step == 1:
        return mu
    if step == 0:
        return -(mu + lamda)
    return 0


def L(x):
    """Return minus the diagonal generator entry at ``x``, i.e. ``-LL(x, x)``."""
    return -LL(x, x)
##### Define function Phi
def Phi(x):
    """Return ``max(x, 0)`` element-wise.

    Accepts a scalar or an array.  ``np.maximum`` does the comparison for the
    whole input in one call, so no ``np.vectorize`` wrapper is needed, and an
    empty array is handled as well.  An integer input yields an integer result.
    """
    return np.maximum(x, 0)
##### Vector b: Phi evaluated at every state
b = Phi(St_Sp).copy()


##### Function Psi
def Psi(x):
    """Return ``L(x) / (L(x) + r)`` for a single state ``x``."""
    rate = L(x)
    return rate / (rate + r)


# Wrap Psi so that it also accepts arrays of states.
Psi = np.vectorize(Psi)
##### Generate a Boolean vector whose all elements are False
d = np.zeros(Card, dtype=bool)

##### Define matrix A
# Off-diagonal: LL(x_i, x_j) / L(x_i), or 0 for rows where L(x_i) is 0.
# Diagonal: -1 / Psi(x_i), or 1 where Psi(x_i) is 0.
A = np.zeros((Card, Card))
for i in range(Card):
    state = St_Sp[i]
    # Both values depend on the row only, so compute them once per row.
    rate = L(state)
    psi = Psi(state)
    for j in range(Card):
        if i == j:
            A[i, j] = -1 / psi if psi != 0 else 1
        elif rate != 0:
            A[i, j] = LL(state, St_Sp[j]) / rate
        else:
            A[i, j] = 0

##### Row names of A
rows = np.arange(Card)

##### Define matrix B
# B is the matrix of LL values with r subtracted on the diagonal.
B = np.zeros((Card, Card))
for i in range(Card):
    for j in range(Card):
        B[i, j] = LL(St_Sp[i], St_Sp[j])
    B[i, i] -= r
##### Generate I_0
# I[0] is an all-True mask and I[1] an all-False mask over the states.
I = [np.array([1] * Card).astype(bool), d.copy()]
# Z is a copy of b and therefore has the same dtype as b.
Z = b.copy()
# index0 flags the states where (B @ Z) <= 0; index1 is its complement.
index0 = np.matmul(B, Z) <= 0
index1 = ~ index0
##### Generate I_1
I = [index0, index1]
# NOTE(review): Z is rebuilt from b and is never converted to float; the printed
# output in the question shows Z holding integers, and that is the array the
# fractional values of M are written into further down.
Z = b.copy()
# NOTE(review): the indentation of the `if` body is missing in this listing, so
# where the body ends cannot be determined from here.
if np.sum(I[1]) > 0:
# Column order: the indices selected by I[1] first, then all remaining ones.
order = np.concatenate((rows[I[1]], rows[~ I[1]]))
A1 = A[np.ix_(rows[I[1]], order)]
# Third item returned by la.lu (the upper-triangular factor, per the SciPy docs).
A2 = la.lu(A1)[2]
p = np.atleast_2d(A1).shape[0]
# B1: the first p columns of A2; B2: minus the remaining columns times Z[I[0]].
B1 = A2[:, range(p)]
B2 = - np.matmul(A2[:, p:], Z[I[0]])
print('Before being assigned new values, Z is \n', Z)
print('\n The index I[1] of elements of Z to be change \n', I[1])
# Solve the upper-triangular system B1 @ M = B2.
M = la.solve_triangular(B1, B2, lower = False)
print('\n The values to be assigned to Z[I[1]] is \n', M)
# This is the assignment the question is about.
Z[I[1]] = M
print('\n After being assigned new values, Z is \n', Z)
with result
Before being assigned new values, Z is
[0 0 0 0 0 0 1 2 3 4 5]
The index I[1] of elements of Z to be change
[False False False False False True True True True True False]
The values to be assigned to Z[I[1]] is
[2.08686055 2.88974949 3.40529229 3.88978577 4.41338306]
After being assigned new values, Z is
[0 0 0 0 0 2 2 3 3 4 5]
It's very weird to me that the command Z[I[1]] = M does not assign the new values from M to the positions of Z indexed by I[1]. Could you please elaborate on why this problem arises and how to resolve it?
The datatype of your array Z is int, so the assigned values are cast to integers automatically: [2.08686055 2.88974949 3.40529229 3.88978577 4.41338306] is truncated to [2 2 3 3 4].
If you want to change that behaviour, you just need to add a line that changes the type of your original array before the assignment:
# Convert Z to float before the assignment so the fractional values are kept.
Z = Z.astype(float)

why is my ROC curve getting plotted in reverse

I have a CSV file and tried to plot a ROC curve without using any predefined library functions for the curve itself; I used only numpy and pandas for the computation. Can anyone please tell me where I am going wrong? (ROC curve plot attached.)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv('5_b.csv')
# Predicted label: 0 where the probability is <= 0.5, otherwise 1.
df['Y_pred'] = np.where(df['proba']<=0.5, 0, 1)
# Order the rows by ascending probability.
df = df.sort_values(by=['proba'])
df.head(5)

# Confusion-matrix counts at the 0.5 threshold.
actual_pos, actual_neg = df['y'] == 1, df['y'] == 0
pred_pos, pred_neg = df['Y_pred'] == 1, df['Y_pred'] == 0
TP_Main = int((actual_pos & pred_pos).sum())
FP_Main = int((actual_neg & pred_pos).sum())
FN_Main = int((actual_pos & pred_neg).sum())
TN_Main = int((actual_neg & pred_neg).sum())
print("TN_Main : {0},FN_Main : {1}".format(TN_Main,FN_Main))
print("FP_Main : {0},TP_Main : {1}".format(FP_Main,TP_Main))

# Precision, recall and the F1 score derived from them.
precision = TP_Main / (TP_Main + FP_Main)
recall = TP_Main / (TP_Main + FN_Main)
half_f1 = (precision * recall) / (precision + recall)
F1score = half_f1 * 2
print("precision : {0},recall : {1}".format(precision,recall))
print("F1score : ",F1score)
#df.sort_values(by =['proba'], inplace = True, ascending = False)
# Sweep one threshold per row and record a (TPR, FPR) pair for each.
# NOTE(review): the indentation of the loop body is missing in this listing.
tprList = []
fprList = []
for i in range(len(df)):
# The threshold is the value at position 1 of row i.
# NOTE(review): presumably position 1 is the 'proba' column - confirm against the CSV.
df['Y_pred'] =np.where(df['proba']<=df.iloc[i][1],0,1)
TP = len(df[(df['y'] == 1) & (df['Y_pred'] == 1)])
FP = len(df[(df['y'] == 0) & (df['Y_pred'] == 1)])
FN = len(df[(df['y'] == 1) & (df['Y_pred'] == 0)])
TN = len(df[(df['y'] == 0) & (df['Y_pred'] == 0)])
TPR = TP/(FN+TP)
# NOTE(review): TN / (FP + TN) is the share of actual negatives predicted
# negative (the true-negative rate).  The false-positive rate is conventionally
# FP / (FP + TN); this is a likely reason the curve looks reversed.
FPR = TN/(FP+TN)
tprList.append(TPR)
fprList.append(FPR)
tpr_array = np.array(tprList)
fpr_array = np.array(fprList)
#Accuracy score
# Uses the counts computed earlier at the 0.5 threshold.
AccScore = (TN_Main+TP_Main)/len(df)
print("Accuracy Score =", AccScore)
# Trapezoidal area with tpr_array as the y-values and fpr_array as the x-values.
AUCScore = np.trapz(tpr_array,fpr_array)
print("AUC Score :",AUCScore)
# NOTE(review): the first argument of plot() is the x-axis, so TPR is drawn on x
# and FPR on y; a ROC curve is conventionally drawn with FPR on x and TPR on y.
plt.plot(tpr_array,fpr_array)

How to get different predation values every year for my simulation?

I am trying to run this model of seed predation and population dynamics, but I am new to coding and I am only getting one predation value, which gets repeated over the different generations. How can I get a different predation value for each year?
Also, is there an issue with the normalizing method used?
import numpy as np
import matplotlib.pyplot as plt
def is_odd(year):
    """Return True when ``year`` is odd (its remainder modulo 2 is 1)."""
    return year % 2 == 1
def reproduction(p_iter, year, dead):
    """Return the seeds left after predation, plus the carried-over population.

    p_iter: population vector with three entries.
    year:   current year; its parity selects the per-plant seed vector
            (``s_oddd`` or ``s_even``) and which entry of ``p_iter`` is added
            back unchanged (index 2 in odd years, index 1 in even years).
    dead:   predation level for this year, used as a multiplier.

    Reads the module-level ``K``, ``s_oddd`` and ``s_even``.
    """
    if is_odd(year):
        seeds_per_plant = s_oddd
        carried_over = np.array([0, 0, p_iter[2]])
    else:
        seeds_per_plant = s_even
        carried_over = np.array([0, p_iter[1], 0])

    seedsProd = p_iter * seeds_per_plant
    # The seeds eaten are split across the entries in proportion to production.
    seedsPred = K * dead * 200 * (seedsProd / np.sum(seedsProd))
    return (seedsProd - seedsPred) + carried_over
def normalize(p_iter):
    """Rescale ``p_iter`` so that its entries sum to ``K``, keeping one entry fixed.

    The fixed entry is index 2 in odd years and index 1 in even years; the
    other entries are scaled proportionally to share ``K`` minus that entry.

    NOTE: ``year`` is not a parameter.  It is read from the module-level
    variable set by the simulation loop, so the result depends on when the
    function is called.  Also reads the module-level ``K``.
    """
    kept = 2 if is_odd(year) else 1
    x = np.copy(p_iter)
    # Leave the kept entry out of the sum used for rescaling.
    x[kept] = 0
    x = (K - p_iter[kept]) * x / sum(x)
    x[kept] = p_iter[kept]
    return x
Predation is defined here:
def predation():
    """Draw one predation level.

    The value is sampled uniformly between 0.4 and 0.6, rounded to two
    decimals, and returned as a 0-dimensional NumPy array.  Every call draws
    a new value.
    """
    return np.array(np.round(np.random.uniform(0.4, 0.6), 2))
# max_years
Y = 100
# carrying capacity
K = 1000
# initial population
p_1, p_2, p_3 = 998., 1., 1.
# seeds released per plant
s_1, s_2, s_3 = 200, 260, 260
p_init = np.array([p_1, p_2, p_3], dtype=float)
# Per-plant seed vectors used in odd and in even years respectively.
s_oddd = np.array([s_1, s_2, 0.0])
s_even = np.array([s_1, 0.0, s_3])
n = len(p_init)
# m collects one row per year: the three populations, then the three seed counts.
m = np.append(p_init, s_oddd)
p_iter = p_init
dead = 0
norm = 0
for year in range(1, Y + 1):
    # A fresh predation level is drawn every year.
    dead = predation()
    seeds = reproduction(p_iter, year, dead)
    # Clip negative seed counts to zero before normalizing.
    p_iter = np.maximum(seeds, np.zeros(p_iter.shape))
    p_iter = normalize(p_iter)
    m = np.vstack((m, [*p_iter] + [*seeds]))

Calculating entropy in ID3 log2(0) in formula

import numpy as np
udacity_set = np.array(
    [[1, 1, 1, 0],
     [1, 0, 1, 0],
     [0, 1, 0, 1],
     [1, 0, 0, 1]])

# The class label is the last column.
label = udacity_set[:, -1]
fx = label.size
positive = label[label == 1].shape[0]
positive_probability = positive / fx
negative = label[label == 0].shape[0]
negative_probability = negative / fx
# NOTE: a term p * log2(p) evaluates to NaN when p is 0 (0 * -inf), so each
# entropy below is NaN for a pure subset unless that term is treated as 0.
entropy = -negative_probability*np.log2(negative_probability) - positive_probability*np.log2(positive_probability)

atribute = 0
V = 1

# Rows where the chosen attribute equals 1.
attribute_set = udacity_set[udacity_set[:, atribute] == 1]
instances = attribute_set.shape[0]
negative_labels = attribute_set[attribute_set[:, -1] == 0].shape[0]
positive_labels = attribute_set[attribute_set[:, -1] == 1].shape[0]
p0 = negative_labels / instances
p1 = positive_labels / instances
entropy2 = - p0*np.log2(p0) - p1*np.log2(p1)

# Rows where the chosen attribute equals 0.
attribute_set2 = udacity_set[udacity_set[:, atribute] == 0]
instances2 = attribute_set2.shape[0]
# Count the labels inside attribute_set2 itself; indexing attribute_set with a
# mask built from attribute_set2 would select rows of the wrong subset.
negative_labels2 = attribute_set2[attribute_set2[:, -1] == 0].shape[0]
positive_labels2 = attribute_set2[attribute_set2[:, -1] == 1].shape[0]
p02 = negative_labels2 / instances2
p12 = positive_labels2 / instances2
entropy22 = - p02*np.log2(p02) - p12*np.log2(p12)
The problem arises when an attribute is pure and the entropy is meant to be 0: when I put the values into the formula I get NaN, because it evaluates 0 * log2(0). I know how to code a workaround, but why does the formula behave this way?

Resources