cannot convert string into float - python-3.x

Sales Discount Profit Product ID
0 0.050090 0.000000 0.262335 FUR-ADV-10000002
1 0.110793 0.000000 0.260662 FUR-ADV-10000108
2 0.309561 0.864121 0.241432 FUR-ADV-10000183
3 0.039217 0.591474 0.260687 FUR-ADV-10000188
4 0.070205 0.000000 0.263628 FUR-ADV-10000190
5 0.697873 0.000000 0.281162 FUR-ADV-10000571
6 0.064918 0.000000 0.261285 FUR-ADV-10000600
7 0.091950 0.000000 0.262946 FUR-ADV-10000847
8 0.056013 0.318384 0.257952 FUR-ADV-10001283
9 0.304472 0.318384 0.265739 FUR-ADV-10001440
10 0.046234 0.318384 0.261058 FUR-ADV-10001659
I am using the K-means elbow method to find the right number of clusters.
Using the elbow method to find the optimal number of clusters
import matplotlib.pyplot as plt
def kelbow(final_df, k):
    """Plot the KMeans elbow curve and return the inertia values.

    One KMeans model is fitted for every cluster count in ``range(1, k)``
    and its ``inertia_`` is recorded; the values are then plotted against
    the cluster count.

    Args:
        final_df: Data passed unchanged to ``KMeans.fit``.
        k: Exclusive upper bound on the number of clusters to try.

    Returns:
        List with one inertia value per cluster count from 1 to ``k - 1``.
    """
    from sklearn.cluster import KMeans

    cluster_counts = range(1, k)
    x = []
    for i in cluster_counts:
        kmeans = KMeans(n_clusters=i)
        kmeans.fit(final_df)
        x.append(kmeans.inertia_)
    # The y data must be the collected inertias (one per cluster count);
    # a scalar cannot be plotted against the range of cluster counts.
    plt.plot(cluster_counts, x)
    plt.title('The elbow method')
    plt.xlabel('The number of clusters')
    plt.ylabel('WCSS')
    plt.show()
    return x
Calling the function,
kelbow(final_df,30),
But the code throws this error:
ValueError: could not convert string to float: 'TEC-STA-10004927'
How can I find the clusters?

Make dummy variables.
# One-hot encode the product identifier so every column is numeric.
# get_dummies already replaces the encoded column with its indicator
# columns, so a separate drop of 'ProductID' would raise KeyError.
# NOTE(review): the sample table labels this column 'Product ID' (with a
# space) - confirm the exact column name in the real DataFrame.
final_df = pd.get_dummies(final_df, columns=['ProductID'], dtype='int64')

This should work for you:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
def kelbow(df, k):
    """Plot the KMeans elbow curve for ``df`` and return the inertia values.

    Object-dtype columns are one-hot encoded first, then one KMeans model is
    fitted for every cluster count in ``range(1, k)``.

    Args:
        df: Input DataFrame; may contain object (string) columns.
        k: Exclusive upper bound on the number of clusters to try.

    Returns:
        List with one inertia value per cluster count from 1 to ``k - 1``.
    """
    # Encode object columns as indicator columns so that fitting does not
    # fail with "could not convert string to float".
    final_df = pd.get_dummies(df, columns=df.select_dtypes(['object']).columns)

    cluster_counts = range(1, k)
    x = []
    for i in cluster_counts:
        kmeans = KMeans(n_clusters=i)
        kmeans.fit(final_df)
        x.append(kmeans.inertia_)

    # Plot the collected inertias, one per cluster count, not a constant.
    plt.plot(cluster_counts, x)
    plt.title('The elbow method')
    plt.xlabel('The number of clusters')
    plt.ylabel('WCSS')
    plt.show()
    return x

Related

How can I create a forest plot?

I would like to combine different risk ratios into one forest plot. I would expect the output to be similar to metamiss in STATA or metafor in R. How can I do this in Python?
Using the zEpid package, I created a forest plot of the different risk ratios:
import matplotlib.image as mpimg
import numpy as np
import matplotlib.pyplot as plt
import zepid
from zepid.graphics import EffectMeasurePlot
# Build and save a forest plot of five risk ratios with zEpid's
# EffectMeasurePlot.

# Row labels: one string per estimate, each carrying its own
# Isq / Tausq / p-value summary.
labs = ["ACA(Isq=41.37% Tausq=0.146 pvalue=0.039 )",
"ICA0(Isq=25.75% Tausq=0.092 pvalue=0.16 )",
"ICA1(Isq=60.34% Tausq=0.121 pvalue=0.00 )",
"ICAb(Isq=25.94% Tausq=0.083 pvalue=0.16 )",
"ICAw(Isq=74.22% Tausq=0.465 pvalue=0.00 )"]
# Point estimates and their lower/upper confidence limits, in the same
# order as labs.
measure = [2.09,2.24,1.79,2.71,1.97]
lower = [1.49,1.63,1.33,2.00,1.25]
upper = [2.92,3.07,2.42,3.66,3.11]
p = EffectMeasurePlot(label=labs, effect_measure=measure, lcl=lower, ucl=upper)
# Label the effect measure as a risk ratio.
p.labels(effectmeasure='RR')
# NOTE(review): "D" is presumably the matplotlib diamond marker - confirm
# against the zEpid documentation.
p.colors(pointshape="D")
# NOTE(review): min_value/max_value presumably bound the x-axis and
# t_adjuster presumably shifts the side table - confirm in the zEpid docs.
ax=p.plot(figsize=(7,3), t_adjuster=0.09, max_value=4, min_value=0.35 )
plt.title("Random Effect Model(Risk Ratio)",loc="right",x=1, y=1.045)
plt.suptitle("Missing Data Imputation Method",x=-0.1,y=0.98)
ax.set_xlabel("Favours Control Favours Haloperidol ", fontsize=10)
# Keep only the bottom spine visible.
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['bottom'].set_visible(True)
ax.spines['left'].set_visible(False)
# No file extension is given, so the output format is left to matplotlib's
# default.
plt.savefig("Missing Data Imputation Method",bbox_inches='tight')
The statsmodels library has an API for doing simple meta-analysis and plotting forest plots. It supports DerSimonian-Laird (chi2) and Paule-Mandel (iterated). See the statsmodels docs for more use cases, options and examples.
An example from their docs:
import numpy as np
from statsmodels.stats.meta_analysis import combine_effects
# dummy data: one effect estimate and its variance per lab
mean_effect = np.array([61.00,61.40,62.21,62.30,62.34,62.60,62.70,62.84,65.90])
var_effect = np.array([0.2025,1.2100,0.0900,0.2025,0.3844,0.5625,0.0676,0.0225,1.8225])
# row labels used in the summary table and on the plot
idx = ['lab1','lab2','lab3','lab4','lab5','lab6','lab7','lab8','lab9']
# meta-analysis and forest plot
# method_re="chi2" selects the DerSimonian-Laird random-effects estimate
results = combine_effects(mean_effect, var_effect, method_re="chi2", use_t=True, row_names=idx)
# per-lab rows followed by the pooled fixed/random effect rows
print(results.summary_frame())
fig = results.plot_forest()
Output:
eff sd_eff ci_low ci_upp w_fe w_re
lab1 61.000000 0.450000 60.118016 61.881984 0.057436 0.123113
lab2 61.400000 1.100000 59.244040 63.555960 0.009612 0.040314
lab3 62.210000 0.300000 61.622011 62.797989 0.129230 0.159749
lab4 62.300000 0.450000 61.418016 63.181984 0.057436 0.123113
lab5 62.340000 0.620000 61.124822 63.555178 0.030257 0.089810
lab6 62.600000 0.750000 61.130027 64.069973 0.020677 0.071005
lab7 62.700000 0.260000 62.190409 63.209591 0.172052 0.169810
lab8 62.840000 0.150000 62.546005 63.133995 0.516920 0.194471
lab9 65.900000 1.350000 63.254049 68.545951 0.006382 0.028615
fixed effect 62.583397 0.107846 62.334704 62.832090 1.000000 NaN
random effect 62.390139 0.245750 61.823439 62.956838 NaN 1.000000
fixed effect wls 62.583397 0.189889 62.145512 63.021282 1.000000 NaN
random effect wls 62.390139 0.294776 61.710384 63.069893 NaN 1.000000
I’d also recommend having a read through the docs for the individual methods such as combine_effects() which contains additional notes and references regarding the implementation.
Since I haven't found a customizable package to create a forest plot, I developed myforestplot for that purpose.
The following is one example of a forest plot using titanic dataset.
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np
import matplotlib.pyplot as plt
import myforestplot as mfp
# Prepare the Titanic data, fit a logistic regression, and build the
# DataFrame that the forest plot is drawn from.

# Keep only the columns of interest and drop rows with missing values.
data = (pd.read_csv("titanic.csv")
[["survived", "pclass", "sex", "age", "embark_town"]]
.dropna()
)
# Bin age into three string categories. Every mask tests the original
# numeric `ser`, so later calls override earlier labels: ages under 20 are
# first labelled "20_39" and then relabelled "0_19".
ser = data["age"]
data["age"] = (ser
.mask(ser >= 40, "40 or more")
.mask(ser < 40, "20_39")
.mask(ser <20, "0_19")
)
# Logistic regression of survival on sex, age category and embarkation town.
res = smf.logit("survived ~ sex + age + embark_town", data=data).fit()
# Display order of the variables, and of the items within a variable.
order = ["age", "sex", "embark_town"]
# NOTE(review): presumably the list of continuous covariates (none here) -
# confirm against the myforestplot documentation.
cont_cols = []
item_order = {"embark_town": ['Southampton', 'Cherbourg', 'Queenstown'],
"age": ["0_19", "20_39", "40 or more"]
}
# NOTE(review): fml=".3f" is presumably the number format of the
# pretty-printed estimates - confirm against the myforestplot documentation.
df = mfp.statsmodels_pretty_result_dataframe(data, res,
order=order,
cont_cols=cont_cols,
item_order=item_order,
fml=".3f",
)
df is a dataframe for creating a forest plot.
# Draw the forest plot from `df`: a text panel (ax1) plus an estimate
# panel (ax2).
plt.rcParams["font.size"] = 8
fp = mfp.SimpleForestPlot(ratio=(8,3), dpi=150, figsize=(5,3), df=df,
vertical_align=True)
fp.errorbar(errorbar_kwds=None, log_scale=True)
# ax2 works in log units: limits and tick positions are given as
# log(value), while the tick labels show the untransformed odds ratios.
# NOTE(review): the upper limit log(1.5) lies below the log(2.0) tick, so
# that tick falls outside the visible range - confirm this is intended.
xticklabels = [0.1, 0.5, 1.0, 2.0]
fp.ax2.set_xlim(np.log([0.1, 1.5]))
fp.ax2.set_xticks(np.log(xticklabels))
fp.ax2.set_xticklabels(xticklabels)
fp.ax2.set_xlabel("OR (log scale)")
# Reference line at log(1) = 0, i.e. an odds ratio of 1.
fp.ax2.axvline(x=0, ymin=0, ymax=1.0, color="black", alpha=0.5)
fp.ax1.set_xlim([0.35, 1])
# Write columns of `df` as text into the left panel.
# NOTE(review): the second positional argument is presumably the x position
# of the text - confirm against the myforestplot documentation.
fp.embed_cate_strings("category", 0.3, header="Category",
text_kwds=dict(fontweight="bold"),
header_kwds=dict(fontweight="bold")
)
fp.embed_strings("item", 0.36, header="", replace={"age":""})
fp.embed_strings("nobs", 0.60, header="N")
fp.embed_strings("risk_pretty", 0.72, header="OR (95% CI)")
fp.horizontal_variable_separators()
fp.draw_outer_marker(log_scale=True, scale=0.008)
plt.show()
and we obtain the figure.
A forest plot image

Learning neural network and saving the result

I think this is a simple question, but not for me. There is a table in df:
Date X1 X2 Y1
07.02.2019 5 1 1
08.02.2019 6 2 1
09.02.2019 1 3 0
10.02.2019 4 4 1
11.02.2019 1 1 0
12.02.2019 4 2 1
13.02.2019 5 5 1
14.02.2019 6 5 1
15.02.2019 1 1 0
16.02.2019 4 5 1
17.02.2019 1 2 0
18.02.2019 1 1
19.02.2019 2 1
20.02.2019 3 2
21.02.2019 4 14
I need to build a neural network that predicts Y1 from the parameters X1 and X2, apply it to the rows with a date later than 17.02.2019, and save the network's predictions in a separate DataFrame, df2.
import pandas as pd
import numpy as np
import re
from sklearn.neural_network import MLPClassifier
df = pd.read_csv("ob.csv", encoding = 'cp1251', sep = ';')
df['Date'] = pd.to_datetime(df['Date'], format='%d.%m.%Y')
startdate = pd.to_datetime('2019-02-17')
X = ['X1', 'X2'] ????
y = ['Y1'] ????
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(x, y)
clf.predict(???????) ????? df2 = ????
Where ???? - I do not know how to set the conditions correctly
import pandas as pd
import numpy as np
import re
from sklearn.neural_network import MLPClassifier
# Train an MLP classifier on the rows dated up to `startdate` and write its
# predictions for the later rows to prediction.csv.
df = pd.read_csv("ob.csv", encoding='cp1251', sep=';')
df['Date'] = pd.to_datetime(df['Date'], format='%d.%m.%Y')
startdate = pd.to_datetime('2019-02-17')

# Split on the parsed cutoff: rows up to and including startdate are used
# for training, later rows are the ones to predict.
train = df[df['Date'] <= startdate]
test = df[df['Date'] > startdate]

X_train = train[['X1', 'X2']]
# Pass a 1-D target: a single-column DataFrame makes fit() emit a
# DataConversionWarning.
y_train = train['Y1']
X_test = test[['X1', 'X2']]
y_test = test[['Y1']]

clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(X_train, y_train)

# Save the predictions for the rows after the cutoff date.
df2 = pd.DataFrame(clf.predict(X_test))
df2.to_csv('prediction.csv')

Unable to plot scatter plot because of TypeError

I have a dataset in which I will be using only a single column to apply k-means clustering. However, while plotting the graph, I am getting a TypeError that mentions "numpy.ndarray". I tried converting to float, but I am still facing the same issue.
Dataframe:
Brim
1234.5
345
675.7
120
110
Code:
from sklearn.cluster import KMeans
import numpy as np
km = KMeans(n_clusters=4, init='k-means++',n_init=10)
km.fit(df1)
x = km.fit_predict(df1)
x
array([0, 0, 0, ..., 3, 3, 3])
np.shape(x)
(1097,)
import matplotlib.pyplot as plt
%matplotlib inline
plt.scatter(df1[x ==1,0], df1[x == 0,1], s=100, c='red')
plt.scatter(df1[x ==1,0], df1[x == 1,1], s=100, c='black')
plt.scatter(df1[x ==2,0], df1[x == 2,1], s=100, c='blue')
plt.scatter(df1[x ==3,0], df1[x == 3,1], s=100, c='cyan')
Error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-62-5f0966ccc828> in <module>()
1 import matplotlib.pyplot as plt
2 get_ipython().run_line_magic('matplotlib', 'inline')
----> 3 plt.scatter(df1[x ==1,0], df1[x == 0,1], s=100, c='red')
4 plt.scatter(df1[x ==1,0], df1[x == 1,1], s=100, c='black')
5 plt.scatter(df1[x ==2,0], df1[x == 2,1], s=100, c='blue')
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2137 return self._getitem_multilevel(key)
2138 else:
->2139 return self._getitem_column(key)
2140
2141 def _getitem_column(self, key):
~\AppData\Local\Continuum\anaconda3\lib\site-
packages\pandas\core\frame.py in _getitem_column(self, key)
2144 # get column
2145 if self.columns.is_unique:
-> 2146 return self._get_item_cache(key)
2147
2148 # duplicate columns & possible reduce dimensionality
~\AppData\Local\Continuum\anaconda3\lib\site- packages\pandas\core\generic.py in _get_item_cache(self, item)
1838 """Return the cached item, item represents a label indexer."""
1839 cache = self._item_cache
-> 1840 res = cache.get(item)
1841 if res is None:
1842 values = self._data.get(item)
TypeError: unhashable type: 'numpy.ndarray'
If I understood your code correctly, you're trying to slice your DataFrame for plotting, based on the values of x.
For that, you should be using df1.loc[x==1,0] instead of df1[x==1,0] (and so on for all other slices).
In my case, I was trying to pick random 2 features and run KMeans classifier on it.
sample = df[['f1','f2','f3','f4','f5','f6','f7']].sample(2, axis=1)
kmeans_classifier = KMeans(n_clusters=3) # select random features
y_kmeans = kmeans_classifier.fit_predict(sample)
plt.scatter(X[y_kmeans == 0, 0], X[y_kmeans == 0, 1], s = 75, c ='red', label = 'Zero')
Last line was throwing the TypeError. I resolved this by converting the sample DataFrame to Numpy representation with values.
Modified code:
sample = df[['f1','f2','f3','f4','f5','f6','f7']].sample(2, axis=1).values

Can not plot a 3d bar use matplotlib

I want to plot a 3D bar chart using matplotlib.
I have a dataframe like this
In[1]: mf
Out[1]: 1 2 4
0
6N 45.238806 104.102564 16.804965
12S 25.597015 95.128205 13.156028
18S 29.689055 76.730769 17.078014
7S 0.000000 156.602564 20.106383
12S 25.597015 95.128205 13.156028
25S 0.000000 151.217949 16.929078
2S 4.962687 49.358974 32.517730
14N 0.000000 0.000000 33.386525
24S 10.447761 71.346154 25.343972
I want to plot a 3D bar at the position corresponding to each entry of the dataframe.
My code looks like this:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np
fig = plt.figure()
ax1 = fig.add_subplot(111, projection='3d')
xpos = [1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7,8,8,8,9,9,9]
ypos = [3,2,1,3,2,1,3,2,1,3,2,1,3,2,1,3,2,1,3,2,1,3,2,1,3,2,1]
zpos = np.zeros(27)
dx = np.ones(27)
dy = np.ones(27)
# to reshape my dataframe to a np vector
nf = mf.values
dz = np.reshape(nf,(1,27))
ax1.bar3d(xpos, ypos, zpos, dx,dy,dz, color="#00ceaa")
but I get this error:
15 dz = np.reshape(nf,(1,27))
16 dz
---> 17 ax1.bar3d(xpos, ypos, zpos, dx,dy,dz, color="#00ceaa")
bar3d(self, x, y, z, dx, dy, dz, color, zsort, shade, *args, **kwargs)
2526
2527 if shade:
-> 2528 normals = self._generate_normals(polys)
2529 sfacecolors = self._shade_colors(facecolors, normals)
in _generate_normals(self, polygons)
1771 v1 = np.array(verts[0]) - np.array(verts[1])
1772 v2 = np.array(verts[2]) - np.array(verts[0])
-> 1773 normals.append(np.cross(v1, v2))
1774 return normals
in cross(a, b, axisa, axisb, axisc, axis)
1716 "(dimension must be 2 or 3)")
1717 if a.shape[-1] not in (2, 3) or b.shape[-1] not in (2, 3):
-> 1718 raise ValueError(msg)
1719
1720 # Create the output array
ValueError: incompatible dimensions for cross product
(dimension must be 2 or 3)
Where is my code wrong? I have no idea. Thanks a lot.
You need to reshape your df.values like this:
dz = np.reshape(nf,(27))
such that all arrays have the same shape (i.e. (27,); check dx.shape, dy.shape, dz.shape, ...).
Also note that (while not required) it's good practice to declare both your xpos and ypos lists as np.array like:
xpos = np.array([1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7,8,8,8,9,9,9])

How to pass user defined function inside TfidfVectorizer.fit_transform()

I have a function for text preprocessing which simply removes stopwords:
def text_preprocessing():
    """Tokenize ``df['text']``, remove stopwords, and re-join each row.

    Works on the ``df`` visible from the enclosing scope, overwriting its
    ``'text'`` column, and returns that updated column.
    """
    df['text'] = df['text'].apply(word_tokenize)
    df['text'] = df['text'].apply(
        lambda tokens: [token for token in tokens if token not in stopwords]
    )
    # Collapse every token list back into one space-separated string.
    df['text'] = [" ".join(map(str, tokens)) for tokens in df['text']]
    return df['text']
I want to pass text_preprocessing() into another function, tf_idf(), which returns the feature matrix. What I essentially did is:
def tf_idf():
tfidf = TfidfVectorizer()
feature_array = tfidf.fit_transform(text_preprocessing)
keywords_data=pd.DataFrame(feature_array.toarray(), columns=tfidf.get_feature_names())
return keywords_data
I got an error as TypeError: 'function' object is not iterable
Rather than building additional functions for stop-word removal you can simply pass a custom list of stop-words to TfidfVectorizer. As you can see in the example below "test" is successfully excluded from the Tfidf vocabulary.
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# Setting up
numbers = np.random.randint(1, 5, 3)
text = ['This is a test.', 'Is this working?', "Let's see."]
df = pd.DataFrame({'text': text, 'numbers': numbers})
# Define custom stop words and instantiate TfidfVectorizer with them
my_stopwords = ['test']  # the list can be longer
tfidf = TfidfVectorizer(stop_words=my_stopwords)
text_tfidf = tfidf.fit_transform(df['text'])
# Optional - concatenating tfidf with df
# get_feature_names_out() replaces get_feature_names(), which was removed
# in scikit-learn 1.2.
df_tfidf = pd.DataFrame(text_tfidf.toarray(), columns=tfidf.get_feature_names_out())
df = pd.concat([df, df_tfidf], axis=1)
# Initial df
df
Out[133]:
numbers text
0 2 This is a test.
1 4 Is this working?
2 3 Let's see.
tfidf.vocabulary_
Out[134]: {'this': 3, 'is': 0, 'working': 4, 'let': 1, 'see': 2}
# Final df
df
Out[136]:
numbers text is let see this working
0 2 This is a test. 0.707107 0.000000 0.000000 0.707107 0.000000
1 4 Is this working? 0.517856 0.000000 0.000000 0.517856 0.680919
2 3 Let's see. 0.000000 0.707107 0.707107 0.000000 0.000000

Resources