How to plot the distribution of each feature in the cancer dataset - python-3.x

I want to plot the distribution of each feature in the cancer dataset using ggplot, but it is giving me an error.
# Question snippet: tries to draw one bar chart per feature of the
# breast-cancer dataset with plotnine.
#pip install plotnine
from plotnine import ggplot
from plotnine import *
from sklearn.datasets import load_breast_cancer
# NOTE(review): `cancer` is never assigned in this snippet; presumably
# `cancer = load_breast_cancer()` was run beforehand -- confirm.
for i in cancer.feature_names:
# NOTE(review): `cancer.data` is handed to ggplot as-is. Presumably it is a
# plain NumPy array rather than a DataFrame with named columns, which would
# explain the ValueError quoted after this snippet -- confirm.
ggplot(cancer.data)+aes(x=i)+geom_bar(size=10)
This is the error message I got:
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

I would recommend using seaborn for that. Here is an example of plotting the distribution of each feature in the cancer dataset by target:
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
# Load the dataset and build one DataFrame holding every feature column plus
# the class label in a final 'target' column.
cancer = load_breast_cancer()
data = pd.DataFrame(
    np.c_[cancer['data'], cancer['target']],
    columns=np.append(cancer['feature_names'], ['target']),
)
# Long format: one row per (sample, feature) pair, so that FacetGrid can give
# each feature its own panel.
df = data.melt(['target'], var_name='cols', value_name='vals')
# The features are on very different scales, so the panels must not share
# axes; with shared axes the small-valued features collapse into a spike.
g = sns.FacetGrid(df, col='cols', hue="target", palette="Set1", col_wrap=4,
                  sharex=False, sharey=False)
# histplot(kde=True, stat="density") replaces the deprecated
# distplot(hist=True): a density-normalised histogram with a KDE curve on top.
g.map(sns.histplot, "vals", kde=True, stat="density")
# Without a legend the two hue colours cannot be matched to target classes.
g.add_legend()

from plotnine import ggplot
from plotnine import *
from sklearn.datasets import load_breast_cancer
cancer=load_breast_cancer()
import pandas as pd
import matplotlib.pyplot as plt
# Wrap the raw feature array in a DataFrame with named columns so that each
# feature can be referred to by name in aes().
data = pd.DataFrame(cancer.data, columns=cancer.feature_names)
# For every feature, build a density plot and a bar plot. The plots are
# wrapped in print(), presumably so each one is shown from inside the loop.
for i in data.columns:
    print(ggplot(data) + aes(x=i) + geom_density(size=1))
    print(ggplot(data) + aes(x=i) + geom_bar(size=10))

Related

Can't run KMeans algorithm because Pandas DataFrame is loading decimal point numbers as strings

I drop all columns but the two I am interested in. When I try to convert my dataframe to a 2D numpy array from the two columns, it turns into an object-type array that contains strings. I believe this is because Data_Value has values such as "23.6". Is there any way I can get rid of the decimal point and trailing digits in this data, as they are all different values?
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from kneed import KneeLocator
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.metrics import pairwise_distances_argmin
# Read the CSV with header=None, i.e. every line of the file is loaded as a
# data row, then assign the 39 column names by hand.
# NOTE(review): if the file starts with its own header line, that line is
# loaded as row 0 (the code after this slices row 0 away), and mixing header
# text with values may be why columns load as strings -- confirm.
data = pd.read_csv('Alzheimer_s_Disease_and_Healthy_Aging_Data.csv', engine='python', header=None)
data.columns = ['RowId', 'YearStart', 'YearEnd', 'LocationAbbr', 'LocationDesc', 'Datasource', 'Class', 'Topic', 'Question', 'Response',
'Data_Value_Unit', 'DataValueTypeID', 'Data_Value_Type', 'Data_Value', 'Data_Value_Alt', 'Data_Value_Footnote_Symbol',
'Data_Value_Footnote', 'Low_Confidence_Limit', 'High_Confidence_Limit', 'Sample_Size', 'StratificationCategory1',
'Stratification1', 'StratificationCategory2', 'Stratification2', 'StratificationCategory3', 'Stratification3', 'Geolocation',
'ClassID', 'TopicID', 'QuestionID', 'ResponseID', 'LocationID', 'StratificationCategoryID1', 'StratificationID1',
'StratificationCategoryID2', 'StratificationID2', 'StratificationCategoryID3', 'StratificationID3', 'Report']
# Console display limits: at most 10 columns and 10 rows, display width 10.
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)
pd.set_option('display.width', 10)
# Skip row 0: read_csv was called with header=None, so the first row is not a
# real observation (presumably it is the file's header line).
data1 = data.iloc[1:]
# Row filters, applied one after the other. The second filter starts from
# `df`, not `data1`, so that the first filter is not thrown away.
# na=True makes missing entries count as matches, so they are removed too
# (same outcome as the `== False` comparison on a NaN result).
df = data1[~data1["Data_Value_Type"].str.contains("Mean", na=True)]
df = df[~df["Data_Value"].str.contains("NaN", na=True)]
# Keep only the two columns of interest instead of listing the 37 to drop.
df = df[['Data_Value', 'LocationID']]
# dropna() returns a new frame, so its result has to be assigned. It runs
# after the column selection so that gaps in unrelated columns cannot remove
# rows.
df = df.dropna()
# NOTE: both columns still hold strings at this point, so the array below has
# dtype object; a numeric array would need e.g. pd.to_numeric on each column.
x = df.to_numpy()
print(x.dtype)

ImportError: cannot import name 'Imputer' from 'sklearn.preprocessing'

Trying to import Imputer from sklearn.preprocessing,
import pandas as pd
# Load the dataset from Data.csv.
dataset = pd.read_csv('Data.csv')
# X: every column except the last one; y: the column at position 3.
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values
#PART WHERE ERROR OCCURS:-
# NOTE(review): this import is the line that raises the ImportError quoted
# after the snippet; the reply imports SimpleImputer from sklearn.impute
# instead.
from sklearn.preprocessing import Imputer
Shows "ImportError: cannot import name 'Imputer' from 'sklearn.preprocessing' (/home/codeknight13/anaconda3/lib/python3.7/site-packages/sklearn/preprocessing/__init__.py)"
from sklearn.preprocessing import Imputer was deprecated with scikit-learn v0.20.4 and removed as of v0.22.2. See the scikit-learn changelog. Use SimpleImputer from sklearn.impute instead:
from sklearn.impute import SimpleImputer
import numpy as np
# Imputer that replaces NaN entries using the 'mean' strategy.
imputer = SimpleImputer(
    strategy='mean',
    missing_values=np.nan,
)
pip install scikit-learn==0.20.4 or conda install scikit-learn=0.20.4 are not good options because scikit-learn==0.20.4 is more than 3 years out of date.

AttributeError: module 'matplotlib' has no attribute 'scatter'

I'm trying to make clusters of latitude and longitude.
The code gave an error on the plt.scatter(data['Lng'],data['Lat']) line.
The error is:
AttributeError: module 'matplotlib' has no attribute 'scatter'
code:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
sns.set()
from sklearn.cluster import KMeans
# Load the data from pk.csv and peek at the first rows.
data = pd.read_csv("pk.csv")
data.head()
# Drop six columns by name; what remains is kept in lat_long.
lat_long = data.drop(['country', 'iso2','admin', 'capital','population',
'population_proper'] , axis = 1)
lat_long.head()
# NOTE(review): `plt` was bound by `import matplotlib as plt` above, i.e. to
# the top-level matplotlib package rather than matplotlib.pyplot, which is why
# `scatter` is not found. Also note the plot reads from `data`, not from
# `lat_long`.
plt.scatter(data['Lng'],data['Lat']) # error here
It should be:
import matplotlib.pyplot as plt
Or it can be:
from matplotlib import pyplot as plt
You can also read PEP 328 for more information and clarity.

How to scale a data using Python 3

I am trying to scale my data using Python 3
But I keep getting this error: I am out of ideas as to what could be the issue? Please can you assist me guys? I would deeply appreciate your help!
import pandas as pd
import numpy as np
from numpy.random import randn
from pandas import Series, DataFrame
from pandas.plotting import scatter_matrix
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import rcParams
from pylab import rcParams
import seaborn as sb
import scipy
from scipy import stats
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from scipy.stats import chi2_contingency
import sklearn
from sklearn import preprocessing
from sklearn.preprocessing import scale
mtcars = pd.read_csv('mtcars.csv')
# Name the 12 columns explicitly; every label sits on one line inside its
# quotes so the list is valid Python.
mtcars.columns = ['Car names', 'mpg', 'cyl', 'disp', 'hp', 'drat', 'wt',
                  'qsec', 'vs', 'am', 'gear', 'carb']
# Pull the 'mpg' column out of the frame.
mpg = mtcars['mpg']
#Scale your data
# NOTE(review): reshape is called directly on the column object here; the
# reply after this snippet says a pandas Series has no reshape and suggests
# reshaping the underlying NumPy array instead.
mpg_matrix = mpg.reshape(-1,1)
# Min-max scale the single-column matrix, then plot the scaled values.
scaled = preprocessing.MinMaxScaler()
scaled_mpg = scaled.fit_transform(mpg_matrix)
plt.plot(scaled_mpg)
plt.show()
mpg_matrix = mpg.numpy.reshape(-1,1)
tr__
File "C:\Anaconda\lib\site-packages\pandas\core\generic.py", line 5067, in __getattr__
return object.__getattribute__(self, name)
AttributeError: 'Series' object has no attribute 'numpy'
pandas.core.series.Series doesn't have reshape.
Perhaps:
# Convert the Series to a NumPy array, then reshape it into a single column.
# to_numpy() is the accessor the pandas documentation recommends over .values.
mpg_matrix = mpg.to_numpy().reshape(-1, 1)
i.e. get the underlying numpy array and reshape that.

I was visualizing a data set using seaborn in python3 but its giving me an error. unsupported operand type(s) for /: 'str' and 'int'

import pandas as pd
from pandas import Series,DataFrame
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline
# Read the poll data straight from the CSV URL.
poll_df=pd.read_csv('http://elections.huffingtonpost.com/pollster/2012-general-election-romney-vs-obama.csv')
#poll_df is the data which i have read from a csv file.
# NOTE(review): 'Affiliation' is the only variable handed to the plot; per the
# reply after this snippet the column holds strings, not numbers.
sns.factorplot('Affiliation',data=poll_df)
I have difficulty understanding the question. The Affiliation column holds str values, not numeric ones.
If you want to count the total number of rows in each str category and show the counts as a bar plot, try:
import pandas as pd
from pandas import Series,DataFrame
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline
# Read the poll data straight from the CSV URL.
poll_df = pd.read_csv('http://elections.huffingtonpost.com/pollster/2012-general-election-romney-vs-obama.csv')
# One bar per distinct Affiliation value; bar height is the number of rows.
# The column is passed as x=... because recent seaborn releases accept only
# `data` positionally, so a bare positional column name clashes with data=.
sns.countplot(x='Affiliation', data=poll_df)
Alternatively, upload an image of the kind of plot you would like to have as a result.

Resources