I drop all columns but the two I am interested in. When I try to convert my dataframe to a 2d numpy array from the two columns it turns into an object type that contains strings. I believe this is because the Data_Values has values such as "23.6." Is there anyway I can get rid of the decimal point and trailing numbers in this data as they are all different values.
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from kneed import KneeLocator
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.metrics import pairwise_distances_argmin
data = pd.read_csv('Alzheimer_s_Disease_and_Healthy_Aging_Data.csv', engine='python', header=None)
data.columns = ['RowId', 'YearStart', 'YearEnd', 'LocationAbbr', 'LocationDesc', 'Datasource', 'Class', 'Topic', 'Question', 'Response',
'Data_Value_Unit', 'DataValueTypeID', 'Data_Value_Type', 'Data_Value', 'Data_Value_Alt', 'Data_Value_Footnote_Symbol',
'Data_Value_Footnote', 'Low_Confidence_Limit', 'High_Confidence_Limit', 'Sample_Size', 'StratificationCategory1',
'Stratification1', 'StratificationCategory2', 'Stratification2', 'StratificationCategory3', 'Stratification3', 'Geolocation',
'ClassID', 'TopicID', 'QuestionID', 'ResponseID', 'LocationID', 'StratificationCategoryID1', 'StratificationID1',
'StratificationCategoryID2', 'StratificationID2', 'StratificationCategoryID3', 'StratificationID3', 'Report']
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)
pd.set_option('display.width', 10)
data1 = data.iloc[1:]
df = data1[data1["Data_Value_Type"].str.contains("Mean") == False]
df = data1[data1["Data_Value"].str.contains("NaN") == False]
df.dropna()
df = df.drop(columns=['RowId', 'YearStart', 'YearEnd', 'LocationAbbr', 'LocationDesc', 'Datasource', 'Class', 'Topic', 'Question', 'Response',
'Data_Value_Unit', 'DataValueTypeID', 'Data_Value_Type', 'Data_Value_Alt', 'Data_Value_Footnote_Symbol',
'Data_Value_Footnote', 'Low_Confidence_Limit', 'High_Confidence_Limit', 'Sample_Size', 'StratificationCategory1',
'Stratification1', 'StratificationCategory2', 'Stratification2', 'StratificationCategory3', 'Stratification3', 'Geolocation',
'ClassID', 'TopicID', 'QuestionID', 'ResponseID', 'StratificationCategoryID1', 'StratificationID1',
'StratificationCategoryID2', 'StratificationID2', 'StratificationCategoryID3', 'StratificationID3', 'Report'])
x = df.to_numpy()
print(x.dtype)
I'm trying to make cluster of latitude and longitude.
the code gave an error in plt.scatter(data['Lng'],data['Lat']) line
the error is:
AttributeError: module 'matplotlib' has no attribute 'scatter'
code:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
sns.set()
from sklearn.cluster import KMeans
data = pd.read_csv("pk.csv")
data.head()
lat_long = data.drop(['country', 'iso2','admin', 'capital','population',
'population_proper'] , axis = 1)
lat_long.head()
plt.scatter(data['Lng'],data['Lat']) # error here
It should be:
import matplotlib.pyplot as plt
Or it can be:
from matplotlib import pyplot as plt
Also you can read PEP 328 for more information and clearity.
I am trying to scale my data using Python 3
But I keep getting this error: I am out of ideas as to what could be the issue? Please can you assist me guys? I would deeply appreciate your help!
import pandas as pd
import numpy as np
from numpy.random import randn
from pandas import Series, DataFrame
from pandas.plotting import scatter_matrix
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import rcParams
from pylab import rcParams
import seaborn as sb
import scipy
from scipy import stats
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from scipy.stats import chi2_contingency
import sklearn
from sklearn import preprocessing
from sklearn.preprocessing import scale
mtcars = pd.read_csv('mtcars.csv')
mtcars.columns = ['Car
names','mpg','cyl','disp','hp','drat','wt','qsec','vs','am','gear','carb']
mpg = mtcars['mpg']
#Scale your data
mpg_matrix = mpg.reshape(-1,1)
scaled = preprocessing.MinMaxScaler()
scaled_mpg = scaled.fit_transform(mpg_matrix)
plt.plot(scaled_mpg)
plt.show()
mpg_matrix = mpg.numpy.reshape(-1,1)
tr__
File "C:\Anaconda\lib\site-packages\pandas\core\generic.py", line 5067, in __getattr__
return object.__getattribute__(self, name)
AttributeError: 'Series' object has no attribute 'numpy'
pandas.core.series.Series doesn't have reshape.
Perhaps:
mpg_matrix = mpg.values.reshape(-1,1)
i.e. get the underlying numpy array and reshape that.
import pandas as pd
from pandas import Series,DataFrame
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline
poll_df=pd.read_csv('http://elections.huffingtonpost.com/pollster/2012-general-election-romney-vs-obama.csv')
#poll_df is the data which i have read from a csv file.
sns.factorplot('Affiliation',data=poll_df)
I have difficulty understanding the question. Column Affiliation has a str value not numeric.
if you want to count total number of each str category and have a bar plot try:
import pandas as pd
from pandas import Series,DataFrame
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline
poll_df=pd.read_csv('http://elections.huffingtonpost.com/pollster/2012-general-election-romney-vs-obama.csv')
#poll_df is the data which i have read from a csv file.
sns.countplot('Affiliation',data=poll_df)
alternatively upload the image of what kind of plot you would like to have as a result
import matplotlib.pyplot as plt
import numpy as np
import matplotlib as mpl
import pandas as pd
filepath='E:/PROJECT ON DATA SCIENCE/boxplot/fee.csv';
X=pd.read_csv(filepath_or_buffer=filepath,index_col=0)
X.boxplot(by='stype', column='fee')
X.boxplot(by='pincode', column='fee')
If you want to boxplot X grouping by both stype and pincode you can use
X.boxplot(column='fee', by=['stype', 'pincode'])
Complete code would be
import matplotlib.pyplot as plt
import numpy as np
import matplotlib as mpl
import pandas as pd
filepath='E:/PROJECT ON DATA SCIENCE/boxplot/fee.csv';
X=pd.read_csv(filepath_or_buffer=filepath,index_col=0)
X.boxplot(column='fee', by=['stype', 'pincode'])