How to scale data using Python 3 - python-3.x

I am trying to scale my data using Python 3, but I keep getting the error shown below the code.
I am out of ideas as to what the issue could be. Could you please help me? I would deeply appreciate it!
import pandas as pd
import numpy as np
from numpy.random import randn
from pandas import Series, DataFrame
from pandas.plotting import scatter_matrix
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import rcParams
from pylab import rcParams
import seaborn as sb
import scipy
from scipy import stats
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from scipy.stats import chi2_contingency
import sklearn
from sklearn import preprocessing
from sklearn.preprocessing import scale
mtcars = pd.read_csv('mtcars.csv')
mtcars.columns = ['Car
names','mpg','cyl','disp','hp','drat','wt','qsec','vs','am','gear','carb']
mpg = mtcars['mpg']
#Scale your data
mpg_matrix = mpg.reshape(-1,1)
scaled = preprocessing.MinMaxScaler()
scaled_mpg = scaled.fit_transform(mpg_matrix)
plt.plot(scaled_mpg)
plt.show()
mpg_matrix = mpg.numpy.reshape(-1,1)
Traceback (most recent call last):
File "C:\Anaconda\lib\site-packages\pandas\core\generic.py", line 5067, in __getattr__
return object.__getattribute__(self, name)
AttributeError: 'Series' object has no attribute 'numpy'

pandas.core.series.Series doesn't have reshape.
Perhaps:
mpg_matrix = mpg.values.reshape(-1,1)
i.e. get the underlying numpy array and reshape that.

Related

Python: xarray and h5py incompatibility

The code below raises the error "RuntimeError: NetCDF: HDF error". If I remove the h5py import, I get no error. Are there any suggestions as to why this might be happening and how I can fix it? My ultimate aim is to load an HDF5 file and write it out as netCDF.
import numpy as np
import pandas as pd
import h5py
import xarray as xr
ds = xr.Dataset(
{"foo": (("x", "y"), np.random.rand(4, 5))},
coords={
"x": [10, 20, 30, 40],
"y": pd.date_range("2000-01-01", periods=5),
"z": ("x", list("abcd")),
},
)
ds.to_netcdf("saved_on_disk.nc")
import numpy as np
import pandas as pd
import xarray as xr
Works.
import numpy as np
import pandas as pd
import h5py
import xarray as xr
Doesn't Work
import numpy as np
import pandas as pd
import xarray as xr
import h5py
Doesn't Work
import numpy as np
import pandas as pd
from netCDF4 import Dataset
import xarray as xr
import h5py
Works!
The key was also loading the netCDF4 package.

AttributeError: module 'matplotlib' has no attribute 'scatter'

I'm trying to cluster latitude and longitude values.
The code raises an error on the plt.scatter(data['Lng'],data['Lat']) line.
The error is:
AttributeError: module 'matplotlib' has no attribute 'scatter'
code:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
sns.set()
from sklearn.cluster import KMeans
data = pd.read_csv("pk.csv")
data.head()
lat_long = data.drop(['country', 'iso2','admin', 'capital','population',
'population_proper'] , axis = 1)
lat_long.head()
plt.scatter(data['Lng'],data['Lat']) # error here
It should be:
import matplotlib.pyplot as plt
Or it can be:
from matplotlib import pyplot as plt
You can also read PEP 328 for more information and clarity.

How to plot the distribution of each feature in the cancer dataset

I want to plot the distribution of each feature in the cancer dataset using ggplot, but it's giving me an error.
#pip install plotnine
from plotnine import ggplot
from plotnine import *
from sklearn.datasets import load_breast_cancer
for i in cancer.feature_names:
ggplot(cancer.data)+aes(x=i)+geom_bar(size=10)
This is the error message I got:
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
I would recommend using seaborn for that. Here is an example of plotting the distribution of each feature in the cancer dataset, grouped by target:
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
# loading data
cancer = load_breast_cancer()
data = pd.DataFrame(np.c_[cancer['data'], cancer['target']],
columns= np.append(cancer['feature_names'], ['target']))
df = data.melt(['target'], var_name='cols', value_name='vals')
g = sns.FacetGrid(df, col='cols', hue="target", palette="Set1", col_wrap=4)
g = (g.map(sns.distplot, "vals", hist=True, ))
from plotnine import ggplot
from plotnine import *
from sklearn.datasets import load_breast_cancer
cancer=load_breast_cancer()
import pandas as pd
import matplotlib.pyplot as plt
data=pd.DataFrame(cancer.data,columns=cancer.feature_names)
for i in data.columns:
print(ggplot(data)+aes(x=i)+geom_density(size=1))
print(ggplot(data)+aes(x=i)+geom_bar(size=10))

I was visualizing a data set using seaborn in Python 3, but it's giving me an error: unsupported operand type(s) for /: 'str' and 'int'

import pandas as pd
from pandas import Series,DataFrame
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline
poll_df=pd.read_csv('http://elections.huffingtonpost.com/pollster/2012-general-election-romney-vs-obama.csv')
#poll_df is the data which i have read from a csv file.
sns.factorplot('Affiliation',data=poll_df)
I have difficulty understanding the question. The column Affiliation holds str values, not numeric ones.
If you want to count the total number of each str category and show it as a bar plot, try:
import pandas as pd
from pandas import Series,DataFrame
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline
poll_df=pd.read_csv('http://elections.huffingtonpost.com/pollster/2012-general-election-romney-vs-obama.csv')
#poll_df is the data which i have read from a csv file.
sns.countplot('Affiliation',data=poll_df)
Alternatively, upload an image of the kind of plot you would like to have as a result.

Plotting multiple boxplots group by two columns

import matplotlib.pyplot as plt
import numpy as np
import matplotlib as mpl
import pandas as pd
filepath='E:/PROJECT ON DATA SCIENCE/boxplot/fee.csv';
X=pd.read_csv(filepath_or_buffer=filepath,index_col=0)
X.boxplot(by='stype', column='fee')
X.boxplot(by='pincode', column='fee')
If you want to boxplot X grouping by both stype and pincode you can use
X.boxplot(column='fee', by=['stype', 'pincode'])
Complete code would be
import matplotlib.pyplot as plt
import numpy as np
import matplotlib as mpl
import pandas as pd
filepath='E:/PROJECT ON DATA SCIENCE/boxplot/fee.csv';
X=pd.read_csv(filepath_or_buffer=filepath,index_col=0)
X.boxplot(column='fee', by=['stype', 'pincode'])

Resources