I have a dataset on which I was practicing feature engineering by converting the categorical (object) columns to numbers, using the following code:
import pandas as pd
import numpy as np
from sklearn import preprocessing
df = pd.read_csv(r'train.csv',index_col='Id')
print(df.shape)
df.head()
colsNum = df.select_dtypes(np.number).columns
colsObj = df.columns.difference(colsNum)
df[colsNum] = df[colsNum].fillna(df[colsNum].mean()//1)
df[colsObj] = df[colsObj].fillna(df[colsObj].mode().iloc[0])
label_encoder = preprocessing.LabelEncoder()
for col in colsObj:
    df[col] = label_encoder.fit_transform(df[col])
df.head()
for col in colsObj:
    df[col] = label_encoder.inverse_transform(df[col])
df.head()
But here inverse_transform() wasn't returning the original dataset. Please help me!
You need one encoder per column: a single LabelEncoder only remembers the classes from the last column it was fitted on, so you cannot encode all columns with the same encoder and then invert them:
import pandas as pd
import numpy as np
from sklearn import preprocessing
df = pd.read_csv(r'train.csv', index_col='Id')
print(df.shape)
colsNum = df.select_dtypes(np.number).columns
colsObj = df.columns.difference(colsNum)
df[colsNum] = df[colsNum].fillna(df[colsNum].mean()//1)
df[colsObj] = df[colsObj].fillna(df[colsObj].mode().iloc[0])
print(df.head())
encoder = {}
for col in colsObj:
    encoder[col] = preprocessing.LabelEncoder()
    df[col] = encoder[col].fit_transform(df[col])
print(df.head())
for col in colsObj:
    df[col] = encoder[col].inverse_transform(df[col])
print(df.head())
You can also check out this answer for further details.
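Alternatively, if you'd rather not keep a dict of encoders, scikit-learn's OrdinalEncoder encodes all the object columns in one go and supports inverse_transform directly. A minimal sketch, assuming scikit-learn 0.20 or later and the same colsObj as above:
from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder()
# encode every object column at once (fit_transform returns a 2-D array)
df[colsObj] = oe.fit_transform(df[colsObj])
print(df.head())
# round-trip back to the original string labels
df[colsObj] = oe.inverse_transform(df[colsObj])
print(df.head())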
Related
I've got my data cleaned and prepped. I've done a train/test split and am now trying to fit a linear regression. The issue is, when I first tried it, it said that I needed to create an array and reshape the data. I have done this, but now it's giving me the error "_reshape_dispatcher() missing 1 required positional argument: 'newshape'". None of the methods I've looked up for declaring a newshape have worked.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
df = pd.read_csv('googleplaystore.csv') # 1
df = df.dropna() # 3
df['Size'] = df['Size'].str.extract(r'(\d+\.?\d)', expand=False).astype(float) * df['Size'].str[-1].replace({'M': 1024, 'k': 1}) # 4
df = df.dropna() # remove nan from "Varies with device"
df['Price'] = df['Price'].str.strip('$').astype(float) # 5
df['Installs'] = df['Installs'].str.strip('+')
df['Installs'] = df['Installs'].str.replace(',',"").astype(int)
df['Reviews'] = df['Reviews'].astype(float)
df['Size'] = df['Size'].astype(float)
df = df.loc[df['Rating'].between(1, 5)] # 6
df = df.loc[df['Type'] != 'Free'] # 7
df.drop(df[df['Price'] >= 200].index, inplace = True)
df.drop(df[df['Reviews'] >2000000].index, inplace = True)
df.drop(df[df['Installs'] >10000].index, inplace = True)
inp1 = df.copy()
df_reviewslog=np.log10(df['Reviews'])
df_installslog=np.log10(df['Installs'])
del df['App']
del df['Last Updated']
del df['Current Ver']
del df['Android Ver']
pd.get_dummies(df, columns=['Category', 'Genres', 'Content Rating'], drop_first=True)
inp2 = df.copy()
df_train = X_train,X_test,y_train,y_test=train_test_split(df['Reviews'],df['Installs'], test_size=0.7, random_state=0)
df_test = X_train,X_Test,y_train,y_test=train_test_split(df['Reviews'],df['Installs'], test_size=0.3, random_state=0)
df_train = np.array(df_train)
df_test = np.array(df_test)
df_train = np.reshape(df_train.shape)
df_test = np.reshape(df_test.shape)
lr = LinearRegression()
lr.fit(X_train,y_train)
print(lr.score(X_Test,y_test))
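The error comes from np.reshape(df_train.shape): np.reshape needs both the array and the target shape, e.g. np.reshape(arr, (-1, 1)), and the double train_test_split isn't needed either, since one call already returns all four pieces. A minimal sketch of the split-and-fit step, assuming the goal is to regress Installs on Reviews:
# single split; train_test_split already returns the four pieces
X_train, X_test, y_train, y_test = train_test_split(df['Reviews'], df['Installs'], test_size=0.3, random_state=0)
# LinearRegression expects a 2-D feature matrix, so reshape the single column
X_train = X_train.values.reshape(-1, 1)
X_test = X_test.values.reshape(-1, 1)
lr = LinearRegression()
lr.fit(X_train, y_train)
print(lr.score(X_test, y_test))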
I have loaded a dataset and tried to find the correlation coefficient of each feature with respect to the target variable.
Below is the code:
from sklearn.datasets import load_boston
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
#Loading the dataset
x = load_boston()
df = pd.DataFrame(x.data, columns = x.feature_names)
df["MEDV"] = x.target
X = df.drop("MEDV",1) #Feature Matrix
y = df["MEDV"] #Target Variable
df.head()
#Using Pearson Correlation
plt.figure(figsize=(12,10))
cor = df.corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()
#Correlation with output variable
cor_target = abs(cor["MEDV"])
#Selecting highly correlated features
relevant_features = cor_target[cor_target>0.4]
print(relevant_features)
How do I drop the features that have correlation coefficient < 0.4?
Try this:
#Selecting the least correlated features
irelevant_features = cor_target[cor_target < 0.4]
#List of their column names
cols = list(irelevant_features.index)
#Dropping them from the dataframe
df = df.drop(cols, axis=1)
print(irelevant_features)
print(df.columns)
Alternatively, hard-code the columns to drop, e.g. X = df.drop(['MEDV', 'CRIM', 'ZN', 'CHAS', 'AGE', 'DIS', 'RAD', 'B'], axis=1), but building the list from irelevant_features (as written above) is more general.
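If you prefer not to build the list of names at all, a one-line variant (a sketch, starting from the original df and relying on cor_target being indexed by the same names as df's columns) masks the columns with a boolean Series:
# keep only the columns whose absolute correlation with MEDV is at least 0.4
df = df.loc[:, cor_target >= 0.4]
print(df.columns)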
I want the output of this code in int64 format, but the output is float. How can I change it? Please suggest.
import pandas as pd
import numpy as np
df = pd.read_csv('https://query.data.world/s/HqjNNadqEnwSq1qnoV_JqyRJkc7o6O')
df = df[df.isnull().sum(axis=1) < 5]
print(round(100*(df.isnull().sum()/len(df.index))),2)
Something like this should do the trick...
import pandas as pd
import numpy as np
df = pd.read_csv('https://query.data.world/s/HqjNNadqEnwSq1qnoV_JqyRJkc7o6O')
df = df[df.isnull().sum(axis=1) < 5]
x = round(100*(df.isnull().sum()/len(df.index)))
y = x.astype(np.int64)
print(y)
The key bit being x.astype(np.int64) to convert the format.
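Note that in the original snippet the 2 sits outside round()'s closing parenthesis, so it is passed to print() as a second argument rather than used as a number of decimals. If whole-number percentages are all you need, the same conversion can also be chained in one line (same data as above):
print((100 * df.isnull().sum() / len(df.index)).round().astype(np.int64))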
I get a KeyError for "Displacement" when I try to plot Force against Displacement with pandas for this group of dataframes. Please help.
The link to the excel sheet being used:
https://www.dropbox.com/s/f8lnp973ojv3ish/neurospheress.xlsx?dl=0
I tried stripping any spaces from the column titles, but it doesn't work.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
data = pd.read_excel('neurospheress.xlsx', sep='\s*,\s*', sheet_name = 'LS')
df1 = data.iloc[:80,:2]
df2 = data.iloc[:80,2:4]
df3 = data.iloc[:80,4:]
dfs = [df1,df2,df3]
for i, df in enumerate(dfs):
    plt.plot(df['Displacement'], df['Force'], linestyle='--', alpha=0.8, label='df{}'.format(i))
plt.legend(loc='best')
plt.show()
The solution below works; it basically adds two things to yours:
a) Skip the first row of the Excel sheet (skiprows=1) so the real Force/Displacement headers are used
b) Rename the columns of df2 and df3 (pandas disambiguates the duplicated headers as Force.1/Displacement.1 and Force.2/Displacement.2, which is what caused the KeyError)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# skip the title row so the real headers are used
data = pd.read_excel('neurospheress.xlsx', sheet_name='LS', skiprows=1)
df1 = data.iloc[:80, :2]
df2 = data.iloc[:80, 2:4]
df3 = data.iloc[:80, 4:]
dfs = [df1, df2, df3]
# pandas suffixes the duplicated headers, so bring them back to a common name
df2.rename(columns={'Force.1': 'Force', 'Displacement.1': 'Displacement'}, inplace=True)
df3.rename(columns={'Force.2': 'Force', 'Displacement.2': 'Displacement'}, inplace=True)
print(data.columns)
print(df1.columns)
print(df2.columns)
for i, df in enumerate(dfs):
    plt.plot(df['Displacement'], df['Force'], linestyle='--', alpha=0.8, label='df{}'.format(i))
plt.legend(loc='best')
plt.show()
I have just started using Quandl and pandas, and I came across this code:
import quandl
import pandas as pd
api_key=open('quandlapi.txt','r').read()
df = quandl.get("FMAC/HPI_TX", authtoken=api_key)
fiddy_states = pd.read_html('https://simple.wikipedia.org/wiki/List_of_U.S._states')
main_df = pd.DataFrame()
for abbv in fiddy_states[0][0][1:]:
    query = "FMAC/HPI_" + str(abbv)
    df = quandl.get(query, authtoken=api_key)
    if main_df.empty:
        main_df = df
    else:
        main_df = main_df.join(df)
But when I run it I get the following error:
ValueError: columns overlap but no suffix specified: Index(['Value'], dtype='object')
Can anyone tell me what I am doing wrong here?
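Each quandl.get("FMAC/HPI_...") call returns a DataFrame whose single column is named Value, so as soon as main_df already holds a Value column, join refuses to combine the overlapping names without a suffix. One way around it (a sketch, assuming you want one column per state) is to rename each frame's column to the state abbreviation before joining; passing lsuffix/rsuffix to join would work as well:
for abbv in fiddy_states[0][0][1:]:
    query = "FMAC/HPI_" + str(abbv)
    df = quandl.get(query, authtoken=api_key)
    df.columns = [str(abbv)]  # e.g. 'TX' instead of 'Value', so the joins no longer collide
    if main_df.empty:
        main_df = df
    else:
        main_df = main_df.join(df)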