import python file into jupyter notebook - python-3.x

I have a Python file, bucket.py. I'm trying to import it into a Jupyter notebook using the code below, and then use one of its functions, "exp1", to explore a dataframe. I'm getting the error below. Can someone please tell me how to import a file from a directory so that I can use its functions in my Jupyter notebook?
code:
import importlib.util
spec = importlib.util.spec_from_file_location("module.name", '/Users/stuff/bucket/bucket.py')
foo = importlib.util.module_from_spec(spec)
foo.exp1(df)
error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-9-e1cc80f06e24> in <module>
----> 1 foo.exp1(harborsideoakland_df)
AttributeError: module 'module.name' has no attribute 'exp1'
bucket.py file:
# import libraries
import numpy as np
import pandas as pd
from time import time
import scipy.stats as stats
from IPython.display import display # Allows the use of display() for DataFrames
# # Pretty display for notebooks
# %matplotlib inline
###########################################
# Suppress matplotlib user warnings
# Necessary for newer version of matplotlib
import warnings
warnings.filterwarnings("ignore", category = UserWarning, module = "matplotlib")
#
# Display inline matplotlib plots with IPython
from IPython import get_ipython
get_ipython().run_line_magic('matplotlib', 'inline')
###########################################
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
### HELPER FUNCTIONS:
# Initial Exploration
def exp1(df):
    """Print a quick exploratory summary of a DataFrame.

    Reports, in order: the shape, the ``df.info()`` listing, the number of
    unique values per column, the fraction of duplicated rows, the fraction
    of missing values per column, and the first five rows.

    Args:
        df: The pandas DataFrame to summarise.

    Returns:
        None. Everything is written to standard output.
    """
    # Lift pandas' row/column display limits so nothing is elided with "...".
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        n_rows = df.shape[0]

        print('rows and columns: {}'.format(df.shape))
        print('')

        print('data types and columns in data:')
        print('')
        # info() writes its own report and returns None, so it must not be
        # wrapped in print() (that would emit a stray "None" line).
        df.info()
        print('')

        print('unique values in each column:')
        print(df.nunique())
        print('')

        # A frame with no rows has no duplicates; report 0.0 instead of
        # dividing by zero.
        if n_rows:
            duplicate_ratio = 1 - float(df.drop_duplicates().shape[0]) / n_rows
        else:
            duplicate_ratio = 0.0
        print('percentage duplicates : {}'.format(duplicate_ratio))
        print('')

        print('Percentage of column with missing values:')
        print('')
        # The mean of the boolean null-mask is the per-column fraction of
        # missing values.
        missingdf = df.isnull().mean()
        print(missingdf)
        print('')

        print('Data snapshot:')
        print('')
        print(df.head(5))

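The original attempt fails because `importlib.util.module_from_spec(spec)` only creates an empty module object; the code in bucket.py is not executed until you also call `spec.loader.exec_module(foo)`, so `exp1` is never defined on `foo`. Adding that call right after `module_from_spec` fixes the AttributeError. Alternatively, adding the file's directory to `sys.path` and importing it normally worked: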
import sys
sys.path.append(r'/Users/stuff/bucket/bucket')
import bucket as Lb

Related

Getting value error in train.test while eliminating features from dataset using RFE.what is the solution?

valueError image part
Here is the code for eliminating features where I am getting the ValueError. I want to use recursive feature elimination without specifying the features myself. I tried to use the RFE (Recursive Feature Elimination) model to automatically eliminate weak features with each iteration, but I have been unable to do so. Here is the link to the dataset: https://drive.google.com/file/d/1neYnunu6a_Mdn3NfRZsF8wE4gwMCpjAY/view?usp=sharing . I would be grateful if you could suggest how to do it.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import DataFrame
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn import datasets
from sklearn.metrics import classification_report
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
df.keys()
x=pd.DataFrame(df)
x.head()
X = df.drop(["Sub_Cat"],axis=1).values
y = df["Sub_Cat"].values
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=0)
X_train.shape,X_test.shape
sel=SelectFromModel(RandomForestClassifier(n_estimators=100,random_state=0,n_jobs=-1))
sel.fit(X_train,y_train)
sel.get_support()
I am getting a ValueError in this part.
Then I also tried this:
X = df.drop(["Dst_IP","Timestamp","Flow_ID","Src_IP","Sub_Cat"],axis=1).values
y = df["Sub_Cat"].values
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=0)
X = df.drop(["Dst_IP","Timestamp","Flow_ID","Src_IP","Sub_Cat"],axis=1).values
y = df["Sub_Cat"].values
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=0)
sel=SelectFromModel(RandomForestClassifier(n_estimators=100,random_state=0,n_jobs=-1))
sel.fit(X_train,y_train)
sel.get_support()
I am still getting error:
ValueError Traceback (most recent call last)
in ()
1 sel=SelectFromModel(RandomForestClassifier(n_estimators=100,random_state=0,n_jobs=-1))
----> 2 sel.fit(X_train,y_train)
3 sel.get_support()
3 frames
/usr/local/lib/python3.7/dist-packages/numpy/core/_asarray.py in asarray(a, dtype, order)
81
82 """
---> 83 return array(a, dtype, copy=False, order=order)
84
85
ValueError: could not convert string to float: 'Anomaly'

`FileNotFoundError: No such file: '/content/Weeds_Detectiontrain/1.png'` How to write the correct directory

I'm new to python! I am trying to upgrade myself. However, I have a dataset namely tree_dataset which contains 3 folders i.e. test, train, and validation. Each folder contains 9 different folders (classes). Now, I want to show a sample of all first images (data) from the 9 different classes.
My code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style="whitegrid")
import os
import imageio
import skimage
import skimage.io
import skimage.transform
# Showing sample of all first images (data) from the 09 different classes
f, ax = plt.subplots(nrows=1,ncols=9, figsize=(20, 10))
i=0
for d in directory:
file='/content/tree_dataset'+d+'/1.png'
im=imageio.imread(file)
ax[i].imshow(im,resample=True)
ax[i].set_title(d, fontsize=8)
i+=1
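Error: FileNotFoundError: No such file: '/content/Weeds_Detectiontrain/1.png'
Note: the failing path shows that the base directory and `d` are being concatenated without a path separator (`Weeds_Detection` + `train`), and that the class sub-folder is missing. Building the path with `os.path.join(base_dir, split, class_name, '1.png')` avoids both problems.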

When I applied RandomForest in Python, ValueError: Found input variables with inconsistent numbers of samples: [2883, 1236]

File "D:\Users\Watson Rockstar\Anaconda3\lib\site-packages\sklearn\utils\validation.py", line 205, in check_consistent_length
" samples: %r" % [int(l) for l in lengths])
ValueError:
Found input variables with inconsistent numbers of samples: [2883, 1236]
The dataset has 4119 rows in total; the shape of Xtrain is (2883, 18) and the shape of Xtest is (1236, 18).
I have tried using LabelEncoder and OneHotEncoder to solve the problem, but it did not help:
# Ignore the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')
# data visualisation and manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
import missingno as msno
#configure
# sets matplotlib to inline and displays graphs below the corressponding cell.
#import the necessary modelling algos.
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
#preprocessing
from sklearn.preprocessing import LabelEncoder
telebanking = pd.read_csv('bank-additional.csv')
telebank = telebanking.drop(['duration','default'],axis =1)
def transform(feature):
le = LabelEncoder()
telebank[feature] = le.fit_transform(telebank[feature])
print(le.classes_)
cat_telebank=telebank.select_dtypes(include='object')
cat_telebank.columns
for col in cat_telebank.columns:
transform(col)
scaler=StandardScaler()
scaled_telebank=scaler.fit_transform(telebank.drop('y',axis=1))
X=scaled_telebank
Y=telebank['y'].as_matrix()
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,Y,test_size=0.3)
def compare(model):
clf = model
clf.fit(Xtrain,Ytrain)
pred = clf.predict(Xtrain)
acc.append(accuracy_score(pred,Ytest))
prec.append(precision_score(pred,Ytest))
rec.append(recall_score(pred,Ytest))
auroc.append(roc_auc_score(pred,Ytest))
acc=[]
prec=[]
rec=[]
auroc=[]
models=[RandomForestClassifier(),DecisionTreeClassifier()]
model_names=['RandomForestClassifier','DecisionTreeClassifier']
for model in range(len(models)):
compare(models[model])
d={'Modelling Algo':model_names,'Accuracy':acc,'Precision':prec,'Recall':rec,'Area Under ROC Curve':auroc}
met_telebank=pd.DataFrame(d)
met_telebank
It is the first warning's detail.
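Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,Y,test_size=0.3)
is already in the correct order: train_test_split returns the train/test pair for each input array in turn (X_train, X_test, y_train, y_test), so it should not be changed to
Xtrain,Ytrain,Xtest,Ytest = train_test_split(X,Y,test_size=0.3)
The error comes from compare(): `pred = clf.predict(Xtrain)` produces 2883 predictions, which are then scored against the 1236 labels in Ytest. Predict on the test set instead, `pred = clf.predict(Xtest)`, and pass the true labels first to the metric functions, e.g. accuracy_score(Ytest, pred).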

How to plot the distribution of each in feature in cancer dataset

I want to plot the distribution of each feature in the cancer dataset using ggplot, but it is giving me an error.
#pip install plotnine
from plotnine import ggplot
from plotnine import *
from sklearn.datasets import load_breast_cancer
for i in cancer.feature_names:
ggplot(cancer.data)+aes(x=i)+geom_bar(size=10)
This is the error message i got
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
I would recommend using seaborn for that. Here is an example of plotting the distribution of each feature in the cancer dataset, split by target:
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
# loading data
cancer = load_breast_cancer()
data = pd.DataFrame(np.c_[cancer['data'], cancer['target']],
columns= np.append(cancer['feature_names'], ['target']))
df = data.melt(['target'], var_name='cols', value_name='vals')
g = sns.FacetGrid(df, col='cols', hue="target", palette="Set1", col_wrap=4)
g = (g.map(sns.distplot, "vals", hist=True, ))
from plotnine import ggplot
from plotnine import *
from sklearn.datasets import load_breast_cancer
cancer=load_breast_cancer()
import pandas as pd
import matplotlib.pyplot as plt
data=pd.DataFrame(cancer.data,columns=cancer.feature_names)
for i in data.columns:
print(ggplot(data)+aes(x=i)+geom_density(size=1))
print(ggplot(data)+aes(x=i)+geom_bar(size=10))

How to scale a data using Python 3

I am trying to scale my data using Python 3
But I keep getting this error: I am out of ideas as to what could be the issue? Please can you assist me guys? I would deeply appreciate your help!
import pandas as pd
import numpy as np
from numpy.random import randn
from pandas import Series, DataFrame
from pandas.plotting import scatter_matrix
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import rcParams
from pylab import rcParams
import seaborn as sb
import scipy
from scipy import stats
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from scipy.stats import chi2_contingency
import sklearn
from sklearn import preprocessing
from sklearn.preprocessing import scale
mtcars = pd.read_csv('mtcars.csv')
mtcars.columns = ['Car
names','mpg','cyl','disp','hp','drat','wt','qsec','vs','am','gear','carb']
mpg = mtcars['mpg']
#Scale your data
mpg_matrix = mpg.reshape(-1,1)
scaled = preprocessing.MinMaxScaler()
scaled_mpg = scaled.fit_transform(mpg_matrix)
plt.plot(scaled_mpg)
plt.show()
mpg_matrix = mpg.numpy.reshape(-1,1)
tr__
File "C:\Anaconda\lib\site-packages\pandas\core\generic.py", line 5067, in __getattr__
return object.__getattribute__(self, name)
AttributeError: 'Series' object has no attribute 'numpy'
pandas.core.series.Series doesn't have reshape.
Perhaps:
mpg_matrix = mpg.values.reshape(-1,1)
i.e. get the underlying numpy array and reshape that.

Resources