Jupyter Notebook: Python 3 - python-3.x

I have edited my question by uploading the whole code, so could you check this out, #Nathon_Marotte Sir?
I am trying to run this code and it gives me an error:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
observations = 1000
xs = np.random.uniform(low=-10, high=10, size=(observations,1))
zs = np.random.uniform(-10,10,(observations,1))
inputs = np.column_stack((xs,zs))
print(inputs.shape)
noise= np.random.uniform(-1, 1, (observations, 1))
targets = 2*xs - 3*zs + 5 + noise
print(targets.shape)
#observations=1000
targets = targets.reshape(observations,)
fig=plt.figure()
ax = fig.add_subplot(111,projection='3d')
ax.plot(xs, zs, targets)
ax.set_xlabel('xs')
ax.set_ylabel('zs')
ax.set_zlabel('Targets')
ax.view_init(azim=250)
plt.show()
targets=targets.reshape(observations,)
Error:
ValueError Traceback (most recent call last)
<ipython-input-44-28d2a78b4ad5> in <module>
3 ax = fig.add_subplot(111,projection='3d')
4
----> 5 ax.plot(xs, zs, targets)
6
7 ax.set_xlabel('xs')
F:\Softwares\Anaconda\Installed\lib\site-packages\mpl_toolkits\mplot3d\axes3d.py in plot(self, xs, ys, zdir, *args, **kwargs)
1467
1468 # Match length
-> 1469 zs = np.broadcast_to(zs, np.shape(xs))
1470
1471 lines = super().plot(xs, ys, *args, **kwargs)
<__array_function__ internals> in broadcast_to(*args, **kwargs)
F:\Softwares\Anaconda\Installed\lib\site-packages\numpy\lib\stride_tricks.py in broadcast_to(array, shape, subok)
178 [1, 2, 3]])
179 """
--> 180 return _broadcast_to(array, shape, subok=subok, readonly=True)
181
182
F:\Softwares\Anaconda\Installed\lib\site-packages\numpy\lib\stride_tricks.py in _broadcast_to(array, shape, subok, readonly)
121 'negative')
122 extras = []
--> 123 it = np.nditer(
124 (array,), flags=['multi_index', 'refs_ok', 'zerosize_ok'] + extras,
125 op_flags=['readonly'], itershape=shape, order='C')
ValueError: operands could not be broadcast together with remapped shapes [original->remapped]: (1000,) and requested shape (1000,1)
As I am a newbie, I don't have sufficient knowledge to fix this, so could you help me fix this bug? That would be great.
Thank you in advance.

I am done with my question; it worked when I used Google Colab resources.
Thank you all, and especially #NathanMarotte.
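For readers who hit the same traceback and cannot simply switch environments: the plot call mixes the 1-D targets array (after the reshape) with the 2-D xs and zs arrays of shape (1000, 1), and matplotlib cannot broadcast (1000,) to (1000, 1). A minimal sketch of one workaround, reusing the variables defined above, is to pass all three arrays to ax.plot as flat 1-D arrays (equivalently, you could move the targets reshape to after the plotting block):
# Flatten all three arrays to shape (observations,), so the 3-D plot can broadcast them.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.plot(xs.flatten(), zs.flatten(), targets.flatten())
ax.set_xlabel('xs')
ax.set_ylabel('zs')
ax.set_zlabel('Targets')
ax.view_init(azim=250)
plt.show()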

Related

sklearn v1.2.1: ValueError: custom transformer class

I have installed scikit-learn 1.2.1 on my system and I would like to create custom transformer classes to use with a ColumnTransformer instance. The problem is that when I set the output to a pandas DataFrame, I get the following error message.
But first, the code:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

class TestClass(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X, y)

    def transform(self, X, y=None):
        return X

ct = ColumnTransformer(transformers=[('simple_imputer', SimpleImputer(strategy='most_frequent'), ['paymentmethod']),
                                     ('test_class', TestClass(), ['dependents', 'seniorcitizen', 'partner'])])
ct.set_output(transform='pandas')  # I really need 'result' as a pandas DataFrame
result = ct.fit_transform(X_train, y_train)
Error message:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[22], line 3
1 ct = ColumnTransformer(transformers=[('simple_imputer', SimpleImputer(strategy='most_frequent'), ['paymentmethod']),
2 ('test_class', TestClass(), ['dependents', 'seniorcitizen', 'partner'])])
----> 3 ct.set_output(transform='pandas')
4 result = ct.fit_transform(X_train, y_train)
File ~/.local/lib/python3.10/site-packages/sklearn/compose/_column_transformer.py:287, in ColumnTransformer.set_output(self, transform)
279 transformers = (
280 trans
281 for _, trans, _ in chain(
(...)
284 if trans not in {"passthrough", "drop"}
285 )
286 for trans in transformers:
--> 287 _safe_set_output(trans, transform=transform)
289 return self
File ~/.local/lib/python3.10/site-packages/sklearn/utils/_set_output.py:275, in _safe_set_output(estimator, transform)
272 return
274 if not hasattr(estimator, "set_output"):
--> 275 raise ValueError(
276 f"Unable to configure output for {estimator} because `set_output` "
277 "is not available."
278 )
279 return estimator.set_output(transform=transform)
ValueError: Unable to configure output for TestClass() because `set_output` is not available.
How can I fix this?
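A likely fix, as a sketch: in scikit-learn 1.2 the pandas-output machinery only exposes set_output on a transformer that can report its output column names, so giving TestClass a get_feature_names_out method (or inheriting sklearn.base.OneToOneFeatureMixin) should let ColumnTransformer.set_output(transform='pandas') accept it. The fit below records the incoming column names purely for illustration, assuming X_train is a pandas DataFrame:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class TestClass(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        # Remember the incoming column names; ColumnTransformer passes a DataFrame slice here.
        self.feature_names_in_ = np.asarray(X.columns, dtype=object)
        return self

    def transform(self, X, y=None):
        return X

    def get_feature_names_out(self, input_features=None):
        # A pass-through transformer keeps the same columns it was given.
        return self.feature_names_in_
With a class like this, ct.set_output(transform='pandas') should no longer raise, and result should come back as a DataFrame.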

Specifying the columns using strings is only supported for pandas DataFrames

I want to one-hot-encode several columns and have tried several solutions, including simple one-hot encoding, ColumnTransformer, make_column_transformer, Pipeline, and get_dummies, but each time I get a different error.
x = dataset.iloc[:, :11].values
y = dataset.iloc[:, 11].values

""" data encoding """
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# oe = OrdinalEncoder()
# x = oe.fit_transform(x)

non_cat = ["Make", "Model", "Vehicle", "Transmission", "Fuel"]
onehot_cat = ColumnTransformer([
    ("categorical", OrdinalEncoder(), non_cat),
    ("onehot_categorical", OneHotEncoder(), non_cat)],
    remainder="passthrough")

x = onehot_cat.fit_transform(x)
Error:
[['ACURA' 'ILX' 'COMPACT' ... 6.7 8.5 33]
['ACURA' 'ILX' 'COMPACT' ... 7.7 9.6 29]
['ACURA' 'ILX HYBRID' 'COMPACT' ... 5.8 5.9 48]
...
['VOLVO' 'XC60 T6 AWD' 'SUV - SMALL' ... 8.6 10.3 27]
['VOLVO' 'XC90 T5 AWD' 'SUV - STANDARD' ... 8.3 9.9 29]
['VOLVO' 'XC90 T6 AWD' 'SUV - STANDARD' ... 8.7 10.7 26]]
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\sklearn\utils\__init__.py in _get_column_indices(X, key)
424 try:
--> 425 all_columns = X.columns
426 except AttributeError:
AttributeError: 'numpy.ndarray' object has no attribute 'columns'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-4-4008371c305f> in <module>
24 ("onehot_categorical", OneHotEncoder(), non_cat)],
25 remainder= "passthrough")
---> 26 x = onehot_cat.fit_transform(x)
27
28 print('OneHotEncode = ', x.shape)
~\Anaconda3\lib\site-packages\sklearn\compose\_column_transformer.py in fit_transform(self, X, y)
527 self._validate_transformers()
528 self._validate_column_callables(X)
--> 529 self._validate_remainder(X)
530
531 result = self._fit_transform(X, y, _fit_transform_one)
~\Anaconda3\lib\site-packages\sklearn\compose\_column_transformer.py in _validate_remainder(self, X)
325 cols = []
326 for columns in self._columns:
--> 327 cols.extend(_get_column_indices(X, columns))
328
329 remaining_idx = sorted(set(range(self._n_features)) - set(cols))
~\Anaconda3\lib\site-packages\sklearn\utils\__init__.py in _get_column_indices(X, key)
425 all_columns = X.columns
426 except AttributeError:
--> 427 raise ValueError("Specifying the columns using strings is only "
428 "supported for pandas DataFrames")
429 if isinstance(key, str):
ValueError: Specifying the columns using strings is only supported for pandas DataFrames
I got a similar error when trying to make a prediction with a model. It was expecting a DataFrame, but I was sending a NumPy object instead. So I changed it from:
prediction = monitor_model.predict(s_df.to_numpy())
to:
prediction = monitor_model.predict(s_df)
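Applied to the question above, the same idea (a minimal sketch, assuming dataset is the original pandas DataFrame) is to keep x as a DataFrame instead of calling .values, so the string column names in non_cat can be resolved. The sketch also keeps only the one-hot step, since running OrdinalEncoder and OneHotEncoder on the same columns would duplicate them:
x = dataset.iloc[:, :11]   # keep a DataFrame (no .values), so column names survive
y = dataset.iloc[:, 11]

non_cat = ["Make", "Model", "Vehicle", "Transmission", "Fuel"]
onehot_cat = ColumnTransformer(
    [("onehot_categorical", OneHotEncoder(), non_cat)],
    remainder="passthrough")

x = onehot_cat.fit_transform(x)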

How do I correctly write the syntax for performing and plotting a for loop operation?

I am trying to create a for loop which uses a defined function (B_lambda) and takes in values of wavelength and temperature to produce values of intensity; i.e. I want the loop to run the function B_lambda over every value in my listed wavelength range for each temperature in the temperature list, and then plot the results. I am not very good with the syntax and have tried many ways, but nothing is producing what I need and I am mostly getting errors. I have no idea how to plot from a for loop, and the online sources I have checked have not helped me with using a defined function in a for loop. Below is my latest code, which seems to have the fewest errors, along with the error message:
import matplotlib.pylab as plt
import numpy as np
from astropy import units as u
import scipy.constants
%matplotlib inline
#Importing constants to use.
h = scipy.constants.h
c = scipy.constants.c
k = scipy.constants.k
wavelengths= np.arange(1000,30000)*1.e-10
temperature=[3000,4000,5000,6000]
for lam in wavelengths:
    for T in temperature:
        B_lambda = ((2*h*c**2)/(lam**5))*((1)/(np.exp((h*c)/(lam*k*T))-1))

plt.figure()
plt.plot(wavelengths,B_lambda)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-6-73b866241c49> in <module>
17 B_lambda = ((2*h*c**2)/(lam**5))*((1)/(np.exp((h*c)/(lam*k*T))-1))
18 plt.figure()
---> 19 plt.plot(wavelengths,B_lambda)
20
21
/usr/local/lib/python3.6/dist-packages/matplotlib/pyplot.py in plot(scalex, scaley, data, *args, **kwargs)
2787 return gca().plot(
2788 *args, scalex=scalex, scaley=scaley, **({"data": data} if data
-> 2789 is not None else {}), **kwargs)
2790
2791
/usr/local/lib/python3.6/dist-packages/matplotlib/axes/_axes.py in plot(self, scalex, scaley, data, *args, **kwargs)
1663 """
1664 kwargs = cbook.normalize_kwargs(kwargs, mlines.Line2D._alias_map)
-> 1665 lines = [*self._get_lines(*args, data=data, **kwargs)]
1666 for line in lines:
1667 self.add_line(line)
/usr/local/lib/python3.6/dist-packages/matplotlib/axes/_base.py in __call__(self, *args, **kwargs)
223 this += args[0],
224 args = args[1:]
--> 225 yield from self._plot_args(this, kwargs)
226
227 def get_next_color(self):
/usr/local/lib/python3.6/dist-packages/matplotlib/axes/_base.py in _plot_args(self, tup, kwargs)
389 x, y = index_of(tup[-1])
390
--> 391 x, y = self._xy_from_xy(x, y)
392
393 if self.command == 'plot':
/usr/local/lib/python3.6/dist-packages/matplotlib/axes/_base.py in _xy_from_xy(self, x, y)
268 if x.shape[0] != y.shape[0]:
269 raise ValueError("x and y must have same first dimension, but "
--> 270 "have shapes {} and {}".format(x.shape, y.shape))
271 if x.ndim > 2 or y.ndim > 2:
272 raise ValueError("x and y can be no greater than 2-D, but have "
ValueError: x and y must have same first dimension, but have shapes (29000,) and (1,)
First thing to note (and this is minor) is that astropy is not required to run your code. So, you can simplify the import statements.
import matplotlib.pylab as plt
import numpy as np
import scipy.constants
%matplotlib inline
#Importing constants to use.
h = scipy.constants.h
c = scipy.constants.c
k = scipy.constants.k
wavelengths= np.arange(1000,30000,100)*1.e-10 # here, I chose steps of 100, because plotting 29000 datapoints takes a while
temperature=[3000,4000,5000,6000]
Secondly, to tidy up the loop a bit, you can write a helper function that you call from within your loop:
def f(lam, T):
    return ((2*h*c**2)/(lam**5))*((1)/(np.exp((h*c)/(lam*k*T))-1))
Now you can collect the output of your function, together with the input parameters, e.g. in a list of tuples:
outputs = []
for lam in wavelengths:
    for T in temperature:
        outputs.append((lam, T, f(lam, T)))
Since you vary both wavelength and temperature, a 3d plot makes sense:
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111, projection='3d')
ax.plot(*zip(*outputs))
An alternative would be to display the data as an image, using colour to indicate the function output.
I am also including an alternative method to generate the data in this one. Since the function f can take arrays as input, you can feed one temperature at a time, and with it, all the wavelengths simultaneously.
# initialise output as array with proper shape
outputs = np.zeros((len(wavelengths), len(temperature)))
for i, T in enumerate(temperature):
    outputs[:, i] = f(wavelengths, T)
The output now is a large matrix, which you can visualise as an image:
fig = plt.figure()
ax = fig.add_subplot(111)
ax.imshow(outputs, aspect=10e8, interpolation='none',
          extent=[
              np.min(temperature),
              np.max(temperature),
              np.max(wavelengths),
              np.min(wavelengths)]
          )
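If a plain 2-D view is enough, you can also reuse f to draw one labelled curve per temperature over the full wavelength array (a small sketch using the names defined above):
fig = plt.figure()
ax = fig.add_subplot(111)
for T in temperature:
    ax.plot(wavelengths, f(wavelengths, T), label=f'{T} K')  # one Planck curve per temperature
ax.set_xlabel('wavelength (m)')
ax.set_ylabel('spectral radiance')
ax.legend()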

How to do clustering with the k-means algorithm on an imported data set, with proper scaling of both axes

I'm new to data science, Python, and Jupyter Notebook, and I'm currently studying how to do k-means clustering on a data set. I came across ways in which one can introduce data
from pandas import DataFrame

Data = {'x': [25,34,22,27,33,33,31,22,35,34,67,54,57,43,50,57,59,52,65,47,49,48,35,33,44,45,38,43,51,46],
        'y': [79,51,53,78,59,74,73,57,69,75,51,32,40,47,53,36,35,58,59,50,25,20,14,12,20,5,29,27,8,7]
        }
df = DataFrame(Data, columns=['x','y'])
and the use of blobs
from sklearn.datasets import make_blobs

data = make_blobs(n_samples=200, n_features=2, centers=4, cluster_std=1.6, random_state=50)
but I would like to know how to write proper code that imports a CSV file from my computer and does k-means with scaling. Thank you in advance; I could not find relevant blogs to help me.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.cluster import KMeans
data=pd.read_csv("C:/Users/Dulangi/Downloads/winequality-red.csv")
data
data["alcohol"]=data["alcohol"]/data["alcohol"].max()
data["quality"]=data["quality"]/data["quality"].max()
plt.scatter(data["alcohol"],data['quality'])
plt.xlabel("alcohol")
plt.ylabel('quality')
plt.show()
x=data.copy()
kmeans=KMeans(2)
kmeans.fit(x)
clusters=x.copy()
clusters['cluster_pred']=kmeans.fit_predict(x)
plt.scatter(clusters["alcohol"],clusters['quality'],c=clusters['cluster_pred'],cmap='rainbow')
plt.xlabel("alcohol")
plt.ylabel('quality')
plt.show()
from sklearn import preprocessing
x_scaled=preprocessing.scale(x)
#x_scaled
wcss=[]
for i in range(1,30):
    kmeans = KMeans(i)
    kmeans.fit(x_scaled)
    wcss.append(kmeans.inertia_)
wcss
plt.plot(range(1,30),wcss)
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
This is what I tried.
The error I got:
ValueError Traceback (most recent call last)
<ipython-input-12-d4955ce8615e> in <module>
39
40
---> 41 plt.plot(range(1,30),wcss)
42 plt.xlabel('Number of clusters')
43 plt.ylabel('WCSS')
~\Anaconda3\lib\site-packages\matplotlib\pyplot.py in plot(scalex, scaley, data, *args, **kwargs)
2787 return gca().plot(
2788 *args, scalex=scalex, scaley=scaley, **({"data": data} if data
-> 2789 is not None else {}), **kwargs)
2790
2791
~\Anaconda3\lib\site-packages\matplotlib\axes\_axes.py in plot(self, scalex, scaley, data, *args, **kwargs)
1664 """
1665 kwargs = cbook.normalize_kwargs(kwargs, mlines.Line2D._alias_map)
-> 1666 lines = [*self._get_lines(*args, data=data, **kwargs)]
1667 for line in lines:
1668 self.add_line(line)
~\Anaconda3\lib\site-packages\matplotlib\axes\_base.py in __call__(self, *args, **kwargs)
223 this += args[0],
224 args = args[1:]
--> 225 yield from self._plot_args(this, kwargs)
226
227 def get_next_color(self):
~\Anaconda3\lib\site-packages\matplotlib\axes\_base.py in _plot_args(self, tup, kwargs)
389 x, y = index_of(tup[-1])
390
--> 391 x, y = self._xy_from_xy(x, y)
392
393 if self.command == 'plot':
~\Anaconda3\lib\site-packages\matplotlib\axes\_base.py in _xy_from_xy(self, x, y)
268 if x.shape[0] != y.shape[0]:
269 raise ValueError("x and y must have same first dimension, but "
--> 270 "have shapes {} and {}".format(x.shape, y.shape))
271 if x.ndim > 2 or y.ndim > 2:
272 raise ValueError("x and y can be no greater than 2-D, but have "
ValueError: x and y must have same first dimension, but have shapes (29,) and (1,)
You can do this easily using scikit-learn.
import pandas as pd
data=pd.read_csv('myfile.csv')
df=pd.DataFrame(data,index=None)
df.head()
Check if rows contain any null values
df.isnull().sum()
Drop all the rows with null values if any
df.dropna(inplace=True)
Normalize data
Normalize the data with MinMax scaling provided by sklearn
from sklearn import preprocessing
minmax_processed = preprocessing.MinMaxScaler().fit_transform(df.drop('title',axis=1))
df_numeric_scaled = pd.DataFrame(minmax_processed, index=df.index, columns=df.columns[:-1])
df_numeric_scaled.head()
from sklearn.cluster import KMeans
Apply K-Means Clustering
What k to choose?
Let's fit cluster size 1 to 20 on our data and take a look at the corresponding score value.
Nc = range(1, 20)
kmeans = [KMeans(n_clusters=i) for i in Nc]
score = [kmeans[i].fit(df_numeric_scaled).score(df_numeric_scaled) for i in range(len(kmeans))]
These score values signify how far our observations are from the cluster centers. We want to keep this score value close to 0; a large positive or a large negative value would indicate that the cluster centers are far from the observations.
Based on these score values, we plot an elbow curve to decide which cluster size is optimal. Note that we are dealing with a tradeoff between cluster size (and hence the computation required) and the relative accuracy.
import matplotlib.pyplot as pl
pl.plot(Nc,score)
pl.xlabel('Number of Clusters')
pl.ylabel('Score')
pl.title('Elbow Curve')
pl.show()
Fit K-Means for clustering with k=5
kmeans = KMeans(n_clusters=5)
kmeans.fit(df_numeric_scaled)
df['cluster'] = kmeans.labels_
df.head()
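If you want to see the resulting labels on the two columns from your original code, here is a minimal sketch, assuming the wine-quality DataFrame has 'alcohol' and 'quality' columns and that df['cluster'] was set as above:
import matplotlib.pyplot as plt

plt.scatter(df['alcohol'], df['quality'], c=df['cluster'], cmap='rainbow')
plt.xlabel('alcohol')
plt.ylabel('quality')
plt.title('K-means clusters (k=5)')
plt.show()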

error: unpack requires a buffer of 16 bytes

The following code gives me an error:
"error: unpack requires a buffer of 16 bytes"
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import cartopy
import cartopy.crs as ccrs
from cartopy.io.shapereader import Reader
# Reading the data from shapefiles .shp
basin = Reader('C:\\...\\BasinCOL2014.shp')
fig = plt.figure (figsize = (10,5))
ax = fig.add_subplot (1, 1, 1, projection = ccrs.PlateCarree(central_longitude=0, globe=None))
ax.set_extent ([-66.0, -80.0, -5.0, 13.0])
ax.gridlines (draw_labels = True)
# Aditional elements to display in map
ax.coastlines (resolution = '10m')
ax.add_feature (cartopy.feature.RIVERS, linewidth=4)
ax.add_geometries (basin.geometries(), crs = ccrs.Geodetic(), edgecolor = 't', facecolor = 'none')
I expect the output to be a map with all three elements: coastlines, rivers and the basin. I only get two of them (see image below).
[Image: partial results I'm getting from the code above]
Currently installed via Anaconda Navigator 1.9.7:
- Jupyter Notebook 5.7.8
- Cartopy 0.17.0
---------------------------------------------------------------------------
error Traceback (most recent call last)
<ipython-input-6-628330e4110c> in <module>
12 ax.coastlines (resolution = '10m')
13 ax.add_feature (cartopy.feature.RIVERS, linewidth=4)
---> 14 ax.add_geometries (basin.geometries(), crs = ccrs.Geodetic(), edgecolor = 't', facecolor = 'none')
C:\ProgramData\Anaconda3\lib\site-packages\cartopy\mpl\geoaxes.py in add_geometries(self, geoms, crs, **kwargs)
586 """
587 styler = kwargs.pop('styler', None)
--> 588 feature = cartopy.feature.ShapelyFeature(geoms, crs, **kwargs)
589 return self.add_feature(feature, styler=styler)
590
C:\ProgramData\Anaconda3\lib\site-packages\cartopy\feature\__init__.py in __init__(self, geometries, crs, **kwargs)
229 """
230 super(ShapelyFeature, self).__init__(crs, **kwargs)
--> 231 self._geoms = tuple(geometries)
232
233 def geometries(self):
C:\ProgramData\Anaconda3\lib\site-packages\cartopy\io\shapereader.py in geometries(self)
234 geometry_factory = self._geometry_factory
235 for i in range(self._reader.numRecords):
--> 236 shape = self._reader.shape(i)
237 yield _make_geometry(geometry_factory, shape)
238
C:\ProgramData\Anaconda3\lib\site-packages\shapefile.py in shape(self, i)
811 return k
812 shp.seek(offset)
--> 813 return self.__shape()
814
815 def shapes(self):
C:\ProgramData\Anaconda3\lib\site-packages\shapefile.py in __shape(self)
749 # Read m extremes and values
750 if shapeType in (13,15,18,23,25,28,31):
--> 751 (mmin, mmax) = unpack("<2d", f.read(16))
752 # Measure values less than -10e38 are nodata values according to the spec
753 record.m = []
error: unpack requires a buffer of 16 bytes
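The traceback comes from pyshp (the shapefile module that cartopy's Reader wraps) while it reads the measure (M) values of a record, which suggests the .shp file may be truncated or not fully written. As a hedged diagnostic sketch (the path is a placeholder, as in the question), you can read the file directly with pyshp to see its declared shape type and which record fails:
import shapefile  # pyshp, the library used by cartopy.io.shapereader

r = shapefile.Reader('C:\\...\\BasinCOL2014.shp')   # placeholder path from the question
print('declared shape type:', r.shapeType)          # types 13,15,18,23,25,28,31 carry M values
print('number of records:', r.numRecords)

for i in range(r.numRecords):
    try:
        r.shape(i)                                   # same call that fails in the traceback
    except Exception as exc:
        print('record', i, 'could not be read:', exc)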
