getting indices in numpy - python-3.x

Can someone find out what is wrong with the code below?
import numpy as np
data = np.recfromcsv("data.txt", delimiter=" ", names=['name', 'types', 'value'])
indices = np.where((data.name == 'david') * data.types.startswith('height'))
mean_value = np.mean(data.value[indices])
I want to calculate the mean weight and height for david and mark as follows:
david>> mean(weight_2005 and weight_2012), mean(height_2005 and height_2012)
mark>> mean(weight_2005 and weight_2012), mean(height_2005 and height_2012)
From the text (data.txt) file:
david weight_2005 50
david weight_2012 60
david height_2005 150
david height_2012 160
mark weight_2005 90
mark weight_2012 85
mark height_2005 160
mark height_2012 170
I am using python 3.2 and numpy 1.8
The above code raises the following TypeError:
TypeError: startswith first arg must be bytes or a tuple of bytes, not numpy.str_

With Python 3.2 and numpy 1.7, this line works:
indices = np.where((data.name == b'david') * data.types.startswith(b'height'))
data displays as:
rec.array([(b'david', b'weight_2005', 50),...],
dtype=[('name', 'S5'), ('types', 'S11'), ('value', '<i4')])
type(data.name[0]) is <class 'bytes'>.
b'height' works in Python 2.7 as well.
Another option is to convert all the data to unicode (Python 3 strings):
dtype=[('name','U5'), ('types', 'U11'), ('value', '<i4')]
dataU=data.astype(dtype=dtype)
indices = np.where((dataU.name == 'david') * dataU.types.startswith('height'))
or
data = np.recfromtxt('data.txt', delimiter=" ",
names=['name', 'types', 'value'], dtype=dtype)
It looks like recfromcsv does not take a dtype, but recfromtxt does.
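Putting it together, here is a minimal sketch of the calculation the question asks for, assuming the bytes-string approach above and the data.txt layout shown; np.char.startswith is used in place of a direct .startswith call, since a plain ndarray of byte strings has no string methods:
import numpy as np
data = np.recfromtxt('data.txt', delimiter=' ',
                     names=['name', 'types', 'value'])
for person in (b'david', b'mark'):
    for measure in (b'weight', b'height'):
        # rows matching both the person and the measurement prefix
        mask = (data.name == person) & np.char.startswith(data.types, measure)
        print(person, measure, np.mean(data.value[mask]))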

Related

Performing a Principal Component Analysis to reconstruct time series creates more values than expected

I want to do a Principal Component Analysis following this notebook to reconstruct the DJIA (I'm using alpha_vantage) from its components (found with Quandl). Yet, when reconstructing the values by multiplying the principal components by their weights, it seems that I create more values than the original dataframe has:
kernel_pca = KernelPCA(n_components=5).fit(df_z_components)
pca_5 = kernel_pca.transform(-daily_df_components)
weights = fn_weighted_average(kernel_pca.lambdas_)
reconstructed_values = np.dot(pca_5, weights)
Indeed, daily_df_components is created from the components of the DJIA via the Quandl API, which seems to have more data than the library I use to get the DJIA index itself, alpha_vantage.
Here is the full code
"""
Obtaining the components data from quandl
"""
import quandl
QUANDL_API_KEY = 'MYKEY'
quandl.ApiConfig.api_key = QUANDL_API_KEY
SYMBOLS = [
'AAPL', 'MMM', 'BA', 'AXP', 'CAT',
'CVX', 'CSCO', 'KO', 'DD', 'XOM',
'GS', 'HD', 'IBM', 'INTC', 'JNJ',
'JPM', 'MCD', 'MRK', 'MSFT', 'NKE',
'PFE', 'PG', 'UNH', 'UTX', 'TRV',
'VZ', 'V', 'WMT', 'WBA', 'DIS'
]
wiki_symbols = ['WIKI/%s'%symbol for symbol in SYMBOLS]
df_components = quandl.get(
wiki_symbols,
start_date='2017-01-01',
end_date='2017-12-31',
column_index=11)
df_components.columns = SYMBOLS
filled_df_components = df_components.fillna(method='ffill')
daily_df_components = filled_df_components.resample('24h').ffill()
daily_df_components = daily_df_components.fillna(method='bfill')
"""
Download the all-time DJIA dataset
"""
from alpha_vantage.timeseries import TimeSeries
# Update your Alpha Vantage API key here...
ALPHA_VANTAGE_API_KEY = 'MYKEY'
ts = TimeSeries(key=ALPHA_VANTAGE_API_KEY, output_format='pandas')
df, meta_data = ts.get_intraday(symbol='DIA',interval='1min', outputsize='full')
# Finding eigenvectors and eigenvalues
from sklearn.decomposition import KernelPCA
fn_weighted_average = lambda x: x/x.sum()
fn_z_score = lambda x: (x - x.mean())/x.std()
df_z_components = daily_df_components.apply(fn_z_score)
fitted_pca = KernelPCA().fit(df_z_components)
weighted_values = fn_weighted_average(fitted_pca.lambdas_)[:5]
# Reconstructing the Dow Average with PCA
import numpy as np
kernel_pca = KernelPCA(n_components=5).fit(df_z_components)
pca_5 = kernel_pca.transform(-daily_df_components)
weights = fn_weighted_average(kernel_pca.lambdas_)
reconstructed_values = np.dot(pca_5, weights)
# Combine PCA and Index to compare
df_combined = djia_2020_weird.copy()
df_combined['pca_5'] = reconstructed_values
But it returns:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-100-2808dc14f789> in <module>()
9 # Combine PCA and Index to compare
10 df_combined = djia_2020_weird.copy()
---> 11 df_combined['pca_5'] = reconstructed_values
12 df_combined = df_combined.apply(fn_z_score)
13 df_combined.plot(figsize=(12,8));
3 frames
/usr/local/lib/python3.6/dist-packages/pandas/core/internals/construction.py in sanitize_index(data, index)
746 if len(data) != len(index):
747 raise ValueError(
--> 748 "Length of values "
749 f"({len(data)}) "
750 "does not match length of index "
ValueError: Length of values (361) does not match length of index (14)
Indeed, reconstructed_values is 361 values long while df_combined is only 14 values long...
Here is this last dataframe:
DJI
date
2021-01-21 NaN
2021-01-22 311.37
2021-01-23 310.03
2021-01-24 310.03
2021-01-25 310.03
2021-01-26 309.01
2021-01-27 309.49
2021-01-28 302.17
2021-01-29 305.25
2021-01-30 299.20
2021-01-31 299.20
2021-02-01 299.20
2021-02-02 302.13
2021-02-03 307.86
Maybe the reason is that the notebook author was able to get the data for the whole year he was interested in, whereas when I pull the data I only seem to get two months?
Ahoy there, I'm the author of the notebook. It seems Quandl no longer provides historical prices of DJIA after the time of writing, and copyright wasn't granted to redistribute the data. For research, you may consider other free stock tickers to proxy DJIA.
The example usages have been updated in the repo to demonstrate KernelPCA, as explained here.
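Independently of the data-source issue, the ValueError itself comes from assigning a 361-row array to a 14-row frame. A minimal sketch of one way to combine mismatched series, assuming (as in the code above) that reconstructed_values lines up row-for-row with daily_df_components, is to carry that index along and join on shared dates instead of assigning a raw array:
import pandas as pd
# Wrap the reconstruction in a Series that remembers its dates...
pca_series = pd.Series(reconstructed_values,
                       index=daily_df_components.index, name='pca_5')
# ...then keep only the dates both sides share
df_combined = djia_2020_weird.join(pca_series, how='inner')
Note that with the date ranges shown (components from 2017, index quotes from 2021) the overlap may well be empty, which is itself a sign the two datasets don't cover the same period.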

KeyError: "None of [Index(['23/01/2020' ......,\n dtype='object', length=9050)] are in the [columns]"

I am learning pandas and matplotlib on my own by using some public dataset via
this api link
I'm using colab and below are my codes:
import datetime
import io
import json
import pandas as pd
import requests
import matplotlib.pyplot as plt
confirm_resp = requests.get('https://api.data.gov.hk/v2/filterq=%7B%22resource%22%3A%22http%3A%2F%2Fwww.chp.gov.hk%2Ffiles%2Fmisc%2Fenhanced_sur_covid_19_eng.csv%22%2C%22section%22%3A1%2C%22format%22%3A%22json%22%7D').content
confirm_df = pd.read_json(io.StringIO(confirm_resp.decode('utf-8')))
confirm_df.columns = confirm_df.columns.str.replace(" ", "_")
pd.to_datetime(confirm_df['Report_date'])
confirm_df.columns = ['Case_no', 'Report_date', 'Onset_date', 'Gender', 'Age',
'Name_of_hospital_admitted', 'Status', 'Resident', 'Case_classification', 'Confirmed_probable']
confirm_df = confirm_df.drop('Name_of_hospital_admitted', axis = 1)
confirm_df.head()
and this is what the dataframe looks like:
Case_no  Report_date  Onset_date  Gender  Age  Status      Resident         Case_classification  Confirmed_probable
1        23/01/2020   21/01/2020  M       39   Discharged  Non-HK resident  Imported case        Confirmed
2        23/01/2020   18/01/2020  M       56   Discharged  HK resident      Imported case        Confirmed
3        24/01/2020   20/01/2020  F       62   Discharged  Non-HK resident  Imported case        Confirmed
4        24/01/2020   23/01/2020  F       62   Discharged  Non-HK resident  Imported case        Confirmed
5        24/01/2020   23/01/2020  M       63   Discharged  Non-HK resident  Imported case        Confirmed
When I try to make a simple plot with the below code:
x = confirm_df['Report_date']
y = confirm_df['Case_classification']
confirm_df.plot(x, y)
It gives me the below error:
KeyError Traceback (most recent call last)
<ipython-input-17-e4139a9b5ef1> in <module>()
4 y = confirm_df['Case_classification']
5
----> 6 confirm_df.plot(x, y)
3 frames
/usr/local/lib/python3.6/dist-packages/pandas/plotting/_core.py in __call__(self, *args, **kwargs)
912 if is_integer(x) and not data.columns.holds_integer():
913 x = data_cols[x]
--> 914 elif not isinstance(data[x], ABCSeries):
915 raise ValueError("x must be a label or position")
916 data = data.set_index(x)
/usr/local/lib/python3.6/dist-packages/pandas/core/frame.py in __getitem__(self, key)
2910 if is_iterator(key):
2911 key = list(key)
-> 2912 indexer = self.loc._get_listlike_indexer(key, axis=1, raise_missing=True)[1]
2913
2914 # take() does not accept boolean indexers
/usr/local/lib/python3.6/dist-packages/pandas/core/indexing.py in _get_listlike_indexer(self, key, axis, raise_missing)
1252 keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr)
1253
-> 1254 self._validate_read_indexer(keyarr, indexer, axis, raise_missing=raise_missing)
1255 return keyarr, indexer
1256
/usr/local/lib/python3.6/dist-packages/pandas/core/indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
1296 if missing == len(indexer):
1297 axis_name = self.obj._get_axis_name(axis)
-> 1298 raise KeyError(f"None of [{key}] are in the [{axis_name}]")
1299
1300 # We (temporarily) allow for some missing keys with .loc, except in
KeyError: "*None of [Index(['23/01/2020', '23/01/2020', '24/01/2020', '24/01/2020', '24/01/2020',\n '26/01/2020', '26/01/2020', '26/01/2020', '29/01/2020', '29/01/2020',\n ...\n '05/01/2021', '05/01/2021', '05/01/2021', '05/01/2021', '05/01/2021',\n '05/01/2021', '05/01/2021', '05/01/2021', '05/01/2021', '05/01/2021'],\n dtype='object', length=9050)] are in the [column*s]"
I have tried to make the plot with and without converting Report_date to a datetime object, and I have tried substituting the x value with every column in the data frame, but all of them give me the same error.
I'd appreciate it if anyone can help me understand how to handle these issues, here and going forward. I've spent hours trying to resolve this but cannot find the answer.
I did not encounter this issue before when I downloaded some notebooks and datasets from Kaggle to follow along.
Thank you and happy new year.
First, you need to assign the converted date back to the column:
confirm_df['Report_date'] = pd.to_datetime(confirm_df['Report_date'])
Second, when the plot method is called on a dataframe object, you need to provide only the column names as arguments (1):
confirm_df.plot(x='Report_date', y='Case_classification')
But the above code still throws an error, because 'Case_classification' is not numeric data.
You are trying to plot datetime vs. categorical data, so a normal plot won't work, but something like this could work (2):
# I used only first 15 examples here, full dataset is kinda messy
confirm_df.iloc[:15, :].groupby(['Report_date', 'Case_classification']).size().unstack().plot.bar()
(1) pandas.DataFrame.plot
(2) How to plot categorical variable against a date column in Python
Several problems. First, the links were incorrect; I have edited them (probably just a copy/paste error). Second, you have to assign the converted datetime series back to the dataframe; use print(confirm_df.dtypes) to see the difference. Third, the dataset is not ordered by date, but matplotlib expects an ordered x-axis. Well, actually, the underlying problem was that the parser misinterpreted the datetime strings, so I added dayfirst=True to ensure that the dates are read correctly. Finally, what do you want to plot here? Just the cases by date? The number of cases per group by date? Your original code implies the former, but that is not really informative, is it?
import io
import pandas as pd
import requests
import matplotlib.pyplot as plt
print("starting download")
confirm_resp = requests.get('https://api.data.gov.hk/v2/filter?q=%7B%22resource%22%3A%22http%3A%2F%2Fwww.chp.gov.hk%2Ffiles%2Fmisc%2Fenhanced_sur_covid_19_eng.csv%22%2C%22section%22%3A1%2C%22format%22%3A%22json%22%7D').content
print("finished download")
confirm_df = pd.read_json(io.StringIO(confirm_resp.decode('utf-8')))
confirm_df.columns = confirm_df.columns.str.replace(" ", "_")
confirm_df['Report_date'] = pd.to_datetime(confirm_df['Report_date'], dayfirst=True)
confirm_df.columns = ['Case_no', 'Report_date', 'Onset_date', 'Gender', 'Age',
'Name_of_hospital_admitted', 'Status', 'Resident', 'Case_classification', 'Confirmed_probable']
confirm_df = confirm_df.drop('Name_of_hospital_admitted', axis = 1)
print(confirm_df.dtypes)
fig, ax = plt.subplots(figsize=(20, 5))
ax.plot(confirm_df['Report_date'], confirm_df['Case_classification'])
plt.tight_layout()
plt.show()
Sample output: [line plot of Case_classification against Report_date]
Some grouping and data aggregation might be more informative, but you have to decide what you want to display first before writing the code.
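For instance, one possible aggregation (a sketch only, not the only sensible choice) is the number of cases reported per day, which gives a numeric y-axis that plots cleanly:
daily_counts = confirm_df.groupby('Report_date').size()
fig, ax = plt.subplots(figsize=(12, 4))
daily_counts.plot(ax=ax)  # cases reported per day
ax.set_ylabel('Number of cases')
plt.tight_layout()
plt.show()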

Cannot create a numpy array using numpy's `full()` method and a python list

I can create a numpy array from a python list as follows:
>>> a = [1,2,3]
>>> b = np.array(a).reshape(3,1)
>>> print(b)
[[1]
[2]
[3]]
However, I don't know what causes the error in the following code:
Code :
>>> a = [1,2,3]
>>> b = np.full((3,1), a)
Error :
ValueError Traceback (most recent call last)
<ipython-input-275-1ab6c109dda4> in <module>()
1 a = [1,2,3]
----> 2 b = np.full((3,1), a)
3 print(b)
/usr/local/lib/python3.6/dist-packages/numpy/core/numeric.py in full(shape, fill_value, dtype, order)
324 dtype = array(fill_value).dtype
325 a = empty(shape, dtype, order)
--> 326 multiarray.copyto(a, fill_value, casting='unsafe')
327 return a
328
<__array_function__ internals> in copyto(*args, **kwargs)
ValueError: could not broadcast input array from shape (3) into shape (3,1)
Even though the list a has 3 elements and I expect a 3x1 numpy array, the full() method fails to deliver it.
I referred to numpy's broadcasting article too, but it is much more focused on arithmetic operations, so I couldn't get anything useful from it.
So it would be great if you could help me understand the difference between the above array creation methods, and the cause of the error too.
Numpy is unable to broadcast the two shapes together because your list is interpreted as a 'row vector' (np.array(a).shape == (3,)) while you are asking for a 'column vector' (shape (3, 1)). If you are set on using np.full, then you can shape your list as a column vector initially:
>>> import numpy as np
>>>
>>> a = [[1],[2],[3]]
>>> b = np.full((3,1), a)
Another option is to convert a into a numpy array ahead of time and add a new axis to match the desired output shape.
>>> a = [1,2,3]
>>> a = np.array(a)[:, np.newaxis]
>>> b = np.full((3,1), a)
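Either way, the result is the expected column vector:
>>> b
array([[1],
       [2],
       [3]])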

Get a function pickleable for using in Differential Evolution workers = -1

(I edited my original post in order to give a simpler example.)
I use differential evolution (DE) from SciPy to optimize certain parameters.
I would like to use all of my PC's processors for this task, so I am trying the option workers=-1.
The condition is that the function called by DE must be pickleable.
If I run the example in https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.differential_evolution.html#scipy.optimize.differential_evolution, the optimisation works.
from scipy.optimize import rosen, differential_evolution
import pickle
import dill
bounds = [(0,2), (0, 2)]
result = differential_evolution(rosen, bounds, updating='deferred',workers=-1)
result.x, result.fun
(array([1., 1.]), 0.0)
But if I define a custom function 'Ros_custom', the optimisation crashes (doesn't give a result):
def Ros_custom(X):
    x = X[0]
    y = X[1]
    a = 1. - x
    b = y - x*x
    return a*a + b*b*100
result = differential_evolution(Ros_custom, bounds, updating='deferred',workers=-1)
If I try to pickle.dumps and pickle.loads 'Ros_custom', I get the same behaviour (optimisation crash, no answer).
If I use dill:
Ros_pick_1=dill.dumps(Ros_custom)
Ros_pick_2=dill.loads(Ros_pick_1)
result = differential_evolution(Ros_pick_2, bounds, updating='deferred',workers=-1)
result.x, result.fun
I get the following error message:
PicklingError: Can't pickle <function Ros_custom at 0x0000020247F04C10>: it's not the same object as __main__.Ros_custom
My questions are: why do I get this error, and is there a way to make 'Ros_custom' pickleable so that all the PC's processors can be used in DE?
Thank you in advance for any advice.
Two things:
I'm not able to reproduce the error you are seeing unless I first pickle/unpickle the custom function.
There's no need to pickle/unpickle the custom function before passing it to the solver.
This seems to work for me. Python 3.6.12 and scipy 1.5.2:
>>> from scipy.optimize import rosen, differential_evolution
>>> bounds = [(0,2), (0, 2)]
>>>
>>> def Ros_custom(X):
... x = X[0]
... y = X[1]
... a = 1. - x
... b = y - x*x
... return a*a + b*b*100
...
>>> result = differential_evolution(Ros_custom, bounds, updating='deferred',workers=-1)
>>> result.x, result.fun
(array([1., 1.]), 0.0)
>>>
>>> result
fun: 0.0
message: 'Optimization terminated successfully.'
nfev: 4953
nit: 164
success: True
x: array([1., 1.])
>>>
I can even nest a function inside of the custom objective:
>>> def foo(a,b):
... return a*a + b*b*100
...
>>> def custom(X):
... x,y = X[0],X[1]
... return foo(1.-x, y-x*x)
...
>>> result = differential_evolution(custom, bounds, updating='deferred',workers=-1)
>>> result
fun: 0.0
message: 'Optimization terminated successfully.'
nfev: 4593
nit: 152
success: True
x: array([1., 1.])
So, for me, at least the code works as expected.
You should have no need to serialize/deserialize the function ahead of its use in scipy. Yes, the function needs to be picklable, but scipy will do that for you. Basically, what's happening under the covers is that your function gets serialized, passed to multiprocessing as a string, distributed to the processors, and then unpickled and used on the target processors.
Like this, for 4 sets of inputs, run one per processor:
>>> import multiprocessing as mp
>>> res = mp.Pool().map(custom, [(0,1), (1,2), (4,9), (3,4)])
>>> list(res)
[101.0, 100.0, 4909.0, 2504.0]
>>>
Older versions of multiprocessing had difficulty serializing functions defined in the interpreter, and often needed to have the code executed in a __main__ block. If you are on windows, this is still often the case... and you might also need to call mp.freeze_support(), depending on how the code in scipy is implemented.
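For reference, here is a minimal sketch of that __main__-block pattern, using the names from the question (mp.freeze_support() is harmless elsewhere and only matters for frozen Windows executables):
import multiprocessing as mp
from scipy.optimize import differential_evolution

def Ros_custom(X):
    x, y = X[0], X[1]
    a = 1. - x
    b = y - x*x
    return a*a + b*b*100

if __name__ == '__main__':
    mp.freeze_support()  # needed only for frozen Windows builds
    bounds = [(0, 2), (0, 2)]
    result = differential_evolution(Ros_custom, bounds,
                                    updating='deferred', workers=-1)
    print(result.x, result.fun)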
I tend to like dill (I'm the author) because it can serialize a broader range of objects than pickle. However, as scipy uses multiprocessing, which uses pickle... I often choose to use mystic (I'm the author), which uses multiprocess (I'm the author), which uses dill. Very roughly, equivalent codes, but they all work with dill instead of pickle.
>>> from mystic.solvers import diffev2
>>> from pathos.pools import ProcessPool
>>> diffev2(custom, bounds, npop=40, ftol=1e-10, map=ProcessPool().map)
Optimization terminated successfully.
Current function value: 0.000000
Iterations: 42
Function evaluations: 1720
array([1.00000394, 1.00000836])
With mystic, you get some additional nice features, like a monitor:
>>> from mystic.monitors import VerboseMonitor
>>> mon = VerboseMonitor(5,5)
>>> diffev2(custom, bounds, npop=40, ftol=1e-10, itermon=mon, map=ProcessPool().map)
Generation 0 has ChiSquare: 0.065448
Generation 0 has fit parameters:
[0.769543181527466, 0.5810893880113548]
Generation 5 has ChiSquare: 0.065448
Generation 5 has fit parameters:
[0.588156685059123, -0.08325052939774935]
Generation 10 has ChiSquare: 0.060129
Generation 10 has fit parameters:
[0.8387858177101133, 0.6850849855634057]
Generation 15 has ChiSquare: 0.001492
Generation 15 has fit parameters:
[1.0904350077743412, 1.2027007403275813]
Generation 20 has ChiSquare: 0.001469
Generation 20 has fit parameters:
[0.9716429877952866, 0.9466681129902448]
Generation 25 has ChiSquare: 0.000114
Generation 25 has fit parameters:
[0.9784047411865372, 0.9554056558210251]
Generation 30 has ChiSquare: 0.000000
Generation 30 has fit parameters:
[0.996105436348129, 0.9934091068974504]
Generation 35 has ChiSquare: 0.000000
Generation 35 has fit parameters:
[0.996589586891175, 0.9938925277204567]
Generation 40 has ChiSquare: 0.000000
Generation 40 has fit parameters:
[1.0003791956048833, 1.0007133195321427]
Generation 45 has ChiSquare: 0.000000
Generation 45 has fit parameters:
[1.0000170425596364, 1.0000396089375592]
Generation 50 has ChiSquare: 0.000000
Generation 50 has fit parameters:
[0.9999013984263114, 0.9998041148375927]
STOP("VTRChangeOverGeneration with {'ftol': 1e-10, 'gtol': 1e-06, 'generations': 30, 'target': 0.0}")
Optimization terminated successfully.
Current function value: 0.000000
Iterations: 54
Function evaluations: 2200
array([0.99999186, 0.99998338])
>>>
All of the above are running in parallel.
So, in summary, the code should work as is (and without pre-pickling), unless perhaps you are on Windows, where you might need to use freeze_support and run the code in a __main__ block.
Writing the function in a separate module worked for me.
Create rosen_custom.py with this code inside:
import numpy as np

def rosen(x):
    x = np.array(x)
    r = np.sum(100.0 * (x[1:] - x[:-1]**2.0)**2.0 + (1 - x[:-1])**2.0,
               axis=0)
    return r
Then use it in DE:
from scipy.optimize import differential_evolution
from rosen_custom import rosen
import numpy as np
bounds = [(0,2), (0, 2), (0, 2), (0, 2), (0, 2)]
result = differential_evolution(rosen, bounds,
                                updating='deferred', workers=-1)
print(result.x, result.fun)

Don't understand error message (basic sklearn command)

I'm new to Python and programming in general, and I wanted to practice a little with linear regression in one variable.
I'm currently following this tutorial:
https://www.youtube.com/watch?v=8jazNUpO3lQ&list=PLeo1K3hjS3uvCeTYTeyfe0-rN5r8zn9rw&index=2
and I am doing exactly what he does.
I did, however, encounter an error when running the code below
(for simplicity, I prefix output lines with '--'; I used Jupyter Notebook).
At the end I encountered a long list of errors when running 'reg.predict(3300)'.
I don't understand what went wrong.
Can someone help me out?
Cheers!
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
df = pd.read_csv("homeprices.csv")
df
--area price
0 2600 550000
1 3000 565000
2 3200 610000
3 3600 680000
4 4000 725000
%matplotlib inline
plt.xlabel('area(sqr ft)')
plt.ylabel('price(US$)')
plt.scatter(df.area, df.price, color='red', marker = '+')
--<matplotlib.collections.PathCollection at 0x2e823ce66a0>
reg = linear_model.LinearRegression()
reg.fit(df[['area']],df.price)
--LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
normalize=False)
reg.predict(3300)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-16-ad5a8409ff75> in <module>
----> 1 reg.predict(3300)
~\Anaconda3\lib\site-packages\sklearn\linear_model\base.py in predict(self, X)
211 Returns predicted values.
212 """
--> 213 return self._decision_function(X)
214
215 _preprocess_data = staticmethod(_preprocess_data)
~\Anaconda3\lib\site-packages\sklearn\linear_model\base.py in _decision_function(self, X)
194 check_is_fitted(self, "coef_")
195
--> 196 X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
197 return safe_sparse_dot(X, self.coef_.T,
198 dense_output=True) + self.intercept_
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
543 "Reshape your data either using array.reshape(-1, 1) if "
544 "your data has a single feature or array.reshape(1, -1) "
--> 545 "if it contains a single sample.".format(array))
546 # If input is 1D raise error
547 if array.ndim == 1:
ValueError: Expected 2D array, got scalar array instead:
array=3300.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
Try reg.predict([[3300]]). The API used to allow a scalar value, but now you need to give a 2D array.
In reg.fit(df[['area']], df.price) we fit X with a 2D array, so we need to pass a 2D array to reg.predict for X too. Hence:
reg.predict([[3300]])
'Expected 2D array, got scalar array instead' is spelled out in the error message itself, so kindly change the call accordingly. Just write it like this:
reg.predict([[3300]])
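If you prefer to follow the error message literally, reshaping an array input works just as well; both calls pass a single sample with a single feature:
import numpy as np
reg.predict(np.array([3300]).reshape(-1, 1))  # equivalent to reg.predict([[3300]])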
