Strange "reindexing error" converting Series to DataFrame - python-3.x

I have two Series objects, which from my perspective look exactly the same, except they contain different data. I have attempted to convert them to DataFrames and to put them both in the same DataFrame as separate columns. For some reason I cannot fathom, one of the Series will be converted happily to a DataFrame and the other one refuses to be converted when placed in a container (list or dict). I get a reindexing error, but there are no duplicates in the index of either Series.
import pickle
import pandas as pd
s1 = pickle.load(open('s1.p', 'rb'))
s2 = pickle.load(open('s2.p', 'rb'))
print(s1.head(10))
print(s2.head(10))
pd.DataFrame(s1) # <--- works fine
pd.DataFrame(s2) # <--- works fine
pd.DataFrame([s1]) # <--- works fine
# pd.DataFrame([s2]) # <--- doesn't work
# pd.DataFrame([s1, s2]) # <--- doesn't work
pd.DataFrame({s1.name: s1}) # <--- works fine
pd.DataFrame({s2.name: s2}) # <--- works fine
pd.DataFrame({s1.name: s1, s2.name: s1}) # <--- works fine
# pd.DataFrame({s1.name: s1, s2.name: s2}) # <--- doesn't work
Here is the output. Although you can't see it here, there is overlap between the index values; they are just in a different order. I want the indexes to be matched up when I combine the Series into the same DataFrame.
id
801120 42.01
801138 50.18
801139 50.01
802101 53.77
802110 56.52
802112 47.37
802113 46.52
802114 46.58
802115 42.59
802117 40.85
Name: age, dtype: float64
id
A32067 0.39083
A32195 0.28506
A01685 0.36432
A11124 0.55649
A32020 0.41524
A32021 0.43788
A32098 0.49206
A00699 0.37515
A32158 0.58793
A14139 0.47413
Name: lh_vtx_000001, dtype: float64
Traceback when the final line is uncommented:
Traceback (most recent call last):
File "/Users/sm2286/Documents/Vertex/test.py", line 18, in <module>
pd.DataFrame({s1.name: s1, s2.name: s2}) # <--- doesn't work
File "/Users/sm2286/anaconda3/lib/python3.5/site-packages/pandas/core/frame.py", line 224, in __init__
mgr = self._init_dict(data, index, columns, dtype=dtype)
File "/Users/sm2286/anaconda3/lib/python3.5/site-packages/pandas/core/frame.py", line 360, in _init_dict
return _arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
File "/Users/sm2286/anaconda3/lib/python3.5/site-packages/pandas/core/frame.py", line 5236, in _arrays_to_mgr
arrays = _homogenize(arrays, index, dtype)
File "/Users/sm2286/anaconda3/lib/python3.5/site-packages/pandas/core/frame.py", line 5534, in _homogenize
v = v.reindex(index, copy=False)
File "/Users/sm2286/anaconda3/lib/python3.5/site-packages/pandas/core/series.py", line 2287, in reindex
return super(Series, self).reindex(index=index, **kwargs)
File "/Users/sm2286/anaconda3/lib/python3.5/site-packages/pandas/core/generic.py", line 2229, in reindex
fill_value, copy).__finalize__(self)
File "/Users/sm2286/anaconda3/lib/python3.5/site-packages/pandas/core/generic.py", line 2247, in _reindex_axes
copy=copy, allow_dups=False)
File "/Users/sm2286/anaconda3/lib/python3.5/site-packages/pandas/core/generic.py", line 2341, in _reindex_with_indexers
copy=copy)
File "/Users/sm2286/anaconda3/lib/python3.5/site-packages/pandas/core/internals.py", line 3586, in reindex_indexer
self.axes[axis]._can_reindex(indexer)
File "/Users/sm2286/anaconda3/lib/python3.5/site-packages/pandas/indexes/base.py", line 2293, in _can_reindex
raise ValueError("cannot reindex from a duplicate axis")
ValueError: cannot reindex from a duplicate axis
Traceback when line 13 is uncommented:
Traceback (most recent call last):
File "/Users/sm2286/Documents/Vertex/test.py", line 13, in <module>
pd.DataFrame([s2]) # <--- doesn't work
File "/Users/sm2286/anaconda3/lib/python3.5/site-packages/pandas/core/frame.py", line 263, in __init__
arrays, columns = _to_arrays(data, columns, dtype=dtype)
File "/Users/sm2286/anaconda3/lib/python3.5/site-packages/pandas/core/frame.py", line 5359, in _to_arrays
dtype=dtype)
File "/Users/sm2286/anaconda3/lib/python3.5/site-packages/pandas/core/frame.py", line 5453, in _list_of_series_to_arrays
indexer = indexer_cache[id(index)] = index.get_indexer(columns)
File "/Users/sm2286/anaconda3/lib/python3.5/site-packages/pandas/indexes/base.py", line 2082, in get_indexer
raise InvalidIndexError('Reindexing only valid with uniquely'
pandas.indexes.base.InvalidIndexError: Reindexing only valid with uniquely valued Index objects

After more investigation, the difference between the Series was that the latter contained missing values. Removing them fixed the issue; the likely explanation is that an index containing more than one NaN label is not considered unique, which triggers the "duplicate axis" error even though no ordinary labels repeat.
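For anyone hitting the same wall, a minimal diagnostic sketch (assuming s1 and s2 from the snippet above, and a reasonably recent pandas where Index.isna/notna exist) is to check the index itself for missing labels:
print(s2.index.is_unique)        # False if any label, including NaN, repeats
print(s2.index.isna().sum())     # count of missing labels in the index
s2_clean = s2[s2.index.notna()]  # keep only entries with a real index label
pd.DataFrame({s1.name: s1, s2_clean.name: s2_clean})  # reindexing now succeeds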

Related

KeyError: 'longitude' when reading from csv file [duplicate]

I have successfully read a csv file using pandas. When I try to print a particular column from the data frame I get a KeyError. Here I am sharing the code with the error.
import pandas as pd
reviews_new = pd.read_csv("D:\\aviva.csv")
reviews_new['review']
reviews_new['review']
Traceback (most recent call last):
File "<ipython-input-43-ed485b439a1c>", line 1, in <module>
reviews_new['review']
File "C:\Users\30216\AppData\Local\Continuum\Anaconda2\lib\site-packages\pandas\core\frame.py", line 1997, in __getitem__
return self._getitem_column(key)
File "C:\Users\30216\AppData\Local\Continuum\Anaconda2\lib\site-packages\pandas\core\frame.py", line 2004, in _getitem_column
return self._get_item_cache(key)
File "C:\Users\30216\AppData\Local\Continuum\Anaconda2\lib\site-packages\pandas\core\generic.py", line 1350, in _get_item_cache
values = self._data.get(item)
File "C:\Users\30216\AppData\Local\Continuum\Anaconda2\lib\site-packages\pandas\core\internals.py", line 3290, in get
loc = self.items.get_loc(item)
File "C:\Users\30216\AppData\Local\Continuum\Anaconda2\lib\site-packages\pandas\indexes\base.py", line 1947, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas\index.pyx", line 137, in pandas.index.IndexEngine.get_loc (pandas\index.c:4154)
File "pandas\index.pyx", line 159, in pandas.index.IndexEngine.get_loc (pandas\index.c:4018)
File "pandas\hashtable.pyx", line 675, in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:12368)
File "pandas\hashtable.pyx", line 683, in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:12322)
KeyError: 'review'
Can someone help me with this?
I think the best first step is to investigate what the real column names are; converting them to a list makes stray whitespace or similar problems easier to spot:
print (reviews_new.columns.tolist())
I think there can be two problems:
1. Whitespace in the column names (and maybe in the data as well)
One solution is to strip the whitespace from the column names:
reviews_new.columns = reviews_new.columns.str.strip()
Or add the parameter skipinitialspace to read_csv:
reviews_new = pd.read_csv("D:\\aviva.csv", skipinitialspace=True)
2. A separator different from the default ,
The solution is to add the parameter sep:
# sep is ';'
reviews_new = pd.read_csv("D:\\aviva.csv", sep=';')
# sep is whitespace
reviews_new = pd.read_csv("D:\\aviva.csv", sep=r'\s+')
reviews_new = pd.read_csv("D:\\aviva.csv", delim_whitespace=True)
EDIT:
Your output shows whitespace in the column names, so you need solution 1:
print (reviews_new.columns.tolist())
['Name', ' Date', ' review']
^ ^
import pandas as pd
df=pd.read_csv("file.txt", skipinitialspace=True)
df.head()
df['review']

How to graph 2 column dataframe into multiple violin plots with seaborn

I'm struggling to get seaborn to work for me. I've looked at several tutorials, and nothing seems to be working. The first column is float values, the second column is class labels. I want each class label to have its own violin plot. There is class imbalance. How do I get the second column to split up into separate violin plots, with the floats from its neighboring column assigned to those groups?
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
data = <read files and append [float, str] to list>
data = pd.DataFrame(data=data, columns=['Slope', 'Class'])
Slope Class
0 0.1204778488753736 A
1 -0.05555463166744862 B
3 0.12801810567575655 A
4 -0.05620886965822473 B
... ... ...
1126 0.10525394490595655 D
1127 0.10174103119847132 E
1128 -0.03527457290513986 C
1129 -0.035187264760902316 D
1130 -0.03407274349346173 E
sns.violinplot(data=data)
It displays: [screenshot not preserved here]
I'm running out of ideas of what to try. Most things end up with the error:
Traceback (most recent call last):
File "c:\Path\Slope Graphs.py", line 43, in <module>
main(slopes1, slopes2)
File "c:\Path\Slope Graphs.py", line 26, in main
sns.violinplot(x='Class', y='Slope', data=data)
File "C:\...\AppData\Local\Programs\Python\Python38-32\lib\site-packages\seaborn\categorical.py", line 2384, in violinplot
plotter = _ViolinPlotter(x, y, hue, data, order, hue_order,
File "C:\...\AppData\Local\Programs\Python\Python38-32\lib\site-packages\seaborn\categorical.py", line 554, in __init__
self.estimate_densities(bw, cut, scale, scale_hue, gridsize)
File "C:\...\AppData\Local\Programs\Python\Python38-32\lib\site-packages\seaborn\categorical.py", line 620, in estimate_densities
kde, bw_used = self.fit_kde(kde_data, bw)
File "C:\...\AppData\Local\Programs\Python\Python38-32\lib\site-packages\seaborn\categorical.py", line 705, in fit_kde
kde = stats.gaussian_kde(x, bw)
File "C:\...\AppData\Local\Programs\Python\Python38-32\lib\site-packages\scipy\stats\kde.py", line 209, in __init__
self.set_bandwidth(bw_method=bw_method)
File "C:\...\AppData\Local\Programs\Python\Python38-32\lib\site-packages\scipy\stats\kde.py", line 565, in set_bandwidth
self._compute_covariance()
File "C:\...\AppData\Local\Programs\Python\Python38-32\lib\site-packages\scipy\stats\kde.py", line 574, in _compute_covariance
self._data_covariance = atleast_2d(cov(self.dataset, rowvar=1,
File "<__array_function__ internals>", line 5, in cov
File "C:\...\AppData\Local\Programs\Python\Python38-32\lib\site-packages\numpy\lib\function_base.py", line 2422, in cov
avg, w_sum = average(X, axis=1, weights=w, returned=True)
File "<__array_function__ internals>", line 5, in average
File "C:\...\AppData\Local\Programs\Python\Python38-32\lib\site-packages\numpy\lib\function_base.py", line 420, in average
scl = wgt.sum(axis=axis, dtype=result_dtype)
File "C:\...\AppData\Local\Programs\Python\Python38-32\lib\site-packages\numpy\core\_methods.py", line 38, in _sum
return umr_sum(a, axis, dtype, out, keepdims, initial, where)
TypeError: No loop matching the specified signature and casting was found for ufunc add
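The traceback is informative: it fails inside numpy's add ufunc while seaborn's KDE is averaging the data, which is what typically happens when the value column holds strings instead of floats. A minimal sketch of the usual fix (assuming data is built as above, with Slope read in as text):
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Coerce Slope to float; anything unparseable becomes NaN and is dropped.
data['Slope'] = pd.to_numeric(data['Slope'], errors='coerce')
data = data.dropna(subset=['Slope'])

# Name the columns explicitly: one violin per class, values from Slope.
sns.violinplot(x='Class', y='Slope', data=data)
plt.show()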

Python MEMORY ERROR when loading 2455 CSV files (42GB) as pandas dataframe

Good day. I have 42GB of data in a sequence of 2455 CSV files.
I am trying to import the data sequentially using a loop into a pd.DataFrame for analysis.
I have tried it with 3 files and it works well.
from glob import glob
import pandas as pd
# Import data into DF
filenames = glob(r'Z:\PersonalFolders\AllData\*.csv')
df_trial = [pd.read_csv(f) for f in filenames]
df_trial
I am getting the following error; the traceback is copy-pasted below. Please help.
df_trial = [pd.read_csv(f) for f in filenames]
Traceback (most recent call last):
File "<ipython-input-23-0438182db491>", line 1, in <module>
df_trial = [pd.read_csv(f) for f in filenames]
File "<ipython-input-23-0438182db491>", line 1, in <listcomp>
df_trial = [pd.read_csv(f) for f in filenames]
File "C:\Users\WorkStation\Anaconda3\lib\site-packages\pandas\io\parsers.py", line 676, in parser_f
return _read(filepath_or_buffer, kwds)
File "C:\Users\WorkStation\Anaconda3\lib\site-packages\pandas\io\parsers.py", line 454, in _read
data = parser.read(nrows)
File "C:\Users\WorkStation\Anaconda3\lib\site-packages\pandas\io\parsers.py", line 1148, in read
df = DataFrame(col_dict, columns=columns, index=index)
File "C:\Users\WorkStation\Anaconda3\lib\site-packages\pandas\core\frame.py", line 435, in __init__
mgr = init_dict(data, index, columns, dtype=dtype)
File "C:\Users\WorkStation\Anaconda3\lib\site-packages\pandas\core\internals\construction.py", line 254, in init_dict
return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
File "C:\Users\WorkStation\Anaconda3\lib\site-packages\pandas\core\internals\construction.py", line 74, in arrays_to_mgr
return create_block_manager_from_arrays(arrays, arr_names, axes)
File "C:\Users\WorkStation\Anaconda3\lib\site-packages\pandas\core\internals\managers.py", line 1670, in create_block_manager_from_arrays
blocks = form_blocks(arrays, names, axes)
File "C:\Users\WorkStation\Anaconda3\lib\site-packages\pandas\core\internals\managers.py", line 1726, in form_blocks
float_blocks = _multi_blockify(items_dict["FloatBlock"])
File "C:\Users\WorkStation\Anaconda3\lib\site-packages\pandas\core\internals\managers.py", line 1820, in _multi_blockify
values, placement = _stack_arrays(list(tup_block), dtype)
File "C:\Users\WorkStation\Anaconda3\lib\site-packages\pandas\core\internals\managers.py", line 1848, in _stack_arrays
stacked = np.empty(shape, dtype=dtype)
MemoryError: Unable to allocate 107. MiB for an array with shape (124, 113012) and data type float64
There are a number of things you can do.
First, only process one dataframe at a time:
filenames = glob(r'Z:\PersonalFolders\AllData\*.csv')
for f in filenames:
    df = pd.read_csv(f)
    process(df)
Second, if that's not possible, you can try to reduce the amount of memory used when loading the dataframes in a variety of ways (smaller dtypes for numeric columns, omitting columns you don't need, and more). See https://pythonspeed.com/articles/pandas-load-less-data/ for some starting points on these techniques.
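As a hedged illustration of that second point (the column names and dtypes below are made up, not from the original data), read_csv can be told up front to load fewer columns and smaller numeric types:
import pandas as pd

df = pd.read_csv(
    'one_file.csv',
    usecols=['timestamp', 'sensor_id', 'value'],       # skip columns you don't need
    dtype={'sensor_id': 'int32', 'value': 'float32'},  # half the default 64-bit footprint
)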
Thanks to all.
I was able to get the load working by using the nrows argument:
filenames = glob(r'Z:\PersonalFolders\AllData\*.csv')
df_2019 = []
for filename in filenames:
    df = pd.read_csv(filename, index_col=None, header=0, nrows=1000)
    df_2019.append(df)
frame = pd.concat(df_2019, axis=0, ignore_index=True)
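Note that nrows=1000 keeps only the first 1000 rows of each file, so this loads a sample rather than the full 42GB. If every row matters, a sketch of a chunked alternative (process is a hypothetical per-chunk analysis function) that never holds more than one chunk in memory:
from glob import glob
import pandas as pd

for filename in glob(r'Z:\PersonalFolders\AllData\*.csv'):
    # chunksize makes read_csv return an iterator of DataFrames
    for chunk in pd.read_csv(filename, chunksize=100_000):
        process(chunk)  # hypothetical: aggregate, filter, or write out results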

ValueError after MinMaxScaler and Transform

I am experiencing difficulty in this area. I get a ValueError in the following code (I have tried solutions online, but to no avail).
Here's my original code, which raises a could-not-convert-string-to-float error
(ValueError: could not convert string to float: '3,1,0,0,0,1,0,1,89874,49.99'):
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
training_data_df = pd.read_csv('./data/sales_data_training.csv')
scaler = MinMaxScaler(feature_range=(0,1))
scaled_training= scaler.fit_transform(training_data_df)
scaled_training_df = pd.DataFrame(scaled_training,columns= training_data_df.columns.values)
My CSV Data:
"critic_rating,is_action,is_exclusive_to_us,is_portable,is_role_playing,is_sequel,is_sports,suitable_for_kids,total_earnings,unit_price"
"3.5,1,0,1,0,1,0,0,132717,59.99"
"4.5,0,0,0,0,1,1,0,83407,49.99"...
'3,1,0,0,0,1,0,1,89874,49.99'
I have 9 columns of data across 1000 rows (~9999 data points, with the first row being the header).
Regards,
Yuki
The full error is as follows:
Traceback (most recent call last):
File "C:/Users/YukiKawaii/PycharmProjects/PandasTest/module2_NN/test.py", line 6, in <module>
scaled_training= scaler.fit_transform(training_data_df)
File "C:\Users\YukiKawaii\Python\Python35\lib\site-packages\sklearn\base.py", line 517, in fit_transform
return self.fit(X, **fit_params).transform(X)
File "C:\Users\YukiKawaii\Python\Python35\lib\site-packages\sklearn\preprocessing\data.py", line 308, in fit
return self.partial_fit(X, y)
File "C:\Users\YukiKawaii\Python\Python35\lib\site-packages\sklearn\preprocessing\data.py", line 334, in partial_fit
estimator=self, dtype=FLOAT_DTYPES)
File "C:\Users\YukiKawaii\Python\Python35\lib\site-packages\sklearn\utils\validation.py", line 433, in check_array
array = np.array(array, dtype=dtype, order=order, copy=copy)
ValueError: could not convert string to float: '3,1,0,0,0,1,0,1,89874,49.99'
You should remove the "" and '' wrapped around each line in the csv file.
By default pd.read_csv() splits each line by , and thus it cannot convert the strings to floats while the "" and '' are there.
So the csv file should look as follows.
critic_rating,is_action,is_exclusive_to_us,is_portable,is_role_playing,is_sequel,is_sports,suitable_for_kids,total_earnings,unit_price
3.5,1,0,1,0,1,0,0,132717,59.99
4.5,0,0,0,0,1,1,0,83407,49.99
3,1,0,0,0,1,0,1,89874,49.99
I just verified by running your code after making the above change.
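If hand-editing the file is impractical, a small sketch (same file path as the question, standard library only) that strips the wrapping quotes on the fly before pandas parses the text:
import io
import pandas as pd

# Remove one layer of wrapping quotes (single or double) from each line,
# then parse the cleaned text as ordinary CSV.
with open('./data/sales_data_training.csv') as f:
    cleaned = '\n'.join(line.strip().strip('"\'') for line in f)

training_data_df = pd.read_csv(io.StringIO(cleaned))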

Why am I wrong to concatenate a matrix and a vector?

This is my code in Theano:
import theano
import theano.tensor as T

max_max = 200
beReplaced = T.matrix()
toReplace = T.matrix()
timeArray = T.arange(max_max)

def f(v, k, w):
    return T.concatenate([w[:k], v, w[k+1:]], axis=0)

result, _ = theano.scan(f,
                        sequences=[toReplace, timeArray],
                        outputs_info=beReplaced)
What I am trying to do is replace beReplaced with toReplace line by line. The way I do it is by concatenating the upper part of w, then v, then the lower part of w.
v is a line of toReplace.
Here is the error report:
Traceback (most recent call last):
File "/Users/qiansteven/Desktop/NLP/RNN/my.py", line 20, in <module>
outputs_info=np.zeros((5,5),dtype=np.float64))
File "/usr/local/lib/python2.7/site-packages/theano/scan_module/scan.py", line 745, in scan
condition, outputs, updates = scan_utils.get_updates_and_outputs(fn(*args))
File "/Users/qiansteven/Desktop/NLP/RNN/my.py", line 16, in f
return T.concatenate([a,b,c],axis=0)
File "/usr/local/lib/python2.7/site-packages/theano/tensor/basic.py", line 4225, in concatenate
return join(axis, *tensor_list)
File "/usr/local/lib/python2.7/site-packages/theano/gof/op.py", line 611, in __call__
node = self.make_node(*inputs, **kwargs)
File "/usr/local/lib/python2.7/site-packages/theano/tensor/basic.py", line 3750, in make_node
axis, tensors, as_tensor_variable_args, output_maker)
File "/usr/local/lib/python2.7/site-packages/theano/tensor/basic.py", line 3816, in _make_node_internal
raise TypeError("Join() can only join tensors with the same "
TypeError: Join() can only join tensors with the same number of dimensions.
What's wrong?
Put toReplace into non_sequences; otherwise each timestep takes only a slice of it, and Theano reports an error when it tries to concatenate a vector with a matrix.
def f(k, w, v):  # NOTE the argument order change
    return T.concatenate([w[:k], v, w[k+1:]], axis=0)

result, _ = theano.scan(f,
                        sequences=timeArray,
                        outputs_info=beReplaced,
                        non_sequences=toReplace)
The solution is to concatenate v.dimshuffle('x', 0) instead of v, and that solves the dimension problem.
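Putting both remarks together, a sketch (untested here, following the question's original scan layout) of the dimshuffle variant that keeps toReplace as a sequence:
import theano
import theano.tensor as T

beReplaced = T.matrix()
toReplace = T.matrix()
timeArray = T.arange(200)

def f(v, k, w):
    # v is one row of toReplace, i.e. a vector; dimshuffle('x', 0) lifts it
    # to a 1 x n matrix so all three pieces have the same number of dimensions.
    return T.concatenate([w[:k], v.dimshuffle('x', 0), w[k+1:]], axis=0)

result, _ = theano.scan(f,
                        sequences=[toReplace, timeArray],
                        outputs_info=beReplaced)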
