When importing a CSV file, I can't seem to set the index, and I can't work out whether I am importing the file correctly. I am currently doing everything in the interpreter; here is what I have:
df = pd.read_csv('E:/test.vbo', sep='\t', encoding='iso-8859-1', skiprows=97)
print(df.head())
This gives the following:
sats time lat long velocity heading height ...
0 [data]
1 008 144403.30 003067.21791 000031.98044 010.033 097.16 +00112.43 ...
2 008 144403.40 003067.21777 000031.98036 010.584 098.58 +00113.06 ...
3 008 144403.50 003067.21765 000031.98032 010.809 099.74 +00113.72 ...
4 008 144403.60 003067.21749 000031.98025 011.231 101.05 +00114.34 ...
5 008 144403.70 003067.21728 000031.98021 011.575 102.14 +00114.89 ...
Which is fine, however, this line:
print(df.set_index('time'))
gives an error:
>>> print(df.set_index('time'))
Traceback (most recent call last):
File "C:\Users\rob.kinsey\AppData\Local\Continuum\Anaconda3\lib\site-packages\
pandas\indexes\base.py", line 1945, in get_loc
return self._engine.get_loc(key)
File "pandas\index.pyx", line 137, in pandas.index.IndexEngine.get_loc (pandas
\index.c:4154)
File "pandas\index.pyx", line 159, in pandas.index.IndexEngine.get_loc (pandas
\index.c:4018)
File "pandas\hashtable.pyx", line 675, in pandas.hashtable.PyObjectHashTable.g
et_item (pandas\hashtable.c:12368)
File "pandas\hashtable.pyx", line 683, in pandas.hashtable.PyObjectHashTable.g
et_item (pandas\hashtable.c:12322)
KeyError: 'time'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\Users\rob.kinsey\AppData\Local\Continuum\Anaconda3\lib\site-packages\
pandas\core\frame.py", line 2837, in set_index
level = frame[col]._values
File "C:\Users\rob.kinsey\AppData\Local\Continuum\Anaconda3\lib\site-packages\
pandas\core\frame.py", line 1997, in __getitem__
return self._getitem_column(key)
File "C:\Users\rob.kinsey\AppData\Local\Continuum\Anaconda3\lib\site-packages\
pandas\core\frame.py", line 2004, in _getitem_column
return self._get_item_cache(key)
File "C:\Users\rob.kinsey\AppData\Local\Continuum\Anaconda3\lib\site-packages\
pandas\core\generic.py", line 1350, in _get_item_cache
values = self._data.get(item)
File "C:\Users\rob.kinsey\AppData\Local\Continuum\Anaconda3\lib\site-packages\
pandas\core\internals.py", line 3290, in get
loc = self.items.get_loc(item)
File "C:\Users\rob.kinsey\AppData\Local\Continuum\Anaconda3\lib\site-packages\
pandas\indexes\base.py", line 1947, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas\index.pyx", line 137, in pandas.index.IndexEngine.get_loc (pandas
\index.c:4154)
File "pandas\index.pyx", line 159, in pandas.index.IndexEngine.get_loc (pandas
\index.c:4018)
File "pandas\hashtable.pyx", line 675, in pandas.hashtable.PyObjectHashTable.g
et_item (pandas\hashtable.c:12368)
File "pandas\hashtable.pyx", line 683, in pandas.hashtable.PyObjectHashTable.g
et_item (pandas\hashtable.c:12322)
KeyError: 'time'
>>>
What am I missing please?
>>> print(df.columns.tolist())
['sats time lat long velocity heading height vert-vel dgps racceleratorpedal ast
eeringwheel pbrake glateral glongitudinal awingpitch ngearengaged nengine nwheel
fr nwheelrr nwheelfl nwheelrl mengine rdrsavailabledisplayed pwaterpump toil tdc
dc tmotorstator phvac nephsmotor pboost taircharge tcoolant tclutchoil nephspump
demanded tcellmax vbattery paerooil taerooil tmcucoldplate awingpitchdemand tmcu
_igbtmax avifileindex avitime ']
Solved, here is the correct read_csv line:
>>> df = pd.read_csv('E:/vbox_data/P1GTR__20150922144312_0001.vbo', delim_whitespace=True, encoding='iso-8859-1', header=90)
Related
import pandas as pd
import numpy as np
df = pd.read_csv("ia-infect-dublin.csv", header = None)
df.columns = ['Person_ID', 'Contacted']
df = df.sort_values(by=['Person_ID', 'Contacted'])
unique = df['Person_ID'].unique()
unique = np.append(unique, ["Start"])
matrix = pd.DataFrame(0, columns=unique, index=unique, dtype=int)
l_group = df.groupby('Person_ID')
for name, group in l_group:
i = 0
for index, rows in group.iterrows():
if i ==0:
matrix.loc[['Start'], rows['Person_ID']] += 1
previous_state = rows['Person_ID']
i = 1
else:
matrix.loc[previous_state, rows['Person_ID']] += 1
print(matrix.head())
I am building a transition matrix: the code goes through the CSV file and counts how often the chain moves from one person to another (A -> B -> C -> D), adding up the total count for each transition. However, I receive an error:
Traceback (most recent call last):
File "/Users/vydang/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexes/base.py", line 3621, in get_loc
return self._engine.get_loc(casted_key)
File "pandas/_libs/index.pyx", line 136, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 163, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 5198, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 5206, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 6
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/Users/vydang/Documents/FA22/BMI5007/Homeworks/hw10/testing", line 23, in <module>
matrix.loc[['Start'], rows['Person_ID']] += 1
File "/Users/vydang/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py", line 961, in __getitem__
return self._getitem_tuple(key)
File "/Users/vydang/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py", line 1140, in _getitem_tuple
return self._getitem_lowerdim(tup)
File "/Users/vydang/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py", line 867, in _getitem_lowerdim
section = self._getitem_axis(key, axis=i)
File "/Users/vydang/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py", line 1202, in _getitem_axis
return self._get_label(key, axis=axis)
File "/Users/vydang/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py", line 1153, in _get_label
return self.obj.xs(label, axis=axis)
File "/Users/vydang/opt/anaconda3/lib/python3.9/site-packages/pandas/core/generic.py", line 3849, in xs
return self[key]
File "/Users/vydang/opt/anaconda3/lib/python3.9/site-packages/pandas/core/frame.py", line 3505, in __getitem__
indexer = self.columns.get_loc(key)
File "/Users/vydang/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexes/base.py", line 3623, in get_loc
raise KeyError(key) from err
KeyError: 6
I have tried to see where the error is by:
value = '6'
if value in matrix.index:
print(matrix.loc[value])
else:
print("Not in index")
And it does populate. I have also tried:
matrix['6']
matrix.loc['6']
and no error occurs. Are there any other possible reasons why this may be occurring?
I have also checked whether the index contains 'Start', and it does.
I'm fairly new to Python and am following a tutorial on creating a wordcloud based on a customer reviews file. The tutorial link is https://towardsdatascience.com/detecting-bad-customer-reviews-with-nlp-d8b36134dc7e
from wordcloud import WordCloud, STOPWORDS
import pandas as pd
# read data
reviews_df = pd.read_csv("Hotel_Reviews3.csv")
# append the positive and negative text reviews
reviews_df["review"] = reviews_df["Negative_Review"] + reviews_df["Positive_Review"]
# create the label
reviews_df["is_bad_review"] = reviews_df["Reviewer_Score"].apply(lambda x: 1 if x < 5 else 0)
# select only relevant columns
reviews_df = reviews_df[["review", "is_bad_review"]]
reviews_df.head()
Hotel_Reviews3.csv:
https://i.stack.imgur.com/8ZGxj.png
ERROR MESSAGE:
Traceback (most recent call last):
File "C:\Users\stecd\AppData\Local\Programs\Python\Python37-32\lib\site-packages\pandas\core\indexes\base.py", line 3078, in get_loc
return self._engine.get_loc(key)
File "pandas\_libs\index.pyx", line 140, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 162, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 1492, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 1500, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'Positive_Review'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\stecd\Desktop\WorldCloud\wordCloud.py", line 6, in <module>
reviews_df["review"] = reviews_df["Negative_Review"] + reviews_df["Positive_Review"]
File "C:\Users\stecd\AppData\Local\Programs\Python\Python37-32\lib\site-packages\pandas\core\frame.py", line 2688, in __getitem__
return self._getitem_column(key)
File "C:\Users\stecd\AppData\Local\Programs\Python\Python37-32\lib\site-packages\pandas\core\frame.py", line 2695, in _getitem_column
return self._get_item_cache(key)
File "C:\Users\stecd\AppData\Local\Programs\Python\Python37-32\lib\site-packages\pandas\core\generic.py", line 2489, in _get_item_cache
values = self._data.get(item)
File "C:\Users\stecd\AppData\Local\Programs\Python\Python37-32\lib\site-packages\pandas\core\internals.py", line 4115, in get
loc = self.items.get_loc(item)
File "C:\Users\stecd\AppData\Local\Programs\Python\Python37-32\lib\site-packages\pandas\core\indexes\base.py", line 3080, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas\_libs\index.pyx", line 140, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 162, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 1492, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 1500, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'Positive_Review'
>>>
From the error message, I'd guess that Hotel_Reviews3.csv may not have a "Positive_Review" column. It could be that the corresponding column header is truncated or contains extra whitespace, so that it does not exactly match "Positive_Review".
I was trying this out: selecting the necessary data with pandas, using a data set from Quandl, when the following error showed up on execution.
import pandas as pd
import quandl
df = quandl.get('WIKI/GOOGL')
print(df.head())
df = df[['Adj. Open','Adj. High','Adj. Low','Adj. Close','Adj. Volume']]
print(df.head())
df['HL_PCT']=(df['Adj. High']- df['Adj. Close'])/df['Adj. close'] *100.0
df['PCT_change'] = (df['Adj .Close']-df['Adj. open'])/df['Adj. Open']*100.0
df = df[['Adj. Close','HL_PCT','PCT_change','Adj. Volume']]
So I got that as an error message.
Update:
I updated my code with the right capitalization.
Now it gives this error:
Traceback (most recent call last):
File "D:\Program Files\Python37\lib\site-packages\pandas\core\indexes\base.py", line 2656, in get_loc
return self._engine.get_loc(key)
File "pandas_libs\index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
File "pandas_libs\index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
File "pandas_libs\hashtable_class_helper.pxi", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas_libs\hashtable_class_helper.pxi", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'Adj .Close'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "sentdex2.py", line 8, in
df['PCT_change'] = (df['Adj .Close']-df['Adj. Open'])/df['Adj. Open'] *100.0
File "D:\Program Files\Python37\lib\site-packages\pandas\core\frame.py", line 2927, in getitem
indexer = self.columns.get_loc(key)
File "D:\Program Files\Python37\lib\site-packages\pandas\core\indexes\base.py", line 2658, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas_libs\index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
File "pandas_libs\index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
File "pandas_libs\hashtable_class_helper.pxi", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas_libs\hashtable_class_helper.pxi", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'Adj .Close'
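Replace Adj. close with Adj. Close (note the capital C). The same applies to Adj. open, which should be Adj. Open. In addition, the key 'Adj .Close' shown in the traceback has the space on the wrong side of the period; it should be 'Adj. Close'. Column labels must match exactly, including case and spacing.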
I can select one column from a DataFrame; for example, code like print(df['201809']) works:
df = pd.read_csv('xxxx.csv', low_memory=False)
print(df.info()]
<class 'pandas.core.frame.DataFrame'>
Int64Index: 11 entries, 0 to 10
Data columns (total 4 columns):
BO_product2 11 non-null object
201808 11 non-null float64
201809 11 non-null float64
4 11 non-null float64
dtypes: float64(3), object(1)
memory usage: 440.0+ bytes
print(df['201809']) # works fine
None
0 1.634931e+06
1 2.653640e+08
2 7.475315e+07
3 9.710830e+06
4 3.023899e+08
5 1.087862e+08
6 2.031106e+08
7 3.556234e+08
8 5.830665e+06
9 8.766841e+08
10 7.544689e+07
Name: 201809, dtype: float64
However, print(df['4']) doesn't. Does anyone have any tips or ideas?
PS: if I save the DataFrame to a local CSV file with df.to_csv('yy.csv'), print(a['4']) works after df = pd.read_csv('yy.csv').
print(df['4'])
Traceback (most recent call last):
File "C:\Users\AppData\Local\Programs\Python\Python36\lib\site-packages\pandas\core\indexes\base.py", line 3063, in get_loc
return self._engine.get_loc(key)
File "pandas\_libs\index.pyx", line 140, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 162, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 1492, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 1500, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: '4'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "E:/Python/2.py", line 45, in <module>
he()
File "E:/Python/2.py", line 26, in he
print(a['4'])
File "C:\Users\AppData\Local\Programs\Python\Python36\lib\site-packages\pandas\core\frame.py", line 2685, in __getitem__
return self._getitem_column(key)
File "C:\Users\AppData\Local\Programs\Python\Python36\lib\site-packages\pandas\core\frame.py", line 2692, in _getitem_column
return self._get_item_cache(key)
File "C:\Users\AppData\Local\Programs\Python\Python36\lib\site-packages\pandas\core\generic.py", line 2486, in _get_item_cache
values = self._data.get(item)
File "C:\Users\AppData\Local\Programs\Python\Python36\lib\site-packages\pandas\core\internals.py", line 4115, in get
loc = self.items.get_loc(item)
File "C:\Users\AppData\Local\Programs\Python\Python36\lib\site-packages\pandas\core\indexes\base.py", line 3065, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas\_libs\index.pyx", line 140, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 162, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 1492, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 1500, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: '4'
If you execute the below:
[type(i) for i in df.columns]
#[str, str, str, int]
For columns whose label has type int, you should access the column as df[4] and not df['4'].
The reason it is probably getting written as a string is the quoting behaviour of to_csv. From the docs:
quoting : optional constant from csv module
defaults to csv.QUOTE_MINIMAL. If you have set a float_format then floats are converted to strings and thus csv.QUOTE_NONNUMERIC will treat them as non-numeric
Hope this helps.
I am trying to set up Python (3.4) code to sort a time-series by date.
In python shell, I key in the following
>>>data = quandl.get("YAHOO/INDEX_GSPC", start_date="2017-01-01", end_date="2017-01-20")
>>>print(data)
So, I can load in the data. But when I try to use sort by the command
>>>data = data.sort_values(by='Date')
I get the following list of error messages. I can't seem to understand the syntax for sorting by date from http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.sort_values.html
Experts out there......., many thanks for advice.
Traceback (most recent call last):
File "C:\Python34\lib\site-packages\pandas\indexes\base.py", line 2134, in get_loc
return self._engine.get_loc(key)
File "pandas\index.pyx", line 132, in pandas.index.IndexEngine.get_loc (pandas\index.c:4433)
File "pandas\index.pyx", line 154, in pandas.index.IndexEngine.get_loc (pandas\index.c:4279)
File "pandas\src\hashtable_class_helper.pxi", line 732, in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:13742)
File "pandas\src\hashtable_class_helper.pxi", line 740, in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:13696)
KeyError: 'Date'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "<pyshell#37>", line 1, in <module>
data = data.sort_values(by='Date')
File "C:\Python34\lib\site-packages\pandas\core\frame.py", line 3230, in sort_values
k = self.xs(by, axis=other_axis).values
File "C:\Python34\lib\site-packages\pandas\core\generic.py", line 1770, in xs
return self[key]
File "C:\Python34\lib\site-packages\pandas\core\frame.py", line 2059, in __getitem__
return self._getitem_column(key)
File "C:\Python34\lib\site-packages\pandas\core\frame.py", line 2066, in _getitem_column
return self._get_item_cache(key)
File "C:\Python34\lib\site-packages\pandas\core\generic.py", line 1386, in _get_item_cache
values = self._data.get(item)
File "C:\Python34\lib\site-packages\pandas\core\internals.py", line 3543, in get
loc = self.items.get_loc(item)
File "C:\Python34\lib\site-packages\pandas\indexes\base.py", line 2136, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas\index.pyx", line 132, in pandas.index.IndexEngine.get_loc (pandas\index.c:4433)
File "pandas\index.pyx", line 154, in pandas.index.IndexEngine.get_loc (pandas\index.c:4279)
File "pandas\src\hashtable_class_helper.pxi", line 732, in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:13742)
File "pandas\src\hashtable_class_helper.pxi", line 740, in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:13696)
KeyError: 'Date'
quandl.get loads a DataFrame with the date as index.
So if you sort by index, you're good to go:
data = data.sort_index()
Make sure you look at the error. You are getting a KeyError, which means that the column Date does not exist in your DataFrame. It's likely that the dates are stored in the index, which requires the sort_index method instead. The 'Date' name that you see in your DataFrame is the name of the index and not a column.
data.sort_index()