I'm getting " ValueError: 111816 is not in range" error when trying to use FuzzyWuzzy between two other dataframe column - fuzzywuzzy

I am getting an error when trying to use FuzzyWuzzy between two dataframe columns.
I want to match df_1['name_new'] to df['term'].
Below is the site where I got my code:
https://towardsdatascience.com/fuzzy-string-match-with-python-on-large-dataset-and-why-you-should-not-use-fuzzywuzzy-4ec9f0defcd
#Transform text to vectors with TF-IDF:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.9, min_df=5, token_pattern=r'(\S+)')
tf_idf_matrix_1 = tfidf_vectorizer.fit_transform(df_1['name_new'])
tf_idf_matrix_2 = tfidf_vectorizer.fit_transform(df['term'])
I created tf_idf_matrix_2 to match the other df's 'term' column.
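(Side note on this step: each fit_transform call builds its own vocabulary, so the two matrices generally end up with different column dimensions. A minimal sketch of a shared-vocabulary variant, assuming the later dot product needs both matrices in the same TF-IDF feature space:)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.9, min_df=5, token_pattern=r'(\S+)')
tfidf_vectorizer.fit(pd.concat([df_1['name_new'], df['term']]))  # learn one vocabulary from both columns
tf_idf_matrix_1 = tfidf_vectorizer.transform(df_1['name_new'])
tf_idf_matrix_2 = tfidf_vectorizer.transform(df['term'])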
from scipy.sparse import csr_matrix
import numpy as np
import pandas as pd
!pip install sparse_dot_topn
import sparse_dot_topn.sparse_dot_topn as ct

def awesome_cossim_top(A, B, ntop, lower_bound=0):
    # force A and B to be CSR matrices.
    # If they are already CSR, there is no overhead
    A = A.tocsr()
    B = B.tocsr()
    M, _ = A.shape
    _, N = B.shape
    idx_dtype = np.int32
    nnz_max = M * ntop
    indptr = np.zeros(M + 1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)
    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)
    return csr_matrix((data, indices, indptr), shape=(M, N))
import time
t1 = time.time()
# adjust lower bound: 0.8
# keep top 10 similar results
matches = awesome_cossim_top(tf_idf_matrix_1, tf_idf_matrix_2.transpose(), 10, 0.8)
t = time.time()-t1
print("finished in:", t)
def get_matches_df(sparse_matrix, name_vector, top=10000):
    non_zeros = sparse_matrix.nonzero()
    sparserows = non_zeros[0]
    sparsecols = non_zeros[1]
    if top:
        nr_matches = top
    else:
        nr_matches = sparsecols.size
    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    similairity = np.zeros(nr_matches)
    for index in range(0, nr_matches):
        left_side[index] = name_vector[sparserows[index]]
        right_side[index] = name_vector[sparsecols[index]]
        similairity[index] = sparse_matrix.data[index]
    return pd.DataFrame({'name_new_1': left_side,
                         'term_1': right_side,
                         'similairity_score': similairity})
matches_df = pd.DataFrame()
matches_df = get_matches_df(matches, df_1['name_new'], top=10000)
# Remove all exact matches
This is the error I get:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/range.py in get_loc(self, key, method, tolerance)
384 try:
--> 385 return self._range.index(new_key)
386 except ValueError as err:
ValueError: 111816 is not in range
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
4 frames
/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/range.py in get_loc(self, key, method, tolerance)
385 return self._range.index(new_key)
386 except ValueError as err:
--> 387 raise KeyError(key) from err
388 raise KeyError(key)
389 return super().get_loc(key, method=method, tolerance=tolerance)
KeyError: 111816
Please help... what is wrong with my code?

Related

name = 0 when trying to print Column names of pandas dataframe

Filtering on Date columns
dateCols = newDF.columns[newDF.columns.str.contains('Date', case = False)]
dateCols
Output:
Index(['A. Date ', 'B.Date', 'C.D date'], dtype='object', name=0)
## In fact, I see name=0 even when I print newDF.columns
When I try to convert the above columns to datetime using:
newDF[dateCols] = pd.to_datetime(newDF[dateCols].stack(), errors='coerce').unstack()
I am getting this error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
Input In [124], in <cell line: 4>()
2 dateCols = newDF.columns[newDF.columns.str.contains('Date', case = False)]
3 dateCols
----> 4 newDF[dateCols] = newDF[[dateCols]].apply(pd.to_datetime)
File ~/.pyenv/versions/3.8.2/lib/python3.8/site-packages/pandas/core/frame.py:3512, in DataFrame.__getitem__(self, key)
3510 if is_iterator(key):
3511 key = list(key)
-> 3512 indexer = self.columns._get_indexer_strict(key, "columns")[1]
3514 # take() does not accept boolean indexers
3515 if getattr(indexer, "dtype", None) == bool:
File ~/.pyenv/versions/3.8.2/lib/python3.8/site-packages/pandas/core/indexes/base.py:5782, in Index._get_indexer_strict(self, key, axis_name)
5779 else:
5780 keyarr, indexer, new_indexer = self._reindex_non_unique(keyarr)
-> 5782 self._raise_if_missing(keyarr, indexer, axis_name)
5784 keyarr = self.take(indexer)
5785 if isinstance(key, Index):
5786 # GH 42790 - Preserve name from an Index
File ~/.pyenv/versions/3.8.2/lib/python3.8/site-packages/pandas/core/indexes/base.py:5842, in Index._raise_if_missing(self, key, indexer, axis_name)
5840 if use_interval_msg:
5841 key = list(key)
-> 5842 raise KeyError(f"None of [{key}] are in the [{axis_name}]")
5844 not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique())
5845 raise KeyError(f"{not_found} not in index")
KeyError: "None of Index(['A. Date ', 'B.Date', 'C.D date'], dtype='object', name=0) are in the [columns]"
I have tried to replicate your error, but I did not receive any errors.
Code used:
df = pd.DataFrame({'A.Date': ['2020-01-01'] * 10, 'B.Date': ['2020-01-01'] * 10, 'C.Date': ['2020-01-01'] * 10, 'other': np.arange(10)})
cols = df.columns[df.columns.str.contains('Date', case=False)]
Which shows for cols:
Index(['A.Date', 'B.Date', 'C.Date'], dtype='object')
Finally using your last statement:
df[cols] = pd.to_datetime(df[cols].stack(), errors='coerce').unstack()
Edit: looking into the logs, we see the code:
newDF[dateCols] = newDF[[dateCols]].apply(pd.to_datetime)
This should be changed to the following to resolve the problem:
newDF[dateCols] = newDF[dateCols].apply(pd.to_datetime)
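For context on why the double brackets fail: dateCols is already an Index, i.e. list-like. With single brackets pandas selects those columns directly; with double brackets it receives a list whose only element is the whole Index, so the key lookup no longer matches the individual column labels and raises the KeyError above. A tiny sketch using the df and cols from the replication above:
df[cols].apply(pd.to_datetime)    # OK: cols is already list-like
df[[cols]].apply(pd.to_datetime)  # KeyError: the Index is nested instead of its labels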

Couldn't resolve a key not in index error

This is the link to the pdf file from which I want to extract data
def onlyenglish(text):
    import re
    alphabet_regular_expression = re.compile("[^a-zA-Z|()]")
    text = re.sub(alphabet_regular_expression, "", text)
    return text
annexure2page1 = tabula.read_pdf(file, pages = 1 , lattice = True, relative_area=True)
annexure2page1_df1= annexure2page1[0]
annexure2page1_df2 = annexure2page1_df1[['एयिपोर्च\rAIRPORT','वायुयाि प्रर्ालि (संख्या में)\rAIRCRAFT MOVEMENTS (IN NOS.)','Unnamed: 4','Unnamed: 8','Unnamed: 10']]
annexure2page1_df2 = annexure2page1_df2.replace('\r', ' ', regex=True)
annexure2page1_df2['ReportMonth'] = reportmonth
annexure2page1_df2['एयरपोर्\rAIRPORT'] = annexure2page1_df2['एयरपोर्\rAIRPORT'].str.title()
annexure2page1_df2['Airports'] = annexure2page1_df2['एयरपोर्\rAIRPORT'].apply(lambda x: onlyenglish(str(x)))
annexure2page1_df2 = annexure2page1_df2.rename(columns={'वरयुयरन प्रचरलन (िंख्यर म )\rAIRCRAFT MOVEMENTS (IN NOS.)':'value','Unnamed: 8':'value_ytm','Unnamed: 4':'value_smly','Unnamed: 10':'value_ytmly'})
annexure2page1_df2 = annexure2page1_df2.replace(r'^\s*$', np.nan, regex=True)
annexure2page1_df3 = annexure2page1_df2.dropna()
annexure2page1_df3["Service"] = "International"
annexure2page1_df3["Metric"] = "ATMs"
annexure2page1_df3['ReportName'] = reportname
annexure2page1_df3['reportlink'] = file
##extracting page 1
annexure2page1extraction = annexure2page1_df3[['ReportName','reportlink','ReportMonth','Airports','Service','Metric','value','value_smly','value_ytm','value_ytmly']]
Error Stack
KeyError Traceback (most recent call last)
<ipython-input-14-9c5d09fa538a> in <module>()
2 annexure2page1_df1= annexure2page1[0]
3 #
----> 4 annexure2page1_df2 = annexure2page1_df1[['एयिपोर्च\rAIRPORT','वायुयाि प्रर्ालि (संख्या में)\rAIRCRAFT MOVEMENTS (IN NOS.)','Unnamed: 4','Unnamed: 8','Unnamed: 10']]
5 annexure2page1_df2 = annexure2page1_df2.replace('\r',' ', regex=True)
6 annexure2page1_df2['ReportMonth'] = reportmonth
2 frames
/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
1302 if raise_missing:
1303 not_found = list(set(key) - set(ax))
-> 1304 raise KeyError(f"{not_found} not in index")
1305
1306 # we skip the warning on Categorical
KeyError: "['वायुयाि प्रर्ालि (संख्या में)\\rAIRCRAFT MOVEMENTS (IN NOS.)', 'एयिपोर्च\\rAIRPORT'] not in index"
Replace this line of code:
annexure2page1_df2 = annexure2page1_df1[['एयिपोर्च\rAIRPORT','वायुयाि प्रर्ालि (संख्या में)\rAIRCRAFT MOVEMENTS (IN NOS.)','Unnamed: 4','Unnamed: 8','Unnamed: 10']]
with this:
annexure2page1_df2 = annexure2page1_df1.iloc[:,[3,5,7,11,13]]
The error that you got (KeyError: "['वायुयाि प्रर्ालि (संख्या में)\\rAIRCRAFT MOVEMENTS (IN NOS.)', 'एयिपोर्च\\rAIRPORT'] not in index") is because those keys were not found in the dataframe's columns.
So I have bypassed it by directly providing the integer locations of the columns that you want to subset.
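As a quick way to see why the typed keys do not match, it can help to print the parsed column labels verbatim before subsetting; headers extracted by tabula (especially the Devanagari ones) often differ by a character or two from what you copy out of the PDF:
print(annexure2page1_df1.columns.tolist())  # compare these exact strings against the keys you typed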

ValueError: Shape of passed values is, indices imply

Reposting again because I didn't get a response to the first post.
I have the following data below:
desc = pd.DataFrame(description, columns =['new_desc'])
new_desc
257623 the public safety report is compiled from crim...
161135 police say a sea isle city man ordered two pou...
156561 two people are behind bars this morning, after...
41690 pumpkin soup is a beloved breakfast soup in ja...
70092 right now, 15 states are grappling with how be...
... ...
207258 operation legend results in 59 more arrests, i...
222170 see story, 3a
204064 st. louis — missouri secretary of state jason ...
151443 tony lavell jones, 54, of sunset view terrace,...
97367 walgreens, on the other hand, is still going t...
[9863 rows x 1 columns]
I'm trying to find the dominant topic within the documents. When I run the following code:
best_lda_model = lda_desc
data_vectorized = tfidf
lda_output = best_lda_model.transform(data_vectorized)
topicnames = ["Topic " + str(i) for i in range(best_lda_model.n_components)]
docnames = ["Doc " + str(i) for i in range(len(dataset))]
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns = topicnames, index = docnames)
dominant_topic = np.argmax(df_document_topic.values, axis = 1)
df_document_topic['dominant_topic'] = dominant_topic
I've tried tweaking the code, but no matter what I change, I get the following traceback:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
c:\python36\lib\site-packages\pandas\core\internals\managers.py in create_block_manager_from_blocks(blocks, axes)
1673
-> 1674 mgr = BlockManager(blocks, axes)
1675 mgr._consolidate_inplace()
c:\python36\lib\site-packages\pandas\core\internals\managers.py in __init__(self, blocks, axes, do_integrity_check)
148 if do_integrity_check:
--> 149 self._verify_integrity()
150
c:\python36\lib\site-packages\pandas\core\internals\managers.py in _verify_integrity(self)
328 if block.shape[1:] != mgr_shape[1:]:
--> 329 raise construction_error(tot_items, block.shape[1:], self.axes)
330 if len(self.items) != tot_items:
ValueError: Shape of passed values is (9863, 8), indices imply (0, 8)
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-41-bd470d69b181> in <module>
4 topicnames = ["Topic " + str(i) for i in range(best_lda_model.n_components)]
5 docnames = ["Doc " + str(i) for i in range(len(dataset))]
----> 6 df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns = topicnames, index = docnames)
7 dominant_topic = np.argmax(df_document_topic.values, axis = 1)
8 df_document_topic['dominant_topic'] = dominant_topic
c:\python36\lib\site-packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)
495 mgr = init_dict({data.name: data}, index, columns, dtype=dtype)
496 else:
--> 497 mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
498
499 # For data is list-like, or Iterable (will consume into list)
c:\python36\lib\site-packages\pandas\core\internals\construction.py in init_ndarray(values, index, columns, dtype, copy)
232 block_values = [values]
233
--> 234 return create_block_manager_from_blocks(block_values, [columns, index])
235
236
c:\python36\lib\site-packages\pandas\core\internals\managers.py in create_block_manager_from_blocks(blocks, axes)
1679 blocks = [getattr(b, "values", b) for b in blocks]
1680 tot_items = sum(b.shape[0] for b in blocks)
-> 1681 raise construction_error(tot_items, blocks[0].shape[1:], axes, e)
1682
1683
ValueError: Shape of passed values is (9863, 8), indices imply (0, 8)
The desired result is to produce a list of documents according to a specific topic. Below is example code and the desired output.
df_document_topic(df_document_topic['dominant_topic'] == 2).head(10)
When I run this code, I get the following traceback
TypeError Traceback (most recent call last)
<ipython-input-55-8cf9694464e6> in <module>
----> 1 df_document_topic(df_document_topic['dominant_topic'] == 2).head(10)
TypeError: 'DataFrame' object is not callable
Below is the desired output
Any help would be greatly appreciated.
The index you're passing as docnames is empty; it is obtained from dataset as follows:
docnames = ["Doc " + str(i) for i in range(len(dataset))]
So this means that dataset is empty too. As a workaround, you can create Doc indices based on the size of lda_output instead:
docnames = ["Doc " + str(i) for i in range(len(lda_output))]
Let me know if this works.
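Putting it together, a minimal sketch of the corrected construction (as a side note, the later filter needs square brackets rather than parentheses, which is what raises the TypeError: 'DataFrame' object is not callable):
docnames = ["Doc " + str(i) for i in range(len(lda_output))]
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
df_document_topic['dominant_topic'] = np.argmax(df_document_topic.values, axis=1)
df_document_topic[df_document_topic['dominant_topic'] == 2].head(10)  # filter with [], not ()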

IndexError multiprocessing.Pool

I'm getting an IndexError using multiprocessing to process parts of a pandas DataFrame in parallel. vacancies is a pandas DataFrame containing several vacancies, of which one column is the raw text.
def addSkillRelevance(vacancies):
    skills = pickle.load(open("skills.pkl", "rb"))
    vacancies['skill'] = ''
    vacancies['skillcount'] = 0
    vacancies['all_skills_in_vacancy'] = ''
    new_vacancies = pd.DataFrame(columns=vacancies.columns)
    for vacancy_index, vacancy_row in vacancies.iterrows():
        # Create a df for which each row is a found skill (with the other attributes of the vacancy)
        per_vacancy_df = pd.DataFrame(columns=vacancies.columns)
        all_skills_in_vacancy = []
        skillcount = 0
        for skill_index, skill_row in skills.iterrows():
            # Making the search for the skill in the text body a bit smarter
            spaceafter = ' ' + skill_row['txn_skill_name'] + ' '
            newlineafter = ' ' + skill_row['txn_skill_name'] + '\n'
            tabafter = ' ' + skill_row['txn_skill_name'] + '\t'
            # Statement that returns true if we find a variation of the skill in the text body
            if (spaceafter in vacancies.at[vacancy_index, 'body']) or (newlineafter in vacancies.at[vacancy_index, 'body']) or (tabafter in vacancies.at[vacancy_index, 'body']):
                # Adding the skill to the list of skills found in the vacancy
                all_skills_in_vacancy.append(skill_row['txn_skill_name'])
                # Increasing the skillcount
                skillcount += 1
                # Adding the skill to the row
                vacancies.at[vacancy_index, 'skill'] = skill_row['txn_skill_name']
                # Add a row to the vacancy df, where 1 row means 1 skill
                per_vacancy_df = per_vacancy_df.append(vacancies.iloc[vacancy_index])
        # Adding the list of all found skills in the vacancy to each (skill) row
        per_vacancy_df['all_skills_in_vacancy'] = str(all_skills_in_vacancy)
        per_vacancy_df['skillcount'] = skillcount
        # Adds the individual vacancy df to a new vacancy df
        new_vacancies = new_vacancies.append(per_vacancy_df)
    return new_vacancies
def executeSkillScript(vacancies):
    from multiprocessing import Pool
    vacancies = vacancies.head(100298)
    num_workers = 47
    pool = Pool(num_workers)
    vacancy_splits = np.array_split(vacancies, num_workers)
    results_list = pool.map(addSkillRelevance, vacancy_splits)
    new_vacancies = pd.concat(results_list, axis=0)
    pool.close()
    pool.join()

executeSkillScript(vacancies)
The function addSkillRelevance() takes in a pandas DataFrame and outputs a pandas DataFrame (with more columns). For some reason, after finishing all the multiprocessing, I get an IndexError on results_list = pool.map(addSkillRelevance,vacancy_splits). I'm quite stuck as I don't know how to handle the error. Does anyone have tips as to why the IndexError is occurring?
The error:
IndexError Traceback (most recent call last)
<ipython-input-11-7cb04a51c051> in <module>()
----> 1 executeSkillScript(vacancies)
<ipython-input-9-5195d46f223f> in executeSkillScript(vacancies)
14
15 vacancy_splits = np.array_split(vacancies, num_workers)
---> 16 results_list = pool.map(addSkillRelevance,vacancy_splits)
17 new_vacancies = pd.concat(results_list, axis=0)
18
~/anaconda3/envs/amazonei_tensorflow_p36/lib/python3.6/multiprocessing/pool.py in map(self, func, iterable, chunksize)
264 in a list that is returned.
265 '''
--> 266 return self._map_async(func, iterable, mapstar, chunksize).get()
267
268 def starmap(self, func, iterable, chunksize=None):
~/anaconda3/envs/amazonei_tensorflow_p36/lib/python3.6/multiprocessing/pool.py in get(self, timeout)
642 return self._value
643 else:
--> 644 raise self._value
645
646 def _set(self, i, obj):
IndexError: single positional indexer is out-of-bounds
As per the suggestion:
The error is coming from this line:
per_vacancy_df = per_vacancy_df.append(vacancies.iloc[vacancy_index])
The error is occurring because vacancy_index is a label from the original dataframe's index, but .iloc does positional indexing: after np.array_split, each chunk keeps its original row labels, and those labels are generally out of bounds as positions within the smaller chunk.
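A minimal sketch of a fix along those lines, using label-based lookup (or simply the row that iterrows already yields) instead of positional .iloc:
per_vacancy_df = per_vacancy_df.append(vacancies.loc[vacancy_index])
# or equivalently, reuse the row from iterrows():
# per_vacancy_df = per_vacancy_df.append(vacancy_row)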

Getting Type Error Expected Strings or Bytes Like Object

I am working on a dataset of tweets and I am trying to find the mentions of other users in a tweet; a tweet can mention no users, a single user, or multiple users.
Here is the head of the DataFrame:
The following is the function that I created to extract the list of mentions in a tweet:
def getMention(text):
    mention = re.findall('(^|[^#\w])#(\w{1,15})', text)
    if len(mention) > 0:
        return [x[1] for x in mention]
    else:
        return None
I'm trying to create a new column in the DataFrame and apply the function with the following code:
df['mention'] = df['text'].apply(getMention)
On running this code I get the following error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-43-426da09a8770> in <module>
----> 1 df['mention'] = df['text'].apply(getMention)
~/anaconda3_501/lib/python3.6/site-packages/pandas/core/series.py in apply(self, func, convert_dtype, args, **kwds)
3192 else:
3193 values = self.astype(object).values
-> 3194 mapped = lib.map_infer(values, f, convert=convert_dtype)
3195
3196 if len(mapped) and isinstance(mapped[0], Series):
pandas/_libs/src/inference.pyx in pandas._libs.lib.map_infer()
<ipython-input-42-d27373022afd> in getMention(text)
1 def getMention(text):
2
----> 3 mention = re.findall('(^|[^#\w])#(\w{1,15})', text)
4 if len(mention) > 0:
5 return [x[1] for x in mention]
~/anaconda3_501/lib/python3.6/re.py in findall(pattern, string, flags)
220
221 Empty matches are included in the result."""
--> 222 return _compile(pattern, flags).findall(string)
223
224 def finditer(pattern, string, flags=0):
TypeError: expected string or bytes-like object
I can't comment (not enough rep), so here's what I suggest to troubleshoot the error.
It seems findall raises an exception because text is not a string, so you might want to check which type text actually is, using this:
def getMention(text):
    print(type(text))
    mention = re.findall(r'(^|[^#\w])#(\w{1,15})', text)
    if len(mention) > 0:
        return [x[1] for x in mention]
    else:
        return None
(or use the debugger, if you know how)
And if text can be converted to a string, maybe try this:
def getMention(text):
    mention = re.findall(r'(^|[^#\w])#(\w{1,15})', str(text))
    if len(mention) > 0:
        return [x[1] for x in mention]
    else:
        return None
P.S.: don't forget the r'...' prefix on your regex, so special characters aren't interpreted as escape sequences.
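For what it's worth, a one-line reproduction of the usual cause, assuming the text column contains a missing value (pandas stores missing text as the float nan, which re.findall rejects):
import re
re.findall(r'(^|[^#\w])#(\w{1,15})', float('nan'))  # TypeError: expected string or bytes-like object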
