I'm getting an IndexError using multiprocessing to process parts of a pandas DataFrame in parallel. vacancies is a pandas DataFrame containing several vacancies, of which one column is the raw text.
def addSkillRelevance(vacancies):
skills = pickle.load(open("skills.pkl", "rb"))
vacancies['skill'] = ''
vacancies['skillcount'] = 0
vacancies['all_skills_in_vacancy'] = ''
new_vacancies = pd.DataFrame(columns=vacancies.columns)
for vacancy_index, vacancy_row in vacancies.iterrows():
#Create a df for which each row is a found skill (with the other attributes of the vacancy)
per_vacancy_df = pd.DataFrame(columns=vacancies.columns)
all_skills_in_vacancy = []
skillcount = 0
for skill_index, skill_row in skills.iterrows():
#Making the search for the skill in the text body a bit smarter
spaceafter = ' ' + skill_row['txn_skill_name'] + ' '
newlineafter = ' ' + skill_row['txn_skill_name'] + '\n'
tabafter = ' ' + skill_row['txn_skill_name'] + '\t'
#Statement that returns true if we find a variation of the skill in the text body
if((spaceafter in vacancies.at[vacancy_index,'body']) or (newlineafter in vacancies.at[vacancy_index,'body']) or (tabafter in vacancies.at[vacancy_index,'body'])):
#Adding the skill to the list of skills found in the vacancy
all_skills_in_vacancy.append(skill_row['txn_skill_name'])
#Increasing the skillcount
skillcount += 1
#Adding the skill to the row
vacancies.at[vacancy_index,'skill'] = skill_row['txn_skill_name']
#Add a row to the vacancy df where 1 row, means 1 skill
per_vacancy_df = per_vacancy_df.append(vacancies.iloc[vacancy_index])
#Adding the list of all found skills in the vacancy to each (skill) row
per_vacancy_df['all_skills_in_vacancy'] = str(all_skills_in_vacancy)
per_vacancy_df['skillcount'] = skillcount
#Adds the individual vacancy df to a new vacancy df
new_vacancies = new_vacancies.append(per_vacancy_df)
return(new_vacancies)
def executeSkillScript(vacancies):
from multiprocessing import Pool
vacancies = vacancies.head(100298)
num_workers = 47
pool = Pool(num_workers)
vacancy_splits = np.array_split(vacancies, num_workers)
results_list = pool.map(addSkillRelevance,vacancy_splits)
new_vacancies = pd.concat(results_list, axis=0)
pool.close()
pool.join()
executeSkillScript(vacancies)
The function addSkillRelevance() takes in a pandas DataFrame and outputs a pandas DataFrame (with more columns). For some reason, after finishing all the multiprocessing, I get an IndexError on results_list = pool.map(addSkillRelevance,vacancy_splits). I'm quite stuck as I don't know how to handle the error. Does anyone have tips as to why the IndexError is occurring?
The error:
IndexError Traceback (most recent call last)
<ipython-input-11-7cb04a51c051> in <module>()
----> 1 executeSkillScript(vacancies)
<ipython-input-9-5195d46f223f> in executeSkillScript(vacancies)
14
15 vacancy_splits = np.array_split(vacancies, num_workers)
---> 16 results_list = pool.map(addSkillRelevance,vacancy_splits)
17 new_vacancies = pd.concat(results_list, axis=0)
18
~/anaconda3/envs/amazonei_tensorflow_p36/lib/python3.6/multiprocessing/pool.py in map(self, func, iterable, chunksize)
264 in a list that is returned.
265 '''
--> 266 return self._map_async(func, iterable, mapstar, chunksize).get()
267
268 def starmap(self, func, iterable, chunksize=None):
~/anaconda3/envs/amazonei_tensorflow_p36/lib/python3.6/multiprocessing/pool.py in get(self, timeout)
642 return self._value
643 else:
--> 644 raise self._value
645
646 def _set(self, i, obj):
IndexError: single positional indexer is out-of-bounds
As per the suggestion
The error is coming from this line:
per_vacancy_df = per_vacancy_df.append(vacancies.iloc[vacancy_index])
The error is occuring because vacancy_index is not in the index of the vacancies dataframe.
Related
I am getting error when trying to use FuzzyWuzzy between two other dataframe column.
I want to match df_1['name_new'] to df['term'].
below is the site where I got my code
https://towardsdatascience.com/fuzzy-string-match-with-python-on-large-dataset-and-why-you-should-not-use-fuzzywuzzy-4ec9f0defcd
#Transform text to vectors with TF-IDF:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=0.9, min_df=5, token_pattern='(\S+)')
tf_idf_matrix_1 = tfidf_vectorizer.fit_transform(df_1['name_new'])
tf_idf_matrix_2 = tfidf_vectorizer.fit_transform(df['term'])
I careated "tf_idf_matrix_2 " to match other df's 'term' column
from scipy.sparse import csr_matrix
!pip install sparse_dot_topn
import sparse_dot_topn.sparse_dot_topn as ct
def awesome_cossim_top(A, B, ntop, lower_bound=0):
# force A and B as a CSR matrix.
# If they have already been CSR, there is no overhead
A = A.tocsr()
B = B.tocsr()
M, _ = A.shape
_, N = B.shape
idx_dtype = np.int32
nnz_max = M*ntop
indptr = np.zeros(M+1, dtype=idx_dtype)
indices = np.zeros(nnz_max, dtype=idx_dtype)
data = np.zeros(nnz_max, dtype=A.dtype)
ct.sparse_dot_topn(
M, N, np.asarray(A.indptr, dtype=idx_dtype),
np.asarray(A.indices, dtype=idx_dtype),
A.data,
np.asarray(B.indptr, dtype=idx_dtype),
np.asarray(B.indices, dtype=idx_dtype),
B.data,
ntop,
lower_bound,
indptr, indices, data)
return csr_matrix((data,indices,indptr),shape=(M,N))
import time
t1 = time.time()
# adjust lower bound: 0.8
# keep top 10 similar results
matches = awesome_cossim_top(tf_idf_matrix_1, tf_idf_matrix_2.transpose(), 10, 0.8)
t = time.time()-t1
print("finished in:", t)
def get_matches_df(sparse_matrix, name_vector, top=10000):
non_zeros = sparse_matrix.nonzero()
sparserows = non_zeros[0]
sparsecols = non_zeros[1]
if top:
nr_matches = top
else:
nr_matches = sparsecols.size
left_side = np.empty([nr_matches], dtype=object)
right_side = np.empty([nr_matches], dtype=object)
similairity = np.zeros(nr_matches)
for index in range(0, nr_matches):
left_side[index] = name_vector[sparserows[index]]
right_side[index] = name_vector[sparsecols[index]]
similairity[index] = sparse_matrix.data[index]
return pd.DataFrame({'name_new_1': left_side,
'term_1': right_side,
'similairity_score': similairity})
matches_df = pd.DataFrame()
matches_df = get_matches_df(matches, df_1['name_new'], top=10000)
# Remove all exact matches
I get my error like this=>
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/range.py in get_loc(self, key, method, tolerance)
384 try:
--> 385 return self._range.index(new_key)
386 except ValueError as err:
ValueError: 111816 is not in range
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
4 frames
/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/range.py in get_loc(self, key, method, tolerance)
385 return self._range.index(new_key)
386 except ValueError as err:
--> 387 raise KeyError(key) from err
388 raise KeyError(key)
389 return super().get_loc(key, method=method, tolerance=tolerance)
KeyError: 111816
Please help... what is wrong with my code?
Reposting again because i didn't get a response to the first post
I have the following data is below:
desc = pd.DataFrame(description, columns =['new_desc'])
new_desc
257623 the public safety report is compiled from crim...
161135 police say a sea isle city man ordered two pou...
156561 two people are behind bars this morning, after...
41690 pumpkin soup is a beloved breakfast soup in ja...
70092 right now, 15 states are grappling with how be...
... ...
207258 operation legend results in 59 more arrests, i...
222170 see story, 3a
204064 st. louis — missouri secretary of state jason ...
151443 tony lavell jones, 54, of sunset view terrace,...
97367 walgreens, on the other hand, is still going t...
[9863 rows x 1 columns]
I'm trying to find the dominant topic within the documents, and When I run the following code
best_lda_model = lda_desc
data_vectorized = tfidf
lda_output = best_lda_model.transform(data_vectorized)
topicnames = ["Topic " + str(i) for i in range(best_lda_model.n_components)]
docnames = ["Doc " + str(i) for i in range(len(dataset))]
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns = topicnames, index = docnames)
dominant_topic = np.argmax(df_document_topic.values, axis = 1)
df_document_topic['dominant_topic'] = dominant_topic
I've tried tweaking the code, however, no matter what I change, I get the following error tracebook error
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
c:\python36\lib\site-packages\pandas\core\internals\managers.py in create_block_manager_from_blocks(blocks, axes)
1673
-> 1674 mgr = BlockManager(blocks, axes)
1675 mgr._consolidate_inplace()
c:\python36\lib\site-packages\pandas\core\internals\managers.py in __init__(self, blocks, axes, do_integrity_check)
148 if do_integrity_check:
--> 149 self._verify_integrity()
150
c:\python36\lib\site-packages\pandas\core\internals\managers.py in _verify_integrity(self)
328 if block.shape[1:] != mgr_shape[1:]:
--> 329 raise construction_error(tot_items, block.shape[1:], self.axes)
330 if len(self.items) != tot_items:
ValueError: Shape of passed values is (9863, 8), indices imply (0, 8)
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-41-bd470d69b181> in <module>
4 topicnames = ["Topic " + str(i) for i in range(best_lda_model.n_components)]
5 docnames = ["Doc " + str(i) for i in range(len(dataset))]
----> 6 df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns = topicnames, index = docnames)
7 dominant_topic = np.argmax(df_document_topic.values, axis = 1)
8 df_document_topic['dominant_topic'] = dominant_topic
c:\python36\lib\site-packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)
495 mgr = init_dict({data.name: data}, index, columns, dtype=dtype)
496 else:
--> 497 mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
498
499 # For data is list-like, or Iterable (will consume into list)
c:\python36\lib\site-packages\pandas\core\internals\construction.py in init_ndarray(values, index, columns, dtype, copy)
232 block_values = [values]
233
--> 234 return create_block_manager_from_blocks(block_values, [columns, index])
235
236
c:\python36\lib\site-packages\pandas\core\internals\managers.py in create_block_manager_from_blocks(blocks, axes)
1679 blocks = [getattr(b, "values", b) for b in blocks]
1680 tot_items = sum(b.shape[0] for b in blocks)
-> 1681 raise construction_error(tot_items, blocks[0].shape[1:], axes, e)
1682
1683
ValueError: Shape of passed values is (9863, 8), indices imply (0, 8)
The desired results is to produce a list of documents according to a specific topic. Below is example code and desired output.
df_document_topic(df_document_topic['dominant_topic'] == 2).head(10)
When I run this code, I get the following traceback
TypeError Traceback (most recent call last)
<ipython-input-55-8cf9694464e6> in <module>
----> 1 df_document_topic(df_document_topic['dominant_topic'] == 2).head(10)
TypeError: 'DataFrame' object is not callable
Below is the desired output
Any help would be greatly appreciated.
The index you're passing as docnames is empty which is obtained from dataset as follows:
docnames = ["Doc " + str(i) for i in range(len(dataset))]
So this means that the dataset is empty too. For a workaround, you can create Doc indices based on the size of lda_output as follows:
docnames = ["Doc " + str(i) for i in range(len(lda_output))]
Let me know if this works.
I am trying to create a dataframe with python's pandas library utilizing data obtained with a requests response. The problem is when there is not that item available on the API so it raises a KeyError and crashes the program.
The source data frame is being iterated over each product name. It then takes the product name of that row and finds how many different SKUs exists, creating a row in a new dataframe for each SKU and adding some quantities and other needed information to the new dataframe. The idea is to have a row with ALL the same information on the first dataframe repeated however many SKUs there are updated with the quantity and package ID for that SKU.
If the length of the response returned is 0, I still want it to append the row from the first data frame
def create_additional_rows_needed(comb_data):
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logging.debug("test")
new_combined_data = pd.DataFrame(columns=comb_data.columns)
COVA_DATA_LEN = 2993
row = 0
current_item = ''
while row < len(comb_data):
number_of_skus = 0
current_item = comb_data.iloc[row, 1]
if (len(current_item)) is not None:
number_of_skus = len(find_gb_product(current_item))
else:
number_of_skus = 0
current_quantity = find_gb_product(current_item).iloc[number_of_skus - 1, find_gb_product(current_item).columns.get_loc('quantity')]
logger.info('Current Quantity: {}'.format(current_quantity))
current_package = find_gb_product(current_item)['lot_number'][number_of_skus - 1]
if number_of_skus == 0:
pass
while number_of_skus > 0:
logger.info('Current Item: {}'.format(current_item))
logger.info('Number of Skus: {}'.format(number_of_skus))
logger.info('Appending: {}'.format(comb_data.iloc[row, 1]))
new_combined_data = new_combined_data.append([comb_data.iloc[row, :]])
new_combined_data.iloc[-1, new_combined_data.columns.get_loc('TotalOnHand')] = current_quantity
new_combined_data.iloc[-1, new_combined_data.columns.get_loc('PackageId')] = current_package
number_of_skus = number_of_skus - 1
logger.info('Finished index {}'.format(row))
row = row + 1
logger.info('Moving to index {}'.format(row))
return new_combined_data
It goes well for every item with the exception of a few. Here is the error I get.
KeyError
2889 return self._engine.get_loc(casted_key)
2890 except KeyError as err:
-> 2891 raise KeyError(key) from err
2892
2893 if tolerance is not None:
KeyError: 'quantity'
This has taken up my entire weekend and all my sleep and is due Monday Morning at 10am MST with only two days notice. Please help me.
Catching the error and continuing should work. Something along the lines of:
while row < len(comb_data):
....
try:
current_quantity = find_gb_product(current_item).iloc[number_of_skus - 1, find_gb_product(current_item).columns.get_loc('quantity')]
except KeyError:
continue
....
Im trying to learn machine learning and i need to fill in the missing values for the cleaning stage of the workflow. i have 13 columns and need to impute the values for 8 of them. One column is called Dependents and i want to fill in the blanks with the word missing and change the cells that do contain data as follows: 1 to one, two to 2, 3 to three and 3+ to threePlus.
Im running the program in Anaconda and the name of the dataframe is train
train.columns
this gives me
Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
dtype='object')
next
print("Dependents")
print(train['Dependents'].unique())
this gives me
Dependents
['0' '1' '2' '3+' nan]
now i try imputing values as stated
def impute_dependent():
my_dict={'1':'one','2':'two','3':'three','3+':'threePlus'};
return train.Dependents.map(my_dict).fillna('missing')
def convert_data(dataset):
temp_data = dataset.copy()
temp_data['Dependents'] = temp_data[['Dependents']].apply(impute_dependent,axis=1)
return temp_data
this gives the error
TypeError Traceback (most recent call last)
<ipython-input-46-ccb1a5ea7edd> in <module>()
4 return temp_data
5
----> 6 train_dataset = convert_data(train)
7 #test_dataset = convert_data(test)
<ipython-input-46-ccb1a5ea7edd> in convert_data(dataset)
1 def convert_data(dataset):
2 temp_data = dataset.copy()
----> 3 temp_data['Dependents'] =
temp_data[['Dependents']].apply(impute_dependent,axis=1)
4 return temp_data
5
D:\Anaconda2\lib\site-packages\pandas\core\frame.py in apply(self, func,
axis, broadcast, raw, reduce, result_type, args, **kwds)
6002 args=args,
6003 kwds=kwds)
-> 6004 return op.get_result()
6005
6006 def applymap(self, func):
D:\Anaconda2\lib\site-packages\pandas\core\apply.py in get_result(self)
140 return self.apply_raw()
141
--> 142 return self.apply_standard()
143
144 def apply_empty_result(self):
D:\Anaconda2\lib\site-packages\pandas\core\apply.py in apply_standard(self)
246
247 # compute the result using the series generator
--> 248 self.apply_series_generator()
249
250 # wrap results
D:\Anaconda2\lib\site-packages\pandas\core\apply.py in
apply_series_generator(self)
275 try:
276 for i, v in enumerate(series_gen):
--> 277 results[i] = self.f(v)
278 keys.append(v.name)
279 except Exception as e:
TypeError: ('impute_dependent() takes 0 positional arguments but 1 was
given', 'occurred at index 0')
i expected one, two , three and threePlus to replace the existing values and missing to fill in the blanks
Would this do?
my_dict = {'1':'one','2':'two','3':'three','3+':'threePlus', np.nan: 'missing'}
def convert_data(dataset):
temp_data = dataset.copy()
temp_data.Dependents = temp_data.Dependents.map(my_dict)
return temp_data
As a side note, part of your problem might be the use of apply: essentially apply passes data through a function and puts in what comes out. I might be wrong but I think your function needs to take the input given by apply, eg:
def impute_dependent(dep):
my_dict = {'1':'one','2':'two','3':'three','3+':'threePlus', np.nan: 'missing'}
return my_dict[dep]
df.dependents = df.dependents.apply(impute_dependents)
This way, for every value in df.dependents, apply will take that value and give it to impute_dependents as an argument, then take the returned value as output. As is, when I trial your code I get an error because impute_dependent takes no arguments.
I was trying to import multiple csv files into sqlite database into multiple tables(using jupyter notebook in python3). The name of each file will be the name of the table. I have defined a function to covert the encoding to utf8 as below:
import sqlite3
import glob
import csv
import sys
def convert_to_utf8(dirname):
for filename in glob.glob(os.path.join(dirname, '*.csv')):
ifp = open(filename, "rt", encoding='cp1252')
input_data = ifp.read()
ifp.close()
ofp = open(filename + ".fix", "wt", encoding='utf-8')
for c in input_data:
if c != '\0':
ofp.write(c)
ofp.close()
return
all the files are in the same folder. staging_dir_name_1 is where the files are. And I have below code to covert the csv file into tables, some of the codes are from similar questions in StackFlow:
convert_to_utf8(staging_dir_name_1)
conn = sqlite3.connect("medicare_hospital_compare_1.db")
c = conn.cursor()
for filename in glob.glob(os.path.join(staging_dir_name_1, '*.csv')):
with open(filename, "rb") as f:
data = csv.DictReader(f)
cols = data.fieldnames
tablename = os.path.splitext(os.path.basename(filename))[0]
sql_str = "drop table if exists %s" % tablename
c.execute(sql_str)
sql_str = "create table if not exists %s (%s)" % (tablename, ','.join(["%s text" % col for col in cols]))
c.execute(sql_str)
sql_str = "insert into %s values (%s)" % (tablename, ','.join(["?" for col in cols]))
c.executemany(sql_str, (list(map(row.get, cols)) for row in data))
conn.commit()
but when i run this i get this error
> Error Traceback (most recent call
> last) <ipython-input-29-be7c1f43e4c5> in <module>()
> 2 with open(filename, "rb") as f:
> 3 data = csv.DictReader(f)
> ----> 4 cols = data.fieldnames
> 5 tablename = os.path.splitext(os.path.basename(filename))[0]
> 6
>
> C:\Users\dupin\Anaconda3\lib\csv.py in fieldnames(self)
> 96 if self._fieldnames is None:
> 97 try:
> ---> 98 self._fieldnames = next(self.reader)
> 99 except StopIteration:
> 100 pass
>
> Error: iterator should return strings, not bytes (did you open the
> file in text mode?)
Could anyone help me on how to resolve this issue? I have been thinking about it for a while but still couldn't figure out how to resolve this.
**===UPDATE===**
Now i have changed 'rb' to 'rt', i got a new error full NULL values, i think the first function has already removed all the null values
Error Traceback (most recent call last)
<ipython-input-77-68d56c0b4cf2> in <module>()
3
4 data = csv.DictReader(f)
----> 5 cols = data.fieldnames
6 table = os.path.splitext(os.path.basename(filename))[0]
7
C:\Users\dupin\Anaconda3\lib\csv.py in fieldnames(self)
96 if self._fieldnames is None:
97 try:
---> 98 self._fieldnames = next(self.reader)
99 except StopIteration:
100 pass
Error: line contains NULL byte