empty vocabulary; perhaps the documents only contain stop words - nlp

The code below fails, but when I use vectorizer.fit_transform(wines["description"]) instead it works. I'm not sure what I've done wrong; have I not removed the stop words?
tqdm.pandas()
wines["processed_description"] = wines["description"].progress_apply(spacy_tokenizer)
# Creating a vectorizer
vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(wines["processed_description"])
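(spacy_tokenizer itself is not shown in the question; for context, a spaCy-based tokenizer commonly looks something like the sketch below. The model name and behaviour here are assumptions, not the asker's actual code.)
import spacy

nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def spacy_tokenizer(text):
    # hypothetical stand-in: lemmatize, drop stop words and punctuation,
    # and return a whitespace-joined string for CountVectorizer to re-tokenize
    doc = nlp(text)
    return " ".join(token.lemma_.lower()
                    for token in doc
                    if not token.is_stop and not token.is_punct)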
ValueError Traceback (most recent call last)
Input In [158], in <cell line: 3>()
1 # Creating a vectorizer
2 vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
----> 3 data_vectorized = vectorizer.fit_transform(wines["processed_description"])
File ~/opt/anaconda3/lib/python3.9/site-packages/sklearn/feature_extraction/text.py:1330, in CountVectorizer.fit_transform(self, raw_documents, y)
1322 warnings.warn(
1323 "Upper case characters found in"
1324 " vocabulary while 'lowercase'"
1325 " is True. These entries will not"
1326 " be matched with any documents"
1327 )
1328 break
-> 1330 vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_)
1332 if self.binary:
1333 X.data.fill(1)
File ~/opt/anaconda3/lib/python3.9/site-packages/sklearn/feature_extraction/text.py:1220, in CountVectorizer._count_vocab(self, raw_documents, fixed_vocab)
1218 vocabulary = dict(vocabulary)
1219 if not vocabulary:
-> 1220 raise ValueError(
1221 "empty vocabulary; perhaps the documents only contain stop words"
1222 )
1224 if indptr[-1] > np.iinfo(np.int32).max: # = 2**31 - 1
1225 if _IS_32BIT:
ValueError: empty vocabulary; perhaps the documents only contain stop words

How to parallelize classification with Zero Shot Classification by Huggingface?

I have around 70 categories (it could also be 20 or 30) and I want to be able to parallelize the process using Ray, but I get an error:
import pandas as pd
import swifter
import json
import ray
from transformers import pipeline
classifier = pipeline("zero-shot-classification")
labels = ["vegetables", "potato", "bell pepper", "tomato", "onion", "carrot", "broccoli",
"lettuce", "cucumber", "celery", "corn", "garlic", "mashrooms", "cabbage", "spinach",
"beans", "cauliflower", "asparagus", "fruits", "bananas", "apples", "strawberries",
"grapes", "oranges", "lemons", "avocados", "peaches", "blueberries", "pineapple",
"cherries", "pears", "mangoe", "berries", "red meat", "beef", "pork", "mutton",
"veal", "lamb", "venison", "goat", "mince", "white meat", "chicken", "turkey",
"duck", "goose", "pheasant", "rabbit", "Processed meat", "sausages", "bacon",
"ham", "hot dogs", "frankfurters", "tinned meat", "salami", "pâtés", "beef jerky",
"chorizo", "pepperoni", "corned beef", "fish", "catfish", "cod", "pangasius", "pollock",
"tilapia", "tuna", "salmon", "seafood", "shrimp", "squid", "mussels", "scallop",
"octopus", "grains", "rice", "wheat", "bulgur", "corn", "oat", "quinoa", "buckwheat",
"meals", "salad", "soup", "steak", "pizza", "pie", "burger", "backery", "bread", "souce",
"pasta", "sandwich", "waffles", "barbecue", "roll", "wings", "ribs", "cookies"]
ray.init()
@ray.remote
def get_meal_category(seq, labels, n=3):
    res_dict = classifier(seq, labels)
    return list(zip([seq for i in range(n)], res_dict["labels"][0:n], res_dict["scores"][0:n]))

res_list = ray.get([get_meal_category.remote(merged_df["title"][i], labels) for i in range(10)])
Where merged_df is a big dataframe whose title column contains meal names like:
['Cappuccino',
'Stove Top Stuffing Mix For Turkey (Kraft)',
'Stove Top Stuffing Mix For Turkey (Kraft)',
'Roasted Dark Turkey Meat',
'Roasted Dark Turkey Meat',
'Roasted Dark Turkey Meat',
'Cappuccino',
'Low Fat 2% Small Curd Cottage Cheese (Daisy)',
'Rice Cereal (Gerber)',
'Oranges']
Please advise how to avoid ray's error and parallelize the classification.
The error:
2021-02-17 16:54:51,689 WARNING worker.py:1107 -- Warning: The remote function __main__.get_meal_category has size 1630925709 when pickled. It will be stored in Redis, which could cause memory issues. This may mean that its definition uses a large array or other object.
---------------------------------------------------------------------------
ConnectionResetError Traceback (most recent call last)
~/.local/lib/python3.8/site-packages/redis/connection.py in send_packed_command(self, command, check_health)
705 for item in command:
--> 706 sendall(self._sock, item)
707 except socket.timeout:
~/.local/lib/python3.8/site-packages/redis/_compat.py in sendall(sock, *args, **kwargs)
8 def sendall(sock, *args, **kwargs):
----> 9 return sock.sendall(*args, **kwargs)
10
ConnectionResetError: [Errno 104] Connection reset by peer
During handling of the above exception, another exception occurred:
ConnectionError Traceback (most recent call last)
<ipython-input-9-1a5345832fba> in <module>
----> 1 res_list = ray.get([get_meal_category.remote(merged_df["title"][i], labels) for i in range(10)])
<ipython-input-9-1a5345832fba> in <listcomp>(.0)
----> 1 res_list = ray.get([get_meal_category.remote(merged_df["title"][i], labels) for i in range(10)])
~/.local/lib/python3.8/site-packages/ray/remote_function.py in _remote_proxy(*args, **kwargs)
99 @wraps(function)
100 def _remote_proxy(*args, **kwargs):
--> 101 return self._remote(args=args, kwargs=kwargs)
102
103 self.remote = _remote_proxy
~/.local/lib/python3.8/site-packages/ray/remote_function.py in _remote(self, args, kwargs, num_returns, num_cpus, num_gpus, memory, object_store_memory, accelerator_type, resources, max_retries, placement_group, placement_group_bundle_index, placement_group_capture_child_tasks, override_environment_variables, name)
205
206 self._last_export_session_and_job = worker.current_session_and_job
--> 207 worker.function_actor_manager.export(self)
208
209 kwargs = {} if kwargs is None else kwargs
~/.local/lib/python3.8/site-packages/ray/function_manager.py in export(self, remote_function)
142 key = (b"RemoteFunction:" + self._worker.current_job_id.binary() + b":"
143 + remote_function._function_descriptor.function_id.binary())
--> 144 self._worker.redis_client.hset(
145 key,
146 mapping={
~/.local/lib/python3.8/site-packages/redis/client.py in hset(self, name, key, value, mapping)
3048 items.extend(pair)
3049
-> 3050 return self.execute_command('HSET', name, *items)
3051
3052 def hsetnx(self, name, key, value):
~/.local/lib/python3.8/site-packages/redis/client.py in execute_command(self, *args, **options)
898 conn = self.connection or pool.get_connection(command_name, **options)
899 try:
--> 900 conn.send_command(*args)
901 return self.parse_response(conn, command_name, **options)
902 except (ConnectionError, TimeoutError) as e:
~/.local/lib/python3.8/site-packages/redis/connection.py in send_command(self, *args, **kwargs)
723 def send_command(self, *args, **kwargs):
724 "Pack and send a command to the Redis server"
--> 725 self.send_packed_command(self.pack_command(*args),
726 check_health=kwargs.get('check_health', True))
727
~/.local/lib/python3.8/site-packages/redis/connection.py in send_packed_command(self, command, check_health)
715 errno = e.args[0]
716 errmsg = e.args[1]
--> 717 raise ConnectionError("Error %s while writing to socket. %s." %
718 (errno, errmsg))
719 except BaseException:
ConnectionError: Error 104 while writing to socket. Connection reset by peer.
This error happens because large objects are being sent to Redis. merged_df is a large dataframe, and since you are calling get_meal_category 10 times, Ray will attempt to serialize merged_df 10 times. Instead, if you put merged_df into the Ray object store just once and then pass along a reference to the object, this should work.
EDIT: Since the classifier is also large, do something similar for it as well.
Can you try something like this:
ray.init()

df_ref = ray.put(merged_df)
model_ref = ray.put(classifier)

@ray.remote
def get_meal_category(classifier, df, i, labels, n=3):
    seq = df["title"][i]
    res_dict = classifier(seq, labels)
    return list(zip([seq for i in range(n)], res_dict["labels"][0:n], res_dict["scores"][0:n]))

res_list = ray.get([get_meal_category.remote(model_ref, df_ref, i, labels) for i in range(10)])
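A note on the code above: ObjectRefs created with ray.put are automatically dereferenced when passed as top-level arguments to a remote task, so inside get_meal_category the classifier and df parameters are the actual objects, not references. The effect is that the classifier and the dataframe are serialized into the object store once and shared by all ten tasks, instead of being re-serialized on every call.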

Cannot resolve exception: "ValueError: The index must be timezone aware when indexing with a date string with a UTC offset"

I have a time series y. I converted its index to datetime and stripped the timezone information:
y.index = pd.to_datetime(y.index)
y.index = y.index.tz_localize(None)
When I try to slice rows with y['2020-02-24 10-11-12':] I get the following error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-677-f1b3153cb92b> in <module>
----> 1 y['2020-02-24 10-11-12':]
~\Anaconda3\envs\tf2\lib\site-packages\pandas\core\series.py in __getitem__(self, key)
863 key = check_bool_indexer(self.index, key)
864
--> 865 return self._get_with(key)
866
867 def _get_with(self, key):
~\Anaconda3\envs\tf2\lib\site-packages\pandas\core\series.py in _get_with(self, key)
868 # other: fancy integer or otherwise
869 if isinstance(key, slice):
--> 870 return self._slice(key)
871 elif isinstance(key, ABCDataFrame):
872 raise TypeError(
~\Anaconda3\envs\tf2\lib\site-packages\pandas\core\series.py in _slice(self, slobj, axis, kind)
818
819 def _slice(self, slobj: slice, axis: int = 0, kind=None):
--> 820 slobj = self.index._convert_slice_indexer(slobj, kind=kind or "getitem")
821 return self._get_values(slobj)
822
~\Anaconda3\envs\tf2\lib\site-packages\pandas\core\indexes\base.py in _convert_slice_indexer(self, key, kind)
2943 indexer = key
2944 else:
-> 2945 indexer = self.slice_indexer(start, stop, step, kind=kind)
2946
2947 return indexer
~\Anaconda3\envs\tf2\lib\site-packages\pandas\core\indexes\datetimes.py in slice_indexer(self, start, end, step, kind)
806
807 try:
--> 808 return Index.slice_indexer(self, start, end, step, kind=kind)
809 except KeyError:
810 # For historical reasons DatetimeIndex by default supports
~\Anaconda3\envs\tf2\lib\site-packages\pandas\core\indexes\base.py in slice_indexer(self, start, end, step, kind)
4675 slice(1, 3)
4676 """
-> 4677 start_slice, end_slice = self.slice_locs(start, end, step=step, kind=kind)
4678
4679 # return a slice
~\Anaconda3\envs\tf2\lib\site-packages\pandas\core\indexes\base.py in slice_locs(self, start, end, step, kind)
4888 start_slice = None
4889 if start is not None:
-> 4890 start_slice = self.get_slice_bound(start, "left", kind)
4891 if start_slice is None:
4892 start_slice = 0
~\Anaconda3\envs\tf2\lib\site-packages\pandas\core\indexes\base.py in get_slice_bound(self, label, side, kind)
4800 # For datetime indices label may be a string that has to be converted
4801 # to datetime boundary according to its resolution.
-> 4802 label = self._maybe_cast_slice_bound(label, side, kind)
4803
4804 # we need to look up the label
~\Anaconda3\envs\tf2\lib\site-packages\pandas\core\indexes\datetimes.py in _maybe_cast_slice_bound(self, label, side, kind)
761 freq = getattr(self, "freqstr", getattr(self, "inferred_freq", None))
762 _, parsed, reso = parsing.parse_time_string(label, freq)
--> 763 lower, upper = self._parsed_string_to_bounds(reso, parsed)
764 # lower, upper form the half-open interval:
765 # [parsed, parsed + 1 freq)
~\Anaconda3\envs\tf2\lib\site-packages\pandas\core\indexes\datetimes.py in _parsed_string_to_bounds(self, reso, parsed)
569 if self.tz is None:
570 raise ValueError(
--> 571 "The index must be timezone aware when indexing "
572 "with a date string with a UTC offset"
573 )
ValueError: The index must be timezone aware when indexing with a date string with a UTC offset
I provide a limited part of this Series for reproducibility purposes (JSON format):
'{"1582539072500":1,"1582539073000":1,"1582539073500":1,"1582539074000":1,"1582539074500":1,"1582539075000":1,"1582539075500":1,"1582539076000":1,"1582539076500":1,"1582539077000":1,"1582539077500":1,"1582539078000":1,"1582539078500":1,"1582539080500":1,"1582539081000":1,"1582539081500":1,"1582539082000":1,"1582539082500":1,"1582539083000":1,"1582539083500":1,"1582539084000":1,"1582539084500":1,"1582539085000":1,"1582539085500":1,"1582539086000":1,"1582539086500":1,"1582539088500":1,"1582539089000":1,"1582539089500":1,"1582539090000":1,"1582539090500":1,"1582539091000":1,"1582539091500":1,"1582539092500":1,"1582539093000":1,"1582539093500":1,"1582539094000":1,"1582539094500":1,"1582539095000":1,"1582539095500":1,"1582539096000":1,"1582539097500":1,"1582539099500":1,"1582539101000":1,"1582539101500":1,"1582539104000":1,"1582539104500":1,"1582539105500":1,"1582539106000":1,"1582539109000":1}'
What causes the error, why do my actions not resolve it, and what should I do?
The error is "The index must be timezone aware when indexing with a date string with a UTC offset". However there may be a typo in your code. You have y['2020-02-24 10-11-12':] but there are hyphens in between the hours, minutes, and seconds. I reproduced the error you had and just replaced the time portion hyphens with colons and was able to get it to run on the sample data.
y['2020-02-24 10:11:12':] should work.
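As a quick check, here is a minimal sketch that rebuilds a Series from the JSON sample in the question (json_str below stands for that sample string) and slices it with colons in the time portion:
import json
import pandas as pd

raw = json.loads(json_str)  # json_str: the sample JSON string from the question
y = pd.Series(list(raw.values()),
              index=pd.to_datetime([int(k) for k in raw], unit="ms"))
# the index built this way is timezone-naive, matching the setup in the question

print(y['2020-02-24 10:11:12':].head())  # colons, not hyphens, in the time part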

tfidf first time, using it on a Pandas series that has a list per entry

The data looks like this:
data_clean2.head(3)
text target
0 [deed, reason, earthquak, may, allah, forgiv, u] 1
1 [forest, fire, near, la, rong, sask, canada] 1
2 [resid, ask, shelter, place, notifi, offic, evacu, shelter, place, order, expect] 1
I got this by tokenizing the sentences and then stemming and lemmatizing them. (I hope that is right.)
Now I want to use:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(data_clean2['text'])
It gives me the following error:
AttributeError Traceback (most recent call last)
<ipython-input-140-6f68d1115c5f> in <module>
1 vectorizer = TfidfVectorizer()
----> 2 vectors = vectorizer.fit_transform(data_clean2['text'])
~\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
1650 """
1651 self._check_params()
-> 1652 X = super().fit_transform(raw_documents)
1653 self._tfidf.fit(X)
1654 # X is already a transformed view of raw_documents so
~\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
1056
1057 vocabulary, X = self._count_vocab(raw_documents,
-> 1058 self.fixed_vocabulary_)
1059
1060 if self.binary:
~\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in _count_vocab(self, raw_documents, fixed_vocab)
968 for doc in raw_documents:
969 feature_counter = {}
--> 970 for feature in analyze(doc):
971 try:
972 feature_idx = vocabulary[feature]
~\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in <lambda>(doc)
350 tokenize)
351 return lambda doc: self._word_ngrams(
--> 352 tokenize(preprocess(self.decode(doc))), stop_words)
353
354 else:
~\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in <lambda>(x)
254
255 if self.lowercase:
--> 256 return lambda x: strip_accents(x.lower())
257 else:
258 return strip_accents
AttributeError: 'list' object has no attribute 'lower'
I know that I somehow cannot use it on a list, so what is my play here: should I turn the lists back into strings?
Yes, first convert to string using:
data_clean2['text'] = data_clean2['text'].apply(', '.join)
Then use:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(data_clean2['text'])
v = pd.DataFrame(vectors.toarray(), columns = vectorizer.get_feature_names())
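Alternatively (a sketch, not part of the original answer), you can skip the join and let TfidfVectorizer consume the token lists directly by passing a callable analyzer, which bypasses sklearn's own preprocessing and tokenization:
from sklearn.feature_extraction.text import TfidfVectorizer

# each "document" is already a list of tokens, so the analyzer just passes it through
vectorizer = TfidfVectorizer(analyzer=lambda tokens: tokens)
vectors = vectorizer.fit_transform(data_clean2['text'])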

fit_transform error using CountVectorizer

So I have a dataframe X which looks something like this:
X.head()
0 My wife took me here on my birthday for breakf...
1 I have no idea why some people give bad review...
3 Rosie, Dakota, and I LOVE Chaparral Dog Park!!...
4 General Manager Scott Petello is a good egg!!!...
6 Drop what you're doing and drive here. After I...
Name: text, dtype: object
And then,
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(X)
But I get this error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-61-8ff79b91e317> in <module>()
----> 1 X = cv.fit_transform(X)
~/anaconda3/lib/python3.6/site-packages/sklearn/feature_extraction/text.py in fit_transform(self, raw_documents, y)
867
868 vocabulary, X = self._count_vocab(raw_documents,
--> 869 self.fixed_vocabulary_)
870
871 if self.binary:
~/anaconda3/lib/python3.6/site-packages/sklearn/feature_extraction/text.py in _count_vocab(self, raw_documents, fixed_vocab)
790 for doc in raw_documents:
791 feature_counter = {}
--> 792 for feature in analyze(doc):
793 try:
794 feature_idx = vocabulary[feature]
~/anaconda3/lib/python3.6/site-packages/sklearn/feature_extraction/text.py in <lambda>(doc)
264
265 return lambda doc: self._word_ngrams(
--> 266 tokenize(preprocess(self.decode(doc))), stop_words)
267
268 else:
~/anaconda3/lib/python3.6/site-packages/sklearn/feature_extraction/text.py in <lambda>(x)
230
231 if self.lowercase:
--> 232 return lambda x: strip_accents(x.lower())
233 else:
234 return strip_accents
~/anaconda3/lib/python3.6/site-packages/scipy/sparse/base.py in __getattr__(self, attr)
574 return self.getnnz()
575 else:
--> 576 raise AttributeError(attr + " not found")
577
578 def transpose(self, axes=None, copy=False):
AttributeError: lower not found
No idea why.
You need to specify the column name of the text data, even if the dataframe has a single column.
X_countMatrix = cv.fit_transform(X['text'])
CountVectorizer expects an iterable of documents as input, and when you supply a dataframe as the argument, the only thing that gets iterated is the column names. So even if you had not gotten an error, the result would have been incorrect. It is lucky that you got an error and have a chance to correct it.
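To see why, it helps to look at what iterating a DataFrame actually yields (a minimal illustration with made-up data):
import pandas as pd

df = pd.DataFrame({"text": ["My wife took me here on my birthday",
                            "I have no idea why some people give bad reviews"]})

print(list(df))          # ['text']  -> iterating the DataFrame gives column names
print(list(df["text"]))  # the actual documents, which is what CountVectorizer needs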

tfidf vectorizer process shows error

I am working on non-English corpus analysis but am facing several problems. One of those problems is the tfidf_vectorizer. After importing the relevant libraries, I ran the following code to get results:
contents = [open("D:\test.txt", encoding='utf8').read()]

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                   min_df=0.2, stop_words=stopwords,
                                   use_idf=True, tokenizer=tokenize_and_stem,
                                   ngram_range=(3,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(contents)
print(tfidf_matrix.shape)
After running the above code I got the following error message:
ValueError Traceback (most recent call last)
<ipython-input-144-bbcec8b8c065> in <module>()
5 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(3,3))
6
----> 7 get_ipython().magic('time tfidf_matrix = tfidf_vectorizer.fit_transform(contents) #fit the vectorizer to synopses')
8
9 print(tfidf_matrix.shape)
C:\Users\mazhar\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py in magic(self, arg_s)
2156 magic_name, _, magic_arg_s = arg_s.partition(' ')
2157 magic_name = magic_name.lstrip(prefilter.ESC_MAGIC)
-> 2158 return self.run_line_magic(magic_name, magic_arg_s)
2159
2160 #-------------------------------------------------------------------------
C:\Users\mazhar\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py in run_line_magic(self, magic_name, line)
2077 kwargs['local_ns'] = sys._getframe(stack_depth).f_locals
2078 with self.builtin_trap:
-> 2079 result = fn(*args,**kwargs)
2080 return result
2081
<decorator-gen-60> in time(self, line, cell, local_ns)
C:\Users\mazhar\Anaconda3\lib\site-packages\IPython\core\magic.py in <lambda>(f, *a, **k)
186 # but it's overkill for just that one bit of state.
187 def magic_deco(arg):
--> 188 call = lambda f, *a, **k: f(*a, **k)
189
190 if callable(arg):
C:\Users\mazhar\Anaconda3\lib\site-packages\IPython\core\magics\execution.py in time(self, line, cell, local_ns)
1178 else:
1179 st = clock2()
-> 1180 exec(code, glob, local_ns)
1181 end = clock2()
1182 out = None
<timed exec> in <module>()
C:\Users\mazhar\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
1303 Tf-idf-weighted document-term matrix.
1304 """
-> 1305 X = super(TfidfVectorizer, self).fit_transform(raw_documents)
1306 self._tfidf.fit(X)
1307 # X is already a transformed view of raw_documents so
C:\Users\mazhar\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
836 max_doc_count,
837 min_doc_count,
--> 838 max_features)
839
840 self.vocabulary_ = vocabulary
C:\Users\mazhar\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in _limit_features(self, X, vocabulary, high, low, limit)
731 kept_indices = np.where(mask)[0]
732 if len(kept_indices) == 0:
--> 733 raise ValueError("After pruning, no terms remain. Try a lower"
734 " min_df or a higher max_df.")
735 return X[:, kept_indices], removed_terms
ValueError: After pruning, no terms remain. Try a lower min_df or a higher max_df.
If I change the min and max values, the error changes.
Assuming your tokeniser works as expected, I see two problems with your code. First, TfidfVectorizer expects a list of strings, whereas you are providing a single string (your contents list holds the whole file as one document). Second, min_df=0.2 is quite high: to be included, a term needs to occur in at least 20% of all documents, which is very unlikely for trigram features.
The following works for me
from sklearn.feature_extraction.text import TfidfVectorizer

with open("README.md") as infile:
    contents = infile.readlines()  # Note: readlines() instead of read()

tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                   min_df=2, use_idf=True, ngram_range=(3,3))
# note: minimum of 2 occurrences, rather than 0.2 (20% of all documents)

tfidf_matrix = tfidf_vectorizer.fit_transform(contents)
print(tfidf_matrix.shape)
outputs (155, 28)
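A side note on the min_df change: an integer is an absolute document count, while a float between 0 and 1 is a fraction of documents. A tiny sketch with a toy corpus (not from the question):
from sklearn.feature_extraction.text import TfidfVectorizer

toy = ["one two two", "two three", "three four"]

# int: keep terms that appear in at least 2 documents
print(sorted(TfidfVectorizer(min_df=2).fit(toy).vocabulary_))    # ['three', 'two']

# float: keep terms that appear in at least 20% of the documents (all of them here)
print(sorted(TfidfVectorizer(min_df=0.2).fit(toy).vocabulary_))  # ['four', 'one', 'three', 'two']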
