fit_transform error using CountVectorizer - python-3.x

So I have a dataframe X which looks something like this:
X.head()
0 My wife took me here on my birthday for breakf...
1 I have no idea why some people give bad review...
3 Rosie, Dakota, and I LOVE Chaparral Dog Park!!...
4 General Manager Scott Petello is a good egg!!!...
6 Drop what you're doing and drive here. After I...
Name: text, dtype: object
And then,
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(X)
But I get this error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-61-8ff79b91e317> in <module>()
----> 1 X = cv.fit_transform(X)
~/anaconda3/lib/python3.6/site-packages/sklearn/feature_extraction/text.py in fit_transform(self, raw_documents, y)
867
868 vocabulary, X = self._count_vocab(raw_documents,
--> 869 self.fixed_vocabulary_)
870
871 if self.binary:
~/anaconda3/lib/python3.6/site-packages/sklearn/feature_extraction/text.py in _count_vocab(self, raw_documents, fixed_vocab)
790 for doc in raw_documents:
791 feature_counter = {}
--> 792 for feature in analyze(doc):
793 try:
794 feature_idx = vocabulary[feature]
~/anaconda3/lib/python3.6/site-packages/sklearn/feature_extraction/text.py in <lambda>(doc)
264
265 return lambda doc: self._word_ngrams(
--> 266 tokenize(preprocess(self.decode(doc))), stop_words)
267
268 else:
~/anaconda3/lib/python3.6/site-packages/sklearn/feature_extraction/text.py in <lambda>(x)
230
231 if self.lowercase:
--> 232 return lambda x: strip_accents(x.lower())
233 else:
234 return strip_accents
~/anaconda3/lib/python3.6/site-packages/scipy/sparse/base.py in __getattr__(self, attr)
574 return self.getnnz()
575 else:
--> 576 raise AttributeError(attr + " not found")
577
578 def transpose(self, axes=None, copy=False):
AttributeError: lower not found
No idea why.

You need to specify the column name of the text data, even if the dataframe only has a single column:
X_countMatrix = cv.fit_transform(X['text'])
A CountVectorizer expects an iterable of strings as input, and when you pass a whole dataframe, the only thing that gets iterated over is the column names. So even if this had not raised an error, the result would have been wrong. It is lucky that you got an error and had a chance to correct it.
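As a minimal sketch of the corrected call (assuming the data is still called X with a text column), storing the result under a new name also keeps X as raw text, so re-running the cell never feeds an already-transformed sparse matrix back into fit_transform:

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
# Pass the column of strings, not the dataframe itself, and keep the
# sparse result in a new variable so X still holds the raw text.
X_countMatrix = cv.fit_transform(X['text'])
print(X_countMatrix.shape)  # (n_documents, n_vocabulary_terms)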

Related

How to perform Min Max Scaler on an array which contains columns with string and numbers?

Please, I really need your help: I'm struggling with MinMaxScaler. I would like to apply this technique to the array below, which contains columns with strings and numbers, but I only want to apply it to the columns that contain numbers.
clean_tweets_no_urls = pd.DataFrame(counts_no_urls.most_common(15),
                                    columns=['words', 'count'])
clean_tweets_no_urls.head()
That's my array
minmax_scaling(clean_tweets_no_urls, columns=['words', 'count'])
For that, I'm getting this result:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-108-eeb7b44d7121> in <module>
----> 1 minmax_scaling(clean_tweets_no_urls, columns=['words', 'count'])
C:\ProgramData\Anaconda3\lib\site-packages\mlxtend\preprocessing\scaling.py in minmax_scaling(array, columns, min_val, max_val)
36
37 """
---> 38 ary_new = array.astype(float)
39 if len(ary_new.shape) == 1:
40 ary_new = ary_new[:, np.newaxis]
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in astype(self, dtype, copy, errors)
5696 else:
5697 # else, only a single dtype is given
-> 5698 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors)
5699 return self._constructor(new_data).__finalize__(self)
5700
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in astype(self, dtype, copy, errors)
580
581 def astype(self, dtype, copy: bool = False, errors: str = "raise"):
--> 582 return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
583
584 def convert(self, **kwargs):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in apply(self, f, filter, **kwargs)
440 applied = b.apply(f, **kwargs)
441 else:
--> 442 applied = getattr(b, f)(**kwargs)
443 result_blocks = _extend_blocks(applied, result_blocks)
444
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py in astype(self, dtype, copy, errors)
623 vals1d = values.ravel()
624 try:
--> 625 values = astype_nansafe(vals1d, dtype, copy=True)
626 except (ValueError, TypeError):
627 # e.g. astype_nansafe can fail on object-dtype of strings
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\dtypes\cast.py in astype_nansafe(arr, dtype, copy, skipna)
895 if copy or is_object_dtype(arr) or is_object_dtype(dtype):
896 # Explicit copy, or required since NumPy can't view from / to object.
--> 897 return arr.astype(dtype, copy=True)
898
899 return arr.view(dtype)
ValueError: could not convert string to float: 'joebiden'
Apply the scaling only to the numeric column. Note that sklearn's minmax_scale is a function, not an estimator, so call it directly (or use the MinMaxScaler class if you want the fit/transform API):
from sklearn.preprocessing import minmax_scale
clean_tweets_no_urls[['count']] = minmax_scale(clean_tweets_no_urls[['count']])
If there are many columns, finding the numeric ones can be automated, as in the sketch below.
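A sketch of that automation (assuming the dataframe is still called clean_tweets_no_urls), using pandas' select_dtypes to pick out the numeric columns so only those get scaled:

from sklearn.preprocessing import minmax_scale

# Select only the numeric columns (here just 'count') and min-max scale them in place.
num_cols = clean_tweets_no_urls.select_dtypes(include='number').columns
clean_tweets_no_urls[num_cols] = minmax_scale(clean_tweets_no_urls[num_cols])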

tfidf first time, using it on a Pandas series that has a list per entry

Data looks like this :
data_clean2.head(3)
text target
0 [deed, reason, earthquak, may, allah, forgiv, u] 1
1 [forest, fire, near, la, rong, sask, canada] 1
2 [resid, ask, shelter, place, notifi, offic, evacu, shelter, place, order, expect] 1
I got this by tokenizing the sentences and then stemming and lemmatizing them. (Hope that is right.)
Now I want to use:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(data_clean2['text'])
It gives me the following error :
AttributeError Traceback (most recent call last)
<ipython-input-140-6f68d1115c5f> in <module>
1 vectorizer = TfidfVectorizer()
----> 2 vectors = vectorizer.fit_transform(data_clean2['text'])
~\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
1650 """
1651 self._check_params()
-> 1652 X = super().fit_transform(raw_documents)
1653 self._tfidf.fit(X)
1654 # X is already a transformed view of raw_documents so
~\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
1056
1057 vocabulary, X = self._count_vocab(raw_documents,
-> 1058 self.fixed_vocabulary_)
1059
1060 if self.binary:
~\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in _count_vocab(self, raw_documents, fixed_vocab)
968 for doc in raw_documents:
969 feature_counter = {}
--> 970 for feature in analyze(doc):
971 try:
972 feature_idx = vocabulary[feature]
~\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in <lambda>(doc)
350 tokenize)
351 return lambda doc: self._word_ngrams(
--> 352 tokenize(preprocess(self.decode(doc))), stop_words)
353
354 else:
~\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in <lambda>(x)
254
255 if self.lowercase:
--> 256 return lambda x: strip_accents(x.lower())
257 else:
258 return strip_accents
AttributeError: 'list' object has no attribute 'lower'
I know that I somehow cannot use it on a list, so what is my play here: turning the lists back into strings again?
Yes, first convert each list back to a string using:
data_clean2['text'] = data_clean2['text'].apply(', '.join)
Then use:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(data_clean2['text'])
v = pd.DataFrame(vectors.toarray(), columns = vectorizer.get_feature_names())
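Alternatively (a sketch, not part of the original answer), TfidfVectorizer accepts a callable analyzer, so the pre-tokenized lists can be kept as they are and the built-in lowercasing/tokenization skipped entirely:

from sklearn.feature_extraction.text import TfidfVectorizer

# Each document is already a list of tokens, so pass it through unchanged.
vectorizer = TfidfVectorizer(analyzer=lambda tokens: tokens)
vectors = vectorizer.fit_transform(data_clean2['text'])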

FeatureTools TypeError: unhashable type: 'set'

I'm trying this code for featuretools:
features, feature_names = ft.dfs(entityset=es, target_entity='demo',
                                 agg_primitives=['count', 'max', 'time_since_first', 'median',
                                                 'time_since_last', 'avg_time_between',
                                                 'sum', 'mean'],
                                 trans_primitives=['is_weekend', 'year', 'week',
                                                   'divide_by_feature', 'percentile'])
But I had this error
TypeError Traceback (most recent call last)
<ipython-input-17-89e925ff895d> in <module>
3 agg_primitives = ['count', 'max', 'time_since_first', 'median', 'time_since_last', 'avg_time_between',
4 'sum', 'mean'],
----> 5 trans_primitives = ['is_weekend', 'year', 'week', 'divide_by_feature', 'percentile'])
~/.local/lib/python3.6/site-packages/featuretools/utils/entry_point.py in function_wrapper(*args, **kwargs)
44 ep.on_error(error=e,
45 runtime=runtime)
---> 46 raise e
47
48 # send return value
~/.local/lib/python3.6/site-packages/featuretools/utils/entry_point.py in function_wrapper(*args, **kwargs)
36 # call function
37 start = time.time()
---> 38 return_value = func(*args, **kwargs)
39 runtime = time.time() - start
40 except Exception as e:
~/.local/lib/python3.6/site-packages/featuretools/synthesis/dfs.py in dfs(entities, relationships, entityset, target_entity, cutoff_time, instance_ids, agg_primitives, trans_primitives, groupby_trans_primitives, allowed_paths, max_depth, ignore_entities, ignore_variables, seed_features, drop_contains, drop_exact, where_primitives, max_features, cutoff_time_in_index, save_progress, features_only, training_window, approximate, chunk_size, n_jobs, dask_kwargs, verbose, return_variable_types)
226 n_jobs=n_jobs,
227 dask_kwargs=dask_kwargs,
--> 228 verbose=verbose)
229 return feature_matrix, features
~/.local/lib/python3.6/site-packages/featuretools/computational_backends/calculate_feature_matrix.py in calculate_feature_matrix(features, entityset, cutoff_time, instance_ids, entities, relationships, cutoff_time_in_index, training_window, approximate, save_progress, verbose, chunk_size, n_jobs, dask_kwargs)
265 cutoff_df_time_var=cutoff_df_time_var,
266 target_time=target_time,
--> 267 pass_columns=pass_columns)
268
269 feature_matrix = pd.concat(feature_matrix)
~/.local/lib/python3.6/site-packages/featuretools/computational_backends/calculate_feature_matrix.py in linear_calculate_chunks(chunks, feature_set, approximate, training_window, verbose, save_progress, entityset, no_unapproximated_aggs, cutoff_df_time_var, target_time, pass_columns)
496 no_unapproximated_aggs,
497 cutoff_df_time_var,
--> 498 target_time, pass_columns)
499 feature_matrix.append(_feature_matrix)
500 # Do a manual garbage collection in case objects from calculate_chunk
~/.local/lib/python3.6/site-packages/featuretools/computational_backends/calculate_feature_matrix.py in calculate_chunk(chunk, feature_set, entityset, approximate, training_window, verbose, save_progress, no_unapproximated_aggs, cutoff_df_time_var, target_time, pass_columns)
341 ids,
342 precalculated_features=precalculated_features_trie,
--> 343 training_window=window)
344
345 id_name = _feature_matrix.index.name
~/.local/lib/python3.6/site-packages/featuretools/computational_backends/utils.py in wrapped(*args, **kwargs)
35 def wrapped(*args, **kwargs):
36 if save_progress is None:
---> 37 r = method(*args, **kwargs)
38 else:
39 time = args[0].to_pydatetime()
~/.local/lib/python3.6/site-packages/featuretools/computational_backends/calculate_feature_matrix.py in calc_results(time_last, ids, precalculated_features, training_window)
316 ignored=all_approx_feature_set)
317
--> 318 matrix = calculator.run(ids)
319 return matrix
320
~/.local/lib/python3.6/site-packages/featuretools/computational_backends/feature_set_calculator.py in run(self, instance_ids)
100 precalculated_trie=self.precalculated_features,
101 filter_variable=target_entity.index,
--> 102 filter_values=instance_ids)
103
104 # The dataframe for the target entity should be stored at the root of
~/.local/lib/python3.6/site-packages/featuretools/computational_backends/feature_set_calculator.py in _calculate_features_for_entity(self, entity_id, feature_trie, df_trie, full_entity_df_trie, precalculated_trie, filter_variable, filter_values, parent_data)
187 columns=columns,
188 time_last=self.time_last,
--> 189 training_window=self.training_window)
190
191 # Step 2: Add variables to the dataframe linking it to all ancestors.
~/.local/lib/python3.6/site-packages/featuretools/entityset/entity.py in query_by_values(self, instance_vals, variable_id, columns, time_last, training_window)
271
272 if columns is not None:
--> 273 df = df[columns]
274
275 return df
~/.local/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
2686 return self._getitem_multilevel(key)
2687 else:
-> 2688 return self._getitem_column(key)
2689
2690 def _getitem_column(self, key):
~/.local/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_column(self, key)
2693 # get column
2694 if self.columns.is_unique:
-> 2695 return self._get_item_cache(key)
2696
2697 # duplicate columns & possible reduce dimensionality
~/.local/lib/python3.6/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
2485 """Return the cached item, item represents a label indexer."""
2486 cache = self._item_cache
-> 2487 res = cache.get(item)
2488 if res is None:
2489 values = self._data.get(item)
TypeError: unhashable type: 'set'
I also tried the simplest code for deep feature synthesis (dfs), shown below, but it still ran into the same error:
features, feature_names = ft.dfs(entityset=es, target_entity='demo')
I'm not really sure why I'm getting this error; any help or recommendations on how to proceed from here are deeply appreciated.
Thanks in advance for your help!
I found a solution: my installed version had a bug that has since been fixed by the Featuretools team. Just pip install directly from master:
pip install --upgrade https://github.com/featuretools/featuretools/zipball/master
The fix has also been released in Featuretools 0.9.1, so upgrading to the latest version makes the error go away.
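After upgrading, a quick check (a minimal sketch) confirms the installed version is at least 0.9.1 before re-running dfs:

import featuretools as ft
print(ft.__version__)  # should print 0.9.1 or later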

'<' not supported between instances of 'float' and 'str' Error for Tukey HSD Test

I get a strange error when running the Tukey test, and I hope somebody is able to help me with this as I have already tried a lot. This is my dataframe:
Name Score
1 A 2.29
2 B 2.19
This is my Tukey Test code:
#TUKEY HSD TEST
tukey = pairwise_tukeyhsd(endog=df['Score'].astype('float'),
                          groups=df['Name'],
                          alpha=0.05)
tukey.plot_simultaneous()
plt.vlines(x=49.57,ymin=-0.5,ymax=4.5, color="red")
tukey.summary()
This is the error:
<ipython-input-12-3e12e78a002f> in <module>()
2 tukey = pairwise_tukeyhsd(endog=df['Score'].astype('float'),
3 groups=df['Name'],
----> 4 alpha=0.05)
5
6 tukey.plot_simultaneous()
/usr/local/lib/python3.6/dist-packages/statsmodels/stats/multicomp.py in pairwise_tukeyhsd(endog, groups, alpha)
36 '''
37
---> 38 return MultiComparison(endog, groups).tukeyhsd(alpha=alpha)
/usr/local/lib/python3.6/dist-packages/statsmodels/sandbox/stats/multicomp.py in __init__(self, data, groups, group_order)
794 if group_order is None:
795 self.groupsunique, self.groupintlab = np.unique(groups,
--> 796 return_inverse=True)
797 else:
798 #check if group_order has any names not in groups
/usr/local/lib/python3.6/dist-packages/numpy/lib/arraysetops.py in unique(ar, return_index, return_inverse, return_counts, axis)
221 ar = np.asanyarray(ar)
222 if axis is None:
--> 223 return _unique1d(ar, return_index, return_inverse, return_counts)
224 if not (-ar.ndim <= axis < ar.ndim):
225 raise ValueError('Invalid axis kwarg specified for unique')
/usr/local/lib/python3.6/dist-packages/numpy/lib/arraysetops.py in _unique1d(ar, return_index, return_inverse, return_counts)
278
279 if optional_indices:
--> 280 perm = ar.argsort(kind='mergesort' if return_index else 'quicksort')
281 aux = ar[perm]
282 else:
TypeError: '<' not supported between instances of 'float' and 'str'
How can this error be resolved? Thanks in advance!
The problem arises because df['Name'] contains both floats and strings and is of type pandas.core.series.Series. That combination makes numpy.unique() fail, as seen in the traceback. You can fix the problem in two ways.
Either pass a plain list instead of a Series:
tukey = pairwise_tukeyhsd(endog=df['Score'].astype('float'),
                          groups=list(df['Name']),  # list instead of a Series
                          alpha=0.05)
Or make sure df['Name'] contains only numbers or only strings, as in the sketch below.
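A minimal sketch of the second option (assuming df is the dataframe from the question), casting the grouping column to a single type before running the test:

from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Cast the mixed float/str column to one type so numpy.unique can sort it.
df['Name'] = df['Name'].astype(str)

tukey = pairwise_tukeyhsd(endog=df['Score'].astype('float'),
                          groups=df['Name'],
                          alpha=0.05)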

tfidf vectorizer process shows error

I am working on a non-English corpus analysis but am facing several problems. One of those problems is with the tfidf_vectorizer. After importing the relevant libraries, I ran the following code to get results:
contents = [open(r"D:\test.txt", encoding='utf8').read()]
#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                   min_df=0.2, stop_words=stopwords,
                                   use_idf=True, tokenizer=tokenize_and_stem,
                                   ngram_range=(3,3))
%time tfidf_matrix = tfidf_vectorizer.fit_transform(contents)
print(tfidf_matrix.shape)
After running the above code, I got the following error message:
ValueError Traceback (most recent call last)
<ipython-input-144-bbcec8b8c065> in <module>()
5 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(3,3))
6
----> 7 get_ipython().magic('time tfidf_matrix = tfidf_vectorizer.fit_transform(contents) #fit the vectorizer to synopses')
8
9 print(tfidf_matrix.shape)
C:\Users\mazhar\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py in magic(self, arg_s)
2156 magic_name, _, magic_arg_s = arg_s.partition(' ')
2157 magic_name = magic_name.lstrip(prefilter.ESC_MAGIC)
-> 2158 return self.run_line_magic(magic_name, magic_arg_s)
2159
2160 #-------------------------------------------------------------------------
C:\Users\mazhar\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py in run_line_magic(self, magic_name, line)
2077 kwargs['local_ns'] = sys._getframe(stack_depth).f_locals
2078 with self.builtin_trap:
-> 2079 result = fn(*args,**kwargs)
2080 return result
2081
<decorator-gen-60> in time(self, line, cell, local_ns)
C:\Users\mazhar\Anaconda3\lib\site-packages\IPython\core\magic.py in <lambda>(f, *a, **k)
186 # but it's overkill for just that one bit of state.
187 def magic_deco(arg):
--> 188 call = lambda f, *a, **k: f(*a, **k)
189
190 if callable(arg):
C:\Users\mazhar\Anaconda3\lib\site-packages\IPython\core\magics\execution.py in time(self, line, cell, local_ns)
1178 else:
1179 st = clock2()
-> 1180 exec(code, glob, local_ns)
1181 end = clock2()
1182 out = None
<timed exec> in <module>()
C:\Users\mazhar\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
1303 Tf-idf-weighted document-term matrix.
1304 """
-> 1305 X = super(TfidfVectorizer, self).fit_transform(raw_documents)
1306 self._tfidf.fit(X)
1307 # X is already a transformed view of raw_documents so
C:\Users\mazhar\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
836 max_doc_count,
837 min_doc_count,
--> 838 max_features)
839
840 self.vocabulary_ = vocabulary
C:\Users\mazhar\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in _limit_features(self, X, vocabulary, high, low, limit)
731 kept_indices = np.where(mask)[0]
732 if len(kept_indices) == 0:
--> 733 raise ValueError("After pruning, no terms remain. Try a lower"
734 " min_df or a higher max_df.")
735 return X[:, kept_indices], removed_terms
ValueError: After pruning, no terms remain. Try a lower min_df or a higher max_df.
If I change the min and max values, the error is different.
Assuming your tokeniser works as expected, I see two problems with your code. First, TfidfVectorizer expects a list of strings, whereas you are providing a single string. Second, min_df=0.2 is quite high: to be included, a term needs to occur in 20% of all documents, which is very unlikely for trigram features.
The following works for me:
from sklearn.feature_extraction.text import TfidfVectorizer

with open("README.md") as infile:
    contents = infile.readlines()  # Note: readlines() instead of read()

tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                   min_df=2, use_idf=True, ngram_range=(3,3))
# note: minimum of 2 occurrences, rather than 0.2 (20% of all documents)
tfidf_matrix = tfidf_vectorizer.fit_transform(contents)
print(tfidf_matrix.shape)
This outputs (155, 28).
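Applied to the original corpus, a sketch under the assumption that D:\test.txt holds one document per line (and dropping the custom tokenizer and stop word list for brevity):

from sklearn.feature_extraction.text import TfidfVectorizer

# One document per line rather than a single string for the whole file.
with open(r"D:\test.txt", encoding="utf8") as infile:
    contents = infile.readlines()

tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                   min_df=2, use_idf=True, ngram_range=(3, 3))
tfidf_matrix = tfidf_vectorizer.fit_transform(contents)
print(tfidf_matrix.shape)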
