How to fix this ValueError? - python-3.x

I am trying to run Python code, based mostly on the NLTK book, for n-gram POS tagging of a Gujarati-language text from my GujaratiTextCorpus, and I encountered a ValueError.
I am working with Python 3.7.3 on Windows 10 and use Jupyter Notebook through Anaconda. I am a beginner with Python. I studied the answers available on stackoverflow.com to fix my ValueError, but could not resolve it.
import nltk
f = open('C:\\Users\\BHOGAYATA\\Documents\\GujaratiPosTagging\\cts260.txt', encoding = 'utf8')
raw = f.read()
train2_sents = nltk.sent_tokenize(raw)
text2 = nltk.Text(train2_sents)
train2_sents
import nltk
f = open('C:\\Users\\BHOGAYATA\\Documents\\GujaratiPosTagging\\txt42_sents.txt', encoding = 'utf8')
raw = f.read()
bs_sents = nltk.sent_tokenize(raw)
text3 = nltk.Text(bs_sents)
bs_sents
unigram_tagger = nltk.UnigramTagger(train2_sents)
unigram_tagger.tag(bs_sents)
I expected the words of the two Gujarati sentences to be POS-tagged. Instead, I got the following error message:
ValueError Traceback (most recent call last)
<ipython-input-3-5fae0b92393e> in <module>
11 text3 = nltk.Text(bs_sents)
12 bs_sents
---> 13 unigram_tagger = nltk.UnigramTagger(train2_sents)
14 unigram_tagger.tag(bs_sents)
15
~\Anaconda3\lib\site-packages\nltk\tag\sequential.py in __init__(self, train, model, backoff, cutoff, verbose)
344
345 def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False):
--> 346 NgramTagger.__init__(self, 1, train, model, backoff, cutoff, verbose)
347
348 def encode_json_obj(self):
~\Anaconda3\lib\site-packages\nltk\tag\sequential.py in __init__(self, n, train, model, backoff, cutoff, verbose)
293
294 if train:
--> 295 self._train(train, cutoff, verbose)
296
297 def encode_json_obj(self):
~\Anaconda3\lib\site-packages\nltk\tag\sequential.py in _train(self, tagged_corpus, cutoff, verbose)
181 fd = ConditionalFreqDist()
182 for sentence in tagged_corpus:
--> 183 tokens, tags = zip(*sentence)
184 for index, (token, tag) in enumerate(sentence):
185 # Record the event.
ValueError: not enough values to unpack (expected 2, got 1)

It means the items you are iterating over yield one value each, but you are trying to unpack two.
Ex:
for a, b in [("a", "b")]:
    print("a:", a, "b:", b)
This will work.
for a, b in [("a")]:
    print("a:", a, "b:", b)
This will not work, because ("a") is just the string "a", so there is only one value to unpack.
Edit:
Look at your UnigramTagger.
Its first argument must be a list of tagged sentences of type
list(list(tuple(str, str)))
but you are giving it train2_sents, which comes from nltk.sent_tokenize(raw) and is therefore a list of plain sentence strings, i.e.
list(str)
Each sentence string then gets unpacked character by character instead of into (word, tag) pairs, which raises the ValueError. You need training data that is already POS-tagged.
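For illustration, here is a minimal sketch of the structure UnigramTagger expects. The words and tags below are hypothetical placeholders, not a real Gujarati tagset; training data in this shape would normally come from a manually tagged corpus.
import nltk

# Training data must be list(list(tuple(str, str))):
# a list of sentences, each sentence a list of (word, tag) pairs.
train_sents = [
    [("this", "DET"), ("is", "VERB"), ("a", "DET"), ("sentence", "NOUN")],
    [("another", "DET"), ("sentence", "NOUN"), ("here", "ADV")],
]

unigram_tagger = nltk.UnigramTagger(train_sents)

# tag() takes one sentence as a list of plain tokens, not a list of sentence strings
print(unigram_tagger.tag(["a", "sentence", "here"]))
# [('a', 'DET'), ('sentence', 'NOUN'), ('here', 'ADV')]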

Related

Cannot interpret SVM model using Shapash

Currently, I'm exploring machine learning interpretability tools for one of my projects. I found Shapash, a fairly new tool, and many people suggest using it to create a few easily interpretable charts for an ML model. When I tried it with RandomForestClassifier it worked fine and generated a webpage full of different charts, but I cannot achieve the same with SVM (I'm just exploring this library, not focusing on the perfect ML model for the problem).
Note: using Shapash, link here
#Fit blackbox model
from sklearn import svm
from sklearn.metrics import f1_score, accuracy_score
from shapash import SmartExplainer

svc = svm.SVC()
svc.fit(X_train_smote, y_train_smote)
y_pred = svc.predict(X_test)
print(f"F1 Score {f1_score(y_test, y_pred, average='macro')}")
print(f"Accuracy {accuracy_score(y_test, y_pred)}")
xpl = SmartExplainer(model=svc)
The error which I'm getting:
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
/tmp/ipykernel_13648/1233939729.py in <module>
----> 1 xpl = SmartExplainer(model=svc)
~/Python_AI/ai_env/lib/python3.8/site-packages/shapash/explainer/smart_explainer.py in __init__(self, model, backend, preprocessing, postprocessing, features_groups, features_dict, label_dict, title_story, palette_name, colors_dict, **kwargs)
194 if isinstance(backend, str):
195 backend_cls = get_backend_cls_from_name(backend)
--> 196 self.backend = backend_cls(
197 model=self.model, preprocessing=preprocessing, **kwargs)
198 elif isinstance(backend, BaseBackend):
~/Python_AI/ai_env/lib/python3.8/site-packages/shapash/backend/shap_backend.py in __init__(self, model, preprocessing, explainer_args, explainer_compute_args)
16 self.explainer_args = explainer_args if explainer_args else {}
17 self.explainer_compute_args = explainer_compute_args if explainer_compute_args else {}
---> 18 self.explainer = shap.Explainer(model=model, **self.explainer_args)
19
20 def run_explainer(self, x: pd.DataFrame) -> dict:
~/Python_AI/ai_env/lib/python3.8/site-packages/shap/explainers/_explainer.py in __init__(self, model, masker, link, algorithm, output_names, feature_names, **kwargs)
166 # if we get here then we don't know how to handle what was given to us
167 else:
--> 168 raise Exception("The passed model is not callable and cannot be analyzed directly with the given masker! Model: " + str(model))
169
170 # build the right subclass
Exception: The passed model is not callable and cannot be analyzed directly with the given masker! Model: SVC()
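A hedged side note on the error itself: shap.Explainer (which Shapash's shap backend calls internally, as the traceback shows) only accepts models it recognizes or a plain callable, and a fitted SVC() is neither. A common model-agnostic workaround with shap directly is to explain a prediction function rather than the estimator object. The sketch below uses synthetic data in place of X_train_smote/X_test and bypasses Shapash entirely, so it only illustrates the underlying issue, not a Shapash-specific fix.
import shap
from sklearn import svm
from sklearn.datasets import make_classification

# Hypothetical stand-in data for X_train_smote / X_test
X, y = make_classification(n_samples=200, n_features=5, random_state=0)
svc = svm.SVC().fit(X, y)

# Pass a callable plus background data so shap chooses a model-agnostic explainer
explainer = shap.Explainer(svc.decision_function, X[:100])
shap_values = explainer(X[:10])
print(shap_values.shape)  # (10, 5)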

sklearn DictVectorizer() throwing error with a dictionary as input

I'm fairly new to sklearn's DictVectorizer, and am trying to create a function where DictVectorizer will output feature names from a list of bigrams that I have used to form a feature dictionary. The input to my function is a string, and the function should return a list of bigrams formed into dictionaries (something like this).
def features(str) -> List[Dict[Text, Union[Text, int]]]:
    # my feature dictionary should have 'bigram' as the key, and the values will be the bigrams themselves
    # bigram: a form of "w[i]-w[i+1]"
    # This is my bigram list (as structured above)
    bigrams: List[Dict[Text, Union[Text, int]]] = []
    # here is my code:
    bigrams = {'bigram': i for j in sentence
               for i in zip(j.split(" ")[:-1], j.split(" ")[1:])}
    return bigrams
vect = DictVectorizer(sparse=False)
text = str()
feature_catalog = features(text)
vect.fit(feature_catalog)
print(sorted(vectorizer.get_feature_names_out()))
Everything works fine until the code advances to the DictVectorizer blocks (hidden in the class itself). This is what I get:
AttributeError Traceback (most recent call last)
/var/folders/pl/k80fpf9s4f9_3rp8hnpw5x0m0000gq/T/ipykernel_3804/266218402.py in <module>
22 features = get_feature(text)
23
---> 24 vectorizer.fit(features)
25
26 print(sorted(vectorizer.get_feature_names()))
/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/feature_extraction/_dict_vectorizer.py in fit(self, X, y)
159
160 for x in X:
--> 161 for f, v in x.items():
162 if isinstance(v, str):
163 feature_name = "%s%s%s" % (f, self.separator, v)
AttributeError: 'str' object has no attribute 'items'
Any ideas? This is ultimately going to be used as part of a larger processing effort on a corpus.
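The traceback shows DictVectorizer iterating over strings: fit() expects an iterable of dicts, but the features function returns a single dict, so the loop sees its string keys. A minimal sketch of one way to restructure it (a hypothetical bigram scheme with one dict per bigram, not necessarily the exact feature design intended):
from typing import Dict, List, Text
from sklearn.feature_extraction import DictVectorizer

def features(text: Text) -> List[Dict[Text, Text]]:
    # one dict per bigram, so DictVectorizer receives a list of mappings
    words = text.split()
    return [{"bigram": f"{w1}-{w2}"} for w1, w2 in zip(words[:-1], words[1:])]

vect = DictVectorizer(sparse=False)
vect.fit(features("the quick brown fox"))
print(sorted(vect.get_feature_names_out()))
# ['bigram=brown-fox', 'bigram=quick-brown', 'bigram=the-quick']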

How to encode empty string using BERT

I have recently been trying to encode an empty string with CamemBERT (a BERT model for French), but I wasn't sure how to do it. If I try to simply encode an empty string,
from transformers import CamembertModel, CamembertTokenizer
import torch
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
camembert = CamembertModel.from_pretrained("camembert-base")
tokenized_sentence = tokenizer.tokenize("")
encoded_sentence = tokenizer.encode(tokenized_sentence, return_tensors='pt')
embeddings = camembert(encoded_sentence)
embeddings.last_hidden_state.squeeze()[0] # embedding of the CLS token
I get the error
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-21-553400f369a8> in <module>
1 # Tokenize in sub-words with SentencePiece
2 tokenized_sentence = tokenizer.tokenize("")
----> 3 encoded_sentence = tokenizer.encode(tokenized_sentence, return_tensors='pt')
4 embeddings = camembert(encoded_sentence)
5 embeddings.last_hidden_state.squeeze()[0] # embeddings.last_hidden_state[0][0]
~/anaconda3/envs/r_nlp2/lib/python3.8/site-packages/transformers/tokenization_utils_base.py in encode(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, return_tensors, **kwargs)
2057 ``convert_tokens_to_ids`` method).
2058 """
-> 2059 encoded_inputs = self.encode_plus(
2060 text,
2061 text_pair=text_pair,
~/anaconda3/envs/r_nlp2/lib/python3.8/site-packages/transformers/tokenization_utils_base.py in encode_plus(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)
2376 )
2377
-> 2378 return self._encode_plus(
2379 text=text,
2380 text_pair=text_pair,
~/anaconda3/envs/r_nlp2/lib/python3.8/site-packages/transformers/tokenization_utils.py in _encode_plus(self, text, text_pair, add_special_tokens, padding_strategy, truncation_strategy, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)
459 )
460
--> 461 first_ids = get_input_ids(text)
462 second_ids = get_input_ids(text_pair) if text_pair is not None else None
463
~/anaconda3/envs/r_nlp2/lib/python3.8/site-packages/transformers/tokenization_utils.py in get_input_ids(text)
446 )
447 else:
--> 448 raise ValueError(
449 f"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
450 )
ValueError: Input [] is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.
Which I think is expected behavior. I have also tried spaCy's French transformer model but was similarly unsuccessful. Here's the code I used for spaCy:
from transformers import BertTokenizer, BertModel
import spacy
#!python -m spacy download fr_dep_news_trf
trf_fr = spacy.load("fr_dep_news_trf")
example = trf_fr("")
example._.trf_data.tensors[1].flatten() # embedding of the CLS token
And the error is
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-27-c53de04d2e6f> in <module>
1 example = trf_fr("")
----> 2 example._.trf_data.tensors[1].flatten()
IndexError: list index out of range
simply because the model returns [].
I guess that at this point my question is theoretical: what would be the best, or at least a good, way to encode an empty string using CamemBERT or spaCy? Would "forcing" the model to return a vector of zeros be a good thing? Would returning "impossible" values such as (10, ..., 10) be a good possibility? Should I force the tokenizer to create a sequence of [PAD] tokens? In that case, how would I implement that using spaCy and/or CamemBERT?
Thanks!
PS: I'm using
Python 3.8.10
spaCy 3.0.6
transformers 4.6.1
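One possible workaround, sketched under the versions listed above: pass the raw empty string to the tokenizer (rather than a pre-tokenized empty list), so only the special tokens are encoded and the <s> embedding stays well-defined. Whether a special-tokens-only embedding is a sensible representation of an empty string is still the theoretical question being asked.
from transformers import CamembertModel, CamembertTokenizer
import torch

tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
camembert = CamembertModel.from_pretrained("camembert-base")

# Encoding the raw empty string yields just the special tokens (<s>, </s>)
encoded = tokenizer("", return_tensors="pt")
with torch.no_grad():
    output = camembert(**encoded)
cls_embedding = output.last_hidden_state[0, 0]  # embedding of the <s> token
print(cls_embedding.shape)  # torch.Size([768])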

tfidf first time, using it on a Pandas series that has a list per entry

The data looks like this:
data_clean2.head(3)
text target
0 [deed, reason, earthquak, may, allah, forgiv, u] 1
1 [forest, fire, near, la, rong, sask, canada] 1
2 [resid, ask, shelter, place, notifi, offic, evacu, shelter, place, order, expect] 1
I got this by stemming and lemmatizing the sentences, having tokenized them before that. (Hope that is right.)
Now I want to use:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(data_clean2['text'])
It gives me the following error:
AttributeError Traceback (most recent call last)
<ipython-input-140-6f68d1115c5f> in <module>
1 vectorizer = TfidfVectorizer()
----> 2 vectors = vectorizer.fit_transform(data_clean2['text'])
~\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
1650 """
1651 self._check_params()
-> 1652 X = super().fit_transform(raw_documents)
1653 self._tfidf.fit(X)
1654 # X is already a transformed view of raw_documents so
~\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
1056
1057 vocabulary, X = self._count_vocab(raw_documents,
-> 1058 self.fixed_vocabulary_)
1059
1060 if self.binary:
~\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in _count_vocab(self, raw_documents, fixed_vocab)
968 for doc in raw_documents:
969 feature_counter = {}
--> 970 for feature in analyze(doc):
971 try:
972 feature_idx = vocabulary[feature]
~\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in <lambda>(doc)
350 tokenize)
351 return lambda doc: self._word_ngrams(
--> 352 tokenize(preprocess(self.decode(doc))), stop_words)
353
354 else:
~\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in <lambda>(x)
254
255 if self.lowercase:
--> 256 return lambda x: strip_accents(x.lower())
257 else:
258 return strip_accents
AttributeError: 'list' object has no attribute 'lower'
I know that I somehow cannot use it on a list, so what is my play here: turning the list back into a string again?
Yes, first convert to string using:
data_clean2['text'] = data_clean2['text'].apply(', '.join)
Then use:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(data_clean2['text'])
v = pd.DataFrame(vectors.toarray(), columns = vectorizer.get_feature_names())
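An alternative sketch that keeps the token lists instead of re-joining them: TfidfVectorizer accepts a callable analyzer, so each pre-tokenized list can be passed through unchanged. The data below is a hypothetical stand-in for data_clean2['text'].
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

docs = pd.Series([
    ["deed", "reason", "earthquak"],
    ["forest", "fire", "near", "la"],
])

# identity analyzer: each entry is treated as an already-analyzed document
vectorizer = TfidfVectorizer(analyzer=lambda tokens: tokens)
vectors = vectorizer.fit_transform(docs)
print(vectorizer.get_feature_names_out())  # get_feature_names() on older sklearn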

tfidf vectorizer process shows error

I am working on a non-English corpus analysis but am facing several problems. One of those problems is with tfidf_vectorizer. After importing the relevant libraries, I ran the following code to get results:
contents = [open("D:\test.txt", encoding='utf8').read()]
#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
min_df=0.2, stop_words=stopwords,
use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(3,3))
%time tfidf_matrix = tfidf_vectorizer.fit_transform(contents)
print(tfidf_matrix.shape)
After running the above code, I got the following error message:
ValueError Traceback (most recent call last)
<ipython-input-144-bbcec8b8c065> in <module>()
5 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(3,3))
6
----> 7 get_ipython().magic('time tfidf_matrix = tfidf_vectorizer.fit_transform(contents) #fit the vectorizer to synopses')
8
9 print(tfidf_matrix.shape)
C:\Users\mazhar\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py in magic(self, arg_s)
2156 magic_name, _, magic_arg_s = arg_s.partition(' ')
2157 magic_name = magic_name.lstrip(prefilter.ESC_MAGIC)
-> 2158 return self.run_line_magic(magic_name, magic_arg_s)
2159
2160 #-------------------------------------------------------------------------
C:\Users\mazhar\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py in run_line_magic(self, magic_name, line)
2077 kwargs['local_ns'] = sys._getframe(stack_depth).f_locals
2078 with self.builtin_trap:
-> 2079 result = fn(*args,**kwargs)
2080 return result
2081
<decorator-gen-60> in time(self, line, cell, local_ns)
C:\Users\mazhar\Anaconda3\lib\site-packages\IPython\core\magic.py in <lambda>(f, *a, **k)
186 # but it's overkill for just that one bit of state.
187 def magic_deco(arg):
--> 188 call = lambda f, *a, **k: f(*a, **k)
189
190 if callable(arg):
C:\Users\mazhar\Anaconda3\lib\site-packages\IPython\core\magics\execution.py in time(self, line, cell, local_ns)
1178 else:
1179 st = clock2()
-> 1180 exec(code, glob, local_ns)
1181 end = clock2()
1182 out = None
<timed exec> in <module>()
C:\Users\mazhar\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
1303 Tf-idf-weighted document-term matrix.
1304 """
-> 1305 X = super(TfidfVectorizer, self).fit_transform(raw_documents)
1306 self._tfidf.fit(X)
1307 # X is already a transformed view of raw_documents so
C:\Users\mazhar\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
836 max_doc_count,
837 min_doc_count,
--> 838 max_features)
839
840 self.vocabulary_ = vocabulary
C:\Users\mazhar\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in _limit_features(self, X, vocabulary, high, low, limit)
731 kept_indices = np.where(mask)[0]
732 if len(kept_indices) == 0:
--> 733 raise ValueError("After pruning, no terms remain. Try a lower"
734 " min_df or a higher max_df.")
735 return X[:, kept_indices], removed_terms
ValueError: After pruning, no terms remain. Try a lower min_df or a higher max_df.
If I change the min and max values, the error is
Assuming your tokeniser works as expected, I see two problems with your code. First, TfidfVectorizer expects a list of documents, whereas you are providing a single document (the whole file read into one string); with only one document, max_df=0.8 allows a maximum document count of 0, so every term is pruned. Second, min_df=0.2 is quite high: to be included, a term needs to occur in 20% of all documents, which is very unlikely for trigram features.
The following works for me
from sklearn.feature_extraction.text import TfidfVectorizer

with open("README.md") as infile:
    contents = infile.readlines()  # Note: readlines() instead of read()

tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                   min_df=2, use_idf=True, ngram_range=(3, 3))
# note: minimum of 2 occurrences, rather than 0.2 (20% of all documents)
tfidf_matrix = tfidf_vectorizer.fit_transform(contents)
print(tfidf_matrix.shape)
outputs (155, 28)
