I am trying to use the CelebA dataset in a deep learning project. I have the zipped folder from Kaggle.
I wanted to unzip it and then split the images into training, validation, and testing sets, but I found out that this would not be feasible on my not-so-powerful system.
So, to avoid wasting time, I wanted to use the tensorflow-datasets (tfds) method to load the CelebA dataset. Unfortunately, the dataset is inaccessible and loading fails with the following error:
(Code first)
ds = tfds.load('celeb_a', split='train', download=True)
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-69-d7b9371eb674> in <module>
----> 1 ds = tfds.load('celeb_a', split='train', download=True)
c:\users\aman\appdata\local\programs\python\python38\lib\site-packages\tensorflow_datasets\core\load.py in load(name, split, data_dir, batch_size, shuffle_files, download, as_supervised, decoders, read_config, with_info, builder_kwargs, download_and_prepare_kwargs, as_dataset_kwargs, try_gcs)
344 if download:
345 download_and_prepare_kwargs = download_and_prepare_kwargs or {}
--> 346 dbuilder.download_and_prepare(**download_and_prepare_kwargs)
347
348 if as_dataset_kwargs is None:
c:\users\aman\appdata\local\programs\python\python38\lib\site-packages\tensorflow_datasets\core\dataset_builder.py in download_and_prepare(self, download_dir, download_config)
383 self.info.read_from_directory(self._data_dir)
384 else:
--> 385 self._download_and_prepare(
386 dl_manager=dl_manager,
387 download_config=download_config)
c:\users\aman\appdata\local\programs\python\python38\lib\site-packages\tensorflow_datasets\core\dataset_builder.py in _download_and_prepare(self, dl_manager, download_config)
1020 def _download_and_prepare(self, dl_manager, download_config):
1021 # Extract max_examples_per_split and forward it to _prepare_split
--> 1022 super(GeneratorBasedBuilder, self)._download_and_prepare(
1023 dl_manager=dl_manager,
1024 max_examples_per_split=download_config.max_examples_per_split,
c:\users\aman\appdata\local\programs\python\python38\lib\site-packages\tensorflow_datasets\core\dataset_builder.py in _download_and_prepare(self, dl_manager, **prepare_split_kwargs)
959 split_generators_kwargs = self._make_split_generators_kwargs(
960 prepare_split_kwargs)
--> 961 for split_generator in self._split_generators(
962 dl_manager, **split_generators_kwargs):
963 if str(split_generator.split_info.name).lower() == "all":
c:\users\aman\appdata\local\programs\python\python38\lib\site-packages\tensorflow_datasets\image\celeba.py in _split_generators(self, dl_manager)
137 all_images = {
138 os.path.split(k)[-1]: img for k, img in
--> 139 dl_manager.iter_archive(downloaded_dirs["img_align_celeba"])
140 }
141
c:\users\aman\appdata\local\programs\python\python38\lib\site-packages\tensorflow_datasets\core\download\download_manager.py in iter_archive(self, resource)
559 if isinstance(resource, six.string_types):
560 resource = resource_lib.Resource(path=resource)
--> 561 return extractor.iter_archive(resource.path, resource.extract_method)
562
563 def extract(self, path_or_paths):
c:\users\aman\appdata\local\programs\python\python38\lib\site-packages\tensorflow_datasets\core\download\extractor.py in iter_archive(path, method)
221 An iterator of `(path_in_archive, f_obj)`
222 """
--> 223 return _EXTRACT_METHODS[method](path)
KeyError: <ExtractMethod.NO_EXTRACT: 1>
Could someone explain what I am doing wrong?
On a side note, if this does not work, is there a way to convert the already downloaded zipped file from Kaggle into the required format without unzipping it and then iterating over each image individually? Basically, I cannot go down the unzip-then-split route for such a large dataset...
TIA!
EDIT: I tried the same on Colab, but I am getting a similar error.
It seems like there is some sort of quota limit for downloading from Google Drive. Go to the Google Drive link shown in the error and make a copy to your own Drive. You can then download that copy through libraries such as gdown or google_drive_downloader.
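For example, a minimal sketch with gdown (the file ID below is a placeholder for the ID of the copy in your own Drive, not the real dataset ID):

# pip install gdown
import gdown

# Placeholder: replace with the ID of the copy you made in your own Drive
file_id = "YOUR_COPIED_FILE_ID"
url = "https://drive.google.com/uc?id=" + file_id

# Download the archive locally; you can then point your pipeline at this file
gdown.download(url, "img_align_celeba.zip", quiet=False)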
Upgrade tfds to the nightly version (tfds-nightly); that worked for me.
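A minimal sketch of that approach, assuming the nightly build ships updated download/extraction metadata for celeb_a:

# pip install tfds-nightly
import tensorflow_datasets as tfds

# Retry the download with the nightly builder definitions
ds = tfds.load('celeb_a', split='train', download=True)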
Is anyone else facing an issue while running the Prophet model after the latest v1.1.2 release?
I get the following error when executing model.fit():
RuntimeError Traceback (most recent call last)
<ipython-input-86-1e4ae74985f6> in <module>
----> 1 model_training(top_5_aro, X_trainARO, X_validARO, df_ARO, predict_index)
<ipython-input-85-9ee9b229fcde> in model_training(list_accounts, df_train, df_validation, df_original, predict_index)
98 model = Prophet()
99 # fit the model
--> 100 model.fit(train_data3)
101
102 # use the model to make a forecast
/opt/app-root/lib64/python3.8/site-packages/prophet/forecaster.py in fit(self, df, **kwargs)
1179 self.params = self.stan_backend.sampling(stan_init, dat, self.mcmc_samples, **kwargs)
1180 else:
-> 1181 self.params = self.stan_backend.fit(stan_init, dat, **kwargs)
1182
1183 self.stan_fit = self.stan_backend.stan_fit
/opt/app-root/lib64/python3.8/site-packages/prophet/models.py in fit(self, stan_init, stan_data, **kwargs)
98 # Fall back on Newton
99 if not self.newton_fallback or args['algorithm'] == 'Newton':
--> 100 raise e
101 logger.warning('Optimization terminated abnormally. Falling back to Newton.')
102 args['algorithm'] = 'Newton'
/opt/app-root/lib64/python3.8/site-packages/prophet/models.py in fit(self, stan_init, stan_data, **kwargs)
94
95 try:
---> 96 self.stan_fit = self.model.optimize(**args)
97 except RuntimeError as e:
98 # Fall back on Newton
/opt/app-root/lib64/python3.8/site-packages/cmdstanpy/model.py in optimize(self, data, seed, inits, output_dir, sig_figs, save_profile, algorithm, init_alpha, tol_obj, tol_rel_obj, tol_grad, tol_rel_grad, tol_param, history_size, iter, save_iterations, require_converged, show_console, refresh, time_fmt, timeout)
736 get_logger().warning(msg)
737 else:
--> 738 raise RuntimeError(msg)
739 mle = CmdStanMLE(runset)
740 return mle
RuntimeError: Error during optimization! Command '/opt/app-root/lib/python3.8/site-packages/prophet/stan_model/prophet_model.bin random seed=97108 data file=/tmp/tmplilnrs7m/fk4f5y6m.json init=/tmp/tmplilnrs7m/q_6vjmch.json output file=/tmp/tmplilnrs7m/prophet_model1ny0mums/prophet_model-20230125181922.csv method=optimize algorithm=newton iter=10000' failed
I did not have this issue in the previous version. Any help appreciated!
Python version: 3.8.3
Prophet version: 1.1.2
When using the earlier version I was successfully able to train the model and view the model predictions. After upgrading to the latest v1.1.2 of prophet, I face this error when training the model.
I want to train new NER entities with the following code:
def train_spacy_model(data, model='en_core_web_trf', n_iter=50):
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)

    TRAIN_DATA = data
    ner = nlp.get_pipe("ner")

    examples = []
    for text, annotations in TRAIN_DATA:
        examples.append(Example.from_dict(nlp.make_doc(text), annotations))
    nlp.initialize(lambda: examples)

    pipe_exceptions = ["ner"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(examples)
            losses = {}
            batches = minibatch(examples, size=compounding(4.0, 64.0, 1.2))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    batch,
                    drop=0.20,
                    losses=losses
                )
            print("Losses", losses)
    return nlp

nlp = train_spacy_model(data=dataset, n_iter=30)
I keep getting this error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[296], line 40
36 print("Losses", losses)
38 return nlp
---> 40 nlp = train_spacy_model(data=no_verlaps_dataset, n_iter=30)
42 # save model to output directory
43 output_dir = '_data/models/actor_ner'
Cell In[296], line 16, in train_spacy_model(data, model, n_iter)
14 for text, annotations in TRAIN_DATA:
15 examples.append(Example.from_dict(nlp.make_doc(text), annotations))
---> 16 nlp.initialize(lambda: examples)
17 # for ent in annotations.get('entities'):
18 # ner.add_label(ent[2])
20 pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
File ~/miniconda3/envs/tvman_ENV/lib/python3.9/site-packages/spacy/language.py:1290, in Language.initialize(self, get_examples, sgd)
1288 config = self.config.interpolate()
1289 # These are the settings provided in the [initialize] block in the config
-> 1290 I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
1291 before_init = I["before_init"]
1292 if before_init is not None:
File ~/miniconda3/envs/tvman_ENV/lib/python3.9/site-packages/thinc/config.py:746, in registry.resolve(cls, config, schema, overrides, validate)
737 @classmethod
738 def resolve(
739 cls,
(...)
744 validate: bool = True,
745 ) -> Dict[str, Any]:
--> 746 resolved, _ = cls._make(
747 config, schema=schema, overrides=overrides, validate=validate, resolve=True
748 )
749 return resolved
File ~/miniconda3/envs/tvman_ENV/lib/python3.9/site-packages/thinc/config.py:795, in registry._make(cls, config, schema, overrides, resolve, validate)
793 if not is_interpolated:
794 config = Config(orig_config).interpolate()
--> 795 filled, _, resolved = cls._fill(
796 config, schema, validate=validate, overrides=overrides, resolve=resolve
797 )
798 filled = Config(filled, section_order=section_order)
799 # Check that overrides didn't include invalid properties not in config
File ~/miniconda3/envs/tvman_ENV/lib/python3.9/site-packages/thinc/config.py:867, in registry._fill(cls, config, schema, validate, resolve, parent, overrides)
864 getter = cls.get(reg_name, func_name)
865 # We don't want to try/except this and raise our own error
866 # here, because we want the traceback if the function fails.
--> 867 getter_result = getter(*args, **kwargs)
868 else:
869 # We're not resolving and calling the function, so replace
870 # the getter_result with a Promise class
871 getter_result = Promise(
872 registry=reg_name, name=func_name, args=args, kwargs=kwargs
873 )
File ~/miniconda3/envs/tvman_ENV/lib/python3.9/site-packages/spacy/language.py:108, in load_lookups_data(lang, tables)
105 @registry.misc("spacy.LookupsDataLoader.v1")
106 def load_lookups_data(lang, tables):
107 util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
--> 108 lookups = load_lookups(lang=lang, tables=tables)
109 return lookups
File ~/miniconda3/envs/tvman_ENV/lib/python3.9/site-packages/spacy/lookups.py:30, in load_lookups(lang, tables, strict)
28 if lang not in registry.lookups:
29 if strict and len(tables) > 0:
---> 30 raise ValueError(Errors.E955.format(table=", ".join(tables), lang=lang))
31 return lookups
32 data = registry.lookups.get(lang)
ValueError: [E955] Can't find table(s) lexeme_norm for language 'en' in spacy-lookups-data. Make sure you have the package installed or provide your own lookup tables if no default lookups are available for your language.
I have installed the package:
pip install spacy-lookups-data
Collecting spacy-lookups-data
Downloading spacy_lookups_data-1.0.3-py2.py3-none-any.whl (98.5 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 98.5/98.5 MB 25.9 MB/s eta 0:00:00
But the error still persists.
How can I fix this error so that I can continue updating the model to detect new entities for NER tasks?
EDIT
It got fixed when I restarted the kernel of the Jupyter notebook that this code ran in.
To answer the narrow question: you probably need to restart your runtime in order for the tables in spacy-lookups-data to be registered.
To answer the question you didn't ask: the quoted script looks like it was only partially updated from v2 and I wouldn't recommend using it, in particular not for en_core_web_trf. One recommended way to update ner components in spacy v3 pipelines is shown in this demo project:
https://github.com/explosion/projects/tree/v3/pipelines/ner_demo_update
It handles a lot of the pipeline/config/training details for you in order to update ner without affecting the performance of the other components in the pipeline. A walkthrough of how to run a project is shown in the v2->v3 examples README.
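As a rough sketch, cloning and running that demo project looks something like this (the workflow name may differ; check the project's own README and project.yml):

python -m spacy project clone pipelines/ner_demo_update
cd ner_demo_update
python -m spacy project assets      # fetch the example assets
python -m spacy project run all     # run the update workflow defined in project.yml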
I've been running this code in Jupyter Notebook and the error persisted until I restarted the kernel. So the answer is to restart the notebook kernel.
I am trying to replicate the Keras-LSTM DeepExplainer example. I am getting the following error when trying to compute the shap values:
This warning: keras is no longer supported, please use tf.keras instead.
Your TensorFlow version is newer than 2.4.0 and so graph support has been removed in eager mode and some static graphs may not be supported. See PR #1483 for discussion.
And this error:
TypeError Traceback (most recent call last)
<ipython-input-...> in <module>
1 import shap
2 explainer = shap.DeepExplainer(model, x_train[:100])
----> 3 shap_values = explainer.shap_values(x_test[:10])
~/miniconda3/envs/mtq/lib/python3.8/site-packages/shap/explainers/_deep/__init__.py
in shap_values(self, X, ranked_outputs, output_rank_order,
check_additivity)
122 were chosen as "top".
123 """
--> 124 return self.explainer.shap_values(X, ranked_outputs, output_rank_order, check_additivity=check_additivity)
~/miniconda3/envs/mtq/lib/python3.8/site-packages/shap/explainers/_deep/deep_tf.py
in shap_values(self, X, ranked_outputs, output_rank_order,
check_additivity)
306 # run attribution computation graph
307 feature_ind = model_output_ranks[j,i]
--> 308 sample_phis = self.run(self.phi_symbolic(feature_ind), self.model_inputs,
    309                        joint_input)
310 # assign the attributions to the right part of the output arrays
~/miniconda3/envs/mtq/lib/python3.8/site-packages/shap/explainers/_deep/deep_tf.py
in run(self, out, model_inputs, X)
363
364 return final_out
--> 365 return self.execute_with_overridden_gradients(anon)
366
367 def custom_grad(self, op, *grads):
~/miniconda3/envs/mtq/lib/python3.8/site-packages/shap/explainers/_deep/deep_tf.py
in execute_with_overridden_gradients(self, f)
399 # define the computation graph for the attribution values using a custom gradient-like computation
400 try:
--> 401 out = f()
402 finally:
403 # reinstate the backpropagatable check
~/miniconda3/envs/mtq/lib/python3.8/site-packages/shap/explainers/_deep/deep_tf.py
in anon()
356 shape = list(self.model_inputs[i].shape)
357 shape[0] = -1
--> 358 data = X[i].reshape(shape)
359 v = tf.constant(data, dtype=self.model_inputs[i].dtype)
360 inputs.append(v)
TypeError: 'NoneType' object cannot be interpreted as an integer
I have checked out PR #1483, but couldn't find a relevant fix there. Please suggest which tensorflow, keras, and shap versions are needed to successfully replicate the example.
I'm working through the featuretools "predict_next_purchase" demo against my own data. I've created the entity set and have also created a new pandas DataFrame comprising the labels and times. I'm at the point of using ft.dfs for deep feature synthesis, and am getting a RuntimeError: maximum recursion depth exceeded. Below is the stack trace:
feature_matrix, features = ft.dfs(target_entity='projects',
                                  cutoff_time=labels.reset_index().loc[:, ['jobnumber', 'time']],
                                  training_window=inst_defn['training_window'],
                                  entityset=es,
                                  verbose=True)
Stack Trace:
Building features: 0it [00:00, ?it/s]
RuntimeError: maximum recursion depth exceeded
RuntimeErrorTraceback (most recent call last)
<ipython-input-743-f05fc567dd1b> in <module>()
3 training_window=inst_defn['training_window'],
4 entityset=es,
----> 5 verbose=True)
/Users/nbernini/OneDrive/PSC/venv/ml20/lib/python2.7/site-packages/featuretools/synthesis/dfs.pyc in dfs(entities, relationships, entityset, target_entity, cutoff_time, instance_ids, agg_primitives, trans_primitives, allowed_paths, max_depth, ignore_entities, ignore_variables, seed_features, drop_contains, drop_exact, where_primitives, max_features, cutoff_time_in_index, save_progress, features_only, training_window, approximate, verbose)
164 seed_features=seed_features)
165
--> 166 features = dfs_object.build_features(verbose=verbose)
167
168 if features_only:
/Users/nbernini/OneDrive/PSC/venv/ml20/lib/python2.7/site-packages/featuretools/synthesis/deep_feature_synthesis.pyc in build_features(self, variable_types, verbose)
227 self.where_clauses = defaultdict(set)
228 self._run_dfs(self.es[self.target_entity_id], [],
--> 229 all_features, max_depth=self.max_depth)
230
231 new_features = list(all_features[self.target_entity_id].values())
/Users/nbernini/OneDrive/PSC/venv/ml20/lib/python2.7/site-packages/featuretools/synthesis/deep_feature_synthesis.pyc in _run_dfs(self, entity, entity_path, all_features, max_depth)
353 entity_path=list(entity_path),
354 all_features=all_features,
--> 355 max_depth=new_max_depth)
356
357 """
/Users/nbernini/OneDrive/PSC/venv/ml20/lib/python2.7/site-packages/featuretools/synthesis/deep_feature_synthesis.pyc in _run_dfs(self, entity, entity_path, all_features, max_depth)
338 if self._apply_traversal_filters(entity, self.es[b_id],
339 entity_path,
--> 340 forward=False) and
341 b_id not in self.ignore_entities]
342 for b_entity_id in backward_entities:
/Users/nbernini/OneDrive/PSC/venv/ml20/lib/python2.7/site-packages/featuretools/synthesis/deep_feature_synthesis.pyc in _apply_traversal_filters(self, parent_entity, child_entity, entity_path, forward)
429 child_entity=child_entity,
430 target_entity_id=self.target_entity_id,
--> 431 entity_path=entity_path, forward=forward):
432 return False
433
/Users/nbernini/OneDrive/PSC/venv/ml20/lib/python2.7/site-packages/featuretools/synthesis/dfs_filters.pyc in is_valid(self, feature, entity, target_entity_id, child_feature, child_entity, entity_path, forward, where)
53
54 if type(feature) != list:
---> 55 return func(*args)
56
57 else:
/Users/nbernini/OneDrive/PSC/venv/ml20/lib/python2.7/site-packages/featuretools/synthesis/dfs_filters.pyc in apply_filter(self, parent_entity, child_entity, target_entity_id, entity_path, forward)
76 if (parent_entity.id == target_entity_id or
77 es.find_backward_path(parent_entity.id,
---> 78 target_entity_id) is None):
79 return True
80 path = es.find_backward_path(parent_entity.id, child_entity.id)
/Users/nbernini/OneDrive/PSC/venv/ml20/lib/python2.7/site-packages/featuretools/entityset/base_entityset.pyc in find_backward_path(self, start_entity_id, goal_entity_id)
308 is returned if no path exists.
309 """
--> 310 forward_path = self.find_forward_path(goal_entity_id, start_entity_id)
311 if forward_path is not None:
312 return forward_path[::-1]
/Users/nbernini/OneDrive/PSC/venv/ml20/lib/python2.7/site-packages/featuretools/entityset/base_entityset.pyc in find_forward_path(self, start_entity_id, goal_entity_id)
287
288 for r in self.get_forward_relationships(start_entity_id):
--> 289 new_path = self.find_forward_path(r.parent_entity.id, goal_entity_id)
290 if new_path is not None:
291 return [r] + new_path
... last 1 frames repeated, from the frame below ...
/Users/nbernini/OneDrive/PSC/venv/ml20/lib/python2.7/site-packages/featuretools/entityset/base_entityset.pyc in find_forward_path(self, start_entity_id, goal_entity_id)
287
288 for r in self.get_forward_relationships(start_entity_id):
--> 289 new_path = self.find_forward_path(r.parent_entity.id, goal_entity_id)
290 if new_path is not None:
291 return [r] + new_path
RuntimeError: maximum recursion depth exceeded
The issue here is cyclical relationships in your entity set. Currently, Deep Feature Synthesis can only create features when there is one unique path between two entities. If you have an entity with a relationship to itself, you would also get this error.
A future release of Featuretools will offer better support for this use case.
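As a quick diagnostic (just a sketch, using the same es object from the question), you can print every relationship in the entity set and look for a cycle or a self-relationship:

# List parent -> child for every relationship in the EntitySet
for r in es.relationships:
    print(r.parent_entity.id, "->", r.child_entity.id)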
So I have a dataframe X which looks something like this:
X.head()
0 My wife took me here on my birthday for breakf...
1 I have no idea why some people give bad review...
3 Rosie, Dakota, and I LOVE Chaparral Dog Park!!...
4 General Manager Scott Petello is a good egg!!!...
6 Drop what you're doing and drive here. After I...
Name: text, dtype: object
And then,
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(X)
But I get this error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-61-8ff79b91e317> in <module>()
----> 1 X = cv.fit_transform(X)
~/anaconda3/lib/python3.6/site-packages/sklearn/feature_extraction/text.py in fit_transform(self, raw_documents, y)
867
868 vocabulary, X = self._count_vocab(raw_documents,
--> 869 self.fixed_vocabulary_)
870
871 if self.binary:
~/anaconda3/lib/python3.6/site-packages/sklearn/feature_extraction/text.py in _count_vocab(self, raw_documents, fixed_vocab)
790 for doc in raw_documents:
791 feature_counter = {}
--> 792 for feature in analyze(doc):
793 try:
794 feature_idx = vocabulary[feature]
~/anaconda3/lib/python3.6/site-packages/sklearn/feature_extraction/text.py in <lambda>(doc)
264
265 return lambda doc: self._word_ngrams(
--> 266 tokenize(preprocess(self.decode(doc))), stop_words)
267
268 else:
~/anaconda3/lib/python3.6/site-packages/sklearn/feature_extraction/text.py in <lambda>(x)
230
231 if self.lowercase:
--> 232 return lambda x: strip_accents(x.lower())
233 else:
234 return strip_accents
~/anaconda3/lib/python3.6/site-packages/scipy/sparse/base.py in __getattr__(self, attr)
574 return self.getnnz()
575 else:
--> 576 raise AttributeError(attr + " not found")
577
578 def transpose(self, axes=None, copy=False):
AttributeError: lower not found
No idea why.
You need to specify the column name of the text data even if the dataframe has a single column.
X_countMatrix = cv.fit_transform(X['text'])
Because CountVectorizer expects an iterable of documents as input, and when you supply a dataframe as an argument, the only thing that gets iterated over is the column names. So even if you had not gotten an error, the result would have been incorrect. It's lucky that you got an error and had the chance to correct it.
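A small sketch illustrating the difference, using the text column name from the question's output:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

df = pd.DataFrame({'text': ["My wife took me here on my birthday for breakfast",
                            "I have no idea why some people give bad reviews"]})

# Iterating over a DataFrame yields its column names, not its rows
print(list(df))                          # ['text']

cv = CountVectorizer()
X_counts = cv.fit_transform(df['text'])  # pass the Series of documents instead
print(X_counts.shape)                    # (2, <vocabulary size>)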