dask: read parquet from Azure blob - AzureHttpError

I created a parquet file in an Azure blob using dask.dataframe.to_parquet (Moving data from a database to Azure blob storage).
I would now like to read that file. I'm doing:
import dask.dataframe as dd

STORAGE_OPTIONS = {'account_name': 'ACCOUNT_NAME',
                   'account_key': 'ACCOUNT_KEY'}
df = dd.read_parquet('abfs://BLOB/FILE.parquet', storage_options=STORAGE_OPTIONS)
but I get an AzureHttpError:
---------------------------------------------------------------------------
AzureHttpError Traceback (most recent call last)
<ipython-input-4-2184e772e417> in <module>
3 'account_key': 'ACCOUNT_KEY'}
4
----> 5 df = dd.read_parquet('abfs://BLOB/FILE', storage_options=STORAGE_OPTIONS)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\dataframe\io\parquet\core.py in read_parquet(path, columns, filters, categories, index, storage_options, engine, gather_statistics, split_row_groups, chunksize, **kwargs)
231 filters=filters,
232 split_row_groups=split_row_groups,
--> 233 **kwargs
234 )
235 if meta.index.name is not None:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\dataframe\io\parquet\fastparquet.py in read_metadata(fs, paths, categories, index, gather_statistics, filters, **kwargs)
176 # correspond to a row group (populated below).
177 parts, pf, gather_statistics, fast_metadata = _determine_pf_parts(
--> 178 fs, paths, gather_statistics, **kwargs
179 )
180
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\dataframe\io\parquet\fastparquet.py in _determine_pf_parts(fs, paths, gather_statistics, **kwargs)
127 open_with=fs.open,
128 sep=fs.sep,
--> 129 **kwargs.get("file", {})
130 )
131 if gather_statistics is None:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\fastparquet\api.py in __init__(self, fn, verify, open_with, root, sep)
109 fn2 = join_path(fn, '_metadata')
110 self.fn = fn2
--> 111 with open_with(fn2, 'rb') as f:
112 self._parse_header(f, verify)
113 fn = fn2
~\AppData\Local\Continuum\anaconda3\lib\site-packages\fsspec\spec.py in open(self, path, mode, block_size, cache_options, **kwargs)
722 autocommit=ac,
723 cache_options=cache_options,
--> 724 **kwargs
725 )
726 if not ac:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\adlfs\core.py in _open(self, path, mode, block_size, autocommit, cache_options, **kwargs)
552 autocommit=autocommit,
553 cache_options=cache_options,
--> 554 **kwargs,
555 )
556
~\AppData\Local\Continuum\anaconda3\lib\site-packages\adlfs\core.py in __init__(self, fs, path, mode, block_size, autocommit, cache_type, cache_options, **kwargs)
582 cache_type=cache_type,
583 cache_options=cache_options,
--> 584 **kwargs,
585 )
586
~\AppData\Local\Continuum\anaconda3\lib\site-packages\fsspec\spec.py in __init__(self, fs, path, mode, block_size, autocommit, cache_type, cache_options, **kwargs)
954 if mode == "rb":
955 if not hasattr(self, "details"):
--> 956 self.details = fs.info(path)
957 self.size = self.details["size"]
958 self.cache = caches[cache_type](
~\AppData\Local\Continuum\anaconda3\lib\site-packages\fsspec\spec.py in info(self, path, **kwargs)
499 if out:
500 return out[0]
--> 501 out = self.ls(path, detail=True, **kwargs)
502 path = path.rstrip("/")
503 out1 = [o for o in out if o["name"].rstrip("/") == path]
~\AppData\Local\Continuum\anaconda3\lib\site-packages\adlfs\core.py in ls(self, path, detail, invalidate_cache, delimiter, **kwargs)
446 # then return the contents
447 elif self._matches(
--> 448 container_name, path, as_directory=True, delimiter=delimiter
449 ):
450 logging.debug(f"{path} appears to be a directory")
~\AppData\Local\Continuum\anaconda3\lib\site-packages\adlfs\core.py in _matches(self, container_name, path, as_directory, delimiter)
386 prefix=path,
387 delimiter=delimiter,
--> 388 num_results=None,
389 )
390
~\AppData\Local\Continuum\anaconda3\lib\site-packages\azure\storage\blob\baseblobservice.py in list_blob_names(self, container_name, prefix, num_results, include, delimiter, marker, timeout)
1360 '_context': operation_context,
1361 '_converter': _convert_xml_to_blob_name_list}
-> 1362 resp = self._list_blobs(*args, **kwargs)
1363
1364 return ListGenerator(resp, self._list_blobs, args, kwargs)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\azure\storage\blob\baseblobservice.py in _list_blobs(self, container_name, prefix, marker, max_results, include, delimiter, timeout, _context, _converter)
1435 }
1436
-> 1437 return self._perform_request(request, _converter, operation_context=_context)
1438
1439 def get_blob_account_information(self, container_name=None, blob_name=None, timeout=None):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\azure\storage\common\storageclient.py in _perform_request(self, request, parser, parser_args, operation_context, expected_errors)
444 status_code,
445 exception_str_in_one_line)
--> 446 raise ex
447 finally:
448 # If this is a location locked operation and the location is not set,
~\AppData\Local\Continuum\anaconda3\lib\site-packages\azure\storage\common\storageclient.py in _perform_request(self, request, parser, parser_args, operation_context, expected_errors)
372 except AzureException as ex:
373 retry_context.exception = ex
--> 374 raise ex
375 except Exception as ex:
376 retry_context.exception = ex
~\AppData\Local\Continuum\anaconda3\lib\site-packages\azure\storage\common\storageclient.py in _perform_request(self, request, parser, parser_args, operation_context, expected_errors)
358 # and raised as an azure http exception
359 _http_error_handler(
--> 360 HTTPError(response.status, response.message, response.headers, response.body))
361
362 # Parse the response
~\AppData\Local\Continuum\anaconda3\lib\site-packages\azure\storage\common\_error.py in _http_error_handler(http_error)
113 ex.error_code = error_code
114
--> 115 raise ex
116
117
AzureHttpError: Server encountered an internal error. Please try again after some time. ErrorCode: InternalError
<?xml version="1.0" encoding="utf-8"?><Error><Code>InternalError</Code><Message>Server encountered an internal error. Please try again after some time.
RequestId:...
Time:2020-04-15T02:44:06.8611398Z</Message></Error>

The text of the error suggests that the service was temporarily down. If it persists, you may want to lodge an issue at adlfs; perhaps it could be as simple as more thorough retry logic on their end.
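If the failure really is transient, a simple retry around the read may be enough. A minimal sketch, assuming the older azure-storage SDK shown in the traceback (where AzureHttpError lives in azure.common); the attempt count and backoff are arbitrary illustrative choices:

import time
import dask.dataframe as dd
from azure.common import AzureHttpError

STORAGE_OPTIONS = {'account_name': 'ACCOUNT_NAME', 'account_key': 'ACCOUNT_KEY'}

# Retry the read a few times before giving up; transient InternalError responses often clear on their own.
for attempt in range(5):
    try:
        df = dd.read_parquet('abfs://BLOB/FILE.parquet', storage_options=STORAGE_OPTIONS)
        break
    except AzureHttpError:
        if attempt == 4:
            raise                    # give up after the last attempt
        time.sleep(2 ** attempt)     # simple exponential backoff between retries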

Related

Error using tfds.load on Tensorflow Dataset

I was wondering whether TensorFlow 2.2's dataset support has an issue in the Windows release.
Here is my diagnostic code:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", "available" if tf.config.experimental.list_physical_devices("GPU") else "NOT AVAILABLE")
Version: 2.2.0
Eager mode: True
Hub version: 0.8.0
GPU is available
I can load the list of datasets
tfds.list_builders()
['abstract_reasoning',
'aeslc',
'aflw2k3d',
'amazon_us_reviews',
'anli',
...
'xnli',
'xsum',
'yelp_polarity_reviews']
However, I am unable to load any dataset:
imdb, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)
I receive the following error:
---------------------------------------------------------------------------
UnimplementedError Traceback (most recent call last)
c:\python37\lib\site-packages\tensorflow_datasets\core\utils\py_utils.py in try_reraise(*args, **kwargs)
398 try:
--> 399 yield
400 except Exception: # pylint: disable=broad-except
c:\python37\lib\site-packages\tensorflow_datasets\core\registered.py in builder(name, **builder_init_kwargs)
243 prefix="Failed to construct dataset {}".format(name)):
--> 244 return builder_cls(name)(**builder_kwargs)
245
c:\python37\lib\site-packages\wrapt\wrappers.py in __call__(self, *args, **kwargs)
602 return self._self_wrapper(self.__wrapped__, self._self_instance,
--> 603 args, kwargs)
604
c:\python37\lib\site-packages\tensorflow_datasets\core\api_utils.py in disallow_positional_args_dec(fn, instance, args, kwargs)
68 _check_required(fn, kwargs)
---> 69 return fn(*args, **kwargs)
70
c:\python37\lib\site-packages\tensorflow_datasets\core\dataset_builder.py in __init__(self, data_dir, config, version)
205 else: # Use the code version (do not restore data)
--> 206 self.info.initialize_from_bucket()
207
c:\python37\lib\site-packages\tensorflow_datasets\core\dataset_info.py in initialize_from_bucket(self)
422 tmp_dir = tempfile.mkdtemp("tfds")
--> 423 data_files = gcs_utils.gcs_dataset_info_files(self.full_name)
424 if not data_files:
c:\python37\lib\site-packages\tensorflow_datasets\core\utils\gcs_utils.py in gcs_dataset_info_files(dataset_dir)
69 """Return paths to GCS files in the given dataset directory."""
---> 70 return gcs_listdir(posixpath.join(GCS_DATASET_INFO_DIR, dataset_dir))
71
c:\python37\lib\site-packages\tensorflow_datasets\core\utils\gcs_utils.py in gcs_listdir(dir_name)
62 root_dir = gcs_path(dir_name)
---> 63 if _is_gcs_disabled or not tf.io.gfile.exists(root_dir):
64 return None
c:\python37\lib\site-packages\tensorflow\python\lib\io\file_io.py in file_exists_v2(path)
266 try:
--> 267 _pywrap_file_io.FileExists(compat.as_bytes(path))
268 except errors.NotFoundError:
UnimplementedError: File system scheme 'gs' not implemented (file: 'gs://tfds-data/dataset_info/imdb_reviews/plain_text/1.0.0')
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
<ipython-input-36-06930b64f980> in <module>
1 #tfds.list_builders()
----> 2 imdb, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)
c:\python37\lib\site-packages\wrapt\wrappers.py in __call__(self, *args, **kwargs)
562
563 return self._self_wrapper(self.__wrapped__, self._self_instance,
--> 564 args, kwargs)
565
566 class BoundFunctionWrapper(_FunctionWrapperBase):
c:\python37\lib\site-packages\tensorflow_datasets\core\api_utils.py in disallow_positional_args_dec(fn, instance, args, kwargs)
67 _check_no_positional(fn, args, ismethod, allowed=allowed)
68 _check_required(fn, kwargs)
---> 69 return fn(*args, **kwargs)
70
71 return disallow_positional_args_dec(wrapped) # pylint: disable=no-value-for-parameter
c:\python37\lib\site-packages\tensorflow_datasets\core\registered.py in load(name, split, data_dir, batch_size, shuffle_files, download, as_supervised, decoders, read_config, with_info, builder_kwargs, download_and_prepare_kwargs, as_dataset_kwargs, try_gcs)
366 data_dir = constants.DATA_DIR
367
--> 368 dbuilder = builder(name, data_dir=data_dir, **builder_kwargs)
369 if download:
370 download_and_prepare_kwargs = download_and_prepare_kwargs or {}
c:\python37\lib\site-packages\tensorflow_datasets\core\registered.py in builder(name, **builder_init_kwargs)
242 with py_utils.try_reraise(
243 prefix="Failed to construct dataset {}".format(name)):
--> 244 return builder_cls(name)(**builder_kwargs)
245
246
c:\python37\lib\contextlib.py in __exit__(self, type, value, traceback)
128 value = type()
129 try:
--> 130 self.gen.throw(type, value, traceback)
131 except StopIteration as exc:
132 # Suppress StopIteration *unless* it's the same exception that
c:\python37\lib\site-packages\tensorflow_datasets\core\utils\py_utils.py in try_reraise(*args, **kwargs)
399 yield
400 except Exception: # pylint: disable=broad-except
--> 401 reraise(*args, **kwargs)
402
403
c:\python37\lib\site-packages\tensorflow_datasets\core\utils\py_utils.py in reraise(prefix, suffix)
390 suffix = '\n' + suffix if suffix else ''
391 msg = prefix + str(exc_value) + suffix
--> 392 six.reraise(exc_type, exc_type(msg), exc_traceback)
393
394
TypeError: __init__() missing 2 required positional arguments: 'op' and 'message'
Is the library broken? As mentioned, I am on a Windows 10 machine and using Jupyter Lab.
After I reported the issue on GitHub, the problem was fixed in version 3.2.1.
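A quick way to confirm the fix after upgrading (pip install -U tensorflow-datasets) is to check the installed version and retry the load; a minimal sketch:

import tensorflow_datasets as tfds

print(tfds.__version__)  # expect 3.2.1 or later
imdb, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)
print(info.splits['train'].num_examples)  # sanity check that the dataset info loaded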

Getting AttributeError while training DNN classifer

Here is the code:
continous=['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
Creating feat_cols: I have automated the process using a for loop.
feat_cols = []
for col in census.columns:
    ## Continuous - unchanged
    if col in continous:
        feat_cols.append(tf.feature_column.numeric_column(col))
    ## Categorical - trick, no need to count
    elif col != 'income_bracket':
        print('Embedded {}'.format(col))
        feat_cols.append(tf.feature_column.embedding_column(categorical_column=col, dimension=X_train[col].nunique()))
Creating the model: after importing TensorFlow, I've created this model.
dnnmodel = tf.estimator.DNNClassifier(hidden_units=[7,7,7], feature_columns=feat_cols, n_classes=2)
ip_dnn = tf.estimator.inputs.pandas_input_fn(X_train, y_train, num_epochs=None,shuffle=True)
dnnmodel.train(input_fn=ip_dnn, steps=5000)
Error:
Earlier, with LinearClassifier, everything worked fine.
INFO:tensorflow:Calling model_fn.
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-67-751f0a94d0d2> in <module>()
1 # ip_func created earliar
2
----> 3 dnnmodel.train(input_fn=ip_dnn, steps=5000)
~\Anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py in train(self, input_fn, hooks, steps, max_steps, saving_listeners)
356
357 saving_listeners = _check_listeners_type(saving_listeners)
--> 358 loss = self._train_model(input_fn, hooks, saving_listeners)
359 logging.info('Loss for final step: %s.', loss)
360 return self
~\Anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py in _train_model(self, input_fn, hooks, saving_listeners)
1122 return self._train_model_distributed(input_fn, hooks, saving_listeners)
1123 else:
-> 1124 return self._train_model_default(input_fn, hooks, saving_listeners)
1125
1126 def _train_model_default(self, input_fn, hooks, saving_listeners):
~\Anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py in _train_model_default(self, input_fn, hooks, saving_listeners)
1152 worker_hooks.extend(input_hooks)
1153 estimator_spec = self._call_model_fn(
-> 1154 features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
1155 global_step_tensor = training_util.get_global_step(g)
1156 return self._train_with_estimator_spec(estimator_spec, worker_hooks,
~\Anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py in _call_model_fn(self, features, labels, mode, config)
1110
1111 logging.info('Calling model_fn.')
-> 1112 model_fn_results = self._model_fn(features=features, **kwargs)
1113 logging.info('Done calling model_fn.')
1114
~\Anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\canned\dnn.py in _model_fn(features, labels, mode, config)
520 input_layer_partitioner=input_layer_partitioner,
521 config=config,
--> 522 batch_norm=batch_norm)
523
524 super(DNNClassifier, self).__init__(
~\Anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\canned\dnn.py in _dnn_model_fn(features, labels, mode, head, hidden_units, feature_columns, optimizer, activation_fn, dropout, input_layer_partitioner, config, use_tpu, batch_norm)
285 input_layer_partitioner=input_layer_partitioner,
286 batch_norm=batch_norm)
--> 287 logits = logit_fn(features=features, mode=mode)
288
289 if use_tpu:
~\Anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\canned\dnn.py in dnn_logit_fn(features, mode)
101 batch_norm,
102 name='dnn')
--> 103 return dnn_model(features, mode)
104
105 return dnn_logit_fn
~\Anaconda3\lib\site-packages\tensorflow\python\keras\engine\base_layer.py in __call__(self, inputs, *args, **kwargs)
552 # In graph mode, failure to build the layer's graph
553 # implies a user-side bug. We don't catch exceptions.
--> 554 outputs = self.call(inputs, *args, **kwargs)
555 else:
556 try:
~\Anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\canned\dnn.py in call(self, features, mode)
193 'input_from_feature_columns',
194 partitioner=self._input_layer_partitioner):
--> 195 net = self._input_layer(features)
196 for i in range(len(self._hidden_layers)):
197 net = self._hidden_layers[i](net)
~\Anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column.py in __call__(self, features)
335 trainable=self._trainable,
336 cols_to_vars=None,
--> 337 from_template=True)
338
339 #property
~\Anaconda3\lib\site-packages\tensorflow\python\ops\template.py in __call__(self, *args, **kwargs)
366 custom_getter=self._custom_getter) as vs:
367 self._variable_scope = vs
--> 368 return self._call_func(args, kwargs)
369
370 #property
~\Anaconda3\lib\site-packages\tensorflow\python\ops\template.py in _call_func(self, args, kwargs)
309 # Checkpointable).
310 with checkpointable_util.capture_dependencies(template=self):
--> 311 result = self._func(*args, **kwargs)
312
313 if self._variables_created:
~\Anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column.py in _internal_input_layer(features, feature_columns, weight_collections, trainable, cols_to_vars, scope, cols_to_output_tensors, from_template)
179 """See input_layer. `scope` is a name or variable scope to use."""
180
--> 181 feature_columns = _normalize_feature_columns(feature_columns)
182 for column in feature_columns:
183 if not isinstance(column, _DenseColumn):
~\Anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column.py in _normalize_feature_columns(feature_columns)
2266 name_to_column = dict()
2267 for column in feature_columns:
-> 2268 if column.name in name_to_column:
2269 raise ValueError('Duplicate feature column name found for columns: {} '
2270 'and {}. This usually means that these columns refer to '
~\Anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column_v2.py in name(self)
2960 def name(self):
2961 """See `FeatureColumn` base class."""
-> 2962 return '{}_embedding'.format(self.categorical_column.name)
2963
2964 #property
AttributeError: 'str' object has no attribute 'name'
originally defined at:
File "C:\Users\Subham\Anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\canned\dnn.py", line 102, in dnn_logit_fn
name='dnn')
File "C:\Users\Subham\Anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\canned\dnn.py", line 134, in __init__
create_scope_now=False)
File "C:\Users\Subham\Anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column.py", line 327, in __init__
self._name, _internal_input_layer, create_scope_now_=create_scope_now)
File "C:\Users\Subham\Anaconda3\lib\site-packages\tensorflow\python\ops\template.py", line 154, in make_template
**kwargs)
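For what it's worth, tf.feature_column.embedding_column expects a categorical-column object rather than a plain column-name string, which matches the 'str' object has no attribute 'name' failure above. A hedged sketch of that pattern (the hash bucket size is an arbitrary illustrative value):

# Wrap the raw column name in a categorical column before embedding it.
cat_col = tf.feature_column.categorical_column_with_hash_bucket(col, hash_bucket_size=1000)
feat_cols.append(tf.feature_column.embedding_column(categorical_column=cat_col,
                                                    dimension=X_train[col].nunique()))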

TimeoutError: Worker failed to start

I work in a conda environment in a Jupyter notebook.
When trying to create a client using processes, as follows:
from dask.distributed import Client, progress
client = Client(processes = True)
the following error occurs
TimeoutError Traceback (most recent call last)
/home/vlad/anaconda3/lib/python3.6/site-packages/tornado/gen.py in
callback(f)
827 try:
--> 828 result_list.append(f.result())
829 except Exception as e:
/home/vlad/anaconda3/lib/python3.6/site-packages/tornado/concurrent.py
in result(self, timeout)
237 try:
--> 238 raise_exc_info(self._exc_info)
239 finally:
/home/vlad/anaconda3/lib/python3.6/site-packages/tornado/util.py in
raise_exc_info(exc_info)
/home/vlad/anaconda3/lib/python3.6/site-packages/tornado/gen.py in
run(self)
1068 else:
-> 1069 yielded = self.gen.send(value)
1070
/home/vlad/anaconda3/lib/python3.6/site-packages/distributed/deploy/local.py
in _start_worker(self, death_timeout, **kwargs)
228 self.workers.remove(w)
--> 229 raise gen.TimeoutError("Worker failed to start")
230
TimeoutError: Worker failed to start
During handling of the above exception, another exception occurred:
TimeoutError Traceback (most recent call last)
<ipython-input-26-9ebe205475b6> in <module>()
3
4 # Use all 8 cores
----> 5 cluster = LocalCluster(processes = True, n_workers = 4)
6 ##client = Client(processes = True)
7 ##client = Client('localhost:8789')
/home/vlad/anaconda3/lib/python3.6/site-packages/distributed/deploy/local.py
in init(self, n_workers, threads_per_worker, processes, loop,
start, ip, scheduler_port, silence_logs, diagnostics_port, services,
worker_services, service_kwargs, asynchronous, security,
**worker_kwargs)
140 self.worker_kwargs['security'] = security
141
--> 142 self.start(ip=ip, n_workers=n_workers)
143
144 clusters_to_close.add(self)
/home/vlad/anaconda3/lib/python3.6/site-packages/distributed/deploy/local.py
in start(self, **kwargs)
177 self._started = self._start(**kwargs)
178 else:
--> 179 self.sync(self._start, **kwargs)
180
181 #gen.coroutine
/home/vlad/anaconda3/lib/python3.6/site-packages/distributed/deploy/local.py
in sync(self, func, *args, **kwargs)
170 return future
171 else:
--> 172 return sync(self.loop, func, *args, **kwargs)
173
174 def start(self, **kwargs):
/home/vlad/anaconda3/lib/python3.6/site-packages/distributed/utils.py
in sync(loop, func, *args, **kwargs)
275 e.wait(10)
276 if error[0]:
--> 277 six.reraise(*error[0])
278 else:
279 return result[0]
/home/vlad/anaconda3/lib/python3.6/site-packages/six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693 raise value
694 finally:
695 value = None
/home/vlad/anaconda3/lib/python3.6/site-packages/distributed/utils.py
in f()
260 if timeout is not None:
261 future = gen.with_timeout(timedelta(seconds=timeout), future)
--> 262 result[0] = yield future
263 except Exception as exc:
264 error[0] = sys.exc_info()
/home/vlad/anaconda3/lib/python3.6/site-packages/tornado/gen.py in
run(self)
1053
1054 try:
-> 1055 value = future.result()
1056 except Exception:
1057 self.had_exception = True
/home/vlad/anaconda3/lib/python3.6/site-packages/tornado/concurrent.py
in result(self, timeout)
236 if self._exc_info is not None:
237 try:
--> 238 raise_exc_info(self._exc_info)
239 finally:
240 self = None
/home/vlad/anaconda3/lib/python3.6/site-packages/tornado/util.py in
raise_exc_info(exc_info)
/home/vlad/anaconda3/lib/python3.6/site-packages/tornado/gen.py in
run(self)
1061 if exc_info is not None:
1062 try:
-> 1063 yielded = self.gen.throw(*exc_info)
1064 finally:
1065 # Break up a reference to itself
/home/vlad/anaconda3/lib/python3.6/site-packages/distributed/deploy/local.py
in _start(self, ip, n_workers)
197 self.scheduler.start(scheduler_address)
198
--> 199 yield [self._start_worker(**self.worker_kwargs) for i in range(n_workers)]
200
201 self.status = 'running'
/home/vlad/anaconda3/lib/python3.6/site-packages/tornado/gen.py in
run(self)
1053
1054 try:
-> 1055 value = future.result()
1056 except Exception:
1057 self.had_exception = True
/home/vlad/anaconda3/lib/python3.6/site-packages/tornado/concurrent.py
in result(self, timeout)
236 if self._exc_info is not None:
237 try:
--> 238 raise_exc_info(self._exc_info)
239 finally:
240 self = None
/home/vlad/anaconda3/lib/python3.6/site-packages/tornado/util.py in
raise_exc_info(exc_info)
/home/vlad/anaconda3/lib/python3.6/site-packages/tornado/gen.py in
callback(f)
826 for f in children:
827 try:
--> 828 result_list.append(f.result())
829 except Exception as e:
830 if future.done():
/home/vlad/anaconda3/lib/python3.6/site-packages/tornado/concurrent.py
in result(self, timeout)
236 if self._exc_info is not None:
237 try:
--> 238 raise_exc_info(self._exc_info)
239 finally:
240 self = None
/home/vlad/anaconda3/lib/python3.6/site-packages/tornado/util.py in
raise_exc_info(exc_info)
/home/vlad/anaconda3/lib/python3.6/site-packages/tornado/gen.py in
run(self)
1067 exc_info = None
1068 else:
-> 1069 yielded = self.gen.send(value)
1070
1071 if stack_context._state.contexts is not orig_stack_contexts:
/home/vlad/anaconda3/lib/python3.6/site-packages/distributed/deploy/local.py
in _start_worker(self, death_timeout, **kwargs)
227 if w.status == 'closed' and self.scheduler.status == 'running':
228 self.workers.remove(w)
--> 229 raise gen.TimeoutError("Worker failed to start")
230
231 raise gen.Return(w)
TimeoutError: Worker failed to start
How can I solve this problem? I have the following setup:
python version: 3.6.9
dask version 1.1.4
distributed version 1.26.0
tornado version 4.5
conda 4.6.14
The problem was solved by updating the dask, distributed, and tornado packages to versions 2.4.0, 2.4.0, and 6.0.3 respectively.
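A minimal sketch of applying that fix in a conda environment and re-creating the client (the version pins are the ones from the answer above):

# In the conda environment, upgrade the packages first:
#   conda install "dask>=2.4.0" "distributed>=2.4.0" "tornado>=6.0.3"
from dask.distributed import Client, LocalCluster

cluster = LocalCluster(processes=True, n_workers=4)  # scheduler plus four worker processes
client = Client(cluster)
print(client)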

folium heatmap with `Object of type 'int64' is not JSON serializable` error

I got the error "Object of type 'int64' is not JSON serializable" when I tried to generate a heatmap with folium.
I am running my Jupyter notebook in Anaconda using Python 3.6, and the version of folium is 0.9.1.
df_2y_cons_LatLo.dtypes:
Latitude      float64
Longitude     float64
Descriptor      int64
import folium
from folium.plugins import HeatMap

def generateBaseMap(default_location=[40.704652, -73.923688], default_zoom_start=11):
    base_map = folium.Map(location=default_location, control_scale=True, zoom_start=default_zoom_start)
    return base_map

base_map = generateBaseMap()
hm = HeatMap(list(zip(df_2y_cons_LatLo.Latitude.values, df_2y_cons_LatLo.Longitude.values, df_2y_cons_LatLo.Descriptor.values)))
base_map.add_child(hm)
I expected a heatmap to be displayed, but after running the above code I got the error mentioned above:
TypeError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\IPython\core\formatters.py in __call__(self, obj)
343 method = get_real_method(obj, self.print_method)
344 if method is not None:
--> 345 return method()
346 return None
347 else:
~\Anaconda3\lib\site-packages\folium\folium.py in _repr_html_(self, **kwargs)
291 self._parent = None
292 else:
--> 293 out = self._parent._repr_html_(**kwargs)
294 return out
295
~\Anaconda3\lib\site-packages\branca\element.py in _repr_html_(self, **kwargs)
326
327 """
--> 328 html = self.render(**kwargs)
329 html = "data:text/html;charset=utf-8;base64," + base64.b64encode(html.encode('utf8')).decode('utf8') # noqa
330
~\Anaconda3\lib\site-packages\branca\element.py in render(self, **kwargs)
319 """Renders the HTML representation of the element."""
320 for name, child in self._children.items():
--> 321 child.render(**kwargs)
322 return self._template.render(this=self, kwargs=kwargs)
323
~\Anaconda3\lib\site-packages\folium\folium.py in render(self, **kwargs)
368 '</style>'), name='map_style')
369
--> 370 super(Map, self).render(**kwargs)
371
372 def fit_bounds(self, bounds, padding_top_left=None,
~\Anaconda3\lib\site-packages\branca\element.py in render(self, **kwargs)
631
632 for name, element in self._children.items():
--> 633 element.render(**kwargs)
~\Anaconda3\lib\site-packages\folium\plugins\heat_map.py in render(self, **kwargs)
79
80 def render(self, **kwargs):
---> 81 super(HeatMap, self).render(**kwargs)
82
83 figure = self.get_root()
~\Anaconda3\lib\site-packages\branca\element.py in render(self, **kwargs)
627 script = self._template.module.__dict__.get('script', None)
628 if script is not None:
--> 629 figure.script.add_child(Element(script(self, kwargs)),
630 name=self.get_name())
631
~\Anaconda3\lib\site-packages\jinja2\runtime.py in __call__(self, *args, **kwargs)
573 (self.name, len(self.arguments)))
574
--> 575 return self._invoke(arguments, autoescape)
576
577 def _invoke(self, arguments, autoescape):
~\Anaconda3\lib\site-packages\jinja2\asyncsupport.py in _invoke(self, arguments, autoescape)
108 def _invoke(self, arguments, autoescape):
109 if not self._environment.is_async:
--> 110 return original_invoke(self, arguments, autoescape)
111 return async_invoke(self, arguments, autoescape)
112 return update_wrapper(_invoke, original_invoke)
~\Anaconda3\lib\site-packages\jinja2\runtime.py in _invoke(self, arguments, autoescape)
577 def _invoke(self, arguments, autoescape):
578 """This method is being swapped out by the async implementation."""
--> 579 rv = self._func(*arguments)
580 if autoescape:
581 rv = Markup(rv)
<template> in macro(l_1_this, l_1_kwargs)
~\Anaconda3\lib\site-packages\jinja2\filters.py in do_tojson(eval_ctx, value, indent)
1076 options = dict(options)
1077 options['indent'] = indent
-> 1078 return htmlsafe_json_dumps(value, dumper=dumper, **options)
1079
1080
~\Anaconda3\lib\site-packages\jinja2\utils.py in htmlsafe_json_dumps(obj, dumper, **kwargs)
563 if dumper is None:
564 dumper = json.dumps
--> 565 rv = dumper(obj, **kwargs) \
566 .replace(u'<', u'\\u003c') \
567 .replace(u'>', u'\\u003e') \
~\Anaconda3\lib\json\__init__.py in dumps(obj, skipkeys, ensure_ascii, check_circular, allow_nan, cls, indent, separators, default, sort_keys, **kw)
236 check_circular=check_circular, allow_nan=allow_nan, indent=indent,
237 separators=separators, default=default, sort_keys=sort_keys,
--> 238 **kw).encode(obj)
239
240
~\Anaconda3\lib\json\encoder.py in encode(self, o)
197 # exceptions aren't as detailed. The list call should be roughly
198 # equivalent to the PySequence_Fast that ''.join() would do.
--> 199 chunks = self.iterencode(o, _one_shot=True)
200 if not isinstance(chunks, (list, tuple)):
201 chunks = list(chunks)
~\Anaconda3\lib\json\encoder.py in iterencode(self, o, _one_shot)
255 self.key_separator, self.item_separator, self.sort_keys,
256 self.skipkeys, _one_shot)
--> 257 return _iterencode(o, 0)
258
259 def _make_iterencode(markers, _default, _encoder, _indent, _floatstr,
~\Anaconda3\lib\json\encoder.py in default(self, o)
178 """
179 raise TypeError("Object of type '%s' is not JSON serializable" %
--> 180 o.__class__.__name__)
181
182 def encode(self, o):
TypeError: Object of type 'int64' is not JSON serializable
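The usual workaround, assuming the column types listed above, is to cast the int64 weight column to a native float before handing the data to HeatMap: numpy float values serialize to JSON cleanly, while numpy int64 does not. A minimal sketch:

# Cast the Descriptor column to float so the heatmap data is JSON serializable.
data = list(zip(df_2y_cons_LatLo.Latitude.values,
                df_2y_cons_LatLo.Longitude.values,
                df_2y_cons_LatLo.Descriptor.values.astype(float)))
base_map = generateBaseMap()
base_map.add_child(HeatMap(data))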

pyspark ibmdpy giving ValueError: ('ordinal must be >= 1

I am loading data from Netezza into a dataframe and then trying to write it to dashDB. I am using ibmdbpy to load the data into dashDB on Bluemix. ibmdbpy requires a pandas dataframe, so I convert the Spark dataframe to pandas before loading it into dashDB.
all_disputes_df = sqlContext.read.format('jdbc').options(url='jdbc:netezza://pda1-wall.pok.ibm.com:5480/BACC_PRD_ISCNZ_GAPNZ', user=user, password=password, dbtable='METRICS.AR_EM_D2_02_AGG', driver='org.netezza.Driver').load()
from ibmdbpy import IdaDataBase
idadb = IdaDataBase(dsn='BLUDB', uid='dash107474', pwd='k5TY24AbzFjE')
print("current_schema is %s" % idadb.current_schema)
print("tables %s" % idadb.show_tables())
idadb.as_idadataframe(all_disputes_df.toPandas(), "all_disputes")
I am getting the following traceback.
ValueError Traceback (most recent call last)
<ipython-input-4-63dde713c67b> in <module>()
----> 1 idadb.as_idadataframe(all_disputes_df.toPandas(), "all_disputes")
/home/brente/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/dataframe.pyc in toPandas(self)
1379 """
1380 import pandas as pd
-> 1381 return pd.DataFrame.from_records(self.collect(), columns=self.columns)
1382
1383 ##########################################################################################
/home/brente/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/dataframe.pyc in collect(self)
279 with SCCallSiteSync(self._sc) as css:
280 port = self._jdf.collectToPython()
--> 281 return list(_load_from_socket(port, BatchedSerializer(PickleSerializer())))
282
283 #ignore_unicode_prefix
/home/brente/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/rdd.pyc in _load_from_socket(port, serializer)
140 try:
141 rf = sock.makefile("rb", 65536)
--> 142 for item in serializer.load_stream(rf):
143 yield item
144 finally:
/home/brente/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/serializers.pyc in load_stream(self, stream)
137 while True:
138 try:
--> 139 yield self._read_with_length(stream)
140 except EOFError:
141 return
/home/brente/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/serializers.pyc in _read_with_length(self, stream)
162 if len(obj) < length:
163 raise EOFError
--> 164 return self.loads(obj)
165
166 def dumps(self, obj):
/home/brente/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/serializers.pyc in loads(self, obj, encoding)
420 else:
421 def loads(self, obj, encoding=None):
--> 422 return pickle.loads(obj)
423
424
/home/brente/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/types.pyc in <lambda>(*a)
1157 # This is used to unpickle a Row from JVM
1158 def _create_row_inbound_converter(dataType):
-> 1159 return lambda *a: dataType.fromInternal(a)
1160
1161
/home/brente/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/types.pyc in fromInternal(self, obj)
563 return obj
564 if self._needSerializeAnyField:
--> 565 values = [f.fromInternal(v) for f, v in zip(self.fields, obj)]
566 else:
567 values = obj
/home/brente/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/types.pyc in fromInternal(self, obj)
436
437 def fromInternal(self, obj):
--> 438 return self.dataType.fromInternal(obj)
439
440
/home/brente/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/types.pyc in fromInternal(self, v)
174 def fromInternal(self, v):
175 if v is not None:
--> 176 return datetime.date.fromordinal(v + self.EPOCH_ORDINAL)
177
178
ValueError: ('ordinal must be >= 1', <function <lambda> at 0x7f97c0be76e0>, (u'788', u'10', u'00620000 ', u'0129101548 ', 1, u'000028628 ', 16520, Decimal('2124.76'), Decimal('2124.76'), 16525, 16525, u'000611099
Any ideas on what the problem is?
Reading your data from Netezza into a dataframe is what fails: the error is raised inside toPandas(), while a date column is being deserialized, before ibmdbpy is ever involved. Everything beyond that is speculation on my side:
Could there be invalid data stored in Netezza (for example an out-of-range date) that throws off the deserialization into dataframes?
Maybe try some other queries to make sure that there is no connectivity problem, no typo in the database name, things like that.
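One hedged way to narrow this down is to convert a small sample column by column and see which one breaks; this probe assumes only the all_disputes_df dataframe from the question:

# Convert a small sample one column at a time to find the column that breaks toPandas().
sample_df = all_disputes_df.limit(100)
for col_name in sample_df.columns:
    try:
        sample_df.select(col_name).toPandas()
    except ValueError as exc:
        print("conversion failed for column %s: %s" % (col_name, exc))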
