Azure ML Tabular Dataset: missing 1 required positional argument: 'stream_column'

The Python API for AzureML tabular datasets (azureml.data.TabularDataset) has introduced two experimental methods:
download(stream_column, target_path=None, overwrite=False, ignore_not_found=True)
mount(stream_column, mount_point=None)
The stream_column parameter is documented as "The stream column to mount or download."
What is the actual meaning of stream_column? I don't see an example anywhere.
Any pointer will be helpful.
The stack trace:
Method download: This is an experimental method, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/tmp/ipykernel_11561/3904436543.py in <module>
----> 1 tab_dataset.download(target_path="../data/tabular")
/anaconda/envs/azureml_py38/lib/python3.8/site-packages/azureml/_base_sdk_common/_docstring_wrapper.py in wrapped(*args, **kwargs)
50 def wrapped(*args, **kwargs):
51 module_logger.warning("Method {0}: {1} {2}".format(func.__name__, _method_msg, _experimental_link_msg))
---> 52 return func(*args, **kwargs)
53 return wrapped
54
/anaconda/envs/azureml_py38/lib/python3.8/site-packages/azureml/data/_loggerfactory.py in wrapper(*args, **kwargs)
130 with _LoggerFactory.track_activity(logger, func.__name__, activity_type, custom_dimensions) as al:
131 try:
--> 132 return func(*args, **kwargs)
133 except Exception as e:
134 if hasattr(al, 'activity_info') and hasattr(e, 'error_code'):
TypeError: download() missing 1 required positional argument: 'stream_column'

Update on 5th March, 2022
I posted this as a support ticket with Azure. The following is the answer I received:
As you can see from our documentation of TabularDataset Class,
the “stream_column” parameter is required. So, that error is occurring
because you are not passing any parameters when you are calling the
download method. The “stream_column” parameter should have the
stream column to download/mount. So, you need to pass the column name
that contains the paths from which the data will be streamed.
Please find an example here.
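For illustration, here is a minimal sketch of such a call, assuming the tabular dataset has a stream column named "image_url" holding the file paths; the dataset and column names are hypothetical:
from azureml.core import Dataset, Workspace

ws = Workspace.from_config()
tab_dataset = Dataset.get_by_name(ws, name="my_tabular_dataset")  # hypothetical dataset name

# "image_url" is the column whose values are the streams (file paths) to fetch
tab_dataset.download(stream_column="image_url", target_path="../data/tabular")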


Azureml TabularDataset to_pandas_dataframe() returns InvalidEncoding error

When I run:
datasetTabular = Dataset.get_by_name(ws, "<Redacted>")
datasetTabular.to_pandas_dataframe()
The following error is returned. What can I do to get past this?
ExecutionError                            Traceback (most recent call last)
File C:\ProgramData\Anaconda3_2\envs\amlds\lib\site-packages\azureml\data\dataset_error_handling.py:101, in _try_execute(action, operation, dataset_info, **kwargs)
100 else:
--> 101 return action()
102 except Exception as e:
File C:\ProgramData\Anaconda3_2\envs\amlds\lib\site-packages\azureml\data\tabular_dataset.py:169, in TabularDataset.to_pandas_dataframe.<locals>.<lambda>()
168 dataflow = get_dataflow_for_execution(self._dataflow, 'to_pandas_dataframe', 'TabularDataset')
--> 169 df = _try_execute(lambda: dataflow.to_pandas_dataframe(on_error=on_error,
170 out_of_range_datetime=out_of_range_datetime),
171 'to_pandas_dataframe',
172 None if self.id is None else {'id': self.id, 'name': self.name, 'version': self.version})
173 fine_grain_timestamp = self._properties.get(_DATASET_PROP_TIMESTAMP_FINE, None)
File C:\ProgramData\Anaconda3_2\envs\amlds\lib\site-packages\azureml\dataprep\api\_loggerfactory.py:213, in track.<locals>.monitor.<locals>.wrapper(*args, **kwargs)
212 try:
--> 213 return func(*args, **kwargs)
214 except Exception as e:
File C:\ProgramData\Anaconda3_2\envs\amlds\lib\site-packages\azureml\dataprep\api\dataflow.py:697, in Dataflow.to_pandas_dataframe(self, extended_types, nulls_as_nan, on_error, out_of_range_datetime)
696 with tracer.start_as_current_span('Dataflow.to_pandas_dataframe', trace.get_current_span()) as span:
--> 697 return get_dataframe_reader().to_pandas_dataframe(self,
698 extended_types,
699 nulls_as_nan,
700 on_error,
701 out_of_range_datetime,
702 to_dprep_span_context(span.get_context()))
File C:\ProgramData\Anaconda3_2\envs\amlds\lib\site-packages\azureml\dataprep\api\_dataframereader.py:386, in _DataFrameReader.to_pandas_dataframe(self, dataflow, extended_types, nulls_as_nan, on_error, out_of_range_datetime, span_context)
384 if have_pyarrow() and not extended_types and not inconsistent_schema:
385 # if arrow is supported, and we didn't get inconsistent schema, and extended typed were not asked for - fallback to feather
--> 386 return clex_feather_to_pandas()
387 except _InconsistentSchemaError as e:
File C:\ProgramData\Anaconda3_2\envs\amlds\lib\site-packages\azureml\dataprep\api\_dataframereader.py:298, in
_DataFrameReader.to_pandas_dataframe.<locals>.clex_feather_to_pandas()
297 activity_data = dataflow_to_execute._dataflow_to_anonymous_activity_data(dataflow_to_execute)
--> 298 dataflow._engine_api.execute_anonymous_activity(
299 ExecuteAnonymousActivityMessageArguments(anonymous_activity=activity_data, span_context=span_context))
301 try:
File C:\ProgramData\Anaconda3_2\envs\amlds\lib\site-packages\azureml\dataprep\api\_aml_helper.py:38, in update_aml_env_vars.<locals>.decorator.<locals>.wrapper(op_code, message, cancellation_token)
37 engine_api_func().update_environment_variable(changed)
---> 38 return send_message_func(op_code, message, cancellation_token)
File C:\ProgramData\Anaconda3_2\envs\amlds\lib\site-packages\azureml\dataprep\api\engineapi\api.py:160, in EngineAPI.execute_anonymous_activity(self, message_args, cancellation_token)
158 #update_aml_env_vars(get_engine_api)
159 def execute_anonymous_activity(self, message_args: typedefinitions.ExecuteAnonymousActivityMessageArguments, cancellation_token: CancellationToken = None) -> None:
--> 160 response = self._message_channel.send_message('Engine.ExecuteActivity', message_args, cancellation_token)
161 return response
File C:\ProgramData\Anaconda3_2\envs\amlds\lib\site-packages\azureml\dataprep\api\engineapi\engine.py:291, in MultiThreadMessageChannel.send_message(self, op_code, message, cancellation_token)
290 cancel_on_error()
--> 291 raise_engine_error(response['error'])
292 else:
File C:\ProgramData\Anaconda3_2\envs\amlds\lib\site-packages\azureml\dataprep\api\errorhandlers.py:10, in raise_engine_error(error_response)
9 if 'ScriptExecution' in error_code:
---> 10 raise ExecutionError(error_response)
11 if 'Validation' in error_code:
ExecutionError: Error Code: ScriptExecution.StreamAccess.Validation Validation Error Code: InvalidEncoding Validation Target: TextFile Failed Step: 78059bb0-278f-4c7f-9c21-01a0cccf7b96 Error Message: ScriptExecutionException was caused by StreamAccessException. StreamAccessException was caused by ValidationException.
Unable to read file using Unicode (UTF-8). Attempted read range 0:777. Lines read in the range 0. Decoding error: Unable to translate bytes [8B] at index 1 from specified code page to Unicode.
Unable to translate bytes [8B] at index 1 from specified code page to Unicode. | session_id=295acf7e-4af9-42f1-b04a-79f3c5a0f98c
During handling of the above exception, another exception occurred:
UserErrorException                        Traceback (most recent call last)
Input In [34], in <module>
1 # preview the first 3 rows of the dataset
2 #datasetTabular.take(3)
----> 3 datasetTabular.take(3).to_pandas_dataframe()
File C:\ProgramData\Anaconda3_2\envs\amlds\lib\site-packages\azureml\data\_loggerfactory.py:132, in track.<locals>.monitor.<locals>.wrapper(*args, **kwargs)
130 with _LoggerFactory.track_activity(logger, func.__name__, activity_type, custom_dimensions) as al:
131 try:
--> 132 return func(*args, **kwargs)
133 except Exception as e:
134 if hasattr(al, 'activity_info') and hasattr(e, 'error_code'):
File C:\ProgramData\Anaconda3_2\envs\amlds\lib\site-packages\azureml\data\tabular_dataset.py:169, in TabularDataset.to_pandas_dataframe(self, on_error, out_of_range_datetime)
158 """Load all records from the dataset into a pandas DataFrame.
159
160 :param on_error: How to handle any error values in the dataset, such as those produced by an error while (...)
166 :rtype: pandas.DataFrame
167 """
168 dataflow = get_dataflow_for_execution(self._dataflow, 'to_pandas_dataframe', 'TabularDataset')
--> 169 df = _try_execute(lambda: dataflow.to_pandas_dataframe(on_error=on_error,
170 out_of_range_datetime=out_of_range_datetime),
171 'to_pandas_dataframe',
172 None if self.id is None else {'id': self.id, 'name': self.name, 'version': self.version})
173 fine_grain_timestamp = self._properties.get(_DATASET_PROP_TIMESTAMP_FINE, None)
175 if fine_grain_timestamp is not None and df.empty is False:
File C:\ProgramData\Anaconda3_2\envs\amlds\lib\site-packages\azureml\data\dataset_error_handling.py:104, in _try_execute(action, operation, dataset_info, **kwargs)
102 except Exception as e:
103 message, is_dprep_exception = _construct_message_and_check_exception_type(e, dataset_info, operation)
--> 104 _dataprep_error_handler(e, message, is_dprep_exception)
File C:\ProgramData\Anaconda3_2\envs\amlds\lib\site-packages\azureml\data\dataset_error_handling.py:154, in _dataprep_error_handler(e, message, is_dprep_exception)
152 for item in user_exception_list:
153 if _contains(item, getattr(e, 'error_code', 'Unexpected')):
--> 154 raise UserErrorException(message, inner_exception=e)
156 raise AzureMLException(message, inner_exception=e)
UserErrorException: UserErrorException: Message: Execution failed with error message: ScriptExecutionException was caused by StreamAccessException. StreamAccessException was caused by ValidationException.
Unable to read file using Unicode (UTF-8). Attempted read range 0:777. Lines read in the range 0. Decoding error: [REDACTED]
Failed due to inner exception of type: DecoderFallbackException | session_id=295acf7e-4af9-42f1-b04a-79f3c5a0f98c ErrorCode: ScriptExecution.StreamAccess.Validation InnerException Error Code: ScriptExecution.StreamAccess.Validation Validation Error Code: InvalidEncoding Validation Target: TextFile Failed Step: 78059bb0-278f-4c7f-9c21-01a0cccf7b96 Error Message: ScriptExecutionException was caused by StreamAccessException. StreamAccessException was caused by ValidationException.
Unable to read file using Unicode (UTF-8). Attempted read range 0:777. Lines read in the range 0. Decoding error: Unable to translate bytes [8B] at index 1 from specified code page to Unicode.
Unable to translate bytes [8B] at index 1 from specified code page to Unicode. | session_id=295acf7e-4af9-42f1-b04a-79f3c5a0f98c ErrorResponse {
"error": {
"code": "UserError",
"message": "Execution failed with error message: ScriptExecutionException was caused by StreamAccessException.\r\n StreamAccessException was caused by ValidationException.\r\n Unable to read file using Unicode (UTF-8). Attempted read range 0:777. Lines read in the range 0. Decoding error: [REDACTED]\r\n Failed due to inner exception of type: DecoderFallbackException\r\n| session_id=295acf7e-4af9-42f1-b04a-79f3c5a0f98c ErrorCode: ScriptExecution.StreamAccess.Validation"
} }
This kind of error usually happens when the input file is not in an encoding the reader supports.
Unable to read file using Unicode (UTF-8) -> this is the key point in the error: the file cannot be decoded as UTF-8. (The failing byte 8B at index 1 also matches the gzip file signature, so the file may actually be compressed rather than plain text.)
str_value = raw_data.decode('utf-8')
Decode the raw bytes explicitly, as in the line above, and then perform the operation.
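A minimal sketch of that workaround, with a hypothetical local file path; since the failing byte matches the gzip signature, the sketch also decompresses first if needed:
import gzip

with open("../data/file.json", "rb") as f:  # hypothetical path
    raw_data = f.read()

if raw_data[:2] == b"\x1f\x8b":  # gzip magic number: the input is compressed
    raw_data = gzip.decompress(raw_data)

str_value = raw_data.decode("utf-8")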
Since you're working on a collection of .json files, I'd suggest using a FileDataset (as you're currently doing) if you want to work with the raw JSON.
If you'd prefer working with the data in tabular form, then I'd suggest doing some preprocessing to flatten the JSON files into a pandas dataframe before saving it as a dataset on AzureML. Then use the register_pandas_dataframe method from the DatasetFactory class to save this dataframe; a sketch follows below. This will ensure that when you fetch the Dataset from Azure, the to_pandas_dataframe() method will work. Just be aware that some datatypes, such as numpy arrays, are not supported by the register_pandas_dataframe() method.
The issue with creating a tabular dataset from JSON files and then converting it to a pandas dataframe once you've begun working with it (in a run or notebook) is that you're expecting Azure to handle the flattening/processing.
Alternatively, you can also look at the from_json_lines_files method, since it might suit your use case better.
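A minimal sketch of the flatten-and-register route, assuming hypothetical file paths and the workspace's default datastore:
import json
import pandas as pd
from azureml.core import Dataset, Workspace

ws = Workspace.from_config()
datastore = ws.get_default_datastore()

# Flatten the JSON files into one dataframe before registering
records = [json.load(open(p)) for p in ["a.json", "b.json"]]  # hypothetical paths
df = pd.json_normalize(records)

Dataset.Tabular.register_pandas_dataframe(dataframe=df, target=datastore, name="my_flattened_dataset")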

AttributeError: 'NoneType' object has no attribute 'encode' (Binance)

I expect this simple script, which connects to Binance and gets my account details using the python-binance library (version 0.7.9), to work out of the box, but it does not seem to. I am able to connect to the API and get the price of bitcoin, so I am confident the connection itself is not the problem.
import os
from binance.client import Client
from binance.websockets import BinanceSocketManager
from twisted.internet import reactor
# Get keys
api_key = os.environ.get('binance_api')
api_secret = os.environ.get('binance_secret')
# Connect to Binance
client = Client(api_key, api_secret)
print(client.get_account())
gives the following error.
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-4-9c56ae96674c> in <module>
----> 1 print(client.get_account())
~/.local/lib/python3.8/site-packages/binance/client.py in get_account(self, **params)
1765
1766 """
-> 1767 return self._get('account', True, data=params)
1768
1769 def get_asset_balance(self, asset, **params):
~/.local/lib/python3.8/site-packages/binance/client.py in _get(self, path, signed, version, **kwargs)
235
236 def _get(self, path, signed=False, version=PUBLIC_API_VERSION, **kwargs):
--> 237 return self._request_api('get', path, signed, version, **kwargs)
238
239 def _post(self, path, signed=False, version=PUBLIC_API_VERSION, **kwargs):
~/.local/lib/python3.8/site-packages/binance/client.py in _request_api(self, method, path, signed, version, **kwargs)
200 uri = self._create_api_uri(path, signed, version)
201
--> 202 return self._request(method, uri, signed, **kwargs)
203
204 def _request_withdraw_api(self, method, path, signed=False, **kwargs):
~/.local/lib/python3.8/site-packages/binance/client.py in _request(self, method, uri, signed, force_params, **kwargs)
178 # generate signature
179 kwargs['data']['timestamp'] = int(time.time() * 1000)
--> 180 kwargs['data']['signature'] = self._generate_signature(kwargs['data'])
181
182 # sort get and post params to match signature order
~/.local/lib/python3.8/site-packages/binance/client.py in _generate_signature(self, data)
133 ordered_data = self._order_params(data)
134 query_string = '&'.join(["{}={}".format(d[0], d[1]) for d in ordered_data])
--> 135 m = hmac.new(self.API_SECRET.encode('utf-8'), query_string.encode('utf-8'), hashlib.sha256)
136 return m.hexdigest()
137
AttributeError: 'NoneType' object has no attribute 'encode'
Checking the docs, I don't see what I might be doing wrong. The traceback suggests the error might be inside the python-binance library, but I'm not sure, because this is a basic feature and should work without any trouble.
I appreciate your help.
The keys were not imported correctly, so API_SECRET is None, which is what .encode('utf-8') fails on. Check with 'binance_api' in os.environ. For the fish shell, add set -gx binance_api <value> (fish separates the name and value with a space, not =) to ~/.config/fish/config.fish.
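A quick sanity check along those lines (the printed values and assertion message are just illustrative):
import os

print('binance_api' in os.environ)     # should print True once the key is exported
print('binance_secret' in os.environ)  # should print True as well

api_secret = os.environ.get('binance_secret')
assert api_secret is not None, "binance_secret is not set, so Client receives None"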

Python Numba - Convert DataFrame series object to numpy array

I have a pandas dataframe with strings, and I am trying to use a set operation under python numba to get the unique characters in the column that contains strings. Since numba does not recognize pandas dataframes, I need to convert the string column to a numpy array. However, once converted, the column shows its dtype as object. Is there a way to convert the pandas column of strings to a normal array (not an object array)?
The code is below.
z = train.head(2).sentence.values #Train is a pandas DataFrame
z
Output:
array(["Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",
"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)"],
dtype=object)
Python Numba code:
from numba import njit

@njit
def set_(z):
    x = set(z.sum())  # concatenate all strings, then collect the unique characters
    return x

set_(z)
Output:
---------------------------------------------------------------------------
TypingError Traceback (most recent call last)
<ipython-input-51-9d5bc17d106b> in <module>()
----> 1 set_(z)
~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/numba/dispatcher.py in _compile_for_args(self, *args, **kws)
342 raise e
343 else:
--> 344 reraise(type(e), e, None)
345 except errors.UnsupportedError as e:
346 # Something unsupported is present in the user code, add help info
~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/numba/six.py in reraise(tp, value, tb)
656 value = tp()
657 if value.__traceback__ is not tb:
--> 658 raise value.with_traceback(tb)
659 raise value
660
TypingError: Failed at nopython (nopython frontend)
Internal error at <numba.typeinfer.ArgConstraint object at 0x7fbe66c01a58>:
--%<----------------------------------------------------------------------------
Traceback (most recent call last):
File "/home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/numba/errors.py", line 491, in new_error_context
yield
File "/home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/numba/typeinfer.py", line 194, in __call__
assert ty.is_precise()
AssertionError
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/numba/typeinfer.py", line 138, in propagate
constraint(typeinfer)
File "/home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/numba/typeinfer.py", line 195, in __call__
typeinfer.add_type(self.dst, ty, loc=self.loc)
File "/home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/contextlib.py", line 99, in __exit__
self.gen.throw(type, value, traceback)
File "/home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/numba/errors.py", line 499, in new_error_context
six.reraise(type(newerr), newerr, tb)
File "/home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/numba/six.py", line 659, in reraise
raise value
numba.errors.InternalError:
[1] During: typing of argument at <ipython-input-50-566e4e12481d> (3)
--%<----------------------------------------------------------------------------
File "<ipython-input-50-566e4e12481d>", line 3:
def set_(z):
x = set(z.sum())
^
This error may have been caused by the following argument(s):
- argument 0: Unsupported array dtype: object
This is not usually a problem with Numba itself but instead often caused by
the use of unsupported features or an issue in resolving types.
To see Python/NumPy features supported by the latest release of Numba visit:
http://numba.pydata.org/numba-doc/dev/reference/pysupported.html
and
http://numba.pydata.org/numba-doc/dev/reference/numpysupported.html
For more information about typing errors and how to debug them visit:
http://numba.pydata.org/numba-doc/latest/user/troubleshoot.html#my-code-doesn-t-compile
If you think your code should work with Numba, please report the error message
and traceback, along with a minimal reproducer at:
https://github.com/numba/numba/issues/new
Would anyone be able to help me in this regard?
Thanks & Best Regards
Michael
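For reference, a minimal sketch of one way to get a typed (non-object) array via NumPy's fixed-width unicode dtype; note that numba's string support is limited even then, so the set operation is shown in plain Python/NumPy:
import numpy as np

z = np.array(["hello world", "foo bar"], dtype=object)  # stand-in for train.sentence.values
z_unicode = z.astype('U')        # fixed-width unicode dtype such as '<U11', not object
unique_chars = set(''.join(z_unicode.tolist()))
print(unique_chars)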

Deleting index in elasticsearch

I want to delete an entire Elasticsearch index which I had created using the following code in a Python notebook.
es.index(index='para', doc_type='people', id=1, body={
    "name": "Farid ullah",
    "height": "160",
    "age": "23",
    "gender": "male",
    "date of birth": "04/02/1994",
    "Qualification": "BS in Software engineering"
})
The delete command is as follows:
es.delete(index='para', doc_type='people')
but I get the following error:
TypeError Traceback (most recent call last)
<ipython-input-7-26c24345ae23> in <module>()
----> 1 es.delete(index='para', doc_type='people')
C:\Users\Farid ullah\Anaconda3\lib\site-packages\elasticsearch\client\utils.py in _wrapped(*args, **kwargs)
71 if p in kwargs:
72 params[p] = kwargs.pop(p)
---> 73 return func(*args, params=params, **kwargs)
74 return _wrapped
75 return _wrapper
TypeError: delete() missing 1 required positional argument: 'id'
Am I not able to delete an entire index?
Is there any way to delete it without specifying the id of a particular document?
In your case, 'people' is not an index, it's a type. The index name is 'para'.
I don't know the Python API, but you should try something like:
es.delete(index='para')
In this doc:
http://elasticsearch-py.readthedocs.io/en/master/api.html
it is suggested to use something like:
es.indices.delete(index='para')
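A minimal sketch putting that together with the elasticsearch-py client; the host is hypothetical:
from elasticsearch import Elasticsearch

es = Elasticsearch(['http://localhost:9200'])  # hypothetical host

# Deletes the whole 'para' index, including every document and the mapping;
# ignore=[404] avoids an exception if the index does not exist.
es.indices.delete(index='para', ignore=[404])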

How do you use dask + distributed for NFS files?

Working from Matthew Rocklin's post on distributed data frames with Dask, I'm trying to distribute some summary statistics calculations across my cluster. Setting up the cluster with dcluster ... works fine. Inside a notebook,
import dask.dataframe as dd
from distributed import Executor, progress
e = Executor('...:8786')
df = dd.read_csv(...)
The file I'm reading is on an NFS mount that all the worker machines have access to. At this point I can look at df.head() for example and everything looks correct. From the blog post, I think I should be able to do this:
df_future = e.persist(df)
progress(df_future)
# ... wait for everything to load ...
df_future.head()
But that produces an error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-26-8d59adace8bf> in <module>()
----> 1 fraudf.head()
/work/analytics2/analytics/python/envs/analytics/lib/python3.5/site-packages/dask/dataframe/core.py in head(self, n, compute)
358
359 if compute:
--> 360 result = result.compute()
361 return result
362
/work/analytics2/analytics/python/envs/analytics/lib/python3.5/site-packages/dask/base.py in compute(self, **kwargs)
35
36 def compute(self, **kwargs):
---> 37 return compute(self, **kwargs)[0]
38
39 #classmethod
/work/analytics2/analytics/python/envs/analytics/lib/python3.5/site-packages/dask/base.py in compute(*args, **kwargs)
108 for opt, val in groups.items()])
109 keys = [var._keys() for var in variables]
--> 110 results = get(dsk, keys, **kwargs)
111
112 results_iter = iter(results)
/work/analytics2/analytics/python/envs/analytics/lib/python3.5/site-packages/dask/threaded.py in get(dsk, result, cache, num_workers, **kwargs)
55 results = get_async(pool.apply_async, len(pool._pool), dsk, result,
56 cache=cache, queue=queue, get_id=_thread_get_id,
---> 57 **kwargs)
58
59 return results
/work/analytics2/analytics/python/envs/analytics/lib/python3.5/site-packages/dask/async.py in get_async(apply_async, num_workers, dsk, result, cache, queue, get_id, raise_on_exception, rerun_exceptions_locally, callbacks, **kwargs)
479 _execute_task(task, data) # Re-execute locally
480 else:
--> 481 raise(remote_exception(res, tb))
482 state['cache'][key] = res
483 finish_task(dsk, key, state, results, keyorder.get)
AttributeError: 'Future' object has no attribute 'head'
Traceback
---------
File "/work/analytics2/analytics/python/envs/analytics/lib/python3.5/site-packages/dask/async.py", line 264, in execute_task
result = _execute_task(task, data)
File "/work/analytics2/analytics/python/envs/analytics/lib/python3.5/site-packages/dask/async.py", line 246, in _execute_task
return func(*args2)
File "/work/analytics2/analytics/python/envs/analytics/lib/python3.5/site-packages/dask/dataframe/core.py", line 354, in <lambda>
dsk = {(name, 0): (lambda x, n: x.head(n=n), (self._name, 0), n)}
What's the right approach to distributing a data frame when it comes from a normal file system instead of HDFS?
Dask is trying to use the single-machine scheduler, which is the default if you create a dataframe using the normal dask library. Switch the default to use your cluster with the following lines:
import dask
dask.set_options(get=e.get)
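A minimal sketch of the whole flow with that default in place; the scheduler address and file path are hypothetical, and dask.set_options was the API at the time of this question (newer Dask versions configure this differently):
import dask
import dask.dataframe as dd
from distributed import Executor, progress

e = Executor('scheduler-host:8786')        # hypothetical scheduler address
dask.set_options(get=e.get)                # route computations to the cluster by default

df = dd.read_csv('/mnt/nfs/data-*.csv')    # hypothetical NFS path visible to all workers
df = e.persist(df)                         # load the partitions into cluster memory
progress(df)
df.head()                                  # now executes on the cluster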
