Connect to S3 accelerate endpoint with boto3 - python-3.x

I want to download a file into a Python file object from an S3 bucket that has transfer acceleration enabled. I came across a few resources suggesting either overriding the endpoint_url with "s3-accelerate.amazonaws.com" or using the use_accelerate_endpoint attribute.
I have tried both, and several variations, but the same error was returned every time. One of the scripts I tried is:
from botocore.config import Config
import boto3
from io import BytesIO

session = boto3.session.Session()
s3 = session.client(
    service_name='s3',
    aws_access_key_id=<MY_KEY_ID>,
    aws_secret_access_key=<MY_KEY>,
    region_name="us-west-2",
    config=Config(s3={"use_accelerate_endpoint": True,
                      "addressing_style": "path"}))
input = BytesIO()
s3.download_fileobj(<MY_BUCKET>, <MY_KEY>, input)
This returns the following error:
---------------------------------------------------------------------------
ClientError Traceback (most recent call last)
<ipython-input-61-92b89b45f215> in <module>()
11 "addressing_style": "path"}))
12 input = BytesIO()
---> 13 s3.download_fileobj(bucket, filename, input)
14
15
~/Project/venv/lib/python3.5/site-packages/boto3/s3/inject.py in download_fileobj(self, Bucket, Key, Fileobj, ExtraArgs, Callback, Config)
568 bucket=Bucket, key=Key, fileobj=Fileobj,
569 extra_args=ExtraArgs, subscribers=subscribers)
--> 570 return future.result()
571
572
~/Project//venv/lib/python3.5/site-packages/s3transfer/futures.py in result(self)
71 # however if a KeyboardInterrupt is raised we want want to exit
72 # out of this and propogate the exception.
---> 73 return self._coordinator.result()
74 except KeyboardInterrupt as e:
75 self.cancel()
~/Project/venv/lib/python3.5/site-packages/s3transfer/futures.py in result(self)
231 # final result.
232 if self._exception:
--> 233 raise self._exception
234 return self._result
235
~/Project/venv/lib/python3.5/site-packages/s3transfer/tasks.py in _main(self, transfer_future, **kwargs)
253 # Call the submit method to start submitting tasks to execute the
254 # transfer.
--> 255 self._submit(transfer_future=transfer_future, **kwargs)
256 except BaseException as e:
257 # If there was an exception raised during the submission of task
~/Project/venv/lib/python3.5/site-packages/s3transfer/download.py in _submit(self, client, config, osutil, request_executor, io_executor, transfer_future)
347 Bucket=transfer_future.meta.call_args.bucket,
348 Key=transfer_future.meta.call_args.key,
--> 349 **transfer_future.meta.call_args.extra_args
350 )
351 transfer_future.meta.provide_transfer_size(
~/Project/venv/lib/python3.5/site-packages/botocore/client.py in _api_call(self, *args, **kwargs)
310 "%s() only accepts keyword arguments." % py_operation_name)
311 # The "self" in this scope is referring to the BaseClient.
--> 312 return self._make_api_call(operation_name, kwargs)
313
314 _api_call.__name__ = str(py_operation_name)
~/Project/venv/lib/python3.5/site-packages/botocore/client.py in _make_api_call(self, operation_name, api_params)
603 error_code = parsed_response.get("Error", {}).get("Code")
604 error_class = self.exceptions.from_code(error_code)
--> 605 raise error_class(parsed_response, operation_name)
606 else:
607 return parsed_response
ClientError: An error occurred (403) when calling the HeadObject operation: Forbidden
When I run the same script with "use_accelerate_endpoint": False, it works fine.
However, it returned the same error when:
I overrode the endpoint_url with "s3-accelerate.amazonaws.com"
I defined "addressing_style": "virtual"
When running
s3.get_bucket_accelerate_configuration(Bucket=<MY_BUCKET>)
I get {..., 'Status': 'Enabled'} as expected.
Any idea what is wrong with that code and what I should change to properly query the accelerate endpoint of that bucket?
Using Python 3.5 with boto3==1.4.7 and botocore==1.7.43 on Ubuntu 17.04.
EDIT:
I have also tried a similar script for uploads:
from botocore.config import Config
import boto3
from io import BytesIO

session = boto3.session.Session()
s3 = session.client(
    service_name='s3',
    aws_access_key_id=<MY_KEY_ID>,
    aws_secret_access_key=<MY_KEY>,
    region_name="us-west-2",
    config=Config(s3={"use_accelerate_endpoint": True,
                      "addressing_style": "virtual"}))
output = BytesIO()
output.seek(0)
s3.upload_fileobj(output, <MY_BUCKET>, <MY_KEY>)
This works without the use_accelerate_endpoint option (so my keys are fine), but returns the following error when it is set to True:
ClientError: An error occurred (SignatureDoesNotMatch) when calling the PutObject operation: The request signature we calculated does not match the signature you provided. Check your key and signing method.
I have tried both addressing_style options here as well (virtual and path).

Using boto3==1.4.7 and botocore==1.7.43.
Here is one way to retrieve an object from a bucket with transfer acceleration enabled.
import boto3
from botocore.config import Config
from io import BytesIO

config = Config(s3={"use_accelerate_endpoint": True})
s3_resource = boto3.resource("s3",
                             aws_access_key_id=<MY_KEY_ID>,
                             aws_secret_access_key=<MY_KEY>,
                             region_name="us-west-2",
                             config=config)
s3_client = s3_resource.meta.client

file_object = BytesIO()
s3_client.download_fileobj(<MY_BUCKET>, <MY_KEY>, file_object)
Note that the client sends a HEAD request to the accelerated endpoint before the GET.
The canonical request for it looks somewhat like the following:
CanonicalRequest:
HEAD
/<MY_KEY>
host:<MY_BUCKET>.s3-accelerate.amazonaws.com
x-amz-content-sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
x-amz-date:20200520T204128Z
host;x-amz-content-sha256;x-amz-date
e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
Some reasons why the HEAD request can fail include:
Object with given key doesn't exist or has strict access control enabled
Invalid credentials
Transfer acceleration isn't enabled
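To see which of these applies, one option is to issue the HEAD request yourself with head_object against a client configured for the accelerated endpoint. This is a minimal sketch, assuming the same placeholder credentials, bucket, and key as above:
import boto3
from botocore.config import Config
from botocore.exceptions import ClientError

s3_client = boto3.client(
    "s3",
    aws_access_key_id=<MY_KEY_ID>,
    aws_secret_access_key=<MY_KEY>,
    region_name="us-west-2",
    config=Config(s3={"use_accelerate_endpoint": True}))

try:
    # Same HEAD call the transfer manager makes before downloading
    response = s3_client.head_object(Bucket=<MY_BUCKET>, Key=<MY_KEY>)
    print(response["ContentLength"])
except ClientError as err:
    # Surfaces the error code/message behind the bare 403
    print(err.response["Error"])
If this call already fails, the problem lies with the object, the credentials, or the bucket's acceleration status rather than with download_fileobj itself.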

How to load a torchvision model from disk?

I'm trying to follow the solution from the top answer here to load an object detection model from the .pth file.
os.environ['TORCH_HOME'] = '../input/torchvision-fasterrcnn-resnet-50/' #setting the environment variable
model = detection.fasterrcnn_resnet50_fpn(pretrained=False).to(DEVICE)
I get the following error:
NotADirectoryError: [Errno 20] Not a directory: '../input/torchvision-fasterrcnn-resnet-50/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth/hub'
Google did not reveal an answer to the error, and I don't exactly know what it means except for the obvious (that the folder 'hub' is missing).
Do I have to unpack or create a folder?
I have tried loading the weights but I get the same error message.
This is how I load the model:
model = detection.fasterrcnn_resnet50_fpn(pretrained=True)
checkpoint = torch.load('../input/torchvision-fasterrcnn-resnet-50/model.pth.tar')
model.load_state_dict(checkpoint['state_dict'])
Thank you for your help!
Full Error Trace:
gaierror: [Errno -3] Temporary failure in name resolution
During handling of the above exception, another exception occurred:
URLError Traceback (most recent call last)
/tmp/ipykernel_42/1218627017.py in <module>
1 # to load
----> 2 model = detection.fasterrcnn_resnet50_fpn(pretrained=True)
3 checkpoint = torch.load('../input/torchvision-fasterrcnn-resnet-50/model.pth.tar')
4 model.load_state_dict(checkpoint['state_dict'])
/opt/conda/lib/python3.7/site-packages/torchvision/models/detection/faster_rcnn.py in fasterrcnn_resnet50_fpn(pretrained, progress, num_classes, pretrained_backbone, trainable_backbone_layers, **kwargs)
360 if pretrained:
361 state_dict = load_state_dict_from_url(model_urls['fasterrcnn_resnet50_fpn_coco'],
--> 362 progress=progress)
363 model.load_state_dict(state_dict)
364 return model
/opt/conda/lib/python3.7/site-packages/torch/hub.py in load_state_dict_from_url(url, model_dir, map_location, progress, check_hash, file_name)
553 r = HASH_REGEX.search(filename) # r is Optional[Match[str]]
554 hash_prefix = r.group(1) if r else None
--> 555 download_url_to_file(url, cached_file, hash_prefix, progress=progress)
556
557 if _is_legacy_zip_format(cached_file):
/opt/conda/lib/python3.7/site-packages/torch/hub.py in download_url_to_file(url, dst, hash_prefix, progress)
423 # certificates in older Python
424 req = Request(url, headers={"User-Agent": "torch.hub"})
--> 425 u = urlopen(req)
426 meta = u.info()
427 if hasattr(meta, 'getheaders'):
/opt/conda/lib/python3.7/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
/opt/conda/lib/python3.7/urllib/request.py in open(self, fullurl, data, timeout)
523 req = meth(req)
524
--> 525 response = self._open(req, data)
526
527 # post-process response
/opt/conda/lib/python3.7/urllib/request.py in _open(self, req, data)
541 protocol = req.type
542 result = self._call_chain(self.handle_open, protocol, protocol +
--> 543 '_open', req)
544 if result:
545 return result
/opt/conda/lib/python3.7/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
501 for handler in handlers:
502 func = getattr(handler, meth_name)
--> 503 result = func(*args)
504 if result is not None:
505 return result
/opt/conda/lib/python3.7/urllib/request.py in https_open(self, req)
1391 def https_open(self, req):
1392 return self.do_open(http.client.HTTPSConnection, req,
-> 1393 context=self._context, check_hostname=self._check_hostname)
1394
1395 https_request = AbstractHTTPHandler.do_request_
/opt/conda/lib/python3.7/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
1350 encode_chunked=req.has_header('Transfer-encoding'))
1351 except OSError as err: # timeout error
-> 1352 raise URLError(err)
1353 r = h.getresponse()
1354 except:
URLError: <urlopen error [Errno -3] Temporary failure in name resolution>
If you are loading a pretrained network from disk, you don't need torchvision to download its pretrained weights (the COCO weights it fetches when pretrained=True). You have two options:
Either set pretrained=False and load your weights using:
checkpoint = torch.load('../input/torchvision-fasterrcnn-resnet-50/model.pth.tar')
model.load_state_dict(checkpoint['state_dict'])
Or, if you decide to change TORCH_HOME (which is not ideal), you need to keep the same directory structure torchvision expects, which would be:
inputs/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth
In practice, you wouldn't change TORCH_HOME just to load one model.
I found the solution to the problem by digging deep into GitHub; it is a little hidden.
detection.fasterrcnn_resnet50_fpn()
has a default argument besides pretrained, called pretrained_backbone, which is set to True by default; when it is True, the backbone weights are also downloaded from a dictionary of URLs.
This will work:
detection.fasterrcnn_resnet50_fpn(pretrained=False, pretrained_backbone=False, num_classes=91)
Then load the model as usual.
num_classes is expected; in the docs its default is 91, but on GitHub I saw it as None, which is why I added it here for safety.
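Putting both answers together, a minimal sketch for loading the checkpoint fully offline, assuming the checkpoint path and the 'state_dict' key from the question:
import torch
from torchvision.models import detection

# Build the architecture without downloading any weights
model = detection.fasterrcnn_resnet50_fpn(
    pretrained=False,           # skip the COCO detection weights
    pretrained_backbone=False,  # skip the backbone weights as well
    num_classes=91)

# Load the local checkpoint (path taken from the question)
checkpoint = torch.load('../input/torchvision-fasterrcnn-resnet-50/model.pth.tar',
                        map_location='cpu')
model.load_state_dict(checkpoint['state_dict'])
model.eval()  # switch to inference mode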

Writing CSV file into dataframe from FTPS server with python

I am trying to get a CSV file out of an FTPS server into a dataframe, but I am receiving an error. Here is my code:
import ftplib
import pandas as pd

file = r'filename.csv'

with ftplib.FTP() as ftp:
    with open(file, 'rb') as f:
        ftp.retrbinary(file, f.read)

df1 = pd.read_csv(file)
df1.head()
with this particular error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-10-a2725f958d45> in <module>
4
5 with open(file, 'rb') as f:
----> 6 ftp.retrbinary(file, f.read)
7 df1= pd.read_csv(file) #delimiter = '|', encoding = 'latin1')
8 df1.head()
~\AppData\Local\Continuum\anaconda3\lib\ftplib.py in retrbinary(self, cmd, callback, blocksize, rest)
439 The response code.
440 """
--> 441 self.voidcmd('TYPE I')
442 with self.transfercmd(cmd, rest) as conn:
443 while 1:
~\AppData\Local\Continuum\anaconda3\lib\ftplib.py in voidcmd(self, cmd)
275 def voidcmd(self, cmd):
276 """Send a command and expect a response beginning with '2'."""
--> 277 self.putcmd(cmd)
278 return self.voidresp()
279
~\AppData\Local\Continuum\anaconda3\lib\ftplib.py in putcmd(self, line)
197 def putcmd(self, line):
198 if self.debugging: print('*cmd*', self.sanitize(line))
--> 199 self.putline(line)
200
201 # Internal: return one line from the server, stripping CRLF.
~\AppData\Local\Continuum\anaconda3\lib\ftplib.py in putline(self, line)
192 if self.debugging > 1:
193 print('*put*', self.sanitize(line))
--> 194 self.sock.sendall(line.encode(self.encoding))
195
196 # Internal: send one command to the server (through putline())
AttributeError: 'NoneType' object has no attribute 'sendall'
Any ideas as to why this isn't putting the requested file into a dataframe?
The documentation says that the cmd argument of the retrbinary method should be an appropriate RETR command (RETR filename), and that the callback function is called for each block of data received.
If you need to fetch the data, write it to a file, and read that file back, open the file for writing and try: ftp.retrbinary(f'RETR {file}', f.write)
Method name:
retrbinary
retrbinary(cmd, callback, blocksize=8192, rest=None)
callback: the callback function is called for each block of data received from the FTP server. It can be used for processing the data received, for example to write the received blocks into a file.
For example, you can use this:
fhandle = open(filename, 'wb')
ftp.retrbinary('RETR ' + filename, fhandle.write)
or
ftp.retrbinary('RETR %s' % FILE, open(FILE, 'wb').write)
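For reference, a minimal end-to-end sketch along the same lines; the host and credentials are placeholders, since the original post never shows the connection step, and FTP_TLS is assumed because the question mentions an FTPS server:
import ftplib
import pandas as pd

file = 'filename.csv'

with ftplib.FTP_TLS('ftp.example.com') as ftp:   # placeholder host
    ftp.login('user', 'password')                # placeholder credentials
    ftp.prot_p()                                 # protect the data connection (FTPS)
    with open(file, 'wb') as f:                  # open for writing, not reading
        ftp.retrbinary(f'RETR {file}', f.write)  # RETR command + write callback

df1 = pd.read_csv(file)
df1.head()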

AttributeError: 'NoneType' object has no attribute 'encode' (Binance)

I expect this simple script, which connects to Binance and gets the details of my account using the python-binance library (version 0.7.9), to work out of the box, but it does not. I am able to connect to the API and get the price of Bitcoin, so I am confident the problem is not the connection itself.
import os
from binance.client import Client
from binance.websockets import BinanceSocketManager
from twisted.internet import reactor
# Get keys
api_key = os.environ.get('binance_api')
api_secret = os.environ.get('binance_secret')
# Connect to Binance
client = Client(api_key, api_secret)
print(client.get_account())
This gives the following error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-4-9c56ae96674c> in <module>
----> 1 print(client.get_account())
~/.local/lib/python3.8/site-packages/binance/client.py in get_account(self, **params)
1765
1766 """
-> 1767 return self._get('account', True, data=params)
1768
1769 def get_asset_balance(self, asset, **params):
~/.local/lib/python3.8/site-packages/binance/client.py in _get(self, path, signed, version, **kwargs)
235
236 def _get(self, path, signed=False, version=PUBLIC_API_VERSION, **kwargs):
--> 237 return self._request_api('get', path, signed, version, **kwargs)
238
239 def _post(self, path, signed=False, version=PUBLIC_API_VERSION, **kwargs):
~/.local/lib/python3.8/site-packages/binance/client.py in _request_api(self, method, path, signed, version, **kwargs)
200 uri = self._create_api_uri(path, signed, version)
201
--> 202 return self._request(method, uri, signed, **kwargs)
203
204 def _request_withdraw_api(self, method, path, signed=False, **kwargs):
~/.local/lib/python3.8/site-packages/binance/client.py in _request(self, method, uri, signed, force_params, **kwargs)
178 # generate signature
179 kwargs['data']['timestamp'] = int(time.time() * 1000)
--> 180 kwargs['data']['signature'] = self._generate_signature(kwargs['data'])
181
182 # sort get and post params to match signature order
~/.local/lib/python3.8/site-packages/binance/client.py in _generate_signature(self, data)
133 ordered_data = self._order_params(data)
134 query_string = '&'.join(["{}={}".format(d[0], d[1]) for d in ordered_data])
--> 135 m = hmac.new(self.API_SECRET.encode('utf-8'), query_string.encode('utf-8'), hashlib.sha256)
136 return m.hexdigest()
137
AttributeError: 'NoneType' object has no attribute 'encode'
Checking the docs, I don't see what I might be doing wrong. The traceback suggests the error might be inside the python-binance library, but I'm not sure, because this is a basic feature and should work without any trouble.
I appreciate your help.
The keys were not imported correctly; check with 'binance_api' in os.environ. If you use the fish shell, add set -gx ENV_NAME value to ~/.config/fish/config.fish.
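A quick sanity check before constructing the client, as a sketch using the variable names from the original post:
import os
from binance.client import Client

# Fail fast if either key is missing from the environment
for name in ('binance_api', 'binance_secret'):
    if name not in os.environ:
        raise RuntimeError(f"Environment variable {name!r} is not set")

client = Client(os.environ['binance_api'], os.environ['binance_secret'])
print(client.get_account())
Client(None, None) does not fail immediately; the missing secret only surfaces later, when the first signed request calls API_SECRET.encode, which is exactly the AttributeError in the traceback.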

Python3 code Uploading to S3 bucket with IO instead of String IO

I am trying to download a zip file into memory, expand it, and upload its contents to S3.
import boto3
import io
import zipfile
import mimetypes

s3 = boto3.resource('s3')
service_zip = io.BytesIO()
service_bucket = s3.Bucket('services.mydomain.com')
build_bucket = s3.Bucket('servicesbuild.mydomain.com')
build_bucket.download_fileobj('servicesbuild.zip', service_zip)

with zipfile.ZipFile(service_zip) as myzip:
    for nm in myzip.namelist():
        obj = myzip.open(nm)
        print(obj)
        service_bucket.upload_fileobj(obj, nm,
            ExtraArgs={'ContentType': mimetypes.guess_type(nm)[0]})
        service_bucket.Object(nm).Acl().put(ACL='public-read')
Here is the error I get:
<zipfile.ZipExtFile name='favicon.ico' mode='r' compress_type=deflate>
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-7-5941e5e45adc> in <module>
18 print(obj)
19 service_bucket.upload_fileobj(obj,nm,
---> 20 ExtraArgs={'ContentType': mimetypes.guess_type(nm)[0]})
21 service_bucket.Object(nm).Acl().put(ACL='public-read')
~/bitbucket/clguru/env/lib/python3.7/site-packages/boto3/s3/inject.py in bucket_upload_fileobj(self, Fileobj, Key, ExtraArgs, Callback, Config)
579 return self.meta.client.upload_fileobj(
580 Fileobj=Fileobj, Bucket=self.name, Key=Key, ExtraArgs=ExtraArgs,
--> 581 Callback=Callback, Config=Config)
582
583
~/bitbucket/clguru/env/lib/python3.7/site-packages/boto3/s3/inject.py in upload_fileobj(self, Fileobj, Bucket, Key, ExtraArgs, Callback, Config)
537 fileobj=Fileobj, bucket=Bucket, key=Key,
538 extra_args=ExtraArgs, subscribers=subscribers)
--> 539 return future.result()
540
541
~/bitbucket/clguru/env/lib/python3.7/site-packages/s3transfer/futures.py in result(self)
71 # however if a KeyboardInterrupt is raised we want want to exit
72 # out of this and propogate the exception.
---> 73 return self._coordinator.result()
74 except KeyboardInterrupt as e:
75 self.cancel()
~/bitbucket/clguru/env/lib/python3.7/site-packages/s3transfer/futures.py in result(self)
231 # final result.
232 if self._exception:
--> 233 raise self._exception
234 return self._result
235
~/bitbucket/clguru/env/lib/python3.7/site-packages/s3transfer/tasks.py in _main(self, transfer_future, **kwargs)
253 # Call the submit method to start submitting tasks to execute the
254 # transfer.
--> 255 self._submit(transfer_future=transfer_future, **kwargs)
256 except BaseException as e:
257 # If there was an exception raised during the submission of task
~/bitbucket/clguru/env/lib/python3.7/site-packages/s3transfer/upload.py in _submit(self, client, config, osutil, request_executor, transfer_future, bandwidth_limiter)
547 # Determine the size if it was not provided
548 if transfer_future.meta.size is None:
--> 549 upload_input_manager.provide_transfer_size(transfer_future)
550
551 # Do a multipart upload if needed, otherwise do a regular put object.
~/bitbucket/clguru/env/lib/python3.7/site-packages/s3transfer/upload.py in provide_transfer_size(self, transfer_future)
324 fileobj.seek(0, 2)
325 end_position = fileobj.tell()
--> 326 fileobj.seek(start_position)
327 transfer_future.meta.provide_transfer_size(
328 end_position - start_position)
/usr/local/Cellar/python/3.7.0/Frameworks/Python.framework/Versions/3.7/lib/python3.7/zipfile.py in seek(self, offset, whence)
1023 # Position is before the current position. Reset the ZipExtFile
1024
-> 1025 self._fileobj.seek(self._orig_compress_start)
1026 self._running_crc = self._orig_start_crc
1027 self._compress_left = self._orig_compress_size
/usr/local/Cellar/python/3.7.0/Frameworks/Python.framework/Versions/3.7/lib/python3.7/zipfile.py in seek(self, offset, whence)
702 def seek(self, offset, whence=0):
703 with self._lock:
--> 704 if self.writing():
705 raise ValueError("Can't reposition in the ZIP file while "
706 "there is an open writing handle on it. "
AttributeError: '_SharedFile' object has no attribute 'writing'
If I comment out the lines after print(obj) to validate the zip file content,
import boto3
import io
import zipfile
import mimetypes

s3 = boto3.resource('s3')
service_zip = io.BytesIO()
service_bucket = s3.Bucket('services.readspeech.com')
build_bucket = s3.Bucket('servicesbuild.readspeech.com')
build_bucket.download_fileobj('servicesbuild.zip', service_zip)

with zipfile.ZipFile(service_zip) as myzip:
    for nm in myzip.namelist():
        obj = myzip.open(nm)
        print(obj)
        # service_bucket.upload_fileobj(obj, nm,
        #     ExtraArgs={'ContentType': mimetypes.guess_type(nm)[0]})
        # service_bucket.Object(nm).Acl().put(ACL='public-read')
I see the following:
<zipfile.ZipExtFile name='favicon.ico' mode='r' compress_type=deflate>
<zipfile.ZipExtFile name='styles/main.css' mode='r' compress_type=deflate>
<zipfile.ZipExtFile name='images/example3.png' mode='r' compress_type=deflate>
<zipfile.ZipExtFile name='images/example1.png' mode='r' compress_type=deflate>
<zipfile.ZipExtFile name='images/example2.png' mode='r' compress_type=deflate>
<zipfile.ZipExtFile name='index.html' mode='r' compress_type=deflate>
It appears the issue is with Python 3.7. I downgraded to Python 3.6 and everything is fine. There is a bug reported against Python 3.7:
The misprint in the file Lib/zipfile.py in line 704 leads to AttributeError: '_SharedFile' object has no attribute 'writing'. "self.writing()" should be replaced by "self._writing()". I also think this code should be covered by tests.
So to resolve the issue, use Python 3.6.
On macOS you can go back to Python 3.6 with the following command:
brew switch python 3.6.4_4
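If downgrading isn't an option, one possible workaround (not part of the original answer) is to avoid seeking on the ZipExtFile altogether by buffering each entry into a BytesIO, which upload_fileobj can seek freely. A sketch, reusing the bucket objects and variable names from the question:
import io
import mimetypes
import zipfile

with zipfile.ZipFile(service_zip) as myzip:
    for nm in myzip.namelist():
        # Read the whole entry into a seekable in-memory buffer
        buffer = io.BytesIO(myzip.read(nm))
        service_bucket.upload_fileobj(buffer, nm,
            ExtraArgs={'ContentType': mimetypes.guess_type(nm)[0]})
        service_bucket.Object(nm).Acl().put(ACL='public-read')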

How do you use dask + distributed for NFS files?

Working from Matthew Rocklin's post on distributed data frames with Dask, I'm trying to distribute some summary statistics calculations across my cluster. Setting up the cluster with dcluster ... works fine. Inside a notebook,
import dask.dataframe as dd
from distributed import Executor, progress
e = Executor('...:8786')
df = dd.read_csv(...)
The file I'm reading is on an NFS mount that all the worker machines have access to. At this point I can look at df.head() for example and everything looks correct. From the blog post, I think I should be able to do this:
df_future = e.persist(df)
progress(df_future)
# ... wait for everything to load ...
df_future.head()
But that raises an error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-26-8d59adace8bf> in <module>()
----> 1 fraudf.head()
/work/analytics2/analytics/python/envs/analytics/lib/python3.5/site-packages/dask/dataframe/core.py in head(self, n, compute)
358
359 if compute:
--> 360 result = result.compute()
361 return result
362
/work/analytics2/analytics/python/envs/analytics/lib/python3.5/site-packages/dask/base.py in compute(self, **kwargs)
35
36 def compute(self, **kwargs):
---> 37 return compute(self, **kwargs)[0]
38
39 #classmethod
/work/analytics2/analytics/python/envs/analytics/lib/python3.5/site-packages/dask/base.py in compute(*args, **kwargs)
108 for opt, val in groups.items()])
109 keys = [var._keys() for var in variables]
--> 110 results = get(dsk, keys, **kwargs)
111
112 results_iter = iter(results)
/work/analytics2/analytics/python/envs/analytics/lib/python3.5/site-packages/dask/threaded.py in get(dsk, result, cache, num_workers, **kwargs)
55 results = get_async(pool.apply_async, len(pool._pool), dsk, result,
56 cache=cache, queue=queue, get_id=_thread_get_id,
---> 57 **kwargs)
58
59 return results
/work/analytics2/analytics/python/envs/analytics/lib/python3.5/site-packages/dask/async.py in get_async(apply_async, num_workers, dsk, result, cache, queue, get_id, raise_on_exception, rerun_exceptions_locally, callbacks, **kwargs)
479 _execute_task(task, data) # Re-execute locally
480 else:
--> 481 raise(remote_exception(res, tb))
482 state['cache'][key] = res
483 finish_task(dsk, key, state, results, keyorder.get)
AttributeError: 'Future' object has no attribute 'head'
Traceback
---------
File "/work/analytics2/analytics/python/envs/analytics/lib/python3.5/site-packages/dask/async.py", line 264, in execute_task
result = _execute_task(task, data)
File "/work/analytics2/analytics/python/envs/analytics/lib/python3.5/site-packages/dask/async.py", line 246, in _execute_task
return func(*args2)
File "/work/analytics2/analytics/python/envs/analytics/lib/python3.5/site-packages/dask/dataframe/core.py", line 354, in <lambda>
dsk = {(name, 0): (lambda x, n: x.head(n=n), (self._name, 0), n)}
What's the right approach to distributing a data frame when it comes from a normal file system instead of HDFS?
Dask is trying to use the single-machine scheduler, which is the default if you create a dataframe using the normal dask library. Switch the default to use your cluster with the following lines:
import dask
dask.set_options(get=e.get)
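With the default get set, the workflow from the question (repeated here as a sketch; the scheduler address and CSV path are elided, as in the original post) runs on the cluster instead of the local thread pool:
import dask
import dask.dataframe as dd
from distributed import Executor, progress

e = Executor('...:8786')       # scheduler address elided in the original post
dask.set_options(get=e.get)    # route computations to the distributed scheduler

df = dd.read_csv(...)          # NFS path elided in the original post
df_future = e.persist(df)
progress(df_future)
df_future.head()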
