How to stream data to a Bokeh server using Holoviews - python-3.x

I want to stream data generated by Python to a webpage.
I came up with the following example, put together using examples from
https://holoviews.org/user_guide/Streaming_Data.html
and
http://holoviews.org/user_guide/Deploying_Bokeh_Apps.html
However, I get a document lock error:
'_pending_writes should be non-None when we have a document lock, and we should have the lock when the document changes'
This is my example:
import numpy as np
import holoviews as hv
import holoviews.plotting.bokeh
import streamz
import streamz.dataframe
renderer = hv.renderer('bokeh')
from holoviews import opts
from holoviews.streams import Pipe, Buffer
hv.extension('bokeh')
source_df = streamz.dataframe.Random(freq='5ms', interval='100ms')
sdf = (source_df-0.5).cumsum()
raw_dmap = hv.DynamicMap(hv.Curve, streams=[Buffer(sdf.x)])
smooth_dmap = hv.DynamicMap(hv.Curve, streams=[Buffer(sdf.x.rolling('50ms').mean())])
fig = (raw_dmap.relabel('raw') * smooth_dmap.relabel('smooth')).opts(
    opts.Curve(width=500, show_grid=True))
server = renderer.app(fig, show=True, new_window=True)
A page opens and the figure shows up, but it is not updating. In my notebook I get the following error:
tornado.application - ERROR - Exception in callback functools.partial(<function wrap.<locals>.null_wrapper at 0x00000234E3CB9400>, <Future finished exception=RuntimeError('_pending_writes should be non-None when we have a document lock, and we should have the lock when the document changes')>)
Traceback (most recent call last):
File "C:\ProgramData\Anaconda3\lib\site-packages\tornado\ioloop.py", line 758, in _run_callback
ret = callback()
File "C:\ProgramData\Anaconda3\lib\site-packages\tornado\stack_context.py", line 300, in null_wrapper
return fn(*args, **kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\tornado\ioloop.py", line 779, in _discard_future_result
future.result()
File "C:\ProgramData\Anaconda3\lib\site-packages\tornado\gen.py", line 1147, in run
yielded = self.gen.send(value)
File "C:\ProgramData\Anaconda3\lib\site-packages\streamz\dataframe\core.py", line 802, in _cb
yield source._emit((last, now, freq))
File "C:\ProgramData\Anaconda3\lib\site-packages\streamz\core.py", line 298, in _emit
r = downstream.update(x, who=self)
File "C:\ProgramData\Anaconda3\lib\site-packages\streamz\core.py", line 563, in update
return self._emit(result)
File "C:\ProgramData\Anaconda3\lib\site-packages\streamz\core.py", line 298, in _emit
r = downstream.update(x, who=self)
File "C:\ProgramData\Anaconda3\lib\site-packages\streamz\core.py", line 563, in update
return self._emit(result)
File "C:\ProgramData\Anaconda3\lib\site-packages\streamz\core.py", line 298, in _emit
r = downstream.update(x, who=self)
File "C:\ProgramData\Anaconda3\lib\site-packages\streamz\core.py", line 747, in update
return self._emit(result)
File "C:\ProgramData\Anaconda3\lib\site-packages\streamz\core.py", line 298, in _emit
r = downstream.update(x, who=self)
File "C:\ProgramData\Anaconda3\lib\site-packages\streamz\core.py", line 563, in update
return self._emit(result)
File "C:\ProgramData\Anaconda3\lib\site-packages\streamz\core.py", line 298, in _emit
r = downstream.update(x, who=self)
File "C:\ProgramData\Anaconda3\lib\site-packages\streamz\core.py", line 563, in update
return self._emit(result)
File "C:\ProgramData\Anaconda3\lib\site-packages\streamz\core.py", line 298, in _emit
r = downstream.update(x, who=self)
File "C:\ProgramData\Anaconda3\lib\site-packages\streamz\core.py", line 516, in update
result = self.func(x, *self.args, **self.kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\holoviews\streams.py", line 436, in send
self.event(data=data)
File "C:\ProgramData\Anaconda3\lib\site-packages\holoviews\streams.py", line 375, in event
self.trigger([self])
File "C:\ProgramData\Anaconda3\lib\site-packages\holoviews\streams.py", line 156, in trigger
subscriber(**dict(union))
File "C:\ProgramData\Anaconda3\lib\site-packages\holoviews\plotting\plot.py", line 615, in refresh
self._trigger_refresh(stream_key)
File "C:\ProgramData\Anaconda3\lib\site-packages\holoviews\plotting\plot.py", line 624, in _trigger_refresh
self.update(key)
File "C:\ProgramData\Anaconda3\lib\site-packages\holoviews\plotting\plot.py", line 596, in update
item = self.__getitem__(key)
File "C:\ProgramData\Anaconda3\lib\site-packages\holoviews\plotting\plot.py", line 261, in __getitem__
self.update_frame(frame)
File "C:\ProgramData\Anaconda3\lib\site-packages\holoviews\plotting\bokeh\element.py", line 1944, in update_frame
self._update_ranges(element, ranges)
File "C:\ProgramData\Anaconda3\lib\site-packages\holoviews\plotting\bokeh\element.py", line 657, in _update_ranges
self._shared['x'], self.logx, streaming)
File "C:\ProgramData\Anaconda3\lib\site-packages\holoviews\plotting\bokeh\element.py", line 702, in _update_range
axis_range.trigger(k, old, new)
File "C:\ProgramData\Anaconda3\lib\site-packages\bokeh\model.py", line 599, in trigger
super(Model, self).trigger(attr, old, new, hint=hint, setter=setter)
File "C:\ProgramData\Anaconda3\lib\site-packages\bokeh\util\callback_manager.py", line 143, in trigger
self._document._notify_change(self, attr, old, new, hint, setter, invoke)
File "C:\ProgramData\Anaconda3\lib\site-packages\bokeh\document\document.py", line 1004, in _notify_change
self._trigger_on_change(event)
File "C:\ProgramData\Anaconda3\lib\site-packages\bokeh\document\document.py", line 1099, in _trigger_on_change
self._with_self_as_curdoc(invoke_callbacks)
File "C:\ProgramData\Anaconda3\lib\site-packages\bokeh\document\document.py", line 1112, in _with_self_as_curdoc
return f()
File "C:\ProgramData\Anaconda3\lib\site-packages\bokeh\document\document.py", line 1098, in invoke_callbacks
cb(event)
File "C:\ProgramData\Anaconda3\lib\site-packages\bokeh\document\document.py", line 668, in <lambda>
self._callbacks[receiver] = lambda event: event.dispatch(receiver)
File "C:\ProgramData\Anaconda3\lib\site-packages\bokeh\document\events.py", line 244, in dispatch
super(ModelChangedEvent, self).dispatch(receiver)
File "C:\ProgramData\Anaconda3\lib\site-packages\bokeh\document\events.py", line 126, in dispatch
receiver._document_patched(self)
File "C:\ProgramData\Anaconda3\lib\site-packages\bokeh\server\session.py", line 214, in _document_patched
raise RuntimeError("_pending_writes should be non-None when we have a document lock, and we should have the lock when the document changes")
RuntimeError: _pending_writes should be non-None when we have a document lock, and we should have the lock when the document changes
Any clues what I'm doing wrong?
Kind regards

I changed the last line to renderer.server_doc(fig) and saved everything as a notebook named test.ipynb. From the command prompt I ran bokeh serve --show .\test.ipynb. The server came up and the streaming data is shown in the browser as expected.
import numpy as np
import holoviews as hv
import holoviews.plotting.bokeh
import streamz
import streamz.dataframe
renderer = hv.renderer('bokeh')
from holoviews import opts
from holoviews.streams import Pipe, Buffer
hv.extension('bokeh')
source_df = streamz.dataframe.Random(freq='5ms', interval='100ms')
sdf = (source_df-0.5).cumsum()
raw_dmap = hv.DynamicMap(hv.Curve, streams=[Buffer(sdf.x)])
smooth_dmap = hv.DynamicMap(hv.Curve, streams=[Buffer(sdf.x.rolling('50ms').mean())])
fig = (raw_dmap.relabel('raw') * smooth_dmap.relabel('smooth')).opts(
    opts.Curve(width=500, show_grid=True))
renderer.server_doc(fig)
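If you want to keep everything in a single script instead of going through bokeh serve, the Deploying Bokeh Apps guide also shows how to start the server programmatically, so the streaming callbacks run on the server's own IO loop. A minimal sketch (the port choice and loop handling here are mine, not from the original post):
from bokeh.server.server import Server
from tornado.ioloop import IOLoop
app = renderer.app(fig)  # wrap the HoloViews object as a Bokeh Application
server = Server({'/': app}, port=5006)
server.start()
server.show('/')  # open a browser tab pointing at the running app
IOLoop.current().start()  # run the server's event loop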

Related

BERTopic: pop from empty list IndexError during inference

I have trained a BERTopic model on Colab, and when I now try to use it locally I get an IndexError:
IndexError: Failed in nopython mode pipeline (step: analyzing bytecode)
pop from empty list
The code I used is:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
sentence_model = SentenceTransformer('KBLab/sentence-bert-swedish-cased')
model = BERTopic.load('bertopic_model')
text = "my text here for example"
text = [text]
embeddings = sentence_model.encode(text)
topic, _ = model.transform(text, embeddings)
The last line gives me the error.
Notably, the same code works just fine on Colab; I'm not sure what's going on locally.
My numba and the other related libraries are up to date, just as they are on Colab.
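Since the identical code works on Colab, a first sanity check (my own debugging sketch, not from the original post) is to print the interpreter and library versions in both environments; numba compiles Python bytecode, so even a different Python minor version can change what its bytecode analysis sees:
import sys
import numba, umap, bertopic, sentence_transformers
# compare these between Colab and the local machine
print(sys.version)
print('numba', numba.__version__)
print('umap-learn', umap.__version__)
print('bertopic', bertopic.__version__)
print('sentence-transformers', sentence_transformers.__version__)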
Full Traceback:
Traceback (most recent call last):
File "/home/vaibhav/.local/lib/python3.10/site-packages/flask/app.py", line 2525, in wsgi_app
response = self.full_dispatch_request()
File "/home/vaibhav/.local/lib/python3.10/site-packages/flask/app.py", line 1822, in full_dispatch_request
rv = self.handle_user_exception(e)
File "/home/vaibhav/.local/lib/python3.10/site-packages/flask/app.py", line 1820, in full_dispatch_request
rv = self.dispatch_request()
File "/home/vaibhav/.local/lib/python3.10/site-packages/flask/app.py", line 1796, in dispatch_request
return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)
File "app.py", line 20, in reference_prediction
preds = data_process(input_api)
File "data_process.py", line 63, in data_process
topic, _ = topic_model_mi.transform(text, embeddings)
File "/home/vaibhav/.local/lib/python3.10/site-packages/bertopic/_bertopic.py", line 423, in transform
umap_embeddings = self.umap_model.transform(embeddings)
File "/home/vaibhav/.local/lib/python3.10/site-packages/umap/umap_.py", line 2859, in transform
dmat = pairwise_distances(
File "/home/vaibhav/.local/lib/python3.10/site-packages/sklearn/metrics/pairwise.py", line 2022, in pairwise_distances
return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
File "/home/vaibhav/.local/lib/python3.10/site-packages/sklearn/metrics/pairwise.py", line 1563, in _parallel_pairwise
return func(X, Y, **kwds)
File "/home/vaibhav/.local/lib/python3.10/site-packages/sklearn/metrics/pairwise.py", line 1607, in _pairwise_callable
out[i, j] = metric(X[i], Y[j], **kwds)
File "/home/vaibhav/.local/lib/python3.10/site-packages/numba/core/dispatcher.py", line 487, in _compile_for_args
raise e
File "/home/vaibhav/.local/lib/python3.10/site-packages/numba/core/dispatcher.py", line 420, in _compile_for_args
return_val = self.compile(tuple(argtypes))
File "/home/vaibhav/.local/lib/python3.10/site-packages/numba/core/dispatcher.py", line 965, in compile
cres = self._compiler.compile(args, return_type)
File "/home/vaibhav/.local/lib/python3.10/site-packages/numba/core/dispatcher.py", line 125, in compile
status, retval = self._compile_cached(args, return_type)
File "/home/vaibhav/.local/lib/python3.10/site-packages/numba/core/dispatcher.py", line 139, in _compile_cached
retval = self._compile_core(args, return_type)
File "/home/vaibhav/.local/lib/python3.10/site-packages/numba/core/dispatcher.py", line 152, in _compile_core
cres = compiler.compile_extra(self.targetdescr.typing_context,
File "/home/vaibhav/.local/lib/python3.10/site-packages/numba/core/compiler.py", line 716, in compile_extra
return pipeline.compile_extra(func)
File "/home/vaibhav/.local/lib/python3.10/site-packages/numba/core/compiler.py", line 452, in compile_extra
return self._compile_bytecode()
File "/home/vaibhav/.local/lib/python3.10/site-packages/numba/core/compiler.py", line 520, in _compile_bytecode
return self._compile_core()
File "/home/vaibhav/.local/lib/python3.10/site-packages/numba/core/compiler.py", line 499, in _compile_core
raise e
File "/home/vaibhav/.local/lib/python3.10/site-packages/numba/core/compiler.py", line 486, in _compile_core
pm.run(self.state)
File "/home/vaibhav/.local/lib/python3.10/site-packages/numba/core/compiler_machinery.py", line 368, in run
raise patched_exception
File "/home/vaibhav/.local/lib/python3.10/site-packages/numba/core/compiler_machinery.py", line 356, in run
self._runPass(idx, pass_inst, state)
File "/home/vaibhav/.local/lib/python3.10/site-packages/numba/core/compiler_lock.py", line 35, in _acquire_compile_lock
return func(*args, **kwargs)
File "/home/vaibhav/.local/lib/python3.10/site-packages/numba/core/compiler_machinery.py", line 311, in _runPass
mutated |= check(pss.run_pass, internal_state)
File "/home/vaibhav/.local/lib/python3.10/site-packages/numba/core/compiler_machinery.py", line 273, in check
mangled = func(compiler_state)
File "/home/vaibhav/.local/lib/python3.10/site-packages/numba/core/untyped_passes.py", line 86, in run_pass
func_ir = interp.interpret(bc)
File "/home/vaibhav/.local/lib/python3.10/site-packages/numba/core/interpreter.py", line 1321, in interpret
flow.run()
File "/home/vaibhav/.local/lib/python3.10/site-packages/numba/core/byteflow.py", line 107, in run
runner.dispatch(state)
File "/home/vaibhav/.local/lib/python3.10/site-packages/numba/core/byteflow.py", line 282, in dispatch
fn(state, inst)
File "/home/vaibhav/.local/lib/python3.10/site-packages/numba/core/byteflow.py", line 1061, in _binaryop
rhs = state.pop()
File "/home/vaibhav/.local/lib/python3.10/site-packages/numba/core/byteflow.py", line 1344, in pop
return self._stack.pop()
IndexError: Failed in nopython mode pipeline (step: analyzing bytecode)
pop from empty list

Python SDK for VMware gets stuck in OpenSSL SSL_read

I am using the [Python SDK][1] for VMware automation tasks, e.g. powering off, powering on and deleting VMs in a vCenter Server.
from com.vmware.vcenter.vm_client import Power
from samples.vsphere.vcenter.helper.vm_helper import get_vm
import requests
import urllib3
from vmware.vapi.vsphere.client import create_vsphere_client
import argparse
Inputs = fetch_inputs()
vm_name = Inputs.VM_Name
action = Inputs.action
session = requests.session()
# Disable cert verification for demo purpose.
# This is not recommended in a production environment.
session.verify = False
# Disable the secure connection warning for demo purpose.
# This is not recommended in a production environment.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Connect to a vCenter Server using username and password
vsphere_client = create_vsphere_client(server=Inputs.server, username=Inputs.username, password=Inputs.password, session=session)
vm = get_vm(vsphere_client, vm_name)  # look up the VM identifier by name
vsphere_client.vcenter.vm.Power.stop(vm)  # <<--- it gets stuck here
The operation itself succeeds, but afterwards the call gets stuck and never returns. If I press CTRL+C, I can see that it seems to be stuck in SSL_read in the OpenSSL library:
^CTraceback (most recent call last):
File "/home/qa1/python_sdk_scripts/manage_vm.py", line 97, in <module>
power_on(vsphere_client, vm_name)
File "/home/qa1/python_sdk_scripts/manage_vm.py", line 40, in power_on
return_value = vsphere_client.vcenter.vm.Power.start(vm)
File "/home/qa1/.local/lib/python3.8/site-packages/com/vmware/vcenter/vm_client.py", line 2141, in start
return self._invoke('start',
File "/home/qa1/.local/lib/python3.8/site-packages/vmware/vapi/bindings/stub.py", line 345, in _invoke
return self._api_interface.native_invoke(ctx, _method_name, kwargs)
File "/home/qa1/.local/lib/python3.8/site-packages/vmware/vapi/bindings/stub.py", line 266, in native_invoke
method_result = self.invoke(ctx, method_id, data_val)
File "/home/qa1/.local/lib/python3.8/site-packages/vmware/vapi/bindings/stub.py", line 199, in invoke
return self._api_provider.invoke(self._iface_id.get_name(),
File "/home/qa1/.local/lib/python3.8/site-packages/vmware/vapi/security/client/security_context_filter.py", line 101, in invoke
method_result = ApiProviderFilter.invoke(
File "/home/qa1/.local/lib/python3.8/site-packages/vmware/vapi/provider/filter.py", line 75, in invoke
method_result = self.next_provider.invoke(
File "/home/qa1/.local/lib/python3.8/site-packages/vmware/vapi/protocol/client/msg/json_connector.py", line 79, in invoke
response = self._do_request(VAPI_INVOKE, ctx, params)
File "/home/qa1/.local/lib/python3.8/site-packages/vmware/vapi/protocol/client/msg/json_connector.py", line 120, in _do_request
http_response = self.http_provider.do_request(
File "/home/qa1/.local/lib/python3.8/site-packages/vmware/vapi/protocol/client/rpc/requests_provider.py", line 95, in do_request
output = self._session.request(
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 533, in request
resp = self.send(prep, **send_kwargs)
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 646, in send
r = adapter.send(request, **kwargs)
File "/usr/lib/python3/dist-packages/requests/adapters.py", line 439, in send
resp = conn.urlopen(
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 665, in urlopen
httplib_response = self._make_request(
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 421, in _make_request
six.raise_from(e, None)
File "<string>", line 3, in raise_from
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 416, in _make_request
httplib_response = conn.getresponse()
File "/usr/lib/python3.8/http/client.py", line 1348, in getresponse
response.begin()
File "/usr/lib/python3.8/http/client.py", line 316, in begin
version, status, reason = self._read_status()
File "/usr/lib/python3.8/http/client.py", line 277, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "/usr/lib/python3.8/socket.py", line 669, in readinto
return self._sock.recv_into(b)
File "/usr/lib/python3/dist-packages/urllib3/contrib/pyopenssl.py", line 313, in recv_into
return self.connection.recv_into(*args, **kwargs)
File "/home/qa1/.local/lib/python3.8/site-packages/OpenSSL/SSL.py", line 1733, in recv_into
result = _lib.SSL_read(self._ssl, buf, nbytes)
KeyboardInterrupt
How can I debug this, where is the issue?
[1]: https://github.com/vmware/vsphere-automation-sdk-python
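The traceback shows the client blocking on a read of an HTTPS response that apparently never arrives. One way to confirm that (a debugging sketch of my own, not part of the SDK) is to set a global socket timeout before the session is created, so the hung SSL_read raises a timeout instead of blocking forever:
import socket
# any blocking socket read (including the SSL_read above) now fails
# after 60 seconds instead of hanging indefinitely
socket.setdefaulttimeout(60)
session = requests.session()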

Google Cloud Functions - ImportError: cannot import name 'InvalidKeyError' from 'jwt.exceptions'

I am getting the error below in GCP while executing a Cloud Function (a Cloud Pub/Sub trigger on Python 3.8); the packages included in requirements.txt are listed below as well.
I have tried with only jwt installed, with only PyJWT, and with both; I got the same error each time.
Requirements.txt:
atlassian-python-api==3.13.0
google==3.0.0
google-api-python-client==2.18.0
google-auth==2.0.1
google-auth-httplib2==0.1.0
google-auth-oauthlib==0.4.5
oauth2client==4.1.3
oauthlib==3.1.1
sendgrid==6.8.1
gspread==4.0.1
pandas==1.3.2
jwt==1.2.0
PyJWT==2.1.0
cryptography==3.4.8
rsa==4.7.2
Traceback:
Traceback (most recent call last):
File "/layers/google.python.pip/pip/lib/python3.8/site-packages/flask/app.py", line 2447, in wsgi_app
response = self.full_dispatch_request()
File "/layers/google.python.pip/pip/lib/python3.8/site-packages/flask/app.py", line 1952, in full_dispatch_request
rv = self.handle_user_exception(e)
File "/layers/google.python.pip/pip/lib/python3.8/site-packages/flask/app.py", line 1821, in handle_user_exception
reraise(exc_type, exc_value, tb)
File "/layers/google.python.pip/pip/lib/python3.8/site-packages/flask/_compat.py", line 39, in reraise
raise value
File "/layers/google.python.pip/pip/lib/python3.8/site-packages/flask/app.py", line 1950, in full_dispatch_request
rv = self.dispatch_request()
File "/layers/google.python.pip/pip/lib/python3.8/site-packages/flask/app.py", line 1936, in dispatch_request
return self.view_functions[rule.endpoint](**req.view_args)
File "/layers/google.python.pip/pip/lib/python3.8/site-packages/functions_framework/__init__.py", line 152, in view_func
function(data, context)
File "/workspace/main.py", line 72, in get_data_from_jira
data = jira_instance.jql("project = PROJECTNAME AND issuetype = 'ISSUETYPE' AND status = 'In Progress'")
File "/layers/google.python.pip/pip/lib/python3.8/site-packages/atlassian/jira.py", line 2418, in jql
return self.get(url, params=params)
File "/layers/google.python.pip/pip/lib/python3.8/site-packages/atlassian/rest_client.py", line 264, in get
response = self.request(
File "/layers/google.python.pip/pip/lib/python3.8/site-packages/atlassian/rest_client.py", line 218, in request
response = self._session.request(
File "/layers/google.python.pip/pip/lib/python3.8/site-packages/requests/sessions.py", line 528, in request
prep = self.prepare_request(req)
File "/layers/google.python.pip/pip/lib/python3.8/site-packages/requests/sessions.py", line 456, in prepare_request
p.prepare(
File "/layers/google.python.pip/pip/lib/python3.8/site-packages/requests/models.py", line 320, in prepare
self.prepare_auth(auth, url)
File "/layers/google.python.pip/pip/lib/python3.8/site-packages/requests/models.py", line 556, in prepare_auth
r = auth(self)
File "/layers/google.python.pip/pip/lib/python3.8/site-packages/requests_oauthlib/oauth1_auth.py", line 108, in __call__
r.url, headers, _ = self.client.sign(
File "/layers/google.python.pip/pip/lib/python3.8/site-packages/oauthlib/oauth1/rfc5849/__init__.py", line 351, in sign
('oauth_signature', self.get_oauth_signature(request)))
File "/layers/google.python.pip/pip/lib/python3.8/site-packages/oauthlib/oauth1/rfc5849/__init__.py", line 186, in get_oauth_signature
sig = self.SIGNATURE_METHODS[self.signature_method](base_string, self)
File "/layers/google.python.pip/pip/lib/python3.8/site-packages/oauthlib/oauth1/rfc5849/signature.py", line 742, in sign_rsa_sha1_with_client
return _sign_rsa('SHA-1', sig_base_str, client.rsa_key)
File "/layers/google.python.pip/pip/lib/python3.8/site-packages/oauthlib/oauth1/rfc5849/signature.py", line 617, in _sign_rsa
alg = _get_jwt_rsa_algorithm(hash_algorithm_name)
File "/layers/google.python.pip/pip/lib/python3.8/site-packages/oauthlib/oauth1/rfc5849/signature.py", line 562, in _get_jwt_rsa_algorithm
import jwt.algorithms as jwt_algorithms
File "/layers/google.python.pip/pip/lib/python3.8/site-packages/jwt/algorithms.py", line 5, in <module>
from .exceptions import InvalidKeyError
ImportError: cannot import name 'InvalidKeyError' from 'jwt.exceptions' (/layers/google.python.pip/pip/lib/python3.8/site-packages/jwt/exceptions.py)
I think it's because jwt and PyJWT both install a module with the same name: https://github.com/jazzband/djangorestframework-simplejwt/issues/42
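Both distributions do provide a top-level package named jwt, so whichever was installed last shadows the other, and oauthlib needs PyJWT's jwt.algorithms. A quick check (a sketch, assuming you can run code in the affected environment) shows which one actually got imported; dropping jwt==1.2.0 from requirements.txt while keeping PyJWT==2.1.0 should make the import succeed:
import jwt
# with PyJWT this points into site-packages/jwt/ and the import below works;
# with the competing 'jwt' distribution it raises the ImportError above
print(jwt.__file__)
from jwt.exceptions import InvalidKeyError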

Dask - trying to read HDFS data getting error ArrowIOError: HDFS file does not exist

I tried creating a dataframe from a CSV stored in HDFS. Connecting is successful, but when I try to get the output of the len function I get an error.
Code:
from dask_yarn import YarnCluster
from dask.distributed import Client, LocalCluster
import dask.dataframe as dd
import subprocess
import os
# GET HDFS CLASSPATH
classpath = subprocess.Popen(["/usr/hdp/current/hadoop-client/bin/hadoop", "classpath", "--glob"], stdout=subprocess.PIPE).communicate()[0]
os.environ["HADOOP_HOME"] = "/usr/hdp/current/hadoop-client"
os.environ["ARROW_LIBHDFS_DIR"] = "/usr/hdp/3.1.4.0-315/usr/lib/"
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java/"
os.environ["CLASSPATH"] = classpath.decode("utf-8")
cluster = YarnCluster(environment='python:///opt/anaconda3/bin/python3', worker_vcores=32, worker_memory="128GiB", n_workers=10)
client = Client(cluster)
client
df = dd.read_csv('hdfs://masterha/data/batch/82.csv')
len(df)
Error:
>>> len(ddf)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/opt/anaconda3/lib/python3.7/site-packages/dask/dataframe/core.py", line 504, in __len__
len, np.sum, token="len", meta=int, split_every=False
File "/opt/anaconda3/lib/python3.7/site-packages/dask/base.py", line 165, in compute
(result,) = compute(self, traverse=False, **kwargs)
File "/opt/anaconda3/lib/python3.7/site-packages/dask/base.py", line 436, in compute
results = schedule(dsk, keys, **kwargs)
File "/opt/anaconda3/lib/python3.7/site-packages/distributed/client.py", line 2539, in get
results = self.gather(packed, asynchronous=asynchronous, direct=direct)
File "/opt/anaconda3/lib/python3.7/site-packages/distributed/client.py", line 1839, in gather
asynchronous=asynchronous,
File "/opt/anaconda3/lib/python3.7/site-packages/distributed/client.py", line 756, in sync
self.loop, func, *args, callback_timeout=callback_timeout, **kwargs
File "/opt/anaconda3/lib/python3.7/site-packages/distributed/utils.py", line 333, in sync
raise exc.with_traceback(tb)
File "/opt/anaconda3/lib/python3.7/site-packages/distributed/utils.py", line 317, in f
result[0] = yield future
File "/opt/anaconda3/lib/python3.7/site-packages/tornado/gen.py", line 735, in run
value = future.result()
File "/opt/anaconda3/lib/python3.7/site-packages/distributed/client.py", line 1695, in _gather
raise exception.with_traceback(traceback)
File "/opt/anaconda3/lib/python3.7/site-packages/dask/bytes/core.py", line 181, in read_block_from_file
with copy.copy(lazy_file) as f:
File "/opt/anaconda3/lib/python3.7/site-packages/fsspec/core.py", line 88, in __enter__
f = self.fs.open(self.path, mode=mode)
File "/opt/anaconda3/lib/python3.7/site-packages/fsspec/implementations/hdfs.py", line 116, in <lambda>
return lambda *args, **kw: getattr(PyArrowHDFS, item)(self, *args, **kw)
File "/opt/anaconda3/lib/python3.7/site-packages/fsspec/spec.py", line 708, in open
path, mode=mode, block_size=block_size, autocommit=ac, **kwargs
File "/opt/anaconda3/lib/python3.7/site-packages/fsspec/implementations/hdfs.py", line 116, in <lambda>
return lambda *args, **kw: getattr(PyArrowHDFS, item)(self, *args, **kw)
File "/opt/anaconda3/lib/python3.7/site-packages/fsspec/implementations/hdfs.py", line 72, in _open
return HDFSFile(self, path, mode, block_size, **kwargs)
File "/opt/anaconda3/lib/python3.7/site-packages/fsspec/implementations/hdfs.py", line 171, in __init__
self.fh = fs.pahdfs.open(path, mode, block_size, **kwargs)
File "pyarrow/io-hdfs.pxi", line 431, in pyarrow.lib.HadoopFileSystem.open
File "pyarrow/error.pxi", line 83, in pyarrow.lib.check_status
pyarrow.lib.ArrowIOError: HDFS file does not exist: /data/batch/82.csv
It looks like your file "/data/batch/82.csv" doesn't exist. You might want to verify that you have the right path.
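To rule out a path or permissions problem, you can query the same filesystem object Dask uses under the hood. A minimal sketch (assuming the same HADOOP_HOME/CLASSPATH environment variables as above, and that 'masterha' resolves for pyarrow):
import fsspec
# the same pyarrow-backed HDFS filesystem that dd.read_csv uses
fs = fsspec.filesystem('hdfs', host='masterha')
print(fs.exists('/data/batch/82.csv'))  # False would explain the ArrowIOError
print(fs.ls('/data/batch'))             # list what is actually there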

Using filepaths as global variables in Python

I have a file global_vars.py that contains file paths saved as variables:
from pandas import Timestamp
final_vol_path = 'datasets/final_vols.csv'
final_price_path = 'datasets/final_prices.csv'
final_start_date = Timestamp('2017-01-01')
with other variables written in a similar fashion. However, the functions that I'm using to read in the data throw a FileNotFoundError when attempting to do the following in file1.py:
import scripts.global_vars as gv
read_data(gv.final_vol_path, gv.final_price_path) # throws FileNotFoundError
read_data('datasets/final_vols.csv', 'datasets/final_prices.csv') # this passes
Additionally, I've checked the file paths, and have gotten the following:
gv.final_vol_path == 'datasets/final_vols.csv' # returns True
gv.final_price_path == 'datasets/final_prices.csv' # returns True
Moreover, the pandas Timestamp object is processed without any problems.
Is there any explanation for why the FileNotFoundError is being thrown when attempting to access the file path as a variable from global_vars.py, but is not thrown when the actual string is passed in?
EDIT: The overall directory structure is as follows:
working_dir
L file1.py
L scripts
L global_vars.py
L datasets
L final_vols.csv
L final_prices.csv
EDIT 2: I added in a try-catch block to ensure the rest of the function doesn't break, not sure if that has affected the traceback, but here's what I get:
Traceback (most recent call last):
File "c:\users\ananth\anaconda3\envs\analytics-cpu\lib\runpy.py", line
184, in _run_module_as_main
"__main__", mod_spec)
File "c:\users\ananth\anaconda3\envs\analytics-cpu\lib\runpy.py", line 85, in _run_code
exec(code, run_globals)
File "C:\Users\Ananth\Anaconda3\envs\analytics-cpu\Scripts\nose2.exe\__main__.py", line 9, in <module>
File "c:\users\ananth\anaconda3\envs\analytics-cpu\lib\site-packages\nose2\main.py", line 306, in discover
return main(*args, **kwargs)
File "c:\users\ananth\anaconda3\envs\analytics-cpu\lib\site-packages\nose2\main.py", line 100, in __init__
super(PluggableTestProgram, self).__init__(**kw)
File "c:\users\ananth\anaconda3\envs\analytics-cpu\lib\unittest\main.py", line 93, in __init__
self.parseArgs(argv)
File "c:\users\ananth\anaconda3\envs\analytics-cpu\lib\site-packages\nose2\main.py", line 133, in parseArgs
self.createTests()
File "c:\users\ananth\anaconda3\envs\analytics-cpu\lib\site-packages\nose2\main.py", line 258, in createTests
self.testNames, self.module)
File "c:\users\ananth\anaconda3\envs\analytics-cpu\lib\site-packages\nose2\loader.py", line 69, in loadTestsFromNames
for name in event.names]
File "c:\users\ananth\anaconda3\envs\analytics-cpu\lib\site-packages\nose2\loader.py", line 69, in <listcomp>
for name in event.names]
File "c:\users\ananth\anaconda3\envs\analytics-cpu\lib\site-packages\nose2\loader.py", line 84, in loadTestsFromName
result = self.session.hooks.loadTestsFromName(event)
File "c:\users\ananth\anaconda3\envs\analytics-cpu\lib\site-packages\nose2\events.py", line 224, in __call__
result = getattr(plugin, self.method)(event)
File "c:\users\ananth\anaconda3\envs\analytics-cpu\lib\site-packages\nose2\plugins\loader\testcases.py", line 56, in loadTestsFromName
result = util.test_from_name(name, module)
File "c:\users\ananth\anaconda3\envs\analytics-cpu\lib\site-packages\nose2\util.py", line 106, in test_from_name
parent, obj = object_from_name(name, module)
File "c:\users\ananth\anaconda3\envs\analytics-cpu\lib\site-packages\nose2\util.py", line 117, in object_from_name
module = __import__('.'.join(parts_copy))
File "C:\Users\Ananth\Desktop\Modules\PortfolioVARModule\tests\test_simulation.py", line 24, in <module>
gv.test_start_date)
File "C:\Users\Ananth\Desktop\Modules\PortfolioVARModule\scripts\prep_data.py", line 119, in read_data
priceDF = pd.read_csv(pricepath).dropna()
File "c:\users\ananth\anaconda3\envs\analytics-cpu\lib\site-packages\pandas\io\parsers.py", line 646, in parser_f
return _read(filepath_or_buffer, kwds)
File "c:\users\ananth\anaconda3\envs\analytics-cpu\lib\site-packages\pandas\io\parsers.py", line 389, in _read
parser = TextFileReader(filepath_or_buffer, **kwds)
File "c:\users\ananth\anaconda3\envs\analytics-cpu\lib\site-packages\pandas\io\parsers.py", line 730, in __init__
self._make_engine(self.engine)
File "c:\users\ananth\anaconda3\envs\analytics-cpu\lib\site-packages\pandas\io\parsers.py", line 923, in _make_engine
self._engine = CParserWrapper(self.f, **self.options)
File "c:\users\ananth\anaconda3\envs\analytics-cpu\lib\site-packages\pandas\io\parsers.py", line 1390, in __init__
self._reader = _parser.TextReader(src, **kwds)
File "pandas\parser.pyx", line 373, in pandas.parser.TextReader.__cinit__ (pandas\parser.c:4184)
File "pandas\parser.pyx", line 667, in pandas.parser.TextReader._setup_parser_source (pandas\parser.c:8449)
FileNotFoundError: File b'datasets/corn_price.csv' does not exist
The problem is the letter b added in front of your file's path: the path is being passed as a bytes object because it was encoded to UTF-8. Try decoding the paths back to str:
read_data(str(gv.final_vol_path, 'utf-8'), str(gv.final_price_path, 'utf-8'))
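Another thing worth checking (my own suggestion, separate from the answer above): relative paths like 'datasets/final_vols.csv' are resolved against the current working directory of the process, not against the location of global_vars.py, and your traceback shows the code being run through nose2, which may start from a different directory. Printing both sides makes a mismatch obvious:
import os
import scripts.global_vars as gv
print(os.getcwd())                         # directory the process was started from
print(os.path.abspath(gv.final_vol_path))  # where the relative path actually points
print(os.path.exists(gv.final_vol_path))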
