Error importing PyDeequ package on databricks - apache-spark

I want to run some data-quality tests, and for that I intend to use PyDeequ in a Databricks notebook. Keep in mind that I'm very new to Databricks and Spark.
First, I created a cluster with the Runtime version "10.4 LTS (includes Apache Spark 3.2.1, Scala 2.12)" and set the environment variable SPARK_VERSION=3.2, as described in the project's GitHub repository.
Since the available PyPI package is not up to date, I tried installing the package as a notebook-scoped library with the following commands:
%pip install numpy==1.22
%pip install git+https://github.com/awslabs/python-deequ.git
(The first line is only to prevent a conflict on the numpy versions.)
Then, when doing
import pydeequ
I get
IndexError Traceback (most recent call last)
<command-3386600260354339> in <module>
----> 1 import pydeequ
/databricks/python_shell/dbruntime/PythonPackageImportsInstrumentation/__init__.py in import_patch(name, globals, locals, fromlist, level)
165 # Import the desired module. If you’re seeing this while debugging a failed import,
166 # look at preceding stack frames for relevant error information.
--> 167 original_result = python_builtin_import(name, globals, locals, fromlist, level)
168
169 is_root_import = thread_local._nest_level == 1
/local_disk0/.ephemeral_nfs/envs/pythonEnv-5ccb9322-9b7e-4caf-b370-843c10304472/lib/python3.8/site-packages/pydeequ/__init__.py in <module>
19 from pydeequ.analyzers import AnalysisRunner
20 from pydeequ.checks import Check, CheckLevel
---> 21 from pydeequ.configs import DEEQU_MAVEN_COORD
22 from pydeequ.profiles import ColumnProfilerRunner
23
/databricks/python_shell/dbruntime/PythonPackageImportsInstrumentation/__init__.py in import_patch(name, globals, locals, fromlist, level)
165 # Import the desired module. If you’re seeing this while debugging a failed import,
166 # look at preceding stack frames for relevant error information.
--> 167 original_result = python_builtin_import(name, globals, locals, fromlist, level)
168
169 is_root_import = thread_local._nest_level == 1
/local_disk0/.ephemeral_nfs/envs/pythonEnv-5ccb9322-9b7e-4caf-b370-843c10304472/lib/python3.8/site-packages/pydeequ/configs.py in <module>
35
36
---> 37 DEEQU_MAVEN_COORD = _get_deequ_maven_config()
38 IS_DEEQU_V1 = re.search("com\.amazon\.deequ\:deequ\:1.*", DEEQU_MAVEN_COORD) is not None
/local_disk0/.ephemeral_nfs/envs/pythonEnv-5ccb9322-9b7e-4caf-b370-843c10304472/lib/python3.8/site-packages/pydeequ/configs.py in _get_deequ_maven_config()
26
27 def _get_deequ_maven_config():
---> 28 spark_version = _get_spark_version()
29 try:
30 return SPARK_TO_DEEQU_COORD_MAPPING[spark_version[:3]]
/local_disk0/.ephemeral_nfs/envs/pythonEnv-5ccb9322-9b7e-4caf-b370-843c10304472/lib/python3.8/site-packages/pydeequ/configs.py in _get_spark_version()
21 ]
22 output = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
---> 23 spark_version = output.stdout.decode().split("\n")[-2]
24 return spark_version
25
IndexError: list index out of range
Can you please help me find the reason for this, or an alternative way to get the library without using PyPI?
Thanks in advance!

I assumed I wouldn't need to add the Deequ library. Apparently, all I had to do was add it via Maven coordinates and it solved the problem.

Related

Torchtext import error on macbook air m1. Used pip3 to install all of it

While the installation went smoothly, I keep facing this error when I try to import torchtext on my MacBook Air M1:
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
Input In [10], in <cell line: 1>()
----> 1 import torchtext
File ~/Library/Python/3.8/lib/python/site-packages/torchtext/__init__.py:6, in <module>
3 from torch.hub import _get_torch_home
5 # the following import has to happen first in order to load the torchtext C++ library
----> 6 from torchtext import _extension # noqa: F401
8 _TEXT_BUCKET = "https://download.pytorch.org/models/text/"
10 _CACHE_DIR = os.path.expanduser(os.path.join(_get_torch_home(), "text"))
File ~/Library/Python/3.8/lib/python/site-packages/torchtext/_extension.py:64, in <module>
59 # This import is for initializing the methods registered via PyBind11
60 # This has to happen after the base library is loaded
61 from torchtext import _torchtext # noqa
---> 64 _init_extension()
File ~/Library/Python/3.8/lib/python/site-packages/torchtext/_extension.py:58, in _init_extension()
55 if not _mod_utils.is_module_available("torchtext._torchtext"):
56 raise ImportError("torchtext C++ Extension is not found.")
---> 58 _load_lib("libtorchtext")
59 # This import is for initializing the methods registered via PyBind11
60 # This has to happen after the base library is loaded
61 from torchtext import _torchtext
File ~/Library/Python/3.8/lib/python/site-packages/torchtext/_extension.py:50, in _load_lib(lib)
48 if not path.exists():
49 return False
---> 50 torch.ops.load_library(path)
51 return True
File ~/Library/Python/3.8/lib/python/site-packages/torch/_ops.py:220, in _Ops.load_library(self, path)
215 path = torch._utils_internal.resolve_library_path(path)
216 with dl_open_guard():
217 # Import the shared library into the process, thus running its
218 # static (global) initialization code in order to register custom
219 # operators with the JIT.
--> 220 ctypes.CDLL(path)
221 self.loaded_libraries.add(path)
File /Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/ctypes/__init__.py:365, in CDLL.__init__(self, name, mode, handle, use_errno, use_last_error, winmode)
362 self._FuncPtr = _FuncPtr
364 if handle is None:
--> 365 self._handle = _dlopen(self._name, mode)
366 else:
367 self._handle = handle
OSError: dlopen(/Users/pratik/Library/Python/3.8/lib/python/site-packages/torchtext/lib/libtorchtext.so, 0x0006): Symbol not found: __ZN3c1012OptionalType3getENS_4Type24SingletonOrSharedTypePtrIS1_EE
Referenced from: /Users/pratik/Library/Python/3.8/lib/python/site-packages/torchtext/lib/libtorchtext.so
Expected in: /Users/pratik/Library/Python/3.8/lib/python/site-packages/torch/lib/libtorch_cpu.dylib
I don't quite understand what I can do about this. I uninstalled and reinstalled it all to avoid an error I was having previously. But now, this comes up.

I wanted to import pandas and numpy on jupyter notebook and received [WinError 193] %1 is not a valid Win32 application?

Warning: I'm a beginner and will have trouble understanding a complicated answer. However, I have used Jupyter Notebook months ago and did not have any problems with using these same packages when I first started learning. Is it possible that when I installed PyCharm afterwards that it messed up my Jupyter Notebook set up?
Code I wrote:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
Error I received:
OSError Traceback (most recent call last)
<ipython-input-1-1de8dee942b6> in <module>
----> 1 import numpy as np
2 import pandas as pd
3
4 from pandas import Series, DataFrame
~\AppData\Roaming\Python\Python38\site-packages\numpy\__init__.py in <module>
136
137 # Allow distributors to run custom init code
--> 138 from . import _distributor_init
139
140 from . import core
~\AppData\Roaming\Python\Python38\site-packages\numpy\_distributor_init.py in <module>
24 # NOTE: would it change behavior to load ALL
25 # DLLs at this path vs. the name restriction?
---> 26 WinDLL(os.path.abspath(filename))
27 DLL_filenames.append(filename)
28 if len(DLL_filenames) > 1:
~\Documents\Anaconda\lib\ctypes\__init__.py in __init__(self, name, mode, handle, use_errno,
use_last_error, winmode)
371
372 if handle is None:
--> 373 self._handle = _dlopen(self._name, mode)
374 else:
375 self._handle = handle
OSError: [WinError 193] %1 is not a valid Win32 application

How to resolve this error when trying to use Pandas Styling?

I am running Python 3.6.5 and Pandas 0.25.2.
On attempting to style a pandas dataframe I am getting a specific error which can be generated by simplifying to this code:
import pandas as pd
import pandas.io.formats.style
The summary of the error generated is:
ImportError: The 'packaging._typing' package is required; normally this is bundled with this package so if you get this warning, consult the packager of your distribution.
The full error message is:
---------------------------------------------------------------------------
ImportError Traceback (most recent call last)
<ipython-input-1-e9b944578fec> in <module>()
1 import pandas as pd
----> 2 import pandas.io.formats.style
~\Anaconda3\lib\site-packages\pandas\io\formats\style.py in <module>()
48
49
---> 50 class Styler:
51 """
52 Helps style a DataFrame or Series according to the data with HTML and CSS.
~\Anaconda3\lib\site-packages\pandas\io\formats\style.py in Styler()
109 """
110
--> 111 loader = jinja2.PackageLoader("pandas", "io/formats/templates")
112 env = jinja2.Environment(loader=loader, trim_blocks=True)
113 template = env.get_template("html.tpl")
~\Anaconda3\lib\site-packages\jinja2\loaders.py in __init__(self, package_name, package_path, encoding)
220 def __init__(self, package_name, package_path='templates',
221 encoding='utf-8'):
--> 222 from pkg_resources import DefaultProvider, ResourceManager, \
223 get_provider
224 provider = get_provider(package_name)
~\Anaconda3\lib\site-packages\pkg_resources\__init__.py in <module>()
79 from pkg_resources.extern import appdirs
80 from pkg_resources.extern import packaging
---> 81 __import__('pkg_resources.extern.packaging.version')
82 __import__('pkg_resources.extern.packaging.specifiers')
83 __import__('pkg_resources.extern.packaging.requirements')
~\Anaconda3\lib\site-packages\pkg_resources\_vendor\packaging\version.py in <module>()
9
10 from ._structures import Infinity, NegativeInfinity
---> 11 from ._typing import TYPE_CHECKING
12
13 if TYPE_CHECKING: # pragma: no cover
~\Anaconda3\lib\site-packages\pkg_resources\extern\__init__.py in load_module(self, fullname)
52 "normally this is bundled with this package so if you get "
53 "this warning, consult the packager of your "
---> 54 "distribution.".format(**locals())
55 )
56
I have tried reinstalling and upgrading the pandas installation, but each time I get the same error. This is being done in an Anaconda environment.
Has anyone seen this error before? Is there a more detailed explanation that anyone can provide to help solve this issue, so that I can get the pandas styling working?
Thanks!
The correct way to do this is by:
from pandas.io.formats import style
This is because style is a module of pandas.io.formats package and the correct syntax is:
from package import module

Problem with loading spacy.load('en_core_web_md')

I have installed in anaconda the packages shown below:
spacy 2.2.2
spacy-model-en_core_web_md 2.2.5
spacy-model-en_core_web_sm 2.2.5
python 3.6.2
The above packages were installed in conda with the commands shown below:
conda install -c conda-forge spacy=2.2.2
conda install -c conda-forge spacy-model-en_core_web_sm
conda install -c conda-forge spacy-model-en_core_web_md
When I load en_core_web_md or en_core_web_sm, I get the error message shown below:
import spacy
import en_core_web_sm
nlp = spacy.load('en_core_web_sm')
Error message:
-> ---------------------------------------------------------------------------
OSError Traceback (most recent call last)
<ipython-input-39-d6345e302427> in <module>
1 import spacy
2 import en_core_web_sm
----> 3 nlp = spacy.load('en_core_web_sm')
4
5 #import en_core_web_sm
~\anaconda3\envs\ADS99\lib\site-packages\spacy\__init__.py in load(name, **overrides)
17 from . import util
18 from .util import register_architecture, get_architecture
---> 19 from .language import component
20
21
~\anaconda3\envs\ADS99\lib\site-packages\spacy\util.py in load_model(name, **overrides)
117
118 path (unicode or Path): Path to new data directory.
--> 119 """
120 global _data_path
121 _data_path = ensure_path(path)
OSError: Can't find model 'en_core_web_sm'
I tried a different way of loading en_core_web_sm, but I got a different error:
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()
Error message:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-40-9427d7caa44a> in <module>
1 import spacy
2 import en_core_web_sm
----> 3 nlp = en_core_web_sm.load()
4
5
~\anaconda3\envs\ADS99\lib\site-packages\en_core_web_sm\__init__.py in load(**overrides)
10
11 def load(**overrides):
---> 12 return load_model_from_init_py(__file__, **overrides)
~\anaconda3\envs\ADS99\lib\site-packages\spacy\util.py in load_model_from_init_py(init_file, **overrides)
174 return Path(path)
175 else:
--> 176 return path
177
178
~\anaconda3\envs\ADS99\lib\site-packages\spacy\util.py in load_model_from_path(model_path, meta, **overrides)
143
144
--> 145 def make_layer(arch_config):
146 arch_func = get_architecture(arch_config["arch"])
147 return arch_func(arch_config["config"])
~\anaconda3\envs\ADS99\lib\site-packages\spacy\util.py in get_lang_class(lang)
47
48 factories = "spacy_factories"
---> 49 languages = "spacy_languages"
50 displacy_colors = "spacy_displacy_colors"
51 lookups = "spacy_lookups"
~\anaconda3\envs\ADS99\lib\importlib\__init__.py in import_module(name, package)
124 break
125 level += 1
--> 126 return _bootstrap._gcd_import(name[level:], package, level)
127
128
~\anaconda3\envs\ADS99\lib\importlib\_bootstrap.py in _gcd_import(name, package, level)
~\anaconda3\envs\ADS99\lib\importlib\_bootstrap.py in _find_and_load(name, import_)
~\anaconda3\envs\ADS99\lib\importlib\_bootstrap.py in _find_and_load_unlocked(name, import_)
~\anaconda3\envs\ADS99\lib\importlib\_bootstrap.py in _load_unlocked(spec)
~\anaconda3\envs\ADS99\lib\importlib\_bootstrap_external.py in exec_module(self, module)
~\anaconda3\envs\ADS99\lib\importlib\_bootstrap.py in _call_with_frames_removed(f, *args, **kwds)
~\anaconda3\envs\ADS99\lib\site-packages\spacy\lang\en\__init__.py in <module>
12 from ..tokenizer_exceptions import BASE_EXCEPTIONS
13 from ..norm_exceptions import BASE_NORMS
---> 14 from ...language import Language
15 from ...attrs import LANG, NORM
16 from ...util import update_exc, add_lookups
~\anaconda3\envs\ADS99\lib\site-packages\spacy\language.py in <module>
18 from .vocab import Vocab
19 from .lemmatizer import Lemmatizer
---> 20 from .lookups import Lookups
21 from .analysis import analyze_pipes, analyze_all_pipes, validate_attrs
22 from .compat import izip, basestring_, is_python2, class_types
~\anaconda3\envs\ADS99\lib\site-packages\spacy\lookups.py in <module>
4 import srsly
5 from collections import OrderedDict
----> 6 from preshed.bloom import BloomFilter
7
8 from .errors import Errors
bloom.pyx in init preshed.bloom()
AttributeError: type object 'preshed.bloom.BloomFilter' has no attribute '__reduce_cython__'
If someone can provide me with any hint on how this issue could be fixed, I would be really grateful.
You can try using the Anaconda Prompt and running the following:
conda install -c conda-forge spacy
python -m spacy download en
after that, you can load the model via its full package name.
import spacy
nlp = spacy.load('en_core_web_sm')
and you can try to test it, like this:
check = nlp("How's your spicy its really spicy, don't you think?")
and do this
for token in check:
print (token)
If it really works, the output will look like this:
How
's
your
spicy
its
really
spicy
,
do
n't
you
think
?
good luck.
Find the location of your 'en_core_web_sm' model directory and pass that path to spacy.load.
e.g
model = spacy.load('path/....')
Is the version of Python you're using to install the en_core_web_sm model the same as the version of Python you're using to consume the model? If not, it could be that the SpaCy model cannot be found by your application because it is installed under a different Python version.
You can easily confirm this by running the following tests:
From console:
python --version
From your code:
import sys
print(sys.version)
If you have successfully downloaded the model check where the model is installed. I have found that downloading en_core_web_md in a conda environment can lead it to be saved in the site-packages folder directly (using windows) NOT the data folder in the spacy package where the load_model looks by default.

Import errors when trying to use pandas_gbq

I have been trying to run the code on this page which describes how to authenticate before using the pandas_gbq.read_gbq function:
import pandas_gbq
import pydata_google_auth
SCOPES = [
'https://www.googleapis.com/auth/cloud-platform',
'https://www.googleapis.com/auth/drive',
]
credentials = pydata_google_auth.get_user_credentials(
SCOPES,
# Set auth_local_webserver to True to have a slightly more convenient
# authorization flow. Note, this doesn't work if you're running from a
# notebook on a remote server, such as over SSH or with Google Colab.
auth_local_webserver=True,)
df = pandas_gbq.read_gbq(
"SELECT my_col FROM `my_dataset.my_table`",
project_id='YOUR-PROJECT-ID',
credentials=credentials,)
I am getting the following error:
---------------------------------------------------------------------------
ImportError Traceback (most recent call last)
/usr/local/lib/python3.7/site-packages/pandas_gbq/gbq.py in _test_google_api_imports()
94
---> 95 try:
96 from google.cloud import bigquery # noqa
/usr/local/Cellar/python/3.7.2_2/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/google/cloud/bigquery/__init__.py in <module>
34
---> 35 from google.cloud.bigquery.client import Client
36 from google.cloud.bigquery.dataset import AccessEntry
/usr/local/Cellar/python/3.7.2_2/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/google/cloud/bigquery/client.py in <module>
52 from google.cloud.bigquery import _pandas_helpers
---> 53 from google.cloud.bigquery.dataset import Dataset
54 from google.cloud.bigquery.dataset import DatasetListItem
/usr/local/Cellar/python/3.7.2_2/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/google/cloud/bigquery/dataset.py in <module>
23 from google.cloud.bigquery import _helpers
---> 24 from google.cloud.bigquery.model import ModelReference
25 from google.cloud.bigquery.routine import RoutineReference
/usr/local/Cellar/python/3.7.2_2/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/google/cloud/bigquery/model.py in <module>
26 from google.cloud.bigquery import _helpers
---> 27 from google.cloud.bigquery_v2 import types
28
/usr/local/Cellar/python/3.7.2_2/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/google/cloud/bigquery_v2/__init__.py in <module>
22
---> 23 from google.cloud.bigquery_v2 import types
24 from google.cloud.bigquery_v2.gapic import enums
/usr/local/Cellar/python/3.7.2_2/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/google/cloud/bigquery_v2/types.py in <module>
22
---> 23 from google.cloud.bigquery_v2.proto import model_pb2
24 from google.cloud.bigquery_v2.proto import model_reference_pb2
/usr/local/Cellar/python/3.7.2_2/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/google/cloud/bigquery_v2/proto/model_pb2.py in <module>
27 from google.api import annotations_pb2 as google_dot_api_dot_annotations__pb2
---> 28 from google.api import client_pb2 as google_dot_api_dot_client__pb2
29
ImportError: cannot import name 'client_pb2' from 'google.api' (/usr/local/Cellar/python/3.7.2_2/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/google/api/__init__.py)
I have tried uninstalling and reinstalling using the following command line code without any luck:
pip install pandas-gbq -U
This same code seems to work without any issue on my PC with Anaconda installed. However, the code does not run on my mac. Both computers have python 3.7 installed. I installed python 3 on my mac via homebrew, and I do not have Anaconda installed on that computer.
I faced the same problem, but solved it by installing with conda:
conda install pandas-gbq --channel conda-forge

Resources