Google Colab (Free Version) - Working with PySpark with Hive - apache-spark

AnalysisException Traceback (most recent call last)
in
----> 1 spark.sql('''CREATE TABLE employees (
2 employee_id INT,
3 employee_first_name STRING,
4 employee_last_name STRING,
5 employee_salary FLOAT,
2 frames
/usr/local/lib/python3.8/dist-packages/pyspark/sql/utils.py in deco(*a, **kw)
194 # Hide where the exception came from that shows a non-Pythonic
195 # JVM exception message.
--> 196 raise converted from None
197 else:
198 raise
AnalysisException: Hive support is required to CREATE Hive TABLE (AS SELECT);
'CreateTable pyspark_semistructureddata.employees, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, ErrorIfExists
`

Related

PySpark: Java Heap Error (Jupyter Notebook)

I am running a simple Spark job in which I query a table to get 3 columns and 7M rows. I have tried various Spark configs, but every time I get a Java heap space error.
Can someone please help me with this? I am trying to create an ETL process that computes data from 5 tables, all of similar size, but I get the Java heap error even when I run the code with only 1 table. I tried reducing the data volume as well, but I still get the same error.
The tables have more than 60 columns and billions of rows, of which I only fetch a subset of the data for my process.
Please see the code below:
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from datetime import *
import getpass
spark =SparkSession.builder.getOrCreate()
spark.sparkContext._conf.getAll()
conf = spark.sparkContext._conf.setAll([('spark.executor.memory', '15g'),
('spark.app.name', 'John Doe'), ('spark.executor.cores', '8'), ('spark.cores.max',
'8'),('spark.driver.memory','15g')])
spark.sparkContext.stop()
spark.sparkContext.stop()
spark = SparkSession.builder.config(conf=conf).getOrCreate()
df=spark.sql("""
SELECT DISTINCT col1
,col2
,col3
from schema.table
where condition1
and condition2
and condition3
and condition4
""")
df.show()
Stacktrace:
ERROR:root:Exception while sending command.
Traceback (most recent call last):
File "/opt/spark/python/lib/py4j-0.10.9.2-src.zip/py4j/clientserver.py", line 480,
in send_command
raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/spark/python/lib/py4j-0.10.9.2-src.zip/py4j/java_gateway.py", line 1038,
in send_command
response = connection.send_command(command)
File "/opt/spark/python/lib/py4j-0.10.9.2-src.zip/py4j/clientserver.py", line 503,
in send_command
raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
---------------------------------------------------------------------------
Py4JError Traceback (most recent call last)
<ipython-input-4-8faeb4b518d0> in <module>
24
25
---> 26 df_upsell.show()
/opt/spark/python/pyspark/sql/dataframe.py in show(self, n, truncate, vertical)
492
493 if isinstance(truncate, bool) and truncate:
--> 494 print(self._jdf.showString(n, 20, vertical))
495 else:
496 try:
/opt/spark/python/lib/py4j-0.10.9.2-src.zip/py4j/java_gateway.py in __call__(self,
*args)
1307
1308 answer = self.gateway_client.send_command(command)
-> 1309 return_value = get_return_value(
1310 answer, self.gateway_client, self.target_id, self.name)
1311
/opt/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
109 def deco(*a, **kw):
110 try:
--> 111 return f(*a, **kw)
112 except py4j.protocol.Py4JJavaError as e:
113 converted = convert_exception(e.java_exception)
/opt/spark/python/lib/py4j-0.10.9.2-src.zip/py4j/protocol.py in
get_return_value(answer, gateway_client, target_id, name)
332 format(target_id, ".", name, value))
333 else:
--> 334 raise Py4JError(
335 "An error occurred while calling {0}{1}{2}".
336 format(target_id, ".", name))
Py4JError: An error occurred while calling o683.showString

Brightway2 writing the imported database

I created several Excel inventory files using ecoinvent exchanges.
To run my LCA, I successfully imported the database with 0 unlinked exchanges using:
imp = bw.ExcelImporter("Inventory_fuelcell.xlsx")
imp.apply_strategies()
imp.match_database("ecoinvent 3.6 cutoff", fields=('name','unit','location'))
imp.match_database("biosphere3", fields=('name','unit'))
imp.match_database(fields=('name', 'unit', 'location'))
imp.statistics()
But when I run imp.write_database()
I get the following error:
Writing activities to SQLite3 database:
0% [######## ] 100% | ETA: 00:00:00
---------------------------------------------------------------------------
InvalidExchange Traceback (most recent call last)
<ipython-input-41-1daab0bbe8d8> in <module>
----> 1 imp.write_database()
/opt/anaconda3/envs/Masterarbeit/lib/python3.7/site-packages/bw2io/importers/excel.py in write_database(self, **kwargs)
257 """Same as base ``write_database`` method, but ``activate_parameters`` is True by default."""
258 kwargs['activate_parameters'] = kwargs.get('activate_parameters', True)
--> 259 super(ExcelImporter, self).write_database(**kwargs)
260
261 def get_activity(self, sn, ws):
/opt/anaconda3/envs/Masterarbeit/lib/python3.7/site-packages/bw2io/importers/base_lci.py in write_database(self, data, delete_existing, backend, activate_parameters, **kwargs)
238
239 existing.update(data)
--> 240 db.write(existing)
241
242 if activate_parameters:
/opt/anaconda3/envs/Masterarbeit/lib/python3.7/site-packages/bw2data/project.py in writable_project(wrapped, instance, args, kwargs)
354 if projects.read_only:
355 raise ReadOnlyProject(READ_ONLY_PROJECT)
--> 356 return wrapped(*args, **kwargs)
/opt/anaconda3/envs/Masterarbeit/lib/python3.7/site-packages/bw2data/backends/peewee/database.py in write(self, data, process)
258 if data:
259 try:
--> 260 self._efficient_write_many_data(data)
261 except:
262 # Purge all data from database, then reraise
/opt/anaconda3/envs/Masterarbeit/lib/python3.7/site-packages/bw2data/backends/peewee/database.py in _efficient_write_many_data(self, data, indices)
203 for index, (key, ds) in enumerate(data.items()):
204 exchanges, activities = self._efficient_write_dataset(
--> 205 index, key, ds, exchanges, activities
206 )
207
/opt/anaconda3/envs/Masterarbeit/lib/python3.7/site-packages/bw2data/backends/peewee/database.py in _efficient_write_dataset(self, index, key, ds, exchanges, activities)
154 for exchange in ds.get('exchanges', []):
155 if 'input' not in exchange or 'amount' not in exchange:
--> 156 raise InvalidExchange
157 if 'type' not in exchange:
158 raise UntypedExchange
InvalidExchange:
I never had this problem before.
Is there a way to figure out where the invalid exchange is?
But even with the error, if I list the databases it still shows up.
So it seems like the database was in fact imported.
Can anybody help me figure out what could be wrong?
If we look through the error traceback, we can see the line raising the error:
154 for exchange in ds.get('exchanges', []):
155 if 'input' not in exchange or 'amount' not in exchange:
--> 156 raise InvalidExchange
This means that at least one exchange doesn't have an `input` or an `amount`. As all your exchanges are linked, they all have `input` values, so the `amount` must be missing. This could be due to a typo in the column field, an off-by-one error, etc.
To find it, you could try:
# Walk every imported dataset and report each exchange that lacks one of the
# two keys whose absence raises InvalidExchange ('amount' or 'input').
# Both keys are checked independently, so an exchange missing both is
# reported twice instead of having the second problem hidden.
for ds in imp.data:
    # .get() keeps the diagnostic itself from failing on incomplete datasets.
    for exc in ds.get('exchanges', []):
        for field in ('amount', 'input'):
            if field not in exc:
                print("Missing `{}` in exc:".format(field))
                print("\t", exc)
                print("Dataset", ds.get('name'), ds.get('location'))

Need help passing date to pandas query

How do I pass the output of this prompt to a pandas search by date in excel?
import pandas as pd
TestedDateBegin = pd.to_datetime(input('Input date in mm-dd-yyyy format: '))
For example, if I input 2019-09-08 into above input prompt and run TestedDateBegin I get this output:
Timestamp('2019-09-08 00:00:00')
This search with the date hard coded works fine.
data = df.loc[df['emr_first_access_date'] >= '2019-09-08', ['site_name','subs_num','emr_id', 'emr_first_access_date']]
But how do I pass the date inputted from the prompt so the user can search by any date?
This doesn't work:
data = df.loc[df['emr_first_access_date'] >= 'TestedDateBegin', ['site_name','subs_num','emr_id', 'emr_first_access_date']]
and throws an exception:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
pandas/_libs/tslibs/conversion.pyx in pandas._libs.tslibs.conversion.convert_str_to_tsobject()
pandas/_libs/tslibs/np_datetime.pyx in pandas._libs.tslibs.np_datetime._string_to_dts()
ValueError: Error parsing datetime string "TestedDateBegin" at position 0
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
pandas/_libs/tslibs/conversion.pyx in pandas._libs.tslibs.conversion.convert_str_to_tsobject()
pandas/_libs/tslibs/parsing.pyx in pandas._libs.tslibs.parsing.parse_datetime_string()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dateutil\parser\_parser.py in parse(timestr, parserinfo, **kwargs)
1357 else:
-> 1358 return DEFAULTPARSER.parse(timestr, **kwargs)
1359
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dateutil\parser\_parser.py in parse(self, timestr, default, ignoretz, tzinfos, **kwargs)
648 if res is None:
--> 649 raise ValueError("Unknown string format:", timestr)
650
ValueError: ('Unknown string format:', 'TestedDateBegin')
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\arrays\datetimes.py in wrapper(self, other)
144 try:
--> 145 other = _to_M8(other, tz=self.tz)
146 except ValueError:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\arrays\datetimes.py in _to_M8(key, tz)
77 # this also converts strings
---> 78 key = Timestamp(key)
79 if key.tzinfo is not None and tz is not None:
pandas/_libs/tslibs/timestamps.pyx in pandas._libs.tslibs.timestamps.Timestamp.__new__()
pandas/_libs/tslibs/conversion.pyx in pandas._libs.tslibs.conversion.convert_to_tsobject()
pandas/_libs/tslibs/conversion.pyx in pandas._libs.tslibs.conversion.convert_str_to_tsobject()
ValueError: could not convert string to Timestamp
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
<ipython-input-2-702fd23c14bb> in <module>
----> 1 data = df.loc[df['emr_first_access_date'] >= 'TestedDateBegin', ['site_name','subs_num','emr_id', 'emr_first_access_date']]
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\ops.py in wrapper(self, other, axis)
1714
1715 res_values = dispatch_to_index_op(op, self, other,
-> 1716 pd.DatetimeIndex)
1717
1718 return self._constructor(res_values, index=self.index,
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\ops.py in dispatch_to_index_op(op, left, right, index_class)
1189 left_idx = left_idx._shallow_copy(freq=None)
1190 try:
-> 1191 result = op(left_idx, right)
1192 except NullFrequencyError:
1193 # DatetimeIndex and TimedeltaIndex with freq == None raise ValueError
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\datetimelike.py in wrapper(self, other)
115 other = other._values
116
--> 117 result = op(self._data, maybe_unwrap_index(other))
118 return result
119
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\arrays\datetimes.py in wrapper(self, other)
146 except ValueError:
147 # string that cannot be parsed to Timestamp
--> 148 return ops.invalid_comparison(self, other, op)
149
150 result = op(self.asi8, other.view('i8'))
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\ops.py in invalid_comparison(left, right, op)
1056 else:
1057 raise TypeError("Invalid comparison between dtype={dtype} and {typ}"
-> 1058 .format(dtype=left.dtype, typ=type(right).__name__))
1059 return res_values
1060
TypeError: Invalid comparison between dtype=datetime64[ns] and str
The error
TypeError: Invalid comparison between dtype=datetime64[ns] and str
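tells you that you are comparing a datetime column with a string: 'TestedDateBegin' in quotes is a literal string, not your variable. Pass the variable itself (without quotes), since pd.to_datetime has already returned a Timestamp. If you start from a date string instead, convert it to a datetime manually, for example: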
from datetime import datetime
# Parse the 'YYYY-MM-DD' text straight into a datetime object so it can be
# compared with datetime values instead of with a plain string.
date = datetime.strptime('2019-09-08', '%Y-%m-%d')
To learn more about date formatting, see the documentation for datetime.strptime.

how to remove nameerror Traceback (most recent call last)

I am building a DataFrame from three Series created from dictionaries. None of the dictionaries contains a key or value called "name" or "null", but I still get NameError: name 'null' is not defined.
I also reran the code in a different Jupyter notebook.
import pandas as pd
p1=pd.Series({'team':'england','keyplayer':'joe root','bowler':'jofra'})
p2=pd.Series({'team':'india','keyplayer':'virat kohli','bowler':'bumhra'})
p3=pd.Series({'team':'australia','keyplayer':'steve smith','bowler':'starc'})
df=pd.DataFrame([p1,p2,p3],index=['1','2','3'])
df.head()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\IPython\core\formatters.py in __call__(self, obj)
700 type_pprinters=self.type_printers,
701 deferred_pprinters=self.deferred_printers)
--> 702 printer.pretty(obj)
703 printer.flush()
704 return stream.getvalue()
~\Anaconda3\lib\site-packages\IPython\lib\pretty.py in pretty(self, obj)
400 if cls is not object \
401 and callable(cls.__dict__.get('__repr__')):
--> 402 return _repr_pprint(obj, self, cycle)
403
404 return _default_pprint(obj, self, cycle)
~\Anaconda3\lib\site-packages\IPython\lib\pretty.py in _repr_pprint(obj, p, cycle)
695 """A pprint that just redirects to the normal repr function."""
696 # Find newlines and replace them with p.break_()
--> 697 output = repr(obj)
698 for idx,output_line in enumerate(output.splitlines()):
699 if idx:
~\Anaconda3\lib\site-packages\pandas\core\base.py in __repr__(self)
80 Yields Bytestring in Py2, Unicode String in py3.
81 """
---> 82 return str(self)
83
84
~\Anaconda3\lib\site-packages\pandas\core\base.py in __str__(self)
59
60 if compat.PY3:
---> 61 return self.__unicode__()
62 return self.__bytes__()
63
~\Anaconda3\lib\site-packages\pandas\core\frame.py in __unicode__(self)
661 width = None
662 self.to_string(buf=buf, max_rows=max_rows, max_cols=max_cols,
--> 663 line_width=width, show_dimensions=show_dimensions)
664
665 return buf.getvalue()
NameError: name 'null' is not defined
After this traceback, the table is still displayed.
I tried to run your code in an isolated environment and it worked!
IPython:
In [1]: import pandas as pd
In [2]: p1=pd.Series({'team':'england','keyplayer':'joe root','bowler':'jofra'})
: p2=pd.Series({'team':'india','keyplayer':'virat kohli','bowler':'bumhra'})
: p3=pd.Series({'team':'australia','keyplayer':'steve smith','bowler':'starc'})
: df=pd.DataFrame([p1,p2,p3],index=['1','2','3'])
In [3]: df.head()
Out[3]:
team keyplayer bowler
1 england joe root jofra
2 india virat kohli bumhra
3 australia steve smith starc
I installed only the minimum required to run it: numpy==1.16.4, pandas==0.24.2 and jupyter==1.0.0
Maybe there is a problem with your installed libraries.
I recommend running your code in a virtualenv with only the required libraries installed.
To learn more, nothing better than the Python documentation itself: https://docs.python.org/3/tutorial/venv.html

AttributeError: Can only use .dt accessor with datetimelike values in 0yrs 0mon format

I am trying to convert a date-like string column to numeric, but I get an error.
My date column looks like this:
train['AVERAGE_ACCT_AGE'].head(6)
0 0yrs 0mon
1 1yrs 11mon
2 0yrs 0mon
3 0yrs 8mon
4 0yrs 0mon
5 1yrs 9mon
Name: AVERAGE_ACCT_AGE, dtype: object
I tried this code to convert that variable to datetime format:
train['AVERAGE_ACCT_AGE']=pd.to_datetime(train['AVERAGE.ACCT.AGE'], format='%Y%m')
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\pandas\core\tools\datetimes.py in _convert_listlike(arg, box, format, name, tz)
376 try:
--> 377 values, tz = conversion.datetime_to_datetime64(arg)
378 return DatetimeIndex._simple_new(values, name=name, tz=tz)
pandas\_libs\tslibs\conversion.pyx in pandas._libs.tslibs.conversion.datetime_to_datetime64()
TypeError: Unrecognized value type: <class 'str'>
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-49-13f5c298f460> in <module>()
----> 1 train['AVERAGE_ACCT_AGE']=pd.to_datetime(train['AVERAGE.ACCT.AGE'], format='%Y-%m')
~\Anaconda3\lib\site-packages\pandas\core\tools\datetimes.py in to_datetime(arg, errors, dayfirst, yearfirst, utc, box, format, exact, unit, infer_datetime_format, origin, cache)
449 else:
450 from pandas import Series
--> 451 values = _convert_listlike(arg._values, True, format)
452 result = Series(values, index=arg.index, name=arg.name)
453 elif isinstance(arg, (ABCDataFrame, MutableMapping)):
~\Anaconda3\lib\site-packages\pandas\core\tools\datetimes.py in _convert_listlike(arg, box, format, name, tz)
378 return DatetimeIndex._simple_new(values, name=name, tz=tz)
379 except (ValueError, TypeError):
--> 380 raise e
381
382 if arg is None:
~\Anaconda3\lib\site-packages\pandas\core\tools\datetimes.py in _convert_listlike(arg, box, format, name, tz)
366 dayfirst=dayfirst,
367 yearfirst=yearfirst,
--> 368 require_iso8601=require_iso8601
369 )
370
pandas\_libs\tslib.pyx in pandas._libs.tslib.array_to_datetime()
pandas\_libs\tslib.pyx in pandas._libs.tslib.array_to_datetime()
ValueError: time data 0yrs 0mon doesn't match format specified
After that, I tried this code, adding errors='ignore' to the call:
train['AVERAGE_ACCT_AGE']=pd.to_datetime(train['AVERAGE.ACCT.AGE'], format='%Y%m',errors='ignore',infer_datetime_format=True)
It seemed to add the datetime format, so I then ran this code:
train['yrs']=train['AVERAGE_ACCT_AGE'].dt.year
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-50-39b8c6e07f77> in <module>()
----> 1 train['yrs']=train['AVERAGE_ACCT_AGE'].dt.year
~\Anaconda3\lib\site-packages\pandas\core\generic.py in __getattr__(self, name)
4366 if (name in self._internal_names_set or name in self._metadata or
4367 name in self._accessors):
-> 4368 return object.__getattribute__(self, name)
4369 else:
4370 if self._info_axis._can_hold_identifiers_and_holds_name(name):
~\Anaconda3\lib\site-packages\pandas\core\accessor.py in __get__(self, obj, cls)
130 # we're accessing the attribute of the class, i.e., Dataset.geo
131 return self._accessor
--> 132 accessor_obj = self._accessor(obj)
133 # Replace the property with the accessor object. Inspired by:
134 # http://www.pydanny.com/cached-property.html
~\Anaconda3\lib\site-packages\pandas\core\indexes\accessors.py in __new__(cls, data)
323 pass # we raise an attribute error anyway
324
--> 325 raise AttributeError("Can only use .dt accessor with datetimelike "
326 "values")
Please help me convert the object type to a numeric type. I want the years and months in separate columns.
AttributeError: Can only use .dt accessor with datetimelike values
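The column is not of datetime dtype: with errors='ignore', pd.to_datetime returns the input unchanged when parsing fails, so the values are still strings.
Here is a quick way to convert it to numeric.
I am using more lines than needed, for clarity.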
# Split values such as "1yrs 11mon" into two numeric columns, 'yrs' and 'months'.
# Cast to str first so the .str accessor works on every row.
train['AVERAGE_ACCT_AGE'] = train['AVERAGE_ACCT_AGE'].astype(str)
# Drop leading/trailing whitespace so the split below sees exactly "<n>yrs <m>mon".
train['AVERAGE_ACCT_AGE'] = train['AVERAGE_ACCT_AGE'].str.strip()
# Split once on the first space and expand the two parts into two new columns.
train[['yrs','months']] = train['AVERAGE_ACCT_AGE'].str.split(' ',n=1,expand=True)
# Remove the unit suffixes from the new columns
# (assumes the suffixes are always exactly 'yrs' and 'mon').
train['yrs'] = train['yrs'].str.replace('yrs','')
train['months'] = train['months'].str.replace('mon','')
# Convert yrs to float
train['yrs'] = train['yrs'].astype('float')
# Convert months to float (from the 'months' column, not 'yrs', which would
# silently overwrite the months with the years).
train['months'] = train['months'].astype('float')
Hope it helps.

Resources