Error when calling Dask groupby with custom aggregate - python-3.x

The following error occurs when I run the example test below. What am I doing wrong?
Error: Exception('Column(s) id already selected',)
Code:
import unittest
import dask
import numpy as np
import pandas as pd
import dask.dataframe as dd

class TestDaskCustomAgg(unittest.TestCase):
    def mode(self, x):
        val = pd.Series.mode(x)
        if val.empty:
            return np.NaN
        return val[0]

    def test_get_transactions(self):
        df = dask.datasets.timeseries()
        custom_agg = dd.Aggregation('custom_agg', agg=lambda x: self.mode(x), chunk=lambda x0: self.mode(x0))
        df.groupby('name').agg(custom_agg).compute()
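Note that dd.Aggregation's chunk and agg callbacks receive grouped series (SeriesGroupBy objects) rather than plain Series, so handing them to pd.Series.mode does not compute a per-group mode. A minimal sketch of a value-counts based mode, adapted from the custom-aggregation example in the dask documentation (the explicit column selection before .agg is an assumption here, not a verified fix for the "already selected" error):

import dask
import dask.dataframe as dd

def chunk(s):
    # s is a SeriesGroupBy: per-partition counts keyed by (group, value)
    return s.value_counts()

def agg(s):
    # combine the per-partition counts for each (group, value) pair
    return s.apply(lambda part: part.groupby(level=-1).sum())

def finalize(s):
    # for each group, keep the value with the highest combined count
    level = list(range(s.index.nlevels - 1))
    return s.groupby(level=level).apply(
        lambda part: part.reset_index(level=level, drop=True).idxmax()
    )

mode_agg = dd.Aggregation('mode', chunk, agg, finalize)

df = dask.datasets.timeseries()
result = df.groupby('name')[['id', 'x', 'y']].agg(mode_agg).compute()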

Related

How to call another object within a PySpark UDF function

I have a class Hello with a few methods
I would like to create a Hello object within a PySpark UDF, such as:
def foo_generation(query_params):
    query_obj = Hello()
    foo = query_obj.hello_method(query_params)
    return foo

spark.udf.register("foo_generation", foo_generation)
df = df.withColumn("foo", F.expr("foo_generation(query_param_ES)"))
This doesn't appear to be working.
How should I generate a Hello object in this instance?
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf, col

@udf(returnType=StringType())
def foo_generation(value):
    query_obj = Hello()
    foo = query_obj.hello_method(value)
    return foo

# call the decorated UDF directly; F.expr would only work after spark.udf.register
df = df.withColumn("foo", foo_generation(col("query_param_ES")))

Why won't this work? - def (function) not being called from main()

I need to be able to use classes, but for now I'm just trying to get my simple code to work.
import pandas as pd, numpy as np

class OutOfCountry7ViewModel():
    def pandas_conversion(self):
        #from csv import readers
        deImport = pd.read_csv("ooc-exceptions.csv")
        d1 = pd.read_csv("CS_Out_Of_Country.csv", encoding='windows-1252', parse_dates=True)
        d2 = pd.read_csv("sccm-devices.csv", encoding='windows-1252')
        d3 = pd.read_csv("CTLDAPRawData.csv", encoding='windows-1252')
        #pandas join magic
        lj_df1 = pd.merge(d1, d2, left_on="ComputerName", right_on="Name", how="left")
        lj_df2 = pd.merge(d2, d3, left_on="PrimaryUser", right_on="Employee Number", how="left")
        #lj_df = plj_df1d.join(lj_df2, lsuffix=)
        df = (lj_df1)
        #print(df)
        df.to_csv('CS_Out_of_country_tabl.csv', index=False, header=df.columns, encoding='utf-8')
        csv = 'CS_Out_of_country_tabl.csv'
        return csv

def main():
    pandas_conversion(self)

if __name__ == '__main__':
    main()
I keep getting an error: NameError: name 'pandas_conversion' is not defined
Are you trying to do something like this?
import pandas as pd, numpy as np

class OutOfCountry7ViewModel():
    def pandas_conversion(self, csv):
        ...

    def main(self):
        self.pandas_conversion(csv)

if __name__ == '__main__':
    some_object = OutOfCountry7ViewModel()
    some_object.main()
This should work:
a = OutOfCountry7ViewModel()
a.pandas_conversion()
Hope this helped!
Try to keep Python's semantics and indentation in mind:
numpy is imported but never used.
The empty parentheses on the class (line 3) are unnecessary:
class OutOfCountry7ViewModel(): #works, but the () adds nothing
class OutOfCountry7ViewModel: #cleaner
The parentheses here are also unnecessary:
df = (lj_df1)
#if you meant to call some function there, its name is missing
If main is defined as a method of the class, it needs the self parameter, and the other method should be called through self:
def main(self):
    self.pandas_conversion()  #not pandas_conversion(self)
PyCharm also warns that def pandas_conversion(self): may be static, which suggests the snippet is incomplete; something is missing that we can't see from it.
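Putting those points together, a minimal corrected skeleton might look like this (a sketch; the read_csv/merge/to_csv logic from the question is elided in the comment):

import pandas as pd

class OutOfCountry7ViewModel:
    def pandas_conversion(self):
        # ... the read_csv / merge / to_csv logic from the question goes here ...
        csv = 'CS_Out_of_country_tabl.csv'
        return csv

    def main(self):
        return self.pandas_conversion()

if __name__ == '__main__':
    viewmodel = OutOfCountry7ViewModel()
    print(viewmodel.main())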

How to Encode (utf-8) in Pandas (Excel as source)

I am trying to read from Excel and load into MongoDB using PyMongo.
The error I got is "cannot encode object: , of type: <class 'pandas._libs.missing.NAType'>". When I researched it, I was told to insert into MongoDB using the utf-8-sig format, but pandas' read_excel has no option to use utf-8.
from pymongo import MongoClient
from datetime import datetime
import pandas as pd
import Parameters
import pandasql as pf
import json
import pymongo
import xlrd
from pathlib import Path
import os
import constants
try:
    class conn:
        def __init__(self):
            client = pymongo.MongoClient("mongodb://" + constants.USER_NAME + ":" + constants.PWD + constants.server + constants.CA_CERTIFICATES_PATH)
            db = client[Parameters.STG_QC_Hub_Files]
            week = "08-02-2021"
            out_col = db[Parameters.col]
            filename = "1.xlsx"
            path1 = Path('//test3' + '/' + filename)
            data_load_date = datetime.today().strftime('%m-%d-%Y')
            df1 = pd.read_excel(path1, sheet_name="AU-ARCM Details", keep_default_na=False)
            # df1 = pd.read_excel(xls+filename, keep_default_na=False, encoding='utf-8-sig')
            # df1 = pd.read_csv(xls, keep_default_na=False, encoding='utf-8-sig').iloc[:, : 86]
            df1["Week"] = week
            df1["Data Load Date"] = data_load_date
            df1 = df1.astype('string')
            # df1.index = df1.index.str.encode('utf-8')
            df1 = df1.drop(['Source.Name'], axis=1)
            records = json.loads(df1.T.to_json()).values()
            out_col.insert_many(df1.to_dict('records'))
            print("Imported File " + str(filename) + " with " + str(len(records)) + " records")

    c = conn()
except Exception as e:
    print(e)
Traceback:
File "C:\Users\PycharmProjects\ReMs\venv\lib\site-packages\pymongo\message.py", line 1323, in _do_batched_op_msg
operation, command, docs, check_keys, ack, opts, ctx)
bson.errors.InvalidDocument: cannot encode object: <NA>, of type: <class 'pandas._libs.missing.NAType'>
You have some blank cells in your spreadsheet, for which pandas uses its own missing-value type (NAType, i.e. pd.NA); pymongo doesn't know what to do with this type, hence the error. You will need to replace these values in order to load the records into MongoDB using the method you are using.
Consider something like this just before you attempt the insert:
import numpy as np
df1 = df1.replace(np.nan, None)
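One caveat: since the frame was cast with astype('string'), the blanks are pd.NA rather than np.nan, so a variant that normalises every pandas missing value to None (a sketch) may be more robust:

import pandas as pd

# cast to object first, then turn every pandas missing value (NaN, NaT, pd.NA)
# into plain None, which pymongo encodes as a BSON null
df1 = df1.astype(object).where(pd.notnull(df1), None)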

Pyspark S3A Access Denied Exception for cross account STS assume role

I set up an AWS Glue job to process S3 files present in another AWS account, B. The IAM role in account A (the Glue job's IAM role) uses STS to assume a role in account B that grants access to the files I need. Account B's IAM role has a trust relationship with the Glue job role in account A. I was able to print the access key and secret key, so I assume the STS call itself is working.
I get the error below:
An error occurred while calling o83.json. com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.services.s3.model.AmazonS3Exception: Access Denied (Service: Amazon S3; Status Code: 403; Error Code: AccessDenied;
What is the right way to set up the S3A connector, given that I get this Access Denied exception?
Here is my code:
from __future__ import print_function
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import col
from pyspark.sql.types import *
from pyspark.sql import HiveContext
from pyspark.sql.functions import explode
from pyspark.sql.functions import explode_outer
from pyspark.sql.functions import substring_index
from pyspark.sql.functions import input_file_name
from pyspark.sql import functions as f
import sys
import os
import boto3
import errno
import time
import datetime
from datetime import timedelta, date
from pyspark.sql.functions import split
from pyspark.sql.functions import substring
from boto3.session import Session
spark = SparkSession \
    .builder \
    .appName("JsonInputFormat") \
    .enableHiveSupport() \
    .getOrCreate()
sc = spark.sparkContext
hive_context = HiveContext(sc)
hive_context.setConf("hive.exec.dynamic.partition", "true")
hive_context.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
hive_context.setConf("hive.serialization.extend.nesting.levels","true")
sqlCtx = HiveContext(sc)
client = boto3.client('sts')
response = client.assume_role(RoleArn='ROLE_TO_ASSUME', RoleSessionName='AssumeRoleSession1')
credentials = response['Credentials']
ACCESS_KEY = credentials['AccessKeyId']
SECRET_KEY = credentials['SecretAccessKey']
print('access key is {}'.format(ACCESS_KEY))
print('secret key is {}'.format(SECRET_KEY))
print("Hadoop version: " + sc._gateway.jvm.org.apache.hadoop.util.VersionInfo.getVersion())
session = Session(aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY)
s3 = session.resource('s3')
spark._jsc.hadoopConfiguration().set("fs.s3a.access.key", ACCESS_KEY)
spark._jsc.hadoopConfiguration().set("fs.s3a.secret.key", SECRET_KEY)
spark._jsc.hadoopConfiguration().set("com.amazonaws.services.s3a.enableV4", "true")
spark._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "s3-us-east-1.amazonaws.com")
spark._jsc.hadoopConfiguration().set("fs.s3a.impl","org.apache.hadoop.fs.s3a.S3AFileSystem")
def flatten_schema(schema):
    """Take schema as returned from schema().jsonValue()
    and return list of field names with full path"""
    def _flatten(schema, path="", accum=None):
        # Extract name of the current element
        name = schema.get("name")
        # If there is a name, extend the path
        if name is not None:
            path = "{0}.{1}".format(path, name) if path else name
            print('path is {}'.format(path))
        # It is some kind of struct
        if isinstance(schema.get("fields"), list):
            for field in schema.get("fields"):
                _flatten(field, path, accum)
        elif isinstance(schema.get("type"), dict):
            _flatten(schema.get("type"), path, accum)
        # It is an atomic type
        else:
            accum.append(path)
    accum = []
    _flatten(schema, "", accum)
    return accum
sqlCtx.sql("set spark.sql.caseSensitive=true")
yesterday = date.today() - timedelta(1)
daybefore=yesterday.strftime("%Y-%m-%d")
currentdate=time.strftime("%Y-%m-%d")
key = 'KEY={}'.format(str(daybefore))
bucket = 'BUKCET_NAME'
df_base=spark.read.json('s3a://{}/{}/*/'.format(bucket,key))
base_schema=df_base.schema
datePrefix=str(daybefore)
source='s3a://{}/{}'.format(bucket,key)
df1=spark.read.json(source,schema=base_schema)
schema=df1.schema.jsonValue()
columns_list=flatten_schema(schema)
print('columns list is {}'.format(columns_list))
df2 = df1.select(*(col(x).alias(x.replace('.','_')) for x in columns_list))
print('df2 is {}'.format(df2))
df3=df2.select("*",explode_outer(df2.contents).alias("contents_flat"))
df3=df3.drop("contents")
print('df3 is {}'.format(df3))
schema4=df3.schema.jsonValue()
columns_list4=flatten_schema(schema4)
print('columns list 4 is {}'.format(columns_list4))
df5 = df3.select(*(col(x).alias(x.replace('.','_')) for x in columns_list4))
print('df5 is {}'.format(df5))
schema5=df5.schema.jsonValue()
columns_list5=flatten_schema(schema5)
print('columns list 5 is {}'.format(columns_list5))
df6 = df5.select(*(col(x).alias(x.replace('contents_flat','contents')) for x in columns_list5))
print('df6 is {}'.format(df6))
schema6=df6.schema.jsonValue()
columns_list6=flatten_schema(schema6)
print('column list 6 is {}'.format(columns_list6))
df7 = df6.select(*(col(x) for x in columns_list6)) #above line edited down here
schema7=df7.schema.jsonValue()
print('schema7 is {}'.format(schema7))
columns_list7=flatten_schema(schema7)
print('columns list 7 is {}'.format(columns_list7))
df7 = df7.select(*(col(x).alias(x.replace('.','_')) for x in columns_list7))
df7=df7.select("*",explode_outer(df7.business_address_latLng_warnings).alias("business_address_latLng_warnings_flat"))
df7=df7.drop("business_address_latLng_warnings")
print('df7 is {}'.format(df7))
df8 = df7.withColumn("filename",input_file_name())
split_col = split(df8['filename'], 'short_date=')
df9 = df8.withColumn('shortfilename', split_col.getItem(1))
df_final = df9.withColumn('filedate', substring('shortfilename',1,10)).drop('shortfilename')
print('df_final is {}'.format(df_final))
df_final.write.mode('append').csv('s3://{bucket}/{folder1}/', header='true')
spark.stop()
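For what it's worth, one thing that stands out: only the access key and secret key are passed to S3A, but STS assume-role credentials are temporary and also include a session token. A sketch of the extra Hadoop configuration that temporary credentials usually need (not a verified fix for this specific job):

# STS credentials are temporary, so S3A also needs the session token and the
# temporary-credentials provider; without them the request is made with an
# incomplete credential set and is typically rejected
SESSION_TOKEN = credentials['SessionToken']
hconf = spark._jsc.hadoopConfiguration()
hconf.set("fs.s3a.access.key", ACCESS_KEY)
hconf.set("fs.s3a.secret.key", SECRET_KEY)
hconf.set("fs.s3a.session.token", SESSION_TOKEN)
hconf.set("fs.s3a.aws.credentials.provider",
          "org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider")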

Cassandra ExecutionResult on importlib._bootstrap failed

I am trying to use multiprocessing to pull data from Cassandra, but I'm running into an issue. I want to pull rows for a single key or for multiple keys using the multiprocessing pattern provided by the Cassandra driver.
My cassandra_db class
from cassandra.cluster import Cluster
import cassandra
import pandas as pd
import numpy as np
from datetime import datetime
import sys
import os
from threading import Event
import itertools
from multiprocessing import Pool
from cassandra.concurrent import execute_concurrent_with_args
from cassandra.query import tuple_factory
ip_address = '127.0.0.1'
class cassandra_db(object):
    concurrency = 2  # chosen to match the default in execute_concurrent_with_args

    def __init__(self, process_count=None):
        self.pool = Pool(processes=process_count, initializer=self._setup)

    @classmethod
    def _setup(cls):
        cls.session = Cluster([ip_address]).connect(keyspace='test')
        cls.session.row_factory = pandas_factory  # NOTE: pandas_factory is not defined in this snippet
        cls.prepared = cls.session.prepare('SELECT * FROM tr_test WHERE key=?')

    def close_pool(self):
        self.pool.close()
        self.pool.join()

    def get_results(self, params):
        try:
            xrange
        except NameError:
            xrange = range
        params = list(params)
        print("-----> ", params)
        print("-----+>", self.concurrency)
        self.pool.map(_multiprocess_get, (params[n:n + self.concurrency] for n in xrange(0, len(params), self.concurrency)))

    @classmethod
    def _results_from_concurrent(cls, params):
        return execute_concurrent_with_args(cls.session, cls.prepared, params)

def _multiprocess_get(params):
    return cassandra_db._results_from_concurrent(params)
My calling code
import os
import pandas as pd
import sys
relative_path='/home/anji'
sys.path.append(os.path.join(relative_path ,'commons','Database Operations'))
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
from cassandra_db import cassandra_db
from cassandra.policies import ConstantReconnectionPolicy
processes =2
con_db = cassandra_db(processes)
keys=[(1,),(2,)]
df = con_db.get_results(keys)
print("Result",df.head())
Error:
multiprocessing.pool.MaybeEncodingError: Error sending result: '[[ExecutionResult(success=True, result_or_exc=<cassandra.cluster.ResultSet object at 0x7fa93658bbe0>), ExecutionResult(success=True, result_or_exc=<cassandra.cluster.ResultSet object at 0x7fa936a2e0f0>)]]'. Reason: 'PicklingError("Can't pickle <class 'importlib._bootstrap.ExecutionResult'>: attribute lookup ExecutionResult on importlib._bootstrap failed",)'
I am trying to execute this for 2 keys but keep hitting this issue. Can anyone help me solve it?
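For context on the error itself: pool.map has to pickle whatever the worker returns, and the driver's ExecutionResult / ResultSet objects are not picklable. A sketch of one way around this, converting each result set to plain rows inside the worker before it crosses the process boundary (an assumption about the fix, not a tested solution):

@classmethod
def _results_from_concurrent(cls, params):
    # materialise each ResultSet into a plain list of rows, which pickle can
    # usually handle, before the result is sent back through the pool
    results = execute_concurrent_with_args(cls.session, cls.prepared, params)
    return [list(res.result_or_exc) if res.success else res.result_or_exc
            for res in results]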
