Size in spark dataframe - apache-spark

I created a DataFrame from a table in my Postgres database. When I run df.count() to see the number of rows, I get this warning:
WARN TaskSetManager: Stage 9 contains a task of very large size (22439 KiB). The maximum recommended task size is 1000 KiB.
What does that mean? What is the maximum size of a DataFrame in Spark?
Here's how I connected to the Postgres database:
import configparser
import psycopg2
import pandas as pd
from queries import COUNTRY_TABLE, ACTORS_TABLE, COL_ACTOR, COL_COUNTRY
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf

spark = SparkSession.builder.appName('ETL dvdrental pyspark').getOrCreate()

def connection_db():
    conn = psycopg2.connect("host=localhost dbname=demo user=postgres password=admin port=5432")
    cur = conn.cursor()
    return [cur, conn]

def extract_data(query):
    conn_param = connection_db()
    cur = conn_param[0]
    conn = conn_param[1]
    try:
        cur.execute(query)
        data = cur.fetchall()
        return data
    except Exception as e:
        print(e)

tickets_col = ["ticket_no", "book_ref", "passenger_id", "passenger_name", "contact_data"]
tickets = spark.createDataFrame(extract_data("SELECT * FROM tickets")).toDF(*tickets_col)
tickets.count()
I get the warning when I execute tickets.count().
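The warning typically appears because createDataFrame is called on rows that were already collected on the driver (the psycopg2 fetchall result), so the whole dataset gets serialized into the tasks Spark ships to the executors; it is a warning about task size, not a hard limit on DataFrame size. A minimal sketch of reading the same table through Spark's built-in JDBC source instead, which leaves the rows on the executors (this assumes the PostgreSQL JDBC driver jar is on the Spark classpath; the connection values are copied from the snippet above):

# Read the table directly via JDBC instead of fetching it into the driver first
tickets = (spark.read
    .format("jdbc")
    .option("url", "jdbc:postgresql://localhost:5432/demo")
    .option("dbtable", "tickets")
    .option("user", "postgres")
    .option("password", "admin")
    .load())
tickets.count()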

Related

pyspark called from an imported method that calls another method gives empty dataframe

I have a module called test_pyspark_module with the following code:
class SparkTest:
    def __init__(self, spark, sql):
        self.spark = spark
        self.sql = sql

    def fetch_data(self, sql_text):
        data = self.sql(sql_text).toPandas()
        spark = self.spark
        print(len(data))

    def call_fetch_data(self):
        sql_text = """
        SELECT *
        FROM
        <TABLENAME>
        WHERE date BETWEEN '${date-15}' and '${date-1}'
        and app_id=1233
        """
        return self.fetch_data(sql_text)

def fetch_data(sql, sql_text):
    data = sql(sql_text).toPandas()
    print(len(data))
I have a PySpark kernel running and the following code in my Jupyter notebook:
from pyspark.sql import SQLContext
from pyspark import SparkContext

sqlContext = SQLContext(spark)
sql = sqlContext.sql

sql_text = """
SELECT *
FROM
<TABLENAME>
WHERE date BETWEEN '${date-15}' and '${date-1}'
and app_id=1233
"""

from test_pyspark_module import *

st = SparkTest(spark, sql)
Now when I run st.fetch_data(sql_text) I get 43000. However, when I run st.call_fetch_data() I get 0.
I wanted to see if something was going wrong with the import, so I implemented a duplicate of SparkTest locally, calling it SparkTest2. However, this works as I expect, with both functions returning 43000.
class SparkTest2:
    def __init__(self, spark, sql):
        self.spark = spark
        self.sql = sql

    def fetch_data(self, sql_text):
        data = self.sql(sql_text).toPandas()
        print(len(data))

    def call_fetch_data(self):
        sql_text = """
        SELECT *
        FROM
        <TABLE_NAME>
        WHERE date BETWEEN '${date-15}' and '${date-1}'
        and app_id=1233
        """
        return self.fetch_data(sql_text)

st2 = SparkTest2(spark, sql)
st2.fetch_data(sql_text) gives 43000 and st2.call_fetch_data() also gives 43000.
So it seems that if I run a class method that calls another method, and the class is imported, it fails to give the correct results. Note that there is no error or exception; I just get 0 rows (I do get the correct number of columns, i.e. 28).
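A hedged first step might be to log the exact SQL string each code path ends up executing, since a difference in how the ${date-15}/${date-1} placeholders are substituted between the notebook and the imported module would explain the same 28 columns coming back with 0 rows. The print below is purely illustrative and not part of the original module:

class SparkTest:
    def __init__(self, spark, sql):
        self.spark = spark
        self.sql = sql

    def fetch_data(self, sql_text):
        # Illustrative only: dump the query actually executed so the notebook path
        # and the imported-module path can be diffed character for character.
        print(repr(sql_text))
        data = self.sql(sql_text).toPandas()
        print(len(data))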

How to load data from a connection string with vaex package?

If I have a table on my server and I produce a connection string to it, how can I load it into a dataframe using Vaex?
Here is what I am doing, but with Pandas:
from sqlalchemy import types, create_engine, text
import pandas as pd
import pymysql

def connect_to_data(driver='mysql+pymysql://', conn_string=''):
    try:
        conn = create_engine(driver + conn_string)
        print("MySQL Connection Successful!")
    except Exception as err:
        print("MySQL Connection Failed!")
        print(err)
    return conn

# Connect to the db:
conn_string = 'xxxxxxxx'
conn = connect_to_data(conn_string=conn_string)

# Get all requests from the db:
query = '''SELECT * FROM table_name'''
result = conn.execute(text(query))

# Desired dataframe:
df = pd.read_sql_query(query, conn)
How can I do the same with Vaex (because of its high performance)?
For now at least, you can't do it directly. But vaex can easily read a pandas dataframe, so you can:
# Following your example..
import vaex

pandas_df = pd.read_sql_query(query, conn)
df = vaex.from_pandas(pandas_df)
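If the reason for switching is vaex's performance on data that does not fit comfortably in memory, a common follow-up (an assumption on my part, not something the answer above requires) is to export the pandas result once to HDF5 and memory-map it afterwards; the file name here is hypothetical:

import vaex

pandas_df = pd.read_sql_query(query, conn)
df = vaex.from_pandas(pandas_df)
df.export_hdf5('table_name.hdf5')   # one-off export; hypothetical file name
df = vaex.open('table_name.hdf5')   # reopened memory-mapped, no full copy in RAM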

Python pandas into azure SQL, bulk insert

How can I arrange a bulk insert of a Python dataframe into the corresponding Azure SQL table?
I see that INSERT works with individual records:
INSERT INTO XX ([Field1]) VALUES (value1);
How can I insert the entire content of the dataframe into the Azure table?
Thanks
According to my test, we can also use to_sql to insert data into Azure SQL.
For example:
from urllib.parse import quote_plus
import numpy as np
import pandas as pd
from sqlalchemy import create_engine, event
import pyodbc

# Azure SQL connection string
conn = 'Driver={ODBC Driver 17 for SQL Server};Server=tcp:<server name>.database.windows.net,1433;Database=<db name>;Uid=<user name>;Pwd=<password>;Encrypt=yes;TrustServerCertificate=no;Connection Timeout=30;'
quoted = quote_plus(conn)
engine = create_engine('mssql+pyodbc:///?odbc_connect={}'.format(quoted))

@event.listens_for(engine, 'before_cursor_execute')
def receive_before_cursor_execute(conn, cursor, statement, params, context, executemany):
    print("FUNC call")
    if executemany:
        cursor.fast_executemany = True

# insert
table_name = 'Sales'
# For the test, I use a csv file to create the dataframe
df = pd.read_csv(r'D:\data.csv')
df.to_sql(table_name, engine, index=False, if_exists='replace', schema='dbo')

# test after inserting
query = 'SELECT * FROM {table}'.format(table=table_name)
dfsql = pd.read_sql(query, engine)
print(dfsql)
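As a side note, SQLAlchemy 1.3+ also accepts fast_executemany directly as a create_engine argument for the mssql+pyodbc dialect, which does the same thing as the event listener above; a small sketch reusing the connection string and dataframe from the example:

from urllib.parse import quote_plus
from sqlalchemy import create_engine
import pandas as pd

quoted = quote_plus(conn)  # same ODBC connection string as above
engine = create_engine(
    'mssql+pyodbc:///?odbc_connect={}'.format(quoted),
    fast_executemany=True,  # SQLAlchemy 1.3+; replaces the before_cursor_execute listener
)
df = pd.read_csv(r'D:\data.csv')
df.to_sql('Sales', engine, index=False, if_exists='replace', schema='dbo')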

Pyspark S3A Access Denied Exception for cross account STS assume role

I set up an AWS Glue job to process S3 files present in another AWS account B. The IAM role in Account A (the Glue job's IAM role) uses STS to assume a role in Account B that grants access to the desired files. Account B's IAM role has a trust relationship with the Glue job role in Account A. I was able to print the access key and secret key, so I assume STS is working well.
I get the error below:
An error occurred while calling o83.json. com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.services.s3.model.AmazonS3Exception: Access Denied (Service: Amazon S3; Status Code: 403; Error Code: AccessDenied;
What is the right way to set up the S3A connector, given that I get an Access Denied exception?
Here is my code:
from __future__ import print_function
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import col
from pyspark.sql.types import *
from pyspark.sql import HiveContext
from pyspark.sql.functions import explode
from pyspark.sql.functions import explode_outer
from pyspark.sql.functions import substring_index
from pyspark.sql.functions import input_file_name
from pyspark.sql import functions as f
import sys
import os
import boto3
import errno
import time
import datetime
from datetime import timedelta, date
from pyspark.sql.functions import split
from pyspark.sql.functions import substring
from boto3.session import Session
spark = SparkSession \
    .builder \
    .appName("JsonInputFormat") \
    .enableHiveSupport() \
    .getOrCreate()
sc = spark.sparkContext
hive_context = HiveContext(sc)
hive_context.setConf("hive.exec.dynamic.partition", "true")
hive_context.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
hive_context.setConf("hive.serialization.extend.nesting.levels","true")
sqlCtx = HiveContext(sc)
client = boto3.client('sts')
response = client.assume_role(RoleArn='ROLE_TO_ASSUME', RoleSessionName='AssumeRoleSession1')
credentials = response['Credentials']
ACCESS_KEY = credentials['AccessKeyId']
SECRET_KEY = credentials['SecretAccessKey']
print('access key is {}'.format(ACCESS_KEY))
print('secret key is {}'.format(SECRET_KEY))
print("Hadoop version: " + sc._gateway.jvm.org.apache.hadoop.util.VersionInfo.getVersion())
session = Session(aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY)
s3 = session.resource('s3')
spark._jsc.hadoopConfiguration().set("fs.s3a.access.key", ACCESS_KEY)
spark._jsc.hadoopConfiguration().set("fs.s3a.secret.key", SECRET_KEY)
spark._jsc.hadoopConfiguration().set("com.amazonaws.services.s3a.enableV4", "true")
spark._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "s3-us-east-1.amazonaws.com")
spark._jsc.hadoopConfiguration().set("fs.s3a.impl","org.apache.hadoop.fs.s3a.S3AFileSystem")
def flatten_schema(schema):
    """Take schema as returned from schema().jsonValue()
    and return list of field names with full path"""
    def _flatten(schema, path="", accum=None):
        # Extract name of the current element
        name = schema.get("name")
        # If there is a name extend path
        if name is not None:
            path = "{0}.{1}".format(path, name) if path else name
            print('path is {}'.format(path))
        # It is some kind of struct
        if isinstance(schema.get("fields"), list):
            for field in schema.get("fields"):
                _flatten(field, path, accum)
        elif isinstance(schema.get("type"), dict):
            _flatten(schema.get("type"), path, accum)
        # It is an atomic type
        else:
            accum.append(path)
    accum = []
    _flatten(schema, "", accum)
    return accum
sqlCtx.sql("set spark.sql.caseSensitive=true")
yesterday = date.today() - timedelta(1)
daybefore=yesterday.strftime("%Y-%m-%d")
currentdate=time.strftime("%Y-%m-%d")
key = 'KEY={}'.format(str(daybefore))
bucket = 'BUCKET_NAME'
df_base=spark.read.json('s3a://{}/{}/*/'.format(bucket,key))
base_schema=df_base.schema
datePrefix=str(daybefore)
source='s3a://{}/{}'.format(bucket,key)
df1=spark.read.json(source,schema=base_schema)
schema=df1.schema.jsonValue()
columns_list=flatten_schema(schema)
print('columns list is {}'.format(columns_list))
df2 = df1.select(*(col(x).alias(x.replace('.','_')) for x in columns_list))
print('df2 is {}'.format(df2))
df3=df2.select("*",explode_outer(df2.contents).alias("contents_flat"))
df3=df3.drop("contents")
print('df3 is {}'.format(df3))
schema4=df3.schema.jsonValue()
columns_list4=flatten_schema(schema4)
print('columns list 4 is {}'.format(columns_list4))
df5 = df3.select(*(col(x).alias(x.replace('.','_')) for x in columns_list4))
print('df5 is {}'.format(df5))
schema5=df5.schema.jsonValue()
columns_list5=flatten_schema(schema5)
print('columns list 5 is {}'.format(columns_list5))
df6 = df5.select(*(col(x).alias(x.replace('contents_flat','contents')) for x in columns_list5))
print('df6 is {}'.format(df6))
schema6=df6.schema.jsonValue()
columns_list6=flatten_schema(schema6)
print('column list 6 is {}'.format(columns_list6))
df7 = df6.select(*(col(x) for x in columns_list6)) #above line edited down here
schema7=df7.schema.jsonValue()
print('schema7 is {}'.format(schema7))
columns_list7=flatten_schema(schema7)
print('columns list 7 is {}'.format(columns_list7))
df7 = df7.select(*(col(x).alias(x.replace('.','_')) for x in columns_list7))
df7=df7.select("*",explode_outer(df7.business_address_latLng_warnings).alias("business_address_latLng_warnings_flat"))
df7=df7.drop("business_address_latLng_warnings")
print('df7 is {}'.format(df7))
df8 = df7.withColumn("filename",input_file_name())
split_col = split(df8['filename'], 'short_date=')
df9 = df8.withColumn('shortfilename', split_col.getItem(1))
df_final = df9.withColumn('filedate', substring('shortfilename',1,10)).drop('shortfilename')
print('df_final is {}'.format(df_final))
df_final.write.mode('append').csv('s3://{bucket}/{folder1}/', header='true')
spark.stop()
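One thing the snippet never hands to S3A is the session token that comes back with the temporary STS credentials; with only the access/secret pair from assume_role, S3A requests are usually refused. A possible sketch of the extra configuration, reusing the credentials object from the code above (fs.s3a.session.token and TemporaryAWSCredentialsProvider are standard Hadoop S3A settings, but whether they resolve this particular 403 is an assumption):

# Pass the full temporary-credential triple to S3A, not just the key pair
SESSION_TOKEN = credentials['SessionToken']  # returned by assume_role alongside the keys
hadoop_conf = spark._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.access.key", ACCESS_KEY)
hadoop_conf.set("fs.s3a.secret.key", SECRET_KEY)
hadoop_conf.set("fs.s3a.session.token", SESSION_TOKEN)
hadoop_conf.set("fs.s3a.aws.credentials.provider",
                "org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider")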

Spark 2.0.0 truncate from Redshift table using jdbc

Hello, I am using Spark SQL (2.0.0) with Redshift and I want to truncate my tables. I am using the spark-redshift package and want to know how I can truncate my table. Can anyone please share an example of this?
I was unable to accomplish this using Spark and the code in the spark-redshift repo that you have listed above.
I was, however, able to use AWS Lambda with psycopg2 to truncate a Redshift table. Then I use boto3 to kick off my Spark job via AWS Glue.
The important code below is cur.execute("truncate table yourschema.yourtable")
from __future__ import print_function
import sys
import psycopg2
import boto3

def lambda_handler(event, context):
    db_database = "your_redshift_db_name"
    db_user = "your_user_name"
    db_password = "your_password"
    db_port = "5439"
    db_host = "your_redshift.hostname.us-west-2.redshift.amazonaws.com"
    try:
        print("attempting to connect...")
        conn = psycopg2.connect(dbname=db_database, user=db_user, password=db_password, host=db_host, port=db_port)
        print("connected...")
        conn.autocommit = True
        cur = conn.cursor()
        count_sql = "select count(pivotid) from yourschema.yourtable"
        cur.execute(count_sql)
        results = cur.fetchone()
        print("countBefore: ", results[0])
        countOfPivots = results[0]
        if countOfPivots > 0:
            cur.execute("truncate table yourschema.yourtable")
            print("truncated yourschema.yourtable")
            cur.execute(count_sql)
            results = cur.fetchone()
            print("countAfter: ", results[0])
        cur.close()
        conn.close()
        glueClient = boto3.client("glue")
        startTriggerResponse = glueClient.start_trigger(Name="your-awsglue-ondemand-trigger")
        print("startedTrigger:", startTriggerResponse["Name"])
        return results
    except Exception as e:
        print(e)
        raise e
You need to specify the mode to the library before calling save. For example:
my_dataframe.write
  .format("com.databricks.spark.redshift")
  .option("url", "jdbc:redshift://my_cluster.qwertyuiop.eu-west-1.redshift.amazonaws.com:5439/my_database?user=my_user&password=my_password")
  .option("dbtable", "my_table")
  .option("tempdir", "s3://my-bucket")
  .option("diststyle", "KEY")
  .option("distkey", "dist_key")
  .option("sortkeyspec", "COMPOUND SORTKEY(key_1, key_2)")
  .option("extracopyoptions", "TRUNCATECOLUMNS COMPUPDATE OFF STATUPDATE OFF")
  .mode("overwrite") // "append" / "error"
  .save()
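If the truncate has to stay inside the Spark job itself, the spark-redshift library also documents a preactions option, a semicolon-separated list of SQL statements run before the COPY. A sketch in PySpark, reusing the placeholder URL and table name from the answer above (whether this fits your permissions setup is an assumption):

# Truncate the target table just before appending, via spark-redshift's preactions
(my_dataframe.write
    .format("com.databricks.spark.redshift")
    .option("url", "jdbc:redshift://my_cluster.qwertyuiop.eu-west-1.redshift.amazonaws.com:5439/my_database?user=my_user&password=my_password")
    .option("dbtable", "my_table")
    .option("tempdir", "s3://my-bucket")
    .option("preactions", "TRUNCATE TABLE my_table")
    .mode("append")
    .save())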
