Spark cannot read from AWS S3 - apache-spark

My Python 3 code fails with the following error:
Py4JJavaError: An error occurred while calling o45.load. :
java.lang.RuntimeException: java.lang.ClassNotFoundException: Class
org.apache.hadoop.fs.s3a.S3AFileSystem not found
The code:
from pyspark.sql.functions import udf
from pyspark.sql.types import *
import os
from pandas.io.json import json_normalize
from pyspark.sql.types import *
os.environ['PYSPARK_SUBMIT_ARGS'] = '--driver-memory 10g --packages org.elasticsearch:elasticsearch-hadoop:6.5.2,org.apache.hadoop:hadoop-aws:3.1.1 pyspark-shell'
from pyspark import SparkConf
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").config(conf=SparkConf()).config("spark.local.dir", "/Users/psuresh/spark_local_dir").enableHiveSupport().getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
hadoopConf = spark._jsc.hadoopConfiguration()
hadoopConf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
hadoopConf.set("fs.s3a.access.key", "XXXXXX")
hadoopConf.set("fs.s3a.secret.key","XXXXXXXXXX")
from pyspark.sql.functions import input_file_name
from pyspark.sql.functions import regexp_extract, col
from pyspark.sql.functions import *
df= spark.read.format("parquet").load("s3a://a/b/c/d/dump/")
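This ClassNotFoundException usually means the S3A classes never made it onto the classpath: either PYSPARK_SUBMIT_ARGS was set after the JVM had already started, or the hadoop-aws version pulled in by --packages does not match the Hadoop build bundled with Spark. A hedged sketch of one way to check and align the versions (the 2.7.3 string below is only a placeholder; use whatever your own installation reports):
from pyspark.sql import SparkSession

# Step 1: in a throwaway session, ask Spark which Hadoop version it was built
# against (the same VersionInfo call used elsewhere on this page).
spark = SparkSession.builder.master("local[*]").getOrCreate()
print(spark.sparkContext._gateway.jvm.org.apache.hadoop.util.VersionInfo.getVersion())
spark.stop()

# Step 2: in a fresh interpreter, set PYSPARK_SUBMIT_ARGS *before* the first
# session is created, with a hadoop-aws version matching the one printed above
# (2.7.3 here is only an illustrative placeholder).
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--driver-memory 10g '
    '--packages org.elasticsearch:elasticsearch-hadoop:6.5.2,'
    'org.apache.hadoop:hadoop-aws:2.7.3 pyspark-shell'
)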

Related

How to write pyspark dataframe directly into S3 bucket?

I want to save a pyspark dataframe directly into an S3 bucket. I tried some options but am getting an error. Can someone help me solve this?
I created a sample pyspark dataframe and tried to save it directly to the S3 bucket.
I tried the code below:
from pyspark.context import SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.functions import *
from pyspark.sql import SQLContext
from pyspark.sql.window import Window
import pyspark.sql.functions as func
from pyspark.sql.functions import last
from pyspark.sql import functions as F
from pyspark.sql.functions import lit
from pyspark.sql.functions import col
from pyspark.sql.functions import unix_timestamp
from functools import reduce
from pyspark.sql.session import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import max
from pyspark.sql.types import *
from pyspark.sql import DataFrame
from pyspark.sql.functions import broadcast
from pyspark.sql.functions import dense_rank
from pyspark.sql.window import Window
from pyspark.sql.functions import abs, lit
#from __future__ import division
import sys
import mysql.connector
import traceback
import json
#from sqlalchemy import create_engine
import os
import math
import os.path
import datetime
from os import getpid
import pymysql.cursors
import time
import signal
from bs4 import BeautifulSoup
import pandas as pd
from pyspark.context import SparkConf
from collections import OrderedDict
import multiprocessing
import multiprocessing as mp
from multiprocessing import Pool
from multiprocessing.pool import ThreadPool
from threading import Thread
from functools import partial
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase
from email.mime.application import MIMEApplication
from email import encoders
import smtplib
import shutil
import glob
from datetime import datetime, date
from pyspark.sql import Row
spark = SparkSession.builder.appName("app_name").getOrCreate()
print(spark.sparkContext._gateway.jvm.org.apache.hadoop.util.VersionInfo.getVersion())
sc = spark.sparkContext
aws_access_key_id="*******"
aws_secret_access_key="********"
spark._jsc.hadoopConfiguration().set("fs.s3.awsAccessKeyId", aws_access_key_id)
spark._jsc.hadoopConfiguration().set("fs.s3.awsSecretAccessKey", aws_secret_access_key)
spark._jsc.hadoopConfiguration().set("fs.s3.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
spark._jsc.hadoopConfiguration().set('fs.s3a.aws.credentials.provider', 'com.amazonaws.auth.DefaultAWSCredentialsProviderChain')
df = spark.createDataFrame([Row(a=1, b=4., c='GFG1', d=date(2000, 8, 1), e=datetime(2000, 8, 1, 12, 0)),
                            Row(a=2, b=8., c='GFG2', d=date(2000, 6, 2), e=datetime(2000, 6, 2, 12, 0)),
                            Row(a=4, b=5., c='GFG3', d=date(2000, 5, 3), e=datetime(2000, 5, 3, 12, 0))])
print(df.show())
print(df.printSchema())
df.write.format('csv').option('header','true').save('s3a://******/testing_s3/emp.csv',mode='overwrite')
After running this code I get the error below:
py4j.protocol.Py4JJavaError: An error occurred while calling o48.save.
: com.amazonaws.services.s3.model.AmazonS3Exception: Status Code: 403, AWS Service: Amazon S3, AWS Request ID: RNKTVM6JMDACZ16W, AWS Error Code: null, AWS Error Message: Forbidden, S3 Extended Request ID: MS8lToBlzqSmn1YDdq6SPh7JC6aCKSROuldEz5x9LbsnQdxhKVEQriOpJz5KkCJPBnlk4KgsCkQ=
Please tell me what I am missing in my script. Thanks in advance!
After creating the Spark context, use these lines to set the credentials:
spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.access.key", AWS_ACCESS_KEY_ID)
spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.secret.key", AWS_SECRET_ACCESS_KEY)
or configure them on the SparkConf up front:
import pyspark
conf = (
pyspark.SparkConf()
.setAppName('app_name')
.setMaster(SPARK_MASTER)
.set('spark.hadoop.fs.s3a.access.key', AWS_ACCESS_KEY)
.set('spark.hadoop.fs.s3a.secret.key', AWS_SECRET_KEY)
)
sc = pyspark.SparkContext(conf=conf)
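For completeness, a minimal end-to-end sketch combining the second option with the original write. This is hedged: AWS_ACCESS_KEY, AWS_SECRET_KEY, and the bucket path are placeholders. Note also that the question's config sets fs.s3.* keys while the save path uses the s3a:// scheme, so it is the fs.s3a.* properties that matter here; a 403 Forbidden usually points at the credentials or the bucket policy rather than at Spark itself.
import pyspark
from pyspark.sql import SparkSession

# Assumes AWS_ACCESS_KEY / AWS_SECRET_KEY hold valid credentials with write
# permission on the target bucket.
conf = (
    pyspark.SparkConf()
        .setAppName('app_name')
        .set('spark.hadoop.fs.s3a.access.key', AWS_ACCESS_KEY)
        .set('spark.hadoop.fs.s3a.secret.key', AWS_SECRET_KEY)
)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# df is the sample dataframe from the question; the bucket name is a placeholder.
df.write.format('csv').option('header', 'true').mode('overwrite').save('s3a://your-bucket/testing_s3/emp')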

Spark Performance Issue - Writing to S3

I have an AWS Glue job in which I use pyspark to read a large (30 GB) CSV file on S3 and then save it as Parquet on S3. The job ran for more than 3 hours, after which I killed it. Not sure why converting the file format would take so long. Is Spark the right tool for this conversion? Below is the code I am using:
import logging
import sys
from datetime import datetime
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from pyspark.sql import SQLContext
import boto3
import time
if __name__ == "__main__":
    sc = SparkContext()
    glueContext = GlueContext(sc)
    job = Job(glueContext)
    sqc = SQLContext(sc)
    rdd = (sc
           .textFile("s3://my-bucket/data.txt")
           .flatMap(lambda line: line.split("END"))
           .map(lambda x: x.split("|"))
           .filter(lambda x: len(x) > 1))
    df = sqc.createDataFrame(rdd)
    #print(df.head(10))
    print(f'df.rdd.getNumPartitions() - {df.rdd.getNumPartitions()}')
    df.write.mode('overwrite').parquet('s3://my-bucket/processed')
    job.commit()
Any suggestions for reducing the run time?
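As a hedged sketch only, not an answer from the thread: two commonly suggested tweaks for this pattern are passing an explicit schema, so createDataFrame does not have to sample the RDD to infer column types, and repartitioning before the write to control the number and size of the output Parquet files. The column names, field count, and partition count below are illustrative assumptions:
from pyspark.sql.types import StructType, StructField, StringType

# Illustrative schema: the real column names and count depend on the "|"-delimited data.
schema = StructType([StructField(f"col_{i}", StringType(), True) for i in range(5)])

rows = (sc.textFile("s3://my-bucket/data.txt")
          .flatMap(lambda line: line.split("END"))
          .map(lambda x: x.split("|"))
          .filter(lambda x: len(x) == 5))   # keep only records matching the assumed field count

df = sqc.createDataFrame(rows, schema)      # explicit schema: no type-inference sampling pass
# 64 output partitions is an arbitrary example; tune to the data size.
(df.repartition(64)
   .write.mode('overwrite')
   .parquet('s3://my-bucket/processed'))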

ImportError: No module named pyspark_llap

Below is my main code, which I want to unit test.
get_data.py
from pyspark.sql import SparkSession
from pyspark_llap.sql.session import HiveWarehouseSession
def get_hive_data(query):
    hive_data = hive.executeQuery(query)
    return hive_data

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("HiveApp")\
        .getOrCreate()
    hive = HiveWarehouseSession.session(spark).build()
    data = get_hive_data()
Below is my unittest code. I have written only the imports here, since I get the error when I do from get_data import *
test.py
import unittest
import pyspark
import pyspark.sql.functions as f
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from get_data import *
ERROR
ImportError: No module named pyspark_llap
But if I run just get_data.py, it runs successfully.
I am running it on an edge node!
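A hedged note, not taken from the thread: pyspark_llap ships with the Hive Warehouse Connector rather than with PySpark itself, so it is normally visible only to jobs launched through spark-submit/pyspark with the connector's Python zip attached (for example via --py-files). When the unit test is run with plain python, one workaround is to put that zip on sys.path before importing get_data; the path below is a hypothetical placeholder:
import sys

# Hypothetical path: point this at the Hive Warehouse Connector Python zip on
# your edge node (the file that spark-submit would receive via --py-files).
HWC_PY_ZIP = "/path/to/hive_warehouse_connector/pyspark_hwc-<version>.zip"
sys.path.insert(0, HWC_PY_ZIP)

from get_data import *   # pyspark_llap should now be importable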

convert spark dataframe to aws glue dynamic frame

I tried converting my Spark dataframes to dynamic frames to output them as glueparquet files, but I'm getting the error
'DataFrame' object has no attribute 'fromDF'
My code relies heavily on Spark dataframes. Is there a way to convert from a Spark dataframe to a dynamic frame so I can write it out as glueparquet? If so, could you please provide an example and point out what I'm doing wrong below?
code:
# importing libraries
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
glueContext = GlueContext(SparkContext.getOrCreate())
# updated 11/19/19 for error caused in error logging function
spark = glueContext.spark_session
from pyspark.sql import Window
from pyspark.sql.functions import col
from pyspark.sql.functions import first
from pyspark.sql.functions import date_format
from pyspark.sql.functions import lit,StringType
from pyspark.sql.types import *
from pyspark.sql.functions import substring, length, min,when,format_number,dayofmonth,hour,dayofyear,month,year,weekofyear,date_format,unix_timestamp
base_pth='s3://test/'
bckt_pth1=base_pth+'test_write/glueparquet/'
test_df = glueContext.create_dynamic_frame.from_catalog(
    database='test_inventory',
    table_name='inventory_tz_inventory').toDF()
test_df.fromDF(test_df, glueContext, "test_nest")
glueContext.write_dynamic_frame.from_options(frame = test_nest,
    connection_type = "s3",
    connection_options = {"path": bckt_pth1+'inventory'},
    format = "glueparquet")
error:
'DataFrame' object has no attribute 'fromDF'
Traceback (most recent call last):
File "/mnt/yarn/usercache/livy/appcache/application_1574556353910_0001/container_1574556353910_0001_01_000001/pyspark.zip/pyspark/sql/dataframe.py", line 1300, in __getattr__
"'%s' object has no attribute '%s'" % (self.__class__.__name__, name))
AttributeError: 'DataFrame' object has no attribute 'fromDF'
fromDF is a class method. Here's how you can convert a DataFrame to a DynamicFrame:
from awsglue.dynamicframe import DynamicFrame
DynamicFrame.fromDF(test_df, glueContext, "test_nest")
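Putting the answer together with the question's code, a hedged sketch of the corrected flow (same glueContext, catalog table, and bucket path as above) might look like this:
from awsglue.dynamicframe import DynamicFrame

test_df = glueContext.create_dynamic_frame.from_catalog(
    database='test_inventory',
    table_name='inventory_tz_inventory').toDF()

# ...transformations on the Spark DataFrame test_df...

# Convert the Spark DataFrame back to a DynamicFrame via the class method.
test_nest = DynamicFrame.fromDF(test_df, glueContext, "test_nest")
glueContext.write_dynamic_frame.from_options(
    frame=test_nest,
    connection_type="s3",
    connection_options={"path": bckt_pth1 + 'inventory'},
    format="glueparquet")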
Just to consolidate the answers for Scala users too, here's how to transform a Spark DataFrame into a DynamicFrame (the fromDF method doesn't exist in the Scala API of DynamicFrame):
import com.amazonaws.services.glue.DynamicFrame
val dynamicFrame = DynamicFrame(df, glueContext)
I hope it helps!

model.freqItemsets (FPGrowth algorithm) in Spark 2.4 is not showing any results for the complete dataset

model.freqItemsets from the FPGrowth algorithm in Spark 2.4 is not showing any results for the complete 16 GB dataset, but the same model and code work for a 1 GB sample dataset that is a subset of the 16 GB dataset.
Code snippet:
import org.apache.spark._
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext._
import org.apache.log4j.Level
import org.apache.spark._
import org.apache.spark.SparkContext._
import org.apache.log4j._
import scala.io.Source
import java.nio.charset.CodingErrorAction
import scala.io.Codec
import org.apache.spark.mllib.recommendation._
import org.apache.spark.ml.fpm.FPGrowth
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.mllib.fpm.FPGrowthModel
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.explode
object Full_Data_Association_4 {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.ERROR)
    val ss = SparkSession
      .builder
      .appName("Fpgrowth_1").getOrCreate()
    import ss.implicits._

    val in = ss.read.textFile(args(0))
    val in_2 = in.map(x => x.split("\t")(1))
    val in_3 = in_2.map(t => t.split(",")).toDF("items")
    val fpgrowth = new FPGrowth().setItemsCol("items")
      .setMinSupport(0.1).setMinConfidence(0.6)
    val model = fpgrowth.fit(in_3)

    model.freqItemsets.show(300)
  }
}
And I get the following output:
+-----+----+
|items|freq|
+-----+----+
+-----+----+
This means that with minimum support 0.1 and minimum confidence 0.6 there are no results: on the full 16 GB dataset, no itemset appears in at least 10% of the transactions. Try a lower value, say a minimum support of 0.001; you might get some results (depending on the dataset).
