PySpark import statements running for a very long time - apache-spark

My PySpark code is below. The first cell, which contains the import statements and the session setup, takes a very long time to run in Jupyter: it had not finished after 5 - 6 hours, and it eventually failed with a "Time limit exceeded" error.
I have tried everything, like restarting jupyter, uninstalling anaconda, and then reinstalling, uninstalling spark and pyspark, and then re-installing both of them again. I even removed python completely and then installed it again, BUT THE PROBLEM NEVER SOLVED...!
Edit 1: I realized that the problem is the line spark = init_spark(). It takes a very long time to run (in fact, it does not finish even after 4 - 5 hours).
Please help me with this...
import os
import sys
import pyspark
from pyspark.rdd import RDD
from pyspark.sql import Row
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql import functions
from pyspark.sql.functions import lit, desc, col, size
import pandas as pd
from pandas import DataFrame
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from IPython.core.interactiveshell import InteractiveShell
import matplotlib
from pylab import *
import scipy.stats as stats
# Echo the value of every top-level expression in a cell (not only the last
# one), so results are shown without explicitly calling print().
InteractiveShell.ast_node_interactivity = "all"

# Spark configuration: run locally, with one worker thread per logical core.
conf = pyspark.SparkConf().setMaster("local[*]")


def init_spark():
    """Return the SparkSession for this notebook, creating it if needed.

    The session is built from the module-level ``conf`` and is named
    "Statistical Inferences with Pyspark". Because ``getOrCreate()`` is
    used, an already-running session is reused instead of being replaced.
    """
    spark = (
        SparkSession.builder
        .appName("Statistical Inferences with Pyspark")
        .config(conf=conf)
        .getOrCreate()
    )
    return spark


spark = init_spark()
# Raw string: the Windows path contains backslash sequences (\S, \A, \B, \E,
# \e) that a normal literal treats as invalid escapes and warns about.
filename_data = r'D:\Subjects\ARTIFICIAL INTELLIGENCE\SEMESTER - 5\Big Data and DataBase Management\End Sem Project\endomondoHR_proper.json'
# Load the JSON file into a PySpark data frame; DROPMALFORMED skips records
# that cannot be parsed instead of failing the read.
df = spark.read.json(filename_data, mode="DROPMALFORMED")
print('Data frame type: {}'.format(type(df)))

Related

How to write pyspark dataframe directly into S3 bucket?

I want to save pyspark dataframe directly into s3 bucket. I tried some options but getting error. Can someone help me to solve my problem?
I created one sample pyspark dataframe and tried to save in S3 bucket directly.
I tried below code-
from pyspark.context import SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.functions import *
from pyspark.sql import SQLContext
from pyspark.sql.window import Window
import pyspark.sql.functions as func
from pyspark.sql.functions import last
from pyspark.sql import functions as F
from pyspark.sql.functions import lit
from pyspark.sql.functions import col
from pyspark.sql.functions import unix_timestamp
from functools import reduce
from pyspark.sql.session import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import max
from pyspark.sql.types import *
from pyspark.sql import DataFrame
from pyspark.sql.functions import broadcast
from pyspark.sql.functions import dense_rank
from pyspark.sql.window import Window
from pyspark.sql.functions import abs, lit
#from __future__ import division
import sys
import mysql.connector
import traceback
import json
#from sqlalchemy import create_engine
import os
import math
import os.path
import datetime
from os import getpid
import pymysql.cursors
import time
import signal
from bs4 import BeautifulSoup
import pandas as pd
from pyspark.context import SparkConf
from collections import OrderedDict
import multiprocessing
import multiprocessing as mp
from multiprocessing import Pool
from multiprocessing.pool import ThreadPool
from threading import Thread
from functools import partial
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase
from email.mime.application import MIMEApplication
from email import encoders
import smtplib
import shutil
import glob
from datetime import datetime, date
from pyspark.sql import Row
spark = SparkSession.builder.appName("app_name").getOrCreate()
# Print the Hadoop version bundled with Spark; the hadoop-aws / AWS SDK jars on
# the classpath have to match it for the s3a:// connector to work.
print(spark.sparkContext._gateway.jvm.org.apache.hadoop.util.VersionInfo.getVersion())
sc = spark.sparkContext
aws_access_key_id="*******"
aws_secret_access_key="********"

# The output path below uses the s3a:// scheme, so the credentials must be set
# under the fs.s3a.* keys. The fs.s3.* keys are not read by the S3A connector.
# The credentials provider must also be one that reads those keys:
# DefaultAWSCredentialsProviderChain ignores them and looks at environment
# variables / instance profiles instead.
# NOTE(review): a 403 can also come from missing IAM permissions on the bucket.
hadoop_conf = spark._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.access.key", aws_access_key_id)
hadoop_conf.set("fs.s3a.secret.key", aws_secret_access_key)
hadoop_conf.set(
    "fs.s3a.aws.credentials.provider",
    "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider",
)

# Small sample data frame used to exercise the write.
df = spark.createDataFrame([
    Row(a=1, b=4., c='GFG1', d=date(2000, 8, 1), e=datetime(2000, 8, 1, 12, 0)),
    Row(a=2, b=8., c='GFG2', d=date(2000, 6, 2), e=datetime(2000, 6, 2, 12, 0)),
    Row(a=4, b=5., c='GFG3', d=date(2000, 5, 3), e=datetime(2000, 5, 3, 12, 0)),
])
# show() and printSchema() print by themselves and return None, so wrapping
# them in print() only adds a stray "None" line.
df.show()
df.printSchema()
df.write.format('csv').option('header','true').save('s3a://******/testing_s3/emp.csv',mode='overwrite')
After running this code I am getting below error-
py4j.protocol.Py4JJavaError: An error occurred while calling o48.save.
: com.amazonaws.services.s3.model.AmazonS3Exception: Status Code: 403, AWS Service: Amazon S3, AWS Request ID: RNKTVM6JMDACZ16W, AWS Error Code: null, AWS Error Message: Forbidden, S3 Extended Request ID: MS8lToBlzqSmn1YDdq6SPh7JC6aCKSROuldEz5x9LbsnQdxhKVEQriOpJz5KkCJPBnlk4KgsCkQ=
Please tell me what are the things I am missing in my script. Thanks in advance!!
after creating the spark context use these lines to set the credentials
# Set the S3A credentials on the Hadoop configuration of the running context.
hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.access.key", AWS_ACCESS_KEY_ID)
hadoop_conf.set("fs.s3a.secret.key", AWS_SECRET_ACCESS_KEY)
or
import pyspark
# Alternative: pass the S3A credentials as spark.hadoop.* properties on the
# SparkConf before the context is created.
conf = pyspark.SparkConf()
conf.setAppName('app_name')
conf.setMaster(SPARK_MASTER)
conf.set('spark.hadoop.fs.s3a.access.key', AWS_ACCESS_KEY)
conf.set('spark.hadoop.fs.s3a.secret.key', AWS_SECRET_KEY)
sc = pyspark.SparkContext(conf=conf)

Spark Performance Issue - Writing to S3

I have an AWS Glue job in which I am using PySpark to read a large (30 GB) CSV file on S3 and then save it as Parquet on S3. The job ran for more than 3 hours, after which I killed it. I am not sure why converting the file format would take so long. Is Spark the right tool for this conversion? Below is the code I am using:
import logging
import sys
from datetime import datetime
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from pyspark.sql import SQLContext
import boto3
import time
if __name__ == "__main__":
    sc = SparkContext()
    glueContext = GlueContext(sc)
    job = Job(glueContext)
    sqc = SQLContext(sc)

    # Split the text on the literal token "END", split each piece on "|",
    # and keep only pieces that produced more than one field.
    rdd = (
        sc.textFile("s3://my-bucket/data.txt")
        .flatMap(lambda line: line.split("END"))
        .map(lambda x: x.split("|"))
        .filter(lambda x: len(x) > 1)
    )
    # No schema is given, so Spark has to infer column names and types.
    df = sqc.createDataFrame(rdd)
    print(f'df.rdd.getNumPartitions() - {df.rdd.getNumPartitions()}')
    # Write the frame built above; the original referenced an undefined `df1`,
    # which raises NameError before anything is written.
    df.write.mode('overwrite').parquet('s3://my-bucket/processed')
    job.commit()
Any suggestions for reducing the run time ?

set number of file write attempts in spark context

I'm running pyspark inside of aws glue jobs. As part of my pyspark script I write pyspark dataframes to a directory as parquet files. I would like to modify my spark context so that it will try to write each parquet file to the directory at least 20 times before failing the whole dataframe write attempt. The original version I have of starting my code is below. I've updated the "updated" version below as I think I'm supposed to in order to modify the spark context and use it with the glue context. Can someone please tell me if I've done this correctly or let me know how to fix it? Thanks
Original:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
# Wrap the active SparkContext (creating one if none exists) in a GlueContext
# and take the SparkSession from it.
spark_context = SparkContext.getOrCreate()
glueContext = GlueContext(spark_context)
spark = glueContext.spark_session
updated:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
sc = SparkContext()
# Set the retry limit on the context's Hadoop configuration before any write.
# NOTE(review): confirm that "fs.s3.maxretries" is the key honored by the S3
# connector this Glue version uses.
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3.maxretries", "20")
# A context already exists, so getOrCreate() would just hand back `sc`.
glueContext = GlueContext(sc)
spark = glueContext.spark_session
Your updated code looks right
You can validate if the property is set by printing out the value from the below method
sc.getConf().getAll()

Does spark write dataframes asynchronously

I have two Spark dataframes, df1 and df2. I'm trying to write them out to two different file paths. Can someone tell me whether the writes occur synchronously or asynchronously? That is, since they're two different dataframes being written to two different paths, will the writes occur at the same time, or do I have to wait until df1 has finished writing before df2 starts?
example code:
update added importing libraries:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
# Wrap the active SparkContext (creating one if none exists) in a GlueContext.
spark_context = SparkContext.getOrCreate()
glueContext = GlueContext(spark_context)
# updated 11/19/19 for error caused in error logging function
spark = glueContext.spark_session
from pyspark.sql import Window
from pyspark.sql.functions import col
from pyspark.sql.functions import first
from pyspark.sql.functions import date_format
from pyspark.sql.functions import lit,StringType
from pyspark.sql.types import *
from pyspark.sql.functions import substring, length, min,when,format_number,dayofmonth,hour,dayofyear,month,year,weekofyear,date_format,unix_timestamp
import time
import math
# Write each frame to its own path as Parquet, replacing existing output.
# The two statements run one after the other on this thread.
writer1 = df1.write.mode("overwrite")
writer1.parquet(filepath1)
writer2 = df2.write.mode("overwrite")
writer2.parquet(filepath2)
If it runs on a single thread, it will write one at a time. You can use threading and share the Spark context.

aws glue dropping mostly null fields

I have a dataframe df. It has a couple columns that are mostly null. I'm writing it to an s3 bucket using the code below. I then crawl the s3 bucket to get the table schema in the datacatalog. I'm finding when I crawl the data the fields that are mostly null get dropped. I've checked the json that is output and I'm finding that some records have the field, and others don't. Does anyone know what the issue might be? I would like to include the fields even if they are mostly null.
Code:
# importing libraries
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
# Wrap the active SparkContext (creating one if none exists) in a GlueContext.
glueContext = GlueContext(SparkContext.getOrCreate())
from pyspark.sql.functions import col
from pyspark.sql.functions import first
from pyspark.sql.functions import date_format
from pyspark.sql.functions import lit,StringType
from pyspark.sql.types import *
from pyspark.sql.functions import to_date,format_number,dayofmonth,hour,dayofyear,month,year,weekofyear,date_format,unix_timestamp
from pyspark.sql.functions import *
# write to table
# Spark's JSON writer leaves out fields whose value is null, so a mostly-null
# column is missing from most records. Writing nulls explicitly keeps the
# field in every record.
# NOTE(review): the ignoreNullFields option needs Spark 3.0+; confirm the Glue
# version in use.
df.write.option("ignoreNullFields", "false").json('s3://path/table')
Why not use AWS Glue write method instead of spark DF?
glueContext.write_dynamic_frame.from_options

Resources