Shuffle read and write make a Spark job finish very slowly - apache-spark

I am doing a join on two data frames of about 280 GB and 1 GB respectively.
The join itself computes quickly, but the shuffle read and write take a very long time, which makes the overall Spark job very slow.
I am using a 10-node cluster of m3.2xlarge instances.
Each m3.2xlarge node has:
CPU: 8 cores
RAM: 30 GB
Disk: 160 GB
Please suggest what changes I should make to speed up this shuffle read/write phase.
My full code is below.
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
import org.apache.spark.{SparkConf, SparkContext}
import java.sql.{Date, Timestamp}
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
// functions._ covers udf, input_file_name and regexp_extract, as well as when, rank,
// concat, lit, col, concat_ws and regexp_replace used further down.
import org.apache.spark.sql.functions._
import java.io.File
import org.apache.hadoop.fs._
import org.apache.hadoop.conf.Configuration
val get_cus_val = spark.udf.register("get_cus_val", (filePath: String) => filePath.split("\\.")(3))
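// Load the MAIN data set (pipe-delimited CSV), normalise column names, drop unwanted
// columns, and derive DataPartition from the input file name.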
val df = sqlContext.read.format("csv").option("header", "true").option("delimiter", "|").option("inferSchema","true").load("s3://trffullfiles/FinancialLineItem/MAIN")
val df1With_ = df.toDF(df.columns.map(_.replace(".", "_")): _*)
val column_to_keep = df1With_.columns.filter(v => (!v.contains("^") && !v.contains("!") && !v.contains("_c"))).toSeq
val df1result = df1With_.select(column_to_keep.head, column_to_keep.tail: _*)
val df1resultFinal=df1result.withColumn("DataPartition", get_cus_val(input_file_name))
val df2 = sqlContext.read.format("csv").option("header", "true").option("delimiter", "|").option("inferSchema","true").load("s3://trffullfiles/FinancialLineItem/INCR")
val df2With_ = df2.toDF(df2.columns.map(_.replace(".", "_")): _*)
val df2column_to_keep = df2With_.columns.filter(v => (!v.contains("^") && !v.contains("!") && !v.contains("_c"))).toSeq
val df2result = df2With_.select(df2column_to_keep.head, df2column_to_keep.tail: _*)
import org.apache.spark.sql.expressions._
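// Keep only the latest record per (LineItem_organizationId, LineItem_lineItemId)
// by ranking on TimeStamp in descending order within each key.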
val windowSpec = Window.partitionBy("LineItem_organizationId", "LineItem_lineItemId").orderBy($"TimeStamp".cast(LongType).desc)
val latestForEachKey = df2result.withColumn("rank", rank().over(windowSpec)).filter($"rank" === 1).drop("rank", "TimeStamp")
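// Full outer join MAIN with the deduplicated INCR records and, for each shared
// column, prefer the *_1 value when it is present.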
val dfMainOutput = df1resultFinal.join(latestForEachKey, Seq("LineItem_organizationId", "LineItem_lineItemId"), "outer")
.select($"LineItem_organizationId", $"LineItem_lineItemId",
when($"DataPartition_1".isNotNull, $"DataPartition_1").otherwise($"DataPartition".cast(DataTypes.StringType)).as("DataPartition"),
when($"StatementTypeCode_1".isNotNull, $"StatementTypeCode_1").otherwise($"StatementTypeCode").as("StatementTypeCode"),
when($"LineItemName_1".isNotNull, $"LineItemName_1").otherwise($"LineItemName").as("LineItemName"),
when($"FinancialConceptCodeGlobalSecondaryId_1".isNotNull, $"FinancialConceptCodeGlobalSecondaryId_1").otherwise($"FinancialConceptCodeGlobalSecondaryId").as("FinancialConceptCodeGlobalSecondaryId"),
when($"FFAction_1".isNotNull, concat(col("FFAction_1"), lit("|!|"))).otherwise(concat(col("FFAction"), lit("|!|"))).as("FFAction"))
.filter(!$"FFAction".contains("D"))
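// Select the partition columns and concatenate every column except DataPartition
// into one "|^|"-delimited string column.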
val dfMainOutputFinal = dfMainOutput.select($"DataPartition", $"StatementTypeCode",concat_ws("|^|", dfMainOutput.schema.fieldNames.filter(_ != "DataPartition").map(c => col(c)): _*).as("concatenated"))
val headerColumn = df.columns.filter(v => (!v.contains("^") && !v.contains("_c"))).toSeq
val header = headerColumn.dropRight(1).mkString("", "|^|", "|!|")
val dfMainOutputFinalWithoutNull = dfMainOutputFinal.withColumn("concatenated", regexp_replace(col("concatenated"), "null", "")).withColumnRenamed("concatenated", header)
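// Write gzip-compressed CSV output partitioned by DataPartition and StatementTypeCode.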
dfMainOutputFinalWithoutNull.write.partitionBy("DataPartition","StatementTypeCode")
.format("csv")
.option("nullValue", "")
.option("header", "true")
.option("codec", "gzip")
.save("s3://trffullfiles/FinancialLineItem/output")

Related

How to solve this TopK problem in Spark?

I have a TopK problem to solve using Spark.
The source file looks like this:
baoshi,13
xinxi,80
baoshi,99
xinxi,32
baoshi,50
xinxi,43
baoshi,210
xinxi,100
Here is my code:
import org.apache.spark.{SparkConf, SparkContext}

object TopKTest {
  def main(args: Array[String]): Unit = {
    val file = "file:///home/hadoop/rdd-test/TopK3.txt"
    val conf = new SparkConf().setAppName("TopKTest").setMaster("local")
    val sc = new SparkContext(conf)
    val txt = sc.textFile(file)
    val rdd2 = txt.map(line => (line.split(",")(0), line.split(",")(1).trim))
    val rdd = rdd2.groupByKey()
    val rdd1 = rdd.map(line => {
      val f = line._1
      val s = line._2
      val t = s.toList.sortWith(_ > _).take(2)
      (f, t)
    })
    rdd1.foreach(println)
  }
}
The expected result is :
(xinxi,List(100, 80))
(baoshi,List(210, 99))
That's because you are comparing Strings, not numbers.
Change
val rdd2 = txt.map(line => (line.split(",")(0), line.split(",")(1).trim))
to
val rdd2 = txt.map(line => (line.split(",")(0), line.split(",")(1).trim.toLong))
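A quick spark-shell check (illustrative only) shows why the String comparison sorts the values in the wrong order:
scala> "99" > "210"   // lexicographic: '9' > '2', so "99" sorts above "210"
res0: Boolean = true
scala> 99L > 210L     // numeric comparison gives the expected order
res1: Boolean = false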
Here is another way, using MLlib's topByKey:
scala> import org.apache.spark.mllib.rdd.MLPairRDDFunctions._
import org.apache.spark.mllib.rdd.MLPairRDDFunctions._
scala> val rdd = spark.sparkContext.textFile("D:\\test\\input.txt")
rdd: org.apache.spark.rdd.RDD[String] = D:\test\input.txt MapPartitionsRDD[1] at textFile at <console>:26
scala> rdd.foreach(println)
xinxi,43
baoshi,13
baoshi,210
xinxi,80
xinxi,100
baoshi,99
xinxi,32
baoshi,50
scala> val rdd1 = rdd.map(row => (row.split(",")(0), row.split(",")(1).toInt))
rdd1: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[2] at map at <console>:28
scala> val rdd2 = rdd1.topByKey(2)
rdd2: org.apache.spark.rdd.RDD[(String, Array[Int])] = MapPartitionsRDD[4] at mapValues at MLPairRDDFunctions.scala:50
scala> val rdd3 = rdd2.map(m => (m._1, m._2.toList))
rdd3: org.apache.spark.rdd.RDD[(String, List[Int])] = MapPartitionsRDD[5] at map at <console>:32
scala> rdd3.foreach(println)
(xinxi,List(100, 80))
(baoshi,List(210, 99))

Spark DataFrame: How to specify schema when writing as Avro

I want to write a DataFrame in Avro format using a provided Avro schema rather than Spark's auto-generated schema. How can I tell Spark to use my custom schema on write?
After applying the patch in https://github.com/databricks/spark-avro/pull/222/, I was able to specify a schema on write as follows:
df.write.option("forceSchema", myCustomSchemaString).avro("/path/to/outputDir")
Hope the method below helps.
import com.databricks.spark.avro._  // provides the .avro(...) method on DataFrameWriter
import org.apache.spark.sql.types._
val schema = StructType(StructField("title", StringType, true) :: StructField("averageRating", DoubleType, false) :: StructField("numVotes", IntegerType, false) :: Nil)
titleMappedDF.write.option("avroSchema", schema.toString).avro("/home/cloudera/workspace/movies/avrowithschema")
Example:
Download the data from https://datasets.imdbws.com/ (the movies ratings file title.ratings.tsv.gz).
Copy it to /home/cloudera/workspace/movies/title.ratings.tsv.gz.
Start spark-shell and run the commands below.
import org.apache.spark.sql.SQLContext
val sqlContext = new SQLContext(sc)
val title = sqlContext.read.text("file:///home/cloudera/Downloads/movies/title.ratings.tsv.gz")
scala> title.limit(5).show
+--------------------+
| value|
+--------------------+
|tconst averageRat...|
| tt0000001 5.8 1350|
| tt0000002 6.5 157|
| tt0000003 6.6 933|
| tt0000004 6.4 93|
+--------------------+
val titlerdd = title.rdd
case class Title(titleId:String, averageRating:Float, numVotes:Int)
val titlefirst = titlerdd.first
val titleMapped = titlerdd.filter(e => e != titlefirst).map(e => {
  val rowStr = e.getString(0)
  val splitted = rowStr.split("\t")
  val titleId = splitted(0).trim
  val averageRating = scala.util.Try(splitted(1).trim.toFloat) getOrElse 0.0f
  val numVotes = scala.util.Try(splitted(2).trim.toInt) getOrElse 0
  Title(titleId, averageRating, numVotes)
})
val titleMappedDF = titleMapped.toDF
scala> titleMappedDF.limit(2).show
+---------+-------------+--------+
| titleId|averageRating|numVotes|
+---------+-------------+--------+
|tt0000001| 5.8| 1350|
|tt0000002| 6.5| 157|
+---------+-------------+--------+
import org.apache.spark.sql.types._
val schema = StructType(StructField("title", StringType, true) :: StructField("averageRating", DoubleType, false) :: StructField("numVotes", IntegerType, false) :: Nil)
titleMappedDF.write.option("avroSchema", schema.toString).avro("/home/cloudera/workspace/movies/avrowithschema")
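Note that the built-in Avro data source in Spark 2.4+ expects the avroSchema option to be an Avro JSON schema string rather than the result of StructType.toString. A hedged sketch of that variant, assuming Spark 2.4+ with the spark-avro module on the classpath and field names matching the titleMappedDF columns above:
// Assumption: Spark 2.4+ bundled Avro source; adjust nullability to your data.
val avroSchemaJson = """
  {"type": "record", "name": "Title", "fields": [
    {"name": "titleId", "type": ["null", "string"], "default": null},
    {"name": "averageRating", "type": "float"},
    {"name": "numVotes", "type": "int"}]}
"""
titleMappedDF.write
  .format("avro")
  .option("avroSchema", avroSchemaJson)
  .save("/home/cloudera/workspace/movies/avrowithschema")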

GraphFrames Connected Components Performance

When I attempt to generate the connected components using GraphFrames, it takes substantially longer than I expected. I am running Spark 2.1 and GraphFrames 0.5 on AWS EMR with 3 r4.xlarge instances. Generating the connected components for a graph of about 12 million edges takes around 3 hours.
The code is below. I am fairly new to Spark, so any suggestions would be awesome.
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.graphframes.GraphFrame

def main(args: Array[String]): Unit = {
  val sparkConf = new SparkConf()
    .setMaster("yarn-cluster")
    .setAppName("Connected Component")
  val sc = new SparkContext(sparkConf)
  sc.setCheckpointDir("s3a://......")
  AWSUtils.setS3Credentials(sc.hadoopConfiguration)
  implicit val sqlContext = SQLContext.getOrCreate(sc)
  import sqlContext.implicits._
  val historical = sqlContext
    .read
    .option("mergeSchema", "false")
    .parquet("s3a://.....")
    .map(x => (x(0).toString, x(2).toString, x(1).toString, x(3).toString, x(4).toString.toLong, x(5).toString.toLong))
  // Complete graph
  val g = GraphFrame(
    historical.flatMap(e => List((e._1, e._3, e._5), (e._2, e._4, e._5))).toDF("id", "type", "timestamp"),
    historical.toDF("src", "dst", "srcType", "dstType", "timestamp", "companyId")
  )
  val connectedComponents: DataFrame = g.connectedComponents.run()
  connectedComponents.toDF().show(100, false)
  sc.stop()
}
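A hedged sketch of changes that sometimes help here, not a verified fix: caching the vertex and edge DataFrames avoids recomputing the Parquet read and the maps on every iteration, and the ConnectedComponents builder in GraphFrames 0.5 exposes setAlgorithm and setCheckpointInterval (names taken from that API; verify them against your version).
val vertices = historical
  .flatMap(e => List((e._1, e._3, e._5), (e._2, e._4, e._5)))
  .toDF("id", "type", "timestamp")
  .cache()
val edges = historical
  .toDF("src", "dst", "srcType", "dstType", "timestamp", "companyId")
  .cache()
val g = GraphFrame(vertices, edges)
val connectedComponents = g.connectedComponents
  .setAlgorithm("graphframes")   // assumption: "graphframes" (default) or "graphx" are accepted
  .setCheckpointInterval(2)      // checkpoint more frequently to keep the lineage short
  .run()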

Spark: Why does Python read input files twice but Scala doesn't when creating a DataFrame?

I'm using Spark v1.5.2. I wrote a program in Python and I don't understand why it reads the input files twice. The same program written in Scala only reads the input files once.
I use an accumulator to count the number of times that map() is called. From the accumulator value, I infer the number of times the input file is read.
The input file contains 3 lines of text.
Python:
from pyspark import SparkContext, SQLContext
from pyspark.sql.types import *
def createTuple(record): # used with map()
    global map_acc
    map_acc += 1
    return (record[0], record[1].strip())
sc = SparkContext(appName='Spark test app') # appName is shown in the YARN UI
sqlContext = SQLContext(sc)
map_acc = sc.accumulator(0)
lines = sc.textFile("examples/src/main/resources/people.txt")
people_rdd = lines.map(lambda l: l.split(",")).map(createTuple) #.cache()
fieldNames = 'name age'
fields = [StructField(field_name, StringType(), True) for field_name in fieldNames.split()]
schema = StructType(fields)
df = sqlContext.createDataFrame(people_rdd, schema)
print 'record count DF:', df.count()
print 'map_acc:', map_acc.value
#people_rdd.unpersist()
$ spark-submit --master local[1] test.py 2> err
record count DF: 3
map_acc: 6 ##### why 6 instead of 3??
Scala:
import org.apache.spark._
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StructType, StructField, StringType}

object SimpleApp {
  def main(args: Array[String]) {
    def createTuple(record: Array[String], map_acc: Accumulator[Int]) = { // used with map()
      map_acc += 1
      Row(record(0), record(1).trim)
    }
    val conf = new SparkConf().setAppName("Scala Test App")
    val sc = new SparkContext(conf)
    val sqlContext = new org.apache.spark.sql.SQLContext(sc)
    val map_acc = sc.accumulator(0)
    val lines = sc.textFile("examples/src/main/resources/people.txt")
    val people_rdd = lines.map(_.split(",")).map(createTuple(_, map_acc))
    val fieldNames = "name age"
    val schema = StructType(
      fieldNames.split(" ").map(fieldName => StructField(fieldName, StringType, true)))
    val df = sqlContext.createDataFrame(people_rdd, schema)
    println("record count DF: " + df.count)
    println("map_acc: " + map_acc.value)
  }
}
$ spark-submit --class SimpleApp --master local[1] test.jar 2> err
record count DF: 3
map_acc: 3
If I uncomment the cache() and unpersist() calls in the Python program, the input files are not read twice. However, I don't think I should have to cache the RDD, right? In the Scala version I don't need to cache the RDD.
people_rdd = lines.map(lambda l: l.split(",")).map(createTuple).cache()
...
people_rdd.unpersist()
$ spark-submit --master local[1] test.py 2> err
record count DF: 3
map_acc: 3
$ hdfs dfs -cat examples/src/main/resources/people.txt
Michael, 29
Andy, 30
Justin, 19
It happens because in 1.5 createDataFrame eagerly validates the provided schema on a few elements:
elif isinstance(schema, StructType):
    # take the first few rows to verify schema
    rows = rdd.take(10)
    for row in rows:
        _verify_type(row, schema)
In contrast, current versions validate the schema for all elements, but the validation is done lazily, so you won't see the same behavior. For example, this fails immediately in 1.5:
from pyspark.sql.types import *
rdd = sc.parallelize([("foo", )])
schema = StructType([StructField("foo", IntegerType(), False)])
sqlContext.createDataFrame(rdd, schema)
but the 2.0 equivalent fails only when you try to evaluate the DataFrame.
In general you shouldn't expect Python and Scala code to behave the same way unless you strictly limit yourself to interactions with the SQL API. PySpark:
Implements almost all RDD methods natively, so the same chain of transformations can result in a different DAG.
Interactions with the Java API may require eager evaluation to provide type information for Java classes.

An RDD of chars is to be converted into a DataFrame

The RDD data needs to be converted into a DataFrame, but I am unable to do so: toDF is not working, and I also tried going from an array RDD to a DataFrame. Kindly advise me. This program parses a sample Excel file using Scala and Spark.
import java.io.{File, FileInputStream}
import org.apache.poi.xssf.usermodel.XSSFCell
import org.apache.poi.xssf.usermodel.{XSSFSheet, XSSFWorkbook}
import org.apache.poi.ss.usermodel.Cell._
import org.apache.spark.sql.SQLContext
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerType}

object excel {
  def main(args: Array[String]) = {
    val sc = new SparkContext(new SparkConf().setAppName("Excel Parsing").setMaster("local[*]"))
    val file = new FileInputStream(new File("test.xlsx"))
    val wb = new XSSFWorkbook(file)
    val sheet = wb.getSheetAt(0)
    val rowIterator = sheet.iterator()
    val builder = StringBuilder.newBuilder
    var column = ""
    while (rowIterator.hasNext()) {
      val row = rowIterator.next()
      val cellIterator = row.cellIterator()
      while (cellIterator.hasNext()) {
        val cell = cellIterator.next()
        cell.getCellType match {
          case CELL_TYPE_NUMERIC => builder.append(cell.getNumericCellValue + ",")
          case CELL_TYPE_BOOLEAN => builder.append(cell.getBooleanCellValue + ",")
          case CELL_TYPE_STRING => builder.append(cell.getStringCellValue + ",")
          case CELL_TYPE_BLANK => builder.append(",")
        }
      }
      column = builder.toString()
      println(column)
      builder.setLength(0)
    }
    val data = sc.parallelize(column)
    println(data)
  }
}
To convert a Spark RDD to a DataFrame, you have to create a SQLContext or a SparkSession, depending on your Spark version, and then use:
val sqlContext = new SQLContext(sc)
import sqlContext.implicits._
In case you are using Spark 2.0 or above, use SparkSession instead, as SQLContext is deprecated in the newer releases:
val spark = SparkSession.builder.config(conf).getOrCreate()
import spark.implicits._
This will allow you to use toDF on an RDD.
This might solve your problem!
Note: to use the SQLContext you have to include spark-sql as a dependency.
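A minimal sketch of how the parsed rows from the question could then be turned into a DataFrame, assuming each Excel row is kept as one comma-separated string instead of overwriting the single column variable (the column names col1/col2 and the two-column layout are assumptions for illustration):
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ArrayBuffer

val spark = SparkSession.builder.appName("Excel Parsing").master("local[*]").getOrCreate()
import spark.implicits._

// Collect one comma-separated string per Excel row while iterating the sheet,
// e.g. inside the row loop: rows += builder.toString()
val rows = ArrayBuffer[String]()

val df = spark.sparkContext
  .parallelize(rows)            // RDD[String], one element per Excel row
  .map(_.split(",").map(_.trim))
  .map(a => (a(0), a(1)))       // assumes at least two cells per row
  .toDF("col1", "col2")
df.show()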
