Unable to fetch the value of Println in Apache Spark

scala> import org.apache.spark.SparkContext
import org.apache.spark.SparkContext
scala> import org.apache.spark.SparkConf
import org.apache.spark.SparkConf
scala> import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.SparkSession
scala> object rddTest{
| def main(args: Array[String]) = {
| val spark = SparkSession.builder.appName("mapExample").master("local").getOrCreate()
| val rdd1 = spark.sparkContext.parallelize(Seq((1,"jan",2016),(3,"nov",2014),(16,"feb",2014)))
| val rdd2 = spark.sparkContext.parallelize(Seq((5,"dec",2014),(17,"sep",2015)))
| val rdd3 = spark.sparkContext.parallelize(Seq((6,"dec",2011),(16,"may",2015)))
| val rddUnion = rdd1.union(rdd2).union(rdd3)
| rddUnion.foreach(Println)
| }
| }
I am getting this error and I don't know why:
<console>:81: error: not found: value Println
       rddUnion.foreach(Println)

You have a stray upper-case letter: Scala's predefined print method is println, not Println. Try this:
rddUnion.foreach(println)
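For reference, a corrected, self-contained version of the snippet (only the lowercase println changes; the collect-based print at the end is an optional extra, useful on a real cluster where foreach(println) writes to the executors' stdout rather than the driver's):

import org.apache.spark.sql.SparkSession

object rddTest {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("mapExample").master("local").getOrCreate()
    val rdd1 = spark.sparkContext.parallelize(Seq((1, "jan", 2016), (3, "nov", 2014), (16, "feb", 2014)))
    val rdd2 = spark.sparkContext.parallelize(Seq((5, "dec", 2014), (17, "sep", 2015)))
    val rdd3 = spark.sparkContext.parallelize(Seq((6, "dec", 2011), (16, "may", 2015)))
    val rddUnion = rdd1.union(rdd2).union(rdd3)

    // lowercase println is the predefined Scala method; Println does not exist
    rddUnion.foreach(println)

    // on a cluster, collect first to print on the driver (only for small RDDs)
    rddUnion.collect().foreach(println)

    spark.stop()
  }
}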

Related

How to convert an RDD row to a DataFrame with a JSON struct in PySpark?

I am sending the following JSON to the path "/home/host/test" so that the program can capture it with Spark Streaming and run queries over it:
{"id": "1", "description": "test"}
{"id": "1", "description": "test"}
But when I perform the query, the schema looks like this:
root
 |-- word: string (nullable = true)
and I get the following result:
+--------------------+
|                word|
+--------------------+
| {"id": "1", "test"}|
| {"id": "1", "test"}|
+--------------------+
I need the schema to look like this:
root
 |-- id: string (nullable = true)
 |-- description: string (nullable = true)
and I need to get a result like the following:
+----+-----------+
|  id|description|
+----+-----------+
| "1"|     "test"|
| "1"|     "test"|
+----+-----------+
This is my PySpark code:
from __future__ import print_function
import os
import sys

from pyspark import SparkContext
from pyspark.sql.functions import col, explode
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext, Row

if __name__ == "__main__":
    sc = SparkContext(appName="PythonSqlNetworkWordCount")
    ssc = StreamingContext(sc, 3)
    sqlcontextoriginal = SQLContext(sc)

    # Create a socket stream on target ip:port and count the
    # words in input stream of \n delimited text (eg. generated by 'nc')
    lines = ssc.textFileStream("/home/host/test")

    # Convert RDDs of the words DStream to DataFrame and run SQL query
    def process(time, rdd):
        print("========= %s =========" % str(time))
        try:
            # Get the singleton instance of SQLContext
            sqlContext = SQLContext(rdd.context)
            # Convert RDD[String] to RDD[Row] to DataFrame
            rowRdd = rdd.map(lambda w: Row(word=w))
            wordsDataFrame = sqlContext.createDataFrame(rowRdd).toJSON()
            json = sqlContext.read.json(wordsDataFrame)
            # Register as table
            json.createOrReplaceTempView("words")
            json.printSchema()
            wordCountsDataFrame = sqlContext.sql("select * from words ")
            wordCountsDataFrame.show()
        except:
            pass

    lines.foreachRDD(process)
    ssc.start()
    ssc.awaitTermination()
OK, I found the solution.
I had to pass the RDD directly to sqlContext.read.json instead of going through createDataFrame/toJSON first:
json = sqlContext.read.json(rdd)

Apache Spark - window function FIRST_VALUE does not work

I have a problem with the window function Spark API; my question is similar to this one: How to drop duplicates using conditions
I have a dataset :
+---+----------+---------+
| ID| VALUEE| OTHER|
+---+----------+---------+
| 1| null|something|
| 1|[1.0, 0.0]|something|
| 1|[1.0, 0.0]|something|
| 1|[0.0, 2.0]|something|
| 1|[3.0, 5.0]|something|
| 2|[3.0, 5.0]|something|
| 1|[3.0, 5.0]|something|
| 2| null|something|
| 3|[3.0, 5.0]|something|
| 4| null|something|
+---+----------+---------+
I want to keep only one row per ID (no duplicates). I don't care which VALUEE is kept, but I prefer a non-null value.
expected result
+---+----------+---------+
| ID| VALUEE| OTHER|
+---+----------+---------+
| 1|[0.0, 2.0]|something|
| 3|[3.0, 5.0]|something|
| 4| null|something|
| 2|[3.0, 5.0]|something|
+---+----------+---------+
A window function with the aggregate function first() does not work, whereas with row_number() it works, and I don't understand why first() does not work.
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.*;
import org.apache.spark.sql.expressions.Window;
import org.apache.spark.sql.expressions.WindowSpec;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.spark_project.guava.collect.ImmutableList;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import static org.apache.spark.sql.types.DataTypes.IntegerType;
import static org.apache.spark.sql.types.DataTypes.StringType;
import static org.apache.spark.sql.types.DataTypes.createStructField;

public class TestSOF {

    public static void main(String[] args) {

        StructType schema = new StructType(
                new StructField[]{
                        createStructField("ID", IntegerType, false),
                        createStructField("VALUEE", DataTypes.createArrayType(DataTypes.DoubleType), true),
                        createStructField("OTHER", StringType, true),
                });

        double[] a = new double[]{1.0, 0.0};
        double[] b = new double[]{3.0, 5.0};
        double[] c = new double[]{0.0, 2.0};

        List<Row> listOfdata = new ArrayList<>();
        listOfdata.add(RowFactory.create(1, null, "something"));
        listOfdata.add(RowFactory.create(1, a, "something"));
        listOfdata.add(RowFactory.create(1, a, "something"));
        listOfdata.add(RowFactory.create(1, c, "something"));
        listOfdata.add(RowFactory.create(1, b, "something"));
        listOfdata.add(RowFactory.create(2, b, "something"));
        listOfdata.add(RowFactory.create(1, b, "something"));
        listOfdata.add(RowFactory.create(2, null, "something"));
        listOfdata.add(RowFactory.create(3, b, "something"));
        listOfdata.add(RowFactory.create(4, null, "something"));
        List<Row> rowList = ImmutableList.copyOf(listOfdata);

        SparkSession sparkSession = new SparkSession.Builder().config("spark.master", "local[*]").getOrCreate();
        sparkSession.sparkContext().setLogLevel("ERROR");
        Dataset<Row> dataset = sparkSession.createDataFrame(rowList, schema);
        dataset.show();

        WindowSpec windowSpec = Window.partitionBy(dataset.col("ID")).orderBy(dataset.col("VALUEE").asc_nulls_last());

        // groupBy solution
        // loses information (the OTHER column)
        Dataset<Row> dataset0 = dataset.groupBy("ID").agg(functions.first(dataset.col("VALUEE"), true));

        // window solution with row_number
        Dataset<Row> dataset1 = dataset.withColumn("new", functions.row_number().over(windowSpec)).where("new = 1").drop("new");

        // does not work
        Dataset<Row> dataset2 = dataset.withColumn("new", functions.first("VALUEE", true).over(windowSpec)).drop("new");

        JavaRDD<Row> rdd =
                dataset.toJavaRDD()
                        .groupBy(row -> row.getAs("ID"))
                        .map(g -> {
                            Iterator<Row> iter = g._2.iterator();
                            Row rst = null;
                            Row tmp;
                            while (iter.hasNext()) {
                                tmp = iter.next();
                                if (tmp.getAs("VALUEE") != null) {
                                    rst = tmp;
                                    break;
                                }
                                if (rst == null) {
                                    rst = tmp;
                                }
                            }
                            return rst;
                        });
        Dataset<Row> dataset3 = sparkSession.createDataFrame(rdd, schema);

        dataset0.show();
        dataset1.show();
        dataset2.show();
        dataset3.show();
    }
}
first is not a window function in Spark 2.3; it is only an aggregate function, and firstValue is not present in the DataFrame API.
You can use an equivalent of the solution you posted. In your case, the null values will sort first, so:
val df: DataFrame = ???
import df.sparkSession.implicits._
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{col, last}
val id_cols = "ID"
val windowSpec = Window.partitionBy(id_cols).orderBy($"VALUEE".asc)
val list_cols = Seq("VALUEE", "OTHER")
val df_dd = df.select(col(id_cols) +: list_cols.map(x => last(col(x)).over(windowSpec).alias(x)):_*).distinct
For the example data you've provided, the short version is the groupBy solution you already posted:
dataset.groupBy("ID").agg(functions.first(dataset.col("VALUEE"), true)).show();
For an understanding of window functions, and of the relative performance of window functions vs groupBy in Spark, I strongly recommend these presentations by Jacek Laskowski:
https://databricks.com/session/from-basic-to-advanced-aggregate-operators-in-apache-spark-sql-2-2-by-examples-and-their-catalyst-optimizations
https://databricks.com/session/from-basic-to-advanced-aggregate-operators-in-apache-spark-sql-2-2-by-examples-and-their-catalyst-optimizations-continues

Shuffle read and write make the Spark job very slow

I am doing a join between two data frames of about 280 GB and 1 GB respectively.
The join computation itself is fast, but the shuffle read and write take a very long time and make the overall Spark job very slow.
I am using a 10-node m3.2xlarge cluster. Each m3.2xlarge has:
CPU: 8 cores
RAM: 30 GB
HDD: 160 GB
Please suggest what changes I should make to speed up this shuffle read/write phase.
Here is my full code:
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
import org.apache.spark.{ SparkConf, SparkContext }
import java.sql.{Date, Timestamp}
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.udf
import java.io.File
import org.apache.hadoop.fs._
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs._
import org.apache.spark.sql.functions.input_file_name
import org.apache.spark.sql.functions.regexp_extract
val get_cus_val = spark.udf.register("get_cus_val", (filePath: String) => filePath.split("\\.")(3))
val df = sqlContext.read.format("csv").option("header", "true").option("delimiter", "|").option("inferSchema","true").load("s3://trffullfiles/FinancialLineItem/MAIN")
val df1With_ = df.toDF(df.columns.map(_.replace(".", "_")): _*)
val column_to_keep = df1With_.columns.filter(v => (!v.contains("^") && !v.contains("!") && !v.contains("_c"))).toSeq
val df1result = df1With_.select(column_to_keep.head, column_to_keep.tail: _*)
val df1resultFinal=df1result.withColumn("DataPartition", get_cus_val(input_file_name))
val df2 = sqlContext.read.format("csv").option("header", "true").option("delimiter", "|").option("inferSchema","true").load("s3://trffullfiles/FinancialLineItem/INCR")
val df2With_ = df2.toDF(df2.columns.map(_.replace(".", "_")): _*)
val df2column_to_keep = df2With_.columns.filter(v => (!v.contains("^") && !v.contains("!") && !v.contains("_c"))).toSeq
val df2result = df2With_.select(df2column_to_keep.head, df2column_to_keep.tail: _*)
import org.apache.spark.sql.expressions._
val windowSpec = Window.partitionBy("LineItem_organizationId", "LineItem_lineItemId").orderBy($"TimeStamp".cast(LongType).desc)
val latestForEachKey = df2result.withColumn("rank", rank().over(windowSpec)).filter($"rank" === 1).drop("rank", "TimeStamp")
val dfMainOutput = df1resultFinal.join(latestForEachKey, Seq("LineItem_organizationId", "LineItem_lineItemId"), "outer")
  .select($"LineItem_organizationId", $"LineItem_lineItemId",
    when($"DataPartition_1".isNotNull, $"DataPartition_1").otherwise($"DataPartition".cast(DataTypes.StringType)).as("DataPartition"),
    when($"StatementTypeCode_1".isNotNull, $"StatementTypeCode_1").otherwise($"StatementTypeCode").as("StatementTypeCode"),
    when($"LineItemName_1".isNotNull, $"LineItemName_1").otherwise($"LineItemName").as("LineItemName"),
    when($"FinancialConceptCodeGlobalSecondaryId_1".isNotNull, $"FinancialConceptCodeGlobalSecondaryId_1").otherwise($"FinancialConceptCodeGlobalSecondaryId").as("FinancialConceptCodeGlobalSecondaryId"),
    when($"FFAction_1".isNotNull, concat(col("FFAction_1"), lit("|!|"))).otherwise(concat(col("FFAction"), lit("|!|"))).as("FFAction"))
  .filter(!$"FFAction".contains("D"))
val dfMainOutputFinal = dfMainOutput.select($"DataPartition", $"StatementTypeCode",concat_ws("|^|", dfMainOutput.schema.fieldNames.filter(_ != "DataPartition").map(c => col(c)): _*).as("concatenated"))
val headerColumn = df.columns.filter(v => (!v.contains("^") && !v.contains("_c"))).toSeq
val header = headerColumn.dropRight(1).mkString("", "|^|", "|!|")
val dfMainOutputFinalWithoutNull = dfMainOutputFinal.withColumn("concatenated", regexp_replace(col("concatenated"), "null", "")).withColumnRenamed("concatenated", header)
dfMainOutputFinalWithoutNull.write.partitionBy("DataPartition", "StatementTypeCode")
  .format("csv")
  .option("nullValue", "")
  .option("header", "true")
  .option("codec", "gzip")
  .save("s3://trffullfiles/FinancialLineItem/output")

An RDD of chars is to be converted into a DataFrame

The RDD data needs to be converted into a DataFrame, but I am unable to do so: toDF is not working, and I also tried going from an array RDD to a DataFrame. Kindly advise me. This program parses a sample Excel file using Scala and Spark.
import java.io.{File, FileInputStream}
import org.apache.poi.xssf.usermodel.XSSFCell
import org.apache.poi.xssf.usermodel.{XSSFSheet, XSSFWorkbook}
import org.apache.poi.ss.usermodel.Cell._
import org.apache.spark.sql.SQLContext
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerType}

object excel {
  def main(args: Array[String]) = {
    val sc = new SparkContext(new SparkConf().setAppName("Excel Parsing").setMaster("local[*]"))
    val file = new FileInputStream(new File("test.xlsx"))
    val wb = new XSSFWorkbook(file)
    val sheet = wb.getSheetAt(0)
    val rowIterator = sheet.iterator()
    val builder = StringBuilder.newBuilder
    var column = ""
    while (rowIterator.hasNext()) {
      val row = rowIterator.next()
      val cellIterator = row.cellIterator()
      while (cellIterator.hasNext()) {
        val cell = cellIterator.next()
        cell.getCellType match {
          case CELL_TYPE_NUMERIC ⇒ builder.append(cell.getNumericCellValue + ",")
          case CELL_TYPE_BOOLEAN ⇒ builder.append(cell.getBooleanCellValue + ",")
          case CELL_TYPE_STRING  ⇒ builder.append(cell.getStringCellValue + ",")
          case CELL_TYPE_BLANK   ⇒ builder.append(",")
        }
      }
      column = builder.toString()
      println(column)
      builder.setLength(0)
    }
    val data = sc.parallelize(column)
    println(data)
  }
}
To convert a Spark RDD to a DataFrame, you have to create a SQLContext or SparkSession according to your Spark version and then import its implicits:
val sqlContext = new SQLContext(sc)
import sqlContext.implicits._
In case you are using Spark 2.0 or above, use SparkSession instead, as SQLContext is deprecated in the newer releases:
val spark = SparkSession.builder.config(conf).getOrCreate()
import spark.implicits._
This will allow you to use toDF on an RDD.
This might solve your problem!
Note: to use the SQLContext you have to include spark-sql as a dependency.
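For illustration, a minimal sketch of the toDF step under the setup above; the sample strings and column names are hypothetical stand-ins for the comma-separated rows built from the Excel sheet:

import org.apache.spark.sql.SparkSession

object ExcelToDf {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("Excel Parsing").master("local[*]").getOrCreate()
    import spark.implicits._

    // Hypothetical stand-in for the comma-separated row strings from the sheet.
    val rows = Seq("1.0,true,foo", "2.0,false,bar")

    // Parallelize the whole collection of row strings (parallelizing a single
    // String gives an RDD[Char]), split each row, and convert to a DataFrame.
    val df = spark.sparkContext
      .parallelize(rows)
      .map { line =>
        val p = line.split(",")
        (p(0), p(1), p(2))
      }
      .toDF("colA", "colB", "colC")   // hypothetical column names

    df.show()
    spark.stop()
  }
}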

Spark SQL: Get month from week number and year

I have a dataframe with "Week" and "Year" columns and I need to calculate the month for each row, as below:
Input:
+----+----+
|Week|Year|
+----+----+
|  50|2012|
|  50|2012|
|  50|2012|
+----+----+
Expected output:
+----+----+-----+
|Week|Year|Month|
+----+----+-----+
|  50|2012|   12|
|  50|2012|   12|
|  50|2012|   12|
+----+----+-----+
Any help would be appreciated. Thanks
Thanks to @zero323, who pointed me to the sqlContext.sql query, I converted it into the following:
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import static org.apache.spark.sql.functions.*;

public class MonthFromWeekSparkSQL {

    public static void main(String[] args) {

        SparkConf conf = new SparkConf().setAppName("MonthFromWeekSparkSQL").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc);

        List myList = Arrays.asList(RowFactory.create(50, 2012), RowFactory.create(50, 2012), RowFactory.create(50, 2012));
        JavaRDD myRDD = sc.parallelize(myList);

        List<StructField> structFields = new ArrayList<StructField>();

        // Create StructFields
        StructField structField1 = DataTypes.createStructField("week", DataTypes.IntegerType, true);
        StructField structField2 = DataTypes.createStructField("year", DataTypes.IntegerType, true);

        // Add StructFields into list
        structFields.add(structField1);
        structFields.add(structField2);

        // Create StructType from StructFields. This will be used to create DataFrame
        StructType schema = DataTypes.createStructType(structFields);

        DataFrame df = sqlContext.createDataFrame(myRDD, schema);
        DataFrame df2 = df.withColumn("yearAndWeek", concat(col("year"), lit(" "), col("week")))
                .withColumn("month", month(unix_timestamp(col("yearAndWeek"), "yyyy w").cast(("timestamp")))).drop("yearAndWeek");

        df2.show();
    }
}
You actually create a new column with the year and week formatted as "yyyy w", then convert it with unix_timestamp, from which you can pull the month as shown.
PS: It seems the cast behavior was incorrect in Spark 1.5 - https://issues.apache.org/jira/browse/SPARK-11724
So in that case it's more general to do .cast("double").cast("timestamp").
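A minimal Scala sketch of the same approach, assuming a DataFrame df with integer "Week" and "Year" columns as in the question, and legacy SimpleDateFormat parsing (Spark 1.x/2.x; newer Spark versions reject week-based patterns like "w"):

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, concat, lit, month, unix_timestamp}

val df: DataFrame = ???   // has integer "Week" and "Year" columns, as in the question

// Build "2012 50", parse it as a timestamp with the week-of-year pattern,
// then extract the month of the resulting date.
val withMonth = df
  .withColumn("yearAndWeek", concat(col("Year"), lit(" "), col("Week")))
  .withColumn("Month", month(unix_timestamp(col("yearAndWeek"), "yyyy w").cast("timestamp")))
  .drop("yearAndWeek")

withMonth.show()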
