spark streaming not executing spark sql query - apache-spark

I am facing an issue while executing Spark SQL on top of Spark Streaming.
The value of x is never printed at the line var x = sqlContext.sql("select count(*) from prices").
Please find my code below:
import spark.implicits._
import org.apache.spark.SparkConf
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming._
import java.util.regex.{Matcher, Pattern}
val conf = new SparkConf().setAppName("streamHive").setMaster("local[*]").set("spark.driver.allowMultipleContexts", "true")
val ssc = new StreamingContext(conf, Seconds(5))
val sc=ssc.sparkContext
val lines = ssc.textFileStream("file:///home/sdf/testHive")
case class Prices(name: String, age: String,sex: String, location: String)
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
def parse (rdd : org.apache.spark.rdd.RDD[String] ) = {
var l = rdd.map(_.split(","))
val prices = l.map(p => Prices(p(0),p(1),p(2),p(3)))
val pricesDf = sqlContext.createDataFrame(prices)
pricesDf.registerTempTable("prices")
println("showing printdfShow")
pricesDf.show()
var x = sqlContext.sql("select count(*) from prices")
println("hello")
println (x)
}
lines.foreachRDD { rdd => parse(rdd)}
ssc.start()
I am getting the following result; it does not print the Spark SQL result:
[count(1): bigint]
showing printdfShow
+----+---+---+--------+
|name|age|sex|location|
+----+---+---+--------+
+----+---+---+--------+
hello
[count(1): bigint]
showing printdfShow
+----+---+---+--------+
|name|age|sex|location|
+----+---+---+--------+
| rop| 22| M| uk|
| fop| 24| F| us|
| dop| 23| M| fok|
+----+---+---+--------+
hello
[count(1): bigint]
showing printdfShow
+----+---+---+--------+
|name|age|sex|location|
+----+---+---+--------+
+----+---+---+--------+
hello
[count(1): bigint]
Please help me understand how to use Spark SQL within Spark Streaming, as I am new to Spark.

Please try this in your code after pricesDf.show:
println(pricesDf.count)
If you want this in the same code, then try the below instead of println(x):
x.show
x is a DataFrame, not a value; that is why nothing useful is printed when you run println(x). To get the count into a variable, you can try this (count(*) comes back as a bigint, so read it as a Long):
println(x.rdd.map(r => r.getLong(0)).collect()(0))
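Putting those pieces together, a minimal sketch of the corrected parse function (assuming the same Prices case class and sqlContext as in the question) could look like this:
def parse(rdd: org.apache.spark.rdd.RDD[String]): Unit = {
  val prices = rdd.map(_.split(",")).map(p => Prices(p(0), p(1), p(2), p(3)))
  val pricesDf = sqlContext.createDataFrame(prices)
  pricesDf.registerTempTable("prices")
  pricesDf.show()
  val x = sqlContext.sql("select count(*) from prices")
  x.show()                          // renders the count as a one-row table
  val total = x.first().getLong(0)  // count(*) is a bigint, so read it as a Long
  println(s"row count: $total")
}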

Related

Issue in Pyspark code when running Glue Script

import sys
import boto3
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
from pyspark.sql.functions import regexp_replace, col
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
#sc.setLogLevel('DEBUG')
glueContext = GlueContext(sc)
spark = glueContext.spark_session
#logger = glueContext.get_logger()
#logger.DEBUG('Hello Glue')
job = Job(glueContext)
job.init(args["JOB_NAME"], args)
# ####connect to database
client = boto3.client('glue', region_name='XXXXXX')
response = client.get_connection(Name='XXXXXX')
connection_properties = response['Connection']['ConnectionProperties']
URL = connection_properties['JDBC_CONNECTION_URL']
url_list = URL.split("/")
host = "{}".format(url_list[-2][:-5])
new_host=host.split('#',1)[1]
port = url_list[-2][-4:]
database = "{}".format(url_list[-1])
Oracle_Username = "{}".format(connection_properties['USERNAME'])
Oracle_Password = "{}".format(connection_properties['PASSWORD'])
#print("Oracle_Username:",Oracle_Username)
#print("Oracle_Password:",Oracle_Password)
print("Host:",host)
print("New Host:",new_host)
print("Port:",port)
print("Database:",database)
Oracle_jdbc_url="jdbc:oracle:thin:#//"+new_host+":"+port+"/"+database
print("Oracle_jdbc_url:",Oracle_jdbc_url)
source_df = spark.read.format("jdbc").option("url", Oracle_jdbc_url).option("dbtable", "xxgmdmadm.vendor_data").option("user", Oracle_Username).option("password", Oracle_Password).load()
#store the value in array
qrys = source_df.select("SRC_QUERY").collect()
var='select DNB_RESULTS_DTL_ID,REQUEST_ID,source_id,dnb_level from (select max(request_id) over(partition by org_code ) max_request_id,a.* from XXGMDMADM.MDM_DNB_RESULTS_DTL a where dnb_level =''LVL1'' and request_id in (131) ) where MAX_REQUEST_ID=request_id'
dnb_df = spark.read.format("jdbc").option("url", Oracle_jdbc_url).option("query", var).option("user", Oracle_Username).option("password", Oracle_Password).load()
Error I am getting while populating dnb_df:
An error occurred while calling o111.load. ORA-00911: invalid
character
I do not understand what is wrong with this part:
var='select DNB_RESULTS_DTL_ID,REQUEST_ID,source_id,dnb_level from
(select max(request_id) over(partition by org_code )
max_request_id,a.* from XXGMDMADM.MDM_DNB_RESULTS_DTL a where
dnb_level =''LVL1'' and request_id in (131) ) where
MAX_REQUEST_ID=request_id'
Even when I run a simple query like the one below, I get the same error:
var="select DNB_RESULTS_DTL_ID,REQUEST_ID,source_id,dnb_level from
XXGMDMADM.MDM_DNB_RESULTS_DTL"
Does the query option not work for Oracle?
You can use an alias while giving a query; that works well:
.option("dbtable", "(SELECT * FROM schema.tablename) tbl")

Perform INSERT INTO ... SELECT in AWS GLUE

The following script populates a target table with the data fetched from a source table using pyspark.sql and runs without problems in AWS Glue:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql.functions import *
from awsglue.dynamicframe import DynamicFrame
## #params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ["JOB_NAME"])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args["JOB_NAME"], args)
users = glueContext.create_dynamic_frame.from_catalog(
    database="source", table_name="source_users"
)
users.toDF().createOrReplaceTempView("users")
query_users = """
SELECT U.id
     , signup_from
  FROM users AS U
"""
users_df = spark.sql(query_users)
users_dynamicframe = DynamicFrame.fromDF(
    users_df.repartition(1), glueContext, "users_dynamicframe"
)
users_output = glueContext.write_dynamic_frame.from_catalog(
    frame=users_dynamicframe,
    database="target",
    table_name="target_users",
    transformation_ctx="users_output",
)
job.commit()
Now, I would like to perform an INSERT INTO SELECT ... ON DUPLICATE KEY UPDATE ...
and I wrote the following script:
source_users = glueContext.create_dynamic_frame.from_catalog(
    database="source", table_name="source_users"
)
target_users = glueContext.create_dynamic_frame.from_catalog(
    database="target", table_name="target_users"
)
source_users.toDF().createOrReplaceTempView("source_users")
target_users.toDF().createOrReplaceTempView("target_users")
query = """
INSERT INTO target_users
SELECT U.id
     , U.user_type
  FROM source_users
on duplicate key update id=target_users.id
"""
target_output = spark.sql(query)
job.commit()
which returns the following
ParseException: "\nmismatched input 'on' expecting <EOF>
I am not sure how to achieve this, and the reason why I am trying this is to reflect in the target table the updates happening in the source table.
Any help in this direction would be massively appreciated,
Thanks!

Confused by SparkContext import statements

I am trying to learn Apache Spark and cannot wrap my head around this:
import spark.SparkContext
import SparkContext._
Why do we need the second line, which looks almost like the first? And what does the '._' mean after SparkContext?
You do not need to execute the 2nd line, import SparkContext._. For the old approach of, say, Spark 1.6.x with a self-contained Spark app, the following from https://github.com/mk6502/spark-1.6-scala-boilerplate/blob/master/src/main/scala/HelloSpark.scala demonstrates this clearly and briefly:
import org.apache.spark.{SparkContext, SparkConf}

object HelloSpark {
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("hello spark").setMaster("local"))
    val rdd = sc.parallelize(Array(1, 2, 3, 4, 5))
    println("count: ")
    println(rdd.count())
    sc.stop()
  }
}
In notebooks, the settings, configs, and entry points are set up automatically.
As stated in my comment, move on to Spark 2.x, 3.x and look at SparkSession via https://data-flair.training/forums/topic/sparksession-vs-sparkcontext-in-apache-spark/
In the Spark 1.6 guide on Self-Contained Applications we do indeed see the 2nd line, but with no explicit reference to the underlying classes it brings in. E.g.:
/* SimpleApp.scala */
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf

object SimpleApp {
  def main(args: Array[String]) {
    val logFile = "YOUR_SPARK_HOME/README.md" // Should be some file on your system
    val conf = new SparkConf().setAppName("Simple Application")
    val sc = new SparkContext(conf)
    val logData = sc.textFile(logFile, 2).cache()
    val numAs = logData.filter(line => line.contains("a")).count()
    val numBs = logData.filter(line => line.contains("b")).count()
    println("Lines with a: %s, Lines with b: %s".format(numAs, numBs))
  }
}
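For comparison, a minimal sketch of the same example written against the Spark 2.x/3.x SparkSession entry point (the file path is the same placeholder as above):
import org.apache.spark.sql.SparkSession

object SimpleApp {
  def main(args: Array[String]): Unit = {
    // SparkSession is the single entry point; the SparkContext is available as spark.sparkContext
    val spark = SparkSession.builder
      .appName("Simple Application")
      .master("local[*]")
      .getOrCreate()

    val logFile = "YOUR_SPARK_HOME/README.md" // Should be some file on your system
    val logData = spark.read.textFile(logFile).cache()
    val numAs = logData.filter(line => line.contains("a")).count()
    val numBs = logData.filter(line => line.contains("b")).count()
    println("Lines with a: %s, Lines with b: %s".format(numAs, numBs))

    spark.stop()
  }
}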

how to run, once my object is created in spark

Please help me out. I have installed Spark and am now trying to run the code below. The object is defined, but I am confused about what to do next.
scala> import org.apache.spark.SparkContext
import org.apache.spark.SparkContext
scala> import org.apache.spark.SparkConf
import org.apache.spark.SparkConf
scala> import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.SparkSession
scala> object mapTest{
| def main(args: Array[String]) = {
| val spark = SparkSession.builder.appName("mapExample").master("local").getOrCreate()
| val data = spark.read.textFile("file:///home/parv/Desktop/1").rdd
| val mapFile = data.map(line => (line,line.length))
| mapFile.foreach(println)
| }
| }
defined object mapTest
Just call
mapTest.main(Array())
in the Scala shell. main takes an Array[String] argument, so pass an empty array.

spark 1.3 playing with hbase error

I am trying to create an HBase table and insert into it using Spark Core (Spark Streaming afterwards).
I succeeded in creating the table and adding data to it, even though I got this warning:
warning: Class org.apache.hadoop.hbase.classification.InterfaceAudience not found - continuing with a stub.
But when I try to count, I get an error. Can someone help me with the first warning, and with how I can add streaming data into this table?
My code is below:
import org.apache.spark._
import org.apache.spark.rdd.NewHadoopRDD
import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor}
import org.apache.hadoop.hbase.client.HBaseAdmin
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.fs.Path
import org.apache.hadoop.hbase.HColumnDescriptor
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.client.HTable
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.hbase.KeyValue
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles
val tableName = "ziedspark"
val conf = HBaseConfiguration.create()
conf.addResource(new Path("file:///opt/cloudera/parcels/CDH-5.4.7-1.cdh5.4.7.p0.3/etc/hbase/conf.dist/hbase-site.xml"))
conf.set(TableInputFormat.INPUT_TABLE, tableName)
val admin = new HBaseAdmin(conf)
if (!admin.isTableAvailable(tableName)) {
  print("Creating GHbase Table Creating GHbase Table Creating GHbase Table Creating GHbase Table ")
  val tableDesc = new HTableDescriptor(tableName)
  tableDesc.addFamily(new HColumnDescriptor("z1".getBytes()))
  tableDesc.addFamily(new HColumnDescriptor("z2".getBytes()))
  admin.createTable(tableDesc)
} else {
  print("Table already exists!!")
}
val myTable = new HTable(conf, tableName)
for (i <- 414540 to 414545) {
  var p = new Put(Bytes.toBytes("" + i))
  p.add("z1".getBytes(), "name".getBytes(), Bytes.toBytes("" + (i * 5)))
  p.add("z1".getBytes(), "age".getBytes(), Bytes.toBytes("2016-07-01"))
  p.add("z2".getBytes(), "job".getBytes(), Bytes.toBytes("" + i))
  p.add("z2".getBytes(), "salary".getBytes(), Bytes.toBytes("" + i))
  myTable.put(p)
}
myTable.flushCommits()
val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
  classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
  classOf[org.apache.hadoop.hbase.client.Result])
// error here: after creating the table, the count is not working
val count = hBaseRDD.count()
print("HBase RDD count:" + count)
System.exit(0)
Please see a similar question about reading from HBase with Spark:
How to read from hbase using spark
The libraries mentioned there also give you the stubs to read from and write to HBase.
Let me know if you need any more help on the same.
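As for pushing streaming data into the same table, one common pattern is to write from inside foreachRDD / foreachPartition. A minimal sketch only, assuming a DStream of comma-separated "rowkey,name,age" records, a StreamingContext ssc, and the same HBase client API and z1 column family used in the question:
// Hypothetical source; any DStream[String] of "rowkey,name,age" records would do
val lines = ssc.socketTextStream("localhost", 9999)

lines.foreachRDD { rdd =>
  rdd.foreachPartition { records =>
    // Build the HBase connection on the executor, once per partition
    val hconf = HBaseConfiguration.create()
    hconf.addResource(new Path("file:///opt/cloudera/parcels/CDH-5.4.7-1.cdh5.4.7.p0.3/etc/hbase/conf.dist/hbase-site.xml"))
    val table = new HTable(hconf, "ziedspark")
    records.foreach { line =>
      val cols = line.split(",")
      val p = new Put(Bytes.toBytes(cols(0)))
      p.add("z1".getBytes(), "name".getBytes(), Bytes.toBytes(cols(1)))
      p.add("z1".getBytes(), "age".getBytes(), Bytes.toBytes(cols(2)))
      table.put(p)
    }
    table.flushCommits()
    table.close()
  }
}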
