I have a function all_purch_spark() that sets up a SparkContext and a SQLContext and loads five different tables. The same function then successfully runs a SQL query against an AWS Redshift DB. It works great. I am including the entire function below (stripped of sensitive data, of course). Please forgive its length, but I wanted to show it as is given the problem I am facing.
My problem is with the second function repurch_prep() and how it calls the first function all_purch_spark(). I can't figure out how to avoid errors such as this one: NameError: name 'sqlContext' is not defined
I will show the two functions and error below.
Here is the first function all_purch_spark(). Again I put the whole function here for reference. I know it is long but wasn't sure I could reduce it to a meaningful example.
def all_purch_spark():
config = {
'redshift_user': 'tester123',
'redshift_pass': '*****************',
'redshift_port': "5999",
'redshift_db': 'my_database',
'redshift_host': 'redshift.my_database.me',
}
from pyspark import SparkContext, SparkConf, SQLContext
jars = [
"/home/spark/SparkNotebooks/src/service/RedshiftJDBC42-no-awssdk-1.2.41.1065.jar"
]
conf = (
SparkConf()
.setAppName("S3 with Redshift")
.set("spark.driver.extraClassPath", ":".join(jars))
.set("spark.hadoop.fs.s3a.path.style.access", True)
.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
.set("com.amazonaws.services.s3.enableV4", True)
.set("spark.hadoop.fs.s3a.endpoint", f"s3-{config.get('region')}.amazonaws.com")
.set("spark.executor.extraJavaOptions", "-Dcom.amazonaws.services.s3.enableV4=true")
.set("spark.driver.extraJavaOptions", "-Dcom.amazonaws.services.s3.enableV4=true")
)
sc = SparkContext(conf=conf).getOrCreate()
sqlContext = SQLContext(sc)
##Set Schema and table to query
schema1 = 'production'
schema2 = 'X4production'
table1 = 'purchases'
table2 = 'customers'
table3 = 'memberships'
table4 = 'users' #set as users table in both schemas
purchases_df = sqlContext.read \
.format("jdbc") \
.option("url", f"jdbc:postgresql://{config.get('redshift_host')}:{config.get('redshift_port')}/{config.get('redshift_db')}") \
.option("dbtable", f"{schema1}.{table1}") \
.option("user", config.get('redshift_user')) \
.option("password", config.get('redshift_pass')) \
.load()
customers_df = sqlContext.read \
.format("jdbc") \
.option("url", f"jdbc:postgresql://{config.get('redshift_host')}:{config.get('redshift_port')}/{config.get('redshift_db')}") \
.option("dbtable", f"{schema1}.{table2}") \
.option("user", config.get('redshift_user')) \
.option("password", config.get('redshift_pass')) \
.load()
memberships_df = sqlContext.read \
.format("jdbc") \
.option("url", f"jdbc:postgresql://{config.get('redshift_host')}:{config.get('redshift_port')}/{config.get('redshift_db')}") \
.option("dbtable", f"{schema1}.{table3}") \
.option("user", config.get('redshift_user')) \
.option("password", config.get('redshift_pass')) \
.load()
users_df = sqlContext.read \
.format("jdbc") \
.option("url", f"jdbc:postgresql://{config.get('redshift_host')}:{config.get('redshift_port')}/{config.get('redshift_db')}") \
.option("dbtable", f"{schema1}.{table4}") \
.option("user", config.get('redshift_user')) \
.option("password", config.get('redshift_pass')) \
.load()
cusers_df = sqlContext.read \
.format("jdbc") \
.option("url", f"jdbc:postgresql://{config.get('redshift_host')}:{config.get('redshift_port')}/{config.get('redshift_db')}") \
.option("dbtable", f"{schema2}.{table4}") \
.option("user", config.get('redshift_user')) \
.option("password", config.get('redshift_pass')) \
.load()
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('fc_purchases').getOrCreate()
purchases_df.createOrReplaceTempView('purchases')
customers_df.createOrReplaceTempView('customers')
memberships_df.createOrReplaceTempView('memberships')
users_df.createOrReplaceTempView('users')
cusers_df.createOrReplaceTempView('cusers')
all_purch = spark.sql("SELECT \
p_paid.customer_id AS p_paid_user_id \
,p_trial.created_at AS trial_start_date \
,p_paid.created_at \
,cu.graduation_year \
,lower(cu.student_year) AS student_year \
,lower(p_paid.description) as product \
,u.email \
,u.id AS u_user_id \
,cu.id AS cu_user_id \
FROM \
purchases AS p_paid \
INNER JOIN purchases AS p_trial ON p_trial.customer_id = p_paid.customer_id \
INNER JOIN customers AS c on c.id = p_paid.customer_id \
INNER JOIN memberships AS m on m.id = c.membership_id \
INNER JOIN users AS u on u.id = m.user_id \
INNER JOIN cusers AS cu on cu.id = u.id \
WHERE \
p_trial.created_at >= '2018-03-01' \
AND p_paid.created_at >= '2018-03-01' \
AND u.institution_contract = false \
AND LOWER(u.email) not like '%hotmail.me%' \
AND LOWER(u.email) not like '%gmail.com%' \
AND p_trial.description like '% Day Free Trial' \
AND p_paid.status = 'paid' \
GROUP BY \
p_paid_user_id \
,trial_start_date \
,p_paid.created_at \
,u.email \
,cu.graduation_year \
,student_year \
,product \
,cu_user_id \
,u_user_id \
ORDER BY p_paid_user_id")
all_purch.registerTempTable("all_purch_table")
return all_purch
Here is the second function, which calls the function above. It is supposed to select against the temporary views registered in that function:
def repurch_prep():
    all_purch_spark()
    all_repurch = sqlContext.sql("SELECT * FROM all_purch_table WHERE p_paid_user_id IN \
        (SELECT p_paid_user_id FROM all_purch_table GROUP BY p_paid_user_id HAVING COUNT(*) > 1) \
        ORDER BY p_paid_user_id ASC")
    return all_repurch
When I run repurch_prep() it throws the following exception, even though the SQLContext is defined in the function above. I have tried returning values from that function but can't figure out how to get this to work:
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
in
----> 1 repurch_prep()
~/spark/SparkNotebooks/firecracker/utils_prod_db_spark.py in repurch_prep()
735 #sc = SparkContext().getOrCreate()
736 #sqlContext = SQLContext()
--> 737 all_repurch = sqlContext.sql("SELECT * FROM all_purch_table WHERE p_paid_user_id IN \
738 (SELECT p_paid_user_id FROM all_purch_table GROUP BY p_paid_user_id HAVING COUNT(*) > 1) \
739 ORDER BY p_paid_user_id ASC")
NameError: name 'sqlContext' is not defined
Any help greatly appreciated.
The solution, per @Lamanus, was to place the variables outside of the function, making them global, rather than defining them inside one function (as I did) and calling that function from another.
############### SPARK REDSHIFT GLOBAL CONFIG #####################
config = {
'redshift_user': 'tester123',
'redshift_pass': '*****************',
'redshift_port': "5999",
'redshift_db': 'my_database',
'redshift_host': 'redshift.my_database.me',
}
from pyspark import SparkContext, SparkConf, SQLContext
jars = [
"/home/spark/SparkNotebooks/src/service/RedshiftJDBC42-no-awssdk-1.2.41.1065.jar"
]
conf = (
SparkConf()
.setAppName("S3 with Redshift")
.set("spark.driver.extraClassPath", ":".join(jars))
.set("spark.hadoop.fs.s3a.path.style.access", True)
.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
.set("com.amazonaws.services.s3.enableV4", True)
.set("spark.hadoop.fs.s3a.endpoint", f"s3-{config.get('region')}.amazonaws.com")
.set("spark.executor.extraJavaOptions", "-Dcom.amazonaws.services.s3.enableV4=true")
.set("spark.driver.extraJavaOptions", "-Dcom.amazonaws.services.s3.enableV4=true")
)
sc = SparkContext(conf=conf).getOrCreate()
###############################################################
def all_purch_spark():
    sqlContext = SQLContext(sc)
    ##Set Schema and table to query
    schema1 = 'production'
    schema2 = 'X4production'
    table1 = 'purchases'
    table2 = 'customers'
    table3 = 'memberships'
    table4 = 'users' #set as users table in both schemas
    purchases_df = sqlContext.read \
        .format("jdbc") \
        .option("url", f"jdbc:postgresql://{config.get('redshift_host')}:{config.get('redshift_port')}/{config.get('redshift_db')}") \
        .option("dbtable", f"{schema1}.{table1}") \
        .option("user", config.get('redshift_user')) \
        .option("password", config.get('redshift_pass')) \
        .load()
    customers_df = sqlContext.read \
        .format("jdbc") \
        .option("url", f"jdbc:postgresql://{config.get('redshift_host')}:{config.get('redshift_port')}/{config.get('redshift_db')}") \
        .option("dbtable", f"{schema1}.{table2}") \
        .option("user", config.get('redshift_user')) \
        .option("password", config.get('redshift_pass')) \
        .load()
    memberships_df = sqlContext.read \
        .format("jdbc") \
        .option("url", f"jdbc:postgresql://{config.get('redshift_host')}:{config.get('redshift_port')}/{config.get('redshift_db')}") \
        .option("dbtable", f"{schema1}.{table3}") \
        .option("user", config.get('redshift_user')) \
        .option("password", config.get('redshift_pass')) \
        .load()
    users_df = sqlContext.read \
        .format("jdbc") \
        .option("url", f"jdbc:postgresql://{config.get('redshift_host')}:{config.get('redshift_port')}/{config.get('redshift_db')}") \
        .option("dbtable", f"{schema1}.{table4}") \
        .option("user", config.get('redshift_user')) \
        .option("password", config.get('redshift_pass')) \
        .load()
    cusers_df = sqlContext.read \
        .format("jdbc") \
        .option("url", f"jdbc:postgresql://{config.get('redshift_host')}:{config.get('redshift_port')}/{config.get('redshift_db')}") \
        .option("dbtable", f"{schema2}.{table4}") \
        .option("user", config.get('redshift_user')) \
        .option("password", config.get('redshift_pass')) \
        .load()
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.appName('fc_purchases').getOrCreate()
    purchases_df.createOrReplaceTempView('purchases')
    customers_df.createOrReplaceTempView('customers')
    memberships_df.createOrReplaceTempView('memberships')
    users_df.createOrReplaceTempView('users')
    cusers_df.createOrReplaceTempView('cusers')
    all_purch = spark.sql("SELECT \
        p_paid.customer_id AS p_paid_user_id \
        ,p_trial.created_at AS trial_start_date \
        ,p_paid.created_at \
        ,cu.graduation_year \
        ,lower(cu.student_year) AS student_year \
        ,lower(p_paid.description) as product \
        ,u.email \
        ,u.id AS u_user_id \
        ,cu.id AS cu_user_id \
        FROM \
        purchases AS p_paid \
        INNER JOIN purchases AS p_trial ON p_trial.customer_id = p_paid.customer_id \
        INNER JOIN customers AS c on c.id = p_paid.customer_id \
        INNER JOIN memberships AS m on m.id = c.membership_id \
        INNER JOIN users AS u on u.id = m.user_id \
        INNER JOIN cusers AS cu on cu.id = u.id \
        WHERE \
        p_trial.created_at >= '2018-03-01' \
        AND p_paid.created_at >= '2018-03-01' \
        AND u.institution_contract = false \
        AND LOWER(u.email) not like '%hotmail.me%' \
        AND LOWER(u.email) not like '%gmail.com%' \
        AND p_trial.description like '% Day Free Trial' \
        AND p_paid.status = 'paid' \
        GROUP BY \
        p_paid_user_id \
        ,trial_start_date \
        ,p_paid.created_at \
        ,u.email \
        ,cu.graduation_year \
        ,student_year \
        ,product \
        ,cu_user_id \
        ,u_user_id \
        ORDER BY p_paid_user_id")
    all_purch.registerTempTable("all_purch_table")
    return all_purch
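An alternative to module-level globals (just a sketch, not from the accepted answer): the temporary views are registered against the active Spark session rather than against a Python variable, so the caller can simply grab that session again with getOrCreate() and query the view there:

from pyspark.sql import SparkSession

def repurch_prep():
    all_purch_spark()
    # getOrCreate() returns the same active session that registered the temp views,
    # so all_purch_table is visible here without a global sqlContext.
    spark = SparkSession.builder.appName('fc_purchases').getOrCreate()
    all_repurch = spark.sql("SELECT * FROM all_purch_table WHERE p_paid_user_id IN \
        (SELECT p_paid_user_id FROM all_purch_table GROUP BY p_paid_user_id HAVING COUNT(*) > 1) \
        ORDER BY p_paid_user_id ASC")
    return all_repurch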
Related
The code below is working as it should, i.e. data is written to the output table and is selectable from the table within 10 seconds. The problem is that foreachBatch is not executed.
When I tested it with .format("console") and called .start(), foreachBatch was run. So it feels like .toTable() is to blame here.
This code uses the Kafka connector, but the same problem existed with the Event Hubs connector.
If I try to add .start() after toTable() I get the error
'StreamingQuery' object has no attribute 'start'
Here is the code that is working except for foreachBatch:
TOPIC = "myeventhub"
BOOTSTRAP_SERVERS = "myeventhub.servicebus.windows.net:9093"
EH_SASL = "kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required username=\"$ConnectionString\" password=\"Endpoint=sb://myeventhub.servicebus.windows.net/;SharedAccessKeyName=mykeyname;SharedAccessKey=mykey;EntityPath=myentitypath;\";"
df = spark.readStream \
.format("kafka") \
.option("subscribe", TOPIC) \
.option("kafka.bootstrap.servers", BOOTSTRAP_SERVERS) \
.option("kafka.sasl.mechanism", "PLAIN") \
.option("kafka.security.protocol", "SASL_SSL") \
.option("kafka.sasl.jaas.config", EH_SASL) \
.option("kafka.request.timeout.ms", "60000") \
.option("kafka.session.timeout.ms", "60000") \
.option("failOnDataLoss", "false") \
.option("startingOffsets", "earliest") \
.load()
n = 100
count = 0
def run_command(batchDF, epoch_id):
    global count
    count += 1
    if count % n == 0:
        spark.sql("OPTIMIZE firstcatalog.bronze.factorydatas3 ZORDER BY (readtimestamp)")
...Omitted code where I transform the data in the value column to strongly typed data...
myTypedDF.writeStream \
.foreachBatch(run_command) \
.format("delta") \
.outputMode("append") \
.option("checkpointLocation", "/tmp/delta/events/_checkpoints/") \
.partitionBy("somecolumn") \
.toTable("myunitycatalog.bronze.mytable")
You either do foreachBatch or toTable, but not both. You can move the write to the table inside the foreachBatch function - just make sure that you do idempotent writes, because a batch could be restarted. Change your code to this:
def run_command(batchDF, epoch_id):
    global count
    batchDF.write.format("delta") \
        .option("txnVersion", epoch_id) \
        .option("txnAppId", "my_app") \
        .partitionBy("somecolumn") \
        .mode("append") \
        .saveAsTable("myunitycatalog.bronze.mytable")
    count += 1
    if count % n == 0:
        spark.sql("OPTIMIZE myunitycatalog.bronze.mytable ZORDER BY (readtimestamp)")
myTypedDF.writeStream \
.foreachBatch(run_command) \
.outputMode("append") \
.option("checkpointLocation", "/tmp/delta/events/_checkpoints/") \
.start()
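If you also want to block on or inspect the stream, a minimal sketch (the query variable name is just an assumption): keep the handle returned by start() instead of discarding it.

query = myTypedDF.writeStream \
    .foreachBatch(run_command) \
    .outputMode("append") \
    .option("checkpointLocation", "/tmp/delta/events/_checkpoints/") \
    .start()

print(query.status)        # current state of the streaming query
print(query.lastProgress)  # metrics for the most recent micro-batch
query.awaitTermination()   # optionally block until the stream stops or fails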
I am trying to connect to a MS SQL database from PySpark using spark.read.jdbc.
import os
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext;
from pyspark.sql.session import SparkSession
sc = SparkContext('xx')
spark = SparkSession(sc)
spark.read.jdbc('DESKTOP-XXXX\SQLEXPRESS',
"""(select COL1, COL2 from tbl1 WHERE COL1 = 2) """,
properties={'user': sa, 'password': 12345, 'driver': xxxx})
I do not know which parameters I should pass for sc = SparkContext('xx') and for 'driver': xxxx.
Replace serveraddress with the address of your database server:
sc = SparkContext()
spark = SparkSession(sc)
spark.read \
    .format('jdbc') \
    .option('url', 'jdbc:sqlserver://serveraddress:1433') \
    .option('user', 'sa') \
    .option('password', '12345') \
    .option('dbtable', '(select COL1, COL2 from tbl1 WHERE COL1 = 2)') \
    .load()
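If you want to keep the spark.read.jdbc(...) form from the question, a minimal sketch (the host, instance, database and subquery alias are placeholders; the driver class is the standard Microsoft JDBC driver, whose jar must be on the Spark classpath):

url = "jdbc:sqlserver://DESKTOP-XXXX;instanceName=SQLEXPRESS;databaseName=mydb"
df = spark.read.jdbc(
    url=url,
    table="(select COL1, COL2 from tbl1 WHERE COL1 = 2) as q",  # a derived table needs an alias
    properties={
        "user": "sa",
        "password": "12345",
        "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver",
    },
)
df.show()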
I am able to read data from a Kafka topic and print the data on the console using Spark streaming.
I want the data to be in a DataFrame format.
Here is my code:
spark = SparkSession \
.builder \
.appName("StructuredSocketRead") \
.getOrCreate()
spark.sparkContext.setLogLevel('ERROR')
lines = spark \
.readStream \
.format("kafka") \
.option("kafka.bootstrap.servers","********") \
.option("subscribe","******") \
.option("startingOffsets", "earliest") \
.load()
readable = lines.selectExpr("CAST(value AS STRING)")
query = readable \
.writeStream \
.outputMode("append") \
.format("console") \
.option("truncate", "False") \
.start()
query.awaitTermination()
The output is in JSON format. How do I convert this into a DataFrame? Please find the output below:
{"items": [{"SKU": "23565", "title": "EGG CUP MILKMAID HELGA ", "unit_price": 2.46, "quantity": 2}], "type": "ORDER", "country": "United Kingdom", "invoice_no": 154132541847735, "timestamp": "2020-11-02 20:56:01"}
IIUC, please use explode() and getItem() in order to create a DataFrame out of the JSON.
Create the dataframe here
a_json = {"items": [{"SKU": "23565", "title": "EGG CUP MILKMAID HELGA ", "unit_price": 2.46, "quantity": 2}], "type": "ORDER", "country": "United Kingdom", "invoice_no": 154132541847735, "timestamp": "2020-11-02 20:56:01"}
df = spark.createDataFrame([(a_json)])
df.show(truncate=False)
+--------------+---------------+-------------------------------------------------------------------------------------+-------------------+-----+
|country |invoice_no |items |timestamp |type |
+--------------+---------------+-------------------------------------------------------------------------------------+-------------------+-----+
|United Kingdom|154132541847735|[[quantity -> 2, unit_price -> 2.46, title -> EGG CUP MILKMAID HELGA , SKU -> 23565]]|2020-11-02 20:56:01|ORDER|
+--------------+---------------+-------------------------------------------------------------------------------------+-------------------+-----+
Logic Here
df = df.withColumn("items_array", F.explode("items"))
df = df.withColumn("quantity", df.items_array.getItem("quantity")).withColumn("unit_price", df.items_array.getItem("unit_price")).withColumn("title", df.items_array.getItem("title")).withColumn("SKU", df.items_array.getItem("SKU"))
df.select("country", "invoice_no", "quantity","unit_price", "title", "SKU", "timestamp", "timestamp").show(truncate=False)
+--------------+---------------+--------+----------+-----------------------+-----+-------------------+-------------------+
|country |invoice_no |quantity|unit_price|title |SKU |timestamp |timestamp |
+--------------+---------------+--------+----------+-----------------------+-----+-------------------+-------------------+
|United Kingdom|154132541847735|2 |2.46 |EGG CUP MILKMAID HELGA |23565|2020-11-02 20:56:01|2020-11-02 20:56:01|
+--------------+---------------+--------+----------+-----------------------+-----+-------------------+-------------------+
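For the streaming case in the question, a common alternative (just a sketch; the schema below is inferred from the sample record, and column names are assumptions) is to parse the Kafka value with from_json and then explode the items array, which keeps everything as a streaming DataFrame:

from pyspark.sql import functions as F
from pyspark.sql.types import (StructType, StructField, StringType, DoubleType,
                               LongType, IntegerType, ArrayType)

item_schema = StructType([
    StructField("SKU", StringType()),
    StructField("title", StringType()),
    StructField("unit_price", DoubleType()),
    StructField("quantity", IntegerType()),
])
order_schema = StructType([
    StructField("items", ArrayType(item_schema)),
    StructField("type", StringType()),
    StructField("country", StringType()),
    StructField("invoice_no", LongType()),
    StructField("timestamp", StringType()),
])

# Parse the JSON string in the Kafka value column into a struct, then flatten it.
parsed = (lines
          .select(F.from_json(F.col("value").cast("string"), order_schema).alias("o"))
          .select("o.*")
          .withColumn("item", F.explode("items"))
          .select("country", "invoice_no", "item.*", "timestamp", "type"))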
I am trying to compare two dataframes to look for new records and updated records, which in turn will be used to create a third dataframe. I am using PySpark 2.4.3.
As I come from a SQL background (ASE), my initial thought would be to do a left join to find new records and a != on a hash of all the columns to find updates:
SELECT a.*
FROM Todays_Data a
Left Join Yesterdays_PK_And_Hash b on a.pk = b.pk
WHERE (b.pk IS NULL) --finds new records
OR (b.hashOfColumns != HASHBYTES('md5',<converted and concatenated columns>)) --updated records
I have been playing around with PySpark and have come up with a script that achieves the results I am after:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import md5, concat_ws, col, lit
sc = SparkContext("local", "test App")
sqlContext = SQLContext(sc)
sp = SparkSession \
.builder \
.appName("test App") \
.getOrCreate()
df = sp.createDataFrame(
[("Fred", "Smith", "16ba5519cdb13f99e087473e4faf3825"), # hashkey here is created based on YOB of 1973. To test for an update
("Fred", "Davis", "253ab75676cdbd73b874c97a62d27608"),
("Barry", "Clarke", "cc3baaa05a1146f2f8cf0a743c9ab8c4")],
["First_name", "Last_name", "hashkey"]
)
df_a = sp.createDataFrame(
[("Fred", "Smith", "Adelaide", "Doctor", 1971),
("Fred", "Davis", "Melbourne", "Baker", 1970),
("Barry", "Clarke", "Sydney", "Scientist", 1975),
("Jane", "Hall", "Sydney", "Dentist", 1980)],
["First_name", "Last_name", "City", "Occupation", "YOB"]
)
df_a = df_a.withColumn("hashkey", md5(concat_ws("", *df_a.columns)))
df_ins = df_a.alias('a').join(df.alias('b'), (col('a.First_name') == col('b.First_name')) &
(col('a.Last_name') == col('b.Last_name')), 'left_anti') \
.select(lit("Insert").alias("_action"), 'a.*') \
.dropDuplicates()
df_up = df_a.alias('a').join(df.alias('b'), (col('a.First_name') == col('b.First_name')) &
(col('a.Last_name') == col('b.Last_name')) &
(col('a.hashkey') != col('b.hashkey')), 'inner') \
.select(lit("Update").alias("_action"), 'a.*') \
.dropDuplicates()
df_delta = df_ins.union(df_up).sort("YOB")
df_delta = df_delta.drop("hashkey")
df_delta.show(truncate=False)
What this produces is my final delta as such:
+-------+----------+---------+--------+----------+----+
|_action|First_name|Last_name|City |Occupation|YOB |
+-------+----------+---------+--------+----------+----+
|Update |Fred |Smith |Adelaide|Doctor |1971|
|Insert |Jane |Hall |Sydney |Dentist |1980|
+-------+----------+---------+--------+----------+----+
While I am getting the results I am after, I am unsure how efficient the above code is.
Ultimately, I would like to run similar patterns against datasets of hundreds of millions of records.
Is there any way to make this more efficient?
Thanks
Have you explored broadcast joins? Your join statements could be problematic if you have 100M+ records. If dataset B is the smaller one, this is the tiny modification I would try:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import md5, concat_ws, col, lit, broadcast
sc = SparkContext("local", "test App")
sqlContext = SQLContext(sc)
sp = SparkSession \
.builder \
.appName("test App") \
.getOrCreate()
df = sp.createDataFrame(
[("Fred", "Smith", "16ba5519cdb13f99e087473e4faf3825"), # hashkey here is created based on YOB of 1973. To test for an update
("Fred", "Davis", "253ab75676cdbd73b874c97a62d27608"),
("Barry", "Clarke", "cc3baaa05a1146f2f8cf0a743c9ab8c4")],
["First_name", "Last_name", "hashkey"]
)
df_a = sp.createDataFrame(
[("Fred", "Smith", "Adelaide", "Doctor", 1971),
("Fred", "Davis", "Melbourne", "Baker", 1970),
("Barry", "Clarke", "Sydney", "Scientist", 1975),
("Jane", "Hall", "Sydney", "Dentist", 1980)],
["First_name", "Last_name", "City", "Occupation", "YOB"]
)
df_a = df_a.withColumn("hashkey", md5(concat_ws("", *df_a.columns)))
df_ins = df_a.alias('a').join(broadcast(df.alias('b')), (col('a.First_name') == col('b.First_name')) &
(col('a.Last_name') == col('b.Last_name')), 'left_anti') \
.select(lit("Insert").alias("_action"), 'a.*') \
.dropDuplicates()
df_up = df_a.alias('a').join(broadcast(df.alias('b')), (col('a.First_name') == col('b.First_name')) &
(col('a.Last_name') == col('b.Last_name')) &
(col('a.hashkey') != col('b.hashkey')), 'inner') \
.select(lit("Update").alias("_action"), 'a.*') \
.dropDuplicates()
df_delta = df_ins.union(df_up).sort("YOB")
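To confirm the broadcast hint actually takes effect, a quick optional check:

df_ins.explain()  # the physical plan should show a BroadcastHashJoin rather than a SortMergeJoin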
Maybe rewriting the code cleanly would be easier to follow too.
@Ash, from a readability standpoint, you could do a couple of things:
Use variables.
Use functions (see the sketch after the snippet below).
Follow the PEP 8 style guide as much as possible (e.g. no more than 80 characters per line).
joinExpr = (col('a.First_name') == col('b.First_name')) & \
           (col('a.Last_name') == col('b.Last_name'))
joinType = 'inner'  # use 'left_anti' for the insert dataframe
df_up = df_a.alias('a').join(broadcast(df.alias('b')),
                             joinExpr & (col('a.hashkey') != col('b.hashkey')),
                             joinType) \
    .select(lit("Update").alias("_action"), 'a.*') \
    .dropDuplicates()
This is still long, but you get the idea.
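For the "use functions" point, a minimal sketch (the helper name and argument order are just illustrative, and it reuses the imports from the snippet above):

def tag_rows(new_df, old_df, cond, how, action):
    # Join the new dataframe ('a') against the old one ('b') with the given
    # condition and join type, then tag the surviving rows with an action label.
    return (new_df.alias('a')
            .join(broadcast(old_df.alias('b')), cond, how)
            .select(lit(action).alias("_action"), 'a.*')
            .dropDuplicates())

name_match = (col('a.First_name') == col('b.First_name')) & \
             (col('a.Last_name') == col('b.Last_name'))

df_ins = tag_rows(df_a, df, name_match, 'left_anti', 'Insert')
df_up = tag_rows(df_a, df, name_match & (col('a.hashkey') != col('b.hashkey')),
                 'inner', 'Update')
df_delta = df_ins.union(df_up).sort("YOB").drop("hashkey")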
I know this question has already been asked multiple times before, but none of the answers help in my case.
Below is my Spark code:
class ParseLogs extends java.io.Serializable {
def formLogLine(logLine: String): (String,String,String,Int,String,String,String,Int,Float,String,String,Float,Int,String,Int,Float,String) = {
//some logic
//return value
(recordKey._2.toString().replace("\"", ""),recordKey._3,recordKey._4,recordKey._5,recordKey._6,recordKey._8,sbcId,recordKey._10,recordKey._11,recordKey._12,recordKey._13.trim(),LogTransferTime,contentAccessed,OTT,dataTypeId,recordKey._14,logCaptureTime1)
}
}
val inputDf = spark.readStream
.format("kafka")
.option("kafka.bootstrap.servers", brokers)
.option("subscribe", topic)
.option("startingOffsets", "earliest")
.load()
val myDf = inputDf.selectExpr("CAST(value AS STRING)")
val df1 = myDf.map(line => new ParseLogs().formLogLine(line.get(0).toString()))
I get the below error:
User class threw exception: org.apache.spark.sql.streaming.StreamingQueryException: Text data source supports only a single column, and you have 17 columns.;
Use a UDF to convert logLine to what you want. For example:
spark.sqlContext.udf.register("YOURLOGIC", (logLine: String) => {
//some logic
(recordKey._2.toString().replace("\"",""),recordKey._3,recordKey._4,recordKey._5,recordKey._6,recordKey._8,sbcId,recordKey._10,recordKey._11,recordKey._12,recordKey._13.trim(),LogTransferTime,contentAccessed,OTT,dataTypeId,recordKey._14,logCaptureTime1)
})
val inputDf = spark.readStream
.format("kafka")
.option("kafka.bootstrap.servers", brokers)
.option("subscribe", topic)
.option("startingOffsets", "earliest")
.load()
val myDf = inputDf.selectExpr("CAST(value AS STRING)")
val df1 = myDf.selectExpr("YOURLOGIC(value) as result")
val result = df1.select(
  df1("result").getItem(0),
  df1("result").getItem(1),
  df1("result").getItem(2),
  df1("result").getItem(3),
  // ... if you have 17 items, keep going up to the last one ...
  df1("result").getItem(16))