spark-submit - error passing master param - apache-spark

Can you help me? I get an error passing the --master parameter to spark-submit from a KubernetesPodOperator. I have tried the following variants of the arguments list:
taxi_task_select = KubernetesPodOperator(
    namespace='spark',
    image="taxi",
    name='taxi_task_select',
    is_delete_operator_pod=False,
    in_cluster=True,
    task_id="taxi_task_select",
    get_logs=True,
    cmds=['/opt/spark/bin/spark-submit'],
    arguments=[
        "--master=k8s://https://IP:443",
        '--deploy-mode cluster',
        '--name spark-pi',
        '--class org.apache.spark.examples.SparkPi',
        '--conf spark.executor.instances=5',
        '--conf spark.kubernetes.container.image=taxi',
        'local:///app/taxi-spark.py'])

or

    arguments=[
        "--master k8s://https://IP:443",
        '--deploy-mode cluster',
        '--name spark-pi',
        '--class org.apache.spark.examples.SparkPi',
        '--conf spark.executor.instances=5',
        '--conf spark.kubernetes.container.image=taxi',
        'local:///app/taxi-spark.py']

or

    arguments=[
        '--master', 'k8s://https://IP:443',
        '--deploy-mode cluster',
        '--name spark-pi',
        '--class org.apache.spark.examples.SparkPi',
        '--conf spark.executor.instances=5',
        '--conf spark.kubernetes.container.image=taxi',
        'local:///app/taxi-spark.py']
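Note that each element of the arguments list is passed to the container as a single argv token, and spark-submit expects a flag and its value either as separate tokens or in the --flag=value form; an entry like '--deploy-mode cluster' arrives as one token containing a space and cannot be parsed. A hedged sketch of the list along those lines (the IP placeholder is kept from the question):

    arguments=[
        '--master', 'k8s://https://IP:443',   # or "--master=k8s://https://IP:443"
        '--deploy-mode', 'cluster',
        '--name', 'spark-pi',
        '--class', 'org.apache.spark.examples.SparkPi',
        '--conf', 'spark.executor.instances=5',
        '--conf', 'spark.kubernetes.container.image=taxi',
        'local:///app/taxi-spark.py']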

Related

airflow 2.3.3 SparkKubernetesOperator

I submit a Spark application to Kubernetes from Airflow as a SparkKubernetesOperator task.
When I mark the task as failed, the pod is not deleted in Kubernetes.
How can I fix it?
dag = DAG(
    "dag-name",
    default_args=default_args,
    description='submit dag',
    schedule_interval="30 1 * * *",
    start_date=datetime(2022, 7, 26),
)

t1 = SparkKubernetesOperator(
    task_id='sample-name',
    namespace="batch",
    application_file="k8s/sample.yaml",
    do_xcom_push=True,
    dag=dag,
    params={"processDate": process_date},
)

t1_sensor = SparkKubernetesSensor(
    task_id='sample-monitor',
    namespace="batch",
    application_name="{{ task_instance.xcom_pull(task_ids='sample-name')['metadata']['name'] }}",
    kubernetes_conn_id="kubernetes_default",
    dag=dag,
)
t1 >> t1_sensor
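Not an answer from this thread, but a hedged sketch of one workaround: SparkKubernetesOperator only creates the SparkApplication resource, so failing the Airflow task does not by itself clean up the driver and executor pods. An on_failure_callback that deletes the SparkApplication (which in turn removes its pods) could look roughly like this, assuming the spark-on-k8s-operator defaults (sparkoperator.k8s.io/v1beta2) and the kubernetes Python client:

    from kubernetes import client, config

    def delete_spark_app(context):
        # Sketch only: pull the SparkApplication pushed to XCom by the
        # 'sample-name' task and delete the custom resource in namespace 'batch'.
        app = context["ti"].xcom_pull(task_ids="sample-name")
        if not app:
            return
        config.load_incluster_config()  # or config.load_kube_config() outside the cluster
        client.CustomObjectsApi().delete_namespaced_custom_object(
            group="sparkoperator.k8s.io",
            version="v1beta2",
            namespace="batch",
            plural="sparkapplications",
            name=app["metadata"]["name"],
        )

    # attach it when defining the task, e.g.
    # t1 = SparkKubernetesOperator(..., on_failure_callback=delete_spark_app, dag=dag)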

When I insert data I get a near " " syntax error

When I insert Excel data into a SQLite database it shows a syntax error, and I don't know where the error is.
import sqlite3
from openpyxl import *
from sqlite3 import Error


def create_db(db_file):
    try:
        conn = sqlite3.connect(db_file)
        return conn
    except Error as e:
        print(e)


def execute_sql(conn, sql):
    try:
        c = conn.cursor()
        c.execute(sql)
    except Error as e:
        print(e)


if __name__ == '__main__':
    create_table_inspections = "CREATE TABLE if not exists inspections(" \
                               "activity_date DATE NOT NULL," \
                               "employee_id VARCHAR(15) NOT NULL," \
                               "facility_address VARCHAR(100) NOT NULL," \
                               "facility_city VARCHAR(80) NOT NULL," \
                               "facility_id VARCHAR(15) NOT NULL," \
                               "facility_name VARCHAR(100) NOT NULL," \
                               "facility_state VARCHAR(10) NOT NULL," \
                               "facility_zip VARCHAR(15) NOT NULL," \
                               "grade VARCHAR(1) NOT NULL," \
                               "owner_id VARCHAR(15) NOT NULL," \
                               "owner_name VARCHAR(80) NOT NULL," \
                               "pe_description VARCHAR(80) NOT NULL," \
                               "program_element_pe VARCHAR(4) NOT NULL," \
                               "program_name VARCHAR(80) NOT NULL," \
                               "program_status VARCHAR(10) NOT NULL," \
                               "record_id VARCHAR(15) NOT NULL," \
                               "score VARCHAR(10) NOT NULL," \
                               "serial_number VARCHAR(20) NOT NULL," \
                               "service_code VARCHAR(15) NOT NULL," \
                               "service_description VARCHAR(80) NOT NULL);"
    conn = create_db("data2.db")
    if conn is not None:
        execute_sql(conn, create_table_inspections)
    print("loading inspections")
    data_inspections = load_workbook("inspections.xlsx")
    data_inspections_ws = data_inspections['inspections']
    print("done")
    print("read inspections")
    for i in data_inspections_ws:
        sql = """INSERT INTO inspections(
                     activity_date,
                     employee_id,
                     facility_address,
                     facility_city,
                     facility_id,
                     facility_name,
                     facility_state,
                     facility_zip,
                     grade,
                     owner_id,
                     owner_name,
                     pe_description,
                     program_element_pe,
                     program_name,
                     program_status,
                     record_id,
                     score,
                     serial_number,
                     service_code,
                     service_description)
                 VALUES
                     ("{vactivity_date}",
                      "{vemployee_id}",
                      "{vfacility_address}",
                      "{vfacility_city}",
                      "{vfacility_id}",
                      "{vfacility_name}",
                      "{vfacility_state}",
                      "{vfacility_zip}",
                      "{vgrade}",
                      "{vowner_id}",
                      "{vowner_name}",
                      "{vpe_description}",
                      "{vprogram_element_pe}",
                      "{vprogram_name}",
                      "{vprogram_status}",
                      "{vrecord_id}",
                      "{vscore}",
                      "{vserial_number}",
                      "{vservice_code}",
                      "{vservice_description}")"""
        sql = sql.format(vactivity_date=i[0].value,
                         vemployee_id=i[1].value,
                         vfacility_address=i[2].value,
                         vfacility_city=i[3].value,
                         vfacility_id=i[4].value,
                         vfacility_name=i[5].value,
                         vfacility_state=i[6].value,
                         vfacility_zip=i[7].value,
                         vgrade=i[8].value,
                         vowner_id=i[9].value,
                         vowner_name=i[10].value,
                         vpe_description=i[11].value,
                         vprogram_element_pe=i[12].value,
                         vprogram_name=i[13].value,
                         vprogram_status=i[14].value,
                         vrecord_id=i[15].value,
                         vscore=i[16].value,
                         vserial_number=i[17].value,
                         vservice_code=i[18].value,
                         vservice_description=i[19].value)
        execute_sql(conn, sql)
        pass
    print("Done")
    conn.commit()
    conn.close()
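The near " " syntax error most likely comes from str.format() pasting raw cell values (embedded quotes, commas, None) directly into the SQL text. A hedged sketch of the insert loop using sqlite3 parameter binding instead (it assumes openpyxl's iter_rows and that the sheet's column order matches the table's 20 columns):

    # Sketch only: "?" placeholders let sqlite3 do the quoting, so cell values
    # containing quotes or None no longer break the statement.
    insert_sql = "INSERT INTO inspections VALUES ({})".format(", ".join(["?"] * 20))
    cur = conn.cursor()
    for row in data_inspections_ws.iter_rows(values_only=True):
        cur.execute(insert_sql, row[:20])  # first 20 cells, one per column
    conn.commit()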

How to see the dataframe in the console (equivalent of .show() for structured streaming)?

I'm trying to see what's coming in as my DataFrame. Here is the Spark code:
from pyspark.sql import SparkSession
import pyspark.sql.functions as psf
import logging
import time

spark = SparkSession \
    .builder \
    .appName("Console Example") \
    .getOrCreate()

logging.info("started to listen to the host..")

lines = spark \
    .readStream \
    .format("socket") \
    .option("host", "127.0.0.1") \
    .option("port", 9999) \
    .load()

data = lines.selectExpr("CAST(value AS STRING)")
query1 = data.writeStream.format("console").start()
time.sleep(10)
query1.awaitTermination()
I am getting the progress reports, but obviously the input rows are 0 for each trigger:
2019-08-19 23:45:45 INFO MicroBatchExecution:54 - Streaming query made progress: {
  "id" : "a4b26eaf-1032-4083-9e42-a9f2f0426eb7",
  "runId" : "35c2b82a-191d-4998-9c98-17b24f5e3e9d",
  "name" : null,
  "timestamp" : "2019-08-20T06:45:45.458Z",
  "batchId" : 0,
  "numInputRows" : 0,
  "inputRowsPerSecond" : 0.0,
  "durationMs" : {
    "getOffset" : 0,
    "triggerExecution" : 0
  },
  "stateOperators" : [ ],
  "sources" : [ {
    "description" : "TextSocketSource[host: 127.0.0.1, port: 9999]",
    "startOffset" : null,
    "endOffset" : null,
    "numInputRows" : 0,
    "inputRowsPerSecond" : 0.0
  } ],
  "sink" : {
    "description" : "org.apache.spark.sql.execution.streaming.ConsoleSinkProvider#5f3e6f3"
  }
}
My TCP server is spitting some stuff out and I can see it in the console too, but I just want to make sure my Spark job is receiving something by printing it out, which is proving difficult.
This is my TCP server code:
import socket
import sys
import csv
import time

port = 9999
server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server_socket.bind(('', port))
server_socket.listen(5)
connection_socket, addr = server_socket.accept()

file_path = "/Users/Downloads/youtube-new/USvideos.csv"
row_count = sum(1 for row in file_path)

with open(file_path, "r") as f:
    reader = csv.reader(f, delimiter="\t")
    while True:
        for i, line in enumerate(reader):
            try:
                print(line)
                data = line[0].encode('utf-8')
                connection_socket.send(data)
                time.sleep(2)
                if (row_count == i-1):
                    break
            except IndexError:
                print("Index error")
                server_socket.close()
server_socket.close()
I can see the lines getting printed out, so I can at least say that the server has accepted a connection at localhost:9999, which is the host and port I'm using for the Spark job as well.
This is one of the data rows:
['8mhTWqWlQzU,17.15.11,"Wearing Online Dollar Store Makeup For A Week","Safiya Nygaard",22,2017-11-11T01:19:33.000Z,"wearing online dollar store makeup for a week"|"online dollar store makeup"|"dollar store makeup"|"daiso"|"shopmissa makeup"|"shopmissa haul"|"dollar store makeup haul"|"dollar store"|"shopmissa"|"foundation"|"concealer"|"eye primer"|"eyebrow pencil"|"eyeliner"|"bronzer"|"contour"|"face powder"|"lipstick"|"$1"|"$1 makeup"|"safiya makeup"|"safiya dollar store"|"safiya nygaard"|"safiya"|"safiya and tyler",2922523,119348,1161,6736,https://i.ytimg.com/vi/8mhTWqWlQzU/default.jpg,False,False,False,"I found this online dollar store called ShopMissA that sells all their makeup products for $1 and decided I had to try it out! So I replaced my entire everyday makeup routine with $1 makeup products, including foundation, concealer, eye primer, eyebrow pencil, eyeliner, bronzer, contour, face powder, and lipstick. What do you think? Would you try this?\\n\\nThis video is NOT sponsored!\\n\\nSafiya\'s Nextbeat: https://nextbeat.co/u/safiya\\nIG: https://www.instagram.com/safiyany/\\nTwitter: https://twitter.com/safiyajn\\nFacebook: https://www.facebook.com/safnygaard/\\n\\nAssistant Editor: Claire Wiley\\n\\nMUSIC\\nMind The Gap\\nvia Audio Network\\n\\nSFX\\nvia AudioBlocks"']
Everything in the bracket (notice I'm actually sending data[0])
from pyspark.sql import SparkSession
import pyspark.sql.functions as psf
import logging
import time

spark = SparkSession \
    .builder \
    .appName("Console Example") \
    .getOrCreate()

logging.info("started to listen to the host..")

lines = spark \
    .readStream \
    .format("socket") \
    .option("host", "127.0.0.1") \
    .option("port", 9999) \
    .load()

data = lines.selectExpr("CAST(value AS STRING)")
query1 = data.writeStream.queryName("counting").format("memory").outputMode("append").start()
for x in range(5):
    spark.sql("select * from counting").show()
    time.sleep(10)
Try this; it will show you the data just as the show() method does in Spark SQL. It prints five snapshots of the data, since we loop five times.
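A hedged aside that may explain why numInputRows stays at 0 in the first place: Spark's socket source reads newline-delimited UTF-8 text, and the TCP server above send()s each row without a trailing newline, so the source may never see a complete line. Terminating each record on the server side is worth trying:

    # Sketch of the change in the server loop: end each row with "\n"
    # so the socket source can split the stream into records.
    data = (line[0] + "\n").encode("utf-8")
    connection_socket.send(data)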

Unable to read MS SQL table using pyspark in jupyter notebook?

import os
import sys

spark_path = 'C:/opt/spark/spark-2.4.1-bin-hadoop2.7'
os.environ['SPARK_HOME'] = spark_path
os.environ['HADOOP_HOME'] = spark_path
sys.path.append(spark_path + "/bin")
sys.path.append(spark_path + "/python")
sys.path.append(spark_path + "/python/pyspark/")
sys.path.append(spark_path + "/python/lib")
sys.path.append(spark_path + "/python/lib/pyspark.zip")
sys.path.append(spark_path + "/python/lib/py4j-0.9-src.zip")

from pyspark.sql import SparkSession

spark = SparkSession\
    .builder\
    .master('local[*]')\
    .appName('Connection-Test')\
    .config('spark.driver.extraClassPath', 'C:/Users/sqljdbc_4.2.8112.200_enu/sqljdbc_4.2/enu/jre8/sqljdbc42.jar')\
    .config('spark.executor.extraClassPath', 'C:/Users/sqljdbc_4.2.8112.200_enu/sqljdbc_4.2/enu/jre8/sqljdbc42.jar')\
    .getOrCreate()

sqlsUrl = 'jdbc:sqlserver://ip:port;database=dbname'

qryStr = """ (
    SELECT *
    FROM Table
) """

spark.read.format('jdbc')\
    .option('url', sqlsUrl)\
    .option('driver', 'com.microsoft.sqlserver.jdbc.SQLServerDriver')\
    .option('dbtable', qryStr)\
    .option("user", "user") \
    .option("password", "password") \
    .load().show()
An error occurred while calling o50.load. : com.microsoft.sqlserver.jdbc.SQLServerException: Incorrect syntax near the keyword 'WHERE'. at com.microsoft.sqlserver.jdbc.SQLServerException.makeFromDatabaseError(SQLServerException.java:217)
Try adding "as Table_Name" to the end of your query
qryStr = """ (
    SELECT *
    FROM Table
) as Table """
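A hedged note on why the alias helps: the Spark JDBC reader wraps whatever is passed as dbtable in a derived table, and SQL Server requires derived tables to be aliased, which is where the "Incorrect syntax near the keyword 'WHERE'" comes from. Since this is Spark 2.4, passing the statement through the query option (no wrapping parentheses or alias needed) is an alternative worth trying; this sketch reuses the placeholder URL and credentials from the question:

    df = (spark.read.format("jdbc")
          .option("url", sqlsUrl)
          .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
          .option("query", "SELECT * FROM Table")  # "query" option, Spark 2.4+
          .option("user", "user")
          .option("password", "password")
          .load())
    df.show()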

Spark: running spark-submit with the correct number of executors

I've set up a basic 3-node EMR cluster and run spark-submit with an --executor-memory setting of 1G and no other configs.
The script itself is a basic benchmarking task:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row
import time

conf = SparkConf()
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# sample data in lineitem table:
# 3|1284483|34508|3|27|39620.34|0.06|0.07|A|F|1994-01-16|1993-11-22|1994-01-23|DELIVER IN PERSON|SHIP|nal foxes wake. |
def mapper(lines):
    x = lines.split("|")
    return Row(rownum=int(x[0]),
               l_orderkey=int(x[0]),
               l_partkey=int(x[1]),
               l_suppkey=int(x[2]),
               l_linenumber=int(x[3]),
               l_quantity=int(x[4]),
               l_extendedprice=float(x[5]),
               l_discount=float(x[6]),
               l_tax=float(x[7]),
               l_returnflag=x[8],
               l_linestatus=x[9],
               l_shipdate=x[10],
               l_commitdate=x[11],
               l_receiptdate=x[12],
               l_shipinstruct=x[13],
               l_shipment=x[14],
               l_comment=x[15],
               )

# ORDERKEY
# PARTKEY
# SUPPKEY
# LINENUMBER
# QUANTITY
# EXTENDEDPRICE
# DISCOUNT
# TAX
# RETURNFLAG
# LINESTATUS
# SHIPDATE
# COMMITDATE
# RECEIPTDATE
# SHIPINSTRUCT
# SHIPMODE
# COMMENT

rdd = sc.textFile("s3://sampletpchdata/10gb/lineitem.tbl.*")

# kick off an initial count
print rdd.count()

sample = rdd.map(mapper)
schemaSample = sqlContext.createDataFrame(sample)
schemaSample.registerTempTable("lineitem")

# run TPCH query 1
results = sqlContext.sql("""
SELECT
    l_returnflag,
    l_linestatus,
    sum(l_quantity) as sum_qty,
    sum(l_extendedprice) as sum_base_price,
    sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
    sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
    avg(l_quantity) as avg_qty,
    avg(l_extendedprice) as avg_price,
    avg(l_discount) as avg_disc,
    count(*) as count_order
from
    lineitem
where
    l_shipdate <= date_sub(cast('1998-12-01' as date), '60')
group by
    l_returnflag,
    l_linestatus
order by
    l_returnflag,
    l_linestatus
""")

# kick off a final count of the results
print results.count()
While that was running, I looked at the result of the Spark API's executors endpoint and got this:
[ {
  "id" : "driver",
  "hostPort" : "10.232.13.130:47656",
  "rddBlocks" : 0,
  "memoryUsed" : 0,
  "diskUsed" : 0,
  "activeTasks" : 0,
  "failedTasks" : 0,
  "completedTasks" : 0,
  "totalTasks" : 0,
  "totalDuration" : 0,
  "totalInputBytes" : 0,
  "totalShuffleRead" : 0,
  "totalShuffleWrite" : 0,
  "maxMemory" : 7975010304,
  "executorLogs" : { }
}, {
  "id" : "1",
  "hostPort" : "ip-10-232-13-123.us-west-1.compute.internal:58544",
  "rddBlocks" : 0,
  "memoryUsed" : 0,
  "diskUsed" : 0,
  "activeTasks" : 0,
  "failedTasks" : 0,
  "completedTasks" : 641,
  "totalTasks" : 641,
  "totalDuration" : 4998902,
  "totalInputBytes" : 3490792,
  "totalShuffleRead" : 0,
  "totalShuffleWrite" : 395870,
  "maxMemory" : 7790985216,
  "executorLogs" : {
    "stdout" : "http://somenode:8042/node/containerlogs/container_1456781958356_0004_01_000009/hadoop/stdout?start=-4096",
    "stderr" : "http://somenode:8042/node/containerlogs/container_1456781958356_0004_01_000009/hadoop/stderr?start=-4096"
  }
} ]
Unless I'm misunderstanding this result, it appears that in my 3-node cluster there is only 1 driver and 1 executor. Is this what is happening? If so, shouldn't there be more executors than this, and how do I make that happen?
You'll also have to use --num-executors to choose the number of executors you want running your code.
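As a rough sketch (assuming YARN on EMR; the script name is a placeholder, and 2 executors is just an example for a cluster with two worker nodes), the submit command would then look something like:

    spark-submit \
      --master yarn \
      --deploy-mode cluster \
      --num-executors 2 \
      --executor-memory 1G \
      benchmark_script.py

Alternatively, enabling spark.dynamicAllocation.enabled lets Spark size the executor count automatically (it needs the external shuffle service).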
