Getting 'near " ": syntax error' when inserting Excel data into SQLite

When I insert Excel data into a SQLite database it shows me a syntax error, and I don't know where the error is. Here is my code:
import sqlite3
from openpyxl import *
from sqlite3 import Error


def create_db(db_file):
    try:
        conn = sqlite3.connect(db_file)
        return conn
    except Error as e:
        print(e)


def execute_sql(conn, sql):
    try:
        c = conn.cursor()
        c.execute(sql)
    except Error as e:
        print(e)


if __name__ == '__main__':
    create_table_inspections = "CREATE TABLE if not exists inspections(" \
                               "activity_date DATE NOT NULL," \
                               "employee_id VARCHAR(15) NOT NULL," \
                               "facility_address VARCHAR(100) NOT NULL," \
                               "facility_city VARCHAR(80) NOT NULL," \
                               "facility_id VARCHAR(15) NOT NULL," \
                               "facility_name VARCHAR(100) NOT NULL," \
                               "facility_state VARCHAR(10) NOT NULL," \
                               "facility_zip VARCHAR(15) NOT NULL," \
                               "grade VARCHAR(1) NOT NULL," \
                               "owner_id VARCHAR(15) NOT NULL," \
                               "owner_name VARCHAR(80) NOT NULL," \
                               "pe_description VARCHAR(80) NOT NULL," \
                               "program_element_pe VARCHAR(4) NOT NULL," \
                               "program_name VARCHAR(80) NOT NULL," \
                               "program_status VARCHAR(10) NOT NULL," \
                               "record_id VARCHAR(15) NOT NULL," \
                               "score VARCHAR(10) NOT NULL," \
                               "serial_number VARCHAR(20) NOT NULL," \
                               "service_code VARCHAR(15) NOT NULL," \
                               "service_description VARCHAR(80) NOT NULL);"

    conn = create_db("data2.db")
    if conn is not None:
        execute_sql(conn, create_table_inspections)
        print("loading inspections")
        data_inspections = load_workbook("inspections.xlsx")
        data_inspections_ws = data_inspections['inspections']
        print("done")
        print("read inspections")
        for i in data_inspections_ws:
            sql = """INSERT INTO inspections(
                activity_date,
                employee_id,
                facility_address,
                facility_city,
                facility_id,
                facility_name,
                facility_state,
                facility_zip,
                grade,
                owner_id,
                owner_name,
                pe_description,
                program_element_pe,
                program_name,
                program_status,
                record_id,
                score,
                serial_number,
                service_code,
                service_description)
                VALUES
                ("{vactivity_date}",
                "{vemployee_id}",
                "{vfacility_address}",
                "{vfacility_city}",
                "{vfacility_id}",
                "{vfacility_name}",
                "{vfacility_state}",
                "{vfacility_zip}",
                "{vgrade}",
                "{vowner_id}",
                "{vowner_name}",
                "{vpe_description}",
                "{vprogram_element_pe}",
                "{vprogram_name}",
                "{vprogram_status}",
                "{vrecord_id}",
                "{vscore}",
                "{vserial_number}",
                "{vservice_code}",
                "{vservice_description}")"""
            sql = sql.format(vactivity_date=i[0].value,
                             vemployee_id=i[1].value,
                             vfacility_address=i[2].value,
                             vfacility_city=i[3].value,
                             vfacility_id=i[4].value,
                             vfacility_name=i[5].value,
                             vfacility_state=i[6].value,
                             vfacility_zip=i[7].value,
                             vgrade=i[8].value,
                             vowner_id=i[9].value,
                             vowner_name=i[10].value,
                             vpe_description=i[11].value,
                             vprogram_element_pe=i[12].value,
                             vprogram_name=i[13].value,
                             vprogram_status=i[14].value,
                             vrecord_id=i[15].value,
                             vscore=i[16].value,
                             vserial_number=i[17].value,
                             vservice_code=i[18].value,
                             vservice_description=i[19].value)
            execute_sql(conn, sql)
        print("Done")
        conn.commit()
        conn.close()
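The 'near "...": syntax error' comes from formatting the cell values directly into the SQL text: any value that contains a quote character, or a None from an empty cell, breaks the generated statement. Passing the values as query parameters lets sqlite3 bind them safely instead. A minimal sketch of the same insert, assuming the same workbook layout and that the first row is a header:

insert_sql = """INSERT INTO inspections(
    activity_date, employee_id, facility_address, facility_city, facility_id,
    facility_name, facility_state, facility_zip, grade, owner_id, owner_name,
    pe_description, program_element_pe, program_name, program_status,
    record_id, score, serial_number, service_code, service_description)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"""
cur = conn.cursor()
# values_only=True yields plain cell values; min_row=2 skips the header row
for row in data_inspections_ws.iter_rows(min_row=2, values_only=True):
    cur.execute(insert_sql, row[:20])  # values are bound, never spliced into the SQL string
conn.commit()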

Related

How to pass SparkSession object to Kafka-Spark streaming's foreachBatch method?

I have a Python script loader.py which consists of a main function that creates a SparkSession object as given below and calls various methods to perform different actions.
from utils import extract_kafka_data, do_some_transformation


def main():
    try:
        spark = SparkSession.builder.appName(config['kafka_transformations']).enableHiveSupport().getOrCreate()
        kafka_df = extract_kafka_data(spark=spark, config=config, topic_name=topic_name)
        do_some_transformation(kafka_df, spark)
    except Exception as exc:
        print(f'Failed with Exception:{exc}')
        traceback.print_exc()
        print('Stopping the application')
        sys.exit(1)


if __name__ == '__main__':
    main()
The methods extract_kafka_data and do_some_transformation are present in a different Python script: utils.py.
There are many other methods inside my utils.py file that perform various transformations. Below are the couple of methods in this scenario that need some addressing.
def extract_kafka_data(spark: SparkSession, config: dict, topic_name: str):
    jass_config = config['jaas_config'] + " oauth.token.endpoint.uri=" + '"' + config['endpoint_uri'] + '"' + " oauth.client.id=" + '"' + config['client_id'] + '"' + " oauth.client.secret=" + '"' + config['client_secret'] + '" ;'
    stream_df = spark.readStream \
        .format('kafka') \
        .option('kafka.bootstrap.servers', config['kafka_broker']) \
        .option('subscribe', topic_name) \
        .option('kafka.security.protocol', config['kafka_security_protocol']) \
        .option('kafka.sasl.mechanism', config['kafka_sasl_mechanism']) \
        .option('kafka.sasl.jaas.config', jass_config) \
        .option('kafka.sasl.login.callback.handler.class', config['kafka_sasl_login_callback_handler_class']) \
        .option('startingOffsets', 'earliest') \
        .option('fetchOffset.retryIntervalMs', config['kafka_fetch_offset_retry_intervalms']) \
        .option('fetchOffset.numRetries', config['retries']) \
        .option('failOnDataLoss', 'False') \
        .option('checkpointLocation', checkpoint_location) \
        .load() \
        .select(from_json(col('value').cast('string'), schema).alias("json_dta")).selectExpr('json_dta.*')
    return stream_df


def do_some_transformation(spark: SparkSession, kafka_df: DataFrame):
    kafka_df.writeStream \
        .format('kafka') \
        .foreachBatch(my_transformation_method) \
        .option('checkpointLocation', checkpoint_location) \
        .trigger(processingTime='10 minutes') \
        .start() \
        .awaitTermination()


def my_transformation_method(kafka_df: DataFrame, batch_id: int):
    base_delta = DeltaTable.forPath(spark, config['delta_path'])
    base_delta.alias("base") \
        .merge(source=kafka_df.alias("inc"), condition=build_update_condition(config['merge_keys'], config['inc_keys'])) \
        .whenMatchedUpdateAll() \
        .whenNotMatchedInsertAll() \
        .execute()
The problem I am facing here is with the method my_transformation_method.
Inside my_transformation_method I am performing a merge of my Kafka dataframe with my Delta table.
In order to read the base table data, I need to run this statement:
base_delta = DeltaTable.forPath(spark, config['delta_path'])
But the problem is that my_transformation_method, which is called by foreachBatch in do_some_transformation, can only receive two arguments, 1. the batch DataFrame and 2. the batch_id, as per the Spark Structured Streaming API.
I could make the Spark session object global, but I don't want to do that, as it doesn't appear to be the standard way.
Is there any way I can make the SparkSession object spark available to my_transformation_method when I call it from do_some_transformation?
Any help is much appreciated.
The DataFrame API provides a sparkSession method that can be used:
spark = kafka_df.sparkSession()
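Applied to the code in the question, a minimal sketch could look like the following; depending on your PySpark version, sparkSession may be exposed as a method or as a property, so adjust the call accordingly:

def my_transformation_method(kafka_df, batch_id):
    # recover the active session from the micro-batch DataFrame instead of using a global
    spark = kafka_df.sparkSession()  # or kafka_df.sparkSession if it is a property in your version
    base_delta = DeltaTable.forPath(spark, config['delta_path'])
    base_delta.alias("base") \
        .merge(source=kafka_df.alias("inc"), condition=build_update_condition(config['merge_keys'], config['inc_keys'])) \
        .whenMatchedUpdateAll() \
        .whenNotMatchedInsertAll() \
        .execute()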

How to write a streaming dataframe into another Kafka Topic after doing some transformations?

I am trying to read data from a Kafka topic, join it with another dataframe from a Hive table, and save the result to another Kafka topic.
Below is the code I have written.
# Returns a dataframe after reading the Kafka topic.
kafka_df = kafka_data(spark=spark, kafkaconfig=kafkaconfig, tableconfig=table_config, source_type='kafka', where_clause='', objectname='object_name')

# Write the dataframe returned from the above step into another Kafka topic.
write_batches(kafka_df)


def write_batches(kafka_df):
    table_config = po_header_config
    kafka_config = kafkaconfig
    jaas_config = kafka_config['jaas_config']
    oauth_client = f" oauth.client.id='{kafka_config['client_id']}'"
    oauth_secret = f" oauth.client.secret='{kafka_config['client_secret']}'"
    oauth_token_endpoint_uri = f" oauth.token.endpoint.uri='{kafka_config['endpoint_uri']}'"
    oauth_config = jaas_config + oauth_client + oauth_secret + oauth_token_endpoint_uri + " oauth.max.token.expiry.seconds='30000' ;"
    kafka_df.writeStream \
        .option('checkpointLocation', table_config['checkpoint_location']) \
        .option('kafka.bootstrap.servers', kafka_config['kafka_broker']) \
        .option('topic', kafka_config['topic_name']) \
        .format('kafka') \
        .foreachBatch(join_kafka_streams_final_table_test) \
        .outputMode("append") \
        .trigger(processingTime="300 seconds") \
        .start().awaitTermination()
def join_kafka_streams_final_table_test(kafka_df, batch_id):
    try:
        table_config = config
        filters = data_filter(kafka_df=kafka_df)
        query = f'select * from DB.TABLE where {filters}'
        main_df = spark.sql(query)
        print(f'Joining kafka dataframe with final_table table')
        joined_df = join_remove_duplicate_col(kafka_df=kafka_df, final_table=main_df, table_config=table_config)
    except Exception as error:
        print(f'Join failed with the exception: {error}')
        traceback.print_exc()
        print('Stopping the application')
        sys.exit(1)


def join_remove_duplicate_col(kafka_df, final_table: DataFrame, table_config: dict):
    try:
        df = kafka_df.join(final_table, on=table_config['join_keys'], how='left_outer')
        print('Join Successful.')
        repeated_columns = [c for c in kafka_df.columns if c in final_table.columns]
        for column in repeated_columns:
            df = df.drop(final_table[column])
        return df
    except Exception as error:
        print(f'Unable join kafka_df & final_table table with the exception: {error}')
        traceback.print_exc()
        sys.exit(1)


def data_filter(kafka_df):
    try:
        print('Preparing filters for final_table table')
        lst = []
        distinct_partitions = kafka_df.select('main_part', 'create_dt').withColumn('month_part', substring('create_dt', 1, 7)).drop('create_dt').distinct()
        filters = distinct_partitions.groupby('main_part').agg(F.concat_ws("', '", F.collect_list(distinct_partitions.month_part))).rdd.map(lambda row: (row[0], row[1])).collectAsMap()
        for key, value in filters.items():
            s = "'" + value + "'"
            lst.append(f"(super_main_part = '{key}' and month_part in ({s}))")
        datafilter = ' or '.join(lst)
        return datafilter
    except Exception as error:
        print(f'Unable to form filter for final_table table with the exception: {error}')
        traceback.print_exc()
        print('Stopping the application')
        sys.exit(1)
The problem here is that when I invoke the method write_batches with my Kafka dataframe, I don't see any print statements from the methods inside join_kafka_streams_final_table_test, which is executed by foreachBatch.
The stream just loads and does nothing.
Could anyone let me know if my syntax is in the right format? If not, what mistake did I make and how can I correct it?
Two things came to my mind:
In the current code, you are using format("kafka") and foreachBatch(join_kafka_streams_final_table_test) at the same time. Typically, you would only use one of them.
The method join_kafka_streams_final_table_test does not contain any action such as a write, hence it will never be executed; a sketch of what such an action could look like is shown right below.
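A minimal sketch of a foreachBatch function that ends with a batch write, reusing the helper methods and config names from the question (any kafka.sasl/security options your cluster needs would still have to be added, and spark is assumed to be in scope as in the original code):

def join_kafka_streams_final_table_test(kafka_df, batch_id):
    filters = data_filter(kafka_df=kafka_df)
    main_df = spark.sql(f'select * from DB.TABLE where {filters}')
    joined_df = join_remove_duplicate_col(kafka_df=kafka_df, final_table=main_df, table_config=config)
    joined_df.selectExpr("to_json(struct(*)) AS value") \
        .write \
        .format('kafka') \
        .option('kafka.bootstrap.servers', kafkaconfig['kafka_broker']) \
        .option('topic', kafkaconfig['topic_name']) \
        .save()  # the batch write is the action that makes each micro-batch execute; the Kafka sink expects a 'value' column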
Looking at the overall code, I really recommend getting familiar with the Structured Streaming Programming Guide. As I am not completely familiar with Python I can only guess what you are trying to achieve, but the programming model of a Structured Streaming application already allows you to handle DataFrames within a batch. So instead of explicitly calling foreachBatch you could do something like below:
table_config = po_header_config
kafka_config = kafkaconfig

# Returns a dataframe after reading the Kafka topic.
kafka_df = kafka_data(spark=spark, kafkaconfig=kafkaconfig, tableconfig=table_config, source_type='kafka', where_clause='', objectname='object_name')

table_config = config
filters = data_filter(kafka_df=kafka_df)
query = f'select * from DB.TABLE where {filters}'
main_df = spark.sql(query)
df = kafka_df.join(main_df, on=table_config['join_keys'], how='left_outer')

df.writeStream \
    .option('checkpointLocation', table_config['checkpoint_location']) \
    .option('kafka.bootstrap.servers', kafka_config['kafka_broker']) \
    .option('topic', kafka_config['topic_name']) \
    .format('kafka') \
    .outputMode("append") \
    .trigger(processingTime="300 seconds") \
    .start().awaitTermination()
Again, not completely familiar with the Python syntax, but I hope you get the idea.

How to see the dataframe in the console (equivalent of .show() for structured streaming)?

I'm trying to see what's coming in as my DataFrame.
Here is the Spark code:
from pyspark.sql import SparkSession
import pyspark.sql.functions as psf
import logging
import time

spark = SparkSession \
    .builder \
    .appName("Console Example") \
    .getOrCreate()

logging.info("started to listen to the host..")

lines = spark \
    .readStream \
    .format("socket") \
    .option("host", "127.0.0.1") \
    .option("port", 9999) \
    .load()

data = lines.selectExpr("CAST(value AS STRING)")
query1 = data.writeStream.format("console").start()
time.sleep(10)
query1.awaitTermination()
I am getting the progress reports, but obviously the input rows are 0 for each trigger:
2019-08-19 23:45:45 INFO MicroBatchExecution:54 - Streaming query made progress: {
  "id" : "a4b26eaf-1032-4083-9e42-a9f2f0426eb7",
  "runId" : "35c2b82a-191d-4998-9c98-17b24f5e3e9d",
  "name" : null,
  "timestamp" : "2019-08-20T06:45:45.458Z",
  "batchId" : 0,
  "numInputRows" : 0,
  "inputRowsPerSecond" : 0.0,
  "durationMs" : {
    "getOffset" : 0,
    "triggerExecution" : 0
  },
  "stateOperators" : [ ],
  "sources" : [ {
    "description" : "TextSocketSource[host: 127.0.0.1, port: 9999]",
    "startOffset" : null,
    "endOffset" : null,
    "numInputRows" : 0,
    "inputRowsPerSecond" : 0.0
  } ],
  "sink" : {
    "description" : "org.apache.spark.sql.execution.streaming.ConsoleSinkProvider@5f3e6f3"
  }
}
My TCP server is spitting some data out and I can see it in the console too, but I just want to make sure my Spark job is receiving anything by printing it out, which is difficult to do.
This is my TCP server code.
import socket
import sys
import csv
import time

port = 9999
server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server_socket.bind(('', port))
server_socket.listen(5)
connection_socket, addr = server_socket.accept()

file_path = "/Users/Downloads/youtube-new/USvideos.csv"
row_count = sum(1 for row in file_path)

with open(file_path, "r") as f:
    reader = csv.reader(f, delimiter="\t")
    while True:
        for i, line in enumerate(reader):
            try:
                print(line)
                data = line[0].encode('utf-8')
                connection_socket.send(data)
                time.sleep(2)
                if (row_count == i-1):
                    break
            except IndexError:
                print("Index error")

server_socket.close()
I can see the lines getting printed out, so I can at least say that the server has accepted a connection at localhost:9999, which is the host and port I'm using for the Spark job as well.
This is one of the data lines:
['8mhTWqWlQzU,17.15.11,"Wearing Online Dollar Store Makeup For A Week","Safiya Nygaard",22,2017-11-11T01:19:33.000Z,"wearing online dollar store makeup for a week"|"online dollar store makeup"|"dollar store makeup"|"daiso"|"shopmissa makeup"|"shopmissa haul"|"dollar store makeup haul"|"dollar store"|"shopmissa"|"foundation"|"concealer"|"eye primer"|"eyebrow pencil"|"eyeliner"|"bronzer"|"contour"|"face powder"|"lipstick"|"$1"|"$1 makeup"|"safiya makeup"|"safiya dollar store"|"safiya nygaard"|"safiya"|"safiya and tyler",2922523,119348,1161,6736,https://i.ytimg.com/vi/8mhTWqWlQzU/default.jpg,False,False,False,"I found this online dollar store called ShopMissA that sells all their makeup products for $1 and decided I had to try it out! So I replaced my entire everyday makeup routine with $1 makeup products, including foundation, concealer, eye primer, eyebrow pencil, eyeliner, bronzer, contour, face powder, and lipstick. What do you think? Would you try this?\\n\\nThis video is NOT sponsored!\\n\\nSafiya\'s Nextbeat: https://nextbeat.co/u/safiya\\nIG: https://www.instagram.com/safiyany/\\nTwitter: https://twitter.com/safiyajn\\nFacebook: https://www.facebook.com/safnygaard/\\n\\nAssistant Editor: Claire Wiley\\n\\nMUSIC\\nMind The Gap\\nvia Audio Network\\n\\nSFX\\nvia AudioBlocks"']
Everything is in the brackets (note that I'm actually sending line[0]).
from pyspark.sql import SparkSession
import pyspark.sql.functions as psf
import logging
import time

spark = SparkSession \
    .builder \
    .appName("Console Example") \
    .getOrCreate()

logging.info("started to listen to the host..")

lines = spark \
    .readStream \
    .format("socket") \
    .option("host", "127.0.0.1") \
    .option("port", 9999) \
    .load()

data = lines.selectExpr("CAST(value AS STRING)")
query1 = data.writeStream.queryName("counting").format("memory").outputMode("append").start()
for x in range(5):
    spark.sql("select * from counting").show()
    time.sleep(10)
Try this; it will show you the data just as the show() method does in Spark SQL. It will show you 5 sets of data, as we are looping five times.
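If you prefer to keep the console sink from the original code, a similar sketch (assuming the same data stream as above) is to let the query run for a while and then stop it:

query1 = data.writeStream.format("console").outputMode("append").start()
query1.awaitTermination(30)  # block for up to 30 seconds while micro-batches are printed to the driver console
query1.stop()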

Unable to read MS SQL table using pyspark in jupyter notebook?

import os
import sys

spark_path = 'C:/opt/spark/spark-2.4.1-bin-hadoop2.7'
os.environ['SPARK_HOME'] = spark_path
os.environ['HADOOP_HOME'] = spark_path
sys.path.append(spark_path + "/bin")
sys.path.append(spark_path + "/python")
sys.path.append(spark_path + "/python/pyspark/")
sys.path.append(spark_path + "/python/lib")
sys.path.append(spark_path + "/python/lib/pyspark.zip")
sys.path.append(spark_path + "/python/lib/py4j-0.9-src.zip")

from pyspark.sql import SparkSession

spark = SparkSession\
    .builder\
    .master('local[*]')\
    .appName('Connection-Test')\
    .config('spark.driver.extraClassPath', 'C:/Users/sqljdbc_4.2.8112.200_enu/sqljdbc_4.2/enu/jre8/sqljdbc42.jar')\
    .config('spark.executor.extraClassPath', 'C:/Users/sqljdbc_4.2.8112.200_enu/sqljdbc_4.2/enu/jre8/sqljdbc42.jar')\
    .getOrCreate()

sqlsUrl = 'jdbc:sqlserver://ip:port;database=dbname'
qryStr = """ (
SELECT *
FROM Table
) """

spark.read.format('jdbc')\
    .option('url', sqlsUrl)\
    .option('driver', 'com.microsoft.sqlserver.jdbc.SQLServerDriver')\
    .option('dbtable', qryStr)\
    .option("user", "user") \
    .option("password", "password") \
    .load().show()
An error occurred while calling o50.load. : com.microsoft.sqlserver.jdbc.SQLServerException: Incorrect syntax near the keyword 'WHERE'. at com.microsoft.sqlserver.jdbc.SQLServerException.makeFromDatabaseError(SQLServerException.java:217)
Try adding "as Table_Name" to the end of your query. Spark wraps whatever you pass as dbtable in an outer query such as SELECT * FROM ( ... ) WHERE 1=0 while resolving the schema, and SQL Server requires a derived table like this to have an alias, which is why the error mentions the WHERE keyword:
qryStr = """ (
SELECT *
FROM Table
) as Table """
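On Spark 2.4 and later (the question uses 2.4.1), an alternative sketch is to pass the statement through the query option instead of dbtable; Spark then adds the required alias itself. The connection details below are the placeholders from the question:

df = spark.read.format('jdbc') \
    .option('url', sqlsUrl) \
    .option('driver', 'com.microsoft.sqlserver.jdbc.SQLServerDriver') \
    .option('query', 'SELECT * FROM Table') \
    .option('user', 'user') \
    .option('password', 'password') \
    .load()
df.show()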

TypeError: 'Column' object is not callable when using 'case-when'

When I use func.when() in PySpark I am getting:
TypeError: 'Column' object is not callable
Below is the code which I have written:
import sys
from pyspark.sql.window import Window
from pyspark.sql import Row
import pyspark.sql.functions as func
from pyspark.sql import DataFrameStatFunctions as statFunc
from pyspark.sql.functions import coalesce, current_date, current_timestamp, lit, unix_timestamp, from_unixtime, \
    row_number, mean

a_df = sqlContext.table('opssup_dev_wrk_ct.wrk_ct_ods_batch_derived_extnd_stg2')
b_df = sqlContext.table('opssup_dev_wrk_ct.wrk_ct_sap_batch_specific_dates')
fdsi_df = sqlContext.table('opssup_dev_wrk_ct.wrk_ct_sap_batch_specific_dates')
ldsi_df = sqlContext.table('opssup_dev_wrk_ct.wrk_ct_sap_batch_specific_dates')
fds_df = sqlContext.table('opssup_dev_wrk_ct.wrk_ct_sap_batch_specific_dates')

temp22_df = a_df \
    .join(b_df, (a_df.batch_number==b_df.Batch_Number)) \
    .join(fdsi_df, (a_df.First_DSI_BATCH_NUMBER==fdsi_df.Batch_Number), "left_outer") \
    .join(ldsi_df, (a_df.Last_DSI_BATCH_NUMBER==ldsi_df.Batch_Number), "left_outer") \
    .join(fds_df, (a_df.First_DS_BATCH_NUMBER==fds_df.Batch_Number), "left_outer") \
    .select( \
        a_df.driving_batch_number, \
        a_df.batch_number, \
        b_df.Material_Group, \
        a_df.mfg_stage_code, \
        func.when(a_df.batch_number==a_df.First_DSI_BATCH_NUMBER, lit('0')) \
            .otherwisw(lit('')) \
            .alias('mfg_start_date') \
    )
I am not getting why that error comes up during execution.
I got the answer after struggling for more than an hour. I had simply misspelled otherwise; the offending snippet was:
func.when(a_df.batch_number==a_df.First_DSI_BATCH_NUMBER, lit('0')) \
    .otherwisw(lit('')) \
    .alias('mfg_start_date') \
)
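The corrected expression, for reference:
func.when(a_df.batch_number==a_df.First_DSI_BATCH_NUMBER, lit('0')) \
    .otherwise(lit('')) \
    .alias('mfg_start_date')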
