Errors when creating a Value and Timestamp dataframe with Azure Databricks - apache-spark

I'm not too familiar with Spark, but I'm forced to use it to consume some data. I've tried just about every syntax I could find to build a dataframe with a value and a timestamp that I can write to a database to track when I get updates from the data source. The errors are endless, I'm out of ideas, and I can't see why something this simple won't work. Below is a sample of the code I'm trying to get working:
sc = spark.sparkContext
df = sc.parallelize([[1,pyspark.sql.functions.current_timestamp()]]).toDF(("Value","CreatedAt"))
and this error doesn't really help
py4j.Py4JException: Method __getstate__([]) does not exist
---------------------------------------------------------------------------
Py4JError Traceback (most recent call last)
<command-1699228214903488> in <module>
29
30 sc = spark.sparkContext
---> 31 df = sc.parallelize([[1,pyspark.sql.functions.current_timestamp()]]).toDF(("Value","CreatedAt"))
/databricks/spark/python/pyspark/context.py in parallelize(self, c, numSlices)
557 return self._jvm.PythonParallelizeServer(self._jsc.sc(), numSlices)
558
--> 559 jrdd = self._serialize_to_jvm(c, serializer, reader_func, createRDDServer)
560
561 return RDD(jrdd, self, serializer)
/databricks/spark/python/pyspark/context.py in _serialize_to_jvm(self, data, serializer, reader_func, createRDDServer)
590 try:
591 try:
--> 592 serializer.dump_stream(data, tempFile)
593 finally:
594 tempFile.close()
I've also tried this
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc) # sc is the spark context
df = sqlContext.createDataFrame(
    [(current_timestamp(), '12a345')],
    ['CreatedAt', 'Value']  # the row header/column labels should be entered here
)
With the error
AssertionError: dataType <py4j.java_gateway.JavaMember object at 0x7f43d97c6ba8> should be an instance of <class 'pyspark.sql.types.DataType'>
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
<command-2294571935273349> in <module>
33 df = sqlContext.createDataFrame(
34 [( current_timestamp(), '12a345')],
---> 35 ['CreatedAt','Value'] # the row header/column labels should be entered here
36 )
37
/databricks/spark/python/pyspark/sql/context.py in createDataFrame(self, data, schema, samplingRatio, verifySchema)
305 Py4JJavaError: ...
306 """
--> 307 return self.sparkSession.createDataFrame(data, schema, samplingRatio, verifySchema)
308
309 @since(1.3)
/databricks/spark/python/pyspark/sql/session.py in createDataFrame(self, data, schema, samplingRatio, verifySchema)
815 rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio)
816 else:
--> 817 rdd, schema = self._createFromLocal(map(prepare, data), schema)
818 jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())

Well, I got something to work eventually. I couldn't get it to work with TimestampType(), though; Spark would flip out when inserting the data. I think that may be a runtime issue rather than a coding issue.
import adal
import datetime
from pyspark.sql.types import *
from pyspark.sql import SQLContext

# Set Access Token
access_token = token["accessToken"]

sqlContext = SQLContext(sc)  # sc is the spark context

schema = StructType([
    StructField("CreatedAt", StringType(), True),
    StructField("value", StringType(), True)
])

da = datetime.datetime.now().strftime("%m/%d/%Y %H:%M:%S")
df = sqlContext.createDataFrame(
    [(da, '12a345')], schema
)

df.write \
    .format("com.microsoft.sqlserver.jdbc.spark") \
    .option("url", url) \
    .option("dbtable", "dbo.RunStart") \
    .option("accessToken", access_token) \
    .option("databaseName", database_name) \
    .option("encrypt", "true") \
    .option("hostNameInCertificate", "*.database.windows.net") \
    .option("applicationintent", "ReadWrite") \
    .mode("append") \
    .save()
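For what it's worth, the root cause of the original error is that pyspark.sql.functions.current_timestamp() returns a Column expression, not a Python value, so it cannot be serialized into an RDD or placed inside a local row. Below is a minimal sketch of two alternatives that avoid that, assuming an active SparkSession named spark; the literal value and the df1/df2 names are just placeholders.
import datetime
from pyspark.sql.functions import current_timestamp
from pyspark.sql.types import StructType, StructField, StringType, TimestampType

# Option 1: pass plain Python values; a datetime.datetime maps cleanly
# to TimestampType when an explicit schema is supplied.
schema = StructType([
    StructField("Value", StringType(), True),
    StructField("CreatedAt", TimestampType(), True)
])
df1 = spark.createDataFrame([("12a345", datetime.datetime.now())], schema)

# Option 2: create the dataframe with the value only, then add the timestamp
# as a column expression, which is the context current_timestamp() is meant for.
df2 = spark.createDataFrame([("12a345",)], ["Value"]) \
    .withColumn("CreatedAt", current_timestamp())

df1.printSchema()
df2.printSchema()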

Related

returning an error while connecting snowflake to PySpark

I'm trying to connect a Snowflake database to PySpark, but it's returning an error. Below is the code:
import snowflake.connector
import os
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives.asymmetric import rsa
from cryptography.hazmat.primitives.asymmetric import dsa
from cryptography.hazmat.primitives import serialization
import pandas as pd
import numpy as np

def get_last_result_spark():
    # get result of the last sql query
    return spark.createDataFrame(cs.fetchall(), [x[0] for x in cs.description])

with open("key.p8", "rb") as key:  ## Read Note: Point num 2
    p_key = serialization.load_pem_private_key(
        key.read(),
        password=None,
        backend=default_backend())

pkb = p_key.private_bytes(
    encoding=serialization.Encoding.DER,
    format=serialization.PrivateFormat.PKCS8,
    encryption_algorithm=serialization.NoEncryption())

ctx = snowflake.connector.connect(
    user=''
    , account=''
    , authenticator=''
    , private_key=pkb
    , role=''
    , warehouse=''
    , database=''
    , schema=''
)
cs = ctx.cursor()

submit_query_spark('select * from table limit 3;')
df = display(get_last_result_spark())
# ctx.close()
df
It's returning an error as below
ValueError Traceback (most recent call last)
<timed exec> in <module>
<timed exec> in get_last_result_spark()
/projects/fmcganalytics/jupyter/conda/envs/default/lib/python3.7/site-packages/pyspark/sql/session.py in createDataFrame(self, data, schema, samplingRatio, verifySchema)
673 return super(SparkSession, self).createDataFrame(
674 data, schema, samplingRatio, verifySchema)
--> 675 return self._create_dataframe(data, schema, samplingRatio, verifySchema)
676
677 def _create_dataframe(self, data, schema, samplingRatio, verifySchema):
/projects/fmcganalytics/jupyter/conda/envs/default/lib/python3.7/site-packages/pyspark/sql/session.py in _create_dataframe(self, data, schema, samplingRatio, verifySchema)
698 rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio)
699 else:
--> 700 rdd, schema = self._createFromLocal(map(prepare, data), schema)
701 jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())
702 jdf = self._jsparkSession.applySchemaToPythonRDD(jrdd.rdd(), schema.json())
/projects/fmcganalytics/jupyter/conda/envs/default/lib/python3.7/site-packages/pyspark/sql/session.py in _createFromLocal(self, data, schema)
510
511 if schema is None or isinstance(schema, (list, tuple)):
--> 512 struct = self._inferSchemaFromList(data, names=schema)
513 converter = _create_converter(struct)
514 data = map(converter, data)
/projects/fmcganalytics/jupyter/conda/envs/default/lib/python3.7/site-packages/pyspark/sql/session.py in _inferSchemaFromList(self, data, names)
439 schema = reduce(_merge_type, (_infer_schema(row, names) for row in data))
440 if _has_nulltype(schema):
--> 441 raise ValueError("Some of types cannot be determined after inferring")
442 return schema
443
ValueError: Some of types cannot be determined after inferring
Can anyone let me know why it's returning this error? The same logic works fine when I write it with pandas, but with PySpark it fails. Can anyone please look into this?
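That ValueError usually means Spark could not infer a type for at least one column, typically because every sampled value in it was None. A minimal sketch of one way around it, with an explicit schema so nothing has to be inferred; the column names and types here are hypothetical and should be replaced with the actual columns the query returns:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

def get_last_result_spark_with_schema():
    # Hypothetical schema: replace COL_A / COL_B with the real column
    # names and types so all-NULL columns no longer break inference.
    schema = StructType([
        StructField("COL_A", StringType(), True),
        StructField("COL_B", IntegerType(), True)
    ])
    return spark.createDataFrame(cs.fetchall(), schema)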

IllegalArgumentException: A project ID is required for this service but could not be determined from the builder or the environment

I'm trying to connect a BigQuery dataset to Databricks and run a script using PySpark.
Procedures I've done:
I uploaded the BigQuery JSON API credentials file to DBFS in Databricks for connection access.
Then I added spark-bigquery-latest.jar to the cluster libraries and ran my script.
When I run this script, I don't face any error:
from pyspark.sql import SparkSession
spark = (
SparkSession.builder
.appName('bq')
.master('local[4]')
.config('parentProject', 'google-project-ID')
.config('spark.jars', 'dbfs:/FileStore/jars/jarlocation.jar') \
.getOrCreate()
)
df = spark.read.format("bigquery").option("credentialsFile", "/dbfs/FileStore/tables/bigqueryapi.json") \
.option("parentProject", "google-project-ID") \
.option("project", "Dataset-Name") \
.option("table","dataset.schema.tablename") \
.load()
df.show()
But instead of calling a single table in that schema, I tried to call all the tables under it using a query, like this:
from pyspark.sql import SparkSession
from google.cloud import bigquery

spark = (
    SparkSession.builder
    .appName('bq')
    .master('local[4]')
    .config('parentProject', 'google-project-ID')
    .config('spark.jars', 'dbfs:/FileStore/jars/jarlocation.jar')
    .getOrCreate()
)

client = bigquery.Client()
table_list = 'dataset.schema'
tables = client.list_tables(table_list)

# collect the table names (table_id) of every table in the dataset
tlist = []
for table in tables:
    tlist.append(table.table_id)

for i in tlist:
    sql_query = "select * from `dataset.schema." + i + "`"
    df = spark.read.format("bigquery").option("credentialsFile", "/dbfs/FileStore/tables/bigqueryapi.json") \
        .option("parentProject", "google-project-ID") \
        .option("project", "Dataset-Name") \
        .option("query", sql_query).load()
    df.show()
OR
This Script:
from pyspark.sql import SparkSession
spark = (
SparkSession.builder
.appName('bq')
.master('local[4]')
.config('parentProject', 'google-project-ID')
.config('spark.jars', 'dbfs:/FileStore/jars/jarlocation.jar') \
.getOrCreate()
)
sql_query = """select * from `dataset.schema.tablename`"""
df = spark.read.format("bigquery").option("credentialsFile", "/dbfs/FileStore/tables/bigqueryapi.json") \
.option("parentProject", "google-project-ID") \
.option("project", "Dataset-Name") \
.option("query", sql_query).load()
df.show()
I get this unusual Error:
IllegalArgumentException: A project ID is required for this service but could not be determined from the builder or the environment. Please set a project ID using the builder.
---------------------------------------------------------------------------
IllegalArgumentException Traceback (most recent call last)
<command-131090852> in <module>
35 .option("parentProject", "google-project-ID") \
36 .option("project", "Dataset-Name") \
---> 37 .option("query", sql_query).load()
38 #df.show()
39
/databricks/spark/python/pyspark/sql/readwriter.py in load(self, path, format, schema, **options)
182 return self._df(self._jreader.load(self._spark._sc._jvm.PythonUtils.toSeq(path)))
183 else:
--> 184 return self._df(self._jreader.load())
185
186 @since(1.4)
/databricks/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
1303 answer = self.gateway_client.send_command(command)
1304 return_value = get_return_value(
-> 1305 answer, self.gateway_client, self.target_id, self.name)
1306
1307 for temp_arg in temp_args:
/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
131 # Hide where the exception came from that shows a non-Pythonic
132 # JVM exception message.
--> 133 raise_from(converted)
134 else:
135 raise
/databricks/spark/python/pyspark/sql/utils.py in raise_from(e)
IllegalArgumentException: A project ID is required for this service but could not be determined from the builder or the environment. Please set a project ID using the builder.
It does recognize my project ID when I call it as a table, but when I run it as a query I get this error.
I tried to figure it out and went through many sites for an answer, but couldn't get a clear one.
Help is much appreciated... Thanks in advance...
Can you avoid using queries and just use the table option?
from pyspark.sql import SparkSession
from google.cloud import bigquery

spark = (
    SparkSession.builder
    .appName('bq')
    .master('local[4]')
    .config('parentProject', 'google-project-ID')
    .config('spark.jars', 'dbfs:/FileStore/jars/jarlocation.jar')
    .getOrCreate()
)

client = bigquery.Client()
table_list = 'dataset.schema'
tables = client.list_tables(table_list)

tlist = []
for table in tables:
    tlist.append(table.table_id)

for i in tlist:
    df = spark.read.format("bigquery").option("credentialsFile", "/dbfs/FileStore/tables/bigqueryapi.json") \
        .option("parentProject", "google-project-ID") \
        .option("project", "Dataset-Name") \
        .option("table", "dataset.schema." + str(i)) \
        .load()
    df.show()
In my case I had the same exception, but it was because I wasn't specifying the config value parentProject, which is the ID of the BigQuery project I'm connecting to.
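A minimal sketch of that fix, reusing the options already shown in the thread; the project ID, credentials path, and table name are placeholders:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('bq')
    .config('parentProject', 'my-gcp-project-id')  # placeholder: the BigQuery project ID you bill against
    .getOrCreate()
)

df = (spark.read.format("bigquery")
    .option("credentialsFile", "/dbfs/FileStore/tables/bigqueryapi.json")
    .option("parentProject", "my-gcp-project-id")  # project ID the connector should use for its jobs
    .option("table", "dataset.schema.tablename")
    .load())
df.show()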

Insert/Upsert/Delete (CDC) PySpark Structured Streaming

Let's suppose that we have an initial file like this:
Id | Number | ChangeMode
1  | 10     | insert
2  | 20     | insert
3  | 30     | insert
4  | 40     | insert
5  | 50     | insert
My table in mariaDB should be something like this:
Id | Number
1  | 10
2  | 20
3  | 30
4  | 40
5  | 50
Then another file like this arrives in the folder:
Id | Number | ChangeMode
1  | 123    | upsert
2  | 456    | upsert
3  | 30     | remove
And the table should be like this:
Id | Number
1  | 123
2  | 456
4  | 40
5  | 50
How can I use the "ChangeMode" column as a reference to tell Spark when to insert/update/delete?
I already wrote this part of the code, but I don't know how to proceed from here, and I also don't know how to implement the delete.
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
spark = (SparkSession
.builder
.appName("Spark Structured Streaming CDC")
.config("spark.driver.extraClassPath", "E:\\pyspark_projects\\mariadb-java-client-2.7.1.jar")
.getOrCreate())
streamingSchema = StructType([
StructField("Id", IntegerType(),True),
StructField("Number", IntegerType(),True),
StructField("ChangeMode", StringType(),True),
])
streamingDF = (spark.readStream
.format("csv")
.option("sep", "|")
.schema(streamingSchema)
.csv("E:\\pyspark_projects\\stream_cdc\\files\\input\\"))
db_target_properties = {"user":"root", "password":"root", "driver":"org.mariadb.jdbc.Driver"}
db_target_url = "jdbc:mariadb://127.0.0.1:3306/projects"
streamingInsert = streamingDF.where("ChangeMode == 'insert'")
streamingUpsert = streamingDF.where("ChangeMode == 'upsert'")
def insert(df, epoch_id):
    streamingInsert.write.jdbc(url=db_target_url, table="cdc", mode="append", properties=db_target_properties)
    pass

def upsert(df, epoch_id):
    streamingUpsert.write.jdbc(url=db_target_url, table="cdc", mode="update", properties=db_target_properties)
    pass

queryInsert = streamingInsert.writeStream.foreachBatch(insert).start()
queryUpdate = streamingUpsert.writeStream.foreachBatch(upsert).start()

spark.streams.awaitAnyTermination()
I'm having the following error:
py4j.Py4JException: An exception was raised by the Python Proxy. Return Message: Traceback (most recent call last):
File "C:\Spark\python\lib\py4j-0.10.9-src.zip\py4j\java_gateway.py", line 2442, in _call_proxy
return_value = getattr(self.pool[obj_id], method)(*params)
File "C:\Spark\python\pyspark\sql\utils.py", line 207, in call
raise e
File "C:\Spark\python\pyspark\sql\utils.py", line 204, in call
self.func(DataFrame(jdf, self.sql_ctx), batch_id)
File "main.py", line 32, in insert
streamingInsert.write.jdbc(url=db_target_url, table="cdc", mode="append", properties=db_target_properties)
File "C:\Spark\python\pyspark\sql\dataframe.py", line 231, in write
return DataFrameWriter(self)
File "C:\Spark\python\pyspark\sql\readwriter.py", line 645, in __init__
self._jwrite = df._jdf.write()
File "C:\Spark\python\lib\py4j-0.10.9-src.zip\py4j\java_gateway.py", line 1305, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "C:\Spark\python\pyspark\sql\utils.py", line 134, in deco
raise_from(converted)
File "<string>", line 3, in raise_from
pyspark.sql.utils.AnalysisException: 'write' can not be called on streaming Dataset/DataFrame;
at py4j.Protocol.getReturnValue(Protocol.java:476)
at py4j.reflection.PythonProxyHandler.invoke(PythonProxyHandler.java:108)
at com.sun.proxy.$Proxy17.call(Unknown Source)
at org.apache.spark.sql.execution.streaming.sources.PythonForeachBatchHelper$.$anonfun$callForeachBatch$1(ForeachBatchSink.scala:56)
at org.apache.spark.sql.execution.streaming.sources.PythonForeachBatchHelper$.$anonfun$callForeachBatch$1$adapted(ForeachBatchSink.scala:56)
at org.apache.spark.sql.execution.streaming.sources.ForeachBatchSink.addBatch(ForeachBatchSink.scala:36)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runBatch$16(MicroBatchExecution.scala:572)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runBatch$15(MicroBatchExecution.scala:570)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken(ProgressReporter.scala:352)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken$(ProgressReporter.scala:350)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:69)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runBatch(MicroBatchExecution.scala:570)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runActivatedStream$2(MicroBatchExecution.scala:223)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken(ProgressReporter.scala:352)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken$(ProgressReporter.scala:350)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:69)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runActivatedStream$1(MicroBatchExecution.scala:191)
at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:57)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runActivatedStream(MicroBatchExecution.scala:185)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:334)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:245)
If anyone knows another method of doing the same, please let me know.
I found a way to do it, using another module to write to MariaDB. For insert/update I only need one command, and for delete I use a separate command.
Hope it helps someone in the future!
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
import mariadb

spark = (SparkSession
    .builder
    .appName("Spark Structured Streaming CDC")
    .getOrCreate())

streamingSchema = StructType([
    StructField("Id", IntegerType(), True),
    StructField("Number", IntegerType(), True),
    StructField("ChangeMode", StringType(), True)
])

streamingDF = (spark.readStream
    .format("csv")
    .option("sep", "|")
    .schema(streamingSchema)
    .csv("E:\\pyspark_projects\\stream_cdc\\files\\input\\"))

class RowWriter:
    def open(self, partition_id, epoch_id):
        print("Opened %d, %d" % (partition_id, epoch_id))
        return True

    def process(self, row):
        conn = mariadb.connect(
            user="root",
            password="root",
            host="127.0.0.1",
            port=3306,
            database="projects"
        )
        cur = conn.cursor()
        # Insert and update go through one statement; ON DUPLICATE KEY UPDATE handles the upsert.
        # Note: the sample files above use 'upsert'/'remove', so adjust these literals to match your ChangeMode values.
        if row[2] == 'insert' or row[2] == 'update':
            cur.execute("INSERT INTO cdc (Id,Number) VALUES (" + str(row[0]) + ", " + str(row[1]) + ") ON DUPLICATE KEY UPDATE Number = " + str(row[1]) + "")
        if row[2] == 'delete':
            cur.execute("DELETE FROM cdc WHERE Id = " + str(row[0]) + "")
        conn.commit()
        conn.close()

    def close(self, error):
        print("Closed with error: %s" % str(error))

query = (streamingDF.writeStream
    .foreach(RowWriter())
    .option("checkpointLocation", "E:\\pyspark_projects\\stream_cdc\\files\\checkpoint")
    .start())

query.awaitTermination()
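For reference, the AnalysisException in the original attempt ("'write' can not be called on streaming Dataset/DataFrame") came from writing streamingInsert/streamingUpsert (still streaming DataFrames) inside the callbacks, instead of the micro-batch DataFrame that foreachBatch passes in. A minimal sketch of that variant, assuming the db_target_url and db_target_properties from the question; the function name and checkpoint path are placeholders:
def write_batch(df, epoch_id):
    # df is a static micro-batch DataFrame here, so .write is allowed.
    inserts = df.where("ChangeMode = 'insert'").select("Id", "Number")
    inserts.write.jdbc(url=db_target_url, table="cdc", mode="append",
                       properties=db_target_properties)
    # Upserts and deletes still need row-level SQL (or a staging table plus an
    # ON DUPLICATE KEY / MERGE statement), since the plain JDBC writer only
    # appends or overwrites.

query = (streamingDF.writeStream
    .foreachBatch(write_batch)
    .option("checkpointLocation", "E:\\pyspark_projects\\stream_cdc\\files\\checkpoint_batch")
    .start())
query.awaitTermination()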

Read csv using pyspark

I am new to Spark and I am trying to read a CSV file using PySpark. I referred to PySpark How to read CSV into Dataframe, and manipulate it, Get CSV to Spark dataframe, and many more. I tried to read it in two ways:
1
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.conf import SparkConf
sc = SparkContext.getOrCreate()
df = spark.read.csv('D:/Users/path/csv/test.csv')
df.show()
2
import pyspark
from pyspark.sql import SQLContext

sc = pyspark.SparkContext()
sql = SQLContext(sc)
df = (sql.read
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .load("D:/Users/path/csv/test.csv"))
df.show()
Neither of them works. I am getting the following error:
Py4JJavaError Traceback (most recent call last)
<ipython-input-28-c6263cc7dab9> in <module>()
4
5 sc = SparkContext.getOrCreate()
----> 6 df = spark.read.csv('D:/Users/path/csv/test.csv')
7 df.show()
8
~\opt\spark\spark-2.1.0-bin-hadoop2.7\python\pyspark\sql\readwriter.py in csv(self, path, schema, sep, encoding, quote, escape, comment, header, inferSchema, ignoreLeadingWhiteSpace, ignoreTrailingWhiteSpace, nullValue, nanValue, positiveInf, negativeInf, dateFormat, timestampFormat, maxColumns, maxCharsPerColumn, maxMalformedLogPerPartition, mode)
378 if isinstance(path, basestring):
379 path = [path]
--> 380 return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))
381
382 @since(1.5)
~\opt\spark\spark-2.1.0-bin-hadoop2.7\python\lib\py4j-0.10.4-src.zip\py4j\java_gateway.py in __call__(self, *args)
1131 answer = self.gateway_client.send_command(command)
1132 return_value = get_return_value(
-> 1133 answer, self.gateway_client, self.target_id, self.name)
1134
1135 for temp_arg in temp_args:
~\opt\spark\spark-2.1.0-bin-hadoop2.7\python\pyspark\sql\utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
~\opt\spark\spark-2.1.0-bin-hadoop2.7\python\lib\py4j-0.10.4-src.zip\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
317 raise Py4JJavaError(
318 "An error occurred while calling {0}{1}{2}.\n".
--> 319 format(target_id, ".", name), value)
320 else:
321 raise Py4JError(
Py4JJavaError: An error occurred while calling o663.csv.
: java.util.ServiceConfigurationError: org.apache.spark.sql.sources.DataSourceRegister: Provider org.apache.spark.sql.hive.execution.HiveFileFormat not found
at java.util.ServiceLoader.fail(ServiceLoader.java:239)
at java.util.ServiceLoader.access$300(ServiceLoader.java:185)
at java.util.ServiceLoader$LazyIterator.nextService(ServiceLoader.java:372)
at java.util.ServiceLoader$LazyIterator.next(ServiceLoader.java:404)
at java.util.ServiceLoader$1.next(ServiceLoader.java:480)
at scala.collection.convert.Wrappers$JIteratorWrapper.next(Wrappers.scala:43)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
I don't know why it is throwing a Hive exception: Py4JJavaError: An error occurred while calling o663.csv.
: java.util.ServiceConfigurationError: org.apache.spark.sql.sources.DataSourceRegister: Provider org.apache.spark.sql.hive.execution.HiveFileFormat not found. How do I resolve this HiveFileFormat not found error?
Can anyone guide me?
Have you tried sqlContext.read.csv? This is how I read CSVs in Spark 2.1:
from pyspark import sql, SparkConf, SparkContext
conf = SparkConf().setAppName("Read_CSV")
sc = SparkContext(conf=conf)
sqlContext = sql.SQLContext(sc)
df = sqlContext.read.csv("path/to/data")
df.show()
First of all, the system needs to recognize the Spark context, created with the following commands:
from pyspark import SparkConf, SparkContext
sc = SparkContext()
After that, the SQL library has to be introduced to the system like this:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
and finally you can read your CSV by the following command:
df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('path/to/your/file.csv')
Since SQLContext is deprecated as of PySpark 3.0.1, use a SparkSession to import a CSV file into PySpark:
from pyspark.sql import SparkSession
spark = SparkSession \
.builder \
.appName("Python") \
.getOrCreate()
df = spark.read.csv("/path/to/file/csv")
df.show()
Try specifying a local master by creating a configuration object. This removes any doubt about Spark trying to access Hadoop or anything else, as mentioned in the comments.
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

sc.stop()
conf = SparkConf().setMaster('local[*]')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
If this is not working, then don't use sqlContext to read the file; create a SparkSession and try spark.read.csv("path/filename.csv") instead.
Also, it is best if you use Spark/Hadoop with a Linux operating system as it is a lot simpler in those systems.
The error most likely occurs because you are trying to access a local file.
See below for how you should access it:
#Local File
spark.read.option("header","true").option("inferSchema","true").csv("file:///path")
#HDFS file
spark.read.option("header","true").option("inferSchema","true").csv("/path")
.csv(<path>) comes last.

Querying a spark streaming application from spark-shell (pyspark)

I am following this example in the pyspark console and everything works perfectly.
After that I wrote it as a PySpark application as follows:
# -*- coding: utf-8 -*-
import sys
import click
import logging

from pyspark.sql import SparkSession
from pyspark.sql.types import *

@click.command()
@click.option('--master')
def most_idiotic_bi_query(master):
    spark = SparkSession \
        .builder \
        .master(master) \
        .appName("stream-test") \
        .getOrCreate()

    spark.sparkContext.setLogLevel('ERROR')

    some_schema = ....  # Schema removed

    some_stream = spark \
        .readStream \
        .option("sep", ",") \
        .schema(some_schema) \
        .option("maxFilesPerTrigger", 1) \
        .csv("/data/some_stream", header=True)

    streaming_counts = (
        some_stream.groupBy(some_stream.field_1).count()
    )

    query = streaming_counts.writeStream \
        .format("memory") \
        .queryName("counts") \
        .outputMode("complete") \
        .start()

    query.awaitTermination()

if __name__ == "__main__":
    logging.getLogger("py4j").setLevel(logging.ERROR)
    most_idiotic_bi_query()
The app is executed as:
spark-submit test_stream.py --master spark://master:7077
Now, if I open a new Spark driver in another terminal:
pyspark --master spark://master:7077
And try to run:
spark.sql("select * from counts")
It fails with:
During handling of the above exception, another exception occurred:
AnalysisExceptionTraceback (most recent call last)
<ipython-input-3-732b22f02ef6> in <module>()
----> 1 spark.sql("select * from id_counts").show()
/usr/spark-2.0.2/python/pyspark/sql/session.py in sql(self, sqlQuery)
541 [Row(f1=1, f2=u'row1'), Row(f1=2, f2=u'row2'), Row(f1=3, f2=u'row3')]
542 """
--> 543 return DataFrame(self._jsparkSession.sql(sqlQuery), self._wrapped)
544
545 #since(2.0)
/usr/local/lib/python3.4/dist-packages/py4j-0.10.4-py3.4.egg/py4j/java_gateway.py in __call__(self, *args)
1131 answer = self.gateway_client.send_command(command)
1132 return_value = get_return_value(
-> 1133 answer, self.gateway_client, self.target_id, self.name)
1134
1135 for temp_arg in temp_args:
/usr/spark-2.0.2/python/pyspark/sql/utils.py in deco(*a, **kw)
67 e.java_exception.getStackTrace()))
68 if s.startswith('org.apache.spark.sql.AnalysisException: '):
---> 69 raise AnalysisException(s.split(': ', 1)[1], stackTrace)
70 if s.startswith('org.apache.spark.sql.catalyst.analysis'):
71 raise AnalysisException(s.split(': ', 1)[1], stackTrace)
AnalysisException: 'Table or view not found: counts; line 1 pos 14'
I don't understand what is happening.
This is an expected behavior. If you check the documentation for memory sink:
The output is stored in memory as an in-memory table. Both, Append and Complete output modes, are supported. This should be used for debugging purposes on low data volumes as the entire output is collected and stored in the driver’s memory. Hence, use it with caution.
As you can see, the memory sink doesn't create a persistent table or a global temporary view, but a local structure limited to the driver. Hence it cannot be queried from another Spark application.
So the memory output has to be queried from the driver in which it is written. For example, you could mimic console mode as shown below.
A dummy writer:
import pandas as pd
import numpy as np
import tempfile
import shutil

def producer(path):
    temp_path = tempfile.mkdtemp()

    def producer(i):
        df = pd.DataFrame({
            "group": np.random.randint(10, size=1000)
        })
        df["val"] = (
            np.random.randn(1000) +
            np.random.random(1000) * df["group"] +
            np.random.random(1000) * i % 7
        )
        f = tempfile.mktemp(dir=temp_path)
        df.to_csv(f, index=False)
        shutil.move(f, path)

    return producer
Spark application:
from pyspark.sql.types import IntegerType, DoubleType, StructType, StructField
schema = StructType([
StructField("group", IntegerType()),
StructField("val", DoubleType())
])
path = tempfile.mkdtemp()
query_name = "foo"
stream = (spark.readStream
.schema(schema)
.format("csv")
.option("header", "true")
.load(path))
query = (stream
.groupBy("group")
.avg("val")
.writeStream
.format("memory")
.queryName(query_name)
.outputMode("complete")
.start())
And some events:
from rx import Observable
timer = Observable.timer(5000, 5000)
timer.subscribe(producer(path))
timer.skip(1).subscribe(lambda *_: spark.table(query_name).show())
query.awaitTermination()
