pyspark to read data from sql server - apache-spark

I'm trying to read data from SQL Server using pyspark. The code below works fine when executed with the following command (where I'm passing the sqljdbc driver path), but it fails when I try to run it from the PyCharm IDE (on Windows).
spark-submit --driver-class-path C:\drivers\sqljdbc_6.0.8112.100_enu\sqljdbc_6.0\enu\jre8\sqljdbc42.jar ReadSQLServerData.py
How do I include or set the driver path when running the same code through the PyCharm IDE?
Code:
from pyspark.sql import SQLContext, Row
from pyspark import SparkConf, SparkContext
conf = SparkConf().setAppName("ReadSQLServerData")
sc = SparkContext(conf=conf)
query = "(SELECT top 10 * from users) as users"
sqlctx = SQLContext(sc)
df = sqlctx.read.format("jdbc").options(url="jdbc:sqlserver://mssqlserver:1433;database=user_management;user=pyspark;password=pyspark", dbtable=query).load()
Exception:
Traceback (most recent call last):
File "H:/Mine/OneDrive/Python/PySpark01/ReadSQLServerData.py", line 9, in <module>
df = sqlctx.read.format("jdbc").options(url="jdbc:sqlserver://mssqlserver:1433;database=user_management;user=pyspark;password=pyspark", dbtable=query).load()
File "C:\spark\python\pyspark\sql\readwriter.py", line 155, in load
return self._df(self._jreader.load())
File "C:\spark\python\lib\py4j-0.10.4-src.zip\py4j\java_gateway.py", line 1133, in __call__
File "C:\spark\python\pyspark\sql\utils.py", line 63, in deco
return f(*a, **kw)
File "C:\spark\python\lib\py4j-0.10.4-src.zip\py4j\protocol.py", line 319, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o27.load.
: java.sql.SQLException: No suitable driver
at java.sql.DriverManager.getDriver(DriverManager.java:315)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions$$anonfun$7.apply(JDBCOptions.scala:84)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions$$anonfun$7.apply(JDBCOptions.scala:84)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:83)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:34)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:32)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:330)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:152)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:125)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)

Not sure if you figured this out, but I thought I could help others.
You have to set the driver class path, and you can pass it in as a config option like below:
spark = SparkSession \
.builder \
.appName("Python Spark SQL basic example") \
.config("spark.driver.extraClassPath","/Users/Desktop/drivers/sqljdbc42.jar") \
.getOrCreate()
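The same idea works from PyCharm on Windows; here is a minimal sketch under that assumption, reusing the jar path and connection details from the question (adjust them to your machine). Naming the driver class explicitly also helps Spark avoid the "No suitable driver" lookup:
from pyspark.sql import SparkSession

# Put the SQL Server JDBC jar on the driver classpath before the session is created.
spark = SparkSession.builder \
    .appName("ReadSQLServerData") \
    .config("spark.driver.extraClassPath",
            r"C:\drivers\sqljdbc_6.0.8112.100_enu\sqljdbc_6.0\enu\jre8\sqljdbc42.jar") \
    .getOrCreate()

query = "(SELECT top 10 * from users) as users"

# Passing the driver class name explicitly avoids relying on DriverManager auto-detection.
df = spark.read.format("jdbc") \
    .option("url", "jdbc:sqlserver://mssqlserver:1433;database=user_management") \
    .option("user", "pyspark") \
    .option("password", "pyspark") \
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
    .option("dbtable", query) \
    .load()
Note that spark.driver.extraClassPath only takes effect if it is set before the driver JVM starts, which is why it belongs in the builder (or spark-defaults.conf) rather than being changed on a running session.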

Related

read a text file from S3 into a Spark df: UnsupportedOperationException

I am trying to read a text file from on-prem S3-compatible object storage using Spark, and I am getting an error stating: UnsupportedOperationException. I am unsure what this is pointing to, and I have tried to adjust the code thinking maybe it was the spark.read command. I have tried both read.text and read.csv, which should work, but they result in the same error. The full stack trace is below along with the code:
Code being used:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
.appName("s3reader") \
.getOrCreate()
sc = spark.sparkContext
sc._jsc.hadoopConfiguration().set("fs.s3a.path.style.access", "true")
sc._jsc.hadoopConfiguration().set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
sc._jsc.hadoopConfiguration().set("fs.s3a.access.key","xxxxxxxxxxxx")
sc._jsc.hadoopConfiguration().set("fs.s3a.secret.key", "xxxxxxxxxxxxxx")
sc._jsc.hadoopConfiguration().set("fs.s3a.connection.ssl.enabled", "true")
df = spark.read.text("https://s3a.us-east-1.xxxx.xxxx.xxxx.com/bronze/xxxxxxx/test.txt")
print(df)
Stack trace:
Traceback (most recent call last):
File "/home/cloud/sparks3test.py", line 19, in <module>
df = spark.read.text("https://s3a.us-east-1.tpavcps3ednrg1.vici.verizon.com/bronze/CoreMetrics/test.txt")
File "/usr/local/bin/spark-3.1.2-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/sql/readwriter.py", line 516, in text
File "/usr/local/bin/spark-3.1.2-bin-hadoop3.2/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1304, in __call__
File "/usr/local/bin/spark-3.1.2-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/sql/utils.py", line 111, in deco
File "/usr/local/bin/spark-3.1.2-bin-hadoop3.2/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py", line 326, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o31.text.
: java.lang.UnsupportedOperationException
at org.apache.hadoop.fs.http.AbstractHttpFileSystem.listStatus(AbstractHttpFileSystem.java:91)
at org.apache.hadoop.fs.http.HttpsFileSystem.listStatus(HttpsFileSystem.java:23)
at org.apache.spark.util.HadoopFSUtils$.listLeafFiles(HadoopFSUtils.scala:225)
at org.apache.spark.util.HadoopFSUtils$.$anonfun$parallelListLeafFilesInternal$1(HadoopFSUtils.scala:95)
at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:238)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at scala.collection.TraversableLike.map(TraversableLike.scala:238)
at scala.collection.TraversableLike.map$(TraversableLike.scala:231)
at scala.collection.AbstractTraversable.map(Traversable.scala:108)
at org.apache.spark.util.HadoopFSUtils$.parallelListLeafFilesInternal(HadoopFSUtils.scala:85)
at org.apache.spark.util.HadoopFSUtils$.parallelListLeafFiles(HadoopFSUtils.scala:69)
at org.apache.spark.sql.execution.datasources.InMemoryFileIndex$.bulkListLeafFiles(InMemoryFileIndex.scala:158)
at org.apache.spark.sql.execution.datasources.InMemoryFileIndex.listLeafFiles(InMemoryFileIndex.scala:131)
at org.apache.spark.sql.execution.datasources.InMemoryFileIndex.refresh0(InMemoryFileIndex.scala:94)
at org.apache.spark.sql.execution.datasources.InMemoryFileIndex.<init>(InMemoryFileIndex.scala:66)
at org.apache.spark.sql.execution.datasources.DataSource.createInMemoryFileIndex(DataSource.scala:581)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:417)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:325)
at org.apache.spark.sql.DataFrameReader.$anonfun$load$3(DataFrameReader.scala:307)
at scala.Option.getOrElse(Option.scala:189)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:307)
at org.apache.spark.sql.DataFrameReader.text(DataFrameReader.scala:944)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:566)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.base/java.lang.Thread.run(Thread.java:829)
Try reading the file from S3 like below:
s3a://bucket/bronze/xxxxxxx/test.txt
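In other words, switch from an https:// URL to the s3a:// scheme and point the connector at your endpoint separately. A rough sketch under that assumption (bucket name and endpoint below are placeholders, not values from the original post):
# Tell the S3A connector where the on-prem, S3-compatible service lives.
sc._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "https://s3.us-east-1.example.com")

# Use the s3a:// scheme so Spark picks org.apache.hadoop.fs.s3a.S3AFileSystem
# instead of the read-only HTTP(S) filesystem that throws UnsupportedOperationException.
df = spark.read.text("s3a://bucket/bronze/xxxxxxx/test.txt")
df.show(5, truncate=False)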

Logon denied error when loading data from Oracle to pyspark

Please note that I am new to pySpark, and feel free to let me know if I am missing any detail.
Running on Windows 10, with python3.7 installed
Command being used to run pyspark: pyspark --jars "C:\spark\spark-2.4.5-bin-hadoop2.7\jars\ojdbc6.jar"
Code that I am trying to execute in pyspark shell:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
sqlctx = SQLContext(sc)
with open("new1", "r") as f:
    query = f.read()
df = sqlctx.read.format("jdbc").options(url="jdbc:oracle:thin:#host:port:sid",
                                        driver="oracle.jdbc.driver.OracleDriver", dbtable=query).load()
I am pretty sure the URL is correct; the Windows login that I'm using has access to the database, as it works fine with cx_Oracle and I can access the DB using a PL/SQL client.
Error:
File "<stdin>", line 1, in <module>
File "C:\spark\spark-2.4.5-bin-hadoop2.7\python\pyspark\sql\readwriter.py", line 172, in load
return self._df(self._jreader.load())
File "C:\spark\spark-2.4.5-bin-hadoop2.7\python\lib\py4j-0.10.7-src.zip\py4j\java_gateway.py", line 1257, in __call__
File "C:\spark\spark-2.4.5-bin-hadoop2.7\python\pyspark\sql\utils.py", line 63, in deco
return f(*a, **kw)
File "C:\spark\spark-2.4.5-bin-hadoop2.7\python\lib\py4j-0.10.7-src.zip\py4j\protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o39.load.
: java.sql.SQLException: ORA-01017: invalid username/password; logon denied
at oracle.jdbc.driver.T4CTTIoer.processError(T4CTTIoer.java:447)
at oracle.jdbc.driver.T4CTTIoer.processError(T4CTTIoer.java:389)
at oracle.jdbc.driver.T4CTTIoer.processError(T4CTTIoer.java:382)
at oracle.jdbc.driver.T4CTTIfun.processError(T4CTTIfun.java:675)
at oracle.jdbc.driver.T4CTTIoauthenticate.processError(T4CTTIoauthenticate.java:448)
at oracle.jdbc.driver.T4CTTIfun.receive(T4CTTIfun.java:513)
at oracle.jdbc.driver.T4CTTIfun.doRPC(T4CTTIfun.java:227)
at oracle.jdbc.driver.T4CTTIoauthenticate.doOAUTH(T4CTTIoauthenticate.java:383)
at oracle.jdbc.driver.T4CTTIoauthenticate.doOAUTH(T4CTTIoauthenticate.java:776)
at oracle.jdbc.driver.T4CConnection.logon(T4CConnection.java:432)
at oracle.jdbc.driver.PhysicalConnection.<init>(PhysicalConnection.java:553)
at oracle.jdbc.driver.T4CConnection.<init>(T4CConnection.java:254)
at oracle.jdbc.driver.T4CDriverExtension.getConnection(T4CDriverExtension.java:32)
at oracle.jdbc.driver.OracleDriver.connect(OracleDriver.java:528)
at org.apache.spark.sql.execution.datasources.jdbc.DriverWrapper.connect(DriverWrapper.scala:45)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$createConnectionFactory$1.apply(JdbcUtils.scala:63)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$createConnectionFactory$1.apply(JdbcUtils.scala:54)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD$.resolveTable(JDBCRDD.scala:56)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRelation$.getSchema(JDBCRelation.scala:210)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:35)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:318)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:223)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:167)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
at java.lang.reflect.Method.invoke(Unknown Source)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Unknown Source)
This is an example of how to access Oracle from Spark, where you pass the user and password separately.
See read-data-from-oracle-database-with-apache-spark.
myDF = spark.read \
.format("jdbc") \
.option("url", "jdbc:oracle:thin:username/password#//hostname:portnumber/SID") \
.option("dbtable", "hr.emp") \
.option("user", "db_user_name") \
.option("password", "password") \
.option("driver", "oracle.jdbc.driver.OracleDriver") \
.load()
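Applied to the setup in the question (file name, placeholder host/port/SID, and credentials as in the original post, but with @ rather than # in the thin URL), that would look roughly like:
from pyspark.sql import SQLContext

sqlctx = SQLContext(sc)

# The file is assumed to already contain either a table name or a "(subquery) alias" expression.
with open("new1", "r") as f:
    query = f.read()

df = sqlctx.read.format("jdbc") \
    .option("url", "jdbc:oracle:thin:@host:port:sid") \
    .option("driver", "oracle.jdbc.driver.OracleDriver") \
    .option("dbtable", query) \
    .option("user", "db_user_name") \
    .option("password", "db_password") \
    .load()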

pyspark MQTT structured streaming with apache bahir

I'm using spark 2.4 and I've run pyspark like this:
./bin/pyspark --packages org.apache.bahir:spark-sql-streaming-mqtt_2.11:2.3.2
pyspark runs successfully. (But when I ran it with spark-sql-streaming-mqtt_2.11:2.4.0-SNAPSHOT, I got an error.)
I'm trying to get data from an MQTT broker using structured streaming, so I've run this:
>>> from pyspark.sql import SparkSession
>>> from pyspark.sql.functions import explode
>>> from pyspark.sql.functions import split
>>> spark = SparkSession \
... .builder \
... .appName("Test") \
... .getOrCreate()
>>> lines = spark.readStream\
... .format("org.apache.bahir.sql.streaming.mqtt.MQTTStreamSourceProvider")\
... .option("topic", "/sensor")\
... .option("brokerUrl", "tcp://localhost:1883")\
... .load()
the error shown:
2019-03-22 01:24:43 WARN MQTTUtils:51 - If `clientId` is not set, a random value is picked up.
Recovering from failure is not supported in such a case.
Traceback (most recent call last):
File "<stdin>", line 4, in <module>
File "/opt/spark/python/pyspark/sql/streaming.py", line 400, in load
return self._df(self._jreader.load())
File "/opt/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
File "/opt/spark/python/pyspark/sql/utils.py", line 63, in deco
return f(*a, **kw)
File "/opt/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o43.load.
: MqttException (0)
at org.eclipse.paho.client.mqttv3.persist.MqttDefaultFilePersistence.checkIsOpen(MqttDefaultFilePersistence.java:130)
at org.eclipse.paho.client.mqttv3.persist.MqttDefaultFilePersistence.getFiles(MqttDefaultFilePersistence.java:247)
at org.eclipse.paho.client.mqttv3.persist.MqttDefaultFilePersistence.close(MqttDefaultFilePersistence.java:142)
at org.apache.bahir.sql.streaming.mqtt.MQTTStreamSource.stop(MQTTStreamSource.scala:228)
at org.apache.spark.sql.streaming.DataStreamReader.load(DataStreamReader.scala:190)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
I have been trying to stream MQTT data for a week, but I can't find a way to solve this and it is getting desperate. Is there any way I can solve it?
Thank you.
Try setting the persistence option.
Example:
val lines = spark.readStream.format("datasource.mqtt.MQTTStreamSourceProvider")
.option("topic", topic)
.option("persistence","memory")
.option("brokerUrl",broker)
.option("cleanSession", "true")
.load()
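In PySpark, with the Bahir source from the question, the same thing would look roughly like this (topic and broker URL taken from the question):
# In-memory MQTT persistence avoids the file-based MqttDefaultFilePersistence
# that is failing with MqttException (0) in the stack trace above.
lines = spark.readStream \
    .format("org.apache.bahir.sql.streaming.mqtt.MQTTStreamSourceProvider") \
    .option("topic", "/sensor") \
    .option("persistence", "memory") \
    .option("brokerUrl", "tcp://localhost:1883") \
    .option("cleanSession", "true") \
    .load()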

Could not read data from kafka using pyspark

I have streaming data in my Kafka topic. I need to read this data from the topic using pyspark in the form of a pyspark dataframe, but I'm continuously receiving an error when I call the readStream function. The error is "py4j.protocol.Py4JJavaError: An error occurred while calling o35.load". My code is as follows:
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.0.2 pyspark-shell'
if __name__ == '__main__':
    sc = SparkSession.builder.appName('PythonStreamingDirectKafkaWordCount').getOrCreate()
    ssc = StreamingContext(sc, 60)
    df = sc \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "near_line") \
        .load() \
        .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)", "CAST(value AS STRING)")
    ssc.start()
    ssc.awaitTermination()
I'm getting an error as follows:
Traceback (most recent call last):
File "/home/nayanam/PycharmProjects/recommendation_engine/derivation/kafka_cons**umer_test.py", line 21, in <module>
.option("subscribe", "near_line") \**
File "/home/nayanam/anaconda3/lib/python3.5/site-packages/pyspark/sql/streaming.py", line 397, in load
return self._df(self._jreader.load())
File "/home/nayanam/anaconda3/lib/python3.5/site-packages/py4j/java_gateway.py", line 1133, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/home/nayanam/anaconda3/lib/python3.5/site-packages/pyspark/sql/utils.py", line 63, in deco
return f(*a, **kw)
File "/home/nayanam/anaconda3/lib/python3.5/site-packages/py4j/protocol.py", line 319, in get_return_value
format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling o35.load.
: java.lang.ClassNotFoundException: Failed to find data source: kafka. Please find packages at http://spark.apache.org/third-party-projects.html
at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:549)
at org.apache.spark.sql.execution.datasources.DataSource.providingClass$lzycompute(DataSource.scala:86)
at org.apache.spark.sql.execution.datasources.DataSource.providingClass(DataSource.scala:86)
at org.apache.spark.sql.execution.datasources.DataSource.sourceSchema(DataSource.scala:195)
at org.apache.spark.sql.execution.datasources.DataSource.sourceInfo$lzycompute(DataSource.scala:87)
at org.apache.spark.sql.execution.datasources.DataSource.sourceInfo(DataSource.scala:87)
at org.apache.spark.sql.execution.streaming.StreamingRelation$.apply(StreamingRelation.scala:30)
at org.apache.spark.sql.streaming.DataStreamReader.load(DataStreamReader.scala:150)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: kafka.DefaultSource
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$21$$anonfun$apply$12.apply(DataSource.scala:533)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$21$$anonfun$apply$12.apply(DataSource.scala:533)
at scala.util.Try$.apply(Try.scala:192)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$21.apply(DataSource.scala:533)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$21.apply(DataSource.scala:533)
at scala.util.Try.orElse(Try.scala:84)
at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:533)
... 18 more
I had the same issue. In Spark 2.3, pyspark accepts the --jars option and it works. So, in this version, all you need are these two jars:
spark-sql-kafka-0-10_2.11-2.3.2.jar
spark-streaming-kafka-0-10-assembly_2.11-2.3.2.jar
$ pyspark --jars spark-sql-kafka-0-10_2.11-2.3.2.jar,spark-streaming-kafka-0-10-assembly_2.11-2.3.2.jar
I'm using Spark 2.3.0, Scala 2.11.8 and Kafka 0.10, all of which are downloadable from apache.org.
Pass this package instead if you don't want to use the jars:
--packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.2,org.apache.spark:spark-streaming-kafka-0-10-assembly_2.11:2.3.2
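For reference, with one of those options in place, a minimal Structured Streaming read would look something like the sketch below (bootstrap server and topic are the ones from the question; console output is only for illustration):
from pyspark.sql import SparkSession

# Structured Streaming works off the SparkSession; no StreamingContext is involved.
spark = SparkSession.builder.appName("KafkaStructuredStreaming").getOrCreate()

df = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "near_line") \
    .load() \
    .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

query = df.writeStream.format("console").start()
query.awaitTermination()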

Reading from hdfs and writing to oracle 12

Hi. I am trying to read from HDFS and write to Oracle using pyspark, but I get an error. I attach the code that I am using and the error that I get:
pyspark --driver-class-path "/opt/oracle/app/oracle/product/12.1.0.2/dbhome_1/jdbc/lib/ojdbc7.jar"
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row
conf = SparkConf().setAppName("myFirstApp").setMaster("local")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
lines = sc.textFile("hdfs://bigdatalite.localdomain:8020/user/oracle/ACTIVITY/part-m-00000")
parts = lines.map(lambda l: l.split(","))
people = parts.map(lambda p: Row(name=p[0], age=p[1]))
schemaPeople = sqlContext.createDataFrame(people)
url = "jdbc:oracle:thin#localhost:1521/orcl"
properties = {
"user": "MOVIEDEMO",
"password": "welcome1",
"driver": "oracle.jdbc.driver.OracleDriver"
}
schemaPeople.write.jdbc(url=url, table="ACTIVITY", mode="append", properties=properties)
...and the error that shows up is:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/lib/spark/python/pyspark/sql/readwriter.py", line 530, in jdbc
self._jwrite.mode(mode).jdbc(url, table, jprop)
File "/usr/lib/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py", line 813, in __call__
File "/usr/lib/spark/python/pyspark/sql/utils.py", line 45, in deco
return f(*a, **kw)
File "/usr/lib/spark/python/lib/py4j-0.9-src.zip/py4j/protocol.py", line 308, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o66.jdbc.
: java.sql.SQLException: Invalid Oracle URL specified
at oracle.jdbc.driver.OracleDriver.connect(OracleDriver.java:453)
at org.apache.spark.sql.execution.datasources.jdbc.DriverWrapper.connect(DriverWrapper.scala:45)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$createConnectionFactory$2.apply(JdbcUtils.scala:61)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$createConnectionFactory$2.apply(JdbcUtils.scala:52)
at org.apache.spark.sql.DataFrameWriter.jdbc(DataFrameWriter.scala:278)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:209)
at java.lang.Thread.run(Thread.java:748)
PS: I am using Spark 1.6.0.
The URL should be specified in the "service" format, i.e.:
jdbc:oracle:thin:@//myhost:1521/orcl
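Applied to the code above, only the url line needs to change (host, port, and service name as in the question); a quick sketch:
# Service-name ("EZConnect") format: jdbc:oracle:thin:@//host:port/service_name
url = "jdbc:oracle:thin:@//localhost:1521/orcl"
schemaPeople.write.jdbc(url=url, table="ACTIVITY", mode="append", properties=properties)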
