from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
sc = SparkContext.getOrCreate()
ssc = StreamingContext(sc, 1)
directKafkaStream = KafkaUtils.createDirectStream(ssc, ["topic"], {"metadata.broker.list": "prd-kafka:9092,prd-kafka1:9092"})
I am trying to connect Spark Streaming to Kafka, to read a topic and write it to HDFS.
But there is a problem; the traceback follows below:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/opt/cloudera/parcels/CDH-5.9.3-1.cdh5.9.3.p0.4/lib/spark/python/pyspark/streaming/kafka.py", line 152, in createDirectStream
raise e
py4j.protocol.Py4JJavaError: An error occurred while calling o73.createDirectStreamWithoutMessageHandler.
: org.apache.spark.SparkException: java.io.EOFException
java.nio.channels.ClosedChannelException
java.io.EOFException
at org.apache.spark.streaming.kafka.KafkaCluster$$anonfun$checkErrors$1.apply(KafkaCluster.scala:366)
at org.apache.spark.streaming.kafka.KafkaCluster$$anonfun$checkErrors$1.apply(KafkaCluster.scala:366)
at scala.util.Either.fold(Either.scala:97)
at org.apache.spark.streaming.kafka.KafkaCluster$.checkErrors(KafkaCluster.scala:365)
at org.apache.spark.streaming.kafka.KafkaUtils$.getFromOffsets(KafkaUtils.scala:222)
at org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper.createDirectStream(KafkaUtils.scala:720)
at org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper.createDirectStreamWithoutMessageHandler(KafkaUtils.scala:688)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:209)
at java.lang.Thread.run(Thread.java:748)
I think my connection must look similar to this:
format("kafka") \
.option("kafka.sasl.mechanism", "SCRAM-SHA-256") \
.option("kafka.security.protocol", "SASL_PLAINTEXT") \
.option("kafka.sasl.jaas.config", EH_SASL) \
.option("kafka.batch.size", 5000) \
.option("kafka.bootstrap.servers", "metadata.broker.list":"prd-kafka:9092,prd-kafka1:9092,prdkafka:9092,") \
.option("subscribe", "topic")
Does anybody know how to connect Spark Streaming with Kafka using the "SCRAM-SHA-256" mechanism?
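For reference, a minimal sketch of what the EH_SASL JAAS string above might contain for SCRAM-SHA-256 (the username and password are hypothetical placeholders; SCRAM uses ScramLoginModule from kafka-clients, not PlainLoginModule):
# Hypothetical credentials; replace with real ones. SCRAM authentication needs
# ScramLoginModule (PlainLoginModule is only for the SASL/PLAIN mechanism).
EH_SASL = (
    'org.apache.kafka.common.security.scram.ScramLoginModule required '
    'username="my-user" password="my-password";'
)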
Related
I have a Spark job that saves data to HDFS and then saves the same data to a Hive table. When I run it in Jupyter, it runs successfully. But when I run it through Oozie, it raises the following exception when it reaches the step of writing data to Hive.
Here is my code, followed by the exception, followed by workflow.xml:
# coding: utf-8
# In[10]:
import os
JARS_HOME = "hdfs:///dataengineering/jars"
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars '+JARS_HOME+'/ojdbc6.jar,'+JARS_HOME+'/anonymize_udfs.jar pyspark-shell'
os.environ["HADOOP_CONF_DIR"] = '/etc/hadoop/conf'
# In[11]:
try:
from pyspark import SparkContext, SQLContext
from pyspark.sql import SparkSession
except:
import findspark
findspark.init('/opt/cloudera/parcels/CDH-6.1.1-1.cdh6.1.1.p0.875250/lib/spark')
from pyspark import SparkContext, SQLContext
from pyspark.sql import SparkSession
import sys
import pyspark.sql.functions as functions
from datetime import date
from dateutil.relativedelta import relativedelta
from datetime import datetime
from datetime import timedelta
from pyspark.sql.types import StringType
from pyspark.sql.functions import *
from pyspark.sql import functions as sf
from pyspark.sql.types import StringType
spark = SparkSession.builder \
    .master("yarn") \
    .appName("oozie_sample_spark") \
    .config('spark.executor.cores', '3') \
    .config('spark.executor.memory', '15g') \
    .config('spark.driver.memory', '5g') \
    .config('spark.driver.maxResultSize', '12g') \
    .config("spark.dynamicAllocation.enabled", "true") \
    .config("spark.shuffle.service.enabled", "true") \
    .config("spark.executor.instances", "4") \
    .config("spark.yarn.queue", "root.STREAMING") \
    .config("spark.dynamicAllocation.cachedExecutorIdleTimeout", "300s") \
    .config("hive.metastore.uris", "thrift://dchqmaster01.internal.eg.vodafone.com:9083") \
    .getOrCreate()
# In[13]:
spark.sql("select current_timestamp() column_a").write.csv("/user/akhamis11/oozie-samples/spark-sample/current_column.csv", mode='append')
spark.sql("select current_timestamp() column_a").write.saveAsTable("bde.oozie_test", mode='append')
spark.stop()
2020-04-13 07:27:21,077 [dispatcher-event-loop-1] INFO org.apache.spark.deploy.yarn.YarnAllocator - Driver requested a total number of 0 executor(s).
2020-04-13 07:27:21,081 [Thread-10] INFO org.apache.spark.sql.execution.datasources.FileFormatWriter - Write Job 316245be-4c54-42d9-bd43-6246d77672b0 committed.
2020-04-13 07:27:21,108 [Thread-10] INFO org.apache.spark.sql.execution.datasources.FileFormatWriter - Finished processing stats for write job 316245be-4c54-42d9-bd43-6246d77672b0.
2020-04-13 07:27:21,191 [Thread-10] INFO com.cloudera.spark.lineage.NavigatorQueryListener - Failed to generate lineage for successful query execution.
java.lang.IllegalArgumentException: Error while instantiating 'org.apache.spark.sql.hive.HiveExternalCatalog':
at org.apache.spark.sql.internal.SharedState$.org$apache$spark$sql$internal$SharedState$$reflect(SharedState.scala:192)
at org.apache.spark.sql.internal.SharedState.externalCatalog$lzycompute(SharedState.scala:103)
at org.apache.spark.sql.internal.SharedState.externalCatalog(SharedState.scala:102)
at org.apache.spark.sql.hive.HiveSessionStateBuilder.org$apache$spark$sql$hive$HiveSessionStateBuilder$$externalCatalog(HiveSessionStateBuilder.scala:39)
at org.apache.spark.sql.hive.HiveSessionStateBuilder$$anonfun$1.apply(HiveSessionStateBuilder.scala:54)
at org.apache.spark.sql.hive.HiveSessionStateBuilder$$anonfun$1.apply(HiveSessionStateBuilder.scala:54)
at org.apache.spark.sql.catalyst.catalog.SessionCatalog.externalCatalog$lzycompute(SessionCatalog.scala:90)
at org.apache.spark.sql.catalyst.catalog.SessionCatalog.externalCatalog(SessionCatalog.scala:90)
at org.apache.spark.sql.query.analysis.QueryAnalysis$.hiveCatalog(QueryAnalysis.scala:63)
at org.apache.spark.sql.query.analysis.QueryAnalysis$.getLineageInfo(QueryAnalysis.scala:88)
at com.cloudera.spark.lineage.NavigatorQueryListener.onSuccess(ClouderaNavigatorListener.scala:60)
at org.apache.spark.sql.util.ExecutionListenerManager$$anonfun$onSuccess$1$$anonfun$apply$mcV$sp$1.apply(QueryExecutionListener.scala:124)
at org.apache.spark.sql.util.ExecutionListenerManager$$anonfun$onSuccess$1$$anonfun$apply$mcV$sp$1.apply(QueryExecutionListener.scala:123)
at org.apache.spark.sql.util.ExecutionListenerManager$$anonfun$org$apache$spark$sql$util$ExecutionListenerManager$$withErrorHandling$1.apply(QueryExecutionListener.scala:145)
at org.apache.spark.sql.util.ExecutionListenerManager$$anonfun$org$apache$spark$sql$util$ExecutionListenerManager$$withErrorHandling$1.apply(QueryExecutionListener.scala:143)
at scala.collection.immutable.List.foreach(List.scala:392)
at scala.collection.generic.TraversableForwarder$class.foreach(TraversableForwarder.scala:35)
at scala.collection.mutable.ListBuffer.foreach(ListBuffer.scala:45)
at org.apache.spark.sql.util.ExecutionListenerManager.org$apache$spark$sql$util$ExecutionListenerManager$$withErrorHandling(QueryExecutionListener.scala:143)
at org.apache.spark.sql.util.ExecutionListenerManager$$anonfun$onSuccess$1.apply$mcV$sp(QueryExecutionListener.scala:123)
at org.apache.spark.sql.util.ExecutionListenerManager$$anonfun$onSuccess$1.apply(QueryExecutionListener.scala:123)
at org.apache.spark.sql.util.ExecutionListenerManager$$anonfun$onSuccess$1.apply(QueryExecutionListener.scala:123)
at org.apache.spark.sql.util.ExecutionListenerManager.readLock(QueryExecutionListener.scala:156)
at org.apache.spark.sql.util.ExecutionListenerManager.onSuccess(QueryExecutionListener.scala:122)
at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:670)
at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:276)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:270)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:228)
at org.apache.spark.sql.DataFrameWriter.csv(DataFrameWriter.scala:656)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.reflect.InvocationTargetException
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at org.apache.spark.sql.internal.SharedState$.org$apache$spark$sql$internal$SharedState$$reflect(SharedState.scala:189)
... 39 more
Caused by: java.lang.NoClassDefFoundError: org/apache/hadoop/hive/ql/metadata/HiveException
at org.apache.spark.sql.hive.HiveExternalCatalog.<init>(HiveExternalCatalog.scala:73)
... 44 more
Caused by: java.lang.ClassNotFoundException: org.apache.hadoop.hive.ql.metadata.HiveException
at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 45 more
Traceback (most recent call last):
File "/disk10/yarn/nm/usercache/akhamis11/appcache/application_1586733850175_0103/container_1586733850175_0103_02_000001/pyspark.zip/pyspark/sql/utils.py", line 63, in deco
File "/disk10/yarn/nm/usercache/akhamis11/appcache/application_1586733850175_0103/container_1586733850175_0103_02_000001/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o148.saveAsTable.
: java.lang.IllegalArgumentException: Error while instantiating 'org.apache.spark.sql.hive.HiveExternalCatalog':
at org.apache.spark.sql.internal.SharedState$.org$apache$spark$sql$internal$SharedState$$reflect(SharedState.scala:192)
at org.apache.spark.sql.internal.SharedState.externalCatalog$lzycompute(SharedState.scala:103)
at org.apache.spark.sql.internal.SharedState.externalCatalog(SharedState.scala:102)
at org.apache.spark.sql.hive.HiveSessionStateBuilder.org$apache$spark$sql$hive$HiveSessionStateBuilder$$externalCatalog(HiveSessionStateBuilder.scala:39)
at org.apache.spark.sql.hive.HiveSessionStateBuilder$$anonfun$1.apply(HiveSessionStateBuilder.scala:54)
at org.apache.spark.sql.hive.HiveSessionStateBuilder$$anonfun$1.apply(HiveSessionStateBuilder.scala:54)
at org.apache.spark.sql.catalyst.catalog.SessionCatalog.externalCatalog$lzycompute(SessionCatalog.scala:90)
at org.apache.spark.sql.catalyst.catalog.SessionCatalog.externalCatalog(SessionCatalog.scala:90)
at org.apache.spark.sql.catalyst.catalog.SessionCatalog.tableExists(SessionCatalog.scala:415)
at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:405)
at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:400)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.reflect.InvocationTargetException
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at org.apache.spark.sql.internal.SharedState$.org$apache$spark$sql$internal$SharedState$$reflect(SharedState.scala:189)
... 21 more
Caused by: java.lang.NoClassDefFoundError: org/apache/hadoop/hive/ql/metadata/HiveException
at org.apache.spark.sql.hive.HiveExternalCatalog.<init>(HiveExternalCatalog.scala:73)
... 26 more
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "oozie_sample_spark.py", line 53, in <module>
spark.sql("select current_timestamp() column_a").write.saveAsTable("bde.oozie_test", mode='append')
File "/disk10/yarn/nm/usercache/akhamis11/appcache/application_1586733850175_0103/container_1586733850175_0103_02_000001/pyspark.zip/pyspark/sql/readwriter.py", line 775, in saveAsTable
File "/disk10/yarn/nm/usercache/akhamis11/appcache/application_1586733850175_0103/container_1586733850175_0103_02_000001/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
File "/disk10/yarn/nm/usercache/akhamis11/appcache/application_1586733850175_0103/container_1586733850175_0103_02_000001/pyspark.zip/pyspark/sql/utils.py", line 79, in deco
pyspark.sql.utils.IllegalArgumentException: "Error while instantiating 'org.apache.spark.sql.hive.HiveExternalCatalog':"
<action name='spark-node'>
<spark xmlns="uri:oozie:spark-action:1.0">
<resource-manager>${resourceManager}</resource-manager>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>oozie.service.HCatAccessorService.hcat.configuration</name>
<value>/opt/cloudera/parcels/CDH/etc/hive/conf.dist/hive-site.xml</value>
</property>
</configuration>
<master>${master}</master>
<name>oozies_sample</name>
<jar>${nameNode}/user/${wf:user()}/oozie-samples/spark-sample/lib/oozie_sample_spark.py</jar>
</spark>
<ok to="end" />
<error to="fail" />
</action>
<kill name="fail">
<message>Workflow failed, error
message[${wf:errorMessage(wf:lastErrorNode())}]
</message>
</kill>
<end name='end' />
[~]$ hdfs dfs -ls /user/oozie/share/lib/lib_<ts>/spark2
/user/oozie/share/lib/lib_<ts>/spark2/HikariCP-java7-2.4.12.jar
/user/oozie/share/lib/lib_<ts>/spark2/RoaringBitmap-0.5.11.jar
/user/oozie/share/lib/lib_<ts>/spark2/accessors-smart-1.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/activation-1.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/annotations-2.0.3.jar
/user/oozie/share/lib/lib_<ts>/spark2/antlr4-runtime-4.7.jar
/user/oozie/share/lib/lib_<ts>/spark2/aopalliance-1.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/aopalliance-repackaged-2.4.0-b34.jar
/user/oozie/share/lib/lib_<ts>/spark2/arpack_combined_all-0.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/arrow-format-0.10.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/arrow-memory-0.10.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/arrow-vector-0.10.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/audience-annotations-0.5.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/avro-ipc.jar
/user/oozie/share/lib/lib_<ts>/spark2/avro-mapred-hadoop2.jar
/user/oozie/share/lib/lib_<ts>/spark2/avro.jar
/user/oozie/share/lib/lib_<ts>/spark2/aws-java-sdk-bundle-1.11.271.jar
/user/oozie/share/lib/lib_<ts>/spark2/azure-keyvault-core-0.8.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/azure-storage-5.4.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/breeze-macros_2.11-0.13.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/breeze_2.11-0.13.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/chill-java-0.9.3.jar
/user/oozie/share/lib/lib_<ts>/spark2/chill_2.11-0.9.3.jar
/user/oozie/share/lib/lib_<ts>/spark2/commons-beanutils-1.9.3.jar
/user/oozie/share/lib/lib_<ts>/spark2/commons-cli-1.4.jar
/user/oozie/share/lib/lib_<ts>/spark2/commons-codec-1.10.jar
/user/oozie/share/lib/lib_<ts>/spark2/commons-collections-3.2.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/commons-compiler-3.0.9.jar
/user/oozie/share/lib/lib_<ts>/spark2/commons-compress-1.4.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/commons-configuration2-2.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/commons-crypto-1.0.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/commons-io-2.6.jar
/user/oozie/share/lib/lib_<ts>/spark2/commons-lang-2.6.jar
/user/oozie/share/lib/lib_<ts>/spark2/commons-lang3-3.7.jar
/user/oozie/share/lib/lib_<ts>/spark2/commons-logging-1.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/commons-math3-3.4.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/commons-net-3.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/commons-pool-1.6.jar
/user/oozie/share/lib/lib_<ts>/spark2/compress-lzf-1.0.3.jar
/user/oozie/share/lib/lib_<ts>/spark2/core-1.1.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/curator-client-2.7.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/curator-framework-2.7.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/curator-recipes-2.7.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/datanucleus-core-4.1.6.jar
/user/oozie/share/lib/lib_<ts>/spark2/derby-10.14.1.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/ehcache-3.3.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/flatbuffers-1.2.0-3f79e055.jar
/user/oozie/share/lib/lib_<ts>/spark2/flume-ng-config-filter-api-1.8.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/flume-ng-configuration-1.8.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/flume-ng-core-1.8.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/flume-ng-sdk-1.8.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/geronimo-jcache_1.0_spec-1.0-alpha-1.jar
/user/oozie/share/lib/lib_<ts>/spark2/gson-2.2.4.jar
/user/oozie/share/lib/lib_<ts>/spark2/guava-11.0.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/guice-4.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/guice-servlet-4.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/hadoop-annotations.jar
/user/oozie/share/lib/lib_<ts>/spark2/hadoop-auth.jar
/user/oozie/share/lib/lib_<ts>/spark2/hadoop-aws.jar
/user/oozie/share/lib/lib_<ts>/spark2/hadoop-azure-3.0.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/hadoop-client-3.0.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/hadoop-common.jar
/user/oozie/share/lib/lib_<ts>/spark2/hadoop-hdfs-client.jar
/user/oozie/share/lib/lib_<ts>/spark2/hadoop-mapreduce-client-common.jar
/user/oozie/share/lib/lib_<ts>/spark2/hadoop-mapreduce-client-core.jar
/user/oozie/share/lib/lib_<ts>/spark2/hadoop-mapreduce-client-jobclient.jar
/user/oozie/share/lib/lib_<ts>/spark2/hadoop-openstack-3.0.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/hadoop-yarn-api.jar
/user/oozie/share/lib/lib_<ts>/spark2/hadoop-yarn-client.jar
/user/oozie/share/lib/lib_<ts>/spark2/hadoop-yarn-common.jar
/user/oozie/share/lib/lib_<ts>/spark2/hadoop-yarn-server-common.jar
/user/oozie/share/lib/lib_<ts>/spark2/hadoop-yarn-server-web-proxy.jar
/user/oozie/share/lib/lib_<ts>/spark2/hive-common.jar
/user/oozie/share/lib/lib_<ts>/spark2/hive-exec.jar
/user/oozie/share/lib/lib_<ts>/spark2/hive-hcatalog-core.jar
/user/oozie/share/lib/lib_<ts>/spark2/hive-hcatalog-pig-adapter.jar
/user/oozie/share/lib/lib_<ts>/spark2/hive-metastore.jar
/user/oozie/share/lib/lib_<ts>/spark2/hive-serde.jar
/user/oozie/share/lib/lib_<ts>/spark2/hive-site.xml
/user/oozie/share/lib/lib_<ts>/spark2/hive-webhcat-java-client.jar
/user/oozie/share/lib/lib_<ts>/spark2/hk2-api-2.4.0-b34.jar
/user/oozie/share/lib/lib_<ts>/spark2/hk2-locator-2.4.0-b34.jar
/user/oozie/share/lib/lib_<ts>/spark2/hk2-utils-2.4.0-b34.jar
/user/oozie/share/lib/lib_<ts>/spark2/hppc-0.7.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/htrace-core4-4.1.0-incubating.jar
/user/oozie/share/lib/lib_<ts>/spark2/httpclient-4.5.3.jar
/user/oozie/share/lib/lib_<ts>/spark2/httpcore-4.4.6.jar
/user/oozie/share/lib/lib_<ts>/spark2/ivy-2.4.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/jackson-annotations-2.9.5.jar
/user/oozie/share/lib/lib_<ts>/spark2/jackson-core-2.9.5.jar
/user/oozie/share/lib/lib_<ts>/spark2/jackson-core-asl-1.9.13.jar
/user/oozie/share/lib/lib_<ts>/spark2/jackson-databind-2.9.5.jar
/user/oozie/share/lib/lib_<ts>/spark2/jackson-dataformat-cbor-2.9.5.jar
/user/oozie/share/lib/lib_<ts>/spark2/jackson-jaxrs-base-2.9.5.jar
/user/oozie/share/lib/lib_<ts>/spark2/jackson-jaxrs-json-provider-2.9.5.jar
/user/oozie/share/lib/lib_<ts>/spark2/jackson-mapper-asl-1.9.13-cloudera.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/jackson-module-jaxb-annotations-2.9.5.jar
/user/oozie/share/lib/lib_<ts>/spark2/jackson-module-paranamer-2.9.5.jar
/user/oozie/share/lib/lib_<ts>/spark2/jackson-module-scala_2.11-2.9.5.jar
/user/oozie/share/lib/lib_<ts>/spark2/janino-3.0.9.jar
/user/oozie/share/lib/lib_<ts>/spark2/javassist-3.18.1-GA.jar
/user/oozie/share/lib/lib_<ts>/spark2/javax.annotation-api-1.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/javax.inject-1.jar
/user/oozie/share/lib/lib_<ts>/spark2/javax.inject-2.4.0-b34.jar
/user/oozie/share/lib/lib_<ts>/spark2/javax.servlet-api-3.1.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/javax.ws.rs-api-2.0.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/jaxb-api-2.2.11.jar
/user/oozie/share/lib/lib_<ts>/spark2/jcip-annotations-1.0-1.jar
/user/oozie/share/lib/lib_<ts>/spark2/jcl-over-slf4j-1.7.25.jar
/user/oozie/share/lib/lib_<ts>/spark2/jersey-client-2.22.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/jersey-common-2.22.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/jersey-container-servlet-2.22.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/jersey-container-servlet-core-2.22.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/jersey-guava-2.22.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/jersey-media-jaxb-2.22.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/jersey-server-2.22.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/jetty-jmx-9.3.20.v20170531.jar
/user/oozie/share/lib/lib_<ts>/spark2/jetty-util-9.3.20.v20170531.jar
/user/oozie/share/lib/lib_<ts>/spark2/jetty-util-ajax-9.3.20.v20170531.jar
/user/oozie/share/lib/lib_<ts>/spark2/jetty-webapp-9.3.20.v20170531.jar
/user/oozie/share/lib/lib_<ts>/spark2/jetty-xml-9.3.20.v20170531.jar
/user/oozie/share/lib/lib_<ts>/spark2/joda-time-2.9.9.jar
/user/oozie/share/lib/lib_<ts>/spark2/jodd-core-3.5.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/jsch-0.1.54.jar
/user/oozie/share/lib/lib_<ts>/spark2/json-smart-2.3.jar
/user/oozie/share/lib/lib_<ts>/spark2/json4s-ast_2.11-3.5.3.jar
/user/oozie/share/lib/lib_<ts>/spark2/json4s-core_2.11-3.5.3.jar
/user/oozie/share/lib/lib_<ts>/spark2/json4s-jackson_2.11-3.5.3.jar
/user/oozie/share/lib/lib_<ts>/spark2/json4s-scalap_2.11-3.5.3.jar
/user/oozie/share/lib/lib_<ts>/spark2/jsp-api-2.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/jsr305-1.3.9.jar
/user/oozie/share/lib/lib_<ts>/spark2/jtransforms-2.4.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/jul-to-slf4j-1.7.25.jar
/user/oozie/share/lib/lib_<ts>/spark2/kafka-clients-2.0.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/kerb-admin-1.0.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/kerb-client-1.0.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/kerb-common-1.0.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/kerb-core-1.0.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/kerb-crypto-1.0.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/kerb-identity-1.0.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/kerb-server-1.0.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/kerb-simplekdc-1.0.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/kerb-util-1.0.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/kerby-asn1-1.0.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/kerby-config-1.0.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/kerby-pkix-1.0.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/kerby-util-1.0.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/kerby-xdr-1.0.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/kryo-shaded-4.0.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/leveldbjni-all-1.8.jar
/user/oozie/share/lib/lib_<ts>/spark2/libfb303-0.9.3.jar
/user/oozie/share/lib/lib_<ts>/spark2/libthrift-0.9.3.jar
/user/oozie/share/lib/lib_<ts>/spark2/log4j-1.2.17.jar
/user/oozie/share/lib/lib_<ts>/spark2/lz4-java-1.4.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/machinist_2.11-0.6.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/macro-compat_2.11-1.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/metrics-core-3.1.5.jar
/user/oozie/share/lib/lib_<ts>/spark2/metrics-graphite-3.1.5.jar
/user/oozie/share/lib/lib_<ts>/spark2/metrics-json-3.1.5.jar
/user/oozie/share/lib/lib_<ts>/spark2/metrics-jvm-3.1.5.jar
/user/oozie/share/lib/lib_<ts>/spark2/mina-core-2.0.4.jar
/user/oozie/share/lib/lib_<ts>/spark2/minlog-1.3.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/mssql-jdbc-6.2.1.jre7.jar
/user/oozie/share/lib/lib_<ts>/spark2/netty-3.10.6.Final.jar
/user/oozie/share/lib/lib_<ts>/spark2/netty-all-4.1.17.Final.jar
/user/oozie/share/lib/lib_<ts>/spark2/nimbus-jose-jwt-4.41.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/objenesis-2.5.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/okhttp-2.7.5.jar
/user/oozie/share/lib/lib_<ts>/spark2/okio-1.6.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/oozie-sharelib-spark-5.0.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/oozie-sharelib-spark.jar
/user/oozie/share/lib/lib_<ts>/spark2/opencsv-2.3.jar
/user/oozie/share/lib/lib_<ts>/spark2/oro-2.0.8.jar
/user/oozie/share/lib/lib_<ts>/spark2/osgi-resource-locator-1.0.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/paranamer-2.8.jar
/user/oozie/share/lib/lib_<ts>/spark2/parquet-column.jar
/user/oozie/share/lib/lib_<ts>/spark2/parquet-common.jar
/user/oozie/share/lib/lib_<ts>/spark2/parquet-encoding.jar
/user/oozie/share/lib/lib_<ts>/spark2/parquet-format.jar
/user/oozie/share/lib/lib_<ts>/spark2/parquet-hadoop.jar
/user/oozie/share/lib/lib_<ts>/spark2/parquet-jackson.jar
/user/oozie/share/lib/lib_<ts>/spark2/protobuf-java-2.5.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/py4j-0.10.7-src.zip
/user/oozie/share/lib/lib_<ts>/spark2/py4j-0.10.7.jar
/user/oozie/share/lib/lib_<ts>/spark2/pyrolite-4.13.jar
/user/oozie/share/lib/lib_<ts>/spark2/pyspark.zip
/user/oozie/share/lib/lib_<ts>/spark2/re2j-1.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/scala-compiler-2.11.12.jar
/user/oozie/share/lib/lib_<ts>/spark2/scala-library-2.11.12.jar
/user/oozie/share/lib/lib_<ts>/spark2/scala-parser-combinators_2.11-1.1.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/scala-reflect-2.11.12.jar
/user/oozie/share/lib/lib_<ts>/spark2/scala-xml_2.11-1.0.5.jar
/user/oozie/share/lib/lib_<ts>/spark2/shapeless_2.11-2.3.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/slf4j-api-1.7.25.jar
/user/oozie/share/lib/lib_<ts>/spark2/slf4j-log4j12-1.7.25.jar
/user/oozie/share/lib/lib_<ts>/spark2/snappy-java-1.1.4.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-avro_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-catalyst_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-core_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-graphx_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-hadoop-cloud_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-hive_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-kvstore_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-launcher_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-lineage_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-mllib-local_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-mllib_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-network-common_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-network-shuffle_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-repl_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-sketch_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-sql-kafka-0-10_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-sql_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-streaming-flume-sink_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-streaming-flume_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-streaming-kafka-0-10_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-streaming_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-tags_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-unsafe_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-yarn_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spire-macros_2.11-0.13.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/spire_2.11-0.13.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/stax2-api-3.1.4.jar
/user/oozie/share/lib/lib_<ts>/spark2/stream-2.7.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/univocity-parsers-2.7.3.jar
/user/oozie/share/lib/lib_<ts>/spark2/validation-api-1.1.0.Final.jar
/user/oozie/share/lib/lib_<ts>/spark2/wildfly-openssl-1.0.4.Final.jar
/user/oozie/share/lib/lib_<ts>/spark2/woodstox-core-5.0.3.jar
/user/oozie/share/lib/lib_<ts>/spark2/xbean-asm6-shaded-4.8.jar
/user/oozie/share/lib/lib_<ts>/spark2/xz-1.6.jar
/user/oozie/share/lib/lib_<ts>/spark2/zookeeper.jar
/user/oozie/share/lib/lib_<ts>/spark2/zstd-jni-1.3.2-2.jar
"Error while instantiating 'org.apache.spark.sql.hive.HiveExternalCatalog'" means the catalog jar it is trying to find is not in the Oozie sharelib spark directory.
Please add the following property in your job.properties file:
oozie.action.sharelib.for.spark=hive,spark,hcatalog (use spark2 instead of spark if you are using Cloudera)
This allows the Hive jars to be used in the Spark action; for the external Hive catalog, the Oozie sharelib spark directory does not contain every jar.
The jar containing org/apache/commons/dbcp/ConnectionFactory lives under the Hive lib folder.
So please check whether the jar exists in your local file system as well as in HDFS:
find <location> -name "*.jar" | xargs grep ConnectionFactory
Also, please add .enableHiveSupport() to your SparkSession to enable Hive support for Spark SQL.
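A minimal sketch of where .enableHiveSupport() fits in the builder chain (the app name here is illustrative, not the asker's exact configuration):
from pyspark.sql import SparkSession

# enableHiveSupport() makes Spark SQL use the Hive external catalog and metastore.
spark = SparkSession.builder \
    .appName("oozie_sample_spark") \
    .enableHiveSupport() \
    .getOrCreate()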
I'm using Spark 2.4 and I've run pyspark like this:
./bin/pyspark --packages org.apache.bahir:spark-sql-streaming-mqtt_2.11:2.3.2
pyspark runs successfully.
(But when I ran spark-sql-streaming-mqtt_2.11:2.4.0-SNAPSHOT, I got an error.)
I'm trying to get data from an MQTT broker using Structured Streaming,
so I've run this:
>>> from pyspark.sql import SparkSession
>>> from pyspark.sql.functions import explode
>>> from pyspark.sql.functions import split
>>> spark = SparkSession \
... .builder \
... .appName("Test") \
... .getOrCreate()
>>> lines = spark.readStream\
... .format("org.apache.bahir.sql.streaming.mqtt.MQTTStreamSourceProvider")\
... .option("topic", "/sensor")\
... .option("brokerUrl", "tcp://localhost:1883")\
... .load()
The error shown is:
2019-03-22 01:24:43 WARN MQTTUtils:51 - If `clientId` is not set, a random value is picked up.
Recovering from failure is not supported in such a case.
Traceback (most recent call last):
File "<stdin>", line 4, in <module>
File "/opt/spark/python/pyspark/sql/streaming.py", line 400, in load
return self._df(self._jreader.load())
File "/opt/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
File "/opt/spark/python/pyspark/sql/utils.py", line 63, in deco
return f(*a, **kw)
File "/opt/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o43.load.
: MqttException (0)
at org.eclipse.paho.client.mqttv3.persist.MqttDefaultFilePersistence.checkIsOpen(MqttDefaultFilePersistence.java:130)
at org.eclipse.paho.client.mqttv3.persist.MqttDefaultFilePersistence.getFiles(MqttDefaultFilePersistence.java:247)
at org.eclipse.paho.client.mqttv3.persist.MqttDefaultFilePersistence.close(MqttDefaultFilePersistence.java:142)
at org.apache.bahir.sql.streaming.mqtt.MQTTStreamSource.stop(MQTTStreamSource.scala:228)
at org.apache.spark.sql.streaming.DataStreamReader.load(DataStreamReader.scala:190)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
I have been trying to stream MQTT data for a week, but I cannot find a way to solve this, and it is really frustrating. Is there any way I can solve it?
Thank you.
Try setting the persistence option.
Example:
val lines = spark.readStream.format("org.apache.bahir.sql.streaming.mqtt.MQTTStreamSourceProvider")
.option("topic", topic)
.option("persistence","memory")
.option("brokerUrl",broker)
.option("cleanSession", "true")
.load()
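The same idea in PySpark, reusing the topic and broker from the question (a sketch, assuming the Bahir MQTT package is on the classpath):
# "persistence": "memory" keeps MQTT persistence in memory instead of the
# default file-based store, whose open/close is what raised MqttException above.
lines = spark.readStream \
    .format("org.apache.bahir.sql.streaming.mqtt.MQTTStreamSourceProvider") \
    .option("topic", "/sensor") \
    .option("persistence", "memory") \
    .option("brokerUrl", "tcp://localhost:1883") \
    .option("cleanSession", "true") \
    .load()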
I have streaming data in my Kafka topic. I need to read this data from the topic using PySpark, in the form of a PySpark DataFrame. But I keep receiving an error when I call the readStream function. The error is "py4j.protocol.Py4JJavaError: An error occurred while calling o35.load". My code is as follows:
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.0.2 pyspark-shell'
if __name__ == '__main__':
sc = SparkSession.builder.appName('PythonStreamingDirectKafkaWordCount').getOrCreate()
ssc = StreamingContext(sc, 60)
df = sc \
.readStream \
.format("kafka") \
.option("kafka.bootstrap.servers", "localhost:9092") \
.option("subscribe", "near_line") \
.load() \
.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)","CAST(value AS STRING)")
ssc.start()
ssc.awaitTermination()
I'm getting the following error:
Traceback (most recent call last):
File "/home/nayanam/PycharmProjects/recommendation_engine/derivation/kafka_cons**umer_test.py", line 21, in <module>
.option("subscribe", "near_line") \**
File "/home/nayanam/anaconda3/lib/python3.5/site-packages/pyspark/sql/streaming.py", line 397, in load
return self._df(self._jreader.load())
File "/home/nayanam/anaconda3/lib/python3.5/site-packages/py4j/java_gateway.py", line 1133, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/home/nayanam/anaconda3/lib/python3.5/site-packages/pyspark/sql/utils.py", line 63, in deco
return f(*a, **kw)
File "/home/nayanam/anaconda3/lib/python3.5/site-packages/py4j/protocol.py", line 319, in get_return_value
format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling o35.load.
: java.lang.ClassNotFoundException: Failed to find data source: kafka. Please find packages at http://spark.apache.org/third-party-projects.html
at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:549)
at org.apache.spark.sql.execution.datasources.DataSource.providingClass$lzycompute(DataSource.scala:86)
at org.apache.spark.sql.execution.datasources.DataSource.providingClass(DataSource.scala:86)
at org.apache.spark.sql.execution.datasources.DataSource.sourceSchema(DataSource.scala:195)
at org.apache.spark.sql.execution.datasources.DataSource.sourceInfo$lzycompute(DataSource.scala:87)
at org.apache.spark.sql.execution.datasources.DataSource.sourceInfo(DataSource.scala:87)
at org.apache.spark.sql.execution.streaming.StreamingRelation$.apply(StreamingRelation.scala:30)
at org.apache.spark.sql.streaming.DataStreamReader.load(DataStreamReader.scala:150)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: kafka.DefaultSource
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$21$$anonfun$apply$12.apply(DataSource.scala:533)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$21$$anonfun$apply$12.apply(DataSource.scala:533)
at scala.util.Try$.apply(Try.scala:192)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$21.apply(DataSource.scala:533)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$21.apply(DataSource.scala:533)
at scala.util.Try.orElse(Try.scala:84)
at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:533)
... 18 more
I got the same issue. In Spark 2.3, pyspark accepts the --jars option and it works. So, in this version, all you need are two jars:
spark-sql-kafka-0-10_2.11-2.3.2.jar
spark-streaming-kafka-0-10-assembly_2.11-2.3.2.jar
$ pyspark --jars spark-sql-kafka-0-10_2.11-2.3.2.jar,spark-streaming-kafka-0-10-assembly_2.11-2.3.2.jar
I'm using Spark 2.3.0, Scala 2.11.8, and Kafka 0.10, which are downloadable from apache.org.
Pass these packages instead if you don't want to use the jars:
--packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.2,org.apache.spark:spark-streaming-kafka-0-10-assembly_2.11:2.3.2
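Once pyspark is launched with those jars or packages, a minimal read of the question's topic might look like this (a sketch; the broker and topic values are taken from the question):
# With spark-sql-kafka on the classpath, format("kafka") resolves and load() works;
# no StreamingContext is needed for Structured Streaming.
df = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "near_line") \
    .load() \
    .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")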
I am trying to create an input stream in Spark from a Kafka topic using the commands below, but I am getting an error.
This is the first time I am trying Spark Streaming with Kafka.
Version details:
Spark Version:spark-2.2.0-bin-hadoop2.7
Kafka Version: kafka_2.11-0.11.0.0
Zookeepr Version:zookeeper-3.4.10
Spark Streaming jar file:spark-streaming-kafka-0-8-assembly_2.10-2.2.0.jar
(Zookeeper & kafka is up and running. I am able to create producer & consumer in kafka console.)
PySpark Notebook Commands:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-8-assembly_2.10:2.2.0 pyspark-shell'
from pyspark.streaming.kafka import *
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
sc = SparkContext("local[2]", "NetworkWordCount")
ssc = StreamingContext(sc, 5)
topics=["kafka-test"]
kafkaParams ={"bootstrap.servers" : "localhost:9092"}
kafkaStream = KafkaUtils.createDirectStream(ssc,topics, kafkaParams )
Error message:
Py4JJavaError: An error occurred while calling o26.createDirectStreamWithoutMessageHandler.
: java.lang.NoClassDefFoundError: scala/collection/GenTraversableOnce$class
at kafka.utils.Pool.<init>(Pool.scala:28)
at kafka.consumer.FetchRequestAndResponseStatsRegistry$.<init>(FetchRequestAndResponseStats.scala:60)
at kafka.consumer.FetchRequestAndResponseStatsRegistry$.<clinit>(FetchRequestAndResponseStats.scala)
at kafka.consumer.SimpleConsumer.<init>(SimpleConsumer.scala:39)
at org.apache.spark.streaming.kafka.KafkaCluster.connect(KafkaCluster.scala:59)
at org.apache.spark.streaming.kafka.KafkaCluster$$anonfun$org$apache$spark$streaming$kafka$KafkaCluster$$withBrokers$1.apply(KafkaCluster.scala:364)
at org.apache.spark.streaming.kafka.KafkaCluster$$anonfun$org$apache$spark$streaming$kafka$KafkaCluster$$withBrokers$1.apply(KafkaCluster.scala:361)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:35)
at org.apache.spark.streaming.kafka.KafkaCluster.org$apache$spark$streaming$kafka$KafkaCluster$$withBrokers(KafkaCluster.scala:361)
at org.apache.spark.streaming.kafka.KafkaCluster.getPartitionMetadata(KafkaCluster.scala:132)
at org.apache.spark.streaming.kafka.KafkaCluster.getPartitions(KafkaCluster.scala:119)
at org.apache.spark.streaming.kafka.KafkaUtils$.getFromOffsets(KafkaUtils.scala:211)
at org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper.createDirectStream(KafkaUtils.scala:720)
at org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper.createDirectStreamWithoutMessageHandler(KafkaUtils.scala:688)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.ClassNotFoundException: scala.collection.GenTraversableOnce$class
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:331)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 26 more
Does PySpark work with (is it compatible with) MapR Streams?
Is there any example code?
I've tried the code below but keep getting an exception:
strLoc = '/Path1:Stream1'
protocol = 'file://' if ( strLoc.startswith('/') or strLoc.startswith('\\') ) else ''
from pyspark.streaming.kafka import *;
from pyspark import StorageLevel;
APA = KafkaUtils.createDirectStream(ssc, [strLoc], kafkaParams={ \
"oracle.odi.prefer.dataserver.packages" : "" \
,"key.deserializer" : "org.apache.kafka.common.serialization.StringDeserializer" \
,"value.deserializer" : "org.apache.kafka.common.serialization.ByteArrayDeserializer" \
,"zookeeper.connect" : "maprdemo:5181" \
,"metadata.broker.list" : "this.will.be.ignored:9092"
,"group.id" : "New_Mapping_2_Physical"}, fromOffsets=None, messageHandler=None)
Traceback (most recent call last):
File "/tmp/New_Mapping_2_Physical.py", line 77, in <module>
,"group.id" : "New_Mapping_2_Physical"}, fromOffsets=None, messageHandler=None)
File "/opt/mapr/spark/spark-1.6.1/python/lib/pyspark.zip/pyspark/streaming/kafka.py", line 152, in createDirectStream
py4j.protocol.Py4JJavaError: An error occurred while calling o58.createDirectStreamWithoutMessageHandler.
: org.apache.spark.SparkException: java.nio.channels.ClosedChannelException
at org.apache.spark.streaming.kafka.KafkaCluster$$anonfun$checkErrors$1.apply(KafkaCluster.scala:366)
at org.apache.spark.streaming.kafka.KafkaCluster$$anonfun$checkErrors$1.apply(KafkaCluster.scala:366)
at scala.util.Either.fold(Either.scala:97)
at org.apache.spark.streaming.kafka.KafkaCluster$.checkErrors(KafkaCluster.scala:365)
at org.apache.spark.streaming.kafka.KafkaUtils$.getFromOffsets(KafkaUtils.scala:222)
at org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper.createDirectStream(KafkaUtils.scala:720)
at org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper.createDirectStreamWithoutMessageHandler(KafkaUtils.scala:688)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:209)
at java.lang.Thread.run(Thread.java:745)
In Scala it seems to work fine, but not in PySpark.
I downloaded the latest build, http://package.mapr.com/releases/ecosystem-5.x/redhat/mapr-spark-1.6.1.201612010646-1.noarch.rpm, and it resolved the issue.
I checked pyspark's kafka.py and found it updated. I was using label 1605; now 1611.