package com.mypackage
import org.apache.spark.graphx._
import org.apache.spark.{SparkContext, SparkConf}
/**
* Created by sidazhang on 11/8/16.
*/
case class Person(age: Int)
case class EdgeImpl()
object GraphApp {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SparkMain").setMaster("local[1]")
    val sc = new SparkContext(conf)

    // Create an RDD for the vertices
    val vertices =
      sc.parallelize(Array((1L, Person(10)), (2L, Person(15)),
        (3L, Person(20)), (4L, Person(30))))

    // Create an RDD for edges
    val relationships =
      sc.parallelize(Array(Edge(2L, 1L, EdgeImpl()),
        Edge(3L, 1L, EdgeImpl()), Edge(4L, 1L, EdgeImpl())))

    val graph = Graph(vertices, relationships)

    // Collect, for every person, the Person attributes of their followers
    val olderFollowers: VertexRDD[Array[Person]] = graph.aggregateMessages[Array[Person]](
      ctx => ctx.sendToDst(Array(ctx.srcAttr)),
      // Merge the arrays of followers
      (a, b) => a ++ b
    )

    // Here I only have the id of the person and a list of their followers.
    // How do I get the vertex of the person?
    olderFollowers.collect.foreach { case (id, followers) => followers.foreach(println(id, _)) }
  }
}
The question is: through the aggregateMessages API I end up with only the VertexId. How do I get the actual vertex, i.e. the Person attribute?
(The question is also inline in the code above.)
You have to join it back with the original data:
graph.joinVertices(olderFollowers)(someMergingFunction).vertices
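For instance, a minimal sketch building on the question's graph and olderFollowers (the join and the println are only illustrative, not the exact someMergingFunction above): joining the aggregated followers back against graph.vertices pairs every id with both its own Person attribute and the collected followers.
// olderFollowers only carries (VertexId, Array[Person]); join it back with graph.vertices
// to recover the Person attribute of each followed vertex.
val followedWithAttr = graph.vertices.join(olderFollowers) // RDD[(VertexId, (Person, Array[Person]))]
followedWithAttr.collect.foreach { case (id, (person, followers)) =>
  println(s"$id ($person) is followed by ${followers.mkString(", ")}")
}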
[Image: snapshot of my Firebase Realtime Database]
I want to extract all of the data under the "Orders" node. How should I model my data classes for Android in Kotlin?
I tried this type of modeling, after getting the reference to (Orders/uid/):
Order.kt
data class Order(
    val items: ArrayList<MyItems> = ArrayList(),
    val timeStamp: Long = 0,
    val totalCost: Int = 0
)
MyItems.kt
data class MyItems(
    val Item: ArrayList<Menu> = ArrayList()
)
Menu.kt
data class Menu(
    val menCategory: String = "",
    val menName: String = "",
    val menImage: String = "",
    val menId: String = "",
    val menQuantity: Int = 0,
    val menCost: Int = 0
)
After a lot of thinking and research online, I was finally able to model my classes and attach a value event listener. Here it goes:
Order.kt
data class Order(
    val items: ArrayList<HashMap<String, Any>> = ArrayList(),
    val timeStamp: Long = 0,
    val totalCost: Int = 0
)
OItem.kt
data class OItem(
    val menCategory: String = "",
    val menId: String = "",
    val menImage: String = "",
    val menName: String = "",
    val menPrice: Int = 0,
    var menQuantity: Int = 0
)
MainActivity.kt
val uid = FirebaseAuth.getInstance().uid
val ref = FirebaseDatabase.getInstance().getReference("Orders/$uid")
val ordList = ArrayList<Order>() // declared here so the snippet is self-contained

ref.addListenerForSingleValueEvent(object : ValueEventListener {
    override fun onCancelled(error: DatabaseError) {
        // not handled in this example
    }

    override fun onDataChange(p0: DataSnapshot) {
        // Each child of Orders/<uid> deserializes into one Order
        p0.children.forEach {
            val order = it.getValue(Order::class.java)
            ordList.add(order!!)
        }
        Log.d("hf", ordList.toString())
    }
})
I have a dataframe, let's say:
val someDF = Seq(
(8, "bat"),
(64, "mouse"),
(-27, "horse")
).toDF("number", "word")
I want to send that dataframe to a Kafka topic using Avro serialization and the Schema Registry. I believe I'm almost there, but I can't seem to get past the Task not serializable error. I understand there is a sink for Kafka, but it doesn't communicate with the Schema Registry, which is a requirement.
object Holder extends Serializable {

  def prop(): java.util.Properties = {
    val props = new Properties()
    props.put("schema.registry.url", schemaRegistryURL)
    props.put("key.serializer", classOf[KafkaAvroSerializer].getCanonicalName)
    props.put("value.serializer", classOf[KafkaAvroSerializer].getCanonicalName)
    props.put("bootstrap.servers", brokers)
    props
  }

  def vProps(props: java.util.Properties): kafka.utils.VerifiableProperties = {
    val vProps = new kafka.utils.VerifiableProperties(props)
    vProps
  }

  def messageSchema(vProps: kafka.utils.VerifiableProperties): org.apache.avro.Schema = {
    val ser = new KafkaAvroEncoder(vProps)
    val avro_schema = new RestService(schemaRegistryURL).getLatestVersion(subjectValueName)
    val messageSchema = new Schema.Parser().parse(avro_schema.getSchema)
    messageSchema
  }

  def avroRecord(messageSchema: org.apache.avro.Schema): org.apache.avro.generic.GenericData.Record = {
    val avroRecord = new GenericData.Record(messageSchema)
    avroRecord
  }

  def ProducerRecord(avroRecord: org.apache.avro.generic.GenericData.Record): org.apache.kafka.clients.producer.ProducerRecord[org.apache.avro.generic.GenericRecord, org.apache.avro.generic.GenericRecord] = {
    val record = new ProducerRecord[GenericRecord, GenericRecord](topicWrite, avroRecord)
    record
  }

  def producer(props: java.util.Properties): KafkaProducer[GenericRecord, GenericRecord] = {
    val producer = new KafkaProducer[GenericRecord, GenericRecord](props)
    producer
  }
}
val prod: (String, String) => String = (
  number: String,
  word: String
) => {
  val prop = Holder.prop()
  val vProps = Holder.vProps(prop)
  val mSchema = Holder.messageSchema(vProps)
  val aRecord = Holder.avroRecord(mSchema)
  aRecord.put("number", number)
  aRecord.put("word", word)
  val record = Holder.ProducerRecord(aRecord)
  val producer = Holder.producer(prop)
  producer.send(record)
  "sent"
}

val prodUDF: org.apache.spark.sql.expressions.UserDefinedFunction =
  udf((
    number: String,
    word: String
  ) => prod(number, word))
val testDF = firstDF.withColumn("sent", prodUDF(col("number"), col("word")))
KafkaProducer is not serializable.
Create the KafkaProducer inside prod() instead of creating it outside.
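To make that concrete, here is a minimal sketch of the difference (it reuses the question's Holder object; everything else is illustrative, not the exact original code):
import org.apache.spark.sql.functions.udf

// Wrong: a producer created on the driver is captured by the UDF closure, so Spark
// tries to serialize it and fails with "Task not serializable".
// val driverProducer = Holder.producer(Holder.prop())
// val badUDF = udf((number: String, word: String) => { driverProducer.send(...); "sent" })

// Right: everything Kafka-related is built inside the function body, so it is
// instantiated on the executor and only the serializable string arguments are shipped.
val goodUDF = udf((number: String, word: String) => {
  val props = Holder.prop()
  val producer = Holder.producer(props)
  val record = Holder.avroRecord(Holder.messageSchema(Holder.vProps(props)))
  record.put("number", number)
  record.put("word", word)
  producer.send(Holder.ProducerRecord(record))
  producer.close() // one producer per call is wasteful, but keeps the sketch simple
  "sent"
})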
I have two Kafka streams that contain results for two parallel operations. I need a way to combine both streams so I can process the results in a single Spark transform. Is this possible? (Illustration below.)
Stream 1 {id:1,result1:True}
Stream 2 {id:1,result2:False}
JOIN(Stream 1, Stream 2, On "id") -> Output Stream {id:1,result1:True,result2:False}
Current code that isn't working:
kvs1 = KafkaUtils.createStream(sparkstreamingcontext, ZOOKEEPER, NAME+"_stream", {"test_join_1": 1})
kvs2 = KafkaUtils.createStream(sparkstreamingcontext, ZOOKEEPER, NAME+"_stream", {"test_join_2": 1})
messages_RDDstream1 = kvs1.map(lambda x: x[1])
messages_RDDstream2 = kvs2.map(lambda x: x[1])
messages_RDDstream_Final = messages_RDDstream1.join(messages_RDDstream2)
When I pass two sample JSONs with the same ID field to each Kafka queue, nothing is returned in my final RDD stream. I imagine I am missing the stage of converting my Kafka JSON string message into a tuple?
I have also tried the following:
kvs1.map(lambda (key, value): json.loads(value))
and
kvs1.map(lambda x: json.loads(x))
To no avail
Cheers
Adam
A simple lookup in Spark's documentation would have given you the answer.
You can use the join operation.
join(otherStream, [numTasks]):
When called on two DStreams of (K, V) and (K, W) pairs, return a new DStream of (K, (V, W)) pairs with all pairs of elements for each key.
For example: val streamJoined = stream1.join(stream2)
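The step the question is missing is keying the streams before the join: join only works on DStreams of (K, V) pairs, so each raw Kafka JSON message first has to be mapped to an (id, message) tuple. A rough Scala sketch (stream1/stream2 stand in for the question's kvs1/kvs2, i.e. the (key, value) DStreams returned by KafkaUtils.createStream; the regex id extraction is only a naive placeholder, use a real JSON parser in practice):
// Key each stream by the "id" field so that join sees (K, V) pairs.
def extractId(json: String): String =
  """id"?\s*:\s*(\d+)""".r.findFirstMatchIn(json).map(_.group(1)).getOrElse("")

val keyed1 = stream1.map { case (_, value) => (extractId(value), value) }
val keyed2 = stream2.map { case (_, value) => (extractId(value), value) }

// DStream[(String, (String, String))]: one entry per id present in both batches
val streamJoined = keyed1.join(keyed2)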
What you need can be done using the join() method of key-value pair DStreams:
// Test data
val input1 = List((1, true), (2, false), (3, false), (4, true), (5, false))
val input2 = List((1, false), (2, false), (3, true), (4, true), (5, true))
val input1RDD = sc.parallelize(input1)
val input2RDD = sc.parallelize(input2)
import org.apache.spark.streaming.{Seconds, StreamingContext}
val streamingContext = new StreamingContext(sc, Seconds(3))
// Creates a DStream from the test data
import scala.collection.mutable
val input1DStream = streamingContext.queueStream[(Int, Boolean)](mutable.Queue(input1RDD))
val input2DStream = streamingContext.queueStream[(Int, Boolean)](mutable.Queue(input2RDD))
// Join the two streams together by merging them into a single dstream
val joinedDStream = input1DStream.join(input2DStream)
// Print the result
joinedDStream.print()
// Start the context, time out after one batch, and then stop it
streamingContext.start()
streamingContext.awaitTerminationOrTimeout(5000)
streamingContext.stop()
Results in:
-------------------------------------------
Time: 1468313607000 ms
-------------------------------------------
(4,(true,true))
(2,(false,false))
(1,(true,false))
(3,(false,true))
(5,(false,true))
I have joined two queueStreams using Spark Java. Please have a look at the code below.
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Queue;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import com.google.common.collect.Queues;
import scala.Tuple2;
public class SparkQueueStreamJoin {

    public static void main(String[] args) throws InterruptedException {
        // Test data
        List<Pair<Integer, Boolean>> input1 = Arrays.asList(Pair.of(1, true), Pair.of(2, false), Pair.of(3, false), Pair.of(4, true), Pair.of(5, false));
        List<Pair<Integer, Boolean>> input2 = Arrays.asList(Pair.of(1, false), Pair.of(2, false), Pair.of(3, true), Pair.of(4, true), Pair.of(5, true));

        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("SparkQueueStreamJoin")
                .set("spark.testing.memory", "2147480000");
        //System.setProperty("hadoop.home.dir", "C:/Hadoop/hadoop-2.7.1");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<Pair<Integer, Boolean>> input1RDD = sc.parallelize(input1);
        JavaRDD<Pair<Integer, Boolean>> input2RDD = sc.parallelize(input2);

        JavaStreamingContext streamingContext = new JavaStreamingContext(sc, Durations.seconds(3));

        Queue<JavaRDD<Pair<Integer, Boolean>>> queue1RDD = Queues.newLinkedBlockingQueue();
        queue1RDD.add(input1RDD);
        Queue<JavaRDD<Pair<Integer, Boolean>>> queue2RDD = Queues.newLinkedBlockingQueue();
        queue2RDD.add(input2RDD);

        // Creates a DStream from the test data
        JavaInputDStream<Pair<Integer, Boolean>> input1DStream = streamingContext.queueStream(queue1RDD, false);
        JavaInputDStream<Pair<Integer, Boolean>> input2DStream = streamingContext.queueStream(queue2RDD, false);

        JavaPairDStream<Integer, Boolean> pair1DStream = input1DStream.mapToPair(new PairFunction<Pair<Integer, Boolean>, Integer, Boolean>() {
            @Override
            public Tuple2<Integer, Boolean> call(Pair<Integer, Boolean> rawEvent) throws Exception {
                return new Tuple2<>(rawEvent.getKey(), rawEvent.getValue());
            }
        });
        JavaPairDStream<Integer, Boolean> pair2DStream = input2DStream.mapToPair(new PairFunction<Pair<Integer, Boolean>, Integer, Boolean>() {
            @Override
            public Tuple2<Integer, Boolean> call(Pair<Integer, Boolean> rawEvent) throws Exception {
                return new Tuple2<>(rawEvent.getKey(), rawEvent.getValue());
            }
        });

        // Union of the two streams would merge them into a single dstream:
        //JavaDStream<Pair<Integer, Boolean>> joinedDStream = input1DStream.union(input2DStream);

        // Join the two streams together by key into a single dstream
        JavaPairDStream<Integer, Tuple2<Boolean, Boolean>> joinedDStream = pair1DStream.join(pair2DStream);

        // Print the result
        joinedDStream.print();

        // Start the context, time out after one batch, and then stop it
        streamingContext.start();
        streamingContext.awaitTerminationOrTimeout(5000);
        streamingContext.stop();
    }
}
Output:
-------------------------------------------
Time: 1511444352000 ms
-------------------------------------------
(1,(true,false))
(2,(false,false))
(3,(false,true))
(4,(true,true))
(5,(false,true))
I have created a GraphFrame in Spark and the graph currently looks as follows: [graph snapshot not shown]
Basically, there will be a lot of such subgraphs, each of them disconnected from the others. Given a particular node ID, I want to find all the other nodes within its subgraph. For instance, if node ID 1 is given, the traversal should return 2, 10, 20, 3, 30.
I have created a motif but it doesn't give the right result.
testgraph.find("(a)-[]->(b); (c)-[]->(b)").filter("(a.id = '1')").show()
Unfortunately the connected components function considers the whole graph. Is it possible to get all the nodes within a disconnected subgraph, given a particular node ID, using GraphFrames/GraphX?
Getting the connected component that contains a specific vertex can be done with a BFS traversal that starts from this vertex and collects its neighbors over several hops.
This can be done simply through the Pregel API offered by GraphX, where we implement the vertexProgram, sendMessage and mergeMessages functions. The algorithm is triggered by the reception of an initial message: the center sends a message to its neighbors, which propagate it to their own neighbors, and so on until the connected component is covered. Every vertex that receives a message is marked as checked so that it is not activated again in the following iterations.
Here is the implementation of this approach:
import org.apache.spark.graphx._
import org.apache.spark.{SparkConf, SparkContext}
object ConnectedComponent extends Serializable {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("ConnectedComponent").setMaster("local")
    val sc = new SparkContext(conf)

    val vRDD = sc.objectFile[(VertexId, Int)]("/path/to/vertex/rdd/file/")
    val eRDD = sc.objectFile[Edge[Int]]("/path/to/edge/rdd/file/")
    val graph = Graph(vRDD, eRDD)

    val centerOfCC = graph.pickRandomVertex()
    val cc = extractCC(graph, centerOfCC)
    cc.vertices.collect.foreach(println)

    sc.stop()
  }

  def extractCC(g: Graph[Int, Int], center: VertexId): Graph[Int, Int] = {
    /* Return the subgraph of the input graph that is the connected component containing 'center' */
    val initialGraph = g.mapVertices((id, attr) => VertexData(attr, false, false, center))
    val connectedComponent = initialGraph.pregel(initialMsg = 0)(vprog, sendMsg, mergeMsgs)
      .subgraph(vpred = (id, attr) => attr.checked == true)
      .mapVertices((id, vdata) => vdata.attr)
    connectedComponent
  }

  case class VertexData(var attr: Int,          // label of the vertex
                        var checked: Boolean,   // marks visited vertices
                        var propagate: Boolean, // allow forwarding msgs or not
                        var center: VertexId)   // ID of the connected-component center

  def vprog(id: VertexId, vdata: VertexData, msg: Int): VertexData = {
    val attr: Int = vdata.attr
    var checked: Boolean = vdata.checked
    var propagate: Boolean = vdata.propagate
    val center: VertexId = vdata.center

    if (checked == false && msg == 0 && id == center) {
      // initial superstep: only the center activates itself
      propagate = true
      checked = true
    } else if (checked == false && msg == 1) {
      // first visit: mark as part of the component and keep propagating
      propagate = true
      checked = true
    } else if (checked == true && msg == 1) {
      // already visited: stop forwarding
      propagate = false
    }
    new VertexData(attr, checked, propagate, center)
  }

  def sendMsg(triplet: EdgeTriplet[VertexData, Int]): Iterator[(VertexId, Int)] = {
    var it: Iterator[(VertexId, Int)] = Iterator()
    if (triplet.dstAttr.propagate == true)
      it = it ++ Iterator((triplet.srcId, 1))
    if (triplet.srcAttr.propagate == true)
      it = it ++ Iterator((triplet.dstId, 1))
    it
  }

  def mergeMsgs(a: Int, b: Int): Int = math.max(a, b)
}
I am new to Spark SQL. The concat function is not available in my version of Spark SQL, so we registered a custom SQL function; inside this function I need to access another table, and for that we have written a Spark SQL query on the SQLContext object.
When I invoke this query I am getting a NullPointerException. Please can you help with this?
Thanks in advance.
// This is my code
class SalesHistory_2(sqlContext: SQLContext, sparkContext: SparkContext) extends Serializable {

  import sqlContext._
  import sqlContext.createSchemaRDD

  try {
    sqlContext.registerFunction("MaterialTransformation", Material_Transformation _)

    def Material_Transformation(Material_ID: String): String = {
      var material: String = null
      var dd = sqlContext.sql("select * from product_master")
      material
    }

    /* Product master */
    val productRDD = this.sparkContext.textFile("D:\\Realease 8.0\\files\\BHI\\BHI_SOP_PRODUCT_MASTER.txt")
    val product_schemaString = productRDD.first
    val product_withoutHeaders = dropHeader(productRDD)
    val product_schema = StructType(product_schemaString.split("\\|").map(fieldName => StructField(fieldName, StringType, true)))
    val productdata = product_withoutHeaders.map { _.replace("|", "| ") }.map(x => x.split("\\|"))
    var product_rowRDD = productdata.map(line => {
      Row.fromSeq(line.map { _.trim() })
    })
    val product_srctableRDD = sqlContext.applySchema(product_rowRDD, product_schema)
    product_srctableRDD.registerTempTable("product_master")
    cacheTable("product_master")

    /* Customer master */

    /* Sales History */
    val srcRDD = this.sparkContext.textFile("D:\\Realease 8.0\\files\\BHI\\BHI_SOP_TRADE_SALES_HISTORY_DS_4_20150119.txt")
    val schemaString = srcRDD.first
    val withoutHeaders = dropHeader(srcRDD)
    val schema = StructType(schemaString.split("\\|").map(fieldName => StructField(fieldName, StringType, true)))
    val lines = withoutHeaders.map { _.replace("|", "| ") }.map(x => x.split("\\|"))
    var rowRDD = lines.map(line => {
      Row.fromSeq(line.map { _.trim() })
    })
    val srctableRDD = sqlContext.applySchema(rowRDD, schema)
    srctableRDD.registerTempTable("SALES_HISTORY")

    val srcResults = sqlContext.sql("SELECT Delivery_Number,Delivery_Line_Item,MaterialTransformation(Material_ID),Customer_Group_Node,Ops_ID,DC_ID,Mfg_ID,PGI_Date,Delivery_Qty,Customer_Group_Node,Line_Total_COGS,Line_Net_Rev,Material_Description,Sold_To_Partner_Name,Plant_Description,Originating_Doc,Orig_Doc_Line_item,Revenue_Type,Material_Doc_Ref,Mater_Doc_Ref_Item,Req_Delivery_Date FROM SALES_HISTORY")

    val path: Path = Path("D:/Realease 8.0/files/output/")
    try {
      path.deleteRecursively(continueOnFailure = false)
    } catch {
      case e: IOException => // some file could not be deleted
    }

    val successRDDToFile = srcResults.map { x => x.mkString("|") }
    successRDDToFile.coalesce(1).saveAsTextFile("D:/Realease 8.0/files/output/")
  } catch {
    case ex: Exception => println(ex) // TODO: handle error
  }

  this.sparkContext.stop()

  def dropHeader(data: RDD[String]): RDD[String] = {
    data.mapPartitionsWithIndex((idx, lines) => {
      if (idx == 0) {
        lines.drop(1)
      }
      lines
    })
  }
}
The answer here is rather short and probably disappointing - you simply cannot do something like this.
The general rule in Spark is that you cannot trigger an action or transformation from within another action or transformation; to be a little more precise, outside the driver the SparkContext is no longer accessible / defined.
Calling Spark SQL for each row in the Sales History RDD looks like a very bad idea:
val srcResults = sqlContext.sql("SELECT Delivery_Number,Delivery_Line_Item,MaterialTransformation(Material_ID),Customer_Group_Node,Ops_ID,DC_ID,Mfg_ID,PGI_Date,Delivery_Qty,Customer_Group_Node,Line_Total_COGS,Line_Net_Rev,Material_Description,Sold_To_Partner_Name,Plant_Description,Originating_Doc,Orig_Doc_Line_item,Revenue_Type,Material_Doc_Ref,Mater_Doc_Ref_Item,Req_Delivery_Date FROM SALES_HISTORY")
You'd better use a join between your RDDs and forget your custom function:
val srcResults = sqlContext.sql("SELECT s.*, p.* FROM SALES_HISTORY s join product_master p on s.Material_ID=p.ID")
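The joined result can then replace srcResults in the rest of the original pipeline unchanged, for example (paths taken from the question; the ID column on product_master is assumed here, exactly as in the query above):
val joined = sqlContext.sql(
  "SELECT s.*, p.* FROM SALES_HISTORY s JOIN product_master p ON s.Material_ID = p.ID")
// Write it out the same way the original code wrote srcResults
joined.map(_.mkString("|"))
  .coalesce(1)
  .saveAsTextFile("D:/Realease 8.0/files/output/")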