spark write data to hbase - apache-spark

I wrote a demo to write data to hbase, but there is no response, no error and no log.
My hbase is 0.98, hadoop 2.3, spark 1.4.
And I run in yarn-client mode. Any idea? Thanks.
object SparkConnectHbase2 extends Serializable {
  def main(args: Array[String]) {
    new SparkConnectHbase2().toHbase();
  }
}

class SparkConnectHbase2 extends Serializable {
  def toHbase() {
    val conf = new SparkConf().setAppName("ljh_ml3");
    val sc = new SparkContext(conf)
    val tmp = sc.parallelize(Array(601, 701, 801, 901)).foreachPartition({ a =>
      val configuration = HBaseConfiguration.create();
      configuration.set("hbase.zookeeper.property.clientPort", "2181");
      configuration.set("hbase.zookeeper.quorum", "192.168.1.66");
      configuration.set("hbase.master", "192.168.1.66:60000");
      val table = new HTable(configuration, "ljh_test4");
      var put = new Put(Bytes.toBytes(a + ""));
      put.add(Bytes.toBytes("f"), Bytes.toBytes("c"), Bytes.toBytes(a + "value"));
      table.put(put);
      table.flushCommits();
    })
  }
}
thanks.

Write to hbase table
import org.apache.hadoop.hbase.client.{HBaseAdmin, HTable, Put}
import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor, HColumnDescriptor, TableName}
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.spark._

val hconf = HBaseConfiguration.create()
hconf.set(TableInputFormat.INPUT_TABLE, tablename)

val admin = new HBaseAdmin(hconf)
if (!admin.isTableAvailable(tablename)) {
  val tabledesc = new HTableDescriptor(tablename)
  tabledesc.addFamily(new HColumnDescriptor("cf1".getBytes()))
  admin.createTable(tabledesc)
}

val newtable = new HTable(hconf, tablename)
val put = new Put(new String("row").getBytes())
put.add("cf1".getBytes(), "col1".getBytes(), new String("data").getBytes())
newtable.put(put)
newtable.flushCommits()

val hbaserdd = sc.newAPIHadoopRDD(hconf, classOf[TableInputFormat],
  classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
  classOf[org.apache.hadoop.hbase.client.Result])
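As a quick sanity check against the "no response" symptom in the question, counting the scan RDD above shows whether any rows actually landed in the table (this just reuses hconf and tablename from the snippet):
val rowCount = hbaserdd.count()   // rows visible through the TableInputFormat scan
println("rows in " + tablename + ": " + rowCount)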

Related

Is it possible to write a dataframe into 2 files of different type?

We can use the following APIs to write a dataframe to local files.
df.write.parquet(path)
df.write.json(path)
However, can I write to a parquet and a json at the same time, without computing the dataframe twice?
By the way, I don't want to cache the data in memory, because it's too big.
If you don't cache/persist the dataframe, it will be recomputed once for each output format. The usual workaround is to persist it (to disk if it is too big for memory) before the two writes, as in the sketch below.
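A minimal sketch of that persist-then-write-twice approach (assuming disk-only storage is acceptable; the output paths are placeholders):
import org.apache.spark.storage.StorageLevel

df.persist(StorageLevel.DISK_ONLY)    // too big for memory, so spill to disk instead
df.write.parquet("/tmp/out/parquet")  // the first write computes df and materializes the persisted blocks
df.write.json("/tmp/out/json")        // the second write reuses the persisted blocks instead of recomputing
df.unpersist()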
To genuinely write both formats in a single pass, we can implement a custom org.apache.spark.sql.execution.datasources.FileFormat.
DuplicateOutFormat demo
import org.apache.hadoop.fs.FileStatus
import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.{DataSource, FileFormat, OutputWriter, OutputWriterFactory}
import org.apache.spark.sql.sources.DataSourceRegister
import org.apache.spark.sql.types.StructType

/**
 * Very Dangerous Toy Code. DO NOT USE IN PRODUCTION.
 */
class DuplicateOutFormat
  extends FileFormat
    with DataSourceRegister
    with Serializable {

  // Write-only format: reading it back is not supported.
  override def inferSchema(sparkSession: SparkSession, options: Map[String, String], files: Seq[FileStatus]): Option[StructType] = {
    throw new UnsupportedOperationException()
  }

  override def prepareWrite(sparkSession: SparkSession,
                            job: Job,
                            options: Map[String, String],
                            dataSchema: StructType): OutputWriterFactory = {
    // Look up the two delegate formats named in the options and build a writer factory for each.
    val format1 = options("format1")
    val format2 = options("format2")
    val format1Instance = DataSource.lookupDataSource(format1, sparkSession.sessionState.conf)
      .newInstance().asInstanceOf[FileFormat]
    val format2Instance = DataSource.lookupDataSource(format2, sparkSession.sessionState.conf)
      .newInstance().asInstanceOf[FileFormat]
    val writerFactory1 = format1Instance.prepareWrite(sparkSession, job, options, dataSchema)
    val writerFactory2 = format2Instance.prepareWrite(sparkSession, job, options, dataSchema)

    new OutputWriterFactory {
      override def getFileExtension(context: TaskAttemptContext): String = ".dup"

      override def newInstance(path: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = {
        // Give each delegate writer its own file extension, then fan every row out to both.
        val path1 = path.replace(".dup", writerFactory1.getFileExtension(context))
        val path2 = path.replace(".dup", writerFactory2.getFileExtension(context))
        val writer1 = writerFactory1.newInstance(path1, dataSchema, context)
        val writer2 = writerFactory2.newInstance(path2, dataSchema, context)
        new OutputWriter {
          override def write(row: InternalRow): Unit = {
            writer1.write(row)
            writer2.write(row)
          }
          override def close(): Unit = {
            writer1.close()
            writer2.close()
          }
        }
      }
    }
  }

  override def shortName(): String = "dup"
}
SPI
We also need an SPI file, META-INF/services/org.apache.spark.sql.sources.DataSourceRegister, whose content is the fully qualified class name:
com.github.sparkdemo.DuplicateOutFormat
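In a standard Maven/SBT layout (an assumption about the project structure), that file sits under src/main/resources so it ends up on the classpath inside the jar:
src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister
  -> a single line: com.github.sparkdemo.DuplicateOutFormat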
demo usage
import org.apache.spark.sql.SparkSession
import org.scalatest.FunSuite

class DuplicateOutFormatTest extends FunSuite {
  val spark = SparkSession.builder()
    .master("local")
    .getOrCreate()
  val sc = spark.sparkContext

  import spark.implicits._

  test("testDuplicateWrite") {
    val data = Array(
      ("k1", "fa", "20210901", 16),
      ("k2", null, "20210902", 15),
      ("k3", "df", "20210903", 14),
      ("k4", null, "20210904", 13)
    )
    val tempDir = System.getProperty("java.io.tmpdir") + "spark-dup-test" + System.nanoTime()
    val df = sc.parallelize(data).toDF("k", "col2", "day", "col4")
    df.write
      .option("format1", "csv")
      .option("format2", "orc")
      .format("dup").save(tempDir)
    df.show(1000, false)
  }
}
WARNING
Spark SQL couples some things in DataFrameWriter#saveToV1Source and other source code that we can't change. This custom DuplicateOutFormat is just a demo and lacks tests. Full demo in github.

Save RDD as csv file using coalesce function

I am trying to stream Twitter data using Apache Spark in IntelliJ; however, when I use the function coalesce, it says that it cannot resolve symbol coalesce. Here is my main code:
val spark = SparkSession.builder().appName("twitterStream").master("local[*]").getOrCreate()
import spark.implicits._
val sc: SparkContext = spark.sparkContext
val streamContext = new StreamingContext(sc, Seconds(5))
val filters = Array("Singapore")
val filtered = TwitterUtils.createStream(streamContext, None, filters)
val englishTweets = filtered.filter(_.getLang() == "en")
//englishTweets.print()
englishTweets.foreachRDD { rdd =>
  val spark = SparkSession.builder.config(rdd.sparkContext.getConf).getOrCreate()
  import spark.implicits._
  val tweets = rdd.map(field =>
    (
      field.getId,
      field.getUser.getScreenName,
      field.getCreatedAt.toInstant.toString,
      field.getText.toLowerCase.split(" ").filter(_.matches("^[a-zA-Z0-9 ]+$")).fold("")((a, b) => a + " " + b).trim,
      sentiment(field.getText)
    )
  )
  val tweetsdf = tweets.toDF("userID", "user", "createdAt", "text", "sentimentType")
  tweetsdf.printSchema()
  tweetsdf.show(false)
}.coalesce(1).write.csv("hdfs://localhost:9000/usr/sparkApp/test/testing.csv")
I have tried this with my own dataset: I read a dataset and applied the coalesce function while writing, and it gives results. Please refer to the following, it may help you.
import org.apache.spark.sql.SparkSession
import com.spark.Rdd.DriverProgram
import org.apache.log4j.{ Logger, Level }
import org.apache.spark.sql.SaveMode
import java.sql.Date

object JsonDataDF {
  System.setProperty("hadoop.home.dir", "C:\\hadoop"); // This is the system property which is useful to find the winutils.exe
  Logger.getLogger("org").setLevel(Level.WARN) // This will remove logs

  case class AOK(appDate: Date, arr: String, base: String, Comments: String)

  val dp = new DriverProgram
  val spark = dp.getSparkSession()

  def main(args: Array[String]): Unit = {
    import spark.implicits._
    val jsonDf = spark.read.option("multiline", "true").json("C:\\Users\\34979\\Desktop\\Work\\Datasets\\JSONdata.txt").as[AOK]
    jsonDf.coalesce(1) // Refer here
      .write
      .mode(SaveMode.Overwrite)
      .option("header", "true")
      .format("csv")
      .save("C:\\Users\\34979\\Desktop\\Work\\Datasets\\JsonToCsv")
  }
}
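In the streaming code from the question, the compile error comes from chaining .coalesce(1) onto foreachRDD, which returns Unit, so there is nothing to coalesce. A sketch of applying the same coalesce-then-write idea inside the foreachRDD body from the question (the output directory and append mode are assumptions):
import org.apache.spark.sql.{SaveMode, SparkSession}

englishTweets.foreachRDD { rdd =>
  if (!rdd.isEmpty()) {
    val spark = SparkSession.builder.config(rdd.sparkContext.getConf).getOrCreate()
    import spark.implicits._
    val tweetsdf = rdd.map(t => (t.getId, t.getUser.getScreenName, t.getText)).toDF("userID", "user", "text")
    tweetsdf.coalesce(1)              // one part-file per micro-batch
      .write
      .mode(SaveMode.Append)          // keep the output of earlier batches
      .csv("hdfs://localhost:9000/usr/sparkApp/test/output")  // a directory, not a single .csv file
  }
}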

How to pass Basic Authentication to Confluent Schema Registry?

I want to read data from a Confluent Cloud topic and then write to another topic.
At localhost, I haven't had any major problems, but the Confluent Cloud schema registry requires passing some authentication data, and I don't know how to enter it:
basic.auth.credentials.source=USER_INFO
schema.registry.basic.auth.user.info=:
schema.registry.url=https://xxxxxxxxxx.confluent.cloud
Below is the current code:
import com.databricks.spark.avro.SchemaConverters
import io.confluent.kafka.schemaregistry.client.{CachedSchemaRegistryClient, SchemaRegistryClient}
import io.confluent.kafka.serializers.AbstractKafkaAvroDeserializer
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.spark.sql.SparkSession

object AvroConsumer {
  private val topic = "transactions"
  private val kafkaUrl = "http://localhost:9092"
  private val schemaRegistryUrl = "http://localhost:8081"

  private val schemaRegistryClient = new CachedSchemaRegistryClient(schemaRegistryUrl, 128)
  private val kafkaAvroDeserializer = new AvroDeserializer(schemaRegistryClient)

  private val avroSchema = schemaRegistryClient.getLatestSchemaMetadata(topic + "-value").getSchema
  private var sparkSchema = SchemaConverters.toSqlType(new Schema.Parser().parse(avroSchema))

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("ConfluentConsumer")
      .master("local[*]")
      .getOrCreate()

    spark.sparkContext.setLogLevel("ERROR")

    spark.udf.register("deserialize", (bytes: Array[Byte]) =>
      DeserializerWrapper.deserializer.deserialize(bytes)
    )

    val kafkaDataFrame = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", kafkaUrl)
      .option("subscribe", topic)
      .load()

    val valueDataFrame = kafkaDataFrame.selectExpr("""deserialize(value) AS message""")

    import org.apache.spark.sql.functions._

    val formattedDataFrame = valueDataFrame.select(
      from_json(col("message"), sparkSchema.dataType).alias("parsed_value"))
      .select("parsed_value.*")

    formattedDataFrame
      .writeStream
      .format("console")
      .option("truncate", false)
      .start()
      .awaitTermination()
  }

  object DeserializerWrapper {
    val deserializer = kafkaAvroDeserializer
  }

  class AvroDeserializer extends AbstractKafkaAvroDeserializer {
    def this(client: SchemaRegistryClient) {
      this()
      this.schemaRegistry = client
    }

    override def deserialize(bytes: Array[Byte]): String = {
      val genericRecord = super.deserialize(bytes).asInstanceOf[GenericRecord]
      genericRecord.toString
    }
  }
}
I think I have to pass this authentication data to CachedSchemaRegistryClient, but I'm not sure whether that is the right place or how to do it.
I've finally been able to pass the properties.
These are the lines that gave the solution:
import io.confluent.kafka.schemaregistry.client.CachedSchemaRegistryClient
import io.confluent.kafka.schemaregistry.client.rest.RestService
import scala.collection.JavaConverters._

val restService = new RestService(schemaRegistryURL)

val props = Map(
  "basic.auth.credentials.source" -> "USER_INFO",
  "schema.registry.basic.auth.user.info" -> "secret:secret"
).asJava

val schemaRegistryClient = new CachedSchemaRegistryClient(restService, 100, props)

How to batch insert into hbase using saveAsNewAPIHadoopDataset

I have just been learning Spark for a while. When using HBase I found the API saveAsNewAPIHadoopDataset; the code is below. As far as I know, this code inserts one row at a time. How can I change it to a batch put? I am a rookie, please help, thanks.
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.{SparkContext, SparkConf}

object HbaseTest2 {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("HBaseTest").setMaster("local")
    val sc = new SparkContext(sparkConf)

    val tablename = "account"
    sc.hadoopConfiguration.set("hbase.zookeeper.quorum", "slave1,slave2,slave3")
    sc.hadoopConfiguration.set("hbase.zookeeper.property.clientPort", "2181")
    sc.hadoopConfiguration.set(TableOutputFormat.OUTPUT_TABLE, tablename)

    val job = Job.getInstance(sc.hadoopConfiguration)
    job.setOutputKeyClass(classOf[ImmutableBytesWritable])
    job.setOutputValueClass(classOf[Result])
    job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])

    val indataRDD = sc.makeRDD(Array("1,jack,15", "2,Lily,16", "3,mike,16"))
    val rdd = indataRDD.map(_.split(',')).map { arr =>
      val put = new Put(Bytes.toBytes(arr(0)))
      put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("name"), Bytes.toBytes(arr(1)))
      put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("age"), Bytes.toBytes(arr(2).toInt))
      (new ImmutableBytesWritable, put)
    }
    rdd.saveAsNewAPIHadoopDataset(job.getConfiguration())
    sc.stop()
  }
}
Actually you don't need to worry about this - under the hood, put(Put) and put(List<Put>) are identical. They both buffer messages and flush them in batches. There should be no noticeable performance difference.
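For completeness: if you do want to batch the Puts yourself rather than rely on that buffering, a minimal sketch using foreachPartition and Table.put(List<Put>) could look like this (it reuses indataRDD from the question and assumes an HBase 1.x-style client API on the classpath):
import scala.collection.JavaConverters._
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes

indataRDD.map(_.split(',')).foreachPartition { rows =>
  // One connection per partition, then a single batched put for all rows in it.
  val conf = HBaseConfiguration.create()
  conf.set("hbase.zookeeper.quorum", "slave1,slave2,slave3")
  conf.set("hbase.zookeeper.property.clientPort", "2181")
  val connection = ConnectionFactory.createConnection(conf)
  val table = connection.getTable(TableName.valueOf("account"))
  val puts = rows.map { arr =>
    val put = new Put(Bytes.toBytes(arr(0)))
    put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("name"), Bytes.toBytes(arr(1)))
    put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("age"), Bytes.toBytes(arr(2).toInt))
    put
  }.toList
  table.put(puts.asJava)   // batched round-trip for the whole partition
  table.close()
  connection.close()
}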
I'm afraid the other answer is misguided.
saveAsNewAPIHadoopDataset performs single puts.
To perform a bulk put to an hbase table, you can use the hbase-spark connector.
The connector executes bulkPutFunc2 within mapPartition(), so it is efficient.
Your source code would change as below:
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}

object HBaseTest {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("HBaseTest").setMaster("local")
    val sc = new SparkContext(sparkConf)

    val tablename = "account"
    val hbaseConf = HBaseConfiguration.create()
    hbaseConf.set("hbase.zookeeper.quorum", "slave1,slave2,slave3")
    hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")
    hbaseConf.set("zookeeper.znode.parent", "/hbase")

    val hbaseContext = new HBaseContext(sc, hbaseConf)
    val indataRDD = sc.makeRDD(Array("1,jack,15", "2,Lily,16", "3,mike,16"))
    hbaseContext.bulkPut(indataRDD, TableName.valueOf(tablename), bulkPutFunc2)
    sc.stop()
  }

  def bulkPutFunc2(arrayRec: String): Put = {
    val rec = arrayRec.split(",")
    val put = new Put(Bytes.toBytes(rec(0).toInt))
    put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("name"), Bytes.toBytes(rec(1)))
    put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("age"), Bytes.toBytes(rec(2).toInt))
    put
  }
}
pom.xml would have the following entry:
<dependency>
  <groupId>org.apache.hbase</groupId>
  <artifactId>hbase-spark</artifactId>
  <version>1.2.0-cdh5.12.1</version>
</dependency>

Reading files dynamically from HDFS from within spark transformation functions

How can a file from HDFS be read in a Spark function without using sparkContext within the function?
Example:
val filedata_rdd = rdd.map { x => ReadFromHDFS(x.getFilePath) }
The question is how ReadFromHDFS can be implemented. Usually, to read from HDFS we could do sc.textFile, but in this case sc cannot be used inside the function.
You don't necessarily need the SparkContext to interact with HDFS. You can simply broadcast the hadoop configuration from the master and use the broadcasted configuration value on the executors to construct an org.apache.hadoop.fs.FileSystem. Then the world is yours. :)
Following is the code:
import java.io.StringWriter

import com.sachin.util.SparkIndexJobHelper._
import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SerializableWritable, SparkConf}

class Test {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[15]")
      .setAppName("TestJob")
    val sc = createSparkContext(conf)

    // Hadoop Configuration is not serializable, so wrap it before broadcasting.
    val confBroadcast = sc.broadcast(new SerializableWritable(sc.hadoopConfiguration))

    val rdd: RDD[String] = ??? // your existing rdd
    val filedata_rdd = rdd.map { x => readFromHDFS(confBroadcast.value.value, x) }
  }

  def readFromHDFS(configuration: Configuration, path: String): String = {
    val fs: FileSystem = FileSystem.get(configuration)
    val inputStream = fs.open(new Path(path))
    try {
      val writer = new StringWriter()
      IOUtils.copy(inputStream, writer, "UTF-8")
      writer.toString
    } finally {
      inputStream.close() // avoid leaking the HDFS stream
    }
  }
}
