Can't load from multiple databases using Spark-Mongo driver - apache-spark

I am working on a Spark module where I need to load collections from multiple sources (databases), but I can't get the collection from the second database.
Databases
DB1: L_coll1
DB2: L_coll2
Logic code
String mst ="local[*]";
String host= "localhost";
String port = "27017";
String DB1 = "DB1";
String DB2 = "DB2";
SparkConf conf = new SparkConf().setAppName("cust data").setMaster(mst);
SparkSession spark = SparkSession
.builder()
.config(conf)
.config("spark.mongodb.input.uri", "mongodb://"+host+":"+port+"/")
.config("spark.mongodb.input.database",DB1)
.config("spark.mongodb.input.collection","coll1")
.getOrCreate();
SparkSession spark1 = SparkSession
.builder()
.config(conf)
.config("spark.mongodb.input.uri", "mongodb://"+host+":"+port+"/")
.config("spark.mongodb.input.database",DB2)
.config("spark.mongodb.input.collection","coll2")
.getOrCreate();
JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
JavaSparkContext jsc1 = new JavaSparkContext(spark1.sparkContext());
Reading configurations
ReadConfig readConfig = ReadConfig.create(spark);
Dataset<Row> MongoDatset = MongoSpark.load(jsc,readConfig).toDF();
MongoDatset.show();
ReadConfig readConfig1 = ReadConfig.create(spark1);
Dataset<Row> MongoDatset1 = MongoSpark.load(jsc1,readConfig1).toDF();
MongoDatset1.show();
After running the above code, I get the first dataset multiple times. If I comment out the first SparkSession (spark) instance, then I get only the collection from the second database, DB2.

Instead of using multiple Spark sessions (the second SparkSession.builder()...getOrCreate() call returns the same underlying session, so the second set of input options does not take effect the way you expect), you can use ReadConfig's override options to read from multiple databases and collections.
Creating spark session
String DB = "DB1";
String DB1 = "DB2";
String Coll1 ="Coll1";
String Coll2 ="Coll2";
SparkSession spark = SparkSession.builder()
.master("local")
.appName("MongoSparkConnectorIntro")
.config("spark.mongodb.input.uri", "mongodb://127.0.0.1/test.myCollection")
.config("spark.mongodb.output.uri", "mongodb://127.0.0.1/test.myCollection")
.getOrCreate();
// Create a JavaSparkContext using the SparkSession's SparkContext object
JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
Get database function
private static Dataset<Row> getDB(JavaSparkContext jsc_, String DB, String Coll1) {
// Create a custom ReadConfig
Map<String, String> readOverrides = new HashMap<String, String>();
readOverrides.put("database",DB );
readOverrides.put("collection", Coll1);
readOverrides.put("readPreference.name", "secondaryPreferred");
System.out.println(readOverrides);
ReadConfig readConfig = ReadConfig.create(jsc_).withOptions(readOverrides);
return MongoSpark.load(jsc_,readConfig).toDF();
}
Using getDB to load from multiple databases
Dataset<Row> MongoDatset1 = getDB(jsc, DB, Coll1);
Dataset<Row> MongoDatset2 = getDB(jsc, DB1, Coll2);
MongoDatset1.show(1);
MongoDatset2.show(1);

Related

Write the final dataset output for spark Java into s3

I am not able to find the correct way to write a Spark Dataset to S3. What additional configuration do I need? Do I have to specify the AWS credentials in my code, or will Spark pick them up from the local .aws/ profile?
Please guide.
import java.util.Properties;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
public class sparkSqlMysql {
private static final org.apache.log4j.Logger LOGGER = org.apache.log4j.Logger.getLogger(sparkSqlMysql.class);
private static final SparkSession sparkSession = SparkSession.builder().master("local[*]").appName("Spark2JdbcDs")
.getOrCreate();
public static void main(String[] args) {
// JDBC connection properties
final Properties connectionProperties = new Properties();
connectionProperties.put("user", "root");
connectionProperties.put("password", "password");
connectionProperties.put("driver", "com.mysql.jdbc.Driver");
final String dbTable = "(select * from Fielding) t";
final String dbTable1 = "(select * from Salaries) m";
final String dbTable2 = "(select * from Pitching) n";
// Load MySQL query result as Dataset
Dataset<Row> jdbcDF2 = sparkSession.read().jdbc("jdbc:mysql://localhost:3306/lahman2016", dbTable,
connectionProperties);
Dataset<Row> jdbcDF3 = sparkSession.read().jdbc("jdbc:mysql://localhost:3306/lahman2016", dbTable1,
connectionProperties);
Dataset<Row> jdbcDF4 = sparkSession.read().jdbc("jdbc:mysql://localhost:3306/lahman2016", dbTable2,
connectionProperties);
jdbcDF2.createOrReplaceTempView("Fielding");
jdbcDF3.createOrReplaceTempView("Salaries");
jdbcDF4.createOrReplaceTempView("Pitching");
Dataset<Row> sqlDF = sparkSession.sql(
"select Salaries.yearID, avg(Salaries.salary) as Fielding from Salaries inner join Fielding ON Salaries.yearID = Fielding.yearID AND Salaries.playerID = Fielding.playerID group by Salaries.yearID limit 5");
Dataset<Row> sqlDF1 = sparkSession.sql(
"select Salaries.yearID, avg(Salaries.salary) as Pitching from Salaries inner join Pitching ON Salaries.yearID = Pitching.yearID AND Salaries.playerID = Pitching.playerID group by Salaries.yearID limit 5");
// sqlDF.show();
// sqlDF1.show();
sqlDF.createOrReplaceTempView("avg_fielding");
sqlDF1.createOrReplaceTempView("avg_pitching");
Dataset<Row> final_query_1_output = sparkSession.sql(
"select avg_fielding.yearID, avg_fielding.Fielding, avg_pitching.Pitching from avg_fielding inner join avg_pitching ON avg_pitching.yearID = avg_fielding.yearID");
final_query_1_output.show();
The output of final_query_1_output.show() is:
+------+------------------+------------------+
|yearID| Fielding| Pitching|
+------+------------------+------------------+
| 1990| 507978.625320787| 485947.2487437186|
| 2003|2216200.9609838845|2133800.1867612293|
| 2007|2633213.0126475547|2617533.3393665156|
| 2015|3996199.5729421354| 3955581.121535181|
| 2006| 2565803.492487479| 2534756.866972477|
+------+------------------+------------------+
I want to write this dataset to S3: how can I do that?
final_query_1_output.write().mode("overwrite").save("s3n://druids3migration/data.csv");
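One way that should work is to go through the s3a filesystem. This is a minimal sketch, not a verified answer: it assumes hadoop-aws and a matching aws-java-sdk are on the classpath, and the credential values below are placeholders. If credentials are available through the default AWS provider chain (environment variables, ~/.aws/credentials, or an IAM role), the two set() calls can be dropped.
// Hypothetical credentials; the s3a connector can also resolve them from the default AWS provider chain.
sparkSession.sparkContext().hadoopConfiguration().set("fs.s3a.access.key", "<ACCESS_KEY>");
sparkSession.sparkContext().hadoopConfiguration().set("fs.s3a.secret.key", "<SECRET_KEY>");
// Write as CSV explicitly; a bare save(...) would use the default format (Parquet).
final_query_1_output.write().mode("overwrite").option("header", "true").csv("s3a://druids3migration/data");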

How to batch insert into hbase using saveAsNewAPIHadoopDataset

I have only been learning Spark for a little while. I found the API saveAsNewAPIHadoopDataset for use with HBase; my code is below. As far as I know, this code inserts one row at a time. How can I change it to a batch put? I am a rookie, please help. Thanks.
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.{SparkContext, SparkConf}
/**
*
*/
object HbaseTest2 {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf().setAppName("HBaseTest").setMaster("local")
val sc = new SparkContext(sparkConf)
val tablename = "account"
sc.hadoopConfiguration.set("hbase.zookeeper.quorum","slave1,slave2,slave3")
sc.hadoopConfiguration.set("hbase.zookeeper.property.clientPort", "2181")
sc.hadoopConfiguration.set(TableOutputFormat.OUTPUT_TABLE, tablename)
val job = Job.getInstance(sc.hadoopConfiguration)
job.setOutputKeyClass(classOf[ImmutableBytesWritable])
job.setOutputValueClass(classOf[Result])
job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
val indataRDD = sc.makeRDD(Array("1,jack,15","2,Lily,16","3,mike,16"))
val rdd = indataRDD.map(_.split(',')).map{arr=>{
val put = new Put(Bytes.toBytes(arr(0)))
put.addColumn(Bytes.toBytes("cf"),Bytes.toBytes("name"),Bytes.toBytes(arr(1)))
put.addColumn(Bytes.toBytes("cf"),Bytes.toBytes("age"),Bytes.toBytes(arr(2).toInt))
(new ImmutableBytesWritable, put)
}}
rdd.saveAsNewAPIHadoopDataset(job.getConfiguration())
sc.stop()
}
}
Actually you don't need to worry about this - under the hood, put(Put) and put(List<Put>) are identical. They both buffer messages and flush them in batches. There should be no noticeable performance difference.
I'm afraid the other answer is misguided.
saveAsNewAPIHadoopDataset performs single puts.
To perform a bulk put into an HBase table, you can use the hbase-spark connector.
The connector executes bulkPutFunc2 within mapPartitions(), so it is efficient.
Your source code would change as below:
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}
object HBaseTest {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf().setAppName("HBaseTest").setMaster("local")
val sc = new SparkContext(sparkConf)
val tablename = "account"
val hbaseConf = HBaseConfiguration.create()
hbaseConf.set("hbase.zookeeper.quorum", "slave1,slave2,slave3")
hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")
hbaseConf.set("zookeeper.znode.parent", "/hbase")
val hbaseContext = new HBaseContext(sc, hbaseConf)
val indataRDD = sc.makeRDD(Array("1,jack,15", "2,Lily,16", "3,mike,16"))
hbaseContext.bulkPut(indataRDD, TableName.valueOf(tablename), bulkPutFunc2)
sc.stop()
}
def bulkPutFunc2(arrayRec : String): Put = {
val rec = arrayRec.split(",")
val put = new Put(Bytes.toBytes(rec(0).toInt))
put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("name"), Bytes.toBytes(rec(1)))
put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("age"), Bytes.toBytes(rec(2).toInt))
put
}
}
pom.xml would have the following entry:
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-spark</artifactId>
<version>1.2.0-cdh5.12.1</version>
</dependency>

How to make Task Serializable in HBase using Spark

I was trying to write data to HBase using Spark but I am getting the exception Exception in thread "main" org.apache.spark.SparkException: Task not serializable. I was trying to open a connection on each worker node using the following code snippet:
val conf = HBaseConfiguration.create()
val tableName = args(1)
conf.set(TableInputFormat.INPUT_TABLE, tableName)
val admin = new HBaseAdmin(conf)
val tableDesc = new HTableDescriptor(tableName)
val columnDesc = new HColumnDescriptor("cf".getBytes()).setBloomFilterType(BloomType.ROWCOL).setMaxVersions(5)
tableDesc.addFamily(columnDesc)
admin.createTable(tableDesc)
rddData.foreachPartition( part => {
val table = new HTable(conf, tableName)
part.foreach( elem => {
var put = new Put(Bytes.toBytes(elem._1))
put.add(Bytes.toBytes("cf"), Bytes.toBytes("col"), Bytes.toBytes(elem._2))
table.put(put)
})
table.flushCommits()
})
How can I make the task serializable while writing to HBase using Spark?
If I am not mistaken, conf (an instance of Hadoop Configuration) is not serializable.
Write your code in such a way that all the non-serializable parts are inside the foreachPartition block (so that they are created and used on the worker nodes). Here is an example where I create a second conf, etc.:
rddData.foreachPartition( part => {
val conf2 = HBaseConfiguration.create()
val tableName2 = args(1)
conf2.set(TableInputFormat.INPUT_TABLE, tableName2)
val table2 = new HTable(conf2, tableName2)
part.foreach( elem => {
var put = new Put(Bytes.toBytes(elem._1))
put.add(Bytes.toBytes("cf"), Bytes.toBytes("col"), Bytes.toBytes(elem._2))
table2.put(put)
})
table2.flushCommits()
})

Create a StructType from String in Spark Streaming

In Spark Structured Streaming I want to create a StructType from a String.
In the example below, the spark read method accepts only a StructType for its schema. How can I create a StructType from a String? I want to convert the employeeSchema String to a StructType.
public static void main(String[] args) throws AnalysisException {
String master = "local[*]";
SparkSession sparkSession = SparkSession
.builder().appName(EmployeeSchemaLoader.class.getName())
.master(master).getOrCreate();
String employeeSchema = "StructType(\n" +
"StructField(firstName,StringType,true),\n" +
"StructField(lastName,StringType,true),\n" +
"StructField(addresses,\n" +
"ArrayType(\n" +
"StructType(\n" +
"StructField(city,StringType,true), \n" +
"StructField(state,StringType,true)\n" +
"),\n" +
"true),\n" +
"true) \n" +
")";
SparkContext context = sparkSession.sparkContext();
context.setLogLevel("ERROR");
SQLContext sqlCtx = sparkSession.sqlContext();
Dataset<Row> employeeDataset = sparkSession.read()
//.schema(employeeSchema) // Accepts only Struct Type
.json("simple_employees.json");
employeeDataset.printSchema();
employeeDataset.createOrReplaceTempView("employeeView");
sparkSession.catalog().listTables().show();
sqlCtx.sql("select * from employeeView").show();
I'm not sure why you would want to do this. Instead of making employeeSchema a String, why not make it a StructType? Like this:
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

StructType addressSchema = new StructType()
.add("city", DataTypes.StringType, true)
.add("state", DataTypes.StringType, true);
StructType employeeSchema = new StructType()
.add("firstName", DataTypes.StringType, true)
.add("lastName", DataTypes.StringType, true)
.add("addresses", DataTypes.createArrayType(addressSchema, true), true);
Another option (shown here in PySpark) is to round-trip an existing DataFrame's schema through its JSON representation:
from pyspark.sql.types import StructType
schema = inputdf.schema
print(type(inputdf.schema))
# just to display all methods available on schema
print(dir(schema))
new_schema = StructType.fromJson(schema.jsonValue())
print(type(new_schema))
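If the schema really has to start out as a String in Java, one hedged option (assuming Spark 2.3+, where StructType.fromDDL is available) is to express it as a DDL string rather than the StructType.toString() form shown in the question; the DDL below is my re-expression of that schema:
import org.apache.spark.sql.types.StructType;
// Build the schema from a DDL string and pass it to the reader.
StructType employeeSchema = StructType.fromDDL(
"firstName STRING, lastName STRING, addresses ARRAY<STRUCT<city: STRING, state: STRING>>");
Dataset<Row> employeeDataset = sparkSession.read().schema(employeeSchema).json("simple_employees.json");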

Reading Hive Table in Spark

Which of the following approaches is better if I have billions of records in the Hive table?
Direct:
SparkConf conf = new SparkConf(true).setMaster("yarn-cluster").setAppName("DCA_HIVE_HDFS");
SparkContext sc = new SparkContext(conf);
HiveContext hc = new HiveContext(sc);
DataFrame df = hc.table(tableName);
df.write().orc(outputHdfsFile);
Using JDBC:
SparkConf conf = new SparkConf(true).setMaster("yarn-cluster").setAppName("DCA_HIVE_HDFS");
SparkContext sc = new SparkContext(conf);
SQLContext sqlContext = new SQLContext(sc);
try {
Class.forName(driverName);
} catch (ClassNotFoundException e) {
e.printStackTrace();
}
Properties props = new Properties();
props.setProperty("user", userName);
props.setProperty("password", password);
props.setProperty("driver", driverName);
DataFrame df = sqlContext.read().jdbc(connectionUri, tableName, props);
df.write().orc(outputHdfsFile);
