Below is a full example using the Spark HBase connector (shc) from Hortonworks, which is available from Maven.
This example shows how to:
- check whether an HBase table exists
- create the HBase table if it does not exist
- insert a DataFrame into the HBase table
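The dependency can be pulled in roughly like this (a sketch for sbt; the repository URL and the version string are only examples, check the Hortonworks repository for the build matching your Spark and Scala versions, and add the HBase client version used on your cluster):

resolvers += "Hortonworks Repository" at "http://repo.hortonworks.com/content/groups/public/"
libraryDependencies += "com.hortonworks" % "shc-core" % "1.1.1-2.1-s_2.11"   // example version only
libraryDependencies += "org.apache.hbase" % "hbase-client" % "2.1.0"         // example version only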
import org.apache.hadoop.hbase.client.{ColumnFamilyDescriptorBuilder, ConnectionFactory, TableDescriptorBuilder}
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.hbase.HBaseTableCatalog

object Main extends App {

  case class Employee(key: String, fName: String, lName: String, mName: String,
                      addressLine: String, city: String, state: String, zipCode: String)

  // prerequisite: the table 'employee' with column families 'person' and 'address'
  // must exist before the write; it is created below if it is missing
  val tableNameString = "default:employee"
  val colFamilyPString = "person"
  val colFamilyAString = "address"
  val tableName = TableName.valueOf(tableNameString)
  val colFamilyP = colFamilyPString.getBytes
  val colFamilyA = colFamilyAString.getBytes

  val hBaseConf = HBaseConfiguration.create()
  val connection = ConnectionFactory.createConnection(hBaseConf)
  val admin = connection.getAdmin()

  println("Check if table 'employee' exists:")
  val tableExistsCheck: Boolean = admin.tableExists(tableName)
  println(s"Table ${tableName.toString} exists? $tableExistsCheck")

  if (!tableExistsCheck) {
    println("Create table 'employee' with column families 'person' and 'address'")
    val colFamilyBuild1 = ColumnFamilyDescriptorBuilder.newBuilder(colFamilyP).build()
    val colFamilyBuild2 = ColumnFamilyDescriptorBuilder.newBuilder(colFamilyA).build()
    val tableDescriptorBuild = TableDescriptorBuilder.newBuilder(tableName)
      .setColumnFamily(colFamilyBuild1)
      .setColumnFamily(colFamilyBuild2)
      .build()
    admin.createTable(tableDescriptorBuild)
  }

  // define the catalog (schema mapping) for the DataFrame that will be written to HBase
  def catalog =
    s"""{
       |"table":{"namespace":"default","name":"employee"},
       |"rowkey":"key",
       |"columns":{
       |"key":{"cf":"rowkey","col":"key","type":"string"},
       |"fName":{"cf":"person","col":"firstName","type":"string"},
       |"lName":{"cf":"person","col":"lastName","type":"string"},
       |"mName":{"cf":"person","col":"middleName","type":"string"},
       |"addressLine":{"cf":"address","col":"addressLine","type":"string"},
       |"city":{"cf":"address","col":"city","type":"string"},
       |"state":{"cf":"address","col":"state","type":"string"},
       |"zipCode":{"cf":"address","col":"zipCode","type":"string"}
       |}
       |}""".stripMargin

  // define some test data
  val data = Seq(
    Employee("1", "Horst", "Hans", "A", "12main", "NYC", "NY", "123"),
    Employee("2", "Joe", "Bill", "B", "1337ave", "LA", "CA", "456"),
    Employee("3", "Mohammed", "Mohammed", "C", "1Apple", "SanFran", "CA", "678")
  )

  // create SparkSession
  val spark: SparkSession = SparkSession.builder()
    .master("local[*]")
    .appName("HBaseConnector")
    .getOrCreate()

  // turn the test data into a DataFrame
  import spark.implicits._
  val df = spark.sparkContext.parallelize(data).toDF

  // write the DataFrame into HBase
  df.write
    .options(Map(HBaseTableCatalog.tableCatalog -> catalog, HBaseTableCatalog.newTable -> "3")) // create 3 regions
    .format("org.apache.spark.sql.execution.datasources.hbase")
    .save()
}
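To verify the write, the same catalog can be reused to read the table back into a DataFrame. This is just a sketch following the shc read API, reusing the spark session and catalog defined above:

val readDf = spark.read
  .options(Map(HBaseTableCatalog.tableCatalog -> catalog))
  .format("org.apache.spark.sql.execution.datasources.hbase")
  .load()
readDf.show()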
This worked for me as long as the relevant site XMLs ("core-site.xml", "hbase-site.xml", "hdfs-site.xml") were available on the classpath in my resources directory.
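If the files cannot be placed on the classpath, one alternative (a sketch only; the paths below are placeholders for wherever your cluster keeps its configuration) is to add them to the Hadoop configuration explicitly before creating the connection:

import org.apache.hadoop.fs.Path

val hBaseConf = HBaseConfiguration.create()
// placeholder paths; point these at your actual config files
hBaseConf.addResource(new Path("/etc/hbase/conf/hbase-site.xml"))
hBaseConf.addResource(new Path("/etc/hadoop/conf/core-site.xml"))
val connection = ConnectionFactory.createConnection(hBaseConf)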
val data = (0 to 255).map { i => HBaseRecord(i, "extra") }
How can I insert each record of my own DataFrame instead of the generated rows 0 to 255? – Zied Hermi