I'm reading data coming from a Kafka (100.000 line per second) using Structured Spark Streaming, and i'm trying to insert all the data in HBase.
I'm in Cloudera Hadoop 2.6 and I'm using Spark 2.3
I tried something like I've seen here.
eventhubs.writeStream
.foreach(new MyHBaseWriter[Row])
.option("checkpointLocation", checkpointDir)
.start()
.awaitTermination()
MyHBaseWriter looks like this :
class AtomeHBaseWriter[RECORD] extends HBaseForeachWriter[Row] {
override def toPut(record: Row): Put = {
override val tableName: String = "hbase-table-name"
override def toPut(record: Row): Put = {
// Get Json
val data = JSON.parseFull(record.getString(0)).asInstanceOf[Some[Map[String, Object]]]
val key = data.getOrElse(Map())("key")+ ""
val val = data.getOrElse(Map())("val")+ ""
val p = new Put(Bytes.toBytes(key))
//Add columns ...
p.addColumn(Bytes.toBytes(columnFamaliyName),Bytes.toBytes(columnName), Bytes.toBytes(val))
p
}
}
And the HBaseForeachWriter class looks like this :
trait HBaseForeachWriter[RECORD] extends ForeachWriter[RECORD] {
val tableName: String
def pool: Option[ExecutorService] = None
def user: Option[User] = None
private var hTable: Table = _
private var connection: Connection = _
override def open(partitionId: Long, version: Long): Boolean = {
connection = createConnection()
hTable = getHTable(connection)
true
}
def createConnection(): Connection = {
// I create HBase Connection Here
}
def getHTable(connection: Connection): Table = {
connection.getTable(TableName.valueOf(Variables.getTableName()))
}
override def process(record: RECORD): Unit = {
val put = toPut(record)
hTable.put(put)
}
override def close(errorOrNull: Throwable): Unit = {
hTable.close()
connection.close()
}
def toPut(record: RECORD): Put
}
So here I'm doing a put line by line, even if I allow 20 executors and 4 cores for each, I don't have the data inserted immediatly in HBase. So what I need to do is a bulk load ut I'm struggled because all what I find in the internet is to realize it with RDDs and Map/Reduce.