I am inserting to HBase using Spark but it's slow. For 60,000 records it takes 2-3mins. I have about 10 million records to save.
object WriteToHbase extends Serializable {
def main(args: Array[String]) {
val csvRows: RDD[Array[String] = ...
val dateFormatter = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss")
val usersRDD = csvRows.map(row => {
new UserTable(row(0), row(1), row(2), row(9), row(10), row(11))
})
processUsers(sc: SparkContext, usersRDD, dateFormatter)
})
}
def processUsers(sc: SparkContext, usersRDD: RDD[UserTable], dateFormatter: DateTimeFormatter): Unit = {
usersRDD.foreachPartition(part => {
val conf = HBaseConfiguration.create()
val table = new HTable(conf, tablename)
part.foreach(userRow => {
val id = userRow.id
val name = userRow.name
val date1 = dateFormatter.parseDateTime(userRow.date1)
val hRow = new Put(Bytes.toBytes(id))
hRow.add(cf, q, Bytes.toBytes(date1))
hRow.add(cf, q, Bytes.toBytes(name))
...
table.put(hRow)
})
table.flushCommits()
table.close()
})
}
I am using this in spark-submit:
--num-executors 2 --driver-memory 2G --executor-memory 2G --executor-cores 2