I want to load the whole hive table into spark memory by hive jdbc connection. And already add hive-site.xml, hdfs-site.xml in my project. The spark already connected hive, because of getting the column name(eg.role_id) successfully. But spark seems to load the column name as data, and throws an exception. Here is my code:
val df = spark.read.format("jdbc")
.option("driver", CommonUtils.HIVE_DIRVER)
.option("url", CommonUtils.HIVE_URL)
.option("dbtable", "datasource_test.t_leave_map_base")
.option("header", "true")
.option("user", CommonUtils.HIVE_PASSWORD)
.option("password", CommonUtils.HIVE_PASSWORD)
.option("fetchsize", "20")
.load()
df.registerTempTable("t_leave_map_base")
df.persist(StorageLevel.MEMORY_ONLY)
df.show()
df
And get error:
java.lang.NumberFormatException: For input string: "t_leave_map_base.role_id" at java.lang.NumberFormatException.forInputString(NumberFormatException.java:65) ~[na:1.8.0_25] at java.lang.Long.parseLong(Long.java:589) ~[na:1.8.0_25] at java.lang.Long.valueOf(Long.java:803) ~[na:1.8.0_25] at org.apache.hive.jdbc.HiveBaseResultSet.getLong(HiveBaseResultSet.java:366) ~[hive-jdbc-1.1.0-cdh5.12.0.jar:1.1.0-cdh5.12.0] at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$org$apache$spark$sql$execution$datasources$jdbc$JdbcUtils$$makeGetter$8.apply(JdbcUtils.scala:409) ~[spark-sql_2.11-2.2.0.cloudera2.jar:2.2.0.cloudera2] at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$org$apache$spark$sql$execution$datasources$jdbc$JdbcUtils$$makeGetter$8.apply(JdbcUtils.scala:408) ~[spark-sql_2.11-2.2.0.cloudera2.jar:2.2.0.cloudera2] at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anon$1.getNext(JdbcUtils.scala:330) ~[spark-sql_2.11-2.2.0.cloudera2.jar:2.2.0.cloudera2] at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anon$1.getNext(JdbcUtils.scala:312) ~[spark-sql_2.11-2.2.0.cloudera2.jar:2.2.0.cloudera2] at org.apache.spark.util.NextIterator.hasNext(NextIterator.scala:73) ~[spark-core_2.11-2.2.0.cloudera2.jar:2.2.0.cloudera2] at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37) ~[spark-core_2.11-2.2.0.cloudera2.jar:2.2.0.cloudera2] at org.apache.spark.util.CompletionIterator.hasNext(CompletionIterator.scala:32) ~[spark-core_2.11-2.2.0.cloudera2.jar:2.2.0.cloudera2] at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source) ~[na:na] at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) ~[spark-sql_2.11-2.2.0.cloudera2.jar:2.2.0.cloudera2] at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:395) ~[spark-sql_2.11-2.2.0.cloudera2.jar:2.2.0.cloudera2] at org.apache.spark.sql.execution.columnar.InMemoryRelation$$anonfun$1$$anon$1.hasNext(InMemoryRelation.scala:133) ~[spark-sql_2.11-2.2.0.cloudera2.jar:2.2.0.cloudera2] at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:215) ~[spark-core_2.11-2.2.0.cloudera2.jar:2.2.0.cloudera2] at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1038) ~[spark-core_2.11-2.2.0.cloudera2.jar:2.2.0.cloudera2] at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1029) ~[spark-core_2.11-2.2.0.cloudera2.jar:2.2.0.cloudera2] at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:969) ~[spark-core_2.11-2.2.0.cloudera2.jar:2.2.0.cloudera2] at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1029) ~[spark-core_2.11-2.2.0.cloudera2.jar:2.2.0.cloudera2] at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:760) ~[spark-core_2.11-2.2.0.cloudera2.jar:2.2.0.cloudera2] at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:334) ~[spark-core_2.11-2.2.0.cloudera2.jar:2.2.0.cloudera2] at org.apache.spark.rdd.RDD.iterator(RDD.scala:285) ~[spark-core_2.11-2.2.0.cloudera2.jar:2.2.0.cloudera2] at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) ~[spark-core_2.11-2.2.0.cloudera2.jar:2.2.0.cloudera2] at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323) ~[spark-core_2.11-2.2.0.cloudera2.jar:2.2.0.cloudera2] at org.apache.spark.rdd.RDD.iterator(RDD.scala:287) ~[spark-core_2.11-2.2.0.cloudera2.jar:2.2.0.cloudera2] at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) ~[spark-core_2.11-2.2.0.cloudera2.jar:2.2.0.cloudera2] at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323) ~[spark-core_2.11-2.2.0.cloudera2.jar:2.2.0.cloudera2] at org.apache.spark.rdd.RDD.iterator(RDD.scala:287) ~[spark-core_2.11-2.2.0.cloudera2.jar:2.2.0.cloudera2] at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87) ~[spark-core_2.11-2.2.0.cloudera2.jar:2.2.0.cloudera2] at org.apache.spark.scheduler.Task.run(Task.scala:108) ~[spark-core_2.11-2.2.0.cloudera2.jar:2.2.0.cloudera2] at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338) ~[spark-core_2.11-2.2.0.cloudera2.jar:2.2.0.cloudera2] at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) ~[na:1.8.0_25] at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) ~[na:1.8.0_25] at java.lang.Thread.run(Thread.java:745) [na:1.8.0_25]
Debug the project and all the fetchedRows are columns' name:
I wonder whether spark sql does support load hive table in this way?