Spark submit failing with Hive
I am trying to get a Spark 1.1.0 program written in Scala to work, but I'm having a hard time with it. I have a Hive query that is simply:
    select json, score from data
When I run the following through spark-shell, it works (I need MYSQL_CONN on the driver class path because I'm using Hive with a MySQL metadata store):
    bin/spark-shell --master $SPARK_URL --driver-class-path $MYSQL_CONN

    import org.apache.spark.sql.hive.HiveContext
    val sqlContext = new HiveContext(sc)
    sqlContext.sql("select json from data").map(t => t.getString(0)).take(10).foreach(println)
I get the ten lines of JSON that I want. However, when I run it with spark-submit as follows, I have a problem:
    bin/spark-submit --master $SPARK_URL --class spark.Main --driver-class-path $MYSQL_CONN target/spark-testing-1.0-SNAPSHOT.jar
Here is the whole Spark program:
    package spark

    import org.apache.spark.sql.hive.HiveContext
    import org.apache.spark.{SparkContext, SparkConf}

    object Main {
      def main(args: Array[String]) {
        val sc = new SparkContext(new SparkConf().setAppName("Gathering Data"))
        val sqlContext = new HiveContext(sc)
        sqlContext.sql("select json from data").map(t => t.getString(0)).take(10).foreach(println)
      }
    }
And here is the resulting stack trace:
    14/12/01 21:30:04 WARN TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0, match1hd17.dc1): java.lang.ClassNotFoundException: spark.Main$$anonfun$main$1
        at java.net.URLClassLoader$1.run(URLClassLoader.java:200)
        at java.security.AccessController.doPrivileged(Native Method)
        at java.net.URLClassLoader.findClass(URLClassLoader.java:188)
        at java.lang.ClassLoader.loadClass(ClassLoader.java:307)
        at java.lang.ClassLoader.loadClass(ClassLoader.java:252)
        at java.lang.ClassLoader.loadClassInternal(ClassLoader.java:320)
        at java.lang.Class.forName0(Native Method)
        at java.lang.Class.forName(Class.java:247)
        at org.apache.spark.serializer.JavaDeserializationStream$$anon$1.resolveClass(JavaSerializer.scala:59)
        at java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:1575)
        at java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1496)
        at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1732)
        at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1329)
        at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:1947)
        at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1871)
        at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1753)
        at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1329)
        at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:1947)
        at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1871)
        at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1753)
        at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1329)
        at java.io.ObjectInputStream.readObject(ObjectInputStream.java:351)
        at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:62)
        at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:87)
        at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:57)
        at org.apache.spark.scheduler.Task.run(Task.scala:54)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:177)
        at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:886)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:908)
        at java.lang.Thread.run(Thread.java:619)
    14/12/01 21:30:10 ERROR TaskSetManager: Task 0 in stage 0.0 failed 4 times; aborting job
    Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 4 times, most recent failure: Lost task 0.3 in stage 0.0 (TID 3, match1hd12.dc1m): java.lang.ClassNotFoundException: spark.Main$$anonfun$main$1
        at java.net.URLClassLoader$1.run(URLClassLoader.java:200)
        at java.security.AccessController.doPrivileged(Native Method)
        at java.net.URLClassLoader.findClass(URLClassLoader.java:188)
        at java.lang.ClassLoader.loadClass(ClassLoader.java:307)
        at java.lang.ClassLoader.loadClass(ClassLoader.java:252)
        at java.lang.ClassLoader.loadClassInternal(ClassLoader.java:320)
        at java.lang.Class.forName0(Native Method)
        at java.lang.Class.forName(Class.java:247)
        at org.apache.spark.serializer.JavaDeserializationStream$$anon$1.resolveClass(JavaSerializer.scala:59)
        at java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:1575)
        at java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1496)
        at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1732)
        at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1329)
        at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:1947)
        at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1871)
        at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1753)
        at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1329)
        at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:1947)
        at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1871)
        at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1753)
        at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1329)
        at java.io.ObjectInputStream.readObject(ObjectInputStream.java:351)
        at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:62)
        at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:87)
        at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:57)
        at org.apache.spark.scheduler.Task.run(Task.scala:54)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:177)
        at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:886)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:908)
        at java.lang.Thread.run(Thread.java:619)
    Driver stacktrace:
        at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1185)
        at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1174)
        at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1173)
        at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
        at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
        at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1173)
        at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:688)
        at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:688)
        at scala.Option.foreach(Option.scala:236)
        at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:688)
        at org.apache.spark.scheduler.DAGSchedulerEventProcessActor$$anonfun$receive$2.applyOrElse(DAGScheduler.scala:1391)
        at akka.actor.ActorCell.receiveMessage(ActorCell.scala:498)
        at akka.actor.ActorCell.invoke(ActorCell.scala:456)
        at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:237)
        at akka.dispatch.Mailbox.run(Mailbox.scala:219)
        at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:386)
        at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
        at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
        at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
        at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
I have spent hours on this already and have no idea why it works from spark-shell. I looked at the stderr output on the individual nodes, and they have the same cryptic error message. If anyone can shed some light on why this works with spark-shell but not spark-submit, that would be awesome.
Thanks
update:
I've been playing around, and the following program works fine.
    package spark

    import org.apache.spark.sql.hive.HiveContext
    import org.apache.spark.{SparkContext, SparkConf}

    object Main {
      def main(args: Array[String]) {
        val sc = new SparkContext(new SparkConf().setAppName("Gathering Data"))
        val sqlContext = new HiveContext(sc)
        sqlContext.sql("select json from data").take(10).map(t => t.getString(0)).foreach(println)
      }
    }
Obviously this won't work for a large amount of data, but it shows that the problem appears in the SchemaRDD.map() function.
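For what it's worth, here is a minimal sketch (my own reading of the behaviour, assuming the same sqlContext and data table as above) of why the two versions differ: SchemaRDD.map serializes the closure and runs it on the executors, which must then be able to load spark.Main$$anonfun$main$1, whereas take(10) pulls the rows back to the driver first, so the subsequent map is an ordinary local Array.map.

    // Fails under spark-submit: the map closure is shipped to the executors,
    // so each executor must be able to load spark.Main$$anonfun$main$1.
    sqlContext.sql("select json from data")
      .map(t => t.getString(0))   // runs on the executors
      .take(10)
      .foreach(println)

    // Works: take(10) collects Rows to the driver first, so the map below is
    // a plain Scala Array.map that never leaves the driver JVM.
    sqlContext.sql("select json from data")
      .take(10)                   // collects to the driver
      .map(t => t.getString(0))
      .foreach(println)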
It seems there is a problem with the Spark context initialization. Please try the code below:

    val sparkConf = new SparkConf().setAppName("Gathering Data")
    val sc = new SparkContext(sparkConf)
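Here is a minimal sketch of that suggestion in the context of the original program (the setJars call is my own assumption, added only to make explicit that the application jar must be reachable by the executors so they can load the closure class):

    package spark

    import org.apache.spark.sql.hive.HiveContext
    import org.apache.spark.{SparkConf, SparkContext}

    object Main {
      def main(args: Array[String]) {
        // Build the SparkConf first, then pass it to the SparkContext.
        val sparkConf = new SparkConf()
          .setAppName("Gathering Data")
          // Assumption: list the application jar explicitly so executors can
          // load closure classes such as spark.Main$$anonfun$main$1.
          .setJars(Seq("target/spark-testing-1.0-SNAPSHOT.jar"))
        val sc = new SparkContext(sparkConf)
        val sqlContext = new HiveContext(sc)

        sqlContext.sql("select json from data")
          .map(t => t.getString(0))
          .take(10)
          .foreach(println)
      }
    }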