Talend Spark Parquet error

Symptoms:

 

[WARN ]: org.apache.spark.scheduler.TaskSetManager – Lost task 0.0 in stage 6.0 (TID 14, clouderadXXXXX): java.lang.NullPointerException
at parquet.column.values.fallback.FallbackValuesWriter.writeBytes(FallbackValuesWriter.java:161)
at parquet.column.impl.ColumnWriterV1.write(ColumnWriterV1.java:204)
at parquet.io.MessageColumnIO$MessageColumnIORecordConsumer.addBinary(MessageColumnIO.java:376)
at parquet.example.data.simple.BinaryValue.writeValue(BinaryValue.java:45)
at parquet.example.data.simple.SimpleGroup.writeValue(SimpleGroup.java:229)
at parquet.example.data.GroupWriter.writeGroup(GroupWriter.java:51)
at parquet.example.data.GroupWriter.write(GroupWriter.java:37)
at parquet.hadoop.example.GroupWriteSupport.write(GroupWriteSupport.java:74)
at parquet.hadoop.example.GroupWriteSupport.write(GroupWriteSupport.java:36)
at parquet.hadoop.InternalParquetRecordWriter.write(InternalParquetRecordWriter.java:116)
at parquet.hadoop.ParquetRecordWriter.write(ParquetRecordWriter.java:123)
at parquet.hadoop.mapred.DeprecatedParquetOutputFormat$RecordWriterWrapper.write(DeprecatedParquetOutputFormat.java:107)
at parquet.hadoop.mapred.DeprecatedParquetOutputFormat$RecordWriterWrapper.write(DeprecatedParquetOutputFormat.java:75)
at org.apache.spark.SparkHadoopWriter.write(SparkHadoopWriter.scala:96)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$13.apply(PairRDDFunctions.scala:1073)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$13.apply(PairRDDFunctions.scala:1059)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)
at org.apache.spark.scheduler.Task.run(Task.scala:64)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:203)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
[WARN ]: org.apache.spark.scheduler.TaskSetManager – Lost task 0.1 in stage 6.0 (TID 16, clouderadaXX.XX.com): org.apache.hadoop.fs.FileAlreadyExistsException: /XXX/FILE/workoutput/part-00000-m-00000.snappy.parquet for client 10.XX1.X.XX already exists

 

Solution

The NullPointerException is thrown inside `FallbackValuesWriter.writeBytes` while Parquet is writing a Binary (string) column value, which typically means a null value reached the writer: the Parquet example `Group` write path (`SimpleGroup` / `GroupWriteSupport`, visible in the stack trace) does not tolerate null field values. Check the job's input data and the mapping for string columns that can be null, and replace or filter out null values (e.g. substitute an empty string or a default) before the row is handed to the Parquet output format.

The second warning (`FileAlreadyExistsException` on `part-00000-m-00000.snappy.parquet`) is a follow-on symptom, not a separate problem: Spark retried the failed task (attempt 0.1 after 0.0), and the retry found the partial output file left behind by the first attempt. Once the NullPointerException is fixed, delete the stale output directory and rerun the job.

 

Leave a Reply