Exception handling and error propagation in Spark 2.2 with Java:
1. Throw the exception and catch it on the driver
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

import java.util.Arrays;
import java.util.List;

public class SparkExceptionHandlingJava {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("SparkExceptionHandling")
                .setMaster("local[*]");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        try {
            List<Integer> data = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
            JavaRDD<Integer> rdd = jsc.parallelize(data);
            JavaRDD<Integer> result = rdd.map(new Function<Integer, Integer>() {
                @Override
                public Integer call(Integer x) throws Exception {
                    if (x == 5) {
                        // Throwing here fails the task; after the configured retries,
                        // Spark aborts the stage and rethrows on the driver
                        throw new RuntimeException("Critical error while processing value 5");
                    }
                    return x * 2;
                }
            });
            // Trigger the computation; any task failure surfaces here
            List<Integer> collectedResults = result.collect();
            for (Integer num : collectedResults) {
                System.out.println("Result: " + num);
            }
        } catch (Exception e) {
            System.err.println("Exception caught on the driver: " + e.getMessage());
            // Print the stack trace
            e.printStackTrace();
            // Close the context before exiting: System.exit() skips finally blocks
            jsc.close();
            System.exit(1);
        } finally {
            jsc.close();
        }
    }
}
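Note that the exception does not reach the driver as-is: Spark rethrows it wrapped in an org.apache.spark.SparkException whose message starts with "Job aborted due to stage failure". A minimal sketch of a helper for digging out the original task-side error (the class and method names here are illustrative, not part of any Spark API):

// Illustrative helper: walk the cause chain of the exception caught on the
// driver to reach the exception originally thrown inside the task.
public final class RootCauseSketch {
    private RootCauseSketch() {}

    public static Throwable rootCause(Throwable t) {
        Throwable current = t;
        while (current.getCause() != null && current.getCause() != current) {
            current = current.getCause();
        }
        return current;
    }
}

Inside the catch block above, RootCauseSketch.rootCause(e).getMessage() typically recovers the original "Critical error while processing value 5" message instead of the stage-failure summary, at least in local mode where the cause chain is preserved.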
2. Return a wrapper object instead (the safer approach)
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class SparkSafeExceptionHandlingJava {
    // Result wrapper; must be Serializable because it travels from executors to the driver
    static class ResultWrapper implements Serializable {
        private final boolean success;
        private final int value;
        private final String errorMessage;

        public ResultWrapper(boolean success, int value, String errorMessage) {
            this.success = success;
            this.value = value;
            this.errorMessage = errorMessage;
        }

        public boolean isSuccess() { return success; }
        public int getValue() { return value; }
        public String getErrorMessage() { return errorMessage; }
    }

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("SparkSafeExceptionHandling")
                .setMaster("local[*]");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        List<Integer> data = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
        JavaRDD<Integer> rdd = jsc.parallelize(data);
        // Wrap every element's outcome in a ResultWrapper instead of letting the task fail
        JavaRDD<ResultWrapper> resultRdd = rdd.map(new Function<Integer, ResultWrapper>() {
            @Override
            public ResultWrapper call(Integer x) {
                try {
                    if (x == 5) {
                        throw new RuntimeException("Critical error while processing value 5");
                    }
                    return new ResultWrapper(true, x * 2, null);
                } catch (Exception e) {
                    return new ResultWrapper(false, 0, "Error processing value " + x + ": " + e.getMessage());
                }
            }
        });
        // Collect the wrapped results
        List<ResultWrapper> results = resultRdd.collect();
        // Separate successes from failures
        List<Integer> successfulResults = new ArrayList<>();
        List<String> errorMessages = new ArrayList<>();
        for (ResultWrapper wrapper : results) {
            if (wrapper.isSuccess()) {
                successfulResults.add(wrapper.getValue());
            } else {
                errorMessages.add(wrapper.getErrorMessage());
            }
        }
        // Check for errors and decide whether to stop
        if (!errorMessages.isEmpty()) {
            System.err.println("The following errors occurred:");
            for (String error : errorMessages) {
                System.err.println("  - " + error);
            }
            System.err.println("Stopping because of errors");
            jsc.close();
            System.exit(1);
        } else {
            System.out.println("Results:");
            for (Integer value : successfulResults) {
                System.out.println("  " + value);
            }
        }
        jsc.close();
    }
}
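With Java 8 lambdas the same pattern gets shorter, and the success/failure split can happen on the cluster with filter() instead of on the driver. A sketch, reusing rdd and ResultWrapper from the example above:

// Wrap each element's outcome, then split on the cluster rather than the driver
JavaRDD<ResultWrapper> wrapped = rdd.map(x -> {
    try {
        if (x == 5) {
            throw new RuntimeException("Critical error while processing value 5");
        }
        return new ResultWrapper(true, x * 2, null);
    } catch (Exception e) {
        return new ResultWrapper(false, 0, "Error processing value " + x + ": " + e.getMessage());
    }
});
wrapped.cache(); // traversed twice below, so avoid recomputing the map

List<Integer> successes = wrapped.filter(ResultWrapper::isSuccess)
        .map(ResultWrapper::getValue)
        .collect();
List<String> failures = wrapped.filter(w -> !w.isSuccess())
        .map(ResultWrapper::getErrorMessage)
        .collect();

This brings only the data you actually need back to the driver, which matters once the wrapped RDD no longer fits in driver memory.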
3. Record errors with an accumulator
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.util.LongAccumulator;

import java.util.Arrays;
import java.util.List;

public class SparkAccumulatorExceptionHandlingJava {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("SparkAccumulatorExceptionHandling")
                .setMaster("local[*]");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        // Create an error counter on the driver; executors add to it, the driver reads it
        LongAccumulator errorCounter = jsc.sc().longAccumulator("errorCounter");
        List<Integer> data = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
        JavaRDD<Integer> rdd = jsc.parallelize(data);
        JavaRDD<Integer> resultRdd = rdd.map(new Function<Integer, Integer>() {
            @Override
            public Integer call(Integer x) throws Exception {
                try {
                    if (x == 5) {
                        throw new RuntimeException("Error while processing value 5");
                    }
                    return x * 2;
                } catch (Exception e) {
                    // Record the error but keep processing; increment only here,
                    // so each failure is counted exactly once
                    errorCounter.add(1);
                    return -1; // sentinel value marking a failed element
                }
            }
        });
        // Force evaluation; accumulator values are only reliable after an action
        List<Integer> results = resultRdd.collect();
        // Check the error counter on the driver
        if (errorCounter.value() > 0) {
            System.err.println("Encountered " + errorCounter.value() + " error(s)");
            // Filter out the sentinel values
            System.out.println("Valid results:");
            for (Integer result : results) {
                if (result != -1) {
                    System.out.println("  " + result);
                }
            }
            // Optionally stop the program here:
            // jsc.close();
            // System.exit(1);
        } else {
            System.out.println("All tasks completed successfully:");
            for (Integer result : results) {
                System.out.println("  " + result);
            }
        }
        jsc.close();
    }
}
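A LongAccumulator only tells you how many elements failed. Since Spark 2.0 there is also CollectionAccumulator, which can carry the error messages themselves. A sketch, assuming the same jsc and rdd setup as above (one caveat applies to both accumulator types: updates made inside transformations can be over-counted when Spark retries a failed or speculative task, so treat the values as diagnostics rather than exact bookkeeping):

// Collect the error messages, not just a count
// (requires: import org.apache.spark.util.CollectionAccumulator;)
CollectionAccumulator<String> errors = jsc.sc().collectionAccumulator("errors");
JavaRDD<Integer> safeRdd = rdd.map(x -> {
    try {
        if (x == 5) {
            throw new RuntimeException("Error while processing value 5");
        }
        return x * 2;
    } catch (Exception e) {
        errors.add("value " + x + ": " + e.getMessage());
        return -1; // sentinel marking a failed element
    }
});
safeRdd.collect(); // run an action first: accumulators are only populated by actions
if (!errors.value().isEmpty()) {
    System.err.println("Errors: " + errors.value());
}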
4. Custom exception handling: filter out failures with flatMap
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

public class SparkCustomExceptionHandlingJava {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("SparkCustomExceptionHandling")
                .setMaster("local[*]");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        List<Integer> data = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
        JavaRDD<Integer> rdd = jsc.parallelize(data);
        // flatMap lets each element produce zero or more outputs,
        // so a failed element can simply produce nothing
        JavaRDD<Integer> resultRdd = rdd.flatMap(new FlatMapFunction<Integer, Integer>() {
            @Override
            public Iterator<Integer> call(Integer x) throws Exception {
                List<Integer> results = new ArrayList<>();
                try {
                    if (x == 5) {
                        throw new RuntimeException("Value 5 is not allowed");
                    }
                    results.add(x * 2);
                } catch (Exception e) {
                    // Log on the executor but do not rethrow;
                    // returning an empty list drops this element
                    System.err.println("Error processing value " + x + ": " + e.getMessage());
                }
                return results.iterator();
            }
        });
        List<Integer> results = resultRdd.collect();
        System.out.println("Successfully processed results:");
        for (Integer result : results) {
            System.out.println("  " + result);
        }
        jsc.close();
    }
}
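In Spark 2.x, FlatMapFunction.call returns an Iterator, so the same logic also fits in a Java 8 lambda. A sketch, assuming the same jsc and rdd setup:

// flatMap with a lambda: a failed element simply yields an empty iterator
JavaRDD<Integer> resultRdd = rdd.flatMap(x -> {
    List<Integer> out = new ArrayList<>();
    try {
        if (x == 5) {
            throw new RuntimeException("Value 5 is not allowed");
        }
        out.add(x * 2);
    } catch (Exception e) {
        System.err.println("Error processing value " + x + ": " + e.getMessage());
    }
    return out.iterator();
});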
5. Advanced error handling: monitor tasks with a listener
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.scheduler.*;

import java.util.Arrays;
import java.util.concurrent.atomic.AtomicBoolean;

public class SparkListenerExceptionHandlingJava {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("SparkListenerExceptionHandling")
                .setMaster("local[*]");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        // Atomic flag shared between the listener thread and the main thread
        AtomicBoolean hasCriticalError = new AtomicBoolean(false);
        // Register a Spark listener; note that listener events are delivered
        // asynchronously on a separate thread
        jsc.sc().addSparkListener(new SparkListener() {
            @Override
            public void onTaskEnd(SparkListenerTaskEnd taskEnd) {
                if (!taskEnd.reason().toString().contains("Success")) {
                    System.err.println("Task failed: " + taskEnd.taskInfo().taskId());
                    System.err.println("Failure reason: " + taskEnd.reason());
                    // Flag failures whose message marks them as critical
                    if (taskEnd.reason().toString().contains("Critical")) {
                        hasCriticalError.set(true);
                    }
                }
            }

            @Override
            public void onJobEnd(SparkListenerJobEnd jobEnd) {
                if (jobEnd.jobResult() instanceof JobFailed) {
                    JobFailed failed = (JobFailed) jobEnd.jobResult();
                    System.err.println("Job failed: " + failed.exception().getMessage());
                    if (hasCriticalError.get()) {
                        System.err.println("Critical error detected, preparing to stop");
                    }
                }
            }
        });
        // Build the RDD and run the job
        try {
            jsc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
                    .map(x -> {
                        if (x == 5) {
                            throw new RuntimeException("Critical error: encountered value 5");
                        }
                        return x * 2;
                    })
                    .collect()
                    .forEach(System.out::println);
        } catch (Exception e) {
            System.err.println("Caught exception: " + e.getMessage());
        } finally {
            // Check whether the listener flagged a critical error
            if (hasCriticalError.get()) {
                System.err.println("Stopping because of a critical error");
                jsc.close();
                System.exit(1);
            } else {
                jsc.close();
            }
        }
    }
}
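Matching on reason().toString() works but is brittle. Spark exposes the failure types directly: successful tasks report the Success singleton, failed ones a subtype of org.apache.spark.TaskFailedReason. A sketch of a type-based listener (TypedFailureListener is an illustrative name, not a Spark class):

import org.apache.spark.TaskFailedReason;
import org.apache.spark.scheduler.SparkListener;
import org.apache.spark.scheduler.SparkListenerTaskEnd;

// Detect task failures by type instead of by string matching
public class TypedFailureListener extends SparkListener {
    @Override
    public void onTaskEnd(SparkListenerTaskEnd taskEnd) {
        if (taskEnd.reason() instanceof TaskFailedReason) {
            TaskFailedReason reason = (TaskFailedReason) taskEnd.reason();
            // toErrorString() renders a human-readable failure description
            System.err.println("Task " + taskEnd.taskInfo().taskId()
                    + " failed: " + reason.toErrorString());
        }
    }
}

Register it the same way as above: jsc.sc().addSparkListener(new TypedFailureListener());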
6. Wrap the whole computation in try-catch
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

import java.util.Arrays;

public class SparkGlobalExceptionHandlingJava {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("SparkGlobalExceptionHandling")
                .setMaster("local[*]");
        // Install a default handler for exceptions escaping other driver threads;
        // it does not apply to executor JVMs, and the main thread is already
        // covered by the try/catch below
        Thread.setDefaultUncaughtExceptionHandler(new Thread.UncaughtExceptionHandler() {
            @Override
            public void uncaughtException(Thread t, Throwable e) {
                System.err.println("Thread " + t.getName() + " threw an uncaught exception: " + e.getMessage());
                e.printStackTrace();
                System.exit(1);
            }
        });
        JavaSparkContext jsc = null;
        try {
            jsc = new JavaSparkContext(conf);
            jsc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
                    .map(x -> {
                        if (x == 5) {
                            throw new RuntimeException("Critical error: encountered value 5");
                        }
                        return x * 2;
                    })
                    .collect()
                    .forEach(System.out::println);
        } catch (Exception e) {
            System.err.println("Global catch: " + e.getMessage());
            e.printStackTrace();
            // Close before exiting: System.exit() skips the finally block
            if (jsc != null) {
                jsc.close();
            }
            System.exit(1);
        } finally {
            if (jsc != null) {
                jsc.close();
            }
        }
    }
}
Key takeaways:
- In the Spark 2.2 Java API, never try to stop the program from inside a map operator
- Throwing exceptions: throw inside map, catch on the driver (method 1)
- Wrapper objects: return result objects that carry success/failure status (method 2)
- Accumulators: count errors on the executors, check the count on the driver (method 3)
- flatMap filtering: drop failed elements without failing the job (method 4)
- Listener monitoring: observe task and job failures with a SparkListener (method 5)
- Global exception handling: install a default uncaught-exception handler (method 6)

The recommended approaches are method 2 (wrapper objects) and method 4 (filtering errors via flatMap): both handle failures without killing the whole program, keeping the job robust.