MySQL(149)如何进行数据清洗?

数据清洗在数据处理和分析过程中至关重要,确保数据质量和一致性。以下是一个详细的指南,展示如何使用Java进行数据清洗,包括处理缺失值、重复值、异常值、数据类型转换以及标准化等步骤。

一、准备工作

确保安装有Java开发环境(JDK)和Maven或Gradle等依赖管理工具。我们将使用Apache Commons CSV库来处理CSV文件,并使用Java标准库进行数据清洗操作。

二、加载数据

首先,我们加载数据。假设我们有一个CSV文件(data.csv)作为数据源。

1. 添加依赖

在Maven项目的pom.xml文件中添加Apache Commons CSV库的依赖:

xml 复制代码
<!-- Apache Commons CSV: supplies the CSVFormat, CSVParser, CSVPrinter and CSVRecord classes used by the Java examples in this guide. -->
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-csv</artifactId>
    <version>1.8</version>
</dependency>

2. 加载CSV文件

java 复制代码
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

import java.io.BufferedReader;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;

/**
 * Loads the rows of a CSV file into memory with Apache Commons CSV.
 */
public class DataLoader {

    /**
     * Reads every data row of the CSV file at {@code filePath}.
     *
     * <p>The first row of the file is used as the header, so the returned records can be
     * queried by column name. The file is decoded as UTF-8.
     *
     * @param filePath path of the CSV file to read
     * @return the records that follow the header row; empty when the file has no data rows
     * @throws IOException if the file cannot be opened or read
     */
    public static List<CSVRecord> loadData(String filePath) throws IOException {
        CSVFormat format = CSVFormat.DEFAULT.withFirstRecordAsHeader();
        // The Path overload of CSVParser.parse also requires a Charset, so go through a Reader
        // instead. try-with-resources closes the reader and the parser even if parsing fails.
        try (BufferedReader reader = Files.newBufferedReader(Paths.get(filePath));
             CSVParser parser = CSVParser.parse(reader, format)) {
            return parser.getRecords();
        }
    }

    /**
     * Demo entry point: loads {@code data.csv} from the working directory and prints its
     * first record, or a notice when the file contains no data rows.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            List<CSVRecord> records = loadData("data.csv");
            System.out.println("Data loaded successfully.");
            // A header-only file yields an empty list; get(0) would throw on it.
            if (records.isEmpty()) {
                System.out.println("No data rows found.");
            } else {
                System.out.println(records.get(0));
            }
        } catch (IOException e) {
            System.err.println("Failed to load data: " + e.getMessage());
        }
    }
}

三、处理缺失值

1. 检查缺失值

java 复制代码
/**
 * Reports every blank field found in the given records.
 *
 * <p>A field counts as missing when it is {@code null} or contains only whitespace. One line
 * is printed to standard output per missing field, so a record with several blank fields is
 * reported several times.
 *
 * @param records the records to scan
 */
public static void checkMissingValues(List<CSVRecord> records) {
    records.forEach(row -> row.forEach(cell -> {
        boolean hasContent = cell != null && !cell.trim().isEmpty();
        if (!hasContent) {
            System.out.println("Missing value found in record: " + row);
        }
    }));
}

2. 填充缺失值

假设需要对数值类型的列使用均值填充缺失值。

java 复制代码
import java.util.HashMap;
import java.util.Map;

/**
 * Mean-imputation helper for a numeric CSV column.
 */
public class MissingValueHandler {

    /**
     * Computes the mean of the non-blank values in {@code columnName} and writes it into the
     * map view of every record whose value in that column is blank.
     *
     * <p>NOTE(review): Commons CSV documents {@code CSVRecord.toMap()} as copying the record
     * into a new map, so the {@code put} below presumably never reaches the record itself and
     * the returned list would be unchanged. Confirm this; if so, the method needs to produce
     * new rows rather than try to edit existing ones.
     *
     * <p>When no value is present the mean is {@code 0.0 / 0}, i.e. {@code NaN}.
     *
     * @param records    the records to process
     * @param columnName header name of the numeric column
     * @return the same {@code records} list that was passed in
     * @throws NumberFormatException if a non-blank value is not a valid double
     */
    public static List<CSVRecord> fillMissingValuesWithMean(List<CSVRecord> records, String columnName) {
        double total = 0;
        int present = 0;

        // First pass: accumulate the values that are actually there.
        for (CSVRecord row : records) {
            String cell = row.get(columnName);
            if (isMissing(cell)) {
                continue;
            }
            total += Double.parseDouble(cell);
            present++;
        }

        double mean = total / present;

        // Second pass: apply the mean wherever the column is blank.
        for (CSVRecord row : records) {
            if (isMissing(row.get(columnName))) {
                row.toMap().put(columnName, String.valueOf(mean));
            }
        }

        return records;
    }

    /** Returns whether the cell is {@code null} or holds only whitespace. */
    private static boolean isMissing(String cell) {
        return cell == null || cell.trim().isEmpty();
    }
}

四、处理重复值

1. 检查重复值

java 复制代码
import java.util.HashSet;
import java.util.Set;

/**
 * Reports duplicate rows in a list of CSV records.
 */
public class DuplicateHandler {

    /**
     * Prints every record whose field values repeat those of an earlier record.
     *
     * <p>Two records are duplicates when they hold the same values in the same column order.
     * The first occurrence is not reported; each later occurrence prints one line to standard
     * output.
     *
     * @param records the records to scan
     */
    public static void checkDuplicates(List<CSVRecord> records) {
        // Key on the field values: CSVRecord does not define value-based equals/hashCode, so a
        // Set<CSVRecord> would only ever match the very same instance and report nothing.
        Set<List<String>> seenRows = new HashSet<>();
        for (CSVRecord record : records) {
            if (!seenRows.add(fieldValues(record))) {
                System.out.println("Duplicate record found: " + record);
            }
        }
    }

    /** Copies the record's fields, in column order, into a list usable as a set key. */
    private static List<String> fieldValues(CSVRecord record) {
        // Fully qualified because this snippet's import list does not include ArrayList.
        List<String> fields = new java.util.ArrayList<>();
        for (String field : record) {
            fields.add(field);
        }
        return fields;
    }
}

2. 删除重复值

java 复制代码
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

/**
 * Removes duplicate rows from a list of CSV records.
 */
public class DuplicateHandler {

    /**
     * Returns the records with repeated rows dropped, keeping the first occurrence of each
     * row and preserving the original order.
     *
     * <p>Two records are duplicates when they hold the same values in the same column order.
     *
     * @param records the records to de-duplicate; not modified
     * @return a new list containing one record per distinct row
     */
    public static List<CSVRecord> removeDuplicates(List<CSVRecord> records) {
        // Key on the field values: CSVRecord does not define value-based equals/hashCode, so
        // putting the records themselves into a set would never drop anything.
        Set<List<String>> seenRows = new LinkedHashSet<>();
        List<CSVRecord> uniqueRecords = new ArrayList<>();
        for (CSVRecord record : records) {
            List<String> fields = new ArrayList<>();
            for (String field : record) {
                fields.add(field);
            }
            if (seenRows.add(fields)) {
                uniqueRecords.add(record);
            }
        }
        return uniqueRecords;
    }
}

五、处理异常值

1. 使用统计方法(四分位距 IQR)检测并移除异常值

java 复制代码
import java.util.ArrayList;
import java.util.List;

/**
 * Removes outliers from a numeric CSV column using the interquartile-range (IQR) rule.
 */
public class OutlierHandler {

    /**
     * Returns the records whose value in {@code columnName} lies within
     * {@code [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]}, where {@code IQR = Q3 - Q1}.
     *
     * @param records    the records to filter; not modified
     * @param columnName header name of the numeric column
     * @return a new list with the in-range records in their original order; empty when
     *         {@code records} is empty
     * @throws NumberFormatException if a value in the column is not a valid double
     */
    public static List<CSVRecord> removeOutliers(List<CSVRecord> records, String columnName) {
        List<CSVRecord> filteredRecords = new ArrayList<>();
        // No data means no quartiles; asking for one would index position -1.
        if (records.isEmpty()) {
            return filteredRecords;
        }

        // Parse each value once; the list stays aligned with the order of the records.
        List<Double> values = new ArrayList<>(records.size());
        for (CSVRecord record : records) {
            values.add(Double.parseDouble(record.get(columnName)));
        }

        // Sort a copy once so both quartiles share it and the record order is kept in 'values'.
        List<Double> sortedValues = new ArrayList<>(values);
        sortedValues.sort(Double::compareTo);

        double q1 = getPercentile(sortedValues, 25);
        double q3 = getPercentile(sortedValues, 75);
        double iqr = q3 - q1;

        double lowerBound = q1 - 1.5 * iqr;
        double upperBound = q3 + 1.5 * iqr;

        int position = 0;
        for (CSVRecord record : records) {
            double value = values.get(position++);
            if (value >= lowerBound && value <= upperBound) {
                filteredRecords.add(record);
            }
        }

        return filteredRecords;
    }

    /**
     * Returns the nearest-rank percentile of an already sorted, non-empty list.
     *
     * @param sortedValues values in ascending order
     * @param percentile   percentile to look up, on a 0-100 scale
     * @return the element at rank {@code ceil(percentile / 100 * size)}
     */
    private static double getPercentile(List<Double> sortedValues, double percentile) {
        int rank = (int) Math.ceil(percentile / 100.0 * sortedValues.size());
        // Clamp so that a percentile of 0 maps to the first element instead of index -1.
        int index = Math.min(Math.max(rank, 1), sortedValues.size()) - 1;
        return sortedValues.get(index);
    }
}

六、数据类型转换

1. 检查数据类型

java 复制代码
import java.util.List;

/**
 * Prints a best-guess type for each value of a CSV column.
 */
public class DataTypeChecker {

    /**
     * Classifies every value in {@code columnName} and prints the result to standard output.
     *
     * <p>A value is reported as Integer when {@code Integer.parseInt} accepts it, otherwise as
     * Double when {@code Double.parseDouble} accepts it, otherwise as String.
     *
     * @param records    the records to inspect
     * @param columnName header name of the column to classify
     */
    public static void checkDataTypes(List<CSVRecord> records, String columnName) {
        for (CSVRecord row : records) {
            String field = row.get(columnName);
            if (parsesAsInteger(field)) {
                System.out.println("Field " + field + " is of type Integer");
            } else if (parsesAsDouble(field)) {
                System.out.println("Field " + field + " is of type Double");
            } else {
                System.out.println("Field " + field + " is of type String");
            }
        }
    }

    /** Returns whether {@code Integer.parseInt} accepts the text. */
    private static boolean parsesAsInteger(String text) {
        try {
            Integer.parseInt(text);
            return true;
        } catch (NumberFormatException notAnInteger) {
            return false;
        }
    }

    /** Returns whether {@code Double.parseDouble} accepts the text. */
    private static boolean parsesAsDouble(String text) {
        try {
            Double.parseDouble(text);
            return true;
        } catch (NumberFormatException notADouble) {
            return false;
        }
    }
}

2. 转换数据类型

java 复制代码
import java.util.List;

public class DataTypeConverter {

    public static void convertToDouble(List<CSVRecord> records, String columnName) {
        for (CSVRecord record : records) {
            String value = record.get(columnName);
            try {
                double doubleValue = Double.parseDouble(value);
                record.toMap().put(columnName, String.valueOf(doubleValue));
            } catch (NumberFormatException e) {
                System.err.println("Failed to convert " + value + " to Double");
            }
        }
    }
}

七、数据标准化与规范化

1. 标准化(Z-score)

java 复制代码
import java.util.ArrayList;
import java.util.List;

/**
 * Z-score standardization of a numeric CSV column.
 */
public class DataNormalizer {

    /**
     * Computes {@code (value - mean) / stdDev} for every value in {@code columnName}, using
     * the population standard deviation, and writes the result into the record's map view.
     *
     * <p>NOTE(review): Commons CSV documents {@code CSVRecord.toMap()} as copying the record
     * into a new map, so the {@code put} below presumably never reaches the record itself.
     * Confirm this; if so, the method needs to return the standardized rows instead.
     *
     * <p>No guard exists for a zero standard deviation; the division then yields
     * {@code NaN} or an infinity.
     *
     * @param records    the records to standardize
     * @param columnName header name of the numeric column
     * @throws NumberFormatException if a value in the column is not a valid double
     */
    public static void standardizeData(List<CSVRecord> records, String columnName) {
        // Parse the column once, in record order, and reuse it for both passes.
        List<Double> column = new ArrayList<>();
        records.forEach(row -> column.add(Double.valueOf(row.get(columnName))));

        double mean = column.stream()
                .mapToDouble(Double::doubleValue)
                .average()
                .orElse(0.0);
        double variance = column.stream()
                .mapToDouble(v -> Math.pow(v - mean, 2))
                .average()
                .orElse(0.0);
        double stdDev = Math.sqrt(variance);

        int position = 0;
        for (CSVRecord row : records) {
            double zScore = (column.get(position++) - mean) / stdDev;
            row.toMap().put(columnName, String.valueOf(zScore));
        }
    }
}

2. 规范化(Min-Max 归一化)

java 复制代码
import java.util.ArrayList;
import java.util.List;

/**
 * Min-max normalization of a numeric CSV column.
 */
public class DataNormalizer {

    /**
     * Computes {@code (value - min) / (max - min)} for every value in {@code columnName} and
     * writes the result into the record's map view.
     *
     * <p>NOTE(review): Commons CSV documents {@code CSVRecord.toMap()} as copying the record
     * into a new map, so the {@code put} below presumably never reaches the record itself.
     * Confirm this; if so, the method needs to return the normalized rows instead.
     *
     * <p>No guard exists for {@code max == min}; the division then yields {@code NaN}.
     *
     * @param records    the records to normalize
     * @param columnName header name of the numeric column
     * @throws NumberFormatException if a value in the column is not a valid double
     */
    public static void normalizeData(List<CSVRecord> records, String columnName) {
        // Parse the column once, in record order, and reuse it for both passes.
        List<Double> column = new ArrayList<>();
        records.forEach(row -> column.add(Double.valueOf(row.get(columnName))));

        double min = column.stream().min(Double::compare).orElse(0.0);
        double max = column.stream().max(Double::compare).orElse(1.0);
        double range = max - min;

        int position = 0;
        for (CSVRecord row : records) {
            double scaled = (column.get(position++) - min) / range;
            row.toMap().put(columnName, String.valueOf(scaled));
        }
    }
}

八、保存清洗后的数据

java 复制代码
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.apache.commons.csv.CSVRecord;

import java.io.FileWriter;
import java.io.IOException;
import java.util.List;

/**
 * Writes CSV records back to a file with Apache Commons CSV.
 */
public class DataSaver {

    /**
     * Writes {@code records} to {@code filePath} as CSV, using the header names of the first
     * record as the header row.
     *
     * <p>When there is nothing to write, the problem is reported on standard error and the
     * target file is left untouched. I/O failures are reported on standard error as well and
     * are not rethrown.
     *
     * <p>The file is written through {@code FileWriter}, which uses the platform default
     * charset on JDKs before 18.
     *
     * @param filePath path of the file to create or overwrite
     * @param records  the records to write
     */
    public static void saveData(String filePath, List<CSVRecord> records) {
        // Check before opening the file: the header comes from the first record, and
        // FileWriter truncates an existing file as soon as it is constructed.
        if (records == null || records.isEmpty()) {
            System.err.println("Failed to save data: no records to write");
            return;
        }

        String[] header = records.get(0).toMap().keySet().toArray(new String[0]);

        try (FileWriter writer = new FileWriter(filePath);
             CSVPrinter printer = new CSVPrinter(writer, CSVFormat.DEFAULT.withHeader(header))) {

            for (CSVRecord record : records) {
                printer.printRecord(record.toMap().values());
            }
        } catch (IOException e) {
            System.err.println("Failed to save data: " + e.getMessage());
        }
    }
}
相关推荐
蓝倾16 分钟前
淘宝获取商品分类接口操作指南
前端·后端·fastapi
小希爸爸21 分钟前
curl 网络测试常用方法
后端
星星电灯猴1 小时前
iOS WebView 调试实战 页面跳转失效与历史记录错乱的排查路径
后端
重楼七叶一枝花2 小时前
MySQL的在线模式学习笔记
后端·mysql
代码男孩2 小时前
python包管理工具uv的使用
后端
CodeWolf2 小时前
关于端口号配置优先级的问题
后端
C182981825752 小时前
Ribbon轮询实现原理
后端·spring cloud·ribbon
鹿鹿的布丁2 小时前
freeswitch通过编译方式安装
后端
JavaDog程序狗2 小时前
【软件环境】Windows安装JDK21
后端
舒一笑2 小时前
撕碎语法教科书!PandaCoder教大模型「暴力越狱」逐字翻译
后端·程序员·intellij idea