欢迎访问我的主页: https://heeheeaii.github.io/
kotlin
package com.treevalue.beself.other
import kotlin.math.*
import kotlin.random.Random
data class DataNode(val features: DoubleArray, val value: Double) {
override fun equals(other: Any?): Boolean {
if (this === other) return true
if (javaClass != other?.javaClass) return false
other as DataNode
if (!features.contentEquals(other.features)) return false
if (value != other.value) return false
return true
}
override fun hashCode(): Int {
var result = features.contentHashCode()
result = 31 * result + value.hashCode()
return result
}
}
sealed class TreeNode {
data class Leaf(val value: Double) : TreeNode()
data class MidNode(
val splitIdx: Int,
val threshold: Double,
val left: TreeNode,
val right: TreeNode,
) : TreeNode()
}
class DecisionTree(
private val maxDepth: Int = 10,
private val minSplitNum: Int = 2,
private val minSamplesLeaf: Int = 1,
) {
private var root: TreeNode? = null
fun train(data: List<DataNode>) {
root = buildTree(data, depth = 0)
}
private fun buildTree(data: List<DataNode>, depth: Int): TreeNode {
if (depth >= maxDepth || // 深度过深
data.size < minSplitNum || // 数量过少
data.map { it.value }.distinct().size == 1 // 值相同
) {
val prediction = data.map { it.value }.average()
return TreeNode.Leaf(prediction)
}
val bestSplit = findBestSplit(data)
if (bestSplit == null) {
val prediction = data.map { it.value }.average()
return TreeNode.Leaf(prediction)
}
val (leftData, rightData) = splitData(data, bestSplit.first, bestSplit.second)
if (leftData.size < minSamplesLeaf || rightData.size < minSamplesLeaf) {
val prediction = data.map { it.value }.average()
return TreeNode.Leaf(prediction)
}
val leftTree = buildTree(leftData, depth + 1)
val rightTree = buildTree(rightData, depth + 1)
return TreeNode.MidNode(bestSplit.first, bestSplit.second, leftTree, rightTree)
}
private fun findBestSplit(data: List<DataNode>): Pair<Int, Double>? {
if (data.isEmpty()) return null
val featureSize = data[0].features.size
var bestMse = Double.MAX_VALUE // 均方误差
var bestSplit: Pair<Int, Double>? = null // featureIdx, threshold
for (featureIdx in 0 until featureSize) { // 遍历特征找最小的均方误差
val featureValues = data.map { it.features[featureIdx] }.distinct().sorted()
for (jdx in 0 until featureValues.size - 1) {
val threshold = (featureValues[jdx] + featureValues[jdx + 1]) / 2
val mse = calculateSplitMse(data, featureIdx, threshold)
if (mse < bestMse) {
bestMse = mse
bestSplit = Pair(featureIdx, threshold)
}
}
}
return bestSplit
}
private fun calculateSplitMse(data: List<DataNode>, featureIndex: Int, threshold: Double): Double {
val (leftData, rightData) = splitData(data, featureIndex, threshold)
if (leftData.isEmpty() || rightData.isEmpty()) {
return Double.MAX_VALUE
}
val totalSize = data.size.toDouble()
val leftWeight = leftData.size / totalSize
val rightWeight = rightData.size / totalSize
val leftMse = calculateMse(leftData)
val rightMse = calculateMse(rightData)
return leftWeight * leftMse + rightWeight * rightMse
}
private fun calculateMse(data: List<DataNode>): Double {
if (data.isEmpty()) return 0.0
val mean = data.map { it.value }.average()
return data.map { (it.value - mean).pow(2) }.average()
}
private fun splitData(
data: List<DataNode>,
featureIndex: Int,
threshold: Double,
): Pair<List<DataNode>, List<DataNode>> {
val leftData = data.filter { it.features[featureIndex] <= threshold }
val rightData = data.filter { it.features[featureIndex] > threshold }
return Pair(leftData, rightData)
}
fun predict(features: DoubleArray): Double {
return root?.let { predictRecursive(it, features) } ?: 0.0
}
private fun predictRecursive(inputNode: TreeNode, features: DoubleArray): Double {
return when (inputNode) {
is TreeNode.Leaf -> inputNode.value
is TreeNode.MidNode -> {
if (features[inputNode.splitIdx] <= inputNode.threshold) {
predictRecursive(inputNode.left, features)
} else {
predictRecursive(inputNode.right, features)
}
}
}
}
fun predict(dataPoints: List<DoubleArray>): List<Double> {
return dataPoints.map { predict(it) }
}
}
class RandomForest(
private val maxTreeNum: Int = 100,
private val maxTreeDepth: Int = 10,
private val minSplitNum: Int = 2,
private val minLeafNodeNum: Int = 1,
private val maxFeatureNum: Int? = null,
private val sampleRatio: Double = 1.0,
private val random: Random = Random.Default,
) {
private val trees = mutableListOf<DecisionTree>()
private val useFeatures = mutableListOf<IntArray>()
fun train(data: List<DataNode>) {
val featureNum = data[0].features.size
val actualMaxFeatures = maxFeatureNum ?: sqrt(featureNum.toDouble()).toInt()
repeat(maxTreeNum) { _ ->
val sampleData = startSample(data, sampleRatio)
val sampleFeature = selectRandomFeatures(featureNum, actualMaxFeatures)
useFeatures.add(sampleFeature)
val subsetData = createFeatureSubsetData(sampleData, sampleFeature) // 随机采样
val tree = DecisionTree(maxTreeDepth, minSplitNum, minLeafNodeNum)
tree.train(subsetData)
trees.add(tree)
}
}
private fun startSample(data: List<DataNode>, ratio: Double): List<DataNode> {
val sampleSize = (data.size * ratio).toInt()
return (1..sampleSize).map {
data[random.nextInt(data.size)]
}
}
private fun selectRandomFeatures(totalFeatures: Int, maxFeatures: Int): IntArray {
val features = (0 until totalFeatures).toMutableList()
features.shuffle(random)
return features.take(maxFeatures).toIntArray()
}
private fun createFeatureSubsetData(data: List<DataNode>, featureSubset: IntArray): List<DataNode> {
return data.map { point ->
val subsetFeatures = featureSubset.map { point.features[it] }.toDoubleArray()
DataNode(subsetFeatures, point.value)
}
}
private fun predict(features: DoubleArray): Double {
val predictions = trees.mapIndexed { idx, tree ->
val featureSubset = useFeatures[idx]
val subsetFeatures = featureSubset.map { features[it] }.toDoubleArray()
tree.predict(subsetFeatures)
}
return predictions.average()
}
fun predict(dataPoints: List<DoubleArray>): List<Double> {
return dataPoints.map { predict(it) }
}
fun getWeightStatistic(maxFeatureNum: Int): DoubleArray { //
val weights = DoubleArray(maxFeatureNum)
useFeatures.forEach { useFt ->
useFt.forEach { idx ->
weights[idx] += 1.0
}
}
val total = weights.sum()
if (total > 0) { // 除0错误
for (idx in weights.indices) {
weights[idx] /= total
}
}
return weights
}
}
class GradientBoostingRegressor(
// 梯度提升回归器
private val learnerNum: Int = 100,
private val learningRate: Double = 0.1,
private val maxTreeDepth: Int = 3,
private val minSplitNum: Int = 2,
private val minLeafNum: Int = 1,
private val sampleRate: Double = 1.0,
private val random: Random = Random.Default,
) {
private val trees = mutableListOf<DecisionTree>()
private var initPrediction: Double = 0.0
fun train(data: List<DataNode>) {
initPrediction = data.map { it.value }.average()
var residuals = data.map { it.value - initPrediction }.toMutableList()
repeat(learnerNum) { idx ->
val residualData = data.mapIndexed { index, point ->
DataNode(point.features, residuals[index])
}
val trainData = if (sampleRate < 1.0) {
val sampleSize = (residualData.size * sampleRate).toInt()
residualData.shuffled(random).take(sampleSize)
} else {
residualData
}
val tree = DecisionTree(
maxDepth = maxTreeDepth,
minSplitNum = minSplitNum,
minSamplesLeaf = minLeafNum
)
tree.train(trainData)
trees.add(tree)
residuals = residuals.mapIndexed { idx, residual ->
val prediction = tree.predict(data[idx].features)
residual - learningRate * prediction
}.toMutableList()
if ((idx + 1) % 20 == 0) {
val predictionList = data.map { predict(it.features) }
val mse = data.zip(predictionList) { origin, pred ->
(origin.value - pred).pow(2)
}.average()
println("第 ${idx + 1} 轮后的MSE: $mse")
}
}
println("梯度提升模型训练完成!")
}
private fun predict(features: DoubleArray): Double {
var prediction = initPrediction
trees.forEach { tree ->
prediction += learningRate * tree.predict(features)
}
return prediction
}
fun predict(dataPoints: List<DoubleArray>): List<Double> {
return dataPoints.map { predict(it) }
}
}
object ModelEvaluator {
fun calculateMseInSameLen(actual: List<Double>, predicted: List<Double>): Double {
return actual.zip(predicted) { a, p -> (a - p).pow(2) }.average()
}
fun calculateRmseInSameLen(actual: List<Double>, predicted: List<Double>): Double {
return sqrt(calculateMseInSameLen(actual, predicted))
}
fun calculateR2InSameLen(actual: List<Double>, predicted: List<Double>): Double {
val actualMean = actual.average()
val totalSumSquares = actual.sumOf { (it - actualMean).pow(2) }
val residualSumSquares = actual.zip(predicted) { a, p -> (a - p).pow(2) }.sum()
return 1.0 - (residualSumSquares / totalSumSquares)
}
}
object DataGenerator {
fun generateNonlinearData(
samples: Int = 1000,
noise: Double = 0.2,
random: Random = Random.Default,
): List<DataNode> {
return (1..samples).map {
val x1 = random.nextDouble(-PI, PI)
val x2 = random.nextDouble(-PI, PI)
val x3 = random.nextDouble(-2.0, 2.0)
val target = sin(x1) + cos(x2) + x3.pow(2) + random.nextDouble() * noise
DataNode(doubleArrayOf(x1, x2, x3), target)
}
}
}
fun main() {
println("=== 机器学习算法演示 ===\n")
println("生成数据集...")
val random = Random(42)
val trainData = DataGenerator.generateNonlinearData(800, 0.2, random)
val testData = DataGenerator.generateNonlinearData(200, 0.2, random)
val testFeatures = testData.map { it.features }
val testLabels = testData.map { it.value }
println("训练集大小: ${trainData.size}")
println("测试集大小: ${testData.size}\n")
println("=== 1. 决策树 ===")
val decisionTree = DecisionTree(maxDepth = 8, minSplitNum = 5, minSamplesLeaf = 2)
decisionTree.train(trainData)
val dtPredictions = decisionTree.predict(testFeatures)
val dtMse = ModelEvaluator.calculateMseInSameLen(testLabels, dtPredictions)
val dtRmse = ModelEvaluator.calculateRmseInSameLen(testLabels, dtPredictions)
val dtR2 = ModelEvaluator.calculateR2InSameLen(testLabels, dtPredictions)
println("决策树结果:")
println(" MSE: $dtMse")
println(" RMSE: $dtRmse")
println(" R²: $dtR2\n")
println("=== 2. 随机森林 ===")
val randomForest = RandomForest(
maxTreeNum = 50,
maxTreeDepth = 8,
minSplitNum = 5,
minLeafNodeNum = 2,
maxFeatureNum = 2,
sampleRatio = 0.8,
random = random
)
randomForest.train(trainData)
val rfPredictions = randomForest.predict(testFeatures)
val rfMse = ModelEvaluator.calculateMseInSameLen(testLabels, rfPredictions)
val rfRmse = ModelEvaluator.calculateRmseInSameLen(testLabels, rfPredictions)
val rfR2 = ModelEvaluator.calculateR2InSameLen(testLabels, rfPredictions)
println("随机森林结果:")
println(" MSE: $rfMse")
println(" RMSE: $rfRmse")
println(" R²: $rfR2")
val featureImportances = randomForest.getWeightStatistic(3)
println(" 特征重要性: ${featureImportances.contentToString()}\n")
println("=== 3. 梯度提升 ===")
val gradientBoosting = GradientBoostingRegressor(
learnerNum = 100,
learningRate = 0.1,
maxTreeDepth = 4,
minSplitNum = 5,
sampleRate = 0.8,
random = random
)
gradientBoosting.train(trainData)
val gbPredictions = gradientBoosting.predict(testFeatures)
val gbMse = ModelEvaluator.calculateMseInSameLen(testLabels, gbPredictions)
val gbRmse = ModelEvaluator.calculateRmseInSameLen(testLabels, gbPredictions)
val gbR2 = ModelEvaluator.calculateR2InSameLen(testLabels, gbPredictions)
println("梯度提升结果:")
println(" MSE: $gbMse")
println(" RMSE: $gbRmse")
println(" R²: $gbR2\n")
println("=== 算法对比 ===")
println("算法 MSE RMSE R²")
println("决策树 %.6f %.6f %.6f".format(dtMse, dtRmse, dtR2))
println("随机森林 %.6f %.6f %.6f".format(rfMse, rfRmse, rfR2))
println("梯度提升 %.6f %.6f %.6f".format(gbMse, gbRmse, gbR2))
println("\n=== 演示完成 ===")
}