基本思想
遗传算法一个比较有趣的应用是,给定一组数据,用初等函数拟合出一个公式,尽可能接近这组数据
熟悉深度学习的同学可能会发现,这其实类似深度学习,不同点在于,深度学习使用多层网络+激活函数来拟合,内部是黑箱,具体得到了一个什么公式谁也说不清。
但是遗传算法来拟合,会先把每个初等函数都编码成一个基因,最后得到最优的基因后,可以解码出具体是什么公式!这叫做符号回归,也就是不时给定函数结构,预测参数,而是函数结构,参数都预测,这里符号就是用什么函数类型的意思。
这里因为每个个体要代表一个表达式,所以基因没有采用二进制串,而是一个抽象语法树。交叉时,分别随机选一个子树,进行交换。变异时有一定概率把一个子树变成完全随机的新子树,一定概率只改一个点的符号,一定概率只改一个点的常量值。
开始随机生成一个表达式,然后在他上面取一些数据点,传给遗传算法来预测。每轮的基因适应度,就是检查这些数据点的函数值,和真实函数值差多少,拿出来做一个均方差(MSE)。根据均方差,保留一些精英到下一代,剩下不能保留到下一代的繁殖,繁殖时适应度高的有优先择偶权。这块和一般的遗传算法差不多
特殊之处在于,这里初始生成的表达式都很简单,只用加减乘除,三角函数,且只在积累固定的模板中挑选,这是为了避免预测难度过大。所以为了避免预测试时,过拟合出一个复杂的公式,偏离正确答案,我们需要在预测时采用类似深度学习正则化的思想,对预测语法树的复杂度做一个惩罚,这里的具体实现是,限制节点数,深度上限,并且在适应度中加一个正比于节点数的项。
实现
c
#include <algorithm>
#include <chrono>
#include <cmath>
#include <iomanip>
#include <iostream>
#include <limits>
#include <memory>
#include <numeric>
#include <random>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
namespace {
// 数值求值时的保护阈值,避免树在演化过程中出现极端大数。
constexpr double kEvalLimit = 1e6;
// 保护除法的阈值,分母太接近 0 时不做真实除法。
constexpr double kDivisionEpsilon = 1e-6;
// 当表达式非法或误差爆炸时,直接赋一个极大的惩罚适应度。
constexpr double kFitnessPenalty = 1e12;
double clamp_value(double value, double low, double high) {
return std::max(low, std::min(value, high));
}
double clamp_eval(double value) {
if (!std::isfinite(value)) {
return 0.0;
}
return clamp_value(value, -kEvalLimit, kEvalLimit);
}
double protected_div(double lhs, double rhs) {
if (std::abs(rhs) < kDivisionEpsilon) {
return clamp_eval(lhs);
}
return clamp_eval(lhs / rhs);
}
// 表达式树允许使用的原始算子集合。
// 这组算子越丰富,表达能力越强;但搜索空间也会更大,进化更难稳定。
enum class NodeType {
Variable,
Constant,
Add,
Sub,
Mul,
Div,
Sin,
Cos,
};
int arity(NodeType type) {
switch (type) {
case NodeType::Variable:
case NodeType::Constant:
return 0;
case NodeType::Sin:
case NodeType::Cos:
return 1;
case NodeType::Add:
case NodeType::Sub:
case NodeType::Mul:
case NodeType::Div:
return 2;
}
return 0;
}
struct ExprNode {
NodeType type = NodeType::Constant;
double constant = 0.0;
// 一元运算只使用 left;二元运算同时使用 left / right。
std::unique_ptr<ExprNode> left;
std::unique_ptr<ExprNode> right;
};
std::unique_ptr<ExprNode> clone_tree(const ExprNode* node) {
if (node == nullptr) {
return nullptr;
}
auto copy = std::make_unique<ExprNode>();
copy->type = node->type;
copy->constant = node->constant;
copy->left = clone_tree(node->left.get());
copy->right = clone_tree(node->right.get());
return copy;
}
int node_count(const ExprNode* node) {
if (node == nullptr) {
return 0;
}
return 1 + node_count(node->left.get()) + node_count(node->right.get());
}
int tree_depth(const ExprNode* node) {
if (node == nullptr) {
return 0;
}
return 1 + std::max(tree_depth(node->left.get()), tree_depth(node->right.get()));
}
bool contains_variable(const ExprNode* node) {
if (node == nullptr) {
return false;
}
if (node->type == NodeType::Variable) {
return true;
}
return contains_variable(node->left.get()) || contains_variable(node->right.get());
}
double eval_tree(const ExprNode* node, double x) {
// 递归计算表达式树在 x 处的值,所有中间结果都做截断保护。
switch (node->type) {
case NodeType::Variable:
return clamp_eval(x);
case NodeType::Constant:
return clamp_eval(node->constant);
case NodeType::Add:
return clamp_eval(eval_tree(node->left.get(), x) + eval_tree(node->right.get(), x));
case NodeType::Sub:
return clamp_eval(eval_tree(node->left.get(), x) - eval_tree(node->right.get(), x));
case NodeType::Mul:
return clamp_eval(eval_tree(node->left.get(), x) * eval_tree(node->right.get(), x));
case NodeType::Div:
return protected_div(eval_tree(node->left.get(), x), eval_tree(node->right.get(), x));
case NodeType::Sin:
return clamp_eval(std::sin(eval_tree(node->left.get(), x)));
case NodeType::Cos:
return clamp_eval(std::cos(eval_tree(node->left.get(), x)));
}
return 0.0;
}
std::string format_number(double value) {
std::ostringstream out;
out << std::fixed << std::setprecision(3) << value;
std::string text = out.str();
while (!text.empty() && text.back() == '0') {
text.pop_back();
}
if (!text.empty() && text.back() == '.') {
text.pop_back();
}
if (text == "-0") {
text = "0";
}
return text;
}
std::string to_string(const ExprNode* node) {
switch (node->type) {
case NodeType::Variable:
return "x";
case NodeType::Constant:
return format_number(node->constant);
case NodeType::Add:
return "(" + to_string(node->left.get()) + " + " + to_string(node->right.get()) + ")";
case NodeType::Sub:
return "(" + to_string(node->left.get()) + " - " + to_string(node->right.get()) + ")";
case NodeType::Mul:
return "(" + to_string(node->left.get()) + " * " + to_string(node->right.get()) + ")";
case NodeType::Div:
return "(" + to_string(node->left.get()) + " / " + to_string(node->right.get()) + ")";
case NodeType::Sin:
return "sin(" + to_string(node->left.get()) + ")";
case NodeType::Cos:
return "cos(" + to_string(node->left.get()) + ")";
}
return "?";
}
void collect_paths(const ExprNode* node,
std::vector<std::vector<int>>& paths,
std::vector<int>& current) {
if (node == nullptr) {
return;
}
// 用从根走到该节点的路径给每个子树编号:0 表示左儿子,1 表示右儿子。
paths.push_back(current);
if (node->left) {
current.push_back(0);
collect_paths(node->left.get(), paths, current);
current.pop_back();
}
if (node->right) {
current.push_back(1);
collect_paths(node->right.get(), paths, current);
current.pop_back();
}
}
std::unique_ptr<ExprNode>* get_slot(std::unique_ptr<ExprNode>& root,
const std::vector<int>& path,
std::size_t index = 0) {
if (index == path.size()) {
return &root;
}
if (path[index] == 0) {
return get_slot(root->left, path, index + 1);
}
return get_slot(root->right, path, index + 1);
}
const ExprNode* get_node(const std::unique_ptr<ExprNode>& root, const std::vector<int>& path) {
const ExprNode* node = root.get();
for (int dir : path) {
node = (dir == 0 ? node->left.get() : node->right.get());
}
return node;
}
struct Sample {
double x;
double y;
};
// 负责随机生成表达式树,以及对常数做局部数值扰动。
// 这个类决定了"搜索空间的先验分布":初始树更偏向什么结构、常数更容易取到哪些值,
// 都会直接影响遗传算法前期能否快速找到有希望的候选公式。
class ExpressionFactory {
public:
explicit ExpressionFactory(std::mt19937& rng) : rng_(rng) {}
std::unique_ptr<ExprNode> random_tree(int max_depth, bool force_function) {
// Mix "grow" and "full" style initialization: either stop at a terminal
// or keep expanding until depth runs out.
// 混合 grow / full 两种初始化风格,增加初始种群多样性。
if (max_depth <= 1 || (!force_function && prob_(rng_) < 0.32)) {
return random_terminal();
}
if (prob_(rng_) < 0.30) {
auto node = std::make_unique<ExprNode>();
node->type = random_unary();
node->left = random_tree(max_depth - 1, false);
return node;
}
auto node = std::make_unique<ExprNode>();
node->type = random_binary();
node->left = random_tree(max_depth - 1, false);
node->right = random_tree(max_depth - 1, false);
return node;
}
std::unique_ptr<ExprNode> random_terminal() {
auto node = std::make_unique<ExprNode>();
if (prob_(rng_) < 0.60) {
node->type = NodeType::Variable;
} else {
node->type = NodeType::Constant;
node->constant = random_constant();
}
return node;
}
NodeType random_unary() {
std::uniform_int_distribution<int> dist(0, 1);
return dist(rng_) == 0 ? NodeType::Sin : NodeType::Cos;
}
NodeType random_binary() {
std::uniform_int_distribution<int> dist(0, 3);
switch (dist(rng_)) {
case 0:
return NodeType::Add;
case 1:
return NodeType::Sub;
case 2:
return NodeType::Mul;
default:
return NodeType::Div;
}
}
double random_constant() {
// 常数池故意不设得太大,便于在早期更快拼出可解释的表达式。
static const std::vector<double> pool = {-3.0, -2.0, -1.5, -1.0, -0.5,
0.5, 1.0, 1.5, 2.0, 3.0};
std::uniform_int_distribution<int> dist(0, static_cast<int>(pool.size()) - 1);
return pool[dist(rng_)];
}
void mutate_constant(ExprNode& node) {
std::normal_distribution<double> noise(0.0, 0.30);
if (prob_(rng_) < 0.50) {
node.constant = random_constant();
} else {
node.constant = clamp_value(node.constant + noise(rng_), -5.0, 5.0);
}
}
private:
std::mt19937& rng_;
std::uniform_real_distribution<double> prob_{0.0, 1.0};
};
class SymbolicRegressionGA {
public:
struct Individual {
std::unique_ptr<ExprNode> root;
double fitness = kFitnessPenalty;
double train_mse = kFitnessPenalty;
double test_mse = kFitnessPenalty;
};
SymbolicRegressionGA(std::vector<Sample> train_samples,
std::vector<Sample> test_samples,
unsigned seed)
: train_samples_(std::move(train_samples)),
test_samples_(std::move(test_samples)),
rng_(seed),
factory_(rng_) {}
void set_population_size(int value) { population_size_ = value; }
void set_generations(int value) { generations_ = value; }
void set_elite_count(int value) { elite_count_ = value; }
void set_tournament_size(int value) { tournament_size_ = value; }
void set_crossover_rate(double value) { crossover_rate_ = value; }
void set_mutation_rate(double value) { mutation_rate_ = value; }
void set_max_tree_depth(int value) { max_tree_depth_ = value; }
void set_max_tree_nodes(int value) { max_tree_nodes_ = value; }
// 主进化循环:
// 1. 对当前种群按适应度排序;
// 2. 保留精英;
// 3. 反复做选择、交叉、变异生成子代;
// 4. 评估子代后替换旧种群。
//
// 这里没有显式"提前停止"条件,而是固定跑满 generations。
// 这样做实现简单,但如果后续想提速,可以加入"若最优值长期不改善则停止"。
Individual run() {
std::vector<Individual> population = initialize_population();
Individual best = clone_individual(population.front());
for (int generation = 0; generation < generations_; ++generation) {
std::sort(population.begin(), population.end(),
[](const Individual& lhs, const Individual& rhs) {
return lhs.fitness < rhs.fitness;
});
if (population.front().fitness < best.fitness) {
best = clone_individual(population.front());
}
if (generation % 40 == 0 || generation + 1 == generations_) {
std::cout << "generation " << std::setw(3) << generation
<< " train_mse=" << std::setw(12) << std::setprecision(8)
<< population.front().train_mse << " size="
<< node_count(population.front().root.get()) << " expr="
<< to_string(population.front().root.get()) << '\n';
}
std::vector<Individual> next_population;
next_population.reserve(population_size_);
// 精英保留:前若干个最优个体原样进入下一代,减少回退。
for (int i = 0; i < elite_count_ && i < population_size_; ++i) {
next_population.push_back(clone_individual(population[i]));
}
while (static_cast<int>(next_population.size()) < population_size_) {
const Individual& parent_a = select(population);
const Individual& parent_b = select(population);
Individual child;
if (prob_(rng_) < crossover_rate_) {
child.root = crossover(parent_a.root, parent_b.root);
} else {
child.root = clone_tree(parent_a.root.get());
}
if (prob_(rng_) < mutation_rate_) {
mutate(child.root);
}
// 控制树的深度和节点数,防止表达式无限膨胀。
if (!is_valid_tree(child.root.get())) {
child.root = factory_.random_tree(3, true);
}
evaluate_individual(child);
next_population.push_back(std::move(child));
}
population.swap(next_population);
}
std::sort(population.begin(), population.end(),
[](const Individual& lhs, const Individual& rhs) {
return lhs.fitness < rhs.fitness;
});
if (population.front().fitness < best.fitness) {
best = clone_individual(population.front());
}
return best;
}
private:
// 初始化策略不是完全随机,而是"少量人工种子 + 大量随机树"的混合方式。
// 人工种子负责把搜索起点放在几个高频基础结构附近,例如 x、x*x、sin(x)。
// 随机树负责提供多样性,避免整个种群只在少数简单结构周围打转。
std::vector<Individual> initialize_population() {
std::vector<Individual> population;
population.reserve(population_size_);
auto seed_expr = [&](std::unique_ptr<ExprNode> root) {
Individual individual;
individual.root = std::move(root);
evaluate_individual(individual);
population.push_back(std::move(individual));
};
// 先放入一些简单且常见的基础结构,帮助搜索更快起步。
seed_expr(make_variable());
seed_expr(make_constant(1.0));
seed_expr(make_binary(NodeType::Mul, make_variable(), make_variable()));
seed_expr(make_unary(NodeType::Sin, make_variable()));
seed_expr(make_unary(NodeType::Cos, make_variable()));
seed_expr(make_binary(NodeType::Add, make_variable(), make_constant(1.0)));
std::uniform_int_distribution<int> depth_dist(2, 4);
while (static_cast<int>(population.size()) < population_size_) {
// 交替生成更满和更松的树,提高初始多样性。
bool full = (population.size() % 2 == 0);
auto root = factory_.random_tree(depth_dist(rng_), full);
if (!contains_variable(root.get())) {
// 符号回归里纯常数很容易卡成局部最优,这里强制把 x 拼进去。
root = make_binary(NodeType::Add, std::move(root), make_variable());
}
Individual individual;
individual.root = std::move(root);
evaluate_individual(individual);
population.push_back(std::move(individual));
}
return population;
}
Individual clone_individual(const Individual& other) const {
Individual copy;
copy.root = clone_tree(other.root.get());
copy.fitness = other.fitness;
copy.train_mse = other.train_mse;
copy.test_mse = other.test_mse;
return copy;
}
// 选择阶段决定"谁更有机会繁殖"。
// 这里使用锦标赛选择而不是轮盘赌:实现更稳健,也不怕 fitness 数值尺度差异过大。
const Individual& select(const std::vector<Individual>& population) {
// Tournament selection keeps pressure toward fitter programs without
// requiring probabilities derived from raw fitness values.
// 锦标赛选择:随机抽若干个体,取其中最优者。
std::uniform_int_distribution<int> dist(0, static_cast<int>(population.size()) - 1);
int best = dist(rng_);
for (int i = 1; i < tournament_size_; ++i) {
int candidate = dist(rng_);
if (population[candidate].fitness < population[best].fitness) {
best = candidate;
}
}
return population[best];
}
// 交叉的本质是"重组已有子结构"。
// 如果某个父代已经学到了有用的局部模式,例如 sin(x + c) 或 x / c,
// 那么子树交叉有机会把这些局部模式拼接成更完整的表达式。
std::unique_ptr<ExprNode> crossover(const std::unique_ptr<ExprNode>& lhs,
const std::unique_ptr<ExprNode>& rhs) {
auto child = clone_tree(lhs.get());
std::vector<std::vector<int>> lhs_paths;
std::vector<int> current;
collect_paths(child.get(), lhs_paths, current);
std::vector<std::vector<int>> rhs_paths;
collect_paths(rhs.get(), rhs_paths, current);
std::uniform_int_distribution<int> lhs_dist(0, static_cast<int>(lhs_paths.size()) - 1);
std::uniform_int_distribution<int> rhs_dist(0, static_cast<int>(rhs_paths.size()) - 1);
const auto& lhs_path = lhs_paths[lhs_dist(rng_)];
const auto& rhs_path = rhs_paths[rhs_dist(rng_)];
// Standard GP subtree crossover: replace one random subtree in lhs
// with one random subtree from rhs.
// 标准遗传编程的子树交叉:在 lhs 中随机挑一个子树位置,用 rhs 的随机子树替换它。
std::unique_ptr<ExprNode>* slot = get_slot(child, lhs_path);
*slot = clone_tree(get_node(rhs, rhs_path));
if (!contains_variable(child.get())) {
child = make_binary(NodeType::Add, std::move(child), make_variable());
}
return child;
}
// 变异负责给搜索引入新信息,避免种群过早收敛。
// 结构变异负责大步探索,算子变异负责结构微调,常数变异负责局部参数优化。
void mutate(std::unique_ptr<ExprNode>& root) {
std::vector<std::vector<int>> paths;
std::vector<int> current;
collect_paths(root.get(), paths, current);
std::uniform_int_distribution<int> path_dist(0, static_cast<int>(paths.size()) - 1);
std::uniform_real_distribution<double> choice(0.0, 1.0);
double mode = choice(rng_);
if (mode < 0.40) {
// 结构变异:随机找一棵子树,整棵替换成新生成的树。
std::unique_ptr<ExprNode>* slot = get_slot(root, paths[path_dist(rng_)]);
*slot = factory_.random_tree(3, false);
} else if (mode < 0.75) {
// 算子变异:保留子树结构,只改节点运算符。
std::vector<std::vector<int>> operator_paths;
for (const auto& path : paths) {
const ExprNode* node = get_node(root, path);
if (arity(node->type) > 0) {
operator_paths.push_back(path);
}
}
if (!operator_paths.empty()) {
std::uniform_int_distribution<int> op_dist(
0, static_cast<int>(operator_paths.size()) - 1);
std::unique_ptr<ExprNode>* slot = get_slot(root, operator_paths[op_dist(rng_)]);
if (arity((*slot)->type) == 1) {
(*slot)->type = factory_.random_unary();
} else {
(*slot)->type = factory_.random_binary();
}
}
} else {
// 数值变异:如果树里有常数,就对某个常数做重采样或小扰动。
std::vector<std::vector<int>> constant_paths;
for (const auto& path : paths) {
const ExprNode* node = get_node(root, path);
if (node->type == NodeType::Constant) {
constant_paths.push_back(path);
}
}
if (constant_paths.empty()) {
std::unique_ptr<ExprNode>* slot = get_slot(root, paths[path_dist(rng_)]);
*slot = factory_.random_tree(2, false);
} else {
std::uniform_int_distribution<int> const_dist(
0, static_cast<int>(constant_paths.size()) - 1);
std::unique_ptr<ExprNode>* slot =
get_slot(root, constant_paths[const_dist(rng_)]);
factory_.mutate_constant(**slot);
}
}
if (!contains_variable(root.get())) {
root = make_binary(NodeType::Add, std::move(root), make_variable());
}
}
// 适应度函数是整个搜索的方向盘。
// 这份实现同时考虑训练误差、表达式复杂度,以及额外采样点上的泛化表现。
void evaluate_individual(Individual& individual) const {
individual.train_mse = mean_squared_error(individual.root.get(), train_samples_);
individual.test_mse = mean_squared_error(individual.root.get(), test_samples_);
const int size = node_count(individual.root.get());
// Optimize training error first, but mildly discourage both overgrown
// trees and solutions that only fit the training grid.
// 适应度设计:训练误差是主目标,同时轻微惩罚大树,并混入少量 test_mse。
individual.fitness =
individual.train_mse + size * complexity_penalty_ + 0.10 * individual.test_mse;
}
// 对每个采样点计算平方误差,并在累计过程中做溢出保护。
// 一旦发现预测值非法,或者总误差已经大到没有比较价值,就直接返回惩罚值。
double mean_squared_error(const ExprNode* root, const std::vector<Sample>& samples) const {
long double total = 0.0L;
for (const auto& sample : samples) {
double prediction = eval_tree(root, sample.x);
if (!std::isfinite(prediction)) {
return kFitnessPenalty;
}
double diff = prediction - sample.y;
if (!std::isfinite(diff)) {
return kFitnessPenalty;
}
total += static_cast<long double>(diff) * static_cast<long double>(diff);
if (total > static_cast<long double>(kFitnessPenalty)) {
return kFitnessPenalty;
}
}
return static_cast<double>(total / samples.size());
}
bool is_valid_tree(const ExprNode* root) const {
if (root == nullptr) {
return false;
}
// 同时限制深度和节点数,控制搜索空间规模与表达式复杂度。
return node_count(root) <= max_tree_nodes_ && tree_depth(root) <= max_tree_depth_;
}
static std::unique_ptr<ExprNode> make_variable() {
auto node = std::make_unique<ExprNode>();
node->type = NodeType::Variable;
return node;
}
static std::unique_ptr<ExprNode> make_constant(double value) {
auto node = std::make_unique<ExprNode>();
node->type = NodeType::Constant;
node->constant = value;
return node;
}
static std::unique_ptr<ExprNode> make_unary(NodeType type, std::unique_ptr<ExprNode> child) {
auto node = std::make_unique<ExprNode>();
node->type = type;
node->left = std::move(child);
return node;
}
static std::unique_ptr<ExprNode> make_binary(NodeType type,
std::unique_ptr<ExprNode> lhs,
std::unique_ptr<ExprNode> rhs) {
auto node = std::make_unique<ExprNode>();
node->type = type;
node->left = std::move(lhs);
node->right = std::move(rhs);
return node;
}
std::vector<Sample> train_samples_;
std::vector<Sample> test_samples_;
int population_size_ = 240;
int generations_ = 320;
int elite_count_ = 10;
int tournament_size_ = 5;
int max_tree_depth_ = 8;
int max_tree_nodes_ = 40;
double crossover_rate_ = 0.90;
double mutation_rate_ = 0.38;
double complexity_penalty_ = 2e-3;
std::mt19937 rng_;
ExpressionFactory factory_;
std::uniform_real_distribution<double> prob_{0.0, 1.0};
};
std::unique_ptr<ExprNode> make_variable_node() {
auto node = std::make_unique<ExprNode>();
node->type = NodeType::Variable;
return node;
}
std::unique_ptr<ExprNode> make_constant_node(double value) {
auto node = std::make_unique<ExprNode>();
node->type = NodeType::Constant;
node->constant = value;
return node;
}
std::unique_ptr<ExprNode> make_unary_node(NodeType type, std::unique_ptr<ExprNode> child) {
auto node = std::make_unique<ExprNode>();
node->type = type;
node->left = std::move(child);
return node;
}
std::unique_ptr<ExprNode> make_binary_node(NodeType type,
std::unique_ptr<ExprNode> lhs,
std::unique_ptr<ExprNode> rhs) {
auto node = std::make_unique<ExprNode>();
node->type = type;
node->left = std::move(lhs);
node->right = std::move(rhs);
return node;
}
// 这个函数只用于生成演示用的隐藏目标表达式。
// 它相当于"出题人":先随机挑一个模板,再把常数参数填进去,
// 最后由遗传算法根据采样点把这个目标函数反推回来。
std::unique_ptr<ExprNode> generate_target_expression(std::mt19937& rng) {
static const std::vector<double> linear_pool = {-1.5, -1.0, -0.5, 0.5, 1.0, 1.5};
static const std::vector<double> denom_pool = {1.5, 2.0, 3.0};
std::uniform_int_distribution<int> pick_template(0, 4);
std::uniform_int_distribution<int> pick_linear(0, static_cast<int>(linear_pool.size()) - 1);
std::uniform_int_distribution<int> pick_denom(0, static_cast<int>(denom_pool.size()) - 1);
const double a = linear_pool[pick_linear(rng)];
const double b = denom_pool[pick_denom(rng)];
// Keep the hidden target family simple enough that the demo usually
// recovers a recognizable expression within a few hundred generations.
switch (pick_template(rng)) {
case 0:
return make_binary_node(
NodeType::Add,
make_unary_node(NodeType::Sin, make_variable_node()),
make_binary_node(NodeType::Mul, make_constant_node(a), make_variable_node()));
case 1:
return make_binary_node(
NodeType::Sub,
make_unary_node(NodeType::Cos, make_variable_node()),
make_binary_node(NodeType::Div, make_variable_node(), make_constant_node(b)));
case 2:
return make_binary_node(
NodeType::Add,
make_binary_node(NodeType::Mul, make_variable_node(), make_variable_node()),
make_binary_node(NodeType::Mul,
make_constant_node(a),
make_unary_node(NodeType::Sin, make_variable_node())));
case 3:
return make_binary_node(
NodeType::Sub,
make_binary_node(NodeType::Mul, make_variable_node(), make_variable_node()),
make_binary_node(NodeType::Mul,
make_constant_node(b),
make_unary_node(NodeType::Cos, make_variable_node())));
default:
return make_binary_node(
NodeType::Add,
make_unary_node(NodeType::Sin,
make_binary_node(NodeType::Add,
make_variable_node(),
make_constant_node(0.5))),
make_binary_node(NodeType::Div, make_variable_node(), make_constant_node(b)));
}
}
std::vector<Sample> make_train_samples(const ExprNode* target) {
std::vector<Sample> samples;
// 训练集使用较稀疏的采样点,GA 进化时只看得到这些数据。
for (int i = 0; i < 21; ++i) {
double x = -2.5 + 5.0 * i / 20.0;
samples.push_back({x, eval_tree(target, x)});
}
return samples;
}
std::vector<Sample> make_test_samples(const ExprNode* target) {
std::vector<Sample> samples;
// 测试集使用更密且略微错开的采样点,用来看泛化而不是死记训练点。
for (int i = 0; i < 40; ++i) {
double x = -2.4 + 4.8 * i / 39.0;
samples.push_back({x, eval_tree(target, x)});
}
return samples;
}
double mean_absolute_error(const ExprNode* lhs,
const ExprNode* rhs,
const std::vector<double>& xs) {
double total = 0.0;
for (double x : xs) {
total += std::abs(eval_tree(lhs, x) - eval_tree(rhs, x));
}
return total / xs.size();
}
double max_absolute_error(const ExprNode* lhs,
const ExprNode* rhs,
const std::vector<double>& xs) {
double best = 0.0;
for (double x : xs) {
best = std::max(best, std::abs(eval_tree(lhs, x) - eval_tree(rhs, x)));
}
return best;
}
} // namespace
// 程序流程:
// 1. 固定随机种子,生成一个可复现实验的目标函数;
// 2. 构造训练集与测试集;
// 3. 配置遗传算法参数并开始进化;
// 4. 输出恢复出来的表达式以及误差指标。
int main() {
std::ios::sync_with_stdio(false);
std::cin.tie(nullptr);
// const unsigned target_seed = 20260515U;
const unsigned target_seed = 5;
const unsigned evolution_seed = 20260516U;
std::mt19937 target_rng(target_seed);
// 先随机生成一个隐藏目标表达式,再从这个表达式采样出数据集。
auto target = generate_target_expression(target_rng);
auto train_samples = make_train_samples(target.get());
auto test_samples = make_test_samples(target.get());
std::cout << std::fixed << std::setprecision(6);
std::cout << "Symbolic regression demo based on genetic programming\n";
std::cout << "primitive set: +, -, *, /, sin, cos\n";
std::cout << "target seed: " << target_seed << '\n';
std::cout << "evolution seed: " << evolution_seed << '\n';
std::cout << "target expression: " << to_string(target.get()) << "\n\n";
std::cout << "training samples (only data passed to GA):\n";
for (const auto& sample : train_samples) {
std::cout << "x=" << std::setw(8) << sample.x << " y=" << std::setw(12) << sample.y
<< '\n';
}
std::cout << '\n';
// 这里的参数控制种群规模、迭代代数,以及交叉/变异强度。
SymbolicRegressionGA solver(train_samples, test_samples, evolution_seed);
solver.set_population_size(260);
solver.set_generations(360);
solver.set_elite_count(12);
solver.set_tournament_size(6);
solver.set_crossover_rate(0.92);
solver.set_mutation_rate(0.42);
solver.set_max_tree_depth(8);
solver.set_max_tree_nodes(42);
auto best = solver.run();
std::vector<double> compare_xs;
for (int i = 0; i < 81; ++i) {
compare_xs.push_back(-2.5 + 5.0 * i / 80.0);
}
std::cout << "\nrecovered expression: " << to_string(best.root.get()) << '\n';
std::cout << "train mse: " << best.train_mse << '\n';
std::cout << "test mse: " << best.test_mse << '\n';
std::cout << "mean abs error on dense grid: "
<< mean_absolute_error(target.get(), best.root.get(), compare_xs) << '\n';
std::cout << "max abs error on dense grid: "
<< max_absolute_error(target.get(), best.root.get(), compare_xs) << '\n';
std::cout << "\ncomparison on representative points:\n";
for (int i = 0; i <= 10; ++i) {
double x = -2.5 + 5.0 * i / 10.0;
double target_y = eval_tree(target.get(), x);
double predicted_y = eval_tree(best.root.get(), x);
std::cout << "x=" << std::setw(8) << x << " target=" << std::setw(12) << target_y
<< " predicted=" << std::setw(12) << predicted_y
<< " abs_diff=" << std::setw(12) << std::abs(target_y - predicted_y) << '\n';
}
return 0;
}