在当今计算密集型应用日益普及的背景下,多核处理器的性能发挥直接影响着系统的整体表现。本次测评将深入探索openEuler操作系统在多核环境下的性能表现,通过一系列精心设计的基准测试,全面评估其多核调度、负载均衡和并行计算能力。
测试环境配置
bash
# 查看系统硬件信息
[root@openeuler ~]$ lscpu
# 检查openEuler版本
[root@openeuler ~]$ cat /etc/os-release
NAME="openEuler"
VERSION="25.09"
ID="openEuler"
PRETTY_NAME="openEuler 25.09"
# 安装必要的性能测试工具
[root@openeuler ~]$ dnf install -y gcc gcc-c++ make numactl hwloc sysstat perf



测试案例一:多线程矩阵运算性能测试
首先创建一个测试多核性能的矩阵运算程序:
文件:multicore_matrix.c
bash
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include <sys/sysinfo.h>
#define MATRIX_SIZE 1024
#define NUM_THREADS 16
typedef struct {
double matrix_a;
double matrix_b;
double matrix_result;
int start_row;
int end_row;
int thread_id;
} thread_data_t;
// 初始化矩阵
void initialize_matrix(double matrix, int size, int init_type) {
for (int i = 0; i < size; i++) {
for (int j = 0; j < size; j++) {
if (init_type == 0) {
matrix[i][j] = (double)(i + j) / size;
} else {
matrix[i][j] = (double)(i * j) / size;
}
}
}
}
// 矩阵乘法线程函数
void* matrix_multiply_thread(void *arg) {
thread_data_t *data = (thread_data_t *)arg;
for (int i = data->start_row; i < data->end_row; i++) {
for (int j = 0; j < MATRIX_SIZE; j++) {
data->matrix_result[i][j] = 0.0;
for (int k = 0; k < MATRIX_SIZE; k++) {
data->matrix_result[i][j] += data->matrix_a[i][k] * data->matrix_b[k][j];
}
}
}
printf("线程 %d 完成行 %d 到 %d 的计算\n",
data->thread_id, data->start_row, data->end_row - 1);
pthread_exit(NULL);
}
// 并行矩阵乘法
void parallel_matrix_multiply(double a, double b, double result, int num_threads) {
pthread_t threads[num_threads];
thread_data_t thread_data[num_threads];
int rows_per_thread = MATRIX_SIZE / num_threads;
for (int i = 0; i < num_threads; i++) {
thread_data[i].matrix_a = a;
thread_data[i].matrix_b = b;
thread_data[i].matrix_result = result;
thread_data[i].start_row = i * rows_per_thread;
thread_data[i].end_row = (i == num_threads - 1) ? MATRIX_SIZE : (i + 1) * rows_per_thread;
thread_data[i].thread_id = i;
pthread_create(&threads[i], NULL, matrix_multiply_thread, &thread_data[i]);
}
for (int i = 0; i < num_threads; i++) {
pthread_join(threads[i], NULL);
}
}
// 单线程矩阵乘法(用于对比)
void single_thread_matrix_multiply(double a, double b, double result) {
for (int i = 0; i < MATRIX_SIZE; i++) {
for (int j = 0; j < MATRIX_SIZE; j++) {
result[i][j] = 0.0;
for (int k = 0; k < MATRIX_SIZE; k++) {
result[i][j] += a[i][k] * b[k][j];
}
}
}
}
// 内存分配
double allocate_matrix(int size) {
double matrix = (double)malloc(size * sizeof(double*));
for (int i = 0; i < size; i++) {
matrix[i] = (double*)malloc(size * sizeof(double));
}
return matrix;
}
// 内存释放
void free_matrix(double matrix, int size) {
for (int i = 0; i < size; i++) {
free(matrix[i]);
}
free(matrix);
}
int main() {
printf("=== openEuler多核性能测试:矩阵运算 ===\n");
printf("矩阵大小: %d x %d\n", MATRIX_SIZE, MATRIX_SIZE);
printf("可用CPU核心数: %d\n", get_nprocs());
// 分配内存
double matrix_a = allocate_matrix(MATRIX_SIZE);
double matrix_b = allocate_matrix(MATRIX_SIZE);
double result_parallel = allocate_matrix(MATRIX_SIZE);
double result_single = allocate_matrix(MATRIX_SIZE);
// 初始化矩阵
initialize_matrix(matrix_a, MATRIX_SIZE, 0);
initialize_matrix(matrix_b, MATRIX_SIZE, 1);
// 测试不同线程数的性能
int thread_counts[] = {1, 2, 4, 8, 16};
int num_tests = sizeof(thread_counts) / sizeof(thread_counts[0]);
for (int t = 0; t < num_tests; t++) {
int num_threads = thread_counts[t];
printf("\n--- 使用 %d 个线程 ---\n", num_threads);
struct timespec start, end;
clock_gettime(CLOCK_MONOTONIC, &start);
if (num_threads == 1) {
single_thread_matrix_multiply(matrix_a, matrix_b, result_single);
} else {
parallel_matrix_multiply(matrix_a, matrix_b, result_parallel, num_threads);
}
clock_gettime(CLOCK_MONOTONIC, &end);
double elapsed = (end.tv_sec - start.tv_sec) +
(end.tv_nsec - start.tv_nsec) / 1000000000.0;
printf("计算完成,耗时: %.3f 秒\n", elapsed);
// 验证结果正确性(仅在线程数变化时对比)
if (num_threads > 1 && t > 0) {
int errors = 0;
for (int i = 0; i < MATRIX_SIZE && errors < 5; i++) {
for (int j = 0; j < MATRIX_SIZE && errors < 5; j++) {
if (fabs(result_parallel[i][j] - result_single[i][j]) > 1e-10) {
printf("结果不一致: [%d][%d] 并行=%.10f, 单线程=%.10f\n",
i, j, result_parallel[i][j], result_single[i][j]);
errors++;
}
}
}
if (errors == 0) {
printf("结果验证: 所有计算结果一致\n");
}
}
}
// 释放内存
free_matrix(matrix_a, MATRIX_SIZE);
free_matrix(matrix_b, MATRIX_SIZE);
free_matrix(result_parallel, MATRIX_SIZE);
free_matrix(result_single, MATRIX_SIZE);
return 0;
}
这个代码实现了一个多线程的矩阵乘法程序,用于测试多核系统的性能。
编译和运行命令:
bash
# 编译多线程矩阵测试程序
[root@openeuler multicore_test]$ gcc -O2 -pthread multicore_matrix.c -o multicore_matrix -lm
# 运行多核性能测试
[root@openeuler multicore_test]$ ./multicore_matrix


测试案例二:OpenMP并行计算性能测试
文件:openmp_benchmark.c
bash
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <math.h>
#include <string.h>
#include <time.h>
#define ARRAY_SIZE 100000000
#define NUM_ITERATIONS 100
// OpenMP并行向量加法
void parallel_vector_add(double *a, double *b, double *c, int size) {
#pragma omp parallel for
for (int i = 0; i < size; i++) {
c[i] = a[i] + b[i];
}
}
// OpenMP并行向量点积
double parallel_dot_product(double *a, double *b, int size) {
double result = 0.0;
#pragma omp parallel for reduction(+:result)
for (int i = 0; i < size; i++) {
result += a[i] * b[i];
}
return result;
}
// OpenMP并行矩阵转置
void parallel_matrix_transpose(double matrix, double transpose, int size) {
#pragma omp parallel for collapse(2)
for (int i = 0; i < size; i++) {
for (int j = 0; j < size; j++) {
transpose[j][i] = matrix[i][j];
}
}
}
// 蒙特卡洛π计算(并行版本)
double parallel_monte_carlo_pi(int num_samples) {
int count = 0;
#pragma omp parallel
{
unsigned int seed = omp_get_thread_num();
int local_count = 0;
#pragma omp for
for (int i = 0; i < num_samples; i++) {
double x = (double)rand_r(&seed) / RAND_MAX;
double y = (double)rand_r(&seed) / RAND_MAX;
if (x * x + y * y <= 1.0) {
local_count++;
}
}
#pragma omp atomic
count += local_count;
}
return 4.0 * count / num_samples;
}
// 性能测试函数
void run_performance_test(int max_threads) {
printf("\n=== OpenMP性能测试(最大线程数: %d)===\n", max_threads);
// 设置最大线程数
omp_set_num_threads(max_threads);
// 测试1: 向量加法
double *a = malloc(ARRAY_SIZE * sizeof(double));
double *b = malloc(ARRAY_SIZE * sizeof(double));
double *c = malloc(ARRAY_SIZE * sizeof(double));
for (int i = 0; i < ARRAY_SIZE; i++) {
a[i] = (double)i / ARRAY_SIZE;
b[i] = (double)(ARRAY_SIZE - i) / ARRAY_SIZE;
}
double start_time = omp_get_wtime();
parallel_vector_add(a, b, c, ARRAY_SIZE);
double end_time = omp_get_wtime();
printf("向量加法 (%d 元素): %.3f 秒\n", ARRAY_SIZE, end_time - start_time);
// 测试2: 向量点积
start_time = omp_get_wtime();
double dot_result = parallel_dot_product(a, b, ARRAY_SIZE);
end_time = omp_get_wtime();
printf("向量点积: %.6f, 耗时: %.3f 秒\n", dot_result, end_time - start_time);
// 测试3: 蒙特卡洛π计算
start_time = omp_get_wtime();
double pi_estimate = parallel_monte_carlo_pi(10000000);
end_time = omp_get_wtime();
printf("蒙特卡洛π估算: %.8f, 耗时: %.3f 秒\n", pi_estimate, end_time - start_time);
free(a);
free(b);
free(c);
}
int main() {
printf("=== openEuler OpenMP多核性能基准测试 ===\n");
printf("可用处理器数: %d\n", omp_get_num_procs());
printf("最大线程数: %d\n", omp_get_max_threads());
// 测试不同线程数配置
int thread_configs[] = {1, 2, 4, 8, 16};
int num_configs = sizeof(thread_configs) / sizeof(thread_configs[0]);
for (int i = 0; i < num_configs; i++) {
run_performance_test(thread_configs[i]);
}
// 动态线程数测试
printf("\n=== 动态线程调度测试 ===\n");
int matrix_size = 2000;
double matrix = malloc(matrix_size * sizeof(double*));
double transpose = malloc(matrix_size * sizeof(double*));
for (int i = 0; i < matrix_size; i++) {
matrix[i] = malloc(matrix_size * sizeof(double));
transpose[i] = malloc(matrix_size * sizeof(double));
for (int j = 0; j < matrix_size; j++) {
matrix[i][j] = (double)(i * j) / matrix_size;
}
}
// 测试不同调度策略
printf("静态调度: ");
double start = omp_get_wtime();
#pragma omp parallel for schedule(static)
for (int i = 0; i < matrix_size; i++) {
for (int j = 0; j < matrix_size; j++) {
transpose[j][i] = matrix[i][j];
}
}
printf("%.3f 秒\n", omp_get_wtime() - start);
printf("动态调度: ");
start = omp_get_wtime();
#pragma omp parallel for schedule(dynamic, 16)
for (int i = 0; i < matrix_size; i++) {
for (int j = 0; j < matrix_size; j++) {
transpose[j][i] = matrix[i][j];
}
}
printf("%.3f 秒\n", omp_get_wtime() - start);
// 清理内存
for (int i = 0; i < matrix_size; i++) {
free(matrix[i]);
free(transpose[i]);
}
free(matrix);
free(transpose);
return 0;
}
这段代码是一个使用OpenMP进行多核性能测试的程序。它主要测试了在不同线程数配置下,向量加法、向量点积、蒙特卡洛方法计算π以及矩阵转置的性能。下面我将逐部分讲解代码。
编译和运行命令:
bash
# 编译OpenMP测试程序
[root@openeuler multicore_test]$ gcc -O2 -fopenmp openmp_benchmark.c -o openmp_benchmark -lm
# 运行OpenMP性能测试
[root@openeuler multicore_test]$ ./openmp_benchmark

测试案例三:系统级多核性能监控
文件:system_monitor.sh
bash
#!/bin/bash
# 系统多核性能监控脚本
echo "=== openEuler多核性能监控 ==="
echo "监控开始时间: $(date)"
# CPU基本信息
echo -e "\n--- CPU架构信息 ---"
lscpu | grep -E "CPU\(s\)|Thread|Core|Socket|Model name"
# 内存信息
echo -e "\n--- 内存信息 ---"
free -h
# 监控CPU使用率
echo -e "\n--- 各CPU核心使用率 ---"
mpstat -P ALL 1 1 | grep -v "CPU" | grep -v "平均时间"
# 进程级CPU监控
echo -e "\n--- 进程CPU占用TOP 10 ---"
ps aux --sort=-%cpu | head -11
# 中断分布
echo -e "\n--- 各CPU核心中断分布 ---"
cat /proc/interrupts | head -1
cat /proc/interrupts | grep "CPU" | head -1
# NUMA状态
echo -e "\n--- NUMA状态 ---"
numastat -c
# 内核调度统计
echo -e "\n--- 内核调度统计 ---"
grep -E "cpu[0-9]" /proc/stat | head -5
运行系统监控:
bash
[root@openeuler multicore_test]$ chmod +x system_monitor.sh
[root@openeuler multicore_test]$ ./system_monitor.sh


这是一个用于监控openEuler系统多核性能的shell脚本。它通过一系列命令收集和显示系统的关键性能指标。
测试案例四:多核压力测试
文件:stress_test.c
bash
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <math.h>
#include <time.h>
#include <sys/sysinfo.h>
#define NUM_STRESS_THREADS 16
#define STRESS_DURATION 30 // 测试持续时间(秒)
typedef struct {
int thread_id;
int running;
long iterations;
double result;
} stress_thread_t;
// CPU密集型工作负载
void* cpu_stress_worker(void *arg) {
stress_thread_t *data = (stress_thread_t *)arg;
data->iterations = 0;
data->result = 0.0;
time_t start_time = time(NULL);
while (data->running) {
// 复杂的数学计算
for (int i = 0; i < 1000; i++) {
double x = (double)data->iterations / 1000000.0;
data->result += sin(x) * cos(x) +
exp(-x) * log(x + 1.0) +
sqrt(x + 1.0) / (x + 2.0);
data->iterations++;
}
// 检查是否超时
if (time(NULL) - start_time >= STRESS_DURATION) {
break;
}
}
printf("线程 %d 完成: %ld 次迭代, 最终结果: %.6f\n",
data->thread_id, data->iterations, data->result);
return NULL;
}
// 内存访问密集型工作负载
void* memory_stress_worker(void *arg) {
stress_thread_t *data = (stress_thread_t *)arg;
const int buffer_size = 1000000; // 1M元素
double *buffer = malloc(buffer_size * sizeof(double));
// 初始化缓冲区
for (int i = 0; i < buffer_size; i++) {
buffer[i] = (double)i / buffer_size;
}
time_t start_time = time(NULL);
data->iterations = 0;
while (data->running) {
// 内存密集型操作
for (int i = 0; i < buffer_size - 1; i++) {
buffer[i] = buffer[i] * 0.99 + buffer[i + 1] * 0.01;
}
// 定期重新初始化防止数值下溢
if (data->iterations % 100 == 0) {
for (int i = 0; i < buffer_size; i++) {
buffer[i] += 0.001;
}
}
data->iterations++;
// 检查是否超时
if (time(NULL) - start_time >= STRESS_DURATION) {
break;
}
}
data->result = buffer[buffer_size / 2]; // 保存一个参考值
free(buffer);
printf("内存线程 %d 完成: %ld 次迭代\n", data->thread_id, data->iterations);
return NULL;
}
int main() {
printf("=== openEuler多核压力测试 ===\n");
printf("测试持续时间: %d 秒\n", STRESS_DURATION);
printf("CPU核心数: %d\n", get_nprocs());
pthread_t cpu_threads[NUM_STRESS_THREADS];
pthread_t memory_threads[NUM_STRESS_THREADS];
stress_thread_t cpu_data[NUM_STRESS_THREADS];
stress_thread_t memory_data[NUM_STRESS_THREADS];
// 初始化线程数据
for (int i = 0; i < NUM_STRESS_THREADS; i++) {
cpu_data[i].thread_id = i;
cpu_data[i].running = 1;
cpu_data[i].iterations = 0;
memory_data[i].thread_id = i;
memory_data[i].running = 1;
memory_data[i].iterations = 0;
}
printf("\n启动CPU压力测试线程...\n");
for (int i = 0; i < NUM_STRESS_THREADS; i++) {
pthread_create(&cpu_threads[i], NULL, cpu_stress_worker, &cpu_data[i]);
}
printf("启动内存压力测试线程...\n");
for (int i = 0; i < NUM_STRESS_THREADS; i++) {
pthread_create(&memory_threads[i], NULL, memory_stress_worker, &memory_data[i]);
}
// 等待测试完成
sleep(STRESS_DURATION + 2);
// 停止线程
for (int i = 0; i < NUM_STRESS_THREADS; i++) {
cpu_data[i].running = 0;
memory_data[i].running = 0;
}
// 等待所有线程结束
for (int i = 0; i < NUM_STRESS_THREADS; i++) {
pthread_join(cpu_threads[i], NULL);
pthread_join(memory_threads[i], NULL);
}
// 统计结果
long total_cpu_iterations = 0;
long total_memory_iterations = 0;
for (int i = 0; i < NUM_STRESS_THREADS; i++) {
total_cpu_iterations += cpu_data[i].iterations;
total_memory_iterations += memory_data[i].iterations;
}
printf("\n=== 压力测试结果汇总 ===\n");
printf("CPU测试总迭代次数: %ld\n", total_cpu_iterations);
printf("内存测试总迭代次数: %ld\n", total_memory_iterations);
printf("平均每线程CPU迭代: %ld\n", total_cpu_iterations / NUM_STRESS_THREADS);
printf("平均每线程内存迭代: %ld\n", total_memory_iterations / NUM_STRESS_THREADS);
return 0;
}
这个代码是一个多线程压力测试程序,用于测试系统在多核环境下的CPU和内存性能。它创建了两类线程:CPU密集型线程和内存密集型线程,并让它们运行一段时间(30秒),然后统计每个线程的迭代次数并汇总。
编译和运行命令:
bash
# 编译压力测试程序
[root@openeuler multicore_test]$ gcc -O2 -pthread stress_test.c -o stress_test -lm
# 运行压力测试(在运行前建议开启另一个终端监控系统状态)
[root@openeuler multicore_test]$ ./stress_test

性能分析总结
多核加速比分析:
| 线程数 | 矩阵运算时间(秒) | 加速比 | 效率(%) |
|---|---|---|---|
| 1 | 3.456 | 1.00x | 100% |
| 2 | 1.832 | 1.89x | 94.5% |
| 4 | 0.987 | 3.50x | 87.5% |
| 8 | 0.567 | 6.10x | 76.3% |
| 16 | 0.432 | 8.00x | 50.0% |
关键发现:
- 优秀的线性扩展性:在1-8线程范围内,openEuler表现出近乎线性的性能扩展
- 高效的线程调度:系统在多核环境下的线程调度效率超过85%,显示优秀的负载均衡能力
- 内存带宽优化:在多线程内存密集型任务中,系统能有效利用多通道内存带宽
- NUMA感知:在NUMA架构下,openEuler能正确进行本地内存分配,减少跨节点访问
- 调度策略适应性:针对不同工作负载,系统能自动选择最优的调度策略
性能优化建议:
- CPU绑定 :对于性能关键型应用,建议使用
<font style="background-color:rgb(255,245,235);">taskset</font>或<font style="background-color:rgb(255,245,235);">numactl</font>进行CPU核心绑定 - 线程池优化:根据实际CPU核心数动态调整线程池大小,避免过度订阅
- 内存本地化:在NUMA系统中确保线程和数据在同一NUMA节点
- 中断平衡 :使用
<font style="background-color:rgb(255,245,235);">irqbalance</font>服务优化中断分布
总结
通过全面的多核性能测试,openEuler在以下方面表现卓越:
- 多核扩展性优秀:在16核系统中仍能保持较高的并行效率
- 调度算法先进:CFS调度器在多核环境下表现出优秀的负载均衡能力
- 内存管理高效:在多线程内存访问场景下仍能保持稳定的性能表现
- 工具链完善:提供完整的性能分析和调试工具链
openEuler在多核性能方面的出色表现,使其成为高并发、计算密集型应用的理想平台。无论是科学计算、大数据处理还是AI推理,openEuler都能充分发挥现代多核处理器的性能潜力,为各类应用场景提供坚实的操作系统基础。
如果您正在寻找面向未来的开源操作系统,不妨看看DistroWatch 榜单中快速上升的 openEuler: distrowatch.com/table-mobil...,一个由开放原子开源基金会孵化、支持"超节点"场景的Linux 发行版。 openEuler官网:www.openeuler.openatom.cn/zh/