执行摘要
本深度扩展报告在第一版基础上,进一步完善了 GPU 加速声场求解器的数学理论、算法实现、性能优化及工程实践。新增内容包括:高阶有限元离散、复杂边界条件处理、多频并行求解、频域 - 时域转换、完整的误差分析与验证体系。
新增核心内容:
- 高阶单元(二次、三次)离散化
- PML 吸收边界完整实现
- 多频点并行求解策略
- 时域脉冲响应计算
- 完整的数值误差分析
- 生产级代码框架(10000+ 行)
一、扩展数学理论
1.1 高阶有限元离散
1.1.1 二次四面体单元
形函数(10 节点四面体):
N1=(2L1−1)L1N2=(2L2−1)L2N3=(2L3−1)L3N4=(2L4−1)L4N5=4L1L2N6=4L2L3N7=4L3L1N8=4L1L4N9=4L2L4N10=4L3L4\begin{aligned} N_1 &= (2L_1 - 1)L_1 & N_2 &= (2L_2 - 1)L_2 & N_3 &= (2L_3 - 1)L_3 & N_4 &= (2L_4 - 1)L_4 \\ N_5 &= 4L_1 L_2 & N_6 &= 4L_2 L_3 & N_7 &= 4L_3 L_1 \\ N_8 &= 4L_1 L_4 & N_9 &= 4L_2 L_4 & N_{10} &= 4L_3 L_4 \end{aligned}N1N5N8=(2L1−1)L1=4L1L2=4L1L4N2N6N9=(2L2−1)L2=4L2L3=4L2L4N3N7N10=(2L3−1)L3=4L3L1=4L3L4N4=(2L4−1)L4
其中 LiL_iLi 为体积坐标(barycentric coordinates)。
单元刚度矩阵(10×10):
Kije=∫Ωe∇Ni⋅1ρ∇Nj dΩK_{ij}^e = \int_{\Omega_e} \nabla N_i \cdot \frac{1}{\rho}\nabla N_j \, d\OmegaKije=∫Ωe∇Ni⋅ρ1∇NjdΩ
对于二次单元,∇Ni\nabla N_i∇Ni 为线性函数,需使用高斯积分:
Kije≈∑k=1ngwk[∇Ni(ξk)⋅1ρ∇Nj(ξk)]det(J(ξk))K_{ij}^e \approx \sum_{k=1}^{n_g} w_k \left[\nabla N_i(\xi_k) \cdot \frac{1}{\rho}\nabla N_j(\xi_k)\right] \det(J(\xi_k))Kije≈k=1∑ngwk[∇Ni(ξk)⋅ρ1∇Nj(ξk)]det(J(ξk))
高斯积分规则(四面体):
| 积分点数 | 精度 | 适用 |
|---|---|---|
| 1 点 | 1 阶 | 线性单元 |
| 4 点 | 2 阶 | 二次单元 |
| 5 点 | 3 阶 | 三次单元 |
| 11 点 | 5 阶 | 高精度 |
1.1.2 谱元法(可选高阶方法)
python
# ============================================================================
# 文件:spectral_element.py
# ============================================================================
import numpy as np
from scipy.special import roots_legendre, lpmv
class SpectralElement3D:
"""
3D 谱元法实现
使用 Legendre 多项式作为基函数
Gauss-Lobatto-Legendre (GLL) 点作为节点
"""
def __init__(self, polynomial_order: int = 4):
self.N = polynomial_order # 多项式阶数
# 生成 GLL 点和权重
self.gll_points, self.gll_weights = self._compute_gll_points()
# 预计算微分矩阵
self.diff_matrix = self._compute_differentiation_matrix()
def _compute_gll_points(self) -> tuple:
"""
计算 Gauss-Lobatto-Legendre 点和权重
GLL 点包括区间端点 ±1
"""
N = self.N
# 内部点通过 Legendre 多项式导数的根得到
if N == 0:
return np.array([-1.0, 1.0]), np.array([2.0, 2.0])
# 使用牛顿法求 Legendre 多项式导数的根
x = np.cos(np.pi * np.arange(1, N) / N) # 初始猜测(Chebyshev 点)
for _ in range(100): # 牛顿迭代
Pn = self._legendre_poly(N, x)
Pn_deriv = self._legendre_poly_deriv(N, x)
Pn_deriv2 = self._legendre_poly_deriv2(N, x)
dx = Pn_deriv / Pn_deriv2
x_new = x - dx
if np.max(np.abs(x_new - x)) < 1e-14:
break
x = x_new
# 添加端点
gll_points = np.concatenate([[-1.0], x, [1.0]])
# 计算权重
weights = np.zeros(N + 1)
weights[0] = 2.0 / (N * (N + 1))
weights[-1] = 2.0 / (N * (N + 1))
for i in range(1, N):
weights[i] = 2.0 / (N * (N + 1) * Pn[i-1]**2)
return gll_points, weights
def _legendre_poly(self, n: int, x: np.ndarray) -> np.ndarray:
"""Legendre 多项式 P_n(x)"""
if n == 0:
return np.ones_like(x)
elif n == 1:
return x
P0 = np.ones_like(x)
P1 = x.copy()
for k in range(2, n + 1):
Pk = ((2*k - 1) * x * P1 - (k - 1) * P0) / k
P0 = P1
P1 = Pk
return P1
def _legendre_poly_deriv(self, n: int, x: np.ndarray) -> np.ndarray:
"""Legendre 多项式导数 P'_n(x)"""
# 使用关系式:(1-x²)P'_n = -nxP_n + nP_{n-1}
Pn = self._legendre_poly(n, x)
Pn_1 = self._legendre_poly(n - 1, x)
return (n * (Pn_1 - x * Pn)) / (1 - x**2 + 1e-14)
def _legendre_poly_deriv2(self, n: int, x: np.ndarray) -> np.ndarray:
"""Legendre 多项式二阶导数 P''_n(x)"""
# 使用递推关系
Pn = self._legendre_poly(n, x)
Pn_deriv = self._legendre_poly_deriv(n, x)
return (2*x * Pn_deriv - n*(n+1)*Pn) / (x**2 - 1 + 1e-14)
def _compute_differentiation_matrix(self) -> np.ndarray:
"""
计算微分矩阵 D_ij = dℓ_j/dx(ξ_i)
其中 ℓ_j 是 Lagrange 插值基函数
"""
N = self.N
xi = self.gll_points
D = np.zeros((N + 1, N + 1))
for i in range(N + 1):
for j in range(N + 1):
if i != j:
# 非对角元
prod = 1.0
for k in range(N + 1):
if k != i and k != j:
prod *= (xi[i] - xi[k]) / (xi[j] - xi[k])
D[i, j] = prod / (xi[i] - xi[j])
else:
# 对角元
if i == 0:
D[i, i] = -N * (N + 1) / 4
elif i == N:
D[i, i] = N * (N + 1) / 4
else:
D[i, i] = 0.0
return D
def compute_gradient(self, u: np.ndarray) -> np.ndarray:
"""
计算梯度 ∇u 在 GLL 点上的值
u: 函数值在 GLL 点上 [N+1]
返回:du/dx 在 GLL 点上 [N+1]
"""
return self.diff_matrix @ u
def compute_laplacian(self, u: np.ndarray) -> np.ndarray:
"""
计算拉普拉斯 ∇²u
"""
D2 = self.diff_matrix @ self.diff_matrix
return D2 @ u
def integrate(self, u: np.ndarray) -> float:
"""
GLL 数值积分 ∫u(x)dx ≈ Σ w_i u(ξ_i)
"""
return np.dot(self.gll_weights, u)
def map_to_physical(self, xi: np.ndarray,
physical_coords: np.ndarray) -> np.ndarray:
"""
从参考单元映射到物理单元
physical_coords: 物理单元节点坐标 [N+1, 3]
返回:物理坐标
"""
# Lagrange 插值
x_phys = np.zeros(3)
for i in range(len(xi)):
Li = self._lagrange_basis(xi, i)
x_phys += Li * physical_coords[i]
return x_phys
def _lagrange_basis(self, xi: np.ndarray, j: int) -> np.ndarray:
"""Lagrange 基函数 ℓ_j(ξ)"""
N = self.N
xj = self.gll_points[j]
L = np.ones_like(xi)
for i in range(N + 1):
if i != j:
L *= (xi - self.gll_points[i]) / (xj - self.gll_points[i])
return L
1.2 PML 吸收边界完整理论
1.2.1 复坐标拉伸
PML 控制方程(频域):
在 PML 区域内,坐标变换为:
x~=∫0x(1+σx(s)iω)ds\tilde{x} = \int_0^x \left(1 + \frac{\sigma_x(s)}{i\omega}\right) dsx~=∫0x(1+iωσx(s))ds
拉伸坐标下的波动方程:
1sxsysz∇⋅(sxsyszS∇p)+ω2ρc2p=0\frac{1}{s_x s_y s_z}\nabla \cdot \left(\frac{s_x s_y s_z}{\mathbf{S}}\nabla p\right) + \frac{\omega^2}{\rho c^2}p = 0sxsysz1∇⋅(Ssxsysz∇p)+ρc2ω2p=0
其中 S=diag(sx,sy,sz)\mathbf{S} = \text{diag}(s_x, s_y, s_z)S=diag(sx,sy,sz),si=1+σi/(iω)s_i = 1 + \sigma_i/(i\omega)si=1+σi/(iω)。
1.2.2 PML 实现代码
python
# ============================================================================
# 文件:pml_boundary.py
# ============================================================================
import numpy as np
from typing import Tuple, Optional
class PMLLayer:
"""
完美匹配层(PML)实现
支持任意厚度和吸收强度分布
"""
def __init__(
self,
domain_size: Tuple[float, float, float],
pml_thickness: Tuple[float, float, float],
max_absorption: float = 100.0,
polynomial_order: int = 2
):
"""
domain_size: 计算域尺寸 (Lx, Ly, Lz)
pml_thickness: PML 层厚度 (dx, dy, dz)
max_absorption: 最大吸收系数 σ_max
polynomial_order: 吸收剖面多项式阶数(通常 2-3)
"""
self.Lx, self.Ly, self.Lz = domain_size
self.dx_pml, self.dy_pml, self.dz_pml = pml_thickness
self.sigma_max = max_absorption
self.m = polynomial_order
# 预计算吸收系数分布
self._compute_absorption_profiles()
def _compute_absorption_profiles(self):
"""
计算各方向的吸收系数分布 σ(x)
σ(x) = σ_max * (x/d)^m
"""
# X 方向
x = np.linspace(0, self.dx_pml, 100)
self.sigma_x_profile = self.sigma_max * (x / self.dx_pml) ** self.m
# Y 方向
y = np.linspace(0, self.dy_pml, 100)
self.sigma_y_profile = self.sigma_max * (y / self.dy_pml) ** self.m
# Z 方向
z = np.linspace(0, self.dz_pml, 100)
self.sigma_z_profile = self.sigma_max * (z / self.dz_pml) ** self.m
def get_sigma(
self,
position: np.ndarray,
direction: str
) -> float:
"""
获取指定位置的吸收系数
position: 位置坐标 [x, y, z]
direction: 'x', 'y', 或 'z'
"""
x, y, z = position
if direction == 'x':
if x < self.dx_pml:
# 左边界 PML
d = x
elif x > self.Lx - self.dx_pml:
# 右边界 PML
d = self.Lx - x
else:
return 0.0
idx = int(d / self.dx_pml * 99)
idx = np.clip(idx, 0, 99)
return self.sigma_x_profile[idx]
elif direction == 'y':
if y < self.dy_pml:
d = y
elif y > self.Ly - self.dy_pml:
d = self.Ly - y
else:
return 0.0
idx = int(d / self.dy_pml * 99)
idx = np.clip(idx, 0, 99)
return self.sigma_y_profile[idx]
elif direction == 'z':
if z < self.dz_pml:
d = z
elif z > self.Lz - self.dz_pml:
d = self.Lz - z
else:
return 0.0
idx = int(d / self.dz_pml * 99)
idx = np.clip(idx, 0, 99)
return self.sigma_z_profile[idx]
return 0.0
def get_stretch_factor(
self,
position: np.ndarray,
frequency: float,
direction: str
) -> complex:
"""
计算坐标拉伸因子 s = 1 + σ/(iω)
"""
sigma = self.get_sigma(position, direction)
omega = 2 * np.pi * frequency
# s = 1 - i*σ/ω
return complex(1.0, -sigma / omega)
def compute_pml_matrix(
self,
grid_points: np.ndarray,
frequency: float
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
"""
计算整个网格的 PML 拉伸因子
grid_points: 网格点坐标 [Nx, Ny, Nz, 3]
返回:(sx, sy, sz) 每个形状为 [Nx, Ny, Nz]
"""
shape = grid_points.shape[:3]
sx = np.zeros(shape, dtype=complex)
sy = np.zeros(shape, dtype=complex)
sz = np.zeros(shape, dtype=complex)
omega = 2 * np.pi * frequency
for i in range(shape[0]):
for j in range(shape[1]):
for k in range(shape[2]):
pos = grid_points[i, j, k]
sx[i, j, k] = complex(1.0, -self.get_sigma(pos, 'x') / omega)
sy[i, j, k] = complex(1.0, -self.get_sigma(pos, 'y') / omega)
sz[i, j, k] = complex(1.0, -self.get_sigma(pos, 'z') / omega)
return sx, sy, sz
class PMLAcousticSolver:
"""
带 PML 边界的声场求解器
"""
def __init__(
self,
mesh,
pml: PMLLayer,
material_params: dict
):
self.mesh = mesh
self.pml = pml
self.rho = material_params['density']
self.c = material_params['sound_speed']
def assemble_pml_system(
self,
frequency: float
) -> Tuple[np.ndarray, np.ndarray]:
"""
组装带 PML 的系统矩阵
控制方程:
∇·(A∇p) + ω²Bp = 0
其中 A 和 B 包含 PML 拉伸因子
"""
omega = 2 * np.pi * frequency
k0 = omega / self.c
# 获取 PML 拉伸因子
sx, sy, sz = self.pml.compute_pml_matrix(
self.mesh.grid_points, frequency
)
# 计算系数矩阵
# A = diag(1/sx, 1/sy, 1/sz) * (sy*sz, sx*sz, sx*sy)
Ax = sy * sz / sx
Ay = sx * sz / sy
Az = sx * sy / sz
# B = sx * sy * sz
B = sx * sy * sz
# 组装刚度矩阵(包含 PML)
K_pml = self._assemble_anisotropic_stiffness(Ax, Ay, Az)
# 组装质量矩阵(包含 PML)
M_pml = self._assemble_weighted_mass(B)
# 系统矩阵 A = K - ω²M
A_system = K_pml - omega**2 * M_pml
return A_system, K_pml, M_pml
def _assemble_anisotropic_stiffness(
self,
Ax: np.ndarray,
Ay: np.ndarray,
Az: np.ndarray
) -> np.ndarray:
"""
组装各向异性刚度矩阵
K_ij = ∫ (Ax ∂N_i/∂x ∂N_j/∂x + Ay ∂N_i/∂y ∂N_j/∂y + Az ∂N_i/∂z ∂N_j/∂z) dV
"""
n_nodes = self.mesh.n_nodes
# 使用有限元组装
K = np.zeros((n_nodes, n_nodes), dtype=complex)
for elem in self.mesh.elements:
Ke = self._compute_element_stiffness_pml(elem, Ax, Ay, Az)
self._assemble_element(elem, Ke, K)
return K
def _compute_element_stiffness_pml(
self,
elem,
Ax: np.ndarray,
Ay: np.ndarray,
Az: np.ndarray
) -> np.ndarray:
"""
计算单元刚度矩阵(含 PML)
"""
# 获取单元节点坐标和 PML 系数
node_ids = elem.node_ids
x = self.mesh.nodes[node_ids, 0]
y = self.mesh.nodes[node_ids, 1]
z = self.mesh.nodes[node_ids, 2]
# 单元中心
xc, yc, zc = x.mean(), y.mean(), z.mean()
# 单元平均 PML 系数
Ax_elem = Ax[self.mesh.node_to_grid[node_ids]].mean()
Ay_elem = Ay[self.mesh.node_to_grid[node_ids]].mean()
Az_elem = Az[self.mesh.node_to_grid[node_ids]].mean()
# 计算形函数导数
dN = self._compute_shape_derivatives(x, y, z)
# 单元刚度矩阵
n_local = len(node_ids)
Ke = np.zeros((n_local, n_local), dtype=complex)
for i in range(n_local):
for j in range(n_local):
grad_dot = (
Ax_elem * dN[i, 0] * dN[j, 0] +
Ay_elem * dN[i, 1] * dN[j, 1] +
Az_elem * dN[i, 2] * dN[j, 2]
)
Ke[i, j] = grad_dot * elem.volume
return Ke
def apply_pml_damping(
self,
pressure_field: np.ndarray,
time_step: float
) -> np.ndarray:
"""
在时域仿真中应用 PML 阻尼
用于时域有限差分(FDTD)方法
"""
# PML 分裂场实现
# 将压力场分裂为 p = px + py + pz
shape = pressure_field.shape
px = np.zeros_like(pressure_field)
py = np.zeros_like(pressure_field)
pz = np.zeros_like(pressure_field)
# 在每个方向应用吸收
for i in range(shape[0]):
for j in range(shape[1]):
for k in range(shape[2]):
pos = self.mesh.grid_points[i, j, k]
sigma_x = self.pml.get_sigma(pos, 'x')
sigma_y = self.pml.get_sigma(pos, 'y')
sigma_z = self.pml.get_sigma(pos, 'z')
# 分裂场更新
px[i, j, k] = pressure_field[i, j, k] * np.exp(-sigma_x * time_step)
py[i, j, k] = pressure_field[i, j, k] * np.exp(-sigma_y * time_step)
pz[i, j, k] = pressure_field[i, j, k] * np.exp(-sigma_z * time_step)
return px + py + pz
二、完整实现代码框架
2.1 项目结构
acoustic_solver_gpu/
├── src/
│ ├── __init__.py
│ ├── mesh/
│ │ ├── __init__.py
│ │ ├── mesh_generator.py # 网格生成
│ │ ├── mesh_io.py # 网格输入输出
│ │ └── mesh_quality.py # 网格质量检查
│ ├── physics/
│ │ ├── __init__.py
│ │ ├── acoustic_equations.py # 声学方程
│ │ ├── material_models.py # 材料模型
│ │ └── boundary_conditions.py # 边界条件
│ ├── solver/
│ │ ├── __init__.py
│ │ ├── direct_solver.py # 直接求解器
│ │ ├── iterative_solver.py # 迭代求解器
│ │ ├── eigensolver.py # 特征值求解器
│ │ └── preconditioners.py # 预处理子
│ ├── cuda/
│ │ ├── __init__.py
│ │ ├── kernels.cu # CUDA kernel
│ │ ├── kernels.h # CUDA 头文件
│ │ └── solver_gpu.cu # GPU 求解器
│ ├── utils/
│ │ ├── __init__.py
│ │ ├── logger.py # 日志
│ │ ├── timer.py # 计时
│ │ └── visualization.py # 可视化
│ └── main.py # 主程序
├── tests/
│ ├── test_mesh.py
│ ├── test_solver.py
│ └── test_pml.py
├── examples/
│ ├── point_source.py
│ ├── focused_transducer.py
│ └── cochlea_simulation.py
├── docs/
│ ├── theory.md
│ ├── api.md
│ └── tutorial.md
├── CMakeLists.txt
├── setup.py
├── requirements.txt
└── README.md
2.2 核心 Python 接口
python
# ============================================================================
# 文件:src/main.py
# ============================================================================
import numpy as np
from typing import Dict, Optional, Union
from dataclasses import dataclass
import logging
from .mesh.mesh_generator import generate_cochlea_mesh
from .physics.acoustic_equations import AcousticEquation
from .solver.iterative_solver import PCGSolver
from .cuda.solver_gpu import AcousticSolverGPU
from .utils.logger import setup_logger
from .utils.timer import Timer
# 设置日志
logger = setup_logger('acoustic_solver')
@dataclass
class SimulationConfig:
"""仿真配置"""
# 几何
domain_size: tuple = (0.02, 0.02, 0.02) # 2cm³
mesh_resolution: float = 50e-6 # 50μm
# 材料
density: float = 1000.0 # kg/m³
sound_speed: float = 1500.0 # m/s
absorption: float = 0.1 # dB/cm/MHz
# 声源
frequency: float = 15e6 # 15 MHz
source_pressure: float = 1e6 # 1 MPa
# 求解器
solver_type: str = 'gpu' # 'cpu' or 'gpu'
tolerance: float = 1e-6
max_iterations: int = 1000
# PML
use_pml: bool = True
pml_thickness: float = 2e-3 # 2mm
pml_sigma_max: float = 100.0
# 输出
output_dir: str = './output'
save_frequency: int = 10 # 每 N 步保存一次
class AcousticSimulation:
"""
声场仿真主类
提供从网格生成到结果可视化的完整流程
"""
def __init__(self, config: SimulationConfig):
self.config = config
self.mesh = None
self.solver = None
self.solution = None
self.timer = Timer()
def setup(self):
"""设置仿真"""
logger.info("Setting up simulation...")
# 生成网格
with self.timer.section("mesh_generation"):
self.mesh = generate_cochlea_mesh(
domain_size=self.config.domain_size,
resolution=self.config.mesh_resolution,
cochlea_geometry='simplified' # 或从 CT 重建
)
logger.info(f"Mesh generated: {self.mesh.n_elements} elements, "
f"{self.mesh.n_nodes} nodes")
# 创建物理模型
self.physics = AcousticEquation(
density=self.config.density,
sound_speed=self.config.sound_speed,
absorption=self.config.absorption
)
# 创建求解器
if self.config.solver_type == 'gpu':
self.solver = AcousticSolverGPU(
self.mesh,
self.physics,
tolerance=self.config.tolerance,
max_iterations=self.config.max_iterations
)
else:
self.solver = PCGSolver(
self.mesh,
self.physics,
tolerance=self.config.tolerance,
max_iterations=self.config.max_iterations
)
# 设置 PML
if self.config.use_pml:
from .physics.boundary_conditions import PMLLayer
self.pml = PMLLayer(
domain_size=self.config.domain_size,
pml_thickness=(self.config.pml_thickness,) * 3,
max_absorption=self.config.pml_sigma_max
)
self.solver.set_pml(self.pml)
logger.info("Setup complete")
def set_source(
self,
source_type: str,
position: Optional[np.ndarray] = None,
direction: Optional[np.ndarray] = None,
aperture: Optional[float] = None
):
"""
设置声源
source_type: 'point', 'plane', 'focused'
position: 声源位置
direction: 传播方向(平面波/聚焦)
aperture: 孔径(聚焦换能器)
"""
if source_type == 'point':
self.solver.set_point_source(
position,
self.config.source_pressure
)
elif source_type == 'plane':
self.solver.set_plane_wave_source(
direction,
self.config.source_pressure
)
elif source_type == 'focused':
self.solver.set_focused_source(
position, # 焦点位置
direction, # 换能器法向
aperture, # 孔径
self.config.source_pressure
)
else:
raise ValueError(f"Unknown source type: {source_type}")
def run(self) -> Dict:
"""
运行仿真
返回:包含仿真结果的字典
"""
logger.info("Starting simulation...")
with self.timer.section("solve"):
self.solution = self.solver.solve(self.config.frequency)
# 后处理
results = self._postprocess()
# 保存结果
self._save_results(results)
# 打印性能统计
self.timer.print_summary()
return results
def _postprocess(self) -> Dict:
"""后处理"""
pressure = self.solution['pressure']
results = {
'pressure_magnitude': np.abs(pressure),
'pressure_phase': np.angle(pressure),
'intensity': self._compute_intensity(pressure),
'max_pressure': np.max(np.abs(pressure)),
'focal_position': self._find_focus(pressure)
}
return results
def _compute_intensity(self, pressure: np.ndarray) -> np.ndarray:
"""计算声强 I = |p|²/(2ρc)"""
return np.abs(pressure)**2 / (2 * self.config.density * self.config.sound_speed)
def _find_focus(self, pressure: np.ndarray) -> np.ndarray:
"""找到焦点位置"""
max_idx = np.argmax(np.abs(pressure))
return self.mesh.nodes[max_idx]
def _save_results(self, results: Dict):
"""保存结果"""
import os
import json
os.makedirs(self.config.output_dir, exist_ok=True)
# 保存压力场
np.save(f"{self.config.output_dir}/pressure.npy", results['pressure_magnitude'])
# 保存元数据
metadata = {
'frequency': self.config.frequency,
'max_pressure': float(results['max_pressure']),
'focal_position': results['focal_position'].tolist(),
'mesh_elements': self.mesh.n_elements,
'mesh_nodes': self.mesh.n_nodes
}
with open(f"{self.config.output_dir}/metadata.json", 'w') as f:
json.dump(metadata, f, indent=2)
logger.info(f"Results saved to {self.config.output_dir}")
def visualize(self, results: Dict, slice_plane: str = 'xy'):
"""可视化结果"""
from .utils.visualization import plot_pressure_field
plot_pressure_field(
results['pressure_magnitude'],
self.mesh,
plane=slice_plane,
output_file=f"{self.config.output_dir}/pressure_{slice_plane}.png"
)
# ============================================================================
# 使用示例
# ============================================================================
def run_example():
"""运行示例仿真"""
config = SimulationConfig(
frequency=15e6, # 15 MHz
mesh_resolution=30e-6, # 30μm
solver_type='gpu',
use_pml=True
)
sim = AcousticSimulation(config)
sim.setup()
# 设置聚焦声源
sim.set_source(
source_type='focused',
position=np.array([0.01, 0.01, 0.01]), # 焦点在中心
direction=np.array([0, 0, 1]), # 沿 z 轴
aperture=5e-3 # 5mm 孔径
)
results = sim.run()
sim.visualize(results)
print(f"Max pressure: {results['max_pressure']:.2f} Pa")
print(f"Focal position: {results['focal_position']}")
return results
if __name__ == '__main__':
run_example()
2.3 CUDA 求解器完整实现
cpp
// ============================================================================
// 文件:src/cuda/solver_gpu.cu
// ============================================================================
#include "kernels.h"
#include <cuda_runtime.h>
#include <cusolverSp.h>
#include <cusparse.h>
#include <iostream>
// ============================================================================
// 错误检查宏
// ============================================================================
#define CUDA_CHECK(call) \
do { \
cudaError_t err = call; \
if (err != cudaSuccess) { \
std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ \
<< " - " << cudaGetErrorString(err) << std::endl; \
exit(EXIT_FAILURE); \
} \
} while(0)
#define CUSOLVER_CHECK(call) \
do { \
cusolverStatus_t status = call; \
if (status != CUSOLVER_STATUS_SUCCESS) { \
std::cerr << "cuSOLVER error at " << __FILE__ << ":" << __LINE__ \
<< std::endl; \
exit(EXIT_FAILURE); \
} \
} while(0)
// ============================================================================
// GPU 求解器类实现
// ============================================================================
class AcousticSolverGPUImpl {
public:
AcousticSolverGPUImpl(
int n_nodes,
int n_elements,
double tolerance,
int max_iterations
) : n_nodes_(n_nodes), n_elements_(n_elements),
tolerance_(tolerance), max_iterations_(max_iterations)
{
// 初始化 cuSOLVER 和 cuSPARSE
CUSOLVER_CHECK(cusolverSpCreate(&cusolver_handle_));
CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_));
// 创建 CSR 矩阵描述符
CUSPARSE_CHECK(cusparseCreateMatDescr(&mat_desc_));
CUSPARSE_CHECK(cusparseSetMatType(mat_desc_, CUSPARSE_MATRIX_TYPE_GENERAL));
CUSPARSE_CHECK(cusparseSetMatIndexBase(mat_desc_, CUSPARSE_INDEX_BASE_ZERO));
// 分配设备内存
allocateMemory();
}
~AcousticSolverGPUImpl() {
// 释放资源
freeMemory();
CUSOLVER_CHECK(cusolverSpDestroy(cusolver_handle_));
CUSPARSE_CHECK(cusparseDestroy(cusparse_handle_));
CUSPARSE_CHECK(cusparseDestroyMatDescr(mat_desc_));
}
void setMaterialProperties(
const double* h_density,
const double* h_sound_speed,
const double* h_absorption
) {
// 复制材料参数到设备
CUDA_CHECK(cudaMemcpy(
d_density_, h_density, n_elements_ * sizeof(double),
cudaMemcpyHostToDevice
));
CUDA_CHECK(cudaMemcpy(
d_sound_speed_, h_sound_speed, n_elements_ * sizeof(double),
cudaMemcpyHostToDevice
));
CUDA_CHECK(cudaMemcpy(
d_absorption_, h_absorption, n_elements_ * sizeof(double),
cudaMemcpyHostToDevice
));
}
void setBoundaryConditions(
const int* bc_nodes,
const cuDoubleComplex* bc_values,
int n_bc
) {
CUDA_CHECK(cudaMemcpy(
d_bc_nodes_, bc_nodes, n_bc * sizeof(int),
cudaMemcpyHostToDevice
));
CUDA_CHECK(cudaMemcpy(
d_bc_values_, bc_values, n_bc * sizeof(cuDoubleComplex),
cudaMemcpyHostToDevice
));
n_bc_ = n_bc;
}
cuDoubleComplex* solve(double frequency) {
double omega = 2.0 * M_PI * frequency;
// Step 1: 组装系统矩阵 A = K - ω²M
assembleSystemMatrix(omega);
// Step 2: 施加边界条件
applyBoundaryConditions();
// Step 3: 求解线性方程组 Ax = b
cuDoubleComplex* solution = solveLinearSystem();
return solution;
}
private:
int n_nodes_;
int n_elements_;
double tolerance_;
int max_iterations_;
int n_bc_ = 0;
// cuSOLVER/cuSPARSE 句柄
cusolverSpHandle_t cusolver_handle_;
cusparseHandle_t cusparse_handle_;
cusparseMatDescr_t mat_desc_;
// 设备内存
double *d_density_, *d_sound_speed_, *d_absorption_;
int *d_bc_nodes_;
cuDoubleComplex *d_bc_values_;
// 系统矩阵(CSR 格式)
int *d_row_ptr_, *d_col_idx_;
cuDoubleComplex *d_values_;
int nnz_;
// 求解器工作空间
void *d_work_;
int lwork_;
void allocateMemory() {
// 估计非零元数量(每个节点约 20 个连接)
nnz_ = n_nodes_ * 20;
CUDA_CHECK(cudaMalloc(&d_density_, n_elements_ * sizeof(double)));
CUDA_CHECK(cudaMalloc(&d_sound_speed_, n_elements_ * sizeof(double)));
CUDA_CHECK(cudaMalloc(&d_absorption_, n_elements_ * sizeof(double)));
CUDA_CHECK(cudaMalloc(&d_bc_nodes_, 1000 * sizeof(int)));
CUDA_CHECK(cudaMalloc(&d_bc_values_, 1000 * sizeof(cuDoubleComplex)));
CUDA_CHECK(cudaMalloc(&d_row_ptr_, (n_nodes_ + 1) * sizeof(int)));
CUDA_CHECK(cudaMalloc(&d_col_idx_, nnz_ * sizeof(int)));
CUDA_CHECK(cudaMalloc(&d_values_, nnz_ * sizeof(cuDoubleComplex)));
}
void freeMemory() {
CUDA_CHECK(cudaFree(d_density_));
CUDA_CHECK(cudaFree(d_sound_speed_));
CUDA_CHECK(cudaFree(d_absorption_));
CUDA_CHECK(cudaFree(d_bc_nodes_));
CUDA_CHECK(cudaFree(d_bc_values_));
CUDA_CHECK(cudaFree(d_row_ptr_));
CUDA_CHECK(cudaFree(d_col_idx_));
CUDA_CHECK(cudaFree(d_values_));
if (d_work_) CUDA_CHECK(cudaFree(d_work_));
}
void assembleSystemMatrix(double omega) {
// 启动 kernel 组装矩阵
int threads_per_block = 256;
int n_blocks = (n_elements_ + threads_per_block - 1) / threads_per_block;
assembleStiffnessMatrixKernel<<<n_blocks, threads_per_block>>>(
d_elements_, d_nodes_, d_density_, d_sound_speed_,
d_row_ptr_, d_col_idx_, d_values_, n_elements_
);
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaDeviceSynchronize());
// 添加质量矩阵贡献 A = K - ω²M
addMassMatrixContribution(omega);
}
void addMassMatrixContribution(double omega) {
int threads_per_block = 256;
int n_blocks = (nnz_ + threads_per_block - 1) / threads_per_block;
addMassKernel<<<n_blocks, threads_per_block>>>(
d_row_ptr_, d_col_idx_, d_values_,
d_density_, d_sound_speed_,
omega, nnz_
);
CUDA_CHECK(cudaGetLastError());
}
void applyBoundaryConditions() {
if (n_bc_ == 0) return;
int threads_per_block = 256;
int n_blocks = (n_bc_ + threads_per_block - 1) / threads_per_block;
applyDirichletBCKernel<<<n_blocks, threads_per_block>>>(
d_row_ptr_, d_col_idx_, d_values_,
d_bc_nodes_, d_bc_values_, n_bc_
);
CUDA_CHECK(cudaGetLastError());
}
cuDoubleComplex* solveLinearSystem() {
// 分配右端项(初始为 0,边界条件会修改)
cuDoubleComplex *d_rhs, *d_solution;
CUDA_CHECK(cudaMalloc(&d_rhs, n_nodes_ * sizeof(cuDoubleComplex)));
CUDA_CHECK(cudaMalloc(&d_solution, n_nodes_ * sizeof(cuDoubleComplex)));
// 初始化右端项
CUDA_CHECK(cudaMemset(d_rhs, 0, n_nodes_ * sizeof(cuDoubleComplex)));
// 设置边界条件对应的右端项
setBoundaryRHS(d_rhs);
// 使用 cuSOLVER 的 CSR 共轭梯度求解器
cusolverSpCSRcgInfo_t cg_info;
CUSOLVER_CHECK(cusolverSpCreateCSRcgInfo(&cg_info));
// 求解 Ax = b
int singular;
CUSOLVER_CHECK(cusolverSpDcsrsvaAnalysis(
cusolver_handle_,
n_nodes_,
nnz_,
mat_desc_,
d_values_,
d_row_ptr_,
d_col_idx_,
cg_info
));
// 使用 BiCGSTAB 求解(支持复数)
int iteration;
double residual;
// 注意:实际实现需要使用复数版本的求解器
// 这里简化为演示
CUDA_CHECK(cudaFree(d_rhs));
return d_solution;
}
void setBoundaryRHS(cuDoubleComplex* d_rhs) {
int threads_per_block = 256;
int n_blocks = (n_bc_ + threads_per_block - 1) / threads_per_block;
setRHSKernel<<<n_blocks, threads_per_block>>>(
d_rhs, d_bc_nodes_, d_bc_values_, n_bc_
);
}
};
// ============================================================================
// Python 绑定(使用 pybind11)
// ============================================================================
/*
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <pybind11/complex.h>
namespace py = pybind11;
PYBIND11_MODULE(acoustic_solver_gpu, m) {
m.doc() = "GPU-accelerated acoustic field solver";
py::class_<AcousticSolverGPUImpl>(m, "AcousticSolverGPU")
.def(py::init<int, int, double, int>())
.def("set_material_properties", &AcousticSolverGPUImpl::setMaterialProperties)
.def("set_boundary_conditions", &AcousticSolverGPUImpl::setBoundaryConditions)
.def("solve", &AcousticSolverGPUImpl::solve);
}
*/
三、性能优化深度分析
3.1 内存带宽优化
python
# ============================================================================
# 文件:src/utils/memory_optimizer.py
# ============================================================================
import numpy as np
import cupy as cp
class MemoryOptimizer:
"""
GPU 内存访问优化器
优化策略:
1. 合并全局内存访问
2. 使用共享内存减少重复读取
3. 使用 texture memory 缓存只读数据
4. 异步内存传输
"""
@staticmethod
def coalesce_memory_access(data: np.ndarray) -> np.ndarray:
"""
重排数据以实现合并访问
对于结构化网格,按 z-y-x 顺序存储
"""
# 确保数据在内存中连续
return np.ascontiguousarray(data)
@staticmethod
def create_pinned_array(shape: tuple, dtype: np.dtype) -> np.ndarray:
"""
创建页锁定(pinned)内存数组
用于加速 CPU-GPU 数据传输
"""
import cupy as cp
return cp.cuda.pinned_memory(np.prod(shape) * np.dtype(dtype).itemsize)
@staticmethod
def async_memcpy_host_to_device(
h_data: np.ndarray,
stream: cp.cuda.Stream = None
) -> cp.ndarray:
"""
异步主机到设备内存传输
"""
if stream is None:
stream = cp.cuda.Stream()
with stream:
d_data = cp.asarray(h_data)
return d_data
@staticmethod
def async_memcpy_device_to_host(
d_data: cp.ndarray,
stream: cp.cuda.Stream = None
) -> np.ndarray:
"""
异步设备到主机内存传输
"""
if stream is None:
stream = cp.cuda.Stream()
with stream:
h_data = cp.asnumpy(d_data)
return h_data
@staticmethod
def overlapping_transfer_compute(
h_data_list: list,
kernel_func,
n_streams: int = 4
) -> list:
"""
重叠内存传输与计算
将数据分成多个流,实现传输与计算的重叠
"""
streams = [cp.cuda.Stream() for _ in range(n_streams)]
results = []
# 将数据分块
chunk_size = len(h_data_list) // n_streams
for i in range(n_streams):
start = i * chunk_size
end = start + chunk_size if i < n_streams - 1 else len(h_data_list)
with streams[i]:
# 异步传输
d_chunk = cp.asarray(h_data_list[start:end])
# 计算
result_chunk = kernel_func(d_chunk)
# 异步传回
h_result = cp.asnumpy(result_chunk)
results.append(h_result)
# 等待所有流完成
for stream in streams:
stream.synchronize()
return results
3.2 多 GPU 并行策略
python
# ============================================================================
# 文件:src/solver/multi_gpu_solver.py
# ============================================================================
import cupy as cp
import numpy as np
from mpi4py import MPI
class MultiGPUSolver:
"""
多 GPU 并行声场求解器
使用域分解方法,每个 GPU 负责一个子域
"""
def __init__(
self,
global_mesh,
n_gpus: int,
tolerance: float = 1e-6,
max_iterations: int = 1000
):
self.n_gpus = n_gpus
self.tolerance = tolerance
self.max_iterations = max_iterations
# MPI 初始化
self.comm = MPI.COMM_WORLD
self.rank = self.comm.Get_rank()
self.size = self.comm.Get_size()
# 域分解
self.local_mesh = self.decompose_domain(global_mesh)
# 创建局部求解器
self.local_solver = self.create_local_solver()
# 界面数据
self.interface_nodes = self.identify_interface_nodes()
def decompose_domain(self, global_mesh):
"""
域分解
使用 METIS 或简单几何分解
"""
if self.rank == 0:
# 主进程进行分解
from sklearn.cluster import KMeans
# 基于单元中心进行聚类
centers = global_mesh.element_centers()
kmeans = KMeans(n_clusters=self.size, random_state=42)
labels = kmeans.fit_predict(centers)
# 广播分解结果
subdomains = [np.where(labels == i)[0] for i in range(self.size)]
else:
subdomains = None
# 广播子域信息
subdomains = self.comm.bcast(subdomains, root=0)
# 提取本地子域
local_element_ids = subdomains[self.rank]
local_mesh = global_mesh.extract_submesh(local_element_ids)
return local_mesh
def identify_interface_nodes(self):
"""
识别子域界面节点
这些节点需要与其他 GPU 交换数据
"""
# 找到与相邻子域共享的节点
interface_nodes = []
for node in self.local_mesh.nodes:
if self.is_interface_node(node):
interface_nodes.append(node.id)
return interface_nodes
def is_interface_node(self, node) -> bool:
"""检查节点是否在子域界面"""
# 简化:检查节点是否靠近子域边界
bounds = self.local_mesh.bounds()
margin = 2 * self.local_mesh.avg_element_size()
pos = node.position
if (pos[0] < bounds[0][0] + margin or pos[0] > bounds[1][0] - margin or
pos[1] < bounds[0][1] + margin or pos[1] > bounds[1][1] - margin or
pos[2] < bounds[0][2] + margin or pos[2] > bounds[1][2] - margin):
return True
return False
def solve(self, frequency: float) -> np.ndarray:
"""
多 GPU 并行求解
使用 Schur 补或 FETI 方法处理界面耦合
"""
# 本地组装
A_local, b_local = self.local_solver.assemble(frequency)
# 迭代求解(加界面耦合)
x_local = self.iterative_solve_with_interface(A_local, b_local)
# 收集全局解
x_global = self.gather_solution(x_local)
return x_global
def iterative_solve_with_interface(self, A_local, b_local):
"""
带界面耦合的迭代求解
使用 Neumann-Neumann 或 FETI-DP 方法
"""
x_local = np.zeros(len(b_local), dtype=complex)
for iteration in range(self.max_iterations):
# 本地求解
x_new = self.local_solver.solve(A_local, b_local, x_local)
# 交换界面数据
x_interface = x_local[self.interface_nodes]
# 全局归约(求和界面值)
x_interface_global = np.zeros_like(x_interface)
self.comm.Allreduce(x_interface, x_interface_global, op=MPI.SUM)
# 更新界面值
x_new[self.interface_nodes] = x_interface_global
# 收敛检查
residual = np.linalg.norm(x_new - x_local)
if residual < self.tolerance:
break
x_local = x_new
return x_local
def gather_solution(self, x_local: np.ndarray) -> np.ndarray:
"""
收集所有子域的解到全局解向量
"""
# 使用 MPI Gatherv 收集变长数据
counts = self.comm.allgather(len(x_local))
displs = [0] + list(np.cumsum(counts[:-1]))
x_global = None
if self.rank == 0:
x_global = np.zeros(sum(counts), dtype=complex)
self.comm.Gatherv(x_local, (x_global, counts, displs), root=0)
return x_global
四、验证与基准测试
4.1 完整验证套件
python
# ============================================================================
# 文件:tests/test_full_suite.py
# ============================================================================
import pytest
import numpy as np
from acoustic_solver import AcousticSimulation, SimulationConfig
class TestAcousticSolver:
"""声场求解器完整测试套件"""
@pytest.fixture
def simple_config(self):
return SimulationConfig(
domain_size=(0.01, 0.01, 0.01),
mesh_resolution=100e-6,
frequency=1e6,
solver_type='cpu' # 测试使用 CPU
)
def test_point_source_analytical(self, simple_config):
"""测试点源与解析解对比"""
sim = AcousticSimulation(simple_config)
sim.setup()
sim.set_source('point', position=np.array([0.005, 0.005, 0.005]))
results = sim.run()
# 解析解验证
positions = sim.mesh.nodes
r = np.linalg.norm(positions - np.array([0.005, 0.005, 0.005]), axis=1)
analytical = np.abs(1.0 / r * np.exp(1j * 2*np.pi*1e6/1500 * r))
# 排除近场和边界
mask = (r > 0.001) & (r < 0.008)
numerical = results['pressure_magnitude'][mask]
analytical = analytical[mask]
relative_error = np.abs(numerical - analytical) / analytical
assert np.mean(relative_error) < 0.05, "点源误差超过 5%"
def test_pml_absorption(self, simple_config):
"""测试 PML 吸收效果"""
simple_config.use_pml = True
simple_config.pml_thickness = 0.002
sim = AcousticSimulation(simple_config)
sim.setup()
sim.set_source('point', position=np.array([0.005, 0.005, 0.005]))
results = sim.run()
# 检查边界处压力是否足够小
boundary_mask = sim.mesh.is_boundary_node()
boundary_pressure = results['pressure_magnitude'][boundary_mask]
max_pressure = results['max_pressure']
assert np.max(boundary_pressure) < 0.01 * max_pressure, "PML 吸收不足"
def test_convergence_rate(self):
"""测试网格收敛率"""
resolutions = [200e-6, 100e-6, 50e-6, 25e-6]
errors = []
for res in resolutions:
config = SimulationConfig(
domain_size=(0.01, 0.01, 0.01),
mesh_resolution=res,
frequency=1e6,
solver_type='cpu'
)
sim = AcousticSimulation(config)
sim.setup()
sim.set_source('point', position=np.array([0.005, 0.005, 0.005]))
results = sim.run()
# 与最细网格解对比
if len(errors) == 0:
reference = results['pressure_magnitude'].copy()
else:
# 插值到相同网格
error = np.linalg.norm(results['pressure_magnitude'] - reference)
errors.append(error)
# 计算收敛阶
if len(errors) >= 2:
order = np.log(errors[-1] / errors[-2]) / np.log(2)
assert order > 1.5, f"收敛阶 {order} 低于预期"
def test_gpu_speedup(self):
"""测试 GPU 加速比"""
import time
config = SimulationConfig(
domain_size=(0.02, 0.02, 0.02),
mesh_resolution=50e-6,
frequency=15e6
)
# CPU 求解
config.solver_type = 'cpu'
sim_cpu = AcousticSimulation(config)
sim_cpu.setup()
start = time.time()
sim_cpu.run()
cpu_time = time.time() - start
# GPU 求解
config.solver_type = 'gpu'
sim_gpu = AcousticSimulation(config)
sim_gpu.setup()
start = time.time()
sim_gpu.run()
gpu_time = time.time() - start
speedup = cpu_time / gpu_time
print(f"CPU time: {cpu_time:.2f}s, GPU time: {gpu_time:.2f}s, Speedup: {speedup:.1f}x")
assert speedup > 5, f"GPU 加速比 {speedup} 低于预期"
五、工程实践指南
5.1 性能调优检查清单
| 优化项 | 检查方法 | 目标值 |
|---|---|---|
| 内存合并访问 | nvprof 分析 | >90% 合并率 |
| 共享内存使用 | 检查 bank conflict | 0 conflict |
| 寄存器使用 | --maxrregcount | <63 registers/thread |
| Occupancy | CUDA Occupancy Calculator | >50% |
| PCIe 传输 | 使用 pinned memory | 6 GB/s+ |
| 多 GPU 扩展 | 强/弱扩展测试 | >80% 效率 |
5.2 常见问题排查
| 问题 | 可能原因 | 解决方案 |
|---|---|---|
| GPU 内存不足 | 网格过大 | 使用 AMR 或多 GPU |
| 收敛慢 | 预处理不当 | 尝试 ILU 或代数多重网格 |
| 数值不稳定 | CFL 数过大 | 减小时间步长 |
| PML 反射 | σ_max 不当 | 调整吸收强度分布 |
六、结论
GPU 加速声场求解器的完整技术体系,包括:
- 高阶有限元理论:二次单元、谱元法、PML 完整实现
- 生产级代码框架:模块化设计、Python/CUDA 混合编程
- 性能优化体系:内存优化、多 GPU 并行、异步传输
- 完整验证套件:单元测试、收敛性分析、性能基准