Node.js集群部署指南 🚀
引言
Node.js集群部署是提高应用可用性和性能的关键策略。本文将深入探讨Node.js集群部署的实现方案，包括进程管理、负载均衡、故障恢复等方面，帮助开发者构建高可用的Node.js应用。
集群部署概述
Node.js集群部署主要包括以下方面:
- 进程管理:主进程与工作进程的协调
- 负载均衡:请求分发与任务调度
- 故障恢复:进程监控与自动重启
- 状态同步:进程间通信与数据共享
- 资源管理:CPU和内存的合理分配
集群部署实现
集群管理器
typescript
// Cluster manager
/**
 * Process-wide singleton that drives both sides of a Node.js cluster.
 *
 * In the primary process it forks the configured number of workers, tracks
 * per-worker metrics reported through heartbeat messages, recycles workers
 * whose heartbeat times out or whose reported memory exceeds `maxMemory`,
 * respawns workers that die, and coordinates shutdown. In a worker process it
 * installs crash handlers, sends periodic heartbeats to the primary and
 * listens for broadcast messages.
 */
class ClusterManager {
  /** Grace period (ms) between asking a worker to disconnect and killing it. */
  private static readonly FORCE_KILL_DELAY_MS = 5000;
  /** Period (ms) at which a worker reports a heartbeat to the primary. */
  private static readonly HEARTBEAT_INTERVAL_MS = 5000;

  private static instance: ClusterManager | undefined;

  private config: ClusterConfig;
  /** Live workers keyed by cluster worker id (readable by load balancers). */
  readonly workers: Map<number, Worker>;
  /** Latest metrics per worker id (readable by load balancers). */
  readonly metrics: Map<number, WorkerMetrics>;
  private state: ClusterState;
  /** Ids of workers that the health check has asked to stop and replace. */
  private readonly recycling = new Set<number>();
  private healthCheckTimer: ReturnType<typeof setInterval> | undefined;
  private shutdownPromise: Promise<void> | undefined;

  private constructor() {
    this.workers = new Map();
    this.metrics = new Map();
    this.config = {
      workers: os.cpus().length,
      maxMemory: 1024 * 1024 * 1024, // 1GB
      restartDelay: 1000,
      healthCheckInterval: 10000
    };
    this.state = {
      isStarting: false,
      isShuttingDown: false,
      startTime: Date.now(),
      restartCount: 0
    };
  }

  /** Returns the shared instance, creating it on first use. */
  static getInstance(): ClusterManager {
    if (!ClusterManager.instance) {
      ClusterManager.instance = new ClusterManager();
    }
    return ClusterManager.instance;
  }

  /**
   * Merges `config` over the defaults and starts the primary-side or
   * worker-side logic depending on which process this is.
   *
   * @param config Overrides for the default configuration; omitted fields
   *   keep their defaults.
   */
  init(config: Partial<ClusterConfig> = {}): void {
    this.config = { ...this.config, ...config };
    if (cluster.isPrimary) {
      this.initializeMaster();
    } else {
      this.initializeWorker();
    }
  }

  /** True when `message` is a non-null object whose `type` equals `type`. */
  private static isMessageOfType(
    message: unknown,
    type: string
  ): message is Record<string, unknown> {
    return (
      typeof message === 'object' &&
      message !== null &&
      (message as Record<string, unknown>).type === type
    );
  }

  /** Primary-side start-up: fork workers, then wire signals, health check and IPC. */
  private initializeMaster(): void {
    console.log(`Master ${process.pid} is running`);
    this.startWorkers();
    this.setupProcessHandlers();
    this.startHealthCheck();
    this.setupIpcCommunication();
  }

  /** Forks `config.workers` workers, flagging the start-up phase in `state`. */
  private startWorkers(): void {
    this.state.isStarting = true;
    for (let i = 0; i < this.config.workers; i++) {
      this.createWorker();
    }
    this.state.isStarting = false;
  }

  /** Forks one worker and registers it together with zeroed metrics. */
  private createWorker(): void {
    const worker = cluster.fork();
    this.workers.set(worker.id, worker);
    this.metrics.set(worker.id, {
      // pid is undefined when the child failed to spawn; store a sentinel
      // instead of asserting it away.
      pid: worker.process.pid ?? -1,
      cpu: 0,
      memory: 0,
      requests: 0,
      errors: 0,
      lastHeartbeat: Date.now()
    });
    console.log(`Worker ${worker.id} started`);
    this.setupWorkerHandlers(worker);
  }

  /** Subscribes to the lifecycle and message events of one worker. */
  private setupWorkerHandlers(worker: Worker): void {
    worker.on('online', () => {
      console.log(`Worker ${worker.id} is online`);
    });
    worker.on('exit', (code, signal) => {
      console.log(
        `Worker ${worker.id} died with code ${code} and signal ${signal}`
      );
      this.handleWorkerExit(worker, code, signal);
    });
    worker.on('error', error => {
      console.error(`Worker ${worker.id} error:`, error);
      // The metrics entry is removed on exit, so it may already be gone.
      const tracked = this.metrics.get(worker.id);
      if (tracked) {
        tracked.errors++;
      }
    });
    worker.on('message', message => {
      this.handleWorkerMessage(worker, message);
    });
  }

  /**
   * Forgets an exited worker and schedules a replacement unless the cluster
   * is shutting down.
   *
   * A replacement is forked when the exit was abnormal (`code !== 0`, which
   * includes signal kills) or when the worker was being recycled by the
   * health check. Without the second condition a recycled worker that exits
   * cleanly would never be replaced.
   *
   * @param worker The worker that exited.
   * @param code Exit code reported by the `exit` event.
   * @param signal Signal reported by the `exit` event (unused here).
   */
  private handleWorkerExit(
    worker: Worker,
    code: number,
    signal: string
  ): void {
    const wasRecycling = this.recycling.delete(worker.id);
    this.workers.delete(worker.id);
    this.metrics.delete(worker.id);
    if (this.state.isShuttingDown || (code === 0 && !wasRecycling)) {
      return;
    }
    console.log(`Restarting worker ${worker.id}`);
    setTimeout(() => {
      // Shutdown may have started while the restart delay was pending.
      if (this.state.isShuttingDown) {
        return;
      }
      this.createWorker();
      this.state.restartCount++;
    }, this.config.restartDelay);
  }

  /**
   * Handles an IPC message from a worker. Only `heartbeat` messages are
   * acted on; anything else, and heartbeats from workers that are no longer
   * tracked, are ignored.
   */
  private handleWorkerMessage(worker: Worker, message: unknown): void {
    if (!ClusterManager.isMessageOfType(message, 'heartbeat')) {
      return;
    }
    const tracked = this.metrics.get(worker.id);
    if (!tracked) {
      return;
    }
    tracked.lastHeartbeat = Date.now();
    this.updateWorkerMetrics(worker.id, message.metrics);
  }

  /**
   * Copies the numeric `cpu`, `memory` and `requests` fields of a heartbeat
   * payload into the tracked metrics. Non-numeric fields are skipped so a
   * malformed payload cannot corrupt the stored numbers. `errors` is not
   * copied: it is counted on the primary from worker `error` events.
   */
  private updateWorkerMetrics(workerId: number, metrics: unknown): void {
    const tracked = this.metrics.get(workerId);
    if (!tracked || typeof metrics !== 'object' || metrics === null) {
      return;
    }
    const { cpu, memory, requests } = metrics as Record<string, unknown>;
    if (typeof cpu === 'number') {
      tracked.cpu = cpu;
    }
    if (typeof memory === 'number') {
      tracked.memory = memory;
    }
    if (typeof requests === 'number') {
      tracked.requests = requests;
    }
  }

  /** Installs primary-side handlers for uncaught exceptions and stop signals. */
  private setupProcessHandlers(): void {
    process.on('uncaughtException', error => {
      console.error('Uncaught exception:', error);
      // A crash must not be reported as a successful exit.
      void this.gracefulShutdown(1);
    });
    process.on('SIGTERM', () => {
      console.log('Received SIGTERM signal');
      void this.gracefulShutdown();
    });
    process.on('SIGINT', () => {
      console.log('Received SIGINT signal');
      void this.gracefulShutdown();
    });
  }

  /** Runs `checkWorkersHealth` every `config.healthCheckInterval` ms. */
  private startHealthCheck(): void {
    this.healthCheckTimer = setInterval(() => {
      this.checkWorkersHealth();
    }, this.config.healthCheckInterval);
  }

  /**
   * Recycles every worker whose last heartbeat is older than twice the
   * health-check interval or whose reported memory exceeds `maxMemory`.
   * Workers already being recycled are skipped so they are not stopped twice.
   */
  private checkWorkersHealth(): void {
    const now = Date.now();
    for (const [workerId, metrics] of this.metrics.entries()) {
      if (this.recycling.has(workerId)) {
        continue;
      }
      if (now - metrics.lastHeartbeat > this.config.healthCheckInterval * 2) {
        console.warn(`Worker ${workerId} heartbeat timeout`);
        this.restartWorker(workerId);
        continue;
      }
      if (metrics.memory > this.config.maxMemory) {
        console.warn(`Worker ${workerId} memory exceeded`);
        this.restartWorker(workerId);
      }
    }
  }

  /**
   * Asks a worker to disconnect and kills it if it is still alive after
   * `FORCE_KILL_DELAY_MS`. The replacement is forked by `handleWorkerExit`
   * once the worker has actually exited.
   */
  private restartWorker(workerId: number): void {
    const worker = this.workers.get(workerId);
    if (!worker || this.recycling.has(workerId)) {
      return;
    }
    this.recycling.add(workerId);
    console.log(`Restarting worker ${workerId}`);
    if (worker.isConnected()) {
      worker.disconnect();
    }
    const killTimer = setTimeout(() => {
      if (!worker.isDead()) {
        worker.kill();
      }
    }, ClusterManager.FORCE_KILL_DELAY_MS);
    worker.once('exit', () => clearTimeout(killTimer));
  }

  /** Relays `broadcast` messages received by the primary to every worker. */
  private setupIpcCommunication(): void {
    process.on('message', (message: unknown) => {
      if (ClusterManager.isMessageOfType(message, 'broadcast')) {
        this.broadcastToWorkers(message.data);
      }
    });
  }

  /** Sends `data` as a `broadcast` message to every connected worker. */
  private broadcastToWorkers(data: unknown): void {
    for (const worker of this.workers.values()) {
      if (worker.isConnected()) {
        worker.send({
          type: 'broadcast',
          data
        });
      }
    }
  }

  /**
   * Disconnects all workers, waits for them to exit and then terminates the
   * process. Repeated calls return the promise of the shutdown already in
   * progress.
   *
   * @param exitCode Process exit code to use once all workers are gone.
   */
  gracefulShutdown(exitCode: number = 0): Promise<void> {
    if (!this.shutdownPromise) {
      this.shutdownPromise = this.runShutdown(exitCode);
    }
    return this.shutdownPromise;
  }

  /** Performs the shutdown sequence behind `gracefulShutdown`. */
  private async runShutdown(exitCode: number): Promise<void> {
    this.state.isShuttingDown = true;
    console.log('Starting graceful shutdown...');
    if (this.healthCheckTimer) {
      clearInterval(this.healthCheckTimer);
      this.healthCheckTimer = undefined;
    }
    for (const worker of this.workers.values()) {
      if (worker.isConnected()) {
        worker.disconnect();
      }
    }
    // Workers that ignore the disconnect would otherwise block shutdown forever.
    const forceKillTimer = setTimeout(() => {
      for (const worker of this.workers.values()) {
        if (!worker.isDead()) {
          worker.kill();
        }
      }
    }, ClusterManager.FORCE_KILL_DELAY_MS);
    await this.waitForWorkersToExit();
    clearTimeout(forceKillTimer);
    console.log('All workers have exited');
    process.exit(exitCode);
  }

  /** Resolves once no workers are tracked any more (polled once per second). */
  private waitForWorkersToExit(): Promise<void> {
    return new Promise(resolve => {
      if (this.workers.size === 0) {
        resolve();
        return;
      }
      const poll = setInterval(() => {
        if (this.workers.size === 0) {
          clearInterval(poll);
          resolve();
        }
      }, 1000);
    });
  }

  /** Worker-side start-up: crash handlers, heartbeat and IPC listener. */
  private initializeWorker(): void {
    console.log(`Worker ${process.pid} started`);
    this.setupWorkerProcessHandlers();
    this.startHeartbeat();
    this.setupWorkerIpcCommunication();
  }

  /** Installs worker-side handlers for uncaught exceptions and rejections. */
  private setupWorkerProcessHandlers(): void {
    process.on('uncaughtException', error => {
      console.error('Worker uncaught exception:', error);
      process.exit(1);
    });
    process.on('unhandledRejection', reason => {
      console.error('Worker unhandled rejection:', reason);
    });
  }

  /**
   * Sends a `heartbeat` message to the primary every
   * `HEARTBEAT_INTERVAL_MS`, carrying CPU usage as a percentage of one core
   * over the last interval and the current heap usage in bytes. The timer is
   * stopped when the IPC channel disconnects so that it does not keep a
   * disconnected worker alive.
   */
  private startHeartbeat(): void {
    let lastCpu = process.cpuUsage();
    let lastUptime = process.uptime();
    const timer = setInterval(() => {
      const cpuNow = process.cpuUsage();
      const uptimeNow = process.uptime();
      // cpuUsage() reports microseconds; uptime() reports seconds.
      const busyMicros =
        cpuNow.user - lastCpu.user + (cpuNow.system - lastCpu.system);
      const elapsedMicros = (uptimeNow - lastUptime) * 1e6;
      lastCpu = cpuNow;
      lastUptime = uptimeNow;
      if (!process.send || !process.connected) {
        return;
      }
      process.send({
        type: 'heartbeat',
        metrics: {
          cpu: elapsedMicros > 0 ? (busyMicros / elapsedMicros) * 100 : 0,
          memory: process.memoryUsage().heapUsed,
          requests: 0, // to be supplied by the application layer
          errors: 0 // to be supplied by the application layer
        }
      });
    }, ClusterManager.HEARTBEAT_INTERVAL_MS);
    process.once('disconnect', () => clearInterval(timer));
  }

  /** Listens for `broadcast` messages sent by the primary. */
  private setupWorkerIpcCommunication(): void {
    process.on('message', (message: unknown) => {
      if (ClusterManager.isMessageOfType(message, 'broadcast')) {
        this.handleBroadcastMessage(message.data);
      }
    });
  }

  /** Handles a broadcast payload; currently it is only logged. */
  private handleBroadcastMessage(data: unknown): void {
    console.log('Received broadcast:', data);
  }
}
// Load balancer
/**
 * Picks a worker from a shared worker map according to a named strategy:
 * `'round-robin'`, `'least-connections'` or `'least-cpu'`. Any other
 * strategy name behaves like `'round-robin'`.
 */
class LoadBalancer {
  /**
   * @param algorithm Strategy name; defaults to `'round-robin'` when
   *   `undefined` is passed.
   * @param workers Candidate workers keyed by worker id. Round-robin
   *   selection reorders this map in place.
   * @param metrics Metrics keyed by worker id, read by the metric-based
   *   strategies.
   */
  constructor(
    private algorithm: string = 'round-robin',
    private workers: Map<number, Worker>,
    private metrics: Map<number, WorkerMetrics>
  ) {}

  /** Returns the worker chosen by the configured strategy, or `null` if none qualifies. */
  selectWorker(): Worker | null {
    if (this.algorithm === 'least-connections') {
      return this.leastConnections();
    }
    if (this.algorithm === 'least-cpu') {
      return this.leastCpu();
    }
    // 'round-robin' and every unrecognised name end up here.
    return this.roundRobin();
  }

  /**
   * Takes the first worker in map order and re-inserts it under its own id,
   * which moves it to the end so the next call yields the following worker.
   */
  private roundRobin(): Worker | null {
    const head = this.workers.values().next();
    if (head.done) {
      return null;
    }
    const picked = head.value;
    this.workers.delete(picked.id);
    this.workers.set(picked.id, picked);
    return picked;
  }

  /** Worker with the smallest `requests` metric. */
  private leastConnections(): Worker | null {
    return this.pickLowest(entry => entry.requests);
  }

  /** Worker with the smallest `cpu` metric. */
  private leastCpu(): Worker | null {
    return this.pickLowest(entry => entry.cpu);
  }

  /**
   * Scans the workers in map order and returns the one whose metric value is
   * strictly lowest; on ties the earliest worker wins. Workers without a
   * metrics entry are skipped.
   */
  private pickLowest(read: (entry: WorkerMetrics) => number): Worker | null {
    let lowest = Infinity;
    let winner: Worker | null = null;
    for (const [id, candidate] of this.workers) {
      const entry = this.metrics.get(id);
      if (!entry) {
        continue;
      }
      const value = read(entry);
      if (value < lowest) {
        lowest = value;
        winner = candidate;
      }
    }
    return winner;
  }
}
// Interface definitions
/** Tunable settings of the cluster manager. */
interface ClusterConfig {
/** Number of worker processes forked at start-up. */
workers: number;
/** Upper bound for a worker's reported memory (compared against the heartbeat's heap usage, in bytes); exceeding it triggers a worker restart. */
maxMemory: number;
/** Delay in milliseconds before a replacement worker is forked after an exit. */
restartDelay: number;
/** Period in milliseconds of the health check; a heartbeat older than twice this value counts as a timeout. */
healthCheckInterval: number;
}
/** Runtime state kept by the cluster manager. */
interface ClusterState {
/** True while the initial batch of workers is being forked. */
isStarting: boolean;
/** Set once graceful shutdown has begun; suppresses worker restarts. */
isShuttingDown: boolean;
/** `Date.now()` timestamp taken when the manager was constructed. */
startTime: number;
/** Number of replacement workers forked after a worker exit. */
restartCount: number;
}
/** Per-worker metrics tracked by the primary process. */
interface WorkerMetrics {
/** Operating-system process id of the worker. */
pid: number;
/** CPU usage figure taken from the worker's latest heartbeat. */
cpu: number;
/** Memory figure taken from the worker's latest heartbeat (the worker reports `heapUsed`). */
memory: number;
/** Request count taken from the worker's latest heartbeat. */
requests: number;
/** Number of `error` events the primary has observed for this worker. */
errors: number;
/** `Date.now()` timestamp of the most recent heartbeat (initially the fork time). */
lastHeartbeat: number;
}
// Usage example
if (cluster.isPrimary) {
  // Primary process: start the cluster, then accept HTTP requests and hand
  // their metadata to a worker chosen by the load balancer.
  const clusterManager = ClusterManager.getInstance();
  clusterManager.init({
    workers: 4,
    maxMemory: 1024 * 1024 * 1024,
    restartDelay: 1000,
    healthCheckInterval: 10000
  });

  // One balancer is enough: it reads the manager's live maps on every call,
  // so there is no need to rebuild it per request.
  const loadBalancer = new LoadBalancer(
    'round-robin',
    clusterManager.workers,
    clusterManager.metrics
  );

  // Create the HTTP server
  const server = http.createServer();
  server.on('request', (req, res) => {
    const worker = loadBalancer.selectWorker();
    if (!worker) {
      res.writeHead(503);
      res.end('Service Unavailable');
      return;
    }
    worker.send(
      {
        type: 'request',
        data: {
          url: req.url,
          method: req.method,
          headers: req.headers
        }
      },
      error => {
        // The worker only receives request metadata and has no way to write
        // to this socket, so the primary must complete the response itself;
        // otherwise the client would wait forever.
        if (error) {
          res.writeHead(502);
          res.end('Bad Gateway');
          return;
        }
        res.writeHead(202);
        res.end('Accepted');
      }
    );
  });
  server.listen(3000);
} else {
  // Worker process: start the worker-side manager logic with default settings.
  const clusterManager = ClusterManager.getInstance();
  clusterManager.init({} as ClusterConfig);

  // Handle requests forwarded by the primary
  process.on('message', (message: unknown) => {
    // IPC payloads are untyped, so narrow before touching any field.
    if (
      typeof message === 'object' &&
      message !== null &&
      (message as { type?: unknown }).type === 'request'
    ) {
      // Request handling logic goes here
      console.log(
        'Worker handling request:',
        (message as { data?: unknown }).data
      );
    }
  });
}
最佳实践与建议
- 进程管理
  - 合理设置进程数
  - 监控进程状态
  - 自动故障恢复
  - 优雅关闭处理
- 负载均衡
  - 选择合适算法
  - 考虑进程负载
  - 动态调整策略
  - 避免单点故障
- 状态管理
  - 使用共享存储
  - 保持数据一致
  - 处理并发访问
  - 避免竞态条件
- 监控告警
  - 实时监控指标
  - 设置告警阈值
  - 及时处理异常
  - 记录运行日志
总结
Node.js集群部署需要考虑以下方面:
- 进程管理和负载均衡
- 故障检测和自动恢复
- 进程间通信和状态同步
- 资源管理和性能优化
- 监控和运维支持
通过合理的集群部署，可以提高Node.js应用的可用性和性能。
学习资源
- Node.js集群文档
- 进程管理工具
- 负载均衡策略
- 高可用架构
- 运维最佳实践
如果你觉得这篇文章有帮助，欢迎点赞收藏，也期待在评论区看到你的想法和建议！👇
终身学习,共同成长。
咱们下一期见
💻