什么是流式处理
流式处理(Streaming) 是一种边读边处理的方式,而不是一次性把所有数据加载到内存中。它的核心优势是:
内存占用低 —— 不需要一次性加载全部数据
响应速度快 —— 可以在数据还没读完时就开始处理
适合大数据量 —— 尤其适合 TB 级文件、网络流、数据库大表查询等
C# 中如何实现流式处理
在 .NET Core 3.0+ / .NET 5+(C# 8.0 起)中,引入了 IAsyncEnumerable<T>(异步可枚举序列,即"异步流"),配合 await foreach 就可以很方便地实现异步流式处理。
cs
/// <summary>
/// Groups an asynchronous sequence into lists of at most <paramref name="batchSize"/> items.
/// Every batch except possibly the last contains exactly <paramref name="batchSize"/> items;
/// a trailing partial batch is emitted when the source ends.
/// </summary>
/// <typeparam name="T">Element type of the source sequence.</typeparam>
/// <param name="source">Sequence to split into batches.</param>
/// <param name="batchSize">Maximum number of items per batch; must be at least 1.</param>
/// <returns>An asynchronous sequence of batches; each batch is a freshly allocated list.</returns>
/// <exception cref="ArgumentNullException"><paramref name="source"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="batchSize"/> is less than 1.</exception>
public static IAsyncEnumerable<List<T>> BatchAsync<T>(
    this IAsyncEnumerable<T> source, int batchSize)
{
    // Validate here, in a non-iterator method, so that bad arguments fail at the call site
    // instead of on the first MoveNextAsync. A batch size of 0 would otherwise never flush
    // until the end of the stream, silently buffering everything in memory.
    if (source == null)
    {
        throw new ArgumentNullException(nameof(source));
    }

    if (batchSize < 1)
    {
        throw new ArgumentOutOfRangeException(
            nameof(batchSize), batchSize, "Batch size must be at least 1.");
    }

    return Iterate();

    async IAsyncEnumerable<List<T>> Iterate()
    {
        var batch = new List<T>(batchSize);
        await foreach (var item in source)
        {
            batch.Add(item);
            if (batch.Count == batchSize)
            {
                yield return batch;

                // Hand out a new list each time so the consumer may keep the previous one.
                batch = new List<T>(batchSize);
            }
        }

        if (batch.Count > 0)
        {
            yield return batch;
        }
    }
}
cs
// Build a streaming query: rows are pulled from the database as they are enumerated
// rather than being materialized into a list first.
var pendingRows = dbContext.Set<MyEntity>()
    .Where(x => x.Status == "Pending")
    .AsAsyncEnumerable();

// Consume the stream in groups of 1000 rows and update each group in bulk.
var groupedRows = pendingRows.BatchAsync(1000);
await foreach (var group in groupedRows)
{
    await UpdateBatchAsync(group);
}
通用框架设计思路
抽象成 3 层结构:
- 数据源(Source)
  - 负责提供数据(文件、TCP、数据库、传感器等)
  - 统一返回 IAsyncEnumerable<T>
- 处理器(Processor)
  - 负责处理单个数据项
  - 可替换不同的业务逻辑
- 流式处理引擎(StreamingEngine)
  - 把 Source 和 Processor 组合起来
  - 提供统一的执行、取消、错误处理能力
封装与用例:
cs
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Net.Sockets;
using System.Text;
using System.Threading;
using System.Threading.Channels;
using System.Threading.Tasks;
using Microsoft.EntityFrameworkCore;
// ================================
// 1. 接口定义
// ================================
/// <summary>
/// A producer of items of type <typeparamref name="T"/>, exposed as an asynchronous stream.
/// </summary>
/// <typeparam name="T">Type of the items produced.</typeparam>
public interface ISource<T>
{
/// <summary>Returns the items of this source as an asynchronous sequence.</summary>
/// <param name="token">Cancellation token passed to the implementation.</param>
/// <returns>The asynchronous sequence of items.</returns>
IAsyncEnumerable<T> ReadAsync(CancellationToken token = default);
}
/// <summary>
/// A consumer of items of type <typeparamref name="T"/>, offering both a per-item
/// and a per-batch entry point.
/// </summary>
/// <typeparam name="T">Type of the items consumed.</typeparam>
public interface IProcessor<T>
{
/// <summary>Processes a single item.</summary>
/// <param name="item">The item to process.</param>
/// <param name="token">Cancellation token passed to the implementation.</param>
ValueTask ProcessAsync(T item, CancellationToken token = default);
/// <summary>Processes a list of items as one unit.</summary>
/// <param name="batch">The items to process together.</param>
/// <param name="token">Cancellation token passed to the implementation.</param>
ValueTask ProcessBatchAsync(List<T> batch, CancellationToken token = default);
}
// ================================
// 2. 流式处理引擎(批处理 + 重试 + 背压)
// ================================
/// <summary>
/// Pumps items from an <see cref="ISource{T}"/> into an <see cref="IProcessor{T}"/> through a
/// bounded channel. The bounded channel provides back-pressure: the producer waits when the
/// processor falls behind. Items are processed one by one or in batches, and each processing
/// call is retried on failure.
/// </summary>
/// <typeparam name="T">Type of the items flowing through the engine.</typeparam>
public class StreamingEngine<T>
{
    private readonly ISource<T> _source;
    private readonly IProcessor<T> _processor;
    private readonly int _batchSize;
    private readonly int _maxAttempts;
    private readonly TimeSpan _retryDelay;
    private readonly int _channelCapacity;

    /// <summary>Creates an engine that connects <paramref name="source"/> to <paramref name="processor"/>.</summary>
    /// <param name="source">Where items come from.</param>
    /// <param name="processor">What is done with each item or batch.</param>
    /// <param name="batchSize">
    /// Items per batch. A value of 0 or less selects item-by-item processing.
    /// </param>
    /// <param name="maxRetries">
    /// Total number of attempts made for each processing call before the failure is propagated.
    /// Values below 1 are treated as 1 so that every item is attempted at least once.
    /// </param>
    /// <param name="retryDelay">Pause between attempts; defaults to one second.</param>
    /// <param name="channelCapacity">Maximum number of items buffered between source and processor.</param>
    /// <exception cref="ArgumentNullException"><paramref name="source"/> or <paramref name="processor"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="channelCapacity"/> is less than 1.</exception>
    public StreamingEngine(
        ISource<T> source,
        IProcessor<T> processor,
        int batchSize = 0,
        int maxRetries = 3,
        TimeSpan? retryDelay = null,
        int channelCapacity = 1000)
    {
        _source = source ?? throw new ArgumentNullException(nameof(source));
        _processor = processor ?? throw new ArgumentNullException(nameof(processor));

        // Fail at construction rather than later inside Channel.CreateBounded.
        if (channelCapacity < 1)
        {
            throw new ArgumentOutOfRangeException(
                nameof(channelCapacity), channelCapacity, "Channel capacity must be at least 1.");
        }

        _batchSize = batchSize;

        // With zero attempts the retry loop would never invoke the operation and items
        // would be dropped silently, so clamp to a single attempt.
        _maxAttempts = Math.Max(1, maxRetries);
        _retryDelay = retryDelay ?? TimeSpan.FromSeconds(1);
        _channelCapacity = channelCapacity;
    }

    /// <summary>
    /// Runs the pipeline until the source is exhausted, a failure occurs, or
    /// <paramref name="token"/> is cancelled.
    /// </summary>
    /// <param name="token">Cancels both reading from the source and processing.</param>
    /// <returns>A task that completes when both the producer and the consumer have finished.</returns>
    public async Task RunAsync(CancellationToken token = default)
    {
        var channel = Channel.CreateBounded<T>(_channelCapacity);

        // The producer gets its own linked token so it can be stopped when the consumer dies.
        using var producerCts = CancellationTokenSource.CreateLinkedTokenSource(token);
        var producerToken = producerCts.Token;

        var producer = Task.Run(async () =>
        {
            try
            {
                await foreach (var item in _source.ReadAsync(producerToken))
                {
                    await channel.Writer.WriteAsync(item, producerToken);
                }
            }
            finally
            {
                // Always signal completion so the consumer drains what is buffered and stops.
                channel.Writer.Complete();
            }
        }, producerToken);

        var consumer = ConsumeAsync(channel.Reader, token);

        // If the consumer fails first, nobody drains the channel any more; a producer blocked
        // in WriteAsync on a full channel would then wait forever. Cancel it explicitly.
        var first = await Task.WhenAny(producer, consumer);
        if (first == consumer && (consumer.IsFaulted || consumer.IsCanceled))
        {
            producerCts.Cancel();
        }

        // Observes both tasks and rethrows the failure, if any.
        await Task.WhenAll(producer, consumer);
    }

    /// <summary>Dispatches to batch or single-item consumption depending on the configured batch size.</summary>
    private Task ConsumeAsync(ChannelReader<T> reader, CancellationToken token)
    {
        return _batchSize > 0
            ? ProcessInBatchesAsync(reader, token)
            : ProcessSingleItemsAsync(reader, token);
    }

    /// <summary>Processes every item from the channel individually, with retry.</summary>
    private async Task ProcessSingleItemsAsync(ChannelReader<T> reader, CancellationToken token)
    {
        while (await reader.WaitToReadAsync(token))
        {
            while (reader.TryRead(out var item))
            {
                await RetryAsync(() => _processor.ProcessAsync(item, token), token);
            }
        }
    }

    /// <summary>
    /// Collects items into batches of the configured size and processes each batch with retry.
    /// A final partial batch is processed once the channel completes.
    /// </summary>
    private async Task ProcessInBatchesAsync(ChannelReader<T> reader, CancellationToken token)
    {
        var batch = new List<T>(_batchSize);
        while (await reader.WaitToReadAsync(token))
        {
            while (reader.TryRead(out var item))
            {
                batch.Add(item);
                if (batch.Count >= _batchSize)
                {
                    var full = batch;

                    // Start a fresh list instead of clearing: a processor that keeps a
                    // reference to the batch must not see it emptied and refilled.
                    batch = new List<T>(_batchSize);
                    await RetryAsync(() => _processor.ProcessBatchAsync(full, token), token);
                }
            }
        }

        if (batch.Count > 0)
        {
            var remainder = batch;
            await RetryAsync(() => _processor.ProcessBatchAsync(remainder, token), token);
        }
    }

    /// <summary>
    /// Invokes <paramref name="operation"/> up to the configured number of attempts, waiting
    /// the retry delay between attempts. The exception of the last attempt is propagated.
    /// </summary>
    /// <param name="operation">
    /// Factory that starts the operation; it is called once per attempt, so each returned
    /// <see cref="ValueTask"/> is awaited only once.
    /// </param>
    /// <param name="token">When cancelled, failures are propagated immediately instead of retried.</param>
    private async Task RetryAsync(Func<ValueTask> operation, CancellationToken token)
    {
        for (var attempt = 1; ; attempt++)
        {
            try
            {
                await operation();
                return;
            }
            catch (Exception) when (attempt < _maxAttempts && !token.IsCancellationRequested)
            {
                // The filter lets the final failure, and any failure after cancellation,
                // escape with its original stack trace.
                await Task.Delay(_retryDelay, token);
            }
        }
    }
}
// ================================
// 3. 数据源实现
// ================================
/// <summary>
/// Reads a text file line by line and converts each line into an item of type
/// <typeparamref name="T"/>.
/// </summary>
/// <typeparam name="T">Type produced from each line.</typeparam>
public class FileSource<T> : ISource<T>
{
    private readonly string _filePath;
    private readonly Func<string, T> _converter;

    /// <summary>Creates a source over the file at <paramref name="filePath"/>.</summary>
    /// <param name="filePath">Path of the file to read.</param>
    /// <param name="converter">Maps one line of text to an item.</param>
    public FileSource(string filePath, Func<string, T> converter)
    {
        _filePath = filePath;
        _converter = converter;
    }

    /// <summary>Yields one converted item per line of the file.</summary>
    /// <param name="token">Checked before each line is converted.</param>
    /// <returns>The converted lines, in file order.</returns>
    public async IAsyncEnumerable<T> ReadAsync(CancellationToken token = default)
    {
        // Opened for asynchronous, shared-read access with a 4 KB buffer.
        using var reader = new StreamReader(
            new FileStream(
                _filePath,
                FileMode.Open,
                FileAccess.Read,
                FileShare.Read,
                bufferSize: 4096,
                useAsync: true));

        while (true)
        {
            var line = await reader.ReadLineAsync();
            if (line == null)
            {
                // End of file.
                yield break;
            }

            token.ThrowIfCancellationRequested();
            yield return _converter(line);
        }
    }
}
/// <summary>
/// Streams all rows of entity type <typeparamref name="T"/> from a <see cref="DbContext"/>,
/// loading them page by page without change tracking.
/// </summary>
/// <typeparam name="T">Entity type to read.</typeparam>
public class DatabaseSource<T> : ISource<T> where T : class
{
    private readonly DbContext _context;
    private readonly int _batchSize;

    /// <summary>Creates a source that reads <typeparamref name="T"/> rows from <paramref name="context"/>.</summary>
    /// <param name="context">Context used to query the rows.</param>
    /// <param name="batchSize">Number of rows fetched per query; must be at least 1.</param>
    /// <exception cref="ArgumentNullException"><paramref name="context"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="batchSize"/> is less than 1.</exception>
    public DatabaseSource(DbContext context, int batchSize = 1000)
    {
        _context = context ?? throw new ArgumentNullException(nameof(context));

        // Take(0) would end the stream immediately without reading anything.
        if (batchSize < 1)
        {
            throw new ArgumentOutOfRangeException(
                nameof(batchSize), batchSize, "Batch size must be at least 1.");
        }

        _batchSize = batchSize;
    }

    /// <summary>Yields every row, fetching one page per database round trip.</summary>
    /// <param name="token">Cancels the page queries.</param>
    /// <returns>The rows, ordered by primary key when the entity type has one.</returns>
    public async IAsyncEnumerable<T> ReadAsync(CancellationToken token = default)
    {
        var query = BuildOrderedQuery();
        var offset = 0;
        while (true)
        {
            var page = await query
                .Skip(offset)
                .Take(_batchSize)
                .ToListAsync(token);

            foreach (var item in page)
            {
                yield return item;
            }

            // A short page means the table is exhausted; this saves one empty query.
            if (page.Count < _batchSize)
            {
                yield break;
            }

            offset += _batchSize;
        }
    }

    /// <summary>
    /// Builds the base query, ordered by the primary key properties from the EF model.
    /// Skip/Take paging is only stable with a deterministic order; without it the database
    /// may return rows in a different order per page, skipping or repeating rows.
    /// </summary>
    private IQueryable<T> BuildOrderedQuery()
    {
        IQueryable<T> query = _context.Set<T>().AsNoTracking();

        var key = _context.Model.FindEntityType(typeof(T))?.FindPrimaryKey();
        if (key == null)
        {
            // Keyless entity types: no generic stable order is available, so the query is
            // left unordered as before.
            return query;
        }

        IOrderedQueryable<T> ordered = null;
        foreach (var property in key.Properties)
        {
            var name = property.Name;
            ordered = ordered == null
                ? query.OrderBy(e => EF.Property<object>(e, name))
                : ordered.ThenBy(e => EF.Property<object>(e, name));
        }

        return ordered ?? query;
    }
}
/// <summary>
/// Listens on a TCP port, accepts a single client and converts the UTF-8 text it sends into
/// items of type <typeparamref name="T"/>, one item per received chunk.
/// </summary>
/// <typeparam name="T">Type produced from each received chunk of text.</typeparam>
public class TcpSource<T> : ISource<T>
{
    private readonly int _port;
    private readonly Func<string, T> _converter;

    /// <summary>Creates a source that listens on <paramref name="port"/> on all network interfaces.</summary>
    /// <param name="port">TCP port to listen on.</param>
    /// <param name="converter">Maps a received chunk of text to an item.</param>
    /// <exception cref="ArgumentNullException"><paramref name="converter"/> is null.</exception>
    public TcpSource(int port, Func<string, T> converter)
    {
        _port = port;
        _converter = converter ?? throw new ArgumentNullException(nameof(converter));
    }

    /// <summary>
    /// Accepts one client and yields a converted item for each chunk read from it until the
    /// client closes the connection.
    /// </summary>
    /// <param name="token">Cancels both the wait for a client and the reads.</param>
    /// <returns>The converted chunks, in arrival order. Chunk boundaries follow the network reads.</returns>
    public async IAsyncEnumerable<T> ReadAsync(CancellationToken token = default)
    {
        var listener = new TcpListener(IPAddress.Any, _port);
        listener.Start();
        try
        {
            using var client = await AcceptAsync(listener, token);
            using var stream = client.GetStream();

            // A stateful decoder carries the bytes of a multi-byte character that was split
            // across two reads; decoding each chunk on its own would corrupt that character.
            var decoder = Encoding.UTF8.GetDecoder();
            var bytes = new byte[4096];
            var chars = new char[Encoding.UTF8.GetMaxCharCount(bytes.Length)];

            int bytesRead;
            while ((bytesRead = await stream.ReadAsync(bytes, 0, bytes.Length, token)) > 0)
            {
                var charCount = decoder.GetChars(bytes, 0, bytesRead, chars, 0);
                if (charCount == 0)
                {
                    // Only part of a character has arrived so far; wait for the rest.
                    continue;
                }

                yield return _converter(new string(chars, 0, charCount));
            }
        }
        finally
        {
            // Runs on normal completion, on failure and when the enumerator is disposed
            // early, so the port is always released.
            listener.Stop();
        }
    }

    /// <summary>
    /// Waits for a client, turning a cancellation of <paramref name="token"/> into an
    /// <see cref="OperationCanceledException"/>.
    /// </summary>
    private static async Task<TcpClient> AcceptAsync(TcpListener listener, CancellationToken token)
    {
        // The parameterless accept cannot observe the token; stopping the listener makes
        // the pending accept fail, which is then reported as a cancellation.
        using (token.Register(listener.Stop))
        {
            try
            {
                return await listener.AcceptTcpClientAsync();
            }
            catch (Exception) when (token.IsCancellationRequested)
            {
                throw new OperationCanceledException(token);
            }
        }
    }
}
/// <summary>
/// Simulated sensor: at a fixed interval it produces a random reading in the range [0, 100)
/// and converts it into an item of type <typeparamref name="T"/>.
/// </summary>
/// <typeparam name="T">Type produced from each reading.</typeparam>
public class SensorSource<T> : ISource<T>
{
    private readonly Func<double, T> _converter;
    private readonly int _intervalMs;

    /// <summary>Creates a simulated sensor.</summary>
    /// <param name="converter">Maps a raw reading to an item.</param>
    /// <param name="intervalMs">Delay between readings, in milliseconds.</param>
    public SensorSource(Func<double, T> converter, int intervalMs = 100)
    {
        _converter = converter;
        _intervalMs = intervalMs;
    }

    /// <summary>Yields readings until <paramref name="token"/> is cancelled.</summary>
    /// <param name="token">
    /// Ends the sequence when observed between readings; a cancellation during the delay
    /// surfaces as the exception thrown by <see cref="Task.Delay(int, CancellationToken)"/>.
    /// </param>
    /// <returns>An unbounded sequence of converted readings.</returns>
    public async IAsyncEnumerable<T> ReadAsync(CancellationToken token = default)
    {
        var generator = new Random();
        while (true)
        {
            if (token.IsCancellationRequested)
            {
                yield break;
            }

            await Task.Delay(_intervalMs, token);

            var reading = generator.NextDouble() * 100;
            yield return _converter(reading);
        }
    }
}
// ================================
// 4. 处理器实现
// ================================
/// <summary>
/// Processor that reports what it receives on the console; useful for demos.
/// </summary>
/// <typeparam name="T">Type of the items received.</typeparam>
public class ConsoleProcessor<T> : IProcessor<T>
{
    /// <summary>Writes the item to the console.</summary>
    /// <param name="item">Item to print.</param>
    /// <param name="token">Not used.</param>
    /// <returns>An already-completed <see cref="ValueTask"/>.</returns>
    public ValueTask ProcessAsync(T item, CancellationToken token = default)
    {
        var message = $"Processed: {item}";
        Console.WriteLine(message);

        // The default ValueTask is a successfully completed one.
        return default;
    }

    /// <summary>Writes the size of the batch to the console.</summary>
    /// <param name="batch">Batch whose item count is printed.</param>
    /// <param name="token">Not used.</param>
    /// <returns>An already-completed <see cref="ValueTask"/>.</returns>
    public ValueTask ProcessBatchAsync(List<T> batch, CancellationToken token = default)
    {
        var message = $"Processed batch of {batch.Count} items";
        Console.WriteLine(message);
        return default;
    }
}
// ================================
// 5. 使用示例
// ================================
/// <summary>
/// Entry point showing how sources and processors are wired into a
/// <see cref="StreamingEngine{T}"/>.
/// </summary>
public class Program
{
    /// <summary>Runs the file example; the other examples are left commented out.</summary>
    /// <param name="args">Command-line arguments (not used).</param>
    public static async Task Main(string[] args)
    {
        // Example 1: stream a text file, processing 100 lines per batch.
        var engine = new StreamingEngine<string>(
            new FileSource<string>("data.txt", line => line),
            new ConsoleProcessor<string>(),
            batchSize: 100,
            maxRetries: 3,
            channelCapacity: 5000);
        await engine.RunAsync();

        // Example 2: stream rows from a database (requires your own DbContext).
        // var dbContext = new MyDbContext();
        // var dbSource = new DatabaseSource<MyEntity>(dbContext);
        // var dbProcessor = new ConsoleProcessor<MyEntity>();
        // var dbEngine = new StreamingEngine<MyEntity>(
        //     dbSource,
        //     dbProcessor,
        //     batchSize: 1000);
        // await dbEngine.RunAsync();

        // Example 3: stream data received over TCP.
        // var tcpSource = new TcpSource<string>(8888, data => data);
        // var tcpProcessor = new ConsoleProcessor<string>();
        // var tcpEngine = new StreamingEngine<string>(
        //     tcpSource,
        //     tcpProcessor);
        // await tcpEngine.RunAsync();

        // Example 4: stream simulated sensor readings.
        // var sensorSource = new SensorSource<double>(value => value);
        // var sensorProcessor = new ConsoleProcessor<double>();
        // var sensorEngine = new StreamingEngine<double>(
        //     sensorSource,
        //     sensorProcessor,
        //     batchSize: 5);
        // await sensorEngine.RunAsync();
    }
}
// A minimal sample DbContext, included so the code compiles on its own.
public class MyDbContext : DbContext
{
    /// <summary>The set of <see cref="MyEntity"/> rows.</summary>
    public DbSet<MyEntity> MyEntities { get; set; }

    /// <summary>Configures the context to use an in-memory database named "TestDb".</summary>
    /// <param name="optionsBuilder">Builder on which the provider is selected.</param>
    protected override void OnConfiguring(DbContextOptionsBuilder optionsBuilder)
        => optionsBuilder.UseInMemoryDatabase("TestDb");
}
/// <summary>Sample entity used by the database example.</summary>
public class MyEntity
{
/// <summary>Integer identifier of the entity.</summary>
public int Id { get; set; }
/// <summary>Name of the entity.</summary>
public string Name { get; set; }
}
代码说明
- ISource<T>:数据源接口,你可以轻松扩展 WebSocket、Kafka 等新源
- IProcessor<T>:数据处理器接口,可实现写入数据库、调用 API 等业务逻辑
- StreamingEngine<T>:
  - 支持单条处理和批处理
  - 内置失败重试机制
  - 使用 Channel 实现背压,防止生产速度过快
- 四种数据源:
  - FileSource:大文件流式读取
  - DatabaseSource:数据库分页流式读取
  - TcpSource:TCP 网络流
  - SensorSource:实时传感器数据模拟