从事件驱动到数据发送
Redis作为一个高性能的内存数据库,其网络层的设计至关重要。让我们深入源码,看看Redis是如何优雅地将数据写回客户端的。
核心架构:事件驱动的写操作
1. 写事件处理器的注册与触发
在Redis中,写操作遵循事件驱动模式。当需要向客户端发送数据时,会注册写事件处理器:
c
/* Write event handler. Just send data to the client. */
void sendReplyToClient(connection *conn) {
client *c = connGetPrivateData(conn);
writeToClient(c,1);
}
这个函数是写事件的处理入口,当套接字变得可写时被调用。注意第二个参数handler_installed设为1,表示这个调用来自于已安装的事件处理器。
2. 多线程环境下的安全考虑
Redis在IO多线程模式下需要特别注意线程安全:
c
/* This function is called by threads, but always with handler_installed
* set to 0. So when handler_installed is set to 0 the function must be
* thread safe. */
int writeToClient(client *c, int handler_installed) {
// 线程安全的写计数器
atomicIncr(server.stat_total_writes_processed, 1);
数据写入的核心逻辑
3. 分层写入策略
Redis采用分层的数据写入策略,针对不同类型的客户端优化:
c
while(clientHasPendingReplies(c)) {
int ret = _writeToClient(c, &nwritten);
if (ret == C_ERR) break;
totwritten += nwritten;
// 流控机制:避免一个客户端占用太多资源
if (totwritten > NET_MAX_WRITES_PER_EVENT &&
(server.maxmemory == 0 ||
zmalloc_used_memory() < server.maxmemory) &&
!(c->flags & CLIENT_SLAVE)) break;
}
这里有三个重要考量:
- 公平性:限制每次事件最多写入 NET_MAX_WRITES_PER_EVENT 字节,避免单个客户端长期占用事件循环
- 内存压力:当内存使用超过 maxmemory 时忽略该限制,尽可能多写以尽快释放输出缓冲区
- 复制优化:从节点不受此限制,避免复制积压
4. 不同类型客户端的差异化处理
Redis针对主从复制进行了特殊优化:
c
if (getClientType(c) == CLIENT_TYPE_SLAVE) {
atomicIncr(server.stat_net_repl_output_bytes, totwritten);
// 从节点使用专门的复制缓冲区
replBufBlock *o = listNodeValue(c->ref_repl_buf_node);
*nwritten = connWrite(c->conn, o->buf+c->ref_block_pos,
o->used-c->ref_block_pos);
} else {
atomicIncr(server.stat_net_output_bytes, totwritten);
// 普通客户端使用常规缓冲区
if (listLength(c->reply) > 0) {
int ret = _writevToClient(c, nwritten); // 使用writev批量写入
} else if (c->bufpos > 0) {
*nwritten = connWrite(c->conn, c->buf + c->sentlen,
c->bufpos - c->sentlen);
}
}
函数指针:抽象化的网络层
5. 连接类型抽象
Redis通过函数指针实现了网络层的抽象,支持不同类型的连接:
c
typedef struct ConnectionType {
/* connection type */
connTypeGetTypeFunc *get_type;
/* IO operations */
connWriteFunc *write;
connWritevFunc *writev;
connReadFunc *read;
connSetWriteHandlerFunc *set_write_handler;
connSetReadHandlerFunc *set_read_handler;
/* ... other function pointers ... */
} ConnectionType;
6. Socket连接的具体实现
对于Socket连接,Redis提供了具体的实现:
c
static ConnectionType CT_Socket = {
.write = connSocketWrite,
.writev = connSocketWritev,
.set_write_handler = connSocketSetWriteHandler,
/* ... other function pointers ... */
};
static int connSocketWrite(connection *conn, const void *data, size_t data_len) {
int ret = write(conn->fd, data, data_len);
if (ret < 0 && errno != EAGAIN) {
conn->last_errno = errno;
if (errno != EINTR && conn->state == CONN_STATE_CONNECTED)
conn->state = CONN_STATE_ERROR;
}
return ret;
}
事件处理器的动态管理
7. 写事件的注册与移除
Redis根据数据是否写完动态管理写事件处理器:
c
if (!clientHasPendingReplies(c)) {
c->sentlen = 0;
/* Note that writeToClient() is called in a threaded way, but
* aeDeleteFileEvent() is not thread safe: however writeToClient()
* is always called with handler_installed set to 0 from threads
* so we are fine. */
if (handler_installed) {
serverAssert(io_threads_op == IO_THREADS_OP_IDLE);
connSetWriteHandler(c->conn, NULL); // 移除写事件处理器
}
/* Close connection after entire reply has been sent. */
if (c->flags & CLIENT_CLOSE_AFTER_REPLY) {
freeClientAsync(c);
return C_ERR;
}
}
8. 连接状态管理
Redis在写操作后更新客户端状态:
c
if (totwritten > 0) {
/* For clients representing masters we don't count sending data
* as an interaction, since we always send REPLCONF ACK commands
* that take some time to just fill the socket output buffer.
* We just rely on data / pings received for timeout detection. */
if (!(c->flags & CLIENT_MASTER)) c->lastinteraction = server.unixtime;
}
错误处理与资源清理
9. 写错误处理
当写操作失败时,Redis会妥善处理连接:
c
if (nwritten == -1) {
if (connGetState(c->conn) != CONN_STATE_CONNECTED) {
serverLog(LL_VERBOSE,
"Error writing to client: %s", connGetLastError(c->conn));
freeClientAsync(c); // 异步释放客户端
return C_ERR;
}
}
10. 内存使用统计更新
写操作后需要更新客户端的内存使用统计:
c
/* Update client's memory usage after writing.
* Since this isn't thread safe we do this conditionally. In case of threaded writes this is done in
* handleClientsWithPendingWritesUsingThreads(). */
if (io_threads_op == IO_THREADS_OP_IDLE)
updateClientMemUsageAndBucket(c);
设计亮点总结
- 分层抽象:通过函数指针实现连接类型的抽象,便于扩展
- 流控机制:智能控制每次事件的最大写入量,保证公平性
- 差异化处理:针对普通客户端、从节点、主节点采用不同策略
- 线程安全:在多线程环境下正确处理资源竞争
- 资源管理:及时清理已完成任务的连接
- 性能优化:使用 writev 系统调用减少系统调用次数
- 状态维护:准确记录客户端交互时间,支持超时检测
这种设计使得Redis能够在高并发场景下高效、稳定地向客户端发送数据,同时保持良好的资源管理和错误恢复能力。通过事件驱动和函数指针的巧妙运用,Redis实现了高性能、可扩展的网络通信层。
## 源码
/* Write event handler: invoked by the event loop once the socket becomes
 * writable again. Flushes pending output to the client, passing
 * handler_installed=1 because we are running from an installed handler. */
void sendReplyToClient(connection *conn) {
    writeToClient(connGetPrivateData(conn), 1);
}
/* Write data in output buffers to client. Return C_OK if the client
 * is still valid after the call, C_ERR if it was freed because of some
 * error. If handler_installed is set, it will attempt to clear the
 * write event.
 *
 * This function is called by threads, but always with handler_installed
 * set to 0. So when handler_installed is set to 0 the function must be
 * thread safe. */
int writeToClient(client *c, int handler_installed) {
    /* Update total number of writes on server (atomic: IO threads may
     * run this concurrently for different clients). */
    atomicIncr(server.stat_total_writes_processed, 1);

    ssize_t nwritten = 0, totwritten = 0;

    /* Keep flushing buffers until nothing is pending or the flow-control
     * check below decides to yield to other clients. */
    while(clientHasPendingReplies(c)) {
        int ret = _writeToClient(c, &nwritten);
        if (ret == C_ERR) break;
        totwritten += nwritten;
        /* Note that we avoid to send more than NET_MAX_WRITES_PER_EVENT
         * bytes, in a single threaded server it's a good idea to serve
         * other clients as well, even if a very large request comes from
         * super fast link that is always able to accept data (in real world
         * scenario think about 'KEYS *' against the loopback interface).
         *
         * However if we are over the maxmemory limit we ignore that and
         * just deliver as much data as it is possible to deliver.
         *
         * Moreover, we also send as much as possible if the client is
         * a slave or a monitor (otherwise, on high-speed traffic, the
         * replication/output buffer will grow indefinitely) */
        if (totwritten > NET_MAX_WRITES_PER_EVENT &&
            (server.maxmemory == 0 ||
             zmalloc_used_memory() < server.maxmemory) &&
            !(c->flags & CLIENT_SLAVE)) break;
    }

    /* Account the written bytes on the right (atomic) counter: replication
     * traffic is tracked separately from normal client output. */
    if (getClientType(c) == CLIENT_TYPE_SLAVE) {
        atomicIncr(server.stat_net_repl_output_bytes, totwritten);
    } else {
        atomicIncr(server.stat_net_output_bytes, totwritten);
    }

    /* A -1 from the last write attempt is fatal only if the connection
     * actually left the CONNECTED state; EAGAIN-like conditions leave the
     * state untouched (see connWrite's contract). */
    if (nwritten == -1) {
        if (connGetState(c->conn) != CONN_STATE_CONNECTED) {
            serverLog(LL_VERBOSE,
                "Error writing to client: %s", connGetLastError(c->conn));
            freeClientAsync(c);
            return C_ERR;
        }
    }
    if (totwritten > 0) {
        /* For clients representing masters we don't count sending data
         * as an interaction, since we always send REPLCONF ACK commands
         * that take some time to just fill the socket output buffer.
         * We just rely on data / pings received for timeout detection. */
        if (!(c->flags & CLIENT_MASTER)) c->lastinteraction = server.unixtime;
    }
    if (!clientHasPendingReplies(c)) {
        c->sentlen = 0;
        /* Note that writeToClient() is called in a threaded way, but
         * aeDeleteFileEvent() is not thread safe: however writeToClient()
         * is always called with handler_installed set to 0 from threads
         * so we are fine. */
        if (handler_installed) {
            serverAssert(io_threads_op == IO_THREADS_OP_IDLE);
            connSetWriteHandler(c->conn, NULL);
        }

        /* Close connection after entire reply has been sent. */
        if (c->flags & CLIENT_CLOSE_AFTER_REPLY) {
            freeClientAsync(c);
            return C_ERR;
        }
    }
    /* Update client's memory usage after writing.
     * Since this isn't thread safe we do this conditionally. In case of
     * threaded writes this is done in
     * handleClientsWithPendingWritesUsingThreads(). */
    if (io_threads_op == IO_THREADS_OP_IDLE)
        updateClientMemUsageAndBucket(c);
    return C_OK;
}
/* This function does actual writing output buffers to different types of
 * clients, it is called by writeToClient.
 * If we write successfully, it returns C_OK, otherwise, C_ERR is returned,
 * and 'nwritten' is an output parameter, it means how many bytes server write
 * to client. */
int _writeToClient(client *c, ssize_t *nwritten) {
    *nwritten = 0;

    /* Replicas are served from the shared replication backlog blocks,
     * never from the per-client reply buffers (asserted below). */
    if (getClientType(c) == CLIENT_TYPE_SLAVE) {
        serverAssert(c->bufpos == 0 && listLength(c->reply) == 0);

        replBufBlock *o = listNodeValue(c->ref_repl_buf_node);
        serverAssert(o->used >= c->ref_block_pos);
        /* Send current block if it is not fully sent. */
        if (o->used > c->ref_block_pos) {
            *nwritten = connWrite(c->conn, o->buf+c->ref_block_pos,
                                  o->used-c->ref_block_pos);
            if (*nwritten <= 0) return C_ERR;
            c->ref_block_pos += *nwritten;
        }

        /* If we fully sent the object on head, go to the next one.
         * Move this replica's reference from the drained block to the next,
         * so blocks with no remaining references can be trimmed. */
        listNode *next = listNextNode(c->ref_repl_buf_node);
        if (next && c->ref_block_pos == o->used) {
            o->refcount--;
            ((replBufBlock *)(listNodeValue(next)))->refcount++;
            c->ref_repl_buf_node = next;
            c->ref_block_pos = 0;
            incrementalTrimReplicationBacklog(REPL_BACKLOG_TRIM_BLOCKS_PER_CALL);
        }
        return C_OK;
    }

    /* When the reply list is not empty, it's better to use writev to save us some
     * system calls and TCP packets. */
    if (listLength(c->reply) > 0) {
        int ret = _writevToClient(c, nwritten);
        if (ret != C_OK) return ret;

        /* If there are no longer objects in the list, we expect
         * the count of reply bytes to be exactly zero. */
        if (listLength(c->reply) == 0)
            serverAssert(c->reply_bytes == 0);
    } else if (c->bufpos > 0) {
        /* Only the static output buffer has data: send the unsent tail,
         * i.e. bytes in the range [sentlen, bufpos). */
        *nwritten = connWrite(c->conn, c->buf + c->sentlen, c->bufpos - c->sentlen);
        if (*nwritten <= 0) return C_ERR;
        c->sentlen += *nwritten;

        /* If the buffer was sent, set bufpos to zero to continue with
         * the remainder of the reply. */
        if ((int)c->sentlen == c->bufpos) {
            c->bufpos = 0;
            c->sentlen = 0;
        }
    }
    return C_OK;
}
/* Write to connection, behaves the same as write(2).
 *
 * Like write(2), a short write is possible. A -1 return indicates an error.
 *
 * The caller should NOT rely on errno. To test for an EAGAIN-like condition,
 * use connGetState() and check whether the connection state is still
 * CONN_STATE_CONNECTED. */
static inline int connWrite(connection *conn, const void *data, size_t data_len) {
    /* Dispatch through the connection type's method table, so the same
     * call works for any connection implementation. */
    return (*conn->type->write)(conn, data, data_len);
}
/* Install 'func' as the handler to be called when the connection becomes
 * writable; passing NULL removes the currently installed handler.
 * The barrier flag of the underlying slot is left at 0. */
static inline int connSetWriteHandler(connection *conn, ConnectionCallbackFunc func) {
    return (*conn->type->set_write_handler)(conn, func, 0);
}
/* ConnectionType vtable slot: register 'handler' to run when the connection
 * is writable (NULL removes it). NOTE(review): 'barrier' presumably controls
 * write-before-read handler ordering within one event loop iteration —
 * confirm against the ae event loop implementation. */
int (*set_write_handler)(struct connection *conn, ConnectionCallbackFunc handler, int barrier);
/* Method table ("vtable") binding the generic connection API to the plain
 * TCP socket implementation; other connection types supply their own
 * ConnectionType with the same layout. NULL slots mean the operation is
 * not needed for raw sockets (e.g. no buffered pending data). */
static ConnectionType CT_Socket = {
    /* connection type */
    .get_type = connSocketGetType,
    /* connection type initialize & finalize & configure */
    .init = NULL,
    .cleanup = NULL,
    .configure = NULL,
    /* ae & accept & listen & error & address handler */
    .ae_handler = connSocketEventHandler,
    .accept_handler = connSocketAcceptHandler,
    .addr = connSocketAddr,
    .is_local = connSocketIsLocal,
    .listen = connSocketListen,
    /* create/shutdown/close connection */
    .conn_create = connCreateSocket,
    .conn_create_accepted = connCreateAcceptedSocket,
    .shutdown = connSocketShutdown,
    .close = connSocketClose,
    /* connect & accept */
    .connect = connSocketConnect,
    .blocking_connect = connSocketBlockingConnect,
    .accept = connSocketAccept,
    /* IO */
    .write = connSocketWrite,
    .writev = connSocketWritev,
    .read = connSocketRead,
    .set_write_handler = connSocketSetWriteHandler,
    .set_read_handler = connSocketSetReadHandler,
    .get_last_error = connSocketGetLastError,
    .sync_write = connSocketSyncWrite,
    .sync_read = connSocketSyncRead,
    .sync_readline = connSocketSyncReadLine,
    /* pending data */
    .has_pending_data = NULL,
    .process_pending_data = NULL,
};
/* Allocate and return a fresh, not-yet-connected socket connection object.
 * zcalloc zero-initializes every field we don't set explicitly; fd starts
 * at -1 (no underlying socket yet) and iovcnt is capped at IOV_MAX for
 * writev-based batch writes. */
static connection *connCreateSocket(void) {
    connection *c = zcalloc(sizeof(*c));
    c->type = &CT_Socket;
    c->fd = -1;
    c->iovcnt = IOV_MAX;
    return c;
}
/* write(2) wrapper for plain TCP connections. Returns the number of bytes
 * written or -1 on error; like write(2), a short write is possible.
 *
 * On a real error (anything except EAGAIN) we record errno on the
 * connection and — unless it was a mere EINTR — flag a previously
 * CONNECTED connection as errored. Connections in other states keep
 * their state, not to mess with handler callbacks. */
static int connSocketWrite(connection *conn, const void *data, size_t data_len) {
    int nwritten = write(conn->fd, data, data_len);
    if (nwritten >= 0 || errno == EAGAIN) return nwritten;

    conn->last_errno = errno;
    if (errno != EINTR && conn->state == CONN_STATE_CONNECTED)
        conn->state = CONN_STATE_ERROR;
    return nwritten;
}