从事件驱动到数据发送
Redis作为一个高性能的内存数据库,其网络层的设计至关重要。让我们深入源码,看看Redis是如何优雅地将数据写回客户端的。
核心架构:事件驱动的写操作
1. 写事件处理器的注册与触发
在Redis中,写操作遵循事件驱动模式。当需要向客户端发送数据时,会注册写事件处理器:
c
/* Write event handler. Just send data to the client. */
void sendReplyToClient(connection *conn) {
client *c = connGetPrivateData(conn);
writeToClient(c,1);
}
这个函数是写事件的处理入口,当套接字变得可写时被调用。注意第二个参数handler_installed设为1,表示这个调用来自于已安装的事件处理器。
2. 多线程环境下的安全考虑
Redis在IO多线程模式下需要特别注意线程安全:
c
/* This function is called by threads, but always with handler_installed
* set to 0. So when handler_installed is set to 0 the function must be
* thread safe. */
int writeToClient(client *c, int handler_installed) {
// 线程安全的写计数器
atomicIncr(server.stat_total_writes_processed, 1);
数据写入的核心逻辑
3. 分层写入策略
Redis采用分层的数据写入策略,针对不同类型的客户端优化:
c
while(clientHasPendingReplies(c)) {
int ret = _writeToClient(c, &nwritten);
if (ret == C_ERR) break;
totwritten += nwritten;
// 流控机制:避免一个客户端占用太多资源
if (totwritten > NET_MAX_WRITES_PER_EVENT &&
(server.maxmemory == 0 ||
zmalloc_used_memory() < server.maxmemory) &&
!(c->flags & CLIENT_SLAVE)) break;
}
这里有三个重要考量:
- 公平性:限制每次事件最多写入 NET_MAX_WRITES_PER_EVENT 字节,避免单个客户端长期占用事件循环
- 内存压力:当内存使用超过 maxmemory 时忽略该限制,尽可能多写以尽快释放输出缓冲区
- 复制优化:从节点不受此限制,避免复制积压
4. 不同类型客户端的差异化处理
Redis针对主从复制进行了特殊优化:
c
if (getClientType(c) == CLIENT_TYPE_SLAVE) {
atomicIncr(server.stat_net_repl_output_bytes, totwritten);
// 从节点使用专门的复制缓冲区
replBufBlock *o = listNodeValue(c->ref_repl_buf_node);
*nwritten = connWrite(c->conn, o->buf+c->ref_block_pos,
o->used-c->ref_block_pos);
} else {
atomicIncr(server.stat_net_output_bytes, totwritten);
// 普通客户端使用常规缓冲区
if (listLength(c->reply) > 0) {
int ret = _writevToClient(c, nwritten); // 使用writev批量写入
} else if (c->bufpos > 0) {
*nwritten = connWrite(c->conn, c->buf + c->sentlen,
c->bufpos - c->sentlen);
}
}
函数指针:抽象化的网络层
5. 连接类型抽象
Redis通过函数指针实现了网络层的抽象,支持不同类型的连接:
c
typedef struct ConnectionType {
/* connection type */
connTypeGetTypeFunc *get_type;
/* IO operations */
connWriteFunc *write;
connWritevFunc *writev;
connReadFunc *read;
connSetWriteHandlerFunc *set_write_handler;
connSetReadHandlerFunc *set_read_handler;
/* ... other function pointers ... */
} ConnectionType;
6. Socket连接的具体实现
对于Socket连接,Redis提供了具体的实现:
c
static ConnectionType CT_Socket = {
.write = connSocketWrite,
.writev = connSocketWritev,
.set_write_handler = connSocketSetWriteHandler,
/* ... other function pointers ... */
};
static int connSocketWrite(connection *conn, const void *data, size_t data_len) {
int ret = write(conn->fd, data, data_len);
if (ret < 0 && errno != EAGAIN) {
conn->last_errno = errno;
if (errno != EINTR && conn->state == CONN_STATE_CONNECTED)
conn->state = CONN_STATE_ERROR;
}
return ret;
}
事件处理器的动态管理
7. 写事件的注册与移除
Redis根据数据是否写完动态管理写事件处理器:
c
if (!clientHasPendingReplies(c)) {
c->sentlen = 0;
/* Note that writeToClient() is called in a threaded way, but
* aeDeleteFileEvent() is not thread safe: however writeToClient()
* is always called with handler_installed set to 0 from threads
* so we are fine. */
if (handler_installed) {
serverAssert(io_threads_op == IO_THREADS_OP_IDLE);
connSetWriteHandler(c->conn, NULL); // 移除写事件处理器
}
/* Close connection after entire reply has been sent. */
if (c->flags & CLIENT_CLOSE_AFTER_REPLY) {
freeClientAsync(c);
return C_ERR;
}
}
8. 连接状态管理
Redis在写操作后更新客户端状态:
c
if (totwritten > 0) {
/* For clients representing masters we don't count sending data
* as an interaction, since we always send REPLCONF ACK commands
* that take some time to just fill the socket output buffer.
* We just rely on data / pings received for timeout detection. */
if (!(c->flags & CLIENT_MASTER)) c->lastinteraction = server.unixtime;
}
错误处理与资源清理
9. 写错误处理
当写操作失败时,Redis会妥善处理连接:
c
if (nwritten == -1) {
if (connGetState(c->conn) != CONN_STATE_CONNECTED) {
serverLog(LL_VERBOSE,
"Error writing to client: %s", connGetLastError(c->conn));
freeClientAsync(c); // 异步释放客户端
return C_ERR;
}
}
10. 内存使用统计更新
写操作后需要更新客户端的内存使用统计:
c
/* Update client's memory usage after writing.
* Since this isn't thread safe we do this conditionally. In case of threaded writes this is done in
* handleClientsWithPendingWritesUsingThreads(). */
if (io_threads_op == IO_THREADS_OP_IDLE)
updateClientMemUsageAndBucket(c);
设计亮点总结
- 分层抽象:通过函数指针实现连接类型的抽象,便于扩展
- 流控机制:智能控制每次事件的最大写入量,保证公平性
- 差异化处理:针对普通客户端、从节点、主节点采用不同策略
- 线程安全:在多线程环境下正确处理资源竞争
- 资源管理:及时清理已完成任务的连接
- 性能优化:使用 writev 系统调用减少系统调用次数
- 状态维护:准确记录客户端交互时间,支持超时检测
这种设计使得Redis能够在高并发场景下高效、稳定地向客户端发送数据,同时保持良好的资源管理和错误恢复能力。通过事件驱动和函数指针的巧妙运用,Redis实现了高性能、可扩展的网络通信层。
## 源码
/* Write event handler: invoked by the event loop once the socket becomes
 * writable again. Flushes pending output to the client, passing
 * handler_installed=1 because we are running from an installed handler. */
void sendReplyToClient(connection *conn) {
    writeToClient(connGetPrivateData(conn), 1);
}
/* Write data in output buffers to client. Return C_OK if the client
 * is still valid after the call, C_ERR if it was freed because of some
 * error. If handler_installed is set, it will attempt to clear the
 * write event.
 *
 * This function is called by threads, but always with handler_installed
 * set to 0. So when handler_installed is set to 0 the function must be
 * thread safe. */
int writeToClient(client *c, int handler_installed) {
    /* Update total number of writes on server (atomic: IO threads may
     * run this concurrently for different clients). */
    atomicIncr(server.stat_total_writes_processed, 1);

    ssize_t nwritten = 0, totwritten = 0;

    /* Keep flushing buffers until nothing is pending or the flow-control
     * check below decides to yield to other clients. */
    while(clientHasPendingReplies(c)) {
        int ret = _writeToClient(c, &nwritten);
        if (ret == C_ERR) break;
        totwritten += nwritten;
        /* Note that we avoid to send more than NET_MAX_WRITES_PER_EVENT
         * bytes, in a single threaded server it's a good idea to serve
         * other clients as well, even if a very large request comes from
         * super fast link that is always able to accept data (in real world
         * scenario think about 'KEYS *' against the loopback interface).
         *
         * However if we are over the maxmemory limit we ignore that and
         * just deliver as much data as it is possible to deliver.
         *
         * Moreover, we also send as much as possible if the client is
         * a slave or a monitor (otherwise, on high-speed traffic, the
         * replication/output buffer will grow indefinitely) */
        if (totwritten > NET_MAX_WRITES_PER_EVENT &&
            (server.maxmemory == 0 ||
             zmalloc_used_memory() < server.maxmemory) &&
            !(c->flags & CLIENT_SLAVE)) break;
    }

    /* Account the written bytes on the right (atomic) counter: replication
     * traffic is tracked separately from normal client output. */
    if (getClientType(c) == CLIENT_TYPE_SLAVE) {
        atomicIncr(server.stat_net_repl_output_bytes, totwritten);
    } else {
        atomicIncr(server.stat_net_output_bytes, totwritten);
    }

    /* A -1 from the last write attempt is fatal only if the connection
     * actually left the CONNECTED state; EAGAIN-like conditions leave the
     * state untouched (see connWrite's contract). */
    if (nwritten == -1) {
        if (connGetState(c->conn) != CONN_STATE_CONNECTED) {
            serverLog(LL_VERBOSE,
                "Error writing to client: %s", connGetLastError(c->conn));
            freeClientAsync(c);
            return C_ERR;
        }
    }
    if (totwritten > 0) {
        /* For clients representing masters we don't count sending data
         * as an interaction, since we always send REPLCONF ACK commands
         * that take some time to just fill the socket output buffer.
         * We just rely on data / pings received for timeout detection. */
        if (!(c->flags & CLIENT_MASTER)) c->lastinteraction = server.unixtime;
    }
    if (!clientHasPendingReplies(c)) {
        c->sentlen = 0;
        /* Note that writeToClient() is called in a threaded way, but
         * aeDeleteFileEvent() is not thread safe: however writeToClient()
         * is always called with handler_installed set to 0 from threads
         * so we are fine. */
        if (handler_installed) {
            serverAssert(io_threads_op == IO_THREADS_OP_IDLE);
            connSetWriteHandler(c->conn, NULL);
        }

        /* Close connection after entire reply has been sent. */
        if (c->flags & CLIENT_CLOSE_AFTER_REPLY) {
            freeClientAsync(c);
            return C_ERR;
        }
    }
    /* Update client's memory usage after writing.
     * Since this isn't thread safe we do this conditionally. In case of
     * threaded writes this is done in
     * handleClientsWithPendingWritesUsingThreads(). */
    if (io_threads_op == IO_THREADS_OP_IDLE)
        updateClientMemUsageAndBucket(c);
    return C_OK;
}
/* This function does actual writing output buffers to different types of
 * clients, it is called by writeToClient.
 * If we write successfully, it returns C_OK, otherwise, C_ERR is returned,
 * and 'nwritten' is an output parameter, it means how many bytes server write
 * to client. */
int _writeToClient(client *c, ssize_t *nwritten) {
    *nwritten = 0;

    /* Replicas are served from the shared replication backlog blocks,
     * never from the per-client reply buffers (asserted below). */
    if (getClientType(c) == CLIENT_TYPE_SLAVE) {
        serverAssert(c->bufpos == 0 && listLength(c->reply) == 0);

        replBufBlock *o = listNodeValue(c->ref_repl_buf_node);
        serverAssert(o->used >= c->ref_block_pos);
        /* Send current block if it is not fully sent. */
        if (o->used > c->ref_block_pos) {
            *nwritten = connWrite(c->conn, o->buf+c->ref_block_pos,
                                  o->used-c->ref_block_pos);
            if (*nwritten <= 0) return C_ERR;
            c->ref_block_pos += *nwritten;
        }

        /* If we fully sent the object on head, go to the next one.
         * Move this replica's reference from the drained block to the next,
         * so blocks with no remaining references can be trimmed. */
        listNode *next = listNextNode(c->ref_repl_buf_node);
        if (next && c->ref_block_pos == o->used) {
            o->refcount--;
            ((replBufBlock *)(listNodeValue(next)))->refcount++;
            c->ref_repl_buf_node = next;
            c->ref_block_pos = 0;
            incrementalTrimReplicationBacklog(REPL_BACKLOG_TRIM_BLOCKS_PER_CALL);
        }
        return C_OK;
    }

    /* When the reply list is not empty, it's better to use writev to save us some
     * system calls and TCP packets. */
    if (listLength(c->reply) > 0) {
        int ret = _writevToClient(c, nwritten);
        if (ret != C_OK) return ret;

        /* If there are no longer objects in the list, we expect
         * the count of reply bytes to be exactly zero. */
        if (listLength(c->reply) == 0)
            serverAssert(c->reply_bytes == 0);
    } else if (c->bufpos > 0) {
        /* Only the static output buffer has data: send the unsent tail,
         * i.e. bytes in the range [sentlen, bufpos). */
        *nwritten = connWrite(c->conn, c->buf + c->sentlen, c->bufpos - c->sentlen);
        if (*nwritten <= 0) return C_ERR;
        c->sentlen += *nwritten;

        /* If the buffer was sent, set bufpos to zero to continue with
         * the remainder of the reply. */
        if ((int)c->sentlen == c->bufpos) {
            c->bufpos = 0;
            c->sentlen = 0;
        }
    }
    return C_OK;
}
/* Write to connection, behaves the same as write(2).
 *
 * Like write(2), a short write is possible. A -1 return indicates an error.
 *
 * The caller should NOT rely on errno. To test for an EAGAIN-like condition,
 * use connGetState() and check whether the connection state is still
 * CONN_STATE_CONNECTED. */
static inline int connWrite(connection *conn, const void *data, size_t data_len) {
    /* Dispatch through the connection type's method table, so the same
     * call works for any connection implementation. */
    return (*conn->type->write)(conn, data, data_len);
}
/* Install 'func' as the handler to be called when the connection becomes
 * writable; passing NULL removes the currently installed handler.
 * The barrier flag of the underlying slot is left at 0. */
static inline int connSetWriteHandler(connection *conn, ConnectionCallbackFunc func) {
    return (*conn->type->set_write_handler)(conn, func, 0);
}
/* ConnectionType vtable slot: register 'handler' to run when the connection
 * is writable (NULL removes it). NOTE(review): 'barrier' presumably controls
 * write-before-read handler ordering within one event loop iteration —
 * confirm against the ae event loop implementation. */
int (*set_write_handler)(struct connection *conn, ConnectionCallbackFunc handler, int barrier);
/* Method table ("vtable") binding the generic connection API to the plain
 * TCP socket implementation; other connection types supply their own
 * ConnectionType with the same layout. NULL slots mean the operation is
 * not needed for raw sockets (e.g. no buffered pending data). */
static ConnectionType CT_Socket = {
    /* connection type */
    .get_type = connSocketGetType,
    /* connection type initialize & finalize & configure */
    .init = NULL,
    .cleanup = NULL,
    .configure = NULL,
    /* ae & accept & listen & error & address handler */
    .ae_handler = connSocketEventHandler,
    .accept_handler = connSocketAcceptHandler,
    .addr = connSocketAddr,
    .is_local = connSocketIsLocal,
    .listen = connSocketListen,
    /* create/shutdown/close connection */
    .conn_create = connCreateSocket,
    .conn_create_accepted = connCreateAcceptedSocket,
    .shutdown = connSocketShutdown,
    .close = connSocketClose,
    /* connect & accept */
    .connect = connSocketConnect,
    .blocking_connect = connSocketBlockingConnect,
    .accept = connSocketAccept,
    /* IO */
    .write = connSocketWrite,
    .writev = connSocketWritev,
    .read = connSocketRead,
    .set_write_handler = connSocketSetWriteHandler,
    .set_read_handler = connSocketSetReadHandler,
    .get_last_error = connSocketGetLastError,
    .sync_write = connSocketSyncWrite,
    .sync_read = connSocketSyncRead,
    .sync_readline = connSocketSyncReadLine,
    /* pending data */
    .has_pending_data = NULL,
    .process_pending_data = NULL,
};
/* Allocate and return a fresh, not-yet-connected socket connection object.
 * zcalloc zero-initializes every field we don't set explicitly; fd starts
 * at -1 (no underlying socket yet) and iovcnt is capped at IOV_MAX for
 * writev-based batch writes. */
static connection *connCreateSocket(void) {
    connection *c = zcalloc(sizeof(*c));
    c->type = &CT_Socket;
    c->fd = -1;
    c->iovcnt = IOV_MAX;
    return c;
}
/* write(2) wrapper for plain TCP connections. Returns the number of bytes
 * written or -1 on error; like write(2), a short write is possible.
 *
 * On a real error (anything except EAGAIN) we record errno on the
 * connection and — unless it was a mere EINTR — flag a previously
 * CONNECTED connection as errored. Connections in other states keep
 * their state, not to mess with handler callbacks. */
static int connSocketWrite(connection *conn, const void *data, size_t data_len) {
    int nwritten = write(conn->fd, data, data_len);
    if (nwritten >= 0 || errno == EAGAIN) return nwritten;

    conn->last_errno = errno;
    if (errno != EINTR && conn->state == CONN_STATE_CONNECTED)
        conn->state = CONN_STATE_ERROR;
    return nwritten;
}