记录一个因为错用epoll的EPOLLET(边缘触发模式)导致的问题:用epoll实现TCP服务端后,使用多线程并发创建多个客户端时,只有部分客户端的连接被服务端接受。具体的实现以及解决问题的步骤如下:
1.epoll监听tcp服务端口
创建tcp套接字,监听服务端口,创建epoll队列,epoll监听socket状态
在ServerListener.cpp中通过函数创建server socket,并创建listen epoll,但是listen epoll使用的events参数为EPOLLIN | EPOLLET;(边缘模式)。然后AcceptConnection()函数用于处理一次epoll_wait()触发事件。
在TcpServerBase.cpp中有一个线程循环执行AcceptConnection()函数用于持续接收连接请求。
cpp
# ServerListener.cpp
/// Creates the listening TCP socket for `port` on all local IPv4 addresses and
/// registers it, edge-triggered, with a newly created epoll instance.
///
/// @param port TCP port to listen on, in host byte order.
/// @return true when every step succeeded; false as soon as any call fails.
bool ServerListener::CreateSocket(int port) {
  server_socket_ = socket(AF_INET, SOCK_STREAM, 0);
  if (server_socket_ < 0) {
    return false;
  }
  int flag = 1;
  if (setsockopt(server_socket_, SOL_SOCKET, SO_REUSEADDR, &flag, sizeof(flag)) == -1) {
    return false;
  }
  // ....... other code
  struct sockaddr_in server_addr;
  memset(&server_addr, 0, sizeof(server_addr));
  server_addr.sin_family = AF_INET;
  server_addr.sin_port = htons(port);
  server_addr.sin_addr.s_addr = INADDR_ANY;
  // A failed bind() (e.g. port already in use) must abort setup; otherwise
  // listen() would silently listen on an ephemeral port instead of `port`.
  if (bind(server_socket_, reinterpret_cast<struct sockaddr*>(&server_addr),
           sizeof(server_addr)) != 0) {
    return false;
  }
  if (listen(server_socket_, 1024) != 0) {
    return false;
  }
  if ((listen_epoll_ = epoll_create(1)) < 0) {
    return false;
  }
  struct epoll_event ev;
  memset(&ev, 0, sizeof(ev));
  // EPOLLET: the listening socket is reported only when its readiness changes,
  // not for as long as connections remain queued.
  ev.events = EPOLLIN | EPOLLET;
  ev.data.fd = server_socket_;
  if (epoll_ctl(listen_epoll_, EPOLL_CTL_ADD, server_socket_, &ev) < 0) {
    return false;
  }
  return true;
}
// Handles a single epoll_wait() wake-up on listen_epoll_: blocks with no
// timeout (-1) until events are reported, then calls accept() on
// server_socket_ once per reported event and logs each accepted peer.
// Returns without accepting anything when epoll_wait() yields no events or fails.
//
// NOTE(review): nready counts reported events, not queued connections. With the
// EPOLLET registration made in CreateSocket, connections still queued after
// this single accept() are not reported again until another connection arrives.
void ServerListener::AcceptConnection() {
int c_socket;
// tv and ipv6buf are not used by the statements shown here; presumably the
// elided code below uses them.
timeval tv;
std::string client_ip;
int client_port;
char ipv6buf[INET6_ADDRSTRLEN];
struct epoll_event events[5];
int nready = epoll_wait(listen_epoll_, events, 5, -1);
if (nready <= 0) {
return;
}
std::cout << "nready " << nready << std::endl;
for (int i = 0; i < nready; i++) {
struct sockaddr_in addr;
socklen_t addr_len = sizeof(addr);
c_socket = accept(server_socket_, (struct sockaddr*)&addr, &addr_len);
if (c_socket < 0) {
// accept() failed for this event; move on to the next reported event.
continue;
}
// Convert the peer address to host byte order / dotted-decimal text for logging.
client_port = ntohs(addr.sin_port);
client_ip = inet_ntoa(addr.sin_addr);
std::cout << "accept client " << client_ip << ":" << client_port << " " << c_socket << std::endl;
// ...... other code
}
}
cpp
# TcpServerBase.cpp
/// Accept loop: handles one listener wake-up per iteration and re-checks
/// running_ before each one, returning once the flag reads false.
void TcpServerBase::listening() {
  for (;;) {
    if (!running_.load()) {
      return;
    }
    ServerListener::instance().AcceptConnection();
  }
}
2.多线程并发创建客户端
模拟并发,循环使用多线程创建客户端,并请求连接服务端,发送数据和接收服务端返回数据。
cpp
# testclients.cc
// Body of one test-client thread: opens a TCP socket, connects to host:port,
// logs the outcome, then loops while running_ is set, sleeping 5 seconds
// before each send/receive round. The send/recv statements themselves are
// elided in this listing (`send error` / `recv error` are placeholders).
void running() {
int sockfd = socket(AF_INET, SOCK_STREAM, 0);
struct sockaddr_in server;
server.sin_family = AF_INET;
server.sin_addr.s_addr = inet_addr(host.c_str());
server.sin_port = htons(port);
std::stringstream ss_send;
// Record the connect() outcome; on success the local socket fd is appended to the message.
if (connect(sockfd, (struct sockaddr*)&server, sizeof(server)) == 0) {
ss_send << "connect to " << host << ":" << port << " " << sockfd;
} else {
ss_send << "connect to " << host << ":" << port << " failed";
}
// NOTE(review): the timeouts are applied and the loop below is entered even
// when connect() failed. The unit of the `1` argument is presumably seconds — confirm.
SetSendTimeOut(sockfd, 1);
SetRecvTimeOut(sockfd, 1);
std::cout << ss_send.str() << std::endl;
std::string message = "client test request";
int seq_num_ = 0;
while (running_.load()) {
std::this_thread::sleep_for(std::chrono::seconds(5));
// ...... send and recv
seq_num_++;
if (send error) {
ss_send << "send failed: " << strerror(errno) << std::endl;
// Skip the receive step for this round when sending failed.
continue;
}
// ...... other code
if(recv error) {
// ss_recv is not declared in the statements shown; presumably defined in the elided code.
ss_recv << "receive failed: " << strerror(errno) << std::endl;
}
// ...... other code
}
}
/// Test driver: starts 10 client threads back-to-back, each executing
/// running(), then idles until running_ is cleared and joins the workers.
int main(int argc, char** argv) {
  running_.store(true);
  std::vector<std::thread> threads;
  threads.reserve(10);
  for (int i = 0; i < 10; i++) {
    threads.emplace_back(running);
  }
  // Sleep between checks of running_ instead of spinning in an empty
  // `while (true)`: a side-effect-free infinite loop is undefined behaviour
  // before C++26, pins a CPU core, and made the join code below unreachable.
  while (running_.load()) {
    std::this_thread::sleep_for(std::chrono::seconds(1));
  }
  // running_ is false here, so every worker leaves its loop and can be joined.
  for (auto& t : threads) {
    t.join();
  }
  return 0;
}
3.启动服务端和客户端
服务端启动之后监听端口,客户端创建连接,客户端日志查看创建了10个连接,但是服务端日志却没有10次触发accept(),即服务端连接创建数 < 客户端请求连接数。
现象和猜疑:
客户端一共发了10个请求,服务端只接受了8个请求,我甚至怀疑是不是并发导致socket冲突了之类的,所以我把socket fd打印出来了。
cpp
# server log
starting server at 127.0.0.1:6666
nready 1
accept client 127.0.0.1:50142 5
insert client 127.0.0.1:50142
nready 1
accept client 127.0.0.1:50144 7
insert client 127.0.0.1:50144
nready 1
accept client 127.0.0.1:50146 8
insert client 127.0.0.1:50146
nready 1
accept client 127.0.0.1:50148 9
insert client 127.0.0.1:50148
nready 1
accept client 127.0.0.1:50150 10
insert client 127.0.0.1:50150
nready 1
accept client 127.0.0.1:50152 11
insert client 127.0.0.1:50152
nready 1
accept client 127.0.0.1:50154 12
insert client 127.0.0.1:50154
nready 1
accept client 127.0.0.1:50156 13
insert client 127.0.0.1:50156
cpp
# client log
connect to 127.0.0.1:6666 7
connect to 127.0.0.1:6666 10
connect to 127.0.0.1:6666 4
connect to 127.0.0.1:6666 5
connect to 127.0.0.1:6666 3
connect to 127.0.0.1:6666 6
connect to 127.0.0.1:6666 9
connect to 127.0.0.1:6666 8
connect to 127.0.0.1:6666 11
connect to 127.0.0.1:6666 12
cpp
# client send and recv log
1 : send message to 127.0.0.1:6666
1 : send message to 127.0.0.1:6666
1 : send message to 127.0.0.1:6666
1 : send message to 127.0.0.1:6666
1 : send message to 127.0.0.1:6666
1 : send message to 127.0.0.1:6666
1 : send message to 127.0.0.1:6666
1 : send message to 127.0.0.1:6666
1 : send message to 127.0.0.1:6666
1 : send message to 127.0.0.1:6666
1 : receive message from 127.0.0.1:6666,size:24
receive message from 127.0.0.1:6666, msg:deal_service_ex response,size:24
1 : receive message from 127.0.0.1:6666,size:24
receive message from 127.0.0.1:6666, msg:deal_service_ex response,size:24
1 : receive message from 127.0.0.1:6666,size:24
receive message from 127.0.0.1:6666, msg:deal_service_ex response,size:24
1 : receive message from 127.0.0.1:6666,size:24
receive message from 127.0.0.1:6666, msg:deal_service_ex response,size:24
1 : receive message from 127.0.0.1:6666,size:24
receive message from 127.0.0.1:6666, msg:deal_service_ex response,size:24
1 : receive message from 127.0.0.1:6666,size:24
receive message from 127.0.0.1:6666, msg:deal_service_ex response,size:24
1 : receive message from 127.0.0.1:6666,size:24
receive message from 127.0.0.1:6666, msg:deal_service_ex response,size:24
1 : receive message from 127.0.0.1:6666,size:24
receive message from 127.0.0.1:6666, msg:deal_service_ex response,size:24
recv error 11
1 : receive message from 127.0.0.1:6666,size:-1
receive failed: Resource temporarily unavailable
recv error 11
1 : receive message from 127.0.0.1:6666,size:-1
receive failed: Resource temporarily unavailable
cpp
# server recv log
127.0.0.1:50148 recv: client test request
127.0.0.1:50154 recv: client test request
127.0.0.1:50156 recv: client test request
127.0.0.1:50144 recv: client test request
127.0.0.1:50150 recv: client test request
127.0.0.1:50152 recv: client test request
127.0.0.1:50142 recv: client test request
127.0.0.1:50146 recv: client test request
根据客户端和服务端的日志查看,毫无疑问,服务端接受连接成功的8个客户端可以发送和接收数据;另外存在2个未被接受的连接请求,它们可以发送数据,但是无法接收数据(原因是TCP三次握手由内核完成,连接已经进入服务端的accept队列,所以客户端connect()成功并且可以发送,数据会缓存在内核中;但服务端没有accept()这个连接,也就不会读取和回复。接收报错是因为我设置了接收超时,如果不设置接收超时会一直卡在recv())。
4.进入调试问题阶段
基于上面的怀疑,我在循环创建客户端的时候加上sleep间隔,避免客户端同时创建造成冲突。修改代码如下:
cpp
# testclients.cc
/// Test driver: starts 10 client threads, each executing running(), pausing
/// 50 ms before each start, then idles until running_ is cleared and joins
/// the workers.
int main(int argc, char** argv) {
  running_.store(true);
  std::vector<std::thread> threads;
  threads.reserve(10);
  for (int i = 0; i < 10; i++) {
    // Stagger the connection attempts so they do not reach the server at once.
    std::this_thread::sleep_for(std::chrono::milliseconds(50));
    threads.emplace_back(running);
  }
  // Sleep between checks of running_ instead of spinning in an empty
  // `while (true)`: a side-effect-free infinite loop is undefined behaviour
  // before C++26, pins a CPU core, and made the join code below unreachable.
  while (running_.load()) {
    std::this_thread::sleep_for(std::chrono::seconds(1));
  }
  // running_ is false here, so every worker leaves its loop and can be joined.
  for (auto& t : threads) {
    t.join();
  }
  return 0;
}
再重启客户端和服务端,使用间隔50ms,可以看到服务端的接受请求数和客户端请求数已经对上了。
cpp
# server log
starting server at 127.0.0.1:6666
nready 1
accept client 127.0.0.1:50258 5
insert client 127.0.0.1:50258
nready 1
accept client 127.0.0.1:50260 7
insert client 127.0.0.1:50260
nready 1
accept client 127.0.0.1:50262 8
insert client 127.0.0.1:50262
nready 1
accept client 127.0.0.1:50264 9
insert client 127.0.0.1:50264
nready 1
accept client 127.0.0.1:50266 10
insert client 127.0.0.1:50266
nready 1
accept client 127.0.0.1:50268 11
insert client 127.0.0.1:50268
nready 1
accept client 127.0.0.1:50270 12
insert client 127.0.0.1:50270
nready 1
accept client 127.0.0.1:50272 13
insert client 127.0.0.1:50272
nready 1
accept client 127.0.0.1:50274 14
insert client 127.0.0.1:50274
nready 1
accept client 127.0.0.1:50276 15
insert client 127.0.0.1:50276
cpp
# client log
connect to 127.0.0.1:6666 3
connect to 127.0.0.1:6666 4
connect to 127.0.0.1:6666 5
connect to 127.0.0.1:6666 6
connect to 127.0.0.1:6666 7
connect to 127.0.0.1:6666 8
connect to 127.0.0.1:6666 9
connect to 127.0.0.1:6666 10
connect to 127.0.0.1:6666 11
connect to 127.0.0.1:6666 12
cpp
# client send and recv log
1 : send message to 127.0.0.1:6666
1 : receive message from 127.0.0.1:6666,size:24
receive message from 127.0.0.1:6666, msg:deal_service_ex response,size:24
1 : send message to 127.0.0.1:6666
1 : receive message from 127.0.0.1:6666,size:24
receive message from 127.0.0.1:6666, msg:deal_service_ex response,size:24
1 : send message to 127.0.0.1:6666
1 : receive message from 127.0.0.1:6666,size:24
receive message from 127.0.0.1:6666, msg:deal_service_ex response,size:24
1 : send message to 127.0.0.1:6666
1 : receive message from 127.0.0.1:6666,size:24
receive message from 127.0.0.1:6666, msg:deal_service_ex response,size:24
1 : send message to 127.0.0.1:6666
1 : receive message from 127.0.0.1:6666,size:24
receive message from 127.0.0.1:6666, msg:deal_service_ex response,size:24
1 : send message to 127.0.0.1:6666
1 : receive message from 127.0.0.1:6666,size:24
receive message from 127.0.0.1:6666, msg:deal_service_ex response,size:24
1 : send message to 127.0.0.1:6666
1 : receive message from 127.0.0.1:6666,size:24
receive message from 127.0.0.1:6666, msg:deal_service_ex response,size:24
1 : send message to 127.0.0.1:6666
1 : receive message from 127.0.0.1:6666,size:24
receive message from 127.0.0.1:6666, msg:deal_service_ex response,size:24
1 : send message to 127.0.0.1:6666
1 : receive message from 127.0.0.1:6666,size:24
receive message from 127.0.0.1:6666, msg:deal_service_ex response,size:24
1 : send message to 127.0.0.1:6666
1 : receive message from 127.0.0.1:6666,size:24
receive message from 127.0.0.1:6666, msg:deal_service_ex response,size:24
cpp
# server recv log
127.0.0.1:50258 recv: client test request
127.0.0.1:50260 recv: client test request
127.0.0.1:50262 recv: client test request
127.0.0.1:50264 recv: client test request
127.0.0.1:50266 recv: client test request
127.0.0.1:50268 recv: client test request
127.0.0.1:50270 recv: client test request
127.0.0.1:50272 recv: client test request
127.0.0.1:50274 recv: client test request
127.0.0.1:50276 recv: client test request
到这一步,创建连接和发送、接收数据都正常了,但是我还是有一个疑问,正常情况下,都是直接循环创建多个客户端,谁会在创建客户端的时候刻意设置间隔呢?所以我基本确定肯定是有问题的,而且绝对不是冲突的问题。
5.修改代码
基于前面的情况,我询问AI,(... 忽略AI交流过程),才发现我的listen epoll使用的是EPOLLET模式监听的server socket。而我的处理过程又是每次有请求过来的时候只接受了一个连接请求。
现在就有两种改法:
1.继续使用当前的流程,每次请求过来只接收一个连接请求,但是要改用水平触发模式(LT)监听server socket。水平触发是epoll的默认模式,并不存在EPOLLLT这个标志,注册事件时不设置EPOLLET即可。这样只要还有未accept()的连接,epoll_wait()就会再次返回,才能保证服务端在高并发的情况下能接受每一个连接请求。
2.继续使用当前的EPOLLET(边缘模式),但是每次有连接请求过来的时候,要循环accept()接受连接请求,直到accept()<0(server socket应设置为非阻塞,队列取空时accept()返回-1且errno为EAGAIN/EWOULDBLOCK),才进入下一次epoll_wait(),这样才能保证高并发情况下多个连接请求同时到达服务端时,服务端不会只接受一个就忽略后面的。之前只接受了一个,是因为当时没有意识到:多个连接请求同时到达时,只有server socket这一个fd触发一次事件,但是这个socket上同时有多个需要accept()的连接,没有意识到这一点才是最根本的原因。
5.1 使用水平触发(LT)模式
水平触发是epoll的默认模式,<sys/epoll.h>中并没有EPOLLLT这个标志;因此只需要在创建listen epoll时去掉EPOLLET,events只保留EPOLLIN即可(即 ev.events = EPOLLIN;)
cpp
ev.events = EPOLLIN;  // level-triggered is epoll's default; <sys/epoll.h> defines no EPOLLLT flag
cpp
# ServerListener.cpp
/// Creates the listening TCP socket for `port` on all local IPv4 addresses and
/// registers it, level-triggered, with a newly created epoll instance.
///
/// @param port TCP port to listen on, in host byte order.
/// @return true when every step succeeded; false as soon as any call fails.
bool ServerListener::CreateSocket(int port) {
  server_socket_ = socket(AF_INET, SOCK_STREAM, 0);
  if (server_socket_ < 0) {
    return false;
  }
  int flag = 1;
  if (setsockopt(server_socket_, SOL_SOCKET, SO_REUSEADDR, &flag, sizeof(flag)) == -1) {
    return false;
  }
  // ....... other code
  struct sockaddr_in server_addr;
  memset(&server_addr, 0, sizeof(server_addr));
  server_addr.sin_family = AF_INET;
  server_addr.sin_port = htons(port);
  server_addr.sin_addr.s_addr = INADDR_ANY;
  // A failed bind() (e.g. port already in use) must abort setup; otherwise
  // listen() would silently listen on an ephemeral port instead of `port`.
  if (bind(server_socket_, reinterpret_cast<struct sockaddr*>(&server_addr),
           sizeof(server_addr)) != 0) {
    return false;
  }
  if (listen(server_socket_, 1024) != 0) {
    return false;
  }
  if ((listen_epoll_ = epoll_create(1)) < 0) {
    return false;
  }
  struct epoll_event ev;
  memset(&ev, 0, sizeof(ev));
  // Level-triggered is what epoll does when EPOLLET is absent; there is no
  // EPOLLLT constant. The socket keeps being reported while connections are
  // still queued, so accepting one connection per wake-up loses nothing.
  ev.events = EPOLLIN;
  ev.data.fd = server_socket_;
  if (epoll_ctl(listen_epoll_, EPOLL_CTL_ADD, server_socket_, &ev) < 0) {
    return false;
  }
  return true;
}
// Handles a single epoll_wait() wake-up on listen_epoll_: blocks with no
// timeout (-1) until events are reported, then calls accept() on
// server_socket_ once per reported event and logs each accepted peer.
// Returns without accepting anything when epoll_wait() yields no events or fails.
//
// NOTE(review): one accept() per reported event is only sufficient when
// server_socket_ is registered level-triggered (no EPOLLET), because
// epoll_wait() then reports the socket again while connections remain queued.
void ServerListener::AcceptConnection() {
int c_socket;
// tv and ipv6buf are not used by the statements shown here; presumably the
// elided code below uses them.
timeval tv;
std::string client_ip;
int client_port;
char ipv6buf[INET6_ADDRSTRLEN];
struct epoll_event events[5];
int nready = epoll_wait(listen_epoll_, events, 5, -1);
if (nready <= 0) {
return;
}
std::cout << "nready " << nready << std::endl;
for (int i = 0; i < nready; i++) {
struct sockaddr_in addr;
socklen_t addr_len = sizeof(addr);
c_socket = accept(server_socket_, (struct sockaddr*)&addr, &addr_len);
if (c_socket < 0) {
// accept() failed for this event; move on to the next reported event.
continue;
}
// Convert the peer address to host byte order / dotted-decimal text for logging.
client_port = ntohs(addr.sin_port);
client_ip = inet_ntoa(addr.sin_addr);
std::cout << "accept client " << client_ip << ":" << client_port << " " << c_socket << std::endl;
// ...... other code
}
}
5.2 使用EPOLLET
如果继续使用EPOLLET,则需要修改accept()部分,使用while(true)持续循环接受请求。
cpp
# ServerListener.cpp
/// Creates a non-blocking listening TCP socket for `port` on all local IPv4
/// addresses and registers it, edge-triggered, with a newly created epoll
/// instance.
///
/// @param port TCP port to listen on, in host byte order.
/// @return true when every step succeeded; false as soon as any call fails.
bool ServerListener::CreateSocket(int port) {
  // SOCK_NONBLOCK: with an edge-triggered registration the accept side has to
  // call accept() until the backlog is empty; on a blocking socket that final
  // call would block instead of failing with EAGAIN/EWOULDBLOCK.
  server_socket_ = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, 0);
  if (server_socket_ < 0) {
    return false;
  }
  int flag = 1;
  if (setsockopt(server_socket_, SOL_SOCKET, SO_REUSEADDR, &flag, sizeof(flag)) == -1) {
    return false;
  }
  // ....... other code
  struct sockaddr_in server_addr;
  memset(&server_addr, 0, sizeof(server_addr));
  server_addr.sin_family = AF_INET;
  server_addr.sin_port = htons(port);
  server_addr.sin_addr.s_addr = INADDR_ANY;
  // A failed bind() (e.g. port already in use) must abort setup; otherwise
  // listen() would silently listen on an ephemeral port instead of `port`.
  if (bind(server_socket_, reinterpret_cast<struct sockaddr*>(&server_addr),
           sizeof(server_addr)) != 0) {
    return false;
  }
  if (listen(server_socket_, 1024) != 0) {
    return false;
  }
  if ((listen_epoll_ = epoll_create(1)) < 0) {
    return false;
  }
  struct epoll_event ev;
  memset(&ev, 0, sizeof(ev));
  // EPOLLET: the listening socket is reported only when its readiness changes,
  // not for as long as connections remain queued.
  ev.events = EPOLLIN | EPOLLET;
  ev.data.fd = server_socket_;
  if (epoll_ctl(listen_epoll_, EPOLL_CTL_ADD, server_socket_, &ev) < 0) {
    return false;
  }
  return true;
}
/// Handles one epoll_wait() wake-up for the edge-triggered listening socket by
/// accepting connections in a loop until accept() stops yielding one.
///
/// Transient accept() failures (EINTR, ECONNABORTED) do not end the loop,
/// because under EPOLLET connections left in the backlog are not reported
/// again until a new one arrives. Any other failure ends the loop.
///
/// NOTE(review): the loop only terminates promptly if server_socket_ is
/// non-blocking, so that an empty backlog shows up as EAGAIN/EWOULDBLOCK
/// rather than a blocking accept() — confirm where the socket is created.
void ServerListener::AcceptConnection() {
  int c_socket;
  // tv and ipv6buf are not used by the statements shown here; they are kept
  // because the elided code below presumably uses them.
  timeval tv;
  std::string client_ip;
  int client_port;
  char ipv6buf[INET6_ADDRSTRLEN];
  struct epoll_event events[5];
  int nready = epoll_wait(listen_epoll_, events, 5, -1);
  if (nready <= 0) {
    return;
  }
  std::cout << "nready " << nready << std::endl;
  while (true) {
    struct sockaddr_in addr;
    socklen_t addr_len = sizeof(addr);
    c_socket = accept(server_socket_, (struct sockaddr*)&addr, &addr_len);
    if (c_socket < 0) {
      if (errno == EINTR || errno == ECONNABORTED) {
        // Interrupted by a signal, or the peer gave up before we accepted:
        // other connections may still be queued, so keep draining.
        continue;
      }
      // EAGAIN/EWOULDBLOCK means the backlog is empty; any other error
      // (e.g. EMFILE) cannot be fixed by retrying right away.
      break;
    }
    // Convert the peer address to host byte order / dotted-decimal text for logging.
    client_port = ntohs(addr.sin_port);
    client_ip = inet_ntoa(addr.sin_addr);
    std::cout << "accept client " << client_ip << ":" << client_port << " " << c_socket << std::endl;
    // ...... other code
  }
}