1.相关概念
WWW(World Wide Web):万维网,简称 Web
一个大规模、联机式的信息储存所
URL:统一资源定位符;
URL格式:<协议>://<主机>:<端口>/<路径>(端口为协议默认端口时可省略)
2.http协议
(1)定义
1.定义:超文本传输协议(HTTP,HyperText Transfer Protocol)
一种面向事务的应用层协议;
端口号:80, 备用端口号:8080;
(2)HTTP通信过程
基于传输层的TCP协议;
(3)HTTP的报文格式
keep-alive (长连接):请求响应之后,连接保持一段时间;
close(短连接):请求响应之后,立即断开连接;
(4)常见请求报文的方法
(5)常见状态码
3.代码练习
搜狐网站爬虫
cs
#include "head.h"
/* TCP port the client connects to; converted with htons() when the address is built. */
#define SER_PORT 80
/*
 * Dotted-quad IPv4 address of the server, converted with inet_addr().
 * NOTE(review): hard-coded; presumably a host serving news.sohu.com (the Host
 * header sent by send_http_request) -- confirm the address is still valid.
 */
#define SER_IP "117.34.49.212"
/*
 * Open a TCP (SOCK_STREAM) connection to the server at SER_IP:SER_PORT.
 *
 * Returns the connected socket descriptor on success. On failure the failing
 * call is reported with perror() and -1 is returned; no descriptor is left
 * open in that case. On success the caller owns the descriptor and must
 * close() it.
 */
int create_tcp_connect()
{
    int sockfd = socket(AF_INET, SOCK_STREAM, 0);
    if (sockfd < 0)
    {
        perror("socket error");
        return -1;
    }

    /* Zero-initialize so sin_zero and any padding do not carry stack garbage. */
    struct sockaddr_in sockaddr = {0};
    sockaddr.sin_family = AF_INET;
    sockaddr.sin_port = htons(SER_PORT);
    sockaddr.sin_addr.s_addr = inet_addr(SER_IP);

    int ret = connect(sockfd, (struct sockaddr *)&sockaddr, sizeof(sockaddr));
    if (ret < 0)
    {
        /* Report first: close() below could overwrite errno. */
        perror("connect error");
        /* Release the descriptor, otherwise it leaks on this failure path. */
        close(sockfd);
        return -1;
    }

    return sockfd;
}
/*
 * Send a fixed HTTP/1.1 "GET /" request (Host: news.sohu.com) on sockfd.
 *
 * The request carries "Connection: close", i.e. it asks for a short
 * connection that the server tears down after the response.
 *
 * Returns 0 once every byte of the request has been accepted by send(),
 * or -1 if send() fails (the error is reported with perror()).
 */
int send_http_request(int sockfd)
{
    /* String literal: keep it behind a const pointer so it cannot be written to. */
    const char *buf = "GET / HTTP/1.1\r\n"
                      "Host: news.sohu.com\r\n"
                      "User-Agent: Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/113.0\r\n"
                      "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8\r\n"
                      "Accept-Language: en-US,en;q=0.5\r\n"
                      "Connection: close\r\n"
                      "\r\n";
    size_t total = strlen(buf);
    size_t sent = 0;

    /* send() may accept fewer bytes than requested; loop until all are sent. */
    while (sent < total)
    {
        ssize_t ret = send(sockfd, buf + sent, total - sent, 0);
        if (ret < 0)
        {
            perror("send error");
            return -1;
        }
        sent += (size_t)ret;
    }

    return 0;
}
/*
 * Read the response from sockfd and copy it verbatim to standard output
 * (file descriptor 1) until the peer closes the connection.
 *
 * Returns 0 after the peer has closed the connection (recv() returned 0;
 * "server off" is printed), or -1 if recv() or write() fails (the error is
 * reported with perror()).
 */
int recv_http_response(int sockfd)
{
    char buf[1024] = {0};

    while (1)
    {
        ssize_t cont = recv(sockfd, buf, sizeof(buf), 0);
        if (cont < 0)
        {
            perror("recv error");
            return -1;
        }
        if (cont == 0)
        {
            /* Orderly shutdown by the peer: the whole response has been read. */
            printf("server off\n");
            break;
        }

        /* write() may be short; keep writing until the whole chunk is out. */
        ssize_t done = 0;
        while (done < cont)
        {
            ssize_t written = write(1, buf + done, (size_t)(cont - done));
            if (written < 0)
            {
                perror("write error");
                return -1;
            }
            done += written;
        }
    }

    return 0;
}
/*
 * Entry point: connect to the server, send the HTTP request, and print the
 * response to standard output.
 *
 * Returns 0 when every step succeeds and 1 when the connection, the request,
 * or the response handling fails. Command-line arguments are not used.
 */
int main(int argc, char const *argv[])
{
    (void)argc;
    (void)argv;

    int sockfd = create_tcp_connect();
    if (sockfd < 0)
    {
        /* Nothing to clean up: no descriptor was handed back. */
        return 1;
    }

    int status = 0;
    /* Skip reading the response when the request could not be sent. */
    if (send_http_request(sockfd) < 0 || recv_http_response(sockfd) < 0)
    {
        status = 1;
    }

    close(sockfd);
    return status;
}