1、robots protocol
The crawler protocol (Robots Exclusion Protocol): it tells crawlers and search engines which paths may be crawled and which may not.
robots.txt (usually placed in the site's root directory), for example:
User-agent: *
Disallow: /ulink?
Allow: /public
2、Sensitive-information disclosure via robots.txt: e.g. a Disallow entry exposing /admin; vulnerability severity: medium or low.
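In practice this is checked by simply fetching /robots.txt and looking at the Disallow entries. Below is a minimal illustrative sketch of that idea; the check_robots helper and the keyword list are assumptions for demonstration, not a standard tool:

```python
#!/usr/bin/env python3
# Sketch: download robots.txt and flag Disallow paths that look sensitive.
import requests

SENSITIVE_KEYWORDS = ('admin', 'backup', 'login', 'manage')  # assumed examples

def check_robots(base_url):
    resp = requests.get(base_url.rstrip('/') + '/robots.txt', timeout=10)
    if resp.status_code != 200:
        return []
    findings = []
    for line in resp.text.splitlines():
        line = line.strip()
        if line.lower().startswith('disallow:'):
            path = line.split(':', 1)[1].strip()
            if any(k in path.lower() for k in SENSITIVE_KEYWORDS):
                findings.append(path)
    return findings

if __name__ == '__main__':
    print(check_robots('https://www.example.com'))
```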
3、The robotparser module: used to parse robots.txt
```python
#!/usr/bin/env python3
from urllib.robotparser import RobotFileParser

url = 'https://www.baidu.com/robots.txt'
# Create a RobotFileParser for parsing robots.txt
robot_parser = RobotFileParser()
robot_parser.set_url(url)
# Download and parse robots.txt
robot_parser.read()
# Check whether a specific URL may be crawled
user_agent = 'BaiduSpider'
check_url = 'https://www.baidu.com/baidu'
# can_fetch returns True if this user agent is allowed to fetch the URL
if robot_parser.can_fetch(user_agent, check_url):
    print("allowed to crawl")
else:
    print("not allowed to crawl")
```
The various request methods in requests
```python
#!/usr/bin/env python3
import requests

url_1 = 'http://httpbin.org/get'
url_2 = 'http://httpbin.org/post'

# 1. A plain GET request
req = requests.get(url_1)
print(req.text)

# 2. GET with query-string parameters
params = {
    'name': 'handsome_boy_wangbo',
    'age': '23'
}
req = requests.get(url_1, params=params)
print(req.text)

# 3. POST with form data
data = {
    'name': 'handsome_boy_wangbo',
    'age': '23'
}
req = requests.post(url_2, data=data)
print(req.text)

# Other HTTP methods (httpbin has no /head or /options endpoint,
# so HEAD and OPTIONS are sent to /get)
r1 = requests.delete('http://httpbin.org/delete')
r2 = requests.head('http://httpbin.org/get')
r3 = requests.options('http://httpbin.org/get')
r4 = requests.put('http://httpbin.org/put')

# Setting a custom User-Agent header
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36'
}
req = requests.get('https://www.baidu.com/', headers=headers)
print(req.text)
```
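Every one of these calls returns a requests.Response object; besides .text it exposes the status code, the response headers, and, for JSON bodies such as httpbin's, a json() helper. A short illustrative sketch:

```python
#!/usr/bin/env python3
# Sketch: a few common ways to inspect a requests.Response object.
import requests

resp = requests.get('http://httpbin.org/get', params={'name': 'handsome_boy_wangbo'})
print(resp.status_code)               # HTTP status code, e.g. 200
print(resp.headers['Content-Type'])   # response headers behave like a dict
print(resp.json()['args'])            # httpbin echoes the query string back as JSON
```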