免责声明:本文仅做技术交流与学习...
爬取完成后，可结合暗黑搜索引擎等工具对结果做进一步搜索。
edu_src.py
python
import requests, time
from bs4 import BeautifulSoup
# Walk the paginated ranking table at src.sjtu.edu.cn and append the text of
# the first link found in every table row (presumably the institution name --
# confirm against the live page) to edu_name.txt, one entry per line.
RANK_URL = 'https://src.sjtu.edu.cn/rank/firm/0/?page={page}'
OUTPUT_FILE = 'edu_name.txt'
# Seconds to wait for the server; without a timeout requests.get() can block
# forever on a stalled connection.
REQUEST_TIMEOUT = 10

for i in range(1, 20):  # pages 1 through 19
    url = RANK_URL.format(page=i)
    print(f"正在获取第{i}页数据")
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        # Treat HTTP error statuses as failures instead of parsing an error
        # page and writing its link text into the output file.
        response.raise_for_status()
    except requests.RequestException as exc:
        # Report the failed page and keep going with the remaining pages.
        print(f"第{i}页获取失败: {exc}")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')
    names = []
    for row in soup.find_all('tr'):
        link = row.a
        # Rows without an <a> tag (e.g. a header row) carry no name; skip
        # them explicitly rather than swallowing every exception.
        if link is None:
            continue
        names.append(link.text + '\n')

    # Open the output file once per page instead of once per table row.
    with open(OUTPUT_FILE, 'a', encoding='utf-8') as f:
        f.writelines(names)
    print(f"{i}页已经写入!!!")