Course objectives
Scrape job listings from Zhilian Zhaopin (zhaopin.com) with requests and BeautifulSoup, and save the results to an Excel file.
Course content
```python
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
import time
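# Dependencies (openpyxl is required by pandas for writing .xlsx files):
#   pip install requests beautifulsoup4 tqdm pandas openpyxl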
def tran_salary(ori_salary):
    """Convert a salary string such as "1.5万" or "8千" into a number in CNY."""
    if "万" in ori_salary:
        ori_salary = ori_salary.replace("万", "")
        ori_salary = float(ori_salary)
        ori_salary *= 10000
    elif "千" in ori_salary:
        ori_salary = ori_salary.replace("千", "")
        ori_salary = float(ori_salary)
        ori_salary *= 1000
    else:
        # Assumption: anything without a 万/千 suffix is already a plain number.
        ori_salary = float(ori_salary)
    return ori_salary
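# Examples:
#   tran_salary("1.5万") -> 15000.0
#   tran_salary("8千")   -> 8000.0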
def get_page(page):
    """Fetch one page of search results and return the raw HTML."""
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-language": "zh-CN,zh;q=0.9",
        "cache-control": "max-age=0",
        "priority": "u=0, i",
        "referer": "https://www.zhaopin.com/sou/jl801/kw01O00U80EG06G03F01N0/p1",
        "sec-ch-ua": "\"Google Chrome\";v=\"129\", \"Not=A?Brand\";v=\"8\", \"Chromium\";v=\"129\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Windows\"",
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-origin",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36"
    }
    # Session cookies copied from a logged-in browser; they expire, so refresh
    # them from your own browser's developer tools before running.
    cookies = {'_uab_collina': '172727354143424658574824', 'acw_tc': '1a0c638e17272735400066445e0053005d54c1b918e525ad699ab6092e7e02', 'acw_sc__v2': '66f41a3eacd51f5c8bb71f5793f12066ce7ab07c', 'x-zp-client-id': 'bb03355b-599f-495e-8d2b-30a4f57b7584', 'FSSBBIl1UgzbN7NS': '5iZQG1DC.WA2czpgKmafwzsAdzR.QEOHg8HC8skKuYfgXHOVdgLCPwc7y8ZlgriWSib.caw32rM0w0pfv0PBV9G', 'sensorsdata2015jssdkcross': '%7B%22distinct_id%22%3A%2219229869c96ab7-04ec55af3b58d84-26001151-921600-19229869c9713fe%22%2C%22first_id%22%3A%22%22%2C%22props%22%3A%7B%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTkyMjk4NjljOTZhYjctMDRlYzU1YWYzYjU4ZDg0LTI2MDAxMTUxLTkyMTYwMC0xOTIyOTg2OWM5NzEzZmUifQ%3D%3D%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%22%2C%22value%22%3A%22%22%7D%2C%22%24device_id%22%3A%2219229869c96ab7-04ec55af3b58d84-26001151-921600-19229869c9713fe%22%7D', 'sajssdk_2015_cross_new_user': '1', 'HMACCOUNT_BFESS': '3A069830089BCDB2', 'Hm_lvt_21a348fada873bdc2f7f75015beeefeb': '1727273541', 'Hm_lpvt_21a348fada873bdc2f7f75015beeefeb': '1727273541', 'HMACCOUNT': '3A069830089BCDB2', 'locationInfo_search': '{%22code%22:%22635%22%2C%22name%22:%22%E5%8D%97%E4%BA%AC%22%2C%22message%22:%22%E5%8C%B9%E9%85%8D%E5%88%B0%E5%B8%82%E7%BA%A7%E7%BC%96%E7%A0%81%22}', 'FSSBBIl1UgzbN7NT': '5RBAFHbM0DOZqqqD1t3F9WG8co7bIo4rV6.nD4kr2dtixybv9BNj2CJSBoTS6tURMFwXclovVDDk_XXjinHgWwlAwiipd_yC9AZ3c7InbwyLyhfAZTH_vbrvOZ1x2kRsF.RbKexulxkWEG.GqrbeUedQMWVLHIeOa2CoNwYTdTGUm5_Nv6RqouNuFlzPykLCfXUuOFagtyYYE5hZ9WwOOl9WDpo378yA.WI.SlXZe5Hh.Nhm_tajQ5lufEmxXLurZC_ephouMjVXf4fav7tqBmB', '1420ba6bb40c9512e9642a1f8c243891': '3f4dcf67-7cd0-40ea-9af4-9dd3ea9acb1e'}
    url = f"https://www.zhaopin.com/sou/jl801/kw01O00U80EG06G03F01N0/p{page}"
    response = requests.get(url, headers=headers, cookies=cookies)
    html_str = response.text
    return html_str
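# The page number is interpolated into the search URL, e.g. get_page(2)
# requests https://www.zhaopin.com/sou/jl801/kw01O00U80EG06G03F01N0/p2.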
infos = []
for i in range(1, 6):
    print(f"Scraping page {i}")
    html_str = get_page(i)
    soup = BeautifulSoup(html_str, "html.parser")
    joblist = soup.find_all("div", class_="joblist-box__item")
    for job_item in tqdm(joblist):
        jobinfo__name = job_item.find("a", class_="jobinfo__name").text.strip()
        company_name = job_item.find("a", class_="companyinfo__name").text.strip()
        jobinfo__salary = job_item.find("p", class_="jobinfo__salary").text.strip()
        if jobinfo__salary == "面议":
            # "面议" (negotiable) carries no number; record it as 0.
            salary = 0
        else:
            if "·" in jobinfo__salary:
                # Strip bonus suffixes such as "·13薪" before parsing the range.
                jobinfo__salary = jobinfo__salary.split("·")[0]
            min_salary, max_salary = jobinfo__salary.split("-")
            min_salary = tran_salary(min_salary)
            max_salary = tran_salary(max_salary)
            salary = (min_salary + max_salary) / 2
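            # Worked example (hypothetical listing): "1.5万-2.5万·13薪"
            # -> "1.5万-2.5万" -> (15000.0 + 25000.0) / 2 = 20000.0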
        jobinfo__tag = job_item.find("div", class_="jobinfo__tag")
        skills = []  # skill requirements
        if jobinfo__tag is not None:
            joblist_box__item_tags = jobinfo__tag.find_all("div")
            for joblist_box__item_tag in joblist_box__item_tags:
                skills.append(joblist_box__item_tag.text)
        jobinfo__other_info = job_item.find("div", class_="jobinfo__other-info")
        jobinfo__other_infos = jobinfo__other_info.find_all("div")
        # The first entry is the location, written as "city·district·area"
        # with one to three parts.
        area = jobinfo__other_infos[0].text.strip()
        area_strs = area.split("·")
        region, classify, city = "", "", ""
        if len(area_strs) > 2:
            region = area_strs[2]
        if len(area_strs) > 1:
            classify = area_strs[1]
        if len(area_strs) > 0:
            city = area_strs[0]
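        # For a two-part value such as "北京·朝阳区" (hypothetical), this yields
        # city = "北京", classify = "朝阳区", and region stays "".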
        experience_requirement = jobinfo__other_infos[1].text.strip()
        if experience_requirement == "经验不限":
            # "经验不限" means no experience requirement.
            experience_requirement = "0"
        experience_requirement = experience_requirement.replace("年", "")
        if "-" in experience_requirement:
            # For a range such as "3-5", keep the lower bound.
            experience_requirement_list = experience_requirement.split("-")
            experience_requirement = experience_requirement_list[0]
        experience_requirement = int(experience_requirement)
        education_background_requirement = jobinfo__other_infos[2].text.strip()
        companyinfo__tag = job_item.find("div", class_="companyinfo__tag")
        company_info_items = companyinfo__tag.find_all("div")
        finance_info = company_info_items[0].text.strip()
        scale = company_info_items[1].text.strip()
        if len(company_info_items) > 2:
            company_type = company_info_items[2].text.strip()
        else:
            company_type = ""
        info = {
            "公司名字": company_name,                      # company name
            "薪资": salary,                                # average salary (CNY)
            "技能要求": skills,                            # skill requirements
            "市": city,                                    # city
            "区": classify,                                # district
            "区域": region,                                # area
            "经验要求": experience_requirement,            # years of experience
            "学历要求": education_background_requirement,  # education requirement
            "融资信息": finance_info,                      # financing stage
            "规模": scale,                                 # company size
            "公司类型": company_type,                      # company type
        }
        infos.append(info)
    time.sleep(2)  # pause between pages to avoid hammering the server
# Convert the infos list into a DataFrame with pandas
df = pd.DataFrame(infos)
# Save the DataFrame as an Excel file
df.to_excel("智联职位信息.xlsx", index=False)
```
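After the run, a quick sanity check on the output is worthwhile (a minimal sketch; it assumes the script above finished and wrote 智联职位信息.xlsx to the current directory):

```python
import pandas as pd

# Reload the exported file and spot-check the parsed columns.
df = pd.read_excel("智联职位信息.xlsx")
print(df.head())              # first few rows, including skills and location
print(df["薪资"].describe())  # distribution of the averaged salaries
```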