3.4 Scraper in Practice: Scraping Zhaopin Job Listings

Lesson Objectives

Scrape job listings from Zhaopin (zhaopin.com).

Lesson Content

Python:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
import time

def tran_salary(ori_salary):
    # Convert a salary string such as "1.5万" or "8千" into a number of yuan.
    if "万" in ori_salary:
        ori_salary = ori_salary.replace("万","")
        ori_salary = float(ori_salary)
        ori_salary *= 10000
    elif "千" in ori_salary:
        ori_salary = ori_salary.replace("千","")
        ori_salary = float(ori_salary)
        ori_salary *= 1000
    return ori_salary
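
# Quick sanity checks for tran_salary:
#   tran_salary("1.5万") -> 15000.0
#   tran_salary("8千")   -> 8000.0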

def get_page(page):
    # Fetch one page of search results. The headers and cookies below were
    # captured from a real browser session and will expire; replace them with
    # fresh values from your own browser before running.
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-language": "zh-CN,zh;q=0.9",
        "cache-control": "max-age=0",
        "priority": "u=0, i",
        "referer": "https://www.zhaopin.com/sou/jl801/kw01O00U80EG06G03F01N0/p1",
        "sec-ch-ua": "\"Google Chrome\";v=\"129\", \"Not=A?Brand\";v=\"8\", \"Chromium\";v=\"129\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Windows\"",
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-origin",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36"
    }
    cookies = {'_uab_collina': '172727354143424658574824', 'acw_tc': '1a0c638e17272735400066445e0053005d54c1b918e525ad699ab6092e7e02', 'acw_sc__v2': '66f41a3eacd51f5c8bb71f5793f12066ce7ab07c', 'x-zp-client-id': 'bb03355b-599f-495e-8d2b-30a4f57b7584', 'FSSBBIl1UgzbN7NS': '5iZQG1DC.WA2czpgKmafwzsAdzR.QEOHg8HC8skKuYfgXHOVdgLCPwc7y8ZlgriWSib.caw32rM0w0pfv0PBV9G', 'sensorsdata2015jssdkcross': '%7B%22distinct_id%22%3A%2219229869c96ab7-04ec55af3b58d84-26001151-921600-19229869c9713fe%22%2C%22first_id%22%3A%22%22%2C%22props%22%3A%7B%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTkyMjk4NjljOTZhYjctMDRlYzU1YWYzYjU4ZDg0LTI2MDAxMTUxLTkyMTYwMC0xOTIyOTg2OWM5NzEzZmUifQ%3D%3D%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%22%2C%22value%22%3A%22%22%7D%2C%22%24device_id%22%3A%2219229869c96ab7-04ec55af3b58d84-26001151-921600-19229869c9713fe%22%7D', 'sajssdk_2015_cross_new_user': '1', 'HMACCOUNT_BFESS': '3A069830089BCDB2', 'Hm_lvt_21a348fada873bdc2f7f75015beeefeb': '1727273541', 'Hm_lpvt_21a348fada873bdc2f7f75015beeefeb': '1727273541', 'HMACCOUNT': '3A069830089BCDB2', 'locationInfo_search': '{%22code%22:%22635%22%2C%22name%22:%22%E5%8D%97%E4%BA%AC%22%2C%22message%22:%22%E5%8C%B9%E9%85%8D%E5%88%B0%E5%B8%82%E7%BA%A7%E7%BC%96%E7%A0%81%22}', 'FSSBBIl1UgzbN7NT': '5RBAFHbM0DOZqqqD1t3F9WG8co7bIo4rV6.nD4kr2dtixybv9BNj2CJSBoTS6tURMFwXclovVDDk_XXjinHgWwlAwiipd_yC9AZ3c7InbwyLyhfAZTH_vbrvOZ1x2kRsF.RbKexulxkWEG.GqrbeUedQMWVLHIeOa2CoNwYTdTGUm5_Nv6RqouNuFlzPykLCfXUuOFagtyYYE5hZ9WwOOl9WDpo378yA.WI.SlXZe5Hh.Nhm_tajQ5lufEmxXLurZC_ephouMjVXf4fav7tqBmB', '1420ba6bb40c9512e9642a1f8c243891': '3f4dcf67-7cd0-40ea-9af4-9dd3ea9acb1e'}
    url = f"https://www.zhaopin.com/sou/jl801/kw01O00U80EG06G03F01N0/p{page}"
    response = requests.get(url, headers=headers, cookies=cookies)
    html_str = response.text
    return html_str
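
# If Zhaopin serves a verification/CAPTCHA page instead of results (common once
# the cookies above go stale), the selectors below will match nothing; checking
# response.status_code and the number of parsed items before trusting a page is
# a sensible safeguard.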

infos = []  # collected job records
for i in range(1, 6):
    print(f"Scraping page {i}")
    html_str = get_page(i)
    soup = BeautifulSoup(html_str,"html.parser")
    joblist = soup.find_all("div",class_="joblist-box__item")
    for job_item in tqdm(joblist):
        jobinfo__name = job_item.find("a",class_="jobinfo__name").text.strip()
        company_name = job_item.find("a",class_="companyinfo__name").text.strip()
        jobinfo__salary = job_item.find("p",class_="jobinfo__salary").text.strip()
        if jobinfo__salary == "面议":  # salary "negotiable"
            salary = 0
        else:
            if "·" in jobinfo__salary:
                jobinfo__salary = jobinfo__salary.split("·")[0]
            min_salary,max_salary = jobinfo__salary.split("-")
            min_salary = tran_salary(min_salary)
            max_salary = tran_salary(max_salary)
            salary = (min_salary+max_salary)/2
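            # e.g. "1万-1.5万·13薪" (illustrative): drop the "·13薪" suffix,
            # then (10000.0 + 15000.0) / 2 = 12500.0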
        jobinfo__tag = job_item.find("div",class_="jobinfo__tag")
        skills = []  # skill requirement tags
        if jobinfo__tag is not None:
            joblist_box__item_tags = jobinfo__tag.find_all("div")
            for joblist_box__item_tag in joblist_box__item_tags:
                skills.append(joblist_box__item_tag.text)

        jobinfo__other_info = job_item.find("div",class_="jobinfo__other-info")
        jobinfo__other_infos = jobinfo__other_info.find_all("div")
        area = jobinfo__other_infos[0].text.strip()
        area_strs = area.split("·")

        region,classify,city = "","",""
        if len(area_strs) > 2:
            region = area_strs[2]
        if len(area_strs) > 1:
            classify = area_strs[1]
        if len(area_strs) > 0:
            city = area_strs[0]
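        # Assumed format: "南京·玄武区·新街口" -> city="南京",
        # classify="玄武区", region="新街口"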

        experience_requirement = jobinfo__other_infos[1].text.strip()
        if experience_requirement == "经验不限":  # "no experience required"
            experience_requirement = "0"
        # Strip "年" (years); also strip "以上" so an open-ended value such as
        # "10年以上" (assumed format) does not break int() below.
        experience_requirement = experience_requirement.replace("年", "").replace("以上", "")
        if "-" in experience_requirement:
            experience_requirement_list = experience_requirement.split("-")
        experience_requirement = experience_requirement_list[0]
        experience_requirement = int(experience_requirement)
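        # e.g. "1-3年" -> 1, "经验不限" -> 0, "10年以上" -> 10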
        education_background_requirement = jobinfo__other_infos[2].text.strip()
        companyinfo__tag = job_item.find("div", class_="companyinfo__tag")
        company_info_items = companyinfo__tag.find_all("div")
        finance_info = company_info_items[0].text.strip()
        scale = company_info_items[1].text.strip()
        if len(company_info_items) > 2:
            company_type = company_info_items[2].text.strip()
        else:
            company_type = ""
        info = {
            "company": company_name,
            "salary": salary,
            "skills": skills,
            "city": city,
            "district": classify,
            "area": region,
            "experience_years": experience_requirement,
            "education": education_background_requirement,
            "financing": finance_info,
            "size": scale,
            "company_type": company_type,
        }
        infos.append(info)
    time.sleep(2)  # pause between pages to avoid hammering the site
# Convert the infos list into a pandas DataFrame
df = pd.DataFrame(infos)

# Save the DataFrame to an Excel file (requires the openpyxl package)
df.to_excel("zhaopin_jobs.xlsx", index=False)
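
Once the data is on disk, a quick aggregation is a handy sanity check on the scrape. Below is a minimal sketch that reads the file back and averages salaries by district; it assumes the column and file names used in the script above, and drops the "negotiable" rows that were stored with salary 0.

import pandas as pd

df = pd.read_excel("zhaopin_jobs.xlsx")
paid = df[df["salary"] > 0]  # exclude "negotiable" (salary 0) rows
print(paid.groupby("district")["salary"].mean().sort_values(ascending=False))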