Storing data in MySQL
```python
import requests
from bs4 import BeautifulSoup
import pymysql

# Connect to the database with pymysql
conn = pymysql.connect(
    user='root',
    password="JIAJIA",
    host='127.0.0.1',
    database='cnblogs',
    port=3306,
)
cursor = conn.cursor()

# Crawl the data
res = requests.get('https://www.autohome.com.cn/news/1/#liststart')
# print(res.text)
# Parse the HTML
soup = BeautifulSoup(res.text, 'html.parser')
ul_list = soup.find_all(name='ul', class_='article')
for ul in ul_list:
    li_list = ul.find_all(name='li')
    for li in li_list:
        h3 = li.find(name='h3')
        if h3:
            title = h3.text
            content = li.find('p').text
            url = 'https:' + li.find(name='a').attrs['href']
            img = li.find('img')['src']
            # Some image URLs are protocol-relative; prepend the scheme if missing
            que = 'https:'
            if que not in img:
                img = que + img
            print('''
            Article title: %s
            Summary: %s
            Link: %s
            Image: %s
            ''' % (title, content, url, img))
            # Download the image locally
            img_res = requests.get(img)
            with open('./img/' + img.split('/')[-1], 'wb') as f:
                for line in img_res.iter_content():
                    f.write(line)
            # Save to the database
            cursor.execute(
                "INSERT INTO news (title,content,img,url) VALUES (%s,%s,%s,%s)",
                args=[title, content, img, url])
            conn.commit()
```
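The INSERT above assumes a news table already exists in the cnblogs database. A minimal sketch of creating it; the column types and sizes are assumptions, adjust them to your data:

```python
import pymysql

conn = pymysql.connect(user='root', password="JIAJIA", host='127.0.0.1',
                       database='cnblogs', port=3306)
cursor = conn.cursor()
# Assumed schema matching the INSERT above; column sizes are guesses
cursor.execute('''
    CREATE TABLE IF NOT EXISTS news (
        id      INT PRIMARY KEY AUTO_INCREMENT,
        title   VARCHAR(255),
        content VARCHAR(1024),
        img     VARCHAR(512),
        url     VARCHAR(512)
    )
''')
conn.commit()
```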
Uploading images to cnblogs
```python
import requests

# Headers copied from a logged-in browser session (devtools); the Cookie carries the login state
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0',
    'Referer': 'https://i.cnblogs.com/ngsw-worker.js',
    'Cookie': '_ga_K9ZHTBGT3Z=GS1.2.1702215735.1.0.1702215735.0.0.0; __gads=ID=5a6b96e7dce231d8:T=1700637208:RT=1706056779:S=ALNI_MYRXZOHhBLLyQzpN1We4NUdfMGYQQ; __gpi=UID=00000c93f4b05ac4:T=1700637208:RT=1706056779:S=ALNI_MatObgQktmbem_0SXLJiayLziRINg; cto_bundle=m0Ggk19mR1BnZTB1dExyaHNvT01CN1dQM0RISU9WUUtIZXVsRW43OXpxJTJCM2ZRMEpncWxtRUpqUiUyRnJ4TkdJQTFlWWdJcWUlMkZub2I4ZWQ1M0w3bk9YRjFmSVozNFBZaFp1MkJEeDhMMjZCc1R4YTFLanNXVVhyNGZIOHJpUGpEREZUJTJGMG9pTmI3YURQWWQ3enlTVVVaY3RoNXNVQSUzRCUzRA; _ga_7DSFGJNPL4=GS1.1.1708417388.1.0.1708417388.0.0.0; Hm_lvt_866c9be12d4a814454792b1fd0fed295=1706056778,1708328454,1708521052; _ga=GA1.1.1528785063.1700619377; .AspNetCore.Antiforgery.b8-pDmTq1XM=CfDJ8DZoAyJmInJHoSwqM1IbzdQjW57WAADzO55EUBOrBFTDFAwXvmddFdsOwcd91uR9v1A23sMz6-hAGYwVrJHEH5c21hYY0xIfJ0-q3IBRlH4fIIoIFR2YMrT0AFNY6FN6LMlo2UEaQP9bnNUglQExnvY; .Cnblogs.AspNetCore.Cookies=CfDJ8DZoAyJmInJHoSwqM1IbzdTqIZnOOklAz_Oo1-ESSjFQVR0Sus_rZ18eCIcP-akb1YjYH-Uic23MF7u9RZHFjwmkhNbiUWC7xbFsYnzTcgu-nLsl5S6ZxcwEU3JGwjspeBFIqJ6kAA52NE1KnJKSkjvgKyfQL4FTAWpP6W7djbWxgFRsMY_eHqjRRS6L0-6dGePy8BFcwCNKm8yfVDN_wgKlsW6AztrWX9UB00sjnrBp0-QmK13o4qf8uLCw4eznsmlPWv_X1F6cNkp1Y6KODCznqgBcsOybycWBKS9vV2uIQ6-36t6HHxqNxL8ChBv73C3rTKfUtSMtLVYtBBE-goLnnqoVTfHqPAdhtj7USyVivYiEcBWAZrEScJ0kJy_9_yyMKiSIgjpOD7aQzzhD1U13tgnOO9ukBjSL-DH8BMODa7dcjfVlkm_8_osfLiwl7Y7oUTlxkIfIhTex2gyUsTIeWuvxRjOG-1kunQwCKpf6r2pZ_zmchTeWtg0hmR8FrHSH0JiT0GiQJZx7RBUZj3RmFfffkHanyu2KGXzMF_ekixBdCpzYAie5TkbQk1uHKoKrAMgdrGBWvRetLEcXfBoDtYgAmw3yDzLavMmVM1Ue; .CNBlogsCookie=4CC9FDFAB5122BA83534E90C476C1A6247EAF22D393555C1F9B6AA6247A9015208A126D736C7FD60A945836247262F06D289A6B0BF8B8B0E2981F8291311DD502E97F3DBD91ECD511D3EAF92566F14674A86FB9B; _ga_3Q0DVSGN10=GS1.1.1708521056.3.1.1708521352.28.0.0; Hm_lpvt_866c9be12d4a814454792b1fd0fed295=1708521353; _ga_M95P3TTWJZ=GS1.1.1708521052.31.1.1708521356.0.0.0; XSRF-TOKEN=CfDJ8DZoAyJmInJHoSwqM1IbzdRGjy4Kotcb_jOERQehTlGIpqZdhn-Zue6X3BLKyWSLj8cJCfqtCnXEFdWPWy1OLD64D1M7CWuHNCnJDZYsyExMi6EjFarOowjQTTuyzlGLPgG-6TnhkfDNGW1xsjqEuzDuINZ_HRV_fC0k_-nPEvGEU_bOuPqJXS3X6F7tGa0DwA; _ga_C2LFP3RFGH=GS1.1.1708521243.1.1.1708521421.0.0.0'
}
files = {
    'imageFile': ("55.png", open('55.png', 'rb'), "image/jpeg"),
    'host': 'www.cnblogs.com',
    'uploadType': 'Paste'
}
res = requests.post('https://upload.cnblogs.com/imageuploader/CorsUpload',
                    headers=header, files=files)
print(res.text)
```
Introduction to Selenium
Reason: requests cannot execute JavaScript
With requests you have to analyze the ajax requests one by one and simulate them to get the data
Data scraped with requests is therefore very likely to differ from what you see in the browser
Introduction: Selenium was originally an automated testing tool; crawlers use it mainly to solve the problem that requests cannot execute JavaScript directly
Essence: Selenium drives a real browser and fully simulates its operations, such as navigating, typing, clicking, and scrolling, to get the page as rendered (exactly what you see in the browser); it supports multiple browsers
Usage:
1. Decide which browser to drive; we will use Chrome
2. Download a browser driver; the driver version must match your Chrome version
For example, for 122.0.6261.58: https://googlechromelabs.github.io/chrome-for-testing/
On Windows the driver is chromedriver.exe
3. Put the driver on the PATH, or in the project root
4. Install the selenium module: pip install selenium
5. Write the code
```python
from selenium import webdriver
import time

bro = webdriver.Chrome()  # opens a browser window
bro.get('https://www.baidu.com')  # type the URL into the browser and visit it
time.sleep(5)
bro.close()  # close the browser
```
For version 122.0.6261.57, the download package is:
https://storage.googleapis.com/chrome-for-testing-public/122.0.6261.57/win64/chromedriver-win64.zip
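If you'd rather not rely on the PATH, Selenium 4 also lets you point at the driver explicitly via a Service object. A minimal sketch; the driver path here is an assumption, use wherever you unzipped it:

```python
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Assumed path to the chromedriver you downloaded above
service = Service(r'D:\tools\chromedriver-win64\chromedriver.exe')
bro = webdriver.Chrome(service=service)
bro.get('https://www.baidu.com')
bro.quit()
```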
Simulating login with Selenium
```python
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

bro = webdriver.Chrome()  # opens a browser window
bro.get('https://www.baidu.com')
# Implicit wait of 10s: when looking up an element that hasn't loaded yet, wait up to 10s
bro.implicitly_wait(10)
# Maximize the window
bro.maximize_window()

# Locators: find elements
# submit_btn = bro.find_element(by=By.ID, value='s-top-loginbtn')  # by id
submit_btn = bro.find_element(by=By.LINK_TEXT, value='登录')  # by the text of an <a> tag
submit_btn.click()  # click the element

# Click "SMS login", then sleep 3s
sms_submit = bro.find_element(By.ID, 'TANGRAM__PSP_11__changeSmsCodeItem')
sms_submit.click()
time.sleep(3)

# Click "account login"
username_submit = bro.find_element(By.ID, 'TANGRAM__PSP_11__changePwdCodeItem')
username_submit.click()
time.sleep(3)

# Find the username and password inputs and type into them
username = bro.find_element(By.ID, 'TANGRAM__PSP_11__userName')
username.send_keys('19856014525')  # type into the input box
time.sleep(1)
password = bro.find_element(By.ID, 'TANGRAM__PSP_11__password')
password.send_keys('19856014525')
time.sleep(1)

# Tick the agreement checkbox, then click login
accept = bro.find_element(By.ID, 'TANGRAM__PSP_11__isAgree')
accept.click()
time.sleep(1)
submit = bro.find_element(By.ID, 'TANGRAM__PSP_11__submit')
submit.click()
time.sleep(5)
bro.close()
```
Headless browser
Concept: a headless browser runs without a visible window but can still perform all the usual browser operations
Process: Selenium looks for the driver on the PATH; the first lookup is slow, but once the driver has been found, later runs are fast
The first commented line below is the location of the Chrome executable (optional; it is usually on the PATH)
The second specifies the driver location; see the docs for the exact option
```python
# options.binary_location = r"C:\Users\oldboy\AppData\Local\Google\Chrome\Application\chrome.exe"
# options.xx = 'D:\Python27\crawl_day03\chromedriver.exe'
```
```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('blink-settings=imagesEnabled=false')  # don't load images, for speed
options.add_argument('--headless')  # no visible window; on Linux without a display, omitting this makes startup fail
bro = webdriver.Chrome(options=options)
bro.get('https://www.cnblogs.com/liuqingzheng/p/16005896.html')
print('破解知乎登陆' in bro.page_source)
print(bro.page_source)
```
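Since there is no window to watch in headless mode, a screenshot is a handy way to check what actually rendered. A short sketch; the output file name is an assumption:

```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')
bro = webdriver.Chrome(options=options)
bro.get('https://www.cnblogs.com/liuqingzheng/p/16005896.html')
bro.save_screenshot('page.png')  # save what the headless browser rendered, for inspection
bro.quit()
```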
Locating elements
The locator types:
```python
By.ID                 # find by id
By.NAME               # find by the name attribute
By.TAG_NAME           # find by tag name
By.CLASS_NAME         # find by class name
By.LINK_TEXT          # find an <a> tag by its exact text
By.PARTIAL_LINK_TEXT  # find an <a> tag by partial text match
By.CSS_SELECTOR       # find by CSS selector
By.XPATH              # find by XPath
```
Element attributes, position, size, and text:
```python
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

bro = webdriver.Chrome()
bro.get('https://www.cnblogs.com/liuqingzheng/p/16005896.html')
bro.implicitly_wait(10)
bro.maximize_window()

# bro.find_element()   # find one
# bro.find_elements()  # find all

# 1 By id: find the "like" button and click it
number = bro.find_element(By.ID, 'digg_count')
number.click()

# 2 By tag name: find all <a> tags on the page
a_list = bro.find_elements(By.TAG_NAME, 'a')
print(len(a_list))

# 3 By class name
dig = bro.find_element(By.CLASS_NAME, 'diggit')
dig.click()

# 4 By the exact text of an <a> tag: By.LINK_TEXT
res = bro.find_element(By.LINK_TEXT, '分布式爬虫')
print(res.text)
# print(res.get_attribute('href'))
# res.click()

# 5 By partial <a> text: By.PARTIAL_LINK_TEXT
res = bro.find_element(By.PARTIAL_LINK_TEXT, '分布式')
print(res.text)
# print(res.get_attribute('href'))
# res.click()

# 6 By CSS selector
res = bro.find_element(By.CSS_SELECTOR, 'a#cb_post_title_url>span')
res = bro.find_element(By.CSS_SELECTOR, '#cb_post_title_url > span')
print(res.get_attribute('role'))
print(res.text)

# 7 By XPath; even without knowing XPath syntax, you can copy it from the browser devtools
res = bro.find_element(By.XPATH, '//*[@id="cb_post_title_url"]/span')
print(res.get_attribute('role'))
print(res.text)

time.sleep(5)
bro.close()
```
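The heading above also promises position and size, which the example doesn't show. A minimal sketch using an element borrowed from the same page:

```python
from selenium import webdriver
from selenium.webdriver.common.by import By

bro = webdriver.Chrome()
bro.get('https://www.cnblogs.com/liuqingzheng/p/16005896.html')
bro.implicitly_wait(10)

el = bro.find_element(By.ID, 'cb_post_title_url')
print(el.location)   # position on the page, e.g. {'x': ..., 'y': ...}
print(el.size)       # size, e.g. {'height': ..., 'width': ...}
print(el.tag_name)   # tag name, e.g. 'a'
print(el.text)       # visible text
print(el.get_attribute('href'))  # any attribute, by name
bro.quit()
```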
Waiting for elements
1. Implicit wait: wait for the element to load
Reason: find_element executes immediately, but the element may not have loaded yet, so looking it up would raise an error
Set an implicit wait with: bro.implicitly_wait(10)
With this line, if an element hasn't loaded when you look it up, Selenium waits up to 10s for it
Once the element loads and is found, execution continues
2. Explicit wait (clumsier to use; see the sketch after this list)
You have to set up a separate wait for every element you look up
In practice, just add bro.implicitly_wait(10) once after visiting a page
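For reference, this is what an explicit wait looks like with WebDriverWait; a minimal sketch reusing the digg_count id from the earlier example:

```python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

bro = webdriver.Chrome()
bro.get('https://www.cnblogs.com/liuqingzheng/p/16005896.html')

# Wait up to 10s for this one element to appear; raises TimeoutException otherwise
number = WebDriverWait(bro, 10).until(
    EC.presence_of_element_located((By.ID, 'digg_count'))
)
print(number.text)
bro.quit()
```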
Executing JavaScript
bro.execute_script('JS code')
```python
import time
from selenium import webdriver

bro = webdriver.Chrome()
bro.get('https://www.pearvideo.com/category_1')
bro.implicitly_wait(10)
bro.maximize_window()

# 1 Basic usage
bro.execute_script('alert("美女")')

# 2 Print page variables
bro.execute_script('console.log(urlMap)')
bro.execute_script('alert(JSON.stringify(urlMap))')

# 3 Open a new tab
bro.execute_script('open()')

# 4 Scroll the page
bro.execute_script('scrollTo(0,document.documentElement.scrollHeight)')

# 5 Get (or change) the current URL
bro.execute_script('alert(location)')
bro.execute_script('location="http://www.baidu.com"')

# 6 Print the cookies
bro.execute_script('alert(document.cookie)')

time.sleep(10)
bro.close()
```
Tabs
Open a tab (by executing JS): bro.execute_script('open()')
Switch to a tab: bro.switch_to.window(bro.window_handles[1])
Close the current tab: bro.close()
Close the browser: bro.quit()
```python
from selenium import webdriver
import time

bro = webdriver.Chrome()
bro.get('https://www.pearvideo.com/')
bro.implicitly_wait(10)
print(bro.window_handles)  # handles of all open tabs

# Open a new tab
bro.execute_script('window.open()')
# Switch to the second tab
bro.switch_to.window(bro.window_handles[1])
bro.get('http://www.taobao.com')
time.sleep(2)
# Switch back to the first tab
bro.switch_to.window(bro.window_handles[0])
bro.get('http://www.baidu.com')
time.sleep(2)
bro.execute_script('window.open()')
bro.execute_script('window.open()')
bro.close()  # close the current tab
time.sleep(2)
bro.quit()  # close the browser
```
Simulating browser forward and back
Forward: bro.forward()
Back: bro.back()
```python
from selenium import webdriver
import time

bro = webdriver.Chrome()
bro.get('https://www.pearvideo.com/')
bro.implicitly_wait(10)

time.sleep(2)
bro.get('http://www.taobao.com')
time.sleep(2)
bro.get('http://www.baidu.com')
time.sleep(2)
bro.back()     # go back
time.sleep(2)
bro.back()     # go back again
time.sleep(2)
bro.forward()  # go forward
bro.quit()     # close the browser
```
Auto-login to cnblogs
```python
import time
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# Hide the "browser is being controlled by automated software" detection
options = Options()
options.add_argument("--disable-blink-features=AutomationControlled")
bro = webdriver.Chrome(options=options)
bro.get('https://www.cnblogs.com/')
bro.implicitly_wait(10)  # implicit wait of 10s
bro.maximize_window()    # maximize the window

login_btn = bro.find_element(By.LINK_TEXT, '登录')
login_btn.click()  # click the login button
time.sleep(2)

# Find the username and password inputs and the submit button
username = bro.find_element(By.CSS_SELECTOR, '#mat-input-0')
password = bro.find_element(By.ID, 'mat-input-1')
submit_btn = bro.find_element(By.CSS_SELECTOR,
    'body > app-root > app-sign-in-layout > div > div > app-sign-in > app-content-container > div > div > div > form > div > button')
# Captcha element
code = bro.find_element(By.ID, 'rectMask')
time.sleep(1)
username.send_keys('糖果爱上我125')
time.sleep(1)
password.send_keys('20020308Zjq.')
time.sleep(1)
submit_btn.click()
# Either login succeeds directly, or a captcha pops up
code.click()
time.sleep(10)  # pause here, solve the captcha by hand in the browser, then the script continues

# At this point we are logged in; take the cookies and save them
cookies = bro.get_cookies()
with open('cnblogs.json', 'w', encoding='utf-8') as f:
    json.dump(cookies, f)

time.sleep(2)
bro.close()
```
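Once cnblogs.json exists, the saved cookies can be loaded into requests to reuse the login without a browser. A minimal sketch; checking for the account name in the page source is just one assumed way to verify the login worked:

```python
import json
import requests

# Load the cookies selenium saved and convert them to a simple name -> value dict
with open('cnblogs.json', 'r', encoding='utf-8') as f:
    cookies = {c['name']: c['value'] for c in json.load(f)}

res = requests.get('https://www.cnblogs.com/', cookies=cookies)
# If the login was reused successfully, the page should contain the account name
print('糖果爱上我125' in res.text)
```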