用 Python 爬虫抓取新闻并写入自己的远程 MySQL 数据库

用 Python 爬虫抓取新闻,并写入自己的远程 MySQL 数据库!这段代码是我花了很长时间才写好的,分享给大家,喜欢的话点个赞。


复制代码
# -*- coding: utf-8 -*-
from xml.etree import ElementTree as ET
import datetime
import random

import pymysql
from selenium import webdriver
from lxml import etree
from time import sleep
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By


def strreplace_v1(old_str, key, value):
    """Return a copy of ``old_str`` with every occurrence of ``key`` replaced by ``value``."""
    return old_str.replace(key, value)


def get_page_source_html(driver, urlinfo):
    """Navigate ``driver`` to ``urlinfo`` and return its page source parsed by ``etree.HTML``."""
    driver.get(urlinfo)
    # Read the source only after navigation so it reflects the requested URL.
    return etree.HTML(driver.page_source)


def get_page_source_etree(driver):
    """Parse the page ``driver`` currently has loaded and return the ``etree.HTML`` result.

    Unlike ``get_page_source_html`` this does not navigate anywhere first.
    """
    return etree.HTML(driver.page_source)


def get_list_a(etree, xpathinfo):
    """Evaluate the XPath ``xpathinfo`` on ``etree`` and return whatever ``.xpath()`` yields.

    Note that the ``etree`` parameter is the parsed tree object and shadows
    the module-level ``etree`` import inside this function.
    """
    matches = etree.xpath(xpathinfo)
    return matches


def get_news_title(etree, xpathino):
    """Evaluate the XPath ``xpathino`` on ``etree`` and return the result of ``.xpath()``.

    The ``etree`` parameter is the parsed tree object; it shadows the
    module-level ``etree`` import inside this function.
    """
    found = etree.xpath(xpathino)
    return found


def get_news_content(etree, xpathino):
    """Evaluate the XPath ``xpathino`` on ``etree`` and return the result of ``.xpath()``.

    The ``etree`` parameter is the parsed tree object; it shadows the
    module-level ``etree`` import inside this function.
    """
    found = etree.xpath(xpathino)
    return found


def get_news_publish(etree, xpathino):
    """Evaluate the XPath ``xpathino`` on ``etree`` and return the result of ``.xpath()``.

    The ``etree`` parameter is the parsed tree object; it shadows the
    module-level ``etree`` import inside this function.
    """
    found = etree.xpath(xpathino)
    return found


def getUA():
    """Return one User-Agent string picked at random from a fixed pool.

    Entries that are commented out are kept for reference but are not part of
    the pool.
    """
    candidates = [
        # 360 Browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
        # Chrome
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36",
        # "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
        # Firefox
        # "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
        "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:36.0) Gecko/20100101 Firefox/36.0",
        # IE 11
        # "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
        # IE 8
        # "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; 4399Box.1357; 4399Box.1253; 4399Box.1357)",
        # 2345 Explorer
        # "Chrome/39.0.2171.99 Safari/537.36 2345Explorer/6.5.0.11018",
        # Sogou
        # "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0",
        # Opera
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
    ]
    return random.choice(candidates)


def get_desurl_list(path='sitemap.xml'):
    """Collect the text of every ``<loc>`` element in a sitemap file.

    Args:
        path: Location of the sitemap XML file. Defaults to ``'sitemap.xml'``
            in the current working directory.

    Returns:
        A list of URL strings in document order, with surrounding whitespace
        removed. ``<loc>`` elements that are empty are skipped.

    Raises:
        OSError: If the file cannot be opened.
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    tree = ET.parse(path)
    urlinfo_list = []
    for node in tree.iter():
        # Standard sitemaps declare a default namespace, which makes the tag
        # "{namespace}loc"; compare on the local name so both namespaced and
        # plain files are handled.
        if not isinstance(node.tag, str):
            continue  # comments / processing instructions have a non-str tag
        if node.tag.rsplit('}', 1)[-1] != 'loc':
            continue
        text = (node.text or '').strip()
        if text:
            urlinfo_list.append(text)
    return urlinfo_list


def createwailian(urlwllist, urlzhiru):
    """Fill the ``hrefinfo`` placeholder in ``urlzhiru`` with a random URL.

    Args:
        urlwllist: Non-empty sequence of candidate URL strings.
        urlzhiru: Template text containing the literal placeholder
            ``hrefinfo``; every occurrence is replaced.

    Returns:
        The template with the placeholder substituted.

    Raises:
        IndexError: If ``urlwllist`` is empty.
    """
    # random.randint includes its upper bound, so indexing with
    # randint(0, len(urlwllist)) could run one past the end; random.choice
    # always picks a valid element.
    chosen = random.choice(urlwllist)
    return urlzhiru.replace('hrefinfo', chosen)


if __name__ == "__main__":
    # Script entry point: walk list pages 1-9 of www.106ms.com, open every
    # article linked from each list page, and insert its title, description
    # and body into the remote MySQL `news` table.

    # Loaded up front (and therefore requires sitemap.xml to exist); the
    # outbound-link injection that consumed it via createwailian() is
    # currently disabled.
    allwlurllist = get_desurl_list()

    options = Options()
    options.add_argument("--disable-desktop-notifications")
    # Chrome's switch for overriding the User-Agent is `--user-agent=<value>`;
    # a bare 'User-Agent=...' argument is not a recognised switch.
    options.add_argument('--user-agent=%s' % getUA())
    # Create the browser instance.
    driver = webdriver.Chrome(options=options)

    urlbegin = 'http://www.106ms.com/index.php?list=6-'
    base_url = 'http://www.106ms.com'
    try:
        for urlstart in range(1, 10):
            page_url = urlbegin + str(urlstart)
            print('当前正访问:{0}'.format(page_url))
            driver.get(page_url)

            # Select the href attribute directly so anchors without one are
            # skipped instead of raising IndexError outside any try block.
            hrefs = get_list_a(
                get_page_source_etree(driver),
                '//*[@id="moar"]/section[2]/div/div/div/dl/dd/h3/a/@href')
            sleep(1)
            list_a = [str(href) for href in hrefs]
            print("当前页面获取a标签集合长度为{0}".format(len(list_a)))
            sleep(1)

            # Bound before the try so the except/finally clauses and the
            # summary print below never hit an undefined name when the
            # connection attempt itself fails.
            readnumber = 0
            db = None
            cursor = None
            try:
                # NOTE(review): credentials are hard-coded in source; consider
                # loading them from environment variables or a config file.
                db = pymysql.Connect(
                    host='8.142.*.*',  # server IP address
                    port=3306,  # MySQL default port
                    user="106iiaa",  # user name
                    password="yrdsrootadmi3",  # password
                    charset="utf8",  # character set
                    db="hbdsa89aa"  # database name
                )
                cursor = db.cursor()
                for newsurl in list_a:
                    try:
                        driver.get(base_url + newsurl)
                        # find_element returns a WebElement, not a string: read
                        # the <meta> tag's `content` attribute for the text.
                        # NOTE(review): the positional meta[6] is assumed to be
                        # the description tag on this site - confirm.
                        meta_el = driver.find_element(By.XPATH, value='/html/head/meta[6]')
                        meta_text = meta_el.get_attribute('content') or ''

                        # page_source is already a str, so no from_encoding is
                        # passed (BeautifulSoup ignores it for str input).
                        soup = BeautifulSoup(driver.page_source, features='lxml')
                        # Strip images and links before extracting the body.
                        for tag in soup('img'):
                            tag.extract()
                        for tag in soup('a'):
                            tag.extract()
                        allp = soup.find("article", {"class": "content text-left"}).find_all('div')
                        # Drop the last <div> of the article, as before.
                        content2 = ''.join(str(x) for x in allp[:-1])

                        # Article title; also reused as the keywords column.
                        title = driver.find_element(
                            By.XPATH,
                            value='//*[@id="moar"]/section[2]/div/div/div/article/header/h2').text
                        keywords = title
                        content = content2
                        # Description column holds at most the first 120 chars.
                        des = meta_text.strip()[0:120]

                        sql = 'insert into news (title, keywords, des,content, author,publish,click,state,attr,attrdiy,flag,cate,uid) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
                        values = (
                            title, keywords, des, content, 'admin',
                            datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                            random.randint(100, 999),
                            0, 0, 0, '画室新闻', 5, 3)
                        cursor.execute(sql, values)
                        db.commit()
                        readnumber = readnumber + 1
                        print("标题:{0}---插入数据库成功".format(title))
                    except Exception as ee:
                        # Best effort per article: report and move on.
                        print('发生了异常', ee)
                        continue
            except Exception as e:
                # Report instead of silently swallowing, and only roll back
                # when a connection actually exists.
                print('发生了异常', e)
                if db is not None:
                    db.rollback()
            finally:
                if cursor is not None:
                    cursor.close()
                if db is not None:
                    db.close()
            sleep(5)

            print('本次任务成功植入{0}篇软文.'.format(readnumber))
            # Long pause between list pages.
            sleep(600)
    finally:
        # Always shut the browser down, even if a page raised.
        driver.quit()

前提是,你需要提前安装好代码中用到的第三方包(pymysql、selenium、lxml、beautifulsoup4)。

否则运行时会报错。

相关推荐
老徐聊GEO4 小时前
AI搜索流量转化率实测分享:我的案例与复盘
人工智能·python
草莓熊Lotso4 小时前
【LangChain】流式传输原理与 LangSmith 应用监控全解析
人工智能·python·langchain·gpt-3
数据知道4 小时前
指纹浏览器:DNS 泄漏防范与 WebRTC 本地 IP 屏蔽的底层实现
爬虫·网络协议·tcp/ip·安全·webrtc·数据采集·指纹浏览器
老毛肚11 小时前
jeecg-boot-base-core 02 day
javascript·python
yaoxin52112311 小时前
434. Java 日期时间 API - Period 基于日期的时间段
java·开发语言·python
岁月宁静12 小时前
RAG 文档摄入全链路,从原理到生产落地
vue.js·人工智能·python
火山上的企鹅12 小时前
Codex实战:APP远程升级服务搭建(三)后台管理页面(APK 上传、版本管理、多应用页签)
服务器·网络·数据库·oracle·qgc
JaydenAI12 小时前
[对比学习LangChain和MAF-07]如何引入人机交互的审批流程
python·ai·langchain·c#·agent·hitl·maf
阿狸猿13 小时前
论 NoSQL 数据库技术及其应用
数据库·nosql
神奇元创13 小时前
商用级光路加速卡:大模型推理的极速落地方案
python·神经网络·fpga开发·dsp开发