python版 html正文提取(CEPF)

python 复制代码
from selectolax.parser import *
import math



class CountInfo:
    def __init__(self):
        self.textCount = 0
        self.linkTextCount = 0
        self.tagCount = 0
        self.linkTagCount = 0
        self.density = 0
        self.densitySum = 0
        self.score = 0
        self.pCount = 0
        self.leafList = []

    def __str__(self) -> str:
        return f"textCount: {self.textCount}, linkTextCount: {self.linkTextCount}, tagCount: {self.tagCount}, linkTagCount: {self.linkTagCount}, density: {self.density}, densitySum: {self.densitySum}, score: {self.score}, pCount: {self.pCount}, leafList: {self.leafList}"


class ContentExtractor:

    def __init__(self):
        pass

    def reload(self, content):
        self.doc = HTMLParser(content)
        self.infoMap = []

    def clear(self):
        tags = ["script", "noscript", "style", "iframe", "br"]
        self.doc.strip_tags(tags)

    def computeInfo(self, node):
        if node.tag != "-text":
            countInfo = CountInfo()
            for child_node in node.iter(include_text=True):
                childCountInfo = self.computeInfo(child_node)
                countInfo.textCount += childCountInfo.textCount
                countInfo.linkTextCount += childCountInfo.linkTextCount
                countInfo.tagCount += childCountInfo.tagCount
                countInfo.linkTagCount += childCountInfo.linkTagCount
                countInfo.leafList.extend(childCountInfo.leafList)
                countInfo.densitySum += childCountInfo.density
                countInfo.pCount += childCountInfo.pCount
            countInfo.tagCount += 1
            tagname = node.tag
            if tagname == "a":
                countInfo.linkTextCount = countInfo.textCount
                countInfo.linkTagCount += 1
            elif tagname == "p":
                countInfo.pCount += 1

            pureLen = countInfo.textCount - countInfo.linkTextCount
            tag_len = countInfo.tagCount - countInfo.linkTagCount

            if pureLen == 0 or tag_len == 0:
                countInfo.density = 0
            else:
                countInfo.density = pureLen / tag_len

            self.infoMap.append({"node": node, "countInfo": countInfo})
            return countInfo
        else:
            countInfo = CountInfo()
            text = node.text_content
            text_len = len(text)
            countInfo.textCount = text_len
            countInfo.leafList.append(text_len)
            return countInfo

    def computerVar(self, data):
        """方差"""
        if not data:
            return 0
        if len(data) == 1:
            return data[0] / 2
        avg = sum(data) / len(data)
        return sum((x - avg) ** 2 for x in data) / len(data)

    def computeScore(self, countInfo):
        "计算得分"

        sqrt = math.sqrt(self.computerVar(countInfo.leafList) + 1)
        score = (
            math.log(sqrt)
            * countInfo.densitySum
            * math.log(countInfo.textCount - countInfo.linkTextCount + 1)
            * math.log10(countInfo.pCount + 2)
        )
        return score

    def getContentElement(self):
        self.clear()
        if not self.doc.body:

            return ""
        self.computeInfo(self.doc.body)
        content = None
        maxScore = 0

        for obj in self.infoMap:
            node = obj.get("node")
            if node.tag == "a" or node.tag == "body":
                continue
            score = self.computeScore(obj.get("countInfo"))
            if score > maxScore:
                maxScore = score
                content = node

        return content
相关推荐
2***B44921 分钟前
C++在金融中的QuantLibXL
开发语言·c++·金融
A***071735 分钟前
C++在游戏中的阴影渲染
开发语言·c++·游戏
2401_8370885038 分钟前
Redisson的multilock原理
java·开发语言
能鈺CMS44 分钟前
内容付费系统全面解析:构建知识变现体系的最强工具(2025 SEO 深度专题)
大数据·人工智能·html
合作小小程序员小小店1 小时前
桌面开发,在线%超市销售管理%系统,基于vs2022,c#,winform,sql server数据
开发语言·数据库·microsoft·c#
Salt_07281 小时前
DAY 19 数组的常见操作和形状
人工智能·python·机器学习
无心水2 小时前
【Python实战进阶】2、Jupyter Notebook终极指南:为什么说不会Jupyter就等于不会Python?
python·jupyter·信息可视化·binder·google colab·python实战进阶·python工程化实战进阶
Q***l6872 小时前
C++在计算机图形学中的渲染
开发语言·c++
0和1的舞者2 小时前
《网络编程核心概念与 UDP Socket 组件深度解析》
java·开发语言·网络·计算机网络·udp·socket
惜棠2 小时前
visual code + rust入门指南
开发语言·后端·rust