python版 html正文提取(CEPF)

python 复制代码
from selectolax.parser import *
import math



class CountInfo:
    def __init__(self):
        self.textCount = 0
        self.linkTextCount = 0
        self.tagCount = 0
        self.linkTagCount = 0
        self.density = 0
        self.densitySum = 0
        self.score = 0
        self.pCount = 0
        self.leafList = []

    def __str__(self) -> str:
        return f"textCount: {self.textCount}, linkTextCount: {self.linkTextCount}, tagCount: {self.tagCount}, linkTagCount: {self.linkTagCount}, density: {self.density}, densitySum: {self.densitySum}, score: {self.score}, pCount: {self.pCount}, leafList: {self.leafList}"


class ContentExtractor:

    def __init__(self):
        pass

    def reload(self, content):
        self.doc = HTMLParser(content)
        self.infoMap = []

    def clear(self):
        tags = ["script", "noscript", "style", "iframe", "br"]
        self.doc.strip_tags(tags)

    def computeInfo(self, node):
        if node.tag != "-text":
            countInfo = CountInfo()
            for child_node in node.iter(include_text=True):
                childCountInfo = self.computeInfo(child_node)
                countInfo.textCount += childCountInfo.textCount
                countInfo.linkTextCount += childCountInfo.linkTextCount
                countInfo.tagCount += childCountInfo.tagCount
                countInfo.linkTagCount += childCountInfo.linkTagCount
                countInfo.leafList.extend(childCountInfo.leafList)
                countInfo.densitySum += childCountInfo.density
                countInfo.pCount += childCountInfo.pCount
            countInfo.tagCount += 1
            tagname = node.tag
            if tagname == "a":
                countInfo.linkTextCount = countInfo.textCount
                countInfo.linkTagCount += 1
            elif tagname == "p":
                countInfo.pCount += 1

            pureLen = countInfo.textCount - countInfo.linkTextCount
            tag_len = countInfo.tagCount - countInfo.linkTagCount

            if pureLen == 0 or tag_len == 0:
                countInfo.density = 0
            else:
                countInfo.density = pureLen / tag_len

            self.infoMap.append({"node": node, "countInfo": countInfo})
            return countInfo
        else:
            countInfo = CountInfo()
            text = node.text_content
            text_len = len(text)
            countInfo.textCount = text_len
            countInfo.leafList.append(text_len)
            return countInfo

    def computerVar(self, data):
        """方差"""
        if not data:
            return 0
        if len(data) == 1:
            return data[0] / 2
        avg = sum(data) / len(data)
        return sum((x - avg) ** 2 for x in data) / len(data)

    def computeScore(self, countInfo):
        "计算得分"

        sqrt = math.sqrt(self.computerVar(countInfo.leafList) + 1)
        score = (
            math.log(sqrt)
            * countInfo.densitySum
            * math.log(countInfo.textCount - countInfo.linkTextCount + 1)
            * math.log10(countInfo.pCount + 2)
        )
        return score

    def getContentElement(self):
        self.clear()
        if not self.doc.body:

            return ""
        self.computeInfo(self.doc.body)
        content = None
        maxScore = 0

        for obj in self.infoMap:
            node = obj.get("node")
            if node.tag == "a" or node.tag == "body":
                continue
            score = self.computeScore(obj.get("countInfo"))
            if score > maxScore:
                maxScore = score
                content = node

        return content
相关推荐
VBA63372 分钟前
VBA数据库解决方案第十五讲:Recordset集合中单个数据的精确处理
开发语言
wrx繁星点点6 分钟前
事务的四大特性(ACID)
java·开发语言·数据库
不写八个13 分钟前
Python办公自动化教程(005):Word添加段落
开发语言·python·word
HEX9CF17 分钟前
【CTF Web】Pikachu xss之href输出 Writeup(GET请求+反射型XSS+javascript:伪协议绕过)
开发语言·前端·javascript·安全·网络安全·ecmascript·xss
_.Switch30 分钟前
Python机器学习框架介绍和入门案例:Scikit-learn、TensorFlow与Keras、PyTorch
python·机器学习·架构·tensorflow·keras·scikit-learn
赵荏苒42 分钟前
Python小白之Pandas1
开发语言·python
丶Darling.44 分钟前
代码随想录 | Day26 | 二叉树:二叉搜索树中的插入操作&&删除二叉搜索树中的节点&&修剪二叉搜索树
开发语言·数据结构·c++·笔记·学习·算法
人生の三重奏1 小时前
前端——js补充
开发语言·前端·javascript
酷酷-1 小时前
彩虹易支付最新版源码及安装教程(修复BUG+新增加订单投诉功能)
html·php·bug
平凡的小码农1 小时前
JAVA实现大写金额转小写金额
java·开发语言