Python 版 HTML 正文提取(CEPF)

python 复制代码
from selectolax.parser import *
import math



class CountInfo:
    """Mutable bag of per-node counters used while scoring an HTML subtree.

    All numeric counters start at zero and ``leafList`` starts as a fresh
    empty list, so instances never share state.
    """

    # Attribute names in the order they are rendered by ``__str__``.
    _FIELDS = (
        "textCount",
        "linkTextCount",
        "tagCount",
        "linkTagCount",
        "density",
        "densitySum",
        "score",
        "pCount",
        "leafList",
    )

    def __init__(self):
        # Text and tag tallies (overall and link-only).
        self.textCount = self.linkTextCount = 0
        self.tagCount = self.linkTagCount = 0
        # Derived metrics.
        self.density = self.densitySum = self.score = 0
        # Paragraph tally and the collected leaf values.
        self.pCount = 0
        self.leafList = []

    def __str__(self) -> str:
        """Return every field as ``name: value`` pairs joined by ``", "``."""
        return ", ".join(
            f"{field}: {getattr(self, field)}" for field in self._FIELDS
        )


class ContentExtractor:
    """Pick the element of an HTML document most likely to hold the main text.

    Every element below ``<body>`` gets a score built from its text density
    (non-link characters per non-link tag), the variance of its text-node
    lengths and its number of ``<p>`` elements; the element with the highest
    positive score is returned.

    Usage: call ``reload(html)`` and then ``getContentElement()``.
    """

    def __init__(self):
        # Created here so the instance is in a defined state before reload().
        self.doc = None
        # One {"node": ..., "countInfo": ...} dict per visited element.
        self.infoMap = []

    def reload(self, content):
        """Parse ``content`` as HTML and discard statistics of a previous run."""
        self.doc = HTMLParser(content)
        self.infoMap = []

    def clear(self):
        """Strip script, noscript, style, iframe and br tags from the document.

        Does nothing when no document has been loaded.
        """
        if self.doc is None:
            return
        # Passed as a list, exactly as the original call did.
        tags = ["script", "noscript", "style", "iframe", "br"]
        self.doc.strip_tags(tags)

    def computeInfo(self, node):
        """Recursively collect text/tag statistics for ``node``.

        Text nodes contribute their length. Element nodes aggregate their
        children, get their own density computed and are recorded in
        ``self.infoMap``. Other node kinds (comments and similar) contribute
        nothing, so they neither count as tags nor appear in ``infoMap``.

        Returns:
            The ``CountInfo`` describing ``node`` and everything below it.
        """
        tag = node.tag

        if tag == "-text":
            info = CountInfo()
            # Guard against a missing text payload instead of failing in len().
            text_len = len(node.text_content or "")
            info.textCount = text_len
            info.leafList.append(text_len)
            return info

        # Real element names never start with "-" or "!"; such tags mark
        # non-element nodes (e.g. "-comment"), which must not dilute density.
        if tag is None or tag.startswith(("-", "!")):
            return CountInfo()

        info = CountInfo()
        for child in node.iter(include_text=True):
            child_info = self.computeInfo(child)
            info.textCount += child_info.textCount
            info.linkTextCount += child_info.linkTextCount
            info.tagCount += child_info.tagCount
            info.linkTagCount += child_info.linkTagCount
            info.leafList.extend(child_info.leafList)
            # Sum of the direct children's densities, not of this node's own.
            info.densitySum += child_info.density
            info.pCount += child_info.pCount

        info.tagCount += 1
        if tag == "a":
            # Everything inside a link counts as link text.
            info.linkTextCount = info.textCount
            info.linkTagCount += 1
        elif tag == "p":
            info.pCount += 1

        pure_len = info.textCount - info.linkTextCount
        tag_len = info.tagCount - info.linkTagCount
        if pure_len == 0 or tag_len == 0:
            info.density = 0
        else:
            info.density = pure_len / tag_len

        self.infoMap.append({"node": node, "countInfo": info})
        return info

    def computerVar(self, data) -> float:
        """Return the population variance of ``data``.

        Special cases kept from the original implementation: an empty
        sequence yields 0 and a single value ``v`` yields ``v / 2``.
        """
        if not data:
            return 0
        if len(data) == 1:
            return data[0] / 2
        avg = sum(data) / len(data)
        return sum((x - avg) ** 2 for x in data) / len(data)

    def computeScore(self, countInfo) -> float:
        """Compute the content score for one element's statistics.

        score = ln(sqrt(var(leafList) + 1))
                * densitySum
                * ln(textCount - linkTextCount + 1)
                * log10(pCount + 2)
        """
        spread = math.sqrt(self.computerVar(countInfo.leafList) + 1)
        pure_len = countInfo.textCount - countInfo.linkTextCount
        return (
            math.log(spread)
            * countInfo.densitySum
            * math.log(pure_len + 1)
            * math.log10(countInfo.pCount + 2)
        )

    def getContentElement(self):
        """Return the highest-scoring element, or ``None`` if there is none.

        ``None`` is returned when no document is loaded, when the document
        has no body, or when no element reaches a score above zero. ``<a>``
        and ``<body>`` themselves are never selected.
        """
        if self.doc is None:
            return None

        self.clear()
        body = self.doc.body
        if not body:
            # Same "nothing found" value as the no-positive-score case below.
            return None

        # Start from scratch so repeated calls do not pile up duplicates.
        self.infoMap = []
        self.computeInfo(body)

        content = None
        max_score = 0
        for entry in self.infoMap:
            node = entry["node"]
            if node.tag in ("a", "body"):
                continue
            score = self.computeScore(entry["countInfo"])
            # Strict comparison: the first element reaching the maximum wins.
            if score > max_score:
                max_score = score
                content = node

        return content
相关推荐
databook2 小时前
Manim实现闪光轨迹特效
后端·python·动效
Juchecar3 小时前
解惑:NumPy 中 ndarray.ndim 到底是什么?
python
用户8356290780513 小时前
Python 删除 Excel 工作表中的空白行列
后端·python
Json_3 小时前
使用python-fastApi框架开发一个学校宿舍管理系统-前后端分离项目
后端·python·fastapi
数据智能老司机10 小时前
精通 Python 设计模式——分布式系统模式
python·设计模式·架构
数据智能老司机11 小时前
精通 Python 设计模式——并发与异步模式
python·设计模式·编程语言
数据智能老司机11 小时前
精通 Python 设计模式——测试模式
python·设计模式·架构
数据智能老司机11 小时前
精通 Python 设计模式——性能模式
python·设计模式·架构
c8i11 小时前
drf初步梳理
python·django
每日AI新事件11 小时前
python的异步函数
python