python版 html正文提取(CEPF)

python 复制代码
from selectolax.parser import *
import math



class CountInfo:
    def __init__(self):
        self.textCount = 0
        self.linkTextCount = 0
        self.tagCount = 0
        self.linkTagCount = 0
        self.density = 0
        self.densitySum = 0
        self.score = 0
        self.pCount = 0
        self.leafList = []

    def __str__(self) -> str:
        return f"textCount: {self.textCount}, linkTextCount: {self.linkTextCount}, tagCount: {self.tagCount}, linkTagCount: {self.linkTagCount}, density: {self.density}, densitySum: {self.densitySum}, score: {self.score}, pCount: {self.pCount}, leafList: {self.leafList}"


class ContentExtractor:

    def __init__(self):
        pass

    def reload(self, content):
        self.doc = HTMLParser(content)
        self.infoMap = []

    def clear(self):
        tags = ["script", "noscript", "style", "iframe", "br"]
        self.doc.strip_tags(tags)

    def computeInfo(self, node):
        if node.tag != "-text":
            countInfo = CountInfo()
            for child_node in node.iter(include_text=True):
                childCountInfo = self.computeInfo(child_node)
                countInfo.textCount += childCountInfo.textCount
                countInfo.linkTextCount += childCountInfo.linkTextCount
                countInfo.tagCount += childCountInfo.tagCount
                countInfo.linkTagCount += childCountInfo.linkTagCount
                countInfo.leafList.extend(childCountInfo.leafList)
                countInfo.densitySum += childCountInfo.density
                countInfo.pCount += childCountInfo.pCount
            countInfo.tagCount += 1
            tagname = node.tag
            if tagname == "a":
                countInfo.linkTextCount = countInfo.textCount
                countInfo.linkTagCount += 1
            elif tagname == "p":
                countInfo.pCount += 1

            pureLen = countInfo.textCount - countInfo.linkTextCount
            tag_len = countInfo.tagCount - countInfo.linkTagCount

            if pureLen == 0 or tag_len == 0:
                countInfo.density = 0
            else:
                countInfo.density = pureLen / tag_len

            self.infoMap.append({"node": node, "countInfo": countInfo})
            return countInfo
        else:
            countInfo = CountInfo()
            text = node.text_content
            text_len = len(text)
            countInfo.textCount = text_len
            countInfo.leafList.append(text_len)
            return countInfo

    def computerVar(self, data):
        """方差"""
        if not data:
            return 0
        if len(data) == 1:
            return data[0] / 2
        avg = sum(data) / len(data)
        return sum((x - avg) ** 2 for x in data) / len(data)

    def computeScore(self, countInfo):
        "计算得分"

        sqrt = math.sqrt(self.computerVar(countInfo.leafList) + 1)
        score = (
            math.log(sqrt)
            * countInfo.densitySum
            * math.log(countInfo.textCount - countInfo.linkTextCount + 1)
            * math.log10(countInfo.pCount + 2)
        )
        return score

    def getContentElement(self):
        self.clear()
        if not self.doc.body:

            return ""
        self.computeInfo(self.doc.body)
        content = None
        maxScore = 0

        for obj in self.infoMap:
            node = obj.get("node")
            if node.tag == "a" or node.tag == "body":
                continue
            score = self.computeScore(obj.get("countInfo"))
            if score > maxScore:
                maxScore = score
                content = node

        return content
相关推荐
JELEE.2 小时前
Django登录注册完整代码(图片、邮箱验证、加密)
前端·javascript·后端·python·django·bootstrap·jquery
孫治AllenSun4 小时前
【算法】图相关算法和递归
windows·python·算法
QX_hao4 小时前
【Go】--反射(reflect)的使用
开发语言·后端·golang
inferno5 小时前
Maven基础(二)
java·开发语言·maven
我是李武涯5 小时前
从`std::mutex`到`std::lock_guard`与`std::unique_lock`的演进之路
开发语言·c++
史不了6 小时前
静态交叉编译rust程序
开发语言·后端·rust
读研的武6 小时前
DashGo零基础入门 纯Python的管理系统搭建
开发语言·python
Andy7 小时前
Python基础语法4
开发语言·python
但要及时清醒7 小时前
ArrayList和LinkedList
java·开发语言