Python 版 HTML 正文提取(CEPF)

python 复制代码
from selectolax.parser import *
import math



class CountInfo:
    """Statistics gathered for one DOM node and everything beneath it.

    As filled in by ``ContentExtractor.computeInfo``:

    * ``textCount`` / ``linkTextCount`` -- characters of text in the subtree,
      and the part of that text that sits inside ``a`` elements.
    * ``tagCount`` / ``linkTagCount`` -- element count in the subtree
      (including the node itself), and how many of those are ``a``.
    * ``density`` -- non-link text length divided by non-link tag count.
    * ``densitySum`` -- sum of the ``density`` of the direct children.
    * ``pCount`` -- number of ``p`` elements in the subtree.
    * ``leafList`` -- lengths of the individual text nodes in the subtree.
    * ``score`` -- starts at 0; nothing in ``ContentExtractor`` assigns it.
    """

    def __init__(self):
        # Every counter starts at zero; the list is created per instance.
        self.textCount = self.linkTextCount = 0
        self.tagCount = self.linkTagCount = 0
        self.density = self.densitySum = self.score = 0
        self.pCount = 0
        self.leafList = []

    def __str__(self) -> str:
        """Render all fields as ``name: value`` pairs separated by commas."""
        field_names = (
            "textCount",
            "linkTextCount",
            "tagCount",
            "linkTagCount",
            "density",
            "densitySum",
            "score",
            "pCount",
            "leafList",
        )
        return ", ".join(
            f"{field}: {getattr(self, field)}" for field in field_names
        )


class ContentExtractor:
    """Locate the main-content element of an HTML page by text-density scoring.

    Typical use: call ``reload(html)`` to parse a page, then
    ``getContentElement()`` to obtain the highest-scoring node.
    """

    def __init__(self):
        # Give both attributes a defined value before the first reload().
        self.doc = None
        self.infoMap = []

    def reload(self, content):
        """Parse *content* as HTML and drop statistics from any earlier page."""
        self.doc = HTMLParser(content)
        self.infoMap = []

    def clear(self):
        """Strip script/noscript/style/iframe/br tags from the parsed document."""
        tags = ["script", "noscript", "style", "iframe", "br"]
        self.doc.strip_tags(tags)

    def computeInfo(self, node):
        """Recursively collect a ``CountInfo`` for *node* and its subtree.

        Every element visited is appended to ``self.infoMap`` as
        ``{"node": node, "countInfo": info}``. Text nodes only contribute
        their length and are not recorded.

        Args:
            node: a parsed node exposing ``tag``, ``text_content`` and
                ``iter(include_text=True)``.

        Returns:
            The ``CountInfo`` describing *node*.

        Note:
            The walk is recursive, so its depth follows the depth of the DOM.
        """
        tag = node.tag

        if tag == "-text":
            leaf = CountInfo()
            # Guard against a missing text value instead of failing in len().
            text_len = len(node.text_content or "")
            leaf.textCount = text_len
            leaf.leafList.append(text_len)
            return leaf

        if tag is None or tag.startswith("-"):
            # NOTE(review): selectolax is believed to report non-element nodes
            # such as comments with a "-"-prefixed tag ("-comment") -- confirm.
            # They contribute nothing and must not be counted as tags.
            return CountInfo()

        info = CountInfo()
        for child in node.iter(include_text=True):
            child_info = self.computeInfo(child)
            info.textCount += child_info.textCount
            info.linkTextCount += child_info.linkTextCount
            info.tagCount += child_info.tagCount
            info.linkTagCount += child_info.linkTagCount
            info.leafList.extend(child_info.leafList)
            info.densitySum += child_info.density
            info.pCount += child_info.pCount

        info.tagCount += 1  # count the element itself
        if tag == "a":
            # All text under a link counts as link text.
            info.linkTextCount = info.textCount
            info.linkTagCount += 1
        elif tag == "p":
            info.pCount += 1

        pure_len = info.textCount - info.linkTextCount
        tag_len = info.tagCount - info.linkTagCount
        if pure_len == 0 or tag_len == 0:
            info.density = 0
        else:
            info.density = pure_len / tag_len

        self.infoMap.append({"node": node, "countInfo": info})
        return info

    def computerVar(self, data):
        """Return the population variance of *data*.

        Two special cases: an empty sequence yields 0, and a single-element
        sequence yields half of that element (not a true variance).
        """
        if not data:
            return 0
        count = len(data)
        if count == 1:
            return data[0] / 2
        avg = sum(data) / count
        return sum((x - avg) ** 2 for x in data) / count

    def computeScore(self, countInfo):
        """Compute the content score for one node's ``CountInfo``.

        The score is the product of: the natural log of
        ``sqrt(variance(leafList) + 1)``, ``densitySum``, the natural log of
        the non-link text length plus one, and ``log10(pCount + 2)``.
        """
        spread = math.sqrt(self.computerVar(countInfo.leafList) + 1)
        pure_len = countInfo.textCount - countInfo.linkTextCount
        return (
            math.log(spread)
            * countInfo.densitySum
            * math.log(pure_len + 1)
            * math.log10(countInfo.pCount + 2)
        )

    def getContentElement(self):
        """Return the node with the highest positive score.

        Returns:
            The best-scoring node (``a`` and ``body`` are never chosen),
            ``None`` when no node scores above zero, or ``""`` when the
            document has no body. The empty string is kept so existing
            callers that compare against it keep working.

        Raises:
            AttributeError: if no document has been loaded with ``reload()``.
        """
        if self.doc is None:
            # AttributeError is what an unloaded extractor raises on attribute
            # access anyway; raising it here only adds a usable message.
            raise AttributeError("no document loaded; call reload() first")

        # Start from a clean slate so repeated calls do not pile up duplicates.
        self.infoMap = []
        self.clear()

        body = self.doc.body
        if not body:
            return ""
        self.computeInfo(body)

        best_node = None
        best_score = 0
        for entry in self.infoMap:
            node = entry["node"]
            if node.tag in ("a", "body"):
                continue
            score = self.computeScore(entry["countInfo"])
            if score > best_score:
                best_score, best_node = score, node

        return best_node
相关推荐
apihz几秒前
域名WHOIS信息查询免费API使用指南
android·开发语言·数据库·网络协议·tcp/ip
coding随想14 分钟前
掌控网页的魔法之书:JavaScript DOM的奇幻之旅
开发语言·javascript·ecmascript
Norvyn_723 分钟前
LeetCode|Day18|20. 有效的括号|Python刷题笔记
笔记·python·leetcode
爱吃烤鸡翅的酸菜鱼33 分钟前
IDEA高效开发:Database Navigator插件安装与核心使用指南
java·开发语言·数据库·编辑器·intellij-idea·database
chao_78941 分钟前
更灵活方便的初始化、清除方法——fixture【pytest】
服务器·自动化测试·python·pytest
心情好的小球藻1 小时前
Python应用进阶DAY9--类型注解Type Hinting
开发语言·python
都叫我大帅哥1 小时前
LangChain加载HTML内容全攻略:从入门到精通
python·langchain
惜.己1 小时前
使用python读取json数据,简单的处理成元组数组
开发语言·python·测试工具·json
Y4090011 小时前
C语言转Java语言,相同与相异之处
java·c语言·开发语言·笔记
DanB242 小时前
html复习
javascript·microsoft·html