python doc转png踩坑历程分享

首先python根据文本内容生成doc，使用的是python-docx库，使用示例如下：

复制代码

from docx import Document
from docx.shared import Pt, RGBColor
from docx.oxml.ns import qn
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT  #设置对象居中、对齐等


# Build a one-paragraph demo document and save it as ./a.docx.
document = Document()
# Default font: Microsoft YaHei. The East Asian font slot is set explicitly on
# the rFonts element so that CJK text is assigned the same typeface.
document.styles['Normal'].font.name = '微软雅黑'
document.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), '微软雅黑')
# Title paragraph: 16 pt, centered, bold, black, line spacing 1.3.
title = document.add_paragraph()
# Line spacing (a float is a multiple of the line height).
title.paragraph_format.line_spacing = 1.3
# Center the paragraph.
title.paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
# Add the title text.
run = title.add_run('测试文档')
# Font size: 16 pt.
run.font.size = Pt(16)
# Font color: black.
run.font.color.rgb = RGBColor(0, 0, 0)
# Bold.
run.font.bold = True
document.save('./a.docx')

下面进入重点python doc怎么转png，调研并测试过这几个方法：

使用unoconv和libreoffice，doc转pdf，pdf转png【环境centos 7】

复制代码

# 安装 libreoffice  可将doc转pdf
yum install libreoffice-writer
yum install unoconv
# 安装imagemagick
yum install ImageMagick


1.安装字体库
yum -y install fontconfig

2.添加中文字体，建立存储中文字体的文件夹
mkdir /usr/share/fonts/chinese

3.在windows上打开c盘下的Windows/Fonts目录，一般选择宋体和黑体，可以看到2个后缀名ttf和ttc的文件，将中文字体复制到Linux中那个chinese文件夹

4.添加权限
chmod -R 755 /usr/share/fonts/chinese

5.安装ttmkfdir来搜索目录中所有的字体信息，并汇总生成fonts.scale文件
yum -y install ttmkfdir

6.接下来生成
ttmkfdir -e /usr/share/X11/fonts/encodings/encodings.dir

7.修改字体配置文件
vi /etc/fonts/fonts.conf

8.在该文件中可以看到一个Font list，即字体目录列表；在其中添加中文字体文件夹的位置（/usr/share/fonts/chinese），保存后生成缓存
fc-cache

查看中文字体是否被添加进去
fc-list



# 执行转换
$ unoconv -f pdf -o ./11.pdf 11.docx 
$ sz 11.pdf 

$ unoconv -f png -o ./11.png 11.pdf  # 只能转换出1张图片
# 多页pdf可改用ImageMagick的convert，会生成多张：image_transparent-0.png image_transparent-1.png
$ convert -density 300 12.pdf -alpha off -background white -quality 100 image_transparent.png
$ sz 11.png

总结：可行，速度不行

spire.doc，pip安装库spire-doc，总结如下：

这是一个收费库，未付费时输出带有水印；价格偏贵，没有预算购买（官网价格：https://www.e-iceblue.com/Buy/Spire.Doc.html）。
经测试水印位置，上中下都有，尝试去水印，去除的不彻底边缘能看到红色，或者导致原始文档内容不连续。
尝试doc直接转svg，修改svg水印字颜色和字号，结果转出的png中英文显示正常，中文显示为方框，还出现文字重叠的情况。
速度还是挺快的。

aspose words，pip安装库aspose-words，总结如下：

与spire.doc一样收费，价格没去看，就看这个水印就放弃了，如图：

使用unoconv和libreoffice，doc转pdf，再使用fitz库pdf转png，总结如下：

可行，此方案仅仅想在第一种方案上进行速度优化。
实际验证速度还是不行。

创建带样式的模版html，读doc转html，替换掉模版正文，DrissionPage读取本地html文件，截屏存储为png。

最后这种思路虽然有点绕，不过速度是最快的。
不产生任何费用，DrissionPage（简称DP）是最优解。

以下是测试代码：

复制代码

import os
import platform

from PIL import Image
# import cairosvg
from spire.doc import *
from spire.doc.common import *
# import xml.etree.ElementTree as ET
from lxml import etree as ET


def remove_red_watermark(image_path, output_path, red_threshold=100):
    """Paint reddish pixels white and write the result as a PNG.

    Args:
        image_path: Path of the image to read.
        output_path: Path the cleaned PNG is written to.
        red_threshold: A pixel counts as red when its red channel is above
            this value and both green and blue are below ``red_threshold + 20``.
    """
    # Work in RGBA so every pixel unpacks into four channels.
    picture = Image.open(image_path).convert("RGBA")
    px = picture.load()
    other_channel_limit = red_threshold + 20

    for col in range(picture.width):
        for row in range(picture.height):
            red, green, blue, alpha = px[col, row]
            if red <= red_threshold:
                continue
            if green >= other_channel_limit or blue >= other_channel_limit:
                continue
            # Replace the colour with white but keep the pixel's original alpha.
            px[col, row] = (255, 255, 255, alpha)

    picture.save(output_path, format="PNG")


# 读取SVG文件
def read_svg(file_path):
    """Parse the SVG file at ``file_path``.

    Returns:
        A ``(root, tree)`` tuple: the document's root element and the parsed
        tree it belongs to.
    """
    document_tree = ET.parse(file_path)
    return document_tree.getroot(), document_tree


# 修改SVG中的<tspan>标签
def modify_tspan_tags(root, target_fill, new_fill, new_font_size):
    """Recolor and resize every SVG ``<tspan>`` whose ``fill`` equals ``target_fill``.

    Elements are modified in place and nothing is returned. Only ``fill`` and
    ``font-size`` are written; every other attribute (including ``transform``)
    is left exactly as it was, so elements without a ``transform`` do not gain
    an empty one.

    Args:
        root: Root element of the parsed SVG; searched recursively.
        target_fill: ``fill`` attribute value that selects the elements to change.
        new_fill: Replacement value for ``fill``.
        new_font_size: Value written to ``font-size`` (an attribute string).
    """
    for tspan in root.findall('.//{http://www.w3.org/2000/svg}tspan'):
        current_fill = tspan.get('fill')
        # Elements without a fill attribute never match, whatever target_fill is.
        if current_fill is None or current_fill != target_fill:
            continue
        tspan.set('fill', new_fill)
        tspan.set('font-size', new_font_size)


# 保存修改后的SVG文件
def save_svg(tree, output_path):
    """Write ``tree`` to ``output_path`` as UTF-8 with a standalone XML declaration."""
    write_options = {
        "encoding": "utf-8",
        "xml_declaration": True,
        "standalone": "yes",
    }
    tree.write(output_path, **write_options)


# 将SVG文件转换为PNG格式
# def convert_svg_to_png(svg_path, png_path):
#     try:
#         # cairosvg.svg2png(url=svg_path, write_to=png_path)
#         # 使用cairosvg生成PNG的字节流
#         with open(svg_path, "r", encoding="utf-8") as f:
#             svg_content = f.read()
#         cairosvg.svg2png(bytestring=svg_content.encode('GBK'), write_to=png_path)
#
#     except Exception as e:
#         print(f'svg convert png err:{svg_path}')


# 主函数
def main(input_svg_path):
    """Blank out the red watermark text in one SVG file and save a modified copy.

    The copy is written next to the input as ``<stem>modified_input.svg``,
    where ``<stem>`` is the input path without its extension. The input file
    itself is never overwritten.

    Args:
        input_svg_path: Path of the SVG file to process.
    """
    root, tree = read_svg(input_svg_path)

    # Target the <tspan> elements filled with pure red: turn them white and
    # shrink them to font-size 0.
    modify_tspan_tags(root, '#ff0000', 'white', '0')

    # Strip only the real extension. str.replace('.svg', ...) would also rewrite
    # '.svg' appearing elsewhere in the path, and would return the input path
    # unchanged (overwriting the source) when the extension is not exactly '.svg'.
    stem, _ = os.path.splitext(input_svg_path)
    output_svg_path = 'modified_input.svg'
    new_svg = stem + output_svg_path

    save_svg(tree, new_svg)
    print(f"修改后的SVG文件已保存为：{new_svg}")

    output_png_path = stem + '.png'

    # NOTE: the SVG -> PNG conversion step is disabled; only the path is reported.
    # convert_svg_to_png(new_svg, output_png_path)
    print(f"修改后的SVG文件已转换为PNG格式：{output_png_path}")


def gen_png(file_path):
    """Export a Word document to SVG, then post-process the SVGs beside it.

    The document is saved under ``<file_path without '.docx'>-.svg``. After
    that, every ``.svg`` file found under the document's directory (searched
    recursively) is passed to ``main``.

    Args:
        file_path: Path of the Word document to convert.
    """
    document = Document()
    try:
        document.LoadFromFile(file_path)
        new_p = file_path.replace('.docx', '')
        document.SaveToFile(f"{new_p}-.svg")
    finally:
        # Close the document even when loading or saving fails.
        document.Close()

    # os.path.dirname handles the platform's separators; a bare filename has
    # no directory part, so fall back to the current directory.
    file_dir = os.path.dirname(file_path) or '.'
    print(f'file_dir==={file_dir}')
    for walk_root, _, fs in os.walk(file_dir):
        for ff in fs:
            if ff.endswith('.svg'):
                # Join with the directory being walked, not the top directory,
                # so files found in subdirectories resolve to real paths.
                main(os.path.join(walk_root, ff))


def gen_png1(file_path):
    """Render each page of a Word document to PNG and whiten red pixels.

    For page index ``i`` two files are written next to the document: the raw
    render ``<base>-<i>_output.png`` and the cleaned ``<base>-<i>.png``, where
    ``<base>`` is ``file_path`` with ``'.docx'`` removed.

    Args:
        file_path: Path of the Word document to convert.
    """
    document = Document()
    try:
        document.LoadFromFile(file_path)
        new_p = file_path.replace('.docx', '')
        for i in range(document.GetPageCount()):
            # Render the page to an in-memory image stream.
            imageStream = document.SaveImageToStreams(i, ImageType.Bitmap)
            raw_png = f"{new_p}-{i}_output.png"
            with open(raw_png, 'wb') as imageFile:
                imageFile.write(imageStream.ToArray())
            remove_red_watermark(raw_png, f"{new_p}-{i}.png")
    finally:
        # Close the document even when a page fails to render or save.
        document.Close()


if __name__ == '__main__':
    # Pick the sample document for the current platform.
    if platform.system() == 'Windows':
        f_p = r'C:\Users\user\Desktop\test\11.docx'
    else:
        f_p = '/jjyy/11.docx'
    print(f'f_p==={f_p}')
    # Alternative SVG-based pipeline: gen_png(f_p)
    gen_png1(f_p)

若报以下错，参考下面文章：

sqlite:No module named _sqlite3

ImportError: lxml.html.clean module is now a separate project lxml_html_clean

DrissionPage.errors.WrongURLError 无效的url，也许要加上"http://"？