背景:
有一份 Word 文档,通过 aspose.words 转换为图片后会带有水印。页面上方和下方的水印可以通过裁剪去掉,但正文中间的 logo 水印无法去除,因此需要先转换为 HTML,再从 HTML 中去除水印。
版本说明:aspose-words==24.2.0
输入:一个word的文件路径
输出:一个同名的html文件
import re
import tempfile
from pathlib import Path

import aspose.words as aw
import aspose.words.saving as saving
from bs4 import BeautifulSoup
def word_html(file_name):
    """Convert a Word document to a cleaned, self-contained HTML file.

    The document is exported to HTML with Aspose.Words (images embedded as
    base64), then the Aspose-specific markup is stripped with BeautifulSoup:
    every element whose inline ``style`` contains ``-aw-headerfooter-type:``
    and the first ``<p>`` whose text contains ``Evaluation Only``.

    Args:
        file_name: Path of the Word document to convert.

    Side effects:
        Writes the cleaned HTML next to the input, using the same name with
        the extension replaced by ``.html`` (``1.docx`` -> ``1.html``).
    """
    docx = aw.Document(file_name)
    save_options = saving.HtmlSaveOptions(aw.SaveFormat.HTML)
    # Inline the images so the resulting HTML is a single file.
    save_options.export_images_as_base64 = True

    # Export into a private temporary directory instead of a fixed
    # "tmp.html" in the working directory: nothing is left behind and
    # concurrent conversions cannot overwrite each other's intermediate file.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = str(Path(tmp_dir) / "tmp.html")
        docx.save(tmp_path, save_options)
        with open(tmp_path, "r", encoding="utf-8") as html_file:
            html_content = html_file.read()

    soup = BeautifulSoup(html_content, features="lxml")

    # Remove the Aspose-specific content: elements tagged with the
    # header/footer style marker ...
    for tag in soup.find_all(style=re.compile("-aw-headerfooter-type:")):
        tag.extract()

    # ... and the "Evaluation Only" paragraph. It may be absent (presumably
    # when a license is applied), in which case there is nothing to remove.
    word_key_tag = soup.find("p", string=re.compile("Evaluation Only"))
    if word_key_tag is not None:
        word_key_tag.extract()

    # Replace only the final extension so directories or file names that
    # contain dots (e.g. "./docs/v1.2/report.docx") map to the right output.
    output_path = Path(file_name).with_suffix(".html")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(soup.prettify())
if __name__ == "__main__":
    # Example run: converts 1.docx from the working directory and
    # produces 1.html alongside it.
    word_html(file_name="1.docx")