关于html导出word总结一

总结

测试结果不理想：html-docx-js 在前端环境下无法使用，html-to-docx 虽然可以导出，但最终效果不尽如人意，效果可以见末尾的附图。

环境

"electron": "24.3.0"

依赖库

测试结果

html-docx-js

这个库我在前端无法使用：库本身的代码依赖全局变量 `__dirname`，而它是 Node 环境提供的变量，浏览器环境中并不存在。

html-to-docx

这个可以用，但效果不好，最终效果见下方附图，这里附上使用代码，下面的代码文件可以直接使用。

javascript 复制代码

import { saveAs } from 'file-saver'
import HTMLtoDOCX from 'html-to-docx'

// Print-media CSS that zeroes the margin and padding of <html> and <body>.
// getHtmlContent embeds this text in a <style> element of the exported document.
const printStyles = `
            @media print {
            body, html {
                margin: 0;
                padding: 0;
            }
            }`

/**
 * Build a standalone HTML document (as a string) that contains only the
 * element matched by `ctrlId`, plus the stylesheets of the current page.
 *
 * The live page is never modified: its markup is re-parsed with DOMParser and
 * every change is made on that detached copy.
 *
 * @param {string} ctrlId - CSS selector of the element to export, e.g. `#printableArea`.
 * @returns {string} A complete HTML document, or `''` when nothing matches `ctrlId`.
 */
function getHtmlContent(ctrlId) {
    // Snapshot the whole page and parse it into a detached document
    const pageHTML = document.documentElement.outerHTML
    const doc = new DOMParser().parseFromString(pageHTML, 'text/html')

    const printableArea = doc.querySelector(ctrlId)
    if (!printableArea) return ''

    // Only the target's own markup is emitted below, so its siblings do not
    // need to be hidden; just drop everything flagged as .no-print inside it.
    printableArea.querySelectorAll('.no-print').forEach(el => el.remove())

    // These are already complete <style> / <link rel="stylesheet"> elements.
    // They must go into <head> as-is: wrapping them in another <style> would
    // nest <style> tags and turn the <link> tags into meaningless CSS text.
    const headStyles = Array.from(doc.querySelectorAll('style, link[rel="stylesheet"]'))
        .map(el => el.outerHTML)
        .join('\n')

    return `
    <!DOCTYPE html>
    <html lang="en">
        <head>
            <meta charset="UTF-8" />
            <title>Document</title>
            ${headStyles}
            <style>
                ${printStyles}
            </style>
        </head>
        <body>
            ${printableArea.outerHTML}
        </body>
    </html>
    `
}

/**
 * Convert the element matched by `ctrlId` into a .docx file and trigger a
 * download named `html-to-docx.docx`.
 *
 * Known limitations found during testing:
 * - for some pages (e.g. a page made entirely of tables nested in tables)
 *   the exported .docx cannot be opened;
 * - the layout of the Word document produced by HTMLtoDOCX is far from ideal.
 * The code is kept in case it becomes useful later.
 *
 * @param {string} ctrlId - CSS selector of the element to export, e.g. `#abc`.
 * @returns {Promise<void>} Resolves once the download has been triggered, or
 *   immediately when no element matches `ctrlId`.
 */
export const exportWord = async function(ctrlId) {
    const htmlString = getHtmlContent(ctrlId)
    if (!htmlString) {
        // getHtmlContent returns '' when the selector matches nothing;
        // do not convert and save a document in that case.
        console.warn(`exportWord: no element matches "${ctrlId}", nothing to export`)
        return
    }

    const fileBuffer = await HTMLtoDOCX(htmlString, null, {
        table: { row: { cantSplit: true }},
        footer: true,
        pageNumber: true
    })

    // html-to-docx documents its result as Buffer or Blob depending on the
    // runtime; wrapping it in a Blob gives saveAs a Blob in both cases.
    const docxBlob = new Blob([fileBuffer], {
        type: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    })

    saveAs(docxBlob, 'html-to-docx.docx')
}

附注

用python把html导出docx

格式错乱，没用

原生table标签导出为xlsx

原网页效果

导出的xlsx效果

效果还行

代码

javascript 复制代码

import FileSaver from 'file-saver'
import * as XLSX from 'xlsx'

/**
 * Return a detached copy of the element matched by `ctrlId`, with every
 * `.no-print` descendant removed.
 *
 * The page markup is re-parsed with DOMParser, so the live page is untouched.
 *
 * @param {string} ctrlId - CSS selector of the element to export.
 * @returns {Element|string} The matched element from the parsed copy, or `''`
 *   when nothing matches.
 */
function getHtmlContent(ctrlId) {
    // Parse a snapshot of the whole page into a separate document
    const snapshot = new DOMParser().parseFromString(
        document.documentElement.outerHTML,
        'text/html'
    )

    const target = snapshot.querySelector(ctrlId)
    if (!target) {
        return ''
    }

    // Mark every direct child of <body> other than the target as hidden;
    // nodes without a style object are skipped.
    for (const child of Array.from(snapshot.body.childNodes)) {
        if (child === target || !child.style) continue
        child.style.visibility = 'hidden'
    }

    // Strip everything flagged as non-printable inside the target
    for (const excluded of Array.from(target.querySelectorAll('.no-print'))) {
        excluded.remove()
    }

    return target
}

/**
 * Export the element matched by `ctrlId` as an .xlsx file and trigger a
 * download named `sheetjs.xlsx`.
 *
 * @param {string} ctrlId - CSS selector of the element to export; it is handed
 *   to `XLSX.utils.table_to_book`, so it is expected to match a table.
 * @returns {Promise<void>} Resolves once the download has been triggered, or
 *   immediately when no element matches `ctrlId`.
 */
export const exportExcel = async function(ctrlId) {
    const exportContent = getHtmlContent(ctrlId)
    if (!exportContent) {
        // getHtmlContent returns '' when the selector matches nothing;
        // table_to_book cannot build a workbook from that.
        console.warn(`exportExcel: no element matches "${ctrlId}", nothing to export`)
        return
    }

    // Build a workbook from the table element
    const wb = XLSX.utils.table_to_book(exportContent)

    // Serialize the workbook; type 'array' is requested so the result can be
    // wrapped in a Blob below
    const wbout = XLSX.write(wb, { bookType: 'xlsx', bookSST: true, type: 'array' })
    try {
        FileSaver.saveAs(new Blob([wbout], { type: 'application/octet-stream' }), 'sheetjs.xlsx')
    } catch (e) {
        // Saving is best-effort: report the failure instead of throwing
        console.error(e, wbout)
    }
}

通过pdf转word

先导出 pdf，再通过 pdf 转 word，版式确实还不错，但是 pdf 里的文字都被截取成图片放到 word 里了，无法编辑。

附图

原网页

下图是上面的代码中函数getHtmlContent返回的html保存到html文件后，打开的样子，后面的导出都以 getHtmlContent返回的html 为准

html-to-docx 库导出的docx

第二页就不放了~

导出为pdf

pdf的效果是完美的~

pdf转word

效果非常不错，但其中的每一段文字都是图片，无法编辑