精益求精,支持处理嵌套表格的Word表格转HTML表格

上一篇博文(用VBA根据Word中的表格生成具有完全相同表结构的Markdown表或HTML表代码)完成后,拿着代码去第一个实验的Mistral嘚瑟一下,顺便臭一下它的水平不够,然后它臭了我一下:你这个没有处理嵌套表格,建议增加处理嵌套表格的功能。我让它给我生成处理嵌套表格的代码,结果折腾了一天它也没能成功。我一开始也小看了难度,自己鼓捣了一番也没成功。然后把题目出给其它AI,经过多次循循善诱,最接近成功的是最近长思考上线的Kimi------它给出了一个创建临时文档并在其中创建tempRng以复制单元格内容的方案,在我用下面的截图中最后一个魔鬼难度测试案例测试前执行结果基本符合预期(表结构正确,部分单元格文本内容有错误),但是败在魔鬼难度案例之下。我认为创建临时文档并在其中创建tempRng以复制单元格内容不但执行效率低,在用下面的魔法取得rowspan和colspan时也容易出错(有可能用到错误的临时文档中的Selection,也因此导致Kimi给出的代码读取单元格文本内容出错):

vbnet 复制代码
' 计算合并单元格(仅当前表格内)
With tmpCell
    .Select
    Selection.SelectRow: rSpan = Selection.Rows.Count
    .Select
    Selection.SelectColumn: cSpan = Selection.Columns.Count
End With

想了一下,临时Range不过就是用来粘贴单元格内容,完全可以在源文档末尾创建,无需创建新文档,这样的话当前单元格对应的临时Range的结束位置总是源文档末尾,只是由于要使用递归(处理嵌套表格天然适宜用递归),当前单元格对应的临时Range的开始位置可能因为进入下一层递归而发生变化------递归不是天然与堆栈有关联吗?何不在每次递归时将当前单元格对应的临时Range的开始位置压入堆栈,本单元格递归处理完成后将其开始位置取出,然后将该位置直至文末区域删除不就行了吗?将这个思路喂给Kimi,Kimi修改了它上一次给出的代码,在这份代码的基础上我只做了很小的修改就得到了以下结果:

vbnet 复制代码
Sub TableToHtmlWithNestedTable()
    Dim html As String, oTable As Table
    Dim numRows As Long, numCols As Long, rowIndex As Long, colIndex As Long
    Dim oCell As Word.cell, doc As Document
    
    Application.ScreenUpdating = False
    
    If Not Selection.Information(wdWithInTable) Then
        MsgBox "请将光标放在表格内", vbExclamation
        Exit Sub
    End If
    
    Set doc = ActiveDocument
    Set oTable = Selection.Tables(1)
    numRows = oTable.Rows.Count
    numCols = oTable.Columns.Count
    
    html = "<table>" & vbCrLf
    For rowIndex = 1 To numRows
        html = html & "  <tr>" & vbCrLf
        For colIndex = 1 To numCols
            On Error Resume Next
            Set oCell = oTable.cell(rowIndex, colIndex)
            If Err.Number <> 0 Then
                On Error GoTo 0
                GoTo nextCol
            End If
            On Error GoTo 0
            
            Dim rSpan As Long, cSpan As Long
            With oCell
                .Select: Selection.SelectColumn: cSpan = Selection.Columns.Count
                .Select: Selection.SelectRow: rSpan = Selection.Rows.Count
            End With
            
            Dim cellContent As String, currPos As Long
            currPos = 0
            cellContent = GetCellContent(doc, oCell, currPos)
            
            Dim tag As String: tag = IIf(rowIndex = 1, "th", "td")
            html = html & "    <" & tag
            If rSpan > 1 Then html = html & " rowspan=""" & rSpan & """"
            If cSpan > 1 Then html = html & " colspan=""" & cSpan & """"
            html = html & ">" & cellContent & "</" & tag & ">" & vbCrLf
nextCol:
        Next colIndex
        html = html & "  </tr>" & vbCrLf
    Next rowIndex
    html = html & "</table>" & vbCrLf
    
    
    Selection.start = oTable.Range.End
    Dim bInsert As VbMsgBoxResult
    bInsert = MsgBox("是否在表格下方输出HTML代码?", vbQuestion + vbYesNo, "确认")
    If bInsert = vbYes Then
        Selection.InsertAfter html & vbCrLf
        Selection.Range.HighlightColorIndex = wdYellow
    End If
    
    With CreateObject("New:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}")
        .SetText html: .PutInClipboard
    End With
    
    If depthWarning Then
        MsgBox "已生成HTML代码(含警告:嵌套表格超过5层)", vbExclamation
    Else
        MsgBox "已生成HTML代码并复制到剪贴板", vbOKOnly
    End If
    Application.ScreenUpdating = True
End Sub

Function GetCellContent(doc As Document, oCell As cell, ByRef currPos As Long) As String
    Dim html As String, tempStart As Long ' 临时范围起始位置(存入堆栈)
    Dim stack() As Long, stackPtr As Long ' 堆栈及指针
    Dim cellRng As Range, tempRng As Range ' 单元格范围及临时范围
    Dim tblCount As Integer, i As Integer
    Dim tmpTbl As Table, tmpCell As cell
    Dim numRows As Integer, numCols As Integer
    Dim rowIndex As Integer, colIndex As Integer
    Dim rSpan As Integer, cSpan As Integer, cellContent As String, tag As String
    
    ' 初始化堆栈(存储临时范围起始位置)
    stackPtr = 0
    ReDim stack(0)
    
    ' 1. 复制当前单元格内容(排除末尾回车符)
    Set cellRng = doc.Range(oCell.Range.start, oCell.Range.End - 1)
    
    ' 2. 在源文档末尾创建临时范围并压栈
    Set tempRng = doc.content
    tempRng.Collapse wdCollapseEnd
    tempStart = tempRng.start
    stackPtr = stackPtr + 1
    ReDim Preserve stack(stackPtr)
    stack(stackPtr) = tempStart ' 压栈
    
    ' 3. 粘贴内容到临时范围
    If cellRng.End > cellRng.start Then
        cellRng.Copy
        tempRng.Paste
    End If
    Set tempRng = doc.Range(tempStart, doc.content.End) ' 更新临时范围
    
    ' 4. 处理临时范围内的内容
    tblCount = tempRng.Tables.Count
    If tblCount > 0 Then
        For i = 1 To tblCount
            Set tmpTbl = tempRng.Tables(i)
            
            ' 处理表格前的文本
            If currPos < tmpTbl.Range.start Then
                html = html & ProcessText(doc.Range(tempStart + currPos, tmpTbl.Range.start))
                currPos = tmpTbl.Range.start - tempStart ' 相对临时范围的位置
            End If
            
            ' 生成表格标签
            html = html & "<table>" & vbCrLf
            numRows = tmpTbl.Rows.Count
            numCols = tmpTbl.Columns.Count
            
            ' 遍历行列
            For rowIndex = 1 To numRows
                html = html & "  <tr>" & vbCrLf
                For colIndex = 1 To numCols
                    ' 跳过合并单元格
                    On Error Resume Next
                    Set tmpCell = tmpTbl.cell(rowIndex, colIndex)
                    If Err.Number <> 0 Then
                        On Error GoTo 0
                        GoTo nextCol
                    End If
                    On Error GoTo 0
                    
                    ' 计算合并单元格(仅当前表格内)
                    With tmpCell
                        .Select
                        Selection.SelectRow: rSpan = Selection.Rows.Count
                        .Select
                        Selection.SelectColumn: cSpan = Selection.Columns.Count
                    End With
                    
                    ' 递归处理单元格内容
                    currPos = 0
                    cellContent = GetCellContent(doc, tmpTbl.cell(rowIndex, colIndex), currPos)
                    
                    ' 生成单元格标签
                    tag = IIf(rowIndex = 1, "th", "td")
                    html = html & "    <" & tag
                    If rSpan > 1 Then html = html & " rowspan=""" & rSpan & """"
                    If cSpan > 1 Then html = html & " colspan=""" & cSpan & """"
                    html = html & ">" & cellContent & "</" & tag & ">" & vbCrLf
nextCol:
                Next colIndex
                html = html & "  </tr>" & vbCrLf
            Next rowIndex
            html = html & "</table>" & vbCrLf
            currPos = tmpTbl.Range.End - tempStart ' 更新相对位置
        Next i
        
        ' 处理表格后的文本
        If currPos < tempRng.End - tempStart Then
            html = html & ProcessText(doc.Range(tempStart + currPos, tempRng.End))
            currPos = tempRng.End - tempStart
        End If
    Else
        ' 无表格时直接取文本
        html = html & ProcessText(tempRng)
        currPos = tempRng.End - tempStart
    End If
    
    ' 5. 清理临时内容(从堆栈弹出起始位置并删除)
    tempStart = stack(stackPtr)
    stackPtr = stackPtr - 1
    ReDim Preserve stack(stackPtr)
    doc.Range(tempStart, doc.content.End).Delete
    
    ' 6. 返回结果
    GetCellContent = html
End Function


Function ProcessText(rng As Range) As String
    Dim text As String: text = rng.text
    text = Replace(text, "&", "&amp;")
    text = Replace(text, "<", "&lt;")
    text = Replace(text, Chr(13), "<p/>")
    text = Replace(text, Chr(11), "<br>")
    ProcessText = Trim(text)
End Function

技术要点:

1、处理单元格内容时不能直接处理cell.range(否则连range.tables.count都不能准确取得,这是所有AI在我提示之前都会犯的错误),必须要将单元格中的内容复制到tempRng。

2、在源文档中直接建立tempRng,并将其开始位置保存在堆栈(或者类似的数据结构,我这里其实是用的数组模拟的堆栈)中。

用于测试的表格有以下几个:

魔鬼难度的(当然实际文档中谁设计出这样的表格该被打屁股):

生成的HTML代码粘贴到HTML文件中都能够完美重现Word文档中的表格结构。

相关推荐
沐知全栈开发2 小时前
PHP MySQL WHERE 子句详解
开发语言
糖纸风筝2 小时前
Java指南:eclipse、java-activemq与测试验证
java·开发语言·学习
小坏讲微服务2 小时前
整合Spring Cloud Alibaba与Gateway实现跨域的解决方案
java·开发语言·后端·spring cloud·云原生·gateway
码上成长3 小时前
<script setup> 实战模式:大型组件怎么拆?
开发语言·javascript·vue.js
九河_3 小时前
解决pip install gym==0.19.0安装失败问题
开发语言·python·pip·gym
红豆诗人3 小时前
C语言进阶知识--文件操作
c语言·开发语言·文件操作
唐青枫3 小时前
C#.NET 全局异常到底怎么做?最完整的实战指南
c#·.net
麦麦鸡腿堡4 小时前
Java绘图技术
java·开发语言
热爱编程的OP4 小时前
Linux进程池与管道通信详解:从原理到实现
linux·开发语言·c++