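// C# source. (The original first two lines, "csharp" and "复制代码" ["Copy code"], were code-viewer paste residue, not code.)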
using System;
using System.IO;
using System.Linq;
using System.Text;
using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;
/// <summary>
/// Reads the main body text of a Word document through the Open XML SDK
/// (replacement for the Aspose-based implementation). Only the document body is
/// traversed; header and footer parts are never opened.
/// </summary>
/// <param name="path">Path of the document to open.</param>
/// <param name="password">Password placed on the <c>OpenSettings</c> used to open the document.</param>
/// <returns>
/// The cleaned body text. When the marker "限公司第" occurs at an index greater than zero,
/// the text is cut just before its last occurrence and trimmed. Returns <c>null</c> when the
/// main part, document, or body is missing, or when any exception is caught (it is logged).
/// </returns>
public static string GetWordContentByOpenXml(string path, string password)
{
    try
    {
        // NOTE(review): confirm that the referenced Open XML SDK version actually exposes
        // OpenSettings.Password and can open password-protected packages.
        using (var document = WordprocessingDocument.Open(path, false, new OpenSettings()
        {
            Password = password
        }))
        {
            var body = document.MainDocumentPart?.Document?.Body;
            if (body == null)
                return null;

            // Collect the raw body text (section properties are skipped by the extractor).
            var contentBuilder = new StringBuilder();
            ExtractBodyContent(body, contentBuilder);

            // Normalize whitespace and strip unwanted characters.
            string content = CleanContent(contentBuilder.ToString());

            // Ordinal search: the marker is a fixed literal, so the match must not depend
            // on the current culture's linguistic comparison rules.
            int index = content.LastIndexOf("限公司第", StringComparison.Ordinal);

            // An index of 0 is deliberately treated like "not found" (as before), so the
            // result is never cut down to an empty string.
            return index > 0
                ? content.Substring(0, index).Trim()
                : content;
        }
    }
    catch (Exception ex)
    {
        // Log the full exception (type, message, inner exceptions and stack trace);
        // the stack trace alone does not say what actually failed.
        LogManager.WriteError("GetWordContentByOpenXml()", ex.ToString());
        return null;
    }
}
/// <summary>
/// Appends the text of every direct child of the document body to the builder by
/// handing each child to <c>ExtractElementContent</c>.
/// </summary>
/// <param name="body">Document body whose children are visited in order.</param>
/// <param name="contentBuilder">Builder that receives the extracted text.</param>
private static void ExtractBodyContent(Body body, StringBuilder contentBuilder)
{
    // Header/footer text lives in other parts and is never reached from the body children.
    foreach (OpenXmlElement bodyChild in body.Elements())
        ExtractElementContent(bodyChild, contentBuilder);
}
/// <summary>
/// Recursively extracts text from one Open XML element: paragraphs and tables are
/// delegated to their dedicated extractors, section properties are ignored, and any
/// other element is treated as a container whose children are visited in turn.
/// </summary>
/// <param name="element">Element to process.</param>
/// <param name="contentBuilder">Builder that receives the extracted text.</param>
private static void ExtractElementContent(OpenXmlElement element, StringBuilder contentBuilder)
{
    // Section properties carry no body text (they hold, among other things,
    // header/footer references), so nothing is emitted for them.
    if (element is SectionProperties)
    {
        return;
    }

    if (element is Paragraph para)
    {
        ExtractParagraphContent(para, contentBuilder);
        contentBuilder.AppendLine(); // one line per paragraph
        return;
    }

    if (element is Table tbl)
    {
        ExtractTableContent(tbl, contentBuilder);
        return;
    }

    // Any other element: descend into its children.
    foreach (OpenXmlElement nested in element.Elements())
    {
        ExtractElementContent(nested, contentBuilder);
    }
}
/// <summary>
/// Appends the text of a paragraph's direct runs to the builder. Text, tab and break
/// elements are emitted in the order in which they occur inside each run.
/// </summary>
/// <param name="paragraph">Paragraph whose direct <c>Run</c> children are read.</param>
/// <param name="contentBuilder">Builder that receives the extracted text.</param>
private static void ExtractParagraphContent(Paragraph paragraph, StringBuilder contentBuilder)
{
    foreach (var run in paragraph.Elements<Run>())
    {
        // Walk the run's children once, in document order. Looping over all texts, then
        // all tabs, then all breaks would move every tab/break behind the run's text
        // (e.g. "a<tab/>b" would come out as "ab\t" instead of "a\tb").
        foreach (var child in run.Elements())
        {
            switch (child)
            {
                case Text text:
                    contentBuilder.Append(text.Text);
                    break;
                case TabChar _:
                    contentBuilder.Append("\t");
                    break;
                case Break _:
                    contentBuilder.AppendLine();
                    break;
            }
        }
    }
}
/// <summary>
/// Appends the text of a table to the builder: a tab is written after every cell and a
/// line break after every row. Tables nested directly inside a cell are extracted
/// recursively instead of being dropped.
/// </summary>
/// <param name="table">Table whose direct rows and cells are read.</param>
/// <param name="contentBuilder">Builder that receives the extracted text.</param>
private static void ExtractTableContent(Table table, StringBuilder contentBuilder)
{
    foreach (var row in table.Elements<TableRow>())
    {
        foreach (var cell in row.Elements<TableCell>())
        {
            // Visit the cell's children in document order so that paragraphs and
            // nested tables keep their relative position.
            foreach (var cellChild in cell.Elements())
            {
                if (cellChild is Paragraph paragraph)
                {
                    ExtractParagraphContent(paragraph, contentBuilder);
                }
                else if (cellChild is Table nestedTable)
                {
                    // Reading only paragraphs would silently lose this text.
                    ExtractTableContent(nestedTable, contentBuilder);
                }
            }

            contentBuilder.Append("\t"); // cell separator
        }

        contentBuilder.AppendLine(); // row terminator
    }
}
/// <summary>
/// Normalizes extracted text: strips non-whitespace control characters, removes the
/// Aspose evaluation watermark string, collapses every run of whitespace (spaces, tabs,
/// line breaks) into a single space, and trims both ends.
/// </summary>
/// <param name="content">Raw extracted text; may be null or empty.</param>
/// <returns>
/// The normalized text, or an empty string when <paramref name="content"/> is null or empty.
/// </returns>
private static string CleanContent(string content)
{
    if (string.IsNullOrEmpty(content))
        return string.Empty;

    // Strip control characters BEFORE collapsing whitespace. Doing it afterwards can
    // leave two adjacent spaces behind (e.g. "a \x01 b" became "a  b").
    // \x09-\x0D are excluded here because the whitespace pass below turns them into a space.
    content = System.Text.RegularExpressions.Regex.Replace(content, @"[\x00-\x08\x0E-\x1F\x7F]", "");

    // Aspose evaluation watermark; not expected in Open XML output, kept for compatibility.
    content = content.Replace("EvaluationOnly.CreatedwithAspose.Words.Copyright2003-2024AsposePtyLtd.", "");

    // Collapse all whitespace runs to one space. After this no line break remains, so a
    // separate "remove blank lines" pass could never match and has been dropped.
    content = System.Text.RegularExpressions.Regex.Replace(content, @"\s+", " ");

    return content.Trim();
}
/// <summary>
/// Probes whether a document appears to need a password by trying to open it
/// read-only without one.
/// </summary>
/// <param name="path">Path of the document to probe.</param>
/// <returns>
/// <c>false</c> when the file does not exist (or the path is null/empty) or when the document
/// opens normally. When an <c>OpenXmlPackageException</c> is thrown, <c>true</c> only if its
/// message mentions "password", "encrypted" or "protected" (case-insensitive). For any
/// other exception, <c>true</c>.
/// </returns>
public static bool IsPasswordRequired(string path)
{
    // A missing or unusable path says nothing about password protection. Without this
    // check such paths fell into the catch-all below and were reported as protected.
    // File.Exists returns false instead of throwing for null, empty or invalid paths.
    if (!File.Exists(path))
    {
        return false;
    }

    try
    {
        using (WordprocessingDocument.Open(path, false))
        {
            // Opened without a password, so none is required.
            return false;
        }
    }
    catch (OpenXmlPackageException ex)
    {
        // Heuristic on the exception text; compared case-insensitively so that
        // capitalized wording ("Password ...") is recognized as well.
        string message = ex.Message ?? string.Empty;
        return message.IndexOf("password", StringComparison.OrdinalIgnoreCase) >= 0
            || message.IndexOf("encrypted", StringComparison.OrdinalIgnoreCase) >= 0
            || message.IndexOf("protected", StringComparison.OrdinalIgnoreCase) >= 0;
    }
    catch
    {
        // Any other failure to open an existing file is still treated as
        // "password required", as before.
        return true;
    }
}
/// <summary>
/// Extended variant of the body-text reader with optional hyperlink text and footnotes.
/// </summary>
/// <param name="path">Path of the document to open.</param>
/// <param name="password">Password placed on the <c>OpenSettings</c> used to open the document.</param>
/// <param name="includeHyperlinks">Passed to the body extractor; when true, hyperlink text is included.</param>
/// <param name="includeFootnotes">When true and the document has a footnotes part, footnote text is appended after the body.</param>
/// <returns>
/// The cleaned text. When the marker "公司第" occurs at an index greater than zero, the
/// text is cut just before its last occurrence and trimmed. Returns <c>null</c> when the
/// main part, document, or body is missing, or when any exception is caught (it is logged).
/// </returns>
public static string GetWordContentByOpenXmlAdvanced(string path, string password, bool includeHyperlinks = false, bool includeFootnotes = false)
{
    try
    {
        // NOTE(review): confirm that the referenced Open XML SDK version actually exposes
        // OpenSettings.Password and can open password-protected packages.
        using (var document = WordprocessingDocument.Open(path, false, new OpenSettings()
        {
            Password = password
        }))
        {
            var mainPart = document.MainDocumentPart;
            var body = mainPart?.Document?.Body;
            if (body == null)
                return null;

            var contentBuilder = new StringBuilder();
            ExtractBodyContentAdvanced(body, contentBuilder, includeHyperlinks);

            // Footnotes are appended after the body text, only on request.
            if (includeFootnotes && mainPart.FootnotesPart != null)
            {
                ExtractFootnotesContent(mainPart.FootnotesPart, contentBuilder);
            }

            string content = CleanContent(contentBuilder.ToString());

            // NOTE(review): GetWordContentByOpenXml cuts at "限公司第" while this method
            // uses "公司第" — confirm that the difference is intentional.
            // Ordinal search: the marker is a fixed literal, so the match must not depend
            // on the current culture's linguistic comparison rules.
            int index = content.LastIndexOf("公司第", StringComparison.Ordinal);

            // An index of 0 is deliberately treated like "not found" (as before).
            return index > 0
                ? content.Substring(0, index).Trim()
                : content;
        }
    }
    catch (Exception ex)
    {
        // Log the full exception (type, message, inner exceptions and stack trace);
        // the stack trace alone does not say what actually failed.
        LogManager.WriteError("GetWordContentByOpenXmlAdvanced()", ex.ToString());
        return null;
    }
}
/// <summary>
/// Appends the text of the document body to the builder. Paragraphs and tables are
/// delegated to the advanced extractors, section properties are skipped, and any other
/// element is treated as a container whose children are visited recursively.
/// </summary>
/// <param name="body">Document body whose children are visited in order.</param>
/// <param name="contentBuilder">Builder that receives the extracted text.</param>
/// <param name="includeHyperlinks">Forwarded to the paragraph and table extractors.</param>
private static void ExtractBodyContentAdvanced(Body body, StringBuilder contentBuilder, bool includeHyperlinks)
{
    // Recursion works on the existing tree. Wrapping a child in "new Body(child)" is not
    // possible: the child already has a parent, and the SDK refuses to append an element
    // that is part of another tree (InvalidOperationException).
    void Visit(OpenXmlElement node)
    {
        if (node is Paragraph paragraph)
        {
            ExtractParagraphContentAdvanced(paragraph, contentBuilder, includeHyperlinks);
            contentBuilder.AppendLine();
        }
        else if (node is Table table)
        {
            ExtractTableContentAdvanced(table, contentBuilder, includeHyperlinks);
        }
        else if (!(node is SectionProperties))
        {
            // Other container element: descend into its children.
            foreach (var child in node.Elements())
            {
                Visit(child);
            }
        }
    }

    foreach (var topLevel in body.Elements())
    {
        Visit(topLevel);
    }
}
/// <summary>
/// Appends the text of a paragraph to the builder. Direct runs are always read; runs
/// inside a hyperlink are read only when <paramref name="includeHyperlinks"/> is true.
/// Within a run, text, tab and break elements are emitted in document order, matching
/// what <c>ExtractParagraphContent</c> emits for tabs ("\t") and breaks (a new line).
/// </summary>
/// <param name="paragraph">Paragraph whose direct children are read.</param>
/// <param name="contentBuilder">Builder that receives the extracted text.</param>
/// <param name="includeHyperlinks">Whether hyperlink text is included.</param>
private static void ExtractParagraphContentAdvanced(Paragraph paragraph, StringBuilder contentBuilder, bool includeHyperlinks)
{
    // Shared by direct runs and hyperlink runs. Tabs and breaks are emitted too;
    // ignoring them glues the words on either side together.
    void AppendRun(Run source)
    {
        foreach (var piece in source.Elements())
        {
            switch (piece)
            {
                case Text text:
                    contentBuilder.Append(text.Text);
                    break;
                case TabChar _:
                    contentBuilder.Append("\t");
                    break;
                case Break _:
                    contentBuilder.AppendLine();
                    break;
            }
        }
    }

    foreach (var element in paragraph.Elements())
    {
        if (element is Run directRun)
        {
            AppendRun(directRun);
        }
        else if (includeHyperlinks && element is Hyperlink hyperlink)
        {
            foreach (var linkRun in hyperlink.Elements<Run>())
            {
                AppendRun(linkRun);
            }
        }
    }
}
/// <summary>
/// Appends the text of a table to the builder using the advanced paragraph extractor:
/// a tab is written after every cell and a line break after every row. Tables nested
/// directly inside a cell are extracted recursively instead of being dropped.
/// </summary>
/// <param name="table">Table whose direct rows and cells are read.</param>
/// <param name="contentBuilder">Builder that receives the extracted text.</param>
/// <param name="includeHyperlinks">Forwarded to the paragraph extractor.</param>
private static void ExtractTableContentAdvanced(Table table, StringBuilder contentBuilder, bool includeHyperlinks)
{
    foreach (var row in table.Elements<TableRow>())
    {
        foreach (var cell in row.Elements<TableCell>())
        {
            // Visit the cell's children in document order so that paragraphs and
            // nested tables keep their relative position.
            foreach (var cellChild in cell.Elements())
            {
                if (cellChild is Paragraph paragraph)
                {
                    ExtractParagraphContentAdvanced(paragraph, contentBuilder, includeHyperlinks);
                }
                else if (cellChild is Table nestedTable)
                {
                    // Reading only paragraphs would silently lose this text.
                    ExtractTableContentAdvanced(nestedTable, contentBuilder, includeHyperlinks);
                }
            }

            contentBuilder.Append("\t"); // cell separator
        }

        contentBuilder.AppendLine(); // row terminator
    }
}
/// <summary>
/// Appends a "--- 脚注 ---" heading followed by the text of every footnote paragraph,
/// one line per paragraph. Nothing is written when the part has no footnotes root.
/// </summary>
/// <param name="footnotesPart">Footnotes part of the document.</param>
/// <param name="contentBuilder">Builder that receives the extracted text.</param>
private static void ExtractFootnotesContent(FootnotesPart footnotesPart, StringBuilder contentBuilder)
{
    var footnotes = footnotesPart.Footnotes;
    if (footnotes == null)
    {
        return;
    }

    contentBuilder.AppendLine("\n--- 脚注 ---");

    // Flatten footnote -> paragraph into one ordered sequence.
    var footnoteParagraphs = footnotes
        .Elements<Footnote>()
        .SelectMany(note => note.Elements<Paragraph>());

    foreach (Paragraph notePara in footnoteParagraphs)
    {
        ExtractParagraphContent(notePara, contentBuilder);
        contentBuilder.AppendLine();
    }
}