一次PDF文件的处理(二)

本文主要对《一次PDF文件的处理(一)》中用到的代码进行详细说明:

显示PDF

这个比较简单,使用 DevExpress.XtraPdfViewer.PdfViewer 控件(在窗体中声明为 private 字段 pdfViewer1)加载文档即可:

// Load the PDF file at the given path into the viewer control.
pdfViewer1.LoadDocument(@"美文晨读.pdf");

读取PDF的内容

使用PdfDataRecognizer

// Record the page bounds, taken from the crop box.
page.Rect = new RectangleF(
    (float)CropBox.Left,
    (float)CropBox.Top,
    (float)CropBox.Width,
    (float)CropBox.Height);

// Run recognition on the page, then store every recognized character as its own PDFWord.
PdfPageData pdata = PdfDataRecognizer.Recognize(pdfpage, false, false);

foreach (PdfWord recognizedWord in pdata.Words)
{
    foreach (PdfCharacter ch in recognizedWord.Characters)
    {
        // Characters whose Unicode data is the empty string are not stored.
        if (ch.UnicodeData != "")
        {
            PDFWord entry = new PDFWord();
            entry.Text = ch.UnicodeData;

            // Copy the character's bounding box into the entry.
            entry.TextRect.Rect = new RectangleF(
                (float)ch.Rectangle.Left,
                (float)ch.Rectangle.Top,
                (float)ch.Rectangle.Width,
                (float)ch.Rectangle.Height);

            page.Words.Add(entry);
        }
    }
}

预览PDF时显示调试信息

使用PdfViewer的Paint事件

/// <summary>
/// Draws debugging overlays for one page on top of the PDF viewer: a blue outline of the
/// page rectangle and, depending on the <c>_show_pt</c>, <c>_show_txt</c> and
/// <c>_show_line</c> flags, a box around each word, the word's text, and a line from each
/// word to every entry in its <c>PinYinWord</c> list.
/// </summary>
/// <param name="e">Paint event arguments; all drawing goes to <c>e.Graphics</c>.</param>
/// <param name="PageNumber">
/// Number of the page to annotate. Nothing is drawn when <c>_document.GetPage</c> returns
/// <c>null</c> for it.
/// </param>
public void paint_debug_info(PaintEventArgs e, int PageNumber)
{
    PDFPage page = _document.GetPage(PageNumber);
    if (page == null)
    {
        return;
    }

    // Page outline: both corners are converted from document positions to client points.
    PointF pt1 = _viewer.GetClientPoint(new PdfDocumentPosition(PageNumber, new PdfPoint(page.Rect.Left, page.Rect.Top)));
    PointF pt2 = _viewer.GetClientPoint(new PdfDocumentPosition(PageNumber, new PdfPoint(page.Rect.Right, page.Rect.Top - page.Rect.Height)));
    e.Graphics.DrawRectangle(Pens.Blue, pt1.X, pt1.Y, pt2.X - pt1.X, pt2.Y - pt1.Y);

    // ExtractPdfPageHTML used to be invoked once per word; its arguments do not depend on
    // the word, so it is now invoked at most once per paint (on the first non-empty word).
    bool extractDone = false;

    foreach (PDFWord w in page.Words)
    {
        if (w.Text == "")
        {
            continue;
        }

        PdfDocumentPosition pp1 = new PdfDocumentPosition(PageNumber, new PdfPoint(w.TextRect.Rect.Left, w.TextRect.Rect.Top));
        PdfDocumentPosition pp2 = new PdfDocumentPosition(PageNumber, new PdfPoint(w.TextRect.Rect.Right, w.TextRect.Rect.Top - w.TextRect.Rect.Height));
        pt1 = _viewer.GetClientPoint(pp1);
        pt2 = _viewer.GetClientPoint(pp2);

        if (_show_line && !extractDone)
        {
            // NOTE(review): presumably this refreshes the PinYinWord links drawn below - confirm.
            // NOTE(review): this passes curr_pagenum, not the PageNumber parameter - confirm
            // that is intended when the page being painted is not the current page.
            ExtractPdfPageHTML(_viewer, curr_pagenum);
            extractDone = true;
        }

        // Only draw when at least one corner has a positive client Y coordinate.
        if ((pt1.Y > 0) || (pt2.Y > 0))
        {
            if (_show_pt)
            {
                Rectangle r = new Rectangle((int)pt1.X, (int)pt1.Y, (int)pt2.X - (int)pt1.X, (int)pt2.Y - (int)pt1.Y);
                e.Graphics.DrawRectangle(Pens.Blue, r);
            }

            if (_show_txt)
            {
                e.Graphics.DrawString(w.Text, drawFont, drawBrush, pt1);
            }

            if (_show_line)
            {
                // Centre of this word's box; it is the same for every linked entry.
                float centerX = (pt1.X + pt2.X) / 2;
                float centerY = (pt1.Y + pt2.Y) / 2;

                foreach (PDFWord w2 in w.PinYinWord)
                {
                    PdfDocumentPosition pp1_2 = new PdfDocumentPosition(PageNumber, new PdfPoint(w2.TextRect.Rect.Left, w2.TextRect.Rect.Top));
                    PdfDocumentPosition pp2_2 = new PdfDocumentPosition(PageNumber, new PdfPoint(w2.TextRect.Rect.Right, w2.TextRect.Rect.Top - w2.TextRect.Rect.Height));
                    PointF pt1_2 = _viewer.GetClientPoint(pp1_2);
                    PointF pt2_2 = _viewer.GetClientPoint(pp2_2);

                    // Join the centre of this word to the centre of the linked entry.
                    e.Graphics.DrawLine(Pens.Blue, centerX, centerY, (pt1_2.X + pt2_2.X) / 2, (pt1_2.Y + pt2_2.Y) / 2);
                }
            }
        }
    }
}

计算汉字和拼音的位置

// Walk the lines in order. A line whose item_w_AVG is below lm.line_item_w_AVG has each of
// its words attached, as pinyin, to the nearest word of the following line. Every other
// line is written to sb as HTML, one char-group span per word.
for (int i = 0; i < lm.Lines.Count; i++)
{
    Line line = lm.Lines[i];

    if (line.item_w_AVG < lm.line_item_w_AVG)
    {
        // The last line has no following line to attach to.
        if (i >= lm.Lines.Count - 1)
        {
            continue;
        }

        Line line2 = lm.Lines[i + 1];

        // Pair the two lines only when this vertical-proximity test on their rectangles passes.
        if ((line2.Rect.Top + line2.Rect.Height) > (line.Rect.Top - line2.Rect.Height))
        {
            // Reset any pinyin previously attached to the following line's words.
            foreach (PDFWord w2 in line2.Words)
            {
                w2.PinYin = "";
                w2.PinYinWord.Clear();
            }

            foreach (PDFWord w in line.Words)
            {
                float x = w.TextRect.Rect.X + w.TextRect.Rect.Width / 2;
                float y = w.TextRect.Rect.Y - w.TextRect.Rect.Height / 2;

                // Find the word in line2 with the smallest squared distance between the
                // two centre points; on ties the first such word wins.
                PDFWord w_txt = null;
                float d_min = 0;

                foreach (PDFWord w2 in line2.Words)
                {
                    float x2 = w2.TextRect.Rect.X + w2.TextRect.Rect.Width / 2;
                    float y2 = w2.TextRect.Rect.Y - w2.TextRect.Rect.Height / 2;
                    float d = (x2 - x) * (x2 - x) + (y2 - y) * (y2 - y);

                    if (w_txt == null || d < d_min)
                    {
                        d_min = d;
                        w_txt = w2;
                    }
                }

                // w_txt stays null only when line2 has no words.
                if (w_txt != null)
                {
                    w_txt.PinYin = w_txt.PinYin + w.Text;
                    w_txt.PinYinWord.Add(w);
                }
            }
        }
    }
    else
    {
        // A line starting further right than lm.line_x_MIN + lm.line_item_w_AVG gets a
        // line break followed by two non-breaking spaces in front of it.
        if (line.Rect.Left > lm.line_x_MIN + lm.line_item_w_AVG)
        {
            sb.AppendLine(@"<br /> &nbsp; &nbsp;");
        }

        foreach (PDFWord w2 in line.Words)
        {
            // The text comes from the PDF, so it is HTML-encoded before being embedded;
            // otherwise characters such as '<' or '&' would corrupt the markup.
            string pinyin = (w2.PinYin == "") ? "&nbsp;" : System.Net.WebUtility.HtmlEncode(w2.PinYin);
            string hanzi = System.Net.WebUtility.HtmlEncode(w2.Text);

            sb.AppendLine(@" <span class=""char-group""><span class=""pinyin"">" + pinyin + @"</span><span class=""hanzi"">" + hanzi + @"</span></span>");
        }
    }
}

相关推荐
开开心心就好2 小时前
免费无广告的礼金记账本,安卓应用
java·前端·ubuntu·edge·pdf·负载均衡·语音识别
优化控制仿真模型2 小时前
2015-2025年12月英语六级历年真题及答案PDF电子版(含听力音频)
经验分享·pdf
爱编程的小吴3 小时前
LangChain基础入门:DocumentLoader加载PDF/Markdown文档实战
python·langchain·pdf
摆烂的少年3 小时前
Asp .net web应用程序使用VS2022调试时打开文件选择器服务自动关闭问题
c#·.net
William_cl3 小时前
C# ASP.NET Identity 授权实战:[Authorize (Roles=“Admin“)] 仅管理员访问(避坑 + 图解)
开发语言·c#·asp.net
软件工程小施同学3 小时前
区块链论文速读 CCF A--CCS 2025 (1) 附pdf下载
pdf·区块链
.NET修仙日记3 小时前
构建社区照护桥梁:.NET Core3.1+MVC社区呼叫系统设计与实现
c#·毕业设计·.net·.net core·社区照护平台
红黑色的圣西罗3 小时前
Lua和C#交互探究记录
c#·lua·交互
八苦18 小时前
如何用c# 做 mcp/ChatGPT app
c#·mcp