一次PDF文件的处理(二)

主要是对《一次PDF文件的处理(一)》进行详细的代码说明:

显示PDF

这个比较简单。使用 DevExpress 的 PdfViewer 控件(字段声明形如 private DevExpress.XtraPdfViewer.PdfViewer pdfViewer1;),调用其 LoadDocument 方法即可:

// Load the PDF into the viewer control. The file name carries no directory
// part, so it is given to LoadDocument as a relative path.
pdfViewer1.LoadDocument(@"美文晨读.pdf");

读取PDF的内容

使用PdfDataRecognizer

// Store the page bounds taken from the crop box.
page.Rect = new RectangleF(
    (float)CropBox.Left,
    (float)CropBox.Top,
    (float)CropBox.Width,
    (float)CropBox.Height);

// Run text recognition on the page. Both boolean options are passed as false;
// their meaning is defined by PdfDataRecognizer and is not visible here.
PdfPageData recognized = PdfDataRecognizer.Recognize(pdfpage, false, false);

// Flatten the result: every character with non-empty Unicode data becomes its
// own PDFWord entry carrying the character text and its rectangle.
foreach (PdfWord recognizedWord in recognized.Words)
{
    foreach (PdfCharacter ch in recognizedWord.Characters)
    {
        if (ch.UnicodeData != "")
        {
            PDFWord entry = new PDFWord();
            entry.Text = ch.UnicodeData;

            float left = (float)ch.Rectangle.Left;
            float top = (float)(ch.Rectangle.Top);
            float width = (float)ch.Rectangle.Width;
            float height = (float)ch.Rectangle.Height;
            entry.TextRect.Rect = new RectangleF(left, top, width, height);

            page.Words.Add(entry);
        }
    }
}

预览PDF时显示调试信息

使用PdfViewer的Paint事件

/// <summary>
/// Draws debugging overlays for one page on the viewer surface: the page
/// rectangle and, depending on the _show_pt / _show_txt / _show_line flags,
/// each word's bounding box, its text, and a line from the word to every
/// entry in its PinYinWord list.
/// </summary>
/// <param name="e">Paint event arguments; all drawing goes to e.Graphics.</param>
/// <param name="PageNumber">Page number used to look up the page in _document
/// and to build the document positions that are converted to client points.</param>
public void paint_debug_info(PaintEventArgs e, int PageNumber)
{
    PDFPage page = _document.GetPage(PageNumber);
    if (page == null)
    {
        return;
    }

    // Page outline. The opposite corner is (Right, Top - Height): the stored
    // rectangle is handled as if the document Y axis grows upwards
    // (presumably the PDF coordinate convention - confirm).
    PointF pt1 = _viewer.GetClientPoint(new PdfDocumentPosition(PageNumber, new PdfPoint(page.Rect.Left, page.Rect.Top)));
    PointF pt2 = _viewer.GetClientPoint(new PdfDocumentPosition(PageNumber, new PdfPoint(page.Rect.Right, page.Rect.Top - page.Rect.Height)));
    e.Graphics.DrawRectangle(Pens.Blue, pt1.X, pt1.Y, pt2.X - pt1.X, pt2.Y - pt1.Y);

    // ExtractPdfPageHTML does not depend on the word being drawn, so it is
    // invoked at most once per paint instead of once per word.
    bool extractionDone = false;

    foreach (PDFWord w in page.Words)
    {
        if (w.Text == "")
        {
            continue;
        }

        PdfDocumentPosition pp1 = new PdfDocumentPosition(PageNumber, new PdfPoint(w.TextRect.Rect.Left, w.TextRect.Rect.Top));
        PdfDocumentPosition pp2 = new PdfDocumentPosition(PageNumber, new PdfPoint(w.TextRect.Rect.Right, w.TextRect.Rect.Top - w.TextRect.Rect.Height));
        pt1 = _viewer.GetClientPoint(pp1);
        pt2 = _viewer.GetClientPoint(pp2);

        if (_show_line && !extractionDone)
        {
            // Presumably refreshes the PinYinWord associations drawn below - confirm.
            // NOTE(review): this passes curr_pagenum rather than the PageNumber
            // parameter; verify that this is intended.
            ExtractPdfPageHTML(_viewer, curr_pagenum);
            extractionDone = true;
        }

        // Skip words whose two corners both have a non-positive client Y.
        if ((pt1.Y <= 0) && (pt2.Y <= 0))
        {
            continue;
        }

        if (_show_pt)
        {
            Rectangle r = new Rectangle((int)pt1.X, (int)pt1.Y, (int)pt2.X - (int)pt1.X, (int)pt2.Y - (int)pt1.Y);
            e.Graphics.DrawRectangle(Pens.Blue, r);
        }

        if (_show_txt)
        {
            e.Graphics.DrawString(w.Text, drawFont, drawBrush, pt1);
        }

        if (_show_line)
        {
            // Connect the centre of this word's box to the centre of each
            // associated pinyin word's box.
            foreach (PDFWord w2 in w.PinYinWord)
            {
                PdfDocumentPosition pp1_2 = new PdfDocumentPosition(PageNumber, new PdfPoint(w2.TextRect.Rect.Left, w2.TextRect.Rect.Top));
                PdfDocumentPosition pp2_2 = new PdfDocumentPosition(PageNumber, new PdfPoint(w2.TextRect.Rect.Right, w2.TextRect.Rect.Top - w2.TextRect.Rect.Height));
                PointF pt1_2 = _viewer.GetClientPoint(pp1_2);
                PointF pt2_2 = _viewer.GetClientPoint(pp2_2);

                e.Graphics.DrawLine(Pens.Blue, (pt1.X + pt2.X) / 2, (pt1.Y + pt2.Y) / 2, (pt1_2.X + pt2_2.X) / 2, (pt1_2.Y + pt2_2.Y) / 2);
            }
        }
    }
}

计算汉字和拼音的位置

// Walk all lines. A line whose average item width is below the overall
// average is handled as an annotation (pinyin) line: each of its words is
// attached to the nearest word of the following line. Every other line is
// written to sb as HTML, one "char-group" span per word.
for (int i = 0; i < lm.Lines.Count; i++)
{
    Line line = lm.Lines[i];

    if (line.item_w_AVG < lm.line_item_w_AVG)
    {
        // The last line has no following line to attach to.
        if (i < lm.Lines.Count - 1)
        {
            Line line2 = lm.Lines[i + 1];

            // Only pair the two lines when this vertical-distance test on
            // their rectangles passes.
            if ((line2.Rect.Top + line2.Rect.Height) > (line.Rect.Top - line2.Rect.Height))
            {
                // Reset earlier associations so repeated runs do not accumulate.
                foreach (PDFWord w2 in line2.Words)
                {
                    w2.PinYin = "";
                    w2.PinYinWord.Clear();
                }

                foreach (PDFWord w in line.Words)
                {
                    // Centre of the current word; Y is Rect.Y minus half the height.
                    float x = w.TextRect.Rect.X + w.TextRect.Rect.Width / 2;
                    float y = w.TextRect.Rect.Y - w.TextRect.Rect.Height / 2;

                    // Nearest word in the next line by squared centre distance;
                    // on equal distances the earlier word wins.
                    float d_min = 0;
                    PDFWord w_txt = null;
                    foreach (PDFWord w2 in line2.Words)
                    {
                        float x2 = w2.TextRect.Rect.X + w2.TextRect.Rect.Width / 2;
                        float y2 = w2.TextRect.Rect.Y - w2.TextRect.Rect.Height / 2;
                        float d = (x2 - x) * (x2 - x) + (y2 - y) * (y2 - y);

                        if (w_txt == null || d < d_min)
                        {
                            d_min = d;
                            w_txt = w2;
                        }
                    }

                    // w_txt stays null only when the next line has no words.
                    if (w_txt != null)
                    {
                        w_txt.PinYin = w_txt.PinYin + w.Text;
                        w_txt.PinYinWord.Add(w);
                    }
                }
            }
        }
    }
    else
    {
        // Lines starting further right than the minimum X plus one average
        // item width get a line break and leading non-breaking spaces.
        if (line.Rect.Left > lm.line_x_MIN + lm.line_item_w_AVG)
        {
            sb.AppendLine(@"<br /> &nbsp; &nbsp;");
        }

        foreach (PDFWord w2 in line.Words)
        {
            // The text comes from the PDF, so it is HTML-encoded before being
            // placed into markup; a raw '&' or '<' would otherwise corrupt the
            // output. Depending on the runtime, HtmlEncode may also write some
            // non-ASCII characters as numeric character references, which
            // browsers render identically.
            string hanzi = System.Net.WebUtility.HtmlEncode(w2.Text);
            string pinyin = w2.PinYin == ""
                ? "&nbsp;"
                : System.Net.WebUtility.HtmlEncode(w2.PinYin);

            sb.AppendLine(@" <span class=""char-group""><span class=""pinyin"">" + pinyin + @"</span><span class=""hanzi"">" + hanzi + @"</span></span>");
        }
    }
}

相关推荐
新手小新3 小时前
C#学习笔记1-在VS CODE部署C#开发环境
笔记·学习·c#
小贺儿开发5 小时前
Unity3D 心理沙盘互动演示
unity·ai·pdf·人机交互·工具·互动·心理沙盘
rockey6276 小时前
AScript动态脚本多语言环境支持
sql·c#·.net·script·eval·function·动态脚本
ou.cs7 小时前
c# SemaphoreSlim保姆级教程
开发语言·网络·c#
龙侠九重天7 小时前
ML.NET 实战:快速构建分类模型
分类·数据挖掘·c#·.net
私人珍藏库8 小时前
[Windows] PDF工具箱 PDF24 Creator 11.30.0
windows·pdf·工具·软件·多功能
fengyehongWorld8 小时前
C# 创建Worker,杀死指定程序的线程
c#
彭于晏Yan8 小时前
基于iText7的动态PDF生成技术方案
spring boot·pdf
悟乙己9 小时前
Advanced RAG 02:揭秘 PDF 解析
ai·pdf·llm·文档解析
lq12332109 小时前
PDF工具箱 PDF24 Creator 11.30.0
pdf