mupdf提取pdf图像

ExtractPDF

Extract image and text from PDF file by mupdf

Compile as a dll

$ gcc -c libpdf.c -I../../include

$ gcc -shared -o libpdf.dll libpdf.o -L/d/dev/mupdf/build/debug/ -lmupdf -lz -lopenjpeg -ljpeg -ljbig2dec -lfreetype \ -lmupdf-js-none

Execute as exe

c:\\> gcc -o libpdf.exe libpdf.o -L/d/dev/mupdf/build/debug/ -lmupdf -lz -lopenjpeg -ljpeg -ljbig2dec -lfreetype -lmupdf-js -none

  • As you can see, this worked both on Windows and linux. and sucessfully called by .Net

this code use the orginal license as mupdf does.

mupdf-1.3 compiled by: make build=release NOX11=yes

cpp 复制代码
#include "mupdf/fitz.h"
#include "mupdf/pdf.h"

#define EXPORT_DLL  __declspec(dllexport)
static pdf_document   *doc = NULL;
static fz_document    *fzdoc = NULL;
static fz_context     *ctx = NULL;
static int            images = 0;
static int			  image_obj_num[100];
static fz_text_sheet  *sheet = NULL;
FILE *ofp;

void writepixmap(fz_context *ctx, fz_pixmap *pix, char *file, int rgb)
{
  char buf[1024];
  fz_pixmap *converted = NULL;
  if (!pix)
    {
      return;
    }
  if (rgb && pix->colorspace && pix->colorspace != fz_device_rgb(ctx))
    {
      fz_irect bbox;
      converted = fz_new_pixmap_with_bbox(ctx,fz_device_rgb(ctx),fz_pixmap_bbox(ctx,pix,&bbox));
      fz_convert_pixmap(ctx,converted,pix);
      pix=converted;
    }
  if(pix->n<=4)
    {
      snprintf(buf,sizeof(buf),"%s.png",file);
      fz_write_png(ctx,pix,buf,0);
    }
  else
    {
      snprintf(buf,sizeof(buf),"%s.pam",file);
      fz_write_pam(ctx,pix,buf,0);
    }
  fz_drop_pixmap(ctx,converted);
}

void saveimage(char *dest_path,int num)
{
  fz_image   *image;
  fz_pixmap  *pix;
  pdf_obj    *ref;
  char       buf[200];
  
  ref = pdf_new_indirect(doc,num,0);
  
  //save image into file
  image = pdf_load_image(doc,ref);
  pix = fz_new_pixmap_from_image(ctx,image,0,0);
  fz_drop_image(ctx,image);

  snprintf(buf,sizeof(buf),"%s/img-%04d",dest_path,num);
  //rgb mode
  writepixmap(ctx,pix,buf,1);

  fz_drop_pixmap(ctx,pix);
  pdf_drop_obj(ref);
}

EXPORT_DLL int init_image(char *filename,char *password)
{
  int result = 0;
  FILE * f;
  f=fopen("testfile.txt","a");
  fprintf(f,"%s",filename);
  fclose(f);
  ctx = fz_new_context(NULL, NULL, FZ_STORE_UNLIMITED);
  if (!ctx)
    {
      //can not initialise context
      result = 1;
    }
  doc = pdf_open_document_no_run(ctx,filename);      

  if (pdf_needs_password(doc))
    {
      if (!pdf_authenticate_password(doc,password))
        {
          // cannot authenticate password
          result = 2;
        }
      result = 0;
    }
  // pdf sucessfully loaded
  return result;
}

EXPORT_DLL int page_counts()
{
  int pages = 0;
  pages = pdf_count_pages(doc);
  return pages;
}

EXPORT_DLL void extractImage(char *dest_path,int objnum)
{
  pdf_obj  *obj;
  obj = pdf_load_object(doc,objnum,0);
 if (isimage(obj))
  saveimage(dest_path,objnum);
  pdf_drop_obj(obj);
}

EXPORT_DLL void destroy_image()
{
  if (doc)
    {
      pdf_close_document(doc);
      doc = NULL;
      fz_free_context(ctx);
    }  
}
static void analysis_image_info(int page,pdf_obj *pageref,pdf_obj *pageobj,pdf_obj *dict)
{
  int i,n;

  n = pdf_dict_len(dict);

  for (i = 0; i < n; ++i)
    {
      pdf_obj	*imagedict;
      pdf_obj	*type;
      
      imagedict = pdf_dict_get_val(dict,i);
      if (!pdf_is_dict(imagedict)) continue;

      type = pdf_dict_gets(imagedict,"Subtype");
      if (strcmp(pdf_to_name(type),"Image")) continue;
      images++;
      image_obj_num[images-1]=pdf_to_num(imagedict);
    }
}

static void analysis_resource_info(int page,pdf_obj *rsrc)
{
  pdf_obj	*pageobj;
  pdf_obj	*pageref;
  pdf_obj	*xobj;
  pdf_obj	*subrsrc;

  pageref = pdf_lookup_page_obj(doc,page-1);
  pageobj = pdf_resolve_indirect(pageref);

  xobj = pdf_dict_gets(rsrc,"XObject");
  if (xobj)
    {
      analysis_image_info(page,pageref,pageobj,xobj);
      int n = pdf_dict_len(xobj);
      int i = 0;
      for (i = 0; i < n; i++)
        {
          pdf_obj *obj = pdf_dict_get_val(xobj,i);
          subrsrc = pdf_dict_gets(obj,"Resources");
          if (subrsrc && pdf_objcmp(rsrc,subrsrc))
            {
              analysis_resource_info(page,subrsrc);
            }
        }
    }
}

EXPORT_DLL int analysis_page_info(int page)
{
  pdf_obj *pageobj;
  pdf_obj *pageref;
  pdf_obj *rsrc;

  int result=0;
  pageref = pdf_lookup_page_obj(doc,page-1);
  pageobj = pdf_resolve_indirect(pageref);
  
  if (!pageobj)
    {
      // cannot retrieve info from page
      result = 1;
    }

  rsrc = pdf_dict_gets(pageobj,"Resources");
  analysis_resource_info(page,rsrc);
  return result;
}

EXPORT_DLL void get_image_obj_num(char imagenum[])
{
  char *pos= imagenum;

  int i = 0;

  for (i = 0; i < images; ++i)
    {
      if (i)
        {
          pos += sprintf(pos,",");
        }
	  pos += sprintf(pos,"%d",image_obj_num[i]);
    }
}
int isimage(pdf_obj *obj)
{
	pdf_obj *type = pdf_dict_gets(obj, "Subtype");
	return pdf_is_name(type) && !strcmp(pdf_to_name(type), "Image");
}

EXPORT_DLL void export_all_image(char *dest_path)
{
  int len=pdf_count_objects(doc);
  int n;
  for(n=0;n<len;n++)
    {
       extractImage(dest_path,n);
    }
}
EXPORT_DLL int init_txt(char* filename,char* password)
{
  int result=0;
  fz_var(fzdoc);
  ctx = fz_new_context(NULL,NULL,FZ_STORE_DEFAULT);
  if (!ctx)
    {
      //initialise error
      result = 1;
    }
  fzdoc = fz_open_document(ctx,filename);
  if (fz_needs_password(fzdoc))
     {
       if (!fz_authenticate_password(fzdoc,password))
         {
           //authenticate error
           result = 2;
         }
     }
 }
EXPORT_DLL int extractText(int pagenum,char* textfile)
{
  fz_page      *page;
  fz_device    *dev = NULL;
  fz_text_page *text = NULL;
  fz_cookie    cookie = {0};
  fz_output		*out = NULL;

  int 	result = 0;

  ofp = fopen(textfile,"w");
  fz_var(dev);
  fz_var(text);
  fz_var(out);

  fz_try(ctx)
  {

    out = fz_new_output_with_file(ctx,ofp);
    sheet = fz_new_text_sheet(ctx);
 
    page = fz_load_page(fzdoc,pagenum-1);
    text = fz_new_text_page(ctx);
    dev = fz_new_text_device(ctx,sheet,text);
    fz_run_page(fzdoc,page,dev,&fz_identity,&cookie);
    
    fz_print_text_page(ctx,out,text);
  }
  fz_always(ctx)
  {
    fz_free_device(dev);
    dev = NULL;
    fz_free_text_page(ctx,text);
  }
  fz_catch(ctx)
  {
    fz_free_page(fzdoc,page);
    //exception
    result = 3;
  }
  fz_close_output(out);
  fclose(ofp);
  return result;
}

EXPORT_DLL void destroy_txt()
{
  fz_close_document(fzdoc);
  fzdoc = NULL;
  fz_free_context(ctx);

}

int main(int argc, char *argv[])
{
  char *filename= argv[1];
  init_image(filename,"");
  int pages=page_counts();
  printf("pages:%d",pages);
  export_all_image(".");
  destroy_image();
  init_txt(filename,"");
  int n=0;
  for( n=1; n<pages+1;n++){
    char txt[5];
    sprintf(txt,"%d.txt",n);
    extractText(n,txt);
  }
  destroy_txt();
  return 0;
}
相关推荐
神色自若14 小时前
Net9为PDF文字替换,使用Spire.PDF版本10.12.4.1360
pdf
机器懒得学习17 小时前
解析交通事故报告:利用 PDF、AI 与数据标准化技术构建智能分析系统
pdf
合合技术团队1 天前
高效准确的PDF解析工具,赋能企业非结构化数据治理
人工智能·科技·pdf·aigc·文档
jingling5551 天前
如何使用免费资源--知网篇
开发语言·经验分享·搜索引擎·pdf·开源
haha_qasim2 天前
怎么将pdf中的某一个提取出来?介绍几种提取PDF中页面的方法
前端·pdf
m0_748249542 天前
前端预览pdf文件流
前端·pdf
百年孤独_2 天前
高阶:基于Python paddleocr库 提取pdf 文档高亮显示的内容
开发语言·python·pdf
m0_748236582 天前
前端如何将pdf等文件传入后端
前端·pdf·状态模式
翔云API2 天前
通用文档识别接口包含PDF文档识别么?集成方式是什么
pdf
觅远2 天前
python实现Word转PDF(comtypes、win32com、docx2pdf)
python·pdf·自动化·word