mupdf提取pdf图像

ExtractPDF

Extract images and text from a PDF file using MuPDF.

Compile as a dll

$ gcc -c libpdf.c -I../../include

$ gcc -shared -o libpdf.dll libpdf.o -L/d/dev/mupdf/build/debug/ -lmupdf -lz -lopenjpeg -ljpeg -ljbig2dec -lfreetype -lmupdf-js-none

Execute as exe

c:\> gcc -o libpdf.exe libpdf.o -L/d/dev/mupdf/build/debug/ -lmupdf -lz -lopenjpeg -ljpeg -ljbig2dec -lfreetype -lmupdf-js-none

  • As you can see, this works on both Windows and Linux, and the DLL can be called successfully from .NET.

This code is released under the same license as MuPDF.

mupdf-1.3 compiled by: make build=release NOX11=yes

cpp 复制代码
#include "mupdf/fitz.h"
#include "mupdf/pdf.h"

/* Marks a function as exported from the shared library.
   __declspec(dllexport) is only understood by Windows toolchains, so other
   GCC-compatible compilers get the equivalent visibility attribute and
   anything else gets an empty macro. */
#if defined(_WIN32) || defined(__CYGWIN__)
#define EXPORT_DLL  __declspec(dllexport)
#elif defined(__GNUC__)
#define EXPORT_DLL  __attribute__((visibility("default")))
#else
#define EXPORT_DLL
#endif

/* State shared by the functions in this file. */
static pdf_document   *doc = NULL;          /* PDF opened by init_image() */
static fz_document    *fzdoc = NULL;        /* document opened by init_txt() */
static fz_context     *ctx = NULL;          /* context created by init_image()/init_txt() */
static int            images = 0;           /* number of used slots in image_obj_num */
static int			  image_obj_num[100];   /* object numbers recorded by analysis_image_info() */
static fz_text_sheet  *sheet = NULL;        /* text sheet created by extractText() */
FILE *ofp;                                  /* output file opened by extractText() */

/*
 * Write a pixmap to "<file>.png" or "<file>.pam".
 *
 * ctx  - MuPDF context used for every call below.
 * pix  - pixmap to write; nothing happens when it is NULL.
 * file - output path without extension.
 * rgb  - when non-zero, a pixmap that has a colorspace other than device RGB
 *        is first converted into a temporary device-RGB pixmap.
 *
 * Pixmaps with pix->n <= 4 are written as PNG, all others as PAM.
 */
void writepixmap(fz_context *ctx, fz_pixmap *pix, char *file, int rgb)
{
  char       path[1024];
  fz_pixmap  *rgbpix = NULL;
  int        wants_conversion;

  if (pix == NULL)
    return;

  wants_conversion = rgb && pix->colorspace && pix->colorspace != fz_device_rgb(ctx);
  if (wants_conversion)
    {
      fz_irect area;

      rgbpix = fz_new_pixmap_with_bbox(ctx, fz_device_rgb(ctx), fz_pixmap_bbox(ctx, pix, &area));
      fz_convert_pixmap(ctx, rgbpix, pix);
      // from here on only the converted copy is written
      pix = rgbpix;
    }

  if (pix->n > 4)
    {
      snprintf(path, sizeof(path), "%s.pam", file);
      fz_write_pam(ctx, pix, path, 0);
    }
  else
    {
      snprintf(path, sizeof(path), "%s.png", file);
      fz_write_png(ctx, pix, path, 0);
    }

  // rgbpix is still NULL when no conversion took place
  fz_drop_pixmap(ctx, rgbpix);
}

void saveimage(char *dest_path,int num)
{
  fz_image   *image;
  fz_pixmap  *pix;
  pdf_obj    *ref;
  char       buf[200];
  
  ref = pdf_new_indirect(doc,num,0);
  
  //save image into file
  image = pdf_load_image(doc,ref);
  pix = fz_new_pixmap_from_image(ctx,image,0,0);
  fz_drop_image(ctx,image);

  snprintf(buf,sizeof(buf),"%s/img-%04d",dest_path,num);
  //rgb mode
  writepixmap(ctx,pix,buf,1);

  fz_drop_pixmap(ctx,pix);
  pdf_drop_obj(ref);
}

EXPORT_DLL int init_image(char *filename,char *password)
{
  int result = 0;
  FILE * f;
  f=fopen("testfile.txt","a");
  fprintf(f,"%s",filename);
  fclose(f);
  ctx = fz_new_context(NULL, NULL, FZ_STORE_UNLIMITED);
  if (!ctx)
    {
      //can not initialise context
      result = 1;
    }
  doc = pdf_open_document_no_run(ctx,filename);      

  if (pdf_needs_password(doc))
    {
      if (!pdf_authenticate_password(doc,password))
        {
          // cannot authenticate password
          result = 2;
        }
      result = 0;
    }
  // pdf sucessfully loaded
  return result;
}

/*
 * Return the number of pages of the global PDF document `doc`,
 * as reported by pdf_count_pages().
 */
EXPORT_DLL int page_counts()
{
  return pdf_count_pages(doc);
}

EXPORT_DLL void extractImage(char *dest_path,int objnum)
{
  pdf_obj  *obj;
  obj = pdf_load_object(doc,objnum,0);
 if (isimage(obj))
  saveimage(dest_path,objnum);
  pdf_drop_obj(obj);
}

EXPORT_DLL void destroy_image()
{
  if (doc)
    {
      pdf_close_document(doc);
      doc = NULL;
      fz_free_context(ctx);
    }  
}
/*
 * Record the object number of every image found in an XObject dictionary.
 *
 * page, pageref, pageobj - not used by this function.
 * dict                   - dictionary whose values are inspected.
 *
 * Each value that is a dictionary with Subtype "Image" has its object
 * number appended to the global image_obj_num array and `images` is
 * incremented. Recording stops, with a warning, once the array is full
 * instead of writing past its end.
 */
static void analysis_image_info(int page,pdf_obj *pageref,pdf_obj *pageobj,pdf_obj *dict)
{
  const int capacity = (int)(sizeof(image_obj_num) / sizeof(image_obj_num[0]));
  int i,n;

  (void)page;
  (void)pageref;
  (void)pageobj;

  n = pdf_dict_len(dict);

  for (i = 0; i < n; ++i)
    {
      pdf_obj	*imagedict;
      pdf_obj	*type;

      imagedict = pdf_dict_get_val(dict,i);
      if (!pdf_is_dict(imagedict)) continue;

      type = pdf_dict_gets(imagedict,"Subtype");
      if (strcmp(pdf_to_name(type),"Image")) continue;

      if (images >= capacity)
        {
          fz_warn(ctx,"more than %d images, ignoring the rest",capacity);
          break;
        }
      image_obj_num[images++] = pdf_to_num(imagedict);
    }
}

/*
 * Collect the images referenced by a resource dictionary.
 *
 * page - 1-based page number; page-1 is looked up in the global `doc`.
 * rsrc - resource dictionary to scan.
 *
 * When `rsrc` has an "XObject" entry, that dictionary is passed to
 * analysis_image_info(). Afterwards every XObject value that carries its
 * own "Resources" dictionary is scanned recursively, unless pdf_objcmp()
 * reports that dictionary as equal to `rsrc`.
 */
static void analysis_resource_info(int page,pdf_obj *rsrc)
{
  pdf_obj *ref = pdf_lookup_page_obj(doc,page-1);
  pdf_obj *pg = pdf_resolve_indirect(ref);
  pdf_obj *xobjects = pdf_dict_gets(rsrc,"XObject");
  int count;
  int idx;

  if (!xobjects)
    return;

  analysis_image_info(page,ref,pg,xobjects);

  count = pdf_dict_len(xobjects);
  idx = 0;
  while (idx < count)
    {
      pdf_obj *entry = pdf_dict_get_val(xobjects,idx++);
      pdf_obj *nested = pdf_dict_gets(entry,"Resources");

      if (!nested)
        continue;
      // a non-zero pdf_objcmp() result means the two dictionaries differ
      if (pdf_objcmp(rsrc,nested) == 0)
        continue;

      analysis_resource_info(page,nested);
    }
}

EXPORT_DLL int analysis_page_info(int page)
{
  pdf_obj *pageobj;
  pdf_obj *pageref;
  pdf_obj *rsrc;

  int result=0;
  pageref = pdf_lookup_page_obj(doc,page-1);
  pageobj = pdf_resolve_indirect(pageref);
  
  if (!pageobj)
    {
      // cannot retrieve info from page
      result = 1;
    }

  rsrc = pdf_dict_gets(pageobj,"Resources");
  analysis_resource_info(page,rsrc);
  return result;
}

/*
 * Write the recorded image object numbers into `imagenum` as a
 * comma-separated decimal list, e.g. "12,15,40".
 *
 * imagenum - caller-provided output buffer. The function has no way to
 *            check its size, so the caller must make it large enough for
 *            every recorded number plus separators and the terminator.
 *
 * The buffer always ends up NUL-terminated: it holds the empty string
 * when no image has been recorded. A NULL buffer is ignored.
 */
EXPORT_DLL void get_image_obj_num(char imagenum[])
{
  char *pos = imagenum;
  int i;

  if (!imagenum)
    return;

  // without this the buffer would stay untouched when images == 0
  *pos = '\0';

  for (i = 0; i < images; ++i)
    {
      pos += sprintf(pos, i ? ",%d" : "%d", image_obj_num[i]);
    }
}
/*
 * Return 1 when the "Subtype" entry of `obj` is the name "Image",
 * otherwise 0.
 */
int isimage(pdf_obj *obj)
{
	pdf_obj *subtype = pdf_dict_gets(obj, "Subtype");

	if (!pdf_is_name(subtype))
		return 0;

	return strcmp(pdf_to_name(subtype), "Image") == 0;
}

/*
 * Call extractImage() for every object number from 0 up to, but not
 * including, pdf_count_objects(doc).
 *
 * dest_path - directory passed on to extractImage().
 */
EXPORT_DLL void export_all_image(char *dest_path)
{
  int total = pdf_count_objects(doc);
  int objnum = 0;

  while (objnum < total)
    {
      extractImage(dest_path,objnum);
      objnum++;
    }
}
EXPORT_DLL int init_txt(char* filename,char* password)
{
  int result=0;
  fz_var(fzdoc);
  ctx = fz_new_context(NULL,NULL,FZ_STORE_DEFAULT);
  if (!ctx)
    {
      //initialise error
      result = 1;
    }
  fzdoc = fz_open_document(ctx,filename);
  if (fz_needs_password(fzdoc))
     {
       if (!fz_authenticate_password(fzdoc,password))
         {
           //authenticate error
           result = 2;
         }
     }
 }
EXPORT_DLL int extractText(int pagenum,char* textfile)
{
  fz_page      *page;
  fz_device    *dev = NULL;
  fz_text_page *text = NULL;
  fz_cookie    cookie = {0};
  fz_output		*out = NULL;

  int 	result = 0;

  ofp = fopen(textfile,"w");
  fz_var(dev);
  fz_var(text);
  fz_var(out);

  fz_try(ctx)
  {

    out = fz_new_output_with_file(ctx,ofp);
    sheet = fz_new_text_sheet(ctx);
 
    page = fz_load_page(fzdoc,pagenum-1);
    text = fz_new_text_page(ctx);
    dev = fz_new_text_device(ctx,sheet,text);
    fz_run_page(fzdoc,page,dev,&fz_identity,&cookie);
    
    fz_print_text_page(ctx,out,text);
  }
  fz_always(ctx)
  {
    fz_free_device(dev);
    dev = NULL;
    fz_free_text_page(ctx,text);
  }
  fz_catch(ctx)
  {
    fz_free_page(fzdoc,page);
    //exception
    result = 3;
  }
  fz_close_output(out);
  fclose(ofp);
  return result;
}

EXPORT_DLL void destroy_txt()
{
  fz_close_document(fzdoc);
  fzdoc = NULL;
  fz_free_context(ctx);

}

/*
 * Command-line driver: export every image of the PDF named by argv[1]
 * into the current directory, then write the text of each page to
 * "<page>.txt" (pages are numbered from 1).
 *
 * Returns 0 on success and 1 when no file name is given or the PDF
 * cannot be opened for image extraction.
 */
int main(int argc, char *argv[])
{
  char *filename;
  int pages;
  int n;

  if (argc < 2)
    {
      fprintf(stderr,"usage: %s file.pdf\n",(argc > 0 && argv[0]) ? argv[0] : "libpdf");
      return 1;
    }
  filename = argv[1];

  if (init_image(filename,"") != 0)
    {
      fprintf(stderr,"cannot open %s\n",filename);
      destroy_image();
      return 1;
    }
  pages = page_counts();
  printf("pages:%d",pages);
  export_all_image(".");
  destroy_image();

  init_txt(filename,"");
  for (n = 1; n <= pages; n++)
    {
      // large enough for any int plus ".txt" and the terminator
      char txt[32];
      snprintf(txt,sizeof(txt),"%d.txt",n);
      extractText(n,txt);
    }
  destroy_txt();
  return 0;
}
相关推荐
一只花里胡哨的程序猿2 小时前
odoo打印pdf速度慢问题
pdf·odoo
灵海之森4 小时前
Python将md转html,转pdf
pdf
阿幸软件杂货间8 小时前
最新PDF版本!Acrobat Pro DC 2025,解压即用版
pdf·adobe acrobat·acrobat
星空的资源小屋10 小时前
网易UU远程,免费电脑远程控制软件
人工智能·python·pdf·电脑
会飞的小菠菜13 小时前
如何一次性将多个PPT幻灯片批量转换成PDF文档
pdf·powerpoint·ppt·批量·格式转换
somethingGoWay16 小时前
wpf .netcore 导出pdf文件
pdf·wpf·.netcore
小白电脑技术1 天前
PDF教程|如何把想要的网页保存下来?
pdf·电脑
我没想到原来他们都是一堆坏人1 天前
通过Gen AI SDK调用gemini 2.5 pro,单独上传pdf文件 | ai agent 开发笔记 2025.9.2 Day 2
ai·google·pdf·sdk·gemini
AI视觉网奇1 天前
麒麟系统 doc转pdf
linux·运维·pdf
CodeCraft Studio1 天前
国产化PDF处理控件Spire.PDF教程:如何在 Java 中通过模板生成 PDF
java·python·pdf·spire.pdf·java创建pdf·从html创建pdf