c++ mupdf 提取pdf文件里面图片

有时需要从 PDF 文件中提取图片，这时可以使用 MuPDF 库。如果使用 VS2013 编译，需要注意 MuPDF 的版本：经过多次尝试，最终编译成功的版本是 1.15.0。下面的示例代码改编自 MuPDF 自带的 mutool extract（pdfextract）工具：

// TestPdf2png.cpp : 定义控制台应用程序的入口点。
//

#include "stdafx.h"

/*
* pdfextract -- the ultimate way to extract images and fonts from pdfs
*/

#include "mupdf/fitz.h"
#include "mupdf/pdf.h"

#include <stdlib.h>
#include <stdio.h>

/* Document being processed; assigned in _tmain and read by saveimage and extractobject. */
static pdf_document *doc = NULL;
/* MuPDF context used by the helpers in this file; created in _tmain. */
static fz_context *ctx = NULL;
/* Non-zero asks for extracted images to be converted to RGB.
 * It is not modified anywhere in the visible code, so it keeps its initial value 0. */
static int dorgb = 0;

/* Print the command-line synopsis to stderr, then terminate with exit status 1. */
static void usage(void)
{
    fputs("usage: mutool extract [options] file.pdf [object numbers]\n", stderr);
    fputs("\t-p\tpassword\n", stderr);
    fputs("\t-r\tconvert images to rgb\n", stderr);
    exit(1);
}

/* Return the result of comparing the Subtype entry of `obj` with the name Image
 * (non-zero when they are equal). */
static int isimage(pdf_obj *obj)
{
    return pdf_name_eq(ctx, pdf_dict_get(ctx, obj, PDF_NAME(Subtype)), PDF_NAME(Image));
}

/* Return the result of comparing the Type entry of `obj` with the name FontDescriptor
 * (non-zero when they are equal). */
static int isfontdesc(pdf_obj *obj)
{
    return pdf_name_eq(ctx, pdf_dict_get(ctx, obj, PDF_NAME(Type)), PDF_NAME(FontDescriptor));
}

/*
 * Save a pixmap as "<file>.png" or "<file>.pam".
 *
 * ctx   - context used for every MuPDF call made here.
 * pix   - pixmap to save; a NULL pixmap is ignored and nothing is written.
 * file  - output path without the extension.
 * dorgb - when non-zero, a pixmap whose colorspace is set and is not device RGB
 *         is converted to device RGB before it is saved.
 *
 * PNG is chosen when `n - alpha` of the pixmap being saved is at most 3,
 * PAM otherwise. Errors raised while saving are rethrown to the caller.
 */
static void writepixmap(fz_context *ctx, fz_pixmap *pix, char *file, int dorgb)
{
    char buf[1024];
    fz_pixmap *rgb = NULL;

    if (!pix)
        return;

    if (dorgb && pix->colorspace && pix->colorspace != fz_device_rgb(ctx))
    {
        rgb = fz_convert_pixmap(ctx, pix, fz_device_rgb(ctx), NULL, NULL, NULL /* FIXME */, 1);
        pix = rgb;
    }

    /* The converted copy is owned by this function, so it has to be released
     * even when one of the save calls throws. */
    fz_try(ctx)
    {
        if (pix->n - pix->alpha <= 3)
        {
            fz_snprintf(buf, sizeof(buf), "%s.png", file);
            printf("extracting image %s\n", buf);
            fz_save_pixmap_as_png(ctx, pix, buf);
        }
        else
        {
            fz_snprintf(buf, sizeof(buf), "%s.pam", file);
            printf("extracting image %s\n", buf);
            fz_save_pixmap_as_pam(ctx, pix, buf);
        }
    }
    fz_always(ctx)
        fz_drop_pixmap(ctx, rgb);
    fz_catch(ctx)
        fz_rethrow(ctx);
}

/*
 * Write `len` bytes starting at `data` to the file "<file>.jpg".
 *
 * The output is created before the protected section; inside it the bytes are
 * written and the output is closed. The output is dropped whether or not the
 * write succeeds, and any error is rethrown to the caller.
 */
static void
writejpeg(fz_context *ctx, const unsigned char *data, size_t len, const char *file)
{
    char path[1024];
    fz_output *sink;

    fz_snprintf(path, sizeof(path), "%s.jpg", file);
    sink = fz_new_output_with_path(ctx, path, 0);

    fz_try(ctx)
    {
        printf("extracting image %s\n", path);
        fz_write_data(ctx, sink, data, len);
        fz_close_output(ctx, sink);
    }
    fz_always(ctx)
    {
        fz_drop_output(ctx, sink);
    }
    fz_catch(ctx)
    {
        fz_rethrow(ctx);
    }
}

/*
 * Load the image referenced by `ref` and write it to a file named after the
 * object number ("img-NNNN" plus an extension added by the writer).
 *
 * When the image has a compressed buffer of type JPEG and none of the
 * disqualifying conditions below hold, the compressed bytes are handed to
 * writejpeg unchanged. In every other case the image is turned into a pixmap
 * and handed to writepixmap.
 *
 * The image and the pixmap are dropped on every path; errors are rethrown.
 */
static void saveimage(pdf_obj *ref)
{
    fz_image *img = NULL;
    fz_pixmap *pixmap = NULL;
    fz_compressed_buffer *compressed;
    char name[32];
    int kind;

    /* Both are assigned inside fz_try and released in fz_always. */
    fz_var(img);
    fz_var(pixmap);

    fz_try(ctx)
    {
        img = pdf_load_image(ctx, doc, ref);
        compressed = fz_compressed_image_buffer(ctx, img);
        fz_snprintf(name, sizeof(name), "img-%04d", pdf_to_num(ctx, ref));

        kind = compressed ? compressed->params.type : FZ_IMAGE_UNKNOWN;

        /* A colorkey, a decode array or a mask each force the pixmap path. */
        if (img->use_colorkey || img->use_decode || img->mask)
            kind = FZ_IMAGE_UNKNOWN;

        /* With RGB output requested, only RGB and gray sources may skip the pixmap path. */
        if (dorgb)
        {
            enum fz_colorspace_type space = fz_colorspace_type(ctx, img->colorspace);
            if (!(space == FZ_COLORSPACE_RGB || space == FZ_COLORSPACE_GRAY))
                kind = FZ_IMAGE_UNKNOWN;
        }

        if (kind != FZ_IMAGE_JPEG)
        {
            pixmap = fz_get_pixmap_from_image(ctx, img, NULL, NULL, 0, 0);
            writepixmap(ctx, pixmap, name, dorgb);
        }
        else
        {
            unsigned char *bytes;
            size_t nbytes = fz_buffer_storage(ctx, compressed->buffer, &bytes);
            writejpeg(ctx, bytes, nbytes, name);
        }
    }
    fz_always(ctx)
    {
        fz_drop_image(ctx, img);
        fz_drop_pixmap(ctx, pixmap);
    }
    fz_catch(ctx)
    {
        fz_rethrow(ctx);
    }
}

/*
 * Write the font program attached to the font descriptor `dict` to a file
 * named "<FontName>-<object number>.<ext>".
 *
 * The extension depends on which entry holds the stream:
 *   FontFile  -> "pfa"
 *   FontFile2 -> "ttf"
 *   FontFile3 -> "cff", "cid" or "otf", chosen by the stream's Subtype
 * When several entries are present the last one checked wins.
 *
 * Throws when a FontFile3 Subtype is not a name or is not one of the handled
 * values. Emits a warning and returns when no font stream is present.
 * Errors raised while writing are rethrown after the buffer is released.
 */
static void savefont(pdf_obj *dict)
{
    char namebuf[1024];
    char safename[1024];
    fz_buffer *buf;
    pdf_obj *stream = NULL;
    pdf_obj *obj;
    const char *ext = "";
    fz_output *out;
    const char *fontname = "font";
    size_t len;
    size_t i;
    unsigned char *data;

    obj = pdf_dict_get(ctx, dict, PDF_NAME(FontName));
    if (obj)
        fontname = pdf_to_name(ctx, obj);

    /* The font name comes from the PDF and ends up in a file path, so replace
     * path separators, the drive colon, other characters Windows rejects in
     * file names, and control characters. This keeps the output inside the
     * current directory. */
    for (i = 0; fontname[i] != '\0' && i + 1 < sizeof(safename); i++)
    {
        unsigned char ch = (unsigned char)fontname[i];
        switch (ch)
        {
        case '/': case '\\': case ':': case '*': case '?':
        case '"': case '<': case '>': case '|':
            safename[i] = '_';
            break;
        default:
            safename[i] = (ch < 0x20) ? '_' : (char)ch;
            break;
        }
    }
    safename[i] = '\0';

    obj = pdf_dict_get(ctx, dict, PDF_NAME(FontFile));
    if (obj)
    {
        stream = obj;
        ext = "pfa";
    }

    obj = pdf_dict_get(ctx, dict, PDF_NAME(FontFile2));
    if (obj)
    {
        stream = obj;
        ext = "ttf";
    }

    obj = pdf_dict_get(ctx, dict, PDF_NAME(FontFile3));
    if (obj)
    {
        stream = obj;

        obj = pdf_dict_get(ctx, obj, PDF_NAME(Subtype));
        if (obj && !pdf_is_name(ctx, obj))
            fz_throw(ctx, FZ_ERROR_GENERIC, "invalid font descriptor subtype");

        if (pdf_name_eq(ctx, obj, PDF_NAME(Type1C)))
            ext = "cff";
        else if (pdf_name_eq(ctx, obj, PDF_NAME(CIDFontType0C)))
            ext = "cid";
        else if (pdf_name_eq(ctx, obj, PDF_NAME(OpenType)))
            ext = "otf";
        else
            fz_throw(ctx, FZ_ERROR_GENERIC, "unhandled font type '%s'", pdf_to_name(ctx, obj));
    }

    if (!stream)
    {
        fz_warn(ctx, "unhandled font type");
        return;
    }

    buf = pdf_load_stream(ctx, stream);
    len = fz_buffer_storage(ctx, buf, &data);
    fz_try(ctx)
    {
        fz_snprintf(namebuf, sizeof(namebuf), "%s-%04d.%s", safename, pdf_to_num(ctx, dict), ext);
        printf("extracting font %s\n", namebuf);
        out = fz_new_output_with_path(ctx, namebuf, 0);
        /* Inner section: the output is dropped whether or not the write succeeds. */
        fz_try(ctx)
        {
            fz_write_data(ctx, out, data, len);
            fz_close_output(ctx, out);
        }
        fz_always(ctx)
            fz_drop_output(ctx, out);
        fz_catch(ctx)
            fz_rethrow(ctx);
    }
    fz_always(ctx)
        fz_drop_buffer(ctx, buf);
    fz_catch(ctx)
        fz_rethrow(ctx);
}

static void extractobject(int num)
{
pdf_obj *ref;

if (!doc)
fz_throw(ctx, FZ_ERROR_GENERIC, "no file specified");

fz_try(ctx)
   {
       ref = pdf_new_indirect(ctx, doc, num, 0);
       if (isimage(ref))
           saveimage(ref);
       if (isfontdesc(ref))
           savefont(ref);
   }
   fz_always(ctx)
       pdf_drop_obj(ctx, ref);
   fz_catch(ctx)
       fz_warn(ctx, "ignoring object %d", num);
}

int _tmain(int argc, _TCHAR* argv[])
{
   char *infile;
   char *password = "";
   int c, o;
   infile = "C:\\Users\\14713\\Desktop\\1.pdf";

ctx = fz_new_context(NULL, NULL, FZ_STORE_UNLIMITED);
   if (!ctx)
   {
       fprintf(stderr, "cannot initialise context\n");
       exit(1);
   }

doc = pdf_open_document(ctx, infile);
   if (pdf_needs_password(ctx, doc))
       if (!pdf_authenticate_password(ctx, doc, password))
           fz_throw(ctx, FZ_ERROR_GENERIC, "cannot authenticate password: %s", infile);

int len = pdf_count_objects(ctx, doc);
for (o = 1; o < len; o++)
extractobject(o);

pdf_drop_document(ctx, doc);
fz_flush_warnings(ctx);
fz_drop_context(ctx);

return 0;
}

c++ mupdf 提取pdf文件里面图片相关推荐

Python脚本工具，PyMuPDF批量提取PDF文件中的图片
如何批量快速提取出PDF中的图片文件,你是否遇到这样的一个问题,尤其是PPT文件转换为PDF文件,需要快速提取其中的图片文件,如果你恰好会那么一点py,同时复制粘贴没问题的话,那么相信你也能够很轻松的 ...
提取PDF文件里面的图片
现在很多资源都是PDF格式的,里面的很多图片也都很值得大家借鉴,但是截图出来的图片总是显得不清晰,我们可以考虑将PDF文件里面的图片文件提取出来,直接使用原图更方便.如果你需要提取PDF文件里面的图片 ...
3个方法提取PDF文件里的图片
PDF文件可以保护文档内容不容易被更改,但也因为这样,有一些需求无法进行操作.比如PDF文件里的图片,不能通过另存为保存下来. 那如果想要提取PDF文件里的图片要如何操作呢?下面分享三个方法给大家. ...
怎么用迅捷PDF转换器在线提取PDF文件中的图片
大家在学习或者是办公中经常使用到PDF文件,我们在做一份工作文件的时候,需要一些资料来补充内容,这些资料是以PDF文件格式呈现,在使用PDF文件时,文件中有的图片做到很精细,想要单独提取保存下来备用. ...
通过Python的fitz库提取pdf中的图片
文章目录前言一.fitz库是什么? 二.安装fitz库三.查看fitz库版本四.pymupdf库是什么? 五.安装pymupdf库六.查看pymupdf库版本七.fitz和pymupdf是 ...
怎样提取PDF文件其中一页
在对PDF文件进行处理的时候,想要提取文件内其中一页该如何解决呢?一般PDF格式处理的方式都是先将PDF转换成其它可编辑的文件格式,但是提取PDF文件中的页面就不需要了,对于一些职场小白是不知道这些的 ...
工具：通过Python fitz 提取PDF内的图片
通过Python fitz 提取PDF内的图片 # 打开pdf读取文本和图片内容 # pip install PyMuPDFimport fitzpdf_document = "1.pdf& ...
Java 添加、提取PDF中的图片
Spire.Cloud.SDK for Java提供了PdfImagesApi接口可用于添加图片到PDF文档addImage().提取PDF中的图片extractImages(),具体操作步骤和Jav ...
提取pdf文件中文字的两种方法
如今,在我们的工作与学习中已经不是单单使用word.Excel等格式文件了,pdf格式的文件已经被广泛地运用到我们的办公室中.大家都知道pdf文件是不可直接编辑与修改的,使用起来有些不便.那么当我们需 ...

c++ mupdf 提取pdf文件里面图片

c++ mupdf 提取pdf文件里面图片相关推荐

最新文章

热门文章