使用 PDFium 导出 PDF 中的图像

✅ 修正版:支持递归提取表单中的图片对象

我在你的基础上只增加了一个 递归遍历函数,不影响你现有逻辑和日志输出。


cpp 复制代码
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include <sstream>
#include <iomanip>
#include <filesystem>

#include "fpdfview.h"
#include "fpdf_edit.h"
#include "fpdf_save.h"

// stb_image_write 实现 JPEG 保存
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image_write.h"

/// Converts a PDFium bitmap into a tightly packed 8-bit RGB buffer.
///
/// Gray bitmaps have their single byte replicated into R, G and B. The BGR,
/// BGRx, BGRA and BGRA_Premul layouts have their channel order swapped to RGB;
/// a fourth byte, when present, is dropped without any blending.
///
/// @param bitmap   Bitmap handle queried through the FPDFBitmap_* accessors.
/// @param rgb_data Output buffer. On success it holds width * height * 3 bytes,
///                 rows packed back to back in source row order. It is left
///                 untouched when the function fails.
/// @return true on success; false for an unsupported format, a null pixel
///         buffer, a non-positive width/height/stride, or a stride that is too
///         small for one row of pixels.
bool convert_to_rgb(FPDF_BITMAP bitmap, std::vector<unsigned char>& rgb_data) {
    const int format = FPDFBitmap_GetFormat(bitmap);
    const int width = FPDFBitmap_GetWidth(bitmap);
    const int height = FPDFBitmap_GetHeight(bitmap);
    const int stride = FPDFBitmap_GetStride(bitmap);
    const unsigned char* buffer =
        static_cast<const unsigned char*>(FPDFBitmap_GetBuffer(bitmap));

    // Number of source bytes that make up one pixel in each supported layout.
    std::size_t src_bpp = 0;
    switch (format) {
    case FPDFBitmap_Gray:
        src_bpp = 1;
        break;
    case FPDFBitmap_BGR:
        src_bpp = 3;
        break;
    case FPDFBitmap_BGRx:
    case FPDFBitmap_BGRA:
    case FPDFBitmap_BGRA_Premul:
        src_bpp = 4;
        break;
    default:
        std::cerr << "❌ Unsupported bitmap format: " << format << std::endl;
        return false;
    }

    if (!buffer || width <= 0 || height <= 0 || stride <= 0) {
        std::cerr << "❌ Invalid bitmap buffer or dimensions" << std::endl;
        return false;
    }

    // All size arithmetic is done in size_t so large bitmaps cannot overflow int.
    const std::size_t cols = static_cast<std::size_t>(width);
    const std::size_t rows = static_cast<std::size_t>(height);
    const std::size_t row_bytes = static_cast<std::size_t>(stride);
    if (row_bytes < cols * src_bpp) {
        // Reading a full row would run past the bytes the stride accounts for.
        std::cerr << "❌ Invalid bitmap buffer or dimensions" << std::endl;
        return false;
    }

    rgb_data.resize(cols * rows * 3);
    unsigned char* dst = rgb_data.data();

    for (std::size_t y = 0; y < rows; ++y) {
        const unsigned char* src_row = buffer + y * row_bytes;
        for (std::size_t x = 0; x < cols; ++x) {
            const unsigned char* px = src_row + x * src_bpp;
            if (src_bpp == 1) {
                dst[0] = px[0];
                dst[1] = px[0];
                dst[2] = px[0];
            } else {
                // Source stores blue first; output stores red first.
                dst[0] = px[2];
                dst[1] = px[1];
                dst[2] = px[0];
            }
            dst += 3;
        }
    }

    return true;
}

/// Returns a printable name for a PDFium bitmap format constant.
///
/// Values that match none of the known FPDFBitmap_* constants are rendered as
/// "Unknown(<number>)".
std::string format_to_string(int format) {
    static const struct {
        int id;
        const char* label;
    } kKnownFormats[] = {
        {FPDFBitmap_Unknown, "Unknown"},
        {FPDFBitmap_Gray, "Gray"},
        {FPDFBitmap_BGR, "BGR"},
        {FPDFBitmap_BGRx, "BGRx"},
        {FPDFBitmap_BGRA, "BGRA"},
        {FPDFBitmap_BGRA_Premul, "BGRA_Premul"},
    };

    for (const auto& entry : kKnownFormats) {
        if (entry.id == format) {
            return entry.label;
        }
    }

    std::string fallback = "Unknown(";
    fallback += std::to_string(format);
    fallback += ")";
    return fallback;
}

/// Recursively exports the images reachable from one page object as JPEG files.
///
/// Image objects are rendered to a bitmap, converted to RGB and written to
/// "<output_dir>/page_<PP>_img_<NNN>.jpg". Form objects are descended into so
/// that images nested inside them are exported as well. Every other object
/// type is ignored.
///
/// @param doc           Document that owns the page.
/// @param page          Page the object belongs to; passed to the renderer.
/// @param obj           Page object to inspect; a null handle is ignored.
/// @param output_dir    Directory the files are written into.
/// @param page_index    Page number used in the file name.
/// @param image_counter Running image number used in the file name. It is
///                      advanced once per image for which a write was
///                      attempted, whether or not the write succeeded.
void ExtractImagesFromObject(FPDF_DOCUMENT doc, FPDF_PAGE page, FPDF_PAGEOBJECT obj,
                             const std::string& output_dir, int page_index, int& image_counter) {
    if (!obj) return;

    const int type = FPDFPageObj_GetType(obj);

    // Form object: walk its children and let the recursion handle each one.
    if (type == FPDF_PAGEOBJ_FORM) {
        const int sub_count = FPDFFormObj_CountObjects(obj);
        for (int k = 0; k < sub_count; ++k) {
            FPDF_PAGEOBJECT sub_obj = FPDFFormObj_GetObject(obj, k);
            ExtractImagesFromObject(doc, page, sub_obj, output_dir, page_index, image_counter);
        }
        return;
    }

    if (type != FPDF_PAGEOBJ_IMAGE) return;

    // Scoped owner: the bitmap is destroyed on every exit path, including an
    // exception thrown while converting pixels or building the output path.
    struct BitmapOwner {
        explicit BitmapOwner(FPDF_BITMAP b) : handle(b) {}
        ~BitmapOwner() {
            if (handle) FPDFBitmap_Destroy(handle);
        }
        BitmapOwner(const BitmapOwner&) = delete;
        BitmapOwner& operator=(const BitmapOwner&) = delete;
        FPDF_BITMAP handle;
    };

    unsigned int logical_w = 0, logical_h = 0;
    FPDFImageObj_GetImagePixelSize(obj, &logical_w, &logical_h);

    const BitmapOwner bitmap(FPDFImageObj_GetRenderedBitmap(doc, page, obj));
    if (!bitmap.handle) return;

    const int bmp_w = FPDFBitmap_GetWidth(bitmap.handle);
    const int bmp_h = FPDFBitmap_GetHeight(bitmap.handle);
    const int bmp_format = FPDFBitmap_GetFormat(bitmap.handle);

    std::cout << "🖼️  Image " << image_counter
              << ": logical(" << logical_w << "x" << logical_h
              << "), bitmap(" << bmp_w << "x" << bmp_h
              << "), format=" << format_to_string(bmp_format) << std::endl;

    if (bmp_w <= 0 || bmp_h <= 0) {
        std::cerr << "⚠️  Invalid bitmap size, skipping.\n";
        return;
    }

    std::vector<unsigned char> rgb_data;
    if (!convert_to_rgb(bitmap.handle, rgb_data)) {
        return;
    }

    // Output path: zero-padded page and image numbers keep the files sortable.
    std::ostringstream oss;
    oss << output_dir << "/page_" << std::setw(2) << std::setfill('0') << page_index
        << "_img_" << std::setw(3) << std::setfill('0') << image_counter << ".jpg";
    const std::string output_path = oss.str();

    if (stbi_write_jpg(output_path.c_str(), bmp_w, bmp_h, 3, rgb_data.data(), 90)) {
        std::cout << "✅ Saved: " << output_path << std::endl;
    } else {
        std::cerr << "❌ Failed to write JPEG: " << output_path << std::endl;
    }

    image_counter++;
}

/// Entry point: loads a fixed PDF, walks every page object and exports the
/// images it finds into a fixed output directory.
///
/// @return 0 on success, -1 when the output directory cannot be created or the
///         document cannot be loaded.
int main() {
    FPDF_InitLibrary();

    const std::string input_pdf = "D:/BugPdf/bug.pdf";
    const std::string output_dir = "D:/image_out";

    // Make sure the output directory exists before any image file is written
    // into it; an already existing directory is not an error.
    std::error_code dir_error;
    std::filesystem::create_directories(output_dir, dir_error);
    if (dir_error) {
        std::cerr << "❌ Failed to create directory: " << output_dir
                  << " - " << dir_error.message() << std::endl;
        FPDF_DestroyLibrary();
        return -1;
    }

    FPDF_DOCUMENT document = FPDF_LoadDocument(input_pdf.c_str(), nullptr);
    if (!document) {
        unsigned long error = FPDF_GetLastError();
        std::cerr << "❌ Failed to load PDF document: " << input_pdf
                  << " (Error: " << error << ")" << std::endl;
        FPDF_DestroyLibrary();
        return -1;
    }

    const int page_count = FPDF_GetPageCount(document);
    std::cout << "📄 Document has " << page_count << " pages." << std::endl;

    // Shared across pages so exported file numbers never repeat.
    int total_images_exported = 0;

    for (int i = 0; i < page_count; i++) {
        FPDF_PAGE page = FPDF_LoadPage(document, i);
        if (!page) {
            // Report the skipped page instead of dropping it silently.
            std::cerr << "⚠️  Failed to load page " << i << ", skipping." << std::endl;
            continue;
        }

        const int obj_count = FPDFPage_CountObjects(page);
        std::cout << "📄 Page " << i << " has " << obj_count << " objects." << std::endl;

        for (int j = 0; j < obj_count; j++) {
            FPDF_PAGEOBJECT obj = FPDFPage_GetObject(page, j);
            ExtractImagesFromObject(document, page, obj, output_dir, i, total_images_exported);
        }

        FPDF_ClosePage(page);
    }

    FPDF_CloseDocument(document);
    FPDF_DestroyLibrary();

    std::cout << "\n🎉 Done! Exported " << total_images_exported << " images.\n";
    return 0;
}

✅ 新功能说明

| 功能 | 说明 |
| --- | --- |
| 🧩 支持递归 Form 对象 | 自动深入表单对象中的嵌套图像 |
| 🖼️ 图像计数全局递增 | 输出的文件名连续编号,不会覆盖 |
| 💪 完全兼容旧 PDFium 接口 | 不需要修改库,只调用公开 API |
| 🚫 不崩溃防护保留 | 异常位图尺寸仍会安全跳过 |

自动检测透明度的版本

完美 👍 那我来给你一个最终升级版:

✅ 支持 递归提取表单中的图像

✅ 自动判断输出格式:有透明通道 → PNG,不透明 → JPEG

✅ 保留安全检查与日志输出

✅ 命名连续编号,防止覆盖


💎 最终版本:自动透明检测输出(PNG/JPEG)

cpp 复制代码
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include <sstream>
#include <iomanip>
#include <filesystem>

#include "fpdfview.h"
#include "fpdf_edit.h"
#include "fpdf_save.h"

// ---------------- STB Image Write ----------------
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image_write.h"

/// Ensures that a directory (including any missing parents) exists.
///
/// @param path Directory path to create.
/// @return true when the directory exists after the call, whether it was just
///         created or was already there; false when creation failed or the
///         path refers to something that is not a directory.
bool create_directory(const std::string& path) {
    try {
        // create_directories() reports false when nothing new was created, so
        // its return value cannot distinguish "already exists" from failure.
        // Check the end state instead.
        std::filesystem::create_directories(path);
        return std::filesystem::is_directory(path);
    } catch (const std::exception& e) {
        std::cerr << "❌ Failed to create directory: " << path
                  << " - " << e.what() << std::endl;
        return false;
    }
}

/// Converts a PDFium bitmap into a tightly packed RGB or RGBA byte buffer.
///
/// Gray, BGR and BGRx bitmaps produce 3 bytes per pixel (R, G, B). BGRA and
/// BGRA_Premul bitmaps produce 4 bytes per pixel (R, G, B, A). For BGRA_Premul
/// the colour channels are divided by alpha first, because the caller writes
/// 4-channel data as PNG, which stores straight (non-premultiplied) alpha.
///
/// @param bitmap    Bitmap handle queried through the FPDFBitmap_* accessors.
/// @param out_data  Output buffer. On success it holds width * height * 3 (or
///                  * 4 when has_alpha is set) bytes, rows packed back to back
///                  in source row order. Left untouched on failure.
/// @param has_alpha Set to true when out_data carries an alpha channel, false
///                  otherwise (including on failure).
/// @return true on success; false for an unsupported format, a null pixel
///         buffer, a non-positive width/height/stride, or a stride that is too
///         small for one row of pixels.
bool convert_to_rgb(FPDF_BITMAP bitmap, std::vector<unsigned char>& out_data, bool& has_alpha) {
    has_alpha = false;

    const int format = FPDFBitmap_GetFormat(bitmap);
    const int width = FPDFBitmap_GetWidth(bitmap);
    const int height = FPDFBitmap_GetHeight(bitmap);
    const int stride = FPDFBitmap_GetStride(bitmap);
    const unsigned char* buffer =
        static_cast<const unsigned char*>(FPDFBitmap_GetBuffer(bitmap));

    // Number of source bytes per pixel, and whether alpha is kept in the output.
    std::size_t src_bpp = 0;
    bool keep_alpha = false;
    switch (format) {
    case FPDFBitmap_Gray:
        src_bpp = 1;
        break;
    case FPDFBitmap_BGR:
        src_bpp = 3;
        break;
    case FPDFBitmap_BGRx:
        src_bpp = 4;
        break;
    case FPDFBitmap_BGRA:
    case FPDFBitmap_BGRA_Premul:
        src_bpp = 4;
        keep_alpha = true;
        break;
    default:
        std::cerr << "❌ Unsupported bitmap format: " << format << std::endl;
        return false;
    }
    const bool premultiplied = (format == FPDFBitmap_BGRA_Premul);

    if (!buffer || width <= 0 || height <= 0 || stride <= 0) {
        std::cerr << "❌ Invalid bitmap buffer or dimensions" << std::endl;
        return false;
    }

    // All size arithmetic is done in size_t so large bitmaps cannot overflow int.
    const std::size_t cols = static_cast<std::size_t>(width);
    const std::size_t rows = static_cast<std::size_t>(height);
    const std::size_t row_bytes = static_cast<std::size_t>(stride);
    if (row_bytes < cols * src_bpp) {
        // Reading a full row would run past the bytes the stride accounts for.
        std::cerr << "❌ Invalid bitmap buffer or dimensions" << std::endl;
        return false;
    }

    // Undo premultiplication with rounding; fully transparent pixels carry no
    // recoverable colour and become black.
    const auto to_straight = [](unsigned value, unsigned alpha) -> unsigned char {
        if (alpha == 0) return 0;
        const unsigned straight = (value * 255u + alpha / 2u) / alpha;
        return static_cast<unsigned char>(straight > 255u ? 255u : straight);
    };

    const std::size_t dst_bpp = keep_alpha ? 4 : 3;
    out_data.resize(cols * rows * dst_bpp);
    unsigned char* dst = out_data.data();

    for (std::size_t y = 0; y < rows; ++y) {
        const unsigned char* src_row = buffer + y * row_bytes;
        for (std::size_t x = 0; x < cols; ++x) {
            const unsigned char* px = src_row + x * src_bpp;
            if (src_bpp == 1) {
                dst[0] = px[0];
                dst[1] = px[0];
                dst[2] = px[0];
            } else {
                // Source stores blue first; output stores red first.
                unsigned char r = px[2];
                unsigned char g = px[1];
                unsigned char b = px[0];
                if (keep_alpha) {
                    const unsigned char a = px[3];
                    if (premultiplied && a != 255) {
                        r = to_straight(r, a);
                        g = to_straight(g, a);
                        b = to_straight(b, a);
                    }
                    dst[3] = a;
                }
                dst[0] = r;
                dst[1] = g;
                dst[2] = b;
            }
            dst += dst_bpp;
        }
    }

    has_alpha = keep_alpha;
    return true;
}

/// Maps a PDFium bitmap format constant to a human-readable label.
///
/// Any value other than the known FPDFBitmap_* constants yields
/// "Unknown(<number>)".
std::string format_to_string(int format) {
    if (format == FPDFBitmap_Unknown) return "Unknown";
    if (format == FPDFBitmap_Gray) return "Gray";
    if (format == FPDFBitmap_BGR) return "BGR";
    if (format == FPDFBitmap_BGRx) return "BGRx";
    if (format == FPDFBitmap_BGRA) return "BGRA";
    if (format == FPDFBitmap_BGRA_Premul) return "BGRA_Premul";

    std::string label("Unknown(");
    label.append(std::to_string(format));
    label.append(")");
    return label;
}

/// Recursively exports the images reachable from one page object.
///
/// Image objects are rendered to a bitmap and written to
/// "<output_dir>/page_<PP>_img_<NNN>" with a ".png" extension when the
/// converted data carries an alpha channel and ".jpg" otherwise. Form objects
/// are descended into so that nested images are exported as well. Every other
/// object type is ignored.
///
/// @param doc           Document that owns the page.
/// @param page          Page the object belongs to; passed to the renderer.
/// @param obj           Page object to inspect; a null handle is ignored.
/// @param output_dir    Directory the files are written into.
/// @param page_index    Page number used in the file name.
/// @param image_counter Running image number used in the file name. It is
///                      advanced once per image for which a write was
///                      attempted, whether or not the write succeeded.
void ExtractImagesFromObject(FPDF_DOCUMENT doc, FPDF_PAGE page, FPDF_PAGEOBJECT obj,
                             const std::string& output_dir, int page_index, int& image_counter) {
    if (!obj) return;
    const int type = FPDFPageObj_GetType(obj);

    // Form object: walk its children and let the recursion handle each one.
    if (type == FPDF_PAGEOBJ_FORM) {
        const int sub_count = FPDFFormObj_CountObjects(obj);
        for (int k = 0; k < sub_count; ++k) {
            FPDF_PAGEOBJECT sub_obj = FPDFFormObj_GetObject(obj, k);
            ExtractImagesFromObject(doc, page, sub_obj, output_dir, page_index, image_counter);
        }
        return;
    }

    if (type != FPDF_PAGEOBJ_IMAGE) return;

    // Scoped owner: the bitmap is destroyed on every exit path, including an
    // exception thrown while converting pixels or building the output path.
    struct BitmapOwner {
        explicit BitmapOwner(FPDF_BITMAP b) : handle(b) {}
        ~BitmapOwner() {
            if (handle) FPDFBitmap_Destroy(handle);
        }
        BitmapOwner(const BitmapOwner&) = delete;
        BitmapOwner& operator=(const BitmapOwner&) = delete;
        FPDF_BITMAP handle;
    };

    unsigned int logical_w = 0, logical_h = 0;
    FPDFImageObj_GetImagePixelSize(obj, &logical_w, &logical_h);

    const BitmapOwner bitmap(FPDFImageObj_GetRenderedBitmap(doc, page, obj));
    if (!bitmap.handle) return;

    const int bmp_w = FPDFBitmap_GetWidth(bitmap.handle);
    const int bmp_h = FPDFBitmap_GetHeight(bitmap.handle);
    const int bmp_format = FPDFBitmap_GetFormat(bitmap.handle);

    std::cout << "🖼️  Image " << image_counter
              << ": logical(" << logical_w << "x" << logical_h
              << "), bitmap(" << bmp_w << "x" << bmp_h
              << "), format=" << format_to_string(bmp_format) << std::endl;

    if (bmp_w <= 0 || bmp_h <= 0) {
        std::cerr << "⚠️  Invalid bitmap size, skipping.\n";
        return;
    }

    std::vector<unsigned char> img_data;
    bool has_alpha = false;
    if (!convert_to_rgb(bitmap.handle, img_data, has_alpha)) {
        return;
    }

    // Output file name: the extension follows the presence of an alpha channel.
    std::ostringstream oss;
    oss << output_dir << "/page_" << std::setw(2) << std::setfill('0') << page_index
        << "_img_" << std::setw(3) << std::setfill('0') << image_counter
        << (has_alpha ? ".png" : ".jpg");
    const std::string output_path = oss.str();

    // Both writers return an int status; compare explicitly instead of relying
    // on an implicit int-to-bool conversion.
    bool success = false;
    if (has_alpha) {
        success = stbi_write_png(output_path.c_str(), bmp_w, bmp_h, 4, img_data.data(), bmp_w * 4) != 0;
    } else {
        success = stbi_write_jpg(output_path.c_str(), bmp_w, bmp_h, 3, img_data.data(), 90) != 0;
    }

    if (success)
        std::cout << "✅ Saved: " << output_path << std::endl;
    else
        std::cerr << "❌ Failed to write: " << output_path << std::endl;

    image_counter++;
}

/// Entry point: loads a fixed PDF, walks every page object and exports the
/// images it finds into a fixed output directory.
///
/// @return 0 on success, -1 when the output directory is unavailable or the
///         document cannot be loaded.
int main() {
    FPDF_InitLibrary();

    const std::string input_pdf = "D:/BugPdf/bug.pdf";
    const std::string output_dir = "D:/image_out";

    create_directory(output_dir);

    // Check the end state directly rather than the helper's return value, so
    // an already existing directory is accepted and a missing one is caught
    // before any write is attempted.
    std::error_code dir_error;
    if (!std::filesystem::is_directory(output_dir, dir_error)) {
        std::cerr << "❌ Output directory is not available: " << output_dir << std::endl;
        FPDF_DestroyLibrary();
        return -1;
    }

    FPDF_DOCUMENT document = FPDF_LoadDocument(input_pdf.c_str(), nullptr);
    if (!document) {
        unsigned long error = FPDF_GetLastError();
        std::cerr << "❌ Failed to load PDF document: " << input_pdf
                  << " (Error: " << error << ")" << std::endl;
        FPDF_DestroyLibrary();
        return -1;
    }

    const int page_count = FPDF_GetPageCount(document);
    std::cout << "📄 Document has " << page_count << " pages.\n";

    // Shared across pages so exported file numbers never repeat.
    int total_images = 0;

    for (int i = 0; i < page_count; i++) {
        FPDF_PAGE page = FPDF_LoadPage(document, i);
        if (!page) {
            // Report the skipped page instead of dropping it silently.
            std::cerr << "⚠️  Failed to load page " << i << ", skipping." << std::endl;
            continue;
        }

        const int obj_count = FPDFPage_CountObjects(page);
        std::cout << "\n📄 Page " << i << " has " << obj_count << " objects.\n";

        for (int j = 0; j < obj_count; j++) {
            FPDF_PAGEOBJECT obj = FPDFPage_GetObject(page, j);
            ExtractImagesFromObject(document, page, obj, output_dir, i, total_images);
        }

        FPDF_ClosePage(page);
    }

    FPDF_CloseDocument(document);
    FPDF_DestroyLibrary();

    std::cout << "\n🎉 Done! Exported " << total_images << " images.\n";
    return 0;
}

✨ 改进要点

| 功能 | 说明 |
| --- | --- |
| 🧩 递归 Form 支持 | 可提取表单对象内的嵌套图片 |
| 🧠 透明通道检测 | BGRA / Premul → PNG |
| 🪶 普通图片 → JPEG | 自动区分输出格式 |
| 🧱 安全检查 | 所有宽高、缓冲区安全验证保留 |
| 💾 自动创建输出目录 | 避免路径不存在报错 |
| 📸 连续命名 | page_XX_img_XXX 格式防覆盖 |

相关推荐
momo_al3 小时前
Umi-OCR制作双层PDF
pdf·ocr
Larry_Yanan3 小时前
QML学习笔记(四十三)QML与C++交互:上下文属性暴露
c++·笔记·qt·学习·ui·交互
励志成为美貌才华为一体的女子3 小时前
pdf解析工具---Miner-u 本地部署记录
学习·pdf
reasonsummer3 小时前
【办公类-115-02】20251018信息员每周通讯上传之文字稿整理(PDF转docx没有成功)
python·pdf
owCode3 小时前
4-C++智能指针
开发语言·c++
liu****3 小时前
10.queue的模拟实现
开发语言·数据结构·c++·算法
宋恩淇要努力3 小时前
C++多态
c++
哦你看看3 小时前
学习Python 03
开发语言·windows·python
小龙报4 小时前
《彻底理解C语言指针全攻略(6)-- qsort、sizeof和strlen》
c语言·开发语言·职场和发展·创业创新·学习方法·业界资讯·visual studio