✅ 修正版:支持递归提取表单中的图片对象
我在你的基础上只增加了一个 递归遍历函数,不影响你现有逻辑和日志输出。
cpp
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include <sstream>
#include <iomanip>
#include <filesystem>
#include "fpdfview.h"
#include "fpdf_edit.h"
#include "fpdf_save.h"
// stb_image_write 实现 JPEG 保存
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image_write.h"
// Converts a PDFium bitmap into a tightly packed 8-bit RGB buffer.
//
// Parameters:
//   bitmap   - source bitmap; Gray, BGR, BGRx, BGRA and BGRA_Premul are handled.
//   rgb_data - receives width * height * 3 bytes in R,G,B order, rows packed
//              without padding. Left untouched when the function fails.
//
// Returns true on success. Returns false (after logging to stderr) when the
// bitmap is null, has no pixel buffer, reports non-positive dimensions, has a
// stride shorter than one row of pixels, or uses an unsupported format.
//
// The fourth byte of 32-bit formats is discarded; colour bytes are copied
// as-is, without blending against a background.
bool convert_to_rgb(FPDF_BITMAP bitmap, std::vector<unsigned char>& rgb_data) {
    if (!bitmap) {
        std::cerr << "❌ Null bitmap passed to convert_to_rgb" << std::endl;
        return false;
    }

    const int format = FPDFBitmap_GetFormat(bitmap);
    const int width = FPDFBitmap_GetWidth(bitmap);
    const int height = FPDFBitmap_GetHeight(bitmap);
    const int stride = FPDFBitmap_GetStride(bitmap);
    const unsigned char* buffer =
        static_cast<const unsigned char*>(FPDFBitmap_GetBuffer(bitmap));

    // Reject anything that would make the row loops read through a null
    // pointer or turn the size computation below into a bogus allocation.
    if (!buffer || width <= 0 || height <= 0 || stride <= 0) {
        std::cerr << "❌ Invalid bitmap buffer or dimensions" << std::endl;
        return false;
    }

    // Number of source bytes occupied by one pixel in each supported format.
    std::size_t src_bpp = 0;
    switch (format) {
        case FPDFBitmap_Gray:
            src_bpp = 1;
            break;
        case FPDFBitmap_BGR:
            src_bpp = 3;
            break;
        case FPDFBitmap_BGRx:
        case FPDFBitmap_BGRA:
        case FPDFBitmap_BGRA_Premul:
            src_bpp = 4;
            break;
        default:
            std::cerr << "❌ Unsupported bitmap format: " << format << std::endl;
            return false;
    }

    // All size arithmetic is done in size_t: width * height * 3 overflows int
    // for large bitmaps.
    const std::size_t row_pixels = static_cast<std::size_t>(width);
    const std::size_t row_count = static_cast<std::size_t>(height);
    const std::size_t row_stride = static_cast<std::size_t>(stride);
    if (row_stride < row_pixels * src_bpp) {
        std::cerr << "❌ Bitmap stride is smaller than one row of pixels" << std::endl;
        return false;
    }

    rgb_data.resize(row_pixels * row_count * 3);
    unsigned char* dst = rgb_data.data();
    for (std::size_t y = 0; y < row_count; ++y) {
        const unsigned char* src = buffer + y * row_stride;
        for (std::size_t x = 0; x < row_pixels; ++x, src += src_bpp, dst += 3) {
            if (src_bpp == 1) {
                // Grayscale: replicate the single sample into all channels.
                dst[0] = src[0];
                dst[1] = src[0];
                dst[2] = src[0];
            } else {
                // Source stores B,G,R(,x/A); swap into R,G,B.
                dst[0] = src[2];
                dst[1] = src[1];
                dst[2] = src[0];
            }
        }
    }
    return true;
}
// Returns a human-readable label for a PDFium bitmap format code.
// Codes that are not in the table are rendered as "Unknown(<code>)".
std::string format_to_string(int format) {
    struct NamedFormat {
        int code;
        const char* label;
    };
    static const NamedFormat kKnownFormats[] = {
        {FPDFBitmap_Unknown, "Unknown"},
        {FPDFBitmap_Gray, "Gray"},
        {FPDFBitmap_BGR, "BGR"},
        {FPDFBitmap_BGRx, "BGRx"},
        {FPDFBitmap_BGRA, "BGRA"},
        {FPDFBitmap_BGRA_Premul, "BGRA_Premul"},
    };

    for (const NamedFormat& entry : kKnownFormats) {
        if (entry.code == format) {
            return entry.label;
        }
    }

    std::string fallback = "Unknown(";
    fallback += std::to_string(format);
    fallback += ")";
    return fallback;
}
// Recursively exports every image reachable from a page object as a JPEG.
//
// Image objects are rendered through FPDFImageObj_GetRenderedBitmap, converted
// to RGB and written to
//   <output_dir>/page_<page_index>_img_<image_counter>.jpg
// Form objects are walked child by child, so images nested inside forms are
// exported too. All other object types are ignored.
//
// Parameters:
//   doc, page     - document and page that own `obj`.
//   obj           - page object to inspect; null is ignored.
//   output_dir    - directory the JPEG files are written into.
//   page_index    - page number used in the file name.
//   image_counter - number of images written so far; used as the next file
//                   index and incremented only after a successful write, so
//                   the caller's total reflects files actually saved.
void ExtractImagesFromObject(FPDF_DOCUMENT doc, FPDF_PAGE page, FPDF_PAGEOBJECT obj,
                             const std::string& output_dir, int page_index, int& image_counter) {
    if (!obj) return;

    const int type = FPDFPageObj_GetType(obj);

    // Form object: descend into its children.
    if (type == FPDF_PAGEOBJ_FORM) {
        const int sub_count = FPDFFormObj_CountObjects(obj);
        for (int k = 0; k < sub_count; ++k) {
            FPDF_PAGEOBJECT sub_obj = FPDFFormObj_GetObject(obj, k);
            ExtractImagesFromObject(doc, page, sub_obj, output_dir, page_index, image_counter);
        }
        return;
    }

    if (type != FPDF_PAGEOBJ_IMAGE) return;

    unsigned int logical_w = 0, logical_h = 0;
    FPDFImageObj_GetImagePixelSize(obj, &logical_w, &logical_h);

    // Destroys the rendered bitmap on every exit path of this function.
    struct BitmapGuard {
        FPDF_BITMAP handle;
        explicit BitmapGuard(FPDF_BITMAP h) : handle(h) {}
        ~BitmapGuard() {
            if (handle) FPDFBitmap_Destroy(handle);
        }
        BitmapGuard(const BitmapGuard&) = delete;
        BitmapGuard& operator=(const BitmapGuard&) = delete;
    };

    const BitmapGuard bitmap(FPDFImageObj_GetRenderedBitmap(doc, page, obj));
    if (!bitmap.handle) {
        std::cerr << "⚠️ Failed to render image object, skipping.\n";
        return;
    }

    const int bmp_w = FPDFBitmap_GetWidth(bitmap.handle);
    const int bmp_h = FPDFBitmap_GetHeight(bitmap.handle);
    const int bmp_format = FPDFBitmap_GetFormat(bitmap.handle);
    std::cout << "🖼️ Image " << image_counter
              << ": logical(" << logical_w << "x" << logical_h
              << "), bitmap(" << bmp_w << "x" << bmp_h
              << "), format=" << format_to_string(bmp_format) << std::endl;

    if (bmp_w <= 0 || bmp_h <= 0) {
        std::cerr << "⚠️ Invalid bitmap size, skipping.\n";
        return;
    }

    std::vector<unsigned char> rgb_data;
    if (!convert_to_rgb(bitmap.handle, rgb_data)) {
        return;
    }

    // Output path: zero-padded page and image indices.
    std::ostringstream oss;
    oss << output_dir << "/page_" << std::setw(2) << std::setfill('0') << page_index
        << "_img_" << std::setw(3) << std::setfill('0') << image_counter << ".jpg";
    const std::string output_path = oss.str();

    if (stbi_write_jpg(output_path.c_str(), bmp_w, bmp_h, 3, rgb_data.data(), 90)) {
        std::cout << "✅ Saved: " << output_path << std::endl;
        // Count only files that were written; a failed write leaves the index
        // free for the next image.
        image_counter++;
    } else {
        std::cerr << "❌ Failed to write JPEG: " << output_path << std::endl;
    }
}
// Entry point: loads the hard-coded input PDF, walks every page object and
// exports the images it finds into the hard-coded output directory.
// Returns 0 on success and -1 when the output directory cannot be created or
// the document cannot be loaded.
int main() {
    const std::string input_pdf = "D:/BugPdf/bug.pdf";
    const std::string output_dir = "D:/image_out";

    // The image files are written to paths inside output_dir, so make sure the
    // directory exists before any extraction starts.
    try {
        std::filesystem::create_directories(output_dir);
    } catch (const std::exception& e) {
        std::cerr << "❌ Failed to create directory: " << output_dir
                  << " - " << e.what() << std::endl;
        return -1;
    }

    FPDF_InitLibrary();

    FPDF_DOCUMENT document = FPDF_LoadDocument(input_pdf.c_str(), nullptr);
    if (!document) {
        unsigned long error = FPDF_GetLastError();
        std::cerr << "❌ Failed to load PDF document: " << input_pdf
                  << " (Error: " << error << ")" << std::endl;
        FPDF_DestroyLibrary();
        return -1;
    }

    const int page_count = FPDF_GetPageCount(document);
    std::cout << "📄 Document has " << page_count << " pages." << std::endl;

    int total_images_exported = 0;
    for (int i = 0; i < page_count; i++) {
        FPDF_PAGE page = FPDF_LoadPage(document, i);
        if (!page) {
            // Report the skipped page instead of dropping it silently.
            std::cerr << "⚠️ Failed to load page " << i << ", skipping.\n";
            continue;
        }

        const int obj_count = FPDFPage_CountObjects(page);
        std::cout << "📄 Page " << i << " has " << obj_count << " objects." << std::endl;
        for (int j = 0; j < obj_count; j++) {
            FPDF_PAGEOBJECT obj = FPDFPage_GetObject(page, j);
            ExtractImagesFromObject(document, page, obj, output_dir, i, total_images_exported);
        }
        FPDF_ClosePage(page);
    }

    FPDF_CloseDocument(document);
    FPDF_DestroyLibrary();
    std::cout << "\n🎉 Done! Exported " << total_images_exported << " images.\n";
    return 0;
}
✅ 新功能说明
| 功能 | 说明 |
|---|---|
| 🧩 支持递归 Form 对象 | 自动深入 Form 对象(Form XObject),提取其中嵌套的图像 |
| 🖼️ 图像计数全局递增 | 输出的文件名连续编号,不会互相覆盖 |
| 💪 完全兼容旧 PDFium 接口 | 不需要修改库,只调用公开 API |
| 🚫 保留防崩溃保护 | 尺寸异常的位图仍会被安全跳过 |
自动检测透明度的版本
完美 👍 那我来给你一个最终升级版:
✅ 支持递归提取 Form 对象(Form XObject)中的图像
✅ 自动判断输出格式:带透明通道 → PNG,不透明 → JPEG
✅ 保留安全检查与日志输出
✅ 文件名连续编号,防止覆盖
💎 最终版本:自动透明检测输出(PNG/JPEG)
cpp
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include <sstream>
#include <iomanip>
#include <filesystem>
#include "fpdfview.h"
#include "fpdf_edit.h"
#include "fpdf_save.h"
// ---------------- STB Image Write ----------------
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image_write.h"
// Ensures that `path` exists as a directory, creating missing parent
// directories as needed.
//
// Returns true when the directory exists after the call, whether it was
// created now or was already present. Returns false when creation fails or
// when `path` names something that is not a directory; exceptions raised by
// the filesystem library are logged to stderr and reported as false.
bool create_directory(const std::string& path) {
    try {
        // create_directories() yields false for an already existing directory,
        // so its result cannot be used as the success flag; check the final
        // state instead.
        std::filesystem::create_directories(path);
        return std::filesystem::is_directory(path);
    } catch (const std::exception& e) {
        std::cerr << "❌ Failed to create directory: " << path
                  << " - " << e.what() << std::endl;
        return false;
    }
}
// Converts a PDFium bitmap into a tightly packed RGB or RGBA buffer.
//
// Parameters:
//   bitmap    - source bitmap; Gray, BGR, BGRx, BGRA and BGRA_Premul are handled.
//   out_data  - receives the pixels, rows packed without padding:
//               3 bytes per pixel (R,G,B) for Gray/BGR/BGRx,
//               4 bytes per pixel (R,G,B,A) for BGRA/BGRA_Premul.
//               Left untouched when the function fails.
//   has_alpha - set to true when out_data holds 4-channel RGBA, false otherwise
//               (including on failure).
//
// Returns true on success. Returns false (after logging to stderr) when the
// bitmap is null, has no pixel buffer, reports non-positive dimensions, has a
// stride shorter than one row of pixels, or uses an unsupported format.
//
// BGRA_Premul pixels are converted back to straight (non-premultiplied) alpha,
// because the RGBA output is meant to be written as PNG, which stores
// non-premultiplied colour.
bool convert_to_rgb(FPDF_BITMAP bitmap, std::vector<unsigned char>& out_data, bool& has_alpha) {
    has_alpha = false;
    if (!bitmap) {
        std::cerr << "❌ Null bitmap passed to convert_to_rgb" << std::endl;
        return false;
    }

    const int format = FPDFBitmap_GetFormat(bitmap);
    const int width = FPDFBitmap_GetWidth(bitmap);
    const int height = FPDFBitmap_GetHeight(bitmap);
    const int stride = FPDFBitmap_GetStride(bitmap);
    const unsigned char* buffer =
        static_cast<const unsigned char*>(FPDFBitmap_GetBuffer(bitmap));

    // Reject anything that would make the row loops read through a null
    // pointer or turn the size computation below into a bogus allocation.
    if (!buffer || width <= 0 || height <= 0 || stride <= 0) {
        std::cerr << "❌ Invalid bitmap buffer or dimensions" << std::endl;
        return false;
    }

    std::size_t src_bpp = 0;   // bytes per source pixel
    bool keep_alpha = false;   // whether the 4th source byte is real alpha
    switch (format) {
        case FPDFBitmap_Gray:
            src_bpp = 1;
            break;
        case FPDFBitmap_BGR:
            src_bpp = 3;
            break;
        case FPDFBitmap_BGRx:
            src_bpp = 4;
            break;
        case FPDFBitmap_BGRA:
        case FPDFBitmap_BGRA_Premul:
            src_bpp = 4;
            keep_alpha = true;
            break;
        default:
            std::cerr << "❌ Unsupported bitmap format: " << format << std::endl;
            return false;
    }
    const bool premultiplied = (format == FPDFBitmap_BGRA_Premul);

    // All size arithmetic is done in size_t: width * height * 4 overflows int
    // for large bitmaps.
    const std::size_t row_pixels = static_cast<std::size_t>(width);
    const std::size_t row_count = static_cast<std::size_t>(height);
    const std::size_t row_stride = static_cast<std::size_t>(stride);
    if (row_stride < row_pixels * src_bpp) {
        std::cerr << "❌ Bitmap stride is smaller than one row of pixels" << std::endl;
        return false;
    }

    // Undo premultiplication for one channel, rounding to nearest and clamping
    // to 255. Fully transparent pixels carry no colour information and map to 0.
    const auto unpremultiply = [](unsigned char channel, unsigned char alpha) -> unsigned char {
        if (alpha == 0) return 0;
        const unsigned int value = (channel * 255u + alpha / 2u) / alpha;
        return static_cast<unsigned char>(value > 255u ? 255u : value);
    };

    const std::size_t dst_bpp = keep_alpha ? 4 : 3;
    out_data.resize(row_pixels * row_count * dst_bpp);
    unsigned char* dst = out_data.data();
    for (std::size_t y = 0; y < row_count; ++y) {
        const unsigned char* src = buffer + y * row_stride;
        for (std::size_t x = 0; x < row_pixels; ++x, src += src_bpp, dst += dst_bpp) {
            if (src_bpp == 1) {
                // Grayscale: replicate the single sample into all channels.
                dst[0] = src[0];
                dst[1] = src[0];
                dst[2] = src[0];
                continue;
            }

            // Source stores B,G,R(,x/A); swap into R,G,B.
            unsigned char r = src[2];
            unsigned char g = src[1];
            unsigned char b = src[0];
            if (keep_alpha) {
                const unsigned char a = src[3];
                if (premultiplied && a != 255) {
                    r = unpremultiply(r, a);
                    g = unpremultiply(g, a);
                    b = unpremultiply(b, a);
                }
                dst[3] = a;
            }
            dst[0] = r;
            dst[1] = g;
            dst[2] = b;
        }
    }

    has_alpha = keep_alpha;
    return true;
}
// Maps a PDFium bitmap format code to a printable name.
// Unrecognised codes come back as "Unknown(<code>)".
std::string format_to_string(int format) {
    if (format == FPDFBitmap_Unknown) {
        return "Unknown";
    }
    if (format == FPDFBitmap_Gray) {
        return "Gray";
    }
    if (format == FPDFBitmap_BGR) {
        return "BGR";
    }
    if (format == FPDFBitmap_BGRx) {
        return "BGRx";
    }
    if (format == FPDFBitmap_BGRA) {
        return "BGRA";
    }
    if (format == FPDFBitmap_BGRA_Premul) {
        return "BGRA_Premul";
    }

    const std::string code_text = std::to_string(format);
    return "Unknown(" + code_text + ")";
}
// Recursively exports every image reachable from a page object.
//
// Image objects are rendered through FPDFImageObj_GetRenderedBitmap and
// converted with convert_to_rgb. Bitmaps reported as having alpha are written
// as PNG (RGBA), all others as JPEG (RGB, quality 90), to
//   <output_dir>/page_<page_index>_img_<image_counter>.<png|jpg>
// Form objects are walked child by child, so images nested inside forms are
// exported too. All other object types are ignored.
//
// Parameters:
//   doc, page     - document and page that own `obj`.
//   obj           - page object to inspect; null is ignored.
//   output_dir    - directory the image files are written into.
//   page_index    - page number used in the file name.
//   image_counter - number of images written so far; used as the next file
//                   index and incremented only after a successful write, so
//                   the caller's total reflects files actually saved.
void ExtractImagesFromObject(FPDF_DOCUMENT doc, FPDF_PAGE page, FPDF_PAGEOBJECT obj,
                             const std::string& output_dir, int page_index, int& image_counter) {
    if (!obj) return;

    const int type = FPDFPageObj_GetType(obj);

    // Form object: descend into its children.
    if (type == FPDF_PAGEOBJ_FORM) {
        const int sub_count = FPDFFormObj_CountObjects(obj);
        for (int k = 0; k < sub_count; ++k) {
            FPDF_PAGEOBJECT sub_obj = FPDFFormObj_GetObject(obj, k);
            ExtractImagesFromObject(doc, page, sub_obj, output_dir, page_index, image_counter);
        }
        return;
    }

    if (type != FPDF_PAGEOBJ_IMAGE) return;

    unsigned int logical_w = 0, logical_h = 0;
    FPDFImageObj_GetImagePixelSize(obj, &logical_w, &logical_h);

    // Destroys the rendered bitmap on every exit path of this function.
    struct BitmapGuard {
        FPDF_BITMAP handle;
        explicit BitmapGuard(FPDF_BITMAP h) : handle(h) {}
        ~BitmapGuard() {
            if (handle) FPDFBitmap_Destroy(handle);
        }
        BitmapGuard(const BitmapGuard&) = delete;
        BitmapGuard& operator=(const BitmapGuard&) = delete;
    };

    const BitmapGuard bitmap(FPDFImageObj_GetRenderedBitmap(doc, page, obj));
    if (!bitmap.handle) {
        std::cerr << "⚠️ Failed to render image object, skipping.\n";
        return;
    }

    const int bmp_w = FPDFBitmap_GetWidth(bitmap.handle);
    const int bmp_h = FPDFBitmap_GetHeight(bitmap.handle);
    const int bmp_format = FPDFBitmap_GetFormat(bitmap.handle);
    std::cout << "🖼️ Image " << image_counter
              << ": logical(" << logical_w << "x" << logical_h
              << "), bitmap(" << bmp_w << "x" << bmp_h
              << "), format=" << format_to_string(bmp_format) << std::endl;

    if (bmp_w <= 0 || bmp_h <= 0) {
        std::cerr << "⚠️ Invalid bitmap size, skipping.\n";
        return;
    }

    std::vector<unsigned char> img_data;
    bool has_alpha = false;
    if (!convert_to_rgb(bitmap.handle, img_data, has_alpha)) {
        return;
    }

    // Output file name: zero-padded indices, extension chosen by alpha presence.
    std::ostringstream oss;
    oss << output_dir << "/page_" << std::setw(2) << std::setfill('0') << page_index
        << "_img_" << std::setw(3) << std::setfill('0') << image_counter
        << (has_alpha ? ".png" : ".jpg");
    const std::string output_path = oss.str();

    // Both stb writers return non-zero on success.
    const bool success = has_alpha
        ? stbi_write_png(output_path.c_str(), bmp_w, bmp_h, 4, img_data.data(), bmp_w * 4) != 0
        : stbi_write_jpg(output_path.c_str(), bmp_w, bmp_h, 3, img_data.data(), 90) != 0;

    if (success) {
        std::cout << "✅ Saved: " << output_path << std::endl;
        // Count only files that were written; a failed write leaves the index
        // free for the next image.
        image_counter++;
    } else {
        std::cerr << "❌ Failed to write: " << output_path << std::endl;
    }
}
// Entry point: loads the hard-coded input PDF, walks every page object and
// exports the images it finds into the hard-coded output directory.
// Returns 0 on success and -1 when the output directory is unusable or the
// document cannot be loaded.
int main() {
    const std::string input_pdf = "D:/BugPdf/bug.pdf";
    const std::string output_dir = "D:/image_out";

    // Prepare the output directory, then verify the final state directly
    // instead of relying on the helper's return value: a directory that was
    // already present is still a usable target.
    create_directory(output_dir);
    bool output_ready = false;
    try {
        output_ready = std::filesystem::is_directory(output_dir);
    } catch (const std::exception&) {
        output_ready = false;
    }
    if (!output_ready) {
        std::cerr << "❌ Output directory is not available: " << output_dir << std::endl;
        return -1;
    }

    FPDF_InitLibrary();

    FPDF_DOCUMENT document = FPDF_LoadDocument(input_pdf.c_str(), nullptr);
    if (!document) {
        unsigned long error = FPDF_GetLastError();
        std::cerr << "❌ Failed to load PDF document: " << input_pdf
                  << " (Error: " << error << ")" << std::endl;
        FPDF_DestroyLibrary();
        return -1;
    }

    const int page_count = FPDF_GetPageCount(document);
    std::cout << "📄 Document has " << page_count << " pages.\n";

    int total_images = 0;
    for (int i = 0; i < page_count; i++) {
        FPDF_PAGE page = FPDF_LoadPage(document, i);
        if (!page) {
            // Report the skipped page instead of dropping it silently.
            std::cerr << "⚠️ Failed to load page " << i << ", skipping.\n";
            continue;
        }

        const int obj_count = FPDFPage_CountObjects(page);
        std::cout << "\n📄 Page " << i << " has " << obj_count << " objects.\n";
        for (int j = 0; j < obj_count; j++) {
            FPDF_PAGEOBJECT obj = FPDFPage_GetObject(page, j);
            ExtractImagesFromObject(document, page, obj, output_dir, i, total_images);
        }
        FPDF_ClosePage(page);
    }

    FPDF_CloseDocument(document);
    FPDF_DestroyLibrary();
    std::cout << "\n🎉 Done! Exported " << total_images << " images.\n";
    return 0;
}
✨ 改进要点
| 功能 | 说明 |
|---|---|
| 🧩 递归 Form 支持 | 可提取 Form 对象(Form XObject)内的嵌套图片 |
| 🧠 透明通道检测 | 根据位图格式判断:BGRA / BGRA_Premul → PNG |
| 🪶 普通图片 → JPEG | 其余格式(Gray / BGR / BGRx)输出为 JPEG,自动区分输出格式 |
| 🧱 安全检查 | 保留位图宽高的有效性检查 |
| 💾 自动创建输出目录 | 避免路径不存在报错 |
| 📸 连续命名 | page_XX_img_XXX 格式,防止覆盖 |