Java 实现将 PDF 转换成 Word 文档

POM.xml

复制代码
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <!-- Maven build descriptor for the jackDemo Spring Boot project. Besides the Spring Boot
         starters it declares PDFBox (PDF reading) and docx4j (Word writing), which the
         PDF-to-Word conversion code relies on. -->
    <modelVersion>4.0.0</modelVersion>
    <parent>
       <groupId>org.springframework.boot</groupId>
       <artifactId>spring-boot-starter-parent</artifactId>
<!--       <version>3.2.1</version>-->
       <version>2.3.9.RELEASE</version>
       <relativePath/> <!-- lookup parent from repository -->
    </parent>
    <groupId>com.jack</groupId>
    <artifactId>jackDemo</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>jackDemo</name>
    <description>jackDemo</description>
    <properties>
       <java.version>1.8</java.version>
    </properties>
    <dependencies>
       <dependency>
          <groupId>org.springframework.boot</groupId>
          <artifactId>spring-boot-starter-data-mongodb</artifactId>
       </dependency>
       <dependency>
          <groupId>org.springframework.boot</groupId>
          <artifactId>spring-boot-starter-web</artifactId>
       </dependency>

       <dependency>
          <groupId>org.springframework.boot</groupId>
          <artifactId>spring-boot-devtools</artifactId>
          <scope>runtime</scope>
          <optional>true</optional>
       </dependency>
       <dependency>
          <groupId>org.springframework.boot</groupId>
          <artifactId>spring-boot-starter-test</artifactId>
          <scope>test</scope>
       </dependency>

       <dependency>
          <groupId>com.alibaba</groupId>
          <artifactId>fastjson</artifactId>
          <version>1.2.75</version>
       </dependency>


       <dependency>
          <groupId>org.openpnp</groupId>
          <artifactId>opencv</artifactId>
          <version>4.5.3-4</version>
       </dependency>

       <!-- Apache POI for Excel files -->
       <dependency>
          <groupId>org.apache.poi</groupId>
          <artifactId>poi-ooxml</artifactId>
          <version>5.2.3</version> <!-- check for and use the latest version -->
       </dependency>

       <!-- Apache POI dependencies (these may be included automatically by Maven, but it's good to be explicit) -->
       <dependency>
          <groupId>org.apache.poi</groupId>
          <artifactId>poi</artifactId>
          <version>5.2.3</version> <!-- keep in sync with the poi-ooxml version -->
       </dependency>

       <!-- Apache Commons Collections (required by POI) -->
       <dependency>
          <groupId>org.apache.commons</groupId>
          <artifactId>commons-collections4</artifactId>
          <version>4.4</version> <!-- make sure the version is compatible with your project -->
       </dependency>

       <!-- Apache Commons IO (optional, but useful for file handling) -->
       <dependency>
          <groupId>commons-io</groupId>
          <artifactId>commons-io</artifactId>
          <version>2.11.0</version> <!-- make sure the version is compatible with your project -->
       </dependency>

       <!-- PDFBox for reading PDF files -->
       <dependency>
          <groupId>org.apache.pdfbox</groupId>
          <artifactId>pdfbox</artifactId>
          <version>2.0.24</version>
       </dependency>

       <!-- docx4j for creating Word documents -->
       <dependency>
          <groupId>org.docx4j</groupId>
          <artifactId>docx4j</artifactId>
          <version>3.2.1</version>
       </dependency>

       <!-- JAXB API and runtime. NOTE(review): presumably declared for docx4j's XML binding
            on JDKs that do not bundle JAXB; confirm whether they are needed with java.version 1.8. -->
       <dependency>
          <groupId>javax.xml.bind</groupId>
          <artifactId>jaxb-api</artifactId>
          <version>2.3.1</version>
       </dependency>
       <dependency>
          <groupId>org.glassfish.jaxb</groupId>
          <artifactId>jaxb-runtime</artifactId>
          <version>2.3.1</version>
       </dependency>

    </dependencies>


    <build>
       <plugins>
          <plugin>
             <groupId>org.springframework.boot</groupId>
             <artifactId>spring-boot-maven-plugin</artifactId>
          </plugin>
       </plugins>
    </build>

</project>

Java 文件:

复制代码
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.docx4j.openpackaging.parts.WordprocessingML.MainDocumentPart;
import org.docx4j.wml.Body;
import org.docx4j.wml.P;
import org.docx4j.wml.R;
import org.docx4j.wml.Text;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;

/**
 * Converts the text content of a PDF file into a Word (.docx) document.
 *
 * <p>Text is extracted with PDFBox's {@code PDFTextStripper} and written with docx4j, producing
 * one Word paragraph per line of extracted text. Only the extracted text is written to the
 * output document.
 */
public class PdfToWordConverter {

    /** PDF read when no path is given on the command line. */
    private static final String DEFAULT_PDF_PATH = "D:\\word\\何以为父影响彼此一生的父子关系.pdf";

    /** Word file written when no path is given on the command line. */
    private static final String DEFAULT_WORD_PATH = "D:\\word\\何以为父影响彼此一生的父子关系.docx";

    /**
     * Command-line entry point.
     *
     * @param args optional; when at least two values are given, {@code args[0]} is the source PDF
     *     path and {@code args[1]} is the target .docx path. Otherwise the built-in default paths
     *     are used.
     * @throws Exception if the PDF cannot be read or the Word document cannot be created or saved
     */
    public static void main(String[] args) throws Exception{
        boolean pathsGiven = args != null && args.length >= 2;
        String pdfFilePath = pathsGiven ? args[0] : DEFAULT_PDF_PATH;
        String wordFilePath = pathsGiven ? args[1] : DEFAULT_WORD_PATH;

        // Failures are deliberately not caught here: letting them propagate makes the JVM print
        // the stack trace and exit with a non-zero status instead of reporting success.
        String pdfText = extractTextFromPdf(pdfFilePath);
        createWordDocument(wordFilePath, pdfText);

        System.out.println("PDF to Word conversion completed successfully!");
    }

    /**
     * Extracts all text from the PDF at the given path.
     *
     * @param filePath path of the PDF file to read
     * @return the text returned by {@code PDFTextStripper#getText} for the whole document
     * @throws IOException if the file cannot be opened or parsed
     */
    public static String extractTextFromPdf(String filePath) throws IOException {
        // Both the stream and the document hold resources; close them even if extraction fails.
        try (FileInputStream in = new FileInputStream(filePath);
             PDDocument document = PDDocument.load(in)) {
            PDFTextStripper pdfStripper = new PDFTextStripper();
            return pdfStripper.getText(document);
        }
    }

    /**
     * Creates a Word document containing {@code content}, one paragraph per line.
     *
     * @param filePath path of the .docx file to write
     * @param content text to write; it is split on {@code \n} or {@code \r\n} line breaks
     * @throws Exception if the document cannot be built or saved
     */
    public static void createWordDocument(String filePath, String content) throws Exception {
        WordprocessingMLPackage wordMLPackage = WordprocessingMLPackage.createPackage();
        MainDocumentPart mainDocumentPart = wordMLPackage.getMainDocumentPart();
        Body body = mainDocumentPart.getContents().getBody();

        // Split the content into lines and add each one as its own paragraph.
        String[] paragraphs = content.split("\\r?\\n");
        for (String paragraph : paragraphs) {
            Text text = new Text();
            // setValue stores the visible text; setParent only records the JAXB parent object
            // and would leave the run empty.
            text.setValue(paragraph);
            // Keep leading/trailing whitespace of the extracted line (xml:space="preserve").
            text.setSpace("preserve");

            R r = new R();
            r.getContent().add(text);

            P p = new P();
            p.getContent().add(r);
            body.getContent().add(p);
        }

        // Save the Word document.
        try (FileOutputStream out = new FileOutputStream(new File(filePath))) {
            wordMLPackage.save(out);
        }
    }
}
相关推荐
小江的记录本41 分钟前
【JVM虚拟机】堆内存分代模型:年轻代(Eden+Survivor)、老年代、元空间Metaspace(附《思维导图》+《面试高频考点清单》)
java·前端·jvm·后端·python·spring·面试
在繁华处1 小时前
Java从零到熟练(三):流程控制
java·开发语言·python
tedcloud1231 小时前
DeepSeek-TUI部署教程:打造CLI AI助手环境
服务器·人工智能·word·excel·dreamweaver
唐青枫1 小时前
Java Optional 实战指南:优雅处理空值与链式转换
java
一起学开源1 小时前
一文读懂 ReAct 范式:让 AI Agent 真正学会“思考+行动“
java·javascript·react.js·ecmascript·react·alibaba·智能体开发
asdzx671 小时前
使用 Python 快速提取 PDF 中的表格
python·pdf
逍遥德2 小时前
MQTT教程详解-04.SpringBoot集成MQTT(告别手动控制)
java·spring boot·物联网·中间件·iot·iotdb
语戚2 小时前
力扣 3161. 块放置查询:线段树解法(Java 实现)
java·算法·leetcode·面试·线段树·力扣·
我命由我123453 小时前
Android 开发问题:MlKitException: An internal error occurred during initialization.
android·java·java-ee·android jetpack·android-studio·androidx·android runtime
888CC++3 小时前
java 并发编程
java·开发语言·python