JAVA实现将PDF转换成word文档

POM.xml

复制代码
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
       <groupId>org.springframework.boot</groupId>
       <artifactId>spring-boot-starter-parent</artifactId>
<!--       <version>3.2.1</version>-->
       <version>2.3.9.RELEASE</version>
       <relativePath/> <!-- lookup parent from repository -->
    </parent>
    <groupId>com.jack</groupId>
    <artifactId>jackDemo</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>jackDemo</name>
    <description>jackDemo</description>
    <properties>
       <java.version>1.8</java.version>
    </properties>
    <dependencies>
       <dependency>
          <groupId>org.springframework.boot</groupId>
          <artifactId>spring-boot-starter-data-mongodb</artifactId>
       </dependency>
       <dependency>
          <groupId>org.springframework.boot</groupId>
          <artifactId>spring-boot-starter-web</artifactId>
       </dependency>

       <dependency>
          <groupId>org.springframework.boot</groupId>
          <artifactId>spring-boot-devtools</artifactId>
          <scope>runtime</scope>
          <optional>true</optional>
       </dependency>
       <dependency>
          <groupId>org.springframework.boot</groupId>
          <artifactId>spring-boot-starter-test</artifactId>
          <scope>test</scope>
       </dependency>

       <dependency>
          <groupId>com.alibaba</groupId>
          <artifactId>fastjson</artifactId>
          <version>1.2.75</version>
       </dependency>


       <dependency>
          <groupId>org.openpnp</groupId>
          <artifactId>opencv</artifactId>
          <version>4.5.3-4</version>
       </dependency>

       <!-- Apache POI for Excel files -->
       <dependency>
          <groupId>org.apache.poi</groupId>
          <artifactId>poi-ooxml</artifactId>
          <version>5.2.3</version> <!-- 请检查并使用最新版本 -->
       </dependency>

       <!-- Apache POI dependencies (these may be included automatically by Maven, but it's good to be explicit) -->
       <dependency>
          <groupId>org.apache.poi</groupId>
          <artifactId>poi</artifactId>
          <version>5.2.3</version> <!-- 与poi-ooxml版本保持一致 -->
       </dependency>

       <!-- Apache Commons Collections (required by POI) -->
       <dependency>
          <groupId>org.apache.commons</groupId>
          <artifactId>commons-collections4</artifactId>
          <version>4.4</version> <!-- 确保版本与你的项目兼容 -->
       </dependency>

       <!-- Apache Commons IO (optional, but useful for file handling) -->
       <dependency>
          <groupId>commons-io</groupId>
          <artifactId>commons-io</artifactId>
          <version>2.11.0</version> <!-- 确保版本与你的项目兼容 -->
       </dependency>

       <!-- PDFBox for reading PDF files -->
       <dependency>
          <groupId>org.apache.pdfbox</groupId>
          <artifactId>pdfbox</artifactId>
          <version>2.0.24</version>
       </dependency>

       <!-- docx4j for creating Word documents -->
       <dependency>
          <groupId>org.docx4j</groupId>
          <artifactId>docx4j</artifactId>
          <version>3.2.1</version>
       </dependency>

       <dependency>
          <groupId>javax.xml.bind</groupId>
          <artifactId>jaxb-api</artifactId>
          <version>2.3.1</version>
       </dependency>
       <dependency>
          <groupId>org.glassfish.jaxb</groupId>
          <artifactId>jaxb-runtime</artifactId>
          <version>2.3.1</version>
       </dependency>

    </dependencies>


    <build>
       <plugins>
          <plugin>
             <groupId>org.springframework.boot</groupId>
             <artifactId>spring-boot-maven-plugin</artifactId>
          </plugin>
       </plugins>
    </build>

</project>

java 文件:

复制代码
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.docx4j.openpackaging.parts.WordprocessingML.MainDocumentPart;
import org.docx4j.wml.Body;
import org.docx4j.wml.P;
import org.docx4j.wml.R;
import org.docx4j.wml.Text;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;

public class PdfToWordConverter {

    public static void main(String[] args) throws Exception{
        String pdfFilePath = "D:\\word\\何以为父影响彼此一生的父子关系.pdf"; // 替换为你的PDF文件路径
        String wordFilePath = "D:\\word\\何以为父影响彼此一生的父子关系.docx"; // 生成的Word文件路径

        try {
            // 读取PDF文件内容
            String pdfText = extractTextFromPdf(pdfFilePath);

            // 将内容写入Word文档
            createWordDocument(wordFilePath, pdfText);

            System.out.println("PDF to Word conversion completed successfully!");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static String extractTextFromPdf(String filePath) throws IOException {
        PDDocument document = PDDocument.load(new FileInputStream(filePath));
        PDFTextStripper pdfStripper = new PDFTextStripper();
        return pdfStripper.getText(document);
    }

    public static void createWordDocument(String filePath, String content) throws Exception {
        WordprocessingMLPackage wordMLPackage = WordprocessingMLPackage.createPackage();
        MainDocumentPart mainDocumentPart = wordMLPackage.getMainDocumentPart();
        Body body = mainDocumentPart.getContents().getBody();

        // 将内容按段落分割并添加到Word文档中
        String[] paragraphs = content.split("\\r?\\n");
        for (String paragraph : paragraphs) {
            P p = new P();
            R r = new R();
            Text text = new Text();
            text.setParent(paragraph);
            r.getContent().add(text);
            p.getContent().add(r);
            body.getContent().add(p);
        }

        // 保存Word文档
        try (FileOutputStream out = new FileOutputStream(new File(filePath))) {
            wordMLPackage.save(out);
        }
    }
}
相关推荐
小bo波1 天前
使用Thread子类创建线程 VS 使用Runnable接口创建线程的区别
java·多线程·thread·并发编程·runnable
SamDeepThinking1 天前
高并发场景下,CompletableFuture与ForkJoinPool该如何取舍?
java·后端·面试
张不才1 天前
CPU 100% 了怎么办?Java 性能排障的标准化操作
java·后端
shepherd1111 天前
吞吐量提升 10 倍:高并发大批量数据处理任务的架构演进与性能调优
java·后端·架构
plainGeekDev1 天前
单例模式 → object 声明
android·java·kotlin
用户298698530141 天前
Java 实现 Word 文档文本与图片提取的方法
java·后端
SimonKing1 天前
铁子,IntelliJ IDEA 2026.1.3来了,升不升?
java·后端·程序员
咖啡八杯2 天前
GoF设计模式——策略模式
java·后端·spring·设计模式
用户128526116022 天前
我把祖传Java项目重构后,接口响应从3s砍到了200ms,只改了这几行代码
java