提取各种文件的文本内容,例如 Office 文档、图片、zip 压缩包等。
环境:Apache Tika 2.9.2、JDK 8
基础 pom.xml
xml
<!-- https://mvnrepository.com/artifact/org.apache.tika/tika-core -->
<!-- Apache Tika core artifact, pinned to 2.9.2. -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>2.9.2</version>
</dependency>
<!-- Tika standard parser package, pinned to the same version (2.9.2) as tika-core. -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers-standard-package</artifactId>
<version>2.9.2</version>
</dependency>
还需要用到的 pom.xml
xml
<!-- Log4j core. No <version> is declared here; presumably it is supplied by a parent POM
     or a dependencyManagement section (TODO confirm). -->
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
</dependency>
<!-- commons-io, explicitly pinned to 2.17.0. -->
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.17.0</version>
</dependency>
<!-- commons-compress, explicitly pinned to 1.26.2. -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
<version>1.26.2</version>
</dependency>
- 注意:上述依赖(尤其是 commons-io、commons-compress)的版本号要与 Tika 2.9.2 兼容,版本不匹配可能导致运行时出错(例如找不到类或方法)。
java demo
java
/**
 * Demo: parses {@code text.zip} with Tika's {@code AutoDetectParser} and prints the
 * extracted text followed by every metadata entry to standard output.
 *
 * @throws Exception if the file cannot be opened or parsing fails
 */
@Test
public void test() throws Exception {
    // A write limit of -1 asks BodyContentHandler not to cap the amount of extracted text.
    BodyContentHandler contentHandler = new BodyContentHandler(-1);
    Metadata metadata = new Metadata();
    ParseContext parseContext = new ParseContext();

    // try-with-resources guarantees the file stream is closed even if parsing throws.
    try (InputStream inputStream = Files.newInputStream(Paths.get("text.zip"))) {
        new AutoDetectParser().parse(inputStream, contentHandler, metadata, parseContext);
    }

    // Extracted text content
    System.out.println(contentHandler);
    System.out.println("-------------------------------------------");

    // Metadata entries, printed as name:value
    for (String name : metadata.names()) {
        System.out.println(name + ":" + metadata.get(name));
    }
}