/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.test.hadoop;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
public class DFSIOMain {
    /** Default read buffer size: 1 MB. */
    private static final int BUFFER_SIZE = 1024 * 1024; // 1MB

    /**
     * Sequentially reads an HDFS file to measure raw read throughput (DFSIO-style benchmark).
     *
     * <p>Usage: {@code DFSIOMain <hdfs-file-path> [buffer-size-in-MB]}. The optional second
     * argument overrides the 1 MB default buffer; invalid or non-positive values fall back to
     * the default.
     *
     * @param args args[0] = HDFS file path (e.g. hdfs://namenode:9000/path), args[1] = optional
     *     buffer size in MB
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            // Usage text previously referred to "HDFSFileReader"; corrected to this class.
            System.err.println("用法: java DFSIOMain <hdfs-file-path> [buffer-size-in-MB]");
            System.err.println("示例: java DFSIOMain hdfs://namenode:9000/data/100mfile.dat 1");
            System.exit(1);
        }
        String hdfsPath = args[0];
        int bufferSize = parseBufferSize(args);

        Configuration conf = new Configuration();
        // Required when running from a jar-with-dependencies: merging jars can clobber the
        // META-INF/services FileSystem registrations, causing
        // "No FileSystem for scheme \"hdfs\"" unless the implementation is set explicitly.
        conf.set("fs.hdfs.impl", "org.apache.hadoop.hdfs.DistributedFileSystem");
        // conf.set("fs.defaultFS", "hdfs://10.17.26.174:9000");

        // try-with-resources guarantees both the stream and the FileSystem are closed,
        // replacing the previous manual finally-block cleanup.
        try (FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf)) {
            Path path = new Path(hdfsPath);
            // Report the file size before reading.
            long fileSize = fs.getFileStatus(path).getLen();
            System.out.println(
                "文件大小: "
                    + fileSize
                    + " bytes ({ "
                    + String.format("%.2f", fileSize / (1024.0 * 1024.0))
                    + "}MB)");
            try (InputStream in = fs.open(path)) {
                long actualSize = 0;
                byte[] buffer = new byte[bufferSize];
                // Drain the file; read() may return fewer bytes than requested, so we
                // accumulate until EOF (read < 0) or the reported size is reached.
                while (actualSize < fileSize) {
                    int curSize = in.read(buffer, 0, bufferSize);
                    if (curSize < 0) break;
                    actualSize += curSize;
                }
                System.out.println("actualSize :" + actualSize);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Resolves the read buffer size from the optional second CLI argument.
     *
     * <p>Falls back to {@link #BUFFER_SIZE} (1 MB) when the argument is absent, not a number,
     * or non-positive. Uses {@code Math.multiplyExact} so an absurdly large MB value fails the
     * overflow check instead of silently wrapping to a negative size.
     *
     * @param args the CLI arguments; args[1], if present, is the buffer size in MB
     * @return the buffer size in bytes, always positive
     */
    private static int parseBufferSize(String[] args) {
        if (args.length < 2) {
            return BUFFER_SIZE;
        }
        try {
            int mb = Integer.parseInt(args[1]);
            if (mb <= 0) {
                System.out.println("无效的缓冲区大小参数,使用默认值1MB");
                return BUFFER_SIZE;
            }
            int bytes = Math.multiplyExact(mb, 1024 * 1024);
            System.out.println("使用自定义缓冲区大小: MB: " + mb);
            return bytes;
        } catch (NumberFormatException | ArithmeticException e) {
            System.out.println("无效的缓冲区大小参数,使用默认值1MB");
            return BUFFER_SIZE;
        }
    }
}
POM (pom.xml) used to build the project:
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.apache.spark</groupId>
<artifactId>HadoopScience</artifactId>
<version>1.0.0-SNAPSHOT</version>
<name>HadoopScience</name>
<properties>
<hadoop.version>3.3.3</hadoop.version>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<encoding>UTF-8</encoding>
<scala.version>2.12.15</scala.version>
<scala.binary.version>2.12</scala.binary.version>
<spec2.version>4.2.0</spec2.version>
<spotless.version>2.27.2</spotless.version>
<spotless.scalafmt.version>3.8.3</spotless.scalafmt.version>
<spotless.delimiter>package</spotless.delimiter>
<spotless.license.header>
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
</spotless.license.header>
</properties>
<dependencies>
<!-- Hadoop 客户端 -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.13.1</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<version>2.15.2</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
</goals>
</execution>
</executions>
<configuration>
<sourceDir>src/main/scala</sourceDir>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.8.1</version>
<executions>
<execution>
<id>default-compile</id>
<phase>compile</phase>
<goals>
<goal>compile</goal>
</goals>
<configuration>
<source>8</source>
<target>8</target>
</configuration>
</execution>
<execution>
<id>default-testCompile</id>
<phase>test-compile</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<archive>
<manifest>
<mainClass>com.test.hadoop.DFSIOMain</mainClass>
</manifest>
</archive>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>make-assembly</id> <!-- this is used for an identifier -->
<phase>package</phase> <!-- bind to the packaging phase -->
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>3.4.2</version>
<configuration>
<archive>
<manifest>
<addClasspath>true</addClasspath>
<mainClass>com.test.hadoop.DFSIOMain</mainClass>
</manifest>
</archive>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.21.0</version>
</plugin>
<plugin>
<groupId>com.diffplug.spotless</groupId>
<artifactId>spotless-maven-plugin</artifactId>
<version>${spotless.version}</version>
<configuration>
<java>
<toggleOffOn />
<googleJavaFormat>
<version>1.7</version>
</googleJavaFormat>
<!-- \# refers to the static imports -->
<importOrder>
<order>org.apache.gluten,io.substrait.spark,,javax,java,scala,\#</order>
</importOrder>
<removeUnusedImports />
<licenseHeader>
<content>${spotless.license.header}</content>
<delimiter>${spotless.delimiter}</delimiter>
</licenseHeader>
</java>
<scala>
<!-- enables the `// spotless:off` / `// spotless:on` toggle comments -->
<toggleOffOn />
<scalafmt>
<version>${spotless.scalafmt.version}</version>
<scalaMajorVersion>${scala.binary.version}</scalaMajorVersion>
<file>.scalafmt.conf</file>
</scalafmt>
<licenseHeader>
<content>${spotless.license.header}</content>
<delimiter>${spotless.delimiter}</delimiter>
</licenseHeader>
</scala>
</configuration>
<executions>
<execution>
<id>spotless-check</id>
<phase>validate</phase>
<goals>
<goal>check</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.3.1</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>process-resources</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<phase>compile</phase>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</build>
</project>
# time hadoop jar HadoopScience-1.0.0-SNAPSHOT.jar /benchmarks/TestDFSIO/io_data/test_io_1
文件大小: 104857600 bytes ({ 100.00}MB)
actualSize :104857600
real 0m1.548s
user 0m2.411s
sys 0m0.491s
- real (实际时间,也称为墙钟时间,wall-clock time)
这是命令从开始到结束的总时间,也就是我们感受到的流逝的时间。在这个例子中,实际时间是1.548秒。
- user (用户态时间)
这是进程在用户态(即非内核态)运行所花费的CPU时间。这里是2.411秒。注意,这可以超过real时间,因为如果有多个CPU核心,多个线程可以并行执行,所以用户态时间可以大于实际时间。
- sys (内核态时间)
这是进程在内核态运行所花费的CPU时间。这里是0.491秒。
fat jar 测试
java -Dfs.defaultFS=hdfs://{IP}:9000/ \
-Dfs.hdfs.impl=org.apache.hadoop.hdfs.DistributedFileSystem \
-jar HadoopScience-1.0.0-SNAPSHOT-jar-with-dependencies.jar \
hdfs://{IP}:9000/benchmarks/TestDFSIO/io_data/test_io_1
即使通过 -D 参数显式指定了 fs.hdfs.impl,仍会报如下错误:
Caused by: org.apache.hadoop.fs.UnsupportedFileSystemException: No FileSystem for scheme "hdfs"
at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:3443)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3466)
at org.apache.hadoop.fs.FileSystem.access$300(FileSystem.java:174)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3574)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3521)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:540)
需要在代码里显式指定
conf.set("fs.hdfs.impl", "org.apache.hadoop.hdfs.DistributedFileSystem");
才能成功
# java -Dfs.defaultFS=hdfs://{IP}:9000/ -jar HadoopScience-1.0.0-SNAPSHOT-jar-with-dependencies.jar hdfs://{IP}:9000/benchmarks/TestDFSIO/io_data/test_io_1
log4j:WARN No appenders could be found for logger (org.apache.hadoop.util.Shell).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
文件大小: 104857600 bytes ({ 100.00}MB)
actualSize :104857600
# java -jar HadoopScience-1.0.0-SNAPSHOT-jar-with-dependencies.jar hdfs://{IP}:9000/benchmarks/TestDFSIO/io_data/test_io_1
log4j:WARN No appenders could be found for logger (org.apache.hadoop.util.Shell).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
文件大小: 104857600 bytes ({ 100.00}MB)
actualSize :104857600
# java -jar HadoopScience-1.0.0-SNAPSHOT-jar-with-dependencies.jar /benchmarks/TestDFSIO/io_data/test_io_1
log4j:WARN No appenders could be found for logger (org.apache.hadoop.util.Shell).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
Exception in thread "main" java.lang.RuntimeException: java.io.FileNotFoundException: File /benchmarks/TestDFSIO/io_data/test_io_1 does not exist
at com.test.hadoop.DFSIOMain.main(DFSIOMain.java:84)
Caused by: java.io.FileNotFoundException: File /benchmarks/TestDFSIO/io_data/test_io_1 does not exist
at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:779)
at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:1100)
at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:769)
at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:462)
at com.test.hadoop.DFSIOMain.main(DFSIOMain.java:63)
time 时间对比
# time java -jar HadoopScience-1.0.0-SNAPSHOT-jar-with-dependencies.jar hdfs://hadoop00:9000/benchmarks/TestDFSIO/io_data/test_io_1
log4j:WARN No appenders could be found for logger (org.apache.hadoop.util.Shell).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
文件大小: 104857600 bytes ({ 100.00}MB)
actualSize :104857600
real 0m0.846s
user 0m1.634s
sys 0m0.207s
JVM 进程调优
# time java -Xverify:none -jar HadoopScience-1.0.0-SNAPSHOT-jar-with-dependencies.jar hdfs://hadoop00:9000/benchmarks/TestDFSIO/io_data/test_io_1
log4j:WARN No appenders could be found for logger (org.apache.hadoop.util.Shell).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
文件大小: 104857600 bytes ({ 100.00}MB)
actualSize :104857600
real 0m0.645s
user 0m1.349s
sys 0m0.221s