/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.test.hadoop;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
public class DFSIOMain {
    /** Default read buffer size: 1 MB. */
    private static final int BUFFER_SIZE = 1024 * 1024; // 1MB

    /**
     * Sequentially reads an HDFS file to measure raw read throughput (DFSIO-style benchmark).
     *
     * <p>Usage: {@code DFSIOMain <hdfs-file-path> [buffer-size-in-MB]}. The optional second
     * argument overrides the 1 MB default buffer; invalid or non-positive values fall back to
     * the default.
     *
     * @param args args[0] = HDFS file path (e.g. hdfs://namenode:9000/path), args[1] = optional
     *     buffer size in MB
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            // Usage text previously referred to "HDFSFileReader"; corrected to this class.
            System.err.println("用法: java DFSIOMain <hdfs-file-path> [buffer-size-in-MB]");
            System.err.println("示例: java DFSIOMain hdfs://namenode:9000/data/100mfile.dat 1");
            System.exit(1);
        }
        String hdfsPath = args[0];
        int bufferSize = parseBufferSize(args);

        Configuration conf = new Configuration();
        // Required when running from a jar-with-dependencies: merging jars can clobber the
        // META-INF/services FileSystem registrations, causing
        // "No FileSystem for scheme \"hdfs\"" unless the implementation is set explicitly.
        conf.set("fs.hdfs.impl", "org.apache.hadoop.hdfs.DistributedFileSystem");
        // conf.set("fs.defaultFS", "hdfs://10.17.26.174:9000");

        // try-with-resources guarantees both the stream and the FileSystem are closed,
        // replacing the previous manual finally-block cleanup.
        try (FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf)) {
            Path path = new Path(hdfsPath);
            // Report the file size before reading.
            long fileSize = fs.getFileStatus(path).getLen();
            System.out.println(
                "文件大小: "
                    + fileSize
                    + " bytes ({ "
                    + String.format("%.2f", fileSize / (1024.0 * 1024.0))
                    + "}MB)");
            try (InputStream in = fs.open(path)) {
                long actualSize = 0;
                byte[] buffer = new byte[bufferSize];
                // Drain the file; read() may return fewer bytes than requested, so we
                // accumulate until EOF (read < 0) or the reported size is reached.
                while (actualSize < fileSize) {
                    int curSize = in.read(buffer, 0, bufferSize);
                    if (curSize < 0) break;
                    actualSize += curSize;
                }
                System.out.println("actualSize :" + actualSize);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Resolves the read buffer size from the optional second CLI argument.
     *
     * <p>Falls back to {@link #BUFFER_SIZE} (1 MB) when the argument is absent, not a number,
     * or non-positive. Uses {@code Math.multiplyExact} so an absurdly large MB value fails the
     * overflow check instead of silently wrapping to a negative size.
     *
     * @param args the CLI arguments; args[1], if present, is the buffer size in MB
     * @return the buffer size in bytes, always positive
     */
    private static int parseBufferSize(String[] args) {
        if (args.length < 2) {
            return BUFFER_SIZE;
        }
        try {
            int mb = Integer.parseInt(args[1]);
            if (mb <= 0) {
                System.out.println("无效的缓冲区大小参数,使用默认值1MB");
                return BUFFER_SIZE;
            }
            int bytes = Math.multiplyExact(mb, 1024 * 1024);
            System.out.println("使用自定义缓冲区大小: MB: " + mb);
            return bytes;
        } catch (NumberFormatException | ArithmeticException e) {
            System.out.println("无效的缓冲区大小参数,使用默认值1MB");
            return BUFFER_SIZE;
        }
    }
}
POM (pom.xml) used to build the project:
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.apache.spark</groupId>
<artifactId>HadoopScience</artifactId>
<version>1.0.0-SNAPSHOT</version>
<name>HadoopScience</name>
<properties>
<hadoop.version>3.3.3</hadoop.version>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<encoding>UTF-8</encoding>
<scala.version>2.12.15</scala.version>
<scala.binary.version>2.12</scala.binary.version>
<spec2.version>4.2.0</spec2.version>
<spotless.version>2.27.2</spotless.version>
<spotless.scalafmt.version>3.8.3</spotless.scalafmt.version>
<spotless.delimiter>package</spotless.delimiter>
<spotless.license.header>
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
</spotless.license.header>
</properties>
<dependencies>
<!-- Hadoop 客户端 -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.13.1</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<version>2.15.2</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
</goals>
</execution>
</executions>
<configuration>
<sourceDir>src/main/scala</sourceDir>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.8.1</version>
<executions>
<execution>
<id>default-compile</id>
<phase>compile</phase>
<goals>
<goal>compile</goal>
</goals>
<configuration>
<source>8</source>
<target>8</target>
</configuration>
</execution>
<execution>
<id>default-testCompile</id>
<phase>test-compile</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<archive>
<manifest>
<mainClass>com.test.hadoop.DFSIOMain</mainClass>
</manifest>
</archive>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>make-assembly</id> <!-- this is used for an identifier -->
<phase>package</phase> <!-- bind to the packaging phase -->
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>3.4.2</version>
<configuration>
<archive>
<manifest>
<addClasspath>true</addClasspath>
<mainClass>com.test.hadoop.DFSIOMain</mainClass>
</manifest>
</archive>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.21.0</version>
</plugin>
<plugin>
<groupId>com.diffplug.spotless</groupId>
<artifactId>spotless-maven-plugin</artifactId>
<version>${spotless.version}</version>
<configuration>
<java>
<toggleOffOn />
<googleJavaFormat>
<version>1.7</version>
</googleJavaFormat>
<!-- \# refers to the static imports -->
<importOrder>
<order>org.apache.gluten,io.substrait.spark,,javax,java,scala,\#</order>
</importOrder>
<removeUnusedImports />
<licenseHeader>
<content>${spotless.license.header}</content>
<delimiter>${spotless.delimiter}</delimiter>
</licenseHeader>
</java>
<scala>
<!-- enables the `// spotless:off` / `// spotless:on` toggle comments -->
<toggleOffOn />
<scalafmt>
<version>${spotless.scalafmt.version}</version>
<scalaMajorVersion>${scala.binary.version}</scalaMajorVersion>
<file>.scalafmt.conf</file>
</scalafmt>
<licenseHeader>
<content>${spotless.license.header}</content>
<delimiter>${spotless.delimiter}</delimiter>
</licenseHeader>
</scala>
</configuration>
<executions>
<execution>
<id>spotless-check</id>
<phase>validate</phase>
<goals>
<goal>check</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.3.1</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>process-resources</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<phase>compile</phase>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</build>
</project>
# time hadoop jar HadoopScience-1.0.0-SNAPSHOT.jar /benchmarks/TestDFSIO/io_data/test_io_1
文件大小: 104857600 bytes ({ 100.00}MB)
actualSize :104857600
real 0m1.548s
user 0m2.411s
sys 0m0.491s
- real (实际时间,也称为墙钟时间,wall-clock time)
这是命令从开始到结束的总时间,也就是我们感受到的流逝的时间。在这个例子中,实际时间是1.548秒。
- user (用户态时间)
这是进程在用户态(即非内核态)运行所花费的CPU时间。这里是2.411秒。注意,这可以超过real时间,因为如果有多个CPU核心,多个线程可以并行执行,所以用户态时间可以大于实际时间。
- sys (内核态时间)
这是进程在内核态运行所花费的CPU时间。这里是0.491秒。
fat jar 测试
java -Dfs.defaultFS=hdfs://{IP}:9000/ \
-Dfs.hdfs.impl=org.apache.hadoop.hdfs.DistributedFileSystem \
-jar HadoopScience-1.0.0-SNAPSHOT-jar-with-dependencies.jar \
hdfs://{IP}:9000/benchmarks/TestDFSIO/io_data/test_io_1
即使通过 -D 参数显式指定了 fs.hdfs.impl,仍会报如下错误:
Caused by: org.apache.hadoop.fs.UnsupportedFileSystemException: No FileSystem for scheme "hdfs"
at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:3443)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3466)
at org.apache.hadoop.fs.FileSystem.access$300(FileSystem.java:174)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3574)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3521)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:540)
需要在代码里显式指定
conf.set("fs.hdfs.impl", "org.apache.hadoop.hdfs.DistributedFileSystem");
才能成功
# java -Dfs.defaultFS=hdfs://{IP}:9000/ -jar HadoopScience-1.0.0-SNAPSHOT-jar-with-dependencies.jar hdfs://{IP}:9000/benchmarks/TestDFSIO/io_data/test_io_1
log4j:WARN No appenders could be found for logger (org.apache.hadoop.util.Shell).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
文件大小: 104857600 bytes ({ 100.00}MB)
actualSize :104857600
# java -jar HadoopScience-1.0.0-SNAPSHOT-jar-with-dependencies.jar hdfs://{IP}:9000/benchmarks/TestDFSIO/io_data/test_io_1
log4j:WARN No appenders could be found for logger (org.apache.hadoop.util.Shell).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
文件大小: 104857600 bytes ({ 100.00}MB)
actualSize :104857600
# java -jar HadoopScience-1.0.0-SNAPSHOT-jar-with-dependencies.jar /benchmarks/TestDFSIO/io_data/test_io_1
log4j:WARN No appenders could be found for logger (org.apache.hadoop.util.Shell).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
Exception in thread "main" java.lang.RuntimeException: java.io.FileNotFoundException: File /benchmarks/TestDFSIO/io_data/test_io_1 does not exist
at com.test.hadoop.DFSIOMain.main(DFSIOMain.java:84)
Caused by: java.io.FileNotFoundException: File /benchmarks/TestDFSIO/io_data/test_io_1 does not exist
at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:779)
at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:1100)
at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:769)
at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:462)
at com.test.hadoop.DFSIOMain.main(DFSIOMain.java:63)
time 时间对比
# time java -jar HadoopScience-1.0.0-SNAPSHOT-jar-with-dependencies.jar hdfs://hadoop00:9000/benchmarks/TestDFSIO/io_data/test_io_1
log4j:WARN No appenders could be found for logger (org.apache.hadoop.util.Shell).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
文件大小: 104857600 bytes ({ 100.00}MB)
actualSize :104857600
real 0m0.846s
user 0m1.634s
sys 0m0.207s
JVM 进程调优
# time java -Xverify:none -jar HadoopScience-1.0.0-SNAPSHOT-jar-with-dependencies.jar hdfs://hadoop00:9000/benchmarks/TestDFSIO/io_data/test_io_1
log4j:WARN No appenders could be found for logger (org.apache.hadoop.util.Shell).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
文件大小: 104857600 bytes ({ 100.00}MB)
actualSize :104857600
real 0m0.645s
user 0m1.349s
sys 0m0.221s