MinIO
Reference:
https://github.com/burningmyself/burningmyself.github.io/blob/master/docs/tool/minio.md
Installation
```bash
docker pull minio/minio:latest
# Note: /data lives inside the container; mount a host volume (e.g. -v /path/on/host:/data)
# if the data must survive container removal.
docker run -p 30099:9000 \
  -p 9001:9001 \
  --name minio \
  --restart=always \
  -e MINIO_ROOT_USER= \
  -e MINIO_ROOT_PASSWORD= \
  -v /etc/localtime:/etc/localtime \
  -d minio/minio server /data \
  --console-address ":9001"
```
The S3 API can now be reached at http://<ip>:30099; the web console is at http://<ip>:9001.
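To check that the container came up before configuring any clients, MinIO's health endpoints on the API port can be probed (a minimal check, assuming a reasonably recent MinIO image; replace <ip> with your host):

```bash
# Liveness check on the S3 API port; returns HTTP 200 once the server process is up
curl -i http://<ip>:30099/minio/health/live
# Readiness check; returns HTTP 200 once the server can serve requests
curl -i http://<ip>:30099/minio/health/ready
```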
Client usage
```bash
set -ex
chmod +x init_minio.sh
docker run --rm -it --entrypoint=/bin/sh \
  -v $(pwd)/init_minio.sh:/init_minio.sh \
  minio/mc -c "/init_minio.sh"
```

init_minio.sh:

```bash
#!/bin/bash
# Alias for the MinIO service. Replace with the actual IP address and port of your MinIO server.
MC_ALIAS=minio
MC_ENDPOINT=http://<ip>:30099
MC_ACCESS_KEY=
MC_SECRET_KEY=
BUCKET_NAME=zlg-contract-lite
FOLDER_PATH="fileupload/"
# Step 1: register the alias with /usr/bin/mc and connect to the MinIO service
/usr/bin/mc alias set $MC_ALIAS $MC_ENDPOINT $MC_ACCESS_KEY $MC_SECRET_KEY
# Step 2: create the bucket; with --ignore-existing no error is raised if it already exists
/usr/bin/mc mb --ignore-existing $MC_ALIAS/$BUCKET_NAME
# Step 3: seed the "directory" with initial files (here the contents of the container's /usr/bin),
# since object storage only materializes a prefix once it contains at least one object
/usr/bin/mc cp --recursive /usr/bin/ $MC_ALIAS/$BUCKET_NAME/$FOLDER_PATH
echo "MinIO bucket and initial files have been set up."
```
Hadoop S3 SDK access demo
The project below can be built with Maven.
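A minimal on-disk layout, assuming the sourceDirectory configured in the pom.xml further below:

```bash
# The scala plugin compiles sources from src/main/scala/com/example (see <sourceDirectory> in the pom)
mkdir -p spark_s3_test/src/main/scala/com/example
# Put SparkS3IntegrationTest.scala in that directory and pom.xml in the project root
```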
```scala
package com.example

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.hadoop.fs._

object SparkS3IntegrationTest {
  def main(args: Array[String]): Unit = {
    println("print env:")
    sys.env.foreach { case (key, value) =>
      println(s"$key = $value")
    }

    // Set the S3AFileSystem logger to TRACE for the most detailed output
    Logger.getLogger("org.apache.hadoop.fs.s3a.S3AFileSystem").setLevel(Level.TRACE)
    // If you need to debug more of the Hadoop and AWS internals, raise these as well
    Logger.getLogger("org.apache.hadoop.fs").setLevel(Level.TRACE)
    Logger.getLogger("com.amazonaws").setLevel(Level.TRACE)
    // Keep Spark itself at INFO so application-level logs do not drown out the S3A traces
    Logger.getLogger("org.apache.spark").setLevel(Level.INFO)

    // Create the Spark session
    val spark = SparkSession.builder()
      .appName("Spark S3 Integration Test")
      .config("spark.hadoop.fs.s3a.access.key", "")
      .config("spark.hadoop.fs.s3a.secret.key", "")
      .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
      .config("spark.hadoop.fs.s3a.endpoint", "http://127.0.0.1:30099")
      .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
      // Path-style access is usually required for MinIO (bucket in the path, not the hostname)
      .config("spark.hadoop.fs.s3a.path.style.access", "true")
      .config("spark.hadoop.fs.s3a.fast.upload", "true")
      .getOrCreate()

    // Mirror the S3 settings into the Hadoop Configuration used for direct FileSystem access
    val hadoopConf = spark.sparkContext.hadoopConfiguration
    hadoopConf.set("fs.s3a.access.key", "")
    hadoopConf.set("fs.s3a.secret.key", "")
    hadoopConf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    hadoopConf.set("fs.s3a.endpoint", "http://127.0.0.1:30099") // e.g. 172.24.55.13:30099
    hadoopConf.set("fs.s3a.connection.ssl.enabled", "false")
    hadoopConf.set("fs.s3a.path.style.access", "true")
    hadoopConf.set("fs.s3a.fast.upload", "true")

    // Try to list the contents of the S3 bucket
    try {
      val s3aPath = "s3a://zlg-contract-lite/fileupload/"
      val path = new Path(s3aPath)
      val fs = path.getFileSystem(hadoopConf) // resolve the correct FileSystem implementation
      val status = fs.listStatus(path)
      status.foreach(x => println(x.getPath.toString))
    } catch {
      case e: Exception =>
        println("Exception Occurred:")
        e.printStackTrace()
    } finally {
      // Stop the Spark session
      spark.stop()
    }
  }
}
```
Maven pom.xml
```xml
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.example</groupId>
  <artifactId>spark_s3_test</artifactId>
  <packaging>jar</packaging>
  <version>1.0-SNAPSHOT</version>
  <name>spark_s3_test</name>
  <url>http://maven.apache.org</url>

  <dependencies>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.12</artifactId>
      <version>3.1.3</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_2.12</artifactId>
      <version>3.1.3</version>
    </dependency>
    <dependency>
      <groupId>com.amazonaws</groupId>
      <artifactId>aws-java-sdk-bundle</artifactId>
      <version>1.11.375</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-aws</artifactId>
      <version>3.2.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>3.2.0</version>
    </dependency>
  </dependencies>

  <build>
    <sourceDirectory>src/main/scala/com/example</sourceDirectory>
    <plugins>
      <plugin>
        <groupId>org.scala-tools</groupId>
        <artifactId>maven-scala-plugin</artifactId>
        <executions>
          <execution>
            <goals>
              <goal>compile</goal>
            </goals>
          </execution>
        </executions>
        <configuration>
          <scalaVersion>2.12</scalaVersion>
          <args>
            <arg>-target:jvm-1.8</arg>
          </args>
        </configuration>
      </plugin>
      <plugin>
        <artifactId>maven-assembly-plugin</artifactId>
        <configuration>
          <archive>
            <manifest>
              <mainClass>com.example.SparkS3IntegrationTest</mainClass>
            </manifest>
          </archive>
          <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
        </configuration>
        <executions>
          <execution>
            <id>make-assembly</id> <!-- this is used for inheritance merges -->
            <phase>package</phase> <!-- bind to the packaging phase -->
            <goals>
              <goal>single</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>
```
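To build and run the demo against the MinIO endpoint configured in the code, something along these lines should work (a sketch; the fat-jar name follows from the artifactId/version above, and spark-submit is assumed to be on the PATH):

```bash
# Build the jar-with-dependencies produced by the assembly plugin
mvn clean package

# Run locally; the class lists s3a://zlg-contract-lite/fileupload/ and prints the object paths
spark-submit \
  --class com.example.SparkS3IntegrationTest \
  --master "local[*]" \
  target/spark_s3_test-1.0-SNAPSHOT-jar-with-dependencies.jar
```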