Gradle Dependencies
gradle
复制代码
// Main dependencies: Spark SQL (Scala 2.13 build) and Hadoop common.
// NOTE(review): hadoop-common is presumably listed for org.apache.hadoop.fs.sftp.SFTPFileSystem — confirm.
implementation 'org.apache.spark:spark-sql_2.13:3.5.3'
implementation 'org.apache.hadoop:hadoop-common:3.3.4'

// Test-only dependencies: Spring Boot test starter plus Apache MINA SSHD (core + SFTP module).
// NOTE(review): the starter has no explicit version; assumes the build's Spring Boot
// dependency management supplies one — confirm.
testImplementation 'org.springframework.boot:spring-boot-starter-test'
testImplementation 'org.apache.sshd:sshd-core:2.8.0'
testImplementation 'org.apache.sshd:sshd-sftp:2.8.0'
Set up a fake SFTP server
java
复制代码
// GIVEN
// An in-process SFTP server for the test to talk to.
// Connection coordinates; they are reused later to build the sftp:// URL.
String host = "127.0.0.1";
int port = 9188;
// The only credentials the server will accept.
String user = "username";
String passwd = "password";

SshServer sshd = SshServer.setUpDefaultServer();
sshd.setPort(port);
sshd.setKeyPairProvider(new SimpleGeneratorHostKeyProvider());
// Password login succeeds only when both the user name and the password match.
sshd.setPasswordAuthenticator(
        (login, secret, session) -> user.equals(login) && passwd.equals(secret));
// rootPath is defined outside this snippet; it is handed to the virtual file system factory.
sshd.setFileSystemFactory(new VirtualFileSystemFactory(rootPath));
// Register SFTP as the server's only subsystem.
sshd.setSubsystemFactories(Collections.singletonList(new SftpSubsystemFactory()));
sshd.start();
System.out.println("Fake SFTP server started at port " + port);
Generate a test CSV file using the Hadoop SFTP FileSystem API
java
复制代码
// URL of the fake SFTP server; the test credentials are embedded in the authority part.
String sftpURL = String.format("sftp://%s:%s@%s:%d", user, passwd, host, port);
String testedCsvFile = "test.csv";
// WHEN
// Create a CSV file through the Hadoop FileSystem API
Configuration conf = new Configuration();
conf.set("fs.sftp.impl", "org.apache.hadoop.fs.sftp.SFTPFileSystem");
conf.set("fs.defaultFS", sftpURL);
// Obtain the FileSystem from the root path. The path carries no scheme of its own;
// the assertion below verifies that the SFTP implementation is what comes back.
Path path = new Path("/");
FileSystem sftpFileSystem = FileSystem.get(path.toUri(), conf);
Assertions.assertTrue(sftpFileSystem instanceof SFTPFileSystem);
// Create the test CSV file (overwriting any existing one) and write a header row plus one data row.
// The charset is given explicitly so the bytes written do not depend on the platform default encoding.
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
        sftpFileSystem.create(new Path(testedCsvFile), true),
        java.nio.charset.StandardCharsets.UTF_8))) {
    writer.write("A|B|C|D");
    writer.newLine();
    writer.write("1|2|3|4");
}
// Verify that the server root now holds exactly one entry: the file just written.
FileStatus[] statuses = sftpFileSystem.listStatus(path);
Assertions.assertEquals(1, statuses.length);
Assertions.assertTrue(statuses[0].isFile());
Assertions.assertEquals(testedCsvFile, statuses[0].getPath().getName());
Finally, read the test data from the SFTP server
java
复制代码
// THEN
// Read the test CSV file back with Spark.
// Local two-thread master, web UI disabled.
SparkConf sparkConf = new SparkConf();
sparkConf.setAppName("spark-test");
sparkConf.setMaster("local[2]");
sparkConf.set("spark.ui.enabled", "false");
// The same SFTP settings used for the Hadoop Configuration, passed with the "spark.hadoop." prefix.
sparkConf.set("spark.hadoop.fs.sftp.impl", "org.apache.hadoop.fs.sftp.SFTPFileSystem");
sparkConf.set("spark.hadoop.fs.defaultFS", sftpURL);

SparkSession sparkSession = SparkSession.builder()
        .config(sparkConf)
        .getOrCreate();

// Read the CSV over the SFTP connection: first line is the header, fields are separated by '|'.
Dataset<Row> dataset = sparkSession.read()
        .option("header", "true")
        .option("delimiter", "|")
        .csv(testedCsvFile);

// Print the inferred schema and the rows.
dataset.printSchema();
dataset.show();
text
复制代码
root
|-- A: string (nullable = true)
|-- B: string (nullable = true)
|-- C: string (nullable = true)
|-- D: string (nullable = true)
+---+---+---+---+
|  A|  B|  C|  D|
+---+---+---+---+
|  1|  2|  3|  4|
+---+---+---+---+