I was fed up with exporting and importing jar packages between our internal and external networks, so I steeled myself and wrote a crawler that pulls every jar from the Maven Central repository, partitioned by the 26 letters of the alphabet.
pom.xml file
xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.bullgod</groupId>
    <artifactId>MavenRepoBaLaLa</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>17</maven.compiler.source>
        <maven.compiler.target>17</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.16.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-collections4</artifactId>
            <version>4.4</version>
        </dependency>
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.11.0</version>
        </dependency>
        <dependency>
            <groupId>commons-lang</groupId>
            <artifactId>commons-lang</artifactId>
            <version>2.1</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>2.0.5</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-reload4j</artifactId>
            <version>2.0.5</version>
        </dependency>
    </dependencies>
</project>
Source code (note that of the dependencies above, only jsoup is actually imported by the crawler; the others are unused)
java
package org.bullgod;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
public class MavenBalabalaThread {
    public static void main(String[] args) {
        ExecutorService es = Executors.newFixedThreadPool(30);
        // Submit one task per letter of the alphabet: a, b, c, ... z
        char cc = 'a';
        for (int i = 0; i < 26; i++) {
            char dd = (char) (cc + i); // cast the int sum back to char
            String ccc = String.valueOf(dd);
            es.submit(new Task(ccc));
        }
        // Shut down the pool; already-submitted tasks keep running to completion
        es.shutdown();
    }
}
class Task implements Runnable {
    /**
     * Root URL to crawl
     */
    //private static final String ROOT = "https://repo.maven.apache.org/maven2/";
    private static final String ROOT = "https://repo1.maven.org/maven2/";
    /**
     * Root directory on the local disk
     */
    private static final String DiskROOT = "D:\\maven2\\";
    /**
     * File name of maven-metadata.xml (currently unused)
     */
    private static final String MAVEN_METADATA_XML_FILENAME = "maven-metadata.xml";
    /**
     * Index file listing all top-level directories of the repository
     */
    private static final String indexfilename = "maven2Indexall.txt";

    // "all" crawls everything in one task, which fails often; crawling letter by letter (a, b, c, ...) is recommended
    String firstAlpaca = "all";

    public Task(String args) {
        firstAlpaca = args;
    }
    /**
     * Recursively crawl all sub-URLs of a directory listing page.
     *
     * @param url         current directory URL
     * @param sleepMillis delay in milliseconds before each request
     */
    private static void findSubUrl(String url, int sleepMillis) {
        try {
            Thread.sleep(sleepMillis);
            Document doc = null;
            boolean needreconnect = true;
            while (needreconnect) {
                try {
                    doc = Jsoup.connect(url).userAgent("Mozilla").timeout(5000).get();
                } catch (SocketTimeoutException te) {
                    // Connection timed out: wait 10 seconds, then retry
                    System.out.println("Connection timed out, retrying in 10 seconds");
                    Thread.sleep(10 * 1000);
                    continue;
                }
                needreconnect = false;
            }
            // Maven Central directory listings render their links inside the element with id "contents"
            Elements links = doc.select("#contents a");
            for (Element link : links) {
                String pathorfilename = link.attr("href");
                if (pathorfilename.equals("../")) {
                    // Link back to the parent directory: skip it
                    continue;
                }
                // Resolve the absolute URL of the entry
                String absUrl = link.absUrl("href");
                System.out.println(absUrl);
                // Derive the save path by stripping the repository root prefix
                int urllen = ROOT.length();
                String pathName = absUrl.substring(urllen);
                SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                String nowtime = sdf.format(new Date());
                System.out.println("[" + nowtime + "]: " + pathName);
                // Entries containing "/" are directories; everything else is a file
                int ret = pathorfilename.indexOf("/");
                if (ret == -1) {
                    // A file, not a directory: download it to disk unless it already exists
                    String saveFile = DiskROOT + pathName;
                    File f1 = new File(saveFile);
                    while (!f1.exists()) {
                        try {
                            URL httpurl = new URL(absUrl);
                            // try-with-resources closes both streams even if the copy fails
                            try (BufferedInputStream bis = new BufferedInputStream(httpurl.openStream());
                                 FileOutputStream fos = new FileOutputStream(saveFile)) {
                                byte[] buffer = new byte[1024];
                                int count;
                                while ((count = bis.read(buffer, 0, 1024)) != -1) {
                                    fos.write(buffer, 0, count);
                                }
                            }
                            break;
                        } catch (IOException e) {
                            System.out.println("Failed to download file: " + saveFile);
                            // Delete the partial file, wait 10 seconds, then retry
                            if (f1.exists()) {
                                f1.delete();
                            }
                            System.out.println("Download failed, retrying in 10 seconds");
                            Thread.sleep(10 * 1000);
                        }
                    }
                } else {
                    // A directory: create the matching directory on disk
                    String filePath = DiskROOT + pathName;
                    File f2 = new File(filePath);
                    if (!f2.exists()) {
                        boolean flag2 = f2.mkdirs();
                        if (!flag2) {
                            System.out.println("Failed to create directory: " + filePath);
                        }
                    }
                    // Recurse into the directory
                    findSubUrl(absUrl, sleepMillis);
                }
            }
        } catch (IOException | InterruptedException e) {
            e.printStackTrace();
        }
    }
    private static void searchdir(String rooturl, String dir, int sleepMillis) {
        String filePath = DiskROOT + dir;
        File f = new File(filePath);
        if (!f.exists()) {
            boolean flag2 = f.mkdirs();
            if (!flag2) {
                System.out.println("Failed to create directory: " + filePath);
            }
        }
        String suburl = rooturl + dir;
        findSubUrl(suburl, sleepMillis);
    }
    @Override
    public void run() {
        System.out.println("Begin crawling, starting with: " + firstAlpaca);
        int sleepMillis = 100;
        String rooturl = ROOT;
        // findSubUrl(rooturl, sleepMillis); // crawl everything directly, without the index file
        File file = new File(DiskROOT + indexfilename);
        try (BufferedReader br = new BufferedReader(new FileReader(file))) {
            String st;
            while ((st = br.readLine()) != null) {
                System.out.println(st);
                String dir = st.trim();
                if (firstAlpaca.equalsIgnoreCase("all")) {
                    searchdir(rooturl, dir, sleepMillis);
                } else if (dir.toLowerCase().startsWith(firstAlpaca)) {
                    // The top-level directory starts with this task's letter
                    searchdir(rooturl, dir, sleepMillis);
                }
            }
        } catch (FileNotFoundException e) {
            System.out.println("File not found: " + indexfilename);
        } catch (IOException ie) {
            System.out.println("Failed to read file: " + indexfilename);
        }
        System.out.println("End crawler");
    }
}
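The Task class reads its work list from D:\maven2\maven2Indexall.txt, but the article never shows how that index is produced. Below is a minimal sketch of one way to generate it with the same jsoup setup, assuming the file should contain one top-level directory name per line, with the trailing slash as it appears in the listing's href. The class name IndexBuilder is my own placeholder, not part of the original code.
java
package org.bullgod;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;

public class IndexBuilder {
    public static void main(String[] args) throws IOException {
        String root = "https://repo1.maven.org/maven2/";
        // Fetch the top-level directory listing the same way Task does
        Document doc = Jsoup.connect(root).userAgent("Mozilla").timeout(5000).get();
        // Assumes D:\maven2 already exists
        try (PrintWriter out = new PrintWriter(new FileWriter("D:\\maven2\\maven2Indexall.txt"))) {
            for (Element link : doc.select("#contents a")) {
                String name = link.attr("href");
                // Keep only directory entries; skip the parent-directory link
                if (name.endsWith("/") && !name.equals("../")) {
                    out.println(name);
                }
            }
        }
    }
}
Run it once before starting MavenBalabalaThread so that every letter task finds its directories in the index.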