Gemini's coding ability is genuinely strong. I used it to write a script, and it works very well.
The script automatically downloads papers from arXiv:
- Papers can be searched by keyword, with "AND" / "OR" matching logic
- Results can be restricted to a range of publication years
- The number of papers to download is configurable
- The download directory is configurable, and the script automatically creates a subfolder for each paper's conference or journal (see the example layout after this list)
- Each file is named after the paper's title, annotated with its venue and publication year
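For instance, a 2024 paper that the script matches to CVPR would be saved along these lines (a sketch; the title shown is just a placeholder):

```
downloaded_papers/
└── CVPR/
    └── Some Paper Title_2024_CVPR.pdf
```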
```python
import os
import re

import arxiv

# ================= Configuration =================
# 1. Search keywords
KEYWORDS = ["Transformer", "Attention Mechanism"]
# 2. Logic joining the keywords ("AND" / "OR")
LOGIC = "AND"
# 3. Year range
START_YEAR = 2024
END_YEAR = 2025
# 4. Number of papers to download
MAX_RESULTS = 10
# 5. Base download directory
# Always create the download folder next to this script
current_dir = os.path.dirname(os.path.abspath(__file__))
BASE_DOWNLOAD_DIR = os.path.join(current_dir, "downloaded_papers")
# 6. Sort order
SORT_BY = arxiv.SortCriterion.Relevance
# =================================================
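# Tip: switch LOGIC to "OR" to broaden the search, and set
# SORT_BY = arxiv.SortCriterion.SubmittedDate to fetch the newest
# matches first instead of the most relevant ones.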

def sanitize_filename(name):
    """
    Clean a filename: strip illegal characters and cap the length.
    """
    if not name:
        return "Unknown_Title"
    # Remove characters that are illegal on Windows/Linux file systems
    clean_name = re.sub(r'[\\/*?:"<>|]', "", name)
    # Replace newlines with spaces
    clean_name = clean_name.replace('\n', ' ').replace('\r', '')
    # Collapse repeated whitespace
    clean_name = ' '.join(clean_name.split())
    # Cap the length so the OS does not reject the path (keep the first 100 chars)
    if len(clean_name) > 100:
        clean_name = clean_name[:100].strip() + "..."
    return clean_name
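
# Example of sanitize_filename's behavior:
#   sanitize_filename('Attention: Is All\nYou/Need?') -> 'Attention Is All YouNeed'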

def extract_category_info(result):
    """
    Extract venue (conference/journal) information from an arXiv result.
    Logic:
    1. Search both the journal_ref and comment fields.
    2. Prefer matching well-known venue abbreviations (CVPR, ICCV, etc.) or their full names.
    3. If no known venue matches, check for generic journal/conference keywords and
       classify the paper as "Journal_Paper" or "Conference_Paper".
    4. If nothing can be identified, fall back to "ArXiv_Preprint".
    """
    # Combine both fields: many authors only mention the venue in the comment
    text_to_search = ""
    if result.journal_ref:
        text_to_search += str(result.journal_ref) + " "
    if result.comment:
        text_to_search += str(result.comment)
    if not text_to_search.strip():
        return "ArXiv_Preprint"
    text_upper = text_to_search.upper()
    # 1. Common CS venue abbreviations
    #    Keys are abbreviations; values are regex patterns (full names included)
    conf_patterns = {
        "CVPR": r"\bCVPR\b|\bCOMPUTER VISION AND PATTERN RECOGNITION\b",
        "ICCV": r"\bICCV\b|\bINTERNATIONAL CONFERENCE ON COMPUTER VISION\b",
        "ECCV": r"\bECCV\b|\bEUROPEAN CONFERENCE ON COMPUTER VISION\b",
        "NeurIPS": r"\bNEURIPS\b|\bNIPS\b|\bNEURAL INFORMATION PROCESSING SYSTEMS\b",
        "ICLR": r"\bICLR\b|\bINTERNATIONAL CONFERENCE ON LEARNING REPRESENTATIONS\b",
        "ICML": r"\bICML\b|\bINTERNATIONAL CONFERENCE ON MACHINE LEARNING\b",
        "AAAI": r"\bAAAI\b|\bASSOCIATION FOR THE ADVANCEMENT OF ARTIFICIAL INTELLIGENCE\b",
        "IJCAI": r"\bIJCAI\b|\bINTERNATIONAL JOINT CONFERENCE ON ARTIFICIAL INTELLIGENCE\b",
        "ACL": r"\bACL\b|\bASSOCIATION FOR COMPUTATIONAL LINGUISTICS\b",
        "EMNLP": r"\bEMNLP\b|\bEMPIRICAL METHODS IN NATURAL LANGUAGE PROCESSING\b",
        "TPAMI": r"\bTPAMI\b|\bTRANSACTIONS ON PATTERN ANALYSIS AND MACHINE INTELLIGENCE\b",
        "IJCV": r"\bIJCV\b|\bINTERNATIONAL JOURNAL OF COMPUTER VISION\b",
        "TOG": r"\bTOG\b|\bTRANSACTIONS ON GRAPHICS\b",
        "SIGGRAPH": r"\bSIGGRAPH\b"
    }
    for abbr, pattern in conf_patterns.items():
        if re.search(pattern, text_upper):
            return abbr
    # 2. No known venue matched: decide between journal and conference
    # Journal keywords
    if re.search(r"\b(JOURNAL|TRANS\.|TRANSACTIONS|REV\.|REVIEW|LETTERS)\b", text_upper):
        return "Journal_Paper"
    # Conference keywords
    if re.search(r"\b(CONF\.|CONFERENCE|PROC\.|PROCEEDINGS|SYMPOSIUM|WORKSHOP)\b", text_upper):
        return "Conference_Paper"
    # 3. Default bucket
    return "ArXiv_Preprint"

def build_query():
    """Build the arXiv query string."""
    quoted_keywords = [f'"{k}"' for k in KEYWORDS]
    keyword_query = f" {LOGIC} ".join(quoted_keywords)
    final_query = f"({keyword_query})"
    if START_YEAR and END_YEAR:
        start_date = f"{START_YEAR}01010000"
        end_date = f"{END_YEAR}12312359"
        final_query += f" AND submittedDate:[{start_date} TO {end_date}]"
        print(f"Year filter enabled: {START_YEAR} - {END_YEAR}")
    return final_query
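
# With the default configuration above, build_query() returns:
#   ("Transformer" AND "Attention Mechanism") AND submittedDate:[202401010000 TO 202512312359]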

def download_papers():
    # 1. Make sure the base directory exists
    if not os.path.exists(BASE_DOWNLOAD_DIR):
        os.makedirs(BASE_DOWNLOAD_DIR)
    # 2. Build the query and run the search
    final_query = build_query()
    print(f"Final query: '{final_query}'")
    client = arxiv.Client()
    search = arxiv.Search(
        query=final_query,
        max_results=MAX_RESULTS,
        sort_by=SORT_BY
    )
    results = list(client.results(search))
    print(f"Found {len(results)} papers, starting download...\n")
    # 3. Iterate and download
    for i, result in enumerate(results):
        try:
            # --- Extract metadata ---
            year = result.published.year
            # Venue extraction: returns e.g. CVPR, Journal_Paper or Conference_Paper
            category_name = extract_category_info(result)
            # --- Build the target directory ---
            # e.g. ./downloaded_papers/CVPR
            target_dir = os.path.join(BASE_DOWNLOAD_DIR, category_name)
            if not os.path.exists(target_dir):
                os.makedirs(target_dir)
            # --- Build the filename ---
            # Use the paper title as the main filename
            # Format: Title_Year_Venue.pdf
            clean_title = sanitize_filename(result.title)
            file_name = f"{clean_title}_{year}_{category_name}.pdf"
            save_path = os.path.join(target_dir, file_name)
            # --- Download ---
            if os.path.exists(save_path):
                print(f"[{i+1}/{len(results)}] Skipped (already exists): {category_name}/{file_name}")
                continue
            print(f"[{i+1}/{len(results)}] Downloading: {result.title[:50]}...")
            print(f"  -> Directory: {category_name}")
            print(f"  -> File: {file_name}")
            result.download_pdf(dirpath=target_dir, filename=file_name)
        except Exception as e:
            print(f"  X Download failed: {e}")
    print("\nAll done!")

if __name__ == "__main__":
    download_papers()
```
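The script's only third-party dependency is the `arxiv` package, a Python wrapper around the arXiv API (`pip install arxiv`); everything else is standard library. Edit the configuration block at the top, then run the file directly with Python.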
The script is concise and its logic is simple, which makes it a good fit for straightforward needs. It is also efficient: it classifies and downloads papers faster than doing the same work by hand. Since it is built on arXiv's free API rather than on web scraping, you can use it with confidence.
It is best suited to individual students.