跟沐神学读论文-论文阅读管理

摘要

近期有读论文的需求，就需要了解论文到底要怎么读，以及同一个系列的论文如何整理和归纳。之前也了解过市面上成熟的论文阅读工具，但是对于学生党来讲没什么性价比。在B站上看到沐神讲解了他的思路：用Typora作为工作中的md生产工具。对此我有一点浅显的认识，希望和大家交流学习。Typora可以作为编辑工具，之前被同事安利过，但是那个时候对md格式还不了解，今天重新拾起。简单来讲，我的做法是用Typora+gitee形成一套云端存储的方案，再配套使用我自己的一些脚本，可以很好地实现论文阅读管理的功能。

一：Typora的安装

Typora 是一个所见即所得的 Markdown 跨平台写作工具，目前已经发布正式版，并且更改为付费模式，0.11.18_beta 是最后一个免费的测试版，有需要的可以选择下载。

Windows 用户

下载地址：https://github.com/iuxt/src/releases/download/2.0/typora-0-11-18.exe

0.11.18 现在被远程施法了，会提示过期无法使用,可以使用 0.9.96 版

下载地址：https://github.com/iuxt/src/releases/download/2.0/typora-setup-x64_0.9.96.exe

Mac 用户

下载地址： https://github.com/iuxt/src/releases/download/2.0/typora-0-11-18.dmg

Ubuntu 用户

下载地址：https://github.com/iuxt/src/releases/download/2.0/Typora_Linux_0.11.18_amd64.deb

安装方法

使用 apt 安装：

复制代码

sudo apt install ./Typora_Linux_0.11.18_amd64.deb

如此你就完成了笔记编辑器的安装。

二：Gitee的配置

在 https://gitee.com 注册账户并登录，然后新建仓库。

（原文此处有一张配图）在本地新建一个文件夹，并在该文件夹下打开命令行：

bash 复制代码

# Git global settings (replace the placeholder name and e-mail with your own):

git config --global user.name "YourName"
git config --global user.email "YourInfo@user.noreply.gitee.com"

# Create a new git repository, commit a README and push it to the Gitee remote:

mkdir paper
cd paper
git init 
touch README.md
git add README.md
git commit -m "first commit"
git remote add origin https://gitee.com/YourName/paper.git
git push -u origin "master"

# Already have a local repository? Just add the remote and push:

cd existing_git_repo
git remote add origin https://gitee.com/YourName/paper.git
git push -u origin "master"

如此每次更改后可以配合gitee去作同步。

三：脚本

脚本一：通过arxiv自动下载论文，提取论文标题，作者，日期，引用数等

arxiv_2_md.py

python 复制代码

#!/usr/bin/env python3
import os
import re
import requests
import arxiv
from urllib.parse import urlparse, quote

def extract_arxiv_id(url: str) -> str:
    """
    Extract the arXiv identifier from an arXiv link.

    Both abstract links (``/abs/<id>``) and PDF links (``/pdf/<id>``) are
    accepted; ``/abs/`` is tried first. Only the ``<digits>.<digits>`` part
    is returned, so a trailing version suffix (``v2``) or ``.pdf`` extension
    is dropped. Example: ``https://arxiv.org/abs/1605.08386`` gives
    ``1605.08386``.

    Links pasted without a scheme (``arxiv.org/abs/1605.08386``) are accepted
    too, and the host check ignores letter case.

    Raises:
        ValueError: if the host does not contain ``arxiv.org`` or no
            identifier can be found in the path.
    """
    url = url.strip()
    parsed = urlparse(url)
    if not parsed.netloc and "://" not in url:
        # Without a scheme urlparse() leaves netloc empty and puts the host
        # into the path, which would wrongly reject a valid arXiv link.
        parsed = urlparse("https://" + url)
    if 'arxiv.org' not in parsed.netloc.lower():
        raise ValueError("这不是一个有效的arXiv链接。")
    # Keep the original precedence: an /abs/ match wins over a /pdf/ match.
    for section in ("abs", "pdf"):
        match = re.search(rf'/{section}/([0-9]+\.[0-9]+)', parsed.path)
        if match:
            return match.group(1)
    raise ValueError("未能从链接中提取到arXiv ID。")

def fetch_arxiv_metadata(arxiv_id: str):
    """
    Look up a paper on arXiv by its ID using the ``arxiv`` Python package.

    Returns:
        A dict with the keys ``title`` (whitespace-stripped), ``authors``
        (list of author names), ``year`` (year of ``published``),
        ``journal`` (the paper's ``journal_ref``, or ``"N/A"`` when empty)
        and ``pdf_url``.

    Raises:
        ValueError: if the search yields no result for ``arxiv_id``.
    """
    search = arxiv.Search(id_list=[arxiv_id])
    # Search.results() is deprecated in the arxiv package (it emits a
    # DeprecationWarning pointing at Client.results), so prefer the client
    # API and only fall back on versions that do not provide it.
    client_cls = getattr(arxiv, "Client", None)
    if client_cls is not None and hasattr(client_cls, "results"):
        results = client_cls().results(search)
    else:
        results = search.results()
    paper = next(results, None)
    if paper is None:
        raise ValueError("未能在arXiv找到对应论文信息。")
    return {
        "title": paper.title.strip(),
        "authors": [au.name for au in paper.authors],
        "year": paper.published.year,
        "journal": paper.journal_ref if paper.journal_ref else "N/A",
        "pdf_url": paper.pdf_url
    }

def download_pdf(pdf_url: str, save_dir: str = "./pdfs") -> str:
    """
    Download a PDF into ``save_dir`` and return the path of the saved file.

    The file name is the last component of the URL path, with ``.pdf``
    appended when it is missing. ``save_dir`` is created if needed.

    Raises:
        requests.RequestException: on connection problems, on a timeout, or
            (via ``raise_for_status``) on an HTTP error status.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(save_dir, exist_ok=True)
    # Use only the URL path so a query string or fragment never leaks into
    # the local file name.
    basename = os.path.basename(urlparse(pdf_url).path)
    if not basename.endswith(".pdf"):
        basename += ".pdf"
    local_filename = os.path.join(save_dir, basename)
    # requests never times out by default; bound the wait so a stalled
    # connection cannot hang the script forever.
    r = requests.get(pdf_url, timeout=60)
    r.raise_for_status()
    with open(local_filename, 'wb') as f:
        f.write(r.content)
    return local_filename

def fetch_citation_count_by_arxiv_id(arxiv_id: str) -> int:
    """
    Ask the Semantic Scholar Graph API for the citation count of
    ``ArXiv:<arxiv_id>``.

    This is a best-effort lookup: a non-200 response, a network failure, a
    timeout, an unparsable body or a missing/null ``citationCount`` all
    yield ``0``.
    """
    url = f"https://api.semanticscholar.org/graph/v1/paper/ArXiv:{arxiv_id}?fields=citationCount"
    try:
        # requests has no default timeout; bound the wait explicitly.
        r = requests.get(url, timeout=30)
        if r.status_code != 200:
            return 0
        data = r.json()
    except (requests.RequestException, ValueError):
        # The count is optional metadata, so a failed lookup must not abort
        # the caller; report "unknown" as 0 like the non-200 path does.
        return 0
    # ``or 0`` also covers an explicit JSON null, keeping the int contract.
    return data.get("citationCount") or 0

def fetch_citation_count_by_title(title: str) -> int:
    """
    Fallback citation lookup: search Semantic Scholar by paper title.

    Only the first search hit is considered, and its count is used only when
    its title equals ``title`` after lower-casing and trimming both sides.
    Anything else (no hits, different title, non-200 response, network
    failure, timeout, unparsable body, missing/null count) yields ``0``.
    """
    query = quote(title)
    url = f"https://api.semanticscholar.org/graph/v1/paper/search?query={query}&fields=title,citationCount"
    try:
        # requests has no default timeout; bound the wait explicitly.
        r = requests.get(url, timeout=30)
        if r.status_code != 200:
            return 0
        papers = r.json().get("data") or []
    except (requests.RequestException, ValueError):
        # Best-effort lookup: a failed search is reported as 0, matching the
        # non-200 path, instead of aborting the caller.
        return 0
    if not papers:
        return 0
    best_match = papers[0]
    # .get() with a fallback guards against a hit without a (non-null) title.
    found_title = best_match.get("title") or ""
    if found_title.lower().strip() == title.lower().strip():
        return best_match.get("citationCount") or 0
    return 0

def fetch_citation_count(arxiv_id: str, title: str) -> int:
    """
    Return the citation count for a paper.

    The lookup by arXiv ID is tried first; only when it reports exactly 0 is
    the title-based search used instead.
    """
    by_id = fetch_citation_count_by_arxiv_id(arxiv_id)
    # A zero from the ID lookup means "nothing found or no citations", so
    # give the title search a chance; any other value is returned as is.
    return fetch_citation_count_by_title(title) if by_id == 0 else by_id

def generate_markdown(md_filename: str, title: str, authors: list, journal: str, year: int, local_pdf_path: str, citation_count: int, arxiv_url: str):
    """
    Write a Markdown note describing one paper to ``md_filename``.

    The note contains the title as a level-1 heading, then a bullet list
    with the authors (comma separated), venue/journal, year, a link to the
    local PDF and a link to the original arXiv page, followed by the
    citation count. The file is written as UTF-8 and overwritten if it
    already exists.

    The local PDF link is written relative to the directory of
    ``md_filename`` so it still resolves when the note is not saved in the
    current working directory. Forward slashes are used on every platform.
    """
    authors_str = ", ".join(authors)
    # A relative link inside a Markdown file is resolved against the file's
    # own directory, so that is the correct base rather than the cwd. For a
    # bare file name this falls back to the cwd, as before.
    md_dir = os.path.dirname(md_filename) or os.curdir
    rel_pdf_path = os.path.relpath(local_pdf_path, start=md_dir).replace(os.sep, "/")
    with open(md_filename, 'w', encoding='utf-8') as f:
        f.write(f"# {title}\n\n")
        f.write(f"- **Authors:** {authors_str}\n")
        f.write(f"- **Venue/Journal:** {journal}\n")
        f.write(f"- **Year:** {year}\n")
        f.write(f"- **Local PDF:** [{rel_pdf_path}]({rel_pdf_path})\n")
        f.write(f"- **ArXiv Link:** [{arxiv_url}]({arxiv_url})\n\n")
        f.write(f"**Citations:** {citation_count}\n")

def main():
    """
    Interactive entry point.

    Prompts for an arXiv link, fetches the paper's metadata, downloads the
    PDF, looks up the citation count, asks for the Markdown file name (the
    arXiv ID is used when the answer is empty) and writes the note.
    """
    arxiv_url = input("请输入arXiv链接：").strip()
    arxiv_id = extract_arxiv_id(arxiv_url)

    # Gather everything needed for the note: metadata, local PDF, citations.
    meta = fetch_arxiv_metadata(arxiv_id)
    local_pdf = download_pdf(meta["pdf_url"])
    citation_count = fetch_citation_count(arxiv_id, meta["title"])

    # Let the user pick the note's name; an empty answer keeps the default.
    default_md_name = f"{arxiv_id}.md"
    md_name_input = input(f"请输入要保存的Markdown文件名(不需扩展名，留空则使用 {default_md_name[:-3]}): ").strip()
    md_filename = f"{md_name_input}.md" if md_name_input else default_md_name

    generate_markdown(
        md_filename=md_filename,
        title=meta["title"],
        authors=meta["authors"],
        journal=meta["journal"],
        year=meta["year"],
        local_pdf_path=local_pdf,
        citation_count=citation_count,
        arxiv_url=arxiv_url,
    )
    print(f"Markdown文件已生成：{md_filename}")


if __name__ == "__main__":
    main()

运行：

bash 复制代码

python ./arxiv_2_md.py

如果在输入arXiv后报错：

bash 复制代码

python ./arxiv_to_md_1.2.py 
请输入arXiv链接：https://arxiv.org/abs/2410.24207
/home/crist/WorkSpace/3D-reconstruction-paper/./arxiv_to_md_1.2.py:30: DeprecationWarning: The 'Search.results' method is deprecated, use 'Client.results' instead
  paper = next(search.results(), None)
Traceback (most recent call last):
  File "/home/crist/WorkSpace/3D-reconstruction-paper/./arxiv_to_md_1.2.py", line 150, in <module>
    main()
  File "/home/crist/WorkSpace/3D-reconstruction-paper/./arxiv_to_md_1.2.py", line 125, in main
    local_pdf = download_pdf(meta["pdf_url"])
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/crist/WorkSpace/3D-reconstruction-paper/./arxiv_to_md_1.2.py", line 56, in download_pdf
    r = requests.get(pdf_url)
        ^^^^^^^^^^^^^^^^^^^^^
  File "/home/crist/miniconda3/lib/python3.12/site-packages/requests/api.py", line 73, in get
    return request("get", url, params=params, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/crist/miniconda3/lib/python3.12/site-packages/requests/api.py", line 59, in request
    return session.request(method=method, url=url, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/crist/miniconda3/lib/python3.12/site-packages/requests/sessions.py", line 589, in request
    resp = self.send(prep, **send_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/crist/miniconda3/lib/python3.12/site-packages/requests/sessions.py", line 703, in send
    r = adapter.send(request, **kwargs)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/crist/miniconda3/lib/python3.12/site-packages/requests/adapters.py", line 633, in send
    conn = self.get_connection_with_tls_context(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/crist/miniconda3/lib/python3.12/site-packages/requests/adapters.py", line 483, in get_connection_with_tls_context
    proxy_manager = self.proxy_manager_for(proxy)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/crist/miniconda3/lib/python3.12/site-packages/requests/adapters.py", line 282, in proxy_manager_for
    manager = self.proxy_manager[proxy] = SOCKSProxyManager(
                                          ^^^^^^^^^^^^^^^^^^
  File "/home/crist/miniconda3/lib/python3.12/site-packages/urllib3/contrib/socks.py", line 212, in __init__
    raise ValueError(f"Unable to determine SOCKS version from {proxy_url}")
ValueError: Unable to determine SOCKS version from socks://127.0.0.1:7890/

解决办法：报错的原因是环境变量里的代理地址写成了 socks://，urllib3 无法从中判断 SOCKS 的版本。把代理地址改成明确的 socks5:// 即可（地址和端口请按自己的代理设置修改）：

bash 复制代码

export ALL_PROXY=socks5://127.0.0.1:7890
export HTTP_PROXY=socks5://127.0.0.1:7890
export HTTPS_PROXY=socks5://127.0.0.1:7890

脚本二：

提取PDF中的图片，将我的脚本和pdf文件放到一起：

python 复制代码

#!/usr/bin/env python3
import os
import subprocess
import tkinter as tk
from tkinter import messagebox, font

def list_pdfs(directory="."):
    """Return the names of all entries in ``directory`` ending in ``.pdf`` (case-insensitive)."""
    found = []
    for entry in os.listdir(directory):
        if entry.lower().endswith('.pdf'):
            found.append(entry)
    return found

def extract_images(pdf_path, output_dir="images"):
    """
    Extract the images of a PDF with the external ``pdfimages`` tool.

    ``output_dir`` is created if needed. The tool is invoked as
    ``pdfimages -j <pdf_path> <output_dir>/<pdf base name>``, so the output
    files are named after the PDF.

    Returns:
        A ``(success, message)`` tuple. ``success`` is ``False`` when the
        tool exits with a non-zero status or cannot be started at all
        (for example because it is not installed).
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_prefix = os.path.join(output_dir, base_name)
    cmd = ["pdfimages", "-j", pdf_path, output_prefix]
    try:
        subprocess.run(cmd, check=True)
    except (subprocess.CalledProcessError, OSError):
        # CalledProcessError covers a non-zero exit status. A missing
        # executable raises FileNotFoundError (an OSError) instead, which is
        # exactly the "tool not installed" case the message describes.
        return False, "提取图片失败，请确保已安装pdfimages工具。"
    return True, f"图片已提取到 {output_dir} 目录中，以 {base_name}-xxx 的形式命名。"

def on_extract():
    """
    Button callback: extract the images of the PDF selected in the listbox.

    Shows a warning when nothing usable is selected, otherwise runs
    ``extract_images`` on the selected file and reports the outcome in an
    info or error dialog.
    """
    selection = listbox.curselection()
    # When no PDFs were found the listbox holds only a placeholder row with
    # no counterpart in ``pdfs``; indexing with it would raise IndexError,
    # so treat it like an empty selection.
    if not selection or selection[0] >= len(pdfs):
        messagebox.showwarning("警告", "请先选择一个PDF文件")
        return
    pdf_file = pdfs[selection[0]]
    success, msg = extract_images(pdf_file)
    if success:
        messagebox.showinfo("提取完成", msg)
    else:
        messagebox.showerror("错误", msg)

# Build the main window of the PDF image extractor.
root = tk.Tk()
root.title("PDF图片提取器")

# Set the global font
root.option_add("*Font", "Helvetica 12")

# PDFs found in the current directory; on_extract() indexes into this list.
pdfs = list_pdfs(".")

# Container for the prompt label and the file list.
frame = tk.Frame(root)
frame.pack(padx=10, pady=10, fill="both", expand=True)

label = tk.Label(frame, text="请选择一个PDF文件：", font=("Helvetica", 12, "bold"))
label.pack(anchor="w")

listbox = tk.Listbox(frame, height=10)
listbox.pack(fill="both", expand=True)

# One listbox row per PDF, in the same order as the `pdfs` list.
for pdf in pdfs:
    listbox.insert(tk.END, pdf)

# With no PDFs, show a placeholder row; it has no matching entry in `pdfs`.
if not pdfs:
    listbox.insert(tk.END, "当前目录未找到PDF文件")

# Button row below the list; the button triggers on_extract().
btn_frame = tk.Frame(root)
btn_frame.pack(pady=5)
extract_btn = tk.Button(btn_frame, text="提取图片", font=("Helvetica", 12))
extract_btn.config(command=on_extract)
extract_btn.pack()

# Hand control to the Tk event loop.
root.mainloop()

这样就可以把图片保存到images文件夹下了