【1902】process_assignment_pdf()

帮我改进process_assignment_pdf(pdf_path)函数。
1.将assignment文件夹里的问题，根据题型分类，分成选择题、简答题、判断题、其它题型；

2.设置一个参数，用于评估其他题型转换为选择题的可行性，设置阈值，如果参数大于阈值，则不进行转化，抛弃这道非选择题；

3.将简答题转化成选择题，设置至少三个选项，使用llm生成选项和正确答案（如果没有给出参考答案）；

4.将判断题转化为选择题，设置正确/错误两个选项，使用llm判断正误（如果没有给出参考答案）；

5.如果是其它题型，且参数小于等于阈值，将题目转化成选择题，设置至少三个选项，使用llm生成选项和正确答案（如果没有给出参考答案）。

python 复制代码

import re
import json
import fitz

# ---------------------------
# Question-type classification constants
# ---------------------------
QTYPE_MCQ = "mcq"     # multiple-choice question
QTYPE_SHORT = "short" # short-answer question
QTYPE_TF = "tf"       # true/false question
QTYPE_OTHER = "other" # any other question type

# ---------------------------
# 主函数：改进版
# ---------------------------
def process_assignment_pdf(
    pdf_path,
    *,
    use_llm: bool = USE_LLM,
    other_type_risk_threshold: float = 0.6,
    min_options: int = 3,
    max_options: int = 5,
):
    """
    Turn the questions found in an assignment PDF into multiple-choice items.

    Pipeline:
      1) read the text of every page and split it into question blocks;
      2) classify each block as multiple-choice / true-false / short-answer /
         other;
      3) convert true-false, short-answer and low-risk "other" questions
         into multiple-choice form;
      4) return one uniform record per kept question (the original notes say
         the result is meant for ``assign_quizzes_to_slides``).

    Args:
        pdf_path: Path handed to ``fitz.open``.
        use_llm: Forwarded to the LLM-backed helpers (``llm_judge_tf``,
            ``llm_convert_to_mcq``, ``assess_other_type_risk``).
        other_type_risk_threshold: An "other" question is dropped when its
            risk score is strictly greater than this value.
        min_options: Forwarded to ``llm_convert_to_mcq``.
        max_options: Forwarded to ``llm_convert_to_mcq``.

    Returns:
        A list of dicts with the keys ``question``, ``options``, ``correct``,
        ``source`` (always ``"assignment"``) and ``qtype``. Records for
        "other" questions additionally carry ``risk_score`` rounded to three
        decimals. Questions that cannot be converted or judged are omitted.
    """
    # Collect the text of the whole PDF as right-stripped lines. The context
    # manager closes the document even if text extraction raises, so the file
    # handle is never leaked.
    all_lines = []
    with fitz.open(pdf_path) as doc:
        for page in doc:
            all_lines.extend(ln.rstrip() for ln in page.get_text().split("\n"))

    # Best-effort answer key lookup: {question number: answer string}. An
    # empty map is fine; the LLM helpers are the fallback.
    answer_map = extract_answer_key(all_lines)

    # Split the flat line list into per-question blocks.
    blocks = split_into_question_blocks(all_lines)

    results = []
    for blk in blocks:
        qnum = blk.get("qnum")                # question number, may be None
        qtext = blk["question_text"].strip()  # stem plus any sub-questions
        raw_lines = blk["lines"]              # untouched lines of the block

        # A block that already carries at least two lettered options is
        # treated as a native multiple-choice question.
        options = parse_mcq_options(raw_lines)
        if options and len(options) >= 2:
            qtype = QTYPE_MCQ
        else:
            qtype = classify_non_mcq_type(qtext, raw_lines)

        # Reference answer from the answer key, when one was recognised.
        ref_ans = None
        if qnum is not None and qnum in answer_map:
            ref_ans = answer_map[qnum]

        # ---- handle each question type ----
        if qtype == QTYPE_MCQ:
            # Without a reference answer the correct key defaults to "A";
            # this mirrors the original design choice (dropping the question
            # would be the stricter alternative).
            correct = normalize_mcq_answer(ref_ans, options) if ref_ans else "A"
            results.append({
                "question": strip_options_from_text(qtext),
                "options": normalize_options_dict(options),
                "correct": correct,
                "source": "assignment",
                "qtype": QTYPE_MCQ
            })
            continue

        if qtype == QTYPE_TF:
            # True/false -> two-option multiple choice.
            tf_options = {"A": "正确", "B": "错误"}
            tf_correct = normalize_tf_answer(ref_ans) if ref_ans else None
            if tf_correct is None:
                # No reference answer, or it could not be parsed: ask the LLM.
                tf_correct = llm_judge_tf(qtext, use_llm=use_llm)

            if tf_correct is None:
                # LLM unavailable or undecided: drop the question rather than
                # store a guessed answer.
                continue

            results.append({
                "question": qtext,
                "options": tf_options,
                "correct": tf_correct,
                "source": "assignment",
                "qtype": QTYPE_TF
            })
            continue

        if qtype == QTYPE_SHORT:
            # Short answer -> multiple choice generated by the LLM.
            mcq = llm_convert_to_mcq(
                qtext,
                reference_answer=ref_ans,
                min_options=min_options,
                max_options=max_options,
                use_llm=use_llm
            )
            if mcq is None:
                continue
            results.append({
                "question": qtext,
                "options": mcq["options"],
                "correct": mcq["correct"],
                "source": "assignment",
                "qtype": QTYPE_SHORT
            })
            continue

        # Remaining case: QTYPE_OTHER. Convert only when the risk estimate is
        # available and does not exceed the threshold.
        risk_score = assess_other_type_risk(qtext, raw_lines, use_llm=use_llm)
        if risk_score is None:
            # Risk could not be assessed: be conservative and drop.
            continue
        if risk_score > other_type_risk_threshold:
            # Too risky to convert: drop.
            continue

        mcq = llm_convert_to_mcq(
            qtext,
            reference_answer=ref_ans,
            min_options=min_options,
            max_options=max_options,
            use_llm=use_llm
        )
        if mcq is None:
            continue

        results.append({
            "question": qtext,
            "options": mcq["options"],
            "correct": mcq["correct"],
            "source": "assignment",
            "qtype": QTYPE_OTHER,
            "risk_score": round(float(risk_score), 3)
        })

    return results


# =========================================================
# 下面是 helper：题块切分 / 分类 / 选项解析 / 答案提取 / LLM
# =========================================================

def split_into_question_blocks(lines):
    """
    Group a flat list of PDF text lines into per-question blocks.

    A new block starts on every line accepted by ``is_question_start_line``.
    Non-blank lines that follow are appended to the current block; lines that
    appear before the first question start, and blank lines, are ignored.

    Returns:
        A list of dicts with the keys ``qnum`` (from
        ``extract_question_number``), ``lines`` (the block's raw lines) and
        ``question_text`` (those lines joined with newlines).
    """
    collected = []

    for raw in lines:
        if is_question_start_line(raw):
            collected.append({
                "qnum": extract_question_number(raw),
                "lines": [raw],
                "question_text": "",
            })
            continue
        # Continuation line: keep it only if a block is open and it has content.
        if collected and raw.strip():
            collected[-1]["lines"].append(raw)

    # Every block owns at least its start line, so the text can be built last.
    for entry in collected:
        entry["question_text"] = "\n".join(entry["lines"])

    return collected


def is_question_start_line(line: str) -> bool:
    """
    Report whether a line looks like the first line of a numbered question.

    The line is stripped and tested against four prefixes: ``1. text`` /
    ``1) text``, ``Question 1:``, ``问题1`` and ``Exercise 1:``.
    """
    stripped = line.strip()
    for pattern in (
        r'^\d+\s*[\.\)]\s+.+',        # 1. xxx / 1) xxx
        r'^[Qq]uestion\s+\d+[:\.\)]', # Question 1:
        r'^问题\s*\d+[:\.\)]?',       # 问题1
        r'^Exercise\s+\d+[:\.\)]',    # Exercise 1:
    ):
        if re.match(pattern, stripped):
            return True
    return False


def extract_question_number(line: str):
    """
    Pull the question number out of a question-start line.

    The stripped line is tried against the numbered, ``Question``, ``问题``
    and ``Exercise`` prefixes in that order; the first hit wins.

    Returns:
        The number as an int, or None when no prefix matches.
    """
    stripped = line.strip()
    prefixes = (
        r'^(\d+)\s*[\.\)]',
        r'^[Qq]uestion\s+(\d+)',
        r'^问题\s*(\d+)',
        r'^Exercise\s+(\d+)',
    )
    for prefix in prefixes:
        hit = re.match(prefix, stripped)
        if hit:
            return int(hit.group(1))
    return None


def parse_mcq_options(block_lines):
    """
    Collect lettered answer options (A-E) from the lines of a question block.

    Recognised shapes are ``A. text``, ``A) text`` and ``(A) text``. When a
    letter occurs more than once, the last occurrence wins.

    Returns:
        A dict mapping the option letter to its stripped text; empty when no
        line looks like an option.
    """
    option_pattern = r'^\(?\s*([A-E])\s*[\.\)]\s*(.+)$'
    found = {}
    for raw in block_lines:
        hit = re.match(option_pattern, raw.strip())
        if hit is None:
            continue
        letter, body = hit.groups()
        body = body.strip()
        if body:
            found[letter] = body
    return found


def normalize_options_dict(options: dict) -> dict:
    """
    Return the options restricted to the keys A-E, in alphabetical order.

    Values are converted to ``str`` and stripped; entries whose text is empty
    after stripping are left out. A falsy input yields an empty dict.
    """
    if not options:
        return {}
    return {
        letter: str(options[letter]).strip()
        for letter in ("A", "B", "C", "D", "E")
        if letter in options and str(options[letter]).strip()
    }


def strip_options_from_text(qtext: str) -> str:
    """
    Remove lettered option lines from a question's text, leaving the stem.

    The option pattern is the same one ``parse_mcq_options`` uses (optional
    whitespace after the ``A.`` / ``A)`` / ``(A)`` marker), so every line that
    is parsed as an option is also removed here and is not shown twice.

    Args:
        qtext: Question text with lines separated by ``"\\n"``.

    Returns:
        The remaining lines joined with newlines and stripped.
    """
    option_line = re.compile(r'^\(?\s*[A-E]\s*[\.\)]\s*.+$')
    kept = [ln for ln in qtext.split("\n") if not option_line.match(ln.strip())]
    return "\n".join(kept).strip()


def classify_non_mcq_type(qtext: str, block_lines) -> str:
    """
    Classify a question that has no lettered options.

    The lower-cased text is scanned for true/false keywords first; failing
    that, short-answer keywords or lettered sub-parts such as ``(a)`` make it
    a short-answer question. Everything else is "other".

    Returns:
        ``QTYPE_TF``, ``QTYPE_SHORT`` or ``QTYPE_OTHER``.
    """
    lowered = qtext.lower()

    # True/false markers take precedence over everything else.
    tf_signals = (
        "true or false", "true/false", "t/f", "判断", "对错", "正确或错误",
        "is it true", "是否正确"
    )
    for marker in tf_signals:
        if marker in lowered:
            return QTYPE_TF

    # Short-answer markers (the list can be extended).
    short_signals = (
        "explain", "why", "justify", "derive", "prove", "show that",
        "calculate", "compute", "简答", "解释", "证明", "推导", "计算"
    )

    # Lettered sub-parts (a)(b)(c) are typical of short-answer / composite
    # questions.
    subpart_found = False
    for raw in block_lines:
        if re.match(r'^\(?[a-h]\)\s+', raw.strip().lower()):
            subpart_found = True
            break

    mentions_short = any(marker in lowered for marker in short_signals)
    return QTYPE_SHORT if (mentions_short or subpart_found) else QTYPE_OTHER


def extract_answer_key(lines):
    """
    Best-effort extraction of an answer key from the document's lines.

    The answer section starts at the first line that, stripped and
    lower-cased, is exactly one of the known headings (e.g. ``answers``,
    ``参考答案``). From there at most 500 lines are scanned for entries such as
    ``1. A``, ``2) C`` or ``3. True``.

    Returns:
        ``{question number: answer}``. Letter answers are upper-cased;
        true/false answers keep the spelling found in the text. Empty when no
        heading is found.
    """
    headings = ("answers", "answer key", "solutions", "solution", "参考答案", "答案", "解答")
    start = next(
        (idx for idx, raw in enumerate(lines) if raw.strip().lower() in headings),
        None,
    )
    found = {}
    if start is None:
        return found

    # Bounded window so unrelated text far below the heading is not picked up.
    for raw in lines[start:start+500]:
        text = raw.strip()

        letter_hit = re.match(r'^(\d+)\s*[\.\)]\s*([A-E])\b', text, flags=re.I)
        if letter_hit:
            found[int(letter_hit.group(1))] = letter_hit.group(2).upper()
            continue

        tf_hit = re.match(r'^(\d+)\s*[\.\)]\s*(True|False|T|F|正确|错误)\b', text, flags=re.I)
        if tf_hit:
            found[int(tf_hit.group(1))] = tf_hit.group(2)

    return found


def normalize_mcq_answer(ref_ans, options_dict):
    """
    Reduce a reference answer to a single option letter A-E.

    The answer is accepted when it is exactly one letter, or when a
    standalone letter can be found inside it (e.g. ``Answer: B``). If
    ``options_dict`` is non-empty the letter must also be one of its keys.

    Returns:
        The option letter, or ``"A"`` whenever nothing usable is found.
    """
    if ref_ans is None:
        return "A"

    def is_allowed(letter):
        # An empty or missing options dict places no restriction on the letter.
        return not options_dict or letter in options_dict

    cleaned = str(ref_ans).strip().upper()
    if cleaned in ("A", "B", "C", "D", "E") and is_allowed(cleaned):
        return cleaned

    # The letter may be embedded in a phrase such as "Answer: B".
    embedded = re.search(r'\b([A-E])\b', cleaned)
    if embedded is not None:
        candidate = embedded.group(1).upper()
        if is_allowed(candidate):
            return candidate

    return "A"


def normalize_tf_answer(ref_ans):
    """
    Map a true/false reference answer onto the two-option key.

    Returns:
        ``"A"`` for a "true" answer, ``"B"`` for a "false" answer, and None
        when the answer is missing or not recognised.
    """
    if ref_ans is None:
        return None

    token = str(ref_ans).strip().lower()
    for key, spellings in (
        ("A", ("true", "t", "正确", "对", "yes")),
        ("B", ("false", "f", "错误", "错", "no")),
    ):
        if token in spellings:
            return key
    return None


# ---------------------------
# 其它题型转换可行性评估（risk_score）
# ---------------------------
def assess_other_type_risk(qtext: str, block_lines, *, use_llm: bool):
    """
    Estimate how unsuitable a question is for conversion to multiple choice.

    With ``use_llm=True`` the estimate is delegated to ``llm_assess_risk``.
    Otherwise a heuristic adds up four contributions: text length (capped at
    0.4), number of lettered sub-parts (capped at 0.3), presence of code-like
    lines (0.2) and open-ended keywords (0.2).

    Returns:
        The heuristic score clamped to [0.0, 1.0], or whatever
        ``llm_assess_risk`` returns on the LLM path.
    """
    if use_llm:
        return llm_assess_risk(qtext)

    # Count lines that open with a lettered sub-part marker such as "(a) ".
    subpart_count = 0
    for raw in block_lines:
        if re.match(r'^\(?[a-h]\)\s+', raw.strip().lower()):
            subpart_count += 1

    # Any line containing a code marker counts as code-like.
    looks_like_code = False
    for raw in block_lines:
        if "def " in raw or "```" in raw or "import " in raw:
            looks_like_code = True
            break

    lowered = qtext.lower()
    is_open_ended = False
    for keyword in ("design", "open-ended", "discuss", "analyze", "prove", "derive"):
        if keyword in lowered:
            is_open_ended = True
            break

    length_part = min(0.4, len(qtext) / 2000.0)
    subpart_part = min(0.3, subpart_count * 0.08)
    code_part = 0.2 if looks_like_code else 0.0
    open_part = 0.2 if is_open_ended else 0.0

    # Added left to right in this fixed order to keep the float result stable.
    total = 0.0 + length_part + subpart_part + code_part + open_part
    return max(0.0, min(1.0, total))


# ---------------------------
# LLM：判断题判定 / 转MCQ / 风险评估
# 你只要把 call_llm_json(...) 接上你自己的LLM即可
# ---------------------------
def llm_judge_tf(question_text: str, *, use_llm: bool):
    """
    Ask the LLM whether a true/false statement is true.

    Sends a ``judge_true_false`` job to ``call_llm_json`` and expects a
    payload such as ``{"answer": "true"}`` or ``{"answer": "false"}``.

    Returns:
        ``"A"`` for true, ``"B"`` for false, and None when ``use_llm`` is
        False or the reply is not one of the recognised spellings.
    """
    if not use_llm:
        return None

    reply = call_llm_json({
        "task": "judge_true_false",
        "question": question_text
    })
    verdict = str((reply or {}).get("answer", "")).strip().lower()

    if verdict in ["true", "t", "正确"]:
        return "A"
    if verdict in ["false", "f", "错误"]:
        return "B"
    return None


def llm_convert_to_mcq(
    question_text: str,
    *,
    reference_answer: str = None,
    min_options: int = 3,
    max_options: int = 5,
    use_llm: bool
):
    """
    Ask the LLM to rewrite a question as a multiple-choice item.

    Sends a ``convert_to_mcq`` job to ``call_llm_json`` and expects a payload
    shaped like ``{"options": {"A": "...", "B": "...", ...}, "correct": "A"}``.

    Returns:
        ``{"options": {...}, "correct": "<letter>"}`` with stripped,
        non-empty option texts taken from the keys A-F and cut off once
        ``max_options`` entries are collected. None is returned when
        ``use_llm`` is False, the payload is empty or malformed, fewer than
        ``min_options`` options remain, or the correct key is not among the
        kept options.
    """
    if not use_llm:
        return None

    reply = call_llm_json({
        "task": "convert_to_mcq",
        "question": question_text,
        "reference_answer": reference_answer,
        "min_options": min_options,
        "max_options": max_options
    })
    if not reply:
        return None

    raw_options = reply.get("options")
    raw_correct = reply.get("correct")

    if not isinstance(raw_options, dict):
        return None
    if len(raw_options) < min_options:
        return None

    answer_key = str(raw_correct).strip().upper()
    if answer_key not in raw_options:
        return None

    # Keep clean option texts in letter order and stop at max_options.
    kept = {}
    for letter in ("A", "B", "C", "D", "E", "F"):
        if letter in raw_options:
            text = str(raw_options[letter]).strip()
            if text:
                kept[letter] = text
        if len(kept) >= max_options:
            break

    # Re-validate after cleaning: blank or truncated options may have been lost.
    if len(kept) < min_options or answer_key not in kept:
        return None

    return {"options": kept, "correct": answer_key}


def llm_assess_risk(question_text: str):
    """
    Ask the LLM how risky it is to convert a question into multiple choice.

    Sends an ``assess_conversion_risk`` job to ``call_llm_json`` and expects
    a payload shaped like ``{"risk_score": <0.0-1.0>}``.

    Returns:
        The score as a float clamped to [0.0, 1.0], or None when the payload
        is not a dict, the score is missing or non-numeric, or the score is
        NaN or infinite.

    Raises:
        Whatever ``call_llm_json`` raises; only score parsing is guarded.
    """
    import math  # function-scope import keeps this block self-contained

    payload = call_llm_json({
        "task": "assess_conversion_risk",
        "question": question_text
    })
    if not isinstance(payload, dict):
        return None

    try:
        score = float(payload.get("risk_score"))
    except (TypeError, ValueError, OverflowError):
        return None

    if not math.isfinite(score):
        # NaN compares False against any threshold, so a caller testing
        # ``risk_score > threshold`` would let the question through as if it
        # were low-risk. Report "could not assess" instead.
        return None

    # Same range as the heuristic estimate: 0.0 (safe) to 1.0 (unsuitable).
    return max(0.0, min(1.0, score))


def call_llm_json(job: dict) -> dict:
    """
    Single integration point for every LLM request made by this module.

    This is a placeholder: plug your own LLM client in here (OpenAI,
    Anthropic, a local model, ...). The original notes state that the file
    only defines the ``USE_LLM`` and ``ANTHROPIC_API_KEY`` constants and has
    no real calling code, which is why all LLM work is routed through this
    one function.

    Contract for the real implementation: whichever provider is used, return
    a dict. Instructing the model to output JSON only is recommended, to
    avoid parse failures.

    Raises:
        NotImplementedError: always, until a real implementation is supplied.
    """
    # TODO: implement the real LLM call here.
    message = "请在 call_llm_json(job) 内接入你的LLM，并返回dict(JSON)。"
    raise NotImplementedError(message)

Dialogue 1

关键点解释（你最在意的 5 条怎么落地）

1）题型分类

选择题：题块里能解析出 A/B/C... 选项（parse_mcq_options）

判断题：包含 True/False、判断、对错...（classify_non_mcq_type）

简答题：包含 explain/derive/prove/计算/证明/解释或 (a)(b)(c) 子问

其它题型：上述都不是 → other

2）其它题型 risk_score 与阈值

assess_other_type_risk(...) 输出 0~1，越大越不适合转选择题

if risk_score > other_type_risk_threshold: continue → 丢弃

默认阈值我给了 0.6，你可以按课程风格调（比如 CS229 可能要设低一点）

3）简答题转选择题（>=3选项）

走 llm_convert_to_mcq(...)

reference_answer 如果能从 PDF 抓到就传入；抓不到也能让 LLM 自己推正确项（你要求的"如果没有参考答案就用 LLM"）

4）判断题转选择题（正确/错误）

选项固定：{"A":"正确","B":"错误"}

ref_ans 能识别就直接用，否则走 llm_judge_tf(...)

5）其它题型转选择题（受阈值控制）

先 risk_score，过阈值直接丢弃；没过阈值就 llm_convert_to_mcq(...)

你还需要做的一件事：把 LLM 接上

你原文件里只有：

USE_LLM = False

ANTHROPIC_API_KEY = "your-api-key"

但没有实际调用 LLM 的代码（参见 batch_pdf_processor）。所以我把所有 LLM 工作都统一到 call_llm_json(job)：

你只要实现它，让它根据 job["task"] 返回 JSON dict 就行。

建议你强制模型只输出 JSON（不带解释），否则很容易解析失败。

额外改进建议（很实用）

题块切分更稳：现在是"遇到题号行开新块"。如果你的作业格式是 (a)(b) 作为同一题的子问，当前逻辑会保留在同一块，适合转 MCQ。

答案提取不要太指望 PDF：assignment 和 solution 往往分文件。你可以把 extract_answer_key 设计成可选输入（比如额外传入 solutions.pdf）。

避免低质量 MCQ 污染题库：如果 llm_convert_to_mcq 生成的选项少于 3、正确项不在选项里、或选项重复/太相似 → 直接丢弃（我代码里已经校验了选项数量和正确项是否在选项中；选项重复/相似度的检查尚未实现，需要你自行补充）。

返回里保留 qtype/risk_score 很关键：后续你可以按题型做不同的 quiz 策略，而不必重新解析 PDF。

如果你愿意，我也可以顺手把你现在的 assign_quizzes_to_slides 升级为：优先抽"原生选择题"，不够再用"转换题"补齐，并且按章节/难度做更合理的分配（你现在是平均分配且会重复题目）

Dialogue 2

我把你生成的段落直接复制粘贴去覆盖原py文件，有什么需要注意的吗？

在你将我提供的段落直接复制粘贴到你的原 py 文件时，有几个需要注意的地方，确保代码能够顺利运行：

1. 确保你有正确的环境与依赖

你需要确保 PyMuPDF (fitz 模块) 已经安装，因为这是处理 PDF 的关键库。如果尚未安装，可以运行以下命令：

python 复制代码

pip install pymupdf

如果你使用的是 LLM（例如 OpenAI 或 Anthropic），需要确保你有正确的 API 密钥，并且安装了必要的库。如果没有配置，请参考相关 API 的文档。

2. LLM 接口的实现

你需要在 call_llm_json 函数中实现 LLM 调用，这是核心的部分，用来生成选择题选项、判断题正误、以及简答题转换成选择题。如果你还没有配置接口，你可以用下面的伪代码来测试：

python 复制代码

def call_llm_json(job: dict) -> dict:
    """
    Canned stand-in for the LLM, for testing only.

    Returns a fixed payload for each known ``job["task"]`` and an empty dict
    for any other task. Replace it with a call to your actual LLM provider.
    """
    task = job["task"]

    if task == "convert_to_mcq":
        sample_options = {
            "A": "选项A",
            "B": "选项B",
            "C": "选项C"
        }
        return {"options": sample_options, "correct": "A"}

    if task == "judge_true_false":
        return {"answer": "true"}

    if task == "assess_conversion_risk":
        return {"risk_score": 0.4}

    return {}

你需要替换这部分伪代码，确保调用你的 LLM 提供者并返回合适的 JSON 格式。

3. 检查函数参数和默认值

在 process_assignment_pdf 函数中，你有一些参数（例如 use_llm, other_type_risk_threshold, min_options 等）。如果你没有传递这些参数，函数会使用默认值。例如：

python 复制代码

def process_assignment_pdf(
    pdf_path,
    *,
    use_llm: bool = USE_LLM,  # 默认使用 USE_LLM
    other_type_risk_threshold: float = 0.6,  # 默认阈值
    min_options: int = 3,  # 默认最少选项
    max_options: int = 5   # 默认最多选项
):

确保在调用这个函数时，如果你需要调整这些默认值（比如使用不同的风险阈值），你可以传递相应的参数。

4. 更新的 llm_convert_to_mcq 和 normalize_options_dict

如果你对选项的返回格式或数量有要求，llm_convert_to_mcq 和 normalize_options_dict 函数已经做了适当的限制。例如，min_options 控制最少选项数量（默认 3），max_options 控制最多选项数量（默认 5）。你可以根据实际需求修改这些参数的默认值。

5. 可能的无效题目

在 process_assignment_pdf 中，risk_score > threshold 时会丢弃题目。你可以通过修改 other_type_risk_threshold 参数调整丢弃的标准。如果你不希望丢弃题目，可以将阈值设得更高或者将评估功能禁用。

6. 测试阶段的调试输出

目前提供的代码里并没有 print 调试输出（例如 llm_convert_to_mcq 校验失败时只是返回 None）。调试时，你可以在这些"丢弃题目"的分支里临时加入 print，帮助定位题目被丢弃的原因。

如果你希望生产环境没有调试信息，可以移除这些输出或改为日志记录（例如，使用 logging 模块）。

7. 代码风格和格式

确保代码风格与现有项目保持一致。例如，检查 import 语句的位置、函数的间距、以及注释的格式等。

如果你的 IDE 支持自动格式化（如 VSCode 或 PyCharm），可以运行自动格式化工具确保一致性。

8. 依赖项更新

如果你的项目已经在使用数据库连接，确保你没有与其他模块或库发生冲突，特别是数据库和 LLM 部分。保持依赖项的兼容性是很重要的。

9. 数据库相关（process_assignment_pdf 与数据库无关）

虽然 process_assignment_pdf 函数本身不涉及数据库操作，但你要确保与数据库交互（例如保存题目）时，传入的数据结构与你之前的 Slide 和 LearningPath 模型相兼容。

总结

只要确保：

LLM 接口（call_llm_json）能正确处理你的任务。

适当调整参数：例如，other_type_risk_threshold、min_options、use_llm 等，根据需求调整。

确保相关依赖项（如 PyMuPDF、LLM 客户端）已经正确安装，并且你的 LLM 提供商 API 配置正确。

直接替换这些代码段后，应该就能顺利运行。