一、主要功能
SuperCompress 是面向 LLM 的轻量级提示词 / 上下文压缩工具,核心目标:在几乎不丢失语义的前提下,大幅减少输入给大模型的 token 数量。
- 对 prompt、对话历史、RAG 检索结果、工具返回内容(JSON / 日志) 做智能精简
- 压缩率 60%~95%(例如 10k token → 几百至几千 token)
- 语义损失极低,LLM 输出质量几乎不变
- 显著降低 OpenAI / Anthropic 等 LLM 的调用成本、延迟、上下文溢出风险
- 轻量、可本地部署、CPU 即可运行(无需 GPU)
二、实现原理(核心)
SuperCompress 通过轻量小模型 + 规则引擎实现语义感知压缩。
小模型的训练方法请参考以下文档:
https://arjunkshah-supercompress-55.mintlify.app/development/training
三、效果验证
验证脚本(Python)如下:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import sys
import ollama
from supercompress import compress_context
# ======================= Configuration =======================
OLLAMA_HOST = "http://localhost:11434"  # base URL passed to ollama.Client
BUDGET_RATIO = 0.8  # forwarded to compress_context as budget_ratio
NUM_PREDICT = 2048  # value of the "num_predict" generation option
RETRY_LIMIT = 2  # default number of extra attempts in ask_qwen

# ======================= Test data =======================
# Placeholder: paste the long reference text to be compressed here.
long_context = """..."""
query = "What do fetchone, fetchall, and fetchmany return when no rows are found?"

# Each key point is a display name plus a pair
# [method keyword, list of accepted phrasings for the expected value].
key_points = [
    {
        "name": "fetchone returns None",
        "keywords": ["fetchone", ["none", "null"]],
    },
    {
        "name": "fetchall returns empty list",
        "keywords": ["fetchall", ["empty list", "[]", "empty array"]],
    },
    {
        "name": "fetchmany returns empty list",
        "keywords": ["fetchmany", ["empty list", "[]", "empty array"]],
    },
]
# ======================= 辅助函数 =======================
def get_ollama_client():
    """Return a new ``ollama.Client`` pointed at ``OLLAMA_HOST``."""
    return ollama.Client(host=OLLAMA_HOST)
def is_model_available(model_name="qwen3.5:4b"):
    """Report whether *model_name* can be looked up on the Ollama server.

    Args:
        model_name: Name of the model passed to the client's ``show`` call.

    Returns:
        True if the lookup completes without raising, False otherwise.
        On failure the exception is printed rather than propagated.
    """
    try:
        get_ollama_client().show(model_name)
    except Exception as e:
        # Any exception from the lookup is reported and treated as
        # "not available" so the caller can exit cleanly.
        print(f"❌ 模型检查失败:{e}")
        return False
    return True
def clean_context(text):
    """Normalise whitespace in *text* line by line.

    Every line is stripped of leading and trailing whitespace, and each run
    of consecutive blank (or whitespace-only) lines is collapsed into a
    single empty line. No other characters are removed.

    Args:
        text: The text to clean.

    Returns:
        The cleaned lines joined with ``"\\n"`` (no trailing newline).
    """
    cleaned = []
    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if stripped:
            cleaned.append(stripped)
        elif not cleaned or cleaned[-1] != '':
            # First blank line of a run: keep exactly one separator.
            cleaned.append('')
    return '\n'.join(cleaned)
def ask_qwen(context, question, model="qwen3.5:4b", retry=RETRY_LIMIT):
    """Ask the model to answer *question* using only *context*.

    Args:
        context: Reference text; it is passed through ``clean_context``
            before being placed in the prompt.
        question: The question inserted into the prompt.
        model: Ollama model name used for the chat call.
        retry: Number of additional attempts after the first one.

    Returns:
        The stripped answer text. ``"[空回答]"`` is returned when the
        accepted answer is empty, ``"[调用失败: <error>]"`` when the last
        attempt raised, and ``"[调用失败: 未知]"`` if no attempt returned.
    """
    # Normalise the reference text before building the prompt.
    context = clean_context(context)
    prompt = f"""Answer the question based ONLY on the reference below. If the reference does not contain the answer, say "Not provided".
Reference:
{context}
Question: {question}
Answer:"""
    client = get_ollama_client()
    for attempt in range(retry + 1):
        try:
            # Sampling options are fixed; they do not vary between attempts.
            # NOTE(review): num_ctx is 4096 - a long reference plus
            # NUM_PREDICT output tokens may not fit; confirm against the
            # size of the contexts being compared.
            resp = client.chat(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                options={
                    "temperature": 0.7,
                    "top_p": 0.9,
                    "repeat_penalty": 1.1,
                    "num_ctx": 4096,
                    "num_predict": NUM_PREDICT,
                }
            )
            answer = resp["message"]["content"].strip()
            # Print the raw answer for debugging.
            print(f" [调试] 原始回答内容:'{answer}'")
            # A near-empty answer is retried while attempts remain; on the
            # last attempt it is returned as-is.
            if len(answer) < 3 and attempt < retry:
                print(f" [重试 {attempt+1}] 回答过短,重试...")
                continue
            return answer if answer else "[空回答]"
        except Exception as e:
            print(f" [尝试 {attempt+1}] 异常:{e}")
            if attempt == retry:
                return f"[调用失败: {e}]"
    return "[调用失败: 未知]"
def check_context_has_answer(context, points):
    """Check, per key point, whether *context* mentions it.

    Matching is a case-insensitive substring search.

    Args:
        context: Text to search.
        points: Iterable of dicts whose ``"keywords"`` entry is a pair
            ``[method, value_options]``.

    Returns:
        A list of ``(method, method_hit, value_hit)`` tuples, one per point.
        ``method_hit`` is True when the method name occurs in the text;
        ``value_hit`` is True when any of the value options occurs.
    """
    context_lower = context.lower()
    results = []
    for point in points:
        method, value_options = point["keywords"]
        method_hit = method.lower() in context_lower
        value_hit = any(v.lower() in context_lower for v in value_options)
        results.append((method, method_hit, value_hit))
    return results
def jaccard_similarity(text1, text2):
    """Return the Jaccard similarity of the word sets of two texts.

    Words are the lower-cased ``\\w+`` matches of each text.

    Returns:
        ``len(intersection) / len(union)`` of the two word sets, or 0.0 when
        either text is empty or neither text contains a word.
    """
    if not text1 or not text2:
        return 0.0
    words1 = set(re.findall(r'\w+', text1.lower()))
    words2 = set(re.findall(r'\w+', text2.lower()))
    union = words1 | words2
    if not union:
        return 0.0
    return len(words1 & words2) / len(union)
def cosine_similarity(text1, text2):
    """Return the TF-IDF cosine similarity of two texts as a float.

    scikit-learn is imported lazily so the script still runs without it.

    Returns:
        The cosine similarity of the two TF-IDF vectors, or 0.0 when either
        text is empty, when scikit-learn is not installed, or when the
        vectorizer cannot build a vocabulary from the two texts.
    """
    if not text1 or not text2:
        return 0.0
    try:
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.metrics.pairwise import cosine_similarity as sk_cos
    except ImportError:
        return 0.0
    try:
        vec = TfidfVectorizer().fit_transform([text1, text2])
    except ValueError:
        # The vectorizer raises ValueError when it ends up with an empty
        # vocabulary (e.g. punctuation-only answers). Treat that as "no
        # overlap" instead of crashing the report.
        return 0.0
    # Convert the array element to a plain Python float.
    return float(sk_cos(vec[0:1], vec[1:2])[0][0])
def calc_recall(answer, points):
    """Compute the fraction of key points that *answer* covers.

    A point counts as covered when its method name and at least one of its
    value options both occur in the answer (case-insensitive substring
    match).

    Args:
        answer: The answer text to score.
        points: Sequence of dicts whose ``"keywords"`` entry is a pair
            ``[method, value_options]``.

    Returns:
        A tuple ``(recall, hits)`` where ``hits`` holds one bool per point
        and ``recall`` is the share of True entries. An empty ``points``
        sequence yields ``(0.0, [])`` instead of dividing by zero.
    """
    if not points:
        return 0.0, []
    answer_lower = answer.lower()
    hits = []
    for point in points:
        method, vals = point["keywords"]
        hits.append(
            method.lower() in answer_lower
            and any(v.lower() in answer_lower for v in vals)
        )
    return sum(hits) / len(points), hits
# ======================= 主流程 =======================
def main():
    """Run the compression check end to end and print a diagnostic report.

    Steps: verify the model is reachable (exits with status 1 otherwise),
    compress ``long_context`` for ``query``, check which key points survive
    compression, answer the query from both the original and the compressed
    context, then print token statistics, key-point retention, recall and
    answer-similarity figures.
    """
    model_name = "qwen3.5:4b"

    print("="*80)
    print(f"检查 Ollama 服务 ({OLLAMA_HOST}) ...")
    if not is_model_available(model_name):
        sys.exit(1)

    # ---- Compress ----
    print("\n开始上下文压缩...")
    compress_result = compress_context(long_context, query, budget_ratio=BUDGET_RATIO)
    compressed_text = clean_context(compress_result.compressed_text)
    print("\n【压缩后文本预览(前500字符)】")
    preview = compressed_text[:500]
    if len(compressed_text) > 500:
        # Only mark the preview as truncated when it really is.
        preview += "..."
    print(preview)

    # ---- Key-point retention ----
    # Measured once on the real compressed text and reused in the report,
    # so falling back to the original context cannot hide what was lost.
    retention = check_context_has_answer(compressed_text, key_points)
    info_ok = all(m and v for _, m, v in retention)
    if not info_ok:
        print("\n⚠️ 压缩后缺少关键信息,改用原始上下文。")
        answer_context = long_context
    else:
        print("\n✅ 压缩后包含所有关键信息。")
        answer_context = compressed_text

    # ---- Generate answers ----
    print("\n正在生成原始上下文回答...")
    answer_original = ask_qwen(long_context, query)
    print("\n正在生成压缩后上下文回答...")
    answer_compressed = ask_qwen(answer_context, query)

    # ---- Extra attempt when the compressed-side answer is still empty ----
    if answer_compressed == "[空回答]" or len(answer_compressed) < 3:
        print("\n⚠️ 压缩回答仍为空。尝试用原始上下文再生成一次作为替代(仅用于展示相似度)...")
        # Reusing the original answer would make the similarity scores
        # meaningless, so the same context is queried again with a more
        # direct extraction prompt and a lower temperature.
        fallback_prompt = f"""Extract the answer from the following text about fetch methods when no rows are found. Answer concisely.
Text:
{answer_context}
Answer:"""
        client = get_ollama_client()
        try:
            resp = client.chat(
                model=model_name,
                messages=[{"role": "user", "content": fallback_prompt}],
                options={"temperature": 0.3, "num_predict": 512}
            )
            answer_compressed = resp["message"]["content"].strip()
            if not answer_compressed:
                answer_compressed = "[空回答]"
        except Exception as e:
            answer_compressed = f"[调用失败: {e}]"

    # ---- Report ----
    print("\n" + "="*80)
    print("【诊断报告】")
    print(f"原始 Token 数:{compress_result.original_tokens}")
    print(f"压缩后 Token 数:{compress_result.kept_tokens}")
    print(f"压缩比例:{compress_result.kv_savings_pct:.1f}%")
    print("\n【信息保留检查】")
    for method, m_hit, v_hit in retention:
        print(f" {method}: {'✅' if m_hit and v_hit else '❌'}")
    print("\n【回答内容】")
    print(f"原始回答:{answer_original}")
    print(f"压缩回答:{answer_compressed}")
    print("\n【量化评估】")
    rec_ori, hits_ori = calc_recall(answer_original, key_points)
    rec_comp, hits_comp = calc_recall(answer_compressed, key_points)
    print(f"召回率:原始 {rec_ori:.2f},压缩 {rec_comp:.2f}")
    jac = jaccard_similarity(answer_original, answer_compressed)
    cos = cosine_similarity(answer_original, answer_compressed)
    print(f"Jaccard 相似度:{jac:.3f}")
    print(f"Cosine 相似度:{cos:.3f}")
    print("\n【命中明细】")
    for p, hit_ori, hit_comp in zip(key_points, hits_ori, hits_comp):
        print(f" {p['name']}: 原始 {'✅' if hit_ori else '❌'} 压缩 {'✅' if hit_comp else '❌'}")
# Run the check only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
在小模型场景下,压缩后反而会使模型的 think(思考)时间变长;此外,对中文的支持不太好,需要自己额外重新训练。