带有问题生成的文档增强RAG
本文介绍一种基于“问题生成”进行文档增强的RAG方法:为每个文本块生成相关问题,并将这些问题与文本块一起用于检索,从而改进检索过程,使语言模型给出更好的回答。
具体实现步骤
1. 数据摄取:从PDF文件中提取文本。 2. 文本分块(Chunking):将文本拆分为可管理的块。 3. 问题生成:为每个块生成相关问题。 4. 嵌入创建(Embedding Creation):为块和生成的问题创建嵌入。 5. 向量存储创建:使用NumPy构建一个简单的向量存储。 6. 语义搜索:检索与用户查询相关的块和问题。 7. 响应生成:根据检索到的内容生成答案。 8. 评估:评估生成的回复的质量。
代码实现
从PDF文件中提取全部文本
python
def extract_text_from_pdf(pdf_path):
    """Extract and concatenate the text of every page of a PDF file.

    Progress messages are printed to stdout while pages are processed.

    :param pdf_path: Path of the PDF file to read.
    :return: The page texts concatenated in page order (str). Pages whose
        ``extract_text()`` result is falsy (``None`` or empty) add nothing.
    """
    print(f"[步骤] 正在从PDF提取文本: {pdf_path}")
    page_texts = []
    with open(pdf_path, 'rb') as f:
        # NOTE(review): PdfReader is imported elsewhere in the file
        # (presumably from pypdf / PyPDF2) - confirm.
        reader = PdfReader(f)
        for i, page in enumerate(reader.pages):
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)
            print(f" - 已提取第{i+1}页")
    # Join once at the end instead of growing a string with += per page.
    text = "".join(page_texts)
    print(f"[完成] PDF文本提取完成,总长度: {len(text)} 字符\n")
    return text
文本分割为带重叠的块
python
def chunk_text(text, n, overlap):
    """Split ``text`` into fixed-size character chunks that overlap.

    Consecutive chunks start ``n - overlap`` characters apart, so each chunk
    shares its last ``overlap`` characters with the beginning of the next
    one. Chunks near the end of the text may be shorter than ``n``.

    :param text: Source text to split.
    :param n: Maximum number of characters per chunk; must be positive.
    :param overlap: Number of characters shared by neighbouring chunks;
        must satisfy ``0 <= overlap < n``.
    :return: List of chunk strings (empty when ``text`` is empty).
    :raises ValueError: If ``n`` is not positive or ``overlap`` is outside
        ``[0, n)``. Without this check a step of zero fails with an opaque
        ``range()`` error, a negative step silently yields no chunks, and a
        negative overlap silently skips parts of the text.
    """
    if n <= 0:
        raise ValueError(f"chunk size n must be positive, got {n}")
    if not 0 <= overlap < n:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < n, got overlap={overlap}, n={n}"
        )
    print(f"[步骤] 正在分块: 每块{n}字符,重叠{overlap}字符")
    step = n - overlap
    chunks = [text[start:start + n] for start in range(0, len(text), step)]
    print(f"[完成] 分块完成,共{len(chunks)}块\n")
    return chunks
针对文本块用Qwen生成可回答的问题
python
def generate_questions(text_chunk, num_questions=5, model=LLM_MODEL):
    """Ask the LLM for questions that can be answered from ``text_chunk``.

    The model reply is split into lines, a leading ``<digits>.`` or
    ``<digits>、`` numbering prefix is stripped from each line, and only
    lines that end with a question mark are kept.

    :param text_chunk: Text the questions should be answerable from.
    :param num_questions: Number of questions requested in the prompt. The
        reply is not truncated, so the returned list may be shorter or longer.
    :param model: Name of the generation model passed to ``Generation.call``.
    :return: List of question strings; empty on a non-200 response or when
        any exception is raised while calling or parsing.
    """
    system_prompt = "你是文本理解专家,请根据给定文本生成可由该文本直接回答的简明问题。只输出问题列表,不要输出其他内容。"
    user_prompt = (
        f"\n请基于以下文本,生成{num_questions}个不同的问题:\n\n{text_chunk}\n\n只输出编号的问题列表。\n"
    )
    try:
        response = Generation.call(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            api_key=ALI_API_KEY,
            result_format='message',
        )
        if response.status_code != 200:
            print(f" - 问题生成失败: {response.message}")
            return []
        questions_text = response.output.choices[0].message.content.strip()
        questions = []
        for line in questions_text.split('\n'):
            cleaned = re.sub(r'^\d+[.、]\s*', '', line.strip())
            # Accept both the ASCII and the full-width question mark: the
            # prompt is Chinese, so replies normally end with the full-width
            # form. An empty string never matches, so no separate check.
            if cleaned.endswith(('?', '?')):
                questions.append(cleaned)
        print(f" - 问题生成成功,共{len(questions)}个")
        return questions
    except Exception as e:
        # Deliberate best-effort: a failed chunk must not abort the pipeline.
        print(f" - 问题生成异常: {e}")
        return []
用阿里embedding模型生成单条文本的向量
python
def create_embedding(text, model=EMBEDDING_MODEL):
    """Embed a single text via ``TextEmbedding.call``.

    On success the vector dimension and the first 60 components are printed
    for inspection.

    :param text: Text to embed; it is sent as a one-element input list.
    :param model: Name of the embedding model.
    :return: The embedding as ``np.ndarray``, or ``None`` on a non-200
        response or when any exception is raised.
    """
    try:
        response = TextEmbedding.call(model=model, input=[text], api_key=ALI_API_KEY)
        if response.status_code != 200:
            print(f" - 嵌入生成失败: {response.message}")
            return None
        emb = response.output['embeddings'][0]['embedding']
        print(f" - 嵌入生成成功,向量维度: {len(emb)}")
        # Show only the first 60 components so the log stays readable.
        print(f" 嵌入内容(前60维): {emb[:60]}")
        return np.array(emb)
    except Exception as e:
        print(f" - 嵌入生成异常: {e}")
        return None
简单的向量存储与检索类
python
class SimpleVectorStore:
    """Minimal in-memory vector store with brute-force cosine-similarity search."""

    def __init__(self):
        # Parallel lists: position i in each list describes the same item.
        self.vectors = []
        self.texts = []
        self.metadata = []

    def add_item(self, text, embedding, metadata=None):
        """Store one text together with its embedding and optional metadata.

        :param text: Text associated with the embedding.
        :param embedding: Sequence of numbers; converted with ``np.array``.
        :param metadata: Optional dict; a falsy value is stored as ``{}``.
        """
        self.vectors.append(np.array(embedding))
        self.texts.append(text)
        self.metadata.append(metadata or {})

    def similarity_search(self, query_embedding, k=5):
        """Return the ``k`` stored items most similar to the query.

        Similarity is the cosine of the angle between the query and each
        stored vector. If either vector has zero norm the cosine is undefined
        (0/0); such pairs score ``0.0`` rather than NaN, because NaN values
        make the sort order undefined.

        :param query_embedding: Query vector (sequence of numbers).
        :param k: Maximum number of results; values ``<= 0`` give ``[]``.
        :return: List of dicts with keys ``"text"``, ``"metadata"`` and
            ``"similarity"``, ordered by descending similarity. Ties keep
            insertion order. Empty when the store is empty.
        """
        if not self.vectors:
            return []
        query_vector = np.array(query_embedding)
        # The query norm does not change per item, so compute it once.
        query_norm = np.linalg.norm(query_vector)
        scored = []
        for idx, vector in enumerate(self.vectors):
            denom = query_norm * np.linalg.norm(vector)
            sim = np.dot(query_vector, vector) / denom if denom > 0 else 0.0
            scored.append((idx, sim))
        scored.sort(key=lambda pair: pair[1], reverse=True)
        # max(k, 0): a negative k must not turn into a "drop the tail" slice.
        return [
            {
                "text": self.texts[idx],
                "metadata": self.metadata[idx],
                "similarity": score,
            }
            for idx, score in scored[:max(k, 0)]
        ]
提取文本、分块、生成问题、生成向量、构建向量库
python
def process_document(pdf_path, chunk_size=1000, chunk_overlap=200, questions_per_chunk=3):
    """Run the ingestion pipeline and build the question-augmented store.

    Steps: extract the PDF text, split it into overlapping chunks, embed
    every chunk, generate questions for every chunk and embed those too.
    Items whose embedding could not be created are skipped.

    :param pdf_path: Path of the PDF file to process.
    :param chunk_size: Number of characters per chunk.
    :param chunk_overlap: Number of characters shared by adjacent chunks.
    :param questions_per_chunk: Number of questions requested per chunk.
    :return: Tuple ``(text_chunks, vector_store)``.
    """
    print("[主流程] 开始处理文档...\n")
    text_chunks = chunk_text(extract_text_from_pdf(pdf_path), chunk_size, chunk_overlap)
    print("[主流程] 初始化向量库...")
    vector_store = SimpleVectorStore()
    print("[主流程] 为每个块生成向量并增强问题...")
    total_chunks = len(text_chunks)
    for chunk_idx, chunk in enumerate(tqdm(text_chunks, desc="处理文本块")):
        print(f"\n[块{chunk_idx+1}/{total_chunks}] 正在处理文本块,长度: {len(chunk)} 字符")

        # Store the chunk itself.
        chunk_emb = create_embedding(chunk)
        if chunk_emb is not None:
            vector_store.add_item(chunk, chunk_emb, {"type": "chunk", "index": chunk_idx})

        # Store each generated question, keeping a link to its source chunk
        # so a question hit can be resolved back to the chunk text.
        questions = generate_questions(chunk, num_questions=questions_per_chunk)
        total_questions = len(questions)
        for q_no, question in enumerate(questions, start=1):
            print(f" - 正在生成问题{q_no}/{total_questions}的嵌入...")
            question_emb = create_embedding(question)
            if question_emb is None:
                continue
            question_meta = {"type": "question", "chunk_index": chunk_idx, "original_chunk": chunk}
            vector_store.add_item(question, question_emb, question_meta)
    print("[主流程] 文档处理完毕,向量库构建完成\n")
    return text_chunks, vector_store
用向量库做语义检索
python
def semantic_search(query, vector_store, k=5):
    """Embed ``query`` and look up the most similar items in the store.

    :param query: User query text.
    :param vector_store: Object providing ``similarity_search(embedding, k=...)``.
    :param k: Maximum number of results requested from the store.
    :return: Whatever ``vector_store.similarity_search`` returns, or ``[]``
        when the query embedding could not be created.
    """
    print(f"[检索] 正在对问题进行向量化: {query}")
    query_emb = create_embedding(query)
    if query_emb is not None:
        print("[检索] 正在进行相似度检索...")
        hits = vector_store.similarity_search(query_emb, k=k)
        print(f"[检索] 检索完成,返回Top-{k}结果\n")
        return hits
    # Without a query vector there is nothing to compare against.
    print("[检索] 查询向量生成失败,无法检索!")
    return []
根据检索结果准备上下文
python
def prepare_context(search_results):
    """Assemble the context string for the LLM from retrieval results.

    Results of type ``"chunk"`` are emitted first, in result order. Then each
    ``"question"`` result contributes its source chunk, unless that chunk
    index has already been emitted. Sections are joined by a blank line.

    :param search_results: Iterable of dicts with ``"text"`` and
        ``"metadata"`` keys; metadata carries ``"type"`` plus ``"index"``
        (chunks) or ``"chunk_index"`` / ``"original_chunk"`` (questions).
    :return: The context string (``""`` when there are no results).
    """
    print("[上下文] 正在整理检索结果上下文...")
    seen_indices = set()
    sections = []

    # Pass 1: chunks that were retrieved directly.
    direct_hits = [hit for hit in search_results if hit["metadata"]["type"] == "chunk"]
    for hit in direct_hits:
        index = hit["metadata"]["index"]
        seen_indices.add(index)
        sections.append(f"Chunk {index}:\n{hit['text']}")

    # Pass 2: chunks reached only through one of their generated questions.
    question_hits = [hit for hit in search_results if hit["metadata"]["type"] == "question"]
    for hit in question_hits:
        meta = hit["metadata"]
        chunk_idx = meta["chunk_index"]
        if chunk_idx in seen_indices:
            continue
        seen_indices.add(chunk_idx)
        sections.append(f"Chunk {chunk_idx} (由问题'{hit['text']}'关联):\n{meta['original_chunk']}")

    print(f"[上下文] 上下文整理完成,共{len(sections)}段\n")
    return "\n\n".join(sections)
用Qwen生成基于上下文的回答
python
def generate_response(query, context, model=LLM_MODEL):
    """Ask the LLM to answer ``query`` using only the given context.

    The system prompt tells the model to reply that information is
    insufficient when the context cannot answer the question.

    :param query: User question.
    :param context: Context text to ground the answer in.
    :param model: Name of the generation model passed to ``Generation.call``.
    :return: The stripped answer text, or ``""`` on a non-200 response or
        when any exception is raised.
    """
    print("[生成] 正在调用大模型生成最终回答...")
    system_prompt = "你是一个AI助手,只能基于给定上下文回答问题。如果上下文无法直接回答,请回复:'信息不足,无法回答。'"
    user_prompt = f"\n上下文:\n{context}\n问题:{query}\n请只基于上述上下文简明准确作答。\n"
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    try:
        response = Generation.call(
            model=model,
            messages=messages,
            api_key=ALI_API_KEY,
            result_format='message',
        )
        if response.status_code != 200:
            print(f"[生成] 回答生成失败: {response.message}")
            return ""
        print("[生成] 回答生成成功\n")
        answer = response.output.choices[0].message.content
        return answer.strip()
    except Exception as e:
        print(f"[生成] 回答生成异常: {e}")
        return ""
结果展示
python
========== RAG主流程开始 ==========
[配置] 使用API密钥: sk-6b7bb...6e3f
[配置] PDF路径: data/2888年Java程序员找工作最新场景题.pdf
[配置] 验证集路径: data/java_val.json
[配置] 块大小: 1000,重叠: 200,每块问题数: 3,检索TopK: 5
[主流程] 开始处理文档...
[步骤] 正在从PDF提取文本: data/2888年Java程序员找工作最新场景题.pdf
- 已提取第1页
- 已提取第2页
- 已提取第3页
- 已提取第4页
- 已提取第5页
- 已提取第6页
- 已提取第7页
- 已提取第8页
- 已提取第9页
- 已提取第10页
[完成] PDF文本提取完成,总长度: 6984 字符
[步骤] 正在分块: 每块1000字符,重叠200字符
[完成] 分块完成,共9块
[主流程] 初始化向量库...
[主流程] 为每个块生成向量并增强问题...
[块1/9] 正在处理文本块,长度: 1000 字符
处理文本块: 0%| | 0/9 [00:00<?, ?it/s] - 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [0.00423507937586087, 0.01152804552144631, 0.0008922744753682508, 0.038833285971348844, -0.0022277549319312437, -0.023937812994899093, -0.0056139424284667346, 0.00942223085947341, -0.02050472539453347, 0.016124255696799192, 0.035962999616944796, -0.06569766544524813, 0.038214204600791106, 0.020260844854616787, 0.0018853841739712844, 0.02262461008765541, -0.0024177003524432764, -0.0190883422588635, 0.004715805440119718, 0.019060202196565422, -0.02495085523762993, -0.03299891305488049, 0.008446708699806675, -0.008826599540830739, -0.02071108585138605, -0.020598525602193732, 0.021686608011052783, -0.037407522814912844, -0.008653069156659253, 0.029359464997662288, 0.014623452374234987, 0.015383234056283116, -0.04806322640511871, -0.032398591725854804, 0.021630327886456625, -0.04329817585597735, 0.05470428110746532, 0.012606747909539334, -0.013188309197032963, -0.03973376796488737, -0.02581381714810435, 0.04975163014300344, 0.017540638832469164, 0.031423069566188074, -0.015392614077049142, -0.020392165145341156, -0.04157225203502852, 0.0019440093037589486, 0.007841697360397979, 0.005731192688042063, -0.014782912727257434, 0.010927724192420628, 0.0232624514997452, -0.029772185911367446, -0.0002491568015975733, -0.014961133121811934, -0.03440591616978443, -0.012494187660347019, 0.05128995354863176, -0.006547254494686351]
- 问题生成成功,共3个
- 正在生成问题1/3的嵌入...
- 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [0.010061754679372889, 0.0009277300867385699, 0.0013731183254453464, 0.01679379466588114, -0.005539151545139302, -0.00301593316806976, -0.006815023530255658, -0.013868624748785105, -0.011939257356657932, 0.012011867957436912, 0.023256138135210325, -0.05319245154208677, 0.02759202829601225, 0.027031889375717266, -0.010881217173878516, -0.006706107629087188, 0.011804409098068399, 0.0008382634536358985, -0.005173505305502297, 0.03850436429879604, -0.051698747754633484, -0.022592269785231083, -0.015538668566701636, -0.010715250086383705, -0.012395666847268661, -0.00929415689970939, -0.004522603134233587, -0.026907414060096158, 0.0049038087883232296, 0.03781975006287995, 0.023256138135210325, 0.027488298866327993, -0.015611279167480615, -0.02755053652413855, 0.0015663143882322743, -0.04979012624844316, 0.04132580478620782, 0.027052635261654117, 0.011845900869942102, -0.03431369533955208, -0.01976045635485088, 0.04097312472528135, 0.007515197180624389, 0.03974911745500712, 0.003508647959069979, -0.01595358628543866, -0.024500891291421405, 0.02007164464390365, 0.016690065236196885, -0.010367756496941445, 0.0037057338754700664, -0.0047326552293442065, -0.005295387385381298, -0.023837022941442163, -0.009335648671583093, -0.014511747212827495, -0.04302696743302963, -0.001902138416835055, 0.0552255483638982, -0.009906160534846505]
- 正在生成问题2/3的嵌入...
- 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [0.012134810096571259, 0.007952985644797655, 0.016345554496144128, 0.010393829239055555, 0.013141224279985654, 0.0026317152497330426, -0.02223365586807432, -0.0007577026323407221, -0.02376062911187547, 0.028434092676236564, 0.00849089667386397, -0.03923858517404167, 0.05321270395064613, 0.02329790994708724, 0.03819746705326816, 0.010804492497805107, 0.01553579595776473, -0.025218194480958384, 0.02233776768015167, 0.03852137046861992, -0.031002184040811225, -0.017779983906987634, 0.05154691495740851, 0.012354601699845667, -0.012528121386641253, -0.06491949881978828, 0.024848019149127803, -0.05959822842472367, -0.020174555584766708, 0.012956136614070362, 0.0005299580434215166, -0.02303184642733401, -0.0018754586147822834, -0.0029787546233242127, -0.0060731890378454825, -0.015570499895123846, 0.00097749423561513, 0.01158533108838524, 0.028896811841024793, -0.015073076792976502, -0.0073398827514532545, 0.03178880662095121, 0.005650957799976225, 0.013245336092063005, -0.03127981553968416, -0.0074324265844109005, -0.03921544921580226, -0.035953279104045256, 0.02616676876877425, -0.0057695295859532085, -0.010231877531379675, -0.0018465386669830194, 0.005382002285443068, -0.013789031110689172, 0.014494677836991219, -0.00481517130857749, -0.0005393570264562773, -0.009942678053387033, 0.03315382815707648, -0.06274471874528362]
- 正在生成问题3/3的嵌入...
- 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [-0.02398737255925337, 0.026740810311668917, -0.009342450365182891, -0.034436007523659974, 0.03806717870806825, 0.029794841705177866, -0.014248138587827188, 0.019237993029977637, -0.012685051654141506, -0.016929433866380322, 0.03994288302849107, -0.06728488062234679, 0.039966930519778544, -0.025225818360558178, -0.005762380099760489, 0.026332002959781893, 0.006733297560492174, -0.029458176827153258, 0.021450362228425068, -0.0019869239676273778, -0.011482677089767903, -0.024792963517383682, 0.025586530729870258, 0.01369504628821533, -0.03532576470129644, -0.03797098874291836, -0.00937250972929223, -0.012769217873647657, -0.017831214789660522, -0.017145861287967572, 0.006426692046576905, -0.037730513830043645, -0.034003152680485475, -0.014055758657527413, -0.004902682286233364, -0.03417148511949778, -0.002977380015030133, 0.021787027106449676, -0.019394301723346205, -0.017674906096291954, 0.02585105313403245, 0.018035618465604034, 0.005594047660748185, 0.011657021401602075, -0.02183512208902462, -0.022508451845073836, -0.03749003891716892, -0.008861500539433449, -0.01533027569576343, -0.02259261806457999, -0.004821522003138146, -0.05809873895053247, 0.0036732542941613554, 0.005344554938640662, -0.014620874702783004, -0.03027579153092731, 0.00904185672408949, -0.02046441508563871, 0.02549034076472037, 0.01730216998133614]
[块2/9] 正在处理文本块,长度: 1000 字符
处理文本块: 11%|█ | 1/9 [00:10<01:20, 10.12s/it] - 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [-0.017324478024649804, 0.014403428694926506, -0.006134203592418922, 0.01906648562492115, 0.023049734710907464, -0.02604513802356917, 0.005074659335546562, 0.02209375493027075, -0.047161669178077945, 0.013171276977661408, 0.023155954686533767, -0.022454902847400173, 0.04367765397753525, -0.0006084412978844092, 0.03250331254164831, 0.042063110348015464, 0.004671023428166616, -0.025683990106439746, 0.019438255539613206, -0.01887528966879381, -0.002174854000948527, -0.04873372481734721, 0.041064642577128226, 0.02916800530698244, -0.0101758736649997, -0.04274291819202379, 0.015476250448752154, -0.040151150786742036, -0.04456990177279618, 0.025556526135688182, -0.006606882483955964, -0.014998260558433797, -0.04622693339256649, -0.006904298415709609, -0.01652782820745254, -0.04894616476859981, 0.018163615832097586, 0.018822179680980655, -0.001506995904198155, -0.02336839463778637, 0.021371459096011895, 0.004660401430603986, 0.033013168424654564, 0.05047573241761855, -0.043507702016533166, -0.005318965279487056, -0.04389009392878785, -0.0127463970751562, 0.000892247795260934, 0.03418158815654388, 0.007865589195127639, -0.019108973615171672, -0.002921049329723296, -0.023113466696283246, 0.009942189718621836, -0.010876925504133291, -0.006904298415709609, -0.014955772568183276, 0.012799507062969352, 0.0029874368144897348]
- 问题生成成功,共3个
- 正在生成问题1/3的嵌入...
- 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [-0.04229083105517949, 0.04665933448285737, -0.013244930605193576, -0.014197636139953113, 0.036226047041222426, -0.008324555068965962, -0.03968831837486075, 0.014313819741753057, -0.03048657711230521, -0.0031834306893184557, 0.02593217992174742, -0.038665902679021244, 0.05446687252381358, -0.019530463462570525, -0.017961984838271286, 0.025235078310947758, 0.04819295802661662, -0.016277322612172105, 0.013651573211493379, 0.0036016916557982527, -0.0044788778493878275, -0.0401995262227805, 0.03915387380658101, 0.03692314865202209, -0.036969622092742065, -0.030277446629065313, -0.009178504542195548, -0.020134618191930232, -0.021447492892269597, 0.0028958762748635953, -0.015800969844792335, -0.021819280418029417, -0.03852648235686131, 0.005881794841122147, 0.020576115878770018, -0.0375273033813818, 0.009039084220035616, -0.010543661863344886, -0.005829512220312172, -0.030207736467985347, -0.03420445236990341, 0.0006150469420284516, 0.01205985786683415, 0.019379424780230598, -0.014952829551652747, -0.023736309847728485, -0.04851827211165646, -0.008109615405636067, 0.007824965581226204, -0.0010354863510419977, -0.01352377124951344, -0.030997784960224962, -0.007592598377626317, -0.01757857895233147, 0.04333648347137898, -0.032856722589024064, 0.010607562844334855, -0.004484687029477825, -0.011891391644224232, 0.01714869962567168]
- 正在生成问题2/3的嵌入...
- 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [-0.0019449213655894076, 0.012648940541259784, -0.0039887108546083, -0.03490044757060834, 0.02540910772137774, -0.007297703360638889, -0.05383369320735479, 0.01316181893155024, -0.06169370901999887, -0.006092748106342036, 0.014484180082419607, -0.011196814978389218, 0.0212195709910533, 0.008601526738365227, -0.008879593335510656, -0.01667163731329697, 0.027856093776257505, 0.0008959923685797112, 0.005607676375766124, 0.01689409059101331, 0.005456284561764724, -0.04290876556839665, 0.048494814542162576, 0.012580968706402013, 0.010084548589807507, -0.04014045811237106, -0.014966162184138349, 0.007279165587495861, -0.0077611476892146015, -0.008799262985224198, -0.02647194004824471, -0.03677894191576856, -0.03789120830435027, 0.004473782585184213, -0.0007314696319353332, -0.03035251389285201, -0.006828079774348833, -0.02392608586993546, 0.0026524463738816623, -0.022195893709919465, 0.005950625178912151, 0.012253468047541844, 0.02115777841390987, -0.013915688372700066, -0.0002827010404311847, -0.034702711323749365, -0.04028876029751529, 0.0021735539010200927, -0.017944564402451596, 0.004940316542617097, 0.00021492230737698675, -0.06470918678459818, -0.023394669706501975, -0.015880692325861086, 0.0011138112030102962, -0.038682153291786155, -0.000982501976580511, -0.0072173730103524325, 0.021503816845913067, 0.01499087921499572]
- 正在生成问题3/3的嵌入...
- 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [-0.005322666345764534, -0.00765786266828298, -0.0058958765676160995, -0.03137690633110637, 0.056970883921562465, 0.00897370485243485, -0.025548798459669765, 0.032438615904585126, -0.008561445284009586, -0.029592330665046324, 0.020454851463511028, -0.03808600725287641, 0.023696454097430222, -0.02728819499494348, 0.02204741582372917, 0.010447673994338873, -0.005941055698402429, 0.013045474014552864, 0.001291840770921631, -0.002836402304679297, 0.005017707212956805, -0.0353074907095171, 0.022081300171818915, 0.029050181095610358, -0.027762575868199947, -0.04888381951080935, -0.006567916138062762, -0.02618130629067839, 6.172422262702738e-05, -0.014773575767129996, -0.03158021241964486, -0.004983822864867057, -0.0485223864645187, -0.004334372859813559, 0.004164951119364821, -0.011656215742873208, 0.009651391814229802, -0.009702218336364424, 0.005093946996158737, -0.029569741099653157, 0.0008951115287041683, 0.01616283403880965, 0.011859521831411694, 0.020861463640588, -0.04316865946633857, -0.03767939507579944, -0.05466674825145962, 0.008600977023447623, 0.009967645729734114, -0.00598623482918876, -0.0050346493870016785, -0.10797812257932933, -0.008194364846370652, 0.004441673295431094, 0.011588447046693713, -0.010492853125125204, -0.031399495896499534, 0.008516266153223255, 0.023606095835857564, 0.006042708742671672]
[块3/9] 正在处理文本块,长度: 1000 字符
处理文本块: 22%|██▏ | 2/9 [00:13<00:44, 6.36s/it] - 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [-0.050091071177696116, 0.03007661247467806, -0.0008355377415468224, 0.0005973030691419355, 7.217412085465055e-05, -0.032778893946106405, -0.012292085229789902, 0.016312552784841828, -0.0501350107138169, -0.006596422860133008, -0.0014294080344293217, -0.022332269233389518, 0.03897436853913724, -0.02878039615911487, 0.029175851984201945, 0.038600882482110556, 0.014467092267768811, -0.015049291121369227, 0.009529386896195478, -0.014653835296282152, 0.005827480978019252, -0.02399098672194919, 0.05575927133727752, 0.02137658432276242, -0.014148530630893113, -0.04268725934134366, 0.013862923646108003, -0.018850060995817222, -0.01592808184378495, 0.014478077151799008, -0.0032020936948022843, -0.015082245773459816, -0.03471223353542099, -0.0014706013495425585, 0.021365599438732223, -0.043785747744363314, 0.0054951882361058075, 0.013357618980718964, -0.023441742520439367, -0.017608769100405014, 0.009468970034029398, 0.03726072663042658, 0.023529621592680936, 0.021244765714400062, -0.017070509782925385, -0.005599544634392674, -0.05132137818907812, 0.001079951411218695, 0.0028917707209492327, 0.030845554356791816, -0.01308299687996405, -0.010463102038762181, -0.007650971727031873, -0.02043188429616552, 0.007733358357258347, -0.030625856676187887, 0.01336860386474916, -0.012500798026363636, 0.01952013892165921, -0.014005727138500557]
- 问题生成成功,共3个
- 正在生成问题1/3的嵌入...
- 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [-0.03210761479849173, 0.03997018818615388, -0.015763069155007556, -0.01202139435959277, 0.03215817797140274, -0.029655300912307718, -0.03137444879128208, 0.027784463514600326, -0.05288907886491709, 0.0026687874702092784, -0.014486349039004537, -0.037290340021870316, 0.0319812068662142, -0.022917758121915554, -0.01085212098602565, 0.0068576303260558125, 0.06001848624536958, -0.009196177073190053, -0.005343895337032433, 0.0073822232450075475, 0.003327688817205885, -0.023056806847420834, 0.0319559252797587, 0.012647113624366527, -0.011907627220542996, -0.041032014817285106, 0.018164819868280557, 0.012944172265218714, 0.018544043665113138, -0.0013241230906070902, -0.014195610794766226, -0.008368205116772255, -0.02641925784600304, 0.009101371123981909, 0.009227779056259435, -0.030540156438250404, -0.009095050727368032, 0.02947832980711918, 0.01588947708728508, -0.0348127445492308, -0.003678470829276021, 0.01964379267592762, 0.023625642542669703, 0.00703460143124435, -0.0007489669987443445, -0.02008622043889896, -0.04490009754497742, 0.011256626369313735, -0.004610729329822779, 0.008222835994653099, -0.001601430492040914, -0.05089183353493217, 0.0015966901945805067, -0.011774898891651593, 0.025445916767466086, -0.046366429559396724, 0.01859460683802415, 0.00639940157154978, 0.0052522495861312265, -0.019694355848838628]
- 正在生成问题2/3的嵌入...
- 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [-0.023896582737083585, 0.029335567290587984, 0.04314054013321762, -0.010501171984326265, -0.021351836952010244, -0.039230587381903215, -0.039689296922560215, 0.005037613705429526, -0.042332337609202915, -0.0075086383143258524, 0.006236265421789181, -0.03208782453452997, 0.03418478243467624, -0.018042575265841902, -0.011292992024746082, -0.0039481785463691575, 0.06836956486935249, -0.035517224433727526, -0.012920318728505431, 0.026124600505989002, 0.0022853564614875414, -0.011336678647665797, 0.012079351237300934, 0.0007283379164896077, -0.04104358223307135, -0.004873788869480599, 0.028177871783215562, 0.03027482968336184, -0.010594006058030658, 0.013105986875914216, -0.018697874609637614, 0.003380252448412875, -0.04178625482270649, -0.003470356108184785, 0.006678592478851286, -0.03901215426730465, 0.03617252377752323, 0.00708269374085864, 0.01204658627011115, -0.035058514893070526, -0.008568038920128918, 0.014962668350002062, 0.008540734780804096, 0.02084944078843353, 0.0017774994700458654, -0.02916082079890913, -0.039689296922560215, 0.03047141948650055, 0.021493818476499314, 0.0055700444222635415, -0.00025205133614225643, 0.004764572312181314, 0.002432798813841576, -0.016655524988140984, -0.005223281852838311, -0.03425031236905582, 0.00268126648169745, 0.01976819687117061, -0.006482002675712572, 0.01997570833003925]
- 正在生成问题3/3的嵌入...
- 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [0.012712549322342306, -0.005843017081322934, 0.015635507743172836, -0.008258517442981497, 0.04229010477122267, 0.01593708281833789, -0.042498887515567706, 0.019555983720318547, -0.003871180051397577, -0.03760409206481182, 0.014904768137965202, -0.033126862102746006, 0.006043100544653596, -0.016366247348380467, 0.027837699246005047, 0.005990904858567336, -0.032941277441105976, 0.019660375092491064, 0.013106916728327377, 0.0002620658405580944, -0.0012990926314802355, -0.02994872477216043, 0.024126006013204374, 0.021005863889381308, -0.02239774885168156, -0.04263807601179773, 0.016134266521330425, -0.014301617987635093, 0.011222072508545785, -0.014475603607922626, -0.033706814170371115, -0.011001690722848245, -0.035980226275461526, -0.019834360712778595, -0.005335559022150968, -0.023801232855334316, -0.018465673833183347, -0.01119887442584078, 0.017769731352033222, -0.012086201089307191, 0.0019950351126303617, 0.03502910488455635, 0.028696028306090204, 0.011042287367582003, -0.04590900567320332, -0.04774165420689866, -0.04969029315411901, -0.002506842812309517, -0.0015397727395446543, 0.009453218702289214, -0.02084347731044628, -0.05363396721396972, 0.00020624545404917803, 0.012700950280989803, 0.002915709019985216, -0.01676061475436554, -0.04551463826721825, 0.002073328641759751, 0.02602824879501472, -0.009696798570691759]
[块4/9] 正在处理文本块,长度: 1000 字符
处理文本块: 33%|███▎ | 3/9 [00:17<00:30, 5.13s/it] - 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [-0.018454854097455602, 0.0016788041414189632, -0.023910967557067234, 0.026788917513785458, 0.007812434986674676, -0.024102830887515116, 0.0018301963006004949, 0.008681815702766639, -0.06177999240421785, 0.010912226919223262, -0.004059108584787994, -0.007290806557019498, 0.05141937256003225, -0.005072386798715868, 0.017687400775664077, 0.034871160308902464, 0.037869024847150615, -0.017075836409861454, -0.015840716220103218, 0.027460439170353044, -0.010456551509409544, -0.0370775886090531, 0.04302535185293743, 0.020121666780721574, -0.018910529507269323, -0.05640781915167717, 0.0213807698867858, -0.018682691802362462, -0.026812900430091442, 0.027580353751882968, -0.002940905112021434, 0.01802316160394787, -0.033839894907745104, -0.0013947564764199513, -0.027388490421435086, -0.02087712864436011, -0.002922917924791945, 0.017627443484899114, 0.01381415979224747, -0.027052729593151293, -0.00903556371827992, 0.04412856600301275, 0.04475212182696837, 0.07746481966833216, 0.001977091662974654, -0.013166621051985869, -0.03136965452822863, -0.010420577134950565, -0.020265564278557486, 0.02573366919632211, 0.010852269628458299, -0.01648825496036482, -0.002375807646561658, -0.03247286867830395, 0.031033893699944835, -0.03434353615017079, 0.022927667988521843, -0.023671138394007384, 0.020397470318240404, -0.026165361689829844]
- 问题生成成功,共3个
- 正在生成问题1/3的嵌入...
- 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [-0.0004379651367598164, 0.015351909120902864, -0.02514084028664186, 0.00961583421219849, 0.023290493541898515, -0.015375784562770521, -0.020413502796845957, -0.013501562376159518, -0.004023011954700051, 0.017297757633116837, -0.006109128687886501, 0.022741358378942426, 0.014599832702071698, 0.016867999679499028, -0.019589800052411822, 0.006022580211116248, 0.06575296690352486, 0.014874400283549744, 0.0077177365837198305, 0.012283914840909058, -0.013883569446042016, -0.0014377492649679151, 0.0020144904075834824, -0.0032858576870361694, -0.025475096472789047, 0.016867999679499028, -0.0039663077802643675, -0.009412892956323414, -0.022992050518552812, 0.04713012224675312, -0.009609865351731575, -0.007604328234848464, 0.01257042014332093, 0.02270554521614094, -0.02697924931045138, 0.04113738633797144, -0.022753296099876252, -0.010379848351963485, 0.030011430427643704, 0.004005105373299309, -0.04531558866481126, 0.014134261585652404, -0.007073099653293116, 0.047798634619047486, -0.019028727168521904, -0.005861420978509569, -0.0009281578026051305, 0.02050900456431658, -0.022538417123067346, -0.020735821262059313, -0.010302253165893602, -0.005246628350417426, -0.019971807122294318, -0.005530149222595842, 0.020735821262059313, -0.025093089402906547, 0.039275101872294264, -0.003805148547657689, 0.01820502442408777, 0.030202433962584953]
- 正在生成问题2/3的嵌入...
- 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [0.007970831933839136, 0.02534019376211078, -0.005205028394794881, 0.004164600731204567, 0.004340895418646258, -0.011138356154102982, -0.022658202451522415, -0.004947811555740609, -0.03377921814456556, 0.039651854290164226, -0.0042946541891533555, 0.021548412943692745, 0.026843033720630127, 0.025432676221096587, -0.035929435315985546, -0.028623321056106887, 0.06274934842186922, -0.014300100220680219, -0.017930036735873095, 0.007913030396973006, -0.011115235539356531, -0.044576545231158386, 0.005332191775900364, 0.010138389566318958, -0.03172148343213138, -0.021282525874108553, 0.023767991959352086, 0.018912662862597282, -0.02182586032065016, 0.001557751418542166, -0.024924022696674656, -0.014589107905010862, -0.025871967901279165, -0.0058090544550459255, -0.004531640490304483, -0.0013735090197813811, 0.02959438687545785, -0.0030374707623150583, 0.03384857998880492, 0.006872602733382692, -0.022207350463966612, 0.003806231202634569, 0.029825593022922363, 0.03461156027543781, -0.025108987614646267, 0.014820314052475376, -0.032230136956553315, 0.007884129628539942, -0.012438890733590877, 0.02529395253261788, 0.025687002983307554, -0.07389348472965881, -0.020380821898996947, -0.0255713999095753, 0.003921834276366826, -0.021005078497151136, -0.011930237209168946, 0.008271399925543004, 0.023236217820183702, 0.03269254925148234]
- 正在生成问题3/3的嵌入...
处理文本块: 44%|████▍ | 4/9 [00:21<00:22, 4.55s/it] - 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [-0.03227839495722577, 0.02595247835756846, 0.00015930650777525282, 0.02024061593333576, 0.018896648304104535, -0.016625806447817294, -0.03139786444152255, 0.03814087444335507, -0.03190764526640336, 0.016579462736464493, 0.024538995161308034, -0.0169502124272869, 0.05389773630330735, 0.026925696295977276, 0.007635126445373935, 0.022337668872049996, 0.04013365403152551, 0.011968263456860811, 0.0029587563216803775, 0.022847449696930804, 0.007432372708205431, -0.04117638753696353, 0.02203643474825679, 0.0004294034505032951, -0.03478095537027701, -0.05709545238665061, -0.03763109361847426, -0.014864745416410863, -0.027968429801415295, 0.015328182529938872, 0.013416504436635838, -0.006656115543046018, -0.023461503872355415, -0.025210978975923646, -0.018931406087619135, -0.03505901763839382, 0.004199898841347575, 0.019058851293839336, 0.021758372480139985, -0.04400335392948438, -0.018433211190576525, 0.013833597838811044, -0.008880613687980456, 0.024608510728337234, -0.002728486005896148, 0.008341868043504147, -0.029683147121468925, -0.03663470382438905, 0.012234739797139417, -0.017958188149210318, -0.012605489487961823, -0.052183018983253726, -0.008770547373517555, -0.025210978975923646, 0.03454923681351301, -0.04601930537333122, 0.011064561085481196, -0.010884979203989092, 0.01639408789105329, -0.007530853094830133]
[块5/9] 正在处理文本块,长度: 1000 字符
- 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [-0.019952177866800903, -0.0068800613333796215, -0.009707883152607687, 0.043076180619498854, 0.03950787762286467, -0.008722518436216877, 0.008757501798928976, 0.020826761934603397, -0.010489178253177914, 0.013048794291613215, 0.015171118296147267, -0.054294178929178843, 0.04846361847716222, 0.0003403589663864707, 0.015625902011404565, 0.009515474657691137, 0.05037604230542367, 0.021922907299582524, -0.011900173882565939, -0.0031659943254450295, -0.03953119986467274, -0.02209782411314302, 0.04897670779693968, 0.030248947625062268, -0.014273211986536707, -0.058911982807176015, 0.0040551547943775655, -0.0031485026440889793, -0.005358285055403282, 0.02109496771539616, -0.006436938739026358, -0.01328201670969388, -0.018086398522155583, -0.006530227706258624, -0.038644954675966216, -0.014028328447552009, 0.0025756500796783456, 0.010413380967301699, 0.015171118296147267, -0.02770682326798302, 0.003886068541269083, 0.020686828483755, 0.021817957211446224, 0.02814994586233628, -0.008396007050903945, 0.022972408180945515, 0.0006778026525469331, -0.0032592832926772956, -0.013153744379749513, -0.00923560775599434, 0.010302600318713382, -0.03010901417421387, 0.05121564301051407, -0.02756688981713462, 0.03454024011774651, 0.0016573368084857266, 0.014121617414784274, -0.046551194648900764, 0.01482128466902627, -0.0025843959203563707]
- 问题生成成功,共3个
- 正在生成问题1/3的嵌入...
- 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [-0.017083061556491707, 0.045983150247426975, 0.01733546407780992, -0.018069725958008355, 0.045616019307327754, -0.02411591362776734, 0.0317568263185823, 0.03726379042007056, -0.014146014035697969, -0.012551289014641993, 0.010904936205134564, -0.05048050426364238, 0.012344777860836183, 0.03370720943785939, 0.013744464569964449, 0.025033740978015383, 0.03322535007897917, -0.03267465366883034, -0.006602620500846862, 0.019744760872211033, -0.0018586003842522877, -0.02838381080642074, 0.006229753139808595, 0.03258287093380554, -0.012769273010325903, -0.04749756537533625, 0.04311493977790184, 0.03111434717340867, -0.022830955337420078, 0.04609787866620798, 0.016440582411318076, 0.009241374132809987, -0.016004614419950255, 0.022280258927271254, 0.0402237836246205, -0.010864781258561213, -0.027649548926222307, 0.0190678632014031, -0.002843830680534172, -0.033041784608929564, 0.03547402708708688, -0.00023644810058147836, 0.02137390441890131, 0.009195482765297584, -0.015626010637972938, -0.012849582903472608, -0.032123957258681515, 0.004784175063167926, -0.017392828287200422, 0.027534820507441302, 0.003134954043190973, -0.0613108669965693, 0.007675331216449263, -0.007032852071275632, -0.00593145925097798, -0.019182591620184108, -0.0021511578521438515, -0.01866631373566958, -0.012792218694082104, -0.02820024533637113]
- 正在生成问题2/3的嵌入...
- 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [0.010541161968128597, 0.008980811893825755, -0.008171306968059618, 0.027828198317641658, 0.01857168547083758, -0.015462717277677784, 0.0230884883175037, 0.01745714970347841, 0.022314179258075224, 0.010329986770102648, 0.0034902567451510933, -0.01936945844115783, 0.04272778173391691, -0.01890018022332239, -0.0251063846541961, -0.01231855321818033, 0.0464350796548169, 0.0031353650929130407, -0.0030033805941468233, 0.002139615374443464, -0.019838736658993272, -0.0019079092543872152, 0.030057269852360003, 0.04566077059538842, -0.024848281634386606, -0.015251542079651835, 0.011350666893894731, -0.0009678863242855973, 0.03798807173377896, 0.019533705817400236, -0.03519586633765808, -0.021246571312499598, -0.01518115034697652, -0.04216464787251439, -0.06114695178395798, 0.015392325545002469, -0.0017891232054976192, 0.02524716811954673, 0.04209425613983907, -0.017234242550006575, 0.023440446980880283, 0.025364487674005592, 0.015931995495513225, 0.009174389158682874, -0.02100020024813599, 0.04425293594188209, -0.004352555470423717, -0.014066614579617347, -0.008071585346769588, 0.00617100856453605, -7.089584013588158e-05, 0.029024857773122033, 0.06405647673453771, -0.005918771522449501, -0.0016087443905171215, -0.008740306807185091, -0.018689005025296444, -0.0009972162129003124, 0.0019445716151556092, 0.002962318750086222]
- 正在生成问题3/3的嵌入...
处理文本块: 56%|█████▌ | 5/9 [00:25<00:17, 4.35s/it] - 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [-0.014465555883369126, 0.015079507249975262, 0.014885627871047009, 0.013313050686406732, 0.03591076940815537, -0.00766362100596957, 0.008024452072308265, 0.018590878223898072, 0.002967431605262989, 0.024256464519245922, 0.022705429487819895, -0.03504908327958536, 0.03349804824815933, -0.028026341331739736, -0.009160800654359971, -0.024450343898174173, 0.02716465520316972, 0.026346053381028207, -0.01123423290123157, 0.022253044270320637, -0.021434442448179124, 0.008665331130432214, 0.030697568330306783, 0.029771255742094018, -0.016953674579615045, -0.016953674579615045, -0.01297914731158585, 0.00029418502983210665, 0.013948544206227117, 0.002539281310129763, -0.01593042230193815, -0.016081217374437903, -0.034898288207085605, -0.023157814705319153, -0.05605268266347947, 0.030956074168877787, 0.0009034240504226251, 0.009957860323287236, 0.03347650609494508, -0.01651206043872291, 0.011675847042123703, 0.0037994972731634096, -0.01414242358515537, 0.013334592839620981, -0.009144644039449285, 0.014487098036583377, 0.005040863852134587, -0.00562788752722291, 0.004103780187314696, 0.011718931348552203, 0.008455295136593271, 0.0015187218016046514, 0.04273963197707274, -0.015208760169260763, 0.022985477479605148, -0.0016816343352874198, 0.006990428718024247, -0.0049223820094562104, 0.0011612566967056843, 0.018429312074791194]
[块6/9] 正在处理文本块,长度: 1000 字符
- 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [-0.0058613501362347045, 0.007320175059030898, 0.004681264975472784, 0.051267275858266216, 0.029426582728403212, -0.018339513315152142, -0.0019472707674824186, 0.021319684228864367, 0.015328081867380002, -0.007273284257941021, 0.002704036196182944, -0.008508075353307798, 0.024529099059015992, 0.006366728770203386, -0.019360690761109477, -0.002873364089007502, 0.042660208813768676, 0.028634649198885278, -0.021903214197982843, -0.016057494328778097, -0.026467252170730933, -0.003428238568571054, 0.025883722201612457, 0.007132611854671387, 0.004043029071749449, -0.040951299618493135, -0.014161021929143047, 0.011024548345131231, -0.0007385301171655728, 0.010560850423242441, -0.021319684228864367, 0.004775046577652539, -0.018954303818330537, -0.0019251278891899764, -0.057852828366889035, -0.003402188123521122, -0.008773789892817106, -0.007856814227059498, -0.005262189900086268, -0.021465566721143987, 0.006408409482283277, 0.026217167898251587, -0.007512948352400395, 0.03198994652131652, 0.003378742722976183, 0.011034968523151204, 0.013369088399625114, -0.0016841612724781052, -0.012514633801987344, 0.006715804733872475, -0.005402862303355901, -0.0361371773732657, 0.027154983920049138, -0.010404547752942849, -0.006387569126243332, 0.018766740613971027, 0.00014791768329914525, -0.021653129925503497, 0.033052804679353745, 0.010545220156212481]
- 问题生成成功,共3个
- 正在生成问题1/3的嵌入...
- 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [-0.013678610094777848, 0.028565668825748475, 0.0037314065601590884, 0.0015105607952409746, 0.02023508673795144, -0.03903031978171574, -0.006129020077711785, -0.004101011861122306, 0.022343443933011183, -0.016211209896160587, 0.025313142177272163, -0.04301562911384086, 0.018075306196670728, -0.009821859128205147, -0.03874749137750041, -0.009024797261780121, 0.03869606803127944, 0.029311307345952532, 0.0191294847942006, -0.027357220189555696, -0.0072764034902671635, 0.011962355914652995, 0.006180443423932754, 0.018396702110551787, -0.03766760110686005, -0.0580569578834744, -0.005788340408997863, 0.006845732965666545, 0.0007114902043541931, 0.017393946859242882, -0.02540313303315886, 0.0258402314760371, 0.018036738687005, 0.00885124346828435, -0.03630488243200436, -0.01506704044274402, -0.008401289188850868, 0.02182921047080149, 0.0010903356378414908, -0.03391369683272929, -0.025197439648274984, -0.018653818841656632, 0.022690551520002728, -0.0113259920051685, -0.05384024349335491, 0.008060609520136945, -0.03147108788723325, -0.005961894202493634, -0.019052349774869144, 0.005646926206890197, -0.010850326052624534, -0.01797245950422879, 0.01600551651127671, 0.017136830128138037, 0.020980725258155496, -0.010554641811853959, -0.018833800553430025, -0.01731681183991143, 0.009584026151933163, -0.0021806712756829816]
- 正在生成问题2/3的嵌入...
- 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [-0.014870014162016682, 0.012655575324617592, -0.004572299879292938, 0.030198979378416594, 0.023521241537140584, -0.03570639203101537, -0.02687158423413817, 0.015776442494423563, -0.01676318726134751, 0.0018300673293531332, 0.004942329166889418, -0.03361816473357167, 0.03208067870138784, 0.015306017663680751, -0.028432017819041155, 0.0020136477511064257, 0.02386545482792801, 0.01262115399553885, -0.01637307886512176, 0.0023134001585004733, -0.027973066764657925, -0.0033474742529076905, 0.029946556298505816, 0.03979105641502612, -0.00876022825053992, -0.05654276990001405, -0.04112201447273749, 0.017910564897305588, -0.014284851567678062, -0.0042825870262135236, 0.014927383043814586, 0.002867009867850246, -0.0023263081569050016, -0.015374860321838235, -0.06907213368467625, 0.018610465255240013, -0.027055164655891462, -0.00799148523444801, -0.01052718980991536, -0.028386122713602833, -0.016740239708628347, -0.0010541532030364836, -0.006494157419522717, 0.03554575916198124, -0.03212657380682617, 0.00920196864038378, 0.004715722083787698, -0.005452912214890762, -0.024232615671434594, -1.1765102712460753e-06, -0.04158096552712072, -0.04139738510536743, -0.009425707279395606, -0.03621123819083692, -0.01040671515813976, -0.00392689995906652, 0.0020093450849715828, -0.010888613765242154, 0.012150729164796039, 0.024829252042132793]
- 正在生成问题3/3的嵌入...
- 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [-0.009625029528551148, -0.006527866561443105, -0.028184183000683188, 0.011000884615862605, 0.003147789669455001, -0.03242491383256958, 0.05460536554316641, -0.00039347370868187515, -0.010655431823377477, 0.006813758527637694, 0.012591158677820005, 0.02461053342325083, 0.033068170756507406, 0.02522996601667244, 0.017260727458998284, 0.009166411166113995, -0.010184901295682218, -0.002573027695751297, -0.0391195507076262, 0.014270773979213211, -0.022073242223273858, -0.032949049103926334, -0.025539682313383244, -0.01522374719986184, 0.014199300987664564, 0.000747488369946268, 0.03628445537619653, 0.001949128040357898, 0.02677854750022646, -0.015855091958541557, -0.013472658906919985, -0.02806506134810211, 0.02928010220442911, -0.0019997547427048562, -0.005741663654407987, -0.0029616620872970657, 0.0023973232581942063, 0.011900253092849748, 0.012424388364206495, -0.025039371372542713, 0.04305056524280179, 0.0046963711530090225, 0.013877672525695653, 0.021513370456142787, -0.011030665029007874, 0.013258239932274044, 0.009184279414001158, -0.04864928291411248, -0.020631870227042808, -0.010196813460940325, -0.011715614531349077, -0.050650526677474605, 0.017379849111579363, 0.006992441006509312, 0.0003279567997622819, 0.0014436055272169458, -0.011334425243089625, 0.0034426157595931705, 0.009077069926678186, -0.008803090125741707]
[块7/9] 正在处理文本块,长度: 1000 字符
处理文本块: 67%|██████▋ | 6/9 [00:28<00:11, 3.84s/it] - 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [0.01013475130115276, 0.0138590582188812, -0.008592616095706343, 0.019221253835204687, 0.005699852671110777, -0.03052016602674347, 0.015784207396922153, -0.0014174036814764864, 0.0031296273287000818, -0.005301719725913826, -0.010915898218944247, 0.002221229058171596, 0.015370955479122785, -0.0022187092294045268, -0.020319899177646907, -0.008249919383384917, 0.037878066026585853, -0.008335593561465274, -0.03207238054725817, -0.013969930684632249, 0.016217617944858074, -0.009086502534051927, 0.019271650410546074, 0.0013657471917515654, 0.004160237294431429, -0.02140846520502085, -0.0015660735787335755, 0.023021155615945206, -0.006551554794380203, 0.0179311015064652, -0.00539747321906246, 0.008471664314887016, -0.01728602534209546, -0.011409784657289831, -0.03301983616367622, 0.010784867123056643, 0.01433278602709023, 0.004908626438251014, -0.03451157479378125, -0.00992308568471894, 0.012357240273707892, 0.005006899760166717, -0.004329065821825072, 0.05539591561525169, 0.009046185273778819, 0.006213897739592916, -0.016953407944842312, 0.00335389208896925, -0.009857570136775136, 0.0277181164377624, -0.010104513355947928, -0.030419372876060696, 0.02106576849269942, -0.006566673766982619, -0.010784867123056643, 0.0034068084930777057, -0.002797009931446933, 0.0013077911301089713, -0.009671102808012007, 0.02465400465700612]
- 问题生成成功,共3个
- 正在生成问题1/3的嵌入...
- 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [-0.0013690383295032699, 0.028436529482006655, 0.02646314089893888, 0.02664074587141498, -0.016211387209901784, -0.04199370904768228, -0.015727907007050176, -0.025436978835743636, -0.02030616851976742, -0.000967577089635419, 0.0013739718009609392, -0.021016588409671817, 0.04009925600793721, -0.036053809412648276, -0.027410367418811414, -0.021825677728729606, 0.023878001855120092, -0.0038505744727109987, -0.03680369707421403, 0.036014341640986915, -0.02725249633216599, -0.04894003686008085, -0.0008158728423120837, -0.020246966862275385, -0.0038357740583379904, -0.017750630304694648, 0.003537299035148989, 0.005328149174282996, 0.04191477350435957, 0.00032745916800280914, 0.003246224219146492, -0.027548504619626157, -0.012304077815427585, 0.016744202127330084, -0.03733651199164233, 0.006008968235441379, 0.038500811255652315, 0.017336218702250415, -0.004955672079228954, -0.028574666682821402, 0.02064164457888894, 0.05332095951449132, -0.010754967777719381, 0.056162639074108914, 0.001486208276622919, 0.014425470542225444, -0.024134542370918905, 0.002461802257377051, 0.007513677030030559, 0.008983851524416052, -0.028732537769466823, -0.015303628461690606, 0.007395273715046492, -0.010212285917375743, -0.0015367763590640308, 0.0019289873399487513, 0.005614290518827824, -0.00037216875308793845, 0.006956194755313912, 0.0056488248190315095]
- 正在生成问题2/3的嵌入...
- 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [-0.0029212037685760325, 0.038577632414513854, 0.018376336071302592, 0.031632644713083635, -0.040326552675093, -0.024358150295892125, 0.04088417942484287, 0.01000559815744372, -0.02925005769142509, 0.04600420685436442, 0.02879381762344792, 0.010937088296230439, -0.0015200081431392101, -0.013167595295229926, 0.010892731622954881, -0.02141793652448371, 0.038400205721411625, -0.01459968217526937, -0.004394478988085637, 0.017413162594461903, -0.020302683024983965, -0.0010305005702053738, -0.006076864238751443, 0.036600592119946124, 0.009929558146114193, -0.014219482118621729, -0.04101091277705875, -0.023952603568801308, 0.025245283761403284, 0.027830644146607235, -0.028895204305220627, -0.04012377931154759, -0.00808558787137314, -0.0294528310549705, -0.04889372728488648, 0.05591475499764623, 0.01049352156347486, -0.01661474247550186, 0.055864061656759875, 0.008484797930853162, 0.014903842220587481, 0.009929558146114193, 0.03191145808795857, -0.017147022554808555, -0.005813892532903492, 0.026335190590459853, -0.025942317198590623, -0.020112582996660147, -0.03257047151948114, -0.012774721903360698, -0.014625028845712545, 0.004489529002247547, 0.0822245989176629, 0.02172209656980182, 0.020074562990995384, -0.016842862509490443, -0.02620845723824397, -0.031860764747072216, 0.018528416093961646, 0.0013655518701261064]
- 正在生成问题3/3的嵌入...
处理文本块: 78%|███████▊ | 7/9 [00:33<00:08, 4.34s/it] - 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [0.006217829831975255, 0.005985652215298584, -0.023353670516453512, 0.005991315083998015, 0.018676140970723492, -0.029333659863052662, 0.03954947499682622, -0.0038649078873616684, 0.037986523235783255, 0.005744980295572766, 0.00029623881883898505, -0.0003049100865349888, 0.010176175052877537, 0.04496317747348227, 0.0019423639639048384, 0.003745987644673617, 0.030194415905366177, -0.02241363431234796, -0.026321013714955363, 0.0074126951275552, 0.0014638515588029177, -0.0021334857825106354, 0.005192850597378242, 0.016139175793378396, -0.0020697785096420365, 0.00835839420036018, 0.004807775525816933, 0.015504934499042122, -0.0030834320068401882, 0.00011608880833833583, -0.018925307193498456, -0.008715154928424333, 0.019593525700031315, -0.010787764872416086, -0.04188257690099179, 0.018913981456099595, 0.0015034916396989347, 0.02288931528310017, -0.007378717915358614, -0.048202338369556806, 0.021099848774079968, -0.024055866235182955, -0.000828902405879215, 0.008177182401978388, -0.02146227237084355, 0.017928642302398597, 0.004380228939009891, -0.051826574337192656, -0.00048700670815106736, 0.03071539982571383, 0.00223117026757582, -0.0871628750216422, 0.014247777647768436, 0.009105892868685074, -0.00958723670813671, -0.0007262629107020278, -0.019808714710609694, 0.008148868058481232, 0.040772654635903315, 0.018800724082110972]
[块8/9] 正在处理文本块,长度: 1000 字符
- 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [-0.004970817026320638, 0.0034056849738137137, -0.0017346218179222258, 0.015900258618022967, -0.0218853659592508, -0.024999740398756964, 0.015995596611069075, 0.00379233350116737, 0.001648552796422268, 0.010932089869286944, -0.008500971046611213, -0.0056143484793818605, 0.04025381928613409, 0.020677751380666777, -0.0014777388922146595, -0.007637632554027021, 0.019311240147005908, -0.005240941339951274, -0.03493607789622901, -0.01722439741033001, 0.0020140151030990116, 0.020645972049651407, 0.011345221172486741, 0.028898005003308896, -0.0050899895176282715, -0.048474072908776214, -0.00030306226608927436, 0.015095182232300285, -0.03163102747063063, 0.022690442344973482, 0.005336279332997382, 0.012881222171562911, 0.006821963057965883, -0.0021795324521373923, -0.04525376736588549, 0.015360009990761694, 0.011790131806701907, 0.0012830904897455243, -0.02724547979050971, 0.0007296004745611805, 0.005227699952028204, -0.028368349486386082, 0.009486130308087653, 0.06457560062322985, -0.021345117331989526, 0.0024417119330141864, 0.007860087871134605, 0.014480781832669818, 0.004163092363013342, 0.028071742396909304, 0.018018880685714236, 0.0005664003684093374, 0.023283656523927037, -0.006419424865104543, -0.0010334903273956468, 0.00530979655715124, -0.0030004985033677584, -0.02474550575063401, 0.019544288574451948, 0.0496181288253295]
- 问题生成成功,共3个
- 正在生成问题1/3的嵌入...
- 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [-0.014352662829251968, 0.04617369109481772, 0.014945157095512902, 0.0342012207489933, -0.009944709796638284, -0.04515214925643679, -0.002550023814058379, 0.00044564762699367765, -0.001737898052545545, 0.014025769440970072, 0.01854098436661375, 0.018091505957726146, 0.005976019754528399, -0.010767050976534928, -0.0053171252687727035, -0.0032842570103946674, 0.015353773830865272, -0.004630138382461534, -0.04253700215018163, -0.010067294817243996, 9.656762690944662e-06, 0.010240956929768752, -0.019296925327015633, 0.006302913142810294, -0.023434169772458373, -0.014250508645413875, -0.036918522039086556, 0.018234521815099474, 0.019664680388832765, 0.0065940525667488575, -0.049524348324707146, -0.013770383981374841, -0.005546972182408411, 0.008999783596135931, -0.06627763447415427, 0.016222084393489055, 0.02796981553486967, 0.0053528792331160365, 0.00920919967300402, 0.0037592739652417963, 0.004333891249331066, 0.020910961431657493, 0.012248286642187266, 0.029869883354258185, 0.006619591112708381, 0.007447040001796928, -0.020890530594889874, -0.039574530818876956, 0.0010911343761206233, 0.019797480827822286, -0.014914510840361475, 0.010889635997140639, 0.0370819687332275, 0.009147907162701165, -0.010562742608858743, -0.002980348313476343, -0.02721387457446779, -0.03236244543990764, 0.015874760168439542, 0.05095450689844044]
- 正在生成问题2/3的嵌入...
- 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [0.003983394080683475, -0.0012688049498280926, 0.0024178085739213436, 0.010842019979003142, 0.012426664787586725, -0.015890012204284865, 0.001828331596157863, -0.012099933899218977, 0.01997414830888173, -0.0030739931080599066, -0.0184929682816146, -0.010106875480175706, 0.023982047206192788, 0.018623660636961702, 0.002321150686112551, -0.013842498637180305, 0.029057267005505157, 0.024896893693622486, -0.023045418659538575, 0.03064735732889487, -0.009616779147624084, 0.034785948581553025, 0.006512835708130466, 0.030821613802691, -0.015149422190651301, -0.04639578614822038, -0.030821613802691, 0.00440269872075542, -0.029819639078363238, 0.020834539648250137, -0.010874693067839917, 0.04587301672683198, -0.004448985596607518, 0.02237017482357856, -0.07832828497136173, 0.01812267327479782, -0.034589910048532374, -0.0073732270474988724, -0.013003889357036415, -0.016478127803346815, -0.008734605749031161, -0.0032673088836774914, -0.015367242782896468, 0.05175417271745147, -0.04136413046735704, 0.005047992225281724, 0.0026437974383757036, 0.0030331517470139377, -0.033370114731959444, 0.01755633973496039, -0.006643528063477566, -0.017872179593715877, 0.01201280566232091, -0.035896833602003374, 0.008576685819653415, 0.004470767655832034, -0.0026764705272124783, -0.04600370908217908, 0.025920650477174765, 0.013374184363853198]
- 正在生成问题3/3的嵌入...
- 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [-0.016281916148428846, 0.016823203254427148, 0.008222151140114169, 0.01144280942080405, -0.0018309036360392478, -0.027995369122232047, 0.0028309315643711063, 0.03845303601011919, -0.016346870601148644, 0.0060786542003609025, 0.01236299750100116, -0.014831266704353405, 0.04089965372923151, 0.01972450214257803, -0.01738614184466538, -0.045251602061457834, 0.011691801489563268, -0.0013653967248807104, -0.013564654876317384, -0.003569788464058785, -0.02725921865807436, 0.011962445042562418, 0.030593547231023886, 0.04165745567762912, 0.0021543226818732317, -0.07140659502329566, -0.02868821661790987, 0.011529415357763778, 0.03877780827371817, 0.0018620276446341501, -0.02140249217117276, -0.03427429955181232, 0.024812600938962048, -0.004863464647394721, -0.04137598638251001, 0.02164065849781201, 0.02624159889879756, 0.0015223699856202171, 0.006711960114378913, 0.010674181730286465, 0.00428428744397654, 0.004514334464025818, 0.023340300010646672, -0.012796027185799799, -0.052050168112796474, 0.013153276675758677, -0.018880094257220686, -0.015675674589710753, -0.008969127346391822, -0.004281581008446549, -0.0002385046310805007, 0.01903165464690021, 0.06755263082858777, 0.03178437886422014, 0.0019932897678387376, -0.004008231019917407, 0.007599670968216125, -0.02515902468680096, 0.026761234520555927, 0.005651037386622247]
[块9/9] 正在处理文本块,长度: 584 字符
处理文本块: 89%|████████▉ | 8/9 [00:39<00:04, 4.79s/it] - 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [-0.007393327515499337, -0.0037067227067503474, -0.017995459762215393, 0.035850094238421275, -0.007951599185037041, -0.02412135970362913, -0.031665571454138654, 0.008630578242582898, -0.002723460441933939, -0.004531556524806056, 0.01309675159888454, 0.010184685863187862, 0.038867778938625086, 0.016003787860080877, 0.005160240837348516, -0.006110811517912717, -0.002663106747929863, 0.014555299203983048, -0.03906895791863867, -0.005718512506886222, -0.005929750435900489, -0.002376426701410501, 0.04433984719499466, 0.047639182467217496, 0.002854226778942771, -0.06707307193653003, 0.013700288538925302, 0.05918685591999741, -0.009641502617151175, -0.0011781544017045711, 0.00799183498103976, -0.0014887244521005467, 0.022773460537538093, -0.0014396870757222347, -0.06719377932453818, -0.017784221833201126, 0.016838680627137264, 0.016888975372140663, -0.018236874538231695, -0.022049216209489176, 0.00622145995692019, 0.005280948225356669, 0.015178954042025168, 0.0031937163077157, -0.022371102577510918, 0.012895572618870952, -0.0057537188283886, 0.012422802015839021, -0.015510899359047587, 0.02170721194346608, 0.0018319860867487302, -0.03520632150237779, 0.00908826042211381, 0.01403223385594772, 0.01497777506201158, 0.011738793483792824, 0.0008744998787465626, -0.023678765947599236, 0.05528398370773381, 0.044621497767013685]
- 问题生成成功,共3个
- 正在生成问题1/3的嵌入...
- 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [-0.03244590216925163, 0.022547152354903677, -0.009425809545440221, 0.01698185523705916, 0.011202085206570437, -0.021513282929849557, 0.007473556109832706, 0.02958626333399556, -0.03136803830057819, 0.019478539912455812, 0.007550546386166524, -0.0058127658632032165, 0.02723256060036171, 0.032357913282012984, -0.002300084505472796, -0.026902602273216777, 0.0227671245730003, 0.001443567681259077, -0.017663769113158687, -0.003681785000392198, -0.0209963482173225, 0.0012552164695138449, 0.03814318261795412, 0.03739527707642561, 0.002733154809850519, -0.06515577100021921, -0.018873616312690102, 0.018972603810833582, 0.045314276927903976, 0.007979492211454936, -0.014672146947044638, -0.012857376147747512, 0.03539352989174636, -0.014859123332426765, -0.050593610162222885, 0.02798046614189022, 0.02206321347509111, -0.0043197044328724, -0.007968493600550105, -0.006434187379326171, -0.01377026085284849, -0.0014600655976163236, 0.024922852310347188, -0.021051341271846654, -0.04623816024390978, 0.018752631592736962, -0.00947530329451196, -0.01346229974751322, -0.009689776207156166, -0.0009802511968930684, -0.008930872054722823, 0.015958984422909873, 0.05965646554780368, 0.02329505789643219, -0.00626370891030129, 0.004600169010945592, 0.015255073325000684, -0.019962478792268378, 0.032357913282012984, -0.014683145557949468]
- 正在生成问题2/3的嵌入...
- 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [0.008920837448954834, 0.011495009449150757, -0.0004888985075803139, 0.018274402087597775, -0.00767813372472232, -0.013603167552759485, -0.01842974005312684, 0.006496455629804795, 0.0028460134398717845, -0.006474264491872071, 0.027805495829702504, 0.023722326450081387, 0.02765015786417344, 0.034263116968125035, 0.009442329190373836, -0.023123165725897853, 0.033885867623268735, -0.0022690438536209744, -0.059738543314891576, 0.013858365638985806, -0.02514255927777569, -0.006696175871199307, 0.006302283172893465, 0.03539486500269393, -0.013647549828624933, -0.02982488938158034, -0.004044334888238852, 0.043760924003330676, -0.0022704307997417697, 0.02123691900161636, 0.02323412141556147, 0.022812489794839724, -0.013714123242423102, -0.036371275071733766, -0.07181052235029314, 0.007938879595431821, -0.014890253552857446, 0.044071599934388805, 0.008981863078269824, -0.046512625106988385, -0.02720633510551897, 0.01497901810458834, -0.01593323703569545, 0.004335593573605847, -0.04953061986583878, 0.012715522035450547, 0.003897318599434559, -0.022057991105127128, -0.028116171760760634, -0.018152350828967795, -0.0040332393192724905, -0.0434058657964071, -0.052326703245361934, 0.028959435002204124, 0.0032787406295598925, 0.002061001935501692, -0.03308698665769069, 0.009298086793811134, 0.047133976969104645, 0.028693141347011442]
- 正在生成问题3/3的嵌入...
处理文本块: 100%|██████████| 9/9 [00:43<00:00, 4.81s/it]
- 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [-0.018177377690553307, 0.004562490692437795, 0.01148918110732063, 0.010161911087702362, 0.01867510394791016, -0.05396182173510519, -0.022957623620584722, 0.02878516855047118, -0.025010744432181733, -0.00985083217685433, 0.061220329654892595, -0.005197610135419192, 0.042804458132689134, 0.01261943448340181, -0.012474264325006063, -0.01711970939367, -0.004298073618216968, 0.003066719596110177, -0.006309717241700905, 0.01975351083885, 0.0032378129970765945, 0.011914322285479607, 0.023911598947185354, 0.03826270603430787, 0.014890310532592441, -0.05545500050717574, -0.013355654572408818, 0.04952376260700661, 0.018654365353853623, -0.015906501641362678, -0.01725451025503748, -0.0125261108101474, 0.006859289984199094, 0.008259145083015236, -0.07258507919787402, -0.00520020245967626, 0.010784068909398425, -0.012038753849818819, -0.027623807283305195, -0.02791414760009669, 0.0058845760635419285, 0.03353430658941779, -0.006247501459531299, 0.003289659482217933, -0.030568687639333226, 0.009161273924474527, -0.034882315203092595, -0.024907051461899053, -0.0006519695506523326, 0.018312178551920785, 0.002614359013251998, -0.02973914387707181, 0.017907775967818347, 0.01112625571133126, -0.0019986820021986023, 0.0035670381777240944, -0.010379666325295984, -0.01821885487866638, 0.026773524926987244, 0.033430613619135116]
[主流程] 文档处理完毕,向量库构建完成
[主流程] 向量库共包含 36 条数据
[主流程] 正在加载验证集...
[主流程] 示例问题: Java程序员面试中常见的技术问题有哪些?
[检索] 正在对问题进行向量化: Java程序员面试中常见的技术问题有哪些?
- 嵌入生成成功,向量维度: 1536
嵌入内容(前60维): [-0.007530440863964214, 0.006082679757337622, 0.009759368035317384, 0.0136443672638909, 0.021393534625978556, -0.0051895465566669016, -0.005853537711684492, -0.008587618938227516, -0.023309995371441093, 0.02318500880108484, 0.015029635085339365, -0.047286585784782204, 0.048661438058700976, 0.026767957151297414, -0.01733147108940035, -0.004317244451055555, 0.06815934303427637, -0.02605969991927865, -0.04164135902369146, 0.0694092087378389, -0.03526704393552258, -0.0311008249236475, -0.0010174687993063682, -0.010915493811112719, -0.018414688032487872, -0.006728443704178261, 0.03657940292426323, -0.00417403067252235, -0.03520455065034446, 0.03185074434578502, 0.025809726778566144, 0.011092558119117411, -0.027184579052484923, -0.016175345313605014, 0.008056426014213443, -0.017446042112226914, 0.04103725726696957, 0.029663479364550598, 0.03762095767723201, -0.03235069062721003, 0.0033251635488528012, 0.014665090921800294, -0.006436808373347005, 0.017081497948687845, 0.003624610540331323, -0.012529903678214315, -0.009014656386944712, 0.023726617272628602, 0.0051478843665481505, 0.00020473060613042404, -0.003017904896727014, -0.018841725481205066, 0.005030709456839164, -0.03697519373039137, -0.002978846593490685, -0.031392460254478755, -0.009816653546730666, 0.017758508538117546, 0.02018533111253478, -0.009301083944011124]
[检索] 正在进行相似度检索...
[检索] 检索完成,返回Top-5结果
[主流程] 检索结果:
[1] 相似度: 0.7520 类型: question
2024年Java程序员面试的趋势是什么?...
---
[2] 相似度: 0.6724 类型: question
如果在简历中标明熟练掌握Java,应该具备哪些方面的知识?...
---
[3] 相似度: 0.6383 类型: chunk
2024年Java程序员找工作最新面试攻略
这个文档是帮助正在找工作以及准备找工作的同学,在面试之前去复习和突击的一种方
式。
适合已经在技术领域有一定积累,然后不确定面试切入点,所以可以通过这个面试文档
来预热和巩固。
想直接通过刷面试文档找到工作的同学也要注意,面试文档的内容是静态的,但是面试
过程是动态的,
面试官对于某一个领域的考察,通常是通过连环问的方式去问,所以在面试之前,求职
者要对...
---
[4] 相似度: 0.5555 类型: question
在网络编程技能中,哪些是必须了解的关键知识点?...
---
[5] 相似度: 0.5401 类型: chunk
本要求。
对于分布式架构的宣称,则需对CAP原则、微服务架构、弹性设计以及Spring
Cloud、CloudNative等相关技术框架有深刻理解。
关于网络编程的技能,理解TCP/IP协议的三次握手、四次挥手过程,Socket编程
基础,以及select、poll、epoll等I/O多路复用技术,都是必不可少的知识点。
综上所述,你简历上的每一项技术标注,都应当基于你对该技术核心知识点的...
---
[上下文] 正在整理检索结果上下文...
[上下文] 上下文整理完成,共3段
[生成] 正在调用大模型生成最终回答...
[生成] 回答生成成功
[主流程] AI回答:
根据上下文信息,Java程序员面试中常见的技术问题包括但不限于:
- 并发编程、NIO(非阻塞I/O)、JVM(Java虚拟机)等进阶知识。
- Spring、Netty等流行框架的基本原理。
- 分布式架构相关知识点,比如CAP原则、微服务架构、弹性设计以及Spring Cloud、Cloud Native等相关技术框架的理解。
- 网络编程技能,如TCP/IP协议的三次握手和四次挥手过程,Socket编程基础,以及select、poll、epoll等I/O多路复用技术。
这些问题旨在考察求职者对Java及相关技术体系化的理解和掌握程度。
========== RAG主流程结束 ==========
进程已结束,退出代码为 0
附录
完整代码示例
python
# -*- coding: utf-8 -*-
"""
基于阿里大模型(通义千问Qwen+DashScope embedding)的文档增强RAG主流程
支持PDF文本提取、分块、问题生成、向量生成、语义检索、生成回答、自动评测
"""
import os
import numpy as np
from tqdm import tqdm
from PyPDF2 import PdfReader
from dashscope import Generation, TextEmbedding
import json
import re
# 06_doc_augmentation_rag.ipynb
# ========== API key configuration ==========
# The commented-out lines below are a disabled alternative that would import
# the key from test/api_keys.py and fail fast if it is missing.
# try:
# from test.api_keys import ALI_API_KEY
# except ImportError:
# raise RuntimeError("未找到test/api_keys.py或未定义ALI_API_KEY,请配置API密钥!")
# NOTE(review): "sk-xxxx" looks like a placeholder; a real key should not be
# committed to source control -- confirm how the key is meant to be supplied.
ALI_API_KEY="sk-xxxx"
# ==============================================
# Name of the text-generation model (default `model` of generate_questions)
LLM_MODEL = "qwen-max"
# Name of the embedding model (default `model` of create_embedding)
EMBEDDING_MODEL = "text-embedding-v2"
def extract_text_from_pdf(pdf_path):
    """
    Extract the text of every page of a PDF file.

    Progress is reported on stdout, one line per page.

    :param pdf_path: path of the PDF file to read
    :return: text of all pages concatenated in page order, with no separator
             (str); pages whose extraction yields nothing contribute no text
    """
    print(f"[步骤] 正在从PDF提取文本: {pdf_path}")
    page_texts = []
    with open(pdf_path, 'rb') as pdf_file:
        # Pages are extracted while the file handle is still open.
        for page_number, page in enumerate(PdfReader(pdf_file).pages, start=1):
            extracted = page.extract_text()
            if extracted:
                page_texts.append(extracted)
            print(f" - 已提取第{page_number}页")
    text = "".join(page_texts)
    print(f"[完成] PDF文本提取完成,总长度: {len(text)} 字符\n")
    return text
def chunk_text(text, n, overlap):
    """
    Split text into fixed-size chunks where consecutive chunks overlap.

    Chunk starts advance by ``n - overlap`` characters, so the last chunk(s)
    may be shorter than ``n``.

    :param text: source text
    :param n: number of characters per chunk; must be positive
    :param overlap: number of characters shared by consecutive chunks; must be
                    smaller than ``n``
    :return: list of chunk strings (empty list for empty text)
    :raises ValueError: if ``n <= 0`` or ``overlap >= n``
    """
    if n <= 0:
        raise ValueError(f"n must be a positive chunk size, got {n}")
    step = n - overlap
    if step <= 0:
        # A zero step makes range() fail with an unrelated message and a
        # negative step silently produces no chunks at all; reject both.
        raise ValueError(
            f"overlap ({overlap}) must be smaller than the chunk size n ({n})"
        )
    print(f"[步骤] 正在分块: 每块{n}字符,重叠{overlap}字符")
    chunks = [text[start:start + n] for start in range(0, len(text), step)]
    print(f"[完成] 分块完成,共{len(chunks)}块\n")
    return chunks
def generate_questions(text_chunk, num_questions=5, model=LLM_MODEL):
    """
    Ask the generation model for questions that the given chunk can answer.

    The reply is parsed line by line: a leading "1." / "1、" style number is
    stripped and only lines ending in a question mark are kept. Both the ASCII
    '?' and the full-width '？' are accepted.

    :param text_chunk: text the questions should be answerable from
    :param num_questions: number of questions requested in the prompt (the
                          reply is not truncated to this number)
    :param model: name of the generation model
    :return: list of question strings; an empty list if the call does not
             return status 200 or raises
    """
    system_prompt = "你是文本理解专家,请根据给定文本生成可由该文本直接回答的简明问题。只输出问题列表,不要输出其他内容。"
    user_prompt = (
        f"\n请基于以下文本,生成{num_questions}个不同的问题:\n\n{text_chunk}\n\n只输出编号的问题列表。\n"
    )
    try:
        response = Generation.call(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            api_key=ALI_API_KEY,
            result_format='message'
        )
        if response.status_code == 200:
            questions_text = response.output.choices[0].message.content.strip()
            questions = []
            for line in questions_text.split('\n'):
                # Drop the list numbering such as "1." or "1、".
                cleaned = re.sub(r'^\d+[.、]\s*', '', line.strip())
                # Accept both half-width and full-width question marks; the
                # prompt is Chinese, so replies may end with '？'.
                if cleaned and cleaned.endswith(('?', '？')):
                    questions.append(cleaned)
            print(f" - 问题生成成功,共{len(questions)}个")
            return questions
        else:
            print(f" - 问题生成失败: {response.message}")
            return []
    except Exception as e:
        # Best effort: a failed generation must not abort document processing.
        print(f" - 问题生成异常: {e}")
        return []
def create_embedding(text, model=EMBEDDING_MODEL):
    """
    Request an embedding vector for a single piece of text.

    On success the vector dimension and its first 60 components are printed.

    :param text: input text
    :param model: name of the embedding model
    :return: the embedding as np.ndarray, or None if the call does not return
             status 200 or raises
    """
    try:
        reply = TextEmbedding.call(model=model, input=[text], api_key=ALI_API_KEY)
        if reply.status_code != 200:
            print(f" - 嵌入生成失败: {reply.message}")
            return None
        vector = reply.output['embeddings'][0]['embedding']
        print(f" - 嵌入生成成功,向量维度: {len(vector)}")
        # Only a prefix is printed so the log line stays bounded.
        print(f" 嵌入内容(前60维): {vector[:60]}")
        return np.array(vector)
    except Exception as exc:
        print(f" - 嵌入生成异常: {exc}")
        return None
class SimpleVectorStore:
    """
    Minimal in-memory vector store with cosine-similarity search.

    Texts, embeddings and metadata are kept in three parallel lists; the
    entry at position i of each list belongs to the same item.
    """

    def __init__(self):
        # Embeddings as np.ndarray, in insertion order.
        self.vectors = []
        # Text of each item.
        self.texts = []
        # Metadata dict of each item ({} when none was supplied).
        self.metadata = []

    def add_item(self, text, embedding, metadata=None):
        """
        Append one item to the store.

        :param text: text of the item
        :param embedding: embedding vector (any array-like)
        :param metadata: optional dict stored alongside the item
        """
        self.vectors.append(np.array(embedding))
        self.texts.append(text)
        self.metadata.append(metadata or {})

    def similarity_search(self, query_embedding, k=5):
        """
        Return the k stored items most similar to the query by cosine similarity.

        :param query_embedding: query vector (any array-like)
        :param k: maximum number of results
        :return: list of dicts with keys "text", "metadata" and "similarity",
                 ordered by descending similarity; items with equal similarity
                 keep their insertion order. Empty if the store is empty or
                 k <= 0.
        """
        if not self.vectors or k <= 0:
            return []
        query_vector = np.asarray(query_embedding, dtype=float)
        matrix = np.asarray(self.vectors, dtype=float)
        dots = matrix @ query_vector
        denominators = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query_vector)
        # A zero-norm vector has no direction: score it 0.0 instead of dividing
        # by zero, which would yield NaN and make the ranking unreliable.
        similarities = np.divide(
            dots, denominators, out=np.zeros_like(dots), where=denominators > 0
        )
        # Stable sort on the negated scores = descending order, ties by insertion.
        ranked = np.argsort(-similarities, kind="stable")[:k]
        return [
            {
                "text": self.texts[idx],
                "metadata": self.metadata[idx],
                "similarity": float(similarities[idx]),
            }
            for idx in ranked
        ]
def process_document(pdf_path, chunk_size=1000, chunk_overlap=200, questions_per_chunk=3):
    """
    Main ingestion pipeline: extract text, chunk it, generate questions,
    embed everything and build the vector store.

    :param pdf_path: path to the PDF file
    :param chunk_size: chunk size passed to ``chunk_text``
    :param chunk_overlap: chunk overlap passed to ``chunk_text``
    :param questions_per_chunk: number of questions requested per chunk
    :return: tuple ``(text_chunks, vector_store)``
    """
    print("[主流程] 开始处理文档...\n")
    text_chunks = chunk_text(extract_text_from_pdf(pdf_path), chunk_size, chunk_overlap)

    print("[主流程] 初始化向量库...")
    vector_store = SimpleVectorStore()

    print("[主流程] 为每个块生成向量并增强问题...")
    total_chunks = len(text_chunks)
    for chunk_no, chunk in enumerate(tqdm(text_chunks, desc="处理文本块")):
        print(f"\n[块{chunk_no + 1}/{total_chunks}] 正在处理文本块,长度: {len(chunk)} 字符")

        # Store the chunk itself; a failed embedding (None) is simply not stored.
        chunk_vector = create_embedding(chunk)
        if chunk_vector is not None:
            vector_store.add_item(chunk, chunk_vector, {"type": "chunk", "index": chunk_no})

        # Store each generated question, carrying its source chunk in the
        # metadata so the chunk can be recovered from a question hit.
        questions = generate_questions(chunk, num_questions=questions_per_chunk)
        for position, question in enumerate(questions, start=1):
            print(f" - 正在生成问题{position}/{len(questions)}的嵌入...")
            question_vector = create_embedding(question)
            if question_vector is None:
                continue
            question_meta = {"type": "question", "chunk_index": chunk_no, "original_chunk": chunk}
            vector_store.add_item(question, question_vector, question_meta)

    print("[主流程] 文档处理完毕,向量库构建完成\n")
    return text_chunks, vector_store
def semantic_search(query, vector_store, k=5):
    """
    Run a semantic search against the vector store.

    :param query: the query text
    :param vector_store: store exposing ``similarity_search(embedding, k=...)``
    :param k: number of top results requested
    :return: the store's search results, or ``[]`` when the query could not
        be embedded
    """
    print(f"[检索] 正在对问题进行向量化: {query}")
    query_vector = create_embedding(query)

    if query_vector is not None:
        print("[检索] 正在进行相似度检索...")
        hits = vector_store.similarity_search(query_vector, k=k)
        print(f"[检索] 检索完成,返回Top-{k}结果\n")
        return hits

    # Without a query vector there is nothing to compare against.
    print("[检索] 查询向量生成失败,无法检索!")
    return []
def prepare_context(search_results):
    """
    Build the context string for the LLM from search results.

    Direct chunk hits are listed first, followed by chunks reached through a
    matching generated question. Each chunk index appears at most once.
    Results whose metadata lacks the fields needed here (for example items
    stored without metadata) are skipped instead of raising ``KeyError``.

    :param search_results: iterable of dicts with ``"text"`` and ``"metadata"`` keys
    :return: context sections joined by blank lines (``""`` when nothing qualifies)
    """
    print("[上下文] 正在整理检索结果上下文...")
    seen_chunks = set()
    context_chunks = []

    # Pass 1: direct chunk hits. This must finish before question hits are
    # examined so a question never re-adds a chunk that was matched directly,
    # regardless of the order in which the hits are ranked.
    for result in search_results:
        meta = result.get("metadata") or {}
        if meta.get("type") != "chunk" or "index" not in meta:
            continue
        chunk_idx = meta["index"]
        if chunk_idx in seen_chunks:
            continue
        seen_chunks.add(chunk_idx)
        context_chunks.append(f"Chunk {chunk_idx}:\n{result['text']}")

    # Pass 2: chunks reachable only through a generated question.
    for result in search_results:
        meta = result.get("metadata") or {}
        if meta.get("type") != "question":
            continue
        if "chunk_index" not in meta or "original_chunk" not in meta:
            continue
        chunk_idx = meta["chunk_index"]
        if chunk_idx in seen_chunks:
            continue
        seen_chunks.add(chunk_idx)
        context_chunks.append(
            f"Chunk {chunk_idx} (由问题'{result['text']}'关联):\n{meta['original_chunk']}"
        )

    print(f"[上下文] 上下文整理完成,共{len(context_chunks)}段\n")
    return "\n\n".join(context_chunks)
def generate_response(query, context, model=LLM_MODEL):
    """
    Ask the Qwen model to answer a question using only the supplied context.

    :param query: the user's question
    :param context: context text placed in the prompt
    :param model: name of the generation model
    :return: the stripped answer text, or ``""`` when the API reports a
        non-200 status or any exception is raised
    """
    print("[生成] 正在调用大模型生成最终回答...")
    system_prompt = "你是一个AI助手,只能基于给定上下文回答问题。如果上下文无法直接回答,请回复:'信息不足,无法回答。'"
    # Assembled line by line; the leading and trailing newlines are part of
    # the prompt text.
    user_prompt = (
        "\n"
        "上下文:\n"
        f"{context}\n"
        f"问题:{query}\n"
        "请只基于上述上下文简明准确作答。\n"
    )
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    try:
        response = Generation.call(
            model=model,
            messages=conversation,
            api_key=ALI_API_KEY,
            result_format='message',
        )
        if response.status_code != 200:
            print(f"[生成] 回答生成失败: {response.message}")
            return ""
        print("[生成] 回答生成成功\n")
        return response.output.choices[0].message.content.strip()
    except Exception as exc:
        # Best-effort: any failure is logged and reported as an empty answer.
        print(f"[生成] 回答生成异常: {exc}")
        return ""
def main():
    """
    Example entry point: build the vector store from a PDF, take the first
    question of the validation set, retrieve context and print the answer.
    """
    # ---------- paths and parameters ----------
    pdf_path = "data/2888年Java程序员找工作最新场景题.pdf"  # replace with your PDF path
    val_path = "data/java_val.json"  # validation set (JSON)
    chunk_size = 1000
    chunk_overlap = 200
    questions_per_chunk = 3
    top_k = 5

    print("\n========== RAG主流程开始 ==========")
    # Show a partial key only when it is long enough that the first 8 and
    # last 4 characters do not reveal all of it; an unset key (None or empty)
    # must not crash the banner.
    if isinstance(ALI_API_KEY, str) and len(ALI_API_KEY) > 12:
        key_preview = f"{ALI_API_KEY[:8]}...{ALI_API_KEY[-4:]}"
    elif ALI_API_KEY:
        key_preview = "***"
    else:
        key_preview = "未设置"
    print(f"[配置] 使用API密钥: {key_preview}")
    print(f"[配置] PDF路径: {pdf_path}")
    print(f"[配置] 验证集路径: {val_path}")
    print(f"[配置] 块大小: {chunk_size},重叠: {chunk_overlap},每块问题数: {questions_per_chunk},检索TopK: {top_k}\n")

    # ---------- process the document ----------
    _, vector_store = process_document(pdf_path, chunk_size, chunk_overlap, questions_per_chunk)
    print(f"[主流程] 向量库共包含 {len(vector_store.texts)} 条数据\n")

    # ---------- load the validation set and search ----------
    print("[主流程] 正在加载验证集...")
    with open(val_path, encoding='utf-8') as f:
        data = json.load(f)
    if not data:
        # An empty validation set has no first entry; stop with a clear
        # message instead of failing on data[0].
        print("[主流程] 验证集为空,无法选取示例问题")
        return
    query = data[0]['question']
    print(f"[主流程] 示例问题: {query}")
    search_results = semantic_search(query, vector_store, k=top_k)
    print("[主流程] 检索结果:")
    for i, r in enumerate(search_results):
        print(f"[{i+1}] 相似度: {r['similarity']:.4f} 类型: {r['metadata']['type']}\n{r['text'][:200]}...\n---")

    # ---------- generate the answer ----------
    context = prepare_context(search_results)
    answer = generate_response(query, context)
    print(f"[主流程] AI回答:\n{answer}\n")
    print("========== RAG主流程结束 ==========")


if __name__ == "__main__":
    main()