上下文学习In-context learning
- 应用大语言模型要从传统机器学习思维切换为上下文学习的思路
- 上下文学习包括 Zero-shot Learning 和 Few-shot Learning,两者并无明显界限,可以根据实际需要灵活运用
- Zero-shot learning 只是描述问题,不是传统的机器学习喂大量的数据
- Few-shot learning 只是给出简短的几个示例,让大语言模型按照例子输出
Zero-shot learning
- 意图识别
python
import openai
def recognize_intent(input):
    """Ask the chat model to name the intent of a user utterance (zero-shot).

    Args:
        input: The raw user utterance; it is sent unchanged as the ``user``
            message. (The name shadows the builtin but is kept so existing
            keyword callers keep working.)

    Returns:
        The text of the first completion choice. The system prompt asks for
        a JSON string with the keys "intention" and "parameters"; the reply
        is returned as-is, without parsing or validation.
    """
    # The prompt used to spell the second key "paramters", which invites the
    # model to echo the misspelling in its JSON output.
    response = openai.ChatCompletion.create(
        engine=deployment,  # deployment name; `deployment` is defined outside this snippet
        temperature=0,
        messages=[
            {"role": "system", "content": """
Recognize the intent from the user's input and format output as JSON string.
The output JSON string includes: "intention", "parameters" """},
            {"role": "user", "content": input},
        ],
    )
    return response.choices[0].message.content
python
# Example: run intent recognition on a flight-booking request (a sample reply is shown on the next line).
print(recognize_intent("订8月19日北京到上海的飞机"))
{"intention": "book_flight", "parameters": {"date": "8月19日", "departure": "北京", "destination": "上海"}}
- 情感分类
python
import openai
def classify_sentiment(input):
    """Classify the sentiment of a piece of text with a zero-shot prompt.

    Args:
        input: The text to classify; sent unchanged as the ``user`` message.

    Returns:
        The text of the first completion choice. The system prompt asks the
        model to sort the text into one of three classes (positive, neutral,
        negative) and output the class.
    """
    system_message = {"role": "system", "content": """
根据给定文字所表达的情感,将文字分为三类:正面 ,中立, 负面。请将用户给定的文字进行分类,并输出分类。
"""}
    user_message = {"role": "user", "content": input}

    completion = openai.ChatCompletion.create(
        engine=deployment,  # deployment name; defined outside this snippet
        messages=[system_message, user_message],
        temperature=0.9,
        max_tokens=200,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
Few-shot Learning
python
import openai
def book_flight(input):
    """Extract flight-booking details from a request using few-shot examples.

    The system prompt carries four worked examples that map a request to a
    JSON object with the keys "date", "dispatch_time", "from" and "to"
    ("dispatch_time" is "unknown" when no time of day is given).

    Args:
        input: The user's booking request; sent unchanged as the ``user``
            message.

    Returns:
        The text of the first completion choice, returned without parsing.
    """
    few_shot_prompt = """
通过用户描述,提取航班预订信息并以JSON格式输出.
以下是一些示例:
示例1
输入: "订8月9日上海到北京的航班,上午出发"
输出:"{"date":"0809","dispatch_time":"6-12","from":"shanghai","to":"beijing"}"
示例2
输入: "订8月9日上海到北京的航班,下午出发"
输出:"{"date":"0809","dispatch_time":"12-18","from":"shanghai","to":"beijing"}"
示例3
输入: "订8月9日上海到北京的航班,晚上出发"
输出:"{"date":"0809","dispatch_time":"18-24","from":"shanghai","to":"beijing"}"
示例4
输入: "订8月9日上海到北京的航班"
输出:"{"date":"0809","dispatch_time":"unknown","from":"shanghai","to":"beijing"}"
"""
    conversation = [
        {"role": "system", "content": few_shot_prompt},
        {"role": "user", "content": input},
    ]
    completion = openai.ChatCompletion.create(
        engine=deployment,  # deployment name; defined outside this snippet
        temperature=0,
        messages=conversation,
    )
    return completion.choices[0].message.content
ReAct Agent
- 学会 AI 思考的过程,提问可以遵循这样的提示(prompt)格式
python
import openai
# Numbered list of tools the agent may choose from; it is interpolated into
# the "Action" line of the system prompt below.
tool = """
1 tool: python_interpreter, description: use it to execute python code
2 tool: web_access, description: use it to get realtime info, input is the question or query
"""
# ReAct-style system prompt: the model alternates Thought / Action /
# Action Input, pauses so the caller can supply an Observation, and ends with
# a Final Answer. (Fixes the "anwser" typo in the first instruction line.)
react_prompt = f"""
Try your best to answer user's question, and use the following format:
Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should use one of tools in the given tool list:
[{tool}]
Action Input: the input to the action
Here, you should pause the process and return to wait the outside observation.
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question
"""
def react_demo(request):
    """Send one request through the ReAct system prompt and return the reply.

    Args:
        request: Text sent as the ``user`` message. It can be a bare question
            or a partial Question/Thought/Action/Observation transcript that
            the model should continue.

    Returns:
        The text of the first completion choice.
    """
    response = openai.ChatCompletion.create(
        engine=deployment,  # deployment name; defined outside this snippet
        temperature=0,
        messages=[
            {"role": "system", "content": react_prompt},
            {"role": "user", "content": request},
        ],
    )
    # Return the text instead of printing it: the callers wrap this function
    # in print(), so printing here and returning None emitted the reply
    # followed by a stray "None". This also matches the other helpers in
    # this file, which all return the reply.
    return response.choices[0].message.content
python
# First ReAct turn: pose the question. The system prompt tells the model to pause after "Action Input" and wait for an outside observation.
print(react_demo("今天北京适合穿什么?"))
python
# Second ReAct turn. At this point the web_access tool would be run to get
# today's weather in Beijing; here a hard-coded observation stands in for
# the tool's result.
weather = "The weather today in Beijing is 33°C, with 0% precipitation, 31% humidity, and a wind speed of 4 mph."
# Replay the Question/Thought/Action/Action Input of the first turn and append
# the observation, so the model can continue from it.
next_prompt = """
Question: 今天北京适合穿什么?
Thought: I need to check the current weather in Beijing to give a proper clothing suggestion.
Action: web_access
Action Input: 今天北京天气如何?
Observation: """ + weather
print(react_demo(next_prompt))
文本向量化embedding
python
from langchain.embeddings.openai import OpenAIEmbeddings
# To use OpenAI's GPT service directly instead of Azure:
# embedding = OpenAIEmbeddings()
# `deployment` is the name of your embedding model deployment in Azure.
embedding = OpenAIEmbeddings(deployment=embedding_deployment)

# Three sample sentences to embed and compare.
sentence1 = "我是一名软件工程师"
sentence2 = "小张从事法律工作"
sentence3 = "我是一名程序员"

# Embed the sentences in order; each result is bound to the matching name.
embedding1, embedding2, embedding3 = [
    embedding.embed_query(sentence)
    for sentence in (sentence1, sentence2, sentence3)
]
- 用点积(dot product)衡量向量之间的相似度
python
import numpy as np
# Print the dot product of each sentence pair, one per line, in the order
# (1, 2), (2, 3), (1, 3).
print(
    *(
        np.dot(left, right)
        for left, right in (
            (embedding1, embedding2),
            (embedding2, embedding3),
            (embedding1, embedding3),
        )
    ),
    sep="\n",
)
0.7987662070931474 0.7992461569419606 0.9592105615669977
处理大文本
思路
-
- 把文章切割成若干部分,分别向量化
-
- 再用用户给定的关键词,通过 numpy 的点积 dot 计算相似度,查找与关键词相似的段落
-
- 再把段落交给gpt,总结出通顺的话
做法
- 向量化
python
from langchain.embeddings.openai import OpenAIEmbeddings
def embed(chunks):
    """Embed each text chunk and return the vectors in the same order.

    Args:
        chunks: Iterable of text chunks.

    Returns:
        A list with one ``embed_query`` result per chunk.
    """
    # To use OpenAI's GPT service directly: embedder = OpenAIEmbeddings()
    embedder = OpenAIEmbeddings(deployment=embedding_deployment)
    vectors = []
    for piece in chunks:
        vectors.append(embedder.embed_query(piece))
    return vectors
# Split the source text into chunks, then embed every chunk.
# NOTE(review): split_file_into_chunks is defined outside this section; the
# second argument is presumably the chunk size — confirm.
chunks = split_file_into_chunks("spotmax_intro.txt",100)
embeddeds = embed(chunks)
- 向量检索
python
def find_k_largest_indices(input_list, k, *, verbose=False):
    """Return the indices of the ``k`` largest values, largest first.

    Args:
        input_list: Sequence of mutually comparable values (for example
            similarity scores).
        k: Number of indices to return. When ``k`` exceeds the length of
            ``input_list`` every index is returned.
        verbose: When true, print the input and the full ranking. Off by
            default so that a search does not dump the whole score list on
            every call.

    Returns:
        A list of indices into ``input_list`` ordered by descending value.
        Equal values keep their original relative order, because ``sorted``
        is stable.
    """
    sorted_indices = sorted(
        range(len(input_list)), key=lambda i: input_list[i], reverse=True
    )
    if verbose:
        print(input_list)
        print(sorted_indices)
    return sorted_indices[:k]
def search(chunks,embeddeds,top_k, txt):
    """Return the ``top_k`` chunks whose embeddings best match ``txt``.

    Args:
        chunks: Text chunks, in the same order as ``embeddeds``.
        embeddeds: One embedding vector per chunk.
        top_k: Number of chunks to return.
        txt: Query text to embed and compare against the chunks.

    Returns:
        A list of the matching chunks, highest dot product first.
    """
    # Embed the query with the same deployment that embed() uses for the
    # chunks; a hard-coded deployment name here could select a different
    # model, which would make the dot products meaningless.
    embedding = OpenAIEmbeddings(deployment=embedding_deployment, chunk_size=1)
    embedded_txt = embedding.embed_query(txt)
    # A larger dot product means a closer match, so these are similarity
    # scores rather than distances.
    scores = [np.dot(embedded_txt, embedded) for embedded in embeddeds]
    best_indices = find_k_largest_indices(scores, top_k)
    return [chunks[i] for i in best_indices]
- 使用
python
# Example: fetch the 2 chunks that best match the query.
search(chunks,embeddeds,2, "提高系统可用性")
- gpt总结
python
import openai
import numpy as np
def anwser_question_with_doc(question, chunks, embeddeds):
    """Answer a question from the two chunks that best match it.

    The two best-matching chunks are retrieved with ``search``, each is
    appended to the instruction text wrapped in single quotes on its own
    line, and the result is sent as the system message with the question as
    the user message.

    Args:
        question: The user's question.
        chunks: Text chunks of the source document.
        embeddeds: One embedding vector per chunk, in the same order.

    Returns:
        The text of the first completion choice.
    """
    instructions = """
仅通过总结以下的文字片段回答用户问题, 注意保持回答的语言通顺(字数在30字以内)
---
"""
    top_chunks = search(chunks, embeddeds, 2, question)
    quoted_chunks = "".join("\n'" + piece + "'" for piece in top_chunks)
    system_prompt = instructions + quoted_chunks

    completion = openai.ChatCompletion.create(
        engine=deployment,  # deployment name; defined outside this snippet
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": question},
        ],
        temperature=0.9,
        max_tokens=200,
    )
    return completion.choices[0].message.content
- 最终使用
python
# End-to-end example: retrieve the matching chunks and let the model summarize an answer.
print(anwser_question_with_doc("如何提高可用性", chunks,embeddeds))
用 LangChain 连接内部数据和大模型
向量数据库 Chroma
python
!pip install chromadb
- 加载文档 (Document Loader)
python
from langchain.document_loaders import DirectoryLoader
# Load the files in the current directory that match the glob pattern.
loader = DirectoryLoader('./', glob="spotmax_intro.txt")
docs = loader.load()
python
pip install pypdf
python
from langchain.document_loaders import PyPDFLoader
# Alternative source: load a PDF instead. Note that this rebinds `loader`
# and `docs`, replacing whatever was loaded before.
loader = PyPDFLoader("Spotmax_intro_cn_2020.pdf")
docs = loader.load()
- 切割文档
- 这个文本分割器是通用文本的推荐分割器。它由一个字符列表参数化。它会按顺序尝试在这些字符上进行分割,直到分块足够小。默认列表是["\n\n", "\n", " ", ""]。这样做的效果是尽可能地保持所有段落(然后是句子,然后是单词)在一起。
python
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Configure the splitter with a chunk size of 200 and an overlap of 10
# between neighbouring chunks.
# NOTE(review): sizes are presumably counted in characters (the library's
# default length function) — confirm.
text_splitter = RecursiveCharacterTextSplitter(
chunk_size = 200,
chunk_overlap = 10
)
python
# Split the loaded documents into chunks.
splits = text_splitter.split_documents(docs)
# Directory where the vector store is persisted.
persist_directory = 'data/'
# Notebook shell command: delete any previously persisted data for a clean start.
!rm -rf ./data
- 将数据存入向量数据库
python
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
# To call OpenAI's GPT service directly instead of Azure:
# embedding = OpenAIEmbeddings()
embedding = OpenAIEmbeddings(deployment=embedding_deployment) # via the Azure OpenAI GPT service
# Embed the chunks and store them in the "spotmax" collection under
# persist_directory.
vectordb = Chroma.from_documents(
documents=splits,
collection_name="spotmax",
embedding=embedding,
persist_directory=persist_directory
)
vectordb.persist()
# Sanity check: print how many entries the collection holds (reads a
# private attribute of the store).
print(vectordb._collection.count())
python
# Retrieve the 2 stored chunks most similar to the query; this rebinds `docs`.
docs = vectordb.similarity_search("如何提高可用性",k=2)
利用向量数据库进行QA
python
from langchain.chains import RetrievalQA
from langchain.chat_models import AzureChatOpenAI
# To access OpenAI's GPT service directly instead of Azure:
# from langchain.chat_models import ChatOpenAI
#llm = ChatOpenAI(model_name="gpt-4", temperature=0)
llm = AzureChatOpenAI(deployment_name = deployment, model_name=model, temperature=0.7, max_tokens=1000) # via the Azure OpenAI service
# Retriever over the vector store that returns 3 chunks per query.
# NOTE(review): "mmr" is presumably maximal marginal relevance; the
# commented-out line is the plain similarity alternative.
retriever=vectordb.as_retriever(
search_type="mmr", search_kwargs={"k": 3}
#search_type="similarity", search_kwargs={"k": 3}
)
# Question-answering chain that feeds the retrieved chunks to the LLM using
# the "refine" chain type; source documents are not returned.
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="refine", retriever=retriever,
return_source_documents=False, verbose=True)
python
# Ask a question through the chain; the answer text is under the "result" key.
result = qa({"query": "什么是SpotMax?"})
print(result["result"])
从文本到图片
python
import os
import http.client
import requests
import time
def create_image(prompt):
    """Generate an image from a text prompt and return a Markdown image link.

    Reads the endpoint and key from the OPENAI_API_BASE and OPENAI_API_KEY
    environment variables, submits a text-to-image job, then polls the
    operation URL from the ``Operation-Location`` response header until the
    job reports "Succeeded".

    Args:
        prompt: Caption describing the image to generate.

    Returns:
        A string of the form ``"\\n![image](<url>)"`` where ``<url>`` is the
        ``result.contentUrl`` of the finished job.

    Raises:
        RuntimeError: If either environment variable is unset, or the job
            reports a terminal failure status.
        requests.HTTPError: If the submission or a poll returns an HTTP
            error status.
    """
    api_base = os.getenv("OPENAI_API_BASE")
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_base or not api_key:
        raise RuntimeError("OPENAI_API_BASE and OPENAI_API_KEY must be set")

    api_version = '2022-08-03-preview'
    # Tolerate a base URL with or without a trailing slash.
    url = "{}/dalle/text-to-image?api-version={}".format(api_base.rstrip("/"), api_version)
    headers = {"api-key": api_key, "Content-Type": "application/json"}
    body = {
        "caption": prompt,
        "resolution": "512x512",
    }

    submission = requests.post(url, headers=headers, json=body)
    submission.raise_for_status()
    operation_location = submission.headers['Operation-Location']
    # Fall back to a 1-second poll interval if the header is missing.
    retry_after = int(submission.headers.get('Retry-After', 1))

    while True:
        time.sleep(retry_after)
        response = requests.get(operation_location, headers=headers)
        response.raise_for_status()
        payload = response.json()
        status = payload['status']
        if status == "Succeeded":
            break
        # Stop on a terminal failure instead of polling forever.
        # NOTE(review): the exact failure status names of this preview API
        # are assumed — confirm against the service documentation.
        if str(status).lower() in ("failed", "canceled", "cancelled", "deleted"):
            raise RuntimeError("Image generation ended with status: {}".format(status))

    image_url = payload['result']['contentUrl']
    # In a notebook the image could instead be shown with
    # display(Image(url=image_url)).
    return "\n![image]("+image_url+")"
python
# Example: generate an image and print the resulting Markdown image link.
print (create_image("A dog on the street."))