Final comments

This commit is contained in:
administrator 2023-03-31 22:39:43 +08:00
parent 9905f942a1
commit 73472dfd71
4 changed files with 132 additions and 107 deletions

View File

@@ -1,52 +1,76 @@
# Reference article:
# https://github.com/openai/openai-cookbook/blob/main/examples/Question_answering_using_embeddings.ipynb
import openai
import pandas as pd
from gpt_0_basic_info import api_key, excel_file_path, csv_file_path

EMBEDDING_MODEL = "text-embedding-ada-002"

openai.api_key = api_key

"""
We preprocess the document sections by creating an embedding vector for each section. An embedding is a vector of
numbers that helps us understand how semantically similar or different the texts are. The closer two embeddings are
to each other, the more similar their contents are.
"""


def get_embedding(text: str, model: str = EMBEDDING_MODEL) -> list[float]:
    # Compute the embedding vector for the given text
    result = openai.Embedding.create(
        model=model,
        input=text
    )
    return_data_embedding = result["data"][0]["embedding"]
    # The returned data looks like:
    # [-0.008970350958406925, -0.014719498343765736, ... many, many more floats ...]
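    # Note: for the text-embedding-ada-002 model this list contains 1,536 floats.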
    return return_data_embedding


def compute_doc_embeddings(df: pd.DataFrame):
    """
    Create an embedding for each row in the dataframe using the OpenAI Embeddings API.
    Return a dataframe with embeddings.
    """
    # print(df)
    """
       prompt                  ...  QandA
    0  当有人问公司名称          ...  当有人问公司名称, 请回答亁颐堂科技有限责任公司
    1  当有人问亁颐堂是做什么的   ...  当有人问亁颐堂是做什么的, 请回答亁颐堂是一个网络培训公司
    2  当有人问你们公司有多少人   ...  当有人问你们公司有多少人, 请回答亁颐堂有三十多个人
    3  当有人问你们公司有多少个分部 ...  当有人问你们公司有多少个分部, 请回答亁颐堂有北京 上海和南京三个分部
    """
    # Add a new 'embeddings' column holding the vector computed from the 'QandA' column
    df['embeddings'] = df['QandA'].apply(lambda x: get_embedding(x))
    return df


def get_embeddings(openai_api_key, excel_file_path, csv_file_path):
    df = pd.read_excel(excel_file_path)
    # Remove newlines "\n"
    df['prompt'] = df['prompt'].apply(lambda x: x.replace('\n', ''))
    # Prefix each question with "当有人问:"
    df['prompt'] = df['prompt'].apply(lambda x: "当有人问:" + x + '')
    # Prefix each answer with ", 请回答:"
    df['completion'] = df['completion'].apply(lambda x: ", 请回答:" + x)
    # Join the question and the answer into one string
    df['QandA'] = df['prompt'] + df['completion']
    # Keep only the 'QandA' and 'embeddings' columns
    df = compute_doc_embeddings(df)[['QandA', 'embeddings']]
    # print(df)
    """
       QandA                                                      embeddings
    0  当有人问公司名称, 请回答亁颐堂科技有限责任公司                  [-0.009215764701366425, -0.022858258336782455,...
    1  当有人问亁颐堂是做什么的, 请回答亁颐堂是一个网络培训公司       [-0.014166537672281265, -0.01877765916287899, ...
    2  当有人问你们公司有多少人, 请回答亁颐堂有三十多个人           [-0.004638118669390678, -0.011072063818573952,...
    3  当有人问你们公司有多少个分部, 请回答亁颐堂有北京 上海和南京三个分部  [0.0038256149273365736, -0.0033990885131061077...
    """
    df.to_csv(csv_file_path, index=False, encoding='utf-8_sig')


if __name__ == '__main__':
    get_embeddings(api_key, excel_file_path, csv_file_path)
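All three scripts read their shared configuration from gpt_0_basic_info, which is not part of this commit's visible diff. A minimal sketch of what that module is assumed to contain (the names come from the imports above; the values are placeholders):

# gpt_0_basic_info.py -- assumed contents; all values are placeholders
api_key = "sk-..."                       # OpenAI API key
excel_file_path = "qanda.xlsx"           # spreadsheet with 'prompt' and 'completion' columns
csv_file_path = "qanda_embeddings.csv"   # output file for the computed embeddings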

View File

@@ -2,120 +2,121 @@ import numpy as np
import openai
import pandas as pd
import ast
from gpt_0_basic_info import api_key, csv_file_path
from gpt_1_embeddings_training import get_embedding

# Maximum length of the context we are willing to prepend
max_context_len = 1500

EMBEDDING_MODEL = "text-embedding-ada-002"

openai.api_key = api_key


def vector_similarity(x: list[float], y: list[float]):
    """
    Returns the similarity between two vectors.
    Because OpenAI Embeddings are normalized to length 1, the cosine similarity is the same as the dot product.
    """
    return_np_dot_result = np.dot(np.array(x), np.array(y))
    return return_np_dot_result
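# A quick sanity check of the claim above, using made-up unit-length vectors rather than real embeddings:
#   a = np.array([0.6, 0.8]); b = np.array([1.0, 0.0])        # both have length 1
#   np.dot(a, b)                                               -> 0.6
#   np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))     -> 0.6  (the cosine similarity, identical)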
def get_query_similarity(input_query: str, df: pd.DataFrame):
    """
    Find the query embedding for the supplied query, and compare it against all of the pre-calculated document
    embeddings to find the most relevant sections.
    Return the most relevant document section(s), joined into a single context string.
    """
    query_embedding = get_embedding(input_query)
    df['similarities'] = df['embeddings'].apply(lambda x: vector_similarity(query_embedding, x))
    # print(df)
    """
       QandA                                                      ...  similarities
    0  当有人问公司名称, 请回答亁颐堂科技有限责任公司                  ...  0.809908
    1  当有人问亁颐堂是做什么的, 请回答亁颐堂是一个网络培训公司       ...  0.877552
    2  当有人问你们公司有多少人, 请回答亁颐堂有三十多个人           ...  0.808605
    3  当有人问你们公司有多少个分部, 请回答亁颐堂有北京 上海和南京三个分部  ...  0.783896
    """
    # Find the two most similar rows
    two_largest = df['similarities'].nlargest(2).index.tolist()
    # print(two_largest)
    # e.g. [1, 0] -- the row indexes
    # If even the best similarity df.loc[two_largest[0]]['similarities'] is below 0.8, return an empty string.
    # If the second-best similarity df.loc[two_largest[1]]['similarities'] is below 0.8, or the two sections joined
    # together would reach max_context_len (1500), return only df.loc[two_largest[0]]['QandA'].
    # Otherwise return both sections joined together.
    # (An equivalent, more explicit version of this expression is sketched after this file's diff.)
    context = '' if df.loc[two_largest[0]]['similarities'] < 0.8 else df.loc[two_largest[0]]['QandA'] \
        if (df.loc[two_largest[1]]['similarities'] < 0.8 or (len(df.loc[two_largest[1]]['QandA'] + '\n' +
            df.loc[two_largest[0]]['QandA']) >= max_context_len)) \
        else (df.loc[two_largest[1]]['QandA'] + '\n' + df.loc[two_largest[0]]['QandA'])
    return context
def _decorate_query(input_query: str, df: pd.DataFrame) -> str:
    try:
        context = get_query_similarity(input_query, df)
        if context != '':
            header = """请使用上下文尽可能真实、自然地回答问题,如果答案未包含在上下文中,请不要编造回答,并且不要在回答中包含”根据上下文”这个短语。\n\n上下文:\n"""
            input_query = header + context + "\n\n 问题: " + input_query + "\n 回答:"
            """
            请使用上下文尽可能真实、自然地回答问题,如果答案未包含在上下文中,请不要编造回答,并且不要在回答中包含”根据上下文”这个短语。
            上下文:
            当有人问公司名称, 请回答亁颐堂科技有限责任公司
            当有人问亁颐堂是做什么的, 请回答亁颐堂是一个网络培训公司
            问题: 亁颐堂是做什么的
            回答:
            """
        return input_query
    except Exception as e:
        print(e)
        return input_query
def decorate_query(input_query: str, filepath) -> str:
    try:
        df = pd.read_csv(filepath)
        if df.empty:
            return input_query
        else:
            try:
                # Use .apply() to transform every element of the 'embeddings' column.
                # The lambda defines an anonymous function that takes one argument x and returns ast.literal_eval(x).
                # ast.literal_eval() comes from Python's ast (Abstract Syntax Trees) module: it safely parses a string
                # containing a literal expression (a number, list, tuple, dict, etc.) and returns the corresponding
                # Python object. Here it turns the string read from the CSV back into a list of floats.
                # print(df)
                """
                   QandA                                                      embeddings
                0  当有人问公司名称, 请回答亁颐堂科技有限责任公司                  [-0.009181897155940533, -0.022621875628829002,...
                1  当有人问亁颐堂是做什么的, 请回答亁颐堂是一个网络培训公司       [-0.014206966385245323, -0.018791578710079193,...
                2  当有人问你们公司有多少人, 请回答亁颐堂有三十多个人           [-0.004695456940680742, -0.011140977963805199,...
                3  当有人问你们公司有多少个分部, 请回答亁颐堂有北京 上海和南京三个分部  [0.0038718082942068577, -0.003343536052852869,...
                """
                df['embeddings'] = df['embeddings'].apply(lambda x: ast.literal_eval(x))
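                # Hypothetical illustration: ast.literal_eval("[-0.009, 0.022]") returns the list [-0.009, 0.022]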
                return _decorate_query(input_query, df)
            except Exception as e:
                print(e)
                return input_query
    except Exception as e:
        print(e)
        return input_query


if __name__ == '__main__':
    # query = '谁发现了牛顿三大定律'  # An unrelated query is returned unchanged
    query = '亁颐堂是做什么的'  # When related content is found, the context is prepended
    # If the content is related, the decorated query looks like this:
    """
    请使用上下文尽可能真实、自然地回答问题,如果答案未包含在上下文中,请不要编造回答,并且不要在回答中包含”根据上下文”这个短语。
@@ -127,7 +128,7 @@ if __name__ == '__main__':
    问题: 亁颐堂是做什么的
    回答:
    """
    print(decorate_query(query, filepath=csv_file_path))
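The one-line conditional in get_query_similarity packs three cases into a single expression. An equivalent, more explicit sketch of the same 0.8-threshold / max_context_len logic (build_context is a hypothetical name, not part of the commit):

def build_context(df, two_largest):
    # Hypothetical rewrite of the context-selection expression above; the behavior is intended to be identical.
    best, second = df.loc[two_largest[0]], df.loc[two_largest[1]]
    if best['similarities'] < 0.8:
        return ''                                # nothing is similar enough to use as context
    combined = second['QandA'] + '\n' + best['QandA']
    if second['similarities'] < 0.8 or len(combined) >= max_context_len:
        return best['QandA']                     # only the best match qualifies, or both together are too long
    return combined                              # both matches are relevant and fit within the limit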

View File

@@ -4,12 +4,12 @@ from gpt_0_basic_info import api_key, csv_file_path
openai.api_key = api_key


def question(input_query):
    response = openai.ChatCompletion.create(
        model='gpt-3.5-turbo',
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": input_query}
        ],
        max_tokens=100,
        n=1,
@@ -22,15 +22,15 @@ def question(query):
if __name__ == '__main__':
    query = '亁颐堂是做什么的'
    new_query = decorate_query(query, filepath=csv_file_path)
    print(new_query)
    # Produces the following prompt:
    """
    请使用上下文尽可能真实、自然地回答问题,如果答案未包含在上下文中,请不要编造回答,并且不要在回答中包含”根据上下文”这个短语。
    上下文:
    当有人问公司名称, 请回答亁颐堂科技有限责任公司
    当有人问亁颐堂是做什么的, 请回答亁颐堂是一个网络培训公司
    问题: 亁颐堂是做什么的
    回答:
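Taken together, the three files form a small retrieval-augmented Q&A pipeline. A hypothetical end-to-end run (not part of the commit) might look like the sketch below; gpt_1_embeddings_training is a real module name (it is imported in the second file), while decorate_query and question come from the second and third files, whose file names are not visible in this diff view, and question() is assumed to return the model's reply text:

from gpt_0_basic_info import api_key, excel_file_path, csv_file_path
from gpt_1_embeddings_training import get_embeddings

# 1. One-off: compute embeddings for the Q&A spreadsheet and write them to a CSV file
get_embeddings(api_key, excel_file_path, csv_file_path)

# 2. Per user query: prepend the most relevant stored context, then ask gpt-3.5-turbo
new_query = decorate_query('亁颐堂是做什么的', filepath=csv_file_path)   # from the second file
print(question(new_query))                                               # from the third file; assumed to return text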

File diff suppressed because one or more lines are too long