mirror of
https://github.com/collinsctk/chatgpt_embeddings.git
synced 2025-07-19 00:00:05 +08:00
最终注释
This commit is contained in:
parent
9905f942a1
commit
73472dfd71
@ -1,52 +1,76 @@
|
||||
# 参考文章
|
||||
# https://github.com/openai/openai-cookbook/blob/main/examples/Question_answering_using_embeddings.ipynb
|
||||
|
||||
|
||||
import openai
|
||||
import pandas as pd
|
||||
import time
|
||||
from gpt_0_basic_info import api_key, excel_file_path, csv_file_path
|
||||
|
||||
COMPLETIONS_MODEL = "text-davinci-003"
|
||||
EMBEDDING_MODEL = "text-embedding-ada-002"
|
||||
|
||||
openai.api_key = api_key
|
||||
|
||||
def get_embedding(text: str, open_ai_api_key: str, model: str = EMBEDDING_MODEL) -> list[float]:
|
||||
openai.api_key = open_ai_api_key
|
||||
"""
|
||||
We preprocess the document sections by creating an embedding vector for each section. An embedding is a vector of
|
||||
numbers that helps us understand how semantically similar or different the texts are. The closer two embeddings are to
|
||||
each other, the more similar are their contents.
|
||||
|
||||
翻译:
|
||||
我们通过为每个部分创建嵌入向量来预处理文档部分。嵌入是一组数字,帮助我们理解文本的语义相似性或差异。两个嵌入越接近,它们的内容就越相似。
|
||||
"""
|
||||
|
||||
|
||||
def get_embedding(text: str, model: str = EMBEDDING_MODEL) -> list[float]:
|
||||
# 计算嵌入向量
|
||||
result = openai.Embedding.create(
|
||||
model=model,
|
||||
input=text
|
||||
model=model,
|
||||
input=text
|
||||
)
|
||||
return result["data"][0]["embedding"]
|
||||
return_data_embedding = result["data"][0]["embedding"]
|
||||
# 具体数据如下
|
||||
# [-0.008970350958406925, -0.014719498343765736, ~~~~很多很多~~~~]
|
||||
return return_data_embedding
|
||||
|
||||
|
||||
def compute_doc_embeddings(df: pd.DataFrame, open_ai_api_key: str):
|
||||
def compute_doc_embeddings(df: pd.DataFrame):
|
||||
"""
|
||||
Create an embedding for each row in the dataframe using the OpenAI Embeddings API.
|
||||
|
||||
Return a datafram with embedding
|
||||
}
|
||||
Return a dataframe with embedding
|
||||
"""
|
||||
df['embeddings'] = ''
|
||||
df['embeddings'] = df['embeddings'].astype('object')
|
||||
|
||||
for idx, r in df.iterrows():
|
||||
print(idx)
|
||||
df.at[idx, 'embeddings'] = get_embedding(r.QandA, open_ai_api_key)
|
||||
time.sleep(1)
|
||||
|
||||
# print(df)
|
||||
"""
|
||||
prompt ... QandA
|
||||
0 当有人问:公司名称 ... 当有人问:公司名称, 请回答:亁颐堂科技有限责任公司
|
||||
1 当有人问:亁颐堂是做什么的 ... 当有人问:亁颐堂是做什么的, 请回答:亁颐堂是一个网络培训公司
|
||||
2 当有人问:你们公司有多少人 ... 当有人问:你们公司有多少人, 请回答:亁颐堂有三十多个人
|
||||
3 当有人问:你们公司有多少个分部 ... 当有人问:你们公司有多少个分部, 请回答:亁颐堂有北京 上海和南京三个分部
|
||||
"""
|
||||
# 添加新的列'embeddings', 值为'QandA'这个列计算的向量数据
|
||||
df['embeddings'] = df['QandA'].apply(lambda x: get_embedding(x))
|
||||
return df
|
||||
|
||||
|
||||
def getembeddings(api_key, excelfilepath, csvfilepath):
|
||||
df = pd.read_excel(excelfilepath)
|
||||
def get_embeddings(openai_api_key, excel_file_path, csv_file_path):
|
||||
df = pd.read_excel(excel_file_path)
|
||||
# 删除换行"\n"
|
||||
df['prompt'] = df['prompt'].apply(lambda x: x.replace('\n', ''))
|
||||
# 给问题加上"当有人问:"的前缀
|
||||
df['prompt'] = df['prompt'].apply(lambda x: "当有人问:" + x + '')
|
||||
df['completion'] = df['completion'].apply(lambda x: "请回答:" + x)
|
||||
# 给答案加上", 请回答:"的前缀
|
||||
df['completion'] = df['completion'].apply(lambda x: ", 请回答:" + x)
|
||||
# 将问题和答案合并
|
||||
df['QandA'] = df['prompt'] + df['completion']
|
||||
df = compute_doc_embeddings(df, api_key)[['QandA', 'embeddings']]
|
||||
df.to_csv(csvfilepath, index=False, encoding='utf-8_sig')
|
||||
# 只取'QandA'和'embeddings'两列
|
||||
df = compute_doc_embeddings(df)[['QandA', 'embeddings']]
|
||||
# print(df)
|
||||
"""
|
||||
QandA embeddings
|
||||
0 当有人问:公司名称, 请回答:亁颐堂科技有限责任公司 [-0.009215764701366425, -0.022858258336782455,...
|
||||
1 当有人问:亁颐堂是做什么的, 请回答:亁颐堂是一个网络培训公司 [-0.014166537672281265, -0.01877765916287899, ...
|
||||
2 当有人问:你们公司有多少人, 请回答:亁颐堂有三十多个人 [-0.004638118669390678, -0.011072063818573952,...
|
||||
3 当有人问:你们公司有多少个分部, 请回答:亁颐堂有北京 上海和南京三个分部 [0.0038256149273365736, -0.0033990885131061077...
|
||||
"""
|
||||
df.to_csv(csv_file_path, index=False, encoding='utf-8_sig')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
getembeddings(api_key, excel_file_path, csv_file_path)
|
||||
get_embeddings(api_key, excel_file_path, csv_file_path)
|
||||
|
@ -2,120 +2,121 @@ import numpy as np
|
||||
import openai
|
||||
import pandas as pd
|
||||
import ast
|
||||
from gpt_0_basic_info import api_key, excel_file_path, csv_file_path
|
||||
from gpt_0_basic_info import api_key, csv_file_path
|
||||
from gpt_1_embeddings_training import get_embedding
|
||||
# 最大的内容长度
|
||||
max_context_len = 1500
|
||||
|
||||
MAXCONTEXTLEN = 1500
|
||||
|
||||
COMPLETIONS_MODEL = "text-davinci-003"
|
||||
EMBEDDING_MODEL = "text-embedding-ada-002"
|
||||
|
||||
|
||||
def get_embedding(text: str, open_ai_api_key: str, model: str=EMBEDDING_MODEL) -> list[float]:
|
||||
|
||||
openai.api_key = open_ai_api_key
|
||||
result = openai.Embedding.create(
|
||||
model=model,
|
||||
input=text
|
||||
)
|
||||
return result["data"][0]["embedding"]
|
||||
openai.api_key = api_key
|
||||
|
||||
|
||||
def compute_doc_embeddings(df: pd.DataFrame, open_ai_api_key: str) :
|
||||
"""
|
||||
Create an embedding for each row in the dataframe using the OpenAI Embeddings API.
|
||||
|
||||
Return a datafram with embedding
|
||||
}
|
||||
"""
|
||||
df['embeddings'] = ''
|
||||
df['embeddings'] = df['embeddings'].astype('object')
|
||||
|
||||
for idx, r in df.iterrows():
|
||||
df.at[idx, 'embeddings'] = get_embedding(r.content, open_ai_api_key)
|
||||
|
||||
return df
|
||||
|
||||
|
||||
def vector_similarity(x: list[float], y: list[float]) -> float:
|
||||
def vector_similarity(x: list[float], y: list[float]):
|
||||
"""
|
||||
计算并且返回两个向量的相似度
|
||||
Returns the similarity between two vectors.
|
||||
|
||||
Because OpenAI Embeddings are normalized to length 1, the cosine similarity is the same as the dot product.
|
||||
"""
|
||||
return np.dot(np.array(x), np.array(y))
|
||||
return_np_dot_result = np.dot(np.array(x), np.array(y))
|
||||
return return_np_dot_result
|
||||
|
||||
|
||||
def get_query_similarity(query: str, df: pd.DataFrame, open_ai_api_key: str):
|
||||
def get_query_similarity(input_query: str, df: pd.DataFrame):
|
||||
"""
|
||||
Find the query embedding for the supplied query, and compare it against all of the pre-calculated document embeddings
|
||||
to find the most relevant sections.
|
||||
Find the query embedding for the supplied query, and compare it against all of the pre-calculated document
|
||||
embeddings to find the most relevant sections.
|
||||
|
||||
Return the list of document sections, sorted by relevance in descending order.
|
||||
"""
|
||||
openai.api_key = open_ai_api_key
|
||||
|
||||
query_embedding = get_embedding(query, open_ai_api_key)
|
||||
|
||||
#df['similarities'] = 0
|
||||
query_embedding = get_embedding(input_query)
|
||||
|
||||
df['similarities'] = df['embeddings'].apply(lambda x:vector_similarity(query_embedding, x))
|
||||
|
||||
#print(df['similarities'])
|
||||
'''
|
||||
for idx, r in df.iterrows():
|
||||
df.loc[idx, 'similarities'] = vector_similarity(query_embedding, r.embeddings)
|
||||
'''
|
||||
|
||||
df['similarities'] = df['embeddings'].apply(lambda x: vector_similarity(query_embedding, x))
|
||||
# print(df)
|
||||
"""
|
||||
QandA ... similarities
|
||||
0 当有人问:公司名称, 请回答:亁颐堂科技有限责任公司 ... 0.809908
|
||||
1 当有人问:亁颐堂是做什么的, 请回答:亁颐堂是一个网络培训公司 ... 0.877552
|
||||
2 当有人问:你们公司有多少人, 请回答:亁颐堂有三十多个人 ... 0.808605
|
||||
3 当有人问:你们公司有多少个分部, 请回答:亁颐堂有北京 上海和南京三个分部 ... 0.783896
|
||||
"""
|
||||
|
||||
# 找到最相似的两个
|
||||
two_largest = df['similarities'].nlargest(2).index.tolist()
|
||||
|
||||
# print('get_query_similarity!!!!!!!!')
|
||||
# print(two_largest)
|
||||
# [1, 0] 行的索引
|
||||
|
||||
context = '' if df.loc[two_largest[0]]['similarities'] < 0.8 else df.loc[two_largest[0]]['QandA'] if (df.loc[two_largest[1]]['similarities'] < 0.8 or (len(df.loc[two_largest[1]]['QandA'] + '\n' + df.loc[two_largest[0]]['QandA'])>=MAXCONTEXTLEN)) else (df.loc[two_largest[1]]['QandA'] + '\n' + df.loc[two_largest[0]]['QandA'])
|
||||
# print(two_largest[0], df.loc[two_largest[0]]['similarities'], df.loc[two_largest[0]]['QandA'])
|
||||
# print(two_largest[1], df.loc[two_largest[1]]['similarities'], df.loc[two_largest[1]]['QandA'])
|
||||
# print(len(df.loc[two_largest[1]]['QandA'] + '\n' + df.loc[two_largest[0]]['QandA']))
|
||||
# print(context)
|
||||
# 如果最相似的df.loc[two_largest[0]]['similarities']都小于0.8,那么就返回空字符串
|
||||
# 如果第二相似的df.loc[two_largest[1]]['similarities']小于0.8,或者拼接后长度大于等于max_context_len(1500),那么就只返回df.loc[two_largest[0]]['QandA']
|
||||
# 如果第二个相似的df.loc[two_largest[1]]['similarities']大于等于0.8,并且拼接后长度小于max_context_len(1500),那么就返回两个拼接后的字符串
|
||||
context = '' if df.loc[two_largest[0]]['similarities'] < 0.8 else df.loc[two_largest[0]]['QandA'] \
|
||||
if (df.loc[two_largest[1]]['similarities'] < 0.8 or (len(df.loc[two_largest[1]]['QandA'] + '\n' +
|
||||
df.loc[two_largest[0]]['QandA']) >= max_context_len)) \
|
||||
else (df.loc[two_largest[1]]['QandA'] + '\n' + df.loc[two_largest[0]]['QandA'])
|
||||
|
||||
return context
|
||||
|
||||
|
||||
def _decorate_query(query: str, df: pd.DataFrame, open_ai_api_key: str)-> str:
|
||||
|
||||
def _decorate_query(input_query: str, df: pd.DataFrame) -> str:
|
||||
try:
|
||||
context = get_query_similarity(query, df, open_ai_api_key)
|
||||
context = get_query_similarity(input_query, df)
|
||||
if context != '':
|
||||
header = """请使用上下文尽可能真实、自然地回答问题,如果答案未包含在上下文中,请不要编造回答,并且不要在回答中包含”根据上下文”这个短语。\n\n上下文:\n"""
|
||||
#header = "上下文:\n"
|
||||
query = header + context + "\n\n 问题: " + query + "\n 回答:?"
|
||||
# print(query)
|
||||
return query
|
||||
except:
|
||||
# print('ERROR 444444')
|
||||
input_query = header + context + "\n\n 问题: " + input_query + "\n 回答:?"
|
||||
"""
|
||||
请使用上下文尽可能真实、自然地回答问题,如果答案未包含在上下文中,请不要编造回答,并且不要在回答中包含”根据上下文”这个短语。
|
||||
|
||||
return query
|
||||
上下文:
|
||||
当有人问:公司名称, 请回答:亁颐堂科技有限责任公司
|
||||
当有人问:亁颐堂是做什么的, 请回答:亁颐堂是一个网络培训公司
|
||||
|
||||
问题: 亁颐堂是做什么的
|
||||
回答:?
|
||||
"""
|
||||
return input_query
|
||||
except Exception as e:
|
||||
print(e)
|
||||
return input_query
|
||||
|
||||
|
||||
def decorate_query(query: str, open_ai_api_key, filename='foodsembeddings.csv')-> str:
|
||||
filepath = filename
|
||||
def decorate_query(input_query: str, filepath) -> str:
|
||||
try:
|
||||
df = pd.read_csv(filepath)
|
||||
if df.empty:
|
||||
return query
|
||||
return input_query
|
||||
else:
|
||||
try:
|
||||
# 使用.apply()方法对'embeddings'列中的每个元素进行操作。
|
||||
# 用lambda函数定义了一个匿名函数,这个匿名函数接受一个参数x,并将ast.literal_eval(x)的结果返回。
|
||||
# ast.literal_eval(x)是Python中ast模块(Abstract Syntax Trees,抽象语法树)的literal_eval()函数,
|
||||
# 它安全地解析一个字符串形式的字面量表达式(如字符串形式的数字、列表、元组、字典等),并返回该表达式的对应Python对象。
|
||||
# 这里,它将字符串形式的x解析成一个Python对象。
|
||||
|
||||
# print(df)
|
||||
"""
|
||||
QandA embeddings
|
||||
0 当有人问:公司名称, 请回答:亁颐堂科技有限责任公司 [-0.009181897155940533, -0.022621875628829002,...
|
||||
1 当有人问:亁颐堂是做什么的, 请回答:亁颐堂是一个网络培训公司 [-0.014206966385245323, -0.018791578710079193,...
|
||||
2 当有人问:你们公司有多少人, 请回答:亁颐堂有三十多个人 [-0.004695456940680742, -0.011140977963805199,...
|
||||
3 当有人问:你们公司有多少个分部, 请回答:亁颐堂有北京 上海和南京三个分部 [0.0038718082942068577, -0.003343536052852869,...
|
||||
"""
|
||||
df['embeddings'] = df['embeddings'].apply(lambda x: ast.literal_eval(x))
|
||||
return _decorate_query(query, df, open_ai_api_key)
|
||||
|
||||
return _decorate_query(input_query, df)
|
||||
except Exception as e:
|
||||
print(e)
|
||||
return query
|
||||
return input_query
|
||||
except Exception as e:
|
||||
print(e)
|
||||
return query
|
||||
return input_query
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
query = '谁发现了牛顿三大定律' # 不相关的就直接返回问题
|
||||
# query = '亁颐堂是做什么的' # 找到相关内容, 就添加上下文
|
||||
# query = '谁发现了牛顿三大定律' # 不相关的就直接返回问题
|
||||
query = '亁颐堂是做什么的' # 找到相关内容, 就添加上下文
|
||||
# 如果内容相关就添加如下上下文
|
||||
"""
|
||||
请使用上下文尽可能真实、自然地回答问题,如果答案未包含在上下文中,请不要编造回答,并且不要在回答中包含”根据上下文”这个短语。
|
||||
@ -127,7 +128,7 @@ if __name__ == '__main__':
|
||||
问题: 亁颐堂是做什么的
|
||||
回答:?
|
||||
"""
|
||||
print(decorate_query(query, api_key, filename=csv_file_path))\
|
||||
print(decorate_query(query, filepath=csv_file_path))\
|
||||
|
||||
|
||||
|
||||
|
@ -4,12 +4,12 @@ from gpt_0_basic_info import api_key, csv_file_path
|
||||
openai.api_key = api_key
|
||||
|
||||
|
||||
def question(query):
|
||||
def question(input_query):
|
||||
response = openai.ChatCompletion.create(
|
||||
model='gpt-3.5-turbo',
|
||||
messages=[
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "user", "content": query}
|
||||
{"role": "user", "content": input_query}
|
||||
],
|
||||
max_tokens=100,
|
||||
n=1,
|
||||
@ -22,15 +22,15 @@ def question(query):
|
||||
|
||||
if __name__ == '__main__':
|
||||
query = '亁颐堂是做什么的'
|
||||
new_query = decorate_query(query, api_key, filename=csv_file_path)
|
||||
new_query = decorate_query(query, filepath=csv_file_path)
|
||||
print(new_query)
|
||||
# 产生如下的问题:
|
||||
"""
|
||||
请使用上下文尽可能真实、自然地回答问题,如果答案未包含在上下文中,请不要编造回答,并且不要在回答中包含”根据上下文”这个短语。
|
||||
|
||||
|
||||
上下文:
|
||||
当有人问:公司名称请回答:亁颐堂科技有限责任公司
|
||||
当有人问:亁颐堂是做什么的请回答:亁颐堂是一个网络培训公司
|
||||
当有人问:公司名称, 请回答:亁颐堂科技有限责任公司
|
||||
当有人问:亁颐堂是做什么的, 请回答:亁颐堂是一个网络培训公司
|
||||
|
||||
问题: 亁颐堂是做什么的
|
||||
回答:?
|
||||
|
File diff suppressed because one or more lines are too long
Loading…
x
Reference in New Issue
Block a user