mirror of
https://github.com/collinsctk/chatgpt_embeddings.git
synced 2025-07-19 00:00:05 +08:00
最终注释
This commit is contained in:
parent
9905f942a1
commit
73472dfd71
@ -1,52 +1,76 @@
|
|||||||
# 参考文章
|
# 参考文章
|
||||||
# https://github.com/openai/openai-cookbook/blob/main/examples/Question_answering_using_embeddings.ipynb
|
# https://github.com/openai/openai-cookbook/blob/main/examples/Question_answering_using_embeddings.ipynb
|
||||||
|
|
||||||
|
|
||||||
import openai
|
import openai
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import time
|
|
||||||
from gpt_0_basic_info import api_key, excel_file_path, csv_file_path
|
from gpt_0_basic_info import api_key, excel_file_path, csv_file_path
|
||||||
|
|
||||||
COMPLETIONS_MODEL = "text-davinci-003"
|
|
||||||
EMBEDDING_MODEL = "text-embedding-ada-002"
|
EMBEDDING_MODEL = "text-embedding-ada-002"
|
||||||
|
|
||||||
|
openai.api_key = api_key
|
||||||
|
|
||||||
def get_embedding(text: str, open_ai_api_key: str, model: str = EMBEDDING_MODEL) -> list[float]:
|
"""
|
||||||
openai.api_key = open_ai_api_key
|
We preprocess the document sections by creating an embedding vector for each section. An embedding is a vector of
|
||||||
|
numbers that helps us understand how semantically similar or different the texts are. The closer two embeddings are to
|
||||||
|
each other, the more similar are their contents.
|
||||||
|
|
||||||
|
翻译:
|
||||||
|
我们通过为每个部分创建嵌入向量来预处理文档部分。嵌入是一组数字,帮助我们理解文本的语义相似性或差异。两个嵌入越接近,它们的内容就越相似。
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def get_embedding(text: str, model: str = EMBEDDING_MODEL) -> list[float]:
|
||||||
|
# 计算嵌入向量
|
||||||
result = openai.Embedding.create(
|
result = openai.Embedding.create(
|
||||||
model=model,
|
model=model,
|
||||||
input=text
|
input=text
|
||||||
)
|
)
|
||||||
return result["data"][0]["embedding"]
|
return_data_embedding = result["data"][0]["embedding"]
|
||||||
|
# 具体数据如下
|
||||||
|
# [-0.008970350958406925, -0.014719498343765736, ~~~~很多很多~~~~]
|
||||||
|
return return_data_embedding
|
||||||
|
|
||||||
|
|
||||||
def compute_doc_embeddings(df: pd.DataFrame, open_ai_api_key: str):
|
def compute_doc_embeddings(df: pd.DataFrame):
|
||||||
"""
|
"""
|
||||||
Create an embedding for each row in the dataframe using the OpenAI Embeddings API.
|
Create an embedding for each row in the dataframe using the OpenAI Embeddings API.
|
||||||
|
|
||||||
Return a datafram with embedding
|
Return a dataframe with embedding
|
||||||
}
|
|
||||||
"""
|
"""
|
||||||
df['embeddings'] = ''
|
# print(df)
|
||||||
df['embeddings'] = df['embeddings'].astype('object')
|
"""
|
||||||
|
prompt ... QandA
|
||||||
for idx, r in df.iterrows():
|
0 当有人问:公司名称 ... 当有人问:公司名称, 请回答:亁颐堂科技有限责任公司
|
||||||
print(idx)
|
1 当有人问:亁颐堂是做什么的 ... 当有人问:亁颐堂是做什么的, 请回答:亁颐堂是一个网络培训公司
|
||||||
df.at[idx, 'embeddings'] = get_embedding(r.QandA, open_ai_api_key)
|
2 当有人问:你们公司有多少人 ... 当有人问:你们公司有多少人, 请回答:亁颐堂有三十多个人
|
||||||
time.sleep(1)
|
3 当有人问:你们公司有多少个分部 ... 当有人问:你们公司有多少个分部, 请回答:亁颐堂有北京 上海和南京三个分部
|
||||||
|
"""
|
||||||
|
# 添加新的列'embeddings', 值为'QandA'这个列计算的向量数据
|
||||||
|
df['embeddings'] = df['QandA'].apply(lambda x: get_embedding(x))
|
||||||
return df
|
return df
|
||||||
|
|
||||||
|
|
||||||
def getembeddings(api_key, excelfilepath, csvfilepath):
|
def get_embeddings(openai_api_key, excel_file_path, csv_file_path):
|
||||||
df = pd.read_excel(excelfilepath)
|
df = pd.read_excel(excel_file_path)
|
||||||
|
# 删除换行"\n"
|
||||||
df['prompt'] = df['prompt'].apply(lambda x: x.replace('\n', ''))
|
df['prompt'] = df['prompt'].apply(lambda x: x.replace('\n', ''))
|
||||||
|
# 给问题加上"当有人问:"的前缀
|
||||||
df['prompt'] = df['prompt'].apply(lambda x: "当有人问:" + x + '')
|
df['prompt'] = df['prompt'].apply(lambda x: "当有人问:" + x + '')
|
||||||
df['completion'] = df['completion'].apply(lambda x: "请回答:" + x)
|
# 给答案加上", 请回答:"的前缀
|
||||||
|
df['completion'] = df['completion'].apply(lambda x: ", 请回答:" + x)
|
||||||
|
# 将问题和答案合并
|
||||||
df['QandA'] = df['prompt'] + df['completion']
|
df['QandA'] = df['prompt'] + df['completion']
|
||||||
df = compute_doc_embeddings(df, api_key)[['QandA', 'embeddings']]
|
# 只取'QandA'和'embeddings'两列
|
||||||
df.to_csv(csvfilepath, index=False, encoding='utf-8_sig')
|
df = compute_doc_embeddings(df)[['QandA', 'embeddings']]
|
||||||
|
# print(df)
|
||||||
|
"""
|
||||||
|
QandA embeddings
|
||||||
|
0 当有人问:公司名称, 请回答:亁颐堂科技有限责任公司 [-0.009215764701366425, -0.022858258336782455,...
|
||||||
|
1 当有人问:亁颐堂是做什么的, 请回答:亁颐堂是一个网络培训公司 [-0.014166537672281265, -0.01877765916287899, ...
|
||||||
|
2 当有人问:你们公司有多少人, 请回答:亁颐堂有三十多个人 [-0.004638118669390678, -0.011072063818573952,...
|
||||||
|
3 当有人问:你们公司有多少个分部, 请回答:亁颐堂有北京 上海和南京三个分部 [0.0038256149273365736, -0.0033990885131061077...
|
||||||
|
"""
|
||||||
|
df.to_csv(csv_file_path, index=False, encoding='utf-8_sig')
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
getembeddings(api_key, excel_file_path, csv_file_path)
|
get_embeddings(api_key, excel_file_path, csv_file_path)
|
||||||
|
@ -2,120 +2,121 @@ import numpy as np
|
|||||||
import openai
|
import openai
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import ast
|
import ast
|
||||||
from gpt_0_basic_info import api_key, excel_file_path, csv_file_path
|
from gpt_0_basic_info import api_key, csv_file_path
|
||||||
|
from gpt_1_embeddings_training import get_embedding
|
||||||
|
# 最大的内容长度
|
||||||
|
max_context_len = 1500
|
||||||
|
|
||||||
MAXCONTEXTLEN = 1500
|
|
||||||
|
|
||||||
COMPLETIONS_MODEL = "text-davinci-003"
|
|
||||||
EMBEDDING_MODEL = "text-embedding-ada-002"
|
EMBEDDING_MODEL = "text-embedding-ada-002"
|
||||||
|
|
||||||
|
openai.api_key = api_key
|
||||||
def get_embedding(text: str, open_ai_api_key: str, model: str=EMBEDDING_MODEL) -> list[float]:
|
|
||||||
|
|
||||||
openai.api_key = open_ai_api_key
|
|
||||||
result = openai.Embedding.create(
|
|
||||||
model=model,
|
|
||||||
input=text
|
|
||||||
)
|
|
||||||
return result["data"][0]["embedding"]
|
|
||||||
|
|
||||||
|
|
||||||
def compute_doc_embeddings(df: pd.DataFrame, open_ai_api_key: str) :
|
def vector_similarity(x: list[float], y: list[float]):
|
||||||
"""
|
|
||||||
Create an embedding for each row in the dataframe using the OpenAI Embeddings API.
|
|
||||||
|
|
||||||
Return a datafram with embedding
|
|
||||||
}
|
|
||||||
"""
|
|
||||||
df['embeddings'] = ''
|
|
||||||
df['embeddings'] = df['embeddings'].astype('object')
|
|
||||||
|
|
||||||
for idx, r in df.iterrows():
|
|
||||||
df.at[idx, 'embeddings'] = get_embedding(r.content, open_ai_api_key)
|
|
||||||
|
|
||||||
return df
|
|
||||||
|
|
||||||
|
|
||||||
def vector_similarity(x: list[float], y: list[float]) -> float:
|
|
||||||
"""
|
"""
|
||||||
|
计算并且返回两个向量的相似度
|
||||||
Returns the similarity between two vectors.
|
Returns the similarity between two vectors.
|
||||||
|
|
||||||
Because OpenAI Embeddings are normalized to length 1, the cosine similarity is the same as the dot product.
|
Because OpenAI Embeddings are normalized to length 1, the cosine similarity is the same as the dot product.
|
||||||
"""
|
"""
|
||||||
return np.dot(np.array(x), np.array(y))
|
return_np_dot_result = np.dot(np.array(x), np.array(y))
|
||||||
|
return return_np_dot_result
|
||||||
|
|
||||||
|
|
||||||
def get_query_similarity(query: str, df: pd.DataFrame, open_ai_api_key: str):
|
def get_query_similarity(input_query: str, df: pd.DataFrame):
|
||||||
"""
|
"""
|
||||||
Find the query embedding for the supplied query, and compare it against all of the pre-calculated document embeddings
|
Find the query embedding for the supplied query, and compare it against all of the pre-calculated document
|
||||||
to find the most relevant sections.
|
embeddings to find the most relevant sections.
|
||||||
|
|
||||||
Return the list of document sections, sorted by relevance in descending order.
|
Return the list of document sections, sorted by relevance in descending order.
|
||||||
"""
|
"""
|
||||||
openai.api_key = open_ai_api_key
|
|
||||||
|
|
||||||
query_embedding = get_embedding(query, open_ai_api_key)
|
query_embedding = get_embedding(input_query)
|
||||||
|
|
||||||
#df['similarities'] = 0
|
df['similarities'] = df['embeddings'].apply(lambda x: vector_similarity(query_embedding, x))
|
||||||
|
# print(df)
|
||||||
df['similarities'] = df['embeddings'].apply(lambda x:vector_similarity(query_embedding, x))
|
"""
|
||||||
|
QandA ... similarities
|
||||||
#print(df['similarities'])
|
0 当有人问:公司名称, 请回答:亁颐堂科技有限责任公司 ... 0.809908
|
||||||
'''
|
1 当有人问:亁颐堂是做什么的, 请回答:亁颐堂是一个网络培训公司 ... 0.877552
|
||||||
for idx, r in df.iterrows():
|
2 当有人问:你们公司有多少人, 请回答:亁颐堂有三十多个人 ... 0.808605
|
||||||
df.loc[idx, 'similarities'] = vector_similarity(query_embedding, r.embeddings)
|
3 当有人问:你们公司有多少个分部, 请回答:亁颐堂有北京 上海和南京三个分部 ... 0.783896
|
||||||
'''
|
"""
|
||||||
|
|
||||||
|
# 找到最相似的两个
|
||||||
two_largest = df['similarities'].nlargest(2).index.tolist()
|
two_largest = df['similarities'].nlargest(2).index.tolist()
|
||||||
|
|
||||||
# print('get_query_similarity!!!!!!!!')
|
# print(two_largest)
|
||||||
|
# [1, 0] 行的索引
|
||||||
|
|
||||||
context = '' if df.loc[two_largest[0]]['similarities'] < 0.8 else df.loc[two_largest[0]]['QandA'] if (df.loc[two_largest[1]]['similarities'] < 0.8 or (len(df.loc[two_largest[1]]['QandA'] + '\n' + df.loc[two_largest[0]]['QandA'])>=MAXCONTEXTLEN)) else (df.loc[two_largest[1]]['QandA'] + '\n' + df.loc[two_largest[0]]['QandA'])
|
# 如果最相似的df.loc[two_largest[0]]['similarities']都小于0.8,那么就返回空字符串
|
||||||
# print(two_largest[0], df.loc[two_largest[0]]['similarities'], df.loc[two_largest[0]]['QandA'])
|
# 如果第二相似的df.loc[two_largest[1]]['similarities']小于0.8,或者拼接后长度大于等于1500(max_context_len),那么就只返回df.loc[two_largest[0]]['QandA']
|
||||||
# print(two_largest[1], df.loc[two_largest[1]]['similarities'], df.loc[two_largest[1]]['QandA'])
|
# 如果第二个相似的df.loc[two_largest[1]]['similarities']大于等于0.8,并且拼接后长度小于1500(max_context_len),那么就返回两个拼接后的字符串
|
||||||
# print(len(df.loc[two_largest[1]]['QandA'] + '\n' + df.loc[two_largest[0]]['QandA']))
|
context = '' if df.loc[two_largest[0]]['similarities'] < 0.8 else df.loc[two_largest[0]]['QandA'] \
|
||||||
# print(context)
|
if (df.loc[two_largest[1]]['similarities'] < 0.8 or (len(df.loc[two_largest[1]]['QandA'] + '\n' +
|
||||||
|
df.loc[two_largest[0]]['QandA']) >= max_context_len)) \
|
||||||
|
else (df.loc[two_largest[1]]['QandA'] + '\n' + df.loc[two_largest[0]]['QandA'])
|
||||||
|
|
||||||
return context
|
return context
|
||||||
|
|
||||||
|
|
||||||
def _decorate_query(query: str, df: pd.DataFrame, open_ai_api_key: str)-> str:
|
def _decorate_query(input_query: str, df: pd.DataFrame) -> str:
|
||||||
|
|
||||||
try:
|
try:
|
||||||
context = get_query_similarity(query, df, open_ai_api_key)
|
context = get_query_similarity(input_query, df)
|
||||||
if context != '':
|
if context != '':
|
||||||
header = """请使用上下文尽可能真实、自然地回答问题,如果答案未包含在上下文中,请不要编造回答,并且不要在回答中包含”根据上下文”这个短语。\n\n上下文:\n"""
|
header = """请使用上下文尽可能真实、自然地回答问题,如果答案未包含在上下文中,请不要编造回答,并且不要在回答中包含”根据上下文”这个短语。\n\n上下文:\n"""
|
||||||
#header = "上下文:\n"
|
input_query = header + context + "\n\n 问题: " + input_query + "\n 回答:?"
|
||||||
query = header + context + "\n\n 问题: " + query + "\n 回答:?"
|
"""
|
||||||
# print(query)
|
请使用上下文尽可能真实、自然地回答问题,如果答案未包含在上下文中,请不要编造回答,并且不要在回答中包含”根据上下文”这个短语。
|
||||||
return query
|
|
||||||
except:
|
|
||||||
# print('ERROR 444444')
|
|
||||||
|
|
||||||
return query
|
上下文:
|
||||||
|
当有人问:公司名称, 请回答:亁颐堂科技有限责任公司
|
||||||
|
当有人问:亁颐堂是做什么的, 请回答:亁颐堂是一个网络培训公司
|
||||||
|
|
||||||
|
问题: 亁颐堂是做什么的
|
||||||
|
回答:?
|
||||||
|
"""
|
||||||
|
return input_query
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
return input_query
|
||||||
|
|
||||||
|
|
||||||
def decorate_query(query: str, open_ai_api_key, filename='foodsembeddings.csv')-> str:
|
def decorate_query(input_query: str, filepath) -> str:
|
||||||
filepath = filename
|
|
||||||
try:
|
try:
|
||||||
df = pd.read_csv(filepath)
|
df = pd.read_csv(filepath)
|
||||||
if df.empty:
|
if df.empty:
|
||||||
return query
|
return input_query
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
|
# 使用.apply()方法对'embeddings'列中的每个元素进行操作。
|
||||||
|
# 用lambda函数定义了一个匿名函数,这个匿名函数接受一个参数x,并将ast.literal_eval(x)的结果返回。
|
||||||
|
# ast.literal_eval(x)是Python中ast模块(Abstract Syntax Trees,抽象语法树)的literal_eval()函数,
|
||||||
|
# 它安全地解析一个字符串形式的字面量表达式(如字符串形式的数字、列表、元组、字典等),并返回该表达式的对应Python对象。
|
||||||
|
# 这里,它将字符串形式的x解析成一个Python对象。
|
||||||
|
|
||||||
|
# print(df)
|
||||||
|
"""
|
||||||
|
QandA embeddings
|
||||||
|
0 当有人问:公司名称, 请回答:亁颐堂科技有限责任公司 [-0.009181897155940533, -0.022621875628829002,...
|
||||||
|
1 当有人问:亁颐堂是做什么的, 请回答:亁颐堂是一个网络培训公司 [-0.014206966385245323, -0.018791578710079193,...
|
||||||
|
2 当有人问:你们公司有多少人, 请回答:亁颐堂有三十多个人 [-0.004695456940680742, -0.011140977963805199,...
|
||||||
|
3 当有人问:你们公司有多少个分部, 请回答:亁颐堂有北京 上海和南京三个分部 [0.0038718082942068577, -0.003343536052852869,...
|
||||||
|
"""
|
||||||
df['embeddings'] = df['embeddings'].apply(lambda x: ast.literal_eval(x))
|
df['embeddings'] = df['embeddings'].apply(lambda x: ast.literal_eval(x))
|
||||||
return _decorate_query(query, df, open_ai_api_key)
|
|
||||||
|
return _decorate_query(input_query, df)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(e)
|
print(e)
|
||||||
return query
|
return input_query
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(e)
|
print(e)
|
||||||
return query
|
return input_query
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
query = '谁发现了牛顿三大定律' # 不相关的就直接返回问题
|
# query = '谁发现了牛顿三大定律' # 不相关的就直接返回问题
|
||||||
# query = '亁颐堂是做什么的' # 找到相关内容, 就添加上下文
|
query = '亁颐堂是做什么的' # 找到相关内容, 就添加上下文
|
||||||
# 如果内容相关就添加如下上下文
|
# 如果内容相关就添加如下上下文
|
||||||
"""
|
"""
|
||||||
请使用上下文尽可能真实、自然地回答问题,如果答案未包含在上下文中,请不要编造回答,并且不要在回答中包含”根据上下文”这个短语。
|
请使用上下文尽可能真实、自然地回答问题,如果答案未包含在上下文中,请不要编造回答,并且不要在回答中包含”根据上下文”这个短语。
|
||||||
@ -127,7 +128,7 @@ if __name__ == '__main__':
|
|||||||
问题: 亁颐堂是做什么的
|
问题: 亁颐堂是做什么的
|
||||||
回答:?
|
回答:?
|
||||||
"""
|
"""
|
||||||
print(decorate_query(query, api_key, filename=csv_file_path))\
|
print(decorate_query(query, filepath=csv_file_path))\
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -4,12 +4,12 @@ from gpt_0_basic_info import api_key, csv_file_path
|
|||||||
openai.api_key = api_key
|
openai.api_key = api_key
|
||||||
|
|
||||||
|
|
||||||
def question(query):
|
def question(input_query):
|
||||||
response = openai.ChatCompletion.create(
|
response = openai.ChatCompletion.create(
|
||||||
model='gpt-3.5-turbo',
|
model='gpt-3.5-turbo',
|
||||||
messages=[
|
messages=[
|
||||||
{"role": "system", "content": "You are a helpful assistant."},
|
{"role": "system", "content": "You are a helpful assistant."},
|
||||||
{"role": "user", "content": query}
|
{"role": "user", "content": input_query}
|
||||||
],
|
],
|
||||||
max_tokens=100,
|
max_tokens=100,
|
||||||
n=1,
|
n=1,
|
||||||
@ -22,15 +22,15 @@ def question(query):
|
|||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
query = '亁颐堂是做什么的'
|
query = '亁颐堂是做什么的'
|
||||||
new_query = decorate_query(query, api_key, filename=csv_file_path)
|
new_query = decorate_query(query, filepath=csv_file_path)
|
||||||
print(new_query)
|
print(new_query)
|
||||||
# 产生如下的问题:
|
# 产生如下的问题:
|
||||||
"""
|
"""
|
||||||
请使用上下文尽可能真实、自然地回答问题,如果答案未包含在上下文中,请不要编造回答,并且不要在回答中包含”根据上下文”这个短语。
|
请使用上下文尽可能真实、自然地回答问题,如果答案未包含在上下文中,请不要编造回答,并且不要在回答中包含”根据上下文”这个短语。
|
||||||
|
|
||||||
上下文:
|
上下文:
|
||||||
当有人问:公司名称请回答:亁颐堂科技有限责任公司
|
当有人问:公司名称, 请回答:亁颐堂科技有限责任公司
|
||||||
当有人问:亁颐堂是做什么的请回答:亁颐堂是一个网络培训公司
|
当有人问:亁颐堂是做什么的, 请回答:亁颐堂是一个网络培训公司
|
||||||
|
|
||||||
问题: 亁颐堂是做什么的
|
问题: 亁颐堂是做什么的
|
||||||
回答:?
|
回答:?
|
||||||
|
File diff suppressed because one or more lines are too long
Loading…
x
Reference in New Issue
Block a user