Spaces:
Running
Running
Create Pinecone.py
Browse files- bin_public/utils/Pinecone.py +107 -0
bin_public/utils/Pinecone.py
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import json
import os

import pinecone
import PyPDF2
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone
from loguru import logger

from bin_public.config.presets import MIGRAINE_PROMPT
from bin_public.utils.utils_db import *
10 |
+
|
11 |
+
|
# Pinecone credentials are read eagerly at import time; a missing
# environment variable raises KeyError as soon as this module is imported.
# NOTE(review): `os` is not imported directly here — presumably it arrives
# via the star-import of utils_db; confirm.
PINECONE_API_KEY = os.environ['PINECONE_API_KEY']
PINECONE_API_ENV = os.environ['PINECONE_API_ENV']
14 |
+
|
15 |
+
|
def load_local_file_PDF(path, file_name):
    """Read a local PDF and return its text split into ~1000-char chunks.

    Args:
        path: Filesystem path of the PDF to read.
        file_name: Display name of the file; its stem (extension stripped)
            becomes the key prefix.

    Returns:
        dict mapping '<stem>_<chunk_index>' to the chunk's text.
    """
    # Context manager closes the handle deterministically — the original
    # open() was never closed.
    text = ''
    with open(path, 'rb') as fh:
        reader = PyPDF2.PdfReader(fh)
        for page in reader.pages:
            text += page.extract_text()

    # splitext strips any extension; the original only handled '.pdf' and
    # hit a NameError on `index` for every other file name.
    index = os.path.splitext(file_name)[0]

    # Collapse layout whitespace so the splitter sees continuous prose.
    text = text.replace('\n', '').replace('\t', '')
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    chunks = splitter.split_text(text)
    return {f'{index}_{i}': content for i, content in enumerate(chunks)}
33 |
+
|
34 |
+
|
def holo_query_insert_file_contents(file_name, file_content):
    """Insert one (file_name, content) row into the s_context table.

    Args:
        file_name: Key under which this chunk is stored.
        file_content: Text content of the chunk.

    NOTE(review): values are still interpolated into the SQL string because
    holo_query_func exposes no bind-parameter interface from here; single
    quotes are escaped so quoted PDF text no longer breaks (or injects
    into) the statement. Switch to real bind parameters if holo_query_func
    supports them.
    """
    # Standard SQL escaping: double every embedded single quote.
    safe_name = str(file_name).replace("'", "''")
    safe_content = str(file_content).replace("'", "''")
    run_sql = f"""
    insert into s_context(
    file_name,
    content
    )
    select
    '{safe_name}' as file_name,
    '{safe_content}' as content
    """
    holo_query_func(run_sql, is_query=0)
46 |
+
|
47 |
+
|
def holo_query_get_content(run_sql):
    """Execute *run_sql* and return the second column of every row with
    newlines and tabs stripped out."""
    rows = holo_query_func(run_sql, is_query=1)
    return [row[1].replace('\n', '').replace('\t', '') for row in rows]
54 |
+
|
55 |
+
|
def pdf2database(path, file_name):
    """Extract text from a local PDF, chunk it, and store every chunk in
    the s_context table via holo_query_insert_file_contents.

    Args:
        path: Filesystem path of the PDF to read.
        file_name: Display name; its stem becomes the storage-key prefix.
    """
    # Close the file deterministically — the original leaked the handle.
    text = ''
    with open(path, 'rb') as fh:
        reader = PyPDF2.PdfReader(fh)
        for page in reader.pages:
            text += page.extract_text()

    # Strip any extension; the original only handled '.pdf' and raised
    # NameError on `index` for other names.
    index = os.path.splitext(file_name)[0]

    text = text.replace('\n', '').replace('\t', '')
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    chunks = splitter.split_text(text)
    for i, chunk in enumerate(chunks):
        holo_query_insert_file_contents(f'{index}_{i}', chunk)
        logger.info(f'{index}_{i} stored')
70 |
+
|
71 |
+
|
def load_json(path):
    """Deserialize and return the UTF-8 encoded JSON document at *path*."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)
76 |
+
|
77 |
+
|
def get_content_from_json(path):
    """Load a JSON array of single-key objects and return a list of
    'key,value' strings built from the first key of each object."""
    output = []
    for entry in load_json(path):
        first_key = next(iter(entry))
        output.append(first_key + ',' + entry[first_key])
    return output
86 |
+
|
def data2embeddings(index_name, data, embeddings):
    """Embed every text in *data* and upsert the vectors into the named
    Pinecone index, using the module-level Pinecone credentials."""
    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)
    texts = list(data)
    Pinecone.from_texts(texts, embeddings, index_name=index_name)
    logger.info("Stored Successfully")
91 |
+
|
92 |
+
|
def context_construction(api_key, query, model, pinecone_api_key, pinecone_api_env, temperature, index_name, mode="map_reduce"):
    """Build a context prompt for *query* from a Pinecone similarity search.

    Returns:
        (prompt_text, status): status is always "Connecting to Pinecone".
        Queries containing no alphanumeric character fall back to
        MIGRAINE_PROMPT.

    Note: *model*, *temperature* and *mode* are currently unused; they are
    kept so existing callers keep working.
    """
    embeddings = OpenAIEmbeddings(openai_api_key=api_key)
    pinecone.init(api_key=pinecone_api_key, environment=pinecone_api_env)
    docsearch = Pinecone.from_existing_index(index_name=index_name, embedding=embeddings)

    # Guard clause: nothing searchable in the query.
    if not any(ch.isalnum() for ch in query):
        return MIGRAINE_PROMPT, "Connecting to Pinecone"

    matches = docsearch.similarity_search(query, include_metadata=True, k=2)
    snippets = [doc.page_content for doc in matches]
    return '用以下资料进行辅助回答\n' + ' '.join(snippets), "Connecting to Pinecone"