AllenYkl committed on
Commit
0be93a7
·
1 Parent(s): 88ff42b

Create Pinecone.py

Browse files
Files changed (1) hide show
  1. bin_public/utils/Pinecone.py +107 -0
bin_public/utils/Pinecone.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from loguru import logger
2
+ import json
3
+ from bin_public.utils.utils_db import *
4
+ from bin_public.config.presets import MIGRAINE_PROMPT
5
+ import PyPDF2
6
+ import pinecone
7
+ from langchain.vectorstores import Pinecone
8
+ from langchain.embeddings.openai import OpenAIEmbeddings
9
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
10
+
11
+
12
# Pinecone credentials are read from the process environment when this module
# is imported; a missing variable raises KeyError.
PINECONE_API_KEY, PINECONE_API_ENV = (
    os.environ['PINECONE_API_KEY'],
    os.environ['PINECONE_API_ENV'],
)
14
+
15
+
16
def load_local_file_PDF(path, file_name):
    """Extract the text of a PDF and split it into named chunks.

    Args:
        path: Filesystem path of the PDF to read.
        file_name: Name used to label the chunks. Its trailing ``.pdf`` is
            stripped to form the key prefix.

    Returns:
        A dict mapping ``'<file_name without .pdf>_<n>'`` to the n-th text
        chunk (n starts at 0), in document order. An empty dict is returned
        when ``file_name`` does not end in ``.pdf``.
    """
    # Without the '.pdf' suffix there is no key prefix to build, so return
    # before opening or parsing anything.
    if not file_name.endswith('.pdf'):
        return {}
    prefix = file_name[:-4]

    # The context manager guarantees the file handle is closed once the text
    # of every page has been extracted.
    with open(path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        raw_text = ''.join(page.extract_text() for page in pdf_reader.pages)

    # Newlines and tabs are dropped before splitting, so chunks are cut from
    # one continuous run of text.
    flat_text = raw_text.replace('\n', '').replace('\t', '')
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    return {
        f'{prefix}_{i}': chunk
        for i, chunk in enumerate(text_splitter.split_text(flat_text))
    }
33
+
34
+
35
def holo_query_insert_file_contents(file_name, file_content):
    """Insert one ``(file_name, content)`` row into the ``s_context`` table.

    Args:
        file_name: Value stored in the ``file_name`` column.
        file_content: Value stored in the ``content`` column.
    """
    # The statement is handed to holo_query_func as a finished SQL string, so
    # both values are embedded as string literals. Double every single quote
    # so text containing apostrophes cannot terminate the literal early.
    # NOTE(review): assumes the database treats backslashes literally
    # (standard-conforming strings) - confirm for the target engine.
    safe_name = str(file_name).replace("'", "''")
    safe_content = str(file_content).replace("'", "''")
    run_sql = f"""
        insert into s_context(
            file_name,
            content
        )
        select
            '{safe_name}' as file_name,
            '{safe_content}' as content
    """
    holo_query_func(run_sql, is_query=0)
46
+
47
+
48
def holo_query_get_content(run_sql):
    """Run a query and return the second column of every row, cleaned.

    Args:
        run_sql: SQL text passed unchanged to ``holo_query_func``.

    Returns:
        A list with one string per returned row: the value at index 1 with
        all newline and tab characters removed.
    """
    rows = holo_query_func(run_sql, is_query=1)
    return [row[1].replace('\n', '').replace('\t', '') for row in rows]
54
+
55
+
56
def pdf2database(path, file_name):
    """Split a PDF into text chunks and store each chunk in the database.

    Args:
        path: Filesystem path of the PDF to read.
        file_name: Name used to label the chunks; each stored row is named
            ``'<file_name without .pdf>_<n>'``.
    """
    # load_local_file_PDF already performs the same read/clean/split steps and
    # yields the same '<name>_<n>' keys in order, so reuse it instead of
    # keeping a second copy of that logic here.
    chunks = load_local_file_PDF(path, file_name)
    for chunk_name, chunk_text in chunks.items():
        holo_query_insert_file_contents(chunk_name, chunk_text)
        logger.info(f'{chunk_name} stored')
70
+
71
+
72
def load_json(path):
    """Read the UTF-8 encoded JSON file at ``path`` and return the decoded object."""
    with open(path, mode='r', encoding='utf-8') as json_file:
        return json.loads(json_file.read())
76
+
77
+
78
def get_content_from_json(path):
    """Flatten a JSON list of mappings into ``'key,value'`` strings.

    Args:
        path: Path of a JSON file, loaded via ``load_json``, whose top level
            is iterated entry by entry.

    Returns:
        A list with one string per entry, built from the entry's first key
        and its value joined by a comma.
    """
    lines = []
    for entry in load_json(path):
        # Only the first key/value pair of each entry is used.
        first_key, first_value = list(entry.items())[0]
        lines.append(first_key + ',' + first_value)
    return lines
86
+
87
def data2embeddings(index_name, data, embeddings):
    """Embed the given texts and store them in a Pinecone index.

    Args:
        index_name: Name of the Pinecone index passed to ``Pinecone.from_texts``.
        data: Iterable of texts; it is materialised into a list before upload.
        embeddings: Embedding object passed through to ``Pinecone.from_texts``.
    """
    # Credentials come from the module-level environment constants.
    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)
    texts = list(data)
    Pinecone.from_texts(texts, embeddings, index_name=index_name)
    logger.info("Stored Successfully")
91
+
92
+
93
def context_construction(api_key, query, model, pinecone_api_key, pinecone_api_env, temperature, index_name, mode="map_reduce"):
    """Build a context prompt for ``query`` from the closest Pinecone matches.

    Args:
        api_key: OpenAI API key used to create the embeddings client.
        query: User query text.
        model: Currently unused by this function.
        pinecone_api_key: Pinecone API key.
        pinecone_api_env: Pinecone environment name.
        temperature: Currently unused by this function.
        index_name: Name of the existing Pinecone index to search.
        mode: Currently unused by this function.

    Returns:
        A ``(prompt, status)`` tuple. When ``query`` contains no alphanumeric
        character the prompt is ``MIGRAINE_PROMPT``; otherwise it is a fixed
        header followed by the page content of the two most similar
        documents, joined by spaces. ``status`` is always
        ``"Connecting to Pinecone"``.
    """
    # A query without any letter or digit has nothing to search for, so
    # answer with the default prompt before creating any remote clients.
    if not any(char.isalnum() for char in query):
        return MIGRAINE_PROMPT, "Connecting to Pinecone"

    embeddings = OpenAIEmbeddings(openai_api_key=api_key)
    pinecone.init(api_key=pinecone_api_key, environment=pinecone_api_env)
    docsearch = Pinecone.from_existing_index(index_name=index_name, embedding=embeddings)

    docs = docsearch.similarity_search(query, include_metadata=True, k=2)
    context = ' '.join(doc.page_content for doc in docs)
    return '用以下资料进行辅助回答\n' + context, "Connecting to Pinecone"