Create load_push.py
load_push.py  +135 -0
ADDED
@@ -0,0 +1,135 @@
import glob
import os
from langchain_community.document_loaders import PyMuPDFLoader, TextLoader, CSVLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.docstore.document import Document
from sentence_transformers import SentenceTransformer
from langchain_pinecone import PineconeVectorStore
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import time
from langchain_community.embeddings import SentenceTransformerEmbeddings

from dotenv import load_dotenv
load_dotenv()


# Re-wrap incoming splits as Document objects, keeping the source path as metadata
def come_data(splits):
    docs = []
    for i in range(len(splits)):
        spcon = splits[i].page_content
        url = splits[i].metadata['source']
        con = Document(page_content=spcon, metadata={'source': url})
        docs.append(con)
    return docs


# Flatten a nested list
def flatten_list(lst):
    return [item for sublist in lst for item in flatten_list(sublist)] if isinstance(lst, list) else [lst]


# Load the embedding model and push the documents to the vector DB
def all_files(path):
    print(f'Please put all data that should go into the RAG pipeline under {path}.\n\n\n')
    f = glob.glob(path + '/**', recursive=True)
    f_docs = []
    for file in f:
        a = False
        if file.endswith('.txt'):
            loader = TextLoader(file)
            document = loader.load()
            a = True
        elif file.endswith('.csv'):
            loader = CSVLoader(file)
            document = loader.load()
            a = True
        elif file.endswith('.pdf'):
            loader = PyMuPDFLoader(file)
            document = loader.load()
            a = True
        # ------------------- add new file types here when needed ----------------#
        if a:
            print(file.split('/')[-1] + ' split in progress')
            text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
                separator=".",
                chunk_size=500,
                chunk_overlap=0,
            )
            splits = text_splitter.split_documents(document)
            docs = come_data(splits)
            f_docs.append(docs)
            print(file.split('/')[-1] + ' split finished. \n' + file.split('/')[-1] + ' number of splits: ' + str(len(docs)))
    flattened_list = flatten_list(f_docs)

    '''
    Push the flattened docs into the vector DB
    '''

    # Declare the embedding model
    embedding_model = SentenceTransformerEmbeddings(model_name='BM-K/KoSimCSE-roberta-multitask', model_kwargs={"trust_remote_code": True})

    # Declare the vector store

    api_key = os.environ['PINECONE_API_KEY']
    pc = Pinecone(api_key=api_key)

    index_name = os.getenv('INDEX_NAME')

    print('Initializing Vector DB. Index_name = ' + str(index_name))
    spec = ServerlessSpec(cloud='aws', region='us-east-1')

    # Check whether the index already exists and delete it
    collect_name = []
    for n in pc.list_indexes().indexes:
        collect_name.append(n.name)

    if index_name in collect_name:
        pc.delete_index(index_name)
        print('Existing index deleted')
        time.sleep(3)

    # Create the Pinecone index
    pc.create_index(
        index_name,
        dimension=768,
        metric='cosine',
        spec=spec
    )

    # Recreate the index and insert the data
    # index = pc.Index(index_name)
    print('Uploading to the Vector DB. Index_name = ' + str(index_name))

    # # Create the text embeddings
    # texts = [doc.page_content for doc in flattened_list]
    # embedded_texts = []
    # for txt in texts:
    #     embedded_texts.append(embedding_model.embed_query(txt))


    # # Add the embeddings to the vector DB
    # ids = [str(i) for i in range(len(embedded_texts))]
    # metadata = [doc.metadata for doc in flattened_list]

    # # The free tier runs out of capacity on upload -> upsert in batches
    # batch_size = 28
    # for i in range(0, len(embedded_texts), batch_size):
    #     batch_vectors = [{"id": id, "values": vector, "metadata": meta} for id, vector, meta in zip(ids[i:i + batch_size], embedded_texts[i:i + batch_size], metadata[i:i + batch_size])]
    #     index.upsert(vectors=batch_vectors)


    Vectorstore = PineconeVectorStore.from_documents(
        documents=flattened_list,
        index_name=index_name,
        embedding=embedding_model
    )

    print('Upload complete')
    return Vectorstore, flattened_list
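For reference, a minimal sketch of how this file might be exercised is shown below. The `./documents` folder, the `.env` contents, and the query string are illustrative assumptions, not part of this commit; the sketch only assumes the `all_files` function defined above and a LangChain vector store's standard `similarity_search` method.

# Hypothetical usage sketch (not part of this commit).
# Assumes a .env file providing PINECONE_API_KEY and INDEX_NAME,
# and a local ./documents folder with the .txt/.csv/.pdf files to index.
from load_push import all_files

if __name__ == "__main__":
    # Build the Pinecone index from every supported file under ./documents
    vectorstore, docs = all_files('./documents')
    print(str(len(docs)) + ' chunks uploaded')

    # Sanity-check retrieval against the freshly built index
    hits = vectorstore.similarity_search('example query', k=3)
    for hit in hits:
        print(hit.metadata['source'], hit.page_content[:80])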