mintaeng committed on
Commit
e61f5e5
•
1 Parent(s): 327a406

Create load_push.py

Browse files
Files changed (1) hide show
  1. load_push.py +135 -0
load_push.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import glob
2
+ import os
3
+ from langchain_community.document_loaders import PyMuPDFLoader, TextLoader, CSVLoader
4
+ from langchain.text_splitter import CharacterTextSplitter
5
+ from langchain.docstore.document import Document
6
+ from sentence_transformers import SentenceTransformer
7
+ from langchain_pinecone import PineconeVectorStore
8
+ from pinecone.grpc import PineconeGRPC as Pinecone
9
+ from pinecone import ServerlessSpec
10
+ import time
11
+ from langchain_community.embeddings import SentenceTransformerEmbeddings
12
+
13
+ from dotenv import load_dotenv
14
+ load_dotenv()
15
+
16
+
17
+ # Split the incoming data into per-chunk documents
18
def come_data(splits):
    """Rebuild split chunks as documents that carry only their source.

    Args:
        splits: Iterable of split chunks. Each chunk must expose a
            ``page_content`` attribute and a ``metadata`` mapping that
            contains a ``'source'`` key.

    Returns:
        list: One new ``Document`` per chunk, in input order. The metadata
        of each new document is reduced to ``{'source': ...}``; any other
        metadata keys on the input chunk are not copied.

    Raises:
        KeyError: If a chunk's metadata has no ``'source'`` entry.
    """
    return [
        Document(
            page_content=chunk.page_content,
            metadata={'source': chunk.metadata['source']},
        )
        for chunk in splits
    ]
26
+
27
+
28
+
29
+
30
+
31
+ # Flatten nested lists
32
def flatten_list(lst):
    """Flatten arbitrarily nested lists into a single flat list.

    Only ``list`` instances are descended into; tuples, strings and every
    other object are treated as leaf items. Item order is preserved
    (depth-first, left to right).

    The traversal uses an explicit stack of iterators instead of recursion,
    so very deep nesting does not raise ``RecursionError``.

    Args:
        lst: A (possibly nested) list, or any other object.

    Returns:
        list: The leaf items in order. A non-list argument is returned
        wrapped in a one-element list.
    """
    if not isinstance(lst, list):
        return [lst]

    flat = []
    pending = [iter(lst)]
    while pending:
        for item in pending[-1]:
            if isinstance(item, list):
                # Descend into the nested list; the outer iterator stays on
                # the stack and resumes where it left off afterwards.
                pending.append(iter(item))
                break
            flat.append(item)
        else:
            # The innermost iterator is exhausted.
            pending.pop()
    return flat
34
+
35
+
36
+ # Load the embedding model and upload the data to the vector DB
37
+ def all_files(path):
38
+ print(f'RAG์— ๋“ค์–ด๊ฐˆ ๋ชจ๋“  ๋ฐ์ดํ„ฐ๋Š” {path}์— ๋‹ด์•„์ฃผ์„ธ์š”.\n\n\n')
39
+ f = glob.glob(path + '/**', recursive=True)
40
+ f_docs = []
41
+ for file in f:
42
+ a = False
43
+ if file.endswith('.txt'):
44
+ loader = TextLoader(file)
45
+ document = loader.load()
46
+ a = True
47
+ elif file.endswith('.csv'):
48
+ loader = CSVLoader(file)
49
+ document = loader.load()
50
+ a = True
51
+ elif file.endswith('.pdf'):
52
+ loader = PyMuPDFLoader(file)
53
+ document = loader.load()
54
+ a = True
55
+ # ------------------- ํŒŒ์ผ ํƒ€์ž… ์ถ”๊ฐ€ ์‚ฌํ•ญ ์žˆ์„ ์‹œ ์œ„์— ์ถ”๊ฐ€ ----------------#
56
+ if a:
57
+ print(file.split('/')[-1] + ' split ์ง„ํ–‰ ์ค‘')
58
+ text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
59
+ separator=".",
60
+ chunk_size=500,
61
+ chunk_overlap=0,
62
+ )
63
+ splits = text_splitter.split_documents(document)
64
+ docs = come_data(splits)
65
+ f_docs.append(docs)
66
+ print(file.split('/')[-1] + ' split ์ง„ํ–‰ ์™„๋ฃŒ. \n' + file.split('/')[-1] + ' split ๊ฐฏ์ˆ˜ : ' + str(len(docs)))
67
+ flattened_list = flatten_list(f_docs)
68
+
69
+ '''
70
+ flattened ๋œ docs๋ฅผ ๋ฒกํ„ฐ db๋กœ ๋„ฃ์–ด์ค„ ๊ฒƒ
71
+ '''
72
+
73
+
74
+
75
+ # ์ž„๋ฒ ๋”ฉ ๋ชจ๋ธ ์„ ์–ธ
76
+ embedding_model = SentenceTransformerEmbeddings(model_name='BM-K/KoSimCSE-roberta-multitask', model_kwargs={"trust_remote_code":True})
77
+
78
+ # ๋ฒกํ„ฐ์Šคํ† ์–ด ์„ ์–ธ
79
+
80
+ api_key = os.environ['PINECONE_API_KEY']
81
+ pc = Pinecone(api_key=api_key)
82
+
83
+ index_name = os.getenv('INDEX_NAME')
84
+
85
+ print('Vector DB ์ดˆ๊ธฐํ™”. Index_name = ' + str(index_name))
86
+ spec = ServerlessSpec(cloud='aws', region='us-east-1')
87
+
88
+ # ์ธ๋ฑ์Šค ์กด์žฌ์—ฌ๋ถ€ ํ™•์ธ ๋ฐ ์‚ญ์ œ
89
+ collect_name = []
90
+ for n in pc.list_indexes().indexes:
91
+ collect_name.append(n.name)
92
+
93
+ if index_name in collect_name:
94
+ pc.delete_index(index_name)
95
+ print('๊ธฐ์กด ์ธ๋ฑ์Šค ์‚ญ์ œ์™„๋ฃŒ')
96
+ time.sleep(3)
97
+
98
+ # ํŒŒ์ธ์ฝ˜ ์ธ๋ฑ์Šค ์ƒ์„ฑ
99
+ pc.create_index(
100
+ index_name,
101
+ dimension=768,
102
+ metric='cosine',
103
+ spec=spec
104
+ )
105
+
106
+ # ์ธ๋ฑ์Šค ์žฌ์ƒ์„ฑ ๋ฐ ๋ฐ์ดํ„ฐ ์ž…๋ ฅ
107
+ # index = pc.Index(index_name)
108
+ print('Vector DB ๋“ค์–ด๊ฐ€๋Š” ์ค‘. Index_name = ' + str(index_name))
109
+
110
+ # # ํ…์ŠคํŠธ ์ž„๋ฒ ๋”ฉ ์ƒ์„ฑ
111
+ # texts = [doc.page_content for doc in flattened_list]
112
+ # embedded_texts = []
113
+ # for txt in texts:
114
+ # embedded_texts.append(embedding_model.embed_query(txt))
115
+
116
+
117
+ # # ๋ฒกํ„ฐ DB์— ์ž„๋ฒ ๋”ฉ ์ถ”๊ฐ€
118
+ # ids = [str(i) for i in range(len(embedded_texts))]
119
+ # metadata = [doc.metadata for doc in flattened_list]
120
+
121
+ # # db์˜ฌ๋ฆด๋•Œ ๋ฌด๋ฃŒ๋ฒ„์ „์ด๊ธฐ๋•Œ๋ฌธ์— ์šฉ๋Ÿ‰ ํ„ฐ์ง -> ๋‚˜๋ˆ ์„œ ์˜ฌ๋ฆฌ์ž
122
+ # batch_size = 28
123
+ # for i in range(0, len(embedded_texts), batch_size):
124
+ # batch_vectors = [{"id": id, "values": vector, "metadata": meta} for id, vector, meta in zip(ids[i:i + batch_size], embedded_texts[i:i + batch_size], metadata[i:i + batch_size])]
125
+ # index.upsert(vectors=batch_vectors)
126
+
127
+
128
+ Vectorstore = PineconeVectorStore.from_documents(
129
+ documents=flattened_list,
130
+ index_name=index_name,
131
+ embedding=embedding_model
132
+ )
133
+
134
+ print('์ €์žฅ ์™„๋ฃŒ')
135
+ return Vectorstore, flattened_list