import glob
import os
from langchain_community.document_loaders import PyMuPDFLoader, TextLoader, CSVLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.docstore.document import Document
from sentence_transformers import SentenceTransformer
from langchain_pinecone import PineconeVectorStore
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import time
from langchain_community.embeddings import SentenceTransformerEmbeddings
from dotenv import load_dotenv
load_dotenv()
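# The .env file is expected to define PINECONE_API_KEY and INDEX_NAME (both are read below)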
# Rewrap each split as a Document, keeping only the source path as metadata
def come_data(splits):
    docs = []
    for i in range(len(splits)):
        spcon = splits[i].page_content
        url = splits[i].metadata['source']
        con = Document(page_content=spcon, metadata={'source': url})
        docs.append(con)
    return docs

# Recursively flatten an arbitrarily nested list
def flatten_list(lst):
    return [item for sublist in lst for item in flatten_list(sublist)] if isinstance(lst, list) else [lst]
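# e.g. flatten_list([[doc1, doc2], [doc3]]) -> [doc1, doc2, doc3]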

# Load every supported file under `path`, split it into chunks, and build the vector DB
def all_files(path):
    print(f'Please place all data for the RAG pipeline inside {path}.\n\n\n')
    f = glob.glob(path + '/**', recursive=True)
    f_docs = []
    for file in f:
        a = False
        if file.endswith('.txt'):
            loader = TextLoader(file)
            document = loader.load()
            a = True
        elif file.endswith('.csv'):
            loader = CSVLoader(file)
            document = loader.load()
            a = True
        elif file.endswith('.pdf'):
            loader = PyMuPDFLoader(file)
            document = loader.load()
            a = True
        # ------------------- Add handling for new file types above this line ----------------#
        if a:
            print(file.split('/')[-1] + ' split in progress')
            text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
                separator=".",
                chunk_size=500,
                chunk_overlap=0,
            )
            splits = text_splitter.split_documents(document)
            docs = come_data(splits)
            f_docs.append(docs)
            print(file.split('/')[-1] + ' split complete.\n' + file.split('/')[-1] + ' split count: ' + str(len(docs)))
    flattened_list = flatten_list(f_docs)
    '''
    Insert the flattened docs into the vector DB.
    '''
    # Declare the embedding model
    embedding_model = SentenceTransformerEmbeddings(model_name='BM-K/KoSimCSE-roberta-multitask', model_kwargs={"trust_remote_code": True})
    # Declare the vector store
    api_key = os.environ['PINECONE_API_KEY']
    pc = Pinecone(api_key=api_key)
    index_name = os.getenv('INDEX_NAME')
    print('Initializing Vector DB. Index_name = ' + str(index_name))
    spec = ServerlessSpec(cloud='aws', region='us-east-1')
    # If the index already exists, delete it so it can be rebuilt from scratch
    collect_name = []
    for n in pc.list_indexes().indexes:
        collect_name.append(n.name)
    if index_name in collect_name:
        pc.delete_index(index_name)
        print('Existing index deleted')
    time.sleep(3)
    # Create the Pinecone index (768 dimensions to match KoSimCSE-roberta-multitask)
    pc.create_index(
        index_name,
        dimension=768,
        metric='cosine',
        spec=spec
    )
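    # The fresh serverless index may take a moment to become queryable; a readiness
    # poll like the following (via the client's describe_index) is one option:
    # while not pc.describe_index(index_name).status['ready']:
    #     time.sleep(1)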
    # Insert the data into the recreated index
    # index = pc.Index(index_name)
    print('Writing to Vector DB. Index_name = ' + str(index_name))
    # # Manual alternative: embed the texts yourself and upsert them in batches
    # texts = [doc.page_content for doc in flattened_list]
    # embedded_texts = []
    # for txt in texts:
    #     embedded_texts.append(embedding_model.embed_query(txt))
    # # Add the embeddings to the vector DB
    # ids = [str(i) for i in range(len(embedded_texts))]
    # metadata = [doc.metadata for doc in flattened_list]
    # # The free tier chokes on oversized uploads -> upsert in batches
    # batch_size = 28
    # for i in range(0, len(embedded_texts), batch_size):
    #     batch_vectors = [{"id": id, "values": vector, "metadata": meta} for id, vector, meta in zip(ids[i:i + batch_size], embedded_texts[i:i + batch_size], metadata[i:i + batch_size])]
    #     index.upsert(vectors=batch_vectors)
    Vectorstore = PineconeVectorStore.from_documents(
        documents=flattened_list,
        index_name=index_name,
        embedding=embedding_model
    )
    print('Save complete')
    return Vectorstore, flattened_list
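
# Minimal usage sketch; './data' is an assumed example path, not from the original code.
if __name__ == '__main__':
    vectorstore, flattened = all_files('./data')
    print('Indexed ' + str(len(flattened)) + ' chunks; vector store ready for retrieval.')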