import glob
import os
import time

from dotenv import load_dotenv
from langchain_community.document_loaders import PyMuPDFLoader, TextLoader, CSVLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.docstore.document import Document
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

load_dotenv()


def come_data(splits):
    # Re-wrap each split as a plain Document that keeps only the source path as metadata.
    docs = []
    for split in splits:
        docs.append(Document(page_content=split.page_content,
                             metadata={'source': split.metadata['source']}))
    return docs


def flatten_list(lst):
    # Recursively flatten an arbitrarily nested list of Documents into a single flat list.
    return [item for sublist in lst for item in flatten_list(sublist)] if isinstance(lst, list) else [lst]


def all_files(path):
    print(f'Put all data for the RAG under {path}.\n\n\n')
    f = glob.glob(path + '/**', recursive=True)
    f_docs = []
    for file in f:
        # Pick a loader by file extension; anything else is skipped.
        a = False
        if file.endswith('.txt'):
            loader = TextLoader(file)
            document = loader.load()
            a = True
        elif file.endswith('.csv'):
            loader = CSVLoader(file)
            document = loader.load()
            a = True
        elif file.endswith('.pdf'):
            loader = PyMuPDFLoader(file)
            document = loader.load()
            a = True

        if a:
            print(file.split('/')[-1] + ' split in progress')
            text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
                separator=".",
                chunk_size=500,
                chunk_overlap=0,
            )
            splits = text_splitter.split_documents(document)
            docs = come_data(splits)
            f_docs.append(docs)
            print(file.split('/')[-1] + ' split complete. \n' + file.split('/')[-1] + ' number of splits: ' + str(len(docs)))
    flattened_list = flatten_list(f_docs)

    '''
    Load the flattened docs into the vector DB.
    '''

    embedding_model = SentenceTransformerEmbeddings(
        model_name='BM-K/KoSimCSE-roberta-multitask',
        model_kwargs={"trust_remote_code": True},
    )

    api_key = os.environ['PINECONE_API_KEY']
    pc = Pinecone(api_key=api_key)

    index_name = os.getenv('INDEX_NAME')

    print('Initializing vector DB. Index_name = ' + str(index_name))
    spec = ServerlessSpec(cloud='aws', region='us-east-1')

    # If the index already exists, drop it so it can be rebuilt from scratch.
    collect_name = []
    for n in pc.list_indexes().indexes:
        collect_name.append(n.name)

    if index_name in collect_name:
        pc.delete_index(index_name)
        print('Existing index deleted')
        time.sleep(3)

    pc.create_index(
        index_name,
        dimension=768,  # KoSimCSE-roberta-multitask produces 768-dimensional embeddings
        metric='cosine',
        spec=spec
    )

    print('Loading documents into the vector DB. Index_name = ' + str(index_name))

    Vectorstore = PineconeVectorStore.from_documents(
        documents=flattened_list,
        index_name=index_name,
        embedding=embedding_model
    )

    print('Upload complete')
    return Vectorstore, flattened_list
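

# --- Usage sketch (assumption, not part of the original pipeline) ---
# Builds the index end to end, assuming a local './data' directory and that
# PINECONE_API_KEY and INDEX_NAME are set in .env. The follow-up query uses
# LangChain's generic similarity_search API on the returned PineconeVectorStore;
# the query text and k value are only illustrative.
if __name__ == '__main__':
    vectorstore, chunks = all_files('./data')  # './data' is an assumed location
    print(str(len(chunks)) + ' chunks indexed.')

    # Quick sanity check: fetch the 3 chunks most similar to a sample query.
    results = vectorstore.similarity_search('sample question about the indexed documents', k=3)
    for doc in results:
        print(doc.metadata['source'], doc.page_content[:80])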