|
from langchain_huggingface import HuggingFaceEmbeddings |
|
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM |
|
from langchain_community.vectorstores import Chroma |
|
from langchain.schema import Document |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline |
|
import torch |
|
|
|
# Hugging Face model id of the embedding model.
embedding_model_name = 'nomic-ai/nomic-embed-text-v1.5'

# Keyword arguments handed to HuggingFaceEmbeddings below: select CUDA when
# torch reports it as available, otherwise fall back to the CPU.
model_kwargs = dict(
    device='cuda' if torch.cuda.is_available() else 'cpu',
    trust_remote_code=True,
)

# Embedding function shared by the helpers that build the vector store.
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
    model_kwargs=model_kwargs,
)

# Module-level vector store slot; starts out empty.
vectorstore = None
|
|
|
|
|
|
|
def read_file(data: str) -> Document:
    """Load a text file into a LangChain ``Document``.

    Args:
        data: Path of the file to read. The part after the last ``/`` is
            stored as the document's ``name`` metadata.

    Returns:
        A ``Document`` whose ``page_content`` is the whole file text and whose
        metadata is ``{"name": <file name>}``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises, which
    # the previous manual open()/close() pair did not guarantee.
    # NOTE(review): the platform default encoding is still used, as before —
    # confirm whether the input files should be read as UTF-8 explicitly.
    with open(data, 'r') as f:
        content = f.read()

    return Document(page_content=content, metadata={"name": data.split('/')[-1]})
|
|
|
# Shared splitter: chunks of up to 800 characters with a 100-character
# overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100,
)
|
|
|
def add_doc(data, vectorstore):
    """Split the file at *data* into chunks and index them in the vector store.

    Args:
        data: Path of the text file to ingest (loaded with ``read_file``).
        vectorstore: Existing Chroma store to extend, or ``None`` to build a
            new store from this file's chunks.

    Returns:
        A ``(retriever, vectorstore)`` tuple. The retriever is created from
        the store with ``search_kwargs={'k': 1}``.
    """
    doc = read_file(data)
    splits = text_splitter.split_documents([doc])

    if vectorstore is None:
        # First document: create the store from its chunks.
        vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)
    elif splits:
        # Extend the caller's store instead of discarding it and rebuilding a
        # fresh one, which is what happened when the argument was ignored.
        vectorstore.add_documents(splits)

    retriever = vectorstore.as_retriever(search_kwargs={'k': 1})
    return retriever, vectorstore
|
|
|
def delete_doc(delete_name, vectorstore):
    """Remove every stored chunk whose ``name`` metadata equals *delete_name*.

    Args:
        delete_name: Value compared against each entry's ``name`` metadata
            (``read_file`` stores the file name there).
        vectorstore: Store exposing ``get()``, ``delete(ids=...)`` and
            ``as_retriever(...)``; it is modified in place.

    Returns:
        A ``(retriever, vectorstore)`` tuple. The retriever is created from
        the store with ``search_kwargs={'k': 1}``.
    """
    # Fetch the store contents once; previously get() was re-run for every
    # matching entry just to look up its id.
    stored = vectorstore.get()
    delete_doc_ids = [
        doc_id
        for doc_id, metadata in zip(stored['ids'], stored['metadatas'])
        # Entries with no metadata or no 'name' key can never match.
        if metadata and 'name' in metadata and metadata['name'] == delete_name
    ]

    # Delete in bounded list batches rather than one call per id: far fewer
    # calls, and no call at all when nothing matched.
    # NOTE(review): some Chroma backends cap the size of a single request —
    # confirm 1000 is within the limit for the deployed version.
    batch_size = 1000
    for start in range(0, len(delete_doc_ids), batch_size):
        vectorstore.delete(ids=delete_doc_ids[start:start + batch_size])

    retriever = vectorstore.as_retriever(search_kwargs={'k': 1})
    return retriever, vectorstore
|
|
|
def delete_all_doc(vectorstore):
    """Remove every entry currently held in the vector store.

    Args:
        vectorstore: Store exposing ``get()``, ``delete(ids=...)`` and
            ``as_retriever(...)``; it is emptied in place.

    Returns:
        A ``(retriever, vectorstore)`` tuple. The retriever is created from
        the store with ``search_kwargs={'k': 1}``.
    """
    delete_doc_ids = list(vectorstore.get()['ids'])

    # Delete in bounded list batches rather than one call per id: far fewer
    # calls, and no call at all when the store is already empty.
    # NOTE(review): some Chroma backends cap the size of a single request —
    # confirm 1000 is within the limit for the deployed version.
    batch_size = 1000
    for start in range(0, len(delete_doc_ids), batch_size):
        vectorstore.delete(ids=delete_doc_ids[start:start + batch_size])

    retriever = vectorstore.as_retriever(search_kwargs={'k': 1})
    return retriever, vectorstore