# function support rag pipeline from typing import List from langchain.vectorstores import Chroma from langchain.retrievers.multi_vector import MultiVectorRetriever from langchain.storage import InMemoryStore import uuid from langchain.document_loaders import TextLoader, DirectoryLoader import os from sentence_transformers.cross_encoder import CrossEncoder import numpy as np from langchain.schema import BaseRetriever, Document from typing import List from langchain.callbacks.manager import CallbackManagerForRetrieverRun from langchain.vectorstores import VectorStore from langchain.load import dumps, loads from typing import Any def load_data(data_path): folders = os.listdir(data_path) dir_loaders = [] loaded_documents = [] for folder in folders: dir_loader = DirectoryLoader(os.path.join(data_path, folder), loader_cls=TextLoader) dir_loaders.append(dir_loader) for dir_loader in dir_loaders: loaded_documents.extend(dir_loader.load()) return loaded_documents def process_data(data: List[str], child_text_splitter, embedding, vectorstore_name: str) -> MultiVectorRetriever: # The vectorstore to use to index the child chunks vectorstore = Chroma( collection_name=vectorstore_name, embedding_function=embedding, # collection_metadata={"hnsw:space": "cosine"} ) # The storage layer for the parent documents store = InMemoryStore() id_key = "doc_id" # The retriever (empty to start) retriever = MultiVectorRetriever( vectorstore=vectorstore, docstore=store, id_key=id_key, search_kwargs={"k": 25} ) doc_ids = [str(uuid.uuid4()) for _ in data] sub_docs = [] for i, doc in enumerate(data): _id = doc_ids[i] _sub_docs = child_text_splitter.split_documents([doc]) for _doc in _sub_docs: _doc.metadata[id_key] = _id sub_docs.extend(_sub_docs) retriever.vectorstore.add_documents(sub_docs) retriever.docstore.mset(list(zip(doc_ids, data))) return vectorstore, retriever class CustomRetriever(BaseRetriever): # vectorstores:Chroma retriever:Any def reciprocal_rank_fusion(self, results: list[list], k=60): """ Reciprocal_rank_fusion that takes multiple lists of ranked documents and an optional parameter k used in the RRF formula """ # Initialize a dictionary to hold fused scores for each unique document fused_scores = {} # Iterate through each list of ranked documents for docs in results: # Iterate through each document in the list, with its rank (position in the list) for rank, doc in enumerate(docs): # Convert the document to a string format to use as a key (assumes documents can be serialized to JSON) doc_str = dumps(doc) # If the document is not yet in the fused_scores dictionary, add it with an initial score of 0 if doc_str not in fused_scores: fused_scores[doc_str] = 0 # Retrieve the current score of the document, if any previous_score = fused_scores[doc_str] # Update the score of the document using the RRF formula: 1 / (rank + k) fused_scores[doc_str] += 1 / (rank + k) # Sort the documents based on their fused scores in descending order to get the final reranked results reranked_results = [ (loads(doc), score) for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True) #[:10] #Top 10 ] # Return the reranked results as a list of tuples, each containing the document and its fused score rr_list=[] for doc in reranked_results: rr_list.append(doc[0]) return rr_list def _get_relevant_documents( self, queries: list, *, run_manager: CallbackManagerForRetrieverRun ) -> List[Document]: # Use your existing retriever to get the documents documents=[] for i in range(len(queries)): document = self.retriever.get_relevant_documents(queries[i], callbacks=run_manager.get_child()) documents.append(document) unique_documents = self.reciprocal_rank_fusion(documents) # Get page content docs_content = [] for i in range(len(unique_documents)): docs_content.append(unique_documents[i].page_content) # model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2') # model = CrossEncoder('nnngoc/ms-marco-MiniLM-L-6-v2-641M') # model = CrossEncoder('nnngoc/ms-marco-MiniLM-L-6-v2-642M-2') * # model = CrossEncoder('nnngoc/ms-marco-MiniLM-L-6-v2-644M-1') # model = CrossEncoder('nnngoc/ms-marco-MiniLM-L-6-v2-32-2M-2') # model = CrossEncoder('nnngoc/ms-marco-MiniLM-L-6-v2-32-5M-1') model = CrossEncoder('nnngoc/ms-marco-MiniLM-L-6-v2-32-6M-1') # So we create the respective sentence combinations sentence_combinations = [[queries[0], document] for document in docs_content] # Compute the similarity scores for these combinations similarity_scores = model.predict(sentence_combinations) # Sort the scores in decreasing order sim_scores_argsort = reversed(np.argsort(similarity_scores)) # Store the rerank document in new list docs = [] for idx in sim_scores_argsort: docs.append(unique_documents[idx]) docs_top_10 = docs[0:10] return docs_top_10