Patrick Walukagga committed
Commit af11e83 · 1 Parent(s): 85bbaed

chroma integration

.gitignore CHANGED
@@ -171,3 +171,6 @@ poetry.toml
 
 # LSP config files
 pyrightconfig.json
+
+# data
+data/
app.py CHANGED
@@ -1,14 +1,14 @@
 import json
 from typing import List, Tuple
 import os
+import logging
 
 import gradio as gr
 from dotenv import load_dotenv
 from slugify import slugify
 
-from config import STUDY_FILES
 from rag.rag_pipeline import RAGPipeline
-from utils.helpers import generate_follow_up_questions, append_to_study_files
+from utils.helpers import generate_follow_up_questions, append_to_study_files, add_study_files_to_chromadb, chromadb_client
 from utils.prompts import (
     highlight_prompt,
     evidence_based_prompt,
@@ -20,9 +20,13 @@ from config import STUDY_FILES, OPENAI_API_KEY
 from utils.zotero_manager import ZoteroManager
 
 load_dotenv()
+logging.basicConfig(level=logging.INFO)
 
 openai.api_key = OPENAI_API_KEY
 
+# After loop, add all collected data to ChromaDB
+add_study_files_to_chromadb("study_files.json", "study_files_collection")
+
 # Cache for RAG pipelines
 rag_cache = {}
 
@@ -47,6 +51,8 @@ def process_zotero_library_items(zotero_library_id: str, zotero_api_access_key:
             zotero_manager.filter_and_return_collections_with_items(zotero_collection_lists)
         )
 
+        study_files_data = {}  # Dictionary to collect items for ChromaDB
+
         for collection in filtered_zotero_collection_lists:
             collection_name = collection.get("name")
             if collection_name not in STUDY_FILES:
@@ -62,6 +68,16 @@ def process_zotero_library_items(zotero_library_id: str, zotero_api_access_key:
                     zotero_items_json, f"data/{export_file}"
                 )
                 append_to_study_files("study_files.json", collection_name, f"data/{export_file}")
+
+                # Collect for ChromaDB
+                study_files_data[collection_name] = f"data/{export_file}"
+
+                # Update in-memory STUDY_FILES for reference in current session
+                STUDY_FILES.update({collection_name: f"data/{export_file}"})
+                logging.info(f"STUDY_FILES: {STUDY_FILES}")
+
+        # After loop, add all collected data to ChromaDB
+        add_study_files_to_chromadb("study_files.json", "study_files_collection")
         message = "Successfully processed items in your zotero library"
     except Exception as e:
         message = f"Error process your zotero library: {str(e)}"
@@ -70,12 +86,24 @@ def process_zotero_library_items(zotero_library_id: str, zotero_api_access_key:
 
 
 def get_rag_pipeline(study_name: str) -> RAGPipeline:
-    """Get or create a RAGPipeline instance for the given study."""
+    """Get or create a RAGPipeline instance for the given study by querying ChromaDB."""
     if study_name not in rag_cache:
-        study_file = STUDY_FILES.get(study_name)
-        if not study_file:
+        # Query ChromaDB for the study file path by ID
+        collection = chromadb_client.get_or_create_collection("study_files_collection")
+        result = collection.get(ids=[study_name])  # Retrieve document by ID
+
+        # Check if the result contains the requested document
+        if not result or len(result['metadatas']) == 0:
             raise ValueError(f"Invalid study name: {study_name}")
+
+        # Extract the file path from the document metadata
+        study_file = result['metadatas'][0].get("file_path")
+        if not study_file:
+            raise ValueError(f"File path not found for study name: {study_name}")
+
+        # Create and cache the RAGPipeline instance
         rag_cache[study_name] = RAGPipeline(study_file)
+
     return rag_cache[study_name]
 
 
@@ -88,6 +116,7 @@ def chat_function(
         return "Please enter a valid query."
 
     rag = get_rag_pipeline(study_name)
+    logging.info(f"rag: ==> {rag}")
     prompt = {
         "Highlight": highlight_prompt,
         "Evidence-based": evidence_based_prompt,
@@ -100,9 +129,19 @@
 def get_study_info(study_name: str) -> str:
     """Retrieve information about the specified study."""
 
-    study_file = STUDY_FILES.get(study_name)
+    collection = chromadb_client.get_or_create_collection("study_files_collection")
+    result = collection.get(ids=[study_name])  # Query by study name (as a list)
+    logging.info(f"Result: ======> {result}")
+
+    # Check if the document exists in the result
+    if not result or len(result['metadatas']) == 0:
+        raise ValueError(f"Invalid study name: {study_name}")
+
+    # Extract the file path from the document metadata
+    study_file = result['metadatas'][0].get("file_path")
+    logging.info(f"study_file: =======> {study_file}")
     if not study_file:
-        return "Invalid study name"
+        raise ValueError(f"File path not found for study name: {study_name}")
 
     with open(study_file, "r") as f:
         data = json.load(f)
@@ -128,6 +167,7 @@ def process_multi_input(text, study_name, prompt_type):
     # Split input based on commas and strip any extra spaces
     variable_list = [word.strip().upper() for word in text.split(',')]
     user_message =f"Extract and present in a tabular format the following variables for each {study_name} study: {', '.join(variable_list)}"
+    logging.info(f"User message: ==> {user_message}")
    response = chat_function(user_message, study_name, prompt_type)
    return response
 
@@ -159,11 +199,24 @@ def create_gr_interface() -> gr.Blocks:
                 zotero_output = gr.Markdown(label="Zotero")
 
                 gr.Markdown("### Study Information")
+
+                # Query ChromaDB for all document IDs in the "study_files_collection" collection
+                collection = chromadb_client.get_or_create_collection("study_files_collection")
+                # Retrieve all documents by querying with an empty string and specifying a high n_results
+                all_documents = collection.query(query_texts=[""], n_results=1000)
+                logging.info(f"all_documents: =========> {all_documents}")
+                # Extract document IDs as study names
+                document_ids = all_documents.get("ids")
+                study_choices = [doc_id for doc_id in document_ids[0] if document_ids]  # Get list of document IDs
+                logging.info(f"study_choices: ======> {study_choices}")
+
+                # Update the Dropdown with choices from ChromaDB
                 study_dropdown = gr.Dropdown(
-                    choices=list(STUDY_FILES.keys()),
+                    choices=study_choices,
                     label="Select Study",
-                    value=list(STUDY_FILES.keys())[0],
+                    value=study_choices[0] if study_choices else None,  # Set first choice as default, if available
                 )
+
                 study_info = gr.Markdown(label="Study Details")
 
                 gr.Markdown("### Settings")
rag/rag_pipeline.py CHANGED
@@ -1,19 +1,27 @@
 import json
-from typing import Dict, Any
+import logging
+from typing import Dict, Any, List
+
 from llama_index.core import Document, VectorStoreIndex
 from llama_index.core.node_parser import SentenceWindowNodeParser, SentenceSplitter
 from llama_index.core import PromptTemplate
-from typing import List
 from llama_index.embeddings.openai import OpenAIEmbedding
 from llama_index.llms.openai import OpenAI
+from llama_index.vector_stores.chroma import ChromaVectorStore
+import chromadb
 
+logging.basicConfig(level=logging.INFO)
 
 class RAGPipeline:
-    def __init__(self, study_json, use_semantic_splitter=False):
+    def __init__(self, study_json, collection_name="study_files_rag_collection", use_semantic_splitter=False):
         self.study_json = study_json
+        self.collection_name = collection_name
         self.use_semantic_splitter = use_semantic_splitter
         self.documents = None
-        self.index = None
+        self.client = chromadb.Client()
+        self.collection = self.client.get_or_create_collection(self.collection_name)
+        # Embed and store each node in ChromaDB
+        self.embedding_model = OpenAIEmbedding(model_name="text-embedding-ada-002")
         self.load_documents()
         self.build_index()
 
@@ -23,44 +31,46 @@ class RAGPipeline:
             self.data = json.load(f)
 
         self.documents = []
-
         for index, doc_data in enumerate(self.data):
             doc_content = (
                 f"Title: {doc_data['title']}\n"
                 f"Abstract: {doc_data['abstract']}\n"
                 f"Authors: {', '.join(doc_data['authors'])}\n"
-                # f"full_text: {doc_data['full_text']}"
             )
 
             metadata = {
                 "title": doc_data.get("title"),
-                "authors": doc_data.get("authors", []),
+                "authors": ", ".join(doc_data.get("authors", [])),
                 "year": doc_data.get("date"),
                 "doi": doc_data.get("doi"),
             }
 
+            # Append document data for use in ChromaDB indexing
             self.documents.append(
                 Document(text=doc_content, id_=f"doc_{index}", metadata=metadata)
             )
 
     def build_index(self):
-        if self.index is None:
-            sentence_splitter = SentenceSplitter(chunk_size=2048, chunk_overlap=20)
+        sentence_splitter = SentenceSplitter(chunk_size=2048, chunk_overlap=20)
 
-            def _split(text: str) -> List[str]:
-                return sentence_splitter.split_text(text)
+        def _split(text: str) -> List[str]:
+            return sentence_splitter.split_text(text)
 
-            node_parser = SentenceWindowNodeParser.from_defaults(
-                sentence_splitter=_split,
-                window_size=5,
-                window_metadata_key="window",
-                original_text_metadata_key="original_text",
-            )
+        node_parser = SentenceWindowNodeParser.from_defaults(
+            sentence_splitter=_split,
+            window_size=5,
+            window_metadata_key="window",
+            original_text_metadata_key="original_text",
+        )
 
-            nodes = node_parser.get_nodes_from_documents(self.documents)
-            self.index = VectorStoreIndex(
-                nodes, embed_model=OpenAIEmbedding(model_name="text-embedding-3-large")
-            )
+        # Parse documents into nodes for embedding
+        nodes = node_parser.get_nodes_from_documents(self.documents)
+
+        # Initialize ChromaVectorStore with the existing collection
+        vector_store = ChromaVectorStore(chroma_collection=self.collection)
+
+        # Create the VectorStoreIndex using the ChromaVectorStore
+        self.index = VectorStoreIndex(nodes, vector_store=vector_store, embed_model=self.embedding_model)
 
     def query(
         self, context: str, prompt_template: PromptTemplate = None
@@ -78,16 +88,18 @@ class RAGPipeline:
                 "If you're unsure about a source, use [?]. "
                 "Ensure that EVERY statement from the context is properly cited."
             )
 
         # This is a hack to index all the documents in the store :)
         n_documents = len(self.index.docstore.docs)
+        print(f"n_documents: {n_documents}")
         query_engine = self.index.as_query_engine(
             text_qa_template=prompt_template,
-            similarity_top_k=n_documents,
+            similarity_top_k=n_documents if n_documents <= 17 else 15,
             response_mode="tree_summarize",
             llm=OpenAI(model="gpt-4o-mini"),
         )
 
+        # Perform the query
         response = query_engine.query(context)
 
         return response
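
For orientation, a minimal usage sketch of the reworked pipeline (not part of this commit): it assumes OPENAI_API_KEY is set in the environment and that data/ebola_virus_zotero_items.json exists, as listed in study_files.json; the question string is only an example.

from rag.rag_pipeline import RAGPipeline

# __init__ loads the study JSON, parses sentence-window nodes, and builds the Chroma-backed index.
pipeline = RAGPipeline("data/ebola_virus_zotero_items.json")

# query() falls back to the default citation prompt template when none is passed.
response = pipeline.query("Summarize the main findings across these studies.")
print(response)
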
rag/rag_pipeline_backup.py ADDED
@@ -0,0 +1,94 @@
+import json
+from typing import Dict, Any
+from llama_index.core import Document, VectorStoreIndex
+from llama_index.core.node_parser import SentenceWindowNodeParser, SentenceSplitter
+from llama_index.core import PromptTemplate
+from typing import List
+from llama_index.embeddings.openai import OpenAIEmbedding
+from llama_index.llms.openai import OpenAI
+
+
+class RAGPipeline:
+    def __init__(self, study_json, use_semantic_splitter=False):
+        self.study_json = study_json
+        self.use_semantic_splitter = use_semantic_splitter
+        self.documents = None
+        self.index = None
+        self.load_documents()
+        self.build_index()
+
+    def load_documents(self):
+        if self.documents is None:
+            with open(self.study_json, "r") as f:
+                self.data = json.load(f)
+
+            self.documents = []
+
+            for index, doc_data in enumerate(self.data):
+                doc_content = (
+                    f"Title: {doc_data['title']}\n"
+                    f"Abstract: {doc_data['abstract']}\n"
+                    f"Authors: {', '.join(doc_data['authors'])}\n"
+                    # f"full_text: {doc_data['full_text']}"
+                )
+
+                metadata = {
+                    "title": doc_data.get("title"),
+                    "authors": doc_data.get("authors", []),
+                    "year": doc_data.get("date"),
+                    "doi": doc_data.get("doi"),
+                }
+
+                self.documents.append(
+                    Document(text=doc_content, id_=f"doc_{index}", metadata=metadata)
+                )
+
+    def build_index(self):
+        if self.index is None:
+            sentence_splitter = SentenceSplitter(chunk_size=2048, chunk_overlap=20)
+
+            def _split(text: str) -> List[str]:
+                return sentence_splitter.split_text(text)
+
+            node_parser = SentenceWindowNodeParser.from_defaults(
+                sentence_splitter=_split,
+                window_size=5,
+                window_metadata_key="window",
+                original_text_metadata_key="original_text",
+            )
+
+            nodes = node_parser.get_nodes_from_documents(self.documents)
+            self.index = VectorStoreIndex(
+                nodes, embed_model=OpenAIEmbedding(model_name="text-embedding-3-large")
+            )
+
+    def query(
+        self, context: str, prompt_template: PromptTemplate = None
+    ) -> Dict[str, Any]:
+        if prompt_template is None:
+            prompt_template = PromptTemplate(
+                "Context information is below.\n"
+                "---------------------\n"
+                "{context_str}\n"
+                "---------------------\n"
+                "Given this information, please answer the question: {query_str}\n"
+                "Provide an answer to the question using evidence from the context above. "
+                "Cite sources using square brackets for EVERY piece of information, e.g. [1], [2], etc. "
+                "Even if there's only one source, still include the citation. "
+                "If you're unsure about a source, use [?]. "
+                "Ensure that EVERY statement from the context is properly cited."
+            )
+
+        # This is a hack to index all the documents in the store :)
+        n_documents = len(self.index.docstore.docs)
+        print(f"n_documents: {n_documents}")
+        query_engine = self.index.as_query_engine(
+            text_qa_template=prompt_template,
+            similarity_top_k=n_documents if n_documents <= 17 else 15,
+            response_mode="tree_summarize",
+            llm=OpenAI(model="gpt-4o-mini"),
+        )
+
+        response = query_engine.query(context)
+
+        return response
requirements.txt CHANGED
@@ -2,6 +2,7 @@ chromadb==0.5.5
 fastapi==0.112.2
 gradio
 llama-index
+llama-index-vector-stores-chroma
 nest-asyncio==1.6.0
 openai
 pandas
study_files.json CHANGED
@@ -2,5 +2,13 @@
     "Vaccine coverage": "data/vaccine_coverage_zotero_items.json",
     "Ebola Virus": "data/ebola_virus_zotero_items.json",
     "GeneXpert": "data/gene_xpert_zotero_items.json",
-    "Zotero Collection Pastan": "data/zotero-collection-pastan_zotero_items.json"
+    "Zotero Collection Pastan": "data/zotero-collection-pastan_zotero_items.json",
+    "Natural resources degradation": "data/natural-resources-degradation_zotero_items.json",
+    "EBSCOhost": "data/ebscohost_zotero_items.json",
+    "ref BMGF": "data/ref-bmgf_zotero_items.json",
+    "scholar (29)": "data/scholar-29_zotero_items.json",
+    "iom": "data/iom_zotero_items.json",
+    "ExportedRis_file_1_of_1 (1)": "data/exportedris-file-1-of-1-1_zotero_items.json",
+    "wb_1813-9450-6689": "data/wb-1813-9450-6689_zotero_items.json",
+    "kayongo papers": "data/kayongo-papers_zotero_items.json"
 }
study_files_backup.json ADDED
@@ -0,0 +1,13 @@
+{
+    "Vaccine coverage": "data/vaccine_coverage_zotero_items.json",
+    "Ebola Virus": "data/ebola_virus_zotero_items.json",
+    "GeneXpert": "data/gene_xpert_zotero_items.json",
+    "Zotero Collection Pastan": "data/zotero-collection-pastan_zotero_items.json",
+    "EBSCOhost": "data/ebscohost_zotero_items.json",
+    "ref BMGF": "data/ref-bmgf_zotero_items.json",
+    "scholar (29)": "data/scholar-29_zotero_items.json",
+    "iom": "data/iom_zotero_items.json",
+    "ExportedRis_file_1_of_1 (1)": "data/exportedris-file-1-of-1-1_zotero_items.json",
+    "wb_1813-9450-6689": "data/wb-1813-9450-6689_zotero_items.json",
+    "kayongo papers": "data/kayongo-papers_zotero_items.json"
+}
utils/helpers.py CHANGED
@@ -8,6 +8,13 @@ from utils.prompts import (
     StudyCharacteristics,
 )
 import json
+import json
+import chromadb
+from chromadb.api.types import Document
+
+# Initialize ChromaDB client
+chromadb_client = chromadb.Client()
+
 
 def read_study_files(file_path):
     """
@@ -165,3 +172,47 @@ def generate_follow_up_questions(
         if cleaned_q:
             cleaned_questions.append(f"✨ {cleaned_q}")
     return cleaned_questions[:3]
+
+
+def add_study_files_to_chromadb(file_path: str, collection_name: str):
+    """
+    Reads the study files data from a JSON file and adds it to the specified ChromaDB collection.
+
+    :param file_path: Path to the JSON file containing study files data.
+    :param collection_name: Name of the ChromaDB collection to store the data.
+    """
+    # Load study files data from JSON file
+    try:
+        with open(file_path, "r") as f:
+            study_files_data = json.load(f)
+    except FileNotFoundError:
+        print(f"File '{file_path}' not found.")
+        return
+
+    # Get or create the collection in ChromaDB
+    collection = chromadb_client.get_or_create_collection(collection_name)
+
+    # Prepare lists for ids, texts, and metadata to batch insert
+    ids = []
+    documents = []
+    metadatas = []
+
+    # Populate lists with data from the JSON file
+    for name, file_path in study_files_data.items():
+        ids.append(name)  # Document ID
+        documents.append("")  # Optional text, can be left empty if not used
+        metadatas.append({"file_path": file_path})  # Metadata with file path
+
+    # Add documents to the collection in batch
+    collection.add(
+        ids=ids,
+        documents=documents,
+        metadatas=metadatas
+    )
+
+    print("All study files have been successfully added to ChromaDB.")
+
+
+if __name__ == "__main__":
+    # Usage example
+    add_study_files_to_chromadb("study_files.json", "study_files_collection")
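
A minimal sketch (not part of this commit) of the round trip app.py relies on: add_study_files_to_chromadb stores each study name as a record ID with its JSON path in the metadata, and get_rag_pipeline / get_study_info later fetch that path back with Collection.get(ids=[...]). "GeneXpert" is used below only because it appears in study_files.json.

from utils.helpers import add_study_files_to_chromadb, chromadb_client

add_study_files_to_chromadb("study_files.json", "study_files_collection")
collection = chromadb_client.get_or_create_collection("study_files_collection")

# Look up a single study by ID, mirroring get_rag_pipeline in app.py.
result = collection.get(ids=["GeneXpert"])
study_file = result["metadatas"][0]["file_path"]
print(study_file)  # data/gene_xpert_zotero_items.json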