Patrick Walukagga committed · Commit af11e83 · 1 Parent(s): 85bbaed

chroma integration

Files changed:
- .gitignore +3 -0
- app.py +62 -9
- rag/rag_pipeline.py +35 -23
- rag/rag_pipeline_backup.py +94 -0
- requirements.txt +1 -0
- study_files.json +9 -1
- study_files_backup.json +13 -0
- utils/helpers.py +51 -0
.gitignore
CHANGED
@@ -171,3 +171,6 @@ poetry.toml
 
 # LSP config files
 pyrightconfig.json
+
+# data
+data/
app.py
CHANGED
@@ -1,14 +1,14 @@
 import json
 from typing import List, Tuple
 import os
+import logging
 
 import gradio as gr
 from dotenv import load_dotenv
 from slugify import slugify
 
-from config import STUDY_FILES
 from rag.rag_pipeline import RAGPipeline
-from utils.helpers import generate_follow_up_questions, append_to_study_files
+from utils.helpers import generate_follow_up_questions, append_to_study_files, add_study_files_to_chromadb, chromadb_client
 from utils.prompts import (
     highlight_prompt,
     evidence_based_prompt,
@@ -20,9 +20,13 @@ from config import STUDY_FILES, OPENAI_API_KEY
 from utils.zotero_manager import ZoteroManager
 
 load_dotenv()
+logging.basicConfig(level=logging.INFO)
 
 openai.api_key = OPENAI_API_KEY
 
+# On startup, load the study files registry into ChromaDB
+add_study_files_to_chromadb("study_files.json", "study_files_collection")
+
 # Cache for RAG pipelines
 rag_cache = {}
 
@@ -47,6 +51,8 @@ def process_zotero_library_items(zotero_library_id: str, zotero_api_access_key:
         zotero_manager.filter_and_return_collections_with_items(zotero_collection_lists)
     )
 
+    study_files_data = {}  # Dictionary to collect items for ChromaDB
+
     for collection in filtered_zotero_collection_lists:
         collection_name = collection.get("name")
         if collection_name not in STUDY_FILES:
@@ -62,6 +68,16 @@ def process_zotero_library_items(zotero_library_id: str, zotero_api_access_key:
                 zotero_items_json, f"data/{export_file}"
             )
             append_to_study_files("study_files.json", collection_name, f"data/{export_file}")
+
+            # Collect for ChromaDB
+            study_files_data[collection_name] = f"data/{export_file}"
+
+            # Update in-memory STUDY_FILES for reference in the current session
+            STUDY_FILES.update({collection_name: f"data/{export_file}"})
+            logging.info(f"STUDY_FILES: {STUDY_FILES}")
+
+        # After the loop, add all collected data to ChromaDB
+        add_study_files_to_chromadb("study_files.json", "study_files_collection")
         message = "Successfully processed items in your Zotero library"
     except Exception as e:
         message = f"Error processing your Zotero library: {str(e)}"
@@ -70,12 +86,24 @@ def process_zotero_library_items(zotero_library_id: str, zotero_api_access_key:
 
 
 def get_rag_pipeline(study_name: str) -> RAGPipeline:
-    """Get or create a RAGPipeline instance for the given study."""
+    """Get or create a RAGPipeline instance for the given study by querying ChromaDB."""
     if study_name not in rag_cache:
-        study_file = STUDY_FILES.get(study_name)
-        if not study_file:
+        # Query ChromaDB for the study file path by ID
+        collection = chromadb_client.get_or_create_collection("study_files_collection")
+        result = collection.get(ids=[study_name])  # Retrieve document by ID
+
+        # Check if the result contains the requested document
+        if not result or len(result['metadatas']) == 0:
             raise ValueError(f"Invalid study name: {study_name}")
+
+        # Extract the file path from the document metadata
+        study_file = result['metadatas'][0].get("file_path")
+        if not study_file:
+            raise ValueError(f"File path not found for study name: {study_name}")
+
+        # Create and cache the RAGPipeline instance
        rag_cache[study_name] = RAGPipeline(study_file)
+
     return rag_cache[study_name]
 
 
@@ -88,6 +116,7 @@ def chat_function(
         return "Please enter a valid query."
 
     rag = get_rag_pipeline(study_name)
+    logging.info(f"rag: ==> {rag}")
     prompt = {
         "Highlight": highlight_prompt,
         "Evidence-based": evidence_based_prompt,
@@ -100,9 +129,19 @@
 def get_study_info(study_name: str) -> str:
     """Retrieve information about the specified study."""
 
-    study_file = STUDY_FILES.get(study_name)
+    collection = chromadb_client.get_or_create_collection("study_files_collection")
+    result = collection.get(ids=[study_name])  # Query by study name (as a list)
+    logging.info(f"Result: ======> {result}")
+
+    # Check if the document exists in the result
+    if not result or len(result['metadatas']) == 0:
+        raise ValueError(f"Invalid study name: {study_name}")
+
+    # Extract the file path from the document metadata
+    study_file = result['metadatas'][0].get("file_path")
+    logging.info(f"study_file: =======> {study_file}")
     if not study_file:
-        raise ValueError(f"Invalid study name: {study_name}")
+        raise ValueError(f"File path not found for study name: {study_name}")
 
     with open(study_file, "r") as f:
         data = json.load(f)
@@ -128,6 +167,7 @@ def process_multi_input(text, study_name, prompt_type):
     # Split input based on commas and strip any extra spaces
     variable_list = [word.strip().upper() for word in text.split(',')]
     user_message = f"Extract and present in a tabular format the following variables for each {study_name} study: {', '.join(variable_list)}"
+    logging.info(f"User message: ==> {user_message}")
     response = chat_function(user_message, study_name, prompt_type)
     return response
 
@@ -159,11 +199,24 @@ def create_gr_interface() -> gr.Blocks:
                 zotero_output = gr.Markdown(label="Zotero")
 
                 gr.Markdown("### Study Information")
+
+                # Query ChromaDB for all document IDs in the "study_files_collection" collection
+                collection = chromadb_client.get_or_create_collection("study_files_collection")
+                # Retrieve all documents by querying with an empty string and a high n_results
+                all_documents = collection.query(query_texts=[""], n_results=1000)
+                logging.info(f"all_documents: =========> {all_documents}")
+                # Extract document IDs as study names
+                document_ids = all_documents.get("ids")
+                study_choices = [doc_id for doc_id in document_ids[0] if document_ids]  # Get list of document IDs
+                logging.info(f"study_choices: ======> {study_choices}")
+
+                # Update the Dropdown with choices from ChromaDB
                 study_dropdown = gr.Dropdown(
-                    choices=list(STUDY_FILES.keys()),
+                    choices=study_choices,
                     label="Select Study",
-                    value=list(STUDY_FILES.keys())[0],
+                    value=study_choices[0] if study_choices else None,  # Default to the first choice, if available
                 )
+
                 study_info = gr.Markdown(label="Study Details")
 
                 gr.Markdown("### Settings")
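Note on the pattern above: app.py now treats a ChromaDB collection as a lookup table from study name (used as the document ID) to study-file path (kept in metadata). A minimal, self-contained sketch of that round trip, assuming chromadb 0.5.x as pinned in requirements.txt (the study name and path are illustrative):

# Sketch of the ID -> file_path lookup pattern app.py relies on.
import chromadb

client = chromadb.Client()  # ephemeral, in-memory client
collection = client.get_or_create_collection("study_files_collection")

# Each study is stored under its name as the document ID,
# with the JSON file path carried in metadata.
collection.add(
    ids=["Ebola Virus"],
    documents=[""],  # placeholder text; only the metadata is used
    metadatas=[{"file_path": "data/ebola_virus_zotero_items.json"}],
)

# get() is a direct ID lookup returning parallel lists; no similarity search runs.
result = collection.get(ids=["Ebola Virus"])
assert result["metadatas"][0]["file_path"] == "data/ebola_virus_zotero_items.json"

Because chromadb.Client() is in-memory, the collection must be rebuilt on every process start, which is why add_study_files_to_chromadb(...) runs at module import. For enumerating all stored studies, as the dropdown code does, collection.get() with no ids returns every record directly; querying with an empty string and a large n_results also works but performs a needless embedding search.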
rag/rag_pipeline.py
CHANGED
@@ -1,19 +1,27 @@
 import json
-from typing import Dict, Any
+import logging
+from typing import Dict, Any, List
+
 from llama_index.core import Document, VectorStoreIndex
 from llama_index.core.node_parser import SentenceWindowNodeParser, SentenceSplitter
 from llama_index.core import PromptTemplate
-from typing import List
 from llama_index.embeddings.openai import OpenAIEmbedding
 from llama_index.llms.openai import OpenAI
+from llama_index.vector_stores.chroma import ChromaVectorStore
+import chromadb
 
+logging.basicConfig(level=logging.INFO)
 
 class RAGPipeline:
-    def __init__(self, study_json, use_semantic_splitter=False):
+    def __init__(self, study_json, collection_name="study_files_rag_collection", use_semantic_splitter=False):
         self.study_json = study_json
+        self.collection_name = collection_name
         self.use_semantic_splitter = use_semantic_splitter
         self.documents = None
-        self.index = None
+        self.client = chromadb.Client()
+        self.collection = self.client.get_or_create_collection(self.collection_name)
+        # Embed and store each node in ChromaDB
+        self.embedding_model = OpenAIEmbedding(model_name="text-embedding-ada-002")
         self.load_documents()
         self.build_index()
 
@@ -23,44 +31,46 @@ class RAGPipeline:
                 self.data = json.load(f)
 
             self.documents = []
-
             for index, doc_data in enumerate(self.data):
                 doc_content = (
                     f"Title: {doc_data['title']}\n"
                     f"Abstract: {doc_data['abstract']}\n"
                     f"Authors: {', '.join(doc_data['authors'])}\n"
-                    # f"full_text: {doc_data['full_text']}"
                 )
 
                 metadata = {
                     "title": doc_data.get("title"),
-                    "authors": doc_data.get("authors", []),
+                    "authors": ", ".join(doc_data.get("authors", [])),
                     "year": doc_data.get("date"),
                     "doi": doc_data.get("doi"),
                 }
 
+                # Append document data for use in ChromaDB indexing
                 self.documents.append(
                     Document(text=doc_content, id_=f"doc_{index}", metadata=metadata)
                 )
 
     def build_index(self):
-        if self.index is None:
-            sentence_splitter = SentenceSplitter(chunk_size=2048, chunk_overlap=20)
+        sentence_splitter = SentenceSplitter(chunk_size=2048, chunk_overlap=20)
 
-            def _split(text: str) -> List[str]:
-                return sentence_splitter.split_text(text)
+        def _split(text: str) -> List[str]:
+            return sentence_splitter.split_text(text)
 
-            node_parser = SentenceWindowNodeParser.from_defaults(
-                sentence_splitter=_split,
-                window_size=5,
-                window_metadata_key="window",
-                original_text_metadata_key="original_text",
-            )
+        node_parser = SentenceWindowNodeParser.from_defaults(
+            sentence_splitter=_split,
+            window_size=5,
+            window_metadata_key="window",
+            original_text_metadata_key="original_text",
+        )
 
-            nodes = node_parser.get_nodes_from_documents(self.documents)
-            self.index = VectorStoreIndex(
-                nodes, embed_model=OpenAIEmbedding(model_name="text-embedding-3-large")
-            )
+        # Parse documents into nodes for embedding
+        nodes = node_parser.get_nodes_from_documents(self.documents)
+
+        # Initialize ChromaVectorStore with the existing collection
+        vector_store = ChromaVectorStore(chroma_collection=self.collection)
+
+        # Create the VectorStoreIndex using the ChromaVectorStore
+        self.index = VectorStoreIndex(nodes, vector_store=vector_store, embed_model=self.embedding_model)
 
     def query(
         self, context: str, prompt_template: PromptTemplate = None
@@ -78,16 +88,18 @@ class RAGPipeline:
                 "If you're unsure about a source, use [?]. "
                 "Ensure that EVERY statement from the context is properly cited."
             )
 
         # This is a hack to index all the documents in the store :)
         n_documents = len(self.index.docstore.docs)
+        print(f"n_documents: {n_documents}")
        query_engine = self.index.as_query_engine(
             text_qa_template=prompt_template,
-            similarity_top_k=n_documents,
+            similarity_top_k=n_documents if n_documents <= 17 else 15,
             response_mode="tree_summarize",
             llm=OpenAI(model="gpt-4o-mini"),
         )
 
+        # Perform the query
         response = query_engine.query(context)
 
         return response
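Note on build_index above: the new code passes vector_store= directly to VectorStoreIndex. In the llama-index releases this editor is aware of, a custom vector store is attached through a StorageContext, and an unrecognized vector_store keyword may be silently ignored, in which case the nodes land in the default in-memory store and the Chroma collection stays empty. A minimal sketch of the StorageContext wiring, assuming the packages pinned in requirements.txt and an OpenAI API key for embeddings (document contents are illustrative):

# Sketch: attach the Chroma collection to the index through a StorageContext.
import chromadb
from llama_index.core import Document, StorageContext, VectorStoreIndex
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore

client = chromadb.Client()
chroma_collection = client.get_or_create_collection("study_files_rag_collection")

vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

docs = [Document(text="Title: Example study\nAbstract: ...", id_="doc_0")]
index = VectorStoreIndex.from_documents(
    docs,
    storage_context=storage_context,
    embed_model=OpenAIEmbedding(model_name="text-embedding-ada-002"),
)

Worth verifying against the pinned versions that embeddings actually persist in the Chroma collection after indexing.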
rag/rag_pipeline_backup.py
ADDED
@@ -0,0 +1,94 @@
+import json
+from typing import Dict, Any
+from llama_index.core import Document, VectorStoreIndex
+from llama_index.core.node_parser import SentenceWindowNodeParser, SentenceSplitter
+from llama_index.core import PromptTemplate
+from typing import List
+from llama_index.embeddings.openai import OpenAIEmbedding
+from llama_index.llms.openai import OpenAI
+
+
+class RAGPipeline:
+    def __init__(self, study_json, use_semantic_splitter=False):
+        self.study_json = study_json
+        self.use_semantic_splitter = use_semantic_splitter
+        self.documents = None
+        self.index = None
+        self.load_documents()
+        self.build_index()
+
+    def load_documents(self):
+        if self.documents is None:
+            with open(self.study_json, "r") as f:
+                self.data = json.load(f)
+
+            self.documents = []
+
+            for index, doc_data in enumerate(self.data):
+                doc_content = (
+                    f"Title: {doc_data['title']}\n"
+                    f"Abstract: {doc_data['abstract']}\n"
+                    f"Authors: {', '.join(doc_data['authors'])}\n"
+                    # f"full_text: {doc_data['full_text']}"
+                )
+
+                metadata = {
+                    "title": doc_data.get("title"),
+                    "authors": doc_data.get("authors", []),
+                    "year": doc_data.get("date"),
+                    "doi": doc_data.get("doi"),
+                }
+
+                self.documents.append(
+                    Document(text=doc_content, id_=f"doc_{index}", metadata=metadata)
+                )
+
+    def build_index(self):
+        if self.index is None:
+            sentence_splitter = SentenceSplitter(chunk_size=2048, chunk_overlap=20)
+
+            def _split(text: str) -> List[str]:
+                return sentence_splitter.split_text(text)
+
+            node_parser = SentenceWindowNodeParser.from_defaults(
+                sentence_splitter=_split,
+                window_size=5,
+                window_metadata_key="window",
+                original_text_metadata_key="original_text",
+            )
+
+            nodes = node_parser.get_nodes_from_documents(self.documents)
+            self.index = VectorStoreIndex(
+                nodes, embed_model=OpenAIEmbedding(model_name="text-embedding-3-large")
+            )
+
+    def query(
+        self, context: str, prompt_template: PromptTemplate = None
+    ) -> Dict[str, Any]:
+        if prompt_template is None:
+            prompt_template = PromptTemplate(
+                "Context information is below.\n"
+                "---------------------\n"
+                "{context_str}\n"
+                "---------------------\n"
+                "Given this information, please answer the question: {query_str}\n"
+                "Provide an answer to the question using evidence from the context above. "
+                "Cite sources using square brackets for EVERY piece of information, e.g. [1], [2], etc. "
+                "Even if there's only one source, still include the citation. "
+                "If you're unsure about a source, use [?]. "
+                "Ensure that EVERY statement from the context is properly cited."
+            )
+
+        # This is a hack to index all the documents in the store :)
+        n_documents = len(self.index.docstore.docs)
+        print(f"n_documents: {n_documents}")
+        query_engine = self.index.as_query_engine(
+            text_qa_template=prompt_template,
+            similarity_top_k=n_documents if n_documents <= 17 else 15,
+            response_mode="tree_summarize",
+            llm=OpenAI(model="gpt-4o-mini"),
+        )
+
+        response = query_engine.query(context)
+
+        return response
requirements.txt
CHANGED
@@ -2,6 +2,7 @@ chromadb==0.5.5
 fastapi==0.112.2
 gradio
 llama-index
+llama-index-vector-stores-chroma
 nest-asyncio==1.6.0
 openai
 pandas
study_files.json
CHANGED
@@ -2,5 +2,13 @@
     "Vaccine coverage": "data/vaccine_coverage_zotero_items.json",
     "Ebola Virus": "data/ebola_virus_zotero_items.json",
     "GeneXpert": "data/gene_xpert_zotero_items.json",
-    "Zotero Collection Pastan": "data/zotero-collection-pastan_zotero_items.json"
+    "Zotero Collection Pastan": "data/zotero-collection-pastan_zotero_items.json",
+    "Natural resources degradation": "data/natural-resources-degradation_zotero_items.json",
+    "EBSCOhost": "data/ebscohost_zotero_items.json",
+    "ref BMGF": "data/ref-bmgf_zotero_items.json",
+    "scholar (29)": "data/scholar-29_zotero_items.json",
+    "iom": "data/iom_zotero_items.json",
+    "ExportedRis_file_1_of_1 (1)": "data/exportedris-file-1-of-1-1_zotero_items.json",
+    "wb_1813-9450-6689": "data/wb-1813-9450-6689_zotero_items.json",
+    "kayongo papers": "data/kayongo-papers_zotero_items.json"
 }
study_files_backup.json
ADDED
@@ -0,0 +1,13 @@
+{
+    "Vaccine coverage": "data/vaccine_coverage_zotero_items.json",
+    "Ebola Virus": "data/ebola_virus_zotero_items.json",
+    "GeneXpert": "data/gene_xpert_zotero_items.json",
+    "Zotero Collection Pastan": "data/zotero-collection-pastan_zotero_items.json",
+    "EBSCOhost": "data/ebscohost_zotero_items.json",
+    "ref BMGF": "data/ref-bmgf_zotero_items.json",
+    "scholar (29)": "data/scholar-29_zotero_items.json",
+    "iom": "data/iom_zotero_items.json",
+    "ExportedRis_file_1_of_1 (1)": "data/exportedris-file-1-of-1-1_zotero_items.json",
+    "wb_1813-9450-6689": "data/wb-1813-9450-6689_zotero_items.json",
+    "kayongo papers": "data/kayongo-papers_zotero_items.json"
+}
utils/helpers.py
CHANGED
@@ -8,6 +8,13 @@ from utils.prompts import (
     StudyCharacteristics,
 )
 import json
+import json
+import chromadb
+from chromadb.api.types import Document
+
+# Initialize ChromaDB client
+chromadb_client = chromadb.Client()
+
 
 def read_study_files(file_path):
     """
@@ -165,3 +172,47 @@ def generate_follow_up_questions(
         if cleaned_q:
             cleaned_questions.append(f"✨ {cleaned_q}")
     return cleaned_questions[:3]
+
+
+def add_study_files_to_chromadb(file_path: str, collection_name: str):
+    """
+    Reads the study files data from a JSON file and adds it to the specified ChromaDB collection.
+
+    :param file_path: Path to the JSON file containing study files data.
+    :param collection_name: Name of the ChromaDB collection to store the data.
+    """
+    # Load study files data from JSON file
+    try:
+        with open(file_path, "r") as f:
+            study_files_data = json.load(f)
+    except FileNotFoundError:
+        print(f"File '{file_path}' not found.")
+        return
+
+    # Get or create the collection in ChromaDB
+    collection = chromadb_client.get_or_create_collection(collection_name)
+
+    # Prepare lists for ids, texts, and metadata to batch insert
+    ids = []
+    documents = []
+    metadatas = []
+
+    # Populate lists with data from the JSON file
+    for name, file_path in study_files_data.items():
+        ids.append(name)  # Document ID
+        documents.append("")  # Optional text, can be left empty if not used
+        metadatas.append({"file_path": file_path})  # Metadata with file path
+
+    # Add documents to the collection in batch
+    collection.add(
+        ids=ids,
+        documents=documents,
+        metadatas=metadatas
+    )
+
+    print("All study files have been successfully added to ChromaDB.")
+
+
+if __name__ == "__main__":
+    # Usage example
+    add_study_files_to_chromadb("study_files.json", "study_files_collection")
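Note on the helper above: collection.add is insert-only, so calling add_study_files_to_chromadb twice in one process with the same study names will not refresh the stored file paths (ChromaDB rejects or skips duplicate IDs, depending on version). Incidentally, the loop reuses the file_path parameter name as its loop variable, which is legal but easy to misread. If re-runs are expected, a variant built on upsert, which overwrites existing IDs, may be safer; a minimal sketch under that assumption (upsert_study_files is a hypothetical name, not part of this commit):

# Sketch: idempotent variant of the batch insert, using upsert instead of add.
import json

import chromadb

chromadb_client = chromadb.Client()


def upsert_study_files(file_path: str, collection_name: str) -> None:
    """Like add_study_files_to_chromadb, but overwrites records whose IDs already exist."""
    with open(file_path, "r") as f:
        study_files_data = json.load(f)

    collection = chromadb_client.get_or_create_collection(collection_name)
    collection.upsert(
        ids=list(study_files_data.keys()),
        documents=[""] * len(study_files_data),
        metadatas=[{"file_path": path} for path in study_files_data.values()],
    )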