Patrick Walukagga committed
Commit af11e83 · 1 Parent(s): 85bbaed

chroma integration

.gitignore CHANGED
@@ -171,3 +171,6 @@ poetry.toml
 
 # LSP config files
 pyrightconfig.json
+
+# data
+data/
app.py CHANGED
@@ -1,14 +1,14 @@
 import json
 from typing import List, Tuple
 import os
+import logging
 
 import gradio as gr
 from dotenv import load_dotenv
 from slugify import slugify
 
-from config import STUDY_FILES
 from rag.rag_pipeline import RAGPipeline
-from utils.helpers import generate_follow_up_questions, append_to_study_files
+from utils.helpers import generate_follow_up_questions, append_to_study_files, add_study_files_to_chromadb, chromadb_client
 from utils.prompts import (
     highlight_prompt,
     evidence_based_prompt,
@@ -20,9 +20,13 @@ from config import STUDY_FILES, OPENAI_API_KEY
 from utils.zotero_manager import ZoteroManager
 
 load_dotenv()
+logging.basicConfig(level=logging.INFO)
 
 openai.api_key = OPENAI_API_KEY
 
+# After loop, add all collected data to ChromaDB
+add_study_files_to_chromadb("study_files.json", "study_files_collection")
+
 # Cache for RAG pipelines
 rag_cache = {}
 
@@ -47,6 +51,8 @@ def process_zotero_library_items(zotero_library_id: str, zotero_api_access_key:
             zotero_manager.filter_and_return_collections_with_items(zotero_collection_lists)
         )
 
+        study_files_data = {}  # Dictionary to collect items for ChromaDB
+
         for collection in filtered_zotero_collection_lists:
             collection_name = collection.get("name")
             if collection_name not in STUDY_FILES:
@@ -62,6 +68,16 @@ def process_zotero_library_items(zotero_library_id: str, zotero_api_access_key:
                     zotero_items_json, f"data/{export_file}"
                 )
                 append_to_study_files("study_files.json", collection_name, f"data/{export_file}")
+
+                # Collect for ChromaDB
+                study_files_data[collection_name] = f"data/{export_file}"
+
+                # Update in-memory STUDY_FILES for reference in current session
+                STUDY_FILES.update({collection_name: f"data/{export_file}"})
+                logging.info(f"STUDY_FILES: {STUDY_FILES}")
+
+        # After loop, add all collected data to ChromaDB
+        add_study_files_to_chromadb("study_files.json", "study_files_collection")
         message = "Successfully processed items in your zotero library"
     except Exception as e:
         message = f"Error process your zotero library: {str(e)}"
@@ -70,12 +86,24 @@ def process_zotero_library_items(zotero_library_id: str, zotero_api_access_key:
 
 
 def get_rag_pipeline(study_name: str) -> RAGPipeline:
-    """Get or create a RAGPipeline instance for the given study."""
+    """Get or create a RAGPipeline instance for the given study by querying ChromaDB."""
     if study_name not in rag_cache:
-        study_file = STUDY_FILES.get(study_name)
-        if not study_file:
+        # Query ChromaDB for the study file path by ID
+        collection = chromadb_client.get_or_create_collection("study_files_collection")
+        result = collection.get(ids=[study_name])  # Retrieve document by ID
+
+        # Check if the result contains the requested document
+        if not result or len(result['metadatas']) == 0:
             raise ValueError(f"Invalid study name: {study_name}")
+
+        # Extract the file path from the document metadata
+        study_file = result['metadatas'][0].get("file_path")
+        if not study_file:
+            raise ValueError(f"File path not found for study name: {study_name}")
+
+        # Create and cache the RAGPipeline instance
         rag_cache[study_name] = RAGPipeline(study_file)
+
     return rag_cache[study_name]
 
 
@@ -88,6 +116,7 @@ def chat_function(
         return "Please enter a valid query."
 
     rag = get_rag_pipeline(study_name)
+    logging.info(f"rag: ==> {rag}")
     prompt = {
         "Highlight": highlight_prompt,
         "Evidence-based": evidence_based_prompt,
@@ -100,9 +129,19 @@
 def get_study_info(study_name: str) -> str:
     """Retrieve information about the specified study."""
 
-    study_file = STUDY_FILES.get(study_name)
+    collection = chromadb_client.get_or_create_collection("study_files_collection")
+    result = collection.get(ids=[study_name])  # Query by study name (as a list)
+    logging.info(f"Result: ======> {result}")
+
+    # Check if the document exists in the result
+    if not result or len(result['metadatas']) == 0:
+        raise ValueError(f"Invalid study name: {study_name}")
+
+    # Extract the file path from the document metadata
+    study_file = result['metadatas'][0].get("file_path")
+    logging.info(f"study_file: =======> {study_file}")
     if not study_file:
-        return "Invalid study name"
+        raise ValueError(f"File path not found for study name: {study_name}")
 
     with open(study_file, "r") as f:
         data = json.load(f)
@@ -128,6 +167,7 @@ def process_multi_input(text, study_name, prompt_type):
     # Split input based on commas and strip any extra spaces
     variable_list = [word.strip().upper() for word in text.split(',')]
     user_message =f"Extract and present in a tabular format the following variables for each {study_name} study: {', '.join(variable_list)}"
+    logging.info(f"User message: ==> {user_message}")
    response = chat_function(user_message, study_name, prompt_type)
    return response
 
@@ -159,11 +199,24 @@ def create_gr_interface() -> gr.Blocks:
                 zotero_output = gr.Markdown(label="Zotero")
 
                 gr.Markdown("### Study Information")
+
+                # Query ChromaDB for all document IDs in the "study_files_collection" collection
+                collection = chromadb_client.get_or_create_collection("study_files_collection")
+                # Retrieve all documents by querying with an empty string and specifying a high n_results
+                all_documents = collection.query(query_texts=[""], n_results=1000)
+                logging.info(f"all_documents: =========> {all_documents}")
+                # Extract document IDs as study names
+                document_ids = all_documents.get("ids")
+                study_choices = [doc_id for doc_id in document_ids[0] if document_ids]  # Get list of document IDs
+                logging.info(f"study_choices: ======> {study_choices}")
+
+                # Update the Dropdown with choices from ChromaDB
                 study_dropdown = gr.Dropdown(
-                    choices=list(STUDY_FILES.keys()),
+                    choices=study_choices,
                     label="Select Study",
-                    value=list(STUDY_FILES.keys())[0],
+                    value=study_choices[0] if study_choices else None,  # Set first choice as default, if available
                 )
+
                 study_info = gr.Markdown(label="Study Details")
 
                 gr.Markdown("### Settings")
rag/rag_pipeline.py CHANGED
@@ -1,19 +1,27 @@
 import json
-from typing import Dict, Any
+import logging
+from typing import Dict, Any, List
+
 from llama_index.core import Document, VectorStoreIndex
 from llama_index.core.node_parser import SentenceWindowNodeParser, SentenceSplitter
 from llama_index.core import PromptTemplate
-from typing import List
 from llama_index.embeddings.openai import OpenAIEmbedding
 from llama_index.llms.openai import OpenAI
+from llama_index.vector_stores.chroma import ChromaVectorStore
+import chromadb
 
+logging.basicConfig(level=logging.INFO)
 
 class RAGPipeline:
-    def __init__(self, study_json, use_semantic_splitter=False):
+    def __init__(self, study_json, collection_name="study_files_rag_collection", use_semantic_splitter=False):
         self.study_json = study_json
+        self.collection_name = collection_name
         self.use_semantic_splitter = use_semantic_splitter
         self.documents = None
-        self.index = None
+        self.client = chromadb.Client()
+        self.collection = self.client.get_or_create_collection(self.collection_name)
+        # Embed and store each node in ChromaDB
+        self.embedding_model = OpenAIEmbedding(model_name="text-embedding-ada-002")
         self.load_documents()
         self.build_index()
 
@@ -23,44 +31,46 @@ class RAGPipeline:
             self.data = json.load(f)
 
         self.documents = []
-
         for index, doc_data in enumerate(self.data):
             doc_content = (
                 f"Title: {doc_data['title']}\n"
                 f"Abstract: {doc_data['abstract']}\n"
                 f"Authors: {', '.join(doc_data['authors'])}\n"
-                # f"full_text: {doc_data['full_text']}"
             )
 
             metadata = {
                 "title": doc_data.get("title"),
-                "authors": doc_data.get("authors", []),
+                "authors": ", ".join(doc_data.get("authors", [])),
                 "year": doc_data.get("date"),
                 "doi": doc_data.get("doi"),
             }
 
+            # Append document data for use in ChromaDB indexing
             self.documents.append(
                 Document(text=doc_content, id_=f"doc_{index}", metadata=metadata)
             )
 
     def build_index(self):
-        if self.index is None:
-            sentence_splitter = SentenceSplitter(chunk_size=2048, chunk_overlap=20)
+        sentence_splitter = SentenceSplitter(chunk_size=2048, chunk_overlap=20)
 
-            def _split(text: str) -> List[str]:
-                return sentence_splitter.split_text(text)
+        def _split(text: str) -> List[str]:
+            return sentence_splitter.split_text(text)
 
-            node_parser = SentenceWindowNodeParser.from_defaults(
-                sentence_splitter=_split,
-                window_size=5,
-                window_metadata_key="window",
-                original_text_metadata_key="original_text",
-            )
+        node_parser = SentenceWindowNodeParser.from_defaults(
+            sentence_splitter=_split,
+            window_size=5,
+            window_metadata_key="window",
+            original_text_metadata_key="original_text",
+        )
 
-            nodes = node_parser.get_nodes_from_documents(self.documents)
-            self.index = VectorStoreIndex(
-                nodes, embed_model=OpenAIEmbedding(model_name="text-embedding-3-large")
-            )
+        # Parse documents into nodes for embedding
+        nodes = node_parser.get_nodes_from_documents(self.documents)
+
+        # Initialize ChromaVectorStore with the existing collection
+        vector_store = ChromaVectorStore(chroma_collection=self.collection)
+
+        # Create the VectorStoreIndex using the ChromaVectorStore
+        self.index = VectorStoreIndex(nodes, vector_store=vector_store, embed_model=self.embedding_model)
 
     def query(
         self, context: str, prompt_template: PromptTemplate = None
@@ -78,16 +88,18 @@ class RAGPipeline:
                 "If you're unsure about a source, use [?]. "
                 "Ensure that EVERY statement from the context is properly cited."
             )
 
         # This is a hack to index all the documents in the store :)
         n_documents = len(self.index.docstore.docs)
+        print(f"n_documents: {n_documents}")
         query_engine = self.index.as_query_engine(
             text_qa_template=prompt_template,
-            similarity_top_k=n_documents,
+            similarity_top_k=n_documents if n_documents <= 17 else 15,
             response_mode="tree_summarize",
             llm=OpenAI(model="gpt-4o-mini"),
         )
 
+        # Perform the query
         response = query_engine.query(context)
 
         return response
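
For orientation, a minimal usage sketch of the reworked pipeline (not part of this commit): it assumes OPENAI_API_KEY is set in the environment and that data/ebola_virus_zotero_items.json exists, as listed in study_files.json; the question string is only an example.

from rag.rag_pipeline import RAGPipeline

# __init__ loads the study JSON, parses sentence-window nodes, and builds the Chroma-backed index.
pipeline = RAGPipeline("data/ebola_virus_zotero_items.json")

# query() falls back to the default citation prompt template when none is passed.
response = pipeline.query("Summarize the main findings across these studies.")
print(response)
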
rag/rag_pipeline_backup.py ADDED
@@ -0,0 +1,94 @@
+import json
+from typing import Dict, Any
+from llama_index.core import Document, VectorStoreIndex
+from llama_index.core.node_parser import SentenceWindowNodeParser, SentenceSplitter
+from llama_index.core import PromptTemplate
+from typing import List
+from llama_index.embeddings.openai import OpenAIEmbedding
+from llama_index.llms.openai import OpenAI
+
+
+class RAGPipeline:
+    def __init__(self, study_json, use_semantic_splitter=False):
+        self.study_json = study_json
+        self.use_semantic_splitter = use_semantic_splitter
+        self.documents = None
+        self.index = None
+        self.load_documents()
+        self.build_index()
+
+    def load_documents(self):
+        if self.documents is None:
+            with open(self.study_json, "r") as f:
+                self.data = json.load(f)
+
+            self.documents = []
+
+            for index, doc_data in enumerate(self.data):
+                doc_content = (
+                    f"Title: {doc_data['title']}\n"
+                    f"Abstract: {doc_data['abstract']}\n"
+                    f"Authors: {', '.join(doc_data['authors'])}\n"
+                    # f"full_text: {doc_data['full_text']}"
+                )
+
+                metadata = {
+                    "title": doc_data.get("title"),
+                    "authors": doc_data.get("authors", []),
+                    "year": doc_data.get("date"),
+                    "doi": doc_data.get("doi"),
+                }
+
+                self.documents.append(
+                    Document(text=doc_content, id_=f"doc_{index}", metadata=metadata)
+                )
+
+    def build_index(self):
+        if self.index is None:
+            sentence_splitter = SentenceSplitter(chunk_size=2048, chunk_overlap=20)
+
+            def _split(text: str) -> List[str]:
+                return sentence_splitter.split_text(text)
+
+            node_parser = SentenceWindowNodeParser.from_defaults(
+                sentence_splitter=_split,
+                window_size=5,
+                window_metadata_key="window",
+                original_text_metadata_key="original_text",
+            )
+
+            nodes = node_parser.get_nodes_from_documents(self.documents)
+            self.index = VectorStoreIndex(
+                nodes, embed_model=OpenAIEmbedding(model_name="text-embedding-3-large")
+            )
+
+    def query(
+        self, context: str, prompt_template: PromptTemplate = None
+    ) -> Dict[str, Any]:
+        if prompt_template is None:
+            prompt_template = PromptTemplate(
+                "Context information is below.\n"
+                "---------------------\n"
+                "{context_str}\n"
+                "---------------------\n"
+                "Given this information, please answer the question: {query_str}\n"
+                "Provide an answer to the question using evidence from the context above. "
+                "Cite sources using square brackets for EVERY piece of information, e.g. [1], [2], etc. "
+                "Even if there's only one source, still include the citation. "
+                "If you're unsure about a source, use [?]. "
+                "Ensure that EVERY statement from the context is properly cited."
+            )
+
+        # This is a hack to index all the documents in the store :)
+        n_documents = len(self.index.docstore.docs)
+        print(f"n_documents: {n_documents}")
+        query_engine = self.index.as_query_engine(
+            text_qa_template=prompt_template,
+            similarity_top_k=n_documents if n_documents <= 17 else 15,
+            response_mode="tree_summarize",
+            llm=OpenAI(model="gpt-4o-mini"),
+        )
+
+        response = query_engine.query(context)
+
+        return response
requirements.txt CHANGED
@@ -2,6 +2,7 @@ chromadb==0.5.5
 fastapi==0.112.2
 gradio
 llama-index
+llama-index-vector-stores-chroma
 nest-asyncio==1.6.0
 openai
 pandas
study_files.json CHANGED
@@ -2,5 +2,13 @@
     "Vaccine coverage": "data/vaccine_coverage_zotero_items.json",
     "Ebola Virus": "data/ebola_virus_zotero_items.json",
     "GeneXpert": "data/gene_xpert_zotero_items.json",
-    "Zotero Collection Pastan": "data/zotero-collection-pastan_zotero_items.json"
+    "Zotero Collection Pastan": "data/zotero-collection-pastan_zotero_items.json",
+    "Natural resources degradation": "data/natural-resources-degradation_zotero_items.json",
+    "EBSCOhost": "data/ebscohost_zotero_items.json",
+    "ref BMGF": "data/ref-bmgf_zotero_items.json",
+    "scholar (29)": "data/scholar-29_zotero_items.json",
+    "iom": "data/iom_zotero_items.json",
+    "ExportedRis_file_1_of_1 (1)": "data/exportedris-file-1-of-1-1_zotero_items.json",
+    "wb_1813-9450-6689": "data/wb-1813-9450-6689_zotero_items.json",
+    "kayongo papers": "data/kayongo-papers_zotero_items.json"
 }
study_files_backup.json ADDED
@@ -0,0 +1,13 @@
+{
+    "Vaccine coverage": "data/vaccine_coverage_zotero_items.json",
+    "Ebola Virus": "data/ebola_virus_zotero_items.json",
+    "GeneXpert": "data/gene_xpert_zotero_items.json",
+    "Zotero Collection Pastan": "data/zotero-collection-pastan_zotero_items.json",
+    "EBSCOhost": "data/ebscohost_zotero_items.json",
+    "ref BMGF": "data/ref-bmgf_zotero_items.json",
+    "scholar (29)": "data/scholar-29_zotero_items.json",
+    "iom": "data/iom_zotero_items.json",
+    "ExportedRis_file_1_of_1 (1)": "data/exportedris-file-1-of-1-1_zotero_items.json",
+    "wb_1813-9450-6689": "data/wb-1813-9450-6689_zotero_items.json",
+    "kayongo papers": "data/kayongo-papers_zotero_items.json"
+}
utils/helpers.py CHANGED
@@ -8,6 +8,13 @@ from utils.prompts import (
     StudyCharacteristics,
 )
 import json
+import json
+import chromadb
+from chromadb.api.types import Document
+
+# Initialize ChromaDB client
+chromadb_client = chromadb.Client()
+
 
 def read_study_files(file_path):
     """
@@ -165,3 +172,47 @@ def generate_follow_up_questions(
         if cleaned_q:
             cleaned_questions.append(f"✨ {cleaned_q}")
     return cleaned_questions[:3]
+
+
+def add_study_files_to_chromadb(file_path: str, collection_name: str):
+    """
+    Reads the study files data from a JSON file and adds it to the specified ChromaDB collection.
+
+    :param file_path: Path to the JSON file containing study files data.
+    :param collection_name: Name of the ChromaDB collection to store the data.
+    """
+    # Load study files data from JSON file
+    try:
+        with open(file_path, "r") as f:
+            study_files_data = json.load(f)
+    except FileNotFoundError:
+        print(f"File '{file_path}' not found.")
+        return
+
+    # Get or create the collection in ChromaDB
+    collection = chromadb_client.get_or_create_collection(collection_name)
+
+    # Prepare lists for ids, texts, and metadata to batch insert
+    ids = []
+    documents = []
+    metadatas = []
+
+    # Populate lists with data from the JSON file
+    for name, file_path in study_files_data.items():
+        ids.append(name)  # Document ID
+        documents.append("")  # Optional text, can be left empty if not used
+        metadatas.append({"file_path": file_path})  # Metadata with file path
+
+    # Add documents to the collection in batch
+    collection.add(
+        ids=ids,
+        documents=documents,
+        metadatas=metadatas
+    )
+
+    print("All study files have been successfully added to ChromaDB.")
+
+
+if __name__ == "__main__":
+    # Usage example
+    add_study_files_to_chromadb("study_files.json", "study_files_collection")
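
A minimal sketch (not part of this commit) of the round trip app.py relies on: add_study_files_to_chromadb stores each study name as a record ID with its JSON path in the metadata, and get_rag_pipeline / get_study_info later fetch that path back with Collection.get(ids=[...]). "GeneXpert" is used below only because it appears in study_files.json.

from utils.helpers import add_study_files_to_chromadb, chromadb_client

add_study_files_to_chromadb("study_files.json", "study_files_collection")
collection = chromadb_client.get_or_create_collection("study_files_collection")

# Look up a single study by ID, mirroring get_rag_pipeline in app.py.
result = collection.get(ids=["GeneXpert"])
study_file = result["metadatas"][0]["file_path"]
print(study_file)  # data/gene_xpert_zotero_items.json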