eagle0504 committed
Commit 38a30d6
1 Parent(s): dff518b

Update app.py

Files changed (1)
  1. app.py +61 -27
app.py CHANGED
@@ -120,31 +120,47 @@ def convert_to_list_of_dict(df: pd.DataFrame) -> List[Dict[str, str]]:
      return result


- file_names = [f"output_files/file_{i}.txt" for i in range(131)]
- # file_names = [f"output_files_large/file_{i}.txt" for i in range(1310)]
-
-
- # Initialize an empty list to hold all documents
- all_documents = []  # this is just a copy, you don't have to use this
-
- # Iterate over each file and load its contents
- for file_name in file_names:
-     loader = TextLoader(file_name)
-     documents = loader.load()
-     all_documents.extend(documents)
-
- # Split the loaded documents into chunks
- text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
- docs = text_splitter.split_documents(all_documents)
-
- # Create the open-source embedding function
- embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
- # embedding_function = SentenceTransformer("all-MiniLM-L6-v2")
- # embedding_function = openai_text_embedding
-
- # Load the documents into Chroma
- db = Chroma.from_documents(docs, embedding_function)
+ ## rag strategy 1
+ # file_names = [f"output_files/file_{i}.txt" for i in range(131)]
+ # # file_names = [f"output_files_large/file_{i}.txt" for i in range(1310)]
+
+ # # Initialize an empty list to hold all documents
+ # all_documents = []  # this is just a copy, you don't have to use this
+
+ # # Iterate over each file and load its contents
+ # for file_name in file_names:
+ #     loader = TextLoader(file_name)
+ #     documents = loader.load()
+ #     all_documents.extend(documents)
+
+ # # Split the loaded documents into chunks
+ # text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+ # docs = text_splitter.split_documents(all_documents)
+
+ # # Create the open-source embedding function
+ # embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+ # # embedding_function = SentenceTransformer("all-MiniLM-L6-v2")
+ # # embedding_function = openai_text_embedding
+
+ # # Load the documents into Chroma
+ # db = Chroma.from_documents(docs, embedding_function)
+
+ ## rag strategy 2
+ from datasets import load_dataset
+ dataset = load_dataset("eagle0504/youthless-homeless-shelter-web-scrape-dataset-qa-formatted")
+
+ import chromadb
+ client = chromadb.Client()
+ collection = client.create_collection("vector_database")
+
+ # Embed and store the first N supports for this demo
+ L = len(dataset["train"]['questions'])
+ collection.add(
+     ids=[str(i) for i in range(0, L)],  # IDs are just strings
+     documents=dataset["train"]['questions'],  # Enter questions here
+     metadatas=[{"type": "support"} for _ in range(0, L)],
+ )

  st.title("Youth Homelessness Chatbot")
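
Note: this commit swaps the app's RAG ingestion from "strategy 1" (embed 131 local text files via LangChain's TextLoader, CharacterTextSplitter, and Chroma.from_documents) to "strategy 2" (index a pre-formatted Q&A dataset from the Hugging Face Hub directly in a chromadb collection). Since create_collection is called without an embedding_function, chromadb falls back to its bundled default embedding model (an ONNX build of all-MiniLM-L6-v2), so the embeddings stay comparable to the SentenceTransformer model used before. Below is a minimal self-contained sketch of the new ingestion path, assuming the dataset's train split exposes "questions" and "answers" columns as the diff implies:

import chromadb
from datasets import load_dataset

# Assumption: the public dataset has "questions"/"answers" columns in its train split.
dataset = load_dataset("eagle0504/youthless-homeless-shelter-web-scrape-dataset-qa-formatted")
questions = dataset["train"]["questions"]

client = chromadb.Client()  # in-memory: the index is rebuilt on every app restart
collection = client.create_collection("vector_database")  # default embedding function

# chromadb requires string IDs; the integer row index doubles as the key used
# later to look matching answers back up in the dataset.
collection.add(
    ids=[str(i) for i in range(len(questions))],
    documents=list(questions),
    metadatas=[{"type": "support"} for _ in questions],
)
print(collection.count())  # should equal len(questions)

Because chromadb.Client() is in-memory, every restart re-embeds the whole dataset; chromadb's PersistentClient (or an index prebuilt offline) would avoid that startup cost.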
 
@@ -174,16 +190,34 @@ if prompt := st.chat_input("Tell me about YSA"):
      question = prompt

      with st.spinner("Wait for it..."):
-         docs = db.similarity_search(question)
-         docs_2 = db.similarity_search_with_score(question)
-         docs_2_table = pd.DataFrame(
+         # strategy 1
+         # docs = db.similarity_search(question)
+         # docs_2 = db.similarity_search_with_score(question)
+         # docs_2_table = pd.DataFrame(
+         #     {
+         #         "source": [docs_2[i][0].metadata["source"] for i in range(len(docs))],
+         #         "content": [docs_2[i][0].page_content for i in range(len(docs))],
+         #         "distances": [docs_2[i][1] for i in range(len(docs))],
+         #     }
+         # )
+         # ref_from_db_search = docs_2_table["content"]
+
+         # strategy 2
+         results = collection.query(
+             query_texts=question,
+             n_results=5
+         )
+         idx = results["ids"][0]
+         idx = [int(i) for i in idx]
+         ref = pd.DataFrame(
              {
-                 "source": [docs_2[i][0].metadata["source"] for i in range(len(docs))],
-                 "content": [docs_2[i][0].page_content for i in range(len(docs))],
-                 "distances": [docs_2[i][1] for i in range(len(docs))],
+                 "idx": idx,
+                 "question": [dataset["train"]['questions'][i] for i in idx],
+                 "answers": [dataset["train"]['answers'][i] for i in idx],
+                 "distances": results["distances"][0]
              }
          )
-         ref_from_db_search = docs_2_table["content"]
+         ref_from_db_search = ref["answers"]

          engineered_prompt = f"""
          Based on the context: {ref_from_db_search},
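
Note: collection.query returns parallel lists of lists, one inner list per query text, which is why every field is read at index [0]; IDs come back as the strings stored at add() time and are cast to ints so they can index the dataset rows. A short sketch of that result shape, reusing the collection and dataset from the ingestion sketch above (the query string is illustrative):

# One query string in, so each result field holds exactly one inner list.
results = collection.query(query_texts="How can YSA help with housing?", n_results=5)

idx = [int(i) for i in results["ids"][0]]  # string IDs -> dataset row numbers
distances = results["distances"][0]        # L2 by default; smaller means closer

answers = [dataset["train"]["answers"][i] for i in idx]
for row, dist, ans in zip(idx, distances, answers):
    print(f"{row:>4}  {dist:.3f}  {ans[:60]}")

One quirk worth flagging: ref_from_db_search = ref["answers"] is a pandas Series, so the engineered_prompt f-string interpolates its full repr, index numbers and dtype included; something like " ".join(ref["answers"]) would hand the model cleaner context.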
@@ -199,9 +233,9 @@ if prompt := st.chat_input("Tell me about YSA"):
      with st.spinner("Wait for it..."):
          st.markdown(response)
      with st.expander("See reference:"):
-         st.table(docs_2_table)
+         st.table(ref)
      # Add assistant response to chat history
      st.session_state.messages.append({"role": "assistant", "content": response})
      st.session_state.messages.append(
-         {"role": "assistant", "content": docs_2_table.to_json()}
+         {"role": "assistant", "content": ref.to_json()}
      )
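
Note: the reference table is appended to the chat history as raw JSON (ref.to_json()), so a rerun replays it as a JSON string unless the history loop rehydrates it. A possible sketch of such a render loop; the loop itself is an assumption, not part of this commit:

import io

import pandas as pd
import streamlit as st

for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        try:
            # Messages stored via ref.to_json() round-trip back into a table.
            st.table(pd.read_json(io.StringIO(message["content"])))
        except ValueError:
            st.markdown(message["content"])  # ordinary text turns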
 