"""Streamlit chatbot that answers questions from a Q&A dataset.

The user's question is matched against the dataset's questions with a
ChromaDB similarity query; the answers of the closest matches are passed to
an OpenAI completion model as context, and the matches are shown next to the
generated reply as a reference table.
"""

import os
import string
import uuid
from typing import Dict, List, Union

import chromadb
import numpy as np
import openai
import pandas as pd
import streamlit as st
from datasets import load_dataset
from langchain.document_loaders import TextLoader
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from scipy.spatial.distance import cosine

openai.api_key = os.environ["OPENAI_API_KEY"]

# Hugging Face dataset providing the "questions" and "answers" columns.
DATASET_NAME = "eagle0504/youthless-homeless-shelter-web-scrape-dataset-qa-formatted"
# Number of nearest questions retrieved for every user query.
N_RESULTS = 5
# Matches with a distance below this value are treated as highly relevant.
SPECIAL_THRESHOLD = 0.3

SIDEBAR_INSTRUCTIONS = """
### Instructions:
This app guides you through YSA's website, utilizing a RAG-ready Q&A dataset [here](https://huggingface.co/datasets/eagle0504/youthless-homeless-shelter-web-scrape-dataset-qa-formatted) for chatbot assistance. 🤖 Enter a question, and it finds similar ones in the database, offering answers with a distance score to gauge relevance—the lower the score, the closer the match. 🎯 For better accuracy and to reduce errors, user feedback helps refine the database. ✨
"""


def call_chatgpt(prompt: str) -> str:
    """
    Uses the OpenAI API to generate an AI response to a prompt.

    Args:
        prompt: A string representing the prompt to send to the OpenAI API.

    Returns:
        A string representing the AI's generated response.
    """
    # Use the OpenAI API to generate a response based on the input prompt.
    response = openai.Completion.create(
        model="gpt-3.5-turbo-instruct",
        prompt=prompt,
        temperature=0.5,
        max_tokens=500,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )

    # Only one completion is requested, so the first choice holds the answer.
    return response.choices[0]["text"]


## rag strategy 1
# file_names = [f"output_files/file_{i}.txt" for i in range(131)]
# # file_names = [f"output_files_large/file_{i}.txt" for i in range(1310)]

# # Initialize an empty list to hold all documents
# all_documents = []  # this is just a copy, you don't have to use this

# # Iterate over each file and load its contents
# for file_name in file_names:
#     loader = TextLoader(file_name)
#     documents = loader.load()
#     all_documents.extend(documents)

# # Split the loaded documents into chunks
# text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
# docs = text_splitter.split_documents(all_documents)

# # Create the open-source embedding function
# embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
# # embedding_function = SentenceTransformer("all-MiniLM-L6-v2")
# # embedding_function = openai_text_embedding

# # Load the documents into Chroma
# db = Chroma.from_documents(docs, embedding_function)


## rag strategy 2
@st.cache_resource
def load_knowledge_base():
    """
    Loads the Q&A dataset and indexes its questions in a Chroma collection.

    Streamlit re-executes the whole script on every interaction; caching this
    function keeps the dataset load and the embedding of every question from
    being repeated on each rerun.

    Returns:
        A tuple ``(questions, answers, collection)`` where ``questions`` and
        ``answers`` are lists taken from the dataset's train split and
        ``collection`` is the Chroma collection holding the questions, with
        the row position (as a string) used as the document id.
    """
    dataset = load_dataset(DATASET_NAME)
    # Materialise each column once instead of re-reading it for every lookup.
    questions = list(dataset["train"]["questions"])
    answers = list(dataset["train"]["answers"])

    client = chromadb.Client()
    # A UUID yields a unique collection name without relying on the
    # platform's default NumPy integer width.
    collection = client.create_collection(f"qa_{uuid.uuid4().hex}")
    collection.add(
        ids=[str(i) for i in range(len(questions))],  # IDs are just strings
        documents=questions,  # Questions are what user queries are matched to
        metadatas=[{"type": "support"} for _ in questions],
    )
    return questions, answers, collection


def _build_context(reference_answers) -> str:
    """Joins the retrieved answers into a plain bulleted text block."""
    return "\n".join(f"- {answer}" for answer in reference_answers)


questions, answers, collection = load_knowledge_base()

st.title("Youth Homelessness Chatbot")

# Initialize chat history
if "messages" not in st.session_state:
    st.session_state.messages = []

st.sidebar.markdown(SIDEBAR_INSTRUCTIONS)

# The clear button must be handled before the history is rendered; otherwise
# the old messages are drawn once more on the rerun triggered by the click.
clear_button = st.sidebar.button("Clear Conversation", key="clear")
if clear_button:
    st.session_state.messages = []

# Display chat messages from history on app rerun
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# React to user input
if prompt := st.chat_input("Tell me about YSA"):
    # Display user message in chat message container
    st.chat_message("user").markdown(prompt)
    # Add user message to chat history
    st.session_state.messages.append({"role": "user", "content": prompt})

    question = prompt

    with st.spinner("Wait for it..."):
        # strategy 1
        # docs = db.similarity_search(question)
        # docs_2 = db.similarity_search_with_score(question)
        # docs_2_table = pd.DataFrame(
        #     {
        #         "source": [docs_2[i][0].metadata["source"] for i in range(len(docs))],
        #         "content": [docs_2[i][0].page_content for i in range(len(docs))],
        #         "distances": [docs_2[i][1] for i in range(len(docs))],
        #     }
        # )
        # ref_from_db_search = docs_2_table["content"]

        # strategy 2
        results = collection.query(query_texts=[question], n_results=N_RESULTS)
        # Document ids are the dataset row positions stored as strings.
        idx = [int(i) for i in results["ids"][0]]
        ref = pd.DataFrame(
            {
                "idx": idx,
                "question": [questions[i] for i in idx],
                "answers": [answers[i] for i in idx],
                "distances": results["distances"][0],
            }
        )

        filtered_ref = ref[ref["distances"] < SPECIAL_THRESHOLD]
        if not filtered_ref.empty:
            st.success("There are highly relevant information in our database.")
            final_ref = filtered_ref
        else:
            st.warning(
                "The database may not have relevant information to help your question so please be aware of hallucinations."
            )
            final_ref = ref
        ref_from_db_search = final_ref["answers"]

        # Interpolating the Series itself would embed its printed
        # representation (index labels, dtype footer, shortened values)
        # rather than the answer texts, so the answers are joined explicitly.
        context = _build_context(ref_from_db_search)
        engineered_prompt = (
            f"Based on the context:\n{context}\n\n"
            f"answer the user question: {question}.\n"
            'Answer the question directly (don\'t say "based on the context, ...")'
        )

        answer = call_chatgpt(engineered_prompt)
        response = answer

    # Display assistant response in chat message container
    with st.chat_message("assistant"):
        st.markdown(response)
        with st.expander("See reference:"):
            st.table(final_ref)

    # Add assistant response to chat history
    st.session_state.messages.append({"role": "assistant", "content": response})
    st.session_state.messages.append(
        {"role": "assistant", "content": final_ref.to_json()}
    )