import os

import streamlit as st
from utils import load_document, chunk_data, calculate_embedding_cost, create_embeddings, ask_and_get_answer
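
# utils.py is not shown in this file. The interface below is an assumption,
# reconstructed from how the helpers are called further down:
#   load_document(file_name)                    -> loaded document(s) from a pdf/docx/txt/csv file
#   chunk_data(data, chunk_size, chunk_overlap) -> list of text chunks
#   calculate_embedding_cost(chunks)            -> (total_tokens, embedding_cost_in_usd)
#   create_embeddings(chunks)                   -> vector store (e.g. Chroma) built from the chunks
#   ask_and_get_answer(vector_store, q, k)      -> answer string from the LLM using the top-k chunks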


# Streamlit session state persists values (such as the chat history) between reruns;
# this helper clears the stored history. Note: as the application grows, session state will be used more.
def clear_history():
    if 'history' in st.session_state:
        del st.session_state['history']


# run the app only when this file is executed directly, not when it is imported
if __name__ == "__main__":

    # page header; the sidebar below holds the API key, file uploader and chunking settings
    st.subheader('Load a Document and Ask a Question')
    with st.sidebar:
        # text input for the user's OpenAI API key, exported as an environment variable for this session
        api_key = st.text_input('OpenAI API Key:', type='password')
        if api_key:
            os.environ['OPENAI_API_KEY'] = api_key

        # sidebar file uploader widget (drag and drop or browse) for the supported document types
        uploaded_file = st.file_uploader('To upload a file drag and drop it on the area below:', type=['pdf', 'docx', 'txt', 'csv'])

        # chunk size used when splitting the document into pieces for embedding
        chunk_size = st.number_input('Chunk size:', min_value=100, max_value=2048, value=512, on_change=clear_history)

        # chunk overlap between consecutive chunks
        chunk_overlap = st.number_input('Chunk Overlap:', min_value=0, max_value=200, value=20, on_change=clear_history)


        # top-k: number of most similar chunks retrieved per question; a larger k can improve answers but costs more
        k = st.number_input('top-k most salient docs', min_value=1, max_value=20, value=3, on_change=clear_history)

        # sidebar button that triggers reading, chunking and embedding the uploaded file
        add_data = st.button('Add Data', on_click=clear_history)
        # proceed only if the button was clicked, an API key was provided and a file was uploaded
        if add_data:
            if api_key:
                if uploaded_file:  # the user selected a file
                    with st.spinner('Reading, chunking and embedding file ...'):
                        # writing the file from RAM to the current directory on disk
                        bytes_data = uploaded_file.read()
                        file_name = os.path.join('./', uploaded_file.name)
                        with open(file_name, 'wb') as f:
                            f.write(bytes_data)

                        data = load_document(file_name)
                        chunks = chunk_data(data, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
                        st.write(f'Chunk size: {chunk_size}, Chunks: {len(chunks)}')

                        tokens, embedding_cost = calculate_embedding_cost(chunks)
                        st.write(f'Embedding cost: ${embedding_cost:.4f}')

                        # creating the embeddings and returning the Chroma vector store
                        vector_store = create_embeddings(chunks)

                        # saving the vector store in the streamlit session state (to be persistent between reruns)
                        st.session_state.vs = vector_store
                        st.success('File uploaded, chunked and embedded successfully.')
                else:
                    st.error('Please drag and drop a file into the upload area above.')
            else:
                st.error('Please provide your OpenAI API key above.')

    # main input widget: ask a question about the content of the uploaded document
    q = st.text_input('Ask a question about the content of your file:')
    if q:  # run the query if the user entered a question and hit Enter
        if 'vs' in st.session_state:  # only answer if a vector store exists (a file was uploaded, chunked and embedded)
            vector_store = st.session_state.vs
            st.write(f'k: {k}')
            answer = ask_and_get_answer(vector_store, q, k)

            # text area widget for the LLM answer
            st.text_area('LLM Answer: ', value=answer)

            st.divider()

            # initialize the chat history on the first question
            if 'history' not in st.session_state:
                st.session_state.history = ''

            # your question and answer
            value = f'Q: {q} \nA: {answer}'

            st.session_state.history = f'{value} \n {"-" * 100} \n {st.session_state.history}'
            h = st.session_state.history

            # chat history text area widget; the history string itself is kept in st.session_state.history
            st.text_area(label='Chat History', value=h, height=400)
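

# To try the app locally (assuming Streamlit and the other dependencies are installed,
# and that utils.py sits next to this file), run the script with the Streamlit CLI,
# e.g. `streamlit run <this_file>.py`, then open the local URL that Streamlit prints
# (http://localhost:8501 by default).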