File size: 4,756 Bytes
51fe9d2
 
0489db2
 
 
 
51fe9d2
 
0489db2
 
 
 
 
7a7c4d5
0489db2
51fe9d2
0489db2
 
 
7a7c4d5
 
 
 
51fe9d2
0489db2
7a7c4d5
51fe9d2
0489db2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51fe9d2
0489db2
 
 
 
51fe9d2
 
 
0489db2
 
7a7c4d5
0489db2
 
 
 
 
 
 
 
7a7c4d5
 
 
0489db2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51fe9d2
0489db2
 
 
 
 
 
51fe9d2
0489db2
 
51fe9d2
0489db2
 
 
51fe9d2
0489db2
 
 
 
51fe9d2
0489db2
 
 
 
 
7a7c4d5
 
 
 
 
 
 
 
0489db2
7a7c4d5
 
51fe9d2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import streamlit as st
from openai.error import OpenAIError
from .utils import *
from typing import Text, Union

# Forwarded as `accept_multiple_files` to the file uploader in qa_main.
# NOTE(review): qa_main treats the upload as a single file object, so switching
# this to True presumably requires adapting the parsing code - confirm first.
multiple_files = False

def clear_submit():
    """
    Marks the session as having no submitted file.

    Writes False into the "file_submitted" entry of the Streamlit session state.
    """
    session = st.session_state
    session["file_submitted"] = False

def set_openai_api_key(api_key:Text)->bool:
    """Validates the format of an OpenAI API key and stores it in the session state.

    Only the shape of the key is checked (no request is sent to OpenAI): after
    trimming surrounding whitespace it must start with 'sk-' and be at least 51
    characters long. On success the key is saved under "OPENAI_API_KEY" and
    "api_key_configured" is set to True; on failure an error is displayed and
    the session state is left untouched.

    Args:
        api_key (Text): OpenAI API key, as typed or pasted by the user.

    Returns:
        bool: True if the key passed the format check and was stored, False otherwise.
    """
    # pasted keys often carry stray spaces or a trailing newline; non-string input is treated as invalid
    api_key = api_key.strip() if isinstance(api_key, str) else ""

    # 51 characters is the legacy key length; newer formats (e.g. 'sk-proj-...') are longer,
    # so the length is treated as a lower bound rather than an exact match.
    if not (api_key.startswith("sk-") and len(api_key) >= 51):
        st.error("Invalid OpenAI API key! Please provide a valid key.")
        return False

    st.session_state["OPENAI_API_KEY"] = api_key
    st.session_state["api_key_configured"] = True
    return True

def file_to_doc(file:Union[PDFFile, DocxFile, TxtFile, CodeFile]):
    """Converts a file to a document using specialized parsers.

    The parser is selected from the extension of `file.name` (case-insensitive):
    '.pdf' -> parse_pdf, '.docx' -> parse_docx, and the plain-text/code
    extensions listed below -> parse_txt.

    Args:
        file: uploaded file object exposing a `name` attribute.

    Returns:
        Whatever the selected parser returns, or None when the extension is not
        supported (an error is shown to the user in that case).
    """
    # extensions handled by the plain-text parser
    text_extensions = (".txt", ".py", ".json", ".html", ".css", ".md")
    # compare on the lower-cased name so that e.g. 'REPORT.PDF' is recognised too
    file_name = file.name.lower()

    if file_name.endswith(".pdf"):
        doc = parse_pdf(file)
    elif file_name.endswith(".docx"):
        doc = parse_docx(file)
    # endswith accepts a tuple, which also copes with names containing several dots
    elif file_name.endswith(text_extensions):
        doc = parse_txt(file)
    else:
        st.error("File type not yet supported! Supported files: [.pdf, .docx, .txt, .py, .json, .html, .css, .md]")
        doc = None

    return doc

# this function can be used to define a single doc processing pipeline
# def document_embedding_pipeline(file:Union[PDFFile, DocxFile, TxtFile, CodeFile]) -> None:  

def qa_main():
    """Renders the "chat with a file" page.

    The page is made of three parts, drawn top to bottom:
      1. a password input for the OpenAI API key, validated through set_openai_api_key;
      2. a file uploader whose content is parsed with file_to_doc, split with
         text_to_docs and indexed with embed_docs;
      3. a chat interface whose history is kept in st.session_state["messages"];
         each question is answered with get_answer over the sources returned
         by search_docs.

    Nothing is returned: the function only draws widgets and updates the
    Streamlit session state.
    """
    st.markdown("<h2>This app allows to chat with files!</h2>", unsafe_allow_html=True)
    st.write("Just upload something using and start chatting with a version of GPT4 that has read the file!")

    # stays None until a file has been parsed and indexed successfully
    index = None
    doc = None

    # OpenAI API Key - TODO: consider adding a key valid for everyone
    st.header("Configure OpenAI API Key")
    st.warning('Please enter your OpenAI API Key!', icon='⚠️')
    user_secret = st.text_input(
        "Insert your OpenAI API key here ([get your API key](https://platform.openai.com/account/api-keys)).",
        type="password",
        placeholder="Paste your OpenAI API key here (sk-...)",
        help="You can get your API key from https://platform.openai.com/account/api-keys.",
        value=st.session_state.get("OPENAI_API_KEY", ""),
    )
    if user_secret:
        if set_openai_api_key(user_secret):
            st.success('OpenAI API key successfully provided!', icon='✅')

    # File that needs to be queried
    st.header("Upload a file")
    uploaded_file = st.file_uploader(
        "Upload a pdf, docx, or txt file (scanned documents not supported)",
        type=["pdf", "docx", "txt", "py", "json", "html", "css", "md"],
        help="Scanned documents are not supported yet 🥲",
        on_change=clear_submit,
        accept_multiple_files=multiple_files,
    )

    # reading the uploaded file
    if uploaded_file is not None:
        # toggle internal file submission state to True
        st.session_state["file_submitted"] = True
        # parse the file using custom parsers
        doc = file_to_doc(uploaded_file)
        # file_to_doc returns None (and reports the problem itself) for unsupported
        # files: skip indexing in that case instead of crashing on tuple(None)
        if doc is not None:
            # converts the files into a list of documents
            # NOTE(review): the tuple() conversions presumably make the arguments
            # hashable for caching inside the helpers - confirm against utils
            text = text_to_docs(text=tuple(doc))

            try:
                with st.spinner("Indexing the document... This might take a while!"):
                    index = embed_docs(tuple(text))
                    st.session_state["api_key_configured"] = True
            except OpenAIError as e:
                # st.error accepts a single positional body, so the message is formatted into it
                st.error(f"OpenAI error encountered: {e}")

    if "messages" not in st.session_state:
        st.session_state["messages"] = []

    # replay the conversation so far (the script is re-executed on every interaction)
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    if prompt := st.chat_input("Ask the document something..."):
        st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("user"):
            st.markdown(prompt)

        with st.chat_message("assistant"):
            message_placeholder = st.empty()
            if index is None:
                # no document has been indexed (nothing uploaded, unsupported file or
                # failed embedding): answer with a hint instead of searching a missing index
                full_response = "Please upload a supported file before asking questions about it."
            else:
                # retrieving the most relevant sources
                sources = search_docs(index, prompt)
                # producing the answer, live: the text is revealed piece by piece with a cursor
                full_response = ""
                for answer_bit in get_answer(sources, prompt)["output_text"]:
                    full_response += answer_bit
                    message_placeholder.markdown(full_response + "▌")

            # final render without the trailing cursor
            message_placeholder.markdown(full_response)

        st.session_state.messages.append({"role": "assistant", "content": full_response})