Spaces:
Runtime error
Runtime error
fracapuano
commited on
Commit
·
4f5c619
1
Parent(s):
0e17089
fix: bug fixing inheritance
Browse files
qa/qa.py
CHANGED
@@ -5,11 +5,63 @@ from typing import Text, Union
|
|
5 |
|
6 |
multiple_files = True
|
7 |
|
8 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
"""
|
10 |
-
|
11 |
"""
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
def set_openai_api_key(api_key:Text)->bool:
|
15 |
"""Sets the internal OpenAI API key to the given value.
|
@@ -17,15 +69,14 @@ def set_openai_api_key(api_key:Text)->bool:
|
|
17 |
Args:
|
18 |
api_key (Text): OpenAI API key
|
19 |
"""
|
20 |
-
if not (api_key
|
21 |
-
|
22 |
-
return False
|
23 |
|
24 |
st.session_state["OPENAI_API_KEY"] = api_key
|
25 |
st.session_state["api_key_configured"] = True
|
26 |
return True
|
27 |
|
28 |
-
def
|
29 |
"""Converts a file to a document using specialized parsers."""
|
30 |
if file.name.endswith(".pdf"):
|
31 |
doc = parse_pdf(file)
|
@@ -43,14 +94,9 @@ def file_to_doc(file:Union[PDFFile, DocxFile, TxtFile, CodeFile]) -> None:
|
|
43 |
# def document_embedding_pipeline(file:Union[PDFFile, DocxFile, TxtFile, CodeFile]) -> None:
|
44 |
|
45 |
def qa_main():
|
46 |
-
|
47 |
st.write("Just upload something using and start chatting with a version of GPT4 that has read the file!")
|
48 |
|
49 |
-
index = None
|
50 |
-
doc = None
|
51 |
-
|
52 |
-
upload_document_greenlight = False
|
53 |
-
uploaded_processed_document_greenlight = False
|
54 |
# OpenAI API Key - TODO: consider adding a key valid for everyone
|
55 |
# st.header("Configure OpenAI API Key")
|
56 |
# st.warning('Please enter your OpenAI API Key!', icon='⚠️')
|
@@ -63,88 +109,93 @@ def qa_main():
|
|
63 |
# help="You can get your API key from https://platform.openai.com/account/api-keys.",
|
64 |
# value=st.session_state.get("OPENAI_API_KEY", ""),
|
65 |
# )
|
|
|
66 |
user_secret = st.secrets["OPENAI_API_KEY"]
|
67 |
if user_secret:
|
68 |
if set_openai_api_key(user_secret):
|
69 |
-
|
70 |
-
|
|
|
|
|
|
|
71 |
|
72 |
-
if upload_document_greenlight:
|
73 |
# File that needs to be queried
|
74 |
st.header("Upload a file")
|
75 |
-
|
76 |
"Upload a pdf, docx, or txt file (scanned documents not supported)",
|
77 |
type=["pdf", "docx", "txt", "py", "json", "html", "css", "md"],
|
78 |
help="Scanned documents are not supported yet 🥲",
|
79 |
-
|
80 |
-
|
|
|
81 |
)
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
file_doc = file_to_doc(file)
|
91 |
-
# converts the files into a list of documents
|
92 |
-
file_text = text_to_docs(text=tuple(file_doc), file_name=file.name)
|
93 |
-
text.extend(file_text)
|
94 |
|
95 |
-
|
96 |
try:
|
97 |
-
|
98 |
-
|
99 |
-
st.session_state["api_key_configured"] = True
|
100 |
except OpenAIError as e:
|
101 |
st.error("OpenAI error encountered: ", e._message)
|
102 |
-
|
103 |
-
uploaded_processed_document_greenlight = True
|
104 |
-
|
105 |
-
if uploaded_processed_document_greenlight:
|
106 |
-
if "messages" not in st.session_state:
|
107 |
-
st.session_state["messages"] = []
|
108 |
-
|
109 |
-
for message in st.session_state.messages:
|
110 |
-
with st.chat_message(message["role"]):
|
111 |
-
st.markdown(message["content"])
|
112 |
-
|
113 |
-
if prompt := st.chat_input("Ask the document something..."):
|
114 |
-
st.session_state.messages.append({"role": "user", "content": prompt})
|
115 |
-
with st.chat_message("user"):
|
116 |
-
st.markdown(prompt)
|
117 |
-
|
118 |
-
with st.chat_message("assistant"):
|
119 |
-
message_placeholder = st.empty()
|
120 |
-
# retrieving the most relevant sources
|
121 |
-
sources = search_docs(index, prompt)
|
122 |
-
# producing the answer, live
|
123 |
-
full_response = ""
|
124 |
-
for answer_bit in get_answer(sources, prompt)["output_text"]:
|
125 |
-
full_response += answer_bit
|
126 |
-
message_placeholder.markdown(full_response + "▌")
|
127 |
-
|
128 |
-
message_placeholder.markdown(full_response)
|
129 |
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
135 |
|
136 |
-
|
137 |
-
|
|
|
138 |
|
139 |
-
|
140 |
-
|
|
|
|
|
141 |
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
# )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
149 |
|
150 |
-
# qachat("Ask your question here...")
|
|
|
5 |
|
6 |
multiple_files = True
|
7 |
|
8 |
+
def query_pipeline(index:VectorStore, query:Text, stream_answer:bool=False)->Text:
|
9 |
+
"""This function reproduces the querying pipeline considering a given input index."""
|
10 |
+
# retrieving the most relevant pieces of information within the knowledge base
|
11 |
+
sources = search_docs(index, query=query)
|
12 |
+
# getting the answer, all at once
|
13 |
+
answer = get_answer(sources, query=query, stream_answer=stream_answer)["output_text"]
|
14 |
+
|
15 |
+
return answer
|
16 |
+
|
17 |
+
def toggle_process_document():
|
18 |
+
"""Toggles the greenlight for the next step in the pipeline, i.e. processing the document."""
|
19 |
+
if "processing_document_greenlight" not in st.session_state:
|
20 |
+
st.session_state["processing_document_greenlight"] = True
|
21 |
+
|
22 |
+
st.session_state["processing_document_greenlight"] = not st.session_state["processing_document_greenlight"]
|
23 |
+
|
24 |
+
def register_new_file_name(file_name):
|
25 |
"""
|
26 |
+
Registers a new file name in the internal session state.
|
27 |
"""
|
28 |
+
if "uploaded_file_names" not in st.session_state:
|
29 |
+
st.session_state["uploaded_file_names"] = []
|
30 |
+
|
31 |
+
st.session_state["uploaded_file_names"].append(file_name)
|
32 |
+
|
33 |
+
def clear_index():
|
34 |
+
"""
|
35 |
+
Clears the index from the internal session state.
|
36 |
+
This is a non reversible operation.
|
37 |
+
"""
|
38 |
+
if "index" in st.session_state:
|
39 |
+
del globals()["index"]
|
40 |
+
|
41 |
+
def clear_session_state():
|
42 |
+
"""
|
43 |
+
Clears the session state iterating over keys.
|
44 |
+
This is a non reversible operation.
|
45 |
+
"""
|
46 |
+
for k in st.session_state.keys():
|
47 |
+
del st.session_state[k]
|
48 |
+
|
49 |
+
def register_new_file(new_file):
|
50 |
+
"""
|
51 |
+
Registers a new file in the internal session state.
|
52 |
+
"""
|
53 |
+
if "uploaded_files" not in st.session_state:
|
54 |
+
st.session_state["uploaded_files"] = []
|
55 |
+
|
56 |
+
st.session_state["uploaded_files"].extend(new_file)
|
57 |
+
|
58 |
+
def clear_all_files():
|
59 |
+
"""Removes all uploaded files from the interal session state."""
|
60 |
+
st.session_state["uploaded_files"] = []
|
61 |
+
|
62 |
+
def append_uploaded_files(file):
|
63 |
+
"""Appends the uploaded files to the internal session state."""
|
64 |
+
st.session_state.get("uploaded_files", []).extend(file)
|
65 |
|
66 |
def set_openai_api_key(api_key:Text)->bool:
|
67 |
"""Sets the internal OpenAI API key to the given value.
|
|
|
69 |
Args:
|
70 |
api_key (Text): OpenAI API key
|
71 |
"""
|
72 |
+
if not check_openai_api_key(api_key=api_key):
|
73 |
+
raise ValueError("Invalid OpenAI API key! Please provide a valid key.")
|
|
|
74 |
|
75 |
st.session_state["OPENAI_API_KEY"] = api_key
|
76 |
st.session_state["api_key_configured"] = True
|
77 |
return True
|
78 |
|
79 |
+
def parse_file(file:Union[PDFFile, DocxFile, TxtFile, CodeFile]) -> None:
|
80 |
"""Converts a file to a document using specialized parsers."""
|
81 |
if file.name.endswith(".pdf"):
|
82 |
doc = parse_pdf(file)
|
|
|
94 |
# def document_embedding_pipeline(file:Union[PDFFile, DocxFile, TxtFile, CodeFile]) -> None:
|
95 |
|
96 |
def qa_main():
|
97 |
+
"""Main function for the QA app."""
|
98 |
st.write("Just upload something using and start chatting with a version of GPT4 that has read the file!")
|
99 |
|
|
|
|
|
|
|
|
|
|
|
100 |
# OpenAI API Key - TODO: consider adding a key valid for everyone
|
101 |
# st.header("Configure OpenAI API Key")
|
102 |
# st.warning('Please enter your OpenAI API Key!', icon='⚠️')
|
|
|
109 |
# help="You can get your API key from https://platform.openai.com/account/api-keys.",
|
110 |
# value=st.session_state.get("OPENAI_API_KEY", ""),
|
111 |
# )
|
112 |
+
|
113 |
user_secret = st.secrets["OPENAI_API_KEY"]
|
114 |
if user_secret:
|
115 |
if set_openai_api_key(user_secret):
|
116 |
+
# removing this when the OpenAI API key is hardcoded
|
117 |
+
# st.success('OpenAI API key successfully accessed!', icon='✅')
|
118 |
+
|
119 |
+
# greenlight for next step, i.e. uploading the document to chat with
|
120 |
+
st.session_state["upload_document_greenlight"] = True
|
121 |
|
122 |
+
if st.session_state.get("upload_document_greenlight"):
|
123 |
# File that needs to be queried
|
124 |
st.header("Upload a file")
|
125 |
+
st.file_uploader(
|
126 |
"Upload a pdf, docx, or txt file (scanned documents not supported)",
|
127 |
type=["pdf", "docx", "txt", "py", "json", "html", "css", "md"],
|
128 |
help="Scanned documents are not supported yet 🥲",
|
129 |
+
accept_multiple_files=multiple_files,
|
130 |
+
#on_change=toggle_process_document,
|
131 |
+
key="uploaded_file"
|
132 |
)
|
133 |
+
|
134 |
+
documents = {}
|
135 |
+
indexes = {}
|
136 |
+
for file in st.session_state["uploaded_file"]:
|
137 |
+
parsed_file = parse_file(file)
|
138 |
+
# converts the files into a list of documents
|
139 |
+
document = text_to_docs(pages=tuple(parsed_file), file_name=file.name)
|
140 |
+
documents[file.name] = document
|
|
|
|
|
|
|
|
|
141 |
|
142 |
+
with st.spinner(f"Indexing {file.name} (might take some time)"):
|
143 |
try:
|
144 |
+
# indexing the document uploaded
|
145 |
+
indexes[file.name] = embed_docs(file_name=file.name, _docs=tuple(document))
|
|
|
146 |
except OpenAIError as e:
|
147 |
st.error("OpenAI error encountered: ", e._message)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
148 |
|
149 |
+
if len(documents)>1:
|
150 |
+
# documents to be indexed when providing the query
|
151 |
+
st.multiselect(
|
152 |
+
label="Select the documents to be indexed",
|
153 |
+
options=list(documents.keys()),
|
154 |
+
key="multiselect_documents_choices",
|
155 |
+
)
|
156 |
+
|
157 |
+
elif len(documents)==1:
|
158 |
+
st.session_state["multiselect_documents_choices"] = [list(documents.keys())[0]]
|
159 |
+
|
160 |
+
# this is the code that actually performs the chat process
|
161 |
+
if "messages" not in st.session_state: # checking if there is any cache history
|
162 |
+
st.session_state["messages"] = []
|
163 |
|
164 |
+
for message in st.session_state.messages:
|
165 |
+
with st.chat_message(message["role"]):
|
166 |
+
st.markdown(message["content"], unsafe_allow_html=True)
|
167 |
|
168 |
+
if prompt:=st.chat_input("Ask the document something..."):
|
169 |
+
|
170 |
+
if prompt=="1":
|
171 |
+
prompt="What is this document about?"
|
172 |
|
173 |
+
st.session_state.messages.append({"role": "user", "content": prompt})
|
174 |
+
|
175 |
+
with st.chat_message("user"):
|
176 |
+
st.markdown(prompt)
|
177 |
+
|
178 |
+
with st.chat_message("assistant"):
|
179 |
+
# full_response will store every question asked to all the document(s) considered
|
180 |
+
full_response = ""
|
181 |
+
message_placeholder = st.empty()
|
182 |
+
|
183 |
+
# asking the same question to all of the documents considered
|
184 |
+
for chat_document in st.session_state["multiselect_documents_choices"]:
|
185 |
+
# keeping track of what is asked to what document
|
186 |
+
full_response += \
|
187 |
+
f"<i>Asking</i> <b>{chat_document}</b> <i>question</i> <b>{prompt}</b></i><br>"
|
188 |
+
message_placeholder.markdown(full_response, unsafe_allow_html=True)
|
189 |
+
# retrieving the vector store associated to the chat document considered
|
190 |
+
chat_index = indexes[chat_document]
|
191 |
+
# producing the answer considered, live
|
192 |
+
for answer_bit in query_pipeline(chat_index, prompt, stream_answer=True):
|
193 |
+
full_response += answer_bit
|
194 |
+
message_placeholder.markdown(full_response + "▌", unsafe_allow_html=True)
|
195 |
+
# appending a final entering
|
196 |
+
full_response += "<br>"
|
197 |
+
message_placeholder.markdown(full_response, unsafe_allow_html=True)
|
198 |
+
|
199 |
+
# appending the final response obtained after having asked all the documents
|
200 |
+
st.session_state.messages.append({"role": "assistant", "content": full_response})
|
201 |
|
|