Spaces:
Sleeping
Sleeping
th8m0z
committed on
Commit
·
0826ebe
1
Parent(s):
47d3f11
working with multiple files + better prompt
Browse files- .gitignore +1 -0
- __pycache__/api.cpython-311.pyc +0 -0
- __pycache__/app.cpython-311.pyc +0 -0
- app.py +39 -22
.gitignore
CHANGED
@@ -1 +1,2 @@
|
|
1 |
venv/
|
|
|
|
1 |
venv/
|
2 |
+
Universal Sentence Encoder/
|
__pycache__/api.cpython-311.pyc
ADDED
Binary file (12.4 kB). View file
|
|
__pycache__/app.cpython-311.pyc
ADDED
Binary file (16.1 kB). View file
|
|
app.py
CHANGED
@@ -18,6 +18,7 @@ def preprocess(text):
|
|
18 |
return text
|
19 |
|
20 |
|
|
|
21 |
def pdf_to_text(path, start_page=1, end_page=None):
|
22 |
doc = fitz.open(path)
|
23 |
total_pages = doc.page_count
|
@@ -35,10 +36,11 @@ def pdf_to_text(path, start_page=1, end_page=None):
|
|
35 |
doc.close()
|
36 |
return text_list
|
37 |
|
|
|
|
|
38 |
|
39 |
-
|
40 |
-
text_toks = [t.split(' ') for t in
|
41 |
-
page_nums = []
|
42 |
chunks = []
|
43 |
|
44 |
for idx, words in enumerate(text_toks):
|
@@ -49,15 +51,16 @@ def text_to_chunks(texts, word_length=150, start_page=1):
|
|
49 |
text_toks[idx+1] = chunk + text_toks[idx+1]
|
50 |
continue
|
51 |
chunk = ' '.join(chunk).strip()
|
52 |
-
chunk = f'[Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
|
53 |
chunks.append(chunk)
|
|
|
54 |
return chunks
|
55 |
|
56 |
|
57 |
class SemanticSearch:
|
58 |
|
59 |
def __init__(self):
|
60 |
-
self.use = hub.load('
|
61 |
self.fitted = False
|
62 |
|
63 |
|
@@ -91,10 +94,13 @@ class SemanticSearch:
|
|
91 |
|
92 |
|
93 |
|
94 |
-
def load_recommender(
|
95 |
global recommender
|
96 |
-
texts =
|
97 |
-
chunks =
|
|
|
|
|
|
|
98 |
recommender.fit(chunks)
|
99 |
return 'Corpus Loaded.'
|
100 |
|
@@ -148,13 +154,15 @@ def generate_answer(question, openAI_key, model):
|
|
148 |
return answer
|
149 |
|
150 |
|
151 |
-
def question_answer(chat_history, url,
|
152 |
try:
|
|
|
|
|
153 |
if openAI_key.strip()=='':
|
154 |
return '[ERROR]: Please enter your Open AI Key. Get your key here : https://platform.openai.com/account/api-keys'
|
155 |
-
if url.strip() == '' and
|
156 |
return '[ERROR]: Both URL and PDF is empty. Provide at least one.'
|
157 |
-
if url.strip() != '' and
|
158 |
return '[ERROR]: Both URL and PDF is provided. Please provide only one (either URL or PDF).'
|
159 |
if model is None or model =='':
|
160 |
return '[ERROR]: You have not selected any model. Please choose an LLM model.'
|
@@ -163,11 +171,17 @@ def question_answer(chat_history, url, file, question, openAI_key, model):
|
|
163 |
download_pdf(glob_url, 'corpus.pdf')
|
164 |
load_recommender('corpus.pdf')
|
165 |
else:
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
171 |
if question.strip() == '':
|
172 |
return '[ERROR]: Question field is empty'
|
173 |
if model == "text-davinci-003" or model == "gpt-4" or model == "gpt-4-32k":
|
@@ -197,21 +211,24 @@ def generate_text_text_davinci_003(openAI_key,prompt, engine="text-davinci-003")
|
|
197 |
|
198 |
def generate_answer_text_davinci_003(question,openAI_key):
|
199 |
topn_chunks = recommender(question)
|
|
|
200 |
prompt = ""
|
201 |
prompt += 'search results:\n\n'
|
202 |
for c in topn_chunks:
|
203 |
prompt += c + '\n\n'
|
204 |
|
205 |
prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
|
206 |
-
"Cite each reference using [ Page Number] notation (every result has this number at the beginning). "\
|
207 |
"Citation should be done at the end of each sentence. If the search results mention multiple subjects "\
|
208 |
"with the same name, create separate answers for each. Only include information found in the results and "\
|
209 |
"don't add any additional information. Make sure the answer is correct and don't output false content. "\
|
210 |
"If the text does not relate to the query, simply state 'Found Nothing'. Ignore outlier "\
|
211 |
"search results which has nothing to do with the question. Only answer what is asked. The "\
|
212 |
-
"answer should be short and concise
|
213 |
|
214 |
prompt += f"Query: {question}\nAnswer:"
|
|
|
|
|
215 |
answer = generate_text_text_davinci_003(openAI_key, prompt,"text-davinci-003")
|
216 |
return answer
|
217 |
|
@@ -248,9 +265,9 @@ with gr.Blocks(css="""#chatbot { font-size: 14px; min-height: 1200; }""") as dem
|
|
248 |
gr.Markdown(f'<p style="text-align:center">Get your Open AI API key <a href="https://platform.openai.com/account/api-keys">here</a></p>')
|
249 |
with gr.Accordion("API Key"):
|
250 |
openAI_key = gr.Textbox(label='Enter your OpenAI API key here', password=True)
|
251 |
-
url = gr.Textbox(label='Enter PDF URL here
|
252 |
gr.Markdown("<center><h4>OR<h4></center>")
|
253 |
-
|
254 |
question = gr.Textbox(label='Enter your question here')
|
255 |
gr.Examples(
|
256 |
[[q] for q in questions],
|
@@ -274,11 +291,11 @@ with gr.Blocks(css="""#chatbot { font-size: 14px; min-height: 1200; }""") as dem
|
|
274 |
chatbot = gr.Chatbot(placeholder="Chat History", label="Chat History", lines=50, elem_id="chatbot")
|
275 |
|
276 |
|
277 |
-
|
278 |
# Bind the click event of the button to the question_answer function
|
279 |
btn.click(
|
280 |
question_answer,
|
281 |
-
inputs=[chatbot, url,
|
282 |
outputs=[chatbot],
|
283 |
)
|
284 |
|
|
|
18 |
return text
|
19 |
|
20 |
|
21 |
+
# converts pdf to text
|
22 |
def pdf_to_text(path, start_page=1, end_page=None):
|
23 |
doc = fitz.open(path)
|
24 |
total_pages = doc.page_count
|
|
|
36 |
doc.close()
|
37 |
return text_list
|
38 |
|
39 |
+
# converts a list of page texts into a list of chunks
|
40 |
+
def text_to_chunks(texts, word_length=150, start_page=1, file_number=1):
|
41 |
|
42 |
+
filtered_texts = [''.join(char for char in text if ord(char) < 128) for text in texts]
|
43 |
+
text_toks = [t.split(' ') for t in filtered_texts]
|
|
|
44 |
chunks = []
|
45 |
|
46 |
for idx, words in enumerate(text_toks):
|
|
|
51 |
text_toks[idx+1] = chunk + text_toks[idx+1]
|
52 |
continue
|
53 |
chunk = ' '.join(chunk).strip()
|
54 |
+
chunk = f'[PDF no. {file_number}] [Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
|
55 |
chunks.append(chunk)
|
56 |
+
# print("chunks == " + str(chunks))
|
57 |
return chunks
|
58 |
|
59 |
|
60 |
class SemanticSearch:
|
61 |
|
62 |
def __init__(self):
|
63 |
+
self.use = hub.load('./Universal Sentence Encoder/')
|
64 |
self.fitted = False
|
65 |
|
66 |
|
|
|
94 |
|
95 |
|
96 |
|
97 |
+
def load_recommender(paths, start_page=1):
    """Chunk one or more PDFs and fit the global ``recommender`` on them.

    Args:
        paths: A list of PDF file paths, or a single path given as a string.
            Each file is numbered from 1 in the order given, and that number
            is passed to ``text_to_chunks`` as ``file_number``.
        start_page: Page number forwarded to both ``pdf_to_text`` and
            ``text_to_chunks`` for every file.

    Returns:
        The status string ``'Corpus Loaded.'``.
    """
    global recommender

    # The URL branch still calls load_recommender('corpus.pdf'). Without this
    # guard a bare string would be enumerated character by character and each
    # character handed to pdf_to_text as if it were a path.
    if isinstance(paths, str):
        paths = [paths]

    chunks = []
    for idx, path in enumerate(paths):
        page_texts = pdf_to_text(path, start_page=start_page)
        chunks += text_to_chunks(
            page_texts, start_page=start_page, file_number=idx + 1
        )

    recommender.fit(chunks)
    return 'Corpus Loaded.'
|
106 |
|
|
|
154 |
return answer
|
155 |
|
156 |
|
157 |
+
def question_answer(chat_history, url, files, question, openAI_key, model):
|
158 |
try:
|
159 |
+
if files == None:
|
160 |
+
files = []
|
161 |
if openAI_key.strip()=='':
|
162 |
return '[ERROR]: Please enter your Open AI Key. Get your key here : https://platform.openai.com/account/api-keys'
|
163 |
+
if url.strip() == '' and files == []:
|
164 |
return '[ERROR]: Both URL and PDF is empty. Provide at least one.'
|
165 |
+
if url.strip() != '' and files is not []:
|
166 |
return '[ERROR]: Both URL and PDF is provided. Please provide only one (either URL or PDF).'
|
167 |
if model is None or model =='':
|
168 |
return '[ERROR]: You have not selected any model. Please choose an LLM model.'
|
|
|
171 |
download_pdf(glob_url, 'corpus.pdf')
|
172 |
load_recommender('corpus.pdf')
|
173 |
else:
|
174 |
+
print(files)
|
175 |
+
filenames = []
|
176 |
+
for file in files:
|
177 |
+
old_file_name = file.name
|
178 |
+
file_name = file.name
|
179 |
+
file_name = file_name[:-12] + file_name[-4:]
|
180 |
+
os.rename(old_file_name, file_name)
|
181 |
+
filenames.append(file_name)
|
182 |
+
load_recommender(filenames)
|
183 |
+
|
184 |
+
|
185 |
if question.strip() == '':
|
186 |
return '[ERROR]: Question field is empty'
|
187 |
if model == "text-davinci-003" or model == "gpt-4" or model == "gpt-4-32k":
|
|
|
211 |
|
212 |
def generate_answer_text_davinci_003(question,openAI_key):
|
213 |
topn_chunks = recommender(question)
|
214 |
+
# print("topn chunks == " + str(topn_chunks))
|
215 |
prompt = ""
|
216 |
prompt += 'search results:\n\n'
|
217 |
for c in topn_chunks:
|
218 |
prompt += c + '\n\n'
|
219 |
|
220 |
prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
|
221 |
+
"Cite each reference using [PDF Number][Page Number] notation (every result has this number at the beginning). "\
|
222 |
"Citation should be done at the end of each sentence. If the search results mention multiple subjects "\
|
223 |
"with the same name, create separate answers for each. Only include information found in the results and "\
|
224 |
"don't add any additional information. Make sure the answer is correct and don't output false content. "\
|
225 |
"If the text does not relate to the query, simply state 'Found Nothing'. Ignore outlier "\
|
226 |
"search results which has nothing to do with the question. Only answer what is asked. The "\
|
227 |
+
"answer should be short and concise.\n\n"
|
228 |
|
229 |
prompt += f"Query: {question}\nAnswer:"
|
230 |
+
print("prompt == " + str(prompt))
|
231 |
+
# print("prompt == " + str(prompt))
|
232 |
answer = generate_text_text_davinci_003(openAI_key, prompt,"text-davinci-003")
|
233 |
return answer
|
234 |
|
|
|
265 |
gr.Markdown(f'<p style="text-align:center">Get your Open AI API key <a href="https://platform.openai.com/account/api-keys">here</a></p>')
|
266 |
with gr.Accordion("API Key"):
|
267 |
openAI_key = gr.Textbox(label='Enter your OpenAI API key here', password=True)
|
268 |
+
url = gr.Textbox(label='Enter PDF URL here (Example: https://arxiv.org/pdf/1706.03762.pdf )')
|
269 |
gr.Markdown("<center><h4>OR<h4></center>")
|
270 |
+
files = gr.File(label='Upload your PDF/ Research Paper / Book here', file_types=['.pdf'], file_count="multiple")
|
271 |
question = gr.Textbox(label='Enter your question here')
|
272 |
gr.Examples(
|
273 |
[[q] for q in questions],
|
|
|
291 |
chatbot = gr.Chatbot(placeholder="Chat History", label="Chat History", lines=50, elem_id="chatbot")
|
292 |
|
293 |
|
294 |
+
|
295 |
# Bind the click event of the button to the question_answer function
|
296 |
btn.click(
|
297 |
question_answer,
|
298 |
+
inputs=[chatbot, url, files, question, openAI_key, model],
|
299 |
outputs=[chatbot],
|
300 |
)
|
301 |
|