Spaces:
Sleeping
Sleeping
th8m0z
committed on
Commit
·
2dae07c
1
Parent(s):
71b98c7
increased batch/nn sizes + added summary feature
Browse files- app.py +2 -1
- functions.py +20 -3
- semantic_search.py +1 -1
app.py
CHANGED
@@ -47,7 +47,8 @@ with gr.Blocks(css="""#chatbot { font-size: 14px; min-height: 1200; }""") as dem
|
|
47 |
'gpt-3.5-turbo-16k-0613',
|
48 |
'text-davinci-003',
|
49 |
'gpt-4',
|
50 |
-
'gpt-4-32k'
|
|
|
51 |
], label='Select Model', default='gpt-3.5-turbo')
|
52 |
btn = gr.Button(value='Submit')
|
53 |
|
|
|
47 |
'gpt-3.5-turbo-16k-0613',
|
48 |
'text-davinci-003',
|
49 |
'gpt-4',
|
50 |
+
'gpt-4-32k',
|
51 |
+
'gpt-4-1106-preview'
|
52 |
], label='Select Model', default='gpt-3.5-turbo')
|
53 |
btn = gr.Button(value='Submit')
|
54 |
|
functions.py
CHANGED
@@ -36,7 +36,7 @@ def pdf_to_text(path, start_page=1, end_page=None):
|
|
36 |
return text_list
|
37 |
|
38 |
# converts a text into a list of chunks
|
39 |
-
def text_to_chunks(texts, word_length=
|
40 |
|
41 |
filtered_texts = [''.join(char for char in text if ord(char) < 128) for text in texts]
|
42 |
text_toks = [t.split(' ') for t in filtered_texts]
|
@@ -102,17 +102,23 @@ def generate_text(openAI_key, prompt, model="gpt-3.5-turbo"):
|
|
102 |
|
103 |
|
104 |
# constructs the prompt for the given query
|
105 |
-
def construct_prompt(question):
|
106 |
topn_chunks = recommender(question)
|
|
|
|
|
|
|
107 |
prompt = 'search results:\n\n'
|
108 |
for c in topn_chunks:
|
109 |
prompt += c + '\n\n'
|
|
|
|
|
110 |
|
111 |
prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
|
112 |
"Cite each reference using [PDF Number][Page Number] notation. "\
|
113 |
"Only answer what is asked. The answer should be short and concise. \n\nQuery: "
|
114 |
|
115 |
prompt += f"{question}\nAnswer:"
|
|
|
116 |
return prompt
|
117 |
|
118 |
# main function that is called when the user clicks the submit button, generates an answer for the query
|
@@ -146,9 +152,20 @@ def question_answer(chat_history, url, files, question, openAI_key, model):
|
|
146 |
|
147 |
if question.strip() == '':
|
148 |
return '[ERROR]: Question field is empty'
|
149 |
-
prompt = construct_prompt(question)
|
|
|
150 |
answer = generate_text(openAI_key, prompt, model)
|
151 |
chat_history.append([question, answer])
|
152 |
return chat_history
|
153 |
except openai.error.InvalidRequestError as e:
|
154 |
return f'[ERROR]: Either you do not have access to GPT4 or you have exhausted your quota!'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
return text_list
|
37 |
|
38 |
# converts a text into a list of chunks
|
39 |
+
def text_to_chunks(texts, word_length=300, start_page=1, file_number=1):
|
40 |
|
41 |
filtered_texts = [''.join(char for char in text if ord(char) < 128) for text in texts]
|
42 |
text_toks = [t.split(' ') for t in filtered_texts]
|
|
|
102 |
|
103 |
|
104 |
# constructs the prompt for the given query
|
105 |
+
def construct_prompt(question, openAI_key):
|
106 |
topn_chunks = recommender(question)
|
107 |
+
|
108 |
+
topn_chunks = summarize_ss_results_if_needed(openAI_key, topn_chunks, model="gpt-4")
|
109 |
+
|
110 |
prompt = 'search results:\n\n'
|
111 |
for c in topn_chunks:
|
112 |
prompt += c + '\n\n'
|
113 |
+
|
114 |
+
|
115 |
|
116 |
prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
|
117 |
"Cite each reference using [PDF Number][Page Number] notation. "\
|
118 |
"Only answer what is asked. The answer should be short and concise. \n\nQuery: "
|
119 |
|
120 |
prompt += f"{question}\nAnswer:"
|
121 |
+
print("prompt == " + str(prompt))
|
122 |
return prompt
|
123 |
|
124 |
# main function that is called when the user clicks the submit button, generates an answer for the query
|
|
|
152 |
|
153 |
if question.strip() == '':
|
154 |
return '[ERROR]: Question field is empty'
|
155 |
+
prompt = construct_prompt(question, openAI_key)
|
156 |
+
|
157 |
answer = generate_text(openAI_key, prompt, model)
|
158 |
chat_history.append([question, answer])
|
159 |
return chat_history
|
160 |
except openai.error.InvalidRequestError as e:
|
161 |
return f'[ERROR]: Either you do not have access to GPT4 or you have exhausted your quota!'
|
162 |
+
|
163 |
+
|
164 |
+
def summarize_ss_results_if_needed(openAI_key, chunks, model, token_limit=2000):
|
165 |
+
total_tokens = sum(len(chunk.split()) for chunk in chunks)
|
166 |
+
if total_tokens > token_limit:
|
167 |
+
print("has to summarize")
|
168 |
+
summary_prompt = "Summarize the following text, while keeping important information, facts and figures. It is also very important to keep the [PDF Number][Page number] notation intact!\n\n" + " ".join(chunks)
|
169 |
+
return generate_text(openAI_key, summary_prompt, model=model)
|
170 |
+
else:
|
171 |
+
return chunks
|
semantic_search.py
CHANGED
@@ -10,7 +10,7 @@ class SemanticSearch:
|
|
10 |
self.fitted = False
|
11 |
|
12 |
# fits the recommender
|
13 |
-
def fit(self, data, batch=1000, n_neighbors=
|
14 |
self.data = data
|
15 |
self.embeddings = self.get_text_embedding(data, batch=batch)
|
16 |
n_neighbors = min(n_neighbors, len(self.embeddings))
|
|
|
10 |
self.fitted = False
|
11 |
|
12 |
# fits the recommender
|
13 |
+
def fit(self, data, batch=1000, n_neighbors=10):
|
14 |
self.data = data
|
15 |
self.embeddings = self.get_text_embedding(data, batch=batch)
|
16 |
n_neighbors = min(n_neighbors, len(self.embeddings))
|