Spaces:
Sleeping
Sleeping
th8m0z
committed on
Commit
·
2dae07c
1
Parent(s):
71b98c7
increased batch/nn sizes + added summary feature
Browse files- app.py +2 -1
- functions.py +20 -3
- semantic_search.py +1 -1
app.py
CHANGED
@@ -47,7 +47,8 @@ with gr.Blocks(css="""#chatbot { font-size: 14px; min-height: 1200; }""") as dem
|
|
47 |
'gpt-3.5-turbo-16k-0613',
|
48 |
'text-davinci-003',
|
49 |
'gpt-4',
|
50 |
-
'gpt-4-32k'
|
|
|
51 |
], label='Select Model', default='gpt-3.5-turbo')
|
52 |
btn = gr.Button(value='Submit')
|
53 |
|
|
|
47 |
'gpt-3.5-turbo-16k-0613',
|
48 |
'text-davinci-003',
|
49 |
'gpt-4',
|
50 |
+
'gpt-4-32k',
|
51 |
+
'gpt-4-1106-preview'
|
52 |
], label='Select Model', default='gpt-3.5-turbo')
|
53 |
btn = gr.Button(value='Submit')
|
54 |
|
functions.py
CHANGED
@@ -36,7 +36,7 @@ def pdf_to_text(path, start_page=1, end_page=None):
|
|
36 |
return text_list
|
37 |
|
38 |
# converts a text into a list of chunks
|
39 |
-
def text_to_chunks(texts, word_length=
|
40 |
|
41 |
filtered_texts = [''.join(char for char in text if ord(char) < 128) for text in texts]
|
42 |
text_toks = [t.split(' ') for t in filtered_texts]
|
@@ -102,17 +102,23 @@ def generate_text(openAI_key, prompt, model="gpt-3.5-turbo"):
|
|
102 |
|
103 |
|
104 |
# constructs the prompt for the given query
|
105 |
-
def construct_prompt(question):
|
106 |
topn_chunks = recommender(question)
|
|
|
|
|
|
|
107 |
prompt = 'search results:\n\n'
|
108 |
for c in topn_chunks:
|
109 |
prompt += c + '\n\n'
|
|
|
|
|
110 |
|
111 |
prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
|
112 |
"Cite each reference using [PDF Number][Page Number] notation. "\
|
113 |
"Only answer what is asked. The answer should be short and concise. \n\nQuery: "
|
114 |
|
115 |
prompt += f"{question}\nAnswer:"
|
|
|
116 |
return prompt
|
117 |
|
118 |
# main function that is called when the user clicks the submit button, generates an answer for the query
|
@@ -146,9 +152,20 @@ def question_answer(chat_history, url, files, question, openAI_key, model):
|
|
146 |
|
147 |
if question.strip() == '':
|
148 |
return '[ERROR]: Question field is empty'
|
149 |
-
prompt = construct_prompt(question)
|
|
|
150 |
answer = generate_text(openAI_key, prompt, model)
|
151 |
chat_history.append([question, answer])
|
152 |
return chat_history
|
153 |
except openai.error.InvalidRequestError as e:
|
154 |
return f'[ERROR]: Either you do not have access to GPT4 or you have exhausted your quota!'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
return text_list
|
37 |
|
38 |
# converts a text into a list of chunks
|
39 |
+
def text_to_chunks(texts, word_length=300, start_page=1, file_number=1):
|
40 |
|
41 |
filtered_texts = [''.join(char for char in text if ord(char) < 128) for text in texts]
|
42 |
text_toks = [t.split(' ') for t in filtered_texts]
|
|
|
102 |
|
103 |
|
104 |
# constructs the prompt for the given query
|
105 |
+
def construct_prompt(question, openAI_key):
|
106 |
topn_chunks = recommender(question)
|
107 |
+
|
108 |
+
topn_chunks = summarize_ss_results_if_needed(openAI_key, topn_chunks, model="gpt-4")
|
109 |
+
|
110 |
prompt = 'search results:\n\n'
|
111 |
for c in topn_chunks:
|
112 |
prompt += c + '\n\n'
|
113 |
+
|
114 |
+
|
115 |
|
116 |
prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
|
117 |
"Cite each reference using [PDF Number][Page Number] notation. "\
|
118 |
"Only answer what is asked. The answer should be short and concise. \n\nQuery: "
|
119 |
|
120 |
prompt += f"{question}\nAnswer:"
|
121 |
+
print("prompt == " + str(prompt))
|
122 |
return prompt
|
123 |
|
124 |
# main function that is called when the user clicks the submit button, generates an answer for the query
|
|
|
152 |
|
153 |
if question.strip() == '':
|
154 |
return '[ERROR]: Question field is empty'
|
155 |
+
prompt = construct_prompt(question, openAI_key)
|
156 |
+
|
157 |
answer = generate_text(openAI_key, prompt, model)
|
158 |
chat_history.append([question, answer])
|
159 |
return chat_history
|
160 |
except openai.error.InvalidRequestError as e:
|
161 |
return f'[ERROR]: Either you do not have access to GPT4 or you have exhausted your quota!'
|
162 |
+
|
163 |
+
|
164 |
+
def summarize_ss_results_if_needed(openAI_key, chunks, model, token_limit=2000):
|
165 |
+
total_tokens = sum(len(chunk.split()) for chunk in chunks)
|
166 |
+
if total_tokens > token_limit:
|
167 |
+
print("has to summarize")
|
168 |
+
summary_prompt = "Summarize the following text, while keeping important information, facts and figures. It is also very important to keep the [PDF Number][Page number] notation intact!\n\n" + " ".join(chunks)
|
169 |
+
return generate_text(openAI_key, summary_prompt, model=model)
|
170 |
+
else:
|
171 |
+
return chunks
|
semantic_search.py
CHANGED
@@ -10,7 +10,7 @@ class SemanticSearch:
|
|
10 |
self.fitted = False
|
11 |
|
12 |
# fits the recommender
|
13 |
-
def fit(self, data, batch=1000, n_neighbors=
|
14 |
self.data = data
|
15 |
self.embeddings = self.get_text_embedding(data, batch=batch)
|
16 |
n_neighbors = min(n_neighbors, len(self.embeddings))
|
|
|
10 |
self.fitted = False
|
11 |
|
12 |
# fits the recommender
|
13 |
+
def fit(self, data, batch=1000, n_neighbors=10):
|
14 |
self.data = data
|
15 |
self.embeddings = self.get_text_embedding(data, batch=batch)
|
16 |
n_neighbors = min(n_neighbors, len(self.embeddings))
|