Spaces:
Sleeping
Sleeping
th8m0z
committed on
Commit
•
b5ac495
1
Parent(s):
54f6539
more comments
Browse files
- __pycache__/app.cpython-311.pyc +0 -0
- __pycache__/semantic_search.cpython-311.pyc +0 -0
- __pycache__/ui.cpython-311.pyc +0 -0
- app.py +6 -4
- semantic_search.py +5 -3
- ui.py +0 -2
__pycache__/app.cpython-311.pyc
CHANGED
Binary files a/__pycache__/app.cpython-311.pyc and b/__pycache__/app.cpython-311.pyc differ
|
|
__pycache__/semantic_search.cpython-311.pyc
CHANGED
Binary files a/__pycache__/semantic_search.cpython-311.pyc and b/__pycache__/semantic_search.cpython-311.pyc differ
|
|
__pycache__/ui.cpython-311.pyc
CHANGED
Binary files a/__pycache__/ui.cpython-311.pyc and b/__pycache__/ui.cpython-311.pyc differ
|
|
app.py
CHANGED
@@ -35,7 +35,7 @@ def pdf_to_text(path, start_page=1, end_page=None):
|
|
35 |
doc.close()
|
36 |
return text_list
|
37 |
|
38 |
-
#
|
39 |
def text_to_chunks(texts, word_length=150, start_page=1, file_number=1):
|
40 |
|
41 |
filtered_texts = [''.join(char for char in text if ord(char) < 128) for text in texts]
|
@@ -56,6 +56,7 @@ def text_to_chunks(texts, word_length=150, start_page=1, file_number=1):
|
|
56 |
return chunks
|
57 |
|
58 |
|
|
|
59 |
def load_recommender(paths, start_page=1):
|
60 |
global recommender
|
61 |
texts = []
|
@@ -66,6 +67,8 @@ def load_recommender(paths, start_page=1):
|
|
66 |
recommender.fit(chunks)
|
67 |
return 'Corpus Loaded.'
|
68 |
|
|
|
|
|
69 |
def generate_text(openAI_key, prompt, model="gpt-3.5-turbo"):
|
70 |
openai.api_key = openAI_key
|
71 |
temperature=0.7
|
@@ -101,6 +104,7 @@ def generate_text(openAI_key, prompt, model="gpt-3.5-turbo"):
|
|
101 |
return message
|
102 |
|
103 |
|
|
|
104 |
def construct_prompt(question):
|
105 |
topn_chunks = recommender(question)
|
106 |
prompt = 'search results:\n\n'
|
@@ -114,6 +118,7 @@ def construct_prompt(question):
|
|
114 |
prompt += f"{question}\nAnswer:"
|
115 |
return prompt
|
116 |
|
|
|
117 |
def question_answer(chat_history, url, files, question, openAI_key, model):
|
118 |
try:
|
119 |
if files == None:
|
@@ -150,6 +155,3 @@ def question_answer(chat_history, url, files, question, openAI_key, model):
|
|
150 |
return chat_history
|
151 |
except openai.error.InvalidRequestError as e:
|
152 |
return f'[ERROR]: Either you do not have access to GPT4 or you have exhausted your quota!'
|
153 |
-
|
154 |
-
|
155 |
-
|
|
|
35 |
doc.close()
|
36 |
return text_list
|
37 |
|
38 |
+
# converts a text into a list of chunks
|
39 |
def text_to_chunks(texts, word_length=150, start_page=1, file_number=1):
|
40 |
|
41 |
filtered_texts = [''.join(char for char in text if ord(char) < 128) for text in texts]
|
|
|
56 |
return chunks
|
57 |
|
58 |
|
59 |
+
# merges a list of pdfs into a list of chunks and fits the recommender
|
60 |
def load_recommender(paths, start_page=1):
|
61 |
global recommender
|
62 |
texts = []
|
|
|
67 |
recommender.fit(chunks)
|
68 |
return 'Corpus Loaded.'
|
69 |
|
70 |
+
|
71 |
+
# calls the OpenAI API to generate a response for the given query
|
72 |
def generate_text(openAI_key, prompt, model="gpt-3.5-turbo"):
|
73 |
openai.api_key = openAI_key
|
74 |
temperature=0.7
|
|
|
104 |
return message
|
105 |
|
106 |
|
107 |
+
# constructs the prompt for the given query
|
108 |
def construct_prompt(question):
|
109 |
topn_chunks = recommender(question)
|
110 |
prompt = 'search results:\n\n'
|
|
|
118 |
prompt += f"{question}\nAnswer:"
|
119 |
return prompt
|
120 |
|
121 |
+
# main function that is called when the user clicks the submit button, generates an answer for the query
|
122 |
def question_answer(chat_history, url, files, question, openAI_key, model):
|
123 |
try:
|
124 |
if files == None:
|
|
|
155 |
return chat_history
|
156 |
except openai.error.InvalidRequestError as e:
|
157 |
return f'[ERROR]: Either you do not have access to GPT4 or you have exhausted your quota!'
|
|
|
|
|
|
semantic_search.py
CHANGED
@@ -2,13 +2,14 @@ import numpy as np
|
|
2 |
import tensorflow_hub as hub
|
3 |
from sklearn.neighbors import NearestNeighbors
|
4 |
|
|
|
|
|
5 |
class SemanticSearch:
|
6 |
-
|
7 |
def __init__(self):
|
8 |
self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
|
9 |
self.fitted = False
|
10 |
|
11 |
-
|
12 |
def fit(self, data, batch=1000, n_neighbors=5):
|
13 |
self.data = data
|
14 |
self.embeddings = self.get_text_embedding(data, batch=batch)
|
@@ -16,7 +17,7 @@ class SemanticSearch:
|
|
16 |
self.nn = NearestNeighbors(n_neighbors=n_neighbors)
|
17 |
self.nn.fit(self.embeddings)
|
18 |
self.fitted = True
|
19 |
-
|
20 |
|
21 |
def __call__(self, text, return_data=True):
|
22 |
inp_emb = self.use([text])
|
@@ -28,6 +29,7 @@ class SemanticSearch:
|
|
28 |
return neighbors
|
29 |
|
30 |
|
|
|
31 |
def get_text_embedding(self, texts, batch=1000):
|
32 |
embeddings = []
|
33 |
for i in range(0, len(texts), batch):
|
|
|
2 |
import tensorflow_hub as hub
|
3 |
from sklearn.neighbors import NearestNeighbors
|
4 |
|
5 |
+
|
6 |
+
|
7 |
class SemanticSearch:
|
|
|
8 |
def __init__(self):
|
9 |
self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
|
10 |
self.fitted = False
|
11 |
|
12 |
+
# fits the recommender
|
13 |
def fit(self, data, batch=1000, n_neighbors=5):
|
14 |
self.data = data
|
15 |
self.embeddings = self.get_text_embedding(data, batch=batch)
|
|
|
17 |
self.nn = NearestNeighbors(n_neighbors=n_neighbors)
|
18 |
self.nn.fit(self.embeddings)
|
19 |
self.fitted = True
|
20 |
+
|
21 |
|
22 |
def __call__(self, text, return_data=True):
|
23 |
inp_emb = self.use([text])
|
|
|
29 |
return neighbors
|
30 |
|
31 |
|
32 |
+
# returns embeddings
|
33 |
def get_text_embedding(self, texts, batch=1000):
|
34 |
embeddings = []
|
35 |
for i in range(0, len(texts), batch):
|
ui.py
CHANGED
@@ -1,8 +1,6 @@
|
|
1 |
import gradio as gr
|
2 |
import app as app
|
3 |
|
4 |
-
|
5 |
-
|
6 |
# pre-defined questions
|
7 |
questions = [
|
8 |
"What did the study investigate?",
|
|
|
1 |
import gradio as gr
|
2 |
import app as app
|
3 |
|
|
|
|
|
4 |
# pre-defined questions
|
5 |
questions = [
|
6 |
"What did the study investigate?",
|