pdfGPT_Turbo

Sleeping

th8m0z committed on Nov 11, 2023

Commit

b5ac495

1 Parent(s): 54f6539

more comments

Files changed (6) hide show

__pycache__/app.cpython-311.pyc CHANGED Viewed

Binary files a/__pycache__/app.cpython-311.pyc and b/__pycache__/app.cpython-311.pyc differ

__pycache__/semantic_search.cpython-311.pyc CHANGED Viewed

Binary files a/__pycache__/semantic_search.cpython-311.pyc and b/__pycache__/semantic_search.cpython-311.pyc differ

__pycache__/ui.cpython-311.pyc CHANGED Viewed

Binary files a/__pycache__/ui.cpython-311.pyc and b/__pycache__/ui.cpython-311.pyc differ

app.py CHANGED Viewed

@@ -35,7 +35,7 @@ def pdf_to_text(path, start_page=1, end_page=None):
     doc.close()
     return text_list
-# one text converts a list of chunks
 def text_to_chunks(texts, word_length=150, start_page=1, file_number=1):
     filtered_texts = [''.join(char for char in text if ord(char) < 128) for text in texts]
@@ -56,6 +56,7 @@ def text_to_chunks(texts, word_length=150, start_page=1, file_number=1):
     return chunks
 def load_recommender(paths, start_page=1):
     global recommender
     texts = []
@@ -66,6 +67,8 @@ def load_recommender(paths, start_page=1):
     recommender.fit(chunks)
     return 'Corpus Loaded.'
 def generate_text(openAI_key, prompt, model="gpt-3.5-turbo"):
     openai.api_key = openAI_key
     temperature=0.7
@@ -101,6 +104,7 @@ def generate_text(openAI_key, prompt, model="gpt-3.5-turbo"):
     return message
 def construct_prompt(question):
     topn_chunks = recommender(question)
     prompt = 'search results:\n\n'
@@ -114,6 +118,7 @@ def construct_prompt(question):
     prompt += f"{question}\nAnswer:"
     return prompt
 def question_answer(chat_history, url, files, question, openAI_key, model):
     try:
         if files == None:
@@ -150,6 +155,3 @@ def question_answer(chat_history, url, files, question, openAI_key, model):
         return chat_history
     except openai.error.InvalidRequestError as e:
         return f'[ERROR]: Either you do not have access to GPT4 or you have exhausted your quota!'

     doc.close()
     return text_list
+# converts a text into a list of chunks
 def text_to_chunks(texts, word_length=150, start_page=1, file_number=1):
     filtered_texts = [''.join(char for char in text if ord(char) < 128) for text in texts]
     return chunks
+# merges a list of pdfs into a list of chunks and fits the recommender
 def load_recommender(paths, start_page=1):
     global recommender
     texts = []
     recommender.fit(chunks)
     return 'Corpus Loaded.'
+# calls the OpenAI API to generate a response for the given query
 def generate_text(openAI_key, prompt, model="gpt-3.5-turbo"):
     openai.api_key = openAI_key
     temperature=0.7
     return message
+# constructs the prompt for the given query
 def construct_prompt(question):
     topn_chunks = recommender(question)
     prompt = 'search results:\n\n'
     prompt += f"{question}\nAnswer:"
     return prompt
+# main function that is called when the user clicks the submit button, generates an answer for the query
 def question_answer(chat_history, url, files, question, openAI_key, model):
     try:
         if files == None:
         return chat_history
     except openai.error.InvalidRequestError as e:
         return f'[ERROR]: Either you do not have access to GPT4 or you have exhausted your quota!'

semantic_search.py CHANGED Viewed

@@ -2,13 +2,14 @@ import numpy as np
 import tensorflow_hub as hub
 from sklearn.neighbors import NearestNeighbors
 class SemanticSearch:
     def __init__(self):
         self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
         self.fitted = False
     def fit(self, data, batch=1000, n_neighbors=5):
         self.data = data
         self.embeddings = self.get_text_embedding(data, batch=batch)
@@ -16,7 +17,7 @@ class SemanticSearch:
         self.nn = NearestNeighbors(n_neighbors=n_neighbors)
         self.nn.fit(self.embeddings)
         self.fitted = True
     def __call__(self, text, return_data=True):
         inp_emb = self.use([text])
@@ -28,6 +29,7 @@ class SemanticSearch:
             return neighbors
     def get_text_embedding(self, texts, batch=1000):
         embeddings = []
         for i in range(0, len(texts), batch):

 import tensorflow_hub as hub
 from sklearn.neighbors import NearestNeighbors
 class SemanticSearch:
     def __init__(self):
         self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
         self.fitted = False
+    # fits the recommender
     def fit(self, data, batch=1000, n_neighbors=5):
         self.data = data
         self.embeddings = self.get_text_embedding(data, batch=batch)
         self.nn = NearestNeighbors(n_neighbors=n_neighbors)
         self.nn.fit(self.embeddings)
         self.fitted = True
     def __call__(self, text, return_data=True):
         inp_emb = self.use([text])
             return neighbors
+    # returns embeddings
     def get_text_embedding(self, texts, batch=1000):
         embeddings = []
         for i in range(0, len(texts), batch):

ui.py CHANGED Viewed

@@ -1,8 +1,6 @@
 import gradio as gr
 import app as app
 # pre-defined questions
 questions = [
     "What did the study investigate?",

 import gradio as gr
 import app as app
 # pre-defined questions
 questions = [
     "What did the study investigate?",