Commit 9bf726b
Parent(s): 4c0b3a6
feat: added comments to class
Files changed:
- README.md +13 -0
- model.py +55 -4
- pages/upload_file.py +2 -1
- pages/upload_url.py +2 -1
- utilis.py +10 -14
README.md
CHANGED
@@ -0,0 +1,13 @@
+# Drake
+**Make Notes without mess**
+
+DrakeLLM helps students make notes from videos and documents with LLMs. Utilising RAG, Drake produces quick notes and pairs them with a Q&A bot. Books, YouTube tutorials or videos: Drake supports them all.
+
+## Features
+- **Quick Notes**: Make notes quickly with Drake.
+- **Q&A Bot**: Ask questions and get answers from Drake.
+
+## Upcoming Features
+- **Image Support**: Querying images on similarity criteria.
+- **Image for context**: Using images for context in multimodal models like Llava.
+- **Completely Open Source**: Supporting the app to run on completely open-source models like Llava and Llama.
model.py
CHANGED
@@ -10,6 +10,15 @@ from langchain_core.documents.base import Document
 
 class DrakeLM:
     def __init__(self, model_path: str, db: DeepLake, config: dict, llm_model="gemini-pro"):
+        """
+        Parameters:
+            model_path (str): The path to the model in case running Llama
+            db (DeepLake): The DeepLake DB object
+            config (dict): The configuration for the llama model
+            llm_model (str): The LLM model type
+
+        Initialize the DrakeLM model
+        """
         self.llm_model = llm_model
 
         if llm_model == "llama":
@@ -25,7 +34,18 @@
         self.notes_prompt = load_prompt("prompt_templates/notes_prompt.yaml")
         self.chat_prompt = load_prompt("prompt_templates/chat_prompt.yaml")
 
-    def _chat_prompt(self, query: str, context: str):
+    def _chat_prompt(self, query: str, context: str) -> (PromptTemplate, str):
+        """
+        Parameters:
+            query (str): The question asked by the user
+            context (str): The context retrieved from the DB
+
+        Returns:
+            PromptTemplate: The prompt template for the chat
+            prompt (str): The prompt string for the chat
+
+        Create the chat prompt for the LLM model
+        """
         prompt = """You are assisting a student to understand topics. \n\n
         You have to answer the below question by utilising the below context to answer the question. \n\n
         Note to follow the rules given below \n\n
@@ -46,7 +66,19 @@
         prompt = prompt.format(query=query, context=context, rules=rules)
         return PromptTemplate.from_template(prompt), prompt
 
-    def _retrieve(self, query: str, metadata_filter, k=3, distance_metric="cos"):
+    def _retrieve(self, query: str, metadata_filter, k=3, distance_metric="cos") -> str:
+        """
+        Parameters:
+            query (str): The question asked by the user
+            metadata_filter (dict): The metadata filter for the DB
+            k (int): The number of documents to retrieve
+            distance_metric (str): The distance metric for retrieval
+
+        Returns:
+            str: The context retrieved from the DB
+
+        Retrieve the context from the DB
+        """
         self.retriever.search_kwargs["distance_metric"] = distance_metric
         self.retriever.search_kwargs["k"] = k
 
@@ -65,7 +97,17 @@
 
         return context
 
-    def ask_llm(self, query: str, metadata_filter: dict = None):
+    def ask_llm(self, query: str, metadata_filter: dict = None) -> str:
+        """
+        Parameters:
+            query (str): The question asked by the user
+            metadata_filter (dict): The metadata filter for the DB
+
+        Returns:
+            str: The response from the LLM model
+
+        Ask the LLM model a question
+        """
         context = self._retrieve(query, metadata_filter)
         print("Retrieved context")
         prompt_template, prompt_string = self._chat_prompt(query, context)
@@ -89,7 +131,16 @@
 
         return self.chat_history.messages[-1].content
 
-    def create_notes(self, documents: List[Document]):
+    def create_notes(self, documents: List[Document]) -> str:
+        """
+        Parameters:
+            documents (List[Document]): The list of documents to create notes from
+
+        Returns:
+            str: The notes generated from the LLM model
+
+        Create notes from the LLM model
+        """
         rules = """
         - Follow the Markdown format for creating notes as shown in the example.
         - The heading of the content should be the title of the markdown file.
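Read together, the docstrings added above imply the following call pattern. This is a minimal sketch, not code from the repo: the DeepLake import path, dataset path, and config values are assumptions, and only DrakeLM's own signatures come from the diff.

from langchain.vectorstores import DeepLake  # assumed import path

from model import DrakeLM

# Hypothetical store and config; only DrakeLM's signatures come from the diff.
db = DeepLake(dataset_path="hub://someuser/drake-db", read_only=True)
drake = DrakeLM(model_path="", db=db, config={}, llm_model="gemini-pro")

# ask_llm() retrieves context via _retrieve() (k=3, cosine distance by
# default), builds the chat prompt, and returns the model's latest reply.
answer = drake.ask_llm("Explain RAG in two lines")

One side note on the new annotations: return hints like -> (PromptTemplate, str) are bare tuples, which Python stores happily but type checkers such as mypy reject; the conventional spelling is Tuple[PromptTemplate, str], and the same applies to the (List[Document], Dict[str, str]) hints in utilis.py below.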
pages/upload_file.py
CHANGED
@@ -16,7 +16,8 @@ if st.button("Youtube/Video URL"):
 st.subheader('Upload the file')
 uploaded_file = st.file_uploader(label="Choose a file", type=['pdf', 'doc'])
 allow_make_notes = st.toggle('Make Complete Notes!')
-llm_model = st.selectbox('Choose LLM Model',
+llm_model = st.selectbox('Choose LLM Model', 'gemini-pro')
+# llm_model = st.selectbox('Choose LLM Model', ('gemini-pro', 'llama'))
 drake.llm_model = llm_model
 
 
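One caveat on the new fallback above: st.selectbox treats its options argument as an iterable, so passing the bare string 'gemini-pro' will most likely render each character as a separate option. A minimal sketch of the safer single-option spelling, under that assumption:

import streamlit as st

# A one-element tuple keeps the dropdown to a single 'gemini-pro' entry
# instead of iterating over the string's characters.
llm_model = st.selectbox('Choose LLM Model', ('gemini-pro',))

The same pattern appears in pages/upload_url.py below.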
pages/upload_url.py
CHANGED
@@ -13,7 +13,8 @@ if st.button("PDF/Transcript"):
 
 st.subheader('Enter the Video URL')
 video_url = st.text_input(label="Enter the URL")
-llm_model = st.selectbox('Choose LLM Model',
+llm_model = st.selectbox('Choose LLM Model', 'gemini-pro')
+# llm_model = st.selectbox('Choose LLM Model', ('gemini-pro', 'llama'))
 drake.llm_model = llm_model
 
 allow_make_notes = st.toggle('Make Complete Notes!')
utilis.py
CHANGED
@@ -13,16 +13,12 @@ from typing import Dict
 import uuid
 
 
-
 class Processing:
-    def __init__(self, dataset_path: str, embedding_model_name: str,
-                 device='cpu', chunk_size=500, chunk_overlap=5):
+    def __init__(self, dataset_path: str, embedding_model_name: str, chunk_size=500, chunk_overlap=5):
         """
         Parameters:
             dataset_path (str): Path to the dataset in the Vector-DB
-            file_path (str): Path to the file to be processed
             embedding_model_name (str): Name of the HuggingFace model to be used for embeddings
-            device (str): Device to run the embedding model on
             chunk_size (int): Size of each chunk to be processed
             chunk_overlap (int): Overlap between each chunk
 
@@ -34,7 +30,6 @@ class Processing:
 
         self.embedding_model = HuggingFaceEmbeddings(
             model_name=embedding_model_name,
-            model_kwargs={'device': device},
            encode_kwargs={'normalize_embeddings': False}
         )
 
@@ -43,8 +38,8 @@ class Processing:
             exec_option="compute_engine"
         )
 
-    def _add_metadata(self, documents: List[Document], url: str, id: str, source: str, file_type: str,
-
+    def _add_metadata(self, documents: List[Document], url: str, id: str, source: str, file_type: str,
+                      course_tag="") -> (List[Document], Dict[str, str]):
         """
         Parameters:
             documents (List[Document]): List of documents to add metadata to
@@ -54,7 +49,7 @@ class Processing:
             course_tag (str): Tag to identify the course the documents belongs to
 
         Returns:
-            documents (List[Document]): List of documents with metadata added
+            documents (List[Document], Dict[str, str]): List of documents with metadata added along with the metadata
 
         Add metadata to the documents
         """
@@ -69,10 +64,10 @@ class Processing:
             doc.metadata = metadata
         return documents, metadata
 
-    def load_pdf(self,
+    def load_pdf(self, text) -> (List[Document], Dict[str, str]):
         """
         Returns:
-
+            documents (List[Document], Dict[str, str]): List of documents with metadata added along with the metadata
 
         Load PDF file, split into chunks and add metadata
         """
@@ -83,7 +78,7 @@ class Processing:
     def load_transcript(self, url) -> (List[Document], Dict[str, str]):
         """
         Returns:
-
+            documents (List[Document], Dict[str, str]): List of documents with metadata added along with the metadata
 
         Load transcript, split into chunks and add metadata
         """
@@ -91,12 +86,13 @@ class Processing:
         print("Transcribed")
         transcript_chunk = self.text_splitter.create_documents([transcript.text])
         print("Created transcript chunks")
-        return self._add_metadata(transcript_chunk, url="NaN", id=str(uuid.uuid4()), source="custom_video",
+        return self._add_metadata(transcript_chunk, url="NaN", id=str(uuid.uuid4()), source="custom_video",
+                                  file_type="transcript")
 
     def load_yt_transcript(self, url) -> (List[Document], Dict[str, str]):
         """
         Returns:
-
+            documents (List[Document], Dict[str, str]): List of documents with metadata added along with the metadata
 
         Load YouTube transcript, split into chunks and add metadata
         """
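Taken together, the loader signatures above read as follows in use. A minimal sketch with placeholder values: the dataset path and embedding model name are illustrative, not from the repo.

from utilis import Processing

# Placeholder arguments; only the signatures come from the diff above.
processing = Processing(
    dataset_path="hub://someuser/drake-db",  # hypothetical DeepLake path
    embedding_model_name="sentence-transformers/all-MiniLM-L6-v2",
    chunk_size=500,
    chunk_overlap=5,
)

# Each loader returns (documents, metadata): the chunked Documents plus the
# metadata dict that _add_metadata stamps onto every chunk.
documents, metadata = processing.load_yt_transcript("https://www.youtube.com/watch?v=...")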