Commit 9bf726b
Parent(s): 4c0b3a6
feat: added comments to class
Files changed:
- README.md +13 -0
- model.py +55 -4
- pages/upload_file.py +2 -1
- pages/upload_url.py +2 -1
- utilis.py +10 -14
README.md
CHANGED
@@ -0,0 +1,13 @@
+# Drake
+**Make Notes without mess**
+
+DrakeLLM helps students make notes from videos and documents with LLMs. Utilising RAG, Drake produces quick notes and pairs them with a Q&A bot. Books, YouTube tutorials or videos: Drake supports them all.
+
+## Features
+- **Quick Notes**: Make notes quickly with Drake.
+- **Q&A Bot**: Ask questions and get answers from Drake.
+
+## Upcoming Features
+- **Image Support**: Querying images on similarity criteria.
+- **Image for context**: Using images for context in multimodal models like Llava.
+- **Completely Open Source**: Supporting the app to run on completely open-source models like Llava and Llama.
model.py
CHANGED
@@ -10,6 +10,15 @@ from langchain_core.documents.base import Document
 
 class DrakeLM:
     def __init__(self, model_path: str, db: DeepLake, config: dict, llm_model="gemini-pro"):
+        """
+        Parameters:
+            model_path (str): The path to the model in case running Llama
+            db (DeepLake): The DeepLake DB object
+            config (dict): The configuration for the llama model
+            llm_model (str): The LLM model type
+
+        Initialize the DrakeLM model
+        """
         self.llm_model = llm_model
 
         if llm_model == "llama":
@@ -25,7 +34,18 @@
         self.notes_prompt = load_prompt("prompt_templates/notes_prompt.yaml")
         self.chat_prompt = load_prompt("prompt_templates/chat_prompt.yaml")
 
-    def _chat_prompt(self, query: str, context: str):
+    def _chat_prompt(self, query: str, context: str) -> (PromptTemplate, str):
+        """
+        Parameters:
+            query (str): The question asked by the user
+            context (str): The context retrieved from the DB
+
+        Returns:
+            PromptTemplate: The prompt template for the chat
+            prompt (str): The prompt string for the chat
+
+        Create the chat prompt for the LLM model
+        """
         prompt = """You are assisting a student to understand topics. \n\n
         You have to answer the below question by utilising the below context to answer the question. \n\n
         Note to follow the rules given below \n\n
@@ -46,7 +66,19 @@
         prompt = prompt.format(query=query, context=context, rules=rules)
         return PromptTemplate.from_template(prompt), prompt
 
-    def _retrieve(self, query: str, metadata_filter, k=3, distance_metric="cos"):
+    def _retrieve(self, query: str, metadata_filter, k=3, distance_metric="cos") -> str:
+        """
+        Parameters:
+            query (str): The question asked by the user
+            metadata_filter (dict): The metadata filter for the DB
+            k (int): The number of documents to retrieve
+            distance_metric (str): The distance metric for retrieval
+
+        Returns:
+            str: The context retrieved from the DB
+
+        Retrieve the context from the DB
+        """
         self.retriever.search_kwargs["distance_metric"] = distance_metric
         self.retriever.search_kwargs["k"] = k
 
@@ -65,7 +97,17 @@
 
         return context
 
-    def ask_llm(self, query: str, metadata_filter: dict = None):
+    def ask_llm(self, query: str, metadata_filter: dict = None) -> str:
+        """
+        Parameters:
+            query (str): The question asked by the user
+            metadata_filter (dict): The metadata filter for the DB
+
+        Returns:
+            str: The response from the LLM model
+
+        Ask the LLM model a question
+        """
         context = self._retrieve(query, metadata_filter)
         print("Retrieved context")
         prompt_template, prompt_string = self._chat_prompt(query, context)
@@ -89,7 +131,16 @@
 
         return self.chat_history.messages[-1].content
 
-    def create_notes(self, documents: List[Document]):
+    def create_notes(self, documents: List[Document]) -> str:
+        """
+        Parameters:
+            documents (List[Document]): The list of documents to create notes from
+
+        Returns:
+            str: The notes generated from the LLM model
+
+        Create notes from the LLM model
+        """
         rules = """
         - Follow the Markdown format for creating notes as shown in the example.
         - The heading of the content should be the title of the markdown file.
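Read together, the docstrings added above imply the following call pattern. This is a minimal sketch, not code from the repo: the DeepLake import path, dataset path, and config values are assumptions, and only DrakeLM's own signatures come from the diff.

from langchain.vectorstores import DeepLake  # assumed import path

from model import DrakeLM

# Hypothetical store and config; only DrakeLM's signatures come from the diff.
db = DeepLake(dataset_path="hub://someuser/drake-db", read_only=True)
drake = DrakeLM(model_path="", db=db, config={}, llm_model="gemini-pro")

# ask_llm() retrieves context via _retrieve() (k=3, cosine distance by
# default), builds the chat prompt, and returns the model's latest reply.
answer = drake.ask_llm("Explain RAG in two lines")

One side note on the new annotations: return hints like -> (PromptTemplate, str) are bare tuples, which Python stores happily but type checkers such as mypy reject; the conventional spelling is Tuple[PromptTemplate, str], and the same applies to the (List[Document], Dict[str, str]) hints in utilis.py below.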
pages/upload_file.py
CHANGED
@@ -16,7 +16,8 @@ if st.button("Youtube/Video URL"):
 st.subheader('Upload the file')
 uploaded_file = st.file_uploader(label="Choose a file", type=['pdf', 'doc'])
 allow_make_notes = st.toggle('Make Complete Notes!')
-llm_model = st.selectbox('Choose LLM Model',
+llm_model = st.selectbox('Choose LLM Model', 'gemini-pro')
+# llm_model = st.selectbox('Choose LLM Model', ('gemini-pro', 'llama'))
 drake.llm_model = llm_model
 
 
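One caveat on the new fallback above: st.selectbox treats its options argument as an iterable, so passing the bare string 'gemini-pro' will most likely render each character as a separate option. A minimal sketch of the safer single-option spelling, under that assumption:

import streamlit as st

# A one-element tuple keeps the dropdown to a single 'gemini-pro' entry
# instead of iterating over the string's characters.
llm_model = st.selectbox('Choose LLM Model', ('gemini-pro',))

The same pattern appears in pages/upload_url.py below.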
pages/upload_url.py
CHANGED
@@ -13,7 +13,8 @@ if st.button("PDF/Transcript"):
 
 st.subheader('Enter the Video URL')
 video_url = st.text_input(label="Enter the URL")
-llm_model = st.selectbox('Choose LLM Model',
+llm_model = st.selectbox('Choose LLM Model', 'gemini-pro')
+# llm_model = st.selectbox('Choose LLM Model', ('gemini-pro', 'llama'))
 drake.llm_model = llm_model
 
 allow_make_notes = st.toggle('Make Complete Notes!')
utilis.py
CHANGED
@@ -13,16 +13,12 @@ from typing import Dict
 import uuid
 
 
-
 class Processing:
-    def __init__(self, dataset_path: str, embedding_model_name: str,
-                 device='cpu', chunk_size=500, chunk_overlap=5):
+    def __init__(self, dataset_path: str, embedding_model_name: str, chunk_size=500, chunk_overlap=5):
         """
         Parameters:
             dataset_path (str): Path to the dataset in the Vector-DB
-            file_path (str): Path to the file to be processed
             embedding_model_name (str): Name of the HuggingFace model to be used for embeddings
-            device (str): Device to run the embedding model on
             chunk_size (int): Size of each chunk to be processed
             chunk_overlap (int): Overlap between each chunk
 
@@ -34,7 +30,6 @@ class Processing:
 
         self.embedding_model = HuggingFaceEmbeddings(
             model_name=embedding_model_name,
-            model_kwargs={'device': device},
            encode_kwargs={'normalize_embeddings': False}
         )
 
@@ -43,8 +38,8 @@ class Processing:
             exec_option="compute_engine"
         )
 
-    def _add_metadata(self, documents: List[Document], url: str, id: str, source: str, file_type: str,
-
+    def _add_metadata(self, documents: List[Document], url: str, id: str, source: str, file_type: str,
+                      course_tag="") -> (List[Document], Dict[str, str]):
         """
         Parameters:
             documents (List[Document]): List of documents to add metadata to
@@ -54,7 +49,7 @@ class Processing:
             course_tag (str): Tag to identify the course the documents belongs to
 
         Returns:
-            documents (List[Document]): List of documents with metadata added
+            documents (List[Document], Dict[str, str]): List of documents with metadata added along with the metadata
 
         Add metadata to the documents
         """
@@ -69,10 +64,10 @@ class Processing:
             doc.metadata = metadata
         return documents, metadata
 
-    def load_pdf(self,
+    def load_pdf(self, text) -> (List[Document], Dict[str, str]):
         """
         Returns:
-
+            documents (List[Document], Dict[str, str]): List of documents with metadata added along with the metadata
 
         Load PDF file, split into chunks and add metadata
         """
@@ -83,7 +78,7 @@ class Processing:
     def load_transcript(self, url) -> (List[Document], Dict[str, str]):
         """
         Returns:
-
+            documents (List[Document], Dict[str, str]): List of documents with metadata added along with the metadata
 
         Load transcript, split into chunks and add metadata
         """
@@ -91,12 +86,13 @@ class Processing:
         print("Transcribed")
         transcript_chunk = self.text_splitter.create_documents([transcript.text])
         print("Created transcript chunks")
-        return self._add_metadata(transcript_chunk, url="NaN", id=str(uuid.uuid4()), source="custom_video",
+        return self._add_metadata(transcript_chunk, url="NaN", id=str(uuid.uuid4()), source="custom_video",
+                                  file_type="transcript")
 
     def load_yt_transcript(self, url) -> (List[Document], Dict[str, str]):
         """
         Returns:
-
+            documents (List[Document], Dict[str, str]): List of documents with metadata added along with the metadata
 
         Load YouTube transcript, split into chunks and add metadata
         """
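Taken together, the loader signatures above read as follows in use. A minimal sketch with placeholder values: the dataset path and embedding model name are illustrative, not from the repo.

from utilis import Processing

# Placeholder arguments; only the signatures come from the diff above.
processing = Processing(
    dataset_path="hub://someuser/drake-db",  # hypothetical DeepLake path
    embedding_model_name="sentence-transformers/all-MiniLM-L6-v2",
    chunk_size=500,
    chunk_overlap=5,
)

# Each loader returns (documents, metadata): the chunked Documents plus the
# metadata dict that _add_metadata stamps onto every chunk.
documents, metadata = processing.load_yt_transcript("https://www.youtube.com/watch?v=...")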