Spaces:

kausthubkannan17
/

Drake

Running

App Files Files Community

kausthubkannan17 committed on Apr 12, 2024

Commit

8163d1a

1 Parent(s): 1a6ee56

feat: OCR support

Browse files

Files changed (5) hide show

model.py +2 -15
pages/upload_file.py +36 -35
pages/upload_url.py +1 -3
requirements.txt +2 -2
utilis.py +26 -0

model.py CHANGED Viewed

@@ -9,24 +9,15 @@ from langchain_core.documents.base import Document
 class DrakeLM:
-    def __init__(self, model_path: str, db: DeepLake, config: dict, llm_model="gemini-pro"):
         """
         Parameters:
             model_path (str): The path to the model in case running Llama
             db (DeepLake): The DeepLake DB object
             config (dict): The configuration for the llama model
-            llm_model (str): The LLM model type
         Initialize the DrakeLM model
         """
-        self.llm_model = llm_model
-        if llm_model == "llama":
-            self.llama = CTransformers(
-                model=model_path,
-                model_type="llama",
-                config=config
-            )
         self.gemini = ChatGoogleGenerativeAI(model="gemini-pro", convert_system_message_to_human=True)
         self.retriever = db.as_retriever()
         self.chat_history = ChatMessageHistory()
@@ -123,11 +114,7 @@ class DrakeLM:
         """
         prompt_template = self.chat_prompt.format(query=query, context=context, rules=rules)
-        if self.llm_model == "llama":
-            self.chat_history.add_ai_message(AIMessage(content=self.llama.invoke(prompt_template)))
-        else:
-            self.chat_history.add_ai_message(AIMessage(content=self.gemini.invoke(prompt_template).content))
         return self.chat_history.messages[-1].content

 class DrakeLM:
+    def __init__(self, model_path: str, db: DeepLake, config: dict):
         """
         Parameters:
             model_path (str): The path to the model in case running Llama
             db (DeepLake): The DeepLake DB object
             config (dict): The configuration for the llama model
         Initialize the DrakeLM model
         """
         self.gemini = ChatGoogleGenerativeAI(model="gemini-pro", convert_system_message_to_human=True)
         self.retriever = db.as_retriever()
         self.chat_history = ChatMessageHistory()
         """
         prompt_template = self.chat_prompt.format(query=query, context=context, rules=rules)
+        self.chat_history.add_ai_message(AIMessage(content=self.gemini.invoke(prompt_template).content))
         return self.chat_history.messages[-1].content

pages/upload_file.py CHANGED Viewed

@@ -15,10 +15,9 @@ if st.button("Youtube/Video URL"):
 st.subheader('Upload the file')
 uploaded_file = st.file_uploader(label="Choose a file", type=['pdf', 'doc'])
 allow_make_notes = st.toggle('Make Complete Notes!')
-llm_model = st.selectbox('Choose LLM Model', ('gemini-pro', 'llama'))
-st.caption("Note: Llama support to be added soon!")
-drake.llm_model = llm_model
 if uploaded_file:
@@ -27,45 +26,47 @@ if uploaded_file:
         # Chunking the file
         with st.spinner('Please wait, file is chunking ...'):
             try:
-                pdf_stream = io.BytesIO(uploaded_file.read())
-                pdf_reader = PyPDF2.PdfReader(pdf_stream)
-                text = ""
-                for page in pdf_reader.pages:
-                    text += page.extract_text()
-                documents, metadata = processing.load_pdf("hello world", text)
                 st.session_state["metadata"] = metadata
-                st.success("Successfully chunked the file")
             except Exception as e:
                 st.error("Error in chunking")
-            # Uploading to DB
-            with st.spinner('Please wait, file is uploading ...'):
                 try:
-                    processing.upload_to_db(documents)
                 except Exception as e:
-                    st.error("Error in uploading")
-                # Generating Notes
-                if allow_make_notes:
-                    with st.spinner('Please wait, notes are being generated ...'):
-                        try:
-                            config = {"max_new_tokens": 4096, "context_length": 8192, "temperature": 0.3}
-                            notes = drake.create_notes(documents)
-                            encoded_text = notes.encode('utf-8')
-                            st.success("Notes generated successfully")
-                            if st.download_button(
-                                    label="Download data as Markdown",
-                                    data=encoded_text,
-                                    file_name='your_notes.md',
-                                    mime='text/markdown',
-                            ):
-                                st.switch_page("pages/chat.py")
-                        except Exception as e:
-                            print(e)
-                            st.error("Error in generating notes")
-                else:
-                    st.switch_page("pages/chat.py")

 st.subheader('Upload the file')
 uploaded_file = st.file_uploader(label="Choose a file", type=['pdf', 'doc'])
+is_scanned = st.toggle("Is the file scanned?")
 allow_make_notes = st.toggle('Make Complete Notes!')
+st.caption("Note: Currently, Drake support Gemini, Llama support to be added soon!")
 if uploaded_file:
         # Chunking the file
         with st.spinner('Please wait, file is chunking ...'):
             try:
+                pdf_stream = io.BytesIO(uploaded_file.getvalue())
+                if is_scanned:
+                    text = processing.load_scanned_pdf(uploaded_file.getvalue())
+                else:
+                    pdf_reader = PyPDF2.PdfReader(pdf_stream)
+                    text = ""
+                    for page in pdf_reader.pages:
+                        text += page.extract_text()
+                documents, metadata = processing.load_pdf(text)
                 st.session_state["metadata"] = metadata
             except Exception as e:
                 st.error("Error in chunking")
+        # Uploading to DB
+        with st.spinner('Please wait, documents uploading ...'):
+            try:
+                processing.upload_to_db(documents)
+                st.success("Successfully uploaded the file")
+            except Exception as e:
+                st.error("Error in uploading")
+        # Generating Notes
+        if allow_make_notes:
+            with st.spinner('Please wait, notes are being generated ...'):
                 try:
+                    config = {"max_new_tokens": 4096, "context_length": 8192, "temperature": 0.3}
+                    notes = drake.create_notes(documents)
+                    encoded_text = notes.encode('utf-8')
+                    st.success("Notes generated successfully")
+                    if st.download_button(
+                            label="Download your notes",
+                            data=encoded_text,
+                            file_name='your_notes.md',
+                            mime='text/markdown',
+                    ):
+                        st.switch_page("pages/chat.py")
                 except Exception as e:
+                    st.error("Error in generating notes", e)
+        else:
+            st.switch_page("pages/chat.py")

pages/upload_url.py CHANGED Viewed

@@ -13,9 +13,7 @@ if st.button("PDF/Transcript"):
 st.subheader('Enter the Video URL')
 video_url = st.text_input(label="Enter the URL")
-llm_model = st.selectbox('Choose LLM Model', ('gemini-pro', 'llama'))
-st.caption("Note: Llama support to be added soon!")
-drake.llm_model = llm_model
 allow_make_notes = st.toggle('Make Complete Notes!')

 st.subheader('Enter the Video URL')
 video_url = st.text_input(label="Enter the URL")
+st.caption("Note: Currently, Drake support Gemini, Llama support to be added soon!")
 allow_make_notes = st.toggle('Make Complete Notes!')

requirements.txt CHANGED Viewed

@@ -1,10 +1,10 @@
 PyPDF2
 streamlit
 langchain
 deeplake
 assemblyai
 sentence-transformers
 youtube-transcript-api
-modal
-ctransformers
 langchain-google-genai

 PyPDF2
+pdf2image
+pytesseract
 streamlit
 langchain
 deeplake
 assemblyai
 sentence-transformers
 youtube-transcript-api
 langchain-google-genai

utilis.py CHANGED Viewed

@@ -11,6 +11,9 @@ from langchain.prompts.few_shot import FewShotPromptTemplate
 from langchain.prompts.prompt import PromptTemplate
 from typing import Dict
 import uuid
 class Processing:
@@ -75,6 +78,29 @@ class Processing:
         print("Created document chunks")
         return self._add_metadata(pdf_chunk, url="NaN", id=str(uuid.uuid4()), source="document", file_type="pdf")
     def load_transcript(self, url) -> (List[Document], Dict[str, str]):
         """
         Returns:

 from langchain.prompts.prompt import PromptTemplate
 from typing import Dict
 import uuid
+from pdf2image import convert_from_bytes
+import pytesseract
+from pytesseract import Output
 class Processing:
         print("Created document chunks")
         return self._add_metadata(pdf_chunk, url="NaN", id=str(uuid.uuid4()), source="document", file_type="pdf")
+    def load_scanned_pdf(self, file) -> str:
+        """
+        Parameters:
+            file (File): Scanned PDF file to be processed
+        Returns:
+            str: Text extracted from the scanned PDF file
+        Extract text from scanned PDF file
+        """
+        images = convert_from_bytes(file)
+        all_text = ""
+        for image in images:
+            # Perform OCR on the image
+            text = pytesseract.image_to_data(image, lang='eng', output_type=Output.DICT)
+            # Extract text from the dictionary
+            page_text = " ".join(text['text'])
+            all_text += page_text + "\n"
+        return all_text
     def load_transcript(self, url) -> (List[Document], Dict[str, str]):
         """
         Returns: