Spaces:
Running
Running
kausthubkannan17
committed on
Commit
·
8163d1a
1
Parent(s):
1a6ee56
feat: OCR support
Browse files
- model.py +2 -15
- pages/upload_file.py +36 -35
- pages/upload_url.py +1 -3
- requirements.txt +2 -2
- utilis.py +26 -0
model.py
CHANGED
@@ -9,24 +9,15 @@ from langchain_core.documents.base import Document
|
|
9 |
|
10 |
|
11 |
class DrakeLM:
|
12 |
-
def __init__(self, model_path: str, db: DeepLake, config: dict
|
13 |
"""
|
14 |
Parameters:
|
15 |
model_path (str): The path to the model in case running Llama
|
16 |
db (DeepLake): The DeepLake DB object
|
17 |
config (dict): The configuration for the llama model
|
18 |
-
llm_model (str): The LLM model type
|
19 |
|
20 |
Initialize the DrakeLM model
|
21 |
"""
|
22 |
-
self.llm_model = llm_model
|
23 |
-
|
24 |
-
if llm_model == "llama":
|
25 |
-
self.llama = CTransformers(
|
26 |
-
model=model_path,
|
27 |
-
model_type="llama",
|
28 |
-
config=config
|
29 |
-
)
|
30 |
self.gemini = ChatGoogleGenerativeAI(model="gemini-pro", convert_system_message_to_human=True)
|
31 |
self.retriever = db.as_retriever()
|
32 |
self.chat_history = ChatMessageHistory()
|
@@ -123,11 +114,7 @@ class DrakeLM:
|
|
123 |
"""
|
124 |
|
125 |
prompt_template = self.chat_prompt.format(query=query, context=context, rules=rules)
|
126 |
-
|
127 |
-
if self.llm_model == "llama":
|
128 |
-
self.chat_history.add_ai_message(AIMessage(content=self.llama.invoke(prompt_template)))
|
129 |
-
else:
|
130 |
-
self.chat_history.add_ai_message(AIMessage(content=self.gemini.invoke(prompt_template).content))
|
131 |
|
132 |
return self.chat_history.messages[-1].content
|
133 |
|
|
|
9 |
|
10 |
|
11 |
class DrakeLM:
|
12 |
+
def __init__(self, model_path: str, db: DeepLake, config: dict):
|
13 |
"""
|
14 |
Parameters:
|
15 |
model_path (str): The path to the model in case running Llama
|
16 |
db (DeepLake): The DeepLake DB object
|
17 |
config (dict): The configuration for the llama model
|
|
|
18 |
|
19 |
Initialize the DrakeLM model
|
20 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
self.gemini = ChatGoogleGenerativeAI(model="gemini-pro", convert_system_message_to_human=True)
|
22 |
self.retriever = db.as_retriever()
|
23 |
self.chat_history = ChatMessageHistory()
|
|
|
114 |
"""
|
115 |
|
116 |
prompt_template = self.chat_prompt.format(query=query, context=context, rules=rules)
|
117 |
+
self.chat_history.add_ai_message(AIMessage(content=self.gemini.invoke(prompt_template).content))
|
|
|
|
|
|
|
|
|
118 |
|
119 |
return self.chat_history.messages[-1].content
|
120 |
|
pages/upload_file.py
CHANGED
@@ -15,10 +15,9 @@ if st.button("Youtube/Video URL"):
|
|
15 |
|
16 |
st.subheader('Upload the file')
|
17 |
uploaded_file = st.file_uploader(label="Choose a file", type=['pdf', 'doc'])
|
|
|
18 |
allow_make_notes = st.toggle('Make Complete Notes!')
|
19 |
-
|
20 |
-
st.caption("Note: Llama support to be added soon!")
|
21 |
-
drake.llm_model = llm_model
|
22 |
|
23 |
|
24 |
if uploaded_file:
|
@@ -27,45 +26,47 @@ if uploaded_file:
|
|
27 |
# Chunking the file
|
28 |
with st.spinner('Please wait, file is chunking ...'):
|
29 |
try:
|
30 |
-
pdf_stream = io.BytesIO(uploaded_file.
|
31 |
-
pdf_reader = PyPDF2.PdfReader(pdf_stream)
|
32 |
|
33 |
-
|
34 |
-
|
35 |
-
|
|
|
|
|
|
|
|
|
36 |
|
37 |
-
documents, metadata = processing.load_pdf(
|
38 |
st.session_state["metadata"] = metadata
|
39 |
-
st.success("Successfully chunked the file")
|
40 |
|
41 |
except Exception as e:
|
42 |
st.error("Error in chunking")
|
43 |
|
44 |
-
|
45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
try:
|
47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
except Exception as e:
|
49 |
-
st.error("Error in
|
50 |
-
|
51 |
-
# Generating Notes
|
52 |
-
if allow_make_notes:
|
53 |
-
with st.spinner('Please wait, notes are being generated ...'):
|
54 |
-
try:
|
55 |
-
config = {"max_new_tokens": 4096, "context_length": 8192, "temperature": 0.3}
|
56 |
-
notes = drake.create_notes(documents)
|
57 |
-
encoded_text = notes.encode('utf-8')
|
58 |
-
st.success("Notes generated successfully")
|
59 |
-
if st.download_button(
|
60 |
-
label="Download data as Markdown",
|
61 |
-
data=encoded_text,
|
62 |
-
file_name='your_notes.md',
|
63 |
-
mime='text/markdown',
|
64 |
-
):
|
65 |
-
st.switch_page("pages/chat.py")
|
66 |
-
except Exception as e:
|
67 |
-
print(e)
|
68 |
-
st.error("Error in generating notes")
|
69 |
|
70 |
-
|
71 |
-
|
|
|
15 |
|
16 |
st.subheader('Upload the file')
|
17 |
uploaded_file = st.file_uploader(label="Choose a file", type=['pdf', 'doc'])
|
18 |
+
is_scanned = st.toggle("Is the file scanned?")
|
19 |
allow_make_notes = st.toggle('Make Complete Notes!')
|
20 |
+
st.caption("Note: Currently, Drake support Gemini, Llama support to be added soon!")
|
|
|
|
|
21 |
|
22 |
|
23 |
if uploaded_file:
|
|
|
26 |
# Chunking the file
|
27 |
with st.spinner('Please wait, file is chunking ...'):
|
28 |
try:
|
29 |
+
pdf_stream = io.BytesIO(uploaded_file.getvalue())
|
|
|
30 |
|
31 |
+
if is_scanned:
|
32 |
+
text = processing.load_scanned_pdf(uploaded_file.getvalue())
|
33 |
+
else:
|
34 |
+
pdf_reader = PyPDF2.PdfReader(pdf_stream)
|
35 |
+
text = ""
|
36 |
+
for page in pdf_reader.pages:
|
37 |
+
text += page.extract_text()
|
38 |
|
39 |
+
documents, metadata = processing.load_pdf(text)
|
40 |
st.session_state["metadata"] = metadata
|
|
|
41 |
|
42 |
except Exception as e:
|
43 |
st.error("Error in chunking")
|
44 |
|
45 |
+
# Uploading to DB
|
46 |
+
with st.spinner('Please wait, documents uploading ...'):
|
47 |
+
try:
|
48 |
+
processing.upload_to_db(documents)
|
49 |
+
st.success("Successfully uploaded the file")
|
50 |
+
except Exception as e:
|
51 |
+
st.error("Error in uploading")
|
52 |
+
|
53 |
+
# Generating Notes
|
54 |
+
if allow_make_notes:
|
55 |
+
with st.spinner('Please wait, notes are being generated ...'):
|
56 |
try:
|
57 |
+
config = {"max_new_tokens": 4096, "context_length": 8192, "temperature": 0.3}
|
58 |
+
notes = drake.create_notes(documents)
|
59 |
+
encoded_text = notes.encode('utf-8')
|
60 |
+
st.success("Notes generated successfully")
|
61 |
+
if st.download_button(
|
62 |
+
label="Download your notes",
|
63 |
+
data=encoded_text,
|
64 |
+
file_name='your_notes.md',
|
65 |
+
mime='text/markdown',
|
66 |
+
):
|
67 |
+
st.switch_page("pages/chat.py")
|
68 |
except Exception as e:
|
69 |
+
st.error("Error in generating notes", e)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
|
71 |
+
else:
|
72 |
+
st.switch_page("pages/chat.py")
|
pages/upload_url.py
CHANGED
@@ -13,9 +13,7 @@ if st.button("PDF/Transcript"):
|
|
13 |
|
14 |
st.subheader('Enter the Video URL')
|
15 |
video_url = st.text_input(label="Enter the URL")
|
16 |
-
|
17 |
-
st.caption("Note: Llama support to be added soon!")
|
18 |
-
drake.llm_model = llm_model
|
19 |
|
20 |
allow_make_notes = st.toggle('Make Complete Notes!')
|
21 |
|
|
|
13 |
|
14 |
st.subheader('Enter the Video URL')
|
15 |
video_url = st.text_input(label="Enter the URL")
|
16 |
+
st.caption("Note: Currently, Drake support Gemini, Llama support to be added soon!")
|
|
|
|
|
17 |
|
18 |
allow_make_notes = st.toggle('Make Complete Notes!')
|
19 |
|
requirements.txt
CHANGED
@@ -1,10 +1,10 @@
|
|
1 |
PyPDF2
|
|
|
|
|
2 |
streamlit
|
3 |
langchain
|
4 |
deeplake
|
5 |
assemblyai
|
6 |
sentence-transformers
|
7 |
youtube-transcript-api
|
8 |
-
modal
|
9 |
-
ctransformers
|
10 |
langchain-google-genai
|
|
|
1 |
PyPDF2
|
2 |
+
pdf2image
|
3 |
+
pytesseract
|
4 |
streamlit
|
5 |
langchain
|
6 |
deeplake
|
7 |
assemblyai
|
8 |
sentence-transformers
|
9 |
youtube-transcript-api
|
|
|
|
|
10 |
langchain-google-genai
|
utilis.py
CHANGED
@@ -11,6 +11,9 @@ from langchain.prompts.few_shot import FewShotPromptTemplate
|
|
11 |
from langchain.prompts.prompt import PromptTemplate
|
12 |
from typing import Dict
|
13 |
import uuid
|
|
|
|
|
|
|
14 |
|
15 |
|
16 |
class Processing:
|
@@ -75,6 +78,29 @@ class Processing:
|
|
75 |
print("Created document chunks")
|
76 |
return self._add_metadata(pdf_chunk, url="NaN", id=str(uuid.uuid4()), source="document", file_type="pdf")
|
77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
def load_transcript(self, url) -> (List[Document], Dict[str, str]):
|
79 |
"""
|
80 |
Returns:
|
|
|
11 |
from langchain.prompts.prompt import PromptTemplate
|
12 |
from typing import Dict
|
13 |
import uuid
|
14 |
+
from pdf2image import convert_from_bytes
|
15 |
+
import pytesseract
|
16 |
+
from pytesseract import Output
|
17 |
|
18 |
|
19 |
class Processing:
|
|
|
78 |
print("Created document chunks")
|
79 |
return self._add_metadata(pdf_chunk, url="NaN", id=str(uuid.uuid4()), source="document", file_type="pdf")
|
80 |
|
81 |
+
def load_scanned_pdf(self, file) -> str:
    """
    Parameters:
        file (bytes): Raw bytes of the scanned PDF file to be processed

    Returns:
        str: Text extracted from the scanned PDF file, one line per page

    Extract text from scanned PDF file by running OCR on every page image
    """
    images = convert_from_bytes(file)

    page_texts = []
    for image in images:
        # Perform OCR on the image
        ocr_data = pytesseract.image_to_data(image, lang='eng', output_type=Output.DICT)

        # The OCR result also carries entries with empty text (layout rows such as
        # blocks/lines); keep only real words so the page text is not padded with
        # runs of blank tokens
        words = [word for word in ocr_data['text'] if word.strip()]
        page_texts.append(" ".join(words) + "\n")

    # Join once at the end instead of growing a string inside the loop
    return "".join(page_texts)
|
103 |
+
|
104 |
def load_transcript(self, url) -> (List[Document], Dict[str, str]):
|
105 |
"""
|
106 |
Returns:
|