Spaces:
Runtime error
Runtime error
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from PyPDF2 import PdfReader
|
3 |
+
from pdf2image import convert_from_path
|
4 |
+
from PIL import Image
|
5 |
+
import pytesseract
|
6 |
+
import fitz # PyMuPDF
|
7 |
+
import spacy
|
8 |
+
import os
|
9 |
+
from groq import Groq
|
10 |
+
import configparser
|
11 |
+
|
12 |
+
# Load the Groq API key.
# The key is read from an INI-style ".env" file ([GROQ] section, GROQ_API_KEY
# option). ConfigParser.read() silently skips files that do not exist, so a
# plain config.get() would raise NoSectionError when the file is absent; fall
# back to the GROQ_API_KEY process environment variable in that case.
config = configparser.ConfigParser()
config.read(".env")
api_key = config.get("GROQ", "GROQ_API_KEY", fallback=None) or os.environ.get(
    "GROQ_API_KEY"
)
if not api_key:
    # Fail with a readable message instead of an opaque traceback.
    st.error(
        "GROQ_API_KEY is not configured. Add it to the [GROQ] section of .env "
        "or set the GROQ_API_KEY environment variable."
    )
    st.stop()

# Initialize spaCy model
nlp = spacy.load("en_core_web_sm")

# Set up the Groq client
client = Groq(api_key=api_key)

# Streamlit app
st.title("DocBot: Smart Document ChatBot")

uploaded_file = st.file_uploader("Upload your file", type=["pdf", "png", "jpg", "jpeg"])
|
27 |
+
|
28 |
+
def convert_pdf_to_images(pdf_path, output_folder="temp_images", dpi=300):
    """Render each page of a PDF to a PNG file and return the file paths.

    Args:
        pdf_path: Path of the PDF file to render.
        output_folder: Directory that receives the PNG files; created if it
            does not exist yet.
        dpi: Resolution passed to ``convert_from_path`` (defaults to 300, the
            value that was previously hard-coded).

    Returns:
        A list of paths ``<output_folder>/page_<i>.png``, one per page, in
        page order (``i`` starts at 0).
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)
    pages = convert_from_path(pdf_path, dpi)
    image_paths = []
    for i, page in enumerate(pages):
        image_path = os.path.join(output_folder, f"page_{i}.png")
        page.save(image_path, "PNG")
        image_paths.append(image_path)
    return image_paths
|
38 |
+
|
39 |
+
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at *pdf_path*.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The page texts joined in page order with no separator. The result is
        an empty string when no page yields any text.
    """
    with open(pdf_path, "rb") as f:
        reader = PdfReader(f)
        # Join inside the ``with`` block: pages are read from the open file.
        # ``or ""`` guards against a page whose extract_text() yields None
        # (presumably possible for image-only pages in some PyPDF2 versions),
        # which would otherwise break string concatenation.
        return "".join(page.extract_text() or "" for page in reader.pages)
|
46 |
+
|
47 |
+
def extract_text_from_image(image_path):
    """Run Tesseract OCR on the image at *image_path* and return its text.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    # Open the image as a context manager so its file handle is released as
    # soon as OCR finishes instead of whenever the object is garbage-collected.
    with Image.open(image_path) as img:
        return pytesseract.image_to_string(img)
|
49 |
+
|
50 |
+
def generate_response(user_prompt):
    """Send *user_prompt* to the Groq chat model and return the decorated reply.

    The request carries a fixed system prompt plus *user_prompt* as the user
    message.

    Args:
        user_prompt: Text sent as the user message.

    Returns:
        The model's reply, stripped of surrounding whitespace and prefixed
        with the robot symbol. If anything raises, the exception is shown via
        ``st.error`` and a fixed fallback sentence is returned instead.
    """
    try:
        # Define the system prompt
        system_prompt = "You are a helpful assistant, your name is DocBot, you help with document text"
        chatbot_symbol = "🤖"

        # The system and user prompts travel as separate chat messages, so no
        # combined prompt string is needed.
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": system_prompt,
                },
                {
                    "role": "user",
                    "content": user_prompt,
                },
            ],
            model="llama3-8b-8192",
        )

        # Get the chatbot's response
        chatbot_response = chat_completion.choices[0].message.content.strip()

        # Add the chatbot symbol to the response
        return f"{chatbot_symbol} {chatbot_response}"
    except Exception as e:
        # UI boundary: surface the failure to the user rather than crash the app.
        st.error(f"An error occurred: {e}")
        return "There was an error processing your request."
|
85 |
+
|
86 |
+
progress_bar = st.empty()  # Create an empty placeholder for the progress bar

# NOTE(review): nesting below is reconstructed from data flow (text_content is
# only assigned inside the upload branch) — confirm against the original file.
if uploaded_file:
    # Persist the upload to disk because the extraction helpers take paths.
    # The target directory must exist before open(); basename() drops any
    # directory components from the client-supplied file name so the file is
    # always written inside "uploads".
    os.makedirs("uploads", exist_ok=True)
    file_path = os.path.join("uploads", os.path.basename(uploaded_file.name))
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    file_extension = os.path.splitext(uploaded_file.name)[1].lower()

    if file_extension == ".pdf":
        st.write("Processing PDF...")
        # Check if PDF contains text or images
        text_content = extract_text_from_pdf(file_path)
        if not text_content.strip():
            # No extractable text layer: render the pages and OCR them.
            st.write("PDF contains images, using OCR...")
            image_paths = convert_pdf_to_images(file_path)
            text_content = "".join(
                extract_text_from_image(image_path) for image_path in image_paths
            )
        else:
            progress_bar.progress(100)  # Set progress to 100% if text extraction successful
            st.success("PDF read successfully.")
    else:
        st.write("Processing Image...")
        text_content = extract_text_from_image(file_path)

    progress_bar.progress(100)  # Set progress to 100% if text extraction successful

    st.subheader("Chat with your document")
    user_query = st.text_input("Ask a question about your document")

    if user_query:
        # The whole extracted document text is sent along with the question.
        prompt = f"Document Text: {text_content}\n\nUser Query: {user_query}\n\nAnswer:"
        response = generate_response(prompt)
        st.write("Response: ", response)
|
122 |
+
|
123 |
+
# Clean up temp images
def cleanup_temp_images(output_folder="temp_images"):
    """Delete the regular files directly inside *output_folder*.

    Subdirectories (and anything inside them) and the folder itself are left
    in place. If *output_folder* does not exist or is not a directory, the
    call does nothing.

    Args:
        output_folder: Directory whose files are removed.
    """
    # isdir() rather than exists(): a regular file at this path would make
    # the directory listing raise NotADirectoryError.
    if not os.path.isdir(output_folder):
        return
    # Collect the paths first so nothing is deleted while the directory
    # iterator is still open.
    with os.scandir(output_folder) as entries:
        file_paths = [entry.path for entry in entries if entry.is_file()]
    for file_path in file_paths:
        os.unlink(file_path)


cleanup_temp_images()
|