Spaces:
Runtime error
Runtime error
File size: 4,576 Bytes
6ad2654 c78bbe1 6ad2654 46a7ece |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
import streamlit as st
from PyPDF2 import PdfReader
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
import fitz # PyMuPDF
import spacy
import os
from groq import Groq
import configparser
# Check if Iterable is defined in collections.abc, otherwise define it
try:
from collections.abc import Iterable
except ImportError:
from collections import Iterable
# Load the Groq API key.
# The ".env" file is parsed as an INI file and is expected to contain a
# [GROQ] section with a GROQ_API_KEY option. When the file is missing,
# malformed, or lacks that option, fall back to the GROQ_API_KEY environment
# variable so the app can also run where secrets are injected via the
# environment instead of a file.
config = configparser.ConfigParser()
try:
    config.read(".env")
except configparser.Error:
    # e.g. a plain KEY=value file without a section header; the environment
    # variable fallback below still gets a chance.
    pass
api_key = (
    config.get("GROQ", "GROQ_API_KEY", fallback=None)
    or os.environ.get("GROQ_API_KEY")
)
if not api_key:
    # Without a key the Groq client cannot be used; show a readable message
    # and halt this script run instead of crashing with a traceback.
    st.error(
        "GROQ_API_KEY is not configured. Add it to the [GROQ] section of "
        ".env or set the GROQ_API_KEY environment variable."
    )
    st.stop()
# Initialize spaCy model
nlp = spacy.load("en_core_web_sm")
# Set up the Groq client
client = Groq(api_key=api_key)
# Streamlit app
st.title("DocBot: Smart Document ChatBot")
uploaded_file = st.file_uploader("Upload your file", type=["pdf", "png", "jpg", "jpeg"])
def convert_pdf_to_images(pdf_path, output_folder="temp_images", dpi=300):
    """Render every page of a PDF to a PNG file and return the file paths.

    Args:
        pdf_path: Path of the PDF file to rasterize.
        output_folder: Directory that receives the page images. It is
            created when it does not exist yet.
        dpi: Resolution handed to ``convert_from_path``. Defaults to 300,
            the value that was previously hard-coded.

    Returns:
        A list of image paths in page order, named ``page_<index>.png``
        with a zero-based index.
    """
    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the same directory.
    os.makedirs(output_folder, exist_ok=True)
    pages = convert_from_path(pdf_path, dpi)
    image_paths = []
    for i, page in enumerate(pages):
        image_path = os.path.join(output_folder, f'page_{i}.png')
        page.save(image_path, 'PNG')
        image_paths.append(image_path)
    return image_paths
def extract_text_from_pdf(pdf_path):
    """Return the concatenated embedded text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined in page order. The result is an empty
        string when no page yields any text, which the caller uses as the
        signal to fall back to OCR.
    """
    with open(pdf_path, "rb") as f:
        reader = PdfReader(f)
        # Pages are read from the open file, so extraction must finish
        # before the ``with`` block closes it. ``or ""`` guards against a
        # page yielding None instead of a string, which would otherwise
        # raise TypeError during concatenation.
        return "".join(page.extract_text() or "" for page in reader.pages)
def extract_text_from_image(image_path):
    """Run OCR on an image file and return the recognized text.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    # Open the image in a context manager so the underlying file handle is
    # released as soon as OCR finishes instead of lingering until garbage
    # collection.
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image)
def generate_response(user_prompt, model="llama3-8b-8192"):
    """Send a prompt to the Groq chat model and return the decorated reply.

    Args:
        user_prompt: Text sent as the ``user`` message.
        model: Identifier of the Groq model to query. Defaults to the model
            that was previously hard-coded.

    Returns:
        The model's reply, stripped of surrounding whitespace and prefixed
        with the robot symbol. If anything fails, the error is shown in the
        Streamlit UI and a fixed fallback sentence is returned instead.
    """
    try:
        # Define the system prompt
        system_prompt = "You are a helpful assistant, your name is DocBot, you help with document text"
        chatbot_symbol = "🤖"
        # The system and user prompts travel as separate chat messages, so
        # no combined prompt string is built here.
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": system_prompt,
                },
                {
                    "role": "user",
                    "content": user_prompt,
                },
            ],
            model=model,
        )
        # Get the chatbot's response
        chatbot_response = chat_completion.choices[0].message.content.strip()
        # Add the chatbot symbol to the response
        return f"{chatbot_symbol} {chatbot_response}"
    except Exception as e:
        # UI boundary: report the failure to the user rather than letting
        # the whole script run die with a traceback.
        st.error(f"An error occurred: {e}")
        return "There was an error processing your request."
progress_bar = st.empty()  # Create an empty placeholder for the progress bar
if uploaded_file:
    # The upload directory is not guaranteed to exist on a fresh checkout
    # or deployment; without it the open() below fails.
    os.makedirs("uploads", exist_ok=True)
    # Keep only the final path component so a crafted file name cannot
    # point outside the upload directory.
    safe_name = os.path.basename(uploaded_file.name)
    file_path = os.path.join("uploads", safe_name)
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    file_extension = os.path.splitext(safe_name)[1].lower()
    if file_extension == ".pdf":
        st.write("Processing PDF...")
        # Try the embedded text layer first; OCR is only needed when the
        # PDF yields no text at all.
        text_content = extract_text_from_pdf(file_path)
        if not text_content.strip():
            st.write("PDF contains images, using OCR...")
            image_paths = convert_pdf_to_images(file_path)
            text_content = "".join(
                extract_text_from_image(image_path) for image_path in image_paths
            )
        else:
            progress_bar.progress(100)  # Set progress to 100% if text extraction successful
            st.success("PDF read successfully.")
    else:
        st.write("Processing Image...")
        text_content = extract_text_from_image(file_path)
        progress_bar.progress(100)  # Set progress to 100% if text extraction successful
    st.subheader("Chat with your document")
    user_query = st.text_input("Ask a question about your document")
    if user_query:
        # The whole extracted document text is sent along with the question.
        prompt = f"Document Text: {text_content}\n\nUser Query: {user_query}\n\nAnswer:"
        response = generate_response(prompt)
        st.write("Response: ", response)
# Clean up temp images
def cleanup_temp_images(output_folder="temp_images"):
    """Delete every regular file located directly inside *output_folder*.

    Subdirectories and their contents are left untouched, and nothing
    happens when the folder does not exist.
    """
    if not os.path.exists(output_folder):
        return
    candidates = (
        os.path.join(output_folder, entry) for entry in os.listdir(output_folder)
    )
    for candidate in candidates:
        if os.path.isfile(candidate):
            os.unlink(candidate)


# Runs on every script execution so page images from a previous OCR pass
# do not accumulate.
cleanup_temp_images()
|