Spaces:

aadi8anant
/

DocBot

Runtime error

File size: 4,576 Bytes

import streamlit as st
from PyPDF2 import PdfReader
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
import fitz  # PyMuPDF
import spacy
import os
from groq import Groq
import configparser

# Check if Iterable is defined in collections.abc, otherwise define it
try:
    from collections.abc import Iterable
except ImportError:
    from collections import Iterable

# Load the Groq API key. ".env" is parsed with configparser (INI syntax), so it
# is expected to hold a [GROQ] section with a GROQ_API_KEY option. When the file
# is absent or lacks that entry, fall back to the GROQ_API_KEY environment
# variable instead of crashing with NoSectionError.
config = configparser.ConfigParser()
try:
    config.read(".env")
except configparser.MissingSectionHeaderError:
    # A plain KEY=value dotenv file has no section header and cannot be parsed
    # as INI; carry on and try the environment variable below.
    pass
api_key = config.get("GROQ", "GROQ_API_KEY", fallback=None) or os.environ.get("GROQ_API_KEY")
if not api_key:
    raise RuntimeError(
        "GROQ_API_KEY is not configured: add it to a [GROQ] section in .env "
        "or set the GROQ_API_KEY environment variable."
    )

# Initialize spaCy model
nlp = spacy.load("en_core_web_sm")

# Set up the Groq client
client = Groq(api_key=api_key)

# Streamlit app
st.title("DocBot: Smart Document ChatBot")

# None until the user picks a file; the processing section below is gated on it.
uploaded_file = st.file_uploader("Upload your file", type=["pdf", "png", "jpg", "jpeg"])

def convert_pdf_to_images(pdf_path, output_folder="temp_images", dpi=300):
    """Render every page of a PDF to a PNG file and return the file paths.

    Args:
        pdf_path: Path of the PDF to rasterize.
        output_folder: Directory the PNG files are written to; created if it
            does not exist yet.
        dpi: Rendering resolution handed to ``convert_from_path``. Defaults to
            300, the value that was previously hard-coded.

    Returns:
        List of PNG paths in page order, named ``page_<index>.png`` with a
        zero-based index.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)
    pages = convert_from_path(pdf_path, dpi)
    image_paths = []
    for i, page in enumerate(pages):
        image_path = os.path.join(output_folder, f'page_{i}.png')
        page.save(image_path, 'PNG')
        image_paths.append(image_path)
    return image_paths

def extract_text_from_pdf(pdf_path):
    """Return the text of all pages of the PDF at ``pdf_path``, concatenated in page order.

    No separator is inserted between pages. A PDF without pages yields ``""``.
    """
    with open(pdf_path, "rb") as pdf_file:
        # Extraction is done while the file handle is still open.
        document = PdfReader(pdf_file)
        return "".join(pdf_page.extract_text() for pdf_page in document.pages)

def extract_text_from_image(image_path):
    """Run Tesseract OCR on the image at ``image_path`` and return the recognized text.

    The image is opened in a context manager so its file handle is released
    as soon as OCR finishes, rather than whenever the object is collected.
    """
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image)

def generate_response(user_prompt):
    """Send ``user_prompt`` to the Groq chat model and return DocBot's reply.

    The system prompt and the user prompt are sent as two separate chat
    messages (roles "system" and "user").

    Args:
        user_prompt: Full text of the user message. The caller in this file
            passes the document text together with the user's question.

    Returns:
        The model's reply, stripped of surrounding whitespace and prefixed
        with the robot symbol. If anything fails, the error is shown in the
        Streamlit UI and a fixed fallback sentence is returned instead.
    """
    try:
        system_prompt = "You are a helpful assistant, your name is DocBot, you help with document text"
        chatbot_symbol = "🤖"

        # Call the Groq model with the system and user messages
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": system_prompt,
                },
                {
                    "role": "user",
                    "content": user_prompt,
                }
            ],
            model="llama3-8b-8192",
        )

        # Only the first choice is used.
        chatbot_response = chat_completion.choices[0].message.content.strip()

        # Prefix the reply with the chatbot symbol
        return f"{chatbot_symbol} {chatbot_response}"
    except Exception as e:
        # UI boundary: report the failure to the user rather than crash the app.
        st.error(f"An error occurred: {e}")
        return "There was an error processing your request."
    
progress_bar = st.empty()  # Placeholder that is later filled with the progress bar

if uploaded_file:
    # Persist the upload to disk because the extraction helpers take file paths.
    # open() does not create missing directories, so make sure "uploads" exists,
    # and keep only the base name so a crafted filename cannot point outside it.
    os.makedirs("uploads", exist_ok=True)
    file_path = os.path.join("uploads", os.path.basename(uploaded_file.name))
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    file_extension = os.path.splitext(uploaded_file.name)[1].lower()

    if file_extension == ".pdf":
        st.write("Processing PDF...")
        # Try the embedded text layer first; fall back to OCR when it is empty.
        text_content = extract_text_from_pdf(file_path)
        if not text_content.strip():
            st.write("PDF contains images, using OCR...")
            image_paths = convert_pdf_to_images(file_path)
            text_content = ""
            for image_path in image_paths:
                text_content += extract_text_from_image(image_path)
        else:
            st.success("PDF read successfully.")
    else:
        st.write("Processing Image...")
        text_content = extract_text_from_image(file_path)

    # Extraction is done on every path (text layer, PDF OCR, or image OCR).
    progress_bar.progress(100)

    st.subheader("Chat with your document")
    user_query = st.text_input("Ask a question about your document")

    if user_query:
        # The whole extracted text is sent along with the question.
        prompt = f"Document Text: {text_content}\n\nUser Query: {user_query}\n\nAnswer:"
        response = generate_response(prompt)
        st.write("Response: ", response)

# Remove the page images left behind by the OCR fallback
def cleanup_temp_images(output_folder="temp_images"):
    """Delete every regular file directly inside ``output_folder``.

    Subdirectories and the folder itself are left in place. Does nothing when
    ``output_folder`` does not exist.
    """
    if not os.path.exists(output_folder):
        return
    candidates = (
        os.path.join(output_folder, entry_name)
        for entry_name in os.listdir(output_folder)
    )
    for candidate in candidates:
        if os.path.isfile(candidate):
            os.unlink(candidate)


cleanup_temp_images()