Spaces:

aadi8anant
/

DocBot

Runtime error

App Files Files Community

aadi8anant commited on Jun 13, 2024

Commit

6ad2654

verified ·

1 Parent(s): b6c5e16

Create app.py

Browse files

Files changed (1) hide show

app.py +131 -0

app.py ADDED Viewed

	@@ -0,0 +1,131 @@

+import streamlit as st
+from PyPDF2 import PdfReader
+from pdf2image import convert_from_path
+from PIL import Image
+import pytesseract
+import fitz  # PyMuPDF
+import spacy
+import os
+from groq import Groq
+import configparser
+# Load environment variables from .env file
+config = configparser.ConfigParser()
+config.read(".env")
+api_key = config.get("GROQ", "GROQ_API_KEY")
+# Initialize spaCy model
+nlp = spacy.load("en_core_web_sm")
+# Set up the Groq client
+client = Groq(api_key=api_key)
+# Streamlit app
+st.title("DocBot: Smart Document ChatBot")
+uploaded_file = st.file_uploader("Upload your file", type=["pdf", "png", "jpg", "jpeg"])
+def convert_pdf_to_images(pdf_path, output_folder="temp_images"):
+    if not os.path.exists(output_folder):
+        os.makedirs(output_folder)
+    pages = convert_from_path(pdf_path, 300)
+    image_paths = []
+    for i, page in enumerate(pages):
+        image_path = os.path.join(output_folder, f'page_{i}.png')
+        page.save(image_path, 'PNG')
+        image_paths.append(image_path)
+    return image_paths
+def extract_text_from_pdf(pdf_path):
+    text = ""
+    with open(pdf_path, "rb") as f:
+        reader = PdfReader(f)
+        for page in reader.pages:
+            text += page.extract_text()
+    return text
+def extract_text_from_image(image_path):
+    return pytesseract.image_to_string(Image.open(image_path))
+def generate_response(user_prompt):
+    try:
+        # Define the system prompt
+        system_prompt = "You are a helpful assistant, your name is DocBot, you help with document text"
+        chatbot_symbol = "🤖"
+        # Concatenate the system and user prompts
+        prompt = f"{system_prompt}\n\nUser Query: {user_prompt}\n\nAnswer:"
+        # Call the Groq model with the combined prompt
+        chat_completion = client.chat.completions.create(
+            messages=[
+                {
+                    "role": "system",
+                    "content": system_prompt,
+                },
+                {
+                    "role": "user",
+                    "content": user_prompt,
+                }
+            ],
+            model="llama3-8b-8192",
+        )
+        # Get the chatbot's response
+        chatbot_response = chat_completion.choices[0].message.content.strip()
+        # Add the chatbot symbol to the response
+        chatbot_response_with_symbol = f"{chatbot_symbol} {chatbot_response}"
+        # Return the response
+        return chatbot_response_with_symbol
+    except Exception as e:
+        st.error(f"An error occurred: {e}")
+        return "There was an error processing your request."
+progress_bar = st.empty()  # Create an empty placeholder for the progress bar
+if uploaded_file:
+    file_path = os.path.join("uploads", uploaded_file.name)
+    with open(file_path, "wb") as f:
+        f.write(uploaded_file.getbuffer())
+    file_extension = os.path.splitext(uploaded_file.name)[1].lower()
+    if file_extension == ".pdf":
+        st.write("Processing PDF...")
+        # Check if PDF contains text or images
+        text_content = extract_text_from_pdf(file_path)
+        if not text_content.strip():
+            st.write("PDF contains images, using OCR...")
+            image_paths = convert_pdf_to_images(file_path)
+            text_content = ""
+            for image_path in image_paths:
+                text_content += extract_text_from_image(image_path)
+        else:
+            progress_bar.progress(100)  # Set progress to 100% if text extraction successful
+            st.success("PDF read successfully.")
+    else:
+        st.write("Processing Image...")
+        text_content = extract_text_from_image(file_path)
+    progress_bar.progress(100)  # Set progress to 100% if text extraction successful
+    st.subheader("Chat with your document")
+    user_query = st.text_input("Ask a question about your document")
+    if user_query:
+        prompt = f"Document Text: {text_content}\n\nUser Query: {user_query}\n\nAnswer:"
+        response = generate_response(prompt)
+        st.write("Response: ", response)
+# Clean up temp images
+def cleanup_temp_images(output_folder="temp_images"):
+    if os.path.exists(output_folder):
+        for file in os.listdir(output_folder):
+            file_path = os.path.join(output_folder, file)
+            if os.path.isfile(file_path):
+                os.unlink(file_path)
+cleanup_temp_images()