aadi8anant committed on
Commit
6ad2654
·
verified ·
1 Parent(s): b6c5e16

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +131 -0
app.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from PyPDF2 import PdfReader
3
+ from pdf2image import convert_from_path
4
+ from PIL import Image
5
+ import pytesseract
6
+ import fitz # PyMuPDF
7
+ import spacy
8
+ import os
9
+ from groq import Groq
10
+ import configparser
11
+
12
# Load the Groq API key.
# An INI-style ".env" file with a [GROQ] section is tried first. A conventional
# section-less .env (KEY=value) makes configparser raise, and a missing file or
# section makes config.get() raise, so in those cases fall back to the
# GROQ_API_KEY environment variable instead of crashing at import time.
config = configparser.ConfigParser()
try:
    config.read(".env")
    api_key = config.get("GROQ", "GROQ_API_KEY")
except configparser.Error:
    api_key = os.environ.get("GROQ_API_KEY")

if not api_key:
    # Without a key no request can succeed; show a readable message and halt
    # this script run rather than failing later with a traceback.
    st.error(
        "GROQ_API_KEY is not configured. Add it to a [GROQ] section in .env "
        "or set the GROQ_API_KEY environment variable."
    )
    st.stop()

# Initialize spaCy model
nlp = spacy.load("en_core_web_sm")

# Set up the Groq client
client = Groq(api_key=api_key)

# Streamlit app
st.title("DocBot: Smart Document ChatBot")

uploaded_file = st.file_uploader("Upload your file", type=["pdf", "png", "jpg", "jpeg"])
27
+
28
def convert_pdf_to_images(pdf_path, output_folder="temp_images"):
    """Render every page of a PDF to a PNG file and return the file paths.

    Args:
        pdf_path: Path of the PDF to rasterize.
        output_folder: Directory that receives the page images; created if
            it does not exist yet.

    Returns:
        A list of PNG paths named ``page_<index>.png``, in page order.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_folder, exist_ok=True)

    # Second positional argument is the value the original passed (300).
    pages = convert_from_path(pdf_path, 300)

    image_paths = []
    for i, page in enumerate(pages):
        image_path = os.path.join(output_folder, f'page_{i}.png')
        page.save(image_path, 'PNG')
        image_paths.append(image_path)
    return image_paths
38
+
39
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The page texts joined in page order, with no separator. The result is
        an empty string when no page yields any text.
    """
    with open(pdf_path, "rb") as f:
        reader = PdfReader(f)
        # ``or ""`` keeps the join safe if a page yields no text (empty/None),
        # and join avoids repeated string reallocation from ``+=`` in a loop.
        return "".join(page.extract_text() or "" for page in reader.pages)
46
+
47
def extract_text_from_image(image_path):
    """Run OCR on an image file and return the recognized text.

    Args:
        image_path: Path of the image to read.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    # Open the image in a context manager so its file handle is released as
    # soon as OCR finishes instead of lingering until garbage collection.
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image)
49
+
50
def generate_response(user_prompt):
    """Send a prompt to the Groq chat model and return the decorated reply.

    Args:
        user_prompt: Text sent as the ``user`` message. The system prompt is
            sent separately as the ``system`` message.

    Returns:
        The model's reply, stripped and prefixed with the robot symbol, or a
        fixed fallback sentence if the request fails. Failures are also shown
        in the Streamlit UI via ``st.error``.
    """
    system_prompt = "You are a helpful assistant, your name is DocBot, you help with document text"
    chatbot_symbol = "🤖"

    try:
        # The system and user prompts go out as separate chat messages.
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": system_prompt,
                },
                {
                    "role": "user",
                    "content": user_prompt,
                },
            ],
            model="llama3-8b-8192",
        )
        chatbot_response = chat_completion.choices[0].message.content.strip()
    except Exception as e:
        # UI boundary: report any failure to the user rather than letting the
        # script run die with a traceback.
        st.error(f"An error occurred: {e}")
        return "There was an error processing your request."

    return f"{chatbot_symbol} {chatbot_response}"
85
+
86
progress_bar = st.empty()  # Create an empty placeholder for the progress bar

if uploaded_file:
    # Persist the upload to disk because the extraction helpers take a path.
    # The target directory is created on demand (open() fails if it is
    # missing), and only the final path component of the client-supplied name
    # is used so it cannot point outside the uploads folder.
    upload_dir = "uploads"
    os.makedirs(upload_dir, exist_ok=True)
    file_path = os.path.join(upload_dir, os.path.basename(uploaded_file.name))
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    file_extension = os.path.splitext(uploaded_file.name)[1].lower()

    if file_extension == ".pdf":
        st.write("Processing PDF...")
        # Try the embedded text layer first; fall back to OCR when it is empty.
        text_content = extract_text_from_pdf(file_path)
        if not text_content.strip():
            st.write("PDF contains images, using OCR...")
            image_paths = convert_pdf_to_images(file_path)
            text_content = "".join(
                extract_text_from_image(image_path) for image_path in image_paths
            )
        else:
            progress_bar.progress(100)  # Set progress to 100% if text extraction successful
            st.success("PDF read successfully.")
    else:
        st.write("Processing Image...")
        text_content = extract_text_from_image(file_path)

    progress_bar.progress(100)  # Set progress to 100% if text extraction successful

    # The chat UI lives inside the upload branch because text_content only
    # exists once a file has been processed.
    st.subheader("Chat with your document")
    user_query = st.text_input("Ask a question about your document")

    if user_query:
        prompt = f"Document Text: {text_content}\n\nUser Query: {user_query}\n\nAnswer:"
        response = generate_response(prompt)
        st.write("Response: ", response)
122
+
123
# Remove the page images left behind by OCR processing
def cleanup_temp_images(output_folder="temp_images"):
    """Delete every regular file directly inside ``output_folder``.

    Subdirectories and the folder itself are left in place. Nothing happens
    when the folder does not exist.
    """
    if not os.path.exists(output_folder):
        return
    with os.scandir(output_folder) as entries:
        for entry in entries:
            if entry.is_file():
                os.unlink(entry.path)


cleanup_temp_images()