flutterbasit committed on
Commit
2951be9
·
verified ·
1 Parent(s): 8a39816

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +204 -0
app.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from langchain.text_splitter import CharacterTextSplitter
3
+ from sentence_transformers import SentenceTransformer
4
+ import faiss
5
+ from PyPDF2 import PdfReader
6
+ from docx import Document
7
+ from transformers import pipeline # Hugging Face for summarization
8
+
9
# Initialize Sentence Transformer for embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Vector Store (FAISS).
# Derive the embedding size from the model instead of hard-coding it, so
# swapping the embedding model cannot silently desync the FAISS index.
# (all-MiniLM-L6-v2 yields 384, identical to the previous constant.)
dimension = model.get_sentence_embedding_dimension()
index = faiss.IndexFlatL2(dimension)

# Initialize Hugging Face summarization model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
18
+
19
# Function to extract text from PDFs
def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page of the PDF at *file_path*.

    PyPDF2's ``page.extract_text()`` returns ``None`` for pages with no
    extractable text (e.g. scanned images); guard with ``or ""`` so the
    concatenation never raises ``TypeError``.
    """
    reader = PdfReader(file_path)
    # str.join avoids quadratic += concatenation across many pages.
    return "".join(page.extract_text() or "" for page in reader.pages)
26
+
27
# Function to extract text from DOCX
def extract_text_from_docx(file_path):
    """Return all paragraph text from the DOCX at *file_path*.

    Each paragraph is followed by a newline (including the last one),
    matching the original ``+=`` loop, but built with ``join`` to avoid
    quadratic string concatenation.
    """
    doc = Document(file_path)
    return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)
34
+
35
# Function to process files
def process_files(files):
    """Extract text from each uploaded file.

    Args:
        files: iterable of upload objects exposing a ``.name`` path
            (gradio ``File`` values) -- TODO confirm shape against caller.

    Returns:
        list[str]: one extracted-text string per supported file. Files with
        unrecognized extensions are silently skipped (original behavior).
    """
    texts = []
    for file in files:
        # Case-insensitive match so "REPORT.PDF" is not silently dropped.
        name = file.name.lower()
        if name.endswith('.pdf'):
            texts.append(extract_text_from_pdf(file.name))
        elif name.endswith('.docx'):
            texts.append(extract_text_from_docx(file.name))
    return texts
44
+
45
# Function to tokenize and chunk text
def chunk_text(text, chunk_size=500, overlap=50):
    """Split *text* into overlapping chunks using LangChain's character splitter."""
    splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
    )
    return splitter.split_text(text)
49
+
50
# Function to create embeddings and populate FAISS index
def create_embeddings_and_store(chunks):
    """Rebuild the global FAISS index from *chunks*.

    The index is reset on every call so stale vectors from a previous
    upload never leak into the current query session.
    """
    global index
    index = faiss.IndexFlatL2(dimension)
    if not chunks:
        # Nothing to embed; leave a fresh, empty index rather than calling
        # the model with an empty batch.
        return
    # Batch-encode once instead of one model call per chunk: identical
    # vectors, far fewer forward passes.
    embeddings = model.encode(chunks).astype('float32')  # FAISS requires float32
    index.add(embeddings)
59
+
60
# Function for summarizing the text before sending
def summarize_text(text):
    """Condense *text* with the BART summarizer (deterministic decoding)."""
    result = summarizer(
        text,
        max_length=300,
        min_length=100,
        do_sample=False,
    )
    return result[0]['summary_text']
64
+
65
# Function to dynamically truncate context to fit the Groq API's token limit
def truncate_context(context, max_tokens=4000):
    """Clip *context* to at most *max_tokens* characters.

    NOTE(review): despite the name, this counts *characters*, not tokens --
    a crude proxy for Groq's token limit. Confirm whether a real tokenizer
    should be used here.
    """
    return context if len(context) <= max_tokens else context[:max_tokens]
70
+
71
# Function to query Groq with context and question
def query_groq(question, context):
    """Ask the Groq LLM *question*, grounded in *context*.

    Returns the model's answer text, or an "Error: ..." string on any
    validation or API failure (callers display this string directly in
    the UI, so all failures are reported in-band rather than raised).
    """
    try:
        if not question.strip():
            return "Error: Question is empty or invalid."
        if not context.strip():
            return "Error: No context available from the uploaded documents."

        # NOTE(review): `client` is never created anywhere in this file --
        # there is no Groq import or API-key setup. Without this guard every
        # real call surfaced a raw NameError; fail with a clear message and
        # flag the missing initialization instead.
        if "client" not in globals():
            return ("Error: Groq client is not initialized. Create a Groq "
                    "client named `client` before querying.")

        # Dynamically truncate context to fit within the token limit
        max_context_tokens = 4000  # Groq's token limit for context
        context = truncate_context(context, max_tokens=max_context_tokens)

        # Query Groq API with the truncated context
        chat_completion = client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful assistant. Use the context provided to answer the question."},
                {"role": "assistant", "content": context},
                {"role": "user", "content": question},
            ],
            model="llama3-8b-8192",
            stream=False,
        )
        if chat_completion and chat_completion.choices:
            return chat_completion.choices[0].message.content
        return "Error: Received an unexpected response from Groq API."
    except Exception as e:
        return f"Error: {str(e)}"
95
+
96
# Function to handle RAG pipeline
def rag_pipeline(files, question, summarize_before_sending=False):
    """End-to-end flow: extract text, optionally summarize, truncate,
    embed into FAISS, then answer *question* with the Groq LLM.

    Any failure is returned as an "Error: ..." string for the UI.
    """
    try:
        # Guard clauses: nothing to do without uploads or extracted text.
        if not files:
            return "Error: No files uploaded. Please upload at least one document."

        extracted = process_files(files)
        if not extracted:
            return "Error: Could not extract text from the uploaded files."

        # One flat context string covering every uploaded document.
        context = " ".join(extracted)

        if summarize_before_sending:
            # Shrink the token count before hitting the API.
            context = summarize_text(context)

        # Keep the payload within Groq's context budget.
        context = truncate_context(context, max_tokens=4000)

        # Index the (truncated) context for this session.
        create_embeddings_and_store(chunk_text(context))

        # Query Groq LLM with context and question.
        return query_groq(question, context)
    except Exception as e:
        return f"Error: {str(e)}"
127
+
128
# Enhanced UI with modern and clean style.
# NOTE(review): the Blocks nesting below is reconstructed from a diff whose
# indentation was lost -- confirm the intended layout against the running app.
with gr.Blocks() as app:
    with gr.Row():
        # Left Column for instructions
        with gr.Column(scale=1, min_width=250):
            gr.Markdown("""
<div style="background: linear-gradient(145deg, #6e7dff, #1c2b58); padding: 30px; border-radius: 12px; box-shadow: 0 5px 15px rgba(0, 0, 0, 0.1); font-family: 'Roboto', sans-serif;">
<h2 style="color: #fff; font-size: 32px; font-weight: bold;">DocAI: Document Assistant</h2>
<p style="color: #ddd; font-size: 18px;">Welcome to DocAI! Upload your documents and get intelligent answers based on their content.</p>
<p style="color: #ddd; font-size: 16px; line-height: 1.6;"><strong>Steps to use:</strong></p>
<ul style="color: #ddd; font-size: 16px; line-height: 1.6;">
<li>Upload your PDF or DOCX files.</li>
<li>Ask questions related to the document.</li>
<li>Enable "Summarize Before Sending" for a brief summary of the document.</li>
<li>Click "Submit" to get your answers.</li>
</ul>
<p style="color: #ddd; font-size: 16px; line-height: 1.6;">Upload multiple files and get answers based on their contents.</p>
</div>
""")

        # Right Column for the main application content
        with gr.Column(scale=2, min_width=600):
            gr.Markdown("""
<div style="background: linear-gradient(135deg, #6e7dff, #1c2b58); padding: 20px; border-radius: 15px; box-shadow: 0 5px 15px rgba(0, 0, 0, 0.2); font-family: 'Roboto', sans-serif;">
<h2 style="color: #fff; font-size: 36px; font-weight: bold; text-align: center; letter-spacing: 2px; text-transform: uppercase;">
Ask Your Document
</h2>
<p style="color: #ddd; font-size: 18px; text-align: center; line-height: 1.6;">
Get intelligent answers based on the content of your uploaded documents. Just ask a question!
</p>
</div>
""")

            # File input: accepts multiple PDF/DOCX uploads.
            file_input = gr.File(
                label="Upload Documents (PDF/DOCX)",
                file_types=[".pdf", ".docx"],
                file_count="multiple",
                interactive=True
            )

            # Question input
            question_input = gr.Textbox(
                label="Ask a question",
                placeholder="Type your question here...",
                interactive=True,
                lines=2,
                max_lines=4
            )

            # Summarize before sending checkbox (feeds rag_pipeline's
            # summarize_before_sending flag).
            summarize_before_input = gr.Checkbox(
                label="Summarize Before Sending",
                value=False
            )

            # Output text box with enhanced styling (read-only; shows the
            # LLM answer or an "Error: ..." string from rag_pipeline).
            output = gr.Textbox(
                label="Answer from LLM",
                interactive=False,
                lines=4,
                max_lines=6
            )

            # Submit button with icon and modern styling
            submit_button = gr.Button("Submit", icon="send")

            # Loading spinner
            with gr.Row():
                with gr.Column(scale=1, min_width=250):
                    gr.Markdown("<div style='font-size: 14px; color: #555;'>Your answer will appear here...</div>")

    # Apply the logic for the button to trigger the RAG pipeline
    submit_button.click(rag_pipeline, inputs=[file_input, question_input, summarize_before_input], outputs=output)

# Launch the app
app.launch()