flutterbasit commited on
Commit
a22c97f
·
verified ·
1 Parent(s): c9aca66

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +216 -12
app.py CHANGED
@@ -1,16 +1,206 @@
1
- import gradio as gr
2
- import os
3
  from groq import Groq
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  from langchain.text_splitter import CharacterTextSplitter
5
  from sentence_transformers import SentenceTransformer
6
  import faiss
7
  from PyPDF2 import PdfReader
8
  from docx import Document
9
- from transformers import pipeline
10
 
11
  # Initialize Sentence Transformer for embeddings
12
  model = SentenceTransformer('all-MiniLM-L6-v2')
13
- client = Groq(api_key=os.getenv("groq_api_key"))
14
  # Vector Store (FAISS)
15
  dimension = 384 # Embedding size
16
  index = faiss.IndexFlatL2(dimension)
@@ -52,10 +242,11 @@ def chunk_text(text, chunk_size=500, overlap=50):
52
  # Function to create embeddings and populate FAISS index
53
  def create_embeddings_and_store(chunks):
54
  global index
 
55
  index = faiss.IndexFlatL2(dimension)
56
  for chunk in chunks:
57
  embedding = model.encode([chunk])
58
- embedding = embedding.astype('float32')
59
  index.add(embedding)
60
 
61
  # Function for summarizing the text before sending
@@ -64,9 +255,9 @@ def summarize_text(text):
64
  return summary[0]['summary_text']
65
 
66
  # Function to dynamically truncate context to fit the Groq API's token limit
67
- def truncate_context(context, max_tokens=4000):
68
  if len(context) > max_tokens:
69
- context = context[:max_tokens]
70
  return context
71
 
72
  # Function to query Groq with context and question
@@ -77,9 +268,11 @@ def query_groq(question, context):
77
  if not context.strip():
78
  return "Error: No context available from the uploaded documents."
79
 
80
- max_context_tokens = 4000
 
81
  context = truncate_context(context, max_tokens=max_context_tokens)
82
 
 
83
  chat_completion = client.chat.completions.create(
84
  messages=[{"role": "system", "content": "You are a helpful assistant. Use the context provided to answer the question."},
85
  {"role": "assistant", "content": context},
@@ -98,21 +291,27 @@ def rag_pipeline(files, question, summarize_before_sending=False):
98
  if not files:
99
  return "Error: No files uploaded. Please upload at least one document."
100
 
 
101
  texts = process_files(files)
102
  if not texts:
103
  return "Error: Could not extract text from the uploaded files."
104
 
 
105
  combined_text = " ".join(texts)
106
 
107
  if summarize_before_sending:
 
108
  combined_text = summarize_text(combined_text)
109
 
110
- max_text_size = 4000
 
111
  combined_text = truncate_context(combined_text, max_tokens=max_text_size)
112
 
 
113
  chunks = chunk_text(combined_text)
114
  create_embeddings_and_store(chunks)
115
 
 
116
  answer = query_groq(question, combined_text)
117
  return answer
118
  except Exception as e:
@@ -174,17 +373,22 @@ with gr.Blocks() as app:
174
  value=False
175
  )
176
 
177
- # Output text box
178
  output = gr.Textbox(
179
  label="Answer from LLM",
180
  interactive=False,
181
  lines=4,
182
  max_lines=6
183
  )
184
-
185
- # Submit button
186
  submit_button = gr.Button("Submit", icon="send")
187
 
 
 
 
 
 
188
  # Apply the logic for the button to trigger the RAG pipeline
189
  submit_button.click(rag_pipeline, inputs=[file_input, question_input, summarize_before_input], outputs=output)
190
 
 
1
+ # import gradio as gr
2
+ # import os
3
  from groq import Groq
4
+ # from langchain.text_splitter import CharacterTextSplitter
5
+ # from sentence_transformers import SentenceTransformer
6
+ # import faiss
7
+ # from PyPDF2 import PdfReader
8
+ # from docx import Document
9
+ # from transformers import pipeline
10
+
11
+ # # Initialize Sentence Transformer for embeddings
12
+ # model = SentenceTransformer('all-MiniLM-L6-v2')
13
+ client = Groq(api_key=os.getenv("groq_api_key"))
14
+ # # Vector Store (FAISS)
15
+ # dimension = 384 # Embedding size
16
+ # index = faiss.IndexFlatL2(dimension)
17
+
18
+ # # Initialize Hugging Face summarization model
19
+ # summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
20
+
21
+ # # Function to extract text from PDFs
22
+ # def extract_text_from_pdf(file_path):
23
+ # reader = PdfReader(file_path)
24
+ # text = ""
25
+ # for page in reader.pages:
26
+ # text += page.extract_text()
27
+ # return text
28
+
29
+ # # Function to extract text from DOCX
30
+ # def extract_text_from_docx(file_path):
31
+ # doc = Document(file_path)
32
+ # text = ""
33
+ # for paragraph in doc.paragraphs:
34
+ # text += paragraph.text + "\n"
35
+ # return text
36
+
37
+ # # Function to process files
38
+ # def process_files(files):
39
+ # texts = []
40
+ # for file in files:
41
+ # if file.name.endswith('.pdf'):
42
+ # texts.append(extract_text_from_pdf(file.name))
43
+ # elif file.name.endswith('.docx'):
44
+ # texts.append(extract_text_from_docx(file.name))
45
+ # return texts
46
+
47
+ # # Function to tokenize and chunk text
48
+ # def chunk_text(text, chunk_size=500, overlap=50):
49
+ # text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
50
+ # return text_splitter.split_text(text)
51
+
52
+ # # Function to create embeddings and populate FAISS index
53
+ # def create_embeddings_and_store(chunks):
54
+ # global index
55
+ # index = faiss.IndexFlatL2(dimension)
56
+ # for chunk in chunks:
57
+ # embedding = model.encode([chunk])
58
+ # embedding = embedding.astype('float32')
59
+ # index.add(embedding)
60
+
61
+ # # Function for summarizing the text before sending
62
+ # def summarize_text(text):
63
+ # summary = summarizer(text, max_length=300, min_length=100, do_sample=False)
64
+ # return summary[0]['summary_text']
65
+
66
+ # # Function to dynamically truncate context to fit the Groq API's token limit
67
+ # def truncate_context(context, max_tokens=4000):
68
+ # if len(context) > max_tokens:
69
+ # context = context[:max_tokens]
70
+ # return context
71
+
72
+ # # Function to query Groq with context and question
73
+ # def query_groq(question, context):
74
+ # try:
75
+ # if not question.strip():
76
+ # return "Error: Question is empty or invalid."
77
+ # if not context.strip():
78
+ # return "Error: No context available from the uploaded documents."
79
+
80
+ # max_context_tokens = 4000
81
+ # context = truncate_context(context, max_tokens=max_context_tokens)
82
+
83
+ # chat_completion = client.chat.completions.create(
84
+ # messages=[{"role": "system", "content": "You are a helpful assistant. Use the context provided to answer the question."},
85
+ # {"role": "assistant", "content": context},
86
+ # {"role": "user", "content": question}],
87
+ # model="llama3-8b-8192", stream=False)
88
+ # if chat_completion and chat_completion.choices:
89
+ # return chat_completion.choices[0].message.content
90
+ # else:
91
+ # return "Error: Received an unexpected response from Groq API."
92
+ # except Exception as e:
93
+ # return f"Error: {str(e)}"
94
+
95
+ # # Function to handle RAG pipeline
96
+ # def rag_pipeline(files, question, summarize_before_sending=False):
97
+ # try:
98
+ # if not files:
99
+ # return "Error: No files uploaded. Please upload at least one document."
100
+
101
+ # texts = process_files(files)
102
+ # if not texts:
103
+ # return "Error: Could not extract text from the uploaded files."
104
+
105
+ # combined_text = " ".join(texts)
106
+
107
+ # if summarize_before_sending:
108
+ # combined_text = summarize_text(combined_text)
109
+
110
+ # max_text_size = 4000
111
+ # combined_text = truncate_context(combined_text, max_tokens=max_text_size)
112
+
113
+ # chunks = chunk_text(combined_text)
114
+ # create_embeddings_and_store(chunks)
115
+
116
+ # answer = query_groq(question, combined_text)
117
+ # return answer
118
+ # except Exception as e:
119
+ # return f"Error: {str(e)}"
120
+
121
+ # # Enhanced UI with modern and clean style
122
+ # with gr.Blocks() as app:
123
+ # with gr.Row():
124
+ # # Left Column for instructions
125
+ # with gr.Column(scale=1, min_width=250):
126
+ # gr.Markdown("""
127
+ # <div style="background: linear-gradient(145deg, #6e7dff, #1c2b58); padding: 30px; border-radius: 12px; box-shadow: 0 5px 15px rgba(0, 0, 0, 0.1); font-family: 'Roboto', sans-serif;">
128
+ # <h2 style="color: #fff; font-size: 32px; font-weight: bold;">DocAI: Document Assistant</h2>
129
+ # <p style="color: #ddd; font-size: 18px;">Welcome to DocAI! Upload your documents and get intelligent answers based on their content.</p>
130
+ # <p style="color: #ddd; font-size: 16px; line-height: 1.6;"><strong>Steps to use:</strong></p>
131
+ # <ul style="color: #ddd; font-size: 16px; line-height: 1.6;">
132
+ # <li>Upload your PDF or DOCX files.</li>
133
+ # <li>Ask questions related to the document.</li>
134
+ # <li>Enable "Summarize Before Sending" for a brief summary of the document.</li>
135
+ # <li>Click "Submit" to get your answers.</li>
136
+ # </ul>
137
+ # <p style="color: #ddd; font-size: 16px; line-height: 1.6;">Upload multiple files and get answers based on their contents.</p>
138
+ # </div>
139
+ # """)
140
+
141
+ # # Right Column for the main application content
142
+ # with gr.Column(scale=2, min_width=600):
143
+ # gr.Markdown("""
144
+ # <div style="background: linear-gradient(135deg, #6e7dff, #1c2b58); padding: 20px; border-radius: 15px; box-shadow: 0 5px 15px rgba(0, 0, 0, 0.2); font-family: 'Roboto', sans-serif;">
145
+ # <h2 style="color: #fff; font-size: 36px; font-weight: bold; text-align: center; letter-spacing: 2px; text-transform: uppercase;">
146
+ # Ask Your Document
147
+ # </h2>
148
+ # <p style="color: #ddd; font-size: 18px; text-align: center; line-height: 1.6;">
149
+ # Get intelligent answers based on the content of your uploaded documents. Just ask a question!
150
+ # </p>
151
+ # </div>
152
+ # """)
153
+
154
+ # # File input
155
+ # file_input = gr.File(
156
+ # label="Upload Documents (PDF/DOCX)",
157
+ # file_types=[".pdf", ".docx"],
158
+ # file_count="multiple",
159
+ # interactive=True
160
+ # )
161
+
162
+ # # Question input
163
+ # question_input = gr.Textbox(
164
+ # label="Ask a question",
165
+ # placeholder="Type your question here...",
166
+ # interactive=True,
167
+ # lines=2,
168
+ # max_lines=4
169
+ # )
170
+
171
+ # # Summarize before sending checkbox
172
+ # summarize_before_input = gr.Checkbox(
173
+ # label="Summarize Before Sending",
174
+ # value=False
175
+ # )
176
+
177
+ # # Output text box
178
+ # output = gr.Textbox(
179
+ # label="Answer from LLM",
180
+ # interactive=False,
181
+ # lines=4,
182
+ # max_lines=6
183
+ # )
184
+
185
+ # # Submit button
186
+ # submit_button = gr.Button("Submit", icon="send")
187
+
188
+ # # Apply the logic for the button to trigger the RAG pipeline
189
+ # submit_button.click(rag_pipeline, inputs=[file_input, question_input, summarize_before_input], outputs=output)
190
+
191
+ # # Launch the app
192
+ # app.launch()
193
+ import gradio as gr
194
  from langchain.text_splitter import CharacterTextSplitter
195
  from sentence_transformers import SentenceTransformer
196
  import faiss
197
  from PyPDF2 import PdfReader
198
  from docx import Document
199
+ from transformers import pipeline # Hugging Face for summarization
200
 
201
  # Initialize Sentence Transformer for embeddings
202
  model = SentenceTransformer('all-MiniLM-L6-v2')
203
+
204
  # Vector Store (FAISS)
205
  dimension = 384 # Embedding size
206
  index = faiss.IndexFlatL2(dimension)
 
242
  # Function to create embeddings and populate FAISS index
243
  def create_embeddings_and_store(chunks):
244
  global index
245
+ # Reset the FAISS index before adding new embeddings
246
  index = faiss.IndexFlatL2(dimension)
247
  for chunk in chunks:
248
  embedding = model.encode([chunk])
249
+ embedding = embedding.astype('float32') # Ensure embedding is in correct format
250
  index.add(embedding)
251
 
252
  # Function for summarizing the text before sending
 
255
  return summary[0]['summary_text']
256
 
257
  # Function to dynamically truncate context to fit the Groq API's token limit
258
+ def truncate_context(context, max_tokens=4000): # Adjust max_tokens based on Groq's limits
259
  if len(context) > max_tokens:
260
+ context = context[:max_tokens] # Truncate context to fit within the token limit
261
  return context
262
 
263
  # Function to query Groq with context and question
 
268
  if not context.strip():
269
  return "Error: No context available from the uploaded documents."
270
 
271
+ # Dynamically truncate context to fit within the token limit
272
+ max_context_tokens = 4000 # Groq's token limit for context
273
  context = truncate_context(context, max_tokens=max_context_tokens)
274
 
275
+ # Query Groq API with the truncated context
276
  chat_completion = client.chat.completions.create(
277
  messages=[{"role": "system", "content": "You are a helpful assistant. Use the context provided to answer the question."},
278
  {"role": "assistant", "content": context},
 
291
  if not files:
292
  return "Error: No files uploaded. Please upload at least one document."
293
 
294
+ # Process uploaded files
295
  texts = process_files(files)
296
  if not texts:
297
  return "Error: Could not extract text from the uploaded files."
298
 
299
+ # Combine all extracted text into a single context
300
  combined_text = " ".join(texts)
301
 
302
  if summarize_before_sending:
303
+ # Summarize the text to reduce token count
304
  combined_text = summarize_text(combined_text)
305
 
306
+ # Ensure the combined text is within Groq's token limit
307
+ max_text_size = 4000 # Adjust based on Groq's token limits
308
  combined_text = truncate_context(combined_text, max_tokens=max_text_size)
309
 
310
+ # Chunk and create embeddings
311
  chunks = chunk_text(combined_text)
312
  create_embeddings_and_store(chunks)
313
 
314
+ # Query Groq LLM with context and question
315
  answer = query_groq(question, combined_text)
316
  return answer
317
  except Exception as e:
 
373
  value=False
374
  )
375
 
376
+ # Output text box with enhanced styling
377
  output = gr.Textbox(
378
  label="Answer from LLM",
379
  interactive=False,
380
  lines=4,
381
  max_lines=6
382
  )
383
+
384
+ # Submit button with icon and modern styling
385
  submit_button = gr.Button("Submit", icon="send")
386
 
387
+ # Loading spinner
388
+ with gr.Row():
389
+ with gr.Column(scale=1, min_width=250):
390
+ gr.Markdown("<div style='font-size: 14px; color: #555;'>Your answer will appear here...</div>")
391
+
392
  # Apply the logic for the button to trigger the RAG pipeline
393
  submit_button.click(rag_pipeline, inputs=[file_input, question_input, summarize_before_input], outputs=output)
394