Spaces:

flutterbasit
/

Ask-Document-AI

Sleeping

App Files Community

flutterbasit committed on Dec 26, 2024

Commit

a22c97f

verified ·

1 Parent(s): c9aca66

Update app.py

Browse files

Files changed (1) hide show

app.py +216 -12

app.py CHANGED Viewed

@@ -1,16 +1,206 @@
-import gradio as gr
-import os
 from groq import Groq
 from langchain.text_splitter import CharacterTextSplitter
 from sentence_transformers import SentenceTransformer
 import faiss
 from PyPDF2 import PdfReader
 from docx import Document
-from transformers import pipeline
 # Initialize Sentence Transformer for embeddings
 model = SentenceTransformer('all-MiniLM-L6-v2')
-client = Groq(api_key=os.getenv("groq_api_key"))
 # Vector Store (FAISS)
 dimension = 384  # Embedding size
 index = faiss.IndexFlatL2(dimension)
@@ -52,10 +242,11 @@ def chunk_text(text, chunk_size=500, overlap=50):
 # Function to create embeddings and populate FAISS index
 def create_embeddings_and_store(chunks):
     global index
     index = faiss.IndexFlatL2(dimension)
     for chunk in chunks:
         embedding = model.encode([chunk])
-        embedding = embedding.astype('float32')
         index.add(embedding)
 # Function for summarizing the text before sending
@@ -64,9 +255,9 @@ def summarize_text(text):
     return summary[0]['summary_text']
 # Function to dynamically truncate context to fit the Groq API's token limit
-def truncate_context(context, max_tokens=4000):
     if len(context) > max_tokens:
-        context = context[:max_tokens]
     return context
 # Function to query Groq with context and question
@@ -77,9 +268,11 @@ def query_groq(question, context):
         if not context.strip():
             return "Error: No context available from the uploaded documents."
-        max_context_tokens = 4000
         context = truncate_context(context, max_tokens=max_context_tokens)
         chat_completion = client.chat.completions.create(
             messages=[{"role": "system", "content": "You are a helpful assistant. Use the context provided to answer the question."},
                       {"role": "assistant", "content": context},
@@ -98,21 +291,27 @@ def rag_pipeline(files, question, summarize_before_sending=False):
         if not files:
             return "Error: No files uploaded. Please upload at least one document."
         texts = process_files(files)
         if not texts:
             return "Error: Could not extract text from the uploaded files."
         combined_text = " ".join(texts)
         if summarize_before_sending:
             combined_text = summarize_text(combined_text)
-        max_text_size = 4000
         combined_text = truncate_context(combined_text, max_tokens=max_text_size)
         chunks = chunk_text(combined_text)
         create_embeddings_and_store(chunks)
         answer = query_groq(question, combined_text)
         return answer
     except Exception as e:
@@ -174,17 +373,22 @@ with gr.Blocks() as app:
                 value=False
             )
-            # Output text box
             output = gr.Textbox(
                 label="Answer from LLM",
                 interactive=False,
                 lines=4,
                 max_lines=6
             )
-            # Submit button
             submit_button = gr.Button("Submit", icon="send")
             # Apply the logic for the button to trigger the RAG pipeline
             submit_button.click(rag_pipeline, inputs=[file_input, question_input, summarize_before_input], outputs=output)

+# import gradio as gr
+# import os
 from groq import Groq
+# from langchain.text_splitter import CharacterTextSplitter
+# from sentence_transformers import SentenceTransformer
+# import faiss
+# from PyPDF2 import PdfReader
+# from docx import Document
+# from transformers import pipeline
+# # Initialize Sentence Transformer for embeddings
+# model = SentenceTransformer('all-MiniLM-L6-v2')
+client = Groq(api_key=os.getenv("groq_api_key"))
+# # Vector Store (FAISS)
+# dimension = 384  # Embedding size
+# index = faiss.IndexFlatL2(dimension)
+# # Initialize Hugging Face summarization model
+# summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+# # Function to extract text from PDFs
+# def extract_text_from_pdf(file_path):
+#     reader = PdfReader(file_path)
+#     text = ""
+#     for page in reader.pages:
+#         text += page.extract_text()
+#     return text
+# # Function to extract text from DOCX
+# def extract_text_from_docx(file_path):
+#     doc = Document(file_path)
+#     text = ""
+#     for paragraph in doc.paragraphs:
+#         text += paragraph.text + "\n"
+#     return text
+# # Function to process files
+# def process_files(files):
+#     texts = []
+#     for file in files:
+#         if file.name.endswith('.pdf'):
+#             texts.append(extract_text_from_pdf(file.name))
+#         elif file.name.endswith('.docx'):
+#             texts.append(extract_text_from_docx(file.name))
+#     return texts
+# # Function to tokenize and chunk text
+# def chunk_text(text, chunk_size=500, overlap=50):
+#     text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
+#     return text_splitter.split_text(text)
+# # Function to create embeddings and populate FAISS index
+# def create_embeddings_and_store(chunks):
+#     global index
+#     index = faiss.IndexFlatL2(dimension)
+#     for chunk in chunks:
+#         embedding = model.encode([chunk])
+#         embedding = embedding.astype('float32')
+#         index.add(embedding)
+# # Function for summarizing the text before sending
+# def summarize_text(text):
+#     summary = summarizer(text, max_length=300, min_length=100, do_sample=False)
+#     return summary[0]['summary_text']
+# # Function to dynamically truncate context to fit the Groq API's token limit
+# def truncate_context(context, max_tokens=4000):
+#     if len(context) > max_tokens:
+#         context = context[:max_tokens]
+#     return context
+# # Function to query Groq with context and question
+# def query_groq(question, context):
+#     try:
+#         if not question.strip():
+#             return "Error: Question is empty or invalid."
+#         if not context.strip():
+#             return "Error: No context available from the uploaded documents."
+#         max_context_tokens = 4000
+#         context = truncate_context(context, max_tokens=max_context_tokens)
+#         chat_completion = client.chat.completions.create(
+#             messages=[{"role": "system", "content": "You are a helpful assistant. Use the context provided to answer the question."},
+#                       {"role": "assistant", "content": context},
+#                       {"role": "user", "content": question}],
+#             model="llama3-8b-8192", stream=False)
+#         if chat_completion and chat_completion.choices:
+#             return chat_completion.choices[0].message.content
+#         else:
+#             return "Error: Received an unexpected response from Groq API."
+#     except Exception as e:
+#         return f"Error: {str(e)}"
+# # Function to handle RAG pipeline
+# def rag_pipeline(files, question, summarize_before_sending=False):
+#     try:
+#         if not files:
+#             return "Error: No files uploaded. Please upload at least one document."
+#         texts = process_files(files)
+#         if not texts:
+#             return "Error: Could not extract text from the uploaded files."
+#         combined_text = " ".join(texts)
+#         if summarize_before_sending:
+#             combined_text = summarize_text(combined_text)
+#         max_text_size = 4000
+#         combined_text = truncate_context(combined_text, max_tokens=max_text_size)
+#         chunks = chunk_text(combined_text)
+#         create_embeddings_and_store(chunks)
+#         answer = query_groq(question, combined_text)
+#         return answer
+#     except Exception as e:
+#         return f"Error: {str(e)}"
+# # Enhanced UI with modern and clean style
+# with gr.Blocks() as app:
+#     with gr.Row():
+#         # Left Column for instructions
+#         with gr.Column(scale=1, min_width=250):
+#             gr.Markdown("""
+#             <div style="background: linear-gradient(145deg, #6e7dff, #1c2b58); padding: 30px; border-radius: 12px; box-shadow: 0 5px 15px rgba(0, 0, 0, 0.1); font-family: 'Roboto', sans-serif;">
+#                 <h2 style="color: #fff; font-size: 32px; font-weight: bold;">DocAI: Document Assistant</h2>
+#                 <p style="color: #ddd; font-size: 18px;">Welcome to DocAI! Upload your documents and get intelligent answers based on their content.</p>
+#                 <p style="color: #ddd; font-size: 16px; line-height: 1.6;"><strong>Steps to use:</strong></p>
+#                 <ul style="color: #ddd; font-size: 16px; line-height: 1.6;">
+#                     <li>Upload your PDF or DOCX files.</li>
+#                     <li>Ask questions related to the document.</li>
+#                     <li>Enable "Summarize Before Sending" for a brief summary of the document.</li>
+#                     <li>Click "Submit" to get your answers.</li>
+#                 </ul>
+#                 <p style="color: #ddd; font-size: 16px; line-height: 1.6;">Upload multiple files and get answers based on their contents.</p>
+#             </div>
+#             """)
+#         # Right Column for the main application content
+#         with gr.Column(scale=2, min_width=600):
+#             gr.Markdown("""
+#             <div style="background: linear-gradient(135deg, #6e7dff, #1c2b58); padding: 20px; border-radius: 15px; box-shadow: 0 5px 15px rgba(0, 0, 0, 0.2); font-family: 'Roboto', sans-serif;">
+#                 <h2 style="color: #fff; font-size: 36px; font-weight: bold; text-align: center; letter-spacing: 2px; text-transform: uppercase;">
+#                     Ask Your Document
+#                 </h2>
+#                 <p style="color: #ddd; font-size: 18px; text-align: center; line-height: 1.6;">
+#                     Get intelligent answers based on the content of your uploaded documents. Just ask a question!
+#                 </p>
+#             </div>
+#             """)
+#             # File input
+#             file_input = gr.File(
+#                 label="Upload Documents (PDF/DOCX)",
+#                 file_types=[".pdf", ".docx"],
+#                 file_count="multiple",
+#                 interactive=True
+#             )
+#             # Question input
+#             question_input = gr.Textbox(
+#                 label="Ask a question",
+#                 placeholder="Type your question here...",
+#                 interactive=True,
+#                 lines=2,
+#                 max_lines=4
+#             )
+#             # Summarize before sending checkbox
+#             summarize_before_input = gr.Checkbox(
+#                 label="Summarize Before Sending",
+#                 value=False
+#             )
+#             # Output text box
+#             output = gr.Textbox(
+#                 label="Answer from LLM",
+#                 interactive=False,
+#                 lines=4,
+#                 max_lines=6
+#             )
+#             # Submit button
+#             submit_button = gr.Button("Submit", icon="send")
+#             # Apply the logic for the button to trigger the RAG pipeline
+#             submit_button.click(rag_pipeline, inputs=[file_input, question_input, summarize_before_input], outputs=output)
+# # Launch the app
+# app.launch()
+import gradio as gr
 from langchain.text_splitter import CharacterTextSplitter
 from sentence_transformers import SentenceTransformer
 import faiss
 from PyPDF2 import PdfReader
 from docx import Document
+from transformers import pipeline  # Hugging Face for summarization
 # Initialize Sentence Transformer for embeddings
 model = SentenceTransformer('all-MiniLM-L6-v2')
 # Vector Store (FAISS)
 dimension = 384  # Embedding size
 index = faiss.IndexFlatL2(dimension)
 # Function to create embeddings and populate FAISS index
 def create_embeddings_and_store(chunks):
     global index
+    # Reset the FAISS index before adding new embeddings
     index = faiss.IndexFlatL2(dimension)
     for chunk in chunks:
         embedding = model.encode([chunk])
+        embedding = embedding.astype('float32')  # Ensure embedding is in correct format
         index.add(embedding)
 # Function for summarizing the text before sending
     return summary[0]['summary_text']
 # Function to dynamically truncate context to fit the Groq API's token limit
+def truncate_context(context, max_tokens=4000):  # Adjust max_tokens based on Groq's limits
     if len(context) > max_tokens:
+        context = context[:max_tokens]  # Truncate context to fit within the token limit
     return context
 # Function to query Groq with context and question
         if not context.strip():
             return "Error: No context available from the uploaded documents."
+        # Dynamically truncate context to fit within the token limit
+        max_context_tokens = 4000  # Groq's token limit for context
         context = truncate_context(context, max_tokens=max_context_tokens)
+        # Query Groq API with the truncated context
         chat_completion = client.chat.completions.create(
             messages=[{"role": "system", "content": "You are a helpful assistant. Use the context provided to answer the question."},
                       {"role": "assistant", "content": context},
         if not files:
             return "Error: No files uploaded. Please upload at least one document."
+        # Process uploaded files
         texts = process_files(files)
         if not texts:
             return "Error: Could not extract text from the uploaded files."
+        # Combine all extracted text into a single context
         combined_text = " ".join(texts)
         if summarize_before_sending:
+            # Summarize the text to reduce token count
             combined_text = summarize_text(combined_text)
+        # Ensure the combined text is within Groq's token limit
+        max_text_size = 4000  # Adjust based on Groq's token limits
         combined_text = truncate_context(combined_text, max_tokens=max_text_size)
+        # Chunk and create embeddings
         chunks = chunk_text(combined_text)
         create_embeddings_and_store(chunks)
+        # Query Groq LLM with context and question
         answer = query_groq(question, combined_text)
         return answer
     except Exception as e:
                 value=False
             )
+            # Output text box with enhanced styling
             output = gr.Textbox(
                 label="Answer from LLM",
                 interactive=False,
                 lines=4,
                 max_lines=6
             )
+            # Submit button with icon and modern styling
             submit_button = gr.Button("Submit", icon="send")
+            # Loading spinner
+            with gr.Row():
+                with gr.Column(scale=1, min_width=250):
+                    gr.Markdown("<div style='font-size: 14px; color: #555;'>Your answer will appear here...</div>")
             # Apply the logic for the button to trigger the RAG pipeline
             submit_button.click(rag_pipeline, inputs=[file_input, question_input, summarize_before_input], outputs=output)