import os

import faiss
import gradio as gr
from docx import Document
from groq import Groq
from langchain.text_splitter import CharacterTextSplitter
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from transformers import pipeline  # Hugging Face pipeline for summarization

# Initialize Sentence Transformer for embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

client = Groq(api_key=os.getenv("GROQ_API_KEY"))

# Vector store (FAISS)
dimension = 384  # Embedding size of all-MiniLM-L6-v2
index = faiss.IndexFlatL2(dimension)

# Initialize Hugging Face summarization model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Function to extract text from PDFs
def extract_text_from_pdf(file_path):
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text() or ""  # extract_text() can return None for some pages
    return text

# Function to extract text from DOCX
def extract_text_from_docx(file_path):
    doc = Document(file_path)
    text = ""
    for paragraph in doc.paragraphs:
        text += paragraph.text + "\n"
    return text

# Function to process uploaded files
def process_files(files):
    texts = []
    for file in files:
        if file.name.endswith('.pdf'):
            texts.append(extract_text_from_pdf(file.name))
        elif file.name.endswith('.docx'):
            texts.append(extract_text_from_docx(file.name))
    return texts

# Function to chunk text into overlapping segments
def chunk_text(text, chunk_size=500, overlap=50):
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    return text_splitter.split_text(text)

# Function to create embeddings and populate the FAISS index
def create_embeddings_and_store(chunks):
    global index
    # Reset the FAISS index before adding new embeddings
    index = faiss.IndexFlatL2(dimension)
    for chunk in chunks:
        embedding = model.encode([chunk])
        embedding = embedding.astype('float32')  # FAISS expects float32
        index.add(embedding)

# Function to summarize the text before sending it to the LLM.
# Note: bart-large-cnn accepts roughly 1024 input tokens, so very long
# documents may need to be summarized in pieces.
def summarize_text(text):
    summary = summarizer(text, max_length=300, min_length=100, do_sample=False)
    return summary[0]['summary_text']

# Function to truncate context so it fits the Groq API's context window.
# This truncates by characters, a rough approximation of the token count.
def truncate_context(context, max_tokens=4000):  # Adjust max_tokens based on Groq's limits
    if len(context) > max_tokens:
        context = context[:max_tokens]
    return context

# Function to query Groq with context and question
def query_groq(question, context):
    try:
        if not question.strip():
            return "Error: Question is empty or invalid."
        if not context.strip():
            return "Error: No context available from the uploaded documents."

        # Truncate context to fit within the token limit
        max_context_tokens = 4000  # Groq's context budget for this app
        context = truncate_context(context, max_tokens=max_context_tokens)

        # Query the Groq API; the document context is passed as an assistant turn
        chat_completion = client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful assistant. Use the context provided to answer the question."},
                {"role": "assistant", "content": context},
                {"role": "user", "content": question},
            ],
            model="llama3-8b-8192",
            stream=False,
        )

        if chat_completion and chat_completion.choices:
            return chat_completion.choices[0].message.content
        return "Error: Received an unexpected response from Groq API."
    except Exception as e:
        return f"Error: {str(e)}"
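# The pipeline below builds the FAISS index but then sends the full combined
# text to Groq rather than searching the index. The following is a minimal
# sketch of how the index could be queried for the top-k chunks closest to the
# question; `retrieve_relevant_chunks` is a hypothetical helper and is not
# wired into the UI below.
def retrieve_relevant_chunks(question, chunks, k=3):
    if not chunks:
        return []
    query_embedding = model.encode([question]).astype('float32')
    # IndexFlatL2.search returns (distances, ids); ids follow insertion order,
    # so they map directly back into the chunks list
    _, ids = index.search(query_embedding, min(k, len(chunks)))
    return [chunks[i] for i in ids[0] if i != -1]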
# Function to handle the RAG pipeline
def rag_pipeline(files, question, summarize_before_sending=False):
    try:
        if not files:
            return "Error: No files uploaded. Please upload at least one document."

        # Process uploaded files
        texts = process_files(files)
        if not texts:
            return "Error: Could not extract text from the uploaded files."

        # Combine all extracted text into a single context
        combined_text = " ".join(texts)

        if summarize_before_sending:
            # Summarize the text to reduce the token count
            combined_text = summarize_text(combined_text)

        # Ensure the combined text fits within Groq's context budget
        max_text_size = 4000  # Characters; adjust based on Groq's token limits
        combined_text = truncate_context(combined_text, max_tokens=max_text_size)

        # Chunk the text and build the FAISS index
        chunks = chunk_text(combined_text)
        create_embeddings_and_store(chunks)

        # Query the Groq LLM with the context and question
        answer = query_groq(question, combined_text)
        return answer
    except Exception as e:
        return f"Error: {str(e)}"
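# Example headless invocation, useful for testing without the UI. It assumes
# GROQ_API_KEY is set and a file named "sample.pdf" exists (both hypothetical);
# Gradio file inputs expose a `.name` path, which SimpleNamespace can mimic:
#
#   from types import SimpleNamespace
#   print(rag_pipeline([SimpleNamespace(name="sample.pdf")],
#                      "What is this document about?"))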

# Enhanced UI with a modern, clean style
with gr.Blocks() as app:
    with gr.Row():
        # Left column for instructions
        with gr.Column(scale=1, min_width=250):
            gr.Markdown("""
            ## DocAI: Document Assistant

            Welcome to DocAI! Upload your documents and get intelligent answers based on their content.

            **Steps to use:**
            - Upload multiple files and get answers based on their contents.
            """)

        # Right column for the main application content
        with gr.Column(scale=2, min_width=600):
            gr.Markdown("""

            ## Ask Your Document

            Get intelligent answers based on the content of your uploaded documents. Just ask a question!
            """)

            # File input
            file_input = gr.File(
                label="Upload Documents (PDF/DOCX)",
                file_types=[".pdf", ".docx"],
                file_count="multiple",
                interactive=True,
            )

            # Question input
            question_input = gr.Textbox(
                label="Ask a question related to your document",
                placeholder="Type your question here...",
                interactive=True,
                lines=2,
                max_lines=4,
            )

            # # Summarize-before-sending checkbox (currently disabled)
            # summarize_before_input = gr.Checkbox(
            #     label="Summarize Before Sending",
            #     value=False,
            # )

            # Output text box
            output = gr.Textbox(
                label="Answer from LLM",
                interactive=False,
                lines=4,
                max_lines=6,
            )

            # Submit button
            submit_button = gr.Button("Submit")

    # Loading spinner (currently disabled)
    # with gr.Row():
    #     with gr.Column(scale=1, min_width=250):
    #         gr.Markdown("Your answer will appear here...")
    # Apply the button logic to trigger the RAG pipeline
    submit_button.click(rag_pipeline, inputs=[file_input, question_input], outputs=output)

# Launch the app
app.launch()