import os

import faiss
import gradio as gr
from docx import Document
from groq import Groq
from langchain.text_splitter import CharacterTextSplitter
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from transformers import pipeline  # Hugging Face pipeline for summarization

# Initialize the sentence-transformer model for embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')
client = Groq(api_key=os.getenv("GROQ_API_KEY"))

# Vector store (FAISS)
dimension = 384  # Embedding size of all-MiniLM-L6-v2
index = faiss.IndexFlatL2(dimension)

# Initialize the Hugging Face summarization model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Extract text from a PDF file
def extract_text_from_pdf(file_path):
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        # extract_text() can return None for image-only pages
        text += page.extract_text() or ""
    return text

# Extract text from a DOCX file
def extract_text_from_docx(file_path):
    doc = Document(file_path)
    text = ""
    for paragraph in doc.paragraphs:
        text += paragraph.text + "\n"
    return text

# Extract text from each uploaded file based on its extension
def process_files(files):
    texts = []
    for file in files:
        if file.name.endswith('.pdf'):
            texts.append(extract_text_from_pdf(file.name))
        elif file.name.endswith('.docx'):
            texts.append(extract_text_from_docx(file.name))
    return texts

# Split text into overlapping chunks
def chunk_text(text, chunk_size=500, overlap=50):
    # Split on spaces so chunk_size is honored even when the text has no blank lines
    text_splitter = CharacterTextSplitter(separator=" ", chunk_size=chunk_size, chunk_overlap=overlap)
    return text_splitter.split_text(text)

# Create embeddings for each chunk and populate the FAISS index
def create_embeddings_and_store(chunks):
    global index
    # Reset the FAISS index before adding new embeddings
    index = faiss.IndexFlatL2(dimension)
    for chunk in chunks:
        embedding = model.encode([chunk]).astype('float32')  # FAISS expects float32
        index.add(embedding)

# Summarize the text before sending it to the LLM.
# Note: bart-large-cnn accepts roughly 1,024 input tokens, so very long
# documents may need to be summarized in pieces.
def summarize_text(text):
    summary = summarizer(text, max_length=300, min_length=100, do_sample=False)
    return summary[0]['summary_text']

# Truncate the context to fit the Groq API's token limit.
# Note: this truncates by characters, which only approximates tokens;
# adjust max_tokens based on Groq's limits.
def truncate_context(context, max_tokens=4000):
    if len(context) > max_tokens:
        context = context[:max_tokens]
    return context

# Query Groq with the context and question
def query_groq(question, context):
    try:
        if not question.strip():
            return "Error: Question is empty or invalid."
        if not context.strip():
            return "Error: No context available from the uploaded documents."

        # Dynamically truncate the context to fit within the token limit
        max_context_tokens = 4000  # Groq's token limit for context
        context = truncate_context(context, max_tokens=max_context_tokens)

        # Query the Groq API with the truncated context
        chat_completion = client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful assistant. Use the context provided to answer the question."},
                {"role": "assistant", "content": context},
                {"role": "user", "content": question},
            ],
            model="llama3-8b-8192",
            stream=False,
        )

        if chat_completion and chat_completion.choices:
            return chat_completion.choices[0].message.content
        return "Error: Received an unexpected response from the Groq API."
    except Exception as e:
        return f"Error: {str(e)}"
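
# NOTE: as written, the pipeline embeds chunks into FAISS but never searches
# them; the full (truncated) combined text is sent to Groq as context. If you
# want true retrieval-augmented generation, a minimal similarity-search step
# could look like the sketch below. The helper name and the top_k parameter
# are illustrative additions, not part of the original pipeline.
def retrieve_relevant_chunks(question, chunks, top_k=3):
    if index.ntotal == 0:
        return []
    # Embed the question with the same model used for the chunks
    query_embedding = model.encode([question]).astype('float32')
    # Find the nearest chunk embeddings in the FAISS index
    distances, indices = index.search(query_embedding, min(top_k, index.ntotal))
    # Map the returned indices back to their chunk texts
    return [chunks[i] for i in indices[0] if i != -1]
# Example usage inside rag_pipeline, after create_embeddings_and_store(chunks):
#     context = " ".join(retrieve_relevant_chunks(question, chunks))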
# Run the full RAG pipeline: extract, optionally summarize, chunk, embed, and query
def rag_pipeline(files, question, summarize_before_sending=False):
    try:
        if not files:
            return "Error: No files uploaded. Please upload at least one document."

        # Process the uploaded files
        texts = process_files(files)
        if not texts:
            return "Error: Could not extract text from the uploaded files."

        # Combine all extracted text into a single context
        combined_text = " ".join(texts)

        if summarize_before_sending:
            # Summarize the text to reduce the token count
            combined_text = summarize_text(combined_text)

        # Ensure the combined text is within Groq's token limit
        max_text_size = 4000  # Adjust based on Groq's token limits
        combined_text = truncate_context(combined_text, max_tokens=max_text_size)

        # Chunk the text and store embeddings in the FAISS index
        chunks = chunk_text(combined_text)
        create_embeddings_and_store(chunks)

        # Query the Groq LLM; note that the full combined text, not the
        # FAISS search results, is passed as context here
        answer = query_groq(question, combined_text)
        return answer
    except Exception as e:
        return f"Error: {str(e)}"

# Build the UI with a modern, clean layout
with gr.Blocks() as app:
    with gr.Row():
        # Left column: instructions
        with gr.Column(scale=1, min_width=250):
            gr.Markdown("""
### Welcome to DocAI!

Upload your documents and get intelligent answers based on their content.

**Steps to use:**
1. Upload one or more PDF or DOCX files.
2. Ask a question about their content to get an intelligent answer.