import gradio as gr
from langchain.text_splitter import CharacterTextSplitter
from sentence_transformers import SentenceTransformer
import faiss
from PyPDF2 import PdfReader
from docx import Document
from transformers import pipeline  # Hugging Face summarization pipeline
import os
from groq import Groq

# Initialize Sentence Transformer for embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')
client = Groq(api_key=os.getenv("groq_api_key"))

# Vector store (FAISS)
dimension = 384  # Embedding size
index = faiss.IndexFlatL2(dimension)

# Initialize Hugging Face summarization model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
# Function to extract text from PDFs
def extract_text_from_pdf(file_path):
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        # extract_text() can return None (e.g. image-only pages); guard against it
        text += page.extract_text() or ""
    return text
# Function to extract text from DOCX
def extract_text_from_docx(file_path):
    doc = Document(file_path)
    text = ""
    for paragraph in doc.paragraphs:
        text += paragraph.text + "\n"
    return text

# Function to process uploaded files
def process_files(files):
    texts = []
    for file in files:
        if file.name.endswith('.pdf'):
            texts.append(extract_text_from_pdf(file.name))
        elif file.name.endswith('.docx'):
            texts.append(extract_text_from_docx(file.name))
    return texts

# Function to chunk text
def chunk_text(text, chunk_size=500, overlap=50):
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    return text_splitter.split_text(text)

# Function to create embeddings and populate the FAISS index
def create_embeddings_and_store(chunks):
    global index
    # Reset the FAISS index before adding new embeddings
    index = faiss.IndexFlatL2(dimension)
    for chunk in chunks:
        embedding = model.encode([chunk])
        embedding = embedding.astype('float32')  # FAISS expects float32
        index.add(embedding)
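# NOTE: the pipeline below never queries this index; it sends the (truncated)
# full text straight to Groq. For a true retrieval step, a minimal helper
# might look like this sketch (illustrative only, not called anywhere):
def retrieve_top_chunks(question, chunks, k=3):
    query_embedding = model.encode([question]).astype('float32')
    _, indices = index.search(query_embedding, min(k, len(chunks)))
    return [chunks[i] for i in indices[0] if i != -1]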
# Function to summarize text before sending
def summarize_text(text):
    # truncation=True keeps long inputs within BART's maximum input length
    summary = summarizer(text, max_length=300, min_length=100, do_sample=False, truncation=True)
    return summary[0]['summary_text']
# Function to truncate context to fit the Groq API's limit.
# Note: this counts characters, not tokens, so it is only a rough proxy.
def truncate_context(context, max_tokens=4000):  # adjust based on Groq's limits
    if len(context) > max_tokens:
        context = context[:max_tokens]  # truncate to the character budget
    return context
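# A hypothetical token-aware variant (not wired into the pipeline): Groq does
# not expose its tokenizer, so a Hugging Face tokenizer is assumed here as a
# rough stand-in for counting tokens.
def truncate_context_by_tokens(context, max_tokens=4000, tokenizer_name="gpt2"):
    from transformers import AutoTokenizer  # lazy import; avoids startup cost
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    token_ids = tokenizer.encode(context)[:max_tokens]
    return tokenizer.decode(token_ids)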
# Function to query Groq with context and question
def query_groq(question, context):
    try:
        if not question.strip():
            return "Error: Question is empty or invalid."
        if not context.strip():
            return "Error: No context available from the uploaded documents."
        # Truncate context to (approximately) fit within the token limit
        max_context_tokens = 4000  # rough budget for Groq's context window
        context = truncate_context(context, max_tokens=max_context_tokens)
        # Query the Groq API with the truncated context
        chat_completion = client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful assistant. Use the context provided to answer the question."},
                {"role": "assistant", "content": context},
                {"role": "user", "content": question},
            ],
            model="llama3-8b-8192",
            stream=False,
        )
        if chat_completion and chat_completion.choices:
            return chat_completion.choices[0].message.content
        else:
            return "Error: Received an unexpected response from Groq API."
    except Exception as e:
        return f"Error: {str(e)}"
# Function to handle the RAG pipeline
def rag_pipeline(files, question, summarize_before_sending=False):
    try:
        if not files:
            return "Error: No files uploaded. Please upload at least one document."
        # Process uploaded files
        texts = process_files(files)
        if not texts:
            return "Error: Could not extract text from the uploaded files."
        # Combine all extracted text into a single context
        combined_text = " ".join(texts)
        if summarize_before_sending:
            # Summarize the text to reduce token count
            combined_text = summarize_text(combined_text)
        # Keep the combined text within (approximately) Groq's token limit
        max_text_size = 4000  # character budget; adjust based on Groq's limits
        combined_text = truncate_context(combined_text, max_tokens=max_text_size)
        # Chunk and embed the text (stored in FAISS, although the query below
        # sends the full truncated text rather than retrieved chunks)
        chunks = chunk_text(combined_text)
        create_embeddings_and_store(chunks)
        # Query the Groq LLM with context and question
        answer = query_groq(question, combined_text)
        return answer
    except Exception as e:
        return f"Error: {str(e)}"
# UI with a modern, clean style
with gr.Blocks() as app:
    with gr.Row():
        # Left column: instructions
        with gr.Column(scale=1, min_width=250):
            gr.Markdown("""
            <div style="background: #3498db; padding: 30px; border-radius: 12px; box-shadow: 0 5px 15px rgba(0, 0, 0, 0.1); font-family: 'Roboto', sans-serif;">
                <h2 style="color: #fff; font-size: 32px; font-weight: bold;">DocAI: Document Assistant</h2>
                <p style="color: #ddd; font-size: 18px;">Welcome to DocAI! Upload your documents and get intelligent answers based on their content.</p>
                <p style="color: #ddd; font-size: 16px; line-height: 1.6;"><strong>Steps to use:</strong></p>
                <ul style="color: #ddd; font-size: 16px; line-height: 1.6;">
                    <li>Upload your PDF or DOCX files.</li>
                    <li>Ask questions related to the document.</li>
                    <li>Click "Submit" to get your answers.</li>
                </ul>
                <p style="color: #ddd; font-size: 16px; line-height: 1.6;">Upload multiple files and get answers based on their contents.</p>
            </div>
            """)
        # Right column: main application content
        with gr.Column(scale=2, min_width=600):
            gr.Markdown("""
            <div style="background: #3498db; padding: 20px; border-radius: 15px; box-shadow: 0 5px 15px rgba(0, 0, 0, 0.2); font-family: 'Roboto', sans-serif;">
                <h2 style="color: #fff; font-size: 36px; font-weight: bold; text-align: center; letter-spacing: 2px; text-transform: uppercase;">
                    Ask Your Document
                </h2>
                <p style="color: #ddd; font-size: 18px; text-align: center; line-height: 1.6;">
                    Get intelligent answers based on the content of your uploaded documents. Just ask a question!
                </p>
            </div>
            """)
            # File input
            file_input = gr.File(
                label="Upload Documents (PDF/DOCX)",
                file_types=[".pdf", ".docx"],
                file_count="multiple",
                interactive=True
            )
            # Question input
            question_input = gr.Textbox(
                label="Ask a question related to your document",
                placeholder="Type your question here...",
                interactive=True,
                lines=2,
                max_lines=4
            )
            # # Summarize-before-sending checkbox
            # summarize_before_input = gr.Checkbox(
            #     label="Summarize Before Sending",
            #     value=False
            # )
            # Output text box
            output = gr.Textbox(
                label="Answer from LLM",
                interactive=False,
                lines=4,
                max_lines=6
            )
            # Submit button (Gradio's `icon` argument expects an image file path,
            # so a primary-variant button is used instead)
            submit_button = gr.Button("Submit", variant="primary")
            # # Loading spinner placeholder
            # with gr.Row():
            #     with gr.Column(scale=1, min_width=250):
            #         gr.Markdown("<div style='font-size: 14px; color: #555;'>Your answer will appear here...</div>")

            # Wire the button to the RAG pipeline
            submit_button.click(rag_pipeline, inputs=[file_input, question_input], outputs=output)

# Launch the app
app.launch()
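# To run locally (assuming dependencies are installed, this file is saved as
# app.py, and the env var name matches the os.getenv call above):
#   export groq_api_key="your-groq-api-key"
#   python app.py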