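# NOTE: this file was copied from a Hugging Face Spaces page; the page header
# text "Spaces: Sleeping" is scrape residue and is not part of the program.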
import os

import faiss
import gradio as gr
from docx import Document
from groq import Groq
from langchain.text_splitter import CharacterTextSplitter
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from transformers import pipeline  # Hugging Face for summarization
# --- Shared module-level resources, created once at import time ---

# Sentence-embedding model used to vectorise text.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Groq client; the API key is read from the "groq_api_key" environment variable.
client = Groq(api_key=os.getenv("groq_api_key"))

# In-memory FAISS index using L2 distance. `dimension` is the embedding size
# and must match the length of the vectors produced by `model`.
dimension = 384
index = faiss.IndexFlatL2(dimension)

# Hugging Face summarization pipeline (used by summarize_text).
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Location of the PDF, passed straight to ``PdfReader``.

    Returns:
        The extracted text of all pages joined with no separator between
        pages. A page that yields no text contributes an empty string.
    """
    reader = PdfReader(file_path)
    # `or ""` guards against a page yielding None instead of a string, which
    # would otherwise raise TypeError during concatenation.
    return "".join(page.extract_text() or "" for page in reader.pages)
def extract_text_from_docx(file_path):
    """Return the paragraph text of a DOCX file.

    Args:
        file_path: Location of the document, passed straight to ``Document``.

    Returns:
        Every paragraph's text in document order, each followed by a single
        newline. Only ``doc.paragraphs`` is read.
    """
    doc = Document(file_path)
    # Single join instead of repeated `+=`; the output is the same.
    return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)
def process_files(files):
    """Extract text from each uploaded PDF or DOCX file.

    Args:
        files: Iterable of uploads. Each item is either a path string or an
            object that exposes its path as ``.name``. ``None`` is treated
            as an empty upload list.

    Returns:
        A list holding one extracted-text string per supported file, in input
        order. Files whose extension is neither ``.pdf`` nor ``.docx`` are
        skipped.
    """
    texts = []
    for file in files or []:
        # Uploads may arrive as plain path strings or as objects carrying the
        # path in `.name`; accept both.
        path = file if isinstance(file, str) else file.name
        # Case-insensitive match so e.g. "REPORT.PDF" is not silently dropped.
        lowered = path.lower()
        if lowered.endswith('.pdf'):
            texts.append(extract_text_from_pdf(path))
        elif lowered.endswith('.docx'):
            texts.append(extract_text_from_docx(path))
    return texts
def chunk_text(text, chunk_size=500, overlap=50, separator=" "):
    """Split text into overlapping chunks of about ``chunk_size`` characters.

    Args:
        text: The text to split.
        chunk_size: Target maximum chunk length, in characters.
        overlap: Number of characters shared between neighbouring chunks.
        separator: String the splitter breaks the text on before merging the
            pieces back into chunks. Defaults to a single space. The
            splitter's own default is a blank line, which the extractors in
            this file do not insert (DOCX paragraphs are joined with one
            newline, PDF pages with nothing), so with that default the text
            would usually come back as a single oversized chunk.

    Returns:
        The list of chunk strings produced by ``CharacterTextSplitter``.
    """
    text_splitter = CharacterTextSplitter(
        separator=separator,
        chunk_size=chunk_size,
        chunk_overlap=overlap,
    )
    return text_splitter.split_text(text)
def create_embeddings_and_store(chunks):
    """Rebuild the global FAISS index from the given text chunks.

    The module-level ``index`` is replaced with a fresh, empty index on every
    call, so vectors from earlier calls are discarded. Vectors are added in
    the order of ``chunks``, so row ``i`` of the index corresponds to
    ``chunks[i]``.

    Args:
        chunks: Iterable of text strings to embed.

    Returns:
        None. The result is the side effect on the global ``index``.
    """
    global index
    # Start from an empty index so results never mix documents across calls.
    index = faiss.IndexFlatL2(dimension)

    chunks = list(chunks)
    if not chunks:
        # Nothing to embed; leave the fresh index empty.
        return

    # Encode all chunks in one batched call instead of one call per chunk,
    # then cast to float32 before handing the vectors to FAISS.
    embeddings = model.encode(chunks).astype('float32')
    index.add(embeddings)
def summarize_text(text):
    """Summarize text with the module-level Hugging Face pipeline.

    Args:
        text: The text to summarize.

    Returns:
        The ``summary_text`` of the first result returned by the pipeline,
        generated deterministically (``do_sample=False``) with a length
        between 100 and 300 as enforced by ``min_length``/``max_length``.

    Note:
        ``truncation=True`` makes the pipeline cut input that exceeds the
        model's maximum input length instead of failing on it. For long
        documents the summary therefore only reflects the leading portion of
        the text.
    """
    summary = summarizer(
        text,
        max_length=300,
        min_length=100,
        do_sample=False,
        truncation=True,
    )
    return summary[0]['summary_text']
def truncate_context(context, max_tokens=4000):
    """Clip ``context`` to at most ``max_tokens`` characters.

    Despite the parameter name, the limit is applied to *characters* (string
    length), not to model tokens; no tokenizer is involved.

    Args:
        context: Text to clip. ``None`` or an empty string yields "".
        max_tokens: Maximum number of characters to keep. Zero or a negative
            value yields "".

    Returns:
        ``context`` unchanged when it already fits, otherwise its first
        ``max_tokens`` characters.
    """
    # A negative limit would otherwise slice from the end of the string
    # (e.g. context[:-2]) and return the wrong text.
    if not context or max_tokens <= 0:
        return ""
    # Slicing past the end is a no-op, so no explicit length check is needed.
    return context[:max_tokens]
def query_groq(question, context):
    """Ask the Groq chat model to answer ``question`` using ``context``.

    Args:
        question: The user's question. ``None``, empty, or whitespace-only
            input is rejected.
        context: Document text the answer should be based on. ``None``,
            empty, or whitespace-only input is rejected. It is clipped with
            ``truncate_context`` (4000 characters) before being sent.

    Returns:
        The model's answer text, or a string starting with "Error:" that
        describes what went wrong. This function does not raise; API failures
        are reported through the returned string.
    """
    # Validate up front so that None inputs produce the intended message
    # rather than a leaked AttributeError text.
    if not question or not question.strip():
        return "Error: Question is empty or invalid."
    if not context or not context.strip():
        return "Error: No context available from the uploaded documents."

    try:
        # Keep the prompt bounded; the limit is in characters, not tokens.
        context = truncate_context(context, max_tokens=4000)

        # The document text is supplied as part of the user's turn. Sending
        # it under the "assistant" role would present it to the model as its
        # own earlier reply rather than as reference material.
        messages = [
            {"role": "system", "content": "You are a helpful assistant. Use the context provided to answer the question."},
            {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {question}"},
        ]

        # NOTE(review): confirm that this model id is still served by Groq.
        chat_completion = client.chat.completions.create(
            messages=messages,
            model="llama3-8b-8192",
            stream=False,
        )

        if chat_completion and chat_completion.choices:
            return chat_completion.choices[0].message.content
        return "Error: Received an unexpected response from Groq API."
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"Error: {str(e)}"
def rag_pipeline(files, question, summarize_before_sending=False, top_k=4):
    """Answer ``question`` from the uploaded documents (retrieve, then ask).

    Steps: extract text from the files, optionally summarize it, split it
    into chunks, embed the chunks into the global FAISS index, look up the
    chunks closest to the question, and send those chunks to the LLM as
    context.

    Args:
        files: Uploaded files, as accepted by ``process_files``.
        question: The user's question.
        summarize_before_sending: When true, the combined document text is
            summarized before it is chunked and indexed.
        top_k: Maximum number of chunks to retrieve as context.

    Returns:
        The answer text, or a string starting with "Error:". This function
        does not raise; any exception is reported through the returned string.
    """
    try:
        if not files:
            return "Error: No files uploaded. Please upload at least one document."
        # Reject an empty question before doing any expensive extraction,
        # embedding, or API work.
        if not question or not question.strip():
            return "Error: Question is empty or invalid."

        texts = process_files(files)
        if not texts:
            return "Error: Could not extract text from the uploaded files."

        combined_text = " ".join(texts)
        if not combined_text.strip():
            return "Error: No context available from the uploaded documents."

        if summarize_before_sending:
            # Summarize the text to reduce the amount sent to the LLM.
            combined_text = summarize_text(combined_text)

        # Index the whole text. Cutting it to a fixed size before chunking
        # would make everything past the cut unreachable.
        chunks = chunk_text(combined_text)
        if not chunks:
            return "Error: No context available from the uploaded documents."
        create_embeddings_and_store(chunks)

        # Retrieve the chunks nearest to the question. Rows of the index are
        # added in chunk order, so a returned row id is an index into `chunks`.
        k = max(1, min(int(top_k), len(chunks)))
        query_vector = model.encode([question]).astype('float32')
        _, neighbour_ids = index.search(query_vector, k)
        # Ids outside the valid range (e.g. -1 for "no result") are skipped.
        context = "\n\n".join(
            chunks[i] for i in neighbour_ids[0] if 0 <= i < len(chunks)
        )

        # query_groq clips the context to its own size limit before sending.
        return query_groq(question, context)
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"Error: {str(e)}"
# --- Gradio user interface ---

# Static HTML for the left-hand instructions panel.
_INSTRUCTIONS_HTML = """
<div style="background: #3498db; padding: 30px; border-radius: 12px; box-shadow: 0 5px 15px rgba(0, 0, 0, 0.1); font-family: 'Roboto', sans-serif;">
<h2 style="color: #fff; font-size: 32px; font-weight: bold;">DocAI: Document Assistant</h2>
<p style="color: #ddd; font-size: 18px;">Welcome to DocAI! Upload your documents and get intelligent answers based on their content.</p>
<p style="color: #ddd; font-size: 16px; line-height: 1.6;"><strong>Steps to use:</strong></p>
<ul style="color: #ddd; font-size: 16px; line-height: 1.6;">
<li>Upload your PDF or DOCX files.</li>
<li>Ask questions related to the document.</li>
<li>Click "Submit" to get your answers.</li>
</ul>
<p style="color: #ddd; font-size: 16px; line-height: 1.6;">Upload multiple files and get answers based on their contents.</p>
</div>
"""

# Static HTML for the banner above the main form.
_HEADER_HTML = """
<div style="background: #3498db; padding: 20px; border-radius: 15px; box-shadow: 0 5px 15px rgba(0, 0, 0, 0.2); font-family: 'Roboto', sans-serif;">
<h2 style="color: #fff; font-size: 36px; font-weight: bold; text-align: center; letter-spacing: 2px; text-transform: uppercase;">
Ask Your Document
</h2>
<p style="color: #ddd; font-size: 18px; text-align: center; line-height: 1.6;">
Get intelligent answers based on the content of your uploaded documents. Just ask a question!
</p>
</div>
"""

with gr.Blocks() as app:
    with gr.Row():
        # Left column: usage instructions.
        with gr.Column(scale=1, min_width=250):
            gr.Markdown(_INSTRUCTIONS_HTML)

        # Right column: upload, question, answer, and submit controls.
        with gr.Column(scale=2, min_width=600):
            gr.Markdown(_HEADER_HTML)

            file_input = gr.File(
                label="Upload Documents (PDF/DOCX)",
                file_types=[".pdf", ".docx"],
                file_count="multiple",
                interactive=True,
            )

            question_input = gr.Textbox(
                label="Ask a question related to your document",
                placeholder="Type your question here...",
                interactive=True,
                lines=2,
                max_lines=4,
            )

            # Read-only box that shows the model's answer or an error message.
            output = gr.Textbox(
                label="Answer from LLM",
                interactive=False,
                lines=4,
                max_lines=6,
            )

            # No `icon` argument: it takes a URL or a path to an image file,
            # and the bare word "send" is neither.
            submit_button = gr.Button("Submit")

    # Only the files and the question are wired in, so rag_pipeline runs with
    # its default for summarize_before_sending (no summarization).
    submit_button.click(rag_pipeline, inputs=[file_input, question_input], outputs=output)

# Launch the app
app.launch()