# DocAI — Gradio RAG document assistant (runs as a Hugging Face Space).
import gradio as gr
import os
from groq import Groq
from langchain.text_splitter import CharacterTextSplitter
from sentence_transformers import SentenceTransformer
import faiss
from PyPDF2 import PdfReader
from docx import Document
from transformers import pipeline

# Sentence-transformer used to embed document chunks (produces 384-dim vectors).
model = SentenceTransformer('all-MiniLM-L6-v2')

# Groq chat client; the API key is read from the `groq_api_key` environment variable.
client = Groq(api_key=os.getenv("groq_api_key"))

# Flat L2 FAISS index sized to all-MiniLM-L6-v2's embedding dimension.
dimension = 384  # Embedding size
index = faiss.IndexFlatL2(dimension)

# Hugging Face summarization pipeline (used by the "Summarize Before Sending" option).
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
# Function to extract text from PDFs
def extract_text_from_pdf(file_path):
    """Extract the plain text of every page of a PDF file.

    Args:
        file_path: Path to the PDF file on disk.

    Returns:
        str: The concatenated text of all pages. Image-only pages, for
        which PyPDF2's extract_text() returns None/"", contribute nothing
        (previously a None return raised TypeError on concatenation).
    """
    reader = PdfReader(file_path)
    # `or ""` guards against pages where extract_text() yields None;
    # join avoids quadratic string concatenation over many pages.
    return "".join(page.extract_text() or "" for page in reader.pages)
# Function to extract text from DOCX
def extract_text_from_docx(file_path):
    """Return the text of a .docx file, one paragraph per newline-terminated line."""
    document = Document(file_path)
    # Each paragraph contributes its text plus a trailing newline; an empty
    # document yields an empty string.
    return "".join(paragraph.text + "\n" for paragraph in document.paragraphs)
# Function to process files
def process_files(files):
    """Extract text from each uploaded document.

    Args:
        files: Iterable of uploaded file objects exposing a `.name` path
            (as provided by gradio's File component).

    Returns:
        list[str]: One extracted-text string per supported (.pdf / .docx)
        file. Files with any other extension are silently skipped.
    """
    texts = []
    for file in files:
        # Compare extensions case-insensitively so ".PDF" / ".Docx" uploads
        # are not silently dropped.
        name = file.name.lower()
        if name.endswith('.pdf'):
            texts.append(extract_text_from_pdf(file.name))
        elif name.endswith('.docx'):
            texts.append(extract_text_from_docx(file.name))
    return texts
# Function to tokenize and chunk text
def chunk_text(text, chunk_size=500, overlap=50):
    """Split `text` into overlapping character chunks using LangChain's splitter."""
    splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    return splitter.split_text(text)
# Function to create embeddings and populate FAISS index
def create_embeddings_and_store(chunks):
    """Rebuild the module-level FAISS index from the given text chunks.

    Args:
        chunks: List of text chunks to embed and index.

    Side effects:
        Rebinds the global `index` to a fresh IndexFlatL2 so vectors from
        a previous upload are discarded before the new ones are added.
    """
    global index
    index = faiss.IndexFlatL2(dimension)
    if not chunks:
        # Nothing to embed; leave the freshly reset (empty) index in place.
        return
    # Encode all chunks in one batched call instead of one model call per
    # chunk — same vectors, far less overhead.
    embeddings = model.encode(chunks).astype('float32')
    index.add(embeddings)
# Function for summarizing the text before sending
def summarize_text(text):
    """Summarize `text` with the BART summarization pipeline.

    Args:
        text: Source text to condense.

    Returns:
        str: A summary of roughly 100-300 tokens.
    """
    # truncation=True clips inputs longer than the model's maximum input
    # length (1024 tokens for bart-large-cnn) instead of raising an error
    # on long documents.
    summary = summarizer(text, max_length=300, min_length=100,
                         do_sample=False, truncation=True)
    return summary[0]['summary_text']
# Function to dynamically truncate context to fit the Groq API's token limit
def truncate_context(context, max_tokens=4000):
    """Clip `context` to at most `max_tokens` characters.

    NOTE(review): despite the parameter name, this counts characters, not
    model tokens — the character budget only approximates the API limit.
    """
    # Slicing is a no-op when the string is already within the budget.
    return context[:max_tokens]
# Function to query Groq with context and question
def query_groq(question, context):
    """Ask the Groq chat model `question`, grounded in `context`.

    Returns the model's answer string, or an "Error: ..." string when the
    input is invalid, the API response is unexpected, or an exception is
    raised anywhere in the call.
    """
    try:
        # Guard clauses: reject blank input before touching the API.
        if not question.strip():
            return "Error: Question is empty or invalid."
        if not context.strip():
            return "Error: No context available from the uploaded documents."

        # Keep the context within the (character-based) budget before sending.
        context = truncate_context(context, max_tokens=4000)

        conversation = [
            {"role": "system", "content": "You are a helpful assistant. Use the context provided to answer the question."},
            {"role": "assistant", "content": context},
            {"role": "user", "content": question},
        ]
        response = client.chat.completions.create(
            messages=conversation, model="llama3-8b-8192", stream=False)

        if response and response.choices:
            return response.choices[0].message.content
        return "Error: Received an unexpected response from Groq API."
    except Exception as e:
        return f"Error: {str(e)}"
# Function to handle RAG pipeline
def rag_pipeline(files, question, summarize_before_sending=False):
    """End-to-end flow: extract text, optionally summarize, chunk, index,
    then answer `question` with Groq.

    Returns the model's answer, or an "Error: ..." string on bad input or
    any failure along the way.
    """
    try:
        if not files:
            return "Error: No files uploaded. Please upload at least one document."

        extracted = process_files(files)
        if not extracted:
            return "Error: Could not extract text from the uploaded files."

        document_text = " ".join(extracted)
        if summarize_before_sending:
            document_text = summarize_text(document_text)

        # Keep the combined text within the (character-based) budget.
        document_text = truncate_context(document_text, max_tokens=4000)

        # Rebuild the vector store so it reflects the latest upload.
        create_embeddings_and_store(chunk_text(document_text))

        return query_groq(question, document_text)
    except Exception as e:
        return f"Error: {str(e)}"
# Enhanced UI with modern and clean style
with gr.Blocks() as app:
    with gr.Row():
        # Left column: styled instructions panel.
        with gr.Column(scale=1, min_width=250):
            gr.Markdown("""
            <div style="background: linear-gradient(145deg, #6e7dff, #1c2b58); padding: 30px; border-radius: 12px; box-shadow: 0 5px 15px rgba(0, 0, 0, 0.1); font-family: 'Roboto', sans-serif;">
            <h2 style="color: #fff; font-size: 32px; font-weight: bold;">DocAI: Document Assistant</h2>
            <p style="color: #ddd; font-size: 18px;">Welcome to DocAI! Upload your documents and get intelligent answers based on their content.</p>
            <p style="color: #ddd; font-size: 16px; line-height: 1.6;"><strong>Steps to use:</strong></p>
            <ul style="color: #ddd; font-size: 16px; line-height: 1.6;">
            <li>Upload your PDF or DOCX files.</li>
            <li>Ask questions related to the document.</li>
            <li>Enable "Summarize Before Sending" for a brief summary of the document.</li>
            <li>Click "Submit" to get your answers.</li>
            </ul>
            <p style="color: #ddd; font-size: 16px; line-height: 1.6;">Upload multiple files and get answers based on their contents.</p>
            </div>
            """)

        # Right column: the main interaction area.
        with gr.Column(scale=2, min_width=600):
            gr.Markdown("""
            <div style="background: linear-gradient(135deg, #6e7dff, #1c2b58); padding: 20px; border-radius: 15px; box-shadow: 0 5px 15px rgba(0, 0, 0, 0.2); font-family: 'Roboto', sans-serif;">
            <h2 style="color: #fff; font-size: 36px; font-weight: bold; text-align: center; letter-spacing: 2px; text-transform: uppercase;">
            Ask Your Document
            </h2>
            <p style="color: #ddd; font-size: 18px; text-align: center; line-height: 1.6;">
            Get intelligent answers based on the content of your uploaded documents. Just ask a question!
            </p>
            </div>
            """)

            # Document upload widget (multiple PDF/DOCX files).
            doc_files = gr.File(
                label="Upload Documents (PDF/DOCX)",
                file_types=[".pdf", ".docx"],
                file_count="multiple",
                interactive=True
            )

            # Free-text question about the uploaded documents.
            question_box = gr.Textbox(
                label="Ask a question",
                placeholder="Type your question here...",
                interactive=True,
                lines=2,
                max_lines=4
            )

            # Toggle: summarize the document text before querying the LLM.
            summarize_toggle = gr.Checkbox(
                label="Summarize Before Sending",
                value=False
            )

            # Read-only output for the model's answer.
            answer_box = gr.Textbox(
                label="Answer from LLM",
                interactive=False,
                lines=4,
                max_lines=6
            )

            # Clicking Submit runs the full RAG pipeline.
            submit_button = gr.Button("Submit", icon="send")
            submit_button.click(
                rag_pipeline,
                inputs=[doc_files, question_box, summarize_toggle],
                outputs=answer_box
            )

# Launch the app
app.launch()