# Hugging Face Space scrape artifact (was rendered status text: "Spaces: Sleeping")
# Live Groq client setup.  (A fully commented-out older copy of the app
# follows below; the working implementation starts after it.)
import os  # was commented out: os.getenv below raised NameError at import time

from groq import Groq

# Shared Groq API client used by query_groq(); the key is read from the
# environment variable "groq_api_key".
client = Groq(api_key=os.getenv("groq_api_key"))
# # Vector Store (FAISS) | |
# dimension = 384 # Embedding size | |
# index = faiss.IndexFlatL2(dimension) | |
# # Initialize Hugging Face summarization model | |
# summarizer = pipeline("summarization", model="facebook/bart-large-cnn") | |
# # Function to extract text from PDFs | |
# def extract_text_from_pdf(file_path): | |
# reader = PdfReader(file_path) | |
# text = "" | |
# for page in reader.pages: | |
# text += page.extract_text() | |
# return text | |
# # Function to extract text from DOCX | |
# def extract_text_from_docx(file_path): | |
# doc = Document(file_path) | |
# text = "" | |
# for paragraph in doc.paragraphs: | |
# text += paragraph.text + "\n" | |
# return text | |
# # Function to process files | |
# def process_files(files): | |
# texts = [] | |
# for file in files: | |
# if file.name.endswith('.pdf'): | |
# texts.append(extract_text_from_pdf(file.name)) | |
# elif file.name.endswith('.docx'): | |
# texts.append(extract_text_from_docx(file.name)) | |
# return texts | |
# # Function to tokenize and chunk text | |
# def chunk_text(text, chunk_size=500, overlap=50): | |
# text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap) | |
# return text_splitter.split_text(text) | |
# # Function to create embeddings and populate FAISS index | |
# def create_embeddings_and_store(chunks): | |
# global index | |
# index = faiss.IndexFlatL2(dimension) | |
# for chunk in chunks: | |
# embedding = model.encode([chunk]) | |
# embedding = embedding.astype('float32') | |
# index.add(embedding) | |
# # Function for summarizing the text before sending | |
# def summarize_text(text): | |
# summary = summarizer(text, max_length=300, min_length=100, do_sample=False) | |
# return summary[0]['summary_text'] | |
# # Function to dynamically truncate context to fit the Groq API's token limit | |
# def truncate_context(context, max_tokens=4000): | |
# if len(context) > max_tokens: | |
# context = context[:max_tokens] | |
# return context | |
# # Function to query Groq with context and question | |
# def query_groq(question, context): | |
# try: | |
# if not question.strip(): | |
# return "Error: Question is empty or invalid." | |
# if not context.strip(): | |
# return "Error: No context available from the uploaded documents." | |
# max_context_tokens = 4000 | |
# context = truncate_context(context, max_tokens=max_context_tokens) | |
# chat_completion = client.chat.completions.create( | |
# messages=[{"role": "system", "content": "You are a helpful assistant. Use the context provided to answer the question."}, | |
# {"role": "assistant", "content": context}, | |
# {"role": "user", "content": question}], | |
# model="llama3-8b-8192", stream=False) | |
# if chat_completion and chat_completion.choices: | |
# return chat_completion.choices[0].message.content | |
# else: | |
# return "Error: Received an unexpected response from Groq API." | |
# except Exception as e: | |
# return f"Error: {str(e)}" | |
# # Function to handle RAG pipeline | |
# def rag_pipeline(files, question, summarize_before_sending=False): | |
# try: | |
# if not files: | |
# return "Error: No files uploaded. Please upload at least one document." | |
# texts = process_files(files) | |
# if not texts: | |
# return "Error: Could not extract text from the uploaded files." | |
# combined_text = " ".join(texts) | |
# if summarize_before_sending: | |
# combined_text = summarize_text(combined_text) | |
# max_text_size = 4000 | |
# combined_text = truncate_context(combined_text, max_tokens=max_text_size) | |
# chunks = chunk_text(combined_text) | |
# create_embeddings_and_store(chunks) | |
# answer = query_groq(question, combined_text) | |
# return answer | |
# except Exception as e: | |
# return f"Error: {str(e)}" | |
# # Enhanced UI with modern and clean style | |
# with gr.Blocks() as app: | |
# with gr.Row(): | |
# # Left Column for instructions | |
# with gr.Column(scale=1, min_width=250): | |
# gr.Markdown(""" | |
# <div style="background: linear-gradient(145deg, #6e7dff, #1c2b58); padding: 30px; border-radius: 12px; box-shadow: 0 5px 15px rgba(0, 0, 0, 0.1); font-family: 'Roboto', sans-serif;"> | |
# <h2 style="color: #fff; font-size: 32px; font-weight: bold;">DocAI: Document Assistant</h2> | |
# <p style="color: #ddd; font-size: 18px;">Welcome to DocAI! Upload your documents and get intelligent answers based on their content.</p> | |
# <p style="color: #ddd; font-size: 16px; line-height: 1.6;"><strong>Steps to use:</strong></p> | |
# <ul style="color: #ddd; font-size: 16px; line-height: 1.6;"> | |
# <li>Upload your PDF or DOCX files.</li> | |
# <li>Ask questions related to the document.</li> | |
# <li>Enable "Summarize Before Sending" for a brief summary of the document.</li> | |
# <li>Click "Submit" to get your answers.</li> | |
# </ul> | |
# <p style="color: #ddd; font-size: 16px; line-height: 1.6;">Upload multiple files and get answers based on their contents.</p> | |
# </div> | |
# """) | |
# # Right Column for the main application content | |
# with gr.Column(scale=2, min_width=600): | |
# gr.Markdown(""" | |
# <div style="background: linear-gradient(135deg, #6e7dff, #1c2b58); padding: 20px; border-radius: 15px; box-shadow: 0 5px 15px rgba(0, 0, 0, 0.2); font-family: 'Roboto', sans-serif;"> | |
# <h2 style="color: #fff; font-size: 36px; font-weight: bold; text-align: center; letter-spacing: 2px; text-transform: uppercase;"> | |
# Ask Your Document | |
# </h2> | |
# <p style="color: #ddd; font-size: 18px; text-align: center; line-height: 1.6;"> | |
# Get intelligent answers based on the content of your uploaded documents. Just ask a question! | |
# </p> | |
# </div> | |
# """) | |
# # File input | |
# file_input = gr.File( | |
# label="Upload Documents (PDF/DOCX)", | |
# file_types=[".pdf", ".docx"], | |
# file_count="multiple", | |
# interactive=True | |
# ) | |
# # Question input | |
# question_input = gr.Textbox( | |
# label="Ask a question", | |
# placeholder="Type your question here...", | |
# interactive=True, | |
# lines=2, | |
# max_lines=4 | |
# ) | |
# # Summarize before sending checkbox | |
# summarize_before_input = gr.Checkbox( | |
# label="Summarize Before Sending", | |
# value=False | |
# ) | |
# # Output text box | |
# output = gr.Textbox( | |
# label="Answer from LLM", | |
# interactive=False, | |
# lines=4, | |
# max_lines=6 | |
# ) | |
# # Submit button | |
# submit_button = gr.Button("Submit", icon="send") | |
# # Apply the logic for the button to trigger the RAG pipeline | |
# submit_button.click(rag_pipeline, inputs=[file_input, question_input, summarize_before_input], outputs=output) | |
# # Launch the app | |
# app.launch() | |
import faiss
import gradio as gr
from docx import Document
from langchain.text_splitter import CharacterTextSplitter
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from transformers import pipeline  # Hugging Face for summarization
# Sentence-transformer encoder used to embed text chunks (384-dim vectors).
model = SentenceTransformer('all-MiniLM-L6-v2')

# FAISS vector store over L2 distance; rebuilt on every upload by
# create_embeddings_and_store().
dimension = 384  # embedding size of all-MiniLM-L6-v2
index = faiss.IndexFlatL2(dimension)

# Hugging Face summarization pipeline, used when the "Summarize Before
# Sending" checkbox is enabled in the UI.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
# Function to extract text from PDFs
def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page of the PDF at *file_path*.

    PyPDF2's ``extract_text()`` can return None for image-only pages; those
    pages contribute nothing instead of raising TypeError on concatenation.
    """
    reader = PdfReader(file_path)
    parts = []
    for page in reader.pages:
        parts.append(page.extract_text() or "")
    return "".join(parts)
# Function to extract text from DOCX
def extract_text_from_docx(file_path):
    """Return the text of the DOCX at *file_path*, one paragraph per line."""
    doc = Document(file_path)
    # Single join instead of quadratic string +=; trailing newline per
    # paragraph matches the original output exactly.
    return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)
# Function to process uploaded files
def process_files(files):
    """Extract text from each uploaded file (PDF or DOCX).

    Extension matching is case-insensitive (".PDF" counts); files with any
    other extension are silently skipped, mirroring the UI file_types filter.
    """
    texts = []
    for file in files:
        name = file.name.lower()  # accept .PDF / .Docx etc.
        if name.endswith('.pdf'):
            texts.append(extract_text_from_pdf(file.name))
        elif name.endswith('.docx'):
            texts.append(extract_text_from_docx(file.name))
    return texts
# Function to split text into overlapping chunks
def chunk_text(text, chunk_size=500, overlap=50):
    """Split *text* into overlapping character chunks for embedding."""
    splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    return splitter.split_text(text)
# Function to create embeddings and populate the FAISS index
def create_embeddings_and_store(chunks):
    """Re-embed *chunks* into a fresh FAISS index, replacing the global one.

    The index is reset first so stale vectors from a previous upload never
    leak into later searches.
    """
    global index
    index = faiss.IndexFlatL2(dimension)
    for chunk in chunks:
        # FAISS requires float32 input.
        embedding = model.encode([chunk]).astype('float32')
        index.add(embedding)
# Function for summarizing the text before sending it to the LLM
def summarize_text(text):
    """Summarize *text* with BART to shrink it before hitting token limits.

    NOTE(review): bart-large-cnn has a bounded input length; for very long
    documents only the leading portion is summarized — confirm acceptable.
    """
    summary = summarizer(text, max_length=300, min_length=100, do_sample=False)
    return summary[0]['summary_text']
# Function to clamp the context so it fits the Groq API's limit
def truncate_context(context, max_tokens=4000):
    """Clamp *context* to at most *max_tokens* characters.

    Despite the parameter name this is a plain character cut, not a
    tokenizer-aware truncation — a cheap approximation of Groq's token limit.
    """
    # Slicing covers both cases: shorter strings are returned unchanged.
    return context[:max_tokens]
# Function to query Groq with context and question
def query_groq(question, context):
    """Ask the Groq LLM *question* grounded in *context*.

    Returns the model's answer, or a human-readable "Error: ..." string on
    any failure — the UI displays whatever comes back, so this never raises.
    """
    try:
        if not question.strip():
            return "Error: Question is empty or invalid."
        if not context.strip():
            return "Error: No context available from the uploaded documents."
        # Clamp the context so the request stays within the model's limit.
        context = truncate_context(context, max_tokens=4000)
        chat_completion = client.chat.completions.create(
            messages=[
                {"role": "system",
                 "content": "You are a helpful assistant. Use the context provided to answer the question."},
                {"role": "assistant", "content": context},
                {"role": "user", "content": question},
            ],
            model="llama3-8b-8192",
            stream=False,
        )
        if chat_completion and chat_completion.choices:
            return chat_completion.choices[0].message.content
        return "Error: Received an unexpected response from Groq API."
    except Exception as e:
        # Deliberate broad catch: surface the failure in the UI textbox
        # instead of crashing the Gradio app.
        return f"Error: {str(e)}"
# Function implementing the end-to-end RAG pipeline (Submit button handler)
def rag_pipeline(files, question, summarize_before_sending=False):
    """Extract text from *files*, optionally summarize, index, and query Groq.

    Returns either the LLM's answer or an "Error: ..." string for the output
    textbox; never raises.
    """
    try:
        if not files:
            return "Error: No files uploaded. Please upload at least one document."
        texts = process_files(files)
        if not texts:
            return "Error: Could not extract text from the uploaded files."
        # Combine all extracted text into a single context string.
        combined_text = " ".join(texts)
        if summarize_before_sending:
            # Shrink the context to reduce token usage before the API call.
            combined_text = summarize_text(combined_text)
        # Keep the context within Groq's limit (character approximation).
        combined_text = truncate_context(combined_text, max_tokens=4000)
        # Rebuild the FAISS index.  NOTE(review): the stored embeddings are
        # never actually searched — the full combined text is sent as context
        # instead of retrieved chunks; confirm this is intentional.
        chunks = chunk_text(combined_text)
        create_embeddings_and_store(chunks)
        return query_groq(question, combined_text)
    except Exception as e:
        return f"Error: {str(e)}"
# Enhanced UI with modern and clean style
with gr.Blocks() as app:
    with gr.Row():
        # Left Column for instructions
        with gr.Column(scale=1, min_width=250):
            gr.Markdown("""
            <div style="background: linear-gradient(145deg, #6e7dff, #1c2b58); padding: 30px; border-radius: 12px; box-shadow: 0 5px 15px rgba(0, 0, 0, 0.1); font-family: 'Roboto', sans-serif;">
            <h2 style="color: #fff; font-size: 32px; font-weight: bold;">DocAI: Document Assistant</h2>
            <p style="color: #ddd; font-size: 18px;">Welcome to DocAI! Upload your documents and get intelligent answers based on their content.</p>
            <p style="color: #ddd; font-size: 16px; line-height: 1.6;"><strong>Steps to use:</strong></p>
            <ul style="color: #ddd; font-size: 16px; line-height: 1.6;">
            <li>Upload your PDF or DOCX files.</li>
            <li>Ask questions related to the document.</li>
            <li>Enable "Summarize Before Sending" for a brief summary of the document.</li>
            <li>Click "Submit" to get your answers.</li>
            </ul>
            <p style="color: #ddd; font-size: 16px; line-height: 1.6;">Upload multiple files and get answers based on their contents.</p>
            </div>
            """)
        # Right Column for the main application content
        with gr.Column(scale=2, min_width=600):
            gr.Markdown("""
            <div style="background: linear-gradient(135deg, #6e7dff, #1c2b58); padding: 20px; border-radius: 15px; box-shadow: 0 5px 15px rgba(0, 0, 0, 0.2); font-family: 'Roboto', sans-serif;">
            <h2 style="color: #fff; font-size: 36px; font-weight: bold; text-align: center; letter-spacing: 2px; text-transform: uppercase;">
            Ask Your Document
            </h2>
            <p style="color: #ddd; font-size: 18px; text-align: center; line-height: 1.6;">
            Get intelligent answers based on the content of your uploaded documents. Just ask a question!
            </p>
            </div>
            """)
            # File input
            file_input = gr.File(
                label="Upload Documents (PDF/DOCX)",
                file_types=[".pdf", ".docx"],
                file_count="multiple",
                interactive=True
            )
            # Question input
            question_input = gr.Textbox(
                label="Ask a question",
                placeholder="Type your question here...",
                interactive=True,
                lines=2,
                max_lines=4
            )
            # Summarize before sending checkbox
            summarize_before_input = gr.Checkbox(
                label="Summarize Before Sending",
                value=False
            )
            # Output text box
            output = gr.Textbox(
                label="Answer from LLM",
                interactive=False,
                lines=4,
                max_lines=6
            )
            # Submit button
            submit_button = gr.Button("Submit", icon="send")
            # Placeholder row below the controls
            with gr.Row():
                with gr.Column(scale=1, min_width=250):
                    gr.Markdown("<div style='font-size: 14px; color: #555;'>Your answer will appear here...</div>")
            # Wire the Submit button to the RAG pipeline
            submit_button.click(rag_pipeline, inputs=[file_input, question_input, summarize_before_input], outputs=output)

# Launch the app
app.launch()