import gradio as gr
from langchain.text_splitter import CharacterTextSplitter
from sentence_transformers import SentenceTransformer
import faiss
from PyPDF2 import PdfReader
from docx import Document
from transformers import pipeline # Hugging Face for summarization
import os
from groq import Groq
# Initialize Sentence Transformer for embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')
client = Groq(api_key=os.getenv("groq_api_key"))  # expects the key in a "groq_api_key" env var / Space secret
# Vector Store (FAISS)
dimension = 384 # all-MiniLM-L6-v2 produces 384-dimensional embeddings
index = faiss.IndexFlatL2(dimension)
# Initialize Hugging Face summarization model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
# Function to extract text from PDFs
def extract_text_from_pdf(file_path):
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        # extract_text() can return None for image-only pages
        text += page.extract_text() or ""
    return text
# Function to extract text from DOCX
def extract_text_from_docx(file_path):
    doc = Document(file_path)
    text = ""
    for paragraph in doc.paragraphs:
        text += paragraph.text + "\n"
    return text
# Function to process files
def process_files(files):
    texts = []
    for file in files:
        # Match extensions case-insensitively so e.g. ".PDF" is also accepted
        if file.name.lower().endswith('.pdf'):
            texts.append(extract_text_from_pdf(file.name))
        elif file.name.lower().endswith('.docx'):
            texts.append(extract_text_from_docx(file.name))
    return texts
# Function to tokenize and chunk text
def chunk_text(text, chunk_size=500, overlap=50):
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    return text_splitter.split_text(text)
# Function to create embeddings and populate FAISS index
def create_embeddings_and_store(chunks):
    global index
    # Reset the FAISS index before adding new embeddings
    index = faiss.IndexFlatL2(dimension)
    for chunk in chunks:
        embedding = model.encode([chunk])
        embedding = embedding.astype('float32')  # FAISS expects float32 vectors
        index.add(embedding)
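# A minimal retrieval sketch, not wired into the pipeline below: assuming the
# caller still holds the `chunks` list passed to create_embeddings_and_store,
# this shows how the FAISS index could be queried for the top-k chunks most
# relevant to a question. The function name and the k=3 default are illustrative.
def retrieve_relevant_chunks(question, chunks, k=3):
    query_embedding = model.encode([question]).astype('float32')
    # index.search returns (distances, indices), one row per query vector
    distances, indices = index.search(query_embedding, min(k, index.ntotal))
    return [chunks[i] for i in indices[0]]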
# Function for summarizing the text before sending
def summarize_text(text):
    # truncation=True keeps over-long inputs within the model's limit instead of raising
    summary = summarizer(text, max_length=300, min_length=100, do_sample=False, truncation=True)
    return summary[0]['summary_text']
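# Hedged sketch: facebook/bart-large-cnn accepts roughly 1024 input tokens, so
# very long documents may be better served by summarizing piece by piece and
# joining the partial summaries. The helper name and the character-based chunk
# size are illustrative assumptions, not part of the original pipeline.
def summarize_long_text(text, chunk_size=2000):
    parts = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    partial_summaries = [
        summarizer(part, max_length=150, min_length=30, do_sample=False,
                   truncation=True)[0]['summary_text']
        for part in parts
    ]
    return " ".join(partial_summaries)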
# Function to cap the context size before sending it to the Groq API
def truncate_context(context, max_tokens=4000):
    # Note: this truncates by characters, not real tokens, so it is only a
    # rough proxy for the model's actual token limit
    if len(context) > max_tokens:
        context = context[:max_tokens]
    return context
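# Hedged alternative sketch: counting whitespace-separated words tracks real
# token counts more closely than raw character length, though it is still only
# an approximation of the tokenizer Groq actually uses.
def truncate_context_by_words(context, max_tokens=4000):
    words = context.split()
    if len(words) > max_tokens:
        return " ".join(words[:max_tokens])
    return context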
# Function to query Groq with context and question
def query_groq(question, context):
    try:
        if not question.strip():
            return "Error: Question is empty or invalid."
        if not context.strip():
            return "Error: No context available from the uploaded documents."
        # Truncate the context to fit within the (approximate) token limit
        max_context_tokens = 4000
        context = truncate_context(context, max_tokens=max_context_tokens)
        # Query the Groq API with the truncated context
        chat_completion = client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful assistant. Use the context provided to answer the question."},
                {"role": "assistant", "content": context},
                {"role": "user", "content": question}
            ],
            model="llama3-8b-8192",
            stream=False
        )
        if chat_completion and chat_completion.choices:
            return chat_completion.choices[0].message.content
        else:
            return "Error: Received an unexpected response from the Groq API."
    except Exception as e:
        return f"Error: {str(e)}"
# Function to handle RAG pipeline
def rag_pipeline(files, question, summarize_before_sending=False):
    try:
        if not files:
            return "Error: No files uploaded. Please upload at least one document."
        # Process uploaded files
        texts = process_files(files)
        if not texts:
            return "Error: Could not extract text from the uploaded files."
        # Combine all extracted text into a single context
        combined_text = " ".join(texts)
        if summarize_before_sending:
            # Summarize the text to reduce the token count
            combined_text = summarize_text(combined_text)
        # Keep the combined text within the (approximate) token limit
        max_text_size = 4000
        combined_text = truncate_context(combined_text, max_tokens=max_text_size)
        # Chunk the text and create embeddings. Note that the FAISS index is
        # populated here but never queried: the full combined text, not a set of
        # retrieved chunks, is what gets sent to the LLM below (see the
        # retrieve_relevant_chunks sketch above for how retrieval could work).
        chunks = chunk_text(combined_text)
        create_embeddings_and_store(chunks)
        # Query the Groq LLM with the context and question
        answer = query_groq(question, combined_text)
        return answer
    except Exception as e:
        return f"Error: {str(e)}"
# Enhanced UI with modern and clean style
with gr.Blocks() as app:
    with gr.Row():
        # Left column for instructions
        with gr.Column(scale=1, min_width=250):
            gr.Markdown("""
            <div style="background: #3498db; padding: 30px; border-radius: 12px; box-shadow: 0 5px 15px rgba(0, 0, 0, 0.1); font-family: 'Roboto', sans-serif;">
                <h2 style="color: #fff; font-size: 32px; font-weight: bold;">DocAI: Document Assistant</h2>
                <p style="color: #ddd; font-size: 18px;">Welcome to DocAI! Upload your documents and get intelligent answers based on their content.</p>
                <p style="color: #ddd; font-size: 16px; line-height: 1.6;"><strong>Steps to use:</strong></p>
                <ul style="color: #ddd; font-size: 16px; line-height: 1.6;">
                    <li>Upload your PDF or DOCX files.</li>
                    <li>Ask questions related to the document.</li>
                    <li>Click "Submit" to get your answers.</li>
                </ul>
                <p style="color: #ddd; font-size: 16px; line-height: 1.6;">Upload multiple files and get answers based on their contents.</p>
            </div>
            """)
        # Right column for the main application content
        with gr.Column(scale=2, min_width=600):
            gr.Markdown("""
            <div style="background: #3498db; padding: 20px; border-radius: 15px; box-shadow: 0 5px 15px rgba(0, 0, 0, 0.2); font-family: 'Roboto', sans-serif;">
                <h2 style="color: #fff; font-size: 36px; font-weight: bold; text-align: center; letter-spacing: 2px; text-transform: uppercase;">
                    Ask Your Document
                </h2>
                <p style="color: #ddd; font-size: 18px; text-align: center; line-height: 1.6;">
                    Get intelligent answers based on the content of your uploaded documents. Just ask a question!
                </p>
            </div>
            """)
            # File input
            file_input = gr.File(
                label="Upload Documents (PDF/DOCX)",
                file_types=[".pdf", ".docx"],
                file_count="multiple",
                interactive=True
            )
            # Question input
            question_input = gr.Textbox(
                label="Ask a question related to your document",
                placeholder="Type your question here...",
                interactive=True,
                lines=2,
                max_lines=4
            )
            # # Summarize-before-sending checkbox (currently disabled)
            # summarize_before_input = gr.Checkbox(
            #     label="Summarize Before Sending",
            #     value=False
            # )
            # Output text box
            output = gr.Textbox(
                label="Answer from LLM",
                interactive=False,
                lines=4,
                max_lines=6
            )
            # Submit button
            submit_button = gr.Button("Submit", icon="send")
            # Loading spinner placeholder (currently disabled)
            # with gr.Row():
            #     with gr.Column(scale=1, min_width=250):
            #         gr.Markdown("<div style='font-size: 14px; color: #555;'>Your answer will appear here...</div>")
    # Wire the button to the RAG pipeline
    submit_button.click(rag_pipeline, inputs=[file_input, question_input], outputs=output)

# Launch the app
app.launch()