import gradio as gr
from langchain.text_splitter import CharacterTextSplitter
from sentence_transformers import SentenceTransformer
import faiss
from PyPDF2 import PdfReader
from docx import Document
from transformers import pipeline  # Hugging Face for summarization
import os
from groq import Groq


# Initialize Sentence Transformer for embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Reads the API key from the environment; GROQ_API_KEY is the conventional variable name
client = Groq(api_key=os.getenv("GROQ_API_KEY"))

# Vector Store (FAISS)
dimension = 384  # Embedding size of all-MiniLM-L6-v2
index = faiss.IndexFlatL2(dimension)

# Initialize Hugging Face summarization model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
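# Note: bart-large-cnn accepts roughly 1024 input tokens; longer inputs may be
# truncated or rejected by the pipeline, so very large documents can lose
# content during summarization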

# Function to extract text from PDFs
def extract_text_from_pdf(file_path):
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        # extract_text() can return None for pages with no extractable text
        text += page.extract_text() or ""
    return text

# Function to extract text from DOCX
def extract_text_from_docx(file_path):
    doc = Document(file_path)
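    # Note: doc.paragraphs does not include text inside tables; table content
    # would need to be read separately via doc.tables if required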
    text = ""
    for paragraph in doc.paragraphs:
        text += paragraph.text + "\n"
    return text

# Function to process files
def process_files(files):
    texts = []
    for file in files:
        if file.name.endswith('.pdf'):
            texts.append(extract_text_from_pdf(file.name))
        elif file.name.endswith('.docx'):
            texts.append(extract_text_from_docx(file.name))
    return texts

# Function to split text into overlapping chunks
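# Note: LangChain's CharacterTextSplitter splits on a separator ("\n\n" by
# default) and then merges pieces, so a chunk may exceed chunk_size when a
# single piece is longer than the limit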
def chunk_text(text, chunk_size=500, overlap=50):
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    return text_splitter.split_text(text)

# Function to create embeddings and populate FAISS index
def create_embeddings_and_store(chunks):
    global index
    # Reset the FAISS index before adding new embeddings
    index = faiss.IndexFlatL2(dimension)
    for chunk in chunks:
        embedding = model.encode([chunk])
        embedding = embedding.astype('float32')  # Ensure embedding is in correct format
        index.add(embedding)
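
# Hedged sketch (not called anywhere in this app): the FAISS index above is
# built but never queried; rag_pipeline sends the full truncated text to Groq.
# If chunk retrieval were wanted, it could look like this, where `k` is an
# assumed parameter for the number of nearest chunks.
def retrieve_relevant_chunks(question, chunks, k=3):
    query_embedding = model.encode([question]).astype('float32')
    distances, indices = index.search(query_embedding, k)
    # FAISS pads with -1 when k exceeds the number of stored vectors
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]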

# Function for summarizing the text before sending
def summarize_text(text):
    summary = summarizer(text, max_length=300, min_length=100, do_sample=False)
    return summary[0]['summary_text']

# Function to cap the context size before sending it to Groq
# Note: this truncates by characters as a rough proxy for tokens, not by
# actual token count
def truncate_context(context, max_tokens=4000):  # adjust based on Groq's limits
    if len(context) > max_tokens:
        context = context[:max_tokens]  # truncate to fit within the budget
    return context
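
# Hedged alternative sketch: for a real token count one could reuse the
# summarizer's tokenizer loaded above. That is BART's tokenizer, not the
# Llama 3 tokenizer Groq uses, so the count is still only approximate.
def truncate_context_by_tokens(context, max_tokens=4000):
    token_ids = summarizer.tokenizer.encode(context, add_special_tokens=False)
    return summarizer.tokenizer.decode(token_ids[:max_tokens])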

# Function to query Groq with context and question
def query_groq(question, context):
    try:
        if not question.strip():
            return "Error: Question is empty or invalid."
        if not context.strip():
            return "Error: No context available from the uploaded documents."

        # Truncate the context to the rough character budget (see truncate_context)
        max_context_tokens = 4000  # approximate budget, not an exact token count
        context = truncate_context(context, max_tokens=max_context_tokens)

        # Query the Groq API with the truncated context; the document context
        # is passed as an assistant turn, followed by the user's question
        chat_completion = client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful assistant. Use the context provided to answer the question."},
                {"role": "assistant", "content": context},
                {"role": "user", "content": question},
            ],
            model="llama3-8b-8192",
            stream=False,
        )
        if chat_completion and chat_completion.choices:
            return chat_completion.choices[0].message.content
        else:
            return "Error: Received an unexpected response from Groq API."
    except Exception as e:
        return f"Error: {str(e)}"

# Function to handle RAG pipeline
def rag_pipeline(files, question, summarize_before_sending=False):
    try:
        if not files:
            return "Error: No files uploaded. Please upload at least one document."

        # Process uploaded files
        texts = process_files(files)
        if not texts:
            return "Error: Could not extract text from the uploaded files."

        # Combine all extracted text into a single context
        combined_text = " ".join(texts)

        if summarize_before_sending:
            # Summarize the text to reduce token count
            combined_text = summarize_text(combined_text)

        # Keep the combined text within the rough character budget
        max_text_size = 4000  # approximate, consistent with truncate_context
        combined_text = truncate_context(combined_text, max_tokens=max_text_size)

        # Chunk the text and rebuild the FAISS index (the index itself is not
        # queried below; the truncated combined text is sent to Groq directly)
        chunks = chunk_text(combined_text)
        create_embeddings_and_store(chunks)

        # Query Groq LLM with context and question
        answer = query_groq(question, combined_text)
        return answer
    except Exception as e:
        return f"Error: {str(e)}"

# Enhanced UI with modern and clean style
with gr.Blocks() as app:
    with gr.Row():
        # Left Column for instructions
        with gr.Column(scale=1, min_width=250):
            gr.Markdown("""
            <div style="background: #3498db; padding: 30px; border-radius: 12px; box-shadow: 0 5px 15px rgba(0, 0, 0, 0.1); font-family: 'Roboto', sans-serif;">
                <h2 style="color: #fff; font-size: 32px; font-weight: bold;">DocAI: Document Assistant</h2>
                <p style="color: #ddd; font-size: 18px;">Welcome to DocAI! Upload your documents and get intelligent answers based on their content.</p>
                <p style="color: #ddd; font-size: 16px; line-height: 1.6;"><strong>Steps to use:</strong></p>
                <ul style="color: #ddd; font-size: 16px; line-height: 1.6;">
                    <li>Upload your PDF or DOCX files.</li>
                    <li>Ask questions related to the document.</li>
                    <li>Click "Submit" to get your answers.</li>
                </ul>
                <p style="color: #ddd; font-size: 16px; line-height: 1.6;">Upload multiple files and get answers based on their contents.</p>
            </div>
            """)

        # Right Column for the main application content
        with gr.Column(scale=2, min_width=600):
            gr.Markdown("""
            <div style="background: #3498db; padding: 20px; border-radius: 15px; box-shadow: 0 5px 15px rgba(0, 0, 0, 0.2); font-family: 'Roboto', sans-serif;">
                <h2 style="color: #fff; font-size: 36px; font-weight: bold; text-align: center; letter-spacing: 2px; text-transform: uppercase;">
                    Ask Your Document
                </h2>
                <p style="color: #ddd; font-size: 18px; text-align: center; line-height: 1.6;">
                    Get intelligent answers based on the content of your uploaded documents. Just ask a question!
                </p>
            </div>
            """)

            # File input
            file_input = gr.File(
                label="Upload Documents (PDF/DOCX)", 
                file_types=[".pdf", ".docx"], 
                file_count="multiple", 
                interactive=True
            )

            # Question input
            question_input = gr.Textbox(
                label="Ask a question related your document", 
                placeholder="Type your question here...", 
                interactive=True, 
                lines=2, 
                max_lines=4
            )

            # # Summarize before sending checkbox
            # summarize_before_input = gr.Checkbox(
            #     label="Summarize Before Sending", 
            #     value=False
            # )

            # Output text box with enhanced styling
            output = gr.Textbox(
                label="Answer from LLM", 
                interactive=False, 
                lines=4, 
                max_lines=6
            )
            
            # Submit button (Gradio's `icon` parameter expects an image file
            # path, so use the primary variant for emphasis instead)
            submit_button = gr.Button("Submit", variant="primary")

            # Loading spinner
            # with gr.Row():
            #     with gr.Column(scale=1, min_width=250):
            #         gr.Markdown("<div style='font-size: 14px; color: #555;'>Your answer will appear here...</div>")
            
            # Apply the logic for the button to trigger the RAG pipeline
            submit_button.click(rag_pipeline, inputs=[file_input, question_input], outputs=output)

# Launch the app
app.launch()