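"""PDF QA Assistant.

A Gradio app for Hugging Face Spaces: upload PDFs, index them into a FAISS
vector store with Google Generative AI embeddings, and answer questions over
them with a Groq-hosted Llama 3 70B model via a LangChain retrieval chain.
"""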
import gradio as gr
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain_chroma import Chroma
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
import os
from dotenv import load_dotenv
from helper import SYSTEM_PROMPT
from langchain_google_genai import GoogleGenerativeAIEmbeddings
# from langchain.embeddings import HuggingFaceEmbeddings  # open-source, free embeddings

load_dotenv()
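
# load_dotenv() reads keys from a local .env file; on Hugging Face Spaces the
# same variables are typically provided as repository secrets instead.
# GROQ_API_KEY is passed to ChatGroq explicitly below, while
# GoogleGenerativeAIEmbeddings expects GOOGLE_API_KEY in the environment.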


class PDFQAProcessor:
    SYSTEM_PROMPT = SYSTEM_PROMPT

    llm = ChatGroq(
        # model_name="deepseek-r1-distill-llama-70b",
        model_name="llama3-70b-8192",
        temperature=0.1,
        max_tokens=3000,
        api_key=os.getenv("GROQ_API_KEY"),
    )

    # Set up the RAG chain
    prompt = ChatPromptTemplate.from_messages([
        ("system", SYSTEM_PROMPT),
        ("human", "{input}"),
    ])
    question_answer_chain = create_stuff_documents_chain(llm, prompt)
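
    # create_stuff_documents_chain "stuffs" all retrieved chunks into a single
    # prompt, and it requires a {context} placeholder somewhere in that prompt.
    # Since the human turn only carries {input}, SYSTEM_PROMPT (imported from
    # helper.py, not shown here) is assumed to contain {context}, e.g.:
    #
    #   SYSTEM_PROMPT = (
    #       "Answer the question using only the retrieved context below. "
    #       "If the answer is not in the context, say you don't know.\n\n{context}"
    #   )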

    # EMBEDDING_MODEL = "intfloat/e5-large-v2"
    # embeddings = HuggingFaceEmbeddings(
    #     model_name=EMBEDDING_MODEL,
    #     model_kwargs={'device': 'cpu'},
    #     encode_kwargs={'normalize_embeddings': True},
    # )
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

    CHUNK_SIZE = 700
    CHUNK_OVERLAP = 150
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
    )
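
    # The splitter breaks pages at paragraph/sentence boundaries where it can;
    # the 150-character overlap keeps passages that straddle a chunk boundary
    # retrievable from either side.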

    # persist_directory = "./chroma_db"

    def __init__(self):
        self.vectorstore = None
        self.retriever = None

    def process_pdfs(self, pdf_files):
        """Process the uploaded PDF files and build the vector store."""
        if not pdf_files:
            return "Please upload PDF files first!"
        try:
            # Load every PDF, then split the pages into overlapping chunks
            docs = []
            for pdf_file in pdf_files:
                loader = PyPDFLoader(pdf_file.name)
                docs.extend(loader.load())
            splits = self.text_splitter.split_documents(docs)
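
            # split_documents preserves each page's metadata (source path, page
            # number), which is what the Sources section cites later.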

            # # Create the vector store with Chroma:
            # self.vectorstore = Chroma.from_documents(
            #     documents=splits,
            #     embedding=self.embeddings,
            #     # persist_directory=self.persist_directory,
            # )
            # FAISS (in-memory) used in place of Chroma:
            self.vectorstore = FAISS.from_documents(splits, self.embeddings)
            self.retriever = self.vectorstore.as_retriever(search_kwargs={"k": 10})
return "PDFs processed successfully! Ask your questions now." | |
except Exception as e: | |
return f"Error processing PDFs: {str(e)}" | |

    def answer_question(self, question):
        """Answer a question against the processed PDFs, citing sources."""
        if not self.retriever:
            # Single string return: the Ask button is wired to one output component
            return "Please process PDFs first!"
        try:
            # Build the retrieval chain on demand so it uses the current retriever
            rag_chain = create_retrieval_chain(self.retriever, self.question_answer_chain)
            response = rag_chain.invoke({"input": question})
            # final_response = response["answer"] + "\n\nSources\n\n"
            # for info in response["context"]:
            #     final_response += info.page_content + "\nSource of Info: " + info.metadata['source'] + "\nAt Page No: " + info.metadata['page_label'] + "\n\n"
            final_response = response["answer"] + "\n\n### Sources\n\n"  # Markdown heading for the sources list
            for info in response["context"]:
                final_response += (
                    f"{info.page_content}<br>"  # <br> line breaks render in the gr.Markdown output
                    f"Source of Info: {info.metadata['source']}<br>"
                    f"At Page No: {info.metadata['page_label']}<br><br>"
                )
            return final_response
        except Exception as e:
            return f"Error answering question: {str(e)}"
processor = PDFQAProcessor()

with gr.Blocks(title="PDF QA Assistant") as demo:
    with gr.Tab("Upload PDFs"):
        file_input = gr.Files(label="Upload PDFs", file_types=[".pdf"])
        process_btn = gr.Button("Process PDFs")
        status_output = gr.Textbox(label="Processing Status")

    with gr.Tab("Ask Questions"):
        question_input = gr.Textbox(label="Your Question")
        # answer_output = gr.Textbox(label="Answer", interactive=False)
        answer_output = gr.Markdown(label="Answer")
        ask_btn = gr.Button("Ask Question")

    # PDF processing workflow
    process_btn.click(
        processor.process_pdfs,
        inputs=file_input,
        outputs=status_output,
    )

    # QA workflow
    ask_btn.click(
        processor.answer_question,
        inputs=question_input,
        outputs=[answer_output],
    )

if __name__ == "__main__":
    demo.launch()
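
# Run locally with `python app.py`; Gradio serves the UI on
# http://127.0.0.1:7860 by default. On a Gradio-SDK Space this file is
# executed automatically.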