rag-pdf-chatbot / app.py
datascientist22's picture
Update app.py
93c6cd0 verified
import os
from http.client import HTTPException
from io import BytesIO
from urllib.parse import urlparse
from urllib.request import urlopen

import PyPDF2
import streamlit as st
import torch
from langchain.chains import ConversationChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer, AutoModel
# Fall back to a placeholder OpenAI API key only when the environment does not
# already provide one, so a real key exported before start-up is never
# overwritten. Replace "api_key" or, preferably, export OPENAI_API_KEY.
os.environ.setdefault("OPENAI_API_KEY", "api_key")
# Set up the title and LinkedIn link
# The three calls below render top-to-bottom in the main page area, so their
# order is the on-screen order.
# NOTE(review): the first title is an empty string; it may have held an
# emoji/banner that was lost — confirm whether it is still wanted.
st.title("")
# Author credit rendered as a Markdown hyperlink.
st.markdown("[Engr. Hamesh Raj](https://www.linkedin.com/in/datascientisthameshraj/)")
# Visible page heading.
st.title("PDF Query Chatbot")
# Load the pre-trained model and tokenizer
@st.cache_resource
def load_model():
    """Load the DistilBERT tokenizer and encoder used for embeddings.

    Decorated with ``st.cache_resource``.

    Returns:
        A ``(tokenizer, model)`` tuple for the ``distilbert-base-uncased``
        checkpoint.
    """
    checkpoint = 'distilbert-base-uncased'
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        AutoModel.from_pretrained(checkpoint),
    )


# Module-level handles used by get_embeddings().
tokenizer, model = load_model()
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: Whatever ``PyPDF2.PdfReader`` accepts; this app passes an
            in-memory download or a Streamlit upload.

    Returns:
        One string with the page texts joined without a separator. Pages
        that yield no text contribute an empty string.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # Coerce a missing page text to '' so one text-less page cannot raise a
    # TypeError and abort the whole document; join also avoids building the
    # result through repeated string concatenation.
    return ''.join(page.extract_text() or '' for page in reader.pages)
def chunkize_text(text, chunk_size=1000, chunk_overlap=200):
    """Split *text* into overlapping chunks.

    Args:
        text: The string to split.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        The value returned by the splitter's ``split_text`` call.
    """
    splitter_options = {
        'chunk_size': chunk_size,
        'chunk_overlap': chunk_overlap,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)
def get_embeddings(texts, batch_size=32):
    """Embed texts by mean-pooling the encoder's last hidden state.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        texts: A string or a sequence of strings. Inputs are tokenized with
            ``truncation=True``, so over-long texts are cut by the tokenizer.
        batch_size: Maximum number of texts sent through the model at once,
            which bounds peak memory for long documents.

    Returns:
        A 2-D tensor with one row per input text. An empty input returns a
        tensor with zero rows and ``model.config.hidden_size`` columns.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        # The tokenizer cannot encode an empty batch (e.g. a PDF with no
        # extractable text), so return an empty result explicitly.
        return torch.empty((0, model.config.hidden_size))

    pooled_batches = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
            hidden = model(**inputs).last_hidden_state
            # Average over real tokens only. A plain mean over dim=1 also
            # averages padding positions, so a text's vector would depend on
            # the longest text it happened to be batched with.
            mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)
            pooled_batches.append((hidden * mask).sum(dim=1) / token_counts)
    return torch.cat(pooled_batches, dim=0)
# Sidebar for file upload and link input
st.sidebar.title("Load PDF")
pdf_url = st.sidebar.text_input("Paste PDF link here:")
uploaded_files = st.sidebar.file_uploader("Or upload PDF file(s)", type="pdf", accept_multiple_files=True)
submit_button = st.sidebar.button("Submit")

# Streamlit re-executes this script from the top on every widget interaction.
# Processed PDFs therefore live in session_state; a plain module-level dict
# would be empty again by the time the user types a query.
if "pdf_chunks_embeddings" not in st.session_state:
    st.session_state["pdf_chunks_embeddings"] = {}
pdf_chunks_embeddings = st.session_state["pdf_chunks_embeddings"]

# Number of best-matching chunks passed to the LLM as context for each PDF.
TOP_K_CHUNKS = 3
# Upper bound, in seconds, on how long a PDF download may block the app.
DOWNLOAD_TIMEOUT_SECONDS = 30


def _download_pdf(url):
    """Fetch *url* and return its body as an in-memory binary file.

    Raises:
        ValueError: If the link is not an http(s) URL. The link is user
            input, so other schemes such as ``file://`` are refused.
        OSError, HTTPException: If the download fails.
    """
    if urlparse(url).scheme not in ("http", "https"):
        raise ValueError("only http:// and https:// links are supported")
    with urlopen(url, timeout=DOWNLOAD_TIMEOUT_SECONDS) as response:
        return BytesIO(response.read())


def _index_pdf(source_name, pdf_file):
    """Extract, chunk and embed one PDF and store it under *source_name*.

    Returns:
        True when the PDF was stored, False when it had no extractable text.
    """
    chunks = chunkize_text(extract_text_from_pdf(pdf_file))
    if not chunks:
        st.warning(f"No extractable text found in `{source_name}`.")
        return False
    pdf_chunks_embeddings[source_name] = {
        'chunks': chunks,
        'embeddings': get_embeddings(chunks),
    }
    return True


def _select_context(query_embedding, data, top_k=TOP_K_CHUNKS):
    """Return the chunks of one PDF most similar to the query, in document order."""
    scores = torch.nn.functional.cosine_similarity(query_embedding, data['embeddings'], dim=1)
    count = min(top_k, scores.shape[0])
    best = torch.topk(scores, k=count).indices.tolist()
    return [data['chunks'][index] for index in sorted(best)]


if submit_button:
    if pdf_url:
        try:
            pdf_file = _download_pdf(pdf_url)
        except (OSError, ValueError, HTTPException) as e:
            st.error(f"Error loading PDF from URL: {e}")
        else:
            st.write(f"Processing document from URL: {pdf_url}")
            if _index_pdf(pdf_url, pdf_file):
                st.success("PDF processed successfully!")
    if uploaded_files:
        indexed_count = 0
        for uploaded_file in uploaded_files:
            pdf_name = uploaded_file.name
            st.write(f"Processing `{pdf_name}`...")
            if _index_pdf(pdf_name, uploaded_file):
                indexed_count += 1
        if indexed_count:
            st.success("PDF(s) processed successfully!")

# Chatbot section for querying the PDF content
st.write("### PDF Query Chatbot")
if pdf_chunks_embeddings:
    query = st.text_input("Enter your query here:")
    if query:
        llm = OpenAI()
        query_embedding = get_embeddings([query])
        # Answer the query once per processed PDF, using only that PDF's
        # best-matching chunks as context.
        for pdf_name, data in pdf_chunks_embeddings.items():
            context = "\n\n".join(_select_context(query_embedding, data))
            prompt = (
                "Answer the question using only the context below. "
                "If the answer is not in the context, say so.\n\n"
                f"Context:\n{context}\n\n"
                f"Question: {query}"
            )
            # A fresh chain per PDF keeps one document's context out of the
            # next document's answer via the chain's conversation memory.
            chatbot = ConversationChain(llm=llm)
            response = chatbot.run(prompt)
            st.write(f"**Response from `{pdf_name}`:**\n{response}\n{'-'*50}")
else:
    st.write("No PDFs processed yet. Please submit a PDF to get started.")