Spaces:

datascientist22
/

rag-pdf-chatbot

Sleeping

App Files Files Community

rag-pdf-chatbot / app.py

datascientist22

Update app.py

93c6cd0 verified 3 months ago

raw

history blame contribute delete

3.69 kB

	import os
	import urllib.parse
	import urllib.request
	from io import BytesIO

	import PyPDF2
	import streamlit as st
	import torch
	from transformers import AutoTokenizer, AutoModel
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.chains import ConversationChain
	from langchain.llms import OpenAI
	from langchain.embeddings import HuggingFaceEmbeddings

	# Fall back to a placeholder OpenAI API key only when the environment does
	# not already provide one. Using setdefault (instead of plain assignment)
	# keeps a real key configured in the environment from being overwritten by
	# the hard-coded placeholder.
	os.environ.setdefault("OPENAI_API_KEY", "api_key")

	# Page header: an empty title line, the author's LinkedIn link rendered as
	# markdown, then the application title.
	_AUTHOR_LINK = "[Engr. Hamesh Raj](https://www.linkedin.com/in/datascientisthameshraj/)"
	st.title("")
	st.markdown(_AUTHOR_LINK)
	st.title("PDF Query Chatbot")

	# Build the DistilBERT tokenizer/encoder pair used for embedding text.
	@st.cache_resource
	def load_model():
	    """Return the ``distilbert-base-uncased`` tokenizer and model as a tuple.

	    The function is wrapped in ``st.cache_resource`` so Streamlit can reuse
	    the loaded objects instead of loading them again on each script run.

	    Returns:
	        ``(tokenizer, model)`` loaded via ``AutoTokenizer`` and ``AutoModel``.
	    """
	    checkpoint = 'distilbert-base-uncased'
	    return (
	        AutoTokenizer.from_pretrained(checkpoint),
	        AutoModel.from_pretrained(checkpoint),
	    )


	# Module-level handles read by get_embeddings().
	tokenizer, model = load_model()

	def extract_text_from_pdf(pdf_file):
	    """Return the text of every page of a PDF, concatenated in page order.

	    Args:
	        pdf_file: Object handed straight to ``PyPDF2.PdfReader`` (this app
	            passes a ``BytesIO`` buffer or a Streamlit uploaded file).

	    Returns:
	        One string containing each page's extracted text, joined without a
	        separator. A page that yields no text contributes nothing.
	    """
	    reader = PyPDF2.PdfReader(pdf_file)
	    # Iterate the pages directly and join once instead of indexing and
	    # growing a string with `+=`. `or ''` is a defensive guard so a page
	    # whose extraction yields None cannot raise TypeError.
	    return ''.join(page.extract_text() or '' for page in reader.pages)

	def chunkize_text(text, chunk_size=1000, chunk_overlap=200):
	    """Split ``text`` into chunks using ``RecursiveCharacterTextSplitter``.

	    Args:
	        text: The string to split.
	        chunk_size: Forwarded to the splitter as ``chunk_size``.
	        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

	    Returns:
	        Whatever ``split_text`` returns for ``text`` (the chunk list).
	    """
	    splitter_options = {'chunk_size': chunk_size, 'chunk_overlap': chunk_overlap}
	    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)

	def get_embeddings(texts):
	    """Embed text(s) with the module-level ``tokenizer`` and ``model``.

	    Each embedding is the mean of the model's final-layer token vectors,
	    taken over real tokens only. Padding positions are excluded through the
	    attention mask, so a text's embedding does not change with the length
	    of the other texts it happens to be batched with.

	    Args:
	        texts: A single string or a sequence of strings.

	    Returns:
	        A tensor with one row per input text. An empty sequence returns a
	        tensor of shape ``(0, hidden_size)`` instead of failing inside the
	        tokenizer.
	    """
	    # An empty batch (e.g. a PDF with no extractable text) cannot be
	    # tokenized; a lone string, even an empty one, is still a valid input.
	    if not isinstance(texts, str) and len(texts) == 0:
	        return torch.empty((0, model.config.hidden_size))

	    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
	    with torch.no_grad():
	        hidden = model(**inputs).last_hidden_state
	        # Zero out padded positions, then divide by the real token count.
	        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
	        token_counts = mask.sum(dim=1).clamp(min=1)
	        embeddings = (hidden * mask).sum(dim=1) / token_counts
	    return embeddings

	# Sidebar for file upload and link input
	st.sidebar.title("Load PDF")
	pdf_url = st.sidebar.text_input("Paste PDF link here:")
	uploaded_files = st.sidebar.file_uploader("Or upload PDF file(s)", type="pdf", accept_multiple_files=True)
	submit_button = st.sidebar.button("Submit")

	# Processed PDFs live in session state. Streamlit re-executes this script on
	# every widget interaction and the Submit button is only True for the run
	# caused by the click, so a plain module-level dict would be empty again by
	# the time the user types a query.
	if "pdf_chunks_embeddings" not in st.session_state:
	    st.session_state["pdf_chunks_embeddings"] = {}
	pdf_chunks_embeddings = st.session_state["pdf_chunks_embeddings"]

	# Number of best-matching chunks per PDF that are sent to the LLM as context.
	_TOP_K_CHUNKS = 3


	def _download_pdf(url):
	    """Fetch ``url`` over http(s) and return its body as a ``BytesIO``."""
	    # urlopen also understands file:// and similar schemes; refuse those so a
	    # pasted link cannot be used to read files from the server.
	    if urllib.parse.urlparse(url).scheme not in ("http", "https"):
	        raise ValueError("only http:// and https:// links are supported")
	    with urllib.request.urlopen(url, timeout=30) as http_response:
	        return BytesIO(http_response.read())


	def _store_pdf(name, pdf_file):
	    """Extract, chunk and embed one PDF under ``name``; return True if stored."""
	    chunks = chunkize_text(extract_text_from_pdf(pdf_file))
	    if not chunks:
	        # Nothing to embed (e.g. a scanned, image-only PDF).
	        st.warning(f"No extractable text found in `{name}`.")
	        return False
	    pdf_chunks_embeddings[name] = {'chunks': chunks, 'embeddings': get_embeddings(chunks)}
	    return True


	def _most_relevant_chunks(query, data, k=_TOP_K_CHUNKS):
	    """Return up to ``k`` chunks of ``data`` ranked by cosine similarity to ``query``."""
	    query_vec = get_embeddings([query])
	    scores = torch.nn.functional.cosine_similarity(query_vec, data['embeddings'], dim=1)
	    k = min(k, len(data['chunks']))
	    return [data['chunks'][i] for i in torch.topk(scores, k).indices.tolist()]


	if submit_button:
	    if pdf_url:
	        # Only the download sits in the try block so that processing errors
	        # are not misreported as download failures. URLError/HTTPError and
	        # timeouts are OSError subclasses; ValueError covers malformed or
	        # unsupported URLs.
	        try:
	            pdf_file = _download_pdf(pdf_url)
	        except (OSError, ValueError) as e:
	            st.error(f"Error loading PDF from URL: {e}")
	        else:
	            st.write(f"Processing document from URL: {pdf_url}")
	            if _store_pdf(pdf_url, pdf_file):
	                st.success("PDF processed successfully!")

	    if uploaded_files:
	        stored_any = False
	        for uploaded_file in uploaded_files:
	            pdf_name = uploaded_file.name
	            st.write(f"Processing `{pdf_name}`...")
	            if _store_pdf(pdf_name, uploaded_file):
	                stored_any = True
	        if stored_any:
	            st.success("PDF(s) processed successfully!")

	# Chatbot section for querying the PDF content
	st.write("### PDF Query Chatbot")
	if pdf_chunks_embeddings:
	    query = st.text_input("Enter your query here:")
	    if query:
	        llm = OpenAI()
	        # Answer once per PDF, grounding the LLM in that PDF's most relevant
	        # chunks. ConversationChain is not used here: it accepts no
	        # embedding model and has no add_documents method.
	        for pdf_name, data in pdf_chunks_embeddings.items():
	            context = "\n\n".join(_most_relevant_chunks(query, data))
	            prompt = (
	                "Answer the question using only the context below.\n\n"
	                f"Context:\n{context}\n\n"
	                f"Question: {query}\nAnswer:"
	            )
	            answer = llm(prompt)
	            st.write(f"Response from `{pdf_name}`:\n{answer}\n{'-'*50}")
	else:
	    st.write("No PDFs processed yet. Please submit a PDF to get started.")