# Hosting-page header captured with this file -- Spaces status: Sleeping
import http.client
import os
import urllib.request
from io import BytesIO

import PyPDF2
import streamlit as st
import torch
from langchain.chains import ConversationChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer, AutoModel
# Provide a placeholder OpenAI API key only when none is configured.
# ``setdefault`` keeps a real key that is already exported in the environment;
# an unconditional assignment would replace it with the placeholder below.
os.environ.setdefault("OPENAI_API_KEY", "api_key")

# Page header: author link followed by the application title.
# NOTE(review): the first title is an empty string and renders a blank
# heading -- confirm whether its text was lost or it can be removed.
st.title("")
st.markdown("[Engr. Hamesh Raj](https://www.linkedin.com/in/datascientisthameshraj/)")
st.title("PDF Query Chatbot")
# Load the pre-trained model and tokenizer.
# Streamlit re-executes this script on every widget interaction, so the
# loader is cached to avoid re-instantiating the model on each rerun.
@st.cache_resource
def load_model():
    """Load the DistilBERT tokenizer and encoder used for embeddings.

    Returns:
        tuple: ``(tokenizer, model)`` for the ``distilbert-base-uncased``
        checkpoint.
    """
    tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
    model = AutoModel.from_pretrained('distilbert-base-uncased')
    return tokenizer, model


tokenizer, model = load_model()
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: Whatever ``PyPDF2.PdfReader`` accepts; callers in this file
            pass an uploaded file object or an in-memory ``BytesIO``.

    Returns:
        str: The extracted text of all pages joined without a separator.
        Empty when no page yields any text.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # ``or ''`` is defensive: a page that yields no text (presumably an
    # image-only page -- verify against the installed PyPDF2 version) must
    # not break the concatenation.
    return ''.join(page.extract_text() or '' for page in reader.pages)
def chunkize_text(text, chunk_size=1000, chunk_overlap=200):
    """Split ``text`` into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        text: The text to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The result of the splitter's ``split_text(text)`` call.
    """
    splitter_options = {
        'chunk_size': chunk_size,
        'chunk_overlap': chunk_overlap,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)
def get_embeddings(texts, batch_size=32):
    """Embed texts with the module-level ``tokenizer`` and ``model``.

    Each text is represented by the mean of its final-layer token vectors.
    Padding positions are excluded from the mean, so the embedding of a text
    does not depend on which other texts share its batch.

    Args:
        texts: A string or a sequence of strings to embed.
        batch_size: Maximum number of texts per forward pass; bounds peak
            memory when a document produces many chunks.

    Returns:
        torch.Tensor: One row per input text. For an empty sequence the
        tensor has zero rows and ``model.config.hidden_size`` columns.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        # The tokenizer cannot encode an empty batch; return an empty result.
        return torch.empty((0, model.config.hidden_size))

    pooled_batches = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            outputs = model(**inputs)
        hidden = outputs.last_hidden_state
        # Zero out padding positions before averaging over the token axis.
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)
        pooled_batches.append((hidden * mask).sum(dim=1) / token_counts)
    return torch.cat(pooled_batches, dim=0)
# Sidebar controls: a PDF can be supplied by link, by upload, or both.
with st.sidebar:
    st.title("Load PDF")
    pdf_url = st.text_input("Paste PDF link here:")
    uploaded_files = st.file_uploader(
        "Or upload PDF file(s)",
        type="pdf",
        accept_multiple_files=True,
    )
    submit_button = st.button("Submit")
# Processed PDFs live in ``st.session_state`` because Streamlit re-executes
# this script on every widget interaction; a plain module-level dict would be
# empty again as soon as the user types a query.
if "pdf_chunks_embeddings" not in st.session_state:
    st.session_state["pdf_chunks_embeddings"] = {}
pdf_chunks_embeddings = st.session_state["pdf_chunks_embeddings"]


def _process_pdf(source_name, pdf_file):
    """Extract, chunk and embed one PDF and store it under ``source_name``.

    Returns:
        bool: True when the document was stored, False when it contained no
        extractable text and was skipped.
    """
    text = extract_text_from_pdf(pdf_file)
    chunks = chunkize_text(text)
    if not chunks:
        st.warning(f"No extractable text found in `{source_name}`; skipping.")
        return False
    pdf_chunks_embeddings[source_name] = {
        'chunks': chunks,
        'embeddings': get_embeddings(chunks),
    }
    return True


if submit_button:
    if pdf_url:
        # Only fetch web URLs; other schemes (e.g. file://) would let the
        # input box read files from the machine running the app.
        if not pdf_url.lower().startswith(("http://", "https://")):
            st.error("Error loading PDF from URL: only http(s) links are supported.")
        else:
            try:
                # urlopen raises for HTTP error statuses; the timeout keeps
                # an unresponsive host from hanging the app.
                with urllib.request.urlopen(pdf_url, timeout=30) as response:
                    pdf_file = BytesIO(response.read())
            except (OSError, ValueError, http.client.HTTPException) as e:
                st.error(f"Error loading PDF from URL: {e}")
            else:
                st.write(f"Processing document from URL: {pdf_url}")
                if _process_pdf(pdf_url, pdf_file):
                    st.success("PDF processed successfully!")
    if uploaded_files:
        stored_any = False
        for uploaded_file in uploaded_files:
            pdf_name = uploaded_file.name
            st.write(f"Processing `{pdf_name}`...")
            # Call first, then combine, so every file is processed.
            stored_any = _process_pdf(pdf_name, uploaded_file) or stored_any
        if stored_any:
            st.success("PDF(s) processed successfully!")
# Chatbot section for querying the PDF content.
st.write("### PDF Query Chatbot")

# Number of best-matching chunks per document that are sent to the LLM.
TOP_K_CHUNKS = 3

if pdf_chunks_embeddings:
    query = st.text_input("Enter your query here:")
    if query:
        llm = OpenAI()
        query_embedding = get_embeddings([query])
        # Answer the query once per processed PDF, using only that PDF's
        # most relevant chunks as context.
        for pdf_name, data in pdf_chunks_embeddings.items():
            chunks = data['chunks']
            if not chunks:
                continue
            # Rank the chunks by cosine similarity to the query embedding.
            scores = torch.nn.functional.cosine_similarity(
                query_embedding, data['embeddings'], dim=1
            )
            top_k = min(TOP_K_CHUNKS, len(chunks))
            # Sort the selected indices so the context keeps document order.
            best = sorted(torch.topk(scores, k=top_k).indices.tolist())
            context = "\n\n".join(chunks[i] for i in best)
            prompt = (
                "Answer the question using only the context below. "
                "If the answer is not in the context, say so.\n\n"
                f"Context:\n{context}\n\n"
                f"Question: {query}"
            )
            # A fresh chain per document keeps one PDF's exchange out of the
            # conversation history used for the next PDF.
            response = ConversationChain(llm=llm).run(prompt)
            st.write(f"**Response from `{pdf_name}`:**\n{response}\n{'-'*50}")
else:
    st.write("No PDFs processed yet. Please submit a PDF to get started.")