import os
from io import BytesIO

import requests
import streamlit as st
import PyPDF2
import torch
from transformers import AutoTokenizer, AutoModel
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI

# Set the OpenAI API key directly (or ensure it's set in the environment)
os.environ["OPENAI_API_KEY"] = "api_key"

# Set up the author link and page title
st.markdown("[Engr. Hamesh Raj](https://www.linkedin.com/in/datascientisthameshraj/)")
st.title("PDF Query Chatbot")

# Load the pre-trained model and tokenizer once, cached across Streamlit reruns
@st.cache_resource
def load_model():
    tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
    model = AutoModel.from_pretrained('distilbert-base-uncased')
    return tokenizer, model

tokenizer, model = load_model()

def extract_text_from_pdf(pdf_file):
    reader = PyPDF2.PdfReader(pdf_file)
    text = ''
    for page in reader.pages:
        # extract_text() may return None for pages with no extractable text
        text += page.extract_text() or ''
    return text

def chunkize_text(text, chunk_size=1000, chunk_overlap=200):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    return text_splitter.split_text(text)

def get_embeddings(texts):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean-pool the token embeddings into one fixed-size vector per text
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

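# The chatbot section below needs a way to pick the chunks most relevant to a
# query. `retrieve_top_chunks` is a hypothetical helper sketched here for that
# purpose (our own name, not a library API): it ranks chunks by cosine
# similarity between the query embedding and the chunk embeddings.
def retrieve_top_chunks(query, chunks, embeddings, k=3):
    query_embedding = get_embeddings([query])
    # Broadcast the (1, hidden) query vector against the (n, hidden) chunk matrix
    scores = torch.nn.functional.cosine_similarity(query_embedding, embeddings)
    top_indices = scores.topk(min(k, len(chunks))).indices
    return [chunks[int(i)] for i in top_indices]
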
# Sidebar for file upload and link input
st.sidebar.title("Load PDF")
pdf_url = st.sidebar.text_input("Paste PDF link here:")
uploaded_files = st.sidebar.file_uploader("Or upload PDF file(s)", type="pdf", accept_multiple_files=True)
submit_button = st.sidebar.button("Submit")

# Keep processed PDFs in session state so they survive Streamlit reruns
# (a plain dict would be reset on every interaction, e.g. when a query is typed)
if 'pdf_chunks_embeddings' not in st.session_state:
    st.session_state.pdf_chunks_embeddings = {}
pdf_chunks_embeddings = st.session_state.pdf_chunks_embeddings

if submit_button:
    if pdf_url:
        try:
            response = requests.get(pdf_url)
            response.raise_for_status()
            pdf_file = BytesIO(response.content)
            st.write(f"Processing document from URL: {pdf_url}")
            text = extract_text_from_pdf(pdf_file)
            chunks = chunkize_text(text)
            embeddings = get_embeddings(chunks)
            pdf_chunks_embeddings[pdf_url] = {'chunks': chunks, 'embeddings': embeddings}
            st.success("PDF processed successfully!")
        except requests.exceptions.RequestException as e:
            st.error(f"Error loading PDF from URL: {e}")
    if uploaded_files:
        for uploaded_file in uploaded_files:
            pdf_name = uploaded_file.name
            st.write(f"Processing `{pdf_name}`...")
            text = extract_text_from_pdf(uploaded_file)
            chunks = chunkize_text(text)
            embeddings = get_embeddings(chunks)
            pdf_chunks_embeddings[pdf_name] = {'chunks': chunks, 'embeddings': embeddings}
        st.success("PDF(s) processed successfully!")

# Chatbot section for querying the PDF content
st.write("### PDF Query Chatbot")

if pdf_chunks_embeddings:
    # ConversationChain accepts no embedding_model argument and has no
    # add_documents method, so instead we retrieve the most relevant chunks
    # ourselves and prompt the LLM with them directly.
    llm = OpenAI()
    query = st.text_input("Enter your query here:")
    if query:
        # Answer the query separately against each processed PDF
        for pdf_name, data in pdf_chunks_embeddings.items():
            top_chunks = retrieve_top_chunks(query, data['chunks'], data['embeddings'])
            context = "\n\n".join(top_chunks)
            prompt = f"Answer the question using only the context below.\n\nContext:\n{context}\n\nQuestion: {query}"
            response = llm(prompt)
            st.write(f"**Response from `{pdf_name}`:**\n{response}\n{'-'*50}")
else:
    st.write("No PDFs processed yet. Please submit a PDF to get started.")