# app.py - DocBot, a Streamlit + LangChain document Q&A app
import os

import streamlit as st

from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import Docx2txtLoader, PyPDFLoader, TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

# Needed only for the optional local Hugging Face LLM path (commented out below):
# import torch
# from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
# from langchain.llms import HuggingFacePipeline
# Create a directory for documents if it doesn't exist
if not os.path.exists("docs"):
    os.makedirs("docs")
# Define a function to load documents from the "docs" directory.
# Note: Docx2txtLoader handles .docx; legacy binary .doc files may fail to parse.
def load_documents():
    documents = []
    for file in os.listdir("docs"):
        path = os.path.join("docs", file)
        if file.endswith(".pdf"):
            loader = PyPDFLoader(path)
            documents.extend(loader.load())
        elif file.endswith('.docx') or file.endswith('.doc'):
            loader = Docx2txtLoader(path)
            documents.extend(loader.load())
        elif file.endswith('.txt'):
            loader = TextLoader(path)
            documents.extend(loader.load())
    return documents
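# Each loader returns a list of LangChain Document objects (PyPDFLoader yields
# one Document per page), so load_documents() returns one flat list across files.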
# Load documents already present in docs/ (the index is built once at startup)
documents = load_documents()
# Split documents into chunks
document_splitter = CharacterTextSplitter(separator='\n', chunk_size=500, chunk_overlap=100)
document_chunks = document_splitter.split_documents(documents)
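# With chunk_size=500 and chunk_overlap=100, neighboring chunks share roughly
# 100 characters, so passages that straddle a chunk boundary stay retrievable.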
# Initialize embeddings
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
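# all-MiniLM-L6-v2 is a compact sentence-transformers model that embeds text
# into 384-dimensional vectors and runs locally on CPU, so no API key is needed
# for the embedding step.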
# Read the OpenAI API key from the environment rather than hardcoding it
if "OPENAI_API_KEY" not in os.environ:
    st.error("Set the OPENAI_API_KEY environment variable before running the app.")
    st.stop()
# Initialize Chroma as the vector database
vectordb = Chroma.from_documents(document_chunks, embedding=embeddings, persist_directory='./data')
vectordb.persist()
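# On a later run the persisted index could be reloaded without re-embedding:
# vectordb = Chroma(persist_directory='./data', embedding_function=embeddings)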
# Optional: serve answers from a local Hugging Face model instead of OpenAI.
# The tokenizer must come from the same checkpoint as the model, and scripts
# should authenticate with huggingface_hub.login() or the HF_TOKEN environment
# variable (notebook_login() only works inside notebooks).
# tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b")
# model = AutoModelForCausalLM.from_pretrained("google/gemma-7b", torch_dtype=torch.float16, device_map="auto")
# pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512, top_k=30)
# llm = HuggingFacePipeline(pipeline=pipe)

# Initialize the chat model used to answer questions
llm = ChatOpenAI(temperature=0.7, model_name='gpt-3.5-turbo')
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
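# memory_key='chat_history' matches the input key ConversationalRetrievalChain
# expects; return_messages=True stores turns as message objects, which chat
# models handle better than one concatenated string.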
pdf_qa = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=vectordb.as_retriever(search_kwargs={'k': 6}),
    verbose=False,
    memory=memory,
)
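# The retriever returns the k=6 most similar chunks; the chain first condenses
# the new question plus chat history into a standalone query before searching.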
# Streamlit app
st.title('DocBot - Your Document Query Assistant')
st.write('Upload your documents to get started.')
uploaded_files = st.file_uploader("Upload Files", type=['pdf', 'docx', 'doc', 'txt'], accept_multiple_files=True)
if uploaded_files:
    st.write("Uploaded Files:")
    for file in uploaded_files:
        st.write(file.name)
        with open(os.path.join("docs", file.name), "wb") as f:
            f.write(file.getbuffer())
    # Note: the vector index is built at startup, so files uploaded here are
    # only searchable after the app restarts and re-indexes docs/.
    st.write("Files uploaded successfully. You can start asking questions now.")
# Streamlit re-runs the script on every interaction, so no loop is needed;
# wrapping a widget in `while True` would recreate it on each pass and raise
# a DuplicateWidgetID error.
query = st.text_input("Ask a question:")
if query:
    result = pdf_qa({"question": query})
    st.write("Answer: " + result["answer"])