import os
import streamlit as st
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI

# Streamlit app title
st.title("Question Answering with Legal Documents")

# Directory containing the PDFs
pdf_folder_path = "law"  # This is the folder where all PDFs are stored
# Load and process all PDFs from the folder, keeping per-page metadata
def load_pdfs_from_folder(folder_path):
    all_docs = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, filename)
            loader = PyPDFLoader(pdf_path)
            docs = loader.load()
            for doc in docs:
                # Record the originating filename so answers can cite it
                doc.metadata["source"] = filename
            all_docs.extend(docs)
    return all_docs
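
# PyPDFLoader yields one Document per page and stores a 0-based page
# index in doc.metadata["page"]; the citation display at the bottom of
# this app relies on that key.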
docs = load_pdfs_from_folder(pdf_folder_path)

# Split the documents into chunks
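# The underscore prefix on `_docs` follows the Streamlit convention for
# excluding an argument from cache hashing, so a cache decorator is
# assumed to belong here; st.cache_data (Streamlit >= 1.18) keeps the
# split from re-running on every widget interaction.
@st.cache_data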
def split_docs(_docs):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
    return text_splitter.split_documents(_docs)
splits = split_docs(docs)

# Load OpenAI embeddings
openai_api_key = st.secrets["openai_api_key"]
embedding = OpenAIEmbeddings(openai_api_key=openai_api_key)

# Vectorstore setup (Chroma)
persist_directory = 'docs/chroma/'
vectordb = Chroma.from_documents(documents=splits, embedding=embedding, persist_directory=persist_directory)
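# Rebuilding the index on every rerun is wasteful once it has been
# persisted; a sketch for reloading it instead (assumes the classic
# Chroma constructor with an embedding_function parameter):
# vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)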
# Custom PromptTemplate that constrains answers to the retrieved context
template = """You are a legal expert and must answer questions using only the provided legal documents.
You must use the context below to find the answer. Do not guess or provide any answer that is not based on the documents.
If you don't know the answer based on the documents, simply say, "The answer is not available in the provided documents."
Keep your answer under 100 words, and always say "Thanks for asking!" at the end.
{context}
Question: {question}
Helpful Answer (based on the documents, 100 words max):"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)
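# With the default "stuff" chain type, RetrievalQA pastes the retrieved
# chunks into {context} and the user's query into {question} when this
# prompt is rendered.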
# Retriever settings: MMR re-ranks fetched candidates for diversity,
# which only matters once k > 1; with k=1 only the single best chunk is returned
retriever = vectordb.as_retriever(search_type="mmr", search_kwargs={"k": 1})
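# A plain similarity search over a few more chunks is a common
# alternative worth comparing, e.g.:
# retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": 3})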
# Build the RetrievalQA chain with the custom prompt
qa_chain = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, openai_api_key=openai_api_key),
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)
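# return_source_documents=True makes the chain return multiple output
# keys, so it has to be invoked with a dict ({"query": ...}) rather
# than qa_chain.run(question).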

# Streamlit user input
question = st.text_input("Ask a question based on the legal documents:")
if st.button("Get Answer"):
    if question:
        with st.spinner('Generating answer...'):
            result = qa_chain({"query": question})
        st.write(result["result"])  # Display the concise answer

        # Display references to the source documents
        st.subheader("Referenced Source:")
        for doc in result["source_documents"]:
            doc_name = os.path.basename(doc.metadata["source"])
            page_number = doc.metadata.get('page', 'N/A')  # 0-based page index from PyPDFLoader; N/A if missing
            st.write(f"Referenced from {doc_name}, page {page_number}")
            # Optionally show the chunk content
            # st.write(doc.page_content)  # Uncomment to show the matched text
    else:
        st.error("Please ask a question.")