import re
import tempfile
from datetime import datetime

import numpy as np
import streamlit as st
from langchain_community.document_loaders import PDFPlumberLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.llms import Ollama, HuggingFacePipeline
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from sentence_transformers import CrossEncoder
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain
from huggingface_hub import login

# Log in with your token (optional if already logged in via CLI)
# login(token=HF_API_TOKEN)

# Local Hugging Face model used for generation
# model_name = "meta-llama/Llama-3.2-1B-Instruct"  # gated model; requires HF access
model_name = "HuggingFaceTB/SmolLM2-360M-Instruct"

# Initialize classifier once for the input guardrail
classifier = pipeline("zero-shot-classification", model="typeform/distilbert-base-uncased-mnli")

# Streamlit UI Configuration
st.set_page_config(page_title="Multi-File Financial Analyzer", layout="wide")
st.title("📊 Comparative Financial Analysis System")

# Sidebar Controls
with st.sidebar:
    st.header("Configuration Panel")
    model_choice = st.selectbox("LLM Model", ["deepseek-r1:1.5b", "llama3.2:1b"],
                                help="Choose the core analysis engine (used with the Ollama backend)")
    chunk_size = st.slider("Document Chunk Size", 500, 2000, 1000)
    rerank_threshold = st.slider("Re-ranking Threshold", 0.0, 1.0, 0.5)

# File Upload Handling for multiple files
uploaded_files = st.file_uploader("Upload 2 Financial PDFs", type="pdf", accept_multiple_files=True)

if len(uploaded_files) == 2:
    all_docs = []
    with st.spinner("Processing Multiple Financial Documents..."):
        for uploaded_file in uploaded_files:
            # Create a temporary file for each PDF
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
                tmp.write(uploaded_file.getvalue())
                tmp_path = tmp.name

            # Load and process each document
            loader = PDFPlumberLoader(tmp_path)
            docs = loader.load()
            all_docs.extend(docs)

        # Combined Document Processing
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=200,
            separators=["\n\n", "\n", ". ", "! ", "? ", " ", ""]
        )
        documents = text_splitter.split_documents(all_docs)

        # Hybrid Retrieval Setup for the combined documents
        embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        vector_store = FAISS.from_documents(documents, embedder)

        bm25_retriever = BM25Retriever.from_documents(documents)
        bm25_retriever.k = 5
        faiss_retriever = vector_store.as_retriever(search_kwargs={"k": 5})

        ensemble_retriever = EnsembleRetriever(
            retrievers=[bm25_retriever, faiss_retriever],
            weights=[0.4, 0.6]
        )

        # Re-ranking Model
        cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

        # Financial Analysis LLM Configuration
        # Alternative backend: llm = Ollama(model=model_choice)
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

        # Create a local text-generation pipeline
        # (max_new_tokens keeps answers from being cut off at the pipeline's short default length)
        pipeline_llm = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512)

        # Wrap the local pipeline with LangChain
        llm = HuggingFacePipeline(pipeline=pipeline_llm)

    PROMPT_TEMPLATE = """
    As a senior financial analyst, analyze the following context from multiple financial reports:
    1. Compare key metrics between both documents
    2. Identify trends across reporting periods
    3. Highlight significant differences or similarities
    4. Provide integrated risk assessment
    5. Offer comprehensive recommendations

    Context:
    {context}

    Question: {question}

    Format with clear section headers and bullet points. Maintain comparative analysis throughout. Keep under 300 words.
    """

    qa_prompt = PromptTemplate(
        template=PROMPT_TEMPLATE,
        input_variables=["context", "question"]
    )
    llm_chain = LLMChain(llm=llm, prompt=qa_prompt)  # Proper LLMChain initialization

    # Interactive Q&A Interface
    st.header("🔍 Cross-Document Financial Inquiry")

    # Suggested Comparative Questions
    comparative_questions = [
        "Compare revenue growth between both fiscal years",
        "Analyze changes in debt structure across both reports",
        "Show expense ratio differences between the two years",
        "What are the main liquidity changes across both periods?",
        "How does net profit margin compare between the two reports?"
    ]

    user_query = st.selectbox("Sample Comparative Questions", [""] + comparative_questions)
    user_input = st.text_input("Or enter custom comparative query:", value=user_query)

    if user_input:
        # Input Validation Guardrail: reject queries whose top zero-shot label is not financial
        classification = classifier(user_input, ["financial comparison", "other"], multi_label=False)
        if classification["labels"][0] != "financial comparison":
            st.error("Query not comparative/financial. Ask about financial comparisons between documents.")
            st.stop()

        with st.spinner("Performing Cross-Document Analysis..."):
            # Hybrid Document Retrieval
            initial_docs = ensemble_retriever.get_relevant_documents(user_input)

            # Context Re-ranking: score (query, chunk) pairs with the cross-encoder
            doc_pairs = [(user_input, doc.page_content) for doc in initial_docs]
            rerank_scores = cross_encoder.predict(doc_pairs)
            sorted_indices = np.argsort(rerank_scores)[::-1]
            sorted_scores = rerank_scores[sorted_indices]
            ranked_docs = [initial_docs[i] for i in sorted_indices]
            filtered_docs = [d for d, s in zip(ranked_docs, sorted_scores) if s > rerank_threshold][:7]

            # Confidence Calculation: mean of the top-3 re-ranking scores, clipped to [0, 100]
            confidence_score = np.mean(sorted_scores[:3]) * 100
            confidence_score = min(100, max(0, round(confidence_score, 1)))

            # Response Generation
            context = "\n".join([doc.page_content for doc in filtered_docs])
            analysis = llm_chain.run(
                context=context,
                question=user_input
            )

            # Response Cleaning
            clean_analysis = re.sub(r"\|\||\n{3,}", "", analysis)                   # drop stray "||" and runs of blank lines
            clean_analysis = re.sub(r'(\d)([A-Za-z])', r'\1 \2', clean_analysis)     # space between digits and letters
            clean_analysis = re.sub(r'(\d{1,3})(\d{3})', r'\1,\2', clean_analysis)   # insert thousands separators

            # Results Display
            st.subheader("Integrated Financial Analysis")
            st.markdown(f"```\n{clean_analysis}\n```")
            st.progress(int(confidence_score) / 100)
            st.caption(f"Analysis Confidence: {confidence_score}%")

            # Export Functionality
            if st.button("Generate Comparative Report"):
                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                export_content = f"COMPARATIVE QUERY: {user_input}\n\nANALYSIS:\n{clean_analysis}"
                st.download_button("Download Full Report", export_content,
                                   file_name=f"Comparative_Analysis_{timestamp}.txt",
                                   mime="text/plain")

elif len(uploaded_files) > 0:
    st.warning("Please upload exactly 2 financial documents for comparative analysis")
else:
    st.info("Please upload 2 PDF financial reports to begin comparative analysis")
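# ---------------------------------------------------------------------------
# Usage note (the filename "app.py" below is an assumption, not part of the
# original script): launch the UI with
#   streamlit run app.py
# Typical dependencies: streamlit, numpy, langchain, langchain-community,
# transformers, sentence-transformers, faiss-cpu, pdfplumber, rank_bm25
# ---------------------------------------------------------------------------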