Abijith committed on
Commit 8dadd45 · verified · 1 Parent(s): 12a965b

Update compare-financial-report.py

Files changed (1):
  1. compare-financial-report.py  +188 -167
compare-financial-report.py CHANGED
@@ -1,168 +1,189 @@
- import streamlit as st
- import numpy as np
- import re
- import tempfile
- from datetime import datetime
- from langchain_community.document_loaders import PDFPlumberLoader
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain_community.embeddings import HuggingFaceEmbeddings
- from langchain_community.vectorstores import FAISS
- from langchain_community.llms import Ollama
- from langchain.retrievers import BM25Retriever, EnsembleRetriever
- from sentence_transformers import CrossEncoder
- from transformers import pipeline
- from langchain_core.prompts import PromptTemplate
- from langchain.chains import LLMChain
-
- # Initialize classifier once for input guardrail
- classifier = pipeline("zero-shot-classification",
-                       model="typeform/distilbert-base-uncased-mnli")
-
- # Streamlit UI Configuration
- st.set_page_config(page_title="Multi-File Financial Analyzer", layout="wide")
- st.title("📊 Comparative Financial Analysis System")
-
- # Sidebar Controls
- with st.sidebar:
-     st.header("Configuration Panel")
-     model_choice = st.selectbox("LLM Model",
-                                 ["deepseek-r1:1.5b", "llama3.2:1b"],
-                                 help="Choose the core analysis engine")
-     chunk_size = st.slider("Document Chunk Size", 500, 2000, 1000)
-     rerank_threshold = st.slider("Re-ranking Threshold", 0.0, 1.0, 0.5)
-
- # File Upload Handling for multiple files
- uploaded_files = st.file_uploader("Upload 2 Financial PDFs",
-                                   type="pdf",
-                                   accept_multiple_files=True)
-
- if len(uploaded_files) == 2:
-     all_docs = []
-     with st.spinner("Processing Multiple Financial Documents..."):
-         for uploaded_file in uploaded_files:
-             # Create temporary file for each PDF
-             with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
-                 tmp.write(uploaded_file.getvalue())
-                 tmp_path = tmp.name
-
-             # Load and process each document
-             loader = PDFPlumberLoader(tmp_path)
-             docs = loader.load()
-             all_docs.extend(docs)
-
-     # Combined Document Processing
-     text_splitter = RecursiveCharacterTextSplitter(
-         chunk_size=chunk_size,
-         chunk_overlap=200,
-         separators=["\n\n", "\n", "\. ", "! ", "? ", " ", ""]
-     )
-     documents = text_splitter.split_documents(all_docs)
-
-     # Hybrid Retrieval Setup for combined documents
-     embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
-     vector_store = FAISS.from_documents(documents, embedder)
-     bm25_retriever = BM25Retriever.from_documents(documents)
-     bm25_retriever.k = 5
-     faiss_retriever = vector_store.as_retriever(search_kwargs={"k": 5})
-     ensemble_retriever = EnsembleRetriever(
-         retrievers=[bm25_retriever, faiss_retriever],
-         weights=[0.4, 0.6]
-     )
-
-     # Re-ranking Model
-     cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
-
-     # Financial Analysis LLM Configuration
-     llm = Ollama(model=model_choice)
-     PROMPT_TEMPLATE = """
-     As a senior financial analyst, analyze the following context from multiple financial reports:
-     1. Compare key metrics between both documents
-     2. Identify trends across reporting periods
-     3. Highlight significant differences or similarities
-     4. Provide integrated risk assessment
-     5. Offer comprehensive recommendations
-
-     Context: {context}
-     Question: {question}
-
-     Format with clear section headers and bullet points.
-     Maintain comparative analysis throughout.
-     Keep under 300 words.
-     """
-     qa_prompt = PromptTemplate(
-         template=PROMPT_TEMPLATE,
-         input_variables=["context", "question"]
-     )
-     llm_chain = LLMChain(llm=llm, prompt=qa_prompt)  # Proper LLMChain initialization
-
-     # Interactive Q&A Interface
-     st.header("🔍 Cross-Document Financial Inquiry")
-
-     # Suggested Comparative Questions
-     comparative_questions = [
-         "Compare revenue growth between both fiscal years",
-         "Analyze changes in debt structure across both reports",
-         "Show expense ratio differences between the two years",
-         "What are the main liquidity changes across both periods?",
-         "How does net profit margin compare between the two reports?"
-     ]
-     user_query = st.selectbox("Sample Comparative Questions",
-                               [""] + comparative_questions)
-     user_input = st.text_input("Or enter custom comparative query:",
-                                value=user_query)
-
-     if user_input:
-         # Input Validation Guardrail
-         classification = classifier(user_input,
-                                     ["financial comparison", "other"],
-                                     multi_label=False)
-         if classification['scores'][0] < 0.2:
-             st.error("Query not comparative/financial. Ask about financial comparisons between documents.")
-             st.stop()
-
-         with st.spinner("Performing Cross-Document Analysis..."):
-             # Hybrid Document Retrieval
-             initial_docs = ensemble_retriever.get_relevant_documents(user_input)
-
-             # Context Re-ranking
-             doc_pairs = [(user_input, doc.page_content) for doc in initial_docs]
-             rerank_scores = cross_encoder.predict(doc_pairs)
-             sorted_indices = np.argsort(rerank_scores)[::-1]
-             ranked_docs = [initial_docs[i] for i in sorted_indices]
-             filtered_docs = [d for d, s in zip(ranked_docs, rerank_scores)
-                              if s > rerank_threshold][:7]
-
-             # Confidence Calculation
-             confidence_score = np.mean(rerank_scores[sorted_indices][:3]) * 100
-             confidence_score = min(100, max(0, round(confidence_score, 1)))
-
-             # Response Generation
-             context = "\n".join([doc.page_content for doc in filtered_docs])
-             analysis = llm_chain.run(
-                 context=context,
-                 question=user_input
-             )
-
-             # Response Cleaning
-             clean_analysis = re.sub(r"<think>|</think>|\n{3,}", "", analysis)
-             clean_analysis = re.sub(r'(\d)([A-Za-z])', r'\1 \2', clean_analysis)
-             clean_analysis = re.sub(r'(\d{1,3})(\d{3})', r'\1,\2', clean_analysis)
-
-             # Results Display
-             st.subheader("Integrated Financial Analysis")
-             st.markdown(f"```\n{clean_analysis}\n```")
-             st.progress(int(confidence_score)/100)
-             st.caption(f"Analysis Confidence: {confidence_score}%")
-
-         # Export Functionality
-         if st.button("Generate Comparative Report"):
-             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-             export_content = f"COMPARATIVE QUERY: {user_input}\n\nANALYSIS:\n{clean_analysis}"
-             st.download_button("Download Full Report", export_content,
-                                file_name=f"Comparative_Analysis_{timestamp}.txt",
-                                mime="text/plain")
-
- elif len(uploaded_files) > 0:
-     st.warning("Please upload exactly 2 financial documents for comparative analysis")
- else:
      st.info("Please upload 2 PDF financial reports to begin comparative analysis")
 
+ import streamlit as st
+ import numpy as np
+ import re
+ import tempfile
+ from datetime import datetime
+ from langchain_community.document_loaders import PDFPlumberLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain_community.vectorstores import FAISS
+ from langchain_community.llms import Ollama
+ from langchain.retrievers import BM25Retriever, EnsembleRetriever
+ from sentence_transformers import CrossEncoder
+ from transformers import pipeline
+ from langchain_core.prompts import PromptTemplate
+ from langchain.chains import LLMChain
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+ from langchain.llms.huggingface_pipeline import HuggingFacePipeline
+ from huggingface_hub import login
+
+ # Log in with your token (optional if already logged in via CLI)
+ # login(token=HF_API_TOKEN)
+
+ # Load the model and tokenizer
+ model_name = "meta-llama/Llama-3.2-1B-Instruct"
+ model_name = "HuggingFaceTB/SmolLM2-360M-Instruct"
+
+ # Initialize classifier once for input guardrail
+ classifier = pipeline("zero-shot-classification",
+                       model="typeform/distilbert-base-uncased-mnli")
+
+ # Streamlit UI Configuration
+ st.set_page_config(page_title="Multi-File Financial Analyzer", layout="wide")
+ st.title("📊 Comparative Financial Analysis System")
+
+ # Sidebar Controls
+ with st.sidebar:
+     st.header("Configuration Panel")
+     model_choice = st.selectbox("LLM Model",
+                                 ["deepseek-r1:1.5b", "llama3.2:1b"],
+                                 help="Choose the core analysis engine")
+     chunk_size = st.slider("Document Chunk Size", 500, 2000, 1000)
+     rerank_threshold = st.slider("Re-ranking Threshold", 0.0, 1.0, 0.5)
+
+ # File Upload Handling for multiple files
+ uploaded_files = st.file_uploader("Upload 2 Financial PDFs",
+                                   type="pdf",
+                                   accept_multiple_files=True)
+
+ if len(uploaded_files) == 2:
+     all_docs = []
+     with st.spinner("Processing Multiple Financial Documents..."):
+         for uploaded_file in uploaded_files:
+             # Create temporary file for each PDF
+             with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
+                 tmp.write(uploaded_file.getvalue())
+                 tmp_path = tmp.name
+
+             # Load and process each document
+             loader = PDFPlumberLoader(tmp_path)
+             docs = loader.load()
+             all_docs.extend(docs)
+
+     # Combined Document Processing
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=chunk_size,
+         chunk_overlap=200,
+         separators=["\n\n", "\n", "\. ", "! ", "? ", " ", ""]
+     )
+     documents = text_splitter.split_documents(all_docs)
+
+     # Hybrid Retrieval Setup for combined documents
+     embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+     vector_store = FAISS.from_documents(documents, embedder)
+     bm25_retriever = BM25Retriever.from_documents(documents)
+     bm25_retriever.k = 5
+     faiss_retriever = vector_store.as_retriever(search_kwargs={"k": 5})
+     ensemble_retriever = EnsembleRetriever(
+         retrievers=[bm25_retriever, faiss_retriever],
+         weights=[0.4, 0.6]
+     )
+
+     # Re-ranking Model
+     cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
+
+     # Financial Analysis LLM Configuration
+     # llm = Ollama(model=model_choice)
+
+     ##
+     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+     model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
+
+     # Create a local pipeline
+     pipeline_llm = pipeline("text-generation", model=model, tokenizer=tokenizer)
+     # Wrap the local pipeline with Langchain
+     llm = HuggingFacePipeline(pipeline=pipeline_llm)
+     #
+
+     PROMPT_TEMPLATE = """
+     As a senior financial analyst, analyze the following context from multiple financial reports:
+     1. Compare key metrics between both documents
+     2. Identify trends across reporting periods
+     3. Highlight significant differences or similarities
+     4. Provide integrated risk assessment
+     5. Offer comprehensive recommendations
+
+     Context: {context}
+     Question: {question}
+
+     Format with clear section headers and bullet points.
+     Maintain comparative analysis throughout.
+     Keep under 300 words.
+     """
+     qa_prompt = PromptTemplate(
+         template=PROMPT_TEMPLATE,
+         input_variables=["context", "question"]
+     )
+     llm_chain = LLMChain(llm=llm, prompt=qa_prompt)  # Proper LLMChain initialization
+
+     # Interactive Q&A Interface
+     st.header("🔍 Cross-Document Financial Inquiry")
+
+     # Suggested Comparative Questions
+     comparative_questions = [
+         "Compare revenue growth between both fiscal years",
+         "Analyze changes in debt structure across both reports",
+         "Show expense ratio differences between the two years",
+         "What are the main liquidity changes across both periods?",
+         "How does net profit margin compare between the two reports?"
+     ]
+     user_query = st.selectbox("Sample Comparative Questions",
+                               [""] + comparative_questions)
+     user_input = st.text_input("Or enter custom comparative query:",
+                                value=user_query)
+
+     if user_input:
+         # Input Validation Guardrail
+         classification = classifier(user_input,
+                                     ["financial comparison", "other"],
+                                     multi_label=False)
+         if classification['scores'][0] < 0.2:
+             st.error("Query not comparative/financial. Ask about financial comparisons between documents.")
+             st.stop()
+
+         with st.spinner("Performing Cross-Document Analysis..."):
+             # Hybrid Document Retrieval
+             initial_docs = ensemble_retriever.get_relevant_documents(user_input)
+
+             # Context Re-ranking
+             doc_pairs = [(user_input, doc.page_content) for doc in initial_docs]
+             rerank_scores = cross_encoder.predict(doc_pairs)
+             sorted_indices = np.argsort(rerank_scores)[::-1]
+             ranked_docs = [initial_docs[i] for i in sorted_indices]
+             filtered_docs = [d for d, s in zip(ranked_docs, rerank_scores)
+                              if s > rerank_threshold][:7]
+
+             # Confidence Calculation
+             confidence_score = np.mean(rerank_scores[sorted_indices][:3]) * 100
+             confidence_score = min(100, max(0, round(confidence_score, 1)))
+
+             # Response Generation
+             context = "\n".join([doc.page_content for doc in filtered_docs])
+             analysis = llm_chain.run(
+                 context=context,
+                 question=user_input
+             )
+
+             # Response Cleaning
+             clean_analysis = re.sub(r"<think>|</think>|\n{3,}", "", analysis)
+             clean_analysis = re.sub(r'(\d)([A-Za-z])', r'\1 \2', clean_analysis)
+             clean_analysis = re.sub(r'(\d{1,3})(\d{3})', r'\1,\2', clean_analysis)
+
+             # Results Display
+             st.subheader("Integrated Financial Analysis")
+             st.markdown(f"```\n{clean_analysis}\n```")
+             st.progress(int(confidence_score)/100)
+             st.caption(f"Analysis Confidence: {confidence_score}%")
+
+         # Export Functionality
+         if st.button("Generate Comparative Report"):
+             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+             export_content = f"COMPARATIVE QUERY: {user_input}\n\nANALYSIS:\n{clean_analysis}"
+             st.download_button("Download Full Report", export_content,
+                                file_name=f"Comparative_Analysis_{timestamp}.txt",
+                                mime="text/plain")
+
+ elif len(uploaded_files) > 0:
+     st.warning("Please upload exactly 2 financial documents for comparative analysis")
+ else:
      st.info("Please upload 2 PDF financial reports to begin comparative analysis")