import re
import tempfile
from difflib import unified_diff

import fitz  # PyMuPDF
import streamlit as st
from docx import Document
from docx.shared import RGBColor

from langchain_pipeline import pipeline, model_names


def pdf_to_text_with_layout(pdf_file):
    """Extract the plain text of every page of a PDF.

    Args:
        pdf_file: Binary file-like object containing the PDF; it is read
            from its current position to the end.

    Returns:
        The text of all pages in page order, joined with newline characters.
    """
    # Remember where the stream was so it can be handed to another consumer
    # afterwards; without this the caller is left holding a stream at EOF.
    try:
        start = pdf_file.tell()
    except (AttributeError, OSError):
        start = None  # non-seekable source: nothing to restore

    pdf_bytes = pdf_file.read()
    if start is not None:
        pdf_file.seek(start)

    # The context manager closes the PyMuPDF document even if extraction fails.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        return "\n".join(page.get_text("text") for page in doc)


def clean_text(text):
    """Return *text* without non-printable or non-BMP characters.

    A character is kept only if ``str.isprintable()`` is true for it (which
    rejects control characters, including tab and newline) and its code point
    is below U+10000. Printable non-ASCII characters are therefore preserved.
    """
    # NOTE(review): presumably this filtering exists so python-docx can
    # serialise the text to XML -- confirm before relaxing it.
    return ''.join(c for c in text if c.isprintable() and ord(c) < 65536)


def text_to_word_with_formatting(text, word_path):
    """Write *text* to a .docx file, one paragraph per input line.

    Args:
        text: String to export; it is split on "\\n" and each piece is passed
            through ``clean_text`` before being added.
        word_path: Destination path handed to ``Document.save``.
    """
    doc = Document()
    for line in text.split("\n"):
        doc.add_paragraph(clean_text(line))
    doc.save(word_path)


def apply_pipeline(file, model_name, balance_type, apsn_transactions,
                   max_fees_per_day, min_overdrawn_fee,
                   min_transaction_overdraft):
    """Forward all arguments, unchanged and in order, to ``pipeline``.

    Returns:
        Whatever ``langchain_pipeline.pipeline`` returns.
    """
    return pipeline(
        file,
        model_name,
        balance_type,
        apsn_transactions,
        max_fees_per_day,
        min_overdrawn_fee,
        min_transaction_overdraft,
    )


def redline_changes(original_path, revised_path, output_path):
    """Write a colour-coded unified diff of two .docx files to a new .docx.

    The paragraph texts of each document are joined with newlines, diffed
    with ``difflib.unified_diff``, and every diff line becomes one paragraph:
    lines starting with "-" are red, "+" green, "@@" blue, all others
    uncoloured.

    Args:
        original_path: Path of the baseline .docx.
        revised_path: Path of the modified .docx.
        output_path: Path the diff document is saved to.
    """
    original_text = "\n".join(p.text for p in Document(original_path).paragraphs)
    revised_text = "\n".join(p.text for p in Document(revised_path).paragraphs)

    # NOTE(review): unified_diff keeps only its default 3 lines of context, so
    # unchanged text far from any edit is absent from the output document.
    diff = unified_diff(
        original_text.splitlines(), revised_text.splitlines(), lineterm=''
    )

    # Checked in order; the "---"/"+++" file headers fall under "-"/"+".
    prefix_colors = (
        ('-', RGBColor(255, 0, 0)),   # Red
        ('+', RGBColor(0, 128, 0)),   # Green
        ('@@', RGBColor(0, 0, 255)),  # Blue
    )

    diff_doc = Document()
    for line in diff:
        color = next(
            (rgb for prefix, rgb in prefix_colors if line.startswith(prefix)),
            None,
        )
        if color is None:
            diff_doc.add_paragraph(line, style='Normal')
            continue
        run = diff_doc.add_paragraph(style='Normal').add_run(line)
        run.font.color.rgb = color
    diff_doc.save(output_path)


# Streamlit App
st.title("Canarie AI Prototype")
st.subheader("Finding the canarie in the coal mine")

# Policy parameters collected from the user and forwarded to the pipeline.
model_name = st.selectbox("Model", model_names())
balance_type = st.selectbox(
    "Do you charge on available balance or ledger balance?",
    ["available balance", "ledger balance"],
)
apsn_transactions = st.selectbox(
    "Do you charge for APSN transactions?", ["yes", "no"]
)
max_fees_per_day = st.number_input(
    "How many overdraft fees per day can be charged?",
    min_value=0,
    max_value=10,
)
min_overdrawn_fee = st.number_input(
    "What is the minimum amount overdrawn to incur a fee?",
    min_value=0,
    max_value=500,
)
min_transaction_overdraft = st.number_input(
    "What is the minimum transaction amount to trigger an overdraft?",
    min_value=0,
    max_value=500,
)

uploaded_file = st.file_uploader("Choose a file", type=["pdf"])


def _build_redlined_docx(pdf_file, *pipeline_args):
    """Run the extract -> pipeline -> diff flow and return the .docx bytes.

    Args:
        pdf_file: The uploaded PDF as a seekable binary file-like object.
        *pipeline_args: Extra positional arguments passed to
            ``apply_pipeline`` after the file.

    Returns:
        The content of the redlined .docx as ``bytes``.
    """
    import os  # local import: only this helper needs it

    # All intermediate files live in one directory that is removed on exit,
    # so repeated Streamlit reruns do not accumulate stray .docx files.
    with tempfile.TemporaryDirectory() as workdir:
        original_word_path = os.path.join(workdir, "original.docx")
        revised_word_path = os.path.join(workdir, "revised.docx")
        redlined_output_path = os.path.join(workdir, "redlined.docx")

        # Extract text with layout preservation
        extracted_text = pdf_to_text_with_layout(pdf_file)
        text_to_word_with_formatting(extracted_text, original_word_path)

        # Extraction consumed the stream; rewind so the pipeline does not
        # start reading at EOF.
        pdf_file.seek(0)
        # NOTE(review): the result is written via text_to_word_with_formatting,
        # which calls .split on it -- presumably a str; confirm with pipeline.
        revised_text = apply_pipeline(pdf_file, *pipeline_args)
        text_to_word_with_formatting(revised_text, revised_word_path)

        redline_changes(
            original_word_path, revised_word_path, redlined_output_path
        )

        # Read into memory so the payload outlives the temporary directory.
        with open(redlined_output_path, "rb") as f:
            return f.read()


if uploaded_file is not None:
    with st.spinner('Please wait ...'):
        # Top-level UI boundary: surface any failure in the page instead of
        # crashing the script run.
        try:
            redlined_bytes = _build_redlined_docx(
                uploaded_file,
                model_name,
                balance_type,
                apsn_transactions,
                max_fees_per_day,
                min_overdrawn_fee,
                min_transaction_overdraft,
            )
            st.download_button(
                label="Download Redlined Document",
                data=redlined_bytes,
                file_name="redlined_document.docx",
                mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            )
            st.success("Redlined document created successfully!")
        except Exception as e:
            st.exception(e)