File size: 4,581 Bytes
3b5a8b1
c8891d8
68ac076
855b4b0
93abebf
56f5cbd
93abebf
507a7c5
3b5a8b1
42b2650
 
68ac076
 
 
 
 
 
507a7c5
28cc4a3
 
507a7c5
68ac076
 
 
507a7c5
 
855b4b0
 
 
 
 
 
 
 
 
 
 
 
 
56f5cbd
855b4b0
 
 
 
 
 
93abebf
855b4b0
 
93abebf
 
9fbcde0
 
 
93abebf
9fbcde0
 
 
93abebf
9fbcde0
 
 
b3b8d46
 
56f5cbd
 
855b4b0
 
3b5a8b1
 
 
855b4b0
3b5a8b1
855b4b0
3b5a8b1
855b4b0
3b5a8b1
855b4b0
3b5a8b1
855b4b0
3b5a8b1
855b4b0
3b5a8b1
42b2650
855b4b0
c16c548
 
 
68ac076
 
 
56f5cbd
68ac076
855b4b0
 
dd61286
 
 
 
 
 
 
e2bb6ed
855b4b0
56f5cbd
68ac076
855b4b0
56f5cbd
 
855b4b0
56f5cbd
 
 
 
 
 
 
855b4b0
 
c16c548
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import os
import re
import tempfile
from difflib import unified_diff

import fitz  # PyMuPDF
import streamlit as st
from docx import Document
from docx.shared import RGBColor

from langchain_pipeline import pipeline, model_names

def pdf_to_text_with_layout(pdf_file):
    """Extract the text of every page of a PDF.

    Args:
        pdf_file: Binary file-like object holding the PDF. It is read with
            ``read()``, so the stream is left at end-of-file afterwards;
            callers that reuse the stream must rewind it themselves.

    Returns:
        The text of all pages in page order, joined with newlines.
    """
    # Use the document as a context manager so the PyMuPDF handle is closed
    # even when extracting a page raises.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        return "\n".join(page.get_text("text") for page in doc)

def clean_text(text):
    """Return *text* without unprintable or non-BMP characters.

    A character is kept only when ``str.isprintable`` accepts it and its
    code point is below 0x10000. Control characters (including tab and
    newline) are therefore dropped, as are characters outside the Basic
    Multilingual Plane; printable non-ASCII characters are kept.
    """
    def _keep(ch):
        return ch.isprintable() and ord(ch) < 0x10000

    return "".join(filter(_keep, text))

def text_to_word_with_formatting(text, word_path):
    """Save *text* as a Word document with one paragraph per line.

    The text is split on ``"\\n"``; each piece is passed through
    ``clean_text`` and appended as its own paragraph (empty lines become
    empty paragraphs). The document is then written to *word_path*.
    """
    document = Document()
    for cleaned in map(clean_text, text.split("\n")):
        document.add_paragraph(cleaned)
    document.save(word_path)

def apply_pipeline(file, model_name, balance_type, apsn_transactions, max_fees_per_day, min_overdrawn_fee, min_transaction_overdraft):
    """Call ``pipeline`` with the file and the settings, positionally.

    The arguments are forwarded unchanged and in the same order as this
    function's signature; whatever ``pipeline`` returns is returned as-is.
    """
    settings = (
        model_name,
        balance_type,
        apsn_transactions,
        max_fees_per_day,
        min_overdrawn_fee,
        min_transaction_overdraft,
    )
    return pipeline(file, *settings)

def redline_changes(original_path, revised_path, output_path):
    """Write a colour-coded unified diff of two Word documents.

    The paragraph texts of each document are joined with newlines and
    compared with ``difflib.unified_diff``. Every diff line becomes one
    'Normal'-styled paragraph in a new document saved to *output_path*:

    * lines starting with ``-`` are red,
    * lines starting with ``+`` are green,
    * lines starting with ``@@`` are blue,
    * all other lines keep the default colour.

    The ``---`` / ``+++`` header lines emitted by ``unified_diff`` also match
    the ``-`` / ``+`` prefixes and are coloured accordingly.
    """
    def _paragraph_lines(path):
        joined = "\n".join(para.text for para in Document(path).paragraphs)
        return joined.splitlines()

    before = _paragraph_lines(original_path)
    after = _paragraph_lines(revised_path)

    # Checked in order; the first matching prefix decides the colour.
    highlights = (
        ('-', RGBColor(255, 0, 0)),    # Red
        ('+', RGBColor(0, 128, 0)),    # Green
        ('@@', RGBColor(0, 0, 255)),   # Blue
    )

    report = Document()
    for entry in unified_diff(before, after, lineterm=''):
        colour = next(
            (rgb for prefix, rgb in highlights if entry.startswith(prefix)),
            None,
        )
        if colour is None:
            report.add_paragraph(entry, style='Normal')
            continue
        run = report.add_paragraph(style='Normal').add_run(entry)
        run.font.color.rgb = colour

    report.save(output_path)

# Streamlit App
# Collects the pipeline settings and a PDF upload, runs the pipeline, and
# offers a colour-coded diff between the extracted PDF text and the pipeline
# output as a downloadable Word document.
st.title("Canarie AI Prototype")
st.subheader("Finding the canarie in the coal mine")

model_name = st.selectbox("Model", model_names())

balance_type = st.selectbox("Do you charge on available balance or ledger balance?", ["available balance", "ledger balance"])

apsn_transactions = st.selectbox("Do you charge for APSN transactions?", ["yes", "no"])

max_fees_per_day = st.number_input("How many overdraft fees per day can be charged?", min_value=0, max_value=10)

min_overdrawn_fee = st.number_input("What is the minimum amount overdrawn to incur a fee?", min_value=0, max_value=500)

min_transaction_overdraft = st.number_input("What is the minimum transaction amount to trigger an overdraft?", min_value=0, max_value=500)

uploaded_file = st.file_uploader("Choose a file", type=["pdf"])

if uploaded_file is not None:
    with st.spinner('Please wait ...'):
        try:
            # Extract the text of the uploaded PDF
            extracted_text = pdf_to_text_with_layout(uploaded_file)

            # Text extraction reads the upload to end-of-file; rewind it so
            # the pipeline receives the stream from the beginning.
            uploaded_file.seek(0)

            diff = apply_pipeline(
                uploaded_file,
                model_name,
                balance_type,
                apsn_transactions,
                max_fees_per_day,
                min_overdrawn_fee,
                min_transaction_overdraft
            )

            # Keep the intermediate .docx files in a scratch directory that is
            # removed automatically, instead of leaking temp files/handles.
            with tempfile.TemporaryDirectory() as work_dir:
                original_word_path = os.path.join(work_dir, "original.docx")
                revised_word_path = os.path.join(work_dir, "revised.docx")
                redlined_output_path = os.path.join(work_dir, "redlined.docx")

                text_to_word_with_formatting(extracted_text, original_word_path)
                text_to_word_with_formatting(diff, revised_word_path)
                redline_changes(original_word_path, revised_word_path, redlined_output_path)

                # Load the result into memory before the directory is deleted.
                with open(redlined_output_path, "rb") as f:
                    redlined_bytes = f.read()

            st.download_button(
                label="Download Redlined Document",
                data=redlined_bytes,
                file_name="redlined_document.docx",
                mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
            )
            st.success("Redlined document created successfully!")

        except Exception as e:
            # Top-level UI boundary: surface any failure to the user.
            st.exception(e)