Spaces:
Sleeping
Sleeping
File size: 4,416 Bytes
3b5a8b1 3f682ab 68ac076 855b4b0 93abebf 56f5cbd 93abebf 507a7c5 2b00504 3b5a8b1 42b2650 68ac076 fc4e550 68ac076 507a7c5 28cc4a3 8024e72 507a7c5 68ac076 507a7c5 855b4b0 56f5cbd 2b00504 855b4b0 3b5a8b1 e675e34 3b5a8b1 855b4b0 3b5a8b1 855b4b0 3b5a8b1 855b4b0 3b5a8b1 855b4b0 3b5a8b1 855b4b0 3b5a8b1 42b2650 855b4b0 c16c548 68ac076 56f5cbd 68ac076 855b4b0 dd61286 e2bb6ed 855b4b0 56f5cbd 68ac076 855b4b0 56f5cbd 855b4b0 2b00504 56f5cbd 2b00504 855b4b0 c16c548 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 |
import streamlit as st
from langchain_pipeline import pipeline
import fitz # PyMuPDF
from docx import Document
from difflib import unified_diff
import tempfile
from docx.shared import RGBColor
import re
import subprocess
def pdf_to_text_with_layout(pdf_file):
    """Extract the plain text of every page of a PDF.

    Args:
        pdf_file: Binary file-like object holding the PDF. Its remaining
            content is consumed with ``read()``, so the stream position is
            left at the end of the file.

    Returns:
        str: The text of each page (``get_text("text")``), joined with
        newlines in page order.
    """
    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    try:
        return "\n".join(
            doc.load_page(page_num).get_text("text")
            for page_num in range(doc.page_count)
        )
    finally:
        # Release the document explicitly instead of waiting for garbage
        # collection; this function runs again on every upload.
        doc.close()
def clean_text(text):
    """Return ``text`` without non-printable or non-BMP characters.

    A character is kept only when ``str.isprintable()`` is true for it and
    its code point is below 65536. Control characters (including tabs and
    newlines) and characters outside the Basic Multilingual Plane are
    dropped; printable non-ASCII characters such as accented letters stay.
    """
    kept = [ch for ch in text if ord(ch) <= 0xFFFF and ch.isprintable()]
    return "".join(kept)
def text_to_word_with_formatting(text, word_path):
    """Write ``text`` to a Word document, one paragraph per line.

    The text is split on ``"\\n"``; each piece is passed through
    ``clean_text`` and added as its own paragraph (empty lines become empty
    paragraphs). No styling is applied to the paragraphs.

    Args:
        text: The string to write.
        word_path: Destination path of the ``.docx`` file.
    """
    document = Document()
    for raw_line in text.split("\n"):
        document.add_paragraph(clean_text(raw_line))
    document.save(word_path)
def apply_pipeline(file, model_name, balance_type, apsn_transactions, max_fees_per_day, min_overdrawn_fee, min_transaction_overdraft):
    """Run ``pipeline`` on ``file`` with the given settings.

    All arguments are forwarded positionally, in the order they are
    declared here, and the pipeline's return value is handed back unchanged.
    """
    settings = (
        model_name,
        balance_type,
        apsn_transactions,
        max_fees_per_day,
        min_overdrawn_fee,
        min_transaction_overdraft,
    )
    return pipeline(file, *settings)
def redline_changes(original_path, revised_path, output_path):
    """Combine the original and revised documents into ``output_path``.

    Runs the ``docxcompose`` console script with ``original_path`` as the
    master document, ``revised_path`` as the document to append, and
    ``-o output_path`` as the destination. The script takes the input files
    as plain positionals; a leading ``compose`` word would be read as the
    master file name.

    NOTE(review): docxcompose appends documents to a master; it is not
    known to emit tracked-change ("redline") markup. Confirm this output is
    what the "Redlined Document" download is meant to contain.

    Args:
        original_path: Path of the original ``.docx`` file.
        revised_path: Path of the revised ``.docx`` file.
        output_path: Path the combined ``.docx`` file is written to.

    Raises:
        FileNotFoundError: If the ``docxcompose`` executable is not found.
        subprocess.CalledProcessError: If ``docxcompose`` exits with a
            non-zero status.
    """
    # check=True surfaces a failed run instead of silently leaving an empty
    # output file behind for the user to download.
    subprocess.run(
        ['docxcompose', original_path, revised_path, '-o', output_path],
        check=True,
    )
# Streamlit App
DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"


def _new_docx_path():
    """Create an empty temporary ``.docx`` file and return its path."""
    # Close the handle immediately instead of leaving it to garbage
    # collection; only the path is needed. The file itself is kept
    # (delete=False) and is not removed by this script.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as handle:
        return handle.name


st.title("Canarie AI Prototype")
st.subheader("Finding the canarie in the coal mine")

# Settings collected from the user and forwarded to the pipeline.
model_name = st.selectbox("Model", ["gemini-1.5-pro-001", "other-model-name"])
balance_type = st.selectbox("Do you charge on available balance or ledger balance?", ["available balance", "ledger balance"])
apsn_transactions = st.selectbox("Do you charge for APSN transactions?", ["yes", "no"])
max_fees_per_day = st.number_input("How many overdraft fees per day can be charged?", min_value=0, max_value=10)
min_overdrawn_fee = st.number_input("What is the minimum amount overdrawn to incur a fee?", min_value=0, max_value=500)
min_transaction_overdraft = st.number_input("What is the minimum transaction amount to trigger an overdraft?", min_value=0, max_value=500)
uploaded_file = st.file_uploader("Choose a file", type=["pdf"])

if uploaded_file is not None:
    with st.spinner('Please wait ...'):
        try:
            # Extract the PDF text and store it as the "original" document.
            extracted_text = pdf_to_text_with_layout(uploaded_file)
            original_word_path = _new_docx_path()
            text_to_word_with_formatting(extracted_text, original_word_path)

            # The extraction above read the upload to its end; rewind so the
            # pipeline does not receive an exhausted stream.
            uploaded_file.seek(0)
            diff = apply_pipeline(
                uploaded_file,
                model_name,
                balance_type,
                apsn_transactions,
                max_fees_per_day,
                min_overdrawn_fee,
                min_transaction_overdraft,
            )

            # Store the pipeline output as the "revised" document.
            revised_word_path = _new_docx_path()
            text_to_word_with_formatting(diff, revised_word_path)

            redlined_output_path = _new_docx_path()
            redline_changes(original_word_path, revised_word_path, redlined_output_path)

            # One download button per generated document, in a fixed order.
            downloads = [
                ("Download Original Document", original_word_path, "original_document.docx"),
                ("Download Revised Document", revised_word_path, "revised_document.docx"),
                ("Download Redlined Document", redlined_output_path, "redlined_document.docx"),
            ]
            for label, path, file_name in downloads:
                with open(path, "rb") as f:
                    st.download_button(
                        label=label,
                        data=f,
                        file_name=file_name,
                        mime=DOCX_MIME,
                    )
            st.success("Documents created successfully!")
        except Exception as e:
            # Top-level boundary: show any failure in the UI rather than
            # crashing the app.
            st.exception(e)
|