Spaces:
Sleeping
Sleeping
File size: 4,581 Bytes
3b5a8b1 c8891d8 68ac076 855b4b0 93abebf 56f5cbd 93abebf 507a7c5 3b5a8b1 42b2650 68ac076 507a7c5 28cc4a3 507a7c5 68ac076 507a7c5 855b4b0 56f5cbd 855b4b0 93abebf 855b4b0 93abebf 9fbcde0 93abebf 9fbcde0 93abebf 9fbcde0 b3b8d46 56f5cbd 855b4b0 3b5a8b1 855b4b0 3b5a8b1 855b4b0 3b5a8b1 855b4b0 3b5a8b1 855b4b0 3b5a8b1 855b4b0 3b5a8b1 855b4b0 3b5a8b1 42b2650 855b4b0 c16c548 68ac076 56f5cbd 68ac076 855b4b0 dd61286 e2bb6ed 855b4b0 56f5cbd 68ac076 855b4b0 56f5cbd 855b4b0 56f5cbd 855b4b0 c16c548 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
import os
import re
import tempfile
from difflib import unified_diff

import fitz # PyMuPDF
import streamlit as st
from docx import Document
from docx.shared import RGBColor

from langchain_pipeline import pipeline, model_names
def pdf_to_text_with_layout(pdf_file):
    """Extract the text of every page of a PDF and join it into one string.

    Args:
        pdf_file: Binary file-like object containing a PDF. It is consumed
            with ``read()``, so the stream is left at end-of-file; callers
            that need to reuse it must rewind it themselves.

    Returns:
        The text of all pages, in page order, separated by a newline.
        Text is extracted with PyMuPDF's ``"text"`` mode.
    """
    # The context manager guarantees the PyMuPDF document is closed even if
    # extraction of a page raises; previously the handle was never released.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        # Iterating a Document yields its pages in order.
        return "\n".join(page.get_text("text") for page in doc)
def clean_text(text):
    """Return *text* with unprintable and non-BMP characters removed.

    A character is kept only if ``str.isprintable()`` is true for it and its
    code point is below 65536 (i.e. inside the Basic Multilingual Plane).
    Printable non-ASCII characters such as accented letters are therefore
    preserved, while control characters (including tab and newline) and
    characters outside the BMP are dropped.
    """
    kept = []
    for ch in text:
        if ord(ch) >= 65536:
            continue
        if not ch.isprintable():
            continue
        kept.append(ch)
    return ''.join(kept)
def text_to_word_with_formatting(text, word_path):
    """Write *text* to a Word document, one paragraph per input line.

    The text is split on ``"\\n"``; each piece is passed through
    ``clean_text`` and appended as its own paragraph (empty lines become
    empty paragraphs). The document is then saved to *word_path*.
    """
    document = Document()
    for sanitized in map(clean_text, text.split("\n")):
        document.add_paragraph(sanitized)
    document.save(word_path)
def apply_pipeline(file, model_name, balance_type, apsn_transactions, max_fees_per_day, min_overdrawn_fee, min_transaction_overdraft):
    """Run ``pipeline`` with the given settings and return its result.

    All arguments are forwarded positionally, unchanged and in the order
    they are declared here.
    """
    pipeline_args = (
        file,
        model_name,
        balance_type,
        apsn_transactions,
        max_fees_per_day,
        min_overdrawn_fee,
        min_transaction_overdraft,
    )
    return pipeline(*pipeline_args)
def redline_changes(original_path, revised_path, output_path):
    """Write a colour-coded unified diff of two Word documents.

    The paragraph texts of each input document are joined and split into
    lines, then compared with ``difflib.unified_diff``. Every diff line
    becomes one 'Normal'-styled paragraph in a new document saved to
    *output_path*:

    * lines starting with ``-`` (removals and the ``---`` header) are red,
    * lines starting with ``+`` (additions and the ``+++`` header) are green,
    * ``@@`` hunk headers are blue,
    * all other (context) lines keep the default colour.
    """
    # Diff-line prefix -> font colour for that kind of line.
    prefix_colours = (
        ('-', RGBColor(255, 0, 0)),    # Red
        ('+', RGBColor(0, 128, 0)),    # Green
        ('@@', RGBColor(0, 0, 255)),   # Blue
    )

    def _document_lines(path):
        # Flatten a document's paragraphs into a list of text lines.
        joined = "\n".join(paragraph.text for paragraph in Document(path).paragraphs)
        return joined.splitlines()

    before = _document_lines(original_path)
    after = _document_lines(revised_path)

    report = Document()
    for entry in unified_diff(before, after, lineterm=''):
        colour = next(
            (rgb for prefix, rgb in prefix_colours if entry.startswith(prefix)),
            None,
        )
        if colour is None:
            report.add_paragraph(entry, style='Normal')
            continue
        run = report.add_paragraph(style='Normal').add_run(entry)
        run.font.color.rgb = colour
    report.save(output_path)
# Streamlit App
#
# Collects the fee-policy settings and a PDF from the user, runs the
# pipeline on the PDF, and offers a colour-coded Word diff between the text
# extracted from the PDF and the pipeline's output as a download.
st.title("Canarie AI Prototype")
st.subheader("Finding the canarie in the coal mine")

# Pipeline settings chosen by the user.
model_name = st.selectbox("Model", model_names())
balance_type = st.selectbox("Do you charge on available balance or ledger balance?", ["available balance", "ledger balance"])
apsn_transactions = st.selectbox("Do you charge for APSN transactions?", ["yes", "no"])
max_fees_per_day = st.number_input("How many overdraft fees per day can be charged?", min_value=0, max_value=10)
min_overdrawn_fee = st.number_input("What is the minimum amount overdrawn to incur a fee?", min_value=0, max_value=500)
min_transaction_overdraft = st.number_input("What is the minimum transaction amount to trigger an overdraft?", min_value=0, max_value=500)

uploaded_file = st.file_uploader("Choose a file", type=["pdf"])

if uploaded_file is not None:
    with st.spinner('Please wait ...'):
        try:
            # All intermediate .docx files live in one temporary directory
            # that is removed when the block exits, so no file handles or
            # temp files are left behind between Streamlit reruns.
            with tempfile.TemporaryDirectory() as work_dir:
                original_word_path = os.path.join(work_dir, "original.docx")
                revised_word_path = os.path.join(work_dir, "revised.docx")
                redlined_output_path = os.path.join(work_dir, "redlined.docx")

                # Extract the PDF text and store it as the "original" document.
                extracted_text = pdf_to_text_with_layout(uploaded_file)
                text_to_word_with_formatting(extracted_text, original_word_path)

                # The extraction above consumed the upload with read();
                # rewind it so the pipeline sees the file from the start.
                uploaded_file.seek(0)
                # NOTE(review): the pipeline result is treated as the revised
                # document text (a str) -- confirm against langchain_pipeline.
                revised_text = apply_pipeline(
                    uploaded_file,
                    model_name,
                    balance_type,
                    apsn_transactions,
                    max_fees_per_day,
                    min_overdrawn_fee,
                    min_transaction_overdraft
                )
                text_to_word_with_formatting(revised_text, revised_word_path)

                redline_changes(original_word_path, revised_word_path, redlined_output_path)

                # Load the result into memory before the directory is
                # deleted; the download button then serves these bytes.
                with open(redlined_output_path, "rb") as f:
                    redlined_bytes = f.read()

            st.download_button(
                label="Download Redlined Document",
                data=redlined_bytes,
                file_name="redlined_document.docx",
                mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
            )
            st.success("Redlined document created successfully!")
        except Exception as e:
            # Top-level UI boundary: show any failure to the user in the page.
            st.exception(e)
|