Spaces:
Build error
Build error
# app.py | |
from flask import Flask, request, jsonify, render_template_string | |
import PyPDF2 | |
import sqlite3 | |
from datetime import datetime | |
import re | |
import os | |
import hashlib | |
from typing import List, Dict | |
import shutil | |
from sklearn.pipeline import Pipeline | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.naive_bayes import MultinomialNB | |
import numpy as np | |
import joblib | |
import base64 | |
# HTML template with embedded JavaScript | |
HTML_TEMPLATE = """ | |
<!DOCTYPE html> | |
<html lang="en"> | |
<head> | |
<meta charset="UTF-8"> | |
<meta name="viewport" content="width=device-width, initial-scale=1.0"> | |
<title>Document Processor</title> | |
<script src="https://cdn.tailwindcss.com"></script> | |
<style> | |
/* Additional custom styles can go here */ | |
.processing { | |
animation: pulse 2s infinite; | |
} | |
@keyframes pulse { | |
0% { opacity: 1; } | |
50% { opacity: 0.5; } | |
100% { opacity: 1; } | |
} | |
</style> | |
</head> | |
<body class="bg-gray-50"> | |
<div class="container mx-auto p-6 max-w-4xl"> | |
<div class="mb-8"> | |
<h1 class="text-3xl font-bold mb-2">Smart Document Processor</h1> | |
<p class="text-gray-600">Upload and analyze PDF documents with AI</p> | |
</div> | |
<!-- Upload Section --> | |
<div class="mb-8"> | |
<div id="dropZone" class="border-2 border-dashed border-gray-300 rounded-lg p-8 text-center hover:border-blue-500 transition-colors"> | |
<input type="file" multiple accept=".pdf" id="fileInput" class="hidden"> | |
<div class="cursor-pointer"> | |
<svg class="w-12 h-12 text-gray-400 mx-auto mb-4" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke="currentColor"> | |
<path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M7 16a4 4 0 01-.88-7.903A5 5 0 1115.9 6L16 6a5 5 0 011 9.9M15 13l-3-3m0 0l-3 3m3-3v12"/> | |
</svg> | |
<span class="text-lg mb-2 block">Drop PDFs here or click to upload</span> | |
<span class="text-sm text-gray-500">Supports multiple files</span> | |
</div> | |
</div> | |
</div> | |
<!-- File List --> | |
<div id="fileList" class="mb-8 hidden"> | |
<h2 class="text-xl font-semibold mb-4">Selected Files</h2> | |
<div id="fileListContent" class="space-y-2"></div> | |
<button id="processButton" class="mt-4 bg-blue-600 text-white px-6 py-2 rounded-lg hover:bg-blue-700 disabled:opacity-50"> | |
Process Documents | |
</button> | |
</div> | |
<!-- Results Section --> | |
<div id="results" class="space-y-4"></div> | |
<!-- Error Alert --> | |
<div id="error" class="hidden mt-4 bg-red-100 border border-red-400 text-red-700 px-4 py-3 rounded"></div> | |
</div> | |
<script> | |
let files = []; | |
const dropZone = document.getElementById('dropZone'); | |
const fileInput = document.getElementById('fileInput'); | |
const fileList = document.getElementById('fileList'); | |
const fileListContent = document.getElementById('fileListContent'); | |
const processButton = document.getElementById('processButton'); | |
const resultsDiv = document.getElementById('results'); | |
const errorDiv = document.getElementById('error'); | |
// Drag and drop handlers | |
dropZone.addEventListener('dragover', (e) => { | |
e.preventDefault(); | |
dropZone.classList.add('border-blue-500'); | |
}); | |
dropZone.addEventListener('dragleave', () => { | |
dropZone.classList.remove('border-blue-500'); | |
}); | |
dropZone.addEventListener('drop', (e) => { | |
e.preventDefault(); | |
dropZone.classList.remove('border-blue-500'); | |
handleFiles(e.dataTransfer.files); | |
}); | |
dropZone.addEventListener('click', () => { | |
fileInput.click(); | |
}); | |
fileInput.addEventListener('change', (e) => { | |
handleFiles(e.target.files); | |
}); | |
function handleFiles(uploadedFiles) { | |
files = Array.from(uploadedFiles).filter(file => file.name.toLowerCase().endsWith('.pdf')); | |
updateFileList(); | |
} | |
function updateFileList() { | |
if (files.length > 0) { | |
fileList.classList.remove('hidden'); | |
fileListContent.innerHTML = files.map((file, index) => ` | |
<div class="flex items-center p-3 bg-gray-50 rounded"> | |
<svg class="w-5 h-5 text-gray-500 mr-3" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke="currentColor"> | |
<path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M9 12h6m-6 4h6m2 5H7a2 2 0 01-2-2V5a2 2 0 012-2h5.586a1 1 0 01.707.293l5.414 5.414a1 1 0 01.293.707V19a2 2 0 01-2 2z"/> | |
</svg> | |
<span>${file.name}</span> | |
</div> | |
`).join(''); | |
} else { | |
fileList.classList.add('hidden'); | |
} | |
} | |
processButton.addEventListener('click', async () => { | |
if (files.length === 0) return; | |
processButton.disabled = true; | |
processButton.innerHTML = ` | |
<svg class="animate-spin -ml-1 mr-3 h-5 w-5 text-white inline" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24"> | |
<circle class="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" stroke-width="4"></circle> | |
<path class="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4zm2 5.291A7.962 7.962 0 014 12H0c0 3.042 1.135 5.824 3 7.938l3-2.647z"></path> | |
</svg> | |
Processing... | |
`; | |
const formData = new FormData(); | |
files.forEach(file => { | |
formData.append('files[]', file); | |
}); | |
try { | |
const response = await fetch('/batch_process', { | |
method: 'POST', | |
body: formData | |
}); | |
const data = await response.json(); | |
displayResults(data); | |
errorDiv.classList.add('hidden'); | |
} catch (error) { | |
errorDiv.textContent = 'Failed to process documents. Please try again.'; | |
errorDiv.classList.remove('hidden'); | |
} finally { | |
processButton.disabled = false; | |
processButton.textContent = 'Process Documents'; | |
} | |
}); | |
function displayResults(results) { | |
resultsDiv.innerHTML = results.map(result => ` | |
<div class="border rounded-lg p-4 bg-white shadow-sm"> | |
<h3 class="font-medium mb-2">${result.result.filename}</h3> | |
<div class="grid grid-cols-2 gap-4"> | |
<div> | |
<span class="text-gray-600">Type:</span> | |
<span class="ml-2">${result.result.doc_type}</span> | |
</div> | |
<div> | |
<span class="text-gray-600">Date:</span> | |
<span class="ml-2">${result.result.date || 'N/A'}</span> | |
</div> | |
<div> | |
<span class="text-gray-600">Amount:</span> | |
<span class="ml-2">${result.result.amount ? '$' + result.result.amount.toFixed(2) : 'N/A'}</span> | |
</div> | |
<div> | |
<span class="text-gray-600">Person:</span> | |
<span class="ml-2">${result.result.person_name}</span> | |
</div> | |
</div> | |
</div> | |
`).join(''); | |
} | |
</script> | |
</body> | |
</html> | |
""" | |
class MLDocumentClassifier: | |
def __init__(self): | |
self.labels = ['Invoice', 'Statement', 'Contract', 'Receipt', 'Report', 'Letter', 'Form'] | |
self.classifier = Pipeline([ | |
('tfidf', TfidfVectorizer(ngram_range=(1, 2), stop_words='english', max_features=10000)), | |
('clf', MultinomialNB()) | |
]) | |
self.is_trained = False | |
def predict(self, text): | |
return self._rule_based_classify(text) | |
def _rule_based_classify(self, text): | |
text_lower = text.lower() | |
rules = [ | |
('Invoice', ['invoice', 'bill', 'payment due', 'amount due']), | |
('Statement', ['statement', 'balance', 'transaction history']), | |
('Contract', ['contract', 'agreement', 'terms and conditions']), | |
('Receipt', ['receipt', 'purchased', 'payment received']), | |
('Report', ['report', 'analysis', 'findings']), | |
('Letter', ['dear', 'sincerely', 'regards']), | |
('Form', ['form', 'please fill', 'application']) | |
] | |
scores = [] | |
for doc_type, keywords in rules: | |
score = sum(1 for keyword in keywords if keyword in text_lower) | |
scores.append((doc_type, score / len(keywords) if keywords else 0)) | |
scores.sort(key=lambda x: x[1], reverse=True) | |
return scores[0][0] | |
class EnhancedDocProcessor: | |
def __init__(self): | |
self.conn = sqlite3.connect(':memory:', check_same_thread=False) | |
self.setup_database() | |
self.classifier = MLDocumentClassifier() | |
def setup_database(self): | |
self.conn.executescript(''' | |
CREATE TABLE IF NOT EXISTS documents ( | |
id INTEGER PRIMARY KEY, | |
filename TEXT, | |
doc_type TEXT, | |
person_name TEXT, | |
amount REAL, | |
date TEXT, | |
account_number TEXT, | |
raw_text TEXT, | |
processed_date TEXT, | |
file_hash TEXT, | |
version INTEGER, | |
user_id TEXT | |
); | |
CREATE TABLE IF NOT EXISTS similar_docs ( | |
doc_id INTEGER, | |
similar_doc_id INTEGER, | |
similarity_score REAL, | |
FOREIGN KEY (doc_id) REFERENCES documents (id), | |
FOREIGN KEY (similar_doc_id) REFERENCES documents (id) | |
); | |
''') | |
self.conn.commit() | |
def extract_text(self, pdf_path: str) -> str: | |
try: | |
text_parts = [] | |
with open(pdf_path, 'rb') as file: | |
reader = PyPDF2.PdfReader(file) | |
for page in reader.pages: | |
text = page.extract_text() | |
if text: | |
text_parts.append(text) | |
return "\n".join(text_parts) | |
except Exception as e: | |
return f"Error extracting text: {str(e)}" | |
def extract_metadata(self, text: str) -> Dict: | |
return { | |
'amount': next((float(amt.replace('$','').replace(',','')) | |
for amt in re.findall(r'\$[\d,]+\.?\d*', text)), 0.0), | |
'date': next(iter(re.findall(r'\d{1,2}/\d{1,2}/\d{4}', text)), None), | |
'account_number': next(iter(re.findall(r'Account\s*#?\s*:?\s*(\d{8,12})', text)), None), | |
'person_name': next(iter(re.findall(r'(?:Mr\.|Mrs\.|Ms\.|Dr\.)\s+([A-Z][a-z]+\s+[A-Z][a-z]+)', text)), "Unknown") | |
} | |
def process_document(self, pdf_path: str, filename: str, user_id: str = None) -> Dict: | |
text = self.extract_text(pdf_path) | |
doc_type = self.classifier.predict(text) | |
metadata = self.extract_metadata(text) | |
cursor = self.conn.execute(''' | |
INSERT INTO documents | |
(filename, doc_type, person_name, amount, date, | |
account_number, raw_text, processed_date, user_id) | |
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) | |
''', ( | |
filename, doc_type, metadata['person_name'], | |
metadata['amount'], metadata['date'], | |
metadata['account_number'], text, | |
datetime.now().isoformat(), user_id | |
)) | |
doc_id = cursor.lastrowid | |
self.conn.commit() | |
return { | |
'id': doc_id, | |
'filename': filename, | |
'doc_type': doc_type, | |
**metadata | |
} | |
def process_batch(self, file_paths: List[str], user_id: str = None) -> List[Dict]: | |
results = [] | |
for file_path in file_paths: | |
try: | |
result = self.process_document(file_path, os.path.basename(file_path), user_id) | |
results.append({"status": "success", "result": result, "file": file_path}) | |
except Exception as e: | |
results.append({"status": "error", "error": str(e), "file": file_path}) | |
return results | |
app = Flask(__name__) | |
processor = EnhancedDocProcessor() | |
def index(): | |
return render_template_string(HTML_TEMPLATE) | |
def batch_process(): | |
if 'files[]' not in request.files: | |
return jsonify({'error': 'No files uploaded'}), 400 | |
files = request.files.getlist('files[]') | |
user_id = request.form.get('user_id') | |
file_paths = [] | |
for file in files: | |
if file.filename.endswith('.pdf'): | |
temp_path = f"temp_{file.filename}" | |
file.save(temp_path) | |
file_paths.append(temp_path) | |
try: | |
results = processor.process_batch(file_paths, user_id) | |
except Exception as e: | |
return jsonify({'error': str(e)}), 500 | |
finally: | |
# Clean up temporary files | |
for path in file_paths: | |
try: | |
os.remove(path) | |
except: | |
pass | |
return jsonify(results) | |
if __name__ == '__main__': | |
app.run(host='0.0.0.0', port=7860, debug=True) |