Spaces:

sanjaymalladi
/

document-processor

Build error

App Files Files Community

realsanjay committed on Dec 27, 2024

Commit

858e9ba

verified ·

1 Parent(s): 1015649

Update app.py

Browse files

Files changed (1) hide show

app.py +113 -199

app.py CHANGED Viewed

@@ -16,187 +16,61 @@ import joblib
 import base64
 from werkzeug.utils import secure_filename
 import tempfile
-# HTML template with embedded JavaScript
-HTML_TEMPLATE = """
-<!DOCTYPE html>
-<html lang="en">
-<head>
-    <meta charset="UTF-8">
-    <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>Document Processor</title>
-    <script src="https://cdn.tailwindcss.com"></script>
-    <style>
-        /* Additional custom styles can go here */
-        .processing {
-            animation: pulse 2s infinite;
-        }
-        @keyframes pulse {
-            0% { opacity: 1; }
-            50% { opacity: 0.5; }
-            100% { opacity: 1; }
-        }
-    </style>
-</head>
-<body class="bg-gray-50">
-    <div class="container mx-auto p-6 max-w-4xl">
-        <div class="mb-8">
-            <h1 class="text-3xl font-bold mb-2">Smart Document Processor</h1>
-            <p class="text-gray-600">Upload and analyze PDF documents with AI</p>
-        </div>
-        <!-- Upload Section -->
-        <div class="mb-8">
-            <div id="dropZone" class="border-2 border-dashed border-gray-300 rounded-lg p-8 text-center hover:border-blue-500 transition-colors">
-                <input type="file" multiple accept=".pdf" id="fileInput" class="hidden">
-                <div class="cursor-pointer">
-                    <svg class="w-12 h-12 text-gray-400 mx-auto mb-4" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke="currentColor">
-                        <path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M7 16a4 4 0 01-.88-7.903A5 5 0 1115.9 6L16 6a5 5 0 011 9.9M15 13l-3-3m0 0l-3 3m3-3v12"/>
-                    </svg>
-                    <span class="text-lg mb-2 block">Drop PDFs here or click to upload</span>
-                    <span class="text-sm text-gray-500">Supports multiple files</span>
-                </div>
-            </div>
-        </div>
-        <!-- File List -->
-        <div id="fileList" class="mb-8 hidden">
-            <h2 class="text-xl font-semibold mb-4">Selected Files</h2>
-            <div id="fileListContent" class="space-y-2"></div>
-            <button id="processButton" class="mt-4 bg-blue-600 text-white px-6 py-2 rounded-lg hover:bg-blue-700 disabled:opacity-50">
-                Process Documents
-            </button>
-        </div>
-        <!-- Results Section -->
-        <div id="results" class="space-y-4"></div>
-        <!-- Error Alert -->
-        <div id="error" class="hidden mt-4 bg-red-100 border border-red-400 text-red-700 px-4 py-3 rounded"></div>
-    </div>
-    <script>
-        let files = [];
-        const dropZone = document.getElementById('dropZone');
-        const fileInput = document.getElementById('fileInput');
-        const fileList = document.getElementById('fileList');
-        const fileListContent = document.getElementById('fileListContent');
-        const processButton = document.getElementById('processButton');
-        const resultsDiv = document.getElementById('results');
-        const errorDiv = document.getElementById('error');
-        // Drag and drop handlers
-        dropZone.addEventListener('dragover', (e) => {
-            e.preventDefault();
-            dropZone.classList.add('border-blue-500');
-        });
-        dropZone.addEventListener('dragleave', () => {
-            dropZone.classList.remove('border-blue-500');
-        });
-        dropZone.addEventListener('drop', (e) => {
-            e.preventDefault();
-            dropZone.classList.remove('border-blue-500');
-            handleFiles(e.dataTransfer.files);
-        });
-        dropZone.addEventListener('click', () => {
-            fileInput.click();
-        });
-        fileInput.addEventListener('change', (e) => {
-            handleFiles(e.target.files);
-        });
-        function handleFiles(uploadedFiles) {
-            files = Array.from(uploadedFiles).filter(file => file.name.toLowerCase().endsWith('.pdf'));
-            updateFileList();
-        }
-        function updateFileList() {
-            if (files.length > 0) {
-                fileList.classList.remove('hidden');
-                fileListContent.innerHTML = files.map((file, index) => `
-                    <div class="flex items-center p-3 bg-gray-50 rounded">
-                        <svg class="w-5 h-5 text-gray-500 mr-3" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke="currentColor">
-                            <path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M9 12h6m-6 4h6m2 5H7a2 2 0 01-2-2V5a2 2 0 012-2h5.586a1 1 0 01.707.293l5.414 5.414a1 1 0 01.293.707V19a2 2 0 01-2 2z"/>
-                        </svg>
-                        <span>${file.name}</span>
-                    </div>
-                `).join('');
-            } else {
-                fileList.classList.add('hidden');
-            }
         }
-        processButton.addEventListener('click', async () => {
-            if (files.length === 0) return;
-            processButton.disabled = true;
-            processButton.innerHTML = `
-                <svg class="animate-spin -ml-1 mr-3 h-5 w-5 text-white inline" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24">
-                    <circle class="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" stroke-width="4"></circle>
-                    <path class="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4zm2 5.291A7.962 7.962 0 014 12H0c0 3.042 1.135 5.824 3 7.938l3-2.647z"></path>
-                </svg>
-                Processing...
-            `;
-            const formData = new FormData();
-            files.forEach(file => {
-                formData.append('files[]', file);
-            });
-            try {
-                const response = await fetch('/batch_process', {
-                    method: 'POST',
-                    body: formData
-                });
-                const data = await response.json();
-                displayResults(data);
-                errorDiv.classList.add('hidden');
-            } catch (error) {
-                errorDiv.textContent = 'Failed to process documents. Please try again.';
-                errorDiv.classList.remove('hidden');
-            } finally {
-                processButton.disabled = false;
-                processButton.textContent = 'Process Documents';
-            }
-        });
-        function displayResults(results) {
-            resultsDiv.innerHTML = results.map(result => `
-                <div class="border rounded-lg p-4 bg-white shadow-sm">
-                    <h3 class="font-medium mb-2">${result.result.filename}</h3>
-                    <div class="grid grid-cols-2 gap-4">
-                        <div>
-                            <span class="text-gray-600">Type:</span>
-                            <span class="ml-2">${result.result.doc_type}</span>
-                        </div>
-                        <div>
-                            <span class="text-gray-600">Date:</span>
-                            <span class="ml-2">${result.result.date || 'N/A'}</span>
-                        </div>
-                        <div>
-                            <span class="text-gray-600">Amount:</span>
-                            <span class="ml-2">${result.result.amount ? '$' + result.result.amount.toFixed(2) : 'N/A'}</span>
-                        </div>
-                        <div>
-                            <span class="text-gray-600">Person:</span>
-                            <span class="ml-2">${result.result.person_name}</span>
-                        </div>
-                    </div>
-                </div>
-            `).join('');
         }
-    </script>
-</body>
-</html>
-"""
 class MLDocumentClassifier:
     def __init__(self):
-        self.labels = ['Invoice', 'Statement', 'Contract', 'Receipt', 'Report', 'Letter', 'Form']
         self.classifier = Pipeline([
             ('tfidf', TfidfVectorizer(ngram_range=(1, 2), stop_words='english', max_features=10000)),
             ('clf', MultinomialNB())
@@ -209,13 +83,15 @@ class MLDocumentClassifier:
     def _rule_based_classify(self, text):
         text_lower = text.lower()
         rules = [
-            ('Invoice', ['invoice', 'bill', 'payment due', 'amount due']),
-            ('Statement', ['statement', 'balance', 'transaction history']),
-            ('Contract', ['contract', 'agreement', 'terms and conditions']),
-            ('Receipt', ['receipt', 'purchased', 'payment received']),
-            ('Report', ['report', 'analysis', 'findings']),
-            ('Letter', ['dear', 'sincerely', 'regards']),
-            ('Form', ['form', 'please fill', 'application'])
         ]
         scores = []
@@ -231,22 +107,33 @@ class EnhancedDocProcessor:
         self.conn = sqlite3.connect(':memory:', check_same_thread=False)
         self.setup_database()
         self.classifier = MLDocumentClassifier()
     def setup_database(self):
         self.conn.executescript('''
             CREATE TABLE IF NOT EXISTS documents (
                 id INTEGER PRIMARY KEY,
                 filename TEXT,
                 doc_type TEXT,
-                person_name TEXT,
                 amount REAL,
                 date TEXT,
                 account_number TEXT,
                 raw_text TEXT,
                 processed_date TEXT,
                 file_hash TEXT,
-                version INTEGER,
-                user_id TEXT
             );
             CREATE TABLE IF NOT EXISTS similar_docs (
@@ -273,29 +160,58 @@ class EnhancedDocProcessor:
             return f"Error extracting text: {str(e)}"
     def extract_metadata(self, text: str) -> Dict:
-        return {
             'amount': next((float(amt.replace('$','').replace(',',''))
                           for amt in re.findall(r'\$[\d,]+\.?\d*', text)), 0.0),
             'date': next(iter(re.findall(r'\d{1,2}/\d{1,2}/\d{4}', text)), None),
             'account_number': next(iter(re.findall(r'Account\s*#?\s*:?\s*(\d{8,12})', text)), None),
-            'person_name': next(iter(re.findall(r'(?:Mr\.|Mrs\.|Ms\.|Dr\.)\s+([A-Z][a-z]+\s+[A-Z][a-z]+)', text)), "Unknown")
         }
-    def process_document(self, pdf_path: str, filename: str, user_id: str = None) -> Dict:
         text = self.extract_text(pdf_path)
         doc_type = self.classifier.predict(text)
         metadata = self.extract_metadata(text)
         cursor = self.conn.execute('''
             INSERT INTO documents
-            (filename, doc_type, person_name, amount, date,
-             account_number, raw_text, processed_date, user_id)
             VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
         ''', (
-            filename, doc_type, metadata['person_name'],
             metadata['amount'], metadata['date'],
             metadata['account_number'], text,
-            datetime.now().isoformat(), user_id
         ))
         doc_id = cursor.lastrowid
@@ -305,19 +221,22 @@ class EnhancedDocProcessor:
             'id': doc_id,
             'filename': filename,
             'doc_type': doc_type,
             **metadata
         }
-    def process_batch(self, file_paths: List[str], user_id: str = None) -> List[Dict]:
         results = []
         for file_path in file_paths:
             try:
-                result = self.process_document(file_path, os.path.basename(file_path), user_id)
                 results.append({"status": "success", "result": result, "file": file_path})
             except Exception as e:
                 results.append({"status": "error", "error": str(e), "file": file_path})
         return results
 app = Flask(__name__)
 processor = EnhancedDocProcessor()
@@ -331,25 +250,20 @@ def batch_process():
         return jsonify({'error': 'No files uploaded'}), 400
     files = request.files.getlist('files[]')
-    user_id = request.form.get('user_id')
-    # Create a temporary directory
     with tempfile.TemporaryDirectory() as temp_dir:
         file_paths = []
         for file in files:
             if file.filename.endswith('.pdf'):
-                # Create a secure filename
                 secure_name = secure_filename(file.filename)
-                # Create full path in temporary directory
                 temp_path = os.path.join(temp_dir, secure_name)
                 file.save(temp_path)
                 file_paths.append(temp_path)
         try:
-            results = processor.process_batch(file_paths, user_id)
         except Exception as e:
             return jsonify({'error': str(e)}), 500
-        # No need to manually clean up - TemporaryDirectory does it automatically
         return jsonify(results)

 import base64
 from werkzeug.utils import secure_filename
 import tempfile
+class PersonIdentifier:
+    """Regex-based extractor for a person's name, ID numbers and email.
+
+    Each kind of field is searched for in raw document text and the first
+    match found is the one reported.
+    """
+    def __init__(self):
+        # Ordered from most to least specific; identify_person stops at the
+        # first pattern that produces a match.
+        self.name_patterns = [
+            r'(?:Mr\.|Mrs\.|Ms\.|Dr\.)\s+([A-Z][a-z]+\s+[A-Z][a-z]+)',
+            r'Name:?\s*([A-Z][a-z]+\s+[A-Z][a-z]+)',
+            # Loose fallback: any two adjacent capitalized words.
+            r'([A-Z][a-z]+\s+[A-Z][a-z]+)'
+        ]
+        # The \b anchors stop a pattern from matching inside a longer token.
+        # Without them the first 8 characters of a 9-character passport
+        # number are also reported as a driver's license.
+        self.id_patterns = {
+            'ssn': r'\b(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}\b',
+            'drivers_license': r'\b[A-Z]\d{7}\b',
+            'passport': r'\b[A-Z]\d{8}\b',
         }
+        self.email_pattern = r'[\w\.-]+@[\w\.-]+\.\w+'
+    def identify_person(self, text: str) -> Dict:
+        """Return the first name, ID numbers and email found in *text*.
+
+        The result always contains 'name' (str or None), 'id_numbers' (dict
+        keyed by ID type, holding only the types that matched) and 'email'
+        (str or None).
+        """
+        person_data = {
+            'name': None,
+            'id_numbers': {},
+            'email': None
         }
+        # Extract name
+        for pattern in self.name_patterns:
+            names = re.findall(pattern, text)
+            if names:
+                person_data['name'] = names[0]
+                break
+        # Extract IDs
+        for id_type, pattern in self.id_patterns.items():
+            ids = re.findall(pattern, text)
+            if ids:
+                person_data['id_numbers'][id_type] = ids[0]
+        # Extract email
+        emails = re.findall(self.email_pattern, text)
+        if emails:
+            person_data['email'] = emails[0]
+        return person_data
 class MLDocumentClassifier:
     def __init__(self):
+        self.labels = [
+            'BankApplication_CreditCard',
+            'BankApplication_SavingsAccount',
+            'ID_DriversLicense',
+            'ID_Passport',
+            'ID_StateID',
+            'Financial_PayStub',
+            'Financial_TaxReturn',
+            'Financial_IncomeStatement',
+            'Receipt'
+        ]
         self.classifier = Pipeline([
             ('tfidf', TfidfVectorizer(ngram_range=(1, 2), stop_words='english', max_features=10000)),
             ('clf', MultinomialNB())
     def _rule_based_classify(self, text):
         text_lower = text.lower()
         rules = [
+            ('BankApplication_CreditCard', ['credit card application', 'card request', 'new card']),
+            ('BankApplication_SavingsAccount', ['savings account', 'open account', 'new account']),
+            ('ID_DriversLicense', ['driver license', 'driving permit', 'operator license']),
+            ('ID_Passport', ['passport', 'travel document']),
+            ('ID_StateID', ['state id', 'identification card']),
+            ('Financial_PayStub', ['pay stub', 'salary', 'wages']),
+            ('Financial_TaxReturn', ['tax return', 'form 1040', 'tax year']),
+            ('Financial_IncomeStatement', ['income statement', 'earnings report']),
+            ('Receipt', ['receipt', 'payment received', 'transaction record'])
         ]
         scores = []
         self.conn = sqlite3.connect(':memory:', check_same_thread=False)
         self.setup_database()
         self.classifier = MLDocumentClassifier()
+        self.person_identifier = PersonIdentifier()
     def setup_database(self):
         self.conn.executescript('''
+            CREATE TABLE IF NOT EXISTS persons (
+                id INTEGER PRIMARY KEY,
+                name TEXT,
+                email TEXT,
+                ssn TEXT,
+                drivers_license TEXT,
+                passport TEXT,
+                created_date TEXT
+            );
             CREATE TABLE IF NOT EXISTS documents (
                 id INTEGER PRIMARY KEY,
                 filename TEXT,
                 doc_type TEXT,
+                person_id INTEGER,
                 amount REAL,
                 date TEXT,
                 account_number TEXT,
                 raw_text TEXT,
                 processed_date TEXT,
                 file_hash TEXT,
+                confidence_score REAL,
+                FOREIGN KEY (person_id) REFERENCES persons (id)
             );
             CREATE TABLE IF NOT EXISTS similar_docs (
             return f"Error extracting text: {str(e)}"
     def extract_metadata(self, text: str) -> Dict:
+        """Pull the first dollar amount, date and account number out of *text*.
+
+        Returns a dict with 'amount' (float, 0.0 when no amount is found),
+        'date' (first digits/digits/4-digits string, or None) and
+        'account_number' (8-12 digits following "Account", or None).
+        """
+        # Require a digit right after '$' so stray matches such as "$,"
+        # cannot reach float() as an empty string and raise ValueError.
+        amounts = re.findall(r'\$\d[\d,]*(?:\.\d+)?', text)
+        metadata = {
+            'amount': next((float(amt.replace('$','').replace(',',''))
+                           for amt in amounts), 0.0),
             'date': next(iter(re.findall(r'\d{1,2}/\d{1,2}/\d{4}', text)), None),
             'account_number': next(iter(re.findall(r'Account\s*#?\s*:?\s*(\d{8,12})', text)), None),
         }
+        return metadata
+    def get_or_create_person(self, person_data: Dict) -> int:
+        """Return the persons.id matching *person_data*, inserting a row if needed.
+
+        A stored person matches when any one of name, email, SSN, driver's
+        license or passport equals the supplied value. Missing keys are
+        treated as None, and a None value never matches because SQL '='
+        against NULL is not true.
+        """
+        # Tolerate a missing or None 'id_numbers' entry and a missing 'name'.
+        id_numbers = person_data.get('id_numbers') or {}
+        identity = (
+            person_data.get('name'),
+            person_data.get('email'),
+            id_numbers.get('ssn'),
+            id_numbers.get('drivers_license'),
+            id_numbers.get('passport'),
+        )
+        cursor = self.conn.execute(
+            'SELECT id FROM persons WHERE name = ? OR email = ? OR ssn = ? OR drivers_license = ? OR passport = ?',
+            identity
+        )
+        result = cursor.fetchone()
+        if result:
+            return result[0]
+        cursor = self.conn.execute('''
+            INSERT INTO persons (name, email, ssn, drivers_license, passport, created_date)
+            VALUES (?, ?, ?, ?, ?, ?)
+        ''', identity + (datetime.now().isoformat(),))
+        self.conn.commit()
+        return cursor.lastrowid
+    def process_document(self, pdf_path: str, filename: str) -> Dict:
         text = self.extract_text(pdf_path)
         doc_type = self.classifier.predict(text)
         metadata = self.extract_metadata(text)
+        person_data = self.person_identifier.identify_person(text)
+        person_id = self.get_or_create_person(person_data)
         cursor = self.conn.execute('''
             INSERT INTO documents
+            (filename, doc_type, person_id, amount, date,
+             account_number, raw_text, processed_date, confidence_score)
             VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
         ''', (
+            filename, doc_type, person_id,
             metadata['amount'], metadata['date'],
             metadata['account_number'], text,
+            datetime.now().isoformat(), 0.85
         ))
         doc_id = cursor.lastrowid
             'id': doc_id,
             'filename': filename,
             'doc_type': doc_type,
+            'person': person_data,
             **metadata
         }
+    def process_batch(self, file_paths: List[str]) -> List[Dict]:
+        """Process each path independently and collect one outcome per file.
+
+        A failure on one file is recorded as an "error" entry and does not
+        stop the remaining files from being processed.
+        """
+        outcomes = []
+        for path in file_paths:
+            try:
+                processed = self.process_document(path, os.path.basename(path))
+            except Exception as exc:
+                outcome = {"status": "error", "error": str(exc), "file": path}
+            else:
+                outcome = {"status": "success", "result": processed, "file": path}
+            outcomes.append(outcome)
+        return outcomes
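+# NOTE: the HTML_TEMPLATE definition was removed in this change and only this placeholder was left in its place; restore the original template here before any code that references HTML_TEMPLATE is run.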
 app = Flask(__name__)
 processor = EnhancedDocProcessor()
         return jsonify({'error': 'No files uploaded'}), 400
     files = request.files.getlist('files[]')
     with tempfile.TemporaryDirectory() as temp_dir:
         file_paths = []
         for file in files:
             if file.filename.endswith('.pdf'):
                 secure_name = secure_filename(file.filename)
                 temp_path = os.path.join(temp_dir, secure_name)
                 file.save(temp_path)
                 file_paths.append(temp_path)
         try:
+            results = processor.process_batch(file_paths)
         except Exception as e:
             return jsonify({'error': str(e)}), 500
         return jsonify(results)