realsanjay committed on
Commit
858e9ba
·
verified ·
1 Parent(s): 1015649

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +113 -199
app.py CHANGED
@@ -16,187 +16,61 @@ import joblib
16
  import base64
17
  from werkzeug.utils import secure_filename
18
  import tempfile
19
- # HTML template with embedded JavaScript
20
- HTML_TEMPLATE = """
21
- <!DOCTYPE html>
22
- <html lang="en">
23
- <head>
24
- <meta charset="UTF-8">
25
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
26
- <title>Document Processor</title>
27
- <script src="https://cdn.tailwindcss.com"></script>
28
- <style>
29
- /* Additional custom styles can go here */
30
- .processing {
31
- animation: pulse 2s infinite;
32
- }
33
- @keyframes pulse {
34
- 0% { opacity: 1; }
35
- 50% { opacity: 0.5; }
36
- 100% { opacity: 1; }
37
- }
38
- </style>
39
- </head>
40
- <body class="bg-gray-50">
41
- <div class="container mx-auto p-6 max-w-4xl">
42
- <div class="mb-8">
43
- <h1 class="text-3xl font-bold mb-2">Smart Document Processor</h1>
44
- <p class="text-gray-600">Upload and analyze PDF documents with AI</p>
45
- </div>
46
-
47
- <!-- Upload Section -->
48
- <div class="mb-8">
49
- <div id="dropZone" class="border-2 border-dashed border-gray-300 rounded-lg p-8 text-center hover:border-blue-500 transition-colors">
50
- <input type="file" multiple accept=".pdf" id="fileInput" class="hidden">
51
- <div class="cursor-pointer">
52
- <svg class="w-12 h-12 text-gray-400 mx-auto mb-4" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke="currentColor">
53
- <path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M7 16a4 4 0 01-.88-7.903A5 5 0 1115.9 6L16 6a5 5 0 011 9.9M15 13l-3-3m0 0l-3 3m3-3v12"/>
54
- </svg>
55
- <span class="text-lg mb-2 block">Drop PDFs here or click to upload</span>
56
- <span class="text-sm text-gray-500">Supports multiple files</span>
57
- </div>
58
- </div>
59
- </div>
60
-
61
- <!-- File List -->
62
- <div id="fileList" class="mb-8 hidden">
63
- <h2 class="text-xl font-semibold mb-4">Selected Files</h2>
64
- <div id="fileListContent" class="space-y-2"></div>
65
- <button id="processButton" class="mt-4 bg-blue-600 text-white px-6 py-2 rounded-lg hover:bg-blue-700 disabled:opacity-50">
66
- Process Documents
67
- </button>
68
- </div>
69
-
70
- <!-- Results Section -->
71
- <div id="results" class="space-y-4"></div>
72
-
73
- <!-- Error Alert -->
74
- <div id="error" class="hidden mt-4 bg-red-100 border border-red-400 text-red-700 px-4 py-3 rounded"></div>
75
- </div>
76
-
77
- <script>
78
- let files = [];
79
- const dropZone = document.getElementById('dropZone');
80
- const fileInput = document.getElementById('fileInput');
81
- const fileList = document.getElementById('fileList');
82
- const fileListContent = document.getElementById('fileListContent');
83
- const processButton = document.getElementById('processButton');
84
- const resultsDiv = document.getElementById('results');
85
- const errorDiv = document.getElementById('error');
86
-
87
- // Drag and drop handlers
88
- dropZone.addEventListener('dragover', (e) => {
89
- e.preventDefault();
90
- dropZone.classList.add('border-blue-500');
91
- });
92
-
93
- dropZone.addEventListener('dragleave', () => {
94
- dropZone.classList.remove('border-blue-500');
95
- });
96
-
97
- dropZone.addEventListener('drop', (e) => {
98
- e.preventDefault();
99
- dropZone.classList.remove('border-blue-500');
100
- handleFiles(e.dataTransfer.files);
101
- });
102
 
103
- dropZone.addEventListener('click', () => {
104
- fileInput.click();
105
- });
106
-
107
- fileInput.addEventListener('change', (e) => {
108
- handleFiles(e.target.files);
109
- });
110
-
111
- function handleFiles(uploadedFiles) {
112
- files = Array.from(uploadedFiles).filter(file => file.name.toLowerCase().endsWith('.pdf'));
113
- updateFileList();
114
- }
115
-
116
- function updateFileList() {
117
- if (files.length > 0) {
118
- fileList.classList.remove('hidden');
119
- fileListContent.innerHTML = files.map((file, index) => `
120
- <div class="flex items-center p-3 bg-gray-50 rounded">
121
- <svg class="w-5 h-5 text-gray-500 mr-3" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke="currentColor">
122
- <path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M9 12h6m-6 4h6m2 5H7a2 2 0 01-2-2V5a2 2 0 012-2h5.586a1 1 0 01.707.293l5.414 5.414a1 1 0 01.293.707V19a2 2 0 01-2 2z"/>
123
- </svg>
124
- <span>${file.name}</span>
125
- </div>
126
- `).join('');
127
- } else {
128
- fileList.classList.add('hidden');
129
- }
130
  }
 
131
 
132
- processButton.addEventListener('click', async () => {
133
- if (files.length === 0) return;
134
-
135
- processButton.disabled = true;
136
- processButton.innerHTML = `
137
- <svg class="animate-spin -ml-1 mr-3 h-5 w-5 text-white inline" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24">
138
- <circle class="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" stroke-width="4"></circle>
139
- <path class="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4zm2 5.291A7.962 7.962 0 014 12H0c0 3.042 1.135 5.824 3 7.938l3-2.647z"></path>
140
- </svg>
141
- Processing...
142
- `;
143
-
144
- const formData = new FormData();
145
- files.forEach(file => {
146
- formData.append('files[]', file);
147
- });
148
-
149
- try {
150
- const response = await fetch('/batch_process', {
151
- method: 'POST',
152
- body: formData
153
- });
154
-
155
- const data = await response.json();
156
- displayResults(data);
157
- errorDiv.classList.add('hidden');
158
- } catch (error) {
159
- errorDiv.textContent = 'Failed to process documents. Please try again.';
160
- errorDiv.classList.remove('hidden');
161
- } finally {
162
- processButton.disabled = false;
163
- processButton.textContent = 'Process Documents';
164
- }
165
- });
166
-
167
- function displayResults(results) {
168
- resultsDiv.innerHTML = results.map(result => `
169
- <div class="border rounded-lg p-4 bg-white shadow-sm">
170
- <h3 class="font-medium mb-2">${result.result.filename}</h3>
171
- <div class="grid grid-cols-2 gap-4">
172
- <div>
173
- <span class="text-gray-600">Type:</span>
174
- <span class="ml-2">${result.result.doc_type}</span>
175
- </div>
176
- <div>
177
- <span class="text-gray-600">Date:</span>
178
- <span class="ml-2">${result.result.date || 'N/A'}</span>
179
- </div>
180
- <div>
181
- <span class="text-gray-600">Amount:</span>
182
- <span class="ml-2">${result.result.amount ? '$' + result.result.amount.toFixed(2) : 'N/A'}</span>
183
- </div>
184
- <div>
185
- <span class="text-gray-600">Person:</span>
186
- <span class="ml-2">${result.result.person_name}</span>
187
- </div>
188
- </div>
189
- </div>
190
- `).join('');
191
  }
192
- </script>
193
- </body>
194
- </html>
195
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
 
197
  class MLDocumentClassifier:
198
  def __init__(self):
199
- self.labels = ['Invoice', 'Statement', 'Contract', 'Receipt', 'Report', 'Letter', 'Form']
 
 
 
 
 
 
 
 
 
 
200
  self.classifier = Pipeline([
201
  ('tfidf', TfidfVectorizer(ngram_range=(1, 2), stop_words='english', max_features=10000)),
202
  ('clf', MultinomialNB())
@@ -209,13 +83,15 @@ class MLDocumentClassifier:
209
  def _rule_based_classify(self, text):
210
  text_lower = text.lower()
211
  rules = [
212
- ('Invoice', ['invoice', 'bill', 'payment due', 'amount due']),
213
- ('Statement', ['statement', 'balance', 'transaction history']),
214
- ('Contract', ['contract', 'agreement', 'terms and conditions']),
215
- ('Receipt', ['receipt', 'purchased', 'payment received']),
216
- ('Report', ['report', 'analysis', 'findings']),
217
- ('Letter', ['dear', 'sincerely', 'regards']),
218
- ('Form', ['form', 'please fill', 'application'])
 
 
219
  ]
220
 
221
  scores = []
@@ -231,22 +107,33 @@ class EnhancedDocProcessor:
231
  self.conn = sqlite3.connect(':memory:', check_same_thread=False)
232
  self.setup_database()
233
  self.classifier = MLDocumentClassifier()
 
234
 
235
  def setup_database(self):
236
  self.conn.executescript('''
 
 
 
 
 
 
 
 
 
 
237
  CREATE TABLE IF NOT EXISTS documents (
238
  id INTEGER PRIMARY KEY,
239
  filename TEXT,
240
  doc_type TEXT,
241
- person_name TEXT,
242
  amount REAL,
243
  date TEXT,
244
  account_number TEXT,
245
  raw_text TEXT,
246
  processed_date TEXT,
247
  file_hash TEXT,
248
- version INTEGER,
249
- user_id TEXT
250
  );
251
 
252
  CREATE TABLE IF NOT EXISTS similar_docs (
@@ -273,29 +160,58 @@ class EnhancedDocProcessor:
273
  return f"Error extracting text: {str(e)}"
274
 
275
  def extract_metadata(self, text: str) -> Dict:
276
- return {
277
  'amount': next((float(amt.replace('$','').replace(',',''))
278
  for amt in re.findall(r'\$[\d,]+\.?\d*', text)), 0.0),
279
  'date': next(iter(re.findall(r'\d{1,2}/\d{1,2}/\d{4}', text)), None),
280
  'account_number': next(iter(re.findall(r'Account\s*#?\s*:?\s*(\d{8,12})', text)), None),
281
- 'person_name': next(iter(re.findall(r'(?:Mr\.|Mrs\.|Ms\.|Dr\.)\s+([A-Z][a-z]+\s+[A-Z][a-z]+)', text)), "Unknown")
282
  }
 
283
 
284
- def process_document(self, pdf_path: str, filename: str, user_id: str = None) -> Dict:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
  text = self.extract_text(pdf_path)
286
  doc_type = self.classifier.predict(text)
287
  metadata = self.extract_metadata(text)
 
 
288
 
289
  cursor = self.conn.execute('''
290
  INSERT INTO documents
291
- (filename, doc_type, person_name, amount, date,
292
- account_number, raw_text, processed_date, user_id)
293
  VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
294
  ''', (
295
- filename, doc_type, metadata['person_name'],
296
  metadata['amount'], metadata['date'],
297
  metadata['account_number'], text,
298
- datetime.now().isoformat(), user_id
299
  ))
300
 
301
  doc_id = cursor.lastrowid
@@ -305,19 +221,22 @@ class EnhancedDocProcessor:
305
  'id': doc_id,
306
  'filename': filename,
307
  'doc_type': doc_type,
 
308
  **metadata
309
  }
310
 
311
- def process_batch(self, file_paths: List[str], user_id: str = None) -> List[Dict]:
312
  results = []
313
  for file_path in file_paths:
314
  try:
315
- result = self.process_document(file_path, os.path.basename(file_path), user_id)
316
  results.append({"status": "success", "result": result, "file": file_path})
317
  except Exception as e:
318
  results.append({"status": "error", "error": str(e), "file": file_path})
319
  return results
320
 
 
 
321
  app = Flask(__name__)
322
  processor = EnhancedDocProcessor()
323
 
@@ -331,25 +250,20 @@ def batch_process():
331
  return jsonify({'error': 'No files uploaded'}), 400
332
 
333
  files = request.files.getlist('files[]')
334
- user_id = request.form.get('user_id')
335
 
336
- # Create a temporary directory
337
  with tempfile.TemporaryDirectory() as temp_dir:
338
  file_paths = []
339
  for file in files:
340
  if file.filename.endswith('.pdf'):
341
- # Create a secure filename
342
  secure_name = secure_filename(file.filename)
343
- # Create full path in temporary directory
344
  temp_path = os.path.join(temp_dir, secure_name)
345
  file.save(temp_path)
346
  file_paths.append(temp_path)
347
 
348
  try:
349
- results = processor.process_batch(file_paths, user_id)
350
  except Exception as e:
351
  return jsonify({'error': str(e)}), 500
352
- # No need to manually clean up - TemporaryDirectory does it automatically
353
 
354
  return jsonify(results)
355
 
 
16
  import base64
17
  from werkzeug.utils import secure_filename
18
  import tempfile
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
class PersonIdentifier:
    """Pull a person's name, ID numbers and e-mail address out of raw text.

    All extraction is regex based. The pattern attributes are plain strings
    so they can be inspected or overridden on an instance.
    """

    def __init__(self):
        # Ordered from most to least specific; the first pattern that matches
        # anywhere in the text decides the name.
        self.name_patterns = [
            r'(?:Mr\.|Mrs\.|Ms\.|Dr\.)\s+([A-Z][a-z]+\s+[A-Z][a-z]+)',
            r'Name:?\s*([A-Z][a-z]+\s+[A-Z][a-z]+)',
            r'([A-Z][a-z]+\s+[A-Z][a-z]+)'
        ]
        # The digit look-arounds stop a pattern from matching a fragment of a
        # longer number: without them the 8-character licence pattern also
        # matches the head of every 9-character passport number, and the SSN
        # pattern matches inside longer digit runs.
        self.id_patterns = {
            'ssn': r'(?<!\d)(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}(?!\d)',
            'drivers_license': r'[A-Z]\d{7}(?!\d)',
            'passport': r'[A-Z]\d{8}(?!\d)',
        }
        self.email_pattern = r'[\w\.-]+@[\w\.-]+\.\w+'

    def identify_person(self, text: str) -> Dict:
        """Return the first name, ID numbers and e-mail found in *text*.

        Args:
            text: Raw document text to scan.

        Returns:
            A dict with keys ``'name'`` (str or None), ``'id_numbers'``
            (dict mapping an ID type from ``id_patterns`` to the first
            matching string; types with no match are omitted) and
            ``'email'`` (str or None).
        """
        person_data = {
            'name': None,
            'id_numbers': {},
            'email': None
        }

        # Name: every pattern has exactly one capture group holding the name.
        for pattern in self.name_patterns:
            match = re.search(pattern, text)
            if match:
                person_data['name'] = match.group(1)
                break

        # IDs: keep the first occurrence of each ID type.
        for id_type, pattern in self.id_patterns.items():
            match = re.search(pattern, text)
            if match:
                person_data['id_numbers'][id_type] = match.group(0)

        # E-mail: first address-looking token.
        match = re.search(self.email_pattern, text)
        if match:
            person_data['email'] = match.group(0)

        return person_data
60
 
61
  class MLDocumentClassifier:
62
  def __init__(self):
63
+ self.labels = [
64
+ 'BankApplication_CreditCard',
65
+ 'BankApplication_SavingsAccount',
66
+ 'ID_DriversLicense',
67
+ 'ID_Passport',
68
+ 'ID_StateID',
69
+ 'Financial_PayStub',
70
+ 'Financial_TaxReturn',
71
+ 'Financial_IncomeStatement',
72
+ 'Receipt'
73
+ ]
74
  self.classifier = Pipeline([
75
  ('tfidf', TfidfVectorizer(ngram_range=(1, 2), stop_words='english', max_features=10000)),
76
  ('clf', MultinomialNB())
 
83
  def _rule_based_classify(self, text):
84
  text_lower = text.lower()
85
  rules = [
86
+ ('BankApplication_CreditCard', ['credit card application', 'card request', 'new card']),
87
+ ('BankApplication_SavingsAccount', ['savings account', 'open account', 'new account']),
88
+ ('ID_DriversLicense', ['driver license', 'driving permit', 'operator license']),
89
+ ('ID_Passport', ['passport', 'travel document']),
90
+ ('ID_StateID', ['state id', 'identification card']),
91
+ ('Financial_PayStub', ['pay stub', 'salary', 'wages']),
92
+ ('Financial_TaxReturn', ['tax return', 'form 1040', 'tax year']),
93
+ ('Financial_IncomeStatement', ['income statement', 'earnings report']),
94
+ ('Receipt', ['receipt', 'payment received', 'transaction record'])
95
  ]
96
 
97
  scores = []
 
107
  self.conn = sqlite3.connect(':memory:', check_same_thread=False)
108
  self.setup_database()
109
  self.classifier = MLDocumentClassifier()
110
+ self.person_identifier = PersonIdentifier()
111
 
112
  def setup_database(self):
113
  self.conn.executescript('''
114
+ CREATE TABLE IF NOT EXISTS persons (
115
+ id INTEGER PRIMARY KEY,
116
+ name TEXT,
117
+ email TEXT,
118
+ ssn TEXT,
119
+ drivers_license TEXT,
120
+ passport TEXT,
121
+ created_date TEXT
122
+ );
123
+
124
  CREATE TABLE IF NOT EXISTS documents (
125
  id INTEGER PRIMARY KEY,
126
  filename TEXT,
127
  doc_type TEXT,
128
+ person_id INTEGER,
129
  amount REAL,
130
  date TEXT,
131
  account_number TEXT,
132
  raw_text TEXT,
133
  processed_date TEXT,
134
  file_hash TEXT,
135
+ confidence_score REAL,
136
+ FOREIGN KEY (person_id) REFERENCES persons (id)
137
  );
138
 
139
  CREATE TABLE IF NOT EXISTS similar_docs (
 
160
  return f"Error extracting text: {str(e)}"
161
 
162
  def extract_metadata(self, text: str) -> Dict:
163
+ metadata = {
164
  'amount': next((float(amt.replace('$','').replace(',',''))
165
  for amt in re.findall(r'\$[\d,]+\.?\d*', text)), 0.0),
166
  'date': next(iter(re.findall(r'\d{1,2}/\d{1,2}/\d{4}', text)), None),
167
  'account_number': next(iter(re.findall(r'Account\s*#?\s*:?\s*(\d{8,12})', text)), None),
 
168
  }
169
+ return metadata
170
 
171
def get_or_create_person(self, person_data: Dict) -> int:
    """Return the ``persons`` row id for *person_data*, inserting a row if none matches.

    A row matches when its name, e-mail, SSN, driver's licence or passport
    equals the corresponding supplied value. Values that are None never
    match, because ``column = NULL`` is not true in SQL.

    Args:
        person_data: Dict with a required ``'name'`` key and optional
            ``'email'`` and ``'id_numbers'`` keys; ``'id_numbers'`` maps
            ``'ssn'`` / ``'drivers_license'`` / ``'passport'`` to strings.

    Returns:
        The id of the matching row, or of the newly inserted one.
    """
    id_numbers = person_data.get('id_numbers', {})
    # The same five values feed both the lookup and the insert.
    identity = (
        person_data['name'],
        person_data.get('email'),
        id_numbers.get('ssn'),
        id_numbers.get('drivers_license'),
        id_numbers.get('passport'),
    )

    lookup = self.conn.execute(
        'SELECT id FROM persons WHERE name = ? OR email = ? OR ssn = ? OR drivers_license = ? OR passport = ?',
        identity,
    )
    existing = lookup.fetchone()
    if existing:
        return existing[0]

    inserted = self.conn.execute('''
        INSERT INTO persons (name, email, ssn, drivers_license, passport, created_date)
        VALUES (?, ?, ?, ?, ?, ?)
    ''', identity + (datetime.now().isoformat(),))
    self.conn.commit()
    return inserted.lastrowid
197
+
198
+ def process_document(self, pdf_path: str, filename: str) -> Dict:
199
  text = self.extract_text(pdf_path)
200
  doc_type = self.classifier.predict(text)
201
  metadata = self.extract_metadata(text)
202
+ person_data = self.person_identifier.identify_person(text)
203
+ person_id = self.get_or_create_person(person_data)
204
 
205
  cursor = self.conn.execute('''
206
  INSERT INTO documents
207
+ (filename, doc_type, person_id, amount, date,
208
+ account_number, raw_text, processed_date, confidence_score)
209
  VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
210
  ''', (
211
+ filename, doc_type, person_id,
212
  metadata['amount'], metadata['date'],
213
  metadata['account_number'], text,
214
+ datetime.now().isoformat(), 0.85
215
  ))
216
 
217
  doc_id = cursor.lastrowid
 
221
  'id': doc_id,
222
  'filename': filename,
223
  'doc_type': doc_type,
224
+ 'person': person_data,
225
  **metadata
226
  }
227
 
228
def process_batch(self, file_paths: List[str]) -> List[Dict]:
    """Process each path in *file_paths* and report a per-file outcome.

    One failing file does not stop the batch: any ``Exception`` raised by
    ``process_document`` is captured and reported for that file only.

    Args:
        file_paths: Paths of the documents to process, in order.

    Returns:
        One dict per input path, in input order. Successes look like
        ``{"status": "success", "result": <dict>, "file": <path>}`` and
        failures like ``{"status": "error", "error": <message>, "file": <path>}``.
    """
    def _attempt(path: str) -> Dict:
        # The document is registered under its base name, not its full path.
        try:
            outcome = self.process_document(path, os.path.basename(path))
        except Exception as exc:
            return {"status": "error", "error": str(exc), "file": path}
        return {"status": "success", "result": outcome, "file": path}

    return [_attempt(path) for path in file_paths]
237
 
238
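+ # NOTE(review): HTML_TEMPLATE is deleted by this commit (old lines 19-195), so it does not "remain the same"; restore its definition if any route still renders it.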
239
+
240
  app = Flask(__name__)
241
  processor = EnhancedDocProcessor()
242
 
 
250
  return jsonify({'error': 'No files uploaded'}), 400
251
 
252
  files = request.files.getlist('files[]')
 
253
 
 
254
  with tempfile.TemporaryDirectory() as temp_dir:
255
  file_paths = []
256
  for file in files:
257
  if file.filename.endswith('.pdf'):
 
258
  secure_name = secure_filename(file.filename)
 
259
  temp_path = os.path.join(temp_dir, secure_name)
260
  file.save(temp_path)
261
  file_paths.append(temp_path)
262
 
263
  try:
264
+ results = processor.process_batch(file_paths)
265
  except Exception as e:
266
  return jsonify({'error': str(e)}), 500
 
267
 
268
  return jsonify(results)
269