sanjaymalladi committed on
Commit
14e3f2f
·
1 Parent(s): 0d5fa1f

Initial commit

Browse files
Files changed (4) hide show
  1. .gitignore +11 -0
  2. Dockerfile +10 -0
  3. app.py +357 -0
  4. requirments.txt +6 -0
.gitignore ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ *.pyd
5
+ .Python
6
+ env/
7
+ venv/
8
+ .env
9
+ *.db
10
+ document_store/
11
+ temp_*
Dockerfile ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.9-slim

WORKDIR /code

# The dependency list is committed under the name "requirments.txt" (sic).
# Reference it by that exact name so the build does not fail on a missing
# file; update both lines together if the file is ever renamed.
COPY requirments.txt .
RUN pip install --no-cache-dir -r requirments.txt

COPY . .

# Serve the Flask object `app` from app.py on port 7860.
CMD ["gunicorn", "-b", "0.0.0.0:7860", "app:app"]
app.py ADDED
@@ -0,0 +1,357 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ from flask import Flask, request, jsonify, render_template_string
3
+ import PyPDF2
4
+ import sqlite3
5
+ from datetime import datetime
6
+ import re
7
+ import os
8
+ import hashlib
9
+ from typing import List, Dict
10
+ import shutil
11
+ from sklearn.pipeline import Pipeline
12
+ from sklearn.feature_extraction.text import TfidfVectorizer
13
+ from sklearn.naive_bayes import MultinomialNB
14
+ import numpy as np
15
+ import joblib
16
+ import base64
17
+
18
+ # HTML template with embedded JavaScript
19
+ HTML_TEMPLATE = """
20
+ <!DOCTYPE html>
21
+ <html lang="en">
22
+ <head>
23
+ <meta charset="UTF-8">
24
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
25
+ <title>Document Processor</title>
26
+ <script src="https://cdn.tailwindcss.com"></script>
27
+ <style>
28
+ /* Additional custom styles can go here */
29
+ .processing {
30
+ animation: pulse 2s infinite;
31
+ }
32
+ @keyframes pulse {
33
+ 0% { opacity: 1; }
34
+ 50% { opacity: 0.5; }
35
+ 100% { opacity: 1; }
36
+ }
37
+ </style>
38
+ </head>
39
+ <body class="bg-gray-50">
40
+ <div class="container mx-auto p-6 max-w-4xl">
41
+ <div class="mb-8">
42
+ <h1 class="text-3xl font-bold mb-2">Smart Document Processor</h1>
43
+ <p class="text-gray-600">Upload and analyze PDF documents with AI</p>
44
+ </div>
45
+
46
+ <!-- Upload Section -->
47
+ <div class="mb-8">
48
+ <div id="dropZone" class="border-2 border-dashed border-gray-300 rounded-lg p-8 text-center hover:border-blue-500 transition-colors">
49
+ <input type="file" multiple accept=".pdf" id="fileInput" class="hidden">
50
+ <div class="cursor-pointer">
51
+ <svg class="w-12 h-12 text-gray-400 mx-auto mb-4" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke="currentColor">
52
+ <path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M7 16a4 4 0 01-.88-7.903A5 5 0 1115.9 6L16 6a5 5 0 011 9.9M15 13l-3-3m0 0l-3 3m3-3v12"/>
53
+ </svg>
54
+ <span class="text-lg mb-2 block">Drop PDFs here or click to upload</span>
55
+ <span class="text-sm text-gray-500">Supports multiple files</span>
56
+ </div>
57
+ </div>
58
+ </div>
59
+
60
+ <!-- File List -->
61
+ <div id="fileList" class="mb-8 hidden">
62
+ <h2 class="text-xl font-semibold mb-4">Selected Files</h2>
63
+ <div id="fileListContent" class="space-y-2"></div>
64
+ <button id="processButton" class="mt-4 bg-blue-600 text-white px-6 py-2 rounded-lg hover:bg-blue-700 disabled:opacity-50">
65
+ Process Documents
66
+ </button>
67
+ </div>
68
+
69
+ <!-- Results Section -->
70
+ <div id="results" class="space-y-4"></div>
71
+
72
+ <!-- Error Alert -->
73
+ <div id="error" class="hidden mt-4 bg-red-100 border border-red-400 text-red-700 px-4 py-3 rounded"></div>
74
+ </div>
75
+
76
+ <script>
77
+ let files = [];
78
+ const dropZone = document.getElementById('dropZone');
79
+ const fileInput = document.getElementById('fileInput');
80
+ const fileList = document.getElementById('fileList');
81
+ const fileListContent = document.getElementById('fileListContent');
82
+ const processButton = document.getElementById('processButton');
83
+ const resultsDiv = document.getElementById('results');
84
+ const errorDiv = document.getElementById('error');
85
+
86
+ // Drag and drop handlers
87
+ dropZone.addEventListener('dragover', (e) => {
88
+ e.preventDefault();
89
+ dropZone.classList.add('border-blue-500');
90
+ });
91
+
92
+ dropZone.addEventListener('dragleave', () => {
93
+ dropZone.classList.remove('border-blue-500');
94
+ });
95
+
96
+ dropZone.addEventListener('drop', (e) => {
97
+ e.preventDefault();
98
+ dropZone.classList.remove('border-blue-500');
99
+ handleFiles(e.dataTransfer.files);
100
+ });
101
+
102
+ dropZone.addEventListener('click', () => {
103
+ fileInput.click();
104
+ });
105
+
106
+ fileInput.addEventListener('change', (e) => {
107
+ handleFiles(e.target.files);
108
+ });
109
+
110
function handleFiles(uploadedFiles) {
    // Keep only entries whose name ends in ".pdf" (case-insensitive); the
    // new selection replaces any previous one.
    const isPdf = (candidate) => candidate.name.toLowerCase().endsWith('.pdf');
    files = Array.from(uploadedFiles).filter(isPdf);
    updateFileList();
}
114
+
115
function updateFileList() {
    // File names are user-controlled, so they must be escaped before being
    // interpolated into innerHTML. Round-tripping through textContent lets
    // the browser do the escaping of &, < and >.
    const escapeHtml = (value) => {
        const holder = document.createElement('div');
        holder.textContent = String(value);
        return holder.innerHTML;
    };

    // Nothing selected: hide the whole "Selected Files" section.
    if (files.length === 0) {
        fileList.classList.add('hidden');
        return;
    }

    fileList.classList.remove('hidden');
    fileListContent.innerHTML = files.map((file) => `
        <div class="flex items-center p-3 bg-gray-50 rounded">
            <svg class="w-5 h-5 text-gray-500 mr-3" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke="currentColor">
                <path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M9 12h6m-6 4h6m2 5H7a2 2 0 01-2-2V5a2 2 0 012-2h5.586a1 1 0 01.707.293l5.414 5.414a1 1 0 01.293.707V19a2 2 0 01-2 2z"/>
            </svg>
            <span>${escapeHtml(file.name)}</span>
        </div>
    `).join('');
}
130
+
131
// Upload the selected PDFs to /batch_process and render the outcome.
processButton.addEventListener('click', async () => {
    if (files.length === 0) return;

    const genericMessage = 'Failed to process documents. Please try again.';

    // Show a spinner and block double submission while the request runs.
    processButton.disabled = true;
    processButton.innerHTML = `
        <svg class="animate-spin -ml-1 mr-3 h-5 w-5 text-white inline" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24">
            <circle class="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" stroke-width="4"></circle>
            <path class="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4zm2 5.291A7.962 7.962 0 014 12H0c0 3.042 1.135 5.824 3 7.938l3-2.647z"></path>
        </svg>
        Processing...
    `;

    const formData = new FormData();
    files.forEach((file) => {
        formData.append('files[]', file);
    });

    let failureMessage = null;
    try {
        const response = await fetch('/batch_process', {
            method: 'POST',
            body: formData
        });
        const data = await response.json();

        // The server reports failures as a non-2xx status with an object
        // carrying an "error" field; only an array is a valid result list.
        if (response.ok && Array.isArray(data)) {
            displayResults(data);
        } else {
            failureMessage = (data && data.error) ? String(data.error) : genericMessage;
        }
    } catch (error) {
        // Network failure, non-JSON body, or a rendering error.
        failureMessage = genericMessage;
    } finally {
        processButton.disabled = false;
        processButton.textContent = 'Process Documents';
    }

    if (failureMessage === null) {
        errorDiv.classList.add('hidden');
    } else {
        // textContent (not innerHTML) keeps server-provided text inert.
        errorDiv.textContent = failureMessage;
        errorDiv.classList.remove('hidden');
    }
});
165
+
166
function displayResults(results) {
    // Every displayed field comes from the server, and some of it is text
    // extracted from uploaded PDFs, so all of it is escaped before being
    // placed into innerHTML.
    const escapeHtml = (value) => {
        const holder = document.createElement('div');
        holder.textContent = String(value);
        return holder.innerHTML;
    };

    // A zero or missing amount is shown as N/A, as is any non-numeric value.
    const formatAmount = (amount) =>
        (typeof amount === 'number' && amount) ? '$' + amount.toFixed(2) : 'N/A';

    resultsDiv.innerHTML = results.map((entry) => {
        // Failed entries carry "error" and "file" but no "result" object;
        // render them as an error card instead of throwing and losing the
        // rest of the batch.
        if (entry.status !== 'success' || !entry.result) {
            return `
                <div class="border border-red-300 rounded-lg p-4 bg-red-50 shadow-sm">
                    <h3 class="font-medium mb-2">${escapeHtml(entry.file || 'Unknown file')}</h3>
                    <div class="text-red-700">${escapeHtml(entry.error || 'Processing failed')}</div>
                </div>
            `;
        }

        const doc = entry.result;
        return `
            <div class="border rounded-lg p-4 bg-white shadow-sm">
                <h3 class="font-medium mb-2">${escapeHtml(doc.filename)}</h3>
                <div class="grid grid-cols-2 gap-4">
                    <div>
                        <span class="text-gray-600">Type:</span>
                        <span class="ml-2">${escapeHtml(doc.doc_type)}</span>
                    </div>
                    <div>
                        <span class="text-gray-600">Date:</span>
                        <span class="ml-2">${escapeHtml(doc.date || 'N/A')}</span>
                    </div>
                    <div>
                        <span class="text-gray-600">Amount:</span>
                        <span class="ml-2">${formatAmount(doc.amount)}</span>
                    </div>
                    <div>
                        <span class="text-gray-600">Person:</span>
                        <span class="ml-2">${escapeHtml(doc.person_name)}</span>
                    </div>
                </div>
            </div>
        `;
    }).join('');
}
191
+ </script>
192
+ </body>
193
+ </html>
194
+ """
195
+
196
class MLDocumentClassifier:
    """Assign a coarse document type to extracted text.

    A TF-IDF + multinomial naive Bayes pipeline is constructed in
    ``__init__`` but nothing in this class fits it, so ``predict`` always
    delegates to the keyword rules in ``_rule_based_classify``.
    """

    def __init__(self):
        self.labels = ['Invoice', 'Statement', 'Contract', 'Receipt', 'Report', 'Letter', 'Form']
        self.classifier = Pipeline([
            ('tfidf', TfidfVectorizer(ngram_range=(1, 2), stop_words='english', max_features=10000)),
            ('clf', MultinomialNB())
        ])
        self.is_trained = False

    def predict(self, text):
        """Return the document type for ``text`` using the keyword rules."""
        return self._rule_based_classify(text)

    def _rule_based_classify(self, text):
        """Score each document type by the fraction of its keywords found.

        Keywords are matched case-insensitively as whole words or phrases
        (an optional trailing "s" is accepted), so 'form' no longer matches
        inside 'information'. Ties go to the type listed first.

        Returns:
            The best-scoring label, or ``'Unknown'`` when ``text`` is empty,
            ``None``, or contains none of the keywords.
        """
        text_lower = (text or '').lower()
        rules = [
            ('Invoice', ['invoice', 'bill', 'payment due', 'amount due']),
            ('Statement', ['statement', 'balance', 'transaction history']),
            ('Contract', ['contract', 'agreement', 'terms and conditions']),
            ('Receipt', ['receipt', 'purchased', 'payment received']),
            ('Report', ['report', 'analysis', 'findings']),
            ('Letter', ['dear', 'sincerely', 'regards']),
            ('Form', ['form', 'please fill', 'application'])
        ]

        best_label, best_score = 'Unknown', 0.0
        for doc_type, keywords in rules:
            hits = sum(
                1 for keyword in keywords
                if re.search(rf'\b{re.escape(keyword)}s?\b', text_lower)
            )
            score = hits / len(keywords) if keywords else 0
            # Strict comparison keeps the earliest rule on ties and leaves
            # 'Unknown' in place when nothing matched at all.
            if score > best_score:
                best_label, best_score = doc_type, score
        return best_label
227
+
228
class EnhancedDocProcessor:
    """Extract text and simple metadata from PDFs and record them in SQLite.

    The database is an in-memory SQLite connection created per instance, so
    stored rows do not outlive the object.
    """

    def __init__(self):
        # check_same_thread=False lets the one connection be used from
        # whichever thread handles a request.
        self.conn = sqlite3.connect(':memory:', check_same_thread=False)
        self.setup_database()
        self.classifier = MLDocumentClassifier()

    def setup_database(self):
        """Create the ``documents`` and ``similar_docs`` tables if missing."""
        self.conn.executescript('''
            CREATE TABLE IF NOT EXISTS documents (
                id INTEGER PRIMARY KEY,
                filename TEXT,
                doc_type TEXT,
                person_name TEXT,
                amount REAL,
                date TEXT,
                account_number TEXT,
                raw_text TEXT,
                processed_date TEXT,
                file_hash TEXT,
                version INTEGER,
                user_id TEXT
            );

            CREATE TABLE IF NOT EXISTS similar_docs (
                doc_id INTEGER,
                similar_doc_id INTEGER,
                similarity_score REAL,
                FOREIGN KEY (doc_id) REFERENCES documents (id),
                FOREIGN KEY (similar_doc_id) REFERENCES documents (id)
            );
        ''')
        self.conn.commit()

    def extract_text(self, pdf_path: str) -> str:
        """Return the text of every page of the PDF at ``pdf_path``.

        Pages that yield no text are skipped; the rest are joined with
        newlines. This method does not raise: on any failure it returns a
        string starting with ``"Error extracting text: "``.
        """
        try:
            with open(pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                page_texts = (page.extract_text() for page in reader.pages)
                return "\n".join(text for text in page_texts if text)
        except Exception as e:
            return f"Error extracting text: {str(e)}"

    def extract_metadata(self, text: str) -> Dict:
        """Pull the first amount, date, account number and name from ``text``.

        Returns:
            A dict with keys ``amount`` (float, ``0.0`` if none found),
            ``date`` (first ``D/M/YYYY``-shaped string or ``None``),
            ``account_number`` (8-12 digits after "Account" or ``None``) and
            ``person_name`` (two capitalised words after a title, or
            ``"Unknown"``).
        """
        amount = 0.0
        for raw_amount in re.findall(r'\$[\d,]+\.?\d*', text):
            # The pattern can match digit-less fragments such as "$," which
            # float() rejects; skip those instead of failing the document.
            try:
                amount = float(raw_amount.replace('$', '').replace(',', ''))
            except ValueError:
                continue
            break

        dates = re.findall(r'\d{1,2}/\d{1,2}/\d{4}', text)
        accounts = re.findall(r'Account\s*#?\s*:?\s*(\d{8,12})', text)
        names = re.findall(r'(?:Mr\.|Mrs\.|Ms\.|Dr\.)\s+([A-Z][a-z]+\s+[A-Z][a-z]+)', text)

        return {
            'amount': amount,
            'date': dates[0] if dates else None,
            'account_number': accounts[0] if accounts else None,
            'person_name': names[0] if names else "Unknown",
        }

    def process_document(self, pdf_path: str, filename: str, user_id: str = None) -> Dict:
        """Extract, classify and store one PDF.

        Args:
            pdf_path: Location of the PDF on disk.
            filename: Name recorded in the database and returned to callers.
            user_id: Optional owner identifier stored with the row.

        Returns:
            A dict with ``id`` (the new row id), ``filename``, ``doc_type``
            and the keys produced by ``extract_metadata``.
        """
        text = self.extract_text(pdf_path)
        doc_type = self.classifier.predict(text)
        metadata = self.extract_metadata(text)

        # The connection context manager commits on success and rolls back
        # if the insert raises, so a failure cannot leave a transaction open.
        with self.conn:
            cursor = self.conn.execute('''
                INSERT INTO documents
                (filename, doc_type, person_name, amount, date,
                 account_number, raw_text, processed_date, user_id)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                filename, doc_type, metadata['person_name'],
                metadata['amount'], metadata['date'],
                metadata['account_number'], text,
                datetime.now().isoformat(), user_id
            ))
            doc_id = cursor.lastrowid

        return {
            'id': doc_id,
            'filename': filename,
            'doc_type': doc_type,
            **metadata
        }

    def process_batch(self, file_paths: List[str], user_id: str = None) -> List[Dict]:
        """Process each path independently and report per-file outcomes.

        Returns:
            One dict per input path, in order: ``{"status": "success",
            "result": ..., "file": path}`` or ``{"status": "error",
            "error": message, "file": path}``. The base name of each path is
            used as the document's filename.
        """
        results = []
        for file_path in file_paths:
            try:
                result = self.process_document(file_path, os.path.basename(file_path), user_id)
            except Exception as e:
                results.append({"status": "error", "error": str(e), "file": file_path})
            else:
                results.append({"status": "success", "result": result, "file": file_path})
        return results
319
+
320
+ app = Flask(__name__)
321
+ processor = EnhancedDocProcessor()
322
+
323
@app.route('/')
def index():
    """Serve the single-page upload UI rendered from HTML_TEMPLATE."""
    page = render_template_string(HTML_TEMPLATE)
    return page
326
+
327
@app.route('/batch_process', methods=['POST'])
def batch_process():
    """Process the PDFs uploaded under the ``files[]`` form field.

    Uploads whose name does not end in ``.pdf`` (case-insensitive) are
    ignored. An optional ``user_id`` form field is passed to the processor.

    Returns:
        A JSON list with one entry per processed PDF, as produced by
        ``processor.process_batch`` with ``file`` reduced to the uploaded
        file's base name; ``{"error": ...}`` with status 400 when no
        ``files[]`` field is present, or with status 500 on failure.
    """
    import tempfile

    if 'files[]' not in request.files:
        return jsonify({'error': 'No files uploaded'}), 400

    uploads = request.files.getlist('files[]')
    user_id = request.form.get('user_id')

    try:
        # A private temporary directory keeps concurrent requests apart and
        # is removed automatically, even when processing fails.
        with tempfile.TemporaryDirectory(prefix='temp_') as workdir:
            file_paths = []
            for position, upload in enumerate(uploads):
                # The client controls the file name; keep only its final
                # path component so it cannot point outside the work dir.
                safe_name = os.path.basename((upload.filename or '').replace('\\', '/'))
                if not safe_name.lower().endswith('.pdf'):
                    continue

                # One sub-directory per upload so that equal names within a
                # request do not overwrite each other.
                slot = os.path.join(workdir, str(position))
                os.mkdir(slot)
                temp_path = os.path.join(slot, safe_name)
                upload.save(temp_path)
                file_paths.append(temp_path)

            results = processor.process_batch(file_paths, user_id)
    except Exception as e:
        return jsonify({'error': str(e)}), 500

    # Report the uploaded name rather than the server-side temporary path.
    for entry in results:
        entry['file'] = os.path.basename(entry['file'])

    return jsonify(results)
355
+
356
if __name__ == '__main__':
    # The Werkzeug debugger allows arbitrary code execution from the browser,
    # so it must not be on by default while listening on all interfaces.
    # Opt in explicitly with FLASK_DEBUG=1 for local development.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true', 'yes')
    app.run(host='0.0.0.0', port=7860, debug=debug_enabled)
requirments.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ flask==2.0.1
2
+ PyPDF2==3.0.1
3
+ scikit-learn==1.0.2
4
+ numpy==1.21.2
5
+ joblib==1.1.0
6
+ gunicorn==20.1.0