Spaces:

sanjaymalladi
/

document-processor

Build error

App Files Files Community

Sanjay malladi committed on Dec 27, 2024

Commit

354a803

verified ·

1 Parent(s): 6f2eed6

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -15

app.py CHANGED Viewed

@@ -20,9 +20,10 @@ import tempfile
 class PersonIdentifier:
     def __init__(self):
         self.name_patterns = [
-            r'(?:Mr\.|Mrs\.|Ms\.|Dr\.)\s+([A-Z][a-z]+\s+[A-Z][a-z]+)',
-            r'Name:?\s*([A-Z][a-z]+\s+[A-Z][a-z]+)',
-            r'([A-Z][a-z]+\s+[A-Z][a-z]+)'
         ]
         self.id_patterns = {
             'ssn': r'(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}',
@@ -38,11 +39,11 @@ class PersonIdentifier:
             'email': None
         }
-        # Extract name
         for pattern in self.name_patterns:
             names = re.findall(pattern, text)
             if names:
-                person_data['name'] = names[0]
                 break
         # Extract IDs
@@ -61,6 +62,7 @@ class PersonIdentifier:
 class MLDocumentClassifier:
     def __init__(self):
         self.labels = [
             'BankApplication_CreditCard',
             'BankApplication_SavingsAccount',
             'ID_DriversLicense',
@@ -71,17 +73,17 @@ class MLDocumentClassifier:
             'Financial_IncomeStatement',
             'Receipt'
         ]
-        self.classifier = Pipeline([
-            ('tfidf', TfidfVectorizer(ngram_range=(1, 2), stop_words='english', max_features=10000)),
-            ('clf', MultinomialNB())
-        ])
-        self.is_trained = False
     def predict(self, text):
         return self._rule_based_classify(text)
     def _rule_based_classify(self, text):
         text_lower = text.lower()
         rules = [
             ('BankApplication_CreditCard', ['credit card application', 'card request', 'new card']),
             ('BankApplication_SavingsAccount', ['savings account', 'open account', 'new account']),
@@ -94,13 +96,17 @@ class MLDocumentClassifier:
             ('Receipt', ['receipt', 'payment received', 'transaction record'])
         ]
-        scores = []
         for doc_type, keywords in rules:
             score = sum(1 for keyword in keywords if keyword in text_lower)
-            scores.append((doc_type, score / len(keywords) if keywords else 0))
-        scores.sort(key=lambda x: x[1], reverse=True)
-        return scores[0][0]
 class EnhancedDocProcessor:
     def __init__(self):

 class PersonIdentifier:
     def __init__(self):
         self.name_patterns = [
+            r'(?:Mr\.|Mrs\.|Ms\.|Dr\.)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)',  # Titles with names
+            r'Name:?\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)',                     # Names with "Name:" prefix
+            r'(?m)^([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)$',                       # Names on their own line
+            r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)'                              # General names
         ]
         self.id_patterns = {
             'ssn': r'(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}',
             'email': None
         }
+        # Extract name with improved patterns
         for pattern in self.name_patterns:
             names = re.findall(pattern, text)
             if names:
+                person_data['name'] = names[0].strip()
                 break
         # Extract IDs
 class MLDocumentClassifier:
     def __init__(self):
         self.labels = [
+            'Invoice',
             'BankApplication_CreditCard',
             'BankApplication_SavingsAccount',
             'ID_DriversLicense',
             'Financial_IncomeStatement',
             'Receipt'
         ]
     def predict(self, text):
         return self._rule_based_classify(text)
     def _rule_based_classify(self, text):
         text_lower = text.lower()
+        # Primary document indicators (strong signals)
+        if 'invoice' in text_lower or 'inv-' in text_lower:
+            return 'Invoice'
         rules = [
             ('BankApplication_CreditCard', ['credit card application', 'card request', 'new card']),
             ('BankApplication_SavingsAccount', ['savings account', 'open account', 'new account']),
             ('Receipt', ['receipt', 'payment received', 'transaction record'])
         ]
+        max_score = 0
+        best_type = 'Unknown'
         for doc_type, keywords in rules:
             score = sum(1 for keyword in keywords if keyword in text_lower)
+            weighted_score = score / len(keywords) if keywords else 0
+            if weighted_score > max_score:
+                max_score = weighted_score
+                best_type = doc_type
+        return best_type
 class EnhancedDocProcessor:
     def __init__(self):