Sanjay malladi committed on
Commit
354a803
·
verified ·
1 Parent(s): 6f2eed6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -15
app.py CHANGED
@@ -20,9 +20,10 @@ import tempfile
20
  class PersonIdentifier:
21
  def __init__(self):
22
  self.name_patterns = [
23
- r'(?:Mr\.|Mrs\.|Ms\.|Dr\.)\s+([A-Z][a-z]+\s+[A-Z][a-z]+)',
24
- r'Name:?\s*([A-Z][a-z]+\s+[A-Z][a-z]+)',
25
- r'([A-Z][a-z]+\s+[A-Z][a-z]+)'
 
26
  ]
27
  self.id_patterns = {
28
  'ssn': r'(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}',
@@ -38,11 +39,11 @@ class PersonIdentifier:
38
  'email': None
39
  }
40
 
41
- # Extract name
42
  for pattern in self.name_patterns:
43
  names = re.findall(pattern, text)
44
  if names:
45
- person_data['name'] = names[0]
46
  break
47
 
48
  # Extract IDs
@@ -61,6 +62,7 @@ class PersonIdentifier:
61
  class MLDocumentClassifier:
62
  def __init__(self):
63
  self.labels = [
 
64
  'BankApplication_CreditCard',
65
  'BankApplication_SavingsAccount',
66
  'ID_DriversLicense',
@@ -71,17 +73,17 @@ class MLDocumentClassifier:
71
  'Financial_IncomeStatement',
72
  'Receipt'
73
  ]
74
- self.classifier = Pipeline([
75
- ('tfidf', TfidfVectorizer(ngram_range=(1, 2), stop_words='english', max_features=10000)),
76
- ('clf', MultinomialNB())
77
- ])
78
- self.is_trained = False
79
 
80
  def predict(self, text):
81
  return self._rule_based_classify(text)
82
 
83
  def _rule_based_classify(self, text):
84
  text_lower = text.lower()
 
 
 
 
 
85
  rules = [
86
  ('BankApplication_CreditCard', ['credit card application', 'card request', 'new card']),
87
  ('BankApplication_SavingsAccount', ['savings account', 'open account', 'new account']),
@@ -94,13 +96,17 @@ class MLDocumentClassifier:
94
  ('Receipt', ['receipt', 'payment received', 'transaction record'])
95
  ]
96
 
97
- scores = []
 
 
98
  for doc_type, keywords in rules:
99
  score = sum(1 for keyword in keywords if keyword in text_lower)
100
- scores.append((doc_type, score / len(keywords) if keywords else 0))
101
-
102
- scores.sort(key=lambda x: x[1], reverse=True)
103
- return scores[0][0]
 
 
104
 
105
  class EnhancedDocProcessor:
106
  def __init__(self):
 
20
  class PersonIdentifier:
21
  def __init__(self):
22
  self.name_patterns = [
23
+ r'(?:Mr\.|Mrs\.|Ms\.|Dr\.)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)', # Titles with names
24
+ r'Name:?\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)', # Names with "Name:" prefix
25
+ r'(?m)^([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)$', # Names on their own line
26
+ r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)' # General names
27
  ]
28
  self.id_patterns = {
29
  'ssn': r'(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}',
 
39
  'email': None
40
  }
41
 
42
+ # Extract name with improved patterns
43
  for pattern in self.name_patterns:
44
  names = re.findall(pattern, text)
45
  if names:
46
+ person_data['name'] = names[0].strip()
47
  break
48
 
49
  # Extract IDs
 
62
  class MLDocumentClassifier:
63
  def __init__(self):
64
  self.labels = [
65
+ 'Invoice',
66
  'BankApplication_CreditCard',
67
  'BankApplication_SavingsAccount',
68
  'ID_DriversLicense',
 
73
  'Financial_IncomeStatement',
74
  'Receipt'
75
  ]
 
 
 
 
 
76
 
77
  def predict(self, text):
78
  return self._rule_based_classify(text)
79
 
80
  def _rule_based_classify(self, text):
81
  text_lower = text.lower()
82
+
83
+ # Primary document indicators (strong signals)
84
+ if 'invoice' in text_lower or 'inv-' in text_lower:
85
+ return 'Invoice'
86
+
87
  rules = [
88
  ('BankApplication_CreditCard', ['credit card application', 'card request', 'new card']),
89
  ('BankApplication_SavingsAccount', ['savings account', 'open account', 'new account']),
 
96
  ('Receipt', ['receipt', 'payment received', 'transaction record'])
97
  ]
98
 
99
+ max_score = 0
100
+ best_type = 'Unknown'
101
+
102
  for doc_type, keywords in rules:
103
  score = sum(1 for keyword in keywords if keyword in text_lower)
104
+ weighted_score = score / len(keywords) if keywords else 0
105
+ if weighted_score > max_score:
106
+ max_score = weighted_score
107
+ best_type = doc_type
108
+
109
+ return best_type
110
 
111
  class EnhancedDocProcessor:
112
  def __init__(self):