awacke1 committed on
Commit
7124f43
·
verified ·
1 Parent(s): b90cc86

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -35
app.py CHANGED
@@ -24,20 +24,15 @@ def get_timestamp_prefix() -> str:
24
  def nlp_engine_and_registry(model_family: str, model_path: str) -> tuple:
25
  """🤖 Sparks NLP models with a wink!"""
26
  registry = RecognizerRegistry()
 
27
  if model_family.lower() == "flair":
28
  from flair.models import SequenceTagger
29
  tagger = SequenceTagger.load(model_path)
30
- registry.load_predefined_recognizers()
31
- recognizer = PatternRecognizer(supported_entity="CUSTOM", supported_language="en")
32
- registry.add_recognizer(recognizer)
33
  logger.info(f"Flair model loaded: {model_path}")
34
  return tagger, registry
35
  elif model_family.lower() == "huggingface":
36
  from transformers import pipeline
37
  nlp = pipeline("ner", model=model_path, tokenizer=model_path)
38
- registry.load_predefined_recognizers()
39
- recognizer = PatternRecognizer(supported_entity="CUSTOM", supported_language="en")
40
- registry.add_recognizer(recognizer)
41
  logger.info(f"HuggingFace model loaded: {model_path}")
42
  return nlp, registry
43
  raise ValueError(f"Model family {model_family} unsupported")
@@ -84,44 +79,31 @@ def save_pdf(pdf_input) -> str:
84
  logger.error(f"Upload rejected: {pdf_input.name} exceeds 200MB")
85
  st.error("PDF exceeds 200MB limit")
86
  raise ValueError("PDF too big")
87
- try:
88
- with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf", dir="/tmp") as tmp:
89
- tmp.write(pdf_input.read())
90
- logger.info(f"Uploaded PDF to {tmp.name}, size: {pdf_input.size} bytes")
91
- return tmp.name
92
- except Exception as e:
93
- logger.error(f"Upload failed: {str(e)}")
94
- st.error(f"Upload error: {str(e)}")
95
- raise
96
 
97
  # Feature Spotlight: 📄 PDF Wizardry Unleashed!
98
  # Uploads zip through, PHI vanishes, and out pops a safe PDF with timestamp pizzazz! ✨
99
 
100
  def read_pdf(pdf_path: str) -> str:
101
  """📖 Gobbles PDF text like candy!"""
102
- try:
103
- reader = PdfReader(pdf_path)
104
- text = "".join(page.extract_text() or "" + "\n" for page in reader.pages)
105
- logger.info(f"Extracted {len(text)} chars from {pdf_path}")
106
- return text
107
- except Exception as e:
108
- logger.error(f"Read failed: {str(e)}")
109
- raise
110
 
111
  def create_pdf(text: str, input_path: str, output_filename: str) -> str:
112
  """🖨️ Spins a new PDF with PHI-proof charm!"""
113
- try:
114
- reader = PdfReader(input_path)
115
- writer = PdfWriter()
116
- for page in reader.pages:
117
- writer.add_page(page)
118
- with open(output_filename, "wb") as f:
119
- writer.write(f)
120
- logger.info(f"Created PDF: {output_filename}")
121
- return output_filename
122
- except Exception as e:
123
- logger.error(f"Create failed: {str(e)}")
124
- raise
125
 
126
  # Sidebar
127
  st.sidebar.header("PHI De-identification with Presidio")
 
24
  def nlp_engine_and_registry(model_family: str, model_path: str) -> tuple:
25
  """🤖 Sparks NLP models with a wink!"""
26
  registry = RecognizerRegistry()
27
+ registry.load_predefined_recognizers()
28
  if model_family.lower() == "flair":
29
  from flair.models import SequenceTagger
30
  tagger = SequenceTagger.load(model_path)
 
 
 
31
  logger.info(f"Flair model loaded: {model_path}")
32
  return tagger, registry
33
  elif model_family.lower() == "huggingface":
34
  from transformers import pipeline
35
  nlp = pipeline("ner", model=model_path, tokenizer=model_path)
 
 
 
36
  logger.info(f"HuggingFace model loaded: {model_path}")
37
  return nlp, registry
38
  raise ValueError(f"Model family {model_family} unsupported")
 
79
  logger.error(f"Upload rejected: {pdf_input.name} exceeds 200MB")
80
  st.error("PDF exceeds 200MB limit")
81
  raise ValueError("PDF too big")
82
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf", dir="/tmp") as tmp:
83
+ tmp.write(pdf_input.read())
84
+ logger.info(f"Uploaded PDF to {tmp.name}, size: {pdf_input.size} bytes")
85
+ return tmp.name
 
 
 
 
 
86
 
87
  # Feature Spotlight: 📄 PDF Wizardry Unleashed!
88
  # Uploads zip through, PHI vanishes, and out pops a safe PDF with timestamp pizzazz! ✨
89
 
90
  def read_pdf(pdf_path: str) -> str:
91
  """📖 Gobbles PDF text like candy!"""
92
+ reader = PdfReader(pdf_path)
93
+ text = "".join(page.extract_text() or "" + "\n" for page in reader.pages)
94
+ logger.info(f"Extracted {len(text)} chars from {pdf_path}")
95
+ return text
 
 
 
 
96
 
97
  def create_pdf(text: str, input_path: str, output_filename: str) -> str:
98
  """🖨️ Spins a new PDF with PHI-proof charm!"""
99
+ reader = PdfReader(input_path)
100
+ writer = PdfWriter()
101
+ for page in reader.pages:
102
+ writer.add_page(page)
103
+ with open(output_filename, "wb") as f:
104
+ writer.write(f)
105
+ logger.info(f"Created PDF: {output_filename}")
106
+ return output_filename
 
 
 
 
107
 
108
  # Sidebar
109
  st.sidebar.header("PHI De-identification with Presidio")