nuojohnchen committed on
Commit
3ff06ea
·
verified ·
1 Parent(s): d358536

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -6
app.py CHANGED
@@ -5,7 +5,8 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
5
  import torch
6
  from io import BytesIO
7
  from PIL import Image
8
- from pdf2image import convert_from_bytes
 
9
  from transformers import NougatProcessor, VisionEncoderDecoderModel
10
 
11
  # Set environment variables
@@ -92,11 +93,18 @@ def extract_text_from_pdf(pdf_bytes):
92
  # Load Nougat model
93
  processor, model = load_nougat_model()
94
 
95
- # Convert PDF to images
96
- images = convert_from_bytes(pdf_bytes)
97
  full_text = ""
98
 
99
- for img in images:
 
 
 
 
 
 
 
100
  # Process with Nougat
101
  pixel_values = processor(img, return_tensors="pt").pixel_values.to(model.device)
102
 
@@ -104,7 +112,7 @@ def extract_text_from_pdf(pdf_bytes):
104
  outputs = model.generate(
105
  pixel_values,
106
  min_length=1,
107
- max_new_tokens=1024, # Adjust based on expected page content length
108
  bad_words_ids=[[processor.tokenizer.unk_token_id]],
109
  )
110
 
@@ -113,6 +121,9 @@ def extract_text_from_pdf(pdf_bytes):
113
  page_text = processor.post_process_generation(page_text, fix_markdown=True)
114
 
115
  full_text += page_text + "\n\n"
 
 
 
116
 
117
  # Clear GPU memory
118
  del pixel_values, outputs
@@ -120,7 +131,9 @@ def extract_text_from_pdf(pdf_bytes):
120
 
121
  return full_text
122
  except Exception as e:
123
- print(f"PDF extraction error: {str(e)}")
 
 
124
  return default_paper_content
125
  finally:
126
  # Clear GPU memory
 
5
  import torch
6
  from io import BytesIO
7
  from PIL import Image
8
+ import fitz # PyMuPDF
9
+ import numpy as np
10
  from transformers import NougatProcessor, VisionEncoderDecoderModel
11
 
12
  # Set environment variables
 
93
  # Load Nougat model
94
  processor, model = load_nougat_model()
95
 
96
+ # Convert PDF to images using PyMuPDF
97
+ doc = fitz.open(stream=pdf_bytes, filetype="pdf")
98
  full_text = ""
99
 
100
+ for page_num in range(len(doc)):
101
+ page = doc.load_page(page_num)
102
+ pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) # 2x zoom for better quality
103
+
104
+ # Convert to PIL Image
105
+ img_data = pix.samples
106
+ img = Image.frombytes("RGB", [pix.width, pix.height], img_data)
107
+
108
  # Process with Nougat
109
  pixel_values = processor(img, return_tensors="pt").pixel_values.to(model.device)
110
 
 
112
  outputs = model.generate(
113
  pixel_values,
114
  min_length=1,
115
+ max_new_tokens=1024,
116
  bad_words_ids=[[processor.tokenizer.unk_token_id]],
117
  )
118
 
 
121
  page_text = processor.post_process_generation(page_text, fix_markdown=True)
122
 
123
  full_text += page_text + "\n\n"
124
+
125
+ # Print progress
126
+ print(f"Processed page {page_num+1}/{len(doc)}")
127
 
128
  # Clear GPU memory
129
  del pixel_values, outputs
 
131
 
132
  return full_text
133
  except Exception as e:
134
+ import traceback
135
+ error_details = traceback.format_exc()
136
+ print(f"PDF extraction error: {str(e)}\n{error_details}")
137
  return default_paper_content
138
  finally:
139
  # Clear GPU memory