os1187 committed on
Commit
3ce84fd
1 Parent(s): ff9190e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -15
app.py CHANGED
@@ -18,27 +18,20 @@ def get_image_embeddings(image_path, model_name='google/vit-base-patch16-224'):
18
  return embeddings
19
 
20
  # Function to convert PDF to images
21
- def pdf_to_images(pdf_file, img_dir):
22
- # Open the provided PDF file
23
- doc = fitz.open(pdf_file)
24
 
 
 
 
25
  # Create the directory if it doesn't exist
26
  os.makedirs(img_dir, exist_ok=True)
27
 
28
- for page_num in range(len(doc)):
29
- # Get the page
30
- page = doc.load_page(page_num)
31
-
32
- # Render the page to an image
33
- pix = page.get_pixmap()
34
-
35
- # Define the output image path
36
- output_file = f"{img_dir}/page_{page_num + 1}.png"
37
 
38
- # Save the image
39
- pix.save(output_file)
40
 
41
- print(f"Converted {len(doc)} pages to images and saved in {img_dir}")
42
 
43
  # Function to get text embeddings using a transformer model
44
  def get_text_embeddings(text, model_name='bert-base-uncased'):
 
18
  return embeddings
19
 
20
  # Function to convert PDF to images
21
+ from pdf2image import convert_from_path
 
 
22
 
23
+ def pdf_to_images(pdf_file, img_dir):
24
+ images = convert_from_path(pdf_file)
25
+
26
  # Create the directory if it doesn't exist
27
  os.makedirs(img_dir, exist_ok=True)
28
 
29
+ for i, image in enumerate(images):
30
+ image_path = f"{img_dir}/page_{i + 1}.png"
31
+ image.save(image_path, "PNG")
 
 
 
 
 
 
32
 
33
+ print(f"Converted {len(images)} pages to images and saved in {img_dir}")
 
34
 
 
35
 
36
  # Function to get text embeddings using a transformer model
37
  def get_text_embeddings(text, model_name='bert-base-uncased'):