Update app.py
Browse files
app.py
CHANGED
@@ -18,27 +18,20 @@ def get_image_embeddings(image_path, model_name='google/vit-base-patch16-224'):
|
|
18 |
return embeddings
|
19 |
|
20 |
# Function to convert PDF to images
|
21 |
-
|
22 |
-
# Open the provided PDF file
|
23 |
-
doc = fitz.open(pdf_file)
|
24 |
|
|
|
|
|
|
|
25 |
# Create the directory if it doesn't exist
|
26 |
os.makedirs(img_dir, exist_ok=True)
|
27 |
|
28 |
-
for
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
# Render the page to an image
|
33 |
-
pix = page.get_pixmap()
|
34 |
-
|
35 |
-
# Define the output image path
|
36 |
-
output_file = f"{img_dir}/page_{page_num + 1}.png"
|
37 |
|
38 |
-
|
39 |
-
pix.save(output_file)
|
40 |
|
41 |
-
print(f"Converted {len(doc)} pages to images and saved in {img_dir}")
|
42 |
|
43 |
# Function to get text embeddings using a transformer model
|
44 |
def get_text_embeddings(text, model_name='bert-base-uncased'):
|
|
|
18 |
return embeddings
|
19 |
|
20 |
# Function to convert PDF to images
|
21 |
+
from pdf2image import convert_from_path
|
|
|
|
|
22 |
|
23 |
+
def pdf_to_images(pdf_file, img_dir):
    """Render each page of a PDF and save it as a numbered PNG file.

    The pages are produced by ``convert_from_path`` (imported from
    ``pdf2image``) and written to ``img_dir`` as ``page_1.png``,
    ``page_2.png``, ... in the order they are returned.

    Args:
        pdf_file: Path of the PDF, passed straight to ``convert_from_path``.
        img_dir: Directory that receives the PNG files; it is created if it
            does not exist yet.

    Returns:
        None. A summary line is printed once all pages are saved.
    """
    # Conversion happens first, so a failure here leaves no directory behind.
    pages = convert_from_path(pdf_file)

    # Create the directory if it doesn't exist
    os.makedirs(img_dir, exist_ok=True)

    # File names are 1-based, hence start=1.
    # NOTE(review): each page is presumably a PIL image -- confirm against
    # the pdf2image version in use.
    for page_number, page in enumerate(pages, start=1):
        page.save(f"{img_dir}/page_{page_number}.png", "PNG")

    print(f"Converted {len(pages)} pages to images and saved in {img_dir}")
|
|
|
34 |
|
|
|
35 |
|
36 |
# Function to get text embeddings using a transformer model
|
37 |
def get_text_embeddings(text, model_name='bert-base-uncased'):
|