|
import os |
|
import gradio as gr |
|
from transformers import ViTFeatureExtractor, ViTModel |
|
from PIL import Image |
|
from transformers import AutoTokenizer, AutoModel |
|
import torch |
|
|
|
|
|
# (feature_extractor, model) pairs keyed by checkpoint name, so the weights
# are loaded once per process instead of once per image.
_VIT_CACHE = {}


def _load_vit(model_name):
    """Return the cached ``(feature_extractor, model)`` pair for *model_name*."""
    if model_name not in _VIT_CACHE:
        _VIT_CACHE[model_name] = (
            ViTFeatureExtractor.from_pretrained(model_name),
            ViTModel.from_pretrained(model_name),
        )
    return _VIT_CACHE[model_name]


def get_image_embeddings(image_path, model_name='google/vit-base-patch16-224'):
    """Embed a single image file with a ViT checkpoint.

    Args:
        image_path: Path of the image to open with PIL.
        model_name: Checkpoint name passed to ``from_pretrained``.

    Returns:
        The model's ``last_hidden_state`` averaged over dim 1.
    """
    feature_extractor, model = _load_vit(model_name)

    # Close the file handle deterministically; convert() loads the pixel data
    # first and normalises grayscale / palette / RGBA inputs to three channels.
    with Image.open(image_path) as handle:
        image = handle.convert("RGB")

    inputs = feature_extractor(images=image, return_tensors="pt")

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)

    return outputs.last_hidden_state.mean(dim=1)
|
|
|
|
|
from pdf2image import convert_from_path |
|
|
|
def pdf_to_images(pdf_file, img_dir):
    """Render every page of a PDF to ``page_<n>.png`` inside *img_dir*.

    Args:
        pdf_file: Path of the PDF, forwarded to ``convert_from_path``.
        img_dir: Output directory; created if it does not already exist.

    Returns:
        The paths of the written PNG files, in page order (page 1 first).
    """
    pages = convert_from_path(pdf_file)
    os.makedirs(img_dir, exist_ok=True)

    saved_paths = []
    for page_number, page in enumerate(pages, start=1):
        image_path = os.path.join(img_dir, f"page_{page_number}.png")
        page.save(image_path, "PNG")
        saved_paths.append(image_path)

    print(f"Converted {len(pages)} pages to images and saved in {img_dir}")
    return saved_paths
|
|
|
|
|
|
|
# (tokenizer, model) pairs keyed by checkpoint name, so the weights are
# loaded once per process instead of once per call.
_TEXT_MODEL_CACHE = {}


def _load_text_model(model_name):
    """Return the cached ``(tokenizer, model)`` pair for *model_name*."""
    if model_name not in _TEXT_MODEL_CACHE:
        _TEXT_MODEL_CACHE[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModel.from_pretrained(model_name),
        )
    return _TEXT_MODEL_CACHE[model_name]


def get_text_embeddings(text, model_name='bert-base-uncased'):
    """Embed text by mean-pooling the encoder's last hidden state.

    Args:
        text: Input forwarded to the tokenizer (truncated to 512 tokens).
        model_name: Checkpoint name passed to ``from_pretrained``.

    Returns:
        The ``last_hidden_state`` averaged over dim 1, counting only the
        positions the tokenizer's attention mask marks as real tokens.
    """
    tokenizer, model = _load_text_model(model_name)

    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    attention_mask = inputs.get("attention_mask")
    if attention_mask is None:
        # Tokenizer supplied no mask, so every position counts.
        return hidden.mean(dim=1)

    # padding=True pads shorter sequences in a batch; a plain mean would
    # average those pad positions into the embedding, so weight them out.
    mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)
    return (hidden * mask).sum(dim=1) / token_counts
|
|
|
|
|
def _page_sort_key(filename):
    """Sort key that orders ``page_<n>.png`` names by their numeric page index."""
    stem = os.path.splitext(filename)[0]
    suffix = stem.rpartition("_")[2]
    if suffix.isdigit():
        return (0, int(suffix), filename)
    # Names without a numeric suffix go last, in alphabetical order.
    return (1, 0, filename)


def process_pdf_and_generate_response(pdf_file):
    """Gradio callback: rasterise the uploaded PDF, embed it, and return a reply.

    Args:
        pdf_file: The uploaded PDF. May be a filesystem path, an object
            exposing the path as ``.name``, or ``None`` when nothing was
            uploaded.

    Returns:
        The response string (currently a fixed placeholder), or a prompt to
        upload a file when *pdf_file* is ``None``.
    """
    import tempfile

    if pdf_file is None:
        return "Please upload a PDF file."

    # The PDF renderer needs a filesystem path. Path-like values are used
    # as-is (a Path's ``.name`` is only the final component); upload wrappers
    # expose their on-disk location as ``.name``.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = getattr(pdf_file, "name", pdf_file)

    # A fresh directory per request keeps pages from an earlier, longer PDF
    # out of this result and keeps simultaneous requests apart.
    with tempfile.TemporaryDirectory(prefix="pdf_images_") as img_dir:
        pdf_to_images(pdf_path, img_dir)

        # os.listdir order is arbitrary; sort numerically so page 10 follows
        # page 9 rather than page 1.
        page_files = sorted(
            (name for name in os.listdir(img_dir) if name.endswith(".png")),
            key=_page_sort_key,
        )
        image_embeddings = [
            get_image_embeddings(os.path.join(img_dir, name))
            for name in page_files
        ]

    pdf_text = "PDF content analysis placeholder"
    text_embeddings = get_text_embeddings(pdf_text)

    # TODO: the stacked embeddings are computed but do not yet feed into the
    # response below.
    combined_embeddings = torch.cat([*image_embeddings, text_embeddings], dim=0)

    response = "Response based on the processed PDF"
    return response
|
|
|
|
|
# Gradio front end: one PDF upload in, one text box out.
_pdf_upload = gr.inputs.File(label="Upload PDF", type="file")
_reply_box = gr.outputs.Textbox()

iface = gr.Interface(
    fn=process_pdf_and_generate_response,
    inputs=_pdf_upload,
    outputs=_reply_box,
    title="Talk2Deck - Interact with your PDFs",
    description="Upload a PDF and receive insights based on its content.",
)


# Start the web server only when run as a script, not on import.
if __name__ == "__main__":
    iface.launch()