Talk2Deck / app.py
os1187's picture
Update app.py
61cc851 verified
raw
history blame
2.8 kB
import os
import gradio as gr
from transformers import ViTFeatureExtractor, ViTModel
from PIL import Image
from transformers import AutoTokenizer, AutoModel
import torch
# Function to get image embeddings using ViT
# Loaded (feature extractor, model) pairs keyed by checkpoint name, so the
# weights are loaded once per process instead of once per image.
_VIT_CACHE = {}


def get_image_embeddings(image_path, model_name='google/vit-base-patch16-224'):
    """Return a mean-pooled ViT embedding for one image.

    Args:
        image_path: Path (or anything ``PIL.Image.open`` accepts) of the image.
        model_name: Checkpoint name passed to ``from_pretrained`` for both the
            feature extractor and the model.

    Returns:
        The model's ``last_hidden_state`` averaged over dim 1
        (presumably shape ``(1, hidden_size)`` -- confirm against the
        checkpoint in use).
    """
    if model_name not in _VIT_CACHE:
        _VIT_CACHE[model_name] = (
            ViTFeatureExtractor.from_pretrained(model_name),
            ViTModel.from_pretrained(model_name),
        )
    feature_extractor, model = _VIT_CACHE[model_name]

    # The context manager closes the underlying file; convert("RGB") returns
    # an independent in-memory image and normalizes RGBA / palette /
    # grayscale inputs to three channels.
    with Image.open(image_path) as image:
        rgb_image = image.convert("RGB")

    inputs = feature_extractor(images=rgb_image, return_tensors="pt")
    # Inference only: do not build an autograd graph for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1)  # Mean pooling
# Function to convert PDF to images
from pdf2image import convert_from_path
def pdf_to_images(pdf_file, img_dir):
    """Render each page of a PDF to ``<img_dir>/page_<n>.png``.

    Args:
        pdf_file: PDF location, forwarded to ``convert_from_path``.
        img_dir: Directory that receives the PNG files; created if missing.

    Returns:
        List of the written image paths in page order (page 1 first), so
        callers do not have to re-scan ``img_dir`` and re-derive the order.
    """
    images = convert_from_path(pdf_file)
    # Create the directory if it doesn't exist
    os.makedirs(img_dir, exist_ok=True)

    image_paths = []
    for page_number, image in enumerate(images, start=1):
        image_path = os.path.join(img_dir, f"page_{page_number}.png")
        image.save(image_path, "PNG")
        image_paths.append(image_path)

    print(f"Converted {len(images)} pages to images and saved in {img_dir}")
    return image_paths
# Function to get text embeddings using a transformer model
# Loaded (tokenizer, model) pairs keyed by checkpoint name, so repeated calls
# reuse the same objects instead of reloading them every time.
_TEXT_MODEL_CACHE = {}


def get_text_embeddings(text, model_name='bert-base-uncased'):
    """Return mean-pooled transformer embeddings for ``text``.

    Args:
        text: A string, or a list of strings, forwarded to the tokenizer.
            Inputs are padded to a common length and truncated to 512 tokens.
        model_name: Checkpoint name passed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        The model's ``last_hidden_state`` averaged over dim 1, counting only
        positions the attention mask marks as real tokens.
    """
    if model_name not in _TEXT_MODEL_CACHE:
        _TEXT_MODEL_CACHE[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModel.from_pretrained(model_name),
        )
    tokenizer, model = _TEXT_MODEL_CACHE[model_name]

    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    # Inference only: do not build an autograd graph for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    attention_mask = inputs.get('attention_mask')
    if attention_mask is None:
        # No mask available: every position counts equally.
        return hidden.mean(dim=1)

    # padding=True pads shorter sequences in a batch; weighting by the mask
    # keeps those pad positions out of the average. For an unpadded input the
    # mask is all ones and this reduces to a plain mean.
    mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    return (hidden * mask).sum(dim=1) / token_counts
# Function to process PDF and generate a response
def _page_number(filename):
    """Return the page number encoded in ``page_<n>.png``, or None if no match."""
    stem, extension = os.path.splitext(filename)
    prefix, _, digits = stem.partition("_")
    if extension != ".png" or prefix != "page" or not digits.isdigit():
        return None
    return int(digits)


def process_pdf_and_generate_response(pdf_file):
    """Convert an uploaded PDF to page images, embed them, and return a reply.

    Args:
        pdf_file: The uploaded PDF, either as a filesystem path or as a
            file-like object exposing the path via ``.name``. ``None`` means
            nothing was uploaded.

    Returns:
        The response string (currently a fixed placeholder), or a prompt to
        upload a file when ``pdf_file`` is ``None``.
    """
    import tempfile  # local import keeps this block self-contained

    if pdf_file is None:
        return "Please upload a PDF file."

    # pdf_to_images needs a path; unwrap file-like uploads, pass paths through.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # A fresh per-call directory: pages left over from an earlier upload can
    # never be mixed into this one, and the images are removed afterwards.
    with tempfile.TemporaryDirectory(prefix="pdf_images_") as img_dir:
        # Convert PDF to images
        pdf_to_images(pdf_path, img_dir)

        # os.listdir order is arbitrary, so order the pages numerically
        # (a plain string sort would put page_10 before page_2).
        numbered_pages = []
        for filename in os.listdir(img_dir):
            page = _page_number(filename)
            if page is not None:
                numbered_pages.append((page, filename))

        # Generate embeddings for each image, in page order
        image_embeddings = [
            get_image_embeddings(os.path.join(img_dir, filename))
            for _, filename in sorted(numbered_pages)
        ]

    # Perform some text analysis on the PDF content (replace with your logic)
    pdf_text = "PDF content analysis placeholder"
    text_embeddings = get_text_embeddings(pdf_text)

    # Combine image and text embeddings and generate a response (replace with
    # your logic). The combined tensor is not consumed yet; it is kept as the
    # hook for the real response generation.
    combined_embeddings = torch.cat([*image_embeddings, text_embeddings], dim=0)
    response = "Response based on the processed PDF"
    return response
# Gradio interface
# The legacy gr.inputs / gr.outputs namespaces are gone in Gradio 4, so prefer
# the top-level component classes and fall back to the legacy namespaces only
# on releases that do not provide them.
if hasattr(gr, "File") and hasattr(gr, "Textbox"):
    _pdf_input = gr.File(label="Upload PDF")
    _text_output = gr.Textbox()
else:
    _pdf_input = gr.inputs.File(label="Upload PDF", type="file")
    _text_output = gr.outputs.Textbox()

iface = gr.Interface(
    fn=process_pdf_and_generate_response,
    inputs=_pdf_input,
    outputs=_text_output,
    title="Talk2Deck - Interact with your PDFs",
    description="Upload a PDF and receive insights based on its content.",
)

# Start the web UI only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()