Spaces:

os1187
/

Talk2Deck

Runtime error

File size: 2,801 Bytes

8953fd3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ce84fd
8953fd3
3ce84fd
 
 
8953fd3
 
 
3ce84fd
 
 
8953fd3
3ce84fd
8953fd3

import functools
import os
import tempfile

import gradio as gr
import torch
from PIL import Image
from transformers import ViTFeatureExtractor, ViTModel
from transformers import AutoTokenizer, AutoModel

# Function to get image embeddings using ViT
def get_image_embeddings(image_path, model_name='google/vit-base-patch16-224'):
    feature_extractor = ViTFeatureExtractor.from_pretrained(model_name)
    model = ViTModel.from_pretrained(model_name)
    
    image = Image.open(image_path)
    inputs = feature_extractor(images=image, return_tensors="pt")
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling
    return embeddings

# Function to convert PDF to images
from pdf2image import convert_from_path

def pdf_to_images(pdf_file, img_dir):
    images = convert_from_path(pdf_file)
    
    # Create the directory if it doesn't exist
    os.makedirs(img_dir, exist_ok=True)

    for i, image in enumerate(images):
        image_path = f"{img_dir}/page_{i + 1}.png"
        image.save(image_path, "PNG")

    print(f"Converted {len(images)} pages to images and saved in {img_dir}")


# Function to get text embeddings using a transformer model
def get_text_embeddings(text, model_name='bert-base-uncased'):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling
    return embeddings

# Function to process PDF and generate a response
def process_pdf_and_generate_response(pdf_file):
    # Convert PDF to images
    img_dir = "pdf_images"
    pdf_to_images(pdf_file, img_dir)

    # Generate embeddings for each image
    image_embeddings = []
    for filename in os.listdir(img_dir):
        if filename.endswith(".png"):
            image_path = os.path.join(img_dir, filename)
            image_embeddings.append(get_image_embeddings(image_path))

    # Perform some text analysis on the PDF content (replace with your logic)
    pdf_text = "PDF content analysis placeholder"
    text_embeddings = get_text_embeddings(pdf_text)

    # Combine image and text embeddings and generate a response (replace with your logic)
    combined_embeddings = torch.cat([*image_embeddings, text_embeddings], dim=0)
    response = "Response based on the processed PDF"
    return response

# Gradio interface
iface = gr.Interface(
    fn=process_pdf_and_generate_response,
    inputs=gr.inputs.File(label="Upload PDF", type="file"),
    outputs=gr.outputs.Textbox(),
    title="Talk2Deck - Interact with your PDFs",
    description="Upload a PDF and receive insights based on its content."
)

if __name__ == "__main__":
    iface.launch()