Spaces:

os1187
/

Talk2Deck

Runtime error

App Files Files Community

Talk2Deck / app.py

os1187

Update app.py

3ce84fd verified 9 months ago

raw

history blame

2.82 kB

	import fitz # PyMuPDF
	import os
	import gradio as gr
	from transformers import ViTFeatureExtractor, ViTModel
	from PIL import Image
	from transformers import AutoTokenizer, AutoModel
	import torch

	# Function to get image embeddings using ViT
	# Cache of (feature_extractor, model) pairs keyed by model name, so the
	# pretrained weights are loaded once instead of once per page image.
	_VIT_CACHE = {}


	def get_image_embeddings(image_path, model_name='google/vit-base-patch16-224'):
	    """Return a mean-pooled ViT embedding for the image at *image_path*.

	    Args:
	        image_path: Path to an image file readable by PIL.
	        model_name: Hugging Face model identifier passed to
	            ``from_pretrained`` for both the feature extractor and the model.

	    Returns:
	        A tensor holding the mean of ``last_hidden_state`` over dim 1
	        (presumably shape ``(1, hidden_size)`` for a single image -- confirm
	        against the chosen model). The tensor is computed under
	        ``torch.no_grad()`` and therefore carries no autograd graph.
	    """
	    if model_name not in _VIT_CACHE:
	        _VIT_CACHE[model_name] = (
	            ViTFeatureExtractor.from_pretrained(model_name),
	            ViTModel.from_pretrained(model_name),
	        )
	    feature_extractor, model = _VIT_CACHE[model_name]

	    # Close the file handle promptly and normalise to 3-channel RGB so that
	    # RGBA / palette / greyscale images are handled uniformly.
	    with Image.open(image_path) as raw_image:
	        image = raw_image.convert("RGB")

	    inputs = feature_extractor(images=image, return_tensors="pt")

	    # Inference only: skip gradient tracking to save memory and time.
	    with torch.no_grad():
	        outputs = model(**inputs)

	    embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling
	    return embeddings

	# Function to convert PDF to images
	from pdf2image import convert_from_path

	def pdf_to_images(pdf_file, img_dir):
	    """Render every page of a PDF to a PNG file inside *img_dir*.

	    Pages are written as ``page_1.png``, ``page_2.png``, ... in the order
	    returned by ``convert_from_path``.

	    Args:
	        pdf_file: Path of the PDF, passed unchanged to ``convert_from_path``.
	        img_dir: Directory to write the PNG files to; created if missing.

	    Returns:
	        The list of saved image paths, in page order. (Earlier versions
	        returned ``None``; callers that ignore the result are unaffected.)
	    """
	    images = convert_from_path(pdf_file)

	    # Create the directory if it doesn't exist
	    os.makedirs(img_dir, exist_ok=True)

	    image_paths = []
	    for page_number, image in enumerate(images, start=1):
	        image_path = os.path.join(img_dir, f"page_{page_number}.png")
	        image.save(image_path, "PNG")
	        image_paths.append(image_path)

	    print(f"Converted {len(images)} pages to images and saved in {img_dir}")
	    return image_paths


	# Function to get text embeddings using a transformer model
	# Cache of (tokenizer, model) pairs keyed by model name, so the pretrained
	# weights are loaded once instead of on every call.
	_TEXT_MODEL_CACHE = {}


	def get_text_embeddings(text, model_name='bert-base-uncased'):
	    """Return mean-pooled transformer embeddings for *text*.

	    Args:
	        text: Input passed directly to the tokenizer (a string, or whatever
	            batch form the tokenizer accepts).
	        model_name: Hugging Face model identifier passed to
	            ``from_pretrained`` for both the tokenizer and the model.

	    Returns:
	        A tensor with one row per input sequence: the average of
	        ``last_hidden_state`` over the token positions whose attention mask
	        is set. Inputs are truncated to 512 tokens. The tensor is computed
	        under ``torch.no_grad()`` and carries no autograd graph.
	    """
	    if model_name not in _TEXT_MODEL_CACHE:
	        _TEXT_MODEL_CACHE[model_name] = (
	            AutoTokenizer.from_pretrained(model_name),
	            AutoModel.from_pretrained(model_name),
	        )
	    tokenizer, model = _TEXT_MODEL_CACHE[model_name]

	    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)

	    # Inference only: skip gradient tracking to save memory and time.
	    with torch.no_grad():
	        outputs = model(**inputs)

	    hidden = outputs.last_hidden_state
	    # Weight by the attention mask so padded positions (present when a batch
	    # of unequal-length texts is given) do not dilute the mean. For a single
	    # unpadded text this equals a plain mean over dim 1.
	    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
	    token_counts = mask.sum(dim=1).clamp(min=1)
	    embeddings = (hidden * mask).sum(dim=1) / token_counts  # Mean pooling
	    return embeddings

	# Function to process PDF and generate a response
	def process_pdf_and_generate_response(pdf_file):
	    """Convert an uploaded PDF to page images, embed them, and return a reply.

	    Args:
	        pdf_file: The uploaded PDF. Accepts a filesystem path (``str`` or
	            ``os.PathLike``) or an object exposing the path as ``.name``
	            (presumably what the Gradio File component supplies, depending
	            on its version and ``type`` setting -- confirm). ``None`` means
	            nothing was uploaded.

	    Returns:
	        A response string. The response text is currently a fixed
	        placeholder; the embeddings are computed but not yet used to
	        produce it.
	    """
	    import re
	    import tempfile

	    if pdf_file is None:
	        return "Please upload a PDF file."

	    # Normalise the upload to a plain path for the PDF converter.
	    if isinstance(pdf_file, (str, os.PathLike)):
	        pdf_path = os.fspath(pdf_file)
	    else:
	        pdf_path = pdf_file.name

	    def _page_index(filename):
	        # Sort page_10.png after page_2.png by comparing the number, not text.
	        found = re.search(r"\d+", filename)
	        return int(found.group()) if found else 0

	    # Use a fresh temporary directory per request so images left over from a
	    # previous (or concurrent) upload are never mixed into this result.
	    with tempfile.TemporaryDirectory(prefix="pdf_images_") as img_dir:
	        # Convert PDF to images
	        pdf_to_images(pdf_path, img_dir)

	        # Generate embeddings for each image, in page order
	        page_files = sorted(
	            (name for name in os.listdir(img_dir) if name.endswith(".png")),
	            key=_page_index,
	        )
	        image_embeddings = [
	            get_image_embeddings(os.path.join(img_dir, name))
	            for name in page_files
	        ]

	    # Perform some text analysis on the PDF content (replace with your logic)
	    pdf_text = "PDF content analysis placeholder"
	    text_embeddings = get_text_embeddings(pdf_text)

	    # Combine image and text embeddings and generate a response (replace with your logic)
	    # NOTE: combined_embeddings is not consumed yet; it is kept as the hook
	    # for the real response logic.
	    combined_embeddings = torch.cat([*image_embeddings, text_embeddings], dim=0)
	    response = "Response based on the processed PDF"
	    return response

	# Gradio interface
	# The legacy ``gr.inputs`` / ``gr.outputs`` namespaces were removed in
	# Gradio 4; the top-level component classes are used instead. The File
	# component's ``type`` is left at its default rather than the legacy
	# ``"file"`` value.
	iface = gr.Interface(
	    fn=process_pdf_and_generate_response,
	    inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
	    outputs=gr.Textbox(),
	    title="Talk2Deck - Interact with your PDFs",
	    description="Upload a PDF and receive insights based on its content.",
	)

	if __name__ == "__main__":
	    iface.launch()