import torch
from transformers import AutoProcessor, AutoModelForCausalLM
from PIL import Image
import gradio as gr

# Step 1: Load the processor from Hugging Face
processor = AutoProcessor.from_pretrained("microsoft/git-large-textcaps")

# Step 2: Load the model architecture from Hugging Face
model = AutoModelForCausalLM.from_pretrained("microsoft/git-large-textcaps")  # Load model structure

# Step 3: Load your custom PyTorch weights
custom_weights_path = "model_folder/pytorch_model.bin"  # Path to your custom weights
model.load_state_dict(torch.load(custom_weights_path, map_location=torch.device("cpu")))  # Load custom weights
model.eval()  # Set the model to evaluation mode

# Step 4: Define the caption generation function
def generate_caption(image):
    # Convert the input image to PIL format (if necessary)
    image = Image.fromarray(image)

    # Preprocess the image using the processor
    inputs = processor(images=image, return_tensors="pt")
    pixel_values = inputs.pixel_values

    # Generate the caption
    generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
    generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return generated_caption

# Step 5: Define the Gradio interface
interface = gr.Interface(
    fn=generate_caption,   # Function to process input
    inputs=gr.Image(),     # Input as image
    outputs=gr.Textbox(),  # Output as text
    live=True              # Enable live prediction
)

# Step 6: Launch the Gradio app
interface.launch()
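
# Optional GPU variant: the script above runs on CPU because the weights are
# loaded with map_location="cpu". The sketch below moves the model and the
# pixel values onto a CUDA device when one is available. It is a minimal
# sketch that assumes the `processor` and `model` objects defined above; the
# name `generate_caption_on_device` is illustrative, not part of the original
# script. To use it, pass it as fn= to gr.Interface in place of
# `generate_caption`.
def generate_caption_on_device(image):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)  # No-op once the model already lives on `device`
    image = Image.fromarray(image)
    inputs = processor(images=image, return_tensors="pt")
    pixel_values = inputs.pixel_values.to(device)  # Keep inputs on the same device as the model
    with torch.no_grad():  # Inference only; no gradients needed
        generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]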