"""Gradio demo that transcribes the text in an uploaded image using TrOCR."""

import gradio as gr
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
import requests
#import torch

# Single source of truth for the checkpoint so the processor and the model
# are always loaded from the same pretrained weights.
MODEL_NAME = 'microsoft/trocr-large-stage1'

# Load the processor and model once at import time so every request handled
# by the interface reuses the same instances.
processor = TrOCRProcessor.from_pretrained(MODEL_NAME)
model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME)


def generate_text(input_image):
    """Return the text that the TrOCR model recognizes in ``input_image``.

    Args:
        input_image: Image data as delivered by the ``gr.Image()`` input
            component (an array accepted by ``PIL.Image.fromarray``), or
            ``None`` when the form is submitted without an image.

    Returns:
        The decoded text of the first generated sequence, or an empty
        string when no image was supplied.
    """
    # Guard against an empty submission: Image.fromarray(None) would raise.
    if input_image is None:
        return ""

    # Convert the Gradio input to a PIL image and normalize to three channels
    # so grayscale or RGBA uploads are handled the same way as RGB ones.
    image = Image.fromarray(input_image).convert("RGB")

    # Preprocess the image, run generation, and decode the token ids to text.
    pixel_values = processor(images=image, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values)
    generated_text = processor.batch_decode(
        generated_ids, skip_special_tokens=True
    )[0]

    return generated_text


# Define the Gradio interface: one image in, one textbox out.
iface = gr.Interface(
    fn=generate_text,
    inputs=gr.Image(),  # Gradio Image input
    outputs=gr.Textbox(),  # Gradio Textbox output
)

if __name__ == "__main__":
    iface.launch()