"""Image captioning -> EN->PT translation -> speech synthesis demo.

Pipeline: a GIT image-captioning model produces an English caption, an
mBART-50 model translates it to Portuguese, and Bark synthesizes speech.
Intended for notebook use (renders via IPython.display.Audio).
"""

import sys

import cv2
import requests
import streamlit as st
import torch
from IPython.display import Audio
from PIL import Image
from transformers import (
    AutoModel,
    AutoModelForCausalLM,
    AutoProcessor,
    MBart50TokenizerFast,
    MBartForConditionalGeneration,
    pipeline,
)

# --- Model loading (module-level side effects, as in the original script) ---

# Image-to-text (captioning) model.
image_processor = AutoProcessor.from_pretrained("sezenkarakus/image-GIT-description-model-v3")
image_to_text_model = AutoModelForCausalLM.from_pretrained("sezenkarakus/image-GIT-description-model-v3")

# English -> Portuguese translation model.
ckpt = 'Narrativa/mbart-large-50-finetuned-opus-en-pt-translation'
tokenizer = MBart50TokenizerFast.from_pretrained(ckpt)
translation_model = MBartForConditionalGeneration.from_pretrained(ckpt)
tokenizer.src_lang = 'en_XX'

# Text-to-speech model.
audio_processor = AutoProcessor.from_pretrained("suno/bark-small")
audio_model = AutoModel.from_pretrained("suno/bark-small")


def generate_caption(image):
    """Return an English caption (str) for a PIL image using the GIT model."""
    pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
    generated_ids = image_to_text_model.generate(pixel_values=pixel_values, max_length=200)
    return image_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]


def translate(text):
    """Translate English *text* to Portuguese; returns the decoded string.

    Runs on GPU when available, otherwise falls back to CPU with a warning.
    """
    inputs = tokenizer(text, return_tensors='pt')
    input_ids = inputs.input_ids
    attention_mask = inputs.attention_mask
    # Explicit capability check instead of the original bare `except:` probe,
    # which silently swallowed *every* exception (including genuine bugs).
    if torch.cuda.is_available():
        input_ids = input_ids.to('cuda')
        attention_mask = attention_mask.to('cuda')
        model = translation_model.to('cuda')
    else:
        print('No NVidia GPU, model performance may not be as good')
        model = translation_model
    # Force the decoder to start in Portuguese.
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        forced_bos_token_id=tokenizer.lang_code_to_id['pt_XX'],
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)


# --- Demo: caption a sample image, translate it, then synthesize speech ---

img_url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
# Other sample images:
# img_url = 'https://farm4.staticflickr.com/3733/9000662079_ce3599d0d8_z.jpg'
# img_url = 'https://farm4.staticflickr.com/3088/5793281956_2a15b2559c_z.jpg'
# img_url = 'https://farm5.staticflickr.com/4073/4816939054_844feb0078_z.jpg'
image = Image.open(requests.get(img_url, stream=True).raw)

# Generate text from image.
caption = generate_caption(image)
print(caption)

# Translate EN -> PT.
translated_caption = translate(caption)
print(translated_caption)

# Generate audio with Bark.
# NOTE(review): this feeds the *English* caption, not translated_caption —
# kept as-is to preserve behavior, but confirm if Portuguese audio was the goal.
inputs = audio_processor(
    text=caption,
    return_tensors="pt",
)
speech_values = audio_model.generate(**inputs, do_sample=True)
sampling_rate = audio_model.generation_config.sample_rate
Audio(speech_values.cpu().numpy().squeeze(), rate=sampling_rate)