seu-ebers commited on
Commit
fe4a4c9
·
1 Parent(s): 0356df8
Files changed (2) hide show
  1. app.py +83 -0
  2. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import sys

import cv2
import requests
import streamlit as st
import torch
from IPython.display import Audio
from PIL import Image
from transformers import (
    AutoModel,
    AutoModelForCausalLM,
    AutoProcessor,
    MBart50TokenizerFast,
    MBartForConditionalGeneration,
    pipeline,
)

10
+ # Load Image to Text model
11
+ image_processor = AutoProcessor.from_pretrained("sezenkarakus/image-GIT-description-model-v3")
12
+ image_to_text_model = AutoModelForCausalLM.from_pretrained("sezenkarakus/image-GIT-description-model-v3")
13
+
14
+ # Load Translation model
15
+ ckpt = 'Narrativa/mbart-large-50-finetuned-opus-en-pt-translation'
16
+
17
+ tokenizer = MBart50TokenizerFast.from_pretrained(ckpt)
18
+ translation_model = MBartForConditionalGeneration.from_pretrained(ckpt)
19
+
20
+ tokenizer.src_lang = 'en_XX'
21
+
22
+ # Load Audio Model
23
+ audio_processor = AutoProcessor.from_pretrained("suno/bark")
24
+ audio_model = AutoModel.from_pretrained("suno/bark")
25
+
26
+
27
+ # Methods
28
+
29
+ def generate_caption(image):
30
+ pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
31
+ generated_ids = image_to_text_model.generate(pixel_values=pixel_values, max_length=200)
32
+ generated_caption = image_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
33
+
34
+ return generated_caption
35
+
36
+ def translate(text):
37
+ inputs = tokenizer(text, return_tensors='pt')
38
+ input_ids = inputs.input_ids
39
+ attention_mask = inputs.attention_mask
40
+
41
+ try:
42
+ input_ids = input_ids.to('cuda')
43
+ attention_mask = attention_mask.to('cuda')
44
+ model = translation_model.to("cuda")
45
+ except:
46
+ print('No NVidia GPU, model performance may not be as good')
47
+ model = translation_model
48
+
49
+ output = model.generate(input_ids, attention_mask=attention_mask, forced_bos_token_id=tokenizer.lang_code_to_id['pt_XX'])
50
+ translated = tokenizer.decode(output[0], skip_special_tokens=True)
51
+
52
+ return translated
53
+
54
+ # Carregamento de imagens locais
55
+
56
+ img_url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
57
+ # img_url = 'https://farm4.staticflickr.com/3733/9000662079_ce3599d0d8_z.jpg'
58
+ # img_url = 'https://farm4.staticflickr.com/3088/5793281956_2a15b2559c_z.jpg'
59
+ # img_url = 'https://farm5.staticflickr.com/4073/4816939054_844feb0078_z.jpg'
60
+
61
+ image = Image.open(requests.get(img_url, stream=True).raw)
62
+
63
+ # Generate using models
64
+ # Generate text from image
65
+ caption = generate_caption(image)
66
+ print(caption)
67
+
68
+ # Translate
69
+ translated_caption = translate(caption)
70
+ print(translated_caption)
71
+
72
+ # Generate Audio
73
+ inputs = audio_processor(
74
+ text=caption,
75
+ return_tensors="pt",
76
+ )
77
+
78
+ speech_values = audio_model.generate(**inputs, do_sample=True)
79
+
80
+ sampling_rate = audio_model.generation_config.sample_rate
81
+ Audio(speech_values.cpu().numpy().squeeze(), rate=sampling_rate)
82
+
83
+
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
transformers
torch
accelerate
streamlit~=1.30.0
pillow~=10.3.0
requests~=2.31.0
ipython~=8.20.0