|
import os
import time

import cv2
import gradio as gr
import numpy as np
import torch
from gtts import gTTS, gTTSError
from huggingface_hub import login
from PIL import Image
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BlipForConditionalGeneration,
    BlipProcessor,
)
|
|
|
|
|
# Authenticate against the Hugging Face Hub only when a token is supplied
# through the HUGGINGFACE_TOKEN environment variable; without one, no login
# is attempted.
hf_token = os.environ.get("HUGGINGFACE_TOKEN")
if hf_token:
    login(token=hf_token)
|
|
|
|
|
# Object detector used by process_image: the 'yolov5s' entry point of the
# 'ultralytics/yolov5' repository, loaded through torch.hub at import time.
model = torch.hub.load(repo_or_dir='ultralytics/yolov5', model='yolov5s')
|
|
|
|
|
def calculate_glcm_contrast(image):
    """Return the GLCM contrast of *image* for the diagonal neighbour offset.

    A grey-level co-occurrence matrix (GLCM) counts how often a pixel with
    grey level ``x`` has a neighbour one row down and one column right with
    grey level ``y``.  Its contrast, ``sum((x - y) ** 2 * P[x, y])`` over the
    normalised matrix ``P``, is exactly the mean squared grey-level
    difference over all such pixel pairs, so it is computed directly from
    the pairs instead of materialising the matrix.

    Args:
        image: A PIL image or array-like.  Three-channel data is read as RGB
            (the order ``np.array`` yields for a PIL RGB image), four-channel
            data as RGBA, and two-dimensional data as already greyscale.

    Returns:
        The contrast as a ``float``.  ``0.0`` is returned when the image has
        fewer than two rows or two columns, i.e. when no pixel pair exists.
    """
    pixels = np.array(image)
    if pixels.ndim == 2:
        # Already a single-channel intensity image.
        gray_image = pixels
    elif pixels.shape[2] == 4:
        gray_image = cv2.cvtColor(pixels, cv2.COLOR_RGBA2GRAY)
    else:
        # RGB, not BGR: a BGR conversion would swap the red/blue luma weights.
        gray_image = cv2.cvtColor(pixels, cv2.COLOR_RGB2GRAY)

    # Widen to int64 before subtracting; uint8 arithmetic would wrap around.
    reference = gray_image[:-1, :-1].astype(np.int64)
    neighbour = gray_image[1:, 1:].astype(np.int64)
    if reference.size == 0:
        return 0.0

    difference = reference - neighbour
    return float(np.mean(difference * difference))
|
|
|
|
|
def analyze_image_properties(image):
    """Classify an image's overall tone and its texture.

    Args:
        image: A PIL image or array-like; it is also forwarded unchanged to
            ``calculate_glcm_contrast``.

    Returns:
        A ``(temperature, texture)`` tuple of strings.  ``temperature`` is
        ``'fria'`` when the mean over the colour channels is below 128 and
        ``'quente'`` otherwise; ``texture`` is ``'lisa'`` when the GLCM
        contrast is below 100 and ``'texturizada'`` otherwise.
    """
    pixels = np.array(image)
    if pixels.ndim == 3:
        # Keep only the three colour channels so an alpha channel cannot
        # influence the average.
        pixels = pixels[..., :3]

    # The mean over every pixel and channel does not depend on channel order,
    # so no colour-space conversion is needed before averaging.
    # NOTE(review): this threshold measures overall brightness rather than a
    # red/blue balance - confirm that is the intended meaning of "temperature".
    temperature = 'fria' if pixels.mean() < 128 else 'quente'

    texture_contrast = calculate_glcm_contrast(image)
    texture = 'lisa' if texture_contrast < 100 else 'texturizada'

    return temperature, texture
|
|
|
|
|
_BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"

# Holds the (processor, model) pair once loaded so that later calls reuse it.
_blip_cache = {}


def _load_blip():
    """Return the BLIP ``(processor, model)`` pair, loading it on first use."""
    if "pair" not in _blip_cache:
        processor = BlipProcessor.from_pretrained(_BLIP_CHECKPOINT)
        captioner = BlipForConditionalGeneration.from_pretrained(_BLIP_CHECKPOINT)
        # Store both together so a failed load never leaves a partial entry.
        _blip_cache["pair"] = (processor, captioner)
    return _blip_cache["pair"]


def describe_image(image):
    """Generate a caption for *image* with the BLIP captioning checkpoint.

    The processor and model are loaded once and reused, instead of being
    re-instantiated on every call.

    Args:
        image: The image passed to the BLIP processor.

    Returns:
        The decoded caption string with special tokens removed.
    """
    processor, captioner = _load_blip()
    inputs = processor(image, return_tensors="pt")
    out = captioner.generate(**inputs)
    return processor.decode(out[0], skip_special_tokens=True)
|
|
|
|
|
_TRANSLATION_CHECKPOINT = 'Helsinki-NLP/opus-mt-tc-big-en-pt'

# Holds the (tokenizer, model) pair once loaded so that later calls reuse it.
_translation_cache = {}


def _load_translator():
    """Return the translation ``(tokenizer, model)`` pair, loading it on first use."""
    if "pair" not in _translation_cache:
        tokenizer = AutoTokenizer.from_pretrained(_TRANSLATION_CHECKPOINT)
        translator = AutoModelForSeq2SeqLM.from_pretrained(_TRANSLATION_CHECKPOINT)
        # Store both together so a failed load never leaves a partial entry.
        _translation_cache["pair"] = (tokenizer, translator)
    return _translation_cache["pair"]


def translate_description(description):
    """Translate *description* with the ``opus-mt-tc-big-en-pt`` checkpoint.

    The tokenizer and model are loaded once and reused, instead of being
    re-instantiated on every call.

    Args:
        description: The text to translate.

    Returns:
        The decoded translation string with special tokens removed.
    """
    tokenizer, translator = _load_translator()
    # NOTE(review): the model card for this checkpoint appears to expect a
    # sentence-initial target-language token such as '>>por<<' - confirm
    # whether the input should be prefixed.
    encoded = tokenizer(description, return_tensors="pt", padding=True)
    translated = translator.generate(**encoded)
    return tokenizer.decode(translated[0], skip_special_tokens=True)
|
|
|
|
|
def _save_speech_with_retry(tts, path, max_attempts=5, wait_seconds=5):
    """Save *tts* audio to *path*, retrying while the service answers HTTP 429.

    Any other gTTS failure, or a rate limit that persists through the final
    attempt, is re-raised so the caller never receives a missing or stale
    audio file.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            tts.save(path)
            return
        except gTTSError as error:
            # gTTSError keeps the HTTP response in ``rsp``; it is None when
            # the request never produced a response (e.g. connection errors).
            response = getattr(error, "rsp", None)
            rate_limited = response is not None and response.status_code == 429
            if not rate_limited or attempt == max_attempts:
                raise
            print("Too many requests. Waiting before retrying...")
            time.sleep(wait_seconds)


def process_image(image):
    """Run the full pipeline on *image*: detection, description and speech.

    Steps: run the module-level detector ``model`` and render its results,
    classify tone and texture, caption the image, translate the caption,
    append the tone/texture sentence, and synthesise the final text to
    ``output.mp3`` with gTTS (language ``'pt'``).

    Args:
        image: The input image as delivered by the interface.

    Returns:
        A tuple ``(annotated_image, final_description, audio_path)``: the
        rendered detections as a PIL image, the final text, and the path of
        the saved MP3 file.

    Raises:
        gTTSError: If speech synthesis fails for a reason other than rate
            limiting, or is still rate limited after the last attempt.
    """
    results = model(image)
    detected_image = results.render()[0]

    temperature, texture = analyze_image_properties(image)

    description = describe_image(image)
    translated_description = translate_description(description)

    final_description = f"{translated_description}. A textura é {texture} e a temperatura de cor é {temperature}."

    audio_path = "output.mp3"
    tts = gTTS(text=final_description, lang='pt')
    _save_speech_with_retry(tts, audio_path)

    return Image.fromarray(detected_image), final_description, audio_path
|
|
|
|
|
# Image file offered as a ready-made example in the interface.
example_image_path = "example1.JPG"

# One PIL image goes in; the annotated image, the generated text and the path
# of the synthesised audio come out, in the order process_image returns them.
image_input = gr.Image(type="pil")
result_components = [
    gr.Image(type="pil"),
    gr.Textbox(),
    gr.Audio(type="filepath"),
]

iface = gr.Interface(
    fn=process_image,
    inputs=image_input,
    outputs=result_components,
    examples=[example_image_path],
)

# Start the app as soon as this module is executed.
iface.launch()
|
|