Spaces:

rmayormartins
/

inclusion-visually-impaired-image2speech

Sleeping

App Files Files Community

inclusion-visually-impaired-image2speech / app.py

rmayormartins

Subindo arquivos33137

2f3c50a 8 months ago

raw

history blame

4.13 kB

	import functools
	import os
	import time

	import cv2
	import gradio as gr
	import numpy as np
	import torch
	from gtts import gTTS
	from huggingface_hub import login
	from PIL import Image
	from transformers import BlipProcessor, BlipForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM

	# Read the Hugging Face access token from the environment; None when unset.
	hf_token = os.environ.get("HUGGINGFACE_TOKEN")

	# Authenticate against the Hugging Face Hub only when a token was provided.
	if hf_token:
	    login(token=hf_token)

	# Load the YOLOv5 "small" model through torch.hub. It is kept at module
	# level and called as the object detector inside process_image.
	model = torch.hub.load('ultralytics/yolov5', 'yolov5s')

	# Compute the GLCM (gray-level co-occurrence matrix) contrast of an image
	def calculate_glcm_contrast(image):
	    """Return the GLCM contrast of *image* for the diagonal (+1, +1) offset.

	    Every pixel is paired with its lower-right neighbour. The co-occurrence
	    matrix counts how often gray level ``x`` is paired with gray level ``y``;
	    after normalising it to sum to 1, the contrast is
	    ``sum((x - y) ** 2 * glcm[x, y])``.

	    Args:
	        image: A PIL image or array in RGB channel order, or an already
	            gray 2-D array of non-negative integer gray levels.

	    Returns:
	        The contrast as a float. ``0.0`` when the image has fewer than two
	        rows or two columns, because no pixel pair exists.
	    """
	    pixels = np.array(image)
	    if pixels.ndim == 2:
	        # Already single-channel: nothing to convert.
	        gray_image = pixels
	    else:
	        # np.array() of a PIL image is in RGB order, so RGB2GRAY (not
	        # BGR2GRAY) applies the luminance weights to the right channels.
	        gray_image = cv2.cvtColor(pixels, cv2.COLOR_RGB2GRAY)

	    # Gray level of each pixel and of its lower-right neighbour.
	    first = gray_image[:-1, :-1].astype(np.intp).ravel()
	    second = gray_image[1:, 1:].astype(np.intp).ravel()
	    if first.size == 0:
	        # No pairs: avoid the 0/0 normalisation that would produce NaN.
	        return 0.0

	    # Convert to a Python int before adding 1 so the sum cannot wrap
	    # around in the array's own integer dtype (e.g. uint8 255 + 1).
	    levels = int(gray_image.max()) + 1

	    # Encode each (x, y) pair as one flat index and count occurrences.
	    counts = np.bincount(first * levels + second, minlength=levels * levels)
	    glcm = counts.reshape(levels, levels) / counts.sum()

	    # Squared gray-level difference for every cell of the matrix.
	    level_index = np.arange(levels)
	    squared_diff = (level_index[:, None] - level_index[None, :]) ** 2

	    return float((squared_diff * glcm).sum())

	# Classify an image's colour temperature and texture
	def analyze_image_properties(image):
	    """Return ``(temperature, texture)`` labels for *image*.

	    ``temperature`` is ``'fria'`` when the mean of the per-channel averages
	    is below 128 and ``'quente'`` otherwise. ``texture`` is ``'lisa'`` when
	    the value returned by ``calculate_glcm_contrast`` is below 100 and
	    ``'texturizada'`` otherwise.
	    """
	    # Colour analysis: average over the first axis, then over the second,
	    # leaving one mean value per channel.
	    pixels = cv2.cvtColor(np.array(image), cv2.COLOR_BGR2RGB)
	    first_axis_means = np.average(pixels, axis=0)
	    channel_means = np.average(first_axis_means, axis=0)
	    if np.mean(channel_means) < 128:
	        temperature = 'fria'
	    else:
	        temperature = 'quente'

	    # Texture analysis based on the GLCM contrast.
	    contrast = calculate_glcm_contrast(image)
	    if contrast < 100:
	        texture = 'lisa'
	    else:
	        texture = 'texturizada'

	    return temperature, texture

	@functools.lru_cache(maxsize=1)
	def _load_blip_captioner():
	    """Load the BLIP processor and captioning model once and reuse them."""
	    checkpoint = "Salesforce/blip-image-captioning-base"
	    processor = BlipProcessor.from_pretrained(checkpoint)
	    captioner = BlipForConditionalGeneration.from_pretrained(checkpoint)
	    return processor, captioner


	# Describe an image using BLIP
	def describe_image(image):
	    """Return a text caption for *image* generated by the BLIP model.

	    The processor and model are loaded on the first call and cached, so
	    later calls do not reload them.

	    Args:
	        image: The image handed to the BLIP processor.

	    Returns:
	        The decoded caption with special tokens removed.
	    """
	    processor, captioner = _load_blip_captioner()
	    inputs = processor(image, return_tensors="pt")
	    out = captioner.generate(**inputs)
	    return processor.decode(out[0], skip_special_tokens=True)

	@functools.lru_cache(maxsize=1)
	def _load_en_pt_translator():
	    """Load the translation tokenizer and model once and reuse them."""
	    model_name = 'Helsinki-NLP/opus-mt-tc-big-en-pt'
	    tokenizer = AutoTokenizer.from_pretrained(model_name)
	    translator = AutoModelForSeq2SeqLM.from_pretrained(model_name)
	    return tokenizer, translator


	# Translate a description into Portuguese
	def translate_description(description):
	    """Return *description* translated by the opus-mt-tc-big-en-pt model.

	    The tokenizer and model are loaded on the first call and cached, so
	    later calls do not reload them.

	    Args:
	        description: The text to translate.

	    Returns:
	        The decoded translation with special tokens removed.
	    """
	    tokenizer, translator = _load_en_pt_translator()
	    encoded = tokenizer(description, return_tensors="pt", padding=True)
	    translated = translator.generate(**encoded)
	    return tokenizer.decode(translated[0], skip_special_tokens=True)

	# Main function: process an image and produce the spoken description
	def process_image(image):
	    """Detect objects in *image*, describe it, and synthesise the description.

	    Steps: run the module-level detector and render its detections, label
	    the texture and colour temperature, caption the image, translate the
	    caption, then save the final sentence as speech to ``output.mp3``.

	    Args:
	        image: The input image (the Gradio interface supplies a PIL image).

	    Returns:
	        A tuple ``(annotated PIL image, final description, "output.mp3")``.

	    Raises:
	        gTTSError: If saving the audio fails for a reason other than HTTP
	            429, or if it still fails with 429 on the fifth attempt.
	    """
	    # gTTSError lives in gtts.tts; the imported ``gTTS`` name is the class
	    # itself and has no ``tts`` attribute to reach the exception through.
	    from gtts.tts import gTTSError

	    # Object detection
	    results = model(image)
	    detected_image = results.render()[0]

	    # Texture and colour-temperature analysis
	    temperature, texture = analyze_image_properties(image)

	    # Image description
	    description = describe_image(image)
	    translated_description = translate_description(description)

	    # Build the final description
	    final_description = f"{translated_description}. A textura é {texture} e a temperatura de cor é {temperature}."

	    # Text to speech, retrying when the service answers "too many requests"
	    tts = gTTS(text=final_description, lang='pt')
	    max_attempts = 5
	    for attempt in range(1, max_attempts + 1):
	        try:
	            tts.save("output.mp3")
	            break
	        except gTTSError as e:
	            # The HTTP response is stored as ``rsp`` and may be None.
	            status = getattr(e.rsp, "status_code", None)
	            if status != 429 or attempt == max_attempts:
	                # Do not fall through and return a stale or missing file.
	                raise
	            print("Too many requests. Waiting before retrying...")
	            time.sleep(5)

	    # Return the annotated image, the description and the audio file path
	    return Image.fromarray(detected_image), final_description, "output.mp3"

	# Path of the example image offered in the interface
	example_image_path = "example1.JPG"

	# Gradio interface: one PIL image in; annotated image, text and audio out
	image_input = gr.Image(type="pil")
	output_components = [
	    gr.Image(type="pil"),
	    gr.Textbox(),
	    gr.Audio(type="filepath"),
	]
	iface = gr.Interface(
	    fn=process_image,
	    inputs=image_input,
	    outputs=output_components,
	    examples=[example_image_path],
	)

	iface.launch()