Spaces:
Running
Running
import gradio as gr | |
from transformers import AutoProcessor, AutoModelForCausalLM | |
from PIL import Image | |
import torch | |
from gtts import gTTS | |
import spacy | |
import requests | |
import nltk.tree | |
import re | |
import os | |
# Carregar o modelo de português do spaCy | |
nlp = spacy.load("pt_core_news_sm") | |
# Chave para o LX-Parser | |
key = "eb159d39469d84f0ff47167a4d89cada" | |
# Funções de manipulação gramatical | |
def invert_adj_n(doc, tags): | |
frase = [] | |
already = False | |
for i in range(len(doc)): | |
if already: | |
already = False | |
continue | |
if doc[i].tag_ != "PUNCT": | |
if tags[i] == "A": | |
if i + 1 < len(tags) and tags[i + 1] == "N": | |
frase.append(doc[i + 1].text) | |
frase.append(doc[i].text) | |
already = True | |
else: | |
frase.append(doc[i].text) | |
else: | |
frase.append(doc[i].text) | |
else: | |
frase.append(doc[i].text) | |
return frase | |
def adjust_adj(doc, tags): | |
frase = [] | |
for i in range(len(doc)): | |
frase.append(doc[i].text) | |
if tags[i] == "A": | |
if i + 1 < len(tags) and tags[i + 1] == "A": | |
frase.append("e") | |
return frase | |
def adjust_art(doc, tags): | |
frase = [] | |
already = False | |
for i in range(len(doc)): | |
if already: | |
already = False | |
continue | |
text = doc[i].text | |
if tags[i] == "ART" and text.lower() == "a": | |
if i + 1 < len(doc): | |
gender = doc[i + 1].morph.get("Gender") | |
number = doc[i + 1].morph.get("Number") | |
if gender and number: | |
if gender[0] == "Masc" and number[0] == "Sing": | |
frase.append("um") | |
elif gender[0] == "Fem" and number[0] == "Sing": | |
frase.append("uma") | |
elif gender[0] == "Masc" and number[0] != "Sing": | |
frase.append("os") | |
else: | |
frase.append("as") | |
else: | |
frase.append(text) | |
else: | |
frase.append(text) | |
else: | |
frase.append(text) | |
return frase | |
def create_sentence(doc, tags, frase): | |
tmp = frase | |
for i in range(len(doc)): | |
text = doc[i].text | |
if doc[i].is_sent_start: | |
tmp[i] = tmp[i].capitalize() | |
if doc[i].tag_ == "PUNCT": | |
tmp[i - 1] += text | |
return tmp | |
def get_productions(texto): | |
format = 'parentheses' | |
url = "https://portulanclarin.net/workbench/lx-parser/api/" | |
request_data = { | |
'method': 'parse', | |
'jsonrpc': '2.0', | |
'id': 0, | |
'params': { | |
'text': texto, | |
'format': format, | |
'key': key, | |
}, | |
} | |
request = requests.post(url, json=request_data) | |
response_data = request.json() | |
if "error" in response_data: | |
print("Error:", response_data["error"]) | |
return [] | |
else: | |
result = response_data["result"] | |
productions = [] | |
tree = nltk.tree.Tree.fromstring(result) | |
for tag in tree.productions(): | |
if len(re.findall(r"'.*'", str(tag))) > 0: | |
productions.append(str(tag)) | |
return productions | |
def get_tags(productions): | |
tags = [] | |
for item in productions: | |
if isinstance(item, str): | |
tags.append(item[:item.find(' ->')]) | |
else: | |
tags.append(item) | |
for item in tags: | |
if "'" in item: | |
tags.remove(item) | |
return tags | |
def reordenar_sentenca(sentenca): | |
if not sentenca.strip(): | |
return sentenca | |
sentenca = sentenca.lower() | |
sentence = get_productions(sentenca) | |
tags = get_tags(sentence) | |
doc = nlp(sentenca) | |
if tags[0] != "ART": | |
sentenca = "A " + sentenca.strip() | |
sentence = get_productions(sentenca) | |
tags = get_tags(sentence) | |
doc = nlp(sentenca) | |
if not sentence: | |
return sentenca.strip() | |
aux = [] | |
if len(tags) > 2 and tags[1] == "N" and tags[2] == "N": | |
aux = sentenca.split() | |
tmp = aux[1] | |
aux[1] = aux[2] | |
aux.insert(2, "de") | |
aux[3] = tmp | |
sentenca = " ".join(aux) | |
sentence = get_productions(sentenca) | |
tags = get_tags(sentence) | |
doc = nlp(sentenca) | |
frase = [] | |
already = False | |
person = 3 | |
tmp_doc = [] | |
for token in doc: | |
tmp_doc.append(token) | |
frase = invert_adj_n(tmp_doc, tags) | |
nova_sentenca = ' '.join(frase) | |
productions = get_productions(nova_sentenca) | |
tags = get_tags(productions) | |
doc = nlp(nova_sentenca) | |
while nova_sentenca != sentenca: | |
frase = invert_adj_n(doc, tags) | |
sentenca = nova_sentenca | |
nova_sentenca = ' '.join(frase) | |
productions = get_productions(nova_sentenca) | |
tags = get_tags(productions) | |
doc = nlp(nova_sentenca) | |
frase = adjust_adj(doc, tags) | |
nova_sentenca = ' '.join(frase) | |
productions = get_productions(nova_sentenca) | |
tags = get_tags(productions) | |
doc = nlp(nova_sentenca) | |
while nova_sentenca != sentenca: | |
frase = adjust_adj(doc, tags) | |
sentenca = nova_sentenca | |
nova_sentenca = ' '.join(frase) | |
productions = get_productions(nova_sentenca) | |
tags = get_tags(productions) | |
doc = nlp(nova_sentenca) | |
frase = adjust_art(doc, tags) | |
sentenca = ' '.join(frase) | |
productions = get_productions(sentenca) | |
tags = get_tags(productions) | |
doc = nlp(sentenca) | |
frase = create_sentence(doc, tags, frase) | |
sentenca_normalizada = "" | |
for i in range(len(frase)): | |
sentenca_normalizada += frase[i] + " " | |
return sentenca_normalizada.strip() | |
def prepare_image(image_path): | |
image = Image.open(image_path).convert("RGB") | |
inputs = processor(images=image, return_tensors="pt").to(device) | |
return image, inputs.pixel_values | |
def generate_caption(pixel_values): | |
model.eval() | |
with torch.no_grad(): | |
generated_ids = model.generate( | |
pixel_values=pixel_values, | |
max_length=50, | |
num_beams=4, | |
early_stopping=True, | |
no_repeat_ngram_size=2 | |
) | |
return processor.batch_decode(generated_ids, skip_special_tokens=True)[0] | |
def text_to_speech_gtts(text, lang='pt'): | |
tts = gTTS(text=text, lang=lang) | |
tts.save("output.mp3") | |
return "output.mp3" | |
# Carregar os modelos | |
processor = AutoProcessor.from_pretrained("histlearn/microsoft-git-portuguese-neuro-simbolic") | |
model = AutoModelForCausalLM.from_pretrained("histlearn/microsoft-git-portuguese-neuro-simbolic") | |
# Configurar o dispositivo (GPU ou CPU) | |
device = "cuda" if torch.cuda.is_available() else "cpu" | |
model.to(device) | |
# Função principal para processar a imagem e gerar a voz | |
def process_image(image): | |
_, pixel_values = prepare_image(image) | |
caption_pt = generate_caption(pixel_values) | |
sentenca_normalizada = reordenar_sentenca(caption_pt) | |
audio_file = text_to_speech_gtts(sentenca_normalizada) | |
productions = get_productions(sentenca_normalizada) | |
return sentenca_normalizada, productions, audio_file | |
# Caminhos para as imagens de exemplo | |
example_image_paths = [ | |
"example1.jpeg", | |
"example2.jpeg", | |
"example3.jpeg" | |
] | |
# Interface Gradio | |
iface = gr.Interface( | |
fn=process_image, | |
inputs=gr.Image(type="filepath"), | |
outputs=[gr.Textbox(label="Sentença Normalizada"), gr.Textbox(label="Classes Gramaticais"), gr.Audio(type="filepath", label="Áudio")], | |
examples=example_image_paths, | |
title="Image to Voice", | |
description="Gera uma descrição em português e a converte em voz a partir de uma imagem." | |
) | |
if __name__ == "__main__": | |
iface.launch() | |