|
import tempfile

import gradio as gr
import requests
import torch
from bs4 import BeautifulSoup
from gtts import gTTS
from PIL import Image
from transformers import pipeline, CLIPProcessor, CLIPModel
|
|
|
|
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") |
|
|
|
|
|
caption_image = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large", device=device) |
|
|
|
|
|
clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(device) |
|
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14") |
|
|
|
|
|
summarization_pipeline = pipeline("summarization", model="google/pegasus-xsum") |
|
|
|
|
|
arabic_summarization_pipeline = pipeline("summarization", model="abdalrahmanshahrour/auto-arabic-summarization") |
|
|
|
|
|
translation_pipeline = pipeline("translation", model="facebook/nllb-200-distilled-600M") |
|
|
|
|
|
def get_wikipedia_summary(landmark_name, language='en'): |
|
url = f"https://{language}.wikipedia.org/wiki/{landmark_name.replace(' ', '_')}" |
|
response = requests.get(url) |
|
soup = BeautifulSoup(response.content, 'html.parser') |
|
|
|
paragraphs = soup.find_all('p') |
|
summary_text = ' '.join([para.get_text() for para in paragraphs if para.get_text()]) |
|
|
|
return summary_text[:2000] |
|
|
|
|
|
def load_landmarks(filename): |
|
landmarks = {} |
|
with open(filename, 'r', encoding='utf-8') as file: |
|
for line in file: |
|
if line.strip(): |
|
english_name, arabic_name = line.strip().split('|') |
|
landmarks[english_name] = arabic_name |
|
return landmarks |
|
|
|
|
|
landmarks_dict = load_landmarks("landmarks.txt") |
|
|
|
|
|
def text_to_speech(text, language='en'): |
|
tts = gTTS(text=text, lang=language) |
|
audio_file = "summary.mp3" |
|
tts.save(audio_file) |
|
return audio_file |
|
|
|
|
|
def generate_caption(image): |
|
return caption_image(image)[0]['generated_text'] |
|
|
|
|
|
def classify_image(image, labels): |
|
inputs = clip_processor(text=labels, images=image, return_tensors="pt", padding=True) |
|
outputs = clip_model(**inputs) |
|
logits_per_image = outputs.logits_per_image |
|
probs = logits_per_image.softmax(dim=1).cpu().detach().numpy()[0] |
|
top_label = labels[probs.argmax()] |
|
top_prob = probs.max() |
|
return top_label, top_prob |
|
|
|
|
|
def summarize_description(full_description, language): |
|
if language == 'ar': |
|
return arabic_summarization_pipeline(full_description, max_length=150, min_length=50, do_sample=False)[0]['summary_text'] |
|
else: |
|
return summarization_pipeline(full_description, max_length=150, min_length=50, do_sample=False)[0]['summary_text'] |
|
|
|
|
|
def translate_results(caption, top_label, top_prob, landmarks_dict, language): |
|
if language == 'ar': |
|
caption_translated = translation_pipeline(caption, src_lang='eng_Latn', tgt_lang='arb_Arab')[0]['translation_text'] |
|
classification_result = translation_pipeline(f"أفضل مطابقة: {landmarks_dict[top_label]} باحتمالية {top_prob:.4f}", src_lang='eng_Latn', tgt_lang='arb_Arab')[0]['translation_text'] |
|
else: |
|
caption_translated = caption |
|
classification_result = f"Best match: {top_label} with probability {top_prob:.4f}" |
|
|
|
return caption_translated, classification_result |
|
|
|
|
|
def process_image(image, language='en'): |
|
try: |
|
|
|
caption = generate_caption(image) |
|
|
|
|
|
top_label, top_prob = classify_image(image, list(landmarks_dict.keys())) |
|
|
|
|
|
landmark_name = top_label if language == 'en' else landmarks_dict[top_label] |
|
full_description = get_wikipedia_summary(landmark_name, language) |
|
|
|
|
|
summarized_description = summarize_description(full_description, language) |
|
|
|
|
|
caption_translated, classification_result = translate_results(caption, top_label, top_prob, landmarks_dict, language) |
|
|
|
|
|
audio_file = text_to_speech(summarized_description, language) |
|
|
|
|
|
if language == 'ar': |
|
return f"<div style='text-align: right;'>{caption_translated}</div>", \ |
|
f"<div style='text-align: right;'>{classification_result}</div>", \ |
|
f"<div style='text-align: right;'>{summarized_description}</div>", \ |
|
audio_file |
|
else: |
|
return caption_translated, classification_result, summarized_description, audio_file |
|
except Exception as e: |
|
return "Error processing the image.", str(e), "", "" |
|
|
|
|
|
english_interface = gr.Interface( |
|
fn=lambda image: process_image(image, language='en'), |
|
inputs=gr.Image(type="pil", label="Upload Image"), |
|
outputs=[ |
|
gr.Textbox(label="Generated Caption"), |
|
gr.Textbox(label="Classification Result"), |
|
gr.Textbox(label="Summarized Description", lines=10), |
|
gr.Audio(label="Summary Audio", type="filepath") |
|
], |
|
title="Landmark Recognition", |
|
description="Upload an image of a landmark, and we will generate a description, classify it, and provide simple information.", |
|
examples=[ |
|
["SOL.jfif"], |
|
["OIP.jfif"] |
|
] |
|
) |
|
|
|
|
|
arabic_interface = gr.Interface( |
|
fn=lambda image: process_image(image, language='ar'), |
|
inputs=gr.Image(type="pil", label="تحميل صورة"), |
|
outputs=[ |
|
gr.HTML(label="التعليق المولد"), |
|
gr.HTML(label="نتيجة التصنيف"), |
|
gr.HTML(label="الوصف الملخص"), |
|
gr.Audio(label="صوت الملخص", type="filepath") |
|
], |
|
title="التعرف على المعالم", |
|
description="قم بتحميل صورة لمعلم، وسنعمل على إنشاء وصف له وتصنيفه وتوفير معلومات بسيطة", |
|
examples=[ |
|
["SOL.jfif"], |
|
["OIP.jfif"] |
|
] |
|
) |
|
|
|
|
|
demo = gr.TabbedInterface( |
|
[english_interface, arabic_interface], |
|
["English", "العربية"] |
|
) |
|
|
|
|
|
demo.launch() |