import gradio as gr  # Import Gradio for creating web interfaces
import torch  # Import PyTorch for deep learning
from PIL import Image  # Import PIL for image processing
from transformers import pipeline, CLIPProcessor, CLIPModel  # Import necessary classes from Hugging Face Transformers
import requests  # Import requests for making HTTP requests
from bs4 import BeautifulSoup  # Import BeautifulSoup for web scraping
from gtts import gTTS  # Import gTTS for text-to-speech conversion

# Define the device to use (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the BLIP model for image captioning
caption_image = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large", device=device)

# Load CLIP model for image classification
clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

# Load the English summarization model
summarization_pipeline = pipeline("summarization", model="google/pegasus-xsum")

# Load the Arabic summarization model
arabic_summarization_pipeline = pipeline("summarization", model="abdalrahmanshahrour/auto-arabic-summarization")

# Load the translation model
translation_pipeline = pipeline("translation", model="facebook/nllb-200-distilled-600M")

# Function to fetch long texts from Wikipedia
def get_wikipedia_summary(landmark_name, language='en'):
    url = f"https://{language}.wikipedia.org/wiki/{landmark_name.replace(' ', '_')}"  # Construct the URL
    response = requests.get(url, timeout=10)  # Make an HTTP GET request to fetch the page (with a timeout to avoid hanging)
    soup = BeautifulSoup(response.content, 'html.parser')  # Parse the HTML content with BeautifulSoup

    paragraphs = soup.find_all('p')  # Extract all paragraph elements
    summary_text = ' '.join([para.get_text() for para in paragraphs if para.get_text()])  # Join text from all paragraphs

    return summary_text[:2000]  # Return the first 2000 characters of the summary
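
# Illustrative usage (the landmark name below is a hypothetical example):
#   get_wikipedia_summary("Eiffel Tower") requests
#   https://en.wikipedia.org/wiki/Eiffel_Tower and returns the first
#   2000 characters of the article's paragraph text.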

# Function to load landmarks from an external file
def load_landmarks(filename):
    landmarks = {}
    with open(filename, 'r', encoding='utf-8') as file:  # Open the file in read mode
        for line in file:
            if line.strip():
                english_name, arabic_name = line.strip().split('|', 1)  # Split on the first '|' only, so extra delimiters don't break parsing
                landmarks[english_name] = arabic_name  # Add to the dictionary
    return landmarks  # Return the dictionary of landmarks

# Load landmarks from the file
landmarks_dict = load_landmarks("landmarks.txt")
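
# Expected landmarks.txt format, one "English name|Arabic name" pair per line,
# as implied by the split('|') above (the entry below is illustrative only):
#   Eiffel Tower|برج إيفل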

# Function to convert text to speech
def text_to_speech(text, language='en'):
    tts = gTTS(text=text, lang=language)  # Create a gTTS object for text-to-speech
    audio_file = "summary.mp3"  # Define the audio file name
    tts.save(audio_file)  # Save the audio file
    return audio_file  # Return the path to the audio file

# Function to generate a caption for the image
def generate_caption(image):
    return caption_image(image)[0]['generated_text']  # Get generated caption from the model

# Function to classify the image using the CLIP model
def classify_image(image, labels):
    inputs = clip_processor(text=labels, images=image, return_tensors="pt", padding=True)  # Prepare inputs for the CLIP model
    inputs = {key: value.to(device) for key, value in inputs.items()}  # Move input tensors to the same device as the model
    with torch.no_grad():  # No gradients are needed for inference
        outputs = clip_model(**inputs)  # Run the CLIP forward pass
    logits_per_image = outputs.logits_per_image  # Image-text similarity logits
    probs = logits_per_image.softmax(dim=1).cpu().numpy()[0]  # Convert logits to probabilities
    top_label = labels[probs.argmax()]  # Get the label with the highest probability
    top_prob = probs.max()  # Get the highest probability value
    return top_label, top_prob  # Return top label and probability
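
# Illustrative usage (label and probability values are hypothetical):
#   label, prob = classify_image(img, list(landmarks_dict.keys()))
#   # e.g. ("Eiffel Tower", 0.97)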

# Function to summarize the description
def summarize_description(full_description, language):
    if language == 'ar':
        return arabic_summarization_pipeline(full_description, max_length=150, min_length=50, do_sample=False, truncation=True)[0]['summary_text']  # Summarize in Arabic, truncating over-long inputs
    else:
        return summarization_pipeline(full_description, max_length=150, min_length=50, do_sample=False, truncation=True)[0]['summary_text']  # Summarize in English, truncating over-long inputs

# Function to translate the caption and classification result
def translate_results(caption, top_label, top_prob, landmarks_dict, language):
    if language == 'ar':
        caption_translated = translation_pipeline(caption, src_lang='eng_Latn', tgt_lang='arb_Arab')[0]['translation_text']  # Translate caption to Arabic
        classification_result = f"أفضل مطابقة: {landmarks_dict[top_label]} باحتمالية {top_prob:.4f}"  # Build the result directly in Arabic; passing it through the translator with src_lang='eng_Latn' would be incorrect
    else:
        caption_translated = caption  # Keep caption in English
        classification_result = f"Best match: {top_label} with probability {top_prob:.4f}"  # Create English classification result

    return caption_translated, classification_result  # Return translated results

# Function to process the image and generate results
def process_image(image, language='en'):
    try:
        # Generate caption for the image
        caption = generate_caption(image)  # Call the caption generation function

        # Classify the image
        top_label, top_prob = classify_image(image, list(landmarks_dict.keys()))  # Use keys for classification

        # Determine the appropriate name to use based on the language
        landmark_name = top_label if language == 'en' else landmarks_dict[top_label]
        full_description = get_wikipedia_summary(landmark_name, language)  # Get the Wikipedia summary for the top label

        # Summarize the full description
        summarized_description = summarize_description(full_description, language)  # Call the summarization function

        # Translate caption and classification result
        caption_translated, classification_result = translate_results(caption, top_label, top_prob, landmarks_dict, language)  # Call the translation function

        # Convert the summarized description to speech
        audio_file = text_to_speech(summarized_description, language)  # Convert summary to audio

        # Return results formatted for Arabic
        if language == 'ar':
            return f"<div style='text-align: right;'>{caption_translated}</div>", \
                   f"<div style='text-align: right;'>{classification_result}</div>", \
                   f"<div style='text-align: right;'>{summarized_description}</div>", \
                   audio_file  # Return formatted results for Arabic
        else:
            return caption_translated, classification_result, summarized_description, audio_file  # Return results for English
    except Exception as e:
        return "Error processing the image.", str(e), "", ""  # Return error message if any exception occurs

# Create Gradio interface for English
english_interface = gr.Interface(
    fn=lambda image: process_image(image, language='en'),  # Function to call on image upload
    inputs=gr.Image(type="pil", label="Upload Image"),  # Input field for image upload
    outputs=[  # Define output fields
        gr.Textbox(label="Generated Caption"),  # Output for generated caption
        gr.Textbox(label="Classification Result"),  # Output for classification result
        gr.Textbox(label="Summarized Description", lines=10),  # Output for summarized description
        gr.Audio(label="Summary Audio", type="filepath")  # Output for audio summary
    ],
    title="Landmark Recognition",  # Title of the interface
    description="Upload an image of a landmark, and we will generate a description, classify it, and provide simple information.",  # Description of the tool
    examples=[  # Examples for user
        ["SOL.jfif"],
        ["OIP.jfif"]
    ]
)

# Create Gradio interface for Arabic
arabic_interface = gr.Interface(
    fn=lambda image: process_image(image, language='ar'),  # Function to call on image upload
    inputs=gr.Image(type="pil", label="تحميل صورة"),  # Input field for image upload in Arabic
    outputs=[  # Define output fields
        gr.HTML(label="التعليق المولد"),  # Output for generated caption in Arabic
        gr.HTML(label="نتيجة التصنيف"),  # Output for classification result in Arabic
        gr.HTML(label="الوصف الملخص"),  # Output for summarized description in Arabic
        gr.Audio(label="صوت الملخص", type="filepath")  # Output for audio summary in Arabic
    ],
    title="التعرف على المعالم",  # Title of the interface in Arabic
    description="قم بتحميل صورة لمعلم، وسنعمل على إنشاء وصف له وتصنيفه وتوفير معلومات بسيطة",  # Description of the tool in Arabic
    examples=[  # Examples for user
        ["SOL.jfif"],
        ["OIP.jfif"]
    ]
)

# Merge all interfaces into a tabbed interface
demo = gr.TabbedInterface(
    [english_interface, arabic_interface],  # List of interfaces to include
    ["English", "العربية"]  # Names of the tabs
)

# Launch the interface
demo.launch()  # Start the Gradio application.
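
# Optional: demo.launch(share=True) (a standard Gradio parameter) would also
# expose a temporary public URL for the app.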