import gradio as gr  # Gradio for building the web interface
import torch  # PyTorch for device handling
from PIL import Image  # PIL for image processing
from transformers import pipeline, CLIPProcessor, CLIPModel  # Hugging Face Transformers classes
import requests  # HTTP requests for fetching Wikipedia pages
from bs4 import BeautifulSoup  # HTML parsing for web scraping
from gtts import gTTS  # Text-to-speech conversion

# Select the device to use (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the BLIP model for image captioning
caption_image = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large", device=device)

# Load the CLIP model and processor for image classification
clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

# Load the English summarization model
summarization_pipeline = pipeline("summarization", model="google/pegasus-xsum")

# Load the Arabic summarization model
arabic_summarization_pipeline = pipeline("summarization", model="abdalrahmanshahrour/auto-arabic-summarization")

# Load the translation model
translation_pipeline = pipeline("translation", model="facebook/nllb-200-distilled-600M")

# Fetch a long description of a landmark from Wikipedia
def get_wikipedia_summary(landmark_name, language='en'):
    url = f"https://{language}.wikipedia.org/wiki/{landmark_name.replace(' ', '_')}"  # Construct the article URL
    response = requests.get(url)  # Fetch the page
    soup = BeautifulSoup(response.content, 'html.parser')  # Parse the HTML content
    paragraphs = soup.find_all('p')  # Extract all paragraph elements
    summary_text = ' '.join(para.get_text() for para in paragraphs if para.get_text())  # Join text from all paragraphs
    return summary_text[:2000]  # Return the first 2000 characters of the summary

# Load the landmark name mapping from an external file
def load_landmarks(filename):
    landmarks = {}
    with open(filename, 'r', encoding='utf-8') as file:  # Open the file in read mode
        for line in file:
            if line.strip():
                english_name, arabic_name = line.strip().split('|')  # Split on the delimiter
                landmarks[english_name] = arabic_name  # Map English name to Arabic name
    return landmarks  # Return the dictionary of landmarks

# Load landmarks from the file
landmarks_dict = load_landmarks("landmarks.txt")

# Convert text to speech and save it as an MP3 file
def text_to_speech(text, language='en'):
    tts = gTTS(text=text, lang=language)  # Create a gTTS object for text-to-speech
    audio_file = "summary.mp3"  # Audio file name
    tts.save(audio_file)  # Save the audio file
    return audio_file  # Return the path to the audio file

# Generate a caption for the image with BLIP
def generate_caption(image):
    return caption_image(image)[0]['generated_text']  # Get the generated caption from the model

# Classify the image against the landmark labels using CLIP
def classify_image(image, labels):
    inputs = clip_processor(text=labels, images=image, return_tensors="pt", padding=True).to(device)  # Prepare inputs and move them to the same device as the model
    outputs = clip_model(**inputs)  # Run the CLIP model
    logits_per_image = outputs.logits_per_image  # Image-to-text logits
    probs = logits_per_image.softmax(dim=1).cpu().detach().numpy()[0]  # Per-label probabilities
    top_label = labels[probs.argmax()]  # Label with the highest probability
    top_prob = probs.max()  # Highest probability value
    return top_label, top_prob  # Return the top label and its probability
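# Minimal usage sketch for the pieces above. Assumptions: "landmarks.txt" sits next to
# this script and holds one "English name|Arabic name" pair per line, matching the '|'
# split in load_landmarks, e.g.:
#   Eiffel Tower|برج إيفل
#   Great Wall of China|سور الصين العظيم
# With a hypothetical test image "example.jpg", classification can then be smoke-tested:
#   img = Image.open("example.jpg")
#   label, prob = classify_image(img, list(landmarks_dict.keys()))
#   print(label, prob)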
# Summarize the full description in the requested language
def summarize_description(full_description, language):
    if language == 'ar':
        return arabic_summarization_pipeline(full_description, max_length=150, min_length=50, do_sample=False)[0]['summary_text']  # Summarize in Arabic
    else:
        return summarization_pipeline(full_description, max_length=150, min_length=50, do_sample=False)[0]['summary_text']  # Summarize in English

# Translate the caption and build the classification result string
def translate_results(caption, top_label, top_prob, landmarks_dict, language):
    if language == 'ar':
        caption_translated = translation_pipeline(caption, src_lang='eng_Latn', tgt_lang='arb_Arab')[0]['translation_text']  # Translate the caption to Arabic
        classification_result = f"أفضل مطابقة: {landmarks_dict[top_label]} باحتمالية {top_prob:.4f}"  # Already Arabic, so no translation needed
    else:
        caption_translated = caption  # Keep the caption in English
        classification_result = f"Best match: {top_label} with probability {top_prob:.4f}"  # English classification result
    return caption_translated, classification_result  # Return the localized results

# Process the image and generate all results
def process_image(image, language='en'):
    try:
        # Generate a caption for the image
        caption = generate_caption(image)
        # Classify the image against the English landmark names
        top_label, top_prob = classify_image(image, list(landmarks_dict.keys()))
        # Pick the landmark name matching the interface language
        landmark_name = top_label if language == 'en' else landmarks_dict[top_label]
        full_description = get_wikipedia_summary(landmark_name, language)  # Fetch the Wikipedia text for the top label
        # Summarize the full description
        summarized_description = summarize_description(full_description, language)
        # Translate the caption and classification result
        caption_translated, classification_result = translate_results(caption, top_label, top_prob, landmarks_dict, language)
        # Convert the summarized description to speech
        audio_file = text_to_speech(summarized_description, language)
        # Wrap the Arabic outputs in right-to-left HTML so the gr.HTML outputs render them correctly
        if language == 'ar':
            return (f"<div dir='rtl'>{caption_translated}</div>",
                    f"<div dir='rtl'>{classification_result}</div>",
                    f"<div dir='rtl'>{summarized_description}</div>",
                    audio_file)
        else:
            return caption_translated, classification_result, summarized_description, audio_file  # Plain results for English
    except Exception as e:
        return "Error processing the image.", str(e), "", None  # Report the error; no audio file on failure

# Create the Gradio interface for English
english_interface = gr.Interface(
    fn=lambda image: process_image(image, language='en'),  # Function to call on image upload
    inputs=gr.Image(type="pil", label="Upload Image"),  # Input field for image upload
    outputs=[
        gr.Textbox(label="Generated Caption"),  # Generated caption
        gr.Textbox(label="Classification Result"),  # Classification result
        gr.Textbox(label="Summarized Description", lines=10),  # Summarized description
        gr.Audio(label="Summary Audio", type="filepath")  # Audio summary
    ],
    title="Landmark Recognition",
    description="Upload an image of a landmark, and we will generate a description, classify it, and provide simple information.",
    examples=[
        ["SOL.jfif"],
        ["OIP.jfif"]
    ]
)

# Create the Gradio interface for Arabic
arabic_interface = gr.Interface(
    fn=lambda image: process_image(image, language='ar'),  # Function to call on image upload
    inputs=gr.Image(type="pil", label="تحميل صورة"),  # Image upload field (Arabic label)
    outputs=[
        gr.HTML(label="التعليق المولد"),  # Generated caption (Arabic)
        gr.HTML(label="نتيجة التصنيف"),  # Classification result (Arabic)
        gr.HTML(label="الوصف الملخص"),  # Summarized description (Arabic)
        gr.Audio(label="صوت الملخص", type="filepath")  # Audio summary (Arabic)
    ],
    title="التعرف على المعالم",  # "Landmark Recognition" in Arabic
    description="قم بتحميل صورة لمعلم، وسنعمل على إنشاء وصف له وتصنيفه وتوفير معلومات بسيطة",  # Arabic description of the tool
    examples=[
        ["SOL.jfif"],
        ["OIP.jfif"]
    ]
)

# Merge both interfaces into a tabbed interface
demo = gr.TabbedInterface(
    [english_interface, arabic_interface],  # Interfaces to include
    ["English", "العربية"]  # Tab names
)

# Launch the Gradio application
demo.launch()