Xalphinions committed
Commit 5900417 · verified · 1 Parent(s): a14089e

Upload folder using huggingface_hub
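For context, a commit like this is typically produced with the upload_folder API of huggingface_hub. The sketch below is illustrative only: the repo id and folder path are assumed placeholders, not values taken from this commit.

    from huggingface_hub import HfApi

    api = HfApi()  # uses the token stored by `huggingface-cli login`

    # Upload the local project folder as a single commit; files matched by the
    # .gitattributes LFS rules (e.g. *.pt, temp/temp_audio.wav) go through Git LFS.
    api.upload_folder(
        folder_path=".",                    # local folder to upload (placeholder)
        repo_id="Xalphinions/watermelon2",  # assumed Space id, for illustration
        repo_type="space",
        commit_message="Upload folder using huggingface_hub",
    )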
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+temp/temp_audio.wav filter=lfs diff=lfs merge=lfs -text
+temp/temp_image.jpg filter=lfs diff=lfs merge=lfs -text
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
README.md CHANGED
@@ -1,12 +1,6 @@
 ---
-title: Watermelon2
-emoji: 🚀
-colorFrom: yellow
-colorTo: red
-sdk: gradio
-sdk_version: 5.23.3
+title: watermelon2
 app_file: app.py
-pinned: false
+sdk: gradio
+sdk_version: 5.9.1
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

__pycache__/infer_watermelon.cpython-310.pyc ADDED
Binary file (4.39 kB).
 
__pycache__/train_watermelon.cpython-310.pyc ADDED
Binary file (6.74 kB).
 
app.py ADDED
@@ -0,0 +1,318 @@
1
+ import os
2
+ import sys
3
+ import torch
4
+ import numpy as np
5
+ import gradio as gr
6
+ import torchaudio
7
+ import torchvision
8
+
9
+ # Import Gradio Spaces GPU decorator
10
+ try:
11
+ from gradio import spaces
12
+ HAS_SPACES = True
13
+ print("\033[92mINFO\033[0m: Gradio Spaces detected, GPU acceleration will be enabled")
14
+ except ImportError:
15
+ HAS_SPACES = False
16
+ print("\033[93mWARN\033[0m: gradio.spaces not available, running without GPU optimization")
17
+
18
+ # Add parent directory to path to import preprocess functions
19
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
20
+
21
+ # Import functions from infer_watermelon.py and train_watermelon for the model
22
+ from train_watermelon import WatermelonModel
23
+
24
+ # Modified version of process_audio_data specifically for the app to handle various tensor shapes
25
+ def app_process_audio_data(waveform, sample_rate):
26
+ """Modified version of process_audio_data for the app that handles different tensor dimensions"""
27
+ try:
28
+ print(f"\033[92mDEBUG\033[0m: Processing audio - Initial shape: {waveform.shape}, Sample rate: {sample_rate}")
29
+
30
+ # Handle different tensor dimensions
31
+ if waveform.dim() == 3:
32
+ print(f"\033[92mDEBUG\033[0m: Found 3D tensor, converting to 2D")
33
+ # For 3D tensor, take the first item (batch dimension)
34
+ waveform = waveform[0]
35
+
36
+ if waveform.dim() == 2:
37
+ # Use the first channel for stereo audio
38
+ waveform = waveform[0]
39
+ print(f"\033[92mDEBUG\033[0m: Using first channel, new shape: {waveform.shape}")
40
+
41
+ # Resample to 16kHz if needed
42
+ resample_rate = 16000
43
+ if sample_rate != resample_rate:
44
+ print(f"\033[92mDEBUG\033[0m: Resampling from {sample_rate}Hz to {resample_rate}Hz")
45
+ waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=resample_rate)(waveform)
46
+
47
+ # Ensure 3 seconds of audio
48
+ if waveform.size(0) < 3 * resample_rate:
49
+ print(f"\033[92mDEBUG\033[0m: Padding audio from {waveform.size(0)} to {3 * resample_rate} samples")
50
+ waveform = torch.nn.functional.pad(waveform, (0, 3 * resample_rate - waveform.size(0)))
51
+ else:
52
+ print(f"\033[92mDEBUG\033[0m: Trimming audio from {waveform.size(0)} to {3 * resample_rate} samples")
53
+ waveform = waveform[: 3 * resample_rate]
54
+
55
+ # Apply MFCC transformation
56
+ print(f"\033[92mDEBUG\033[0m: Applying MFCC transformation")
57
+ mfcc_transform = torchaudio.transforms.MFCC(
58
+ sample_rate=resample_rate,
59
+ n_mfcc=13,
60
+ melkwargs={
61
+ "n_fft": 256,
62
+ "win_length": 256,
63
+ "hop_length": 128,
64
+ "n_mels": 40,
65
+ }
66
+ )
67
+
68
+ mfcc = mfcc_transform(waveform)
69
+ print(f"\033[92mDEBUG\033[0m: MFCC output shape: {mfcc.shape}")
70
+
71
+ return mfcc
72
+ except Exception as e:
73
+ import traceback
74
+ print(f"\033[91mERR!\033[0m: Error in audio processing: {e}")
75
+ print(traceback.format_exc())
76
+ return None
77
+
78
+ # Similarly for images, but let's import the original one
79
+ from preprocess import process_image_data
80
+
81
+ # Define prediction function
82
+ def predict_sweetness(audio, image, model_path):
83
+ """Predict sweetness of a watermelon from audio and image input"""
84
+ try:
85
+ # Now check CUDA availability inside the GPU-decorated function
86
+ if torch.cuda.is_available():
87
+ device = torch.device("cuda")
88
+ print(f"\033[92mINFO\033[0m: CUDA is available. Using device: {device}")
89
+ else:
90
+ device = torch.device("cpu")
91
+ print(f"\033[92mINFO\033[0m: CUDA is not available. Using device: {device}")
92
+
93
+ # Load model inside the function to ensure it's on the correct device
94
+ model = WatermelonModel().to(device)
95
+ model.load_state_dict(torch.load(model_path, map_location=device))
96
+ model.eval()
97
+ print(f"\033[92mINFO\033[0m: Loaded model from {model_path}")
98
+
99
+ # Debug information about input types
100
+ print(f"\033[92mDEBUG\033[0m: Audio input type: {type(audio)}")
101
+ print(f"\033[92mDEBUG\033[0m: Audio input shape/length: {len(audio)}")
102
+ print(f"\033[92mDEBUG\033[0m: Image input type: {type(image)}")
103
+ if isinstance(image, np.ndarray):
104
+ print(f"\033[92mDEBUG\033[0m: Image input shape: {image.shape}")
105
+
106
+ # Handle different audio input formats
107
+ if isinstance(audio, tuple) and len(audio) == 2:
108
+ # Standard Gradio format: (sample_rate, audio_data)
109
+ sample_rate, audio_data = audio
110
+ print(f"\033[92mDEBUG\033[0m: Audio sample rate: {sample_rate}")
111
+ print(f"\033[92mDEBUG\033[0m: Audio data shape: {audio_data.shape}")
112
+ elif isinstance(audio, tuple) and len(audio) > 2:
113
+ # Sometimes Gradio returns (sample_rate, audio_data, other_info...)
114
+ sample_rate, audio_data = audio[0], audio[-1]
115
+ print(f"\033[92mDEBUG\033[0m: Audio sample rate: {sample_rate}")
116
+ print(f"\033[92mDEBUG\033[0m: Audio data shape: {audio_data.shape}")
117
+ elif isinstance(audio, str):
118
+ # Direct path to audio file
119
+ audio_data, sample_rate = torchaudio.load(audio)
120
+ print(f"\033[92mDEBUG\033[0m: Loaded audio from path with shape: {audio_data.shape}")
121
+ else:
122
+ return f"Error: Unsupported audio format. Got {type(audio)}"
123
+
124
+ # Create a temporary file path for the audio and image
125
+ temp_dir = "temp"
126
+ os.makedirs(temp_dir, exist_ok=True)
127
+
128
+ temp_audio_path = os.path.join(temp_dir, "temp_audio.wav")
129
+ temp_image_path = os.path.join(temp_dir, "temp_image.jpg")
130
+
131
+ # Import necessary libraries
132
+ from PIL import Image
133
+
134
+ # Audio handling - direct processing from the data in memory
135
+ if isinstance(audio_data, np.ndarray):
136
+ # Convert numpy array to tensor
137
+ print(f"\033[92mDEBUG\033[0m: Converting numpy audio with shape {audio_data.shape} to tensor")
138
+ audio_tensor = torch.tensor(audio_data).float()
139
+
140
+ # Handle different audio dimensions
141
+ if audio_data.ndim == 1:
142
+ # Single channel audio
143
+ audio_tensor = audio_tensor.unsqueeze(0)
144
+ elif audio_data.ndim == 2:
145
+ # Ensure channels are first dimension
146
+ if audio_data.shape[0] > audio_data.shape[1]:
147
+ # More rows than columns, probably (samples, channels)
148
+ audio_tensor = torch.tensor(audio_data.T).float()
149
+ else:
150
+ # Already a tensor
151
+ audio_tensor = audio_data.float()
152
+
153
+ print(f"\033[92mDEBUG\033[0m: Audio tensor shape before processing: {audio_tensor.shape}")
154
+
155
+ # Skip saving/loading and process directly
156
+ mfcc = app_process_audio_data(audio_tensor, sample_rate)
157
+ print(f"\033[92mDEBUG\033[0m: MFCC tensor shape after processing: {mfcc.shape if mfcc is not None else None}")
158
+
159
+ # Image handling
160
+ if isinstance(image, np.ndarray):
161
+ print(f"\033[92mDEBUG\033[0m: Converting numpy image with shape {image.shape} to PIL")
162
+ pil_image = Image.fromarray(image)
163
+ pil_image.save(temp_image_path)
164
+ print(f"\033[92mDEBUG\033[0m: Saved image to {temp_image_path}")
165
+ elif isinstance(image, str):
166
+ # If image is already a path
167
+ temp_image_path = image
168
+ print(f"\033[92mDEBUG\033[0m: Using provided image path: {temp_image_path}")
169
+ else:
170
+ return f"Error: Unsupported image format. Got {type(image)}"
171
+
172
+ # Process image
173
+ print(f"\033[92mDEBUG\033[0m: Loading and preprocessing image from {temp_image_path}")
174
+ image_tensor = torchvision.io.read_image(temp_image_path)
175
+ print(f"\033[92mDEBUG\033[0m: Loaded image shape: {image_tensor.shape}")
176
+ image_tensor = image_tensor.float()
177
+ processed_image = process_image_data(image_tensor)
178
+ print(f"\033[92mDEBUG\033[0m: Processed image shape: {processed_image.shape if processed_image is not None else None}")
179
+
180
+ # Add batch dimension for inference and move to device
181
+ if mfcc is not None:
182
+ mfcc = mfcc.unsqueeze(0).to(device)
183
+ print(f"\033[92mDEBUG\033[0m: Final MFCC shape with batch dimension: {mfcc.shape}")
184
+
185
+ if processed_image is not None:
186
+ processed_image = processed_image.unsqueeze(0).to(device)
187
+ print(f"\033[92mDEBUG\033[0m: Final image shape with batch dimension: {processed_image.shape}")
188
+
189
+ # Run inference
190
+ print(f"\033[92mDEBUG\033[0m: Running inference on device: {device}")
191
+ if mfcc is not None and processed_image is not None:
192
+ with torch.no_grad():
193
+ sweetness = model(mfcc, processed_image)
194
+ print(f"\033[92mDEBUG\033[0m: Prediction successful: {sweetness.item()}")
195
+ else:
196
+ return "Error: Failed to process inputs. Please check the debug logs."
197
+
198
+ # Format the result
199
+ if sweetness is not None:
200
+ result = f"Predicted Sweetness: {sweetness.item():.2f}/13"
201
+
202
+ # Add a qualitative description
203
+ if sweetness.item() < 9:
204
+ result += "\n\nThis watermelon is not very sweet. You might want to choose another one."
205
+ elif sweetness.item() < 10:
206
+ result += "\n\nThis watermelon has moderate sweetness."
207
+ elif sweetness.item() < 11:
208
+ result += "\n\nThis watermelon is sweet! A good choice."
209
+ else:
210
+ result += "\n\nThis watermelon is very sweet! Excellent choice!"
211
+
212
+ return result
213
+ else:
214
+ return "Error: Could not predict sweetness. Please try again with different inputs."
215
+
216
+ except Exception as e:
217
+ import traceback
218
+ error_msg = f"Error: {str(e)}\n\n"
219
+ error_msg += traceback.format_exc()
220
+ print(f"\033[91mERR!\033[0m: {error_msg}")
221
+ return error_msg
222
+
223
+ # Apply GPU decorator if available in Gradio Spaces environment
224
+ if HAS_SPACES:
225
+ predict_sweetness_gpu = spaces.GPU(predict_sweetness)
226
+ print("\033[92mINFO\033[0m: GPU optimization enabled for prediction function")
227
+ else:
228
+ predict_sweetness_gpu = predict_sweetness
229
+
230
+ def create_app(model_path):
231
+ """Create and launch the Gradio interface"""
232
+ # Define the prediction function with model path
233
+ def predict_fn(audio, image):
234
+ if HAS_SPACES:
235
+ # Use GPU-optimized function if available
236
+ return predict_sweetness_gpu(audio, image, model_path)
237
+ else:
238
+ # Use regular function otherwise
239
+ return predict_sweetness(audio, image, model_path)
240
+
241
+ # Create Gradio interface
242
+ with gr.Blocks(title="Watermelon Sweetness Predictor", theme=gr.themes.Soft()) as interface:
243
+ gr.Markdown("# 🍉 Watermelon Sweetness Predictor")
244
+ gr.Markdown("""
245
+ This app predicts the sweetness of a watermelon based on its sound and appearance.
246
+
247
+ ## Instructions:
248
+ 1. Upload or record an audio of tapping the watermelon
249
+ 2. Upload or capture an image of the watermelon
250
+ 3. Click 'Predict' to get the sweetness estimation
251
+ """)
252
+
253
+ with gr.Row():
254
+ with gr.Column():
255
+ audio_input = gr.Audio(label="Upload or Record Audio", type="numpy")
256
+ image_input = gr.Image(label="Upload or Capture Image")
257
+ submit_btn = gr.Button("Predict Sweetness", variant="primary")
258
+
259
+ with gr.Column():
260
+ output = gr.Textbox(label="Prediction Results", lines=6)
261
+
262
+ submit_btn.click(
263
+ fn=predict_fn,
264
+ inputs=[audio_input, image_input],
265
+ outputs=output
266
+ )
267
+
268
+ gr.Markdown("""
269
+ ## How it works
270
+
271
+ The app uses a deep learning model that combines:
272
+ - Audio analysis using MFCC features and LSTM neural network
273
+ - Image analysis using ResNet-50 convolutional neural network
274
+
275
+ The model was trained on a dataset of watermelons with known sweetness values.
276
+
277
+ ## Tips for best results
278
+ - For audio: Tap the watermelon with your knuckle and record the sound
279
+ - For image: Take a clear photo of the whole watermelon in good lighting
280
+ """)
281
+
282
+ return interface
283
+
284
+ if __name__ == "__main__":
285
+ import argparse
286
+
287
+ parser = argparse.ArgumentParser(description="Watermelon Sweetness Prediction App")
288
+ parser.add_argument(
289
+ "--model_path",
290
+ type=str,
291
+ default="models/watermelon_model_final.pt",
292
+ help="Path to the trained model file"
293
+ )
294
+ parser.add_argument(
295
+ "--share",
296
+ action="store_true",
297
+ help="Create a shareable link for the app"
298
+ )
299
+ parser.add_argument(
300
+ "--debug",
301
+ action="store_true",
302
+ help="Enable verbose debug output"
303
+ )
304
+
305
+ args = parser.parse_args()
306
+
307
+ if args.debug:
308
+ print(f"\033[92mINFO\033[0m: Debug mode enabled")
309
+
310
+ # Check if model exists
311
+ if not os.path.exists(args.model_path):
312
+ print(f"\033[91mERR!\033[0m: Model not found at {args.model_path}")
313
+ print("\033[92mINFO\033[0m: Please train a model first or provide a valid model path")
314
+ sys.exit(1)
315
+
316
+ # Create and launch the app
317
+ app = create_app(args.model_path)
318
+ app.launch(share=args.share)
app_local_backup.py ADDED
@@ -0,0 +1,290 @@
1
+ import os
2
+ import sys
3
+ import torch
4
+ import numpy as np
5
+ import gradio as gr
6
+ import torchaudio
7
+ import torchvision
8
+
9
+ # Add parent directory to path to import preprocess functions
10
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
11
+
12
+ # Import functions from infer_watermelon.py
13
+ from infer_watermelon import load_model
14
+
15
+ # Modified version of process_audio_data specifically for the app to handle various tensor shapes
16
+ def app_process_audio_data(waveform, sample_rate):
17
+ """Modified version of process_audio_data for the app that handles different tensor dimensions"""
18
+ try:
19
+ print(f"\033[92mDEBUG\033[0m: Processing audio - Initial shape: {waveform.shape}, Sample rate: {sample_rate}")
20
+
21
+ # Handle different tensor dimensions
22
+ if waveform.dim() == 3:
23
+ print(f"\033[92mDEBUG\033[0m: Found 3D tensor, converting to 2D")
24
+ # For 3D tensor, take the first item (batch dimension)
25
+ waveform = waveform[0]
26
+
27
+ if waveform.dim() == 2:
28
+ # Use the first channel for stereo audio
29
+ waveform = waveform[0]
30
+ print(f"\033[92mDEBUG\033[0m: Using first channel, new shape: {waveform.shape}")
31
+
32
+ # Resample to 16kHz if needed
33
+ resample_rate = 16000
34
+ if sample_rate != resample_rate:
35
+ print(f"\033[92mDEBUG\033[0m: Resampling from {sample_rate}Hz to {resample_rate}Hz")
36
+ waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=resample_rate)(waveform)
37
+
38
+ # Ensure 3 seconds of audio
39
+ if waveform.size(0) < 3 * resample_rate:
40
+ print(f"\033[92mDEBUG\033[0m: Padding audio from {waveform.size(0)} to {3 * resample_rate} samples")
41
+ waveform = torch.nn.functional.pad(waveform, (0, 3 * resample_rate - waveform.size(0)))
42
+ else:
43
+ print(f"\033[92mDEBUG\033[0m: Trimming audio from {waveform.size(0)} to {3 * resample_rate} samples")
44
+ waveform = waveform[: 3 * resample_rate]
45
+
46
+ # Apply MFCC transformation
47
+ print(f"\033[92mDEBUG\033[0m: Applying MFCC transformation")
48
+ mfcc_transform = torchaudio.transforms.MFCC(
49
+ sample_rate=resample_rate,
50
+ n_mfcc=13,
51
+ melkwargs={
52
+ "n_fft": 256,
53
+ "win_length": 256,
54
+ "hop_length": 128,
55
+ "n_mels": 40,
56
+ }
57
+ )
58
+
59
+ mfcc = mfcc_transform(waveform)
60
+ print(f"\033[92mDEBUG\033[0m: MFCC output shape: {mfcc.shape}")
61
+
62
+ return mfcc
63
+ except Exception as e:
64
+ import traceback
65
+ print(f"\033[91mERR!\033[0m: Error in audio processing: {e}")
66
+ print(traceback.format_exc())
67
+ return None
68
+
69
+ # Similarly for images, but let's import the original one
70
+ from preprocess import process_image_data
71
+
72
+ def init_model(model_path):
73
+ """Initialize the model for inference"""
74
+ model, device = load_model(model_path)
75
+ return model, device
76
+
77
+ def predict_sweetness(audio, image, model, device):
78
+ """Predict sweetness of a watermelon from audio and image input"""
79
+ try:
80
+ # Debug information about input types
81
+ print(f"\033[92mDEBUG\033[0m: Audio input type: {type(audio)}")
82
+ print(f"\033[92mDEBUG\033[0m: Audio input shape/length: {len(audio)}")
83
+ print(f"\033[92mDEBUG\033[0m: Image input type: {type(image)}")
84
+ if isinstance(image, np.ndarray):
85
+ print(f"\033[92mDEBUG\033[0m: Image input shape: {image.shape}")
86
+
87
+ # Handle different audio input formats
88
+ if isinstance(audio, tuple) and len(audio) == 2:
89
+ # Standard Gradio format: (sample_rate, audio_data)
90
+ sample_rate, audio_data = audio
91
+ print(f"\033[92mDEBUG\033[0m: Audio sample rate: {sample_rate}")
92
+ print(f"\033[92mDEBUG\033[0m: Audio data shape: {audio_data.shape}")
93
+ elif isinstance(audio, tuple) and len(audio) > 2:
94
+ # Sometimes Gradio returns (sample_rate, audio_data, other_info...)
95
+ sample_rate, audio_data = audio[0], audio[-1]
96
+ print(f"\033[92mDEBUG\033[0m: Audio sample rate: {sample_rate}")
97
+ print(f"\033[92mDEBUG\033[0m: Audio data shape: {audio_data.shape}")
98
+ elif isinstance(audio, str):
99
+ # Direct path to audio file
100
+ import torchaudio
101
+ audio_data, sample_rate = torchaudio.load(audio)
102
+ print(f"\033[92mDEBUG\033[0m: Loaded audio from path with shape: {audio_data.shape}")
103
+ else:
104
+ return f"Error: Unsupported audio format. Got {type(audio)}"
105
+
106
+ # Create a temporary file path for the audio and image
107
+ temp_dir = "temp"
108
+ os.makedirs(temp_dir, exist_ok=True)
109
+
110
+ temp_audio_path = os.path.join(temp_dir, "temp_audio.wav")
111
+ temp_image_path = os.path.join(temp_dir, "temp_image.jpg")
112
+
113
+ # Import necessary libraries
114
+ import torchaudio
115
+ import torchvision
116
+ import torchvision.transforms.functional as F
117
+ from PIL import Image
118
+
119
+ # Audio handling - direct processing from the data in memory
120
+ if isinstance(audio_data, np.ndarray):
121
+ # Convert numpy array to tensor
122
+ print(f"\033[92mDEBUG\033[0m: Converting numpy audio with shape {audio_data.shape} to tensor")
123
+ audio_tensor = torch.tensor(audio_data).float()
124
+
125
+ # Handle different audio dimensions
126
+ if audio_data.ndim == 1:
127
+ # Single channel audio
128
+ audio_tensor = audio_tensor.unsqueeze(0)
129
+ elif audio_data.ndim == 2:
130
+ # Ensure channels are first dimension
131
+ if audio_data.shape[0] > audio_data.shape[1]:
132
+ # More rows than columns, probably (samples, channels)
133
+ audio_tensor = torch.tensor(audio_data.T).float()
134
+ else:
135
+ # Already a tensor
136
+ audio_tensor = audio_data.float()
137
+
138
+ print(f"\033[92mDEBUG\033[0m: Audio tensor shape before processing: {audio_tensor.shape}")
139
+
140
+ # Skip saving/loading and process directly
141
+ mfcc = app_process_audio_data(audio_tensor, sample_rate)
142
+ print(f"\033[92mDEBUG\033[0m: MFCC tensor shape after processing: {mfcc.shape if mfcc is not None else None}")
143
+
144
+ # Image handling
145
+ if isinstance(image, np.ndarray):
146
+ print(f"\033[92mDEBUG\033[0m: Converting numpy image with shape {image.shape} to PIL")
147
+ pil_image = Image.fromarray(image)
148
+ pil_image.save(temp_image_path)
149
+ print(f"\033[92mDEBUG\033[0m: Saved image to {temp_image_path}")
150
+ elif isinstance(image, str):
151
+ # If image is already a path
152
+ temp_image_path = image
153
+ print(f"\033[92mDEBUG\033[0m: Using provided image path: {temp_image_path}")
154
+ else:
155
+ return f"Error: Unsupported image format. Got {type(image)}"
156
+
157
+ # Process image
158
+ print(f"\033[92mDEBUG\033[0m: Loading and preprocessing image from {temp_image_path}")
159
+ image_tensor = torchvision.io.read_image(temp_image_path)
160
+ print(f"\033[92mDEBUG\033[0m: Loaded image shape: {image_tensor.shape}")
161
+ image_tensor = image_tensor.float()
162
+ processed_image = process_image_data(image_tensor)
163
+ print(f"\033[92mDEBUG\033[0m: Processed image shape: {processed_image.shape if processed_image is not None else None}")
164
+
165
+ # Add batch dimension for inference
166
+ if mfcc is not None:
167
+ mfcc = mfcc.unsqueeze(0).to(device)
168
+ print(f"\033[92mDEBUG\033[0m: Final MFCC shape with batch dimension: {mfcc.shape}")
169
+
170
+ if processed_image is not None:
171
+ processed_image = processed_image.unsqueeze(0).to(device)
172
+ print(f"\033[92mDEBUG\033[0m: Final image shape with batch dimension: {processed_image.shape}")
173
+
174
+ # Run inference
175
+ print(f"\033[92mDEBUG\033[0m: Running inference")
176
+ if mfcc is not None and processed_image is not None:
177
+ with torch.no_grad():
178
+ sweetness = model(mfcc, processed_image)
179
+ print(f"\033[92mDEBUG\033[0m: Prediction successful: {sweetness.item()}")
180
+ else:
181
+ return "Error: Failed to process inputs. Please check the debug logs."
182
+
183
+ # Format the result
184
+ if sweetness is not None:
185
+ result = f"Predicted Sweetness: {sweetness.item():.2f}/13"
186
+
187
+ # Add a qualitative description
188
+ if sweetness.item() < 9:
189
+ result += "\n\nThis watermelon is not very sweet. You might want to choose another one."
190
+ elif sweetness.item() < 10:
191
+ result += "\n\nThis watermelon has moderate sweetness."
192
+ elif sweetness.item() < 11:
193
+ result += "\n\nThis watermelon is sweet! A good choice."
194
+ else:
195
+ result += "\n\nThis watermelon is very sweet! Excellent choice!"
196
+
197
+ return result
198
+ else:
199
+ return "Error: Could not predict sweetness. Please try again with different inputs."
200
+
201
+ except Exception as e:
202
+ import traceback
203
+ error_msg = f"Error: {str(e)}\n\n"
204
+ error_msg += traceback.format_exc()
205
+ print(f"\033[91mERR!\033[0m: {error_msg}")
206
+ return error_msg
207
+
208
+ def create_app(model_path):
209
+ """Create and launch the Gradio interface"""
210
+ # Initialize model
211
+ model, device = init_model(model_path)
212
+
213
+ # Define the prediction function with model and device
214
+ def predict_fn(audio, image):
215
+ return predict_sweetness(audio, image, model, device)
216
+
217
+ # Create Gradio interface
218
+ with gr.Blocks(title="Watermelon Sweetness Predictor") as interface:
219
+ gr.Markdown("# 🍉 Watermelon Sweetness Predictor")
220
+ gr.Markdown("""
221
+ This app predicts the sweetness of a watermelon based on its sound and appearance.
222
+
223
+ ## Instructions:
224
+ 1. Upload or record an audio of tapping the watermelon
225
+ 2. Upload or capture an image of the watermelon
226
+ 3. Click 'Submit' to get the predicted sweetness
227
+ """)
228
+
229
+ with gr.Row():
230
+ with gr.Column():
231
+ audio_input = gr.Audio(label="Upload or Record Audio", type="numpy")
232
+ image_input = gr.Image(label="Upload or Capture Image")
233
+ submit_btn = gr.Button("Predict Sweetness", variant="primary")
234
+
235
+ with gr.Column():
236
+ output = gr.Textbox(label="Prediction Results", lines=6)
237
+
238
+ submit_btn.click(
239
+ fn=predict_fn,
240
+ inputs=[audio_input, image_input],
241
+ outputs=output
242
+ )
243
+
244
+ gr.Markdown("""
245
+ ## How it works
246
+
247
+ The app uses a deep learning model that combines:
248
+ - Audio analysis using MFCC features and LSTM neural network
249
+ - Image analysis using ResNet-50 convolutional neural network
250
+
251
+ The model was trained on a dataset of watermelons with known sweetness values.
252
+ """)
253
+
254
+ return interface
255
+
256
+ if __name__ == "__main__":
257
+ import argparse
258
+
259
+ parser = argparse.ArgumentParser(description="Watermelon Sweetness Prediction App")
260
+ parser.add_argument(
261
+ "--model_path",
262
+ type=str,
263
+ default="models/watermelon_model_final.pt",
264
+ help="Path to the trained model file"
265
+ )
266
+ parser.add_argument(
267
+ "--share",
268
+ action="store_true",
269
+ help="Create a shareable link for the app"
270
+ )
271
+ parser.add_argument(
272
+ "--debug",
273
+ action="store_true",
274
+ help="Enable verbose debug output"
275
+ )
276
+
277
+ args = parser.parse_args()
278
+
279
+ if args.debug:
280
+ print(f"\033[92mINFO\033[0m: Debug mode enabled")
281
+
282
+ # Check if model exists
283
+ if not os.path.exists(args.model_path):
284
+ print(f"\033[91mERR!\033[0m: Model not found at {args.model_path}")
285
+ print("\033[92mINFO\033[0m: Please train a model first or provide a valid model path")
286
+ sys.exit(1)
287
+
288
+ # Create and launch the app
289
+ app = create_app(args.model_path)
290
+ app.launch(share=args.share)
infer_watermelon.py ADDED
@@ -0,0 +1,150 @@
1
+ import os
2
+ import sys
3
+ import torch
4
+ import torchaudio
5
+ import torchvision
6
+ import argparse
7
+ import numpy as np
8
+
9
+ # Add parent directory to path to import the preprocess functions
10
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
11
+ from preprocess import process_audio_data, process_image_data
12
+
13
+ # Import the model definition
14
+ from train_watermelon import WatermelonModel
15
+
16
+ def load_model(model_path):
17
+ """Load a trained model from the given path"""
18
+ device = torch.device(
19
+ "cuda" if torch.cuda.is_available()
20
+ else "mps" if torch.backends.mps.is_available()
21
+ else "cpu"
22
+ )
23
+ print(f"\033[92mINFO\033[0m: Using device: {device}")
24
+
25
+ model = WatermelonModel().to(device)
26
+ model.load_state_dict(torch.load(model_path, map_location=device))
27
+ model.eval()
28
+ print(f"\033[92mINFO\033[0m: Loaded model from {model_path}")
29
+
30
+ return model, device
31
+
32
+ def infer_single_sample(audio_path, image_path, model, device):
33
+ """Run inference on a single sample"""
34
+ # Load and process audio
35
+ try:
36
+ waveform, sample_rate = torchaudio.load(audio_path)
37
+ mfcc = process_audio_data(waveform, sample_rate).to(device)
38
+
39
+ # Load and process image
40
+ image = torchvision.io.read_image(image_path)
41
+ image = image.float()
42
+ processed_image = process_image_data(image).to(device)
43
+
44
+ # Add batch dimension
45
+ mfcc = mfcc.unsqueeze(0)
46
+ processed_image = processed_image.unsqueeze(0)
47
+
48
+ # Run inference
49
+ with torch.no_grad():
50
+ sweetness = model(mfcc, processed_image)
51
+
52
+ return sweetness.item()
53
+ except Exception as e:
54
+ print(f"\033[91mERR!\033[0m: Error in inference: {e}")
55
+ return None
56
+
57
+ def infer_from_directory(data_dir, model_path, output_file=None, num_samples=None):
58
+ """Run inference on samples from the dataset directory"""
59
+ # Load model
60
+ model, device = load_model(model_path)
61
+
62
+ # Collect all samples
63
+ samples = []
64
+ results = []
65
+
66
+ print(f"\033[92mINFO\033[0m: Reading samples from {data_dir}")
67
+
68
+ # Walk through the directory structure
69
+ for sweetness_dir in os.listdir(data_dir):
70
+ try:
71
+ sweetness = float(sweetness_dir)
72
+ sweetness_path = os.path.join(data_dir, sweetness_dir)
73
+
74
+ if os.path.isdir(sweetness_path):
75
+ for id_dir in os.listdir(sweetness_path):
76
+ id_path = os.path.join(sweetness_path, id_dir)
77
+
78
+ if os.path.isdir(id_path):
79
+ audio_file = os.path.join(id_path, f"{id_dir}.wav")
80
+ image_file = os.path.join(id_path, f"{id_dir}.jpg")
81
+
82
+ if os.path.exists(audio_file) and os.path.exists(image_file):
83
+ samples.append((audio_file, image_file, sweetness, id_dir))
84
+ except ValueError:
85
+ # Skip directories that are not valid sweetness values
86
+ continue
87
+
88
+ # Limit the number of samples if specified
89
+ if num_samples is not None and num_samples > 0:
90
+ samples = samples[:num_samples]
91
+
92
+ print(f"\033[92mINFO\033[0m: Running inference on {len(samples)} samples")
93
+
94
+ # Run inference on each sample
95
+ for i, (audio_file, image_file, true_sweetness, sample_id) in enumerate(samples):
96
+ print(f"\033[92mINFO\033[0m: Processing sample {i+1}/{len(samples)}: {sample_id}")
97
+
98
+ predicted_sweetness = infer_single_sample(audio_file, image_file, model, device)
99
+
100
+ if predicted_sweetness is not None:
101
+ error = abs(predicted_sweetness - true_sweetness)
102
+ results.append({
103
+ 'sample_id': sample_id,
104
+ 'true_sweetness': true_sweetness,
105
+ 'predicted_sweetness': predicted_sweetness,
106
+ 'error': error
107
+ })
108
+ print(f" Sample ID: {sample_id}")
109
+ print(f" True sweetness: {true_sweetness:.2f}")
110
+ print(f" Predicted sweetness: {predicted_sweetness:.2f}")
111
+ print(f" Error: {error:.2f}")
112
+
113
+ # Calculate mean absolute error
114
+ if results:
115
+ mae = np.mean([result['error'] for result in results])
116
+ print(f"\033[92mINFO\033[0m: Mean Absolute Error: {mae:.4f}")
117
+
118
+ # Save results to file if specified
119
+ if output_file and results:
120
+ with open(output_file, 'w') as f:
121
+ f.write("sample_id,true_sweetness,predicted_sweetness,error\n")
122
+ for result in results:
123
+ f.write(f"{result['sample_id']},{result['true_sweetness']:.2f},{result['predicted_sweetness']:.2f},{result['error']:.2f}\n")
124
+ print(f"\033[92mINFO\033[0m: Results saved to {output_file}")
125
+
126
+ return results
127
+
128
+ def main():
129
+ parser = argparse.ArgumentParser(description="Watermelon Sweetness Inference")
130
+ parser.add_argument("--model_path", type=str, required=True, help="Path to the trained model file")
131
+ parser.add_argument("--data_dir", type=str, default="../cleaned", help="Path to the cleaned dataset directory")
132
+ parser.add_argument("--output_file", type=str, help="Path to save inference results (CSV)")
133
+ parser.add_argument("--num_samples", type=int, help="Number of samples to run inference on (default: all)")
134
+ parser.add_argument("--audio_path", type=str, help="Path to a single audio file for inference")
135
+ parser.add_argument("--image_path", type=str, help="Path to a single image file for inference")
136
+
137
+ args = parser.parse_args()
138
+
139
+ # Check if single sample inference or dataset inference
140
+ if args.audio_path and args.image_path:
141
+ # Single sample inference
142
+ model, device = load_model(args.model_path)
143
+ sweetness = infer_single_sample(args.audio_path, args.image_path, model, device)
144
+ print(f"Predicted sweetness: {sweetness:.2f}")
145
+ else:
146
+ # Dataset inference
147
+ infer_from_directory(args.data_dir, args.model_path, args.output_file, args.num_samples)
148
+
149
+ if __name__ == "__main__":
150
+ main()
models/model_1_20250406-064126.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5df632222fa87e09e635f90e5cce14bdd9fd34b442bf18daaf13e54dedfed132
+size 96095572
models/model_1_20250406-064635.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02999bd33592de717dc1ec8054dc570193074c3f25a7283b3daa580b727b7134
+size 96095572
models/model_2_20250406-065053.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:80f999a1540c42ed74491692aa66c3b5a6171f972bdf47c9d52556fe1673c8dd
+size 96095572
models/watermelon_model_final.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:086780aee9897ea51a6b0da0fed8aaa61ae97563c70a8c6577849ef9a0220edb
+size 96095241
requirements.txt ADDED
@@ -0,0 +1,6 @@
+torch>=2.0.0
+torchaudio>=2.0.0
+torchvision>=0.15.0
+gradio>=3.50.0
+numpy>=1.20.0
+pillow>=9.0.0
runs/events.out.tfevents.1743920786.vm-jinzq.2059144.0 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f3e44b329373e1b4e8233833c35e382cf1c548c03a449e237c89b4c0333af42f
+size 88
runs/events.out.tfevents.1743920828.vm-jinzq.2059396.0 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1396659d9fdb300ed3bf8ee38bf6605c634376c36a3e47e8398968eb9ea4b6ea
+size 88
runs/events.out.tfevents.1743921401.jzqdebug-c245a8-job-84fn7.812.0 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d4346aad295036526c9dadae4a84f18cab863a1ec43f13b0d5b32566b5361179
+size 14985
runs/events.out.tfevents.1743921735.jzqdebug-c245a8-job-84fn7.1262.0 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6ec7e16dece5b3f09408359a3a18fb40a87f23e02e1b16981ebb9ea9e463f6ef
+size 7238
temp/temp_audio.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8da44f18fa66bb5db09dc6ef4ea542c5274d8b2a1d952efd1db1ceec7948ca44
+size 1058488
temp/temp_image.jpg ADDED

Git LFS Details

  • SHA256: 88a3633370f2a04e0c41946cdcd6f63883eca31ae8534b8f4379d6e8b84a25f0
  • Pointer size: 131 Bytes
  • Size of remote file: 406 kB
train_watermelon.py ADDED
@@ -0,0 +1,261 @@
1
+ import os
2
+ import time
3
+ import torch
4
+ import torchaudio
5
+ import torchvision
6
+ import numpy as np
7
+ from torch.utils.data import Dataset, DataLoader
8
+ from torch.utils.tensorboard import SummaryWriter
9
+ import sys
10
+
11
+ # Add parent directory to path to import the preprocess functions
12
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
13
+ from preprocess import process_audio_data, process_image_data
14
+
15
+ # Print library versions
16
+ print(f"\033[92mINFO\033[0m: PyTorch version: {torch.__version__}")
17
+ print(f"\033[92mINFO\033[0m: Torchaudio version: {torchaudio.__version__}")
18
+ print(f"\033[92mINFO\033[0m: Torchvision version: {torchvision.__version__}")
19
+
20
+ # Device selection
21
+ device = torch.device(
22
+ "cuda"
23
+ if torch.cuda.is_available()
24
+ else "mps" if torch.backends.mps.is_available() else "cpu"
25
+ )
26
+ print(f"\033[92mINFO\033[0m: Using device: {device}")
27
+
28
+ # Hyperparameters
29
+ batch_size = 16
30
+ epochs = 2
31
+ learning_rate = 0.0001
32
+
33
+ # Model save directory
34
+ os.makedirs("models/", exist_ok=True)
35
+
36
+
37
+ class WatermelonDataset(Dataset):
38
+ def __init__(self, data_dir):
39
+ self.data_dir = data_dir
40
+ self.samples = []
41
+
42
+ # Walk through the directory structure
43
+ for sweetness_dir in os.listdir(data_dir):
44
+ sweetness = float(sweetness_dir)
45
+ sweetness_path = os.path.join(data_dir, sweetness_dir)
46
+
47
+ if os.path.isdir(sweetness_path):
48
+ for id_dir in os.listdir(sweetness_path):
49
+ id_path = os.path.join(sweetness_path, id_dir)
50
+
51
+ if os.path.isdir(id_path):
52
+ audio_file = os.path.join(id_path, f"{id_dir}.wav")
53
+ image_file = os.path.join(id_path, f"{id_dir}.jpg")
54
+
55
+ if os.path.exists(audio_file) and os.path.exists(image_file):
56
+ self.samples.append((audio_file, image_file, sweetness))
57
+
58
+ print(f"\033[92mINFO\033[0m: Loaded {len(self.samples)} samples from {data_dir}")
59
+
60
+ def __len__(self):
61
+ return len(self.samples)
62
+
63
+ def __getitem__(self, idx):
64
+ audio_path, image_path, label = self.samples[idx]
65
+
66
+ # Load and process audio
67
+ try:
68
+ waveform, sample_rate = torchaudio.load(audio_path)
69
+ mfcc = process_audio_data(waveform, sample_rate)
70
+
71
+ # Load and process image
72
+ image = torchvision.io.read_image(image_path)
73
+ image = image.float()
74
+ processed_image = process_image_data(image)
75
+
76
+ return mfcc, processed_image, torch.tensor(label).float()
77
+ except Exception as e:
78
+ print(f"\033[91mERR!\033[0m: Error processing sample {idx}: {e}")
79
+ # Return a fallback sample or skip this sample
80
+ # For simplicity, we'll return the first sample again
81
+ if idx == 0: # Prevent infinite recursion
82
+ raise e
83
+ return self.__getitem__(0)
84
+
85
+
86
+ class WatermelonModel(torch.nn.Module):
87
+ def __init__(self):
88
+ super(WatermelonModel, self).__init__()
89
+
90
+ # LSTM for audio features
91
+ self.lstm = torch.nn.LSTM(
92
+ input_size=376, hidden_size=64, num_layers=2, batch_first=True
93
+ )
94
+ self.lstm_fc = torch.nn.Linear(
95
+ 64, 128
96
+ ) # Convert LSTM output to 128-dim for merging
97
+
98
+ # ResNet50 for image features
99
+ self.resnet = torchvision.models.resnet50(weights=torchvision.models.ResNet50_Weights.DEFAULT)
100
+ self.resnet.fc = torch.nn.Linear(
101
+ self.resnet.fc.in_features, 128
102
+ ) # Convert ResNet output to 128-dim for merging
103
+
104
+ # Fully connected layers for final prediction
105
+ self.fc1 = torch.nn.Linear(256, 64)
106
+ self.fc2 = torch.nn.Linear(64, 1)
107
+ self.relu = torch.nn.ReLU()
108
+
109
+ def forward(self, mfcc, image):
110
+ # LSTM branch
111
+ lstm_output, _ = self.lstm(mfcc)
112
+ lstm_output = lstm_output[:, -1, :] # Use the output of the last time step
113
+ lstm_output = self.lstm_fc(lstm_output)
114
+
115
+ # ResNet branch
116
+ resnet_output = self.resnet(image)
117
+
118
+ # Concatenate LSTM and ResNet outputs
119
+ merged = torch.cat((lstm_output, resnet_output), dim=1)
120
+
121
+ # Fully connected layers
122
+ output = self.relu(self.fc1(merged))
123
+ output = self.fc2(output)
124
+
125
+ return output
126
+
127
+
128
+ def train_model(data_dir, output_dir="models/"):
129
+ # Create dataset
130
+ dataset = WatermelonDataset(data_dir)
131
+ n_samples = len(dataset)
132
+
133
+ # Split dataset
134
+ train_size = int(0.7 * n_samples)
135
+ val_size = int(0.2 * n_samples)
136
+ test_size = n_samples - train_size - val_size
137
+
138
+ train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
139
+ dataset, [train_size, val_size, test_size]
140
+ )
141
+
142
+ train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
143
+ val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
144
+
145
+ # Initialize model
146
+ model = WatermelonModel().to(device)
147
+
148
+ # Loss function and optimizer
149
+ criterion = torch.nn.MSELoss()
150
+ optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
151
+
152
+ # TensorBoard
153
+ writer = SummaryWriter("runs/")
154
+ global_step = 0
155
+
156
+ print(f"\033[92mINFO\033[0m: Training model for {epochs} epochs")
157
+ print(f"\033[92mINFO\033[0m: Training samples: {len(train_dataset)}")
158
+ print(f"\033[92mINFO\033[0m: Validation samples: {len(val_dataset)}")
159
+ print(f"\033[92mINFO\033[0m: Test samples: {len(test_dataset)}")
160
+ print(f"\033[92mINFO\033[0m: Batch size: {batch_size}")
161
+
162
+ # Training loop
163
+ for epoch in range(epochs):
164
+ print(f"\033[92mINFO\033[0m: Training epoch ({epoch+1}/{epochs})")
165
+
166
+ model.train()
167
+ running_loss = 0.0
168
+
169
+ for i, (mfcc, image, label) in enumerate(train_loader):
170
+ try:
171
+ mfcc, image, label = mfcc.to(device), image.to(device), label.to(device)
172
+
173
+ optimizer.zero_grad()
174
+ output = model(mfcc, image)
175
+ label = label.view(-1, 1).float()
176
+ loss = criterion(output, label)
177
+ loss.backward()
178
+ optimizer.step()
179
+
180
+ running_loss += loss.item()
181
+ writer.add_scalar("Training Loss", loss.item(), global_step)
182
+ global_step += 1
183
+
184
+ if i % 10 == 0:
185
+ print(f"\033[92mINFO\033[0m: Batch {i}/{len(train_loader)}, Loss: {loss.item():.4f}")
186
+
187
+ except Exception as e:
188
+ print(f"\033[91mERR!\033[0m: Error in training batch {i}: {e}")
189
+ continue
190
+
191
+ # Validation phase
192
+ model.eval()
193
+ val_loss = 0.0
194
+ with torch.no_grad():
195
+ for i, (mfcc, image, label) in enumerate(val_loader):
196
+ try:
197
+ mfcc, image, label = mfcc.to(device), image.to(device), label.to(device)
198
+ output = model(mfcc, image)
199
+ label = label.view(-1, 1).float()
200
+ loss = criterion(output, label)
201
+ val_loss += loss.item()
202
+ except Exception as e:
203
+ print(f"\033[91mERR!\033[0m: Error in validation batch {i}: {e}")
204
+ continue
205
+
206
+ avg_train_loss = running_loss / len(train_loader) if len(train_loader) > 0 else float('inf')
207
+ avg_val_loss = val_loss / len(val_loader) if len(val_loader) > 0 else float('inf')
208
+
209
+ # Record validation loss
210
+ writer.add_scalar("Validation Loss", avg_val_loss, epoch)
211
+
212
+ print(
213
+ f"Epoch [{epoch+1}/{epochs}], Training Loss: {avg_train_loss:.4f}, "
214
+ f"Validation Loss: {avg_val_loss:.4f}"
215
+ )
216
+
217
+ # Save model checkpoint
218
+ timestamp = time.strftime("%Y%m%d-%H%M%S")
219
+ model_path = os.path.join(output_dir, f"model_{epoch+1}_{timestamp}.pt")
220
+ torch.save(model.state_dict(), model_path)
221
+
222
+ print(
223
+ f"\033[92mINFO\033[0m: Model checkpoint epoch [{epoch+1}/{epochs}] saved: {model_path}"
224
+ )
225
+
226
+ # Save final model
227
+ final_model_path = os.path.join(output_dir, "watermelon_model_final.pt")
228
+ torch.save(model.state_dict(), final_model_path)
229
+ print(f"\033[92mINFO\033[0m: Final model saved: {final_model_path}")
230
+
231
+ print(f"\033[92mINFO\033[0m: Training complete")
232
+ return final_model_path
233
+
234
+
235
+ if __name__ == "__main__":
236
+ import argparse
237
+
238
+ parser = argparse.ArgumentParser(description="Train the Watermelon Sweetness Prediction Model")
239
+ parser.add_argument(
240
+ "--data_dir",
241
+ type=str,
242
+ default="../cleaned",
243
+ help="Path to the cleaned dataset directory"
244
+ )
245
+ parser.add_argument(
246
+ "--output_dir",
247
+ type=str,
248
+ default="models/",
249
+ help="Directory to save model checkpoints and the final model"
250
+ )
251
+
252
+ args = parser.parse_args()
253
+
254
+ # Ensure output directory exists
255
+ os.makedirs(args.output_dir, exist_ok=True)
256
+
257
+ # Train the model
258
+ final_model_path = train_model(args.data_dir, args.output_dir)
259
+
260
+ print(f"\033[92mINFO\033[0m: Training completed successfully!")
261
+ print(f"\033[92mINFO\033[0m: Final model saved at: {final_model_path}")