Xalphinions committed (verified)
Commit dd995d1 · 1 Parent(s): 13b45d3

Upload folder using huggingface_hub

Files changed (2):
  1. app.py +89 -180
  2. app_moe.py +13 -3
app.py CHANGED
@@ -34,11 +34,10 @@ class WatermelonMoEModel(torch.nn.Module):
             weights: Optional list of weights for each model (None for equal weighting)
         """
         super(WatermelonMoEModel, self).__init__()
-        self.models = torch.nn.ModuleList()  # Use ModuleList instead of regular list
+        self.models = []
         self.model_configs = model_configs

         # Load each model
-        loaded_count = 0
         for config in model_configs:
             img_backbone = config["image_backbone"]
             audio_backbone = config["audio_backbone"]
@@ -50,31 +49,22 @@
             model_path = os.path.join(model_dir, f"{img_backbone}_{audio_backbone}_model.pt")
             if os.path.exists(model_path):
                 print(f"\033[92mINFO\033[0m: Loading model {img_backbone}_{audio_backbone} from {model_path}")
-                try:
-                    model.load_state_dict(torch.load(model_path, map_location='cpu'))
-                    model.eval()  # Set to evaluation mode
-                    self.models.append(model)
-                    loaded_count += 1
-                except Exception as e:
-                    print(f"\033[91mERR!\033[0m: Failed to load model from {model_path}: {e}")
-                    continue
+                model.load_state_dict(torch.load(model_path, map_location='cpu'))
             else:
                 print(f"\033[91mERR!\033[0m: Model checkpoint not found at {model_path}")
                 continue
+
+            model.eval()  # Set to evaluation mode
+            self.models.append(model)

-        # Add a dummy parameter if no models were loaded to prevent StopIteration
-        if loaded_count == 0:
-            print(f"\033[91mERR!\033[0m: No models were successfully loaded!")
-            self.dummy_param = torch.nn.Parameter(torch.zeros(1))
-
         # Set model weights (uniform by default)
-        if weights and loaded_count > 0:
+        if weights:
             assert len(weights) == len(self.models), "Number of weights must match number of models"
             self.weights = weights
         else:
-            self.weights = [1.0 / max(loaded_count, 1)] * max(loaded_count, 1)
+            self.weights = [1.0 / len(self.models)] * len(self.models) if self.models else [1.0]

-        print(f"\033[92mINFO\033[0m: Loaded {loaded_count} models for MoE ensemble")
+        print(f"\033[92mINFO\033[0m: Loaded {len(self.models)} models for MoE ensemble")
         print(f"\033[92mINFO\033[0m: Model weights: {self.weights}")

     def to(self, device):
@@ -90,10 +80,9 @@ class WatermelonMoEModel(torch.nn.Module):
         Forward pass through the MoE model.
         Returns the weighted average of all model outputs.
         """
-        # Check if we have models loaded
         if not self.models:
             print(f"\033[91mERR!\033[0m: No models available for inference!")
-            return torch.tensor([0.0], device=mfcc.device)  # Return a default value
+            return torch.tensor([0.0], device=mfcc.device)

         outputs = []

@@ -101,6 +90,8 @@ class WatermelonMoEModel(torch.nn.Module):
         with torch.no_grad():
             for i, model in enumerate(self.models):
                 output = model(mfcc, image)
+                # print the output value
+                print(f"\033[92mDEBUG\033[0m: Model {i} output: {output}")
                 outputs.append(output * self.weights[i])

         # Return weighted average
@@ -166,196 +157,114 @@ def predict_sugar_content(audio, image, model_dir="models", weights=None):
     """Function with GPU acceleration to predict watermelon sugar content in Brix using MoE model"""
     try:
         # Check CUDA availability inside the GPU-decorated function
-        if torch.cuda.is_available():
-            device = torch.device("cuda")
-            print(f"\033[92mINFO\033[0m: CUDA is available. Using device: {device}")
-        else:
-            device = torch.device("cpu")
-            print(f"\033[92mINFO\033[0m: CUDA is not available. Using device: {device}")
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        print(f"\033[92mINFO\033[0m: Using device: {device}")

         # Load MoE model
         moe_model = WatermelonMoEModel(TOP_MODELS, model_dir, weights)
-        # Explicitly move the entire model to device
-        moe_model = moe_model.to(device)
+        moe_model = moe_model.to(device)  # Move entire model to device
         moe_model.eval()
         print(f"\033[92mINFO\033[0m: Loaded MoE model with {len(moe_model.models)} backbone models")

-        # Debug information about input types
-        print(f"\033[92mDEBUG\033[0m: Audio input type: {type(audio)}")
-        print(f"\033[92mDEBUG\033[0m: Audio input shape/length: {len(audio)}")
-        print(f"\033[92mDEBUG\033[0m: Image input type: {type(image)}")
-        if isinstance(image, np.ndarray):
-            print(f"\033[92mDEBUG\033[0m: Image input shape: {image.shape}")
-
         # Handle different audio input formats
-        if isinstance(audio, tuple) and len(audio) == 2:
-            # Standard Gradio format: (sample_rate, audio_data)
-            sample_rate, audio_data = audio
-            print(f"\033[92mDEBUG\033[0m: Audio sample rate: {sample_rate}")
-            print(f"\033[92mDEBUG\033[0m: Audio data shape: {audio_data.shape}")
-        elif isinstance(audio, tuple) and len(audio) > 2:
-            # Sometimes Gradio returns (sample_rate, audio_data, other_info...)
-            sample_rate, audio_data = audio[0], audio[-1]
-            print(f"\033[92mDEBUG\033[0m: Audio sample rate: {sample_rate}")
-            print(f"\033[92mDEBUG\033[0m: Audio data shape: {audio_data.shape}")
+        if isinstance(audio, tuple) and len(audio) >= 2:
+            sample_rate, audio_data = audio[0], audio[1] if len(audio) == 2 else audio[-1]
         elif isinstance(audio, str):
-            # Direct path to audio file
             audio_data, sample_rate = torchaudio.load(audio)
-            print(f"\033[92mDEBUG\033[0m: Loaded audio from path with shape: {audio_data.shape}")
         else:
             return f"Error: Unsupported audio format. Got {type(audio)}"

-        # Create a temporary file path for the audio and image
-        temp_dir = "temp"
-        os.makedirs(temp_dir, exist_ok=True)
-
-        temp_audio_path = os.path.join(temp_dir, "temp_audio.wav")
-        temp_image_path = os.path.join(temp_dir, "temp_image.jpg")
-
-        # Import necessary libraries
-        from PIL import Image
-
-        # Audio handling - direct processing from the data in memory
+        # Convert audio to tensor if needed
         if isinstance(audio_data, np.ndarray):
-            # Convert numpy array to tensor
-            print(f"\033[92mDEBUG\033[0m: Converting numpy audio with shape {audio_data.shape} to tensor")
             audio_tensor = torch.tensor(audio_data).float()
-
-            # Handle different audio dimensions
-            if audio_data.ndim == 1:
-                # Single channel audio
-                audio_tensor = audio_tensor.unsqueeze(0)
-            elif audio_data.ndim == 2:
-                # Ensure channels are first dimension
-                if audio_data.shape[0] > audio_data.shape[1]:
-                    # More rows than columns, probably (samples, channels)
-                    audio_tensor = torch.tensor(audio_data.T).float()
         else:
-            # Already a tensor
            audio_tensor = audio_data.float()

-        print(f"\033[92mDEBUG\033[0m: Audio tensor shape before processing: {audio_tensor.shape}")
-
-        # Skip saving/loading and process directly
+        # Process audio
         mfcc = app_process_audio_data(audio_tensor, sample_rate)
-        print(f"\033[92mDEBUG\033[0m: MFCC tensor shape after processing: {mfcc.shape if mfcc is not None else None}")
+        if mfcc is None:
+            return "Error: Failed to process audio input"

-        # Image handling
+        # Process image
         if isinstance(image, np.ndarray):
-            print(f"\033[92mDEBUG\033[0m: Converting numpy image with shape {image.shape} to PIL")
-            pil_image = Image.fromarray(image)
-            pil_image.save(temp_image_path)
-            print(f"\033[92mDEBUG\033[0m: Saved image to {temp_image_path}")
+            image_tensor = torch.from_numpy(image).permute(2, 0, 1)  # Convert to CxHxW format
         elif isinstance(image, str):
-            # If image is already a path
-            temp_image_path = image
-            print(f"\033[92mDEBUG\033[0m: Using provided image path: {temp_image_path}")
+            image_tensor = torchvision.io.read_image(image)
         else:
             return f"Error: Unsupported image format. Got {type(image)}"

-        # Process image
-        print(f"\033[92mDEBUG\033[0m: Loading and preprocessing image from {temp_image_path}")
-        image_tensor = torchvision.io.read_image(temp_image_path)
-        print(f"\033[92mDEBUG\033[0m: Loaded image shape: {image_tensor.shape}")
         image_tensor = image_tensor.float()
         processed_image = process_image_data(image_tensor)
-        print(f"\033[92mDEBUG\033[0m: Processed image shape: {processed_image.shape if processed_image is not None else None}")
-
-        # Add batch dimension for inference and move to device
-        if mfcc is not None:
-            # Ensure mfcc is on the same device as the model
-            mfcc = mfcc.unsqueeze(0).to(device)
-            print(f"\033[92mDEBUG\033[0m: Final MFCC shape with batch dimension: {mfcc.shape}, device: {mfcc.device}")
-
-        if processed_image is not None:
-            # Ensure processed_image is on the same device as the model
-            processed_image = processed_image.unsqueeze(0).to(device)
-            print(f"\033[92mDEBUG\033[0m: Final image shape with batch dimension: {processed_image.shape}, device: {processed_image.device}")
-
-        # Double-check model is on the correct device
-        try:
-            param = next(moe_model.parameters())
-            print(f"\033[92mDEBUG\033[0m: MoE model device: {param.device}")
-
-            # Check individual models
-            for i, model in enumerate(moe_model.models):
-                try:
-                    model_param = next(model.parameters())
-                    print(f"\033[92mDEBUG\033[0m: Model {i} device: {model_param.device}")
-                except StopIteration:
-                    print(f"\033[91mERR!\033[0m: Model {i} has no parameters!")
-        except StopIteration:
-            print(f"\033[91mERR!\033[0m: MoE model has no parameters!")
-
-        # Run inference with MoE model
-        print(f"\033[92mDEBUG\033[0m: Running inference with MoE model on device: {device}")
-        if mfcc is not None and processed_image is not None:
-            with torch.no_grad():
-                brix_value = moe_model(mfcc, processed_image)
-                print(f"\033[92mDEBUG\033[0m: Prediction successful: {brix_value.item()}")
-        else:
-            return "Error: Failed to process inputs. Please check the debug logs."
+        if processed_image is None:
+            return "Error: Failed to process image input"

-        # Format the result with a range display
-        if brix_value is not None:
-            brix_score = brix_value.item()
-
-            # Create a header with the numerical result
-            result = f"🍉 Predicted Sugar Content: {brix_score:.1f}° Brix 🍉\n\n"
-
-            # Add extra info about the MoE model
-            result += "Using Ensemble of Top-3 Models:\n"
-            result += "- EfficientNet-B3 + Transformer\n"
-            result += "- EfficientNet-B0 + Transformer\n"
-            result += "- ResNet-50 + Transformer\n\n"
-
-            # Add Brix scale visualization
-            result += "Sugar Content Scale (in °Brix):\n"
-            result += "──────────────────────────────────\n"
-
-            # Create the scale display with Brix ranges
-            scale_ranges = [
-                (0, 8, "Low Sugar (< 8° Brix)"),
-                (8, 9, "Mild Sweetness (8-9° Brix)"),
-                (9, 10, "Medium Sweetness (9-10° Brix)"),
-                (10, 11, "Sweet (10-11° Brix)"),
-                (11, 13, "Very Sweet (11-13° Brix)")
-            ]
-
-            # Find which category the prediction falls into
-            user_category = None
-            for min_val, max_val, category_name in scale_ranges:
-                if min_val <= brix_score < max_val:
-                    user_category = category_name
-                    break
-            if brix_score >= scale_ranges[-1][0]:  # Handle edge case
-                user_category = scale_ranges[-1][2]
-
-            # Display the scale with the user's result highlighted
-            for min_val, max_val, category_name in scale_ranges:
-                if category_name == user_category:
-                    result += f"▶ {min_val}-{max_val}: {category_name} ◀ (YOUR WATERMELON)\n"
-                else:
-                    result += f"  {min_val}-{max_val}: {category_name}\n"
-
-            result += "──────────────────────────────────\n\n"
+        # Add batch dimension and move to device
+        mfcc = mfcc.unsqueeze(0).to(device)
+        processed_image = processed_image.unsqueeze(0).to(device)
+
+        # Run inference
+        with torch.no_grad():
+            brix_value = moe_model(mfcc, processed_image)
+        prediction = brix_value.item()
+        print(f"\033[92mDEBUG\033[0m: Raw prediction: {prediction}")

-            # Add assessment of the watermelon's sugar content
-            if brix_score < 8:
-                result += "Assessment: This watermelon has low sugar content. It may taste bland or slightly bitter."
-            elif brix_score < 9:
-                result += "Assessment: This watermelon has mild sweetness. Acceptable flavor but not very sweet."
-            elif brix_score < 10:
-                result += "Assessment: This watermelon has moderate sugar content. It should have pleasant sweetness."
-            elif brix_score < 11:
-                result += "Assessment: This watermelon has good sugar content! It should be sweet and juicy."
+        # Ensure prediction is within reasonable bounds (e.g., 6-13 Brix)
+        prediction = max(6.0, min(13.0, prediction))
+        print(f"\033[92mDEBUG\033[0m: Bounded prediction: {prediction}")
+
+        # Format the result
+        result = f"🍉 Predicted Sugar Content: {prediction:.1f}° Brix 🍉\n\n"
+
+        # Add extra info about the MoE model
+        result += "Using Ensemble of Top-3 Models:\n"
+        result += "- EfficientNet-B3 + Transformer\n"
+        result += "- EfficientNet-B0 + Transformer\n"
+        result += "- ResNet-50 + Transformer\n\n"
+
+        # Add Brix scale visualization
+        result += "Sugar Content Scale (in °Brix):\n"
+        result += "──────────────────────────────────\n"
+
+        # Create the scale display with Brix ranges
+        scale_ranges = [
+            (0, 8, "Low Sugar (< 8° Brix)"),
+            (8, 9, "Mild Sweetness (8-9° Brix)"),
+            (9, 10, "Medium Sweetness (9-10° Brix)"),
+            (10, 11, "Sweet (10-11° Brix)"),
+            (11, 13, "Very Sweet (11-13° Brix)")
+        ]
+
+        # Find which category the prediction falls into
+        user_category = None
+        for min_val, max_val, category_name in scale_ranges:
+            if min_val <= prediction < max_val:
+                user_category = category_name
+                break
+        if prediction >= scale_ranges[-1][0]:  # Handle edge case
+            user_category = scale_ranges[-1][2]
+
+        # Display the scale with the user's result highlighted
+        for min_val, max_val, category_name in scale_ranges:
+            if category_name == user_category:
+                result += f"▶ {min_val}-{max_val}: {category_name} ◀ (YOUR WATERMELON)\n"
             else:
-                result += "Assessment: This watermelon has excellent sugar content! Perfect choice for maximum sweetness and flavor."
-
-            return result
+                result += f"  {min_val}-{max_val}: {category_name}\n"
+
+        result += "──────────────────────────────────\n\n"
+
+        # Add assessment of the watermelon's sugar content
+        if prediction < 8:
+            result += "Assessment: This watermelon has low sugar content. It may taste bland or slightly bitter."
+        elif prediction < 9:
+            result += "Assessment: This watermelon has mild sweetness. Acceptable flavor but not very sweet."
+        elif prediction < 10:
+            result += "Assessment: This watermelon has moderate sugar content. It should have pleasant sweetness."
+        elif prediction < 11:
+            result += "Assessment: This watermelon has good sugar content! It should be sweet and juicy."
         else:
-            return "Error: Could not predict sugar content. Please try again with different inputs."
-
+            result += "Assessment: This watermelon has excellent sugar content! Perfect choice for maximum sweetness and flavor."
+
+        return result
     except Exception as e:
         import traceback
         error_msg = f"Error: {str(e)}\n\n"
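Note: both the old and new forward() combine the per-backbone outputs as a simple weighted average, with uniform weights by default. A minimal, runnable sketch of that combination step is below; ToyModel and weighted_ensemble are hypothetical stand-ins for illustration only, not code from this repository.

import torch

def weighted_ensemble(models, weights, mfcc, image):
    # Weighted average of per-model predictions, in the spirit of WatermelonMoEModel.forward
    outputs = []
    with torch.no_grad():
        for model, w in zip(models, weights):
            outputs.append(model(mfcc, image) * w)
    # Summing the pre-weighted outputs yields the weighted average when the weights sum to 1
    return torch.stack(outputs).sum(dim=0)

class ToyModel(torch.nn.Module):
    # Stand-in for a backbone pair; returns a fixed "Brix" value and ignores its inputs
    def __init__(self, bias):
        super().__init__()
        self.bias = bias
    def forward(self, mfcc, image):
        return torch.tensor([10.0 + self.bias])

models = [ToyModel(0.2), ToyModel(-0.2)]
weights = [0.5, 0.5]  # uniform weighting, matching the default in the diff
print(weighted_ensemble(models, weights, mfcc=None, image=None))  # tensor([10.])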
 
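Note: the rewritten predict_sugar_content collapses the earlier input-handling branches into the logic sketched below. normalize_audio_input and bound_brix are hypothetical helper names used purely for illustration; the committed code performs these steps inline.

import numpy as np
import torch

def normalize_audio_input(audio):
    # Gradio typically passes (sample_rate, np.ndarray); extra tuple elements are
    # ignored apart from the last, mirroring the branch in the updated function.
    if isinstance(audio, tuple) and len(audio) >= 2:
        sample_rate = audio[0]
        audio_data = audio[1] if len(audio) == 2 else audio[-1]
    else:
        raise ValueError(f"Unsupported audio format: {type(audio)}")
    if isinstance(audio_data, np.ndarray):
        audio_tensor = torch.tensor(audio_data).float()
    else:
        audio_tensor = audio_data.float()
    return sample_rate, audio_tensor

def bound_brix(prediction, low=6.0, high=13.0):
    # Clamp the raw regression output to the 6-13 Brix range used in the diff
    return max(low, min(high, prediction))

sr, wav = normalize_audio_input((16000, np.zeros(16000, dtype=np.float32)))
print(sr, wav.shape)     # 16000 torch.Size([16000])
print(bound_brix(14.2))  # 13.0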
app_moe.py CHANGED
@@ -273,9 +273,19 @@ def predict_sugar_content(audio, image, model_dir="models", weights=None):
             print(f"\033[92mDEBUG\033[0m: Final image shape with batch dimension: {processed_image.shape}, device: {processed_image.device}")

         # Double-check model is on the correct device
-        print(f"\033[92mDEBUG\033[0m: MoE model device: {next(moe_model.parameters()).device}")
-        for i, model in enumerate(moe_model.models):
-            print(f"\033[92mDEBUG\033[0m: Model {i} device: {next(model.parameters()).device}")
+        try:
+            param = next(moe_model.parameters())
+            print(f"\033[92mDEBUG\033[0m: MoE model device: {param.device}")
+
+            # Check individual models
+            for i, model in enumerate(moe_model.models):
+                try:
+                    model_param = next(model.parameters())
+                    print(f"\033[92mDEBUG\033[0m: Model {i} device: {model_param.device}")
+                except StopIteration:
+                    print(f"\033[91mERR!\033[0m: Model {i} has no parameters!")
+        except StopIteration:
+            print(f"\033[91mERR!\033[0m: MoE model has no parameters!")

         # Run inference with MoE model
         print(f"\033[92mDEBUG\033[0m: Running inference with MoE model on device: {device}")