Xalphinions committed
Commit fdc673b · verified · 1 parent: 8711293

Upload folder using huggingface_hub

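The commit message indicates the folder was pushed with huggingface_hub. A minimal sketch of that workflow, assuming the standard upload_folder API; the folder path and repo id below are illustrative placeholders, not values recorded in this commit:

from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path="./watermelon",        # local project folder (placeholder)
    repo_id="Xalphinions/watermelon",  # target Space repo (assumed)
    repo_type="space",
    commit_message="Upload folder using huggingface_hub",
)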
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ temp/temp_image.jpg filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,12 +1,6 @@
  ---
- title: Watermelon
- emoji: 👁
- colorFrom: pink
- colorTo: red
- sdk: gradio
- sdk_version: 5.23.3
+ title: watermelon
  app_file: app.py
- pinned: false
+ sdk: gradio
+ sdk_version: 4.44.1
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
__pycache__/infer.cpython-39.pyc ADDED
Binary file (1.48 kB)
 
__pycache__/preprocess.cpython-39.pyc ADDED
Binary file (1.27 kB)
 
__pycache__/train.cpython-39.pyc ADDED
Binary file (7.13 kB)
 
__pycache__/train_2.cpython-39.pyc ADDED
Binary file (10.7 kB)
 
app.py ADDED
@@ -0,0 +1,284 @@
1
+ import torch, torchaudio, torchvision
2
+ import os
3
+ import gradio as gr
4
+ import numpy as np
5
+
6
+ from preprocess import process_audio_data, process_image_data
7
+ from train import WatermelonModel
8
+ from infer import infer
9
+
10
+ def load_model(model_path):
11
+ global device
12
+ device = torch.device(
13
+ "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
14
+ )
15
+ print(f"\033[92mINFO\033[0m: Using device: {device}")
16
+
17
+ # Check if the file exists
18
+ if not os.path.exists(model_path):
19
+ raise FileNotFoundError(f"Model file not found at {model_path}")
20
+
21
+ # Check if the file is empty or very small
22
+ file_size = os.path.getsize(model_path)
23
+ if file_size < 1000: # Less than 1KB is suspiciously small for a model
24
+ print(f"\033[93mWARNING\033[0m: Model file size is only {file_size} bytes, which is suspiciously small")
25
+
26
+ try:
27
+ model = WatermelonModel().to(device)
28
+ model.load_state_dict(torch.load(model_path, map_location=device))
29
+ model.eval()
30
+ print(f"\033[92mINFO\033[0m: Loaded model from {model_path}")
31
+ return model
32
+ except RuntimeError as e:
33
+ if "failed finding central directory" in str(e):
34
+ print(f"\033[91mERROR\033[0m: The model file at {model_path} appears to be corrupted.")
35
+ print("This can happen if:")
36
+ print(" 1. The model saving process was interrupted")
37
+ print(" 2. The file was not properly downloaded")
38
+ print(" 3. The path points to a file that is not a valid PyTorch model")
39
+ print(f"File size: {file_size} bytes")
40
+ raise
41
+
42
+ if __name__ == "__main__":
43
+ import argparse
44
+
45
+ parser = argparse.ArgumentParser(description="Watermelon sweetness predictor")
46
+ parser.add_argument("--model_path", type=str, required=True, help="Path to the trained model")
47
+ args = parser.parse_args()
48
+
49
+ model = load_model(args.model_path)
50
+
51
+ def predict(audio, image):
52
+ try:
53
+ # Debug audio input
54
+ print(f"\033[92mDEBUG\033[0m: Audio input type: {type(audio)}")
55
+ print(f"\033[92mDEBUG\033[0m: Audio input value: {audio}")
56
+
57
+ # Handle different formats of audio input from Gradio
58
+ if audio is None:
59
+ return "Error: No audio provided. Please upload or record audio."
60
+
61
+ if isinstance(audio, tuple) and len(audio) >= 2:
62
+ sr, audio_data = audio[0], audio[-1]
63
+ print(f"\033[92mDEBUG\033[0m: Audio format: sr={sr}, audio_data shape={audio_data.shape if hasattr(audio_data, 'shape') else 'no shape'}")
64
+ elif isinstance(audio, tuple) and len(audio) == 1:
65
+ # Handle single element tuple
66
+ audio_data = audio[0]
67
+ sr = 44100 # Assume default sample rate
68
+ print(f"\033[92mDEBUG\033[0m: Single element audio tuple, using default sr={sr}")
69
+ elif isinstance(audio, np.ndarray):
70
+ # Handle direct numpy array
71
+ audio_data = audio
72
+ sr = 44100 # Assume default sample rate
73
+ print(f"\033[92mDEBUG\033[0m: Audio is numpy array, using default sr={sr}")
74
+ else:
75
+ return f"Error: Unexpected audio format: {type(audio)}"
76
+
77
+ # Ensure audio_data is correctly shaped
78
+ if isinstance(audio_data, np.ndarray):
79
+ # Make sure we have a 2D array
80
+ if len(audio_data.shape) == 1:
81
+ audio_data = np.expand_dims(audio_data, axis=0)
82
+ print(f"\033[92mDEBUG\033[0m: Reshaped 1D audio to 2D: {audio_data.shape}")
83
+
84
+ # If channels are the second dimension, transpose
85
+ if len(audio_data.shape) == 2 and audio_data.shape[0] > audio_data.shape[1]:
86
+ audio_data = np.transpose(audio_data)
87
+ print(f"\033[92mDEBUG\033[0m: Transposed audio shape to: {audio_data.shape}")
88
+
89
+ # Convert to tensor
90
+ audio_tensor = torch.tensor(audio_data).float()
91
+ print(f"\033[92mDEBUG\033[0m: Audio tensor shape: {audio_tensor.shape}")
92
+
93
+ # Process audio data and handle None case
94
+ mfcc = process_audio_data(audio_tensor, sr)
95
+ if mfcc is None:
96
+ return "Error: Failed to process audio data. Make sure your audio contains a clear tapping sound."
97
+
98
+ mfcc = mfcc.to(device)
99
+ print(f"\033[92mDEBUG\033[0m: MFCC shape: {mfcc.shape}")
100
+
101
+ # Debug image input
102
+ print(f"\033[92mDEBUG\033[0m: Image input type: {type(image)}")
103
+ print(f"\033[92mDEBUG\033[0m: Image shape: {image.shape if hasattr(image, 'shape') else 'No shape'}")
104
+
105
+ # Process image data and handle None case
106
+ if image is None:
107
+ return "Error: No image provided. Please upload an image."
108
+
109
+ # Handle different image formats
110
+ if isinstance(image, np.ndarray):
111
+ # Check if image is properly formatted (H, W, C) with 3 channels
112
+ if len(image.shape) == 3 and image.shape[2] == 3:
113
+ # Convert to tensor with shape (C, H, W) as expected by PyTorch
114
+ img = torch.tensor(image).float().permute(2, 0, 1)
115
+ print(f"\033[92mDEBUG\033[0m: Converted image to tensor with shape: {img.shape}")
116
+ elif len(image.shape) == 2:
117
+ # Grayscale image, expand to 3 channels
118
+ img = torch.tensor(image).float().unsqueeze(0).repeat(3, 1, 1)
119
+ print(f"\033[92mDEBUG\033[0m: Converted grayscale image to RGB tensor with shape: {img.shape}")
120
+ else:
121
+ return f"Error: Unexpected image shape: {image.shape}. Expected RGB or grayscale image."
122
+ else:
123
+ return f"Error: Unexpected image format: {type(image)}. Expected numpy array."
124
+
125
+ # Scale pixel values to [0, 1] if needed
126
+ if img.max() > 1.0:
127
+ img = img / 255.0
128
+ print(f"\033[92mDEBUG\033[0m: Scaled image pixel values to range [0, 1]")
129
+
130
+ # Get image dimensions and check if they're reasonable
131
+ print(f"\033[92mDEBUG\033[0m: Final image tensor shape before processing: {img.shape}")
132
+
133
+ # Process image
134
+ try:
135
+ img_processed = process_image_data(img)
136
+ if img_processed is None:
137
+ return "Error: Failed to process image data. Make sure your image clearly shows a watermelon."
138
+
139
+ img_processed = img_processed.to(device)
140
+ print(f"\033[92mDEBUG\033[0m: Processed image shape: {img_processed.shape}")
141
+ except Exception as e:
142
+ print(f"\033[91mERROR\033[0m: Image processing error: {str(e)}")
143
+ return f"Error in image processing: {str(e)}"
144
+
145
+ # Run inference
146
+ try:
147
+ # Based on the error, it seems infer() expects file paths, not tensors
148
+ # Let's create temporary files for the processed data
149
+ temp_dir = os.path.join(os.getcwd(), "temp")
150
+ os.makedirs(temp_dir, exist_ok=True)
151
+
152
+ # Save the audio to a temporary file if infer expects a file path
153
+ temp_audio_path = os.path.join(temp_dir, "temp_audio.wav")
154
+ if not isinstance(audio, str) and isinstance(audio, tuple) and len(audio) >= 2:
155
+ # If we have the original audio data and sample rate
156
+ audio_array = audio[-1]
157
+ sr = audio[0]
158
+
159
+ # Check if the audio array is valid
160
+ if audio_array.size == 0:
161
+ return "Error: Audio data is empty. Please record a longer audio clip."
162
+
163
+ # Get the duration of the audio
164
+ duration = audio_array.shape[-1] / sr
165
+ print(f"\033[92mDEBUG\033[0m: Audio duration: {duration:.2f} seconds")
166
+
167
+ # Check if we have at least 1 second of audio - but don't reject, just pad if needed
168
+ min_duration = 1.0 # minimum 1 second of audio
169
+ if duration < min_duration:
170
+ print(f"\033[93mWARNING\033[0m: Audio is shorter than {min_duration} seconds. Padding will be applied.")
171
+ # Calculate samples needed to reach minimum duration
172
+ samples_needed = int(min_duration * sr) - audio_array.shape[-1]
173
+ # Pad with zeros
174
+ padding = np.zeros((audio_array.shape[0], samples_needed), dtype=audio_array.dtype)
175
+ audio_array = np.concatenate([audio_array, padding], axis=1)
176
+ print(f"\033[92mDEBUG\033[0m: Padded audio to shape: {audio_array.shape}")
177
+
178
+ # Make sure audio has 2 dimensions
179
+ if len(audio_array.shape) == 1:
180
+ audio_array = np.expand_dims(audio_array, axis=0)
181
+
182
+ print(f"\033[92mDEBUG\033[0m: Audio array shape before saving: {audio_array.shape}, sr: {sr}")
183
+
184
+ # Make sure it's in the right format for torchaudio.save
185
+ audio_tensor = torch.tensor(audio_array).float()
186
+ if audio_tensor.dim() == 1:
187
+ audio_tensor = audio_tensor.unsqueeze(0)
188
+
189
+ torchaudio.save(temp_audio_path, audio_tensor, sr)
190
+ print(f"\033[92mDEBUG\033[0m: Saved temporary audio file to {temp_audio_path}")
191
+
192
+ # Let's also process the audio here to verify it works
193
+ test_mfcc = process_audio_data(audio_tensor, sr)
194
+ if test_mfcc is None:
195
+ return "Error: Unable to process the audio. Please try recording a different audio sample."
196
+ else:
197
+ print(f"\033[92mDEBUG\033[0m: Audio pre-check passed. MFCC shape: {test_mfcc.shape}")
198
+
199
+ audio_path = temp_audio_path
200
+ else:
201
+ # If we don't have a valid path, return an error
202
+ return "Error: Cannot process audio for inference. Invalid audio format."
203
+
204
+ # Save the image to a temporary file if infer expects a file path
205
+ temp_image_path = os.path.join(temp_dir, "temp_image.jpg")
206
+ if isinstance(image, np.ndarray):
207
+ import cv2
208
+ cv2.imwrite(temp_image_path, cv2.cvtColor(image, cv2.COLOR_RGB2BGR))
209
+ print(f"\033[92mDEBUG\033[0m: Saved temporary image file to {temp_image_path}")
210
+ image_path = temp_image_path
211
+ else:
212
+ # If we don't have a valid image, return an error
213
+ return "Error: Cannot process image for inference. Invalid image format."
214
+
215
+ # Create a modified version of infer that handles None returns
216
+ def safe_infer(audio_path, image_path, model, device):
217
+ try:
218
+ return infer(audio_path, image_path, model, device)
219
+ except Exception as e:
220
+ print(f"\033[91mERROR\033[0m: Error in infer function: {str(e)}")
221
+ # Try a more direct approach
222
+ try:
223
+ # Load audio and process
224
+ audio, sr = torchaudio.load(audio_path)
225
+ mfcc = process_audio_data(audio, sr)
226
+ if mfcc is None:
227
+ raise ValueError("Audio processing failed - MFCC is None")
228
+ mfcc = mfcc.to(device)
229
+
230
+ # Load image and process
231
+ image = cv2.imread(image_path)
232
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
233
+ image_tensor = torch.tensor(image).float().permute(2, 0, 1) / 255.0
234
+ img_processed = process_image_data(image_tensor)
235
+ if img_processed is None:
236
+ raise ValueError("Image processing failed - processed image is None")
237
+ img_processed = img_processed.to(device)
238
+
239
+ # Run model inference
240
+ with torch.no_grad():
241
+ prediction = model(mfcc, img_processed)
242
+ return prediction
243
+ except Exception as e2:
244
+ print(f"\033[91mERROR\033[0m: Fallback inference also failed: {str(e2)}")
245
+ raise
246
+
247
+ # Call our safer version
248
+ print(f"\033[92mDEBUG\033[0m: Calling safe_infer with audio_path={audio_path}, image_path={image_path}")
249
+ sweetness = safe_infer(audio_path, image_path, model, device)
250
+ if sweetness is None:
251
+ return "Error: The model was unable to make a prediction. Please try with different inputs."
252
+
253
+ print(f"\033[92mDEBUG\033[0m: Inference result: {sweetness.item()}")
254
+ return f"Predicted Sweetness: {sweetness.item():.2f}/10"
255
+ except Exception as e:
256
+ import traceback
257
+ print(f"\033[91mERROR\033[0m: Inference failed: {str(e)}")
258
+ print(f"\033[91mTraceback\033[0m: {traceback.format_exc()}")
259
+ return f"Error during inference: {str(e)}"
260
+
261
+ except Exception as e:
262
+ import traceback
263
+ print(f"\033[91mERROR\033[0m: Prediction failed: {str(e)}")
264
+ print(f"\033[91mTraceback\033[0m: {traceback.format_exc()}")
265
+ return f"Error processing input: {str(e)}"
266
+
267
+ audio_input = gr.Audio(label="Upload or Record Audio")
268
+ image_input = gr.Image(label="Upload or Capture Image")
269
+ output = gr.Textbox(label="Predicted Sweetness")
270
+
271
+ interface = gr.Interface(
272
+ fn=predict,
273
+ inputs=[audio_input, image_input],
274
+ outputs=output,
275
+ title="Watermelon Sweetness Predictor",
276
+ description="Upload an audio file and an image to predict the sweetness of a watermelon."
277
+ )
278
+
279
+ try:
280
+ interface.launch(share=True) # Enable sharing to avoid localhost access issues
281
+ except Exception as e:
282
+ print(f"\033[91mERROR\033[0m: Failed to launch interface: {e}")
283
+ print("\033[93mTIP\033[0m: If you're running in a remote environment or container, try setting additional parameters:")
284
+ print(" interface.launch(server_name='0.0.0.0', share=True)")
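As a self-contained illustration of the temp-file round trip that predict() performs before calling infer() (and the reason temp/temp_audio.wav and temp/temp_image.jpg appear in this commit), here is a sketch with synthetic data; the sample rate and image size are assumptions, not values from a real session:

import os
import numpy as np
import torch, torchaudio
import cv2

temp_dir = os.path.join(os.getcwd(), "temp")
os.makedirs(temp_dir, exist_ok=True)

# One second of synthetic mono audio at 44.1 kHz, shaped (channels, samples) for torchaudio.save
sr = 44100
torchaudio.save(os.path.join(temp_dir, "temp_audio.wav"), torch.randn(1, sr), sr)

# A synthetic RGB frame; cv2.imwrite expects BGR channel order, hence the conversion
rgb = (np.random.rand(1080, 1080, 3) * 255).astype(np.uint8)
cv2.imwrite(os.path.join(temp_dir, "temp_image.jpg"), cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR))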
infer.py ADDED
@@ -0,0 +1,52 @@
1
+ import torch, torchaudio, torchvision
2
+ import argparse
3
+ from preprocess import process_audio_data, process_image_data
4
+ from train import WatermelonModel
5
+
6
+
7
+ def infer(audio, image, model, device):
8
+ # Load and preprocess the input data
9
+ audio, sr = torchaudio.load(audio)
10
+ mfcc = process_audio_data(audio, sr)
11
+ img = process_image_data(torchvision.io.read_image(image).float())  # image is a file path here
12
+ if mfcc is None or img is None:
13
+ return None
14
+ # Add a batch dimension and move the tensors to the target device
+ mfcc, img = mfcc.unsqueeze(0).to(device), img.unsqueeze(0).to(device)
+
15
+ # Run inference
16
+ with torch.no_grad():
17
+ predicted_sweetness = model(mfcc, img).item()
18
+
19
+ return predicted_sweetness
20
+
21
+
22
+ if __name__ == "__main__":
23
+ parser = argparse.ArgumentParser(description="Run Watermelon Sweetness Prediction")
24
+ parser.add_argument(
25
+ "--model_path", type=str, required=True, help="Path to the saved model file"
26
+ )
27
+ parser.add_argument(
28
+ "--audio_path", type=str, required=True, help="Path to audio file"
29
+ )
30
+ parser.add_argument(
31
+ "--image_path", type=str, required=True, help="Path to image file"
32
+ )
33
+ args = parser.parse_args()
34
+
35
+ # Initialize the model and device
36
+ print(f"\033[92mINFO\033[0m: PyTorch version: {torch.__version__}")
37
+ device = torch.device(
38
+ "cuda"
39
+ if torch.cuda.is_available()
40
+ else "mps" if torch.backends.mps.is_available() else "cpu"
41
+ )
42
+ print(f"\033[92mINFO\033[0m: Using device: {device}")
43
+ model = WatermelonModel().to(device)
44
+ model.load_state_dict(torch.load(args.model_path, map_location=device))
45
+
46
+ # Example paths to audio and image files
47
+ audio_path = args.audio_path
48
+ image_path = args.image_path
49
+
50
+ # Run inference
51
+ sweetness = infer(audio_path, image_path, model, device)
52
+ print(f"Predicted sweetness: {sweetness}")
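A minimal sketch of calling infer() programmatically instead of through the CLI entry point, assuming a trained checkpoint and input files at placeholder paths:

import torch
from train import WatermelonModel
from infer import infer

device = torch.device("cpu")
model = WatermelonModel().to(device)
model.load_state_dict(torch.load("models/best.pt", map_location=device))  # placeholder checkpoint path
model.eval()

sweetness = infer("tap.wav", "melon.jpg", model, device)  # placeholder audio/image paths
print(f"Predicted sweetness: {sweetness}")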
preprocess.py ADDED
@@ -0,0 +1,45 @@
1
+ import torch
2
+ import torchaudio
3
+ import torchvision
4
+
5
+ resample_rate = 16000
6
+
7
+ def process_audio_data(waveform, sample_rate):
8
+ try:
9
+ waveform = waveform[0]  # use the left channel only
10
+ waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=resample_rate)(waveform)
11
+
12
+ if waveform.size(0) < 3 * resample_rate:
13
+ waveform = torch.nn.functional.pad(waveform, (0, 3 * resample_rate - waveform.size(0)))
14
+ else:
15
+ waveform = waveform[: 3 * resample_rate]
16
+
17
+ mfcc = torchaudio.transforms.MFCC(
18
+ sample_rate=resample_rate,
19
+ n_mfcc=13,
20
+ melkwargs={
21
+ "n_fft": 256,
22
+ "win_length": 256,
23
+ "hop_length": 128,
24
+ "n_mels": 40,
25
+ }
26
+ )(waveform)
27
+
28
+ return mfcc
29
+ except Exception as e:
30
+ print(f"ERR!: Error in audio processing: {e}")
31
+ return None
32
+
33
+ def process_image_data(image):
34
+ try:
35
+ image = torchvision.transforms.Resize((1080, 1080))(image)
36
+ image = image / 255.0
37
+ image = torchvision.transforms.Normalize(
38
+ mean=[0.485, 0.456, 0.406],
39
+ std=[0.229, 0.224, 0.225]
40
+ )(image)
41
+
42
+ return image
43
+ except Exception as e:
44
+ print(f"ERR!: Error in image processing: {e}")
45
+ return None
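A small sketch of the tensor shapes these helpers produce, assuming a mono 16 kHz waveform and a float image tensor in the 0-255 range:

import torch
from preprocess import process_audio_data, process_image_data, resample_rate

# Two seconds of mono audio; process_audio_data pads/crops to 3 s before computing the MFCC
waveform = torch.randn(1, 2 * resample_rate)
mfcc = process_audio_data(waveform, resample_rate)
print(mfcc.shape)  # torch.Size([13, 376]): 13 coefficients over 376 hop frames of 3 s at 16 kHz

# Any RGB image tensor with 0-255 values; the helper resizes to 1080x1080, scales, and normalizes
image = torch.rand(3, 720, 960) * 255
print(process_image_data(image).shape)  # torch.Size([3, 1080, 1080])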
preprocess_file.py ADDED
@@ -0,0 +1,149 @@
1
+ import os
2
+ import glob
3
+ import torch
4
+ import torchaudio
5
+ import torchvision
6
+ from torch.utils.data import Dataset
7
+ from concurrent.futures import ThreadPoolExecutor
8
+ from preprocess import process_audio_data, process_image_data, resample_rate
9
+
10
+ class PreprocessedDataset(Dataset):
11
+ def __init__(self, data_dir):
12
+ self.data_dir = data_dir
13
+ self.samples = [
14
+ os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith(".pt")
15
+ ]
16
+
17
+ def __len__(self):
18
+ return len(self.samples)
19
+
20
+ def __getitem__(self, idx):
21
+ sample_path = self.samples[idx]
22
+ mfcc, image, label = torch.load(sample_path)
23
+
24
+ # Process data
25
+ mfcc = process_audio_data(mfcc, resample_rate)
26
+ image = process_image_data(image)
27
+
28
+ return mfcc, image, label
29
+
30
+ def load_audio_file(audio_path):
31
+ if not os.path.exists(audio_path):
32
+ raise FileNotFoundError(f"Audio file not found: {audio_path}")
33
+
34
+ try:
35
+ # Try the default torchaudio loader first
36
+ waveform, sample_rate = torchaudio.load(audio_path)
37
+ except Exception as e:
38
+ print(f"Warning: Could not load {audio_path} with torchaudio: {e}")
39
+
40
+ # Fall back to librosa (you'll need to install it: pip install librosa)
41
+ try:
42
+ import librosa
43
+ import numpy as np
44
+
45
+ waveform_np, sample_rate = librosa.load(audio_path, sr=None)
46
+ # Convert to torch tensor with shape [1, length] to match torchaudio format
47
+ waveform = torch.from_numpy(waveform_np[np.newaxis, :]).float()
48
+ print(f"Successfully loaded with librosa: {audio_path}")
49
+ except Exception as final_e:
50
+ raise RuntimeError(f"Failed to load audio file {audio_path} with all available methods: {final_e}")
51
+
52
+ return waveform, sample_rate
53
+
54
+ def load_image_file(image_path):
55
+ if not os.path.exists(image_path):
56
+ raise FileNotFoundError(f"Image file not found: {image_path}")
57
+
58
+ image = torchvision.io.read_image(image_path)
59
+ return image
60
+
61
+ def process_sample(sample_path, save_dir):
62
+ # Recursively search for audio and image files
63
+ audio_files = []
64
+ image_files = []
65
+
66
+ # Walk through all subdirectories
67
+ for root, _, files in os.walk(sample_path):
68
+ for file in files:
69
+ if file.lower().endswith(('.wav', '.mp3', '.flac')):
70
+ audio_files.append(os.path.join(root, file))
71
+ elif file.lower().endswith(('.jpg', '.jpeg', '.png')):
72
+ image_files.append(os.path.join(root, file))
73
+
74
+ if not audio_files:
75
+ print(f"Warning: No audio file found in {sample_path}. Skipping this sample.")
76
+ return
77
+
78
+ if not image_files:
79
+ print(f"Warning: No image file found in {sample_path}. Skipping this sample.")
80
+ return
81
+
82
+ # Use the first found audio and image files
83
+ audio_path = audio_files[0]
84
+ image_path = image_files[0]
85
+
86
+ print(f"Processing audio: {audio_path}")
87
+ print(f"Processing image: {image_path}")
88
+
89
+ waveform, sample_rate = load_audio_file(audio_path)
90
+ image = load_image_file(image_path)
91
+
92
+ # Process data
93
+ mfcc = process_audio_data(waveform, sample_rate)
94
+ processed_image = process_image_data(image)
95
+
96
+ # Save processed data
97
+ save_path = os.path.join(save_dir, f"{os.path.basename(sample_path)}.pt")
98
+ torch.save((mfcc, processed_image, float(os.path.basename(sample_path))), save_path)
99
+ print(f"Processed and saved: {save_path}")
100
+
101
+ def process_and_save(data_dir, save_dir):
102
+ os.makedirs(save_dir, exist_ok=True)
103
+ sample_paths = [os.path.join(data_dir, d) for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))]
104
+
105
+ if not sample_paths:
106
+ print(f"Warning: No sample directories found in {data_dir}")
107
+ return
108
+
109
+ print(f"Found {len(sample_paths)} sample directories to process")
110
+
111
+ successful = 0
112
+ failed = 0
113
+
114
+ with ThreadPoolExecutor() as executor:
115
+ futures = [executor.submit(process_sample, path, save_dir) for path in sample_paths]
116
+ for future in futures:
117
+ try:
118
+ future.result() # Wait for all threads to complete
119
+ successful += 1
120
+ except Exception as e:
121
+ failed += 1
122
+ print(f"Error processing a sample: {e}")
123
+
124
+ print(f"Processing complete. Successfully processed: {successful}, Failed: {failed}")
125
+
126
+ if __name__ == "__main__":
127
+ import argparse
128
+
129
+ parser = argparse.ArgumentParser(description="Preprocess the dataset")
130
+ parser.add_argument(
131
+ "--data_dir",
132
+ type=str,
133
+ default="cleaned",
134
+ help="Path to the cleaned dataset directory",
135
+ )
136
+ parser.add_argument(
137
+ "--save_dir",
138
+ type=str,
139
+ default="processed",
140
+ help="Path to the processed dataset directory",
141
+ )
142
+ args = parser.parse_args()
143
+
144
+ print(f"Processing dataset from: {args.data_dir}")
145
+ print(f"Saving processed data to: {args.save_dir}")
146
+
147
+ process_and_save(args.data_dir, args.save_dir)
148
+
149
+ print("Preprocessing complete")
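Note that process_sample() derives the regression label from the sample directory name via float(os.path.basename(sample_path)), so each sample folder is expected to be named with its sweetness value. A sketch of the assumed layout and of reading one processed sample back (the "9.4" value is hypothetical):

# Assumed input layout:
#   cleaned/9.4/recording.wav
#   cleaned/9.4/photo.jpg
# which process_and_save() turns into processed/9.4.pt holding (mfcc, image, label).

import torch

mfcc, image, label = torch.load("processed/9.4.pt")
print(mfcc.shape, image.shape, label)  # e.g. torch.Size([13, 376]) torch.Size([3, 1080, 1080]) 9.4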
temp/temp_audio.wav ADDED
Binary file (58 Bytes)
 
temp/temp_image.jpg ADDED

Git LFS Details

  • SHA256: 4af3d138bad5d27184910f9d38bac40565a57a7bb2f5efc12a7d7e28aa8f126a
  • Pointer size: 132 Bytes
  • Size of remote file: 1.02 MB
train.py ADDED
@@ -0,0 +1,283 @@
1
+ import os
2
+ import time
3
+
4
+ import torch, torchaudio, torchvision
5
+ from torch.utils.data import Dataset, DataLoader
6
+ from torch.utils.tensorboard import SummaryWriter
7
+ import numpy as np
8
+
9
+ # Print library version information
10
+ print(f"\033[92mINFO\033[0m: PyTorch version: {torch.__version__}")
11
+ print(f"\033[92mINFO\033[0m: Torchaudio version: {torchaudio.__version__}")
12
+ print(f"\033[92mINFO\033[0m: Torchvision version: {torchvision.__version__}")
13
+
14
+ # Device selection
15
+ device = torch.device(
16
+ "cuda"
17
+ if torch.cuda.is_available()
18
+ else "mps" if torch.backends.mps.is_available() else "cpu"
19
+ )
20
+ print(f"\033[92mINFO\033[0m: Using device: {device}")
21
+
22
+ # Hyperparameters
23
+ batch_size = 1
24
+ epochs = 20
25
+
26
+ # Directory for model checkpoints
27
+ os.makedirs("./models/", exist_ok=True)
28
+
29
+
30
+ class PreprocessedDataset(Dataset):
31
+ def __init__(self, data_dir):
32
+ self.data_dir = data_dir
33
+ self.samples = [
34
+ os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith(".pt")
35
+ ]
36
+
37
+ def __len__(self):
38
+ return len(self.samples)
39
+
40
+ def __getitem__(self, idx):
41
+ sample_path = self.samples[idx]
42
+ mfcc, image, label = torch.load(sample_path)
43
+ return mfcc.float(), image.float(), label
44
+
45
+
46
+ class WatermelonModel(torch.nn.Module):
47
+ def __init__(self):
48
+ super(WatermelonModel, self).__init__()
49
+
50
+ # LSTM for audio features
51
+ self.lstm = torch.nn.LSTM(
52
+ input_size=376, hidden_size=64, num_layers=2, batch_first=True
53
+ )
54
+ self.lstm_fc = torch.nn.Linear(
55
+ 64, 128
56
+ ) # Convert LSTM output to 128-dim for merging
57
+
58
+ # ResNet50 for image features
59
+ self.resnet = torchvision.models.resnet50(pretrained=True)
60
+ self.resnet.fc = torch.nn.Linear(
61
+ self.resnet.fc.in_features, 128
62
+ ) # Convert ResNet output to 128-dim for merging
63
+
64
+ # Fully connected layers for final prediction
65
+ self.fc1 = torch.nn.Linear(256, 64)
66
+ self.fc2 = torch.nn.Linear(64, 1)
67
+ self.relu = torch.nn.ReLU()
68
+
69
+ def forward(self, mfcc, image):
70
+ # LSTM branch
71
+ lstm_output, _ = self.lstm(mfcc)
72
+ lstm_output = lstm_output[:, -1, :] # Use the output of the last time step
73
+ lstm_output = self.lstm_fc(lstm_output)
74
+
75
+ # ResNet branch
76
+ resnet_output = self.resnet(image)
77
+
78
+ # Concatenate LSTM and ResNet outputs
79
+ merged = torch.cat((lstm_output, resnet_output), dim=1)
80
+
81
+ # Fully connected layers
82
+ output = self.relu(self.fc1(merged))
83
+ output = self.fc2(output)
84
+
85
+ return output
86
+
87
+
88
+ def evaluate_model(model, test_loader, criterion):
89
+ model.eval()
90
+ test_loss = 0.0
91
+ mae_sum = 0.0
92
+ all_predictions = []
93
+ all_labels = []
94
+
95
+ # For debugging
96
+ debug_samples = []
97
+
98
+ with torch.no_grad():
99
+ for mfcc, image, label in test_loader:
100
+ mfcc, image, label = mfcc.to(device), image.to(device), label.to(device)
101
+ output = model(mfcc, image)
102
+ label = label.view(-1, 1).float()
103
+
104
+ # Store debug samples
105
+ if len(debug_samples) < 5:
106
+ debug_samples.append((output.item(), label.item()))
107
+
108
+ # Calculate MSE loss
109
+ loss = criterion(output, label)
110
+ test_loss += loss.item()
111
+
112
+ # Calculate MAE
113
+ mae = torch.abs(output - label).mean()
114
+ mae_sum += mae.item()
115
+
116
+ # Store predictions and labels for additional analysis
117
+ all_predictions.extend(output.cpu().numpy())
118
+ all_labels.extend(label.cpu().numpy())
119
+
120
+ avg_loss = test_loss / len(test_loader)
121
+ avg_mae = mae_sum / len(test_loader)
122
+
123
+ # Convert to numpy arrays for easier analysis
124
+ all_predictions = np.array(all_predictions).flatten()
125
+ all_labels = np.array(all_labels).flatten()
126
+
127
+ # Print debug samples
128
+ print("\nDEBUG SAMPLES (Prediction, Label):")
129
+ for i, (pred, label) in enumerate(debug_samples):
130
+ print(f"Sample {i+1}: Prediction = {pred:.4f}, Label = {label:.4f}, Difference = {abs(pred-label):.4f}")
131
+
132
+ return avg_loss, avg_mae, all_predictions, all_labels
133
+
134
+
135
+ def train_model():
136
+ # Load the dataset
137
+ data_dir = "./processed/"
138
+ dataset = PreprocessedDataset(data_dir)
139
+ n_samples = len(dataset)
140
+
141
+ # Check label range
142
+ all_labels = []
143
+ for i in range(min(10, len(dataset))):
144
+ _, _, label = dataset[i]
145
+ all_labels.append(label)
146
+
147
+ print("\nLABEL RANGE CHECK:")
148
+ print(f"Sample labels: {all_labels}")
149
+ print(f"Min label: {min(all_labels)}, Max label: {max(all_labels)}")
150
+
151
+ train_size = int(0.7 * n_samples)
152
+ val_size = int(0.2 * n_samples)
153
+ test_size = n_samples - train_size - val_size
154
+
155
+ train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
156
+ dataset, [train_size, val_size, test_size]
157
+ )
158
+
159
+ train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
160
+ val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
161
+ test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
162
+
163
+ model = WatermelonModel().to(device)
164
+
165
+ # Loss function and optimizer
166
+ criterion = torch.nn.MSELoss()
167
+ optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
168
+
169
+ # TensorBoard
170
+ writer = SummaryWriter("runs/")
171
+ global_step = 0
172
+
173
+ print(f"\033[92mINFO\033[0m: Training model for {epochs} epochs")
174
+ print(f"\033[92mINFO\033[0m: Training samples: {len(train_dataset)}")
175
+ print(f"\033[92mINFO\033[0m: Validation samples: {len(val_dataset)}")
176
+ print(f"\033[92mINFO\033[0m: Test samples: {len(test_dataset)}")
177
+ print(f"\033[92mINFO\033[0m: Batch size: {batch_size}")
178
+
179
+ best_val_loss = float('inf')
180
+ best_model_path = None
181
+
182
+ # Training loop
183
+ for epoch in range(epochs):
184
+ print(f"\033[92mINFO\033[0m: Training epoch ({epoch+1}/{epochs})")
185
+
186
+ model.train()
187
+ running_loss = 0.0
188
+ try:
189
+ for mfcc, image, label in train_loader:
190
+ mfcc, image, label = mfcc.to(device), image.to(device), label.to(device)
191
+
192
+ optimizer.zero_grad()
193
+ output = model(mfcc, image)
194
+ label = label.view(-1, 1).float()
195
+ loss = criterion(output, label)
196
+ loss.backward()
197
+ optimizer.step()
198
+
199
+ running_loss += loss.item()
200
+ writer.add_scalar("Training Loss", loss.item(), global_step)
201
+ global_step += 1
202
+ except Exception as e:
203
+ print(f"\033[91mERR!\033[0m: {e}")
204
+
205
+ # Validation phase
206
+ model.eval()
207
+ val_loss = 0.0
208
+ with torch.no_grad():
209
+ try:
210
+ for mfcc, image, label in val_loader:
211
+ mfcc, image, label = (
212
+ mfcc.to(device),
213
+ image.to(device),
214
+ label.to(device),
215
+ )
216
+ output = model(mfcc, image)
217
+ loss = criterion(output, label.view(-1, 1).float())
218
+ val_loss += loss.item()
219
+ except Exception as e:
220
+ print(f"\033[91mERR!\033[0m: {e}")
221
+
222
+ avg_val_loss = val_loss / len(val_loader)
223
+
224
+ # Log validation loss
225
+ writer.add_scalar("Validation Loss", avg_val_loss, epoch)
226
+
227
+ print(
228
+ f"Epoch [{epoch+1}/{epochs}], Training Loss: {running_loss/len(train_loader):.4f}, "
229
+ f"Validation Loss: {avg_val_loss:.4f}"
230
+ )
231
+
232
+ # Save a model checkpoint
233
+ timestamp = time.strftime("%Y%m%d-%H%M%S")
234
+ model_path = f"models/model_{epoch+1}_{timestamp}.pt"
235
+ torch.save(model.state_dict(), model_path)
236
+
237
+ # Save the best model based on validation loss
238
+ if avg_val_loss < best_val_loss:
239
+ best_val_loss = avg_val_loss
240
+ best_model_path = model_path
241
+ print(f"\033[92mINFO\033[0m: New best model saved with validation loss: {best_val_loss:.4f}")
242
+
243
+ print(
244
+ f"\033[92mINFO\033[0m: Model checkpoint epoch [{epoch+1}/{epochs}] saved: {model_path}"
245
+ )
246
+
247
+ print(f"\033[92mINFO\033[0m: Training complete")
248
+
249
+ # Load the best model for testing
250
+ print(f"\033[92mINFO\033[0m: Loading best model from {best_model_path} for testing")
251
+ model.load_state_dict(torch.load(best_model_path))
252
+
253
+ # Evaluate on test set
254
+ test_loss, test_mae, predictions, labels = evaluate_model(model, test_loader, criterion)
255
+
256
+ # Calculate additional metrics
257
+ max_error = np.max(np.abs(predictions - labels))
258
+ min_error = np.min(np.abs(predictions - labels))
259
+
260
+ print("\n" + "="*50)
261
+ print("TEST RESULTS:")
262
+ print(f"Test Loss (MSE): {test_loss:.4f}")
263
+ print(f"Mean Absolute Error: {test_mae:.4f}")
264
+ print(f"Maximum Absolute Error: {max_error:.4f}")
265
+ print(f"Minimum Absolute Error: {min_error:.4f}")
266
+
267
+ # Add test results to TensorBoard
268
+ writer.add_scalar("Test/MSE", test_loss, 0)
269
+ writer.add_scalar("Test/MAE", test_mae, 0)
270
+ writer.add_scalar("Test/Max_Error", max_error, 0)
271
+ writer.add_scalar("Test/Min_Error", min_error, 0)
272
+
273
+ # Create a histogram of absolute errors
274
+ abs_errors = np.abs(predictions - labels)
275
+ writer.add_histogram("Test/Absolute_Errors", abs_errors, 0)
276
+
277
+ print("="*50)
278
+
279
+ writer.close()
280
+
281
+
282
+ if __name__ == "__main__":
283
+ train_model()
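For orientation, a minimal shape check of WatermelonModel's forward pass, assuming the MFCC layout produced by preprocess.py (a (13, 376) tensor, which the LSTM consumes as 13 time steps of 376 features once batched) and a normalized 1080x1080 RGB image. Note that importing train executes its module-level setup (version prints, device selection, creating ./models/):

import torch
from train import WatermelonModel

model = WatermelonModel().eval()        # downloads ImageNet ResNet-50 weights on first use
mfcc = torch.randn(1, 13, 376)          # (batch, time steps, features) as the LSTM expects
image = torch.randn(1, 3, 1080, 1080)   # batched output of process_image_data
with torch.no_grad():
    print(model(mfcc, image).shape)     # torch.Size([1, 1]): one sweetness score per sample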
train_2.py ADDED
@@ -0,0 +1,449 @@
1
+ import os
2
+ import time
3
+ import argparse
4
+ import torch
5
+ import torchaudio
6
+ import torchvision
7
+ from torch.utils.data import Dataset, DataLoader
8
+ from torch.utils.tensorboard import SummaryWriter
9
+ import numpy as np
10
+ from efficient_model import MobileNetGRUModel, EfficientNetCNNModel, SqueezeNetTransformerModel
11
+
12
+ # Print library version information
13
+ print(f"\033[92mINFO\033[0m: PyTorch version: {torch.__version__}")
14
+ print(f"\033[92mINFO\033[0m: Torchaudio version: {torchaudio.__version__}")
15
+ print(f"\033[92mINFO\033[0m: Torchvision version: {torchvision.__version__}")
16
+
17
+ # Device selection
18
+ device = torch.device(
19
+ "cuda"
20
+ if torch.cuda.is_available()
21
+ else "mps" if torch.backends.mps.is_available() else "cpu"
22
+ )
23
+ print(f"\033[92mINFO\033[0m: Using device: {device}")
24
+
25
+ # Hyperparameters (using the best configuration from search)
26
+ batch_size = 4
27
+ epochs = 20
28
+ fc_hidden_size = 64
29
+ learning_rate = 0.0005
30
+ dropout_rate = 0.5
31
+
32
+ # Model save directory
33
+ os.makedirs("./models/", exist_ok=True)
34
+
35
+
36
+ class PreprocessedDataset(Dataset):
37
+ def __init__(self, data_dir):
38
+ self.data_dir = data_dir
39
+ self.samples = [
40
+ os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith(".pt")
41
+ ]
42
+
43
+ def __len__(self):
44
+ return len(self.samples)
45
+
46
+ def __getitem__(self, idx):
47
+ sample_path = self.samples[idx]
48
+ mfcc, image, label = torch.load(sample_path)
49
+ return mfcc.float(), image.float(), label
50
+
51
+
52
+ def calculate_mae(outputs, labels):
53
+ """Calculate Mean Absolute Error between outputs and labels"""
54
+ return torch.abs(outputs - labels).mean().item()
55
+
56
+
57
+ def evaluate_model(model, test_loader, criterion):
58
+ model.eval()
59
+ test_loss = 0.0
60
+ mae_sum = 0.0
61
+ all_predictions = []
62
+ all_labels = []
63
+
64
+ # For debugging
65
+ debug_samples = []
66
+
67
+ with torch.no_grad():
68
+ for mfcc, image, label in test_loader:
69
+ mfcc, image, label = mfcc.to(device), image.to(device), label.to(device)
70
+ output = model(mfcc, image)
71
+ label = label.view(-1, 1).float()
72
+
73
+ # Store debug samples (handling batch dimension properly)
74
+ if len(debug_samples) < 5:
75
+ # Extract individual samples from the batch
76
+ for i in range(min(len(output), 5 - len(debug_samples))):
77
+ debug_samples.append((output[i][0].item(), label[i][0].item()))
78
+
79
+ # Calculate MSE loss
80
+ loss = criterion(output, label)
81
+ test_loss += loss.item()
82
+
83
+ # Calculate MAE
84
+ mae = torch.abs(output - label).mean()
85
+ mae_sum += mae.item()
86
+
87
+ # Store predictions and labels for additional analysis
88
+ all_predictions.extend(output.cpu().numpy())
89
+ all_labels.extend(label.cpu().numpy())
90
+
91
+ avg_loss = test_loss / len(test_loader)
92
+ avg_mae = mae_sum / len(test_loader)
93
+
94
+ # Convert to numpy arrays for easier analysis
95
+ all_predictions = np.array(all_predictions).flatten()
96
+ all_labels = np.array(all_labels).flatten()
97
+
98
+ # Print debug samples
99
+ print("\nDEBUG SAMPLES (Prediction, Label):")
100
+ for i, (pred, label) in enumerate(debug_samples):
101
+ print(f"Sample {i+1}: Prediction = {pred:.4f}, Label = {label:.4f}, Difference = {abs(pred-label):.4f}")
102
+
103
+ return avg_loss, avg_mae, all_predictions, all_labels
104
+
105
+
106
+ def train_model(model_type):
107
+ try:
108
+ # Create model based on type
109
+ if model_type == "mobilenet_gru":
110
+ model = MobileNetGRUModel(
111
+ gru_hidden_size=32,
112
+ gru_layers=1,
113
+ fc_hidden_size=fc_hidden_size,
114
+ dropout_rate=dropout_rate
115
+ ).to(device)
116
+ model_name = "MobileNetGRU"
117
+ elif model_type == "efficientnet_cnn":
118
+ model = EfficientNetCNNModel(
119
+ fc_hidden_size=fc_hidden_size,
120
+ dropout_rate=dropout_rate
121
+ ).to(device)
122
+ model_name = "EfficientNetCNN"
123
+ elif model_type == "squeezenet_transformer":
124
+ model = SqueezeNetTransformerModel(
125
+ nhead=4,
126
+ dim_feedforward=128,
127
+ fc_hidden_size=fc_hidden_size,
128
+ dropout_rate=dropout_rate
129
+ ).to(device)
130
+ model_name = "SqueezeNetTransformer"
131
+ else:
132
+ raise ValueError(f"Unknown model type: {model_type}")
133
+
134
+ # Data loading
135
+ data_dir = "./processed/"
136
+ dataset = PreprocessedDataset(data_dir)
137
+ n_samples = len(dataset)
138
+
139
+ # Check label range
140
+ all_labels = []
141
+ for i in range(min(10, len(dataset))):
142
+ _, _, label = dataset[i]
143
+ all_labels.append(label)
144
+
145
+ print("\nLABEL RANGE CHECK:")
146
+ print(f"Sample labels: {all_labels}")
147
+ print(f"Min label: {min(all_labels)}, Max label: {max(all_labels)}")
148
+
149
+ train_size = int(0.7 * n_samples)
150
+ val_size = int(0.2 * n_samples)
151
+ test_size = n_samples - train_size - val_size
152
+
153
+ train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
154
+ dataset, [train_size, val_size, test_size]
155
+ )
156
+
157
+ train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
158
+ val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
159
+ test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
160
+
161
+ # Loss function and optimizer
162
+ criterion = torch.nn.MSELoss()
163
+ optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
164
+
165
+ # TensorBoard
166
+ writer = SummaryWriter(f"runs/{model_name}/")
167
+ global_step = 0
168
+
169
+ print(f"\033[92mINFO\033[0m: Training {model_name} model for {epochs} epochs")
170
+ print(f"\033[92mINFO\033[0m: Training samples: {len(train_dataset)}")
171
+ print(f"\033[92mINFO\033[0m: Validation samples: {len(val_dataset)}")
172
+ print(f"\033[92mINFO\033[0m: Test samples: {len(test_dataset)}")
173
+ print(f"\033[92mINFO\033[0m: Batch size: {batch_size}")
174
+ print(f"\033[92mINFO\033[0m: Learning rate: {learning_rate}")
175
+ print(f"\033[92mINFO\033[0m: Dropout rate: {dropout_rate}")
176
+
177
+ best_val_loss = float('inf')
178
+ best_model_path = None
179
+
180
+ # Calculate model size
181
+ model_size = sum(p.numel() for p in model.parameters()) / 1e6 # in millions
182
+ print(f"\033[92mINFO\033[0m: Model parameters: {model_size:.2f}M")
183
+
184
+ # Training loop
185
+ for epoch in range(epochs):
186
+ print(f"\033[92mINFO\033[0m: Training epoch ({epoch+1}/{epochs})")
187
+
188
+ model.train()
189
+ running_loss = 0.0
190
+ running_mae = 0.0
191
+ n_batches = 0
192
+
193
+ start_time = time.time()
194
+
195
+ try:
196
+ for mfcc, image, label in train_loader:
197
+ mfcc, image, label = mfcc.to(device), image.to(device), label.to(device)
198
+
199
+ optimizer.zero_grad()
200
+ output = model(mfcc, image)
201
+ label = label.view(-1, 1).float()
202
+ loss = criterion(output, label)
203
+ loss.backward()
204
+ optimizer.step()
205
+
206
+ running_loss += loss.item()
207
+ running_mae += calculate_mae(output, label)
208
+ n_batches += 1
209
+
210
+ writer.add_scalar("Training/Loss", loss.item(), global_step)
211
+ writer.add_scalar("Training/MAE", calculate_mae(output, label), global_step)
212
+ global_step += 1
213
+ except Exception as e:
214
+ print(f"\033[91mERR!\033[0m: {e}")
215
+
216
+ epoch_time = time.time() - start_time
217
+
218
+ # Validation phase
219
+ model.eval()
220
+ val_loss = 0.0
221
+ val_mae = 0.0
222
+ val_batches = 0
223
+
224
+ with torch.no_grad():
225
+ try:
226
+ for mfcc, image, label in val_loader:
227
+ mfcc, image, label = (
228
+ mfcc.to(device),
229
+ image.to(device),
230
+ label.to(device),
231
+ )
232
+ output = model(mfcc, image)
233
+ label = label.view(-1, 1).float()
234
+
235
+ # Calculate loss
236
+ loss = criterion(output, label)
237
+ val_loss += loss.item()
238
+
239
+ # Calculate MAE
240
+ val_mae += calculate_mae(output, label)
241
+ val_batches += 1
242
+ except Exception as e:
243
+ print(f"\033[91mERR!\033[0m: {e}")
244
+
245
+ avg_train_loss = running_loss / n_batches
246
+ avg_train_mae = running_mae / n_batches
247
+ avg_val_loss = val_loss / val_batches
248
+ avg_val_mae = val_mae / val_batches
249
+
250
+ # Record validation metrics
251
+ writer.add_scalar("Validation/Loss", avg_val_loss, epoch)
252
+ writer.add_scalar("Validation/MAE", avg_val_mae, epoch)
253
+
254
+ print(
255
+ f"Epoch [{epoch+1}/{epochs}], Time: {epoch_time:.2f}s, "
256
+ f"Train Loss: {avg_train_loss:.4f}, Train MAE: {avg_train_mae:.4f}, "
257
+ f"Val Loss: {avg_val_loss:.4f}, Val MAE: {avg_val_mae:.4f}"
258
+ )
259
+
260
+ # Save model checkpoint
261
+ timestamp = time.strftime("%Y%m%d-%H%M%S")
262
+ model_path = f"models/{model_name}_model_{epoch+1}_{timestamp}.pt"
263
+ torch.save(model.state_dict(), model_path)
264
+
265
+ # Save the best model based on validation loss
266
+ if avg_val_loss < best_val_loss:
267
+ best_val_loss = avg_val_loss
268
+ best_model_path = model_path
269
+ print(f"\033[92mINFO\033[0m: New best model saved with validation loss: {best_val_loss:.4f}")
270
+
271
+ print(
272
+ f"\033[92mINFO\033[0m: Model checkpoint epoch [{epoch+1}/{epochs}] saved: {model_path}"
273
+ )
274
+
275
+ print(f"\033[92mINFO\033[0m: Training complete")
276
+
277
+ # Load the best model for testing
278
+ print(f"\033[92mINFO\033[0m: Loading best model from {best_model_path} for testing")
279
+ model.load_state_dict(torch.load(best_model_path))
280
+
281
+ # Evaluate on test set
282
+ test_loss, test_mae, predictions, labels = evaluate_model(model, test_loader, criterion)
283
+
284
+ # Calculate additional metrics
285
+ max_error = np.max(np.abs(predictions - labels))
286
+ min_error = np.min(np.abs(predictions - labels))
287
+
288
+ print("\n" + "="*50)
289
+ print(f"TEST RESULTS FOR {model_name}:")
290
+ print(f"Test Loss (MSE): {test_loss:.4f}")
291
+ print(f"Mean Absolute Error: {test_mae:.4f}")
292
+ print(f"Maximum Absolute Error: {max_error:.4f}")
293
+ print(f"Minimum Absolute Error: {min_error:.4f}")
294
+
295
+ # Add test results to TensorBoard
296
+ writer.add_scalar("Test/MSE", test_loss, 0)
297
+ writer.add_scalar("Test/MAE", test_mae, 0)
298
+ writer.add_scalar("Test/Max_Error", max_error, 0)
299
+ writer.add_scalar("Test/Min_Error", min_error, 0)
300
+
301
+ # Create a histogram of absolute errors
302
+ abs_errors = np.abs(predictions - labels)
303
+ writer.add_histogram("Test/Absolute_Errors", abs_errors, 0)
304
+
305
+ print("="*50)
306
+
307
+ # Final summary
308
+ print("\nTRAINING SUMMARY:")
309
+ print(f"Model: {model_name}")
310
+ print(f"Model Size: {model_size:.2f}M parameters")
311
+ print(f"Best Validation Loss: {best_val_loss:.4f}")
312
+ print(f"Final Test Loss: {test_loss:.4f}")
313
+ print(f"Final Test MAE: {test_mae:.4f}")
314
+ print(f"Best model saved at: {best_model_path}")
315
+
316
+ writer.close()
317
+
318
+ # Return metrics for comparison
319
+ return {
320
+ "model_name": model_name,
321
+ "model_size": model_size,
322
+ "val_loss": best_val_loss,
323
+ "test_loss": test_loss,
324
+ "test_mae": test_mae,
325
+ "model_path": best_model_path
326
+ }
327
+
328
+ except Exception as e:
329
+ print(f"\033[91mERR!\033[0m: Error training {model_type}: {e}")
330
+ # Return a placeholder result
331
+ return {
332
+ "model_name": model_type,
333
+ "model_size": 0,
334
+ "val_loss": float('inf'),
335
+ "test_loss": float('inf'),
336
+ "test_mae": float('inf'),
337
+ "model_path": None,
338
+ "error": str(e)
339
+ }
340
+
341
+
342
+ def test_cpu_inference(model_path, model_type):
343
+ """Test CPU inference speed for the given model"""
344
+ # Create model based on type
345
+ if model_type == "mobilenet_gru":
346
+ model = MobileNetGRUModel(
347
+ gru_hidden_size=32,
348
+ gru_layers=1,
349
+ fc_hidden_size=fc_hidden_size,
350
+ dropout_rate=dropout_rate
351
+ )
352
+ model_name = "MobileNetGRU"
353
+ elif model_type == "efficientnet_cnn":
354
+ model = EfficientNetCNNModel(
355
+ fc_hidden_size=fc_hidden_size,
356
+ dropout_rate=dropout_rate
357
+ )
358
+ model_name = "EfficientNetCNN"
359
+ elif model_type == "squeezenet_transformer":
360
+ model = SqueezeNetTransformerModel(
361
+ nhead=4,
362
+ dim_feedforward=128,
363
+ fc_hidden_size=fc_hidden_size,
364
+ dropout_rate=dropout_rate
365
+ )
366
+ model_name = "SqueezeNetTransformer"
367
+ else:
368
+ raise ValueError(f"Unknown model type: {model_type}")
369
+
370
+ # Load model weights
371
+ model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
372
+ model.eval()
373
+
374
+ # Create dummy input
375
+ dummy_mfcc = torch.randn(1, 10, 376) # Batch size 1, 10 time steps, 376 features
376
+ dummy_image = torch.randn(1, 3, 224, 224) # Batch size 1, 3 channels, 224x224 image
377
+
378
+ # Warm-up
379
+ for _ in range(10):
380
+ _ = model(dummy_mfcc, dummy_image)
381
+
382
+ # Measure inference time
383
+ num_runs = 100
384
+ start_time = time.time()
385
+ for _ in range(num_runs):
386
+ _ = model(dummy_mfcc, dummy_image)
387
+ end_time = time.time()
388
+
389
+ avg_time = (end_time - start_time) / num_runs
390
+
391
+ print(f"\n{model_name} CPU Inference Time:")
392
+ print(f"Average over {num_runs} runs: {avg_time*1000:.2f} ms")
393
+
394
+ return avg_time
395
+
396
+
397
+ if __name__ == "__main__":
398
+ parser = argparse.ArgumentParser(description="Train and evaluate efficient models")
399
+ parser.add_argument(
400
+ "--model",
401
+ type=str,
402
+ choices=["mobilenet_gru", "efficientnet_cnn", "squeezenet_transformer", "all"],
403
+ default="all",
404
+ help="Model architecture to train"
405
+ )
406
+ args = parser.parse_args()
407
+
408
+ results = []
409
+
410
+ if args.model == "all":
411
+ # Train all models
412
+ for model_type in ["mobilenet_gru", "efficientnet_cnn", "squeezenet_transformer"]:
413
+ print(f"\n\n{'='*50}")
414
+ print(f"TRAINING {model_type.upper()}")
415
+ print(f"{'='*50}\n")
416
+ result = train_model(model_type)
417
+ results.append(result)
418
+
419
+ # Test CPU inference
420
+ inference_time = test_cpu_inference(result["model_path"], model_type)
421
+ result["inference_time"] = inference_time
422
+ else:
423
+ # Train specific model
424
+ result = train_model(args.model)
425
+ results.append(result)
426
+
427
+ # Test CPU inference
428
+ inference_time = test_cpu_inference(result["model_path"], args.model)
429
+ result["inference_time"] = inference_time
430
+
431
+ # Compare results
432
+ print("\n\n" + "="*80)
433
+ print("MODEL COMPARISON")
434
+ print("="*80)
435
+ print(f"{'Model':<25} {'Size (M)':<10} {'Val Loss':<10} {'Test Loss':<10} {'Test MAE':<10} {'CPU Time (ms)':<15}")
436
+ print("-"*80)
437
+
438
+ for result in results:
439
+ print(f"{result['model_name']:<25} {result['model_size']:<10.2f} {result['val_loss']:<10.4f} "
440
+ f"{result['test_loss']:<10.4f} {result['test_mae']:<10.4f} {result['inference_time']*1000:<15.2f}")
441
+
442
+ print("="*80)
443
+
444
+ # Find best model
445
+ best_model = min(results, key=lambda x: x["test_mae"])
446
+ print(f"\nBEST MODEL: {best_model['model_name']}")
447
+ print(f"Test MAE: {best_model['test_mae']:.4f}")
448
+ print(f"CPU Inference Time: {best_model['inference_time']*1000:.2f} ms")
449
+ print(f"Model Path: {best_model['model_path']}")