Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- .gitattributes +2 -0
- .gradio/certificate.pem +31 -0
- README.md +3 -9
- __pycache__/infer_watermelon.cpython-310.pyc +0 -0
- __pycache__/train_watermelon.cpython-310.pyc +0 -0
- app.py +318 -0
- app_local_backup.py +290 -0
- infer_watermelon.py +150 -0
- models/model_1_20250406-064126.pt +3 -0
- models/model_1_20250406-064635.pt +3 -0
- models/model_2_20250406-065053.pt +3 -0
- models/watermelon_model_final.pt +3 -0
- requirements.txt +6 -0
- runs/events.out.tfevents.1743920786.vm-jinzq.2059144.0 +3 -0
- runs/events.out.tfevents.1743920828.vm-jinzq.2059396.0 +3 -0
- runs/events.out.tfevents.1743921401.jzqdebug-c245a8-job-84fn7.812.0 +3 -0
- runs/events.out.tfevents.1743921735.jzqdebug-c245a8-job-84fn7.1262.0 +3 -0
- temp/temp_audio.wav +3 -0
- temp/temp_image.jpg +3 -0
- train_watermelon.py +261 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
temp/temp_audio.wav filter=lfs diff=lfs merge=lfs -text
|
37 |
+
temp/temp_image.jpg filter=lfs diff=lfs merge=lfs -text
|
.gradio/certificate.pem
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
-----BEGIN CERTIFICATE-----
|
2 |
+
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
|
3 |
+
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
|
4 |
+
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
|
5 |
+
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
|
6 |
+
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
|
7 |
+
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
|
8 |
+
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
|
9 |
+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
|
10 |
+
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
|
11 |
+
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
|
12 |
+
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
|
13 |
+
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
|
14 |
+
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
|
15 |
+
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
|
16 |
+
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
|
17 |
+
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
|
18 |
+
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
|
19 |
+
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
|
20 |
+
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
|
21 |
+
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
|
22 |
+
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
|
23 |
+
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
|
24 |
+
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
|
25 |
+
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
|
26 |
+
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
|
27 |
+
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
|
28 |
+
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
|
29 |
+
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
|
30 |
+
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
|
31 |
+
-----END CERTIFICATE-----
|
README.md
CHANGED
@@ -1,12 +1,6 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji: 🚀
|
4 |
-
colorFrom: yellow
|
5 |
-
colorTo: red
|
6 |
-
sdk: gradio
|
7 |
-
sdk_version: 5.23.3
|
8 |
app_file: app.py
|
9 |
-
|
|
|
10 |
---
|
11 |
-
|
12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: watermelon2
|
|
|
|
|
|
|
|
|
|
|
3 |
app_file: app.py
|
4 |
+
sdk: gradio
|
5 |
+
sdk_version: 5.9.1
|
6 |
---
|
|
|
|
__pycache__/infer_watermelon.cpython-310.pyc
ADDED
Binary file (4.39 kB). View file
|
|
__pycache__/train_watermelon.cpython-310.pyc
ADDED
Binary file (6.74 kB). View file
|
|
app.py
ADDED
@@ -0,0 +1,318 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
import torch
|
4 |
+
import numpy as np
|
5 |
+
import gradio as gr
|
6 |
+
import torchaudio
|
7 |
+
import torchvision
|
8 |
+
|
9 |
+
# Import Gradio Spaces GPU decorator
|
10 |
+
# Detect the Hugging Face Spaces GPU decorator.
# The ZeroGPU documentation imports the decorator from the standalone ``spaces``
# package (``import spaces`` / ``@spaces.GPU``), so that path is tried first.
# The previous ``from gradio import spaces`` spelling is kept as a fallback so
# any environment where it worked keeps working.
try:
    try:
        import spaces
    except ImportError:
        from gradio import spaces
    HAS_SPACES = True
    print("\033[92mINFO\033[0m: Gradio Spaces detected, GPU acceleration will be enabled")
except ImportError:
    # Neither import path is available: run the plain (undecorated) prediction function.
    HAS_SPACES = False
    print("\033[93mWARN\033[0m: gradio.spaces not available, running without GPU optimization")
|
17 |
+
|
18 |
+
# Add parent directory to path to import preprocess functions
|
19 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
20 |
+
|
21 |
+
# Import the model class from train_watermelon.py
|
22 |
+
from train_watermelon import WatermelonModel
|
23 |
+
|
24 |
+
# Modified version of process_audio_data specifically for the app to handle various tensor shapes
|
25 |
+
def app_process_audio_data(waveform, sample_rate, target_sample_rate=16000,
                           duration_seconds=3, n_mfcc=13):
    """Convert a raw waveform into a fixed-length MFCC feature tensor.

    The waveform is reduced to one channel, resampled, padded or trimmed to a
    fixed duration and passed through ``torchaudio.transforms.MFCC``.

    Args:
        waveform: Audio tensor with 1, 2 or 3 dimensions. For 3-D input the
            first item along dim 0 is used; for 2-D input the first row
            (treated as the first channel) is used.
        sample_rate: Sample rate of ``waveform`` in Hz.
        target_sample_rate: Rate the audio is resampled to before feature
            extraction (default 16000, the previous hard-coded value).
        duration_seconds: Length the audio is padded/trimmed to (default 3).
        n_mfcc: Number of MFCC coefficients (default 13).

    Returns:
        The MFCC tensor produced by torchaudio, or ``None`` if any step fails
        (the error and traceback are printed). Callers check for ``None``.
    """
    try:
        print(f"\033[92mDEBUG\033[0m: Processing audio - Initial shape: {waveform.shape}, Sample rate: {sample_rate}")

        # Handle different tensor dimensions
        if waveform.dim() == 3:
            print("\033[92mDEBUG\033[0m: Found 3D tensor, converting to 2D")
            # For 3D tensor, take the first item (batch dimension)
            waveform = waveform[0]

        if waveform.dim() == 2:
            # Use the first channel for stereo audio
            waveform = waveform[0]
            print(f"\033[92mDEBUG\033[0m: Using first channel, new shape: {waveform.shape}")

        # Resample and MFCC operate on floating-point data; integer tensors
        # previously fell through to the error path and produced None.
        if not waveform.is_floating_point():
            waveform = waveform.float()

        # Resample only when the input rate differs from the target rate
        if sample_rate != target_sample_rate:
            print(f"\033[92mDEBUG\033[0m: Resampling from {sample_rate}Hz to {target_sample_rate}Hz")
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
            waveform = resampler(waveform)

        # Force a fixed number of samples: zero-pad short clips, trim long ones
        target_length = int(duration_seconds * target_sample_rate)
        current_length = waveform.size(0)
        if current_length < target_length:
            print(f"\033[92mDEBUG\033[0m: Padding audio from {current_length} to {target_length} samples")
            waveform = torch.nn.functional.pad(waveform, (0, target_length - current_length))
        else:
            print(f"\033[92mDEBUG\033[0m: Trimming audio from {current_length} to {target_length} samples")
            waveform = waveform[:target_length]

        # Apply MFCC transformation
        print("\033[92mDEBUG\033[0m: Applying MFCC transformation")
        mfcc_transform = torchaudio.transforms.MFCC(
            sample_rate=target_sample_rate,
            n_mfcc=n_mfcc,
            melkwargs={
                "n_fft": 256,
                "win_length": 256,
                "hop_length": 128,
                "n_mels": 40,
            },
        )

        mfcc = mfcc_transform(waveform)
        print(f"\033[92mDEBUG\033[0m: MFCC output shape: {mfcc.shape}")

        return mfcc
    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller
        # turn ``None`` into a user-facing error message.
        import traceback
        print(f"\033[91mERR!\033[0m: Error in audio processing: {e}")
        print(traceback.format_exc())
        return None
|
77 |
+
|
78 |
+
# Similarly for images, but let's import the original one
|
79 |
+
from preprocess import process_image_data
|
80 |
+
|
81 |
+
# Define prediction function
|
82 |
+
def _format_sweetness_result(score):
    """Render a sweetness score as the user-facing message with a qualitative verdict."""
    result = f"Predicted Sweetness: {score:.2f}/13"
    if score < 9:
        result += "\n\nThis watermelon is not very sweet. You might want to choose another one."
    elif score < 10:
        result += "\n\nThis watermelon has moderate sweetness."
    elif score < 11:
        result += "\n\nThis watermelon is sweet! A good choice."
    else:
        result += "\n\nThis watermelon is very sweet! Excellent choice!"
    return result


def predict_sweetness(audio, image, model_path):
    """Predict sweetness of a watermelon from audio and image input.

    Args:
        audio: ``(sample_rate, samples)`` tuple, a longer tuple whose first
            item is the sample rate and whose last item is the samples, or a
            path to an audio file.
        image: Image as a numpy array, or a path to an image file.
        model_path: Path to the saved ``WatermelonModel`` state dict.

    Returns:
        A user-facing string: the formatted prediction, or a message starting
        with ``"Error:"`` when an input is missing/unsupported or any step fails.
    """
    try:
        # Fail fast with a readable message instead of a traceback when the
        # user pressed the button without providing one of the inputs.
        if audio is None:
            return "Error: No audio provided. Please upload or record the tapping sound."
        if image is None:
            return "Error: No image provided. Please upload or capture a photo of the watermelon."
        if not isinstance(image, (np.ndarray, str)):
            return f"Error: Unsupported image format. Got {type(image)}"

        # Debug information about input types
        print(f"\033[92mDEBUG\033[0m: Audio input type: {type(audio)}")
        print(f"\033[92mDEBUG\033[0m: Image input type: {type(image)}")
        if isinstance(image, np.ndarray):
            print(f"\033[92mDEBUG\033[0m: Image input shape: {image.shape}")

        # Handle different audio input formats
        if isinstance(audio, tuple) and len(audio) == 2:
            # Standard Gradio format: (sample_rate, audio_data)
            sample_rate, audio_data = audio
            print(f"\033[92mDEBUG\033[0m: Audio sample rate: {sample_rate}")
            print(f"\033[92mDEBUG\033[0m: Audio data shape: {audio_data.shape}")
        elif isinstance(audio, tuple) and len(audio) > 2:
            # Longer tuple: first item is the sample rate, last item the samples
            sample_rate, audio_data = audio[0], audio[-1]
            print(f"\033[92mDEBUG\033[0m: Audio sample rate: {sample_rate}")
            print(f"\033[92mDEBUG\033[0m: Audio data shape: {audio_data.shape}")
        elif isinstance(audio, str):
            # Direct path to audio file
            audio_data, sample_rate = torchaudio.load(audio)
            print(f"\033[92mDEBUG\033[0m: Loaded audio from path with shape: {audio_data.shape}")
        else:
            return f"Error: Unsupported audio format. Got {type(audio)}"

        # Audio handling - direct processing from the data in memory
        if isinstance(audio_data, np.ndarray):
            print(f"\033[92mDEBUG\033[0m: Converting numpy audio with shape {audio_data.shape} to tensor")
            # NOTE(review): samples are cast to float without rescaling, as before;
            # presumably Gradio delivers integer PCM here - confirm this matches
            # the value range the model was trained on.
            audio_tensor = torch.tensor(audio_data).float()
            if audio_data.ndim == 1:
                # Single channel audio: add a leading channel dimension
                audio_tensor = audio_tensor.unsqueeze(0)
            elif audio_data.ndim == 2 and audio_data.shape[0] > audio_data.shape[1]:
                # More rows than columns, probably (samples, channels): put channels first
                audio_tensor = audio_tensor.t().contiguous()
        else:
            # Already a tensor
            audio_tensor = audio_data.float()

        print(f"\033[92mDEBUG\033[0m: Audio tensor shape before processing: {audio_tensor.shape}")
        mfcc = app_process_audio_data(audio_tensor, sample_rate)
        print(f"\033[92mDEBUG\033[0m: MFCC tensor shape after processing: {mfcc.shape if mfcc is not None else None}")
        if mfcc is None:
            return "Error: Failed to process inputs. Please check the debug logs."

        # Image handling
        import tempfile
        from PIL import Image

        cleanup_path = None
        if isinstance(image, np.ndarray):
            temp_dir = "temp"
            os.makedirs(temp_dir, exist_ok=True)
            # A unique file per request: a single shared temp_image.jpg could be
            # overwritten by a concurrent request between save and read.
            with tempfile.NamedTemporaryFile(suffix=".jpg", dir=temp_dir, delete=False) as tmp:
                image_path = tmp.name
            cleanup_path = image_path
            print(f"\033[92mDEBUG\033[0m: Converting numpy image with shape {image.shape} to PIL")
            # JPEG cannot store an alpha channel; convert so RGBA/grayscale arrays
            # are written (and later decoded) as 3-channel RGB.
            Image.fromarray(image).convert("RGB").save(image_path)
            print(f"\033[92mDEBUG\033[0m: Saved image to {image_path}")
        else:
            # Image is already a path
            image_path = image
            print(f"\033[92mDEBUG\033[0m: Using provided image path: {image_path}")

        # Process image
        print(f"\033[92mDEBUG\033[0m: Loading and preprocessing image from {image_path}")
        try:
            image_tensor = torchvision.io.read_image(image_path)
        finally:
            # Only remove files this function created, never a caller-supplied path
            if cleanup_path is not None and os.path.exists(cleanup_path):
                os.remove(cleanup_path)
        print(f"\033[92mDEBUG\033[0m: Loaded image shape: {image_tensor.shape}")
        processed_image = process_image_data(image_tensor.float())
        print(f"\033[92mDEBUG\033[0m: Processed image shape: {processed_image.shape if processed_image is not None else None}")
        if processed_image is None:
            return "Error: Failed to process inputs. Please check the debug logs."

        # Check CUDA availability inside the (possibly GPU-decorated) function
        if torch.cuda.is_available():
            device = torch.device("cuda")
            print(f"\033[92mINFO\033[0m: CUDA is available. Using device: {device}")
        else:
            device = torch.device("cpu")
            print(f"\033[92mINFO\033[0m: CUDA is not available. Using device: {device}")

        # Load model inside the function to ensure it's on the correct device.
        # NOTE(review): torch.load unpickles the checkpoint - only load trusted
        # files; consider weights_only=True if the installed torch supports it.
        model = WatermelonModel().to(device)
        model.load_state_dict(torch.load(model_path, map_location=device))
        model.eval()
        print(f"\033[92mINFO\033[0m: Loaded model from {model_path}")

        # Add batch dimension for inference and move to device
        mfcc = mfcc.unsqueeze(0).to(device)
        print(f"\033[92mDEBUG\033[0m: Final MFCC shape with batch dimension: {mfcc.shape}")
        processed_image = processed_image.unsqueeze(0).to(device)
        print(f"\033[92mDEBUG\033[0m: Final image shape with batch dimension: {processed_image.shape}")

        # Run inference
        print(f"\033[92mDEBUG\033[0m: Running inference on device: {device}")
        with torch.no_grad():
            sweetness = model(mfcc, processed_image)
        score = sweetness.item()
        print(f"\033[92mDEBUG\033[0m: Prediction successful: {score}")

        return _format_sweetness_result(score)

    except Exception as e:
        # Top-level boundary for the UI callback: report the failure as text
        # instead of letting the exception escape into Gradio.
        import traceback
        error_msg = f"Error: {str(e)}\n\n"
        error_msg += traceback.format_exc()
        print(f"\033[91mERR!\033[0m: {error_msg}")
        return error_msg
|
222 |
+
|
223 |
+
# Apply GPU decorator if available in Gradio Spaces environment
|
224 |
+
# Expose the prediction function under one name: the plain function by default,
# replaced by the Spaces GPU-wrapped version when the decorator is available.
predict_sweetness_gpu = predict_sweetness
if HAS_SPACES:
    predict_sweetness_gpu = spaces.GPU(predict_sweetness)
    print("\033[92mINFO\033[0m: GPU optimization enabled for prediction function")
|
229 |
+
|
230 |
+
def create_app(model_path):
    """Build the Gradio interface for the sweetness predictor.

    The interface is only constructed here; the caller is responsible for
    launching it.

    Args:
        model_path: Path to the trained model file, forwarded to the
            prediction function on every click.

    Returns:
        The ``gr.Blocks`` interface.
    """
    def predict_fn(audio, image):
        # predict_sweetness_gpu is the GPU-wrapped function when the Spaces
        # decorator is available and the plain predict_sweetness otherwise,
        # so no additional HAS_SPACES branching is needed here.
        return predict_sweetness_gpu(audio, image, model_path)

    # Create Gradio interface
    with gr.Blocks(title="Watermelon Sweetness Predictor", theme=gr.themes.Soft()) as interface:
        gr.Markdown("# 🍉 Watermelon Sweetness Predictor")
        gr.Markdown("""
        This app predicts the sweetness of a watermelon based on its sound and appearance.

        ## Instructions:
        1. Upload or record an audio of tapping the watermelon
        2. Upload or capture an image of the watermelon
        3. Click 'Predict Sweetness' to get the sweetness estimation
        """)

        with gr.Row():
            # Left column: inputs and the trigger button
            with gr.Column():
                audio_input = gr.Audio(label="Upload or Record Audio", type="numpy")
                image_input = gr.Image(label="Upload or Capture Image")
                submit_btn = gr.Button("Predict Sweetness", variant="primary")

            # Right column: textual prediction result
            with gr.Column():
                output = gr.Textbox(label="Prediction Results", lines=6)

        submit_btn.click(
            fn=predict_fn,
            inputs=[audio_input, image_input],
            outputs=output,
        )

        gr.Markdown("""
        ## How it works

        The app uses a deep learning model that combines:
        - Audio analysis using MFCC features and LSTM neural network
        - Image analysis using ResNet-50 convolutional neural network

        The model was trained on a dataset of watermelons with known sweetness values.

        ## Tips for best results
        - For audio: Tap the watermelon with your knuckle and record the sound
        - For image: Take a clear photo of the whole watermelon in good lighting
        """)

    return interface
|
283 |
+
|
284 |
+
if __name__ == "__main__":
    import argparse

    # Command-line options: checkpoint location, public share link, debug banner
    parser = argparse.ArgumentParser(description="Watermelon Sweetness Prediction App")
    parser.add_argument("--model_path", type=str, default="models/watermelon_model_final.pt",
                        help="Path to the trained model file")
    parser.add_argument("--share", action="store_true",
                        help="Create a shareable link for the app")
    parser.add_argument("--debug", action="store_true",
                        help="Enable verbose debug output")
    args = parser.parse_args()

    if args.debug:
        print("\033[92mINFO\033[0m: Debug mode enabled")

    # Refuse to start without a checkpoint on disk
    model_missing = not os.path.exists(args.model_path)
    if model_missing:
        print(f"\033[91mERR!\033[0m: Model not found at {args.model_path}")
        print("\033[92mINFO\033[0m: Please train a model first or provide a valid model path")
        sys.exit(1)

    # Build the interface and start serving it
    app = create_app(args.model_path)
    app.launch(share=args.share)
|
app_local_backup.py
ADDED
@@ -0,0 +1,290 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
import torch
|
4 |
+
import numpy as np
|
5 |
+
import gradio as gr
|
6 |
+
import torchaudio
|
7 |
+
import torchvision
|
8 |
+
|
9 |
+
# Add parent directory to path to import preprocess functions
|
10 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
11 |
+
|
12 |
+
# Import functions from infer_watermelon.py
|
13 |
+
from infer_watermelon import load_model
|
14 |
+
|
15 |
+
# Modified version of process_audio_data specifically for the app to handle various tensor shapes
|
16 |
+
def app_process_audio_data(waveform, sample_rate, target_sample_rate=16000,
                           duration_seconds=3, n_mfcc=13):
    """Convert a raw waveform into a fixed-length MFCC feature tensor.

    The waveform is reduced to one channel, resampled, padded or trimmed to a
    fixed duration and passed through ``torchaudio.transforms.MFCC``.

    Args:
        waveform: Audio tensor with 1, 2 or 3 dimensions. For 3-D input the
            first item along dim 0 is used; for 2-D input the first row
            (treated as the first channel) is used.
        sample_rate: Sample rate of ``waveform`` in Hz.
        target_sample_rate: Rate the audio is resampled to before feature
            extraction (default 16000, the previous hard-coded value).
        duration_seconds: Length the audio is padded/trimmed to (default 3).
        n_mfcc: Number of MFCC coefficients (default 13).

    Returns:
        The MFCC tensor produced by torchaudio, or ``None`` if any step fails
        (the error and traceback are printed). Callers check for ``None``.
    """
    try:
        print(f"\033[92mDEBUG\033[0m: Processing audio - Initial shape: {waveform.shape}, Sample rate: {sample_rate}")

        # Handle different tensor dimensions
        if waveform.dim() == 3:
            print("\033[92mDEBUG\033[0m: Found 3D tensor, converting to 2D")
            # For 3D tensor, take the first item (batch dimension)
            waveform = waveform[0]

        if waveform.dim() == 2:
            # Use the first channel for stereo audio
            waveform = waveform[0]
            print(f"\033[92mDEBUG\033[0m: Using first channel, new shape: {waveform.shape}")

        # Resample and MFCC operate on floating-point data; integer tensors
        # previously fell through to the error path and produced None.
        if not waveform.is_floating_point():
            waveform = waveform.float()

        # Resample only when the input rate differs from the target rate
        if sample_rate != target_sample_rate:
            print(f"\033[92mDEBUG\033[0m: Resampling from {sample_rate}Hz to {target_sample_rate}Hz")
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
            waveform = resampler(waveform)

        # Force a fixed number of samples: zero-pad short clips, trim long ones
        target_length = int(duration_seconds * target_sample_rate)
        current_length = waveform.size(0)
        if current_length < target_length:
            print(f"\033[92mDEBUG\033[0m: Padding audio from {current_length} to {target_length} samples")
            waveform = torch.nn.functional.pad(waveform, (0, target_length - current_length))
        else:
            print(f"\033[92mDEBUG\033[0m: Trimming audio from {current_length} to {target_length} samples")
            waveform = waveform[:target_length]

        # Apply MFCC transformation
        print("\033[92mDEBUG\033[0m: Applying MFCC transformation")
        mfcc_transform = torchaudio.transforms.MFCC(
            sample_rate=target_sample_rate,
            n_mfcc=n_mfcc,
            melkwargs={
                "n_fft": 256,
                "win_length": 256,
                "hop_length": 128,
                "n_mels": 40,
            },
        )

        mfcc = mfcc_transform(waveform)
        print(f"\033[92mDEBUG\033[0m: MFCC output shape: {mfcc.shape}")

        return mfcc
    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller
        # turn ``None`` into a user-facing error message.
        import traceback
        print(f"\033[91mERR!\033[0m: Error in audio processing: {e}")
        print(traceback.format_exc())
        return None
|
68 |
+
|
69 |
+
# Similarly for images, but let's import the original one
|
70 |
+
from preprocess import process_image_data
|
71 |
+
|
72 |
+
def init_model(model_path):
    """Load the model for inference via ``load_model`` and return ``(model, device)``."""
    loaded_model, target_device = load_model(model_path)
    return loaded_model, target_device
|
76 |
+
|
77 |
+
def _format_sweetness_result(score):
    """Render a sweetness score as the user-facing message with a qualitative verdict."""
    result = f"Predicted Sweetness: {score:.2f}/13"
    if score < 9:
        result += "\n\nThis watermelon is not very sweet. You might want to choose another one."
    elif score < 10:
        result += "\n\nThis watermelon has moderate sweetness."
    elif score < 11:
        result += "\n\nThis watermelon is sweet! A good choice."
    else:
        result += "\n\nThis watermelon is very sweet! Excellent choice!"
    return result


def predict_sweetness(audio, image, model, device):
    """Predict sweetness of a watermelon from audio and image input.

    Args:
        audio: ``(sample_rate, samples)`` tuple, a longer tuple whose first
            item is the sample rate and whose last item is the samples, or a
            path to an audio file.
        image: Image as a numpy array, or a path to an image file.
        model: Already-loaded model, called as ``model(mfcc, image)``.
        device: Device the input tensors are moved to before inference.

    Returns:
        A user-facing string: the formatted prediction, or a message starting
        with ``"Error:"`` when an input is missing/unsupported or any step fails.
    """
    try:
        # Fail fast with a readable message instead of a traceback when the
        # user pressed the button without providing one of the inputs.
        if audio is None:
            return "Error: No audio provided. Please upload or record the tapping sound."
        if image is None:
            return "Error: No image provided. Please upload or capture a photo of the watermelon."
        if not isinstance(image, (np.ndarray, str)):
            return f"Error: Unsupported image format. Got {type(image)}"

        # Debug information about input types
        print(f"\033[92mDEBUG\033[0m: Audio input type: {type(audio)}")
        print(f"\033[92mDEBUG\033[0m: Image input type: {type(image)}")
        if isinstance(image, np.ndarray):
            print(f"\033[92mDEBUG\033[0m: Image input shape: {image.shape}")

        # Handle different audio input formats
        if isinstance(audio, tuple) and len(audio) == 2:
            # Standard Gradio format: (sample_rate, audio_data)
            sample_rate, audio_data = audio
            print(f"\033[92mDEBUG\033[0m: Audio sample rate: {sample_rate}")
            print(f"\033[92mDEBUG\033[0m: Audio data shape: {audio_data.shape}")
        elif isinstance(audio, tuple) and len(audio) > 2:
            # Longer tuple: first item is the sample rate, last item the samples
            sample_rate, audio_data = audio[0], audio[-1]
            print(f"\033[92mDEBUG\033[0m: Audio sample rate: {sample_rate}")
            print(f"\033[92mDEBUG\033[0m: Audio data shape: {audio_data.shape}")
        elif isinstance(audio, str):
            # Direct path to audio file
            audio_data, sample_rate = torchaudio.load(audio)
            print(f"\033[92mDEBUG\033[0m: Loaded audio from path with shape: {audio_data.shape}")
        else:
            return f"Error: Unsupported audio format. Got {type(audio)}"

        # Audio handling - direct processing from the data in memory
        if isinstance(audio_data, np.ndarray):
            print(f"\033[92mDEBUG\033[0m: Converting numpy audio with shape {audio_data.shape} to tensor")
            # NOTE(review): samples are cast to float without rescaling, as before;
            # presumably Gradio delivers integer PCM here - confirm this matches
            # the value range the model was trained on.
            audio_tensor = torch.tensor(audio_data).float()
            if audio_data.ndim == 1:
                # Single channel audio: add a leading channel dimension
                audio_tensor = audio_tensor.unsqueeze(0)
            elif audio_data.ndim == 2 and audio_data.shape[0] > audio_data.shape[1]:
                # More rows than columns, probably (samples, channels): put channels first
                audio_tensor = audio_tensor.t().contiguous()
        else:
            # Already a tensor
            audio_tensor = audio_data.float()

        print(f"\033[92mDEBUG\033[0m: Audio tensor shape before processing: {audio_tensor.shape}")
        mfcc = app_process_audio_data(audio_tensor, sample_rate)
        print(f"\033[92mDEBUG\033[0m: MFCC tensor shape after processing: {mfcc.shape if mfcc is not None else None}")
        if mfcc is None:
            return "Error: Failed to process inputs. Please check the debug logs."

        # Image handling
        import tempfile
        from PIL import Image

        cleanup_path = None
        if isinstance(image, np.ndarray):
            temp_dir = "temp"
            os.makedirs(temp_dir, exist_ok=True)
            # A unique file per request: a single shared temp_image.jpg could be
            # overwritten by a concurrent request between save and read.
            with tempfile.NamedTemporaryFile(suffix=".jpg", dir=temp_dir, delete=False) as tmp:
                image_path = tmp.name
            cleanup_path = image_path
            print(f"\033[92mDEBUG\033[0m: Converting numpy image with shape {image.shape} to PIL")
            # JPEG cannot store an alpha channel; convert so RGBA/grayscale arrays
            # are written (and later decoded) as 3-channel RGB.
            Image.fromarray(image).convert("RGB").save(image_path)
            print(f"\033[92mDEBUG\033[0m: Saved image to {image_path}")
        else:
            # Image is already a path
            image_path = image
            print(f"\033[92mDEBUG\033[0m: Using provided image path: {image_path}")

        # Process image
        print(f"\033[92mDEBUG\033[0m: Loading and preprocessing image from {image_path}")
        try:
            image_tensor = torchvision.io.read_image(image_path)
        finally:
            # Only remove files this function created, never a caller-supplied path
            if cleanup_path is not None and os.path.exists(cleanup_path):
                os.remove(cleanup_path)
        print(f"\033[92mDEBUG\033[0m: Loaded image shape: {image_tensor.shape}")
        processed_image = process_image_data(image_tensor.float())
        print(f"\033[92mDEBUG\033[0m: Processed image shape: {processed_image.shape if processed_image is not None else None}")
        if processed_image is None:
            return "Error: Failed to process inputs. Please check the debug logs."

        # Add batch dimension for inference and move to the model's device
        mfcc = mfcc.unsqueeze(0).to(device)
        print(f"\033[92mDEBUG\033[0m: Final MFCC shape with batch dimension: {mfcc.shape}")
        processed_image = processed_image.unsqueeze(0).to(device)
        print(f"\033[92mDEBUG\033[0m: Final image shape with batch dimension: {processed_image.shape}")

        # Run inference
        print("\033[92mDEBUG\033[0m: Running inference")
        with torch.no_grad():
            sweetness = model(mfcc, processed_image)
        score = sweetness.item()
        print(f"\033[92mDEBUG\033[0m: Prediction successful: {score}")

        return _format_sweetness_result(score)

    except Exception as e:
        # Top-level boundary for the UI callback: report the failure as text
        # instead of letting the exception escape into Gradio.
        import traceback
        error_msg = f"Error: {str(e)}\n\n"
        error_msg += traceback.format_exc()
        print(f"\033[91mERR!\033[0m: {error_msg}")
        return error_msg
|
207 |
+
|
208 |
+
def create_app(model_path):
    """Build the Gradio interface for the sweetness predictor.

    The model is loaded once here via ``init_model`` and reused for every
    prediction. The interface is only constructed; the caller launches it.

    Args:
        model_path: Path to the trained model file.

    Returns:
        The ``gr.Blocks`` interface.
    """
    # Initialize model
    model, device = init_model(model_path)

    def predict_fn(audio, image):
        # Bind the already-loaded model and device to the UI callback
        return predict_sweetness(audio, image, model, device)

    # Create Gradio interface
    with gr.Blocks(title="Watermelon Sweetness Predictor") as interface:
        gr.Markdown("# 🍉 Watermelon Sweetness Predictor")
        gr.Markdown("""
        This app predicts the sweetness of a watermelon based on its sound and appearance.

        ## Instructions:
        1. Upload or record an audio of tapping the watermelon
        2. Upload or capture an image of the watermelon
        3. Click 'Predict Sweetness' to get the predicted sweetness
        """)

        with gr.Row():
            # Left column: inputs and the trigger button
            with gr.Column():
                audio_input = gr.Audio(label="Upload or Record Audio", type="numpy")
                image_input = gr.Image(label="Upload or Capture Image")
                submit_btn = gr.Button("Predict Sweetness", variant="primary")

            # Right column: textual prediction result
            with gr.Column():
                output = gr.Textbox(label="Prediction Results", lines=6)

        submit_btn.click(
            fn=predict_fn,
            inputs=[audio_input, image_input],
            outputs=output,
        )

        gr.Markdown("""
        ## How it works

        The app uses a deep learning model that combines:
        - Audio analysis using MFCC features and LSTM neural network
        - Image analysis using ResNet-50 convolutional neural network

        The model was trained on a dataset of watermelons with known sweetness values.
        """)

    return interface
|
255 |
+
|
256 |
+
if __name__ == "__main__":
    import argparse

    # Command-line options: checkpoint location, public share link, debug banner
    parser = argparse.ArgumentParser(description="Watermelon Sweetness Prediction App")
    parser.add_argument("--model_path", type=str, default="models/watermelon_model_final.pt",
                        help="Path to the trained model file")
    parser.add_argument("--share", action="store_true",
                        help="Create a shareable link for the app")
    parser.add_argument("--debug", action="store_true",
                        help="Enable verbose debug output")
    args = parser.parse_args()

    if args.debug:
        print("\033[92mINFO\033[0m: Debug mode enabled")

    # Refuse to start without a checkpoint on disk
    model_missing = not os.path.exists(args.model_path)
    if model_missing:
        print(f"\033[91mERR!\033[0m: Model not found at {args.model_path}")
        print("\033[92mINFO\033[0m: Please train a model first or provide a valid model path")
        sys.exit(1)

    # Build the interface and start serving it
    app = create_app(args.model_path)
    app.launch(share=args.share)
|
infer_watermelon.py
ADDED
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
import torch
|
4 |
+
import torchaudio
|
5 |
+
import torchvision
|
6 |
+
import argparse
|
7 |
+
import numpy as np
|
8 |
+
|
9 |
+
# Add parent directory to path to import the preprocess functions
|
10 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
11 |
+
from preprocess import process_audio_data, process_image_data
|
12 |
+
|
13 |
+
# Import the model definition
|
14 |
+
from train_watermelon import WatermelonModel
|
15 |
+
|
16 |
+
def load_model(model_path):
    """Load trained ``WatermelonModel`` weights and prepare them for inference.

    The device is chosen in order of preference: CUDA, then Apple MPS, then
    CPU. The model is moved to that device and switched to eval mode.

    Args:
        model_path: Path to a checkpoint holding the model's ``state_dict``.

    Returns:
        A ``(model, device)`` tuple.
    """
    if torch.cuda.is_available():
        device_name = "cuda"
    elif torch.backends.mps.is_available():
        device_name = "mps"
    else:
        device_name = "cpu"
    device = torch.device(device_name)
    print(f"\033[92mINFO\033[0m: Using device: {device}")

    model = WatermelonModel().to(device)
    # The checkpoint is a plain state dict, so restrict unpickling to tensors
    # and basic containers instead of executing arbitrary pickled objects.
    state_dict = torch.load(model_path, map_location=device, weights_only=True)
    model.load_state_dict(state_dict)
    model.eval()
    print(f"\033[92mINFO\033[0m: Loaded model from {model_path}")

    return model, device
|
31 |
+
|
32 |
+
def infer_single_sample(audio_path, image_path, model, device):
    """Predict the sweetness for one audio/image pair.

    Both inputs are loaded from disk, run through the project's preprocessing
    helpers, moved to ``device`` and given a leading batch dimension before
    being passed to ``model``.

    Args:
        audio_path: Path to the audio file, read with ``torchaudio.load``.
        image_path: Path to the image file, read with ``torchvision.io.read_image``.
        model: Callable taking ``(audio_batch, image_batch)``.
        device: Torch device the inputs are moved to.

    Returns:
        The prediction as a Python float, or ``None`` if any step raised. The
        error is printed rather than propagated.
    """
    try:
        # Audio branch: load, preprocess, then batch.
        audio, rate = torchaudio.load(audio_path)
        audio_batch = process_audio_data(audio, rate).to(device).unsqueeze(0)

        # Image branch: load as float, preprocess, then batch.
        pixels = torchvision.io.read_image(image_path).float()
        image_batch = process_image_data(pixels).to(device).unsqueeze(0)

        # Inference only, so no gradients are needed.
        with torch.no_grad():
            prediction = model(audio_batch, image_batch)

        return prediction.item()
    except Exception as exc:
        print(f"\033[91mERR!\033[0m: Error in inference: {exc}")
        return None
|
56 |
+
|
57 |
+
def _collect_samples(data_dir):
    """Return ``(audio, image, sweetness, sample_id)`` tuples found under *data_dir*.

    Walks ``<data_dir>/<sweetness>/<id>/`` and keeps entries where both
    ``<id>.wav`` and ``<id>.jpg`` exist. Top-level entries whose name does not
    parse as a float are skipped. Listings are sorted so the result order is
    reproducible.
    """
    samples = []
    for sweetness_dir in sorted(os.listdir(data_dir)):
        try:
            sweetness = float(sweetness_dir)
        except ValueError:
            # Skip directories that are not valid sweetness values
            continue

        sweetness_path = os.path.join(data_dir, sweetness_dir)
        if not os.path.isdir(sweetness_path):
            continue

        for id_dir in sorted(os.listdir(sweetness_path)):
            id_path = os.path.join(sweetness_path, id_dir)
            if not os.path.isdir(id_path):
                continue

            audio_file = os.path.join(id_path, f"{id_dir}.wav")
            image_file = os.path.join(id_path, f"{id_dir}.jpg")
            if os.path.exists(audio_file) and os.path.exists(image_file):
                samples.append((audio_file, image_file, sweetness, id_dir))
    return samples


def infer_from_directory(data_dir, model_path, output_file=None, num_samples=None):
    """Run inference on samples from the dataset directory.

    Args:
        data_dir: Dataset root laid out as ``<sweetness>/<id>/<id>.{wav,jpg}``.
        model_path: Path to the trained model file, passed to ``load_model``.
        output_file: Optional CSV path. It is written only when at least one
            prediction succeeded.
        num_samples: Optional positive cap on the number of samples. Samples
            are taken in sorted order, so the subset is reproducible.

    Returns:
        A list of dicts with keys ``sample_id``, ``true_sweetness``,
        ``predicted_sweetness`` and ``error`` (absolute error). Samples whose
        inference returned ``None`` are omitted.
    """
    import csv  # local import: only this function writes CSV output

    # Load model
    model, device = load_model(model_path)

    print(f"\033[92mINFO\033[0m: Reading samples from {data_dir}")
    samples = _collect_samples(data_dir)

    # Limit the number of samples if specified
    if num_samples is not None and num_samples > 0:
        samples = samples[:num_samples]

    print(f"\033[92mINFO\033[0m: Running inference on {len(samples)} samples")

    # Run inference on each sample
    results = []
    for i, (audio_file, image_file, true_sweetness, sample_id) in enumerate(samples):
        print(f"\033[92mINFO\033[0m: Processing sample {i+1}/{len(samples)}: {sample_id}")

        predicted_sweetness = infer_single_sample(audio_file, image_file, model, device)
        if predicted_sweetness is None:
            # The failure was already reported by infer_single_sample.
            continue

        error = abs(predicted_sweetness - true_sweetness)
        results.append({
            'sample_id': sample_id,
            'true_sweetness': true_sweetness,
            'predicted_sweetness': predicted_sweetness,
            'error': error
        })
        print(f" Sample ID: {sample_id}")
        print(f" True sweetness: {true_sweetness:.2f}")
        print(f" Predicted sweetness: {predicted_sweetness:.2f}")
        print(f" Error: {error:.2f}")

    # Calculate mean absolute error
    if results:
        mae = np.mean([result['error'] for result in results])
        print(f"\033[92mINFO\033[0m: Mean Absolute Error: {mae:.4f}")

    # Save results to file if specified
    if output_file and results:
        # csv.writer quotes ids containing commas or quotes. "\n" keeps the
        # line endings the hand-written output used.
        with open(output_file, 'w', newline="", encoding="utf-8") as f:
            csv_writer = csv.writer(f, lineterminator="\n")
            csv_writer.writerow(
                ["sample_id", "true_sweetness", "predicted_sweetness", "error"]
            )
            for result in results:
                csv_writer.writerow([
                    result['sample_id'],
                    f"{result['true_sweetness']:.2f}",
                    f"{result['predicted_sweetness']:.2f}",
                    f"{result['error']:.2f}",
                ])
        print(f"\033[92mINFO\033[0m: Results saved to {output_file}")

    return results
|
127 |
+
|
128 |
+
def main():
    """Command-line entry point for single-sample or whole-dataset inference.

    When both ``--audio_path`` and ``--image_path`` are given, one prediction
    is printed. Otherwise every sample under ``--data_dir`` is evaluated.
    Exits with status 1 if the single-sample prediction fails.
    """
    parser = argparse.ArgumentParser(description="Watermelon Sweetness Inference")
    parser.add_argument("--model_path", type=str, required=True, help="Path to the trained model file")
    parser.add_argument("--data_dir", type=str, default="../cleaned", help="Path to the cleaned dataset directory")
    parser.add_argument("--output_file", type=str, help="Path to save inference results (CSV)")
    parser.add_argument("--num_samples", type=int, help="Number of samples to run inference on (default: all)")
    parser.add_argument("--audio_path", type=str, help="Path to a single audio file for inference")
    parser.add_argument("--image_path", type=str, help="Path to a single image file for inference")

    args = parser.parse_args()

    # Check if single sample inference or dataset inference
    if args.audio_path and args.image_path:
        # Single sample inference
        model, device = load_model(args.model_path)
        sweetness = infer_single_sample(args.audio_path, args.image_path, model, device)
        if sweetness is None:
            # infer_single_sample returns None after printing the underlying
            # error; formatting None with :.2f would raise TypeError.
            print("\033[91mERR!\033[0m: Inference failed; no prediction available")
            sys.exit(1)
        print(f"Predicted sweetness: {sweetness:.2f}")
    else:
        if args.audio_path or args.image_path:
            # Only one half of the pair was supplied. Say so rather than
            # silently ignoring it.
            print(
                "\033[93mWARN\033[0m: Both --audio_path and --image_path are required "
                "for single-sample inference; running dataset inference instead"
            )
        # Dataset inference
        infer_from_directory(args.data_dir, args.model_path, args.output_file, args.num_samples)


if __name__ == "__main__":
    main()
|
models/model_1_20250406-064126.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5df632222fa87e09e635f90e5cce14bdd9fd34b442bf18daaf13e54dedfed132
|
3 |
+
size 96095572
|
models/model_1_20250406-064635.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:02999bd33592de717dc1ec8054dc570193074c3f25a7283b3daa580b727b7134
|
3 |
+
size 96095572
|
models/model_2_20250406-065053.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:80f999a1540c42ed74491692aa66c3b5a6171f972bdf47c9d52556fe1673c8dd
|
3 |
+
size 96095572
|
models/watermelon_model_final.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:086780aee9897ea51a6b0da0fed8aaa61ae97563c70a8c6577849ef9a0220edb
|
3 |
+
size 96095241
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
torch>=2.0.0
|
2 |
+
torchaudio>=2.0.0
|
3 |
+
torchvision>=0.15.0
|
4 |
+
gradio>=3.50.0
|
5 |
+
numpy>=1.20.0
|
6 |
+
pillow>=9.0.0
|
runs/events.out.tfevents.1743920786.vm-jinzq.2059144.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f3e44b329373e1b4e8233833c35e382cf1c548c03a449e237c89b4c0333af42f
|
3 |
+
size 88
|
runs/events.out.tfevents.1743920828.vm-jinzq.2059396.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1396659d9fdb300ed3bf8ee38bf6605c634376c36a3e47e8398968eb9ea4b6ea
|
3 |
+
size 88
|
runs/events.out.tfevents.1743921401.jzqdebug-c245a8-job-84fn7.812.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d4346aad295036526c9dadae4a84f18cab863a1ec43f13b0d5b32566b5361179
|
3 |
+
size 14985
|
runs/events.out.tfevents.1743921735.jzqdebug-c245a8-job-84fn7.1262.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6ec7e16dece5b3f09408359a3a18fb40a87f23e02e1b16981ebb9ea9e463f6ef
|
3 |
+
size 7238
|
temp/temp_audio.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8da44f18fa66bb5db09dc6ef4ea542c5274d8b2a1d952efd1db1ceec7948ca44
|
3 |
+
size 1058488
|
temp/temp_image.jpg
ADDED
![]() |
Git LFS Details
|
train_watermelon.py
ADDED
@@ -0,0 +1,261 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import time
|
3 |
+
import torch
|
4 |
+
import torchaudio
|
5 |
+
import torchvision
|
6 |
+
import numpy as np
|
7 |
+
from torch.utils.data import Dataset, DataLoader
|
8 |
+
from torch.utils.tensorboard import SummaryWriter
|
9 |
+
import sys
|
10 |
+
|
11 |
+
# Add parent directory to path to import the preprocess functions
|
12 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
13 |
+
from preprocess import process_audio_data, process_image_data
|
14 |
+
|
15 |
+
# Report the versions of the libraries this script depends on.
for _label, _library in (
    ("PyTorch", torch),
    ("Torchaudio", torchaudio),
    ("Torchvision", torchvision),
):
    print(f"\033[92mINFO\033[0m: {_label} version: {_library.__version__}")

# Pick the compute device: CUDA first, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"\033[92mINFO\033[0m: Using device: {device}")

# Training hyperparameters (read by train_model as module globals).
batch_size = 16
epochs = 2
learning_rate = 1e-4

# Make sure the default checkpoint directory exists.
os.makedirs("models/", exist_ok=True)
|
35 |
+
|
36 |
+
|
37 |
+
class WatermelonDataset(Dataset):
    """Paired audio/image samples labelled with a sweetness value.

    The dataset root is expected to be laid out as
    ``<data_dir>/<sweetness>/<id>/<id>.wav`` and ``<id>.jpg``, where the
    ``<sweetness>`` directory name parses as a float and becomes the label.
    Entries that do not fit this layout are ignored.
    """

    def __init__(self, data_dir):
        """Index every complete sample under *data_dir*.

        Args:
            data_dir: Root directory of the dataset.
        """
        self.data_dir = data_dir
        # List of (audio_path, image_path, sweetness) tuples.
        self.samples = []

        # Sorted listings keep sample indices stable across runs and machines.
        for sweetness_dir in sorted(os.listdir(data_dir)):
            sweetness_path = os.path.join(data_dir, sweetness_dir)
            if not os.path.isdir(sweetness_path):
                continue

            try:
                sweetness = float(sweetness_dir)
            except ValueError:
                # Stray non-numeric directories must not abort dataset
                # construction.
                continue

            for id_dir in sorted(os.listdir(sweetness_path)):
                id_path = os.path.join(sweetness_path, id_dir)
                if not os.path.isdir(id_path):
                    continue

                audio_file = os.path.join(id_path, f"{id_dir}.wav")
                image_file = os.path.join(id_path, f"{id_dir}.jpg")

                if os.path.exists(audio_file) and os.path.exists(image_file):
                    self.samples.append((audio_file, image_file, sweetness))

        print(f"\033[92mINFO\033[0m: Loaded {len(self.samples)} samples from {data_dir}")

    def __len__(self):
        """Return the number of indexed samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Load and preprocess sample *idx*.

        Returns:
            ``(mfcc, processed_image, label)``, where ``label`` is a float
            tensor. The feature shapes are set by the ``preprocess`` helpers
            and are not visible here.

        Raises:
            Exception: Whatever loading or preprocessing raised, but only when
                sample 0 itself cannot be loaded.
        """
        audio_path, image_path, label = self.samples[idx]

        try:
            # Load and process audio
            waveform, sample_rate = torchaudio.load(audio_path)
            mfcc = process_audio_data(waveform, sample_rate)

            # Load and process image
            image = torchvision.io.read_image(image_path)
            image = image.float()
            processed_image = process_image_data(image)

            return mfcc, processed_image, torch.tensor(label).float()
        except Exception as e:
            print(f"\033[91mERR!\033[0m: Error processing sample {idx}: {e}")
            # Best-effort fallback: substitute sample 0, with its own label,
            # for the unreadable sample.
            if idx == 0:  # Prevent infinite recursion
                raise
            return self.__getitem__(0)
|
84 |
+
|
85 |
+
|
86 |
+
class WatermelonModel(torch.nn.Module):
    """Two-branch regressor fusing audio and image features into one scalar.

    The audio branch is a 2-layer LSTM (``batch_first=True``, 376 input
    features per step) whose last time step is projected to 128 dimensions.
    The image branch is a ResNet-50 whose final layer is replaced by a
    128-dimensional projection. The two vectors are concatenated and passed
    through a small MLP.
    """

    def __init__(self):
        super().__init__()
        nn = torch.nn

        # Audio branch: sequence encoder plus projection to the shared 128-dim space.
        self.lstm = nn.LSTM(
            input_size=376, hidden_size=64, num_layers=2, batch_first=True
        )
        self.lstm_fc = nn.Linear(64, 128)

        # Image branch: ResNet-50 with default weights and a 128-dim head.
        self.resnet = torchvision.models.resnet50(
            weights=torchvision.models.ResNet50_Weights.DEFAULT
        )
        backbone_width = self.resnet.fc.in_features
        self.resnet.fc = nn.Linear(backbone_width, 128)

        # Fusion head: 128 (audio) + 128 (image) -> 64 -> 1.
        self.fc1 = nn.Linear(256, 64)
        self.fc2 = nn.Linear(64, 1)
        self.relu = nn.ReLU()

    def forward(self, mfcc, image):
        """Return the prediction for a batch, one value per row.

        Args:
            mfcc: Audio features fed to the batch-first LSTM, i.e. indexed as
                (batch, step, feature).
            image: Image batch passed to the ResNet. Presumably (batch, 3, H, W);
                confirm against the preprocessing code.
        """
        # Keep only the final time step of the LSTM output sequence.
        sequence_out, _ = self.lstm(mfcc)
        audio_features = self.lstm_fc(sequence_out[:, -1, :])

        image_features = self.resnet(image)

        # Fuse both modalities along the feature axis.
        fused = torch.cat((audio_features, image_features), dim=1)

        hidden = self.relu(self.fc1(fused))
        return self.fc2(hidden)
|
126 |
+
|
127 |
+
|
128 |
+
def train_model(data_dir, output_dir="models/"):
    """Train ``WatermelonModel`` on the dataset under *data_dir*.

    Uses the module-level ``device``, ``batch_size``, ``epochs`` and
    ``learning_rate`` settings. The data is split 70/20/10 into
    train/validation/test subsets. A checkpoint is written after every epoch
    and the final weights are saved as ``watermelon_model_final.pt``. Training
    and validation losses are logged to TensorBoard under ``runs/``.

    Args:
        data_dir: Root directory of the dataset.
        output_dir: Directory for checkpoints and the final model. It is
            created if it does not exist.

    Returns:
        Path of the final saved model file.

    Raises:
        ValueError: If the dataset is too small to yield a training sample.
    """
    # Create the output directory up front so a bad path fails before any
    # training time is spent, not at the first checkpoint.
    os.makedirs(output_dir, exist_ok=True)

    # Create dataset
    dataset = WatermelonDataset(data_dir)
    n_samples = len(dataset)

    # Split dataset
    train_size = int(0.7 * n_samples)
    val_size = int(0.2 * n_samples)
    test_size = n_samples - train_size - val_size

    if train_size == 0:
        # Fail with a clear message rather than an opaque sampler error from
        # the shuffling DataLoader below.
        raise ValueError(
            f"Not enough samples in {data_dir} to train (found {n_samples})"
        )

    # NOTE: the test split is held out but not evaluated by this function.
    train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
        dataset, [train_size, val_size, test_size]
    )

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Initialize model
    model = WatermelonModel().to(device)

    # Loss function and optimizer
    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # TensorBoard
    writer = SummaryWriter("runs/")
    global_step = 0

    print(f"\033[92mINFO\033[0m: Training model for {epochs} epochs")
    print(f"\033[92mINFO\033[0m: Training samples: {len(train_dataset)}")
    print(f"\033[92mINFO\033[0m: Validation samples: {len(val_dataset)}")
    print(f"\033[92mINFO\033[0m: Test samples: {len(test_dataset)}")
    print(f"\033[92mINFO\033[0m: Batch size: {batch_size}")

    try:
        # Training loop
        for epoch in range(epochs):
            print(f"\033[92mINFO\033[0m: Training epoch ({epoch+1}/{epochs})")

            model.train()
            running_loss = 0.0
            train_batches = 0  # batches that completed without error

            for i, (mfcc, image, label) in enumerate(train_loader):
                try:
                    mfcc, image, label = mfcc.to(device), image.to(device), label.to(device)

                    optimizer.zero_grad()
                    output = model(mfcc, image)
                    # Match the (batch, 1) shape of the model output.
                    label = label.view(-1, 1).float()
                    loss = criterion(output, label)
                    loss.backward()
                    optimizer.step()

                    running_loss += loss.item()
                    train_batches += 1
                    writer.add_scalar("Training Loss", loss.item(), global_step)
                    global_step += 1

                    if i % 10 == 0:
                        print(f"\033[92mINFO\033[0m: Batch {i}/{len(train_loader)}, Loss: {loss.item():.4f}")

                except Exception as e:
                    # Best effort: report the batch and keep training.
                    print(f"\033[91mERR!\033[0m: Error in training batch {i}: {e}")
                    continue

            # Validation phase
            model.eval()
            val_loss = 0.0
            val_batches = 0
            with torch.no_grad():
                for i, (mfcc, image, label) in enumerate(val_loader):
                    try:
                        mfcc, image, label = mfcc.to(device), image.to(device), label.to(device)
                        output = model(mfcc, image)
                        label = label.view(-1, 1).float()
                        loss = criterion(output, label)
                        val_loss += loss.item()
                        val_batches += 1
                    except Exception as e:
                        print(f"\033[91mERR!\033[0m: Error in validation batch {i}: {e}")
                        continue

            # Average over the batches that actually contributed a loss.
            # Skipped batches would otherwise bias the mean towards zero.
            avg_train_loss = running_loss / train_batches if train_batches else float('inf')
            avg_val_loss = val_loss / val_batches if val_batches else float('inf')

            # Record validation loss
            writer.add_scalar("Validation Loss", avg_val_loss, epoch)

            print(
                f"Epoch [{epoch+1}/{epochs}], Training Loss: {avg_train_loss:.4f}, "
                f"Validation Loss: {avg_val_loss:.4f}"
            )

            # Save model checkpoint
            timestamp = time.strftime("%Y%m%d-%H%M%S")
            model_path = os.path.join(output_dir, f"model_{epoch+1}_{timestamp}.pt")
            torch.save(model.state_dict(), model_path)

            print(
                f"\033[92mINFO\033[0m: Model checkpoint epoch [{epoch+1}/{epochs}] saved: {model_path}"
            )
    finally:
        # Flush and release the event file even if training is interrupted.
        writer.close()

    # Save final model
    final_model_path = os.path.join(output_dir, "watermelon_model_final.pt")
    torch.save(model.state_dict(), final_model_path)
    print(f"\033[92mINFO\033[0m: Final model saved: {final_model_path}")

    print("\033[92mINFO\033[0m: Training complete")
    return final_model_path
|
233 |
+
|
234 |
+
|
235 |
+
if __name__ == "__main__":
    import argparse

    # Command-line options: where the data lives and where to write models.
    cli = argparse.ArgumentParser(description="Train the Watermelon Sweetness Prediction Model")
    cli.add_argument("--data_dir", type=str, default="../cleaned",
                     help="Path to the cleaned dataset directory")
    cli.add_argument("--output_dir", type=str, default="models/",
                     help="Directory to save model checkpoints and the final model")
    options = cli.parse_args()

    # The output directory must exist before any checkpoint is written.
    os.makedirs(options.output_dir, exist_ok=True)

    # Run training and report where the final weights ended up.
    saved_path = train_model(options.data_dir, options.output_dir)

    print("\033[92mINFO\033[0m: Training completed successfully!")
    print(f"\033[92mINFO\033[0m: Final model saved at: {saved_path}")
|