"""BackpropNEAT implementation."""

import jax
import jax.numpy as jnp
import numpy as np
from typing import List

from .network import Network
from .genome import Genome

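# BackpropNEAT couples NEAT-style population search over network topologies
# with gradient-based weight training: each generation, every network is
# trained by SGD with momentum (_make_train_step), its best accuracy becomes
# its evolutionary fitness (_compute_fitness), and the population is then
# refreshed via elitism, tournament selection, and mutation (evolve).
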
class BackpropNEAT:
    """Backpropagation-based NEAT implementation."""
    
    def __init__(self, population_size=5, n_inputs=2, n_outputs=1, n_hidden=64,
                 learning_rate=0.01, beta=0.9):
        """Initialize BackpropNEAT."""
        self.population_size = population_size
        self.n_inputs = n_inputs
        self.n_outputs = n_outputs
        self.n_hidden = n_hidden
        self.learning_rate = learning_rate
        self.beta = beta
        
        # Track the best network found across generations (used by evolve)
        self.best_fitness = float('-inf')
        self.best_network = None
        
        # Initialize population
        self.population = []
        self.momentum_buffers = []
        
        for _ in range(population_size):
            # Create genome with skip connections
            genome = Genome(n_inputs, n_outputs, n_hidden)
            genome.add_layer_connections()  # Add standard layer connections
            genome.add_skip_connections(0.3)  # Add skip connections with 30% probability
            
            # Create network from genome
            network = Network(genome)
            self.population.append(network)
            
            # Initialize momentum buffer for this network
            momentum = {
                'weights': {k: jnp.zeros_like(w) for k, w in network.params['weights'].items()},
                'biases': jnp.zeros_like(network.params['biases']),
                'gamma': jnp.zeros_like(network.params['gamma']),
                'beta': jnp.zeros_like(network.params['beta'])
            }
            self.momentum_buffers.append(momentum)
        
        # Create train step function
        self._train_step = self._make_train_step()
        
        # Bind a per-network train step. self._train_step is a plain function
        # stored as an instance attribute (not a bound method), so self is
        # passed explicitly; the idx=i default freezes the loop index at
        # definition time, avoiding Python's late-binding closure pitfall.
        for i, network in enumerate(self.population):
            network.population_idx = i
            network._train_step = lambda p, x, y, idx=i: self._train_step(self, p, x, y, idx)
    
    def forward(self, params, x):
        """Forward pass through a network.
        
        All population members share one forward implementation: the genome's
        connectivity is assumed to be encoded in `params`, so
        population[0].forward can evaluate any member's parameters.
        """
        return self.population[0].forward(params, x)
    
    def _make_train_step(self):
        """Create training step function."""
        # Constants for numerical stability
        eps = 1e-7
        min_lr = 1e-6
        max_lr = 1e-2
        
        def loss_fn(params, x, y):
            """Compute loss for parameters."""
            logits = self.forward(params, x)
            
            # Binary cross entropy with label smoothing. Labels are assumed
            # to live in {-1, +1} (matching the tanh-style logits), so
            # smoothing toward probability 0.5 means shrinking toward 0:
            #   p_smooth = (1 - alpha) * 0.5 * (1 + y) + alpha * 0.5
            #            = 0.5 * (1 + (1 - alpha) * y)
            alpha = 0.1  # Label smoothing factor
            y_smooth = (1 - alpha) * y
            
            # Convert logits to probabilities
            probs = 0.5 * (logits + 1)  # Map from [-1, 1] to [0, 1]
            probs = jnp.clip(probs, eps, 1 - eps)
            
            # Smoothed BCE; 0.5 * (1 + y_smooth) is the target probability
            bce_loss = -jnp.mean(
                0.5 * (1 + y_smooth) * jnp.log(probs) +
                0.5 * (1 - y_smooth) * jnp.log(1 - probs)
            )
            
            # L2 regularization with a very small coefficient
            l2_reg = sum(jnp.sum(w ** 2) for w in params['weights'].values())
            return bce_loss + 1e-6 * l2_reg
        
        @jax.jit
        def compute_updates(params, x, y):
            """Compute gradients and loss."""
            loss_value, grads = jax.value_and_grad(loss_fn)(params, x, y)
            return grads, loss_value
        
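        # Note: jax.jit retraces compute_updates whenever the input shapes
        # change, so the smaller final batch of an epoch costs one extra
        # compilation; padding batches to a fixed size would avoid this.
        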
        def train_step(self, params, x, y, network_idx):
            """Perform single training step with momentum."""
            # Compute gradients
            grads, loss_value = compute_updates(params, x, y)
            
            # Get momentum buffer for this network
            momentum = self.momentum_buffers[network_idx]
            
            # Gradient norm for adaptive learning rate
            grad_norm = jnp.sqrt(
                sum(jnp.sum(g ** 2) for g in grads['weights'].values()) +
                jnp.sum(grads['biases'] ** 2) +
                jnp.sum(grads['gamma'] ** 2) +
                jnp.sum(grads['beta'] ** 2) +
                eps  # Add eps for numerical stability
            )
            
            # Adaptive learning rate: scale down by the norm when gradients
            # are large; damp logarithmically when they are small. Note that
            # 1 + log(grad_norm) goes negative for very small norms, so the
            # clip below also acts as a floor at min_lr.
            if grad_norm > 1.0:
                effective_lr = self.learning_rate / grad_norm
            else:
                effective_lr = self.learning_rate * (1.0 + jnp.log(grad_norm + eps))
            
            # Clip learning rate to a safe range
            effective_lr = jnp.clip(effective_lr, min_lr, max_lr)
            
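            # Shared update rule (EMA momentum on clipped gradients):
            #   v <- beta * v + (1 - beta) * clip(g)
            #   p <- p - effective_lr * v   (plus weight decay for weights)
            # The layer-norm parameters use a tighter clip and 0.1x lr below.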
            # Update weights momentum with adaptive learning rate
            new_weights = {}
            for k in params['weights'].keys():
                grad = grads['weights'][k]
                
                # Update momentum with gradient clipping
                momentum['weights'][k] = (
                    self.beta * momentum['weights'][k] +
                    (1 - self.beta) * jnp.clip(grad, -1.0, 1.0)
                )
                
                # Apply update with weight decay
                weight_decay = 0.0001 * params['weights'][k]
                new_weights[k] = params['weights'][k] - effective_lr * (
                    momentum['weights'][k] + weight_decay
                )
            
            # Update biases momentum
            momentum['biases'] = (
                self.beta * momentum['biases'] +
                (1 - self.beta) * jnp.clip(grads['biases'], -1.0, 1.0)
            )
            new_biases = params['biases'] - effective_lr * momentum['biases']
            
            # Update layer norm parameters with smaller learning rate
            ln_lr = 0.1 * effective_lr  # Slower updates for stability
            
            # Gamma (scale)
            momentum['gamma'] = (
                self.beta * momentum['gamma'] +
                (1 - self.beta) * jnp.clip(grads['gamma'], -0.1, 0.1)
            )
            new_gamma = params['gamma'] - ln_lr * momentum['gamma']
            new_gamma = jnp.clip(new_gamma, 0.1, 10.0)  # Prevent collapse
            
            # Beta (shift)
            momentum['beta'] = (
                self.beta * momentum['beta'] +
                (1 - self.beta) * jnp.clip(grads['beta'], -0.1, 0.1)
            )
            new_beta = params['beta'] - ln_lr * momentum['beta']
            
            return {
                'weights': new_weights,
                'biases': new_biases,
                'gamma': new_gamma,
                'beta': new_beta
            }, loss_value
        
        return train_step
    
    def _mutate_genome(self, genome: Genome) -> Genome:
        """Mutate genome parameters (weights and biases).
        
        Note: this only perturbs existing parameters; it does not add
        nodes or connections.
        """
        new_genome = genome.copy()
        
        # Perturb weights with element-wise Gaussian noise
        for key, w in new_genome.params['weights'].items():
            if np.random.random() < 0.1:
                new_genome.params['weights'][key] = w + np.random.normal(0, 0.2, size=np.shape(w))
        
        # Perturb biases the same way
        for key, b in new_genome.params['biases'].items():
            if np.random.random() < 0.1:
                new_genome.params['biases'][key] = b + np.random.normal(0, 0.2, size=np.shape(b))
        
        return new_genome
    
    def _select_parent(self, fitnesses: List[float]) -> int:
        """Select parent using tournament selection."""
        # Tournament selection
        tournament_size = 3
        best_idx = np.random.randint(len(fitnesses))
        best_fitness = fitnesses[best_idx]
        
        for _ in range(tournament_size - 1):
            idx = np.random.randint(len(fitnesses))
            if fitnesses[idx] > best_fitness:
                best_idx = idx
                best_fitness = fitnesses[idx]
        
        return best_idx
    
    def _compute_fitness(self, network: Network, x: jnp.ndarray, y: jnp.ndarray,
                        n_epochs: int = 100, batch_size: int = 32) -> float:
        """Compute fitness of network."""
        n_samples = x.shape[0]
        best_loss = float('inf')
        best_accuracy = 0.0
        
        # Initial prediction
        initial_pred = network.predict(x)
        initial_acc = float(jnp.mean((initial_pred == y)))
        
        # Train network
        no_improve = 0
        for epoch in range(n_epochs):
            # Shuffle data
            perm = np.random.permutation(n_samples)
            x_shuffled = x[perm]
            y_shuffled = y[perm]
            
            # Train in batches
            epoch_losses = []
            for i in range(0, n_samples, batch_size):
                batch_x = x_shuffled[i:i + batch_size]  # slicing clamps at the array end
                batch_y = y_shuffled[i:i + batch_size]
                
                # Train step
                network.params, loss = network._train_step(network.params, batch_x, batch_y)
                epoch_losses.append(float(loss))
            
            # Update best loss
            avg_loss = float(np.mean(epoch_losses))
            if avg_loss < best_loss:
                best_loss = avg_loss
                no_improve = 0
            else:
                no_improve += 1
            
            # Compute accuracy
            predictions = network.predict(x)
            accuracy = float(jnp.mean((predictions == y)))
            best_accuracy = max(best_accuracy, accuracy)
            
            # Print progress every 10 epochs
            if epoch % 10 == 0:
                print(f"Epoch {epoch}: Loss = {avg_loss:.4f}, Accuracy = {accuracy:.4f}")
            
            # Early stopping if good accuracy or no improvement
            if accuracy > 0.95 or no_improve >= 10:
                print(f"Early stopping at epoch {epoch}")
                print(f"Final accuracy: {accuracy:.4f}")
                break
        
        # Print improvement
        print(f"Network improved from {initial_acc:.4f} to {best_accuracy:.4f}")
        
        # Fitness based on accuracy
        fitness = best_accuracy
        
        return float(fitness)
    
    def evolve(self, x: jnp.ndarray, y: jnp.ndarray, n_generations: int = 50) -> Network:
        """Evolve network architectures."""
        for generation in range(n_generations):
            print(f"\nGeneration {generation}")
            
            # Evaluate current population
            fitnesses = []
            for network in self.population:
                fitness = self._compute_fitness(network, x, y)
                fitnesses.append(fitness)
                
                # Update best network
                if fitness > self.best_fitness:
                    self.best_fitness = fitness
                    self.best_network = Network(network.genome.copy())
                    print(f"New best fitness: {fitness:.4f}")
            
            # Create new population through selection and mutation
            new_population = []
            
            # Keep best network (elitism)
            best_idx = np.argmax(fitnesses)
            new_population.append(Network(self.population[best_idx].genome.copy()))
            
            # Create rest of population
            while len(new_population) < self.population_size:
                # Select parent
                parent_idx = self._select_parent(fitnesses)
                parent = self.population[parent_idx].genome
                
                # Create child through mutation
                child_genome = self._mutate_genome(parent)
                new_population.append(Network(child_genome))
            
            self.population = new_population
            
            # Fresh Network objects have no bound train step or momentum
            # buffers, so mirror the setup done in __init__
            self.momentum_buffers = [
                {'weights': {k: jnp.zeros_like(w) for k, w in net.params['weights'].items()},
                 'biases': jnp.zeros_like(net.params['biases']),
                 'gamma': jnp.zeros_like(net.params['gamma']),
                 'beta': jnp.zeros_like(net.params['beta'])}
                for net in self.population
            ]
            for i, net in enumerate(self.population):
                net.population_idx = i
                net._train_step = lambda p, x, y, idx=i: self._train_step(self, p, x, y, idx)
        
        return self.best_network
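
# Minimal usage sketch (illustrative; not part of the original module). It
# assumes labels in {-1, +1} to match the loss in _make_train_step, and that
# Network/Genome expose the interfaces used above.
if __name__ == "__main__":
    xor_x = jnp.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]])
    xor_y = jnp.array([-1.0, 1.0, 1.0, -1.0])
    
    neat = BackpropNEAT(population_size=5, n_inputs=2, n_outputs=1, n_hidden=16)
    best_network = neat.evolve(xor_x, xor_y, n_generations=5)
    print(f"Best fitness: {neat.best_fitness:.4f}")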