import gradio as gr


def calculate_training_metrics(
    gpu_choice, precision, num_gpus, num_parameters, dataset_tokens,
    num_epochs, utilization_rate=0.5, overhead=1.10, cost_per_gpu_hour=1.85
):
    """
    Calculates both the training time and cost for LLM training with parallel computing.

    Args:
    - gpu_choice (str): The choice of GPU model
    - precision (str): The precision level for training
    - num_gpus (int): Number of GPUs for parallel computing
    - num_parameters (float): Number of model parameters (actual count, not billions)
    - dataset_tokens (float): Number of tokens in the dataset
    - num_epochs (int): Number of training epochs
    - utilization_rate (float): GPU utilization rate (0 < rate ≤ 1)
    - overhead (float): Overhead multiplier for additional costs
    - cost_per_gpu_hour (float): Cost per GPU hour in dollars

    Returns:
    - tuple: (total_cost, training_days, training_hours)
    """
    # Peak GPU throughputs in FLOPS (floating-point operations per second)
    gpu_throughputs = {
        'A100 80GB PCIe': {'bf16': 312e12, 'tf32': 156e12},
        'A100 80GB SXM': {'bf16': 624e12, 'tf32': 312e12},
        'V100': {'tensor': 130e12},
        'H100 SXM': {'bf16': 1979e12, 'tf32': 989e12},
        'H100 PCIe': {'bf16': 1513e12, 'tf32': 756e12}
    }

    # Get the base GPU throughput for the requested precision; the V100 entry
    # only exposes a single 'tensor' mode, so fall back to the first available
    # mode when the requested precision is not listed for the selected GPU.
    throughput_options = gpu_throughputs[gpu_choice]
    base_throughput = throughput_options.get(precision, next(iter(throughput_options.values())))

    # Calculate effective throughput with multiple GPUs,
    # assuming 90% scaling efficiency for parallel computing
    parallel_efficiency = 0.9
    effective_throughput = base_throughput * num_gpus * parallel_efficiency

    # Calculate total tokens processed (dataset_tokens * epochs)
    total_tokens = dataset_tokens * num_epochs

    # Calculate total FLOPs needed (6 operations per parameter per token)
    total_flops = 6 * num_parameters * total_tokens

    # Calculate raw computation hours needed
    compute_hours = total_flops / (effective_throughput * 3600)

    # Adjust for utilization rate and overhead
    actual_hours = (compute_hours / utilization_rate) * overhead

    # Calculate days and remaining hours
    training_days = int(actual_hours // 24)
    training_hours = actual_hours % 24

    # Calculate total cost (hours * cost per GPU hour * number of GPUs)
    total_cost = actual_hours * cost_per_gpu_hour * num_gpus

    return total_cost, training_days, training_hours


def gradio_interface(
    gpu_choice, precision, num_gpus, num_parameters, dataset_tokens,
    num_epochs, utilization_rate, overhead, cost_per_gpu_hour
):
    # Convert inputs to appropriate numeric types
    num_parameters = float(num_parameters) * 1e9  # Convert billions to actual count
    dataset_tokens = float(dataset_tokens) * 1e9  # Convert billions to actual count
    num_gpus = int(num_gpus)
    num_epochs = int(num_epochs)
    utilization_rate = float(utilization_rate)
    overhead = float(overhead)
    cost_per_gpu_hour = float(cost_per_gpu_hour)

    # Calculate metrics
    cost, days, hours = calculate_training_metrics(
        gpu_choice, precision, num_gpus, num_parameters, dataset_tokens,
        num_epochs, utilization_rate, overhead, cost_per_gpu_hour
    )

    # Format the output messages
    time_msg = f"{days} days and {hours:.1f} hours"
    cost_msg = f"${cost:,.2f}"
    return time_msg, cost_msg


# Define available GPU choices and their default precisions
gpu_choices = ["A100 80GB PCIe", "A100 80GB SXM", "V100", "H100 SXM", "H100 PCIe"]
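
# A minimal sketch of using the calculator directly, without the Gradio UI.
# The configuration below is purely hypothetical: a 7B-parameter model trained
# on 1T tokens for one epoch across 64 H100 SXM GPUs in bf16, with the default
# utilization rate, overhead, and price. Under the 6 * parameters * tokens rule
# used above, that corresponds to 6 * 7e9 * 1e12 = 4.2e22 total FLOPs.
def example_estimate():
    cost, days, hours = calculate_training_metrics(
        gpu_choice="H100 SXM",
        precision="bf16",
        num_gpus=64,
        num_parameters=7e9,       # 7B parameters (actual count, not billions)
        dataset_tokens=1e12,      # 1T training tokens
        num_epochs=1,
    )
    print(f"Estimated cost: ${cost:,.2f} over {days} days and {hours:.1f} hours")
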
# Create the Gradio interface
title = "LLM Training Time and Cost Calculator"
description = """
Calculate both the training time and cost for large language models (LLM) with parallel computing support.

Input Parameters: GPU model, training precision, number of GPUs, model parameters (billions),
dataset tokens (billions), number of epochs, GPU utilization rate, overhead multiplier,
and cost per GPU hour.

Improved with good intentions by ghost.
"""

# The input widgets below mirror the gradio_interface signature; the widget
# types, labels, and default values are assumptions for this sketch.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Dropdown(choices=gpu_choices, value="A100 80GB PCIe", label="GPU Model"),
        gr.Dropdown(choices=["bf16", "tf32", "tensor"], value="bf16", label="Precision"),
        gr.Number(value=8, label="Number of GPUs"),
        gr.Number(value=7, label="Model Parameters (billions)"),
        gr.Number(value=1000, label="Dataset Tokens (billions)"),
        gr.Number(value=1, label="Number of Epochs"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.5, step=0.05, label="GPU Utilization Rate"),
        gr.Number(value=1.10, label="Overhead Multiplier"),
        gr.Number(value=1.85, label="Cost per GPU Hour ($)"),
    ],
    outputs=[
        gr.Textbox(label="Estimated Training Time"),
        gr.Textbox(label="Estimated Training Cost"),
    ],
    title=title,
    description=description,
)

if __name__ == "__main__":
    iface.launch()