ghost613 commited on
Commit
92c24be
·
verified ·
1 Parent(s): 20b31f0

Upload 2 files

Browse files
Files changed (2) hide show
  1. README.md +3 -3
  2. app.py +134 -0
README.md CHANGED
@@ -1,7 +1,7 @@
1
  ---
2
- title: LLM Training Time And Cost Calculator
3
- emoji: 🚀
4
- colorFrom: pink
5
  colorTo: green
6
  sdk: gradio
7
  sdk_version: 5.12.0
 
1
  ---
2
+ title: LLM Time And Cost Calculator
3
+ emoji: 🐢
4
+ colorFrom: yellow
5
  colorTo: green
6
  sdk: gradio
7
  sdk_version: 5.12.0
app.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
def calculate_training_metrics(
    gpu_choice, precision, num_gpus, num_parameters, dataset_tokens,
    num_epochs, utilization_rate=0.5, overhead=1.10, cost_per_gpu_hour=1.85
):
    """
    Estimate the wall-clock time and dollar cost of training an LLM on one
    or more GPUs running in parallel.

    Args:
        gpu_choice (str): GPU model name; must be a key of the internal
            throughput table (e.g. 'A100 80GB PCIe', 'H100 SXM').
        precision (str): Precision level ('bf16', 'tf32' or 'tensor'); must
            be supported by the chosen GPU.
        num_gpus (int): Number of GPUs used in parallel (>= 1).
        num_parameters (float): Total number of model parameters as an
            absolute count, NOT billions (callers convert beforehand).
        dataset_tokens (float): Total number of tokens in the dataset as an
            absolute count, NOT billions.
        num_epochs (int): Number of passes over the dataset.
        utilization_rate (float): Achieved fraction of peak GPU throughput
            (0 < rate <= 1).
        overhead (float): Multiplier for extra wall-clock time spent on data
            loading, checkpointing, evaluation, etc. (>= 1).
        cost_per_gpu_hour (float): Rental price of one GPU for one hour ($).

    Returns:
        tuple: (total_cost, training_days, training_hours) where
            training_days is an int and training_hours is the remaining
            fractional hours (< 24).

    Raises:
        ValueError: If the GPU model is unknown, the precision is not
            supported by that GPU, or utilization_rate is out of range.
    """
    # Peak GPU throughputs in FLOPS (floating-point operations per second).
    gpu_throughputs = {
        'A100 80GB PCIe': {'bf16': 312e12, 'tf32': 156e12},
        'A100 80GB SXM': {'bf16': 624e12, 'tf32': 312e12},
        'V100': {'tensor': 130e12},
        'H100 SXM': {'bf16': 1979e12, 'tf32': 989e12},
        'H100 PCIe': {'bf16': 1513e12, 'tf32': 756e12}
    }

    # Validate up front so callers get a clear message instead of a bare
    # KeyError (the UI allows e.g. V100 + 'bf16', which V100 does not have).
    if gpu_choice not in gpu_throughputs:
        raise ValueError(f"Unknown GPU model: {gpu_choice!r}")
    if precision not in gpu_throughputs[gpu_choice]:
        supported = ', '.join(gpu_throughputs[gpu_choice])
        raise ValueError(
            f"Precision {precision!r} is not supported by {gpu_choice!r}; "
            f"choose one of: {supported}"
        )
    if not 0 < utilization_rate <= 1:
        raise ValueError("utilization_rate must be in the interval (0, 1]")

    # Peak throughput of a single GPU at the chosen precision.
    base_throughput = gpu_throughputs[gpu_choice][precision]

    # Multi-GPU scaling is never perfect; assume 90% parallel efficiency.
    parallel_efficiency = 0.9
    effective_throughput = base_throughput * num_gpus * parallel_efficiency

    # Each token is processed once per epoch.
    total_tokens = dataset_tokens * num_epochs

    # Standard transformer estimate: ~6 FLOPs per parameter per token
    # (forward + backward pass).
    total_flops = 6 * num_parameters * total_tokens

    # Ideal compute time at peak effective throughput, converted to hours.
    compute_hours = total_flops / (effective_throughput * 3600)

    # Scale up for real-world utilization and pipeline overhead.
    actual_hours = (compute_hours / utilization_rate) * overhead

    # Split into whole days plus remaining fractional hours for display.
    training_days = int(actual_hours // 24)
    training_hours = actual_hours % 24

    # Billing: every GPU accrues cost for the full wall-clock duration.
    total_cost = actual_hours * cost_per_gpu_hour * num_gpus

    return total_cost, training_days, training_hours
62
+
63
def gradio_interface(
    gpu_choice, precision, num_gpus, num_parameters, dataset_tokens,
    num_epochs, utilization_rate, overhead, cost_per_gpu_hour
):
    """Adapt raw Gradio widget values, run the calculator, format the result.

    Returns a (time_message, cost_message) pair of display strings.
    """
    # Widget values may arrive as strings/floats; normalize every field and
    # scale the billion-denominated inputs up to absolute counts.
    params_total = float(num_parameters) * 1e9
    tokens_total = float(dataset_tokens) * 1e9

    total_cost, day_count, hour_remainder = calculate_training_metrics(
        gpu_choice,
        precision,
        int(num_gpus),
        params_total,
        tokens_total,
        int(num_epochs),
        float(utilization_rate),
        float(overhead),
        float(cost_per_gpu_hour),
    )

    # Human-readable summaries for the two output textboxes.
    return (
        f"{day_count} days and {hour_remainder:.1f} hours",
        f"{total_cost:,.2f}$",
    )
87
+
88
# GPU models offered in the dropdown; keys of the calculator's throughput table.
gpu_choices = ["A100 80GB PCIe", "A100 80GB SXM", "V100", "H100 SXM", "H100 PCIe"]

# Static HTML shown above the interface.
title = "<h2 style='text-align: center;'>LLM Training Time and Cost Calculator</h2>"
# NOTE: fixed user-facing typo "Ouputs:" -> "Outputs:".
description = """
<p style='text-align: center;'>Calculate both the training time and cost for large language models (LLM) with parallel computing support.</p>
<p><strong>Input Parameters:</strong></p>
<ul>
<li><strong>GPU Selection:</strong> Choose from various GPU models with different compute capabilities</li>
<li><strong>Number of GPUs:</strong> Specify how many GPUs to use in parallel</li>
<li><strong>Model Size:</strong> Number of parameters in billions</li>
<li><strong>Dataset Size:</strong> Number of tokens in your dataset in billions</li>
<li><strong>Training Epochs:</strong> Number of times to iterate over the dataset</li>
<li><strong>Utilization Rate:</strong> Expected GPU utilization (typically 0.4-0.7)</li>
<li><strong>Overhead:</strong> Additional time/cost factor for data loading, checkpointing, etc.</li>
</ul>
Outputs:
<ul>
<li><strong>Estimated Training Time:</strong> Total days and hours required for training</li>
<li><strong>Estimated Training Cost:</strong> Total cost in dollars based on GPU hours</li>
</ul>
Modified from <a href="https://huggingface.co/spaces/Heng666/LLM-Training-Cost-Calculator">this Hf Space</a>.
"""

# Wire the calculator into a Gradio form: one input component per
# gradio_interface parameter, in the same positional order.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Dropdown(choices=gpu_choices, label="Select GPU", value='A100 80GB PCIe'),
        gr.Dropdown(choices=['bf16', 'tf32', 'tensor'], label="Select Precision", value='bf16'),
        gr.Number(label="Number of GPUs", value=1, minimum=1, maximum=1024),
        gr.Number(label="Number of Parameters (billions)", value=70),
        gr.Number(label="Dataset Tokens (billions)", value=1),
        gr.Number(label="Number of Epochs", value=3, minimum=1),
        gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.5, label="GPU Utilization Rate"),
        gr.Slider(minimum=1.0, maximum=2.0, step=0.01, value=1.10, label="Overhead Factor"),
        gr.Number(label="Cost per GPU Hour ($)", value=1.85)
    ],
    outputs=[gr.Textbox(label="Estimated Training Time:"),
             gr.Textbox(label="Estimated Training Cost:")],
    title=title,
    description=description,
    article="<p style='text-align: center;'>Improved with good intentions by ghost.</p>"
)

if __name__ == "__main__":
    iface.launch()