Upload 2 files
README.md
CHANGED
@@ -1,7 +1,7 @@
 ---
-title: LLM
-emoji:
-colorFrom:
+title: LLM Time And Cost Calculator
+emoji: 🐢
+colorFrom: yellow
 colorTo: green
 sdk: gradio
 sdk_version: 5.12.0
app.py
ADDED
@@ -0,0 +1,134 @@
+import gradio as gr
+
+def calculate_training_metrics(
+    gpu_choice, precision, num_gpus, num_parameters, dataset_tokens,
+    num_epochs, utilization_rate=0.5, overhead=1.10, cost_per_gpu_hour=1.85
+):
+    """
+    Calculates both the training time and cost for LLM training with parallel computing.
+
+    Args:
+    - gpu_choice (str): The choice of GPU model
+    - precision (str): The precision level for training
+    - num_gpus (int): Number of GPUs for parallel computing
+    - num_parameters (float): Total number of model parameters
+    - dataset_tokens (float): Total number of tokens in the dataset
+    - num_epochs (int): Number of training epochs
+    - utilization_rate (float): GPU utilization rate (0 < rate ≤ 1)
+    - overhead (float): Overhead multiplier for additional costs
+    - cost_per_gpu_hour (float): Cost per GPU hour in dollars
+
+    Returns:
+    - tuple: (total_cost, training_days, training_hours)
+    """
+
+    # GPU throughputs in FLOPS (operations per second)
+    gpu_throughputs = {
+        'A100 80GB PCIe': {'bf16': 312e12, 'tf32': 156e12},
+        'A100 80GB SXM': {'bf16': 624e12, 'tf32': 312e12},
+        'V100': {'tensor': 130e12},
+        'H100 SXM': {'bf16': 1979e12, 'tf32': 989e12},
+        'H100 PCIe': {'bf16': 1513e12, 'tf32': 756e12}
+    }
+
+    # Get the base GPU throughput for the selected precision
+    base_throughput = gpu_throughputs[gpu_choice][precision]
+
+    # Calculate effective throughput with multiple GPUs,
+    # assuming 90% scaling efficiency for parallel computing
+    parallel_efficiency = 0.9
+    effective_throughput = base_throughput * num_gpus * parallel_efficiency
+
+    # Calculate total tokens processed (dataset_tokens * epochs)
+    total_tokens = dataset_tokens * num_epochs
+
+    # Calculate total FLOPs needed (6 operations per parameter per token)
+    total_flops = 6 * num_parameters * total_tokens
+
+    # Calculate raw computation hours needed
+    compute_hours = total_flops / (effective_throughput * 3600)
+
+    # Adjust for utilization rate and overhead
+    actual_hours = (compute_hours / utilization_rate) * overhead
+
+    # Calculate days and remaining hours
+    training_days = int(actual_hours // 24)
+    training_hours = actual_hours % 24
+
+    # Calculate total cost (hours * cost per GPU hour * number of GPUs)
+    total_cost = actual_hours * cost_per_gpu_hour * num_gpus
+
+    return total_cost, training_days, training_hours
+
+def gradio_interface(
+    gpu_choice, precision, num_gpus, num_parameters, dataset_tokens,
+    num_epochs, utilization_rate, overhead, cost_per_gpu_hour
+):
+    # Convert inputs to appropriate numeric types
+    num_parameters = float(num_parameters) * 1e9  # Convert billions to actual number
+    dataset_tokens = float(dataset_tokens) * 1e9  # Convert billions to actual number
+    num_gpus = int(num_gpus)
+    num_epochs = int(num_epochs)
+    utilization_rate = float(utilization_rate)
+    overhead = float(overhead)
+    cost_per_gpu_hour = float(cost_per_gpu_hour)
+
+    # Calculate metrics
+    cost, days, hours = calculate_training_metrics(
+        gpu_choice, precision, num_gpus, num_parameters, dataset_tokens,
+        num_epochs, utilization_rate, overhead, cost_per_gpu_hour
+    )
+
+    # Format the output messages
+    time_msg = f"{days} days and {hours:.1f} hours"
+    cost_msg = f"${cost:,.2f}"
+
+    return time_msg, cost_msg
+
+# Define the available GPU choices
+gpu_choices = ["A100 80GB PCIe", "A100 80GB SXM", "V100", "H100 SXM", "H100 PCIe"]
+
+# Create the Gradio interface
+title = "<h2 style='text-align: center;'>LLM Training Time and Cost Calculator</h2>"
+description = """
+<p style='text-align: center;'>Calculate both the training time and cost for large language models (LLMs) with parallel computing support.</p>
+<p><strong>Input Parameters:</strong></p>
+<ul>
+    <li><strong>GPU Selection:</strong> Choose from various GPU models with different compute capabilities</li>
+    <li><strong>Number of GPUs:</strong> Specify how many GPUs to use in parallel</li>
+    <li><strong>Model Size:</strong> Number of parameters in billions</li>
+    <li><strong>Dataset Size:</strong> Number of tokens in your dataset in billions</li>
+    <li><strong>Training Epochs:</strong> Number of times to iterate over the dataset</li>
+    <li><strong>Utilization Rate:</strong> Expected GPU utilization (typically 0.4-0.7)</li>
+    <li><strong>Overhead:</strong> Additional time/cost factor for data loading, checkpointing, etc.</li>
+</ul>
+<p><strong>Outputs:</strong></p>
+<ul>
+    <li><strong>Estimated Training Time:</strong> Total days and hours required for training</li>
+    <li><strong>Estimated Training Cost:</strong> Total cost in dollars based on GPU hours</li>
+</ul>
+Modified from <a href="https://huggingface.co/spaces/Heng666/LLM-Training-Cost-Calculator">this HF Space</a>.
+"""
+
+iface = gr.Interface(
+    fn=gradio_interface,
+    inputs=[
+        gr.Dropdown(choices=gpu_choices, label="Select GPU", value='A100 80GB PCIe'),
+        gr.Dropdown(choices=['bf16', 'tf32', 'tensor'], label="Select Precision", value='bf16'),
+        gr.Number(label="Number of GPUs", value=1, minimum=1, maximum=1024),
+        gr.Number(label="Number of Parameters (billions)", value=70),
+        gr.Number(label="Dataset Tokens (billions)", value=1),
+        gr.Number(label="Number of Epochs", value=3, minimum=1),
+        gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.5, label="GPU Utilization Rate"),
+        gr.Slider(minimum=1.0, maximum=2.0, step=0.01, value=1.10, label="Overhead Factor"),
+        gr.Number(label="Cost per GPU Hour ($)", value=1.85)
+    ],
+    outputs=[gr.Textbox(label="Estimated Training Time:"),
+             gr.Textbox(label="Estimated Training Cost:")],
+    title=title,
+    description=description,
+    article="<p style='text-align: center;'>Improved with good intentions by ghost.</p>"
+)
+
+if __name__ == "__main__":
+    iface.launch()
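
As a quick sanity check of the cost model in this commit (illustrative only, not part of either committed file), calling calculate_training_metrics directly with the interface defaults reproduces the 6 * parameters * tokens arithmetic end to end; the figures in the comments below are worked out by hand from the code above.

# Illustrative sanity check, assuming app.py from this commit is importable.
from app import calculate_training_metrics

cost, days, hours = calculate_training_metrics(
    gpu_choice='A100 80GB PCIe',
    precision='bf16',
    num_gpus=1,
    num_parameters=70e9,    # 70 billion parameters, as an absolute count
    dataset_tokens=1e9,     # 1 billion tokens
    num_epochs=3,
    utilization_rate=0.5,
    overhead=1.10,
    cost_per_gpu_hour=1.85,
)

# 6 * 70e9 * 3e9 = 1.26e21 FLOPs against an effective 0.9 * 312 TFLOPS,
# i.e. about 1,246 compute-hours; dividing by the 0.5 utilization rate and
# multiplying by the 1.10 overhead gives roughly 2,742 wall-clock hours,
# so about 114 days, 6.2 hours and roughly $5,073 on a single GPU.
print(f"{days} days and {hours:.1f} hours, ${cost:,.2f}")

The single-GPU case also makes the scaling assumption easy to probe: rerunning with num_gpus=8 divides the wall-clock hours by 8 (the 0.9 parallel-efficiency factor is applied even for one GPU in the code above), while the total dollar cost is unchanged, since cost scales with hours * num_gpus.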