ghost613 commited on
Commit
92c24be
·
verified ·
1 Parent(s): 20b31f0

Upload 2 files

Browse files
Files changed (2) hide show
  1. README.md +3 -3
  2. app.py +134 -0
README.md CHANGED
@@ -1,7 +1,7 @@
1
  ---
2
- title: LLM Training Time And Cost Calculator
3
- emoji: 🚀
4
- colorFrom: pink
5
  colorTo: green
6
  sdk: gradio
7
  sdk_version: 5.12.0
 
1
  ---
2
+ title: LLM Time And Cost Calculator
3
+ emoji: 🐢
4
+ colorFrom: yellow
5
  colorTo: green
6
  sdk: gradio
7
  sdk_version: 5.12.0
app.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
def calculate_training_metrics(
    gpu_choice, precision, num_gpus, num_parameters, dataset_tokens,
    num_epochs, utilization_rate=0.5, overhead=1.10, cost_per_gpu_hour=1.85
):
    """
    Estimate the wall-clock time and dollar cost of training an LLM on one
    or more GPUs running in parallel.

    Args:
        gpu_choice (str): GPU model name; must be a key of the internal
            throughput table (e.g. 'A100 80GB PCIe', 'H100 SXM').
        precision (str): Precision level ('bf16', 'tf32' or 'tensor'); must
            be supported by the chosen GPU.
        num_gpus (int): Number of GPUs used in parallel (>= 1).
        num_parameters (float): Total number of model parameters as an
            absolute count, NOT billions (callers convert beforehand).
        dataset_tokens (float): Total number of tokens in the dataset as an
            absolute count, NOT billions.
        num_epochs (int): Number of passes over the dataset.
        utilization_rate (float): Achieved fraction of peak GPU throughput
            (0 < rate <= 1).
        overhead (float): Multiplier for extra wall-clock time spent on data
            loading, checkpointing, evaluation, etc. (>= 1).
        cost_per_gpu_hour (float): Rental price of one GPU for one hour ($).

    Returns:
        tuple: (total_cost, training_days, training_hours) where
            training_days is an int and training_hours is the remaining
            fractional hours (< 24).

    Raises:
        ValueError: If the GPU model is unknown, the precision is not
            supported by that GPU, or utilization_rate is out of range.
    """
    # Peak GPU throughputs in FLOPS (floating-point operations per second).
    gpu_throughputs = {
        'A100 80GB PCIe': {'bf16': 312e12, 'tf32': 156e12},
        'A100 80GB SXM': {'bf16': 624e12, 'tf32': 312e12},
        'V100': {'tensor': 130e12},
        'H100 SXM': {'bf16': 1979e12, 'tf32': 989e12},
        'H100 PCIe': {'bf16': 1513e12, 'tf32': 756e12}
    }

    # Validate up front so callers get a clear message instead of a bare
    # KeyError (the UI allows e.g. V100 + 'bf16', which V100 does not have).
    if gpu_choice not in gpu_throughputs:
        raise ValueError(f"Unknown GPU model: {gpu_choice!r}")
    if precision not in gpu_throughputs[gpu_choice]:
        supported = ', '.join(gpu_throughputs[gpu_choice])
        raise ValueError(
            f"Precision {precision!r} is not supported by {gpu_choice!r}; "
            f"choose one of: {supported}"
        )
    if not 0 < utilization_rate <= 1:
        raise ValueError("utilization_rate must be in the interval (0, 1]")

    # Peak throughput of a single GPU at the chosen precision.
    base_throughput = gpu_throughputs[gpu_choice][precision]

    # Multi-GPU scaling is never perfect; assume 90% parallel efficiency.
    parallel_efficiency = 0.9
    effective_throughput = base_throughput * num_gpus * parallel_efficiency

    # Each token is processed once per epoch.
    total_tokens = dataset_tokens * num_epochs

    # Standard transformer estimate: ~6 FLOPs per parameter per token
    # (forward + backward pass).
    total_flops = 6 * num_parameters * total_tokens

    # Ideal compute time at peak effective throughput, converted to hours.
    compute_hours = total_flops / (effective_throughput * 3600)

    # Scale up for real-world utilization and pipeline overhead.
    actual_hours = (compute_hours / utilization_rate) * overhead

    # Split into whole days plus remaining fractional hours for display.
    training_days = int(actual_hours // 24)
    training_hours = actual_hours % 24

    # Billing: every GPU accrues cost for the full wall-clock duration.
    total_cost = actual_hours * cost_per_gpu_hour * num_gpus

    return total_cost, training_days, training_hours
62
+
63
def gradio_interface(
    gpu_choice, precision, num_gpus, num_parameters, dataset_tokens,
    num_epochs, utilization_rate, overhead, cost_per_gpu_hour
):
    """Adapt raw Gradio widget values, run the calculator, format the result.

    Returns a (time_message, cost_message) pair of display strings.
    """
    # Widget values may arrive as strings/floats; normalize every field and
    # scale the billion-denominated inputs up to absolute counts.
    params_total = float(num_parameters) * 1e9
    tokens_total = float(dataset_tokens) * 1e9

    total_cost, day_count, hour_remainder = calculate_training_metrics(
        gpu_choice,
        precision,
        int(num_gpus),
        params_total,
        tokens_total,
        int(num_epochs),
        float(utilization_rate),
        float(overhead),
        float(cost_per_gpu_hour),
    )

    # Human-readable summaries for the two output textboxes.
    return (
        f"{day_count} days and {hour_remainder:.1f} hours",
        f"{total_cost:,.2f}$",
    )
87
+
88
# GPU models offered in the dropdown; keys of the calculator's throughput table.
gpu_choices = ["A100 80GB PCIe", "A100 80GB SXM", "V100", "H100 SXM", "H100 PCIe"]

# Static HTML shown above the interface.
title = "<h2 style='text-align: center;'>LLM Training Time and Cost Calculator</h2>"
# NOTE: fixed user-facing typo "Ouputs:" -> "Outputs:".
description = """
<p style='text-align: center;'>Calculate both the training time and cost for large language models (LLM) with parallel computing support.</p>
<p><strong>Input Parameters:</strong></p>
<ul>
<li><strong>GPU Selection:</strong> Choose from various GPU models with different compute capabilities</li>
<li><strong>Number of GPUs:</strong> Specify how many GPUs to use in parallel</li>
<li><strong>Model Size:</strong> Number of parameters in billions</li>
<li><strong>Dataset Size:</strong> Number of tokens in your dataset in billions</li>
<li><strong>Training Epochs:</strong> Number of times to iterate over the dataset</li>
<li><strong>Utilization Rate:</strong> Expected GPU utilization (typically 0.4-0.7)</li>
<li><strong>Overhead:</strong> Additional time/cost factor for data loading, checkpointing, etc.</li>
</ul>
Outputs:
<ul>
<li><strong>Estimated Training Time:</strong> Total days and hours required for training</li>
<li><strong>Estimated Training Cost:</strong> Total cost in dollars based on GPU hours</li>
</ul>
Modified from <a href="https://huggingface.co/spaces/Heng666/LLM-Training-Cost-Calculator">this Hf Space</a>.
"""

# Wire the calculator into a Gradio form: one input component per
# gradio_interface parameter, in the same positional order.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Dropdown(choices=gpu_choices, label="Select GPU", value='A100 80GB PCIe'),
        gr.Dropdown(choices=['bf16', 'tf32', 'tensor'], label="Select Precision", value='bf16'),
        gr.Number(label="Number of GPUs", value=1, minimum=1, maximum=1024),
        gr.Number(label="Number of Parameters (billions)", value=70),
        gr.Number(label="Dataset Tokens (billions)", value=1),
        gr.Number(label="Number of Epochs", value=3, minimum=1),
        gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.5, label="GPU Utilization Rate"),
        gr.Slider(minimum=1.0, maximum=2.0, step=0.01, value=1.10, label="Overhead Factor"),
        gr.Number(label="Cost per GPU Hour ($)", value=1.85)
    ],
    outputs=[gr.Textbox(label="Estimated Training Time:"),
             gr.Textbox(label="Estimated Training Cost:")],
    title=title,
    description=description,
    article="<p style='text-align: center;'>Improved with good intentions by ghost.</p>"
)

if __name__ == "__main__":
    iface.launch()