Add system monitor
- Dockerfile +2 -0
- app_system_monitor.py +87 -0
- app_training.py +15 -4
- requirements-monitor.txt +4 -0
- trainer.py +13 -8
Dockerfile
CHANGED
@@ -44,6 +44,8 @@ RUN pyenv install ${PYTHON_VERSION} && \
 RUN pip install --no-cache-dir -U torch==1.13.1 torchvision==0.14.1
 COPY --chown=1000 requirements.txt /tmp/requirements.txt
 RUN pip install --no-cache-dir -U -r /tmp/requirements.txt
+COPY --chown=1000 requirements-monitor.txt /tmp/requirements-monitor.txt
+RUN pip install --no-cache-dir -U -r /tmp/requirements-monitor.txt

 COPY --chown=1000 . ${HOME}/app
 RUN cd Tune-A-Video && patch -p1 < ../patch
app_system_monitor.py
ADDED
@@ -0,0 +1,87 @@
+#!/usr/bin/env python
+
+from __future__ import annotations
+
+import collections
+
+import gradio as gr
+import nvitop
+import pandas as pd
+import plotly.express as px
+import psutil
+
+
+class SystemMonitor:
+    MAX_SIZE = 61
+
+    def __init__(self):
+        self.devices = nvitop.Device.all()
+        self.cpu_memory_usage = collections.deque(
+            [0 for _ in range(self.MAX_SIZE)], maxlen=self.MAX_SIZE)
+        self.cpu_memory_usage_str = ''
+        self.gpu_memory_usage = collections.deque(
+            [0 for _ in range(self.MAX_SIZE)], maxlen=self.MAX_SIZE)
+        self.gpu_util = collections.deque([0 for _ in range(self.MAX_SIZE)],
+                                          maxlen=self.MAX_SIZE)
+        self.gpu_memory_usage_str = ''
+        self.gpu_util_str = ''
+
+    def update(self) -> None:
+        self.update_cpu()
+        self.update_gpu()
+
+    def update_cpu(self) -> None:
+        memory = psutil.virtual_memory()
+        self.cpu_memory_usage.append(memory.percent)
+        self.cpu_memory_usage_str = f'{memory.used / 1024**3:0.2f}GiB / {memory.total / 1024**3:0.2f}GiB ({memory.percent}%)'
+
+    def update_gpu(self) -> None:
+        if not self.devices:
+            return
+        device = self.devices[0]
+        self.gpu_memory_usage.append(device.memory_percent())
+        self.gpu_util.append(device.gpu_utilization())
+        self.gpu_memory_usage_str = f'{device.memory_usage()} ({device.memory_percent()}%)'
+        self.gpu_util_str = f'{device.gpu_utilization()}%'
+
+    def get_json(self) -> dict[str, str]:
+        return {
+            'CPU memory usage': self.cpu_memory_usage_str,
+            'GPU memory usage': self.gpu_memory_usage_str,
+            'GPU Util': self.gpu_util_str,
+        }
+
+    def get_graph_data(self) -> dict[str, list[int | float]]:
+        return {
+            'index': list(range(-self.MAX_SIZE + 1, 1)),
+            'CPU memory usage': self.cpu_memory_usage,
+            'GPU memory usage': self.gpu_memory_usage,
+            'GPU Util': self.gpu_util,
+        }
+
+    def get_graph(self):
+        df = pd.DataFrame(self.get_graph_data())
+        return px.line(df,
+                       x='index',
+                       y=[
+                           'CPU memory usage',
+                           'GPU memory usage',
+                           'GPU Util',
+                       ],
+                       range_y=[-5,
+                                105]).update_layout(xaxis_title='Time',
+                                                    yaxis_title='Percentage')
+
+
+def create_monitor_demo() -> gr.Blocks:
+    monitor = SystemMonitor()
+    with gr.Blocks() as demo:
+        gr.JSON(value=monitor.update, every=1, visible=False)
+        gr.JSON(value=monitor.get_json, show_label=False, every=1)
+        gr.Plot(value=monitor.get_graph, show_label=False, every=1)
+    return demo
+
+
+if __name__ == '__main__':
+    demo = create_monitor_demo()
+    demo.queue(api_open=False).launch()
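Note: SystemMonitor keeps a rolling 61-sample window per metric (MAX_SIZE-length deques), reading CPU stats via psutil and GPU stats via nvitop. A minimal sketch of the same polling logic outside Gradio, assuming nvitop and psutil are installed and at most one GPU is of interest (illustrative only, not part of this commit):

# Illustrative sketch: poll the same metrics SystemMonitor collects, once per second.
import time

import nvitop
import psutil

devices = nvitop.Device.all()
for _ in range(5):
    memory = psutil.virtual_memory()
    print(f'CPU memory: {memory.percent}%')
    if devices:
        device = devices[0]
        print(f'GPU memory: {device.memory_percent()}%, GPU util: {device.gpu_utilization()}%')
    time.sleep(1)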
app_training.py
CHANGED
@@ -6,6 +6,7 @@ import os
 
 import gradio as gr
 
+from app_system_monitor import create_monitor_demo
 from constants import UploadTarget
 from inference import InferencePipeline
 from trainer import Trainer
@@ -13,6 +14,11 @@ from trainer import Trainer
 
 def create_training_demo(trainer: Trainer,
                          pipe: InferencePipeline | None = None) -> gr.Blocks:
+    def read_log() -> str:
+        with open(trainer.log_file) as f:
+            lines = f.readlines()
+        return ''.join(lines[-10:])
+
     hf_token = os.getenv('HF_TOKEN')
     with gr.Blocks() as demo:
         with gr.Row():
@@ -108,8 +114,14 @@ def create_training_demo(trainer: Trainer,
                 run_button = gr.Button('Start Training')
 
                 with gr.Box():
-                    gr.
-
+                    gr.Text(label='Log',
+                            value=read_log,
+                            lines=10,
+                            max_lines=10,
+                            every=1)
+        if not os.getenv('DISABLE_SYSTEM_MONITOR'):
+            with gr.Accordion(label='System info', open=False):
+                create_monitor_demo()
 
         if pipe is not None:
             run_button.click(fn=pipe.clear)
@@ -136,8 +148,7 @@ def create_training_demo(trainer: Trainer,
                 upload_to,
                 remove_gpu_after_training,
                 input_token,
-            ]
-            outputs=output_message)
+            ])
     return demo
 
 
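Note: gr.Text(value=read_log, every=1) re-runs the callable every second while the queue is enabled, so the training tab tails the last 10 lines of trainer.log_file; setting the DISABLE_SYSTEM_MONITOR environment variable skips embedding the monitor accordion. A standalone sketch of that log-tail component, assuming a Gradio 3.x version with the every= parameter and an existing local log.txt (illustrative only, not part of this commit):

# Illustrative sketch of a polling log viewer.
import gradio as gr

def tail_log() -> str:
    # Return the last 10 lines of the (assumed) local log file.
    with open('log.txt') as f:
        return ''.join(f.readlines()[-10:])

with gr.Blocks() as demo:
    gr.Text(label='Log', value=tail_log, lines=10, max_lines=10, every=1)

# every= requires the queue to be enabled.
demo.queue(api_open=False).launch()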
requirements-monitor.txt
ADDED
@@ -0,0 +1,4 @@
+nvitop==1.1.1
+pandas==2.0.0
+plotly==5.14.1
+psutil==5.9.4
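Note: the monitor dependencies are pinned in their own requirements file and installed as a separate layer (see the Dockerfile change above). A quick sanity check that the pins resolved inside the image might look like this (illustrative only, not part of this commit):

from importlib import metadata

for package in ('nvitop', 'pandas', 'plotly', 'psutil'):
    print(package, metadata.version(package))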
trainer.py
CHANGED
@@ -32,6 +32,9 @@ class Trainer:
         self.checkpoint_dir = pathlib.Path('checkpoints')
         self.checkpoint_dir.mkdir(exist_ok=True)
 
+        self.log_file = pathlib.Path('log.txt')
+        self.log_file.touch(exist_ok=True)
+
     def download_base_model(self, base_model_id: str) -> str:
         model_dir = self.checkpoint_dir / base_model_id
         if not model_dir.exists():
@@ -72,7 +75,7 @@ class Trainer:
             upload_to: str,
             remove_gpu_after_training: bool,
             input_token: str,
-    ) ->
+    ) -> None:
         if SPACE_ID == ORIGINAL_SPACE_ID:
             raise gr.Error(
                 'This Space does not work on this Shared UI. Duplicate the Space and attribute a GPU'
@@ -134,15 +137,19 @@ class Trainer:
             OmegaConf.save(config, f)
 
         command = f'accelerate launch Tune-A-Video/train_tuneavideo.py --config {config_path}'
-
+        with open(self.log_file, 'w') as f:
+            subprocess.run(shlex.split(command),
+                           stdout=f,
+                           stderr=subprocess.STDOUT,
+                           text=True)
         save_model_card(save_dir=output_dir,
                         base_model=base_model,
                         training_prompt=training_prompt,
                         test_prompt=validation_prompt,
                         test_image_dir='samples')
 
-
-
+        with open(self.log_file, 'a') as f:
+            f.write('Training completed!\n')
 
         if upload_to_hub:
             upload_message = self.model_uploader.upload_model(
@@ -152,8 +159,8 @@ class Trainer:
                 private=use_private_repo,
                 delete_existing_repo=delete_existing_repo,
                 input_token=input_token)
-
-
+        with open(self.log_file, 'a') as f:
+            f.write(upload_message)
 
         if remove_gpu_after_training:
             space_id = os.getenv('SPACE_ID')
@@ -162,5 +169,3 @@ class Trainer:
                 token=self.hf_token if self.hf_token else input_token)
             api.request_space_hardware(repo_id=space_id,
                                        hardware='cpu-basic')
-
-        return message
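Note: Trainer.run now returns None and reports progress through self.log_file instead: the accelerate launch output is redirected there via subprocess.run(..., stdout=f, stderr=subprocess.STDOUT), and completion/upload messages are appended so the training tab's log panel can display them. A minimal sketch of that pattern in isolation, where the echo command is a hypothetical stand-in for the real training command (illustrative only, not part of this commit):

# Illustrative sketch: run a command and stream stdout+stderr into a file the UI tails.
import pathlib
import shlex
import subprocess

log_file = pathlib.Path('log.txt')
log_file.touch(exist_ok=True)

command = 'echo pretending to train'  # stand-in for the accelerate launch command
with open(log_file, 'w') as f:
    # Both stdout and stderr end up in the same log file.
    subprocess.run(shlex.split(command), stdout=f, stderr=subprocess.STDOUT, text=True)

with open(log_file, 'a') as f:
    f.write('Training completed!\n')

print(log_file.read_text())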