pipeline-parallelism-with-controllable-memory

Running

App Files Files Community

Wan Xinyi committed on Nov 29, 2023

Commit

be3048f

1 Parent(s): 594a8f9

Add 1f1b

Browse files

Files changed (3) hide show

app.py +15 -9
hand_schedule.py +84 -0
svg_event.py +7 -6

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import gradio as gr
 import auto_schedule
 import v_schedule
 from PIL import Image
 from svg_event import render_manual_graph
 import pathlib
@@ -46,10 +47,13 @@ def calculate(p, m, f, b, w, c, mem):
     baseline_acceleration=None
     baseline_image=None
   else:
-    baseline_time=(f+b+w)*m + (f+b+w+c)*(p-1)
     baseline_bubble=percentage(baseline_time/(f+b+w)/m - 1)
     baseline_acceleration=percentage(0)
-    baseline_image=None
   zb_result = auto_schedule.auto_schedule(p, m, auto_schedule.GraphConfig(
@@ -91,6 +95,8 @@ def calculate(p, m, f, b, w, c, mem):
     zbv_acceleration=percentage(baseline_time/zbv_time - 1) if baseline_time is not None else None
     max_time = max([baseline_time, zb_time, zbv_time])
     zb_image = get_schedule_image(zb_result, max_time)
     zbv_image = get_schedule_image(zbv_result, max_time)
@@ -110,23 +116,23 @@ with gr.Blocks() as demo:
       with gr.Group():
         gr.Markdown("Costs. All costs are used as integers. For ZBV schedules, this is the time of two virtual stages on a stage combined.")
         with gr.Row():
-          f=gr.Number(label="Time of F", value=8, interactive=True, precision=0)
-          b=gr.Number(label="Time of B", value=8, interactive=True, precision=0)
-          w=gr.Number(label="Time of W", value=8, interactive=True, precision=0)
-          c=gr.Number(label="Time of one P2P communication", value=1, interactive=True, precision=0)
   with gr.Group():
     gr.Markdown("Activation memory limit.")
     def update_mem(p, s, mem):
       print("update")
       if s=="custom":
         return mem
-      return p*int(s[:1])
-    memsel=gr.Radio(choices=["1p (Same as 1F1B)", "2p", "3p", "custom"], value="1p (Same as 1F1B)")
     mem=gr.Number(label="Custom memory limit in terms of pending F on a stage. For ZBV schedules, this is relative to two virtual stages on a stage combined.", value=p.value, interactive=True, precision=0)
     memsel.change(update_mem, inputs=[p, memsel, mem], outputs=mem)
     p.change(update_mem, inputs=[p, memsel, mem], outputs=mem)
-  button=gr.Button("Calculate")
   with gr.Group():
     gr.Markdown("1F1B")

 import gradio as gr
 import auto_schedule
 import v_schedule
+import hand_schedule
 from PIL import Image
 from svg_event import render_manual_graph
 import pathlib
     baseline_acceleration=None
     baseline_image=None
   else:
+    baseline_result = hand_schedule.get_hand_schedule(p, m, f, b + w, 0, c)
+    baseline_result = [
+        list(filter(lambda x: x.type in {'F', 'B'}, r)) for r in baseline_result
+    ]
+    baseline_time = get_schedule_time(baseline_result)
     baseline_bubble=percentage(baseline_time/(f+b+w)/m - 1)
     baseline_acceleration=percentage(0)
   zb_result = auto_schedule.auto_schedule(p, m, auto_schedule.GraphConfig(
     zbv_acceleration=percentage(baseline_time/zbv_time - 1) if baseline_time is not None else None
     max_time = max([baseline_time, zb_time, zbv_time])
+    print(max_time)
+    baseline_image = get_schedule_image(baseline_result, max_time)
     zb_image = get_schedule_image(zb_result, max_time)
     zbv_image = get_schedule_image(zbv_result, max_time)
       with gr.Group():
         gr.Markdown("Costs. All costs are used as integers. For ZBV schedules, this is the time of two virtual stages on a stage combined.")
         with gr.Row():
+          f=gr.Number(label="Time of F", value=100, interactive=True, precision=0)
+          b=gr.Number(label="Time of B", value=110, interactive=True, precision=0)
+          w=gr.Number(label="Time of W", value=90, interactive=True, precision=0)
+          c=gr.Number(label="Time of one P2P communication", value=5, interactive=True, precision=0)
   with gr.Group():
     gr.Markdown("Activation memory limit.")
     def update_mem(p, s, mem):
       print("update")
       if s=="custom":
         return mem
+      return int(p*float(s.split('p')[0]) + 0.5)
+    memsel=gr.Radio(choices=["1p (Same as 1F1B)", "1.5p", "2p", "3p", "custom"], value="1p (Same as 1F1B)")
     mem=gr.Number(label="Custom memory limit in terms of pending F on a stage. For ZBV schedules, this is relative to two virtual stages on a stage combined.", value=p.value, interactive=True, precision=0)
     memsel.change(update_mem, inputs=[p, memsel, mem], outputs=mem)
     p.change(update_mem, inputs=[p, memsel, mem], outputs=mem)
+  button=gr.Button("Calculate", variant="primary")
   with gr.Group():
     gr.Markdown("1F1B")

hand_schedule.py ADDED Viewed

	@@ -0,0 +1,84 @@

+from dataclasses import dataclass
+@dataclass(eq=True, frozen=True)
+class ScheduledNode:
+    type: str
+    stage: int
+    minibatch: int
+    start_time: int
+    completion_time: int
+    rollback: bool = False
+def get_hand_schedule(_p, _n, _f, _b, _w, _c, warmup_c=1):
+    assert _n >= 2 * _p
+    stage = [[] for _ in range(_p)]
+    for rank in range(_p):
+        warmup = (_p - rank - 1) * warmup_c
+        for _ in range(warmup):
+            stage[rank].append(0)
+        for i in range(_n):
+            if warmup + i < _n:
+                stage[rank].append(0)
+            stage[rank].append(1)
+            if warmup + i >= (_p - 1) * warmup_c:
+                stage[rank].append(2)
+        for _ in range((_p - 1) * warmup_c - warmup):
+            stage[rank].append(2)
+    labels = ["F", "B", "W"]
+    for rank in range(_p):
+        rank_str = " " * rank
+        for i in range(_n * 3):
+            rank_str += labels[stage[rank][i]]
+        # print(rank_str)
+    size = _p * _n * 3
+    def get_id(_i, _j, _k):
+        return _i * _p * _n + _j * _n + _k
+    t = [-1] * size
+    e = [0] * _p
+    fc = [0] * _p
+    bc = [0] * _p
+    for i in range(3 * _n):
+        for rank in range(_p):
+            last = e[rank]
+            if stage[rank][i] == 0:
+                tmp = e[rank] + _f
+                if rank > 0:
+                    assert t[get_id(0, rank - 1, fc[rank])] > 0
+                    tmp = max(tmp, t[get_id(0, rank - 1, fc[rank])] + _c + _f)
+                e[rank] = tmp
+                t[get_id(0, rank, fc[rank])] = tmp
+                fc[rank] += 1
+            elif stage[rank][i] == 1:
+                tmp = e[rank] + _b
+                if rank < _p - 1:
+                    assert t[get_id(1, rank + 1, bc[rank])] > 0
+                    tmp = max(tmp, t[get_id(1, rank + 1, bc[rank])] + _c + _b)
+                e[rank] = tmp
+                t[get_id(1, rank, bc[rank])] = tmp
+                bc[rank] += 1
+            else:
+                tmp = e[rank] + _w
+                e[rank] = tmp
+                t[get_id(2, rank, i - fc[rank] - bc[rank])] = tmp
+            # if rank == _p - 1:
+            #     print(_f, _b, _w, _c, "->", rank, i, stage[rank][i], e[rank], e[rank] - last)
+    max_time = 0
+    for rank in range(_p):
+        if warmup_c == 2:
+            max_time = max(max_time, e[rank] - t[get_id(0, rank, 0)] + _f)
+        else:
+            max_time = max(max_time, e[rank])
+        # print(rank, "->", e[rank])
+    # exit(0)
+    res = [[] for _ in range(_p)]
+    for rank in range(_p):
+        for i in range(_n):
+            res[rank].append(ScheduledNode(
+              "F", rank, i, t[get_id(0, rank, i)] - _f, t[get_id(0, rank, i)]))
+            res[rank].append(ScheduledNode(
+              "B", rank, i, t[get_id(1, rank, i)] - _b, t[get_id(1, rank, i)]))
+            res[rank].append(ScheduledNode(
+              "W", rank, i, t[get_id(2, rank, i)] - _w, t[get_id(2, rank, i)]))
+        res[rank] = sorted(res[rank], key=lambda x: x.start_time)
+    return res

svg_event.py CHANGED Viewed

@@ -170,8 +170,8 @@ def draw_experiment_and_schedule(exp_events, sched_events, output_filename, tail
     d.save_svg(output_filename)
-def draw_events(events, output_filename, include_w=True, include_o=True, tail=50):
-    canvas_info = CanvasInfo(events, tail, center_title_height=0, enable_info=True)
     max_len = canvas_info.max_len
     # height = canvas_info.height
     # info_height = canvas_info.info_height
@@ -185,8 +185,9 @@ def draw_events(events, output_filename, include_w=True, include_o=True, tail=50
 class CanvasInfo:
-    def __init__(self, events, tail, center_title_height=CENTER_TITLE_HEIGHT, enable_info=True):
-        last_time = max(max([e["completion_time"] for e in dev_evs]) for dev_evs in events)
         self.max_len = (last_time + TIME_PER_UNIT - 1) // TIME_PER_UNIT + tail
         self.height = SPAN_HEIGHT * len(events) + BORDER_SIZE * (len(events) + 1)
@@ -233,7 +234,7 @@ def plot_events(ctx, events, title_text: str, canvas_info: CanvasInfo, include_w
             if ENABLE_BATCH_ID:
                 minibatch = str(e["minibatch"])
                 center = (start + end) // 2
-                data_ctx.text(h, center, minibatch, font_scale=0.7, fill='black' if e["chunk"] == 0 else 'white')
         if ENABLE_BORDER:
             data_ctx.line(h+SPAN_HEIGHT, 0, h+SPAN_HEIGHT+BORDER_SIZE, max_len - 1)
@@ -340,7 +341,7 @@ def render_manual_graph(data, longest_time, enable_batch_id = False):
     #events = load_json_data("no-bb-schedule.json")
     path = os.path.join(tempfile.mkdtemp(), 'a.svg')
-    draw_events(events, path, include_w=True, include_o=False, tail=50)
     return path

     d.save_svg(output_filename)
+def draw_events(events, output_filename, include_w=True, include_o=True, tail=50, longest_time=None):
+    canvas_info = CanvasInfo(events, tail, center_title_height=0, enable_info=True, longest_time=longest_time)
     max_len = canvas_info.max_len
     # height = canvas_info.height
     # info_height = canvas_info.info_height
 class CanvasInfo:
+    def __init__(self, events, tail, center_title_height=CENTER_TITLE_HEIGHT, enable_info=True, longest_time=None):
+        last_time = max(max([e["completion_time"] for e in dev_evs]) for dev_evs in events) if longest_time is None else longest_time
         self.max_len = (last_time + TIME_PER_UNIT - 1) // TIME_PER_UNIT + tail
         self.height = SPAN_HEIGHT * len(events) + BORDER_SIZE * (len(events) + 1)
             if ENABLE_BATCH_ID:
                 minibatch = str(e["minibatch"])
                 center = (start + end) // 2
+                data_ctx.text(h, center, minibatch, font_scale=0.6, fill='black' if e["chunk"] == 0 else 'white')
         if ENABLE_BORDER:
             data_ctx.line(h+SPAN_HEIGHT, 0, h+SPAN_HEIGHT+BORDER_SIZE, max_len - 1)
     #events = load_json_data("no-bb-schedule.json")
     path = os.path.join(tempfile.mkdtemp(), 'a.svg')
+    draw_events(events, path, include_w=True, include_o=False, tail=50, longest_time=longest_time * time_scale)
     return path