pipeline-parallelism-with-controllable-memory

Running

App Files Files Community

Wan Xinyi committed on Nov 30, 2023

Commit

ac0b05c

1 Parent(s): be3048f

Add some presets, support 1f1b with fewer microbatches

Browse files

Files changed (2) hide show

app.py +36 -6
hand_schedule.py +20 -11

app.py CHANGED Viewed

@@ -46,6 +46,7 @@ def calculate(p, m, f, b, w, c, mem):
     baseline_bubble=None
     baseline_acceleration=None
     baseline_image=None
   else:
     baseline_result = hand_schedule.get_hand_schedule(p, m, f, b + w, 0, c)
     baseline_result = [
@@ -70,11 +71,12 @@ def calculate(p, m, f, b, w, c, mem):
   zb_bubble=percentage(zb_time/(f+b+w)/m - 1)
   zb_acceleration=percentage(baseline_time/zb_time - 1) if baseline_time is not None else None
-  if mem < p:
     zbv_time=None
     zbv_bubble=None
     zbv_acceleration=None
     zbv_image=None
   else:
     zbv_graph = v_schedule.PipelineGraph(
                   n_stage=p,
@@ -94,10 +96,13 @@ def calculate(p, m, f, b, w, c, mem):
     zbv_bubble=percentage(zbv_time/(f+b+w)/m - 1)
     zbv_acceleration=percentage(baseline_time/zbv_time - 1) if baseline_time is not None else None
-    max_time = max([baseline_time, zb_time, zbv_time])
-    print(max_time)
     baseline_image = get_schedule_image(baseline_result, max_time)
     zb_image = get_schedule_image(zb_result, max_time)
     zbv_image = get_schedule_image(zbv_result, max_time)
   return [baseline_time, baseline_bubble, baseline_acceleration, baseline_image, zb_time, zb_bubble, zb_acceleration, zb_image, zbv_time, zbv_bubble, zbv_acceleration, zbv_image]
@@ -105,6 +110,20 @@ def calculate(p, m, f, b, w, c, mem):
 with gr.Blocks() as demo:
   gr.Markdown(open("description1.md").read())
   gr.Markdown("# Pipeline Scheduler Playground")
   with gr.Row():
     with gr.Column(scale=1):
       with gr.Group():
@@ -142,7 +161,7 @@ with gr.Blocks() as demo:
         baseline_bubble=gr.Textbox("", label="Bubble Rate. Calculated as (1 - longest stage time/(F+B+W)/m).")
         baseline_acceleration=gr.Textbox("", label="Acceleration compared to 1F1B")
       with gr.Column(scale=4):
-        baseline_image=gr.Image(None, interactive=False, label="Schedule Image")
   with gr.Group():
     gr.Markdown("Zero Bubble Schedule")
@@ -152,7 +171,7 @@ with gr.Blocks() as demo:
         zb_bubble=gr.Textbox("", label="Bubble Rate. Calculated as (1 - longest stage time/(F+B+W)/m).")
         zb_acceleration=gr.Textbox("", label="Acceleration compared to 1F1B")
       with gr.Column(scale=4):
-        zb_image=gr.Image(None, interactive=False, label="Schedule Image")
   with gr.Group():
     gr.Markdown("Zero Bubble V Schedule (ZBV)")
     with gr.Row():
@@ -161,7 +180,18 @@ with gr.Blocks() as demo:
         zbv_bubble=gr.Textbox("", label="Bubble Rate. Calculated as (1 - longest stage time/(F+B+W)/m).")
         zbv_acceleration=gr.Textbox("", label="Acceleration compared to 1F1B")
       with gr.Column(scale=4):
-        zbv_image=gr.Image(None, interactive=False, label="Schedule Image")
     button.click(calculate, inputs=[p, m, f, b, w, c, mem], outputs=[baseline_time, baseline_bubble, baseline_acceleration, baseline_image, zb_time, zb_bubble, zb_acceleration, zb_image, zbv_time, zbv_bubble, zbv_acceleration, zbv_image])
   gr.Markdown(open("description2.md").read())
 demo.launch()

     baseline_bubble=None
     baseline_acceleration=None
     baseline_image=None
+    baseline_result=None
   else:
     baseline_result = hand_schedule.get_hand_schedule(p, m, f, b + w, 0, c)
     baseline_result = [
   zb_bubble=percentage(zb_time/(f+b+w)/m - 1)
   zb_acceleration=percentage(baseline_time/zb_time - 1) if baseline_time is not None else None
+  if mem < p or m < 2 * p:
     zbv_time=None
     zbv_bubble=None
     zbv_acceleration=None
     zbv_image=None
+    zbv_result=None
   else:
     zbv_graph = v_schedule.PipelineGraph(
                   n_stage=p,
     zbv_bubble=percentage(zbv_time/(f+b+w)/m - 1)
     zbv_acceleration=percentage(baseline_time/zbv_time - 1) if baseline_time is not None else None
+  max_time = max(filter(lambda x: x is not None, [baseline_time, zb_time, zbv_time]))
+  print(max_time)
+  if baseline_result is not None:
     baseline_image = get_schedule_image(baseline_result, max_time)
+  if zb_result is not None:
     zb_image = get_schedule_image(zb_result, max_time)
+  if zbv_result is not None:
     zbv_image = get_schedule_image(zbv_result, max_time)
   return [baseline_time, baseline_bubble, baseline_acceleration, baseline_image, zb_time, zb_bubble, zb_acceleration, zb_image, zbv_time, zbv_bubble, zbv_acceleration, zbv_image]
 with gr.Blocks() as demo:
   gr.Markdown(open("description1.md").read())
   gr.Markdown("# Pipeline Scheduler Playground")
+  presets = {
+    'Ideal Case 1p': (4, 12, 20, 20, 20, 0, '1p (Same as 1F1B)'),
+    'Ideal Case 2p': (4, 12, 20, 20, 20, 0, '2p'),
+    'Real Case 1p': (4, 12, 1049, 1122, 903, 79, '1p (Same as 1F1B)'),
+    'Real Case 2p': (4, 12, 1049, 1122, 903, 79, '2p'),
+  }
+  preset_buttons = {}
+  with gr.Group():
+    gr.Markdown("Preset Setups")
+    with gr.Row():
+      for (k, v) in presets.items():
+        preset_buttons[k] = gr.Button(k, variant="secondary")
   with gr.Row():
     with gr.Column(scale=1):
       with gr.Group():
         baseline_bubble=gr.Textbox("", label="Bubble Rate. Calculated as (1 - longest stage time/(F+B+W)/m).")
         baseline_acceleration=gr.Textbox("", label="Acceleration compared to 1F1B")
       with gr.Column(scale=4):
+        baseline_image=gr.Image(None, interactive=False, label="Schedule Image", show_label=False)
   with gr.Group():
     gr.Markdown("Zero Bubble Schedule")
         zb_bubble=gr.Textbox("", label="Bubble Rate. Calculated as (1 - longest stage time/(F+B+W)/m).")
         zb_acceleration=gr.Textbox("", label="Acceleration compared to 1F1B")
       with gr.Column(scale=4):
+        zb_image=gr.Image(None, interactive=False, label="Schedule Image", show_label=False)
   with gr.Group():
     gr.Markdown("Zero Bubble V Schedule (ZBV)")
     with gr.Row():
         zbv_bubble=gr.Textbox("", label="Bubble Rate. Calculated as (1 - longest stage time/(F+B+W)/m).")
         zbv_acceleration=gr.Textbox("", label="Acceleration compared to 1F1B")
       with gr.Column(scale=4):
+        zbv_image=gr.Image(None, interactive=False, label="Schedule Image", show_label=False)
     button.click(calculate, inputs=[p, m, f, b, w, c, mem], outputs=[baseline_time, baseline_bubble, baseline_acceleration, baseline_image, zb_time, zb_bubble, zb_acceleration, zb_image, zbv_time, zbv_bubble, zbv_acceleration, zbv_image])
+  for (k, v) in presets.items():
+    def update_preset(pb, p, m, f, b, w, c, mem):
+      print(pb)
+      print(presets[pb])
+      print(presets[pb][-1])
+      return *presets[pb],*calculate(*presets[pb][:-1], update_mem(p, presets[pb][-1], -1))
+    preset_buttons[k].click(
+       update_preset,
+       inputs=[preset_buttons[k], p, m, f, b, w, c, mem],
+       outputs=[p, m, f, b, w, c, memsel, baseline_time, baseline_bubble, baseline_acceleration, baseline_image, zb_time, zb_bubble, zb_acceleration, zb_image, zbv_time, zbv_bubble, zbv_acceleration, zbv_image])
   gr.Markdown(open("description2.md").read())
 demo.launch()

hand_schedule.py CHANGED Viewed

@@ -11,8 +11,10 @@ class ScheduledNode:
 def get_hand_schedule(_p, _n, _f, _b, _w, _c, warmup_c=1):
-    assert _n >= 2 * _p
     stage = [[] for _ in range(_p)]
     for rank in range(_p):
         warmup = (_p - rank - 1) * warmup_c
         for _ in range(warmup):
@@ -25,12 +27,13 @@ def get_hand_schedule(_p, _n, _f, _b, _w, _c, warmup_c=1):
                 stage[rank].append(2)
         for _ in range((_p - 1) * warmup_c - warmup):
             stage[rank].append(2)
-    labels = ["F", "B", "W"]
     for rank in range(_p):
         rank_str = " " * rank
-        for i in range(_n * 3):
             rank_str += labels[stage[rank][i]]
-        # print(rank_str)
     size = _p * _n * 3
     def get_id(_i, _j, _k):
         return _i * _p * _n + _j * _n + _k
@@ -42,6 +45,8 @@ def get_hand_schedule(_p, _n, _f, _b, _w, _c, warmup_c=1):
         for rank in range(_p):
             last = e[rank]
             if stage[rank][i] == 0:
                 tmp = e[rank] + _f
                 if rank > 0:
                     assert t[get_id(0, rank - 1, fc[rank])] > 0
@@ -50,17 +55,17 @@ def get_hand_schedule(_p, _n, _f, _b, _w, _c, warmup_c=1):
                 t[get_id(0, rank, fc[rank])] = tmp
                 fc[rank] += 1
             elif stage[rank][i] == 1:
                 tmp = e[rank] + _b
                 if rank < _p - 1:
-                    assert t[get_id(1, rank + 1, bc[rank])] > 0
                     tmp = max(tmp, t[get_id(1, rank + 1, bc[rank])] + _c + _b)
                 e[rank] = tmp
                 t[get_id(1, rank, bc[rank])] = tmp
                 bc[rank] += 1
-            else:
-                tmp = e[rank] + _w
-                e[rank] = tmp
-                t[get_id(2, rank, i - fc[rank] - bc[rank])] = tmp
             # if rank == _p - 1:
             #     print(_f, _b, _w, _c, "->", rank, i, stage[rank][i], e[rank], e[rank] - last)
     max_time = 0
@@ -73,7 +78,7 @@ def get_hand_schedule(_p, _n, _f, _b, _w, _c, warmup_c=1):
     # exit(0)
     res = [[] for _ in range(_p)]
     for rank in range(_p):
-        for i in range(_n):
             res[rank].append(ScheduledNode(
               "F", rank, i, t[get_id(0, rank, i)] - _f, t[get_id(0, rank, i)]))
             res[rank].append(ScheduledNode(
@@ -81,4 +86,8 @@ def get_hand_schedule(_p, _n, _f, _b, _w, _c, warmup_c=1):
             res[rank].append(ScheduledNode(
               "W", rank, i, t[get_id(2, rank, i)] - _w, t[get_id(2, rank, i)]))
         res[rank] = sorted(res[rank], key=lambda x: x.start_time)
-    return res

 def get_hand_schedule(_p, _n, _f, _b, _w, _c, warmup_c=1):
+    # assert _n >= 2 * _p
     stage = [[] for _ in range(_p)]
+    real_n = _n
+    _n = max(_n, _p)
     for rank in range(_p):
         warmup = (_p - rank - 1) * warmup_c
         for _ in range(warmup):
                 stage[rank].append(2)
         for _ in range((_p - 1) * warmup_c - warmup):
             stage[rank].append(2)
+    labels = ["F", "B", "W", '.']
     for rank in range(_p):
         rank_str = " " * rank
+        # for i in range(_n * 3):
+        for i in range(len(stage[rank])):
             rank_str += labels[stage[rank][i]]
+        print(rank_str)
     size = _p * _n * 3
     def get_id(_i, _j, _k):
         return _i * _p * _n + _j * _n + _k
         for rank in range(_p):
             last = e[rank]
             if stage[rank][i] == 0:
+                if fc[rank] >= real_n:
+                  continue
                 tmp = e[rank] + _f
                 if rank > 0:
                     assert t[get_id(0, rank - 1, fc[rank])] > 0
                 t[get_id(0, rank, fc[rank])] = tmp
                 fc[rank] += 1
             elif stage[rank][i] == 1:
+                if bc[rank] >= real_n:
+                  continue
                 tmp = e[rank] + _b
                 if rank < _p - 1:
+                    assert t[get_id(1, rank + 1, bc[rank])] > 0, f"{rank} {i} {bc[rank]}"
                     tmp = max(tmp, t[get_id(1, rank + 1, bc[rank])] + _c + _b)
                 e[rank] = tmp
                 t[get_id(1, rank, bc[rank])] = tmp
                 bc[rank] += 1
+            elif stage[rank][i] == 2:
+                continue
             # if rank == _p - 1:
             #     print(_f, _b, _w, _c, "->", rank, i, stage[rank][i], e[rank], e[rank] - last)
     max_time = 0
     # exit(0)
     res = [[] for _ in range(_p)]
     for rank in range(_p):
+        for i in range(real_n):
             res[rank].append(ScheduledNode(
               "F", rank, i, t[get_id(0, rank, i)] - _f, t[get_id(0, rank, i)]))
             res[rank].append(ScheduledNode(
             res[rank].append(ScheduledNode(
               "W", rank, i, t[get_id(2, rank, i)] - _w, t[get_id(2, rank, i)]))
         res[rank] = sorted(res[rank], key=lambda x: x.start_time)
+    return res
+if __name__ == "__main__":
+    print(get_hand_schedule(16, 16, 1, 1, 1, 0))