Commit 87e742d

Merge branch 'feature/zerobubble' into feature/zerobubble

2 parents: 0aab903 + af6aa9e

5 files changed: +13 -14 lines

colossalai/booster/plugin/hybrid_parallel_plugin.py
Lines changed: 10 additions & 10 deletions

@@ -313,12 +313,8 @@ def backward(self, loss: Tensor, inputs=None, retain_graph=False, **kwargs):
         """

         # Call the superclass backward method to compute gradients.
-<<<<<<< HEAD
         with self.model._hook_context():
-            super().backward(loss, *args, **kwargs)
-=======
-        super().backward(loss, inputs=inputs, retain_graph=retain_graph, **kwargs)
->>>>>>> [plugin] hybrid support zero bubble pipeline (#6060)
+            super().backward(loss, inputs=inputs, retain_graph=retain_graph, **kwargs)

         if self.model.require_grad_sync:
             # If gradient synchronization is required, sync sequence parallelism gradients.
@@ -541,12 +537,8 @@ def backward(self, loss: Tensor, inputs=None, retain_graph=False, **kwargs):
             None
         """
         # Call the superclass backward method to compute gradients.
-<<<<<<< HEAD
         with self.model._hook_context():
-            super().backward(loss, *args, **kwargs)
-=======
-        super().backward(loss, inputs=inputs, retain_graph=retain_graph, **kwargs)
->>>>>>> [plugin] hybrid support zero bubble pipeline (#6060)
+            super().backward(loss, inputs=inputs, retain_graph=retain_graph, **kwargs)

         if self.model.require_grad_sync:
             # If gradient synchronization is required, sync sequence parallelism gradients.
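
Both hunks above resolve the same merge conflict the same way: the merge keeps the `with self.model._hook_context():` wrapper from HEAD but takes the explicit `inputs=`/`retain_graph=` forwarding from the zero-bubble branch, replacing the old `*args, **kwargs` pass-through. Below is a minimal, self-contained sketch of that pattern; the class names, the `nullcontext` stand-in for `_hook_context`, and the base-class behavior are assumptions for illustration, not ColossalAI's actual implementation.

import contextlib

import torch
from torch import Tensor, nn

class BaseOptimWrapper:
    # Hypothetical stand-in for the superclass whose backward() is forwarded to.
    def backward(self, loss: Tensor, inputs=None, retain_graph=False, **kwargs):
        loss.backward(inputs=inputs, retain_graph=retain_graph)

class HybridOptimWrapperSketch(BaseOptimWrapper):
    # Sketch of the resolved method: hook context from HEAD, explicit
    # inputs/retain_graph forwarding from the zero-bubble branch.
    def __init__(self, model: nn.Module):
        self.model = model

    def backward(self, loss: Tensor, inputs=None, retain_graph=False, **kwargs):
        # Compute gradients while the model's backward hooks are active.
        with self.model._hook_context():
            super().backward(loss, inputs=inputs, retain_graph=retain_graph, **kwargs)

model = nn.Linear(4, 1)
model._hook_context = contextlib.nullcontext  # stand-in for the real hook context
wrapper = HybridOptimWrapperSketch(model)
loss = model(torch.randn(2, 4)).sum()
wrapper.backward(loss)
print(model.weight.grad is not None)  # True: gradients were computed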
@@ -1174,6 +1166,14 @@ def __init__(
                 num_microbatch=num_microbatches,
                 microbatch_size=microbatch_size,
             )
+        elif pp_style == "zbv":
+            self.scheduler = ZeroBubbleVPipeScheduler(
+                stage_manager=self.stage_manager,
+                schedule=scheduler_nodes,
+                num_model_chunks=num_model_chunks,
+                num_microbatch=num_microbatches,
+                microbatch_size=microbatch_size,
+            )
         else:
             raise NotImplementedError()
         if sequence_parallelism_mode == "ring_attn":
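
The new elif arm registers "zbv" as an additional recognized `pp_style`, constructing a `ZeroBubbleVPipeScheduler` from the precomputed `scheduler_nodes` plan plus the same microbatch settings the existing branches use. A hedged sketch of how a caller might select it: only `pp_style`, `num_model_chunks`, `num_microbatches`, `microbatch_size`, and `scheduler_nodes` come from the hunk above; `tp_size`/`pp_size` and how the node plan is produced are assumptions about the constructor.

from colossalai.booster.plugin import HybridParallelPlugin

# Hypothetical invocation; the full constructor signature is not shown in this diff.
plugin = HybridParallelPlugin(
    tp_size=1,
    pp_size=4,
    pp_style="zbv",       # new: dispatches to ZeroBubbleVPipeScheduler
    num_model_chunks=2,   # the V schedule places two model chunks per stage
    num_microbatches=8,
    scheduler_nodes=...,  # precomputed ScheduledNode plan (see v_schedule.py)
)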

colossalai/pipeline/schedule/v_schedule.py
Lines changed: 1 addition & 1 deletion

@@ -491,4 +491,4 @@ def even_breaker(x: ScheduledNode):
         #         print(f"{node.type}-{node.minibatch}-{int(node.rollback)}", end=", ")
         #     print()

-        return local_order_with_rollback
\ No newline at end of file
+        return local_order_with_rollback

colossalai/pipeline/schedule/zero_bubble_pp.py
Lines changed: 1 addition & 1 deletion

@@ -902,4 +902,4 @@ def forward_backward_step(

         self.assert_buffer_empty()

-        return result
\ No newline at end of file
+        return result

colossalai/zero/low_level/low_level_optim.py
Lines changed: 0 additions & 1 deletion

@@ -433,7 +433,6 @@ def backward(self, loss, inputs=None, retain_graph=False):

         ctx = nullcontext() if self._backward_context is None else self._backward_context()
         with ctx:
-            loss.backward(retain_graph=retain_graph)
             loss.backward(inputs=inputs, retain_graph=retain_graph)

         if not self.require_grad_sync:
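
The deleted line looks like a merge leftover: running both calls would either raise (the autograd graph is freed after the first `backward()` when `retain_graph=False`) or double-accumulate gradients, and the first call also dropped `inputs`. The surviving call forwards `inputs=`, which restricts accumulation to the listed leaf tensors; this is stock `torch.Tensor.backward` behavior, and it is what lets a zero-bubble schedule separate input-gradient from weight-gradient work. A self-contained illustration:

import torch

x = torch.tensor(2.0, requires_grad=True)
w = torch.tensor(3.0, requires_grad=True)
loss = (x * w) ** 2

# With inputs= given, only the listed leaves accumulate gradients.
loss.backward(inputs=[w])
print(x.grad)  # None: x was excluded from accumulation
print(w.grad)  # tensor(24.): d/dw (x*w)^2 = 2 * x**2 * w = 24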

tests/test_pipeline/test_schedule/test_zerobubble_pp.py
Lines changed: 1 addition & 1 deletion

@@ -863,4 +863,4 @@ def test_pp():


 if __name__ == "__main__":
-    test_pp()
\ No newline at end of file
+    test_pp()
