Commit 0fe0770

Refactor pipeline threading and simplify SDG batch processing

Added logging to track the number of threads remaining during pipeline execution, for easier debugging. Removed redundant batching logic from per-block processing to fix a concurrency bug that was slowing down SDG.

1 parent 6454380 commit 0fe0770

File tree

1 file changed (+11, -19 lines)

src/instructlab/sdg/pipeline.py

Lines changed: 11 additions & 19 deletions
@@ -170,11 +170,20 @@ def generate(self, dataset, checkpoint_name=None) -> Dataset:
                 executor.submit(self._generate_single, input_split)
                 for input_split in input_splits
             ]
-
+            threads_remaining_to_execute = len(futures)
+            logger.info(
+                "Total of %d pipeline threads to execute",
+                len(futures),
+            )
             # Collect the results of each batch as they finish. This needs to
             # wait for them all, so the order of waiting doesn't matter
             for future in futures:
                 ds = future.result()
+                threads_remaining_to_execute -= 1
+                logger.info(
+                    "Total of %d pipeline threads to check for completion",
+                    threads_remaining_to_execute,
+                )
                 output_splits.append(ds)
                 checkpointer.checkpoint(ds)
             checkpointer.done()
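
The added logging is a simple countdown over the submitted futures: record how many there are up front, then decrement as each result is collected. A minimal standalone sketch of the same pattern, assuming nothing beyond the standard library (the work function and the sample data are hypothetical, not from the commit):

    import logging
    from concurrent.futures import ThreadPoolExecutor

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    def work(split):
        # Stand-in for Pipeline._generate_single: process one input split.
        return [x * 2 for x in split]

    input_splits = [[1, 2], [3, 4], [5, 6]]
    output_splits = []

    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(work, split) for split in input_splits]
        threads_remaining_to_execute = len(futures)
        logger.info("Total of %d pipeline threads to execute", len(futures))
        for future in futures:
            ds = future.result()  # blocks until this split's thread finishes
            threads_remaining_to_execute -= 1
            logger.info(
                "Total of %d pipeline threads to check for completion",
                threads_remaining_to_execute,
            )
            output_splits.append(ds)

Note that the loop waits on futures in submission order, so the counter tracks results not yet collected rather than threads still running; since every result must be gathered before the pipeline can proceed, the order of waiting doesn't matter, as the surrounding comment already notes.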
@@ -197,24 +206,7 @@ def _generate_single(self, dataset) -> Dataset:
             drop_duplicates_cols = block_prop.get("drop_duplicates", False)
             block = block_type(self.ctx, self, block_name, **block_config)
             logger.info("Running block: %s", block_name)
-
-            # Check if batching is enabled
-            if not self.ctx.batching_enabled:
-                logger.info(
-                    "Batching disabled; processing block '%s' single-threaded.",
-                    block_name,
-                )
-                dataset = block.generate(dataset)
-            else:
-                # Split the dataset into batches
-                input_splits = self._split_dataset(dataset)
-                # Process each batch in sequence
-                output_splits = [
-                    block.generate(input_split) for input_split in input_splits
-                ]
-                # Combine the processed splits back into a single dataset
-                dataset = concatenate_datasets(output_splits)
-
+            dataset = block.generate(dataset)
             # If the dataset is empty after processing, terminate early
             if len(dataset) == 0:
                 return dataset
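
The removed branch duplicated work that generate already does: the dataset is split once at the top level and each split runs through _generate_single on its own thread, so re-splitting inside each thread only fragmented the batches further and processed the fragments sequentially. A condensed sketch of the surviving flow, with checkpointing and configuration omitted (the function name generate_batched is hypothetical; _split_dataset, _generate_single, and concatenate_datasets are taken from the diff above):

    from concurrent.futures import ThreadPoolExecutor
    from datasets import concatenate_datasets

    def generate_batched(pipeline, dataset):
        # Split once, at the top level only.
        input_splits = pipeline._split_dataset(dataset)
        with ThreadPoolExecutor() as executor:
            # One thread per split; each thread runs every block on its
            # whole split, with no further re-splitting inside the thread.
            futures = [
                executor.submit(pipeline._generate_single, split)
                for split in input_splits
            ]
            output_splits = [future.result() for future in futures]
        # Merge the processed splits back into a single dataset.
        return concatenate_datasets(output_splits)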
