Skip to content

Commit 3993541

Browse files
committed
refactor: Refactor DataProcessor to use centralized encoder class retrieval
- Removed the encoder class initialization from the DataProcessor constructor.
- Introduced a centralized method for obtaining encoder classes in the encoders module.
- Updated relevant tests to reflect the changes in DataProcessor initialization.
- Cleaned up unused imports and code related to dynamic encoder imports.

Signed-off-by: eshwarprasadS <[email protected]>
1 parent 19beb30 commit 3993541

File tree

3 files changed

+24
-77
lines changed

3 files changed

+24
-77
lines changed
Lines changed: 17 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -1,19 +1,22 @@
1-
# Standard
2-
import importlib
1+
# Import all encoder classes directly
2+
# Local
3+
from .arctic_encoder import ArcticEmbedEncoder
4+
5+
# Create a mapping of encoder types to their classes
6+
ENCODER_REGISTRY = {
7+
"arctic": ArcticEmbedEncoder,
8+
}
39

410

511
def get_encoder_class(encoder_type: str):
612
"""Get the encoder class based on the encoder type."""
713
try:
8-
# Convert encoder_type to class name (e.g., 'arctic' -> 'ArcticEmbedEncoder')
9-
class_name = f"{encoder_type.capitalize()}EmbedEncoder"
10-
11-
# Use absolute import instead of relative
12-
module_name = f"sdg.src.instructlab.sdg.encoders.{encoder_type}_encoder"
13-
14-
module = importlib.import_module(module_name)
15-
16-
# Get the class from the module
17-
return getattr(module, class_name)
18-
except (ImportError, AttributeError) as e:
19-
raise ValueError(f"Unsupported encoder type: '{encoder_type}'") from e
14+
if encoder_type not in ENCODER_REGISTRY:
15+
supported_encoders = list(ENCODER_REGISTRY.keys())
16+
raise ValueError(
17+
f"Unsupported encoder type: '{encoder_type}'. "
18+
f"Supported types are: {supported_encoders}"
19+
)
20+
return ENCODER_REGISTRY[encoder_type]
21+
except Exception as e:
22+
raise ValueError(f"Error getting encoder class: {str(e)}") from e

src/instructlab/sdg/subset_selection.py

Lines changed: 5 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@
2020
import torch
2121

2222
# Local
23+
from .encoders import get_encoder_class
2324
from .utils.subset_selection_utils import (
2425
compute_pairwise_dense,
2526
get_default_num_gpus,
@@ -171,19 +172,14 @@ class DataProcessor:
171172
Enhanced data processor with support for combined files and multiple selection methods.
172173
"""
173174

174-
def __init__(self, config: ProcessingConfig, encoder_cls):
175+
def __init__(self, config: ProcessingConfig):
175176
"""
176177
Initializes the DataProcessor with the given configuration and encoder class.
177178
178179
Args:
179180
config (ProcessingConfig): The processing configuration.
180-
encoder_cls: The encoder class to use for generating embeddings.
181181
"""
182182
self.config = config
183-
self.encoder = encoder_cls(
184-
model_name=config.encoder.encoder_model,
185-
testing_mode=config.encoder.testing_mode,
186-
)
187183
self.env = Environment(loader=BaseLoader())
188184
self.templates = {
189185
k: self.env.from_string(v) for k, v in config.template.templates.items()
@@ -750,22 +746,7 @@ def _process_dataset_shard(args):
750746
device = f"cuda:{gpu_id}"
751747
logger.info(f"GPU {gpu_id} started processing {len(dataset_shard)} samples")
752748

753-
# Import the encoder directly using the system path
754-
# Standard
755-
756-
sys.path.append(
757-
os.path.dirname(
758-
os.path.dirname(
759-
os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
760-
)
761-
)
762-
)
763-
764-
# Import the encoder class using string-based absolute import
765-
766-
module_name = f"sdg.src.instructlab.sdg.encoders.{encoder_type}_encoder"
767-
module = importlib.import_module(module_name)
768-
encoder_cls = getattr(module, f"{encoder_type.capitalize()}EmbedEncoder")
749+
encoder_cls = get_encoder_class(encoder_type)
769750

770751
# Create encoder instance
771752
encoder = encoder_cls(
@@ -845,7 +826,7 @@ def _process_dataset_shard(args):
845826
# pylint: disable=broad-exception-caught
846827
except Exception as e:
847828
logger.error(f"Error processing shard on GPU {gpu_id}: {str(e)}")
848-
return None
829+
raise
849830

850831

851832
def _merge_shard_files(shard_files, merged_file):
@@ -1014,24 +995,6 @@ def get_supported_encoders():
1014995
]
1015996

1016997

1017-
def get_encoder_class(encoder_type: str):
1018-
"""Get the encoder class based on the encoder type."""
1019-
try:
1020-
# Convert encoder_type to class name (e.g., 'arctic' -> 'ArcticEmbedEncoder')
1021-
class_name = f"{encoder_type.capitalize()}EmbedEncoder"
1022-
# Import the module dynamically
1023-
module = __import__(
1024-
f"instructlab.sdg.encoders.{encoder_type}_encoder", fromlist=[class_name]
1025-
)
1026-
# Get the class from the module
1027-
return getattr(module, class_name)
1028-
except (ImportError, AttributeError) as e:
1029-
supported_encoders = get_supported_encoders()
1030-
raise ValueError(
1031-
f"Unsupported encoder type: '{encoder_type}'. "
1032-
f"Supported types are: {[f'{t}' for t in supported_encoders]}"
1033-
) from e
1034-
1035998

1036999
def subset_datasets(
10371000
input_files: List[str],
@@ -1081,9 +1044,7 @@ def subset_datasets(
10811044

10821045
try:
10831046
logger.info(f"Processing configuration: {config}")
1084-
processor = DataProcessor(
1085-
config, get_encoder_class(config.encoder.encoder_type)
1086-
)
1047+
processor = DataProcessor(config)
10871048
processor.process_files(input_files, config.basic.output_dir)
10881049

10891050
except Exception as e:

tests/test_subset_selection.py

Lines changed: 2 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -51,7 +51,7 @@ def data_processor(mock_encoder, mock_gpu_environment):
5151
input_files=["test.jsonl"],
5252
subset_sizes=[10, 20.5],
5353
)
54-
return DataProcessor(config, mock_encoder)
54+
return DataProcessor(config)
5555

5656

5757
def test_format_text(data_processor):
@@ -123,23 +123,6 @@ def test_invalid_subset_sizes(mock_gpu_environment):
123123
subset_sizes=[-10],
124124
)
125125

126-
127-
def test_process_batch(mock_gpu_environment, data_processor, tmp_path):
128-
"""Test batch processing of texts"""
129-
130-
batch_texts = ["text1", "text2", "text3"]
131-
output_file = str(tmp_path / "test_batch.h5")
132-
133-
embedding_dim = data_processor.process_batch(batch_texts, output_file)
134-
135-
assert embedding_dim is not None
136-
assert os.path.exists(output_file)
137-
138-
with h5py.File(output_file, "r") as f:
139-
embeddings = f["embeddings"][:]
140-
assert embeddings.shape == (3, embedding_dim)
141-
142-
143126
def test_generate_embeddings_parallel(mock_gpu_environment, tmp_path, mock_encoder):
144127
"""Test the parallelized embedding generation feature."""
145128
# Create a sample dataset
@@ -165,7 +148,7 @@ def test_generate_embeddings_parallel(mock_gpu_environment, tmp_path, mock_encod
165148
config.system.num_gpus = 2
166149

167150
# Create processor
168-
processor = DataProcessor(config, mock_encoder)
151+
processor = DataProcessor(config)
169152

170153
# Test case 1: File exists, should return early
171154
result_path = processor.generate_embeddings(dataset, output_dir)

0 commit comments

Comments (0)