Skip to content

[BUG] Errors when importing SOK and Data loader #189

@bschifferer

Description

@bschifferer

I am not able to import the Merlin Dataloader together with SOK in TensorFlow. Either import order (SOK -> Dataloader, or Dataloader -> SOK) throws an error (see thread).

Importing SOK and then Dataloader

import os

import tensorflow as tf
import sparse_operation_kit as sok

os.environ["TF_GPU_ALLOCATOR"]="cuda_malloc_async"  # fraction of free memory

BATCH_SIZE = 64000
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, range(1)))
sok.Init(global_batch_size=BATCH_SIZE)

import nvtabular as nvt
from nvtabular.loader.tensorflow import KerasSequenceLoader, KerasSequenceValidater
from nvtabular.framework_utils.tensorflow import layers
File /usr/local/lib/python3.8/dist-packages/tensorflow/python/framework/config.py:874, in set_logical_device_configuration(device, logical_devices)
    810 """Set the logical device configuration for a `tf.config.PhysicalDevice`.
    811 
    812 A visible `tf.config.PhysicalDevice` will by default have a single
   (...)
    872   RuntimeError: Runtime is already initialized.
    873 """
--> 874 context.context().set_logical_device_configuration(device, logical_devices)

File /usr/local/lib/python3.8/dist-packages/tensorflow/python/eager/context.py:1601, in Context.set_logical_device_configuration(self, dev, virtual_devices)
   1600 if self._context_handle is not None:
-> 1601   raise RuntimeError(
   1602       "Virtual devices cannot be modified after being initialized")
   1604 self._virtual_device_map[dev] = virtual_devices

RuntimeError: Virtual devices cannot be modified after being initialized

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
Input In [1], in <cell line: 13>()
     10 sok.Init(global_batch_size=BATCH_SIZE)
     12 import nvtabular as nvt
---> 13 from nvtabular.loader.tensorflow import KerasSequenceLoader, KerasSequenceValidater
     14 from nvtabular.framework_utils.tensorflow import layers

File /nvtabular/nvtabular/loader/tensorflow.py:28, in <module>
     25 from nvtabular.loader.backend import DataLoader
     26 from nvtabular.loader.tf_utils import configure_tensorflow, get_dataset_schema_from_feature_columns
---> 28 from_dlpack = configure_tensorflow()
     29 LOG = logging.getLogger("nvtabular")
     30 # tf import must happen after config to restrict memory use

File /nvtabular/nvtabular/loader/tf_utils.py:64, in configure_tensorflow(memory_allocation, device)
     58         tf.config.experimental.set_virtual_device_configuration(
     59             tf_devices[device],
     60             [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=memory_allocation)],
     61         )
     62     except RuntimeError as e:
     63         # Virtual devices must be set before GPUs have been initialized
---> 64         warnings.warn(e)
     66 # versions using TF earlier than 2.3.0 need to use extension
     67 # library for dlpack support to avoid memory leak issue
     68 __TF_DLPACK_STABLE_VERSION = "2.3.0"

TypeError: expected string or bytes-like object

Importing Dataloader and then SOK

import os

import tensorflow as tf
import sparse_operation_kit as sok

os.environ["TF_GPU_ALLOCATOR"]="cuda_malloc_async"  # fraction of free memory

import nvtabular as nvt
from nvtabular.loader.tensorflow import KerasSequenceLoader, KerasSequenceValidater
from nvtabular.framework_utils.tensorflow import layers

BATCH_SIZE = 64000
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, range(1)))
sok.Init(global_batch_size=BATCH_SIZE)
---------------------------------------------------------------------------
AbortedError                              Traceback (most recent call last)
Input In [1], in <cell line: 15>()
     13 BATCH_SIZE = 64000
     14 os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, range(1)))
---> 15 sok.Init(global_batch_size=BATCH_SIZE)

File /usr/local/lib/python3.8/dist-packages/SparseOperationKit-1.1.2-py3.8-linux-x86_64.egg/sparse_operation_kit/core/initialize.py:237, in Init(**kwargs)
    234         return _horovod_init(**kwargs)
    235 else:
    236     # horovod not imported
--> 237     return _one_device_init(**kwargs)

File /usr/local/lib/python3.8/dist-packages/SparseOperationKit-1.1.2-py3.8-linux-x86_64.egg/sparse_operation_kit/core/initialize.py:198, in Init.<locals>._one_device_init(**kwargs)
    196 global_seed = kwargs.get("seed", None) or kit_lib.gen_random_seed()
    197 visible_devices = _get_visible_devices()
--> 198 status = kit_lib.plugin_init(local_rank, 1, unique_id, global_seed, visible_devices,
    199                              global_batch_size=kwargs["global_batch_size"])
    200 return status

File <string>:1455, in plugin_init(global_replica_id, num_replicas_in_sync, nccl_unique_id, global_seed, visible_devices, global_batch_size, name)

File /usr/local/lib/python3.8/dist-packages/tensorflow/python/framework/ops.py:7107, in raise_from_not_ok_status(e, name)
   7105 def raise_from_not_ok_status(e, name):
   7106   e.message += (" name: " + name if name is not None else "")
-> 7107   raise core._status_to_exception(e) from None

AbortedError: /workspace/build-env/sparse_operation_kit/kit_cc/kit_cc_infra/src/resources/cpu_resource.cc:47 Intra-process barrier blocking threads time out. [Op:PluginInit]

Metadata

Metadata

Assignees

No one assigned

    Labels

    P1 (Priority 1), bug (Something isn't working)

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions