Merge pull request #1285 from clonker/issue_1284

marscher · web-flow · commit 14bf545803a4 · 2018-04-10T12:40:17.000+02:00
- Fixed the calculation of the default chunk size in `iterable.py`; the chunk size is now passed into estimation when using a pipeline. - DataInMemory now returns the dtype of the first array as output_type. - output_type now returns an instance of dtype rather than the class definition. - Fixes #1284
diff --git a/doc/source/CHANGELOG.rst b/doc/source/CHANGELOG.rst
@@ -13,6 +13,7 @@ Changelog
 
 - msm: Chapman Kolmogorov validator ensures there are no side effects on the tested model. #1255
 - datasets: Fix default values for kT to ensure integrator produces sane values. #1272, #1275
+- coordinates: fixed handling of default chunksize. #1284
 
 
 2.5.1 (02-17-2018)
diff --git a/pyemma/coordinates/data/_base/iterable.py b/pyemma/coordinates/data/_base/iterable.py
@@ -51,9 +51,11 @@ def _compute_default_cs(dim, itemsize, logger=None):
 
         # TODO: consider rounding this to some cache size of CPU? e.g py-cpuinfo can obtain it.
         # if one time step is already bigger than max_memory, we set the chunksize to 1.
-        max_elements = max(1, int(np.floor(max_bytes / (itemsize * dim))))
-        assert max_elements * dim * itemsize <= max_bytes or max_elements == 1
-        result = max(1, max_elements // dim)
+        bytes_per_frame = itemsize * dim
+        max_frames = max(1, int(np.floor(max_bytes / bytes_per_frame)))
+        assert max_frames * dim * itemsize <= max_bytes or max_frames == 1, \
+            "number of frames times dim times sizeof(dtype) should be smaller or equal than max_bytes"
+        result = max_frames
 
         assert result > 0
         if logger is not None:
@@ -77,7 +79,7 @@ def default_chunksize(self):
                 self._default_chunksize = Iterable._FALLBACK_CHUNKSIZE
             else:
                 self._default_chunksize = Iterable._compute_default_cs(self.dimension(),
-                                                                       self.output_type()().itemsize, self.logger)
+                                                                       self.output_type().itemsize, self.logger)
         return self._default_chunksize
 
     @property
@@ -172,7 +174,7 @@ def _create_iterator(self, skip=0, chunk=0, stride=1, return_trajindex=True, col
 
     def output_type(self):
         r""" By default transformers return single precision floats. """
-        return np.float32
+        return np.float32()
 
     def __iter__(self):
         return self.iterator()
diff --git a/pyemma/coordinates/data/data_in_memory.py b/pyemma/coordinates/data/data_in_memory.py
@@ -106,6 +106,9 @@ def _add_array_to_storage(self, array):
 
         self.data.append(array)
 
+    def output_type(self):
+        return self.data[0].dtype
+
     def _set_dimensions_and_lenghts(self):
         # number of trajectories/data sets
         self._ntraj = len(self.data)
diff --git a/pyemma/coordinates/pipelines.py b/pyemma/coordinates/pipelines.py
@@ -49,15 +49,16 @@ def __init__(self, chain, chunksize=None, param_stride=1):
             omit every n'th data point
 
         """
-        self._chain = []
-        self.chunksize = chunksize
         self.param_stride = param_stride
-        self.chunksize = chunksize
+        self._chunksize = chunksize
 
         # add given elements in chain
+        self._chain = []
         for e in chain:
             self.add_element(e)
 
+        self.chunksize = chunksize
+
         self._estimated = False
 
     @property
@@ -141,7 +142,7 @@ def parametrize(self):
         """
         for element in self._chain:
             if not element.is_reader and not element._estimated:
-                element.estimate(element.data_producer, stride=self.param_stride)
+                element.estimate(element.data_producer, stride=self.param_stride, chunksize=self.chunksize)
 
         self._estimated = True
 
diff --git a/pyemma/coordinates/tests/test_coordinates_iterator.py b/pyemma/coordinates/tests/test_coordinates_iterator.py
@@ -14,7 +14,7 @@ class TestCoordinatesIterator(unittest.TestCase):
 
     @classmethod
     def setUpClass(cls):
-        cls.d = [np.random.random((100, 3)) for _ in range(3)]
+        cls.d = [np.random.random((100, 3)).astype(np.float32) for _ in range(3)]
 
     def setUp(self):
         self.tempdir = tempfile.mktemp()
diff --git a/pyemma/coordinates/tests/test_datainmemory.py b/pyemma/coordinates/tests/test_datainmemory.py
@@ -38,8 +38,8 @@
 class TestDataInMemory(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        d = np.random.random((100, 3))
-        d_1d = np.random.random(100)
+        d = np.random.random((100, 3)).astype(np.float32)
+        d_1d = np.random.random(100).astype(np.float32)
 
         cls.d = d
         cls.d_1d = d_1d
@@ -253,7 +253,7 @@ def test_lagged_iterator_1d(self):
                                     err_msg="failed for traj=%s" % idx)
 
     def test_lagged_stridden_access(self):
-        data = np.random.random((1000, 2))
+        data = np.random.random((1000, 2)).astype(np.float32)
         reader = DataInMemory(data)
         strides = [2, 3, 5, 7, 15]
         lags = [1, 3, 7, 10, 30]
diff --git a/pyemma/coordinates/tests/test_pca.py b/pyemma/coordinates/tests/test_pca.py
@@ -153,7 +153,7 @@ def test_number_of_trajectories(self):
         assert self.pca_obj.number_of_trajectories() == 1
 
     def test_output_type(self):
-        assert self.pca_obj.output_type() == np.float32
+        assert self.pca_obj.output_type() == np.float32()
 
     def test_trajectory_length(self):
         assert self.pca_obj.trajectory_length(0) == self.T
diff --git a/pyemma/coordinates/tests/test_source.py b/pyemma/coordinates/tests/test_source.py
@@ -159,7 +159,7 @@ def test_number_of_trajectories(self):
         self.inp.number_of_trajectories() == 1
 
     def test_output_type(self):
-        assert self.inp.output_type() == np.float32
+        assert self.inp.output_type() == np.float32()
 
     def test_topfile(self):
         types.is_string(self.inp.topfile)
diff --git a/pyemma/coordinates/tests/test_sources_merger.py b/pyemma/coordinates/tests/test_sources_merger.py
@@ -44,8 +44,8 @@ def _get_output_compare(self, joiner, stride=1, chunk=0, skip=0):
         for r in self.readers:
             for i, x in enumerate(r.get_output(stride=stride, chunk=chunk, skip=skip)):
                 outs[i].append(x)
-        combined = [np.hstack(outs[i]) for i in range(3)]
-        np.testing.assert_equal(out, combined)
+        combined = [np.hstack(outs[i]).astype(np.float32) for i in range(3)]
+        np.testing.assert_equal([o.astype(np.float32) for o in out], combined)
 
     def test_combined_output(self):
         j = SourcesMerger(self.readers)
@@ -57,7 +57,6 @@ def test_combined_output(self):
     def test_ra_stride(self):
         ra_indices = np.array([[0,7], [0, 23], [1, 30], [2, 9]])
         j = SourcesMerger(self.readers)
-
         self._get_output_compare(j, stride=ra_indices)
 
     def test_non_matching_lengths(self):
diff --git a/pyemma/coordinates/tests/test_tica.py b/pyemma/coordinates/tests/test_tica.py
@@ -372,7 +372,7 @@ def test_number_of_trajectories(self):
         self.tica_obj.number_of_trajectories() == 1
 
     def test_output_type(self):
-        assert self.tica_obj.output_type() == np.float32
+        assert self.tica_obj.output_type() == np.float32()
 
     def test_trajectory_length(self):
         assert self.tica_obj.trajectory_length(0) == self.T