Skip to content
This repository was archived by the owner on Sep 11, 2023. It is now read-only.

Commit 14bf545

Browse files
authored
Merge pull request #1285 from clonker/issue_1284
- Fixed the calculation of the default chunk size in `iterable.py` and passed the chunk size into estimation when using a pipeline. - DataInMemory now returns the dtype of the first array as output_type. - output_type now returns an instance of dtype rather than the class definition. - Fixes #1284
2 parents e64e2ed + 64a06cf commit 14bf545

File tree

10 files changed

+25
-19
lines changed

10 files changed

+25
-19
lines changed

doc/source/CHANGELOG.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ Changelog
1313

1414
- msm: Chapman Kolmogorov validator ensures there are no side effects on the tested model. #1255
1515
- datasets: Fix default values for kT to ensure integrator produces sane values. #1272, #1275
16+
- coordinates: fixed handling of default chunksize. #1284
1617

1718

1819
2.5.1 (02-17-2018)

pyemma/coordinates/data/_base/iterable.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -51,9 +51,11 @@ def _compute_default_cs(dim, itemsize, logger=None):
5151

5252
# TODO: consider rounding this to some cache size of CPU? e.g py-cpuinfo can obtain it.
5353
# if one time step is already bigger than max_memory, we set the chunksize to 1.
54-
max_elements = max(1, int(np.floor(max_bytes / (itemsize * dim))))
55-
assert max_elements * dim * itemsize <= max_bytes or max_elements == 1
56-
result = max(1, max_elements // dim)
54+
bytes_per_frame = itemsize * dim
55+
max_frames = max(1, int(np.floor(max_bytes / bytes_per_frame)))
56+
assert max_frames * dim * itemsize <= max_bytes or max_frames == 1, \
57+
"number of frames times dim times sizeof(dtype) should be smaller or equal than max_bytes"
58+
result = max_frames
5759

5860
assert result > 0
5961
if logger is not None:
@@ -77,7 +79,7 @@ def default_chunksize(self):
7779
self._default_chunksize = Iterable._FALLBACK_CHUNKSIZE
7880
else:
7981
self._default_chunksize = Iterable._compute_default_cs(self.dimension(),
80-
self.output_type()().itemsize, self.logger)
82+
self.output_type().itemsize, self.logger)
8183
return self._default_chunksize
8284

8385
@property
@@ -172,7 +174,7 @@ def _create_iterator(self, skip=0, chunk=0, stride=1, return_trajindex=True, col
172174

173175
def output_type(self):
174176
r""" By default transformers return single precision floats. """
175-
return np.float32
177+
return np.float32()
176178

177179
def __iter__(self):
178180
return self.iterator()

pyemma/coordinates/data/data_in_memory.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,9 @@ def _add_array_to_storage(self, array):
106106

107107
self.data.append(array)
108108

109+
def output_type(self):
110+
return self.data[0].dtype
111+
109112
def _set_dimensions_and_lenghts(self):
110113
# number of trajectories/data sets
111114
self._ntraj = len(self.data)

pyemma/coordinates/pipelines.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,15 +49,16 @@ def __init__(self, chain, chunksize=None, param_stride=1):
4949
omit every n'th data point
5050
5151
"""
52-
self._chain = []
53-
self.chunksize = chunksize
5452
self.param_stride = param_stride
55-
self.chunksize = chunksize
53+
self._chunksize = chunksize
5654

5755
# add given elements in chain
56+
self._chain = []
5857
for e in chain:
5958
self.add_element(e)
6059

60+
self.chunksize = chunksize
61+
6162
self._estimated = False
6263

6364
@property
@@ -141,7 +142,7 @@ def parametrize(self):
141142
"""
142143
for element in self._chain:
143144
if not element.is_reader and not element._estimated:
144-
element.estimate(element.data_producer, stride=self.param_stride)
145+
element.estimate(element.data_producer, stride=self.param_stride, chunksize=self.chunksize)
145146

146147
self._estimated = True
147148

pyemma/coordinates/tests/test_coordinates_iterator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ class TestCoordinatesIterator(unittest.TestCase):
1414

1515
@classmethod
1616
def setUpClass(cls):
17-
cls.d = [np.random.random((100, 3)) for _ in range(3)]
17+
cls.d = [np.random.random((100, 3)).astype(np.float32) for _ in range(3)]
1818

1919
def setUp(self):
2020
self.tempdir = tempfile.mktemp()

pyemma/coordinates/tests/test_datainmemory.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,8 @@
3838
class TestDataInMemory(unittest.TestCase):
3939
@classmethod
4040
def setUpClass(cls):
41-
d = np.random.random((100, 3))
42-
d_1d = np.random.random(100)
41+
d = np.random.random((100, 3)).astype(np.float32)
42+
d_1d = np.random.random(100).astype(np.float32)
4343

4444
cls.d = d
4545
cls.d_1d = d_1d
@@ -253,7 +253,7 @@ def test_lagged_iterator_1d(self):
253253
err_msg="failed for traj=%s" % idx)
254254

255255
def test_lagged_stridden_access(self):
256-
data = np.random.random((1000, 2))
256+
data = np.random.random((1000, 2)).astype(np.float32)
257257
reader = DataInMemory(data)
258258
strides = [2, 3, 5, 7, 15]
259259
lags = [1, 3, 7, 10, 30]

pyemma/coordinates/tests/test_pca.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ def test_number_of_trajectories(self):
153153
assert self.pca_obj.number_of_trajectories() == 1
154154

155155
def test_output_type(self):
156-
assert self.pca_obj.output_type() == np.float32
156+
assert self.pca_obj.output_type() == np.float32()
157157

158158
def test_trajectory_length(self):
159159
assert self.pca_obj.trajectory_length(0) == self.T

pyemma/coordinates/tests/test_source.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ def test_number_of_trajectories(self):
159159
self.inp.number_of_trajectories() == 1
160160

161161
def test_output_type(self):
162-
assert self.inp.output_type() == np.float32
162+
assert self.inp.output_type() == np.float32()
163163

164164
def test_topfile(self):
165165
types.is_string(self.inp.topfile)

pyemma/coordinates/tests/test_sources_merger.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,8 @@ def _get_output_compare(self, joiner, stride=1, chunk=0, skip=0):
4444
for r in self.readers:
4545
for i, x in enumerate(r.get_output(stride=stride, chunk=chunk, skip=skip)):
4646
outs[i].append(x)
47-
combined = [np.hstack(outs[i]) for i in range(3)]
48-
np.testing.assert_equal(out, combined)
47+
combined = [np.hstack(outs[i]).astype(np.float32) for i in range(3)]
48+
np.testing.assert_equal([o.astype(np.float32) for o in out], combined)
4949

5050
def test_combined_output(self):
5151
j = SourcesMerger(self.readers)
@@ -57,7 +57,6 @@ def test_combined_output(self):
5757
def test_ra_stride(self):
5858
ra_indices = np.array([[0,7], [0, 23], [1, 30], [2, 9]])
5959
j = SourcesMerger(self.readers)
60-
6160
self._get_output_compare(j, stride=ra_indices)
6261

6362
def test_non_matching_lengths(self):

pyemma/coordinates/tests/test_tica.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -372,7 +372,7 @@ def test_number_of_trajectories(self):
372372
self.tica_obj.number_of_trajectories() == 1
373373

374374
def test_output_type(self):
375-
assert self.tica_obj.output_type() == np.float32
375+
assert self.tica_obj.output_type() == np.float32()
376376

377377
def test_trajectory_length(self):
378378
assert self.tica_obj.trajectory_length(0) == self.T

0 commit comments

Comments
 (0)