Merge pull request #75 from MDAnalysis/issue-71-block-assign

kain88-de · web-flow · commit 6aa6c616f438 · 2018-10-28T09:13:08.000+01:00
balance block sizes (#71)
diff --git a/.travis.yml b/.travis.yml
@@ -15,7 +15,7 @@ env:
     # Set default python version to avoid repetition later
     - PYTHON_VERSION=3.6
     - MAIN_CMD="pytest"
-    - SETUP_CMD="pmda --pep8 -v --cov pmda"
+    - SETUP_CMD="pmda --pep8 --cov pmda"
     # mdanalysis develop from source (see below), which needs
     # minimal CONDA_MDANALYSIS_DEPENDENCIES
     #- CONDA_DEPENDENCIES="mdanalysis mdanalysistests dask joblib pytest-pep8 mock codecov cython hypothesis sphinx"
diff --git a/CHANGELOG b/CHANGELOG
@@ -13,7 +13,7 @@ The rules for this file:
   * release numbers follow "Semantic Versioning" http://semver.org
 
 ------------------------------------------------------------------------------
-xx/xx/18 VOD555, richardjgowers, iparask
+xx/xx/18 VOD555, richardjgowers, iparask, orbeckst
 
   * 0.2.0
 
@@ -23,6 +23,10 @@ Enhancements
   * add readonly_attributes context manager to ParallelAnalysisBase
   * add parallel implementation of Leaflet Finder (Issue #47)
 
+Fixes
+  * always distribute frames over blocks so that no empty blocks are
+    created ("balanced blocks", Issue #71)
+
 
 06/07/18 orbeckst
 
diff --git a/conftest.py b/conftest.py
@@ -11,6 +11,7 @@
 from dask import distributed, multiprocessing
 import pytest
 
+
 @pytest.fixture(scope="session", params=(1, 2))
 def client(tmpdir_factory, request):
     with tmpdir_factory.mktemp("dask_cluster").as_cwd():
diff --git a/docs/api.rst b/docs/api.rst
@@ -26,6 +26,7 @@ a single frame. If your need more flexibility you can use the
 
    api/parallel
    api/custom
+   api/util
 
 .. _pre-defined-analysis-tasks:
 
diff --git a/docs/api/util.rst b/docs/api/util.rst
@@ -0,0 +1,2 @@
+.. automodule:: pmda.util
+   :members:
diff --git a/pmda/parallel.py b/pmda/parallel.py
@@ -16,6 +16,8 @@
 """
 from __future__ import absolute_import, division
 from contextlib import contextmanager
+import warnings
+
 from six.moves import range
 
 import MDAnalysis as mda
@@ -24,7 +26,7 @@
 from joblib import cpu_count
 import numpy as np
 
-from .util import timeit
+from .util import timeit, make_balanced_slices
 
 
 class Timing(object):
@@ -313,26 +315,36 @@ def run(self,
         start, stop, step = self._trajectory.check_slice_indices(
             start, stop, step)
         n_frames = len(range(start, stop, step))
-        bsize = int(np.ceil(n_frames / float(n_blocks)))
+
+        if n_frames == 0:
+            warnings.warn("run() analyses no frames: check start/stop/step")
+        if n_frames < n_blocks:
+            warnings.warn("run() uses more blocks than frames: "
+                          "decrease n_blocks")
+
+        slices = make_balanced_slices(n_frames, n_blocks,
+                                      start=start, stop=stop, step=step)
 
         with timeit() as total:
             with timeit() as prepare:
                 self._prepare()
             time_prepare = prepare.elapsed
             blocks = []
             with self.readonly_attributes():
-                for b in range(n_blocks):
+                for bslice in slices:
                     task = delayed(
                         self._dask_helper, pure=False)(
-                            b * bsize * step + start,
-                            min(stop, (b + 1) * bsize * step + start),
-                            step,
+                            bslice,
                             self._indices,
                             self._top,
                             self._traj, )
                     blocks.append(task)
                 blocks = delayed(blocks)
                 res = blocks.compute(**scheduler_kwargs)
+            # hack to handle n_frames == 0 in this framework
+            if len(res) == 0:
+                # everything else wants list of block tuples
+                res = [([], [], [], 0)]
             self._results = np.asarray([el[0] for el in res])
             with timeit() as conclude:
                 self._conclude()
@@ -343,7 +355,7 @@ def run(self,
             np.array([el[3] for el in res]), time_prepare, conclude.elapsed)
         return self
 
-    def _dask_helper(self, start, stop, step, indices, top, traj):
+    def _dask_helper(self, bslice, indices, top, traj):
         """helper function to actually setup dask graph"""
         with timeit() as b_universe:
             u = mda.Universe(top, traj)
@@ -352,8 +364,12 @@ def _dask_helper(self, start, stop, step, indices, top, traj):
         res = []
         times_io = []
         times_compute = []
-        for i in range(start, stop, step):
+        # NOTE: bslice.stop cannot be None! Always make sure
+        #       that it comes from  _trajectory.check_slice_indices()!
+        for i in range(bslice.start, bslice.stop, bslice.step):
             with timeit() as b_io:
+                # explicit instead of 'for ts in u.trajectory[bslice]'
+                # so that we can get accurate timing.
                 ts = u.trajectory[i]
             with timeit() as b_compute:
                 res = self._reduce(res, self._single_frame(ts, agroups))
diff --git a/pmda/test/test_parallel.py b/pmda/test/test_parallel.py
@@ -78,6 +78,27 @@ def test_sub_frames(analysis, n_jobs):
     np.testing.assert_almost_equal(analysis.res, [10, 20, 30, 40])
 
 
+@pytest.mark.parametrize('n_jobs', (1, 2, 3))
+def test_no_frames(analysis, n_jobs):
+    """An empty frame selection warns and yields empty results and timings."""
+    # use the trajectory length as start so that run() is asked to begin
+    # at an index one past the last frame
+    universe = mda.Universe(analysis._top, analysis._traj)
+    past_end = universe.trajectory.n_frames
+    with pytest.warns(UserWarning):
+        analysis.run(start=past_end, stop=past_end + 1, n_jobs=n_jobs)
+
+    timing = analysis.timing
+    assert len(analysis.res) == 0
+    np.testing.assert_equal(analysis.res, [])
+    np.testing.assert_equal(timing.compute, [])
+    np.testing.assert_equal(timing.io, [])
+    assert timing.universe == 0
+
+
+def test_nframes_less_nblocks_warning(analysis):
+    """More blocks than frames warns but all frames are still analysed."""
+    # stop=2 selects two frames while four blocks are requested; match on
+    # the message so that an unrelated UserWarning cannot satisfy the test
+    with pytest.warns(UserWarning, match="more blocks than frames"):
+        analysis.run(stop=2, n_blocks=4, n_jobs=2)
+    assert len(analysis.res) == 2
+
+
 def test_scheduler(analysis, scheduler):
     analysis.run(scheduler=scheduler)
 
diff --git a/pmda/test/test_util.py b/pmda/test/test_util.py
@@ -8,14 +8,125 @@
 # Released under the GNU Public Licence, v2 or any higher version
 from __future__ import absolute_import
 
+from six.moves import range
+
+import pytest
+
 import time
-from numpy.testing import assert_almost_equal
+import numpy as np
+from numpy.testing import assert_almost_equal, assert_equal
 
-from pmda.util import timeit
+from pmda.util import timeit, make_balanced_slices
 
 
 def test_timeit():
     with timeit() as timer:
         time.sleep(1)
 
     assert_almost_equal(timer.elapsed, 1, decimal=2)
+
+
+@pytest.mark.parametrize("start", (None, 0, 1, 10))
+@pytest.mark.parametrize("n_frames,n_blocks,result", [
+    (5, 1, [slice(0, None, 1)]),
+    (5, 2, [slice(0, 3, 1), slice(3, None, 1)]),
+    (5, 3, [slice(0, 2, 1), slice(2, 4, 1), slice(4, None, 1)]),
+    (5, 4, [slice(0, 2, 1), slice(2, 3, 1), slice(3, 4, 1),
+            slice(4, None, 1)]),
+    (5, 5, [slice(0, 1, 1), slice(1, 2, 1), slice(2, 3, 1), slice(3, 4, 1),
+            slice(4, None, 1)]),
+    (10, 2, [slice(0, 5, 1), slice(5, None, 1)]),
+    (10, 3, [slice(0, 4, 1), slice(4, 7, 1), slice(7, None, 1)]),
+    (10, 7, [slice(0, 2, 1), slice(2, 4, 1), slice(4, 6, 1), slice(6, 7, 1),
+             slice(7, 8, 1), slice(8, 9, 1), slice(9, None, 1)]),
+])
+def test_make_balanced_slices_step1(n_frames, n_blocks, start, result, step=1):
+    """Compare against hand-written reference slices, shifted by ``start``."""
+    assert step in (None, 1), "This test can only test step None or 1"
+
+    # the reference slices are written for start == 0: shift start and stop
+    # by the requested offset and keep an open-ended stop (None) open
+    offset = 0 if start is None else start
+    expected = []
+    for ref in result:
+        shifted_stop = None if ref.stop is None else ref.stop + offset
+        expected.append(slice(ref.start + offset, shifted_stop, ref.step))
+
+    computed = make_balanced_slices(n_frames, n_blocks,
+                                    start=start, step=step)
+    assert_equal(computed, expected)
+
+
+def _test_make_balanced_slices(n_blocks, start, stop, step, scale):
+    """Check that the balanced slices reproduce all frames in balanced blocks.
+
+    A mock trajectory of ``scale * stop`` frame indices is sliced with
+    ``start:stop:step``; the slices returned by
+    :func:`make_balanced_slices` must (1) number exactly `n_blocks`,
+    (2) reassemble to the selected frames in order, and (3) have sizes
+    that differ by at most one frame (or, when there are fewer frames
+    than blocks, contain either one frame or none).
+
+    Parameters
+    ----------
+    n_blocks : int
+        number of blocks to distribute the frames over
+    start, stop, step : int or None
+        slice parameters selecting the frames of the mock trajectory
+    scale : int
+        the mock trajectory contains ``scale * stop`` frames
+    """
+    traj_frames = range(scale * stop)
+    frames = traj_frames[start:stop:step]
+    n_frames = len(frames)
+
+    slices = make_balanced_slices(n_frames, n_blocks,
+                                  start=start, stop=stop, step=step)
+
+    assert len(slices) == n_blocks
+
+    # assemble frames again by blocks and show that we have all
+    # the original frames; get the sizes of the blocks
+
+    block_frames = []
+    block_sizes = []
+    for bslice in slices:
+        bframes = traj_frames[bslice]
+        block_frames.extend(list(bframes))
+        block_sizes.append(len(bframes))
+    block_sizes = np.array(block_sizes)
+
+    # check that we have all the frames accounted for
+    assert_equal(np.asarray(block_frames), np.asarray(frames))
+
+    # check that the distribution is balanced
+    #
+    # NOTE: test ``.size == 0`` explicitly instead of ``not array``: the
+    #       truth value of an empty array is deprecated in NumPy and the
+    #       truth value of an array with more than one element raises
+    #       ValueError instead of failing the assertion with its message.
+    if n_frames >= n_blocks:
+        assert np.all(block_sizes > 0)
+        minsize = n_frames // n_blocks
+        unbalanced = np.setdiff1d(block_sizes, [minsize, minsize+1])
+        assert unbalanced.size == 0, \
+            "For n_blocks <= n_frames, block sizes are not balanced"
+    else:
+        # pathological case; we will have blocks with length 0
+        # and n_frames blocks with 1 frame
+        zero_blocks = block_sizes == 0
+        assert np.sum(zero_blocks) == n_blocks - n_frames
+        assert np.sum(~zero_blocks) == n_frames
+        not_single = np.setdiff1d(block_sizes[~zero_blocks], [1])
+        assert not_single.size == 0, \
+            "For n_blocks>n_frames, some blocks contain != 1 frame"
+
+
+@pytest.mark.parametrize('n_blocks', [1, 2, 3, 4, 5, 7, 10, 11])
+@pytest.mark.parametrize('start', [0, 1, 10])
+@pytest.mark.parametrize('stop', [11, 100, 256])
+@pytest.mark.parametrize('step', [None, 1, 2, 3, 5, 7])
+@pytest.mark.parametrize('scale', [1, 2])
+def test_make_balanced_slices(n_blocks, start, stop, step, scale):
+    """Run the shared balance check over a grid of slice parameters."""
+    # the helper only asserts; it has no result worth returning
+    _test_make_balanced_slices(n_blocks=n_blocks, start=start, stop=stop,
+                               step=step, scale=scale)
+
+
+def test_make_balanced_slices_step_gt_stop(n_blocks=2, start=None,
+                                           stop=5, step=6, scale=1):
+    """Run the shared balance check with a step larger than stop."""
+    # the helper only asserts; it has no result worth returning
+    _test_make_balanced_slices(n_blocks=n_blocks, start=start, stop=stop,
+                               step=step, scale=scale)
+
+
+@pytest.mark.parametrize('n_blocks', [1, 2])
+@pytest.mark.parametrize('start', [0, 10])
+@pytest.mark.parametrize('step', [None, 1, 2])
+def test_make_balanced_slices_empty(n_blocks, start, step):
+    """With zero frames an empty list of slices is returned."""
+    no_frames = 0
+    assert make_balanced_slices(no_frames, n_blocks,
+                                start=start, step=step) == []
+
+
+@pytest.mark.parametrize("n_frames,n_blocks,start,stop,step", [
+    (-1, 5, None, None, None),
+    (5, 0, None, None, None),
+    (5, -1, None, None, None),
+    (0, 0, None, None, None),
+    (-1, -1, None, None, None),
+    (5, 4, -1, None, None),
+    (0, 5, -1, None, None),
+    (5, 0, -1, None, None),
+    (5, 4, None, -1, None),
+    (5, 4, 3, 2, None),
+    (5, 4, None, None, -1),
+    (5, 4, None, None, 0),
+])
+def test_make_balanced_slices_ValueError(n_frames, n_blocks,
+                                         start, stop, step):
+    """Each parametrized set of invalid arguments must raise ValueError."""
+    slice_kwargs = dict(start=start, stop=stop, step=step)
+    with pytest.raises(ValueError):
+        make_balanced_slices(n_frames, n_blocks, **slice_kwargs)
diff --git a/pmda/util.py b/pmda/util.py

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+.. automodule:: pmda.util`
	`2`	`+ :members:`