From f20e13f0b233753a1b7f1899d24ad03d0d801b68 Mon Sep 17 00:00:00 2001
From: Balazs Gibizer
Date: Fri, 6 Dec 2024 16:55:46 +0100
Subject: [PATCH] Add round-robin candidate generation strategy

The previous patch introduced the [placement]max_allocation_candidates
config option to limit the number of candidates generated for a single
query. If the number of generated allocation candidates is limited by that
config option, then it is possible to get candidates from a limited set of
root providers (computes, anchoring providers), as placement uses a
depth-first strategy, generating all candidates from the first root before
considering the next one.

To avoid unbalanced results this patch introduces a new config option
[placement]allocation_candidates_generation_strategy with the possible
values:

* depth-first, the original strategy that generates all candidates from the
  first root before moving to the next. This will be the default strategy
  for backward compatibility.

* breadth-first, a new strategy that generates candidates from the
  available roots in a round-robin fashion, taking one candidate from each
  root before taking the second candidate from the first root.

Closes-Bug: #2070257
Change-Id: Ib7a140374bc91cc9ab597d0923b0623f618ec32c
---
 placement/conf/placement.py                   | 32 ++++++++++
 placement/objects/allocation_candidate.py     | 21 ++++++-
 .../functional/test_allocation_candidates.py  | 52 ++++++++++++++-
 .../unit/objects/test_allocation_candidate.py | 57 +++++++++++++++++
 placement/tests/unit/test_util.py             | 28 +++++++++
 placement/util.py                             | 13 ++++
 ...it-and-strategy.yaml-e73796898163fb55.yaml | 63 +++++++++++++++++++
 7 files changed, 261 insertions(+), 5 deletions(-)
 create mode 100644 releasenotes/notes/bug-2070257-allocation-candidates-generation-limit-and-strategy.yaml-e73796898163fb55.yaml

diff --git a/placement/conf/placement.py b/placement/conf/placement.py
index ed0c325cc..507be1a3a 100644
--- a/placement/conf/placement.py
+++ b/placement/conf/placement.py
@@ -84,6 +84,38 @@ under the same root having inventory from the same resource class
 to tune this config option based on the memory available for the placement
 service and the client timeout setting on the client side. A good initial
 value could be around 100000.
+
+In a deployment with wide and symmetric provider trees we also recommend
+changing [placement]allocation_candidates_generation_strategy to
+breadth-first.
+"""),
+    cfg.StrOpt(
+        'allocation_candidates_generation_strategy',
+        default="depth-first",
+        choices=("depth-first", "breadth-first"),
+        help="""
+Defines the order in which placement visits viable root providers during
+allocation candidate generation:
+
+* depth-first, generates all candidates from the first viable root provider
+  before moving to the next.
+
+* breadth-first, generates candidates from viable roots in a round-robin
+  fashion, creating one candidate from each viable root before creating the
+  second candidate from the first root.
+
+If the deployment has wide and symmetric provider trees, i.e. there are
+multiple child providers under the same root having inventory from the same
+resource class (e.g. in case of nova's mdev GPU or PCI in Placement
+features), then the depth-first strategy with a max_allocation_candidates
+limit might produce candidates from a limited set of root providers. On the
+other hand, the breadth-first strategy ensures that candidates are returned
+from all viable roots in a balanced way.
+
+Both strategies produce the candidates in the API response in an undefined
+but deterministic order. That is, all things being equal, two requests for
+allocation candidates will return the same results in the same order; but
+no guarantees are made as to how that order is determined.
 """),
 ]
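The two options are meant to be used together in affected deployments. As an
illustrative placement.conf excerpt (the values are the examples suggested in
the help texts above, not defaults):

    [placement]
    # Cap the candidate generation work; -1, the default, means unlimited.
    max_allocation_candidates = 100000
    # Spread the capped generation across all viable roots.
    allocation_candidates_generation_strategy = breadth-first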
diff --git a/placement/objects/allocation_candidate.py b/placement/objects/allocation_candidate.py
index 96604c22f..341fa6acb 100644
--- a/placement/objects/allocation_candidate.py
+++ b/placement/objects/allocation_candidate.py
@@ -24,6 +24,7 @@ from placement import exception
 from placement.objects import research_context as res_ctx
 from placement.objects import resource_provider as rp_obj
 from placement.objects import trait as trait_obj
+from placement import util


 _ALLOC_TBL = models.Allocation.__table__
@@ -718,9 +719,21 @@ def _get_areq_list_generators(areq_lists_by_anchor, all_suffixes):
     ]


-def _generate_areq_lists(areq_lists_by_anchor, all_suffixes):
+def _generate_areq_lists(rw_ctx, areq_lists_by_anchor, all_suffixes):
+    strategy = (
+        rw_ctx.config.placement.allocation_candidates_generation_strategy)
     generators = _get_areq_list_generators(areq_lists_by_anchor, all_suffixes)
-    return itertools.chain(*generators)
+    if strategy == "depth-first":
+        # Generates all solutions from the first anchor before moving to
+        # the next.
+        return itertools.chain(*generators)
+    if strategy == "breadth-first":
+        # Generates solutions from the anchors in a round-robin fashion, so
+        # the number of solutions generated is balanced across the viable
+        # anchors.
+        return util.roundrobin(*generators)
+
+    raise ValueError("Strategy '%s' not recognized" % strategy)


 # TODO(efried): Move _merge_candidates to rw_ctx?
@@ -769,7 +782,9 @@ def _merge_candidates(candidates, rw_ctx):
     all_suffixes = set(candidates)
     num_granular_groups = len(all_suffixes - set(['']))
     max_a_c = rw_ctx.config.placement.max_allocation_candidates
-    for areq_list in _generate_areq_lists(areq_lists_by_anchor, all_suffixes):
+    for areq_list in _generate_areq_lists(
+        rw_ctx, areq_lists_by_anchor, all_suffixes
+    ):
         # At this point, each AllocationRequest in areq_list is still
         # marked as use_same_provider. This is necessary to filter by group
         # policy, which enforces how these interact with each other.
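To illustrate the two strategies dispatched above, here is a minimal sketch
with two hypothetical per-anchor generators standing in for the output of
_get_areq_list_generators(); it is runnable against the patched tree, since
roundrobin is added to placement/util.py later in this patch:

    import itertools

    from placement import util

    def fake_generators():
        # Hypothetical stand-ins for the per-anchor candidate generators.
        return [
            iter(["root1-c1", "root1-c2"]),
            iter(["root2-c1", "root2-c2"]),
        ]

    # depth-first: exhaust the first anchor before touching the second.
    print(list(itertools.chain(*fake_generators())))
    # ['root1-c1', 'root1-c2', 'root2-c1', 'root2-c2']

    # breadth-first: one candidate per anchor per round, so a truncated
    # result still contains candidates from both anchors.
    print(list(util.roundrobin(*fake_generators())))
    # ['root1-c1', 'root2-c1', 'root1-c2', 'root2-c2']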
diff --git a/placement/tests/functional/test_allocation_candidates.py b/placement/tests/functional/test_allocation_candidates.py
index 037ebf68e..032395c31 100644
--- a/placement/tests/functional/test_allocation_candidates.py
+++ b/placement/tests/functional/test_allocation_candidates.py
@@ -32,6 +32,9 @@ class TestWideTreeAllocationCandidateExplosion(base.TestCase):
         self.conf_fixture.conf.set_override(
             "max_allocation_candidates", 100000, group="placement")
+        self.conf_fixture.conf.set_override(
+            "allocation_candidates_generation_strategy", "breadth-first",
+            group="placement")

     def create_tree(self, num_roots, num_child, num_res_per_child):
         self.roots = {}
@@ -108,11 +111,14 @@ class TestWideTreeAllocationCandidateExplosion(base.TestCase):
             expected_candidates=1000, expected_computes_with_candidates=2)

     def test_too_many_candidates_global_limit_is_hit_result_unbalanced(self):
+        self.conf_fixture.conf.set_override(
+            "allocation_candidates_generation_strategy", "depth-first",
+            group="placement")
         # With max_allocation_candidates set to 100k limit this test now
         # runs in reasonable time (10 sec on my machine), without that it would
         # time out.
-        # However, with the global limit in place only the first compute gets
-        # candidates.
+        # However, with the depth-first strategy and the global limit in
+        # place only the first compute gets candidates.
         # 524288 valid candidates, the generation stops at 100k candidates,
         # only 1000 is returned, result is unbalanced as the first 100k
         # candidate is always from the first compute.
         self._test_num_candidates_and_computes(
             computes=2, pfs=8, vfs_per_pf=8, req_groups=6, req_res_per_group=1,
             req_limit=1000,
             expected_candidates=1000, expected_computes_with_candidates=1)

+    def test_too_many_candidates_global_limit_is_hit_breadth_first_balanced(
+        self
+    ):
+        # With max_allocation_candidates set to 100k limit this test now
+        # runs in reasonable time (10 sec on my machine), without that it
+        # would time out.
+        # With the round-robin candidate generator in place the 100k
+        # generated candidates are now spread across both computes.
+        # 524288 valid candidates, the generation stops at 100k candidates,
+        # only 1000 is returned, result is balanced between the computes.
+        self._test_num_candidates_and_computes(
+            computes=2, pfs=8, vfs_per_pf=8, req_groups=6, req_res_per_group=1,
+            req_limit=1000,
+            expected_candidates=1000, expected_computes_with_candidates=2)
+
     def test_global_limit_hit(self):
         # 8192 possible candidates, global limit is set to 8000, higher request
         # limit so number of candidates are limited by the global limit
@@ -140,3 +161,30 @@ class TestWideTreeAllocationCandidateExplosion(base.TestCase):
             computes=2, pfs=8, vfs_per_pf=8, req_groups=4, req_res_per_group=1,
             req_limit=9000,
             expected_candidates=8192, expected_computes_with_candidates=2)
+
+    def test_breadth_first_strategy_generates_stable_ordering(self):
+        """Run the same query twice against the same two trees and assert
+        that the response text is exactly the same, proving that even with
+        the breadth-first strategy the candidate ordering is stable.
+        """
+
+        self.create_tree(num_roots=2, num_child=8, num_res_per_child=8)
+
+        def query():
+            return client.get(
+                self.get_candidate_query(
+                    num_groups=2, num_res=1,
+                    limit=1000),
+                headers=self.headers)
+
+        conf = self.conf_fixture.conf
+        with direct.PlacementDirect(conf) as client:
+            resp = query()
+            self.assertEqual(200, resp.status_code)
+            body1 = resp.text
+
+            resp = query()
+            self.assertEqual(200, resp.status_code)
+            body2 = resp.text
+
+            self.assertEqual(body1, body2)
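The stability asserted by the last test can also be seen at the generator
level. A small sketch with hypothetical data: rebuilding identical inputs
yields an identical sequence, since neither the input ordering nor the
round-robin algorithm involves any randomness:

    from placement import util

    def build():
        # Two "roots" with a deterministic candidate order each.
        return [iter(["r1-a", "r1-b"]), iter(["r2-a"])]

    first = list(util.roundrobin(*build()))
    second = list(util.roundrobin(*build()))
    assert first == second == ["r1-a", "r2-a", "r1-b"]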
+ """ + + self.create_tree(num_roots=2, num_child=8, num_res_per_child=8) + + def query(): + return client.get( + self.get_candidate_query( + num_groups=2, num_res=1, + limit=1000), + headers=self.headers) + + conf = self.conf_fixture.conf + with direct.PlacementDirect(conf) as client: + resp = query() + self.assertEqual(200, resp.status_code) + body1 = resp.text + + resp = query() + self.assertEqual(200, resp.status_code) + body2 = resp.text + + self.assertEqual(body1, body2) diff --git a/placement/tests/unit/objects/test_allocation_candidate.py b/placement/tests/unit/objects/test_allocation_candidate.py index dc63baf66..2b82175b1 100644 --- a/placement/tests/unit/objects/test_allocation_candidate.py +++ b/placement/tests/unit/objects/test_allocation_candidate.py @@ -97,3 +97,60 @@ class TestAllocationCandidatesNoDB(base.TestCase): for group in different_subtree: self.assertFalse( ac_obj._check_same_subtree(group, parent_by_rp)) + + @mock.patch('placement.objects.research_context._has_provider_trees', + new=mock.Mock(return_value=True)) + def _test_generate_areq_list(self, strategy, expected_candidates): + self.conf_fixture.conf.set_override( + "allocation_candidates_generation_strategy", strategy, + group="placement") + + rw_ctx = res_ctx.RequestWideSearchContext( + self.context, placement_lib.RequestWideParams(), True) + areq_lists_by_anchor = { + "root1": { + "": ["r1A", "r1B",], + "group1": ["r1g1A", "r1g1B",], + }, + "root2": { + "": ["r2A"], + "group1": ["r2g1A", "r2g1B"], + }, + "root3": { + "": ["r3A"], + }, + } + generator = ac_obj._generate_areq_lists( + rw_ctx, areq_lists_by_anchor, {"", "group1"}) + + self.assertEqual(expected_candidates, list(generator)) + + def test_generate_areq_lists_depth_first(self): + # Depth-first will generate all root1 candidates first then root2, + # root3 is ignored as it has no candidate for group1. + expected_candidates = [ + ('r1A', 'r1g1A'), + ('r1A', 'r1g1B'), + ('r1B', 'r1g1A'), + ('r1B', 'r1g1B'), + ('r2A', 'r2g1A'), + ('r2A', 'r2g1B'), + ] + self._test_generate_areq_list("depth-first", expected_candidates) + + @mock.patch('placement.objects.research_context._has_provider_trees', + new=mock.Mock(return_value=True)) + def test_generate_areq_lists_breadth_first(self): + # Breadth-first will take one candidate from root1 then root2 then goes + # back to root1 etc. Root2 runs out of candidates earlier than root1 so + # the last two candidates are both from root1. The root3 is still + # ignored as it has no candidates for group1. 
diff --git a/placement/tests/unit/test_util.py b/placement/tests/unit/test_util.py
index b6c28915d..92e6a317f 100644
--- a/placement/tests/unit/test_util.py
+++ b/placement/tests/unit/test_util.py
@@ -32,6 +32,7 @@ from placement.objects import resource_class as rc_obj
 from placement.objects import resource_provider as rp_obj
 from placement.tests.unit import base
 from placement import util
+from placement.util import roundrobin


 class TestCheckAccept(testtools.TestCase):
@@ -1450,3 +1451,30 @@ class RunOnceTests(testtools.TestCase):
         self.assertRaises(ValueError, f.reset)
         self.assertFalse(f.called)
         mock_clean.assert_called_once_with()
+
+
+class RoundRobinTests(testtools.TestCase):
+    def test_no_input(self):
+        self.assertEqual([], list(roundrobin()))
+
+    def test_single_input(self):
+        self.assertEqual([1, 2], list(roundrobin(iter([1, 2]))))
+
+    def test_balanced_inputs(self):
+        self.assertEqual(
+            [1, "x", 2, "y"],
+            list(roundrobin(
+                iter([1, 2]),
+                iter(["x", "y"]))
+            )
+        )
+
+    def test_unbalanced_inputs(self):
+        self.assertEqual(
+            ["A", "D", "E", "B", "F", "C"],
+            list(roundrobin(
+                iter("ABC"),
+                iter("D"),
+                iter("EF"))
+            )
+        )

diff --git a/placement/util.py b/placement/util.py
index 5ded33a9d..9220b6a88 100644
--- a/placement/util.py
+++ b/placement/util.py
@@ -614,3 +614,16 @@ def run_once(message, logger, cleanup=None):
         wrapper.reset = functools.partial(reset, wrapper)
         return wrapper
     return outer_wrapper
+
+
+def roundrobin(*iterables):
+    """roundrobin(iter('ABC'), iter('D'), iter('EF')) --> A D E B F C
+    Returns a new generator consuming items from the passed-in iterables in
+    a round-robin fashion.
+    It is adapted from
+    https://docs.python.org/3/library/itertools.html#itertools-recipes
+    """
+    iterators = map(iter, iterables)
+    for num_active in range(len(iterables), 0, -1):
+        iterators = itertools.cycle(itertools.islice(iterators, num_active))
+        yield from map(next, iterators)
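One property of this recipe worth noting: it is fully lazy, consuming the
input iterators only as the output is consumed. That is what lets a capped
consumer, such as the max_allocation_candidates loop in _merge_candidates,
bound the generation work. A standalone sketch using infinite inputs as a
stress case:

    import itertools

    from placement import util

    # Two infinite "anchors"; roundrobin interleaves them without ever
    # trying to exhaust either one.
    lazy = util.roundrobin(itertools.count(0), itertools.count(100))
    print(list(itertools.islice(lazy, 6)))
    # [0, 100, 1, 101, 2, 102]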
diff --git a/releasenotes/notes/bug-2070257-allocation-candidates-generation-limit-and-strategy.yaml-e73796898163fb55.yaml b/releasenotes/notes/bug-2070257-allocation-candidates-generation-limit-and-strategy.yaml-e73796898163fb55.yaml
new file mode 100644
index 000000000..c0f323415
--- /dev/null
+++ b/releasenotes/notes/bug-2070257-allocation-candidates-generation-limit-and-strategy.yaml-e73796898163fb55.yaml
@@ -0,0 +1,63 @@
+---
+fixes:
+  - |
+    In a deployment with wide and symmetric provider trees, i.e. where there
+    are multiple child providers under the same root having inventory from
+    the same resource class (e.g. in case of nova's mdev GPU or PCI in
+    Placement features), if the allocation candidate request asks for
+    resources from those child RPs in multiple request groups, then the
+    number of possible allocation candidates grows rapidly. E.g.:
+
+    * 1 root, 8 child RPs with 1 unit of resource each;
+      a_c requests 6 groups with 1 unit of resource each
+      => 8*7*6*5*4*3 = 20160 possible candidates
+
+    * 1 root, 8 child RPs with 6 units of resources each;
+      a_c requests 6 groups with 6 units of resources each
+      => 8^6 = 262144 possible candidates
+
+    Placement generates these candidates fully before applying the limit
+    parameter provided in the allocation candidate query, to be able to do
+    random sampling if ``[placement]randomize_allocation_candidates`` is
+    True.
+
+    Placement takes excessive time and memory to generate this number of
+    allocation candidates, and the client might time out waiting for the
+    response, or the Placement API service might run out of memory and
+    crash.
+
+    To avoid request timeouts or out of memory events, a new
+    ``[placement]max_allocation_candidates`` config option is implemented.
+    Unlike the ``limit`` query parameter, this limit is applied *during*
+    the candidate generation process, not after it. So this new option can
+    be used to limit the runtime and memory consumption of the Placement
+    API service.
+
+    The new config option defaults to ``-1``, meaning no limit, to keep the
+    legacy behavior. We suggest tuning this option in the affected
+    deployments based on the memory available for the Placement service and
+    the timeout setting of the clients. A good initial value could be
+    around ``100000``.
+
+    If the number of generated allocation candidates is limited by the
+    ``[placement]max_allocation_candidates`` config option, then it is
+    possible to get candidates from a limited set of root providers (e.g.
+    compute nodes), as placement uses a depth-first strategy, i.e. it
+    generates all candidates from the first root before considering the
+    next one. To avoid this issue a new config option
+    ``[placement]allocation_candidates_generation_strategy`` is introduced
+    with two possible values:
+
+    * ``depth-first``, generates all candidates from the first viable root
+      provider before moving to the next. This is the default, preserving
+      the legacy behavior.
+
+    * ``breadth-first``, generates candidates from viable roots in a
+      round-robin fashion, creating one candidate from each viable root
+      before creating the second candidate from the first root. This
+      strategy returns candidates from the viable roots in a balanced way.
+
+    In a deployment where ``[placement]max_allocation_candidates`` is
+    configured to a positive number we recommend setting
+    ``[placement]allocation_candidates_generation_strategy`` to
+    ``breadth-first``.
+
+    For more details see `Bug#2070257`_.
+
+    .. _Bug#2070257: https://bugs.launchpad.net/nova/+bug/2070257
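The candidate counts quoted in the release note check out; a quick
verification of the arithmetic (Python 3.8+ for math.perm):

    import math

    # 6 groups, 1 unit each, against 8 children with 1 unit each: ordered
    # selections of 6 distinct children.
    assert math.perm(8, 6) == 8 * 7 * 6 * 5 * 4 * 3 == 20160

    # 6 groups, 6 units each, against 8 children with 6 units each: 8
    # choices for each of the 6 groups, as quoted in the note.
    assert 8 ** 6 == 262144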