Merge "Reproduce GET a_c slowness bug/2126751"

Zuul (committed by Gerrit Code Review)
2025-10-09 17:27:22 +00:00
2 changed files with 117 additions and 0 deletions

@@ -13,6 +13,7 @@
import collections
import copy
import itertools
import time

import os_traits
from oslo_log import log as logging
@@ -782,6 +783,9 @@ def _merge_candidates(candidates, rw_ctx):
    all_suffixes = set(candidates)
    num_granular_groups = len(all_suffixes - set(['']))
    max_a_c = rw_ctx.config.placement.max_allocation_candidates
    dropped = 0
    start = time.monotonic()
    for areq_list in _generate_areq_lists(
        rw_ctx, areq_lists_by_anchor, all_suffixes
    ):
@@ -816,8 +820,15 @@ def _merge_candidates(candidates, rw_ctx):
        # now exceeds capacity where amounts of the same RP+RC were
        # folded together. So do a final capacity check/filter.
        if rw_ctx.exceeds_capacity(areq):
            dropped += 1
            continue
        areqs.add(areq)
        if len(areqs) == 1:
            LOG.warning(
                "Found the first valid candidate in %.2f secs and "
                "dropped %d invalid ones",
                time.monotonic() - start, dropped)
            start = time.monotonic()
            dropped = 0
        if max_a_c >= 0 and len(areqs) >= max_a_c:
            break
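
To put a number on what the new dropped/start counters measure, here is a minimal, self-contained sketch of the behaviour. It is an illustrative model, not the placement implementation: it assumes the generator walks the cross product of per-group RP choices in lexicographic order, and the exact match with the 342391 drops logged by the test below suggests the model is close. With one resource per RP, only all-distinct RP selections survive the capacity check, yet every combination before the first such selection has to be generated and dropped.

import itertools
import time


def count_drops_until_first_valid(num_rps, num_groups):
    # Model: each group can be satisfied by any RP; a candidate picks
    # one RP per group. With 1 resource per RP, a candidate is valid
    # only if no RP is picked twice (the stand-in for the
    # exceeds_capacity() check above).
    start = time.monotonic()
    dropped = 0
    for combo in itertools.product(range(num_rps), repeat=num_groups):
        if len(set(combo)) == num_groups:
            return dropped, time.monotonic() - start
        dropped += 1
    return dropped, time.monotonic() - start


# 8 RPs x 8 groups: 8**8 = 16,777,216 combinations, only 8! = 40,320 of
# them valid; the first valid one is reached after 342391 drops.
print(count_drops_until_first_valid(8, 8))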

@@ -188,3 +188,109 @@ class TestWideTreeAllocationCandidateExplosion(base.TestCase):
        body2 = resp.text
        self.assertEqual(body1, body2)

    def test_many_non_viable_candidates_8_8(self):
        # This simulates that we have a single resource per RP (in this
        # case one VF, but it could be one PF resource, it does not
        # matter). We have many RPs and we request many groups of one
        # resource each. This creates a situation where, even if the
        # number of candidates is limited by max_allocation_candidates,
        # the algorithm generates a lot of invalid candidates that need
        # to be filtered out, which takes excessive time.
        #
        # We have 8 RPs with 1 resource each, and we request 8 groups
        # with 1 resource each.
        # Placement generates an initial candidate matrix by satisfying
        # each group independently (G is a request group, R is an RP):
        #
        # G1: [R1, R2, ..., R8]
        # G2: [R1, R2, ..., R8]
        # ...
        # G8: [R1, R2, ..., R8]
        #
        # Then it creates all the possible combinations and checks
        # whether they are valid (C is a candidate, G1-R1 means group
        # G1 is satisfied from RP R1):
        #
        # C1: [G1-R1, G2-R1, ..., G8-R1]  invalid: R1 has 1 res but C1
        #                                 needs 8 from it
        # C2: [G1-R1, G2-R1, ..., G8-R2]  invalid: R1 has 1 res but C2
        #                                 needs 7 from it
        # ...
        # Cx: [G1-R1, G2-R2, ..., G8-R8]  valid: each Rx has 1 res and
        #                                 Cx asks for 1 res from each
        #
        # So placement generates an excessive number of invalid (and
        # therefore later filtered) candidates before it finds the
        # first valid one. The max_allocation_candidates check only
        # applies to valid candidates, so it cannot prevent the
        # excessive runtime of generating candidates that turn out to
        # be invalid.
        #
        # With the extra logging we see when the first valid Cx is
        # found:
        # WARNING [placement.objects.allocation_candidate] Found the
        # first valid candidate in 1.73 secs and dropped 342391
        # invalid ones
        #
        # If you bump max_allocation_candidates from 1000 to 10k then
        # you will see a very long runtime.
        #
        # This test runs in about 12 seconds.
        self.conf_fixture.conf.set_override(
            "max_allocation_candidates", 1000, group="placement")
        self._test_num_candidates_and_computes(
            computes=1, pfs=8, vfs_per_pf=1, req_groups=8,
            req_res_per_group=1, req_limit=1000,
            expected_candidates=1000,
            expected_computes_with_candidates=1)
    # This is bug https://bugs.launchpad.net/placement/+bug/2126751;
    # the cases below should run in reasonable time.
    #
    # def test_many_non_viable_candidates_21_8(self):
    #     # This runs for more than 120 seconds
    #     self.conf_fixture.conf.set_override(
    #         "max_allocation_candidates", 1000, group="placement")
    #     self._test_num_candidates_and_computes(
    #         computes=1, pfs=21, vfs_per_pf=1, req_groups=8,
    #         req_res_per_group=1,
    #         req_limit=1000,
    #         expected_candidates=1000,
    #         expected_computes_with_candidates=1)
    #
    # def test_many_non_viable_candidates_21_16(self):
    #     # This runs for more than 120 seconds
    #     self.conf_fixture.conf.set_override(
    #         "max_allocation_candidates", 1000, group="placement")
    #     self._test_num_candidates_and_computes(
    #         computes=1, pfs=21, vfs_per_pf=1, req_groups=16,
    #         req_res_per_group=1,
    #         req_limit=1000,
    #         expected_candidates=1000,
    #         expected_computes_with_candidates=1)
    #
    # def test_many_non_viable_candidates_21_21(self):
    #     # This runs for more than 120 seconds
    #     self.conf_fixture.conf.set_override(
    #         "max_allocation_candidates", 1000, group="placement")
    #     self._test_num_candidates_and_computes(
    #         computes=1, pfs=21, vfs_per_pf=1, req_groups=21,
    #         req_res_per_group=1,
    #         req_limit=1000,
    #         expected_candidates=1000,
    #         expected_computes_with_candidates=1)
    #
    # def test_many_non_viable_candidates_21_8_two_computes(self):
    #     # This runs for more than 120 seconds
    #     self.conf_fixture.conf.set_override(
    #         "max_allocation_candidates", 1000, group="placement")
    #     self.conf_fixture.conf.set_override(
    #         "allocation_candidates_generation_strategy",
    #         "breadth-first", group="placement")
    #     self._test_num_candidates_and_computes(
    #         computes=2, pfs=21, vfs_per_pf=1, req_groups=8,
    #         req_res_per_group=1,
    #         req_limit=1000,
    #         expected_candidates=1000,
    #         expected_computes_with_candidates=2)
    #
    # def test_many_non_viable_candidates_21_21_two_computes(self):
    #     # This runs for more than 120 seconds
    #     self.conf_fixture.conf.set_override(
    #         "max_allocation_candidates", 1000, group="placement")
    #     self.conf_fixture.conf.set_override(
    #         "allocation_candidates_generation_strategy",
    #         "breadth-first", group="placement")
    #     self._test_num_candidates_and_computes(
    #         computes=2, pfs=21, vfs_per_pf=1, req_groups=21,
    #         req_res_per_group=1,
    #         req_limit=1000,
    #         expected_candidates=1000,
    #         expected_computes_with_candidates=2)
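
Back-of-the-envelope arithmetic, under the same lexicographic cross-product model as the sketch above, shows why the disabled 21-RP cases blow well past 120 seconds. (The two-compute variants use the breadth-first strategy, which enumerates in a different order, so the model only describes the default strategy.)

def drops_before_first_valid(num_rps, num_groups):
    # The first all-distinct combination is (0, 1, ..., num_groups - 1);
    # its position in lexicographic product order is the value of those
    # digits read as a base-num_rps number.
    return sum(d * num_rps ** (num_groups - 1 - i)
               for i, d in enumerate(range(num_groups)))


print(drops_before_first_valid(8, 8))   # 342391, matches the log above
print(drops_before_first_valid(21, 8))  # 94557148, about 276x the work

At the rate implied by the 8x8 log line (342391 drops in 1.73 seconds, about 200k drops per second), 94.5 million drops alone would take roughly eight minutes before the first valid candidate appears, which is consistent with these tests being disabled at the 120 second mark.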