From 0b783c7e1695f50b812cbae725f7c7913efd41a1 Mon Sep 17 00:00:00 2001 From: Balazs Gibizer Date: Tue, 30 Sep 2025 16:36:34 +0200 Subject: [PATCH] Reproduce GET a_c slowness bug/2126751 In certain RP tree configurations a GET a_c request takes an excessive amount of time even if max_allocation_candidates is set to a smaller number. E.g. with 8 PCI RPs with one resource each and a request with 8 groups of one resource each and having max_allocation_candidates set to 1k the GET a_c query can take 12 seconds. If max_allocation_candidates is set to 10k then it takes so much time that I just killed it. Related-Bug: #2126751 Signed-off-by: Balazs Gibizer Change-Id: I186612a72c52216fa80fc7015a317505b7779e7d Signed-off-by: Balazs Gibizer --- placement/objects/allocation_candidate.py | 11 ++ .../functional/test_allocation_candidates.py | 106 ++++++++++++++++++ 2 files changed, 117 insertions(+) diff --git a/placement/objects/allocation_candidate.py b/placement/objects/allocation_candidate.py index 341fa6acb..ab9e7fda9 100644 --- a/placement/objects/allocation_candidate.py +++ b/placement/objects/allocation_candidate.py @@ -13,6 +13,7 @@ import collections import copy import itertools +import time import os_traits from oslo_log import log as logging @@ -782,6 +783,9 @@ def _merge_candidates(candidates, rw_ctx): all_suffixes = set(candidates) num_granular_groups = len(all_suffixes - set([''])) max_a_c = rw_ctx.config.placement.max_allocation_candidates + + dropped = 0 + start = time.monotonic() for areq_list in _generate_areq_lists( rw_ctx, areq_lists_by_anchor, all_suffixes ): @@ -816,8 +820,15 @@ def _merge_candidates(candidates, rw_ctx): # now exceeds capacity where amounts of the same RP+RC were # folded together. So do a final capacity check/filter. 
if rw_ctx.exceeds_capacity(areq): + dropped += 1 continue areqs.add(areq) + if len(areqs) == 1: + LOG.warn( + "Found the first valid candidate in %.2f secs and " + "dropped %d invalid ones", time.monotonic() - start, dropped) + start = time.monotonic() + dropped = 0 if max_a_c >= 0 and len(areqs) >= max_a_c: break diff --git a/placement/tests/functional/test_allocation_candidates.py b/placement/tests/functional/test_allocation_candidates.py index 032395c31..cdf2063a0 100644 --- a/placement/tests/functional/test_allocation_candidates.py +++ b/placement/tests/functional/test_allocation_candidates.py @@ -188,3 +188,109 @@ class TestWideTreeAllocationCandidateExplosion(base.TestCase): body2 = resp.text self.assertEqual(body1, body2) + + def test_many_non_viable_candidates_8_8(self): + # This simulates that we have a single resource per RP (in this case + # one VF, but it could be one PF resource, it does not matter). We have + # many RPs and we request many groups of one resource. This creates a + # situation where even if the number of candidates is limited by + # max_allocation_candidates the algorithm generates a lot of invalid + # candidates that need to be filtered out which takes excessive time. + # + # We have 8 RPs with 1 resource, and we request 8 groups with + # 1 resource. + # Placement will generate an initial candidate matrix by satisfying + # each group independently (G is request group, R is RP): + # + # G1: [R1, R2,..., R8] + # G2: [R1, R2,..., R8] + # ... + # G8: [R1, R2,..., R8] + # + # Then it creates all the possible combinations and checks if they are + # valid (C is candidate, G1-R1 means G1 group satisfied from R1 RP): + # C1: [G1-R1, G2-R1, ..., G8-R1] # invalid R1 has 1 res but C1 needs 8 + # C2: [G1-R1, G2-R1, ..., G8-R2] # invalid R1 has 1 res but C2 needs 7 + # ... 
+ # Cx: [G1-R1, G2-R2, ..., G8-R8] # valid each Rx has 1 res and + # # Cx asks for 1 res each + # + # So placement generates an excessive amount of invalid (and therefore + # later filtered) candidates before it finds the first valid one. + # The max_allocation_candidates check only applies to valid candidates + # so it cannot prevent the excessive runtime of generating candidates + # that turn out to be invalid. + # + # With the extra logging we see that the first valid Cx is: + # WARNING [placement.objects.allocation_candidate] Found the first + # valid candidate in 1.73 secs and dropped 342391 invalid ones + # + # If you bump this from 1000 to 10k max candidates then you will see a + # very long runtime. + # + # This runs in 12 seconds. + self.conf_fixture.conf.set_override( + "max_allocation_candidates", 1000, group="placement") + self._test_num_candidates_and_computes( + computes=1, pfs=8, vfs_per_pf=1, req_groups=8, req_res_per_group=1, + req_limit=1000, + expected_candidates=1000, expected_computes_with_candidates=1) + +# This is bug https://bugs.launchpad.net/placement/+bug/2126751; the below +# cases should run in a reasonable time +# +# def test_many_non_viable_candidates_21_8(self): +# # This runs for more than 120 seconds +# self.conf_fixture.conf.set_override( +# "max_allocation_candidates", 1000, group="placement") +# self._test_num_candidates_and_computes( +# computes=1, pfs=21, vfs_per_pf=1, req_groups=8, +# req_res_per_group=1, +# req_limit=1000, +# expected_candidates=1000, expected_computes_with_candidates=1) +# +# def test_many_non_viable_candidates_21_16(self): +# # This runs for more than 120 seconds +# self.conf_fixture.conf.set_override( +# "max_allocation_candidates", 1000, group="placement") +# self._test_num_candidates_and_computes( +# computes=1, pfs=21, vfs_per_pf=1, req_groups=16, +# req_res_per_group=1, +# req_limit=1000, +# expected_candidates=1000, expected_computes_with_candidates=1) +# +# def 
test_many_non_viable_candidates_21_21(self): +# # This runs for more than 120 seconds +# self.conf_fixture.conf.set_override( +# "max_allocation_candidates", 1000, group="placement") +# self._test_num_candidates_and_computes( +# computes=1, pfs=21, vfs_per_pf=1, req_groups=21, +# req_res_per_group=1, +# req_limit=1000, +# expected_candidates=1000, expected_computes_with_candidates=1) +# +# def test_many_non_viable_candidates_21_8_two_computes(self): +# # This runs for more than 120 seconds +# self.conf_fixture.conf.set_override( +# "max_allocation_candidates", 1000, group="placement") +# self.conf_fixture.conf.set_override( +# "allocation_candidates_generation_strategy", "breadth-first", +# group="placement") +# self._test_num_candidates_and_computes( +# computes=2, pfs=21, vfs_per_pf=1, req_groups=8, +# req_res_per_group=1, +# req_limit=1000, +# expected_candidates=1000, expected_computes_with_candidates=2) +# +# def test_many_non_viable_candidates_21_21_two_computes(self): +# # This runs for more than 120 seconds +# self.conf_fixture.conf.set_override( +# "max_allocation_candidates", 1000, group="placement") +# self.conf_fixture.conf.set_override( +# "allocation_candidates_generation_strategy", "breadth-first", +# group="placement") +# self._test_num_candidates_and_computes( +# computes=2, pfs=21, vfs_per_pf=1, req_groups=21, +# req_res_per_group=1, +# req_limit=1000, +# expected_candidates=1000, expected_computes_with_candidates=2)