Merge "Reproduce GET a_c slowness bug/2126751"

Zuul (committed by Gerrit Code Review)
2025-10-09 17:27:22 +00:00
2 changed files with 117 additions and 0 deletions

@@ -13,6 +13,7 @@
import collections
import copy
import itertools
import time

import os_traits
from oslo_log import log as logging
@@ -782,6 +783,9 @@ def _merge_candidates(candidates, rw_ctx):
    all_suffixes = set(candidates)
    num_granular_groups = len(all_suffixes - set(['']))
    max_a_c = rw_ctx.config.placement.max_allocation_candidates
    dropped = 0
    start = time.monotonic()
    for areq_list in _generate_areq_lists(
        rw_ctx, areq_lists_by_anchor, all_suffixes
    ):
@@ -816,8 +820,15 @@ def _merge_candidates(candidates, rw_ctx):
        # now exceeds capacity where amounts of the same RP+RC were
        # folded together. So do a final capacity check/filter.
        if rw_ctx.exceeds_capacity(areq):
            dropped += 1
            continue
        areqs.add(areq)
        if len(areqs) == 1:
            LOG.warning(
                "Found the first valid candidate in %.2f secs and "
                "dropped %d invalid ones",
                time.monotonic() - start, dropped)
            start = time.monotonic()
            dropped = 0
        if max_a_c >= 0 and len(areqs) >= max_a_c:
            break
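
To put a number on what the new dropped/start counters measure, here is a minimal, self-contained sketch of the behaviour. It is an illustrative model, not the placement implementation: it assumes the generator walks the cross product of per-group RP choices in lexicographic order, and the exact match with the 342391 drops logged by the test below suggests the model is close. With one resource per RP, only all-distinct RP selections survive the capacity check, yet every combination before the first such selection has to be generated and dropped.

import itertools
import time


def count_drops_until_first_valid(num_rps, num_groups):
    # Model: each group can be satisfied by any RP; a candidate picks
    # one RP per group. With 1 resource per RP, a candidate is valid
    # only if no RP is picked twice (the stand-in for the
    # exceeds_capacity() check above).
    start = time.monotonic()
    dropped = 0
    for combo in itertools.product(range(num_rps), repeat=num_groups):
        if len(set(combo)) == num_groups:
            return dropped, time.monotonic() - start
        dropped += 1
    return dropped, time.monotonic() - start


# 8 RPs x 8 groups: 8**8 = 16,777,216 combinations, only 8! = 40,320 of
# them valid; the first valid one is reached after 342391 drops.
print(count_drops_until_first_valid(8, 8))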

@@ -188,3 +188,109 @@ class TestWideTreeAllocationCandidateExplosion(base.TestCase):
        body2 = resp.text
        self.assertEqual(body1, body2)

    def test_many_non_viable_candidates_8_8(self):
        # This simulates that we have a single resource per RP (in this
        # case one VF, but it could be one PF resource, it does not
        # matter). We have many RPs and we request many groups of one
        # resource each. This creates a situation where, even if the
        # number of candidates is limited by max_allocation_candidates,
        # the algorithm generates a lot of invalid candidates that need
        # to be filtered out, which takes excessive time.
        #
        # We have 8 RPs with 1 resource each, and we request 8 groups
        # with 1 resource each.
        # Placement generates an initial candidate matrix by satisfying
        # each group independently (G is a request group, R is an RP):
        #
        # G1: [R1, R2, ..., R8]
        # G2: [R1, R2, ..., R8]
        # ...
        # G8: [R1, R2, ..., R8]
        #
        # Then it creates all the possible combinations and checks
        # whether they are valid (C is a candidate, G1-R1 means group
        # G1 is satisfied from RP R1):
        #
        # C1: [G1-R1, G2-R1, ..., G8-R1]  invalid: R1 has 1 res but C1
        #                                 needs 8 from it
        # C2: [G1-R1, G2-R1, ..., G8-R2]  invalid: R1 has 1 res but C2
        #                                 needs 7 from it
        # ...
        # Cx: [G1-R1, G2-R2, ..., G8-R8]  valid: each Rx has 1 res and
        #                                 Cx asks for 1 res from each
        #
        # So placement generates an excessive number of invalid (and
        # therefore later filtered) candidates before it finds the
        # first valid one. The max_allocation_candidates check only
        # applies to valid candidates, so it cannot prevent the
        # excessive runtime of generating candidates that turn out to
        # be invalid.
        #
        # With the extra logging we see when the first valid Cx is
        # found:
        # WARNING [placement.objects.allocation_candidate] Found the
        # first valid candidate in 1.73 secs and dropped 342391
        # invalid ones
        #
        # If you bump max_allocation_candidates from 1000 to 10k then
        # you will see a very long runtime.
        #
        # This test runs in about 12 seconds.
        self.conf_fixture.conf.set_override(
            "max_allocation_candidates", 1000, group="placement")
        self._test_num_candidates_and_computes(
            computes=1, pfs=8, vfs_per_pf=1, req_groups=8,
            req_res_per_group=1, req_limit=1000,
            expected_candidates=1000,
            expected_computes_with_candidates=1)
    # This is bug https://bugs.launchpad.net/placement/+bug/2126751;
    # the cases below should run in reasonable time.
    #
    # def test_many_non_viable_candidates_21_8(self):
    #     # This runs for more than 120 seconds
    #     self.conf_fixture.conf.set_override(
    #         "max_allocation_candidates", 1000, group="placement")
    #     self._test_num_candidates_and_computes(
    #         computes=1, pfs=21, vfs_per_pf=1, req_groups=8,
    #         req_res_per_group=1,
    #         req_limit=1000,
    #         expected_candidates=1000,
    #         expected_computes_with_candidates=1)
    #
    # def test_many_non_viable_candidates_21_16(self):
    #     # This runs for more than 120 seconds
    #     self.conf_fixture.conf.set_override(
    #         "max_allocation_candidates", 1000, group="placement")
    #     self._test_num_candidates_and_computes(
    #         computes=1, pfs=21, vfs_per_pf=1, req_groups=16,
    #         req_res_per_group=1,
    #         req_limit=1000,
    #         expected_candidates=1000,
    #         expected_computes_with_candidates=1)
    #
    # def test_many_non_viable_candidates_21_21(self):
    #     # This runs for more than 120 seconds
    #     self.conf_fixture.conf.set_override(
    #         "max_allocation_candidates", 1000, group="placement")
    #     self._test_num_candidates_and_computes(
    #         computes=1, pfs=21, vfs_per_pf=1, req_groups=21,
    #         req_res_per_group=1,
    #         req_limit=1000,
    #         expected_candidates=1000,
    #         expected_computes_with_candidates=1)
    #
    # def test_many_non_viable_candidates_21_8_two_computes(self):
    #     # This runs for more than 120 seconds
    #     self.conf_fixture.conf.set_override(
    #         "max_allocation_candidates", 1000, group="placement")
    #     self.conf_fixture.conf.set_override(
    #         "allocation_candidates_generation_strategy",
    #         "breadth-first", group="placement")
    #     self._test_num_candidates_and_computes(
    #         computes=2, pfs=21, vfs_per_pf=1, req_groups=8,
    #         req_res_per_group=1,
    #         req_limit=1000,
    #         expected_candidates=1000,
    #         expected_computes_with_candidates=2)
    #
    # def test_many_non_viable_candidates_21_21_two_computes(self):
    #     # This runs for more than 120 seconds
    #     self.conf_fixture.conf.set_override(
    #         "max_allocation_candidates", 1000, group="placement")
    #     self.conf_fixture.conf.set_override(
    #         "allocation_candidates_generation_strategy",
    #         "breadth-first", group="placement")
    #     self._test_num_candidates_and_computes(
    #         computes=2, pfs=21, vfs_per_pf=1, req_groups=21,
    #         req_res_per_group=1,
    #         req_limit=1000,
    #         expected_candidates=1000,
    #         expected_computes_with_candidates=2)
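
Back-of-the-envelope arithmetic, under the same lexicographic cross-product model as the sketch above, shows why the disabled 21-RP cases blow well past 120 seconds. (The two-compute variants use the breadth-first strategy, which enumerates in a different order, so the model only describes the default strategy.)

def drops_before_first_valid(num_rps, num_groups):
    # The first all-distinct combination is (0, 1, ..., num_groups - 1);
    # its position in lexicographic product order is the value of those
    # digits read as a base-num_rps number.
    return sum(d * num_rps ** (num_groups - 1 - i)
               for i, d in enumerate(range(num_groups)))


print(drops_before_first_valid(8, 8))   # 342391, matches the log above
print(drops_before_first_valid(21, 8))  # 94557148, about 276x the work

At the rate implied by the 8x8 log line (342391 drops in 1.73 seconds, about 200k drops per second), 94.5 million drops alone would take roughly eight minutes before the first valid candidate appears, which is consistent with these tests being disabled at the 120 second mark.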