From ec084de1894f277d21649844fbba85c141a266de Mon Sep 17 00:00:00 2001
From: Darrell Bishop
Date: Tue, 15 Jan 2013 08:32:07 -0800
Subject: [PATCH] Optimize the ring builder's _reassign_parts() method.

Another ring builder optimization.  Profiling revealed hotspots in many
calls to min() and list.sort() in _reassign_parts().  That method didn't
get exercised in my last optimization pass because that pass targeted a
rebalance where nothing really moved around.
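In sketch form, the fix keeps every per-tier list sorted as devices are
assigned parts, repairing each list with a couple of bisect operations
instead of re-sorting it or re-scanning it with min().  A minimal
standalone sketch of that idea (the names and keys are illustrative,
not the builder's, and unique sort keys are assumed, as the builder's
are):

    import bisect

    # Stand-ins for the builder's paired lists (tier2devs[tier] and
    # tier2sort_key[tier]): 'keys' mirrors 'items' and stays sorted,
    # so the hungriest item is always items[-1].
    items = ['dev-a', 'dev-b', 'dev-c']
    keys = [10, 20, 30]

    def reposition(item, old_key, new_key):
        # Two O(log n) bisects and two list splices replace a full
        # list.sort() or an O(n) min() scan per assignment.
        index = bisect.bisect_left(keys, old_key)   # current slot
        items.pop(index)
        keys.pop(index)
        new_index = bisect.bisect_left(keys, new_key)  # new slot
        items.insert(new_index, item)
        keys.insert(new_index, new_key)

    # "Assign a part" to the hungriest item; its key shrinks, so it moves.
    hungriest = items[-1]
    reposition(hungriest, keys[-1], 5)
    print(items)  # ['dev-c', 'dev-a', 'dev-b']

The patch applies the same pop-and-reinsert dance one level up, to
tier2children and tier2children_sort_key, so each tier's children also
stay ordered by the sort key of their hungriest drive.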
This time, I wrote a script which created a fresh ring, added a bunch
of devices, did the initial balance, deleted some devices, balanced,
and added some more back in.

Results from homebrew Python 2.7.3 on an OS X 10.8.2 MacBook Pro
(bare metal):

BEFORE:
Using part-power = 18, adding 600 devices, removing 100, then adding 300 more...
NOT Profiling to 'initial_balance.prof'
wall-time delta: 131.33s
NOT Profiling to 'deleting_200_rebalance.prof'
wall-time delta: 25.67s
NOT Profiling to 'first_rebalance.prof'
wall-time delta: 62.00s

AFTER:
Using part-power = 18, adding 600 devices, removing 100, then adding 300 more...
NOT Profiling to 'initial_balance.prof'
wall-time delta: 28.04s
NOT Profiling to 'deleting_200_rebalance.prof'
wall-time delta: 9.35s
NOT Profiling to 'first_rebalance.prof'
wall-time delta: 16.41s

The driver script I used is available here:
https://gist.github.com/adb982aec6f0709f1273
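In outline, it does roughly the following (a sketch against the
RingBuilder API, with made-up device parameters; see the gist for the
real thing):

    from swift.common.ring import RingBuilder

    # part_power=18, replicas=3, min_part_hours=1
    builder = RingBuilder(18, 3, 1)

    # Phase 1: add 600 devices and do the initial balance.
    for i in range(600):
        builder.add_dev({'id': i, 'zone': i % 4, 'weight': 100.0,
                         'ip': '10.0.%d.%d' % (i // 256, i % 256),
                         'port': 6000, 'device': 'sdb%d' % i, 'meta': ''})
    builder.rebalance()

    # Phase 2: remove 100 devices and rebalance.
    for i in range(100):
        builder.remove_dev(i)
    builder.pretend_min_part_hours_passed()
    builder.rebalance()

    # Phase 3: add 300 more devices and rebalance again.
    for i in range(600, 900):
        builder.add_dev({'id': i, 'zone': i % 4, 'weight': 100.0,
                         'ip': '10.0.%d.%d' % (i // 256, i % 256),
                         'port': 6000, 'device': 'sdb%d' % i, 'meta': ''})
    builder.pretend_min_part_hours_passed()
    builder.rebalance()

Each phase is presumably wrapped in a wall-clock timer, with the
profiler toggled off for the runs above (hence the "NOT Profiling"
lines).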
Change-Id: I17e270acb12b5e4d4bbb1e34d8867dea90678961
---
 swift/common/ring/builder.py | 120 ++++++++++++++++++++++++-----------
 1 file changed, 84 insertions(+), 36 deletions(-)

diff --git a/swift/common/ring/builder.py b/swift/common/ring/builder.py
index ba1f5b9207..e83c653ea0 100644
--- a/swift/common/ring/builder.py
+++ b/swift/common/ring/builder.py
@@ -641,65 +641,97 @@ class RingBuilder(object):
             sorted((d for d in self._iter_devs() if d['weight']),
                    key=lambda x: x['sort_key'])

-        tier2children = build_tier_tree(available_devs)
-
         tier2devs = defaultdict(list)
         tier2sort_key = defaultdict(list)
-        tiers_by_depth = defaultdict(set)
+        max_tier_depth = 0
         for dev in available_devs:
             for tier in tiers_for_dev(dev):
                 tier2devs[tier].append(dev)  # <-- starts out sorted!
                 tier2sort_key[tier].append(dev['sort_key'])
-                tiers_by_depth[len(tier)].add(tier)
+                if len(tier) > max_tier_depth:
+                    max_tier_depth = len(tier)
+
+        tier2children_sets = build_tier_tree(available_devs)
+        tier2children = defaultdict(list)
+        tier2children_sort_key = {}
+        tiers_list = [()]
+        depth = 1
+        while depth <= max_tier_depth:
+            new_tiers_list = []
+            for tier in tiers_list:
+                child_tiers = list(tier2children_sets[tier])
+                child_tiers.sort(key=lambda t: tier2sort_key[t][-1])
+                tier2children[tier] = child_tiers
+                tier2children_sort_key[tier] = map(
+                    lambda t: tier2sort_key[t][-1], child_tiers)
+                new_tiers_list.extend(child_tiers)
+            tiers_list = new_tiers_list
+            depth += 1

         for part, replace_replicas in reassign_parts:
             # Gather up what other tiers (zones, ip_ports, and devices) the
             # replicas not-to-be-moved are in for this part.
-            other_replicas = defaultdict(lambda: 0)
+            other_replicas = defaultdict(int)
+            unique_tiers_by_tier_len = defaultdict(set)
             for replica in xrange(self.replicas):
                 if replica not in replace_replicas:
                     dev = self.devs[self._replica2part2dev[replica][part]]
                     for tier in tiers_for_dev(dev):
                         other_replicas[tier] += 1
-
-            def find_home_for_replica(tier=(), depth=1):
-                # Order the tiers by how many replicas of this
-                # partition they already have.  Then, of the ones
-                # with the smallest number of replicas, pick the
-                # tier with the hungriest drive and then continue
-                # searching in that subtree.
-                #
-                # There are other strategies we could use here,
-                # such as hungriest-tier (i.e. biggest
-                # sum-of-parts-wanted) or picking one at random.
-                # However, hungriest-drive is what was used here
-                # before, and it worked pretty well in practice.
-                #
-                # Note that this allocator will balance things as
-                # evenly as possible at each level of the device
-                # layout. If your layout is extremely unbalanced,
-                # this may produce poor results.
-                candidate_tiers = tier2children[tier]
-                min_count = min(other_replicas[t] for t in candidate_tiers)
-                candidate_tiers = [t for t in candidate_tiers
-                                   if other_replicas[t] == min_count]
-                candidate_tiers.sort(
-                    key=lambda t: tier2sort_key[t][-1])
-
-                if depth == max(tiers_by_depth.keys()):
-                    return tier2devs[candidate_tiers[-1]][-1]
-
-                return find_home_for_replica(tier=candidate_tiers[-1],
-                                             depth=depth + 1)
+                        unique_tiers_by_tier_len[len(tier)].add(tier)

             for replica in replace_replicas:
-                dev = find_home_for_replica()
+                tier = ()
+                depth = 1
+                while depth <= max_tier_depth:
+                    # Order the tiers by how many replicas of this
+                    # partition they already have.  Then, of the ones
+                    # with the smallest number of replicas, pick the
+                    # tier with the hungriest drive and then continue
+                    # searching in that subtree.
+                    #
+                    # There are other strategies we could use here,
+                    # such as hungriest-tier (i.e. biggest
+                    # sum-of-parts-wanted) or picking one at random.
+                    # However, hungriest-drive is what was used here
+                    # before, and it worked pretty well in practice.
+                    #
+                    # Note that this allocator will balance things as
+                    # evenly as possible at each level of the device
+                    # layout. If your layout is extremely unbalanced,
+                    # this may produce poor results.
+                    #
+                    # This used to be a cute, recursive function, but it's been
+                    # unrolled for performance.
+                    candidate_tiers = tier2children[tier]
+                    candidates_with_replicas = \
+                        unique_tiers_by_tier_len[len(tier) + 1]
+                    if len(candidate_tiers) > len(candidates_with_replicas):
+                        # There exists at least one tier with 0 other replicas,
+                        # so work backward among the candidates, accepting the
+                        # first which isn't in other_replicas.
+                        #
+                        # This optimization is to avoid calling the min()
+                        # below, which is expensive if you've got thousands of
+                        # drives.
+                        for t in reversed(candidate_tiers):
+                            if other_replicas[t] == 0:
+                                tier = t
+                                break
+                    else:
+                        min_count = min(other_replicas[t]
+                                        for t in candidate_tiers)
+                        tier = (t for t in reversed(candidate_tiers)
+                                if other_replicas[t] == min_count).next()
+                    depth += 1
+                dev = tier2devs[tier][-1]
                 dev['parts_wanted'] -= 1
                 dev['parts'] += 1
                 old_sort_key = dev['sort_key']
                 new_sort_key = dev['sort_key'] = self._sort_key_for(dev)
                 for tier in tiers_for_dev(dev):
                     other_replicas[tier] += 1
+                    unique_tiers_by_tier_len[len(tier)].add(tier)

                     index = bisect.bisect_left(tier2sort_key[tier],
                                                old_sort_key)
@@ -711,6 +743,22 @@ class RingBuilder(object):
                     tier2devs[tier].insert(new_index, dev)
                     tier2sort_key[tier].insert(new_index, new_sort_key)

+                    # Now jiggle tier2children values to keep them sorted
+                    new_last_sort_key = tier2sort_key[tier][-1]
+                    parent_tier = tier[0:-1]
+                    index = bisect.bisect_left(
+                        tier2children_sort_key[parent_tier],
+                        old_sort_key)
+                    popped = tier2children[parent_tier].pop(index)
+                    tier2children_sort_key[parent_tier].pop(index)
+
+                    new_index = bisect.bisect_left(
+                        tier2children_sort_key[parent_tier],
+                        new_last_sort_key)
+                    tier2children[parent_tier].insert(new_index, popped)
+                    tier2children_sort_key[parent_tier].insert(
+                        new_index, new_last_sort_key)
+
                 self._replica2part2dev[replica][part] = dev['id']

         # Just to save memory and keep from accidental reuse.