From ec084de1894f277d21649844fbba85c141a266de Mon Sep 17 00:00:00 2001
From: Darrell Bishop
Date: Tue, 15 Jan 2013 08:32:07 -0800
Subject: [PATCH] Optimize the ring builder's _reassign_parts() method.

Another ring builder optimization.  Profiling revealed hotspots in many
calls to min() and list.sort() in _reassign_parts().  That method didn't
get exercised in my last optimization pass because that pass targeted a
rebalance where nothing really moved around.
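In sketch form, the fix keeps every per-tier list sorted as devices are
assigned parts, repairing each list with a couple of bisect operations
instead of re-sorting it or re-scanning it with min().  A minimal
standalone sketch of that idea (the names and keys are illustrative,
not the builder's, and unique sort keys are assumed, as the builder's
are):

    import bisect

    # Stand-ins for the builder's paired lists (tier2devs[tier] and
    # tier2sort_key[tier]): 'keys' mirrors 'items' and stays sorted,
    # so the hungriest item is always items[-1].
    items = ['dev-a', 'dev-b', 'dev-c']
    keys = [10, 20, 30]

    def reposition(item, old_key, new_key):
        # Two O(log n) bisects and two list splices replace a full
        # list.sort() or an O(n) min() scan per assignment.
        index = bisect.bisect_left(keys, old_key)   # current slot
        items.pop(index)
        keys.pop(index)
        new_index = bisect.bisect_left(keys, new_key)  # new slot
        items.insert(new_index, item)
        keys.insert(new_index, new_key)

    # "Assign a part" to the hungriest item; its key shrinks, so it moves.
    hungriest = items[-1]
    reposition(hungriest, keys[-1], 5)
    print(items)  # ['dev-c', 'dev-a', 'dev-b']

The patch applies the same pop-and-reinsert dance one level up, to
tier2children and tier2children_sort_key, so each tier's children also
stay ordered by the sort key of their hungriest drive.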
This time, I wrote a script which created a fresh ring, added a bunch
of devices, did the initial balance, deleted some devices, balanced,
and added some more back in.

Results from homebrew Python 2.7.3 on an OS X 10.8.2 MacBook Pro
(bare metal):

BEFORE:
Using part-power = 18, adding 600 devices, removing 100, then adding 300 more...
NOT Profiling to 'initial_balance.prof'
wall-time delta: 131.33s
NOT Profiling to 'deleting_200_rebalance.prof'
wall-time delta: 25.67s
NOT Profiling to 'first_rebalance.prof'
wall-time delta: 62.00s

AFTER:
Using part-power = 18, adding 600 devices, removing 100, then adding 300 more...
NOT Profiling to 'initial_balance.prof'
wall-time delta: 28.04s
NOT Profiling to 'deleting_200_rebalance.prof'
wall-time delta: 9.35s
NOT Profiling to 'first_rebalance.prof'
wall-time delta: 16.41s

The driver script I used is available here:
https://gist.github.com/adb982aec6f0709f1273
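In outline, it does roughly the following (a sketch against the
RingBuilder API, with made-up device parameters; see the gist for the
real thing):

    from swift.common.ring import RingBuilder

    # part_power=18, replicas=3, min_part_hours=1
    builder = RingBuilder(18, 3, 1)

    # Phase 1: add 600 devices and do the initial balance.
    for i in range(600):
        builder.add_dev({'id': i, 'zone': i % 4, 'weight': 100.0,
                         'ip': '10.0.%d.%d' % (i // 256, i % 256),
                         'port': 6000, 'device': 'sdb%d' % i, 'meta': ''})
    builder.rebalance()

    # Phase 2: remove 100 devices and rebalance.
    for i in range(100):
        builder.remove_dev(i)
    builder.pretend_min_part_hours_passed()
    builder.rebalance()

    # Phase 3: add 300 more devices and rebalance again.
    for i in range(600, 900):
        builder.add_dev({'id': i, 'zone': i % 4, 'weight': 100.0,
                         'ip': '10.0.%d.%d' % (i // 256, i % 256),
                         'port': 6000, 'device': 'sdb%d' % i, 'meta': ''})
    builder.pretend_min_part_hours_passed()
    builder.rebalance()

Each phase is presumably wrapped in a wall-clock timer, with the
profiler toggled off for the runs above (hence the "NOT Profiling"
lines).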
Change-Id: I17e270acb12b5e4d4bbb1e34d8867dea90678961
---
 swift/common/ring/builder.py | 120 ++++++++++++++++++++++++-----------
 1 file changed, 84 insertions(+), 36 deletions(-)

diff --git a/swift/common/ring/builder.py b/swift/common/ring/builder.py
index ba1f5b9207..e83c653ea0 100644
--- a/swift/common/ring/builder.py
+++ b/swift/common/ring/builder.py
@@ -641,65 +641,97 @@ class RingBuilder(object):
             sorted((d for d in self._iter_devs() if d['weight']),
                    key=lambda x: x['sort_key'])

-        tier2children = build_tier_tree(available_devs)
-
         tier2devs = defaultdict(list)
         tier2sort_key = defaultdict(list)
-        tiers_by_depth = defaultdict(set)
+        max_tier_depth = 0
         for dev in available_devs:
             for tier in tiers_for_dev(dev):
                 tier2devs[tier].append(dev)  # <-- starts out sorted!
                 tier2sort_key[tier].append(dev['sort_key'])
-                tiers_by_depth[len(tier)].add(tier)
+                if len(tier) > max_tier_depth:
+                    max_tier_depth = len(tier)
+
+        tier2children_sets = build_tier_tree(available_devs)
+        tier2children = defaultdict(list)
+        tier2children_sort_key = {}
+        tiers_list = [()]
+        depth = 1
+        while depth <= max_tier_depth:
+            new_tiers_list = []
+            for tier in tiers_list:
+                child_tiers = list(tier2children_sets[tier])
+                child_tiers.sort(key=lambda t: tier2sort_key[t][-1])
+                tier2children[tier] = child_tiers
+                tier2children_sort_key[tier] = map(
+                    lambda t: tier2sort_key[t][-1], child_tiers)
+                new_tiers_list.extend(child_tiers)
+            tiers_list = new_tiers_list
+            depth += 1

         for part, replace_replicas in reassign_parts:
             # Gather up what other tiers (zones, ip_ports, and devices) the
             # replicas not-to-be-moved are in for this part.
-            other_replicas = defaultdict(lambda: 0)
+            other_replicas = defaultdict(int)
+            unique_tiers_by_tier_len = defaultdict(set)
             for replica in xrange(self.replicas):
                 if replica not in replace_replicas:
                     dev = self.devs[self._replica2part2dev[replica][part]]
                     for tier in tiers_for_dev(dev):
                         other_replicas[tier] += 1
-
-            def find_home_for_replica(tier=(), depth=1):
-                # Order the tiers by how many replicas of this
-                # partition they already have.  Then, of the ones
-                # with the smallest number of replicas, pick the
-                # tier with the hungriest drive and then continue
-                # searching in that subtree.
-                #
-                # There are other strategies we could use here,
-                # such as hungriest-tier (i.e. biggest
-                # sum-of-parts-wanted) or picking one at random.
-                # However, hungriest-drive is what was used here
-                # before, and it worked pretty well in practice.
-                #
-                # Note that this allocator will balance things as
-                # evenly as possible at each level of the device
-                # layout. If your layout is extremely unbalanced,
-                # this may produce poor results.
-                candidate_tiers = tier2children[tier]
-                min_count = min(other_replicas[t] for t in candidate_tiers)
-                candidate_tiers = [t for t in candidate_tiers
-                                   if other_replicas[t] == min_count]
-                candidate_tiers.sort(
-                    key=lambda t: tier2sort_key[t][-1])
-
-                if depth == max(tiers_by_depth.keys()):
-                    return tier2devs[candidate_tiers[-1]][-1]
-
-                return find_home_for_replica(tier=candidate_tiers[-1],
-                                             depth=depth + 1)
+                        unique_tiers_by_tier_len[len(tier)].add(tier)

             for replica in replace_replicas:
-                dev = find_home_for_replica()
+                tier = ()
+                depth = 1
+                while depth <= max_tier_depth:
+                    # Order the tiers by how many replicas of this
+                    # partition they already have.  Then, of the ones
+                    # with the smallest number of replicas, pick the
+                    # tier with the hungriest drive and then continue
+                    # searching in that subtree.
+                    #
+                    # There are other strategies we could use here,
+                    # such as hungriest-tier (i.e. biggest
+                    # sum-of-parts-wanted) or picking one at random.
+                    # However, hungriest-drive is what was used here
+                    # before, and it worked pretty well in practice.
+                    #
+                    # Note that this allocator will balance things as
+                    # evenly as possible at each level of the device
+                    # layout. If your layout is extremely unbalanced,
+                    # this may produce poor results.
+                    #
+                    # This used to be a cute, recursive function, but it's been
+                    # unrolled for performance.
+                    candidate_tiers = tier2children[tier]
+                    candidates_with_replicas = \
+                        unique_tiers_by_tier_len[len(tier) + 1]
+                    if len(candidate_tiers) > len(candidates_with_replicas):
+                        # There exists at least one tier with 0 other replicas,
+                        # so work backward among the candidates, accepting the
+                        # first which isn't in other_replicas.
+                        #
+                        # This optimization is to avoid calling the min()
+                        # below, which is expensive if you've got thousands of
+                        # drives.
+                        for t in reversed(candidate_tiers):
+                            if other_replicas[t] == 0:
+                                tier = t
+                                break
+                    else:
+                        min_count = min(other_replicas[t]
+                                        for t in candidate_tiers)
+                        tier = (t for t in reversed(candidate_tiers)
+                                if other_replicas[t] == min_count).next()
+                    depth += 1
+                dev = tier2devs[tier][-1]
                 dev['parts_wanted'] -= 1
                 dev['parts'] += 1
                 old_sort_key = dev['sort_key']
                 new_sort_key = dev['sort_key'] = self._sort_key_for(dev)
                 for tier in tiers_for_dev(dev):
                     other_replicas[tier] += 1
+                    unique_tiers_by_tier_len[len(tier)].add(tier)

                     index = bisect.bisect_left(tier2sort_key[tier],
                                                old_sort_key)
@@ -711,6 +743,22 @@ class RingBuilder(object):
                     tier2devs[tier].insert(new_index, dev)
                     tier2sort_key[tier].insert(new_index, new_sort_key)

+                    # Now jiggle tier2children values to keep them sorted
+                    new_last_sort_key = tier2sort_key[tier][-1]
+                    parent_tier = tier[0:-1]
+                    index = bisect.bisect_left(
+                        tier2children_sort_key[parent_tier],
+                        old_sort_key)
+                    popped = tier2children[parent_tier].pop(index)
+                    tier2children_sort_key[parent_tier].pop(index)
+
+                    new_index = bisect.bisect_left(
+                        tier2children_sort_key[parent_tier],
+                        new_last_sort_key)
+                    tier2children[parent_tier].insert(new_index, popped)
+                    tier2children_sort_key[parent_tier].insert(
+                        new_index, new_last_sort_key)
+
                 self._replica2part2dev[replica][part] = dev['id']

         # Just to save memory and keep from accidental reuse.