1 Swift

1.1 Ring builder


1.1.1 Add device

添加device的时候可以指定每个device的meta信息和replication ip,Swift会给每 个device一个id,这个id会一直递增,也就是说新增加的device的id总是比device的id 大,不重用id

if 'id' not in dev:
    dev['id'] = 0
    if self.devs:
        dev['id'] = max(d['id'] for d in self.devs if d) + 1
if dev['id'] < len(self.devs) and self.devs[dev['id']] is not None:
    raise exceptions.DuplicateDeviceError(
        'Duplicate device id: %d' % dev['id'])
# Add holes to self.devs to ensure self.devs[dev['id']] will be the dev
while dev['id'] >= len(self.devs):


return self.parts * self.replicas / \
    sum(d['weight'] for d in self._iter_devs())

在添加device的时候,会重新计算device所期望的Part(原有device需要减去已分配) ,这个值很关键,后面part和device的映射会根据这个值确定分配优先级( parts_wanted大优先级高)

for dev in self._iter_devs():
    if not dev['weight']:
        # With no weight, that means we wish to "drain" the device. So
        # we set the parts_wanted to a really large negative number to
        # indicate its strong desire to give up everything it has.
        dev['parts_wanted'] = -self.parts * self.replicas
        dev['parts_wanted'] = \
            int(weight_of_one_part * dev['weight']) - dev['parts']


1.1.2 Rebalance


def rebalance(self, seed=None):
    Rebalance the ring.

    This is the main work function of the builder, as it will assign and
    reassign partitions to devices in the ring based on weights, distinct
    zones, recent reassignments, etc.

    The process doesn't always perfectly assign partitions (that'd take a
    lot more analysis and therefore a lot more time -- I had code that did
    that before). Because of this, it keeps rebalancing until the device
    skew (number of partitions a device wants compared to what it has) gets
    below 1% or doesn't change by more than 1% (only happens with ring that
    can't be balanced no matter what -- like with 3 zones of differing
    weights with replicas set to 3).

    :returns: (number_of_partitions_altered, resulting_balance)

    if seed:
    self._ring = None
    if self._last_part_moves_epoch is None:
        self.devs_changed = False
        return self.parts, self.get_balance()
    retval = 0
    last_balance = 0
    new_parts, removed_part_count = self._adjust_replica2part2dev_size()
    retval += removed_part_count
    retval += len(new_parts)
    while True:
        reassign_parts = self._gather_reassign_parts()
        retval += len(reassign_parts)
        while self._remove_devs:
            self.devs[self._remove_devs.pop()['id']] = None
        balance = self.get_balance()
        if balance < 1 or abs(last_balance - balance) < 1 or \
                retval == self.parts:
        last_balance = balance
    self.devs_changed = False
    self.version += 1
    return retval, balance


def _reassign_parts(self, reassign_parts):
    For an existing ring data set, partitions are reassigned similarly to
    the initial assignment. The devices are ordered by how many partitions
    they still want and kept in that order throughout the process. The
    gathered partitions are iterated through, assigning them to devices
    according to the "most wanted" while keeping the replicas as "far
    apart" as possible. Two different regions are considered the
    farthest-apart things, followed by zones, then different ip/port pairs
    within a zone; the least-far-apart things are different devices with
    the same ip/port pair in the same zone.

    If you want more replicas than devices, you won't get all your

    :param reassign_parts: An iterable of (part, replicas_to_replace)
                           pairs. replicas_to_replace is an iterable of the
                           replica (an int) to replace for that partition.
                           replicas_to_replace may be shared for multiple
                           partitions, so be sure you do not modify it.
    for dev in self._iter_devs():
        dev['sort_key'] = self._sort_key_for(dev)

    available_devs = \
        sorted((d for d in self._iter_devs() if d['weight']),
               key=lambda x: x['sort_key'])

    tier2devs = defaultdict(list)
    tier2sort_key = defaultdict(list)
    max_tier_depth = 0
    for dev in available_devs:
        for tier in tiers_for_dev(dev):
            tier2devs[tier].append(dev)  # <-- starts out sorted!
            if len(tier) > max_tier_depth:
                max_tier_depth = len(tier)

    tier2children_sets = build_tier_tree(available_devs)
    tier2children = defaultdict(list)
    tier2children_sort_key = {}
    tiers_list = [()]
    depth = 1
    while depth <= max_tier_depth:
        new_tiers_list = []
        for tier in tiers_list:
            child_tiers = list(tier2children_sets[tier])
            child_tiers.sort(key=lambda t: tier2sort_key[t][-1])
            tier2children[tier] = child_tiers
            tier2children_sort_key[tier] = map(
                lambda t: tier2sort_key[t][-1], child_tiers)
        tiers_list = new_tiers_list
        depth += 1

    for part, replace_replicas in reassign_parts:
        # Gather up what other tiers (regions, zones, ip/ports, and
        # devices) the replicas not-to-be-moved are in for this part.
        other_replicas = defaultdict(int)
        unique_tiers_by_tier_len = defaultdict(set)
        for replica in self._replicas_for_part(part):
            if replica not in replace_replicas:
                dev = self.devs[self._replica2part2dev[replica][part]]
                for tier in tiers_for_dev(dev):
                    other_replicas[tier] += 1

        for replica in replace_replicas:
            tier = ()
            depth = 1
            while depth <= max_tier_depth:
                # Order the tiers by how many replicas of this
                # partition they already have. Then, of the ones
                # with the smallest number of replicas, pick the
                # tier with the hungriest drive and then continue
                # searching in that subtree.
                # There are other strategies we could use here,
                # such as hungriest-tier (i.e. biggest
                # sum-of-parts-wanted) or picking one at random.
                # However, hungriest-drive is what was used here
                # before, and it worked pretty well in practice.
                # Note that this allocator will balance things as
                # evenly as possible at each level of the device
                # layout. If your layout is extremely unbalanced,
                # this may produce poor results.
                # This used to be a cute, recursive function, but it's been
                # unrolled for performance.
                candidate_tiers = tier2children[tier]
                candidates_with_replicas = \
                    unique_tiers_by_tier_len[len(tier) + 1]
                if len(candidate_tiers) > len(candidates_with_replicas):
                    # There exists at least one tier with 0 other replicas,
                    # so work backward among the candidates, accepting the
                    # first which isn't in other_replicas.
                    # This optimization is to avoid calling the min()
                    # below, which is expensive if you've got thousands of
                    # drives.
                    for t in reversed(candidate_tiers):
                        if other_replicas[t] == 0:
                            tier = t
                    min_count = min(other_replicas[t]
                                    for t in candidate_tiers)
                    tier = (t for t in reversed(candidate_tiers)
                            if other_replicas[t] == min_count).next()
                depth += 1
            dev = tier2devs[tier][-1]
            dev['parts_wanted'] -= 1
            dev['parts'] += 1
            old_sort_key = dev['sort_key']
            #重新根据新的parts_wanted计算sort key
            new_sort_key = dev['sort_key'] = self._sort_key_for(dev)
            for tier in tiers_for_dev(dev):
                other_replicas[tier] += 1

                index = bisect.bisect_left(tier2sort_key[tier],

                new_index = bisect.bisect_left(tier2sort_key[tier],
                #用新的sort key插入到tier2devs中
                tier2devs[tier].insert(new_index, dev)
                tier2sort_key[tier].insert(new_index, new_sort_key)

                # Now jiggle tier2children values to keep them sorted
                new_last_sort_key = tier2sort_key[tier][-1]
                parent_tier = tier[0:-1]
                index = bisect.bisect_left(
                popped = tier2children[parent_tier].pop(index)

                new_index = bisect.bisect_left(
                tier2children[parent_tier].insert(new_index, popped)
                    new_index, new_last_sort_key)

            self._replica2part2dev[replica][part] = dev['id']

    # Just to save memory and keep from accidental reuse.
    for dev in self._iter_devs():
        del dev['sort_key']

3 Nova

3.1 Develop environment

  1. install package sudo apt-get install python-dev libssl-dev python-pip git-core libxml2-dev libxslt-dev libmysqld-dev
  2. get source code git clone https://github.com/openstack/nova.git nova-dev
  3. configure virtualenv1 easy_install Virtualenv cd nova-dev python tools/install_env.py source .venv/bin/activate
  4. run unit test export PYTHONPATH=$PYTHONPATH:/home/tacy/workspace/nova-dev ./run_tests.sh

blog comments powered by Disqus