diff options
Diffstat (limited to 'extras/rebalance.py')
| -rwxr-xr-x | extras/rebalance.py | 299 | 
1 files changed, 299 insertions, 0 deletions
#!/usr/bin/python
"""Compute a size-weighted DHT layout for the bricks of a GlusterFS volume.

Usage: rebalance.py [options] server volume [directory]

The script fetches the volfile from ``server``, mounts every subvolume of
the ``cluster/distribute`` translator separately in a temporary directory,
measures each brick, reads the existing ``trusted.glusterfs.dht`` ranges and
prints the xattr values for a layout whose ranges are proportional to the
brick sizes.  If ``directory`` is given, the matching ``setfattr`` commands
are printed as well (they are only printed, never executed).
"""

import atexit
import copy
import optparse
import os
import shutil
import string
import subprocess
import sys
import tempfile

# Name of the extended attribute that carries the hash range of a brick.
_LAYOUT_KEY = "trusted.glusterfs.dht"


# It's just more convenient to have named fields.
class Brick(object):
    """Book-keeping record for one mounted subvolume.

    Attributes:
        path: local directory the subvolume is mounted on.
        sv_name: name of the subvolume in the volfile.
        size: brick size in the unit chosen by the caller (0 until set).
        curr_size: width of the current hash range, 0 if there is none.
        good_size: width of the hash range the brick should get.
        r_start, r_end: inclusive hash range, None while no range is known.
    """

    def __init__(self, path, name):
        self.path = path
        self.sv_name = name
        self.size = 0
        self.curr_size = 0
        self.good_size = 0
        # Defined up front so bricks without a layout can be inspected
        # (e.g. by normalize) without raising AttributeError.
        self.r_start = None
        self.r_end = None

    def set_size(self, size):
        """Record the measured size of the brick."""
        self.size = size

    def set_range(self, rs, re):
        """Record the inclusive hash range [rs, re] and derive its width."""
        self.r_start = rs
        self.r_end = re
        self.curr_size = self.r_end - self.r_start + 1

    def __repr__(self):
        if self.curr_size:
            layout = "0x%x,0x%x)" % (self.r_start, self.r_end)
        else:
            layout = "-)"
        return "%s(%d,%s" % (self.path, self.size, layout)


def get_bricks(host, vol):
    """Return a readable text stream with the volfile of ``vol`` on ``host``.

    The stream is the standard output of
    ``gluster --remote-host=HOST system getspec VOL``.  The command is run
    without a shell, so ``host`` and ``vol`` are passed through verbatim.

    Raises:
        OSError: if the ``gluster`` executable cannot be started.
    """
    cmd = ["gluster", "--remote-host=%s" % host, "system", "getspec", vol]
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                            universal_newlines=True)
    return proc.stdout


def generate_stanza(vf, all_xlators, cur_subvol):
    """Write the volfile stanzas for ``cur_subvol`` and everything below it.

    Children are written before their parent, so each ``subvolumes`` line
    only refers to volumes that were already emitted.

    Args:
        vf: object with a ``write(str)`` method.
        all_xlators: only handed on to the recursive calls.
        cur_subvol: translator object providing ``name``, ``type``, ``opts``
            (a mapping of option name to value) and ``subvols`` (a sequence
            of translator objects of the same kind).
    """
    child_names = []
    for child in cur_subvol.subvols:
        generate_stanza(vf, all_xlators, child)
        child_names.append(child.name)
    vf.write("volume %s\n" % cur_subvol.name)
    vf.write("  type %s\n" % cur_subvol.type)
    for key, value in cur_subvol.opts.items():
        vf.write("  option %s %s\n" % (key, value))
    if child_names:
        vf.write("  subvolumes %s\n" % " ".join(child_names))
    vf.write("end-volume\n\n")


def mount_brick(localpath, all_xlators, dht_subvol):
    """Mount one subvolume on ``localpath`` using a generated volfile.

    The volfile is written to ``localpath + ".vol"``, the directory
    ``localpath`` is created and ``glusterfs -f VOLFILE LOCALPATH`` is run.
    The exit status of ``glusterfs`` is not checked.
    """
    # Generate a volfile.
    vf_name = localpath + ".vol"
    with open(vf_name, "w") as vf:
        generate_stanza(vf, all_xlators, dht_subvol)

    # Create a brick directory and mount the brick there.
    os.mkdir(localpath)
    subprocess.call(["glusterfs", "-f", vf_name, localpath])


def _parse_layout(line):
    """Return (start, end) parsed from a ``getfattr -e hex`` output line.

    The text after ``=`` is expected to be ``0x`` followed by at least 32 hex
    digits; digits 16-23 are read as the range start and 24-31 as the range
    end.  Returns None if the line does not have that shape.
    """
    value = line.rstrip().split("=")[1][2:]
    if len(value) < 32:
        return None
    try:
        return (int(value[16:24], 16), int(value[24:32], 16))
    except ValueError:
        return None


# We use the command-line tools because there's no getxattr support in the
# Python standard library (which is ridiculous IMO).  Adding the xattr package
# from PyPI would create a new and difficult dependency because the bits to
# satisfy it don't seem to exist in Fedora.  We already expect the command-line
# tools to be there, so it's safer just to rely on them.
#
# We might have to revisit this if we get as far as actually issuing millions
# of setxattr requests.  Even then, it might be better to do that part with a C
# program which has only a build-time dependency.
def get_range(brick):
    """Return the (start, end) hash range stored on ``brick``, or None.

    Runs ``getfattr -e hex -n trusted.glusterfs.dht BRICK`` with stderr
    discarded and parses the first output line for that attribute.  If the
    tool cannot be run, the attribute is absent or the value is malformed, a
    notice is printed and None is returned.
    """
    cmd = ["getfattr", "-e", "hex", "-n", _LAYOUT_KEY, brick]
    output = ""
    try:
        with open(os.devnull, "w") as devnull:
            proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                    stderr=devnull, universal_newlines=True)
            output = proc.communicate()[0]
    except OSError:
        # getfattr could not be started; reported below as a missing layout.
        pass

    prefix = _LAYOUT_KEY + "="
    for line in output.splitlines():
        if line.startswith(prefix):
            hash_range = _parse_layout(line)
            if hash_range is not None:
                return hash_range
            break
    print("could not get layout for %s (might be OK)" % brick)
    return None


def calc_sizes(bricks, total):
    """Set ``good_size`` on every brick so that the sizes add up to 2**32.

    Each brick with a non-zero ``size`` gets ``size / total`` of the 32-bit
    hash space, rounded down.  Whatever is lost to rounding is added to the
    first brick that received a share, or to ``bricks[0]`` if none did.

    Args:
        bricks: non-empty list of Brick objects; modified in place.
        total: sum of the ``size`` values of all bricks.
    """
    leftover = 1 << 32
    for b in bricks:
        if b.size:
            # Floor division keeps the result an integer on Python 3 too.
            b.good_size = (b.size << 32) // total
            leftover -= b.good_size
        else:
            b.good_size = 0
    if leftover:
        # Add the leftover to an old brick if we can.
        for b in bricks:
            if b.good_size:
                b.good_size += leftover
                break
        else:
            # Fine, just add it wherever.
            bricks[0].good_size += leftover


# Normalization means (a) sorting the bricks by r_start and (b) ensuring that
# there are no gaps.
def normalize(in_bricks):
    """Order the bricks that have a hash range and check for gaps.

    Starting at hash 0, the brick whose range begins at the current hash is
    picked repeatedly until the whole 32-bit space is covered.  If no brick
    starts at the current hash, "gap found" is printed and the process exits
    with status 1.

    Args:
        in_bricks: list of Brick objects; the list itself is left untouched.

    Returns:
        (bricks, used): a new list holding the chained bricks in hash order
        followed by all remaining bricks in their original order, and the
        number of chained bricks.
    """
    remaining = list(in_bricks)
    out_bricks = []
    curr_hash = 0
    while curr_hash < (1 << 32):
        for b in remaining:
            # Bricks without a layout have r_start None and never match.
            if b.r_start == curr_hash:
                out_bricks.append(b)
                remaining.remove(b)
                curr_hash = b.r_end + 1
                break
        else:
            print("gap found at 0x%08x" % curr_hash)
            sys.exit(1)
    return out_bricks + remaining, len(out_bricks)


def get_score(bricks):
    """Return how much of the hash space would stay on its current brick.

    The bricks are laid out back to back in list order, each ``good_size``
    wide.  For every brick that has a current range, the overlap between
    that range and its new range is added to the score.
    """
    score = 0
    curr_hash = 0
    for b in bricks:
        new_start = curr_hash
        curr_hash += b.good_size
        if not b.curr_size:
            continue
        new_end = curr_hash - 1
        max_start = max(new_start, b.r_start)
        min_end = min(new_end, b.r_end)
        if max_start <= min_end:
            score += min_end - max_start + 1
    return score


def main():
    """Command-line entry point; see the module docstring for usage."""
    # Only the entry point needs volfilter, so it is imported here; the
    # helper functions above can then be used without it.
    import volfilter

    my_usage = "%prog [options] server volume [directory]"
    parser = optparse.OptionParser(usage=my_usage)
    parser.add_option("-f", "--free-space", dest="free_space",
                      default=False, action="store_true",
                      help="use free space instead of total space")
    parser.add_option("-l", "--leave-mounted", dest="leave_mounted",
                      default=False, action="store_true",
                      help="leave subvolumes mounted")
    parser.add_option("-v", "--verbose", dest="verbose",
                      default=False, action="store_true",
                      help="verbose output")
    options, args = parser.parse_args()

    if len(args) not in (2, 3):
        parser.print_help()
        sys.exit(1)
    hostname, volname = args[:2]
    fix_dir = args[2] if len(args) == 3 else None

    # Make sure stuff gets cleaned up, even if there are exceptions.
    orig_dir = os.getcwd()
    work_dir = tempfile.mkdtemp()
    bricks = []

    def cleanup_workdir():
        os.chdir(orig_dir)
        if options.verbose:
            print("Cleaning up %s" % work_dir)
        for b in bricks:
            subprocess.call(["umount", b.path])
        # Removing the work directory while a brick is still mounted would
        # recurse into the brick and delete its contents.
        stuck = [b.path for b in bricks if os.path.ismount(b.path)]
        if stuck:
            print("not removing %s, still mounted: %s" % (
                work_dir, ", ".join(stuck)))
            return
        shutil.rmtree(work_dir)

    if not options.leave_mounted:
        atexit.register(cleanup_workdir)
    os.chdir(work_dir)

    # Mount each brick individually, so we can issue brick-specific calls.
    if options.verbose:
        print("Mounting subvolumes...")
    volfile_pipe = get_bricks(hostname, volname)
    all_xlators, last_xlator = volfilter.load(volfile_pipe)
    for dht_vol in all_xlators.values():
        if dht_vol.type == "cluster/distribute":
            break
    else:
        print("no DHT volume found")
        sys.exit(1)
    for index, sv in enumerate(dht_vol.subvols):
        lpath = "%s/brick%s" % (work_dir, index)
        mount_brick(lpath, all_xlators, sv)
        bricks.append(Brick(lpath, sv.name))
    if not bricks:
        print("no bricks")
        sys.exit(1)

    # Collect all of the sizes.
    if options.verbose:
        print("Collecting information...")
    total = 0
    for b in bricks:
        info = os.statvfs(b.path)
        # We want a standard unit even if different bricks use
        # different block sizes.  The size is chosen to avoid overflows
        # for very large bricks with very small block sizes, but also
        # accommodate filesystems which use very large block sizes to
        # cheat on benchmarks.
        blocksper100mb = 104857600 // info.f_bsize
        if options.free_space:
            size = info.f_bfree // blocksper100mb
        else:
            size = info.f_blocks // blocksper100mb
        if size <= 0:
            print("brick %s has invalid size %d" % (b.path, size))
            sys.exit(1)
        b.set_size(size)
        total += size

    # Collect all of the layout information.
    for b in bricks:
        hash_range = get_range(b.path)
        if hash_range is not None:
            rs, re = hash_range
            if rs > re:
                print("%s has backwards hash range" % b.path)
                sys.exit(1)
            b.set_range(rs, re)

    if options.verbose:
        print("Calculating new layouts...")
    calc_sizes(bricks, total)
    bricks, used = normalize(bricks)

    # We can't afford O(n!) here, but O(n^2) should be OK and the result
    # should be almost as good.
    while used < len(bricks):
        best_place = used
        best_score = get_score(bricks)
        for i in range(used):
            new_bricks = bricks[:]
            del new_bricks[used]
            new_bricks.insert(i, bricks[used])
            new_score = get_score(new_bricks)
            if new_score > best_score:
                best_place = i
                best_score = new_score
        if best_place != used:
            bricks.insert(best_place, bricks.pop(used))
        used += 1

    # Finalize whatever we decided on.
    curr_hash = 0
    for b in bricks:
        b.r_start = curr_hash
        curr_hash += b.good_size
        b.r_end = curr_hash - 1

    print("Here are the xattr values for your size-weighted layout:")
    for b in bricks:
        print("  %s: 0x0000000200000000%08x%08x" % (
            b.sv_name, b.r_start, b.r_end))

    if fix_dir:
        if options.verbose:
            print("Fixing layout for %s" % fix_dir)
        for b in bricks:
            value = "0x0000000200000000%08x%08x" % (b.r_start, b.r_end)
            path = "%s/%s" % (b.path, fix_dir)
            cmd = "setfattr -n trusted.glusterfs.dht -v %s %s" % (
                value, path)
            # The command is shown to the operator, not executed.
            print(cmd)

    if options.leave_mounted:
        print("The following subvolumes are still mounted:")
        for b in bricks:
            print("%s on %s" % (b.sv_name, b.path))
        print("Don't forget to clean up when you're done.")


if __name__ == "__main__":
    main()
