#!/usr/bin/python
#
# path: root/extras/rebalance.py
# blob: 80c614c5dfea37552f48a7739dae91915a414e05

import atexit
import copy
import optparse
import os
import pipes
import shutil
import string
import subprocess
import sys
import tempfile
import volfilter

# It's just more convenient to have named fields.
class Brick(object):
        """Per-brick record used while computing a size-weighted layout.

        Attributes:
          path      -- local path the brick is mounted on
          sv_name   -- name of the subvolume this brick represents
          size      -- brick size as recorded by set_size() (0 until set)
          r_start   -- first hash value of the current range, or None
          r_end     -- last hash value of the current range, or None
          curr_size -- number of hash values in the current range (0 if no
                       range has been recorded)
          good_size -- number of hash values the brick should end up with
                       (0 until computed)
        """

        def __init__ (self, path, name):
                self.path = path
                self.sv_name = name
                self.size = 0
                # A brick has no range until set_range() is called.  Define
                # the attributes anyway so that code comparing r_start before
                # then sees None instead of raising AttributeError.
                self.r_start = None
                self.r_end = None
                self.curr_size = 0
                self.good_size = 0

        def set_size (self, size):
                """Record the brick's size."""
                self.size = size

        def set_range (self, rs, re):
                """Record the inclusive hash range [rs, re] currently held."""
                self.r_start = rs
                self.r_end = re
                self.curr_size = self.r_end - self.r_start + 1

        def __repr__ (self):
                # Format: path(size,0xSTART,0xEND), or path(size,-) when the
                # brick has no current range.
                if self.curr_size:
                        layout = "0x%x,0x%x)" % (self.r_start, self.r_end)
                else:
                        layout = "-)"
                return "%s(%d,%s" % (self.path, self.size, layout)

def get_bricks (host, vol):
        """Open a read pipe on the volfile for volume `vol` fetched from `host`.

        Runs "gluster --remote-host=<host> system getspec <vol>" through a
        pipes.Template and returns the readable file object attached to the
        command's standard output.
        """
        # Both values end up on a shell command line, so quote them.  Ordinary
        # host and volume names pass through pipes.quote() unchanged.
        cmd = "gluster --remote-host=%s system getspec %s" % (
                pipes.quote(host), pipes.quote(vol))
        t = pipes.Template()
        # ".-" marks a step that reads no input and writes to stdout.
        t.prepend(cmd,".-")
        return t.open(None,"r")

def generate_stanza (vf, all_xlators, cur_subvol):
        """Write volfile stanzas for cur_subvol and everything below it.

        Children are written first (depth-first), so every name listed on a
        "subvolumes" line has already been emitted when it is referenced.

        vf          -- object with a write() method receiving the text
        all_xlators -- passed through unchanged to the recursive calls; not
                       otherwise used here
        cur_subvol  -- translator object providing name, type, opts (a
                       mapping of option name to value) and subvols (a
                       sequence of child translators)
        """
        child_names = []
        for child in cur_subvol.subvols:
                generate_stanza(vf, all_xlators, child)
                child_names.append(child.name)
        vf.write("volume %s\n" % cur_subvol.name)
        vf.write("  type %s\n" % cur_subvol.type)
        # items() and str.join() replace the Python-2-only iteritems() and
        # string.join(); the output is the same.
        for key, value in cur_subvol.opts.items():
                vf.write("  option %s %s\n" % (key, value))
        if child_names:
                vf.write("  subvolumes %s\n" % " ".join(child_names))
        vf.write("end-volume\n\n")


def mount_brick (localpath, all_xlators, dht_subvol):
        """Mount one subvolume on its own at localpath.

        Writes a volfile describing dht_subvol (and everything below it) to
        "<localpath>.vol", creates the directory localpath, and runs
        "glusterfs -f <volfile> <localpath>" to mount it there.

        Raises whatever open() or os.mkdir() raise.  A non-zero exit status
        from glusterfs is reported on stderr but is not fatal.
        """

        # Generate a volfile.  The with-block closes the file even if
        # generate_stanza() raises part-way through.
        vf_name = localpath + ".vol"
        with open(vf_name,"w") as vf:
                generate_stanza(vf,all_xlators,dht_subvol)

        # Create a brick directory and mount the brick there.
        os.mkdir(localpath)
        status = subprocess.call(["glusterfs","-f",vf_name,localpath])
        if status != 0:
                # Don't stay silent: if nothing got mounted, later calls on
                # localpath would be looking at a plain empty directory.
                sys.stderr.write(
                        "warning: glusterfs exited with status %d for %s\n"
                        % (status, localpath))

# We use the command-line tools because there's no getxattr support in the
# Python standard library (which is ridiculous IMO).  Adding the xattr package
# from PyPI would create a new and difficult dependency because the bits to
# satisfy it don't seem to exist in Fedora.  We already expect the command-line
# tools to be there, so it's safer just to rely on them.
#
# We might have to revisit this if we get as far as actually issuing millions
# of setxattr requests.  Even then, it might be better to do that part with a C
# program which has only a build-time dependency.
def get_range (brick):
        """Return the (start, end) hash range stored on the directory `brick`.

        Runs "getfattr -e hex -n trusted.glusterfs.dht <brick>" and decodes
        hex digits 16-23 and 24-31 of the value (after the "0x" prefix) as
        the start and end of the range.

        Returns a (start, end) tuple of ints, or None - after printing a
        notice - when the attribute cannot be read or is too short to hold
        both fields.
        """
        # Pass an argument list so the path never goes through a shell, and
        # discard stderr exactly as the old "2> /dev/null" did.
        cmd = ["getfattr", "-e", "hex", "-n", "trusted.glusterfs.dht", brick]
        try:
                with open(os.devnull, "w") as devnull:
                        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                                stderr=devnull,
                                                universal_newlines=True)
                        output = proc.communicate()[0]
        except OSError:
                # getfattr could not be started at all; treat it the same as
                # "no attribute found".
                output = ""

        # Pick the first "trusted.glusterfs.dht=0x..." line and drop the
        # leading "0x" from its value.
        value = None
        for line in output.splitlines():
                if line.startswith("trusted.glusterfs.dht="):
                        value = line.rstrip().split('=')[1][2:]
                        break

        hash_range = None
        if value is not None and len(value) >= 32:
                try:
                        hash_range = (int(value[16:24], 16),
                                      int(value[24:32], 16))
                except ValueError:
                        hash_range = None
        if hash_range is None:
                print("could not get layout for %s (might be OK)" % brick)
        return hash_range

def calc_sizes (bricks, total):
        """Set good_size on every brick so the sizes add up to 1 << 32.

        Each brick with a non-zero size gets a share of the 32-bit hash space
        proportional to size / total, rounded down; bricks with size 0 get 0.
        Whatever is lost to rounding is added to the first brick that got a
        non-zero share, or to the first brick if none did.

        bricks -- sequence of objects with a `size` attribute; `good_size`
                  is assigned on each
        total  -- the value the sizes are measured against (the caller in
                  this file passes the sum of all sizes)
        """
        leftover = 1 << 32
        for b in bricks:
                if b.size:
                        # Floor division keeps hash values integral even
                        # where "/" means true division.
                        b.good_size = (b.size << 32) // total
                        leftover -= b.good_size
                else:
                        b.good_size = 0
        if not leftover or not bricks:
                return
        # Prefer a brick that already has a share.
        for b in bricks:
                if b.good_size:
                        b.good_size += leftover
                        return
        # Fine, just add it wherever.
        bricks[0].good_size += leftover

# Normalization means (a) sorting the bricks by r_start and (b) ensuring that
# there are no gaps.
def normalize (in_bricks):
        """Order bricks by hash range and check that the ranges leave no gap.

        Starting at hash 0, repeatedly picks the brick whose r_start equals
        the next expected hash value until the whole 32-bit space is covered.
        Bricks that were not picked (for example, ones with no range yet)
        are appended after the ordered ones.

        in_bricks -- list of brick objects; picked bricks are removed from
                     this list in place

        Returns (bricks, used): the reordered list and the number of bricks
        at its front that make up the contiguous chain.

        Prints a message and exits with status 1 if no brick starts at the
        next expected hash value.
        """
        out_bricks = []
        curr_hash = 0
        used = 0
        while curr_hash < (1<<32):
                for b in in_bricks:
                        # A brick that never had a range may not define
                        # r_start at all; it can never match, so treat it
                        # as None instead of raising AttributeError.
                        if getattr(b, "r_start", None) == curr_hash:
                                used += 1
                                out_bricks.append(b)
                                # Safe despite iterating: we break right away.
                                in_bricks.remove(b)
                                curr_hash = b.r_end + 1
                                break
                else:
                        print("gap found at 0x%08x" % curr_hash)
                        sys.exit(1)
        return out_bricks + in_bricks, used

def get_score (bricks):
        """Return how much of the hash space stays put under a brick order.

        Walks the bricks in the given order, handing each one a proposed
        range of good_size hash values directly after the previous brick's.
        For every brick that currently holds a range (curr_size non-zero),
        the size of the overlap between its current range [r_start, r_end]
        and its proposed range is added to the score.
        """
        total_overlap = 0
        next_start = 0
        for brick in bricks:
                proposed_start = next_start
                next_start += brick.good_size
                if brick.curr_size:
                        # Intersection of the proposed and current ranges.
                        lo = max(proposed_start, brick.r_start)
                        hi = min(next_start - 1, brick.r_end)
                        if lo <= hi:
                                total_overlap += hi - lo + 1
        return total_overlap

if __name__ == "__main__":

        # Usage: server and volume are required.  The optional third argument
        # names a directory, relative to each brick root, for which the
        # matching setfattr commands are printed at the end.
        # (This block is indented with spaces only; the original mixed tabs
        # and spaces.)
        my_usage = "%prog [options] server volume [directory]"
        parser = optparse.OptionParser(usage=my_usage)
        parser.add_option("-f", "--free-space", dest="free_space",
                          default=False, action="store_true",
                          help="use free space instead of total space")
        parser.add_option("-l", "--leave-mounted", dest="leave_mounted",
                          default=False, action="store_true",
                          help="leave subvolumes mounted")
        parser.add_option("-v", "--verbose", dest="verbose",
                          default=False, action="store_true",
                          help="verbose output")
        options, args = parser.parse_args()

        if len(args) == 3:
                fix_dir = args[2]
        else:
                if len(args) != 2:
                        parser.print_help()
                        sys.exit(1)
                fix_dir = None
        hostname, volname = args[:2]

        # Make sure stuff gets cleaned up, even if there are exceptions.
        orig_dir = os.getcwd()
        work_dir = tempfile.mkdtemp()
        bricks = []
        def cleanup_workdir ():
                # Leave the work directory before deleting it.
                os.chdir(orig_dir)
                if options.verbose:
                        print("Cleaning up %s" % work_dir)
                for b in bricks:
                        subprocess.call(["umount",b.path])
                shutil.rmtree(work_dir)
        if not options.leave_mounted:
                atexit.register(cleanup_workdir)
        os.chdir(work_dir)

        # Mount each brick individually, so we can issue brick-specific calls.
        if options.verbose:
                print("Mounting subvolumes...")
        index = 0
        volfile_pipe = get_bricks(hostname,volname)
        all_xlators, last_xlator = volfilter.load(volfile_pipe)
        # Use the first cluster/distribute translator found.
        for dht_vol in all_xlators.values():
                if dht_vol.type == "cluster/distribute":
                        break
        else:
                print("no DHT volume found")
                sys.exit(1)
        for sv in dht_vol.subvols:
                #print "found subvol %s" % sv.name
                lpath = "%s/brick%s" % (work_dir, index)
                index += 1
                mount_brick(lpath,all_xlators,sv)
                bricks.append(Brick(lpath,sv.name))
        if index == 0:
                print("no bricks")
                sys.exit(1)

        # Collect all of the sizes.
        if options.verbose:
                print("Collecting information...")
        total = 0
        for b in bricks:
                info = os.statvfs(b.path)
                # We want a standard unit even if different bricks use
                # different block sizes.  The size is chosen to avoid overflows
                # for very large bricks with very small block sizes, but also
                # accommodate filesystems which use very large block sizes to
                # cheat on benchmarks.
                #
                # f_blocks and f_bfree are counted in units of f_frsize, not
                # f_bsize; fall back to f_bsize only if f_frsize is reported
                # as zero.
                unit = info.f_frsize or info.f_bsize
                blocksper100mb = 104857600 // unit
                if options.free_space:
                        size = info.f_bfree // blocksper100mb
                else:
                        size = info.f_blocks // blocksper100mb
                if size <= 0:
                        print("brick %s has invalid size %d" % (b.path, size))
                        sys.exit(1)
                b.set_size(size)
                total += size

        # Collect all of the layout information.
        for b in bricks:
                hash_range = get_range(b.path)
                if hash_range is not None:
                        rs, re = hash_range
                        if rs > re:
                                print("%s has backwards hash range" % b.path)
                                sys.exit(1)
                        b.set_range(rs, re)

        if options.verbose:
                print("Calculating new layouts...")
        calc_sizes(bricks,total)
        # After this, bricks[:used] is the chain of bricks that already hold
        # a range; bricks[used:] still have to be placed.
        bricks, used = normalize(bricks)

        # We can't afford O(n!) here, but O(n^2) should be OK and the result
        # should be almost as good.
        while used < len(bricks):
                # Try the next unplaced brick at every position among the
                # placed ones; keep the position with the best score, or
                # leave it where it is if nothing beats that.
                best_place = used
                best_score = get_score(bricks)
                for i in range(used):
                        new_bricks = bricks[:]
                        del new_bricks[used]
                        new_bricks.insert(i,bricks[used])
                        new_score = get_score(new_bricks)
                        if new_score > best_score:
                                best_place = i
                                best_score = new_score
                if best_place != used:
                        nb = bricks[used]
                        del bricks[used]
                        bricks.insert(best_place,nb)
                used += 1

        # Finalize whatever we decided on.
        curr_hash = 0
        for b in bricks:
                b.r_start = curr_hash
                curr_hash += b.good_size
                b.r_end = curr_hash - 1

        print("Here are the xattr values for your size-weighted layout:")
        for b in bricks:
                print("  %s: 0x0000000200000000%08x%08x" % (
                        b.sv_name, b.r_start, b.r_end))

        if fix_dir:
                if options.verbose:
                        print("Fixing layout for %s" % fix_dir)
                # The setfattr commands are only printed here, not executed.
                for b in bricks:
                        value = "0x0000000200000000%08x%08x" % (
                                b.r_start, b.r_end)
                        path = "%s/%s" % (b.path, fix_dir)
                        cmd = "setfattr -n trusted.glusterfs.dht -v %s %s" % (
                                value, path)
                        print(cmd)

        if options.leave_mounted:
                print("The following subvolumes are still mounted:")
                for b in bricks:
                        print("%s on %s" % (b.sv_name, b.path))
                print("Don't forget to clean up when you're done.")