path: root/tests/functional/afr/test_gfid_split_brain_resolution.py
#  Copyright (C) 2017-2020 Red Hat, Inc. <http://www.redhat.com>
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License along
#  with this program; if not, write to the Free Software Foundation, Inc.,
#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

from random import choice

from glusto.core import Glusto as g

from glustolibs.gluster.brick_libs import (bring_bricks_offline,
                                           bring_bricks_online)
from glustolibs.gluster.exceptions import ExecutionError
from glustolibs.gluster.gluster_base_class import GlusterBaseClass, runs_on
from glustolibs.gluster.glusterdir import mkdir
from glustolibs.gluster.heal_libs import (
    is_volume_in_split_brain, monitor_heal_completion,
    wait_for_self_heal_daemons_to_be_online)
from glustolibs.gluster.heal_ops import (enable_self_heal_daemon, trigger_heal,
                                         trigger_heal_full)
from glustolibs.gluster.lib_utils import collect_bricks_arequal, list_files
from glustolibs.gluster.volume_libs import get_subvols
from glustolibs.gluster.volume_ops import set_volume_options


# pylint: disable=stop-iteration-return, too-many-locals, too-many-statements
@runs_on([[
    'replicated', 'distributed-replicated', 'arbiter', 'distributed-arbiter'
], ['glusterfs']])
class TestSelfHeal(GlusterBaseClass):
    def setUp(self):
        self.get_super_method(self, 'setUp')()

        # A single mount is enough for the test
        self.mounts = self.mounts[0:1]

        if not self.setup_volume_and_mount_volume(mounts=self.mounts):
            raise ExecutionError('Failed to setup and mount '
                                 '{}'.format(self.volname))

    def tearDown(self):
        if not self.unmount_volume_and_cleanup_volume(mounts=self.mounts):
            raise ExecutionError('Not able to unmount and cleanup '
                                 '{}'.format(self.volname))
        self.get_super_method(self, 'tearDown')()

    @staticmethod
    def _get_two_bricks(subvols, arbiter):
        """
        Yields two bricks from each subvol of (distributed-)replicated and
        (distributed-)arbiter volumes
        """
        # Get an iterator for py2/3 compatibility
        brick_iter = iter(zip(*subvols))
        prev_brick = next(brick_iter)
        first_brick = prev_brick

        for index, curr_brick in enumerate(brick_iter, 1):
            # `yield` should contain arbiter brick for arbiter type vols
            if not (index == 1 and arbiter):
                yield prev_brick + curr_brick
            prev_brick = curr_brick
        # At the end yield first and last brick from a subvol
        yield prev_brick + first_brick

    def _get_files_in_brick(self, brick_path, dir_path):
        """
        Returns files in format of `dir_path/file_name` from the given brick
        path
        """
        node, path = brick_path.split(':')
        files = list_files(node, path, dir_path)
        self.assertIsNotNone(
            files, 'Unable to get list of files from {}'.format(brick_path))

        files = [file_name.rsplit('/', 1)[-1] for file_name in files]
        return [
            each_file for each_file in files
            if each_file in ('file1', 'file2', 'file3')
        ]

    def _run_cmd_and_assert(self, cmd):
        """
        Run `cmd` on `mnode` and assert for success
        """
        ret, _, err = g.run(self.mnode, cmd)
        self.assertEqual(ret, 0, '`{}` failed with {}'.format(cmd, err))

    def test_gfid_split_brain_resolution(self):
        """
        Description: Simulate gfid split brain on multiple files in a dir and
        resolve them via the `bigger-file`, `latest-mtime` and `source-brick`
        methods

        Steps:
        - Create and mount a replicated volume, create a dir and 10 data files
        - Simulate gfid split brain in 9 of the files
        - Resolve each set of 3 files using the `source-brick`, `bigger-file`
          and `latest-mtime` split-brain resolution methods
        - Trigger and monitor for heal completion
        - Validate all the files are healed and arequal matches for bricks in
          subvols
        """
        io_cmd = 'cat /dev/urandom | tr -dc [:space:][:print:] | head -c '
        client, m_point = (self.mounts[0].client_system,
                           self.mounts[0].mountpoint)
        arbiter = 'arbiter' in self.volume_type

        # Disable self-heal daemon and set `quorum-type` option to `none`
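        # (With client quorum off, writes from the mount succeed even when two
        # bricks of a replica set are down, and with shd off no automatic heal
        # kicks in; both are needed to create the gfid mismatch below)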
        ret = set_volume_options(self.mnode, self.volname, {
            'self-heal-daemon': 'off',
            'cluster.quorum-type': 'none'
        })
        self.assertTrue(
            ret, 'Not able to disable `quorum-type` and '
            '`self-heal` daemon volume options')

        # Create required dir and files from the mount
        split_dir = 'gfid_split_dir'
        file_io = ('cd %s; for i in {1..10}; do ' + io_cmd +
                   ' 1M > %s/file$i; done;')
        ret = mkdir(client, '{}/{}'.format(m_point, split_dir))
        self.assertTrue(ret, 'Unable to create a directory from mount point')
        ret, _, _ = g.run(client, file_io % (m_point, split_dir))
        self.assertEqual(ret, 0, 'Unable to create data files from mount point')

        # `file{4,5,6}` are re-created in every iteration with a different size
        # so they can later be resolved via the `bigger-file` method
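        # Per-iteration recreation plan (files are rewritten while two bricks
        # of the subvol are offline):
        #   1: file1-file6 at 2M    2: file4-file6 at 3M
        #   3: file4-file9 at 1M    4: file7-file9 at 1M (arbiter only)
        # End result: file1-3 are resolved via `source-brick`, file4-6 via
        # `bigger-file` and file7-9 via `latest-mtime` further below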
        cmd = 'rm -rf {0}/file{1} && {2} {3}M > {0}/file{1}'
        split_cmds = {
            1:
            ';'.join(cmd.format(split_dir, i, io_cmd, 2) for i in range(1, 7)),
            2:
            ';'.join(cmd.format(split_dir, i, io_cmd, 3) for i in range(4, 7)),
            3: ';'.join(
                cmd.format(split_dir, i, io_cmd, 1) for i in range(4, 10)),
            4: ';'.join(
                cmd.format(split_dir, i, io_cmd, 1) for i in range(7, 10)),
        }

        # Get subvols and simulate entry split brain
        subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
        self.assertTrue(subvols, 'Not able to get list of subvols')
        msg = ('Unable to bring files under {} dir to entry split brain while '
               '{} are down')
        for index, bricks in enumerate(self._get_two_bricks(subvols, arbiter),
                                       1):
            # Bring down two bricks from each subvol
            ret = bring_bricks_offline(self.volname, list(bricks))
            self.assertTrue(ret, 'Unable to bring {} offline'.format(bricks))

            ret, _, _ = g.run(client,
                              'cd {}; {}'.format(m_point, split_cmds[index]))
            self.assertEqual(ret, 0, msg.format(split_dir, bricks))

            # For arbiter volumes the brick pairs are brought down only twice,
            # so bring the remaining files into split brain now for the
            # `latest-mtime` resolution
            if arbiter and index == 2:
                ret, _, _ = g.run(client,
                                  'cd {}; {}'.format(m_point, split_cmds[4]))
                self.assertEqual(ret, 0, msg.format(split_dir, bricks))

            # Bring offline bricks online
            ret = bring_bricks_online(
                self.mnode,
                self.volname,
                bricks,
                bring_bricks_online_methods='volume_start_force')
            self.assertTrue(ret, 'Unable to bring {} online'.format(bricks))

        # Enable self-heal daemon, trigger heal and assert volume is in split
        # brain condition
        ret = enable_self_heal_daemon(self.mnode, self.volname)
        self.assertTrue(ret, 'Failed to enable self heal daemon')

        ret = wait_for_self_heal_daemons_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, 'Not all self heal daemons are online')

        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Unable to trigger index heal on the volume')

        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertTrue(ret, 'Volume should be in split brain condition')

        # Select source brick and take note of files in source brick
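        # The arbiter brick (last in each subvol) holds only metadata, so it is
        # excluded when picking a random source brick and when comparing
        # arequal checksums later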
        stop = len(subvols[0]) - 1 if arbiter else len(subvols[0])
        source_bricks = [choice(subvol[0:stop]) for subvol in subvols]
        files = [
            self._get_files_in_brick(path, split_dir) for path in source_bricks
        ]

        # Resolve `file1, file2, file3` gfid split files using `source-brick`
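        # CLI syntax used:
        #   gluster volume heal <VOLNAME> split-brain source-brick
        #   <HOSTNAME:BRICKNAME> <FILE>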
        cmd = ('gluster volume heal ' + self.volname + ' split-brain '
               'source-brick {} /' + split_dir + '/{}')
        for index, source_brick in enumerate(source_bricks):
            for each_file in files[index]:
                run_cmd = cmd.format(source_brick, each_file)
                self._run_cmd_and_assert(run_cmd)

        # Resolve `file4, file5, file6` gfid split files using `bigger-file`
        cmd = ('gluster volume heal ' + self.volname +
               ' split-brain bigger-file /' + split_dir + '/{}')
        for each_file in ('file4', 'file5', 'file6'):
            run_cmd = cmd.format(each_file)
            self._run_cmd_and_assert(run_cmd)

        # Resolve `file7, file8, file9` gfid split files using `latest-mtime`
        cmd = ('gluster volume heal ' + self.volname +
               ' split-brain latest-mtime /' + split_dir + '/{}')
        for each_file in ('file7', 'file8', 'file9'):
            run_cmd = cmd.format(each_file)
            self._run_cmd_and_assert(run_cmd)

        # Entries remain in `heal info` until a heal is actually run, so
        # trigger one manually (shd would eventually do it automatically)
        ret = trigger_heal_full(self.mnode, self.volname)
        self.assertTrue(ret, 'Unable to trigger full self heal')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(
            ret, 'All files in volume should be healed after healing files via'
            ' `source-brick`, `bigger-file`, `latest-mtime` methods manually')

        # Validate normal file `file10` and healed files don't differ in
        # subvols via an `arequal`
        for subvol in subvols:
            # Disregard last brick if volume is of arbiter type
            ret, arequal = collect_bricks_arequal(subvol[0:stop])
            self.assertTrue(
                ret, 'Unable to get `arequal` checksum on '
                '{}'.format(subvol[0:stop]))
            self.assertEqual(
                len(set(arequal)), 1, 'Mismatch of `arequal` '
                'checksum among {} is identified'.format(subvol[0:stop]))

        g.log.info('Pass: Resolution of gfid split-brain via `source-brick`, '
                   '`bigger-file` and `latest-mtime` methods is complete')