path: root/tests/functional/arbiter/test_verify_metadata_and_data_heal.py
#  Copyright (C) 2021 Red Hat, Inc. <http://www.redhat.com>
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License along
#  with this program; if not, write to the Free Software Foundation, Inc.,
#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

from glusto.core import Glusto as g

from glustolibs.gluster.brick_libs import (bring_bricks_offline,
                                           bring_bricks_online,
                                           get_online_bricks_list)
from glustolibs.gluster.exceptions import ExecutionError
from glustolibs.gluster.gluster_base_class import GlusterBaseClass, runs_on
from glustolibs.gluster.glusterdir import mkdir
from glustolibs.gluster.heal_libs import (
    is_heal_complete, is_volume_in_split_brain, monitor_heal_completion,
    wait_for_self_heal_daemons_to_be_online)
from glustolibs.gluster.heal_ops import (disable_self_heal_daemon,
                                         enable_self_heal_daemon, trigger_heal)
from glustolibs.gluster.lib_utils import (add_user, collect_bricks_arequal,
                                          del_user, group_add, group_del)
from glustolibs.gluster.volume_libs import get_subvols
from glustolibs.io.utils import list_all_files_and_dirs_mounts


@runs_on([['arbiter', 'replicated'], ['glusterfs']])
class TestMetadataAndDataHeal(GlusterBaseClass):
    '''Description: Verify shd heals files after performing metadata and data
    operations while a brick was down'''
    def _dac_helper(self, host, option):
        '''Helper for creating and deleting users and groups'''

        # Permission/ownership changes are required only for the
        # `test_metadata..` tests, which use dedicated QA groups and a user
        if 'metadata' not in self.test_dir:
            return

        if option == 'create':
            # Groups
            for group in ('qa_func', 'qa_system'):
                if not group_add(host, group):
                    raise ExecutionError('Unable to {} group {} on '
                                         '{}'.format(option, group, host))

            # User
            if not add_user(host, 'qa_all', group='qa_func'):
                raise ExecutionError('Unable to {} user {} under {} on '
                                     '{}'.format(option, 'qa_all', 'qa_func',
                                                 host))
        elif option == 'delete':
            # Groups
            for group in ('qa_func', 'qa_system'):
                if not group_del(host, group):
                    raise ExecutionError('Unable to {} group {} on '
                                         '{}'.format(option, group, host))

            # User
            if not del_user(host, 'qa_all'):
                raise ExecutionError('Unable to {} user on {}'.format(
                    option, host))

    def setUp(self):
        self.get_super_method(self, 'setUp')()

        # A single mount is enough for all the tests
        self.mounts = self.mounts[0:1]
        self.client = self.mounts[0].client_system

        # Use testcase name as test directory
        self.test_dir = self.id().split('.')[-1]
        self.fqpath = self.mounts[0].mountpoint + '/' + self.test_dir

        if not self.setup_volume_and_mount_volume(mounts=self.mounts):
            raise ExecutionError('Failed to setup and mount '
                                 '{}'.format(self.volname))

        # Create group and user names required for the test
        self._dac_helper(host=self.client, option='create')

    def tearDown(self):
        # Delete group and user names created as part of setup
        self._dac_helper(host=self.client, option='delete')

        if not self.unmount_volume_and_cleanup_volume(mounts=self.mounts):
            raise ExecutionError('Not able to unmount and cleanup '
                                 '{}'.format(self.volname))

        self.get_super_method(self, 'tearDown')()

    def _perform_io_and_disable_self_heal(self):
        '''Steps common to all tests: perform IO and disable self-heal'''
        ret = mkdir(self.client, self.fqpath)
        self.assertTrue(ret,
                        'Directory creation failed on {}'.format(self.client))
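        # Base IO command: emits random printable data; callers append the
        # byte count (e.g. 10K) consumed by `head -c`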
        self.io_cmd = 'cat /dev/urandom | tr -dc "[:space:][:print:]" | head -c '
        # Create 6 dirs, 6 files and 6 files in each subdir, with 10K data each
        file_io = ('''cd {0}; for i in `seq 1 6`;
                    do mkdir dir.$i; {1} 10K > file.$i;
                    for j in `seq 1 6`;
                    do {1} 10K > dir.$i/file.$j; done;
                    done;'''.format(self.fqpath, self.io_cmd))
        ret, _, err = g.run(self.client, file_io)
        self.assertEqual(ret, 0, 'Unable to create directories and data files')
        self.assertFalse(err, '{0} failed with {1}'.format(file_io, err))

        # Disable self-heal daemon
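        # (under the hood this sets the 'self-heal-daemon' volume option to
        # 'off')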
        self.assertTrue(disable_self_heal_daemon(self.mnode, self.volname),
                        'Disabling self-heal-daemon failed')

    def _perform_brick_ops_and_enable_self_heal(self, op_type):
        '''Steps common to all tests: bring bricks down cyclically and
        perform metadata/data operations'''
        # The first brick in the subvol always stays online and serves as the
        # heal source, so the dict keys below match the index of the brick
        # brought down (second brick onwards)
        self.op_cmd = {
            # Metadata Operations (owner and permission changes)
            'metadata': {
                2:
                '''cd {0}; for i in `seq 1 3`; do chown -R qa_all:qa_func \
                dir.$i file.$i; chmod -R 555 dir.$i file.$i; done;''',
                3:
                '''cd {0}; for i in `seq 1 3`; do chown -R :qa_system \
                dir.$i file.$i; chmod -R 777 dir.$i file.$i; done;''',
                # 4 - Will be used for final data consistency check
                4:
                '''cd {0}; for i in `seq 1 6`; do chown -R qa_all:qa_system \
                dir.$i file.$i; chmod -R 777 dir.$i file.$i; done;''',
            },
            # Data Operations (append data to the files)
            'data': {
                2:
                '''cd {0}; for i in `seq 1 3`;
                    do {1} 2K >> file.$i;
                    for j in `seq 1 3`;
                    do {1} 2K >> dir.$i/file.$j; done;
                    done;''',
                3:
                '''cd {0}; for i in `seq 1 3`;
                    do {1} 3K >> file.$i;
                    for j in `seq 1 3`;
                    do {1} 3K >> dir.$i/file.$j; done;
                    done;''',
                # 4 - Will be used for final data consistency check
                4:
                '''cd {0}; for i in `seq 1 6`;
                    do {1} 4K >> file.$i;
                    for j in `seq 1 6`;
                    do {1} 4K >> dir.$i/file.$j; done;
                    done;''',
            },
        }
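        # Keys 2 and 3 run while the corresponding brick is offline; key 4 is
        # reserved for the post-heal consistency check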
        bricks = get_online_bricks_list(self.mnode, self.volname)
        self.assertIsNotNone(bricks,
                             'Not able to get list of bricks in the volume')

        # Keep the first brick online; start operations from the second brick
        for index, brick in enumerate(bricks[1:], start=2):

            # Bring brick offline
            ret = bring_bricks_offline(self.volname, brick)
            self.assertTrue(ret, 'Unable to bring {} offline'.format(brick))

            # Perform metadata/data operation
            cmd = self.op_cmd[op_type][index].format(self.fqpath, self.io_cmd)
            ret, _, err = g.run(self.client, cmd)
            self.assertEqual(ret, 0, '{0} failed with {1}'.format(cmd, err))
            self.assertFalse(err, '{0} failed with {1}'.format(cmd, err))

            # Bring brick online
            ret = bring_bricks_online(
                self.mnode,
                self.volname,
                brick,
                bring_bricks_online_methods='volume_start_force')
            self.assertTrue(ret, 'Unable to bring {} online'.format(brick))

        # Assert metadata/data operations resulted in pending heals
        self.assertFalse(
            is_heal_complete(self.mnode, self.volname),
            'Heal completed even before enabling the self-heal daemon')

        # Enable and wait self heal daemon to be online
        self.assertTrue(enable_self_heal_daemon(self.mnode, self.volname),
                        'Enabling self heal daemon failed')
        self.assertTrue(
            wait_for_self_heal_daemons_to_be_online(self.mnode, self.volname),
            'Not all self heal daemons are online')

    def _validate_heal_completion_and_arequal(self, op_type):
        '''Steps common to all tests: validate heal completion, verify
        arequal, perform additional IO and verify arequal again'''

        # Validate heal completion
        self.assertTrue(monitor_heal_completion(self.mnode, self.volname),
                        'Self heal is not completed within timeout')
        self.assertFalse(
            is_volume_in_split_brain(self.mnode, self.volname),
            'Volume is in split brain even after heal completion')

        subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
        self.assertTrue(subvols, 'Not able to get list of subvols')
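        # On an arbiter volume the last brick of each subvol is the arbiter:
        # it stores metadata but no file data, so it is excluded from the
        # arequal comparison below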
        arbiter = 'arbiter' in self.volume_type
        stop = len(subvols[0]) - 1 if arbiter else len(subvols[0])

        # Validate arequal
        self._validate_arequal_and_perform_lookup(subvols, stop)

        # Perform some additional metadata/data operations
        cmd = self.op_cmd[op_type][4].format(self.fqpath, self.io_cmd)
        ret, _, err = g.run(self.client, cmd)
        self.assertEqual(ret, 0, '{0} failed with {1}'.format(cmd, err))
        self.assertFalse(err, '{0} failed with {1}'.format(cmd, err))

        # Validate arequal after additional operations
        self._validate_arequal_and_perform_lookup(subvols, stop)

    def _validate_arequal_and_perform_lookup(self, subvols, stop):
        '''Steps common to all tests: validate arequal across brick backends
        and perform a lookup of all files from the mount'''
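        # `arequal` is a checksum over a tree's data and metadata; identical
        # values across the bricks of a subvol imply a consistent heal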
        for subvol in subvols:
            ret, arequal = collect_bricks_arequal(subvol[0:stop])
            self.assertTrue(
                ret, 'Unable to get `arequal` checksum on '
                '{}'.format(subvol[0:stop]))
            self.assertEqual(
                len(set(arequal)), 1, 'Mismatch of `arequal` '
                'checksums among {}'.format(subvol[0:stop]))

        # Perform a lookup of all files and directories on mounts
        self.assertTrue(list_all_files_and_dirs_mounts(self.mounts),
                        'Failed to list all files and dirs from mount')

    def test_metadata_heal_from_shd(self):
        '''Description: Verify files heal after switching on `self-heal-daemon`
        when metadata operations are performed while a brick was down

        Steps:
        1. Create, mount and run IO on volume
        2. Set `self-heal-daemon` to `off`, bring bricks down cyclically and
           perform metadata operations
        3. Set `self-heal-daemon` to `on` and wait for heal completion
        4. Validate arequal checksum on backend bricks
        '''
        op_type = 'metadata'
        self._perform_io_and_disable_self_heal()
        self._perform_brick_ops_and_enable_self_heal(op_type=op_type)
        self._validate_heal_completion_and_arequal(op_type=op_type)
        g.log.info('Pass: Verification of metadata heal after switching on '
                   '`self heal daemon` is complete')

    def test_metadata_heal_from_heal_cmd(self):
        '''Description: Verify files heal after triggering heal command when
        metadata operations are performed while a brick was down

        Steps:
        1. Create, mount and run IO on volume
        2. Set `self-heal-daemon` to `off`, bring bricks down cyclically and
           perform metadata operations
        3. Set `self-heal-daemon` to `on`, invoke `gluster vol <vol> heal`
        4. Validate arequal checksum on backend bricks
        '''
        op_type = 'metadata'
        self._perform_io_and_disable_self_heal()
        self._perform_brick_ops_and_enable_self_heal(op_type=op_type)

        # Trigger index heal via 'gluster volume heal <volname>'
        self.assertTrue(trigger_heal(self.mnode, self.volname),
                        'Unable to trigger index heal on the volume')

        self._validate_heal_completion_and_arequal(op_type=op_type)
        g.log.info(
            'Pass: Verification of metadata heal via `glfsheal` is complete')

    def test_data_heal_from_shd(self):
        '''Description: Verify files heal after switching on `self-heal-daemon`
        when data operations are performed while a brick was down

        Steps:
        1. Create, mount and run IO on volume
        2. Set `self-heal-daemon` to `off`, bring bricks down cyclically and
           perform data operations
        3. Set `self-heal-daemon` to `on` and wait for heal completion
        4. Validate arequal checksum on backend bricks
        '''
        op_type = 'data'
        self._perform_io_and_disable_self_heal()
        self._perform_brick_ops_and_enable_self_heal(op_type=op_type)
        self._validate_heal_completion_and_arequal(op_type=op_type)
        g.log.info('Pass: Verification of data heal after switching on '
                   '`self heal daemon` is complete')
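
# Usage note: a minimal sketch of how this module is typically run through the
# glusto-tests runner (the config path /tmp/config.yml is an assumed example;
# adjust for your environment):
#
#   glusto -c /tmp/config.yml \
#       --pytest='-v tests/functional/arbiter/test_verify_metadata_and_data_heal.py'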