#  Copyright (C) 2016-2018  Red Hat, Inc. <http://www.redhat.com>
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License along
#  with this program; if not, write to the Free Software Foundation, Inc.,
#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

""" Description:
        Test Cases in this module test the self-heal daemon process.
"""

import time
from glusto.core import Glusto as g
from glustolibs.gluster.exceptions import ExecutionError
from glustolibs.gluster.gluster_base_class import GlusterBaseClass, runs_on
from glustolibs.gluster.brick_libs import (
    bring_bricks_offline, bring_bricks_online,
    select_volume_bricks_to_bring_offline, get_online_bricks_list)
from glustolibs.gluster.heal_libs import (get_self_heal_daemon_pid,
                                          is_shd_daemonized,
                                          monitor_heal_completion,
                                          is_heal_complete)
from glustolibs.gluster.heal_ops import get_heal_info_summary
from glustolibs.io.utils import validate_io_procs


@runs_on([['replicated', 'distributed-replicated'],
          ['glusterfs', 'cifs', 'nfs']])
class SelfHealDaemonProcessTestsWithHealing(GlusterBaseClass):
    """
    SelfHealDaemonProcessTestsWithHealing contains tests which verify the
    self-heal daemon process with healing.
    """
    def setUp(self):
        """
        Setup volume, mount volume and initialize necessary variables
        which are used in the tests
        """
        # calling GlusterBaseClass setUp
        self.get_super_method(self, 'setUp')()

        # Setup Volume and Mount Volume
        g.log.info("Starting to Setup Volume and Mount Volume")
        ret = self.setup_volume_and_mount_volume(mounts=self.mounts)
        if not ret:
            raise ExecutionError("Failed to Setup_Volume and Mount_Volume")
        g.log.info("Successful in Setup Volume and Mount Volume")

        # Verify that glustershd process detaches from its parent process
        g.log.info("Verifying Self Heal Daemon process is daemonized")
        ret = is_shd_daemonized(self.servers)
        if not ret:
            raise ExecutionError("Self Heal Daemon process is still attached"
                                 " to its parent process.")
        g.log.info("Self Heal Daemon processes are online")

    def tearDown(self):
        """
        Unmount the volume from the client and clean up the volume
        """
        # Unmount and cleanup the volume
        g.log.info("Starting to Unmount Volume and Cleanup Volume")
        ret = self.unmount_volume_and_cleanup_volume(mounts=self.mounts)
        if not ret:
            raise ExecutionError("Failed to Unmount Volume and Cleanup Volume")
        g.log.info("Successful in Unmount Volume and Cleanup Volume")

        # calling GlusterBaseClass tearDown
        self.get_super_method(self, 'tearDown')()

    def test_existing_glustershd_should_take_care_of_self_healing(self):
        """
        Test script which verifies that the existing glustershd takes
        care of self healing

        * Create and start the Replicate volume
        * Check the glustershd processes - note the pids
        * Bring down one brick (say brick1) without affecting the cluster
        * Create 1000 files on the volume
        * Bring up brick1, which was killed in the previous step
        * Check the heal info - proactive self healing should start
        * Bring down brick1 again
        * Wait for 60 sec and bring up brick1
        * Check the glustershd processes - pids should be different
        * Monitor the heal till it is complete

        """
        # pylint: disable=too-many-locals,too-many-lines,too-many-statements
        nodes = self.servers

        # check the self-heal daemon process
        g.log.info("Starting to get self-heal daemon process on "
                   "nodes %s", nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process "
                              "found : %s" % pids))
        g.log.info("Successful in getting Single self heal daemon process"
                   " on all nodes %s", nodes)
        glustershd_pids = pids

        # select the bricks to bring offline
        g.log.info("Selecting bricks to brought offline for volume %s",
                   self.volname)
        bricks_to_bring_offline = \
            select_volume_bricks_to_bring_offline(self.mnode,
                                                  self.volname)
        g.log.info("Brick List to bring offline : %s",
                   bricks_to_bring_offline)
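        # NOTE: per the test plan, the selected bricks can go offline without
        # affecting volume availability for the clients.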

        # Bring down the selected bricks
        g.log.info("Going to bring down the brick process "
                   "for %s", bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, ("Failed to bring down the bricks. Please "
                              "check the log file for more details."))
        g.log.info("Brought down the brick process "
                   "for %s successfully", bricks_to_bring_offline)

        # get the bricks which are running
        g.log.info("getting the brick list which are online")
        online_bricks = get_online_bricks_list(self.mnode, self.volname)
        g.log.info("Online Bricks for volume %s : %s",
                   self.volname, online_bricks)

        # write 1000 files of 1 MB each to the mount
        g.log.info("Starting IO on all mounts...")
        g.log.info("mounts: %s", self.mounts)
        all_mounts_procs = []
        cmd = ("for i in `seq 1 1000`; "
               "do dd if=/dev/urandom of=%s/file_$i "
               "bs=1M count=1; "
               "done"
               % self.mounts[0].mountpoint)
        g.log.info(cmd)
        proc = g.run_async(self.mounts[0].client_system, cmd,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)
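        # The dd loop runs asynchronously on the client; validate_io_procs
        # below waits for it and validates its exit status. Because the
        # selected bricks are still offline, these writes accumulate pending
        # heal entries for them.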

        # Validate IO
        self.assertTrue(
            validate_io_procs(all_mounts_procs, self.mounts),
            "IO failed on some of the clients"
        )

        # check the heal info
        g.log.info("Get the pending heal info for the volume %s",
                   self.volname)
        heal_info = get_heal_info_summary(self.mnode, self.volname)
        g.log.info("Successfully got heal info for the volume %s",
                   self.volname)
        g.log.info("Heal Info for volume %s : %s", self.volname, heal_info)

        # Bring bricks online
        g.log.info("Bring bricks: %s online", bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline, 'glusterd_restart')
        self.assertTrue(ret, ("Failed to bring bricks: %s online"
                              % bricks_to_bring_offline))
        g.log.info("Successfully brought all bricks: %s online",
                   bricks_to_bring_offline)

        # Wait for 90 sec for self healing to start
        g.log.info('Waiting for 90 sec for self healing to start')
        time.sleep(90)

        # check the heal info
        g.log.info("Get the pending heal info for the volume %s",
                   self.volname)
        heal_info_after_brick_online = get_heal_info_summary(self.mnode,
                                                             self.volname)
        g.log.info("Successfully got heal info for the volume %s",
                   self.volname)
        g.log.info("Heal Info for volume %s : %s",
                   self.volname, heal_info_after_brick_online)

        # check that the number of pending heals has decreased
        flag = False
        for brick in online_bricks:
            if int(heal_info_after_brick_online[brick]['numberOfEntries'])\
                    < int(heal_info[brick]['numberOfEntries']):
                flag = True
                break

        self.assertTrue(flag, "Pro-active self heal did not start")
        g.log.info("Pro-active self heal has started")

        # bring down bricks again
        g.log.info("Going to bring down the brick process "
                   "for %s", bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, ("Failed to bring down the bricks. Please "
                              "check the log file for more details."))
        g.log.info("Brought down the brick process "
                   "for %s successfully", bricks_to_bring_offline)

        # wait for 60 sec and bring up the brick again
        g.log.info('Waiting for 60 sec before bringing up the brick again')
        time.sleep(60)
        g.log.info("Bring bricks: %s online", bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline, 'glusterd_restart')
        self.assertTrue(ret, ("Failed to bring bricks: %s online"
                              % bricks_to_bring_offline))
        g.log.info("Successfully brought all bricks: %s online",
                   bricks_to_bring_offline)

        # Verify that glustershd process detaches from its parent process
        ret = is_shd_daemonized(nodes)
        self.assertTrue(ret, ("Either no self-heal daemon process was found "
                              "or more than one self-heal daemon process was "
                              "found"))

        # check the self-heal daemon process
        g.log.info("Starting to get self-heal daemon process on "
                   "nodes %s", nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process "
                              "found : %s" % pids))
        g.log.info("Successful in getting Single self heal daemon process"
                   " on all nodes %s", nodes)
        shd_pids_after_bricks_online = pids
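        # NOTE: since the bricks were brought online via the
        # 'glusterd_restart' method, glustershd is expected to have been
        # respawned, so these pids should differ from the ones noted earlier.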

        # compare the glustershd pids
        self.assertNotEqual(glustershd_pids,
                            shd_pids_after_bricks_online,
                            ("self heal daemon process are same before and "
                             "after bringing up bricks online"))
        g.log.info("EXPECTED : self heal daemon process are different before "
                   "and after bringing up bricks online")

        # wait for heal to complete
        g.log.info("Monitoring the heal.....")
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, ("Heal is not completed on volume %s"
                              % self.volname))
        g.log.info("Heal Completed on volume %s", self.volname)

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')