support for de-commissioning a node using 'remove-brick'

to achieve this, we now create volume-file with 'decommissioned-nodes' option in distribute volume, then just perform the rebalance set of operations (with 'force' flag set). now onwards, the 'remove-brick' (with 'start' option) operation tries to migrate data from removed bricks to existing bricks. 'remove-brick' also supports similar options as of replace-brick. * (no options) -> works as 'force', will have the current behavior of remove-brick, ie., no data-migration, volume changes. * start (starts remove-brick with data-migration/draining process, which takes care of migrating data and once complete, will commit the changes to volume file) * pause (stop data migration, but keep the volume file intact with extra options whatever is set) * abort (stop data-migration, and fall back to old configuration) * commit (if volume is stopped, commits the changes to volumefile) * force (stops the data-migration and commits the changes to volume file) Change-Id: I3952bcfbe604a0952e68b6accace7014d5e401d3 BUG: 1952 Reviewed-on: http://review.gluster.com/118 Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Vijay Bellur <vijay@gluster.com>
author: Amar Tumballi <amar@gluster.com> 2011-09-09 09:42:51 +0530
committer: Vijay Bellur <vijay@gluster.com> 2011-09-13 02:10:12 -0700
commit: 25daa42911d2ff697880ee29c591cac5f2abebed (patch)
tree: 9555284c052e1e205909e91f578a8b46b522ec56 /xlators/mgmt/glusterd/src/glusterd-brick-ops.c
parent: 17e57f27c714c94dd5d9fa91650f83d069f2f4e4 (diff)
1 files changed, 296 insertions, 73 deletions
diff --git a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c
index 8b3a03b6f3e..8832c69ed4f 100644
--- a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c
+++ b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c
@@ -312,7 +312,8 @@ glusterd_handle_remove_brick (rpcsvc_request_t *req)
                 strcpy (vol_type, "distribute");
 
 	/* Do not allow remove-brick if the volume is plain stripe */
-	if ((volinfo->type == GF_CLUSTER_TYPE_STRIPE) && (volinfo->brick_count == volinfo->sub_count)) {
+	if ((volinfo->type == GF_CLUSTER_TYPE_STRIPE) &&
+            (volinfo->brick_count == volinfo->sub_count)) {
                 snprintf (err_str, 2048, "Removing brick from a plain stripe is not allowed");
                 gf_log ("glusterd", GF_LOG_ERROR, "%s", err_str);
                 ret = -1;
@@ -321,8 +322,8 @@ glusterd_handle_remove_brick (rpcsvc_request_t *req)
 
 	/* Do not allow remove-brick if the bricks given is less than the replica count
 	   or stripe count */
-        if (((volinfo->type == GF_CLUSTER_TYPE_REPLICATE) || (volinfo->type == GF_CLUSTER_TYPE_STRIPE))
-	    && !(volinfo->brick_count <= volinfo->sub_count)) {
+        if ((volinfo->type != GF_CLUSTER_TYPE_NONE) &&
+            !(volinfo->brick_count <= volinfo->sub_count)) {
                 if (volinfo->sub_count && (count % volinfo->sub_count != 0)) {
                         snprintf (err_str, 2048, "Remove brick incorrect"
                                   " brick count of %d for %s %d",
@@ -512,16 +513,20 @@ out:
 
 
 int
-glusterd_op_perform_remove_brick (glusterd_volinfo_t  *volinfo, char *brick)
+glusterd_op_perform_remove_brick (glusterd_volinfo_t  *volinfo, char *brick,
+                                  int force, int *need_migrate)
 {
-
         glusterd_brickinfo_t    *brickinfo = NULL;
         char                    *dup_brick = NULL;
-        int32_t                 ret = -1;
+        int32_t                  ret = -1;
+        glusterd_conf_t         *priv = NULL;
 
         GF_ASSERT (volinfo);
         GF_ASSERT (brick);
 
+        priv = THIS->private;
+        GF_ASSERT (priv);
+
         dup_brick = gf_strdup (brick);
         if (!dup_brick)
                 goto out;
@@ -534,15 +539,26 @@ glusterd_op_perform_remove_brick (glusterd_volinfo_t  *volinfo, char *brick)
         if (ret)
                 goto out;
 
-        if (GLUSTERD_STATUS_STARTED == volinfo->status) {
-                ret = glusterd_brick_stop (volinfo, brickinfo);
-                if (ret) {
-                        gf_log ("", GF_LOG_ERROR, "Unable to stop "
-                                "glusterfs, ret: %d", ret);
-                        goto out;
+        if (!uuid_compare (brickinfo->uuid, priv->uuid)) {
+                /* Only if the brick is in this glusterd, do the rebalance */
+                if (need_migrate)
+                        *need_migrate = 1;
+        }
+
+        if (force) {
+                if (GLUSTERD_STATUS_STARTED == volinfo->status) {
+                        ret = glusterd_brick_stop (volinfo, brickinfo);
+                        if (ret) {
+                                gf_log (THIS->name, GF_LOG_ERROR, "Unable to stop "
+                                        "glusterfs, ret: %d", ret);
+                                goto out;
+                        }
                 }
+                glusterd_delete_brick (volinfo, brickinfo);
+                goto out;
         }
-        glusterd_delete_brick (volinfo, brickinfo);
+
+        brickinfo->decommissioned = 1;
 out:
         if (dup_brick)
                 GF_FREE (dup_brick);
@@ -700,17 +716,18 @@ out:
 }
 
 int
-glusterd_op_stage_remove_brick (dict_t *dict)
+glusterd_op_stage_remove_brick (dict_t *dict, char **op_errstr)
 {
-        int                                     ret = -1;
-        char                                    *volname = NULL;
-        glusterd_volinfo_t                      *volinfo = NULL;
-        dict_t                                  *ctx     = NULL;
-        char                                    *errstr  = NULL;
-        int32_t                                 brick_count = 0;
+        int                 ret         = -1;
+        char               *volname     = NULL;
+        glusterd_volinfo_t *volinfo     = NULL;
+        char               *errstr      = NULL;
+        int32_t             brick_count = 0;
+        char                msg[2048]   = {0,};
+        int32_t             flag        = 0;
+        gf1_op_commands     cmd         = GF_OP_CMD_NONE;
 
         ret = dict_get_str (dict, "volname", &volname);
-
         if (ret) {
                 gf_log ("", GF_LOG_ERROR, "Unable to get volume name");
                 goto out;
@@ -723,25 +740,64 @@ glusterd_op_stage_remove_brick (dict_t *dict)
                 goto out;
         }
 
-        if (glusterd_is_defrag_on(volinfo)) {
-                ctx = glusterd_op_get_ctx ();
-                errstr = gf_strdup("Rebalance is in progress. Please retry"
-                                    " after completion");
-                if (!errstr) {
-                        ret = -1;
+        ret = dict_get_int32 (dict, "command", &flag);
+        if (ret) {
+                gf_log ("", GF_LOG_ERROR, "Unable to get brick count");
+                goto out;
+        }
+        cmd = flag;
+
+        ret = -1;
+        switch (cmd) {
+        case GF_OP_CMD_NONE:
+                errstr = gf_strdup ("no remove-brick command issued");
+                goto out;
+
+        case GF_OP_CMD_STATUS:
+                ret = 0;
+                goto out;
+
+        case GF_OP_CMD_START:
+        {
+                if (GLUSTERD_STATUS_STARTED != volinfo->status) {
+                        snprintf (msg, sizeof (msg), "Volume %s needs to be started "
+                                  "before remove-brick (you can use 'force' or "
+                                  "'commit' to override this behavior)",
+                                  volinfo->volname);
+                        errstr = gf_strdup (msg);
+                        gf_log (THIS->name, GF_LOG_ERROR, "%s", errstr);
                         goto out;
                 }
-                gf_log ("glusterd", GF_LOG_ERROR, "%s", errstr);
-                ret = dict_set_dynstr (ctx, "errstr", errstr);
-                if (ret) {
-                        GF_FREE (errstr);
-                        gf_log ("", GF_LOG_DEBUG,
-                                "failed to set errstr ctx");
+                if (glusterd_is_defrag_on(volinfo)) {
+                        errstr = gf_strdup("Rebalance is in progress. Please retry"
+                                           " after completion");
+                        gf_log ("glusterd", GF_LOG_ERROR, "%s", errstr);
                         goto out;
                 }
+                break;
+        }
 
-                ret = -1;
-                goto out;
+        case GF_OP_CMD_PAUSE:
+        case GF_OP_CMD_ABORT:
+        {
+                if (!volinfo->decommission_in_progress) {
+                        errstr = gf_strdup("remove-brick is not in progress");
+                        gf_log ("glusterd", GF_LOG_ERROR, "%s", errstr);
+                        goto out;
+                }
+                break;
+        }
+
+        case GF_OP_CMD_COMMIT:
+                if (volinfo->decommission_in_progress) {
+                        errstr = gf_strdup ("use 'force' option as migration "
+                                            "is in progress");
+                        goto out;
+                }
+                break;
+
+        case GF_OP_CMD_COMMIT_FORCE:
+                break;
         }
 
         ret = dict_get_int32 (dict, "count", &brick_count);
@@ -750,41 +806,96 @@ glusterd_op_stage_remove_brick (dict_t *dict)
                 goto out;
         }
 
+        ret = 0;
         if (volinfo->brick_count == brick_count) {
-                ctx = glusterd_op_get_ctx ();
-                if (!ctx) {
-                        gf_log ("", GF_LOG_ERROR,
-                                "Operation Context is not present");
-                        ret = -1;
-                        goto out;
-                }
                 errstr = gf_strdup ("Deleting all the bricks of the "
                                     "volume is not allowed");
-                if (!errstr) {
-                        gf_log ("", GF_LOG_ERROR, "Out of memory");
-                        ret = -1;
-                        goto out;
-                }
-
-                ret = dict_set_dynstr (ctx, "errstr", errstr);
-                if (ret) {
-                        GF_FREE (errstr);
-                        gf_log ("", GF_LOG_DEBUG,
-                                "failed to set pump status in ctx");
-                        goto out;
-                }
-
                 ret = -1;
                 goto out;
         }
 
 out:
         gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+        if (ret && errstr) {
+                if (op_errstr)
+                        *op_errstr = errstr;
+        }
 
         return ret;
 }
 
 int
+glusterd_remove_brick_migrate_cbk (glusterd_volinfo_t *volinfo,
+                                   gf_defrag_status_t status)
+{
+        int                   ret = 0;
+        glusterd_brickinfo_t *brickinfo = NULL;
+        glusterd_brickinfo_t *tmp = NULL;
+
+        switch (status) {
+        case GF_DEFRAG_STATUS_PAUSED:
+        case GF_DEFRAG_STATUS_FAILED:
+                /* No changes required in the volume file.
+                   everything should remain as is */
+                break;
+        case GF_DEFRAG_STATUS_STOPPED:
+                /* Fall back to the old volume file */
+                list_for_each_entry_safe (brickinfo, tmp, &volinfo->bricks, brick_list) {
+                        if (!brickinfo->decommissioned)
+                                continue;
+                        brickinfo->decommissioned = 0;
+                }
+                break;
+
+        case GF_DEFRAG_STATUS_COMPLETE:
+                /* Done with the task, you can remove the brick from the
+                   volume file */
+                list_for_each_entry_safe (brickinfo, tmp, &volinfo->bricks, brick_list) {
+                        if (!brickinfo->decommissioned)
+                                continue;
+                        gf_log (THIS->name, GF_LOG_INFO, "removing the brick %s",
+                                brickinfo->path);
+                        brickinfo->decommissioned = 0;
+                        if (GLUSTERD_STATUS_STARTED == volinfo->status) {
+                                ret = glusterd_brick_stop (volinfo, brickinfo);
+                                if (ret) {
+                                        gf_log (THIS->name, GF_LOG_ERROR,
+                                                "Unable to stop glusterfs (%d)", ret);
+                                }
+                        }
+                        glusterd_delete_brick (volinfo, brickinfo);
+                }
+                break;
+
+        default:
+                GF_ASSERT (!"cbk function called with wrong status");
+                break;
+        }
+
+        ret = glusterd_create_volfiles_and_notify_services (volinfo);
+        if (ret)
+                gf_log (THIS->name, GF_LOG_ERROR,
+                        "Unable to write volume files (%d)", ret);
+
+        ret = glusterd_store_volinfo (volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+        if (ret)
+                gf_log (THIS->name, GF_LOG_ERROR,
+                        "Unable to store volume info (%d)", ret);
+
+
+        if (GLUSTERD_STATUS_STARTED == volinfo->status) {
+                ret = glusterd_check_generate_start_nfs ();
+                if (ret)
+                        gf_log (THIS->name, GF_LOG_ERROR,
+                                "Unable to start nfs process (%d)", ret);
+        }
+
+        volinfo->decommission_in_progress = 0;
+        return 0;
+}
+
+
+int
 glusterd_op_add_brick (dict_t *dict, char **op_errstr)
 {
         int                                     ret = 0;
@@ -848,15 +959,20 @@ out:
 }
 
 int
-glusterd_op_remove_brick (dict_t *dict)
+glusterd_op_remove_brick (dict_t *dict, char **op_errstr)
 {
-        int                                     ret = -1;
-        char                                    *volname = NULL;
-        glusterd_volinfo_t                      *volinfo = NULL;
-        char                                    *brick = NULL;
-        int32_t                                 count = 0;
-        int32_t                                 i = 1;
-        char                                    key[256] = {0,};
+        int                 ret            = -1;
+        char               *volname        = NULL;
+        glusterd_volinfo_t *volinfo        = NULL;
+        char               *brick          = NULL;
+        int32_t             count          = 0;
+        int32_t             i              = 1;
+        char                key[256]       = {0,};
+        int32_t             flag           = 0;
+        char                err_str[4096]  = {0,};
+        int                 need_rebalance = 0;
+        int                 force          = 0;
+        gf1_op_commands     cmd            = 0;
 
         ret = dict_get_str (dict, "volname", &volname);
 
@@ -866,12 +982,99 @@ glusterd_op_remove_brick (dict_t *dict)
         }
 
         ret = glusterd_volinfo_find (volname, &volinfo);
-
         if (ret) {
                 gf_log ("", GF_LOG_ERROR, "Unable to allocate memory");
                 goto out;
         }
 
+        ret = dict_get_int32 (dict, "command", &flag);
+        if (ret) {
+                gf_log ("", GF_LOG_ERROR, "Unable to get brick count");
+                goto out;
+        }
+        cmd = flag;
+
+        ret = -1;
+        switch (cmd) {
+        case GF_OP_CMD_NONE:
+                goto out;
+
+        case GF_OP_CMD_STATUS:
+                ret = 0;
+                goto out;
+
+        case GF_OP_CMD_PAUSE:
+        {
+                if (volinfo->decommission_in_progress) {
+                        if (volinfo->defrag == (void *)1)
+                                volinfo->defrag = NULL;
+
+                        if (volinfo->defrag) {
+                                LOCK (&volinfo->defrag->lock);
+
+                                volinfo->defrag_status = GF_DEFRAG_STATUS_PAUSED;
+
+                                UNLOCK (&volinfo->defrag->lock);
+                        }
+                }
+
+                /* rebalance '_cbk()' will take care of volume file updates */
+                ret = 0;
+                goto out;
+        }
+
+        case GF_OP_CMD_ABORT:
+        {
+                if (volinfo->decommission_in_progress) {
+                        if (volinfo->defrag == (void *)1)
+                                volinfo->defrag = NULL;
+
+                        if (volinfo->defrag) {
+                                LOCK (&volinfo->defrag->lock);
+
+                                volinfo->defrag_status = GF_DEFRAG_STATUS_STOPPED;
+
+                                UNLOCK (&volinfo->defrag->lock);
+                        }
+                }
+
+                /* rebalance '_cbk()' will take care of volume file updates */
+                ret = 0;
+                goto out;
+        }
+
+        case GF_OP_CMD_START:
+                force = 0;
+                break;
+
+        case GF_OP_CMD_COMMIT:
+                force = 1;
+                break;
+
+        case GF_OP_CMD_COMMIT_FORCE:
+
+                if (volinfo->decommission_in_progress) {
+                        if (volinfo->defrag == (void *)1)
+                                volinfo->defrag = NULL;
+
+                        if (volinfo->defrag) {
+                                LOCK (&volinfo->defrag->lock);
+                                /* Fake 'rebalance-complete' so the graph change
+                                   happens right away */
+                                volinfo->defrag_status = GF_DEFRAG_STATUS_COMPLETE;
+
+                                UNLOCK (&volinfo->defrag->lock);
+                        }
+                        ret = 0;
+                        /* Graph change happens in rebalance _cbk function,
+                           no need to do anything here */
+                        goto out;
+                }
+
+                force = 1;
+                break;
+        }
+
         ret = dict_get_int32 (dict, "count", &count);
         if (ret) {
                 gf_log ("", GF_LOG_ERROR, "Unable to get count");
@@ -887,26 +1090,46 @@ glusterd_op_remove_brick (dict_t *dict)
                         goto out;
                 }
 
-                ret = glusterd_op_perform_remove_brick (volinfo, brick);
+                ret = glusterd_op_perform_remove_brick (volinfo, brick, force,
+                                                        (i == 1) ? &need_rebalance : NULL);
                 if (ret)
                         goto out;
                 i++;
         }
 
         ret = glusterd_create_volfiles_and_notify_services (volinfo);
-        if (ret)
+        if (ret) {
+                gf_log (THIS->name, GF_LOG_WARNING, "failed to create volfiles");
                 goto out;
-
-        volinfo->defrag_status = 0;
+        }
 
         ret = glusterd_store_volinfo (volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT);
-
-        if (ret)
+        if (ret) {
+                gf_log (THIS->name, GF_LOG_WARNING, "failed to store volinfo");
                 goto out;
+        }
 
-        if (GLUSTERD_STATUS_STARTED == volinfo->status)
-                ret = glusterd_check_generate_start_nfs ();
+        volinfo->defrag_status = 0;
+        if (!force && need_rebalance) {
+                /* perform the rebalance operations */
+                ret = glusterd_handle_defrag_start (volinfo, err_str, 4096,
+                                                    GF_DEFRAG_CMD_START_FORCE,
+                                                    glusterd_remove_brick_migrate_cbk);
+                if (!ret)
+                        volinfo->decommission_in_progress = 1;
+
+                if (ret) {
+                        gf_log (THIS->name, GF_LOG_ERROR,
+                                "failed to start the rebalance");
+                }
+        } else {
+                if (GLUSTERD_STATUS_STARTED == volinfo->status)
+                        ret = glusterd_check_generate_start_nfs ();
+        }
 
 out:
+        if (ret && err_str[0] && op_errstr)
+                *op_errstr = gf_strdup (err_str);
+
         return ret;
 }
author	Amar Tumballi <amar@gluster.com>	2011-09-09 09:42:51 +0530
committer	Vijay Bellur <vijay@gluster.com>	2011-09-13 02:10:12 -0700
commit	25daa42911d2ff697880ee29c591cac5f2abebed (patch)
tree	9555284c052e1e205909e91f578a8b46b522ec56 /xlators/mgmt/glusterd/src/glusterd-brick-ops.c
parent	17e57f27c714c94dd5d9fa91650f83d069f2f4e4 (diff)