From 1392da3e237d8ea080573909015916e3544a6d2c Mon Sep 17 00:00:00 2001 From: Xavier Hernandez Date: Thu, 15 May 2014 10:35:14 +0200 Subject: cli/glusterd: Added support for dispersed volumes Two new options have been added to the 'create' command of the cli interface: disperse [] redundancy Both are optional. A dispersed volume is created by specifying, at least, one of them. If 'disperse' is missing or it's present but '' does not, the number of bricks enumerated in the command line is taken as the disperse count. If 'redundancy' is missing, the lowest optimal value is assumed. A configuration is considered optimal (for most workloads) when the disperse count - redundancy count is a power of 2. If the resulting redundancy is 1, the volume is created normally, but if it's greater than 1, a warning is shown to the user and he/she must answer yes/no to continue volume creation. If there isn't any optimal value for the given number of bricks, a warning is also shown and, if the user accepts, a redundancy of 1 is used. If 'redundancy' is specified and the resulting volume is not optimal, another warning is shown to the user. A distributed-disperse volume can be created using a number of bricks multiple of the disperse count. Change-Id: Iab93efbe78e905cdb91f54f3741599f7ea6645e4 BUG: 1118629 Signed-off-by: Xavier Hernandez Reviewed-on: http://review.gluster.org/7782 Tested-by: Gluster Build System Reviewed-by: Jeff Darcy Reviewed-by: Vijay Bellur --- xlators/mgmt/glusterd/src/glusterd-brick-ops.c | 15 +++++ xlators/mgmt/glusterd/src/glusterd-handler.c | 10 ++++ xlators/mgmt/glusterd/src/glusterd-store.c | 23 +++++++ xlators/mgmt/glusterd/src/glusterd-store.h | 2 + xlators/mgmt/glusterd/src/glusterd-utils.c | 80 ++++++++++++++++++++++--- xlators/mgmt/glusterd/src/glusterd-volgen.c | 24 ++++++++ xlators/mgmt/glusterd/src/glusterd-volume-ops.c | 21 +++++++ xlators/mgmt/glusterd/src/glusterd.h | 2 + 8 files changed, 168 insertions(+), 9 deletions(-) (limited to 'xlators/mgmt/glusterd/src') diff --git a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c index 452df759ad4..089c7d637c9 100644 --- a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c @@ -169,6 +169,12 @@ gd_addbr_validate_stripe_count (glusterd_volinfo_t *volinfo, int stripe_count, } } break; + case GF_CLUSTER_TYPE_DISPERSE: + snprintf (err_str, err_len, "Volume %s cannot be converted " + "from dispersed to striped-" + "dispersed", volinfo->volname); + gf_log(THIS->name, GF_LOG_ERROR, "%s", err_str); + goto out; } out: @@ -259,6 +265,12 @@ gd_addbr_validate_replica_count (glusterd_volinfo_t *volinfo, int replica_count, } } break; + case GF_CLUSTER_TYPE_DISPERSE: + snprintf (err_str, err_len, "Volume %s cannot be converted " + "from dispersed to replicated-" + "dispersed", volinfo->volname); + gf_log(THIS->name, GF_LOG_ERROR, "%s", err_str); + goto out; } out: return ret; @@ -276,6 +288,7 @@ gd_rmbr_validate_replica_count (glusterd_volinfo_t *volinfo, switch (volinfo->type) { case GF_CLUSTER_TYPE_NONE: case GF_CLUSTER_TYPE_STRIPE: + case GF_CLUSTER_TYPE_DISPERSE: snprintf (err_str, err_len, "replica count (%d) option given for non replicate " "volume %s", replica_count, volinfo->volname); @@ -737,6 +750,8 @@ __glusterd_handle_remove_brick (rpcsvc_request_t *req) strcpy (vol_type, "stripe"); } else if (volinfo->type == GF_CLUSTER_TYPE_STRIPE_REPLICATE) { strcpy (vol_type, "stripe-replicate"); + } else if (volinfo->type == GF_CLUSTER_TYPE_DISPERSE) { + strcpy (vol_type, "disperse"); } else { strcpy (vol_type, "distribute"); } diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c index ed4bd60f88b..e10dc22b56b 100644 --- a/xlators/mgmt/glusterd/src/glusterd-handler.c +++ b/xlators/mgmt/glusterd/src/glusterd-handler.c @@ -398,6 +398,16 @@ glusterd_add_volume_detail_to_dict (glusterd_volinfo_t *volinfo, if (ret) goto out; + snprintf (key, 256, "volume%d.disperse_count", count); + ret = dict_set_int32 (volumes, key, volinfo->disperse_count); + if (ret) + goto out; + + snprintf (key, 256, "volume%d.redundancy_count", count); + ret = dict_set_int32 (volumes, key, volinfo->redundancy_count); + if (ret) + goto out; + snprintf (key, 256, "volume%d.transport", count); ret = dict_set_int32 (volumes, key, volinfo->transport_type); if (ret) diff --git a/xlators/mgmt/glusterd/src/glusterd-store.c b/xlators/mgmt/glusterd/src/glusterd-store.c index c31d8a8ad71..086a6550a72 100644 --- a/xlators/mgmt/glusterd/src/glusterd-store.c +++ b/xlators/mgmt/glusterd/src/glusterd-store.c @@ -844,6 +844,18 @@ glusterd_volume_exclude_options_write (int fd, glusterd_volinfo_t *volinfo) if (ret) goto out; + snprintf (buf, sizeof (buf), "%d", volinfo->disperse_count); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_DISPERSE_CNT, + buf); + if (ret) + goto out; + + snprintf (buf, sizeof (buf), "%d", volinfo->redundancy_count); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_REDUNDANCY_CNT, + buf); + if (ret) + goto out; + snprintf (buf, sizeof (buf), "%d", volinfo->version); ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_VERSION, buf); if (ret) @@ -2618,6 +2630,12 @@ glusterd_store_update_volinfo (glusterd_volinfo_t *volinfo) } else if (!strncmp (key, GLUSTERD_STORE_KEY_VOL_REPLICA_CNT, strlen (GLUSTERD_STORE_KEY_VOL_REPLICA_CNT))) { volinfo->replica_count = atoi (value); + } else if (!strncmp (key, GLUSTERD_STORE_KEY_VOL_DISPERSE_CNT, + strlen (GLUSTERD_STORE_KEY_VOL_DISPERSE_CNT))) { + volinfo->disperse_count = atoi (value); + } else if (!strncmp (key, GLUSTERD_STORE_KEY_VOL_REDUNDANCY_CNT, + strlen (GLUSTERD_STORE_KEY_VOL_REDUNDANCY_CNT))) { + volinfo->redundancy_count = atoi (value); } else if (!strncmp (key, GLUSTERD_STORE_KEY_VOL_TRANSPORT, strlen (GLUSTERD_STORE_KEY_VOL_TRANSPORT))) { volinfo->transport_type = atoi (value); @@ -2754,6 +2772,11 @@ glusterd_store_update_volinfo (glusterd_volinfo_t *volinfo) GF_ASSERT (volinfo->replica_count > 0); break; + case GF_CLUSTER_TYPE_DISPERSE: + GF_ASSERT (volinfo->disperse_count > 0); + GF_ASSERT (volinfo->redundancy_count > 0); + break; + default: GF_ASSERT (0); break; diff --git a/xlators/mgmt/glusterd/src/glusterd-store.h b/xlators/mgmt/glusterd/src/glusterd-store.h index 89cf24de789..fb7de7b1b10 100644 --- a/xlators/mgmt/glusterd/src/glusterd-store.h +++ b/xlators/mgmt/glusterd/src/glusterd-store.h @@ -44,6 +44,8 @@ typedef enum glusterd_store_ver_ac_{ #define GLUSTERD_STORE_KEY_VOL_SUB_COUNT "sub_count" #define GLUSTERD_STORE_KEY_VOL_STRIPE_CNT "stripe_count" #define GLUSTERD_STORE_KEY_VOL_REPLICA_CNT "replica_count" +#define GLUSTERD_STORE_KEY_VOL_DISPERSE_CNT "disperse_count" +#define GLUSTERD_STORE_KEY_VOL_REDUNDANCY_CNT "redundancy_count" #define GLUSTERD_STORE_KEY_VOL_BRICK "brick" #define GLUSTERD_STORE_KEY_VOL_VERSION "version" #define GLUSTERD_STORE_KEY_VOL_TRANSPORT "transport-type" diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c index dc923b1eeb4..aff2356eb4f 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.c +++ b/xlators/mgmt/glusterd/src/glusterd-utils.c @@ -548,6 +548,8 @@ glusterd_volinfo_dup (glusterd_volinfo_t *volinfo, new_volinfo->type = volinfo->type; new_volinfo->replica_count = volinfo->replica_count; new_volinfo->stripe_count = volinfo->stripe_count; + new_volinfo->disperse_count = volinfo->disperse_count; + new_volinfo->redundancy_count = volinfo->redundancy_count; new_volinfo->dist_leaf_count = volinfo->dist_leaf_count; new_volinfo->sub_count = volinfo->sub_count; new_volinfo->transport_type = volinfo->transport_type; @@ -2524,6 +2526,18 @@ glusterd_add_volume_to_dict (glusterd_volinfo_t *volinfo, if (ret) goto out; + memset (key, 0, sizeof (key)); + snprintf (key, sizeof (key), "%s%d.disperse_count", prefix, count); + ret = dict_set_int32 (dict, key, volinfo->disperse_count); + if (ret) + goto out; + + memset (key, 0, sizeof (key)); + snprintf (key, sizeof (key), "%s%d.redundancy_count", prefix, count); + ret = dict_set_int32 (dict, key, volinfo->redundancy_count); + if (ret) + goto out; + memset (key, 0, sizeof (key)); snprintf (key, sizeof (key), "%s%d.dist_count", prefix, count); ret = dict_set_int32 (dict, key, volinfo->dist_leaf_count); @@ -4206,6 +4220,24 @@ glusterd_import_volinfo (dict_t *peer_data, int count, gf_log (THIS->name, GF_LOG_INFO, "peer is possibly old version"); + /* not having a 'disperse_count' key is not a error + (as peer may be of old version) */ + memset (key, 0, sizeof (key)); + snprintf (key, sizeof (key), "%s%d.disperse_count", prefix, count); + ret = dict_get_int32 (peer_data, key, &new_volinfo->disperse_count); + if (ret) + gf_log (THIS->name, GF_LOG_INFO, + "peer is possibly old version"); + + /* not having a 'redundancy_count' key is not a error + (as peer may be of old version) */ + memset (key, 0, sizeof (key)); + snprintf (key, sizeof (key), "%s%d.redundancy_count", prefix, count); + ret = dict_get_int32 (peer_data, key, &new_volinfo->redundancy_count); + if (ret) + gf_log (THIS->name, GF_LOG_INFO, + "peer is possibly old version"); + /* not having a 'dist_count' key is not a error (as peer may be of old version) */ memset (key, 0, sizeof (key)); @@ -6932,6 +6964,9 @@ glusterd_get_dist_leaf_count (glusterd_volinfo_t *volinfo) int rcount = volinfo->replica_count; int scount = volinfo->stripe_count; + if (volinfo->type == GF_CLUSTER_TYPE_DISPERSE) + return volinfo->disperse_count; + return (rcount ? rcount : 1) * (scount ? scount : 1); } @@ -11694,6 +11729,13 @@ gd_update_volume_op_versions (glusterd_volinfo_t *volinfo) } } + if (volinfo->type == GF_CLUSTER_TYPE_DISPERSE) { + if (volinfo->op_version < GD_OP_VERSION_3_6_0) + volinfo->op_version = GD_OP_VERSION_3_6_0; + if (volinfo->client_op_version < GD_OP_VERSION_3_6_0) + volinfo->client_op_version = GD_OP_VERSION_3_6_0; + } + return; } @@ -12774,7 +12816,7 @@ glusterd_volume_quorum_calculate (glusterd_volinfo_t *volinfo, dict_t *dict, goto out; } - up_count = volinfo->replica_count - down_count; + up_count = volinfo->dist_leaf_count - down_count; if (quorum_type && !strcmp (quorum_type, "fixed")) { if (up_count >= quorum_count) { @@ -12782,7 +12824,8 @@ glusterd_volume_quorum_calculate (glusterd_volinfo_t *volinfo, dict_t *dict, goto out; } } else { - if (volinfo->replica_count % 2 == 0) { + if ((GF_CLUSTER_TYPE_DISPERSE != volinfo->type) && + (volinfo->dist_leaf_count % 2 == 0)) { if ((up_count > quorum_count) || ((up_count == quorum_count) && first_brick_on)) { quorum_met = _gf_true; @@ -12835,8 +12878,9 @@ glusterd_volume_quorum_check (glusterd_volinfo_t *volinfo, int64_t index, goto out; } - if (!glusterd_is_volume_replicate (volinfo) || - volinfo->replica_count < 3) { + if ((!glusterd_is_volume_replicate (volinfo) || + volinfo->replica_count < 3) && + (GF_CLUSTER_TYPE_DISPERSE != volinfo->type)) { for (i = 0; i < volinfo->brick_count ; i++) { /* for a pure distribute volume, and replica volume with replica count 2, quorum is not met if even @@ -12858,7 +12902,8 @@ glusterd_volume_quorum_check (glusterd_volinfo_t *volinfo, int64_t index, ret = 0; quorum_met = _gf_true; } else { - distribute_subvols = volinfo->brick_count / volinfo->replica_count; + distribute_subvols = volinfo->brick_count / + volinfo->dist_leaf_count; for (j = 0; j < distribute_subvols; j++) { // by default assume quorum is not met /* TODO: Handle distributed striped replicate volumes @@ -12867,11 +12912,11 @@ glusterd_volume_quorum_check (glusterd_volinfo_t *volinfo, int64_t index, */ ret = 1; quorum_met = _gf_false; - for (i = 0; i < volinfo->replica_count; i++) { + for (i = 0; i < volinfo->dist_leaf_count; i++) { snprintf (key, sizeof (key), "%s%"PRId64".brick%"PRId64".status", key_prefix, index, - (j * volinfo->replica_count) + i); + (j * volinfo->dist_leaf_count) + i); ret = dict_get_int32 (dict, key, &brick_online); if (ret || !brick_online) { if (i == 0) @@ -13043,6 +13088,9 @@ glusterd_snap_quorum_check_for_create (dict_t *dict, gf_boolean_t snap_volume, else quorum_count = volinfo->replica_count/2 + 1; + } else if (GF_CLUSTER_TYPE_DISPERSE == volinfo->type) { + quorum_count = volinfo->disperse_count - + volinfo->redundancy_count; } else { quorum_count = volinfo->brick_count; } @@ -13061,8 +13109,22 @@ glusterd_snap_quorum_check_for_create (dict_t *dict, gf_boolean_t snap_volume, if the quorum-type option is not set to auto, the behavior is set to the default behavior) */ - if (!ret) - quorum_count = tmp; + if (!ret) { + /* for dispersed volumes, only allow quorums + equal or larger than minimum functional + value. + */ + if ((GF_CLUSTER_TYPE_DISPERSE != + volinfo->type) || + (tmp >= quorum_count)) { + quorum_count = tmp; + } else { + gf_log(this->name, GF_LOG_INFO, + "Ignoring small quorum-count " + "(%d) on dispersed volume", tmp); + quorum_type = NULL; + } + } else quorum_type = NULL; } diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c index 6ab899a16cf..9701c6b939c 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volgen.c +++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c @@ -2684,10 +2684,14 @@ volume_volgen_graph_build_clusters (volgen_graph_t *graph, "%s-replicate-%d"}; char *stripe_args[] = {"cluster/stripe", "%s-stripe-%d"}; + char *disperse_args[] = {"cluster/disperse", + "%s-disperse-%d"}; + char option[32] = ""; int rclusters = 0; int clusters = 0; int dist_count = 0; int ret = -1; + xlator_t * ec = NULL; if (!volinfo->dist_leaf_count) goto out; @@ -2737,6 +2741,26 @@ volume_volgen_graph_build_clusters (volgen_graph_t *graph, if (clusters < 0) goto out; break; + case GF_CLUSTER_TYPE_DISPERSE: + clusters = volgen_graph_build_clusters (graph, volinfo, + disperse_args[0], + disperse_args[1], + volinfo->brick_count, + volinfo->disperse_count); + if (clusters < 0) + goto out; + + sprintf(option, "%d", volinfo->redundancy_count); + ec = first_of (graph); + while (clusters-- > 0) { + ret = xlator_set_option (ec, "redundancy", option); + if (ret) + goto out; + + ec = ec->next; + } + + break; default: gf_log ("", GF_LOG_ERROR, "volume inconsistency: " "unrecognized clustering type"); diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c index 53beebe0555..f23a9eb96b7 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c @@ -1689,6 +1689,27 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr) "replica count for volume %s", volname); goto out; } + } else if (GF_CLUSTER_TYPE_DISPERSE == volinfo->type) { + ret = dict_get_int32 (dict, "disperse-count", + &volinfo->disperse_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get " + "disperse count for volume %s", volname); + goto out; + } + ret = dict_get_int32 (dict, "redundancy-count", + &volinfo->redundancy_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get " + "redundancy count for volume %s", volname); + goto out; + } + if (priv->op_version < GD_OP_VERSION_3_6_0) { + gf_log (this->name, GF_LOG_ERROR, "Disperse volume " + "needs op-version 3.6.0 or higher"); + ret = -1; + goto out; + } } /* dist-leaf-count is the count of brick nodes for a given diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h index a8ecb505a5b..ddbb2c81338 100644 --- a/xlators/mgmt/glusterd/src/glusterd.h +++ b/xlators/mgmt/glusterd/src/glusterd.h @@ -336,6 +336,8 @@ struct glusterd_volinfo_ { int sub_count; /* backward compatibility */ int stripe_count; int replica_count; + int disperse_count; + int redundancy_count; int subvol_count; /* Number of subvolumes in a distribute volume */ int dist_leaf_count; /* Number of bricks in one -- cgit