From 9b223b15ab69fce4076de036ee162f36a058bcd2 Mon Sep 17 00:00:00 2001 From: Vishal Pandey Date: Wed, 24 Apr 2019 13:37:16 +0530 Subject: glusterd/thin-arbiter: Thin-arbiter integration with GD1 gluster volume create VOLNAME replica 2 thin-arbiter 1 HOST1:BRICK1 HOST2:BRICK2 TA-HOST:TA-BRICK [force] The changes have been made in a way that the last brick in the bricks list will be treated as the thin-arbiter. GD1 will be manipulated to treat the replica count as 2 and continue creating the volume like any other replica 2 volume, but since thin-arbiter volumes need ta-brick client xlator entries for each subvolume in the fuse volfile, volfile generation is modified to inject these entries separately in the volfile for every subvolume. A few more additions - 1- Save the volinfo with the new fields ta_bricks list and thin_arbiter_count. 2- Introduce a new option client.ta-brick-port to add remote-port to the ta-brick xlator entry in fuse volfiles. The option can be set using the following CLI syntax - gluster volume set VOLNAME client.ta-brick-port PORT 3- Volume Info will contain a Thin-Arbiter-path entry to distinguish thin-arbiter volumes from other replicate volumes. 
Change-Id: Ib434e2313b29716f32476c6c211d282c4ef39406 Updates #687 Signed-off-by: Vishal Pandey --- cli/src/cli-cmd-parser.c | 145 ++++++++++++- cli/src/cli-cmd-volume.c | 4 +- cli/src/cli-rpc-ops.c | 15 ++ doc/gluster.8 | 2 +- heal/src/glfs-heal.c | 3 +- tests/basic/glusterd/thin-arbiter-volume-probe.t | 25 +++ tests/basic/glusterd/thin-arbiter-volume.t | 45 ++++ xlators/mgmt/glusterd/src/glusterd-handler.c | 27 +++ xlators/mgmt/glusterd/src/glusterd-store.c | 236 ++++++++++++++++++++- xlators/mgmt/glusterd/src/glusterd-store.h | 2 + xlators/mgmt/glusterd/src/glusterd-utils.c | 253 ++++++++++++++++++++++- xlators/mgmt/glusterd/src/glusterd-utils.h | 7 + xlators/mgmt/glusterd/src/glusterd-volgen.c | 144 ++++++++++++- xlators/mgmt/glusterd/src/glusterd-volume-ops.c | 74 ++++++- xlators/mgmt/glusterd/src/glusterd-volume-set.c | 8 + xlators/mgmt/glusterd/src/glusterd.h | 2 + 16 files changed, 961 insertions(+), 31 deletions(-) create mode 100644 tests/basic/glusterd/thin-arbiter-volume-probe.t create mode 100644 tests/basic/glusterd/thin-arbiter-volume.t diff --git a/cli/src/cli-cmd-parser.c b/cli/src/cli-cmd-parser.c index a6ce49035d9..decdd10cb50 100644 --- a/cli/src/cli-cmd-parser.c +++ b/cli/src/cli-cmd-parser.c @@ -79,6 +79,95 @@ str_getunamb(const char *tok, char **opwords) return (char *)cli_getunamb(tok, (void **)opwords, id_sel); } +int32_t +cli_cmd_ta_brick_parse(const char **words, int wordcount, char **ta_brick) +{ + char *host_name = NULL; + char *tmp_host = NULL; + char *delimiter = NULL; + cli_brick_t *brick = NULL; + int ret = 0; + + GF_ASSERT(words); + GF_ASSERT(wordcount); + + if (validate_brick_name((char *)words[wordcount - 1])) { + cli_err( + "Wrong brick type: %s, use :" + "", + words[wordcount - 1]); + ret = -1; + goto out; + } else { + delimiter = strrchr(words[wordcount - 1], ':'); + ret = gf_canonicalize_path(delimiter + 1); + if (ret) + goto out; + } + + tmp_host = gf_strdup((char *)words[wordcount - 1]); + if (!tmp_host) { + gf_log("cli", 
GF_LOG_ERROR, "Out of memory"); + ret = -1; + goto out; + } + get_host_name(tmp_host, &host_name); + if (!host_name) { + ret = -1; + gf_log("cli", GF_LOG_ERROR, + "Unable to retrieve " + "hostname"); + goto out; + } + + if (!(strcmp(host_name, "localhost") && strcmp(host_name, "127.0.0.1") && + strncmp(host_name, "0.", 2))) { + cli_err( + "Please provide a valid hostname/ip other " + "than localhost, 127.0.0.1 or loopback " + "address (0.0.0.0 to 0.255.255.255)."); + ret = -1; + goto out; + } + if (!valid_internet_address(host_name, _gf_false, _gf_false)) { + cli_err( + "internet address '%s' does not conform to " + "standards", + host_name); + } + + brick = GF_MALLOC(sizeof(cli_brick_t), gf_common_list_node); + if (brick == NULL) { + ret = -1; + gf_log("cli", GF_LOG_ERROR, "Out of memory"); + goto out; + } + + brick->name = words[wordcount - 1]; + brick->len = strlen(words[wordcount - 1]); + *ta_brick = GF_MALLOC(brick->len + 3, gf_common_mt_char); + if (*ta_brick == NULL) { + ret = -1; + gf_log("cli", GF_LOG_ERROR, "Out of memory"); + goto out; + } + + strcat(*ta_brick, " "); + strcat(*ta_brick, brick->name); + strcat(*ta_brick, " "); +out: + if (tmp_host) { + GF_FREE(tmp_host); + tmp_host = NULL; + } + if (brick) { + GF_FREE(brick); + brick = NULL; + } + + return ret; +} + int32_t cli_cmd_bricks_parse(const char **words, int wordcount, int brick_index, char **bricks, int *brick_count) @@ -476,14 +565,17 @@ cli_cmd_volume_create_parse(struct cli_state *state, const char **words, char *trans_type = NULL; int32_t index = 0; char *bricks = NULL; + char *ta_brick = NULL; int32_t brick_count = 0; - char *opwords[] = {"replica", "stripe", "transport", "disperse", - "redundancy", "disperse-data", "arbiter", NULL}; + char *opwords[] = {"replica", "stripe", "transport", + "disperse", "redundancy", "disperse-data", + "arbiter", "thin-arbiter", NULL}; char *w = NULL; int op_count = 0; int32_t replica_count = 1; int32_t arbiter_count = 0; + int32_t thin_arbiter_count = 0; 
int32_t stripe_count = 1; int32_t disperse_count = -1; int32_t redundancy_count = -1; @@ -581,6 +673,25 @@ cli_cmd_volume_create_parse(struct cli_state *state, const char **words, if (ret) goto out; index += 2; + } else if (!strcmp(words[index], "thin-arbiter")) { + ret = gf_string2int(words[index + 1], &thin_arbiter_count); + if ((ret == -1) || (thin_arbiter_count != 1)) { + cli_err( + "For thin-arbiter " + "configuration, " + "replica count must be" + " 2 and thin-arbiter count " + "must be 1. The 3rd " + "brick of the replica " + "will be the thin-arbiter brick"); + ret = -1; + goto out; + } + ret = dict_set_int32(dict, "thin-arbiter-count", + thin_arbiter_count); + if (ret) + goto out; + index += 2; } } @@ -589,7 +700,7 @@ cli_cmd_volume_create_parse(struct cli_state *state, const char **words, if ((arbiter_count == 1) && (replica_count == 2)) replica_count += arbiter_count; - if (replica_count == 2) { + if (replica_count == 2 && thin_arbiter_count == 0) { if (strcmp(words[wordcount - 1], "force")) { question = "Replica 2 volumes are prone" @@ -657,6 +768,12 @@ cli_cmd_volume_create_parse(struct cli_state *state, const char **words, "option."); ret = -1; goto out; + } else if ((strcmp(w, "thin-arbiter") == 0)) { + cli_err( + "thin-arbiter option must be preceded by replica " + "option."); + ret = -1; + goto out; } else { GF_ASSERT(!"opword mismatch"); ret = -1; @@ -680,7 +797,20 @@ cli_cmd_volume_create_parse(struct cli_state *state, const char **words, wc = wordcount - 1; } - ret = cli_cmd_bricks_parse(words, wc, brick_index, &bricks, &brick_count); + // Exclude the thin-arbiter-brick i.e. 
last brick in the bricks list + if (thin_arbiter_count == 1) { + ret = cli_cmd_bricks_parse(words, wc - 1, brick_index, &bricks, + &brick_count); + if (ret) + goto out; + + ret = cli_cmd_ta_brick_parse(words, wc, &ta_brick); + + } else { + ret = cli_cmd_bricks_parse(words, wc, brick_index, &bricks, + &brick_count); + } + if (ret) goto out; @@ -739,6 +869,12 @@ cli_cmd_volume_create_parse(struct cli_state *state, const char **words, if (ret) goto out; + if (thin_arbiter_count == 1) { + ret = dict_set_dynstr(dict, "ta-brick", ta_brick); + if (ret) + goto out; + } + ret = dict_set_int32(dict, "count", brick_count); if (ret) goto out; @@ -752,6 +888,7 @@ cli_cmd_volume_create_parse(struct cli_state *state, const char **words, out: if (ret) { GF_FREE(bricks); + GF_FREE(ta_brick); gf_log("cli", GF_LOG_ERROR, "Unable to parse create volume CLI"); if (dict) dict_unref(dict); diff --git a/cli/src/cli-cmd-volume.c b/cli/src/cli-cmd-volume.c index f2948a3cbdb..c6f08985b12 100644 --- a/cli/src/cli-cmd-volume.c +++ b/cli/src/cli-cmd-volume.c @@ -2999,9 +2999,9 @@ struct cli_cmd volume_cmds[] = { "list information of all volumes"}, {"volume create [stripe ] " - "[replica [arbiter ]] " + "[[replica [arbiter ]]|[replica 2 thin-arbiter 1]] " "[disperse []] [disperse-data ] [redundancy ] " - "[transport ] " + "[transport ] " "... 
[force]", cli_cmd_volume_create_cbk, diff --git a/cli/src/cli-rpc-ops.c b/cli/src/cli-rpc-ops.c index 16d5f73983f..35985ab44c6 100644 --- a/cli/src/cli-rpc-ops.c +++ b/cli/src/cli-rpc-ops.c @@ -723,10 +723,12 @@ gf_cli_get_volume_cbk(struct rpc_req *req, struct iovec *iov, int count, int32_t redundancy_count = 0; int32_t arbiter_count = 0; int32_t snap_count = 0; + int32_t thin_arbiter_count = 0; int32_t vol_type = 0; int32_t transport = 0; char *volume_id_str = NULL; char *volname = NULL; + char *ta_brick = NULL; dict_t *dict = NULL; cli_local_t *local = NULL; char key[1024] = {0}; @@ -903,6 +905,11 @@ xml_output: if (ret) goto out; + snprintf(key, 256, "volume%d.thin_arbiter_count", i); + ret = dict_get_int32(dict, key, &thin_arbiter_count); + if (ret) + goto out; + // Distributed (stripe/replicate/stripe-replica) setups vol_type = get_vol_type(type, dist_count, brick_count); @@ -929,6 +936,14 @@ xml_output: if (ret) goto out; + if (thin_arbiter_count) { + snprintf(key, 1024, "volume%d.thin_arbiter_brick", i); + ret = dict_get_str(dict, key, &ta_brick); + if (ret) + goto out; + cli_out("Thin-arbiter-path: %s", ta_brick); + } + snprintf(key, 256, "volume%d.opt_count", i); ret = dict_get_int32(dict, key, &opt_count); if (ret) diff --git a/doc/gluster.8 b/doc/gluster.8 index 4f36c13d45f..99a8d5e5048 100644 --- a/doc/gluster.8 +++ b/doc/gluster.8 @@ -41,7 +41,7 @@ List all volumes in cluster \fB\ volume status [all | [nfs|shd||quotad]] [detail|clients|mem|inode|fd|callpool|tasks|client-list] \fR Display status of all or specified volume(s)/brick .TP -\fB\ volume create [stripe ] [replica ] [disperse []] [redundancy ] [transport ] ... \fR +\fB\ volume create [stripe ] [[replica [arbiter ]]|[replica 2 thin-arbiter 1]] [disperse []] [redundancy ] [transport ] ... \fR Create a new volume of the specified type using the specified bricks and transport type (the default transport type is tcp). 
To create a volume with both transports (tcp and rdma), give 'transport tcp,rdma' as an option. .TP diff --git a/heal/src/glfs-heal.c b/heal/src/glfs-heal.c index ce6925a281b..3ebf79eee14 100644 --- a/heal/src/glfs-heal.c +++ b/heal/src/glfs-heal.c @@ -1144,7 +1144,8 @@ glfsh_gather_heal_info(glfs_t *fs, xlator_t *top_subvol, loc_t *rootloc, while (xl->next) xl = xl->next; while (xl) { - if (strcmp(xl->type, "protocol/client") == 0) { + if (strcmp(xl->type, "protocol/client") == 0 && + !strstr(xl->name, "-ta-")) { heal_xl = _get_ancestor(xl, heal_op); if (heal_xl) { old_THIS = THIS; diff --git a/tests/basic/glusterd/thin-arbiter-volume-probe.t b/tests/basic/glusterd/thin-arbiter-volume-probe.t new file mode 100644 index 00000000000..acc6943806d --- /dev/null +++ b/tests/basic/glusterd/thin-arbiter-volume-probe.t @@ -0,0 +1,25 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../cluster.rc + +#This tests if the thin-arbiter-count is transferred to the other peer. +function check_peers { + $CLI_1 peer status | grep 'Peer in Cluster (Connected)' | wc -l +} + +cleanup; + +TEST launch_cluster 2; +TEST $CLI_1 peer probe $H2; + +EXPECT_WITHIN $PROBE_TIMEOUT 1 check_peers + +kill_glusterd 2 +$CLI_1 volume create $V0 replica 2 thin-arbiter 1 $H0:$B0/b{1..3} +TEST $glusterd_2 +EXPECT_WITHIN $PROBE_TIMEOUT 1 check_peers +EXPECT "1 x 2 = 2" volinfo_field_1 $V0 "Number of Bricks" +EXPECT "1 x 2 = 2" volinfo_field_2 $V0 "Number of Bricks" + +cleanup; diff --git a/tests/basic/glusterd/thin-arbiter-volume.t b/tests/basic/glusterd/thin-arbiter-volume.t new file mode 100644 index 00000000000..4e813890a45 --- /dev/null +++ b/tests/basic/glusterd/thin-arbiter-volume.t @@ -0,0 +1,45 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../ volume.rc +. $(dirname $0)/../../thin-arbiter.rc + +#This command tests the volume create command validation for thin-arbiter volumes. 
+ +cleanup; + +TEST glusterd +TEST pidof glusterd + +TEST $CLI volume create $V0 replica 2 thin-arbiter 1 $H0:$B0/b1 $H0:$B0/b2 $H0:$B0/b3 +EXPECT "1 x 2 = 2" volinfo_field $V0 "Number of Bricks" +TEST $CLI volume start $V0 + +TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0; + +TEST touch $M0/a.txt +TEST ls $B0/b1/a.txt +TEST ls $B0/b2/a.txt +TEST ! ls $B0/b3/a.txt + +TEST umount $M0 +TEST $CLI volume stop $V0 +TEST $CLI volume delete $V0 + +TEST $CLI volume create $V0 replica 2 thin-arbiter 1 $H0:$B0/b{4..8} +EXPECT "2 x 2 = 4" volinfo_field $V0 "Number of Bricks" + +TEST $CLI volume delete $V0 + +TEST rm -rf $B0/b{1..3} + +TEST $CLI volume create $V0 replica 2 thin-arbiter 1 $H0:$B0/b1 $H0:$B0/b2 $H0:$B0/b3 +EXPECT "1 x 2 = 2" volinfo_field $V0 "Number of Bricks" + +TEST killall -15 glusterd +TEST glusterd +TEST pidof glusterd +EXPECT "1 x 2 = 2" volinfo_field $V0 "Number of Bricks" + +cleanup + diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c index 576cae7b57a..a0bf409d872 100644 --- a/xlators/mgmt/glusterd/src/glusterd-handler.c +++ b/xlators/mgmt/glusterd/src/glusterd-handler.c @@ -357,6 +357,7 @@ glusterd_add_volume_detail_to_dict(glusterd_volinfo_t *volinfo, dict_t *volumes, }; int keylen; glusterd_brickinfo_t *brickinfo = NULL; + glusterd_brickinfo_t *ta_brickinfo = NULL; char *buf = NULL; int i = 1; dict_t *dict = NULL; @@ -368,6 +369,10 @@ glusterd_add_volume_detail_to_dict(glusterd_volinfo_t *volinfo, dict_t *volumes, xlator_t *this = NULL; int32_t len = 0; + char ta_brick[4096] = { + 0, + }; + GF_ASSERT(volinfo); GF_ASSERT(volumes); @@ -431,6 +436,11 @@ glusterd_add_volume_detail_to_dict(glusterd_volinfo_t *volinfo, dict_t *volumes, if (ret) goto out; + keylen = snprintf(key, sizeof(key), "volume%d.thin_arbiter_count", count); + ret = dict_set_int32n(volumes, key, keylen, volinfo->thin_arbiter_count); + if (ret) + goto out; + volume_id_str = gf_strdup(uuid_utoa(volinfo->volume_id)); if 
(!volume_id_str) goto out; @@ -481,6 +491,23 @@ glusterd_add_volume_detail_to_dict(glusterd_volinfo_t *volinfo, dict_t *volumes, i++; } + if (volinfo->thin_arbiter_count == 1) { + ta_brickinfo = list_first_entry(&volinfo->ta_bricks, + glusterd_brickinfo_t, brick_list); + len = snprintf(ta_brick, sizeof(ta_brick), "%s:%s", + ta_brickinfo->hostname, ta_brickinfo->path); + if ((len < 0) || (len >= sizeof(ta_brick))) { + ret = -1; + goto out; + } + buf = gf_strdup(ta_brick); + keylen = snprintf(key, sizeof(key), "volume%d.thin_arbiter_brick", + count); + ret = dict_set_dynstrn(volumes, key, keylen, buf); + if (ret) + goto out; + } + ret = glusterd_add_arbiter_info_to_bricks(volinfo, volumes, count); if (ret) goto out; diff --git a/xlators/mgmt/glusterd/src/glusterd-store.c b/xlators/mgmt/glusterd/src/glusterd-store.c index fc0df1155a9..311e7d3abf3 100644 --- a/xlators/mgmt/glusterd/src/glusterd-store.c +++ b/xlators/mgmt/glusterd/src/glusterd-store.c @@ -266,7 +266,8 @@ out: int32_t glusterd_store_volinfo_brick_fname_write(int vol_fd, glusterd_brickinfo_t *brickinfo, - int32_t brick_count) + int32_t brick_count, + int is_thin_arbiter) { char key[64] = { 0, @@ -276,8 +277,13 @@ glusterd_store_volinfo_brick_fname_write(int vol_fd, }; int32_t ret = -1; - snprintf(key, sizeof(key), "%s-%d", GLUSTERD_STORE_KEY_VOL_BRICK, - brick_count); + if (!is_thin_arbiter) { + snprintf(key, sizeof(key), "%s-%d", GLUSTERD_STORE_KEY_VOL_BRICK, + brick_count); + } else { + snprintf(key, sizeof(key), "%s-%d", GLUSTERD_STORE_KEY_VOL_TA_BRICK, + brick_count); + } glusterd_store_brickinfofname_set(brickinfo, brickfname, sizeof(brickfname)); ret = gf_store_save_value(vol_fd, key, brickfname); @@ -498,14 +504,14 @@ glusterd_store_perform_brick_store(glusterd_brickinfo_t *brickinfo) ret = -1; goto out; } - ret = glusterd_store_brickinfo_write(fd, brickinfo); if (ret) goto out; out: - if (ret && (fd > 0)) + if (ret && (fd > 0)) { gf_store_unlink_tmppath(brickinfo->shandle); + } 
gf_msg_debug(THIS->name, 0, "Returning %d", ret); return ret; } @@ -553,15 +559,15 @@ out: static int32_t glusterd_store_brickinfo(glusterd_volinfo_t *volinfo, glusterd_brickinfo_t *brickinfo, int32_t brick_count, - int vol_fd) + int vol_fd, int is_thin_arbiter) { int32_t ret = -1; GF_ASSERT(volinfo); GF_ASSERT(brickinfo); - ret = glusterd_store_volinfo_brick_fname_write(vol_fd, brickinfo, - brick_count); + ret = glusterd_store_volinfo_brick_fname_write( + vol_fd, brickinfo, brick_count, is_thin_arbiter); if (ret) goto out; @@ -988,6 +994,18 @@ glusterd_volume_exclude_options_write(int fd, glusterd_volinfo_t *volinfo) total_len += ret; } + if ((conf->op_version >= GD_OP_VERSION_7_0) && + volinfo->thin_arbiter_count) { + ret = snprintf(buf + total_len, sizeof(buf) - total_len, "%s=%d\n", + GLUSTERD_STORE_KEY_VOL_THIN_ARBITER_CNT, + volinfo->thin_arbiter_count); + if (ret < 0 || ret >= sizeof(buf) - total_len) { + ret = -1; + goto out; + } + total_len += ret; + } + ret = gf_store_save_items(fd, buf); if (ret) goto out; @@ -1320,17 +1338,29 @@ glusterd_store_brickinfos(glusterd_volinfo_t *volinfo, int vol_fd) { int32_t ret = 0; glusterd_brickinfo_t *brickinfo = NULL; + glusterd_brickinfo_t *ta_brickinfo = NULL; int32_t brick_count = 0; + int32_t ta_brick_count = 0; GF_ASSERT(volinfo); cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list) { - ret = glusterd_store_brickinfo(volinfo, brickinfo, brick_count, vol_fd); + ret = glusterd_store_brickinfo(volinfo, brickinfo, brick_count, vol_fd, + 0); if (ret) goto out; brick_count++; } + if (volinfo->thin_arbiter_count == 1) { + ta_brickinfo = list_first_entry(&volinfo->ta_bricks, + glusterd_brickinfo_t, brick_list); + ret = glusterd_store_brickinfo(volinfo, ta_brickinfo, ta_brick_count, + vol_fd, 1); + if (ret) + goto out; + } + out: gf_msg_debug(THIS->name, 0, "Returning %d", ret); return ret; @@ -1507,6 +1537,7 @@ glusterd_store_brickinfos_atomic_update(glusterd_volinfo_t *volinfo) { int ret = -1; 
glusterd_brickinfo_t *brickinfo = NULL; + glusterd_brickinfo_t *ta_brickinfo = NULL; GF_ASSERT(volinfo); @@ -1516,6 +1547,15 @@ glusterd_store_brickinfos_atomic_update(glusterd_volinfo_t *volinfo) if (ret) goto out; } + + if (volinfo->thin_arbiter_count == 1) { + ta_brickinfo = list_first_entry(&volinfo->ta_bricks, + glusterd_brickinfo_t, brick_list); + ret = gf_store_rename_tmppath(ta_brickinfo->shandle); + if (ret) + goto out; + } + out: return ret; } @@ -1670,6 +1710,7 @@ glusterd_store_volinfo(glusterd_volinfo_t *volinfo, unlock: pthread_mutex_unlock(&volinfo->store_volinfo_lock); pthread_mutex_unlock(&ctx->cleanup_lock); + if (ret) glusterd_store_volume_cleanup_tmp(volinfo); @@ -2435,6 +2476,7 @@ glusterd_store_retrieve_bricks(glusterd_volinfo_t *volinfo) { int32_t ret = 0; glusterd_brickinfo_t *brickinfo = NULL; + glusterd_brickinfo_t *ta_brickinfo = NULL; gf_store_iter_t *iter = NULL; char *key = NULL; char *value = NULL; @@ -2446,6 +2488,7 @@ glusterd_store_retrieve_bricks(glusterd_volinfo_t *volinfo) }; glusterd_conf_t *priv = NULL; int32_t brick_count = 0; + int32_t ta_brick_count = 0; char tmpkey[4096] = { 0, }; @@ -2455,6 +2498,10 @@ glusterd_store_retrieve_bricks(glusterd_volinfo_t *volinfo) struct pmap_registry *pmap = NULL; xlator_t *this = NULL; int brickid = 0; + /* ta_brick_id initialization with 2 since ta-brick id starts with + * volname-ta-2 + */ + int ta_brick_id = 2; gf_store_op_errno_t op_errno = GD_STORE_SUCCESS; int32_t len = 0; @@ -2748,6 +2795,175 @@ glusterd_store_retrieve_bricks(glusterd_volinfo_t *volinfo) brick_count++; } + ret = gf_store_iter_new(volinfo->shandle, &tmpiter); + + if (ret) + goto out; + + if (volinfo->thin_arbiter_count == 1) { + while (ta_brick_count < volinfo->subvol_count) { + ret = glusterd_brickinfo_new(&ta_brickinfo); + if (ret) + goto out; + + snprintf(tmpkey, sizeof(tmpkey), "%s-%d", + GLUSTERD_STORE_KEY_VOL_TA_BRICK, 0); + + ret = gf_store_iter_get_matching(tmpiter, tmpkey, &tmpvalue); + + len = 
snprintf(path, sizeof(path), "%s/%s", brickdir, tmpvalue); + if ((len < 0) || (len >= sizeof(path))) { + ret = -1; + goto out; + } + + ret = gf_store_handle_retrieve(path, &ta_brickinfo->shandle); + + if (ret) + goto out; + + ret = gf_store_iter_new(ta_brickinfo->shandle, &iter); + + if (ret) + goto out; + + ret = gf_store_iter_get_next(iter, &key, &value, &op_errno); + if (ret) { + gf_msg("glusterd", GF_LOG_ERROR, op_errno, + GD_MSG_STORE_ITER_GET_FAIL, + "Unable to iterate " + "the store for brick: %s", + path); + goto out; + } + + while (!ret) { + if (!strncmp(key, GLUSTERD_STORE_KEY_BRICK_HOSTNAME, + SLEN(GLUSTERD_STORE_KEY_BRICK_HOSTNAME))) { + if (snprintf(ta_brickinfo->hostname, + sizeof(ta_brickinfo->hostname), "%s", + value) >= sizeof(ta_brickinfo->hostname)) { + gf_msg("glusterd", GF_LOG_ERROR, op_errno, + GD_MSG_PARSE_BRICKINFO_FAIL, + "brick hostname truncated: %s", + ta_brickinfo->hostname); + goto out; + } + } else if (!strncmp(key, GLUSTERD_STORE_KEY_BRICK_PATH, + SLEN(GLUSTERD_STORE_KEY_BRICK_PATH))) { + if (snprintf(ta_brickinfo->path, sizeof(ta_brickinfo->path), + "%s", value) >= sizeof(ta_brickinfo->path)) { + gf_msg("glusterd", GF_LOG_ERROR, op_errno, + GD_MSG_PARSE_BRICKINFO_FAIL, + "brick path truncated: %s", ta_brickinfo->path); + goto out; + } + } else if (!strncmp(key, GLUSTERD_STORE_KEY_BRICK_REAL_PATH, + SLEN(GLUSTERD_STORE_KEY_BRICK_REAL_PATH))) { + if (snprintf(ta_brickinfo->real_path, + sizeof(ta_brickinfo->real_path), "%s", + value) >= sizeof(ta_brickinfo->real_path)) { + gf_msg("glusterd", GF_LOG_ERROR, op_errno, + GD_MSG_PARSE_BRICKINFO_FAIL, + "real_path truncated: %s", + ta_brickinfo->real_path); + goto out; + } + } else if (!strncmp(key, GLUSTERD_STORE_KEY_BRICK_PORT, + SLEN(GLUSTERD_STORE_KEY_BRICK_PORT))) { + ret = gf_string2int(value, &ta_brickinfo->port); + if (ret == -1) { + gf_msg(this->name, GF_LOG_ERROR, EINVAL, + GD_MSG_INCOMPATIBLE_VALUE, + "Failed to convert " + "string to integer"); + } + + if (ta_brickinfo->port < 
priv->base_port) { + /* This is required to adhere to the + IANA standards */ + ta_brickinfo->port = 0; + } + } else if (!strncmp(key, GLUSTERD_STORE_KEY_BRICK_RDMA_PORT, + SLEN(GLUSTERD_STORE_KEY_BRICK_RDMA_PORT))) { + ret = gf_string2int(value, &ta_brickinfo->rdma_port); + if (ret == -1) { + gf_msg(this->name, GF_LOG_ERROR, EINVAL, + GD_MSG_INCOMPATIBLE_VALUE, + "Failed to convert " + "string to integer"); + } + + if (ta_brickinfo->rdma_port < priv->base_port) { + /* This is required to adhere to the + IANA standards */ + ta_brickinfo->rdma_port = 0; + } + } else if (!strncmp( + key, GLUSTERD_STORE_KEY_BRICK_DECOMMISSIONED, + SLEN(GLUSTERD_STORE_KEY_BRICK_DECOMMISSIONED))) { + ret = gf_string2int(value, &ta_brickinfo->decommissioned); + if (ret == -1) { + gf_msg(this->name, GF_LOG_ERROR, EINVAL, + GD_MSG_INCOMPATIBLE_VALUE, + "Failed to convert " + "string to integer"); + } + + } else if (!strcmp(key, GLUSTERD_STORE_KEY_BRICK_ID)) { + if (snprintf(ta_brickinfo->brick_id, + sizeof(ta_brickinfo->brick_id), "%s", + value) >= sizeof(ta_brickinfo->brick_id)) { + gf_msg("glusterd", GF_LOG_ERROR, op_errno, + GD_MSG_PARSE_BRICKINFO_FAIL, + "brick_id truncated: %s", + ta_brickinfo->brick_id); + goto out; + } + } else if (!strncmp(key, GLUSTERD_STORE_KEY_BRICK_FSID, + SLEN(GLUSTERD_STORE_KEY_BRICK_FSID))) { + ret = gf_string2uint64(value, &ta_brickinfo->statfs_fsid); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, + GD_MSG_INVALID_ENTRY, + "%s " + "is not a valid uint64_t value", + value); + } + } else if (!strcmp(key, GLUSTERD_STORE_KEY_BRICK_UUID)) { + gf_uuid_parse(value, brickinfo->uuid); + } else if (!strncmp( + key, GLUSTERD_STORE_KEY_BRICK_SNAP_STATUS, + SLEN(GLUSTERD_STORE_KEY_BRICK_SNAP_STATUS))) { + ret = gf_string2int(value, &ta_brickinfo->snap_status); + if (ret == -1) { + gf_msg(this->name, GF_LOG_ERROR, EINVAL, + GD_MSG_INCOMPATIBLE_VALUE, + "Failed to convert " + "string to integer"); + } + + } else { + gf_msg(this->name, GF_LOG_ERROR, 0, 
GD_MSG_UNKNOWN_KEY, + "Unknown key: %s", key); + } + + GF_FREE(key); + GF_FREE(value); + key = NULL; + value = NULL; + ret = gf_store_iter_get_next(iter, &key, &value, &op_errno); + } + + GLUSTERD_ASSIGN_BRICKID_TO_TA_BRICKINFO(ta_brickinfo, volinfo, + ta_brick_id); + ta_brick_id += 3; + + cds_list_add_tail(&ta_brickinfo->brick_list, &volinfo->ta_bricks); + ta_brick_count++; + } + } + assign_brick_groups(volinfo); ret = 0; @@ -2994,6 +3210,8 @@ glusterd_store_update_volinfo(glusterd_volinfo_t *volinfo) volinfo->replica_count = atoi(value); } else if (!strcmp(key, GLUSTERD_STORE_KEY_VOL_ARBITER_CNT)) { volinfo->arbiter_count = atoi(value); + } else if (!strcmp(key, GLUSTERD_STORE_KEY_VOL_THIN_ARBITER_CNT)) { + volinfo->thin_arbiter_count = atoi(value); } else if (!strncmp(key, GLUSTERD_STORE_KEY_VOL_DISPERSE_CNT, SLEN(GLUSTERD_STORE_KEY_VOL_DISPERSE_CNT))) { volinfo->disperse_count = atoi(value); diff --git a/xlators/mgmt/glusterd/src/glusterd-store.h b/xlators/mgmt/glusterd/src/glusterd-store.h index 59aee880487..45aba64ff8d 100644 --- a/xlators/mgmt/glusterd/src/glusterd-store.h +++ b/xlators/mgmt/glusterd/src/glusterd-store.h @@ -42,7 +42,9 @@ typedef enum glusterd_store_ver_ac_ { #define GLUSTERD_STORE_KEY_VOL_DISPERSE_CNT "disperse_count" #define GLUSTERD_STORE_KEY_VOL_REDUNDANCY_CNT "redundancy_count" #define GLUSTERD_STORE_KEY_VOL_ARBITER_CNT "arbiter_count" +#define GLUSTERD_STORE_KEY_VOL_THIN_ARBITER_CNT "thin_arbiter_count" #define GLUSTERD_STORE_KEY_VOL_BRICK "brick" +#define GLUSTERD_STORE_KEY_VOL_TA_BRICK "ta-brick" #define GLUSTERD_STORE_KEY_VOL_VERSION "version" #define GLUSTERD_STORE_KEY_VOL_TRANSPORT "transport-type" #define GLUSTERD_STORE_KEY_VOL_ID "volume-id" diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c index 80fb82938e5..45cdf8ab956 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.c +++ b/xlators/mgmt/glusterd/src/glusterd-utils.c @@ -639,6 +639,7 @@ 
glusterd_volinfo_new(glusterd_volinfo_t **volinfo) CDS_INIT_LIST_HEAD(&new_volinfo->vol_list); CDS_INIT_LIST_HEAD(&new_volinfo->snapvol_list); CDS_INIT_LIST_HEAD(&new_volinfo->bricks); + CDS_INIT_LIST_HEAD(&new_volinfo->ta_bricks); CDS_INIT_LIST_HEAD(&new_volinfo->snap_volumes); new_volinfo->dict = dict_new(); @@ -1525,6 +1526,37 @@ out: return ret; } +int32_t +glusterd_volume_ta_brickinfo_get(uuid_t uuid, char *hostname, char *path, + glusterd_volinfo_t *volinfo, + glusterd_brickinfo_t **ta_brickinfo) +{ + glusterd_brickinfo_t *ta_brickiter = NULL; + int32_t ret = -1; + xlator_t *this = NULL; + + this = THIS; + + ret = -1; + + cds_list_for_each_entry(ta_brickiter, &volinfo->ta_bricks, brick_list) + { + if (strcmp(ta_brickiter->path, path) == 0 && + strcmp(ta_brickiter->hostname, hostname) == 0) { + gf_msg_debug(this->name, 0, LOGSTR_FOUND_BRICK, + ta_brickiter->hostname, ta_brickiter->path, + volinfo->volname); + ret = 0; + if (ta_brickinfo) + *ta_brickinfo = ta_brickiter; + break; + } + } + + gf_msg_debug(this->name, 0, "Returning %d", ret); + return ret; +} + int32_t glusterd_volume_brickinfo_get_by_brick(char *brick, glusterd_volinfo_t *volinfo, glusterd_brickinfo_t **brickinfo, @@ -2831,6 +2863,7 @@ glusterd_add_volume_to_dict(glusterd_volinfo_t *volinfo, dict_t *dict, char key[64] = ""; int keylen; glusterd_brickinfo_t *brickinfo = NULL; + glusterd_brickinfo_t *ta_brickinfo = NULL; int32_t i = 1; char *volume_id_str = NULL; char *str = NULL; @@ -2881,6 +2914,11 @@ glusterd_add_volume_to_dict(glusterd_volinfo_t *volinfo, dict_t *dict, if (ret) goto out; + keylen = snprintf(key, sizeof(key), "%s.subvol_count", pfx); + ret = dict_set_int32n(dict, key, keylen, volinfo->subvol_count); + if (ret) + goto out; + keylen = snprintf(key, sizeof(key), "%s.stripe_count", pfx); ret = dict_set_int32n(dict, key, keylen, volinfo->stripe_count); if (ret) @@ -2896,6 +2934,11 @@ glusterd_add_volume_to_dict(glusterd_volinfo_t *volinfo, dict_t *dict, if (ret) goto out; + keylen = 
snprintf(key, sizeof(key), "%s.thin_arbiter_count", pfx); + ret = dict_set_int32n(dict, key, keylen, volinfo->thin_arbiter_count); + if (ret) + goto out; + keylen = snprintf(key, sizeof(key), "%s.disperse_count", pfx); ret = dict_set_int32n(dict, key, keylen, volinfo->disperse_count); if (ret) @@ -3058,6 +3101,44 @@ glusterd_add_volume_to_dict(glusterd_volinfo_t *volinfo, dict_t *dict, i++; } + i = 1; + if (volinfo->thin_arbiter_count == 1) { + cds_list_for_each_entry(ta_brickinfo, &volinfo->ta_bricks, brick_list) + { + keylen = snprintf(key, sizeof(key), "%s.ta-brick%d.hostname", pfx, + i); + ret = dict_set_strn(dict, key, keylen, ta_brickinfo->hostname); + if (ret) + goto out; + + keylen = snprintf(key, sizeof(key), "%s.ta-brick%d.path", pfx, i); + ret = dict_set_strn(dict, key, keylen, ta_brickinfo->path); + if (ret) + goto out; + + keylen = snprintf(key, sizeof(key), "%s.ta-brick%d.decommissioned", + pfx, i); + ret = dict_set_int32n(dict, key, keylen, + ta_brickinfo->decommissioned); + if (ret) + goto out; + + keylen = snprintf(key, sizeof(key), "%s.ta-brick%d.brick_id", pfx, + i); + ret = dict_set_strn(dict, key, keylen, ta_brickinfo->brick_id); + if (ret) + goto out; + + snprintf(key, sizeof(key), "%s.ta-brick%d.uuid", pfx, i); + ret = dict_set_dynstr_with_alloc(dict, key, + uuid_utoa(ta_brickinfo->uuid)); + if (ret) + goto out; + + i++; + } + } + /* Add volume op-versions to dict. 
This prevents volume inconsistencies * in the cluster */ @@ -3746,6 +3827,100 @@ out: return ret; } +static int32_t +glusterd_import_new_ta_brick(dict_t *peer_data, int32_t vol_count, + int32_t brick_count, + glusterd_brickinfo_t **ta_brickinfo, char *prefix) +{ + char key[128]; + char key_prefix[64]; + int keylen; + int ret = -1; + char *hostname = NULL; + char *path = NULL; + char *brick_id = NULL; + int decommissioned = 0; + glusterd_brickinfo_t *new_ta_brickinfo = NULL; + char msg[256] = ""; + char *brick_uuid_str = NULL; + + GF_ASSERT(peer_data); + GF_ASSERT(vol_count >= 0); + GF_ASSERT(ta_brickinfo); + GF_ASSERT(prefix); + + ret = snprintf(key_prefix, sizeof(key_prefix), "%s%d.ta-brick%d", prefix, + vol_count, brick_count); + + if (ret < 0 || ret >= sizeof(key_prefix)) { + ret = -1; + snprintf(msg, sizeof(msg), "key_prefix too long"); + goto out; + } + + keylen = snprintf(key, sizeof(key), "%s.hostname", key_prefix); + ret = dict_get_strn(peer_data, key, keylen, &hostname); + if (ret) { + snprintf(msg, sizeof(msg), "%s missing in payload", key); + goto out; + } + + keylen = snprintf(key, sizeof(key), "%s.path", key_prefix); + ret = dict_get_strn(peer_data, key, keylen, &path); + if (ret) { + snprintf(msg, sizeof(msg), "%s missing in payload", key); + goto out; + } + + keylen = snprintf(key, sizeof(key), "%s.brick_id", key_prefix); + ret = dict_get_strn(peer_data, key, keylen, &brick_id); + + keylen = snprintf(key, sizeof(key), "%s.decommissioned", key_prefix); + ret = dict_get_int32n(peer_data, key, keylen, &decommissioned); + if (ret) { + /* For backward compatibility */ + ret = 0; + } + + ret = glusterd_brickinfo_new(&new_ta_brickinfo); + if (ret) + goto out; + + ret = snprintf(new_ta_brickinfo->path, sizeof(new_ta_brickinfo->path), "%s", + path); + if (ret < 0 || ret >= sizeof(new_ta_brickinfo->path)) { + ret = -1; + goto out; + } + ret = snprintf(new_ta_brickinfo->hostname, + sizeof(new_ta_brickinfo->hostname), "%s", hostname); + if (ret < 0 || ret >= 
sizeof(new_ta_brickinfo->hostname)) { + ret = -1; + goto out; + } + new_ta_brickinfo->decommissioned = decommissioned; + if (brick_id) + (void)snprintf(new_ta_brickinfo->brick_id, + sizeof(new_ta_brickinfo->brick_id), "%s", brick_id); + keylen = snprintf(key, sizeof(key), "%s.uuid", key_prefix); + ret = dict_get_strn(peer_data, key, keylen, &brick_uuid_str); + if (ret) + goto out; + gf_uuid_parse(brick_uuid_str, new_ta_brickinfo->uuid); + + *ta_brickinfo = new_ta_brickinfo; + +out: + if (msg[0]) { + gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_BRICK_IMPORT_FAIL, "%s", + msg); + gf_event(EVENT_IMPORT_BRICK_FAILED, "peer=%s;ta-brick=%s", + new_ta_brickinfo->hostname, new_ta_brickinfo->path); + } + gf_msg_debug("glusterd", 0, "Returning with %d", ret); + return ret; +} + /* The prefix represents the type of volume to be added. * It will be "volume" for normal volumes, and snap# like * snap1, snap2, for snapshot volumes @@ -3857,8 +4032,10 @@ glusterd_import_bricks(dict_t *peer_data, int32_t vol_count, { int ret = -1; int brick_count = 1; + int ta_brick_count = 1; int brickid = 0; glusterd_brickinfo_t *new_brickinfo = NULL; + glusterd_brickinfo_t *new_ta_brickinfo = NULL; GF_ASSERT(peer_data); GF_ASSERT(vol_count >= 0); @@ -3877,6 +4054,19 @@ glusterd_import_bricks(dict_t *peer_data, int32_t vol_count, cds_list_add_tail(&new_brickinfo->brick_list, &new_volinfo->bricks); brick_count++; } + + if (new_volinfo->thin_arbiter_count == 1) { + while (ta_brick_count <= new_volinfo->subvol_count) { + ret = glusterd_import_new_ta_brick(peer_data, vol_count, + ta_brick_count, + &new_ta_brickinfo, prefix); + if (ret) + goto out; + cds_list_add_tail(&new_ta_brickinfo->brick_list, + &new_volinfo->ta_bricks); + ta_brick_count++; + } + } ret = 0; out: gf_msg_debug("glusterd", 0, "Returning with %d", ret); @@ -4155,6 +4345,14 @@ glusterd_import_volinfo(dict_t *peer_data, int count, goto out; } + keylen = snprintf(key, sizeof(key), "%s.subvol_count", key_prefix); + ret = 
dict_get_int32n(peer_data, key, keylen, &new_volinfo->subvol_count); + if (ret) { + snprintf(msg, sizeof(msg), "%s missing in payload for %s", key, + volname); + goto out; + } + /* not having a 'stripe_count' key is not a error (as peer may be of old version) */ keylen = snprintf(key, sizeof(key), "%s.stripe_count", key_prefix); @@ -4179,6 +4377,15 @@ glusterd_import_volinfo(dict_t *peer_data, int count, gf_msg(THIS->name, GF_LOG_INFO, 0, GD_MSG_DICT_GET_FAILED, "peer is possibly old version"); + /* not having a 'thin_arbiter_count' key is not a error + (as peer may be of old version) */ + keylen = snprintf(key, sizeof(key), "%s.thin_arbiter_count", key_prefix); + ret = dict_get_int32n(peer_data, key, keylen, + &new_volinfo->thin_arbiter_count); + if (ret) + gf_msg(THIS->name, GF_LOG_INFO, 0, GD_MSG_DICT_GET_FAILED, + "peer is possibly old version"); + /* not having a 'disperse_count' key is not a error (as peer may be of old version) */ keylen = snprintf(key, sizeof(key), "%s.disperse_count", key_prefix); @@ -4392,6 +4599,8 @@ glusterd_volinfo_copy_brickinfo(glusterd_volinfo_t *old_volinfo, { glusterd_brickinfo_t *new_brickinfo = NULL; glusterd_brickinfo_t *old_brickinfo = NULL; + glusterd_brickinfo_t *new_ta_brickinfo = NULL; + glusterd_brickinfo_t *old_ta_brickinfo = NULL; glusterd_conf_t *priv = NULL; int ret = 0; xlator_t *this = NULL; @@ -4440,6 +4649,46 @@ glusterd_volinfo_copy_brickinfo(glusterd_volinfo_t *old_volinfo, } } } + if (new_volinfo->thin_arbiter_count == 1) { + cds_list_for_each_entry(new_ta_brickinfo, &new_volinfo->ta_bricks, + brick_list) + { + ret = glusterd_volume_ta_brickinfo_get( + new_ta_brickinfo->uuid, new_ta_brickinfo->hostname, + new_ta_brickinfo->path, old_volinfo, &old_ta_brickinfo); + if (ret == 0) { + new_ta_brickinfo->port = old_ta_brickinfo->port; + + if (old_ta_brickinfo->real_path[0] == '\0') { + if (!realpath(new_ta_brickinfo->path, abspath)) { + /* Here an ENOENT should also be a + * failure as the brick is expected to + * be 
in existence + */ + gf_msg(this->name, GF_LOG_CRITICAL, errno, + GD_MSG_BRICKINFO_CREATE_FAIL, + "realpath () failed for brick " + "%s. The underlying filesystem " + "may be in bad state", + new_ta_brickinfo->path); + ret = -1; + goto out; + } + if (strlen(abspath) >= + sizeof(new_ta_brickinfo->real_path)) { + ret = -1; + goto out; + } + (void)strncpy(new_ta_brickinfo->real_path, abspath, + sizeof(new_ta_brickinfo->real_path)); + } else { + (void)strncpy(new_ta_brickinfo->real_path, + old_ta_brickinfo->real_path, + sizeof(new_ta_brickinfo->real_path)); + } + } + } + } ret = 0; out: @@ -4608,8 +4857,8 @@ gd_check_and_update_rebalance_info(glusterd_volinfo_t *old_volinfo, new->rebalance_time = old->rebalance_time; /* glusterd_rebalance_t.{op, id, defrag_cmd} are copied during volume - * import - * a new defrag object should come to life with rebalance being restarted + * import a new defrag object should come to life with rebalance being + * restarted */ out: return ret; diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.h b/xlators/mgmt/glusterd/src/glusterd-utils.h index c506da32950..2312d426051 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.h +++ b/xlators/mgmt/glusterd/src/glusterd-utils.h @@ -32,6 +32,13 @@ brickid); \ } while (0) +/* Set ta_brickinfo->brick_id to "<volname>-ta-<brickid>"; the write is + * bounded by the size of the brick_id buffer. + */ +#define GLUSTERD_ASSIGN_BRICKID_TO_TA_BRICKINFO(ta_brickinfo, volinfo, \ + brickid) \ + do { \ + (void)snprintf((ta_brickinfo)->brick_id, \ + sizeof((ta_brickinfo)->brick_id), "%s-ta-%d", \ + (volinfo)->volname, (brickid)); \ + } while (0) + #define ALL_VOLUME_OPTION_CHECK(volname, get_opt, key, ret, op_errstr, label) \ do { \ gf_boolean_t _all = !strcmp("all", volname); \ diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c index 479ae779f66..8243548f881 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volgen.c +++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c @@ -566,7 +566,13 @@ no_filter_option_handler(volgen_graph_t *graph, struct volopt_map_entry *vme, for (trav = first_of(graph); trav; trav = trav->next) { if 
(strcmp(trav->type, vme->voltype) != 0) continue; - + if (strcmp(vme->option, "ta-remote-port") == 0) { + if (strstr(trav->name, "-ta-") != NULL) { + /* The key written is "remote-port", so the length passed + * must be that key's length, not the length of the map + * entry's option name ("ta-remote-port"). + */ + ret = xlator_set_option(trav, "remote-port", + strlen("remote-port"), vme->value); + } + continue; + } ret = xlator_set_option(trav, vme->option, strlen(vme->option), vme->value); if (ret) @@ -3185,7 +3191,10 @@ volgen_graph_build_clients(volgen_graph_t *graph, glusterd_volinfo_t *volinfo, 0, }; glusterd_brickinfo_t *brick = NULL; + glusterd_brickinfo_t *ta_brick = NULL; xlator_t *xl = NULL; + int subvol_index = 0; + int thin_arbiter_index = 0; if (volinfo->brick_count == 0) { gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_VOLUME_INCONSISTENCY, @@ -3212,6 +3221,30 @@ volgen_graph_build_clients(volgen_graph_t *graph, glusterd_volinfo_t *volinfo, i = 0; cds_list_for_each_entry(brick, &volinfo->bricks, brick_list) { + /* insert ta client xlator entry. + * eg - If subvol count is > 1, then after every two client xlator + * entries there should be a ta client xlator entry in the volfile. ta + * client xlator indexes are - 2, 5, 8 etc depending on the index of + * subvol. 
+ */ + if (volinfo->thin_arbiter_count && + (i + 1) % (volinfo->replica_count + 1) == 0) { + thin_arbiter_index = 0; + cds_list_for_each_entry(ta_brick, &volinfo->ta_bricks, brick_list) + { + if (thin_arbiter_index == subvol_index) { + xl = volgen_graph_build_client( + graph, volinfo, ta_brick->hostname, NULL, + ta_brick->path, ta_brick->brick_id, transt, set_dict); + if (!xl) { + ret = -1; + goto out; + } + } + thin_arbiter_index++; + } + subvol_index++; + } xl = volgen_graph_build_client(graph, volinfo, brick->hostname, NULL, brick->path, brick->brick_id, transt, set_dict); @@ -3223,6 +3256,28 @@ volgen_graph_build_clients(volgen_graph_t *graph, glusterd_volinfo_t *volinfo, i++; } + /* Add ta client xlator entry for last subvol + * Above loop will miss out on making the ta client + * xlator entry for the last subvolume in the volfile + */ + if (volinfo->thin_arbiter_count) { + thin_arbiter_index = 0; + cds_list_for_each_entry(ta_brick, &volinfo->ta_bricks, brick_list) + { + if (thin_arbiter_index == subvol_index) { + xl = volgen_graph_build_client( + graph, volinfo, ta_brick->hostname, NULL, ta_brick->path, + ta_brick->brick_id, transt, set_dict); + if (!xl) { + ret = -1; + goto out; + } + } + + thin_arbiter_index++; + } + } + if (i != volinfo->brick_count) { gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_VOLUME_INCONSISTENCY, "volume inconsistency: actual number of bricks (%d) " @@ -3599,12 +3654,15 @@ set_afr_pending_xattrs_option(volgen_graph_t *graph, xlator_t *this = NULL; glusterd_conf_t *conf = NULL; glusterd_brickinfo_t *brick = NULL; + glusterd_brickinfo_t *ta_brick = NULL; char *ptr = NULL; int i = 0; int index = -1; int ret = 0; char *afr_xattrs_list = NULL; int list_size = -1; + int ta_brick_index = 0; + int subvol_index = 0; this = THIS; GF_VALIDATE_OR_GOTO("glusterd", this, out); @@ -3643,6 +3701,26 @@ set_afr_pending_xattrs_option(volgen_graph_t *graph, break; strncat(ptr, brick->brick_id, strlen(brick->brick_id)); if (i == volinfo->replica_count) { + 
/* add ta client xlator in afr-pending-xattrs before making entries + * for client xlators in volfile. + * ta client xlator indexes are - 2, 5, 8 depending on the index of + * subvol. e.g- For first subvol ta client xlator id is volname-ta-2 + */ + ta_brick_index = 0; + if (volinfo->thin_arbiter_count == 1) { + ptr[strlen(brick->brick_id)] = ','; + cds_list_for_each_entry(ta_brick, &volinfo->ta_bricks, + brick_list) + { + if (ta_brick_index == subvol_index) { + break; + } + ta_brick_index++; + } + + strncat(ptr, ta_brick->brick_id, strlen(ta_brick->brick_id)); + } + ret = xlator_set_fixed_option(afr_xlators_list[index++], "afr-pending-xattr", afr_xattrs_list); if (ret) @@ -3650,6 +3728,7 @@ set_afr_pending_xattrs_option(volgen_graph_t *graph, memset(afr_xattrs_list, 0, list_size); ptr = afr_xattrs_list; i = 1; + subvol_index++; continue; } ptr[strlen(brick->brick_id)] = ','; @@ -3674,6 +3753,13 @@ volgen_graph_build_afr_clusters(volgen_graph_t *graph, char *replicate_name = "%s-replicate-%d"; xlator_t *afr = NULL; char option[32] = {0}; + glusterd_brickinfo_t *ta_brick = NULL; + int ta_brick_index = 0; + int ta_replica_offset = 0; + int ta_brick_offset = 0; + char ta_option[4096] = { + 0, + }; if (glusterd_volinfo_get_boolean(volinfo, "cluster.jbr") > 0) { replicate_type = "experimental/jbrc"; @@ -3681,9 +3767,20 @@ volgen_graph_build_afr_clusters(volgen_graph_t *graph, replicate_type = "cluster/replicate"; } + /* In thin-arbiter case brick count and replica count remain same + * but due to additional entries of ta client xlators in the volfile, + * GD1 is manipulated to include these client xlators while linking them to + * afr/cluster entry in the volfile. 
+ */ + if (volinfo->thin_arbiter_count == 1) { + ta_replica_offset = 1; + ta_brick_offset = volinfo->subvol_count; + } + clusters = volgen_link_bricks_from_list_tail( - graph, volinfo, replicate_type, replicate_name, volinfo->brick_count, - volinfo->replica_count); + graph, volinfo, replicate_type, replicate_name, + volinfo->brick_count + ta_brick_offset, + volinfo->replica_count + ta_replica_offset); if (clusters < 0) goto out; @@ -3693,18 +3790,43 @@ volgen_graph_build_afr_clusters(volgen_graph_t *graph, clusters = -1; goto out; } - if (!volinfo->arbiter_count) + if (!volinfo->arbiter_count && !volinfo->thin_arbiter_count) goto out; afr = first_of(graph); - sprintf(option, "%d", volinfo->arbiter_count); - for (i = 0; i < clusters; i++) { - ret = xlator_set_fixed_option(afr, "arbiter-count", option); - if (ret) { - clusters = -1; - goto out; + + if (volinfo->arbiter_count) { + sprintf(option, "%d", volinfo->arbiter_count); + for (i = 0; i < clusters; i++) { + ret = xlator_set_fixed_option(afr, "arbiter-count", option); + if (ret) { + clusters = -1; + goto out; + } + + afr = afr->next; + } + } + + if (volinfo->thin_arbiter_count == 1) { + for (i = 0; i < clusters; i++) { + ta_brick_index = 0; + cds_list_for_each_entry(ta_brick, &volinfo->ta_bricks, brick_list) + { + if (ta_brick_index == i) { + break; + } + ta_brick_index++; + } + snprintf(ta_option, sizeof(ta_option), "%s:%s", ta_brick->hostname, + ta_brick->path); + ret = xlator_set_fixed_option(afr, "thin-arbiter", ta_option); + if (ret) { + clusters = -1; + goto out; + } + afr = afr->next; } - afr = afr->next; } out: return clusters; diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c index 7eb74d7b4fd..4624fe1c8d0 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c @@ -276,6 +276,7 @@ __glusterd_handle_create_volume(rpcsvc_request_t *req) char *bricks = NULL; char *volname = NULL; int 
brick_count = 0; + int thin_arbiter_count = 0; void *cli_rsp = NULL; char err_str[2048] = { 0, @@ -435,6 +436,21 @@ __glusterd_handle_create_volume(rpcsvc_request_t *req) goto out; } + ret = dict_get_int32n(dict, "thin-arbiter-count", + SLEN("thin-arbiter-count"), &thin_arbiter_count); + if (thin_arbiter_count && conf->op_version < GD_OP_VERSION_7_0) { + snprintf(err_str, sizeof(err_str), + "Cannot execute command. " + "The cluster is operating at version %d. " + "Thin-arbiter volume creation is unavailable in " + "this version", + conf->op_version); + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GLUSTERD_OP_FAILED, "%s", + err_str); + ret = -1; + goto out; + } + if (!dict_getn(dict, "force", SLEN("force"))) { gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "Failed to get 'force' flag"); @@ -2028,14 +2044,20 @@ glusterd_op_create_volume(dict_t *dict, char **op_errstr) glusterd_volinfo_t *volinfo = NULL; gf_boolean_t vol_added = _gf_false; glusterd_brickinfo_t *brickinfo = NULL; + glusterd_brickinfo_t *ta_brickinfo = NULL; xlator_t *this = NULL; char *brick = NULL; + char *ta_brick = NULL; int32_t count = 0; int32_t i = 1; char *bricks = NULL; + char *ta_bricks = NULL; char *brick_list = NULL; + char *ta_brick_list = NULL; char *free_ptr = NULL; + char *ta_free_ptr = NULL; char *saveptr = NULL; + char *ta_saveptr = NULL; char *trans_type = NULL; char *str = NULL; char *username = NULL; @@ -2153,6 +2175,20 @@ glusterd_op_create_volume(dict_t *dict, char **op_errstr) /* coverity[unused_value] arbiter count is optional */ ret = dict_get_int32n(dict, "arbiter-count", SLEN("arbiter-count"), &volinfo->arbiter_count); + ret = dict_get_int32n(dict, "thin-arbiter-count", + SLEN("thin-arbiter-count"), + &volinfo->thin_arbiter_count); + if (volinfo->thin_arbiter_count) { + ret = dict_get_strn(dict, "ta-brick", SLEN("ta-brick"), &ta_bricks); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, + "Unable to get thin arbiter brick for " + "volume 
%s", + volname); + goto out; + } + } + } else if (GF_CLUSTER_TYPE_DISPERSE == volinfo->type) { ret = dict_get_int32n(dict, "disperse-count", SLEN("disperse-count"), &volinfo->disperse_count); @@ -2241,6 +2277,38 @@ glusterd_op_create_volume(dict_t *dict, char **op_errstr) volinfo->transport_type = GF_TRANSPORT_BOTH_TCP_RDMA; } + if (ta_bricks) { + ta_brick_list = gf_strdup(ta_bricks); + ta_free_ptr = ta_brick_list; + } + + if (volinfo->thin_arbiter_count) { + ta_brick = strtok_r(ta_brick_list + 1, " \n", &ta_saveptr); + + count = 1; + brickid = volinfo->replica_count; + /* assign brickid to ta_bricks + * Following loop runs for number of subvols times. Although + * there is only one ta-brick for a volume but the volume fuse volfile + * requires an entry of ta-brick for each subvolume. Also, the ta-brick + * id needs to be adjusted according to the subvol count. + * For eg- For first subvolume ta-brick id is volname-ta-2, for second + * subvol ta-brick id is volname-ta-5. + */ + while (count <= volinfo->subvol_count) { + ret = glusterd_brickinfo_new_from_brick(ta_brick, &ta_brickinfo, + _gf_false, op_errstr); + if (ret) + goto out; + + GLUSTERD_ASSIGN_BRICKID_TO_TA_BRICKINFO(ta_brickinfo, volinfo, + brickid); + cds_list_add_tail(&ta_brickinfo->brick_list, &volinfo->ta_bricks); + count++; + brickid += volinfo->replica_count + 1; + } + } + if (bricks) { brick_list = gf_strdup(bricks); free_ptr = brick_list; @@ -2259,7 +2327,10 @@ glusterd_op_create_volume(dict_t *dict, char **op_errstr) op_errstr); if (ret) goto out; - + if (volinfo->thin_arbiter_count == 1 && + (brickid + 1) % (volinfo->replica_count + 1) == 0) { + brickid = brickid + 1; + } GLUSTERD_ASSIGN_BRICKID_TO_BRICKINFO(brickinfo, volinfo, brickid++); ret = glusterd_resolve_brick(brickinfo); @@ -2350,6 +2421,7 @@ glusterd_op_create_volume(dict_t *dict, char **op_errstr) out: GF_FREE(free_ptr); + GF_FREE(ta_free_ptr); if (!vol_added && volinfo) glusterd_volinfo_unref(volinfo); return ret; diff --git 
a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index dba8fbee28c..b943f66c008 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -1479,6 +1479,14 @@ struct volopt_map_entry glusterd_volopt_map[] = { .value = "9", .flags = VOLOPT_FLAG_CLIENT_OPT}, + /* Although the following option is named ta-remote-port but it will be + * added as remote-port in client volfile for ta-bricks only. + */ + {.key = "client.ta-brick-port", + .voltype = "protocol/client", + .option = "ta-remote-port", + .op_version = GD_OP_VERSION_7_0}, + /* Server xlator options */ {.key = "network.tcp-window-size", .voltype = "protocol/server", diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h index 575f8c5c1dd..f63f4c107ef 100644 --- a/xlators/mgmt/glusterd/src/glusterd.h +++ b/xlators/mgmt/glusterd/src/glusterd.h @@ -440,6 +440,7 @@ struct glusterd_volinfo_ { /* This is a current pointer for glusterd_volinfo_t->snap_volumes */ struct cds_list_head bricks; + struct cds_list_head ta_bricks; struct cds_list_head snap_volumes; /* TODO : Need to remove this, as this * is already part of snapshot object. @@ -449,6 +450,7 @@ struct glusterd_volinfo_ { int stripe_count; int replica_count; int arbiter_count; + int thin_arbiter_count; int disperse_count; int redundancy_count; int subvol_count; /* Number of subvolumes in a -- cgit