From ef89e1b5bb77706b1910a45640b11a4341c78d6a Mon Sep 17 00:00:00 2001 From: Ravishankar N Date: Mon, 2 Apr 2018 13:58:23 +0530 Subject: afr: initial changes for thin arbiter 1. Create thin arbiter index file during mount. 2. Set pending marker in thin arbiter id file in case of failure. Change-Id: I269eb8d069f0323f1fc616175e5e5eb7b91d5f82 updates: #352 Signed-off-by: Ravishankar N --- xlators/cluster/afr/src/afr-common.c | 93 ++++++++++++++++++++++++-- xlators/cluster/afr/src/afr-messages.h | 3 +- xlators/cluster/afr/src/afr-transaction.c | 107 ++++++++++++++++++++++++++++++ xlators/cluster/afr/src/afr-transaction.h | 5 ++ xlators/cluster/afr/src/afr.c | 25 ++++++- xlators/cluster/afr/src/afr.h | 4 ++ 6 files changed, 229 insertions(+), 8 deletions(-) (limited to 'xlators/cluster') diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 01a5db54bdd..8752e98c8df 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -2901,10 +2901,8 @@ afr_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, return 0; } - - static void -afr_discover_done (call_frame_t *frame, xlator_t *this) +afr_discover_unwind (call_frame_t *frame, xlator_t *this) { afr_private_t *priv = NULL; afr_local_t *local = NULL; @@ -2966,6 +2964,84 @@ unwind: &local->replies[read_subvol].postparent); } +static int +afr_ta_id_file_check (void *opaque) +{ + afr_private_t *priv = NULL; + xlator_t *this = NULL; + loc_t loc = {0, }; + struct iatt stbuf = {0,}; + dict_t *dict = NULL; + uuid_t gfid = {0,}; + fd_t *fd = NULL; + int ret = 0; + + this = opaque; + priv = this->private; + + ret = afr_fill_ta_loc (this, &loc); + if (ret) + goto out; + + ret = syncop_lookup (priv->children[THIN_ARBITER_BRICK_INDEX], &loc, + &stbuf, 0, 0, 0); + if (ret == 0) { + goto out; + } else if (ret == -ENOENT) { + fd = fd_create (loc.inode, getpid()); + if (!fd) + goto out; + dict = dict_new (); + if (!dict) + goto out; + gf_uuid_generate 
(gfid); + ret = dict_set_gfuuid (dict, "gfid-req", gfid, true); + ret = syncop_create (priv->children[THIN_ARBITER_BRICK_INDEX], + &loc, O_RDWR, 0664, fd, &stbuf, dict, + NULL); + } + +out: + if (ret == 0) { + gf_uuid_copy (priv->ta_gfid, stbuf.ia_gfid); + } else { + gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to lookup/create thin-arbiter id file."); + } + if (dict) + dict_unref (dict); + if (fd) + fd_unref (fd); + loc_wipe (&loc); + + return 0; +} + +static int +afr_ta_id_file_check_cbk (int ret, call_frame_t *ta_frame, void *opaque) +{ + return 0; +} + +static void +afr_discover_done (call_frame_t *frame, xlator_t *this) +{ + int ret = 0; + afr_private_t *priv = NULL; + + priv = this->private; + if (!priv->thin_arbiter_count) + goto unwind; + if (!gf_uuid_is_null(priv->ta_gfid)) + goto unwind; + + ret = synctask_new (this->ctx->env, afr_ta_id_file_check, + afr_ta_id_file_check_cbk, NULL, this); + if (ret) + goto unwind; +unwind: + afr_discover_unwind (frame, this); +} int afr_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this, @@ -5514,15 +5590,22 @@ afr_set_low_priority (call_frame_t *frame) void afr_priv_destroy (afr_private_t *priv) { - int i = 0; + int i = 0; + int child_count = -1; if (!priv) goto out; GF_FREE (priv->last_event); + + child_count = priv->child_count; + if (priv->thin_arbiter_count) { + child_count++; + } if (priv->pending_key) { - for (i = 0; i < priv->child_count; i++) + for (i = 0; i < child_count; i++) GF_FREE (priv->pending_key[i]); } + GF_FREE (priv->pending_reads); GF_FREE (priv->local); GF_FREE (priv->pending_key); diff --git a/xlators/cluster/afr/src/afr-messages.h b/xlators/cluster/afr/src/afr-messages.h index 743bd945821..8aa94730158 100644 --- a/xlators/cluster/afr/src/afr-messages.h +++ b/xlators/cluster/afr/src/afr-messages.h @@ -66,7 +66,8 @@ GLFS_MSGID(AFR, AFR_MSG_NO_CHANGELOG, AFR_MSG_TIMER_CREATE_FAIL, AFR_MSG_SBRAIN_FAV_CHILD_POLICY, - AFR_MSG_INODE_CTX_GET_FAILED + 
AFR_MSG_INODE_CTX_GET_FAILED, + AFR_MSG_THIN_ARB ); #endif /* !_AFR_MESSAGES_H_ */ diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index 8bb096775c2..b4d3062fc2b 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -635,6 +635,14 @@ afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this) local = frame->local; priv = this->private; + if (priv->thin_arbiter_count) { + /* We need to perform post-op even if 1 data brick was down + * before the txn started.*/ + if (AFR_COUNT (local->transaction.failed_subvols, + priv->child_count)) + return _gf_false; + } + for (i = 0; i < priv->child_count; i++) { if (local->transaction.pre_op[i] && local->transaction.failed_subvols[i]) @@ -824,6 +832,97 @@ afr_handle_quorum (call_frame_t *frame) } } +int +afr_fill_ta_loc (xlator_t *this, loc_t *loc) +{ + afr_private_t *priv = NULL; + + priv = this->private; + loc->parent = inode_ref (priv->root_inode); + gf_uuid_copy (loc->pargfid, loc->parent->gfid); + loc->name = priv->pending_key[THIN_ARBITER_BRICK_INDEX]; + gf_uuid_copy (loc->gfid, priv->ta_gfid); + loc->inode = inode_new (loc->parent->table); + if (!loc->inode) + return -ENOMEM; + return 0; +} + +int +afr_changelog_thin_arbiter_post_op (xlator_t *this, afr_local_t *local) +{ + int ret = 0; + afr_private_t *priv = NULL; + dict_t *xattr = NULL; + int failed_count = 0; + struct gf_flock flock = {0, }; + loc_t loc = {0,}; + int i = 0; + + priv = this->private; + if (!priv->thin_arbiter_count) + return 0; + + + failed_count = AFR_COUNT (local->transaction.failed_subvols, + priv->child_count); + if (!failed_count) + return 0; + + GF_ASSERT (failed_count == 1); + ret = afr_fill_ta_loc (this, &loc); + if (ret) + goto out; + + xattr = dict_new (); + if (!xattr) { + ret = -ENOMEM; + goto out; + } + for (i = 0; i < priv->child_count; i++) { + ret = dict_set_static_bin (xattr, priv->pending_key[i], + local->pending[i], + 
AFR_NUM_CHANGE_LOGS * sizeof (int)); + if (ret) + goto out; + } + + flock.l_type = F_WRLCK; + flock.l_start = 0; + flock.l_len = 0; + + /*TODO: Convert to two domain locking. */ + ret = syncop_inodelk (priv->children[THIN_ARBITER_BRICK_INDEX], + THIN_ARBITER_DOM1, &loc, F_SETLKW, &flock, + NULL, NULL); + if (ret) + goto out; + + ret = syncop_xattrop (priv->children[THIN_ARBITER_BRICK_INDEX], &loc, + GF_XATTROP_ADD_ARRAY, xattr, NULL, NULL, NULL); + + if (ret == -EINVAL) { + gf_msg (this->name, GF_LOG_INFO, -ret, AFR_MSG_THIN_ARB, + "Thin-arbiter has denied post-op on %s for gfid %s.", + priv->pending_key[THIN_ARBITER_BRICK_INDEX], + uuid_utoa (local->inode->gfid)); + + } else if (ret) { + gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Post-op on thin-arbiter id file %s failed for gfid %s.", + priv->pending_key[THIN_ARBITER_BRICK_INDEX], + uuid_utoa (local->inode->gfid)); + } + flock.l_type = F_UNLCK; + syncop_inodelk (priv->children[THIN_ARBITER_BRICK_INDEX], + THIN_ARBITER_DOM1, &loc, F_SETLKW, &flock, NULL, NULL); +out: + if (xattr) + dict_unref (xattr); + + return ret; +} + int afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) { @@ -885,6 +984,14 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) goto out; } + ret = afr_changelog_thin_arbiter_post_op (this, local); + if (ret < 0) { + local->op_ret = -1; + local->op_errno = -ret; + afr_changelog_post_op_done (frame, this); + goto out; + } + if (need_undirty) local->dirty[idx] = hton32(-1); else diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h index cb62c185938..629f6dd557c 100644 --- a/xlators/cluster/afr/src/afr-transaction.h +++ b/xlators/cluster/afr/src/afr-transaction.h @@ -54,8 +54,13 @@ afr_pick_error_xdata (afr_local_t *local, afr_private_t *priv, inode_t *inode2, unsigned char *readable2); int afr_transaction_resume (call_frame_t *frame, xlator_t *this); + int afr_lock (call_frame_t *frame, xlator_t *this); + 
void afr_delayed_changelog_wake_up_cbk (void *data); + +int +afr_fill_ta_loc (xlator_t *this, loc_t *loc); #endif /* __TRANSACTION_H__ */ diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index cb4b1537984..27cee590b4b 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -303,12 +303,20 @@ afr_pending_xattrs_init (afr_private_t *priv, xlator_t *this) char *ptr1 = NULL; char *xattrs_list = NULL; xlator_list_t *trav = NULL; + int child_count = -1; trav = this->children; + child_count = priv->child_count; + if (priv->thin_arbiter_count) { + /* priv->pending_key[THIN_ARBITER_BRICK_INDEX] is used as the + * name of the thin arbiter file for persistence across add/ + * removal of DHT subvols.*/ + child_count++; + } GF_OPTION_INIT ("afr-pending-xattr", xattrs_list, str, out); priv->pending_key = GF_CALLOC (sizeof (*priv->pending_key), - priv->child_count, gf_afr_mt_char); + child_count, gf_afr_mt_char); if (!priv->pending_key) { ret = -ENOMEM; goto out; @@ -318,7 +326,7 @@ afr_pending_xattrs_init (afr_private_t *priv, xlator_t *this) "Unable to fetch afr-pending-xattr option from volfile." " Falling back to using client translator names. 
"); - while (i < priv->child_count) { + while (i < child_count) { ret = gf_asprintf (&priv->pending_key[i], "%s.%s", AFR_XATTR_PREFIX, trav->xlator->name); @@ -368,6 +376,7 @@ init (xlator_t *this) int read_subvol_index = -1; char *qtype = NULL; char *fav_child_policy = NULL; + char *thin_arbiter = NULL; if (!this->children) { gf_msg (this->name, GF_LOG_ERROR, 0, @@ -397,6 +406,11 @@ init (xlator_t *this) priv->read_child = -1; GF_OPTION_INIT ("arbiter-count", priv->arbiter_count, uint32, out); + GF_OPTION_INIT ("thin-arbiter", thin_arbiter, str, out); + if (thin_arbiter && strlen(thin_arbiter) > 0) { + priv->thin_arbiter_count = 1; + priv->child_count--; + } INIT_LIST_HEAD (&priv->healing); INIT_LIST_HEAD (&priv->heal_waiting); @@ -1103,6 +1117,13 @@ struct volume_options options[] = { .type = GF_OPTION_TYPE_INT, .description = "subset of child_count. Has to be 0 or 1." }, + { .key = {"thin-arbiter"}, + .type = GF_OPTION_TYPE_STR, + .op_version = {GD_OP_VERSION_4_1_0}, + .flags = OPT_FLAG_SETTABLE, + .tags = {"replicate"}, + .description = "contains host:path of thin abriter brick", + }, { .key = {"shd-max-threads"}, .type = GF_OPTION_TYPE_INT, .min = 1, diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index b96be62a910..fd75de45341 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -35,6 +35,8 @@ #define AFR_DEFAULT_SPB_CHOICE_TIMEOUT 300 /*in seconds*/ #define ARBITER_BRICK_INDEX 2 +#define THIN_ARBITER_BRICK_INDEX 2 +#define THIN_ARBITER_DOM1 "afr.ta.domain-1" #define AFR_HALO_MAX_LATENCY 99999 typedef int (*afr_lock_cbk_t) (call_frame_t *frame, xlator_t *this); @@ -81,10 +83,12 @@ typedef struct _afr_private { unsigned int child_count; /* total number of children */ unsigned int arbiter_count; /*subset of child_count. 
Has to be 0 or 1.*/ + unsigned int thin_arbiter_count; /* 0 or 1 at the moment.*/ xlator_t **children; inode_t *root_inode; + uuid_t ta_gfid; /*For thin arbiter.*/ unsigned char *child_up; int64_t *child_latency; -- cgit