From ea95631ff47c8048f039faedbc0faa918c4e165a Mon Sep 17 00:00:00 2001 From: Pranith Kumar K Date: Thu, 5 Sep 2019 16:12:39 +0530 Subject: cluster/ec: quorum-count implementation fixes: #721 Change-Id: I5333540e3c635ccf441cf1f4696e4c8986e38ea8 Signed-off-by: Pranith Kumar K --- xlators/cluster/ec/src/ec-common.c | 13 ++++++ xlators/cluster/ec/src/ec-common.h | 24 ++++++++++ xlators/cluster/ec/src/ec-dir-write.c | 57 ++++++++++++----------- xlators/cluster/ec/src/ec-inode-write.c | 61 ++++++++++++------------- xlators/cluster/ec/src/ec-types.h | 1 + xlators/cluster/ec/src/ec.c | 13 ++++++ xlators/mgmt/glusterd/src/glusterd-volume-set.c | 46 +++++++++++++++++++ 7 files changed, 156 insertions(+), 59 deletions(-) (limited to 'xlators') diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c index e243b8ba5d9..dea987ef319 100644 --- a/xlators/cluster/ec/src/ec-common.c +++ b/xlators/cluster/ec/src/ec-common.c @@ -707,6 +707,19 @@ ec_child_select(ec_fop_data_t *fop) return 0; } + if (!fop->parent && fop->lock_count && + (fop->locks[0].update[EC_DATA_TXN] || + fop->locks[0].update[EC_METADATA_TXN])) { + if (ec->quorum_count && (num < ec->quorum_count)) { + gf_msg(ec->xl->name, GF_LOG_ERROR, 0, EC_MSG_CHILDS_INSUFFICIENT, + "Insufficient available children " + "for this request (have %d, need " + "%d). %s", + num, ec->quorum_count, ec_msg_str(fop)); + return 0; + } + } + return 1; } diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h index ce35b4878c1..51493612ac6 100644 --- a/xlators/cluster/ec/src/ec-common.h +++ b/xlators/cluster/ec/src/ec-common.h @@ -25,6 +25,30 @@ typedef enum { EC_DATA_TXN, EC_METADATA_TXN } ec_txn_t; #define EC_FLAG_LOCK_SHARED 0x0001 +#define QUORUM_CBK(fn, fop, frame, cookie, this, op_ret, op_errno, params...) \ + do { \ + ec_t *__ec = fop->xl->private; \ + int32_t __op_ret = 0; \ + int32_t __op_errno = 0; \ + int32_t __success_count = gf_bits_count(fop->good); \ + \ + __op_ret = op_ret; \ + __op_errno = op_errno; \ + if (!fop->parent && frame && \ + (GF_CLIENT_PID_SELF_HEALD != frame->root->pid) && \ + __ec->quorum_count && (__success_count < __ec->quorum_count) && \ + op_ret >= 0) { \ + __op_ret = -1; \ + __op_errno = EIO; \ + gf_msg(__ec->xl->name, GF_LOG_ERROR, 0, \ + EC_MSG_CHILDS_INSUFFICIENT, \ + "Insufficient available children for this request " \ + "(have %d, need %d). %s", \ + __success_count, __ec->quorum_count, ec_msg_str(fop)); \ + } \ + fn(frame, cookie, this, __op_ret, __op_errno, params); \ + } while (0) + enum _ec_xattrop_flags { EC_FLAG_XATTROP, EC_FLAG_DATA_DIRTY, diff --git a/xlators/cluster/ec/src/ec-dir-write.c b/xlators/cluster/ec/src/ec-dir-write.c index bd2544af862..53d27d895c3 100644 --- a/xlators/cluster/ec/src/ec-dir-write.c +++ b/xlators/cluster/ec/src/ec-dir-write.c @@ -215,10 +215,10 @@ ec_manager_create(ec_fop_data_t *fop, int32_t state) GF_ASSERT(cbk != NULL); if (fop->cbks.create != NULL) { - fop->cbks.create(fop->req_frame, fop, fop->xl, cbk->op_ret, - cbk->op_errno, fop->fd, fop->loc[0].inode, - &cbk->iatt[0], &cbk->iatt[1], &cbk->iatt[2], - cbk->xdata); + QUORUM_CBK(fop->cbks.create, fop, fop->req_frame, fop, fop->xl, + cbk->op_ret, cbk->op_errno, fop->fd, + fop->loc[0].inode, &cbk->iatt[0], &cbk->iatt[1], + &cbk->iatt[2], cbk->xdata); } return EC_STATE_LOCK_REUSE; @@ -387,9 +387,10 @@ ec_manager_link(ec_fop_data_t *fop, int32_t state) GF_ASSERT(cbk != NULL); if (fop->cbks.link != NULL) { - fop->cbks.link(fop->req_frame, fop, fop->xl, cbk->op_ret, - cbk->op_errno, fop->loc[0].inode, &cbk->iatt[0], - &cbk->iatt[1], &cbk->iatt[2], cbk->xdata); + QUORUM_CBK(fop->cbks.link, fop, fop->req_frame, fop, fop->xl, + cbk->op_ret, cbk->op_errno, fop->loc[0].inode, + &cbk->iatt[0], &cbk->iatt[1], &cbk->iatt[2], + cbk->xdata); } return EC_STATE_LOCK_REUSE; @@ -566,9 +567,10 @@ ec_manager_mkdir(ec_fop_data_t *fop, int32_t state) GF_ASSERT(cbk != NULL); if (fop->cbks.mkdir != NULL) { - fop->cbks.mkdir(fop->req_frame, fop, fop->xl, cbk->op_ret, - cbk->op_errno, fop->loc[0].inode, &cbk->iatt[0], - &cbk->iatt[1], &cbk->iatt[2], cbk->xdata); + QUORUM_CBK(fop->cbks.mkdir, fop, fop->req_frame, fop, fop->xl, + cbk->op_ret, cbk->op_errno, fop->loc[0].inode, + &cbk->iatt[0], &cbk->iatt[1], &cbk->iatt[2], + cbk->xdata); } return EC_STATE_LOCK_REUSE; @@ -770,9 +772,10 @@ ec_manager_mknod(ec_fop_data_t *fop, int32_t state) GF_ASSERT(cbk != NULL); if (fop->cbks.mknod != NULL) { - fop->cbks.mknod(fop->req_frame, fop, fop->xl, cbk->op_ret, - cbk->op_errno, fop->loc[0].inode, &cbk->iatt[0], - &cbk->iatt[1], &cbk->iatt[2], cbk->xdata); + QUORUM_CBK(fop->cbks.mknod, fop, fop->req_frame, fop, fop->xl, + cbk->op_ret, cbk->op_errno, fop->loc[0].inode, + &cbk->iatt[0], &cbk->iatt[1], &cbk->iatt[2], + cbk->xdata); } return EC_STATE_LOCK_REUSE; @@ -928,10 +931,10 @@ ec_manager_rename(ec_fop_data_t *fop, int32_t state) GF_ASSERT(cbk != NULL); if (fop->cbks.rename != NULL) { - fop->cbks.rename(fop->req_frame, fop, fop->xl, cbk->op_ret, - cbk->op_errno, &cbk->iatt[0], &cbk->iatt[1], - &cbk->iatt[2], &cbk->iatt[3], &cbk->iatt[4], - cbk->xdata); + QUORUM_CBK(fop->cbks.rename, fop, fop->req_frame, fop, fop->xl, + cbk->op_ret, cbk->op_errno, &cbk->iatt[0], + &cbk->iatt[1], &cbk->iatt[2], &cbk->iatt[3], + &cbk->iatt[4], cbk->xdata); } return EC_STATE_LOCK_REUSE; @@ -1080,9 +1083,9 @@ ec_manager_rmdir(ec_fop_data_t *fop, int32_t state) GF_ASSERT(cbk != NULL); if (fop->cbks.rmdir != NULL) { - fop->cbks.rmdir(fop->req_frame, fop, fop->xl, cbk->op_ret, - cbk->op_errno, &cbk->iatt[0], &cbk->iatt[1], - cbk->xdata); + QUORUM_CBK(fop->cbks.rmdir, fop, fop->req_frame, fop, fop->xl, + cbk->op_ret, cbk->op_errno, &cbk->iatt[0], + &cbk->iatt[1], cbk->xdata); } return EC_STATE_LOCK_REUSE; @@ -1234,10 +1237,10 @@ ec_manager_symlink(ec_fop_data_t *fop, int32_t state) GF_ASSERT(cbk != NULL); if (fop->cbks.symlink != NULL) { - fop->cbks.symlink(fop->req_frame, fop, fop->xl, cbk->op_ret, - cbk->op_errno, fop->loc[0].inode, - &cbk->iatt[0], &cbk->iatt[1], &cbk->iatt[2], - cbk->xdata); + QUORUM_CBK(fop->cbks.symlink, fop, fop->req_frame, fop, fop->xl, + cbk->op_ret, cbk->op_errno, fop->loc[0].inode, + &cbk->iatt[0], &cbk->iatt[1], &cbk->iatt[2], + cbk->xdata); } return EC_STATE_LOCK_REUSE; @@ -1389,9 +1392,9 @@ ec_manager_unlink(ec_fop_data_t *fop, int32_t state) GF_ASSERT(cbk != NULL); if (fop->cbks.unlink != NULL) { - fop->cbks.unlink(fop->req_frame, fop, fop->xl, cbk->op_ret, - cbk->op_errno, &cbk->iatt[0], &cbk->iatt[1], - cbk->xdata); + QUORUM_CBK(fop->cbks.unlink, fop, fop->req_frame, fop, fop->xl, + cbk->op_ret, cbk->op_errno, &cbk->iatt[0], + &cbk->iatt[1], cbk->xdata); } return EC_STATE_LOCK_REUSE; diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c index 95cef7e8092..9b5fe2a7fdc 100644 --- a/xlators/cluster/ec/src/ec-inode-write.c +++ b/xlators/cluster/ec/src/ec-inode-write.c @@ -181,26 +181,26 @@ ec_xattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, switch (fop->id) { case GF_FOP_SETXATTR: if (fop->cbks.setxattr) { - fop->cbks.setxattr(frame, cookie, this, op_ret, op_errno, - xdata); + QUORUM_CBK(fop->cbks.setxattr, fop, frame, cookie, this, op_ret, + op_errno, xdata); } break; case GF_FOP_REMOVEXATTR: if (fop->cbks.removexattr) { - fop->cbks.removexattr(frame, cookie, this, op_ret, op_errno, - xdata); + QUORUM_CBK(fop->cbks.removexattr, fop, frame, cookie, this, + op_ret, op_errno, xdata); } break; case GF_FOP_FSETXATTR: if (fop->cbks.fsetxattr) { - fop->cbks.fsetxattr(frame, cookie, this, op_ret, op_errno, - xdata); + QUORUM_CBK(fop->cbks.fsetxattr, fop, frame, cookie, this, + op_ret, op_errno, xdata); } break; case GF_FOP_FREMOVEXATTR: if (fop->cbks.fremovexattr) { - fop->cbks.fremovexattr(frame, cookie, this, op_ret, op_errno, - xdata); + QUORUM_CBK(fop->cbks.fremovexattr, fop, frame, cookie, this, + op_ret, op_errno, xdata); } break; } @@ -490,16 +490,15 @@ ec_manager_setattr(ec_fop_data_t *fop, int32_t state) if (fop->id == GF_FOP_SETATTR) { if (fop->cbks.setattr != NULL) { - fop->cbks.setattr(fop->req_frame, fop, fop->xl, cbk->op_ret, - cbk->op_errno, &cbk->iatt[0], - &cbk->iatt[1], cbk->xdata); + QUORUM_CBK(fop->cbks.setattr, fop, fop->req_frame, fop, + fop->xl, cbk->op_ret, cbk->op_errno, + &cbk->iatt[0], &cbk->iatt[1], cbk->xdata); } } else { if (fop->cbks.fsetattr != NULL) { - fop->cbks.fsetattr(fop->req_frame, fop, fop->xl, - cbk->op_ret, cbk->op_errno, - &cbk->iatt[0], &cbk->iatt[1], - cbk->xdata); + QUORUM_CBK(fop->cbks.fsetattr, fop, fop->req_frame, fop, + fop->xl, cbk->op_ret, cbk->op_errno, + &cbk->iatt[0], &cbk->iatt[1], cbk->xdata); } } @@ -990,9 +989,9 @@ ec_manager_fallocate(ec_fop_data_t *fop, int32_t state) GF_ASSERT(cbk != NULL); if (fop->cbks.fallocate != NULL) { - fop->cbks.fallocate(fop->req_frame, fop, fop->xl, cbk->op_ret, - cbk->op_errno, &cbk->iatt[0], &cbk->iatt[1], - cbk->xdata); + QUORUM_CBK(fop->cbks.fallocate, fop, fop->req_frame, fop, + fop->xl, cbk->op_ret, cbk->op_errno, &cbk->iatt[0], + &cbk->iatt[1], cbk->xdata); } return EC_STATE_LOCK_REUSE; @@ -1243,9 +1242,9 @@ ec_manager_discard(ec_fop_data_t *fop, int32_t state) GF_ASSERT(cbk != NULL); if (fop->cbks.discard != NULL) { - fop->cbks.discard(fop->req_frame, fop, fop->xl, cbk->op_ret, - cbk->op_errno, &cbk->iatt[0], &cbk->iatt[1], - cbk->xdata); + QUORUM_CBK(fop->cbks.discard, fop, fop->req_frame, fop, fop->xl, + cbk->op_ret, cbk->op_errno, &cbk->iatt[0], + &cbk->iatt[1], cbk->xdata); } return EC_STATE_LOCK_REUSE; @@ -1473,17 +1472,15 @@ ec_manager_truncate(ec_fop_data_t *fop, int32_t state) if (fop->id == GF_FOP_TRUNCATE) { if (fop->cbks.truncate != NULL) { - fop->cbks.truncate(fop->req_frame, fop, fop->xl, - cbk->op_ret, cbk->op_errno, - &cbk->iatt[0], &cbk->iatt[1], - cbk->xdata); + QUORUM_CBK(fop->cbks.truncate, fop, fop->req_frame, fop, + fop->xl, cbk->op_ret, cbk->op_errno, + &cbk->iatt[0], &cbk->iatt[1], cbk->xdata); } } else { if (fop->cbks.ftruncate != NULL) { - fop->cbks.ftruncate(fop->req_frame, fop, fop->xl, - cbk->op_ret, cbk->op_errno, - &cbk->iatt[0], &cbk->iatt[1], - cbk->xdata); + QUORUM_CBK(fop->cbks.ftruncate, fop, fop->req_frame, fop, + fop->xl, cbk->op_ret, cbk->op_errno, + &cbk->iatt[0], &cbk->iatt[1], cbk->xdata); } } @@ -2241,9 +2238,9 @@ ec_manager_writev(ec_fop_data_t *fop, int32_t state) GF_ASSERT(cbk != NULL); if (fop->cbks.writev != NULL) { - fop->cbks.writev(fop->req_frame, fop, fop->xl, cbk->op_ret, - cbk->op_errno, &cbk->iatt[0], &cbk->iatt[1], - cbk->xdata); + QUORUM_CBK(fop->cbks.writev, fop, fop->req_frame, fop, fop->xl, + cbk->op_ret, cbk->op_errno, &cbk->iatt[0], + &cbk->iatt[1], cbk->xdata); } return EC_STATE_LOCK_REUSE; diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h index b93a07aba40..9c790380d4d 100644 --- a/xlators/cluster/ec/src/ec-types.h +++ b/xlators/cluster/ec/src/ec-types.h @@ -655,6 +655,7 @@ struct _ec { gf_boolean_t optimistic_changelog; gf_boolean_t parallel_writes; uint32_t stripe_cache; + uint32_t quorum_count; uint32_t background_heals; uint32_t heal_wait_qlen; uint32_t self_heal_window_size; /* max size of read/writes */ diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c index b7acc666afc..f75ad395a4c 100644 --- a/xlators/cluster/ec/src/ec.c +++ b/xlators/cluster/ec/src/ec.c @@ -285,6 +285,7 @@ reconfigure(xlator_t *this, dict_t *options) GF_OPTION_RECONF("parallel-writes", ec->parallel_writes, options, bool, failed); GF_OPTION_RECONF("stripe-cache", ec->stripe_cache, options, uint32, failed); + GF_OPTION_RECONF("quorum-count", ec->quorum_count, options, uint32, failed); ret = 0; if (ec_assign_read_policy(ec, read_policy)) { ret = -1; @@ -783,6 +784,7 @@ init(xlator_t *this) failed); GF_OPTION_INIT("parallel-writes", ec->parallel_writes, bool, failed); GF_OPTION_INIT("stripe-cache", ec->stripe_cache, uint32, failed); + GF_OPTION_INIT("quorum-count", ec->quorum_count, uint32, failed); this->itable = inode_table_new(EC_SHD_INODE_LRU_LIMIT, this); if (!this->itable) @@ -1466,6 +1468,7 @@ ec_dump_private(xlator_t *this) gf_proc_dump_write("heal-waiters", "%d", ec->heal_waiters); gf_proc_dump_write("read-policy", "%s", ec_read_policies[ec->read_policy]); gf_proc_dump_write("parallel-writes", "%d", ec->parallel_writes); + gf_proc_dump_write("quorum-count", "%u", ec->quorum_count); snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s.stats.stripe_cache", this->type, this->name); @@ -1735,6 +1738,16 @@ struct volume_options options[] = { "specially for sequential writes. However, this will also" "lead to extra memory consumption, maximum " "(cache size * stripe size) Bytes per open file."}, + { + .key = {"quorum-count"}, + .type = GF_OPTION_TYPE_INT, + .default_value = "0", + .description = + "This option can be used to define how many successes on" + "the bricks constitute a success to the application. This" + " count should be in the range" + "[disperse-data-count, disperse-count] (inclusive)", + }, { .key = {NULL}, }, diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index 2ed8d48be59..e4427069263 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -646,6 +646,42 @@ out: return ret; } +static int +validate_disperse_quorum_count(glusterd_volinfo_t *volinfo, dict_t *dict, + char *key, char *value, char **op_errstr) +{ + int ret = -1; + int quorum_count = 0; + int data_count = 0; + + ret = gf_string2int(value, &quorum_count); + if (ret) { + gf_asprintf(op_errstr, + "%s is not an integer. %s expects a " + "valid integer value.", + value, key); + goto out; + } + + if (volinfo->type != GF_CLUSTER_TYPE_DISPERSE) { + gf_asprintf(op_errstr, "Cannot set %s for a non-disperse volume.", key); + ret = -1; + goto out; + } + + data_count = volinfo->disperse_count - volinfo->redundancy_count; + if (quorum_count < data_count || quorum_count > volinfo->disperse_count) { + gf_asprintf(op_errstr, "%d for %s is out of range [%d - %d]", + quorum_count, key, data_count, volinfo->disperse_count); + ret = -1; + goto out; + } + + ret = 0; +out: + return ret; +} + static int validate_parallel_readdir(glusterd_volinfo_t *volinfo, dict_t *dict, char *key, char *value, char **op_errstr) @@ -2917,6 +2953,16 @@ struct volopt_map_entry glusterd_volopt_map[] = { .type = NO_DOC, .op_version = GD_OP_VERSION_3_13_0, .flags = VOLOPT_FLAG_CLIENT_OPT}, + {.key = "disperse.quorum-count", + .voltype = "cluster/disperse", + .type = NO_DOC, + .op_version = GD_OP_VERSION_8_0, + .validate_fn = validate_disperse_quorum_count, + .description = "This option can be used to define how many successes on" + "the bricks constitute a success to the application. This" + " count should be in the range" + "[disperse-data-count, disperse-count] (inclusive)", + .flags = VOLOPT_FLAG_CLIENT_OPT}, { .key = "features.sdfs", .voltype = "features/sdfs", -- cgit