diff options
Diffstat (limited to 'xlators/cluster/ec/src')
22 files changed, 1433 insertions, 599 deletions
diff --git a/xlators/cluster/ec/src/ec-combine.c b/xlators/cluster/ec/src/ec-combine.c index c5af2ab5e39..703a30e2485 100644 --- a/xlators/cluster/ec/src/ec-combine.c +++ b/xlators/cluster/ec/src/ec-combine.c @@ -179,13 +179,14 @@ ec_iatt_combine(ec_fop_data_t *fop, struct iatt *dst, struct iatt *src, "links: %u-%u, uid: %u-%u, gid: %u-%u, " "rdev: %" PRIu64 "-%" PRIu64 ", size: %" PRIu64 "-%" PRIu64 ", " - "mode: %o-%o)", + "mode: %o-%o), %s", dst[i].ia_ino, src[i].ia_ino, dst[i].ia_nlink, src[i].ia_nlink, dst[i].ia_uid, src[i].ia_uid, dst[i].ia_gid, src[i].ia_gid, dst[i].ia_rdev, src[i].ia_rdev, dst[i].ia_size, src[i].ia_size, st_mode_from_ia(dst[i].ia_prot, dst[i].ia_type), - st_mode_from_ia(src[i].ia_prot, dst[i].ia_type)); + st_mode_from_ia(src[i].ia_prot, dst[i].ia_type), + ec_msg_str(fop)); return 0; } @@ -342,9 +343,8 @@ out: } static int32_t -ec_dict_data_concat(const char *fmt, ec_cbk_data_t *cbk, int32_t which, - char *key, char *new_key, const char *def, - gf_boolean_t global, ...) +ec_dict_data_concat(ec_cbk_data_t *cbk, int32_t which, char *key, char *new_key, + const char *def, gf_boolean_t global, const char *fmt, ...) { ec_t *ec = cbk->fop->xl->private; data_t *data[ec->nodes]; @@ -356,7 +356,7 @@ ec_dict_data_concat(const char *fmt, ec_cbk_data_t *cbk, int32_t which, ec_dict_list(data, cbk, which, key, global); - va_start(args, global); + va_start(args, fmt); err = ec_concat_prepare(cbk->fop->xl, &pre, &sep, &post, fmt, args); va_end(args); @@ -485,22 +485,12 @@ ec_dict_data_merge(ec_cbk_data_t *cbk, int32_t which, char *key) tmp = NULL; - len = dict_serialized_length(lockinfo); - if (len < 0) { - err = len; - - goto out; - } - ptr = GF_MALLOC(len, gf_common_mt_char); - if (ptr == NULL) { - err = -ENOMEM; - - goto out; - } - err = dict_serialize(lockinfo, ptr); + err = dict_allocate_and_serialize(lockinfo, (char **)&ptr, + (unsigned int *)&len); if (err != 0) { goto out; } + dict = (which == EC_COMBINE_XDATA) ? cbk->xdata : cbk->dict; err = dict_set_dynptr(dict, key, ptr, len); if (err != 0) { @@ -739,14 +729,14 @@ ec_dict_data_combine(dict_t *dict, char *key, data_t *value, void *arg) if ((strcmp(key, GF_XATTR_PATHINFO_KEY) == 0) || (strcmp(key, GF_XATTR_USER_PATHINFO_KEY) == 0)) { - return ec_dict_data_concat("(<EC:%s> { })", data->cbk, data->which, key, - NULL, NULL, _gf_false, + return ec_dict_data_concat(data->cbk, data->which, key, NULL, NULL, + _gf_false, _gf_false, "(<EC:%s> { })", data->cbk->fop->xl->name); } if (strncmp(key, GF_XATTR_CLRLK_CMD, SLEN(GF_XATTR_CLRLK_CMD)) == 0) { - return ec_dict_data_concat("{\n}", data->cbk, data->which, key, NULL, - NULL, _gf_false); + return ec_dict_data_concat(data->cbk, data->which, key, NULL, NULL, + _gf_false, "{\n}"); } if (strncmp(key, GF_XATTR_LOCKINFO_KEY, SLEN(GF_XATTR_LOCKINFO_KEY)) == 0) { @@ -776,9 +766,9 @@ ec_dict_data_combine(dict_t *dict, char *key, data_t *value, void *arg) if (XATTR_IS_NODE_UUID(key)) { if (data->cbk->fop->int32) { /* List of node uuid is requested */ - return ec_dict_data_concat("{ }", data->cbk, data->which, key, + return ec_dict_data_concat(data->cbk, data->which, key, GF_XATTR_LIST_NODE_UUIDS_KEY, UUID0_STR, - _gf_true); + _gf_true, "{ }"); } else { return ec_dict_data_uuid(data->cbk, data->which, key); } diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c index 8d656702e12..b955efd8c2d 100644 --- a/xlators/cluster/ec/src/ec-common.c +++ b/xlators/cluster/ec/src/ec-common.c @@ -44,16 +44,16 @@ ec_update_fd_status(fd_t *fd, xlator_t *xl, int idx, int32_t ret_status) UNLOCK(&fd->lock); } -static int -ec_fd_ctx_need_open(fd_t *fd, xlator_t *this, uintptr_t *need_open) +static uintptr_t +ec_fd_ctx_need_open(fd_t *fd, xlator_t *this, uintptr_t mask) { int i = 0; int count = 0; ec_t *ec = NULL; ec_fd_t *fd_ctx = NULL; + uintptr_t need_open = 0; ec = this->private; - *need_open = 0; fd_ctx = ec_fd_get(fd, this); if (!fd_ctx) @@ -63,9 +63,9 @@ ec_fd_ctx_need_open(fd_t *fd, xlator_t *this, uintptr_t *need_open) { for (i = 0; i < ec->nodes; i++) { if ((fd_ctx->fd_status[i] == EC_FD_NOT_OPENED) && - (ec->xl_up & (1 << i))) { + ((ec->xl_up & (1 << i)) != 0) && ((mask & (1 << i)) != 0)) { fd_ctx->fd_status[i] = EC_FD_OPENING; - *need_open |= (1 << i); + need_open |= (1 << i); count++; } } @@ -76,10 +76,11 @@ ec_fd_ctx_need_open(fd_t *fd, xlator_t *this, uintptr_t *need_open) * then ignore fixing the fd as it has been * requested from heal operation. */ - if (count >= ec->fragments) - count = 0; + if (count >= ec->fragments) { + need_open = 0; + } - return count; + return need_open; } static gf_boolean_t @@ -96,11 +97,11 @@ ec_is_fd_fixable(fd_t *fd) } static void -ec_fix_open(ec_fop_data_t *fop) +ec_fix_open(ec_fop_data_t *fop, uintptr_t mask) { - int call_count = 0; uintptr_t need_open = 0; int ret = 0; + int32_t flags = 0; loc_t loc = { 0, }; @@ -109,9 +110,10 @@ ec_fix_open(ec_fop_data_t *fop) goto out; /* Evaluate how many remote fd's to be opened */ - call_count = ec_fd_ctx_need_open(fop->fd, fop->xl, &need_open); - if (!call_count) + need_open = ec_fd_ctx_need_open(fop->fd, fop->xl, mask); + if (need_open == 0) { goto out; + } loc.inode = inode_ref(fop->fd->inode); gf_uuid_copy(loc.gfid, fop->fd->inode->gfid); @@ -120,12 +122,15 @@ ec_fix_open(ec_fop_data_t *fop) goto out; } + flags = fop->fd->flags & (~(O_TRUNC | O_APPEND | O_CREAT | O_EXCL)); if (IA_IFDIR == fop->fd->inode->ia_type) { - ec_opendir(fop->frame, fop->xl, need_open, EC_MINIMUM_ONE, NULL, NULL, + ec_opendir(fop->frame, fop->xl, need_open, + EC_MINIMUM_ONE | EC_FOP_NO_PROPAGATE_ERROR, NULL, NULL, &fop->loc[0], fop->fd, NULL); } else { - ec_open(fop->frame, fop->xl, need_open, EC_MINIMUM_ONE, NULL, NULL, - &loc, fop->fd->flags, fop->fd, NULL); + ec_open(fop->frame, fop->xl, need_open, + EC_MINIMUM_ONE | EC_FOP_NO_PROPAGATE_ERROR, NULL, NULL, &loc, + flags, fop->fd, NULL); } out: @@ -225,7 +230,7 @@ ec_child_next(ec_t *ec, ec_fop_data_t *fop, uint32_t idx) int32_t ec_heal_report(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, uintptr_t mask, uintptr_t good, - uintptr_t bad, dict_t *xdata) + uintptr_t bad, uint32_t pending, dict_t *xdata) { if (op_ret < 0) { gf_msg(this->name, GF_LOG_DEBUG, op_errno, EC_MSG_HEAL_FAIL, @@ -311,16 +316,19 @@ ec_check_status(ec_fop_data_t *fop) } } - gf_msg(fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_OP_FAIL_ON_SUBVOLS, - "Operation failed on %d of %d subvolumes.(up=%s, mask=%s, " - "remaining=%s, good=%s, bad=%s)", - gf_bits_count(ec->xl_up & ~(fop->remaining | fop->good)), ec->nodes, - ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes), - ec_bin(str2, sizeof(str2), fop->mask, ec->nodes), - ec_bin(str3, sizeof(str3), fop->remaining, ec->nodes), - ec_bin(str4, sizeof(str4), fop->good, ec->nodes), - ec_bin(str5, sizeof(str5), ec->xl_up & ~(fop->remaining | fop->good), - ec->nodes)); + gf_msg( + fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_OP_FAIL_ON_SUBVOLS, + "Operation failed on %d of %d subvolumes.(up=%s, mask=%s, " + "remaining=%s, good=%s, bad=%s," + "(Least significant bit represents first client/brick of subvol), %s)", + gf_bits_count(ec->xl_up & ~(fop->remaining | fop->good)), ec->nodes, + ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes), + ec_bin(str2, sizeof(str2), fop->mask, ec->nodes), + ec_bin(str3, sizeof(str3), fop->remaining, ec->nodes), + ec_bin(str4, sizeof(str4), fop->good, ec->nodes), + ec_bin(str5, sizeof(str5), ec->xl_up & ~(fop->remaining | fop->good), + ec->nodes), + ec_msg_str(fop)); if (fop->use_fd) { if (fop->fd != NULL) { ec_fheal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL, @@ -494,12 +502,16 @@ ec_resume(ec_fop_data_t *fop, int32_t error) } void -ec_resume_parent(ec_fop_data_t *fop, int32_t error) +ec_resume_parent(ec_fop_data_t *fop) { ec_fop_data_t *parent; + int32_t error = 0; parent = fop->parent; if (parent != NULL) { + if ((fop->fop_flags & EC_FOP_NO_PROPAGATE_ERROR) == 0) { + error = fop->error; + } ec_trace("RESUME_PARENT", fop, "error=%u", error); fop->parent = NULL; ec_resume(parent, error); @@ -592,6 +604,8 @@ ec_internal_op(ec_fop_data_t *fop) return _gf_true; if (fop->id == GF_FOP_FXATTROP) return _gf_true; + if (fop->id == GF_FOP_OPEN) + return _gf_true; return _gf_false; } @@ -602,10 +616,10 @@ ec_msg_str(ec_fop_data_t *fop) loc_t *loc2 = NULL; char gfid1[64] = {0}; char gfid2[64] = {0}; + ec_fop_data_t *parent = fop->parent; if (fop->errstr) return fop->errstr; - if (!fop->use_fd) { loc1 = &fop->loc[0]; loc2 = &fop->loc[1]; @@ -613,24 +627,46 @@ ec_msg_str(ec_fop_data_t *fop) if (fop->id == GF_FOP_RENAME) { gf_asprintf(&fop->errstr, "FOP : '%s' failed on '%s' and '%s' with gfids " - "%s and %s respectively", + "%s and %s respectively. Parent FOP: %s", ec_fop_name(fop->id), loc1->path, loc2->path, uuid_utoa_r(loc1->gfid, gfid1), - uuid_utoa_r(loc2->gfid, gfid2)); + uuid_utoa_r(loc2->gfid, gfid2), + parent ? ec_fop_name(parent->id) : "No Parent"); } else { - gf_asprintf(&fop->errstr, "FOP : '%s' failed on '%s' with gfid %s", - ec_fop_name(fop->id), loc1->path, - uuid_utoa_r(loc1->gfid, gfid1)); + gf_asprintf( + &fop->errstr, + "FOP : '%s' failed on '%s' with gfid %s. Parent FOP: %s", + ec_fop_name(fop->id), loc1->path, + uuid_utoa_r(loc1->gfid, gfid1), + parent ? ec_fop_name(parent->id) : "No Parent"); } } else { - gf_asprintf(&fop->errstr, "FOP : '%s' failed on gfid %s", - ec_fop_name(fop->id), - uuid_utoa_r(fop->fd->inode->gfid, gfid1)); + gf_asprintf( + &fop->errstr, "FOP : '%s' failed on gfid %s. Parent FOP: %s", + ec_fop_name(fop->id), uuid_utoa_r(fop->fd->inode->gfid, gfid1), + parent ? ec_fop_name(parent->id) : "No Parent"); } return fop->errstr; } -int32_t +static void +ec_log_insufficient_vol(ec_fop_data_t *fop, int32_t have, uint32_t need, + int32_t loglevel) +{ + ec_t *ec = fop->xl->private; + char str1[32], str2[32], str3[32]; + + gf_msg(ec->xl->name, loglevel, 0, EC_MSG_CHILDS_INSUFFICIENT, + "Insufficient available children for this request: " + "Have : %d, Need : %u : Child UP : %s " + "Mask: %s, Healing : %s : %s ", + have, need, ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes), + ec_bin(str2, sizeof(str2), fop->mask, ec->nodes), + ec_bin(str3, sizeof(str3), fop->healing, ec->nodes), + ec_msg_str(fop)); +} + +static int32_t ec_child_select(ec_fop_data_t *fop) { ec_t *ec = fop->xl->private; @@ -644,6 +680,9 @@ ec_child_select(ec_fop_data_t *fop) * unlock should go on all subvols where lock is performed*/ if (fop->parent && !ec_internal_op(fop)) { fop->mask &= (fop->parent->mask & ~fop->parent->healing); + if (ec_is_data_fop(fop->id)) { + fop->healing |= fop->parent->healing; + } } if ((fop->mask & ~ec->xl_up) != 0) { @@ -684,15 +723,18 @@ ec_child_select(ec_fop_data_t *fop) ec_trace("SELECT", fop, ""); if ((num < fop->minimum) && (num < ec->fragments)) { - gf_msg(ec->xl->name, GF_LOG_ERROR, 0, EC_MSG_CHILDS_INSUFFICIENT, - "Insufficient available children " - "for this request (have %d, need " - "%d). %s", - num, fop->minimum, ec_msg_str(fop)); + ec_log_insufficient_vol(fop, num, fop->minimum, GF_LOG_ERROR); return 0; } - ec_sleep(fop); + if (!fop->parent && fop->lock_count && + (fop->locks[0].update[EC_DATA_TXN] || + fop->locks[0].update[EC_METADATA_TXN])) { + if (ec->quorum_count && (num < ec->quorum_count)) { + ec_log_insufficient_vol(fop, num, ec->quorum_count, GF_LOG_ERROR); + return 0; + } + } return 1; } @@ -772,6 +814,8 @@ ec_dispatch_one(ec_fop_data_t *fop) ec_dispatch_start(fop); if (ec_child_select(fop)) { + ec_sleep(fop); + fop->expected = 1; fop->first = ec_select_first_by_read_policy(fop->xl->private, fop); @@ -806,6 +850,8 @@ ec_dispatch_inc(ec_fop_data_t *fop) ec_dispatch_start(fop); if (ec_child_select(fop)) { + ec_sleep(fop); + fop->expected = gf_bits_count(fop->remaining); fop->first = 0; @@ -819,6 +865,8 @@ ec_dispatch_all(ec_fop_data_t *fop) ec_dispatch_start(fop); if (ec_child_select(fop)) { + ec_sleep(fop); + fop->expected = gf_bits_count(fop->remaining); fop->first = 0; @@ -837,6 +885,8 @@ ec_dispatch_min(ec_fop_data_t *fop) ec_dispatch_start(fop); if (ec_child_select(fop)) { + ec_sleep(fop); + fop->expected = count = ec->fragments; fop->first = ec_select_first_by_read_policy(fop->xl->private, fop); idx = fop->first - 1; @@ -851,6 +901,23 @@ ec_dispatch_min(ec_fop_data_t *fop) } } +void +ec_succeed_all(ec_fop_data_t *fop) +{ + ec_dispatch_start(fop); + + if (ec_child_select(fop)) { + fop->expected = gf_bits_count(fop->remaining); + fop->first = 0; + + /* Simulate a successful execution on all bricks */ + ec_trace("SUCCEED", fop, ""); + + fop->good = fop->remaining; + fop->remaining = 0; + } +} + ec_lock_t * ec_lock_allocate(ec_fop_data_t *fop, loc_t *loc) { @@ -1372,27 +1439,28 @@ ec_get_size_version(ec_lock_link_t *link) !ec_is_data_fop(fop->id)) link->optimistic_changelog = _gf_true; + memset(&loc, 0, sizeof(loc)); + + LOCK(&lock->loc.inode->lock); + set_dirty = ec_set_dirty_flag(link, ctx, dirty); /* If ec metadata has already been retrieved, do not try again. */ - if (ctx->have_info && (!set_dirty)) { + if (ctx->have_info) { if (ec_is_data_fop(fop->id)) { fop->healing |= lock->healing; } - return; + if (!set_dirty) + goto unlock; } /* Determine if there's something we need to retrieve for the current * operation. */ if (!set_dirty && !lock->query && (lock->loc.inode->ia_type != IA_IFREG) && (lock->loc.inode->ia_type != IA_INVAL)) { - return; + goto unlock; } - memset(&loc, 0, sizeof(loc)); - - LOCK(&lock->loc.inode->lock); - changed_flags = ec_set_xattrop_flags_and_params(lock, link, dirty); if (link->waiting_flags) { /* This fop needs to wait until all its flags are cleared which @@ -1403,6 +1471,7 @@ ec_get_size_version(ec_lock_link_t *link) GF_ASSERT(!changed_flags); } +unlock: UNLOCK(&lock->loc.inode->lock); if (!changed_flags) @@ -1814,6 +1883,10 @@ ec_lock_acquired(ec_lock_link_t *link) LOCK(&lock->loc.inode->lock); lock->acquired = _gf_true; + if (lock->contention) { + lock->release = _gf_true; + lock->contention = _gf_false; + } ec_lock_update_fd(lock, fop); ec_lock_wake_shared(lock, &list); @@ -1824,7 +1897,8 @@ ec_lock_acquired(ec_lock_link_t *link) if (fop->use_fd && (link->update[EC_DATA_TXN] || link->update[EC_METADATA_TXN])) { - ec_fix_open(fop); + /* Try to reopen closed fd's only if lock has succeeded. */ + ec_fix_open(fop, lock->mask); } ec_lock_resume_shared(&list); @@ -1838,15 +1912,20 @@ ec_locked(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, ec_lock_link_t *link = NULL; ec_lock_t *lock = NULL; + link = fop->data; + lock = link->lock; if (op_ret >= 0) { - link = fop->data; - lock = link->lock; lock->mask = lock->good_mask = fop->good; lock->healing = 0; ec_lock_acquired(link); ec_lock(fop->parent); } else { + LOCK(&lock->loc.inode->lock); + { + lock->contention = _gf_false; + } + UNLOCK(&lock->loc.inode->lock); gf_msg(this->name, GF_LOG_WARNING, op_errno, EC_MSG_PREOP_LOCK_FAILED, "Failed to complete preop lock"); } @@ -2177,7 +2256,7 @@ ec_unlocked(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, if (op_ret < 0) { gf_msg(this->name, GF_LOG_WARNING, op_errno, EC_MSG_UNLOCK_FAILED, - "entry/inode unlocking failed (%s)", ec_fop_name(link->fop->id)); + "entry/inode unlocking failed :(%s)", ec_msg_str(link->fop)); } else { ec_trace("UNLOCKED", link->fop, "lock=%p", link->lock); } @@ -2214,6 +2293,23 @@ ec_unlock_lock(ec_lock_link_t *link) } } +void +ec_inode_bad_inc(inode_t *inode, xlator_t *xl) +{ + ec_inode_t *ctx = NULL; + + LOCK(&inode->lock); + { + ctx = __ec_inode_get(inode, xl); + if (ctx == NULL) { + goto unlock; + } + ctx->bad_version++; + } +unlock: + UNLOCK(&inode->lock); +} + int32_t ec_update_size_version_done(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xattr, @@ -2229,6 +2325,12 @@ ec_update_size_version_done(call_frame_t *frame, void *cookie, xlator_t *this, ctx = lock->ctx; if (op_ret < 0) { + if (link->lock->fd == NULL) { + ec_inode_bad_inc(link->lock->loc.inode, this); + } else { + ec_inode_bad_inc(link->lock->fd->inode, this); + } + gf_msg(fop->xl->name, fop_log_level(fop->id, op_errno), op_errno, EC_MSG_SIZE_VERS_UPDATE_FAIL, "Failed to update version and size. %s", ec_msg_str(fop)); @@ -2371,37 +2473,47 @@ ec_update_info(ec_lock_link_t *link) uint64_t dirty[2] = {0, 0}; uint64_t size; ec_t *ec = NULL; + uintptr_t mask; lock = link->lock; ctx = lock->ctx; ec = link->fop->xl->private; /* pre_version[*] will be 0 if have_version is false */ - version[0] = ctx->post_version[0] - ctx->pre_version[0]; - version[1] = ctx->post_version[1] - ctx->pre_version[1]; + version[EC_DATA_TXN] = ctx->post_version[EC_DATA_TXN] - + ctx->pre_version[EC_DATA_TXN]; + version[EC_METADATA_TXN] = ctx->post_version[EC_METADATA_TXN] - + ctx->pre_version[EC_METADATA_TXN]; size = ctx->post_size - ctx->pre_size; /* If we set the dirty flag for update fop, we have to unset it. * If fop has failed on some bricks, leave the dirty as marked. */ + if (lock->unlock_now) { + if (version[EC_DATA_TXN]) { + /*A data fop will have difference in post and pre version + *and for data fop we send writes on healing bricks also */ + mask = lock->good_mask | lock->healing; + } else { + mask = lock->good_mask; + } /* Ensure that nodes are up while doing final * metadata update.*/ - if (!(ec->node_mask & ~lock->good_mask) && - !(ec->node_mask & ~ec->xl_up)) { - if (ctx->dirty[0] != 0) { - dirty[0] = -1; + if (!(ec->node_mask & ~(mask)) && !(ec->node_mask & ~ec->xl_up)) { + if (ctx->dirty[EC_DATA_TXN] != 0) { + dirty[EC_DATA_TXN] = -1; } - if (ctx->dirty[1] != 0) { - dirty[1] = -1; + if (ctx->dirty[EC_METADATA_TXN] != 0) { + dirty[EC_METADATA_TXN] = -1; } /*If everything is fine and we already *have version xattr set on entry, there *is no need to update version again*/ - if (ctx->pre_version[0]) { - version[0] = 0; + if (ctx->pre_version[EC_DATA_TXN]) { + version[EC_DATA_TXN] = 0; } - if (ctx->pre_version[1]) { - version[1] = 0; + if (ctx->pre_version[EC_METADATA_TXN]) { + version[EC_METADATA_TXN] = 0; } } else { link->optimistic_changelog = _gf_false; @@ -2410,8 +2522,8 @@ ec_update_info(ec_lock_link_t *link) memset(ctx->dirty, 0, sizeof(ctx->dirty)); } - if ((version[0] != 0) || (version[1] != 0) || (dirty[0] != 0) || - (dirty[1] != 0)) { + if ((version[EC_DATA_TXN] != 0) || (version[EC_METADATA_TXN] != 0) || + (dirty[EC_DATA_TXN] != 0) || (dirty[EC_METADATA_TXN] != 0)) { ec_update_size_version(link, version, size, dirty); return _gf_true; } @@ -2453,13 +2565,20 @@ ec_lock_release(ec_t *ec, inode_t *inode) goto done; } lock = ctx->inode_lock; - if ((lock == NULL) || !lock->acquired || lock->release) { + if ((lock == NULL) || lock->release) { goto done; } gf_msg_debug(ec->xl->name, 0, "Releasing inode %p due to lock contention", inode); + if (!lock->acquired) { + /* This happens if some bricks already got the lock while inodelk is in + * progress. Set release to true after lock is acquired*/ + lock->contention = _gf_true; + goto done; + } + /* The lock is not marked to be released, so the frozen list should be * empty. */ GF_ASSERT(list_empty(&lock->frozen)); @@ -2911,3 +3030,13 @@ ec_manager(ec_fop_data_t *fop, int32_t error) __ec_manager(fop, error); } + +gf_boolean_t +__ec_is_last_fop(ec_t *ec) +{ + if ((list_empty(&ec->pending_fops)) && + (GF_ATOMIC_GET(ec->async_fop_count) == 0)) { + return _gf_true; + } + return _gf_false; +} diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h index 115e1475b50..51493612ac6 100644 --- a/xlators/cluster/ec/src/ec-common.h +++ b/xlators/cluster/ec/src/ec-common.h @@ -11,8 +11,7 @@ #ifndef __EC_COMMON_H__ #define __EC_COMMON_H__ -#include <glusterfs/xlator.h> - +#include "glusterfs/compat-errno.h" // for ENODATA on BSD #include "ec-data.h" typedef enum { EC_DATA_TXN, EC_METADATA_TXN } ec_txn_t; @@ -26,6 +25,30 @@ typedef enum { EC_DATA_TXN, EC_METADATA_TXN } ec_txn_t; #define EC_FLAG_LOCK_SHARED 0x0001 +#define QUORUM_CBK(fn, fop, frame, cookie, this, op_ret, op_errno, params...) \ + do { \ + ec_t *__ec = fop->xl->private; \ + int32_t __op_ret = 0; \ + int32_t __op_errno = 0; \ + int32_t __success_count = gf_bits_count(fop->good); \ + \ + __op_ret = op_ret; \ + __op_errno = op_errno; \ + if (!fop->parent && frame && \ + (GF_CLIENT_PID_SELF_HEALD != frame->root->pid) && \ + __ec->quorum_count && (__success_count < __ec->quorum_count) && \ + op_ret >= 0) { \ + __op_ret = -1; \ + __op_errno = EIO; \ + gf_msg(__ec->xl->name, GF_LOG_ERROR, 0, \ + EC_MSG_CHILDS_INSUFFICIENT, \ + "Insufficient available children for this request " \ + "(have %d, need %d). %s", \ + __success_count, __ec->quorum_count, ec_msg_str(fop)); \ + } \ + fn(frame, cookie, this, __op_ret, __op_errno, params); \ + } while (0) + enum _ec_xattrop_flags { EC_FLAG_XATTROP, EC_FLAG_DATA_DIRTY, @@ -54,9 +77,12 @@ enum _ec_xattrop_flags { #define EC_SELFHEAL_BIT 62 -#define EC_MINIMUM_ONE -1 -#define EC_MINIMUM_MIN -2 -#define EC_MINIMUM_ALL -3 +#define EC_MINIMUM_ONE (1 << 6) +#define EC_MINIMUM_MIN (2 << 6) +#define EC_MINIMUM_ALL (3 << 6) +#define EC_FOP_NO_PROPAGATE_ERROR (1 << 8) +#define EC_FOP_MINIMUM(_flags) ((_flags)&255) +#define EC_FOP_FLAGS(_flags) ((_flags) & ~255) #define EC_UPDATE_DATA 1 #define EC_UPDATE_META 2 @@ -163,11 +189,14 @@ void ec_dispatch_one(ec_fop_data_t *fop); void +ec_succeed_all(ec_fop_data_t *fop); + +void ec_sleep(ec_fop_data_t *fop); void ec_resume(ec_fop_data_t *fop, int32_t error); void -ec_resume_parent(ec_fop_data_t *fop, int32_t error); +ec_resume_parent(ec_fop_data_t *fop); void ec_manager(ec_fop_data_t *fop, int32_t error); @@ -190,4 +219,16 @@ ec_lock_unlocked(call_frame_t *frame, void *cookie, xlator_t *this, void ec_update_fd_status(fd_t *fd, xlator_t *xl, int child_index, int32_t ret_status); +gf_boolean_t +ec_is_entry_healing(ec_fop_data_t *fop); +void +ec_set_entry_healing(ec_fop_data_t *fop); +void +ec_reset_entry_healing(ec_fop_data_t *fop); +char * +ec_msg_str(ec_fop_data_t *fop); +gf_boolean_t +__ec_is_last_fop(ec_t *ec); +void +ec_lock_update_good(ec_lock_t *lock, ec_fop_data_t *fop); #endif /* __EC_COMMON_H__ */ diff --git a/xlators/cluster/ec/src/ec-data.c b/xlators/cluster/ec/src/ec-data.c index fae8843a679..06388833546 100644 --- a/xlators/cluster/ec/src/ec-data.c +++ b/xlators/cluster/ec/src/ec-data.c @@ -8,7 +8,6 @@ cases as published by the Free Software Foundation. */ -#include "ec-mem-types.h" #include "ec-helpers.h" #include "ec-common.h" #include "ec-data.h" @@ -98,7 +97,7 @@ ec_cbk_data_destroy(ec_cbk_data_t *cbk) ec_fop_data_t * ec_fop_data_allocate(call_frame_t *frame, xlator_t *this, int32_t id, - uint32_t flags, uintptr_t target, int32_t minimum, + uint32_t flags, uintptr_t target, uint32_t fop_flags, ec_wind_f wind, ec_handler_f handler, ec_cbk_t cbks, void *data) { @@ -151,7 +150,8 @@ ec_fop_data_allocate(call_frame_t *frame, xlator_t *this, int32_t id, fop->refs = 1; fop->flags = flags; - fop->minimum = minimum; + fop->minimum = EC_FOP_MINIMUM(fop_flags); + fop->fop_flags = EC_FOP_FLAGS(fop_flags); fop->mask = target; fop->wind = wind; @@ -201,11 +201,13 @@ ec_handle_last_pending_fop_completion(ec_fop_data_t *fop, gf_boolean_t *notify) { ec_t *ec = fop->xl->private; + *notify = _gf_false; + if (!list_empty(&fop->pending_list)) { LOCK(&ec->lock); { list_del_init(&fop->pending_list); - *notify = list_empty(&ec->pending_fops); + *notify = __ec_is_last_fop(ec); } UNLOCK(&ec->lock); } @@ -271,7 +273,7 @@ ec_fop_data_release(ec_fop_data_t *fop) loc_wipe(&fop->loc[1]); GF_FREE(fop->errstr); - ec_resume_parent(fop, fop->error); + ec_resume_parent(fop); ec_fop_cleanup(fop); diff --git a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h index 112536d554c..c8a74ffe1ed 100644 --- a/xlators/cluster/ec/src/ec-data.h +++ b/xlators/cluster/ec/src/ec-data.h @@ -18,7 +18,7 @@ ec_cbk_data_allocate(call_frame_t *frame, xlator_t *this, ec_fop_data_t *fop, int32_t id, int32_t idx, int32_t op_ret, int32_t op_errno); ec_fop_data_t * ec_fop_data_allocate(call_frame_t *frame, xlator_t *this, int32_t id, - uint32_t flags, uintptr_t target, int32_t minimum, + uint32_t flags, uintptr_t target, uint32_t fop_flags, ec_wind_f wind, ec_handler_f handler, ec_cbk_t cbks, void *data); void diff --git a/xlators/cluster/ec/src/ec-dir-read.c b/xlators/cluster/ec/src/ec-dir-read.c index c9db7010a0f..f71dcfac293 100644 --- a/xlators/cluster/ec/src/ec-dir-read.c +++ b/xlators/cluster/ec/src/ec-dir-read.c @@ -8,15 +8,11 @@ cases as published by the Free Software Foundation. */ -#include <glusterfs/xlator.h> -#include <glusterfs/defaults.h> - #include "ec.h" #include "ec-messages.h" #include "ec-helpers.h" #include "ec-common.h" #include "ec-combine.h" -#include "ec-method.h" #include "ec-fops.h" /**************************************************************** @@ -127,13 +123,15 @@ ec_manager_opendir(ec_fop_data_t *fop, int32_t state) return EC_STATE_REPORT; } - err = ec_loc_from_loc(fop->xl, &ctx->loc, &fop->loc[0]); - if (err != 0) { - UNLOCK(&fop->fd->lock); + if (!ctx->loc.inode) { + err = ec_loc_from_loc(fop->xl, &ctx->loc, &fop->loc[0]); + if (err != 0) { + UNLOCK(&fop->fd->lock); - fop->error = -err; + fop->error = -err; - return EC_STATE_REPORT; + return EC_STATE_REPORT; + } } UNLOCK(&fop->fd->lock); @@ -219,7 +217,7 @@ ec_manager_opendir(ec_fop_data_t *fop, int32_t state) void ec_opendir(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_opendir_cbk_t func, void *data, loc_t *loc, + uint32_t fop_flags, fop_opendir_cbk_t func, void *data, loc_t *loc, fd_t *fd, dict_t *xdata) { ec_cbk_t callback = {.opendir = func}; @@ -233,7 +231,7 @@ ec_opendir(call_frame_t *frame, xlator_t *this, uintptr_t target, GF_VALIDATE_OR_GOTO(this->name, this->private, out); fop = ec_fop_data_allocate(frame, this, GF_FOP_OPENDIR, EC_FLAG_LOCK_SHARED, - target, minimum, ec_wind_opendir, + target, fop_flags, ec_wind_opendir, ec_manager_opendir, callback, data); if (fop == NULL) { goto out; @@ -388,9 +386,16 @@ ec_manager_readdir(ec_fop_data_t *fop, int32_t state) /* Return error if opendir has not been successfully called on * any subvolume. */ ctx = ec_fd_get(fop->fd, fop->xl); - if ((ctx == NULL) || (ctx->open == 0)) { - fop->error = EINVAL; + if (ctx == NULL) { + fop->error = ENOMEM; + } else if (ctx->open == 0) { + fop->error = EBADFD; + } + if (fop->error) { + gf_msg(fop->xl->name, GF_LOG_ERROR, fop->error, + EC_MSG_INVALID_REQUEST, "EC is not winding readdir: %s", + ec_msg_str(fop)); return EC_STATE_REPORT; } @@ -515,7 +520,7 @@ ec_manager_readdir(ec_fop_data_t *fop, int32_t state) void ec_readdir(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_readdir_cbk_t func, void *data, fd_t *fd, + uint32_t fop_flags, fop_readdir_cbk_t func, void *data, fd_t *fd, size_t size, off_t offset, dict_t *xdata) { ec_cbk_t callback = {.readdir = func}; @@ -529,7 +534,7 @@ ec_readdir(call_frame_t *frame, xlator_t *this, uintptr_t target, GF_VALIDATE_OR_GOTO(this->name, this->private, out); fop = ec_fop_data_allocate(frame, this, GF_FOP_READDIR, EC_FLAG_LOCK_SHARED, - target, minimum, ec_wind_readdir, + target, fop_flags, ec_wind_readdir, ec_manager_readdir, callback, data); if (fop == NULL) { goto out; @@ -585,7 +590,7 @@ ec_wind_readdirp(ec_t *ec, ec_fop_data_t *fop, int32_t idx) void ec_readdirp(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_readdirp_cbk_t func, void *data, fd_t *fd, + uint32_t fop_flags, fop_readdirp_cbk_t func, void *data, fd_t *fd, size_t size, off_t offset, dict_t *xdata) { ec_cbk_t callback = {.readdirp = func}; @@ -599,7 +604,7 @@ ec_readdirp(call_frame_t *frame, xlator_t *this, uintptr_t target, GF_VALIDATE_OR_GOTO(this->name, this->private, out); fop = ec_fop_data_allocate( - frame, this, GF_FOP_READDIRP, EC_FLAG_LOCK_SHARED, target, minimum, + frame, this, GF_FOP_READDIRP, EC_FLAG_LOCK_SHARED, target, fop_flags, ec_wind_readdirp, ec_manager_readdir, callback, data); if (fop == NULL) { goto out; diff --git a/xlators/cluster/ec/src/ec-dir-write.c b/xlators/cluster/ec/src/ec-dir-write.c index e24667feedc..53d27d895c3 100644 --- a/xlators/cluster/ec/src/ec-dir-write.c +++ b/xlators/cluster/ec/src/ec-dir-write.c @@ -8,9 +8,6 @@ cases as published by the Free Software Foundation. */ -#include <glusterfs/xlator.h> -#include <glusterfs/defaults.h> - #include "ec.h" #include "ec-messages.h" #include "ec-helpers.h" @@ -218,10 +215,10 @@ ec_manager_create(ec_fop_data_t *fop, int32_t state) GF_ASSERT(cbk != NULL); if (fop->cbks.create != NULL) { - fop->cbks.create(fop->req_frame, fop, fop->xl, cbk->op_ret, - cbk->op_errno, fop->fd, fop->loc[0].inode, - &cbk->iatt[0], &cbk->iatt[1], &cbk->iatt[2], - cbk->xdata); + QUORUM_CBK(fop->cbks.create, fop, fop->req_frame, fop, fop->xl, + cbk->op_ret, cbk->op_errno, fop->fd, + fop->loc[0].inode, &cbk->iatt[0], &cbk->iatt[1], + &cbk->iatt[2], cbk->xdata); } return EC_STATE_LOCK_REUSE; @@ -262,7 +259,7 @@ ec_manager_create(ec_fop_data_t *fop, int32_t state) void ec_create(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_create_cbk_t func, void *data, loc_t *loc, + uint32_t fop_flags, fop_create_cbk_t func, void *data, loc_t *loc, int32_t flags, mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { ec_cbk_t callback = {.create = func}; @@ -275,7 +272,7 @@ ec_create(call_frame_t *frame, xlator_t *this, uintptr_t target, GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); - fop = ec_fop_data_allocate(frame, this, GF_FOP_CREATE, 0, target, minimum, + fop = ec_fop_data_allocate(frame, this, GF_FOP_CREATE, 0, target, fop_flags, ec_wind_create, ec_manager_create, callback, data); if (fop == NULL) { @@ -390,9 +387,10 @@ ec_manager_link(ec_fop_data_t *fop, int32_t state) GF_ASSERT(cbk != NULL); if (fop->cbks.link != NULL) { - fop->cbks.link(fop->req_frame, fop, fop->xl, cbk->op_ret, - cbk->op_errno, fop->loc[0].inode, &cbk->iatt[0], - &cbk->iatt[1], &cbk->iatt[2], cbk->xdata); + QUORUM_CBK(fop->cbks.link, fop, fop->req_frame, fop, fop->xl, + cbk->op_ret, cbk->op_errno, fop->loc[0].inode, + &cbk->iatt[0], &cbk->iatt[1], &cbk->iatt[2], + cbk->xdata); } return EC_STATE_LOCK_REUSE; @@ -432,9 +430,9 @@ ec_manager_link(ec_fop_data_t *fop, int32_t state) } void -ec_link(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, - fop_link_cbk_t func, void *data, loc_t *oldloc, loc_t *newloc, - dict_t *xdata) +ec_link(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_link_cbk_t func, void *data, loc_t *oldloc, + loc_t *newloc, dict_t *xdata) { ec_cbk_t callback = {.link = func}; ec_fop_data_t *fop = NULL; @@ -446,7 +444,7 @@ ec_link(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); - fop = ec_fop_data_allocate(frame, this, GF_FOP_LINK, 0, target, minimum, + fop = ec_fop_data_allocate(frame, this, GF_FOP_LINK, 0, target, fop_flags, ec_wind_link, ec_manager_link, callback, data); if (fop == NULL) { goto out; @@ -569,9 +567,10 @@ ec_manager_mkdir(ec_fop_data_t *fop, int32_t state) GF_ASSERT(cbk != NULL); if (fop->cbks.mkdir != NULL) { - fop->cbks.mkdir(fop->req_frame, fop, fop->xl, cbk->op_ret, - cbk->op_errno, fop->loc[0].inode, &cbk->iatt[0], - &cbk->iatt[1], &cbk->iatt[2], cbk->xdata); + QUORUM_CBK(fop->cbks.mkdir, fop, fop->req_frame, fop, fop->xl, + cbk->op_ret, cbk->op_errno, fop->loc[0].inode, + &cbk->iatt[0], &cbk->iatt[1], &cbk->iatt[2], + cbk->xdata); } return EC_STATE_LOCK_REUSE; @@ -613,9 +612,9 @@ ec_manager_mkdir(ec_fop_data_t *fop, int32_t state) } void -ec_mkdir(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, - fop_mkdir_cbk_t func, void *data, loc_t *loc, mode_t mode, - mode_t umask, dict_t *xdata) +ec_mkdir(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_mkdir_cbk_t func, void *data, loc_t *loc, + mode_t mode, mode_t umask, dict_t *xdata) { ec_cbk_t callback = {.mkdir = func}; ec_fop_data_t *fop = NULL; @@ -627,7 +626,7 @@ ec_mkdir(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); - fop = ec_fop_data_allocate(frame, this, GF_FOP_MKDIR, 0, target, minimum, + fop = ec_fop_data_allocate(frame, this, GF_FOP_MKDIR, 0, target, fop_flags, ec_wind_mkdir, ec_manager_mkdir, callback, data); if (fop == NULL) { goto out; @@ -773,9 +772,10 @@ ec_manager_mknod(ec_fop_data_t *fop, int32_t state) GF_ASSERT(cbk != NULL); if (fop->cbks.mknod != NULL) { - fop->cbks.mknod(fop->req_frame, fop, fop->xl, cbk->op_ret, - cbk->op_errno, fop->loc[0].inode, &cbk->iatt[0], - &cbk->iatt[1], &cbk->iatt[2], cbk->xdata); + QUORUM_CBK(fop->cbks.mknod, fop, fop->req_frame, fop, fop->xl, + cbk->op_ret, cbk->op_errno, fop->loc[0].inode, + &cbk->iatt[0], &cbk->iatt[1], &cbk->iatt[2], + cbk->xdata); } return EC_STATE_LOCK_REUSE; @@ -815,9 +815,9 @@ ec_manager_mknod(ec_fop_data_t *fop, int32_t state) } void -ec_mknod(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, - fop_mknod_cbk_t func, void *data, loc_t *loc, mode_t mode, dev_t rdev, - mode_t umask, dict_t *xdata) +ec_mknod(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_mknod_cbk_t func, void *data, loc_t *loc, + mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata) { ec_cbk_t callback = {.mknod = func}; ec_fop_data_t *fop = NULL; @@ -829,7 +829,7 @@ ec_mknod(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); - fop = ec_fop_data_allocate(frame, this, GF_FOP_MKNOD, 0, target, minimum, + fop = ec_fop_data_allocate(frame, this, GF_FOP_MKNOD, 0, target, fop_flags, ec_wind_mknod, ec_manager_mknod, callback, data); if (fop == NULL) { goto out; @@ -931,10 +931,10 @@ ec_manager_rename(ec_fop_data_t *fop, int32_t state) GF_ASSERT(cbk != NULL); if (fop->cbks.rename != NULL) { - fop->cbks.rename(fop->req_frame, fop, fop->xl, cbk->op_ret, - cbk->op_errno, &cbk->iatt[0], &cbk->iatt[1], - &cbk->iatt[2], &cbk->iatt[3], &cbk->iatt[4], - cbk->xdata); + QUORUM_CBK(fop->cbks.rename, fop, fop->req_frame, fop, fop->xl, + cbk->op_ret, cbk->op_errno, &cbk->iatt[0], + &cbk->iatt[1], &cbk->iatt[2], &cbk->iatt[3], + &cbk->iatt[4], cbk->xdata); } return EC_STATE_LOCK_REUSE; @@ -975,7 +975,7 @@ ec_manager_rename(ec_fop_data_t *fop, int32_t state) void ec_rename(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_rename_cbk_t func, void *data, loc_t *oldloc, + uint32_t fop_flags, fop_rename_cbk_t func, void *data, loc_t *oldloc, loc_t *newloc, dict_t *xdata) { ec_cbk_t callback = {.rename = func}; @@ -988,7 +988,7 @@ ec_rename(call_frame_t *frame, xlator_t *this, uintptr_t target, GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); - fop = ec_fop_data_allocate(frame, this, GF_FOP_RENAME, 0, target, minimum, + fop = ec_fop_data_allocate(frame, this, GF_FOP_RENAME, 0, target, fop_flags, ec_wind_rename, ec_manager_rename, callback, data); if (fop == NULL) { @@ -1083,9 +1083,9 @@ ec_manager_rmdir(ec_fop_data_t *fop, int32_t state) GF_ASSERT(cbk != NULL); if (fop->cbks.rmdir != NULL) { - fop->cbks.rmdir(fop->req_frame, fop, fop->xl, cbk->op_ret, - cbk->op_errno, &cbk->iatt[0], &cbk->iatt[1], - cbk->xdata); + QUORUM_CBK(fop->cbks.rmdir, fop, fop->req_frame, fop, fop->xl, + cbk->op_ret, cbk->op_errno, &cbk->iatt[0], + &cbk->iatt[1], cbk->xdata); } return EC_STATE_LOCK_REUSE; @@ -1125,9 +1125,9 @@ ec_manager_rmdir(ec_fop_data_t *fop, int32_t state) } void -ec_rmdir(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, - fop_rmdir_cbk_t func, void *data, loc_t *loc, int xflags, - dict_t *xdata) +ec_rmdir(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_rmdir_cbk_t func, void *data, loc_t *loc, + int xflags, dict_t *xdata) { ec_cbk_t callback = {.rmdir = func}; ec_fop_data_t *fop = NULL; @@ -1139,7 +1139,7 @@ ec_rmdir(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); - fop = ec_fop_data_allocate(frame, this, GF_FOP_RMDIR, 0, target, minimum, + fop = ec_fop_data_allocate(frame, this, GF_FOP_RMDIR, 0, target, fop_flags, ec_wind_rmdir, ec_manager_rmdir, callback, data); if (fop == NULL) { goto out; @@ -1237,10 +1237,10 @@ ec_manager_symlink(ec_fop_data_t *fop, int32_t state) GF_ASSERT(cbk != NULL); if (fop->cbks.symlink != NULL) { - fop->cbks.symlink(fop->req_frame, fop, fop->xl, cbk->op_ret, - cbk->op_errno, fop->loc[0].inode, - &cbk->iatt[0], &cbk->iatt[1], &cbk->iatt[2], - cbk->xdata); + QUORUM_CBK(fop->cbks.symlink, fop, fop->req_frame, fop, fop->xl, + cbk->op_ret, cbk->op_errno, fop->loc[0].inode, + &cbk->iatt[0], &cbk->iatt[1], &cbk->iatt[2], + cbk->xdata); } return EC_STATE_LOCK_REUSE; @@ -1281,7 +1281,7 @@ ec_manager_symlink(ec_fop_data_t *fop, int32_t state) void ec_symlink(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_symlink_cbk_t func, void *data, + uint32_t fop_flags, fop_symlink_cbk_t func, void *data, const char *linkname, loc_t *loc, mode_t umask, dict_t *xdata) { ec_cbk_t callback = {.symlink = func}; @@ -1294,9 +1294,9 @@ ec_symlink(call_frame_t *frame, xlator_t *this, uintptr_t target, GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); - fop = ec_fop_data_allocate(frame, this, GF_FOP_SYMLINK, 0, target, minimum, - ec_wind_symlink, ec_manager_symlink, callback, - data); + fop = ec_fop_data_allocate(frame, this, GF_FOP_SYMLINK, 0, target, + fop_flags, ec_wind_symlink, ec_manager_symlink, + callback, data); if (fop == NULL) { goto out; } @@ -1392,9 +1392,9 @@ ec_manager_unlink(ec_fop_data_t *fop, int32_t state) GF_ASSERT(cbk != NULL); if (fop->cbks.unlink != NULL) { - fop->cbks.unlink(fop->req_frame, fop, fop->xl, cbk->op_ret, - cbk->op_errno, &cbk->iatt[0], &cbk->iatt[1], - cbk->xdata); + QUORUM_CBK(fop->cbks.unlink, fop, fop->req_frame, fop, fop->xl, + cbk->op_ret, cbk->op_errno, &cbk->iatt[0], + &cbk->iatt[1], cbk->xdata); } return EC_STATE_LOCK_REUSE; @@ -1435,7 +1435,7 @@ ec_manager_unlink(ec_fop_data_t *fop, int32_t state) void ec_unlink(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_unlink_cbk_t func, void *data, loc_t *loc, + uint32_t fop_flags, fop_unlink_cbk_t func, void *data, loc_t *loc, int xflags, dict_t *xdata) { ec_cbk_t callback = {.unlink = func}; @@ -1448,7 +1448,7 @@ ec_unlink(call_frame_t *frame, xlator_t *this, uintptr_t target, GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); - fop = ec_fop_data_allocate(frame, this, GF_FOP_UNLINK, 0, target, minimum, + fop = ec_fop_data_allocate(frame, this, GF_FOP_UNLINK, 0, target, fop_flags, ec_wind_unlink, ec_manager_unlink, callback, data); if (fop == NULL) { diff --git a/xlators/cluster/ec/src/ec-fops.h b/xlators/cluster/ec/src/ec-fops.h index 2abef0d17b3..07edf8a7fec 100644 --- a/xlators/cluster/ec/src/ec-fops.h +++ b/xlators/cluster/ec/src/ec-fops.h @@ -18,233 +18,237 @@ void ec_access(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_access_cbk_t func, void *data, loc_t *loc, + uint32_t fop_flags, fop_access_cbk_t func, void *data, loc_t *loc, int32_t mask, dict_t *xdata); void ec_create(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_create_cbk_t func, void *data, loc_t *loc, + uint32_t fop_flags, fop_create_cbk_t func, void *data, loc_t *loc, int32_t flags, mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata); void ec_entrylk(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_entrylk_cbk_t func, void *data, + uint32_t fop_flags, fop_entrylk_cbk_t func, void *data, const char *volume, loc_t *loc, const char *basename, entrylk_cmd cmd, entrylk_type type, dict_t *xdata); void ec_fentrylk(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_fentrylk_cbk_t func, void *data, + uint32_t fop_flags, fop_fentrylk_cbk_t func, void *data, const char *volume, fd_t *fd, const char *basename, entrylk_cmd cmd, entrylk_type type, dict_t *xdata); void -ec_flush(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, - fop_flush_cbk_t func, void *data, fd_t *fd, dict_t *xdata); +ec_flush(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_flush_cbk_t func, void *data, fd_t *fd, + dict_t *xdata); void -ec_fsync(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, - fop_fsync_cbk_t func, void *data, fd_t *fd, int32_t datasync, - dict_t *xdata); +ec_fsync(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_fsync_cbk_t func, void *data, fd_t *fd, + int32_t datasync, dict_t *xdata); void ec_fsyncdir(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_fsyncdir_cbk_t func, void *data, fd_t *fd, + uint32_t fop_flags, fop_fsyncdir_cbk_t func, void *data, fd_t *fd, int32_t datasync, dict_t *xdata); void ec_getxattr(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_getxattr_cbk_t func, void *data, loc_t *loc, + uint32_t fop_flags, fop_getxattr_cbk_t func, void *data, loc_t *loc, const char *name, dict_t *xdata); void ec_fgetxattr(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_fgetxattr_cbk_t func, void *data, fd_t *fd, + uint32_t fop_flags, fop_fgetxattr_cbk_t func, void *data, fd_t *fd, const char *name, dict_t *xdata); void -ec_heal(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, - fop_heal_cbk_t func, void *data, loc_t *loc, int32_t partial, - dict_t *xdata); +ec_heal(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_heal_cbk_t func, void *data, loc_t *loc, + int32_t partial, dict_t *xdata); void -ec_fheal(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, - fop_fheal_cbk_t func, void *data, fd_t *fd, int32_t partial, - dict_t *xdata); +ec_fheal(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_fheal_cbk_t func, void *data, fd_t *fd, + int32_t partial, dict_t *xdata); void ec_inodelk(call_frame_t *frame, xlator_t *this, gf_lkowner_t *owner, - uintptr_t target, int32_t minimum, fop_inodelk_cbk_t func, + uintptr_t target, uint32_t fop_flags, fop_inodelk_cbk_t func, void *data, const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *flock, dict_t *xdata); void ec_finodelk(call_frame_t *frame, xlator_t *this, gf_lkowner_t *owner, - uintptr_t target, int32_t minimum, fop_finodelk_cbk_t func, + uintptr_t target, uint32_t fop_flags, fop_finodelk_cbk_t func, void *data, const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata); void -ec_link(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, - fop_link_cbk_t func, void *data, loc_t *oldloc, loc_t *newloc, - dict_t *xdata); +ec_link(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_link_cbk_t func, void *data, loc_t *oldloc, + loc_t *newloc, dict_t *xdata); void -ec_lk(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, +ec_lk(call_frame_t *frame, xlator_t *this, uintptr_t target, uint32_t fop_flags, fop_lk_cbk_t func, void *data, fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata); void ec_lookup(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_lookup_cbk_t func, void *data, loc_t *loc, + uint32_t fop_flags, fop_lookup_cbk_t func, void *data, loc_t *loc, dict_t *xdata); void -ec_mkdir(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, - fop_mkdir_cbk_t func, void *data, loc_t *loc, mode_t mode, - mode_t umask, dict_t *xdata); +ec_mkdir(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_mkdir_cbk_t func, void *data, loc_t *loc, + mode_t mode, mode_t umask, dict_t *xdata); void -ec_mknod(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, - fop_mknod_cbk_t func, void *data, loc_t *loc, mode_t mode, dev_t rdev, - mode_t umask, dict_t *xdata); +ec_mknod(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_mknod_cbk_t func, void *data, loc_t *loc, + mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata); void -ec_open(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, - fop_open_cbk_t func, void *data, loc_t *loc, int32_t flags, fd_t *fd, - dict_t *xdata); +ec_open(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_open_cbk_t func, void *data, loc_t *loc, + int32_t flags, fd_t *fd, dict_t *xdata); void ec_opendir(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_opendir_cbk_t func, void *data, loc_t *loc, + uint32_t fop_flags, fop_opendir_cbk_t func, void *data, loc_t *loc, fd_t *fd, dict_t *xdata); void ec_readdir(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_readdir_cbk_t func, void *data, fd_t *fd, + uint32_t fop_flags, fop_readdir_cbk_t func, void *data, fd_t *fd, size_t size, off_t offset, dict_t *xdata); void ec_readdirp(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_readdirp_cbk_t func, void *data, fd_t *fd, + uint32_t fop_flags, fop_readdirp_cbk_t func, void *data, fd_t *fd, size_t size, off_t offset, dict_t *xdata); void ec_readlink(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_readlink_cbk_t func, void *data, loc_t *loc, + uint32_t fop_flags, fop_readlink_cbk_t func, void *data, loc_t *loc, size_t size, dict_t *xdata); void -ec_readv(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, - fop_readv_cbk_t func, void *data, fd_t *fd, size_t size, off_t offset, - uint32_t flags, dict_t *xdata); +ec_readv(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_readv_cbk_t func, void *data, fd_t *fd, + size_t size, off_t offset, uint32_t flags, dict_t *xdata); void ec_removexattr(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_removexattr_cbk_t func, void *data, + uint32_t fop_flags, fop_removexattr_cbk_t func, void *data, loc_t *loc, const char *name, dict_t *xdata); void ec_fremovexattr(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_fremovexattr_cbk_t func, void *data, + uint32_t fop_flags, fop_fremovexattr_cbk_t func, void *data, fd_t *fd, const char *name, dict_t *xdata); void ec_rename(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_rename_cbk_t func, void *data, loc_t *oldloc, + uint32_t fop_flags, fop_rename_cbk_t func, void *data, loc_t *oldloc, loc_t *newloc, dict_t *xdata); void -ec_rmdir(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, - fop_rmdir_cbk_t func, void *data, loc_t *loc, int xflags, - dict_t *xdata); +ec_rmdir(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_rmdir_cbk_t func, void *data, loc_t *loc, + int xflags, dict_t *xdata); void ec_setattr(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_setattr_cbk_t func, void *data, loc_t *loc, + uint32_t fop_flags, fop_setattr_cbk_t func, void *data, loc_t *loc, struct iatt *stbuf, int32_t valid, dict_t *xdata); void ec_fsetattr(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_fsetattr_cbk_t func, void *data, fd_t *fd, + uint32_t fop_flags, fop_fsetattr_cbk_t func, void *data, fd_t *fd, struct iatt *stbuf, int32_t valid, dict_t *xdata); void ec_setxattr(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_setxattr_cbk_t func, void *data, loc_t *loc, + uint32_t fop_flags, fop_setxattr_cbk_t func, void *data, loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata); void ec_fsetxattr(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_fsetxattr_cbk_t func, void *data, fd_t *fd, + uint32_t fop_flags, fop_fsetxattr_cbk_t func, void *data, fd_t *fd, dict_t *dict, int32_t flags, dict_t *xdata); void -ec_stat(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, - fop_stat_cbk_t func, void *data, loc_t *loc, dict_t *xdata); +ec_stat(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_stat_cbk_t func, void *data, loc_t *loc, + dict_t *xdata); void -ec_fstat(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, - fop_fstat_cbk_t func, void *data, fd_t *fd, dict_t *xdata); +ec_fstat(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_fstat_cbk_t func, void *data, fd_t *fd, + dict_t *xdata); void ec_statfs(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_statfs_cbk_t func, void *data, loc_t *loc, + uint32_t fop_flags, fop_statfs_cbk_t func, void *data, loc_t *loc, dict_t *xdata); void ec_symlink(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_symlink_cbk_t func, void *data, + uint32_t fop_flags, fop_symlink_cbk_t func, void *data, const char *linkname, loc_t *loc, mode_t umask, dict_t *xdata); void ec_fallocate(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_fallocate_cbk_t func, void *data, fd_t *fd, + uint32_t fop_flags, fop_fallocate_cbk_t func, void *data, fd_t *fd, int32_t mode, off_t offset, size_t len, dict_t *xdata); void ec_discard(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_discard_cbk_t func, void *data, fd_t *fd, + uint32_t fop_flags, fop_discard_cbk_t func, void *data, fd_t *fd, off_t offset, size_t len, dict_t *xdata); void ec_truncate(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_truncate_cbk_t func, void *data, loc_t *loc, + uint32_t fop_flags, fop_truncate_cbk_t func, void *data, loc_t *loc, off_t offset, dict_t *xdata); void ec_ftruncate(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_ftruncate_cbk_t func, void *data, fd_t *fd, + uint32_t fop_flags, fop_ftruncate_cbk_t func, void *data, fd_t *fd, off_t offset, dict_t *xdata); void ec_unlink(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_unlink_cbk_t func, void *data, loc_t *loc, + uint32_t fop_flags, fop_unlink_cbk_t func, void *data, loc_t *loc, int xflags, dict_t *xdata); void ec_writev(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_writev_cbk_t func, void *data, fd_t *fd, + uint32_t fop_flags, fop_writev_cbk_t func, void *data, fd_t *fd, struct iovec *vector, int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, dict_t *xdata); void ec_xattrop(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_xattrop_cbk_t func, void *data, loc_t *loc, + uint32_t fop_flags, fop_xattrop_cbk_t func, void *data, loc_t *loc, gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata); void ec_fxattrop(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_fxattrop_cbk_t func, void *data, fd_t *fd, + uint32_t fop_flags, fop_fxattrop_cbk_t func, void *data, fd_t *fd, gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata); void -ec_seek(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, - fop_seek_cbk_t func, void *data, fd_t *fd, off_t offset, - gf_seek_what_t what, dict_t *xdata); +ec_seek(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_seek_cbk_t func, void *data, fd_t *fd, + off_t offset, gf_seek_what_t what, dict_t *xdata); void -ec_ipc(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, - fop_ipc_cbk_t func, void *data, int32_t op, dict_t *xdata); +ec_ipc(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_ipc_cbk_t func, void *data, int32_t op, + dict_t *xdata); #endif /* __EC_FOPS_H__ */ diff --git a/xlators/cluster/ec/src/ec-galois.c b/xlators/cluster/ec/src/ec-galois.c index ee7662f52ce..6e4990c71f5 100644 --- a/xlators/cluster/ec/src/ec-galois.c +++ b/xlators/cluster/ec/src/ec-galois.c @@ -10,9 +10,6 @@ #include <string.h> -#include <glusterfs/mem-pool.h> -#include <glusterfs/list.h> - #include "ec-mem-types.h" #include "ec-gf8.h" #include "ec-helpers.h" diff --git a/xlators/cluster/ec/src/ec-generic.c b/xlators/cluster/ec/src/ec-generic.c index 175e88ac94b..884deb93669 100644 --- a/xlators/cluster/ec/src/ec-generic.c +++ b/xlators/cluster/ec/src/ec-generic.c @@ -8,8 +8,6 @@ cases as published by the Free Software Foundation. */ -#include <glusterfs/xlator.h> -#include <glusterfs/defaults.h> #include <glusterfs/byte-order.h> #include "ec.h" @@ -17,7 +15,6 @@ #include "ec-helpers.h" #include "ec-common.h" #include "ec-combine.h" -#include "ec-method.h" #include "ec-fops.h" /* FOP: flush */ @@ -150,9 +147,41 @@ ec_manager_flush(ec_fop_data_t *fop, int32_t state) } } +static int32_t +ec_validate_fd(fd_t *fd, xlator_t *xl) +{ + uint64_t iversion = 0; + uint64_t fversion = 0; + ec_inode_t *inode_ctx = NULL; + ec_fd_t *fd_ctx = NULL; + + LOCK(&fd->lock); + { + fd_ctx = __ec_fd_get(fd, xl); + if (fd_ctx) { + fversion = fd_ctx->bad_version; + } + } + UNLOCK(&fd->lock); + + LOCK(&fd->inode->lock); + { + inode_ctx = __ec_inode_get(fd->inode, xl); + if (inode_ctx) { + iversion = inode_ctx->bad_version; + } + } + UNLOCK(&fd->inode->lock); + if (fversion < iversion) { + return EBADF; + } + return 0; +} + void -ec_flush(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, - fop_flush_cbk_t func, void *data, fd_t *fd, dict_t *xdata) +ec_flush(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_flush_cbk_t func, void *data, fd_t *fd, + dict_t *xdata) { ec_cbk_t callback = {.flush = func}; ec_fop_data_t *fop = NULL; @@ -164,7 +193,17 @@ ec_flush(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); - fop = ec_fop_data_allocate(frame, this, GF_FOP_FLUSH, 0, target, minimum, + if (fd) { + error = ec_validate_fd(fd, this); + if (error) { + gf_msg(this->name, GF_LOG_ERROR, EBADF, EC_MSG_FD_BAD, + "Failing %s on %s", gf_fop_list[GF_FOP_FLUSH], + fd->inode ? uuid_utoa(fd->inode->gfid) : ""); + goto out; + } + } + + fop = ec_fop_data_allocate(frame, this, GF_FOP_FLUSH, 0, target, fop_flags, ec_wind_flush, ec_manager_flush, callback, data); if (fop == NULL) { goto out; @@ -366,9 +405,9 @@ ec_manager_fsync(ec_fop_data_t *fop, int32_t state) } void -ec_fsync(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, - fop_fsync_cbk_t func, void *data, fd_t *fd, int32_t datasync, - dict_t *xdata) +ec_fsync(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_fsync_cbk_t func, void *data, fd_t *fd, + int32_t datasync, dict_t *xdata) { ec_cbk_t callback = {.fsync = func}; ec_fop_data_t *fop = NULL; @@ -380,7 +419,17 @@ ec_fsync(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); - fop = ec_fop_data_allocate(frame, this, GF_FOP_FSYNC, 0, target, minimum, + if (fd) { + error = ec_validate_fd(fd, this); + if (error) { + gf_msg(this->name, GF_LOG_ERROR, EBADF, EC_MSG_FD_BAD, + "Failing %s on %s", gf_fop_list[GF_FOP_FSYNC], + fd->inode ? uuid_utoa(fd->inode->gfid) : ""); + goto out; + } + } + + fop = ec_fop_data_allocate(frame, this, GF_FOP_FSYNC, 0, target, fop_flags, ec_wind_fsync, ec_manager_fsync, callback, data); if (fop == NULL) { goto out; @@ -553,7 +602,7 @@ ec_manager_fsyncdir(ec_fop_data_t *fop, int32_t state) void ec_fsyncdir(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_fsyncdir_cbk_t func, void *data, fd_t *fd, + uint32_t fop_flags, fop_fsyncdir_cbk_t func, void *data, fd_t *fd, int32_t datasync, dict_t *xdata) { ec_cbk_t callback = {.fsyncdir = func}; @@ -566,9 +615,9 @@ ec_fsyncdir(call_frame_t *frame, xlator_t *this, uintptr_t target, GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); - fop = ec_fop_data_allocate(frame, this, GF_FOP_FSYNCDIR, 0, target, minimum, - ec_wind_fsyncdir, ec_manager_fsyncdir, callback, - data); + fop = ec_fop_data_allocate(frame, this, GF_FOP_FSYNCDIR, 0, target, + fop_flags, ec_wind_fsyncdir, ec_manager_fsyncdir, + callback, data); if (fop == NULL) { goto out; } @@ -848,7 +897,7 @@ ec_manager_lookup(ec_fop_data_t *fop, int32_t state) void ec_lookup(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_lookup_cbk_t func, void *data, loc_t *loc, + uint32_t fop_flags, fop_lookup_cbk_t func, void *data, loc_t *loc, dict_t *xdata) { ec_cbk_t callback = {.lookup = func}; @@ -862,7 +911,7 @@ ec_lookup(call_frame_t *frame, xlator_t *this, uintptr_t target, GF_VALIDATE_OR_GOTO(this->name, this->private, out); fop = ec_fop_data_allocate(frame, this, GF_FOP_LOOKUP, EC_FLAG_LOCK_SHARED, - target, minimum, ec_wind_lookup, + target, fop_flags, ec_wind_lookup, ec_manager_lookup, callback, data); if (fop == NULL) { goto out; @@ -1033,7 +1082,7 @@ ec_manager_statfs(ec_fop_data_t *fop, int32_t state) void ec_statfs(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_statfs_cbk_t func, void *data, loc_t *loc, + uint32_t fop_flags, fop_statfs_cbk_t func, void *data, loc_t *loc, dict_t *xdata) { ec_cbk_t callback = {.statfs = func}; @@ -1047,7 +1096,7 @@ ec_statfs(call_frame_t *frame, xlator_t *this, uintptr_t target, GF_VALIDATE_OR_GOTO(this->name, this->private, out); fop = ec_fop_data_allocate(frame, this, GF_FOP_STATFS, EC_FLAG_LOCK_SHARED, - target, minimum, ec_wind_statfs, + target, fop_flags, ec_wind_statfs, ec_manager_statfs, callback, data); if (fop == NULL) { goto out; @@ -1270,7 +1319,7 @@ ec_manager_xattrop(ec_fop_data_t *fop, int32_t state) void ec_xattrop(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_xattrop_cbk_t func, void *data, loc_t *loc, + uint32_t fop_flags, fop_xattrop_cbk_t func, void *data, loc_t *loc, gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) { ec_cbk_t callback = {.xattrop = func}; @@ -1283,9 +1332,9 @@ ec_xattrop(call_frame_t *frame, xlator_t *this, uintptr_t target, GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); - fop = ec_fop_data_allocate(frame, this, GF_FOP_XATTROP, 0, target, minimum, - ec_wind_xattrop, ec_manager_xattrop, callback, - data); + fop = ec_fop_data_allocate(frame, this, GF_FOP_XATTROP, 0, target, + fop_flags, ec_wind_xattrop, ec_manager_xattrop, + callback, data); if (fop == NULL) { goto out; } @@ -1343,7 +1392,7 @@ ec_wind_fxattrop(ec_t *ec, ec_fop_data_t *fop, int32_t idx) void ec_fxattrop(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_fxattrop_cbk_t func, void *data, fd_t *fd, + uint32_t fop_flags, fop_fxattrop_cbk_t func, void *data, fd_t *fd, gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) { ec_cbk_t callback = {.fxattrop = func}; @@ -1356,9 +1405,9 @@ ec_fxattrop(call_frame_t *frame, xlator_t *this, uintptr_t target, GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); - fop = ec_fop_data_allocate(frame, this, GF_FOP_FXATTROP, 0, target, minimum, - ec_wind_fxattrop, ec_manager_xattrop, callback, - data); + fop = ec_fop_data_allocate(frame, this, GF_FOP_FXATTROP, 0, target, + fop_flags, ec_wind_fxattrop, ec_manager_xattrop, + callback, data); if (fop == NULL) { goto out; } @@ -1507,8 +1556,9 @@ ec_manager_ipc(ec_fop_data_t *fop, int32_t state) } void -ec_ipc(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, - fop_ipc_cbk_t func, void *data, int32_t op, dict_t *xdata) +ec_ipc(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_ipc_cbk_t func, void *data, int32_t op, + dict_t *xdata) { ec_cbk_t callback = {.ipc = func}; ec_fop_data_t *fop = NULL; @@ -1520,7 +1570,7 @@ ec_ipc(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); - fop = ec_fop_data_allocate(frame, this, GF_FOP_IPC, 0, target, minimum, + fop = ec_fop_data_allocate(frame, this, GF_FOP_IPC, 0, target, fop_flags, ec_wind_ipc, ec_manager_ipc, callback, data); if (fop == NULL) { goto out; diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c index eaf80e023e3..7d991f04aac 100644 --- a/xlators/cluster/ec/src/ec-heal.c +++ b/xlators/cluster/ec/src/ec-heal.c @@ -8,7 +8,6 @@ cases as published by the Free Software Foundation. */ -#include <glusterfs/xlator.h> #include <glusterfs/defaults.h> #include <glusterfs/compat-errno.h> #include <glusterfs/byte-order.h> @@ -17,7 +16,6 @@ #include <glusterfs/cluster-syncop.h> #include "ec.h" -#include "ec-mem-types.h" #include "ec-types.h" #include "ec-messages.h" #include "ec-helpers.h" @@ -72,6 +70,7 @@ struct ec_name_data { char *name; inode_t *parent; default_args_cbk_t *replies; + uint32_t heal_pending; }; static char *ec_ignore_xattrs[] = {GF_SELINUX_XATTR_KEY, QUOTA_SIZE_KEY, NULL}; @@ -103,6 +102,48 @@ ec_sh_key_match(dict_t *dict, char *key, data_t *val, void *mdata) } /* FOP: heal */ +void +ec_set_entry_healing(ec_fop_data_t *fop) +{ + ec_inode_t *ctx = NULL; + loc_t *loc = NULL; + + if (!fop) + return; + + loc = &fop->loc[0]; + LOCK(&loc->inode->lock); + { + ctx = __ec_inode_get(loc->inode, fop->xl); + if (ctx) { + ctx->heal_count += 1; + } + } + UNLOCK(&loc->inode->lock); +} + +void +ec_reset_entry_healing(ec_fop_data_t *fop) +{ + ec_inode_t *ctx = NULL; + loc_t *loc = NULL; + int32_t heal_count = 0; + if (!fop) + return; + + loc = &fop->loc[0]; + LOCK(&loc->inode->lock); + { + ctx = __ec_inode_get(loc->inode, fop->xl); + if (ctx) { + ctx->heal_count += -1; + heal_count = ctx->heal_count; + } + } + UNLOCK(&loc->inode->lock); + GF_ASSERT(heal_count >= 0); +} + uintptr_t ec_heal_check(ec_fop_data_t *fop, uintptr_t *pgood) { @@ -325,16 +366,16 @@ ec_heal_data_block(ec_heal_t *heal) /* FOP: fheal */ void -ec_fheal(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, - fop_fheal_cbk_t func, void *data, fd_t *fd, int32_t partial, - dict_t *xdata) +ec_fheal(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_fheal_cbk_t func, void *data, fd_t *fd, + int32_t partial, dict_t *xdata) { ec_fd_t *ctx = ec_fd_get(fd, this); if (ctx != NULL) { gf_msg_trace("ec", 0, "FHEAL ctx: flags=%X, open=%" PRIXPTR, ctx->flags, ctx->open); - ec_heal(frame, this, target, minimum, func, data, &ctx->loc, partial, + ec_heal(frame, this, target, fop_flags, func, data, &ctx->loc, partial, xdata); } } @@ -954,6 +995,7 @@ ec_set_new_entry_dirty(ec_t *ec, loc_t *loc, struct iatt *ia, ret = -ENOTCONN; goto out; } + out: if (xattr) dict_unref(xattr); @@ -977,6 +1019,7 @@ ec_delete_stale_name(dict_t *gfid_db, char *key, data_t *d, void *data) int estale_count = 0; int i = 0; call_frame_t *frame = name_data->frame; + uuid_t gfid; ec = name_data->frame->this->private; EC_REPLIES_ALLOC(replies, ec->nodes); @@ -985,12 +1028,16 @@ ec_delete_stale_name(dict_t *gfid_db, char *key, data_t *d, void *data) goto out; } + loc.parent = inode_ref(name_data->parent); loc.inode = inode_new(name_data->parent->table); if (!loc.inode) { ret = -ENOMEM; goto out; } - gf_uuid_parse(key, loc.gfid); + + gf_uuid_parse(key, gfid); + gf_uuid_copy(loc.pargfid, name_data->parent->gfid); + loc.name = name_data->name; output = alloca0(ec->nodes); ret = cluster_lookup(ec->xl_list, name_data->participants, ec->nodes, replies, output, name_data->frame, ec->xl, &loc, NULL); @@ -1003,6 +1050,11 @@ ec_delete_stale_name(dict_t *gfid_db, char *key, data_t *d, void *data) estale_count++; else name_data->participants[i] = 0; + } else if (gf_uuid_compare(gfid, replies[i].stat.ia_gfid)) { + estale_count++; + gf_msg_debug(ec->xl->name, 0, "%s/%s: different gfid as %s", + uuid_utoa(name_data->parent->gfid), name_data->name, + key); } } @@ -1122,6 +1174,7 @@ ec_create_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name, dict_t *xdata = NULL; char *linkname = NULL; ec_config_t config; + /* There should be just one gfid key */ EC_REPLIES_ALLOC(replies, ec->nodes); if (gfid_db->count != 1) { @@ -1366,6 +1419,11 @@ __ec_heal_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name, ret = ec_create_name(frame, ec, parent, name, replies, gfid_db, enoent, participants); + if (ret >= 0) { + /* If ec_create_name() succeeded we return 1 to indicate that a new + * file has been created and it will need to be healed. */ + ret = 1; + } out: cluster_replies_wipe(replies, ec->nodes); loc_wipe(&loc); @@ -1443,18 +1501,22 @@ ec_name_heal_handler(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent, ret = ec_heal_name(name_data->frame, ec, parent->inode, entry->d_name, name_on); - if (ret < 0) + if (ret < 0) { memset(name_on, 0, ec->nodes); + } else { + name_data->heal_pending += ret; + } for (i = 0; i < ec->nodes; i++) if (name_data->participants[i] && !name_on[i]) name_data->failed_on[i] = 1; + return 0; } int ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode, - unsigned char *participants) + unsigned char *participants, uint32_t *pending) { int i = 0; int j = 0; @@ -1467,7 +1529,7 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode, name_data.frame = frame; name_data.participants = participants; name_data.failed_on = alloca0(ec->nodes); - ; + name_data.heal_pending = 0; for (i = 0; i < ec->nodes; i++) { if (!participants[i]) @@ -1486,6 +1548,8 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode, break; } } + *pending += name_data.heal_pending; + loc_wipe(&loc); return ret; } @@ -1493,7 +1557,7 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode, int __ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode, unsigned char *heal_on, unsigned char *sources, - unsigned char *healed_sinks) + unsigned char *healed_sinks, uint32_t *pending) { unsigned char *locked_on = NULL; unsigned char *output = NULL; @@ -1538,7 +1602,7 @@ unlock: if (sources[i] || healed_sinks[i]) participants[i] = 1; } - ret = ec_heal_names(frame, ec, inode, participants); + ret = ec_heal_names(frame, ec, inode, participants, pending); if (EC_COUNT(participants, ec->nodes) <= ec->fragments) goto out; @@ -1559,7 +1623,8 @@ out: int ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode, - unsigned char *sources, unsigned char *healed_sinks) + unsigned char *sources, unsigned char *healed_sinks, + uint32_t *pending) { unsigned char *locked_on = NULL; unsigned char *up_subvols = NULL; @@ -1590,7 +1655,7 @@ ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode, goto unlock; } ret = __ec_heal_entry(frame, ec, inode, locked_on, sources, - healed_sinks); + healed_sinks, pending); } unlock: cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame, @@ -1909,16 +1974,16 @@ ec_manager_heal_block(ec_fop_data_t *fop, int32_t state) case EC_STATE_REPORT: if (fop->cbks.heal) { - fop->cbks.heal(fop->req_frame, fop, fop->xl, 0, 0, + fop->cbks.heal(fop->req_frame, fop->data, fop->xl, 0, 0, (heal->good | heal->bad), heal->good, heal->bad, - NULL); + 0, NULL); } return EC_STATE_END; case -EC_STATE_REPORT: if (fop->cbks.heal) { - fop->cbks.heal(fop->req_frame, fop, fop->xl, -1, fop->error, 0, - 0, 0, NULL); + fop->cbks.heal(fop->req_frame, fop->data, fop->xl, -1, + fop->error, 0, 0, 0, 0, NULL); } return EC_STATE_END; @@ -1933,7 +1998,7 @@ ec_manager_heal_block(ec_fop_data_t *fop, int32_t state) /*Takes lock */ void ec_heal_block(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_heal_cbk_t func, ec_heal_t *heal) + uint32_t fop_flags, fop_heal_cbk_t func, ec_heal_t *heal) { ec_cbk_t callback = {.heal = func}; ec_fop_data_t *fop = NULL; @@ -1944,7 +2009,7 @@ ec_heal_block(call_frame_t *frame, xlator_t *this, uintptr_t target, VALIDATE_OR_GOTO(this, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); - fop = ec_fop_data_allocate(frame, this, EC_FOP_HEAL, 0, target, minimum, + fop = ec_fop_data_allocate(frame, this, EC_FOP_HEAL, 0, target, fop_flags, NULL, ec_manager_heal_block, callback, heal); if (fop == NULL) goto out; @@ -1955,19 +2020,21 @@ out: if (fop != NULL) { ec_manager(fop, error); } else { - func(frame, NULL, this, -1, error, 0, 0, 0, NULL); + func(frame, heal, this, -1, error, 0, 0, 0, 0, NULL); } } int32_t ec_heal_block_done(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, uintptr_t mask, - uintptr_t good, uintptr_t bad, dict_t *xdata) + uintptr_t good, uintptr_t bad, uint32_t pending, + dict_t *xdata) { - ec_fop_data_t *fop = cookie; - ec_heal_t *heal = fop->data; + ec_heal_t *heal = cookie; - fop->heal = NULL; + if (heal->fop) { + heal->fop->heal = NULL; + } heal->fop = NULL; heal->error = op_ret < 0 ? op_errno : 0; syncbarrier_wake(heal->data); @@ -2259,9 +2326,10 @@ ec_restore_time_and_adjust_versions(call_frame_t *frame, ec_t *ec, fd_t *fd, loc.inode = inode_ref(fd->inode); gf_uuid_copy(loc.gfid, fd->inode->gfid); - ret = cluster_setattr(ec->xl_list, healed_sinks, ec->nodes, replies, - output, frame, ec->xl, &loc, &source_buf, - GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME, NULL); + ret = cluster_setattr( + ec->xl_list, healed_sinks, ec->nodes, replies, output, frame, + ec->xl, &loc, &source_buf, + GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME | GF_SET_ATTR_CTIME, NULL); EC_INTERSECT(healed_sinks, healed_sinks, output, ec->nodes); if (EC_COUNT(healed_sinks, ec->nodes) == 0) { ret = -ENOTCONN; @@ -2429,6 +2497,58 @@ out: return ret; } +int +ec_heal_purge_stale_index(call_frame_t *frame, ec_t *ec, inode_t *inode) +{ + int i = 0; + int ret = 0; + dict_t **xattr = NULL; + loc_t loc = {0}; + uint64_t dirty_xattr[EC_VERSION_SIZE] = {0}; + unsigned char *on = NULL; + default_args_cbk_t *replies = NULL; + dict_t *dict = NULL; + + /* Allocate the required memory */ + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); + on = alloca0(ec->nodes); + EC_REPLIES_ALLOC(replies, ec->nodes); + xattr = GF_CALLOC(ec->nodes, sizeof(*xattr), gf_common_mt_pointer); + if (!xattr) { + ret = -ENOMEM; + goto out; + } + dict = dict_new(); + if (!dict) { + ret = -ENOMEM; + goto out; + } + for (i = 0; i < ec->nodes; i++) { + xattr[i] = dict; + on[i] = 1; + } + ret = dict_set_static_bin(dict, EC_XATTR_DIRTY, dirty_xattr, + (sizeof(*dirty_xattr) * EC_VERSION_SIZE)); + if (ret < 0) { + ret = -ENOMEM; + goto out; + } + PARALLEL_FOP_ONLIST(ec->xl_list, on, ec->nodes, replies, frame, + ec_wind_xattrop_parallel, &loc, GF_XATTROP_ADD_ARRAY64, + xattr, NULL); +out: + if (dict) { + dict_unref(dict); + } + if (xattr) { + GF_FREE(xattr); + } + cluster_replies_wipe(replies, ec->nodes); + loc_wipe(&loc); + return ret; +} + void ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial) { @@ -2446,6 +2566,7 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial) intptr_t mbad = 0; intptr_t good = 0; intptr_t bad = 0; + uint32_t pending = 0; ec_fop_data_t *fop = data; gf_boolean_t blocking = _gf_false; ec_heal_need_t need_heal = EC_HEAL_NONEED; @@ -2481,7 +2602,7 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial) if (loc->name && strlen(loc->name)) { ret = ec_heal_name(frame, ec, loc->parent, (char *)loc->name, participants); - if (ret == 0) { + if (ret >= 0) { gf_msg_debug(this->name, 0, "%s: name heal " "successful on %" PRIXPTR, @@ -2499,32 +2620,34 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial) /* Mount triggers heal only when it detects that it must need heal, shd * triggers heals periodically which need not be thorough*/ - ec_heal_inspect(frame, ec, loc->inode, up_subvols, _gf_false, - !ec->shd.iamshd, &need_heal); - - if (need_heal == EC_HEAL_NONEED) { - gf_msg(ec->xl->name, GF_LOG_DEBUG, 0, EC_MSG_HEAL_FAIL, - "Heal is not required for : %s ", uuid_utoa(loc->gfid)); - goto out; + if (ec->shd.iamshd && (ret <= 0)) { + ec_heal_inspect(frame, ec, loc->inode, up_subvols, _gf_false, _gf_false, + &need_heal); + + if (need_heal == EC_HEAL_PURGE_INDEX) { + gf_msg(ec->xl->name, GF_LOG_INFO, 0, EC_MSG_HEAL_FAIL, + "Index entry needs to be purged for: %s ", + uuid_utoa(loc->gfid)); + /* We need to send zero-xattrop so that stale index entry could be + * removed. We need not take lock on this entry to do so as + * xattrop on a brick is atomic. */ + ec_heal_purge_stale_index(frame, ec, loc->inode); + goto out; + } else if (need_heal == EC_HEAL_NONEED) { + gf_msg(ec->xl->name, GF_LOG_DEBUG, 0, EC_MSG_HEAL_FAIL, + "Heal is not required for : %s ", uuid_utoa(loc->gfid)); + goto out; + } } - msources = alloca0(ec->nodes); - mhealed_sinks = alloca0(ec->nodes); - ret = ec_heal_metadata(frame, ec, loc->inode, msources, mhealed_sinks); - if (ret == 0) { - mgood = ec_char_array_to_mask(msources, ec->nodes); - mbad = ec_char_array_to_mask(mhealed_sinks, ec->nodes); - } else { - op_ret = -1; - op_errno = -ret; - } sources = alloca0(ec->nodes); healed_sinks = alloca0(ec->nodes); if (IA_ISREG(loc->inode->ia_type)) { ret = ec_heal_data(frame, ec, blocking, loc->inode, sources, healed_sinks); } else if (IA_ISDIR(loc->inode->ia_type) && !partial) { - ret = ec_heal_entry(frame, ec, loc->inode, sources, healed_sinks); + ret = ec_heal_entry(frame, ec, loc->inode, sources, healed_sinks, + &pending); } else { ret = 0; memcpy(sources, participants, ec->nodes); @@ -2538,15 +2661,27 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial) op_ret = -1; op_errno = -ret; } + msources = alloca0(ec->nodes); + mhealed_sinks = alloca0(ec->nodes); + ret = ec_heal_metadata(frame, ec, loc->inode, msources, mhealed_sinks); + if (ret == 0) { + mgood = ec_char_array_to_mask(msources, ec->nodes); + mbad = ec_char_array_to_mask(mhealed_sinks, ec->nodes); + } else { + op_ret = -1; + op_errno = -ret; + } out: + ec_reset_entry_healing(fop); if (fop->cbks.heal) { - fop->cbks.heal(fop->req_frame, fop, fop->xl, op_ret, op_errno, + fop->cbks.heal(fop->req_frame, fop->data, fop->xl, op_ret, op_errno, ec_char_array_to_mask(participants, ec->nodes), - mgood & good, mbad & bad, NULL); + mgood & good, mbad & bad, pending, NULL); } if (frame) STACK_DESTROY(frame->root); + return; } @@ -2593,8 +2728,8 @@ void ec_heal_fail(ec_t *ec, ec_fop_data_t *fop) { if (fop->cbks.heal) { - fop->cbks.heal(fop->req_frame, NULL, ec->xl, -1, fop->error, 0, 0, 0, - NULL); + fop->cbks.heal(fop->req_frame, fop->data, ec->xl, -1, fop->error, 0, 0, + 0, 0, NULL); } ec_fop_data_release(fop); } @@ -2603,13 +2738,31 @@ void ec_launch_heal(ec_t *ec, ec_fop_data_t *fop) { int ret = 0; + call_frame_t *frame = NULL; + + frame = create_frame(ec->xl, ec->xl->ctx->pool); + if (!frame) { + ret = -1; + goto out; + } + + ec_owner_set(frame, frame->root); + /*Do heal as root*/ + frame->root->uid = 0; + frame->root->gid = 0; + /*Mark the fops as internal*/ + frame->root->pid = GF_CLIENT_PID_SELF_HEALD; ret = synctask_new(ec->xl->ctx->env, ec_synctask_heal_wrap, ec_heal_done, - NULL, fop); + frame, fop); +out: if (ret < 0) { ec_fop_set_error(fop, ENOMEM); ec_heal_fail(ec, fop); } + + if (frame) + STACK_DESTROY(frame->root); } void @@ -2650,11 +2803,33 @@ ec_handle_healers_done(ec_fop_data_t *fop) ec_launch_heal(ec, heal_fop); } +gf_boolean_t +ec_is_entry_healing(ec_fop_data_t *fop) +{ + ec_inode_t *ctx = NULL; + int32_t heal_count = 0; + loc_t *loc = NULL; + + loc = &fop->loc[0]; + + LOCK(&loc->inode->lock); + { + ctx = __ec_inode_get(loc->inode, fop->xl); + if (ctx) { + heal_count = ctx->heal_count; + } + } + UNLOCK(&loc->inode->lock); + GF_ASSERT(heal_count >= 0); + return heal_count; +} + void ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop) { gf_boolean_t can_heal = _gf_true; ec_t *ec = this->private; + ec_fop_data_t *fop_rel = NULL; if (fop->req_frame == NULL) { LOCK(&ec->lock); @@ -2662,8 +2837,13 @@ ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop) if ((ec->background_heals > 0) && (ec->heal_wait_qlen + ec->background_heals) > (ec->heal_waiters + ec->healers)) { - list_add_tail(&fop->healer, &ec->heal_waiting); - ec->heal_waiters++; + if (!ec_is_entry_healing(fop)) { + list_add_tail(&fop->healer, &ec->heal_waiting); + ec->heal_waiters++; + ec_set_entry_healing(fop); + } else { + fop_rel = fop; + } fop = __ec_dequeue_heals(ec); } else { can_heal = _gf_false; @@ -2673,8 +2853,12 @@ ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop) } if (can_heal) { - if (fop) + if (fop) { + if (fop->req_frame != NULL) { + ec_set_entry_healing(fop); + } ec_launch_heal(ec, fop); + } } else { gf_msg_debug(this->name, 0, "Max number of heals are " @@ -2682,12 +2866,15 @@ ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop) ec_fop_set_error(fop, EBUSY); ec_heal_fail(ec, fop); } + if (fop_rel) { + ec_heal_done(0, NULL, fop_rel); + } } void -ec_heal(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, - fop_heal_cbk_t func, void *data, loc_t *loc, int32_t partial, - dict_t *xdata) +ec_heal(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_heal_cbk_t func, void *data, loc_t *loc, + int32_t partial, dict_t *xdata) { ec_cbk_t callback = {.heal = func}; ec_fop_data_t *fop = NULL; @@ -2703,7 +2890,7 @@ ec_heal(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, if (frame && frame->local) goto fail; - fop = ec_fop_data_allocate(frame, this, EC_FOP_HEAL, 0, target, minimum, + fop = ec_fop_data_allocate(frame, this, EC_FOP_HEAL, 0, target, fop_flags, NULL, NULL, callback, data); err = ENOMEM; @@ -2729,15 +2916,27 @@ fail: if (fop) ec_fop_data_release(fop); if (func) - func(frame, NULL, this, -1, err, 0, 0, 0, NULL); + func(frame, data, this, -1, err, 0, 0, 0, 0, NULL); } int ec_replace_heal_done(int ret, call_frame_t *heal, void *opaque) { ec_t *ec = opaque; + gf_boolean_t last_fop = _gf_false; + if (GF_ATOMIC_DEC(ec->async_fop_count) == 0) { + LOCK(&ec->lock); + { + last_fop = __ec_is_last_fop(ec); + } + UNLOCK(&ec->lock); + } gf_msg_debug(ec->xl->name, 0, "getxattr on bricks is done ret %d", ret); + + if (last_fop) + ec_pending_fops_completed(ec); + return 0; } @@ -2777,6 +2976,10 @@ ec_replace_brick_heal_wrap(void *opaque) itable = ec->xl->itable; else goto out; + + if (xlator_is_cleanup_starting(ec->xl)) + goto out; + ret = ec_replace_heal(ec, itable->root); out: return ret; @@ -2787,14 +2990,15 @@ ec_launch_replace_heal(ec_t *ec) { int ret = -1; - if (!ec) - return ret; ret = synctask_new(ec->xl->ctx->env, ec_replace_brick_heal_wrap, ec_replace_heal_done, NULL, ec); + if (ret < 0) { gf_msg_debug(ec->xl->name, 0, "Heal failed for replace brick ret = %d", ret); + ec_replace_heal_done(-1, NULL, ec); } + return ret; } @@ -2826,7 +3030,7 @@ out: static int32_t _need_heal_calculate(ec_t *ec, uint64_t *dirty, unsigned char *sources, gf_boolean_t self_locked, int32_t lock_count, - ec_heal_need_t *need_heal) + ec_heal_need_t *need_heal, uint64_t *versions) { int i = 0; int source_count = 0; @@ -2836,11 +3040,18 @@ _need_heal_calculate(ec_t *ec, uint64_t *dirty, unsigned char *sources, *need_heal = EC_HEAL_NONEED; if (self_locked || lock_count == 0) { for (i = 0; i < ec->nodes; i++) { - if (dirty[i]) { + if (dirty[i] || (versions[i] != versions[0])) { *need_heal = EC_HEAL_MUST; goto out; } } + /* If lock count is 0, all dirty flags are 0 and all the + * versions are macthing then why are we here. It looks + * like something went wrong while removing the index entries + * after completing a successful heal or fop. In this case + * we need to remove this index entry to avoid triggering heal + * in a loop and causing lookups again and again*/ + *need_heal = EC_HEAL_PURGE_INDEX; } else { for (i = 0; i < ec->nodes; i++) { /* Since each lock can only increment the dirty @@ -2852,6 +3063,9 @@ _need_heal_calculate(ec_t *ec, uint64_t *dirty, unsigned char *sources, *need_heal = EC_HEAL_MUST; goto out; } + if (dirty[i] != dirty[0] || (versions[i] != versions[0])) { + *need_heal = EC_HEAL_MAYBE; + } } } } else { @@ -2872,7 +3086,6 @@ ec_need_metadata_heal(ec_t *ec, inode_t *inode, default_args_cbk_t *replies, unsigned char *healed_sinks = NULL; uint64_t *meta_versions = NULL; int ret = 0; - int i = 0; sources = alloca0(ec->nodes); healed_sinks = alloca0(ec->nodes); @@ -2885,15 +3098,7 @@ ec_need_metadata_heal(ec_t *ec, inode_t *inode, default_args_cbk_t *replies, } ret = _need_heal_calculate(ec, dirty, sources, self_locked, lock_count, - need_heal); - if (ret == ec->nodes && *need_heal == EC_HEAL_NONEED) { - for (i = 1; i < ec->nodes; i++) { - if (meta_versions[i] != meta_versions[0]) { - *need_heal = EC_HEAL_MUST; - goto out; - } - } - } + need_heal, meta_versions); out: return ret; } @@ -2929,7 +3134,7 @@ ec_need_data_heal(ec_t *ec, inode_t *inode, default_args_cbk_t *replies, } ret = _need_heal_calculate(ec, dirty, sources, self_locked, lock_count, - need_heal); + need_heal, data_versions); out: return ret; } @@ -2957,7 +3162,7 @@ ec_need_entry_heal(ec_t *ec, inode_t *inode, default_args_cbk_t *replies, } ret = _need_heal_calculate(ec, dirty, sources, self_locked, lock_count, - need_heal); + need_heal, data_versions); out: return ret; } @@ -3055,10 +3260,6 @@ ec_heal_inspect(call_frame_t *frame, ec_t *ec, inode_t *inode, need_heal: ret = ec_need_heal(ec, inode, replies, lock_count, self_locked, thorough, need_heal); - - if (!self_locked && *need_heal == EC_HEAL_MUST) { - *need_heal = EC_HEAL_MAYBE; - } out: cluster_replies_wipe(replies, ec->nodes); loc_wipe(&loc); @@ -3144,7 +3345,7 @@ ec_get_heal_info(xlator_t *this, loc_t *entry_loc, dict_t **dict_rsp) ret = ec_heal_inspect(frame, ec, loc.inode, up_subvols, _gf_false, _gf_false, &need_heal); - if (ret == ec->nodes && need_heal == EC_HEAL_NONEED) { + if (ret == ec->nodes && need_heal != EC_HEAL_MAYBE) { goto set_heal; } need_heal = EC_HEAL_NONEED; diff --git a/xlators/cluster/ec/src/ec-heald.c b/xlators/cluster/ec/src/ec-heald.c index cba111a3e8f..5c1586bc9c5 100644 --- a/xlators/cluster/ec/src/ec-heald.c +++ b/xlators/cluster/ec/src/ec-heald.c @@ -8,7 +8,6 @@ cases as published by the Free Software Foundation. */ -#include <glusterfs/xlator.h> #include <glusterfs/defaults.h> #include <glusterfs/compat-errno.h> #include "ec.h" @@ -63,7 +62,7 @@ __ec_shd_healer_wait(struct subvol_healer *healer) ec = healer->this->private; disabled_loop: - wait_till.tv_sec = time(NULL) + 60; + wait_till.tv_sec = gf_time() + ec->shd.timeout; while (!healer->rerun) { ret = pthread_cond_timedwait(&healer->cond, &healer->mutex, &wait_till); @@ -71,6 +70,11 @@ disabled_loop: break; } + if (ec->shutdown) { + healer->running = _gf_false; + return -1; + } + ret = healer->rerun; healer->rerun = 0; @@ -152,19 +156,78 @@ ec_shd_index_purge(xlator_t *subvol, inode_t *inode, char *name) return ret; } +static gf_boolean_t +ec_is_heal_completed(char *status) +{ + char *bad_pos = NULL; + char *zero_pos = NULL; + + if (!status) { + return _gf_false; + } + + /*Logic: + * Status will be of the form Good: <binary>, Bad: <binary> + * If heal completes, if we do strchr for '0' it should be present after + * 'Bad:' i.e. strRchr for ':' + * */ + + zero_pos = strchr(status, '0'); + bad_pos = strrchr(status, ':'); + if (!zero_pos || !bad_pos) { + /*malformed status*/ + return _gf_false; + } + + if (zero_pos > bad_pos) { + return _gf_true; + } + + return _gf_false; +} + int ec_shd_selfheal(struct subvol_healer *healer, int child, loc_t *loc, gf_boolean_t full) { + dict_t *xdata = NULL; + dict_t *dict = NULL; + uint32_t count; int32_t ret; + char *heal_status = NULL; + ec_t *ec = healer->this->private; + + GF_ATOMIC_INC(ec->stats.shd.attempted); + ret = syncop_getxattr(healer->this, loc, &dict, EC_XATTR_HEAL, NULL, + &xdata); + if (ret == 0) { + if (dict && (dict_get_str(dict, EC_XATTR_HEAL, &heal_status) == 0)) { + if (ec_is_heal_completed(heal_status)) { + GF_ATOMIC_INC(ec->stats.shd.completed); + } + } + } - ret = syncop_getxattr(healer->this, loc, NULL, EC_XATTR_HEAL, NULL, NULL); - if (!full && (ret >= 0) && (loc->inode->ia_type == IA_IFDIR)) { + if (!full && (loc->inode->ia_type == IA_IFDIR)) { /* If we have just healed a directory, it's possible that - * other index entries have appeared to be healed. We put a - * mark so that we can check it later and restart a scan - * without delay. */ - healer->rerun = _gf_true; + * other index entries have appeared to be healed. */ + if ((xdata != NULL) && + (dict_get_uint32(xdata, EC_XATTR_HEAL_NEW, &count) == 0) && + (count > 0)) { + /* Force a rerun of the index healer. */ + gf_msg_debug(healer->this->name, 0, "%d more entries to heal", + count); + + healer->rerun = _gf_true; + } + } + + if (xdata != NULL) { + dict_unref(xdata); + } + + if (dict) { + dict_unref(dict); } return ret; @@ -241,9 +304,11 @@ ec_shd_index_sweep(struct subvol_healer *healer) goto out; } + _mask_cancellation(); ret = syncop_mt_dir_scan(NULL, subvol, &loc, GF_CLIENT_PID_SELF_HEALD, healer, ec_shd_index_heal, xdata, ec->shd.max_threads, ec->shd.wait_qlength); + _unmask_cancellation(); out: if (xdata) dict_unref(xdata); @@ -263,6 +328,11 @@ ec_shd_full_heal(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent, int ret = 0; ec = this->private; + + if (this->cleanup_starting) { + return -ENOTCONN; + } + if (ec->xl_up_count <= ec->fragments) { return -ENOTCONN; } @@ -305,11 +375,15 @@ ec_shd_full_sweep(struct subvol_healer *healer, inode_t *inode) { ec_t *ec = NULL; loc_t loc = {0}; + int ret = -1; ec = healer->this->private; loc.inode = inode; - return syncop_ftw(ec->xl_list[healer->subvol], &loc, - GF_CLIENT_PID_SELF_HEALD, healer, ec_shd_full_heal); + _mask_cancellation(); + ret = syncop_ftw(ec->xl_list[healer->subvol], &loc, + GF_CLIENT_PID_SELF_HEALD, healer, ec_shd_full_heal); + _unmask_cancellation(); + return ret; } void * @@ -317,13 +391,16 @@ ec_shd_index_healer(void *data) { struct subvol_healer *healer = NULL; xlator_t *this = NULL; + int run = 0; healer = data; THIS = this = healer->this; ec_t *ec = this->private; for (;;) { - ec_shd_healer_wait(healer); + run = ec_shd_healer_wait(healer); + if (run == -1) + break; if (ec->xl_up_count > ec->fragments) { gf_msg_debug(this->name, 0, "starting index sweep on subvol %s", @@ -352,16 +429,12 @@ ec_shd_full_healer(void *data) rootloc.inode = this->itable->root; for (;;) { - pthread_mutex_lock(&healer->mutex); - { - run = __ec_shd_healer_wait(healer); - if (!run) - healer->running = _gf_false; - } - pthread_mutex_unlock(&healer->mutex); - - if (!run) + run = ec_shd_healer_wait(healer); + if (run < 0) { break; + } else if (run == 0) { + continue; + } if (ec->xl_up_count > ec->fragments) { gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_FULL_SWEEP_START, @@ -429,6 +502,9 @@ unlock: int ec_shd_full_healer_spawn(xlator_t *this, int subvol) { + if (xlator_is_cleanup_starting(this)) + return -1; + return ec_shd_healer_spawn(this, NTH_FULL_HEALER(this, subvol), ec_shd_full_healer); } @@ -436,6 +512,9 @@ ec_shd_full_healer_spawn(xlator_t *this, int subvol) int ec_shd_index_healer_spawn(xlator_t *this, int subvol) { + if (xlator_is_cleanup_starting(this)) + return -1; + return ec_shd_healer_spawn(this, NTH_INDEX_HEALER(this, subvol), ec_shd_index_healer); } @@ -562,3 +641,41 @@ out: dict_del(output, this->name); return ret; } + +void +ec_destroy_healer_object(xlator_t *this, struct subvol_healer *healer) +{ + if (!healer) + return; + + pthread_cond_destroy(&healer->cond); + pthread_mutex_destroy(&healer->mutex); +} + +void +ec_selfheal_daemon_fini(xlator_t *this) +{ + struct subvol_healer *healer = NULL; + ec_self_heald_t *shd = NULL; + ec_t *priv = NULL; + int i = 0; + + priv = this->private; + if (!priv) + return; + + shd = &priv->shd; + if (!shd->iamshd) + return; + + for (i = 0; i < priv->nodes; i++) { + healer = &shd->index_healers[i]; + ec_destroy_healer_object(this, healer); + + healer = &shd->full_healers[i]; + ec_destroy_healer_object(this, healer); + } + + GF_FREE(shd->index_healers); + GF_FREE(shd->full_healers); +} diff --git a/xlators/cluster/ec/src/ec-heald.h b/xlators/cluster/ec/src/ec-heald.h index 2eda2a74f54..6c7da4edc10 100644 --- a/xlators/cluster/ec/src/ec-heald.h +++ b/xlators/cluster/ec/src/ec-heald.h @@ -11,9 +11,9 @@ #ifndef __EC_HEALD_H__ #define __EC_HEALD_H__ -#include <glusterfs/xlator.h> - -#include "ec-types.h" +#include "ec-types.h" // for ec_t +#include "glusterfs/dict.h" // for dict_t +#include "glusterfs/globals.h" // for xlator_t int ec_xl_op(xlator_t *this, dict_t *input, dict_t *output); @@ -24,4 +24,7 @@ ec_selfheal_daemon_init(xlator_t *this); void ec_shd_index_healer_wake(ec_t *ec); +void +ec_selfheal_daemon_fini(xlator_t *this); + #endif /* __EC_HEALD_H__ */ diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c index e6b0359bd6f..48f54475e01 100644 --- a/xlators/cluster/ec/src/ec-helpers.c +++ b/xlators/cluster/ec/src/ec-helpers.c @@ -476,7 +476,7 @@ out: int32_t ec_loc_setup_path(xlator_t *xl, loc_t *loc) { - uuid_t root = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; + static uuid_t root = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; char *name; int32_t ret = -EINVAL; @@ -717,6 +717,7 @@ __ec_inode_get(inode_t *inode, xlator_t *xl) memset(ctx, 0, sizeof(*ctx)); INIT_LIST_HEAD(&ctx->heal); INIT_LIST_HEAD(&ctx->stripe_cache.lru); + ctx->heal_count = 0; value = (uint64_t)(uintptr_t)ctx; if (__inode_ctx_set(inode, xl, &value) != 0) { GF_FREE(ctx); @@ -752,6 +753,7 @@ __ec_fd_get(fd_t *fd, xlator_t *xl) { int i = 0; ec_fd_t *ctx = NULL; + ec_inode_t *ictx = NULL; uint64_t value = 0; ec_t *ec = xl->private; @@ -774,6 +776,12 @@ __ec_fd_get(fd_t *fd, xlator_t *xl) GF_FREE(ctx); return NULL; } + /* Only refering bad-version so no need for lock + * */ + ictx = __ec_inode_get(fd->inode, xl); + if (ictx) { + ctx->bad_version = ictx->bad_version; + } } } else { ctx = (ec_fd_t *)(uintptr_t)value; diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c index 55e59345ab0..dad5f4d7018 100644 --- a/xlators/cluster/ec/src/ec-inode-read.c +++ b/xlators/cluster/ec/src/ec-inode-read.c @@ -8,9 +8,6 @@ cases as published by the Free Software Foundation. */ -#include <glusterfs/xlator.h> -#include <glusterfs/defaults.h> - #include "ec.h" #include "ec-messages.h" #include "ec-helpers.h" @@ -135,7 +132,7 @@ ec_manager_access(ec_fop_data_t *fop, int32_t state) void ec_access(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_access_cbk_t func, void *data, loc_t *loc, + uint32_t fop_flags, fop_access_cbk_t func, void *data, loc_t *loc, int32_t mask, dict_t *xdata) { ec_cbk_t callback = {.access = func}; @@ -149,7 +146,7 @@ ec_access(call_frame_t *frame, xlator_t *this, uintptr_t target, GF_VALIDATE_OR_GOTO(this->name, this->private, out); fop = ec_fop_data_allocate(frame, this, GF_FOP_ACCESS, EC_FLAG_LOCK_SHARED, - target, minimum, ec_wind_access, + target, fop_flags, ec_wind_access, ec_manager_access, callback, data); if (fop == NULL) { goto out; @@ -393,15 +390,34 @@ ec_manager_getxattr(ec_fop_data_t *fop, int32_t state) int32_t ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl, int32_t op_ret, int32_t op_errno, uintptr_t mask, - uintptr_t good, uintptr_t bad, dict_t *xdata) + uintptr_t good, uintptr_t bad, uint32_t pending, + dict_t *xdata) { - ec_fop_data_t *fop = cookie; - fop_getxattr_cbk_t func = fop->data; + fop_getxattr_cbk_t func = cookie; ec_t *ec = xl->private; dict_t *dict = NULL; char *str; char bin1[65], bin2[65]; + /* We try to return the 'pending' information in xdata, but if this cannot + * be set, we will ignore it silently. We prefer to report the success or + * failure of the heal itself. */ + if (xdata == NULL) { + xdata = dict_new(); + } else { + dict_ref(xdata); + } + if (xdata != NULL) { + if (dict_set_uint32(xdata, EC_XATTR_HEAL_NEW, pending) != 0) { + /* dict_set_uint32() is marked as 'warn_unused_result' and gcc + * enforces to check the result in this case. However we don't + * really care if it succeeded or not. We'll just do the same. + * + * This empty 'if' avoids the warning, and it will be removed by + * the optimizer. */ + } + } + if (op_ret >= 0) { dict = dict_new(); if (dict == NULL) { @@ -435,18 +451,21 @@ ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl, } out: - func(frame, NULL, xl, op_ret, op_errno, dict, NULL); + func(frame, NULL, xl, op_ret, op_errno, dict, xdata); if (dict != NULL) { dict_unref(dict); } + if (xdata != NULL) { + dict_unref(xdata); + } return 0; } void ec_getxattr(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_getxattr_cbk_t func, void *data, loc_t *loc, + uint32_t fop_flags, fop_getxattr_cbk_t func, void *data, loc_t *loc, const char *name, dict_t *xdata) { ec_cbk_t callback = {.getxattr = func}; @@ -468,7 +487,7 @@ ec_getxattr(call_frame_t *frame, xlator_t *this, uintptr_t target, } fop = ec_fop_data_allocate( - frame, this, GF_FOP_GETXATTR, EC_FLAG_LOCK_SHARED, target, minimum, + frame, this, GF_FOP_GETXATTR, EC_FLAG_LOCK_SHARED, target, fop_flags, ec_wind_getxattr, ec_manager_getxattr, callback, data); if (fop == NULL) { goto out; @@ -588,7 +607,7 @@ ec_wind_fgetxattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx) void ec_fgetxattr(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_fgetxattr_cbk_t func, void *data, fd_t *fd, + uint32_t fop_flags, fop_fgetxattr_cbk_t func, void *data, fd_t *fd, const char *name, dict_t *xdata) { ec_cbk_t callback = {.fgetxattr = func}; @@ -602,7 +621,7 @@ ec_fgetxattr(call_frame_t *frame, xlator_t *this, uintptr_t target, GF_VALIDATE_OR_GOTO(this->name, this->private, out); fop = ec_fop_data_allocate( - frame, this, GF_FOP_FGETXATTR, EC_FLAG_LOCK_SHARED, target, minimum, + frame, this, GF_FOP_FGETXATTR, EC_FLAG_LOCK_SHARED, target, fop_flags, ec_wind_fgetxattr, ec_manager_getxattr, callback, data); if (fop == NULL) { goto out; @@ -774,13 +793,15 @@ ec_manager_open(ec_fop_data_t *fop, int32_t state) return EC_STATE_REPORT; } - err = ec_loc_from_loc(fop->xl, &ctx->loc, &fop->loc[0]); - if (err != 0) { - UNLOCK(&fop->fd->lock); + if (!ctx->loc.inode) { + err = ec_loc_from_loc(fop->xl, &ctx->loc, &fop->loc[0]); + if (err != 0) { + UNLOCK(&fop->fd->lock); - fop->error = -err; + fop->error = -err; - return EC_STATE_REPORT; + return EC_STATE_REPORT; + } } ctx->flags = fop->int32; @@ -869,9 +890,9 @@ ec_manager_open(ec_fop_data_t *fop, int32_t state) } void -ec_open(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, - fop_open_cbk_t func, void *data, loc_t *loc, int32_t flags, fd_t *fd, - dict_t *xdata) +ec_open(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_open_cbk_t func, void *data, loc_t *loc, + int32_t flags, fd_t *fd, dict_t *xdata) { ec_cbk_t callback = {.open = func}; ec_fop_data_t *fop = NULL; @@ -884,7 +905,7 @@ ec_open(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, GF_VALIDATE_OR_GOTO(this->name, this->private, out); fop = ec_fop_data_allocate(frame, this, GF_FOP_OPEN, EC_FLAG_LOCK_SHARED, - target, minimum, ec_wind_open, ec_manager_open, + target, fop_flags, ec_wind_open, ec_manager_open, callback, data); if (fop == NULL) { goto out; @@ -1071,7 +1092,7 @@ ec_manager_readlink(ec_fop_data_t *fop, int32_t state) void ec_readlink(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_readlink_cbk_t func, void *data, loc_t *loc, + uint32_t fop_flags, fop_readlink_cbk_t func, void *data, loc_t *loc, size_t size, dict_t *xdata) { ec_cbk_t callback = {.readlink = func}; @@ -1085,7 +1106,7 @@ ec_readlink(call_frame_t *frame, xlator_t *this, uintptr_t target, GF_VALIDATE_OR_GOTO(this->name, this->private, out); fop = ec_fop_data_allocate( - frame, this, GF_FOP_READLINK, EC_FLAG_LOCK_SHARED, target, minimum, + frame, this, GF_FOP_READLINK, EC_FLAG_LOCK_SHARED, target, fop_flags, ec_wind_readlink, ec_manager_readlink, callback, data); if (fop == NULL) { goto out; @@ -1331,6 +1352,7 @@ int32_t ec_manager_readv(ec_fop_data_t *fop, int32_t state) { ec_cbk_data_t *cbk; + ec_t *ec = fop->xl->private; switch (state) { case EC_STATE_INIT: @@ -1350,6 +1372,9 @@ ec_manager_readv(ec_fop_data_t *fop, int32_t state) return EC_STATE_DISPATCH; case EC_STATE_DISPATCH: + if (ec->read_mask) { + fop->mask &= ec->read_mask; + } ec_dispatch_min(fop); return EC_STATE_PREPARE_ANSWER; @@ -1417,9 +1442,9 @@ ec_manager_readv(ec_fop_data_t *fop, int32_t state) } void -ec_readv(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, - fop_readv_cbk_t func, void *data, fd_t *fd, size_t size, off_t offset, - uint32_t flags, dict_t *xdata) +ec_readv(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_readv_cbk_t func, void *data, fd_t *fd, + size_t size, off_t offset, uint32_t flags, dict_t *xdata) { ec_cbk_t callback = {.readv = func}; ec_fop_data_t *fop = NULL; @@ -1432,8 +1457,8 @@ ec_readv(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, GF_VALIDATE_OR_GOTO(this->name, this->private, out); fop = ec_fop_data_allocate(frame, this, GF_FOP_READ, EC_FLAG_LOCK_SHARED, - target, minimum, ec_wind_readv, ec_manager_readv, - callback, data); + target, fop_flags, ec_wind_readv, + ec_manager_readv, callback, data); if (fop == NULL) { goto out; } @@ -1637,9 +1662,9 @@ ec_manager_seek(ec_fop_data_t *fop, int32_t state) } void -ec_seek(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, - fop_seek_cbk_t func, void *data, fd_t *fd, off_t offset, - gf_seek_what_t what, dict_t *xdata) +ec_seek(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_seek_cbk_t func, void *data, fd_t *fd, + off_t offset, gf_seek_what_t what, dict_t *xdata) { ec_cbk_t callback = {.seek = func}; ec_fop_data_t *fop = NULL; @@ -1652,7 +1677,7 @@ ec_seek(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, GF_VALIDATE_OR_GOTO(this->name, this->private, out); fop = ec_fop_data_allocate(frame, this, GF_FOP_SEEK, EC_FLAG_LOCK_SHARED, - target, minimum, ec_wind_seek, ec_manager_seek, + target, fop_flags, ec_wind_seek, ec_manager_seek, callback, data); if (fop == NULL) { goto out; @@ -1855,8 +1880,9 @@ ec_manager_stat(ec_fop_data_t *fop, int32_t state) } void -ec_stat(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, - fop_stat_cbk_t func, void *data, loc_t *loc, dict_t *xdata) +ec_stat(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_stat_cbk_t func, void *data, loc_t *loc, + dict_t *xdata) { ec_cbk_t callback = {.stat = func}; ec_fop_data_t *fop = NULL; @@ -1869,7 +1895,7 @@ ec_stat(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, GF_VALIDATE_OR_GOTO(this->name, this->private, out); fop = ec_fop_data_allocate(frame, this, GF_FOP_STAT, EC_FLAG_LOCK_SHARED, - target, minimum, ec_wind_stat, ec_manager_stat, + target, fop_flags, ec_wind_stat, ec_manager_stat, callback, data); if (fop == NULL) { goto out; @@ -1965,8 +1991,9 @@ ec_wind_fstat(ec_t *ec, ec_fop_data_t *fop, int32_t idx) } void -ec_fstat(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, - fop_fstat_cbk_t func, void *data, fd_t *fd, dict_t *xdata) +ec_fstat(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_fstat_cbk_t func, void *data, fd_t *fd, + dict_t *xdata) { ec_cbk_t callback = {.fstat = func}; ec_fop_data_t *fop = NULL; @@ -1979,8 +2006,8 @@ ec_fstat(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, GF_VALIDATE_OR_GOTO(this->name, this->private, out); fop = ec_fop_data_allocate(frame, this, GF_FOP_FSTAT, EC_FLAG_LOCK_SHARED, - target, minimum, ec_wind_fstat, ec_manager_stat, - callback, data); + target, fop_flags, ec_wind_fstat, + ec_manager_stat, callback, data); if (fop == NULL) { goto out; } diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c index e7b34e67e10..9b5fe2a7fdc 100644 --- a/xlators/cluster/ec/src/ec-inode-write.c +++ b/xlators/cluster/ec/src/ec-inode-write.c @@ -8,10 +8,6 @@ cases as published by the Free Software Foundation. */ -#include <glusterfs/xlator.h> -#include <glusterfs/defaults.h> - -#include "ec.h" #include "ec-messages.h" #include "ec-helpers.h" #include "ec-common.h" @@ -89,6 +85,8 @@ ec_update_write(ec_fop_data_t *fop, uintptr_t mask, off_t offset, uint64_t size) goto out; } + if (fop->locks[0].lock) + ec_lock_update_good(fop->locks[0].lock, fop); vector.iov_base = iobuf->ptr; vector.iov_len = size; memset(vector.iov_base, 0, vector.iov_len); @@ -183,26 +181,26 @@ ec_xattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, switch (fop->id) { case GF_FOP_SETXATTR: if (fop->cbks.setxattr) { - fop->cbks.setxattr(frame, cookie, this, op_ret, op_errno, - xdata); + QUORUM_CBK(fop->cbks.setxattr, fop, frame, cookie, this, op_ret, + op_errno, xdata); } break; case GF_FOP_REMOVEXATTR: if (fop->cbks.removexattr) { - fop->cbks.removexattr(frame, cookie, this, op_ret, op_errno, - xdata); + QUORUM_CBK(fop->cbks.removexattr, fop, frame, cookie, this, + op_ret, op_errno, xdata); } break; case GF_FOP_FSETXATTR: if (fop->cbks.fsetxattr) { - fop->cbks.fsetxattr(frame, cookie, this, op_ret, op_errno, - xdata); + QUORUM_CBK(fop->cbks.fsetxattr, fop, frame, cookie, this, + op_ret, op_errno, xdata); } break; case GF_FOP_FREMOVEXATTR: if (fop->cbks.fremovexattr) { - fop->cbks.fremovexattr(frame, cookie, this, op_ret, op_errno, - xdata); + QUORUM_CBK(fop->cbks.fremovexattr, fop, frame, cookie, this, + op_ret, op_errno, xdata); } break; } @@ -281,7 +279,7 @@ ec_manager_xattr(ec_fop_data_t *fop, int32_t state) void ec_removexattr(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_removexattr_cbk_t func, void *data, + uint32_t fop_flags, fop_removexattr_cbk_t func, void *data, loc_t *loc, const char *name, dict_t *xdata) { ec_cbk_t callback = {.removexattr = func}; @@ -295,7 +293,7 @@ ec_removexattr(call_frame_t *frame, xlator_t *this, uintptr_t target, GF_VALIDATE_OR_GOTO(this->name, this->private, out); fop = ec_fop_data_allocate(frame, this, GF_FOP_REMOVEXATTR, 0, target, - minimum, ec_wind_removexattr, ec_manager_xattr, + fop_flags, ec_wind_removexattr, ec_manager_xattr, callback, data); if (fop == NULL) { goto out; @@ -361,7 +359,7 @@ ec_wind_fremovexattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx) void ec_fremovexattr(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_fremovexattr_cbk_t func, void *data, + uint32_t fop_flags, fop_fremovexattr_cbk_t func, void *data, fd_t *fd, const char *name, dict_t *xdata) { ec_cbk_t callback = {.fremovexattr = func}; @@ -375,8 +373,8 @@ ec_fremovexattr(call_frame_t *frame, xlator_t *this, uintptr_t target, GF_VALIDATE_OR_GOTO(this->name, this->private, out); fop = ec_fop_data_allocate(frame, this, GF_FOP_FREMOVEXATTR, 0, target, - minimum, ec_wind_fremovexattr, ec_manager_xattr, - callback, data); + fop_flags, ec_wind_fremovexattr, + ec_manager_xattr, callback, data); if (fop == NULL) { goto out; } @@ -492,16 +490,15 @@ ec_manager_setattr(ec_fop_data_t *fop, int32_t state) if (fop->id == GF_FOP_SETATTR) { if (fop->cbks.setattr != NULL) { - fop->cbks.setattr(fop->req_frame, fop, fop->xl, cbk->op_ret, - cbk->op_errno, &cbk->iatt[0], - &cbk->iatt[1], cbk->xdata); + QUORUM_CBK(fop->cbks.setattr, fop, fop->req_frame, fop, + fop->xl, cbk->op_ret, cbk->op_errno, + &cbk->iatt[0], &cbk->iatt[1], cbk->xdata); } } else { if (fop->cbks.fsetattr != NULL) { - fop->cbks.fsetattr(fop->req_frame, fop, fop->xl, - cbk->op_ret, cbk->op_errno, - &cbk->iatt[0], &cbk->iatt[1], - cbk->xdata); + QUORUM_CBK(fop->cbks.fsetattr, fop, fop->req_frame, fop, + fop->xl, cbk->op_ret, cbk->op_errno, + &cbk->iatt[0], &cbk->iatt[1], cbk->xdata); } } @@ -550,7 +547,7 @@ ec_manager_setattr(ec_fop_data_t *fop, int32_t state) void ec_setattr(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_setattr_cbk_t func, void *data, loc_t *loc, + uint32_t fop_flags, fop_setattr_cbk_t func, void *data, loc_t *loc, struct iatt *stbuf, int32_t valid, dict_t *xdata) { ec_cbk_t callback = {.setattr = func}; @@ -563,9 +560,9 @@ ec_setattr(call_frame_t *frame, xlator_t *this, uintptr_t target, GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); - fop = ec_fop_data_allocate(frame, this, GF_FOP_SETATTR, 0, target, minimum, - ec_wind_setattr, ec_manager_setattr, callback, - data); + fop = ec_fop_data_allocate(frame, this, GF_FOP_SETATTR, 0, target, + fop_flags, ec_wind_setattr, ec_manager_setattr, + callback, data); if (fop == NULL) { goto out; } @@ -627,7 +624,7 @@ ec_wind_fsetattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx) void ec_fsetattr(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_fsetattr_cbk_t func, void *data, fd_t *fd, + uint32_t fop_flags, fop_fsetattr_cbk_t func, void *data, fd_t *fd, struct iatt *stbuf, int32_t valid, dict_t *xdata) { ec_cbk_t callback = {.fsetattr = func}; @@ -640,9 +637,9 @@ ec_fsetattr(call_frame_t *frame, xlator_t *this, uintptr_t target, GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); - fop = ec_fop_data_allocate(frame, this, GF_FOP_FSETATTR, 0, target, minimum, - ec_wind_fsetattr, ec_manager_setattr, callback, - data); + fop = ec_fop_data_allocate(frame, this, GF_FOP_FSETATTR, 0, target, + fop_flags, ec_wind_fsetattr, ec_manager_setattr, + callback, data); if (fop == NULL) { goto out; } @@ -707,7 +704,7 @@ ec_wind_setxattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx) void ec_setxattr(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_setxattr_cbk_t func, void *data, loc_t *loc, + uint32_t fop_flags, fop_setxattr_cbk_t func, void *data, loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata) { ec_cbk_t callback = {.setxattr = func}; @@ -720,9 +717,9 @@ ec_setxattr(call_frame_t *frame, xlator_t *this, uintptr_t target, GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); - fop = ec_fop_data_allocate(frame, this, GF_FOP_SETXATTR, 0, target, minimum, - ec_wind_setxattr, ec_manager_xattr, callback, - data); + fop = ec_fop_data_allocate(frame, this, GF_FOP_SETXATTR, 0, target, + fop_flags, ec_wind_setxattr, ec_manager_xattr, + callback, data); if (fop == NULL) { goto out; } @@ -825,7 +822,7 @@ ec_wind_fsetxattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx) void ec_fsetxattr(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_fsetxattr_cbk_t func, void *data, fd_t *fd, + uint32_t fop_flags, fop_fsetxattr_cbk_t func, void *data, fd_t *fd, dict_t *dict, int32_t flags, dict_t *xdata) { ec_cbk_t callback = {.fsetxattr = func}; @@ -839,7 +836,7 @@ ec_fsetxattr(call_frame_t *frame, xlator_t *this, uintptr_t target, GF_VALIDATE_OR_GOTO(this->name, this->private, out); fop = ec_fop_data_allocate(frame, this, GF_FOP_FSETXATTR, 0, target, - minimum, ec_wind_fsetxattr, ec_manager_xattr, + fop_flags, ec_wind_fsetxattr, ec_manager_xattr, callback, data); if (fop == NULL) { goto out; @@ -992,9 +989,9 @@ ec_manager_fallocate(ec_fop_data_t *fop, int32_t state) GF_ASSERT(cbk != NULL); if (fop->cbks.fallocate != NULL) { - fop->cbks.fallocate(fop->req_frame, fop, fop->xl, cbk->op_ret, - cbk->op_errno, &cbk->iatt[0], &cbk->iatt[1], - cbk->xdata); + QUORUM_CBK(fop->cbks.fallocate, fop, fop->req_frame, fop, + fop->xl, cbk->op_ret, cbk->op_errno, &cbk->iatt[0], + &cbk->iatt[1], cbk->xdata); } return EC_STATE_LOCK_REUSE; @@ -1035,7 +1032,7 @@ ec_manager_fallocate(ec_fop_data_t *fop, int32_t state) void ec_fallocate(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_fallocate_cbk_t func, void *data, fd_t *fd, + uint32_t fop_flags, fop_fallocate_cbk_t func, void *data, fd_t *fd, int32_t mode, off_t offset, size_t len, dict_t *xdata) { ec_cbk_t callback = {.fallocate = func}; @@ -1049,8 +1046,8 @@ ec_fallocate(call_frame_t *frame, xlator_t *this, uintptr_t target, GF_VALIDATE_OR_GOTO(this->name, this->private, out); fop = ec_fop_data_allocate(frame, this, GF_FOP_FALLOCATE, 0, target, - minimum, ec_wind_fallocate, ec_manager_fallocate, - callback, data); + fop_flags, ec_wind_fallocate, + ec_manager_fallocate, callback, data); if (fop == NULL) { goto out; } @@ -1209,8 +1206,8 @@ ec_manager_discard(ec_fop_data_t *fop, int32_t state) ec_dispatch_all(fop); return EC_STATE_DELAYED_START; } else { - /*Assume discard to have succeeded on mask*/ - fop->good = fop->mask; + /* Assume discard to have succeeded on all bricks */ + ec_succeed_all(fop); } /* Fall through */ @@ -1245,9 +1242,9 @@ ec_manager_discard(ec_fop_data_t *fop, int32_t state) GF_ASSERT(cbk != NULL); if (fop->cbks.discard != NULL) { - fop->cbks.discard(fop->req_frame, fop, fop->xl, cbk->op_ret, - cbk->op_errno, &cbk->iatt[0], &cbk->iatt[1], - cbk->xdata); + QUORUM_CBK(fop->cbks.discard, fop, fop->req_frame, fop, fop->xl, + cbk->op_ret, cbk->op_errno, &cbk->iatt[0], + &cbk->iatt[1], cbk->xdata); } return EC_STATE_LOCK_REUSE; @@ -1289,7 +1286,7 @@ ec_manager_discard(ec_fop_data_t *fop, int32_t state) void ec_discard(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_discard_cbk_t func, void *data, fd_t *fd, + uint32_t fop_flags, fop_discard_cbk_t func, void *data, fd_t *fd, off_t offset, size_t len, dict_t *xdata) { ec_cbk_t callback = {.discard = func}; @@ -1302,9 +1299,9 @@ ec_discard(call_frame_t *frame, xlator_t *this, uintptr_t target, GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); - fop = ec_fop_data_allocate(frame, this, GF_FOP_DISCARD, 0, target, minimum, - ec_wind_discard, ec_manager_discard, callback, - data); + fop = ec_fop_data_allocate(frame, this, GF_FOP_DISCARD, 0, target, + fop_flags, ec_wind_discard, ec_manager_discard, + callback, data); if (fop == NULL) { goto out; } @@ -1405,6 +1402,7 @@ int32_t ec_manager_truncate(ec_fop_data_t *fop, int32_t state) { ec_cbk_data_t *cbk; + off_t offset_down; switch (state) { case EC_STATE_INIT: @@ -1416,16 +1414,19 @@ ec_manager_truncate(ec_fop_data_t *fop, int32_t state) /* Fall through */ case EC_STATE_LOCK: + offset_down = fop->user_size; + ec_adjust_offset_down(fop->xl->private, &offset_down, _gf_true); + if (fop->id == GF_FOP_TRUNCATE) { ec_lock_prepare_inode( fop, &fop->loc[0], EC_UPDATE_DATA | EC_UPDATE_META | EC_QUERY_INFO, - fop->offset, EC_RANGE_FULL); + offset_down, EC_RANGE_FULL); } else { ec_lock_prepare_fd( fop, fop->fd, EC_UPDATE_DATA | EC_UPDATE_META | EC_QUERY_INFO, - fop->offset, EC_RANGE_FULL); + offset_down, EC_RANGE_FULL); } ec_lock(fop); @@ -1471,17 +1472,15 @@ ec_manager_truncate(ec_fop_data_t *fop, int32_t state) if (fop->id == GF_FOP_TRUNCATE) { if (fop->cbks.truncate != NULL) { - fop->cbks.truncate(fop->req_frame, fop, fop->xl, - cbk->op_ret, cbk->op_errno, - &cbk->iatt[0], &cbk->iatt[1], - cbk->xdata); + QUORUM_CBK(fop->cbks.truncate, fop, fop->req_frame, fop, + fop->xl, cbk->op_ret, cbk->op_errno, + &cbk->iatt[0], &cbk->iatt[1], cbk->xdata); } } else { if (fop->cbks.ftruncate != NULL) { - fop->cbks.ftruncate(fop->req_frame, fop, fop->xl, - cbk->op_ret, cbk->op_errno, - &cbk->iatt[0], &cbk->iatt[1], - cbk->xdata); + QUORUM_CBK(fop->cbks.ftruncate, fop, fop->req_frame, fop, + fop->xl, cbk->op_ret, cbk->op_errno, + &cbk->iatt[0], &cbk->iatt[1], cbk->xdata); } } @@ -1530,7 +1529,7 @@ ec_manager_truncate(ec_fop_data_t *fop, int32_t state) void ec_truncate(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_truncate_cbk_t func, void *data, loc_t *loc, + uint32_t fop_flags, fop_truncate_cbk_t func, void *data, loc_t *loc, off_t offset, dict_t *xdata) { ec_cbk_t callback = {.truncate = func}; @@ -1543,9 +1542,9 @@ ec_truncate(call_frame_t *frame, xlator_t *this, uintptr_t target, GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); - fop = ec_fop_data_allocate(frame, this, GF_FOP_TRUNCATE, 0, target, minimum, - ec_wind_truncate, ec_manager_truncate, callback, - data); + fop = ec_fop_data_allocate(frame, this, GF_FOP_TRUNCATE, 0, target, + fop_flags, ec_wind_truncate, ec_manager_truncate, + callback, data); if (fop == NULL) { goto out; } @@ -1604,7 +1603,7 @@ ec_wind_ftruncate(ec_t *ec, ec_fop_data_t *fop, int32_t idx) void ec_ftruncate(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_ftruncate_cbk_t func, void *data, fd_t *fd, + uint32_t fop_flags, fop_ftruncate_cbk_t func, void *data, fd_t *fd, off_t offset, dict_t *xdata) { ec_cbk_t callback = {.ftruncate = func}; @@ -1618,8 +1617,8 @@ ec_ftruncate(call_frame_t *frame, xlator_t *this, uintptr_t target, GF_VALIDATE_OR_GOTO(this->name, this->private, out); fop = ec_fop_data_allocate(frame, this, GF_FOP_FTRUNCATE, 0, target, - minimum, ec_wind_ftruncate, ec_manager_truncate, - callback, data); + fop_flags, ec_wind_ftruncate, + ec_manager_truncate, callback, data); if (fop == NULL) { goto out; } @@ -1973,6 +1972,23 @@ ec_get_and_merge_stripe(ec_t *ec, ec_fop_data_t *fop, ec_stripe_part_t which) return found; } +static uintptr_t +ec_get_lock_good_mask(inode_t *inode, xlator_t *xl) +{ + ec_lock_t *lock = NULL; + ec_inode_t *ictx = NULL; + LOCK(&inode->lock); + { + ictx = __ec_inode_get(inode, xl); + if (ictx) + lock = ictx->inode_lock; + } + UNLOCK(&inode->lock); + if (lock) + return lock->good_mask; + return 0; +} + void ec_writev_start(ec_fop_data_t *fop) { @@ -2009,20 +2025,29 @@ ec_writev_start(ec_fop_data_t *fop) if (err != 0) { goto failed_fd; } + tail = fop->size - fop->user_size - fop->head; if (fop->head > 0) { - found_stripe = ec_get_and_merge_stripe(ec, fop, EC_STRIPE_HEAD); - if (!found_stripe) { - if (ec_make_internal_fop_xdata(&xdata)) { - err = -ENOMEM; - goto failed_xdata; + if (current > fop->offset) { + found_stripe = ec_get_and_merge_stripe(ec, fop, EC_STRIPE_HEAD); + if (!found_stripe) { + if (ec_make_internal_fop_xdata(&xdata)) { + err = -ENOMEM; + goto failed_xdata; + } + ec_readv(fop->frame, fop->xl, + ec_get_lock_good_mask(fop->fd->inode, fop->xl), + EC_MINIMUM_MIN, ec_writev_merge_head, NULL, fd, + ec->stripe_size, fop->offset, 0, xdata); + } + } else { + memset(fop->vector[0].iov_base, 0, fop->head); + memset(fop->vector[0].iov_base + fop->size - tail, 0, tail); + if (ec->stripe_cache && (fop->size <= ec->stripe_size)) { + ec_add_stripe_in_cache(ec, fop); } - ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN, - ec_writev_merge_head, NULL, fd, ec->stripe_size, - fop->offset, 0, xdata); } } - tail = fop->size - fop->user_size - fop->head; if ((tail > 0) && ((fop->head == 0) || (fop->size > ec->stripe_size))) { /* Current locking scheme will make sure the 'current' below will * never decrease while the fop is in progress, so the checks will @@ -2035,8 +2060,10 @@ ec_writev_start(ec_fop_data_t *fop) err = -ENOMEM; goto failed_xdata; } - ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN, - ec_writev_merge_tail, NULL, fd, ec->stripe_size, + ec_readv(fop->frame, fop->xl, + ec_get_lock_good_mask(fop->fd->inode, fop->xl), + EC_MINIMUM_MIN, ec_writev_merge_tail, NULL, fd, + ec->stripe_size, fop->offset + fop->size - ec->stripe_size, 0, xdata); } } else { @@ -2211,9 +2238,9 @@ ec_manager_writev(ec_fop_data_t *fop, int32_t state) GF_ASSERT(cbk != NULL); if (fop->cbks.writev != NULL) { - fop->cbks.writev(fop->req_frame, fop, fop->xl, cbk->op_ret, - cbk->op_errno, &cbk->iatt[0], &cbk->iatt[1], - cbk->xdata); + QUORUM_CBK(fop->cbks.writev, fop, fop->req_frame, fop, fop->xl, + cbk->op_ret, cbk->op_errno, &cbk->iatt[0], + &cbk->iatt[1], cbk->xdata); } return EC_STATE_LOCK_REUSE; @@ -2262,7 +2289,7 @@ ec_manager_writev(ec_fop_data_t *fop, int32_t state) void ec_writev(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_writev_cbk_t func, void *data, fd_t *fd, + uint32_t fop_flags, fop_writev_cbk_t func, void *data, fd_t *fd, struct iovec *vector, int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, dict_t *xdata) { @@ -2276,7 +2303,7 @@ ec_writev(call_frame_t *frame, xlator_t *this, uintptr_t target, GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); - fop = ec_fop_data_allocate(frame, this, GF_FOP_WRITE, 0, target, minimum, + fop = ec_fop_data_allocate(frame, this, GF_FOP_WRITE, 0, target, fop_flags, ec_wind_writev, ec_manager_writev, callback, data); if (fop == NULL) { diff --git a/xlators/cluster/ec/src/ec-locks.c b/xlators/cluster/ec/src/ec-locks.c index f978af0ac67..601960d6154 100644 --- a/xlators/cluster/ec/src/ec-locks.c +++ b/xlators/cluster/ec/src/ec-locks.c @@ -8,13 +8,9 @@ cases as published by the Free Software Foundation. */ -#include <glusterfs/xlator.h> -#include <glusterfs/defaults.h> - #include "ec-helpers.h" #include "ec-common.h" #include "ec-combine.h" -#include "ec-method.h" #include "ec-fops.h" #include "ec-messages.h" @@ -28,9 +24,36 @@ ec_lock_check(ec_fop_data_t *fop, uintptr_t *mask) ec_t *ec = fop->xl->private; ec_cbk_data_t *ans = NULL; ec_cbk_data_t *cbk = NULL; - uintptr_t locked = 0, notlocked = 0; + uintptr_t locked = 0; + int32_t good = 0; + int32_t eagain = 0; + int32_t estale = 0; int32_t error = -1; + /* There are some errors that we'll handle in an special way while trying + * to acquire a lock. + * + * EAGAIN: If it's found during a parallel non-blocking lock request, we + * consider that there's contention on the inode, so we consider + * the acquisition a failure and try again with a sequential + * blocking lock request. This will ensure that we get a lock on + * as many bricks as possible (ignoring EAGAIN here would cause + * unnecessary triggers of self-healing). + * + * If it's found during a sequential blocking lock request, it's + * considered an error. Lock will only succeed if there are + * enough other bricks locked. + * + * ESTALE: This can appear during parallel or sequential lock request if + * the inode has just been unlinked. We consider this error is + * not recoverable, but we also don't consider it as fatal. So, + * if it happens during parallel lock, we won't attempt a + * sequential one unless there are EAGAIN errors on other + * bricks (and are enough to form a quorum), but if we reach + * quorum counting the ESTALE bricks, we consider the whole + * result of the operation is ESTALE instead of EIO. + */ + list_for_each_entry(ans, &fop->cbk_list, list) { if (ans->op_ret >= 0) { @@ -38,24 +61,23 @@ ec_lock_check(ec_fop_data_t *fop, uintptr_t *mask) error = EIO; } locked |= ans->mask; + good = ans->count; cbk = ans; - } else { - if (ans->op_errno == EAGAIN) { - switch (fop->uint32) { - case EC_LOCK_MODE_NONE: - case EC_LOCK_MODE_ALL: - /* Goal is to treat non-blocking lock as failure - * even if there is a single EAGAIN*/ - notlocked |= ans->mask; - break; - } - } + } else if (ans->op_errno == ESTALE) { + estale += ans->count; + } else if ((ans->op_errno == EAGAIN) && + (fop->uint32 != EC_LOCK_MODE_INC)) { + eagain += ans->count; } } if (error == -1) { - if (gf_bits_count(locked | notlocked) >= ec->fragments) { - if (notlocked == 0) { + /* If we have enough quorum with succeeded and EAGAIN answers, we + * ignore for now any ESTALE answer. If there are EAGAIN answers, + * we retry with a sequential blocking lock request if needed. + * Otherwise we succeed. */ + if ((good + eagain) >= ec->fragments) { + if (eagain == 0) { if (fop->answer == NULL) { fop->answer = cbk; } @@ -68,21 +90,28 @@ ec_lock_check(ec_fop_data_t *fop, uintptr_t *mask) case EC_LOCK_MODE_NONE: error = EAGAIN; break; - case EC_LOCK_MODE_ALL: fop->uint32 = EC_LOCK_MODE_INC; break; - default: + /* This shouldn't happen because eagain cannot be > 0 + * when fop->uint32 is EC_LOCK_MODE_INC. */ error = EIO; break; } } } else { - if (fop->answer && fop->answer->op_ret < 0) + /* We have been unable to find enough candidates that will be able + * to take the lock. If we have quorum on some answer, we return + * it. Otherwise we check if ESTALE answers allow us to reach + * quorum. If so, we return ESTALE. */ + if (fop->answer && fop->answer->op_ret < 0) { error = fop->answer->op_errno; - else + } else if ((good + eagain + estale) >= ec->fragments) { + error = ESTALE; + } else { error = EIO; + } } } @@ -275,7 +304,7 @@ ec_manager_entrylk(ec_fop_data_t *fop, int32_t state) void ec_entrylk(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_entrylk_cbk_t func, void *data, + uint32_t fop_flags, fop_entrylk_cbk_t func, void *data, const char *volume, loc_t *loc, const char *basename, entrylk_cmd cmd, entrylk_type type, dict_t *xdata) { @@ -288,9 +317,9 @@ ec_entrylk(call_frame_t *frame, xlator_t *this, uintptr_t target, GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); - fop = ec_fop_data_allocate(frame, this, GF_FOP_ENTRYLK, 0, target, minimum, - ec_wind_entrylk, ec_manager_entrylk, callback, - data); + fop = ec_fop_data_allocate(frame, this, GF_FOP_ENTRYLK, 0, target, + fop_flags, ec_wind_entrylk, ec_manager_entrylk, + callback, data); if (fop == NULL) { goto out; } @@ -403,7 +432,7 @@ ec_wind_fentrylk(ec_t *ec, ec_fop_data_t *fop, int32_t idx) void ec_fentrylk(call_frame_t *frame, xlator_t *this, uintptr_t target, - int32_t minimum, fop_fentrylk_cbk_t func, void *data, + uint32_t fop_flags, fop_fentrylk_cbk_t func, void *data, const char *volume, fd_t *fd, const char *basename, entrylk_cmd cmd, entrylk_type type, dict_t *xdata) { @@ -416,9 +445,9 @@ ec_fentrylk(call_frame_t *frame, xlator_t *this, uintptr_t target, GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); - fop = ec_fop_data_allocate(frame, this, GF_FOP_FENTRYLK, 0, target, minimum, - ec_wind_fentrylk, ec_manager_entrylk, callback, - data); + fop = ec_fop_data_allocate(frame, this, GF_FOP_FENTRYLK, 0, target, + fop_flags, ec_wind_fentrylk, ec_manager_entrylk, + callback, data); if (fop == NULL) { goto out; } @@ -650,7 +679,7 @@ ec_manager_inodelk(ec_fop_data_t *fop, int32_t state) void ec_inodelk(call_frame_t *frame, xlator_t *this, gf_lkowner_t *owner, - uintptr_t target, int32_t minimum, fop_inodelk_cbk_t func, + uintptr_t target, uint32_t fop_flags, fop_inodelk_cbk_t func, void *data, const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *flock, dict_t *xdata) { @@ -664,9 +693,9 @@ ec_inodelk(call_frame_t *frame, xlator_t *this, gf_lkowner_t *owner, GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); - fop = ec_fop_data_allocate(frame, this, GF_FOP_INODELK, 0, target, minimum, - ec_wind_inodelk, ec_manager_inodelk, callback, - data); + fop = ec_fop_data_allocate(frame, this, GF_FOP_INODELK, 0, target, + fop_flags, ec_wind_inodelk, ec_manager_inodelk, + callback, data); if (fop == NULL) { goto out; } @@ -782,7 +811,7 @@ ec_wind_finodelk(ec_t *ec, ec_fop_data_t *fop, int32_t idx) void ec_finodelk(call_frame_t *frame, xlator_t *this, gf_lkowner_t *owner, - uintptr_t target, int32_t minimum, fop_finodelk_cbk_t func, + uintptr_t target, uint32_t fop_flags, fop_finodelk_cbk_t func, void *data, const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata) { @@ -796,9 +825,9 @@ ec_finodelk(call_frame_t *frame, xlator_t *this, gf_lkowner_t *owner, GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); - fop = ec_fop_data_allocate(frame, this, GF_FOP_FINODELK, 0, target, minimum, - ec_wind_finodelk, ec_manager_inodelk, callback, - data); + fop = ec_fop_data_allocate(frame, this, GF_FOP_FINODELK, 0, target, + fop_flags, ec_wind_finodelk, ec_manager_inodelk, + callback, data); if (fop == NULL) { goto out; } @@ -1032,7 +1061,7 @@ ec_manager_lk(ec_fop_data_t *fop, int32_t state) } void -ec_lk(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, +ec_lk(call_frame_t *frame, xlator_t *this, uintptr_t target, uint32_t fop_flags, fop_lk_cbk_t func, void *data, fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata) { @@ -1045,7 +1074,7 @@ ec_lk(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum, GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); - fop = ec_fop_data_allocate(frame, this, GF_FOP_LK, 0, target, minimum, + fop = ec_fop_data_allocate(frame, this, GF_FOP_LK, 0, target, fop_flags, ec_wind_lk, ec_manager_lk, callback, data); if (fop == NULL) { goto out; diff --git a/xlators/cluster/ec/src/ec-messages.h b/xlators/cluster/ec/src/ec-messages.h index 7c2880851a8..72e98f11286 100644 --- a/xlators/cluster/ec/src/ec-messages.h +++ b/xlators/cluster/ec/src/ec-messages.h @@ -55,6 +55,7 @@ GLFS_MSGID(EC, EC_MSG_INVALID_CONFIG, EC_MSG_HEAL_FAIL, EC_MSG_CONFIG_XATTR_INVALID, EC_MSG_EXTENSION, EC_MSG_EXTENSION_NONE, EC_MSG_EXTENSION_UNKNOWN, EC_MSG_EXTENSION_UNSUPPORTED, EC_MSG_EXTENSION_FAILED, EC_MSG_NO_GF, EC_MSG_MATRIX_FAILED, - EC_MSG_DYN_CREATE_FAILED, EC_MSG_DYN_CODEGEN_FAILED); + EC_MSG_DYN_CREATE_FAILED, EC_MSG_DYN_CODEGEN_FAILED, + EC_MSG_THREAD_CLEANUP_FAILED, EC_MSG_FD_BAD); #endif /* !_EC_MESSAGES_H_ */ diff --git a/xlators/cluster/ec/src/ec-method.h b/xlators/cluster/ec/src/ec-method.h index 2489fc84226..f91233b2f88 100644 --- a/xlators/cluster/ec/src/ec-method.h +++ b/xlators/cluster/ec/src/ec-method.h @@ -11,8 +11,6 @@ #ifndef __EC_METHOD_H__ #define __EC_METHOD_H__ -#include <glusterfs/xlator.h> - #include "ec-types.h" #include "ec-galois.h" diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h index f3d63ca09ce..de9b89bb2c9 100644 --- a/xlators/cluster/ec/src/ec-types.h +++ b/xlators/cluster/ec/src/ec-types.h @@ -130,7 +130,12 @@ typedef void (*ec_resume_f)(ec_fop_data_t *, int32_t); enum _ec_read_policy { EC_ROUND_ROBIN, EC_GFID_HASH, EC_READ_POLICY_MAX }; -enum _ec_heal_need { EC_HEAL_NONEED, EC_HEAL_MAYBE, EC_HEAL_MUST }; +enum _ec_heal_need { + EC_HEAL_NONEED, + EC_HEAL_MAYBE, + EC_HEAL_MUST, + EC_HEAL_PURGE_INDEX +}; enum _ec_stripe_part { EC_STRIPE_HEAD, EC_STRIPE_TAIL }; @@ -150,6 +155,7 @@ struct _ec_fd { loc_t loc; uintptr_t open; int32_t flags; + uint64_t bad_version; ec_fd_status_t fd_status[0]; }; @@ -171,6 +177,7 @@ struct _ec_inode { gf_boolean_t have_config; gf_boolean_t have_version; gf_boolean_t have_size; + int32_t heal_count; ec_config_t config; uint64_t pre_version[2]; uint64_t post_version[2]; @@ -179,14 +186,15 @@ struct _ec_inode { uint64_t dirty[2]; struct list_head heal; ec_stripe_list_t stripe_cache; + uint64_t bad_version; }; typedef int32_t (*fop_heal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t, int32_t, uintptr_t, uintptr_t, uintptr_t, - dict_t *); + uint32_t, dict_t *); typedef int32_t (*fop_fheal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t, int32_t, uintptr_t, uintptr_t, uintptr_t, - dict_t *); + uint32_t, dict_t *); union _ec_cbk { fop_access_cbk_t access; @@ -264,6 +272,7 @@ struct _ec_lock { uint32_t refs_pending; /* Refs assigned to fops being prepared */ uint32_t waiting_flags; /*Track xattrop/dirty marking*/ gf_boolean_t acquired; + gf_boolean_t contention; gf_boolean_t unlock_now; gf_boolean_t release; gf_boolean_t query; @@ -307,9 +316,9 @@ struct _ec_fop_data { int32_t id; /* ID of the file operation */ int32_t refs; int32_t state; - int32_t minimum; /* Minimum number of successful - operation required to conclude a - fop as successful */ + uint32_t minimum; /* Minimum number of successful + operation required to conclude a + fop as successful */ int32_t expected; int32_t winds; int32_t jobs; @@ -324,11 +333,12 @@ struct _ec_fop_data { ec_cbk_data_t *answer; /* accepted answer */ int32_t lock_count; int32_t locked; + gf_lock_t lock; ec_lock_link_t locks[2]; int32_t first_lock; - gf_lock_t lock; - uint32_t flags; + uint32_t fop_flags; /* Flags passed by the caller. */ + uint32_t flags; /* Internal flags. */ uint32_t first; uintptr_t mask; uintptr_t healing; /*Dispatch is done but call is successful only @@ -616,6 +626,11 @@ struct _ec_statistics { requests. (Basically memory allocation errors). */ } stripe_cache; + struct { + gf_atomic_t attempted; /*Number of heals attempted on + files/directories*/ + gf_atomic_t completed; /*Number of heals complted on files/directories*/ + } shd; }; struct _ec { @@ -641,6 +656,8 @@ struct _ec { uintptr_t xl_notify; /* Bit flag representing notification for bricks. */ uintptr_t node_mask; + uintptr_t read_mask; /*Stores user defined read-mask*/ + gf_atomic_t async_fop_count; /* Number of on going asynchronous fops. */ xlator_t **xl_list; gf_lock_t lock; gf_timer_t *timer; @@ -650,6 +667,7 @@ struct _ec { gf_boolean_t optimistic_changelog; gf_boolean_t parallel_writes; uint32_t stripe_cache; + uint32_t quorum_count; uint32_t background_heals; uint32_t heal_wait_qlen; uint32_t self_heal_window_size; /* max size of read/writes */ diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c index 13ffeb96012..7344be4968d 100644 --- a/xlators/cluster/ec/src/ec.c +++ b/xlators/cluster/ec/src/ec.c @@ -285,6 +285,7 @@ reconfigure(xlator_t *this, dict_t *options) GF_OPTION_RECONF("parallel-writes", ec->parallel_writes, options, bool, failed); GF_OPTION_RECONF("stripe-cache", ec->stripe_cache, options, uint32, failed); + GF_OPTION_RECONF("quorum-count", ec->quorum_count, options, uint32, failed); ret = 0; if (ec_assign_read_policy(ec, read_policy)) { ret = -1; @@ -324,13 +325,18 @@ ec_get_event_from_state(ec_t *ec) void ec_up(xlator_t *this, ec_t *ec) { + char str1[32], str2[32]; + if (ec->timer != NULL) { gf_timer_call_cancel(this->ctx, ec->timer); ec->timer = NULL; } ec->up = 1; - gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_EC_UP, "Going UP"); + gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_EC_UP, + "Going UP : Child UP = %s Child Notify = %s", + ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes), + ec_bin(str2, sizeof(str2), ec->xl_notify, ec->nodes)); gf_event(EVENT_EC_MIN_BRICKS_UP, "subvol=%s", this->name); } @@ -338,13 +344,18 @@ ec_up(xlator_t *this, ec_t *ec) void ec_down(xlator_t *this, ec_t *ec) { + char str1[32], str2[32]; + if (ec->timer != NULL) { gf_timer_call_cancel(this->ctx, ec->timer); ec->timer = NULL; } ec->up = 0; - gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_EC_DOWN, "Going DOWN"); + gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_EC_DOWN, + "Going DOWN : Child UP = %s Child Notify = %s", + ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes), + ec_bin(str2, sizeof(str2), ec->xl_notify, ec->nodes)); gf_event(EVENT_EC_MIN_BRICKS_NOT_UP, "subvol=%s", this->name); } @@ -355,6 +366,7 @@ ec_notify_cbk(void *data) ec_t *ec = data; glusterfs_event_t event = GF_EVENT_MAXVAL; gf_boolean_t propagate = _gf_false; + gf_boolean_t launch_heal = _gf_false; LOCK(&ec->lock); { @@ -384,6 +396,11 @@ ec_notify_cbk(void *data) * still bricks DOWN, they will be healed when they * come up. */ ec_up(ec->xl, ec); + + if (ec->shd.iamshd && !ec->shutdown) { + launch_heal = _gf_true; + GF_ATOMIC_INC(ec->async_fop_count); + } } propagate = _gf_true; @@ -391,13 +408,12 @@ ec_notify_cbk(void *data) unlock: UNLOCK(&ec->lock); + if (launch_heal) { + /* We have just brought the volume UP, so we trigger + * a self-heal check on the root directory. */ + ec_launch_replace_heal(ec); + } if (propagate) { - if ((event == GF_EVENT_CHILD_UP) && ec->shd.iamshd) { - /* We have just brought the volume UP, so we trigger - * a self-heal check on the root directory. */ - ec_launch_replace_heal(ec); - } - default_notify(ec->xl, event, NULL); } } @@ -425,10 +441,55 @@ ec_disable_delays(ec_t *ec) { ec->shutdown = _gf_true; - return list_empty(&ec->pending_fops); + return __ec_is_last_fop(ec); } void +ec_cleanup_healer_object(ec_t *ec) +{ + struct subvol_healer *healer = NULL; + ec_self_heald_t *shd = NULL; + void *res = NULL; + int i = 0; + gf_boolean_t is_join = _gf_false; + + shd = &ec->shd; + if (!shd->iamshd) + return; + + for (i = 0; i < ec->nodes; i++) { + healer = &shd->index_healers[i]; + pthread_mutex_lock(&healer->mutex); + { + healer->rerun = 1; + if (healer->running) { + pthread_cond_signal(&healer->cond); + is_join = _gf_true; + } + } + pthread_mutex_unlock(&healer->mutex); + if (is_join) { + pthread_join(healer->thread, &res); + is_join = _gf_false; + } + + healer = &shd->full_healers[i]; + pthread_mutex_lock(&healer->mutex); + { + healer->rerun = 1; + if (healer->running) { + pthread_cond_signal(&healer->cond); + is_join = _gf_true; + } + } + pthread_mutex_unlock(&healer->mutex); + if (is_join) { + pthread_join(healer->thread, &res); + is_join = _gf_false; + } + } +} +void ec_pending_fops_completed(ec_t *ec) { if (ec->shutdown) { @@ -441,6 +502,9 @@ ec_set_up_state(ec_t *ec, uintptr_t index_mask, uintptr_t new_state) { uintptr_t current_state = 0; + if (xlator_is_cleanup_starting(ec->xl)) + return _gf_false; + if ((ec->xl_notify & index_mask) == 0) { ec->xl_notify |= index_mask; ec->xl_notify_count++; @@ -462,6 +526,7 @@ ec_upcall(ec_t *ec, struct gf_upcall *upcall) struct gf_upcall_cache_invalidation *ci = NULL; struct gf_upcall_inodelk_contention *lc = NULL; inode_t *inode; + inode_table_t *table; switch (upcall->event_type) { case GF_UPCALL_CACHE_INVALIDATION: @@ -475,8 +540,18 @@ ec_upcall(ec_t *ec, struct gf_upcall *upcall) /* The lock is not owned by EC, ignore it. */ return _gf_true; } - inode = inode_find(((xlator_t *)ec->xl->graph->top)->itable, - upcall->gfid); + table = ((xlator_t *)ec->xl->graph->top)->itable; + if (table == NULL) { + /* Self-heal daemon doesn't have an inode table on the top + * xlator because it doesn't need it. In this case we should + * use the inode table managed by EC itself where all inodes + * being healed should be present. However self-heal doesn't + * use eager-locking and inodelk's are already released as + * soon as possible. In this case we can safely ignore these + * notifications. */ + return _gf_false; + } + inode = inode_find(table, upcall->gfid); /* If inode is not found, it means that it's already released, * so we can ignore it. Probably it has been released and * destroyed while the contention notification was being sent. @@ -544,6 +619,7 @@ ec_notify(xlator_t *this, int32_t event, void *data, void *data2) /* If there aren't pending fops running after we have waken up * them, we immediately propagate the notification. */ propagate = ec_disable_delays(ec); + ec_cleanup_healer_object(ec); goto unlock; } @@ -554,7 +630,10 @@ ec_notify(xlator_t *this, int32_t event, void *data, void *data2) if (event == GF_EVENT_CHILD_UP) { /* We need to trigger a selfheal if a brick changes * to UP state. */ - needs_shd_check = ec_set_up_state(ec, mask, mask); + if (ec_set_up_state(ec, mask, mask) && ec->shd.iamshd && + !ec->shutdown) { + needs_shd_check = _gf_true; + } } else if (event == GF_EVENT_CHILD_DOWN) { ec_set_up_state(ec, mask, 0); } @@ -584,17 +663,21 @@ ec_notify(xlator_t *this, int32_t event, void *data, void *data2) } } else { propagate = _gf_false; + needs_shd_check = _gf_false; + } + + if (needs_shd_check) { + GF_ATOMIC_INC(ec->async_fop_count); } } unlock: UNLOCK(&ec->lock); done: + if (needs_shd_check) { + ec_launch_replace_heal(ec); + } if (propagate) { - if (needs_shd_check && ec->shd.iamshd) { - ec_launch_replace_heal(ec); - } - error = default_notify(this, event, data); } @@ -627,6 +710,69 @@ ec_statistics_init(ec_t *ec) GF_ATOMIC_INIT(ec->stats.stripe_cache.evicts, 0); GF_ATOMIC_INIT(ec->stats.stripe_cache.allocs, 0); GF_ATOMIC_INIT(ec->stats.stripe_cache.errors, 0); + GF_ATOMIC_INIT(ec->stats.shd.attempted, 0); + GF_ATOMIC_INIT(ec->stats.shd.completed, 0); +} + +static int +ec_assign_read_mask(ec_t *ec, char *read_mask_str) +{ + char *mask = NULL; + char *maskptr = NULL; + char *saveptr = NULL; + char *id_str = NULL; + int id = 0; + int ret = 0; + uintptr_t read_mask = 0; + + if (!read_mask_str) { + ec->read_mask = 0; + ret = 0; + goto out; + } + + mask = gf_strdup(read_mask_str); + if (!mask) { + ret = -1; + goto out; + } + maskptr = mask; + + for (;;) { + id_str = strtok_r(maskptr, ":", &saveptr); + if (id_str == NULL) + break; + if (gf_string2int(id_str, &id)) { + gf_msg(ec->xl->name, GF_LOG_ERROR, 0, EC_MSG_XLATOR_INIT_FAIL, + "In read-mask \"%s\" id %s is not a valid integer", + read_mask_str, id_str); + ret = -1; + goto out; + } + + if ((id < 0) || (id >= ec->nodes)) { + gf_msg(ec->xl->name, GF_LOG_ERROR, 0, EC_MSG_XLATOR_INIT_FAIL, + "In read-mask \"%s\" id %d is not in range [0 - %d]", + read_mask_str, id, ec->nodes - 1); + ret = -1; + goto out; + } + read_mask |= (1UL << id); + maskptr = NULL; + } + + if (gf_bits_count(read_mask) < ec->fragments) { + gf_msg(ec->xl->name, GF_LOG_ERROR, 0, EC_MSG_XLATOR_INIT_FAIL, + "read-mask \"%s\" should contain at least %d ids", read_mask_str, + ec->fragments); + ret = -1; + goto out; + } + ec->read_mask = read_mask; + ret = 0; +out: + GF_FREE(mask); + return ret; } int32_t @@ -636,6 +782,7 @@ init(xlator_t *this) char *read_policy = NULL; char *extensions = NULL; int32_t err; + char *read_mask_str = NULL; if (this->parents == NULL) { gf_msg(this->name, GF_LOG_WARNING, 0, EC_MSG_NO_PARENTS, @@ -656,6 +803,7 @@ init(xlator_t *this) ec->xl = this; LOCK_INIT(&ec->lock); + GF_ATOMIC_INIT(ec->async_fop_count, 0); INIT_LIST_HEAD(&ec->pending_fops); INIT_LIST_HEAD(&ec->heal_waiting); INIT_LIST_HEAD(&ec->healing); @@ -714,12 +862,18 @@ init(xlator_t *this) if (ec_assign_read_policy(ec, read_policy)) goto failed; + GF_OPTION_INIT("heal-timeout", ec->shd.timeout, int32, failed); GF_OPTION_INIT("shd-max-threads", ec->shd.max_threads, uint32, failed); GF_OPTION_INIT("shd-wait-qlength", ec->shd.wait_qlength, uint32, failed); GF_OPTION_INIT("optimistic-change-log", ec->optimistic_changelog, bool, failed); GF_OPTION_INIT("parallel-writes", ec->parallel_writes, bool, failed); GF_OPTION_INIT("stripe-cache", ec->stripe_cache, uint32, failed); + GF_OPTION_INIT("quorum-count", ec->quorum_count, uint32, failed); + GF_OPTION_INIT("ec-read-mask", read_mask_str, str, failed); + + if (ec_assign_read_mask(ec, read_mask_str)) + goto failed; this->itable = inode_table_new(EC_SHD_INODE_LRU_LIMIT, this); if (!this->itable) @@ -759,6 +913,7 @@ failed: void fini(xlator_t *this) { + ec_selfheal_daemon_fini(this); __ec_destroy_private(this); } @@ -797,11 +952,12 @@ ec_gf_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, const char *basename, entrylk_cmd cmd, entrylk_type type, dict_t *xdata) { - int32_t minimum = EC_MINIMUM_ALL; + uint32_t fop_flags = EC_MINIMUM_ALL; + if (cmd == ENTRYLK_UNLOCK) - minimum = EC_MINIMUM_ONE; - ec_entrylk(frame, this, -1, minimum, default_entrylk_cbk, NULL, volume, loc, - basename, cmd, type, xdata); + fop_flags = EC_MINIMUM_ONE; + ec_entrylk(frame, this, -1, fop_flags, default_entrylk_cbk, NULL, volume, + loc, basename, cmd, type, xdata); return 0; } @@ -811,10 +967,11 @@ ec_gf_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, const char *basename, entrylk_cmd cmd, entrylk_type type, dict_t *xdata) { - int32_t minimum = EC_MINIMUM_ALL; + uint32_t fop_flags = EC_MINIMUM_ALL; + if (cmd == ENTRYLK_UNLOCK) - minimum = EC_MINIMUM_ONE; - ec_fentrylk(frame, this, -1, minimum, default_fentrylk_cbk, NULL, volume, + fop_flags = EC_MINIMUM_ONE; + ec_fentrylk(frame, this, -1, fop_flags, default_fentrylk_cbk, NULL, volume, fd, basename, cmd, type, xdata); return 0; @@ -905,7 +1062,7 @@ ec_gf_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, { int error = 0; ec_t *ec = this->private; - int32_t minimum = EC_MINIMUM_ONE; + int32_t fop_flags = EC_MINIMUM_ONE; if (name && strcmp(name, EC_XATTR_HEAL) != 0) { EC_INTERNAL_XATTR_OR_GOTO(name, NULL, error, out); @@ -920,11 +1077,11 @@ ec_gf_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, if (name && ((fnmatch(GF_XATTR_STIME_PATTERN, name, 0) == 0) || XATTR_IS_NODE_UUID(name) || XATTR_IS_NODE_UUID_LIST(name))) { - minimum = EC_MINIMUM_ALL; + fop_flags = EC_MINIMUM_ALL; } - ec_getxattr(frame, this, -1, minimum, default_getxattr_cbk, NULL, loc, name, - xdata); + ec_getxattr(frame, this, -1, fop_flags, default_getxattr_cbk, NULL, loc, + name, xdata); return 0; out: @@ -954,11 +1111,12 @@ int32_t ec_gf_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *flock, dict_t *xdata) { - int32_t minimum = EC_MINIMUM_ALL; + int32_t fop_flags = EC_MINIMUM_ALL; + if (flock->l_type == F_UNLCK) - minimum = EC_MINIMUM_ONE; + fop_flags = EC_MINIMUM_ONE; - ec_inodelk(frame, this, &frame->root->lk_owner, -1, minimum, + ec_inodelk(frame, this, &frame->root->lk_owner, -1, fop_flags, default_inodelk_cbk, NULL, volume, loc, cmd, flock, xdata); return 0; @@ -968,10 +1126,11 @@ int32_t ec_gf_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata) { - int32_t minimum = EC_MINIMUM_ALL; + int32_t fop_flags = EC_MINIMUM_ALL; + if (flock->l_type == F_UNLCK) - minimum = EC_MINIMUM_ONE; - ec_finodelk(frame, this, &frame->root->lk_owner, -1, minimum, + fop_flags = EC_MINIMUM_ONE; + ec_finodelk(frame, this, &frame->root->lk_owner, -1, fop_flags, default_finodelk_cbk, NULL, volume, fd, cmd, flock, xdata); return 0; @@ -991,10 +1150,11 @@ int32_t ec_gf_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata) { - int32_t minimum = EC_MINIMUM_ALL; + int32_t fop_flags = EC_MINIMUM_ALL; + if (flock->l_type == F_UNLCK) - minimum = EC_MINIMUM_ONE; - ec_lk(frame, this, -1, minimum, default_lk_cbk, NULL, fd, cmd, flock, + fop_flags = EC_MINIMUM_ONE; + ec_lk(frame, this, -1, fop_flags, default_lk_cbk, NULL, fd, cmd, flock, xdata); return 0; @@ -1389,6 +1549,10 @@ ec_dump_private(xlator_t *this) gf_proc_dump_write("childs_up", "%u", ec->xl_up_count); gf_proc_dump_write("childs_up_mask", "%s", ec_bin(tmp, sizeof(tmp), ec->xl_up, ec->nodes)); + if (ec->read_mask) { + gf_proc_dump_write("read-mask", "%s", + ec_bin(tmp, sizeof(tmp), ec->read_mask, ec->nodes)); + } gf_proc_dump_write("background-heals", "%d", ec->background_heals); gf_proc_dump_write("heal-wait-qlength", "%d", ec->heal_wait_qlen); gf_proc_dump_write("self-heal-window-size", "%" PRIu32, @@ -1397,6 +1561,7 @@ ec_dump_private(xlator_t *this) gf_proc_dump_write("heal-waiters", "%d", ec->heal_waiters); gf_proc_dump_write("read-policy", "%s", ec_read_policies[ec->read_policy]); gf_proc_dump_write("parallel-writes", "%d", ec->parallel_writes); + gf_proc_dump_write("quorum-count", "%u", ec->quorum_count); snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s.stats.stripe_cache", this->type, this->name); @@ -1416,6 +1581,10 @@ ec_dump_private(xlator_t *this) GF_ATOMIC_GET(ec->stats.stripe_cache.allocs)); gf_proc_dump_write("errors", "%" GF_PRI_ATOMIC, GF_ATOMIC_GET(ec->stats.stripe_cache.errors)); + gf_proc_dump_write("heals-attempted", "%" GF_PRI_ATOMIC, + GF_ATOMIC_GET(ec->stats.shd.attempted)); + gf_proc_dump_write("heals-completed", "%" GF_PRI_ATOMIC, + GF_ATOMIC_GET(ec->stats.shd.completed)); return 0; } @@ -1667,6 +1836,23 @@ struct volume_options options[] = { "lead to extra memory consumption, maximum " "(cache size * stripe size) Bytes per open file."}, { + .key = {"quorum-count"}, + .type = GF_OPTION_TYPE_INT, + .default_value = "0", + .description = + "This option can be used to define how many successes on" + "the bricks constitute a success to the application. This" + " count should be in the range" + "[disperse-data-count, disperse-count] (inclusive)", + }, + { + .key = {"ec-read-mask"}, + .type = GF_OPTION_TYPE_STR, + .default_value = NULL, + .description = "This option can be used to choose which bricks can be" + " used for reading data/metadata of a file/directory", + }, + { .key = {NULL}, }, }; diff --git a/xlators/cluster/ec/src/ec.h b/xlators/cluster/ec/src/ec.h index 1b210d9adc1..6f6de6d5981 100644 --- a/xlators/cluster/ec/src/ec.h +++ b/xlators/cluster/ec/src/ec.h @@ -18,6 +18,7 @@ #define EC_XATTR_SIZE EC_XATTR_PREFIX "size" #define EC_XATTR_VERSION EC_XATTR_PREFIX "version" #define EC_XATTR_HEAL EC_XATTR_PREFIX "heal" +#define EC_XATTR_HEAL_NEW EC_XATTR_PREFIX "heal-new" #define EC_XATTR_DIRTY EC_XATTR_PREFIX "dirty" #define EC_STRIPE_CACHE_MAX_SIZE 10 #define EC_VERSION_SIZE 2 |
