Diffstat (limited to 'xlators/cluster/afr/src')
22 files changed, 3054 insertions, 1204 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 45b96e33970..032ab5c8001 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -18,9 +18,7 @@ #include <glusterfs/glusterfs.h> #include "afr.h" #include <glusterfs/dict.h> -#include <glusterfs/xlator.h> #include <glusterfs/hashfn.h> -#include <glusterfs/logging.h> #include <glusterfs/list.h> #include <glusterfs/call-stub.h> #include <glusterfs/defaults.h> @@ -47,6 +45,56 @@ afr_quorum_errno(afr_private_t *priv) return ENOTCONN; } +gf_boolean_t +afr_is_private_directory(afr_private_t *priv, uuid_t pargfid, const char *name, + pid_t pid) +{ + if (!__is_root_gfid(pargfid)) { + return _gf_false; + } + + if (strcmp(name, GF_REPLICATE_TRASH_DIR) == 0) { + /*For backward compatibility /.landfill is private*/ + return _gf_true; + } + + if (pid == GF_CLIENT_PID_GSYNCD) { + /*geo-rep needs to create/sync private directory on slave because + * it appears in changelog*/ + return _gf_false; + } + + if (pid == GF_CLIENT_PID_GLFS_HEAL || pid == GF_CLIENT_PID_SELF_HEALD) { + if (strcmp(name, priv->anon_inode_name) == 0) { + /* anonymous-inode dir is private*/ + return _gf_true; + } + } else { + if (strncmp(name, AFR_ANON_DIR_PREFIX, strlen(AFR_ANON_DIR_PREFIX)) == + 0) { + /* anonymous-inode dir prefix is private for geo-rep to work*/ + return _gf_true; + } + } + + return _gf_false; +} + +void +afr_fill_success_replies(afr_local_t *local, afr_private_t *priv, + unsigned char *replies) +{ + int i = 0; + + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].valid && local->replies[i].op_ret == 0) { + replies[i] = 1; + } else { + replies[i] = 0; + } + } +} + int afr_fav_child_reset_sink_xattrs(void *opaque); @@ -56,6 +104,581 @@ afr_fav_child_reset_sink_xattrs_cbk(int ret, call_frame_t *frame, void *opaque); static void afr_discover_done(call_frame_t *frame, xlator_t *this); +int +afr_dom_lock_acquire_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + afr_local_t *local = frame->local; + afr_private_t *priv = this->private; + int i = (long)cookie; + + local->cont.lk.dom_lock_op_ret[i] = op_ret; + local->cont.lk.dom_lock_op_errno[i] = op_errno; + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, + "%s: Failed to acquire %s on %s", + uuid_utoa(local->fd->inode->gfid), AFR_LK_HEAL_DOM, + priv->children[i]->name); + } else { + local->cont.lk.dom_locked_nodes[i] = 1; + } + + syncbarrier_wake(&local->barrier); + + return 0; +} + +int +afr_dom_lock_acquire(call_frame_t *frame) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct gf_flock flock = { + 0, + }; + int i = 0; + + priv = frame->this->private; + local = frame->local; + local->cont.lk.dom_locked_nodes = GF_CALLOC( + priv->child_count, sizeof(*local->cont.lk.locked_nodes), + gf_afr_mt_char); + if (!local->cont.lk.dom_locked_nodes) { + return -ENOMEM; + } + local->cont.lk.dom_lock_op_ret = GF_CALLOC( + priv->child_count, sizeof(*local->cont.lk.dom_lock_op_ret), + gf_afr_mt_int32_t); + if (!local->cont.lk.dom_lock_op_ret) { + return -ENOMEM; /* CALLOC'd members are freed in afr_local_cleanup. */ + } + local->cont.lk.dom_lock_op_errno = GF_CALLOC( + priv->child_count, sizeof(*local->cont.lk.dom_lock_op_errno), + gf_afr_mt_int32_t); + if (!local->cont.lk.dom_lock_op_errno) { + return -ENOMEM; /* CALLOC'd members are freed in afr_local_cleanup. 
*/ + } + flock.l_type = F_WRLCK; + + AFR_ONALL(frame, afr_dom_lock_acquire_cbk, finodelk, AFR_LK_HEAL_DOM, + local->fd, F_SETLK, &flock, NULL); + + if (!afr_has_quorum(local->cont.lk.dom_locked_nodes, frame->this, NULL)) + goto blocking_lock; + + /*If any of the bricks returned EAGAIN, we still need blocking locks.*/ + if (AFR_COUNT(local->cont.lk.dom_locked_nodes, priv->child_count) != + priv->child_count) { + for (i = 0; i < priv->child_count; i++) { + if (local->cont.lk.dom_lock_op_ret[i] == -1 && + local->cont.lk.dom_lock_op_errno[i] == EAGAIN) + goto blocking_lock; + } + } + + return 0; + +blocking_lock: + afr_dom_lock_release(frame); + AFR_ONALL(frame, afr_dom_lock_acquire_cbk, finodelk, AFR_LK_HEAL_DOM, + local->fd, F_SETLKW, &flock, NULL); + if (!afr_has_quorum(local->cont.lk.dom_locked_nodes, frame->this, NULL)) { + afr_dom_lock_release(frame); + return -afr_quorum_errno(priv); + } + + return 0; +} + +int +afr_dom_lock_release_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + afr_local_t *local = frame->local; + afr_private_t *priv = this->private; + int i = (long)cookie; + + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, + "%s: Failed to release %s on %s", local->loc.path, + AFR_LK_HEAL_DOM, priv->children[i]->name); + } + local->cont.lk.dom_locked_nodes[i] = 0; + + syncbarrier_wake(&local->barrier); + + return 0; +} + +void +afr_dom_lock_release(call_frame_t *frame) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + unsigned char *locked_on = NULL; + struct gf_flock flock = { + 0, + }; + + local = frame->local; + priv = frame->this->private; + locked_on = local->cont.lk.dom_locked_nodes; + if (AFR_COUNT(locked_on, priv->child_count) == 0) + return; + flock.l_type = F_UNLCK; + + AFR_ONLIST(locked_on, frame, afr_dom_lock_release_cbk, finodelk, + AFR_LK_HEAL_DOM, local->fd, F_SETLK, &flock, NULL); + + return; +} + +static void +afr_lk_heal_info_cleanup(afr_lk_heal_info_t *info) +{ + if (!info) + return; + if (info->xdata_req) + dict_unref(info->xdata_req); + if (info->fd) + fd_unref(info->fd); + GF_FREE(info->locked_nodes); + GF_FREE(info->child_up_event_gen); + GF_FREE(info->child_down_event_gen); + GF_FREE(info); +} + +static int +afr_add_lock_to_saved_locks(call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = this->private; + afr_local_t *local = frame->local; + afr_lk_heal_info_t *info = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int ret = -ENOMEM; + + info = GF_CALLOC(sizeof(*info), 1, gf_afr_mt_lk_heal_info_t); + if (!info) { + goto cleanup; + } + INIT_LIST_HEAD(&info->pos); + info->fd = fd_ref(local->fd); + info->cmd = local->cont.lk.cmd; + info->pid = frame->root->pid; + info->flock = local->cont.lk.user_flock; + info->xdata_req = dict_copy_with_ref(local->xdata_req, NULL); + if (!info->xdata_req) { + goto cleanup; + } + info->lk_owner = frame->root->lk_owner; + info->locked_nodes = GF_MALLOC( + sizeof(*info->locked_nodes) * priv->child_count, gf_afr_mt_char); + if (!info->locked_nodes) { + goto cleanup; + } + memcpy(info->locked_nodes, local->cont.lk.locked_nodes, + sizeof(*info->locked_nodes) * priv->child_count); + info->child_up_event_gen = GF_CALLOC(sizeof(*info->child_up_event_gen), + priv->child_count, gf_afr_mt_int32_t); + if (!info->child_up_event_gen) { + goto cleanup; + } + info->child_down_event_gen = GF_CALLOC(sizeof(*info->child_down_event_gen), + priv->child_count, + gf_afr_mt_int32_t); + if (!info->child_down_event_gen) { + goto cleanup; + } + + 
LOCK(&local->fd->lock); + { + fd_ctx = __afr_fd_ctx_get(local->fd, this); + if (fd_ctx) + fd_ctx->lk_heal_info = info; + } + UNLOCK(&local->fd->lock); + if (!fd_ctx) { + goto cleanup; + } + + LOCK(&priv->lock); + { + list_add_tail(&info->pos, &priv->saved_locks); + } + UNLOCK(&priv->lock); + + return 0; +cleanup: + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_LK_HEAL_DOM, + "%s: Failed to add lock to healq", + uuid_utoa(local->fd->inode->gfid)); + if (info) { + afr_lk_heal_info_cleanup(info); + if (fd_ctx) { + LOCK(&local->fd->lock); + { + fd_ctx->lk_heal_info = NULL; + } + UNLOCK(&local->fd->lock); + } + } + return ret; +} + +static int +afr_remove_lock_from_saved_locks(afr_local_t *local, xlator_t *this) +{ + afr_private_t *priv = this->private; + struct gf_flock flock = local->cont.lk.user_flock; + afr_lk_heal_info_t *info = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int ret = -EINVAL; + + fd_ctx = afr_fd_ctx_get(local->fd, this); + if (!fd_ctx || !fd_ctx->lk_heal_info) { + goto out; + } + + info = fd_ctx->lk_heal_info; + if ((info->flock.l_start != flock.l_start) || + (info->flock.l_whence != flock.l_whence) || + (info->flock.l_len != flock.l_len)) { + /*TODO: Compare lkowners too.*/ + goto out; + } + + LOCK(&priv->lock); + { + list_del(&fd_ctx->lk_heal_info->pos); + } + UNLOCK(&priv->lock); + + afr_lk_heal_info_cleanup(info); + fd_ctx->lk_heal_info = NULL; + ret = 0; +out: + if (ret) + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_LK_HEAL_DOM, + "%s: Failed to remove lock from healq", + uuid_utoa(local->fd->inode->gfid)); + return ret; +} + +int +afr_lock_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata) +{ + afr_local_t *local = frame->local; + int i = (long)cookie; + + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (op_ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, + "Failed to heal lock on child %d for %s", i, + uuid_utoa(local->fd->inode->gfid)); + } + syncbarrier_wake(&local->barrier); + return 0; +} + +int +afr_getlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct gf_flock *lock, dict_t *xdata) +{ + afr_local_t *local = frame->local; + int i = (long)cookie; + + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (op_ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, + "Failed getlk for %s", uuid_utoa(local->fd->inode->gfid)); + } else { + local->cont.lk.getlk_rsp[i] = *lock; + } + + syncbarrier_wake(&local->barrier); + return 0; +} + +static gf_boolean_t +afr_does_lk_owner_match(call_frame_t *frame, afr_private_t *priv, + afr_lk_heal_info_t *info) +{ + int i = 0; + afr_local_t *local = frame->local; + struct gf_flock flock = { + 0, + }; + gf_boolean_t ret = _gf_true; + char *wind_on = alloca0(priv->child_count); + unsigned char *success_replies = alloca0(priv->child_count); + local->cont.lk.getlk_rsp = GF_CALLOC(sizeof(*local->cont.lk.getlk_rsp), + priv->child_count, gf_afr_mt_gf_lock); + + flock = info->flock; + for (i = 0; i < priv->child_count; i++) { + if (info->locked_nodes[i]) + wind_on[i] = 1; + } + + AFR_ONLIST(wind_on, frame, afr_getlk_cbk, lk, info->fd, F_GETLK, &flock, + info->xdata_req); + + afr_fill_success_replies(local, priv, success_replies); + if (AFR_COUNT(success_replies, priv->child_count) == 0) { + ret = _gf_false; + goto out; + } + + for (i = 0; 
i < priv->child_count; i++) { + if (!local->replies[i].valid || local->replies[i].op_ret != 0) + continue; + if (local->cont.lk.getlk_rsp[i].l_type == F_UNLCK) + continue; + /*TODO: Do we really need to compare lkowner if F_UNLCK is true?*/ + if (!is_same_lkowner(&local->cont.lk.getlk_rsp[i].l_owner, + &info->lk_owner)) { + ret = _gf_false; + break; + } + } +out: + afr_local_replies_wipe(local, priv); + GF_FREE(local->cont.lk.getlk_rsp); + local->cont.lk.getlk_rsp = NULL; + return ret; +} + +static void +afr_mark_fd_bad(fd_t *fd, xlator_t *this) +{ + afr_fd_ctx_t *fd_ctx = NULL; + + if (!fd) + return; + LOCK(&fd->lock); + { + fd_ctx = __afr_fd_ctx_get(fd, this); + if (fd_ctx) { + fd_ctx->is_fd_bad = _gf_true; + fd_ctx->lk_heal_info = NULL; + } + } + UNLOCK(&fd->lock); +} + +static void +afr_add_lock_to_lkhealq(afr_private_t *priv, afr_lk_heal_info_t *info) +{ + LOCK(&priv->lock); + { + list_del(&info->pos); + list_add_tail(&info->pos, &priv->lk_healq); + } + UNLOCK(&priv->lock); +} + +static void +afr_lock_heal_do(call_frame_t *frame, afr_private_t *priv, + afr_lk_heal_info_t *info) +{ + int i = 0; + int op_errno = 0; + int32_t *current_event_gen = NULL; + afr_local_t *local = frame->local; + xlator_t *this = frame->this; + char *wind_on = alloca0(priv->child_count); + gf_boolean_t retry = _gf_true; + + frame->root->pid = info->pid; + lk_owner_copy(&frame->root->lk_owner, &info->lk_owner); + + op_errno = -afr_dom_lock_acquire(frame); + if ((op_errno != 0)) { + goto release; + } + + if (!afr_does_lk_owner_match(frame, priv, info)) { + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_LK_HEAL_DOM, + "Ignoring lock heal for %s since lk-onwers mismatch. " + "Lock possibly pre-empted by another client.", + uuid_utoa(info->fd->inode->gfid)); + goto release; + } + + for (i = 0; i < priv->child_count; i++) { + if (info->locked_nodes[i]) + continue; + wind_on[i] = 1; + } + + current_event_gen = alloca(priv->child_count); + memcpy(current_event_gen, info->child_up_event_gen, + priv->child_count * sizeof *current_event_gen); + AFR_ONLIST(wind_on, frame, afr_lock_heal_cbk, lk, info->fd, info->cmd, + &info->flock, info->xdata_req); + + LOCK(&priv->lock); + { + for (i = 0; i < priv->child_count; i++) { + if (!wind_on[i]) + continue; + if ((!local->replies[i].valid) || (local->replies[i].op_ret != 0)) { + continue; + } + + if ((current_event_gen[i] == info->child_up_event_gen[i]) && + (current_event_gen[i] > info->child_down_event_gen[i])) { + info->locked_nodes[i] = 1; + retry = _gf_false; + list_del_init(&info->pos); + list_add_tail(&info->pos, &priv->saved_locks); + } else { + /*We received subsequent child up/down events while heal was in + * progress; don't mark child as healed. 
Attempt again on the + * new child up*/ + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_LK_HEAL_DOM, + "Event gen mismatch: skipped healing lock on child %d " + "for %s.", + i, uuid_utoa(info->fd->inode->gfid)); + } + } + } + UNLOCK(&priv->lock); + +release: + afr_dom_lock_release(frame); + if (retry) + afr_add_lock_to_lkhealq(priv, info); + return; +} + +static int +afr_lock_heal_done(int ret, call_frame_t *frame, void *opaque) +{ + STACK_DESTROY(frame->root); + return 0; +} + +static int +afr_lock_heal(void *opaque) +{ + call_frame_t *frame = (call_frame_t *)opaque; + call_frame_t *iter_frame = NULL; + xlator_t *this = frame->this; + afr_private_t *priv = this->private; + afr_lk_heal_info_t *info = NULL; + afr_lk_heal_info_t *tmp = NULL; + struct list_head healq = { + 0, + }; + int ret = 0; + + iter_frame = afr_copy_frame(frame); + if (!iter_frame) { + return ENOMEM; + } + + INIT_LIST_HEAD(&healq); + LOCK(&priv->lock); + { + list_splice_init(&priv->lk_healq, &healq); + } + UNLOCK(&priv->lock); + + list_for_each_entry_safe(info, tmp, &healq, pos) + { + GF_ASSERT((AFR_COUNT(info->locked_nodes, priv->child_count) < + priv->child_count)); + ((afr_local_t *)(iter_frame->local))->fd = fd_ref(info->fd); + afr_lock_heal_do(iter_frame, priv, info); + AFR_STACK_RESET(iter_frame); + if (iter_frame->local == NULL) { + ret = ENOTCONN; + gf_msg(frame->this->name, GF_LOG_ERROR, ENOTCONN, + AFR_MSG_LK_HEAL_DOM, + "Aborting processing of lk_healq." + "Healing will be reattempted on next child up for locks " + "that are still in quorum."); + LOCK(&priv->lock); + { + list_add_tail(&healq, &priv->lk_healq); + } + UNLOCK(&priv->lock); + break; + } + } + + AFR_STACK_DESTROY(iter_frame); + return ret; +} + +static int +__afr_lock_heal_synctask(xlator_t *this, afr_private_t *priv, int child) +{ + int ret = 0; + call_frame_t *frame = NULL; + afr_lk_heal_info_t *info = NULL; + afr_lk_heal_info_t *tmp = NULL; + + if (priv->shd.iamshd) + return 0; + + list_for_each_entry_safe(info, tmp, &priv->saved_locks, pos) + { + info->child_up_event_gen[child] = priv->event_generation; + list_del_init(&info->pos); + list_add_tail(&info->pos, &priv->lk_healq); + } + + frame = create_frame(this, this->ctx->pool); + if (!frame) + return -1; + + ret = synctask_new(this->ctx->env, afr_lock_heal, afr_lock_heal_done, frame, + frame); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_LK_HEAL_DOM, + "Failed to launch lock heal synctask"); + + return ret; +} + +static int +__afr_mark_pending_lk_heal(xlator_t *this, afr_private_t *priv, int child) +{ + afr_lk_heal_info_t *info = NULL; + afr_lk_heal_info_t *tmp = NULL; + + if (priv->shd.iamshd) + return 0; + list_for_each_entry_safe(info, tmp, &priv->saved_locks, pos) + { + info->child_down_event_gen[child] = priv->event_generation; + if (info->locked_nodes[child] == 1) + info->locked_nodes[child] = 0; + if (!afr_has_quorum(info->locked_nodes, this, NULL)) { + /* Since the lock was lost on quorum no. of nodes, we should + * not attempt to heal it anymore. Some other client could have + * acquired the lock, modified data and released it and this + * client wouldn't know about it if we heal it.*/ + afr_mark_fd_bad(info->fd, this); + list_del(&info->pos); + afr_lk_heal_info_cleanup(info); + /* We're not winding an unlock on the node where the lock is still + * present because when fencing logic switches over to the new + * client (since we marked the fd bad), it should preempt any + * existing lock. 
*/ + } + } + return 0; +} + gf_boolean_t afr_is_consistent_io_possible(afr_local_t *local, afr_private_t *priv, int32_t *op_errno) @@ -70,6 +693,19 @@ afr_is_consistent_io_possible(afr_local_t *local, afr_private_t *priv, return _gf_true; } +gf_boolean_t +afr_is_lock_mode_mandatory(dict_t *xdata) +{ + int ret = 0; + uint32_t lk_mode = GF_LK_ADVISORY; + + ret = dict_get_uint32(xdata, GF_LOCK_MODE, &lk_mode); + if (!ret && lk_mode == GF_LK_MANDATORY) + return _gf_true; + + return _gf_false; +} + call_frame_t * afr_copy_frame(call_frame_t *base) { @@ -284,7 +920,7 @@ __afr_set_in_flight_sb_status(xlator_t *this, afr_local_t *local, metadatamap |= (1 << index); } if (metadatamap_old != metadatamap) { - event = 0; + __afr_inode_need_refresh_set(inode, this); } break; @@ -297,7 +933,7 @@ __afr_set_in_flight_sb_status(xlator_t *this, afr_local_t *local, datamap |= (1 << index); } if (datamap_old != datamap) - event = 0; + __afr_inode_need_refresh_set(inode, this); break; default: @@ -461,34 +1097,6 @@ out: } int -__afr_inode_event_gen_reset_small(inode_t *inode, xlator_t *this) -{ - int ret = -1; - uint16_t datamap = 0; - uint16_t metadatamap = 0; - uint32_t event = 0; - uint64_t val = 0; - afr_inode_ctx_t *ctx = NULL; - - ret = __afr_inode_ctx_get(this, inode, &ctx); - if (ret) - return ret; - - val = ctx->read_subvol; - - metadatamap = (val & 0x000000000000ffff) >> 0; - datamap = (val & 0x00000000ffff0000) >> 16; - event = 0; - - val = ((uint64_t)metadatamap) | (((uint64_t)datamap) << 16) | - (((uint64_t)event) << 32); - - ctx->read_subvol = val; - - return ret; -} - -int __afr_inode_read_subvol_get(inode_t *inode, xlator_t *this, unsigned char *data, unsigned char *metadata, int *event_p) { @@ -559,22 +1167,6 @@ out: } int -__afr_inode_event_gen_reset(inode_t *inode, xlator_t *this) -{ - afr_private_t *priv = NULL; - int ret = -1; - - priv = this->private; - - if (priv->child_count <= 16) - ret = __afr_inode_event_gen_reset_small(inode, this); - else - ret = -1; - - return ret; -} - -int afr_inode_read_subvol_get(inode_t *inode, xlator_t *this, unsigned char *data, unsigned char *metadata, int *event_p) { @@ -640,12 +1232,11 @@ afr_inode_get_readable(call_frame_t *frame, inode_t *inode, xlator_t *this, return 0; } -int +static int afr_inode_split_brain_choice_get(inode_t *inode, xlator_t *this, int *spb_choice) { int ret = -1; - GF_VALIDATE_OR_GOTO(this->name, inode, out); LOCK(&inode->lock); @@ -657,6 +1248,40 @@ out: return ret; } +/* + * frame is used to get the favourite policy. Since + * afr_inode_split_brain_choice_get was called with afr_open, it is possible to + * have a frame with out local->replies. So in that case, frame is passed as + * null, hence this function will handle the frame NULL case. 
+ */ +int +afr_split_brain_read_subvol_get(inode_t *inode, xlator_t *this, + call_frame_t *frame, int *spb_subvol) +{ + int ret = -1; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + GF_VALIDATE_OR_GOTO("afr", this, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); + GF_VALIDATE_OR_GOTO(this->name, spb_subvol, out); + + priv = this->private; + + ret = afr_inode_split_brain_choice_get(inode, this, spb_subvol); + if (*spb_subvol < 0 && priv->fav_child_policy && frame && frame->local) { + local = frame->local; + *spb_subvol = afr_sh_get_fav_by_policy(this, local->replies, inode, + NULL); + if (*spb_subvol >= 0) { + ret = 0; + } + } + +out: + return ret; +} int afr_inode_read_subvol_set(inode_t *inode, xlator_t *this, unsigned char *data, unsigned char *metadata, int event) @@ -723,30 +1348,22 @@ out: return need_refresh; } -static int -afr_inode_need_refresh_set(inode_t *inode, xlator_t *this) +int +__afr_inode_need_refresh_set(inode_t *inode, xlator_t *this) { int ret = -1; afr_inode_ctx_t *ctx = NULL; - GF_VALIDATE_OR_GOTO(this->name, inode, out); - - LOCK(&inode->lock); - { - ret = __afr_inode_ctx_get(this, inode, &ctx); - if (ret) - goto unlock; - + ret = __afr_inode_ctx_get(this, inode, &ctx); + if (ret == 0) { ctx->need_refresh = _gf_true; } -unlock: - UNLOCK(&inode->lock); -out: + return ret; } int -afr_inode_event_gen_reset(inode_t *inode, xlator_t *this) +afr_inode_need_refresh_set(inode_t *inode, xlator_t *this) { int ret = -1; @@ -754,7 +1371,7 @@ afr_inode_event_gen_reset(inode_t *inode, xlator_t *this) LOCK(&inode->lock); { - ret = __afr_inode_event_gen_reset(inode, this); + ret = __afr_inode_need_refresh_set(inode, this); } UNLOCK(&inode->lock); out: @@ -820,7 +1437,6 @@ afr_set_split_brain_choice(int ret, call_frame_t *frame, void *opaque) gf_boolean_t timer_set = _gf_false; gf_boolean_t timer_cancelled = _gf_false; gf_boolean_t timer_reset = _gf_false; - gf_boolean_t need_invalidate = _gf_true; int old_spb_choice = -1; frame = data->frame; @@ -932,7 +1548,6 @@ afr_set_split_brain_choice(int ret, call_frame_t *frame, void *opaque) timer_set = _gf_true; if (timer_reset && !ctx->timer) timer_cancelled = _gf_true; - need_invalidate = _gf_false; } unlock: UNLOCK(&inode->lock); @@ -946,8 +1561,7 @@ post_unlock: * reads from an older cached value despite a change in spb_choice to * a new value. */ - if (need_invalidate) - inode_invalidate(inode); + inode_invalidate(inode); out: GF_FREE(data); AFR_STACK_UNWIND(setxattr, frame, ret, op_errno, NULL); @@ -1054,6 +1668,8 @@ afr_readables_fill(call_frame_t *frame, xlator_t *this, inode_t *inode, ia_type = inode->ia_type; } + if (!xdata) + continue; /* mkdir_cbk sends NULL xdata_rsp. */ afr_accused_fill(this, xdata, data_accused, (ia_type == IA_IFDIR) ? 
AFR_ENTRY_TRANSACTION : AFR_DATA_TRANSACTION); @@ -1175,7 +1791,6 @@ afr_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err) inode_t *inode = NULL; int event_generation = 0; int read_subvol = -1; - int op_errno = ENOMEM; int ret = 0; local = frame->local; @@ -1191,7 +1806,7 @@ afr_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err) ret = afr_inode_get_readable(frame, inode, this, local->readable, &event_generation, local->transaction.type); - if (ret == -EIO || (local->is_read_txn && !event_generation)) { + if (ret == -EIO) { /* No readable subvolume even after refresh ==> splitbrain.*/ if (!priv->fav_child_policy) { err = EIO; @@ -1204,18 +1819,12 @@ afr_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err) goto refresh_done; } - heal_frame = copy_frame(frame); + heal_frame = afr_frame_create(this, NULL); if (!heal_frame) { err = EIO; goto refresh_done; } - heal_frame->root->pid = GF_CLIENT_PID_SELF_HEALD; - heal_local = AFR_FRAME_INIT(heal_frame, op_errno); - if (!heal_local) { - err = EIO; - AFR_STACK_DESTROY(heal_frame); - goto refresh_done; - } + heal_local = heal_frame->local; heal_local->xdata_req = dict_new(); if (!heal_local->xdata_req) { err = EIO; @@ -1236,18 +1845,6 @@ refresh_done: return 0; } -static void -afr_fill_success_replies(afr_local_t *local, afr_private_t *priv, - unsigned char *replies) -{ - int i = 0; - - for (i = 0; i < priv->child_count; i++) { - if (local->replies[i].valid && local->replies[i].op_ret == 0) - replies[i] = 1; - } -} - int afr_inode_refresh_done(call_frame_t *frame, xlator_t *this, int error) { @@ -1257,7 +1854,6 @@ afr_inode_refresh_done(call_frame_t *frame, xlator_t *this, int error) gf_boolean_t start_heal = _gf_false; afr_local_t *heal_local = NULL; unsigned char *success_replies = NULL; - int op_errno = ENOMEM; int ret = 0; if (error != 0) { @@ -1269,32 +1865,32 @@ afr_inode_refresh_done(call_frame_t *frame, xlator_t *this, int error) success_replies = alloca0(priv->child_count); afr_fill_success_replies(local, priv, success_replies); - if (!afr_has_quorum(success_replies, this, frame)) { - error = afr_final_errno(frame->local, this->private); - if (!error) - error = afr_quorum_errno(priv); - goto refresh_done; - } - if (priv->thin_arbiter_count && local->is_read_txn && AFR_COUNT(success_replies, priv->child_count) != priv->child_count) { /* We need to query the good bricks and/or thin-arbiter.*/ + if (success_replies[0]) { + local->read_txn_query_child = AFR_CHILD_ZERO; + } else if (success_replies[1]) { + local->read_txn_query_child = AFR_CHILD_ONE; + } error = EINVAL; goto refresh_done; } + if (!afr_has_quorum(success_replies, this, frame)) { + error = afr_final_errno(frame->local, this->private); + if (!error) + error = afr_quorum_errno(priv); + goto refresh_done; + } + ret = afr_replies_interpret(frame, this, local->refreshinode, &start_heal); if (ret && afr_selfheal_enabled(this) && start_heal) { - heal_frame = copy_frame(frame); + heal_frame = afr_frame_create(this, NULL); if (!heal_frame) goto refresh_done; - heal_frame->root->pid = GF_CLIENT_PID_SELF_HEALD; - heal_local = AFR_FRAME_INIT(heal_frame, op_errno); - if (!heal_local) { - AFR_STACK_DESTROY(heal_frame); - goto refresh_done; - } + heal_local = heal_frame->local; heal_local->refreshinode = inode_ref(local->refreshinode); heal_local->heal_frame = heal_frame; if (!afr_throttled_selfheal(heal_frame, this)) { @@ -1331,17 +1927,22 @@ afr_inode_refresh_subvol_cbk(call_frame_t *frame, void *cookie, xlator_t *this, if (xdata) 
local->replies[call_child].xdata = dict_ref(xdata); } + if (xdata) { ret = dict_get_int8(xdata, "link-count", &need_heal); - local->replies[call_child].need_heal = need_heal; - } else { - local->replies[call_child].need_heal = need_heal; + if (ret) { + gf_msg_debug(this->name, -ret, "Unable to get link count"); + } } + local->replies[call_child].need_heal = need_heal; call_count = afr_frame_return(frame); if (call_count == 0) { afr_set_need_heal(this, local); ret = afr_inode_refresh_err(frame, this); + if (ret) { + gf_msg_debug(this->name, ret, "afr_inode_refresh_err failed"); + } afr_inode_refresh_done(frame, this, ret); } } @@ -1610,19 +2211,18 @@ out: } int -afr_least_pending_reads_child(afr_private_t *priv) +afr_least_pending_reads_child(afr_private_t *priv, unsigned char *readable) { int i = 0; - int child = 0; + int child = -1; int64_t read_iter = -1; int64_t pending_read = -1; - pending_read = GF_ATOMIC_GET(priv->pending_reads[0]); - for (i = 1; i < priv->child_count; i++) { - if (AFR_IS_ARBITER_BRICK(priv, i)) + for (i = 0; i < priv->child_count; i++) { + if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i]) continue; read_iter = GF_ATOMIC_GET(priv->pending_reads[i]); - if (read_iter < pending_read) { + if (child == -1 || read_iter < pending_read) { pending_read = read_iter; child = i; } @@ -1631,8 +2231,54 @@ afr_least_pending_reads_child(afr_private_t *priv) return child; } +static int32_t +afr_least_latency_child(afr_private_t *priv, unsigned char *readable) +{ + int32_t i = 0; + int child = -1; + + for (i = 0; i < priv->child_count; i++) { + if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i] || + priv->child_latency[i] < 0) + continue; + + if (child == -1 || + priv->child_latency[i] < priv->child_latency[child]) { + child = i; + } + } + return child; +} + +static int32_t +afr_least_latency_times_pending_reads_child(afr_private_t *priv, + unsigned char *readable) +{ + int32_t i = 0; + int child = -1; + int64_t pending_read = 0; + int64_t latency = -1; + int64_t least_latency = -1; + + for (i = 0; i < priv->child_count; i++) { + if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i] || + priv->child_latency[i] < 0) + continue; + + pending_read = GF_ATOMIC_GET(priv->pending_reads[i]); + latency = (pending_read + 1) * priv->child_latency[i]; + + if (child == -1 || latency < least_latency) { + least_latency = latency; + child = i; + } + } + return child; +} + int -afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv) +afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv, + unsigned char *readable) { uuid_t gfid_copy = { 0, @@ -1641,14 +2287,14 @@ afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv) int child = -1; switch (priv->hash_mode) { - case 0: + case AFR_READ_POLICY_FIRST_UP: break; - case 1: + case AFR_READ_POLICY_GFID_HASH: gf_uuid_copy(gfid_copy, args->gfid); child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) % priv->child_count; break; - case 2: + case AFR_READ_POLICY_GFID_PID_HASH: if (args->ia_type != IA_IFDIR) { /* * Why getpid? Because it's one of the cheapest calls @@ -1660,14 +2306,21 @@ afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv) * need is a low probability that multiple clients * won't converge on the same subvolume. 
*/ + gf_uuid_copy(gfid_copy, args->gfid); pid = getpid(); - memcpy(gfid_copy, &pid, sizeof(pid)); + *(pid_t *)gfid_copy ^= pid; } child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) % priv->child_count; break; - case 3: - child = afr_least_pending_reads_child(priv); + case AFR_READ_POLICY_LESS_LOAD: + child = afr_least_pending_reads_child(priv, readable); + break; + case AFR_READ_POLICY_LEAST_LATENCY: + child = afr_least_latency_child(priv, readable); + break; + case AFR_READ_POLICY_LOAD_LATENCY_HYBRID: + child = afr_least_latency_times_pending_reads_child(priv, readable); break; } @@ -1700,7 +2353,7 @@ afr_read_subvol_select_by_policy(inode_t *inode, xlator_t *this, } /* second preference - use hashed mode */ - read_subvol = afr_hash_child(&local_args, priv); + read_subvol = afr_hash_child(&local_args, priv, readable); if (read_subvol >= 0 && readable[read_subvol]) return read_subvol; @@ -2006,6 +2659,9 @@ afr_local_cleanup(afr_local_t *local, xlator_t *this) { /* lk */ GF_FREE(local->cont.lk.locked_nodes); + GF_FREE(local->cont.lk.dom_locked_nodes); + GF_FREE(local->cont.lk.dom_lock_op_ret); + GF_FREE(local->cont.lk.dom_lock_op_errno); } { /* create */ @@ -2236,7 +2892,7 @@ afr_attempt_readsubvol_set(call_frame_t *frame, xlator_t *this, { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int spb_choice = -1; + int spb_subvol = -1; int child_count = -1; if (*read_subvol != -1) @@ -2246,11 +2902,12 @@ afr_attempt_readsubvol_set(call_frame_t *frame, xlator_t *this, local = frame->local; child_count = priv->child_count; - afr_inode_split_brain_choice_get(local->inode, this, &spb_choice); - if ((spb_choice >= 0) && + afr_split_brain_read_subvol_get(local->inode, this, frame, &spb_subvol); + if ((spb_subvol >= 0) && (AFR_COUNT(success_replies, child_count) == child_count)) { - *read_subvol = spb_choice; - } else if (!priv->quorum_count) { + *read_subvol = spb_subvol; + } else if (!priv->quorum_count || + frame->root->pid == GF_CLIENT_PID_GLFS_HEAL) { *read_subvol = afr_first_up_child(frame, this); } else if (priv->quorum_count && afr_has_quorum(data_readable, this, NULL)) { @@ -2289,6 +2946,7 @@ afr_lookup_done(call_frame_t *frame, xlator_t *this) 0, }; gf_boolean_t locked_entry = _gf_false; + gf_boolean_t in_flight_create = _gf_false; gf_boolean_t can_interpret = _gf_true; inode_t *parent = NULL; ia_type_t ia_type = IA_INVAL; @@ -2332,17 +2990,12 @@ afr_lookup_done(call_frame_t *frame, xlator_t *this) if (!replies[i].valid) continue; - if (locked_entry && replies[i].op_ret == -1 && - replies[i].op_errno == ENOENT) { - /* Second, check entry is still - "underway" in creation */ - local->op_ret = -1; - local->op_errno = ENOENT; - goto error; - } - - if (replies[i].op_ret == -1) + if (replies[i].op_ret == -1) { + if (locked_entry && replies[i].op_errno == ENOENT) { + in_flight_create = _gf_true; + } continue; + } if (read_subvol == -1 || !readable[read_subvol]) { read_subvol = i; @@ -2352,6 +3005,12 @@ afr_lookup_done(call_frame_t *frame, xlator_t *this) } } + if (in_flight_create && !afr_has_quorum(success_replies, this, NULL)) { + local->op_ret = -1; + local->op_errno = ENOENT; + goto error; + } + if (read_subvol == -1) goto error; /* We now have a read_subvol, which is readable[] (if there @@ -2410,7 +3069,7 @@ afr_lookup_done(call_frame_t *frame, xlator_t *this) if (read_subvol == -1) goto cant_interpret; if (ret) { - afr_inode_event_gen_reset(local->inode, this); + afr_inode_need_refresh_set(local->inode, this); dict_del_sizen(local->replies[read_subvol].xdata, 
GF_CONTENT_KEY); } } else { @@ -2463,7 +3122,7 @@ error: * others in that they must be given higher priority while * returning to the user. * - * The hierarchy is ENODATA > ENOENT > ESTALE > others + * The hierarchy is ENODATA > ENOENT > ESTALE > ENOSPC others */ int @@ -2475,6 +3134,8 @@ afr_higher_errno(int32_t old_errno, int32_t new_errno) return ENOENT; if (old_errno == ESTALE || new_errno == ESTALE) return ESTALE; + if (old_errno == ENOSPC || new_errno == ENOSPC) + return ENOSPC; return new_errno; } @@ -2604,6 +3265,10 @@ afr_lookup_sh_metadata_wrap(void *opaque) dict = dict_new(); if (!dict) goto out; + if (local->xattr_req) { + dict_copy(local->xattr_req, dict); + } + ret = dict_set_sizen_str_sizen(dict, "link-count", GF_XATTROP_INDEX_COUNT); if (ret) { gf_msg_debug(this->name, -ret, "Unable to set link-count in dict "); @@ -2612,7 +3277,7 @@ afr_lookup_sh_metadata_wrap(void *opaque) if (loc_is_nameless(&local->loc)) { ret = afr_selfheal_unlocked_discover_on(frame, local->inode, local->loc.gfid, local->replies, - local->child_up); + local->child_up, dict); } else { inode = afr_selfheal_unlocked_lookup_on(frame, local->loc.parent, local->loc.name, local->replies, @@ -2786,7 +3451,7 @@ afr_lookup_selfheal_wrap(void *opaque) inode = afr_selfheal_unlocked_lookup_on(frame, local->loc.parent, local->loc.name, local->replies, - local->child_up, NULL); + local->child_up, local->xattr_req); if (inode) inode_unref(inode); @@ -2962,6 +3627,7 @@ afr_discover_unwind(call_frame_t *frame, xlator_t *this) afr_private_t *priv = NULL; afr_local_t *local = NULL; int read_subvol = -1; + int ret = 0; unsigned char *data_readable = NULL; unsigned char *success_replies = NULL; @@ -2983,7 +3649,10 @@ afr_discover_unwind(call_frame_t *frame, xlator_t *this) if (!afr_has_quorum(success_replies, this, frame)) goto unwind; - afr_replies_interpret(frame, this, local->inode, NULL); + ret = afr_replies_interpret(frame, this, local->inode, NULL); + if (ret) { + afr_inode_need_refresh_set(local->inode, this); + } read_subvol = afr_read_subvol_decide(local->inode, this, NULL, data_readable); @@ -3035,7 +3704,7 @@ afr_ta_id_file_check(void *opaque) this = opaque; priv = this->private; - ret = afr_fill_ta_loc(this, &loc); + ret = afr_fill_ta_loc(this, &loc, _gf_false); if (ret) { gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, "Failed to populate thin-arbiter loc for: %s.", loc.name); @@ -3226,10 +3895,15 @@ afr_discover(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) local->inode = inode_ref(loc->inode); - if (xattr_req) + if (xattr_req) { /* If xattr_req was null, afr_lookup_xattr_req_prepare() will allocate one for us */ - local->xattr_req = dict_ref(xattr_req); + local->xattr_req = dict_copy_with_ref(xattr_req, NULL); + if (!local->xattr_req) { + op_errno = ENOMEM; + goto out; + } + } if (gf_uuid_is_null(loc->inode->gfid)) { afr_discover_do(frame, this, 0); @@ -3239,11 +3913,7 @@ afr_discover(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) afr_read_subvol_get(loc->inode, this, NULL, NULL, &event, AFR_DATA_TRANSACTION, NULL); - if (afr_is_inode_refresh_reqd(loc->inode, this, event, - local->event_generation)) - afr_inode_refresh(frame, this, loc->inode, NULL, afr_discover_do); - else - afr_discover_do(frame, this, 0); + afr_discover_do(frame, this, 0); return 0; out: @@ -3344,11 +4014,10 @@ afr_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) return 0; } - if (__is_root_gfid(loc->parent->gfid)) { - if (!strcmp(loc->name, 
GF_REPLICATE_TRASH_DIR)) { - op_errno = EPERM; - goto out; - } + if (afr_is_private_directory(this->private, loc->parent->gfid, loc->name, + frame->root->pid)) { + op_errno = EPERM; + goto out; } local = AFR_FRAME_INIT(frame, op_errno); @@ -3384,11 +4053,7 @@ afr_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) afr_read_subvol_get(loc->parent, this, NULL, NULL, &event, AFR_DATA_TRANSACTION, NULL); - if (afr_is_inode_refresh_reqd(loc->inode, this, event, - local->event_generation)) - afr_inode_refresh(frame, this, loc->parent, NULL, afr_lookup_do); - else - afr_lookup_do(frame, this, 0); + afr_lookup_do(frame, this, 0); return 0; out: @@ -3398,8 +4063,18 @@ out: } void -_afr_cleanup_fd_ctx(afr_fd_ctx_t *fd_ctx) +_afr_cleanup_fd_ctx(xlator_t *this, afr_fd_ctx_t *fd_ctx) { + afr_private_t *priv = this->private; + + if (fd_ctx->lk_heal_info) { + LOCK(&priv->lock); + { + list_del(&fd_ctx->lk_heal_info->pos); + } + afr_lk_heal_info_cleanup(fd_ctx->lk_heal_info); + fd_ctx->lk_heal_info = NULL; + } GF_FREE(fd_ctx->opened_on); GF_FREE(fd_ctx); return; @@ -3419,7 +4094,7 @@ afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd) fd_ctx = (afr_fd_ctx_t *)(long)ctx; if (fd_ctx) { - _afr_cleanup_fd_ctx(fd_ctx); + _afr_cleanup_fd_ctx(this, fd_ctx); } out: @@ -3512,13 +4187,14 @@ __afr_fd_ctx_set(xlator_t *this, fd_t *fd) } fd_ctx->readdir_subvol = -1; + fd_ctx->lk_heal_info = NULL; ret = __fd_ctx_set(fd, this, (uint64_t)(long)fd_ctx); if (ret) gf_msg_debug(this->name, 0, "failed to set fd ctx (%p)", fd); out: if (ret && fd_ctx) - _afr_cleanup_fd_ctx(fd_ctx); + _afr_cleanup_fd_ctx(this, fd_ctx); return ret; } @@ -3542,11 +4218,10 @@ afr_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, } else { local->op_errno = op_errno; } + call_count = --local->call_count; } UNLOCK(&frame->lock); - call_count = afr_frame_return(frame); - if (call_count == 0) AFR_STACK_UNWIND(flush, frame, local->op_ret, local->op_errno, local->xdata_rsp); @@ -3642,6 +4317,7 @@ afr_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) call_stub_t *stub = NULL; int op_errno = ENOMEM; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); local = AFR_FRAME_INIT(frame, op_errno); if (!local) goto out; @@ -3682,11 +4358,10 @@ afr_fsyncdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, } else { local->op_errno = op_errno; } + call_count = --local->call_count; } UNLOCK(&frame->lock); - call_count = afr_frame_return(frame); - if (call_count == 0) AFR_STACK_UNWIND(fsyncdir, frame, local->op_ret, local->op_errno, local->xdata_rsp); @@ -4179,9 +4854,9 @@ out: } static int32_t -afr_handle_inodelk(call_frame_t *frame, glusterfs_fop_t fop, const char *volume, - loc_t *loc, fd_t *fd, int32_t cmd, struct gf_flock *flock, - dict_t *xdata) +afr_handle_inodelk(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop, + const char *volume, loc_t *loc, fd_t *fd, int32_t cmd, + struct gf_flock *flock, dict_t *xdata) { afr_local_t *local = NULL; int32_t op_errno = ENOMEM; @@ -4193,8 +4868,10 @@ afr_handle_inodelk(call_frame_t *frame, glusterfs_fop_t fop, const char *volume, local->op = fop; if (loc) loc_copy(&local->loc, loc); - if (fd) + if (fd && (flock->l_type != F_UNLCK)) { + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); local->fd = fd_ref(fd); + } local->cont.inodelk.volume = gf_strdup(volume); if (!local->cont.inodelk.volume) { @@ -4223,8 +4900,8 @@ int32_t afr_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *flock, 
dict_t *xdata) { - afr_handle_inodelk(frame, GF_FOP_INODELK, volume, loc, NULL, cmd, flock, - xdata); + afr_handle_inodelk(frame, this, GF_FOP_INODELK, volume, loc, NULL, cmd, + flock, xdata); return 0; } @@ -4232,15 +4909,16 @@ int32_t afr_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata) { - afr_handle_inodelk(frame, GF_FOP_FINODELK, volume, NULL, fd, cmd, flock, - xdata); + afr_handle_inodelk(frame, this, GF_FOP_FINODELK, volume, NULL, fd, cmd, + flock, xdata); return 0; } static int -afr_handle_entrylk(call_frame_t *frame, glusterfs_fop_t fop, const char *volume, - loc_t *loc, fd_t *fd, const char *basename, entrylk_cmd cmd, - entrylk_type type, dict_t *xdata) +afr_handle_entrylk(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop, + const char *volume, loc_t *loc, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) { afr_local_t *local = NULL; int32_t op_errno = ENOMEM; @@ -4252,8 +4930,10 @@ afr_handle_entrylk(call_frame_t *frame, glusterfs_fop_t fop, const char *volume, local->op = fop; if (loc) loc_copy(&local->loc, loc); - if (fd) + if (fd && (cmd != ENTRYLK_UNLOCK)) { + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); local->fd = fd_ref(fd); + } local->cont.entrylk.cmd = cmd; local->cont.entrylk.in_cmd = cmd; local->cont.entrylk.type = type; @@ -4280,8 +4960,8 @@ afr_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, const char *basename, entrylk_cmd cmd, entrylk_type type, dict_t *xdata) { - afr_handle_entrylk(frame, GF_FOP_ENTRYLK, volume, loc, NULL, basename, cmd, - type, xdata); + afr_handle_entrylk(frame, this, GF_FOP_ENTRYLK, volume, loc, NULL, basename, + cmd, type, xdata); return 0; } @@ -4290,8 +4970,8 @@ afr_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, const char *basename, entrylk_cmd cmd, entrylk_type type, dict_t *xdata) { - afr_handle_entrylk(frame, GF_FOP_FENTRYLK, volume, NULL, fd, basename, cmd, - type, xdata); + afr_handle_entrylk(frame, this, GF_FOP_FENTRYLK, volume, NULL, fd, basename, + cmd, type, xdata); return 0; } @@ -4303,10 +4983,10 @@ afr_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int call_count = 0; struct statvfs *buf = NULL; + local = frame->local; + LOCK(&frame->lock); { - local = frame->local; - if (op_ret != 0) { local->op_errno = op_errno; goto unlock; @@ -4332,10 +5012,9 @@ afr_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, } } unlock: + call_count = --local->call_count; UNLOCK(&frame->lock); - call_count = afr_frame_return(frame); - if (call_count == 0) AFR_STACK_UNWIND(statfs, frame, local->op_ret, local->op_errno, &local->cont.statfs.buf, local->xdata_rsp); @@ -4410,9 +5089,10 @@ afr_lk_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, } call_count = afr_frame_return(frame); - if (call_count == 0) + if (call_count == 0) { AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno, NULL, local->xdata_rsp); + } return 0; } @@ -4511,11 +5191,133 @@ afr_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, } int +afr_lk_transaction_cbk(int ret, call_frame_t *frame, void *opaque) +{ + return 0; +} + +int +afr_lk_txn_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata) +{ + afr_local_t *local = NULL; + int child_index = -1; + + local = frame->local; + child_index = (long)cookie; + 
afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata); + if (op_ret == 0) { + local->op_ret = 0; + local->op_errno = 0; + local->cont.lk.locked_nodes[child_index] = 1; + local->cont.lk.ret_flock = *lock; + } + syncbarrier_wake(&local->barrier); + return 0; +} + +int +afr_lk_txn_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata) +{ + afr_local_t *local = frame->local; + afr_private_t *priv = this->private; + int child_index = (long)cookie; + + if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_UNLOCK_FAIL, + "gfid=%s: unlock failed on subvolume %s " + "with lock owner %s", + uuid_utoa(local->fd->inode->gfid), + priv->children[child_index]->name, + lkowner_utoa(&frame->root->lk_owner)); + } + return 0; +} +int +afr_lk_transaction(void *opaque) +{ + call_frame_t *frame = NULL; + xlator_t *this = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + char *wind_on = NULL; + int op_errno = 0; + int i = 0; + int ret = 0; + + frame = (call_frame_t *)opaque; + local = frame->local; + this = frame->this; + priv = this->private; + wind_on = alloca0(priv->child_count); + + if (priv->arbiter_count || priv->child_count != 3) { + op_errno = ENOTSUP; + gf_msg(frame->this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, + "%s: Lock healing supported only for replica 3 volumes.", + uuid_utoa(local->fd->inode->gfid)); + goto err; + } + + op_errno = -afr_dom_lock_acquire(frame); // Released during + // AFR_STACK_UNWIND + if (op_errno != 0) { + goto err; + } + if (priv->quorum_count && + !afr_has_quorum(local->cont.lk.dom_locked_nodes, this, NULL)) { + op_errno = afr_final_errno(local, priv); + goto err; + } + + for (i = 0; i < priv->child_count; i++) { + if (priv->child_up[i] && local->cont.lk.dom_locked_nodes[i]) + wind_on[i] = 1; + } + AFR_ONLIST(wind_on, frame, afr_lk_txn_wind_cbk, lk, local->fd, + local->cont.lk.cmd, &local->cont.lk.user_flock, + local->xdata_req); + + if (priv->quorum_count && + !afr_has_quorum(local->cont.lk.locked_nodes, this, NULL)) { + local->op_ret = -1; + local->op_errno = afr_final_errno(local, priv); + goto unlock; + } else { + if (local->cont.lk.user_flock.l_type == F_UNLCK) + ret = afr_remove_lock_from_saved_locks(local, this); + else + ret = afr_add_lock_to_saved_locks(frame, this); + if (ret) { + local->op_ret = -1; + local->op_errno = -ret; + goto unlock; + } + AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno, + &local->cont.lk.ret_flock, local->xdata_rsp); + } + + return 0; + +unlock: + local->cont.lk.user_flock.l_type = F_UNLCK; + AFR_ONLIST(local->cont.lk.locked_nodes, frame, afr_lk_txn_unlock_cbk, lk, + local->fd, F_SETLK, &local->cont.lk.user_flock, NULL); +err: + AFR_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL); + return -1; +} + +int afr_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; + int ret = 0; int i = 0; int32_t op_errno = ENOMEM; @@ -4526,9 +5328,11 @@ afr_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, goto out; local->op = GF_FOP_LK; - if (!afr_lk_is_unlock(cmd, flock) && - !afr_is_consistent_io_possible(local, priv, &op_errno)) - goto out; + if (!afr_lk_is_unlock(cmd, flock)) { + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + if (!afr_is_consistent_io_possible(local, priv, &op_errno)) + goto out; + } local->cont.lk.locked_nodes = 
GF_CALLOC( priv->child_count, sizeof(*local->cont.lk.locked_nodes), @@ -4546,6 +5350,16 @@ afr_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, if (xdata) local->xdata_req = dict_ref(xdata); + if (afr_is_lock_mode_mandatory(xdata)) { + ret = synctask_new(this->ctx->env, afr_lk_transaction, + afr_lk_transaction_cbk, frame, frame); + if (ret) { + op_errno = ENOMEM; + goto out; + } + return 0; + } + STACK_WIND_COOKIE(frame, afr_lk_cbk, (void *)(long)0, priv->children[i], priv->children[i]->fops->lk, fd, cmd, flock, local->xdata_req); @@ -4867,6 +5681,8 @@ afr_priv_dump(xlator_t *this) GF_ATOMIC_GET(priv->pending_reads[i])); sprintf(key, "child_latency[%d]", i); gf_proc_dump_write(key, "%" PRId64, priv->child_latency[i]); + sprintf(key, "halo_child_up[%d]", i); + gf_proc_dump_write(key, "%d", priv->halo_child_up[i]); } gf_proc_dump_write("data_self_heal", "%d", priv->data_self_heal); gf_proc_dump_write("metadata_self_heal", "%d", priv->metadata_self_heal); @@ -4879,6 +5695,7 @@ afr_priv_dump(xlator_t *this) priv->background_self_heal_count); gf_proc_dump_write("healers", "%d", priv->healers); gf_proc_dump_write("read-hash-mode", "%d", priv->hash_mode); + gf_proc_dump_write("use-anonymous-inode", "%d", priv->use_anon_inode); if (priv->quorum_count == AFR_QUORUM_AUTO) { gf_proc_dump_write("quorum-type", "auto"); } else if (priv->quorum_count == 0) { @@ -4939,13 +5756,31 @@ __afr_get_up_children_count(afr_private_t *priv) return up_children; } +static int +__get_heard_from_all_status(xlator_t *this) +{ + afr_private_t *priv = this->private; + int i; + + for (i = 0; i < priv->child_count; i++) { + if (!priv->last_event[i]) { + return 0; + } + } + if (priv->thin_arbiter_count && !priv->ta_child_up) { + return 0; + } + return 1; +} + glusterfs_event_t -__afr_transform_event_from_state(afr_private_t *priv) +__afr_transform_event_from_state(xlator_t *this) { int i = 0; int up_children = 0; + afr_private_t *priv = this->private; - if (AFR_COUNT(priv->last_event, priv->child_count) == priv->child_count) + if (__get_heard_from_all_status(this)) /* have_heard_from_all. Let afr_notify() do the propagation. 
*/ return GF_EVENT_MAXVAL; @@ -4987,7 +5822,7 @@ afr_notify_cbk(void *data) goto unlock; } priv->timer = NULL; - event = __afr_transform_event_from_state(priv); + event = __afr_transform_event_from_state(this); if (event != GF_EVENT_MAXVAL) propagate = _gf_true; } @@ -5014,22 +5849,6 @@ __afr_launch_notify_timer(xlator_t *this, afr_private_t *priv) } } -int -__get_heard_from_all_status(xlator_t *this) -{ - afr_private_t *priv = this->private; - int heard_from_all = 1; - int i = 0; - - for (i = 0; i < priv->child_count; i++) { - if (!priv->last_event[i]) { - heard_from_all = 0; - break; - } - } - return heard_from_all; -} - static int find_best_down_child(xlator_t *this) { @@ -5041,7 +5860,7 @@ find_best_down_child(xlator_t *this) priv = this->private; for (i = 0; i < priv->child_count; i++) { - if (priv->child_up[i] && priv->child_latency[i] >= 0 && + if (!priv->child_up[i] && priv->child_latency[i] >= 0 && priv->child_latency[i] < best_latency) { best_child = i; best_latency = priv->child_latency[i]; @@ -5113,7 +5932,9 @@ __afr_handle_ping_event(xlator_t *this, xlator_t *child_xlator, const int idx, "), " "marking child down.", child_latency_msec, halo_max_latency_msec); - *event = GF_EVENT_CHILD_DOWN; + if (priv->halo_child_up[idx]) { + *event = GF_EVENT_CHILD_DOWN; + } } } else if (child_latency_msec < halo_max_latency_msec && priv->child_up[idx] == 0) { @@ -5125,7 +5946,9 @@ __afr_handle_ping_event(xlator_t *this, xlator_t *child_xlator, const int idx, "), " "marking child up.", child_latency_msec, halo_max_latency_msec); - *event = GF_EVENT_CHILD_UP; + if (priv->halo_child_up[idx]) { + *event = GF_EVENT_CHILD_UP; + } } else { gf_log(child_xlator->name, GF_LOG_INFO, "Not marking child %d up, " @@ -5187,9 +6010,15 @@ __afr_handle_child_up_event(xlator_t *this, xlator_t *child_xlator, * want to set the child_latency to MAX to indicate * the child needs ping data to be available before doing child-up */ - if (child_latency_msec < 0 && priv->halo_enabled) { + if (!priv->halo_enabled) + goto out; + + if (child_latency_msec < 0) { /*set to INT64_MAX-1 so that it is found for best_down_child*/ - priv->child_latency[idx] = AFR_HALO_MAX_LATENCY; + priv->halo_child_up[idx] = 1; + if (priv->child_latency[idx] < 0) { + priv->child_latency[idx] = AFR_HALO_MAX_LATENCY; + } } /* @@ -5227,13 +6056,14 @@ __afr_handle_child_up_event(xlator_t *this, xlator_t *child_xlator, "up_children (%d) > halo_max_replicas (%d)", worst_up_child, up_children, priv->halo_max_replicas); } - +out: if (up_children == 1) { gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOL_UP, "Subvolume '%s' came back up; " "going online.", child_xlator->name); - gf_event(EVENT_AFR_SUBVOL_UP, "subvol=%s", this->name); + gf_event(EVENT_AFR_SUBVOL_UP, "client-pid=%d; subvol=%s", + this->ctx->cmd_args.client_pid, this->name); } else { *event = GF_EVENT_SOME_DESCENDENT_UP; } @@ -5277,6 +6107,7 @@ __afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx, */ if (child_latency_msec < 0) { priv->child_latency[idx] = child_latency_msec; + priv->halo_child_up[idx] = 0; } priv->child_up[idx] = 0; @@ -5289,7 +6120,7 @@ __afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx, * as we want it to be up to date if we are going to * begin using it synchronously. 
*/ - if (up_children < priv->halo_min_replicas) { + if (priv->halo_enabled && up_children < priv->halo_min_replicas) { best_down_child = find_best_down_child(this); if (best_down_child >= 0) { gf_msg_debug(this->name, 0, @@ -5301,7 +6132,6 @@ __afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx, *up_child = best_down_child; } } - for (i = 0; i < priv->child_count; i++) if (priv->child_up[i] == 0) down_children++; @@ -5310,7 +6140,8 @@ __afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx, "All subvolumes are down. Going " "offline until at least one of them " "comes back up."); - gf_event(EVENT_AFR_SUBVOLS_DOWN, "subvol=%s", this->name); + gf_event(EVENT_AFR_SUBVOLS_DOWN, "client-pid=%d; subvol=%s", + this->ctx->cmd_args.client_pid, this->name); } else { *event = GF_EVENT_SOME_DESCENDENT_DOWN; } @@ -5359,6 +6190,11 @@ afr_handle_inodelk_contention(xlator_t *this, struct gf_upcall *upcall) } LOCK(&priv->lock); { + if (priv->release_ta_notify_dom_lock == _gf_true) { + /* Ignore multiple release requests from shds.*/ + UNLOCK(&priv->lock); + return; + } priv->release_ta_notify_dom_lock = _gf_true; inmem_count = priv->ta_in_mem_txn_count; onwire_count = priv->ta_on_wire_txn_count; @@ -5366,7 +6202,7 @@ afr_handle_inodelk_contention(xlator_t *this, struct gf_upcall *upcall) UNLOCK(&priv->lock); if (inmem_count || onwire_count) /* lock release will happen in txn code path after - * inflight or on-wire txns are over.*/ + * in-memory or on-wire txns are over.*/ return; afr_ta_lock_release_synctask(this); @@ -5467,13 +6303,13 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2) had_quorum = priv->quorum_count && afr_has_quorum(priv->child_up, this, NULL); - if (priv->halo_enabled) { - halo_max_latency_msec = afr_get_halo_latency(this); + if (event == GF_EVENT_CHILD_PING) { + child_latency_msec = (int64_t)(uintptr_t)data2; + if (priv->halo_enabled) { + halo_max_latency_msec = afr_get_halo_latency(this); - if (event == GF_EVENT_CHILD_PING) { /* Calculates the child latency and sets event */ - child_latency_msec = (int64_t)(uintptr_t)data2; LOCK(&priv->lock); { __afr_handle_ping_event(this, child_xlator, idx, @@ -5481,6 +6317,12 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2) child_latency_msec); } UNLOCK(&priv->lock); + } else { + LOCK(&priv->lock); + { + priv->child_latency[idx] = child_latency_msec; + } + UNLOCK(&priv->lock); } } @@ -5524,22 +6366,27 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2) if (priv->thin_arbiter_count && (idx == AFR_CHILD_THIN_ARBITER)) { priv->ta_child_up = 1; + priv->ta_event_gen++; break; } __afr_handle_child_up_event(this, child_xlator, idx, child_latency_msec, &event, &call_psh, &up_child); + __afr_lock_heal_synctask(this, priv, idx); break; case GF_EVENT_CHILD_DOWN: if (priv->thin_arbiter_count && (idx == AFR_CHILD_THIN_ARBITER)) { priv->ta_child_up = 0; + priv->ta_event_gen++; + afr_ta_locked_priv_invalidate(priv); break; } __afr_handle_child_down_event(this, child_xlator, idx, child_latency_msec, &event, &call_psh, &up_child); + __afr_mark_pending_lk_heal(this, priv, idx); break; case GF_EVENT_CHILD_CONNECTING: @@ -5585,12 +6432,14 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2) if (!had_quorum && has_quorum) { gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_QUORUM_MET, "Client-quorum is met"); - gf_event(EVENT_AFR_QUORUM_MET, "subvol=%s", this->name); + gf_event(EVENT_AFR_QUORUM_MET, "client-pid=%d; subvol=%s", + 
this->ctx->cmd_args.client_pid, this->name); } if (had_quorum && !has_quorum) { gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_QUORUM_FAIL, "Client-quorum is not met"); - gf_event(EVENT_AFR_QUORUM_FAIL, "subvol=%s", this->name); + gf_event(EVENT_AFR_QUORUM_FAIL, "client-pid=%d; subvol=%s", + this->ctx->cmd_args.client_pid, this->name); } } @@ -5609,10 +6458,8 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2) * b) Already heard from everyone, but we now got a child-up * event. */ - if (have_heard_from_all && priv->shd.iamshd) { - for (i = 0; i < priv->child_count; i++) - if (priv->child_up[i]) - afr_selfheal_childup(this, i); + if (have_heard_from_all) { + afr_selfheal_childup(this, priv); } } out: @@ -5633,7 +6480,7 @@ afr_local_init(afr_local_t *local, afr_private_t *priv, int32_t *op_errno) goto out; } - local->child_up = GF_CALLOC(priv->child_count, sizeof(*local->child_up), + local->child_up = GF_MALLOC(priv->child_count * sizeof(*local->child_up), gf_afr_mt_char); if (!local->child_up) { if (op_errno) @@ -5692,6 +6539,9 @@ afr_local_init(afr_local_t *local, afr_private_t *priv, int32_t *op_errno) if (priv->thin_arbiter_count) { local->ta_child_up = priv->ta_child_up; local->ta_failed_subvol = AFR_CHILD_UNKNOWN; + local->read_txn_query_child = AFR_CHILD_UNKNOWN; + local->ta_event_gen = priv->ta_event_gen; + local->fop_state = TA_SUCCESS; } local->is_new_entry = _gf_false; @@ -5763,6 +6613,10 @@ afr_transaction_local_init(afr_local_t *local, xlator_t *this) afr_private_t *priv = NULL; priv = this->private; + INIT_LIST_HEAD(&local->transaction.wait_list); + INIT_LIST_HEAD(&local->transaction.owner_list); + INIT_LIST_HEAD(&local->ta_waitq); + INIT_LIST_HEAD(&local->ta_onwireq); ret = afr_internal_lock_init(&local->internal_lock, priv->child_count); if (ret < 0) goto out; @@ -5800,10 +6654,6 @@ afr_transaction_local_init(afr_local_t *local, xlator_t *this) goto out; ret = 0; - INIT_LIST_HEAD(&local->transaction.wait_list); - INIT_LIST_HEAD(&local->transaction.owner_list); - INIT_LIST_HEAD(&local->ta_waitq); - INIT_LIST_HEAD(&local->ta_onwireq); out: return ret; } @@ -5822,6 +6672,8 @@ afr_priv_destroy(afr_private_t *priv) if (!priv) goto out; + + GF_FREE(priv->sh_domain); GF_FREE(priv->last_event); child_count = priv->child_count; @@ -5837,7 +6689,9 @@ afr_priv_destroy(afr_private_t *priv) GF_FREE(priv->local); GF_FREE(priv->pending_key); GF_FREE(priv->children); + GF_FREE(priv->anon_inode); GF_FREE(priv->child_up); + GF_FREE(priv->halo_child_up); GF_FREE(priv->child_latency); LOCK_DESTROY(&priv->lock); @@ -5888,274 +6742,218 @@ out: return changelog; } -gf_boolean_t -afr_decide_heal_info(afr_private_t *priv, unsigned char *sources, int source) +static dict_t * +afr_set_heal_info(char *status) { - int sources_count = 0; + dict_t *dict = NULL; + int ret = -1; - if (source < 0) + dict = dict_new(); + if (!dict) { + ret = -ENOMEM; goto out; + } - sources_count = AFR_COUNT(sources, priv->child_count); - if (sources_count == priv->child_count) - return _gf_false; + ret = dict_set_dynstr_sizen(dict, "heal-info", status); + if (ret) + gf_msg("", GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED, + "Failed to set heal-info key to " + "%s", + status); out: - return _gf_true; + /* Any error other than EINVAL, dict_set_dynstr frees status */ + if (ret == -ENOMEM || ret == -EINVAL) { + GF_FREE(status); + } + + if (ret && dict) { + dict_unref(dict); + dict = NULL; + } + return dict; } -int -afr_selfheal_locked_metadata_inspect(call_frame_t *frame, xlator_t *this, - inode_t *inode, 
gf_boolean_t *msh, - unsigned char *pending) +static gf_boolean_t +afr_is_dirty_count_non_unary_for_txn(xlator_t *this, struct afr_reply *replies, + afr_transaction_type type) { - int ret = -1; - unsigned char *locked_on = NULL; - unsigned char *sources = NULL; - unsigned char *sinks = NULL; - unsigned char *healed_sinks = NULL; - unsigned char *undid_pending = NULL; - struct afr_reply *locked_replies = NULL; - afr_private_t *priv = this->private; + int *dirty = alloca0(priv->child_count * sizeof(int)); + int i = 0; - locked_on = alloca0(priv->child_count); - sources = alloca0(priv->child_count); - sinks = alloca0(priv->child_count); - healed_sinks = alloca0(priv->child_count); - undid_pending = alloca0(priv->child_count); - - locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count); - - ret = afr_selfheal_inodelk(frame, this, inode, this->name, LLONG_MAX - 1, 0, - locked_on); - { - if (ret == 0) { - /* Not a single lock */ - ret = -afr_final_errno(frame->local, priv); - if (ret == 0) - ret = -ENOTCONN; /* all invalid responses */ - goto out; - } - ret = __afr_selfheal_metadata_prepare( - frame, this, inode, locked_on, sources, sinks, healed_sinks, - undid_pending, locked_replies, pending); - *msh = afr_decide_heal_info(priv, sources, ret); + afr_selfheal_extract_xattr(this, replies, type, dirty, NULL); + for (i = 0; i < priv->child_count; i++) { + if (dirty[i] > 1) + return _gf_true; } - afr_selfheal_uninodelk(frame, this, inode, this->name, LLONG_MAX - 1, 0, - locked_on); -out: - if (locked_replies) - afr_replies_wipe(locked_replies, priv->child_count); - return ret; + + return _gf_false; } -int -afr_selfheal_locked_data_inspect(call_frame_t *frame, xlator_t *this, fd_t *fd, - gf_boolean_t *dsh, unsigned char *pflag) +static gf_boolean_t +afr_is_dirty_count_non_unary(xlator_t *this, struct afr_reply *replies, + ia_type_t ia_type) { - int ret = -1; - unsigned char *data_lock = NULL; - unsigned char *sources = NULL; - unsigned char *sinks = NULL; - unsigned char *healed_sinks = NULL; - unsigned char *undid_pending = NULL; - afr_private_t *priv = NULL; - struct afr_reply *locked_replies = NULL; - inode_t *inode = fd->inode; + gf_boolean_t data_chk = _gf_false; + gf_boolean_t mdata_chk = _gf_false; + gf_boolean_t entry_chk = _gf_false; - priv = this->private; - data_lock = alloca0(priv->child_count); - sources = alloca0(priv->child_count); - sinks = alloca0(priv->child_count); - healed_sinks = alloca0(priv->child_count); - undid_pending = alloca0(priv->child_count); - - locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count); + switch (ia_type) { + case IA_IFDIR: + mdata_chk = _gf_true; + entry_chk = _gf_true; + break; + case IA_IFREG: + mdata_chk = _gf_true; + data_chk = _gf_true; + break; + default: + /*IA_IFBLK, IA_IFCHR, IA_IFLNK, IA_IFIFO, IA_IFSOCK*/ + mdata_chk = _gf_true; + break; + } - ret = afr_selfheal_inodelk(frame, this, inode, this->name, 0, 0, data_lock); - { - if (ret == 0) { - ret = -afr_final_errno(frame->local, priv); - if (ret == 0) - ret = -ENOTCONN; /* all invalid responses */ - goto out; - } - ret = __afr_selfheal_data_prepare(frame, this, inode, data_lock, - sources, sinks, healed_sinks, - undid_pending, locked_replies, pflag); - *dsh = afr_decide_heal_info(priv, sources, ret); + if (data_chk && afr_is_dirty_count_non_unary_for_txn( + this, replies, AFR_DATA_TRANSACTION)) { + return _gf_true; + } else if (mdata_chk && afr_is_dirty_count_non_unary_for_txn( + this, replies, AFR_METADATA_TRANSACTION)) { + return _gf_true; + } else if (entry_chk 
&& afr_is_dirty_count_non_unary_for_txn( + this, replies, AFR_ENTRY_TRANSACTION)) { + return _gf_true; } - afr_selfheal_uninodelk(frame, this, inode, this->name, 0, 0, data_lock); -out: - if (locked_replies) - afr_replies_wipe(locked_replies, priv->child_count); - return ret; + + return _gf_false; } -int -afr_selfheal_locked_entry_inspect(call_frame_t *frame, xlator_t *this, - inode_t *inode, gf_boolean_t *esh, - unsigned char *pflag) +static int +afr_update_heal_status(xlator_t *this, struct afr_reply *replies, + ia_type_t ia_type, gf_boolean_t *esh, gf_boolean_t *dsh, + gf_boolean_t *msh, unsigned char pending) { int ret = -1; - int source = -1; + GF_UNUSED int ret1 = 0; + int i = 0; + int io_domain_lk_count = 0; + int shd_domain_lk_count = 0; afr_private_t *priv = NULL; - unsigned char *locked_on = NULL; - unsigned char *data_lock = NULL; - unsigned char *sources = NULL; - unsigned char *sinks = NULL; - unsigned char *healed_sinks = NULL; - struct afr_reply *locked_replies = NULL; - gf_boolean_t granular_locks = _gf_false; + char *key1 = NULL; + char *key2 = NULL; priv = this->private; - granular_locks = priv->granular_locks; /*Assign to local variable so that - reconfigure doesn't change this - value between locking and unlocking - below*/ - locked_on = alloca0(priv->child_count); - data_lock = alloca0(priv->child_count); - sources = alloca0(priv->child_count); - sinks = alloca0(priv->child_count); - healed_sinks = alloca0(priv->child_count); + key1 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 + + strlen(this->name)); + key2 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 + + strlen(priv->sh_domain)); + sprintf(key1, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, this->name); + sprintf(key2, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, priv->sh_domain); - locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count); - - if (!granular_locks) { - ret = afr_selfheal_tryentrylk(frame, this, inode, priv->sh_domain, NULL, - locked_on); - } - { - if (!granular_locks && ret == 0) { - ret = -afr_final_errno(frame->local, priv); - if (ret == 0) - ret = -ENOTCONN; /* all invalid responses */ - goto out; + for (i = 0; i < priv->child_count; i++) { + if ((replies[i].valid != 1) || (replies[i].op_ret != 0)) + continue; + if (!io_domain_lk_count) { + ret1 = dict_get_int32(replies[i].xdata, key1, &io_domain_lk_count); + } + if (!shd_domain_lk_count) { + ret1 = dict_get_int32(replies[i].xdata, key2, &shd_domain_lk_count); } + } - ret = afr_selfheal_entrylk(frame, this, inode, this->name, NULL, - data_lock); - { - if (ret == 0) { - ret = -afr_final_errno(frame->local, priv); - if (ret == 0) - ret = -ENOTCONN; - /* all invalid responses */ - goto unlock; - } - ret = __afr_selfheal_entry_prepare(frame, this, inode, data_lock, - sources, sinks, healed_sinks, - locked_replies, &source, pflag); - if ((ret == 0) && (*pflag & PFLAG_SBRAIN)) - ret = -EIO; - *esh = afr_decide_heal_info(priv, sources, ret); + if (!pending) { + if ((afr_is_dirty_count_non_unary(this, replies, ia_type)) || + (!io_domain_lk_count)) { + /* Needs heal. */ + ret = 0; + } else { + /* No heal needed. */ + *dsh = *esh = *msh = 0; + } + } else { + if (shd_domain_lk_count) { + ret = -EAGAIN; /*For 'possibly-healing'. */ + } else { + ret = 0; /*needs heal. 
Just set a non -ve value so that it is + assumed as the source index.*/ } - afr_selfheal_unentrylk(frame, this, inode, this->name, NULL, data_lock, - NULL); } -unlock: - if (!granular_locks) - afr_selfheal_unentrylk(frame, this, inode, priv->sh_domain, NULL, - locked_on, NULL); -out: - if (locked_replies) - afr_replies_wipe(locked_replies, priv->child_count); return ret; } +/*return EIO, EAGAIN or pending*/ int -afr_selfheal_locked_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid, - inode_t **inode, gf_boolean_t *entry_selfheal, - gf_boolean_t *data_selfheal, - gf_boolean_t *metadata_selfheal, - unsigned char *pending) - +afr_lockless_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid, + inode_t **inode, gf_boolean_t *entry_selfheal, + gf_boolean_t *data_selfheal, + gf_boolean_t *metadata_selfheal, unsigned char *pending) { int ret = -1; - fd_t *fd = NULL; + int i = 0; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; gf_boolean_t dsh = _gf_false; gf_boolean_t msh = _gf_false; gf_boolean_t esh = _gf_false; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + unsigned char *valid_on = NULL; + uint64_t *witness = NULL; + + priv = this->private; + replies = alloca0(sizeof(*replies) * priv->child_count); + sources = alloca0(sizeof(*sources) * priv->child_count); + sinks = alloca0(sizeof(*sinks) * priv->child_count); + witness = alloca0(sizeof(*witness) * priv->child_count); + valid_on = alloca0(sizeof(*valid_on) * priv->child_count); ret = afr_selfheal_unlocked_inspect(frame, this, gfid, inode, &dsh, &msh, - &esh); + &esh, replies); if (ret) goto out; - - /* For every heal type hold locks and check if it indeed needs heal */ - - /* Heal-info does an open() on the file being examined so that the - * current eager-lock holding client, if present, at some point sees - * open-fd count being > 1 and releases the eager-lock so that heal-info - * doesn't remain blocked forever until IO completes. 
- */ - if ((*inode)->ia_type == IA_IFREG) { - ret = afr_selfheal_data_open(this, *inode, &fd); - if (ret < 0) { - gf_msg_debug(this->name, -ret, "%s: Failed to open", - uuid_utoa((*inode)->gfid)); - goto out; + for (i = 0; i < priv->child_count; i++) { + if (replies[i].valid && replies[i].op_ret == 0) { + valid_on[i] = 1; } } - if (msh) { - ret = afr_selfheal_locked_metadata_inspect(frame, this, *inode, &msh, - pending); - if (ret == -EIO) + ret = afr_selfheal_find_direction(frame, this, replies, + AFR_METADATA_TRANSACTION, valid_on, + sources, sinks, witness, pending); + if (*pending & PFLAG_SBRAIN) + ret = -EIO; + if (ret) goto out; } - if (dsh) { - ret = afr_selfheal_locked_data_inspect(frame, this, fd, &dsh, pending); - if (ret == -EIO || (ret == -EAGAIN)) + ret = afr_selfheal_find_direction(frame, this, replies, + AFR_DATA_TRANSACTION, valid_on, + sources, sinks, witness, pending); + if (*pending & PFLAG_SBRAIN) + ret = -EIO; + if (ret) goto out; } - if (esh) { - ret = afr_selfheal_locked_entry_inspect(frame, this, *inode, &esh, - pending); + ret = afr_selfheal_find_direction(frame, this, replies, + AFR_ENTRY_TRANSACTION, valid_on, + sources, sinks, witness, pending); + if (*pending & PFLAG_SBRAIN) + ret = -EIO; + if (ret) + goto out; } + ret = afr_update_heal_status(this, replies, (*inode)->ia_type, &esh, &dsh, + &msh, *pending); out: *data_selfheal = dsh; *entry_selfheal = esh; *metadata_selfheal = msh; - if (fd) - fd_unref(fd); + if (replies) + afr_replies_wipe(replies, priv->child_count); return ret; } -static dict_t * -afr_set_heal_info(char *status) -{ - dict_t *dict = NULL; - int ret = -1; - - dict = dict_new(); - if (!dict) { - ret = -ENOMEM; - goto out; - } - - ret = dict_set_dynstr_sizen(dict, "heal-info", status); - if (ret) - gf_msg("", GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED, - "Failed to set heal-info key to " - "%s", - status); -out: - /* Any error other than EINVAL, dict_set_dynstr frees status */ - if (ret == -ENOMEM || ret == -EINVAL) { - GF_FREE(status); - } - - if (ret && dict) { - dict_unref(dict); - dict = NULL; - } - return dict; -} - int afr_get_heal_info(call_frame_t *frame, xlator_t *this, loc_t *loc) { @@ -6169,10 +6967,21 @@ afr_get_heal_info(call_frame_t *frame, xlator_t *this, loc_t *loc) inode_t *inode = NULL; char *substr = NULL; char *status = NULL; + call_frame_t *heal_frame = NULL; + afr_local_t *heal_local = NULL; + + /*Use frame with lk-owner set*/ + heal_frame = afr_frame_create(frame->this, &op_errno); + if (!heal_frame) { + ret = -1; + goto out; + } + heal_local = heal_frame->local; + heal_frame->local = frame->local; - ret = afr_selfheal_locked_inspect(frame, this, loc->gfid, &inode, - &entry_selfheal, &data_selfheal, - &metadata_selfheal, &pending); + ret = afr_lockless_inspect(heal_frame, this, loc->gfid, &inode, + &entry_selfheal, &data_selfheal, + &metadata_selfheal, &pending); if (ret == -ENOMEM) { ret = -1; @@ -6257,6 +7066,10 @@ afr_get_heal_info(call_frame_t *frame, xlator_t *this, loc_t *loc) op_errno = 0; out: + if (heal_frame) { + heal_frame->local = heal_local; + AFR_STACK_DESTROY(heal_frame); + } AFR_STACK_UNWIND(getxattr, frame, ret, op_errno, dict, NULL); if (dict) dict_unref(dict); @@ -6453,6 +7266,8 @@ afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc) int op_errno = 0; dict_t *dict = NULL; afr_local_t *local = NULL; + afr_local_t *heal_local = NULL; + call_frame_t *heal_frame = NULL; local = frame->local; dict = dict_new(); @@ -6462,7 +7277,16 @@ afr_heal_splitbrain_file(call_frame_t *frame, 
xlator_t *this, loc_t *loc) goto out; } - ret = afr_selfheal_do(frame, this, loc->gfid); + heal_frame = afr_frame_create(this, &op_errno); + if (!heal_frame) { + ret = -1; + goto out; + } + heal_local = heal_frame->local; + heal_frame->local = frame->local; + /*Initiate heal with heal_frame with lk-owner set so that inodelk/entrylk + * work correctly*/ + ret = afr_selfheal_do(heal_frame, this, loc->gfid); if (ret == 1 || ret == 2) { ret = dict_set_sizen_str_sizen(dict, "sh-fail-msg", @@ -6484,6 +7308,10 @@ afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc) } out: + if (heal_frame) { + heal_frame->local = heal_local; + AFR_STACK_DESTROY(heal_frame); + } if (local->op == GF_FOP_GETXATTR) AFR_STACK_UNWIND(getxattr, frame, ret, op_errno, dict, NULL); else if (local->op == GF_FOP_SETXATTR) @@ -6626,7 +7454,7 @@ afr_fav_child_reset_sink_xattrs(void *opaque) ret = afr_selfheal_inodelk(heal_frame, this, inode, this->name, 0, 0, locked_on); { - if (ret < AFR_SH_MIN_PARTICIPANTS) + if (ret < priv->child_count) goto data_unlock; ret = __afr_selfheal_data_prepare( heal_frame, this, inode, locked_on, sources, sinks, @@ -6643,7 +7471,7 @@ afr_fav_child_reset_sink_xattrs(void *opaque) ret = afr_selfheal_inodelk(heal_frame, this, inode, this->name, LLONG_MAX - 1, 0, locked_on); { - if (ret < AFR_SH_MIN_PARTICIPANTS) + if (ret < priv->child_count) goto mdata_unlock; ret = __afr_selfheal_metadata_prepare( heal_frame, this, inode, locked_on, sources, sinks, @@ -6975,16 +7803,16 @@ afr_ta_has_quorum(afr_private_t *priv, afr_local_t *local) return _gf_false; } -gf_boolean_t +static gf_boolean_t afr_is_add_replica_mount_lookup_on_root(call_frame_t *frame) { afr_local_t *local = NULL; - local = frame->local; - if (frame->root->pid != GF_CLIENT_PID_ADD_REPLICA_MOUNT) return _gf_false; + local = frame->local; + if (local->op != GF_FOP_LOOKUP) /* TODO:If the replica count is being increased on a plain distribute * volume that was never mounted, we need to allow setxattr on '/' with @@ -7001,14 +7829,49 @@ afr_is_add_replica_mount_lookup_on_root(call_frame_t *frame) } gf_boolean_t -afr_lookup_has_quorum(call_frame_t *frame, xlator_t *this, - unsigned char *subvols) +afr_lookup_has_quorum(call_frame_t *frame, const unsigned int up_children_count) { + if (frame && (up_children_count > 0) && + afr_is_add_replica_mount_lookup_on_root(frame)) + return _gf_true; + + return _gf_false; +} + +void +afr_handle_replies_quorum(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = frame->local; afr_private_t *priv = this->private; + unsigned char *success_replies = NULL; - if (frame && afr_is_add_replica_mount_lookup_on_root(frame)) { - if (AFR_COUNT(subvols, priv->child_count) > 0) - return _gf_true; + success_replies = alloca0(priv->child_count); + afr_fill_success_replies(local, priv, success_replies); + + if (priv->quorum_count && !afr_has_quorum(success_replies, this, NULL)) { + local->op_errno = afr_final_errno(local, priv); + if (!local->op_errno) + local->op_errno = afr_quorum_errno(priv); + local->op_ret = -1; + } +} + +gf_boolean_t +afr_ta_dict_contains_pending_xattr(dict_t *dict, afr_private_t *priv, int child) +{ + int *pending = NULL; + int ret = 0; + int i = 0; + + ret = dict_get_ptr(dict, priv->pending_key[child], (void *)&pending); + if (ret == 0) { + for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) { + /* Not doing a ntoh32(pending) as we just want to check + * if it is non-zero or not. 
*/ + if (pending[i]) { + return _gf_true; + } + } } return _gf_false; diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c index 6307b637f8d..f8bf8340dab 100644 --- a/xlators/cluster/afr/src/afr-dir-read.c +++ b/xlators/cluster/afr/src/afr-dir-read.c @@ -10,7 +10,6 @@ #include <libgen.h> #include <unistd.h> -#include <fnmatch.h> #include <sys/time.h> #include <stdlib.h> #include <signal.h> @@ -18,16 +17,10 @@ #include <glusterfs/glusterfs.h> #include <glusterfs/dict.h> -#include <glusterfs/xlator.h> -#include <glusterfs/hashfn.h> -#include <glusterfs/logging.h> #include <glusterfs/list.h> -#include <glusterfs/call-stub.h> -#include <glusterfs/defaults.h> #include <glusterfs/common-utils.h> #include <glusterfs/compat-errno.h> #include <glusterfs/compat.h> -#include <glusterfs/checksum.h> #include "afr.h" #include "afr-transaction.h" @@ -45,6 +38,10 @@ afr_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, fd_ctx = local->fd_ctx; child_index = (long)cookie; + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + LOCK(&frame->lock); { if (op_ret == -1) { @@ -56,19 +53,22 @@ afr_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, if (!local->xdata_rsp && xdata) local->xdata_rsp = dict_ref(xdata); } + call_count = --local->call_count; } UNLOCK(&frame->lock); - call_count = afr_frame_return(frame); - - if (call_count == 0) + if (call_count == 0) { + afr_handle_replies_quorum(frame, this); AFR_STACK_UNWIND(opendir, frame, local->op_ret, local->op_errno, local->fd, NULL); + } + return 0; } int -afr_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) +afr_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; @@ -84,6 +84,12 @@ afr_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) goto out; local->op = GF_FOP_OPENDIR; + + if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) { + op_errno = afr_quorum_errno(priv); + goto out; + } + if (!afr_is_consistent_io_possible(local, priv, &op_errno)) goto out; @@ -158,8 +164,8 @@ afr_validate_read_subvol(inode_t *inode, xlator_t *this, int par_read_subvol) } static void -afr_readdir_transform_entries(gf_dirent_t *subvol_entries, int subvol, - gf_dirent_t *entries, fd_t *fd) +afr_readdir_transform_entries(call_frame_t *frame, gf_dirent_t *subvol_entries, + int subvol, gf_dirent_t *entries, fd_t *fd) { int ret = -1; gf_dirent_t *entry = NULL; @@ -177,8 +183,8 @@ afr_readdir_transform_entries(gf_dirent_t *subvol_entries, int subvol, list_for_each_entry_safe(entry, tmp, &subvol_entries->list, list) { - if (__is_root_gfid(fd->inode->gfid) && - !strcmp(entry->d_name, GF_REPLICATE_TRASH_DIR)) { + if (afr_is_private_directory(priv, fd->inode->gfid, entry->d_name, + frame->root->pid)) { continue; } @@ -222,8 +228,8 @@ afr_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, } if (op_ret >= 0) - afr_readdir_transform_entries(subvol_entries, (long)cookie, &entries, - local->fd); + afr_readdir_transform_entries(frame, subvol_entries, (long)cookie, + &entries, local->fd); AFR_STACK_UNWIND(readdir, frame, op_ret, op_errno, &entries, xdata); diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c index 84e2a344624..b7cceb79158 100644 --- a/xlators/cluster/afr/src/afr-dir-write.c +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -10,7 +10,6 @@ 
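The afr_opendir_cbk() change above replaces afr_frame_return() with a call_count decrement under the frame lock, and only unwinds once every child has replied, at which point afr_handle_replies_quorum() may turn a partial success into a failure. A minimal sketch of that reply-accounting pattern, assuming the afr_local_t/afr_private_t fields shown in this patch and that local->op_ret starts at -1; example_fop_cbk and example_unwind are illustrative names, while afr_fill_success_replies, afr_has_quorum, afr_final_errno and afr_quorum_errno are the helpers this patch actually uses:

    static int
    example_fop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
                    int op_ret, int op_errno, dict_t *xdata)
    {
        afr_local_t *local = frame->local;
        afr_private_t *priv = this->private;
        int child = (long)cookie;
        int call_count = 0;
        unsigned char *success = alloca0(priv->child_count);

        /* Each child writes only its own reply slot, so no lock is needed. */
        local->replies[child].valid = 1;
        local->replies[child].op_ret = op_ret;
        local->replies[child].op_errno = op_errno;

        LOCK(&frame->lock);
        {
            if (op_ret == 0)
                local->op_ret = 0;
            call_count = --local->call_count;
        }
        UNLOCK(&frame->lock);

        if (call_count == 0) {
            /* Decide the fop's fate only after the last reply: discard the
             * success if the successful subvolumes do not form client-quorum. */
            afr_fill_success_replies(local, priv, success);
            if (priv->quorum_count && !afr_has_quorum(success, this, NULL)) {
                local->op_errno = afr_final_errno(local, priv);
                if (!local->op_errno)
                    local->op_errno = afr_quorum_errno(priv);
                local->op_ret = -1;
            }
            example_unwind(frame, local->op_ret, local->op_errno);
        }
        return 0;
    }

Folding the decrement into the existing LOCK/UNLOCK block also removes the extra lock round-trip that afr_frame_return() used to take.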
#include <libgen.h> #include <unistd.h> -#include <fnmatch.h> #include <sys/time.h> #include <stdlib.h> #include <signal.h> @@ -18,11 +17,8 @@ #include <glusterfs/glusterfs.h> #include "afr.h" #include <glusterfs/dict.h> -#include <glusterfs/xlator.h> -#include <glusterfs/hashfn.h> #include <glusterfs/logging.h> #include <glusterfs/list.h> -#include <glusterfs/call-stub.h> #include <glusterfs/defaults.h> #include <glusterfs/common-utils.h> #include <glusterfs/compat-errno.h> @@ -123,11 +119,11 @@ __afr_dir_write_finalize(call_frame_t *frame, xlator_t *this) continue; if (local->replies[i].op_ret < 0) { if (local->inode) - afr_inode_event_gen_reset(local->inode, this); + afr_inode_need_refresh_set(local->inode, this); if (local->parent) - afr_inode_event_gen_reset(local->parent, this); + afr_inode_need_refresh_set(local->parent, this); if (local->parent2) - afr_inode_event_gen_reset(local->parent2, this); + afr_inode_need_refresh_set(local->parent2, this); continue; } @@ -233,9 +229,9 @@ __afr_dir_write_cbk(call_frame_t *frame, void *cookie, xlator_t *this, __afr_dir_write_fill(frame, this, child_index, op_ret, op_errno, buf, preparent, postparent, preparent2, postparent2, xdata); + call_count = --local->call_count; } UNLOCK(&frame->lock); - call_count = afr_frame_return(frame); if (call_count == 0) { __afr_dir_write_finalize(frame, this); @@ -349,6 +345,7 @@ afr_mark_entry_pending_changelog(call_frame_t *frame, xlator_t *this) afr_private_t *priv = NULL; int pre_op_count = 0; int failed_count = 0; + unsigned char *success_replies = NULL; local = frame->local; priv = this->private; @@ -364,9 +361,16 @@ afr_mark_entry_pending_changelog(call_frame_t *frame, xlator_t *this) failed_count = AFR_COUNT(local->transaction.failed_subvols, priv->child_count); + /* FOP succeeded on all bricks. */ if (pre_op_count == priv->child_count && !failed_count) return; + /* FOP did not suceed on quorum no. of bricks. 
*/ + success_replies = alloca0(priv->child_count); + afr_fill_success_replies(local, priv, success_replies); + if (!afr_has_quorum(success_replies, this, NULL)) + return; + if (priv->thin_arbiter_count) { /*Mark new entry using ta file*/ local->is_new_entry = _gf_true; diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c index 523a5b48880..c5521704de2 100644 --- a/xlators/cluster/afr/src/afr-inode-read.c +++ b/xlators/cluster/afr/src/afr-inode-read.c @@ -18,11 +18,8 @@ #include <glusterfs/glusterfs.h> #include "afr.h" #include <glusterfs/dict.h> -#include <glusterfs/xlator.h> -#include <glusterfs/hashfn.h> #include <glusterfs/logging.h> #include <glusterfs/list.h> -#include <glusterfs/call-stub.h> #include <glusterfs/byte-order.h> #include <glusterfs/defaults.h> #include <glusterfs/common-utils.h> @@ -305,6 +302,7 @@ afr_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) afr_local_t *local = NULL; int op_errno = 0; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); local = AFR_FRAME_INIT(frame, op_errno); if (!local) goto out; @@ -948,24 +946,13 @@ unlock: goto unwind; } - len = dict_serialized_length(local->dict); - if (len <= 0) { - goto unwind; - } - - lockinfo_buf = GF_CALLOC(1, len, gf_common_mt_char); - if (!lockinfo_buf) { + op_ret = dict_allocate_and_serialize( + local->dict, (char **)&lockinfo_buf, (unsigned int *)&len); + if (op_ret != 0) { local->op_ret = -1; - local->op_errno = ENOMEM; goto unwind; } - op_ret = dict_serialize(local->dict, lockinfo_buf); - if (op_ret < 0) { - local->op_ret = -1; - local->op_errno = -op_ret; - } - op_ret = dict_set_dynptr(newdict, GF_XATTR_LOCKINFO_KEY, (void *)lockinfo_buf, len); if (op_ret < 0) { @@ -1064,24 +1051,13 @@ unlock: goto unwind; } - len = dict_serialized_length(local->dict); - if (len <= 0) { - goto unwind; - } - - lockinfo_buf = GF_CALLOC(1, len, gf_common_mt_char); - if (!lockinfo_buf) { + op_ret = dict_allocate_and_serialize( + local->dict, (char **)&lockinfo_buf, (unsigned int *)&len); + if (op_ret != 0) { local->op_ret = -1; - local->op_errno = ENOMEM; goto unwind; } - op_ret = dict_serialize(local->dict, lockinfo_buf); - if (op_ret < 0) { - local->op_ret = -1; - local->op_errno = -op_ret; - } - op_ret = dict_set_dynptr(newdict, GF_XATTR_LOCKINFO_KEY, (void *)lockinfo_buf, len); if (op_ret < 0) { @@ -1723,6 +1699,7 @@ afr_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, int32_t op_errno = 0; fop_fgetxattr_cbk_t cbk = NULL; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); local = AFR_FRAME_INIT(frame, op_errno); if (!local) goto out; @@ -1816,6 +1793,7 @@ afr_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, afr_local_t *local = NULL; int32_t op_errno = 0; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); local = AFR_FRAME_INIT(frame, op_errno); if (!local) goto out; @@ -1891,6 +1869,7 @@ afr_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, afr_local_t *local = NULL; int32_t op_errno = 0; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); local = AFR_FRAME_INIT(frame, op_errno); if (!local) goto out; diff --git a/xlators/cluster/afr/src/afr-inode-read.h b/xlators/cluster/afr/src/afr-inode-read.h index 1627ee2c426..8c982bc7e6f 100644 --- a/xlators/cluster/afr/src/afr-inode-read.h +++ b/xlators/cluster/afr/src/afr-inode-read.h @@ -38,5 +38,8 @@ afr_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, dict_t *xdata); int +afr_seek(call_frame_t *frame, 
xlator_t *this, fd_t *fd, off_t offset, + gf_seek_what_t what, dict_t *xdata); +int afr_handle_quota_size(call_frame_t *frame, xlator_t *this); #endif /* __INODE_READ_H__ */ diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 7fcc9d48ada..1d6e4f3570a 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -8,9 +8,7 @@ cases as published by the Free Software Foundation. */ -#include <libgen.h> #include <unistd.h> -#include <fnmatch.h> #include <sys/time.h> #include <stdlib.h> #include <signal.h> @@ -18,11 +16,7 @@ #include <glusterfs/glusterfs.h> #include "afr.h" #include <glusterfs/dict.h> -#include <glusterfs/xlator.h> -#include <glusterfs/hashfn.h> #include <glusterfs/logging.h> -#include <glusterfs/list.h> -#include <glusterfs/call-stub.h> #include <glusterfs/defaults.h> #include <glusterfs/common-utils.h> #include <glusterfs/compat-errno.h> @@ -180,11 +174,10 @@ __afr_inode_write_cbk(call_frame_t *frame, void *cookie, xlator_t *this, { __afr_inode_write_fill(frame, this, child_index, op_ret, op_errno, prebuf, postbuf, xattr, xdata); + call_count = --local->call_count; } UNLOCK(&frame->lock); - call_count = afr_frame_return(frame); - if (call_count == 0) { __afr_inode_write_finalize(frame, this); @@ -498,6 +491,7 @@ afr_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, int op_errno = ENOMEM; int ret = -1; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); local = AFR_FRAME_INIT(frame, op_errno); if (!local) goto out; @@ -737,6 +731,7 @@ afr_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, int ret = -1; int op_errno = ENOMEM; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; @@ -947,6 +942,7 @@ afr_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *buf, int ret = -1; int op_errno = ENOMEM; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; @@ -1063,11 +1059,10 @@ afr_emptyb_set_pending_changelog_cbk(call_frame_t *frame, void *cookie, if (ret) goto out; - gf_msg(this->name, op_ret ? GF_LOG_ERROR : GF_LOG_INFO, - op_ret ? op_errno : 0, afr_get_msg_id(op_type), - "Set of pending xattr %s on" - " %s.", - op_ret ? "failed" : "succeeded", priv->children[i]->name); + gf_smsg(this->name, op_ret ? GF_LOG_ERROR : GF_LOG_INFO, + op_ret ? op_errno : 0, AFR_MSG_SET_PEND_XATTR, "name=%s", + priv->children[i]->name, "op_ret=%s", + op_ret ? 
"failed" : "succeeded", NULL); out: syncbarrier_wake(&local->barrier); @@ -1161,9 +1156,8 @@ _afr_handle_empty_brick_type(xlator_t *this, call_frame_t *frame, loc_t *loc, } if (!count) { - gf_msg(this->name, GF_LOG_ERROR, EAGAIN, AFR_MSG_REPLACE_BRICK_STATUS, - "Couldn't acquire lock on" - " any child."); + gf_smsg(this->name, GF_LOG_ERROR, EAGAIN, AFR_MSG_REPLACE_BRICK_STATUS, + NULL); ret = -EAGAIN; goto unlock; } @@ -1214,6 +1208,7 @@ _afr_handle_empty_brick(void *opaque) char *op_type = NULL; int op_type_len = 0; afr_empty_brick_args_t *data = NULL; + call_frame_t *op_frame = NULL; data = opaque; frame = data->frame; @@ -1221,21 +1216,29 @@ _afr_handle_empty_brick(void *opaque) if (!data->op_type) goto out; + op_frame = copy_frame(frame); + if (!op_frame) { + ret = -1; + op_errno = ENOMEM; + goto out; + } + op_type = data->op_type; op_type_len = strlen(op_type); - this = frame->this; + this = op_frame->this; priv = this->private; - local = AFR_FRAME_INIT(frame, op_errno); + afr_set_lk_owner(op_frame, this, op_frame->root); + local = AFR_FRAME_INIT(op_frame, op_errno); if (!local) goto out; loc_copy(&local->loc, &data->loc); - gf_msg(this->name, GF_LOG_INFO, 0, 0, "New brick is : %s", - priv->children[empty_index]->name); + gf_smsg(this->name, GF_LOG_INFO, 0, AFR_MSG_NEW_BRICK, "name=%s", + priv->children[empty_index]->name, NULL); - ret = _afr_handle_empty_brick_type(this, frame, &local->loc, empty_index, + ret = _afr_handle_empty_brick_type(this, op_frame, &local->loc, empty_index, AFR_METADATA_TRANSACTION, op_type, op_type_len); if (ret) { @@ -1251,7 +1254,7 @@ _afr_handle_empty_brick(void *opaque) local->xattr_req = NULL; local->xdata_req = NULL; - ret = _afr_handle_empty_brick_type(this, frame, &local->loc, empty_index, + ret = _afr_handle_empty_brick_type(this, op_frame, &local->loc, empty_index, AFR_ENTRY_TRANSACTION, op_type, op_type_len); if (ret) { @@ -1261,6 +1264,9 @@ _afr_handle_empty_brick(void *opaque) } ret = 0; out: + if (op_frame) { + AFR_STACK_DESTROY(op_frame); + } AFR_STACK_UNWIND(setxattr, frame, ret, op_errno, NULL); return 0; } @@ -1305,9 +1311,8 @@ afr_split_brain_resolve_do(call_frame_t *frame, xlator_t *this, loc_t *loc, */ ret = afr_inode_split_brain_choice_set(loc->inode, this, -1); if (ret) - gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, - "Failed to set" - "split-brain choice to -1"); + gf_smsg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SPLIT_BRAIN_SET_FAILED, + NULL); afr_heal_splitbrain_file(frame, this, loc); ret = 0; out: @@ -1330,8 +1335,8 @@ afr_get_split_brain_child_index(xlator_t *this, void *value, size_t len) spb_child_index = afr_get_child_index_from_name(this, spb_child_str); if (spb_child_index < 0) { - gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL, - "Invalid subvol: %s", spb_child_str); + gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL, + "subvol=%s", spb_child_str, NULL); } return spb_child_index; } @@ -1353,11 +1358,9 @@ afr_can_set_split_brain_choice(void *opaque) &data->m_spb); if (ret) - gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, - "Failed to determine if %s" - " is in split-brain. 
" - "Aborting split-brain-choice set.", - uuid_utoa(loc->gfid)); + gf_smsg(this->name, GF_LOG_ERROR, 0, + AFR_MSG_SPLIT_BRAIN_DETERMINE_FAILED, "gfid=%s", + uuid_utoa(loc->gfid), NULL); return ret; } @@ -1365,7 +1368,8 @@ int afr_handle_split_brain_commands(xlator_t *this, call_frame_t *frame, loc_t *loc, dict_t *dict) { - void *value = NULL; + void *choice_value = NULL; + void *resolve_value = NULL; afr_private_t *priv = NULL; afr_local_t *local = NULL; afr_spbc_timeout_t *data = NULL; @@ -1376,6 +1380,14 @@ afr_handle_split_brain_commands(xlator_t *this, call_frame_t *frame, loc_t *loc, priv = this->private; + ret = dict_get_ptr_and_len(dict, GF_AFR_SBRAIN_CHOICE, &choice_value, &len); + ret = dict_get_ptr_and_len(dict, GF_AFR_SBRAIN_RESOLVE, &resolve_value, + &len); + if (!choice_value && !resolve_value) { + ret = -1; + goto out; + } + local = AFR_FRAME_INIT(frame, op_errno); if (!local) { ret = 1; @@ -1384,9 +1396,9 @@ afr_handle_split_brain_commands(xlator_t *this, call_frame_t *frame, loc_t *loc, local->op = GF_FOP_SETXATTR; - ret = dict_get_ptr_and_len(dict, GF_AFR_SBRAIN_CHOICE, &value, &len); - if (value) { - spb_child_index = afr_get_split_brain_child_index(this, value, len); + if (choice_value) { + spb_child_index = afr_get_split_brain_child_index(this, choice_value, + len); if (spb_child_index < 0) { /* Case where value was "none" */ if (spb_child_index == -2) @@ -1410,12 +1422,8 @@ afr_handle_split_brain_commands(xlator_t *this, call_frame_t *frame, loc_t *loc, ret = synctask_new(this->ctx->env, afr_can_set_split_brain_choice, afr_set_split_brain_choice, NULL, data); if (ret) { - gf_msg(this->name, GF_LOG_ERROR, 0, - AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, - "Failed to create" - " synctask. Aborting split-brain choice set" - " for %s", - loc->name); + gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN_STATUS, + "name=%s", loc->name, NULL); ret = 1; op_errno = ENOMEM; goto out; @@ -1424,9 +1432,9 @@ afr_handle_split_brain_commands(xlator_t *this, call_frame_t *frame, loc_t *loc, goto out; } - ret = dict_get_ptr_and_len(dict, GF_AFR_SBRAIN_RESOLVE, &value, &len); - if (value) { - spb_child_index = afr_get_split_brain_child_index(this, value, len); + if (resolve_value) { + spb_child_index = afr_get_split_brain_child_index(this, resolve_value, + len); if (spb_child_index < 0) { ret = 1; goto out; @@ -1490,8 +1498,8 @@ afr_handle_empty_brick(xlator_t *this, call_frame_t *frame, loc_t *loc, goto out; if (frame->root->pid != GF_CLIENT_PID_ADD_REPLICA_MOUNT) { - gf_msg(this->name, GF_LOG_ERROR, EPERM, afr_get_msg_id(op_type), - "'%s' is an internal extended attribute.", op_type); + gf_smsg(this->name, GF_LOG_ERROR, EPERM, AFR_MSG_INTERNAL_ATTR, + "op_type=%s", op_type, NULL); ret = 1; goto out; } @@ -1517,8 +1525,8 @@ afr_handle_empty_brick(xlator_t *this, call_frame_t *frame, loc_t *loc, ret = synctask_new(this->ctx->env, _afr_handle_empty_brick, _afr_handle_empty_brick_cbk, NULL, data); if (ret) { - gf_msg(this->name, GF_LOG_ERROR, 0, afr_get_msg_id(op_type), - "Failed to create synctask."); + gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN_STATUS, + NULL); ret = 1; op_errno = ENOMEM; afr_brick_args_cleanup(data); @@ -1676,6 +1684,7 @@ afr_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, GF_IF_INTERNAL_XATTR_GOTO("trusted.glusterfs.afr.*", dict, op_errno, out); + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; @@ -1884,6 +1893,7 @@ afr_fremovexattr(call_frame_t *frame, 
xlator_t *this, fd_t *fd, GF_IF_NATIVE_XATTR_GOTO("trusted.glusterfs.afr.*", name, op_errno, out); + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; @@ -1984,6 +1994,7 @@ afr_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, int ret = -1; int op_errno = ENOMEM; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; @@ -2093,6 +2104,7 @@ afr_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, int ret = -1; int op_errno = ENOMEM; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; @@ -2199,6 +2211,7 @@ afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, int ret = -1; int op_errno = ENOMEM; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; @@ -2398,6 +2411,7 @@ afr_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd, int ret = -1; int op_errno = ENOMEM; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; @@ -2492,7 +2506,9 @@ afr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, call_frame_t *transaction_frame = NULL; int ret = -1; int32_t op_errno = ENOMEM; + int8_t last_fsync = 0; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; @@ -2501,10 +2517,16 @@ afr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, if (!local) goto out; - if (xdata) + if (xdata) { local->xdata_req = dict_copy_with_ref(xdata, NULL); - else + if (dict_get_int8(xdata, "last-fsync", &last_fsync) == 0) { + if (last_fsync) { + local->transaction.disable_delayed_post_op = _gf_true; + } + } + } else { local->xdata_req = dict_new(); + } if (!local->xdata_req) goto out; diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c index 4091671fa3f..bc8eabe0f43 100644 --- a/xlators/cluster/afr/src/afr-lk-common.c +++ b/xlators/cluster/afr/src/afr-lk-common.c @@ -397,7 +397,6 @@ afr_unlock_now(call_frame_t *frame, xlator_t *this) int_lock->lk_call_count = call_count; if (!call_count) { - GF_ASSERT(!local->transaction.do_eager_unlock); gf_msg_trace(this->name, 0, "No internal locks unlocked"); int_lock->lock_cbk(frame, this); goto out; diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index b0fb00641a0..816065fb57a 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -31,6 +31,8 @@ enum gf_afr_mem_types_ { gf_afr_mt_empty_brick_t, gf_afr_mt_child_latency_t, gf_afr_mt_atomic_t, + gf_afr_mt_lk_heal_info_t, + gf_afr_mt_gf_lock, gf_afr_mt_end }; #endif diff --git a/xlators/cluster/afr/src/afr-messages.h b/xlators/cluster/afr/src/afr-messages.h index c9c99270e98..e73fd997765 100644 --- a/xlators/cluster/afr/src/afr-messages.h +++ b/xlators/cluster/afr/src/afr-messages.h @@ -23,25 +23,145 @@ * glfs-message-id.h. 
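afr_fsync() above now honours a "last-fsync" hint in xdata: when the caller sets it, AFR sets transaction.disable_delayed_post_op so the pending changelog is settled before the fsync unwinds rather than being deferred. A hedged sketch of how a parent translator might pass that hint; example_wind_last_fsync and example_fsync_cbk are illustrative names, and FIRST_CHILD(this) is assumed to be the AFR subvolume of that hypothetical translator:

    static int
    example_wind_last_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd,
                            int32_t datasync)
    {
        dict_t *xdata = dict_new();

        if (!xdata)
            return -ENOMEM;

        /* Ask AFR to perform the post-op synchronously for this fsync. */
        if (dict_set_int8(xdata, "last-fsync", 1)) {
            dict_unref(xdata);
            return -ENOMEM;
        }

        STACK_WIND(frame, example_fsync_cbk, FIRST_CHILD(this),
                   FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata);

        dict_unref(xdata);
        return 0;
    }

The hint lets the final fsync before an fd is released settle the delayed post-op instead of leaving it outstanding for a later flush.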
*/ -GLFS_MSGID(AFR, AFR_MSG_QUORUM_FAIL, AFR_MSG_QUORUM_MET, - AFR_MSG_QUORUM_OVERRIDE, AFR_MSG_INVALID_CHILD_UP, AFR_MSG_SUBVOL_UP, - AFR_MSG_SUBVOLS_DOWN, AFR_MSG_ENTRY_UNLOCK_FAIL, AFR_MSG_SPLIT_BRAIN, - AFR_MSG_OPEN_FAIL, AFR_MSG_UNLOCK_FAIL, AFR_MSG_REPLACE_BRICK_STATUS, - AFR_MSG_GFID_NULL, AFR_MSG_FD_CREATE_FAILED, AFR_MSG_DICT_SET_FAILED, - AFR_MSG_EXPUNGING_FILE_OR_DIR, AFR_MSG_MIGRATION_IN_PROGRESS, - AFR_MSG_CHILD_MISCONFIGURED, AFR_MSG_VOL_MISCONFIGURED, - AFR_MSG_INTERNAL_LKS_FAILED, AFR_MSG_INVALID_FD, AFR_MSG_LOCK_INFO, - AFR_MSG_LOCK_XLATOR_NOT_LOADED, AFR_MSG_FD_CTX_GET_FAILED, - AFR_MSG_INVALID_SUBVOL, AFR_MSG_PUMP_XLATOR_ERROR, - AFR_MSG_SELF_HEAL_INFO, AFR_MSG_READ_SUBVOL_ERROR, - AFR_MSG_DICT_GET_FAILED, AFR_MSG_INFO_COMMON, - AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, AFR_MSG_LOCAL_CHILD, - AFR_MSG_INVALID_DATA, AFR_MSG_INVALID_ARG, - AFR_MSG_INDEX_DIR_GET_FAILED, AFR_MSG_FSYNC_FAILED, - AFR_MSG_FAVORITE_CHILD, AFR_MSG_SELF_HEAL_FAILED, - AFR_MSG_SPLIT_BRAIN_STATUS, AFR_MSG_ADD_BRICK_STATUS, - AFR_MSG_NO_CHANGELOG, AFR_MSG_TIMER_CREATE_FAIL, - AFR_MSG_SBRAIN_FAV_CHILD_POLICY, AFR_MSG_INODE_CTX_GET_FAILED, - AFR_MSG_THIN_ARB); +GLFS_MSGID( + AFR, AFR_MSG_QUORUM_FAIL, AFR_MSG_QUORUM_MET, AFR_MSG_QUORUM_OVERRIDE, + AFR_MSG_INVALID_CHILD_UP, AFR_MSG_SUBVOL_UP, AFR_MSG_SUBVOLS_DOWN, + AFR_MSG_ENTRY_UNLOCK_FAIL, AFR_MSG_SPLIT_BRAIN, AFR_MSG_OPEN_FAIL, + AFR_MSG_UNLOCK_FAIL, AFR_MSG_REPLACE_BRICK_STATUS, AFR_MSG_GFID_NULL, + AFR_MSG_FD_CREATE_FAILED, AFR_MSG_DICT_SET_FAILED, + AFR_MSG_EXPUNGING_FILE_OR_DIR, AFR_MSG_MIGRATION_IN_PROGRESS, + AFR_MSG_CHILD_MISCONFIGURED, AFR_MSG_VOL_MISCONFIGURED, + AFR_MSG_INTERNAL_LKS_FAILED, AFR_MSG_INVALID_FD, AFR_MSG_LOCK_INFO, + AFR_MSG_LOCK_XLATOR_NOT_LOADED, AFR_MSG_FD_CTX_GET_FAILED, + AFR_MSG_INVALID_SUBVOL, AFR_MSG_PUMP_XLATOR_ERROR, AFR_MSG_SELF_HEAL_INFO, + AFR_MSG_READ_SUBVOL_ERROR, AFR_MSG_DICT_GET_FAILED, AFR_MSG_INFO_COMMON, + AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, AFR_MSG_LOCAL_CHILD, AFR_MSG_INVALID_DATA, + AFR_MSG_INVALID_ARG, AFR_MSG_INDEX_DIR_GET_FAILED, AFR_MSG_FSYNC_FAILED, + AFR_MSG_FAVORITE_CHILD, AFR_MSG_SELF_HEAL_FAILED, + AFR_MSG_SPLIT_BRAIN_STATUS, AFR_MSG_ADD_BRICK_STATUS, AFR_MSG_NO_CHANGELOG, + AFR_MSG_TIMER_CREATE_FAIL, AFR_MSG_SBRAIN_FAV_CHILD_POLICY, + AFR_MSG_INODE_CTX_GET_FAILED, AFR_MSG_THIN_ARB, + AFR_MSG_THIN_ARB_XATTROP_FAILED, AFR_MSG_THIN_ARB_LOC_POP_FAILED, + AFR_MSG_GET_PEND_VAL, AFR_MSG_THIN_ARB_SKIP_SHD, AFR_MSG_UNKNOWN_SET, + AFR_MSG_NO_XL_ID, AFR_MSG_SELF_HEAL_INFO_START, + AFR_MSG_SELF_HEAL_INFO_FINISH, AFR_MSG_INCRE_COUNT, + AFR_MSG_ADD_TO_OUTPUT_FAILED, AFR_MSG_SET_TIME_FAILED, + AFR_MSG_GFID_MISMATCH_DETECTED, AFR_MSG_GFID_HEAL_MSG, + AFR_MSG_THIN_ARB_LOOKUP_FAILED, AFR_MSG_DICT_CREATE_FAILED, + AFR_MSG_NO_MAJORITY_TO_RESOLVE, AFR_MSG_TYPE_MISMATCH, + AFR_MSG_SIZE_POLICY_NOT_APPLICABLE, AFR_MSG_NO_CHILD_SELECTED, + AFR_MSG_INVALID_CHILD, AFR_MSG_RESOLVE_CONFLICTING_DATA, + SERROR_GETTING_SRC_BRICK, SNO_DIFF_IN_MTIME, SNO_BIGGER_FILE, + SALL_BRICKS_UP_TO_RESOLVE, AFR_MSG_UNLOCK_FAILED, AFR_MSG_POST_OP_FAILED, + AFR_MSG_TA_FRAME_CREATE_FAILED, AFR_MSG_SET_KEY_XATTROP_FAILED, + AFR_MSG_BLOCKING_ENTRYLKS_FAILED, AFR_MSG_FOP_FAILED, + AFR_MSG_CLEAN_UP_FAILED, AFR_MSG_UNABLE_TO_FETCH, AFR_MSG_XATTR_SET_FAILED, + AFR_MSG_SPLIT_BRAIN_REPLICA, AFR_MSG_INODE_CTX_FAILED, + AFR_MSG_LOOKUP_FAILED, AFR_MSG_ALL_SUBVOLS_DOWN, + AFR_MSG_RELEASE_LOCK_FAILED, AFR_MSG_CLEAR_TIME_SPLIT_BRAIN, + AFR_MSG_READ_FAILED, AFR_MSG_LAUNCH_FAILED, AFR_MSG_READ_SUBVOL_NOT_UP, + AFR_MSG_LK_HEAL_DOM, AFR_MSG_NEW_BRICK, 
AFR_MSG_SPLIT_BRAIN_SET_FAILED, + AFR_MSG_SPLIT_BRAIN_DETERMINE_FAILED, AFR_MSG_HEALER_SPAWN_FAILED, + AFR_MSG_ADD_CRAWL_EVENT_FAILED, AFR_MSG_NULL_DEREF, AFR_MSG_SET_PEND_XATTR, + AFR_MSG_INTERNAL_ATTR); +#define AFR_MSG_DICT_GET_FAILED_STR "Dict get failed" +#define AFR_MSG_DICT_SET_FAILED_STR "Dict set failed" +#define AFR_MSG_HEALER_SPAWN_FAILED_STR "Healer spawn failed" +#define AFR_MSG_ADD_CRAWL_EVENT_FAILED_STR "Adding crawl event failed" +#define AFR_MSG_INVALID_ARG_STR "Invalid argument" +#define AFR_MSG_INDEX_DIR_GET_FAILED_STR "unable to get index-dir on " +#define AFR_MSG_THIN_ARB_LOOKUP_FAILED_STR "Failed lookup on file" +#define AFR_MSG_DICT_CREATE_FAILED_STR "Failed to create dict." +#define AFR_MSG_THIN_ARB_XATTROP_FAILED_STR "Xattrop failed." +#define AFR_MSG_THIN_ARB_LOC_POP_FAILED_STR \ + "Failed to populate loc for thin-arbiter" +#define AFR_MSG_GET_PEND_VAL_STR "Error getting value of pending" +#define AFR_MSG_THIN_ARB_SKIP_SHD_STR "I am not the god shd. skipping." +#define AFR_MSG_UNKNOWN_SET_STR "Unknown set" +#define AFR_MSG_NO_XL_ID_STR "xl does not have id" +#define AFR_MSG_SELF_HEAL_INFO_START_STR "starting full sweep on" +#define AFR_MSG_SELF_HEAL_INFO_FINISH_STR "finished full sweep on" +#define AFR_MSG_INCRE_COUNT_STR "Could not increment the counter." +#define AFR_MSG_ADD_TO_OUTPUT_FAILED_STR "Could not add to output" +#define AFR_MSG_SET_TIME_FAILED_STR "Could not set time" +#define AFR_MSG_GFID_HEAL_MSG_STR "Error setting gfid-heal-msg dict" +#define AFR_MSG_NO_MAJORITY_TO_RESOLVE_STR \ + "No majority to resolve gfid split brain" +#define AFR_MSG_GFID_MISMATCH_DETECTED_STR "Gfid mismatch dectected" +#define AFR_MSG_SELF_HEAL_INFO_STR "performing selfheal" +#define AFR_MSG_TYPE_MISMATCH_STR "TYPE mismatch" +#define AFR_MSG_SIZE_POLICY_NOT_APPLICABLE_STR \ + "Size policy is not applicable to directories." +#define AFR_MSG_NO_CHILD_SELECTED_STR \ + "No child selected by favorite-child policy" +#define AFR_MSG_INVALID_CHILD_STR "Invalid child" +#define AFR_MSG_RESOLVE_CONFLICTING_DATA_STR \ + "selected as authentic to resolve conflicting data" +#define SERROR_GETTING_SRC_BRICK_STR "Error getting the source brick" +#define SNO_DIFF_IN_MTIME_STR "No difference in mtime" +#define SNO_BIGGER_FILE_STR "No bigger file" +#define SALL_BRICKS_UP_TO_RESOLVE_STR \ + "All the bricks should be up to resolve the gfid split brain" +#define AFR_MSG_UNLOCK_FAILED_STR "Failed to unlock" +#define AFR_MSG_POST_OP_FAILED_STR "Post-op on thin-arbiter failed" +#define AFR_MSG_TA_FRAME_CREATE_FAILED_STR "Failed to create ta_frame" +#define AFR_MSG_SET_KEY_XATTROP_FAILED_STR "Could not set key during xattrop" +#define AFR_MSG_BLOCKING_ENTRYLKS_FAILED_STR "Blocking entrylks failed" +#define AFR_MSG_FSYNC_FAILED_STR "fsync failed" +#define AFR_MSG_QUORUM_FAIL_STR "quorum is not met" +#define AFR_MSG_FOP_FAILED_STR "Failing Fop" +#define AFR_MSG_INVALID_SUBVOL_STR "not a subvolume" +#define AFR_MSG_VOL_MISCONFIGURED_STR "Volume is dangling" +#define AFR_MSG_CHILD_MISCONFIGURED_STR \ + "replicate translator needs more than one subvolume defined" +#define AFR_MSG_CLEAN_UP_FAILED_STR "Failed to clean up healer threads" +#define AFR_MSG_QUORUM_OVERRIDE_STR "overriding quorum-count" +#define AFR_MSG_UNABLE_TO_FETCH_STR \ + "Unable to fetch afr-pending-xattr option from volfile. 
Falling back to " \ + "using client translator names" +#define AFR_MSG_NULL_DEREF_STR "possible NULL deref" +#define AFR_MSG_XATTR_SET_FAILED_STR "Cannot set xattr cookie key" +#define AFR_MSG_SPLIT_BRAIN_STATUS_STR "Failed to create synctask" +#define AFR_MSG_SUBVOLS_DOWN_STR "All subvolumes are not up" +#define AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR_STR \ + "Failed to cancel split-brain choice" +#define AFR_MSG_SPLIT_BRAIN_REPLICA_STR \ + "Cannot set replica. File is not in data/metadata split-brain" +#define AFR_MSG_INODE_CTX_FAILED_STR "Failed to get inode_ctx" +#define AFR_MSG_READ_SUBVOL_ERROR_STR "no read subvols" +#define AFR_MSG_LOCAL_CHILD_STR "selecting local read-child" +#define AFR_MSG_LOOKUP_FAILED_STR "Failed to lookup/create thin-arbiter id file" +#define AFR_MSG_TIMER_CREATE_FAIL_STR \ + "Cannot create timer for delayed initialization" +#define AFR_MSG_SUBVOL_UP_STR "Subvolume came back up; going online" +#define AFR_MSG_ALL_SUBVOLS_DOWN_STR \ + "All subvolumes are down. Going offline until atleast one of them is up" +#define AFR_MSG_RELEASE_LOCK_FAILED_STR "Failed to release lock" +#define AFR_MSG_INVALID_CHILD_UP_STR "Received child_up from invalid subvolume" +#define AFR_MSG_QUORUM_MET_STR "Client-quorum is met" +#define AFR_MSG_EXPUNGING_FILE_OR_DIR_STR "expunging file or dir" +#define AFR_MSG_SELF_HEAL_FAILED_STR "Invalid" +#define AFR_MSG_SPLIT_BRAIN_STR "Skipping conservative mergeon the file" +#define AFR_MSG_CLEAR_TIME_SPLIT_BRAIN_STR "clear time split brain" +#define AFR_MSG_READ_FAILED_STR "Failing read since good brick is down" +#define AFR_MSG_LAUNCH_FAILED_STR "Failed to launch synctask" +#define AFR_MSG_READ_SUBVOL_NOT_UP_STR \ + "read subvolume in this generation is not up" +#define AFR_MSG_INTERNAL_LKS_FAILED_STR \ + "Unable to work with lk-owner while attempting fop" +#define AFR_MSG_LOCK_XLATOR_NOT_LOADED_STR \ + "subvolume does not support locking. please load features/locks xlator " \ + "on server." +#define AFR_MSG_FD_CTX_GET_FAILED_STR "unable to get fd ctx" +#define AFR_MSG_INFO_COMMON_STR "fd not open on any subvolumes, aborting." +#define AFR_MSG_REPLACE_BRICK_STATUS_STR "Couldn't acquire lock on any child." +#define AFR_MSG_NEW_BRICK_STR "New brick" +#define AFR_MSG_SPLIT_BRAIN_SET_FAILED_STR \ + "Failed to set split-brain choice to -1" +#define AFR_MSG_SPLIT_BRAIN_DETERMINE_FAILED_STR \ + "Failed to determine split-brain. Aborting split-brain-choice set" +#define AFR_MSG_OPEN_FAIL_STR "Failed to open subvolume" +#define AFR_MSG_SET_PEND_XATTR_STR "Set of pending xattr" +#define AFR_MSG_INTERNAL_ATTR_STR "is an internal extended attribute" #endif /* !_AFR_MESSAGES_H_ */ diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c index ff72c73a9f4..64856042b65 100644 --- a/xlators/cluster/afr/src/afr-open.c +++ b/xlators/cluster/afr/src/afr-open.c @@ -8,9 +8,7 @@ cases as published by the Free Software Foundation. 
*/ -#include <libgen.h> #include <unistd.h> -#include <fnmatch.h> #include <sys/time.h> #include <stdlib.h> #include <signal.h> @@ -18,11 +16,7 @@ #include <glusterfs/glusterfs.h> #include "afr.h" #include <glusterfs/dict.h> -#include <glusterfs/xlator.h> -#include <glusterfs/hashfn.h> #include <glusterfs/logging.h> -#include <glusterfs/list.h> -#include <glusterfs/call-stub.h> #include <glusterfs/defaults.h> #include <glusterfs/common-utils.h> #include <glusterfs/compat-errno.h> @@ -30,10 +24,6 @@ #include <glusterfs/byte-order.h> #include <glusterfs/statedump.h> -#include "afr-inode-read.h" -#include "afr-inode-write.h" -#include "afr-dir-read.h" -#include "afr-dir-write.h" #include "afr-transaction.h" gf_boolean_t @@ -73,6 +63,10 @@ afr_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, local = frame->local; fd_ctx = local->fd_ctx; + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + LOCK(&frame->lock); { if (op_ret == -1) { @@ -84,13 +78,16 @@ afr_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, if (!local->xdata_rsp && xdata) local->xdata_rsp = dict_ref(xdata); } + call_count = --local->call_count; } UNLOCK(&frame->lock); - call_count = afr_frame_return(frame); - if (call_count == 0) { - if ((fd_ctx->flags & O_TRUNC) && (local->op_ret >= 0)) { + afr_handle_replies_quorum(frame, this); + if (local->op_ret == -1) { + AFR_STACK_UNWIND(open, frame, local->op_ret, local->op_errno, NULL, + NULL); + } else if (fd_ctx->flags & O_TRUNC) { STACK_WIND(frame, afr_open_ftruncate_cbk, this, this->fops->ftruncate, fd, 0, NULL); } else { @@ -140,7 +137,7 @@ afr_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int spb_choice = 0; + int spb_subvol = 0; int event_generation = 0; int ret = 0; int32_t op_errno = 0; @@ -161,6 +158,11 @@ afr_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, goto out; } + if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) { + op_errno = afr_quorum_errno(priv); + goto out; + } + if (!afr_is_consistent_io_possible(local, priv, &op_errno)) goto out; @@ -177,9 +179,9 @@ afr_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, ret = afr_inode_get_readable(frame, local->inode, this, NULL, &event_generation, AFR_DATA_TRANSACTION); if ((ret < 0) && - (afr_inode_split_brain_choice_get(local->inode, this, &spb_choice) == - 0) && - spb_choice < 0) { + (afr_split_brain_read_subvol_get(local->inode, this, NULL, + &spb_subvol) == 0) && + spb_subvol < 0) { afr_inode_refresh(frame, this, local->inode, local->inode->gfid, afr_open_continue); } else { @@ -213,11 +215,9 @@ afr_openfd_fix_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, "successfully on subvolume %s", local->loc.path, priv->children[child_index]->name); } else { - gf_msg(this->name, fop_log_level(GF_FOP_OPEN, op_errno), op_errno, - AFR_MSG_OPEN_FAIL, - "Failed to open %s on " - "subvolume %s", - local->loc.path, priv->children[child_index]->name); + gf_smsg(this->name, fop_log_level(GF_FOP_OPEN, op_errno), op_errno, + AFR_MSG_OPEN_FAIL, "path=%s", local->loc.path, "subvolume=%s", + priv->children[child_index]->name, NULL); } fd_ctx = local->fd_ctx; diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c index 9a91f2e56fc..6fc2c75145c 100644 --- a/xlators/cluster/afr/src/afr-read-txn.c +++ 
b/xlators/cluster/afr/src/afr-read-txn.c @@ -30,27 +30,6 @@ afr_pending_read_decrement(afr_private_t *priv, int child_index) GF_ATOMIC_DEC(priv->pending_reads[child_index]); } -static gf_boolean_t -afr_ta_dict_contains_pending_xattr(dict_t *dict, afr_private_t *priv, int child) -{ - int *pending = NULL; - int ret = 0; - int i = 0; - - ret = dict_get_ptr(dict, priv->pending_key[child], (void *)&pending); - if (ret == 0) { - for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) { - /* Not doing a ntoh32(pending) as we just want to check - * if it is non-zero or not. */ - if (pending[i]) { - return _gf_true; - } - } - } - - return _gf_false; -} - void afr_read_txn_wind(call_frame_t *frame, xlator_t *this, int subvol) { @@ -114,7 +93,7 @@ afr_ta_read_txn(void *opaque) call_frame_t *frame = NULL; xlator_t *this = NULL; int read_subvol = -1; - int up_child = AFR_CHILD_UNKNOWN; + int query_child = AFR_CHILD_UNKNOWN; int possible_bad_child = AFR_CHILD_UNKNOWN; int ret = 0; int op_errno = ENOMEM; @@ -134,18 +113,18 @@ afr_ta_read_txn(void *opaque) this = frame->this; local = frame->local; priv = this->private; + query_child = local->read_txn_query_child; - if (local->child_up[AFR_CHILD_ZERO]) { - up_child = AFR_CHILD_ZERO; + if (query_child == AFR_CHILD_ZERO) { possible_bad_child = AFR_CHILD_ONE; - } else if (local->child_up[AFR_CHILD_ONE]) { - up_child = AFR_CHILD_ONE; + } else if (query_child == AFR_CHILD_ONE) { possible_bad_child = AFR_CHILD_ZERO; + } else { + /*read_txn_query_child is AFR_CHILD_UNKNOWN*/ + goto out; } - GF_ASSERT(up_child != AFR_CHILD_UNKNOWN); - - /* Query the up_child to see if it blames the down one. */ + /* Ask the query_child to see if it blames the possibly bad one. */ xdata_req = dict_new(); if (!xdata_req) goto out; @@ -159,30 +138,33 @@ afr_ta_read_txn(void *opaque) goto out; if (local->fd) { - ret = syncop_fxattrop(priv->children[up_child], local->fd, + ret = syncop_fxattrop(priv->children[query_child], local->fd, GF_XATTROP_ADD_ARRAY, xdata_req, NULL, &xdata_rsp, NULL); } else { - ret = syncop_xattrop(priv->children[up_child], &local->loc, + ret = syncop_xattrop(priv->children[query_child], &local->loc, GF_XATTROP_ADD_ARRAY, xdata_req, NULL, &xdata_rsp, NULL); } if (ret || !xdata_rsp) { gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, "Failed xattrop for gfid %s on %s", - uuid_utoa(local->inode->gfid), priv->children[up_child]->name); + uuid_utoa(local->inode->gfid), + priv->children[query_child]->name); op_errno = -ret; goto out; } if (afr_ta_dict_contains_pending_xattr(xdata_rsp, priv, possible_bad_child)) { - read_subvol = up_child; + read_subvol = query_child; goto out; } dict_unref(xdata_rsp); - /* Query thin-arbiter to see if it blames any data brick. */ - ret = afr_fill_ta_loc(this, &loc); + xdata_rsp = NULL; + + /* It doesn't. So query thin-arbiter to see if it blames any data brick. 
*/ + ret = afr_fill_ta_loc(this, &loc, _gf_true); if (ret) { gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, "Failed to populate thin-arbiter loc for: %s.", loc.name); @@ -211,8 +193,8 @@ afr_ta_read_txn(void *opaque) goto unlock; } - if (!afr_ta_dict_contains_pending_xattr(xdata_rsp, priv, up_child)) { - read_subvol = up_child; + if (!afr_ta_dict_contains_pending_xattr(xdata_rsp, priv, query_child)) { + read_subvol = query_child; } else { gf_msg(this->name, GF_LOG_ERROR, EIO, AFR_MSG_THIN_ARB, "Failing read for gfid %s since good brick %s is down", @@ -290,7 +272,7 @@ afr_read_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err) int read_subvol = -1; inode_t *inode = NULL; int ret = -1; - int spb_choice = -1; + int spb_subvol = -1; local = frame->local; inode = local->inode; @@ -321,9 +303,9 @@ afr_read_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err) local->read_attempted[read_subvol] = 1; readfn: if (read_subvol == -1) { - ret = afr_inode_split_brain_choice_get(inode, this, &spb_choice); - if ((ret == 0) && spb_choice >= 0) - read_subvol = spb_choice; + ret = afr_split_brain_read_subvol_get(inode, this, frame, &spb_subvol); + if ((ret == 0) && spb_subvol >= 0) + read_subvol = spb_subvol; } if (read_subvol == -1) { @@ -450,6 +432,11 @@ afr_read_txn(call_frame_t *frame, xlator_t *this, inode_t *inode, if (priv->thin_arbiter_count && AFR_COUNT(local->child_up, priv->child_count) != priv->child_count) { + if (local->child_up[0]) { + local->read_txn_query_child = AFR_CHILD_ZERO; + } else if (local->child_up[1]) { + local->read_txn_query_child = AFR_CHILD_ONE; + } afr_ta_read_txn_synctask(frame, this); return 0; } diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 226876154d7..a580a1584cc 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -55,7 +55,8 @@ afr_lookup_and_heal_gfid(xlator_t *this, inode_t *parent, const char *name, for (i = 0; i < priv->child_count; i++) { if (source == -1) { /* case (a) above. */ - if (replies[i].valid && replies[i].op_ret == 0) { + if (replies[i].valid && replies[i].op_ret == 0 && + replies[i].poststat.ia_type != IA_INVAL) { ia_type = replies[i].poststat.ia_type; break; } @@ -63,7 +64,8 @@ afr_lookup_and_heal_gfid(xlator_t *this, inode_t *parent, const char *name, /* case (b) above. 
*/ if (i == source) continue; - if (sources[i] && replies[i].valid && replies[i].op_ret == 0) { + if (sources[i] && replies[i].valid && replies[i].op_ret == 0 && + replies[i].poststat.ia_type != IA_INVAL) { ia_type = replies[i].poststat.ia_type; break; } @@ -77,6 +79,12 @@ heal: for (i = 0; i < priv->child_count; i++) { if (!replies[i].valid || replies[i].op_ret != 0) continue; + + if (gf_uuid_is_null(gfid) && + !gf_uuid_is_null(replies[i].poststat.ia_gfid) && + replies[i].poststat.ia_type == ia_type) + gfid = replies[i].poststat.ia_gfid; + if (!gf_uuid_is_null(replies[i].poststat.ia_gfid) || replies[i].poststat.ia_type != ia_type) continue; @@ -132,7 +140,7 @@ heal: } } out: - if (gfid_idx && (*gfid_idx == -1) && (ret == 0)) { + if (gfid_idx && (*gfid_idx == -1) && (ret == 0) && local) { ret = -afr_final_errno(local, priv); } loc_wipe(&loc); @@ -383,11 +391,12 @@ out: uuid_utoa_r(replies[src_idx].poststat.ia_gfid, g2), priv->children[src_idx]->name); gf_event(EVENT_AFR_SPLIT_BRAIN, + "client-pid=%d;" "subvol=%s;type=gfid;file=" "<gfid:%s>/%s>;count=2;child-%d=%s;gfid-%d=%s;" "child-%d=%s;gfid-%d=%s", - this->name, uuid_utoa(pargfid), bname, child_idx, - priv->children[child_idx]->name, child_idx, + this->ctx->cmd_args.client_pid, this->name, uuid_utoa(pargfid), + bname, child_idx, priv->children[child_idx]->name, child_idx, uuid_utoa_r(replies[child_idx].poststat.ia_gfid, g1), src_idx, priv->children[src_idx]->name, src_idx, uuid_utoa_r(replies[src_idx].poststat.ia_gfid, g2)); @@ -504,7 +513,8 @@ afr_selfheal_restore_time(call_frame_t *frame, xlator_t *this, inode_t *inode, AFR_ONLIST(healed_sinks, frame, afr_sh_generic_fop_cbk, setattr, &loc, &replies[source].poststat, - (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME), NULL); + (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME | GF_SET_ATTR_CTIME), + NULL); loc_wipe(&loc); @@ -1565,7 +1575,6 @@ afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this, char *accused = NULL; /* Accused others without any self-accusal */ char *pending = NULL; /* Have pending operations on others */ char *self_accused = NULL; /* Accused itself */ - int min_participants = -1; priv = this->private; @@ -1589,12 +1598,7 @@ afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this, } } - if (type == AFR_DATA_TRANSACTION) { - min_participants = priv->child_count; - } else { - min_participants = AFR_SH_MIN_PARTICIPANTS; - } - if (afr_success_count(replies, priv->child_count) < min_participants) { + if (afr_success_count(replies, priv->child_count) < priv->child_count) { /* Treat this just like locks not being acquired */ return -ENOTCONN; } @@ -1655,7 +1659,7 @@ afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this, } } - if (type == AFR_DATA_TRANSACTION) + if (type == AFR_DATA_TRANSACTION || type == AFR_METADATA_TRANSACTION) afr_selfheal_post_op_failure_accounting(priv, accused, sources, locked_on); @@ -1763,11 +1767,9 @@ afr_selfheal_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this, if (xdata) { local->replies[i].xdata = dict_ref(xdata); ret = dict_get_int8(xdata, "link-count", &need_heal); - local->replies[i].need_heal = need_heal; - } else { - local->replies[i].need_heal = need_heal; } + local->replies[i].need_heal = need_heal; syncbarrier_wake(&local->barrier); return 0; @@ -1823,10 +1825,41 @@ afr_selfheal_unlocked_lookup_on(call_frame_t *frame, inode_t *parent, return inode; } +static int +afr_set_multi_dom_lock_count_request(xlator_t *this, dict_t *dict) +{ + int ret = 0; + afr_private_t *priv = NULL; + char *key1 = NULL; + char *key2 = 
NULL; + + priv = this->private; + key1 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 + + strlen(this->name)); + key2 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 + + strlen(priv->sh_domain)); + + ret = dict_set_uint32(dict, GLUSTERFS_MULTIPLE_DOM_LK_CNT_REQUESTS, 1); + if (ret) + return ret; + + sprintf(key1, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, this->name); + ret = dict_set_uint32(dict, key1, 1); + if (ret) + return ret; + + sprintf(key2, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, priv->sh_domain); + ret = dict_set_uint32(dict, key2, 1); + if (ret) + return ret; + + return 0; +} + int afr_selfheal_unlocked_discover_on(call_frame_t *frame, inode_t *inode, uuid_t gfid, struct afr_reply *replies, - unsigned char *discover_on) + unsigned char *discover_on, dict_t *dict) { loc_t loc = { 0, @@ -1841,12 +1874,19 @@ afr_selfheal_unlocked_discover_on(call_frame_t *frame, inode_t *inode, xattr_req = dict_new(); if (!xattr_req) return -ENOMEM; + if (dict) + dict_copy(dict, xattr_req); if (afr_xattr_req_prepare(frame->this, xattr_req) != 0) { dict_unref(xattr_req); return -ENOMEM; } + if (afr_set_multi_dom_lock_count_request(frame->this, xattr_req)) { + dict_unref(xattr_req); + return -1; + } + loc.inode = inode_ref(inode); gf_uuid_copy(loc.gfid, gfid); @@ -1865,12 +1905,16 @@ int afr_selfheal_unlocked_discover(call_frame_t *frame, inode_t *inode, uuid_t gfid, struct afr_reply *replies) { - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + dict_t *dict = NULL; - priv = frame->this->private; + local = frame->local; + + if (local->xattr_req) + dict = local->xattr_req; return afr_selfheal_unlocked_discover_on(frame, inode, gfid, replies, - priv->child_up); + local->child_up, dict); } unsigned int @@ -2237,7 +2281,8 @@ int afr_selfheal_unlocked_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid, inode_t **link_inode, gf_boolean_t *data_selfheal, gf_boolean_t *metadata_selfheal, - gf_boolean_t *entry_selfheal) + gf_boolean_t *entry_selfheal, + struct afr_reply *replies_dst) { afr_private_t *priv = NULL; inode_t *inode = NULL; @@ -2296,11 +2341,13 @@ afr_selfheal_unlocked_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid, priv->children[i]->name, uuid_utoa(replies[i].poststat.ia_gfid)); gf_event(EVENT_AFR_SPLIT_BRAIN, + "client-pid=%d;" "subvol=%s;" "type=file;gfid=%s;" "ia_type-%d=%s;ia_type-%d=%s", - this->name, uuid_utoa(replies[i].poststat.ia_gfid), - first_idx, gf_inode_type_to_str(first.ia_type), i, + this->ctx->cmd_args.client_pid, this->name, + uuid_utoa(replies[i].poststat.ia_gfid), first_idx, + gf_inode_type_to_str(first.ia_type), i, gf_inode_type_to_str(replies[i].poststat.ia_type)); ret = -EIO; goto out; @@ -2371,6 +2418,8 @@ afr_selfheal_unlocked_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid, ret = 0; out: + if (replies && replies_dst) + afr_replies_copy(replies_dst, replies, priv->child_count); if (inode) inode_unref(inode); if (replies) @@ -2410,8 +2459,11 @@ afr_frame_create(xlator_t *this, int32_t *op_errno) pid_t pid = GF_CLIENT_PID_SELF_HEALD; frame = create_frame(this, this->ctx->pool); - if (!frame) + if (!frame) { + if (op_errno) + *op_errno = ENOMEM; return NULL; + } local = AFR_FRAME_INIT(frame, (*op_errno)); if (!local) { @@ -2487,7 +2539,7 @@ afr_selfheal_do(call_frame_t *frame, xlator_t *this, uuid_t gfid) ret = afr_selfheal_unlocked_inspect(frame, this, gfid, &inode, &data_selfheal, &metadata_selfheal, - &entry_selfheal); + &entry_selfheal, NULL); if (ret) goto out; @@ -2698,3 +2750,185 @@ afr_choose_source_by_policy(afr_private_t *priv, 
unsigned char *sources, out: return source; } + +static int +afr_anon_inode_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + afr_local_t *local = frame->local; + int i = (long)cookie; + + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (op_ret == 0) { + local->op_ret = 0; + local->replies[i].poststat = *buf; + local->replies[i].preparent = *preparent; + local->replies[i].postparent = *postparent; + } + if (xdata) { + local->replies[i].xdata = dict_ref(xdata); + } + + syncbarrier_wake(&local->barrier); + return 0; +} + +int +afr_anon_inode_create(xlator_t *this, int child, inode_t **linked_inode) +{ + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = this->private; + unsigned char *mkdir_on = alloca0(priv->child_count); + unsigned char *lookup_on = alloca0(priv->child_count); + loc_t loc = {0}; + int32_t op_errno = 0; + int32_t child_op_errno = 0; + struct iatt iatt = {0}; + dict_t *xdata = NULL; + uuid_t anon_inode_gfid = {0}; + int mkdir_count = 0; + int i = 0; + + /*Try to mkdir everywhere and return success if the dir exists on 'child' + */ + + if (!priv->use_anon_inode) { + op_errno = EINVAL; + goto out; + } + + frame = afr_frame_create(this, &op_errno); + if (op_errno) { + goto out; + } + local = frame->local; + if (!local->child_up[child]) { + /*Other bricks may need mkdir so don't error out yet*/ + child_op_errno = ENOTCONN; + } + gf_uuid_parse(priv->anon_gfid_str, anon_inode_gfid); + for (i = 0; i < priv->child_count; i++) { + if (!local->child_up[i]) + continue; + + if (priv->anon_inode[i]) { + mkdir_on[i] = 0; + } else { + mkdir_on[i] = 1; + mkdir_count++; + } + } + + if (mkdir_count == 0) { + *linked_inode = inode_find(this->itable, anon_inode_gfid); + if (*linked_inode) { + op_errno = 0; + goto out; + } + } + + loc.parent = inode_ref(this->itable->root); + loc.name = priv->anon_inode_name; + loc.inode = inode_new(this->itable); + if (!loc.inode) { + op_errno = ENOMEM; + goto out; + } + + xdata = dict_new(); + if (!xdata) { + op_errno = ENOMEM; + goto out; + } + + op_errno = -dict_set_gfuuid(xdata, "gfid-req", anon_inode_gfid, _gf_true); + if (op_errno) { + goto out; + } + + if (mkdir_count == 0) { + memcpy(lookup_on, local->child_up, priv->child_count); + goto lookup; + } + + AFR_ONLIST(mkdir_on, frame, afr_anon_inode_mkdir_cbk, mkdir, &loc, 0755, 0, + xdata); + + for (i = 0; i < priv->child_count; i++) { + if (!mkdir_on[i]) { + continue; + } + + if (local->replies[i].op_ret == 0) { + priv->anon_inode[i] = 1; + iatt = local->replies[i].poststat; + } else if (local->replies[i].op_ret < 0 && + local->replies[i].op_errno == EEXIST) { + lookup_on[i] = 1; + } else if (i == child) { + child_op_errno = local->replies[i].op_errno; + } + } + + if (AFR_COUNT(lookup_on, priv->child_count) == 0) { + goto link; + } + +lookup: + AFR_ONLIST(lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc, + xdata); + for (i = 0; i < priv->child_count; i++) { + if (!lookup_on[i]) { + continue; + } + + if (local->replies[i].op_ret == 0) { + if (gf_uuid_compare(anon_inode_gfid, + local->replies[i].poststat.ia_gfid) == 0) { + priv->anon_inode[i] = 1; + iatt = local->replies[i].poststat; + } else { + if (i == child) + child_op_errno = EINVAL; + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_DATA, + "%s has gfid: %s", priv->anon_inode_name, + 
uuid_utoa(local->replies[i].poststat.ia_gfid)); + } + } else if (i == child) { + child_op_errno = local->replies[i].op_errno; + } + } +link: + if (!gf_uuid_is_null(iatt.ia_gfid)) { + *linked_inode = inode_link(loc.inode, loc.parent, loc.name, &iatt); + if (*linked_inode) { + op_errno = 0; + inode_lookup(*linked_inode); + } else { + op_errno = ENOMEM; + } + goto out; + } + +out: + if (xdata) + dict_unref(xdata); + loc_wipe(&loc); + /*child_op_errno takes precedence*/ + if (child_op_errno == 0) { + child_op_errno = op_errno; + } + + if (child_op_errno && *linked_inode) { + inode_unref(*linked_inode); + *linked_inode = NULL; + } + if (frame) + AFR_STACK_DESTROY(frame); + return -child_op_errno; +} diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 6dd38ef633a..37bcc2b3f9e 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -225,24 +225,40 @@ __afr_selfheal_data_read_write(call_frame_t *frame, xlator_t *this, fd_t *fd, return ret; } +static gf_boolean_t +afr_source_sinks_locked(xlator_t *this, unsigned char *locked_on, int source, + unsigned char *healed_sinks) +{ + afr_private_t *priv = this->private; + int i = 0; + + if (!locked_on[source]) + return _gf_false; + + for (i = 0; i < priv->child_count; i++) { + if (healed_sinks[i] && locked_on[i]) + return _gf_true; + } + + return _gf_false; +} + static int afr_selfheal_data_block(call_frame_t *frame, xlator_t *this, fd_t *fd, int source, unsigned char *healed_sinks, off_t offset, size_t size, int type, struct afr_reply *replies) { int ret = -1; - int sink_count = 0; afr_private_t *priv = NULL; unsigned char *data_lock = NULL; priv = this->private; - sink_count = AFR_COUNT(healed_sinks, priv->child_count); data_lock = alloca0(priv->child_count); ret = afr_selfheal_inodelk(frame, this, fd->inode, this->name, offset, size, data_lock); { - if (ret < sink_count) { + if (!afr_source_sinks_locked(this, data_lock, source, healed_sinks)) { ret = -ENOTCONN; goto unlock; } @@ -324,6 +340,9 @@ afr_selfheal_data_do(call_frame_t *frame, xlator_t *this, fd_t *fd, int source, call_frame_t *iter_frame = NULL; unsigned char arbiter_sink_status = 0; + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO, + "performing data selfheal on %s", uuid_utoa(fd->inode->gfid)); + priv = this->private; if (priv->arbiter_count) { arbiter_sink_status = healed_sinks[ARBITER_BRICK_INDEX]; @@ -375,17 +394,18 @@ __afr_selfheal_truncate_sinks(call_frame_t *frame, xlator_t *this, fd_t *fd, { afr_local_t *local = NULL; afr_private_t *priv = NULL; - unsigned char arbiter_sink_status = 0; int i = 0; local = frame->local; priv = this->private; - if (priv->arbiter_count) { - arbiter_sink_status = healed_sinks[ARBITER_BRICK_INDEX]; - healed_sinks[ARBITER_BRICK_INDEX] = 0; - } - + /* This will send truncate on the arbiter brick as well if it is marked as + * sink. If changelog is enabled on the volume it captures truncate as a + * data transactions on the arbiter brick. This will help geo-rep to + * properly sync the data from master to slave if arbiter is the ACTIVE + * brick during syncing and which had got some entries healed for data as + * part of self heal. 
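+ * (Minimal sketch of the behaviour this replaces, assuming a 2+1 arbiter
+ * volume where the arbiter is one of the sinks: the arbiter_sink_status
+ * save/restore removed in this hunk used to do, in effect,
+ *     saved = healed_sinks[ARBITER_BRICK_INDEX];
+ *     healed_sinks[ARBITER_BRICK_INDEX] = 0;   -- arbiter skipped
+ *     ... wind ftruncate only to the remaining sinks ...
+ *     healed_sinks[ARBITER_BRICK_INDEX] = saved;
+ * so no truncate, and therefore no changelog record, ever reached the
+ * arbiter brick.)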
+ */ AFR_ONLIST(healed_sinks, frame, afr_sh_generic_fop_cbk, ftruncate, fd, size, NULL); @@ -396,8 +416,6 @@ __afr_selfheal_truncate_sinks(call_frame_t *frame, xlator_t *this, fd_t *fd, */ healed_sinks[i] = 0; - if (arbiter_sink_status) - healed_sinks[ARBITER_BRICK_INDEX] = arbiter_sink_status; return 0; } @@ -538,9 +556,11 @@ __afr_selfheal_data_finalize_source( replies, AFR_DATA_TRANSACTION); if (source < 0) { gf_event(EVENT_AFR_SPLIT_BRAIN, + "client-pid=%d;" "subvol=%s;type=data;" "file=%s", - this->name, uuid_utoa(inode->gfid)); + this->ctx->cmd_args.client_pid, this->name, + uuid_utoa(inode->gfid)); return -EIO; } @@ -693,19 +713,18 @@ __afr_selfheal_data(call_frame_t *frame, xlator_t *this, fd_t *fd, goto unlock; } - if (priv->arbiter_count && - AFR_COUNT(healed_sinks, priv->child_count) == 1 && - healed_sinks[ARBITER_BRICK_INDEX]) { - is_arbiter_the_only_sink = _gf_true; - goto restore_time; - } - ret = __afr_selfheal_truncate_sinks( frame, this, fd, healed_sinks, locked_replies[source].poststat.ia_size); if (ret < 0) goto unlock; + if (priv->arbiter_count && + AFR_COUNT(healed_sinks, priv->child_count) == 1 && + healed_sinks[ARBITER_BRICK_INDEX]) { + is_arbiter_the_only_sink = _gf_true; + goto restore_time; + } ret = 0; } unlock: diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c index b23ed6a1afa..64893f441e3 100644 --- a/xlators/cluster/afr/src/afr-self-heal-entry.c +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -16,54 +16,170 @@ #include <glusterfs/syncop-utils.h> #include <glusterfs/events.h> -static int -afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name, - inode_t *inode, int child, struct afr_reply *replies) +int +afr_selfheal_entry_anon_inode(xlator_t *this, inode_t *dir, const char *name, + inode_t *inode, int child, + struct afr_reply *replies, + gf_boolean_t *anon_inode) { afr_private_t *priv = NULL; + afr_local_t *local = NULL; xlator_t *subvol = NULL; int ret = 0; + int i = 0; + char g[64] = {0}; + unsigned char *lookup_success = NULL; + call_frame_t *frame = NULL; + loc_t loc2 = { + 0, + }; loc_t loc = { 0, }; - char g[64]; priv = this->private; - subvol = priv->children[child]; + lookup_success = alloca0(priv->child_count); + uuid_utoa_r(replies[child].poststat.ia_gfid, g); + loc.inode = inode_new(inode->table); + if (!loc.inode) { + ret = -ENOMEM; + goto out; + } + + if (replies[child].poststat.ia_type == IA_IFDIR) { + /* This directory may have sub-directory hierarchy which may need to + * be preserved for subsequent heals. 
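+ * Deleting such a directory here would discard that whole sub-tree on this
+ * brick even though a later heal may still need it as a source.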
So unconditionally move the + * directory to anonymous-inode directory*/ + *anon_inode = _gf_true; + goto anon_inode; + } + + frame = afr_frame_create(this, &ret); + if (!frame) { + ret = -ret; + goto out; + } + local = frame->local; + gf_uuid_copy(loc.gfid, replies[child].poststat.ia_gfid); + AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup, &loc, + NULL); + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].op_ret == 0) { + lookup_success[i] = 1; + } else if (local->replies[i].op_errno != ENOENT && + local->replies[i].op_errno != ESTALE) { + ret = -local->replies[i].op_errno; + } + } + + if (priv->quorum_count) { + if (afr_has_quorum(lookup_success, this, NULL)) { + *anon_inode = _gf_true; + } + } else if (AFR_COUNT(lookup_success, priv->child_count) > 1) { + *anon_inode = _gf_true; + } else if (ret) { + goto out; + } + +anon_inode: + if (!*anon_inode) { + ret = 0; + goto out; + } loc.parent = inode_ref(dir); gf_uuid_copy(loc.pargfid, dir->gfid); loc.name = name; - loc.inode = inode_ref(inode); - if (replies[child].valid && replies[child].op_ret == 0) { - switch (replies[child].poststat.ia_type) { - case IA_IFDIR: - gf_msg(this->name, GF_LOG_WARNING, 0, - AFR_MSG_EXPUNGING_FILE_OR_DIR, - "expunging dir %s/%s (%s) on %s", uuid_utoa(dir->gfid), - name, uuid_utoa_r(replies[child].poststat.ia_gfid, g), - subvol->name); - ret = syncop_rmdir(subvol, &loc, 1, NULL, NULL); - break; - default: - gf_msg(this->name, GF_LOG_WARNING, 0, - AFR_MSG_EXPUNGING_FILE_OR_DIR, - "expunging file %s/%s (%s) on %s", uuid_utoa(dir->gfid), - name, uuid_utoa_r(replies[child].poststat.ia_gfid, g), - subvol->name); - ret = syncop_unlink(subvol, &loc, NULL, NULL); - break; - } + ret = afr_anon_inode_create(this, child, &loc2.parent); + if (ret < 0) + goto out; + + loc2.name = g; + ret = syncop_rename(subvol, &loc, &loc2, NULL, NULL); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_EXPUNGING_FILE_OR_DIR, + "Rename to %s dir %s/%s (%s) on %s failed", + priv->anon_inode_name, uuid_utoa(dir->gfid), name, g, + subvol->name); + } else { + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR, + "Rename to %s dir %s/%s (%s) on %s successful", + priv->anon_inode_name, uuid_utoa(dir->gfid), name, g, + subvol->name); } +out: loc_wipe(&loc); + loc_wipe(&loc2); + if (frame) { + AFR_STACK_DESTROY(frame); + } return ret; } int +afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name, + inode_t *inode, int child, struct afr_reply *replies) +{ + char g[64] = {0}; + afr_private_t *priv = NULL; + xlator_t *subvol = NULL; + int ret = 0; + loc_t loc = { + 0, + }; + gf_boolean_t anon_inode = _gf_false; + + priv = this->private; + subvol = priv->children[child]; + + if ((!replies[child].valid) || (replies[child].op_ret < 0)) { + /*Nothing to do*/ + ret = 0; + goto out; + } + + if (priv->use_anon_inode) { + ret = afr_selfheal_entry_anon_inode(this, dir, name, inode, child, + replies, &anon_inode); + if (ret < 0 || anon_inode) + goto out; + } + + loc.parent = inode_ref(dir); + loc.inode = inode_new(inode->table); + if (!loc.inode) { + ret = -ENOMEM; + goto out; + } + loc.name = name; + switch (replies[child].poststat.ia_type) { + case IA_IFDIR: + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR, + "expunging dir %s/%s (%s) on %s", uuid_utoa(dir->gfid), name, + uuid_utoa_r(replies[child].poststat.ia_gfid, g), + subvol->name); + ret = syncop_rmdir(subvol, &loc, 1, NULL, NULL); + break; + default: + gf_msg(this->name, GF_LOG_WARNING, 0, 
AFR_MSG_EXPUNGING_FILE_OR_DIR, + "expunging file %s/%s (%s) on %s", uuid_utoa(dir->gfid), + name, uuid_utoa_r(replies[child].poststat.ia_gfid, g), + subvol->name); + ret = syncop_unlink(subvol, &loc, NULL, NULL); + break; + } + +out: + loc_wipe(&loc); + return ret; +} + +int afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source, unsigned char *sources, inode_t *dir, const char *name, inode_t *inode, @@ -76,6 +192,9 @@ afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source, loc_t srcloc = { 0, }; + loc_t anonloc = { + 0, + }; xlator_t *this = frame->this; afr_private_t *priv = NULL; dict_t *xdata = NULL; @@ -86,15 +205,18 @@ afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source, 0, }; unsigned char *newentry = NULL; + char iatt_uuid_str[64] = {0}; + char dir_uuid_str[64] = {0}; priv = this->private; iatt = &replies[source].poststat; + uuid_utoa_r(iatt->ia_gfid, iatt_uuid_str); if (iatt->ia_type == IA_INVAL || gf_uuid_is_null(iatt->ia_gfid)) { gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SELF_HEAL_FAILED, "Invalid ia_type (%d) or gfid(%s). source brick=%d, " "pargfid=%s, name=%s", - iatt->ia_type, uuid_utoa(iatt->ia_gfid), source, - uuid_utoa(dir->gfid), name); + iatt->ia_type, iatt_uuid_str, source, + uuid_utoa_r(dir->gfid, dir_uuid_str), name); ret = -EINVAL; goto out; } @@ -119,14 +241,24 @@ afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source, srcloc.inode = inode_ref(inode); gf_uuid_copy(srcloc.gfid, iatt->ia_gfid); - if (iatt->ia_type != IA_IFDIR) - ret = syncop_lookup(priv->children[dst], &srcloc, 0, 0, 0, 0); - if (iatt->ia_type == IA_IFDIR || ret == -ENOENT || ret == -ESTALE) { + ret = syncop_lookup(priv->children[dst], &srcloc, 0, 0, 0, 0); + if (ret == -ENOENT || ret == -ESTALE) { newentry[dst] = 1; ret = afr_selfheal_newentry_mark(frame, this, inode, source, replies, sources, newentry); if (ret) goto out; + } else if (ret == 0 && iatt->ia_type == IA_IFDIR && priv->use_anon_inode) { + // Try rename from hidden directory + ret = afr_anon_inode_create(this, dst, &anonloc.parent); + if (ret < 0) + goto out; + anonloc.inode = inode_ref(inode); + anonloc.name = iatt_uuid_str; + ret = syncop_rename(priv->children[dst], &anonloc, &loc, NULL, NULL); + if (ret == -ENOENT || ret == -ESTALE) + ret = -1; /*This sets 'mismatch' to true*/ + goto out; } mode = st_mode_from_ia(iatt->ia_prot, iatt->ia_type); @@ -165,6 +297,7 @@ out: GF_FREE(linkname); loc_wipe(&loc); loc_wipe(&srcloc); + loc_wipe(&anonloc); return ret; } @@ -246,6 +379,19 @@ afr_selfheal_detect_gfid_and_type_mismatch(xlator_t *this, if (replies[i].op_ret != 0) continue; + if (gf_uuid_is_null(replies[i].poststat.ia_gfid)) + continue; + + if (replies[i].poststat.ia_type == IA_INVAL) + continue; + + if (ia_type == IA_INVAL || gf_uuid_is_null(gfid)) { + src_idx = i; + ia_type = replies[src_idx].poststat.ia_type; + gfid = &replies[src_idx].poststat.ia_gfid; + continue; + } + if (gf_uuid_compare(gfid, replies[i].poststat.ia_gfid) && (ia_type == replies[i].poststat.ia_type)) { ret = afr_gfid_split_brain_source(this, replies, inode, pargfid, @@ -269,11 +415,12 @@ afr_selfheal_detect_gfid_and_type_mismatch(xlator_t *this, gf_inode_type_to_str(replies[src_idx].poststat.ia_type), priv->children[src_idx]->name); gf_event(EVENT_AFR_SPLIT_BRAIN, + "client-pid=%d;" "subvol=%s;type=file;" "file=<gfid:%s>/%s>;count=2;child-%d=%s;type-" "%d=%s;child-%d=%s;type-%d=%s", - this->name, uuid_utoa(pargfid), bname, i, - priv->children[i]->name, i, + this->ctx->cmd_args.client_pid, this->name, + 
uuid_utoa(pargfid), bname, i, priv->children[i]->name, i, gf_inode_type_to_str(replies[i].poststat.ia_type), src_idx, priv->children[src_idx]->name, src_idx, gf_inode_type_to_str(replies[src_idx].poststat.ia_type)); @@ -465,6 +612,7 @@ __afr_selfheal_entry_finalize_source(xlator_t *this, unsigned char *sources, afr_private_t *priv = NULL; int source = -1; int sources_count = 0; + int i = 0; priv = this->private; @@ -478,6 +626,20 @@ __afr_selfheal_entry_finalize_source(xlator_t *this, unsigned char *sources, } source = afr_choose_source_by_policy(priv, sources, AFR_ENTRY_TRANSACTION); + + /*If the selected source does not blame any other brick, then mark + * everything as sink to trigger conservative merge. + */ + if (source != -1 && !AFR_COUNT(healed_sinks, priv->child_count)) { + for (i = 0; i < priv->child_count; i++) { + if (locked_on[i]) { + sources[i] = 0; + healed_sinks[i] = 1; + } + } + return -1; + } + return source; } @@ -548,6 +710,11 @@ afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd, priv = this->private; + if (afr_is_private_directory(priv, fd->inode->gfid, name, + GF_CLIENT_PID_SELF_HEALD)) { + return 0; + } + xattr = dict_new(); if (!xattr) return -ENOMEM; @@ -568,7 +735,7 @@ afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd, ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL, locked_on); { - if (ret < AFR_SH_MIN_PARTICIPANTS) { + if (ret < priv->child_count) { gf_msg_debug(this->name, 0, "%s: Skipping " "entry self-heal as only %d sub-volumes " @@ -596,7 +763,7 @@ afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd, replies); if ((ret == 0) && (priv->esh_granular) && parent_idx_inode) { - ret = afr_shd_index_purge(subvol, parent_idx_inode, name, + ret = afr_shd_entry_purge(subvol, parent_idx_inode, name, inode->ia_type); /* Why is ret force-set to 0? We do not care about * index purge failing for full heal as it is quite @@ -726,10 +893,6 @@ afr_selfheal_entry_do_subvol(call_frame_t *frame, xlator_t *this, fd_t *fd, if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..")) continue; - if (__is_root_gfid(fd->inode->gfid) && - !strcmp(entry->d_name, GF_REPLICATE_TRASH_DIR)) - continue; - ret = afr_selfheal_entry_dirent(iter_frame, this, fd, entry->d_name, loc.inode, subvol, local->need_full_crawl); @@ -792,7 +955,7 @@ afr_selfheal_entry_granular_dirent(xlator_t *subvol, gf_dirent_t *entry, /* The name indices under the pgfid index dir are guaranteed * to be regular files. Hence the hardcoding. */ - afr_shd_index_purge(subvol, parent->inode, entry->d_name, IA_IFREG); + afr_shd_entry_purge(subvol, parent->inode, entry->d_name, IA_IFREG); ret = 0; goto out; } @@ -831,6 +994,8 @@ afr_selfheal_entry_granular(call_frame_t *frame, xlator_t *this, fd_t *fd, subvol = priv->children[subvol_idx]; args.frame = afr_copy_frame(frame); + if (!args.frame) + goto out; args.xl = this; /* args.heal_fd represents the fd associated with the original directory * on which entry heal is being attempted. @@ -849,9 +1014,10 @@ afr_selfheal_entry_granular(call_frame_t *frame, xlator_t *this, fd_t *fd, * do not treat heal as failure. 
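 * (The early returns below were changed into jumps to the new out: label so
 * that the frame created with afr_copy_frame() above is destroyed on every
 * exit path.)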
*/ if (is_src) - return -errno; + ret = -errno; else - return 0; + ret = 0; + goto out; } ret = syncop_dir_scan(subvol, &loc, GF_CLIENT_PID_SELF_HEALD, &args, @@ -861,7 +1027,9 @@ afr_selfheal_entry_granular(call_frame_t *frame, xlator_t *this, fd_t *fd, if (args.mismatch == _gf_true) ret = -1; - +out: + if (args.frame) + AFR_STACK_DESTROY(args.frame); return ret; } @@ -957,7 +1125,7 @@ __afr_selfheal_entry(call_frame_t *frame, xlator_t *this, fd_t *fd, ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL, data_lock); { - if (ret < AFR_SH_MIN_PARTICIPANTS) { + if (ret < priv->child_count) { gf_msg_debug(this->name, 0, "%s: Skipping " "entry self-heal as only %d sub-volumes could " @@ -1013,6 +1181,8 @@ unlock: goto postop_unlock; } + afr_selfheal_restore_time(frame, this, fd->inode, source, healed_sinks, + locked_replies); ret = afr_selfheal_undo_pending( frame, this, fd->inode, sources, sinks, healed_sinks, undid_pending, AFR_ENTRY_TRANSACTION, locked_replies, postop_lock); @@ -1079,7 +1249,7 @@ afr_selfheal_entry(call_frame_t *frame, xlator_t *this, inode_t *inode) ret = afr_selfheal_tie_breaker_entrylk(frame, this, inode, priv->sh_domain, NULL, locked_on); { - if (ret < AFR_SH_MIN_PARTICIPANTS) { + if (ret < priv->child_count) { gf_msg_debug(this->name, 0, "%s: Skipping " "entry self-heal as only %d sub-volumes could " diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index a661fcb5acc..03f43bad16e 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -190,6 +190,59 @@ out: return ret; } +static int +__afr_selfheal_metadata_mark_pending_xattrs(call_frame_t *frame, xlator_t *this, + inode_t *inode, + struct afr_reply *replies, + unsigned char *sources) +{ + int ret = 0; + int i = 0; + int m_idx = 0; + afr_private_t *priv = NULL; + int raw[AFR_NUM_CHANGE_LOGS] = {0}; + dict_t *xattr = NULL; + + priv = this->private; + m_idx = afr_index_for_transaction_type(AFR_METADATA_TRANSACTION); + raw[m_idx] = 1; + + xattr = dict_new(); + if (!xattr) + return -ENOMEM; + + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) + continue; + ret = dict_set_static_bin(xattr, priv->pending_key[i], raw, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) { + ret = -1; + goto out; + } + } + + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + ret = afr_selfheal_post_op(frame, this, inode, i, xattr, NULL); + if (ret < 0) { + gf_msg(this->name, GF_LOG_INFO, -ret, AFR_MSG_SELF_HEAL_INFO, + "Failed to set pending metadata xattr on child %d for %s", i, + uuid_utoa(inode->gfid)); + goto out; + } + } + + afr_replies_wipe(replies, priv->child_count); + ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies); + +out: + if (xattr) + dict_unref(xattr); + return ret; +} + /* * Look for mismatching uid/gid or mode or user xattrs even if * AFR xattrs don't say so, and pick one arbitrarily as winner. 
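 * (With the change in this patch, when every brick is a source but such a
 * mismatch still forces an arbitrary winner, the chosen sources first mark
 * pending metadata xattrs against the remaining bricks via
 * __afr_selfheal_metadata_mark_pending_xattrs, presumably so that an
 * interrupted heal is still detected and retried.)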
*/ @@ -210,6 +263,7 @@ __afr_selfheal_metadata_finalize_source(call_frame_t *frame, xlator_t *this, }; int source = -1; int sources_count = 0; + int ret = 0; priv = this->private; @@ -242,9 +296,11 @@ __afr_selfheal_metadata_finalize_source(call_frame_t *frame, xlator_t *this, if (!priv->metadata_splitbrain_forced_heal) { gf_event(EVENT_AFR_SPLIT_BRAIN, + "client-pid=%d;" "subvol=%s;" "type=metadata;file=%s", - this->name, uuid_utoa(inode->gfid)); + this->ctx->cmd_args.client_pid, this->name, + uuid_utoa(inode->gfid)); return -EIO; } @@ -298,7 +354,13 @@ __afr_selfheal_metadata_finalize_source(call_frame_t *frame, xlator_t *this, healed_sinks[i] = 1; } } - + if ((sources_count == priv->child_count) && (source > -1) && + (AFR_COUNT(healed_sinks, priv->child_count) != 0)) { + ret = __afr_selfheal_metadata_mark_pending_xattrs(frame, this, inode, + replies, sources); + if (ret < 0) + return ret; + } out: afr_mark_active_sinks(this, sources, locked_on, healed_sinks); return source; @@ -396,7 +458,7 @@ afr_selfheal_metadata(call_frame_t *frame, xlator_t *this, inode_t *inode) ret = afr_selfheal_inodelk(frame, this, inode, this->name, LLONG_MAX - 1, 0, data_lock); { - if (ret < AFR_SH_MIN_PARTICIPANTS) { + if (ret < priv->child_count) { ret = -ENOTCONN; goto unlock; } @@ -419,12 +481,8 @@ afr_selfheal_metadata(call_frame_t *frame, xlator_t *this, inode_t *inode) if (ret) goto unlock; - /* Restore atime/mtime for files that don't need data heal as - * restoring timestamps happens only as a part of data-heal. - */ - if (!IA_ISREG(locked_replies[source].poststat.ia_type)) - afr_selfheal_restore_time(frame, this, inode, source, healed_sinks, - locked_replies); + afr_selfheal_restore_time(frame, this, inode, source, healed_sinks, + locked_replies); ret = afr_selfheal_undo_pending( frame, this, inode, sources, sinks, healed_sinks, undid_pending, diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c index c4df5d4609d..834aac86d48 100644 --- a/xlators/cluster/afr/src/afr-self-heal-name.c +++ b/xlators/cluster/afr/src/afr-self-heal-name.c @@ -98,21 +98,12 @@ __afr_selfheal_name_expunge(xlator_t *this, inode_t *parent, uuid_t pargfid, const char *bname, inode_t *inode, struct afr_reply *replies) { - loc_t loc = { - 0, - }; int i = 0; afr_private_t *priv = NULL; - char g[64]; int ret = 0; priv = this->private; - loc.parent = inode_ref(parent); - gf_uuid_copy(loc.pargfid, pargfid); - loc.name = bname; - loc.inode = inode_ref(inode); - for (i = 0; i < priv->child_count; i++) { if (!replies[i].valid) continue; @@ -120,30 +111,10 @@ __afr_selfheal_name_expunge(xlator_t *this, inode_t *parent, uuid_t pargfid, if (replies[i].op_ret) continue; - switch (replies[i].poststat.ia_type) { - case IA_IFDIR: - gf_msg(this->name, GF_LOG_WARNING, 0, - AFR_MSG_EXPUNGING_FILE_OR_DIR, - "expunging dir %s/%s (%s) on %s", uuid_utoa(pargfid), - bname, uuid_utoa_r(replies[i].poststat.ia_gfid, g), - priv->children[i]->name); - - ret |= syncop_rmdir(priv->children[i], &loc, 1, NULL, NULL); - break; - default: - gf_msg(this->name, GF_LOG_WARNING, 0, - AFR_MSG_EXPUNGING_FILE_OR_DIR, - "expunging file %s/%s (%s) on %s", uuid_utoa(pargfid), - bname, uuid_utoa_r(replies[i].poststat.ia_gfid, g), - priv->children[i]->name); - - ret |= syncop_unlink(priv->children[i], &loc, NULL, NULL); - break; - } + ret |= afr_selfheal_entry_delete(this, parent, bname, inode, i, + replies); } - loc_wipe(&loc); - return ret; } @@ -222,13 +193,14 @@ afr_selfheal_name_type_mismatch_check(xlator_t *this, 
struct afr_reply *replies, gf_inode_type_to_str(inode_type), priv->children[type_idx]->name); gf_event(EVENT_AFR_SPLIT_BRAIN, + "client-pid=%d;" "subvol=%s;type=file;" "file=<gfid:%s>/%s;count=2;" "child-%d=%s;type-%d=%s;child-%d=%s;" "type-%d=%s", - this->name, uuid_utoa(pargfid), bname, i, - priv->children[i]->name, i, - gf_inode_type_to_str(inode_type1), type_idx, + this->ctx->cmd_args.client_pid, this->name, + uuid_utoa(pargfid), bname, i, priv->children[i]->name, + i, gf_inode_type_to_str(inode_type1), type_idx, priv->children[type_idx]->name, type_idx, gf_inode_type_to_str(inode_type)); return -EIO; @@ -380,7 +352,7 @@ __afr_selfheal_name_do(call_frame_t *frame, xlator_t *this, inode_t *parent, ret = __afr_selfheal_assign_gfid(this, parent, pargfid, bname, inode, replies, gfid, locked_on, source, sources, is_gfid_absent, &gfid_idx); - if (ret) + if (ret || (gfid_idx < 0)) return ret; ret = __afr_selfheal_name_impunge(frame, this, parent, pargfid, bname, @@ -513,7 +485,7 @@ afr_selfheal_name_do(call_frame_t *frame, xlator_t *this, inode_t *parent, ret = afr_selfheal_entrylk(frame, this, parent, this->name, bname, locked_on); { - if (ret < AFR_SH_MIN_PARTICIPANTS) { + if (ret < priv->child_count) { ret = -ENOTCONN; goto unlock; } @@ -559,13 +531,15 @@ afr_selfheal_name_unlocked_inspect(call_frame_t *frame, xlator_t *this, struct afr_reply *replies = NULL; inode_t *inode = NULL; int first_idx = -1; + afr_local_t *local = NULL; priv = this->private; + local = frame->local; replies = alloca0(sizeof(*replies) * priv->child_count); inode = afr_selfheal_unlocked_lookup_on(frame, parent, bname, replies, - priv->child_up, NULL); + local->child_up, NULL); if (!inode) return -ENOMEM; diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h index 6555ec55771..48e6dbcfb18 100644 --- a/xlators/cluster/afr/src/afr-self-heal.h +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -11,8 +11,6 @@ #ifndef _AFR_SELFHEAL_H #define _AFR_SELFHEAL_H -#define AFR_SH_MIN_PARTICIPANTS 2 - /* Perform fop on all UP subvolumes and wait for all callbacks to return */ #define AFR_ONALL(frame, rfn, fop, args...) 
\ @@ -20,9 +18,8 @@ afr_local_t *__local = frame->local; \ afr_private_t *__priv = frame->this->private; \ int __i = 0, __count = 0; \ - unsigned char *__child_up = NULL; \ + unsigned char *__child_up = alloca(__priv->child_count); \ \ - __child_up = alloca0(__priv->child_count); \ memcpy(__child_up, __priv->child_up, \ sizeof(*__child_up) * __priv->child_count); \ __count = AFR_COUNT(__child_up, __priv->child_count); \ @@ -48,13 +45,16 @@ afr_local_t *__local = frame->local; \ afr_private_t *__priv = frame->this->private; \ int __i = 0; \ - int __count = AFR_COUNT(list, __priv->child_count); \ + int __count = 0; \ + unsigned char *__list = alloca(__priv->child_count); \ \ + memcpy(__list, list, sizeof(*__list) * __priv->child_count); \ + __count = AFR_COUNT(__list, __priv->child_count); \ __local->barrier.waitfor = __count; \ afr_local_replies_wipe(__local, __priv); \ \ for (__i = 0; __i < __priv->child_count; __i++) { \ - if (!list[__i]) \ + if (!__list[__i]) \ continue; \ STACK_WIND_COOKIE(frame, rfn, (void *)(long)__i, \ __priv->children[__i], \ @@ -83,9 +83,9 @@ #define ALLOC_MATRIX(n, type) \ ({ \ - type **__ptr = NULL; \ int __i; \ - __ptr = alloca0(n * sizeof(type *)); \ + type **__ptr = alloca(n * sizeof(type *)); \ + \ for (__i = 0; __i < n; __i++) \ __ptr[__i] = alloca0(n * sizeof(type)); \ __ptr; \ @@ -187,7 +187,7 @@ afr_selfheal_unlocked_discover(call_frame_t *frame, inode_t *inode, uuid_t gfid, int afr_selfheal_unlocked_discover_on(call_frame_t *frame, inode_t *inode, uuid_t gfid, struct afr_reply *replies, - unsigned char *discover_on); + unsigned char *discover_on, dict_t *dict); inode_t * afr_selfheal_unlocked_lookup_on(call_frame_t *frame, inode_t *parent, const char *name, struct afr_reply *replies, @@ -326,7 +326,8 @@ int afr_selfheal_unlocked_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid, inode_t **link_inode, gf_boolean_t *data_selfheal, gf_boolean_t *metadata_selfheal, - gf_boolean_t *entry_selfheal); + gf_boolean_t *entry_selfheal, + struct afr_reply *replies); int afr_selfheal_do(call_frame_t *frame, xlator_t *this, uuid_t gfid); @@ -368,4 +369,9 @@ gf_boolean_t afr_is_file_empty_on_all_children(afr_private_t *priv, struct afr_reply *replies); +int +afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name, + inode_t *inode, int child, struct afr_reply *replies); +int +afr_anon_inode_create(xlator_t *this, int child, inode_t **linked_inode); #endif /* !_AFR_SELFHEAL_H */ diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c index 7eb12070a39..109fd4b7421 100644 --- a/xlators/cluster/afr/src/afr-self-heald.c +++ b/xlators/cluster/afr/src/afr-self-heald.c @@ -94,7 +94,7 @@ __afr_shd_healer_wait(struct subvol_healer *healer) priv = healer->this->private; disabled_loop: - wait_till.tv_sec = time(NULL) + priv->shd.timeout; + wait_till.tv_sec = gf_time() + priv->shd.timeout; while (!healer->rerun) { ret = pthread_cond_timedwait(&healer->cond, &healer->mutex, &wait_till); @@ -222,7 +222,7 @@ out: } int -afr_shd_index_purge(xlator_t *subvol, inode_t *inode, char *name, +afr_shd_entry_purge(xlator_t *subvol, inode_t *inode, char *name, ia_type_t type) { int ret = 0; @@ -371,8 +371,9 @@ afr_shd_sweep_prepare(struct subvol_healer *healer) event->split_brain_count = 0; event->heal_failed_count = 0; - time(&event->start_time); + event->start_time = gf_time(); event->end_time = 0; + _mask_cancellation(); } void @@ -385,8 +386,8 @@ afr_shd_sweep_done(struct subvol_healer *healer) event = &healer->crawl_event; 
shd = &(((afr_private_t *)healer->this->private)->shd); - time(&event->end_time); - history = memdup(event, sizeof(*event)); + event->end_time = gf_time(); + history = gf_memdup(event, sizeof(*event)); event->start_time = 0; if (!history) @@ -394,6 +395,7 @@ afr_shd_sweep_done(struct subvol_healer *healer) if (eh_save_history(shd->statistics[healer->subvol], history) < 0) GF_FREE(history); + _unmask_cancellation(); } int @@ -422,7 +424,7 @@ afr_shd_index_heal(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent, ret = afr_shd_selfheal(healer, healer->subvol, gfid); if (ret == -ENOENT || ret == -ESTALE) - afr_shd_index_purge(subvol, parent->inode, entry->d_name, val); + afr_shd_entry_purge(subvol, parent->inode, entry->d_name, val); if (ret == 2) /* If bricks crashed in pre-op after creating indices/xattrop @@ -522,6 +524,11 @@ afr_shd_full_heal(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent, afr_private_t *priv = NULL; priv = this->private; + + if (this->cleanup_starting) { + return -ENOTCONN; + } + if (!priv->shd.enabled) return -EBUSY; @@ -590,7 +597,9 @@ _afr_shd_ta_get_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata) { afr_private_t *priv = NULL; dict_t *xattr = NULL; - int *raw = NULL; + int raw[AFR_NUM_CHANGE_LOGS] = { + 0, + }; int ret = -1; int i = 0; @@ -602,18 +611,11 @@ _afr_shd_ta_get_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata) "Failed to create dict."); goto out; } - for (i = 0; i < priv->child_count; i++) { - raw = GF_CALLOC(AFR_NUM_CHANGE_LOGS, sizeof(int), gf_afr_mt_int32_t); - if (!raw) - goto out; - - ret = dict_set_bin(xattr, priv->pending_key[i], raw, - AFR_NUM_CHANGE_LOGS * sizeof(int)); - if (ret) { - GF_FREE(raw); + ret = dict_set_static_bin(xattr, priv->pending_key[i], &raw, + AFR_NUM_CHANGE_LOGS * sizeof(int)); + if (ret) goto out; - } } ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], loc, @@ -640,6 +642,7 @@ afr_shd_ta_get_xattrs(xlator_t *this, loc_t *loc, struct subvol_healer *healer, if (afr_shd_fill_ta_loc(this, loc)) { gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, "Failed to populate thin-arbiter loc for: %s.", loc->name); + ret = -1; goto out; } @@ -798,6 +801,218 @@ afr_bricks_available_for_heal(afr_private_t *priv) return _gf_true; } +static gf_boolean_t +afr_shd_ta_needs_heal(xlator_t *this, struct subvol_healer *healer) +{ + dict_t *xdata = NULL; + afr_private_t *priv = NULL; + loc_t loc = { + 0, + }; + int ret = -1; + int i = 0; + gf_boolean_t need_heal = _gf_false; + + priv = this->private; + + ret = afr_shd_fill_ta_loc(this, &loc); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to populate thin-arbiter loc for: %s.", loc.name); + healer->rerun = 1; + goto out; + } + + if (_afr_shd_ta_get_xattrs(this, &loc, &xdata)) { + healer->rerun = 1; + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (afr_ta_dict_contains_pending_xattr(xdata, priv, i)) { + need_heal = _gf_true; + break; + } + } + +out: + if (xdata) + dict_unref(xdata); + loc_wipe(&loc); + + return need_heal; +} + +static int +afr_shd_anon_inode_cleaner(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent, + void *data) +{ + struct subvol_healer *healer = data; + afr_private_t *priv = healer->this->private; + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + int ret = 0; + loc_t loc = {0}; + int count = 0; + int i = 0; + int op_errno = 0; + struct iatt *iatt = NULL; + gf_boolean_t multiple_links = _gf_false; + unsigned char *gfid_present = alloca0(priv->child_count); + unsigned char *entry_present = 
alloca0(priv->child_count); + char *type = "file"; + + frame = afr_frame_create(healer->this, &ret); + if (!frame) { + ret = -ret; + goto out; + } + local = frame->local; + if (AFR_COUNT(local->child_up, priv->child_count) != priv->child_count) { + gf_msg_debug(healer->this->name, 0, + "Not all bricks are up. Skipping " + "cleanup of %s on %s", + entry->d_name, subvol->name); + ret = 0; + goto out; + } + + loc.inode = inode_new(parent->inode->table); + if (!loc.inode) { + ret = -ENOMEM; + goto out; + } + ret = gf_uuid_parse(entry->d_name, loc.gfid); + if (ret) { + ret = 0; + goto out; + } + AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup, &loc, + NULL); + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].op_ret == 0) { + count++; + gfid_present[i] = 1; + iatt = &local->replies[i].poststat; + if (iatt->ia_type == IA_IFDIR) { + type = "dir"; + } + + if (i == healer->subvol) { + if (local->replies[i].poststat.ia_nlink > 1) { + multiple_links = _gf_true; + } + } + } else if (local->replies[i].op_errno != ENOENT && + local->replies[i].op_errno != ESTALE) { + /*We don't have complete view. Skip the entry*/ + gf_msg_debug(healer->this->name, local->replies[i].op_errno, + "Skipping cleanup of %s on %s", entry->d_name, + subvol->name); + ret = 0; + goto out; + } + } + + /*Inode is deleted from subvol*/ + if (count == 1 || (iatt->ia_type != IA_IFDIR && multiple_links)) { + gf_msg(healer->this->name, GF_LOG_WARNING, 0, + AFR_MSG_EXPUNGING_FILE_OR_DIR, "expunging %s %s/%s on %s", type, + priv->anon_inode_name, entry->d_name, subvol->name); + ret = afr_shd_entry_purge(subvol, parent->inode, entry->d_name, + iatt->ia_type); + if (ret == -ENOENT || ret == -ESTALE) + ret = 0; + } else if (count > 1) { + loc_wipe(&loc); + loc.parent = inode_ref(parent->inode); + loc.name = entry->d_name; + loc.inode = inode_new(parent->inode->table); + if (!loc.inode) { + ret = -ENOMEM; + goto out; + } + AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup, + &loc, NULL); + count = 0; + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].op_ret == 0) { + count++; + entry_present[i] = 1; + iatt = &local->replies[i].poststat; + } else if (local->replies[i].op_errno != ENOENT && + local->replies[i].op_errno != ESTALE) { + /*We don't have complete view. 
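+ A brick that is reachable but failed the lookup with an unexpected error
+ might still hold a valid copy of this gfid, so purging now would be unsafe.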
Skip the entry*/ + gf_msg_debug(healer->this->name, local->replies[i].op_errno, + "Skipping cleanup of %s on %s", entry->d_name, + subvol->name); + ret = 0; + goto out; + } + } + for (i = 0; i < priv->child_count; i++) { + if (gfid_present[i] && !entry_present[i]) { + /*Entry is not anonymous on at least one subvol*/ + gf_msg_debug(healer->this->name, 0, + "Valid entry present on %s " + "Skipping cleanup of %s on %s", + priv->children[i]->name, entry->d_name, + subvol->name); + ret = 0; + goto out; + } + } + + gf_msg(healer->this->name, GF_LOG_WARNING, 0, + AFR_MSG_EXPUNGING_FILE_OR_DIR, + "expunging %s %s/%s on all subvols", type, priv->anon_inode_name, + entry->d_name); + ret = 0; + for (i = 0; i < priv->child_count; i++) { + op_errno = -afr_shd_entry_purge(priv->children[i], loc.parent, + entry->d_name, iatt->ia_type); + if (op_errno != ENOENT && op_errno != ESTALE) { + ret |= -op_errno; + } + } + } + +out: + if (frame) + AFR_STACK_DESTROY(frame); + loc_wipe(&loc); + return ret; +} + +static void +afr_cleanup_anon_inode_dir(struct subvol_healer *healer) +{ + int ret = 0; + call_frame_t *frame = NULL; + afr_private_t *priv = healer->this->private; + loc_t loc = {0}; + + ret = afr_anon_inode_create(healer->this, healer->subvol, &loc.inode); + if (ret) + goto out; + + frame = afr_frame_create(healer->this, &ret); + if (!frame) { + ret = -ret; + goto out; + } + + ret = syncop_mt_dir_scan(frame, priv->children[healer->subvol], &loc, + GF_CLIENT_PID_SELF_HEALD, healer, + afr_shd_anon_inode_cleaner, NULL, + priv->shd.max_threads, priv->shd.wait_qlength); +out: + if (frame) + AFR_STACK_DESTROY(frame); + loc_wipe(&loc); + return; +} + void * afr_shd_index_healer(void *data) { @@ -824,7 +1039,8 @@ afr_shd_index_healer(void *data) priv->local[healer->subvol] = healer->local; if (priv->thin_arbiter_count) { - afr_shd_ta_get_xattrs(this, &loc, healer, &pre_crawl_xdata); + if (afr_shd_ta_needs_heal(this, healer)) + afr_shd_ta_get_xattrs(this, &loc, healer, &pre_crawl_xdata); } do { @@ -854,9 +1070,17 @@ afr_shd_index_healer(void *data) sleep(1); } while (ret > 0); - if (pre_crawl_xdata && !healer->crawl_event.heal_failed_count) { + if (ret == 0) { + afr_cleanup_anon_inode_dir(healer); + } + + if (ret == 0 && pre_crawl_xdata && + !healer->crawl_event.heal_failed_count) { afr_shd_ta_check_and_unset_xattrs(this, &loc, healer, pre_crawl_xdata); + } + + if (pre_crawl_xdata) { dict_unref(pre_crawl_xdata); pre_crawl_xdata = NULL; } @@ -974,7 +1198,7 @@ afr_shd_dict_add_crawl_event(xlator_t *this, dict_t *output, { int ret = 0; uint64_t count = 0; - char key[256] = {0}; + char key[128] = {0}; int keylen = 0; char suffix[64] = {0}; int xl_id = 0; @@ -1099,9 +1323,9 @@ afr_shd_dict_add_path(xlator_t *this, dict_t *output, int child, char *path, { int ret = -1; uint64_t count = 0; - char key[256] = {0}; + char key[64] = {0}; int keylen = 0; - char xl_id_child_str[64] = {0}; + char xl_id_child_str[32] = {0}; int xl_id = 0; ret = dict_get_int32(output, this->name, &xl_id); @@ -1258,12 +1482,18 @@ out: return ret; } -int -afr_selfheal_childup(xlator_t *this, int subvol) +void +afr_selfheal_childup(xlator_t *this, afr_private_t *priv) { - afr_shd_index_healer_spawn(this, subvol); + int subvol = 0; - return 0; + if (!priv->shd.iamshd) + return; + for (subvol = 0; subvol < priv->child_count; subvol++) + if (priv->child_up[subvol]) + afr_shd_index_healer_spawn(this, subvol); + + return; } int @@ -1318,19 +1548,40 @@ afr_xl_op(xlator_t *this, dict_t *input, dict_t *output) int op_ret = 0; uint64_t cnt = 0; +#define 
AFR_SET_DICT_AND_LOG(name, output, key, keylen, dict_str, \ + dict_str_len) \ + { \ + int ret; \ + \ + ret = dict_set_nstrn(output, key, keylen, dict_str, dict_str_len); \ + if (ret) { \ + gf_smsg(name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, \ + "key=%s", key, "value=%s", dict_str, NULL); \ + } \ + } + priv = this->private; shd = &priv->shd; ret = dict_get_int32_sizen(input, "xl-op", (int32_t *)&op); - if (ret) + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED, + "key=xl-op", NULL); goto out; + } this_name_len = strlen(this->name); ret = dict_get_int32n(input, this->name, this_name_len, &xl_id); - if (ret) + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED, + "key=%s", this->name, NULL); goto out; + } ret = dict_set_int32n(output, this->name, this_name_len, xl_id); - if (ret) + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "key=%s", this->name, NULL); goto out; + } switch (op) { case GF_SHD_OP_HEAL_INDEX: op_ret = 0; @@ -1340,23 +1591,30 @@ afr_xl_op(xlator_t *this, dict_t *input, dict_t *output) keylen = snprintf(key, sizeof(key), "%d-%d-status", xl_id, i); if (!priv->child_up[i]) { - ret = dict_set_nstrn(output, key, keylen, + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, SBRICK_NOT_CONNECTED, SLEN(SBRICK_NOT_CONNECTED)); op_ret = -1; } else if (AFR_COUNT(priv->child_up, priv->child_count) < 2) { - ret = dict_set_nstrn(output, key, keylen, + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, SLESS_THAN2_BRICKS_in_REP, SLEN(SLESS_THAN2_BRICKS_in_REP)); op_ret = -1; } else if (!afr_shd_is_subvol_local(this, healer->subvol)) { - ret = dict_set_nstrn(output, key, keylen, SBRICK_IS_REMOTE, + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, + SBRICK_IS_REMOTE, SLEN(SBRICK_IS_REMOTE)); } else { - ret = dict_set_nstrn(output, key, keylen, + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, SSTARTED_SELF_HEAL, SLEN(SSTARTED_SELF_HEAL)); - afr_shd_index_healer_spawn(this, i); + + ret = afr_shd_index_healer_spawn(this, i); + + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, + AFR_MSG_HEALER_SPAWN_FAILED, NULL); + } } } break; @@ -1368,21 +1626,28 @@ afr_xl_op(xlator_t *this, dict_t *input, dict_t *output) keylen = snprintf(key, sizeof(key), "%d-%d-status", xl_id, i); if (!priv->child_up[i]) { - ret = dict_set_nstrn(output, key, keylen, + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, SBRICK_NOT_CONNECTED, SLEN(SBRICK_NOT_CONNECTED)); } else if (AFR_COUNT(priv->child_up, priv->child_count) < 2) { - ret = dict_set_nstrn(output, key, keylen, + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, SLESS_THAN2_BRICKS_in_REP, SLEN(SLESS_THAN2_BRICKS_in_REP)); } else if (!afr_shd_is_subvol_local(this, healer->subvol)) { - ret = dict_set_nstrn(output, key, keylen, SBRICK_IS_REMOTE, + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, + SBRICK_IS_REMOTE, SLEN(SBRICK_IS_REMOTE)); } else { - ret = dict_set_nstrn(output, key, keylen, + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, SSTARTED_SELF_HEAL, SLEN(SSTARTED_SELF_HEAL)); - afr_shd_full_healer_spawn(this, i); + + ret = afr_shd_full_healer_spawn(this, i); + + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, + AFR_MSG_HEALER_SPAWN_FAILED, NULL); + } op_ret = 0; } } @@ -1390,24 +1655,25 @@ afr_xl_op(xlator_t *this, dict_t *input, dict_t *output) case GF_SHD_OP_INDEX_SUMMARY: /* this case has been handled in glfs-heal.c */ break; - case GF_SHD_OP_HEALED_FILES: - case GF_SHD_OP_HEAL_FAILED_FILES: - for (i = 0; i < 
priv->child_count; i++) { - keylen = snprintf(key, sizeof(key), "%d-%d-status", xl_id, i); - ret = dict_set_nstrn(output, key, keylen, SOP_NOT_SUPPORTED, - SLEN(SOP_NOT_SUPPORTED)); - } - break; case GF_SHD_OP_SPLIT_BRAIN_FILES: eh_dump(shd->split_brain, output, afr_add_shd_event); break; case GF_SHD_OP_STATISTICS: for (i = 0; i < priv->child_count; i++) { eh_dump(shd->statistics[i], output, afr_add_crawl_event); - afr_shd_dict_add_crawl_event( + ret = afr_shd_dict_add_crawl_event( this, output, &shd->index_healers[i].crawl_event); - afr_shd_dict_add_crawl_event(this, output, - &shd->full_healers[i].crawl_event); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, + AFR_MSG_ADD_CRAWL_EVENT_FAILED, NULL); + } + + ret = afr_shd_dict_add_crawl_event( + this, output, &shd->full_healers[i].crawl_event); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, + AFR_MSG_ADD_CRAWL_EVENT_FAILED, NULL); + } } break; case GF_SHD_OP_STATISTICS_HEAL_COUNT: @@ -1418,7 +1684,7 @@ afr_xl_op(xlator_t *this, dict_t *input, dict_t *output) if (!priv->child_up[i]) { keylen = snprintf(key, sizeof(key), "%d-%d-status", xl_id, i); - ret = dict_set_nstrn(output, key, keylen, + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, SBRICK_NOT_CONNECTED, SLEN(SBRICK_NOT_CONNECTED)); } else { @@ -1427,6 +1693,10 @@ afr_xl_op(xlator_t *this, dict_t *input, dict_t *output) if (ret == 0) { ret = dict_set_uint64(output, key, cnt); } + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, + AFR_MSG_DICT_SET_FAILED, NULL); + } op_ret = 0; } } @@ -1434,11 +1704,13 @@ afr_xl_op(xlator_t *this, dict_t *input, dict_t *output) break; default: - gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_ARG, - "Unknown set op %d", op); + gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_ARG, "op=%d", + op, NULL); break; } out: dict_deln(output, this->name, this_name_len); return op_ret; + +#undef AFR_SET_DICT_AND_LOG } diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h index 7de7c431460..18db728ea7b 100644 --- a/xlators/cluster/afr/src/afr-self-heald.h +++ b/xlators/cluster/afr/src/afr-self-heald.h @@ -14,12 +14,11 @@ #include <pthread.h> typedef struct { - int child; char *path; + int child; } shd_event_t; typedef struct { - int child; uint64_t healed_count; uint64_t split_brain_count; uint64_t heal_failed_count; @@ -31,38 +30,36 @@ typedef struct { cralwer is in progress */ time_t end_time; char *crawl_type; + int child; } crawl_event_t; struct subvol_healer { xlator_t *this; - int subvol; - gf_boolean_t local; - gf_boolean_t running; - gf_boolean_t rerun; crawl_event_t crawl_event; pthread_mutex_t mutex; pthread_cond_t cond; pthread_t thread; + int subvol; + gf_boolean_t local; + gf_boolean_t running; + gf_boolean_t rerun; }; typedef struct { - gf_boolean_t iamshd; - gf_boolean_t enabled; - int timeout; struct subvol_healer *index_healers; struct subvol_healer *full_healers; eh_t *split_brain; eh_t **statistics; + int timeout; uint32_t max_threads; uint32_t wait_qlength; uint32_t halo_max_latency_msec; + gf_boolean_t iamshd; + gf_boolean_t enabled; } afr_self_heald_t; int -afr_selfheal_childup(xlator_t *this, int subvol); - -int afr_selfheal_daemon_init(xlator_t *this); int @@ -73,6 +70,6 @@ afr_shd_gfid_to_path(xlator_t *this, xlator_t *subvol, uuid_t gfid, char **path_p); int -afr_shd_index_purge(xlator_t *subvol, inode_t *inode, char *name, +afr_shd_entry_purge(xlator_t *subvol, inode_t *inode, char *name, ia_type_t type); #endif /* !_AFR_SELF_HEALD_H */ diff --git 
a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index 229820ba26a..a51f79b1f43 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -32,7 +32,7 @@ static void afr_post_op_handle_success(call_frame_t *frame, xlator_t *this); static void -afr_post_op_handle_failure(call_frame_t *frame, xlator_t *this); +afr_post_op_handle_failure(call_frame_t *frame, xlator_t *this, int op_errno); void __afr_transaction_wake_shared(afr_local_t *local, struct list_head *shared); @@ -74,6 +74,14 @@ afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this); static void afr_changelog_post_op_fail(call_frame_t *frame, xlator_t *this, int op_errno); +void +afr_ta_locked_priv_invalidate(afr_private_t *priv) +{ + priv->ta_bad_child_index = AFR_CHILD_UNKNOWN; + priv->release_ta_notify_dom_lock = _gf_false; + priv->ta_notify_dom_lock_offset = 0; +} + static void afr_ta_process_waitq(xlator_t *this) { @@ -116,9 +124,9 @@ afr_release_notify_lock_for_ta(void *opaque) this = (xlator_t *)opaque; priv = this->private; - ret = afr_fill_ta_loc(this, &loc); + ret = afr_fill_ta_loc(this, &loc, _gf_true); if (ret) { - gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, "Failed to populate loc for thin-arbiter."); goto out; } @@ -127,17 +135,16 @@ afr_release_notify_lock_for_ta(void *opaque) flock.l_len = 1; ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], AFR_TA_DOM_NOTIFY, &loc, F_SETLK, &flock, NULL, NULL); - if (!ret) { - LOCK(&priv->lock); - priv->ta_bad_child_index = AFR_CHILD_UNKNOWN; - priv->release_ta_notify_dom_lock = _gf_false; - priv->ta_notify_dom_lock_offset = 0; - UNLOCK(&priv->lock); - } else { + if (ret) { gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, "Failed to unlock AFR_TA_DOM_NOTIFY lock."); } + LOCK(&priv->lock); + { + afr_ta_locked_priv_invalidate(priv); + } + UNLOCK(&priv->lock); out: loc_wipe(&loc); return ret; @@ -372,6 +379,8 @@ afr_transaction_done(call_frame_t *frame, xlator_t *this) } local->transaction.unwind(frame, this); + GF_ASSERT(list_empty(&local->transaction.owner_list)); + GF_ASSERT(list_empty(&local->transaction.wait_list)); AFR_STACK_DESTROY(frame); return 0; @@ -393,7 +402,7 @@ afr_lock_fail_shared(afr_local_t *local, struct list_head *list) } static void -afr_handle_lock_acquire_failure(afr_local_t *local, gf_boolean_t locked) +afr_handle_lock_acquire_failure(afr_local_t *local) { struct list_head shared; afr_lock_t *lock = NULL; @@ -414,13 +423,8 @@ afr_handle_lock_acquire_failure(afr_local_t *local, gf_boolean_t locked) afr_lock_fail_shared(local, &shared); local->transaction.do_eager_unlock = _gf_true; out: - if (locked) { - local->internal_lock.lock_cbk = afr_transaction_done; - afr_unlock(local->transaction.frame, local->transaction.frame->this); - } else { - afr_transaction_done(local->transaction.frame, - local->transaction.frame->this); - } + local->internal_lock.lock_cbk = afr_transaction_done; + afr_unlock(local->transaction.frame, local->transaction.frame->this); } call_frame_t * @@ -517,42 +521,6 @@ afr_compute_pre_op_sources(call_frame_t *frame, xlator_t *this) local->transaction.pre_op_sources[j] = 0; } -gf_boolean_t -afr_has_arbiter_fop_cbk_quorum(call_frame_t *frame) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - xlator_t *this = NULL; - gf_boolean_t fop_failed = _gf_false; - unsigned char *pre_op_sources = NULL; - int i = 0; - - local = frame->local; - this = 
frame->this; - priv = this->private; - pre_op_sources = local->transaction.pre_op_sources; - - /* If the fop failed on the brick, it is not a source. */ - for (i = 0; i < priv->child_count; i++) - if (local->transaction.failed_subvols[i]) - pre_op_sources[i] = 0; - - switch (AFR_COUNT(pre_op_sources, priv->child_count)) { - case 1: - if (pre_op_sources[ARBITER_BRICK_INDEX]) - fop_failed = _gf_true; - break; - case 0: - fop_failed = _gf_true; - break; - } - - if (fop_failed) - return _gf_false; - - return _gf_true; -} - void afr_txn_arbitrate_fop(call_frame_t *frame, xlator_t *this) { @@ -619,7 +587,7 @@ afr_transaction_perform_fop(call_frame_t *frame, xlator_t *this) failure_count = AFR_COUNT(local->transaction.failed_subvols, priv->child_count); if (failure_count == priv->child_count) { - afr_handle_lock_acquire_failure(local, _gf_true); + afr_handle_lock_acquire_failure(local); return 0; } else { lock = &local->inode_ctx->lock[local->transaction.type]; @@ -668,7 +636,7 @@ afr_set_pending_dict(afr_private_t *priv, dict_t *xattr, int **pending) } static void -afr_ta_dom_lock_check_and_release(afr_local_t *local, xlator_t *this) +afr_ta_dom_lock_check_and_release(afr_ta_fop_state_t fop_state, xlator_t *this) { afr_private_t *priv = this->private; unsigned int inmem_count = 0; @@ -678,17 +646,25 @@ afr_ta_dom_lock_check_and_release(afr_local_t *local, xlator_t *this) LOCK(&priv->lock); { /*Once we get notify lock release upcall notification, - if two fop states are non empty/non zero, we will - not release lock. - 1 - If anything in memory txn - 2 - If anything in onwire or onwireq + if any of the fop state counters are non-zero, we will + not release the lock. */ - if (local->fop_state == TA_INFO_IN_MEMORY_SUCCESS) { - inmem_count = --priv->ta_in_mem_txn_count; - } else { - inmem_count = priv->ta_in_mem_txn_count; - } onwire_count = priv->ta_on_wire_txn_count; + inmem_count = priv->ta_in_mem_txn_count; + switch (fop_state) { + case TA_GET_INFO_FROM_TA_FILE: + onwire_count = --priv->ta_on_wire_txn_count; + break; + case TA_INFO_IN_MEMORY_SUCCESS: + case TA_INFO_IN_MEMORY_FAILED: + inmem_count = --priv->ta_in_mem_txn_count; + break; + case TA_WAIT_FOR_NOTIFY_LOCK_REL: + GF_ASSERT(0); + break; + case TA_SUCCESS: + break; + } release = priv->release_ta_notify_dom_lock; } UNLOCK(&priv->lock); @@ -700,7 +676,7 @@ afr_ta_dom_lock_check_and_release(afr_local_t *local, xlator_t *this) } static void -afr_ta_process_onwireq(afr_local_t *local, xlator_t *this) +afr_ta_process_onwireq(afr_ta_fop_state_t fop_state, xlator_t *this) { afr_private_t *priv = this->private; afr_local_t *entry = NULL; @@ -713,15 +689,6 @@ afr_ta_process_onwireq(afr_local_t *local, xlator_t *this) LOCK(&priv->lock); { - if (--priv->ta_on_wire_txn_count == 0) { - UNLOCK(&priv->lock); - /*Only one write fop came and after taking notify - *lock and before doing xattrop, it has received - *lock contention upcall, so this is the only place - *to find this out and release the lock*/ - afr_ta_dom_lock_check_and_release(local, this); - return; - } bad_child = priv->ta_bad_child_index; if (bad_child == AFR_CHILD_UNKNOWN) { /*The previous on-wire ta_post_op was a failure. 
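Without a reply from the thin-arbiter we do not know which data brick it
considers bad, so the queued transactions cannot be classified as success or
failure from memory.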
Just dequeue @@ -742,13 +709,10 @@ afr_ta_process_onwireq(afr_local_t *local, xlator_t *this) while (!list_empty(&onwireq)) { entry = list_entry(onwireq.next, afr_local_t, ta_onwireq); list_del_init(&entry->ta_onwireq); - LOCK(&priv->lock); - --priv->ta_on_wire_txn_count; - UNLOCK(&priv->lock); if (entry->ta_failed_subvol == bad_child) { afr_post_op_handle_success(entry->transaction.frame, this); } else { - afr_post_op_handle_failure(entry->transaction.frame, this); + afr_post_op_handle_failure(entry->transaction.frame, this, EIO); } } } @@ -767,7 +731,7 @@ afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this) if (priv->thin_arbiter_count) { /*fop should not come here with TA_WAIT_FOR_NOTIFY_LOCK_REL state */ - afr_ta_dom_lock_check_and_release(frame->local, this); + afr_ta_dom_lock_check_and_release(local->fop_state, this); } /* Fail the FOP if post-op did not succeed on quorum no. of bricks. */ @@ -873,7 +837,7 @@ afr_has_quorum(unsigned char *subvols, xlator_t *this, call_frame_t *frame) priv = this->private; up_children_count = AFR_COUNT(subvols, priv->child_count); - if (afr_lookup_has_quorum(frame, this, subvols)) + if (afr_lookup_has_quorum(frame, up_children_count)) return _gf_true; if (priv->quorum_count == AFR_QUORUM_AUTO) { @@ -971,12 +935,8 @@ afr_need_dirty_marking(call_frame_t *frame, xlator_t *this) priv->child_count) return _gf_false; - if (priv->arbiter_count) { - if (!afr_has_arbiter_fop_cbk_quorum(frame)) - need_dirty = _gf_true; - } else if (!afr_has_fop_cbk_quorum(frame)) { + if (!afr_has_fop_cbk_quorum(frame)) need_dirty = _gf_true; - } return need_dirty; } @@ -1026,12 +986,8 @@ afr_handle_quorum(call_frame_t *frame, xlator_t *this) * no split-brain with the fix. The problem is eliminated completely. */ - if (priv->arbiter_count) { - if (afr_has_arbiter_fop_cbk_quorum(frame)) - return; - } else if (afr_has_fop_cbk_quorum(frame)) { + if (afr_has_fop_cbk_quorum(frame)) return; - } if (afr_need_dirty_marking(frame, this)) goto set_response; @@ -1073,7 +1029,7 @@ set_response: } int -afr_fill_ta_loc(xlator_t *this, loc_t *loc) +afr_fill_ta_loc(xlator_t *this, loc_t *loc, gf_boolean_t is_gfid_based_fop) { afr_private_t *priv = NULL; @@ -1081,6 +1037,11 @@ afr_fill_ta_loc(xlator_t *this, loc_t *loc) loc->parent = inode_ref(priv->root_inode); gf_uuid_copy(loc->pargfid, loc->parent->gfid); loc->name = priv->pending_key[THIN_ARBITER_BRICK_INDEX]; + if (is_gfid_based_fop && gf_uuid_is_null(priv->ta_gfid)) { + /* Except afr_ta_id_file_check() which is path based, all other gluster + * FOPS need gfid.*/ + return -EINVAL; + } gf_uuid_copy(loc->gfid, priv->ta_gfid); loc->inode = inode_new(loc->parent->table); if (!loc->inode) { @@ -1090,97 +1051,28 @@ afr_fill_ta_loc(xlator_t *this, loc_t *loc) return 0; } -int -afr_changelog_thin_arbiter_post_op(xlator_t *this, afr_local_t *local) -{ - int ret = 0; - afr_private_t *priv = NULL; - dict_t *xattr = NULL; - int failed_count = 0; - struct gf_flock flock = { - 0, - }; - loc_t loc = { - 0, - }; - int i = 0; - - priv = this->private; - if (!priv->thin_arbiter_count) - return 0; - - failed_count = AFR_COUNT(local->transaction.failed_subvols, - priv->child_count); - if (!failed_count) - return 0; - - GF_ASSERT(failed_count == 1); - ret = afr_fill_ta_loc(this, &loc); - if (ret) { - gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, - "Failed to populate thin-arbiter loc for: %s.", loc.name); - goto out; - } - - xattr = dict_new(); - if (!xattr) { - ret = -ENOMEM; - goto out; - } - for (i = 0; i < priv->child_count; i++) { - 
ret = dict_set_static_bin(xattr, priv->pending_key[i], - local->pending[i], - AFR_NUM_CHANGE_LOGS * sizeof(int)); - if (ret) - goto out; - } - - flock.l_type = F_WRLCK; - flock.l_start = 0; - flock.l_len = 0; - - /*TODO: Convert to two domain locking. */ - ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], - AFR_TA_DOM_NOTIFY, &loc, F_SETLKW, &flock, NULL, NULL); - if (ret) - goto out; - - ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], &loc, - GF_XATTROP_ADD_ARRAY, xattr, NULL, NULL, NULL); - - if (ret == -EINVAL) { - gf_msg(this->name, GF_LOG_INFO, -ret, AFR_MSG_THIN_ARB, - "Thin-arbiter has denied post-op on %s for gfid %s.", - priv->pending_key[THIN_ARBITER_BRICK_INDEX], - uuid_utoa(local->inode->gfid)); - - } else if (ret) { - gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, - "Post-op on thin-arbiter id file %s failed for gfid %s.", - priv->pending_key[THIN_ARBITER_BRICK_INDEX], - uuid_utoa(local->inode->gfid)); - } - flock.l_type = F_UNLCK; - syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], AFR_TA_DOM_NOTIFY, - &loc, F_SETLK, &flock, NULL, NULL); -out: - if (xattr) - dict_unref(xattr); - - return ret; -} - static int afr_ta_post_op_done(int ret, call_frame_t *frame, void *opaque) { xlator_t *this = NULL; afr_local_t *local = NULL; + call_frame_t *txn_frame = NULL; + afr_ta_fop_state_t fop_state; local = (afr_local_t *)opaque; + fop_state = local->fop_state; + txn_frame = local->transaction.frame; this = frame->this; + if (ret == 0) { + /*Mark pending xattrs on the up data brick.*/ + afr_post_op_handle_success(txn_frame, this); + } else { + afr_post_op_handle_failure(txn_frame, this, -ret); + } + STACK_DESTROY(frame->root); - afr_ta_process_onwireq(local, this); + afr_ta_process_onwireq(fop_state, this); return 0; } @@ -1191,6 +1083,7 @@ afr_set_changelog_xattr(afr_private_t *priv, unsigned char *pending, { int **changelog = NULL; int idx = 0; + int ret = 0; int i; if (local->is_new_entry == _gf_true) { @@ -1206,20 +1099,36 @@ afr_set_changelog_xattr(afr_private_t *priv, unsigned char *pending, if (local->transaction.failed_subvols[i]) changelog[i][idx] = hton32(1); } - afr_set_pending_dict(priv, xattr, changelog); + ret = afr_set_pending_dict(priv, xattr, changelog); + if (ret < 0) { + afr_matrix_cleanup(changelog, priv->child_count); + return NULL; + } } out: return changelog; } +static void +afr_ta_locked_xattrop_validate(afr_private_t *priv, afr_local_t *local, + gf_boolean_t *valid) +{ + if (priv->ta_event_gen > local->ta_event_gen) { + /* We can't trust the ta's response anymore.*/ + afr_ta_locked_priv_invalidate(priv); + *valid = _gf_false; + return; + } + return; +} + static int afr_ta_post_op_do(void *opaque) { afr_local_t *local = NULL; afr_private_t *priv = NULL; xlator_t *this = NULL; - call_frame_t *txn_frame = NULL; dict_t *xattr = NULL; unsigned char *pending = NULL; int **changelog = NULL; @@ -1230,15 +1139,15 @@ afr_ta_post_op_do(void *opaque) }; int i = 0; int ret = 0; + gf_boolean_t valid = _gf_true; local = (afr_local_t *)opaque; - txn_frame = local->transaction.frame; - this = txn_frame->this; + this = local->transaction.frame->this; priv = this->private; - ret = afr_fill_ta_loc(this, &loc); + ret = afr_fill_ta_loc(this, &loc, _gf_true); if (ret) { - gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, "Failed to populate loc for thin-arbiter."); goto out; } @@ -1262,8 +1171,10 @@ afr_ta_post_op_do(void *opaque) changelog = afr_set_changelog_xattr(priv, 
pending, xattr, local); - if (!changelog) + if (!changelog) { + ret = -ENOMEM; goto out; + } ret = afr_ta_post_op_lock(this, &loc); if (ret) @@ -1271,22 +1182,31 @@ afr_ta_post_op_do(void *opaque) ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], &loc, GF_XATTROP_ADD_ARRAY, xattr, NULL, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Post-op on thin-arbiter id file %s failed for gfid %s.", + priv->pending_key[THIN_ARBITER_BRICK_INDEX], + uuid_utoa(local->inode->gfid)); + } LOCK(&priv->lock); { if (ret == 0) { priv->ta_bad_child_index = failed_subvol; } else if (ret == -EINVAL) { priv->ta_bad_child_index = success_subvol; + ret = -EIO; /* TA failed the fop. Return EIO to application. */ } + + afr_ta_locked_xattrop_validate(priv, local, &valid); } UNLOCK(&priv->lock); - if (ret) { - gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, - "Post-op on thin-arbiter id file %s failed for gfid %s.", + if (valid == _gf_false) { + gf_msg(this->name, GF_LOG_ERROR, EIO, AFR_MSG_THIN_ARB, + "Post-op on thin-arbiter id file %s for gfid %s invalidated due " + "to event-gen mismatch.", priv->pending_key[THIN_ARBITER_BRICK_INDEX], uuid_utoa(local->inode->gfid)); - if (ret == -EINVAL) - ret = -EIO; /* TA failed the fop. Return EIO to application. */ + ret = -EIO; } afr_ta_post_op_unlock(this, &loc); @@ -1299,12 +1219,6 @@ out: loc_wipe(&loc); - if (ret == 0) { - /*Mark pending xattrs on the up data brick.*/ - afr_post_op_handle_success(local->transaction.frame, this); - } else { - afr_post_op_handle_failure(local->transaction.frame, this); - } return ret; } @@ -1363,6 +1277,7 @@ afr_ta_set_fop_state(afr_private_t *priv, afr_local_t *local, /* Post-op on TA not needed as the fop succeeded only on the * in-memory bad data brick and not the good one. 
Fail the fop.*/ local->fop_state = TA_INFO_IN_MEMORY_FAILED; + priv->ta_in_mem_txn_count++; } } UNLOCK(&priv->lock); @@ -1396,9 +1311,9 @@ afr_post_op_handle_success(call_frame_t *frame, xlator_t *this) } static void -afr_post_op_handle_failure(call_frame_t *frame, xlator_t *this) +afr_post_op_handle_failure(call_frame_t *frame, xlator_t *this, int op_errno) { - afr_changelog_post_op_fail(frame, this, EIO); + afr_changelog_post_op_fail(frame, this, op_errno); return; } @@ -1428,7 +1343,7 @@ afr_ta_decide_post_op_state(call_frame_t *frame, xlator_t *this) afr_post_op_handle_success(frame, this); break; case TA_INFO_IN_MEMORY_FAILED: - afr_post_op_handle_failure(frame, this); + afr_post_op_handle_failure(frame, this, EIO); break; default: break; @@ -2092,7 +2007,7 @@ err: local->op_ret = -1; local->op_errno = op_errno; - afr_handle_lock_acquire_failure(local, _gf_true); + afr_handle_lock_acquire_failure(local); if (xdata_req) dict_unref(xdata_req); @@ -2361,7 +2276,7 @@ afr_internal_lock_finish(call_frame_t *frame, xlator_t *this) } else { lock = &local->inode_ctx->lock[local->transaction.type]; if (local->internal_lock.lock_op_ret < 0) { - afr_handle_lock_acquire_failure(local, _gf_false); + afr_handle_lock_acquire_failure(local); } else { lock->event_generation = local->event_generation; afr_changelog_pre_op(frame, this); @@ -2432,8 +2347,13 @@ afr_is_delayed_changelog_post_op_needed(call_frame_t *frame, xlator_t *this, goto out; } - if ((local->op != GF_FOP_WRITE) && (local->op != GF_FOP_FXATTROP)) { - /*Only allow writes but shard does [f]xattrops on writes, so + if (local->transaction.disable_delayed_post_op) { + goto out; + } + + if ((local->op != GF_FOP_WRITE) && (local->op != GF_FOP_FXATTROP) && + (local->op != GF_FOP_FSYNC)) { + /*Only allow writes/fsyncs but shard does [f]xattrops on writes, so * they are fine too*/ goto out; } @@ -2560,8 +2480,10 @@ afr_changelog_fsync(call_frame_t *frame, xlator_t *this) local->call_count = call_count; xdata = dict_new(); - if (xdata) + if (xdata) { ret = dict_set_int32_sizen(xdata, "batch-fsync", 1); + ret = dict_set_str(xdata, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); + } for (i = 0; i < priv->child_count; i++) { if (!local->transaction.pre_op[i]) diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 33258a048a4..df7366f0a65 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -135,12 +135,34 @@ set_data_self_heal_algorithm(afr_private_t *priv, char *algo) } } +void +afr_handle_anon_inode_options(afr_private_t *priv, dict_t *options) +{ + char *volfile_id_str = NULL; + uuid_t anon_inode_gfid = {0}; + + /*If volume id is not present don't enable anything*/ + if (dict_get_str(options, "volume-id", &volfile_id_str)) + return; + GF_ASSERT(strlen(AFR_ANON_DIR_PREFIX) + strlen(volfile_id_str) <= NAME_MAX); + /*anon_inode_name is not supposed to change once assigned*/ + if (!priv->anon_inode_name[0]) { + snprintf(priv->anon_inode_name, sizeof(priv->anon_inode_name), "%s-%s", + AFR_ANON_DIR_PREFIX, volfile_id_str); + gf_uuid_parse(volfile_id_str, anon_inode_gfid); + /*Flip a bit to make sure volfile-id and anon-gfid are not same*/ + anon_inode_gfid[0] ^= 1; + uuid_utoa_r(anon_inode_gfid, priv->anon_gfid_str); + } +} + int reconfigure(xlator_t *this, dict_t *options) { afr_private_t *priv = NULL; xlator_t *read_subvol = NULL; int read_subvol_index = -1; + int timeout_old = 0; int ret = -1; int index = -1; char *qtype = NULL; @@ -150,6 +172,7 @@ reconfigure(xlator_t *this, dict_t *options) char 
*locking_scheme = NULL; gf_boolean_t consistent_io = _gf_false; gf_boolean_t choose_local_old = _gf_false; + gf_boolean_t enabled_old = _gf_false; priv = this->private; @@ -166,7 +189,8 @@ reconfigure(xlator_t *this, dict_t *options) bool, out); GF_OPTION_RECONF("data-self-heal", data_self_heal, options, str, out); - gf_string2boolean(data_self_heal, &priv->data_self_heal); + if (gf_string2boolean(data_self_heal, &priv->data_self_heal) == -1) + goto out; GF_OPTION_RECONF("entry-self-heal", priv->entry_self_heal, options, bool, out); @@ -239,6 +263,8 @@ reconfigure(xlator_t *this, dict_t *options) out); GF_OPTION_RECONF("eager-lock", priv->eager_lock, options, bool, out); + GF_OPTION_RECONF("optimistic-change-log", priv->optimistic_change_log, + options, bool, out); GF_OPTION_RECONF("quorum-type", qtype, options, str, out); GF_OPTION_RECONF("quorum-count", priv->quorum_count, options, uint32, out); fix_quorum_options(this, priv, qtype, options); @@ -255,11 +281,13 @@ reconfigure(xlator_t *this, dict_t *options) GF_OPTION_RECONF("ensure-durability", priv->ensure_durability, options, bool, out); + enabled_old = priv->shd.enabled; GF_OPTION_RECONF("self-heal-daemon", priv->shd.enabled, options, bool, out); GF_OPTION_RECONF("iam-self-heal-daemon", priv->shd.iamshd, options, bool, out); + timeout_old = priv->shd.timeout; GF_OPTION_RECONF("heal-timeout", priv->shd.timeout, options, int32, out); GF_OPTION_RECONF("consistent-metadata", priv->consistent_metadata, options, @@ -283,6 +311,16 @@ reconfigure(xlator_t *this, dict_t *options) consistent_io = _gf_false; priv->consistent_io = consistent_io; + afr_handle_anon_inode_options(priv, options); + + GF_OPTION_RECONF("use-anonymous-inode", priv->use_anon_inode, options, bool, + out); + if (priv->shd.enabled) { + if ((priv->shd.enabled != enabled_old) || + (timeout_old != priv->shd.timeout)) + afr_selfheal_childup(this, priv); + } + ret = 0; out: return ret; @@ -368,7 +406,7 @@ afr_ta_init(afr_private_t *priv) priv->release_ta_notify_dom_lock = _gf_false; INIT_LIST_HEAD(&priv->ta_waitq); INIT_LIST_HEAD(&priv->ta_onwireq); - *priv->ta_gfid = 0; + gf_uuid_clear(priv->ta_gfid); } int32_t @@ -407,6 +445,8 @@ init(xlator_t *this) goto out; priv = this->private; + INIT_LIST_HEAD(&priv->saved_locks); + INIT_LIST_HEAD(&priv->lk_healq); LOCK_INIT(&priv->lock); child_count = xlator_subvolume_count(this); @@ -471,7 +511,8 @@ init(xlator_t *this) GF_OPTION_INIT("heal-wait-queue-length", priv->heal_wait_qlen, uint32, out); GF_OPTION_INIT("data-self-heal", data_self_heal, str, out); - gf_string2boolean(data_self_heal, &priv->data_self_heal); + if (gf_string2boolean(data_self_heal, &priv->data_self_heal) == -1) + goto out; GF_OPTION_INIT("data-self-heal-algorithm", data_self_heal_algorithm, str, out); @@ -525,7 +566,9 @@ init(xlator_t *this) GF_OPTION_INIT("consistent-metadata", priv->consistent_metadata, bool, out); GF_OPTION_INIT("consistent-io", priv->consistent_io, bool, out); + afr_handle_anon_inode_options(priv, this->options); + GF_OPTION_INIT("use-anonymous-inode", priv->use_anon_inode, bool, out); if (priv->quorum_count != 0) priv->consistent_io = _gf_false; @@ -537,13 +580,19 @@ init(xlator_t *this) goto out; } + priv->anon_inode = GF_CALLOC(sizeof(unsigned char), child_count, + gf_afr_mt_char); + priv->child_up = GF_CALLOC(sizeof(unsigned char), child_count, gf_afr_mt_char); priv->child_latency = GF_MALLOC(sizeof(*priv->child_latency) * child_count, gf_afr_mt_child_latency_t); + priv->halo_child_up = GF_CALLOC(sizeof(unsigned char), child_count, + 
gf_afr_mt_char); - if (!priv->child_up || !priv->child_latency) { + if (!priv->child_up || !priv->child_latency || !priv->halo_child_up || + !priv->anon_inode) { ret = -ENOMEM; goto out; } @@ -611,19 +660,83 @@ init(xlator_t *this) out: return ret; } +void +afr_destroy_healer_object(xlator_t *this, struct subvol_healer *healer) +{ + int ret = -1; + + if (!healer) + return; + + if (healer->running) { + /* + * If there are any resources to cleanup, We need + * to do that gracefully using pthread_cleanup_push + */ + ret = gf_thread_cleanup_xint(healer->thread); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SELF_HEAL_FAILED, + "Failed to clean up healer threads."); + healer->thread = 0; + } + pthread_cond_destroy(&healer->cond); + pthread_mutex_destroy(&healer->mutex); +} + +void +afr_selfheal_daemon_fini(xlator_t *this) +{ + struct subvol_healer *healer = NULL; + afr_self_heald_t *shd = NULL; + afr_private_t *priv = NULL; + int i = 0; + + priv = this->private; + if (!priv) + return; + shd = &priv->shd; + if (!shd->iamshd) + return; + + for (i = 0; i < priv->child_count; i++) { + healer = &shd->index_healers[i]; + afr_destroy_healer_object(this, healer); + + healer = &shd->full_healers[i]; + afr_destroy_healer_object(this, healer); + + if (shd->statistics[i]) + eh_destroy(shd->statistics[i]); + } + GF_FREE(shd->index_healers); + GF_FREE(shd->full_healers); + GF_FREE(shd->statistics); + if (shd->split_brain) + eh_destroy(shd->split_brain); +} void fini(xlator_t *this) { afr_private_t *priv = NULL; priv = this->private; + + afr_selfheal_daemon_fini(this); + GF_ASSERT(list_empty(&priv->saved_locks)); + LOCK(&priv->lock); if (priv->timer != NULL) { gf_timer_call_cancel(this->ctx, priv->timer); priv->timer = NULL; } UNLOCK(&priv->lock); + + if (this->local_pool != NULL) { + mem_pool_destroy(this->local_pool); + this->local_pool = NULL; + } + this->private = NULL; afr_priv_destroy(priv); if (this->itable) { @@ -655,6 +768,7 @@ struct xlator_fops fops = { .getxattr = afr_getxattr, .fgetxattr = afr_fgetxattr, .readv = afr_readv, + .seek = afr_seek, /* inode write */ .writev = afr_writev, @@ -727,7 +841,7 @@ struct volume_options options[] = { {.key = {"read-hash-mode"}, .type = GF_OPTION_TYPE_INT, .min = 0, - .max = 3, + .max = 5, .default_value = "1", .op_version = {2}, .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, @@ -740,7 +854,10 @@ struct volume_options options[] = { "1 = hash by GFID of file (all clients use " "same subvolume).\n" "2 = hash by GFID of file and client PID.\n" - "3 = brick having the least outstanding read requests."}, + "3 = brick having the least outstanding read requests.\n" + "4 = brick having the least network ping latency.\n" + "5 = Hybrid mode between 3 and 4, ie least value among " + "network-latency multiplied by outstanding-read-requests."}, { .key = {"choose-local"}, .type = GF_OPTION_TYPE_BOOL, @@ -1200,6 +1317,14 @@ struct volume_options options[] = { .tags = {"replicate"}, .description = "This option exists only for backward compatibility " "and configuring it doesn't have any effect"}, + {.key = {"use-anonymous-inode"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "no", + .op_version = {GD_OP_VERSION_8_0}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE, + .tags = {"replicate"}, + .description = "Setting this option heals directory renames efficiently"}, + {.key = {NULL}}, }; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 2cc3797675e..d62f9a9caf2 100644 --- a/xlators/cluster/afr/src/afr.h +++ 
b/xlators/cluster/afr/src/afr.h @@ -39,7 +39,10 @@ #define AFR_TA_DOM_NOTIFY "afr.ta.dom-notify" #define AFR_TA_DOM_MODIFY "afr.ta.dom-modify" +#define AFR_LK_HEAL_DOM "afr.lock-heal.domain" + #define AFR_HALO_MAX_LATENCY 99999 +#define AFR_ANON_DIR_PREFIX ".glusterfs-anonymous-inode" #define PFLAG_PENDING (1 << 0) #define PFLAG_SBRAIN (1 << 1) @@ -95,6 +98,25 @@ typedef int (*afr_changelog_resume_t)(call_frame_t *frame, xlator_t *this); gf_fop_list[local->op], uuid_utoa(local->inode->gfid)); \ } while (0) +#define AFR_ERROR_OUT_IF_FDCTX_INVALID(__fd, __this, __error, __label) \ + do { \ + afr_fd_ctx_t *__fd_ctx = NULL; \ + __fd_ctx = afr_fd_ctx_get(__fd, __this); \ + if (__fd_ctx && __fd_ctx->is_fd_bad) { \ + __error = EBADF; \ + goto __label; \ + } \ + } while (0) + +typedef enum { + AFR_READ_POLICY_FIRST_UP, + AFR_READ_POLICY_GFID_HASH, + AFR_READ_POLICY_GFID_PID_HASH, + AFR_READ_POLICY_LESS_LOAD, + AFR_READ_POLICY_LEAST_LATENCY, + AFR_READ_POLICY_LOAD_LATENCY_HYBRID, +} afr_read_hash_mode_t; + typedef enum { AFR_FAV_CHILD_NONE, AFR_FAV_CHILD_BY_SIZE, @@ -126,13 +148,27 @@ typedef enum { *on BAD brick - Success*/ TA_INFO_IN_MEMORY_FAILED, /*Bad brick info is in memory and fop failed *on GOOD brick - Failed*/ + TA_SUCCESS, /*FOP succeeded on both data bricks.*/ } afr_ta_fop_state_t; struct afr_nfsd { - gf_boolean_t iamnfsd; uint32_t halo_max_latency_msec; + gf_boolean_t iamnfsd; }; +typedef struct _afr_lk_heal_info { + fd_t *fd; + int32_t cmd; + struct gf_flock flock; + dict_t *xdata_req; + unsigned char *locked_nodes; + struct list_head pos; + gf_lkowner_t lk_owner; + pid_t pid; + int32_t *child_up_event_gen; + int32_t *child_down_event_gen; +} afr_lk_heal_info_t; + typedef struct _afr_private { gf_lock_t lock; /* to guard access to child_count, etc */ unsigned int child_count; /* total number of children */ @@ -143,19 +179,21 @@ typedef struct _afr_private { inode_t *root_inode; + int favorite_child; /* subvolume to be preferred in resolving + split-brain cases */ /* For thin-arbiter. */ - unsigned int thin_arbiter_count; /* 0 or 1 at the moment.*/ uuid_t ta_gfid; - unsigned char ta_child_up; + unsigned int thin_arbiter_count; /* 0 or 1 at the moment.*/ int ta_bad_child_index; - off_t ta_notify_dom_lock_offset; - gf_boolean_t release_ta_notify_dom_lock; + int ta_event_gen; unsigned int ta_in_mem_txn_count; unsigned int ta_on_wire_txn_count; struct list_head ta_waitq; struct list_head ta_onwireq; + unsigned char *anon_inode; unsigned char *child_up; + unsigned char *halo_child_up; int64_t *child_latency; unsigned char *local; @@ -176,30 +214,31 @@ typedef struct _afr_private { int32_t healers; /* No. of elements currently undergoing background heal*/ + gf_boolean_t release_ta_notify_dom_lock; + gf_boolean_t metadata_self_heal; /* on/off */ gf_boolean_t entry_self_heal; /* on/off */ gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */ int read_child; /* read-subvolume */ - unsigned int hash_mode; /* for when read_child is not set */ gf_atomic_t *pending_reads; /*No. 
of pending read cbks per child.*/ - int favorite_child; /* subvolume to be preferred in resolving - split-brain cases */ - afr_favorite_child_policy fav_child_policy; /*Policy to use for automatic - resolution of split-brains.*/ + gf_timer_t *timer; /* launched when parent up is received */ unsigned int wait_count; /* # of servers to wait for success */ - gf_timer_t *timer; /* launched when parent up is received */ - + unsigned char ta_child_up; gf_boolean_t optimistic_change_log; gf_boolean_t eager_lock; gf_boolean_t pre_op_compat; /* on/off */ uint32_t post_op_delay_secs; unsigned int quorum_count; - char vol_uuid[UUID_SIZE + 1]; + off_t ta_notify_dom_lock_offset; + afr_favorite_child_policy fav_child_policy; /*Policy to use for automatic + resolution of split-brains.*/ + afr_read_hash_mode_t hash_mode; /* for when read_child is not set */ + int32_t *last_event; /* @event_generation: Keeps count of number of events received which can @@ -212,34 +251,41 @@ typedef struct _afr_private { important as we might have had a network split brain. */ uint32_t event_generation; + char vol_uuid[UUID_SIZE + 1]; gf_boolean_t choose_local; gf_boolean_t did_discovery; - uint64_t sh_readdir_size; gf_boolean_t ensure_durability; + gf_boolean_t halo_enabled; + gf_boolean_t consistent_metadata; + gf_boolean_t need_heal; + gf_boolean_t granular_locks; + uint64_t sh_readdir_size; char *sh_domain; char *afr_dirty; - gf_boolean_t halo_enabled; - uint32_t halo_max_latency_msec; - uint32_t halo_max_replicas; - uint32_t halo_min_replicas; + uint64_t spb_choice_timeout; afr_self_heald_t shd; struct afr_nfsd nfsd; - gf_boolean_t consistent_metadata; - uint64_t spb_choice_timeout; - gf_boolean_t need_heal; + uint32_t halo_max_latency_msec; + uint32_t halo_max_replicas; + uint32_t halo_min_replicas; - /* pump dependencies */ - void *pump_private; - gf_boolean_t use_afr_in_pump; - gf_boolean_t granular_locks; gf_boolean_t full_lock; gf_boolean_t esh_granular; gf_boolean_t consistent_io; gf_boolean_t data_self_heal; /* on/off */ + gf_boolean_t use_anon_inode; + + /*For lock healing.*/ + struct list_head saved_locks; + struct list_head lk_healq; + + /*For anon-inode handling */ + char anon_inode_name[NAME_MAX + 1]; + char anon_gfid_str[UUID_SIZE + 1]; } afr_private_t; typedef enum { @@ -303,18 +349,17 @@ afr_entry_lockee_cmp(const void *l1, const void *l2); typedef struct { loc_t *lk_loc; - int lockee_count; afr_lockee_t lockee[AFR_LOCKEE_COUNT_MAX]; const char *lk_basename; const char *lower_basename; const char *higher_basename; - char lower_locked; - char higher_locked; unsigned char *lower_locked_nodes; - int32_t lock_count; + afr_lock_cbk_t lock_cbk; + + int lockee_count; int32_t lk_call_count; int32_t lk_expected_count; @@ -322,14 +367,15 @@ typedef struct { int32_t lock_op_ret; int32_t lock_op_errno; - afr_lock_cbk_t lock_cbk; char *domain; /* Domain on which inode/entry lock/unlock in progress.*/ + int32_t lock_count; + char lower_locked; + char higher_locked; } afr_internal_lock_t; struct afr_reply { int valid; int32_t op_ret; - int32_t op_errno; dict_t *xattr; /*For xattrop*/ dict_t *xdata; struct iatt poststat; @@ -338,6 +384,7 @@ struct afr_reply { struct iatt preparent; struct iatt preparent2; struct iatt postparent2; + int32_t op_errno; /* For rchecksum */ uint8_t checksum[SHA256_DIGEST_LENGTH]; gf_boolean_t buf_has_zeroes; @@ -361,6 +408,10 @@ typedef struct { arrives, we continue to read off this subvol. */ int readdir_subvol; + /* lock-healing related members. 
*/ + gf_boolean_t is_fd_bad; + afr_lk_heal_info_t *lk_heal_info; + } afr_fd_ctx_t; typedef enum { @@ -377,8 +428,6 @@ typedef struct _afr_inode_lock_t { */ int32_t num_inodelks; unsigned int event_generation; - gf_boolean_t release; - gf_boolean_t acquired; gf_timer_t *delay_timer; struct list_head owners; /*Transactions that are performing fop*/ struct list_head post_op; /*Transactions that are done with the fop @@ -387,6 +436,8 @@ typedef struct _afr_inode_lock_t { *conflicting transactions to complete*/ struct list_head frozen; /*Transactions that need to go as part of * next batch of eager-lock*/ + gf_boolean_t release; + gf_boolean_t acquired; } afr_lock_t; typedef struct _afr_inode_ctx { @@ -395,15 +446,11 @@ typedef struct _afr_inode_ctx { int lock_count; int spb_choice; gf_timer_t *timer; - gf_boolean_t need_refresh; unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS]; int inherited[AFR_NUM_CHANGE_LOGS]; int on_disk[AFR_NUM_CHANGE_LOGS]; - - /* set if any write on this fd was a non stable write - (i.e, without O_SYNC or O_DSYNC) - */ - gf_boolean_t witnessed_unstable_write; + /*Only 2 types of transactions support eager-locks now. DATA/METADATA*/ + afr_lock_t lock[2]; /* @open_fd_count: Number of open FDs queried from the server, as queried through @@ -411,8 +458,12 @@ typedef struct _afr_inode_ctx { temporarily disabled. */ uint32_t open_fd_count; - /*Only 2 types of transactions support eager-locks now. DATA/METADATA*/ - afr_lock_t lock[2]; + gf_boolean_t need_refresh; + + /* set if any write on this fd was a non stable write + (i.e, without O_SYNC or O_DSYNC) + */ + gf_boolean_t witnessed_unstable_write; } afr_inode_ctx_t; typedef struct _afr_local { @@ -426,19 +477,15 @@ typedef struct _afr_local { unsigned int event_generation; uint32_t open_fd_count; - gf_boolean_t update_open_fd_count; int32_t num_inodelks; - gf_boolean_t update_num_inodelks; - - gf_lkowner_t saved_lk_owner; int32_t op_ret; int32_t op_errno; - int32_t **pending; - int dirty[AFR_NUM_CHANGE_LOGS]; + int32_t **pending; + loc_t loc; loc_t newloc; @@ -469,14 +516,6 @@ typedef struct _afr_local { afr_read_txn_wind_t readfn; - /* @refreshed: - - the inode was "refreshed" (i.e, pending xattrs from all subvols - freshly inspected and inode ctx updated accordingly) as part of - this transaction already. - */ - gf_boolean_t refreshed; - /* @inode: the inode on which the read txn is performed on. ref'ed and copied @@ -501,8 +540,6 @@ typedef struct _afr_local { unsigned char *readable; unsigned char *readable2; /*For rename transaction*/ - int read_subvol; /* Current read subvolume */ - afr_inode_refresh_cbk_t refreshfn; /* @refreshinode: @@ -511,9 +548,30 @@ typedef struct _afr_local { */ inode_t *refreshinode; + dict_t *xattr_req; + + dict_t *dict; + + int read_subvol; /* Current read subvolume */ + + int optimistic_change_log; + + afr_internal_lock_t internal_lock; + /*To handle setattr/setxattr on yet to be linked inode from dht*/ uuid_t refreshgfid; + /* @refreshed: + + the inode was "refreshed" (i.e, pending xattrs from all subvols + freshly inspected and inode ctx updated accordingly) as part of + this transaction already. + */ + gf_boolean_t refreshed; + + gf_boolean_t update_num_inodelks; + gf_boolean_t update_open_fd_count; + /* @pre_op_compat: @@ -523,14 +581,6 @@ typedef struct _afr_local { gf_boolean_t pre_op_compat; - dict_t *xattr_req; - - afr_internal_lock_t internal_lock; - - dict_t *dict; - - int optimistic_change_log; - /* Is the current writev() going to perform a stable write? 
i.e, is fd->flags or @flags writev param have O_SYNC or O_DSYNC? @@ -549,25 +599,25 @@ typedef struct _afr_local { struct { struct { - gf_boolean_t needs_fresh_lookup; - uuid_t gfid_req; - } lookup; - - struct { - unsigned char buf_set; struct statvfs buf; + unsigned char buf_set; } statfs; struct { - int32_t flags; fd_t *fd; + int32_t flags; } open; struct { - int32_t cmd; struct gf_flock user_flock; struct gf_flock ret_flock; unsigned char *locked_nodes; + int32_t cmd; + /*For lock healing only.*/ + unsigned char *dom_locked_nodes; + int32_t *dom_lock_op_ret; + int32_t *dom_lock_op_errno; + struct gf_flock *getlk_rsp; } lk; /* inode read */ @@ -592,8 +642,8 @@ typedef struct _afr_local { struct { char *name; - int last_index; long xattr_len; + int last_index; } getxattr; struct { @@ -606,11 +656,10 @@ typedef struct _afr_local { /* dir read */ struct { + uint32_t *checksum; int success_count; int32_t op_ret; int32_t op_errno; - - uint32_t *checksum; } opendir; struct { @@ -619,8 +668,8 @@ typedef struct _afr_local { size_t size; off_t offset; dict_t *dict; - gf_boolean_t failed; int last_index; + gf_boolean_t failed; } readdir; /* inode write */ @@ -630,12 +679,11 @@ typedef struct _afr_local { } inode_wfop; // common structure for all inode-write-fops struct { - int32_t op_ret; - struct iovec *vector; struct iobref *iobref; - int32_t count; off_t offset; + int32_t op_ret; + int32_t count; uint32_t flags; } writev; @@ -695,29 +743,25 @@ typedef struct _afr_local { } create; struct { + dict_t *params; dev_t dev; mode_t mode; - dict_t *params; } mknod; struct { - int32_t mode; dict_t *params; + int32_t mode; } mkdir; struct { - int flags; - } rmdir; - - struct { dict_t *params; char *linkpath; } symlink; struct { - int32_t mode; off_t offset; size_t len; + int32_t mode; } fallocate; struct { @@ -744,10 +788,10 @@ typedef struct _afr_local { struct { char *volume; char *basename; + void *xdata; entrylk_cmd in_cmd; entrylk_cmd cmd; entrylk_type type; - void *xdata; } entrylk; struct { @@ -756,31 +800,33 @@ typedef struct _afr_local { } seek; struct { - int32_t datasync; - } fsync; - - struct { struct gf_lease user_lease; struct gf_lease ret_lease; unsigned char *locked_nodes; } lease; - } cont; + struct { + int flags; + } rmdir; - struct { - off_t start, len; + struct { + int32_t datasync; + } fsync; - gf_boolean_t eager_lock_on; - gf_boolean_t do_eager_unlock; + struct { + uuid_t gfid_req; + gf_boolean_t needs_fresh_lookup; + } lookup; + + } cont; + struct { char *basename; char *new_basename; loc_t parent_loc; loc_t new_parent_loc; - afr_transaction_type type; - /* stub to resume on destruction of the transaction frame */ call_stub_t *resume_stub; @@ -798,6 +844,30 @@ typedef struct _afr_local { FOP failed. */ unsigned char *failed_subvols; + call_frame_t *main_frame; /*Fop frame*/ + call_frame_t *frame; /*Transaction frame*/ + + int (*wind)(call_frame_t *frame, xlator_t *this, int subvol); + + int (*unwind)(call_frame_t *frame, xlator_t *this); + + off_t start, len; + + afr_transaction_type type; + + int32_t in_flight_sb_errno; /* This is where the cause of the + failure on the last good copy of + the file is stored. + */ + + /* @changelog_resume: function to be called after changlogging + (either pre-op or post-op) is done + */ + afr_changelog_resume_t changelog_resume; + + gf_boolean_t eager_lock_on; + gf_boolean_t do_eager_unlock; + /* @dirtied: flag which indicates whether we set dirty flag in the OP. 
Typically true when we are performing operation on more than one subvol and optimistic changelog is disabled @@ -822,6 +892,10 @@ typedef struct _afr_local { */ gf_boolean_t no_uninherit; + gf_boolean_t in_flight_sb; /* Indicator for occurrence of + split-brain while in the middle of + a txn. */ + /* @uninherit_done: @uninherit_value: @@ -834,27 +908,7 @@ typedef struct _afr_local { gf_boolean_t uninherit_done; gf_boolean_t uninherit_value; - gf_boolean_t in_flight_sb; /* Indicator for occurrence of - split-brain while in the middle of - a txn. */ - int32_t in_flight_sb_errno; /* This is where the cause of the - failure on the last good copy of - the file is stored. - */ - - /* @changelog_resume: function to be called after changlogging - (either pre-op or post-op) is done - */ - afr_changelog_resume_t changelog_resume; - - call_frame_t *main_frame; /*Fop frame*/ - call_frame_t *frame; /*Transaction frame*/ - - int (*wind)(call_frame_t *frame, xlator_t *this, int subvol); - - int (*unwind)(call_frame_t *frame, xlator_t *this); - - /* post-op hook */ + gf_boolean_t disable_delayed_post_op; } transaction; syncbarrier_t barrier; @@ -867,34 +921,36 @@ typedef struct _afr_local { mode_t umask; int xflag; - gf_boolean_t do_discovery; struct afr_reply *replies; /* For client side background heals. */ struct list_head healer; call_frame_t *heal_frame; - gf_boolean_t need_full_crawl; - afr_fop_lock_state_t fop_lock_state; - - gf_boolean_t is_read_txn; afr_inode_ctx_t *inode_ctx; /*For thin-arbiter transactions.*/ - unsigned char ta_child_up; + int ta_failed_subvol; + int ta_event_gen; struct list_head ta_waitq; struct list_head ta_onwireq; afr_ta_fop_state_t fop_state; - int ta_failed_subvol; + afr_fop_lock_state_t fop_lock_state; + gf_lkowner_t saved_lk_owner; + unsigned char read_txn_query_child; + unsigned char ta_child_up; + gf_boolean_t do_discovery; + gf_boolean_t need_full_crawl; + gf_boolean_t is_read_txn; gf_boolean_t is_new_entry; } afr_local_t; typedef struct afr_spbc_timeout { call_frame_t *frame; - gf_boolean_t d_spb; - gf_boolean_t m_spb; loc_t *loc; int spb_child_index; + gf_boolean_t d_spb; + gf_boolean_t m_spb; } afr_spbc_timeout_t; typedef struct afr_spb_status { @@ -904,9 +960,9 @@ typedef struct afr_spb_status { typedef struct afr_empty_brick_args { call_frame_t *frame; + char *op_type; loc_t loc; int empty_index; - char *op_type; } afr_empty_brick_args_t; typedef struct afr_read_subvol_args { @@ -948,7 +1004,10 @@ afr_inode_read_subvol_set(inode_t *inode, xlator_t *this, int event_generation); int -afr_inode_event_gen_reset(inode_t *inode, xlator_t *this); +__afr_inode_need_refresh_set(inode_t *inode, xlator_t *this); + +int +afr_inode_need_refresh_set(inode_t *inode, xlator_t *this); int afr_read_subvol_select_by_policy(inode_t *inode, xlator_t *this, @@ -1064,6 +1123,9 @@ afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd); if (__local && __local->is_read_txn) \ afr_pending_read_decrement(__this->private, \ __local->read_subvol); \ + if (__local && __local->xdata_req && \ + afr_is_lock_mode_mandatory(__local->xdata_req)) \ + afr_dom_lock_release(frame); \ frame->local = NULL; \ } \ \ @@ -1091,8 +1153,8 @@ afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd); #define AFR_FRAME_INIT(frame, op_errno) \ ({ \ frame->local = mem_get0(THIS->local_pool); \ - if (afr_local_init(frame->local, THIS->private, &op_errno)) { \ - afr_local_cleanup(frame->local, THIS); \ + if (afr_local_init(frame->local, frame->this->private, &op_errno)) { \ + afr_local_cleanup(frame->local, frame->this); \ 
mem_put(frame->local); \ frame->local = NULL; \ }; \ @@ -1216,8 +1278,8 @@ int afr_inode_split_brain_choice_set(inode_t *inode, xlator_t *this, int spb_choice); int -afr_inode_split_brain_choice_get(inode_t *inode, xlator_t *this, - int *spb_choice); +afr_split_brain_read_subvol_get(inode_t *inode, xlator_t *this, + call_frame_t *frame, int *spb_subvol); int afr_get_child_index_from_name(xlator_t *this, char *name); @@ -1302,7 +1364,7 @@ int afr_set_inode_local(xlator_t *this, afr_local_t *local, inode_t *inode); int -afr_fill_ta_loc(xlator_t *this, loc_t *loc); +afr_fill_ta_loc(xlator_t *this, loc_t *loc, gf_boolean_t is_gfid_based_fop); int afr_ta_post_op_lock(xlator_t *this, loc_t *loc); @@ -1325,11 +1387,37 @@ afr_ta_has_quorum(afr_private_t *priv, afr_local_t *local); void afr_ta_lock_release_synctask(xlator_t *this); +void +afr_ta_locked_priv_invalidate(afr_private_t *priv); + gf_boolean_t -afr_lookup_has_quorum(call_frame_t *frame, xlator_t *this, - unsigned char *subvols); +afr_lookup_has_quorum(call_frame_t *frame, + const unsigned int up_children_count); void afr_mark_new_entry_changelog(call_frame_t *frame, xlator_t *this); +void +afr_handle_replies_quorum(call_frame_t *frame, xlator_t *this); + +gf_boolean_t +afr_ta_dict_contains_pending_xattr(dict_t *dict, afr_private_t *priv, + int child); + +void +afr_selfheal_childup(xlator_t *this, afr_private_t *priv); + +gf_boolean_t +afr_is_lock_mode_mandatory(dict_t *xdata); + +void +afr_dom_lock_release(call_frame_t *frame); + +void +afr_fill_success_replies(afr_local_t *local, afr_private_t *priv, + unsigned char *replies); + +gf_boolean_t +afr_is_private_directory(afr_private_t *priv, uuid_t pargfid, const char *name, + pid_t pid); #endif /* __AFR_H__ */ |
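
The afr-transaction.c hunks above rework afr_ta_dom_lock_check_and_release() so that the finishing fop's thin-arbiter state, rather than a boolean, decides which in-flight counter to decrement before the AFR_TA_DOM_NOTIFY lock may be released. The following is a minimal standalone sketch of that bookkeeping, using a plain pthread mutex in place of gluster's LOCK()/UNLOCK() on priv->lock; the struct and helper names are simplified stand-ins, not the actual AFR symbols.

#include <pthread.h>
#include <stdbool.h>
#include <assert.h>

enum ta_fop_state {
    TA_GET_INFO_FROM_TA_FILE,
    TA_INFO_IN_MEMORY_SUCCESS,
    TA_INFO_IN_MEMORY_FAILED,
    TA_WAIT_FOR_NOTIFY_LOCK_REL,
    TA_SUCCESS,
};

struct ta_counters {
    pthread_mutex_t lock;
    unsigned int on_wire_txn_count;   /* fops that queried the thin-arbiter */
    unsigned int in_mem_txn_count;    /* fops answered from cached bad-child info */
    bool release_requested;           /* notify-lock contention upcall seen */
};

/* Returns true when the caller should release the notify lock now. */
static bool
ta_txn_done(struct ta_counters *c, enum ta_fop_state state)
{
    bool release_now = false;

    pthread_mutex_lock(&c->lock);
    switch (state) {
    case TA_GET_INFO_FROM_TA_FILE:
        c->on_wire_txn_count--;       /* this fop went over the wire */
        break;
    case TA_INFO_IN_MEMORY_SUCCESS:
    case TA_INFO_IN_MEMORY_FAILED:
        c->in_mem_txn_count--;        /* this fop used in-memory state */
        break;
    case TA_WAIT_FOR_NOTIFY_LOCK_REL:
        assert(0);                    /* such fops must never reach post-op */
        break;
    case TA_SUCCESS:
        break;                        /* succeeded on both data bricks; nothing counted */
    }
    release_now = c->release_requested && c->on_wire_txn_count == 0 &&
                  c->in_mem_txn_count == 0;
    pthread_mutex_unlock(&c->lock);

    return release_now;
}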
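
afr_handle_anon_inode_options() in the afr.c hunk derives the anonymous-inode directory's name and gfid from the volume id: the name is AFR_ANON_DIR_PREFIX plus the volume-id string, and the gfid is the volume id with one bit flipped so the two can never collide. Below is a standalone sketch of that scheme, using libuuid's uuid_parse()/uuid_unparse() in place of gluster's gf_uuid_* and uuid_utoa_r() wrappers; names other than the prefix are hypothetical.

#include <stdio.h>
#include <string.h>
#include <uuid/uuid.h>   /* link with -luuid */

#define ANON_DIR_PREFIX ".glusterfs-anonymous-inode"

static int
build_anon_inode_identity(const char *volume_id_str,
                          char *name_out, size_t name_len,
                          char *gfid_str_out /* at least 37 bytes */)
{
    uuid_t gfid;

    if (uuid_parse(volume_id_str, gfid) != 0)
        return -1;                        /* volume-id is not a valid UUID string */

    /* Directory name: ".glusterfs-anonymous-inode-<volume-id>" */
    snprintf(name_out, name_len, "%s-%s", ANON_DIR_PREFIX, volume_id_str);

    gfid[0] ^= 1;                         /* flip a bit so gfid != volume id */
    uuid_unparse(gfid, gfid_str_out);
    return 0;
}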
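
The read-hash-mode option raised above now accepts values 4 and 5, with mode 5 described as picking the brick with the least value of network latency multiplied by outstanding read requests. The sketch below only illustrates that ranking rule; it is not the actual afr_read_subvol_select_by_policy() code, and the latency/pending-read arrays are hypothetical stand-ins for priv->child_latency and priv->pending_reads.

#include <stdint.h>
#include <stddef.h>

static int
pick_hybrid_read_child(const int64_t *latency_usec,
                       const int64_t *pending_reads,
                       const unsigned char *child_up, size_t child_count)
{
    int best = -1;
    int64_t best_score = INT64_MAX;

    for (size_t i = 0; i < child_count; i++) {
        if (!child_up[i])
            continue;                     /* never read from a down brick */
        int64_t score = latency_usec[i] * pending_reads[i];
        if (best == -1 || score < best_score) {
            best = (int)i;
            best_score = score;
        }
    }
    return best;                          /* -1 means no readable child */
}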
