Diffstat (limited to 'xlators/cluster/afr/src')
-rw-r--r-- xlators/cluster/afr/src/afr-common.c             | 1781
-rw-r--r-- xlators/cluster/afr/src/afr-dir-read.c           |   40
-rw-r--r-- xlators/cluster/afr/src/afr-dir-write.c          |   20
-rw-r--r-- xlators/cluster/afr/src/afr-inode-read.c         |   41
-rw-r--r-- xlators/cluster/afr/src/afr-inode-read.h         |    3
-rw-r--r-- xlators/cluster/afr/src/afr-inode-write.c        |  124
-rw-r--r-- xlators/cluster/afr/src/afr-lk-common.c          |    1
-rw-r--r-- xlators/cluster/afr/src/afr-mem-types.h          |    2
-rw-r--r-- xlators/cluster/afr/src/afr-messages.h           |  160
-rw-r--r-- xlators/cluster/afr/src/afr-open.c               |   44
-rw-r--r-- xlators/cluster/afr/src/afr-read-txn.c           |   69
-rw-r--r-- xlators/cluster/afr/src/afr-self-heal-common.c   |  286
-rw-r--r-- xlators/cluster/afr/src/afr-self-heal-data.c     |   57
-rw-r--r-- xlators/cluster/afr/src/afr-self-heal-entry.c    |  258
-rw-r--r-- xlators/cluster/afr/src/afr-self-heal-metadata.c |   76
-rw-r--r-- xlators/cluster/afr/src/afr-self-heal-name.c     |   48
-rw-r--r-- xlators/cluster/afr/src/afr-self-heal.h          |   26
-rw-r--r-- xlators/cluster/afr/src/afr-self-heald.c         |  378
-rw-r--r-- xlators/cluster/afr/src/afr-self-heald.h         |   23
-rw-r--r-- xlators/cluster/afr/src/afr-transaction.c        |  320
-rw-r--r-- xlators/cluster/afr/src/afr.c                    |  137
-rw-r--r-- xlators/cluster/afr/src/afr.h                    |  364
22 files changed, 3054 insertions(+), 1204 deletions(-)
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 45b96e33970..032ab5c8001 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -18,9 +18,7 @@
#include <glusterfs/glusterfs.h>
#include "afr.h"
#include <glusterfs/dict.h>
-#include <glusterfs/xlator.h>
#include <glusterfs/hashfn.h>
-#include <glusterfs/logging.h>
#include <glusterfs/list.h>
#include <glusterfs/call-stub.h>
#include <glusterfs/defaults.h>
@@ -47,6 +45,56 @@ afr_quorum_errno(afr_private_t *priv)
return ENOTCONN;
}
+gf_boolean_t
+afr_is_private_directory(afr_private_t *priv, uuid_t pargfid, const char *name,
+ pid_t pid)
+{
+ if (!__is_root_gfid(pargfid)) {
+ return _gf_false;
+ }
+
+ if (strcmp(name, GF_REPLICATE_TRASH_DIR) == 0) {
+ /*For backward compatibility /.landfill is private*/
+ return _gf_true;
+ }
+
+ if (pid == GF_CLIENT_PID_GSYNCD) {
+ /*geo-rep needs to create/sync private directory on slave because
+ * it appears in changelog*/
+ return _gf_false;
+ }
+
+ if (pid == GF_CLIENT_PID_GLFS_HEAL || pid == GF_CLIENT_PID_SELF_HEALD) {
+ if (strcmp(name, priv->anon_inode_name) == 0) {
+ /* anonymous-inode dir is private*/
+ return _gf_true;
+ }
+ } else {
+ if (strncmp(name, AFR_ANON_DIR_PREFIX, strlen(AFR_ANON_DIR_PREFIX)) ==
+ 0) {
+ /* anonymous-inode dir prefix is private for geo-rep to work*/
+ return _gf_true;
+ }
+ }
+
+ return _gf_false;
+}
+
+void
+afr_fill_success_replies(afr_local_t *local, afr_private_t *priv,
+ unsigned char *replies)
+{
+ int i = 0;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].valid && local->replies[i].op_ret == 0) {
+ replies[i] = 1;
+ } else {
+ replies[i] = 0;
+ }
+ }
+}
+
int
afr_fav_child_reset_sink_xattrs(void *opaque);
@@ -56,6 +104,581 @@ afr_fav_child_reset_sink_xattrs_cbk(int ret, call_frame_t *frame, void *opaque);
static void
afr_discover_done(call_frame_t *frame, xlator_t *this);
+int
+afr_dom_lock_acquire_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
+{
+ afr_local_t *local = frame->local;
+ afr_private_t *priv = this->private;
+ int i = (long)cookie;
+
+ local->cont.lk.dom_lock_op_ret[i] = op_ret;
+ local->cont.lk.dom_lock_op_errno[i] = op_errno;
+ if (op_ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM,
+ "%s: Failed to acquire %s on %s",
+ uuid_utoa(local->fd->inode->gfid), AFR_LK_HEAL_DOM,
+ priv->children[i]->name);
+ } else {
+ local->cont.lk.dom_locked_nodes[i] = 1;
+ }
+
+ syncbarrier_wake(&local->barrier);
+
+ return 0;
+}
+
+int
+afr_dom_lock_acquire(call_frame_t *frame)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ struct gf_flock flock = {
+ 0,
+ };
+ int i = 0;
+
+ priv = frame->this->private;
+ local = frame->local;
+ local->cont.lk.dom_locked_nodes = GF_CALLOC(
+ priv->child_count, sizeof(*local->cont.lk.locked_nodes),
+ gf_afr_mt_char);
+ if (!local->cont.lk.dom_locked_nodes) {
+ return -ENOMEM;
+ }
+ local->cont.lk.dom_lock_op_ret = GF_CALLOC(
+ priv->child_count, sizeof(*local->cont.lk.dom_lock_op_ret),
+ gf_afr_mt_int32_t);
+ if (!local->cont.lk.dom_lock_op_ret) {
+ return -ENOMEM; /* CALLOC'd members are freed in afr_local_cleanup. */
+ }
+ local->cont.lk.dom_lock_op_errno = GF_CALLOC(
+ priv->child_count, sizeof(*local->cont.lk.dom_lock_op_errno),
+ gf_afr_mt_int32_t);
+ if (!local->cont.lk.dom_lock_op_errno) {
+ return -ENOMEM; /* CALLOC'd members are freed in afr_local_cleanup. */
+ }
+ flock.l_type = F_WRLCK;
+
+ AFR_ONALL(frame, afr_dom_lock_acquire_cbk, finodelk, AFR_LK_HEAL_DOM,
+ local->fd, F_SETLK, &flock, NULL);
+
+ if (!afr_has_quorum(local->cont.lk.dom_locked_nodes, frame->this, NULL))
+ goto blocking_lock;
+
+ /*If any of the bricks returned EAGAIN, we still need blocking locks.*/
+ if (AFR_COUNT(local->cont.lk.dom_locked_nodes, priv->child_count) !=
+ priv->child_count) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->cont.lk.dom_lock_op_ret[i] == -1 &&
+ local->cont.lk.dom_lock_op_errno[i] == EAGAIN)
+ goto blocking_lock;
+ }
+ }
+
+ return 0;
+
+blocking_lock:
+ afr_dom_lock_release(frame);
+ AFR_ONALL(frame, afr_dom_lock_acquire_cbk, finodelk, AFR_LK_HEAL_DOM,
+ local->fd, F_SETLKW, &flock, NULL);
+ if (!afr_has_quorum(local->cont.lk.dom_locked_nodes, frame->this, NULL)) {
+ afr_dom_lock_release(frame);
+ return -afr_quorum_errno(priv);
+ }
+
+ return 0;
+}
+
+int
+afr_dom_lock_release_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
+{
+ afr_local_t *local = frame->local;
+ afr_private_t *priv = this->private;
+ int i = (long)cookie;
+
+ if (op_ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM,
+ "%s: Failed to release %s on %s", local->loc.path,
+ AFR_LK_HEAL_DOM, priv->children[i]->name);
+ }
+ local->cont.lk.dom_locked_nodes[i] = 0;
+
+ syncbarrier_wake(&local->barrier);
+
+ return 0;
+}
+
+void
+afr_dom_lock_release(call_frame_t *frame)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ unsigned char *locked_on = NULL;
+ struct gf_flock flock = {
+ 0,
+ };
+
+ local = frame->local;
+ priv = frame->this->private;
+ locked_on = local->cont.lk.dom_locked_nodes;
+ if (AFR_COUNT(locked_on, priv->child_count) == 0)
+ return;
+ flock.l_type = F_UNLCK;
+
+ AFR_ONLIST(locked_on, frame, afr_dom_lock_release_cbk, finodelk,
+ AFR_LK_HEAL_DOM, local->fd, F_SETLK, &flock, NULL);
+
+ return;
+}
+
+static void
+afr_lk_heal_info_cleanup(afr_lk_heal_info_t *info)
+{
+ if (!info)
+ return;
+ if (info->xdata_req)
+ dict_unref(info->xdata_req);
+ if (info->fd)
+ fd_unref(info->fd);
+ GF_FREE(info->locked_nodes);
+ GF_FREE(info->child_up_event_gen);
+ GF_FREE(info->child_down_event_gen);
+ GF_FREE(info);
+}
+
+static int
+afr_add_lock_to_saved_locks(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ afr_local_t *local = frame->local;
+ afr_lk_heal_info_t *info = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int ret = -ENOMEM;
+
+ info = GF_CALLOC(sizeof(*info), 1, gf_afr_mt_lk_heal_info_t);
+ if (!info) {
+ goto cleanup;
+ }
+ INIT_LIST_HEAD(&info->pos);
+ info->fd = fd_ref(local->fd);
+ info->cmd = local->cont.lk.cmd;
+ info->pid = frame->root->pid;
+ info->flock = local->cont.lk.user_flock;
+ info->xdata_req = dict_copy_with_ref(local->xdata_req, NULL);
+ if (!info->xdata_req) {
+ goto cleanup;
+ }
+ info->lk_owner = frame->root->lk_owner;
+ info->locked_nodes = GF_MALLOC(
+ sizeof(*info->locked_nodes) * priv->child_count, gf_afr_mt_char);
+ if (!info->locked_nodes) {
+ goto cleanup;
+ }
+ memcpy(info->locked_nodes, local->cont.lk.locked_nodes,
+ sizeof(*info->locked_nodes) * priv->child_count);
+ info->child_up_event_gen = GF_CALLOC(sizeof(*info->child_up_event_gen),
+ priv->child_count, gf_afr_mt_int32_t);
+ if (!info->child_up_event_gen) {
+ goto cleanup;
+ }
+ info->child_down_event_gen = GF_CALLOC(sizeof(*info->child_down_event_gen),
+ priv->child_count,
+ gf_afr_mt_int32_t);
+ if (!info->child_down_event_gen) {
+ goto cleanup;
+ }
+
+ LOCK(&local->fd->lock);
+ {
+ fd_ctx = __afr_fd_ctx_get(local->fd, this);
+ if (fd_ctx)
+ fd_ctx->lk_heal_info = info;
+ }
+ UNLOCK(&local->fd->lock);
+ if (!fd_ctx) {
+ goto cleanup;
+ }
+
+ LOCK(&priv->lock);
+ {
+ list_add_tail(&info->pos, &priv->saved_locks);
+ }
+ UNLOCK(&priv->lock);
+
+ return 0;
+cleanup:
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_LK_HEAL_DOM,
+ "%s: Failed to add lock to healq",
+ uuid_utoa(local->fd->inode->gfid));
+ if (info) {
+ afr_lk_heal_info_cleanup(info);
+ if (fd_ctx) {
+ LOCK(&local->fd->lock);
+ {
+ fd_ctx->lk_heal_info = NULL;
+ }
+ UNLOCK(&local->fd->lock);
+ }
+ }
+ return ret;
+}
+
+static int
+afr_remove_lock_from_saved_locks(afr_local_t *local, xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ struct gf_flock flock = local->cont.lk.user_flock;
+ afr_lk_heal_info_t *info = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int ret = -EINVAL;
+
+ fd_ctx = afr_fd_ctx_get(local->fd, this);
+ if (!fd_ctx || !fd_ctx->lk_heal_info) {
+ goto out;
+ }
+
+ info = fd_ctx->lk_heal_info;
+ if ((info->flock.l_start != flock.l_start) ||
+ (info->flock.l_whence != flock.l_whence) ||
+ (info->flock.l_len != flock.l_len)) {
+ /*TODO: Compare lkowners too.*/
+ goto out;
+ }
+
+ LOCK(&priv->lock);
+ {
+ list_del(&fd_ctx->lk_heal_info->pos);
+ }
+ UNLOCK(&priv->lock);
+
+ afr_lk_heal_info_cleanup(info);
+ fd_ctx->lk_heal_info = NULL;
+ ret = 0;
+out:
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_LK_HEAL_DOM,
+ "%s: Failed to remove lock from healq",
+ uuid_utoa(local->fd->inode->gfid));
+ return ret;
+}
+
+int
+afr_lock_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+ dict_t *xdata)
+{
+ afr_local_t *local = frame->local;
+ int i = (long)cookie;
+
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (op_ret != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM,
+ "Failed to heal lock on child %d for %s", i,
+ uuid_utoa(local->fd->inode->gfid));
+ }
+ syncbarrier_wake(&local->barrier);
+ return 0;
+}
+
+int
+afr_getlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct gf_flock *lock, dict_t *xdata)
+{
+ afr_local_t *local = frame->local;
+ int i = (long)cookie;
+
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (op_ret != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM,
+ "Failed getlk for %s", uuid_utoa(local->fd->inode->gfid));
+ } else {
+ local->cont.lk.getlk_rsp[i] = *lock;
+ }
+
+ syncbarrier_wake(&local->barrier);
+ return 0;
+}
+
+static gf_boolean_t
+afr_does_lk_owner_match(call_frame_t *frame, afr_private_t *priv,
+ afr_lk_heal_info_t *info)
+{
+ int i = 0;
+ afr_local_t *local = frame->local;
+ struct gf_flock flock = {
+ 0,
+ };
+ gf_boolean_t ret = _gf_true;
+ char *wind_on = alloca0(priv->child_count);
+ unsigned char *success_replies = alloca0(priv->child_count);
+ local->cont.lk.getlk_rsp = GF_CALLOC(sizeof(*local->cont.lk.getlk_rsp),
+ priv->child_count, gf_afr_mt_gf_lock);
+
+ flock = info->flock;
+ for (i = 0; i < priv->child_count; i++) {
+ if (info->locked_nodes[i])
+ wind_on[i] = 1;
+ }
+
+ AFR_ONLIST(wind_on, frame, afr_getlk_cbk, lk, info->fd, F_GETLK, &flock,
+ info->xdata_req);
+
+ afr_fill_success_replies(local, priv, success_replies);
+ if (AFR_COUNT(success_replies, priv->child_count) == 0) {
+ ret = _gf_false;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid || local->replies[i].op_ret != 0)
+ continue;
+ if (local->cont.lk.getlk_rsp[i].l_type == F_UNLCK)
+ continue;
+ /*TODO: Do we really need to compare lkowner if F_UNLCK is true?*/
+ if (!is_same_lkowner(&local->cont.lk.getlk_rsp[i].l_owner,
+ &info->lk_owner)) {
+ ret = _gf_false;
+ break;
+ }
+ }
+out:
+ afr_local_replies_wipe(local, priv);
+ GF_FREE(local->cont.lk.getlk_rsp);
+ local->cont.lk.getlk_rsp = NULL;
+ return ret;
+}
+
+static void
+afr_mark_fd_bad(fd_t *fd, xlator_t *this)
+{
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+ if (!fd)
+ return;
+ LOCK(&fd->lock);
+ {
+ fd_ctx = __afr_fd_ctx_get(fd, this);
+ if (fd_ctx) {
+ fd_ctx->is_fd_bad = _gf_true;
+ fd_ctx->lk_heal_info = NULL;
+ }
+ }
+ UNLOCK(&fd->lock);
+}
+
+static void
+afr_add_lock_to_lkhealq(afr_private_t *priv, afr_lk_heal_info_t *info)
+{
+ LOCK(&priv->lock);
+ {
+ list_del(&info->pos);
+ list_add_tail(&info->pos, &priv->lk_healq);
+ }
+ UNLOCK(&priv->lock);
+}
+
+static void
+afr_lock_heal_do(call_frame_t *frame, afr_private_t *priv,
+ afr_lk_heal_info_t *info)
+{
+ int i = 0;
+ int op_errno = 0;
+ int32_t *current_event_gen = NULL;
+ afr_local_t *local = frame->local;
+ xlator_t *this = frame->this;
+ char *wind_on = alloca0(priv->child_count);
+ gf_boolean_t retry = _gf_true;
+
+ frame->root->pid = info->pid;
+ lk_owner_copy(&frame->root->lk_owner, &info->lk_owner);
+
+ op_errno = -afr_dom_lock_acquire(frame);
+ if ((op_errno != 0)) {
+ goto release;
+ }
+
+ if (!afr_does_lk_owner_match(frame, priv, info)) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_LK_HEAL_DOM,
+ "Ignoring lock heal for %s since lk-onwers mismatch. "
+ "Lock possibly pre-empted by another client.",
+ uuid_utoa(info->fd->inode->gfid));
+ goto release;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (info->locked_nodes[i])
+ continue;
+ wind_on[i] = 1;
+ }
+
+ current_event_gen = alloca(priv->child_count);
+ memcpy(current_event_gen, info->child_up_event_gen,
+ priv->child_count * sizeof *current_event_gen);
+ AFR_ONLIST(wind_on, frame, afr_lock_heal_cbk, lk, info->fd, info->cmd,
+ &info->flock, info->xdata_req);
+
+ LOCK(&priv->lock);
+ {
+ for (i = 0; i < priv->child_count; i++) {
+ if (!wind_on[i])
+ continue;
+ if ((!local->replies[i].valid) || (local->replies[i].op_ret != 0)) {
+ continue;
+ }
+
+ if ((current_event_gen[i] == info->child_up_event_gen[i]) &&
+ (current_event_gen[i] > info->child_down_event_gen[i])) {
+ info->locked_nodes[i] = 1;
+ retry = _gf_false;
+ list_del_init(&info->pos);
+ list_add_tail(&info->pos, &priv->saved_locks);
+ } else {
+ /*We received subsequent child up/down events while heal was in
+ * progress; don't mark child as healed. Attempt again on the
+ * new child up*/
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_LK_HEAL_DOM,
+ "Event gen mismatch: skipped healing lock on child %d "
+ "for %s.",
+ i, uuid_utoa(info->fd->inode->gfid));
+ }
+ }
+ }
+ UNLOCK(&priv->lock);
+
+release:
+ afr_dom_lock_release(frame);
+ if (retry)
+ afr_add_lock_to_lkhealq(priv, info);
+ return;
+}
+
+static int
+afr_lock_heal_done(int ret, call_frame_t *frame, void *opaque)
+{
+ STACK_DESTROY(frame->root);
+ return 0;
+}
+
+static int
+afr_lock_heal(void *opaque)
+{
+ call_frame_t *frame = (call_frame_t *)opaque;
+ call_frame_t *iter_frame = NULL;
+ xlator_t *this = frame->this;
+ afr_private_t *priv = this->private;
+ afr_lk_heal_info_t *info = NULL;
+ afr_lk_heal_info_t *tmp = NULL;
+ struct list_head healq = {
+ 0,
+ };
+ int ret = 0;
+
+ iter_frame = afr_copy_frame(frame);
+ if (!iter_frame) {
+ return ENOMEM;
+ }
+
+ INIT_LIST_HEAD(&healq);
+ LOCK(&priv->lock);
+ {
+ list_splice_init(&priv->lk_healq, &healq);
+ }
+ UNLOCK(&priv->lock);
+
+ list_for_each_entry_safe(info, tmp, &healq, pos)
+ {
+ GF_ASSERT((AFR_COUNT(info->locked_nodes, priv->child_count) <
+ priv->child_count));
+ ((afr_local_t *)(iter_frame->local))->fd = fd_ref(info->fd);
+ afr_lock_heal_do(iter_frame, priv, info);
+ AFR_STACK_RESET(iter_frame);
+ if (iter_frame->local == NULL) {
+ ret = ENOTCONN;
+ gf_msg(frame->this->name, GF_LOG_ERROR, ENOTCONN,
+ AFR_MSG_LK_HEAL_DOM,
+ "Aborting processing of lk_healq."
+ "Healing will be reattempted on next child up for locks "
+ "that are still in quorum.");
+ LOCK(&priv->lock);
+ {
+ list_add_tail(&healq, &priv->lk_healq);
+ }
+ UNLOCK(&priv->lock);
+ break;
+ }
+ }
+
+ AFR_STACK_DESTROY(iter_frame);
+ return ret;
+}
+
+static int
+__afr_lock_heal_synctask(xlator_t *this, afr_private_t *priv, int child)
+{
+ int ret = 0;
+ call_frame_t *frame = NULL;
+ afr_lk_heal_info_t *info = NULL;
+ afr_lk_heal_info_t *tmp = NULL;
+
+ if (priv->shd.iamshd)
+ return 0;
+
+ list_for_each_entry_safe(info, tmp, &priv->saved_locks, pos)
+ {
+ info->child_up_event_gen[child] = priv->event_generation;
+ list_del_init(&info->pos);
+ list_add_tail(&info->pos, &priv->lk_healq);
+ }
+
+ frame = create_frame(this, this->ctx->pool);
+ if (!frame)
+ return -1;
+
+ ret = synctask_new(this->ctx->env, afr_lock_heal, afr_lock_heal_done, frame,
+ frame);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_LK_HEAL_DOM,
+ "Failed to launch lock heal synctask");
+
+ return ret;
+}
+
+static int
+__afr_mark_pending_lk_heal(xlator_t *this, afr_private_t *priv, int child)
+{
+ afr_lk_heal_info_t *info = NULL;
+ afr_lk_heal_info_t *tmp = NULL;
+
+ if (priv->shd.iamshd)
+ return 0;
+ list_for_each_entry_safe(info, tmp, &priv->saved_locks, pos)
+ {
+ info->child_down_event_gen[child] = priv->event_generation;
+ if (info->locked_nodes[child] == 1)
+ info->locked_nodes[child] = 0;
+ if (!afr_has_quorum(info->locked_nodes, this, NULL)) {
+ /* Since the lock was lost on quorum no. of nodes, we should
+ * not attempt to heal it anymore. Some other client could have
+ * acquired the lock, modified data and released it and this
+ * client wouldn't know about it if we heal it.*/
+ afr_mark_fd_bad(info->fd, this);
+ list_del(&info->pos);
+ afr_lk_heal_info_cleanup(info);
+ /* We're not winding an unlock on the node where the lock is still
+ * present because when fencing logic switches over to the new
+ * client (since we marked the fd bad), it should preempt any
+ * existing lock. */
+ }
+ }
+ return 0;
+}
+
gf_boolean_t
afr_is_consistent_io_possible(afr_local_t *local, afr_private_t *priv,
int32_t *op_errno)
@@ -70,6 +693,19 @@ afr_is_consistent_io_possible(afr_local_t *local, afr_private_t *priv,
return _gf_true;
}
+gf_boolean_t
+afr_is_lock_mode_mandatory(dict_t *xdata)
+{
+ int ret = 0;
+ uint32_t lk_mode = GF_LK_ADVISORY;
+
+ ret = dict_get_uint32(xdata, GF_LOCK_MODE, &lk_mode);
+ if (!ret && lk_mode == GF_LK_MANDATORY)
+ return _gf_true;
+
+ return _gf_false;
+}
+
call_frame_t *
afr_copy_frame(call_frame_t *base)
{
@@ -284,7 +920,7 @@ __afr_set_in_flight_sb_status(xlator_t *this, afr_local_t *local,
metadatamap |= (1 << index);
}
if (metadatamap_old != metadatamap) {
- event = 0;
+ __afr_inode_need_refresh_set(inode, this);
}
break;
@@ -297,7 +933,7 @@ __afr_set_in_flight_sb_status(xlator_t *this, afr_local_t *local,
datamap |= (1 << index);
}
if (datamap_old != datamap)
- event = 0;
+ __afr_inode_need_refresh_set(inode, this);
break;
default:
@@ -461,34 +1097,6 @@ out:
}
int
-__afr_inode_event_gen_reset_small(inode_t *inode, xlator_t *this)
-{
- int ret = -1;
- uint16_t datamap = 0;
- uint16_t metadatamap = 0;
- uint32_t event = 0;
- uint64_t val = 0;
- afr_inode_ctx_t *ctx = NULL;
-
- ret = __afr_inode_ctx_get(this, inode, &ctx);
- if (ret)
- return ret;
-
- val = ctx->read_subvol;
-
- metadatamap = (val & 0x000000000000ffff) >> 0;
- datamap = (val & 0x00000000ffff0000) >> 16;
- event = 0;
-
- val = ((uint64_t)metadatamap) | (((uint64_t)datamap) << 16) |
- (((uint64_t)event) << 32);
-
- ctx->read_subvol = val;
-
- return ret;
-}
-
-int
__afr_inode_read_subvol_get(inode_t *inode, xlator_t *this, unsigned char *data,
unsigned char *metadata, int *event_p)
{
@@ -559,22 +1167,6 @@ out:
}
int
-__afr_inode_event_gen_reset(inode_t *inode, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- int ret = -1;
-
- priv = this->private;
-
- if (priv->child_count <= 16)
- ret = __afr_inode_event_gen_reset_small(inode, this);
- else
- ret = -1;
-
- return ret;
-}
-
-int
afr_inode_read_subvol_get(inode_t *inode, xlator_t *this, unsigned char *data,
unsigned char *metadata, int *event_p)
{
@@ -640,12 +1232,11 @@ afr_inode_get_readable(call_frame_t *frame, inode_t *inode, xlator_t *this,
return 0;
}
-int
+static int
afr_inode_split_brain_choice_get(inode_t *inode, xlator_t *this,
int *spb_choice)
{
int ret = -1;
-
GF_VALIDATE_OR_GOTO(this->name, inode, out);
LOCK(&inode->lock);
@@ -657,6 +1248,40 @@ out:
return ret;
}
+/*
+ * frame is used to get the favourite policy. Since
+ * afr_inode_split_brain_choice_get can also be called from afr_open, it is
+ * possible to have a frame without local->replies. In that case frame is
+ * passed as NULL, and this function handles the NULL-frame case.
+ */
+int
+afr_split_brain_read_subvol_get(inode_t *inode, xlator_t *this,
+ call_frame_t *frame, int *spb_subvol)
+{
+ int ret = -1;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ GF_VALIDATE_OR_GOTO("afr", this, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
+ GF_VALIDATE_OR_GOTO(this->name, spb_subvol, out);
+
+ priv = this->private;
+
+ ret = afr_inode_split_brain_choice_get(inode, this, spb_subvol);
+ if (*spb_subvol < 0 && priv->fav_child_policy && frame && frame->local) {
+ local = frame->local;
+ *spb_subvol = afr_sh_get_fav_by_policy(this, local->replies, inode,
+ NULL);
+ if (*spb_subvol >= 0) {
+ ret = 0;
+ }
+ }
+
+out:
+ return ret;
+}
int
afr_inode_read_subvol_set(inode_t *inode, xlator_t *this, unsigned char *data,
unsigned char *metadata, int event)
@@ -723,30 +1348,22 @@ out:
return need_refresh;
}
-static int
-afr_inode_need_refresh_set(inode_t *inode, xlator_t *this)
+int
+__afr_inode_need_refresh_set(inode_t *inode, xlator_t *this)
{
int ret = -1;
afr_inode_ctx_t *ctx = NULL;
- GF_VALIDATE_OR_GOTO(this->name, inode, out);
-
- LOCK(&inode->lock);
- {
- ret = __afr_inode_ctx_get(this, inode, &ctx);
- if (ret)
- goto unlock;
-
+ ret = __afr_inode_ctx_get(this, inode, &ctx);
+ if (ret == 0) {
ctx->need_refresh = _gf_true;
}
-unlock:
- UNLOCK(&inode->lock);
-out:
+
return ret;
}
int
-afr_inode_event_gen_reset(inode_t *inode, xlator_t *this)
+afr_inode_need_refresh_set(inode_t *inode, xlator_t *this)
{
int ret = -1;
@@ -754,7 +1371,7 @@ afr_inode_event_gen_reset(inode_t *inode, xlator_t *this)
LOCK(&inode->lock);
{
- ret = __afr_inode_event_gen_reset(inode, this);
+ ret = __afr_inode_need_refresh_set(inode, this);
}
UNLOCK(&inode->lock);
out:
@@ -820,7 +1437,6 @@ afr_set_split_brain_choice(int ret, call_frame_t *frame, void *opaque)
gf_boolean_t timer_set = _gf_false;
gf_boolean_t timer_cancelled = _gf_false;
gf_boolean_t timer_reset = _gf_false;
- gf_boolean_t need_invalidate = _gf_true;
int old_spb_choice = -1;
frame = data->frame;
@@ -932,7 +1548,6 @@ afr_set_split_brain_choice(int ret, call_frame_t *frame, void *opaque)
timer_set = _gf_true;
if (timer_reset && !ctx->timer)
timer_cancelled = _gf_true;
- need_invalidate = _gf_false;
}
unlock:
UNLOCK(&inode->lock);
@@ -946,8 +1561,7 @@ post_unlock:
* reads from an older cached value despite a change in spb_choice to
* a new value.
*/
- if (need_invalidate)
- inode_invalidate(inode);
+ inode_invalidate(inode);
out:
GF_FREE(data);
AFR_STACK_UNWIND(setxattr, frame, ret, op_errno, NULL);
@@ -1054,6 +1668,8 @@ afr_readables_fill(call_frame_t *frame, xlator_t *this, inode_t *inode,
ia_type = inode->ia_type;
}
+ if (!xdata)
+ continue; /* mkdir_cbk sends NULL xdata_rsp. */
afr_accused_fill(this, xdata, data_accused,
(ia_type == IA_IFDIR) ? AFR_ENTRY_TRANSACTION
: AFR_DATA_TRANSACTION);
@@ -1175,7 +1791,6 @@ afr_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err)
inode_t *inode = NULL;
int event_generation = 0;
int read_subvol = -1;
- int op_errno = ENOMEM;
int ret = 0;
local = frame->local;
@@ -1191,7 +1806,7 @@ afr_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err)
ret = afr_inode_get_readable(frame, inode, this, local->readable,
&event_generation, local->transaction.type);
- if (ret == -EIO || (local->is_read_txn && !event_generation)) {
+ if (ret == -EIO) {
/* No readable subvolume even after refresh ==> splitbrain.*/
if (!priv->fav_child_policy) {
err = EIO;
@@ -1204,18 +1819,12 @@ afr_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err)
goto refresh_done;
}
- heal_frame = copy_frame(frame);
+ heal_frame = afr_frame_create(this, NULL);
if (!heal_frame) {
err = EIO;
goto refresh_done;
}
- heal_frame->root->pid = GF_CLIENT_PID_SELF_HEALD;
- heal_local = AFR_FRAME_INIT(heal_frame, op_errno);
- if (!heal_local) {
- err = EIO;
- AFR_STACK_DESTROY(heal_frame);
- goto refresh_done;
- }
+ heal_local = heal_frame->local;
heal_local->xdata_req = dict_new();
if (!heal_local->xdata_req) {
err = EIO;
@@ -1236,18 +1845,6 @@ refresh_done:
return 0;
}
-static void
-afr_fill_success_replies(afr_local_t *local, afr_private_t *priv,
- unsigned char *replies)
-{
- int i = 0;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->replies[i].valid && local->replies[i].op_ret == 0)
- replies[i] = 1;
- }
-}
-
int
afr_inode_refresh_done(call_frame_t *frame, xlator_t *this, int error)
{
@@ -1257,7 +1854,6 @@ afr_inode_refresh_done(call_frame_t *frame, xlator_t *this, int error)
gf_boolean_t start_heal = _gf_false;
afr_local_t *heal_local = NULL;
unsigned char *success_replies = NULL;
- int op_errno = ENOMEM;
int ret = 0;
if (error != 0) {
@@ -1269,32 +1865,32 @@ afr_inode_refresh_done(call_frame_t *frame, xlator_t *this, int error)
success_replies = alloca0(priv->child_count);
afr_fill_success_replies(local, priv, success_replies);
- if (!afr_has_quorum(success_replies, this, frame)) {
- error = afr_final_errno(frame->local, this->private);
- if (!error)
- error = afr_quorum_errno(priv);
- goto refresh_done;
- }
-
if (priv->thin_arbiter_count && local->is_read_txn &&
AFR_COUNT(success_replies, priv->child_count) != priv->child_count) {
/* We need to query the good bricks and/or thin-arbiter.*/
+ if (success_replies[0]) {
+ local->read_txn_query_child = AFR_CHILD_ZERO;
+ } else if (success_replies[1]) {
+ local->read_txn_query_child = AFR_CHILD_ONE;
+ }
error = EINVAL;
goto refresh_done;
}
+ if (!afr_has_quorum(success_replies, this, frame)) {
+ error = afr_final_errno(frame->local, this->private);
+ if (!error)
+ error = afr_quorum_errno(priv);
+ goto refresh_done;
+ }
+
ret = afr_replies_interpret(frame, this, local->refreshinode, &start_heal);
if (ret && afr_selfheal_enabled(this) && start_heal) {
- heal_frame = copy_frame(frame);
+ heal_frame = afr_frame_create(this, NULL);
if (!heal_frame)
goto refresh_done;
- heal_frame->root->pid = GF_CLIENT_PID_SELF_HEALD;
- heal_local = AFR_FRAME_INIT(heal_frame, op_errno);
- if (!heal_local) {
- AFR_STACK_DESTROY(heal_frame);
- goto refresh_done;
- }
+ heal_local = heal_frame->local;
heal_local->refreshinode = inode_ref(local->refreshinode);
heal_local->heal_frame = heal_frame;
if (!afr_throttled_selfheal(heal_frame, this)) {
@@ -1331,17 +1927,22 @@ afr_inode_refresh_subvol_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
if (xdata)
local->replies[call_child].xdata = dict_ref(xdata);
}
+
if (xdata) {
ret = dict_get_int8(xdata, "link-count", &need_heal);
- local->replies[call_child].need_heal = need_heal;
- } else {
- local->replies[call_child].need_heal = need_heal;
+ if (ret) {
+ gf_msg_debug(this->name, -ret, "Unable to get link count");
+ }
}
+ local->replies[call_child].need_heal = need_heal;
call_count = afr_frame_return(frame);
if (call_count == 0) {
afr_set_need_heal(this, local);
ret = afr_inode_refresh_err(frame, this);
+ if (ret) {
+ gf_msg_debug(this->name, ret, "afr_inode_refresh_err failed");
+ }
afr_inode_refresh_done(frame, this, ret);
}
}
@@ -1610,19 +2211,18 @@ out:
}
int
-afr_least_pending_reads_child(afr_private_t *priv)
+afr_least_pending_reads_child(afr_private_t *priv, unsigned char *readable)
{
int i = 0;
- int child = 0;
+ int child = -1;
int64_t read_iter = -1;
int64_t pending_read = -1;
- pending_read = GF_ATOMIC_GET(priv->pending_reads[0]);
- for (i = 1; i < priv->child_count; i++) {
- if (AFR_IS_ARBITER_BRICK(priv, i))
+ for (i = 0; i < priv->child_count; i++) {
+ if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i])
continue;
read_iter = GF_ATOMIC_GET(priv->pending_reads[i]);
- if (read_iter < pending_read) {
+ if (child == -1 || read_iter < pending_read) {
pending_read = read_iter;
child = i;
}
@@ -1631,8 +2231,54 @@ afr_least_pending_reads_child(afr_private_t *priv)
return child;
}
+static int32_t
+afr_least_latency_child(afr_private_t *priv, unsigned char *readable)
+{
+ int32_t i = 0;
+ int child = -1;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i] ||
+ priv->child_latency[i] < 0)
+ continue;
+
+ if (child == -1 ||
+ priv->child_latency[i] < priv->child_latency[child]) {
+ child = i;
+ }
+ }
+ return child;
+}
+
+static int32_t
+afr_least_latency_times_pending_reads_child(afr_private_t *priv,
+ unsigned char *readable)
+{
+ int32_t i = 0;
+ int child = -1;
+ int64_t pending_read = 0;
+ int64_t latency = -1;
+ int64_t least_latency = -1;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i] ||
+ priv->child_latency[i] < 0)
+ continue;
+
+ pending_read = GF_ATOMIC_GET(priv->pending_reads[i]);
+ latency = (pending_read + 1) * priv->child_latency[i];
+
+ if (child == -1 || latency < least_latency) {
+ least_latency = latency;
+ child = i;
+ }
+ }
+ return child;
+}
+
int
-afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv)
+afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv,
+ unsigned char *readable)
{
uuid_t gfid_copy = {
0,
@@ -1641,14 +2287,14 @@ afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv)
int child = -1;
switch (priv->hash_mode) {
- case 0:
+ case AFR_READ_POLICY_FIRST_UP:
break;
- case 1:
+ case AFR_READ_POLICY_GFID_HASH:
gf_uuid_copy(gfid_copy, args->gfid);
child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) %
priv->child_count;
break;
- case 2:
+ case AFR_READ_POLICY_GFID_PID_HASH:
if (args->ia_type != IA_IFDIR) {
/*
* Why getpid? Because it's one of the cheapest calls
@@ -1660,14 +2306,21 @@ afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv)
* need is a low probability that multiple clients
* won't converge on the same subvolume.
*/
+ gf_uuid_copy(gfid_copy, args->gfid);
pid = getpid();
- memcpy(gfid_copy, &pid, sizeof(pid));
+ *(pid_t *)gfid_copy ^= pid;
}
child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) %
priv->child_count;
break;
- case 3:
- child = afr_least_pending_reads_child(priv);
+ case AFR_READ_POLICY_LESS_LOAD:
+ child = afr_least_pending_reads_child(priv, readable);
+ break;
+ case AFR_READ_POLICY_LEAST_LATENCY:
+ child = afr_least_latency_child(priv, readable);
+ break;
+ case AFR_READ_POLICY_LOAD_LATENCY_HYBRID:
+ child = afr_least_latency_times_pending_reads_child(priv, readable);
break;
}
@@ -1700,7 +2353,7 @@ afr_read_subvol_select_by_policy(inode_t *inode, xlator_t *this,
}
/* second preference - use hashed mode */
- read_subvol = afr_hash_child(&local_args, priv);
+ read_subvol = afr_hash_child(&local_args, priv, readable);
if (read_subvol >= 0 && readable[read_subvol])
return read_subvol;
@@ -2006,6 +2659,9 @@ afr_local_cleanup(afr_local_t *local, xlator_t *this)
{ /* lk */
GF_FREE(local->cont.lk.locked_nodes);
+ GF_FREE(local->cont.lk.dom_locked_nodes);
+ GF_FREE(local->cont.lk.dom_lock_op_ret);
+ GF_FREE(local->cont.lk.dom_lock_op_errno);
}
{ /* create */
@@ -2236,7 +2892,7 @@ afr_attempt_readsubvol_set(call_frame_t *frame, xlator_t *this,
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- int spb_choice = -1;
+ int spb_subvol = -1;
int child_count = -1;
if (*read_subvol != -1)
@@ -2246,11 +2902,12 @@ afr_attempt_readsubvol_set(call_frame_t *frame, xlator_t *this,
local = frame->local;
child_count = priv->child_count;
- afr_inode_split_brain_choice_get(local->inode, this, &spb_choice);
- if ((spb_choice >= 0) &&
+ afr_split_brain_read_subvol_get(local->inode, this, frame, &spb_subvol);
+ if ((spb_subvol >= 0) &&
(AFR_COUNT(success_replies, child_count) == child_count)) {
- *read_subvol = spb_choice;
- } else if (!priv->quorum_count) {
+ *read_subvol = spb_subvol;
+ } else if (!priv->quorum_count ||
+ frame->root->pid == GF_CLIENT_PID_GLFS_HEAL) {
*read_subvol = afr_first_up_child(frame, this);
} else if (priv->quorum_count &&
afr_has_quorum(data_readable, this, NULL)) {
@@ -2289,6 +2946,7 @@ afr_lookup_done(call_frame_t *frame, xlator_t *this)
0,
};
gf_boolean_t locked_entry = _gf_false;
+ gf_boolean_t in_flight_create = _gf_false;
gf_boolean_t can_interpret = _gf_true;
inode_t *parent = NULL;
ia_type_t ia_type = IA_INVAL;
@@ -2332,17 +2990,12 @@ afr_lookup_done(call_frame_t *frame, xlator_t *this)
if (!replies[i].valid)
continue;
- if (locked_entry && replies[i].op_ret == -1 &&
- replies[i].op_errno == ENOENT) {
- /* Second, check entry is still
- "underway" in creation */
- local->op_ret = -1;
- local->op_errno = ENOENT;
- goto error;
- }
-
- if (replies[i].op_ret == -1)
+ if (replies[i].op_ret == -1) {
+ if (locked_entry && replies[i].op_errno == ENOENT) {
+ in_flight_create = _gf_true;
+ }
continue;
+ }
if (read_subvol == -1 || !readable[read_subvol]) {
read_subvol = i;
@@ -2352,6 +3005,12 @@ afr_lookup_done(call_frame_t *frame, xlator_t *this)
}
}
+ if (in_flight_create && !afr_has_quorum(success_replies, this, NULL)) {
+ local->op_ret = -1;
+ local->op_errno = ENOENT;
+ goto error;
+ }
+
if (read_subvol == -1)
goto error;
/* We now have a read_subvol, which is readable[] (if there
@@ -2410,7 +3069,7 @@ afr_lookup_done(call_frame_t *frame, xlator_t *this)
if (read_subvol == -1)
goto cant_interpret;
if (ret) {
- afr_inode_event_gen_reset(local->inode, this);
+ afr_inode_need_refresh_set(local->inode, this);
dict_del_sizen(local->replies[read_subvol].xdata, GF_CONTENT_KEY);
}
} else {
@@ -2463,7 +3122,7 @@ error:
* others in that they must be given higher priority while
* returning to the user.
*
- * The hierarchy is ENODATA > ENOENT > ESTALE > others
+ * The hierarchy is ENODATA > ENOENT > ESTALE > ENOSPC > others
*/
int
@@ -2475,6 +3134,8 @@ afr_higher_errno(int32_t old_errno, int32_t new_errno)
return ENOENT;
if (old_errno == ESTALE || new_errno == ESTALE)
return ESTALE;
+ if (old_errno == ENOSPC || new_errno == ENOSPC)
+ return ENOSPC;
return new_errno;
}
@@ -2604,6 +3265,10 @@ afr_lookup_sh_metadata_wrap(void *opaque)
dict = dict_new();
if (!dict)
goto out;
+ if (local->xattr_req) {
+ dict_copy(local->xattr_req, dict);
+ }
+
ret = dict_set_sizen_str_sizen(dict, "link-count", GF_XATTROP_INDEX_COUNT);
if (ret) {
gf_msg_debug(this->name, -ret, "Unable to set link-count in dict ");
@@ -2612,7 +3277,7 @@ afr_lookup_sh_metadata_wrap(void *opaque)
if (loc_is_nameless(&local->loc)) {
ret = afr_selfheal_unlocked_discover_on(frame, local->inode,
local->loc.gfid, local->replies,
- local->child_up);
+ local->child_up, dict);
} else {
inode = afr_selfheal_unlocked_lookup_on(frame, local->loc.parent,
local->loc.name, local->replies,
@@ -2786,7 +3451,7 @@ afr_lookup_selfheal_wrap(void *opaque)
inode = afr_selfheal_unlocked_lookup_on(frame, local->loc.parent,
local->loc.name, local->replies,
- local->child_up, NULL);
+ local->child_up, local->xattr_req);
if (inode)
inode_unref(inode);
@@ -2962,6 +3627,7 @@ afr_discover_unwind(call_frame_t *frame, xlator_t *this)
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
int read_subvol = -1;
+ int ret = 0;
unsigned char *data_readable = NULL;
unsigned char *success_replies = NULL;
@@ -2983,7 +3649,10 @@ afr_discover_unwind(call_frame_t *frame, xlator_t *this)
if (!afr_has_quorum(success_replies, this, frame))
goto unwind;
- afr_replies_interpret(frame, this, local->inode, NULL);
+ ret = afr_replies_interpret(frame, this, local->inode, NULL);
+ if (ret) {
+ afr_inode_need_refresh_set(local->inode, this);
+ }
read_subvol = afr_read_subvol_decide(local->inode, this, NULL,
data_readable);
@@ -3035,7 +3704,7 @@ afr_ta_id_file_check(void *opaque)
this = opaque;
priv = this->private;
- ret = afr_fill_ta_loc(this, &loc);
+ ret = afr_fill_ta_loc(this, &loc, _gf_false);
if (ret) {
gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
"Failed to populate thin-arbiter loc for: %s.", loc.name);
@@ -3226,10 +3895,15 @@ afr_discover(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
local->inode = inode_ref(loc->inode);
- if (xattr_req)
+ if (xattr_req) {
/* If xattr_req was null, afr_lookup_xattr_req_prepare() will
allocate one for us */
- local->xattr_req = dict_ref(xattr_req);
+ local->xattr_req = dict_copy_with_ref(xattr_req, NULL);
+ if (!local->xattr_req) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+ }
if (gf_uuid_is_null(loc->inode->gfid)) {
afr_discover_do(frame, this, 0);
@@ -3239,11 +3913,7 @@ afr_discover(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
afr_read_subvol_get(loc->inode, this, NULL, NULL, &event,
AFR_DATA_TRANSACTION, NULL);
- if (afr_is_inode_refresh_reqd(loc->inode, this, event,
- local->event_generation))
- afr_inode_refresh(frame, this, loc->inode, NULL, afr_discover_do);
- else
- afr_discover_do(frame, this, 0);
+ afr_discover_do(frame, this, 0);
return 0;
out:
@@ -3344,11 +4014,10 @@ afr_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
return 0;
}
- if (__is_root_gfid(loc->parent->gfid)) {
- if (!strcmp(loc->name, GF_REPLICATE_TRASH_DIR)) {
- op_errno = EPERM;
- goto out;
- }
+ if (afr_is_private_directory(this->private, loc->parent->gfid, loc->name,
+ frame->root->pid)) {
+ op_errno = EPERM;
+ goto out;
}
local = AFR_FRAME_INIT(frame, op_errno);
@@ -3384,11 +4053,7 @@ afr_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
afr_read_subvol_get(loc->parent, this, NULL, NULL, &event,
AFR_DATA_TRANSACTION, NULL);
- if (afr_is_inode_refresh_reqd(loc->inode, this, event,
- local->event_generation))
- afr_inode_refresh(frame, this, loc->parent, NULL, afr_lookup_do);
- else
- afr_lookup_do(frame, this, 0);
+ afr_lookup_do(frame, this, 0);
return 0;
out:
@@ -3398,8 +4063,18 @@ out:
}
void
-_afr_cleanup_fd_ctx(afr_fd_ctx_t *fd_ctx)
+_afr_cleanup_fd_ctx(xlator_t *this, afr_fd_ctx_t *fd_ctx)
{
+ afr_private_t *priv = this->private;
+
+ if (fd_ctx->lk_heal_info) {
+ LOCK(&priv->lock);
+ {
+ list_del(&fd_ctx->lk_heal_info->pos);
+ }
+ afr_lk_heal_info_cleanup(fd_ctx->lk_heal_info);
+ fd_ctx->lk_heal_info = NULL;
+ }
GF_FREE(fd_ctx->opened_on);
GF_FREE(fd_ctx);
return;
@@ -3419,7 +4094,7 @@ afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd)
fd_ctx = (afr_fd_ctx_t *)(long)ctx;
if (fd_ctx) {
- _afr_cleanup_fd_ctx(fd_ctx);
+ _afr_cleanup_fd_ctx(this, fd_ctx);
}
out:
@@ -3512,13 +4187,14 @@ __afr_fd_ctx_set(xlator_t *this, fd_t *fd)
}
fd_ctx->readdir_subvol = -1;
+ fd_ctx->lk_heal_info = NULL;
ret = __fd_ctx_set(fd, this, (uint64_t)(long)fd_ctx);
if (ret)
gf_msg_debug(this->name, 0, "failed to set fd ctx (%p)", fd);
out:
if (ret && fd_ctx)
- _afr_cleanup_fd_ctx(fd_ctx);
+ _afr_cleanup_fd_ctx(this, fd_ctx);
return ret;
}
@@ -3542,11 +4218,10 @@ afr_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
} else {
local->op_errno = op_errno;
}
+ call_count = --local->call_count;
}
UNLOCK(&frame->lock);
- call_count = afr_frame_return(frame);
-
if (call_count == 0)
AFR_STACK_UNWIND(flush, frame, local->op_ret, local->op_errno,
local->xdata_rsp);
@@ -3642,6 +4317,7 @@ afr_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
call_stub_t *stub = NULL;
int op_errno = ENOMEM;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
local = AFR_FRAME_INIT(frame, op_errno);
if (!local)
goto out;
@@ -3682,11 +4358,10 @@ afr_fsyncdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
} else {
local->op_errno = op_errno;
}
+ call_count = --local->call_count;
}
UNLOCK(&frame->lock);
- call_count = afr_frame_return(frame);
-
if (call_count == 0)
AFR_STACK_UNWIND(fsyncdir, frame, local->op_ret, local->op_errno,
local->xdata_rsp);
@@ -4179,9 +4854,9 @@ out:
}
static int32_t
-afr_handle_inodelk(call_frame_t *frame, glusterfs_fop_t fop, const char *volume,
- loc_t *loc, fd_t *fd, int32_t cmd, struct gf_flock *flock,
- dict_t *xdata)
+afr_handle_inodelk(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop,
+ const char *volume, loc_t *loc, fd_t *fd, int32_t cmd,
+ struct gf_flock *flock, dict_t *xdata)
{
afr_local_t *local = NULL;
int32_t op_errno = ENOMEM;
@@ -4193,8 +4868,10 @@ afr_handle_inodelk(call_frame_t *frame, glusterfs_fop_t fop, const char *volume,
local->op = fop;
if (loc)
loc_copy(&local->loc, loc);
- if (fd)
+ if (fd && (flock->l_type != F_UNLCK)) {
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
local->fd = fd_ref(fd);
+ }
local->cont.inodelk.volume = gf_strdup(volume);
if (!local->cont.inodelk.volume) {
@@ -4223,8 +4900,8 @@ int32_t
afr_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
int32_t cmd, struct gf_flock *flock, dict_t *xdata)
{
- afr_handle_inodelk(frame, GF_FOP_INODELK, volume, loc, NULL, cmd, flock,
- xdata);
+ afr_handle_inodelk(frame, this, GF_FOP_INODELK, volume, loc, NULL, cmd,
+ flock, xdata);
return 0;
}
@@ -4232,15 +4909,16 @@ int32_t
afr_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
int32_t cmd, struct gf_flock *flock, dict_t *xdata)
{
- afr_handle_inodelk(frame, GF_FOP_FINODELK, volume, NULL, fd, cmd, flock,
- xdata);
+ afr_handle_inodelk(frame, this, GF_FOP_FINODELK, volume, NULL, fd, cmd,
+ flock, xdata);
return 0;
}
static int
-afr_handle_entrylk(call_frame_t *frame, glusterfs_fop_t fop, const char *volume,
- loc_t *loc, fd_t *fd, const char *basename, entrylk_cmd cmd,
- entrylk_type type, dict_t *xdata)
+afr_handle_entrylk(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop,
+ const char *volume, loc_t *loc, fd_t *fd,
+ const char *basename, entrylk_cmd cmd, entrylk_type type,
+ dict_t *xdata)
{
afr_local_t *local = NULL;
int32_t op_errno = ENOMEM;
@@ -4252,8 +4930,10 @@ afr_handle_entrylk(call_frame_t *frame, glusterfs_fop_t fop, const char *volume,
local->op = fop;
if (loc)
loc_copy(&local->loc, loc);
- if (fd)
+ if (fd && (cmd != ENTRYLK_UNLOCK)) {
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
local->fd = fd_ref(fd);
+ }
local->cont.entrylk.cmd = cmd;
local->cont.entrylk.in_cmd = cmd;
local->cont.entrylk.type = type;
@@ -4280,8 +4960,8 @@ afr_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
const char *basename, entrylk_cmd cmd, entrylk_type type,
dict_t *xdata)
{
- afr_handle_entrylk(frame, GF_FOP_ENTRYLK, volume, loc, NULL, basename, cmd,
- type, xdata);
+ afr_handle_entrylk(frame, this, GF_FOP_ENTRYLK, volume, loc, NULL, basename,
+ cmd, type, xdata);
return 0;
}
@@ -4290,8 +4970,8 @@ afr_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
const char *basename, entrylk_cmd cmd, entrylk_type type,
dict_t *xdata)
{
- afr_handle_entrylk(frame, GF_FOP_FENTRYLK, volume, NULL, fd, basename, cmd,
- type, xdata);
+ afr_handle_entrylk(frame, this, GF_FOP_FENTRYLK, volume, NULL, fd, basename,
+ cmd, type, xdata);
return 0;
}
@@ -4303,10 +4983,10 @@ afr_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
int call_count = 0;
struct statvfs *buf = NULL;
+ local = frame->local;
+
LOCK(&frame->lock);
{
- local = frame->local;
-
if (op_ret != 0) {
local->op_errno = op_errno;
goto unlock;
@@ -4332,10 +5012,9 @@ afr_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
}
}
unlock:
+ call_count = --local->call_count;
UNLOCK(&frame->lock);
- call_count = afr_frame_return(frame);
-
if (call_count == 0)
AFR_STACK_UNWIND(statfs, frame, local->op_ret, local->op_errno,
&local->cont.statfs.buf, local->xdata_rsp);
@@ -4410,9 +5089,10 @@ afr_lk_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
}
call_count = afr_frame_return(frame);
- if (call_count == 0)
+ if (call_count == 0) {
AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno, NULL,
local->xdata_rsp);
+ }
return 0;
}
@@ -4511,11 +5191,133 @@ afr_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
}
int
+afr_lk_transaction_cbk(int ret, call_frame_t *frame, void *opaque)
+{
+ return 0;
+}
+
+int
+afr_lk_txn_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int child_index = -1;
+
+ local = frame->local;
+ child_index = (long)cookie;
+ afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata);
+ if (op_ret == 0) {
+ local->op_ret = 0;
+ local->op_errno = 0;
+ local->cont.lk.locked_nodes[child_index] = 1;
+ local->cont.lk.ret_flock = *lock;
+ }
+ syncbarrier_wake(&local->barrier);
+ return 0;
+}
+
+int
+afr_lk_txn_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+ dict_t *xdata)
+{
+ afr_local_t *local = frame->local;
+ afr_private_t *priv = this->private;
+ int child_index = (long)cookie;
+
+ if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_UNLOCK_FAIL,
+ "gfid=%s: unlock failed on subvolume %s "
+ "with lock owner %s",
+ uuid_utoa(local->fd->inode->gfid),
+ priv->children[child_index]->name,
+ lkowner_utoa(&frame->root->lk_owner));
+ }
+ return 0;
+}
+int
+afr_lk_transaction(void *opaque)
+{
+ call_frame_t *frame = NULL;
+ xlator_t *this = NULL;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ char *wind_on = NULL;
+ int op_errno = 0;
+ int i = 0;
+ int ret = 0;
+
+ frame = (call_frame_t *)opaque;
+ local = frame->local;
+ this = frame->this;
+ priv = this->private;
+ wind_on = alloca0(priv->child_count);
+
+ if (priv->arbiter_count || priv->child_count != 3) {
+ op_errno = ENOTSUP;
+ gf_msg(frame->this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM,
+ "%s: Lock healing supported only for replica 3 volumes.",
+ uuid_utoa(local->fd->inode->gfid));
+ goto err;
+ }
+
+ op_errno = -afr_dom_lock_acquire(frame); // Released during
+ // AFR_STACK_UNWIND
+ if (op_errno != 0) {
+ goto err;
+ }
+ if (priv->quorum_count &&
+ !afr_has_quorum(local->cont.lk.dom_locked_nodes, this, NULL)) {
+ op_errno = afr_final_errno(local, priv);
+ goto err;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (priv->child_up[i] && local->cont.lk.dom_locked_nodes[i])
+ wind_on[i] = 1;
+ }
+ AFR_ONLIST(wind_on, frame, afr_lk_txn_wind_cbk, lk, local->fd,
+ local->cont.lk.cmd, &local->cont.lk.user_flock,
+ local->xdata_req);
+
+ if (priv->quorum_count &&
+ !afr_has_quorum(local->cont.lk.locked_nodes, this, NULL)) {
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno(local, priv);
+ goto unlock;
+ } else {
+ if (local->cont.lk.user_flock.l_type == F_UNLCK)
+ ret = afr_remove_lock_from_saved_locks(local, this);
+ else
+ ret = afr_add_lock_to_saved_locks(frame, this);
+ if (ret) {
+ local->op_ret = -1;
+ local->op_errno = -ret;
+ goto unlock;
+ }
+ AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno,
+ &local->cont.lk.ret_flock, local->xdata_rsp);
+ }
+
+ return 0;
+
+unlock:
+ local->cont.lk.user_flock.l_type = F_UNLCK;
+ AFR_ONLIST(local->cont.lk.locked_nodes, frame, afr_lk_txn_unlock_cbk, lk,
+ local->fd, F_SETLK, &local->cont.lk.user_flock, NULL);
+err:
+ AFR_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL);
+ return -1;
+}
+
+int
afr_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
struct gf_flock *flock, dict_t *xdata)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
+ int ret = 0;
int i = 0;
int32_t op_errno = ENOMEM;
@@ -4526,9 +5328,11 @@ afr_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
goto out;
local->op = GF_FOP_LK;
- if (!afr_lk_is_unlock(cmd, flock) &&
- !afr_is_consistent_io_possible(local, priv, &op_errno))
- goto out;
+ if (!afr_lk_is_unlock(cmd, flock)) {
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ if (!afr_is_consistent_io_possible(local, priv, &op_errno))
+ goto out;
+ }
local->cont.lk.locked_nodes = GF_CALLOC(
priv->child_count, sizeof(*local->cont.lk.locked_nodes),
@@ -4546,6 +5350,16 @@ afr_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
if (xdata)
local->xdata_req = dict_ref(xdata);
+ if (afr_is_lock_mode_mandatory(xdata)) {
+ ret = synctask_new(this->ctx->env, afr_lk_transaction,
+ afr_lk_transaction_cbk, frame, frame);
+ if (ret) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+ return 0;
+ }
+
STACK_WIND_COOKIE(frame, afr_lk_cbk, (void *)(long)0, priv->children[i],
priv->children[i]->fops->lk, fd, cmd, flock,
local->xdata_req);
@@ -4867,6 +5681,8 @@ afr_priv_dump(xlator_t *this)
GF_ATOMIC_GET(priv->pending_reads[i]));
sprintf(key, "child_latency[%d]", i);
gf_proc_dump_write(key, "%" PRId64, priv->child_latency[i]);
+ sprintf(key, "halo_child_up[%d]", i);
+ gf_proc_dump_write(key, "%d", priv->halo_child_up[i]);
}
gf_proc_dump_write("data_self_heal", "%d", priv->data_self_heal);
gf_proc_dump_write("metadata_self_heal", "%d", priv->metadata_self_heal);
@@ -4879,6 +5695,7 @@ afr_priv_dump(xlator_t *this)
priv->background_self_heal_count);
gf_proc_dump_write("healers", "%d", priv->healers);
gf_proc_dump_write("read-hash-mode", "%d", priv->hash_mode);
+ gf_proc_dump_write("use-anonymous-inode", "%d", priv->use_anon_inode);
if (priv->quorum_count == AFR_QUORUM_AUTO) {
gf_proc_dump_write("quorum-type", "auto");
} else if (priv->quorum_count == 0) {
@@ -4939,13 +5756,31 @@ __afr_get_up_children_count(afr_private_t *priv)
return up_children;
}
+static int
+__get_heard_from_all_status(xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ int i;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!priv->last_event[i]) {
+ return 0;
+ }
+ }
+ if (priv->thin_arbiter_count && !priv->ta_child_up) {
+ return 0;
+ }
+ return 1;
+}
+
glusterfs_event_t
-__afr_transform_event_from_state(afr_private_t *priv)
+__afr_transform_event_from_state(xlator_t *this)
{
int i = 0;
int up_children = 0;
+ afr_private_t *priv = this->private;
- if (AFR_COUNT(priv->last_event, priv->child_count) == priv->child_count)
+ if (__get_heard_from_all_status(this))
/* have_heard_from_all. Let afr_notify() do the propagation. */
return GF_EVENT_MAXVAL;
@@ -4987,7 +5822,7 @@ afr_notify_cbk(void *data)
goto unlock;
}
priv->timer = NULL;
- event = __afr_transform_event_from_state(priv);
+ event = __afr_transform_event_from_state(this);
if (event != GF_EVENT_MAXVAL)
propagate = _gf_true;
}
@@ -5014,22 +5849,6 @@ __afr_launch_notify_timer(xlator_t *this, afr_private_t *priv)
}
}
-int
-__get_heard_from_all_status(xlator_t *this)
-{
- afr_private_t *priv = this->private;
- int heard_from_all = 1;
- int i = 0;
-
- for (i = 0; i < priv->child_count; i++) {
- if (!priv->last_event[i]) {
- heard_from_all = 0;
- break;
- }
- }
- return heard_from_all;
-}
-
static int
find_best_down_child(xlator_t *this)
{
@@ -5041,7 +5860,7 @@ find_best_down_child(xlator_t *this)
priv = this->private;
for (i = 0; i < priv->child_count; i++) {
- if (priv->child_up[i] && priv->child_latency[i] >= 0 &&
+ if (!priv->child_up[i] && priv->child_latency[i] >= 0 &&
priv->child_latency[i] < best_latency) {
best_child = i;
best_latency = priv->child_latency[i];
@@ -5113,7 +5932,9 @@ __afr_handle_ping_event(xlator_t *this, xlator_t *child_xlator, const int idx,
"), "
"marking child down.",
child_latency_msec, halo_max_latency_msec);
- *event = GF_EVENT_CHILD_DOWN;
+ if (priv->halo_child_up[idx]) {
+ *event = GF_EVENT_CHILD_DOWN;
+ }
}
} else if (child_latency_msec < halo_max_latency_msec &&
priv->child_up[idx] == 0) {
@@ -5125,7 +5946,9 @@ __afr_handle_ping_event(xlator_t *this, xlator_t *child_xlator, const int idx,
"), "
"marking child up.",
child_latency_msec, halo_max_latency_msec);
- *event = GF_EVENT_CHILD_UP;
+ if (priv->halo_child_up[idx]) {
+ *event = GF_EVENT_CHILD_UP;
+ }
} else {
gf_log(child_xlator->name, GF_LOG_INFO,
"Not marking child %d up, "
@@ -5187,9 +6010,15 @@ __afr_handle_child_up_event(xlator_t *this, xlator_t *child_xlator,
* want to set the child_latency to MAX to indicate
* the child needs ping data to be available before doing child-up
*/
- if (child_latency_msec < 0 && priv->halo_enabled) {
+ if (!priv->halo_enabled)
+ goto out;
+
+ if (child_latency_msec < 0) {
/*set to INT64_MAX-1 so that it is found for best_down_child*/
- priv->child_latency[idx] = AFR_HALO_MAX_LATENCY;
+ priv->halo_child_up[idx] = 1;
+ if (priv->child_latency[idx] < 0) {
+ priv->child_latency[idx] = AFR_HALO_MAX_LATENCY;
+ }
}
/*
@@ -5227,13 +6056,14 @@ __afr_handle_child_up_event(xlator_t *this, xlator_t *child_xlator,
"up_children (%d) > halo_max_replicas (%d)",
worst_up_child, up_children, priv->halo_max_replicas);
}
-
+out:
if (up_children == 1) {
gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOL_UP,
"Subvolume '%s' came back up; "
"going online.",
child_xlator->name);
- gf_event(EVENT_AFR_SUBVOL_UP, "subvol=%s", this->name);
+ gf_event(EVENT_AFR_SUBVOL_UP, "client-pid=%d; subvol=%s",
+ this->ctx->cmd_args.client_pid, this->name);
} else {
*event = GF_EVENT_SOME_DESCENDENT_UP;
}
@@ -5277,6 +6107,7 @@ __afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx,
*/
if (child_latency_msec < 0) {
priv->child_latency[idx] = child_latency_msec;
+ priv->halo_child_up[idx] = 0;
}
priv->child_up[idx] = 0;
@@ -5289,7 +6120,7 @@ __afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx,
* as we want it to be up to date if we are going to
* begin using it synchronously.
*/
- if (up_children < priv->halo_min_replicas) {
+ if (priv->halo_enabled && up_children < priv->halo_min_replicas) {
best_down_child = find_best_down_child(this);
if (best_down_child >= 0) {
gf_msg_debug(this->name, 0,
@@ -5301,7 +6132,6 @@ __afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx,
*up_child = best_down_child;
}
}
-
for (i = 0; i < priv->child_count; i++)
if (priv->child_up[i] == 0)
down_children++;
@@ -5310,7 +6140,8 @@ __afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx,
"All subvolumes are down. Going "
"offline until at least one of them "
"comes back up.");
- gf_event(EVENT_AFR_SUBVOLS_DOWN, "subvol=%s", this->name);
+ gf_event(EVENT_AFR_SUBVOLS_DOWN, "client-pid=%d; subvol=%s",
+ this->ctx->cmd_args.client_pid, this->name);
} else {
*event = GF_EVENT_SOME_DESCENDENT_DOWN;
}
@@ -5359,6 +6190,11 @@ afr_handle_inodelk_contention(xlator_t *this, struct gf_upcall *upcall)
}
LOCK(&priv->lock);
{
+ if (priv->release_ta_notify_dom_lock == _gf_true) {
+ /* Ignore multiple release requests from shds.*/
+ UNLOCK(&priv->lock);
+ return;
+ }
priv->release_ta_notify_dom_lock = _gf_true;
inmem_count = priv->ta_in_mem_txn_count;
onwire_count = priv->ta_on_wire_txn_count;
@@ -5366,7 +6202,7 @@ afr_handle_inodelk_contention(xlator_t *this, struct gf_upcall *upcall)
UNLOCK(&priv->lock);
if (inmem_count || onwire_count)
/* lock release will happen in txn code path after
- * inflight or on-wire txns are over.*/
+ * in-memory or on-wire txns are over.*/
return;
afr_ta_lock_release_synctask(this);
@@ -5467,13 +6303,13 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
had_quorum = priv->quorum_count &&
afr_has_quorum(priv->child_up, this, NULL);
- if (priv->halo_enabled) {
- halo_max_latency_msec = afr_get_halo_latency(this);
+ if (event == GF_EVENT_CHILD_PING) {
+ child_latency_msec = (int64_t)(uintptr_t)data2;
+ if (priv->halo_enabled) {
+ halo_max_latency_msec = afr_get_halo_latency(this);
- if (event == GF_EVENT_CHILD_PING) {
/* Calculates the child latency and sets event
*/
- child_latency_msec = (int64_t)(uintptr_t)data2;
LOCK(&priv->lock);
{
__afr_handle_ping_event(this, child_xlator, idx,
@@ -5481,6 +6317,12 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
child_latency_msec);
}
UNLOCK(&priv->lock);
+ } else {
+ LOCK(&priv->lock);
+ {
+ priv->child_latency[idx] = child_latency_msec;
+ }
+ UNLOCK(&priv->lock);
}
}
@@ -5524,22 +6366,27 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
if (priv->thin_arbiter_count &&
(idx == AFR_CHILD_THIN_ARBITER)) {
priv->ta_child_up = 1;
+ priv->ta_event_gen++;
break;
}
__afr_handle_child_up_event(this, child_xlator, idx,
child_latency_msec, &event,
&call_psh, &up_child);
+ __afr_lock_heal_synctask(this, priv, idx);
break;
case GF_EVENT_CHILD_DOWN:
if (priv->thin_arbiter_count &&
(idx == AFR_CHILD_THIN_ARBITER)) {
priv->ta_child_up = 0;
+ priv->ta_event_gen++;
+ afr_ta_locked_priv_invalidate(priv);
break;
}
__afr_handle_child_down_event(this, child_xlator, idx,
child_latency_msec, &event,
&call_psh, &up_child);
+ __afr_mark_pending_lk_heal(this, priv, idx);
break;
case GF_EVENT_CHILD_CONNECTING:
@@ -5585,12 +6432,14 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
if (!had_quorum && has_quorum) {
gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_QUORUM_MET,
"Client-quorum is met");
- gf_event(EVENT_AFR_QUORUM_MET, "subvol=%s", this->name);
+ gf_event(EVENT_AFR_QUORUM_MET, "client-pid=%d; subvol=%s",
+ this->ctx->cmd_args.client_pid, this->name);
}
if (had_quorum && !has_quorum) {
gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_QUORUM_FAIL,
"Client-quorum is not met");
- gf_event(EVENT_AFR_QUORUM_FAIL, "subvol=%s", this->name);
+ gf_event(EVENT_AFR_QUORUM_FAIL, "client-pid=%d; subvol=%s",
+ this->ctx->cmd_args.client_pid, this->name);
}
}
@@ -5609,10 +6458,8 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
* b) Already heard from everyone, but we now got a child-up
* event.
*/
- if (have_heard_from_all && priv->shd.iamshd) {
- for (i = 0; i < priv->child_count; i++)
- if (priv->child_up[i])
- afr_selfheal_childup(this, i);
+ if (have_heard_from_all) {
+ afr_selfheal_childup(this, priv);
}
}
out:
@@ -5633,7 +6480,7 @@ afr_local_init(afr_local_t *local, afr_private_t *priv, int32_t *op_errno)
goto out;
}
- local->child_up = GF_CALLOC(priv->child_count, sizeof(*local->child_up),
+ local->child_up = GF_MALLOC(priv->child_count * sizeof(*local->child_up),
gf_afr_mt_char);
if (!local->child_up) {
if (op_errno)
@@ -5692,6 +6539,9 @@ afr_local_init(afr_local_t *local, afr_private_t *priv, int32_t *op_errno)
if (priv->thin_arbiter_count) {
local->ta_child_up = priv->ta_child_up;
local->ta_failed_subvol = AFR_CHILD_UNKNOWN;
+ local->read_txn_query_child = AFR_CHILD_UNKNOWN;
+ local->ta_event_gen = priv->ta_event_gen;
+ local->fop_state = TA_SUCCESS;
}
local->is_new_entry = _gf_false;
@@ -5763,6 +6613,10 @@ afr_transaction_local_init(afr_local_t *local, xlator_t *this)
afr_private_t *priv = NULL;
priv = this->private;
+ INIT_LIST_HEAD(&local->transaction.wait_list);
+ INIT_LIST_HEAD(&local->transaction.owner_list);
+ INIT_LIST_HEAD(&local->ta_waitq);
+ INIT_LIST_HEAD(&local->ta_onwireq);
ret = afr_internal_lock_init(&local->internal_lock, priv->child_count);
if (ret < 0)
goto out;
@@ -5800,10 +6654,6 @@ afr_transaction_local_init(afr_local_t *local, xlator_t *this)
goto out;
ret = 0;
- INIT_LIST_HEAD(&local->transaction.wait_list);
- INIT_LIST_HEAD(&local->transaction.owner_list);
- INIT_LIST_HEAD(&local->ta_waitq);
- INIT_LIST_HEAD(&local->ta_onwireq);
out:
return ret;
}
@@ -5822,6 +6672,8 @@ afr_priv_destroy(afr_private_t *priv)
if (!priv)
goto out;
+
+ GF_FREE(priv->sh_domain);
GF_FREE(priv->last_event);
child_count = priv->child_count;
@@ -5837,7 +6689,9 @@ afr_priv_destroy(afr_private_t *priv)
GF_FREE(priv->local);
GF_FREE(priv->pending_key);
GF_FREE(priv->children);
+ GF_FREE(priv->anon_inode);
GF_FREE(priv->child_up);
+ GF_FREE(priv->halo_child_up);
GF_FREE(priv->child_latency);
LOCK_DESTROY(&priv->lock);
@@ -5888,274 +6742,218 @@ out:
return changelog;
}
-gf_boolean_t
-afr_decide_heal_info(afr_private_t *priv, unsigned char *sources, int source)
+static dict_t *
+afr_set_heal_info(char *status)
{
- int sources_count = 0;
+ dict_t *dict = NULL;
+ int ret = -1;
- if (source < 0)
+ dict = dict_new();
+ if (!dict) {
+ ret = -ENOMEM;
goto out;
+ }
- sources_count = AFR_COUNT(sources, priv->child_count);
- if (sources_count == priv->child_count)
- return _gf_false;
+ ret = dict_set_dynstr_sizen(dict, "heal-info", status);
+ if (ret)
+ gf_msg("", GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Failed to set heal-info key to "
+ "%s",
+ status);
out:
- return _gf_true;
+ /* Any error other than EINVAL, dict_set_dynstr frees status */
+ if (ret == -ENOMEM || ret == -EINVAL) {
+ GF_FREE(status);
+ }
+
+ if (ret && dict) {
+ dict_unref(dict);
+ dict = NULL;
+ }
+ return dict;
}
-int
-afr_selfheal_locked_metadata_inspect(call_frame_t *frame, xlator_t *this,
- inode_t *inode, gf_boolean_t *msh,
- unsigned char *pending)
+static gf_boolean_t
+afr_is_dirty_count_non_unary_for_txn(xlator_t *this, struct afr_reply *replies,
+ afr_transaction_type type)
{
- int ret = -1;
- unsigned char *locked_on = NULL;
- unsigned char *sources = NULL;
- unsigned char *sinks = NULL;
- unsigned char *healed_sinks = NULL;
- unsigned char *undid_pending = NULL;
- struct afr_reply *locked_replies = NULL;
-
afr_private_t *priv = this->private;
+ int *dirty = alloca0(priv->child_count * sizeof(int));
+ int i = 0;
- locked_on = alloca0(priv->child_count);
- sources = alloca0(priv->child_count);
- sinks = alloca0(priv->child_count);
- healed_sinks = alloca0(priv->child_count);
- undid_pending = alloca0(priv->child_count);
-
- locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count);
-
- ret = afr_selfheal_inodelk(frame, this, inode, this->name, LLONG_MAX - 1, 0,
- locked_on);
- {
- if (ret == 0) {
- /* Not a single lock */
- ret = -afr_final_errno(frame->local, priv);
- if (ret == 0)
- ret = -ENOTCONN; /* all invalid responses */
- goto out;
- }
- ret = __afr_selfheal_metadata_prepare(
- frame, this, inode, locked_on, sources, sinks, healed_sinks,
- undid_pending, locked_replies, pending);
- *msh = afr_decide_heal_info(priv, sources, ret);
+ afr_selfheal_extract_xattr(this, replies, type, dirty, NULL);
+ for (i = 0; i < priv->child_count; i++) {
+ if (dirty[i] > 1)
+ return _gf_true;
}
- afr_selfheal_uninodelk(frame, this, inode, this->name, LLONG_MAX - 1, 0,
- locked_on);
-out:
- if (locked_replies)
- afr_replies_wipe(locked_replies, priv->child_count);
- return ret;
+
+ return _gf_false;
}
-int
-afr_selfheal_locked_data_inspect(call_frame_t *frame, xlator_t *this, fd_t *fd,
- gf_boolean_t *dsh, unsigned char *pflag)
+static gf_boolean_t
+afr_is_dirty_count_non_unary(xlator_t *this, struct afr_reply *replies,
+ ia_type_t ia_type)
{
- int ret = -1;
- unsigned char *data_lock = NULL;
- unsigned char *sources = NULL;
- unsigned char *sinks = NULL;
- unsigned char *healed_sinks = NULL;
- unsigned char *undid_pending = NULL;
- afr_private_t *priv = NULL;
- struct afr_reply *locked_replies = NULL;
- inode_t *inode = fd->inode;
+ gf_boolean_t data_chk = _gf_false;
+ gf_boolean_t mdata_chk = _gf_false;
+ gf_boolean_t entry_chk = _gf_false;
- priv = this->private;
- data_lock = alloca0(priv->child_count);
- sources = alloca0(priv->child_count);
- sinks = alloca0(priv->child_count);
- healed_sinks = alloca0(priv->child_count);
- undid_pending = alloca0(priv->child_count);
-
- locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count);
+ switch (ia_type) {
+ case IA_IFDIR:
+ mdata_chk = _gf_true;
+ entry_chk = _gf_true;
+ break;
+ case IA_IFREG:
+ mdata_chk = _gf_true;
+ data_chk = _gf_true;
+ break;
+ default:
+ /*IA_IFBLK, IA_IFCHR, IA_IFLNK, IA_IFIFO, IA_IFSOCK*/
+ mdata_chk = _gf_true;
+ break;
+ }
- ret = afr_selfheal_inodelk(frame, this, inode, this->name, 0, 0, data_lock);
- {
- if (ret == 0) {
- ret = -afr_final_errno(frame->local, priv);
- if (ret == 0)
- ret = -ENOTCONN; /* all invalid responses */
- goto out;
- }
- ret = __afr_selfheal_data_prepare(frame, this, inode, data_lock,
- sources, sinks, healed_sinks,
- undid_pending, locked_replies, pflag);
- *dsh = afr_decide_heal_info(priv, sources, ret);
+ if (data_chk && afr_is_dirty_count_non_unary_for_txn(
+ this, replies, AFR_DATA_TRANSACTION)) {
+ return _gf_true;
+ } else if (mdata_chk && afr_is_dirty_count_non_unary_for_txn(
+ this, replies, AFR_METADATA_TRANSACTION)) {
+ return _gf_true;
+ } else if (entry_chk && afr_is_dirty_count_non_unary_for_txn(
+ this, replies, AFR_ENTRY_TRANSACTION)) {
+ return _gf_true;
}
- afr_selfheal_uninodelk(frame, this, inode, this->name, 0, 0, data_lock);
-out:
- if (locked_replies)
- afr_replies_wipe(locked_replies, priv->child_count);
- return ret;
+
+ return _gf_false;
}
-int
-afr_selfheal_locked_entry_inspect(call_frame_t *frame, xlator_t *this,
- inode_t *inode, gf_boolean_t *esh,
- unsigned char *pflag)
+static int
+afr_update_heal_status(xlator_t *this, struct afr_reply *replies,
+ ia_type_t ia_type, gf_boolean_t *esh, gf_boolean_t *dsh,
+ gf_boolean_t *msh, unsigned char pending)
{
int ret = -1;
- int source = -1;
+ GF_UNUSED int ret1 = 0;
+ int i = 0;
+ int io_domain_lk_count = 0;
+ int shd_domain_lk_count = 0;
afr_private_t *priv = NULL;
- unsigned char *locked_on = NULL;
- unsigned char *data_lock = NULL;
- unsigned char *sources = NULL;
- unsigned char *sinks = NULL;
- unsigned char *healed_sinks = NULL;
- struct afr_reply *locked_replies = NULL;
- gf_boolean_t granular_locks = _gf_false;
+ char *key1 = NULL;
+ char *key2 = NULL;
priv = this->private;
- granular_locks = priv->granular_locks; /*Assign to local variable so that
- reconfigure doesn't change this
- value between locking and unlocking
- below*/
- locked_on = alloca0(priv->child_count);
- data_lock = alloca0(priv->child_count);
- sources = alloca0(priv->child_count);
- sinks = alloca0(priv->child_count);
- healed_sinks = alloca0(priv->child_count);
+ key1 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 +
+ strlen(this->name));
+ key2 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 +
+ strlen(priv->sh_domain));
+ sprintf(key1, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, this->name);
+ sprintf(key2, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, priv->sh_domain);
- locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count);
-
- if (!granular_locks) {
- ret = afr_selfheal_tryentrylk(frame, this, inode, priv->sh_domain, NULL,
- locked_on);
- }
- {
- if (!granular_locks && ret == 0) {
- ret = -afr_final_errno(frame->local, priv);
- if (ret == 0)
- ret = -ENOTCONN; /* all invalid responses */
- goto out;
+ for (i = 0; i < priv->child_count; i++) {
+ if ((replies[i].valid != 1) || (replies[i].op_ret != 0))
+ continue;
+ if (!io_domain_lk_count) {
+ ret1 = dict_get_int32(replies[i].xdata, key1, &io_domain_lk_count);
+ }
+ if (!shd_domain_lk_count) {
+ ret1 = dict_get_int32(replies[i].xdata, key2, &shd_domain_lk_count);
}
+ }
- ret = afr_selfheal_entrylk(frame, this, inode, this->name, NULL,
- data_lock);
- {
- if (ret == 0) {
- ret = -afr_final_errno(frame->local, priv);
- if (ret == 0)
- ret = -ENOTCONN;
- /* all invalid responses */
- goto unlock;
- }
- ret = __afr_selfheal_entry_prepare(frame, this, inode, data_lock,
- sources, sinks, healed_sinks,
- locked_replies, &source, pflag);
- if ((ret == 0) && (*pflag & PFLAG_SBRAIN))
- ret = -EIO;
- *esh = afr_decide_heal_info(priv, sources, ret);
+ if (!pending) {
+ if ((afr_is_dirty_count_non_unary(this, replies, ia_type)) ||
+ (!io_domain_lk_count)) {
+ /* Needs heal. */
+ ret = 0;
+ } else {
+ /* No heal needed. */
+ *dsh = *esh = *msh = 0;
+ }
+ } else {
+ if (shd_domain_lk_count) {
+ ret = -EAGAIN; /*For 'possibly-healing'. */
+ } else {
+ ret = 0; /*needs heal. Just set a non -ve value so that it is
+ assumed as the source index.*/
}
- afr_selfheal_unentrylk(frame, this, inode, this->name, NULL, data_lock,
- NULL);
}
-unlock:
- if (!granular_locks)
- afr_selfheal_unentrylk(frame, this, inode, priv->sh_domain, NULL,
- locked_on, NULL);
-out:
- if (locked_replies)
- afr_replies_wipe(locked_replies, priv->child_count);
return ret;
}
+/*return EIO, EAGAIN or pending*/
int
-afr_selfheal_locked_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid,
- inode_t **inode, gf_boolean_t *entry_selfheal,
- gf_boolean_t *data_selfheal,
- gf_boolean_t *metadata_selfheal,
- unsigned char *pending)
-
+afr_lockless_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid,
+ inode_t **inode, gf_boolean_t *entry_selfheal,
+ gf_boolean_t *data_selfheal,
+ gf_boolean_t *metadata_selfheal, unsigned char *pending)
{
int ret = -1;
- fd_t *fd = NULL;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ struct afr_reply *replies = NULL;
gf_boolean_t dsh = _gf_false;
gf_boolean_t msh = _gf_false;
gf_boolean_t esh = _gf_false;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *valid_on = NULL;
+ uint64_t *witness = NULL;
+
+ priv = this->private;
+ replies = alloca0(sizeof(*replies) * priv->child_count);
+ sources = alloca0(sizeof(*sources) * priv->child_count);
+ sinks = alloca0(sizeof(*sinks) * priv->child_count);
+ witness = alloca0(sizeof(*witness) * priv->child_count);
+ valid_on = alloca0(sizeof(*valid_on) * priv->child_count);
ret = afr_selfheal_unlocked_inspect(frame, this, gfid, inode, &dsh, &msh,
- &esh);
+ &esh, replies);
if (ret)
goto out;
-
- /* For every heal type hold locks and check if it indeed needs heal */
-
- /* Heal-info does an open() on the file being examined so that the
- * current eager-lock holding client, if present, at some point sees
- * open-fd count being > 1 and releases the eager-lock so that heal-info
- * doesn't remain blocked forever until IO completes.
- */
- if ((*inode)->ia_type == IA_IFREG) {
- ret = afr_selfheal_data_open(this, *inode, &fd);
- if (ret < 0) {
- gf_msg_debug(this->name, -ret, "%s: Failed to open",
- uuid_utoa((*inode)->gfid));
- goto out;
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].valid && replies[i].op_ret == 0) {
+ valid_on[i] = 1;
}
}
-
if (msh) {
- ret = afr_selfheal_locked_metadata_inspect(frame, this, *inode, &msh,
- pending);
- if (ret == -EIO)
+ ret = afr_selfheal_find_direction(frame, this, replies,
+ AFR_METADATA_TRANSACTION, valid_on,
+ sources, sinks, witness, pending);
+ if (*pending & PFLAG_SBRAIN)
+ ret = -EIO;
+ if (ret)
goto out;
}
-
if (dsh) {
- ret = afr_selfheal_locked_data_inspect(frame, this, fd, &dsh, pending);
- if (ret == -EIO || (ret == -EAGAIN))
+ ret = afr_selfheal_find_direction(frame, this, replies,
+ AFR_DATA_TRANSACTION, valid_on,
+ sources, sinks, witness, pending);
+ if (*pending & PFLAG_SBRAIN)
+ ret = -EIO;
+ if (ret)
goto out;
}
-
if (esh) {
- ret = afr_selfheal_locked_entry_inspect(frame, this, *inode, &esh,
- pending);
+ ret = afr_selfheal_find_direction(frame, this, replies,
+ AFR_ENTRY_TRANSACTION, valid_on,
+ sources, sinks, witness, pending);
+ if (*pending & PFLAG_SBRAIN)
+ ret = -EIO;
+ if (ret)
+ goto out;
}
+ ret = afr_update_heal_status(this, replies, (*inode)->ia_type, &esh, &dsh,
+ &msh, *pending);
out:
*data_selfheal = dsh;
*entry_selfheal = esh;
*metadata_selfheal = msh;
- if (fd)
- fd_unref(fd);
+ if (replies)
+ afr_replies_wipe(replies, priv->child_count);
return ret;
}
-static dict_t *
-afr_set_heal_info(char *status)
-{
- dict_t *dict = NULL;
- int ret = -1;
-
- dict = dict_new();
- if (!dict) {
- ret = -ENOMEM;
- goto out;
- }
-
- ret = dict_set_dynstr_sizen(dict, "heal-info", status);
- if (ret)
- gf_msg("", GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED,
- "Failed to set heal-info key to "
- "%s",
- status);
-out:
- /* Any error other than EINVAL, dict_set_dynstr frees status */
- if (ret == -ENOMEM || ret == -EINVAL) {
- GF_FREE(status);
- }
-
- if (ret && dict) {
- dict_unref(dict);
- dict = NULL;
- }
- return dict;
-}
-
int
afr_get_heal_info(call_frame_t *frame, xlator_t *this, loc_t *loc)
{
@@ -6169,10 +6967,21 @@ afr_get_heal_info(call_frame_t *frame, xlator_t *this, loc_t *loc)
inode_t *inode = NULL;
char *substr = NULL;
char *status = NULL;
+ call_frame_t *heal_frame = NULL;
+ afr_local_t *heal_local = NULL;
+
+ /*Use frame with lk-owner set*/
+ heal_frame = afr_frame_create(frame->this, &op_errno);
+ if (!heal_frame) {
+ ret = -1;
+ goto out;
+ }
+ heal_local = heal_frame->local;
+ heal_frame->local = frame->local;
- ret = afr_selfheal_locked_inspect(frame, this, loc->gfid, &inode,
- &entry_selfheal, &data_selfheal,
- &metadata_selfheal, &pending);
+ ret = afr_lockless_inspect(heal_frame, this, loc->gfid, &inode,
+ &entry_selfheal, &data_selfheal,
+ &metadata_selfheal, &pending);
if (ret == -ENOMEM) {
ret = -1;
@@ -6257,6 +7066,10 @@ afr_get_heal_info(call_frame_t *frame, xlator_t *this, loc_t *loc)
op_errno = 0;
out:
+ if (heal_frame) {
+ heal_frame->local = heal_local;
+ AFR_STACK_DESTROY(heal_frame);
+ }
AFR_STACK_UNWIND(getxattr, frame, ret, op_errno, dict, NULL);
if (dict)
dict_unref(dict);
@@ -6453,6 +7266,8 @@ afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc)
int op_errno = 0;
dict_t *dict = NULL;
afr_local_t *local = NULL;
+ afr_local_t *heal_local = NULL;
+ call_frame_t *heal_frame = NULL;
local = frame->local;
dict = dict_new();
@@ -6462,7 +7277,16 @@ afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc)
goto out;
}
- ret = afr_selfheal_do(frame, this, loc->gfid);
+ heal_frame = afr_frame_create(this, &op_errno);
+ if (!heal_frame) {
+ ret = -1;
+ goto out;
+ }
+ heal_local = heal_frame->local;
+ heal_frame->local = frame->local;
+ /*Initiate heal with heal_frame with lk-owner set so that inodelk/entrylk
+ * work correctly*/
+ ret = afr_selfheal_do(heal_frame, this, loc->gfid);
if (ret == 1 || ret == 2) {
ret = dict_set_sizen_str_sizen(dict, "sh-fail-msg",
@@ -6484,6 +7308,10 @@ afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc)
}
out:
+ if (heal_frame) {
+ heal_frame->local = heal_local;
+ AFR_STACK_DESTROY(heal_frame);
+ }
if (local->op == GF_FOP_GETXATTR)
AFR_STACK_UNWIND(getxattr, frame, ret, op_errno, dict, NULL);
else if (local->op == GF_FOP_SETXATTR)
@@ -6626,7 +7454,7 @@ afr_fav_child_reset_sink_xattrs(void *opaque)
ret = afr_selfheal_inodelk(heal_frame, this, inode, this->name, 0, 0,
locked_on);
{
- if (ret < AFR_SH_MIN_PARTICIPANTS)
+ if (ret < priv->child_count)
goto data_unlock;
ret = __afr_selfheal_data_prepare(
heal_frame, this, inode, locked_on, sources, sinks,
@@ -6643,7 +7471,7 @@ afr_fav_child_reset_sink_xattrs(void *opaque)
ret = afr_selfheal_inodelk(heal_frame, this, inode, this->name,
LLONG_MAX - 1, 0, locked_on);
{
- if (ret < AFR_SH_MIN_PARTICIPANTS)
+ if (ret < priv->child_count)
goto mdata_unlock;
ret = __afr_selfheal_metadata_prepare(
heal_frame, this, inode, locked_on, sources, sinks,
@@ -6975,16 +7803,16 @@ afr_ta_has_quorum(afr_private_t *priv, afr_local_t *local)
return _gf_false;
}
-gf_boolean_t
+static gf_boolean_t
afr_is_add_replica_mount_lookup_on_root(call_frame_t *frame)
{
afr_local_t *local = NULL;
- local = frame->local;
-
if (frame->root->pid != GF_CLIENT_PID_ADD_REPLICA_MOUNT)
return _gf_false;
+ local = frame->local;
+
if (local->op != GF_FOP_LOOKUP)
/* TODO:If the replica count is being increased on a plain distribute
* volume that was never mounted, we need to allow setxattr on '/' with
@@ -7001,14 +7829,49 @@ afr_is_add_replica_mount_lookup_on_root(call_frame_t *frame)
}
gf_boolean_t
-afr_lookup_has_quorum(call_frame_t *frame, xlator_t *this,
- unsigned char *subvols)
+afr_lookup_has_quorum(call_frame_t *frame, const unsigned int up_children_count)
{
+ if (frame && (up_children_count > 0) &&
+ afr_is_add_replica_mount_lookup_on_root(frame))
+ return _gf_true;
+
+ return _gf_false;
+}
+
+void
+afr_handle_replies_quorum(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = frame->local;
afr_private_t *priv = this->private;
+ unsigned char *success_replies = NULL;
- if (frame && afr_is_add_replica_mount_lookup_on_root(frame)) {
- if (AFR_COUNT(subvols, priv->child_count) > 0)
- return _gf_true;
+ success_replies = alloca0(priv->child_count);
+ afr_fill_success_replies(local, priv, success_replies);
+
+ if (priv->quorum_count && !afr_has_quorum(success_replies, this, NULL)) {
+ local->op_errno = afr_final_errno(local, priv);
+ if (!local->op_errno)
+ local->op_errno = afr_quorum_errno(priv);
+ local->op_ret = -1;
+ }
+}
+
+gf_boolean_t
+afr_ta_dict_contains_pending_xattr(dict_t *dict, afr_private_t *priv, int child)
+{
+ int *pending = NULL;
+ int ret = 0;
+ int i = 0;
+
+ ret = dict_get_ptr(dict, priv->pending_key[child], (void *)&pending);
+ if (ret == 0) {
+ for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
+ /* Not doing a ntoh32(pending) as we just want to check
+ * if it is non-zero or not. */
+ if (pending[i]) {
+ return _gf_true;
+ }
+ }
}
return _gf_false;
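
The afr-common.c hunks above replace the locked heal-info inspection with a lockless one whose verdict is computed in afr_update_heal_status() from pending xattrs, dirty counters and inodelk counts. The following is a minimal stand-alone model of just that decision, with the dict/xattr plumbing stripped out and the return values simplified to 0/1/2; it is an illustration of the branch structure, not the xlator code itself.

#include <stdio.h>

/* Toy model of the lockless heal-info verdict.
 * Returns 1 for "needs heal", 2 for "possibly healing", 0 for "no heal". */
static int
heal_info_decision(int has_pending, int dirty_non_unary, int io_lk_count,
                   int shd_lk_count)
{
    if (!has_pending) {
        if (dirty_non_unary || !io_lk_count)
            return 1; /* stale dirty left behind, or no writer holds the lock */
        return 0;     /* transient dirty under an in-flight write: no heal */
    }
    if (shd_lk_count)
        return 2; /* self-heal daemon holds its domain lock: possibly healing */
    return 1;     /* pending xattrs and nobody healing: needs heal */
}

int
main(void)
{
    printf("%d\n", heal_info_decision(0, 0, 1, 0)); /* 0: no heal needed */
    printf("%d\n", heal_info_decision(1, 0, 0, 1)); /* 2: possibly healing */
    printf("%d\n", heal_info_decision(1, 1, 0, 0)); /* 1: needs heal */
    return 0;
}
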
diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c
index 6307b637f8d..f8bf8340dab 100644
--- a/xlators/cluster/afr/src/afr-dir-read.c
+++ b/xlators/cluster/afr/src/afr-dir-read.c
@@ -10,7 +10,6 @@
#include <libgen.h>
#include <unistd.h>
-#include <fnmatch.h>
#include <sys/time.h>
#include <stdlib.h>
#include <signal.h>
@@ -18,16 +17,10 @@
#include <glusterfs/glusterfs.h>
#include <glusterfs/dict.h>
-#include <glusterfs/xlator.h>
-#include <glusterfs/hashfn.h>
-#include <glusterfs/logging.h>
#include <glusterfs/list.h>
-#include <glusterfs/call-stub.h>
-#include <glusterfs/defaults.h>
#include <glusterfs/common-utils.h>
#include <glusterfs/compat-errno.h>
#include <glusterfs/compat.h>
-#include <glusterfs/checksum.h>
#include "afr.h"
#include "afr-transaction.h"
@@ -45,6 +38,10 @@ afr_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
fd_ctx = local->fd_ctx;
child_index = (long)cookie;
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+
LOCK(&frame->lock);
{
if (op_ret == -1) {
@@ -56,19 +53,22 @@ afr_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
if (!local->xdata_rsp && xdata)
local->xdata_rsp = dict_ref(xdata);
}
+ call_count = --local->call_count;
}
UNLOCK(&frame->lock);
- call_count = afr_frame_return(frame);
-
- if (call_count == 0)
+ if (call_count == 0) {
+ afr_handle_replies_quorum(frame, this);
AFR_STACK_UNWIND(opendir, frame, local->op_ret, local->op_errno,
local->fd, NULL);
+ }
+
return 0;
}
int
-afr_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd)
+afr_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+ dict_t *xdata)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
@@ -84,6 +84,12 @@ afr_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd)
goto out;
local->op = GF_FOP_OPENDIR;
+
+ if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) {
+ op_errno = afr_quorum_errno(priv);
+ goto out;
+ }
+
if (!afr_is_consistent_io_possible(local, priv, &op_errno))
goto out;
@@ -158,8 +164,8 @@ afr_validate_read_subvol(inode_t *inode, xlator_t *this, int par_read_subvol)
}
static void
-afr_readdir_transform_entries(gf_dirent_t *subvol_entries, int subvol,
- gf_dirent_t *entries, fd_t *fd)
+afr_readdir_transform_entries(call_frame_t *frame, gf_dirent_t *subvol_entries,
+ int subvol, gf_dirent_t *entries, fd_t *fd)
{
int ret = -1;
gf_dirent_t *entry = NULL;
@@ -177,8 +183,8 @@ afr_readdir_transform_entries(gf_dirent_t *subvol_entries, int subvol,
list_for_each_entry_safe(entry, tmp, &subvol_entries->list, list)
{
- if (__is_root_gfid(fd->inode->gfid) &&
- !strcmp(entry->d_name, GF_REPLICATE_TRASH_DIR)) {
+ if (afr_is_private_directory(priv, fd->inode->gfid, entry->d_name,
+ frame->root->pid)) {
continue;
}
@@ -222,8 +228,8 @@ afr_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
}
if (op_ret >= 0)
- afr_readdir_transform_entries(subvol_entries, (long)cookie, &entries,
- local->fd);
+ afr_readdir_transform_entries(frame, subvol_entries, (long)cookie,
+ &entries, local->fd);
AFR_STACK_UNWIND(readdir, frame, op_ret, op_errno, &entries, xdata);
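
afr_opendir_cbk() above now records every reply and, once the last callback arrives, runs afr_handle_replies_quorum() before unwinding. A self-contained sketch of that accounting is shown below; the reply array and quorum count stand in for local->replies and priv->quorum_count, and the real afr_has_quorum() additionally handles "auto" quorum and arbiter weighting, which this toy version does not.

#include <stdio.h>

struct reply {
    int valid;
    int op_ret;
    int op_errno;
};

/* Returns 0 if enough children succeeded, otherwise the negative errno to
 * report. A 'quorum' of 0 means quorum enforcement is disabled. */
static int
replies_quorum_check(struct reply *replies, int child_count, int quorum)
{
    int success = 0;
    int last_errno = 0;
    int i;

    for (i = 0; i < child_count; i++) {
        if (replies[i].valid && replies[i].op_ret == 0)
            success++;
        else if (replies[i].valid)
            last_errno = replies[i].op_errno;
    }

    if (!quorum || success >= quorum)
        return 0;
    return last_errno ? -last_errno : -107; /* fall back to -ENOTCONN */
}

int
main(void)
{
    struct reply r[3] = {{1, 0, 0}, {1, -1, 5 /*EIO*/}, {1, -1, 5}};

    printf("%d\n", replies_quorum_check(r, 3, 2)); /* -5: quorum lost */
    printf("%d\n", replies_quorum_check(r, 3, 1)); /*  0: quorum met  */
    return 0;
}
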
diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c
index 84e2a344624..b7cceb79158 100644
--- a/xlators/cluster/afr/src/afr-dir-write.c
+++ b/xlators/cluster/afr/src/afr-dir-write.c
@@ -10,7 +10,6 @@
#include <libgen.h>
#include <unistd.h>
-#include <fnmatch.h>
#include <sys/time.h>
#include <stdlib.h>
#include <signal.h>
@@ -18,11 +17,8 @@
#include <glusterfs/glusterfs.h>
#include "afr.h"
#include <glusterfs/dict.h>
-#include <glusterfs/xlator.h>
-#include <glusterfs/hashfn.h>
#include <glusterfs/logging.h>
#include <glusterfs/list.h>
-#include <glusterfs/call-stub.h>
#include <glusterfs/defaults.h>
#include <glusterfs/common-utils.h>
#include <glusterfs/compat-errno.h>
@@ -123,11 +119,11 @@ __afr_dir_write_finalize(call_frame_t *frame, xlator_t *this)
continue;
if (local->replies[i].op_ret < 0) {
if (local->inode)
- afr_inode_event_gen_reset(local->inode, this);
+ afr_inode_need_refresh_set(local->inode, this);
if (local->parent)
- afr_inode_event_gen_reset(local->parent, this);
+ afr_inode_need_refresh_set(local->parent, this);
if (local->parent2)
- afr_inode_event_gen_reset(local->parent2, this);
+ afr_inode_need_refresh_set(local->parent2, this);
continue;
}
@@ -233,9 +229,9 @@ __afr_dir_write_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
__afr_dir_write_fill(frame, this, child_index, op_ret, op_errno, buf,
preparent, postparent, preparent2, postparent2,
xdata);
+ call_count = --local->call_count;
}
UNLOCK(&frame->lock);
- call_count = afr_frame_return(frame);
if (call_count == 0) {
__afr_dir_write_finalize(frame, this);
@@ -349,6 +345,7 @@ afr_mark_entry_pending_changelog(call_frame_t *frame, xlator_t *this)
afr_private_t *priv = NULL;
int pre_op_count = 0;
int failed_count = 0;
+ unsigned char *success_replies = NULL;
local = frame->local;
priv = this->private;
@@ -364,9 +361,16 @@ afr_mark_entry_pending_changelog(call_frame_t *frame, xlator_t *this)
failed_count = AFR_COUNT(local->transaction.failed_subvols,
priv->child_count);
+ /* FOP succeeded on all bricks. */
if (pre_op_count == priv->child_count && !failed_count)
return;
+ /* FOP did not succeed on a quorum of bricks. */
+ success_replies = alloca0(priv->child_count);
+ afr_fill_success_replies(local, priv, success_replies);
+ if (!afr_has_quorum(success_replies, this, NULL))
+ return;
+
if (priv->thin_arbiter_count) {
/*Mark new entry using ta file*/
local->is_new_entry = _gf_true;
diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c
index 523a5b48880..c5521704de2 100644
--- a/xlators/cluster/afr/src/afr-inode-read.c
+++ b/xlators/cluster/afr/src/afr-inode-read.c
@@ -18,11 +18,8 @@
#include <glusterfs/glusterfs.h>
#include "afr.h"
#include <glusterfs/dict.h>
-#include <glusterfs/xlator.h>
-#include <glusterfs/hashfn.h>
#include <glusterfs/logging.h>
#include <glusterfs/list.h>
-#include <glusterfs/call-stub.h>
#include <glusterfs/byte-order.h>
#include <glusterfs/defaults.h>
#include <glusterfs/common-utils.h>
@@ -305,6 +302,7 @@ afr_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
afr_local_t *local = NULL;
int op_errno = 0;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
local = AFR_FRAME_INIT(frame, op_errno);
if (!local)
goto out;
@@ -948,24 +946,13 @@ unlock:
goto unwind;
}
- len = dict_serialized_length(local->dict);
- if (len <= 0) {
- goto unwind;
- }
-
- lockinfo_buf = GF_CALLOC(1, len, gf_common_mt_char);
- if (!lockinfo_buf) {
+ op_ret = dict_allocate_and_serialize(
+ local->dict, (char **)&lockinfo_buf, (unsigned int *)&len);
+ if (op_ret != 0) {
local->op_ret = -1;
- local->op_errno = ENOMEM;
goto unwind;
}
- op_ret = dict_serialize(local->dict, lockinfo_buf);
- if (op_ret < 0) {
- local->op_ret = -1;
- local->op_errno = -op_ret;
- }
-
op_ret = dict_set_dynptr(newdict, GF_XATTR_LOCKINFO_KEY,
(void *)lockinfo_buf, len);
if (op_ret < 0) {
@@ -1064,24 +1051,13 @@ unlock:
goto unwind;
}
- len = dict_serialized_length(local->dict);
- if (len <= 0) {
- goto unwind;
- }
-
- lockinfo_buf = GF_CALLOC(1, len, gf_common_mt_char);
- if (!lockinfo_buf) {
+ op_ret = dict_allocate_and_serialize(
+ local->dict, (char **)&lockinfo_buf, (unsigned int *)&len);
+ if (op_ret != 0) {
local->op_ret = -1;
- local->op_errno = ENOMEM;
goto unwind;
}
- op_ret = dict_serialize(local->dict, lockinfo_buf);
- if (op_ret < 0) {
- local->op_ret = -1;
- local->op_errno = -op_ret;
- }
-
op_ret = dict_set_dynptr(newdict, GF_XATTR_LOCKINFO_KEY,
(void *)lockinfo_buf, len);
if (op_ret < 0) {
@@ -1723,6 +1699,7 @@ afr_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
int32_t op_errno = 0;
fop_fgetxattr_cbk_t cbk = NULL;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
local = AFR_FRAME_INIT(frame, op_errno);
if (!local)
goto out;
@@ -1816,6 +1793,7 @@ afr_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
afr_local_t *local = NULL;
int32_t op_errno = 0;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
local = AFR_FRAME_INIT(frame, op_errno);
if (!local)
goto out;
@@ -1891,6 +1869,7 @@ afr_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
afr_local_t *local = NULL;
int32_t op_errno = 0;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
local = AFR_FRAME_INIT(frame, op_errno);
if (!local)
goto out;
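
The lockinfo hunks above fold the dict_serialized_length()/GF_CALLOC()/dict_serialize() sequence into a single dict_allocate_and_serialize() call. A hedged sketch of the resulting call pattern follows; it assumes the libglusterfs dict API and the GF_XATTR_LOCKINFO_KEY macro used in the hunk, it is not stand-alone runnable without libglusterfs, and the exact header set may differ.

#include <glusterfs/glusterfs.h>
#include <glusterfs/dict.h>
#include <glusterfs/mem-pool.h>

/* 'lockinfo' is assumed to be a populated dict_t. On success the helper
 * allocates 'buf' and fills 'len'; dict_set_dynptr() then takes ownership
 * of the buffer, so it is only freed here on the failure path. */
static int
pack_lockinfo(dict_t *lockinfo, dict_t *newdict)
{
    char *buf = NULL;
    unsigned int len = 0;
    int ret;

    ret = dict_allocate_and_serialize(lockinfo, &buf, &len);
    if (ret != 0)
        return -1;

    ret = dict_set_dynptr(newdict, GF_XATTR_LOCKINFO_KEY, buf, len);
    if (ret < 0) {
        GF_FREE(buf);
        return -1;
    }
    return 0;
}
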
diff --git a/xlators/cluster/afr/src/afr-inode-read.h b/xlators/cluster/afr/src/afr-inode-read.h
index 1627ee2c426..8c982bc7e6f 100644
--- a/xlators/cluster/afr/src/afr-inode-read.h
+++ b/xlators/cluster/afr/src/afr-inode-read.h
@@ -38,5 +38,8 @@ afr_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
dict_t *xdata);
int
+afr_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ gf_seek_what_t what, dict_t *xdata);
+int
afr_handle_quota_size(call_frame_t *frame, xlator_t *this);
#endif /* __INODE_READ_H__ */
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
index 7fcc9d48ada..1d6e4f3570a 100644
--- a/xlators/cluster/afr/src/afr-inode-write.c
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -8,9 +8,7 @@
cases as published by the Free Software Foundation.
*/
-#include <libgen.h>
#include <unistd.h>
-#include <fnmatch.h>
#include <sys/time.h>
#include <stdlib.h>
#include <signal.h>
@@ -18,11 +16,7 @@
#include <glusterfs/glusterfs.h>
#include "afr.h"
#include <glusterfs/dict.h>
-#include <glusterfs/xlator.h>
-#include <glusterfs/hashfn.h>
#include <glusterfs/logging.h>
-#include <glusterfs/list.h>
-#include <glusterfs/call-stub.h>
#include <glusterfs/defaults.h>
#include <glusterfs/common-utils.h>
#include <glusterfs/compat-errno.h>
@@ -180,11 +174,10 @@ __afr_inode_write_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
{
__afr_inode_write_fill(frame, this, child_index, op_ret, op_errno,
prebuf, postbuf, xattr, xdata);
+ call_count = --local->call_count;
}
UNLOCK(&frame->lock);
- call_count = afr_frame_return(frame);
-
if (call_count == 0) {
__afr_inode_write_finalize(frame, this);
@@ -498,6 +491,7 @@ afr_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
int op_errno = ENOMEM;
int ret = -1;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
local = AFR_FRAME_INIT(frame, op_errno);
if (!local)
goto out;
@@ -737,6 +731,7 @@ afr_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
int ret = -1;
int op_errno = ENOMEM;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
transaction_frame = copy_frame(frame);
if (!transaction_frame)
goto out;
@@ -947,6 +942,7 @@ afr_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *buf,
int ret = -1;
int op_errno = ENOMEM;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
transaction_frame = copy_frame(frame);
if (!transaction_frame)
goto out;
@@ -1063,11 +1059,10 @@ afr_emptyb_set_pending_changelog_cbk(call_frame_t *frame, void *cookie,
if (ret)
goto out;
- gf_msg(this->name, op_ret ? GF_LOG_ERROR : GF_LOG_INFO,
- op_ret ? op_errno : 0, afr_get_msg_id(op_type),
- "Set of pending xattr %s on"
- " %s.",
- op_ret ? "failed" : "succeeded", priv->children[i]->name);
+ gf_smsg(this->name, op_ret ? GF_LOG_ERROR : GF_LOG_INFO,
+ op_ret ? op_errno : 0, AFR_MSG_SET_PEND_XATTR, "name=%s",
+ priv->children[i]->name, "op_ret=%s",
+ op_ret ? "failed" : "succeeded", NULL);
out:
syncbarrier_wake(&local->barrier);
@@ -1161,9 +1156,8 @@ _afr_handle_empty_brick_type(xlator_t *this, call_frame_t *frame, loc_t *loc,
}
if (!count) {
- gf_msg(this->name, GF_LOG_ERROR, EAGAIN, AFR_MSG_REPLACE_BRICK_STATUS,
- "Couldn't acquire lock on"
- " any child.");
+ gf_smsg(this->name, GF_LOG_ERROR, EAGAIN, AFR_MSG_REPLACE_BRICK_STATUS,
+ NULL);
ret = -EAGAIN;
goto unlock;
}
@@ -1214,6 +1208,7 @@ _afr_handle_empty_brick(void *opaque)
char *op_type = NULL;
int op_type_len = 0;
afr_empty_brick_args_t *data = NULL;
+ call_frame_t *op_frame = NULL;
data = opaque;
frame = data->frame;
@@ -1221,21 +1216,29 @@ _afr_handle_empty_brick(void *opaque)
if (!data->op_type)
goto out;
+ op_frame = copy_frame(frame);
+ if (!op_frame) {
+ ret = -1;
+ op_errno = ENOMEM;
+ goto out;
+ }
+
op_type = data->op_type;
op_type_len = strlen(op_type);
- this = frame->this;
+ this = op_frame->this;
priv = this->private;
- local = AFR_FRAME_INIT(frame, op_errno);
+ afr_set_lk_owner(op_frame, this, op_frame->root);
+ local = AFR_FRAME_INIT(op_frame, op_errno);
if (!local)
goto out;
loc_copy(&local->loc, &data->loc);
- gf_msg(this->name, GF_LOG_INFO, 0, 0, "New brick is : %s",
- priv->children[empty_index]->name);
+ gf_smsg(this->name, GF_LOG_INFO, 0, AFR_MSG_NEW_BRICK, "name=%s",
+ priv->children[empty_index]->name, NULL);
- ret = _afr_handle_empty_brick_type(this, frame, &local->loc, empty_index,
+ ret = _afr_handle_empty_brick_type(this, op_frame, &local->loc, empty_index,
AFR_METADATA_TRANSACTION, op_type,
op_type_len);
if (ret) {
@@ -1251,7 +1254,7 @@ _afr_handle_empty_brick(void *opaque)
local->xattr_req = NULL;
local->xdata_req = NULL;
- ret = _afr_handle_empty_brick_type(this, frame, &local->loc, empty_index,
+ ret = _afr_handle_empty_brick_type(this, op_frame, &local->loc, empty_index,
AFR_ENTRY_TRANSACTION, op_type,
op_type_len);
if (ret) {
@@ -1261,6 +1264,9 @@ _afr_handle_empty_brick(void *opaque)
}
ret = 0;
out:
+ if (op_frame) {
+ AFR_STACK_DESTROY(op_frame);
+ }
AFR_STACK_UNWIND(setxattr, frame, ret, op_errno, NULL);
return 0;
}
@@ -1305,9 +1311,8 @@ afr_split_brain_resolve_do(call_frame_t *frame, xlator_t *this, loc_t *loc,
*/
ret = afr_inode_split_brain_choice_set(loc->inode, this, -1);
if (ret)
- gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR,
- "Failed to set"
- "split-brain choice to -1");
+ gf_smsg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SPLIT_BRAIN_SET_FAILED,
+ NULL);
afr_heal_splitbrain_file(frame, this, loc);
ret = 0;
out:
@@ -1330,8 +1335,8 @@ afr_get_split_brain_child_index(xlator_t *this, void *value, size_t len)
spb_child_index = afr_get_child_index_from_name(this, spb_child_str);
if (spb_child_index < 0) {
- gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
- "Invalid subvol: %s", spb_child_str);
+ gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
+ "subvol=%s", spb_child_str, NULL);
}
return spb_child_index;
}
@@ -1353,11 +1358,9 @@ afr_can_set_split_brain_choice(void *opaque)
&data->m_spb);
if (ret)
- gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR,
- "Failed to determine if %s"
- " is in split-brain. "
- "Aborting split-brain-choice set.",
- uuid_utoa(loc->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, 0,
+ AFR_MSG_SPLIT_BRAIN_DETERMINE_FAILED, "gfid=%s",
+ uuid_utoa(loc->gfid), NULL);
return ret;
}
@@ -1365,7 +1368,8 @@ int
afr_handle_split_brain_commands(xlator_t *this, call_frame_t *frame, loc_t *loc,
dict_t *dict)
{
- void *value = NULL;
+ void *choice_value = NULL;
+ void *resolve_value = NULL;
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
afr_spbc_timeout_t *data = NULL;
@@ -1376,6 +1380,14 @@ afr_handle_split_brain_commands(xlator_t *this, call_frame_t *frame, loc_t *loc,
priv = this->private;
+ ret = dict_get_ptr_and_len(dict, GF_AFR_SBRAIN_CHOICE, &choice_value, &len);
+ ret = dict_get_ptr_and_len(dict, GF_AFR_SBRAIN_RESOLVE, &resolve_value,
+ &len);
+ if (!choice_value && !resolve_value) {
+ ret = -1;
+ goto out;
+ }
+
local = AFR_FRAME_INIT(frame, op_errno);
if (!local) {
ret = 1;
@@ -1384,9 +1396,9 @@ afr_handle_split_brain_commands(xlator_t *this, call_frame_t *frame, loc_t *loc,
local->op = GF_FOP_SETXATTR;
- ret = dict_get_ptr_and_len(dict, GF_AFR_SBRAIN_CHOICE, &value, &len);
- if (value) {
- spb_child_index = afr_get_split_brain_child_index(this, value, len);
+ if (choice_value) {
+ spb_child_index = afr_get_split_brain_child_index(this, choice_value,
+ len);
if (spb_child_index < 0) {
/* Case where value was "none" */
if (spb_child_index == -2)
@@ -1410,12 +1422,8 @@ afr_handle_split_brain_commands(xlator_t *this, call_frame_t *frame, loc_t *loc,
ret = synctask_new(this->ctx->env, afr_can_set_split_brain_choice,
afr_set_split_brain_choice, NULL, data);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0,
- AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR,
- "Failed to create"
- " synctask. Aborting split-brain choice set"
- " for %s",
- loc->name);
+ gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN_STATUS,
+ "name=%s", loc->name, NULL);
ret = 1;
op_errno = ENOMEM;
goto out;
@@ -1424,9 +1432,9 @@ afr_handle_split_brain_commands(xlator_t *this, call_frame_t *frame, loc_t *loc,
goto out;
}
- ret = dict_get_ptr_and_len(dict, GF_AFR_SBRAIN_RESOLVE, &value, &len);
- if (value) {
- spb_child_index = afr_get_split_brain_child_index(this, value, len);
+ if (resolve_value) {
+ spb_child_index = afr_get_split_brain_child_index(this, resolve_value,
+ len);
if (spb_child_index < 0) {
ret = 1;
goto out;
@@ -1490,8 +1498,8 @@ afr_handle_empty_brick(xlator_t *this, call_frame_t *frame, loc_t *loc,
goto out;
if (frame->root->pid != GF_CLIENT_PID_ADD_REPLICA_MOUNT) {
- gf_msg(this->name, GF_LOG_ERROR, EPERM, afr_get_msg_id(op_type),
- "'%s' is an internal extended attribute.", op_type);
+ gf_smsg(this->name, GF_LOG_ERROR, EPERM, AFR_MSG_INTERNAL_ATTR,
+ "op_type=%s", op_type, NULL);
ret = 1;
goto out;
}
@@ -1517,8 +1525,8 @@ afr_handle_empty_brick(xlator_t *this, call_frame_t *frame, loc_t *loc,
ret = synctask_new(this->ctx->env, _afr_handle_empty_brick,
_afr_handle_empty_brick_cbk, NULL, data);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, afr_get_msg_id(op_type),
- "Failed to create synctask.");
+ gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN_STATUS,
+ NULL);
ret = 1;
op_errno = ENOMEM;
afr_brick_args_cleanup(data);
@@ -1676,6 +1684,7 @@ afr_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
GF_IF_INTERNAL_XATTR_GOTO("trusted.glusterfs.afr.*", dict, op_errno, out);
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
transaction_frame = copy_frame(frame);
if (!transaction_frame)
goto out;
@@ -1884,6 +1893,7 @@ afr_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
GF_IF_NATIVE_XATTR_GOTO("trusted.glusterfs.afr.*", name, op_errno, out);
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
transaction_frame = copy_frame(frame);
if (!transaction_frame)
goto out;
@@ -1984,6 +1994,7 @@ afr_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
int ret = -1;
int op_errno = ENOMEM;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
transaction_frame = copy_frame(frame);
if (!transaction_frame)
goto out;
@@ -2093,6 +2104,7 @@ afr_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
int ret = -1;
int op_errno = ENOMEM;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
transaction_frame = copy_frame(frame);
if (!transaction_frame)
goto out;
@@ -2199,6 +2211,7 @@ afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
int ret = -1;
int op_errno = ENOMEM;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
transaction_frame = copy_frame(frame);
if (!transaction_frame)
goto out;
@@ -2398,6 +2411,7 @@ afr_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
int ret = -1;
int op_errno = ENOMEM;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
transaction_frame = copy_frame(frame);
if (!transaction_frame)
goto out;
@@ -2492,7 +2506,9 @@ afr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
call_frame_t *transaction_frame = NULL;
int ret = -1;
int32_t op_errno = ENOMEM;
+ int8_t last_fsync = 0;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
transaction_frame = copy_frame(frame);
if (!transaction_frame)
goto out;
@@ -2501,10 +2517,16 @@ afr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
if (!local)
goto out;
- if (xdata)
+ if (xdata) {
local->xdata_req = dict_copy_with_ref(xdata, NULL);
- else
+ if (dict_get_int8(xdata, "last-fsync", &last_fsync) == 0) {
+ if (last_fsync) {
+ local->transaction.disable_delayed_post_op = _gf_true;
+ }
+ }
+ } else {
local->xdata_req = dict_new();
+ }
if (!local->xdata_req)
goto out;
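
afr_fsync() above starts honouring a "last-fsync" hint in xdata to disable the delayed post-op for that transaction. The sketch below shows how a caller might pass the hint; the key name comes from the hunk, while the idea that a caller builds the dict this way is an assumption for illustration and not something shown in this diff.

#include <glusterfs/dict.h>

/* Build an xdata dict asking AFR to flush the delayed post-op with this
 * fsync. The caller winds the fsync fop with this dict and unrefs it. */
static dict_t *
build_last_fsync_xdata(void)
{
    dict_t *xdata = dict_new();

    if (!xdata)
        return NULL;

    if (dict_set_int8(xdata, "last-fsync", 1) != 0) {
        dict_unref(xdata);
        return NULL;
    }
    return xdata;
}
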
diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c
index 4091671fa3f..bc8eabe0f43 100644
--- a/xlators/cluster/afr/src/afr-lk-common.c
+++ b/xlators/cluster/afr/src/afr-lk-common.c
@@ -397,7 +397,6 @@ afr_unlock_now(call_frame_t *frame, xlator_t *this)
int_lock->lk_call_count = call_count;
if (!call_count) {
- GF_ASSERT(!local->transaction.do_eager_unlock);
gf_msg_trace(this->name, 0, "No internal locks unlocked");
int_lock->lock_cbk(frame, this);
goto out;
diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h
index b0fb00641a0..816065fb57a 100644
--- a/xlators/cluster/afr/src/afr-mem-types.h
+++ b/xlators/cluster/afr/src/afr-mem-types.h
@@ -31,6 +31,8 @@ enum gf_afr_mem_types_ {
gf_afr_mt_empty_brick_t,
gf_afr_mt_child_latency_t,
gf_afr_mt_atomic_t,
+ gf_afr_mt_lk_heal_info_t,
+ gf_afr_mt_gf_lock,
gf_afr_mt_end
};
#endif
diff --git a/xlators/cluster/afr/src/afr-messages.h b/xlators/cluster/afr/src/afr-messages.h
index c9c99270e98..e73fd997765 100644
--- a/xlators/cluster/afr/src/afr-messages.h
+++ b/xlators/cluster/afr/src/afr-messages.h
@@ -23,25 +23,145 @@
* glfs-message-id.h.
*/
-GLFS_MSGID(AFR, AFR_MSG_QUORUM_FAIL, AFR_MSG_QUORUM_MET,
- AFR_MSG_QUORUM_OVERRIDE, AFR_MSG_INVALID_CHILD_UP, AFR_MSG_SUBVOL_UP,
- AFR_MSG_SUBVOLS_DOWN, AFR_MSG_ENTRY_UNLOCK_FAIL, AFR_MSG_SPLIT_BRAIN,
- AFR_MSG_OPEN_FAIL, AFR_MSG_UNLOCK_FAIL, AFR_MSG_REPLACE_BRICK_STATUS,
- AFR_MSG_GFID_NULL, AFR_MSG_FD_CREATE_FAILED, AFR_MSG_DICT_SET_FAILED,
- AFR_MSG_EXPUNGING_FILE_OR_DIR, AFR_MSG_MIGRATION_IN_PROGRESS,
- AFR_MSG_CHILD_MISCONFIGURED, AFR_MSG_VOL_MISCONFIGURED,
- AFR_MSG_INTERNAL_LKS_FAILED, AFR_MSG_INVALID_FD, AFR_MSG_LOCK_INFO,
- AFR_MSG_LOCK_XLATOR_NOT_LOADED, AFR_MSG_FD_CTX_GET_FAILED,
- AFR_MSG_INVALID_SUBVOL, AFR_MSG_PUMP_XLATOR_ERROR,
- AFR_MSG_SELF_HEAL_INFO, AFR_MSG_READ_SUBVOL_ERROR,
- AFR_MSG_DICT_GET_FAILED, AFR_MSG_INFO_COMMON,
- AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, AFR_MSG_LOCAL_CHILD,
- AFR_MSG_INVALID_DATA, AFR_MSG_INVALID_ARG,
- AFR_MSG_INDEX_DIR_GET_FAILED, AFR_MSG_FSYNC_FAILED,
- AFR_MSG_FAVORITE_CHILD, AFR_MSG_SELF_HEAL_FAILED,
- AFR_MSG_SPLIT_BRAIN_STATUS, AFR_MSG_ADD_BRICK_STATUS,
- AFR_MSG_NO_CHANGELOG, AFR_MSG_TIMER_CREATE_FAIL,
- AFR_MSG_SBRAIN_FAV_CHILD_POLICY, AFR_MSG_INODE_CTX_GET_FAILED,
- AFR_MSG_THIN_ARB);
+GLFS_MSGID(
+ AFR, AFR_MSG_QUORUM_FAIL, AFR_MSG_QUORUM_MET, AFR_MSG_QUORUM_OVERRIDE,
+ AFR_MSG_INVALID_CHILD_UP, AFR_MSG_SUBVOL_UP, AFR_MSG_SUBVOLS_DOWN,
+ AFR_MSG_ENTRY_UNLOCK_FAIL, AFR_MSG_SPLIT_BRAIN, AFR_MSG_OPEN_FAIL,
+ AFR_MSG_UNLOCK_FAIL, AFR_MSG_REPLACE_BRICK_STATUS, AFR_MSG_GFID_NULL,
+ AFR_MSG_FD_CREATE_FAILED, AFR_MSG_DICT_SET_FAILED,
+ AFR_MSG_EXPUNGING_FILE_OR_DIR, AFR_MSG_MIGRATION_IN_PROGRESS,
+ AFR_MSG_CHILD_MISCONFIGURED, AFR_MSG_VOL_MISCONFIGURED,
+ AFR_MSG_INTERNAL_LKS_FAILED, AFR_MSG_INVALID_FD, AFR_MSG_LOCK_INFO,
+ AFR_MSG_LOCK_XLATOR_NOT_LOADED, AFR_MSG_FD_CTX_GET_FAILED,
+ AFR_MSG_INVALID_SUBVOL, AFR_MSG_PUMP_XLATOR_ERROR, AFR_MSG_SELF_HEAL_INFO,
+ AFR_MSG_READ_SUBVOL_ERROR, AFR_MSG_DICT_GET_FAILED, AFR_MSG_INFO_COMMON,
+ AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, AFR_MSG_LOCAL_CHILD, AFR_MSG_INVALID_DATA,
+ AFR_MSG_INVALID_ARG, AFR_MSG_INDEX_DIR_GET_FAILED, AFR_MSG_FSYNC_FAILED,
+ AFR_MSG_FAVORITE_CHILD, AFR_MSG_SELF_HEAL_FAILED,
+ AFR_MSG_SPLIT_BRAIN_STATUS, AFR_MSG_ADD_BRICK_STATUS, AFR_MSG_NO_CHANGELOG,
+ AFR_MSG_TIMER_CREATE_FAIL, AFR_MSG_SBRAIN_FAV_CHILD_POLICY,
+ AFR_MSG_INODE_CTX_GET_FAILED, AFR_MSG_THIN_ARB,
+ AFR_MSG_THIN_ARB_XATTROP_FAILED, AFR_MSG_THIN_ARB_LOC_POP_FAILED,
+ AFR_MSG_GET_PEND_VAL, AFR_MSG_THIN_ARB_SKIP_SHD, AFR_MSG_UNKNOWN_SET,
+ AFR_MSG_NO_XL_ID, AFR_MSG_SELF_HEAL_INFO_START,
+ AFR_MSG_SELF_HEAL_INFO_FINISH, AFR_MSG_INCRE_COUNT,
+ AFR_MSG_ADD_TO_OUTPUT_FAILED, AFR_MSG_SET_TIME_FAILED,
+ AFR_MSG_GFID_MISMATCH_DETECTED, AFR_MSG_GFID_HEAL_MSG,
+ AFR_MSG_THIN_ARB_LOOKUP_FAILED, AFR_MSG_DICT_CREATE_FAILED,
+ AFR_MSG_NO_MAJORITY_TO_RESOLVE, AFR_MSG_TYPE_MISMATCH,
+ AFR_MSG_SIZE_POLICY_NOT_APPLICABLE, AFR_MSG_NO_CHILD_SELECTED,
+ AFR_MSG_INVALID_CHILD, AFR_MSG_RESOLVE_CONFLICTING_DATA,
+ SERROR_GETTING_SRC_BRICK, SNO_DIFF_IN_MTIME, SNO_BIGGER_FILE,
+ SALL_BRICKS_UP_TO_RESOLVE, AFR_MSG_UNLOCK_FAILED, AFR_MSG_POST_OP_FAILED,
+ AFR_MSG_TA_FRAME_CREATE_FAILED, AFR_MSG_SET_KEY_XATTROP_FAILED,
+ AFR_MSG_BLOCKING_ENTRYLKS_FAILED, AFR_MSG_FOP_FAILED,
+ AFR_MSG_CLEAN_UP_FAILED, AFR_MSG_UNABLE_TO_FETCH, AFR_MSG_XATTR_SET_FAILED,
+ AFR_MSG_SPLIT_BRAIN_REPLICA, AFR_MSG_INODE_CTX_FAILED,
+ AFR_MSG_LOOKUP_FAILED, AFR_MSG_ALL_SUBVOLS_DOWN,
+ AFR_MSG_RELEASE_LOCK_FAILED, AFR_MSG_CLEAR_TIME_SPLIT_BRAIN,
+ AFR_MSG_READ_FAILED, AFR_MSG_LAUNCH_FAILED, AFR_MSG_READ_SUBVOL_NOT_UP,
+ AFR_MSG_LK_HEAL_DOM, AFR_MSG_NEW_BRICK, AFR_MSG_SPLIT_BRAIN_SET_FAILED,
+ AFR_MSG_SPLIT_BRAIN_DETERMINE_FAILED, AFR_MSG_HEALER_SPAWN_FAILED,
+ AFR_MSG_ADD_CRAWL_EVENT_FAILED, AFR_MSG_NULL_DEREF, AFR_MSG_SET_PEND_XATTR,
+ AFR_MSG_INTERNAL_ATTR);
+#define AFR_MSG_DICT_GET_FAILED_STR "Dict get failed"
+#define AFR_MSG_DICT_SET_FAILED_STR "Dict set failed"
+#define AFR_MSG_HEALER_SPAWN_FAILED_STR "Healer spawn failed"
+#define AFR_MSG_ADD_CRAWL_EVENT_FAILED_STR "Adding crawl event failed"
+#define AFR_MSG_INVALID_ARG_STR "Invalid argument"
+#define AFR_MSG_INDEX_DIR_GET_FAILED_STR "unable to get index-dir on "
+#define AFR_MSG_THIN_ARB_LOOKUP_FAILED_STR "Failed lookup on file"
+#define AFR_MSG_DICT_CREATE_FAILED_STR "Failed to create dict."
+#define AFR_MSG_THIN_ARB_XATTROP_FAILED_STR "Xattrop failed."
+#define AFR_MSG_THIN_ARB_LOC_POP_FAILED_STR \
+ "Failed to populate loc for thin-arbiter"
+#define AFR_MSG_GET_PEND_VAL_STR "Error getting value of pending"
+#define AFR_MSG_THIN_ARB_SKIP_SHD_STR "I am not the god shd. skipping."
+#define AFR_MSG_UNKNOWN_SET_STR "Unknown set"
+#define AFR_MSG_NO_XL_ID_STR "xl does not have id"
+#define AFR_MSG_SELF_HEAL_INFO_START_STR "starting full sweep on"
+#define AFR_MSG_SELF_HEAL_INFO_FINISH_STR "finished full sweep on"
+#define AFR_MSG_INCRE_COUNT_STR "Could not increment the counter."
+#define AFR_MSG_ADD_TO_OUTPUT_FAILED_STR "Could not add to output"
+#define AFR_MSG_SET_TIME_FAILED_STR "Could not set time"
+#define AFR_MSG_GFID_HEAL_MSG_STR "Error setting gfid-heal-msg dict"
+#define AFR_MSG_NO_MAJORITY_TO_RESOLVE_STR \
+ "No majority to resolve gfid split brain"
+#define AFR_MSG_GFID_MISMATCH_DETECTED_STR "Gfid mismatch detected"
+#define AFR_MSG_SELF_HEAL_INFO_STR "performing selfheal"
+#define AFR_MSG_TYPE_MISMATCH_STR "TYPE mismatch"
+#define AFR_MSG_SIZE_POLICY_NOT_APPLICABLE_STR \
+ "Size policy is not applicable to directories."
+#define AFR_MSG_NO_CHILD_SELECTED_STR \
+ "No child selected by favorite-child policy"
+#define AFR_MSG_INVALID_CHILD_STR "Invalid child"
+#define AFR_MSG_RESOLVE_CONFLICTING_DATA_STR \
+ "selected as authentic to resolve conflicting data"
+#define SERROR_GETTING_SRC_BRICK_STR "Error getting the source brick"
+#define SNO_DIFF_IN_MTIME_STR "No difference in mtime"
+#define SNO_BIGGER_FILE_STR "No bigger file"
+#define SALL_BRICKS_UP_TO_RESOLVE_STR \
+ "All the bricks should be up to resolve the gfid split brain"
+#define AFR_MSG_UNLOCK_FAILED_STR "Failed to unlock"
+#define AFR_MSG_POST_OP_FAILED_STR "Post-op on thin-arbiter failed"
+#define AFR_MSG_TA_FRAME_CREATE_FAILED_STR "Failed to create ta_frame"
+#define AFR_MSG_SET_KEY_XATTROP_FAILED_STR "Could not set key during xattrop"
+#define AFR_MSG_BLOCKING_ENTRYLKS_FAILED_STR "Blocking entrylks failed"
+#define AFR_MSG_FSYNC_FAILED_STR "fsync failed"
+#define AFR_MSG_QUORUM_FAIL_STR "quorum is not met"
+#define AFR_MSG_FOP_FAILED_STR "Failing Fop"
+#define AFR_MSG_INVALID_SUBVOL_STR "not a subvolume"
+#define AFR_MSG_VOL_MISCONFIGURED_STR "Volume is dangling"
+#define AFR_MSG_CHILD_MISCONFIGURED_STR \
+ "replicate translator needs more than one subvolume defined"
+#define AFR_MSG_CLEAN_UP_FAILED_STR "Failed to clean up healer threads"
+#define AFR_MSG_QUORUM_OVERRIDE_STR "overriding quorum-count"
+#define AFR_MSG_UNABLE_TO_FETCH_STR \
+ "Unable to fetch afr-pending-xattr option from volfile. Falling back to " \
+ "using client translator names"
+#define AFR_MSG_NULL_DEREF_STR "possible NULL deref"
+#define AFR_MSG_XATTR_SET_FAILED_STR "Cannot set xattr cookie key"
+#define AFR_MSG_SPLIT_BRAIN_STATUS_STR "Failed to create synctask"
+#define AFR_MSG_SUBVOLS_DOWN_STR "All subvolumes are not up"
+#define AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR_STR \
+ "Failed to cancel split-brain choice"
+#define AFR_MSG_SPLIT_BRAIN_REPLICA_STR \
+ "Cannot set replica. File is not in data/metadata split-brain"
+#define AFR_MSG_INODE_CTX_FAILED_STR "Failed to get inode_ctx"
+#define AFR_MSG_READ_SUBVOL_ERROR_STR "no read subvols"
+#define AFR_MSG_LOCAL_CHILD_STR "selecting local read-child"
+#define AFR_MSG_LOOKUP_FAILED_STR "Failed to lookup/create thin-arbiter id file"
+#define AFR_MSG_TIMER_CREATE_FAIL_STR \
+ "Cannot create timer for delayed initialization"
+#define AFR_MSG_SUBVOL_UP_STR "Subvolume came back up; going online"
+#define AFR_MSG_ALL_SUBVOLS_DOWN_STR \
+ "All subvolumes are down. Going offline until atleast one of them is up"
+#define AFR_MSG_RELEASE_LOCK_FAILED_STR "Failed to release lock"
+#define AFR_MSG_INVALID_CHILD_UP_STR "Received child_up from invalid subvolume"
+#define AFR_MSG_QUORUM_MET_STR "Client-quorum is met"
+#define AFR_MSG_EXPUNGING_FILE_OR_DIR_STR "expunging file or dir"
+#define AFR_MSG_SELF_HEAL_FAILED_STR "Invalid"
+#define AFR_MSG_SPLIT_BRAIN_STR "Skipping conservative merge on the file"
+#define AFR_MSG_CLEAR_TIME_SPLIT_BRAIN_STR "clear time split brain"
+#define AFR_MSG_READ_FAILED_STR "Failing read since good brick is down"
+#define AFR_MSG_LAUNCH_FAILED_STR "Failed to launch synctask"
+#define AFR_MSG_READ_SUBVOL_NOT_UP_STR \
+ "read subvolume in this generation is not up"
+#define AFR_MSG_INTERNAL_LKS_FAILED_STR \
+ "Unable to work with lk-owner while attempting fop"
+#define AFR_MSG_LOCK_XLATOR_NOT_LOADED_STR \
+ "subvolume does not support locking. please load features/locks xlator " \
+ "on server."
+#define AFR_MSG_FD_CTX_GET_FAILED_STR "unable to get fd ctx"
+#define AFR_MSG_INFO_COMMON_STR "fd not open on any subvolumes, aborting."
+#define AFR_MSG_REPLACE_BRICK_STATUS_STR "Couldn't acquire lock on any child."
+#define AFR_MSG_NEW_BRICK_STR "New brick"
+#define AFR_MSG_SPLIT_BRAIN_SET_FAILED_STR \
+ "Failed to set split-brain choice to -1"
+#define AFR_MSG_SPLIT_BRAIN_DETERMINE_FAILED_STR \
+ "Failed to determine split-brain. Aborting split-brain-choice set"
+#define AFR_MSG_OPEN_FAIL_STR "Failed to open subvolume"
+#define AFR_MSG_SET_PEND_XATTR_STR "Set of pending xattr"
+#define AFR_MSG_INTERNAL_ATTR_STR "is an internal extended attribute"
#endif /* !_AFR_MESSAGES_H_ */
diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c
index ff72c73a9f4..64856042b65 100644
--- a/xlators/cluster/afr/src/afr-open.c
+++ b/xlators/cluster/afr/src/afr-open.c
@@ -8,9 +8,7 @@
cases as published by the Free Software Foundation.
*/
-#include <libgen.h>
#include <unistd.h>
-#include <fnmatch.h>
#include <sys/time.h>
#include <stdlib.h>
#include <signal.h>
@@ -18,11 +16,7 @@
#include <glusterfs/glusterfs.h>
#include "afr.h"
#include <glusterfs/dict.h>
-#include <glusterfs/xlator.h>
-#include <glusterfs/hashfn.h>
#include <glusterfs/logging.h>
-#include <glusterfs/list.h>
-#include <glusterfs/call-stub.h>
#include <glusterfs/defaults.h>
#include <glusterfs/common-utils.h>
#include <glusterfs/compat-errno.h>
@@ -30,10 +24,6 @@
#include <glusterfs/byte-order.h>
#include <glusterfs/statedump.h>
-#include "afr-inode-read.h"
-#include "afr-inode-write.h"
-#include "afr-dir-read.h"
-#include "afr-dir-write.h"
#include "afr-transaction.h"
gf_boolean_t
@@ -73,6 +63,10 @@ afr_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
local = frame->local;
fd_ctx = local->fd_ctx;
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+
LOCK(&frame->lock);
{
if (op_ret == -1) {
@@ -84,13 +78,16 @@ afr_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
if (!local->xdata_rsp && xdata)
local->xdata_rsp = dict_ref(xdata);
}
+ call_count = --local->call_count;
}
UNLOCK(&frame->lock);
- call_count = afr_frame_return(frame);
-
if (call_count == 0) {
- if ((fd_ctx->flags & O_TRUNC) && (local->op_ret >= 0)) {
+ afr_handle_replies_quorum(frame, this);
+ if (local->op_ret == -1) {
+ AFR_STACK_UNWIND(open, frame, local->op_ret, local->op_errno, NULL,
+ NULL);
+ } else if (fd_ctx->flags & O_TRUNC) {
STACK_WIND(frame, afr_open_ftruncate_cbk, this,
this->fops->ftruncate, fd, 0, NULL);
} else {
@@ -140,7 +137,7 @@ afr_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- int spb_choice = 0;
+ int spb_subvol = 0;
int event_generation = 0;
int ret = 0;
int32_t op_errno = 0;
@@ -161,6 +158,11 @@ afr_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
goto out;
}
+ if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) {
+ op_errno = afr_quorum_errno(priv);
+ goto out;
+ }
+
if (!afr_is_consistent_io_possible(local, priv, &op_errno))
goto out;
@@ -177,9 +179,9 @@ afr_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
ret = afr_inode_get_readable(frame, local->inode, this, NULL,
&event_generation, AFR_DATA_TRANSACTION);
if ((ret < 0) &&
- (afr_inode_split_brain_choice_get(local->inode, this, &spb_choice) ==
- 0) &&
- spb_choice < 0) {
+ (afr_split_brain_read_subvol_get(local->inode, this, NULL,
+ &spb_subvol) == 0) &&
+ spb_subvol < 0) {
afr_inode_refresh(frame, this, local->inode, local->inode->gfid,
afr_open_continue);
} else {
@@ -213,11 +215,9 @@ afr_openfd_fix_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
"successfully on subvolume %s",
local->loc.path, priv->children[child_index]->name);
} else {
- gf_msg(this->name, fop_log_level(GF_FOP_OPEN, op_errno), op_errno,
- AFR_MSG_OPEN_FAIL,
- "Failed to open %s on "
- "subvolume %s",
- local->loc.path, priv->children[child_index]->name);
+ gf_smsg(this->name, fop_log_level(GF_FOP_OPEN, op_errno), op_errno,
+ AFR_MSG_OPEN_FAIL, "path=%s", local->loc.path, "subvolume=%s",
+ priv->children[child_index]->name, NULL);
}
fd_ctx = local->fd_ctx;
diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c
index 9a91f2e56fc..6fc2c75145c 100644
--- a/xlators/cluster/afr/src/afr-read-txn.c
+++ b/xlators/cluster/afr/src/afr-read-txn.c
@@ -30,27 +30,6 @@ afr_pending_read_decrement(afr_private_t *priv, int child_index)
GF_ATOMIC_DEC(priv->pending_reads[child_index]);
}
-static gf_boolean_t
-afr_ta_dict_contains_pending_xattr(dict_t *dict, afr_private_t *priv, int child)
-{
- int *pending = NULL;
- int ret = 0;
- int i = 0;
-
- ret = dict_get_ptr(dict, priv->pending_key[child], (void *)&pending);
- if (ret == 0) {
- for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
- /* Not doing a ntoh32(pending) as we just want to check
- * if it is non-zero or not. */
- if (pending[i]) {
- return _gf_true;
- }
- }
- }
-
- return _gf_false;
-}
-
void
afr_read_txn_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
@@ -114,7 +93,7 @@ afr_ta_read_txn(void *opaque)
call_frame_t *frame = NULL;
xlator_t *this = NULL;
int read_subvol = -1;
- int up_child = AFR_CHILD_UNKNOWN;
+ int query_child = AFR_CHILD_UNKNOWN;
int possible_bad_child = AFR_CHILD_UNKNOWN;
int ret = 0;
int op_errno = ENOMEM;
@@ -134,18 +113,18 @@ afr_ta_read_txn(void *opaque)
this = frame->this;
local = frame->local;
priv = this->private;
+ query_child = local->read_txn_query_child;
- if (local->child_up[AFR_CHILD_ZERO]) {
- up_child = AFR_CHILD_ZERO;
+ if (query_child == AFR_CHILD_ZERO) {
possible_bad_child = AFR_CHILD_ONE;
- } else if (local->child_up[AFR_CHILD_ONE]) {
- up_child = AFR_CHILD_ONE;
+ } else if (query_child == AFR_CHILD_ONE) {
possible_bad_child = AFR_CHILD_ZERO;
+ } else {
+ /*read_txn_query_child is AFR_CHILD_UNKNOWN*/
+ goto out;
}
- GF_ASSERT(up_child != AFR_CHILD_UNKNOWN);
-
- /* Query the up_child to see if it blames the down one. */
+ /* Ask the query_child to see if it blames the possibly bad one. */
xdata_req = dict_new();
if (!xdata_req)
goto out;
@@ -159,30 +138,33 @@ afr_ta_read_txn(void *opaque)
goto out;
if (local->fd) {
- ret = syncop_fxattrop(priv->children[up_child], local->fd,
+ ret = syncop_fxattrop(priv->children[query_child], local->fd,
GF_XATTROP_ADD_ARRAY, xdata_req, NULL, &xdata_rsp,
NULL);
} else {
- ret = syncop_xattrop(priv->children[up_child], &local->loc,
+ ret = syncop_xattrop(priv->children[query_child], &local->loc,
GF_XATTROP_ADD_ARRAY, xdata_req, NULL, &xdata_rsp,
NULL);
}
if (ret || !xdata_rsp) {
gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
"Failed xattrop for gfid %s on %s",
- uuid_utoa(local->inode->gfid), priv->children[up_child]->name);
+ uuid_utoa(local->inode->gfid),
+ priv->children[query_child]->name);
op_errno = -ret;
goto out;
}
if (afr_ta_dict_contains_pending_xattr(xdata_rsp, priv,
possible_bad_child)) {
- read_subvol = up_child;
+ read_subvol = query_child;
goto out;
}
dict_unref(xdata_rsp);
- /* Query thin-arbiter to see if it blames any data brick. */
- ret = afr_fill_ta_loc(this, &loc);
+ xdata_rsp = NULL;
+
+ /* It doesn't. So query thin-arbiter to see if it blames any data brick. */
+ ret = afr_fill_ta_loc(this, &loc, _gf_true);
if (ret) {
gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
"Failed to populate thin-arbiter loc for: %s.", loc.name);
@@ -211,8 +193,8 @@ afr_ta_read_txn(void *opaque)
goto unlock;
}
- if (!afr_ta_dict_contains_pending_xattr(xdata_rsp, priv, up_child)) {
- read_subvol = up_child;
+ if (!afr_ta_dict_contains_pending_xattr(xdata_rsp, priv, query_child)) {
+ read_subvol = query_child;
} else {
gf_msg(this->name, GF_LOG_ERROR, EIO, AFR_MSG_THIN_ARB,
"Failing read for gfid %s since good brick %s is down",
@@ -290,7 +272,7 @@ afr_read_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err)
int read_subvol = -1;
inode_t *inode = NULL;
int ret = -1;
- int spb_choice = -1;
+ int spb_subvol = -1;
local = frame->local;
inode = local->inode;
@@ -321,9 +303,9 @@ afr_read_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err)
local->read_attempted[read_subvol] = 1;
readfn:
if (read_subvol == -1) {
- ret = afr_inode_split_brain_choice_get(inode, this, &spb_choice);
- if ((ret == 0) && spb_choice >= 0)
- read_subvol = spb_choice;
+ ret = afr_split_brain_read_subvol_get(inode, this, frame, &spb_subvol);
+ if ((ret == 0) && spb_subvol >= 0)
+ read_subvol = spb_subvol;
}
if (read_subvol == -1) {
@@ -450,6 +432,11 @@ afr_read_txn(call_frame_t *frame, xlator_t *this, inode_t *inode,
if (priv->thin_arbiter_count &&
AFR_COUNT(local->child_up, priv->child_count) != priv->child_count) {
+ if (local->child_up[0]) {
+ local->read_txn_query_child = AFR_CHILD_ZERO;
+ } else if (local->child_up[1]) {
+ local->read_txn_query_child = AFR_CHILD_ONE;
+ }
afr_ta_read_txn_synctask(frame, this);
return 0;
}
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 226876154d7..a580a1584cc 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -55,7 +55,8 @@ afr_lookup_and_heal_gfid(xlator_t *this, inode_t *parent, const char *name,
for (i = 0; i < priv->child_count; i++) {
if (source == -1) {
/* case (a) above. */
- if (replies[i].valid && replies[i].op_ret == 0) {
+ if (replies[i].valid && replies[i].op_ret == 0 &&
+ replies[i].poststat.ia_type != IA_INVAL) {
ia_type = replies[i].poststat.ia_type;
break;
}
@@ -63,7 +64,8 @@ afr_lookup_and_heal_gfid(xlator_t *this, inode_t *parent, const char *name,
/* case (b) above. */
if (i == source)
continue;
- if (sources[i] && replies[i].valid && replies[i].op_ret == 0) {
+ if (sources[i] && replies[i].valid && replies[i].op_ret == 0 &&
+ replies[i].poststat.ia_type != IA_INVAL) {
ia_type = replies[i].poststat.ia_type;
break;
}
@@ -77,6 +79,12 @@ heal:
for (i = 0; i < priv->child_count; i++) {
if (!replies[i].valid || replies[i].op_ret != 0)
continue;
+
+ if (gf_uuid_is_null(gfid) &&
+ !gf_uuid_is_null(replies[i].poststat.ia_gfid) &&
+ replies[i].poststat.ia_type == ia_type)
+ gfid = replies[i].poststat.ia_gfid;
+
if (!gf_uuid_is_null(replies[i].poststat.ia_gfid) ||
replies[i].poststat.ia_type != ia_type)
continue;
@@ -132,7 +140,7 @@ heal:
}
}
out:
- if (gfid_idx && (*gfid_idx == -1) && (ret == 0)) {
+ if (gfid_idx && (*gfid_idx == -1) && (ret == 0) && local) {
ret = -afr_final_errno(local, priv);
}
loc_wipe(&loc);
@@ -383,11 +391,12 @@ out:
uuid_utoa_r(replies[src_idx].poststat.ia_gfid, g2),
priv->children[src_idx]->name);
gf_event(EVENT_AFR_SPLIT_BRAIN,
+ "client-pid=%d;"
"subvol=%s;type=gfid;file="
"<gfid:%s>/%s>;count=2;child-%d=%s;gfid-%d=%s;"
"child-%d=%s;gfid-%d=%s",
- this->name, uuid_utoa(pargfid), bname, child_idx,
- priv->children[child_idx]->name, child_idx,
+ this->ctx->cmd_args.client_pid, this->name, uuid_utoa(pargfid),
+ bname, child_idx, priv->children[child_idx]->name, child_idx,
uuid_utoa_r(replies[child_idx].poststat.ia_gfid, g1), src_idx,
priv->children[src_idx]->name, src_idx,
uuid_utoa_r(replies[src_idx].poststat.ia_gfid, g2));
@@ -504,7 +513,8 @@ afr_selfheal_restore_time(call_frame_t *frame, xlator_t *this, inode_t *inode,
AFR_ONLIST(healed_sinks, frame, afr_sh_generic_fop_cbk, setattr, &loc,
&replies[source].poststat,
- (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME), NULL);
+ (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME | GF_SET_ATTR_CTIME),
+ NULL);
loc_wipe(&loc);
@@ -1565,7 +1575,6 @@ afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
char *accused = NULL; /* Accused others without any self-accusal */
char *pending = NULL; /* Have pending operations on others */
char *self_accused = NULL; /* Accused itself */
- int min_participants = -1;
priv = this->private;
@@ -1589,12 +1598,7 @@ afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
}
}
- if (type == AFR_DATA_TRANSACTION) {
- min_participants = priv->child_count;
- } else {
- min_participants = AFR_SH_MIN_PARTICIPANTS;
- }
- if (afr_success_count(replies, priv->child_count) < min_participants) {
+ if (afr_success_count(replies, priv->child_count) < priv->child_count) {
/* Treat this just like locks not being acquired */
return -ENOTCONN;
}
@@ -1655,7 +1659,7 @@ afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
}
}
- if (type == AFR_DATA_TRANSACTION)
+ if (type == AFR_DATA_TRANSACTION || type == AFR_METADATA_TRANSACTION)
afr_selfheal_post_op_failure_accounting(priv, accused, sources,
locked_on);
@@ -1763,11 +1767,9 @@ afr_selfheal_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
if (xdata) {
local->replies[i].xdata = dict_ref(xdata);
ret = dict_get_int8(xdata, "link-count", &need_heal);
- local->replies[i].need_heal = need_heal;
- } else {
- local->replies[i].need_heal = need_heal;
}
+ local->replies[i].need_heal = need_heal;
syncbarrier_wake(&local->barrier);
return 0;
@@ -1823,10 +1825,41 @@ afr_selfheal_unlocked_lookup_on(call_frame_t *frame, inode_t *parent,
return inode;
}
+static int
+afr_set_multi_dom_lock_count_request(xlator_t *this, dict_t *dict)
+{
+ int ret = 0;
+ afr_private_t *priv = NULL;
+ char *key1 = NULL;
+ char *key2 = NULL;
+
+ priv = this->private;
+ key1 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 +
+ strlen(this->name));
+ key2 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 +
+ strlen(priv->sh_domain));
+
+ ret = dict_set_uint32(dict, GLUSTERFS_MULTIPLE_DOM_LK_CNT_REQUESTS, 1);
+ if (ret)
+ return ret;
+
+ sprintf(key1, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, this->name);
+ ret = dict_set_uint32(dict, key1, 1);
+ if (ret)
+ return ret;
+
+ sprintf(key2, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, priv->sh_domain);
+ ret = dict_set_uint32(dict, key2, 1);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
int
afr_selfheal_unlocked_discover_on(call_frame_t *frame, inode_t *inode,
uuid_t gfid, struct afr_reply *replies,
- unsigned char *discover_on)
+ unsigned char *discover_on, dict_t *dict)
{
loc_t loc = {
0,
@@ -1841,12 +1874,19 @@ afr_selfheal_unlocked_discover_on(call_frame_t *frame, inode_t *inode,
xattr_req = dict_new();
if (!xattr_req)
return -ENOMEM;
+ if (dict)
+ dict_copy(dict, xattr_req);
if (afr_xattr_req_prepare(frame->this, xattr_req) != 0) {
dict_unref(xattr_req);
return -ENOMEM;
}
+ if (afr_set_multi_dom_lock_count_request(frame->this, xattr_req)) {
+ dict_unref(xattr_req);
+ return -1;
+ }
+
loc.inode = inode_ref(inode);
gf_uuid_copy(loc.gfid, gfid);
@@ -1865,12 +1905,16 @@ int
afr_selfheal_unlocked_discover(call_frame_t *frame, inode_t *inode, uuid_t gfid,
struct afr_reply *replies)
{
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ dict_t *dict = NULL;
- priv = frame->this->private;
+ local = frame->local;
+
+ if (local->xattr_req)
+ dict = local->xattr_req;
return afr_selfheal_unlocked_discover_on(frame, inode, gfid, replies,
- priv->child_up);
+ local->child_up, dict);
}
unsigned int
@@ -2237,7 +2281,8 @@ int
afr_selfheal_unlocked_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid,
inode_t **link_inode, gf_boolean_t *data_selfheal,
gf_boolean_t *metadata_selfheal,
- gf_boolean_t *entry_selfheal)
+ gf_boolean_t *entry_selfheal,
+ struct afr_reply *replies_dst)
{
afr_private_t *priv = NULL;
inode_t *inode = NULL;
@@ -2296,11 +2341,13 @@ afr_selfheal_unlocked_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid,
priv->children[i]->name,
uuid_utoa(replies[i].poststat.ia_gfid));
gf_event(EVENT_AFR_SPLIT_BRAIN,
+ "client-pid=%d;"
"subvol=%s;"
"type=file;gfid=%s;"
"ia_type-%d=%s;ia_type-%d=%s",
- this->name, uuid_utoa(replies[i].poststat.ia_gfid),
- first_idx, gf_inode_type_to_str(first.ia_type), i,
+ this->ctx->cmd_args.client_pid, this->name,
+ uuid_utoa(replies[i].poststat.ia_gfid), first_idx,
+ gf_inode_type_to_str(first.ia_type), i,
gf_inode_type_to_str(replies[i].poststat.ia_type));
ret = -EIO;
goto out;
@@ -2371,6 +2418,8 @@ afr_selfheal_unlocked_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid,
ret = 0;
out:
+ if (replies && replies_dst)
+ afr_replies_copy(replies_dst, replies, priv->child_count);
if (inode)
inode_unref(inode);
if (replies)
@@ -2410,8 +2459,11 @@ afr_frame_create(xlator_t *this, int32_t *op_errno)
pid_t pid = GF_CLIENT_PID_SELF_HEALD;
frame = create_frame(this, this->ctx->pool);
- if (!frame)
+ if (!frame) {
+ if (op_errno)
+ *op_errno = ENOMEM;
return NULL;
+ }
local = AFR_FRAME_INIT(frame, (*op_errno));
if (!local) {
@@ -2487,7 +2539,7 @@ afr_selfheal_do(call_frame_t *frame, xlator_t *this, uuid_t gfid)
ret = afr_selfheal_unlocked_inspect(frame, this, gfid, &inode,
&data_selfheal, &metadata_selfheal,
- &entry_selfheal);
+ &entry_selfheal, NULL);
if (ret)
goto out;
@@ -2698,3 +2750,185 @@ afr_choose_source_by_policy(afr_private_t *priv, unsigned char *sources,
out:
return source;
}
+
+static int
+afr_anon_inode_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ afr_local_t *local = frame->local;
+ int i = (long)cookie;
+
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (op_ret == 0) {
+ local->op_ret = 0;
+ local->replies[i].poststat = *buf;
+ local->replies[i].preparent = *preparent;
+ local->replies[i].postparent = *postparent;
+ }
+ if (xdata) {
+ local->replies[i].xdata = dict_ref(xdata);
+ }
+
+ syncbarrier_wake(&local->barrier);
+ return 0;
+}
+
+int
+afr_anon_inode_create(xlator_t *this, int child, inode_t **linked_inode)
+{
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = this->private;
+ unsigned char *mkdir_on = alloca0(priv->child_count);
+ unsigned char *lookup_on = alloca0(priv->child_count);
+ loc_t loc = {0};
+ int32_t op_errno = 0;
+ int32_t child_op_errno = 0;
+ struct iatt iatt = {0};
+ dict_t *xdata = NULL;
+ uuid_t anon_inode_gfid = {0};
+ int mkdir_count = 0;
+ int i = 0;
+
+ /*Try to mkdir everywhere and return success if the dir exists on 'child'
+ */
+
+ if (!priv->use_anon_inode) {
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ frame = afr_frame_create(this, &op_errno);
+ if (op_errno) {
+ goto out;
+ }
+ local = frame->local;
+ if (!local->child_up[child]) {
+ /*Other bricks may need mkdir so don't error out yet*/
+ child_op_errno = ENOTCONN;
+ }
+ gf_uuid_parse(priv->anon_gfid_str, anon_inode_gfid);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->child_up[i])
+ continue;
+
+ if (priv->anon_inode[i]) {
+ mkdir_on[i] = 0;
+ } else {
+ mkdir_on[i] = 1;
+ mkdir_count++;
+ }
+ }
+
+ if (mkdir_count == 0) {
+ *linked_inode = inode_find(this->itable, anon_inode_gfid);
+ if (*linked_inode) {
+ op_errno = 0;
+ goto out;
+ }
+ }
+
+ loc.parent = inode_ref(this->itable->root);
+ loc.name = priv->anon_inode_name;
+ loc.inode = inode_new(this->itable);
+ if (!loc.inode) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ xdata = dict_new();
+ if (!xdata) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ op_errno = -dict_set_gfuuid(xdata, "gfid-req", anon_inode_gfid, _gf_true);
+ if (op_errno) {
+ goto out;
+ }
+
+ if (mkdir_count == 0) {
+ memcpy(lookup_on, local->child_up, priv->child_count);
+ goto lookup;
+ }
+
+ AFR_ONLIST(mkdir_on, frame, afr_anon_inode_mkdir_cbk, mkdir, &loc, 0755, 0,
+ xdata);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!mkdir_on[i]) {
+ continue;
+ }
+
+ if (local->replies[i].op_ret == 0) {
+ priv->anon_inode[i] = 1;
+ iatt = local->replies[i].poststat;
+ } else if (local->replies[i].op_ret < 0 &&
+ local->replies[i].op_errno == EEXIST) {
+ lookup_on[i] = 1;
+ } else if (i == child) {
+ child_op_errno = local->replies[i].op_errno;
+ }
+ }
+
+ if (AFR_COUNT(lookup_on, priv->child_count) == 0) {
+ goto link;
+ }
+
+lookup:
+ AFR_ONLIST(lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc,
+ xdata);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!lookup_on[i]) {
+ continue;
+ }
+
+ if (local->replies[i].op_ret == 0) {
+ if (gf_uuid_compare(anon_inode_gfid,
+ local->replies[i].poststat.ia_gfid) == 0) {
+ priv->anon_inode[i] = 1;
+ iatt = local->replies[i].poststat;
+ } else {
+ if (i == child)
+ child_op_errno = EINVAL;
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_DATA,
+ "%s has gfid: %s", priv->anon_inode_name,
+ uuid_utoa(local->replies[i].poststat.ia_gfid));
+ }
+ } else if (i == child) {
+ child_op_errno = local->replies[i].op_errno;
+ }
+ }
+link:
+ if (!gf_uuid_is_null(iatt.ia_gfid)) {
+ *linked_inode = inode_link(loc.inode, loc.parent, loc.name, &iatt);
+ if (*linked_inode) {
+ op_errno = 0;
+ inode_lookup(*linked_inode);
+ } else {
+ op_errno = ENOMEM;
+ }
+ goto out;
+ }
+
+out:
+ if (xdata)
+ dict_unref(xdata);
+ loc_wipe(&loc);
+ /*child_op_errno takes precedence*/
+ if (child_op_errno == 0) {
+ child_op_errno = op_errno;
+ }
+
+ if (child_op_errno && *linked_inode) {
+ inode_unref(*linked_inode);
+ *linked_inode = NULL;
+ }
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ return -child_op_errno;
+}
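/* A minimal standalone sketch of the anonymous-inode naming scheme used by
 * afr_anon_inode_create() and afr_selfheal_entry_anon_inode() above: each
 * replica gets one hidden directory (priv->anon_inode_name, built from
 * AFR_ANON_DIR_PREFIX), and an entry that cannot be safely expunged is renamed
 * into it under its gfid string so it can be found or renamed back later. The
 * real code operates on loc_t/inode_t via syncop_rename(); the string handling
 * and sample values below are only illustrative assumptions. */
#include <stdio.h>

#define ANON_DIR_PREFIX ".glusterfs-anonymous-inode" /* assumed prefix value */

static void
anon_entry_path(char *buf, size_t len, const char *volume_id,
                const char *gfid_str)
{
    /* "<prefix>-<volume-id>/<gfid>" relative to the brick root */
    snprintf(buf, len, "%s-%s/%s", ANON_DIR_PREFIX, volume_id, gfid_str);
}

int
main(void)
{
    char path[256];

    anon_entry_path(path, sizeof(path), "3fa85f64-5717-4562-b3fc-2c963f66afa6",
                    "c2d29867-3d0b-d497-9191-18a9d8ee7830");
    printf("%s\n", path);
    return 0;
}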
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index 6dd38ef633a..37bcc2b3f9e 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -225,24 +225,40 @@ __afr_selfheal_data_read_write(call_frame_t *frame, xlator_t *this, fd_t *fd,
return ret;
}
+static gf_boolean_t
+afr_source_sinks_locked(xlator_t *this, unsigned char *locked_on, int source,
+ unsigned char *healed_sinks)
+{
+ afr_private_t *priv = this->private;
+ int i = 0;
+
+ if (!locked_on[source])
+ return _gf_false;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (healed_sinks[i] && locked_on[i])
+ return _gf_true;
+ }
+
+ return _gf_false;
+}
+
static int
afr_selfheal_data_block(call_frame_t *frame, xlator_t *this, fd_t *fd,
int source, unsigned char *healed_sinks, off_t offset,
size_t size, int type, struct afr_reply *replies)
{
int ret = -1;
- int sink_count = 0;
afr_private_t *priv = NULL;
unsigned char *data_lock = NULL;
priv = this->private;
- sink_count = AFR_COUNT(healed_sinks, priv->child_count);
data_lock = alloca0(priv->child_count);
ret = afr_selfheal_inodelk(frame, this, fd->inode, this->name, offset, size,
data_lock);
{
- if (ret < sink_count) {
+ if (!afr_source_sinks_locked(this, data_lock, source, healed_sinks)) {
ret = -ENOTCONN;
goto unlock;
}
@@ -324,6 +340,9 @@ afr_selfheal_data_do(call_frame_t *frame, xlator_t *this, fd_t *fd, int source,
call_frame_t *iter_frame = NULL;
unsigned char arbiter_sink_status = 0;
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO,
+ "performing data selfheal on %s", uuid_utoa(fd->inode->gfid));
+
priv = this->private;
if (priv->arbiter_count) {
arbiter_sink_status = healed_sinks[ARBITER_BRICK_INDEX];
@@ -375,17 +394,18 @@ __afr_selfheal_truncate_sinks(call_frame_t *frame, xlator_t *this, fd_t *fd,
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- unsigned char arbiter_sink_status = 0;
int i = 0;
local = frame->local;
priv = this->private;
- if (priv->arbiter_count) {
- arbiter_sink_status = healed_sinks[ARBITER_BRICK_INDEX];
- healed_sinks[ARBITER_BRICK_INDEX] = 0;
- }
-
+ /* This will send truncate on the arbiter brick as well if it is marked as
+ * a sink. If changelog is enabled on the volume, it captures the truncate as
+ * a data transaction on the arbiter brick. This helps geo-rep sync the data
+ * properly from master to slave when the arbiter is the ACTIVE brick during
+ * syncing and has had some entries healed for data as part of self-heal.
+ */
AFR_ONLIST(healed_sinks, frame, afr_sh_generic_fop_cbk, ftruncate, fd, size,
NULL);
@@ -396,8 +416,6 @@ __afr_selfheal_truncate_sinks(call_frame_t *frame, xlator_t *this, fd_t *fd,
*/
healed_sinks[i] = 0;
- if (arbiter_sink_status)
- healed_sinks[ARBITER_BRICK_INDEX] = arbiter_sink_status;
return 0;
}
@@ -538,9 +556,11 @@ __afr_selfheal_data_finalize_source(
replies, AFR_DATA_TRANSACTION);
if (source < 0) {
gf_event(EVENT_AFR_SPLIT_BRAIN,
+ "client-pid=%d;"
"subvol=%s;type=data;"
"file=%s",
- this->name, uuid_utoa(inode->gfid));
+ this->ctx->cmd_args.client_pid, this->name,
+ uuid_utoa(inode->gfid));
return -EIO;
}
@@ -693,19 +713,18 @@ __afr_selfheal_data(call_frame_t *frame, xlator_t *this, fd_t *fd,
goto unlock;
}
- if (priv->arbiter_count &&
- AFR_COUNT(healed_sinks, priv->child_count) == 1 &&
- healed_sinks[ARBITER_BRICK_INDEX]) {
- is_arbiter_the_only_sink = _gf_true;
- goto restore_time;
- }
-
ret = __afr_selfheal_truncate_sinks(
frame, this, fd, healed_sinks,
locked_replies[source].poststat.ia_size);
if (ret < 0)
goto unlock;
+ if (priv->arbiter_count &&
+ AFR_COUNT(healed_sinks, priv->child_count) == 1 &&
+ healed_sinks[ARBITER_BRICK_INDEX]) {
+ is_arbiter_the_only_sink = _gf_true;
+ goto restore_time;
+ }
ret = 0;
}
unlock:
diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c
index b23ed6a1afa..64893f441e3 100644
--- a/xlators/cluster/afr/src/afr-self-heal-entry.c
+++ b/xlators/cluster/afr/src/afr-self-heal-entry.c
@@ -16,54 +16,170 @@
#include <glusterfs/syncop-utils.h>
#include <glusterfs/events.h>
-static int
-afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name,
- inode_t *inode, int child, struct afr_reply *replies)
+int
+afr_selfheal_entry_anon_inode(xlator_t *this, inode_t *dir, const char *name,
+ inode_t *inode, int child,
+ struct afr_reply *replies,
+ gf_boolean_t *anon_inode)
{
afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
xlator_t *subvol = NULL;
int ret = 0;
+ int i = 0;
+ char g[64] = {0};
+ unsigned char *lookup_success = NULL;
+ call_frame_t *frame = NULL;
+ loc_t loc2 = {
+ 0,
+ };
loc_t loc = {
0,
};
- char g[64];
priv = this->private;
-
subvol = priv->children[child];
+ lookup_success = alloca0(priv->child_count);
+ uuid_utoa_r(replies[child].poststat.ia_gfid, g);
+ loc.inode = inode_new(inode->table);
+ if (!loc.inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (replies[child].poststat.ia_type == IA_IFDIR) {
+ /* This directory may have a sub-directory hierarchy which may need to
+ * be preserved for subsequent heals. So unconditionally move the
+ * directory to the anonymous-inode directory. */
+ *anon_inode = _gf_true;
+ goto anon_inode;
+ }
+
+ frame = afr_frame_create(this, &ret);
+ if (!frame) {
+ ret = -ret;
+ goto out;
+ }
+ local = frame->local;
+ gf_uuid_copy(loc.gfid, replies[child].poststat.ia_gfid);
+ AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup, &loc,
+ NULL);
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].op_ret == 0) {
+ lookup_success[i] = 1;
+ } else if (local->replies[i].op_errno != ENOENT &&
+ local->replies[i].op_errno != ESTALE) {
+ ret = -local->replies[i].op_errno;
+ }
+ }
+
+ if (priv->quorum_count) {
+ if (afr_has_quorum(lookup_success, this, NULL)) {
+ *anon_inode = _gf_true;
+ }
+ } else if (AFR_COUNT(lookup_success, priv->child_count) > 1) {
+ *anon_inode = _gf_true;
+ } else if (ret) {
+ goto out;
+ }
+
+anon_inode:
+ if (!*anon_inode) {
+ ret = 0;
+ goto out;
+ }
loc.parent = inode_ref(dir);
gf_uuid_copy(loc.pargfid, dir->gfid);
loc.name = name;
- loc.inode = inode_ref(inode);
- if (replies[child].valid && replies[child].op_ret == 0) {
- switch (replies[child].poststat.ia_type) {
- case IA_IFDIR:
- gf_msg(this->name, GF_LOG_WARNING, 0,
- AFR_MSG_EXPUNGING_FILE_OR_DIR,
- "expunging dir %s/%s (%s) on %s", uuid_utoa(dir->gfid),
- name, uuid_utoa_r(replies[child].poststat.ia_gfid, g),
- subvol->name);
- ret = syncop_rmdir(subvol, &loc, 1, NULL, NULL);
- break;
- default:
- gf_msg(this->name, GF_LOG_WARNING, 0,
- AFR_MSG_EXPUNGING_FILE_OR_DIR,
- "expunging file %s/%s (%s) on %s", uuid_utoa(dir->gfid),
- name, uuid_utoa_r(replies[child].poststat.ia_gfid, g),
- subvol->name);
- ret = syncop_unlink(subvol, &loc, NULL, NULL);
- break;
- }
+ ret = afr_anon_inode_create(this, child, &loc2.parent);
+ if (ret < 0)
+ goto out;
+
+ loc2.name = g;
+ ret = syncop_rename(subvol, &loc, &loc2, NULL, NULL);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_EXPUNGING_FILE_OR_DIR,
+ "Rename to %s dir %s/%s (%s) on %s failed",
+ priv->anon_inode_name, uuid_utoa(dir->gfid), name, g,
+ subvol->name);
+ } else {
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR,
+ "Rename to %s dir %s/%s (%s) on %s successful",
+ priv->anon_inode_name, uuid_utoa(dir->gfid), name, g,
+ subvol->name);
}
+out:
loc_wipe(&loc);
+ loc_wipe(&loc2);
+ if (frame) {
+ AFR_STACK_DESTROY(frame);
+ }
return ret;
}
int
+afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name,
+ inode_t *inode, int child, struct afr_reply *replies)
+{
+ char g[64] = {0};
+ afr_private_t *priv = NULL;
+ xlator_t *subvol = NULL;
+ int ret = 0;
+ loc_t loc = {
+ 0,
+ };
+ gf_boolean_t anon_inode = _gf_false;
+
+ priv = this->private;
+ subvol = priv->children[child];
+
+ if ((!replies[child].valid) || (replies[child].op_ret < 0)) {
+ /*Nothing to do*/
+ ret = 0;
+ goto out;
+ }
+
+ if (priv->use_anon_inode) {
+ ret = afr_selfheal_entry_anon_inode(this, dir, name, inode, child,
+ replies, &anon_inode);
+ if (ret < 0 || anon_inode)
+ goto out;
+ }
+
+ loc.parent = inode_ref(dir);
+ loc.inode = inode_new(inode->table);
+ if (!loc.inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ loc.name = name;
+ switch (replies[child].poststat.ia_type) {
+ case IA_IFDIR:
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR,
+ "expunging dir %s/%s (%s) on %s", uuid_utoa(dir->gfid), name,
+ uuid_utoa_r(replies[child].poststat.ia_gfid, g),
+ subvol->name);
+ ret = syncop_rmdir(subvol, &loc, 1, NULL, NULL);
+ break;
+ default:
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR,
+ "expunging file %s/%s (%s) on %s", uuid_utoa(dir->gfid),
+ name, uuid_utoa_r(replies[child].poststat.ia_gfid, g),
+ subvol->name);
+ ret = syncop_unlink(subvol, &loc, NULL, NULL);
+ break;
+ }
+
+out:
+ loc_wipe(&loc);
+ return ret;
+}
+
+int
afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source,
unsigned char *sources, inode_t *dir,
const char *name, inode_t *inode,
@@ -76,6 +192,9 @@ afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source,
loc_t srcloc = {
0,
};
+ loc_t anonloc = {
+ 0,
+ };
xlator_t *this = frame->this;
afr_private_t *priv = NULL;
dict_t *xdata = NULL;
@@ -86,15 +205,18 @@ afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source,
0,
};
unsigned char *newentry = NULL;
+ char iatt_uuid_str[64] = {0};
+ char dir_uuid_str[64] = {0};
priv = this->private;
iatt = &replies[source].poststat;
+ uuid_utoa_r(iatt->ia_gfid, iatt_uuid_str);
if (iatt->ia_type == IA_INVAL || gf_uuid_is_null(iatt->ia_gfid)) {
gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SELF_HEAL_FAILED,
"Invalid ia_type (%d) or gfid(%s). source brick=%d, "
"pargfid=%s, name=%s",
- iatt->ia_type, uuid_utoa(iatt->ia_gfid), source,
- uuid_utoa(dir->gfid), name);
+ iatt->ia_type, iatt_uuid_str, source,
+ uuid_utoa_r(dir->gfid, dir_uuid_str), name);
ret = -EINVAL;
goto out;
}
@@ -119,14 +241,24 @@ afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source,
srcloc.inode = inode_ref(inode);
gf_uuid_copy(srcloc.gfid, iatt->ia_gfid);
- if (iatt->ia_type != IA_IFDIR)
- ret = syncop_lookup(priv->children[dst], &srcloc, 0, 0, 0, 0);
- if (iatt->ia_type == IA_IFDIR || ret == -ENOENT || ret == -ESTALE) {
+ ret = syncop_lookup(priv->children[dst], &srcloc, 0, 0, 0, 0);
+ if (ret == -ENOENT || ret == -ESTALE) {
newentry[dst] = 1;
ret = afr_selfheal_newentry_mark(frame, this, inode, source, replies,
sources, newentry);
if (ret)
goto out;
+ } else if (ret == 0 && iatt->ia_type == IA_IFDIR && priv->use_anon_inode) {
+ // Try rename from hidden directory
+ ret = afr_anon_inode_create(this, dst, &anonloc.parent);
+ if (ret < 0)
+ goto out;
+ anonloc.inode = inode_ref(inode);
+ anonloc.name = iatt_uuid_str;
+ ret = syncop_rename(priv->children[dst], &anonloc, &loc, NULL, NULL);
+ if (ret == -ENOENT || ret == -ESTALE)
+ ret = -1; /*This sets 'mismatch' to true*/
+ goto out;
}
mode = st_mode_from_ia(iatt->ia_prot, iatt->ia_type);
@@ -165,6 +297,7 @@ out:
GF_FREE(linkname);
loc_wipe(&loc);
loc_wipe(&srcloc);
+ loc_wipe(&anonloc);
return ret;
}
@@ -246,6 +379,19 @@ afr_selfheal_detect_gfid_and_type_mismatch(xlator_t *this,
if (replies[i].op_ret != 0)
continue;
+ if (gf_uuid_is_null(replies[i].poststat.ia_gfid))
+ continue;
+
+ if (replies[i].poststat.ia_type == IA_INVAL)
+ continue;
+
+ if (ia_type == IA_INVAL || gf_uuid_is_null(gfid)) {
+ src_idx = i;
+ ia_type = replies[src_idx].poststat.ia_type;
+ gfid = &replies[src_idx].poststat.ia_gfid;
+ continue;
+ }
+
if (gf_uuid_compare(gfid, replies[i].poststat.ia_gfid) &&
(ia_type == replies[i].poststat.ia_type)) {
ret = afr_gfid_split_brain_source(this, replies, inode, pargfid,
@@ -269,11 +415,12 @@ afr_selfheal_detect_gfid_and_type_mismatch(xlator_t *this,
gf_inode_type_to_str(replies[src_idx].poststat.ia_type),
priv->children[src_idx]->name);
gf_event(EVENT_AFR_SPLIT_BRAIN,
+ "client-pid=%d;"
"subvol=%s;type=file;"
"file=<gfid:%s>/%s>;count=2;child-%d=%s;type-"
"%d=%s;child-%d=%s;type-%d=%s",
- this->name, uuid_utoa(pargfid), bname, i,
- priv->children[i]->name, i,
+ this->ctx->cmd_args.client_pid, this->name,
+ uuid_utoa(pargfid), bname, i, priv->children[i]->name, i,
gf_inode_type_to_str(replies[i].poststat.ia_type), src_idx,
priv->children[src_idx]->name, src_idx,
gf_inode_type_to_str(replies[src_idx].poststat.ia_type));
@@ -465,6 +612,7 @@ __afr_selfheal_entry_finalize_source(xlator_t *this, unsigned char *sources,
afr_private_t *priv = NULL;
int source = -1;
int sources_count = 0;
+ int i = 0;
priv = this->private;
@@ -478,6 +626,20 @@ __afr_selfheal_entry_finalize_source(xlator_t *this, unsigned char *sources,
}
source = afr_choose_source_by_policy(priv, sources, AFR_ENTRY_TRANSACTION);
+
+ /*If the selected source does not blame any other brick, then mark
+ * everything as a sink to trigger a conservative merge.
+ */
+ if (source != -1 && !AFR_COUNT(healed_sinks, priv->child_count)) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (locked_on[i]) {
+ sources[i] = 0;
+ healed_sinks[i] = 1;
+ }
+ }
+ return -1;
+ }
+
return source;
}
@@ -548,6 +710,11 @@ afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
priv = this->private;
+ if (afr_is_private_directory(priv, fd->inode->gfid, name,
+ GF_CLIENT_PID_SELF_HEALD)) {
+ return 0;
+ }
+
xattr = dict_new();
if (!xattr)
return -ENOMEM;
@@ -568,7 +735,7 @@ afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL,
locked_on);
{
- if (ret < AFR_SH_MIN_PARTICIPANTS) {
+ if (ret < priv->child_count) {
gf_msg_debug(this->name, 0,
"%s: Skipping "
"entry self-heal as only %d sub-volumes "
@@ -596,7 +763,7 @@ afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
replies);
if ((ret == 0) && (priv->esh_granular) && parent_idx_inode) {
- ret = afr_shd_index_purge(subvol, parent_idx_inode, name,
+ ret = afr_shd_entry_purge(subvol, parent_idx_inode, name,
inode->ia_type);
/* Why is ret force-set to 0? We do not care about
* index purge failing for full heal as it is quite
@@ -726,10 +893,6 @@ afr_selfheal_entry_do_subvol(call_frame_t *frame, xlator_t *this, fd_t *fd,
if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, ".."))
continue;
- if (__is_root_gfid(fd->inode->gfid) &&
- !strcmp(entry->d_name, GF_REPLICATE_TRASH_DIR))
- continue;
-
ret = afr_selfheal_entry_dirent(iter_frame, this, fd, entry->d_name,
loc.inode, subvol,
local->need_full_crawl);
@@ -792,7 +955,7 @@ afr_selfheal_entry_granular_dirent(xlator_t *subvol, gf_dirent_t *entry,
/* The name indices under the pgfid index dir are guaranteed
* to be regular files. Hence the hardcoding.
*/
- afr_shd_index_purge(subvol, parent->inode, entry->d_name, IA_IFREG);
+ afr_shd_entry_purge(subvol, parent->inode, entry->d_name, IA_IFREG);
ret = 0;
goto out;
}
@@ -831,6 +994,8 @@ afr_selfheal_entry_granular(call_frame_t *frame, xlator_t *this, fd_t *fd,
subvol = priv->children[subvol_idx];
args.frame = afr_copy_frame(frame);
+ if (!args.frame)
+ goto out;
args.xl = this;
/* args.heal_fd represents the fd associated with the original directory
* on which entry heal is being attempted.
@@ -849,9 +1014,10 @@ afr_selfheal_entry_granular(call_frame_t *frame, xlator_t *this, fd_t *fd,
* do not treat heal as failure.
*/
if (is_src)
- return -errno;
+ ret = -errno;
else
- return 0;
+ ret = 0;
+ goto out;
}
ret = syncop_dir_scan(subvol, &loc, GF_CLIENT_PID_SELF_HEALD, &args,
@@ -861,7 +1027,9 @@ afr_selfheal_entry_granular(call_frame_t *frame, xlator_t *this, fd_t *fd,
if (args.mismatch == _gf_true)
ret = -1;
-
+out:
+ if (args.frame)
+ AFR_STACK_DESTROY(args.frame);
return ret;
}
@@ -957,7 +1125,7 @@ __afr_selfheal_entry(call_frame_t *frame, xlator_t *this, fd_t *fd,
ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL,
data_lock);
{
- if (ret < AFR_SH_MIN_PARTICIPANTS) {
+ if (ret < priv->child_count) {
gf_msg_debug(this->name, 0,
"%s: Skipping "
"entry self-heal as only %d sub-volumes could "
@@ -1013,6 +1181,8 @@ unlock:
goto postop_unlock;
}
+ afr_selfheal_restore_time(frame, this, fd->inode, source, healed_sinks,
+ locked_replies);
ret = afr_selfheal_undo_pending(
frame, this, fd->inode, sources, sinks, healed_sinks, undid_pending,
AFR_ENTRY_TRANSACTION, locked_replies, postop_lock);
@@ -1079,7 +1249,7 @@ afr_selfheal_entry(call_frame_t *frame, xlator_t *this, inode_t *inode)
ret = afr_selfheal_tie_breaker_entrylk(frame, this, inode, priv->sh_domain,
NULL, locked_on);
{
- if (ret < AFR_SH_MIN_PARTICIPANTS) {
+ if (ret < priv->child_count) {
gf_msg_debug(this->name, 0,
"%s: Skipping "
"entry self-heal as only %d sub-volumes could "
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
index a661fcb5acc..03f43bad16e 100644
--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -190,6 +190,59 @@ out:
return ret;
}
+static int
+__afr_selfheal_metadata_mark_pending_xattrs(call_frame_t *frame, xlator_t *this,
+ inode_t *inode,
+ struct afr_reply *replies,
+ unsigned char *sources)
+{
+ int ret = 0;
+ int i = 0;
+ int m_idx = 0;
+ afr_private_t *priv = NULL;
+ int raw[AFR_NUM_CHANGE_LOGS] = {0};
+ dict_t *xattr = NULL;
+
+ priv = this->private;
+ m_idx = afr_index_for_transaction_type(AFR_METADATA_TRANSACTION);
+ raw[m_idx] = 1;
+
+ xattr = dict_new();
+ if (!xattr)
+ return -ENOMEM;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i])
+ continue;
+ ret = dict_set_static_bin(xattr, priv->pending_key[i], raw,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret) {
+ ret = -1;
+ goto out;
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ ret = afr_selfheal_post_op(frame, this, inode, i, xattr, NULL);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_INFO, -ret, AFR_MSG_SELF_HEAL_INFO,
+ "Failed to set pending metadata xattr on child %d for %s", i,
+ uuid_utoa(inode->gfid));
+ goto out;
+ }
+ }
+
+ afr_replies_wipe(replies, priv->child_count);
+ ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies);
+
+out:
+ if (xattr)
+ dict_unref(xattr);
+ return ret;
+}
+
/*
* Look for mismatching uid/gid or mode or user xattrs even if
* AFR xattrs don't say so, and pick one arbitrarily as winner. */
@@ -210,6 +263,7 @@ __afr_selfheal_metadata_finalize_source(call_frame_t *frame, xlator_t *this,
};
int source = -1;
int sources_count = 0;
+ int ret = 0;
priv = this->private;
@@ -242,9 +296,11 @@ __afr_selfheal_metadata_finalize_source(call_frame_t *frame, xlator_t *this,
if (!priv->metadata_splitbrain_forced_heal) {
gf_event(EVENT_AFR_SPLIT_BRAIN,
+ "client-pid=%d;"
"subvol=%s;"
"type=metadata;file=%s",
- this->name, uuid_utoa(inode->gfid));
+ this->ctx->cmd_args.client_pid, this->name,
+ uuid_utoa(inode->gfid));
return -EIO;
}
@@ -298,7 +354,13 @@ __afr_selfheal_metadata_finalize_source(call_frame_t *frame, xlator_t *this,
healed_sinks[i] = 1;
}
}
-
+ if ((sources_count == priv->child_count) && (source > -1) &&
+ (AFR_COUNT(healed_sinks, priv->child_count) != 0)) {
+ ret = __afr_selfheal_metadata_mark_pending_xattrs(frame, this, inode,
+ replies, sources);
+ if (ret < 0)
+ return ret;
+ }
out:
afr_mark_active_sinks(this, sources, locked_on, healed_sinks);
return source;
@@ -396,7 +458,7 @@ afr_selfheal_metadata(call_frame_t *frame, xlator_t *this, inode_t *inode)
ret = afr_selfheal_inodelk(frame, this, inode, this->name, LLONG_MAX - 1, 0,
data_lock);
{
- if (ret < AFR_SH_MIN_PARTICIPANTS) {
+ if (ret < priv->child_count) {
ret = -ENOTCONN;
goto unlock;
}
@@ -419,12 +481,8 @@ afr_selfheal_metadata(call_frame_t *frame, xlator_t *this, inode_t *inode)
if (ret)
goto unlock;
- /* Restore atime/mtime for files that don't need data heal as
- * restoring timestamps happens only as a part of data-heal.
- */
- if (!IA_ISREG(locked_replies[source].poststat.ia_type))
- afr_selfheal_restore_time(frame, this, inode, source, healed_sinks,
- locked_replies);
+ afr_selfheal_restore_time(frame, this, inode, source, healed_sinks,
+ locked_replies);
ret = afr_selfheal_undo_pending(
frame, this, inode, sources, sinks, healed_sinks, undid_pending,
diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c
index c4df5d4609d..834aac86d48 100644
--- a/xlators/cluster/afr/src/afr-self-heal-name.c
+++ b/xlators/cluster/afr/src/afr-self-heal-name.c
@@ -98,21 +98,12 @@ __afr_selfheal_name_expunge(xlator_t *this, inode_t *parent, uuid_t pargfid,
const char *bname, inode_t *inode,
struct afr_reply *replies)
{
- loc_t loc = {
- 0,
- };
int i = 0;
afr_private_t *priv = NULL;
- char g[64];
int ret = 0;
priv = this->private;
- loc.parent = inode_ref(parent);
- gf_uuid_copy(loc.pargfid, pargfid);
- loc.name = bname;
- loc.inode = inode_ref(inode);
-
for (i = 0; i < priv->child_count; i++) {
if (!replies[i].valid)
continue;
@@ -120,30 +111,10 @@ __afr_selfheal_name_expunge(xlator_t *this, inode_t *parent, uuid_t pargfid,
if (replies[i].op_ret)
continue;
- switch (replies[i].poststat.ia_type) {
- case IA_IFDIR:
- gf_msg(this->name, GF_LOG_WARNING, 0,
- AFR_MSG_EXPUNGING_FILE_OR_DIR,
- "expunging dir %s/%s (%s) on %s", uuid_utoa(pargfid),
- bname, uuid_utoa_r(replies[i].poststat.ia_gfid, g),
- priv->children[i]->name);
-
- ret |= syncop_rmdir(priv->children[i], &loc, 1, NULL, NULL);
- break;
- default:
- gf_msg(this->name, GF_LOG_WARNING, 0,
- AFR_MSG_EXPUNGING_FILE_OR_DIR,
- "expunging file %s/%s (%s) on %s", uuid_utoa(pargfid),
- bname, uuid_utoa_r(replies[i].poststat.ia_gfid, g),
- priv->children[i]->name);
-
- ret |= syncop_unlink(priv->children[i], &loc, NULL, NULL);
- break;
- }
+ ret |= afr_selfheal_entry_delete(this, parent, bname, inode, i,
+ replies);
}
- loc_wipe(&loc);
-
return ret;
}
@@ -222,13 +193,14 @@ afr_selfheal_name_type_mismatch_check(xlator_t *this, struct afr_reply *replies,
gf_inode_type_to_str(inode_type),
priv->children[type_idx]->name);
gf_event(EVENT_AFR_SPLIT_BRAIN,
+ "client-pid=%d;"
"subvol=%s;type=file;"
"file=<gfid:%s>/%s;count=2;"
"child-%d=%s;type-%d=%s;child-%d=%s;"
"type-%d=%s",
- this->name, uuid_utoa(pargfid), bname, i,
- priv->children[i]->name, i,
- gf_inode_type_to_str(inode_type1), type_idx,
+ this->ctx->cmd_args.client_pid, this->name,
+ uuid_utoa(pargfid), bname, i, priv->children[i]->name,
+ i, gf_inode_type_to_str(inode_type1), type_idx,
priv->children[type_idx]->name, type_idx,
gf_inode_type_to_str(inode_type));
return -EIO;
@@ -380,7 +352,7 @@ __afr_selfheal_name_do(call_frame_t *frame, xlator_t *this, inode_t *parent,
ret = __afr_selfheal_assign_gfid(this, parent, pargfid, bname, inode,
replies, gfid, locked_on, source, sources,
is_gfid_absent, &gfid_idx);
- if (ret)
+ if (ret || (gfid_idx < 0))
return ret;
ret = __afr_selfheal_name_impunge(frame, this, parent, pargfid, bname,
@@ -513,7 +485,7 @@ afr_selfheal_name_do(call_frame_t *frame, xlator_t *this, inode_t *parent,
ret = afr_selfheal_entrylk(frame, this, parent, this->name, bname,
locked_on);
{
- if (ret < AFR_SH_MIN_PARTICIPANTS) {
+ if (ret < priv->child_count) {
ret = -ENOTCONN;
goto unlock;
}
@@ -559,13 +531,15 @@ afr_selfheal_name_unlocked_inspect(call_frame_t *frame, xlator_t *this,
struct afr_reply *replies = NULL;
inode_t *inode = NULL;
int first_idx = -1;
+ afr_local_t *local = NULL;
priv = this->private;
+ local = frame->local;
replies = alloca0(sizeof(*replies) * priv->child_count);
inode = afr_selfheal_unlocked_lookup_on(frame, parent, bname, replies,
- priv->child_up, NULL);
+ local->child_up, NULL);
if (!inode)
return -ENOMEM;
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index 6555ec55771..48e6dbcfb18 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -11,8 +11,6 @@
#ifndef _AFR_SELFHEAL_H
#define _AFR_SELFHEAL_H
-#define AFR_SH_MIN_PARTICIPANTS 2
-
/* Perform fop on all UP subvolumes and wait for all callbacks to return */
#define AFR_ONALL(frame, rfn, fop, args...) \
@@ -20,9 +18,8 @@
afr_local_t *__local = frame->local; \
afr_private_t *__priv = frame->this->private; \
int __i = 0, __count = 0; \
- unsigned char *__child_up = NULL; \
+ unsigned char *__child_up = alloca(__priv->child_count); \
\
- __child_up = alloca0(__priv->child_count); \
memcpy(__child_up, __priv->child_up, \
sizeof(*__child_up) * __priv->child_count); \
__count = AFR_COUNT(__child_up, __priv->child_count); \
@@ -48,13 +45,16 @@
afr_local_t *__local = frame->local; \
afr_private_t *__priv = frame->this->private; \
int __i = 0; \
- int __count = AFR_COUNT(list, __priv->child_count); \
+ int __count = 0; \
+ unsigned char *__list = alloca(__priv->child_count); \
\
+ memcpy(__list, list, sizeof(*__list) * __priv->child_count); \
+ __count = AFR_COUNT(__list, __priv->child_count); \
__local->barrier.waitfor = __count; \
afr_local_replies_wipe(__local, __priv); \
\
for (__i = 0; __i < __priv->child_count; __i++) { \
- if (!list[__i]) \
+ if (!__list[__i]) \
continue; \
STACK_WIND_COOKIE(frame, rfn, (void *)(long)__i, \
__priv->children[__i], \
@@ -83,9 +83,9 @@
#define ALLOC_MATRIX(n, type) \
({ \
- type **__ptr = NULL; \
int __i; \
- __ptr = alloca0(n * sizeof(type *)); \
+ type **__ptr = alloca(n * sizeof(type *)); \
+ \
for (__i = 0; __i < n; __i++) \
__ptr[__i] = alloca0(n * sizeof(type)); \
__ptr; \
@@ -187,7 +187,7 @@ afr_selfheal_unlocked_discover(call_frame_t *frame, inode_t *inode, uuid_t gfid,
int
afr_selfheal_unlocked_discover_on(call_frame_t *frame, inode_t *inode,
uuid_t gfid, struct afr_reply *replies,
- unsigned char *discover_on);
+ unsigned char *discover_on, dict_t *dict);
inode_t *
afr_selfheal_unlocked_lookup_on(call_frame_t *frame, inode_t *parent,
const char *name, struct afr_reply *replies,
@@ -326,7 +326,8 @@ int
afr_selfheal_unlocked_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid,
inode_t **link_inode, gf_boolean_t *data_selfheal,
gf_boolean_t *metadata_selfheal,
- gf_boolean_t *entry_selfheal);
+ gf_boolean_t *entry_selfheal,
+ struct afr_reply *replies);
int
afr_selfheal_do(call_frame_t *frame, xlator_t *this, uuid_t gfid);
@@ -368,4 +369,9 @@ gf_boolean_t
afr_is_file_empty_on_all_children(afr_private_t *priv,
struct afr_reply *replies);
+int
+afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name,
+ inode_t *inode, int child, struct afr_reply *replies);
+int
+afr_anon_inode_create(xlator_t *this, int child, inode_t **linked_inode);
#endif /* !_AFR_SELFHEAL_H */
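/* The reworked AFR_ONALL/AFR_ONLIST macros above snapshot the subvolume list
 * into a stack buffer before counting and winding, so the barrier's wait count
 * always matches the set of children actually wound to, even if the caller's
 * array (e.g. child_up) changes while replies arrive. Below is a minimal
 * sketch of the same pattern in plain C, with printf standing in for
 * STACK_WIND_COOKIE(); names and values are illustrative only. */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

static void
wind_on_list(const unsigned char *list, int child_count)
{
    unsigned char *snap = malloc(child_count); /* the macros use alloca() */
    int count = 0;
    int i;

    if (!snap)
        return;
    memcpy(snap, list, child_count);
    for (i = 0; i < child_count; i++)
        count += snap[i];

    printf("barrier.waitfor = %d\n", count);
    for (i = 0; i < child_count; i++) {
        if (!snap[i])
            continue;
        printf("wind to child %d\n", i); /* STACK_WIND_COOKIE(...) in AFR */
    }
    free(snap);
}

int
main(void)
{
    unsigned char child_up[3] = {1, 0, 1};

    wind_on_list(child_up, 3);
    return 0;
}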
diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c
index 7eb12070a39..109fd4b7421 100644
--- a/xlators/cluster/afr/src/afr-self-heald.c
+++ b/xlators/cluster/afr/src/afr-self-heald.c
@@ -94,7 +94,7 @@ __afr_shd_healer_wait(struct subvol_healer *healer)
priv = healer->this->private;
disabled_loop:
- wait_till.tv_sec = time(NULL) + priv->shd.timeout;
+ wait_till.tv_sec = gf_time() + priv->shd.timeout;
while (!healer->rerun) {
ret = pthread_cond_timedwait(&healer->cond, &healer->mutex, &wait_till);
@@ -222,7 +222,7 @@ out:
}
int
-afr_shd_index_purge(xlator_t *subvol, inode_t *inode, char *name,
+afr_shd_entry_purge(xlator_t *subvol, inode_t *inode, char *name,
ia_type_t type)
{
int ret = 0;
@@ -371,8 +371,9 @@ afr_shd_sweep_prepare(struct subvol_healer *healer)
event->split_brain_count = 0;
event->heal_failed_count = 0;
- time(&event->start_time);
+ event->start_time = gf_time();
event->end_time = 0;
+ _mask_cancellation();
}
void
@@ -385,8 +386,8 @@ afr_shd_sweep_done(struct subvol_healer *healer)
event = &healer->crawl_event;
shd = &(((afr_private_t *)healer->this->private)->shd);
- time(&event->end_time);
- history = memdup(event, sizeof(*event));
+ event->end_time = gf_time();
+ history = gf_memdup(event, sizeof(*event));
event->start_time = 0;
if (!history)
@@ -394,6 +395,7 @@ afr_shd_sweep_done(struct subvol_healer *healer)
if (eh_save_history(shd->statistics[healer->subvol], history) < 0)
GF_FREE(history);
+ _unmask_cancellation();
}
int
@@ -422,7 +424,7 @@ afr_shd_index_heal(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
ret = afr_shd_selfheal(healer, healer->subvol, gfid);
if (ret == -ENOENT || ret == -ESTALE)
- afr_shd_index_purge(subvol, parent->inode, entry->d_name, val);
+ afr_shd_entry_purge(subvol, parent->inode, entry->d_name, val);
if (ret == 2)
/* If bricks crashed in pre-op after creating indices/xattrop
@@ -522,6 +524,11 @@ afr_shd_full_heal(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
afr_private_t *priv = NULL;
priv = this->private;
+
+ if (this->cleanup_starting) {
+ return -ENOTCONN;
+ }
+
if (!priv->shd.enabled)
return -EBUSY;
@@ -590,7 +597,9 @@ _afr_shd_ta_get_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata)
{
afr_private_t *priv = NULL;
dict_t *xattr = NULL;
- int *raw = NULL;
+ int raw[AFR_NUM_CHANGE_LOGS] = {
+ 0,
+ };
int ret = -1;
int i = 0;
@@ -602,18 +611,11 @@ _afr_shd_ta_get_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata)
"Failed to create dict.");
goto out;
}
-
for (i = 0; i < priv->child_count; i++) {
- raw = GF_CALLOC(AFR_NUM_CHANGE_LOGS, sizeof(int), gf_afr_mt_int32_t);
- if (!raw)
- goto out;
-
- ret = dict_set_bin(xattr, priv->pending_key[i], raw,
- AFR_NUM_CHANGE_LOGS * sizeof(int));
- if (ret) {
- GF_FREE(raw);
+ ret = dict_set_static_bin(xattr, priv->pending_key[i], &raw,
+ AFR_NUM_CHANGE_LOGS * sizeof(int));
+ if (ret)
goto out;
- }
}
ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], loc,
@@ -640,6 +642,7 @@ afr_shd_ta_get_xattrs(xlator_t *this, loc_t *loc, struct subvol_healer *healer,
if (afr_shd_fill_ta_loc(this, loc)) {
gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
"Failed to populate thin-arbiter loc for: %s.", loc->name);
+ ret = -1;
goto out;
}
@@ -798,6 +801,218 @@ afr_bricks_available_for_heal(afr_private_t *priv)
return _gf_true;
}
+static gf_boolean_t
+afr_shd_ta_needs_heal(xlator_t *this, struct subvol_healer *healer)
+{
+ dict_t *xdata = NULL;
+ afr_private_t *priv = NULL;
+ loc_t loc = {
+ 0,
+ };
+ int ret = -1;
+ int i = 0;
+ gf_boolean_t need_heal = _gf_false;
+
+ priv = this->private;
+
+ ret = afr_shd_fill_ta_loc(this, &loc);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to populate thin-arbiter loc for: %s.", loc.name);
+ healer->rerun = 1;
+ goto out;
+ }
+
+ if (_afr_shd_ta_get_xattrs(this, &loc, &xdata)) {
+ healer->rerun = 1;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (afr_ta_dict_contains_pending_xattr(xdata, priv, i)) {
+ need_heal = _gf_true;
+ break;
+ }
+ }
+
+out:
+ if (xdata)
+ dict_unref(xdata);
+ loc_wipe(&loc);
+
+ return need_heal;
+}
+
+static int
+afr_shd_anon_inode_cleaner(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
+ void *data)
+{
+ struct subvol_healer *healer = data;
+ afr_private_t *priv = healer->this->private;
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ int ret = 0;
+ loc_t loc = {0};
+ int count = 0;
+ int i = 0;
+ int op_errno = 0;
+ struct iatt *iatt = NULL;
+ gf_boolean_t multiple_links = _gf_false;
+ unsigned char *gfid_present = alloca0(priv->child_count);
+ unsigned char *entry_present = alloca0(priv->child_count);
+ char *type = "file";
+
+ frame = afr_frame_create(healer->this, &ret);
+ if (!frame) {
+ ret = -ret;
+ goto out;
+ }
+ local = frame->local;
+ if (AFR_COUNT(local->child_up, priv->child_count) != priv->child_count) {
+ gf_msg_debug(healer->this->name, 0,
+ "Not all bricks are up. Skipping "
+ "cleanup of %s on %s",
+ entry->d_name, subvol->name);
+ ret = 0;
+ goto out;
+ }
+
+ loc.inode = inode_new(parent->inode->table);
+ if (!loc.inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = gf_uuid_parse(entry->d_name, loc.gfid);
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+ AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup, &loc,
+ NULL);
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].op_ret == 0) {
+ count++;
+ gfid_present[i] = 1;
+ iatt = &local->replies[i].poststat;
+ if (iatt->ia_type == IA_IFDIR) {
+ type = "dir";
+ }
+
+ if (i == healer->subvol) {
+ if (local->replies[i].poststat.ia_nlink > 1) {
+ multiple_links = _gf_true;
+ }
+ }
+ } else if (local->replies[i].op_errno != ENOENT &&
+ local->replies[i].op_errno != ESTALE) {
+ /*We don't have a complete view. Skip the entry*/
+ gf_msg_debug(healer->this->name, local->replies[i].op_errno,
+ "Skipping cleanup of %s on %s", entry->d_name,
+ subvol->name);
+ ret = 0;
+ goto out;
+ }
+ }
+
+ /*Inode is deleted from subvol*/
+ if (count == 1 || (iatt->ia_type != IA_IFDIR && multiple_links)) {
+ gf_msg(healer->this->name, GF_LOG_WARNING, 0,
+ AFR_MSG_EXPUNGING_FILE_OR_DIR, "expunging %s %s/%s on %s", type,
+ priv->anon_inode_name, entry->d_name, subvol->name);
+ ret = afr_shd_entry_purge(subvol, parent->inode, entry->d_name,
+ iatt->ia_type);
+ if (ret == -ENOENT || ret == -ESTALE)
+ ret = 0;
+ } else if (count > 1) {
+ loc_wipe(&loc);
+ loc.parent = inode_ref(parent->inode);
+ loc.name = entry->d_name;
+ loc.inode = inode_new(parent->inode->table);
+ if (!loc.inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup,
+ &loc, NULL);
+ count = 0;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].op_ret == 0) {
+ count++;
+ entry_present[i] = 1;
+ iatt = &local->replies[i].poststat;
+ } else if (local->replies[i].op_errno != ENOENT &&
+ local->replies[i].op_errno != ESTALE) {
+ /*We don't have a complete view. Skip the entry*/
+ gf_msg_debug(healer->this->name, local->replies[i].op_errno,
+ "Skipping cleanup of %s on %s", entry->d_name,
+ subvol->name);
+ ret = 0;
+ goto out;
+ }
+ }
+ for (i = 0; i < priv->child_count; i++) {
+ if (gfid_present[i] && !entry_present[i]) {
+ /*Entry is not anonymous on at least one subvol*/
+ gf_msg_debug(healer->this->name, 0,
+ "Valid entry present on %s "
+ "Skipping cleanup of %s on %s",
+ priv->children[i]->name, entry->d_name,
+ subvol->name);
+ ret = 0;
+ goto out;
+ }
+ }
+
+ gf_msg(healer->this->name, GF_LOG_WARNING, 0,
+ AFR_MSG_EXPUNGING_FILE_OR_DIR,
+ "expunging %s %s/%s on all subvols", type, priv->anon_inode_name,
+ entry->d_name);
+ ret = 0;
+ for (i = 0; i < priv->child_count; i++) {
+ op_errno = -afr_shd_entry_purge(priv->children[i], loc.parent,
+ entry->d_name, iatt->ia_type);
+ if (op_errno != ENOENT && op_errno != ESTALE) {
+ ret |= -op_errno;
+ }
+ }
+ }
+
+out:
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ loc_wipe(&loc);
+ return ret;
+}
+
+static void
+afr_cleanup_anon_inode_dir(struct subvol_healer *healer)
+{
+ int ret = 0;
+ call_frame_t *frame = NULL;
+ afr_private_t *priv = healer->this->private;
+ loc_t loc = {0};
+
+ ret = afr_anon_inode_create(healer->this, healer->subvol, &loc.inode);
+ if (ret)
+ goto out;
+
+ frame = afr_frame_create(healer->this, &ret);
+ if (!frame) {
+ ret = -ret;
+ goto out;
+ }
+
+ ret = syncop_mt_dir_scan(frame, priv->children[healer->subvol], &loc,
+ GF_CLIENT_PID_SELF_HEALD, healer,
+ afr_shd_anon_inode_cleaner, NULL,
+ priv->shd.max_threads, priv->shd.wait_qlength);
+out:
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ loc_wipe(&loc);
+ return;
+}
+
void *
afr_shd_index_healer(void *data)
{
@@ -824,7 +1039,8 @@ afr_shd_index_healer(void *data)
priv->local[healer->subvol] = healer->local;
if (priv->thin_arbiter_count) {
- afr_shd_ta_get_xattrs(this, &loc, healer, &pre_crawl_xdata);
+ if (afr_shd_ta_needs_heal(this, healer))
+ afr_shd_ta_get_xattrs(this, &loc, healer, &pre_crawl_xdata);
}
do {
@@ -854,9 +1070,17 @@ afr_shd_index_healer(void *data)
sleep(1);
} while (ret > 0);
- if (pre_crawl_xdata && !healer->crawl_event.heal_failed_count) {
+ if (ret == 0) {
+ afr_cleanup_anon_inode_dir(healer);
+ }
+
+ if (ret == 0 && pre_crawl_xdata &&
+ !healer->crawl_event.heal_failed_count) {
afr_shd_ta_check_and_unset_xattrs(this, &loc, healer,
pre_crawl_xdata);
+ }
+
+ if (pre_crawl_xdata) {
dict_unref(pre_crawl_xdata);
pre_crawl_xdata = NULL;
}
@@ -974,7 +1198,7 @@ afr_shd_dict_add_crawl_event(xlator_t *this, dict_t *output,
{
int ret = 0;
uint64_t count = 0;
- char key[256] = {0};
+ char key[128] = {0};
int keylen = 0;
char suffix[64] = {0};
int xl_id = 0;
@@ -1099,9 +1323,9 @@ afr_shd_dict_add_path(xlator_t *this, dict_t *output, int child, char *path,
{
int ret = -1;
uint64_t count = 0;
- char key[256] = {0};
+ char key[64] = {0};
int keylen = 0;
- char xl_id_child_str[64] = {0};
+ char xl_id_child_str[32] = {0};
int xl_id = 0;
ret = dict_get_int32(output, this->name, &xl_id);
@@ -1258,12 +1482,18 @@ out:
return ret;
}
-int
-afr_selfheal_childup(xlator_t *this, int subvol)
+void
+afr_selfheal_childup(xlator_t *this, afr_private_t *priv)
{
- afr_shd_index_healer_spawn(this, subvol);
+ int subvol = 0;
- return 0;
+ if (!priv->shd.iamshd)
+ return;
+ for (subvol = 0; subvol < priv->child_count; subvol++)
+ if (priv->child_up[subvol])
+ afr_shd_index_healer_spawn(this, subvol);
+
+ return;
}
int
@@ -1318,19 +1548,40 @@ afr_xl_op(xlator_t *this, dict_t *input, dict_t *output)
int op_ret = 0;
uint64_t cnt = 0;
+#define AFR_SET_DICT_AND_LOG(name, output, key, keylen, dict_str, \
+ dict_str_len) \
+ { \
+ int ret; \
+ \
+ ret = dict_set_nstrn(output, key, keylen, dict_str, dict_str_len); \
+ if (ret) { \
+ gf_smsg(name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, \
+ "key=%s", key, "value=%s", dict_str, NULL); \
+ } \
+ }
+
priv = this->private;
shd = &priv->shd;
ret = dict_get_int32_sizen(input, "xl-op", (int32_t *)&op);
- if (ret)
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED,
+ "key=xl-op", NULL);
goto out;
+ }
this_name_len = strlen(this->name);
ret = dict_get_int32n(input, this->name, this_name_len, &xl_id);
- if (ret)
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED,
+ "key=%s", this->name, NULL);
goto out;
+ }
ret = dict_set_int32n(output, this->name, this_name_len, xl_id);
- if (ret)
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "key=%s", this->name, NULL);
goto out;
+ }
switch (op) {
case GF_SHD_OP_HEAL_INDEX:
op_ret = 0;
@@ -1340,23 +1591,30 @@ afr_xl_op(xlator_t *this, dict_t *input, dict_t *output)
keylen = snprintf(key, sizeof(key), "%d-%d-status", xl_id, i);
if (!priv->child_up[i]) {
- ret = dict_set_nstrn(output, key, keylen,
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
SBRICK_NOT_CONNECTED,
SLEN(SBRICK_NOT_CONNECTED));
op_ret = -1;
} else if (AFR_COUNT(priv->child_up, priv->child_count) < 2) {
- ret = dict_set_nstrn(output, key, keylen,
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
SLESS_THAN2_BRICKS_in_REP,
SLEN(SLESS_THAN2_BRICKS_in_REP));
op_ret = -1;
} else if (!afr_shd_is_subvol_local(this, healer->subvol)) {
- ret = dict_set_nstrn(output, key, keylen, SBRICK_IS_REMOTE,
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SBRICK_IS_REMOTE,
SLEN(SBRICK_IS_REMOTE));
} else {
- ret = dict_set_nstrn(output, key, keylen,
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
SSTARTED_SELF_HEAL,
SLEN(SSTARTED_SELF_HEAL));
- afr_shd_index_healer_spawn(this, i);
+
+ ret = afr_shd_index_healer_spawn(this, i);
+
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ AFR_MSG_HEALER_SPAWN_FAILED, NULL);
+ }
}
}
break;
@@ -1368,21 +1626,28 @@ afr_xl_op(xlator_t *this, dict_t *input, dict_t *output)
keylen = snprintf(key, sizeof(key), "%d-%d-status", xl_id, i);
if (!priv->child_up[i]) {
- ret = dict_set_nstrn(output, key, keylen,
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
SBRICK_NOT_CONNECTED,
SLEN(SBRICK_NOT_CONNECTED));
} else if (AFR_COUNT(priv->child_up, priv->child_count) < 2) {
- ret = dict_set_nstrn(output, key, keylen,
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
SLESS_THAN2_BRICKS_in_REP,
SLEN(SLESS_THAN2_BRICKS_in_REP));
} else if (!afr_shd_is_subvol_local(this, healer->subvol)) {
- ret = dict_set_nstrn(output, key, keylen, SBRICK_IS_REMOTE,
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SBRICK_IS_REMOTE,
SLEN(SBRICK_IS_REMOTE));
} else {
- ret = dict_set_nstrn(output, key, keylen,
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
SSTARTED_SELF_HEAL,
SLEN(SSTARTED_SELF_HEAL));
- afr_shd_full_healer_spawn(this, i);
+
+ ret = afr_shd_full_healer_spawn(this, i);
+
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ AFR_MSG_HEALER_SPAWN_FAILED, NULL);
+ }
op_ret = 0;
}
}
@@ -1390,24 +1655,25 @@ afr_xl_op(xlator_t *this, dict_t *input, dict_t *output)
case GF_SHD_OP_INDEX_SUMMARY:
/* this case has been handled in glfs-heal.c */
break;
- case GF_SHD_OP_HEALED_FILES:
- case GF_SHD_OP_HEAL_FAILED_FILES:
- for (i = 0; i < priv->child_count; i++) {
- keylen = snprintf(key, sizeof(key), "%d-%d-status", xl_id, i);
- ret = dict_set_nstrn(output, key, keylen, SOP_NOT_SUPPORTED,
- SLEN(SOP_NOT_SUPPORTED));
- }
- break;
case GF_SHD_OP_SPLIT_BRAIN_FILES:
eh_dump(shd->split_brain, output, afr_add_shd_event);
break;
case GF_SHD_OP_STATISTICS:
for (i = 0; i < priv->child_count; i++) {
eh_dump(shd->statistics[i], output, afr_add_crawl_event);
- afr_shd_dict_add_crawl_event(
+ ret = afr_shd_dict_add_crawl_event(
this, output, &shd->index_healers[i].crawl_event);
- afr_shd_dict_add_crawl_event(this, output,
- &shd->full_healers[i].crawl_event);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ AFR_MSG_ADD_CRAWL_EVENT_FAILED, NULL);
+ }
+
+ ret = afr_shd_dict_add_crawl_event(
+ this, output, &shd->full_healers[i].crawl_event);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ AFR_MSG_ADD_CRAWL_EVENT_FAILED, NULL);
+ }
}
break;
case GF_SHD_OP_STATISTICS_HEAL_COUNT:
@@ -1418,7 +1684,7 @@ afr_xl_op(xlator_t *this, dict_t *input, dict_t *output)
if (!priv->child_up[i]) {
keylen = snprintf(key, sizeof(key), "%d-%d-status", xl_id,
i);
- ret = dict_set_nstrn(output, key, keylen,
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
SBRICK_NOT_CONNECTED,
SLEN(SBRICK_NOT_CONNECTED));
} else {
@@ -1427,6 +1693,10 @@ afr_xl_op(xlator_t *this, dict_t *input, dict_t *output)
if (ret == 0) {
ret = dict_set_uint64(output, key, cnt);
}
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ AFR_MSG_DICT_SET_FAILED, NULL);
+ }
op_ret = 0;
}
}
@@ -1434,11 +1704,13 @@ afr_xl_op(xlator_t *this, dict_t *input, dict_t *output)
break;
default:
- gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_ARG,
- "Unknown set op %d", op);
+ gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_ARG, "op=%d",
+ op, NULL);
break;
}
out:
dict_deln(output, this->name, this_name_len);
return op_ret;
+
+#undef AFR_SET_DICT_AND_LOG
}
diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h
index 7de7c431460..18db728ea7b 100644
--- a/xlators/cluster/afr/src/afr-self-heald.h
+++ b/xlators/cluster/afr/src/afr-self-heald.h
@@ -14,12 +14,11 @@
#include <pthread.h>
typedef struct {
- int child;
char *path;
+ int child;
} shd_event_t;
typedef struct {
- int child;
uint64_t healed_count;
uint64_t split_brain_count;
uint64_t heal_failed_count;
@@ -31,38 +30,36 @@ typedef struct {
crawler is in progress */
time_t end_time;
char *crawl_type;
+ int child;
} crawl_event_t;
struct subvol_healer {
xlator_t *this;
- int subvol;
- gf_boolean_t local;
- gf_boolean_t running;
- gf_boolean_t rerun;
crawl_event_t crawl_event;
pthread_mutex_t mutex;
pthread_cond_t cond;
pthread_t thread;
+ int subvol;
+ gf_boolean_t local;
+ gf_boolean_t running;
+ gf_boolean_t rerun;
};
typedef struct {
- gf_boolean_t iamshd;
- gf_boolean_t enabled;
- int timeout;
struct subvol_healer *index_healers;
struct subvol_healer *full_healers;
eh_t *split_brain;
eh_t **statistics;
+ int timeout;
uint32_t max_threads;
uint32_t wait_qlength;
uint32_t halo_max_latency_msec;
+ gf_boolean_t iamshd;
+ gf_boolean_t enabled;
} afr_self_heald_t;
int
-afr_selfheal_childup(xlator_t *this, int subvol);
-
-int
afr_selfheal_daemon_init(xlator_t *this);
int
@@ -73,6 +70,6 @@ afr_shd_gfid_to_path(xlator_t *this, xlator_t *subvol, uuid_t gfid,
char **path_p);
int
-afr_shd_index_purge(xlator_t *subvol, inode_t *inode, char *name,
+afr_shd_entry_purge(xlator_t *subvol, inode_t *inode, char *name,
ia_type_t type);
#endif /* !_AFR_SELF_HEALD_H */
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index 229820ba26a..a51f79b1f43 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -32,7 +32,7 @@ static void
afr_post_op_handle_success(call_frame_t *frame, xlator_t *this);
static void
-afr_post_op_handle_failure(call_frame_t *frame, xlator_t *this);
+afr_post_op_handle_failure(call_frame_t *frame, xlator_t *this, int op_errno);
void
__afr_transaction_wake_shared(afr_local_t *local, struct list_head *shared);
@@ -74,6 +74,14 @@ afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this);
static void
afr_changelog_post_op_fail(call_frame_t *frame, xlator_t *this, int op_errno);
+void
+afr_ta_locked_priv_invalidate(afr_private_t *priv)
+{
+ priv->ta_bad_child_index = AFR_CHILD_UNKNOWN;
+ priv->release_ta_notify_dom_lock = _gf_false;
+ priv->ta_notify_dom_lock_offset = 0;
+}
+
static void
afr_ta_process_waitq(xlator_t *this)
{
@@ -116,9 +124,9 @@ afr_release_notify_lock_for_ta(void *opaque)
this = (xlator_t *)opaque;
priv = this->private;
- ret = afr_fill_ta_loc(this, &loc);
+ ret = afr_fill_ta_loc(this, &loc, _gf_true);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
"Failed to populate loc for thin-arbiter.");
goto out;
}
@@ -127,17 +135,16 @@ afr_release_notify_lock_for_ta(void *opaque)
flock.l_len = 1;
ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
AFR_TA_DOM_NOTIFY, &loc, F_SETLK, &flock, NULL, NULL);
- if (!ret) {
- LOCK(&priv->lock);
- priv->ta_bad_child_index = AFR_CHILD_UNKNOWN;
- priv->release_ta_notify_dom_lock = _gf_false;
- priv->ta_notify_dom_lock_offset = 0;
- UNLOCK(&priv->lock);
- } else {
+ if (ret) {
gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
"Failed to unlock AFR_TA_DOM_NOTIFY lock.");
}
+ LOCK(&priv->lock);
+ {
+ afr_ta_locked_priv_invalidate(priv);
+ }
+ UNLOCK(&priv->lock);
out:
loc_wipe(&loc);
return ret;
@@ -372,6 +379,8 @@ afr_transaction_done(call_frame_t *frame, xlator_t *this)
}
local->transaction.unwind(frame, this);
+ GF_ASSERT(list_empty(&local->transaction.owner_list));
+ GF_ASSERT(list_empty(&local->transaction.wait_list));
AFR_STACK_DESTROY(frame);
return 0;
@@ -393,7 +402,7 @@ afr_lock_fail_shared(afr_local_t *local, struct list_head *list)
}
static void
-afr_handle_lock_acquire_failure(afr_local_t *local, gf_boolean_t locked)
+afr_handle_lock_acquire_failure(afr_local_t *local)
{
struct list_head shared;
afr_lock_t *lock = NULL;
@@ -414,13 +423,8 @@ afr_handle_lock_acquire_failure(afr_local_t *local, gf_boolean_t locked)
afr_lock_fail_shared(local, &shared);
local->transaction.do_eager_unlock = _gf_true;
out:
- if (locked) {
- local->internal_lock.lock_cbk = afr_transaction_done;
- afr_unlock(local->transaction.frame, local->transaction.frame->this);
- } else {
- afr_transaction_done(local->transaction.frame,
- local->transaction.frame->this);
- }
+ local->internal_lock.lock_cbk = afr_transaction_done;
+ afr_unlock(local->transaction.frame, local->transaction.frame->this);
}
call_frame_t *
@@ -517,42 +521,6 @@ afr_compute_pre_op_sources(call_frame_t *frame, xlator_t *this)
local->transaction.pre_op_sources[j] = 0;
}
-gf_boolean_t
-afr_has_arbiter_fop_cbk_quorum(call_frame_t *frame)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- xlator_t *this = NULL;
- gf_boolean_t fop_failed = _gf_false;
- unsigned char *pre_op_sources = NULL;
- int i = 0;
-
- local = frame->local;
- this = frame->this;
- priv = this->private;
- pre_op_sources = local->transaction.pre_op_sources;
-
- /* If the fop failed on the brick, it is not a source. */
- for (i = 0; i < priv->child_count; i++)
- if (local->transaction.failed_subvols[i])
- pre_op_sources[i] = 0;
-
- switch (AFR_COUNT(pre_op_sources, priv->child_count)) {
- case 1:
- if (pre_op_sources[ARBITER_BRICK_INDEX])
- fop_failed = _gf_true;
- break;
- case 0:
- fop_failed = _gf_true;
- break;
- }
-
- if (fop_failed)
- return _gf_false;
-
- return _gf_true;
-}
-
void
afr_txn_arbitrate_fop(call_frame_t *frame, xlator_t *this)
{
@@ -619,7 +587,7 @@ afr_transaction_perform_fop(call_frame_t *frame, xlator_t *this)
failure_count = AFR_COUNT(local->transaction.failed_subvols,
priv->child_count);
if (failure_count == priv->child_count) {
- afr_handle_lock_acquire_failure(local, _gf_true);
+ afr_handle_lock_acquire_failure(local);
return 0;
} else {
lock = &local->inode_ctx->lock[local->transaction.type];
@@ -668,7 +636,7 @@ afr_set_pending_dict(afr_private_t *priv, dict_t *xattr, int **pending)
}
static void
-afr_ta_dom_lock_check_and_release(afr_local_t *local, xlator_t *this)
+afr_ta_dom_lock_check_and_release(afr_ta_fop_state_t fop_state, xlator_t *this)
{
afr_private_t *priv = this->private;
unsigned int inmem_count = 0;
@@ -678,17 +646,25 @@ afr_ta_dom_lock_check_and_release(afr_local_t *local, xlator_t *this)
LOCK(&priv->lock);
{
/*Once we get notify lock release upcall notification,
- if two fop states are non empty/non zero, we will
- not release lock.
- 1 - If anything in memory txn
- 2 - If anything in onwire or onwireq
+ if any of the fop state counters are non-zero, we will
+ not release the lock.
*/
- if (local->fop_state == TA_INFO_IN_MEMORY_SUCCESS) {
- inmem_count = --priv->ta_in_mem_txn_count;
- } else {
- inmem_count = priv->ta_in_mem_txn_count;
- }
onwire_count = priv->ta_on_wire_txn_count;
+ inmem_count = priv->ta_in_mem_txn_count;
+ switch (fop_state) {
+ case TA_GET_INFO_FROM_TA_FILE:
+ onwire_count = --priv->ta_on_wire_txn_count;
+ break;
+ case TA_INFO_IN_MEMORY_SUCCESS:
+ case TA_INFO_IN_MEMORY_FAILED:
+ inmem_count = --priv->ta_in_mem_txn_count;
+ break;
+ case TA_WAIT_FOR_NOTIFY_LOCK_REL:
+ GF_ASSERT(0);
+ break;
+ case TA_SUCCESS:
+ break;
+ }
release = priv->release_ta_notify_dom_lock;
}
UNLOCK(&priv->lock);
@@ -700,7 +676,7 @@ afr_ta_dom_lock_check_and_release(afr_local_t *local, xlator_t *this)
}
static void
-afr_ta_process_onwireq(afr_local_t *local, xlator_t *this)
+afr_ta_process_onwireq(afr_ta_fop_state_t fop_state, xlator_t *this)
{
afr_private_t *priv = this->private;
afr_local_t *entry = NULL;
@@ -713,15 +689,6 @@ afr_ta_process_onwireq(afr_local_t *local, xlator_t *this)
LOCK(&priv->lock);
{
- if (--priv->ta_on_wire_txn_count == 0) {
- UNLOCK(&priv->lock);
- /*Only one write fop came and after taking notify
- *lock and before doing xattrop, it has received
- *lock contention upcall, so this is the only place
- *to find this out and release the lock*/
- afr_ta_dom_lock_check_and_release(local, this);
- return;
- }
bad_child = priv->ta_bad_child_index;
if (bad_child == AFR_CHILD_UNKNOWN) {
/*The previous on-wire ta_post_op was a failure. Just dequeue
@@ -742,13 +709,10 @@ afr_ta_process_onwireq(afr_local_t *local, xlator_t *this)
while (!list_empty(&onwireq)) {
entry = list_entry(onwireq.next, afr_local_t, ta_onwireq);
list_del_init(&entry->ta_onwireq);
- LOCK(&priv->lock);
- --priv->ta_on_wire_txn_count;
- UNLOCK(&priv->lock);
if (entry->ta_failed_subvol == bad_child) {
afr_post_op_handle_success(entry->transaction.frame, this);
} else {
- afr_post_op_handle_failure(entry->transaction.frame, this);
+ afr_post_op_handle_failure(entry->transaction.frame, this, EIO);
}
}
}
@@ -767,7 +731,7 @@ afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this)
if (priv->thin_arbiter_count) {
/*fop should not come here with TA_WAIT_FOR_NOTIFY_LOCK_REL state */
- afr_ta_dom_lock_check_and_release(frame->local, this);
+ afr_ta_dom_lock_check_and_release(local->fop_state, this);
}
/* Fail the FOP if post-op did not succeed on quorum no. of bricks. */
@@ -873,7 +837,7 @@ afr_has_quorum(unsigned char *subvols, xlator_t *this, call_frame_t *frame)
priv = this->private;
up_children_count = AFR_COUNT(subvols, priv->child_count);
- if (afr_lookup_has_quorum(frame, this, subvols))
+ if (afr_lookup_has_quorum(frame, up_children_count))
return _gf_true;
if (priv->quorum_count == AFR_QUORUM_AUTO) {
@@ -971,12 +935,8 @@ afr_need_dirty_marking(call_frame_t *frame, xlator_t *this)
priv->child_count)
return _gf_false;
- if (priv->arbiter_count) {
- if (!afr_has_arbiter_fop_cbk_quorum(frame))
- need_dirty = _gf_true;
- } else if (!afr_has_fop_cbk_quorum(frame)) {
+ if (!afr_has_fop_cbk_quorum(frame))
need_dirty = _gf_true;
- }
return need_dirty;
}
@@ -1026,12 +986,8 @@ afr_handle_quorum(call_frame_t *frame, xlator_t *this)
* no split-brain with the fix. The problem is eliminated completely.
*/
- if (priv->arbiter_count) {
- if (afr_has_arbiter_fop_cbk_quorum(frame))
- return;
- } else if (afr_has_fop_cbk_quorum(frame)) {
+ if (afr_has_fop_cbk_quorum(frame))
return;
- }
if (afr_need_dirty_marking(frame, this))
goto set_response;
@@ -1073,7 +1029,7 @@ set_response:
}
int
-afr_fill_ta_loc(xlator_t *this, loc_t *loc)
+afr_fill_ta_loc(xlator_t *this, loc_t *loc, gf_boolean_t is_gfid_based_fop)
{
afr_private_t *priv = NULL;
@@ -1081,6 +1037,11 @@ afr_fill_ta_loc(xlator_t *this, loc_t *loc)
loc->parent = inode_ref(priv->root_inode);
gf_uuid_copy(loc->pargfid, loc->parent->gfid);
loc->name = priv->pending_key[THIN_ARBITER_BRICK_INDEX];
+ if (is_gfid_based_fop && gf_uuid_is_null(priv->ta_gfid)) {
+ /* Except for afr_ta_id_file_check(), which is path-based, all other
+ * gluster FOPs need the gfid.*/
+ return -EINVAL;
+ }
gf_uuid_copy(loc->gfid, priv->ta_gfid);
loc->inode = inode_new(loc->parent->table);
if (!loc->inode) {
@@ -1090,97 +1051,28 @@ afr_fill_ta_loc(xlator_t *this, loc_t *loc)
return 0;
}
-int
-afr_changelog_thin_arbiter_post_op(xlator_t *this, afr_local_t *local)
-{
- int ret = 0;
- afr_private_t *priv = NULL;
- dict_t *xattr = NULL;
- int failed_count = 0;
- struct gf_flock flock = {
- 0,
- };
- loc_t loc = {
- 0,
- };
- int i = 0;
-
- priv = this->private;
- if (!priv->thin_arbiter_count)
- return 0;
-
- failed_count = AFR_COUNT(local->transaction.failed_subvols,
- priv->child_count);
- if (!failed_count)
- return 0;
-
- GF_ASSERT(failed_count == 1);
- ret = afr_fill_ta_loc(this, &loc);
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
- "Failed to populate thin-arbiter loc for: %s.", loc.name);
- goto out;
- }
-
- xattr = dict_new();
- if (!xattr) {
- ret = -ENOMEM;
- goto out;
- }
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_set_static_bin(xattr, priv->pending_key[i],
- local->pending[i],
- AFR_NUM_CHANGE_LOGS * sizeof(int));
- if (ret)
- goto out;
- }
-
- flock.l_type = F_WRLCK;
- flock.l_start = 0;
- flock.l_len = 0;
-
- /*TODO: Convert to two domain locking. */
- ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
- AFR_TA_DOM_NOTIFY, &loc, F_SETLKW, &flock, NULL, NULL);
- if (ret)
- goto out;
-
- ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], &loc,
- GF_XATTROP_ADD_ARRAY, xattr, NULL, NULL, NULL);
-
- if (ret == -EINVAL) {
- gf_msg(this->name, GF_LOG_INFO, -ret, AFR_MSG_THIN_ARB,
- "Thin-arbiter has denied post-op on %s for gfid %s.",
- priv->pending_key[THIN_ARBITER_BRICK_INDEX],
- uuid_utoa(local->inode->gfid));
-
- } else if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
- "Post-op on thin-arbiter id file %s failed for gfid %s.",
- priv->pending_key[THIN_ARBITER_BRICK_INDEX],
- uuid_utoa(local->inode->gfid));
- }
- flock.l_type = F_UNLCK;
- syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], AFR_TA_DOM_NOTIFY,
- &loc, F_SETLK, &flock, NULL, NULL);
-out:
- if (xattr)
- dict_unref(xattr);
-
- return ret;
-}
-
static int
afr_ta_post_op_done(int ret, call_frame_t *frame, void *opaque)
{
xlator_t *this = NULL;
afr_local_t *local = NULL;
+ call_frame_t *txn_frame = NULL;
+ afr_ta_fop_state_t fop_state;
local = (afr_local_t *)opaque;
+ fop_state = local->fop_state;
+ txn_frame = local->transaction.frame;
this = frame->this;
+ if (ret == 0) {
+ /*Mark pending xattrs on the up data brick.*/
+ afr_post_op_handle_success(txn_frame, this);
+ } else {
+ afr_post_op_handle_failure(txn_frame, this, -ret);
+ }
+
STACK_DESTROY(frame->root);
- afr_ta_process_onwireq(local, this);
+ afr_ta_process_onwireq(fop_state, this);
return 0;
}
@@ -1191,6 +1083,7 @@ afr_set_changelog_xattr(afr_private_t *priv, unsigned char *pending,
{
int **changelog = NULL;
int idx = 0;
+ int ret = 0;
int i;
if (local->is_new_entry == _gf_true) {
@@ -1206,20 +1099,36 @@ afr_set_changelog_xattr(afr_private_t *priv, unsigned char *pending,
if (local->transaction.failed_subvols[i])
changelog[i][idx] = hton32(1);
}
- afr_set_pending_dict(priv, xattr, changelog);
+ ret = afr_set_pending_dict(priv, xattr, changelog);
+ if (ret < 0) {
+ afr_matrix_cleanup(changelog, priv->child_count);
+ return NULL;
+ }
}
out:
return changelog;
}
+static void
+afr_ta_locked_xattrop_validate(afr_private_t *priv, afr_local_t *local,
+ gf_boolean_t *valid)
+{
+ if (priv->ta_event_gen > local->ta_event_gen) {
+ /* We can't trust the ta's response anymore.*/
+ afr_ta_locked_priv_invalidate(priv);
+ *valid = _gf_false;
+ return;
+ }
+ return;
+}
+
static int
afr_ta_post_op_do(void *opaque)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
xlator_t *this = NULL;
- call_frame_t *txn_frame = NULL;
dict_t *xattr = NULL;
unsigned char *pending = NULL;
int **changelog = NULL;
@@ -1230,15 +1139,15 @@ afr_ta_post_op_do(void *opaque)
};
int i = 0;
int ret = 0;
+ gf_boolean_t valid = _gf_true;
local = (afr_local_t *)opaque;
- txn_frame = local->transaction.frame;
- this = txn_frame->this;
+ this = local->transaction.frame->this;
priv = this->private;
- ret = afr_fill_ta_loc(this, &loc);
+ ret = afr_fill_ta_loc(this, &loc, _gf_true);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
"Failed to populate loc for thin-arbiter.");
goto out;
}
@@ -1262,8 +1171,10 @@ afr_ta_post_op_do(void *opaque)
changelog = afr_set_changelog_xattr(priv, pending, xattr, local);
- if (!changelog)
+ if (!changelog) {
+ ret = -ENOMEM;
goto out;
+ }
ret = afr_ta_post_op_lock(this, &loc);
if (ret)
@@ -1271,22 +1182,31 @@ afr_ta_post_op_do(void *opaque)
ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], &loc,
GF_XATTROP_ADD_ARRAY, xattr, NULL, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Post-op on thin-arbiter id file %s failed for gfid %s.",
+ priv->pending_key[THIN_ARBITER_BRICK_INDEX],
+ uuid_utoa(local->inode->gfid));
+ }
LOCK(&priv->lock);
{
if (ret == 0) {
priv->ta_bad_child_index = failed_subvol;
} else if (ret == -EINVAL) {
priv->ta_bad_child_index = success_subvol;
+ ret = -EIO; /* TA failed the fop. Return EIO to application. */
}
+
+ afr_ta_locked_xattrop_validate(priv, local, &valid);
}
UNLOCK(&priv->lock);
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
- "Post-op on thin-arbiter id file %s failed for gfid %s.",
+ if (valid == _gf_false) {
+ gf_msg(this->name, GF_LOG_ERROR, EIO, AFR_MSG_THIN_ARB,
+ "Post-op on thin-arbiter id file %s for gfid %s invalidated due "
+ "to event-gen mismatch.",
priv->pending_key[THIN_ARBITER_BRICK_INDEX],
uuid_utoa(local->inode->gfid));
- if (ret == -EINVAL)
- ret = -EIO; /* TA failed the fop. Return EIO to application. */
+ ret = -EIO;
}
afr_ta_post_op_unlock(this, &loc);
@@ -1299,12 +1219,6 @@ out:
loc_wipe(&loc);
- if (ret == 0) {
- /*Mark pending xattrs on the up data brick.*/
- afr_post_op_handle_success(local->transaction.frame, this);
- } else {
- afr_post_op_handle_failure(local->transaction.frame, this);
- }
return ret;
}
@@ -1363,6 +1277,7 @@ afr_ta_set_fop_state(afr_private_t *priv, afr_local_t *local,
/* Post-op on TA not needed as the fop succeeded only on the
* in-memory bad data brick and not the good one. Fail the fop.*/
local->fop_state = TA_INFO_IN_MEMORY_FAILED;
+ priv->ta_in_mem_txn_count++;
}
}
UNLOCK(&priv->lock);
@@ -1396,9 +1311,9 @@ afr_post_op_handle_success(call_frame_t *frame, xlator_t *this)
}
static void
-afr_post_op_handle_failure(call_frame_t *frame, xlator_t *this)
+afr_post_op_handle_failure(call_frame_t *frame, xlator_t *this, int op_errno)
{
- afr_changelog_post_op_fail(frame, this, EIO);
+ afr_changelog_post_op_fail(frame, this, op_errno);
return;
}
@@ -1428,7 +1343,7 @@ afr_ta_decide_post_op_state(call_frame_t *frame, xlator_t *this)
afr_post_op_handle_success(frame, this);
break;
case TA_INFO_IN_MEMORY_FAILED:
- afr_post_op_handle_failure(frame, this);
+ afr_post_op_handle_failure(frame, this, EIO);
break;
default:
break;
@@ -2092,7 +2007,7 @@ err:
local->op_ret = -1;
local->op_errno = op_errno;
- afr_handle_lock_acquire_failure(local, _gf_true);
+ afr_handle_lock_acquire_failure(local);
if (xdata_req)
dict_unref(xdata_req);
@@ -2361,7 +2276,7 @@ afr_internal_lock_finish(call_frame_t *frame, xlator_t *this)
} else {
lock = &local->inode_ctx->lock[local->transaction.type];
if (local->internal_lock.lock_op_ret < 0) {
- afr_handle_lock_acquire_failure(local, _gf_false);
+ afr_handle_lock_acquire_failure(local);
} else {
lock->event_generation = local->event_generation;
afr_changelog_pre_op(frame, this);
@@ -2432,8 +2347,13 @@ afr_is_delayed_changelog_post_op_needed(call_frame_t *frame, xlator_t *this,
goto out;
}
- if ((local->op != GF_FOP_WRITE) && (local->op != GF_FOP_FXATTROP)) {
- /*Only allow writes but shard does [f]xattrops on writes, so
+ if (local->transaction.disable_delayed_post_op) {
+ goto out;
+ }
+
+ if ((local->op != GF_FOP_WRITE) && (local->op != GF_FOP_FXATTROP) &&
+ (local->op != GF_FOP_FSYNC)) {
+ /*Only allow writes/fsyncs but shard does [f]xattrops on writes, so
* they are fine too*/
goto out;
}
@@ -2560,8 +2480,10 @@ afr_changelog_fsync(call_frame_t *frame, xlator_t *this)
local->call_count = call_count;
xdata = dict_new();
- if (xdata)
+ if (xdata) {
ret = dict_set_int32_sizen(xdata, "batch-fsync", 1);
+ ret = dict_set_str(xdata, GLUSTERFS_INTERNAL_FOP_KEY, "yes");
+ }
for (i = 0; i < priv->child_count; i++) {
if (!local->transaction.pre_op[i])
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index 33258a048a4..df7366f0a65 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -135,12 +135,34 @@ set_data_self_heal_algorithm(afr_private_t *priv, char *algo)
}
}
+void
+afr_handle_anon_inode_options(afr_private_t *priv, dict_t *options)
+{
+ char *volfile_id_str = NULL;
+ uuid_t anon_inode_gfid = {0};
+
+ /*If volume id is not present don't enable anything*/
+ if (dict_get_str(options, "volume-id", &volfile_id_str))
+ return;
+ GF_ASSERT(strlen(AFR_ANON_DIR_PREFIX) + strlen(volfile_id_str) <= NAME_MAX);
+ /*anon_inode_name is not supposed to change once assigned*/
+ if (!priv->anon_inode_name[0]) {
+ snprintf(priv->anon_inode_name, sizeof(priv->anon_inode_name), "%s-%s",
+ AFR_ANON_DIR_PREFIX, volfile_id_str);
+ gf_uuid_parse(volfile_id_str, anon_inode_gfid);
+ /*Flip a bit to make sure volfile-id and anon-gfid are not the same*/
+ anon_inode_gfid[0] ^= 1;
+ uuid_utoa_r(anon_inode_gfid, priv->anon_gfid_str);
+ }
+}
+
int
reconfigure(xlator_t *this, dict_t *options)
{
afr_private_t *priv = NULL;
xlator_t *read_subvol = NULL;
int read_subvol_index = -1;
+ int timeout_old = 0;
int ret = -1;
int index = -1;
char *qtype = NULL;
@@ -150,6 +172,7 @@ reconfigure(xlator_t *this, dict_t *options)
char *locking_scheme = NULL;
gf_boolean_t consistent_io = _gf_false;
gf_boolean_t choose_local_old = _gf_false;
+ gf_boolean_t enabled_old = _gf_false;
priv = this->private;
@@ -166,7 +189,8 @@ reconfigure(xlator_t *this, dict_t *options)
bool, out);
GF_OPTION_RECONF("data-self-heal", data_self_heal, options, str, out);
- gf_string2boolean(data_self_heal, &priv->data_self_heal);
+ if (gf_string2boolean(data_self_heal, &priv->data_self_heal) == -1)
+ goto out;
GF_OPTION_RECONF("entry-self-heal", priv->entry_self_heal, options, bool,
out);
@@ -239,6 +263,8 @@ reconfigure(xlator_t *this, dict_t *options)
out);
GF_OPTION_RECONF("eager-lock", priv->eager_lock, options, bool, out);
+ GF_OPTION_RECONF("optimistic-change-log", priv->optimistic_change_log,
+ options, bool, out);
GF_OPTION_RECONF("quorum-type", qtype, options, str, out);
GF_OPTION_RECONF("quorum-count", priv->quorum_count, options, uint32, out);
fix_quorum_options(this, priv, qtype, options);
@@ -255,11 +281,13 @@ reconfigure(xlator_t *this, dict_t *options)
GF_OPTION_RECONF("ensure-durability", priv->ensure_durability, options,
bool, out);
+ enabled_old = priv->shd.enabled;
GF_OPTION_RECONF("self-heal-daemon", priv->shd.enabled, options, bool, out);
GF_OPTION_RECONF("iam-self-heal-daemon", priv->shd.iamshd, options, bool,
out);
+ timeout_old = priv->shd.timeout;
GF_OPTION_RECONF("heal-timeout", priv->shd.timeout, options, int32, out);
GF_OPTION_RECONF("consistent-metadata", priv->consistent_metadata, options,
@@ -283,6 +311,16 @@ reconfigure(xlator_t *this, dict_t *options)
consistent_io = _gf_false;
priv->consistent_io = consistent_io;
+ afr_handle_anon_inode_options(priv, options);
+
+ GF_OPTION_RECONF("use-anonymous-inode", priv->use_anon_inode, options, bool,
+ out);
+ if (priv->shd.enabled) {
+ if ((priv->shd.enabled != enabled_old) ||
+ (timeout_old != priv->shd.timeout))
+ afr_selfheal_childup(this, priv);
+ }
+
ret = 0;
out:
return ret;
@@ -368,7 +406,7 @@ afr_ta_init(afr_private_t *priv)
priv->release_ta_notify_dom_lock = _gf_false;
INIT_LIST_HEAD(&priv->ta_waitq);
INIT_LIST_HEAD(&priv->ta_onwireq);
- *priv->ta_gfid = 0;
+ gf_uuid_clear(priv->ta_gfid);
}
int32_t
@@ -407,6 +445,8 @@ init(xlator_t *this)
goto out;
priv = this->private;
+ INIT_LIST_HEAD(&priv->saved_locks);
+ INIT_LIST_HEAD(&priv->lk_healq);
LOCK_INIT(&priv->lock);
child_count = xlator_subvolume_count(this);
@@ -471,7 +511,8 @@ init(xlator_t *this)
GF_OPTION_INIT("heal-wait-queue-length", priv->heal_wait_qlen, uint32, out);
GF_OPTION_INIT("data-self-heal", data_self_heal, str, out);
- gf_string2boolean(data_self_heal, &priv->data_self_heal);
+ if (gf_string2boolean(data_self_heal, &priv->data_self_heal) == -1)
+ goto out;
GF_OPTION_INIT("data-self-heal-algorithm", data_self_heal_algorithm, str,
out);
@@ -525,7 +566,9 @@ init(xlator_t *this)
GF_OPTION_INIT("consistent-metadata", priv->consistent_metadata, bool, out);
GF_OPTION_INIT("consistent-io", priv->consistent_io, bool, out);
+ afr_handle_anon_inode_options(priv, this->options);
+ GF_OPTION_INIT("use-anonymous-inode", priv->use_anon_inode, bool, out);
if (priv->quorum_count != 0)
priv->consistent_io = _gf_false;
@@ -537,13 +580,19 @@ init(xlator_t *this)
goto out;
}
+ priv->anon_inode = GF_CALLOC(sizeof(unsigned char), child_count,
+ gf_afr_mt_char);
+
priv->child_up = GF_CALLOC(sizeof(unsigned char), child_count,
gf_afr_mt_char);
priv->child_latency = GF_MALLOC(sizeof(*priv->child_latency) * child_count,
gf_afr_mt_child_latency_t);
+ priv->halo_child_up = GF_CALLOC(sizeof(unsigned char), child_count,
+ gf_afr_mt_char);
- if (!priv->child_up || !priv->child_latency) {
+ if (!priv->child_up || !priv->child_latency || !priv->halo_child_up ||
+ !priv->anon_inode) {
ret = -ENOMEM;
goto out;
}
@@ -611,19 +660,83 @@ init(xlator_t *this)
out:
return ret;
}
+void
+afr_destroy_healer_object(xlator_t *this, struct subvol_healer *healer)
+{
+ int ret = -1;
+
+ if (!healer)
+ return;
+
+ if (healer->running) {
+ /*
+ * If there are any resources to clean up, we need
+ * to do that gracefully using pthread_cleanup_push
+ */
+ ret = gf_thread_cleanup_xint(healer->thread);
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SELF_HEAL_FAILED,
+ "Failed to clean up healer threads.");
+ healer->thread = 0;
+ }
+ pthread_cond_destroy(&healer->cond);
+ pthread_mutex_destroy(&healer->mutex);
+}
+
+void
+afr_selfheal_daemon_fini(xlator_t *this)
+{
+ struct subvol_healer *healer = NULL;
+ afr_self_heald_t *shd = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+
+ priv = this->private;
+ if (!priv)
+ return;
+ shd = &priv->shd;
+ if (!shd->iamshd)
+ return;
+
+ for (i = 0; i < priv->child_count; i++) {
+ healer = &shd->index_healers[i];
+ afr_destroy_healer_object(this, healer);
+
+ healer = &shd->full_healers[i];
+ afr_destroy_healer_object(this, healer);
+
+ if (shd->statistics[i])
+ eh_destroy(shd->statistics[i]);
+ }
+ GF_FREE(shd->index_healers);
+ GF_FREE(shd->full_healers);
+ GF_FREE(shd->statistics);
+ if (shd->split_brain)
+ eh_destroy(shd->split_brain);
+}
void
fini(xlator_t *this)
{
afr_private_t *priv = NULL;
priv = this->private;
+
+ afr_selfheal_daemon_fini(this);
+ GF_ASSERT(list_empty(&priv->saved_locks));
+
LOCK(&priv->lock);
if (priv->timer != NULL) {
gf_timer_call_cancel(this->ctx, priv->timer);
priv->timer = NULL;
}
UNLOCK(&priv->lock);
+
+ if (this->local_pool != NULL) {
+ mem_pool_destroy(this->local_pool);
+ this->local_pool = NULL;
+ }
+
this->private = NULL;
afr_priv_destroy(priv);
if (this->itable) {
@@ -655,6 +768,7 @@ struct xlator_fops fops = {
.getxattr = afr_getxattr,
.fgetxattr = afr_fgetxattr,
.readv = afr_readv,
+ .seek = afr_seek,
/* inode write */
.writev = afr_writev,
@@ -727,7 +841,7 @@ struct volume_options options[] = {
{.key = {"read-hash-mode"},
.type = GF_OPTION_TYPE_INT,
.min = 0,
- .max = 3,
+ .max = 5,
.default_value = "1",
.op_version = {2},
.flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
@@ -740,7 +854,10 @@ struct volume_options options[] = {
"1 = hash by GFID of file (all clients use "
"same subvolume).\n"
"2 = hash by GFID of file and client PID.\n"
- "3 = brick having the least outstanding read requests."},
+ "3 = brick having the least outstanding read requests.\n"
+ "4 = brick having the least network ping latency.\n"
+ "5 = Hybrid mode between 3 and 4, ie least value among "
+ "network-latency multiplied by outstanding-read-requests."},
{
.key = {"choose-local"},
.type = GF_OPTION_TYPE_BOOL,
@@ -1200,6 +1317,14 @@ struct volume_options options[] = {
.tags = {"replicate"},
.description = "This option exists only for backward compatibility "
"and configuring it doesn't have any effect"},
+ {.key = {"use-anonymous-inode"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "no",
+ .op_version = {GD_OP_VERSION_8_0},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
+ .tags = {"replicate"},
+ .description = "Setting this option heals directory renames efficiently"},
+
{.key = {NULL}},
};
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 2cc3797675e..d62f9a9caf2 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -39,7 +39,10 @@
#define AFR_TA_DOM_NOTIFY "afr.ta.dom-notify"
#define AFR_TA_DOM_MODIFY "afr.ta.dom-modify"
+#define AFR_LK_HEAL_DOM "afr.lock-heal.domain"
+
#define AFR_HALO_MAX_LATENCY 99999
+#define AFR_ANON_DIR_PREFIX ".glusterfs-anonymous-inode"
#define PFLAG_PENDING (1 << 0)
#define PFLAG_SBRAIN (1 << 1)
@@ -95,6 +98,25 @@ typedef int (*afr_changelog_resume_t)(call_frame_t *frame, xlator_t *this);
gf_fop_list[local->op], uuid_utoa(local->inode->gfid)); \
} while (0)
+#define AFR_ERROR_OUT_IF_FDCTX_INVALID(__fd, __this, __error, __label) \
+ do { \
+ afr_fd_ctx_t *__fd_ctx = NULL; \
+ __fd_ctx = afr_fd_ctx_get(__fd, __this); \
+ if (__fd_ctx && __fd_ctx->is_fd_bad) { \
+ __error = EBADF; \
+ goto __label; \
+ } \
+ } while (0)
+
+typedef enum {
+ AFR_READ_POLICY_FIRST_UP,
+ AFR_READ_POLICY_GFID_HASH,
+ AFR_READ_POLICY_GFID_PID_HASH,
+ AFR_READ_POLICY_LESS_LOAD,
+ AFR_READ_POLICY_LEAST_LATENCY,
+ AFR_READ_POLICY_LOAD_LATENCY_HYBRID,
+} afr_read_hash_mode_t;
+
typedef enum {
AFR_FAV_CHILD_NONE,
AFR_FAV_CHILD_BY_SIZE,
@@ -126,13 +148,27 @@ typedef enum {
*on BAD brick - Success*/
TA_INFO_IN_MEMORY_FAILED, /*Bad brick info is in memory and fop failed
*on GOOD brick - Failed*/
+ TA_SUCCESS, /*FOP succeeded on both data bricks.*/
} afr_ta_fop_state_t;
struct afr_nfsd {
- gf_boolean_t iamnfsd;
uint32_t halo_max_latency_msec;
+ gf_boolean_t iamnfsd;
};
+typedef struct _afr_lk_heal_info {
+ fd_t *fd;
+ int32_t cmd;
+ struct gf_flock flock;
+ dict_t *xdata_req;
+ unsigned char *locked_nodes;
+ struct list_head pos;
+ gf_lkowner_t lk_owner;
+ pid_t pid;
+ int32_t *child_up_event_gen;
+ int32_t *child_down_event_gen;
+} afr_lk_heal_info_t;
+
typedef struct _afr_private {
gf_lock_t lock; /* to guard access to child_count, etc */
unsigned int child_count; /* total number of children */
@@ -143,19 +179,21 @@ typedef struct _afr_private {
inode_t *root_inode;
+ int favorite_child; /* subvolume to be preferred in resolving
+ split-brain cases */
/* For thin-arbiter. */
- unsigned int thin_arbiter_count; /* 0 or 1 at the moment.*/
uuid_t ta_gfid;
- unsigned char ta_child_up;
+ unsigned int thin_arbiter_count; /* 0 or 1 at the moment.*/
int ta_bad_child_index;
- off_t ta_notify_dom_lock_offset;
- gf_boolean_t release_ta_notify_dom_lock;
+ int ta_event_gen;
unsigned int ta_in_mem_txn_count;
unsigned int ta_on_wire_txn_count;
struct list_head ta_waitq;
struct list_head ta_onwireq;
+ unsigned char *anon_inode;
unsigned char *child_up;
+ unsigned char *halo_child_up;
int64_t *child_latency;
unsigned char *local;
@@ -176,30 +214,31 @@ typedef struct _afr_private {
int32_t healers; /* No. of elements currently undergoing background
heal*/
+ gf_boolean_t release_ta_notify_dom_lock;
+
gf_boolean_t metadata_self_heal; /* on/off */
gf_boolean_t entry_self_heal; /* on/off */
gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */
int read_child; /* read-subvolume */
- unsigned int hash_mode; /* for when read_child is not set */
gf_atomic_t *pending_reads; /*No. of pending read cbks per child.*/
- int favorite_child; /* subvolume to be preferred in resolving
- split-brain cases */
- afr_favorite_child_policy fav_child_policy; /*Policy to use for automatic
- resolution of split-brains.*/
+ gf_timer_t *timer; /* launched when parent up is received */
unsigned int wait_count; /* # of servers to wait for success */
- gf_timer_t *timer; /* launched when parent up is received */
-
+ unsigned char ta_child_up;
gf_boolean_t optimistic_change_log;
gf_boolean_t eager_lock;
gf_boolean_t pre_op_compat; /* on/off */
uint32_t post_op_delay_secs;
unsigned int quorum_count;
- char vol_uuid[UUID_SIZE + 1];
+ off_t ta_notify_dom_lock_offset;
+ afr_favorite_child_policy fav_child_policy; /*Policy to use for automatic
+ resolution of split-brains.*/
+ afr_read_hash_mode_t hash_mode; /* for when read_child is not set */
+
int32_t *last_event;
/* @event_generation: Keeps count of number of events received which can
@@ -212,34 +251,41 @@ typedef struct _afr_private {
important as we might have had a network split brain.
*/
uint32_t event_generation;
+ char vol_uuid[UUID_SIZE + 1];
gf_boolean_t choose_local;
gf_boolean_t did_discovery;
- uint64_t sh_readdir_size;
gf_boolean_t ensure_durability;
+ gf_boolean_t halo_enabled;
+ gf_boolean_t consistent_metadata;
+ gf_boolean_t need_heal;
+ gf_boolean_t granular_locks;
+ uint64_t sh_readdir_size;
char *sh_domain;
char *afr_dirty;
- gf_boolean_t halo_enabled;
- uint32_t halo_max_latency_msec;
- uint32_t halo_max_replicas;
- uint32_t halo_min_replicas;
+ uint64_t spb_choice_timeout;
afr_self_heald_t shd;
struct afr_nfsd nfsd;
- gf_boolean_t consistent_metadata;
- uint64_t spb_choice_timeout;
- gf_boolean_t need_heal;
+ uint32_t halo_max_latency_msec;
+ uint32_t halo_max_replicas;
+ uint32_t halo_min_replicas;
- /* pump dependencies */
- void *pump_private;
- gf_boolean_t use_afr_in_pump;
- gf_boolean_t granular_locks;
gf_boolean_t full_lock;
gf_boolean_t esh_granular;
gf_boolean_t consistent_io;
gf_boolean_t data_self_heal; /* on/off */
+ gf_boolean_t use_anon_inode;
+
+ /*For lock healing.*/
+ struct list_head saved_locks;
+ struct list_head lk_healq;
+
+ /*For anon-inode handling */
+ char anon_inode_name[NAME_MAX + 1];
+ char anon_gfid_str[UUID_SIZE + 1];
} afr_private_t;
typedef enum {
@@ -303,18 +349,17 @@ afr_entry_lockee_cmp(const void *l1, const void *l2);
typedef struct {
loc_t *lk_loc;
- int lockee_count;
afr_lockee_t lockee[AFR_LOCKEE_COUNT_MAX];
const char *lk_basename;
const char *lower_basename;
const char *higher_basename;
- char lower_locked;
- char higher_locked;
unsigned char *lower_locked_nodes;
- int32_t lock_count;
+ afr_lock_cbk_t lock_cbk;
+
+ int lockee_count;
int32_t lk_call_count;
int32_t lk_expected_count;
@@ -322,14 +367,15 @@ typedef struct {
int32_t lock_op_ret;
int32_t lock_op_errno;
- afr_lock_cbk_t lock_cbk;
char *domain; /* Domain on which inode/entry lock/unlock in progress.*/
+ int32_t lock_count;
+ char lower_locked;
+ char higher_locked;
} afr_internal_lock_t;
struct afr_reply {
int valid;
int32_t op_ret;
- int32_t op_errno;
dict_t *xattr; /*For xattrop*/
dict_t *xdata;
struct iatt poststat;
@@ -338,6 +384,7 @@ struct afr_reply {
struct iatt preparent;
struct iatt preparent2;
struct iatt postparent2;
+ int32_t op_errno;
/* For rchecksum */
uint8_t checksum[SHA256_DIGEST_LENGTH];
gf_boolean_t buf_has_zeroes;
@@ -361,6 +408,10 @@ typedef struct {
arrives, we continue to read off this subvol.
*/
int readdir_subvol;
+ /* lock-healing related members. */
+ gf_boolean_t is_fd_bad;
+ afr_lk_heal_info_t *lk_heal_info;
+
} afr_fd_ctx_t;
typedef enum {
@@ -377,8 +428,6 @@ typedef struct _afr_inode_lock_t {
*/
int32_t num_inodelks;
unsigned int event_generation;
- gf_boolean_t release;
- gf_boolean_t acquired;
gf_timer_t *delay_timer;
struct list_head owners; /*Transactions that are performing fop*/
struct list_head post_op; /*Transactions that are done with the fop
@@ -387,6 +436,8 @@ typedef struct _afr_inode_lock_t {
*conflicting transactions to complete*/
struct list_head frozen; /*Transactions that need to go as part of
* next batch of eager-lock*/
+ gf_boolean_t release;
+ gf_boolean_t acquired;
} afr_lock_t;
typedef struct _afr_inode_ctx {
@@ -395,15 +446,11 @@ typedef struct _afr_inode_ctx {
int lock_count;
int spb_choice;
gf_timer_t *timer;
- gf_boolean_t need_refresh;
unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS];
int inherited[AFR_NUM_CHANGE_LOGS];
int on_disk[AFR_NUM_CHANGE_LOGS];
-
- /* set if any write on this fd was a non stable write
- (i.e, without O_SYNC or O_DSYNC)
- */
- gf_boolean_t witnessed_unstable_write;
+ /*Only 2 types of transactions support eager-locks now. DATA/METADATA*/
+ afr_lock_t lock[2];
/* @open_fd_count:
Number of open FDs queried from the server, as queried through
@@ -411,8 +458,12 @@ typedef struct _afr_inode_ctx {
temporarily disabled.
*/
uint32_t open_fd_count;
- /*Only 2 types of transactions support eager-locks now. DATA/METADATA*/
- afr_lock_t lock[2];
+ gf_boolean_t need_refresh;
+
+ /* set if any write on this fd was a non stable write
+ (i.e, without O_SYNC or O_DSYNC)
+ */
+ gf_boolean_t witnessed_unstable_write;
} afr_inode_ctx_t;
typedef struct _afr_local {
@@ -426,19 +477,15 @@ typedef struct _afr_local {
unsigned int event_generation;
uint32_t open_fd_count;
- gf_boolean_t update_open_fd_count;
int32_t num_inodelks;
- gf_boolean_t update_num_inodelks;
-
- gf_lkowner_t saved_lk_owner;
int32_t op_ret;
int32_t op_errno;
- int32_t **pending;
-
int dirty[AFR_NUM_CHANGE_LOGS];
+ int32_t **pending;
+
loc_t loc;
loc_t newloc;
@@ -469,14 +516,6 @@ typedef struct _afr_local {
afr_read_txn_wind_t readfn;
- /* @refreshed:
-
- the inode was "refreshed" (i.e, pending xattrs from all subvols
- freshly inspected and inode ctx updated accordingly) as part of
- this transaction already.
- */
- gf_boolean_t refreshed;
-
/* @inode:
the inode on which the read txn is performed on. ref'ed and copied
@@ -501,8 +540,6 @@ typedef struct _afr_local {
unsigned char *readable;
unsigned char *readable2; /*For rename transaction*/
- int read_subvol; /* Current read subvolume */
-
afr_inode_refresh_cbk_t refreshfn;
/* @refreshinode:
@@ -511,9 +548,30 @@ typedef struct _afr_local {
*/
inode_t *refreshinode;
+ dict_t *xattr_req;
+
+ dict_t *dict;
+
+ int read_subvol; /* Current read subvolume */
+
+ int optimistic_change_log;
+
+ afr_internal_lock_t internal_lock;
+
/*To handle setattr/setxattr on yet to be linked inode from dht*/
uuid_t refreshgfid;
+ /* @refreshed:
+
+ the inode was "refreshed" (i.e, pending xattrs from all subvols
+ freshly inspected and inode ctx updated accordingly) as part of
+ this transaction already.
+ */
+ gf_boolean_t refreshed;
+
+ gf_boolean_t update_num_inodelks;
+ gf_boolean_t update_open_fd_count;
+
/*
@pre_op_compat:
@@ -523,14 +581,6 @@ typedef struct _afr_local {
gf_boolean_t pre_op_compat;
- dict_t *xattr_req;
-
- afr_internal_lock_t internal_lock;
-
- dict_t *dict;
-
- int optimistic_change_log;
-
/* Is the current writev() going to perform a stable write?
i.e, is fd->flags or @flags writev param have O_SYNC or
O_DSYNC?
@@ -549,25 +599,25 @@ typedef struct _afr_local {
struct {
struct {
- gf_boolean_t needs_fresh_lookup;
- uuid_t gfid_req;
- } lookup;
-
- struct {
- unsigned char buf_set;
struct statvfs buf;
+ unsigned char buf_set;
} statfs;
struct {
- int32_t flags;
fd_t *fd;
+ int32_t flags;
} open;
struct {
- int32_t cmd;
struct gf_flock user_flock;
struct gf_flock ret_flock;
unsigned char *locked_nodes;
+ int32_t cmd;
+ /*For lock healing only.*/
+ unsigned char *dom_locked_nodes;
+ int32_t *dom_lock_op_ret;
+ int32_t *dom_lock_op_errno;
+ struct gf_flock *getlk_rsp;
} lk;
/* inode read */
@@ -592,8 +642,8 @@ typedef struct _afr_local {
struct {
char *name;
- int last_index;
long xattr_len;
+ int last_index;
} getxattr;
struct {
@@ -606,11 +656,10 @@ typedef struct _afr_local {
/* dir read */
struct {
+ uint32_t *checksum;
int success_count;
int32_t op_ret;
int32_t op_errno;
-
- uint32_t *checksum;
} opendir;
struct {
@@ -619,8 +668,8 @@ typedef struct _afr_local {
size_t size;
off_t offset;
dict_t *dict;
- gf_boolean_t failed;
int last_index;
+ gf_boolean_t failed;
} readdir;
/* inode write */
@@ -630,12 +679,11 @@ typedef struct _afr_local {
} inode_wfop; // common structure for all inode-write-fops
struct {
- int32_t op_ret;
-
struct iovec *vector;
struct iobref *iobref;
- int32_t count;
off_t offset;
+ int32_t op_ret;
+ int32_t count;
uint32_t flags;
} writev;
@@ -695,29 +743,25 @@ typedef struct _afr_local {
} create;
struct {
+ dict_t *params;
dev_t dev;
mode_t mode;
- dict_t *params;
} mknod;
struct {
- int32_t mode;
dict_t *params;
+ int32_t mode;
} mkdir;
struct {
- int flags;
- } rmdir;
-
- struct {
dict_t *params;
char *linkpath;
} symlink;
struct {
- int32_t mode;
off_t offset;
size_t len;
+ int32_t mode;
} fallocate;
struct {
@@ -744,10 +788,10 @@ typedef struct _afr_local {
struct {
char *volume;
char *basename;
+ void *xdata;
entrylk_cmd in_cmd;
entrylk_cmd cmd;
entrylk_type type;
- void *xdata;
} entrylk;
struct {
@@ -756,31 +800,33 @@ typedef struct _afr_local {
} seek;
struct {
- int32_t datasync;
- } fsync;
-
- struct {
struct gf_lease user_lease;
struct gf_lease ret_lease;
unsigned char *locked_nodes;
} lease;
- } cont;
+ struct {
+ int flags;
+ } rmdir;
- struct {
- off_t start, len;
+ struct {
+ int32_t datasync;
+ } fsync;
- gf_boolean_t eager_lock_on;
- gf_boolean_t do_eager_unlock;
+ struct {
+ uuid_t gfid_req;
+ gf_boolean_t needs_fresh_lookup;
+ } lookup;
+
+ } cont;
+ struct {
char *basename;
char *new_basename;
loc_t parent_loc;
loc_t new_parent_loc;
- afr_transaction_type type;
-
/* stub to resume on destruction
of the transaction frame */
call_stub_t *resume_stub;
@@ -798,6 +844,30 @@ typedef struct _afr_local {
FOP failed. */
unsigned char *failed_subvols;
+ call_frame_t *main_frame; /*Fop frame*/
+ call_frame_t *frame; /*Transaction frame*/
+
+ int (*wind)(call_frame_t *frame, xlator_t *this, int subvol);
+
+ int (*unwind)(call_frame_t *frame, xlator_t *this);
+
+ off_t start, len;
+
+ afr_transaction_type type;
+
+ int32_t in_flight_sb_errno; /* This is where the cause of the
+ failure on the last good copy of
+ the file is stored.
+ */
+
+ /* @changelog_resume: function to be called after changelogging
+ (either pre-op or post-op) is done
+ */
+ afr_changelog_resume_t changelog_resume;
+
+ gf_boolean_t eager_lock_on;
+ gf_boolean_t do_eager_unlock;
+
/* @dirtied: flag which indicates whether we set dirty flag
in the OP. Typically true when we are performing operation
on more than one subvol and optimistic changelog is disabled
@@ -822,6 +892,10 @@ typedef struct _afr_local {
*/
gf_boolean_t no_uninherit;
+ gf_boolean_t in_flight_sb; /* Indicator for occurrence of
+ split-brain while in the middle of
+ a txn. */
+
/* @uninherit_done:
@uninherit_value:
@@ -834,27 +908,7 @@ typedef struct _afr_local {
gf_boolean_t uninherit_done;
gf_boolean_t uninherit_value;
- gf_boolean_t in_flight_sb; /* Indicator for occurrence of
- split-brain while in the middle of
- a txn. */
- int32_t in_flight_sb_errno; /* This is where the cause of the
- failure on the last good copy of
- the file is stored.
- */
-
- /* @changelog_resume: function to be called after changlogging
- (either pre-op or post-op) is done
- */
- afr_changelog_resume_t changelog_resume;
-
- call_frame_t *main_frame; /*Fop frame*/
- call_frame_t *frame; /*Transaction frame*/
-
- int (*wind)(call_frame_t *frame, xlator_t *this, int subvol);
-
- int (*unwind)(call_frame_t *frame, xlator_t *this);
-
- /* post-op hook */
+ gf_boolean_t disable_delayed_post_op;
} transaction;
syncbarrier_t barrier;
@@ -867,34 +921,36 @@ typedef struct _afr_local {
mode_t umask;
int xflag;
- gf_boolean_t do_discovery;
struct afr_reply *replies;
/* For client side background heals. */
struct list_head healer;
call_frame_t *heal_frame;
- gf_boolean_t need_full_crawl;
- afr_fop_lock_state_t fop_lock_state;
-
- gf_boolean_t is_read_txn;
afr_inode_ctx_t *inode_ctx;
/*For thin-arbiter transactions.*/
- unsigned char ta_child_up;
+ int ta_failed_subvol;
+ int ta_event_gen;
struct list_head ta_waitq;
struct list_head ta_onwireq;
afr_ta_fop_state_t fop_state;
- int ta_failed_subvol;
+ afr_fop_lock_state_t fop_lock_state;
+ gf_lkowner_t saved_lk_owner;
+ unsigned char read_txn_query_child;
+ unsigned char ta_child_up;
+ gf_boolean_t do_discovery;
+ gf_boolean_t need_full_crawl;
+ gf_boolean_t is_read_txn;
gf_boolean_t is_new_entry;
} afr_local_t;
typedef struct afr_spbc_timeout {
call_frame_t *frame;
- gf_boolean_t d_spb;
- gf_boolean_t m_spb;
loc_t *loc;
int spb_child_index;
+ gf_boolean_t d_spb;
+ gf_boolean_t m_spb;
} afr_spbc_timeout_t;
typedef struct afr_spb_status {
@@ -904,9 +960,9 @@ typedef struct afr_spb_status {
typedef struct afr_empty_brick_args {
call_frame_t *frame;
+ char *op_type;
loc_t loc;
int empty_index;
- char *op_type;
} afr_empty_brick_args_t;
typedef struct afr_read_subvol_args {
@@ -948,7 +1004,10 @@ afr_inode_read_subvol_set(inode_t *inode, xlator_t *this,
int event_generation);
int
-afr_inode_event_gen_reset(inode_t *inode, xlator_t *this);
+__afr_inode_need_refresh_set(inode_t *inode, xlator_t *this);
+
+int
+afr_inode_need_refresh_set(inode_t *inode, xlator_t *this);
int
afr_read_subvol_select_by_policy(inode_t *inode, xlator_t *this,
@@ -1064,6 +1123,9 @@ afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd);
if (__local && __local->is_read_txn) \
afr_pending_read_decrement(__this->private, \
__local->read_subvol); \
+ if (__local && __local->xdata_req && \
+ afr_is_lock_mode_mandatory(__local->xdata_req)) \
+ afr_dom_lock_release(frame); \
frame->local = NULL; \
} \
\
@@ -1091,8 +1153,8 @@ afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd);
#define AFR_FRAME_INIT(frame, op_errno) \
({ \
frame->local = mem_get0(THIS->local_pool); \
- if (afr_local_init(frame->local, THIS->private, &op_errno)) { \
- afr_local_cleanup(frame->local, THIS); \
+ if (afr_local_init(frame->local, frame->this->private, &op_errno)) { \
+ afr_local_cleanup(frame->local, frame->this); \
mem_put(frame->local); \
frame->local = NULL; \
}; \
@@ -1216,8 +1278,8 @@ int
afr_inode_split_brain_choice_set(inode_t *inode, xlator_t *this,
int spb_choice);
int
-afr_inode_split_brain_choice_get(inode_t *inode, xlator_t *this,
- int *spb_choice);
+afr_split_brain_read_subvol_get(inode_t *inode, xlator_t *this,
+ call_frame_t *frame, int *spb_subvol);
int
afr_get_child_index_from_name(xlator_t *this, char *name);
@@ -1302,7 +1364,7 @@ int
afr_set_inode_local(xlator_t *this, afr_local_t *local, inode_t *inode);
int
-afr_fill_ta_loc(xlator_t *this, loc_t *loc);
+afr_fill_ta_loc(xlator_t *this, loc_t *loc, gf_boolean_t is_gfid_based_fop);
int
afr_ta_post_op_lock(xlator_t *this, loc_t *loc);
@@ -1325,11 +1387,37 @@ afr_ta_has_quorum(afr_private_t *priv, afr_local_t *local);
void
afr_ta_lock_release_synctask(xlator_t *this);
+void
+afr_ta_locked_priv_invalidate(afr_private_t *priv);
+
gf_boolean_t
-afr_lookup_has_quorum(call_frame_t *frame, xlator_t *this,
- unsigned char *subvols);
+afr_lookup_has_quorum(call_frame_t *frame,
+ const unsigned int up_children_count);
void
afr_mark_new_entry_changelog(call_frame_t *frame, xlator_t *this);
+void
+afr_handle_replies_quorum(call_frame_t *frame, xlator_t *this);
+
+gf_boolean_t
+afr_ta_dict_contains_pending_xattr(dict_t *dict, afr_private_t *priv,
+ int child);
+
+void
+afr_selfheal_childup(xlator_t *this, afr_private_t *priv);
+
+gf_boolean_t
+afr_is_lock_mode_mandatory(dict_t *xdata);
+
+void
+afr_dom_lock_release(call_frame_t *frame);
+
+void
+afr_fill_success_replies(afr_local_t *local, afr_private_t *priv,
+ unsigned char *replies);
+
+gf_boolean_t
+afr_is_private_directory(afr_private_t *priv, uuid_t pargfid, const char *name,
+ pid_t pid);
#endif /* __AFR_H__ */