summaryrefslogtreecommitdiffstats
path: root/xlators/cluster/afr
diff options
context:
space:
mode:
Diffstat (limited to 'xlators/cluster/afr')
-rw-r--r--xlators/cluster/afr/src/afr-common.c317
-rw-r--r--xlators/cluster/afr/src/afr-dir-read.c15
-rw-r--r--xlators/cluster/afr/src/afr-dir-write.c14
-rw-r--r--xlators/cluster/afr/src/afr-inode-write.c11
-rw-r--r--xlators/cluster/afr/src/afr-open.c8
-rw-r--r--xlators/cluster/afr/src/afr-read-txn.c10
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.c199
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-data.c22
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-entry.c211
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-metadata.c62
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-name.c41
-rw-r--r--xlators/cluster/afr/src/afr-self-heal.h14
-rw-r--r--xlators/cluster/afr/src/afr-self-heald.c193
-rw-r--r--xlators/cluster/afr/src/afr-self-heald.h2
-rw-r--r--xlators/cluster/afr/src/afr-transaction.c152
-rw-r--r--xlators/cluster/afr/src/afr.c49
-rw-r--r--xlators/cluster/afr/src/afr.h29
17 files changed, 934 insertions, 415 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index a2f0b2a..032ab5c 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -45,7 +45,42 @@ afr_quorum_errno(afr_private_t *priv)
return ENOTCONN;
}
-static void
+gf_boolean_t
+afr_is_private_directory(afr_private_t *priv, uuid_t pargfid, const char *name,
+ pid_t pid)
+{
+ if (!__is_root_gfid(pargfid)) {
+ return _gf_false;
+ }
+
+ if (strcmp(name, GF_REPLICATE_TRASH_DIR) == 0) {
+ /*For backward compatibility /.landfill is private*/
+ return _gf_true;
+ }
+
+ if (pid == GF_CLIENT_PID_GSYNCD) {
+ /*geo-rep needs to create/sync private directory on slave because
+ * it appears in changelog*/
+ return _gf_false;
+ }
+
+ if (pid == GF_CLIENT_PID_GLFS_HEAL || pid == GF_CLIENT_PID_SELF_HEALD) {
+ if (strcmp(name, priv->anon_inode_name) == 0) {
+ /* anonymous-inode dir is private*/
+ return _gf_true;
+ }
+ } else {
+ if (strncmp(name, AFR_ANON_DIR_PREFIX, strlen(AFR_ANON_DIR_PREFIX)) ==
+ 0) {
+ /* anonymous-inode dir prefix is private for geo-rep to work*/
+ return _gf_true;
+ }
+ }
+
+ return _gf_false;
+}
+
+void
afr_fill_success_replies(afr_local_t *local, afr_private_t *priv,
unsigned char *replies)
{
@@ -885,7 +920,7 @@ __afr_set_in_flight_sb_status(xlator_t *this, afr_local_t *local,
metadatamap |= (1 << index);
}
if (metadatamap_old != metadatamap) {
- event = 0;
+ __afr_inode_need_refresh_set(inode, this);
}
break;
@@ -898,7 +933,7 @@ __afr_set_in_flight_sb_status(xlator_t *this, afr_local_t *local,
datamap |= (1 << index);
}
if (datamap_old != datamap)
- event = 0;
+ __afr_inode_need_refresh_set(inode, this);
break;
default:
@@ -1062,34 +1097,6 @@ out:
}
int
-__afr_inode_event_gen_reset_small(inode_t *inode, xlator_t *this)
-{
- int ret = -1;
- uint16_t datamap = 0;
- uint16_t metadatamap = 0;
- uint32_t event = 0;
- uint64_t val = 0;
- afr_inode_ctx_t *ctx = NULL;
-
- ret = __afr_inode_ctx_get(this, inode, &ctx);
- if (ret)
- return ret;
-
- val = ctx->read_subvol;
-
- metadatamap = (val & 0x000000000000ffff) >> 0;
- datamap = (val & 0x00000000ffff0000) >> 16;
- event = 0;
-
- val = ((uint64_t)metadatamap) | (((uint64_t)datamap) << 16) |
- (((uint64_t)event) << 32);
-
- ctx->read_subvol = val;
-
- return ret;
-}
-
-int
__afr_inode_read_subvol_get(inode_t *inode, xlator_t *this, unsigned char *data,
unsigned char *metadata, int *event_p)
{
@@ -1160,22 +1167,6 @@ out:
}
int
-__afr_inode_event_gen_reset(inode_t *inode, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- int ret = -1;
-
- priv = this->private;
-
- if (priv->child_count <= 16)
- ret = __afr_inode_event_gen_reset_small(inode, this);
- else
- ret = -1;
-
- return ret;
-}
-
-int
afr_inode_read_subvol_get(inode_t *inode, xlator_t *this, unsigned char *data,
unsigned char *metadata, int *event_p)
{
@@ -1241,12 +1232,11 @@ afr_inode_get_readable(call_frame_t *frame, inode_t *inode, xlator_t *this,
return 0;
}
-int
+static int
afr_inode_split_brain_choice_get(inode_t *inode, xlator_t *this,
int *spb_choice)
{
int ret = -1;
-
GF_VALIDATE_OR_GOTO(this->name, inode, out);
LOCK(&inode->lock);
@@ -1258,6 +1248,40 @@ out:
return ret;
}
+/*
+ * frame is used to get the favourite-child policy. Since
+ * afr_inode_split_brain_choice_get was called from afr_open, it is possible
+ * to have a frame without local->replies. In that case frame is passed as
+ * NULL, so this function must handle a NULL frame.
+ */
+int
+afr_split_brain_read_subvol_get(inode_t *inode, xlator_t *this,
+ call_frame_t *frame, int *spb_subvol)
+{
+ int ret = -1;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ GF_VALIDATE_OR_GOTO("afr", this, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
+ GF_VALIDATE_OR_GOTO(this->name, spb_subvol, out);
+
+ priv = this->private;
+
+ ret = afr_inode_split_brain_choice_get(inode, this, spb_subvol);
+ if (*spb_subvol < 0 && priv->fav_child_policy && frame && frame->local) {
+ local = frame->local;
+ *spb_subvol = afr_sh_get_fav_by_policy(this, local->replies, inode,
+ NULL);
+ if (*spb_subvol >= 0) {
+ ret = 0;
+ }
+ }
+
+out:
+ return ret;
+}
int
afr_inode_read_subvol_set(inode_t *inode, xlator_t *this, unsigned char *data,
unsigned char *metadata, int event)
@@ -1324,30 +1348,22 @@ out:
return need_refresh;
}
-static int
-afr_inode_need_refresh_set(inode_t *inode, xlator_t *this)
+int
+__afr_inode_need_refresh_set(inode_t *inode, xlator_t *this)
{
int ret = -1;
afr_inode_ctx_t *ctx = NULL;
- GF_VALIDATE_OR_GOTO(this->name, inode, out);
-
- LOCK(&inode->lock);
- {
- ret = __afr_inode_ctx_get(this, inode, &ctx);
- if (ret)
- goto unlock;
-
+ ret = __afr_inode_ctx_get(this, inode, &ctx);
+ if (ret == 0) {
ctx->need_refresh = _gf_true;
}
-unlock:
- UNLOCK(&inode->lock);
-out:
+
return ret;
}
int
-afr_inode_event_gen_reset(inode_t *inode, xlator_t *this)
+afr_inode_need_refresh_set(inode_t *inode, xlator_t *this)
{
int ret = -1;
@@ -1355,7 +1371,7 @@ afr_inode_event_gen_reset(inode_t *inode, xlator_t *this)
LOCK(&inode->lock);
{
- ret = __afr_inode_event_gen_reset(inode, this);
+ ret = __afr_inode_need_refresh_set(inode, this);
}
UNLOCK(&inode->lock);
out:
@@ -1790,7 +1806,7 @@ afr_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err)
ret = afr_inode_get_readable(frame, inode, this, local->readable,
&event_generation, local->transaction.type);
- if (ret == -EIO || (local->is_read_txn && !event_generation)) {
+ if (ret == -EIO) {
/* No readable subvolume even after refresh ==> splitbrain.*/
if (!priv->fav_child_policy) {
err = EIO;
@@ -2290,8 +2306,9 @@ afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv,
* need is a low probability that multiple clients
* won't converge on the same subvolume.
*/
+ gf_uuid_copy(gfid_copy, args->gfid);
pid = getpid();
- memcpy(gfid_copy, &pid, sizeof(pid));
+ *(pid_t *)gfid_copy ^= pid;
}
child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) %
priv->child_count;
@@ -2875,7 +2892,7 @@ afr_attempt_readsubvol_set(call_frame_t *frame, xlator_t *this,
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- int spb_choice = -1;
+ int spb_subvol = -1;
int child_count = -1;
if (*read_subvol != -1)
@@ -2885,10 +2902,10 @@ afr_attempt_readsubvol_set(call_frame_t *frame, xlator_t *this,
local = frame->local;
child_count = priv->child_count;
- afr_inode_split_brain_choice_get(local->inode, this, &spb_choice);
- if ((spb_choice >= 0) &&
+ afr_split_brain_read_subvol_get(local->inode, this, frame, &spb_subvol);
+ if ((spb_subvol >= 0) &&
(AFR_COUNT(success_replies, child_count) == child_count)) {
- *read_subvol = spb_choice;
+ *read_subvol = spb_subvol;
} else if (!priv->quorum_count ||
frame->root->pid == GF_CLIENT_PID_GLFS_HEAL) {
*read_subvol = afr_first_up_child(frame, this);
@@ -2929,6 +2946,7 @@ afr_lookup_done(call_frame_t *frame, xlator_t *this)
0,
};
gf_boolean_t locked_entry = _gf_false;
+ gf_boolean_t in_flight_create = _gf_false;
gf_boolean_t can_interpret = _gf_true;
inode_t *parent = NULL;
ia_type_t ia_type = IA_INVAL;
@@ -2972,17 +2990,12 @@ afr_lookup_done(call_frame_t *frame, xlator_t *this)
if (!replies[i].valid)
continue;
- if (locked_entry && replies[i].op_ret == -1 &&
- replies[i].op_errno == ENOENT) {
- /* Second, check entry is still
- "underway" in creation */
- local->op_ret = -1;
- local->op_errno = ENOENT;
- goto error;
- }
-
- if (replies[i].op_ret == -1)
+ if (replies[i].op_ret == -1) {
+ if (locked_entry && replies[i].op_errno == ENOENT) {
+ in_flight_create = _gf_true;
+ }
continue;
+ }
if (read_subvol == -1 || !readable[read_subvol]) {
read_subvol = i;
@@ -2992,6 +3005,12 @@ afr_lookup_done(call_frame_t *frame, xlator_t *this)
}
}
+ if (in_flight_create && !afr_has_quorum(success_replies, this, NULL)) {
+ local->op_ret = -1;
+ local->op_errno = ENOENT;
+ goto error;
+ }
+
if (read_subvol == -1)
goto error;
/* We now have a read_subvol, which is readable[] (if there
@@ -3050,7 +3069,7 @@ afr_lookup_done(call_frame_t *frame, xlator_t *this)
if (read_subvol == -1)
goto cant_interpret;
if (ret) {
- afr_inode_event_gen_reset(local->inode, this);
+ afr_inode_need_refresh_set(local->inode, this);
dict_del_sizen(local->replies[read_subvol].xdata, GF_CONTENT_KEY);
}
} else {
@@ -3103,7 +3122,7 @@ error:
* others in that they must be given higher priority while
* returning to the user.
*
- * The hierarchy is ENODATA > ENOENT > ESTALE > others
+ * The hierarchy is ENODATA > ENOENT > ESTALE > ENOSPC > others
*/
int
@@ -3115,6 +3134,8 @@ afr_higher_errno(int32_t old_errno, int32_t new_errno)
return ENOENT;
if (old_errno == ESTALE || new_errno == ESTALE)
return ESTALE;
+ if (old_errno == ENOSPC || new_errno == ENOSPC)
+ return ENOSPC;
return new_errno;
}
@@ -3606,6 +3627,7 @@ afr_discover_unwind(call_frame_t *frame, xlator_t *this)
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
int read_subvol = -1;
+ int ret = 0;
unsigned char *data_readable = NULL;
unsigned char *success_replies = NULL;
@@ -3627,7 +3649,10 @@ afr_discover_unwind(call_frame_t *frame, xlator_t *this)
if (!afr_has_quorum(success_replies, this, frame))
goto unwind;
- afr_replies_interpret(frame, this, local->inode, NULL);
+ ret = afr_replies_interpret(frame, this, local->inode, NULL);
+ if (ret) {
+ afr_inode_need_refresh_set(local->inode, this);
+ }
read_subvol = afr_read_subvol_decide(local->inode, this, NULL,
data_readable);
@@ -3679,7 +3704,7 @@ afr_ta_id_file_check(void *opaque)
this = opaque;
priv = this->private;
- ret = afr_fill_ta_loc(this, &loc);
+ ret = afr_fill_ta_loc(this, &loc, _gf_false);
if (ret) {
gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
"Failed to populate thin-arbiter loc for: %s.", loc.name);
@@ -3888,11 +3913,7 @@ afr_discover(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
afr_read_subvol_get(loc->inode, this, NULL, NULL, &event,
AFR_DATA_TRANSACTION, NULL);
- if (afr_is_inode_refresh_reqd(loc->inode, this, event,
- local->event_generation))
- afr_inode_refresh(frame, this, loc->inode, NULL, afr_discover_do);
- else
- afr_discover_do(frame, this, 0);
+ afr_discover_do(frame, this, 0);
return 0;
out:
@@ -3993,11 +4014,10 @@ afr_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
return 0;
}
- if (__is_root_gfid(loc->parent->gfid)) {
- if (!strcmp(loc->name, GF_REPLICATE_TRASH_DIR)) {
- op_errno = EPERM;
- goto out;
- }
+ if (afr_is_private_directory(this->private, loc->parent->gfid, loc->name,
+ frame->root->pid)) {
+ op_errno = EPERM;
+ goto out;
}
local = AFR_FRAME_INIT(frame, op_errno);
@@ -4033,11 +4053,7 @@ afr_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
afr_read_subvol_get(loc->parent, this, NULL, NULL, &event,
AFR_DATA_TRANSACTION, NULL);
- if (afr_is_inode_refresh_reqd(loc->inode, this, event,
- local->event_generation))
- afr_inode_refresh(frame, this, loc->parent, NULL, afr_lookup_do);
- else
- afr_lookup_do(frame, this, 0);
+ afr_lookup_do(frame, this, 0);
return 0;
out:
@@ -5665,6 +5681,8 @@ afr_priv_dump(xlator_t *this)
GF_ATOMIC_GET(priv->pending_reads[i]));
sprintf(key, "child_latency[%d]", i);
gf_proc_dump_write(key, "%" PRId64, priv->child_latency[i]);
+ sprintf(key, "halo_child_up[%d]", i);
+ gf_proc_dump_write(key, "%d", priv->halo_child_up[i]);
}
gf_proc_dump_write("data_self_heal", "%d", priv->data_self_heal);
gf_proc_dump_write("metadata_self_heal", "%d", priv->metadata_self_heal);
@@ -5677,6 +5695,7 @@ afr_priv_dump(xlator_t *this)
priv->background_self_heal_count);
gf_proc_dump_write("healers", "%d", priv->healers);
gf_proc_dump_write("read-hash-mode", "%d", priv->hash_mode);
+ gf_proc_dump_write("use-anonymous-inode", "%d", priv->use_anon_inode);
if (priv->quorum_count == AFR_QUORUM_AUTO) {
gf_proc_dump_write("quorum-type", "auto");
} else if (priv->quorum_count == 0) {
@@ -5737,13 +5756,31 @@ __afr_get_up_children_count(afr_private_t *priv)
return up_children;
}
+static int
+__get_heard_from_all_status(xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ int i;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!priv->last_event[i]) {
+ return 0;
+ }
+ }
+ if (priv->thin_arbiter_count && !priv->ta_child_up) {
+ return 0;
+ }
+ return 1;
+}
+
glusterfs_event_t
-__afr_transform_event_from_state(afr_private_t *priv)
+__afr_transform_event_from_state(xlator_t *this)
{
int i = 0;
int up_children = 0;
+ afr_private_t *priv = this->private;
- if (AFR_COUNT(priv->last_event, priv->child_count) == priv->child_count)
+ if (__get_heard_from_all_status(this))
/* have_heard_from_all. Let afr_notify() do the propagation. */
return GF_EVENT_MAXVAL;
@@ -5785,7 +5822,7 @@ afr_notify_cbk(void *data)
goto unlock;
}
priv->timer = NULL;
- event = __afr_transform_event_from_state(priv);
+ event = __afr_transform_event_from_state(this);
if (event != GF_EVENT_MAXVAL)
propagate = _gf_true;
}
@@ -5813,20 +5850,6 @@ __afr_launch_notify_timer(xlator_t *this, afr_private_t *priv)
}
static int
-__get_heard_from_all_status(xlator_t *this)
-{
- afr_private_t *priv = this->private;
- int i;
-
- for (i = 0; i < priv->child_count; i++) {
- if (!priv->last_event[i]) {
- return 0;
- }
- }
- return 1;
-}
-
-static int
find_best_down_child(xlator_t *this)
{
afr_private_t *priv = NULL;
@@ -5837,7 +5860,7 @@ find_best_down_child(xlator_t *this)
priv = this->private;
for (i = 0; i < priv->child_count; i++) {
- if (priv->child_up[i] && priv->child_latency[i] >= 0 &&
+ if (!priv->child_up[i] && priv->child_latency[i] >= 0 &&
priv->child_latency[i] < best_latency) {
best_child = i;
best_latency = priv->child_latency[i];
@@ -5909,7 +5932,9 @@ __afr_handle_ping_event(xlator_t *this, xlator_t *child_xlator, const int idx,
"), "
"marking child down.",
child_latency_msec, halo_max_latency_msec);
- *event = GF_EVENT_CHILD_DOWN;
+ if (priv->halo_child_up[idx]) {
+ *event = GF_EVENT_CHILD_DOWN;
+ }
}
} else if (child_latency_msec < halo_max_latency_msec &&
priv->child_up[idx] == 0) {
@@ -5921,7 +5946,9 @@ __afr_handle_ping_event(xlator_t *this, xlator_t *child_xlator, const int idx,
"), "
"marking child up.",
child_latency_msec, halo_max_latency_msec);
- *event = GF_EVENT_CHILD_UP;
+ if (priv->halo_child_up[idx]) {
+ *event = GF_EVENT_CHILD_UP;
+ }
} else {
gf_log(child_xlator->name, GF_LOG_INFO,
"Not marking child %d up, "
@@ -5988,7 +6015,10 @@ __afr_handle_child_up_event(xlator_t *this, xlator_t *child_xlator,
if (child_latency_msec < 0) {
/*set to INT64_MAX-1 so that it is found for best_down_child*/
- priv->child_latency[idx] = AFR_HALO_MAX_LATENCY;
+ priv->halo_child_up[idx] = 1;
+ if (priv->child_latency[idx] < 0) {
+ priv->child_latency[idx] = AFR_HALO_MAX_LATENCY;
+ }
}
/*
@@ -6077,6 +6107,7 @@ __afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx,
*/
if (child_latency_msec < 0) {
priv->child_latency[idx] = child_latency_msec;
+ priv->halo_child_up[idx] = 0;
}
priv->child_up[idx] = 0;
@@ -6641,6 +6672,8 @@ afr_priv_destroy(afr_private_t *priv)
if (!priv)
goto out;
+
+ GF_FREE(priv->sh_domain);
GF_FREE(priv->last_event);
child_count = priv->child_count;
@@ -6656,7 +6689,9 @@ afr_priv_destroy(afr_private_t *priv)
GF_FREE(priv->local);
GF_FREE(priv->pending_key);
GF_FREE(priv->children);
+ GF_FREE(priv->anon_inode);
GF_FREE(priv->child_up);
+ GF_FREE(priv->halo_child_up);
GF_FREE(priv->child_latency);
LOCK_DESTROY(&priv->lock);
@@ -6794,8 +6829,8 @@ afr_is_dirty_count_non_unary(xlator_t *this, struct afr_reply *replies,
static int
afr_update_heal_status(xlator_t *this, struct afr_reply *replies,
- char *index_vgfid, ia_type_t ia_type, gf_boolean_t *esh,
- gf_boolean_t *dsh, gf_boolean_t *msh)
+ ia_type_t ia_type, gf_boolean_t *esh, gf_boolean_t *dsh,
+ gf_boolean_t *msh, unsigned char pending)
{
int ret = -1;
GF_UNUSED int ret1 = 0;
@@ -6825,14 +6860,7 @@ afr_update_heal_status(xlator_t *this, struct afr_reply *replies,
}
}
- if (!strcmp(index_vgfid, GF_XATTROP_INDEX_GFID)) {
- if (shd_domain_lk_count) {
- ret = -EAGAIN; /*For 'possibly-healing'. */
- } else {
- ret = 0; /*needs heal. Just set a non -ve value so that it is
- assumed as the source index.*/
- }
- } else if (!strcmp(index_vgfid, GF_XATTROP_DIRTY_GFID)) {
+ if (!pending) {
if ((afr_is_dirty_count_non_unary(this, replies, ia_type)) ||
(!io_domain_lk_count)) {
/* Needs heal. */
@@ -6841,6 +6869,13 @@ afr_update_heal_status(xlator_t *this, struct afr_reply *replies,
/* No heal needed. */
*dsh = *esh = *msh = 0;
}
+ } else {
+ if (shd_domain_lk_count) {
+ ret = -EAGAIN; /*For 'possibly-healing'. */
+ } else {
+ ret = 0; /*needs heal. Just set a non -ve value so that it is
+ assumed as the source index.*/
+ }
}
return ret;
}
@@ -6848,8 +6883,8 @@ afr_update_heal_status(xlator_t *this, struct afr_reply *replies,
/*return EIO, EAGAIN or pending*/
int
afr_lockless_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid,
- inode_t **inode, char *index_vgfid,
- gf_boolean_t *entry_selfheal, gf_boolean_t *data_selfheal,
+ inode_t **inode, gf_boolean_t *entry_selfheal,
+ gf_boolean_t *data_selfheal,
gf_boolean_t *metadata_selfheal, unsigned char *pending)
{
int ret = -1;
@@ -6908,8 +6943,8 @@ afr_lockless_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid,
goto out;
}
- ret = afr_update_heal_status(this, replies, index_vgfid, (*inode)->ia_type,
- &esh, &dsh, &msh);
+ ret = afr_update_heal_status(this, replies, (*inode)->ia_type, &esh, &dsh,
+ &msh, *pending);
out:
*data_selfheal = dsh;
*entry_selfheal = esh;
@@ -6934,14 +6969,6 @@ afr_get_heal_info(call_frame_t *frame, xlator_t *this, loc_t *loc)
char *status = NULL;
call_frame_t *heal_frame = NULL;
afr_local_t *heal_local = NULL;
- afr_local_t *local = NULL;
- char *index_vgfid = NULL;
-
- local = frame->local;
- if (dict_get_str(local->xdata_req, "index-vgfid", &index_vgfid)) {
- ret = -1;
- goto out;
- }
/*Use frame with lk-owner set*/
heal_frame = afr_frame_create(frame->this, &op_errno);
@@ -6952,7 +6979,7 @@ afr_get_heal_info(call_frame_t *frame, xlator_t *this, loc_t *loc)
heal_local = heal_frame->local;
heal_frame->local = frame->local;
- ret = afr_lockless_inspect(heal_frame, this, loc->gfid, &inode, index_vgfid,
+ ret = afr_lockless_inspect(heal_frame, this, loc->gfid, &inode,
&entry_selfheal, &data_selfheal,
&metadata_selfheal, &pending);
@@ -7427,7 +7454,7 @@ afr_fav_child_reset_sink_xattrs(void *opaque)
ret = afr_selfheal_inodelk(heal_frame, this, inode, this->name, 0, 0,
locked_on);
{
- if (ret < AFR_SH_MIN_PARTICIPANTS)
+ if (ret < priv->child_count)
goto data_unlock;
ret = __afr_selfheal_data_prepare(
heal_frame, this, inode, locked_on, sources, sinks,
@@ -7444,7 +7471,7 @@ afr_fav_child_reset_sink_xattrs(void *opaque)
ret = afr_selfheal_inodelk(heal_frame, this, inode, this->name,
LLONG_MAX - 1, 0, locked_on);
{
- if (ret < AFR_SH_MIN_PARTICIPANTS)
+ if (ret < priv->child_count)
goto mdata_unlock;
ret = __afr_selfheal_metadata_prepare(
heal_frame, this, inode, locked_on, sources, sinks,
diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c
index 74f71fd..f8bf834 100644
--- a/xlators/cluster/afr/src/afr-dir-read.c
+++ b/xlators/cluster/afr/src/afr-dir-read.c
@@ -67,7 +67,8 @@ afr_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
}
int
-afr_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd)
+afr_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+ dict_t *xdata)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
@@ -163,8 +164,8 @@ afr_validate_read_subvol(inode_t *inode, xlator_t *this, int par_read_subvol)
}
static void
-afr_readdir_transform_entries(gf_dirent_t *subvol_entries, int subvol,
- gf_dirent_t *entries, fd_t *fd)
+afr_readdir_transform_entries(call_frame_t *frame, gf_dirent_t *subvol_entries,
+ int subvol, gf_dirent_t *entries, fd_t *fd)
{
int ret = -1;
gf_dirent_t *entry = NULL;
@@ -182,8 +183,8 @@ afr_readdir_transform_entries(gf_dirent_t *subvol_entries, int subvol,
list_for_each_entry_safe(entry, tmp, &subvol_entries->list, list)
{
- if (__is_root_gfid(fd->inode->gfid) &&
- !strcmp(entry->d_name, GF_REPLICATE_TRASH_DIR)) {
+ if (afr_is_private_directory(priv, fd->inode->gfid, entry->d_name,
+ frame->root->pid)) {
continue;
}
@@ -227,8 +228,8 @@ afr_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
}
if (op_ret >= 0)
- afr_readdir_transform_entries(subvol_entries, (long)cookie, &entries,
- local->fd);
+ afr_readdir_transform_entries(frame, subvol_entries, (long)cookie,
+ &entries, local->fd);
AFR_STACK_UNWIND(readdir, frame, op_ret, op_errno, &entries, xdata);
diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c
index e96b7d0..b7cceb7 100644
--- a/xlators/cluster/afr/src/afr-dir-write.c
+++ b/xlators/cluster/afr/src/afr-dir-write.c
@@ -119,11 +119,11 @@ __afr_dir_write_finalize(call_frame_t *frame, xlator_t *this)
continue;
if (local->replies[i].op_ret < 0) {
if (local->inode)
- afr_inode_event_gen_reset(local->inode, this);
+ afr_inode_need_refresh_set(local->inode, this);
if (local->parent)
- afr_inode_event_gen_reset(local->parent, this);
+ afr_inode_need_refresh_set(local->parent, this);
if (local->parent2)
- afr_inode_event_gen_reset(local->parent2, this);
+ afr_inode_need_refresh_set(local->parent2, this);
continue;
}
@@ -345,6 +345,7 @@ afr_mark_entry_pending_changelog(call_frame_t *frame, xlator_t *this)
afr_private_t *priv = NULL;
int pre_op_count = 0;
int failed_count = 0;
+ unsigned char *success_replies = NULL;
local = frame->local;
priv = this->private;
@@ -360,9 +361,16 @@ afr_mark_entry_pending_changelog(call_frame_t *frame, xlator_t *this)
failed_count = AFR_COUNT(local->transaction.failed_subvols,
priv->child_count);
+ /* FOP succeeded on all bricks. */
if (pre_op_count == priv->child_count && !failed_count)
return;
+ /* FOP did not succeed on quorum no. of bricks. */
+ success_replies = alloca0(priv->child_count);
+ afr_fill_success_replies(local, priv, success_replies);
+ if (!afr_has_quorum(success_replies, this, NULL))
+ return;
+
if (priv->thin_arbiter_count) {
/*Mark new entry using ta file*/
local->is_new_entry = _gf_true;
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
index c01b413..1d6e4f3 100644
--- a/xlators/cluster/afr/src/afr-inode-write.c
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -2506,6 +2506,7 @@ afr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
call_frame_t *transaction_frame = NULL;
int ret = -1;
int32_t op_errno = ENOMEM;
+ int8_t last_fsync = 0;
AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
transaction_frame = copy_frame(frame);
@@ -2516,10 +2517,16 @@ afr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
if (!local)
goto out;
- if (xdata)
+ if (xdata) {
local->xdata_req = dict_copy_with_ref(xdata, NULL);
- else
+ if (dict_get_int8(xdata, "last-fsync", &last_fsync) == 0) {
+ if (last_fsync) {
+ local->transaction.disable_delayed_post_op = _gf_true;
+ }
+ }
+ } else {
local->xdata_req = dict_new();
+ }
if (!local->xdata_req)
goto out;
diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c
index a5b004f..6485604 100644
--- a/xlators/cluster/afr/src/afr-open.c
+++ b/xlators/cluster/afr/src/afr-open.c
@@ -137,7 +137,7 @@ afr_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- int spb_choice = 0;
+ int spb_subvol = 0;
int event_generation = 0;
int ret = 0;
int32_t op_errno = 0;
@@ -179,9 +179,9 @@ afr_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
ret = afr_inode_get_readable(frame, local->inode, this, NULL,
&event_generation, AFR_DATA_TRANSACTION);
if ((ret < 0) &&
- (afr_inode_split_brain_choice_get(local->inode, this, &spb_choice) ==
- 0) &&
- spb_choice < 0) {
+ (afr_split_brain_read_subvol_get(local->inode, this, NULL,
+ &spb_subvol) == 0) &&
+ spb_subvol < 0) {
afr_inode_refresh(frame, this, local->inode, local->inode->gfid,
afr_open_continue);
} else {
diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c
index 772b59f..6fc2c75 100644
--- a/xlators/cluster/afr/src/afr-read-txn.c
+++ b/xlators/cluster/afr/src/afr-read-txn.c
@@ -164,7 +164,7 @@ afr_ta_read_txn(void *opaque)
xdata_rsp = NULL;
/* It doesn't. So query thin-arbiter to see if it blames any data brick. */
- ret = afr_fill_ta_loc(this, &loc);
+ ret = afr_fill_ta_loc(this, &loc, _gf_true);
if (ret) {
gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
"Failed to populate thin-arbiter loc for: %s.", loc.name);
@@ -272,7 +272,7 @@ afr_read_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err)
int read_subvol = -1;
inode_t *inode = NULL;
int ret = -1;
- int spb_choice = -1;
+ int spb_subvol = -1;
local = frame->local;
inode = local->inode;
@@ -303,9 +303,9 @@ afr_read_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err)
local->read_attempted[read_subvol] = 1;
readfn:
if (read_subvol == -1) {
- ret = afr_inode_split_brain_choice_get(inode, this, &spb_choice);
- if ((ret == 0) && spb_choice >= 0)
- read_subvol = spb_choice;
+ ret = afr_split_brain_read_subvol_get(inode, this, frame, &spb_subvol);
+ if ((ret == 0) && spb_subvol >= 0)
+ read_subvol = spb_subvol;
}
if (read_subvol == -1) {
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 16f19e7..a580a15 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -140,7 +140,7 @@ heal:
}
}
out:
- if (gfid_idx && (*gfid_idx == -1) && (ret == 0)) {
+ if (gfid_idx && (*gfid_idx == -1) && (ret == 0) && local) {
ret = -afr_final_errno(local, priv);
}
loc_wipe(&loc);
@@ -1575,7 +1575,6 @@ afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
char *accused = NULL; /* Accused others without any self-accusal */
char *pending = NULL; /* Have pending operations on others */
char *self_accused = NULL; /* Accused itself */
- int min_participants = -1;
priv = this->private;
@@ -1599,12 +1598,7 @@ afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
}
}
- if (type == AFR_DATA_TRANSACTION || type == AFR_METADATA_TRANSACTION) {
- min_participants = priv->child_count;
- } else {
- min_participants = AFR_SH_MIN_PARTICIPANTS;
- }
- if (afr_success_count(replies, priv->child_count) < min_participants) {
+ if (afr_success_count(replies, priv->child_count) < priv->child_count) {
/* Treat this just like locks not being acquired */
return -ENOTCONN;
}
@@ -1911,17 +1905,16 @@ int
afr_selfheal_unlocked_discover(call_frame_t *frame, inode_t *inode, uuid_t gfid,
struct afr_reply *replies)
{
- afr_private_t *priv = NULL;
afr_local_t *local = NULL;
dict_t *dict = NULL;
- priv = frame->this->private;
local = frame->local;
- if (local && local->xattr_req)
+
+ if (local->xattr_req)
dict = local->xattr_req;
return afr_selfheal_unlocked_discover_on(frame, inode, gfid, replies,
- priv->child_up, dict);
+ local->child_up, dict);
}
unsigned int
@@ -2757,3 +2750,185 @@ afr_choose_source_by_policy(afr_private_t *priv, unsigned char *sources,
out:
return source;
}
+
+static int
+afr_anon_inode_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ afr_local_t *local = frame->local;
+ int i = (long)cookie;
+
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (op_ret == 0) {
+ local->op_ret = 0;
+ local->replies[i].poststat = *buf;
+ local->replies[i].preparent = *preparent;
+ local->replies[i].postparent = *postparent;
+ }
+ if (xdata) {
+ local->replies[i].xdata = dict_ref(xdata);
+ }
+
+ syncbarrier_wake(&local->barrier);
+ return 0;
+}
+
+int
+afr_anon_inode_create(xlator_t *this, int child, inode_t **linked_inode)
+{
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = this->private;
+ unsigned char *mkdir_on = alloca0(priv->child_count);
+ unsigned char *lookup_on = alloca0(priv->child_count);
+ loc_t loc = {0};
+ int32_t op_errno = 0;
+ int32_t child_op_errno = 0;
+ struct iatt iatt = {0};
+ dict_t *xdata = NULL;
+ uuid_t anon_inode_gfid = {0};
+ int mkdir_count = 0;
+ int i = 0;
+
+ /*Try to mkdir everywhere and return success if the dir exists on 'child'
+ */
+
+ if (!priv->use_anon_inode) {
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ frame = afr_frame_create(this, &op_errno);
+ if (op_errno) {
+ goto out;
+ }
+ local = frame->local;
+ if (!local->child_up[child]) {
+ /*Other bricks may need mkdir so don't error out yet*/
+ child_op_errno = ENOTCONN;
+ }
+ gf_uuid_parse(priv->anon_gfid_str, anon_inode_gfid);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->child_up[i])
+ continue;
+
+ if (priv->anon_inode[i]) {
+ mkdir_on[i] = 0;
+ } else {
+ mkdir_on[i] = 1;
+ mkdir_count++;
+ }
+ }
+
+ if (mkdir_count == 0) {
+ *linked_inode = inode_find(this->itable, anon_inode_gfid);
+ if (*linked_inode) {
+ op_errno = 0;
+ goto out;
+ }
+ }
+
+ loc.parent = inode_ref(this->itable->root);
+ loc.name = priv->anon_inode_name;
+ loc.inode = inode_new(this->itable);
+ if (!loc.inode) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ xdata = dict_new();
+ if (!xdata) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ op_errno = -dict_set_gfuuid(xdata, "gfid-req", anon_inode_gfid, _gf_true);
+ if (op_errno) {
+ goto out;
+ }
+
+ if (mkdir_count == 0) {
+ memcpy(lookup_on, local->child_up, priv->child_count);
+ goto lookup;
+ }
+
+ AFR_ONLIST(mkdir_on, frame, afr_anon_inode_mkdir_cbk, mkdir, &loc, 0755, 0,
+ xdata);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!mkdir_on[i]) {
+ continue;
+ }
+
+ if (local->replies[i].op_ret == 0) {
+ priv->anon_inode[i] = 1;
+ iatt = local->replies[i].poststat;
+ } else if (local->replies[i].op_ret < 0 &&
+ local->replies[i].op_errno == EEXIST) {
+ lookup_on[i] = 1;
+ } else if (i == child) {
+ child_op_errno = local->replies[i].op_errno;
+ }
+ }
+
+ if (AFR_COUNT(lookup_on, priv->child_count) == 0) {
+ goto link;
+ }
+
+lookup:
+ AFR_ONLIST(lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc,
+ xdata);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!lookup_on[i]) {
+ continue;
+ }
+
+ if (local->replies[i].op_ret == 0) {
+ if (gf_uuid_compare(anon_inode_gfid,
+ local->replies[i].poststat.ia_gfid) == 0) {
+ priv->anon_inode[i] = 1;
+ iatt = local->replies[i].poststat;
+ } else {
+ if (i == child)
+ child_op_errno = EINVAL;
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_DATA,
+ "%s has gfid: %s", priv->anon_inode_name,
+ uuid_utoa(local->replies[i].poststat.ia_gfid));
+ }
+ } else if (i == child) {
+ child_op_errno = local->replies[i].op_errno;
+ }
+ }
+link:
+ if (!gf_uuid_is_null(iatt.ia_gfid)) {
+ *linked_inode = inode_link(loc.inode, loc.parent, loc.name, &iatt);
+ if (*linked_inode) {
+ op_errno = 0;
+ inode_lookup(*linked_inode);
+ } else {
+ op_errno = ENOMEM;
+ }
+ goto out;
+ }
+
+out:
+ if (xdata)
+ dict_unref(xdata);
+ loc_wipe(&loc);
+ /*child_op_errno takes precedence*/
+ if (child_op_errno == 0) {
+ child_op_errno = op_errno;
+ }
+
+ if (child_op_errno && *linked_inode) {
+ inode_unref(*linked_inode);
+ *linked_inode = NULL;
+ }
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ return -child_op_errno;
+}
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index cdff4a5..37bcc2b 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -225,24 +225,40 @@ __afr_selfheal_data_read_write(call_frame_t *frame, xlator_t *this, fd_t *fd,
return ret;
}
+/*
+ * Tell whether a data-heal may proceed with the locks that were obtained:
+ * the lock must be held on the source and on at least one of the sinks that
+ * are going to be healed.
+ *
+ * locked_on and healed_sinks are per-child flag arrays indexed like
+ * priv->children; source is the index of the heal source.
+ */
+static gf_boolean_t
+afr_source_sinks_locked(xlator_t *this, unsigned char *locked_on, int source,
+                        unsigned char *healed_sinks)
+{
+    afr_private_t *priv = this->private;
+    gf_boolean_t sink_locked = _gf_false;
+    int idx = 0;
+
+    if (locked_on[source]) {
+        /* Stop scanning as soon as one locked sink has been found. */
+        for (idx = 0; idx < priv->child_count && !sink_locked; idx++) {
+            if (locked_on[idx] && healed_sinks[idx])
+                sink_locked = _gf_true;
+        }
+    }
+
+    return sink_locked;
+}
+
static int
afr_selfheal_data_block(call_frame_t *frame, xlator_t *this, fd_t *fd,
int source, unsigned char *healed_sinks, off_t offset,
size_t size, int type, struct afr_reply *replies)
{
int ret = -1;
- int sink_count = 0;
afr_private_t *priv = NULL;
unsigned char *data_lock = NULL;
priv = this->private;
- sink_count = AFR_COUNT(healed_sinks, priv->child_count);
data_lock = alloca0(priv->child_count);
ret = afr_selfheal_inodelk(frame, this, fd->inode, this->name, offset, size,
data_lock);
{
- if (ret < sink_count) {
+ if (!afr_source_sinks_locked(this, data_lock, source, healed_sinks)) {
ret = -ENOTCONN;
goto unlock;
}
diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c
index e706228..64893f4 100644
--- a/xlators/cluster/afr/src/afr-self-heal-entry.c
+++ b/xlators/cluster/afr/src/afr-self-heal-entry.c
@@ -16,54 +16,170 @@
#include <glusterfs/syncop-utils.h>
#include <glusterfs/events.h>
-static int
-afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name,
- inode_t *inode, int child, struct afr_reply *replies)
+int
+afr_selfheal_entry_anon_inode(xlator_t *this, inode_t *dir, const char *name,
+ inode_t *inode, int child,
+ struct afr_reply *replies,
+ gf_boolean_t *anon_inode)
{
afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
xlator_t *subvol = NULL;
int ret = 0;
+ int i = 0;
+ char g[64] = {0};
+ unsigned char *lookup_success = NULL;
+ call_frame_t *frame = NULL;
+ loc_t loc2 = {
+ 0,
+ };
loc_t loc = {
0,
};
- char g[64];
priv = this->private;
-
subvol = priv->children[child];
+ lookup_success = alloca0(priv->child_count);
+ uuid_utoa_r(replies[child].poststat.ia_gfid, g);
+ loc.inode = inode_new(inode->table);
+ if (!loc.inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (replies[child].poststat.ia_type == IA_IFDIR) {
+ /* This directory may have sub-directory hierarchy which may need to
+ * be preserved for subsequent heals. So unconditionally move the
+ * directory to anonymous-inode directory*/
+ *anon_inode = _gf_true;
+ goto anon_inode;
+ }
+
+ frame = afr_frame_create(this, &ret);
+ if (!frame) {
+ ret = -ret;
+ goto out;
+ }
+ local = frame->local;
+ gf_uuid_copy(loc.gfid, replies[child].poststat.ia_gfid);
+ AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup, &loc,
+ NULL);
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].op_ret == 0) {
+ lookup_success[i] = 1;
+ } else if (local->replies[i].op_errno != ENOENT &&
+ local->replies[i].op_errno != ESTALE) {
+ ret = -local->replies[i].op_errno;
+ }
+ }
+
+ if (priv->quorum_count) {
+ if (afr_has_quorum(lookup_success, this, NULL)) {
+ *anon_inode = _gf_true;
+ }
+ } else if (AFR_COUNT(lookup_success, priv->child_count) > 1) {
+ *anon_inode = _gf_true;
+ } else if (ret) {
+ goto out;
+ }
+
+anon_inode:
+ if (!*anon_inode) {
+ ret = 0;
+ goto out;
+ }
loc.parent = inode_ref(dir);
gf_uuid_copy(loc.pargfid, dir->gfid);
loc.name = name;
- loc.inode = inode_ref(inode);
- if (replies[child].valid && replies[child].op_ret == 0) {
- switch (replies[child].poststat.ia_type) {
- case IA_IFDIR:
- gf_msg(this->name, GF_LOG_WARNING, 0,
- AFR_MSG_EXPUNGING_FILE_OR_DIR,
- "expunging dir %s/%s (%s) on %s", uuid_utoa(dir->gfid),
- name, uuid_utoa_r(replies[child].poststat.ia_gfid, g),
- subvol->name);
- ret = syncop_rmdir(subvol, &loc, 1, NULL, NULL);
- break;
- default:
- gf_msg(this->name, GF_LOG_WARNING, 0,
- AFR_MSG_EXPUNGING_FILE_OR_DIR,
- "expunging file %s/%s (%s) on %s", uuid_utoa(dir->gfid),
- name, uuid_utoa_r(replies[child].poststat.ia_gfid, g),
- subvol->name);
- ret = syncop_unlink(subvol, &loc, NULL, NULL);
- break;
- }
+ ret = afr_anon_inode_create(this, child, &loc2.parent);
+ if (ret < 0)
+ goto out;
+
+ loc2.name = g;
+ ret = syncop_rename(subvol, &loc, &loc2, NULL, NULL);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_EXPUNGING_FILE_OR_DIR,
+ "Rename to %s dir %s/%s (%s) on %s failed",
+ priv->anon_inode_name, uuid_utoa(dir->gfid), name, g,
+ subvol->name);
+ } else {
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR,
+ "Rename to %s dir %s/%s (%s) on %s successful",
+ priv->anon_inode_name, uuid_utoa(dir->gfid), name, g,
+ subvol->name);
}
+out:
loc_wipe(&loc);
+ loc_wipe(&loc2);
+ if (frame) {
+ AFR_STACK_DESTROY(frame);
+ }
return ret;
}
+/*
+ * Remove the entry 'name' under 'dir' from the brick at index 'child'.
+ *
+ * Nothing is done when the reply from that child is invalid or reports a
+ * failure. When priv->use_anon_inode is set, the entry is first offered to
+ * afr_selfheal_entry_anon_inode(); if that call fails or reports that it
+ * took the anonymous-inode path, no rmdir/unlink is issued here. Otherwise
+ * directories are removed with syncop_rmdir() and everything else with
+ * syncop_unlink().
+ *
+ * Returns 0 on success or when there was nothing to do, a negative value on
+ * failure.
+ */
int
+afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name,
+                          inode_t *inode, int child, struct afr_reply *replies)
+{
+    afr_private_t *priv = this->private;
+    xlator_t *subvol = priv->children[child];
+    struct afr_reply *reply = &replies[child];
+    gf_boolean_t anon_inode = _gf_false;
+    char gfid_str[64] = {0};
+    loc_t loc = {
+        0,
+    };
+    int ret = 0;
+
+    /* The entry was not seen on this child: nothing to do. */
+    if (!reply->valid || reply->op_ret < 0)
+        goto out;
+
+    if (priv->use_anon_inode) {
+        ret = afr_selfheal_entry_anon_inode(this, dir, name, inode, child,
+                                            replies, &anon_inode);
+        if (anon_inode || ret < 0)
+            goto out;
+    }
+
+    loc.parent = inode_ref(dir);
+    loc.inode = inode_new(inode->table);
+    if (!loc.inode) {
+        ret = -ENOMEM;
+        goto out;
+    }
+    loc.name = name;
+
+    if (reply->poststat.ia_type == IA_IFDIR) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR,
+               "expunging dir %s/%s (%s) on %s", uuid_utoa(dir->gfid), name,
+               uuid_utoa_r(reply->poststat.ia_gfid, gfid_str), subvol->name);
+        ret = syncop_rmdir(subvol, &loc, 1, NULL, NULL);
+    } else {
+        gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR,
+               "expunging file %s/%s (%s) on %s", uuid_utoa(dir->gfid), name,
+               uuid_utoa_r(reply->poststat.ia_gfid, gfid_str), subvol->name);
+        ret = syncop_unlink(subvol, &loc, NULL, NULL);
+    }
+
+out:
+    loc_wipe(&loc);
+    return ret;
+}
+
+int
afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source,
unsigned char *sources, inode_t *dir,
const char *name, inode_t *inode,
@@ -76,6 +192,9 @@ afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source,
loc_t srcloc = {
0,
};
+ loc_t anonloc = {
+ 0,
+ };
xlator_t *this = frame->this;
afr_private_t *priv = NULL;
dict_t *xdata = NULL;
@@ -86,15 +205,17 @@ afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source,
0,
};
unsigned char *newentry = NULL;
- char dir_uuid_str[64] = {0}, iatt_uuid_str[64] = {0};
+ char iatt_uuid_str[64] = {0};
+ char dir_uuid_str[64] = {0};
priv = this->private;
iatt = &replies[source].poststat;
+ uuid_utoa_r(iatt->ia_gfid, iatt_uuid_str);
if (iatt->ia_type == IA_INVAL || gf_uuid_is_null(iatt->ia_gfid)) {
gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SELF_HEAL_FAILED,
"Invalid ia_type (%d) or gfid(%s). source brick=%d, "
"pargfid=%s, name=%s",
- iatt->ia_type, uuid_utoa_r(iatt->ia_gfid, iatt_uuid_str), source,
+ iatt->ia_type, iatt_uuid_str, source,
uuid_utoa_r(dir->gfid, dir_uuid_str), name);
ret = -EINVAL;
goto out;
@@ -120,14 +241,24 @@ afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source,
srcloc.inode = inode_ref(inode);
gf_uuid_copy(srcloc.gfid, iatt->ia_gfid);
- if (iatt->ia_type != IA_IFDIR)
- ret = syncop_lookup(priv->children[dst], &srcloc, 0, 0, 0, 0);
- if (iatt->ia_type == IA_IFDIR || ret == -ENOENT || ret == -ESTALE) {
+ ret = syncop_lookup(priv->children[dst], &srcloc, 0, 0, 0, 0);
+ if (ret == -ENOENT || ret == -ESTALE) {
newentry[dst] = 1;
ret = afr_selfheal_newentry_mark(frame, this, inode, source, replies,
sources, newentry);
if (ret)
goto out;
+ } else if (ret == 0 && iatt->ia_type == IA_IFDIR && priv->use_anon_inode) {
+ // Try rename from hidden directory
+ ret = afr_anon_inode_create(this, dst, &anonloc.parent);
+ if (ret < 0)
+ goto out;
+ anonloc.inode = inode_ref(inode);
+ anonloc.name = iatt_uuid_str;
+ ret = syncop_rename(priv->children[dst], &anonloc, &loc, NULL, NULL);
+ if (ret == -ENOENT || ret == -ESTALE)
+ ret = -1; /*This sets 'mismatch' to true*/
+ goto out;
}
mode = st_mode_from_ia(iatt->ia_prot, iatt->ia_type);
@@ -166,6 +297,7 @@ out:
GF_FREE(linkname);
loc_wipe(&loc);
loc_wipe(&srcloc);
+ loc_wipe(&anonloc);
return ret;
}
@@ -578,6 +710,11 @@ afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
priv = this->private;
+ if (afr_is_private_directory(priv, fd->inode->gfid, name,
+ GF_CLIENT_PID_SELF_HEALD)) {
+ return 0;
+ }
+
xattr = dict_new();
if (!xattr)
return -ENOMEM;
@@ -598,7 +735,7 @@ afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL,
locked_on);
{
- if (ret < AFR_SH_MIN_PARTICIPANTS) {
+ if (ret < priv->child_count) {
gf_msg_debug(this->name, 0,
"%s: Skipping "
"entry self-heal as only %d sub-volumes "
@@ -626,7 +763,7 @@ afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
replies);
if ((ret == 0) && (priv->esh_granular) && parent_idx_inode) {
- ret = afr_shd_index_purge(subvol, parent_idx_inode, name,
+ ret = afr_shd_entry_purge(subvol, parent_idx_inode, name,
inode->ia_type);
/* Why is ret force-set to 0? We do not care about
* index purge failing for full heal as it is quite
@@ -756,10 +893,6 @@ afr_selfheal_entry_do_subvol(call_frame_t *frame, xlator_t *this, fd_t *fd,
if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, ".."))
continue;
- if (__is_root_gfid(fd->inode->gfid) &&
- !strcmp(entry->d_name, GF_REPLICATE_TRASH_DIR))
- continue;
-
ret = afr_selfheal_entry_dirent(iter_frame, this, fd, entry->d_name,
loc.inode, subvol,
local->need_full_crawl);
@@ -822,7 +955,7 @@ afr_selfheal_entry_granular_dirent(xlator_t *subvol, gf_dirent_t *entry,
/* The name indices under the pgfid index dir are guaranteed
* to be regular files. Hence the hardcoding.
*/
- afr_shd_index_purge(subvol, parent->inode, entry->d_name, IA_IFREG);
+ afr_shd_entry_purge(subvol, parent->inode, entry->d_name, IA_IFREG);
ret = 0;
goto out;
}
@@ -992,7 +1125,7 @@ __afr_selfheal_entry(call_frame_t *frame, xlator_t *this, fd_t *fd,
ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL,
data_lock);
{
- if (ret < AFR_SH_MIN_PARTICIPANTS) {
+ if (ret < priv->child_count) {
gf_msg_debug(this->name, 0,
"%s: Skipping "
"entry self-heal as only %d sub-volumes could "
@@ -1116,7 +1249,7 @@ afr_selfheal_entry(call_frame_t *frame, xlator_t *this, inode_t *inode)
ret = afr_selfheal_tie_breaker_entrylk(frame, this, inode, priv->sh_domain,
NULL, locked_on);
{
- if (ret < AFR_SH_MIN_PARTICIPANTS) {
+ if (ret < priv->child_count) {
gf_msg_debug(this->name, 0,
"%s: Skipping "
"entry self-heal as only %d sub-volumes could "
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
index f4e31b6..03f43ba 100644
--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -190,6 +190,59 @@ out:
return ret;
}
+/*
+ * Record a pending metadata changelog against every non-source child.
+ *
+ * One xattr dict is built holding, for each child that is not a source, that
+ * child's pending key with the metadata counter set to 1. The dict is then
+ * applied through afr_selfheal_post_op() on every source child. On success
+ * 'replies' is wiped and refreshed with an unlocked discover.
+ *
+ * Returns -ENOMEM if the dict cannot be allocated, -1 if a key cannot be
+ * set, the post-op's negative value if a post-op fails, and otherwise the
+ * result of afr_selfheal_unlocked_discover().
+ */
+static int
+__afr_selfheal_metadata_mark_pending_xattrs(call_frame_t *frame, xlator_t *this,
+                                            inode_t *inode,
+                                            struct afr_reply *replies,
+                                            unsigned char *sources)
+{
+    afr_private_t *priv = this->private;
+    int pending[AFR_NUM_CHANGE_LOGS] = {0};
+    dict_t *xattr = NULL;
+    int child = 0;
+    int ret = 0;
+
+    pending[afr_index_for_transaction_type(AFR_METADATA_TRANSACTION)] = 1;
+
+    xattr = dict_new();
+    if (!xattr)
+        return -ENOMEM;
+
+    /* Blame every sink; 'pending' stays alive until the dict is released. */
+    for (child = 0; child < priv->child_count; child++) {
+        if (!sources[child] &&
+            dict_set_static_bin(xattr, priv->pending_key[child], pending,
+                                sizeof(int) * AFR_NUM_CHANGE_LOGS)) {
+            ret = -1;
+            goto out;
+        }
+    }
+
+    /* Store the blame on each source. */
+    for (child = 0; child < priv->child_count; child++) {
+        if (sources[child]) {
+            ret = afr_selfheal_post_op(frame, this, inode, child, xattr, NULL);
+            if (ret < 0) {
+                gf_msg(this->name, GF_LOG_INFO, -ret, AFR_MSG_SELF_HEAL_INFO,
+                       "Failed to set pending metadata xattr on child %d for %s",
+                       child, uuid_utoa(inode->gfid));
+                goto out;
+            }
+        }
+    }
+
+    afr_replies_wipe(replies, priv->child_count);
+    ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies);
+
+out:
+    if (xattr)
+        dict_unref(xattr);
+    return ret;
+}
+
/*
* Look for mismatching uid/gid or mode or user xattrs even if
* AFR xattrs don't say so, and pick one arbitrarily as winner. */
@@ -210,6 +263,7 @@ __afr_selfheal_metadata_finalize_source(call_frame_t *frame, xlator_t *this,
};
int source = -1;
int sources_count = 0;
+ int ret = 0;
priv = this->private;
@@ -300,7 +354,13 @@ __afr_selfheal_metadata_finalize_source(call_frame_t *frame, xlator_t *this,
healed_sinks[i] = 1;
}
}
-
+ if ((sources_count == priv->child_count) && (source > -1) &&
+ (AFR_COUNT(healed_sinks, priv->child_count) != 0)) {
+ ret = __afr_selfheal_metadata_mark_pending_xattrs(frame, this, inode,
+ replies, sources);
+ if (ret < 0)
+ return ret;
+ }
out:
afr_mark_active_sinks(this, sources, locked_on, healed_sinks);
return source;
diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c
index 36640b54..834aac8 100644
--- a/xlators/cluster/afr/src/afr-self-heal-name.c
+++ b/xlators/cluster/afr/src/afr-self-heal-name.c
@@ -98,21 +98,12 @@ __afr_selfheal_name_expunge(xlator_t *this, inode_t *parent, uuid_t pargfid,
const char *bname, inode_t *inode,
struct afr_reply *replies)
{
- loc_t loc = {
- 0,
- };
int i = 0;
afr_private_t *priv = NULL;
- char g[64];
int ret = 0;
priv = this->private;
- loc.parent = inode_ref(parent);
- gf_uuid_copy(loc.pargfid, pargfid);
- loc.name = bname;
- loc.inode = inode_ref(inode);
-
for (i = 0; i < priv->child_count; i++) {
if (!replies[i].valid)
continue;
@@ -120,30 +111,10 @@ __afr_selfheal_name_expunge(xlator_t *this, inode_t *parent, uuid_t pargfid,
if (replies[i].op_ret)
continue;
- switch (replies[i].poststat.ia_type) {
- case IA_IFDIR:
- gf_msg(this->name, GF_LOG_WARNING, 0,
- AFR_MSG_EXPUNGING_FILE_OR_DIR,
- "expunging dir %s/%s (%s) on %s", uuid_utoa(pargfid),
- bname, uuid_utoa_r(replies[i].poststat.ia_gfid, g),
- priv->children[i]->name);
-
- ret |= syncop_rmdir(priv->children[i], &loc, 1, NULL, NULL);
- break;
- default:
- gf_msg(this->name, GF_LOG_WARNING, 0,
- AFR_MSG_EXPUNGING_FILE_OR_DIR,
- "expunging file %s/%s (%s) on %s", uuid_utoa(pargfid),
- bname, uuid_utoa_r(replies[i].poststat.ia_gfid, g),
- priv->children[i]->name);
-
- ret |= syncop_unlink(priv->children[i], &loc, NULL, NULL);
- break;
- }
+ ret |= afr_selfheal_entry_delete(this, parent, bname, inode, i,
+ replies);
}
- loc_wipe(&loc);
-
return ret;
}
@@ -381,7 +352,7 @@ __afr_selfheal_name_do(call_frame_t *frame, xlator_t *this, inode_t *parent,
ret = __afr_selfheal_assign_gfid(this, parent, pargfid, bname, inode,
replies, gfid, locked_on, source, sources,
is_gfid_absent, &gfid_idx);
- if (ret)
+ if (ret || (gfid_idx < 0))
return ret;
ret = __afr_selfheal_name_impunge(frame, this, parent, pargfid, bname,
@@ -514,7 +485,7 @@ afr_selfheal_name_do(call_frame_t *frame, xlator_t *this, inode_t *parent,
ret = afr_selfheal_entrylk(frame, this, parent, this->name, bname,
locked_on);
{
- if (ret < AFR_SH_MIN_PARTICIPANTS) {
+ if (ret < priv->child_count) {
ret = -ENOTCONN;
goto unlock;
}
@@ -560,13 +531,15 @@ afr_selfheal_name_unlocked_inspect(call_frame_t *frame, xlator_t *this,
struct afr_reply *replies = NULL;
inode_t *inode = NULL;
int first_idx = -1;
+ afr_local_t *local = NULL;
priv = this->private;
+ local = frame->local;
replies = alloca0(sizeof(*replies) * priv->child_count);
inode = afr_selfheal_unlocked_lookup_on(frame, parent, bname, replies,
- priv->child_up, NULL);
+ local->child_up, NULL);
if (!inode)
return -ENOMEM;
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index 5e7bde8..48e6dbc 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -11,8 +11,6 @@
#ifndef _AFR_SELFHEAL_H
#define _AFR_SELFHEAL_H
-#define AFR_SH_MIN_PARTICIPANTS 2
-
/* Perform fop on all UP subvolumes and wait for all callbacks to return */
#define AFR_ONALL(frame, rfn, fop, args...) \
@@ -47,13 +45,16 @@
afr_local_t *__local = frame->local; \
afr_private_t *__priv = frame->this->private; \
int __i = 0; \
- int __count = AFR_COUNT(list, __priv->child_count); \
+ int __count = 0; \
+ unsigned char *__list = alloca(__priv->child_count); \
\
+ memcpy(__list, list, sizeof(*__list) * __priv->child_count); \
+ __count = AFR_COUNT(__list, __priv->child_count); \
__local->barrier.waitfor = __count; \
afr_local_replies_wipe(__local, __priv); \
\
for (__i = 0; __i < __priv->child_count; __i++) { \
- if (!list[__i]) \
+ if (!__list[__i]) \
continue; \
STACK_WIND_COOKIE(frame, rfn, (void *)(long)__i, \
__priv->children[__i], \
@@ -368,4 +369,9 @@ gf_boolean_t
afr_is_file_empty_on_all_children(afr_private_t *priv,
struct afr_reply *replies);
+int
+afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name,
+ inode_t *inode, int child, struct afr_reply *replies);
+int
+afr_anon_inode_create(xlator_t *this, int child, inode_t **linked_inode);
#endif /* !_AFR_SELFHEAL_H */
diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c
index aa774bb..109fd4b 100644
--- a/xlators/cluster/afr/src/afr-self-heald.c
+++ b/xlators/cluster/afr/src/afr-self-heald.c
@@ -94,7 +94,7 @@ __afr_shd_healer_wait(struct subvol_healer *healer)
priv = healer->this->private;
disabled_loop:
- wait_till.tv_sec = time(NULL) + priv->shd.timeout;
+ wait_till.tv_sec = gf_time() + priv->shd.timeout;
while (!healer->rerun) {
ret = pthread_cond_timedwait(&healer->cond, &healer->mutex, &wait_till);
@@ -222,7 +222,7 @@ out:
}
int
-afr_shd_index_purge(xlator_t *subvol, inode_t *inode, char *name,
+afr_shd_entry_purge(xlator_t *subvol, inode_t *inode, char *name,
ia_type_t type)
{
int ret = 0;
@@ -371,7 +371,7 @@ afr_shd_sweep_prepare(struct subvol_healer *healer)
event->split_brain_count = 0;
event->heal_failed_count = 0;
- time(&event->start_time);
+ event->start_time = gf_time();
event->end_time = 0;
_mask_cancellation();
}
@@ -386,7 +386,7 @@ afr_shd_sweep_done(struct subvol_healer *healer)
event = &healer->crawl_event;
shd = &(((afr_private_t *)healer->this->private)->shd);
- time(&event->end_time);
+ event->end_time = gf_time();
history = gf_memdup(event, sizeof(*event));
event->start_time = 0;
@@ -424,7 +424,7 @@ afr_shd_index_heal(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
ret = afr_shd_selfheal(healer, healer->subvol, gfid);
if (ret == -ENOENT || ret == -ESTALE)
- afr_shd_index_purge(subvol, parent->inode, entry->d_name, val);
+ afr_shd_entry_purge(subvol, parent->inode, entry->d_name, val);
if (ret == 2)
/* If bricks crashed in pre-op after creating indices/xattrop
@@ -843,6 +843,176 @@ out:
return need_heal;
}
+/*
+ * Directory-scan callback invoked for each entry of the anonymous-inode
+ * directory on the brick being crawled (healer->subvol).
+ *
+ * The entry name is parsed as a gfid and looked up on all children. Based on
+ * where the gfid and the anonymous entry are found, the entry is purged on
+ * this brick only, purged on every brick, or left untouched.
+ *
+ * subvol: child xlator whose anonymous-inode directory is being scanned.
+ * entry:  dirent whose d_name is expected to be a gfid string.
+ * parent: loc of the anonymous-inode directory.
+ * data:   the struct subvol_healer driving this crawl.
+ *
+ * Returns 0 when the entry was purged or deliberately skipped, and a
+ * negative value on failure.
+ */
+static int
+afr_shd_anon_inode_cleaner(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
+                           void *data)
+{
+    struct subvol_healer *healer = data;
+    afr_private_t *priv = healer->this->private;
+    call_frame_t *frame = NULL;
+    afr_local_t *local = NULL;
+    int ret = 0;
+    loc_t loc = {0};
+    int count = 0;
+    int i = 0;
+    int op_errno = 0;
+    struct iatt *iatt = NULL;
+    gf_boolean_t multiple_links = _gf_false;
+    unsigned char *gfid_present = alloca0(priv->child_count);
+    unsigned char *entry_present = alloca0(priv->child_count);
+    char *type = "file";
+
+    frame = afr_frame_create(healer->this, &ret);
+    if (!frame) {
+        ret = -ret;
+        goto out;
+    }
+    local = frame->local;
+    /* A decision needs the view of every brick. */
+    if (AFR_COUNT(local->child_up, priv->child_count) != priv->child_count) {
+        gf_msg_debug(healer->this->name, 0,
+                     "Not all bricks are up. Skipping "
+                     "cleanup of %s on %s",
+                     entry->d_name, subvol->name);
+        ret = 0;
+        goto out;
+    }
+
+    loc.inode = inode_new(parent->inode->table);
+    if (!loc.inode) {
+        ret = -ENOMEM;
+        goto out;
+    }
+    /* Names that are not gfid strings are ignored. */
+    ret = gf_uuid_parse(entry->d_name, loc.gfid);
+    if (ret) {
+        ret = 0;
+        goto out;
+    }
+
+    /* Pass 1: gfid-based lookup, to learn which bricks have the inode. */
+    AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup, &loc,
+               NULL);
+    for (i = 0; i < priv->child_count; i++) {
+        if (local->replies[i].op_ret == 0) {
+            count++;
+            gfid_present[i] = 1;
+            iatt = &local->replies[i].poststat;
+            if (iatt->ia_type == IA_IFDIR) {
+                type = "dir";
+            }
+
+            if (i == healer->subvol) {
+                if (local->replies[i].poststat.ia_nlink > 1) {
+                    multiple_links = _gf_true;
+                }
+            }
+        } else if (local->replies[i].op_errno != ENOENT &&
+                   local->replies[i].op_errno != ESTALE) {
+            /*We don't have complete view. Skip the entry*/
+            gf_msg_debug(healer->this->name, local->replies[i].op_errno,
+                         "Skipping cleanup of %s on %s", entry->d_name,
+                         subvol->name);
+            ret = 0;
+            goto out;
+        }
+    }
+
+    if (count == 0) {
+        /* The gfid was not found on any brick, so no iatt was captured.
+         * There is nothing to base a decision on; leave the entry alone
+         * instead of dereferencing the NULL iatt below. */
+        ret = 0;
+        goto out;
+    }
+
+    /*Inode is deleted from subvol*/
+    if (count == 1 || (iatt->ia_type != IA_IFDIR && multiple_links)) {
+        gf_msg(healer->this->name, GF_LOG_WARNING, 0,
+               AFR_MSG_EXPUNGING_FILE_OR_DIR, "expunging %s %s/%s on %s", type,
+               priv->anon_inode_name, entry->d_name, subvol->name);
+        ret = afr_shd_entry_purge(subvol, parent->inode, entry->d_name,
+                                  iatt->ia_type);
+        if (ret == -ENOENT || ret == -ESTALE)
+            ret = 0;
+    } else if (count > 1) {
+        /* Pass 2: name-based lookup inside the anonymous-inode directory,
+         * to learn on which bricks the inode is still anonymous. */
+        loc_wipe(&loc);
+        loc.parent = inode_ref(parent->inode);
+        loc.name = entry->d_name;
+        loc.inode = inode_new(parent->inode->table);
+        if (!loc.inode) {
+            ret = -ENOMEM;
+            goto out;
+        }
+        AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup,
+                   &loc, NULL);
+        count = 0;
+        for (i = 0; i < priv->child_count; i++) {
+            if (local->replies[i].op_ret == 0) {
+                count++;
+                entry_present[i] = 1;
+                iatt = &local->replies[i].poststat;
+            } else if (local->replies[i].op_errno != ENOENT &&
+                       local->replies[i].op_errno != ESTALE) {
+                /*We don't have complete view. Skip the entry*/
+                gf_msg_debug(healer->this->name, local->replies[i].op_errno,
+                             "Skipping cleanup of %s on %s", entry->d_name,
+                             subvol->name);
+                ret = 0;
+                goto out;
+            }
+        }
+        for (i = 0; i < priv->child_count; i++) {
+            if (gfid_present[i] && !entry_present[i]) {
+                /*Entry is not anonymous on at least one subvol*/
+                gf_msg_debug(healer->this->name, 0,
+                             "Valid entry present on %s "
+                             "Skipping cleanup of %s on %s",
+                             priv->children[i]->name, entry->d_name,
+                             subvol->name);
+                ret = 0;
+                goto out;
+            }
+        }
+
+        gf_msg(healer->this->name, GF_LOG_WARNING, 0,
+               AFR_MSG_EXPUNGING_FILE_OR_DIR,
+               "expunging %s %s/%s on all subvols", type, priv->anon_inode_name,
+               entry->d_name);
+        ret = 0;
+        for (i = 0; i < priv->child_count; i++) {
+            op_errno = -afr_shd_entry_purge(priv->children[i], loc.parent,
+                                            entry->d_name, iatt->ia_type);
+            /* An entry that is already gone is not a failure. */
+            if (op_errno != ENOENT && op_errno != ESTALE) {
+                ret |= -op_errno;
+            }
+        }
+    }
+
+out:
+    if (frame)
+        AFR_STACK_DESTROY(frame);
+    loc_wipe(&loc);
+    return ret;
+}
+
+/*
+ * Scan the anonymous-inode directory of the brick served by 'healer' and
+ * run afr_shd_anon_inode_cleaner() on each of its entries.
+ *
+ * The directory inode is obtained through afr_anon_inode_create(); if that
+ * call or the frame allocation fails, the cleanup is silently skipped. The
+ * result of the scan is not reported to the caller.
+ */
+static void
+afr_cleanup_anon_inode_dir(struct subvol_healer *healer)
+{
+    xlator_t *this = healer->this;
+    afr_private_t *priv = this->private;
+    call_frame_t *frame = NULL;
+    loc_t loc = {0};
+    int ret = 0;
+
+    if (afr_anon_inode_create(this, healer->subvol, &loc.inode) == 0) {
+        frame = afr_frame_create(this, &ret);
+        if (frame) {
+            ret = syncop_mt_dir_scan(frame, priv->children[healer->subvol],
+                                     &loc, GF_CLIENT_PID_SELF_HEALD, healer,
+                                     afr_shd_anon_inode_cleaner, NULL,
+                                     priv->shd.max_threads,
+                                     priv->shd.wait_qlength);
+            AFR_STACK_DESTROY(frame);
+        }
+    }
+
+    loc_wipe(&loc);
+}
+
void *
afr_shd_index_healer(void *data)
{
@@ -900,6 +1070,10 @@ afr_shd_index_healer(void *data)
sleep(1);
} while (ret > 0);
+ if (ret == 0) {
+ afr_cleanup_anon_inode_dir(healer);
+ }
+
if (ret == 0 && pre_crawl_xdata &&
!healer->crawl_event.heal_failed_count) {
afr_shd_ta_check_and_unset_xattrs(this, &loc, healer,
@@ -1481,15 +1655,6 @@ afr_xl_op(xlator_t *this, dict_t *input, dict_t *output)
case GF_SHD_OP_INDEX_SUMMARY:
/* this case has been handled in glfs-heal.c */
break;
- case GF_SHD_OP_HEALED_FILES:
- case GF_SHD_OP_HEAL_FAILED_FILES:
- for (i = 0; i < priv->child_count; i++) {
- keylen = snprintf(key, sizeof(key), "%d-%d-status", xl_id, i);
- AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
- SOP_NOT_SUPPORTED,
- SLEN(SOP_NOT_SUPPORTED));
- }
- break;
case GF_SHD_OP_SPLIT_BRAIN_FILES:
eh_dump(shd->split_brain, output, afr_add_shd_event);
break;
diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h
index 687c28e..18db728 100644
--- a/xlators/cluster/afr/src/afr-self-heald.h
+++ b/xlators/cluster/afr/src/afr-self-heald.h
@@ -70,6 +70,6 @@ afr_shd_gfid_to_path(xlator_t *this, xlator_t *subvol, uuid_t gfid,
char **path_p);
int
-afr_shd_index_purge(xlator_t *subvol, inode_t *inode, char *name,
+afr_shd_entry_purge(xlator_t *subvol, inode_t *inode, char *name,
ia_type_t type);
#endif /* !_AFR_SELF_HEALD_H */
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index 78438f9..a51f79b 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -124,9 +124,9 @@ afr_release_notify_lock_for_ta(void *opaque)
this = (xlator_t *)opaque;
priv = this->private;
- ret = afr_fill_ta_loc(this, &loc);
+ ret = afr_fill_ta_loc(this, &loc, _gf_true);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
"Failed to populate loc for thin-arbiter.");
goto out;
}
@@ -521,42 +521,6 @@ afr_compute_pre_op_sources(call_frame_t *frame, xlator_t *this)
local->transaction.pre_op_sources[j] = 0;
}
-gf_boolean_t
-afr_has_arbiter_fop_cbk_quorum(call_frame_t *frame)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- xlator_t *this = NULL;
- gf_boolean_t fop_failed = _gf_false;
- unsigned char *pre_op_sources = NULL;
- int i = 0;
-
- local = frame->local;
- this = frame->this;
- priv = this->private;
- pre_op_sources = local->transaction.pre_op_sources;
-
- /* If the fop failed on the brick, it is not a source. */
- for (i = 0; i < priv->child_count; i++)
- if (local->transaction.failed_subvols[i])
- pre_op_sources[i] = 0;
-
- switch (AFR_COUNT(pre_op_sources, priv->child_count)) {
- case 1:
- if (pre_op_sources[ARBITER_BRICK_INDEX])
- fop_failed = _gf_true;
- break;
- case 0:
- fop_failed = _gf_true;
- break;
- }
-
- if (fop_failed)
- return _gf_false;
-
- return _gf_true;
-}
-
void
afr_txn_arbitrate_fop(call_frame_t *frame, xlator_t *this)
{
@@ -971,12 +935,8 @@ afr_need_dirty_marking(call_frame_t *frame, xlator_t *this)
priv->child_count)
return _gf_false;
- if (priv->arbiter_count) {
- if (!afr_has_arbiter_fop_cbk_quorum(frame))
- need_dirty = _gf_true;
- } else if (!afr_has_fop_cbk_quorum(frame)) {
+ if (!afr_has_fop_cbk_quorum(frame))
need_dirty = _gf_true;
- }
return need_dirty;
}
@@ -1026,12 +986,8 @@ afr_handle_quorum(call_frame_t *frame, xlator_t *this)
* no split-brain with the fix. The problem is eliminated completely.
*/
- if (priv->arbiter_count) {
- if (afr_has_arbiter_fop_cbk_quorum(frame))
- return;
- } else if (afr_has_fop_cbk_quorum(frame)) {
+ if (afr_has_fop_cbk_quorum(frame))
return;
- }
if (afr_need_dirty_marking(frame, this))
goto set_response;
@@ -1073,7 +1029,7 @@ set_response:
}
int
-afr_fill_ta_loc(xlator_t *this, loc_t *loc)
+afr_fill_ta_loc(xlator_t *this, loc_t *loc, gf_boolean_t is_gfid_based_fop)
{
afr_private_t *priv = NULL;
@@ -1081,6 +1037,11 @@ afr_fill_ta_loc(xlator_t *this, loc_t *loc)
loc->parent = inode_ref(priv->root_inode);
gf_uuid_copy(loc->pargfid, loc->parent->gfid);
loc->name = priv->pending_key[THIN_ARBITER_BRICK_INDEX];
+ if (is_gfid_based_fop && gf_uuid_is_null(priv->ta_gfid)) {
+ /* Except afr_ta_id_file_check() which is path based, all other gluster
+ * FOPS need gfid.*/
+ return -EINVAL;
+ }
gf_uuid_copy(loc->gfid, priv->ta_gfid);
loc->inode = inode_new(loc->parent->table);
if (!loc->inode) {
@@ -1090,86 +1051,6 @@ afr_fill_ta_loc(xlator_t *this, loc_t *loc)
return 0;
}
-int
-afr_changelog_thin_arbiter_post_op(xlator_t *this, afr_local_t *local)
-{
- int ret = 0;
- afr_private_t *priv = NULL;
- dict_t *xattr = NULL;
- int failed_count = 0;
- struct gf_flock flock = {
- 0,
- };
- loc_t loc = {
- 0,
- };
- int i = 0;
-
- priv = this->private;
- if (!priv->thin_arbiter_count)
- return 0;
-
- failed_count = AFR_COUNT(local->transaction.failed_subvols,
- priv->child_count);
- if (!failed_count)
- return 0;
-
- GF_ASSERT(failed_count == 1);
- ret = afr_fill_ta_loc(this, &loc);
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
- "Failed to populate thin-arbiter loc for: %s.", loc.name);
- goto out;
- }
-
- xattr = dict_new();
- if (!xattr) {
- ret = -ENOMEM;
- goto out;
- }
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_set_static_bin(xattr, priv->pending_key[i],
- local->pending[i],
- AFR_NUM_CHANGE_LOGS * sizeof(int));
- if (ret)
- goto out;
- }
-
- flock.l_type = F_WRLCK;
- flock.l_start = 0;
- flock.l_len = 0;
-
- /*TODO: Convert to two domain locking. */
- ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
- AFR_TA_DOM_NOTIFY, &loc, F_SETLKW, &flock, NULL, NULL);
- if (ret)
- goto out;
-
- ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], &loc,
- GF_XATTROP_ADD_ARRAY, xattr, NULL, NULL, NULL);
-
- if (ret == -EINVAL) {
- gf_msg(this->name, GF_LOG_INFO, -ret, AFR_MSG_THIN_ARB,
- "Thin-arbiter has denied post-op on %s for gfid %s.",
- priv->pending_key[THIN_ARBITER_BRICK_INDEX],
- uuid_utoa(local->inode->gfid));
-
- } else if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
- "Post-op on thin-arbiter id file %s failed for gfid %s.",
- priv->pending_key[THIN_ARBITER_BRICK_INDEX],
- uuid_utoa(local->inode->gfid));
- }
- flock.l_type = F_UNLCK;
- syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], AFR_TA_DOM_NOTIFY,
- &loc, F_SETLK, &flock, NULL, NULL);
-out:
- if (xattr)
- dict_unref(xattr);
-
- return ret;
-}
-
static int
afr_ta_post_op_done(int ret, call_frame_t *frame, void *opaque)
{
@@ -1264,9 +1145,9 @@ afr_ta_post_op_do(void *opaque)
this = local->transaction.frame->this;
priv = this->private;
- ret = afr_fill_ta_loc(this, &loc);
+ ret = afr_fill_ta_loc(this, &loc, _gf_true);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
"Failed to populate loc for thin-arbiter.");
goto out;
}
@@ -2466,8 +2347,13 @@ afr_is_delayed_changelog_post_op_needed(call_frame_t *frame, xlator_t *this,
goto out;
}
- if ((local->op != GF_FOP_WRITE) && (local->op != GF_FOP_FXATTROP)) {
- /*Only allow writes but shard does [f]xattrops on writes, so
+ if (local->transaction.disable_delayed_post_op) {
+ goto out;
+ }
+
+ if ((local->op != GF_FOP_WRITE) && (local->op != GF_FOP_FXATTROP) &&
+ (local->op != GF_FOP_FSYNC)) {
+ /*Only allow writes/fsyncs but shard does [f]xattrops on writes, so
* they are fine too*/
goto out;
}
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index ec7aa22..df7366f 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -135,6 +135,27 @@ set_data_self_heal_algorithm(afr_private_t *priv, char *algo)
}
}
+/*
+ * Derive the anonymous-inode directory name and gfid string from the
+ * "volume-id" option.
+ *
+ * If the option is absent nothing is set. Otherwise, the first time this
+ * runs with an empty priv->anon_inode_name, the name becomes
+ * "<AFR_ANON_DIR_PREFIX>-<volume-id>" and priv->anon_gfid_str becomes the
+ * volume id parsed as a uuid with the lowest bit of its first byte flipped.
+ * Later calls leave both values untouched.
+ */
+void
+afr_handle_anon_inode_options(afr_private_t *priv, dict_t *options)
+{
+    char *volfile_id_str = NULL;
+    uuid_t anon_inode_gfid = {0};
+
+    /*If volume id is not present don't enable anything*/
+    if (dict_get_str(options, "volume-id", &volfile_id_str))
+        return;
+    /* The name is built as "<prefix>-<volume-id>", so the '-' separator
+     * must be counted against NAME_MAX as well. */
+    GF_ASSERT(strlen(AFR_ANON_DIR_PREFIX) + 1 + strlen(volfile_id_str) <=
+              NAME_MAX);
+    /*anon_inode_name is not supposed to change once assigned*/
+    if (!priv->anon_inode_name[0]) {
+        snprintf(priv->anon_inode_name, sizeof(priv->anon_inode_name), "%s-%s",
+                 AFR_ANON_DIR_PREFIX, volfile_id_str);
+        /* NOTE(review): the result of gf_uuid_parse() is not checked; if
+         * the volume id is not a valid uuid string the gfid below is derived
+         * from whatever the parser left in anon_inode_gfid - confirm that
+         * volume-id is always a well-formed uuid here. */
+        gf_uuid_parse(volfile_id_str, anon_inode_gfid);
+        /*Flip a bit to make sure volfile-id and anon-gfid are not same*/
+        anon_inode_gfid[0] ^= 1;
+        uuid_utoa_r(anon_inode_gfid, priv->anon_gfid_str);
+    }
+}
+
int
reconfigure(xlator_t *this, dict_t *options)
{
@@ -168,7 +189,8 @@ reconfigure(xlator_t *this, dict_t *options)
bool, out);
GF_OPTION_RECONF("data-self-heal", data_self_heal, options, str, out);
- gf_string2boolean(data_self_heal, &priv->data_self_heal);
+ if (gf_string2boolean(data_self_heal, &priv->data_self_heal) == -1)
+ goto out;
GF_OPTION_RECONF("entry-self-heal", priv->entry_self_heal, options, bool,
out);
@@ -289,6 +311,10 @@ reconfigure(xlator_t *this, dict_t *options)
consistent_io = _gf_false;
priv->consistent_io = consistent_io;
+ afr_handle_anon_inode_options(priv, options);
+
+ GF_OPTION_RECONF("use-anonymous-inode", priv->use_anon_inode, options, bool,
+ out);
if (priv->shd.enabled) {
if ((priv->shd.enabled != enabled_old) ||
(timeout_old != priv->shd.timeout))
@@ -485,7 +511,8 @@ init(xlator_t *this)
GF_OPTION_INIT("heal-wait-queue-length", priv->heal_wait_qlen, uint32, out);
GF_OPTION_INIT("data-self-heal", data_self_heal, str, out);
- gf_string2boolean(data_self_heal, &priv->data_self_heal);
+ if (gf_string2boolean(data_self_heal, &priv->data_self_heal) == -1)
+ goto out;
GF_OPTION_INIT("data-self-heal-algorithm", data_self_heal_algorithm, str,
out);
@@ -539,7 +566,9 @@ init(xlator_t *this)
GF_OPTION_INIT("consistent-metadata", priv->consistent_metadata, bool, out);
GF_OPTION_INIT("consistent-io", priv->consistent_io, bool, out);
+ afr_handle_anon_inode_options(priv, this->options);
+ GF_OPTION_INIT("use-anonymous-inode", priv->use_anon_inode, bool, out);
if (priv->quorum_count != 0)
priv->consistent_io = _gf_false;
@@ -551,13 +580,19 @@ init(xlator_t *this)
goto out;
}
+ priv->anon_inode = GF_CALLOC(sizeof(unsigned char), child_count,
+ gf_afr_mt_char);
+
priv->child_up = GF_CALLOC(sizeof(unsigned char), child_count,
gf_afr_mt_char);
priv->child_latency = GF_MALLOC(sizeof(*priv->child_latency) * child_count,
gf_afr_mt_child_latency_t);
+ priv->halo_child_up = GF_CALLOC(sizeof(unsigned char), child_count,
+ gf_afr_mt_char);
- if (!priv->child_up || !priv->child_latency) {
+ if (!priv->child_up || !priv->child_latency || !priv->halo_child_up ||
+ !priv->anon_inode) {
ret = -ENOMEM;
goto out;
}
@@ -1282,6 +1317,14 @@ struct volume_options options[] = {
.tags = {"replicate"},
.description = "This option exists only for backward compatibility "
"and configuring it doesn't have any effect"},
+ {.key = {"use-anonymous-inode"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "no",
+ .op_version = {GD_OP_VERSION_8_0},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
+ .tags = {"replicate"},
+ .description = "Setting this option heals directory renames efficiently"},
+
{.key = {NULL}},
};
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 8845656..d62f9a9 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -42,6 +42,7 @@
#define AFR_LK_HEAL_DOM "afr.lock-heal.domain"
#define AFR_HALO_MAX_LATENCY 99999
+#define AFR_ANON_DIR_PREFIX ".glusterfs-anonymous-inode"
#define PFLAG_PENDING (1 << 0)
#define PFLAG_SBRAIN (1 << 1)
@@ -190,7 +191,9 @@ typedef struct _afr_private {
struct list_head ta_waitq;
struct list_head ta_onwireq;
+ unsigned char *anon_inode;
unsigned char *child_up;
+ unsigned char *halo_child_up;
int64_t *child_latency;
unsigned char *local;
@@ -274,10 +277,15 @@ typedef struct _afr_private {
gf_boolean_t esh_granular;
gf_boolean_t consistent_io;
gf_boolean_t data_self_heal; /* on/off */
+ gf_boolean_t use_anon_inode;
/*For lock healing.*/
struct list_head saved_locks;
struct list_head lk_healq;
+
+ /*For anon-inode handling */
+ char anon_inode_name[NAME_MAX + 1];
+ char anon_gfid_str[UUID_SIZE + 1];
} afr_private_t;
typedef enum {
@@ -900,7 +908,7 @@ typedef struct _afr_local {
gf_boolean_t uninherit_done;
gf_boolean_t uninherit_value;
- /* post-op hook */
+ gf_boolean_t disable_delayed_post_op;
} transaction;
syncbarrier_t barrier;
@@ -996,7 +1004,10 @@ afr_inode_read_subvol_set(inode_t *inode, xlator_t *this,
int event_generation);
int
-afr_inode_event_gen_reset(inode_t *inode, xlator_t *this);
+__afr_inode_need_refresh_set(inode_t *inode, xlator_t *this);
+
+int
+afr_inode_need_refresh_set(inode_t *inode, xlator_t *this);
int
afr_read_subvol_select_by_policy(inode_t *inode, xlator_t *this,
@@ -1267,8 +1278,8 @@ int
afr_inode_split_brain_choice_set(inode_t *inode, xlator_t *this,
int spb_choice);
int
-afr_inode_split_brain_choice_get(inode_t *inode, xlator_t *this,
- int *spb_choice);
+afr_split_brain_read_subvol_get(inode_t *inode, xlator_t *this,
+ call_frame_t *frame, int *spb_subvol);
int
afr_get_child_index_from_name(xlator_t *this, char *name);
@@ -1353,7 +1364,7 @@ int
afr_set_inode_local(xlator_t *this, afr_local_t *local, inode_t *inode);
int
-afr_fill_ta_loc(xlator_t *this, loc_t *loc);
+afr_fill_ta_loc(xlator_t *this, loc_t *loc, gf_boolean_t is_gfid_based_fop);
int
afr_ta_post_op_lock(xlator_t *this, loc_t *loc);
@@ -1401,4 +1412,12 @@ afr_is_lock_mode_mandatory(dict_t *xdata);
void
afr_dom_lock_release(call_frame_t *frame);
+
+void
+afr_fill_success_replies(afr_local_t *local, afr_private_t *priv,
+ unsigned char *replies);
+
+gf_boolean_t
+afr_is_private_directory(afr_private_t *priv, uuid_t pargfid, const char *name,
+ pid_t pid);
#endif /* __AFR_H__ */