summary | refs | log | tree | commit | diff | stats
path: root/xlators/cluster
diff options
context:
space:
mode:
author: Krutika Dhananjay <kdhananj@redhat.com> 2015-10-14 14:14:51 +0530
committer: Pranith Kumar Karampuri <pkarampu@redhat.com> 2016-04-29 18:21:56 -0700
commit: 84c8cc9c5936a2a7539f343c180f06312c8f6d39 (patch)
tree: 0a89b67bde2e03dafa9f61ffea34f19d11cc9938 /xlators/cluster
parent: f0fb05d2cefae08c143f2bfdef151084f5ddb498 (diff)
cluster/afr: Entry self-heal performance enhancements
Change-Id: I52da41dff5619492b656c2217f4716a6cdadebe0
BUG: 1269461
Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
Reviewed-on: http://review.gluster.org/12442
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
Smoke: Gluster Build System <jenkins@build.gluster.com>
CentOS-regression: Gluster Build System <jenkins@build.gluster.com>
Diffstat (limited to 'xlators/cluster')
-rw-r--r--  xlators/cluster/afr/src/afr-common.c             13
-rw-r--r--  xlators/cluster/afr/src/afr-inode-write.c        13
-rw-r--r--  xlators/cluster/afr/src/afr-self-heal-common.c   95
-rw-r--r--  xlators/cluster/afr/src/afr-self-heal-entry.c   305
-rw-r--r--  xlators/cluster/afr/src/afr-self-heal-name.c      2
-rw-r--r--  xlators/cluster/afr/src/afr-self-heal.h           5
-rw-r--r--  xlators/cluster/afr/src/afr-self-heald.c          2
-rw-r--r--  xlators/cluster/afr/src/afr.c                    10
-rw-r--r--  xlators/cluster/afr/src/afr.h                    11
9 files changed, 408 insertions, 48 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index fda9785bdda..160170e035c 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -4284,6 +4284,8 @@ afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno)
goto out;
}
+ local->need_full_crawl = _gf_false;
+
INIT_LIST_HEAD (&local->healer);
return 0;
out:
@@ -4535,9 +4537,11 @@ afr_mark_pending_changelog (afr_private_t *priv, unsigned char *pending,
int **changelog = NULL;
int idx = -1;
int m_idx = 0;
+ int d_idx = 0;
int ret = 0;
m_idx = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION);
+ d_idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION);
idx = afr_index_from_ia_type (iat);
@@ -4552,6 +4556,11 @@ afr_mark_pending_changelog (afr_private_t *priv, unsigned char *pending,
changelog[i][m_idx] = hton32(1);
if (idx != -1)
changelog[i][idx] = hton32(1);
+ /* If the newentry marking is on a newly created directory,
+ * then mark it with the full-heal indicator.
+ */
+ if ((IA_ISDIR (iat)) && (priv->esh_granular))
+ changelog[i][d_idx] = hton32(1);
}
ret = afr_set_pending_dict (priv, xattr, changelog);
if (ret < 0) {
@@ -4764,12 +4773,12 @@ afr_selfheal_locked_entry_inspect (call_frame_t *frame, xlator_t *this,
*esh = afr_decide_heal_info (priv, sources, ret);
}
afr_selfheal_unentrylk (frame, this, inode, this->name, NULL,
- data_lock);
+ data_lock, NULL);
}
unlock:
if (!granular_locks)
afr_selfheal_unentrylk (frame, this, inode, priv->sh_domain,
- NULL, locked_on);
+ NULL, locked_on, NULL);
out:
if (locked_replies)
afr_replies_wipe (locked_replies, priv->child_count);
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
index 15bae87a4f4..f240b5eec39 100644
--- a/xlators/cluster/afr/src/afr-inode-write.c
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -1104,12 +1104,13 @@ _afr_handle_empty_brick_type (xlator_t *this, call_frame_t *frame,
afr_transaction_type type,
char *op_type)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- unsigned char *locked_nodes = NULL;
int count = 0;
int ret = -ENOMEM;
int idx = -1;
+ int d_idx = -1;
+ unsigned char *locked_nodes = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
priv = this->private;
local = frame->local;
@@ -1117,6 +1118,7 @@ _afr_handle_empty_brick_type (xlator_t *this, call_frame_t *frame,
locked_nodes = alloca0 (priv->child_count);
idx = afr_index_for_transaction_type (type);
+ d_idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION);
local->pending = afr_matrix_create (priv->child_count,
AFR_NUM_CHANGE_LOGS);
@@ -1125,6 +1127,9 @@ _afr_handle_empty_brick_type (xlator_t *this, call_frame_t *frame,
local->pending[empty_index][idx] = hton32 (1);
+ if ((priv->esh_granular) && (type == AFR_ENTRY_TRANSACTION))
+ local->pending[empty_index][d_idx] = hton32 (1);
+
local->xdata_req = dict_new ();
if (!local->xdata_req)
goto out;
@@ -1165,7 +1170,7 @@ _afr_handle_empty_brick_type (xlator_t *this, call_frame_t *frame,
unlock:
if (AFR_ENTRY_TRANSACTION == type) {
afr_selfheal_unentrylk (frame, this, loc->inode, this->name,
- NULL, locked_nodes);
+ NULL, locked_nodes, NULL);
} else {
afr_selfheal_uninodelk (frame, this, loc->inode, this->name,
LLONG_MAX - 1, 0, locked_nodes);
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 68b5bb06799..0b92f616030 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -34,7 +34,7 @@ afr_selfheal_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int
afr_selfheal_post_op (call_frame_t *frame, xlator_t *this, inode_t *inode,
- int subvol, dict_t *xattr)
+ int subvol, dict_t *xattr, dict_t *xdata)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
@@ -48,7 +48,7 @@ afr_selfheal_post_op (call_frame_t *frame, xlator_t *this, inode_t *inode,
STACK_WIND (frame, afr_selfheal_post_op_cbk, priv->children[subvol],
priv->children[subvol]->fops->xattrop, &loc,
- GF_XATTROP_ADD_ARRAY, xattr, NULL);
+ GF_XATTROP_ADD_ARRAY, xattr, xdata);
syncbarrier_wait (&local->barrier, 1);
@@ -80,18 +80,22 @@ afr_check_stale_error (struct afr_reply *replies, afr_private_t *priv)
dict_t *
-afr_selfheal_output_xattr (xlator_t *this, afr_transaction_type type,
- int *output_dirty, int **output_matrix, int subvol)
+afr_selfheal_output_xattr (xlator_t *this, gf_boolean_t is_full_crawl,
+ afr_transaction_type type, int *output_dirty,
+ int **output_matrix, int subvol,
+ int **full_heal_mtx_out)
{
- dict_t *xattr = NULL;
- afr_private_t *priv = NULL;
- int j = 0;
- int idx = 0;
- int ret = 0;
- int *raw = 0;
+ int j = 0;
+ int idx = 0;
+ int d_idx = 0;
+ int ret = 0;
+ int *raw = 0;
+ dict_t *xattr = NULL;
+ afr_private_t *priv = NULL;
priv = this->private;
idx = afr_index_for_transaction_type (type);
+ d_idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION);
xattr = dict_new ();
if (!xattr)
@@ -118,6 +122,8 @@ afr_selfheal_output_xattr (xlator_t *this, afr_transaction_type type,
goto err;
raw[idx] = hton32 (output_matrix[subvol][j]);
+ if (is_full_crawl)
+ raw[d_idx] = hton32 (full_heal_mtx_out[subvol][j]);
ret = dict_set_bin (xattr, priv->pending_key[j],
raw, sizeof(int) * AFR_NUM_CHANGE_LOGS);
@@ -142,37 +148,57 @@ afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode,
struct afr_reply *replies, unsigned char *locked_on)
{
afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
int i = 0;
int j = 0;
unsigned char *pending = NULL;
int *input_dirty = NULL;
int **input_matrix = NULL;
+ int **full_heal_mtx_in = NULL;
+ int **full_heal_mtx_out = NULL;
int *output_dirty = NULL;
int **output_matrix = NULL;
dict_t *xattr = NULL;
+ dict_t *xdata = NULL;
priv = this->private;
+ local = frame->local;
pending = alloca0 (priv->child_count);
input_dirty = alloca0 (priv->child_count * sizeof (int));
input_matrix = ALLOC_MATRIX (priv->child_count, int);
+ full_heal_mtx_in = ALLOC_MATRIX (priv->child_count, int);
+ full_heal_mtx_out = ALLOC_MATRIX (priv->child_count, int);
output_dirty = alloca0 (priv->child_count * sizeof (int));
output_matrix = ALLOC_MATRIX (priv->child_count, int);
+ xdata = dict_new ();
+ if (!xdata)
+ return -1;
+
afr_selfheal_extract_xattr (this, replies, type, input_dirty,
input_matrix);
+ if (local->need_full_crawl)
+ afr_selfheal_extract_xattr (this, replies, AFR_DATA_TRANSACTION,
+ NULL, full_heal_mtx_in);
+
for (i = 0; i < priv->child_count; i++)
if (sinks[i] && !healed_sinks[i])
pending[i] = 1;
for (i = 0; i < priv->child_count; i++) {
for (j = 0; j < priv->child_count; j++) {
- if (pending[j])
+ if (pending[j]) {
output_matrix[i][j] = 1;
- else
+ if (type == AFR_ENTRY_TRANSACTION)
+ full_heal_mtx_out[i][j] = 1;
+ } else {
output_matrix[i][j] = -input_matrix[i][j];
+ if (type == AFR_ENTRY_TRANSACTION)
+ full_heal_mtx_out[i][j] = -full_heal_mtx_in[i][j];
+ }
}
}
@@ -188,17 +214,30 @@ afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode,
*/
continue;
- xattr = afr_selfheal_output_xattr (this, type, output_dirty,
- output_matrix, i);
+ xattr = afr_selfheal_output_xattr (this, local->need_full_crawl,
+ type, output_dirty,
+ output_matrix, i,
+ full_heal_mtx_out);
if (!xattr) {
continue;
}
- afr_selfheal_post_op (frame, this, inode, i, xattr);
+ if ((type == AFR_ENTRY_TRANSACTION) && (priv->esh_granular)) {
+ if (xdata &&
+ dict_set_int8 (xdata, GF_XATTROP_PURGE_INDEX, 1))
+ gf_msg (this->name, GF_LOG_WARNING, 0,
+ AFR_MSG_DICT_SET_FAILED, "Failed to set"
+ " dict value for %s",
+ GF_XATTROP_PURGE_INDEX);
+ }
+ afr_selfheal_post_op (frame, this, inode, i, xattr, xdata);
dict_unref (xattr);
}
+ if (xdata)
+ dict_unref (xdata);
+
return 0;
}
@@ -242,6 +281,9 @@ afr_selfheal_fill_dirty (xlator_t *this, int *dirty, int subvol,
void *pending_raw = NULL;
int pending[3] = {0, };
+ if (!dirty)
+ return 0;
+
if (dict_get_ptr (xdata, AFR_DIRTY, &pending_raw))
return -1;
@@ -267,6 +309,9 @@ afr_selfheal_fill_matrix (xlator_t *this, int **matrix, int subvol,
priv = this->private;
+ if (!matrix)
+ return 0;
+
for (i = 0; i < priv->child_count; i++) {
if (dict_get_ptr (xdata, priv->pending_key[i], &pending_raw))
continue;
@@ -1150,7 +1195,7 @@ afr_selfheal_entrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
local->replies[i].op_errno == EAGAIN) {
afr_locked_fill (frame, this, locked_on);
afr_selfheal_unentrylk (frame, this, inode, dom, name,
- locked_on);
+ locked_on, NULL);
AFR_SEQ (frame, afr_selfheal_lock_cbk, entrylk, dom,
&loc, name, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
@@ -1189,7 +1234,7 @@ afr_selfheal_tie_breaker_entrylk (call_frame_t *frame, xlator_t *this,
if (lock_count > priv->child_count/2 && eagain_count) {
afr_locked_fill (frame, this, locked_on);
afr_selfheal_unentrylk (frame, this, inode, dom, name,
- locked_on);
+ locked_on, NULL);
AFR_SEQ (frame, afr_selfheal_lock_cbk, entrylk, dom,
&loc, name, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
@@ -1203,7 +1248,8 @@ afr_selfheal_tie_breaker_entrylk (call_frame_t *frame, xlator_t *this,
int
afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
- char *dom, const char *name, unsigned char *locked_on)
+ char *dom, const char *name, unsigned char *locked_on,
+ dict_t *xdata)
{
loc_t loc = {0,};
@@ -1211,7 +1257,7 @@ afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
gf_uuid_copy (loc.gfid, inode->gfid);
AFR_ONLIST (locked_on, frame, afr_selfheal_lock_cbk, entrylk,
- dom, &loc, name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL);
+ dom, &loc, name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, xdata);
loc_wipe (&loc);
@@ -1316,7 +1362,12 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this,
if (replies[i].op_ret == -1)
continue;
- if (data_selfheal && afr_is_data_set (this, replies[i].xdata))
+ /* The data segment of the changelog can be non-zero to indicate
+ * the directory needs a full heal. So the check below ensures
+ * it's not a directory before setting the data_selfheal boolean.
+ */
+ if (data_selfheal && !IA_ISDIR (replies[i].poststat.ia_type) &&
+ afr_is_data_set (this, replies[i].xdata))
*data_selfheal = _gf_true;
if (metadata_selfheal &&
@@ -1326,7 +1377,7 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this,
if (entry_selfheal && afr_is_entry_set (this, replies[i].xdata))
*entry_selfheal = _gf_true;
- valid_cnt ++;
+ valid_cnt++;
if (valid_cnt == 1) {
first = replies[i].poststat;
continue;
@@ -1500,7 +1551,7 @@ afr_selfheal_newentry_mark (call_frame_t *frame, xlator_t *this, inode_t *inode,
for (i = 0; i < priv->child_count; i++) {
if (!sources[i])
continue;
- afr_selfheal_post_op (frame, this, inode, i, xattr);
+ afr_selfheal_post_op (frame, this, inode, i, xattr, NULL);
}
out:
if (changelog)
diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c
index fccffa7dbac..00af8e9f2e6 100644
--- a/xlators/cluster/afr/src/afr-self-heal-entry.c
+++ b/xlators/cluster/afr/src/afr-self-heal-entry.c
@@ -14,6 +14,7 @@
#include "byte-order.h"
#include "afr-transaction.h"
#include "afr-messages.h"
+#include "syncop-utils.h"
/* Max file name length is 255 this filename is of length 256. No file with
* this name can ever come, entry-lock with this name is going to prevent
@@ -349,6 +350,82 @@ __afr_selfheal_entry_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd,
return ret;
}
+/* Report whether any per-brick pending changelog carried in @xdata has a
+ * non-zero counter in slot @idx.  The caller in this file passes the
+ * data-transaction index, which granular entry heal uses on directories
+ * as the "full heal needed" indicator.
+ *
+ * Returns _gf_false when @xdata is NULL or when no pending key has the
+ * slot set; _gf_true as soon as one non-zero slot is found.
+ */
+static gf_boolean_t
+is_full_heal_marker_present (xlator_t *this, dict_t *xdata, int idx)
+{
+        afr_private_t *priv        = this->private;
+        void          *raw         = NULL;
+        int            counters[3] = {0,};
+        int            child       = 0;
+
+        if (xdata == NULL)
+                return _gf_false;
+
+        for (child = 0; child < priv->child_count; child++) {
+                /* Nothing to inspect if this brick's pending key is
+                 * missing from the dict or carries no value.
+                 */
+                if (dict_get_ptr (xdata, priv->pending_key[child], &raw) ||
+                    raw == NULL)
+                        continue;
+
+                /* The stored counters are in network byte order. */
+                memcpy (counters, raw, sizeof (counters));
+                if (ntoh32 (counters[idx]) != 0)
+                        return _gf_true;
+        }
+
+        return _gf_false;
+}
+
+/* Decide whether entry self-heal must crawl the whole directory instead
+ * of only the names recorded under indices/entry-changes.
+ *
+ * A full crawl is required when:
+ *   - granular entry heal is disabled (priv->esh_granular is false), or
+ *   - @type is not AFR_ENTRY_TRANSACTION, or
+ *   - the full-heal indicator (non-zero data slot of a pending changelog)
+ *     is present in the reply of @source or of any brick set in
+ *     @healed_sinks.
+ *
+ * @source may be -1 when there is no single clear source; then only the
+ * healed sinks are examined.
+ *
+ * Returns _gf_true if a full crawl is needed, _gf_false if crawling the
+ * granular index is sufficient.
+ */
+static gf_boolean_t
+afr_need_full_heal (xlator_t *this, struct afr_reply *replies, int source,
+                    unsigned char *healed_sinks, afr_transaction_type type)
+{
+        int            i    = 0;
+        int            idx  = 0;
+        afr_private_t *priv = NULL;
+
+        priv = this->private;
+
+        if (!priv->esh_granular)
+                return _gf_true;
+
+        if (type != AFR_ENTRY_TRANSACTION)
+                return _gf_true;
+
+        idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION);
+
+        /* If there is a clear source, check whether the full-heal-indicator
+         * is present in its xdata first.
+         */
+        if (source != -1) {
+                if (is_full_heal_marker_present (this, replies[source].xdata,
+                                                 idx))
+                        return _gf_true;
+        }
+
+        /* Whether or not a source exists, also examine every healed sink:
+         * the indicator on *even* one of them forces a full crawl.
+         */
+        for (i = 0; i < priv->child_count; i++) {
+                if (!healed_sinks[i])
+                        continue;
+
+                if (is_full_heal_marker_present (this, replies[i].xdata, idx))
+                        return _gf_true;
+        }
+
+        return _gf_false;
+}
+
static int
__afr_selfheal_entry_finalize_source (xlator_t *this, unsigned char *sources,
unsigned char *healed_sinks,
@@ -431,7 +508,8 @@ __afr_selfheal_entry_prepare (call_frame_t *frame, xlator_t *this,
static int
afr_selfheal_entry_dirent (call_frame_t *frame, xlator_t *this,
- fd_t *fd, char *name)
+ fd_t *fd, char *name, inode_t *parent_idx_inode,
+ xlator_t *subvol, gf_boolean_t full_crawl)
{
int ret = 0;
int source = -1;
@@ -486,10 +564,15 @@ afr_selfheal_entry_dirent (call_frame_t *frame, xlator_t *this,
ret = __afr_selfheal_entry_dirent (frame, this, fd, name, inode,
source, sources, healed_sinks,
locked_on, replies);
+
+ if ((ret == 0) && (priv->esh_granular) && (!full_crawl))
+ ret = afr_shd_index_purge (subvol, parent_idx_inode,
+ name);
}
+
unlock:
afr_selfheal_unentrylk (frame, this, fd->inode, this->name, NULL,
- locked_on);
+ locked_on, NULL);
if (inode)
inode_unref (inode);
if (replies)
@@ -513,12 +596,16 @@ afr_selfheal_entry_do_subvol (call_frame_t *frame, xlator_t *this,
xlator_t *subvol = NULL;
afr_private_t *priv = NULL;
gf_boolean_t mismatch = _gf_false;
+ afr_local_t *iter_local = NULL;
+ afr_local_t *local = NULL;
priv = this->private;
subvol = priv->children[child];
INIT_LIST_HEAD (&entries.list);
+ local = frame->local;
+
iter_frame = afr_copy_frame (frame);
if (!iter_frame)
return -ENOMEM;
@@ -539,7 +626,9 @@ afr_selfheal_entry_do_subvol (call_frame_t *frame, xlator_t *this,
continue;
ret = afr_selfheal_entry_dirent (iter_frame, this, fd,
- entry->d_name);
+ entry->d_name, NULL,
+ NULL,
+ local->need_full_crawl);
AFR_STACK_RESET (iter_frame);
if (iter_frame->local == NULL) {
ret = -ENOTCONN;
@@ -567,36 +656,210 @@ afr_selfheal_entry_do_subvol (call_frame_t *frame, xlator_t *this,
return ret;
}
+/* Resolve, on @subvol, the index directory that records granular entry
+ * changes for the directory whose gfid is @pargfid
+ * (indices/entry-changes/<pargfid>).
+ *
+ * The gfid of the entry-changes base directory is fetched from the root
+ * via the GF_XATTROP_ENTRY_CHANGES_GFID virtual xattr; the textual form of
+ * @pargfid is then looked up beneath it and the result is linked into
+ * this->itable.
+ *
+ * Returns the linked inode on success; the caller releases the reference
+ * (afr_selfheal_entry_granular() does so through loc_wipe()).
+ * Returns NULL on failure with errno set to a non-zero error code.
+ */
+static inode_t *
+afr_shd_entry_changes_index_inode (xlator_t *this, xlator_t *subvol,
+                                   uuid_t pargfid)
+{
+        int          ret        = -1;
+        int          op_errno   = 0;
+        void        *index_gfid = NULL;
+        loc_t        rootloc    = {0,};
+        loc_t        loc        = {0,};
+        dict_t      *xattr      = NULL;
+        inode_t     *inode      = NULL;
+        struct iatt  iatt       = {0,};
+
+        rootloc.inode = inode_ref (this->itable->root);
+        gf_uuid_copy (rootloc.gfid, rootloc.inode->gfid);
+
+        ret = syncop_getxattr (subvol, &rootloc, &xattr,
+                               GF_XATTROP_ENTRY_CHANGES_GFID, NULL, NULL);
+        if (ret || !xattr) {
+                /* A "successful" reply without a dict must not leave the
+                 * error code at 0: the caller turns errno into its return
+                 * value and 0 would read as success.
+                 */
+                op_errno = ret ? -ret : EINVAL;
+                goto out;
+        }
+
+        ret = dict_get_ptr (xattr, GF_XATTROP_ENTRY_CHANGES_GFID, &index_gfid);
+        if (ret) {
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        loc.inode = inode_new (this->itable);
+        if (!loc.inode) {
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        gf_uuid_copy (loc.pargfid, index_gfid);
+        loc.name = gf_strdup (uuid_utoa (pargfid));
+        if (!loc.name) {
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        ret = syncop_lookup (subvol, &loc, &iatt, NULL, NULL, NULL);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        inode = inode_link (loc.inode, NULL, NULL, &iatt);
+        if (!inode)
+                op_errno = EINVAL;
+
+out:
+        if (xattr)
+                dict_unref (xattr);
+        loc_wipe (&rootloc);
+        /* loc.name was allocated here, so it is freed here explicitly. */
+        GF_FREE ((char *)loc.name);
+        loc_wipe (&loc);
+
+        /* Publish the error only after cleanup so that nothing called
+         * above can overwrite it before the caller reads errno.
+         */
+        if (!inode)
+                errno = op_errno;
+
+        return inode;
+}
+
+/* syncop_dir_scan() callback: heal one name recorded under the
+ * indices/entry-changes/<pargfid> directory of @subvol.
+ *
+ * @entry  - the index entry; its d_name is the name to heal.
+ * @parent - loc of the index directory being scanned on @subvol.
+ * @data   - afr_granular_esh_args_t describing the heal in progress.
+ *
+ * A type/gfid mismatch (-1 from the per-name heal) is recorded in
+ * args->mismatch.  Per-name heal failures are not propagated, so the
+ * remaining names are still processed.
+ *
+ * Returns 0 to continue, or a negative errno when this callback cannot
+ * go on (-ENOMEM if no inode could be allocated, -ENOTCONN if the heal
+ * frame lost its local after being reset).
+ * NOTE(review): a non-zero return is expected to stop the directory
+ * scan - confirm against syncop_dir_scan().
+ */
+static int
+afr_selfheal_entry_granular_dirent (xlator_t *subvol, gf_dirent_t *entry,
+                                    loc_t *parent, void *data)
+{
+        int                      ret  = 0;
+        loc_t                    loc  = {0,};
+        struct iatt              iatt = {0,};
+        afr_granular_esh_args_t *args = data;
+
+        /* Look up the actual inode associated with entry. If the lookup
+         * returns ESTALE or ENOENT, then it means we have a stale index.
+         * Remove it. This is analogous to the check in
+         * afr_shd_index_heal() except that here it is achieved through
+         * LOOKUP and in afr_shd_index_heal() through a GETXATTR.
+         */
+
+        loc.inode = inode_new (args->xl->itable);
+        if (!loc.inode) {
+                ret = -ENOMEM;
+                goto out;
+        }
+        loc.parent = inode_ref (args->heal_fd->inode);
+        gf_uuid_copy (loc.pargfid, loc.parent->gfid);
+        loc.name = entry->d_name;
+
+        ret = syncop_lookup (args->xl, &loc, &iatt, NULL, NULL, NULL);
+        if ((ret == -ENOENT) || (ret == -ESTALE)) {
+                afr_shd_index_purge (subvol, parent->inode, entry->d_name);
+                ret = 0;
+                goto out;
+        }
+        /* TBD: afr_shd_zero_xattrop? */
+
+        ret = afr_selfheal_entry_dirent (args->frame, args->xl, args->heal_fd,
+                                         entry->d_name, parent->inode, subvol,
+                                         _gf_false);
+        AFR_STACK_RESET (args->frame);
+        if (args->frame->local == NULL) {
+                /* The frame can no longer be used for further names, so
+                 * report the failure instead of silently continuing.
+                 */
+                ret = -ENOTCONN;
+                goto out;
+        }
+
+        if (ret == -1)
+                args->mismatch = _gf_true;
+
+        /* Other per-name failures do not abort the scan. */
+        ret = 0;
+
+out:
+        loc_wipe (&loc);
+        return ret;
+}
+
+/* Perform granular entry self-heal of the directory open on @fd by
+ * scanning indices/entry-changes/<gfid-of-directory> on child
+ * @subvol_idx and healing each name found there.
+ *
+ * @is_src tells whether @subvol_idx is the heal source.  A missing or
+ * unreadable index is an error only on the source; on a sink it is
+ * treated as "nothing to do".
+ *
+ * Returns 0 on success, -1 if any name hit a type/gfid mismatch, or a
+ * negative errno on other failures.
+ */
+static int
+afr_selfheal_entry_granular (call_frame_t *frame, xlator_t *this, fd_t *fd,
+                             int subvol_idx, gf_boolean_t is_src)
+{
+        int                      ret    = 0;
+        loc_t                    loc    = {0,};
+        xlator_t                *subvol = NULL;
+        afr_private_t           *priv   = NULL;
+        afr_granular_esh_args_t  args   = {0,};
+
+        priv = this->private;
+        subvol = priv->children[subvol_idx];
+
+        args.frame = afr_copy_frame (frame);
+        if (!args.frame)
+                return -ENOMEM;
+        args.xl = this;
+        /* args.heal_fd represents the fd associated with the original
+         * directory on which entry heal is being attempted.
+         */
+        args.heal_fd = fd;
+
+        /* @subvol here represents the subvolume of AFR where
+         * indices/entry-changes/<pargfid> will be processed
+         */
+        loc.inode = afr_shd_entry_changes_index_inode (this, subvol,
+                                                       fd->inode->gfid);
+        if (!loc.inode) {
+                /* If granular heal failed on the sink (as it might sometimes
+                 * because it is the src that would mostly contain the
+                 * granular changelogs and the sink's entry-changes would be
+                 * empty), do not treat heal as failure.
+                 * errno is read before any cleanup call can overwrite it.
+                 */
+                ret = is_src ? -errno : 0;
+                goto out;
+        }
+
+        ret = syncop_dir_scan (subvol, &loc, GF_CLIENT_PID_SELF_HEALD,
+                               &args, afr_selfheal_entry_granular_dirent);
+
+        if (args.mismatch == _gf_true)
+                ret = -1;
+
+out:
+        loc_wipe (&loc);
+        /* Release the frame copied above on every exit path.
+         * NOTE(review): assumes AFR_STACK_DESTROY is the teardown
+         * counterpart for frames obtained from afr_copy_frame() - confirm.
+         */
+        AFR_STACK_DESTROY (args.frame);
+
+        return ret;
+}
+
static int
afr_selfheal_entry_do (call_frame_t *frame, xlator_t *this, fd_t *fd,
int source, unsigned char *sources,
unsigned char *healed_sinks)
{
- int i = 0;
- afr_private_t *priv = NULL;
- gf_boolean_t mismatch = _gf_false;
- int ret = 0;
+ int i = 0;
+ int ret = 0;
+ gf_boolean_t mismatch = _gf_false;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
priv = this->private;
+ local = frame->local;
gf_msg (this->name, GF_LOG_INFO, 0,
AFR_MSG_SELF_HEAL_INFO, "performing entry selfheal on %s",
uuid_utoa (fd->inode->gfid));
for (i = 0; i < priv->child_count; i++) {
+ /* Expunge */
if (!healed_sinks[i])
continue;
- ret = afr_selfheal_entry_do_subvol (frame, this, fd, i);
+
+ if (!local->need_full_crawl)
+ /* Why call afr_selfheal_entry_granular() on a "healed sink",
+ * given that it is the source that contains the granular
+ * indices?
+ * If the index for this directory is non-existent or empty on
+ * this subvol (=> clear sink), then it will return early
+ * without failure status.
+ * If the index is non-empty and it is yet a 'healed sink', then
+ * it is due to a split-brain in which case we anyway need to
+ * crawl the indices/entry-changes/pargfid directory.
+ */
+ ret = afr_selfheal_entry_granular (frame, this, fd, i,
+ _gf_false);
+ else
+ ret = afr_selfheal_entry_do_subvol (frame, this, fd, i);
+
if (ret == -1) {
/* gfid or type mismatch. */
mismatch = _gf_true;
ret = 0;
}
- if (ret)
- break;
+ if (ret)
+ break;
}
- if (!ret && source != -1)
- ret = afr_selfheal_entry_do_subvol (frame, this, fd, source);
+
+ if (!ret && source != -1) {
+ /* Impunge */
+ if (local->need_full_crawl)
+ ret = afr_selfheal_entry_do_subvol (frame, this, fd,
+ source);
+ else
+ ret = afr_selfheal_entry_granular (frame, this, fd,
+ source, _gf_true);
+ }
if (mismatch == _gf_true)
/* undo pending will be skipped */
@@ -616,10 +879,12 @@ __afr_selfheal_entry (call_frame_t *frame, xlator_t *this, fd_t *fd,
unsigned char *postop_lock = NULL;
unsigned char *healed_sinks = NULL;
struct afr_reply *locked_replies = NULL;
+ afr_local_t *local = NULL;
afr_private_t *priv = NULL;
gf_boolean_t did_sh = _gf_true;
priv = this->private;
+ local = frame->local;
sources = alloca0 (priv->child_count);
sinks = alloca0 (priv->child_count);
@@ -651,10 +916,16 @@ __afr_selfheal_entry (call_frame_t *frame, xlator_t *this, fd_t *fd,
did_sh = _gf_false;
goto unlock;
}
+
+ local->need_full_crawl = afr_need_full_heal (this,
+ locked_replies,
+ source,
+ healed_sinks,
+ AFR_ENTRY_TRANSACTION);
}
unlock:
afr_selfheal_unentrylk (frame, this, fd->inode, this->name, NULL,
- data_lock);
+ data_lock, NULL);
if (ret < 0)
goto out;
@@ -695,7 +966,7 @@ unlock:
}
postop_unlock:
afr_selfheal_unentrylk (frame, this, fd->inode, this->name, NULL,
- postop_lock);
+ postop_lock, NULL);
out:
if (did_sh)
afr_log_selfheal (fd->inode->gfid, this, ret, "entry", source,
@@ -796,10 +1067,12 @@ afr_selfheal_entry (call_frame_t *frame, xlator_t *this, inode_t *inode)
}
if (!granular_locks)
afr_selfheal_unentrylk (frame, this, inode, this->name,
- LONG_FILENAME, long_name_locked);
+ LONG_FILENAME, long_name_locked,
+ NULL);
}
unlock:
- afr_selfheal_unentrylk (frame, this, inode, priv->sh_domain, NULL, locked_on);
+ afr_selfheal_unentrylk (frame, this, inode, priv->sh_domain, NULL,
+ locked_on, NULL);
if (fd)
fd_unref (fd);
diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c
index 9f7a5b1ff0f..3445ecccf9c 100644
--- a/xlators/cluster/afr/src/afr-self-heal-name.c
+++ b/xlators/cluster/afr/src/afr-self-heal-name.c
@@ -618,7 +618,7 @@ afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent,
}
unlock:
afr_selfheal_unentrylk (frame, this, parent, this->name, bname,
- locked_on);
+ locked_on, NULL);
if (inode)
inode_unref (inode);
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index afc086c0560..becbe67e084 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -137,7 +137,8 @@ afr_selfheal_tie_breaker_entrylk (call_frame_t *frame, xlator_t *this,
int
afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
- char *dom, const char *name, unsigned char *locked_on);
+ char *dom, const char *name, unsigned char *locked_on,
+ dict_t *xdata);
int
afr_selfheal_unlocked_discover (call_frame_t *frame, inode_t *inode,
@@ -177,7 +178,7 @@ afr_selfheal_recreate_entry (xlator_t *this, int dst, int source, inode_t *dir,
int
afr_selfheal_post_op (call_frame_t *frame, xlator_t *this, inode_t *inode,
- int subvol, dict_t *xattr);
+ int subvol, dict_t *xattr, dict_t *xdata);
call_frame_t *
afr_frame_create (xlator_t *this);
diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c
index 1ae4f18e764..2ec9d9ce686 100644
--- a/xlators/cluster/afr/src/afr-self-heald.c
+++ b/xlators/cluster/afr/src/afr-self-heald.c
@@ -265,7 +265,7 @@ afr_shd_zero_xattrop (xlator_t *this, uuid_t gfid)
/*Send xattrop to all bricks. Doing a lookup to see if bricks are up or
* has valid repies for this gfid seems a bit of an overkill.*/
for (i = 0; i < priv->child_count; i++)
- afr_selfheal_post_op (frame, this, inode, i, xattr);
+ afr_selfheal_post_op (frame, this, inode, i, xattr, NULL);
out:
if (frame)
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index 0930a081965..d01a806fe86 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -186,6 +186,8 @@ reconfigure (xlator_t *this, dict_t *options)
out);
GF_OPTION_RECONF ("locking-scheme", priv->locking_scheme, options, str,
out);
+ GF_OPTION_RECONF ("granular-entry-heal", priv->esh_granular, options,
+ bool, out);
GF_OPTION_RECONF ("eager-lock", priv->eager_lock, options, bool, out);
GF_OPTION_RECONF ("quorum-type", qtype, options, str, out);
@@ -379,6 +381,7 @@ init (xlator_t *this)
GF_OPTION_INIT ("pre-op-compat", priv->pre_op_compat, bool, out);
GF_OPTION_INIT ("locking-scheme", priv->locking_scheme, str, out);
+ GF_OPTION_INIT ("granular-entry-heal", priv->esh_granular, bool, out);
GF_OPTION_INIT ("eager-lock", priv->eager_lock, bool, out);
GF_OPTION_INIT ("quorum-type", qtype, str, out);
@@ -897,5 +900,12 @@ struct volume_options options[] = {
"stop being compatible with afr-v1, which helps afr "
"be more granular while self-healing",
},
+ { .key = {"granular-entry-heal"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "no",
+ .description = "If this option is enabled, self-heal will resort to "
+ "granular way of recording changelogs and doing entry "
+ "self-heal.",
+ },
{ .key = {NULL} },
};
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 0a872a98284..f16f9b4b4ac 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -139,6 +139,7 @@ typedef struct _afr_private {
void *pump_private;
gf_boolean_t use_afr_in_pump;
char *locking_scheme;
+ gf_boolean_t esh_granular;
} afr_private_t;
@@ -755,6 +756,8 @@ typedef struct _afr_local {
/* For client side background heals. */
struct list_head healer;
call_frame_t *heal_frame;
+
+ gf_boolean_t need_full_crawl;
} afr_local_t;
@@ -789,6 +792,14 @@ typedef struct afr_read_subvol_args {
uuid_t gfid;
} afr_read_subvol_args_t;
+/* Per-heal context handed, as the opaque data pointer, to the callback
+ * that processes each name found in a directory's entry-changes index
+ * during granular entry self-heal.
+ */
+typedef struct afr_granular_esh_args {
+ /* fd of the directory on which entry heal is being attempted */
+ fd_t *heal_fd;
+ /* the AFR xlator performing the heal */
+ xlator_t *xl;
+ /* copy of the heal frame, used for the per-name heal calls */
+ call_frame_t *frame;
+ gf_boolean_t mismatch; /* flag to represent occurrence of type/gfid
+ mismatch */
+} afr_granular_esh_args_t;
+
/* did a call fail due to a child failing? */
#define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \
((op_errno == ENOTCONN) || \