diff options
| author | Krutika Dhananjay <kdhananj@redhat.com> | 2015-10-14 14:14:51 +0530 | 
|---|---|---|
| committer | Pranith Kumar Karampuri <pkarampu@redhat.com> | 2016-04-29 18:21:56 -0700 | 
| commit | 84c8cc9c5936a2a7539f343c180f06312c8f6d39 (patch) | |
| tree | 0a89b67bde2e03dafa9f61ffea34f19d11cc9938 | |
| parent | f0fb05d2cefae08c143f2bfdef151084f5ddb498 (diff) | |
cluster/afr: Entry self-heal performance enhancements
Change-Id: I52da41dff5619492b656c2217f4716a6cdadebe0
BUG: 1269461
Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
Reviewed-on: http://review.gluster.org/12442
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
Smoke: Gluster Build System <jenkins@build.gluster.com>
CentOS-regression: Gluster Build System <jenkins@build.gluster.com>
| -rw-r--r-- | libglusterfs/src/glusterfs.h | 3 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 13 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-inode-write.c | 13 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-common.c | 95 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-entry.c | 305 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-name.c | 2 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal.h | 5 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heald.c | 2 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.c | 10 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.h | 11 | ||||
| -rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-volume-set.c | 6 | 
11 files changed, 417 insertions, 48 deletions
diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h index 6b5958f9894..303714d25f9 100644 --- a/libglusterfs/src/glusterfs.h +++ b/libglusterfs/src/glusterfs.h @@ -167,6 +167,7 @@  /* Index xlator related */  #define GF_XATTROP_INDEX_GFID "glusterfs.xattrop_index_gfid" +#define GF_XATTROP_ENTRY_CHANGES_GFID "glusterfs.xattrop_entry_changes_gfid"  #define GF_XATTROP_INDEX_COUNT "glusterfs.xattrop_index_count"  #define GF_XATTROP_DIRTY_GFID "glusterfs.xattrop_dirty_gfid"  #define GF_XATTROP_DIRTY_COUNT "glusterfs.xattrop_dirty_count" @@ -180,6 +181,8 @@  #define GF_AFR_ADD_BRICK "trusted.add-brick"  #define GF_AFR_REPLACE_BRICK "trusted.replace-brick"  #define GF_AFR_DIRTY "trusted.afr.dirty" +#define GF_XATTROP_ENTRY_OUT "glusterfs.xattrop-entry-delete" +#define GF_XATTROP_PURGE_INDEX "glusterfs.xattrop-purge-index"  #define GF_GFIDLESS_LOOKUP "gfidless-lookup"  /* replace-brick and pump related internal xattrs */ diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index fda9785bdda..160170e035c 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -4284,6 +4284,8 @@ afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno)  		goto out;  	} +        local->need_full_crawl = _gf_false; +          INIT_LIST_HEAD (&local->healer);  	return 0;  out: @@ -4535,9 +4537,11 @@ afr_mark_pending_changelog (afr_private_t *priv, unsigned char *pending,         int **changelog = NULL;         int idx = -1;         int m_idx = 0; +       int d_idx = 0;         int ret = 0;         m_idx = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION); +       d_idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION);         idx = afr_index_from_ia_type (iat); @@ -4552,6 +4556,11 @@ afr_mark_pending_changelog (afr_private_t *priv, unsigned char *pending,                 changelog[i][m_idx] = hton32(1);                 if (idx != -1)                         changelog[i][idx] = hton32(1); +                /* If the newentry marking is on a newly created directory, +                 * then mark it with the full-heal indicator. +                 */ +                if ((IA_ISDIR (iat)) && (priv->esh_granular)) +                        changelog[i][d_idx] = hton32(1);         }         ret = afr_set_pending_dict (priv, xattr, changelog);         if (ret < 0) { @@ -4764,12 +4773,12 @@ afr_selfheal_locked_entry_inspect (call_frame_t *frame, xlator_t *this,                          *esh = afr_decide_heal_info (priv, sources, ret);                  }                  afr_selfheal_unentrylk (frame, this, inode, this->name, NULL, -                                        data_lock); +                                        data_lock, NULL);          }  unlock:          if (!granular_locks)                  afr_selfheal_unentrylk (frame, this, inode, priv->sh_domain, -                                        NULL, locked_on); +                                        NULL, locked_on, NULL);  out:          if (locked_replies)                  afr_replies_wipe (locked_replies, priv->child_count); diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 15bae87a4f4..f240b5eec39 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -1104,12 +1104,13 @@ _afr_handle_empty_brick_type (xlator_t *this, call_frame_t *frame,                              afr_transaction_type type,                              char *op_type)  { -        afr_local_t     *local            = NULL; -        afr_private_t   *priv             = NULL; -        unsigned char   *locked_nodes     = NULL;          int              count            = 0;          int              ret              = -ENOMEM;          int              idx              = -1; +        int              d_idx            = -1; +        unsigned char   *locked_nodes     = NULL; +        afr_local_t     *local            = NULL; +        afr_private_t   *priv             = NULL;          priv = this->private;          local = frame->local; @@ -1117,6 +1118,7 @@ _afr_handle_empty_brick_type (xlator_t *this, call_frame_t *frame,          locked_nodes = alloca0 (priv->child_count);          idx = afr_index_for_transaction_type (type); +        d_idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION);          local->pending = afr_matrix_create (priv->child_count,                                              AFR_NUM_CHANGE_LOGS); @@ -1125,6 +1127,9 @@ _afr_handle_empty_brick_type (xlator_t *this, call_frame_t *frame,          local->pending[empty_index][idx] = hton32 (1); +        if ((priv->esh_granular) && (type == AFR_ENTRY_TRANSACTION)) +                        local->pending[empty_index][d_idx] = hton32 (1); +          local->xdata_req = dict_new ();          if (!local->xdata_req)                  goto out; @@ -1165,7 +1170,7 @@ _afr_handle_empty_brick_type (xlator_t *this, call_frame_t *frame,  unlock:          if (AFR_ENTRY_TRANSACTION == type) {                  afr_selfheal_unentrylk (frame, this, loc->inode, this->name, -                                        NULL, locked_nodes); +                                        NULL, locked_nodes, NULL);          } else {                  afr_selfheal_uninodelk (frame, this, loc->inode, this->name,                                          LLONG_MAX - 1, 0, locked_nodes); diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 68b5bb06799..0b92f616030 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -34,7 +34,7 @@ afr_selfheal_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  int  afr_selfheal_post_op (call_frame_t *frame, xlator_t *this, inode_t *inode, -		      int subvol, dict_t *xattr) +		      int subvol, dict_t *xattr, dict_t *xdata)  {  	afr_private_t *priv = NULL;  	afr_local_t *local = NULL; @@ -48,7 +48,7 @@ afr_selfheal_post_op (call_frame_t *frame, xlator_t *this, inode_t *inode,  	STACK_WIND (frame, afr_selfheal_post_op_cbk, priv->children[subvol],  		    priv->children[subvol]->fops->xattrop, &loc, -		    GF_XATTROP_ADD_ARRAY, xattr, NULL); +		    GF_XATTROP_ADD_ARRAY, xattr, xdata);  	syncbarrier_wait (&local->barrier, 1); @@ -80,18 +80,22 @@ afr_check_stale_error (struct afr_reply *replies, afr_private_t *priv)  dict_t * -afr_selfheal_output_xattr (xlator_t *this, afr_transaction_type type, -			   int *output_dirty, int **output_matrix, int subvol) +afr_selfheal_output_xattr (xlator_t *this, gf_boolean_t is_full_crawl, +                           afr_transaction_type type, int *output_dirty, +                           int **output_matrix, int subvol, +                           int **full_heal_mtx_out)  { -	dict_t *xattr = NULL; -	afr_private_t *priv = NULL; -	int j = 0; -	int idx = 0; -	int ret = 0; -	int *raw = 0; +	int                j     = 0; +	int                idx   = 0; +	int                d_idx = 0; +	int                ret   = 0; +	int               *raw   = 0; +	dict_t            *xattr = NULL; +	afr_private_t     *priv  = NULL;  	priv = this->private;  	idx = afr_index_for_transaction_type (type); +        d_idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION);  	xattr = dict_new ();  	if (!xattr) @@ -118,6 +122,8 @@ afr_selfheal_output_xattr (xlator_t *this, afr_transaction_type type,  			goto err;  		raw[idx] = hton32 (output_matrix[subvol][j]); +                if (is_full_crawl) +                        raw[d_idx] = hton32 (full_heal_mtx_out[subvol][j]);  		ret = dict_set_bin (xattr, priv->pending_key[j],  				    raw, sizeof(int) * AFR_NUM_CHANGE_LOGS); @@ -142,37 +148,57 @@ afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode,  			   struct afr_reply *replies, unsigned char *locked_on)  {  	afr_private_t *priv = NULL; +        afr_local_t *local = NULL;  	int i = 0;  	int j = 0;  	unsigned char *pending = NULL;  	int *input_dirty = NULL;  	int **input_matrix = NULL; +	int **full_heal_mtx_in = NULL; +	int **full_heal_mtx_out = NULL;  	int *output_dirty = NULL;  	int **output_matrix = NULL;  	dict_t *xattr = NULL; +	dict_t *xdata = NULL;  	priv = this->private; +        local = frame->local;  	pending = alloca0 (priv->child_count);  	input_dirty = alloca0 (priv->child_count * sizeof (int));  	input_matrix = ALLOC_MATRIX (priv->child_count, int); +	full_heal_mtx_in = ALLOC_MATRIX (priv->child_count, int); +	full_heal_mtx_out = ALLOC_MATRIX (priv->child_count, int);  	output_dirty = alloca0 (priv->child_count * sizeof (int));  	output_matrix = ALLOC_MATRIX (priv->child_count, int); +        xdata = dict_new (); +        if (!xdata) +                return -1; +  	afr_selfheal_extract_xattr (this, replies, type, input_dirty,  				    input_matrix); +        if (local->need_full_crawl) +                afr_selfheal_extract_xattr (this, replies, AFR_DATA_TRANSACTION, +                                            NULL, full_heal_mtx_in); +  	for (i = 0; i < priv->child_count; i++)  		if (sinks[i] && !healed_sinks[i])  			pending[i] = 1;  	for (i = 0; i < priv->child_count; i++) {  		for (j = 0; j < priv->child_count; j++) { -			if (pending[j]) +			if (pending[j]) {  				output_matrix[i][j] = 1; -			else +                                if (type == AFR_ENTRY_TRANSACTION) +                                        full_heal_mtx_out[i][j] = 1; +			} else {  				output_matrix[i][j] = -input_matrix[i][j]; +                                if (type == AFR_ENTRY_TRANSACTION) +                                        full_heal_mtx_out[i][j] = -full_heal_mtx_in[i][j]; +                        }  		}  	} @@ -188,17 +214,30 @@ afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode,  			*/  			continue; -		xattr = afr_selfheal_output_xattr (this, type, output_dirty, -						   output_matrix, i); +		xattr = afr_selfheal_output_xattr (this, local->need_full_crawl, +                                                   type, output_dirty, +                                                   output_matrix, i, +                                                   full_heal_mtx_out);  		if (!xattr) {  			continue;  		} -		afr_selfheal_post_op (frame, this, inode, i, xattr); +                if ((type == AFR_ENTRY_TRANSACTION) && (priv->esh_granular)) { +                        if (xdata && +                            dict_set_int8 (xdata, GF_XATTROP_PURGE_INDEX, 1)) +                                gf_msg (this->name, GF_LOG_WARNING, 0, +                                        AFR_MSG_DICT_SET_FAILED, "Failed to set" +                                        " dict value for %s", +                                        GF_XATTROP_PURGE_INDEX); +                } +		afr_selfheal_post_op (frame, this, inode, i, xattr, xdata);  		dict_unref (xattr);  	} +        if (xdata) +                dict_unref (xdata); +  	return 0;  } @@ -242,6 +281,9 @@ afr_selfheal_fill_dirty (xlator_t *this, int *dirty, int subvol,  	void *pending_raw = NULL;  	int pending[3] = {0, }; +        if (!dirty) +                return 0; +  	if (dict_get_ptr (xdata, AFR_DIRTY, &pending_raw))  		return -1; @@ -267,6 +309,9 @@ afr_selfheal_fill_matrix (xlator_t *this, int **matrix, int subvol,  	priv = this->private; +        if (!matrix) +                return 0; +  	for (i = 0; i < priv->child_count; i++) {  		if (dict_get_ptr (xdata, priv->pending_key[i], &pending_raw))  			continue; @@ -1150,7 +1195,7 @@ afr_selfheal_entrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,  		    local->replies[i].op_errno == EAGAIN) {  			afr_locked_fill (frame, this, locked_on);  			afr_selfheal_unentrylk (frame, this, inode, dom, name, -						locked_on); +						locked_on, NULL);  			AFR_SEQ (frame, afr_selfheal_lock_cbk, entrylk, dom,  				 &loc, name, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); @@ -1189,7 +1234,7 @@ afr_selfheal_tie_breaker_entrylk (call_frame_t *frame, xlator_t *this,  	if (lock_count > priv->child_count/2 && eagain_count) {                  afr_locked_fill (frame, this, locked_on);                  afr_selfheal_unentrylk (frame, this, inode, dom, name, -                                        locked_on); +                                        locked_on, NULL);                  AFR_SEQ (frame, afr_selfheal_lock_cbk, entrylk, dom,                           &loc, name, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); @@ -1203,7 +1248,8 @@ afr_selfheal_tie_breaker_entrylk (call_frame_t *frame, xlator_t *this,  int  afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, -			char *dom, const char *name, unsigned char *locked_on) +			char *dom, const char *name, unsigned char *locked_on, +                        dict_t *xdata)  {  	loc_t loc = {0,}; @@ -1211,7 +1257,7 @@ afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,  	gf_uuid_copy (loc.gfid, inode->gfid);  	AFR_ONLIST (locked_on, frame, afr_selfheal_lock_cbk, entrylk, -		    dom, &loc, name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL); +		    dom, &loc, name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, xdata);  	loc_wipe (&loc); @@ -1316,7 +1362,12 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this,  		if (replies[i].op_ret == -1)  			continue; -		if (data_selfheal && afr_is_data_set (this, replies[i].xdata)) +                /* The data segment of the changelog can be non-zero to indicate +                 * the directory needs a full heal. So the check below ensures +                 * it's not a directory before setting the data_selfheal boolean. +                 */ +		if (data_selfheal && !IA_ISDIR (replies[i].poststat.ia_type) && +                    afr_is_data_set (this, replies[i].xdata))  			*data_selfheal = _gf_true;  		if (metadata_selfheal && @@ -1326,7 +1377,7 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this,  		if (entry_selfheal && afr_is_entry_set (this, replies[i].xdata))  			*entry_selfheal = _gf_true; -		valid_cnt ++; +		valid_cnt++;  		if (valid_cnt == 1) {  			first = replies[i].poststat;  			continue; @@ -1500,7 +1551,7 @@ afr_selfheal_newentry_mark (call_frame_t *frame, xlator_t *this, inode_t *inode,  	for (i = 0; i < priv->child_count; i++) {  		if (!sources[i])  			continue; -		afr_selfheal_post_op (frame, this, inode, i, xattr); +		afr_selfheal_post_op (frame, this, inode, i, xattr, NULL);  	}  out:          if (changelog) diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c index fccffa7dbac..00af8e9f2e6 100644 --- a/xlators/cluster/afr/src/afr-self-heal-entry.c +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -14,6 +14,7 @@  #include "byte-order.h"  #include "afr-transaction.h"  #include "afr-messages.h" +#include "syncop-utils.h"  /* Max file name length is 255 this filename is of length 256. No file with   * this name can ever come, entry-lock with this name is going to prevent @@ -349,6 +350,82 @@ __afr_selfheal_entry_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd,  	return ret;  } +static gf_boolean_t +is_full_heal_marker_present (xlator_t *this, dict_t *xdata, int idx) +{ +        int                  i           = 0; +        int                  pending[3]  = {0,}; +        void                *pending_raw = NULL; +        afr_private_t       *priv        = NULL; + +        priv = this->private; + +        if (!xdata) +                return _gf_false; + +        /* Iterate over each of the priv->pending_keys[] elements and then +         * see if any of them have data segment non-zero. If they do, return +         * true. Else return false. +         */ +        for (i = 0; i < priv->child_count; i++) { +                if (dict_get_ptr (xdata, priv->pending_key[i], &pending_raw)) +                        continue; + +                if (!pending_raw) +                        continue; + +                memcpy (pending, pending_raw, sizeof (pending)); +                if (ntoh32 (pending[idx])) +                        return _gf_true; +        } + +        return _gf_false; +} + +static gf_boolean_t +afr_need_full_heal (xlator_t *this, struct afr_reply *replies, int source, +                    unsigned char *healed_sinks, afr_transaction_type type) +{ +        int                i     = 0; +        int                idx   = 0; +        afr_private_t     *priv  = NULL; + +        priv = this->private; + +        if (!priv->esh_granular) +                return _gf_true; + +        if (type != AFR_ENTRY_TRANSACTION) +                return _gf_true; + +        priv = this->private; +        idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); + +        /* If there is a clear source, check whether the full-heal-indicator +         * is present in its xdata. Otherwise, we need to examine all the +         * participating bricks and then figure if *even* one of them has a +         * full-heal-indicator. +         */ + +        if (source != -1) { +                if (is_full_heal_marker_present (this, replies[source].xdata, +                                                 idx)) +                        return _gf_true; +        } + +        /* else ..*/ + +        for (i = 0; i < priv->child_count; i++) { +                if (!healed_sinks[i]) +                        continue; + +                if (is_full_heal_marker_present (this, replies[i].xdata, idx)) +                        return _gf_true; +        } + +        return _gf_false; +} +  static int  __afr_selfheal_entry_finalize_source (xlator_t *this, unsigned char *sources,  				      unsigned char *healed_sinks, @@ -431,7 +508,8 @@ __afr_selfheal_entry_prepare (call_frame_t *frame, xlator_t *this,  static int  afr_selfheal_entry_dirent (call_frame_t *frame, xlator_t *this, -                           fd_t *fd, char *name) +                           fd_t *fd, char *name, inode_t *parent_idx_inode, +                           xlator_t *subvol, gf_boolean_t full_crawl)  {          int                ret          = 0;          int                source       = -1; @@ -486,10 +564,15 @@ afr_selfheal_entry_dirent (call_frame_t *frame, xlator_t *this,  		ret = __afr_selfheal_entry_dirent (frame, this, fd, name, inode,  						   source, sources, healed_sinks,  						   locked_on, replies); + +                if ((ret == 0) && (priv->esh_granular) && (!full_crawl)) +                        ret = afr_shd_index_purge (subvol, parent_idx_inode, +                                                   name);  	} +  unlock:          afr_selfheal_unentrylk (frame, this, fd->inode, this->name, NULL, -                                locked_on); +                                locked_on, NULL);  	if (inode)  		inode_unref (inode);          if (replies) @@ -513,12 +596,16 @@ afr_selfheal_entry_do_subvol (call_frame_t *frame, xlator_t *this,  	xlator_t *subvol = NULL;  	afr_private_t *priv = NULL;          gf_boolean_t mismatch = _gf_false; +        afr_local_t *iter_local = NULL; +        afr_local_t *local = NULL;  	priv = this->private;  	subvol = priv->children[child];  	INIT_LIST_HEAD (&entries.list); +        local = frame->local; +  	iter_frame = afr_copy_frame (frame);  	if (!iter_frame)  		return -ENOMEM; @@ -539,7 +626,9 @@ afr_selfheal_entry_do_subvol (call_frame_t *frame, xlator_t *this,  				continue;  			ret = afr_selfheal_entry_dirent (iter_frame, this, fd, -                                                         entry->d_name); +                                                         entry->d_name, NULL, +                                                         NULL, +                                                        local->need_full_crawl);  			AFR_STACK_RESET (iter_frame);  			if (iter_frame->local == NULL) {                                  ret = -ENOTCONN; @@ -567,36 +656,210 @@ afr_selfheal_entry_do_subvol (call_frame_t *frame, xlator_t *this,  	return ret;  } +static inode_t * +afr_shd_entry_changes_index_inode (xlator_t *this, xlator_t *subvol, +                                   uuid_t pargfid) +{ +        int             ret         = -1; +        void           *index_gfid  = NULL; +        loc_t           rootloc     = {0,}; +        loc_t           loc         = {0,}; +        dict_t         *xattr       = NULL; +        inode_t        *inode       = NULL; +        struct iatt     iatt        = {0,}; + +        rootloc.inode = inode_ref (this->itable->root); +        gf_uuid_copy (rootloc.gfid, rootloc.inode->gfid); + +        ret = syncop_getxattr (subvol, &rootloc, &xattr, +                               GF_XATTROP_ENTRY_CHANGES_GFID, NULL, NULL); +        if (ret || !xattr) { +                errno = -ret; +                goto out; +        } + +        ret = dict_get_ptr (xattr, GF_XATTROP_ENTRY_CHANGES_GFID, &index_gfid); +        if (ret) { +                errno = EINVAL; +                goto out; +        } + +        loc.inode = inode_new (this->itable); +        if (!loc.inode) { +                errno = ENOMEM; +                goto out; +        } + +        gf_uuid_copy (loc.pargfid, index_gfid); +        loc.name = gf_strdup (uuid_utoa (pargfid)); + +        ret = syncop_lookup (subvol, &loc, &iatt, NULL, NULL, NULL); +        if (ret < 0) { +                errno = -ret; +                goto out; +        } + +        inode = inode_link (loc.inode, NULL, NULL, &iatt); + +out: +        if (xattr) +                dict_unref (xattr); +        loc_wipe (&rootloc); +        GF_FREE ((char *)loc.name); +        loc_wipe (&loc); + +        return inode; +} + +static int +afr_selfheal_entry_granular_dirent (xlator_t *subvol, gf_dirent_t *entry, +                                    loc_t *parent, void *data) +{ +        int                      ret  = 0; +        loc_t                    loc  = {0,}; +        struct iatt              iatt = {0,}; +        afr_granular_esh_args_t *args = data; + +        /* Look up the actual inode associated with entry. If the lookup returns +         * ESTALE or ENOENT, then it means we have a stale index. Remove it. +         * This is analogous to the check in afr_shd_index_heal() except that +         * here it is achieved through LOOKUP and in afr_shd_index_heal() through +         * a GETXATTR. +         */ + +        loc.inode = inode_new (args->xl->itable); +        loc.parent = inode_ref (args->heal_fd->inode); +        gf_uuid_copy (loc.pargfid, loc.parent->gfid); +        loc.name = entry->d_name; + +        ret = syncop_lookup (args->xl, &loc, &iatt, NULL, NULL, NULL); +        if ((ret == -ENOENT) || (ret == -ESTALE)) { +                afr_shd_index_purge (subvol, parent->inode, entry->d_name); +                ret = 0; +                goto out; +        } +        /* TBD: afr_shd_zero_xattrop? */ + +        ret = afr_selfheal_entry_dirent (args->frame, args->xl, args->heal_fd, +                                         entry->d_name, parent->inode, subvol, +                                         _gf_false); +        AFR_STACK_RESET (args->frame); +        if (args->frame->local == NULL) +                ret = -ENOTCONN; + +        if (ret == -1) +                args->mismatch = _gf_true; + +out: +        loc_wipe (&loc); +        return 0; +} + +static int +afr_selfheal_entry_granular (call_frame_t *frame, xlator_t *this, fd_t *fd, +                             int subvol_idx, gf_boolean_t is_src) +{ +        int                         ret    = 0; +        loc_t                       loc    = {0,}; +        xlator_t                   *subvol = NULL; +        afr_private_t              *priv   = NULL; +        afr_granular_esh_args_t     args   = {0,}; + +        priv = this->private; +        subvol = priv->children[subvol_idx]; + +        args.frame = afr_copy_frame (frame); +        args.xl = this; +        /* args.heal_fd represents the fd associated with the original directory +         * on which entry heal is being attempted. +         */ +        args.heal_fd = fd; + +        /* @subvol here represents the subvolume of AFR where +         * indices/entry-changes/<pargfid> will be processed +         */ +        loc.inode = afr_shd_entry_changes_index_inode (this, subvol, +                                                       fd->inode->gfid); +        if (!loc.inode) { +                /* If granular heal failed on the sink (as it might sometimes +                 * because it is the src that would mostly contain the granular +                 * changelogs and the sink's entry-changes would be empty), +                 * do not treat heal as failure. +                 */ +                if (is_src) +                        return -errno; +                else +                        return 0; +        } + +        ret = syncop_dir_scan (subvol, &loc, GF_CLIENT_PID_SELF_HEALD, +                               &args, afr_selfheal_entry_granular_dirent); + +        loc_wipe (&loc); + +        if (args.mismatch == _gf_true) +                ret = -1; + +        return ret; +} +  static int  afr_selfheal_entry_do (call_frame_t *frame, xlator_t *this, fd_t *fd,  		       int source, unsigned char *sources,  		       unsigned char *healed_sinks)  { -	int i = 0; -	afr_private_t *priv = NULL; -        gf_boolean_t mismatch = _gf_false; -	int ret = 0; +	int i                   = 0; +	int ret                 = 0; +        gf_boolean_t   mismatch = _gf_false; +        afr_local_t   *local    = NULL; +	afr_private_t *priv     = NULL;  	priv = this->private; +        local = frame->local;          gf_msg (this->name, GF_LOG_INFO, 0,                  AFR_MSG_SELF_HEAL_INFO, "performing entry selfheal on %s",  		uuid_utoa (fd->inode->gfid));  	for (i = 0; i < priv->child_count; i++) { +                /* Expunge */  		if (!healed_sinks[i])  			continue; -		ret = afr_selfheal_entry_do_subvol (frame, this, fd, i); + +                if (!local->need_full_crawl) +                /* Why call afr_selfheal_entry_granular() on a "healed sink", +                 * given that it is the source that contains the granular +                 * indices? +                 * If the index for this directory is non-existent or empty on +                 * this subvol (=> clear sink), then it will return early +                 * without failure status. +                 * If the index is non-empty and it is yet a 'healed sink', then +                 * it is due to a split-brain in which case we anyway need to +                 * crawl the indices/entry-changes/pargfid directory. +                 */ +                        ret = afr_selfheal_entry_granular (frame, this, fd, i, +                                                           _gf_false); +                else +                        ret = afr_selfheal_entry_do_subvol (frame, this, fd, i); +                  if (ret == -1) {                          /* gfid or type mismatch. */                          mismatch = _gf_true;                          ret = 0;                  } -		if (ret) -			break; +                if (ret) +                        break;  	} -        if (!ret && source != -1) -		ret = afr_selfheal_entry_do_subvol (frame, this, fd, source); + +        if (!ret && source != -1) { +                /* Impunge */ +                if (local->need_full_crawl) +                        ret = afr_selfheal_entry_do_subvol (frame, this, fd, +                                                            source); +                else +                        ret = afr_selfheal_entry_granular (frame, this, fd, +                                                           source, _gf_true); +        }          if (mismatch == _gf_true)                  /* undo pending will be skipped */ @@ -616,10 +879,12 @@ __afr_selfheal_entry (call_frame_t *frame, xlator_t *this, fd_t *fd,          unsigned char          *postop_lock           = NULL;  	unsigned char          *healed_sinks          = NULL;  	struct afr_reply       *locked_replies        = NULL; +        afr_local_t            *local                 = NULL;  	afr_private_t          *priv                  = NULL;          gf_boolean_t            did_sh                = _gf_true;  	priv = this->private; +        local = frame->local;  	sources = alloca0 (priv->child_count);  	sinks = alloca0 (priv->child_count); @@ -651,10 +916,16 @@ __afr_selfheal_entry (call_frame_t *frame, xlator_t *this, fd_t *fd,                          did_sh = _gf_false;                          goto unlock;                  } + +                local->need_full_crawl = afr_need_full_heal (this, +                                                             locked_replies, +                                                             source, +                                                             healed_sinks, +                                                         AFR_ENTRY_TRANSACTION);  	}  unlock:  	afr_selfheal_unentrylk (frame, this, fd->inode, this->name, NULL, -				data_lock); +				data_lock, NULL);  	if (ret < 0)  		goto out; @@ -695,7 +966,7 @@ unlock:          }  postop_unlock:          afr_selfheal_unentrylk (frame, this, fd->inode, this->name, NULL, -                                postop_lock); +                                postop_lock, NULL);  out:          if (did_sh)                  afr_log_selfheal (fd->inode->gfid, this, ret, "entry", source, @@ -796,10 +1067,12 @@ afr_selfheal_entry (call_frame_t *frame, xlator_t *this, inode_t *inode)                  }                  if (!granular_locks)                          afr_selfheal_unentrylk (frame, this, inode, this->name, -                                               LONG_FILENAME, long_name_locked); +                                               LONG_FILENAME, long_name_locked, +                                               NULL);  	}  unlock: -	afr_selfheal_unentrylk (frame, this, inode, priv->sh_domain, NULL, locked_on); +	afr_selfheal_unentrylk (frame, this, inode, priv->sh_domain, NULL, +                                locked_on, NULL);  	if (fd)  		fd_unref (fd); diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c index 9f7a5b1ff0f..3445ecccf9c 100644 --- a/xlators/cluster/afr/src/afr-self-heal-name.c +++ b/xlators/cluster/afr/src/afr-self-heal-name.c @@ -618,7 +618,7 @@ afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent,  	}  unlock:  	afr_selfheal_unentrylk (frame, this, parent, this->name, bname, -				locked_on); +				locked_on, NULL);  	if (inode)  		inode_unref (inode); diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h index afc086c0560..becbe67e084 100644 --- a/xlators/cluster/afr/src/afr-self-heal.h +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -137,7 +137,8 @@ afr_selfheal_tie_breaker_entrylk (call_frame_t *frame, xlator_t *this,  int  afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, -			char *dom, const char *name, unsigned char *locked_on); +			char *dom, const char *name, unsigned char *locked_on, +                        dict_t *xdata);  int  afr_selfheal_unlocked_discover (call_frame_t *frame, inode_t *inode, @@ -177,7 +178,7 @@ afr_selfheal_recreate_entry (xlator_t *this, int dst, int source, inode_t *dir,  int  afr_selfheal_post_op (call_frame_t *frame, xlator_t *this, inode_t *inode, -		      int subvol, dict_t *xattr); +		      int subvol, dict_t *xattr, dict_t *xdata);  call_frame_t *  afr_frame_create (xlator_t *this); diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c index 1ae4f18e764..2ec9d9ce686 100644 --- a/xlators/cluster/afr/src/afr-self-heald.c +++ b/xlators/cluster/afr/src/afr-self-heald.c @@ -265,7 +265,7 @@ afr_shd_zero_xattrop (xlator_t *this, uuid_t gfid)          /*Send xattrop to all bricks. Doing a lookup to see if bricks are up or          * has valid repies for this gfid seems a bit of an overkill.*/          for (i = 0; i < priv->child_count; i++) -                afr_selfheal_post_op (frame, this, inode, i, xattr); +                afr_selfheal_post_op (frame, this, inode, i, xattr, NULL);  out:          if (frame) diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 0930a081965..d01a806fe86 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -186,6 +186,8 @@ reconfigure (xlator_t *this, dict_t *options)                            out);          GF_OPTION_RECONF ("locking-scheme", priv->locking_scheme, options, str,                            out); +        GF_OPTION_RECONF ("granular-entry-heal", priv->esh_granular, options, +                          bool, out);          GF_OPTION_RECONF ("eager-lock", priv->eager_lock, options, bool, out);          GF_OPTION_RECONF ("quorum-type", qtype, options, str, out); @@ -379,6 +381,7 @@ init (xlator_t *this)          GF_OPTION_INIT ("pre-op-compat", priv->pre_op_compat, bool, out);          GF_OPTION_INIT ("locking-scheme", priv->locking_scheme, str, out); +        GF_OPTION_INIT ("granular-entry-heal", priv->esh_granular, bool, out);          GF_OPTION_INIT ("eager-lock", priv->eager_lock, bool, out);          GF_OPTION_INIT ("quorum-type", qtype, str, out); @@ -897,5 +900,12 @@ struct volume_options options[] = {                           "stop being compatible with afr-v1, which helps afr "                           "be more granular while self-healing",          }, +        { .key = {"granular-entry-heal"}, +          .type = GF_OPTION_TYPE_BOOL, +          .default_value = "no", +          .description = "If this option is enabled, self-heal will resort to " +                         "granular way of recording changelogs and doing entry " +                         "self-heal.", +        },          { .key  = {NULL} },  }; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 0a872a98284..f16f9b4b4ac 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -139,6 +139,7 @@ typedef struct _afr_private {  	void                   *pump_private;  	gf_boolean_t           use_afr_in_pump;  	char                   *locking_scheme; +        gf_boolean_t            esh_granular;  } afr_private_t; @@ -755,6 +756,8 @@ typedef struct _afr_local {          /* For  client side background heals. */          struct list_head healer;          call_frame_t *heal_frame; + +        gf_boolean_t need_full_crawl;  } afr_local_t; @@ -789,6 +792,14 @@ typedef struct afr_read_subvol_args {          uuid_t gfid;  } afr_read_subvol_args_t; +typedef struct afr_granular_esh_args { +        fd_t *heal_fd; +        xlator_t *xl; +        call_frame_t *frame; +        gf_boolean_t mismatch; /* flag to represent occurrence of type/gfid +                                  mismatch */ +} afr_granular_esh_args_t; +  /* did a call fail due to a child failing? */  #define child_went_down(op_ret, op_errno) (((op_ret) < 0) &&            \                                             ((op_errno == ENOTCONN) ||   \ diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index d07a5f07cb4..c9c3047b2de 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -2774,6 +2774,12 @@ struct volopt_map_entry glusterd_volopt_map[] = {            .op_version = GD_OP_VERSION_3_7_12,            .flags      = OPT_FLAG_CLIENT_OPT          }, +        { .key        = "cluster.granular-entry-heal", +          .voltype    = "cluster/replicate", +          .type       = DOC, +          .op_version = GD_OP_VERSION_3_8_0, +          .flags      = OPT_FLAG_CLIENT_OPT +        },          { .key         = NULL          }  };  | 
