diff options
| -rw-r--r-- | xlators/cluster/afr/src/afr-dir-read.c | 302 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-dir-read.h | 3 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.c | 33 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.h | 7 | 
4 files changed, 299 insertions, 46 deletions
diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c index b48488526..98cda1e80 100644 --- a/xlators/cluster/afr/src/afr-dir-read.c +++ b/xlators/cluster/afr/src/afr-dir-read.c @@ -24,6 +24,7 @@  #include <sys/time.h>  #include <stdlib.h>  #include <signal.h> +#include <string.h>  #ifndef _CONFIG_H  #define _CONFIG_H @@ -219,6 +220,7 @@ afr_opendir_cbk (call_frame_t *frame, void *cookie,  	afr_local_t * local  = NULL;  	int call_count = -1; +        int ret        = 0;  	LOCK (&frame->lock);  	{ @@ -234,30 +236,33 @@ afr_opendir_cbk (call_frame_t *frame, void *cookie,  	call_count = afr_frame_return (frame);  	if (call_count == 0) { -                if ((local->op_ret == 0) && -                    !afr_is_opendir_done (this, fd->inode)) { - -                        /* -                         * This is the first opendir on this inode. We need -                         * to check if the directory's entries are the same -                         * on all subvolumes. This is needed in addition -                         * to regular entry self-heal because the readdir -                         * call is sent only to the first subvolume, and -                         * thus files that exist only there will never be healed -                         * otherwise (assuming changelog shows no anamolies). -                         */ - -                        gf_log (this->name, GF_LOG_TRACE, -                                "reading contents of directory %s looking for mismatch", -                                local->loc.path); - -                        afr_examine_dir (frame, this); - -                } else { -                        AFR_STACK_UNWIND (opendir, frame, local->op_ret, -                                          local->op_errno, local->fd); +                if (local->op_ret == 0) { +                        ret = afr_fd_ctx_set (this, local->fd); + +                        if (!afr_is_opendir_done (this, fd->inode)) { + +                                /* +                                 * This is the first opendir on this inode. We need +                                 * to check if the directory's entries are the same +                                 * on all subvolumes. This is needed in addition +                                 * to regular entry self-heal because the readdir +                                 * call is sent only to the first subvolume, and +                                 * thus files that exist only there will never be healed +                                 * otherwise (assuming changelog shows no anamolies). +                                 */ + +                                gf_log (this->name, GF_LOG_TRACE, +                                        "reading contents of directory %s looking for mismatch", +                                        local->loc.path); + +                                afr_examine_dir (frame, this); + +                        } else { +                                AFR_STACK_UNWIND (opendir, frame, local->op_ret, +                                                  local->op_errno, local->fd); +                        }                  } -	} +        }  	return 0;  } @@ -333,6 +338,120 @@ out:   * Applicable to: readdir   */ + +struct entry_name { +        char *name; +        struct list_head list; +}; + + +static gf_boolean_t +remembered_name (const char *name, struct list_head *entries) +{ +        struct entry_name *e; +        gf_boolean_t ret = _gf_false; + +        list_for_each_entry (e, entries, list) { +                if (!strcmp (name, e->name)) { +                        ret = _gf_true; +                        goto out; +                } +        } + +out: +        return ret; +} + + +static void +afr_remember_entries (gf_dirent_t *entries, fd_t *fd) +{ +	struct entry_name *n       = NULL; +	gf_dirent_t *      entry   = NULL; + +	int ret = 0; + +	uint64_t      ctx; +	afr_fd_ctx_t *fd_ctx; + +	ret = fd_ctx_get (fd, THIS, &ctx); +	if (ret < 0) { +		gf_log (THIS->name, GF_LOG_DEBUG, +			"could not get fd ctx for fd=%p", fd); +		return; +	} + +        fd_ctx = (afr_fd_ctx_t *)(long) ctx; + +	list_for_each_entry (entry, &entries->list, list) { +		n = CALLOC (1, sizeof (*n)); +		n->name = strdup (entry->d_name); +		INIT_LIST_HEAD (&n->list); + +		list_add (&n->list, &fd_ctx->entries); +	} +} + + +static off_t +afr_filter_entries (gf_dirent_t *entries, fd_t *fd) +{ +	gf_dirent_t *entry, *tmp; +	int ret = 0; + +	uint64_t      ctx; +	afr_fd_ctx_t *fd_ctx; + +	off_t offset; + +	ret = fd_ctx_get (fd, THIS, &ctx); +	if (ret < 0) { +		gf_log (THIS->name, GF_LOG_DEBUG, +			"could not get fd ctx for fd=%p", fd); +		return -1; +	} + +        fd_ctx = (afr_fd_ctx_t *)(long) ctx; + +	list_for_each_entry_safe (entry, tmp, &entries->list, list) { +		offset = entry->d_off; + +		if (remembered_name (entry->d_name, &fd_ctx->entries)) { +			list_del (&entry->list); +			FREE (entry); +		} +	} + +	return offset; +} + + +static void +afr_forget_entries (fd_t *fd) +{ +	struct entry_name *entry, *tmp; +	int ret = 0; + +	uint64_t      ctx; +	afr_fd_ctx_t *fd_ctx; + +	ret = fd_ctx_get (fd, THIS, &ctx); +	if (ret < 0) { +		gf_log (THIS->name, GF_LOG_DEBUG, +			"could not get fd ctx for fd=%p", fd); +		return; +	} + +        fd_ctx = (afr_fd_ctx_t *)(long) ctx; + +	list_for_each_entry_safe (entry, tmp, &fd_ctx->entries, list) { +		FREE (entry->name); +		list_del (&entry->list); +		FREE (entry); +	} +} + +  int32_t  afr_readdir_cbk (call_frame_t *frame, void *cookie,  		 xlator_t *this, int32_t op_ret, int32_t op_errno, @@ -383,11 +502,19 @@ afr_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          xlator_t **     children = NULL;          ino_t           inum = 0; +        int call_child = 0; +        int ret        = 0; +          gf_dirent_t * entry = NULL;          gf_dirent_t * tmp   = NULL;          int child_index = -1; +        uint64_t      ctx; +        afr_fd_ctx_t *fd_ctx; + +	off_t offset = 0; +          priv     = this->private;          children = priv->children; @@ -395,23 +522,88 @@ afr_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          child_index = (long) cookie; -        if (op_ret != -1) { -                list_for_each_entry_safe (entry, tmp, &entries->list, list) { -                        inum = afr_itransform (entry->d_ino, priv->child_count, -                                               child_index); -                        entry->d_ino = inum; -                        inum  = afr_itransform (entry->d_stat.st_ino, -                                                priv->child_count, child_index); -                        entry->d_stat.st_ino = inum; +	if (priv->strict_readdir) { +		ret = fd_ctx_get (local->fd, this, &ctx); +		if (ret < 0) { +			gf_log (this->name, GF_LOG_DEBUG, +				"could not get fd ctx for fd=%p", local->fd); +			op_ret   = -1; +			op_errno = -ret; +			goto out; +		} -                        if ((local->fd->inode == local->fd->inode->table->root) -                            && !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) { -                                list_del_init (&entry->list); -                                FREE (entry); -                        } +		fd_ctx = (afr_fd_ctx_t *)(long) ctx; + +		if (child_went_down (op_ret, op_errno)) { +			if (all_tried (child_index, priv->child_count)) { +				goto out; +			} + +			call_child = ++child_index; + +			gf_log (this->name, GF_LOG_TRACE, +				"starting readdir afresh on child %d, offset %"PRId64, +				call_child, (uint64_t) 0); + +			fd_ctx->failed_over = _gf_true; + +			STACK_WIND_COOKIE (frame, afr_readdirp_cbk, +					   (void *) (long) call_child, +					   children[call_child], +					   children[call_child]->fops->readdirp, local->fd, +					   local->cont.readdir.size, 0); +			return 0; +		} +	} + +        list_for_each_entry_safe (entry, tmp, &entries->list, list) { +                inum = afr_itransform (entry->d_ino, priv->child_count, +                                       child_index); +                entry->d_ino = inum; +                inum  = afr_itransform (entry->d_stat.st_ino, +                                        priv->child_count, child_index); +                entry->d_stat.st_ino = inum; + +                if ((local->fd->inode == local->fd->inode->table->root) +                    && !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) { +                        list_del_init (&entry->list); +                        FREE (entry);                  }          } +	if (priv->strict_readdir) { +		if (fd_ctx->failed_over) { +			if (list_empty (&entries->list)) { +				goto out; +			} + +			offset = afr_filter_entries (entries, local->fd); + +			afr_remember_entries (entries, local->fd); + +			if (list_empty (&entries->list)) { +				/* All the entries we got were duplicate. We +				   shouldn't send an empty list now, because +				   that'll make the application stop reading. So +				   try to get more entries */ + +				gf_log (this->name, GF_LOG_TRACE, +					"trying to fetch non-duplicate entries from offset %"PRId64", child %s", +					offset, children[child_index]->name); + +				STACK_WIND_COOKIE (frame, afr_readdirp_cbk, +						   (void *) (long) child_index, +						   children[child_index], +						   children[child_index]->fops->readdirp, +						   local->fd, local->cont.readdir.size, offset); +				return 0; +			} +		} else { +			afr_remember_entries (entries, local->fd); +		} +	} + +out:          AFR_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries);          return 0; @@ -427,6 +619,9 @@ afr_do_readdir (call_frame_t *frame, xlator_t *this,  	int             call_child = 0;  	afr_local_t     *local     = NULL; +        uint64_t      ctx; +        afr_fd_ctx_t *fd_ctx; +  	int ret = -1;  	int32_t op_ret   = -1; @@ -445,7 +640,7 @@ afr_do_readdir (call_frame_t *frame, xlator_t *this,  		op_errno = -ret;  		goto out;  	} -						 +  	frame->local = local;  	call_child = afr_first_up_child (priv); @@ -458,7 +653,29 @@ afr_do_readdir (call_frame_t *frame, xlator_t *this,          local->fd                  = fd_ref (fd);          local->cont.readdir.size   = size; -        local->cont.readdir.offset = offset; + +	if (priv->strict_readdir) { +		ret = fd_ctx_get (fd, this, &ctx); +		if (ret < 0) { +			gf_log (this->name, GF_LOG_DEBUG, +				"could not get fd ctx for fd=%p", fd); +			op_errno = -ret; +			goto out; +		} + +		fd_ctx = (afr_fd_ctx_t *)(long) ctx; + +		if (fd_ctx->last_tried != call_child) { +			gf_log (this->name, GF_LOG_TRACE, +				"first up child has changed from %d to %d, restarting readdir from offset 0", +				fd_ctx->last_tried, call_child); + +			fd_ctx->failed_over = _gf_true; +			offset = 0; +		} + +		fd_ctx->last_tried = call_child; +	}          if (whichop == GF_FOP_READDIR)                  STACK_WIND_COOKIE (frame, afr_readdir_cbk, @@ -545,6 +762,15 @@ out:  int32_t +afr_releasedir (xlator_t *this, fd_t *fd) +{ +	afr_forget_entries (fd); + +	return 0; +} + + +int32_t  afr_getdents (call_frame_t *frame, xlator_t *this,  	      fd_t *fd, size_t size, off_t offset, int32_t flag)  { diff --git a/xlators/cluster/afr/src/afr-dir-read.h b/xlators/cluster/afr/src/afr-dir-read.h index 98ce1ca16..abde2534d 100644 --- a/xlators/cluster/afr/src/afr-dir-read.h +++ b/xlators/cluster/afr/src/afr-dir-read.h @@ -26,8 +26,7 @@ afr_opendir (call_frame_t *frame, xlator_t *this,  	     loc_t *loc, fd_t *fd);  int32_t -afr_closedir (call_frame_t *frame, xlator_t *this, -	      fd_t *fd); +afr_releasedir (xlator_t *this, fd_t *fd);  int32_t  afr_readdir (call_frame_t *frame, xlator_t *this, diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 91189267f..c041adc99 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -1128,6 +1128,8 @@ afr_fd_ctx_set (xlator_t *this, fd_t *fd)                  if (ret < 0) {                          op_ret = ret;                  } + +                INIT_LIST_HEAD (&fd_ctx->entries);          }  unlock:          UNLOCK (&fd->lock); @@ -2693,11 +2695,12 @@ init (xlator_t *this)  	int             ret         = -1;  	int             op_errno    = 0; -	char * read_subvol = NULL; -	char * fav_child   = NULL; -	char * self_heal   = NULL; -        char * algo        = NULL; -	char * change_log  = NULL; +	char * read_subvol     = NULL; +	char * fav_child       = NULL; +	char * self_heal       = NULL; +        char * algo            = NULL; +	char * change_log      = NULL; +	char * strict_readdir  = NULL;          int32_t background_count  = 0;  	int32_t lock_server_count = 1; @@ -2893,6 +2896,20 @@ init (xlator_t *this)  		priv->entry_lock_server_count = lock_server_count;  	} +	priv->strict_readdir = _gf_false; + +	dict_ret = dict_get_str (this->options, "strict-readdir", +				 &strict_readdir); +	if (dict_ret == 0) { +		ret = gf_string2boolean (strict_readdir, &priv->strict_readdir); +		if (ret < 0) { +			gf_log (this->name, GF_LOG_WARNING, +				"Invalid 'option strict-readdir %s'. " +				"Defaulting to strict-readdir as 'off'.", +				strict_readdir); +		} +	} +  	trav = this->children;  	while (trav) {  		if (!read_ret && !strcmp (read_subvol, trav->xlator->name)) { @@ -3037,7 +3054,8 @@ struct xlator_dumpops dumpops = {  struct xlator_cbks cbks = { -        .release     = afr_release, +	.release     = afr_release, +	.releasedir  = afr_releasedir,  }; @@ -3090,5 +3108,8 @@ struct volume_options options[] = {  	  .type = GF_OPTION_TYPE_INT,  	  .min  = 0  	}, +	{ .key  = {"strict-readdir"}, +	  .type = GF_OPTION_TYPE_BOOL, +	},  	{ .key  = {NULL} },  }; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 71f4b7e56..a5c75add7 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -67,6 +67,8 @@ typedef struct _afr_private {  	unsigned int metadata_lock_server_count;  	unsigned int entry_lock_server_count; +	gf_boolean_t strict_readdir; +  	unsigned int wait_count;      /* # of servers to wait for success */          uint64_t up_count;      /* number of CHILD_UPs we have seen */ @@ -327,6 +329,7 @@ typedef struct _afr_local {  			size_t size;  			off_t offset; +                        gf_boolean_t failed;  			int last_tried;  		} readdir; @@ -547,6 +550,10 @@ typedef struct {          int32_t wbflags;          uint64_t up_count;   /* number of CHILD_UPs this fd has seen */          uint64_t down_count; /* number of CHILD_DOWNs this fd has seen */ + +        int32_t last_tried; +        gf_boolean_t failed_over; +        struct list_head entries; /* needed for readdir failover */  } afr_fd_ctx_t;  | 
