diff options
| author | Anand Avati <avati@redhat.com> | 2013-07-19 08:31:41 -0700 | 
|---|---|---|
| committer | Vijay Bellur <vbellur@redhat.com> | 2013-07-23 06:11:12 -0700 | 
| commit | 37ac6bdca826046cbcb0d50727af29baf9407950 (patch) | |
| tree | b899eb81a70c7719c5d7e4328697cc314da24b97 | |
| parent | cee1f9b5c7917bba220f1156b342bf07cac4ad38 (diff) | |
storage/posix: implement batched fsync in a single thread
The extra fsync()s issued by AFR transactions can potentially
"clog" all the io-threads, preventing unrelated operations
from making progress.
This patch assigns a dedicated thread to issue fsyncs, as
an experimental feature to understand the performance characteristics
of this approach.
As a basis, incoming individual fsync requests are grouped into
batches, falling in the same @batch-fsync-delay-usec window of
time. These windows can extend in practice, as processing of
the previous batch can take longer than @batch-fsync-delay-usec
while new requests are getting batched.
The feature supports four modes (similar to the -S modes of fs_mark)
- syncfs: In this mode one syncfs() is issued per batch, instead
  of N fsync()s (one per file.)
- syncfs-single-fsync: In this mode one syncfs() is issued per
  batch (which, on Linux, guarantees the completion of write-out
  of dirty pages in the filesystem up to that point) and one single
  fsync() to synchronize or flush the controller/drive cache. This
  corresponds to -S 2 of fs_mark.
- syncfs-reverse-fsync: In this mode, one syncfs() is issued per
  batch, and all the open files in that batch are fsync()'ed in
  the reverse order of the queue. This corresponds to -S 4 of
  fs_mark.
- reverse-fsync: In this mode, no syncfs() is issued and all the
  files in the batch are fsync()'ed in the reverse order. This
  corresponds to -S 3 of fs_mark.
Change-Id: Ia1e170a810c780c8d80e02cf910accc4170c4cd4
BUG: 927146
Signed-off-by: Anand Avati <avati@redhat.com>
Reviewed-on: http://review.gluster.org/4746
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Vijay Bellur <vbellur@redhat.com>
| -rw-r--r-- | xlators/cluster/afr/src/afr-transaction.c | 11 | ||||
| -rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-volume-set.c | 8 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix-helpers.c | 140 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix.c | 111 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix.h | 19 | 
5 files changed, 288 insertions, 1 deletions
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index d6d420910..def00e288 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -1478,6 +1478,8 @@ afr_changelog_fsync (call_frame_t *frame, xlator_t *this)          int i = 0;          int call_count = 0;          afr_private_t *priv = NULL; + 	dict_t *xdata = NULL; + 	GF_UNUSED int ret = -1;          local = frame->local;          priv = this->private; @@ -1493,6 +1495,10 @@ afr_changelog_fsync (call_frame_t *frame, xlator_t *this)          local->call_count = call_count; +	xdata = dict_new(); +	if (xdata) +		ret = dict_set_int32 (xdata, "batch-fsync", 1); +          for (i = 0; i < priv->child_count; i++) {                  if (!local->transaction.pre_op[i])                          continue; @@ -1500,11 +1506,14 @@ afr_changelog_fsync (call_frame_t *frame, xlator_t *this)                  STACK_WIND_COOKIE (frame, afr_changelog_fsync_cbk,                                  (void *) (long) i, priv->children[i],                                  priv->children[i]->fops->fsync, local->fd, -                                1, NULL); +                                1, xdata);                  if (!--call_count)                          break;          } +	if (xdata) +		dict_unref (xdata); +          return 0;  } diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index 6575e2528..245b29af9 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -1201,6 +1201,14 @@ struct volopt_map_entry glusterd_volopt_map[] = {            .voltype     = "storage/posix",            .op_version  = 1          }, +        { .key         = "storage.batch-fsync-mode", +          .voltype     = "storage/posix", +          .op_version  = 3 +        }, +        { .key         = "storage.batch-fsync-delay-usec", +          
.voltype     = "storage/posix", +          .op_version  = 3 +        },          { .key         = "storage.owner-uid",            .voltype     = "storage/posix",            .option      = "brick-uid", diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c index a13e02078..0c0fdbabc 100644 --- a/xlators/storage/posix/src/posix-helpers.c +++ b/xlators/storage/posix/src/posix-helpers.c @@ -1177,3 +1177,143 @@ posix_spawn_health_check_thread (xlator_t *xl)  unlock:          UNLOCK (&priv->lock);  } + +int +posix_fsyncer_pick (xlator_t *this, struct list_head *head) +{ +	struct posix_private *priv = NULL; +	int count = 0; + +	priv = this->private; +	pthread_mutex_lock (&priv->fsync_mutex); +	{ +		while (list_empty (&priv->fsyncs)) +			pthread_cond_wait (&priv->fsync_cond, +					   &priv->fsync_mutex); + +		count = priv->fsync_queue_count; +		priv->fsync_queue_count = 0; +		list_splice_init (&priv->fsyncs, head); +	} +	pthread_mutex_unlock (&priv->fsync_mutex); + +	return count; +} + + +void +posix_fsyncer_process (xlator_t *this, call_stub_t *stub, gf_boolean_t do_fsync) +{ +	struct posix_fd *pfd = NULL; +	int ret = -1; +	struct posix_private *priv = NULL; + +	priv = this->private; + +	ret = posix_fd_ctx_get (stub->args.fd, this, &pfd); +	if (ret < 0) { +		gf_log (this->name, GF_LOG_ERROR, +			"could not get fdctx for fd(%s)", +			uuid_utoa (stub->args.fd->inode->gfid)); +		call_unwind_error (stub, -1, EINVAL); +		return; +	} + +	if (do_fsync) { +#ifdef HAVE_FDATASYNC +		if (stub->args.datasync) +			ret = fdatasync (pfd->fd); +		else +#endif +			ret = fsync (pfd->fd); +	} else { +		ret = 0; +	} + +	if (ret) { +		gf_log (this->name, GF_LOG_ERROR, +			"could not fstat fd(%s)", +			uuid_utoa (stub->args.fd->inode->gfid)); +		call_unwind_error (stub, -1, errno); +		return; +	} + +	call_unwind_error (stub, 0, 0); +} + + +static void +posix_fsyncer_syncfs (xlator_t *this, struct list_head *head) +{ +	call_stub_t *stub = NULL; +	
struct posix_fd *pfd = NULL; +	int ret = -1; + +	stub = list_entry (head->prev, call_stub_t, list); +	ret = posix_fd_ctx_get (stub->args.fd, this, &pfd); +	if (ret) +		return; + +#ifdef GF_LINUX_HOST_OS +	/* syncfs() is not "declared" in RHEL's glibc even though +	   the kernel has support. +	*/ +#include <sys/syscall.h> +#include <unistd.h> +	syscall (SYS_syncfs, pfd->fd); +#else +	sync(); +#endif + +} + + +void * +posix_fsyncer (void *d) +{ +	xlator_t *this = d; +	struct posix_private *priv = NULL; +	call_stub_t *stub = NULL; +	call_stub_t *tmp = NULL; +	struct list_head list; +	int count = 0; +	gf_boolean_t do_fsync = _gf_true; + +	priv = this->private; + +	for (;;) { +		INIT_LIST_HEAD (&list); + +		count = posix_fsyncer_pick (this, &list); + +		usleep (priv->batch_fsync_delay_usec); + +		gf_log (this->name, GF_LOG_DEBUG, +			"picked %d fsyncs", count); + +		switch (priv->batch_fsync_mode) { +		case BATCH_NONE: +		case BATCH_REVERSE_FSYNC: +			break; +		case BATCH_SYNCFS: +		case BATCH_SYNCFS_SINGLE_FSYNC: +		case BATCH_SYNCFS_REVERSE_FSYNC: +			posix_fsyncer_syncfs (this, &list); +			break; +		} + +		if (priv->batch_fsync_mode == BATCH_SYNCFS) +			do_fsync = _gf_false; +		else +			do_fsync = _gf_true; + +		list_for_each_entry_safe_reverse (stub, tmp, &list, list) { +			list_del_init (&stub->list); + +			posix_fsyncer_process (this, stub, do_fsync); + +			if (priv->batch_fsync_mode == BATCH_SYNCFS_SINGLE_FSYNC) +				do_fsync = _gf_false; +		} +	} +} diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c index fee2ce9d7..e917766b5 100644 --- a/xlators/storage/posix/src/posix.c +++ b/xlators/storage/posix/src/posix.c @@ -2417,6 +2417,33 @@ out:  } +int +posix_batch_fsync (call_frame_t *frame, xlator_t *this, +		     fd_t *fd, int datasync, dict_t *xdata) +{ +	call_stub_t *stub = NULL; +	struct posix_private *priv = NULL; + +	priv = this->private; + +	stub = fop_fsync_stub (frame, default_fsync, fd, datasync, xdata); +	if (!stub) { +		
STACK_UNWIND_STRICT (fsync, frame, -1, ENOMEM, 0, 0, 0); +		return 0; +	} + +	pthread_mutex_lock (&priv->fsync_mutex); +	{ +		list_add_tail (&stub->list, &priv->fsyncs); +		priv->fsync_queue_count++; +		pthread_cond_signal (&priv->fsync_cond); +	} +	pthread_mutex_unlock (&priv->fsync_mutex); + +	return 0; +} + +  int32_t  posix_fsync (call_frame_t *frame, xlator_t *this,               fd_t *fd, int32_t datasync, dict_t *xdata) @@ -2428,6 +2455,7 @@ posix_fsync (call_frame_t *frame, xlator_t *this,          int               ret      = -1;          struct iatt       preop = {0,};          struct iatt       postop = {0,}; +        struct posix_private *priv = NULL;          DECLARE_OLD_FS_ID_VAR; @@ -2443,6 +2471,12 @@ posix_fsync (call_frame_t *frame, xlator_t *this,          goto out;  #endif +	priv = this->private; +	if (priv->batch_fsync_mode && xdata && dict_get (xdata, "batch-fsync")) { +		posix_batch_fsync (frame, this, fd, datasync, xdata); +		return 0; +	} +          ret = posix_fd_ctx_get (fd, this, &pfd);          if (ret < 0) {                  op_errno = -ret; @@ -4303,6 +4337,27 @@ posix_set_owner (xlator_t *this, uid_t uid, gid_t gid)          return ret;  } + +static int +set_batch_fsync_mode (struct posix_private *priv, const char *str) +{ +	if (strcmp (str, "none") == 0) +		priv->batch_fsync_mode = BATCH_NONE; +	else if (strcmp (str, "syncfs") == 0) +		priv->batch_fsync_mode = BATCH_SYNCFS; +	else if (strcmp (str, "syncfs-single-fsync") == 0) +		priv->batch_fsync_mode = BATCH_SYNCFS_SINGLE_FSYNC; +	else if (strcmp (str, "syncfs-reverse-fsync") == 0) +		priv->batch_fsync_mode = BATCH_SYNCFS_REVERSE_FSYNC; +	else if (strcmp (str, "reverse-fsync") == 0) +		priv->batch_fsync_mode = BATCH_REVERSE_FSYNC; +	else +		return -1; + +	return 0; +} + +  int  reconfigure (xlator_t *this, dict_t *options)  { @@ -4310,6 +4365,7 @@ reconfigure (xlator_t *this, dict_t *options)  	struct posix_private *priv = NULL;          uid_t                 uid = -1;          
gid_t                 gid = -1; +	char                 *batch_fsync_mode_str = NULL;  	priv = this->private; @@ -4317,6 +4373,18 @@ reconfigure (xlator_t *this, dict_t *options)          GF_OPTION_RECONF ("brick-gid", gid, options, uint32, out);          posix_set_owner (this, uid, gid); +	GF_OPTION_RECONF ("batch-fsync-delay-usec", priv->batch_fsync_delay_usec, +			  options, uint32, out); + +	GF_OPTION_RECONF ("batch-fsync-mode", batch_fsync_mode_str, +			  options, str, out); + +	if (set_batch_fsync_mode (priv, batch_fsync_mode_str) != 0) { +		gf_log (this->name, GF_LOG_ERROR, "Unknown mode string: %s", +			batch_fsync_mode_str); +		goto out; +	} +  	GF_OPTION_RECONF ("linux-aio", priv->aio_configured,  			  options, bool, out); @@ -4368,6 +4436,7 @@ init (xlator_t *this)          char                 *guuid         = NULL;          uid_t                 uid           = -1;          gid_t                 gid           = -1; +	char                 *batch_fsync_mode_str;          dir_data = dict_get (this->options, "directory"); @@ -4720,6 +4789,28 @@ init (xlator_t *this)          INIT_LIST_HEAD (&_private->janitor_fds);          posix_spawn_janitor_thread (this); + +	pthread_mutex_init (&_private->fsync_mutex, NULL); +	pthread_cond_init (&_private->fsync_cond, NULL); +	INIT_LIST_HEAD (&_private->fsyncs); + +	ret = pthread_create (&_private->fsyncer, NULL, posix_fsyncer, this); +	if (ret) { +		gf_log (this->name, GF_LOG_ERROR, "fsyncer thread" +			" creation failed (%s)", strerror (errno)); +		goto out; +	} + +	GF_OPTION_INIT ("batch-fsync-mode", batch_fsync_mode_str, str, out); + +	if (set_batch_fsync_mode (_private, batch_fsync_mode_str) != 0) { +		gf_log (this->name, GF_LOG_ERROR, "Unknown mode string: %s", +			batch_fsync_mode_str); +		goto out; +	} + +	GF_OPTION_INIT ("batch-fsync-delay-usec", _private->batch_fsync_delay_usec, +			uint32, out);  out:          return ret;  } @@ -4849,5 +4940,25 @@ struct volume_options options[] = {            .description = 
"Interval in seconds for a filesystem health check, "                           "set to 0 to disable"          }, +	{ .key = {"batch-fsync-mode"}, +	  .type = GF_OPTION_TYPE_STR, +	  .default_value = "reverse-fsync", +	  .description = "Possible values:\n" +	  "\t- syncfs: Perform one syncfs() on behalf oa batch" +	  "of fsyncs.\n" +	  "\t- syncfs-single-fsync: Perform one syncfs() on behalf of a batch" +	  " of fsyncs and one fsync() per batch.\n" +	  "\t- syncfs-reverse-fsync: Preform one syncfs() on behalf of a batch" +	  " of fsyncs and fsync() each file in the batch in reverse order.\n" +	  " in reverse order.\n" +	  "\t- reverse-fsync: Perform fsync() of each file in the batch in" +	  " reverse order." +	}, +	{ .key = {"batch-fsync-delay-usec"}, +	  .type = GF_OPTION_TYPE_INT, +	  .default_value = "1000000", +	  .description = "Num of usecs to wait for aggregating fsync" +	  " requests", +	},          { .key  = {NULL} }  }; diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h index 22340370e..c834b29d9 100644 --- a/xlators/storage/posix/src/posix.h +++ b/xlators/storage/posix/src/posix.h @@ -43,6 +43,7 @@  #include "timer.h"  #include "posix-mem-types.h"  #include "posix-handle.h" +#include "call-stub.h"  #ifdef HAVE_LIBAIO  #include <libaio.h> @@ -128,6 +129,22 @@ struct posix_private {          /* node-uuid in pathinfo xattr */          gf_boolean_t  node_uuid_pathinfo; +	pthread_t         fsyncer; +	struct list_head  fsyncs; +	pthread_mutex_t   fsync_mutex; +	pthread_cond_t    fsync_cond; +	int               fsync_queue_count; + +	enum { +		BATCH_NONE = 0, +		BATCH_SYNCFS, +		BATCH_SYNCFS_SINGLE_FSYNC, +		BATCH_REVERSE_FSYNC, +		BATCH_SYNCFS_REVERSE_FSYNC +	}               batch_fsync_mode; + +	uint32_t        batch_fsync_delay_usec; +          /* seconds to sleep between health checks */          uint32_t        health_check_interval;          pthread_t       health_check; @@ -184,4 +201,6 @@ void  __posix_fd_set_odirect 
(fd_t *fd, struct posix_fd *pfd, int opflags,  			off_t offset, size_t size);  void posix_spawn_health_check_thread (xlator_t *this); + +void *posix_fsyncer (void *);  #endif /* _POSIX_H */  | 
