diff options
| -rw-r--r-- | libglusterfs/src/common-utils.h | 6 | ||||
| -rwxr-xr-x | tests/bugs/rdma/bug-765473.t | 35 | ||||
| -rw-r--r-- | tests/bugs/write-behind/bug-1279730.c | 131 | ||||
| -rwxr-xr-x | tests/bugs/write-behind/bug-1279730.t | 31 | ||||
| -rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-volume-set.c | 14 | ||||
| -rw-r--r-- | xlators/mount/fuse/src/fuse-bridge.c | 7 | ||||
| -rw-r--r-- | xlators/performance/write-behind/src/write-behind.c | 480 | 
7 files changed, 562 insertions, 142 deletions
diff --git a/libglusterfs/src/common-utils.h b/libglusterfs/src/common-utils.h index 0e7aa016cbd..f5f4493e21b 100644 --- a/libglusterfs/src/common-utils.h +++ b/libglusterfs/src/common-utils.h @@ -432,10 +432,12 @@ iov_subset (struct iovec *orig, int orig_count,  	int    i;  	off_t  offset = 0;  	size_t start_offset = 0; -	size_t end_offset = 0; +	size_t end_offset = 0, origin_iov_len = 0;  	for (i = 0; i < orig_count; i++) { +                origin_iov_len = orig[i].iov_len; +  		if ((offset + orig[i].iov_len < src_offset)  		    || (offset > dst_offset)) {  			goto not_subset; @@ -463,7 +465,7 @@ iov_subset (struct iovec *orig, int orig_count,  		new_count++;  	not_subset: -		offset += orig[i].iov_len; +		offset += origin_iov_len;  	}  	return new_count; diff --git a/tests/bugs/rdma/bug-765473.t b/tests/bugs/rdma/bug-765473.t deleted file mode 100755 index 9f595a1d479..00000000000 --- a/tests/bugs/rdma/bug-765473.t +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash - -. $(dirname $0)/../../include.rc -. $(dirname $0)/../../volume.rc -. $(dirname $0)/../../fileio.rc - -cleanup; - -function clients_connected() -{ -    volname=$1 -    gluster volume status $volname clients | grep -i 'Clients connected' | sed -e 's/[^0-9]*\(.*\)/\1/g' -} - -## Start and create a volume -TEST glusterd; -TEST pidof glusterd; -TEST $CLI volume create $V0 $H0:$B0/${V0}1 -TEST $CLI volume start $V0; - -TEST glusterfs --direct-io-mode=yes --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0; - -TEST fd=`fd_available` -TEST fd_open $fd 'w' "$M0/testfile" -TEST fd_write $fd "content" -TEST $CLI volume stop $V0 -# write some content which will result in marking fd bad -fd_write $fd "more content" -sync $V0 -TEST $CLI volume start $V0 -EXPECT 'Started' volinfo_field $V0 'Status'; -EXPECT_WITHIN $PROCESS_UP_TIMEOUT 2 clients_connected $V0 -TEST ! fd_write $fd "still more content" - -cleanup diff --git a/tests/bugs/write-behind/bug-1279730.c b/tests/bugs/write-behind/bug-1279730.c new file mode 100644 index 00000000000..535d289c582 --- /dev/null +++ b/tests/bugs/write-behind/bug-1279730.c @@ -0,0 +1,131 @@ +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> +#include <assert.h> + +int +main (int argc, char *argv[]) +{ +        int          fd                = -1, ret = -1, len = 0; +        char        *path              = NULL, buf[128] = {0, }, *cmd = NULL; +        struct stat  stbuf             = {0, }; +        int          write_to_child[2] = {0, }, write_to_parent[2] = {0, }; + +        path = argv[1]; +        cmd = argv[2]; + +        assert (argc == 3); + +        ret = pipe (write_to_child); +        if (ret < 0) { +                fprintf (stderr, "creation of write-to-child pipe failed " +                         "(%s)\n", strerror (errno)); +                goto out; +        } + +        ret = pipe (write_to_parent); +        if (ret < 0) { +                fprintf (stderr, "creation of write-to-parent pipe failed " +                         "(%s)\n", strerror (errno)); +                goto out; +        } + +        ret = fork (); +        switch (ret) { +        case 0: +                close (write_to_child[1]); +                close (write_to_parent[0]); + +                /* child, wait for instructions to execute command */ +                ret = read (write_to_child[0], buf, 128); +                if (ret < 0) { +                        fprintf (stderr, "child: read on pipe failed (%s)\n", +                                 strerror (errno)); +                        goto out; +                } + +                system (cmd); + +                ret = write (write_to_parent[1], "1", 2); +                if (ret < 0) { +                        fprintf (stderr, "child: write to pipe failed (%s)\n", +                                 strerror (errno)); +                        goto out; +                } +                break; + +        case -1: +                fprintf (stderr, "fork failed (%s)\n", strerror (errno)); +                goto out; + +        default: +                close (write_to_parent[1]); +                close (write_to_child[0]); + +                fd = open (path, O_CREAT | O_RDWR | O_APPEND, S_IRWXU); +                if (fd < 0) { +                        fprintf (stderr, "open failed (%s)\n", +                                 strerror (errno)); +                        goto out; +                } + +                len = strlen ("test-content") + 1; +                ret = write (fd, "test-content", len); + +                if (ret < len) { +                        fprintf (stderr, "write failed %d (%s)\n", ret, +                                 strerror (errno)); +                } + +                ret = pread (fd, buf, 128, 0); +                if ((ret == len) && (strcmp (buf, "test-content") == 0)) { +                        fprintf (stderr, "read should've failed as previous " +                                 "write would've failed with EDQUOT, but its " +                                 "successful"); +                        ret = -1; +                        goto out; +                } + +                ret = write (write_to_child[1], "1", 2); +                if (ret < 0) { +                        fprintf (stderr, "parent: write to pipe failed (%s)\n", +                                 strerror (errno)); +                        goto out; +                } + +                ret = read (write_to_parent[0], buf, 128); +                if (ret < 0) { +                        fprintf (stderr, "parent: read from pipe failed (%s)\n", +                                 strerror (errno)); +                        goto out; +                } + +                /* this will force a sync on cached-write and now that quota +                   limit is increased, sync will be successful. ignore return +                   value as fstat would fail with EDQUOT (picked up from +                   cached-write because of previous sync failure. +                */ +                fstat (fd, &stbuf); + +                ret = pread (fd, buf, 128, 0); +                if (ret != len) { +                        fprintf (stderr, "post cmd read failed %d (data:%s) " +                                 "(error:%s)\n", ret, buf, strerror (errno)); +                        goto out; +                } + +                if (strcmp (buf, "test-content")) { +                        fprintf (stderr, "wrong data (%s)\n", buf); +                        goto out; +                } +        } + +        ret = 0; + +out: +        return ret; +} diff --git a/tests/bugs/write-behind/bug-1279730.t b/tests/bugs/write-behind/bug-1279730.t new file mode 100755 index 00000000000..38e564b7afc --- /dev/null +++ b/tests/bugs/write-behind/bug-1279730.t @@ -0,0 +1,31 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc +. $(dirname $0)/../../fileio.rc + +cleanup; + +## Start and create a volume +TEST glusterd; +TEST pidof glusterd; +TEST $CLI volume info; + +TEST $CLI volume create $V0 $H0:$B0/$V0; +TEST $CLI volume start $V0; +TEST $CLI volume quota $V0 enable +TEST $CLI volume quota $V0 limit-usage / 4 + +TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0 + +# compile the test program and run it +TEST $CC -O0 -g3 $(dirname $0)/bug-1279730.c -o $(dirname $0)/bug-1279730 + +TEST $(dirname $0)/bug-1279730 $M0/file "\"$CLI volume quota $V0 limit-usage / 1024\"" + +EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 + +TEST $CLI volume stop $V0; +TEST $CLI volume delete $V0; + +cleanup; diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index b5e5d5fbf7b..593ae2aa833 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -1215,6 +1215,20 @@ struct volopt_map_entry glusterd_volopt_map[] = {            .op_version = 1,            .flags      = OPT_FLAG_CLIENT_OPT          }, +        { .key        = "performance.resync-failed-syncs-after-fsync", +          .voltype    = "performance/write-behind", +          .option     = "resync-failed-syncs-after-fsync", +          .op_version = GD_OP_VERSION_3_7_6, +          .flags      = OPT_FLAG_CLIENT_OPT, +          .description = "If sync of \"cached-writes issued before fsync\" " +                         "(to backend) fails, this option configures whether " +                         "to retry syncing them after fsync or forget them. " +                         "If set to on, cached-writes are retried " +                         "till a \"flush\" fop (or a successful sync) on sync " +                         "failures. " +                         "fsync itself is failed irrespective of the value of " +                         "this option. ", +        },          { .key        = "performance.nfs.write-behind-window-size",            .voltype    = "performance/write-behind",            .option     = "cache-size", diff --git a/xlators/mount/fuse/src/fuse-bridge.c b/xlators/mount/fuse/src/fuse-bridge.c index fcc77c51be9..eba8a6ddb0c 100644 --- a/xlators/mount/fuse/src/fuse-bridge.c +++ b/xlators/mount/fuse/src/fuse-bridge.c @@ -4968,11 +4968,16 @@ fuse_thread_proc (void *data)  int32_t  fuse_itable_dump (xlator_t  *this)  { +        fuse_private_t *priv = NULL; +          if (!this)                   return -1; +        priv = this->private; +          gf_proc_dump_add_section("xlator.mount.fuse.itable"); -        inode_table_dump(this->itable, "xlator.mount.fuse.itable"); +        inode_table_dump(priv->active_subvol->itable, +                         "xlator.mount.fuse.itable");          return 0;  } diff --git a/xlators/performance/write-behind/src/write-behind.c b/xlators/performance/write-behind/src/write-behind.c index 1adda4eaff4..285420526f4 100644 --- a/xlators/performance/write-behind/src/write-behind.c +++ b/xlators/performance/write-behind/src/write-behind.c @@ -107,6 +107,14 @@ typedef struct wb_inode {  	size_t       size; /* Size of the file to catch write after EOF. */          gf_lock_t    lock;          xlator_t    *this; +        int          dontsync; /* If positive, dont pick lies for +                                * winding. This is needed to break infinite +                                * recursion during invocation of +                                * wb_process_queue from +                                * wb_fulfill_cbk in case of an +                                * error during fulfill. +                                */ +  } wb_inode_t; @@ -144,6 +152,8 @@ typedef struct wb_request {  				       request arrival */  	fd_t                 *fd; +        int                   wind_count;    /* number of sync-attempts. Only +                                                for debug purposes */  	struct {  		size_t        size;          /* 0 size == till infinity */  		off_t         off; @@ -164,6 +174,7 @@ typedef struct wb_conf {          gf_boolean_t     trickling_writes;  	gf_boolean_t     strict_write_ordering;  	gf_boolean_t     strict_O_DIRECT; +        gf_boolean_t     resync_after_fsync;  } wb_conf_t; @@ -202,26 +213,6 @@ out:  } -gf_boolean_t -wb_fd_err (fd_t *fd, xlator_t *this, int32_t *op_errno) -{ -        gf_boolean_t err   = _gf_false; -        uint64_t     value = 0; -        int32_t      tmp   = 0; - -	if (fd_ctx_get (fd, this, &value) == 0) { -                if (op_errno) { -                        tmp = value; -                        *op_errno = tmp; -                } - -                err = _gf_true; -        } - -        return err; -} - -  /*    Below is a succinct explanation of the code deciding whether two regions    overlap, from Pavan <tcp@gluster.com>. @@ -305,17 +296,17 @@ wb_requests_conflict (wb_request_t *lie, wb_request_t *req)  } -gf_boolean_t +wb_request_t *  wb_liability_has_conflict (wb_inode_t *wb_inode, wb_request_t *req)  {          wb_request_t *each     = NULL;          list_for_each_entry (each, &wb_inode->liability, lie) {  		if (wb_requests_conflict (each, req)) -			return _gf_true; +			return each;          } -        return _gf_false; +        return NULL;  } @@ -552,6 +543,9 @@ wb_enqueue_common (wb_inode_t *wb_inode, call_stub_t *stub, int tempted)  		break;  	default: +                if (stub && stub->args.fd) +                        req->fd = fd_ref (stub->args.fd); +  		break;  	} @@ -679,6 +673,88 @@ __wb_fulfill_request (wb_request_t *req)  } +/* get a flush/fsync waiting on req */ +wb_request_t * +__wb_request_waiting_on (wb_request_t *req) +{ +        wb_inode_t   *wb_inode = NULL; +        wb_request_t *trav     = NULL; + +        wb_inode = req->wb_inode; + +        list_for_each_entry (trav, &wb_inode->todo, todo) { +                if ((trav->fd == req->fd) +                    && ((trav->stub->fop == GF_FOP_FLUSH) +                        || (trav->stub->fop == GF_FOP_FSYNC)) +                    && (trav->gen >= req->gen)) +                        return trav; +        } + +        return NULL; +} + + +void +__wb_fulfill_request_err (wb_request_t *req, int32_t op_errno) +{ +	wb_inode_t   *wb_inode = NULL; +        wb_request_t *waiter   = NULL; +        wb_conf_t    *conf     = NULL; + +        wb_inode = req->wb_inode; + +        conf = wb_inode->this->private; + +        req->op_ret = -1; +        req->op_errno = op_errno; + +        if (req->ordering.lied) +                waiter = __wb_request_waiting_on (req); + +        if (!req->ordering.lied || waiter) { +                if (!req->ordering.lied) { +                        /* response to app is still pending, send failure in +                         * response. +                         */ +                } else { +                        /* response was sent, store the error in a +                         * waiter (either an fsync or flush). +                         */ +                        waiter->op_ret = -1; +                        waiter->op_errno = op_errno; +                } + +                if (!req->ordering.lied +                    || (waiter->stub->fop == GF_FOP_FLUSH) +                    || ((waiter->stub->fop == GF_FOP_FSYNC) +                        && !conf->resync_after_fsync)) { +                        /* No retry needed, forget the request */ +                        __wb_fulfill_request (req); +                        return; +                } +        } + +        /* response was unwound and no waiter waiting on this request, retry +           till a flush or fsync (subject to conf->resync_after_fsync). +        */ +	wb_inode->transit -= req->total_size; + +        req->total_size = 0; + +        list_del_init (&req->winds); +        list_del_init (&req->todo); +        list_del_init (&req->wip); + +        /* sanitize ordering flags to retry */ +        req->ordering.go = 0; + +        /* Add back to todo list to retry */ +        list_add (&req->todo, &wb_inode->todo); + +        return; +} + +  void  wb_head_done (wb_request_t *head)  { @@ -693,6 +769,7 @@ wb_head_done (wb_request_t *head)  		list_for_each_entry_safe (req, tmp, &head->winds, winds) {  			__wb_fulfill_request (req);  		} +  		__wb_fulfill_request (head);  	}  	UNLOCK (&wb_inode->lock); @@ -700,29 +777,130 @@ wb_head_done (wb_request_t *head)  void +__wb_fulfill_err (wb_request_t *head, int op_errno) +{ +	wb_request_t *req      = NULL, *tmp = NULL; + +        if (!head) +                goto out; + +        head->wb_inode->dontsync++; + +        list_for_each_entry_safe_reverse (req, tmp, &head->winds, +                                          winds) { +                __wb_fulfill_request_err (req, op_errno); +        } + +        __wb_fulfill_request_err (head, op_errno); + +out: +        return; +} + + +void  wb_fulfill_err (wb_request_t *head, int op_errno)  { -	wb_inode_t *wb_inode; -	wb_request_t *req; +	wb_inode_t   *wb_inode = NULL;  	wb_inode = head->wb_inode; -	/* for all future requests yet to arrive */ -	fd_ctx_set (head->fd, THIS, op_errno); -  	LOCK (&wb_inode->lock);  	{ -		/* for all requests already arrived */ -		list_for_each_entry (req, &wb_inode->all, all) { -			if (req->fd != head->fd) -				continue; -			req->op_ret = -1; -			req->op_errno = op_errno; -		} +                __wb_fulfill_err (head, op_errno); +  	}  	UNLOCK (&wb_inode->lock);  } +inline void +__wb_modify_write_request (wb_request_t *req, int synced_size, +                           int head_total_size) +{ +        struct iovec *vector = NULL; +        int           count  = 0; + +        if (!req || synced_size == 0) +                goto out; + +        req->write_size -= synced_size; +        req->stub->args.offset += synced_size; +        req->total_size = head_total_size; + +        vector = req->stub->args.vector; +        count = req->stub->args.count; + +        req->stub->args.count = iov_subset (vector, count, synced_size, +                                            iov_length (vector, count), vector); + +out: +        return; +} + +int +__wb_fulfill_short_write (wb_request_t *req, int size, int total_size) +{ +        int accounted_size = 0; + +        if (req == NULL) +                goto out; + +        if (req->write_size <= size) { +                accounted_size = req->write_size; +                __wb_fulfill_request (req); +        } else { +                accounted_size = size; +                __wb_modify_write_request (req, size, total_size); +        } + +out: +        return accounted_size; +} + +void +wb_fulfill_short_write (wb_request_t *head, int size) +{ +        wb_inode_t   *wb_inode   = NULL; +        wb_request_t *req        = NULL, *tmp = NULL; +        int           total_size = 0, accounted_size = 0; + +        if (!head) +                goto out; + +        wb_inode = head->wb_inode; + +        total_size = head->total_size - size; +        head->total_size = size; + +        req = head; + +        LOCK (&wb_inode->lock); +        { +                accounted_size = __wb_fulfill_short_write (head, size, +                                                           total_size); + +                size -= accounted_size; + +                if (size == 0) +                        goto done; + +                list_for_each_entry_safe (req, tmp, &head->winds, winds) { +                        accounted_size = __wb_fulfill_short_write (req, size, +                                                                   total_size); +                        size -= accounted_size; + +                        if (size == 0) +                                break; + +                } +        } +done: +        UNLOCK (&wb_inode->lock); + +        wb_fulfill_err (req, EIO); +out: +        return; +}  int  wb_fulfill_cbk (call_frame_t *frame, void *cookie, xlator_t *this, @@ -740,18 +918,10 @@ wb_fulfill_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  	if (op_ret == -1) {  		wb_fulfill_err (head, op_errno);  	} else if (op_ret < head->total_size) { -		/* -		 * We've encountered a short write, for whatever reason. -		 * Set an EIO error for the next fop. This should be -		 * valid for writev or flush (close). -		 * -		 * TODO: Retry the write so we can potentially capture -		 * a real error condition (i.e., ENOSPC). -		 */ -		wb_fulfill_err (head, EIO); -	} - -	wb_head_done (head); +		wb_fulfill_short_write (head, op_ret); +	} else { +                wb_head_done (head); +        }          wb_process_queue (wb_inode); @@ -776,10 +946,6 @@ wb_fulfill_head (wb_inode_t *wb_inode, wb_request_t *head)  	int           count    = 0;  	wb_request_t *req      = NULL;  	call_frame_t *frame    = NULL; -        gf_boolean_t  fderr    = _gf_false; -        xlator_t     *this     = NULL; - -        this = THIS;          /* make sure head->total_size is updated before we run into any           * errors @@ -795,11 +961,6 @@ wb_fulfill_head (wb_inode_t *wb_inode, wb_request_t *head)  			goto err;  	} -        if (wb_fd_err (head->fd, this, NULL)) { -                fderr = _gf_true; -                goto err; -        } -  	frame = create_frame (wb_inode->this, wb_inode->this->ctx->pool);  	if (!frame)  		goto err; @@ -822,26 +983,21 @@ wb_fulfill_head (wb_inode_t *wb_inode, wb_request_t *head)  	return 0;  err: -        if (!fderr) { -                /* frame creation failure */ -                fderr = ENOMEM; -                wb_fulfill_err (head, fderr); -        } - -	wb_head_done (head); +        /* frame creation failure */ +        wb_fulfill_err (head, ENOMEM); -	return fderr; +	return ENOMEM;  } -#define NEXT_HEAD(head, req) do {					\ -		if (head)						\ -			ret |= wb_fulfill_head (wb_inode, head);	\ -		head = req;						\ -		expected_offset = req->stub->args.offset +		\ -			req->write_size;				\ -		curr_aggregate = 0;					\ -		vector_count = 0;					\ +#define NEXT_HEAD(head, req) do {                                       \ +		if (head)                                               \ +			ret |= wb_fulfill_head (wb_inode, head);        \ +		head = req;                                             \ +		expected_offset = req->stub->args.offset +              \ +			req->write_size;                                \ +		curr_aggregate = 0;                                     \ +		vector_count = 0;                                       \  	} while (0) @@ -1053,6 +1209,17 @@ __wb_preprocess_winds (wb_inode_t *wb_inode)  	conf = wb_inode->this->private;          list_for_each_entry_safe (req, tmp, &wb_inode->todo, todo) { +                if (wb_inode->dontsync && req->ordering.lied) { +                        /* sync has failed. Don't pick lies _again_ for winding +                         * as winding these lies again will trigger an infinite +                         * recursion of wb_process_queue being called from a +                         * failed fulfill. However, pick non-lied requests for +                         * winding so that application wont block indefinitely +                         * waiting for write result. +                         */ +                        continue; +                } +  		if (!req->ordering.tempted) {  			if (holder) {  				if (wb_requests_conflict (holder, req)) @@ -1124,20 +1291,96 @@ __wb_preprocess_winds (wb_inode_t *wb_inode)  	if (conf->trickling_writes && !wb_inode->transit && holder)  		holder->ordering.go = 1; +        if (wb_inode->dontsync > 0) +                wb_inode->dontsync--; +          return;  } +int +__wb_handle_failed_conflict (wb_request_t *req, wb_request_t *conflict, +                             list_head_t *tasks) +{ +        wb_conf_t *conf = NULL; + +        conf = req->wb_inode->this->private; + +        if ((req->stub->fop != GF_FOP_FLUSH) +            && ((req->stub->fop != GF_FOP_FSYNC) || conf->resync_after_fsync)) { +                if (!req->ordering.lied && list_empty (&conflict->wip)) { +                        /* If request itself is in liability queue, +                         * 1. We cannot unwind as the response has already been +                         *    sent. +                         * 2. We cannot wind till conflict clears up. +                         * 3. So, skip the request for now. +                         * 4. Otherwise, resume (unwind) it with error. +                         */ +                        req->op_ret = -1; +                        req->op_errno = conflict->op_errno; + +                        list_del_init (&req->todo); +                        list_add_tail (&req->winds, tasks); +                } +        } else { +                /* flush and fsync (without conf->resync_after_fsync) act as +                   barriers. We cannot unwind them out of +                   order, when there are earlier generation writes just because +                   there is a conflicting liability with an error. So, wait for +                   our turn till there are no conflicting liabilities. + +                   This situation can arise when there liabilities spread across +                   multiple generations. For eg., consider two writes with +                   following characterstics: + +                   1. they belong to different generations gen1, gen2 and +                      (gen1 > gen2). +                   2. they overlap. +                   3. both are liabilities. +                   4. gen1 write was attempted to sync, but the attempt failed. +                   5. there was no attempt to sync gen2 write yet. +                   6. A flush (as part of close) is issued and gets a gen no +                      gen3. + +                   In the above scenario, if flush is unwound without waiting +                   for gen1 and gen2 writes either to be successfully synced or +                   purged, we end up with these two writes in wb_inode->todo +                   list forever as there will be no attempt to process the queue +                   as flush is the last operation. +                */ +        } + +        return 0; +} + -void +int  __wb_pick_winds (wb_inode_t *wb_inode, list_head_t *tasks,  		 list_head_t *liabilities)  { -	wb_request_t *req = NULL; -	wb_request_t *tmp = NULL; +	wb_request_t *req                 = NULL; +	wb_request_t *tmp                 = NULL; +        wb_request_t *conflict            = NULL;  	list_for_each_entry_safe (req, tmp, &wb_inode->todo, todo) { -		if (wb_liability_has_conflict (wb_inode, req)) -			continue; +                conflict = wb_liability_has_conflict (wb_inode, req); +                if (conflict) { +                        if (conflict->op_ret == -1) { +                                /* There is a conflicting liability which failed +                                 * to sync in previous attempts, resume the req +                                 * and fail, unless its an fsync/flush. +                                 */ + +                                __wb_handle_failed_conflict (req, conflict, +                                                             tasks); +                        } else { +                                /* There is a conflicting liability which was +                                 * not attempted to sync even once. Wait till +                                 * atleast one attempt to sync is made. +                                 */ +                        } + +                        continue; +                }  		if (req->ordering.tempted && !req->ordering.go)  			/* wait some more */ @@ -1148,6 +1391,7 @@ __wb_pick_winds (wb_inode_t *wb_inode, list_head_t *tasks,  				continue;  			list_add_tail (&req->wip, &wb_inode->wip); +                        req->wind_count++;  			if (!req->ordering.tempted)  				/* unrefed in wb_writev_cbk */ @@ -1162,6 +1406,8 @@ __wb_pick_winds (wb_inode_t *wb_inode, list_head_t *tasks,  		else  			list_add_tail (&req->winds, tasks);  	} + +        return 0;  } @@ -1174,7 +1420,12 @@ wb_do_winds (wb_inode_t *wb_inode, list_head_t *tasks)  	list_for_each_entry_safe (req, tmp, tasks, winds) {  		list_del_init (&req->winds); -		call_resume (req->stub); +                if (req->op_ret == -1) { +                        call_unwind_error (req->stub, req->op_ret, +                                           req->op_errno); +                } else { +                        call_resume (req->stub); +                }  		wb_request_unref (req);  	} @@ -1184,10 +1435,10 @@ wb_do_winds (wb_inode_t *wb_inode, list_head_t *tasks)  void  wb_process_queue (wb_inode_t *wb_inode)  { -        list_head_t tasks  = {0, }; -	list_head_t lies = {0, }; -	list_head_t liabilities = {0, }; -        int         retry       = 0; +        list_head_t tasks        = {0, }; +	list_head_t lies         = {0, }; +	list_head_t liabilities  = {0, }; +        int         wind_failure = 0;          INIT_LIST_HEAD (&tasks);          INIT_LIST_HEAD (&lies); @@ -1209,13 +1460,12 @@ wb_process_queue (wb_inode_t *wb_inode)                  wb_do_winds (wb_inode, &tasks); -                /* fd might've been marked bad due to previous errors. -                 * Since, caller of wb_process_queue might be the last fop on -                 * inode, make sure we keep processing request queue, till there -                 * are no requests left. +                /* If there is an error in wb_fulfill before winding write +                 * requests, we would miss invocation of wb_process_queue +                 * from wb_fulfill_cbk. So, retry processing again.                   */ -                retry = wb_fulfill (wb_inode, &liabilities); -        } while (retry); +                wind_failure = wb_fulfill (wb_inode, &liabilities); +        } while (wind_failure);          return;  } @@ -1285,10 +1535,6 @@ wb_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,  	conf = this->private; -        if (wb_fd_err (fd, this, &op_errno)) { -                goto unwind; -        } -          wb_inode = wb_inode_create (this, fd->inode);  	if (!wb_inode) {  		op_errno = ENOMEM; @@ -1309,7 +1555,7 @@ wb_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,  					count, offset, flags, iobref, xdata);  	else  		stub = fop_writev_stub (frame, NULL, fd, vector, count, offset, -					flags, iobref, xdata); +                                        flags, iobref, xdata);          if (!stub) {                  op_errno = ENOMEM;                  goto unwind; @@ -1413,10 +1659,6 @@ wb_flush_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)  		goto unwind;  	} -	if (wb_fd_err (fd, this, &op_errno)) { -		op_ret = -1; -		goto unwind; -	}          if (conf->flush_behind)  		goto flushbehind; @@ -1495,9 +1737,6 @@ wb_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,          call_stub_t  *stub         = NULL;  	int32_t       op_errno     = EINVAL; -	if (wb_fd_err (fd, this, &op_errno)) -		goto unwind; -          wb_inode = wb_inode_ctx_get (this, fd->inode);  	if (!wb_inode)  		goto noqueue; @@ -1720,9 +1959,6 @@ wb_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,  		goto unwind;          } -	if (wb_fd_err (fd, this, &op_errno)) -		goto unwind; -  	frame->local = wb_inode;  	stub = fop_ftruncate_stub (frame, wb_ftruncate_helper, fd, @@ -2003,7 +2239,18 @@ __wb_dump_requests (struct list_head *head, char *prefix)  		else  			gf_proc_dump_write ("wound", "no"); +                gf_proc_dump_write ("generation-number", "%d", req->gen); + +                gf_proc_dump_write ("req->op_ret", "%d", req->op_ret); +                gf_proc_dump_write ("req->op_errno", "%d", req->op_errno); +                gf_proc_dump_write ("sync-attempts", "%d", req->wind_count); +                  if (req->fop == GF_FOP_WRITE) { +                        if (list_empty (&req->wip)) +                                gf_proc_dump_write ("sync-in-progress", "no"); +                        else +                                gf_proc_dump_write ("sync-in-progress", "yes"); +                          gf_proc_dump_write ("size", "%"GF_PRI_SIZET,                                              req->write_size); @@ -2021,6 +2268,7 @@ __wb_dump_requests (struct list_head *head, char *prefix)                          flag = req->ordering.go;                          gf_proc_dump_write ("go", "%d", flag); +                  }          }  } @@ -2066,6 +2314,11 @@ wb_inode_dump (xlator_t *this, inode_t *inode)                              wb_inode->window_current); +        gf_proc_dump_write ("transit-size", "%"GF_PRI_SIZET, +                            wb_inode->transit); + +        gf_proc_dump_write ("dontsync", "%d", wb_inode->dontsync); +          ret = TRY_LOCK (&wb_inode->lock);          if (!ret)          { @@ -2117,7 +2370,8 @@ reconfigure (xlator_t *this, dict_t *options)          conf = this->private; -        GF_OPTION_RECONF ("cache-size", conf->window_size, options, size_uint64, out); +        GF_OPTION_RECONF ("cache-size", conf->window_size, options, size_uint64, +                          out);          GF_OPTION_RECONF ("flush-behind", conf->flush_behind, options, bool,                            out); @@ -2130,6 +2384,9 @@ reconfigure (xlator_t *this, dict_t *options)          GF_OPTION_RECONF ("strict-write-ordering", conf->strict_write_ordering,  			  options, bool, out); +        GF_OPTION_RECONF ("resync-failed-syncs-after-fsync", +                          conf->resync_after_fsync, options, bool, out); +          ret = 0;  out:          return ret; @@ -2196,6 +2453,9 @@ init (xlator_t *this)          GF_OPTION_INIT ("strict-write-ordering", conf->strict_write_ordering,  			bool, out); +        GF_OPTION_INIT ("resync-failed-syncs-after-fsync", +                        conf->resync_after_fsync, bool, out); +          this->private = conf;          ret = 0; @@ -2287,5 +2547,17 @@ struct volume_options options[] = {  	  .description = "Do not let later writes overtake earlier writes even "  	                  "if they do not overlap",          }, +        { .key = {"resync-failed-syncs-after-fsync"}, +          .type = GF_OPTION_TYPE_BOOL, +          .default_value = "off", +          .description = "If sync of \"cached-writes issued before fsync\" " +                         "(to backend) fails, this option configures whether " +                         "to retry syncing them after fsync or forget them. " +                         "If set to on, cached-writes are retried " +                         "till a \"flush\" fop (or a successful sync) on sync " +                         "failures. " +                         "fsync itself is failed irrespective of the value of " +                         "this option. ", +        },          { .key = {NULL} },  };  | 
