summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPranith Kumar K <pkarampu@redhat.com>2017-05-24 22:30:29 +0530
committerShyamsundar Ranganathan <srangana@redhat.com>2017-05-29 14:12:20 +0000
commit1db7887771c748a63f3c46ce72918c98cb6dc208 (patch)
tree2d61937442f3f34b6e5dbd0f8b6eb62628df3214
parent5b3e5f1c54fae28c0c89c1b8f769a5777e970956 (diff)
features/shard: Handle offset in appending writes
When a file is opened with append, all writes are appended at the end of file irrespective of the offset given in the write syscall. This needs to be considered in shard size update function and also for choosing which shard to write to. At the moment shard piggybacks on queuing from write-behind xlator for ordering of the operations. So if write-behind is disabled and two parallel appending-writes come, both of which can increase the file size beyond shard-size, the file will be corrupted. >BUG: 1455301 >Change-Id: I9007e6a39098ab0b5d5386367bd07eb5f89cb09e >Signed-off-by: Pranith Kumar K <pkarampu@redhat.com> >Reviewed-on: https://review.gluster.org/17387 >Smoke: Gluster Build System <jenkins@build.gluster.org> >Reviewed-by: Krutika Dhananjay <kdhananj@redhat.com> >NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org> >CentOS-regression: Gluster Build System <jenkins@build.gluster.org> BUG: 1456225 Change-Id: I9007e6a39098ab0b5d5386367bd07eb5f89cb09e Signed-off-by: Pranith Kumar K <pkarampu@redhat.com> Reviewed-on: https://review.gluster.org/17404 Smoke: Gluster Build System <jenkins@build.gluster.org> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org> CentOS-regression: Gluster Build System <jenkins@build.gluster.org> Reviewed-by: Shyamsundar Ranganathan <srangana@redhat.com>
-rw-r--r--extras/group-gluster-block1
-rw-r--r--tests/bugs/shard/shard-append-test.c179
-rw-r--r--tests/bugs/shard/shard-append-test.t32
-rw-r--r--xlators/features/shard/src/shard.c109
4 files changed, 278 insertions, 43 deletions
diff --git a/extras/group-gluster-block b/extras/group-gluster-block
index 0753d26b3ca..a4a6367920b 100644
--- a/extras/group-gluster-block
+++ b/extras/group-gluster-block
@@ -2,7 +2,6 @@ performance.quick-read=off
performance.read-ahead=off
performance.io-cache=off
performance.stat-prefetch=off
-performance.write-behind=off
performance.open-behind=off
performance.readdir-ahead=off
network.remote-dio=enable
diff --git a/tests/bugs/shard/shard-append-test.c b/tests/bugs/shard/shard-append-test.c
new file mode 100644
index 00000000000..92dff3d078d
--- /dev/null
+++ b/tests/bugs/shard/shard-append-test.c
@@ -0,0 +1,179 @@
+#include <fcntl.h>
+#include <unistd.h>
+#include <time.h>
+#include <limits.h>
+#include <string.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <glusterfs/api/glfs.h>
+#include <glusterfs/api/glfs-handles.h>
+
+#define LOG_ERR(msg) do { \
+ fprintf (stderr, "%s : Error (%s)\n", msg, strerror (errno)); \
+ } while (0)
+
+/*This test tests that shard xlator handles offset in appending writes
+ * correctly. This test performs writes of 1025 bytes 1025 times, in 5 threads
+ * with different data per thread. The buffer to be written is same character repeated
+ * 1025 times in the buffer for a thread. At the end it reads the buffer till
+ * end of file and tests that the read of 1025 bytes is always same character
+ * and the content read is 5*1025*1025 size. 1025 bytes is chosen because it
+ * will lead to write on more than one shard at some point when the size is
+ * going over the initial shard*/
+pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+int thread_data = '1';
+
+glfs_t *
+init_glfs (const char *hostname, const char *volname,
+ const char *logfile) /* connect to volume @volname on @hostname; returns mounted fs handle or NULL */
+{
+ int ret = -1;
+ glfs_t *fs = NULL;
+
+ fs = glfs_new (volname); /* allocate a new virtual-mount object */
+ if (!fs) {
+ LOG_ERR ("glfs_new failed");
+ return NULL;
+ }
+
+ ret = glfs_set_volfile_server (fs, "tcp", hostname, 24007); /* 24007 = default glusterd management port */
+ if (ret < 0) {
+ LOG_ERR ("glfs_set_volfile_server failed");
+ goto out;
+ }
+
+ ret = glfs_set_logging (fs, logfile, 7); /* level 7: most verbose — presumably DEBUG; confirm against gfapi docs */
+ if (ret < 0) {
+ LOG_ERR ("glfs_set_logging failed");
+ goto out;
+ }
+
+ ret = glfs_init (fs); /* fetch volfile and complete the mount */
+ if (ret < 0) {
+ LOG_ERR ("glfs_init failed");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ if (ret) { /* any failure after glfs_new: tear down the half-built handle */
+ glfs_fini (fs);
+ fs = NULL;
+ }
+
+ return fs;
+}
+
+void* /* thread entry: appends 1025 writes of 1025 bytes, each thread using its own fill byte */
+write_data (void *data)
+{
+ char buf[1025] = {0};
+ glfs_fd_t *glfd = NULL;
+ glfs_t *fs = data;
+ int i = 0;
+
+ pthread_mutex_lock (&lock); /* serialize claiming a unique fill character */
+ {
+ memset(buf, thread_data, sizeof(buf));
+ thread_data++;
+ }
+ pthread_mutex_unlock (&lock);
+
+ for (i = 0; i < 1025; i++) {
+ glfd = glfs_creat(fs, "parallel-write.txt", /* O_SYNC is an open flag, not a mode bit */
+ O_WRONLY | O_APPEND | O_SYNC, S_IRUSR | S_IWUSR);
+ if (!glfd) {
+ LOG_ERR ("Failed to create file");
+ exit(1);
+ }
+
+ if (glfs_write (glfd, buf, sizeof(buf), 0) < 0) { /* offset is ignored: fd is O_APPEND */
+ LOG_ERR ("Failed to write to file");
+ exit(1);
+ }
+ if (glfs_close(glfd) != 0) {
+ LOG_ERR ("Failed to close file");
+ exit(1);
+ }
+ }
+ return NULL;
+}
+
+int
+main (int argc, char *argv[]) /* usage: shard-append-test <host> <volname> <logfile> */
+{
+ pthread_t tid[5] = {0};
+ char buf[1025] = {0};
+ char cmp_buf[1025] = {0};
+ int ret = 0;
+ char *hostname = NULL;
+ char *volname = NULL;
+ char *logfile = NULL;
+ glfs_t *fs = NULL;
+ glfs_fd_t *glfd = NULL;
+ ssize_t bytes_read = 0;
+ ssize_t total_bytes_read = 0;
+ int i = 0;
+
+ if (argc != 4) {
+ fprintf (stderr, "Invalid argument\n");
+ exit(1);
+ }
+
+ hostname = argv[1];
+ volname = argv[2];
+ logfile = argv[3];
+
+ fs = init_glfs (hostname, volname, logfile);
+ if (fs == NULL) {
+ LOG_ERR ("init_glfs failed");
+ return -1;
+ }
+
+ for (i = 0; i < 5; i++) { /* 5 concurrent appenders, one fill byte each */
+ pthread_create(&tid[i], NULL, write_data, fs);
+ }
+
+ for (i = 0; i < 5; i++) {
+ pthread_join(tid[i], NULL);
+ }
+ glfd = glfs_open(fs, "parallel-write.txt", O_RDONLY);
+ if (!glfd) {
+ LOG_ERR ("Failed to open file for reading");
+ exit(1);
+ }
+
+ while ((bytes_read = glfs_read (glfd, buf, sizeof(buf), 0)) > 0) {
+ if (bytes_read != (ssize_t) sizeof(buf)) { /* each 1025-byte write must read back whole */
+ fprintf (stderr, "Didn't read complete data read: %zd "
+ "expected: %zu", bytes_read, sizeof(buf));
+ exit(1);
+ }
+
+ total_bytes_read += bytes_read;
+ if (buf[0] < '1' || buf[0] >= thread_data) { /* fill byte must be one a writer claimed */
+ fprintf(stderr, "Invalid character found: %c", buf[0]);
+ exit(1);
+ }
+ memset(cmp_buf, buf[0], sizeof(cmp_buf));
+ if (memcmp(cmp_buf, buf, sizeof(cmp_buf))) { /* whole chunk must be one character — no interleaving */
+ LOG_ERR ("Data corrupted");
+ exit(1);
+ }
+ memset(cmp_buf, 0, sizeof(cmp_buf));
+ }
+
+ if (total_bytes_read != 5*1025*1025) { /* no write may be lost or overwritten */
+ fprintf(stderr, "Failed to read what is written, read: %zd, "
+ "expected %zu", total_bytes_read, (size_t) (5*1025*1025));
+ exit(1);
+ }
+
+ if (glfs_close(glfd) != 0) {
+ LOG_ERR ("Failed to close");
+ exit(1);
+ }
+ return 0;
+}
diff --git a/tests/bugs/shard/shard-append-test.t b/tests/bugs/shard/shard-append-test.t
new file mode 100644
index 00000000000..f8719f2a2c1
--- /dev/null
+++ b/tests/bugs/shard/shard-append-test.t
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+cleanup;
+
+TEST glusterd
+
+TEST $CLI volume create $V0 replica 3 ${H0}:$B0/brick{1,2,3}; # 3-way replica so appends traverse AFR as well
+TEST $CLI volume set $V0 features.shard on
+TEST $CLI volume set $V0 features.shard-block-size 4MB # small shard size so the test crosses shard boundaries quickly
+TEST $CLI volume set $V0 performance.quick-read off
+TEST $CLI volume set $V0 performance.io-cache off
+
+#Uncomment the following line after shard-queuing is implemented
+#TEST $CLI volume set $V0 performance.write-behind off
+
+TEST $CLI volume set $V0 performance.strict-o-direct on
+TEST $CLI volume set $V0 performance.stat-prefetch off
+TEST $CLI volume set $V0 performance.read-ahead off
+TEST $CLI volume start $V0;
+
+logdir=`gluster --print-logdir`
+
+TEST build_tester $(dirname $0)/shard-append-test.c -lgfapi -lpthread # compile the gfapi-based appender
+
+TEST ./$(dirname $0)/shard-append-test ${H0} $V0 $logdir/shard-append-test.log
+
+cleanup_tester $(dirname $0)/shard-append-test
+
+cleanup;
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
index a48806c6a8b..65cf23c7bdd 100644
--- a/xlators/features/shard/src/shard.c
+++ b/xlators/features/shard/src/shard.c
@@ -3629,6 +3629,18 @@ shard_common_inode_write_post_update_size_handler (call_frame_t *frame,
return 0;
}
+static gf_boolean_t
+shard_is_appending_write (shard_local_t *local) /* true iff this fop is a WRITE that will land at EOF */
+{
+ if (local->fop != GF_FOP_WRITE) /* only writev can append */
+ return _gf_false;
+ if (local->flags & O_APPEND) /* O_APPEND passed with this write */
+ return _gf_true;
+ if (local->fd->flags & O_APPEND) /* fd itself was opened O_APPEND */
+ return _gf_true;
+ return _gf_false;
+}
+
int
__shard_get_delta_size_from_inode_ctx (shard_local_t *local, inode_t *inode,
xlator_t *this)
@@ -3643,13 +3655,15 @@ __shard_get_delta_size_from_inode_ctx (shard_local_t *local, inode_t *inode,
ctx = (shard_inode_ctx_t *) ctx_uint;
- if (local->offset + local->total_size > ctx->stat.ia_size) {
+ if (shard_is_appending_write (local)) {
+ local->delta_size = local->total_size;
+ } else if (local->offset + local->total_size > ctx->stat.ia_size) {
local->delta_size = (local->offset + local->total_size) -
ctx->stat.ia_size;
- ctx->stat.ia_size += (local->delta_size);
} else {
local->delta_size = 0;
}
+ ctx->stat.ia_size += (local->delta_size);
local->postbuf = ctx->stat;
return 0;
@@ -3955,8 +3969,11 @@ shard_common_inode_write_post_mknod_handler (call_frame_t *frame,
}
int
-shard_common_inode_write_post_lookup_handler (call_frame_t *frame,
- xlator_t *this)
+shard_mkdir_dot_shard (call_frame_t *frame, xlator_t *this,
+ shard_post_resolve_fop_handler_t handler);
+int
+shard_common_inode_write_post_resolve_handler (call_frame_t *frame,
+ xlator_t *this)
{
shard_local_t *local = NULL;
@@ -3969,8 +3986,6 @@ shard_common_inode_write_post_lookup_handler (call_frame_t *frame,
return 0;
}
- local->postbuf = local->prebuf;
-
if (local->call_count) {
shard_common_lookup_shards (frame, this,
local->resolver_base_inode,
@@ -3983,12 +3998,11 @@ shard_common_inode_write_post_lookup_handler (call_frame_t *frame,
}
int
-shard_common_inode_write_post_resolve_handler (call_frame_t *frame,
- xlator_t *this)
+shard_common_inode_write_post_lookup_handler (call_frame_t *frame,
+ xlator_t *this)
{
- shard_local_t *local = NULL;
-
- local = frame->local;
+ shard_local_t *local = frame->local;
+ shard_priv_t *priv = this->private;
if (local->op_ret < 0) {
shard_common_inode_write_failure_unwind (local->fop, frame,
@@ -3997,8 +4011,46 @@ shard_common_inode_write_post_resolve_handler (call_frame_t *frame,
return 0;
}
- shard_lookup_base_file (frame, this, &local->loc,
- shard_common_inode_write_post_lookup_handler);
+ local->postbuf = local->prebuf;
+
+ /*Adjust offset to EOF so that correct shard is chosen for append*/
+ if (shard_is_appending_write (local))
+ local->offset = local->prebuf.ia_size;
+
+ local->first_block = get_lowest_block (local->offset,
+ local->block_size);
+ local->last_block = get_highest_block (local->offset, local->total_size,
+ local->block_size);
+ local->num_blocks = local->last_block - local->first_block + 1;
+ local->inode_list = GF_CALLOC (local->num_blocks, sizeof (inode_t *),
+ gf_shard_mt_inode_list);
+ if (!local->inode_list) {
+ shard_common_inode_write_failure_unwind (local->fop, frame,
+ -1, ENOMEM);
+ return 0;
+ }
+
+ gf_msg_trace (this->name, 0, "%s: gfid=%s first_block=%"PRIu32" "
+ "last_block=%"PRIu32" num_blocks=%"PRIu32" offset=%"PRId64
+ " total_size=%zu flags=%"PRId32"",
+ gf_fop_list[local->fop],
+ uuid_utoa (local->resolver_base_inode->gfid),
+ local->first_block, local->last_block, local->num_blocks,
+ local->offset, local->total_size, local->flags);
+
+ local->dot_shard_loc.inode = inode_find (this->itable,
+ priv->dot_shard_gfid);
+
+ if (!local->dot_shard_loc.inode) {
+ /*change handler*/
+ shard_mkdir_dot_shard (frame, this,
+ shard_common_inode_write_post_resolve_handler);
+ } else {
+ /*change handler*/
+ local->post_res_handler =
+ shard_common_inode_write_post_resolve_handler;
+ shard_refresh_dot_shard (frame, this);
+ }
return 0;
}
@@ -4697,9 +4749,6 @@ shard_common_inode_write_begin (call_frame_t *frame, xlator_t *this,
int i = 0;
uint64_t block_size = 0;
shard_local_t *local = NULL;
- shard_priv_t *priv = NULL;
-
- priv = this->private;
ret = shard_inode_ctx_get_block_size (fd->inode, this, &block_size);
if (ret) {
@@ -4775,37 +4824,13 @@ shard_common_inode_write_begin (call_frame_t *frame, xlator_t *this,
local->iobref = iobref_ref (iobref);
local->fd = fd_ref (fd);
local->block_size = block_size;
- local->first_block = get_lowest_block (offset, local->block_size);
- local->last_block = get_highest_block (offset, local->total_size,
- local->block_size);
- local->num_blocks = local->last_block - local->first_block + 1;
local->resolver_base_inode = local->fd->inode;
- local->inode_list = GF_CALLOC (local->num_blocks, sizeof (inode_t *),
- gf_shard_mt_inode_list);
- if (!local->inode_list)
- goto out;
local->loc.inode = inode_ref (fd->inode);
gf_uuid_copy (local->loc.gfid, fd->inode->gfid);
- gf_msg_trace (this->name, 0, "%s: gfid=%s first_block=%"PRIu32" "
- "last_block=%"PRIu32" num_blocks=%"PRIu32" offset=%"PRId64""
- " total_size=%zu flags=%"PRId32"", gf_fop_list[fop],
- uuid_utoa (fd->inode->gfid), local->first_block,
- local->last_block, local->num_blocks, offset,
- local->total_size, local->flags);
-
- local->dot_shard_loc.inode = inode_find (this->itable,
- priv->dot_shard_gfid);
-
- if (!local->dot_shard_loc.inode) {
- shard_mkdir_dot_shard (frame, this,
- shard_common_inode_write_post_resolve_handler);
- } else {
- local->post_res_handler = shard_common_inode_write_post_resolve_handler;
- shard_refresh_dot_shard (frame, this);
- }
-
+ shard_lookup_base_file (frame, this, &local->loc,
+ shard_common_inode_write_post_lookup_handler);
return 0;
out:
shard_common_inode_write_failure_unwind (fop, frame, -1, ENOMEM);