4 files changed, 278 insertions, 43 deletions
diff --git a/extras/group-gluster-block b/extras/group-gluster-block
index 0753d26b3ca..a4a6367920b 100644
--- a/extras/group-gluster-block
+++ b/extras/group-gluster-block
@@ -2,7 +2,6 @@ performance.quick-read=off
 performance.read-ahead=off
 performance.io-cache=off
 performance.stat-prefetch=off
-performance.write-behind=off
 performance.open-behind=off
 performance.readdir-ahead=off
 network.remote-dio=enable
diff --git a/tests/bugs/shard/shard-append-test.c b/tests/bugs/shard/shard-append-test.c
new file mode 100644
index 00000000000..92dff3d078d
--- /dev/null
+++ b/tests/bugs/shard/shard-append-test.c
@@ -0,0 +1,179 @@
+#include <fcntl.h>
+#include <unistd.h>
+#include <time.h>
+#include <limits.h>
+#include <string.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <glusterfs/api/glfs.h>
+#include <glusterfs/api/glfs-handles.h>
+
+#define LOG_ERR(msg) do { \
+        fprintf (stderr, "%s : Error (%s)\n", msg, strerror (errno)); \
+        } while (0)
+
+/*This test tests that shard xlator handles offset in appending writes
+ * correctly. This test performs writes of 1025 bytes 1025 times, in 5 threads
+ * with different threads. The buffer to be written is same character repeated
+ * 1025 times in the buffer for a thread. At the end it reads the buffer till
+ * end of file and tests that the read of 1025 bytes is always same character
+ * and the content read is 5*1025*1025 size. 1025 bytes is chosen because it
+ * will lead to write on more than one shard at some point when the size is
+ * going over the initial shard*/
+pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+int thread_data = '1';
+
+glfs_t *
+init_glfs (const char *hostname, const char *volname,
+           const char *logfile)
+{
+        int     ret     = -1;
+        glfs_t *fs      = NULL;
+
+        fs = glfs_new (volname);
+        if (!fs) {
+                LOG_ERR ("glfs_new failed");
+                return NULL;
+        }
+
+        ret = glfs_set_volfile_server (fs, "tcp", hostname, 24007);
+        if (ret < 0) {
+                LOG_ERR ("glfs_set_volfile_server failed");
+                goto out;
+        }
+
+        ret = glfs_set_logging (fs, logfile, 7);
+        if (ret < 0) {
+                LOG_ERR ("glfs_set_logging failed");
+                goto out;
+        }
+
+        ret = glfs_init (fs);
+        if (ret < 0) {
+                LOG_ERR ("glfs_init failed");
+                goto out;
+        }
+
+        ret = 0;
+out:
+        if (ret) {
+                glfs_fini (fs);
+                fs = NULL;
+        }
+
+        return fs;
+}
+
+void*
+write_data (void *data)
+{
+        char           buf[1025] = {0};
+        glfs_fd_t      *glfd = NULL;
+        glfs_t         *fs       = data;
+        int            i     = 0;
+
+        pthread_mutex_lock (&lock);
+        {
+                memset(buf, thread_data, sizeof(buf));
+                thread_data++;
+        }
+        pthread_mutex_unlock (&lock);
+
+        for (i = 0; i < 1025; i++) {
+                glfd = glfs_creat(fs, "parallel-write.txt", O_WRONLY | O_APPEND,
+                                   S_IRUSR | S_IWUSR | O_SYNC);
+                if (!glfd) {
+                        LOG_ERR ("Failed to create file");
+                        exit(1);
+                }
+
+                if (glfs_write (glfd, buf, sizeof(buf), 0) < 0) {
+                        LOG_ERR ("Failed to write to file");
+                        exit(1);
+                }
+                if (glfs_close(glfd) != 0) {
+                        LOG_ERR ("Failed to close file");
+                        exit(1);
+                }
+        }
+        return NULL;
+}
+
+int
+main (int argc, char *argv[])
+{
+        pthread_t  tid[5] = {0};
+        char       buf[1025] = {0};
+        char       cmp_buf[1025] = {0};
+        int         ret      = 0;
+        char       *hostname = NULL;
+        char       *volname  = NULL;
+        char       *logfile  = NULL;
+        glfs_t     *fs       = NULL;
+        glfs_fd_t  *glfd     = NULL;
+        ssize_t     bytes_read = 0;
+        ssize_t     total_bytes_read = 0;
+        int i = 0;
+
+        if (argc != 4) {
+                fprintf (stderr, "Invalid argument\n");
+                exit(1);
+        }
+
+        hostname = argv[1];
+        volname = argv[2];
+        logfile = argv[3];
+
+        fs = init_glfs (hostname, volname, logfile);
+        if (fs == NULL) {
+                LOG_ERR ("init_glfs failed");
+                return -1;
+        }
+
+        for (i = 0; i < 5; i++) {
+                pthread_create(&tid[i], NULL, write_data, fs);
+        }
+
+        for (i = 0; i < 5; i++) {
+                pthread_join(tid[i], NULL);
+        }
+        glfd = glfs_open(fs, "parallel-write.txt", O_RDONLY);
+        if (!glfd) {
+                LOG_ERR ("Failed to open file for reading");
+                exit(1);
+        }
+
+        while ((bytes_read = glfs_read (glfd, buf, sizeof(buf), 0)) > 0) {
+                if (bytes_read != sizeof(buf)) {
+                        fprintf (stderr, "Didn't read complete data read: %zd "
+                                 "expected: %lu", bytes_read, sizeof(buf));
+                        exit(1);
+                }
+
+                total_bytes_read += bytes_read;
+                if (buf[0] < '1' || buf[0] >= thread_data) {
+                        fprintf(stderr, "Invalid character found: %c", buf[0]);
+                        exit(1);
+                }
+                memset(cmp_buf, buf[0], sizeof(cmp_buf));
+                if (memcmp(cmp_buf, buf, sizeof(cmp_buf))) {
+                        LOG_ERR ("Data corrupted");
+                        exit(1);
+                }
+                memset(cmp_buf, 0, sizeof(cmp_buf));
+        }
+
+        if (total_bytes_read != 5*1025*1025) {
+                fprintf(stderr, "Failed to read what is written, read; %zd, "
+                        "expected %zu", total_bytes_read, 5*1025*1025);
+                exit(1);
+        }
+
+        if (glfs_close(glfd) != 0) {
+                LOG_ERR ("Failed to close");
+                exit(1);
+        }
+        return 0;
+}
diff --git a/tests/bugs/shard/shard-append-test.t b/tests/bugs/shard/shard-append-test.t
new file mode 100644
index 00000000000..f8719f2a2c1
--- /dev/null
+++ b/tests/bugs/shard/shard-append-test.t
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+cleanup;
+
+TEST glusterd
+
+TEST $CLI volume create $V0 replica 3 ${H0}:$B0/brick{1,2,3};
+TEST $CLI volume set $V0 features.shard on
+TEST $CLI volume set $V0 features.shard-block-size 4MB
+TEST $CLI volume set $V0 performance.quick-read off
+TEST $CLI volume set $V0 performance.io-cache off
+
+#Uncomment the following line after shard-queuing is implemented
+#TEST $CLI volume set $V0 performance.write-behind off
+
+TEST $CLI volume set $V0 performance.strict-o-direct on
+TEST $CLI volume set $V0 performance.stat-prefetch off
+TEST $CLI volume set $V0 performance.read-ahead off
+TEST $CLI volume start $V0;
+
+logdir=`gluster --print-logdir`
+
+TEST build_tester $(dirname $0)/shard-append-test.c -lgfapi -lpthread
+
+TEST ./$(dirname $0)/shard-append-test ${H0} $V0 $logdir/shard-append-test.log
+
+cleanup_tester $(dirname $0)/shard-append-test
+
+cleanup;
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
index eaeb840e86d..d7526339591 100644
--- a/xlators/features/shard/src/shard.c
+++ b/xlators/features/shard/src/shard.c
@@ -3631,6 +3631,18 @@ shard_common_inode_write_post_update_size_handler (call_frame_t *frame,
         return 0;
 }
 
+static gf_boolean_t
+shard_is_appending_write (shard_local_t *local)
+{
+        if (local->fop != GF_FOP_WRITE)
+                return _gf_false;
+        if (local->flags & O_APPEND)
+                return _gf_true;
+        if (local->fd->flags & O_APPEND)
+                return _gf_true;
+        return _gf_false;
+}
+
 int
 __shard_get_delta_size_from_inode_ctx (shard_local_t *local, inode_t *inode,
                                        xlator_t *this)
@@ -3645,13 +3657,15 @@ __shard_get_delta_size_from_inode_ctx (shard_local_t *local, inode_t *inode,
 
         ctx = (shard_inode_ctx_t *) ctx_uint;
 
-        if (local->offset + local->total_size > ctx->stat.ia_size) {
+        if (shard_is_appending_write (local)) {
+                local->delta_size = local->total_size;
+        } else if (local->offset + local->total_size > ctx->stat.ia_size) {
                 local->delta_size = (local->offset + local->total_size) -
                                     ctx->stat.ia_size;
-                ctx->stat.ia_size += (local->delta_size);
         } else {
                 local->delta_size = 0;
         }
+        ctx->stat.ia_size += (local->delta_size);
         local->postbuf = ctx->stat;
 
         return 0;
@@ -3957,8 +3971,11 @@ shard_common_inode_write_post_mknod_handler (call_frame_t *frame,
 }
 
 int
-shard_common_inode_write_post_lookup_handler (call_frame_t *frame,
-                                              xlator_t *this)
+shard_mkdir_dot_shard (call_frame_t *frame, xlator_t *this,
+                       shard_post_resolve_fop_handler_t handler);
+int
+shard_common_inode_write_post_resolve_handler (call_frame_t *frame,
+                                               xlator_t *this)
 {
         shard_local_t *local = NULL;
 
@@ -3971,8 +3988,6 @@ shard_common_inode_write_post_lookup_handler (call_frame_t *frame,
                 return 0;
         }
 
-        local->postbuf = local->prebuf;
-
         if (local->call_count) {
                 shard_common_lookup_shards (frame, this,
                                             local->resolver_base_inode,
@@ -3985,12 +4000,11 @@ shard_common_inode_write_post_lookup_handler (call_frame_t *frame,
 }
 
 int
-shard_common_inode_write_post_resolve_handler (call_frame_t *frame,
-                                               xlator_t *this)
+shard_common_inode_write_post_lookup_handler (call_frame_t *frame,
+                                              xlator_t *this)
 {
-        shard_local_t *local = NULL;
-
-        local = frame->local;
+        shard_local_t *local = frame->local;
+        shard_priv_t  *priv  = this->private;
 
         if (local->op_ret < 0) {
                 shard_common_inode_write_failure_unwind (local->fop, frame,
@@ -3999,8 +4013,46 @@ shard_common_inode_write_post_resolve_handler (call_frame_t *frame,
                 return 0;
         }
 
-        shard_lookup_base_file (frame, this, &local->loc,
-                                shard_common_inode_write_post_lookup_handler);
+        local->postbuf = local->prebuf;
+
+        /*Adjust offset to EOF so that correct shard is chosen for append*/
+        if (shard_is_appending_write (local))
+                local->offset = local->prebuf.ia_size;
+
+        local->first_block = get_lowest_block (local->offset,
+                                               local->block_size);
+        local->last_block = get_highest_block (local->offset, local->total_size,
+                                               local->block_size);
+        local->num_blocks = local->last_block - local->first_block + 1;
+        local->inode_list = GF_CALLOC (local->num_blocks, sizeof (inode_t *),
+                                       gf_shard_mt_inode_list);
+        if (!local->inode_list) {
+                shard_common_inode_write_failure_unwind (local->fop, frame,
+                                                         -1, ENOMEM);
+                return 0;
+        }
+
+        gf_msg_trace (this->name, 0, "%s: gfid=%s first_block=%"PRIu32" "
+                      "last_block=%"PRIu32" num_blocks=%"PRIu32" offset=%"PRId64
+                      " total_size=%zu flags=%"PRId32"",
+                      gf_fop_list[local->fop],
+                      uuid_utoa (local->resolver_base_inode->gfid),
+                      local->first_block, local->last_block, local->num_blocks,
+                      local->offset, local->total_size, local->flags);
+
+        local->dot_shard_loc.inode = inode_find (this->itable,
+                                                 priv->dot_shard_gfid);
+
+        if (!local->dot_shard_loc.inode) {
+                /*change handler*/
+                shard_mkdir_dot_shard (frame, this,
+                                 shard_common_inode_write_post_resolve_handler);
+        } else {
+                /*change handler*/
+                local->post_res_handler =
+                                shard_common_inode_write_post_resolve_handler;
+                shard_refresh_dot_shard (frame, this);
+        }
         return 0;
 }
 
@@ -4699,9 +4751,6 @@ shard_common_inode_write_begin (call_frame_t *frame, xlator_t *this,
         int             i              = 0;
         uint64_t        block_size     = 0;
         shard_local_t  *local          = NULL;
-        shard_priv_t   *priv           = NULL;
-
-        priv = this->private;
 
         ret = shard_inode_ctx_get_block_size (fd->inode, this, &block_size);
         if (ret) {
@@ -4777,37 +4826,13 @@ shard_common_inode_write_begin (call_frame_t *frame, xlator_t *this,
                 local->iobref = iobref_ref (iobref);
         local->fd = fd_ref (fd);
         local->block_size = block_size;
-        local->first_block = get_lowest_block (offset, local->block_size);
-        local->last_block = get_highest_block (offset, local->total_size,
-                                               local->block_size);
-        local->num_blocks = local->last_block - local->first_block + 1;
         local->resolver_base_inode = local->fd->inode;
-        local->inode_list = GF_CALLOC (local->num_blocks, sizeof (inode_t *),
-                                       gf_shard_mt_inode_list);
-        if (!local->inode_list)
-                goto out;
 
         local->loc.inode = inode_ref (fd->inode);
         gf_uuid_copy (local->loc.gfid, fd->inode->gfid);
 
-        gf_msg_trace (this->name, 0, "%s: gfid=%s first_block=%"PRIu32" "
-                      "last_block=%"PRIu32" num_blocks=%"PRIu32" offset=%"PRId64""
-                      " total_size=%zu flags=%"PRId32"", gf_fop_list[fop],
-                      uuid_utoa (fd->inode->gfid), local->first_block,
-                      local->last_block, local->num_blocks, offset,
-                      local->total_size, local->flags);
-
-        local->dot_shard_loc.inode = inode_find (this->itable,
-                                                 priv->dot_shard_gfid);
-
-        if (!local->dot_shard_loc.inode) {
-                shard_mkdir_dot_shard (frame, this,
-                                 shard_common_inode_write_post_resolve_handler);
-        } else {
-                local->post_res_handler = shard_common_inode_write_post_resolve_handler;
-                shard_refresh_dot_shard (frame, this);
-        }
-
+        shard_lookup_base_file (frame, this, &local->loc,
+                                shard_common_inode_write_post_lookup_handler);
         return 0;
 out:
         shard_common_inode_write_failure_unwind (fop, frame, -1, ENOMEM);