summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--extras/group-gluster-block1
-rw-r--r--tests/bugs/shard/shard-append-test.c179
-rw-r--r--tests/bugs/shard/shard-append-test.t32
-rw-r--r--xlators/features/shard/src/shard.c109
4 files changed, 278 insertions, 43 deletions
diff --git a/extras/group-gluster-block b/extras/group-gluster-block
index 0753d26b3ca..a4a6367920b 100644
--- a/extras/group-gluster-block
+++ b/extras/group-gluster-block
@@ -2,7 +2,6 @@ performance.quick-read=off
performance.read-ahead=off
performance.io-cache=off
performance.stat-prefetch=off
-performance.write-behind=off
performance.open-behind=off
performance.readdir-ahead=off
network.remote-dio=enable
diff --git a/tests/bugs/shard/shard-append-test.c b/tests/bugs/shard/shard-append-test.c
new file mode 100644
index 00000000000..92dff3d078d
--- /dev/null
+++ b/tests/bugs/shard/shard-append-test.c
@@ -0,0 +1,179 @@
+#include <fcntl.h>
+#include <unistd.h>
+#include <time.h>
+#include <limits.h>
+#include <string.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <glusterfs/api/glfs.h>
+#include <glusterfs/api/glfs-handles.h>
+
+#define LOG_ERR(msg) do { \
+ fprintf (stderr, "%s : Error (%s)\n", msg, strerror (errno)); \
+ } while (0)
+
+/*This test tests that shard xlator handles offset in appending writes
+ * correctly. This test performs writes of 1025 bytes 1025 times, in 5 threads
+ * with different threads. The buffer to be written is same character repeated
+ * 1025 times in the buffer for a thread. At the end it reads the buffer till
+ * end of file and tests that the read of 1025 bytes is always same character
+ * and the content read is 5*1025*1025 size. 1025 bytes is chosen because it
+ * will lead to write on more than one shard at some point when the size is
+ * going over the initial shard*/
+pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+int thread_data = '1';
+
+glfs_t *
+init_glfs (const char *hostname, const char *volname,
+ const char *logfile)
+{
+ int ret = -1;
+ glfs_t *fs = NULL;
+
+ fs = glfs_new (volname);
+ if (!fs) {
+ LOG_ERR ("glfs_new failed");
+ return NULL;
+ }
+
+ ret = glfs_set_volfile_server (fs, "tcp", hostname, 24007);
+ if (ret < 0) {
+ LOG_ERR ("glfs_set_volfile_server failed");
+ goto out;
+ }
+
+ ret = glfs_set_logging (fs, logfile, 7);
+ if (ret < 0) {
+ LOG_ERR ("glfs_set_logging failed");
+ goto out;
+ }
+
+ ret = glfs_init (fs);
+ if (ret < 0) {
+ LOG_ERR ("glfs_init failed");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ if (ret) {
+ glfs_fini (fs);
+ fs = NULL;
+ }
+
+ return fs;
+}
+
+void*
+write_data (void *data)
+{
+ char buf[1025] = {0};
+ glfs_fd_t *glfd = NULL;
+ glfs_t *fs = data;
+ int i = 0;
+
+ pthread_mutex_lock (&lock);
+ {
+ memset(buf, thread_data, sizeof(buf));
+ thread_data++;
+ }
+ pthread_mutex_unlock (&lock);
+
+ for (i = 0; i < 1025; i++) {
+ glfd = glfs_creat(fs, "parallel-write.txt", O_WRONLY | O_APPEND,
+ S_IRUSR | S_IWUSR | O_SYNC);
+ if (!glfd) {
+ LOG_ERR ("Failed to create file");
+ exit(1);
+ }
+
+ if (glfs_write (glfd, buf, sizeof(buf), 0) < 0) {
+ LOG_ERR ("Failed to write to file");
+ exit(1);
+ }
+ if (glfs_close(glfd) != 0) {
+ LOG_ERR ("Failed to close file");
+ exit(1);
+ }
+ }
+ return NULL;
+}
+
+int
+main (int argc, char *argv[])
+{
+ pthread_t tid[5] = {0};
+ char buf[1025] = {0};
+ char cmp_buf[1025] = {0};
+ int ret = 0;
+ char *hostname = NULL;
+ char *volname = NULL;
+ char *logfile = NULL;
+ glfs_t *fs = NULL;
+ glfs_fd_t *glfd = NULL;
+ ssize_t bytes_read = 0;
+ ssize_t total_bytes_read = 0;
+ int i = 0;
+
+ if (argc != 4) {
+ fprintf (stderr, "Invalid argument\n");
+ exit(1);
+ }
+
+ hostname = argv[1];
+ volname = argv[2];
+ logfile = argv[3];
+
+ fs = init_glfs (hostname, volname, logfile);
+ if (fs == NULL) {
+ LOG_ERR ("init_glfs failed");
+ return -1;
+ }
+
+ for (i = 0; i < 5; i++) {
+ pthread_create(&tid[i], NULL, write_data, fs);
+ }
+
+ for (i = 0; i < 5; i++) {
+ pthread_join(tid[i], NULL);
+ }
+ glfd = glfs_open(fs, "parallel-write.txt", O_RDONLY);
+ if (!glfd) {
+ LOG_ERR ("Failed to open file for reading");
+ exit(1);
+ }
+
+ while ((bytes_read = glfs_read (glfd, buf, sizeof(buf), 0)) > 0) {
+ if (bytes_read != sizeof(buf)) {
+ fprintf (stderr, "Didn't read complete data read: %zd "
+ "expected: %lu", bytes_read, sizeof(buf));
+ exit(1);
+ }
+
+ total_bytes_read += bytes_read;
+ if (buf[0] < '1' || buf[0] >= thread_data) {
+ fprintf(stderr, "Invalid character found: %c", buf[0]);
+ exit(1);
+ }
+ memset(cmp_buf, buf[0], sizeof(cmp_buf));
+ if (memcmp(cmp_buf, buf, sizeof(cmp_buf))) {
+ LOG_ERR ("Data corrupted");
+ exit(1);
+ }
+ memset(cmp_buf, 0, sizeof(cmp_buf));
+ }
+
+ if (total_bytes_read != 5*1025*1025) {
+ fprintf(stderr, "Failed to read what is written, read; %zd, "
+ "expected %zu", total_bytes_read, 5*1025*1025);
+ exit(1);
+ }
+
+ if (glfs_close(glfd) != 0) {
+ LOG_ERR ("Failed to close");
+ exit(1);
+ }
+ return 0;
+}
diff --git a/tests/bugs/shard/shard-append-test.t b/tests/bugs/shard/shard-append-test.t
new file mode 100644
index 00000000000..f8719f2a2c1
--- /dev/null
+++ b/tests/bugs/shard/shard-append-test.t
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+cleanup;
+
+TEST glusterd
+
+TEST $CLI volume create $V0 replica 3 ${H0}:$B0/brick{1,2,3};
+TEST $CLI volume set $V0 features.shard on
+TEST $CLI volume set $V0 features.shard-block-size 4MB
+TEST $CLI volume set $V0 performance.quick-read off
+TEST $CLI volume set $V0 performance.io-cache off
+
+#Uncomment the following line after shard-queuing is implemented
+#TEST $CLI volume set $V0 performance.write-behind off
+
+TEST $CLI volume set $V0 performance.strict-o-direct on
+TEST $CLI volume set $V0 performance.stat-prefetch off
+TEST $CLI volume set $V0 performance.read-ahead off
+TEST $CLI volume start $V0;
+
+logdir=`gluster --print-logdir`
+
+TEST build_tester $(dirname $0)/shard-append-test.c -lgfapi -lpthread
+
+TEST ./$(dirname $0)/shard-append-test ${H0} $V0 $logdir/shard-append-test.log
+
+cleanup_tester $(dirname $0)/shard-append-test
+
+cleanup;
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
index eaeb840e86d..d7526339591 100644
--- a/xlators/features/shard/src/shard.c
+++ b/xlators/features/shard/src/shard.c
@@ -3631,6 +3631,18 @@ shard_common_inode_write_post_update_size_handler (call_frame_t *frame,
return 0;
}
+static gf_boolean_t
+shard_is_appending_write (shard_local_t *local)
+{
+ if (local->fop != GF_FOP_WRITE)
+ return _gf_false;
+ if (local->flags & O_APPEND)
+ return _gf_true;
+ if (local->fd->flags & O_APPEND)
+ return _gf_true;
+ return _gf_false;
+}
+
int
__shard_get_delta_size_from_inode_ctx (shard_local_t *local, inode_t *inode,
xlator_t *this)
@@ -3645,13 +3657,15 @@ __shard_get_delta_size_from_inode_ctx (shard_local_t *local, inode_t *inode,
ctx = (shard_inode_ctx_t *) ctx_uint;
- if (local->offset + local->total_size > ctx->stat.ia_size) {
+ if (shard_is_appending_write (local)) {
+ local->delta_size = local->total_size;
+ } else if (local->offset + local->total_size > ctx->stat.ia_size) {
local->delta_size = (local->offset + local->total_size) -
ctx->stat.ia_size;
- ctx->stat.ia_size += (local->delta_size);
} else {
local->delta_size = 0;
}
+ ctx->stat.ia_size += (local->delta_size);
local->postbuf = ctx->stat;
return 0;
@@ -3957,8 +3971,11 @@ shard_common_inode_write_post_mknod_handler (call_frame_t *frame,
}
int
-shard_common_inode_write_post_lookup_handler (call_frame_t *frame,
- xlator_t *this)
+shard_mkdir_dot_shard (call_frame_t *frame, xlator_t *this,
+ shard_post_resolve_fop_handler_t handler);
+int
+shard_common_inode_write_post_resolve_handler (call_frame_t *frame,
+ xlator_t *this)
{
shard_local_t *local = NULL;
@@ -3971,8 +3988,6 @@ shard_common_inode_write_post_lookup_handler (call_frame_t *frame,
return 0;
}
- local->postbuf = local->prebuf;
-
if (local->call_count) {
shard_common_lookup_shards (frame, this,
local->resolver_base_inode,
@@ -3985,12 +4000,11 @@ shard_common_inode_write_post_lookup_handler (call_frame_t *frame,
}
int
-shard_common_inode_write_post_resolve_handler (call_frame_t *frame,
- xlator_t *this)
+shard_common_inode_write_post_lookup_handler (call_frame_t *frame,
+ xlator_t *this)
{
- shard_local_t *local = NULL;
-
- local = frame->local;
+ shard_local_t *local = frame->local;
+ shard_priv_t *priv = this->private;
if (local->op_ret < 0) {
shard_common_inode_write_failure_unwind (local->fop, frame,
@@ -3999,8 +4013,46 @@ shard_common_inode_write_post_resolve_handler (call_frame_t *frame,
return 0;
}
- shard_lookup_base_file (frame, this, &local->loc,
- shard_common_inode_write_post_lookup_handler);
+ local->postbuf = local->prebuf;
+
+ /*Adjust offset to EOF so that correct shard is chosen for append*/
+ if (shard_is_appending_write (local))
+ local->offset = local->prebuf.ia_size;
+
+ local->first_block = get_lowest_block (local->offset,
+ local->block_size);
+ local->last_block = get_highest_block (local->offset, local->total_size,
+ local->block_size);
+ local->num_blocks = local->last_block - local->first_block + 1;
+ local->inode_list = GF_CALLOC (local->num_blocks, sizeof (inode_t *),
+ gf_shard_mt_inode_list);
+ if (!local->inode_list) {
+ shard_common_inode_write_failure_unwind (local->fop, frame,
+ -1, ENOMEM);
+ return 0;
+ }
+
+ gf_msg_trace (this->name, 0, "%s: gfid=%s first_block=%"PRIu32" "
+ "last_block=%"PRIu32" num_blocks=%"PRIu32" offset=%"PRId64
+ " total_size=%zu flags=%"PRId32"",
+ gf_fop_list[local->fop],
+ uuid_utoa (local->resolver_base_inode->gfid),
+ local->first_block, local->last_block, local->num_blocks,
+ local->offset, local->total_size, local->flags);
+
+ local->dot_shard_loc.inode = inode_find (this->itable,
+ priv->dot_shard_gfid);
+
+ if (!local->dot_shard_loc.inode) {
+ /*change handler*/
+ shard_mkdir_dot_shard (frame, this,
+ shard_common_inode_write_post_resolve_handler);
+ } else {
+ /*change handler*/
+ local->post_res_handler =
+ shard_common_inode_write_post_resolve_handler;
+ shard_refresh_dot_shard (frame, this);
+ }
return 0;
}
@@ -4699,9 +4751,6 @@ shard_common_inode_write_begin (call_frame_t *frame, xlator_t *this,
int i = 0;
uint64_t block_size = 0;
shard_local_t *local = NULL;
- shard_priv_t *priv = NULL;
-
- priv = this->private;
ret = shard_inode_ctx_get_block_size (fd->inode, this, &block_size);
if (ret) {
@@ -4777,37 +4826,13 @@ shard_common_inode_write_begin (call_frame_t *frame, xlator_t *this,
local->iobref = iobref_ref (iobref);
local->fd = fd_ref (fd);
local->block_size = block_size;
- local->first_block = get_lowest_block (offset, local->block_size);
- local->last_block = get_highest_block (offset, local->total_size,
- local->block_size);
- local->num_blocks = local->last_block - local->first_block + 1;
local->resolver_base_inode = local->fd->inode;
- local->inode_list = GF_CALLOC (local->num_blocks, sizeof (inode_t *),
- gf_shard_mt_inode_list);
- if (!local->inode_list)
- goto out;
local->loc.inode = inode_ref (fd->inode);
gf_uuid_copy (local->loc.gfid, fd->inode->gfid);
- gf_msg_trace (this->name, 0, "%s: gfid=%s first_block=%"PRIu32" "
- "last_block=%"PRIu32" num_blocks=%"PRIu32" offset=%"PRId64""
- " total_size=%zu flags=%"PRId32"", gf_fop_list[fop],
- uuid_utoa (fd->inode->gfid), local->first_block,
- local->last_block, local->num_blocks, offset,
- local->total_size, local->flags);
-
- local->dot_shard_loc.inode = inode_find (this->itable,
- priv->dot_shard_gfid);
-
- if (!local->dot_shard_loc.inode) {
- shard_mkdir_dot_shard (frame, this,
- shard_common_inode_write_post_resolve_handler);
- } else {
- local->post_res_handler = shard_common_inode_write_post_resolve_handler;
- shard_refresh_dot_shard (frame, this);
- }
-
+ shard_lookup_base_file (frame, this, &local->loc,
+ shard_common_inode_write_post_lookup_handler);
return 0;
out:
shard_common_inode_write_failure_unwind (fop, frame, -1, ENOMEM);