summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPranith Kumar K <pkarampu@redhat.com>2012-10-28 07:31:20 +0530
committerVijay Bellur <vbellur@redhat.com>2012-11-20 00:30:25 -0800
commit65cc8cb531456de934e8ba3175430e8fcb304ca5 (patch)
tree0a41494a96bbbf313bfe56b0f5fea993d6d9c59d
parent18851652c9e3d566fd82fac91d67792d7c221f6b (diff)
storage/posix: Make rchecksum O_DIRECT friendly
Problem: When posix-aio is enabled to perform aio fd is set with O_DIRECT whenever possible in read, writev fops. Rchecksum does not take this into account. If either offset/size/memory-buf passed to pread in rchecksum fop is not aligned, pread fails with EINVAL. Fix: Before doing pread necessary O_DIRECT manipulation is done when aio is enabled. Memory buffer passed to pread is now page-aligned. Test: 1) Create replica volume with aio enabled. 2) dd if=/dev/urandom of=a bs=1M count=1 3) kill one of the bricks in the replica pair 4) dd if=/dev/urandom of=a bs=1M count=1 5) bring back the brick. Self-heal succeeds after the change. The test above checks both rchecksum, writev fops that were changed in this patch. Change-Id: I186099a2854d4864c5b48086ab7bc5f1a7b27313 BUG: 866459 Signed-off-by: Pranith Kumar K <pkarampu@redhat.com> Reviewed-on: http://review.gluster.org/4134 Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Vijay Bellur <vbellur@redhat.com>
-rw-r--r--tests/bugs/bug-866459.t44
-rw-r--r--xlators/storage/posix/src/posix-aio.c10
-rw-r--r--xlators/storage/posix/src/posix.c71
-rw-r--r--xlators/storage/posix/src/posix.h4
4 files changed, 106 insertions, 23 deletions
diff --git a/tests/bugs/bug-866459.t b/tests/bugs/bug-866459.t
new file mode 100644
index 00000000000..d66f70c69ca
--- /dev/null
+++ b/tests/bugs/bug-866459.t
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+. $(dirname $0)/../include.rc
+
+cleanup;
+
+
+## Start and create a volume
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume info;
+
+## Create and start a volume with aio enabled
+TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{1,2};
+TEST $CLI volume set $V0 linux-aio on
+TEST $CLI volume set $V0 background-self-heal-count 0
+TEST $CLI volume set $V0 performance.stat-prefetch off;
+TEST $CLI volume start $V0
+
+## Mount FUSE with caching disabled
+TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0;
+
+dd of=$M0/a if=/dev/urandom bs=1M count=1 2>&1 > /dev/null
+B0_hiphenated=`echo $B0 | tr '/' '-'`
+## Bring a brick down
+kill -9 `cat /var/lib/glusterd/vols/$V0/run/$H0$B0_hiphenated-${V0}1.pid`
+EXPECT '1' echo `pgrep glusterfsd | wc -l`
+## Rewrite the file
+dd of=$M0/a if=/dev/urandom bs=1M count=1 2>&1 > /dev/null
+TEST $CLI volume start $V0 force
+## Wait for the brick to give CHILD_UP in client protocol
+sleep 5
+md5offile2=`md5sum $B0/${V0}2/a | awk '{print $1}'`
+
+##trigger self-heal
+ls -l $M0/a
+
+EXPECT "$md5offile2" echo `md5sum $B0/${V0}1/a | awk '{print $1}'`
+
+## Finish up
+TEST $CLI volume stop $V0;
+TEST $CLI volume delete $V0;
+
+cleanup;
diff --git a/xlators/storage/posix/src/posix-aio.c b/xlators/storage/posix/src/posix-aio.c
index 0a776b9bcf0..f807618ce1c 100644
--- a/xlators/storage/posix/src/posix-aio.c
+++ b/xlators/storage/posix/src/posix-aio.c
@@ -560,4 +560,14 @@ posix_aio_off (xlator_t *this)
return 0;
}
+void
+__posix_fd_set_odirect (fd_t *fd, struct posix_fd *pfd, int opflags,
+ off_t offset, size_t size)
+{
+ xlator_t *this = THIS;
+ gf_log (this->name, GF_LOG_INFO,
+ "Linux AIO not availble at build-time."
+ " Continuing with synchronous IO");
+ return;
+}
#endif
diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c
index 86c35f44f8b..812bfda7dfb 100644
--- a/xlators/storage/posix/src/posix.c
+++ b/xlators/storage/posix/src/posix.c
@@ -52,6 +52,7 @@
#include "posix-aio.h"
extern char *marker_xattrs[];
+#define ALIGN_SIZE 4096
#undef HAVE_SET_FSID
#ifdef HAVE_SET_FSID
@@ -2013,6 +2014,21 @@ err:
return op_ret;
}
+char*
+_page_aligned_alloc (size_t size, char **aligned_buf)
+{
+ char *alloc_buf = NULL;
+ char *buf = NULL;
+
+ alloc_buf = GF_CALLOC (1, (size + ALIGN_SIZE), gf_posix_mt_char);
+ if (!alloc_buf)
+ goto out;
+ /* page aligned buffer */
+ buf = GF_ALIGN_BUF (alloc_buf, ALIGN_SIZE);
+ *aligned_buf = buf;
+out:
+ return alloc_buf;
+}
int32_t
__posix_writev (int fd, struct iovec *vector, int count, off_t startoff,
@@ -2020,7 +2036,6 @@ __posix_writev (int fd, struct iovec *vector, int count, off_t startoff,
{
int32_t op_ret = 0;
int idx = 0;
- int align = 4096;
int max_buf_size = 0;
int retval = 0;
char *buf = NULL;
@@ -2036,7 +2051,7 @@ __posix_writev (int fd, struct iovec *vector, int count, off_t startoff,
max_buf_size = vector[idx].iov_len;
}
- alloc_buf = GF_MALLOC (1 * (max_buf_size + align), gf_posix_mt_char);
+ alloc_buf = _page_aligned_alloc (max_buf_size, &buf);
if (!alloc_buf) {
op_ret = -errno;
goto err;
@@ -2044,9 +2059,6 @@ __posix_writev (int fd, struct iovec *vector, int count, off_t startoff,
internal_off = startoff;
for (idx = 0; idx < count; idx++) {
- /* page aligned buffer */
- buf = GF_ALIGN_BUF (alloc_buf, align);
-
memcpy (buf, vector[idx].iov_base, vector[idx].iov_len);
/* not sure whether writev works on O_DIRECT'd fd */
@@ -3875,23 +3887,26 @@ int32_t
posix_rchecksum (call_frame_t *frame, xlator_t *this,
fd_t *fd, off_t offset, int32_t len, dict_t *xdata)
{
- char *buf = NULL;
- int _fd = -1;
- struct posix_fd *pfd = NULL;
- int op_ret = -1;
- int op_errno = 0;
- int ret = 0;
- int32_t weak_checksum = 0;
- unsigned char strong_checksum[MD5_DIGEST_LENGTH];
+ char *alloc_buf = NULL;
+ char *buf = NULL;
+ int _fd = -1;
+ struct posix_fd *pfd = NULL;
+ int op_ret = -1;
+ int op_errno = 0;
+ int ret = 0;
+ int32_t weak_checksum = 0;
+ unsigned char strong_checksum[MD5_DIGEST_LENGTH] = {0};
+ struct posix_private *priv = NULL;
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
VALIDATE_OR_GOTO (fd, out);
+ priv = this->private;
memset (strong_checksum, 0, MD5_DIGEST_LENGTH);
- buf = GF_CALLOC (1, len, gf_posix_mt_char);
- if (!buf) {
+ alloc_buf = _page_aligned_alloc (len, &buf);
+ if (!alloc_buf) {
op_errno = ENOMEM;
goto out;
}
@@ -3906,15 +3921,25 @@ posix_rchecksum (call_frame_t *frame, xlator_t *this,
_fd = pfd->fd;
- ret = pread (_fd, buf, len, offset);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "pread of %d bytes returned %d (%s)",
- len, ret, strerror (errno));
+ LOCK (&fd->lock);
+ {
+ if (priv->aio_capable && priv->aio_init_done)
+ __posix_fd_set_odirect (fd, pfd, 0, offset, len);
+
+ ret = pread (_fd, buf, len, offset);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "pread of %d bytes returned %d (%s)",
+ len, ret, strerror (errno));
+
+ op_errno = errno;
+ }
- op_errno = errno;
- goto out;
}
+ UNLOCK (&fd->lock);
+
+ if (ret < 0)
+ goto out;
weak_checksum = gf_rsync_weak_checksum ((unsigned char *) buf, (size_t) len);
gf_rsync_strong_checksum ((unsigned char *) buf, (size_t) len, (unsigned char *) strong_checksum);
@@ -3924,7 +3949,7 @@ out:
STACK_UNWIND_STRICT (rchecksum, frame, op_ret, op_errno,
weak_checksum, strong_checksum, NULL);
- GF_FREE (buf);
+ GF_FREE (alloc_buf);
return 0;
}
diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h
index 6f96e49d18f..45ee3596330 100644
--- a/xlators/storage/posix/src/posix.h
+++ b/xlators/storage/posix/src/posix.h
@@ -159,4 +159,8 @@ int posix_fd_ctx_get (fd_t *fd, xlator_t *this, struct posix_fd **pfd);
void posix_fill_ino_from_gfid (xlator_t *this, struct iatt *buf);
gf_boolean_t posix_special_xattr (char **pattern, char *key);
+
+void
+__posix_fd_set_odirect (fd_t *fd, struct posix_fd *pfd, int opflags,
+ off_t offset, size_t size);
#endif /* _POSIX_H */