summary | refs | log | tree | commit | diff | stats
path: root/xlators/storage/bd
diff options
context:
space:
mode:
author: M. Mohan Kumar <mohan@in.ibm.com> 2013-11-15 14:19:11 +0530
committer: Anand Avati <avati@redhat.com> 2013-11-20 14:46:16 -0800
commit: 2bb025699a8b9b34491c8b13a2bbb6da302a5d77 (patch)
tree: bcfca804f97dbbd960c0b74b499926b717e51e07 /xlators/storage/bd
parent: 5e31894fbda74a524e1fe30d26f7ed82a77eb5ff (diff)
bd: Add Zerofill FOP support
BUG: 1028673
Change-Id: I9ba8e3e6cf2f888640b4d2a2eb934a27ff903c42
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Signed-off-by: M. Mohan Kumar <mohan@in.ibm.com>
Reviewed-on: http://review.gluster.org/6290
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Anand Avati <avati@redhat.com>
Diffstat (limited to 'xlators/storage/bd')
-rw-r--r-- xlators/storage/bd/src/bd-helper.c 239
-rw-r--r-- xlators/storage/bd/src/bd.c 34
-rw-r--r-- xlators/storage/bd/src/bd.h 8
3 files changed, 278 insertions, 3 deletions
diff --git a/xlators/storage/bd/src/bd-helper.c b/xlators/storage/bd/src/bd-helper.c
index 5525e346bd7..63e26d8a3a6 100644
--- a/xlators/storage/bd/src/bd-helper.c
+++ b/xlators/storage/bd/src/bd-helper.c
@@ -6,7 +6,8 @@
#ifdef HAVE_LIBAIO
#include <libaio.h>
#endif
-
+#include <linux/fs.h>
+#include <sys/ioctl.h>
#include "bd.h"
#include "run.h"
@@ -781,3 +782,239 @@ out:
return ret;
}
+#ifndef BLKZEROOUT
+
+/*
+ * Zero 'len' bytes of the open descriptor 'fd' starting at 'offset' by
+ * repeatedly writev()-ing a single zero-filled buffer. This is the fallback
+ * compiled when BLKZEROOUT is not defined.
+ *
+ * @fd        descriptor to write to; its file position is moved by lseek()
+ * @offset    byte offset at which zeroing starts
+ * @len       number of bytes to zero; 0 is a successful no-op
+ * @o_direct  non-zero when fd was opened with O_DIRECT, in which case the
+ *            zero buffer is obtained from page_aligned_alloc()
+ *
+ * Returns 0 on success or a positive errno value on failure. The caller
+ * feeds the result to strerror() and uses it as op_errno, so -1 is never
+ * returned.
+ */
+int
+bd_do_manual_zerofill (int fd, off_t offset, off_t len, int o_direct)
+{
+        off_t         remaining = len;
+        off_t         full      = 0;
+        off_t         num_vect  = 0;
+        int           idx       = 0;
+        int           cnt       = 0;
+        int           op_ret    = 0;
+        int           vect_size = IOV_SIZE;
+        ssize_t       written   = 0;
+        struct iovec *vector    = NULL;
+        char         *iov_base  = NULL;
+        char         *alloc_buf = NULL;
+
+        if (len == 0)
+                return 0;
+
+        if (offset < 0 || len < 0)
+                return EINVAL;
+
+        if (len < IOV_SIZE)
+                vect_size = len;
+
+        /* Number of iovec slots needed for one writev(), capped at
+         * MAX_NO_VECT; larger requests are served by several writev()s. */
+        num_vect = len / vect_size + ((len % vect_size) ? 1 : 0);
+        if (num_vect > MAX_NO_VECT)
+                num_vect = MAX_NO_VECT;
+
+        vector = GF_CALLOC (num_vect, sizeof(struct iovec),
+                            gf_common_mt_iovec);
+        if (!vector)
+                return ENOMEM;
+
+        if (o_direct) {
+                alloc_buf = page_aligned_alloc (vect_size, &iov_base);
+                if (!alloc_buf) {
+                        gf_log ("bd_do_manual_zerofill", GF_LOG_DEBUG,
+                                "memory alloc failed, vect_size %d: %s",
+                                vect_size, strerror (errno));
+                        GF_FREE (vector);
+                        return ENOMEM;
+                }
+                /* page_aligned_alloc() is defined elsewhere; zero the
+                 * buffer explicitly rather than depend on how it
+                 * allocates. */
+                memset (iov_base, 0, vect_size);
+        } else {
+                iov_base = GF_CALLOC (vect_size, sizeof(char),
+                                      gf_common_mt_char);
+                if (!iov_base) {
+                        GF_FREE (vector);
+                        return ENOMEM;
+                }
+        }
+
+        /* Every slot points at the same zero buffer. */
+        for (idx = 0; idx < num_vect; idx++) {
+                vector[idx].iov_base = iov_base;
+                vector[idx].iov_len  = vect_size;
+        }
+
+        if (lseek (fd, offset, SEEK_SET) < 0) {
+                op_ret = errno;
+                goto err;
+        }
+
+        /*
+         * Because all slots reference identical data, a short write needs
+         * no iovec adjustment: just recompute how many slots cover what is
+         * still left. 'full' never increases, so a slot whose length was
+         * shortened for the tail is never reused as a full-sized slot.
+         */
+        while (remaining > 0) {
+                full = remaining / vect_size;
+                if (full >= num_vect) {
+                        cnt = num_vect;
+                } else {
+                        cnt = full;
+                        if (remaining % vect_size) {
+                                vector[cnt].iov_len = remaining % vect_size;
+                                cnt++;
+                        }
+                }
+
+                written = writev (fd, vector, cnt);
+                if (written < 0) {
+                        if (errno == EINTR)
+                                continue;
+                        op_ret = errno;
+                        goto err;
+                }
+                if (written == 0) {
+                        /* No progress; bail out instead of spinning. */
+                        op_ret = EIO;
+                        goto err;
+                }
+                remaining -= written;
+        }
+        op_ret = 0;
+err:
+        if (o_direct)
+                GF_FREE (alloc_buf);
+        else
+                GF_FREE (iov_base);
+        GF_FREE (vector);
+        return op_ret;
+}
+
+#else
+
+/*
+ * Issue the Linux BLKZEROOUT ioctl to write zeroes to a block device at the
+ * given offset for the given number of bytes.
+ *
+ * Each SCSI device exports its maximum WRITE SAME size in sysfs; sending an
+ * ioctl request larger than that results in slow performance. The device
+ * mapper node is resolved from /dev/<vg>/<gfid> via readlink(), then
+ *   /sys/block/<block-device>/queue/write_same_max_bytes
+ * is read and the request is split into ioctls no larger than that value.
+ * If the limit cannot be determined (or is not positive), a single ioctl
+ * covering the whole range is issued.
+ *
+ * @priv    unused here; kept for the existing call signature
+ * @bdatt   inode attributes; ia_gfid names the logical volume
+ * @fd      open descriptor of the block device
+ * @vg      volume group name
+ * @offset  start of the range, in bytes
+ * @len     length of the range, in bytes
+ *
+ * Returns 0 on success or a positive errno value on failure.
+ */
+int
+bd_do_ioctl_zerofill (bd_priv_t *priv, bd_attr_t *bdatt, int fd, char *vg,
+                      off_t offset, off_t len)
+{
+        char       *dm           = NULL;
+        char        dmname[4096] = {0, };
+        char        lvname[4096] = {0, };
+        char        sysfs[4096]  = {0, };
+        bd_gfid_t   uuid         = {0, };
+        char       *p            = NULL;
+        off_t       max_bytes    = 0;
+        int         sysfd        = -1;
+        uint64_t    param[2]     = {0, 0};
+        off_t       nr_loop      = 0;
+        char        buff[16]     = {0, };
+        ssize_t     nread        = 0;
+        int         n            = 0;
+
+        if (!bdatt || !vg)
+                return EINVAL;
+
+        /* param[] is unsigned; a negative value would become a huge range */
+        if (offset < 0 || len < 0)
+                return EINVAL;
+
+        uuid_utoa_r (bdatt->iatt.ia_gfid, uuid);
+        n = snprintf (lvname, sizeof (lvname), "/dev/%s/%s", vg, uuid);
+        if (n < 0 || (size_t) n >= sizeof (lvname))
+                goto skip;
+
+        /* readlink() does not NUL-terminate; leave room and do it here */
+        nread = readlink (lvname, dmname, sizeof (dmname) - 1);
+        if (nread < 0) {
+                gf_log ("bd_do_ioctl_zerofill", GF_LOG_DEBUG,
+                        "readlink of %s failed: %s", lvname,
+                        strerror (errno));
+                goto skip;
+        }
+        dmname[nread] = '\0';
+
+        p = strrchr (dmname, '/');
+        if (p)
+                dm = p + 1;
+        else
+                dm = dmname;
+
+        n = snprintf (sysfs, sizeof (sysfs),
+                      "/sys/block/%s/queue/write_same_max_bytes", dm);
+        if (n < 0 || (size_t) n >= sizeof (sysfs))
+                goto skip;
+
+        sysfd = open (sysfs, O_RDONLY);
+        if (sysfd < 0) {
+                gf_log ("bd_do_ioctl_zerofill", GF_LOG_DEBUG,
+                        "sysfs file %s does not exist", sysfs);
+                goto skip;
+        }
+
+        /* Keep the last byte for the terminator that atoll() relies on */
+        nread = read (sysfd, buff, sizeof (buff) - 1);
+        close (sysfd);
+
+        if (nread > 0) {
+                buff[nread] = '\0';
+                max_bytes = atoll (buff);
+        }
+
+skip:
+        /*
+         * If requested len is less than write_same_max_bytes (or the limit
+         * is unknown/invalid), issue single ioctl to zeroout. Otherwise
+         * split the ioctls.
+         */
+        if (max_bytes <= 0 || len <= max_bytes) {
+                param[0] = offset;
+                param[1] = len;
+
+                if (ioctl (fd, BLKZEROOUT, param) < 0)
+                        return errno;
+                return 0;
+        }
+
+        /* Split ioctls to max write_same_max_bytes */
+        nr_loop = len / max_bytes;
+        for (; nr_loop; nr_loop--) {
+                param[0] = offset;
+                param[1] = max_bytes;
+
+                if (ioctl (fd, BLKZEROOUT, param) < 0)
+                        return errno;
+
+                offset += max_bytes;
+        }
+
+        /* Tail that did not fill a whole max_bytes chunk */
+        if (!(len % max_bytes))
+                return 0;
+
+        param[0] = offset;
+        param[1] = len % max_bytes;
+
+        if (ioctl (fd, BLKZEROOUT, param) < 0)
+                return errno;
+
+        return 0;
+}
+#endif
+
+/*
+ * Zero 'len' bytes at 'offset' on the block device behind 'fd' and report
+ * the inode attributes before and after the operation.
+ *
+ * Uses bd_do_manual_zerofill() when BLKZEROOUT is not defined at build time
+ * and bd_do_ioctl_zerofill() otherwise. If the fd was opened with O_SYNC or
+ * O_DSYNC the data is fsync()ed before returning.
+ *
+ * @prebuf   filled with the cached iatt as it was before mtime is updated
+ * @postbuf  filled with the cached iatt after mtime is updated
+ *
+ * Returns 0 on success; non-zero on failure. Failures detected in this
+ * function are reported as a positive errno value; a non-zero result from
+ * the zerofill helper is passed through unchanged.
+ */
+int
+bd_do_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd,
+               off_t offset, off_t len, struct iatt *prebuf,
+               struct iatt *postbuf)
+{
+        int        ret         = EINVAL;
+        int        saved_errno = 0;
+        bd_fd_t   *bd_fd       = NULL;
+        bd_priv_t *priv        = NULL;
+        bd_attr_t *bdatt       = NULL;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (fd, out);
+        VALIDATE_OR_GOTO (prebuf, out);
+        VALIDATE_OR_GOTO (postbuf, out);
+
+        /* Only dereference 'this' once it has been validated. */
+        priv = this->private;
+        VALIDATE_OR_GOTO (priv, out);
+
+        if (bd_fd_ctx_get (this, fd, &bd_fd) < 0) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "bd_fd is NULL from fd=%p", fd);
+                ret = EINVAL;
+                goto out;
+        }
+
+        /* bdatt is dereferenced below; do not continue without it. */
+        if (bd_inode_ctx_get (fd->inode, this, &bdatt) < 0 || !bdatt) {
+                ret = EINVAL;
+                goto out;
+        }
+
+#ifndef BLKZEROOUT
+        ret = bd_do_manual_zerofill(bd_fd->fd, offset, len,
+                                    bd_fd->flag & O_DIRECT);
+#else
+        ret = bd_do_ioctl_zerofill(priv, bdatt, bd_fd->fd, priv->vg, offset,
+                                   len);
+#endif
+        if (ret) {
+                /* Cast so the argument matches %ld wherever off_t is wider
+                 * than long. */
+                gf_log(this->name, GF_LOG_ERROR,
+                       "zerofill failed on fd %d length %ld %s",
+                       bd_fd->fd, (long) len, strerror (ret));
+                goto out;
+        }
+
+        if (bd_fd->flag & (O_SYNC|O_DSYNC)) {
+                if (fsync (bd_fd->fd)) {
+                        /* Capture errno before logging can overwrite it. */
+                        saved_errno = errno;
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "fsync() in writev on fd %d failed: %s",
+                                bd_fd->fd, strerror (saved_errno));
+                        ret = saved_errno;
+                        goto out;
+                }
+        }
+
+        /* Copy into the caller's structures: prebuf/postbuf are already
+         * pointers, so neither '&' nor sizeof of the pointer is wanted. */
+        memcpy (prebuf, &bdatt->iatt, sizeof (*prebuf));
+        bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME);
+        memcpy (postbuf, &bdatt->iatt, sizeof (*postbuf));
+
+        ret = 0;
+out:
+
+        return ret;
+}
+
diff --git a/xlators/storage/bd/src/bd.c b/xlators/storage/bd/src/bd.c
index 1eb5cd15838..17a9a5f159a 100644
--- a/xlators/storage/bd/src/bd.c
+++ b/xlators/storage/bd/src/bd.c
@@ -2195,6 +2195,36 @@ out:
return 0;
}
+/*
+ * zerofill FOP entry point of the bd xlator.
+ *
+ * When no bd attribute is cached on the inode the request is wound to the
+ * first child unchanged (presumably the file is not backed by a block
+ * device -- confirm against bd_inode_ctx_get). Otherwise the range is
+ * zeroed through bd_do_zerofill() and the call is unwound here with the
+ * pre/post attributes.
+ *
+ * Always returns 0; the outcome is delivered through the unwind.
+ */
+static int
+bd_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+            off_t len, dict_t *xdata)
+{
+        int32_t     ret      = 0;
+        int32_t     op_errno = 0;
+        struct iatt statpre  = {0,};
+        struct iatt statpost = {0,};
+        bd_attr_t  *bdatt    = NULL;
+
+        /* No cached iatt for this inode: let the child handle the FOP */
+        if (bd_inode_ctx_get (fd->inode, this, &bdatt) < 0) {
+                STACK_WIND (frame, default_zerofill_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->zerofill,
+                            fd, offset, len, xdata);
+                return 0;
+        }
+
+        ret = bd_do_zerofill(frame, this, fd, offset, len,
+                             &statpre, &statpost);
+        if (ret) {
+                /* op_errno must be a positive errno; the helper may hand
+                 * back a bare negative failure code. */
+                op_errno = (ret > 0) ? ret : EIO;
+                STACK_UNWIND_STRICT(zerofill, frame, -1, op_errno,
+                                    NULL, NULL, NULL);
+                return 0;
+        }
+
+        STACK_UNWIND_STRICT(zerofill, frame, 0, 0, &statpre, &statpost, NULL);
+        return 0;
+}
+
/**
* notify - when parent sends PARENT_UP, send CHILD_UP event from here
*/
@@ -2324,7 +2354,8 @@ init (xlator_t *this)
}
}
- _private->caps |= BD_CAPS_OFFLOAD_COPY | BD_CAPS_OFFLOAD_SNAPSHOT;
+ _private->caps |= BD_CAPS_OFFLOAD_COPY | BD_CAPS_OFFLOAD_SNAPSHOT |
+ BD_CAPS_OFFLOAD_ZERO;
return 0;
error:
@@ -2384,6 +2415,7 @@ struct xlator_fops fops = {
.flush = bd_flush,
.setattr = bd_setattr,
.discard = bd_discard,
+ .zerofill = bd_zerofill,
};
struct xlator_cbks cbks = {
diff --git a/xlators/storage/bd/src/bd.h b/xlators/storage/bd/src/bd.h
index 34b4c9e2226..f59bc6a09ed 100644
--- a/xlators/storage/bd/src/bd.h
+++ b/xlators/storage/bd/src/bd.h
@@ -51,6 +51,7 @@
#define BD_CAPS_THIN 0x02
#define BD_CAPS_OFFLOAD_COPY 0x04
#define BD_CAPS_OFFLOAD_SNAPSHOT 0x08
+#define BD_CAPS_OFFLOAD_ZERO 0x20
#define BD_CLONE "clone"
#define BD_SNAPSHOT "snapshot"
@@ -61,9 +62,11 @@
#define IOV_SIZE (64 * 1024)
#define ALIGN_SIZE 4096
-
#define LINKTO "trusted.glusterfs.dht.linkto"
+#define MAX_NO_VECT 1024
+
+
#define BD_VALIDATE_MEM_ALLOC(buff, op_errno, label) \
if (!buff) { \
op_errno = ENOMEM; \
@@ -174,5 +177,8 @@ int bd_snapshot_create (bd_local_t *local, bd_priv_t *priv);
int bd_clone (bd_local_t *local, bd_priv_t *priv);
int bd_merge (bd_priv_t *priv, uuid_t gfid);
int bd_get_origin (bd_priv_t *priv, loc_t *loc, fd_t *fd, dict_t *dict);
+int bd_do_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ off_t offset, off_t len, struct iatt *prebuf,
+ struct iatt *postbuf);
#endif