From 1e1a162fa7ea5b6275a0212273ca96b4de410c00 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 7 May 2012 13:53:31 -0400 Subject: cluster/stripe: implement the coalesce stripe file format The coalesce file format for cluster/stripe condenses the striped files to a contiguous layout. The elimination of holes in striped files eliminates space wasted via local filesystem preallocation heuristics and significantly improves read performance. Coalesce mode is implemented with a new 'coalesce' xlator option, which is user-configurable and disabled by default. The format of newly created files is marked with a new 'stripe-coalesce' xattr. Cluster/stripe handles/preserves the format of files regardless of the current mode of operation (i.e., a volume can simultaneously consist of coalesced and non-coalesced files). Files without the stripe-coalesce attribute are assumed to have the traditional format to provide backward compatibility. extras/stripe-merge: support traditional and coalesce stripe formats Update the stripe-merge recovery tool to handle the traditional and coalesced file formats. The format of the file is detected automatically (and verified) via the stripe-coalesce attributes. BUG: 801887 Change-Id: I682f0b4e819f496ddb68c9a01c4de4688280fdf8 Signed-off-by: Brian Foster Reviewed-on: http://review.gluster.com/3639 Tested-by: Gluster Build System Reviewed-by: Amar Tumballi --- xlators/cluster/stripe/src/stripe-helpers.c | 92 +++++++- xlators/cluster/stripe/src/stripe.c | 339 ++++++++++++++++++++++------ xlators/cluster/stripe/src/stripe.h | 54 ++++- xlators/mgmt/glusterd/src/glusterd-volgen.c | 1 + xlators/storage/posix/src/posix.c | 2 +- 5 files changed, 409 insertions(+), 79 deletions(-) (limited to 'xlators') diff --git a/xlators/cluster/stripe/src/stripe-helpers.c b/xlators/cluster/stripe/src/stripe-helpers.c index a2ebc1201f7..3e689413bc1 100644 --- a/xlators/cluster/stripe/src/stripe-helpers.c +++ b/xlators/cluster/stripe/src/stripe-helpers.c @@ -236,8 +236,6 @@ out: return block_size; } - - int32_t stripe_ctx_handle (xlator_t *this, call_frame_t *prev, stripe_local_t *local, dict_t *dict) @@ -246,7 +244,6 @@ stripe_ctx_handle (xlator_t *this, call_frame_t *prev, stripe_local_t *local, data_t *data = NULL; int32_t index = 0; stripe_private_t *priv = NULL; - int32_t ret = -1; priv = this->private; @@ -343,14 +340,31 @@ stripe_ctx_handle (xlator_t *this, call_frame_t *prev, stripe_local_t *local, if (!local->fctx->xl_array[index]) local->fctx->xl_array[index] = prev->this; } - ret = 0; + + sprintf(key, "trusted.%s.stripe-coalesce", this->name); + data = dict_get(dict, key); + if (!data) { + /* + * The file was probably created prior to coalesce support. + * Assume non-coalesce mode for this file to maintain backwards + * compatibility. + */ + gf_log(this->name, GF_LOG_DEBUG, "missing stripe-coalesce " + "attr, assume non-coalesce mode"); + local->fctx->stripe_coalesce = 0; + } else { + local->fctx->stripe_coalesce = data_to_int32(data); + } + + out: - return ret; + return 0; } int32_t stripe_xattr_request_build (xlator_t *this, dict_t *dict, uint64_t stripe_size, - uint32_t stripe_count, uint32_t stripe_index) + uint32_t stripe_count, uint32_t stripe_index, + uint32_t stripe_coalesce) { char key[256] = {0,}; int32_t ret = -1; @@ -378,6 +392,14 @@ stripe_xattr_request_build (xlator_t *this, dict_t *dict, uint64_t stripe_size, "failed to set %s in xattr_req dict", key); goto out; } + + sprintf(key, "trusted.%s.stripe-coalesce", this->name); + ret = dict_set_int32(dict, key, stripe_coalesce); + if (ret) { + gf_log(this->name, GF_LOG_WARNING, + "failed to set %s in xattr_req_dict", key); + goto out; + } out: return ret; } @@ -501,3 +523,61 @@ stripe_iatt_merge (struct iatt *from, struct iatt *to) to->ia_atime = from->ia_atime; return 0; } + +off_t +coalesced_offset(off_t offset, uint64_t stripe_size, int stripe_count) +{ + size_t line_size = 0; + uint64_t stripe_num = 0; + off_t coalesced_offset = 0; + + line_size = stripe_size * stripe_count; + stripe_num = offset / line_size; + + coalesced_offset = (stripe_num * stripe_size) + + (offset % stripe_size); + + return coalesced_offset; +} + +off_t +uncoalesced_size(off_t size, uint64_t stripe_size, int stripe_count, + int stripe_index) +{ + uint64_t nr_full_stripe_chunks = 0, mod = 0; + + if (!size) + return size; + + /* + * Estimate the number of fully written stripes from the + * local file size. Each stripe_size chunk corresponds to + * a stripe. + */ + nr_full_stripe_chunks = (size / stripe_size) * stripe_count; + mod = size % stripe_size; + + if (!mod) { + /* + * There is no remainder, thus we could have overestimated + * the size of the file in terms of chunks. Trim the number + * of chunks by the following stripe members and leave it + * up to those nodes to respond with a larger size (if + * necessary). + */ + nr_full_stripe_chunks -= stripe_count - + (stripe_index + 1); + size = nr_full_stripe_chunks * stripe_size; + } else { + /* + * There is a remainder and thus we own the last chunk of the + * file. Add the preceding stripe members of the final stripe + * along with the remainder to calculate the exact size. + */ + nr_full_stripe_chunks += stripe_index; + size = nr_full_stripe_chunks * stripe_size + mod; + } + + return size; +} + diff --git a/xlators/cluster/stripe/src/stripe.c b/xlators/cluster/stripe/src/stripe.c index c64e8bfcb6d..7e711e9e91d 100644 --- a/xlators/cluster/stripe/src/stripe.c +++ b/xlators/cluster/stripe/src/stripe.c @@ -32,7 +32,6 @@ struct volume_options options[]; - int32_t stripe_sh_chown_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, @@ -237,6 +236,8 @@ stripe_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->stbuf_blocks += buf->ia_blocks; local->postparent_blocks += postparent->ia_blocks; + correct_file_size(buf, local->fctx, prev); + if (local->stbuf_size < buf->ia_size) local->stbuf_size = buf->ia_size; if (local->postparent_size < postparent->ia_size) @@ -326,9 +327,19 @@ stripe_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, /* get stripe-size xattr on lookup. This would be required for * open/read/write/pathinfo calls. Hence we send down the request * even when type == IA_INVAL */ + + /* + * We aren't guaranteed to have xdata here. We need the format info for + * the file, so allocate xdata if necessary. + */ + if (!xdata) + xdata = dict_new(); + else + xdata = dict_ref(xdata); + if (xdata && (IA_ISREG (loc->inode->ia_type) || (loc->inode->ia_type == IA_INVAL))) { - ret = stripe_xattr_request_build (this, xdata, 8, 4, 4); + ret = stripe_xattr_request_build (this, xdata, 8, 4, 4, 0); if (ret) gf_log (this->name , GF_LOG_ERROR, "Failed to build" " xattr request for %s", loc->path); @@ -344,6 +355,8 @@ stripe_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, trav = trav->next; } + dict_unref(xdata); + return 0; err: STRIPE_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); @@ -388,6 +401,9 @@ stripe_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } local->stbuf_blocks += buf->ia_blocks; + + correct_file_size(buf, local->fctx, prev); + if (local->stbuf_size < buf->ia_size) local->stbuf_size = buf->ia_size; } @@ -416,6 +432,7 @@ stripe_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) xlator_list_t *trav = NULL; stripe_local_t *local = NULL; stripe_private_t *priv = NULL; + stripe_fd_ctx_t *fctx = NULL; int32_t op_errno = EINVAL; VALIDATE_OR_GOTO (frame, err); @@ -442,6 +459,13 @@ stripe_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) frame->local = local; local->call_count = priv->child_count; + if (IA_ISREG(loc->inode->ia_type)) { + inode_ctx_get(loc->inode, this, (uint64_t *) &fctx); + if (!fctx) + goto err; + local->fctx = fctx; + } + while (trav) { STACK_WIND (frame, stripe_stat_cbk, trav->xlator, trav->xlator->fops->stat, loc, NULL); @@ -583,6 +607,9 @@ stripe_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->prebuf_blocks += prebuf->ia_blocks; local->postbuf_blocks += postbuf->ia_blocks; + correct_file_size(prebuf, local->fctx, prev); + correct_file_size(postbuf, local->fctx, prev); + if (local->prebuf_size < prebuf->ia_size) local->prebuf_size = prebuf->ia_size; @@ -614,10 +641,12 @@ out: int32_t stripe_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, dict_t *xdata) { - xlator_list_t *trav = NULL; stripe_local_t *local = NULL; stripe_private_t *priv = NULL; + stripe_fd_ctx_t *fctx = NULL; int32_t op_errno = EINVAL; + int i, eof_idx; + off_t dest_offset, tmp_offset; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -626,7 +655,6 @@ stripe_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, VALIDATE_OR_GOTO (loc->inode, err); priv = this->private; - trav = this->children; if (priv->first_child_down) { op_errno = ENOTCONN; @@ -643,11 +671,51 @@ stripe_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, frame->local = local; local->call_count = priv->child_count; - while (trav) { - STACK_WIND (frame, stripe_truncate_cbk, trav->xlator, - trav->xlator->fops->truncate, loc, offset, NULL); - trav = trav->next; - } + inode_ctx_get(loc->inode, this, (uint64_t *) &fctx); + if (!fctx) { + gf_log(this->name, GF_LOG_ERROR, "no stripe context"); + op_errno = EINVAL; + goto err; + } + + local->fctx = fctx; + eof_idx = (offset / fctx->stripe_size) % fctx->stripe_count; + + for (i = 0; i < fctx->stripe_count; i++) { + if (!fctx->xl_array[i]) { + gf_log(this->name, GF_LOG_ERROR, + "no xlator at index %d", i); + op_errno = EINVAL; + goto err; + } + + if (fctx->stripe_coalesce) { + /* + * The node that owns EOF is truncated to the exact + * coalesced offset. Nodes prior to this index should + * be rounded up to the size of the complete stripe, + * while nodes after this index should be rounded down + * to the size of the previous stripe. + */ + if (i < eof_idx) + tmp_offset = roof(offset, fctx->stripe_size * + fctx->stripe_count); + else if (i > eof_idx) + tmp_offset = floor(offset, fctx->stripe_size * + fctx->stripe_count); + else + tmp_offset = offset; + + dest_offset = coalesced_offset(tmp_offset, + fctx->stripe_size, fctx->stripe_count); + } else { + dest_offset = offset; + } + + STACK_WIND(frame, stripe_truncate_cbk, fctx->xl_array[i], + fctx->xl_array[i]->fops->truncate, loc, dest_offset, + NULL); + } return 0; err: @@ -698,6 +766,9 @@ stripe_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->prebuf_blocks += preop->ia_blocks; local->postbuf_blocks += postop->ia_blocks; + correct_file_size(preop, local->fctx, prev); + correct_file_size(postop, local->fctx, prev); + if (local->prebuf_size < preop->ia_size) local->prebuf_size = preop->ia_size; if (local->postbuf_size < postop->ia_size) @@ -733,6 +804,7 @@ stripe_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, xlator_list_t *trav = NULL; stripe_local_t *local = NULL; stripe_private_t *priv = NULL; + stripe_fd_ctx_t *fctx = NULL; int32_t op_errno = EINVAL; VALIDATE_OR_GOTO (frame, err); @@ -766,6 +838,13 @@ stripe_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, return 0; } + if (IA_ISREG(loc->inode->ia_type)) { + inode_ctx_get(loc->inode, this, (uint64_t *) &fctx); + if (!fctx) + goto err; + local->fctx = fctx; + } + local->call_count = priv->child_count; while (trav) { STACK_WIND (frame, stripe_setattr_cbk, @@ -862,6 +941,8 @@ stripe_stack_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->pre_buf.ia_blocks += prenewparent->ia_blocks; local->post_buf.ia_blocks += postnewparent->ia_blocks; + correct_file_size(buf, local->fctx, prev); + if (local->stbuf.ia_size < buf->ia_size) local->stbuf.ia_size = buf->ia_size; @@ -947,6 +1028,7 @@ stripe_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, stripe_private_t *priv = NULL; stripe_local_t *local = NULL; xlator_list_t *trav = NULL; + stripe_fd_ctx_t *fctx = NULL; int32_t op_errno = EINVAL; VALIDATE_OR_GOTO (frame, err); @@ -977,6 +1059,11 @@ stripe_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, local->call_count = priv->child_count; + inode_ctx_get(oldloc->inode, this, (uint64_t *) &fctx); + if (!fctx) + goto err; + local->fctx = fctx; + frame->local = local; STACK_WIND (frame, stripe_first_rename_cbk, trav->xlator, @@ -1367,7 +1454,6 @@ stripe_mknod_ifreg_cbk (call_frame_t *frame, void *cookie, xlator_t *this, stripe_private_t *priv = NULL; call_frame_t *prev = NULL; xlator_list_t *trav = NULL; - stripe_fd_ctx_t *fctx = NULL; if (!this || !frame || !frame->local || !cookie) { gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); @@ -1399,10 +1485,16 @@ stripe_mknod_ifreg_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (uuid_is_null (local->ia_gfid)) uuid_copy (local->ia_gfid, buf->ia_gfid); + if (stripe_ctx_handle(this, prev, local, xdata)) + gf_log(this->name, GF_LOG_ERROR, + "Error getting fctx info from dict"); + local->stbuf_blocks += buf->ia_blocks; local->preparent_blocks += preparent->ia_blocks; local->postparent_blocks += postparent->ia_blocks; + correct_file_size(buf, local->fctx, prev); + if (local->stbuf_size < buf->ia_size) local->stbuf_size = buf->ia_size; if (local->preparent_size < preparent->ia_size) @@ -1441,23 +1533,10 @@ stripe_mknod_ifreg_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->postparent.ia_size = local->postparent_size; local->stbuf.ia_size = local->stbuf_size; local->stbuf.ia_blocks = local->stbuf_blocks; - fctx = GF_CALLOC (1, sizeof (stripe_fd_ctx_t), - gf_stripe_mt_stripe_fd_ctx_t); - if (!fctx) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto unwind; - } - - fctx->stripe_size = local->stripe_size; - fctx->stripe_count = priv->child_count; - fctx->static_array = 1; - fctx->xl_array = priv->xl_array; inode_ctx_put (local->inode, this, - (uint64_t)(long)fctx); + (uint64_t)(long) local->fctx); } -unwind: STRIPE_STACK_UNWIND (mknod, frame, local->op_ret, local->op_errno, local->inode, &local->stbuf, &local->preparent, &local->postparent, NULL); @@ -1531,7 +1610,8 @@ stripe_mknod_first_ifreg_cbk (call_frame_t *frame, void *cookie, xlator_t *this, ret = stripe_xattr_request_build (this, dict, local->stripe_size, - priv->child_count, i); + priv->child_count, i, + priv->coalesce); if (ret) gf_log (this->name, GF_LOG_ERROR, "Failed to build xattr request"); @@ -1579,9 +1659,6 @@ stripe_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, stripe_local_t *local = NULL; int32_t op_errno = EINVAL; int32_t i = 0; - char size_key[256] = {0,}; - char index_key[256] = {0,}; - char count_key[256] = {0,}; dict_t *dict = NULL; int ret = 0; int need_unref = 0; @@ -1631,15 +1708,6 @@ stripe_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, be looked up */ local->call_count = priv->child_count; - /* Send a setxattr request to nodes where the - files are created */ - sprintf (size_key, - "trusted.%s.stripe-size", this->name); - sprintf (count_key, - "trusted.%s.stripe-count", this->name); - sprintf (index_key, - "trusted.%s.stripe-index", this->name); - if (priv->xattr_supported) { dict = dict_new (); if (!dict) { @@ -1653,7 +1721,7 @@ stripe_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, ret = stripe_xattr_request_build (this, dict, local->stripe_size, priv->child_count, - i); + i, priv->coalesce); if (ret) gf_log (this->name, GF_LOG_ERROR, "failed to build xattr request"); @@ -1867,6 +1935,7 @@ stripe_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t callcnt = 0; stripe_local_t *local = NULL; call_frame_t *prev = NULL; + stripe_fd_ctx_t *fctx = NULL; if (!this || !frame || !frame->local || !cookie) { gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); @@ -1880,6 +1949,14 @@ stripe_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, { callcnt = --local->call_count; + inode_ctx_get(inode, this, (uint64_t *) &fctx); + if (!fctx) { + gf_log(this->name, GF_LOG_ERROR, "failed to get stripe " + "context"); + op_ret = -1; + op_errno = EINVAL; + } + if (op_ret == -1) { gf_log (this->name, GF_LOG_DEBUG, "%s returned error %s", @@ -1903,6 +1980,8 @@ stripe_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->preparent_blocks += preparent->ia_blocks; local->postparent_blocks += postparent->ia_blocks; + correct_file_size(buf, fctx, prev); + if (local->stbuf_size < buf->ia_size) local->stbuf_size = buf->ia_size; if (local->preparent_size < preparent->ia_size) @@ -2023,7 +2102,6 @@ stripe_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t callcnt = 0; stripe_local_t *local = NULL; stripe_private_t *priv = NULL; - stripe_fd_ctx_t *fctx = NULL; call_frame_t *prev = NULL; xlator_list_t *trav = NULL; @@ -2049,12 +2127,21 @@ stripe_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } if (op_ret >= 0) { + if (IA_ISREG(buf->ia_type)) { + if (stripe_ctx_handle(this, prev, local, xdata)) + gf_log(this->name, GF_LOG_ERROR, + "Error getting fctx info from " + "dict"); + } + local->op_ret = op_ret; local->stbuf_blocks += buf->ia_blocks; local->preparent_blocks += preparent->ia_blocks; local->postparent_blocks += postparent->ia_blocks; + correct_file_size(buf, local->fctx, prev); + if (local->stbuf_size < buf->ia_size) local->stbuf_size = buf->ia_size; if (local->preparent_size < preparent->ia_size) @@ -2092,23 +2179,13 @@ stripe_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->stbuf.ia_size = local->stbuf_size; local->stbuf.ia_blocks = local->stbuf_blocks; - fctx = GF_CALLOC (1, sizeof (stripe_fd_ctx_t), - gf_stripe_mt_stripe_fd_ctx_t); - if (!fctx) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto unwind; - } - - fctx->stripe_size = local->stripe_size; - fctx->stripe_count = priv->child_count; - fctx->static_array = 1; - fctx->xl_array = priv->xl_array; - inode_ctx_put (local->inode, this, - (uint64_t)(long)fctx); + stripe_copy_xl_array(local->fctx->xl_array, + priv->xl_array, + local->fctx->stripe_count); + inode_ctx_put(local->inode, this, + (uint64_t) local->fctx); } - unwind: /* Create itself has failed.. so return without setxattring */ STRIPE_STACK_UNWIND (create, frame, local->op_ret, @@ -2214,14 +2291,14 @@ stripe_first_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, ret = stripe_xattr_request_build (this, dict, local->stripe_size, priv->child_count, - i); + i, priv->coalesce); if (ret) gf_log (this->name, GF_LOG_ERROR, "failed to build xattr request"); } else { dict = local->xattr; } - + STACK_WIND (frame, stripe_create_cbk, trav->xlator, trav->xlator->fops->create, &local->loc, local->flags, local->mode, local->umask, local->fd, @@ -2310,7 +2387,7 @@ stripe_create (call_frame_t *frame, xlator_t *this, loc_t *loc, ret = stripe_xattr_request_build (this, dict, local->stripe_size, priv->child_count, - i); + i, priv->coalesce); if (ret) gf_log (this->name, GF_LOG_ERROR, "failed to build xattr request"); @@ -2743,6 +2820,9 @@ stripe_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->prebuf_blocks += prebuf->ia_blocks; local->postbuf_blocks += postbuf->ia_blocks; + correct_file_size(prebuf, local->fctx, prev); + correct_file_size(postbuf, local->fctx, prev); + if (local->prebuf_size < prebuf->ia_size) local->prebuf_size = prebuf->ia_size; @@ -2777,6 +2857,7 @@ stripe_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, dict stripe_local_t *local = NULL; stripe_private_t *priv = NULL; xlator_list_t *trav = NULL; + stripe_fd_ctx_t *fctx = NULL; int32_t op_errno = 1; VALIDATE_OR_GOTO (frame, err); @@ -2793,6 +2874,14 @@ stripe_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, dict op_errno = ENOMEM; goto err; } + + inode_ctx_get(fd->inode, this, (uint64_t *) &fctx); + if (!fctx) { + op_errno = EINVAL; + goto err; + } + local->fctx = fctx; + local->op_ret = -1; frame->local = local; local->call_count = priv->child_count; @@ -2846,6 +2935,9 @@ stripe_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->stbuf = *buf; local->stbuf_blocks += buf->ia_blocks; + + correct_file_size(buf, local->fctx, prev); + if (local->stbuf_size < buf->ia_size) local->stbuf_size = buf->ia_size; } @@ -2877,6 +2969,7 @@ stripe_fstat (call_frame_t *frame, stripe_local_t *local = NULL; stripe_private_t *priv = NULL; xlator_list_t *trav = NULL; + stripe_fd_ctx_t *fctx = NULL; int32_t op_errno = 1; VALIDATE_OR_GOTO (frame, err); @@ -2897,6 +2990,13 @@ stripe_fstat (call_frame_t *frame, frame->local = local; local->call_count = priv->child_count; + if (IA_ISREG(fd->inode->ia_type)) { + inode_ctx_get(fd->inode, this, (uint64_t *) &fctx); + if (!fctx) + goto err; + local->fctx = fctx; + } + while (trav) { STACK_WIND (frame, stripe_fstat_cbk, trav->xlator, trav->xlator->fops->fstat, fd, NULL); @@ -2915,8 +3015,10 @@ stripe_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, d { stripe_local_t *local = NULL; stripe_private_t *priv = NULL; - xlator_list_t *trav = NULL; - int32_t op_errno = 1; + stripe_fd_ctx_t *fctx = NULL; + int i, eof_idx; + off_t dest_offset, tmp_offset; + int32_t op_errno = 1; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -2924,7 +3026,6 @@ stripe_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, d VALIDATE_OR_GOTO (fd->inode, err); priv = this->private; - trav = this->children; /* Initialization */ local = mem_get0 (this->local_pool); @@ -2936,11 +3037,49 @@ stripe_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, d frame->local = local; local->call_count = priv->child_count; - while (trav) { - STACK_WIND (frame, stripe_truncate_cbk, trav->xlator, - trav->xlator->fops->ftruncate, fd, offset, NULL); - trav = trav->next; - } + inode_ctx_get(fd->inode, this, (uint64_t *) &fctx); + if (!fctx) { + gf_log(this->name, GF_LOG_ERROR, "no stripe context"); + op_errno = EINVAL; + goto err; + } + if (!fctx->stripe_count) { + gf_log(this->name, GF_LOG_ERROR, "no stripe count"); + op_errno = EINVAL; + goto err; + } + + local->fctx = fctx; + eof_idx = (offset / fctx->stripe_size) % fctx->stripe_count; + + for (i = 0; i < fctx->stripe_count; i++) { + if (!fctx->xl_array[i]) { + gf_log(this->name, GF_LOG_ERROR, "no xlator at index " + "%d", i); + op_errno = EINVAL; + goto err; + } + + if (fctx->stripe_coalesce) { + if (i < eof_idx) + tmp_offset = roof(offset, fctx->stripe_size * + fctx->stripe_count); + else if (i > eof_idx) + tmp_offset = floor(offset, fctx->stripe_size * + fctx->stripe_count); + else + tmp_offset = offset; + + dest_offset = coalesced_offset(tmp_offset, + fctx->stripe_size, fctx->stripe_count); + } else { + dest_offset = offset; + } + + STACK_WIND(frame, stripe_truncate_cbk, fctx->xl_array[i], + fctx->xl_array[i]->fops->ftruncate, fd, dest_offset, + NULL); + } return 0; err: @@ -3045,6 +3184,7 @@ stripe_readv_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt tmp_stbuf = {0,}; struct iobref *tmp_iobref = NULL; struct iobuf *iobuf = NULL; + call_frame_t *prev = NULL; if (!this || !frame || !frame->local) { gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); @@ -3052,13 +3192,16 @@ stripe_readv_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } local = frame->local; + prev = cookie; LOCK (&frame->lock); { callcnt = --local->call_count; - if (op_ret != -1) + if (op_ret != -1) { + correct_file_size(buf, local->fctx, prev); if (local->stbuf_size < buf->ia_size) local->stbuf_size = buf->ia_size; + } } UNLOCK (&frame->lock); @@ -3148,6 +3291,7 @@ stripe_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *tmp_stbuf_p = NULL; //need it for a warning struct iobref *tmp_iobref = NULL; stripe_fd_ctx_t *fctx = NULL; + call_frame_t *prev = NULL; if (!this || !frame || !frame->local || !cookie) { gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); @@ -3156,6 +3300,7 @@ stripe_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local = frame->local; index = local->node_index; + prev = cookie; mframe = local->orig_frame; if (!mframe) goto out; @@ -3175,6 +3320,9 @@ stripe_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, mlocal->replies[index].stbuf = *stbuf; mlocal->replies[index].count = count; mlocal->replies[index].vector = iov_dup (vector, count); + + correct_file_size(stbuf, fctx, prev); + if (local->stbuf_size < stbuf->ia_size) local->stbuf_size = stbuf->ia_size; local->stbuf_blocks += stbuf->ia_blocks; @@ -3287,6 +3435,7 @@ stripe_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, uint64_t stripe_size = 0; off_t rounded_start = 0; off_t frame_offset = offset; + off_t dest_offset = 0; stripe_local_t *local = NULL; call_frame_t *rframe = NULL; stripe_local_t *rlocal = NULL; @@ -3359,9 +3508,16 @@ stripe_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, rlocal->readv_size = frame_size; rframe->local = rlocal; idx = (index % fctx->stripe_count); + + if (fctx->stripe_coalesce) + dest_offset = coalesced_offset(frame_offset, + stripe_size, fctx->stripe_count); + else + dest_offset = frame_offset; + STACK_WIND (rframe, stripe_readv_cbk, fctx->xl_array[idx], fctx->xl_array[idx]->fops->readv, - fd, frame_size, frame_offset, flags, xdata); + fd, frame_size, dest_offset, flags, xdata); frame_offset += frame_size; } @@ -3408,11 +3564,27 @@ stripe_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->op_ret += op_ret; local->post_buf = *postbuf; local->pre_buf = *prebuf; + + local->prebuf_blocks += prebuf->ia_blocks; + local->postbuf_blocks += postbuf->ia_blocks; + + correct_file_size(prebuf, local->fctx, prev); + correct_file_size(postbuf, local->fctx, prev); + + if (local->prebuf_size < prebuf->ia_size) + local->prebuf_size = prebuf->ia_size; + if (local->postbuf_size < postbuf->ia_size) + local->postbuf_size = postbuf->ia_size; } } UNLOCK (&frame->lock); if ((callcnt == local->wind_count) && local->unwind) { + local->pre_buf.ia_size = local->prebuf_size; + local->pre_buf.ia_blocks = local->prebuf_blocks; + local->post_buf.ia_size = local->postbuf_size; + local->post_buf.ia_blocks = local->postbuf_blocks; + STRIPE_STACK_UNWIND (writev, frame, local->op_ret, local->op_errno, &local->pre_buf, &local->post_buf, NULL); @@ -3438,6 +3610,7 @@ stripe_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t fill_size = 0; uint64_t stripe_size = 0; uint64_t tmp_fctx = 0; + off_t dest_offset = 0; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -3467,6 +3640,7 @@ stripe_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, } frame->local = local; local->stripe_size = stripe_size; + local->fctx = fctx; if (!stripe_size) { gf_log (this->name, GF_LOG_DEBUG, @@ -3503,9 +3677,15 @@ stripe_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, if (remaining_size == 0) local->unwind = 1; + if (fctx->stripe_coalesce) + dest_offset = coalesced_offset(offset + offset_offset, + local->stripe_size, fctx->stripe_count); + else + dest_offset = offset + offset_offset; + STACK_WIND (frame, stripe_writev_cbk, fctx->xl_array[idx], fctx->xl_array[idx]->fops->writev, fd, tmp_vec, - tmp_count, offset + offset_offset, flags, iobref, + tmp_count, dest_offset, flags, iobref, xdata); GF_FREE (tmp_vec); @@ -3857,10 +4037,15 @@ stripe_readdirp_lookup_cbk (call_frame_t *frame, void *cookie, local->op_ret = op_ret; goto unlock; } + + if (stripe_ctx_handle(this, prev, local, xattr)) + gf_log(this->name, GF_LOG_ERROR, + "Error getting fctx info from dict."); + + correct_file_size(stbuf, local->fctx, prev); + stripe_iatt_merge (stbuf, &entry->d_stat); local->stbuf_blocks += stbuf->ia_blocks; - - stripe_ctx_handle (this, prev, local, xattr); } unlock: UNLOCK(&frame->lock); @@ -3955,7 +4140,7 @@ unlock: xattrs = dict_new (); if (xattrs) - (void) stripe_xattr_request_build (this, xattrs, 0, 0, 0); + (void) stripe_xattr_request_build (this, xattrs, 0, 0, 0, 0); count = op_ret; list_for_each_entry_safe (local_entry, tmp_entry, (&local->entries.list), list) { @@ -4163,6 +4348,9 @@ reconfigure (xlator_t *this, dict_t *options) goto unlock; } } + + GF_OPTION_RECONF("coalesce", priv->coalesce, options, bool, + unlock); } unlock: UNLOCK (&priv->lock); @@ -4283,6 +4471,8 @@ init (xlator_t *this) /* notify related */ priv->nodes_down = priv->child_count; + GF_OPTION_INIT("coalesce", priv->coalesce, bool, out); + this->local_pool = mem_pool_new (stripe_local_t, 128); if (!this->local_pool) { ret = -1; @@ -4766,5 +4956,12 @@ struct volume_options options[] = { .type = GF_OPTION_TYPE_BOOL, .default_value = "true" }, + { .key = {"coalesce"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .description = "Enable coalesce mode to flatten striped files as " + "stored on the server (i.e., eliminate holes caused " + "by the traditional format)." + }, { .key = {NULL} }, }; diff --git a/xlators/cluster/stripe/src/stripe.h b/xlators/cluster/stripe/src/stripe.h index cb05eb56fc0..1b9e660c126 100644 --- a/xlators/cluster/stripe/src/stripe.h +++ b/xlators/cluster/stripe/src/stripe.h @@ -101,6 +101,7 @@ struct stripe_private { int8_t child_count; int8_t *state; /* Current state of child node */ gf_boolean_t xattr_supported; /* default yes */ + gf_boolean_t coalesce; char vol_uuid[UUID_SIZE + 1]; }; @@ -119,6 +120,7 @@ struct readv_replies { typedef struct _stripe_fd_ctx { off_t stripe_size; int stripe_count; + int stripe_coalesce; int static_array; xlator_t **xl_array; } stripe_fd_ctx_t; @@ -214,13 +216,41 @@ struct stripe_local { typedef struct stripe_local stripe_local_t; typedef struct stripe_private stripe_private_t; +/* + * Determine the stripe index of a particular frame based on the translator. + */ +static inline int32_t stripe_get_frame_index(stripe_fd_ctx_t *fctx, + call_frame_t *prev) +{ + int32_t i, idx = -1; + + for (i = 0; i < fctx->stripe_count; i++) { + if (fctx->xl_array[i] == prev->this) { + idx = i; + break; + } + } + + return idx; +} + +static inline void stripe_copy_xl_array(xlator_t **dst, xlator_t **src, + int count) +{ + int i; + + for (i = 0; i < count; i++) + dst[i] = src[i]; +} + void stripe_local_wipe (stripe_local_t *local); int32_t stripe_ctx_handle (xlator_t *this, call_frame_t *prev, stripe_local_t *local, dict_t *dict); void stripe_aggregate_xattr (dict_t *dst, dict_t *src); int32_t stripe_xattr_request_build (xlator_t *this, dict_t *dict, uint64_t stripe_size, uint32_t stripe_count, - uint32_t stripe_index); + uint32_t stripe_index, + uint32_t stripe_coalesce); int32_t stripe_get_matching_bs (const char *path, stripe_private_t *priv); int set_stripe_block_size (xlator_t *this, stripe_private_t *priv, char *data); int32_t stripe_iatt_merge (struct iatt *from, struct iatt *to); @@ -229,5 +259,27 @@ int32_t stripe_fill_pathinfo_xattr (xlator_t *this, stripe_local_t *local, int32_t stripe_free_xattr_str (stripe_local_t *local); int32_t stripe_xattr_aggregate (char *buffer, stripe_local_t *local, int32_t *total); +off_t coalesced_offset(off_t offset, uint64_t stripe_size, int stripe_count); +off_t uncoalesced_size(off_t size, uint64_t stripe_size, int stripe_count, + int stripe_index); + +/* + * Adjust the size attribute for files if coalesce is enabled. + */ +static inline void correct_file_size(struct iatt *buf, stripe_fd_ctx_t *fctx, + call_frame_t *prev) +{ + int index; + + if (!IA_ISREG(buf->ia_type)) + return; + + if (!fctx || !fctx->stripe_coalesce) + return; + + index = stripe_get_frame_index(fctx, prev); + buf->ia_size = uncoalesced_size(buf->ia_size, fctx->stripe_size, + fctx->stripe_count, index); +} #endif /* _STRIPE_H_ */ diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c index 621a42f5342..9bcf2cc7873 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volgen.c +++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c @@ -135,6 +135,7 @@ static struct volopt_map_entry glusterd_volopt_map[] = { {"cluster.quorum-count", "cluster/replicate", "quorum-count", NULL, NO_DOC, 0}, {"cluster.stripe-block-size", "cluster/stripe", "block-size", NULL, DOC, 0}, + {"cluster.stripe-coalesce", "cluster/stripe", "coalesce", NULL, DOC, 0}, {VKEY_DIAG_LAT_MEASUREMENT, "debug/io-stats", "latency-measurement", "off", NO_DOC, 0}, {"diagnostics.dump-fd-stats", "debug/io-stats", NULL, NULL, NO_DOC, 0}, diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c index 62c36d79426..24fd6a253a1 100644 --- a/xlators/storage/posix/src/posix.c +++ b/xlators/storage/posix/src/posix.c @@ -1808,7 +1808,7 @@ out: STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, (loc)?loc->inode:NULL, &stbuf, &preparent, - &postparent, NULL); + &postparent, xdata); return 0; } -- cgit