From 7dadea15c58eb92e5f5727190bf9446dd6fe7a3c Mon Sep 17 00:00:00 2001 From: Raghavendra Bhat Date: Tue, 6 Nov 2018 15:27:31 -0500 Subject: copy_file_range support in GlusterFS * libglusterfs changes to add new fop * Fuse changes: - Changes in fuse bridge xlator to receive and send responses * posix changes to perform the op on the backend filesystem * protocol and rpc changes for sending and receiving the fop * gfapi changes for performing the fop * tools: glfs-copy-file-range tool for testing copy_file_range fop - Although, copy_file_range support has been added to the upstream fuse kernel module, no release has been made yet of a kernel which contains the support. It is expected to come in the upcoming release of linux-4.20 So, as of now, executing copy_file_range fop on a fused based filesystem results in fuse kernel module sending read on the source fd and write on the destination fd. Therefore a small gfapi based tool has been written to be able test the copy_file_range fop. This tool is similar (in functionality) to the example program given in copy_file_range man page. So, running regular copy_file_range on a fuse mount point and running gfapi based glfs-copy-file-range tool gives some idea about how fast, the copy_file_range (or reflink) can be. On the local machine this was the result obtained. mount -t glusterfs workstation:new /mnt/glusterfs [root@workstation ~]# cd /mnt/glusterfs/ [root@workstation glusterfs]# ls file [root@workstation glusterfs]# cd [root@workstation ~]# time /tmp/a.out /mnt/glusterfs/file /mnt/glusterfs/new real 0m6.495s user 0m0.000s sys 0m1.439s [root@workstation ~]# time glfs-copy-file-range $(hostname) new /tmp/glfs.log /file /rrr OPEN_SRC: opening /file is success OPEN_DST: opening /rrr is success FSTAT_SRC: fstat on /rrr is success copy_file_range successful real 0m0.309s user 0m0.039s sys 0m0.017s This tool needs following arguments 1) hostname 2) volume name 3) log file path 4) source file path (relative to the gluster volume root) 5) destination file path (relative to the gluster volume root) "glfs-copy-file-range " - Added a testcase as well to run glfs-copy-file-range tool * io-stats changes to capture the fop for profiling * NOTE: - Added conditional check to see whether the copy_file_range syscall is available or not. If not, then return ENOSYS. - Added conditional check for kernel minor version in fuse_kernel.h and fuse-bridge while referring to copy_file_range. And the kernel minor version is kept as it is. i.e. 24. Increment it in future when there is a kernel release which contains the support for copy_file_range fop in fuse kernel module. * The document which contains a writeup on this enhancement can be found at https://docs.google.com/document/d/1BSILbXr_knynNwxSyyu503JoTz5QFM_4suNIh2WwrSc/edit Change-Id: I280069c814dd21ce6ec3be00a884fc24ab692367 updates: #536 Signed-off-by: Raghavendra Bhat --- xlators/debug/io-stats/src/io-stats.c | 27 +++ xlators/features/changelog/lib/src/Makefile.am | 2 +- xlators/features/utime/src/utime-gen-fops-c.py | 18 ++ xlators/features/utime/src/utime-gen-fops-h.py | 2 +- xlators/features/utime/src/utime-helpers.c | 9 + xlators/mount/fuse/src/fuse-bridge.c | 114 ++++++++++ xlators/mount/fuse/src/fuse-bridge.h | 36 +++ xlators/protocol/client/src/client-common.c | 32 +++ xlators/protocol/client/src/client-common.h | 6 + xlators/protocol/client/src/client-helpers.c | 28 +++ xlators/protocol/client/src/client-rpc-fops_v2.c | 141 ++++++++++++ xlators/protocol/client/src/client.c | 36 +++ xlators/protocol/client/src/client.h | 22 ++ xlators/protocol/server/src/server-common.c | 10 + xlators/protocol/server/src/server-common.h | 5 + xlators/protocol/server/src/server-helpers.c | 8 + xlators/protocol/server/src/server-resolve.c | 39 +++- xlators/protocol/server/src/server-rpc-fops_v2.c | 130 +++++++++++ xlators/protocol/server/src/server.h | 14 ++ xlators/storage/posix/src/posix-helpers.c | 20 ++ xlators/storage/posix/src/posix-inode-fd-ops.c | 268 +++++++++++++++++++++++ xlators/storage/posix/src/posix-messages.h | 3 +- xlators/storage/posix/src/posix-metadata.c | 78 +++++++ xlators/storage/posix/src/posix-metadata.h | 5 + xlators/storage/posix/src/posix.c | 1 + xlators/storage/posix/src/posix.h | 7 + 26 files changed, 1051 insertions(+), 10 deletions(-) (limited to 'xlators') diff --git a/xlators/debug/io-stats/src/io-stats.c b/xlators/debug/io-stats/src/io-stats.c index 7bf0d8a8f00..f40b00bba2d 100644 --- a/xlators/debug/io-stats/src/io-stats.c +++ b/xlators/debug/io-stats/src/io-stats.c @@ -2118,6 +2118,19 @@ io_stats_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, return 0; } +int +io_stats_copy_file_range_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *stbuf, struct iatt *prebuf_dst, + struct iatt *postbuf_dst, dict_t *xdata) +{ + UPDATE_PROFILE_STATS(frame, COPY_FILE_RANGE); + + STACK_UNWIND_STRICT(copy_file_range, frame, op_ret, op_errno, stbuf, + prebuf_dst, postbuf_dst, xdata); + return 0; +} + int io_stats_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, gf_dirent_t *buf, @@ -2872,6 +2885,19 @@ io_stats_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, return 0; } +int +io_stats_copy_file_range(call_frame_t *frame, xlator_t *this, fd_t *fd_in, + off_t off_in, fd_t *fd_out, off_t off_out, size_t len, + uint32_t flags, dict_t *xdata) +{ + START_FOP_LATENCY(frame); + + STACK_WIND(frame, io_stats_copy_file_range_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->copy_file_range, fd_in, off_in, fd_out, + off_out, len, flags, xdata); + return 0; +} + int io_stats_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { @@ -4189,6 +4215,7 @@ struct xlator_fops fops = { .getactivelk = io_stats_getactivelk, .setactivelk = io_stats_setactivelk, .compound = io_stats_compound, + .copy_file_range = io_stats_copy_file_range, }; struct xlator_cbks cbks = { diff --git a/xlators/features/changelog/lib/src/Makefile.am b/xlators/features/changelog/lib/src/Makefile.am index c4b9a3df692..c933ec53ed2 100644 --- a/xlators/features/changelog/lib/src/Makefile.am +++ b/xlators/features/changelog/lib/src/Makefile.am @@ -1,7 +1,7 @@ libgfchangelog_la_CFLAGS = -Wall $(GF_CFLAGS) $(GF_DARWIN_LIBGLUSTERFS_CFLAGS) \ -DDATADIR=\"$(localstatedir)\" -libgfchangelog_la_CPPFLAGS = $(GF_CPPFLAGS) -D__USE_FILE_OFFSET64 -fpic \ +libgfchangelog_la_CPPFLAGS = $(GF_CPPFLAGS) -D__USE_FILE_OFFSET64 -D__USE_LARGEFILE64 -fpic \ -I../../../src/ -I$(top_srcdir)/libglusterfs/src \ -I$(top_srcdir)/xlators/features/changelog/src \ -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \ diff --git a/xlators/features/utime/src/utime-gen-fops-c.py b/xlators/features/utime/src/utime-gen-fops-c.py index ab56dc9a4b3..8730a51d13e 100755 --- a/xlators/features/utime/src/utime-gen-fops-c.py +++ b/xlators/features/utime/src/utime-gen-fops-c.py @@ -62,6 +62,20 @@ gf_utime_@NAME@ (call_frame_t *frame, xlator_t *this, } """ +FOPS_COPY_FILE_RANGE_TEMPLATE = """ +int32_t +gf_utime_@NAME@ (call_frame_t *frame, xlator_t *this, + @LONG_ARGS@) +{ + gl_timespec_get(&frame->root->ctime); + + (void) utime_update_attribute_flags(frame, this, GF_FOP_COPY_FILE_RANGE); + STACK_WIND (frame, gf_utime_@NAME@_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->@NAME@, @SHORT_ARGS@); + return 0; +} +""" + FOPS_SETATTR_TEMPLATE = """ int32_t gf_utime_@NAME@ (call_frame_t *frame, xlator_t *this, @@ -94,6 +108,7 @@ utime_ops = ['fallocate', 'zerofill', 'opendir', 'mknod', 'mkdir', utime_read_op = ['readv'] utime_write_op = ['writev'] utime_setattr_ops = ['setattr', 'fsetattr'] +utime_copy_file_range_ops = ['copy_file_range'] def gen_defaults(): for name in ops: @@ -109,6 +124,9 @@ def gen_defaults(): if name in utime_setattr_ops: print(generate(FOPS_CBK_COMMON_TEMPLATE, name, cbk_subs)) print(generate(FOPS_SETATTR_TEMPLATE, name, fop_subs)) + if name in utime_copy_file_range_ops: + print(generate(FOPS_CBK_COMMON_TEMPLATE, name, cbk_subs)) + print(generate(FOPS_COPY_FILE_RANGE_TEMPLATE, name, fop_subs)) for l in open(sys.argv[1], 'r').readlines(): if l.find('#pragma generate') != -1: diff --git a/xlators/features/utime/src/utime-gen-fops-h.py b/xlators/features/utime/src/utime-gen-fops-h.py index 3686f2e3c1e..e96274c229a 100755 --- a/xlators/features/utime/src/utime-gen-fops-h.py +++ b/xlators/features/utime/src/utime-gen-fops-h.py @@ -18,7 +18,7 @@ gf_utime_@NAME@ (call_frame_t *frame, xlator_t *this, utime_ops = ['fallocate', 'zerofill', 'opendir', 'mknod', 'mkdir', 'unlink', 'rmdir', 'symlink', 'rename', 'link', 'truncate', 'ftruncate', 'create', 'open', 'removexattr', 'fremovexattr', - 'readv', 'writev', 'setattr', 'fsetattr'] + 'readv', 'writev', 'setattr', 'fsetattr', 'copy_file_range'] def gen_defaults(): for name, value in ops.items(): diff --git a/xlators/features/utime/src/utime-helpers.c b/xlators/features/utime/src/utime-helpers.c index c79e12badfa..79cc0145f50 100644 --- a/xlators/features/utime/src/utime-helpers.c +++ b/xlators/features/utime/src/utime-helpers.c @@ -93,6 +93,15 @@ utime_update_attribute_flags(call_frame_t *frame, xlator_t *this, frame->root->flags |= MDATA_CTIME; break; + case GF_FOP_COPY_FILE_RANGE: + /* Below 2 are for destination fd */ + frame->root->flags |= MDATA_CTIME; + frame->root->flags |= MDATA_MTIME; + /* Below flag is for the source fd */ + if (!utime_priv->noatime) { + frame->root->flags |= MDATA_ATIME; + } + break; default: frame->root->flags = 0; } diff --git a/xlators/mount/fuse/src/fuse-bridge.c b/xlators/mount/fuse/src/fuse-bridge.c index 3b2622b431f..3f4e19c211e 100644 --- a/xlators/mount/fuse/src/fuse-bridge.c +++ b/xlators/mount/fuse/src/fuse-bridge.c @@ -2993,6 +2993,116 @@ fuse_write(xlator_t *this, fuse_in_header_t *finh, void *msg, return; } +#if FUSE_KERNEL_MINOR_VERSION >= 28 +static int +fuse_copy_file_range_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *stbuf, + struct iatt *prebuf_dst, struct iatt *postbuf_dst, + dict_t *xdata) +{ + fuse_state_t *state = NULL; + fuse_in_header_t *finh = NULL; + /* + * Fuse kernel module uses fuse_write_out itself as the + * output collector. In fact, fuse_kernel.h in the upstream + * kernel just defines the input structure fuse_copy_file_range_in + * for the fop. So, just use the fuse_write_out to send the + * response back to the kernel. + */ + struct fuse_write_out fcfro = { + 0, + }; + + char src_gfid[GF_UUID_BUF_SIZE] = {0}; + char dst_gfid[GF_UUID_BUF_SIZE] = {0}; + + state = frame->root->state; + finh = state->finh; + + fuse_log_eh_fop(this, state, frame, op_ret, op_errno); + + if (op_ret >= 0) { + gf_log("glusterfs-fuse", GF_LOG_TRACE, + "%" PRIu64 ": WRITE => %d/%" GF_PRI_SIZET ",%" PRIu64 + " , %" PRIu64 " ,%" PRIu64 ",%" PRIu64, + frame->root->unique, op_ret, state->size, state->off_in, + state->off_out, stbuf->ia_size, postbuf_dst->ia_size); + + fcfro.size = op_ret; + send_fuse_obj(this, finh, &fcfro); + } else { + if (state->fd && state->fd->inode) + uuid_utoa_r(state->fd->inode->gfid, src_gfid); + else + snprintf(src_gfid, sizeof(src_gfid), "nil"); + + if (state->fd_dst && state->fd_dst->inode) + uuid_utoa_r(state->fd_dst->inode->gfid, dst_gfid); + else + snprintf(dst_gfid, sizeof(dst_gfid), "nil"); + + gf_log("glusterfs-fuse", GF_LOG_WARNING, + "%" PRIu64 + ": COPY_FILE_RANGE => -1 gfid_in=%s fd_in=%p " + "gfid_out=%s fd_out=%p (%s)", + frame->root->unique, src_gfid, state->fd, dst_gfid, + state->fd_dst, strerror(op_errno)); + + send_fuse_err(this, finh, op_errno); + } + + free_fuse_state(state); + STACK_DESTROY(frame->root); + + return 0; +} + +void +fuse_copy_file_range_resume(fuse_state_t *state) +{ + gf_log("glusterfs-fuse", GF_LOG_TRACE, + "%" PRIu64 + ": COPY_FILE_RANGE " + "(input fd: %p (gfid: %s), " + "output fd: %p (gfid: %s) size=%zu, " + "offset_in=%" PRIu64 ", offset_out=%" PRIu64 ")", + state->finh->unique, state->fd, uuid_utoa(state->fd->inode->gfid), + state->fd_dst, uuid_utoa(state->fd_dst->inode->gfid), state->size, + state->off_in, state->off_out); + + FUSE_FOP(state, fuse_copy_file_range_cbk, GF_FOP_COPY_FILE_RANGE, + copy_file_range, state->fd, state->off_in, state->fd_dst, + state->off_out, state->size, state->io_flags, state->xdata); +} + +static void +fuse_copy_file_range(xlator_t *this, fuse_in_header_t *finh, void *msg, + struct iobuf *iobuf) +{ + struct fuse_copy_file_range_in *fcfri = msg; + fuse_state_t *state = NULL; + fd_t *fd_in = NULL; + fd_t *fd_out = NULL; + + GET_STATE(this, finh, state); + + fd_in = FH_TO_FD(fcfri->fh_in); + fd_out = FH_TO_FD(fcfri->fh_out); + state->fd = fd_in; + state->fd_dst = fd_out; + + fuse_resolve_fd_init(state, &state->resolve, fd_in); + fuse_resolve_fd_init(state, &state->resolve2, fd_out); + + state->size = fcfri->len; + state->off_in = fcfri->off_in; + state->off_out = fcfri->off_out; + state->io_flags = fcfri->flags; + + fuse_resolve_and_resume(state, fuse_copy_file_range_resume); +} +#endif /* FUSE_KERNEL_MINOR_VERSION >= 28 */ + #if FUSE_KERNEL_MINOR_VERSION >= 24 && HAVE_SEEK_HOLE static int fuse_lseek_cbk(call_frame_t *frame, void *cookie, xlator_t *this, @@ -6087,6 +6197,10 @@ static fuse_handler_t *fuse_std_ops[FUSE_OP_HIGH] = { #if FUSE_KERNEL_MINOR_VERSION >= 24 && HAVE_SEEK_HOLE [FUSE_LSEEK] = fuse_lseek, #endif + +#if FUSE_KERNEL_MINOR_VERSION >= 28 + [FUSE_COPY_FILE_RANGE] = fuse_copy_file_range, +#endif }; static fuse_handler_t *fuse_dump_ops[FUSE_OP_HIGH]; diff --git a/xlators/mount/fuse/src/fuse-bridge.h b/xlators/mount/fuse/src/fuse-bridge.h index 57380786f17..60702ab1da5 100644 --- a/xlators/mount/fuse/src/fuse-bridge.h +++ b/xlators/mount/fuse/src/fuse-bridge.h @@ -41,7 +41,31 @@ #include #if defined(GF_LINUX_HOST_OS) || defined(__FreeBSD__) || defined(__NetBSD__) + +/* + * TODO: + * So, with the addition of copy_file_range support, it might + * require a bump up of fuse kernel minor version (like it was + * done when support for lseek fop was added. But, as of now, + * the copy_file_range support has just landed in upstream + * kernel fuse module. So, until, there is a release of that + * fuse as part of a kernel, the FUSE_KERNEL_MINOR_VERSION + * from fuse_kernel.h in the contrib might not be changed. + * If so, then the highest op available should be based on + * the current minor version (which is 24). So, selectively + * determine. When, the minor version is changed to 28 in + * fuse_kernel.h from contrib (because in upstream linux + * kernel source tree, the kernel minor version which + * contains support for copy_file_range is 28), then remove + * the reference to FUSE_LSEEK below and just determine + * FUSE_OP_HIGH based on copy_file_range. + */ +#if FUSE_KERNEL_MINOR_VERSION >= 28 +#define FUSE_OP_HIGH (FUSE_COPY_FILE_RANGE + 1) +#else #define FUSE_OP_HIGH (FUSE_LSEEK + 1) +#endif + #endif #ifdef GF_DARWIN_HOST_OS #define FUSE_OP_HIGH (FUSE_DESTROY + 1) @@ -400,10 +424,22 @@ typedef struct { loc_t loc2; fuse_in_header_t *finh; int32_t flags; + off_t off; + /* + * The man page of copy_file_range tells that the offset + * arguments are of type loff_t *. Here in fuse state, the values of + * those offsets are saved instead of pointers as the kernel sends + * the values of the offsets from those pointers instead of pointers. + * But the type loff_t is linux specific and is actually a typedef of + * off64_t. Hence using off64_t + */ + off64_t off_in; /* for copy_file_range source fd */ + off64_t off_out; /* for copy_file_range destination fd */ size_t size; unsigned long nlookup; fd_t *fd; + fd_t *fd_dst; /* for copy_file_range destination */ dict_t *xattr; dict_t *xdata; char *name; diff --git a/xlators/protocol/client/src/client-common.c b/xlators/protocol/client/src/client-common.c index 7708c820918..64db98d661b 100644 --- a/xlators/protocol/client/src/client-common.c +++ b/xlators/protocol/client/src/client-common.c @@ -2555,6 +2555,38 @@ out: return -op_errno; } +int +client_pre_copy_file_range_v2(xlator_t *this, gfx_copy_file_range_req *req, + fd_t *fd_in, off64_t off_in, fd_t *fd_out, + off64_t off_out, size_t size, int32_t flags, + dict_t **xdata) +{ + int64_t remote_fd_in = -1; + int64_t remote_fd_out = -1; + int op_errno = ESTALE; + + CLIENT_GET_REMOTE_FD(this, fd_in, FALLBACK_TO_ANON_FD, remote_fd_in, + op_errno, out); + + CLIENT_GET_REMOTE_FD(this, fd_out, FALLBACK_TO_ANON_FD, remote_fd_out, + op_errno, out); + req->size = size; + req->off_in = off_in; + req->off_out = off_out; + req->fd_in = remote_fd_in; + req->fd_out = remote_fd_out; + req->flag = flags; + + memcpy(req->gfid1, fd_in->inode->gfid, 16); + memcpy(req->gfid2, fd_out->inode->gfid, 16); + + dict_to_xdr(*xdata, &req->xdata); + + return 0; +out: + return -op_errno; +} + int client_pre_statfs_v2(xlator_t *this, gfx_statfs_req *req, loc_t *loc, dict_t *xdata) diff --git a/xlators/protocol/client/src/client-common.h b/xlators/protocol/client/src/client-common.h index 5214eae128e..a2043d8742a 100644 --- a/xlators/protocol/client/src/client-common.h +++ b/xlators/protocol/client/src/client-common.h @@ -621,4 +621,10 @@ client_post_rename_v2(xlator_t *this, gfx_rename_rsp *rsp, struct iatt *stbuf, struct iatt *prenewparent, struct iatt *postnewparent, dict_t **xdata); +int +client_pre_copy_file_range_v2(xlator_t *this, gfx_copy_file_range_req *req, + fd_t *fd_in, off64_t off_in, fd_t *fd_out, + off64_t off_out, size_t size, int32_t flags, + dict_t **xdata); + #endif /* __CLIENT_COMMON_H__ */ diff --git a/xlators/protocol/client/src/client-helpers.c b/xlators/protocol/client/src/client-helpers.c index 849fdfca0bc..55e87b3c370 100644 --- a/xlators/protocol/client/src/client-helpers.c +++ b/xlators/protocol/client/src/client-helpers.c @@ -2459,6 +2459,20 @@ client_handle_fop_requirements_v2( lease, this, &this_req->compound_req_v2_u.compound_lease_req, op_errno, out, &args->loc, &args->lease, args->xdata); break; + case GF_FOP_COPY_FILE_RANGE: + /* + * Not going to handle the copy_file_range fop in compound + * operation. This is because, compound operation is going + * to be removed. In fact, AFR one of the heavy consumer of + * compound operations has stopped using that. + * https://github.com/gluster/glusterfs/issues/414 + * Therefore, sending ENOTSUP error for this fop coming as + * comound request. Though, there was no need of handling + * "case GF_FOP_COPY_FILE_RANGE" technically, this comment + * under the label of GF_FOP_COPY_FILE_RANGE will help in + * understanding that this fop does not handle the compund + * request and why. + */ default: return ENOTSUP; } @@ -2631,6 +2645,14 @@ compound_request_cleanup_v2(gfx_compound_req *req) case GF_FOP_SEEK: CLIENT4_COMPOUND_FOP_CLEANUP(curr_req, seek); break; + case GF_FOP_COPY_FILE_RANGE: + /* + * This fop is not handled in compund operations. + * Check the comment added under this fop's section + * in the compound_request_cleanup_v2. Therefore + * keeping this label only as a placeholder with + * a message that, this fop is not handled. + */ default: break; } @@ -3004,6 +3026,12 @@ client_process_response_v2(call_frame_t *frame, xlator_t *this, &this_args_cbk->lease, xdata); break; } + case GF_FOP_COPY_FILE_RANGE: + /* + * Not handling this fop. Returning ENOTSUP. Check + * the comment added for this fop in the function + * client_handle_fop_requirements_v2. + */ default: return -ENOTSUP; } diff --git a/xlators/protocol/client/src/client-rpc-fops_v2.c b/xlators/protocol/client/src/client-rpc-fops_v2.c index ca180c1db4b..8f3ee41e5c5 100644 --- a/xlators/protocol/client/src/client-rpc-fops_v2.c +++ b/xlators/protocol/client/src/client-rpc-fops_v2.c @@ -2833,6 +2833,72 @@ out: return 0; } +int +client4_0_copy_file_range_cbk(struct rpc_req *req, struct iovec *iov, int count, + void *myframe) +{ + gfx_common_3iatt_rsp rsp = { + 0, + }; + call_frame_t *frame = NULL; + struct iatt stbuf = { + 0, + }; + struct iatt prestat = { + 0, + }; + struct iatt poststat = { + 0, + }; + int ret = 0; + xlator_t *this = NULL; + dict_t *xdata = NULL; + clnt_local_t *local = NULL; + + this = THIS; + + frame = myframe; + local = frame->local; + + if (-1 == req->rpc_status) { + rsp.op_ret = -1; + rsp.op_errno = ENOTCONN; + goto out; + } + + ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gfx_common_3iatt_rsp); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, EINVAL, PC_MSG_XDR_DECODING_FAILED, + "XDR decoding failed"); + rsp.op_ret = -1; + rsp.op_errno = EINVAL; + goto out; + } + + ret = client_post_common_3iatt(this, &rsp, &stbuf, &prestat, &poststat, + &xdata); + if (ret < 0) + goto out; +out: + if (rsp.op_ret == -1) { + gf_msg(this->name, GF_LOG_WARNING, gf_error_to_errno(rsp.op_errno), + PC_MSG_REMOTE_OP_FAILED, "remote operation failed"); + } else if (rsp.op_ret >= 0) { + if (local->attempt_reopen) + client_attempt_reopen(local->fd, this); + if (local->attempt_reopen_out) + client_attempt_reopen(local->fd_out, this); + } + CLIENT_STACK_UNWIND(copy_file_range, frame, rsp.op_ret, + gf_error_to_errno(rsp.op_errno), &stbuf, &prestat, + &poststat, xdata); + + if (xdata) + dict_unref(xdata); + + return 0; +} + int32_t client4_0_releasedir(call_frame_t *frame, xlator_t *this, void *data) { @@ -5845,6 +5911,80 @@ unwind: return 0; } +int32_t +client4_0_copy_file_range(call_frame_t *frame, xlator_t *this, void *data) +{ + clnt_args_t *args = NULL; + clnt_conf_t *conf = NULL; + clnt_local_t *local = NULL; + gfx_copy_file_range_req req = { + { + 0, + }, + }; + int op_errno = ESTALE; + int ret = 0; + + if (!frame || !this || !data) + goto unwind; + + args = data; + conf = this->private; + + ret = client_pre_copy_file_range_v2(this, &req, args->fd, args->off_in, + args->fd_out, args->off_out, args->size, + args->flags, &args->xdata); + + if (ret) { + op_errno = -ret; + goto unwind; + } + + ret = client_fd_fop_prepare_local(frame, args->fd, req.fd_in); + if (ret) { + op_errno = -ret; + goto unwind; + } + + /* + * Since frame->local is allocated in above function call + * itself, better to use it (with the assumption that it + * has been allocated) directly instead of again calling + * client_fd_fop_prepare_local or modifying it, as doing + * so requires changes in other places as well. + */ + + local = frame->local; + local->fd_out = fd_ref(args->fd_out); + local->attempt_reopen_out = client_is_reopen_needed(args->fd_out, this, + req.fd_out); + + ret = client_submit_request( + this, &req, frame, conf->fops, GFS3_OP_COPY_FILE_RANGE, + client4_0_copy_file_range_cbk, NULL, NULL, 0, NULL, 0, NULL, + (xdrproc_t)xdr_gfx_copy_file_range_req); + if (ret) { + /* + * If the lower layers fail to submit a request, they'll also + * do the unwind for us (see rpc_clnt_submit), so don't unwind + * here in such cases. + */ + gf_msg(this->name, GF_LOG_WARNING, 0, PC_MSG_FOP_SEND_FAILED, + "failed to send the fop"); + } + + GF_FREE(req.xdata.pairs.pairs_val); + + return 0; + +unwind: + CLIENT_STACK_UNWIND(copy_file_range, frame, -1, op_errno, NULL, NULL, NULL, + NULL); + GF_FREE(req.xdata.pairs.pairs_val); + + return 0; +} + int32_t client4_0_fsetattr(call_frame_t *frame, xlator_t *this, void *data) { @@ -6257,6 +6397,7 @@ rpc_clnt_procedure_t clnt4_0_fop_actors[GF_FOP_MAXVALUE] = { [GF_FOP_COMPOUND] = {"COMPOUND", client4_0_compound}, [GF_FOP_ICREATE] = {"ICREATE", client4_0_icreate}, [GF_FOP_NAMELINK] = {"NAMELINK", client4_0_namelink}, + [GF_FOP_COPY_FILE_RANGE] = {"COPY-FILE-RANGE", client4_0_copy_file_range}, }; rpc_clnt_prog_t clnt4_0_fop_prog = { diff --git a/xlators/protocol/client/src/client.c b/xlators/protocol/client/src/client.c index 38723b43b45..c8e84f6e1b7 100644 --- a/xlators/protocol/client/src/client.c +++ b/xlators/protocol/client/src/client.c @@ -1129,6 +1129,41 @@ out: return 0; } +int32_t +client_copy_file_range(call_frame_t *frame, xlator_t *this, fd_t *fd_in, + off_t off_in, fd_t *fd_out, off_t off_out, size_t len, + uint32_t flags, dict_t *xdata) +{ + int ret = -1; + clnt_conf_t *conf = NULL; + rpc_clnt_procedure_t *proc = NULL; + clnt_args_t args = { + 0, + }; + + conf = this->private; + if (!conf || !conf->fops) + goto out; + + args.fd = fd_in; + args.fd_out = fd_out; + args.offset = off_in; + args.off_out = off_out; + args.size = len; + args.flags = flags; + args.xdata = xdata; + + proc = &conf->fops->proctable[GF_FOP_COPY_FILE_RANGE]; + if (proc->fn) + ret = proc->fn(frame, this, &args); +out: + if (ret) + STACK_UNWIND_STRICT(copy_file_range, frame, -1, ENOTCONN, NULL, NULL, + NULL, NULL); + + return 0; +} + static gf_boolean_t is_client_rpc_init_command(dict_t *dict, xlator_t *this, char **value) { @@ -2898,6 +2933,7 @@ struct xlator_fops fops = { .icreate = client_icreate, .namelink = client_namelink, .put = client_put, + .copy_file_range = client_copy_file_range, }; struct xlator_dumpops dumpops = { diff --git a/xlators/protocol/client/src/client.h b/xlators/protocol/client/src/client.h index 5fc75a84628..71f84f3ca89 100644 --- a/xlators/protocol/client/src/client.h +++ b/xlators/protocol/client/src/client.h @@ -269,6 +269,7 @@ typedef struct client_local { loc_t loc; loc_t loc2; fd_t *fd; + fd_t *fd_out; /* used in copy_file_range */ clnt_fd_ctx_t *fdctx; uint32_t flags; struct iobref *iobref; @@ -280,6 +281,11 @@ typedef struct client_local { pthread_mutex_t mutex; char *name; gf_boolean_t attempt_reopen; + /* + * The below boolean variable is used + * only for copy_file_range fop + */ + gf_boolean_t attempt_reopen_out; /* required for compound fops */ compound_args_t *compound_args; unsigned int length; /* length of a compound fop */ @@ -289,7 +295,13 @@ typedef struct client_local { typedef struct client_args { loc_t *loc; + /* + * This is the source fd for copy_file_range and + * the default fd for any other fd based fop which + * requires only one fd (i.e. opetates on one fd) + */ fd_t *fd; + fd_t *fd_out; /* this is the destination fd for copy_file_range */ const char *linkname; struct iobref *iobref; struct iovec *vector; @@ -301,7 +313,17 @@ typedef struct client_args { struct gf_flock *flock; const char *volume; const char *basename; + off_t offset; + /* + * According to the man page of copy_file_range, + * the offsets for source and destination file + * are of type loff_t. But the type loff_t is + * linux specific and is actual a typedef of + * off64_t. + */ + off64_t off_in; /* used in copy_file_range for source fd */ + off64_t off_out; /* used in copy_file_range for dst fd */ int32_t mask; int32_t cmd; size_t size; diff --git a/xlators/protocol/server/src/server-common.c b/xlators/protocol/server/src/server-common.c index 25b36155065..0639ac3feb3 100644 --- a/xlators/protocol/server/src/server-common.c +++ b/xlators/protocol/server/src/server-common.c @@ -540,6 +540,16 @@ server4_post_common_3iatt(server_state_t *state, gfx_common_3iatt_rsp *rsp, inode_unref(link_inode); } +void +server4_post_common_3iatt_noinode(gfx_common_3iatt_rsp *rsp, struct iatt *stbuf, + struct iatt *prebuf_dst, + struct iatt *postbuf_dst) +{ + gfx_stat_from_iattx(&rsp->stat, stbuf); + gfx_stat_from_iattx(&rsp->preparent, prebuf_dst); + gfx_stat_from_iattx(&rsp->postparent, postbuf_dst); +} + void server4_post_common_2iatt(gfx_common_2iatt_rsp *rsp, struct iatt *prebuf, struct iatt *postbuf) diff --git a/xlators/protocol/server/src/server-common.h b/xlators/protocol/server/src/server-common.h index 2844ee95756..6200415e304 100644 --- a/xlators/protocol/server/src/server-common.h +++ b/xlators/protocol/server/src/server-common.h @@ -192,3 +192,8 @@ void server4_post_link(server_state_t *state, gfx_common_3iatt_rsp *rsp, inode_t *inode, struct iatt *stbuf, struct iatt *pre, struct iatt *post); + +void +server4_post_common_3iatt_noinode(gfx_common_3iatt_rsp *rsp, struct iatt *stbuf, + struct iatt *prebuf_dst, + struct iatt *postbuf_dst); diff --git a/xlators/protocol/server/src/server-helpers.c b/xlators/protocol/server/src/server-helpers.c index c55a422679d..8ad2d8492ed 100644 --- a/xlators/protocol/server/src/server-helpers.c +++ b/xlators/protocol/server/src/server-helpers.c @@ -4948,6 +4948,8 @@ server_populate_compound_response_v2(xlator_t *this, gfx_compound_rsp *rsp, rsp_args->op_errno = gf_errno_to_error(this_args_cbk->op_errno); break; } + case GF_FOP_COPY_FILE_RANGE: + /* Not handling this fop. */ default: return ENOTSUP; } @@ -5380,6 +5382,12 @@ server_get_compound_resolve_v2(server_state_t *state, gfx_compound_req *req) memcpy(state->resolve.gfid, this_req.gfid, 16); break; } + case GF_FOP_COPY_FILE_RANGE: + /* + * Compound operations is not being used anymore and + * planned for subsequent removal. Hence not handling + * this fop here. + */ default: return ENOTSUP; } diff --git a/xlators/protocol/server/src/server-resolve.c b/xlators/protocol/server/src/server-resolve.c index 26260a5ee2c..ec768acba44 100644 --- a/xlators/protocol/server/src/server-resolve.c +++ b/xlators/protocol/server/src/server-resolve.c @@ -545,14 +545,39 @@ server_resolve_fd(call_frame_t *frame) return 0; } - state->fd = gf_fd_fdptr_get(serv_ctx->fdtable, fd_no); - + /* + * With copy_file_range, there will be 2 fds to resolve. + * This same function is called to resolve both the source + * fd and the destination fd. As of now, this function does + * not have any mechanism to distinguish between the 2 fds + * being resolved except for checking the value of state->fd. + * The assumption is that, if source fd the one which is + * being resolved here, then state->fd would be NULL. If it + * is not NULL, then it is the destination fd which is being + * resolved. + * This method (provided the above assumption is true) is + * to achieve the ability to distinguish between 2 fds with + * minimum changes being done to this function. If this way + * is not correct, then more changes might be needed. + */ if (!state->fd) { - gf_msg("", GF_LOG_INFO, EBADF, PS_MSG_FD_NOT_FOUND, - "fd not " - "found in context"); - resolve->op_ret = -1; - resolve->op_errno = EBADF; + state->fd = gf_fd_fdptr_get(serv_ctx->fdtable, fd_no); + if (!state->fd) { + gf_msg("", GF_LOG_INFO, EBADF, PS_MSG_FD_NOT_FOUND, + "fd not " + "found in context"); + resolve->op_ret = -1; + resolve->op_errno = EBADF; + } + } else { + state->fd_out = gf_fd_fdptr_get(serv_ctx->fdtable, fd_no); + if (!state->fd_out) { + gf_msg("", GF_LOG_INFO, EBADF, PS_MSG_FD_NOT_FOUND, + "fd not " + "found in context"); + resolve->op_ret = -1; + resolve->op_errno = EBADF; + } } server_resolve_all(frame); diff --git a/xlators/protocol/server/src/server-rpc-fops_v2.c b/xlators/protocol/server/src/server-rpc-fops_v2.c index c5a8e482621..16570294f6d 100644 --- a/xlators/protocol/server/src/server-rpc-fops_v2.c +++ b/xlators/protocol/server/src/server-rpc-fops_v2.c @@ -2259,6 +2259,64 @@ out: return 0; } +int +server4_copy_file_range_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *stbuf, struct iatt *prebuf_dst, + struct iatt *postbuf_dst, dict_t *xdata) +{ + gfx_common_3iatt_rsp rsp = { + 0, + }; + server_state_t *state = NULL; + rpcsvc_request_t *req = NULL; + char in_gfid[GF_UUID_BUF_SIZE] = {0}; + char out_gfid[GF_UUID_BUF_SIZE] = {0}; + + dict_to_xdr(xdata, &rsp.xdata); + + if (op_ret < 0) { + state = CALL_STATE(frame); + + uuid_utoa_r(state->resolve.gfid, in_gfid); + uuid_utoa_r(state->resolve2.gfid, out_gfid); + + gf_msg(this->name, fop_log_level(GF_FOP_COPY_FILE_RANGE, op_errno), + op_errno, PS_MSG_WRITE_INFO, + "%" PRId64 ": COPY_FILE_RANGE %" PRId64 " (%s), %" PRId64 + " (%s) client: %s, " + "error-xlator: %s", + frame->root->unique, state->resolve.fd_no, in_gfid, + state->resolve2.fd_no, out_gfid, STACK_CLIENT_NAME(frame->root), + STACK_ERR_XL_NAME(frame->root)); + goto out; + } + + /* + * server4_post_common_3iatt (ex: used by server4_put_cbk and some + * other cbks) also performs inode linking along with copying of 3 + * iatt structures to the response. But, for copy_file_range, linking + * of inode is not needed. Therefore a new function is used to + * construct the response using 3 iatt structures. + * @stbuf: iatt or stat of the source file (or fd) + * @prebuf_dst: iatt or stat of destination file (or fd) before the fop + * @postbuf_dst: iatt or stat of destination file (or fd) after the fop + */ + server4_post_common_3iatt_noinode(&rsp, stbuf, prebuf_dst, postbuf_dst); + +out: + rsp.op_ret = op_ret; + rsp.op_errno = gf_errno_to_error(op_errno); + + req = frame->local; + server_submit_reply(frame, req, &rsp, NULL, 0, NULL, + (xdrproc_t)xdr_gfx_common_3iatt_rsp); + + GF_FREE(rsp.xdata.pairs.pairs_val); + + return 0; +} + /* Resume function section */ int @@ -3447,6 +3505,29 @@ err: return 0; } +int +server4_copy_file_range_resume(call_frame_t *frame, xlator_t *bound_xl) +{ + server_state_t *state = NULL; + + state = CALL_STATE(frame); + + if (state->resolve.op_ret != 0) + goto err; + + STACK_WIND(frame, server4_copy_file_range_cbk, bound_xl, + bound_xl->fops->copy_file_range, state->fd, state->off_in, + state->fd_out, state->off_out, state->size, state->flags, + state->xdata); + + return 0; +err: + server4_copy_file_range_cbk(frame, NULL, frame->this, state->resolve.op_ret, + state->resolve.op_errno, NULL, NULL, NULL, + NULL); + return 0; +} + int server4_0_stat(rpcsvc_request_t *req) { @@ -6104,6 +6185,53 @@ out: return ret; } +int +server4_0_copy_file_range(rpcsvc_request_t *req) +{ + server_state_t *state = NULL; + call_frame_t *frame = NULL; + gfx_copy_file_range_req args = { + { + 0, + }, + }; + ssize_t len = 0; + int ret = -1; + int op_errno = 0; + + if (!req) + return ret; + + ret = rpc_receive_common(req, &frame, &state, &len, &args, + xdr_gfx_copy_file_range_req, + GF_FOP_COPY_FILE_RANGE); + if (ret != 0) { + goto out; + } + + state->resolve.type = RESOLVE_MUST; + state->resolve.fd_no = args.fd_in; + state->resolve2.type = RESOLVE_MUST; /*making this resolve must */ + state->resolve2.fd_no = args.fd_out; + state->off_in = args.off_in; + state->off_out = args.off_out; + state->size = args.size; + state->flags = args.flag; + memcpy(state->resolve.gfid, args.gfid1, 16); + memcpy(state->resolve2.gfid, args.gfid2, 16); + + xdr_to_dict(&args.xdata, &state->xdata); + + ret = 0; + resolve_and_resume(frame, server4_copy_file_range_resume); +out: + + if (op_errno) + SERVER_REQ_SET_ERROR(req, ret); + + return ret; +} + rpcsvc_actor_t glusterfs4_0_fop_actors[] = { [GFS3_OP_NULL] = {"NULL", GFS3_OP_NULL, server_null, NULL, 0}, [GFS3_OP_STAT] = {"STAT", GFS3_OP_STAT, server4_0_stat, NULL, 0}, @@ -6195,6 +6323,8 @@ rpcsvc_actor_t glusterfs4_0_fop_actors[] = { DRC_NA}, [GFS3_OP_NAMELINK] = {"NAMELINK", GFS3_OP_NAMELINK, server4_0_namelink, NULL, 0, DRC_NA}, + [GFS3_OP_COPY_FILE_RANGE] = {"COPY-FILE-RANGE", GFS3_OP_COPY_FILE_RANGE, + server4_0_copy_file_range, NULL, 0, DRC_NA}, }; struct rpcsvc_program glusterfs4_0_fop_prog = { diff --git a/xlators/protocol/server/src/server.h b/xlators/protocol/server/src/server.h index 2a77aba1f3c..bdf98c96f1c 100644 --- a/xlators/protocol/server/src/server.h +++ b/xlators/protocol/server/src/server.h @@ -180,7 +180,12 @@ struct _server_state { struct iatt stbuf; int valid; + /* + * this fd is used in all the fd based operations PLUS + * as a source fd in copy_file_range + */ fd_t *fd; + fd_t *fd_out; /* destination fd in copy_file_range */ dict_t *params; int32_t flags; int wbflags; @@ -191,6 +196,15 @@ struct _server_state { size_t size; off_t offset; + /* + * According to the man page of copy_file_range, + * the offsets for source and destination file + * are of type loff_t. But the type loff_t is + * linux specific and is actual a typedef of + * off64_t. + */ + off64_t off_in; /* source offset in copy_file_range */ + off64_t off_out; /* destination offset in copy_file_range */ mode_t mode; dev_t dev; size_t nr_count; diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c index ed0516d4c4a..54fc1dc1195 100644 --- a/xlators/storage/posix/src/posix-helpers.c +++ b/xlators/storage/posix/src/posix-helpers.c @@ -3314,3 +3314,23 @@ unlock: out: return ret; } + +int +posix_check_dev_file(xlator_t *this, inode_t *inode, char *fop, int *op_errno) +{ + int ret = -1; + + if (inode->ia_type == IA_IFBLK || inode->ia_type == IA_IFCHR) { + *op_errno = EINVAL; + gf_msg(this->name, GF_LOG_ERROR, *op_errno, P_MSG_INVALID_ARGUMENT, + "%s received on %s file (%s)", fop, + (inode->ia_type == IA_IFBLK) ? "block" : "char", + uuid_utoa(inode->gfid)); + goto out; + } + + ret = 0; + +out: + return ret; +} diff --git a/xlators/storage/posix/src/posix-inode-fd-ops.c b/xlators/storage/posix/src/posix-inode-fd-ops.c index 9e2b37f582c..dafd1855ef9 100644 --- a/xlators/storage/posix/src/posix-inode-fd-ops.c +++ b/xlators/storage/posix/src/posix-inode-fd-ops.c @@ -1959,6 +1959,274 @@ out: return 0; } +int32_t +posix_copy_file_range(call_frame_t *frame, xlator_t *this, fd_t *fd_in, + off64_t off_in, fd_t *fd_out, off64_t off_out, size_t len, + uint32_t flags, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + int _fd_in = -1; + int _fd_out = -1; + struct posix_private *priv = NULL; + struct posix_fd *pfd_in = NULL; + struct posix_fd *pfd_out = NULL; + struct iatt preop_dst = { + 0, + }; + struct iatt postop_dst = { + 0, + }; + struct iatt stbuf = { + 0, + }; + int ret = -1; + dict_t *rsp_xdata = NULL; + int is_append = 0; + gf_boolean_t locked = _gf_false; + gf_boolean_t update_atomic = _gf_false; + posix_inode_ctx_t *ctx = NULL; + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(fd_in, out); + VALIDATE_OR_GOTO(fd_in->inode, out); + VALIDATE_OR_GOTO(fd_out, out); + VALIDATE_OR_GOTO(fd_out->inode, out); + VALIDATE_OR_GOTO(this->private, out); + + priv = this->private; + + VALIDATE_OR_GOTO(priv, out); + DISK_SPACE_CHECK_AND_GOTO(frame, priv, xdata, op_ret, op_errno, out); + + if (posix_check_dev_file(this, fd_in->inode, "copy_file_range", &op_errno)) + goto out; + + if (posix_check_dev_file(this, fd_out->inode, "copy_file_range", &op_errno)) + goto out; + + ret = posix_fd_ctx_get(fd_in, this, &pfd_in, &op_errno); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, ret, P_MSG_PFD_NULL, + "pfd is NULL from fd=%p", fd_in); + goto out; + } + + _fd_in = pfd_in->fd; + + ret = posix_fd_ctx_get(fd_out, this, &pfd_out, &op_errno); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, ret, P_MSG_PFD_NULL, + "pfd is NULL from fd=%p", fd_out); + goto out; + } + + _fd_out = pfd_out->fd; + + /* + * Currently, the internal write is checked via xdata which + * is set by some xlator above. It could be due to several of + * the reasons such as healing or a snapshot operation happening + * using copy_file_range. As of now (i.e. writing the patch with + * this change) none of the xlators above posix are using the + * internal write with copy_file_range. In future it might + * change. Atleast as of now the hope is that, when that happens + * this functon or fop does not require additional changes for + * handling internal writes. + */ + ret = posix_check_internal_writes(this, fd_out, _fd_out, xdata); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "possible overwrite from internal client, fd=%p", fd_out); + op_ret = -1; + op_errno = EBUSY; + goto out; + } + + if (xdata) { + if (dict_get(xdata, GLUSTERFS_WRITE_UPDATE_ATOMIC)) + update_atomic = _gf_true; + } + + /* + * The update_atomic option is to instruct posix to do prestat, + * write and poststat atomically. This is to prevent any modification to + * ia_size and ia_blocks until poststat and the diff in their values + * between pre and poststat could be of use for some translators. + * This is similar to the atomic write operation. atmoic write is + * (i.e. prestat + write + poststat) used by shard as of now. In case, + * some xlator needs copy_file_range to be atomic from prestat and postat + * prespective (i.e. prestat + copy_file_range + poststat) then it has + * to send "GLUSTERFS_WRITE_UPDATE_ATOMIC" key in xdata. + */ + + op_ret = posix_inode_ctx_get_all(fd_out->inode, this, &ctx); + if (op_ret < 0) { + op_errno = ENOMEM; + goto out; + } + + if (update_atomic) { + ret = pthread_mutex_lock(&ctx->write_atomic_lock); + if (!ret) + locked = _gf_true; + else { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_MUTEX_FAILED, + "failed to hold write atomic lock on %s", + uuid_utoa(fd_out->inode->gfid)); + goto out; + } + } + + op_ret = posix_fdstat(this, fd_out->inode, _fd_out, &preop_dst); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_FSTAT_FAILED, + "pre-operation fstat failed on fd=%p", fd_out); + goto out; + } + + /* + * Since, only the destination file (fd_out) is undergoing + * modification, the write related tests are done on that. + * i.e. this is treater similar to as if the destination file + * undergoing write fop from maintenance perspective. + */ + if (xdata) { + op_ret = posix_cs_maintenance(this, fd_out, NULL, &_fd_out, &preop_dst, + NULL, xdata, &rsp_xdata, _gf_false); + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "file state check failed, fd %p", fd_out); + op_errno = EIO; + goto out; + } + } + + /* + * NOTE: This is just doing a single execution of copy_file_range + * system call. If the returned value of this system call is less + * than len, then should we keep doing it in a for loop until the + * copy_file_range of all the len bytes is done? + * Check the example program provided in the man page of + * copy_file_range. + * If so, then a separate variables for both off_in and off_out + * should be used which are initialized to off_in and off_out + * that this function call receives, but then advanced by the + * value returned by sys_copy_file_range and then use that as + * off_in and off_out for next instance of copy_file_range execution. + */ + op_ret = sys_copy_file_range(_fd_in, &off_in, _fd_out, &off_out, len, + flags); + + if (op_ret < 0) { + op_errno = -op_ret; + op_ret = -1; + gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_COPY_FILE_RANGE_FAILED, + "copy_file_range failed: fd_in: %p (gfid: %s) ," + " fd_out %p (gfid:%s)", + fd_in, uuid_utoa(fd_in->inode->gfid), fd_out, + uuid_utoa(fd_out->inode->gfid)); + goto out; + } + + /* + * Let this be as it is for now. This function collects + * infomration such as open fd count etc. So, even though + * is_append does not apply to copy_file_range, for now, + * allowing it to be recorded in the dict as _gf_false. + */ + rsp_xdata = _fill_writev_xdata(fd_out, xdata, this, is_append); + + /* copy_file_range successful, we also need to get the stat of + * the file we wrote to (i.e. destination file or fd_out). + */ + ret = posix_fdstat(this, fd_out->inode, _fd_out, &postop_dst); + if (ret == -1) { + op_ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_FSTAT_FAILED, + "post-operation fstat failed on fd=%p", fd_out); + goto out; + } + + /* + * Also perform the stat on the source fd (i.e. fd_in). For now, + * allowing it to be done within the locked region if the request + * is for atomic operation (and update) of copy_file_range. + */ + ret = posix_fdstat(this, fd_in->inode, _fd_in, &stbuf); + if (ret == -1) { + op_ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_FSTAT_FAILED, + "post-operation fstat failed on fd=%p", fd_in); + goto out; + } + + /* + * The core logic of what time attributes are to be updated + * on a fop is decided at client side xlator utime. + * All the remaining fops call posix_set_ctime function + * to update the {a,m,c}time. But, for all the other fops, + * the operation is happening on only one file (or inode). + * But here, there are 2 fds (source and destination). Hence + * the new function below to update the appropriate times for + * both the source and the destination file. + * For the source file, if at all anything has to be updated, + * it would be atime (as that file is only read, not updated). + * For the destination file, the attributes that require the + * modification would be mtime and ctime. + * What times have to be changed is actually determined by + * utime xlator. But, all of them would be in frame->root->flags. + * So, currently posix assumes that, the atime flag is for + * the source file and the other 2 flags are for the destination + * file. Since, the assumption is rigid (i.e. atime for source + * and {m,c}time for destination), the below function is called + * posix_set_ctime_cfr (cfr standing for copy_file_range). + * FUTURE TODO: + * In future, some other functionality or fop might operate + * simultaneously on 2 files. Then, depending upon what that new + * fop does or what are its requirements, the below function might + * require changes to become generic for consumption in case of + * simultaneous operations on 2 files. + */ + posix_set_ctime_cfr(frame, this, NULL, pfd_in->fd, fd_in->inode, &stbuf, + NULL, pfd_out->fd, fd_out->inode, &postop_dst); + + if (locked) { + pthread_mutex_unlock(&ctx->write_atomic_lock); + locked = _gf_false; + } + + /* + * Record copy_file_range in priv->write_value for now. + * If not needed, remove below section of code along with + * this comment (or add comment to explain why it is not + * needed). + */ + LOCK(&priv->lock); + { + priv->write_value += op_ret; + } + UNLOCK(&priv->lock); + +out: + + if (locked) { + pthread_mutex_unlock(&ctx->write_atomic_lock); + locked = _gf_false; + } + + STACK_UNWIND_STRICT(copy_file_range, frame, op_ret, op_errno, &stbuf, + &preop_dst, &postop_dst, rsp_xdata); + + if (rsp_xdata) + dict_unref(rsp_xdata); + return 0; +} + int32_t posix_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { diff --git a/xlators/storage/posix/src/posix-messages.h b/xlators/storage/posix/src/posix-messages.h index 62af32ac8fe..928723db8f9 100644 --- a/xlators/storage/posix/src/posix-messages.h +++ b/xlators/storage/posix/src/posix-messages.h @@ -67,6 +67,7 @@ GLFS_MSGID(POSIX, P_MSG_XATTR_FAILED, P_MSG_NULL_GFID, P_MSG_FCNTL_FAILED, P_MSG_ANCESTORY_FAILED, P_MSG_DISK_SPACE_CHECK_FAILED, P_MSG_FALLOCATE_FAILED, P_MSG_STOREMDATA_FAILED, P_MSG_FETCHMDATA_FAILED, P_MSG_GETMDATA_FAILED, - P_MSG_SETMDATA_FAILED, P_MSG_FRESHFILE); + P_MSG_SETMDATA_FAILED, P_MSG_FRESHFILE, P_MSG_MUTEX_FAILED, + P_MSG_COPY_FILE_RANGE_FAILED); #endif /* !_GLUSTERD_MESSAGES_H_ */ diff --git a/xlators/storage/posix/src/posix-metadata.c b/xlators/storage/posix/src/posix-metadata.c index 26fae2019b5..62669a0b83f 100644 --- a/xlators/storage/posix/src/posix-metadata.c +++ b/xlators/storage/posix/src/posix-metadata.c @@ -663,3 +663,81 @@ posix_set_parent_ctime(call_frame_t *frame, xlator_t *this, out: return; } + +void +posix_set_ctime_cfr(call_frame_t *frame, xlator_t *this, + const char *real_path_in, int fd_in, inode_t *inode_in, + struct iatt *stbuf_in, const char *real_path_out, + int fd_out, inode_t *inode_out, struct iatt *stbuf_out) +{ + posix_mdata_flag_t flag = { + 0, + }; + posix_mdata_flag_t flag_dup = { + 0, + }; + int ret = 0; + struct posix_private *priv = NULL; + + priv = this->private; + + if (priv->ctime) { + (void)posix_get_mdata_flag(frame->root->flags, &flag); + if ((flag.ctime == 0) && (flag.mtime == 0) && (flag.atime == 0)) { + goto out; + } + + if (frame->root->ctime.tv_sec == 0) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_SETMDATA_FAILED, + "posix set mdata failed, No ctime : in: %s gfid_in:%s " + "out: %s gfid_out:%s", + real_path_in, + inode_in ? uuid_utoa(inode_in->gfid) : "No inode", + real_path_out, + inode_out ? uuid_utoa(inode_out->gfid) : "No inode"); + goto out; + } + + flag_dup = flag; + + /* + * For the destination file, no need to update atime. + * It got modified. Hence the things that need to be + * changed are mtime and ctime (provided the utime + * xlator from the client has set those flags, which + * are just copied to flag_dup). + */ + if (flag.atime) + flag_dup.atime = 0; + + ret = posix_set_mdata_xattr(this, real_path_out, fd_out, inode_out, + &frame->root->ctime, stbuf_out, &flag_dup, + _gf_false); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_SETMDATA_FAILED, + "posix set mdata failed on file: %s gfid:%s", real_path_out, + inode_out ? uuid_utoa(inode_out->gfid) : "No inode"); + } + + /* + * For the source file, no need to change the mtime and ctime. + * For source file, it is only read operation. So, if at all + * anything needs to be updated, it is only the atime. + */ + if (flag.atime) + flag_dup.atime = flag.atime; + flag_dup.mtime = 0; + flag_dup.ctime = 0; + + ret = posix_set_mdata_xattr(this, real_path_in, fd_out, inode_out, + &frame->root->ctime, stbuf_out, &flag_dup, + _gf_false); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_SETMDATA_FAILED, + "posix set mdata failed on file: %s gfid:%s", real_path_in, + inode_in ? uuid_utoa(inode_in->gfid) : "No inode"); + } + } +out: + return; +} diff --git a/xlators/storage/posix/src/posix-metadata.h b/xlators/storage/posix/src/posix-metadata.h index e1b549d55a1..3416148ea97 100644 --- a/xlators/storage/posix/src/posix-metadata.h +++ b/xlators/storage/posix/src/posix-metadata.h @@ -48,5 +48,10 @@ void posix_set_parent_ctime(call_frame_t *frame, xlator_t *this, const char *real_path, int fd, inode_t *inode, struct iatt *stbuf); +void +posix_set_ctime_cfr(call_frame_t *frame, xlator_t *this, + const char *real_path_in, int fd_in, inode_t *inode_in, + struct iatt *stbuf_in, const char *read_path_put, + int fd_out, inode_t *inode_out, struct iatt *stbuf_out); #endif /* _POSIX_METADATA_H */ diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c index d6a20135f96..42b965434b9 100644 --- a/xlators/storage/posix/src/posix.c +++ b/xlators/storage/posix/src/posix.c @@ -76,6 +76,7 @@ struct xlator_fops fops = { .seek = posix_seek, .lease = posix_lease, .put = posix_put, + .copy_file_range = posix_copy_file_range, }; struct xlator_cbks cbks = { diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h index 1f1d4fc2774..a1ec996f4b2 100644 --- a/xlators/storage/posix/src/posix.h +++ b/xlators/storage/posix/src/posix.h @@ -637,6 +637,11 @@ posix_put(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, mode_t umask, uint32_t flags, struct iovec *vector, int32_t count, off_t offset, struct iobref *iobref, dict_t *xattr, dict_t *xdata); +int32_t +posix_copy_file_range(call_frame_t *frame, xlator_t *this, fd_t *fd_in, + off64_t off_in, fd_t *fd_out, off64_t off_out, size_t len, + uint32_t flags, dict_t *xdata); + int32_t posix_set_mode_in_dict(dict_t *in_dict, dict_t *out_dict, struct iatt *in_stbuf); @@ -656,5 +661,7 @@ int posix_cs_maintenance(xlator_t *this, fd_t *fd, loc_t *loc, int *pfd, struct iatt *buf, const char *realpath, dict_t *xattr_req, dict_t **xattr_rsp, gf_boolean_t ignore_failure); +int +posix_check_dev_file(xlator_t *this, inode_t *inode, char *fop, int *op_errno); #endif /* _POSIX_H */ -- cgit