/*
   Copyright (c) 2018 Red Hat, Inc.
   This file is part of GlusterFS.

   This file is licensed to you under your choice of the GNU Lesser
   General Public License, version 3 or any later version (LGPLv3 or
   later), or the GNU General Public License, version 2 (GPLv2), in all
   cases as published by the Free Software Foundation.
*/

#include "xlator.h"
#include "posix-metadata.h"
#include "posix-metadata-disk.h"
#include "posix-handle.h"
#include "posix-messages.h"
#include "syscall.h"
#include "compat-errno.h"
#include "compat.h"

/* Counter handed to GF_LOG_OCCASIONALLY for the "xattrs not supported"
 * warning in posix_fetch_mdata_xattr. */
static int gf_posix_xattr_enotsup_log;

/* posix_mdata_to_disk converts posix_mdata_t into network byte order to
 * save it on disk in machine independent format.
 *
 * 'version' is copied without byte swapping (presumably a single-byte
 * field -- confirm against posix-metadata-disk.h).
 */
static inline void
posix_mdata_to_disk(posix_mdata_disk_t *out, const posix_mdata_t *in)
{
    out->version = in->version;
    out->flags = htobe64(in->flags);

    out->ctime.tv_sec = htobe64(in->ctime.tv_sec);
    out->ctime.tv_nsec = htobe64(in->ctime.tv_nsec);

    out->mtime.tv_sec = htobe64(in->mtime.tv_sec);
    out->mtime.tv_nsec = htobe64(in->mtime.tv_nsec);

    out->atime.tv_sec = htobe64(in->atime.tv_sec);
    out->atime.tv_nsec = htobe64(in->atime.tv_nsec);
}

/* posix_mdata_from_disk converts posix_mdata_disk_t into host byte order.
 * It is the exact inverse of posix_mdata_to_disk. */
static inline void
posix_mdata_from_disk(posix_mdata_t *out, const posix_mdata_disk_t *in)
{
    out->version = in->version;
    out->flags = be64toh(in->flags);

    out->ctime.tv_sec = be64toh(in->ctime.tv_sec);
    out->ctime.tv_nsec = be64toh(in->ctime.tv_nsec);

    out->mtime.tv_sec = be64toh(in->mtime.tv_sec);
    out->mtime.tv_nsec = be64toh(in->mtime.tv_nsec);

    out->atime.tv_sec = be64toh(in->atime.tv_sec);
    out->atime.tv_nsec = be64toh(in->atime.tv_nsec);
}

/* posix_fetch_mdata_xattr fetches the posix_mdata_t from disk.
 *
 * The GF_XATTR_MDATA_KEY xattr is read from, in order of preference:
 *   - the open descriptor '_fd' when it is not -1,
 *   - 'real_path_arg' when it is non-NULL,
 *   - the handle path built from inode->gfid otherwise.
 *
 * On success 0 is returned and '*metadata' holds the decoded (host byte
 * order) values. On failure -1 is returned; '*op_errno' is set for every
 * failure except a NULL 'metadata' argument.
 */
static int
posix_fetch_mdata_xattr(xlator_t *this, const char *real_path_arg, int _fd,
                        inode_t *inode, posix_mdata_t *metadata, int *op_errno)
{
    /* The getxattr family returns ssize_t; keep it signed so that the
     * -1 error value is compared without relying on unsigned wrap. */
    ssize_t size = -1;
    int op_ret = -1;
    char *value = NULL;
    gf_boolean_t fd_based_fop = _gf_false;
    char gfid_str[64] = {0};
    char *real_path = NULL;
    const char *path = NULL;
    const char *log_path = NULL;
    char *key = GF_XATTR_MDATA_KEY;

    if (!metadata) {
        op_ret = -1;
        goto out;
    }

    if (_fd != -1) {
        fd_based_fop = _gf_true;
    }

    if (!(fd_based_fop || real_path_arg)) {
        MAKE_HANDLE_PATH(real_path, this, inode->gfid, NULL);
        if (!real_path) {
            /* Save errno before logging; the log call may clobber it. */
            *op_errno = errno;
            uuid_utoa_r(inode->gfid, gfid_str);
            gf_msg(this->name, GF_LOG_WARNING, *op_errno, P_MSG_LSTAT_FAILED,
                   "lstat on gfid %s failed", gfid_str);
            op_ret = -1;
            goto out;
        }
    }

    /* real_path is only built when real_path_arg is NULL, so at most one
     * of the two is set. */
    path = real_path_arg ? real_path_arg : real_path;
    log_path = path ? path : "null";

    /* First call with a NULL buffer only queries the size of the value. */
    if (fd_based_fop) {
        size = sys_fgetxattr(_fd, key, NULL, 0);
    } else {
        size = sys_lgetxattr(path, key, NULL, 0);
    }

    if (size == -1) {
        *op_errno = errno;
        if ((*op_errno == ENOTSUP) || (*op_errno == ENOSYS)) {
            GF_LOG_OCCASIONALLY(gf_posix_xattr_enotsup_log, this->name,
                                GF_LOG_WARNING,
                                "Extended attributes not "
                                "supported (try remounting"
                                " brick with 'user_xattr' "
                                "flag)");
        } else if (*op_errno == ENOATTR || *op_errno == ENODATA) {
            gf_msg_debug(this->name, 0,
                         "No such attribute:%s for file %s "
                         "gfid: %s",
                         key, log_path, uuid_utoa(inode->gfid));
        } else {
            gf_msg(this->name, GF_LOG_DEBUG, *op_errno, P_MSG_XATTR_FAILED,
                   "getxattr failed"
                   " on %s gfid: %s key: %s ",
                   log_path, uuid_utoa(inode->gfid), key);
        }
        op_ret = -1;
        goto out;
    }

    value = GF_CALLOC(size + 1, sizeof(char), gf_posix_mt_char);
    if (!value) {
        op_ret = -1;
        *op_errno = ENOMEM;
        goto out;
    }

    if (fd_based_fop) {
        size = sys_fgetxattr(_fd, key, value, size);
    } else {
        size = sys_lgetxattr(path, key, value, size);
    }

    if (size == -1) {
        op_ret = -1;
        *op_errno = errno;
        gf_msg(this->name, GF_LOG_ERROR, *op_errno, P_MSG_XATTR_FAILED,
               "getxattr failed on "
               " on %s gfid: %s key: %s ",
               log_path, uuid_utoa(inode->gfid), key);
        goto out;
    }

    /* A truncated or empty xattr must not be decoded: the buffer is only
     * size + 1 bytes long, so reading a full posix_mdata_disk_t from it
     * would run past the allocation. */
    if ((size_t)size < sizeof(posix_mdata_disk_t)) {
        op_ret = -1;
        *op_errno = EINVAL;
        gf_msg(this->name, GF_LOG_ERROR, EINVAL, P_MSG_XATTR_FAILED,
               "xattr value too short (%zd bytes) on %s gfid: %s key: %s ",
               size, log_path, uuid_utoa(inode->gfid), key);
        goto out;
    }

    posix_mdata_from_disk(metadata, (posix_mdata_disk_t *)value);

    op_ret = 0;
out:
    GF_FREE(value);
    return op_ret;
}

/* posix_store_mdata_xattr stores the posix_mdata_t on disk.
 *
 * The target is chosen exactly as in posix_fetch_mdata_xattr: 'fd' when
 * it is not -1, else 'real_path_arg', else the gfid handle path.
 * Returns the result of the setxattr call (0 on success), or -1 when
 * 'metadata' is NULL or the handle path cannot be built. Any negative
 * result is logged here.
 */
static int
posix_store_mdata_xattr(xlator_t *this, const char *real_path_arg, int fd,
                        inode_t *inode, posix_mdata_t *metadata)
{
    char *real_path = NULL;
    int op_ret = 0;
    gf_boolean_t fd_based_fop = _gf_false;
    char *key = GF_XATTR_MDATA_KEY;
    char gfid_str[64] = {0};
    posix_mdata_disk_t disk_metadata;

    if (!metadata) {
        op_ret = -1;
        goto out;
    }

    if (fd != -1) {
        fd_based_fop = _gf_true;
    }

    if (!(fd_based_fop || real_path_arg)) {
        MAKE_HANDLE_PATH(real_path, this, inode->gfid, NULL);
        if (!real_path) {
            uuid_utoa_r(inode->gfid, gfid_str);
            gf_msg(this->name, GF_LOG_DEBUG, errno, P_MSG_LSTAT_FAILED,
                   "lstat on gfid %s failed", gfid_str);
            op_ret = -1;
            goto out;
        }
    }

    /* Convert to the byte-order independent on-disk layout. The version
     * field is whatever the caller put in 'metadata'. */
    posix_mdata_to_disk(&disk_metadata, metadata);

    if (fd_based_fop) {
        op_ret = sys_fsetxattr(fd, key, (void *)&disk_metadata,
                               sizeof(posix_mdata_disk_t), 0);
    } else if (real_path_arg) {
        op_ret = sys_lsetxattr(real_path_arg, key, (void *)&disk_metadata,
                               sizeof(posix_mdata_disk_t), 0);
    } else if (real_path) {
        op_ret = sys_lsetxattr(real_path, key, (void *)&disk_metadata,
                               sizeof(posix_mdata_disk_t), 0);
    }

#ifdef GF_DARWIN_HOST_OS
    /* NOTE(review): 'value' is not declared anywhere in this function, so
     * this branch cannot compile when GF_DARWIN_HOST_OS is defined. The
     * expected argument type of posix_dump_buffer is not visible here, so
     * the call is left as-is -- needs fixing by someone with that context.
     */
    if (real_path_arg) {
        posix_dump_buffer(this, real_path_arg, key, value, 0);
    } else if (real_path) {
        posix_dump_buffer(this, real_path, key, value, 0);
    }
#endif
out:
    if (op_ret < 0) {
        gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_XATTR_FAILED,
               "file: %s: gfid: %s key:%s ",
               real_path ? real_path : (real_path_arg ? real_path_arg : "null"),
               uuid_utoa(inode->gfid), key);
    }
    return op_ret;
}

/* __posix_get_mdata_xattr gets posix_mdata_t from inode context. If it fails
 * to get it from inode context, gets it from disk. This is without inode
 * lock; the caller is expected to hold inode->lock (see
 * posix_get_mdata_xattr).
 *
 * When mdata is available and 'stbuf' is non-NULL, the ctime/mtime/atime
 * fields of 'stbuf' are overwritten from it.
 *
 * Returns -1 only when 'inode' is NULL or memory allocation fails. A
 * missing or unreadable xattr is not an error: 0 is returned and 'stbuf'
 * is left untouched.
 */
int
__posix_get_mdata_xattr(xlator_t *this, const char *real_path, int _fd,
                        inode_t *inode, struct iatt *stbuf)
{
    posix_mdata_t *mdata = NULL;
    int ret = -1;
    int op_errno = 0;

    GF_VALIDATE_OR_GOTO(this->name, inode, out);

    ret = __inode_ctx_get1(inode, this, (uint64_t *)&mdata);
    if (ret == -1 || !mdata) {
        mdata = GF_CALLOC(1, sizeof(posix_mdata_t), gf_posix_mt_mdata_attr);
        if (!mdata) {
            ret = -1;
            goto out;
        }

        ret = posix_fetch_mdata_xattr(this, real_path, _fd, inode, mdata,
                                      &op_errno);

        if (ret == 0) {
            /* Got mdata from disk, set it in inode ctx. This case
             * is hit when in-memory status is lost due to brick
             * down scenario
             */
            __inode_ctx_set1(inode, this, (uint64_t *)&mdata);
        } else {
            /* Failed to get mdata from disk, xattr missing.
             * This happens on two cases.
             * 1. File is created before ctime is enabled.
             * 2. On new file creation.
             *
             * Do nothing, just return success. It is as
             * good as ctime feature is not enabled for this
             * file. For files created before ctime is enabled,
             * time attributes gets updated into ctime structure
             * once the metadata modification fop happens and
             * time attributes become consistent eventually.
             * For new files, it would obviously get updated
             * before the fop completion.
             */
            if (!(stbuf && op_errno != ENOENT)) {
                /* This case should not be hit. If it hits,
                 * don't fail, log warning, free mdata and move
                 * on
                 */
                gf_msg(this->name, GF_LOG_WARNING, op_errno,
                       P_MSG_FETCHMDATA_FAILED, "file: %s: gfid: %s key:%s ",
                       real_path ? real_path : "null", uuid_utoa(inode->gfid),
                       GF_XATTR_MDATA_KEY);
            }
            /* mdata was never stored in the inode ctx, so it is still
             * owned here and must be released on every failure path. */
            GF_FREE(mdata);
            ret = 0;
            goto out;
        }
    }

    ret = 0;

    if (stbuf) {
        stbuf->ia_ctime = mdata->ctime.tv_sec;
        stbuf->ia_ctime_nsec = mdata->ctime.tv_nsec;
        stbuf->ia_mtime = mdata->mtime.tv_sec;
        stbuf->ia_mtime_nsec = mdata->mtime.tv_nsec;
        stbuf->ia_atime = mdata->atime.tv_sec;
        stbuf->ia_atime_nsec = mdata->atime.tv_nsec;
    }

out:
    return ret;
}

/* posix_get_mdata_xattr gets posix_mdata_t from inode context. If it fails
 * to get it from inode context, gets it from disk. This is with inode lock:
 * it wraps __posix_get_mdata_xattr in LOCK/UNLOCK of inode->lock.
 *
 * Returns -1 when 'inode' is NULL, otherwise whatever
 * __posix_get_mdata_xattr returns.
 */
int
posix_get_mdata_xattr(xlator_t *this, const char *real_path, int _fd,
                      inode_t *inode, struct iatt *stbuf)
{
    int ret = -1;

    GF_VALIDATE_OR_GOTO(this->name, inode, out);

    LOCK(&inode->lock);
    {
        ret = __posix_get_mdata_xattr(this, real_path, _fd, inode, stbuf);
    }
    UNLOCK(&inode->lock);

out:
    return ret;
}

/* posix_compare_timespec orders two timestamps.
 *
 * Returns a positive value when 'first' is later than 'second', a
 * negative value when it is earlier and 0 when both are equal. The
 * result is computed by comparison rather than subtraction so that a
 * large difference in tv_sec/tv_nsec cannot be truncated (or have its
 * sign flipped) when narrowed to int.
 */
static int
posix_compare_timespec(const struct timespec *first,
                       const struct timespec *second)
{
    if (first->tv_sec != second->tv_sec) {
        return (first->tv_sec > second->tv_sec) ? 1 : -1;
    }

    if (first->tv_nsec != second->tv_nsec) {
        return (first->tv_nsec > second->tv_nsec) ? 1 : -1;
    }

    return 0;
}

/* posix_set_mdata_xattr updates the posix_mdata_t based on the flag
 * in inode context and stores it on disk.
 *
 * this, inode, time, flag: must be non-NULL, otherwise -1 is returned.
 * real_path, fd:           select the file to update, as in
 *                          posix_store_mdata_xattr.
 * stbuf:                   optional; on success its ctime/mtime/atime
 *                          fields are overwritten with the stored values.
 * update_utime:            when true, mtime/atime are set to 'time'
 *                          unconditionally (explicit utime request);
 *                          otherwise they only ever move forward.
 *
 * Returns 0 on success, non-zero on failure.
 */
static int
posix_set_mdata_xattr(xlator_t *this, const char *real_path, int fd,
                      inode_t *inode, struct timespec *time,
                      struct iatt *stbuf, posix_mdata_flag_t *flag,
                      gf_boolean_t update_utime)
{
    posix_mdata_t *mdata = NULL;
    int ret = -1;
    int op_errno = 0;

    GF_VALIDATE_OR_GOTO("posix", this, out);
    GF_VALIDATE_OR_GOTO(this->name, inode, out);
    /* Both are dereferenced unconditionally below. */
    GF_VALIDATE_OR_GOTO(this->name, time, out);
    GF_VALIDATE_OR_GOTO(this->name, flag, out);

    LOCK(&inode->lock);
    {
        ret = __inode_ctx_get1(inode, this, (uint64_t *)&mdata);
        if (ret == -1 || !mdata) {
            /*
             * Do we need to fetch the data from xattr?
             * If we do, we can compare the value and store
             * the largest data in inode ctx.
             */
            mdata = GF_CALLOC(1, sizeof(posix_mdata_t),
                              gf_posix_mt_mdata_attr);
            if (!mdata) {
                ret = -1;
                goto unlock;
            }

            ret = posix_fetch_mdata_xattr(this, real_path, fd, inode, mdata,
                                          &op_errno);

            if (ret != 0) {
                /*
                 * This is the first time creating the time
                 * attr. This happens when you activate this
                 * feature, and the legacy file will not have
                 * any xattr set.
                 *
                 * New files will create extended attributes.
                 */

                /*
                 * TODO: This is wrong approach, because before
                 * creating fresh xattr, we should consult
                 * to all replica and/or distribution set.
                 *
                 * We should contact the time management
                 * xlators, and ask them to create an xattr.
                 */
                /* We should not be relying on backend file's
                 * time attributes to load the initial ctime
                 * time attribute structure. This is incorrect
                 * as each replica set would have witnessed the
                 * file creation at different times.
                 *
                 * For new file creation, ctime, atime and mtime
                 * should be same, hence initiate the ctime
                 * structure with the time from the frame. But
                 * for the files which were created before ctime
                 * feature is enabled, this is not accurate but
                 * still fine as the times would get eventually
                 * accurate.
                 */
                mdata->version = 1;
                mdata->flags = 0;
                mdata->ctime.tv_sec = time->tv_sec;
                mdata->ctime.tv_nsec = time->tv_nsec;
                mdata->atime.tv_sec = time->tv_sec;
                mdata->atime.tv_nsec = time->tv_nsec;
                mdata->mtime.tv_sec = time->tv_sec;
                mdata->mtime.tv_nsec = time->tv_nsec;
            }

            /* Whether it was read from disk (in-memory state lost, e.g.
             * after the brick went down) or freshly initialised, the
             * inode ctx takes the new mdata from here on. */
            __inode_ctx_set1(inode, this, (uint64_t *)&mdata);
        }

        /* Earlier, mdata was updated only if the existing time is less
         * than the time to be updated. This would fail the scenarios
         * where mtime can be set to any time using the syscall. Hence
         * just updating without comparison. But the ctime is not
         * allowed to changed to older date.
         */
        if (flag->ctime && posix_compare_timespec(time, &mdata->ctime) > 0) {
            mdata->ctime = *time;
        }

        /* In distributed systems, there could be races with fops
         * updating mtime/atime which could result in different
         * mtime/atime for same file. So this makes sure, only the
         * highest time is retained. If the mtime/atime update comes
         * from the explicit utime syscall, it is allowed to set to
         * previous time
         */
        if (update_utime) {
            if (flag->mtime) {
                mdata->mtime = *time;
            }
            if (flag->atime) {
                mdata->atime = *time;
            }
        } else {
            if (flag->mtime &&
                posix_compare_timespec(time, &mdata->mtime) > 0) {
                mdata->mtime = *time;
            }
            if (flag->atime &&
                posix_compare_timespec(time, &mdata->atime) > 0) {
                mdata->atime = *time;
            }
        }

        /*
         * TODO: when inode->ia_type == IA_INVAL this is a non-linked
         * inode. So we have to sync the data into backend. Because
         * inode_link may return a different inode.
         */

        /*
         * With this patch set, we are setting the xattr for each update
         * We should evaluate the performance, and based on that we can
         * decide on asynchronous updation.
         */
        ret = posix_store_mdata_xattr(this, real_path, fd, inode, mdata);
        if (ret) {
            gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_STOREMDATA_FAILED,
                   "file: %s: gfid: %s key:%s ",
                   real_path ? real_path : "null", uuid_utoa(inode->gfid),
                   GF_XATTR_MDATA_KEY);
            goto unlock;
        }

        /* Copy out while still holding the lock: mdata lives in the
         * inode ctx and is modified by other callers under this lock. */
        if (stbuf) {
            stbuf->ia_ctime = mdata->ctime.tv_sec;
            stbuf->ia_ctime_nsec = mdata->ctime.tv_nsec;
            stbuf->ia_mtime = mdata->mtime.tv_sec;
            stbuf->ia_mtime_nsec = mdata->mtime.tv_nsec;
            stbuf->ia_atime = mdata->atime.tv_sec;
            stbuf->ia_atime_nsec = mdata->atime.tv_nsec;
        }
    }
unlock:
    UNLOCK(&inode->lock);
out:
    return ret;
}

/* posix_update_utime_in_mdata updates the posix_mdata_t when mtime/atime
 * is modified using syscall.
 *
 * 'valid' selects which of stbuf->ia_atime / stbuf->ia_mtime are applied
 * (GF_SET_ATTR_ATIME / GF_SET_ATTR_MTIME). An mtime update also requests
 * a ctime update. Nothing is done (other than a warning) when 'inode' is
 * NULL or the ctime feature is disabled in the private structure.
 *
 * The 'fd' argument is not used: -1 is always passed down, so the update
 * goes through 'real_path' or the gfid handle.
 */
void
posix_update_utime_in_mdata(xlator_t *this, const char *real_path, int fd,
                            inode_t *inode, struct iatt *stbuf, int valid)
{
    int32_t ret = 0;
    /* posix_set_mdata_xattr takes a struct timespec and stores tv_nsec
     * as nanoseconds, so always build a timespec here regardless of
     * whether utimensat is available on the platform. */
    struct timespec tv = {
        0,
    };
    posix_mdata_flag_t flag = {
        0,
    };

    struct posix_private *priv = NULL;

    priv = this->private;

    if (inode && priv->ctime) {
        if ((valid & GF_SET_ATTR_ATIME) == GF_SET_ATTR_ATIME) {
            tv.tv_sec = stbuf->ia_atime;
            tv.tv_nsec = stbuf->ia_atime_nsec;

            flag.ctime = 0;
            flag.mtime = 0;
            flag.atime = 1;
            ret = posix_set_mdata_xattr(this, real_path, -1, inode, &tv, NULL,
                                        &flag, _gf_true);
            if (ret) {
                gf_msg(this->name, GF_LOG_WARNING, errno,
                       P_MSG_SETMDATA_FAILED,
                       "posix set mdata atime failed on file:"
                       " %s gfid:%s",
                       real_path, uuid_utoa(inode->gfid));
            }
        }

        if ((valid & GF_SET_ATTR_MTIME) == GF_SET_ATTR_MTIME) {
            tv.tv_sec = stbuf->ia_mtime;
            tv.tv_nsec = stbuf->ia_mtime_nsec;

            flag.ctime = 1;
            flag.mtime = 1;
            flag.atime = 0;

            ret = posix_set_mdata_xattr(this, real_path, -1, inode, &tv, NULL,
                                        &flag, _gf_true);
            if (ret) {
                gf_msg(this->name, GF_LOG_WARNING, errno,
                       P_MSG_SETMDATA_FAILED,
                       "posix set mdata mtime failed on file:"
                       " %s gfid:%s",
                       real_path, uuid_utoa(inode->gfid));
            }
        }

    } else {
        gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_SETMDATA_FAILED,
               "posix utime set mdata failed on file");
    }
    return;
}

/* posix_get_mdata_flag decodes the MDATA_CTIME / MDATA_MTIME / MDATA_ATIME
 * bits of 'flags' into '*flag'. A NULL 'flag' is ignored. */
static void
posix_get_mdata_flag(uint64_t flags, posix_mdata_flag_t *flag)
{
    if (!flag)
        return;

    flag->ctime = 0;
    flag->atime = 0;
    flag->mtime = 0;

    if (flags & MDATA_CTIME)
        flag->ctime = 1;

    if (flags & MDATA_MTIME)
        flag->mtime = 1;

    if (flags & MDATA_ATIME)
        flag->atime = 1;
}

/* posix_get_parent_mdata_flag decodes the MDATA_PAR_CTIME /
 * MDATA_PAR_MTIME / MDATA_PAR_ATIME bits of 'flags' into '*flag'.
 * A NULL 'flag' is ignored. */
static void
posix_get_parent_mdata_flag(uint64_t flags, posix_mdata_flag_t *flag)
{
    if (!flag)
        return;

    flag->ctime = 0;
    flag->atime = 0;
    flag->mtime = 0;

    if (flags & MDATA_PAR_CTIME)
        flag->ctime = 1;

    if (flags & MDATA_PAR_MTIME)
        flag->mtime = 1;

    if (flags & MDATA_PAR_ATIME)
        flag->atime = 1;
}

/* posix_set_ctime applies the time carried in frame->root->ctime to the
 * mdata of 'inode', for the time fields selected by frame->root->flags.
 *
 * It is a no-op when the ctime feature is disabled or when no time flag
 * is set. A zero frame->root->ctime.tv_sec is logged and skipped.
 * Failures are only logged; nothing is reported to the caller.
 */
void
posix_set_ctime(call_frame_t *frame, xlator_t *this, const char *real_path,
                int fd, inode_t *inode, struct iatt *stbuf)
{
    posix_mdata_flag_t flag = {
        0,
    };
    int ret = 0;
    struct posix_private *priv = NULL;

    priv = this->private;

    if (priv->ctime) {
        (void)posix_get_mdata_flag(frame->root->flags, &flag);
        if ((flag.ctime == 0) && (flag.mtime == 0) && (flag.atime == 0)) {
            goto out;
        }

        if (frame->root->ctime.tv_sec == 0) {
            gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_SETMDATA_FAILED,
                   "posix set mdata failed, No ctime : %s gfid:%s", real_path,
                   inode ? uuid_utoa(inode->gfid) : "No inode");
            goto out;
        }

        ret = posix_set_mdata_xattr(this, real_path, fd, inode,
                                    &frame->root->ctime, stbuf, &flag,
                                    _gf_false);
        if (ret) {
            gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_SETMDATA_FAILED,
                   "posix set mdata failed on file: %s gfid:%s", real_path,
                   inode ? uuid_utoa(inode->gfid) : "No inode");
        }
    }
out:
    return;
}

/* posix_set_parent_ctime is the parent-directory counterpart of
 * posix_set_ctime: the time fields to update are selected by the
 * MDATA_PAR_* bits of frame->root->flags.
 *
 * It is a no-op when 'inode' is NULL, the ctime feature is disabled or
 * no parent time flag is set. Failures are only logged.
 */
void
posix_set_parent_ctime(call_frame_t *frame, xlator_t *this,
                       const char *real_path, int fd, inode_t *inode,
                       struct iatt *stbuf)
{
    posix_mdata_flag_t flag = {
        0,
    };
    int ret = 0;
    struct posix_private *priv = NULL;

    priv = this->private;

    if (inode && priv->ctime) {
        (void)posix_get_parent_mdata_flag(frame->root->flags, &flag);
        if ((flag.ctime == 0) && (flag.mtime == 0) && (flag.atime == 0)) {
            goto out;
        }

        ret = posix_set_mdata_xattr(this, real_path, fd, inode,
                                    &frame->root->ctime, stbuf, &flag,
                                    _gf_false);
        if (ret) {
            gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_SETMDATA_FAILED,
                   "posix set mdata failed on file: %s gfid:%s", real_path,
                   uuid_utoa(inode->gfid));
        }
    }
out:
    return;
}