diff options
Diffstat (limited to 'xlators/storage/posix/src/posix-metadata.c')
-rw-r--r-- | xlators/storage/posix/src/posix-metadata.c | 510 |
1 files changed, 510 insertions, 0 deletions
diff --git a/xlators/storage/posix/src/posix-metadata.c b/xlators/storage/posix/src/posix-metadata.c new file mode 100644 index 00000000000..4e75a4f1411 --- /dev/null +++ b/xlators/storage/posix/src/posix-metadata.c @@ -0,0 +1,510 @@ +/* + Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "xlator.h" +#include "posix-metadata.h" +#include "posix-metadata-disk.h" +#include "posix-handle.h" +#include "posix-messages.h" +#include "syscall.h" +#include "compat-errno.h" +#include "compat.h" + +static int gf_posix_xattr_enotsup_log; + +/* posix_mdata_to_disk converts posix_mdata_t into network byte order to + * save it on disk in machine independant format + */ +static inline void +posix_mdata_to_disk (posix_mdata_disk_t *out, posix_mdata_t *in) +{ + out->version = in->version; + out->flags = htobe64(in->flags); + + out->ctime.tv_sec = htobe64(in->ctime.tv_sec); + out->ctime.tv_nsec = htobe64(in->ctime.tv_nsec); + + out->mtime.tv_sec = htobe64(in->mtime.tv_sec); + out->mtime.tv_nsec = htobe64(in->mtime.tv_nsec); + + out->atime.tv_sec = htobe64(in->atime.tv_sec); + out->atime.tv_nsec = htobe64(in->atime.tv_nsec); +} + +/* posix_mdata_from_disk converts posix_mdata_disk_t into host byte order + */ +static inline void +posix_mdata_from_disk (posix_mdata_t *out, posix_mdata_disk_t *in) +{ + out->version = in->version; + out->flags = be64toh(in->flags); + + out->ctime.tv_sec = be64toh(in->ctime.tv_sec); + out->ctime.tv_nsec = be64toh(in->ctime.tv_nsec); + + out->mtime.tv_sec = be64toh(in->mtime.tv_sec); + out->mtime.tv_nsec = be64toh(in->mtime.tv_nsec); + + out->atime.tv_sec = be64toh(in->atime.tv_sec); + out->atime.tv_nsec = be64toh(in->atime.tv_nsec); +} + +/* posix_fetch_mdata_xattr fetches the posix_mdata_t from disk */ +static int +posix_fetch_mdata_xattr (xlator_t *this, const char *real_path_arg, int _fd, + inode_t *inode, posix_mdata_t *metadata) +{ + size_t size = -1; + int op_errno = 0; + int op_ret = -1; + char *value = NULL; + gf_boolean_t fd_based_fop = _gf_false; + char gfid_str[64] = {0}; + char *real_path = NULL; + + char *key = GF_XATTR_MDATA_KEY; + + if (!metadata) { + op_ret = -1; + goto out; + } + + if (_fd != -1) { + fd_based_fop = _gf_true; + } + if (!(fd_based_fop || real_path_arg)) { + MAKE_HANDLE_PATH (real_path, this, inode->gfid, NULL); + if (!real_path) { + uuid_utoa_r (inode->gfid, gfid_str); + gf_msg (this->name, GF_LOG_WARNING, op_errno, + P_MSG_LSTAT_FAILED, "lstat on gfid %s failed", + gfid_str); + op_ret = -1; + goto out; + } + } + + if (fd_based_fop) { + size = sys_fgetxattr (_fd, key, NULL, 0); + } else if (real_path_arg) { + size = sys_lgetxattr (real_path_arg, key, NULL, 0); + } else if (real_path) { + size = sys_lgetxattr (real_path, key, NULL, 0); + } + + if (size == -1) { + op_errno = errno; + if ((op_errno == ENOTSUP) || (op_errno == ENOSYS)) { + GF_LOG_OCCASIONALLY (gf_posix_xattr_enotsup_log, + this->name, GF_LOG_WARNING, + "Extended attributes not " + "supported (try remounting" + " brick with 'user_xattr' " + "flag)"); + } else if (op_errno == ENOATTR || + op_errno == ENODATA) { + gf_msg_debug (this->name, 0, + "No such attribute:%s for file %s " + "gfid: %s", + key, real_path ? real_path : (real_path_arg ? real_path_arg : "null"), + uuid_utoa(inode->gfid)); + } else { + gf_msg (this->name, GF_LOG_DEBUG, op_errno, + P_MSG_XATTR_FAILED, "getxattr failed" + " on %s gfid: %s key: %s ", + real_path ? real_path : (real_path_arg ? real_path_arg : "null"), + uuid_utoa(inode->gfid), key); + } + op_ret = -1; + goto out; + } + + value = GF_CALLOC (size + 1, sizeof(char), gf_posix_mt_char); + if (!value) { + op_ret = -1; + op_errno = ENOMEM; + goto out; + } + + if (fd_based_fop) { + size = sys_fgetxattr (_fd, key, value, size); + } else if (real_path_arg) { + size = sys_lgetxattr (real_path_arg, key, value, size); + } else if (real_path) { + size = sys_lgetxattr (real_path, key, value, size); + } + if (size == -1) { + op_ret = -1; + op_errno = errno; + gf_msg (this->name, GF_LOG_ERROR, errno, + P_MSG_XATTR_FAILED, "getxattr failed on " + " on %s gfid: %s key: %s ", + real_path ? real_path : (real_path_arg ? real_path_arg : "null"), + uuid_utoa(inode->gfid), key); + goto out; + } + + posix_mdata_from_disk (metadata, (posix_mdata_disk_t*)value); + + op_ret = 0; +out: + GF_FREE (value); + return op_ret; +} + +/* posix_store_mdata_xattr stores the posix_mdata_t on disk */ +static int +posix_store_mdata_xattr (xlator_t *this, const char *real_path_arg, int fd, + inode_t *inode, posix_mdata_t *metadata) +{ + char *real_path = NULL; + int op_ret = 0; + gf_boolean_t fd_based_fop = _gf_false; + char *key = GF_XATTR_MDATA_KEY; + char gfid_str[64] = {0}; + posix_mdata_disk_t disk_metadata; + + if (!metadata) { + op_ret = -1; + goto out; + } + + if (fd != -1) { + fd_based_fop = _gf_true; + } + if (!(fd_based_fop || real_path_arg)) { + MAKE_HANDLE_PATH (real_path, this, inode->gfid, NULL); + if (!real_path) { + uuid_utoa_r (inode->gfid, gfid_str); + gf_msg (this->name, GF_LOG_DEBUG, errno, + P_MSG_LSTAT_FAILED, "lstat on gfid %s failed", + gfid_str); + op_ret = -1; + goto out; + } + } + + /* Set default version as 1 */ + posix_mdata_to_disk (&disk_metadata, metadata); + + if (fd_based_fop) { + op_ret = sys_fsetxattr (fd, key, + (void *) &disk_metadata, + sizeof (posix_mdata_disk_t), 0); + } else if (real_path_arg) { + op_ret = sys_lsetxattr (real_path_arg, key, + (void *) &disk_metadata, + sizeof (posix_mdata_disk_t), 0); + } else if (real_path) { + op_ret = sys_lsetxattr (real_path, key, + (void *) &disk_metadata, + sizeof (posix_mdata_disk_t), 0); + } + +#ifdef GF_DARWIN_HOST_OS + if (real_path_arg) { + posix_dump_buffer(this, real_path_arg, key, value, 0); + } else if (real_path) { + posix_dump_buffer(this, real_path, key, value, 0); + } +#endif +out: + if (op_ret < 0) { + gf_msg (this->name, GF_LOG_ERROR, errno, P_MSG_XATTR_FAILED, + "file: %s: gfid: %s key:%s ", + real_path ? real_path : (real_path_arg ? real_path_arg : "null"), + uuid_utoa(inode->gfid), key); + } + return op_ret; +} + +/* _posix_get_mdata_xattr gets posix_mdata_t from inode context. If it fails + * to get it from inode context, gets it from disk. This is with out inode lock. + */ +int +__posix_get_mdata_xattr (xlator_t *this, const char *real_path, int _fd, + inode_t *inode, struct iatt *stbuf) +{ + posix_mdata_t *mdata = NULL; + int ret = -1; + + GF_VALIDATE_OR_GOTO (this->name, inode, out); + + ret = __inode_ctx_get1 (inode, this, + (uint64_t *)&mdata); + if (ret == -1 || !mdata) { + mdata = GF_CALLOC (1, sizeof (posix_mdata_t), + gf_posix_mt_mdata_attr); + if (!mdata) { + ret = -1; + goto out; + } + + ret = posix_fetch_mdata_xattr (this, real_path, _fd, inode, + mdata); + + if (ret == 0) { + /* Got mdata from disk, set it in inode ctx. This case + * is hit when in-memory status is lost due to brick + * down scenario + */ + __inode_ctx_set1 (inode, this, (uint64_t *)&mdata); + } else { + /* Failed to get mdata from disk, xattr missing + * Even new file creation hits here first as posix_pstat + * is generally done before posix_set_ctime + */ + if (stbuf) { + mdata->version = 1; + mdata->flags = 0; + mdata->ctime.tv_sec = stbuf->ia_ctime; + mdata->ctime.tv_nsec = stbuf->ia_ctime_nsec; + mdata->mtime.tv_sec = stbuf->ia_mtime; + mdata->mtime.tv_nsec = stbuf->ia_mtime_nsec; + mdata->atime.tv_sec = stbuf->ia_atime; + mdata->atime.tv_nsec = stbuf->ia_atime_nsec; + ret = posix_store_mdata_xattr (this, real_path, + _fd, inode, + mdata); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, errno, + P_MSG_STOREMDATA_FAILED, + "file: %s: gfid: %s key:%s ", + real_path ? real_path : "null", + uuid_utoa(inode->gfid), + GF_XATTR_MDATA_KEY); + goto out; + } + __inode_ctx_set1 (inode, this, (uint64_t *)&mdata); + } else { + /* This case should not be hit. If it hits, don't + * fail, log warning, free mdata and move on + */ + gf_msg (this->name, GF_LOG_WARNING, errno, + P_MSG_FETCHMDATA_FAILED, + "file: %s: gfid: %s key:%s ", + real_path ? real_path : "null", + uuid_utoa(inode->gfid), + GF_XATTR_MDATA_KEY); + GF_FREE (mdata); + ret = 0; + goto out; + } + } + } + + ret = 0; + + if (ret == 0 && stbuf) { + stbuf->ia_ctime = mdata->ctime.tv_sec; + stbuf->ia_ctime_nsec = mdata->ctime.tv_nsec; + stbuf->ia_mtime = mdata->mtime.tv_sec; + stbuf->ia_mtime_nsec = mdata->mtime.tv_nsec; + stbuf->ia_atime = mdata->atime.tv_sec; + stbuf->ia_atime_nsec = mdata->atime.tv_nsec; + } + +out: + return ret; +} + +/* posix_get_mdata_xattr gets posix_mdata_t from inode context. If it fails + * to get it from inode context, gets it from disk. This is with inode lock. + */ +int +posix_get_mdata_xattr (xlator_t *this, const char *real_path, int _fd, + inode_t *inode, struct iatt *stbuf) +{ + int ret = -1; + + GF_VALIDATE_OR_GOTO (this->name, inode, out); + + LOCK (&inode->lock); + { + ret = __posix_get_mdata_xattr (this, real_path, _fd, inode, stbuf); + } + UNLOCK (&inode->lock); + +out: + return ret; +} + +static int +posix_compare_timespec (struct timespec *first, struct timespec *second) +{ + if (first->tv_sec == second->tv_sec) + return first->tv_nsec - second->tv_nsec; + else + return first->tv_sec - second->tv_sec; +} + +/* posix_update_mdata_xattr updates the posix_mdata_t based on the flag + * in inode context and stores it on disk + */ +int +posix_set_mdata_xattr (xlator_t *this, const char *real_path, int fd, + inode_t *inode, struct timespec *time, + struct iatt *stbuf, posix_mdata_flag_t *flag) +{ + posix_mdata_t *mdata = NULL; + int ret = -1; + + GF_VALIDATE_OR_GOTO ("posix", this, out); + GF_VALIDATE_OR_GOTO (this->name, inode, out); + GF_VALIDATE_OR_GOTO (this->name, inode->gfid, out); + + LOCK (&inode->lock); + { + ret = __inode_ctx_get1 (inode, this, + (uint64_t *)&mdata); + if (ret == -1 || !mdata) { + /* + * Do we need to fetch the data from xattr + * If we does we can compare the value and store + * the largest data in inode ctx. + */ + mdata = GF_CALLOC (1, sizeof (posix_mdata_t), + gf_posix_mt_mdata_attr); + if (!mdata) { + ret = -1; + goto unlock; + } + + ret = posix_fetch_mdata_xattr (this, real_path, fd, + inode, + (void *)mdata); + if (ret == 0) { + /* Got mdata from disk, set it in inode ctx. This case + * is hit when in-memory status is lost due to brick + * down scenario + */ + __inode_ctx_set1 (inode, this, + (uint64_t *)&mdata); + } else if (ret && stbuf) { + /* + * This is the first time creating the time + * attr. This happens when you activate this + * feature, and the legacy file will not have + * any xattr set. + * + * New files will create extended attributes. + */ + + /* + * TODO: This is wrong approach, because before + * creating fresh xattr, we should consult + * to all replica and/or distribution set. + * + * We should contact the time management + * xlators, and ask them to create an xattr. + */ + mdata->version = 1; + mdata->flags = 0; + mdata->ctime.tv_sec = stbuf->ia_ctime; + mdata->ctime.tv_nsec = stbuf->ia_ctime_nsec; + mdata->atime.tv_sec = stbuf->ia_atime; + mdata->atime.tv_nsec = stbuf->ia_atime_nsec; + mdata->mtime.tv_sec = stbuf->ia_mtime; + mdata->mtime.tv_nsec = stbuf->ia_mtime_nsec; + + __inode_ctx_set1 (inode, this, + (uint64_t *)&mdata); + } + } + if (flag->ctime && + posix_compare_timespec (time, &mdata->ctime) > 0) { + mdata->ctime = *time; + } + if (flag->mtime && + posix_compare_timespec (time, &mdata->mtime) > 0) { + mdata->mtime = *time; + } + if (flag->atime && + posix_compare_timespec (time, &mdata->atime) > 0) { + mdata->atime = *time; + } + + if (inode->ia_type == IA_INVAL) { + /* + * TODO: This is non-linked inode. So we have to sync the + * data into backend. Because inode_link may return + * a different inode. + */ + /* ret = posix_store_mdata_xattr (this, loc, fd, + * mdata); */ + } + /* + * With this patch set, we are setting the xattr for each update + * We should evaluate the performance, and based on that we can + * decide on asynchronous updation. + */ + ret = posix_store_mdata_xattr (this, real_path, fd, inode, + mdata); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, errno, + P_MSG_STOREMDATA_FAILED, + "file: %s: gfid: %s key:%s ", + real_path ? real_path : "null", + uuid_utoa(inode->gfid), GF_XATTR_MDATA_KEY); + goto out; + } + } +unlock: + UNLOCK (&inode->lock); +out: + if (ret == 0 && stbuf) { + stbuf->ia_ctime = mdata->ctime.tv_sec; + stbuf->ia_ctime_nsec = mdata->ctime.tv_nsec; + stbuf->ia_mtime = mdata->mtime.tv_sec; + stbuf->ia_mtime_nsec = mdata->mtime.tv_nsec; + stbuf->ia_atime = mdata->atime.tv_sec; + stbuf->ia_atime_nsec = mdata->atime.tv_nsec; + } + + return ret; +} + +/* posix_update_utime_in_mdata updates the posix_mdata_t when mtime/atime + * is modified using syscall + */ +int +posix_update_utime_in_mdata (xlator_t *this, const char *real_path, int fd, + inode_t *inode, + struct iatt *stbuf, int valid) +{ + int32_t ret = -1; +#if defined(HAVE_UTIMENSAT) + struct timespec tv = {0, }; +#else + struct timeval tv = {0, }; +#endif + posix_mdata_flag_t flag = {0, }; + + if ((valid & GF_SET_ATTR_ATIME) == GF_SET_ATTR_ATIME) { + tv.tv_sec = stbuf->ia_atime; + SET_TIMESPEC_NSEC_OR_TIMEVAL_USEC(tv, stbuf->ia_atime_nsec); + + flag.ctime = 0; + flag.mtime = 0; + flag.atime = 1; + } + + if ((valid & GF_SET_ATTR_MTIME) == GF_SET_ATTR_MTIME) { + tv.tv_sec = stbuf->ia_mtime; + SET_TIMESPEC_NSEC_OR_TIMEVAL_USEC(tv, stbuf->ia_mtime_nsec); + flag.ctime = 1; + flag.mtime = 1; + flag.atime = 0; + } + + ret = posix_set_mdata_xattr (this, real_path, -1, inode, &tv, NULL, + &flag); + return ret; +} |