diff options
| author | Venky Shankar <vshankar@redhat.com> | 2015-02-09 18:28:21 +0530 | 
|---|---|---|
| committer | Vijay Bellur <vbellur@redhat.com> | 2015-03-24 06:37:19 -0700 | 
| commit | a20101e2e4d5f5595655544cfc798eb1d445638c (patch) | |
| tree | 0c3ceb7ba0ba3bba7e41663d728fb2f60208e5a8 /xlators/features/bit-rot/src | |
| parent | 3c474a042aed68659fe0cfdf32e01285bde9f689 (diff) | |
Bitrot Stub
Bitrot stub implements object versioning required for identifying
signature freshness. More details about versioning is explained
as a part of the "bitrot feature documentation" patch.
Change-Id: I2ad70d9eb109ba4a12148ab8d81336afda529ad9
BUG: 1170075
Signed-off-by: Venky Shankar <vshankar@redhat.com>
Reviewed-on: http://review.gluster.org/9709
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Vijay Bellur <vbellur@redhat.com>
Diffstat (limited to 'xlators/features/bit-rot/src')
| -rw-r--r-- | xlators/features/bit-rot/src/Makefile.am | 1 | ||||
| -rw-r--r-- | xlators/features/bit-rot/src/stub/Makefile.am | 17 | ||||
| -rw-r--r-- | xlators/features/bit-rot/src/stub/bit-rot-common.h | 165 | ||||
| -rw-r--r-- | xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h | 24 | ||||
| -rw-r--r-- | xlators/features/bit-rot/src/stub/bit-rot-stub.c | 1428 | ||||
| -rw-r--r-- | xlators/features/bit-rot/src/stub/bit-rot-stub.h | 269 | 
6 files changed, 1904 insertions, 0 deletions
diff --git a/xlators/features/bit-rot/src/Makefile.am b/xlators/features/bit-rot/src/Makefile.am new file mode 100644 index 00000000000..7581732c7d9 --- /dev/null +++ b/xlators/features/bit-rot/src/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = stub diff --git a/xlators/features/bit-rot/src/stub/Makefile.am b/xlators/features/bit-rot/src/stub/Makefile.am new file mode 100644 index 00000000000..9abcbb76db2 --- /dev/null +++ b/xlators/features/bit-rot/src/stub/Makefile.am @@ -0,0 +1,17 @@ +xlator_LTLIBRARIES = bitrot-stub.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +bitrot_stub_la_LDFLAGS = -module -avoid-version + +bitrot_stub_la_SOURCES = bit-rot-stub.c +bitrot_stub_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = bit-rot-stub.h bit-rot-common.h bit-rot-stub-mem-types.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ +	      -I$(top_srcdir)/xlators/features/changelog/lib/src + + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/features/bit-rot/src/stub/bit-rot-common.h b/xlators/features/bit-rot/src/stub/bit-rot-common.h new file mode 100644 index 00000000000..9e101523556 --- /dev/null +++ b/xlators/features/bit-rot/src/stub/bit-rot-common.h @@ -0,0 +1,165 @@ +/* +   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> +   This file is part of GlusterFS. + +   This file is licensed to you under your choice of the GNU Lesser +   General Public License, version 3 or any later version (LGPLv3 or +   later), or the GNU General Public License, version 2 (GPLv2), in all +   cases as published by the Free Software Foundation. +*/ + +#ifndef __BIT_ROT_COMMON_H__ +#define __BIT_ROT_COMMON_H__ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" + +/** + * on-disk formats for ongoing version and object signature. + */ +typedef struct br_version { +        unsigned long ongoingversion; +        uint32_t timebuf[2]; +} br_version_t; + +typedef struct br_signature { +        int8_t signaturetype; + +        unsigned long signedversion; + +        char signature[0]; +} br_signature_t; + +#define BR_VXATTR_VERSION   (1 << 0) +#define BR_VXATTR_SIGNATURE (1 << 1) + +#define BR_VXATTR_ALL_MISSING                           \ +        (BR_VXATTR_VERSION | BR_VXATTR_SIGNATURE) + +typedef enum br_vxattr_state { +        BR_VXATTR_STATUS_MISSING = 0, +        BR_VXATTR_STATUS_PARTIAL = 1, +        BR_VXATTR_STATUS_FULL    = 2, +} br_vxattr_status_t; + +static inline br_vxattr_status_t +br_version_xattr_state (dict_t *xattr, +                        br_version_t **obuf, br_signature_t **sbuf) +{ +        int32_t             ret    = 0; +        int32_t             vxattr = 0; +        br_vxattr_status_t  status; + +        ret = dict_get_bin (xattr, BITROT_CURRENT_VERSION_KEY, (void **)obuf); +        if (ret) +                vxattr |= BR_VXATTR_VERSION; + +        ret = dict_get_bin (xattr, BITROT_SIGNING_VERSION_KEY, (void **)sbuf); +        if (ret) +                vxattr |= BR_VXATTR_SIGNATURE; + +        switch (vxattr) { +        case 0: +                status = BR_VXATTR_STATUS_FULL; +                break; +        case BR_VXATTR_ALL_MISSING: +                status = BR_VXATTR_STATUS_MISSING; +                break; +        default: +                status = BR_VXATTR_STATUS_PARTIAL; +        } + +        return status; +} + +/** + * in-memory representation of signature used by signer for object + * signing. + */ +typedef struct br_isignature_in { +        int8_t signaturetype;            /* signature type            */ + +        unsigned long signedversion;     /* version against which the +                                            object was signed         */ + +        char signature[0];               /* object signature          */ +} br_isignature_t; + +/** + * in-memory representation of signature used by scrubber for object + * verification. + */ +typedef struct br_isignature_out { +        char stale;                      /* stale signature?          */ + +        uint32_t time[2];                /* time when the object +                                            got dirtied               */ + +        int8_t signaturetype;            /* hash type                 */ +        char signature[0];               /* signature (hash)          */ +} br_isignature_out_t; + +typedef struct br_stub_init { +        uint32_t timebuf[2]; +        char export[PATH_MAX]; +} br_stub_init_t; + +typedef enum { +        BR_SIGNATURE_TYPE_VOID   = -1,   /* object is not signed       */ +        BR_SIGNATURE_TYPE_ZERO   = 0,    /* min boundary               */ +        BR_SIGNATURE_TYPE_SHA256 = 1,    /* signed with SHA256         */ +        BR_SIGNATURE_TYPE_MAX    = 2,    /* max boundary               */ +} br_signature_type; + +/* BitRot stub start time (virtual xattr) */ +#define GLUSTERFS_GET_BR_STUB_INIT_TIME  "trusted.glusterfs.bit-rot.stub-init" + +static inline int +br_is_signature_type_valid (int8_t signaturetype) +{ +        return ((signaturetype > BR_SIGNATURE_TYPE_ZERO) +                 && (signaturetype < BR_SIGNATURE_TYPE_MAX)); +} + +static inline void +br_set_default_ongoingversion (br_version_t *buf, uint32_t *tv) +{ +        buf->ongoingversion = BITROT_DEFAULT_CURRENT_VERSION; +        buf->timebuf[0] = tv[0]; +        buf->timebuf[1] = tv[1]; +} + +static inline void +br_set_default_signature (br_signature_t *buf, size_t *size) +{ +        buf->signaturetype = (int8_t) BR_SIGNATURE_TYPE_VOID; +        buf->signedversion = BITROT_DEFAULT_SIGNING_VERSION; + +        *size = sizeof (br_signature_t); /* no signature */ +} + +static inline void +br_set_ongoingversion (br_version_t *buf, +                       unsigned long version, uint32_t *tv) +{ +        buf->ongoingversion = version; +        buf->timebuf[0] = tv[0]; +        buf->timebuf[1] = tv[1]; +} + +static inline void +br_set_signature (br_signature_t *buf, +                  br_isignature_t *sign, size_t signaturelen, size_t *size) +{ +        buf->signaturetype  = sign->signaturetype; +        buf->signedversion  = ntohl (sign->signedversion); + +        memcpy (buf->signature, sign->signature, signaturelen); +        *size = sizeof (br_signature_t) + signaturelen; +} + +#endif /* __BIT_ROT_COMMON_H__ */ diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h new file mode 100644 index 00000000000..64779923fd6 --- /dev/null +++ b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h @@ -0,0 +1,24 @@ +/* +   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> +   This file is part of GlusterFS. + +   This file is licensed to you under your choice of the GNU Lesser +   General Public License, version 3 or any later version (LGPLv3 or +   later), or the GNU General Public License, version 2 (GPLv2), in all +   cases as published by the Free Software Foundation. +*/ + +#ifndef _BR_MEM_TYPES_H +#define _BR_MEM_TYPES_H + +#include "mem-types.h" + +enum br_mem_types { +        gf_br_stub_mt_private_t   = gf_common_mt_end + 1, +        gf_br_stub_mt_version_t   = gf_common_mt_end + 2, +        gf_br_stub_mt_inode_ctx_t = gf_common_mt_end + 3, +        gf_br_stub_mt_signature_t = gf_common_mt_end + 4, +        gf_br_stub_mt_end +}; + +#endif diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub.c b/xlators/features/bit-rot/src/stub/bit-rot-stub.c new file mode 100644 index 00000000000..420f145a849 --- /dev/null +++ b/xlators/features/bit-rot/src/stub/bit-rot-stub.c @@ -0,0 +1,1428 @@ +/* +   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> +   This file is part of GlusterFS. + +   This file is licensed to you under your choice of the GNU Lesser +   General Public License, version 3 or any later version (LGPLv3 or +   later), or the GNU General Public License, version 2 (GPLv2), in all +   cases as published by the Free Software Foundation. +*/ + +#include <ctype.h> +#include <sys/uio.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "xlator.h" +#include "logging.h" +#include "changelog.h" +#include "compat-errno.h" + +#include "bit-rot-stub.h" +#include "bit-rot-stub-mem-types.h" + +#include "bit-rot-common.h" + +#define BR_STUB_REQUEST_COOKIE  0x1 + +int32_t +mem_acct_init (xlator_t *this) +{ +        int32_t ret = -1; + +        if (!this) +                return ret; + +        ret = xlator_mem_acct_init (this, gf_br_stub_mt_end + 1); + +        if (ret != 0) { +                gf_log (this->name, GF_LOG_WARNING, "Memory accounting" +                        " init failed"); +                return ret; +        } + +        return ret; +} + +int32_t +init (xlator_t *this) +{ +        char *tmp = NULL; +        struct timeval tv = {0,}; +	br_stub_private_t *priv = NULL; + +	if (!this->children) { +		gf_log (this->name, GF_LOG_ERROR, "FATAL: no children"); +		goto error_return; +	} + +        priv = GF_CALLOC (1, sizeof (*priv), gf_br_stub_mt_private_t); +        if (!priv) +                goto error_return; + +        priv->local_pool = mem_pool_new (br_stub_local_t, 512); +        if (!priv->local_pool) +                goto free_priv; + +        GF_OPTION_INIT ("bitrot", priv->go, bool, free_mempool); + +        GF_OPTION_INIT ("export", tmp, str, free_mempool); +        memcpy (priv->export, tmp, strlen (tmp) + 1); + +        (void) gettimeofday (&tv, NULL); + +        /* boot time is in network endian format */ +        priv->boot[0] = htonl (tv.tv_sec); +        priv->boot[1] = htonl (tv.tv_usec); + +        gf_log (this->name, GF_LOG_DEBUG, "bit-rot stub loaded"); +	this->private = priv; +        return 0; + + free_mempool: +        mem_pool_destroy (priv->local_pool); + free_priv: +        GF_FREE (priv); + error_return: +        return -1; +} + +void +fini (xlator_t *this) +{ +	br_stub_private_t *priv = this->private; + +        if (!priv) +                return; +        this->private = NULL; +	GF_FREE (priv); + +	return; +} + +static inline int +br_stub_alloc_versions (br_version_t **obuf, +                        br_signature_t **sbuf, size_t signaturelen) +{ +        void *mem = NULL; +        size_t size = 0; + +        if (obuf) +                size += sizeof (br_version_t); +        if (sbuf) +                size += sizeof (br_signature_t) + signaturelen; + +        mem = GF_CALLOC (1, size, gf_br_stub_mt_version_t); +        if (!mem) +                goto error_return; + +        if (obuf) { +                *obuf = (br_version_t *)mem; +                mem = ((char *)mem + sizeof (br_version_t)); +        } +        if (sbuf) { +                *sbuf = (br_signature_t *)mem; +        } + +        return 0; + + error_return: +        return -1; +} + +static inline void +br_stub_dealloc_versions (void *mem) +{ +        GF_FREE (mem); +} + +static inline br_stub_local_t * +br_stub_alloc_local (xlator_t *this) +{ +        br_stub_private_t *priv = this->private; + +        return mem_get0 (priv->local_pool); +} + +static inline void +br_stub_dealloc_local (br_stub_local_t *ptr) +{ +        mem_put (ptr); +} + +static inline int +br_stub_prepare_default_request (xlator_t *this, dict_t *dict, +                                 br_version_t *obuf, br_signature_t *sbuf) +{ +        int32_t ret = 0; +        size_t size = 0; +        br_stub_private_t *priv = NULL; + +        priv = this->private; + +        /** Prepare ongoing version */ +        br_set_default_ongoingversion (obuf, priv->boot); +        ret = dict_set_static_bin (dict, BITROT_CURRENT_VERSION_KEY, +                                   (void *)obuf, sizeof (br_version_t)); +        if (ret) +                return -1; + +        /** Prepare signature version */ +        br_set_default_signature (sbuf, &size); +        return dict_set_static_bin (dict, BITROT_SIGNING_VERSION_KEY, +                                    (void *)sbuf, size); +} + +static inline int +br_stub_prepare_version_request (xlator_t *this, dict_t *dict, +                                br_version_t *obuf, unsigned long oversion) +{ +        br_stub_private_t *priv = NULL; + +        priv = this->private; +        br_set_ongoingversion (obuf, oversion, priv->boot); + +        return dict_set_static_bin (dict, BITROT_CURRENT_VERSION_KEY, +                                    (void *)obuf, sizeof (br_version_t)); +} + +static inline int +br_stub_prepare_signing_request (dict_t *dict, +                                 br_signature_t *sbuf, +                                 br_isignature_t *sign, size_t signaturelen) +{ +        size_t size = 0; + +        br_set_signature (sbuf, sign, signaturelen, &size); + +        return dict_set_static_bin (dict, BITROT_SIGNING_VERSION_KEY, +                                    (void *)sbuf, size); +} + +/** + * initialize an inode context starting with a given ongoing version. + * a fresh lookup() or a first creat() call initializes the inode + * context, hence the inode is marked dirty. this routine also + * initializes the transient inode version. + */ +static inline int +br_stub_init_inode_versions (xlator_t *this, fd_t *fd, inode_t *inode, +                             unsigned long version, gf_boolean_t markdirty) +{ +        int32_t ret = 0; +        br_stub_inode_ctx_t *ctx = NULL; + +        ctx = GF_CALLOC (1, sizeof (br_stub_inode_ctx_t), +                         gf_br_stub_mt_inode_ctx_t); +        if (!ctx) +                goto error_return; + +        (markdirty) ? __br_stub_mark_inode_dirty (ctx) +                : __br_stub_mark_inode_synced (ctx); +        __br_stub_set_ongoing_version (ctx, version); +        __br_stub_reset_release_counters (ctx); + +        if (fd) { +                br_stub_require_release_call (this, fd); +                __br_stub_track_openfd (fd, ctx); +        } +        ret = br_stub_set_inode_ctx (this, inode, ctx); +        if (ret) +                goto free_ctx; +        return 0; + + free_ctx: +        GF_FREE (ctx); + error_return: +        return -1; +} + +/** + * modify the ongoing version of an inode. + */ +static inline int +br_stub_mod_inode_versions (xlator_t *this, +                            fd_t *fd, inode_t *inode, unsigned long version) +{ +        int32_t ret = -1; +        br_stub_inode_ctx_t *ctx = 0; + +        LOCK (&inode->lock); +        { +                ctx = __br_stub_get_ongoing_version_ctx (this, inode, NULL); +                if (ctx == NULL) +                        goto unblock; +                if (__br_stub_is_inode_dirty (ctx)) { +                        __br_stub_set_ongoing_version (ctx, version); +                        __br_stub_mark_inode_synced (ctx); +                } + +                __br_stub_track_openfd (fd, ctx); +                ret = 0; +        } + unblock: +        UNLOCK (&inode->lock); + +        return ret; +} + +static inline void +br_stub_fill_local (br_stub_local_t *local, +                    call_stub_t *stub, fd_t *fd, inode_t *inode, uuid_t gfid, +                    int versioningtype, unsigned long memversion, int dirty) +{ +        local->fopstub = stub; +        local->versioningtype = versioningtype; +        local->u.context.version = memversion; +        if (fd) +                local->u.context.fd = fd_ref (fd); +        if (inode) +                local->u.context.inode = inode_ref (inode); +        uuid_copy (local->u.context.gfid, gfid); + +        /* mark inode dirty/fresh according to durability */ +        local->u.context.markdirty = (dirty) ? _gf_true : _gf_false; +} + +static inline void +br_stub_cleanup_local (br_stub_local_t *local) +{ +        local->fopstub = NULL; +        local->versioningtype = 0; +        local->u.context.version = 0; +        if (local->u.context.fd) { +                fd_unref (local->u.context.fd); +                local->u.context.fd = NULL; +        } +        if (local->u.context.inode) { +                inode_unref (local->u.context.inode); +                local->u.context.inode = NULL; +        } +        local->u.context.markdirty = _gf_true; +        memset (local->u.context.gfid, '\0', sizeof (uuid_t)); +} + +/** + * callback for inode/fd full versioning + */ +int +br_stub_inode_fullversioning_cbk (call_frame_t *frame, +                                  void *cookie, xlator_t *this, +                                  int op_ret, int op_errno, dict_t *xdata) +{ +        fd_t            *fd      = NULL; +        inode_t         *inode   = NULL; +        unsigned long    version = 0; +        gf_boolean_t     dirty   = _gf_true; +        br_stub_local_t *local   = NULL; + +        local = (br_stub_local_t *)frame->local; + +        /* be graceful to EEXIST */ +        if ((op_ret < 0) && (op_errno == EEXIST)) { +                op_ret = 0; +                goto done; +        } + +        if (op_ret < 0) +                goto done; + +        fd      = local->u.context.fd; +        inode   = local->u.context.inode; +        version = local->u.context.version; +        dirty   = local->u.context.markdirty; + +        op_ret = br_stub_init_inode_versions (this, fd, inode, version, dirty); +        if (op_ret < 0) +                op_errno = EINVAL; + + done: +        frame->local = NULL; +        if (op_ret < 0) +                call_unwind_error (local->fopstub, op_ret, op_errno); +        else +                call_resume (local->fopstub); +        br_stub_cleanup_local (local); +        br_stub_dealloc_local (local); + +        return 0; +} + +int +br_stub_fd_incversioning_cbk (call_frame_t *frame, +                              void *cookie, xlator_t *this, +                              int op_ret, int op_errno, dict_t *xdata) +{ +        fd_t            *fd      = NULL; +        inode_t         *inode   = NULL; +        unsigned long    version = 0; +        br_stub_local_t *local   = NULL; + +        local = (br_stub_local_t *)frame->local; +        if (op_ret < 0) +                goto done; +        fd      = local->u.context.fd; +        inode   = local->u.context.inode; +        version = local->u.context.version; + +        op_ret = br_stub_mod_inode_versions (this, fd, inode, version); +        if (op_ret < 0) +                op_errno = EINVAL; + + done: +        frame->local = NULL; +        if (op_ret < 0) +                call_unwind_error (local->fopstub, -1, op_errno); +        else +                call_resume (local->fopstub); +        br_stub_cleanup_local (local); +        br_stub_dealloc_local (local); + +        return 0; +} + +/** + * Initial object versioning + * + * Version persists two (2) extended attributes as explained below: + *   1. Current (ongoing) version: This is incremented on an open() + *      or creat() and is the running version for an object. + *   2. Signing version: This is the version against which an object + *      was signed (checksummed). + * + * During initial versioning, both ongoing and signing versions are + * set of one and zero respectively. An open() call increments the + * ongoing version as an indication of modification to the object. + * Additionally this needs to be persisted on disk and needs to be + * durable: fsync().. :-/ + * As an optimization only the first open() synchronizes the ongoing + * version to disk, subsequent open()s before the *last* release() + * are no-op's. + * + * create(), just like lookup() initializes the object versions to + * the default, but persists the version to disk. As an optimization + * this is not a durable operation: in case of a crash, hard reboot + * etc.. absence of versioning xattrs is ignored in scrubber along + * with the one time crawler explicitly triggering signing for such + * objects. + * + * c.f. br_stub_open_cbk() / br_stub_create_cbk() + */ + +/** + * perform full or incremental versioning on an inode pointd by an + * fd. incremental versioning is done when an inode is dirty and a + * writeback is trigerred. + */ + +int +br_stub_fd_versioning (xlator_t *this, call_frame_t *frame, +                       call_stub_t *stub, dict_t *dict, fd_t *fd, +                       br_stub_version_cbk *callback, unsigned long memversion, +                       int versioningtype, int durable, int dirty) +{ +        int32_t          ret   = -1; +        int              flags = 0; +        dict_t          *xdata = NULL; +        br_stub_local_t *local = NULL; + +        if (durable) { +                xdata = dict_new (); +                if (!xdata) +                        goto done; +                ret = dict_set_int32 (xdata, GLUSTERFS_DURABLE_OP, 0); +                if (ret) +                        goto dealloc_xdata; +        } + +        local = br_stub_alloc_local (this); +        if (!local) { +                ret = -1; +                goto dealloc_xdata; +        } + +        if (versioningtype == BR_STUB_FULL_VERSIONING) +                flags |= XATTR_CREATE; + +        br_stub_fill_local (local, stub, fd, +                            fd->inode, fd->inode->gfid, +                            versioningtype, memversion, dirty); + +        frame->local = local; +        STACK_WIND (frame, callback, +                    FIRST_CHILD (this), FIRST_CHILD (this)->fops->fsetxattr, +                    fd, dict, flags, xdata); + +        ret = 0; + + dealloc_xdata: +        if (durable) +                dict_unref (xdata); + done: +        return ret; +} + +static inline int +br_stub_perform_fullversioning (xlator_t *this, call_frame_t *frame, +                                call_stub_t *stub, fd_t *fd) +{ +        int32_t         ret      = -1; +        dict_t         *dict     = NULL; +        br_version_t   *obuf     = NULL; +        br_signature_t *sbuf     = NULL; +        int             op_errno = 0; + +        op_errno = ENOMEM; +        dict = dict_new (); +        if (!dict) +                goto done; +        ret = br_stub_alloc_versions (&obuf, &sbuf, 0); +        if (ret) +                goto dealloc_dict; + +        op_errno = EINVAL; +        ret = br_stub_prepare_default_request (this, dict, obuf, sbuf); +        if (ret) +                goto dealloc_versions; + +        /** +         * Version extended attributes need not be durable at this point of +         * time. If the objects (inode) data gets persisted on disk but the +         * version extended attributes are lost due to a crash/power failure, +         * a subsequent lookup marks the objects signature as stale. This way, +         * dentry operation times do not shoot up. +         */ +        ret = br_stub_fd_versioning (this, frame, stub, dict, fd, +                                     br_stub_inode_fullversioning_cbk, +                                     BITROT_DEFAULT_CURRENT_VERSION, +                                     BR_STUB_FULL_VERSIONING, !WRITEBACK_DURABLE, 0); + + dealloc_versions: +        br_stub_dealloc_versions (obuf); + dealloc_dict: +        dict_unref (dict); + done: +        if (ret) +                call_unwind_error (stub, -1, op_errno); +        return ret; +} + +static inline int +br_stub_perform_incversioning (xlator_t *this, +                               call_frame_t *frame, call_stub_t *stub, +                               fd_t *fd, br_stub_inode_ctx_t *ctx) +{ +        int32_t        ret               = -1; +        dict_t        *dict              = NULL; +        inode_t       *inode             = NULL; +        br_version_t  *obuf              = NULL; +        unsigned long  writeback_version = 0; +        int            op_errno          = 0; + +        inode = fd->inode; + +        op_errno = EINVAL; +        ret = br_stub_require_release_call (this, fd); +        if (ret) +                goto done; + +        LOCK (&inode->lock); +        { +                if (__br_stub_is_inode_dirty (ctx)) +                        writeback_version = __br_stub_writeback_version (ctx); +                else +                        __br_stub_track_openfd (fd, ctx); +        } +        UNLOCK (&inode->lock); + +        if (!writeback_version) { +                ret = 0; +                goto done; +        } + +        /* inode requires writeback to disk */ +        op_errno = ENOMEM; +        dict = dict_new (); +        if (!dict) +                goto done; +        ret = br_stub_alloc_versions (&obuf, NULL, 0); +        if (ret) +                goto dealloc_dict; +        ret = br_stub_prepare_version_request (this, dict, +                                               obuf, writeback_version); +        if (ret) +                goto dealloc_versions; + +        ret = br_stub_fd_versioning +                (this, frame, stub, dict, +                 fd, br_stub_fd_incversioning_cbk, writeback_version, +                 BR_STUB_INCREMENTAL_VERSIONING, WRITEBACK_DURABLE, 0); + + dealloc_versions: +        br_stub_dealloc_versions (obuf); + dealloc_dict: +        dict_unref (dict); + done: +        if (!ret && !writeback_version) +                call_resume (stub); +        if (ret) +                call_unwind_error (stub, -1, op_errno); +        return ret; +} + +/** {{{ */ + +/* fsetxattr() */ + +static inline int +br_stub_prepare_signature (xlator_t *this, dict_t *dict, +                           inode_t *inode, br_isignature_t *sign) +{ +        int32_t ret = 0; +        size_t signaturelen = 0; +        br_signature_t *sbuf = NULL; + +        if (!br_is_signature_type_valid (sign->signaturetype)) +                goto error_return; + +        signaturelen = strlen (sign->signature); +        ret = br_stub_alloc_versions (NULL, &sbuf, signaturelen); +        if (ret) +                goto error_return; +        ret = br_stub_prepare_signing_request (dict, sbuf, sign, signaturelen); +        if (ret) +                goto dealloc_versions; +        return 0; + + dealloc_versions: +        br_stub_dealloc_versions (sbuf); + error_return: +        return -1; +} + +int +br_stub_fsetxattr (call_frame_t *frame, xlator_t *this, +                   fd_t *fd, dict_t *dict, int flags, dict_t *xdata) +{ +        int32_t          ret  = 0; +        br_isignature_t *sign = NULL; +        gf_boolean_t     xref = _gf_false; + +        if (!IA_ISREG (fd->inode->ia_type)) +                goto wind; +        ret = dict_get_bin (dict, GLUSTERFS_SET_OBJECT_SIGNATURE, +                            (void **) &sign); +        if (ret < 0) +                goto wind; +        if (frame->root->pid != GF_CLIENT_PID_BITD) +                goto unwind; + +        ret = br_stub_prepare_signature (this, dict, fd->inode, sign); +        if (ret) +                goto unwind; +        dict_del (dict, GLUSTERFS_SET_OBJECT_SIGNATURE); + +        if (!xdata) { +                xdata = dict_new (); +                if (!xdata) +                        goto unwind; +        } else { +                dict_ref (xdata); +        } + +        xref = _gf_true; +        ret = dict_set_int32 (xdata, GLUSTERFS_DURABLE_OP, 0); +        if (ret) +                goto unwind; + + wind: +        STACK_WIND (frame, default_setxattr_cbk, +                    FIRST_CHILD (this), FIRST_CHILD (this)->fops->fsetxattr, fd, +                    dict, flags, xdata); +        goto done; + + unwind: +        STACK_UNWIND_STRICT (setxattr, frame, -1, EINVAL, NULL); + done: +        if (xref) +                dict_unref (xdata); +        return 0; +} + +/** }}} */ + + +/** {{{ */ + +/* {f}getxattr() */ + +int +br_stub_listxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                       int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) +{ +        if (op_ret < 0) +                goto unwind; + +        br_stub_remove_vxattrs (xattr); + + unwind: +        STACK_UNWIND (frame, op_ret, op_errno, xattr, xdata); +        return 0; +} + + +int +br_stub_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                      int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) +{ +        int32_t              ret          = 0; +        ssize_t              totallen     = 0; +        ssize_t              signaturelen = 0; +        br_version_t        *obuf         = NULL; +        br_signature_t      *sbuf         = NULL; +        br_isignature_out_t *sign         = NULL; +        br_vxattr_status_t   status; + +        if (op_ret < 0) +                goto unwind; +        if (cookie != (void *) BR_STUB_REQUEST_COOKIE) +                goto unwind; + +        op_ret   = -1; +        op_errno = EINVAL; + +        status = br_version_xattr_state (xattr, &obuf, &sbuf); +        if (status == BR_VXATTR_STATUS_PARTIAL) +                goto delkeys; + +        op_errno = ENODATA; +        if (status == BR_VXATTR_STATUS_MISSING) +                goto delkeys; + +        signaturelen = strlen (sbuf->signature); +        totallen = signaturelen + sizeof (br_isignature_out_t); + +        op_errno = ENOMEM; +        sign = GF_CALLOC (1, totallen, gf_br_stub_mt_signature_t); +        if (!sign) +                goto delkeys; + +        sign->time[0] = obuf->timebuf[0]; +        sign->time[1] = obuf->timebuf[1]; + +        /* Object's dirty state */ +        sign->stale = (obuf->ongoingversion != sbuf->signedversion) ? 1 : 0; + +        /* Object's signature */ +        sign->signaturetype = sbuf->signaturetype; +        (void) memcpy (sign->signature, sbuf->signature, signaturelen); + +        op_errno = EINVAL; +        ret = dict_set_bin (xattr, GLUSTERFS_GET_OBJECT_SIGNATURE, +                            (void *)sign, totallen); +        if (ret < 0) +                goto delkeys; +        op_errno = 0; +        op_ret = totallen; + + delkeys: +        br_stub_remove_vxattrs (xattr); + + unwind: +        STACK_UNWIND (frame, op_ret, op_errno, xattr, xdata); +        return 0; +} + +static inline void +br_stub_send_stub_init_time (call_frame_t *frame, xlator_t *this) +{ +        int op_ret = 0; +        int op_errno = 0; +        dict_t *xattr = NULL; +        br_stub_init_t stub = {{0,},}; +        br_stub_private_t *priv = NULL; + +        priv = this->private; + +        xattr = dict_new (); +        if (!xattr) { +                op_ret = -1; +                op_errno = ENOMEM; +                goto unwind; +        } + +        stub.timebuf[0] = priv->boot[0]; +        stub.timebuf[1] = priv->boot[1]; +        memcpy (stub.export, priv->export, strlen (priv->export) + 1); + +        op_ret = dict_set_static_bin (xattr, GLUSTERFS_GET_BR_STUB_INIT_TIME, +                                      (void *) &stub, sizeof (br_stub_init_t)); +        if (op_ret < 0) { +                op_errno = EINVAL; +                goto unwind; +        } + +        op_ret = sizeof (br_stub_init_t); + + unwind: +        STACK_UNWIND (frame, op_ret, op_errno, xattr, NULL); + +        if (xattr) +                dict_unref (xattr); +} + +int +br_stub_getxattr (call_frame_t *frame, xlator_t *this, +                  loc_t *loc, const char *name, dict_t *xdata) +{ +        void *cookie = NULL; +        uuid_t rootgfid = {0, }; +        fop_getxattr_cbk_t cbk = br_stub_getxattr_cbk; + +        rootgfid[15] = 1; + +        if (!name) { +                cbk = br_stub_listxattr_cbk; +                goto wind; +        } + +        if (br_stub_is_internal_xattr (name)) +                goto wind; + +        /** +         * this special extended attribute is allowed only on root +         */ +        if (name +            && (strncmp (name, GLUSTERFS_GET_BR_STUB_INIT_TIME, +                          strlen (GLUSTERFS_GET_BR_STUB_INIT_TIME)) == 0) +            && ((uuid_compare (loc->gfid, rootgfid) == 0) +                || (uuid_compare (loc->inode->gfid, rootgfid) == 0))) { +                br_stub_send_stub_init_time (frame, this); +                return 0; +        } + +        if (!IA_ISREG (loc->inode->ia_type)) +                goto wind; + +        if (name && (strncmp (name, GLUSTERFS_GET_OBJECT_SIGNATURE, +                              strlen (GLUSTERFS_GET_OBJECT_SIGNATURE)) == 0)) { +                cookie = (void *) BR_STUB_REQUEST_COOKIE; +        } + + wind: +        STACK_WIND_COOKIE +                      (frame, cbk, cookie, FIRST_CHILD (this), +                       FIRST_CHILD (this)->fops->getxattr, loc, name, xdata); +        return 0; +} + +int +br_stub_fgetxattr (call_frame_t *frame, xlator_t *this, +                   fd_t *fd, const char *name, dict_t *xdata) +{ +        void *cookie = NULL; +        uuid_t rootgfid = {0, }; +        fop_fgetxattr_cbk_t cbk = br_stub_getxattr_cbk; + +        rootgfid[15] = 1; + +        if (!name) { +                cbk = br_stub_listxattr_cbk; +                goto wind; +        } + +        if (br_stub_is_internal_xattr (name)) +                goto wind; + +        /** +         * this special extended attribute is allowed only on root +         */ +        if (name +            && (strncmp (name, GLUSTERFS_GET_BR_STUB_INIT_TIME, +                         strlen (GLUSTERFS_GET_BR_STUB_INIT_TIME)) == 0) +            && (uuid_compare (fd->inode->gfid, rootgfid) == 0)) { +                br_stub_send_stub_init_time (frame, this); +                return 0; +        } + +        if (!IA_ISREG (fd->inode->ia_type)) +                goto wind; + +        if (name && (strncmp (name, GLUSTERFS_GET_OBJECT_SIGNATURE, +                              strlen (GLUSTERFS_GET_OBJECT_SIGNATURE)) == 0)) { +                cookie = (void *) BR_STUB_REQUEST_COOKIE; +        } + + wind: +        STACK_WIND_COOKIE +                      (frame, cbk, cookie, FIRST_CHILD (this), +                       FIRST_CHILD (this)->fops->fgetxattr, fd, name, xdata); +        return 0; +} + +/** }}} */ + + +/** {{{ */ + +/* open() */ + +int +br_stub_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                  int op_ret, int op_errno, fd_t *fd, dict_t *xdata) +{ +        int32_t              ret      = 0; +        uint64_t             ctx_addr = 0; +        br_stub_inode_ctx_t *ctx      = NULL; +        call_stub_t         *stub     = NULL; + +        if (op_ret < 0) +                goto unwind; +        if (cookie != (void *) BR_STUB_REQUEST_COOKIE) +                goto unwind; + +        ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr); +        if (ret < 0) +                goto unwind; + +        stub = fop_open_cbk_stub (frame, NULL, op_ret, op_errno, fd, xdata); +        if (!stub) { +                op_ret = -1; +                op_errno = EINVAL; +                goto unwind; +        } + +        /** +         * Ongoing version needs to be incremented. If the inode is not dirty, +         * things are simple: increment the ongoing version safely and be done. +         * If inode is dirty, a writeback to disk is required. This is tricky in +         * case of multiple open()'s as ongoing version needs to be incremented +         * on a successful writeback. It's probably safe to remember the ongoing +         * version before writeback and *assigning* it in the callback, but that +         * may lead to a trustable checksum to be treated as stale by scrubber +         * (the case where the in-memory ongoing version is lesser than the +         * on-disk version). Therefore, *all* open() calls (which might have +         * come in parallel) try to synchronize the next ongoing version to +         * disk. In the callback path, the winner marks the inode as synced +         * therby loosing open() calls become no-op's. +         */ +        ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; +        return br_stub_perform_incversioning (this, frame, stub, fd, ctx); + + unwind: +        STACK_UNWIND_STRICT (open, frame, +                             op_ret, op_errno, fd, xdata); +        return 0; +} + +int +br_stub_open (call_frame_t *frame, xlator_t *this, +              loc_t *loc, int32_t flags, fd_t *fd, dict_t *xdata) +{ +        void *cookie = NULL; + +        if (!flags) +                goto wind; +        cookie = (void *) BR_STUB_REQUEST_COOKIE; + + wind: +        STACK_WIND_COOKIE (frame, br_stub_open_cbk, cookie, +                           FIRST_CHILD (this), FIRST_CHILD (this)->fops->open, +                           loc, flags, fd, xdata); +        return 0; +} + +/** }}} */ + + +/** {{{ */ + +/* creat() */ + +int +br_stub_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                    int op_ret, int op_errno, fd_t *fd, inode_t *inode, +                    struct iatt *stbuf, struct iatt *preparent, +                    struct iatt *postparent, dict_t *xdata) +{ +        int32_t ret = 0; +        uint64_t ctx_addr = 0; +        call_stub_t *stub = NULL; +        br_stub_inode_ctx_t *ctx = NULL; + +        if (op_ret < 0) +                goto unwind; + +        stub = fop_create_cbk_stub (frame, NULL, op_ret, op_errno, fd, inode, +                                    stbuf, preparent, postparent, xdata); +        if (!stub) { +                op_ret = -1; +                op_errno = EINVAL; +                goto unwind; +        } + +        ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr); +        if (ret < 0) +                ctx_addr = 0; +        ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; + +        /* see comment in br_stub_open_cbk().. */ +        return (ctx) +                ? br_stub_perform_incversioning (this, frame, stub, fd, ctx) +                : br_stub_perform_fullversioning (this, frame, stub, fd); + + unwind: +        STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, +                             fd, inode, stbuf, preparent, postparent, xdata); +        return 0; +} + +int +br_stub_create (call_frame_t *frame, +                xlator_t *this, loc_t *loc, int32_t flags, +                mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +{ +        STACK_WIND (frame, br_stub_create_cbk, FIRST_CHILD (this), +                    FIRST_CHILD (this)->fops->create, +                    loc, flags, mode, umask, fd, xdata); +        return 0; +} + +/** }}} */ + +static inline int32_t +br_stub_lookup_version (xlator_t *this, +                        uuid_t gfid, inode_t *inode, dict_t *xattr) +{ +        unsigned long       version = 0; +        br_version_t       *obuf    = NULL; +        br_signature_t     *sbuf    = NULL; +        br_vxattr_status_t  status; + +        /** +         * versioning xattrs were requested from POSIX. if available, figure +         * out the correct version to use in the inode context (start with +         * the default version if unavailable). As of now versions are not +         * persisted on-disk. The inode is marked dirty, so that the first +         * operation (such as open(), etc..) would trigger synchronization +         * to disk. +         */ +        status = br_version_xattr_state (xattr, &obuf, &sbuf); + +        /** +         * stub does not know how to handle partial presence of version +         * extended attributes, therefore, bail out in such cases. +         */ +        if (status == BR_VXATTR_STATUS_PARTIAL) { +                gf_log (this->name, GF_LOG_ERROR, "Partial version xattrs!.. " +                        "bailing out [GFID: %s]", uuid_utoa (gfid)); +                return -1; +        } + +        version = (status == BR_VXATTR_STATUS_FULL) +                        ? obuf->ongoingversion : BITROT_DEFAULT_CURRENT_VERSION; +        return br_stub_init_inode_versions (this, NULL, +                                            inode, version, _gf_true); +} + + +/** {{{ */ + +int +br_stub_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                      int op_ret, int op_errno, gf_dirent_t *entries, +                      dict_t *dict) +{ +        int32_t      ret     = 0; +        uint64_t     ctxaddr = 0; +        gf_dirent_t *entry   = NULL; + +        if (op_ret < 0) +                goto unwind; + +        list_for_each_entry (entry, &entries->list, list) { +                if ((strcmp (entry->d_name, ".") == 0) +                    || (strcmp (entry->d_name, "..") == 0)) +                        continue; + +                if (!IA_ISREG (entry->d_stat.ia_type)) +                        continue; + +                if (entry->dict) { +                        br_stub_remove_vxattrs (entry->dict); +                } + +                ret = br_stub_get_inode_ctx (this, entry->inode, &ctxaddr); +                if (ret < 0) +                        ctxaddr = 0; +                if (ctxaddr) /* already has the context */ +                        continue; + +                ret = br_stub_lookup_version +                        (this, entry->inode->gfid, entry->inode, entry->dict); +                if (ret) { +                        /** +                         * there's no per-file granularity support in case of +                         * failure. let's fail the entire request for now.. +                         */ +                        break; +                } +        } + +        if (ret) { +                op_ret   = -1; +                op_errno = EINVAL; +        } + + unwind: +        STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, entries, dict); + +        return 0; +} + +int +br_stub_readdirp (call_frame_t *frame, xlator_t *this, +                  fd_t *fd, size_t size, off_t offset, dict_t *dict) +{ +        int32_t ret = -1; +        int op_errno = 0; +        gf_boolean_t xref = _gf_false; + +        op_errno = ENOMEM; +        if (!dict) { +                dict = dict_new (); +                if (!dict) +                        goto unwind; +        } else { +                dict = dict_ref (dict); +        } + +        xref = _gf_true; + +        op_errno = EINVAL; +        ret = dict_set_uint32 (dict, BITROT_CURRENT_VERSION_KEY, 0); +        if (ret) +                goto unwind; +        ret = dict_set_uint32 (dict, BITROT_SIGNING_VERSION_KEY, 0); +        if (ret) +                goto unwind; + +        STACK_WIND (frame, br_stub_readdirp_cbk, FIRST_CHILD (this), +                    FIRST_CHILD(this)->fops->readdirp, fd, size, +                    offset, dict); +        goto unref_dict; + + unwind: +        STACK_UNWIND_STRICT (readdirp, frame, -1, op_errno, NULL, NULL); +        return 0; + + unref_dict: +        if (xref) +                dict_unref (dict); +        return 0; +} + +/** }}} */ + + +/** {{{ */ + +/* lookup() */ + +int +br_stub_lookup_cbk (call_frame_t *frame, void *cookie, +                    xlator_t *this, int op_ret, int op_errno, inode_t *inode, +                    struct iatt *stbuf, dict_t *xattr, struct iatt *postparent) +{ +        int32_t ret = 0; + +        if (op_ret < 0) +                goto unwind; +        if (!IA_ISREG (stbuf->ia_type)) +                goto unwind; + +        /** +         * perform this before checking if we requested xattrs as this +         * can happen during revalidate. +         */ +        br_stub_remove_vxattrs (xattr); +        if (cookie != (void *) BR_STUB_REQUEST_COOKIE) +                goto unwind; + +        ret = br_stub_lookup_version (this, stbuf->ia_gfid, inode, xattr); +        if (ret < 0) { +                op_ret   = -1; +                op_errno = EINVAL; +        } + + unwind: +        STACK_UNWIND_STRICT (lookup, frame, +                             op_ret, op_errno, inode, stbuf, xattr, postparent); + +        return 0; +} + +int +br_stub_lookup (call_frame_t *frame, +                xlator_t *this, loc_t *loc, dict_t *xdata) +{ +        int32_t ret = 0; +        int op_errno = 0; +        void *cookie = NULL; +        uint64_t ctx_addr = 0; +        gf_boolean_t xref = _gf_false; + +        ret = br_stub_get_inode_ctx (this, loc->inode, &ctx_addr); +        if (ret < 0) +                ctx_addr = 0; +        if (ctx_addr != 0) +                goto wind; + +        /** +         * fresh lookup: request version keys from POSIX +         */ +        op_errno = ENOMEM; +        if (!xdata) { +                xdata = dict_new (); +                if (!xdata) +                        goto unwind; +        } else { +                xdata = dict_ref (xdata); +        } + +        xref = _gf_true; + +        op_errno = EINVAL; +        ret = dict_set_uint32 (xdata, BITROT_CURRENT_VERSION_KEY, 0); +        if (ret) +                goto unwind; +        ret = dict_set_uint32 (xdata, BITROT_SIGNING_VERSION_KEY, 0); +        if (ret) +                goto unwind; +        cookie = (void *) BR_STUB_REQUEST_COOKIE; + + wind: +        STACK_WIND_COOKIE (frame, br_stub_lookup_cbk, cookie, +                           FIRST_CHILD(this), FIRST_CHILD(this)->fops->lookup, +                           loc, xdata); +        goto dealloc_dict; + + unwind: +        STACK_UNWIND_STRICT (lookup, frame, +                             -1, op_errno, NULL, NULL, NULL, NULL); + dealloc_dict: +        if (xref) +                dict_unref (xdata); +        return 0; +} + +/** }}} */ + +/** {{{ */ + +/* forget() */ + +int +br_stub_forget (xlator_t *this, inode_t *inode) +{ +        uint64_t ctx_addr = 0; +        br_stub_inode_ctx_t *ctx = NULL; + +        inode_ctx_del (inode, this, &ctx_addr); +        if (!ctx_addr) +                return 0; + +        ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; +        GF_FREE (ctx); + +        return 0; +} + +/** }}} */ + +/** {{{ */ + +int32_t +br_stub_noop (call_frame_t *frame, void *cookie, xlator_t *this, +              int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ +        STACK_DESTROY (frame->root); +        return 0; +} + +static inline void +br_stub_send_ipc_fop (xlator_t *this, +                      fd_t *fd, unsigned long releaseversion, int32_t flags) +{ +        int32_t op = 0; +        int32_t ret = 0; +        dict_t *xdata = NULL; +        call_frame_t *frame = NULL; +        changelog_event_t ev = {0,}; + +        ev.ev_type = CHANGELOG_OP_TYPE_BR_RELEASE; +        ev.u.releasebr.flags = flags; +        ev.u.releasebr.version = releaseversion; +        uuid_copy (ev.u.releasebr.gfid, fd->inode->gfid); + +        xdata = dict_new (); +        if (!xdata) { +                gf_log (this->name, GF_LOG_WARNING, +                        "dict allocation failed: cannot send IPC FOP " +                        "to changelog"); +                goto out; +        } + +        ret = dict_set_static_bin (xdata, +                                   "RELEASE-EVENT", &ev, CHANGELOG_EV_SIZE); +        if (ret) { +                gf_log (this->name, GF_LOG_WARNING, +                        "cannot set release event in dict"); +                goto dealloc_dict; +        } + +        frame = create_frame (this, this->ctx->pool); +        if (!frame) { +                gf_log (this->name, GF_LOG_WARNING, "create_frame() failure"); +                goto dealloc_dict; +        } + +        op = GF_IPC_TARGET_CHANGELOG; +        STACK_WIND (frame, br_stub_noop, FIRST_CHILD (this), +                    FIRST_CHILD (this)->fops->ipc, op, xdata); +        return; + + dealloc_dict: +        dict_unref (xdata); + out: +        return; +} + +int32_t +br_stub_release (xlator_t *this, fd_t *fd) +{ +        int32_t ret = 0; +        int32_t flags = 0; +        inode_t *inode = NULL; +        unsigned long releaseversion = 0; +        br_stub_inode_ctx_t *ctx = NULL; + +        inode = fd->inode; + +        LOCK (&inode->lock); +        { +                ctx = __br_stub_get_ongoing_version_ctx (this, inode, NULL); +                if (ctx == NULL) +                        goto unblock; +                __br_stub_track_release (ctx); +                ret = __br_stub_can_trigger_release +                                 (inode, ctx, &releaseversion, &flags); +                if (ret) { +                        GF_ASSERT (__br_stub_is_inode_dirty (ctx) == 0); +                        __br_stub_mark_inode_dirty (ctx); +                } +        } + unblock: +        UNLOCK (&inode->lock); + +        if (ret) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "releaseversion: %lu|flags: %d", releaseversion, flags); +                br_stub_send_ipc_fop (this, fd, releaseversion, flags); +        } + +        return 0; +} + +/** }}} */ + +/** {{{ */ + +/* ictxmerge */ + +void +br_stub_ictxmerge (xlator_t *this, fd_t *fd, +                   inode_t *inode, inode_t *linked_inode) +{ +        int32_t ret = 0; +        uint64_t ctxaddr = 0; +        uint64_t lctxaddr = 0; +        br_stub_inode_ctx_t *ctx = NULL; +        br_stub_inode_ctx_t *lctx = NULL; + +        ret = br_stub_get_inode_ctx (this, inode, &ctxaddr); +        if (ret < 0) +                goto done; +        ctx = (br_stub_inode_ctx_t *) ctxaddr; + +        LOCK (&linked_inode->lock); +        { +                ret = __br_stub_get_inode_ctx (this, linked_inode, &lctxaddr); +                if (ret < 0) +                        goto unblock; +                lctx = (br_stub_inode_ctx_t *) lctxaddr; + +                if (__br_stub_is_inode_dirty (lctx)) { +                        /** +                         * RACY code: An inode can end up in this situation +                         * after a lookup() or after a create() followed by +                         * a release(). Even if we distinguish b/w the two, +                         * there needs to be more infrastructure built up +                         * in stub to handle these races. Note, that it's +                         * probably OK to ignore the race iff the version +                         * was initialized on the very first lookup(), i.e., +                         * [ongoingversion: default]. +                         * +                         * FIXME: fixup races [create(1..n)/lookup(1..n)]. +                         */ +                        GF_ASSERT (lctx->currentversion +                                      == BITROT_DEFAULT_CURRENT_VERSION); +                        __br_stub_track_openfd (fd, lctx); +                        __br_stub_mark_inode_synced (lctx); +                } else { +                        GF_ASSERT (ctx->currentversion <= lctx->currentversion); +                        __br_stub_track_openfd (fd, lctx); +                } +        } + unblock: +        UNLOCK (&linked_inode->lock); + + done: +        return; +} + +/** }}} */ + + +struct xlator_fops fops = { +        .lookup    = br_stub_lookup, +        .open      = br_stub_open, +        .create    = br_stub_create, +        .readdirp  = br_stub_readdirp, +        .getxattr  = br_stub_getxattr, +        .fgetxattr = br_stub_fgetxattr, +        .fsetxattr = br_stub_fsetxattr, +}; + +struct xlator_cbks cbks = { +        .forget    = br_stub_forget, +        .release   = br_stub_release, +        .ictxmerge = br_stub_ictxmerge, +}; + +struct volume_options options[] = { +        { .key = {"bitrot"}, +          .type = GF_OPTION_TYPE_BOOL, +          .default_value = "on", +          .description = "enable/disable bitrot stub" +        }, +        { .key = {"export"}, +          .type = GF_OPTION_TYPE_PATH, +          .description = "brick path for versioning" +        }, +	{ .key  = {NULL} }, +}; diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub.h b/xlators/features/bit-rot/src/stub/bit-rot-stub.h new file mode 100644 index 00000000000..d565112b1ad --- /dev/null +++ b/xlators/features/bit-rot/src/stub/bit-rot-stub.h @@ -0,0 +1,269 @@ + /* +   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> +   This file is part of GlusterFS. + +   This file is licensed to you under your choice of the GNU Lesser +   General Public License, version 3 or any later version (LGPLv3 or +   later), or the GNU General Public License, version 2 (GPLv2), in all +   cases as published by the Free Software Foundation. +*/ +#ifndef __BIT_ROT_STUB_H__ +#define __BIT_ROT_STUB_H__ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "defaults.h" +#include "call-stub.h" + +#include "bit-rot-common.h" + +typedef int (br_stub_version_cbk) (call_frame_t *, void *, +                                   xlator_t *, int32_t, int32_t, dict_t *); + +typedef struct br_stub_inode_ctx { +        int need_writeback;                     /* does the inode need +                                                      a writeback to disk? */ +        unsigned long currentversion;           /* ongoing version */ + +        struct release { +                int32_t ordflags; +                unsigned long opencount;        /* number of open()s before +                                                   final release() */ +                unsigned long releasecount;     /* number of release()s */ +        } releasectx; +#define BR_STUB_REQUIRE_RELEASE_CBK 0x0E0EA0E +} br_stub_inode_ctx_t; + + +#define I_DIRTY  (1<<0)        /* inode needs writeback */ +#define WRITEBACK_DURABLE 1    /* writeback is durable */ + +/** + * This could just have been a plain struct without unions and all, + * but we may need additional things in the future. + */ +typedef struct br_stub_local { +        call_stub_t *fopstub;   /* stub for original fop */ + +        int versioningtype;     /* not much used atm */ + +        union { +                struct br_stub_ctx { +                        fd_t          *fd; +                        uuid_t         gfid; +                        inode_t       *inode; +                        unsigned long  version; +                        gf_boolean_t   markdirty; +                } context; +        } u; +} br_stub_local_t; + +#define BR_STUB_FULL_VERSIONING (1<<0) +#define BR_STUB_INCREMENTAL_VERSIONING (1<<1) + +typedef struct br_stub_private { +        gf_boolean_t go; + +        uint32_t boot[2]; +        char export[PATH_MAX]; + +        struct mem_pool *local_pool; +} br_stub_private_t; + +/* inode writeback helpers */ +static inline void +__br_stub_mark_inode_dirty (br_stub_inode_ctx_t *ctx) +{ +        ctx->need_writeback |= I_DIRTY; +} + +static inline void +__br_stub_mark_inode_synced (br_stub_inode_ctx_t *ctx) +{ +        ctx->need_writeback &= ~I_DIRTY; +} + +static inline int +__br_stub_is_inode_dirty (br_stub_inode_ctx_t *ctx) +{ +        return (ctx->need_writeback & I_DIRTY); +} + +static inline int +br_stub_require_release_call (xlator_t *this, fd_t *fd) +{ +        int32_t ret = 0; + +        ret = fd_ctx_set (fd, this, +                          (uint64_t)(long)BR_STUB_REQUIRE_RELEASE_CBK); +        if (ret) +                gf_log (this->name, GF_LOG_WARNING, +                        "could not set fd context (for release callback"); +        return ret; +} + +/* get/set inode context helpers */ + +static inline int +__br_stub_get_inode_ctx (xlator_t *this, +                         inode_t *inode, uint64_t *ctx) +{ +        return __inode_ctx_get (inode, this, ctx); +} + +static inline int +br_stub_get_inode_ctx (xlator_t *this, +                       inode_t *inode, uint64_t *ctx) +{ +        return inode_ctx_get (inode, this, ctx); +} + +static inline int +br_stub_set_inode_ctx (xlator_t *this, +                       inode_t *inode, br_stub_inode_ctx_t *ctx) +{ +        uint64_t ctx_addr = (uint64_t) ctx; +        return inode_ctx_set (inode, this, &ctx_addr); +} + +/* version get/set helpers */ + +static inline unsigned long +__br_stub_writeback_version (br_stub_inode_ctx_t *ctx) +{ +        return (ctx->currentversion + 1); +} + +static inline void +__br_stub_set_ongoing_version (br_stub_inode_ctx_t *ctx, unsigned long version) +{ +        ctx->currentversion = version; +} + +static inline void +__br_stub_reset_release_counters (br_stub_inode_ctx_t *ctx) +{ +        ctx->releasectx.ordflags = 0; +        ctx->releasectx.opencount = 0; +        ctx->releasectx.releasecount = 0; +} + +static inline void +__br_stub_track_release (br_stub_inode_ctx_t *ctx) +{ +        ++ctx->releasectx.releasecount; +} + +static inline void +___br_stub_track_open (br_stub_inode_ctx_t *ctx) +{ +        ++ctx->releasectx.opencount; +} + +static inline void +___br_stub_track_open_flags (fd_t *fd, br_stub_inode_ctx_t *ctx) +{ +        ctx->releasectx.ordflags |= fd->flags; +} + +static inline void +__br_stub_track_openfd (fd_t *fd, br_stub_inode_ctx_t *ctx) +{ +        ___br_stub_track_open (ctx); +        ___br_stub_track_open_flags (fd, ctx); +} + +static inline int +__br_stub_can_trigger_release (inode_t *inode, +                               br_stub_inode_ctx_t *ctx, +                               unsigned long *version, int32_t *flags) +{ +        if (list_empty (&inode->fd_list) +            && (ctx->releasectx.releasecount == ctx->releasectx.opencount)) { +                if (flags) +                        *flags = htonl (ctx->releasectx.ordflags); +                if (version) +                        *version = htonl (ctx->currentversion); + +                __br_stub_reset_release_counters (ctx); +                return 1; +        } + +        return 0; +} + +static inline int32_t +br_stub_get_ongoing_version (xlator_t *this, +                             inode_t *inode, unsigned long *version) +{ +        int32_t ret = 0; +        uint64_t ctx_addr = 0; +        br_stub_inode_ctx_t *ctx = NULL; + +        LOCK (&inode->lock); +        { +                ret = __inode_ctx_get (inode, this, &ctx_addr); +                if (ret < 0) +                        goto unblock; +                ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; +                *version = ctx->currentversion; +        } + unblock: +        UNLOCK (&inode->lock); + +        return ret; +} + +/** + * fetch the current version from inode and return the context. + * inode->lock should be held before invoking this as context + * *needs* to be valid in the caller. + */ +static inline br_stub_inode_ctx_t * +__br_stub_get_ongoing_version_ctx (xlator_t *this, +                                   inode_t *inode, unsigned long *version) +{ +        int32_t ret = 0; +        uint64_t ctx_addr = 0; +        br_stub_inode_ctx_t *ctx = NULL; + +        ret = __inode_ctx_get (inode, this, &ctx_addr); +        if (ret < 0) +                return NULL; +        ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; +        if (version) +                *version = ctx->currentversion; + +        return ctx; +} + +/* filter for xattr fetch */ +static inline int +br_stub_is_internal_xattr (const char *name) +{ +        if (name +            && ((strncmp (name, BITROT_CURRENT_VERSION_KEY, +                          strlen (BITROT_CURRENT_VERSION_KEY)) == 0) +                || (strncmp (name, BITROT_SIGNING_VERSION_KEY, +                             strlen (BITROT_SIGNING_VERSION_KEY)) == 0))) +                return 1; +        return 0; +} + +static inline void +br_stub_remove_vxattrs (dict_t *xattr) +{ +        if (xattr) { +                dict_del (xattr, BITROT_CURRENT_VERSION_KEY); +                dict_del (xattr, BITROT_SIGNING_VERSION_KEY); +        } +} + +#endif /* __BIT_ROT_STUB_H__ */  | 
