diff options
| author | Jeff Darcy <jdarcy@redhat.com> | 2016-02-08 13:30:49 -0500 | 
|---|---|---|
| committer | Jeff Darcy <jdarcy@redhat.com> | 2016-02-13 05:13:07 -0800 | 
| commit | c458433041aafb48ae6d6e5fcf3e1e737dc3fda3 (patch) | |
| tree | 33a03ca0c1f5faf58419de2c4ff4532752ddfb07 /xlators | |
| parent | da33097c3d6492e3b468b4347e47c70828fb4320 (diff) | |
experimental: add fdl (Full Data Logging) translator
NSR needs logging that differs from our existing changelog in
several ways:
 * Full data, not just metadata
 * Pre-op, not post-op
 * High performance
 * Supports the concept of time-bounded "terms"
Others (for example EC) might need the same thing.  This patch adds such
a translator.  It also adds code to dump the resulting journals, and to replay
them using syncops, plus (very rudimentary) tests for all of the above.
Change-Id: I29680a1b4e0a9e7d5a8497fef302c46434b86636
Signed-off-by: Jeff Darcy <jdarcy@redhat.com>
Reviewed-on: http://review.gluster.org/12450
Smoke: Gluster Build System <jenkins@build.gluster.com>
CentOS-regression: Gluster Build System <jenkins@build.gluster.com>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
Diffstat (limited to 'xlators')
| -rw-r--r-- | xlators/experimental/Makefile.am | 2 | ||||
| -rw-r--r-- | xlators/experimental/fdl/Makefile.am | 3 | ||||
| -rw-r--r-- | xlators/experimental/fdl/src/Makefile.am | 42 | ||||
| -rw-r--r-- | xlators/experimental/fdl/src/dump-tmpl.c | 156 | ||||
| -rw-r--r-- | xlators/experimental/fdl/src/fdl-tmpl.c | 506 | ||||
| -rwxr-xr-x | xlators/experimental/fdl/src/gen_dumper.py | 116 | ||||
| -rwxr-xr-x | xlators/experimental/fdl/src/gen_fdl.py | 328 | ||||
| -rwxr-xr-x | xlators/experimental/fdl/src/gen_recon.py | 191 | ||||
| -rw-r--r-- | xlators/experimental/fdl/src/jnl-types.h | 14 | ||||
| -rw-r--r-- | xlators/experimental/fdl/src/logdump.c | 50 | ||||
| -rw-r--r-- | xlators/experimental/fdl/src/recon-tmpl.c | 305 | ||||
| -rw-r--r-- | xlators/experimental/fdl/src/recon.c | 89 | ||||
| -rw-r--r-- | xlators/features/Makefile.am | 7 | ||||
| -rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-volgen.c | 25 | ||||
| -rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-volume-set.c | 9 | 
15 files changed, 1839 insertions, 4 deletions
diff --git a/xlators/experimental/Makefile.am b/xlators/experimental/Makefile.am index 06f04a193c8..a31512203f6 100644 --- a/xlators/experimental/Makefile.am +++ b/xlators/experimental/Makefile.am @@ -1,3 +1,3 @@ -SUBDIRS = nsr-client nsr-server +SUBDIRS = nsr-client nsr-server fdl  CLEANFILES = diff --git a/xlators/experimental/fdl/Makefile.am b/xlators/experimental/fdl/Makefile.am new file mode 100644 index 00000000000..a985f42a877 --- /dev/null +++ b/xlators/experimental/fdl/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/experimental/fdl/src/Makefile.am b/xlators/experimental/fdl/src/Makefile.am new file mode 100644 index 00000000000..a05fc797b0a --- /dev/null +++ b/xlators/experimental/fdl/src/Makefile.am @@ -0,0 +1,42 @@ +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/experimental +xlator_LTLIBRARIES = fdl.la + +noinst_HEADERS = jnl-types.h + +nodist_fdl_la_SOURCES = fdl.c +fdl_la_LDFLAGS = -module -avoid-version +fdl_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +sbin_PROGRAMS = gf_logdump gf_recon +gf_logdump_SOURCES = logdump.c +nodist_gf_logdump_SOURCES = libfdl.c +gf_logdump_LDADD = $(top_builddir)/libglusterfs/src/libglusterfs.la\ +                   $(top_builddir)/api/src/libgfapi.la + +# Eventually recon(ciliation) code will move elsewhere, but for now it's +# easier to have it next to the similar logdump code. 
+gf_recon_SOURCES = recon.c +nodist_gf_recon_SOURCES = librecon.c +gf_recon_LDADD = $(top_builddir)/libglusterfs/src/libglusterfs.la\ +                   $(top_builddir)/api/src/libgfapi.la + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ +	      -I$(top_srcdir)/api/src -fPIC \ +	      -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -D$(GF_HOST_OS) \ +	      -DDATADIR=\"$(localstatedir)\" + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +noinst_PYTHON = gen_fdl.py gen_dumper.py gen_recon.py +EXTRA_DIST = fdl-tmpl.c dump-tmpl.c recon-tmpl.c + +CLEANFILES = $(nodist_fdl_la_SOURCES) $(nodist_gf_logdump_SOURCES) + +fdl.c: fdl-tmpl.c gen_fdl.py +	$(PYTHON) $(srcdir)/gen_fdl.py $(srcdir)/fdl-tmpl.c > $@ + +libfdl.c: dump-tmpl.c gen_dumper.py +	$(PYTHON) $(srcdir)/gen_dumper.py $(srcdir)/dump-tmpl.c > $@ + +librecon.c: recon-tmpl.c gen_recon.py +	$(PYTHON) $(srcdir)/gen_recon.py $(srcdir)/recon-tmpl.c > $@ diff --git a/xlators/experimental/fdl/src/dump-tmpl.c b/xlators/experimental/fdl/src/dump-tmpl.c new file mode 100644 index 00000000000..cac1071a9c1 --- /dev/null +++ b/xlators/experimental/fdl/src/dump-tmpl.c @@ -0,0 +1,156 @@ +#pragma fragment PROLOG +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glfs.h" +#include "iatt.h" +#include "xlator.h" +#include "jnl-types.h" + +#pragma fragment DICT +        { +                int key_len, data_len; +                char *key_ptr; +                printf ("@ARGNAME@ = dict {\n"); +                for (;;) { +                        key_len = *((int *)new_meta); +                        new_meta += sizeof(int); +                        if (!key_len) { +                                break; +                        } +                        key_ptr = new_meta; +                        new_meta += key_len; +                        data_len = *((int *)new_meta); +                        new_meta += sizeof(int) + data_len; +                        printf (" %s = <%d bytes>\n", key_ptr, data_len); +     
           } +                printf ("}\n"); +        } + +#pragma fragment DOUBLE +        printf ("@ARGNAME@ = @FORMAT@\n", *((uint64_t *)new_meta), +                *((uint64_t *)new_meta)); +        new_meta += sizeof(uint64_t); + +#pragma fragment GFID +        printf ("@ARGNAME@ = <gfid %s>\n", uuid_utoa(*((uuid_t *)new_meta))); +        new_meta += 16; + +#pragma fragment INTEGER +        printf ("@ARGNAME@ = @FORMAT@\n", *((uint32_t *)new_meta), +                *((uint32_t *)new_meta)); +        new_meta += sizeof(uint32_t); + +#pragma fragment LOC +        printf ("@ARGNAME@ = loc {\n"); +        printf ("  gfid = %s\n", uuid_utoa(*((uuid_t *)new_meta))); +        new_meta += 16; +        printf ("  pargfid = %s\n", uuid_utoa(*((uuid_t *)new_meta))); +        new_meta += 16; +        if (*(new_meta++)) { +                printf ("  name = %s\n", new_meta); +                new_meta += (strlen(new_meta) + 1); +        } +        printf ("}\n"); + +#pragma fragment STRING +        if (*(new_meta++)) { +                printf ("@ARGNAME@ = %s\n", new_meta); +                new_meta += (strlen(new_meta) + 1); +        } + +#pragma fragment VECTOR +        { +                size_t len = *((size_t *)new_meta); +                new_meta += sizeof(len); +                printf ("@ARGNAME@ = <%zu bytes>\n", len); +                new_data += len; +        } + +#pragma fragment IATT +        { +                ia_prot_t *myprot = ((ia_prot_t *)new_meta); +                printf ("@ARGNAME@ = iatt {\n"); +                printf ("  ia_prot = %c%c%c", +                        myprot->suid ? 'S' : '-', +                        myprot->sgid ? 'S' : '-', +                        myprot->sticky ? 'T' : '-'); +                printf ("%c%c%c", +                        myprot->owner.read ? 'r' : '-', +                        myprot->owner.write ? 'w' : '-', +                        myprot->owner.exec ? 
'x' : '-'); +                printf ("%c%c%c", +                        myprot->group.read ? 'r' : '-', +                        myprot->group.write ? 'w' : '-', +                        myprot->group.exec ? 'x' : '-'); +                printf ("%c%c%c\n", +                        myprot->other.read ? 'r' : '-', +                        myprot->other.write ? 'w' : '-', +                        myprot->other.exec ? 'x' : '-'); +                new_meta += sizeof(ia_prot_t); +                uint32_t *myints = (uint32_t *)new_meta; +                printf ("  ia_uid = %u\n", myints[0]); +                printf ("  ia_gid = %u\n", myints[1]); +                printf ("  ia_atime = %u.%09u\n", myints[2], myints[3]); +                printf ("  ia_mtime = %u.%09u\n", myints[4], myints[5]); +                new_meta += sizeof(*myints) * 6; +        } + +#pragma fragment FOP +void +fdl_dump_@NAME@ (char **old_meta, char **old_data) +{ +        char    *new_meta	= *old_meta; +        char	*new_data	= *old_data; + +        /* TBD: word size/endianness */ +@FUNCTION_BODY@ + +        *old_meta = new_meta; +        *old_data = new_data; +} + +#pragma fragment CASE +        case GF_FOP_@UPNAME@: +                printf ("=== GF_FOP_@UPNAME@\n"); +                fdl_dump_@NAME@ (&new_meta, &new_data); +                break; + +#pragma fragment EPILOG +int +fdl_dump (char **old_meta, char **old_data) +{ +        char            *new_meta       = *old_meta; +        char            *new_data       = *old_data; +        static glfs_t   *fs             = NULL; +        int             recognized      = 1; +        event_header_t  *eh; + +        /* +         * We don't really call anything else in GFAPI, but this is the most +         * convenient way to satisfy all of the spurious dependencies on how it +         * or glusterfsd initialize (e.g. setting up THIS). 
+         */ +        if (!fs) { +                fs = glfs_new ("dummy"); +        } + +        eh = (event_header_t *)new_meta; +        new_meta += sizeof (*eh); + +        /* TBD: check event_type instead of assuming NEW_REQUEST */ + +        switch (eh->fop_type) { +@SWITCH_BODY@ + +        default: +                printf ("unknown fop %u\n", eh->fop_type); +                recognized = 0; +        } + +        *old_meta = new_meta; +        *old_data = new_data; +        return recognized; +} diff --git a/xlators/experimental/fdl/src/fdl-tmpl.c b/xlators/experimental/fdl/src/fdl-tmpl.c new file mode 100644 index 00000000000..8fcc6a8d6ff --- /dev/null +++ b/xlators/experimental/fdl/src/fdl-tmpl.c @@ -0,0 +1,506 @@ +/* +  Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. 
+*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <fcntl.h> +#include <unistd.h> +#include <sys/mman.h> +#include "call-stub.h" +#include "iatt.h" +#include "defaults.h" +#include "syscall.h" +#include "xlator.h" +#include "jnl-types.h" + +/* TBD: make tunable */ +#define META_FILE_SIZE  (1 << 20) +#define DATA_FILE_SIZE  (1 << 24) + +enum gf_fdl { +        gf_fdl_mt_fdl_private_t = gf_common_mt_end + 1, +        gf_fdl_mt_end +}; + +typedef struct { +        char            *type; +        off_t           size; +        char            *path; +        int             fd; +        void *          ptr; +        off_t           max_offset; +} log_obj_t; + +typedef struct { +        struct list_head        reqs; +        pthread_mutex_t         req_lock; +        pthread_cond_t          req_cond; +        char                    *log_dir; +        pthread_t               worker; +        gf_boolean_t            should_stop; +        gf_boolean_t            change_term; +        log_obj_t               meta_log; +        log_obj_t               data_log; +        int                     term; +        int                     first_term; +} fdl_private_t; + +void +fdl_enqueue (xlator_t *this, call_stub_t *stub) +{ +        fdl_private_t   *priv   = this->private; + +        pthread_mutex_lock (&priv->req_lock); +        list_add_tail (&stub->list, &priv->reqs); +        pthread_mutex_unlock (&priv->req_lock); + +        pthread_cond_signal (&priv->req_cond); +} + +#pragma generate + +char * +fdl_open_term_log (xlator_t *this, log_obj_t *obj, int term) +{ +        fdl_private_t   *priv   = this->private; +        int             ret; +        char *          ptr     = NULL; + +        /* +         * Use .jnl instead of .log so that we don't get test info (mistakenly) +         * appended to our journal files. 
+         */ +        if (this->ctx->cmd_args.log_ident) { +                ret = gf_asprintf (&obj->path, "%s/%s-%s-%d.jnl", +                                   priv->log_dir, this->ctx->cmd_args.log_ident, +                                   obj->type, term); +        } +        else { +                ret = gf_asprintf (&obj->path, "%s/fubar-%s-%d.jnl", +                                   priv->log_dir, obj->type, term); +        } +        if ((ret <= 0) || !obj->path) { +                gf_log (this->name, GF_LOG_ERROR, +                        "failed to construct log-file path"); +                goto err; +        } + +        gf_log (this->name, GF_LOG_INFO, "opening %s (size %ld)", +                obj->path, obj->size); + +        obj->fd = open (obj->path, O_RDWR|O_CREAT|O_TRUNC, 0666); +        if (obj->fd < 0) { +                gf_log (this->name, GF_LOG_ERROR, +                        "failed to open log file (%s)", strerror(errno)); +                goto err; +        } + +#if !defined(GF_BSD_HOST_OS) +        /* +         * NetBSD can just go die in a fire.  Even though it claims to support +         * fallocate/posix_fallocate they don't actually *do* anything so the +         * file size remains zero.  Then mmap succeeds anyway, but any access +         * to the mmap'ed region will segfault.  It would be acceptable for +         * fallocate to do what it says, for mmap to fail, or for access to +         * extend the file.  NetBSD managed to hit the trifecta of Getting +         * Everything Wrong, and debugging in that environment to get this far +         * has already been painful enough (systems I worked on in 1990 were +         * better that way).  We'll fall through to the lseek/write method, and +         * performance will be worse, and TOO BAD. 
+         */ +        if (sys_fallocate(obj->fd,0,0,obj->size) < 0) +#endif +        { +                gf_log (this->name, GF_LOG_WARNING, +                        "failed to fallocate space for log file"); +                /* Have to do this the ugly page-faulty way. */ +                (void) sys_lseek (obj->fd, obj->size-1, SEEK_SET); +                (void) sys_write (obj->fd, "", 1); +        } + +        ptr = mmap (NULL, obj->size, PROT_WRITE, MAP_SHARED, obj->fd, 0); +        if (ptr == MAP_FAILED) { +                gf_log (this->name, GF_LOG_ERROR, "failed to mmap log (%s)", +                        strerror(errno)); +                goto err; +        } + +        obj->ptr = ptr; +        obj->max_offset = 0; +        return ptr; + +err: +        if (obj->fd >= 0) { +                sys_close (obj->fd); +                obj->fd = (-1); +        } +        if (obj->path) { +                GF_FREE (obj->path); +                obj->path = NULL; +        } +        return ptr; +} + +void +fdl_close_term_log (xlator_t *this, log_obj_t *obj) +{ +        fdl_private_t   *priv           = this->private; + +        if (obj->ptr) { +                (void) munmap (obj->ptr, obj->size); +                obj->ptr = NULL; +        } + +        if (obj->fd >= 0) { +                gf_log (this->name, GF_LOG_INFO, +                        "truncating term %d %s journal to %ld", +                        priv->term, obj->type, obj->max_offset); +                if (sys_ftruncate(obj->fd,obj->max_offset) < 0) { +                        gf_log (this->name, GF_LOG_WARNING, +                                "failed to truncate journal (%s)", +                                strerror(errno)); +                } +                sys_close (obj->fd); +                obj->fd = (-1); +        } + +        if (obj->path) { +                GF_FREE (obj->path); +                obj->path = NULL; +        } +} + +gf_boolean_t +fdl_change_term (xlator_t *this, char **meta_ptr, char 
**data_ptr) +{ +        fdl_private_t   *priv           = this->private; + +        fdl_close_term_log (this, &priv->meta_log); +        fdl_close_term_log (this, &priv->data_log); + +        ++(priv->term); + +        *meta_ptr = fdl_open_term_log (this, &priv->meta_log, priv->term); +        if (!*meta_ptr) { +                return _gf_false; +        } + +        *data_ptr = fdl_open_term_log (this, &priv->data_log, priv->term); +        if (!*data_ptr) { +                return _gf_false; +        } + +        return _gf_true; +} + +void * +fdl_worker (void *arg) +{ +        xlator_t        *this           = arg; +        fdl_private_t   *priv           = this->private; +        call_stub_t     *stub; +        char *          meta_ptr        = NULL; +        off_t           *meta_offset    = &priv->meta_log.max_offset; +        char *          data_ptr        = NULL; +        off_t           *data_offset    = &priv->data_log.max_offset; +        unsigned long   base_as_ul; +        void *          msync_ptr; +        size_t          msync_len; +        gf_boolean_t    recycle; +        void            *err_label      = &&err_unlocked; + +        priv->meta_log.type = "meta"; +        priv->meta_log.size = META_FILE_SIZE; +        priv->meta_log.path = NULL; +        priv->meta_log.fd = (-1); +        priv->meta_log.ptr = NULL; + +        priv->data_log.type = "data"; +        priv->data_log.size = DATA_FILE_SIZE; +        priv->data_log.path = NULL; +        priv->data_log.fd = (-1); +        priv->data_log.ptr = NULL; + +        /* TBD: initial term should come from persistent storage (e.g. 
etcd) */ +        priv->first_term = ++(priv->term); +        meta_ptr = fdl_open_term_log (this, &priv->meta_log, priv->term); +        if (!meta_ptr) { +                goto *err_label; +        } +        data_ptr = fdl_open_term_log (this, &priv->data_log, priv->term); +        if (!data_ptr) { +                fdl_close_term_log (this, &priv->meta_log); +                goto *err_label; +        } + +        for (;;) { +                pthread_mutex_lock (&priv->req_lock); +                err_label = &&err_locked; +                while (list_empty(&priv->reqs)) { +                        pthread_cond_wait (&priv->req_cond, &priv->req_lock); +                        if (priv->should_stop) { +                                goto *err_label; +                        } +                        if (priv->change_term) { +                                if (!fdl_change_term(this, &meta_ptr, +                                                           &data_ptr)) { +                                        goto *err_label; +                                } +                                priv->change_term = _gf_false; +                                continue; +                        } +                } +                stub = list_entry (priv->reqs.next, call_stub_t, list); +                list_del_init (&stub->list); +                pthread_mutex_unlock (&priv->req_lock); +                err_label = &&err_unlocked; +                /* +                 * TBD: batch requests +                 * +                 * What we should do here is gather up *all* of the requests +                 * that have accumulated since we were last at this point, +                 * blast them all out in one big writev, and then dispatch them +                 * all before coming back for more.  That maximizes throughput, +                 * at some cost to latency (due to queuing effects at the log +                 * stage).  
Note that we're likely to be above io-threads, so +                 * the dispatch itself will be parallelized (at further cost to +                 * latency).  For now, we just do the simplest thing and handle +                 * one request all the way through before fetching the next. +                 * +                 * So, why mmap/msync instead of writev/fdatasync?  Because it's +                 * faster.  Much faster.  So much faster that I half-suspect +                 * cheating, but it's more convenient for now than having to +                 * ensure that everything's page-aligned for O_DIRECT (the only +                 * alternative that still might avoid ridiculous levels of +                 * local-FS overhead). +                 * +                 * TBD: check that msync really does get our data to disk. +                 */ +                gf_log (this->name, GF_LOG_DEBUG, +                        "logging %u+%u bytes for op %d", +                        stub->jnl_meta_len, stub->jnl_data_len, stub->fop); +                recycle = _gf_false; +                if ((*meta_offset + stub->jnl_meta_len) > priv->meta_log.size) { +                        recycle = _gf_true; +                } +                if ((*data_offset + stub->jnl_data_len) > priv->data_log.size) { +                        recycle = _gf_true; +                } +                if (recycle && !fdl_change_term(this,&meta_ptr,&data_ptr)) { +                        goto *err_label; +                } +                meta_ptr = priv->meta_log.ptr; +                data_ptr = priv->data_log.ptr; +                gf_log (this->name, GF_LOG_DEBUG, "serializing to %p/%p", +                        meta_ptr + *meta_offset, data_ptr + *data_offset); +                stub->serialize (stub, meta_ptr + *meta_offset, +                                       data_ptr + *data_offset); +                if (stub->jnl_meta_len > 0) { +                        base_as_ul = (unsigned long) 
(meta_ptr + *meta_offset); +                        msync_ptr = (void *) (base_as_ul & ~0x0fff); +                        msync_len = (size_t) (base_as_ul &  0x0fff); +                        if (msync (msync_ptr, msync_len+stub->jnl_meta_len, +                                              MS_SYNC) < 0) { +                                gf_log (this->name, GF_LOG_WARNING, +                                        "failed to log request meta (%s)", +                                        strerror(errno)); +                        } +                        *meta_offset += stub->jnl_meta_len; +                } +                if (stub->jnl_data_len > 0) { +                        base_as_ul = (unsigned long) (data_ptr + *data_offset); +                        msync_ptr = (void *) (base_as_ul & ~0x0fff); +                        msync_len = (size_t) (base_as_ul &  0x0fff); +                        if (msync (msync_ptr, msync_len+stub->jnl_data_len, +                                              MS_SYNC) < 0) { +                                gf_log (this->name, GF_LOG_WARNING, +                                        "failed to log request data (%s)", +                                        strerror(errno)); +                        } +                        *data_offset += stub->jnl_data_len; +                } +                call_resume (stub); +        } + +err_locked: +        pthread_mutex_unlock (&priv->req_lock); +err_unlocked: +        fdl_close_term_log (this, &priv->meta_log); +        fdl_close_term_log (this, &priv->data_log); +        return NULL; +} + +int32_t +fdl_ipc (call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata) +{ +        fdl_private_t   *priv   = this->private; +        dict_t          *tdict; +        int32_t         gt_err  = EIO; + +        switch (op) { + +        case FDL_IPC_CHANGE_TERM: +                gf_log (this->name, GF_LOG_INFO, "got CHANGE_TERM op"); +                priv->change_term = _gf_true; +           
     pthread_cond_signal (&priv->req_cond); +                STACK_UNWIND_STRICT (ipc, frame, 0, 0, NULL); +                break; + +        case FDL_IPC_GET_TERMS: +                gf_log (this->name, GF_LOG_INFO, "got GET_TERMS op"); +                tdict = dict_new (); +                if (!tdict) { +                        gt_err = ENOMEM; +                        goto gt_done; +                } +                if (dict_set_int32(tdict,"first",priv->first_term) != 0) { +                        goto gt_done; +                } +                if (dict_set_int32(tdict,"last",priv->term) != 0) { +                        goto gt_done; +                } +                gt_err = 0; +        gt_done: +                if (gt_err) { +                        STACK_UNWIND_STRICT (ipc, frame, -1, gt_err, NULL); +                } else { +                        STACK_UNWIND_STRICT (ipc, frame, 0, 0, tdict); +                } +                if (tdict) { +                        dict_unref (tdict); +                } +                break; + +        default: +                STACK_WIND_TAIL (frame, +                                 FIRST_CHILD(this), +                                 FIRST_CHILD(this)->fops->ipc, +                                 op, xdata); +        } + +        return 0; +} + +int +fdl_init (xlator_t *this) +{ +        fdl_private_t   *priv   = NULL; + +        priv = GF_CALLOC (1, sizeof (*priv), gf_fdl_mt_fdl_private_t); +        if (!priv) { +                gf_log (this->name, GF_LOG_ERROR, +                        "failed to allocate fdl_private"); +                goto err; +        } + +        INIT_LIST_HEAD (&priv->reqs); +        if (pthread_mutex_init (&priv->req_lock, NULL) != 0) { +                gf_log (this->name, GF_LOG_ERROR, +                        "failed to initialize req_lock"); +                goto err; +        } +        if (pthread_cond_init (&priv->req_cond, NULL) != 0) { +                gf_log (this->name, 
GF_LOG_ERROR, +                        "failed to initialize req_cond"); +                goto err; +        } + +        GF_OPTION_INIT ("log-path", priv->log_dir, path, err); + +        if (pthread_create(&priv->worker,NULL,fdl_worker,this) != 0) { +                gf_log (this->name, GF_LOG_ERROR, +                        "failed to start fdl_worker"); +                goto err; +        } + +        /* +         * The rest of the fop table is automatically generated, so this is a +         * bit cleaner than messing with the generation to add a hand-written +         * exception. +         */ +        this->fops->ipc = fdl_ipc; + +        this->private = priv; +        return 0; + +err: +        if (priv) { +                GF_FREE(priv); +        } +        return -1; +} + +void +fdl_fini (xlator_t *this) +{ +        fdl_private_t   *priv   = this->private; + +        if (priv) { +                priv->should_stop = _gf_true; +                pthread_cond_signal (&priv->req_cond); +                pthread_join (priv->worker, NULL); +                GF_FREE(priv); +        } +} + +int +fdl_reconfigure (xlator_t *this, dict_t *options) +{ +        fdl_private_t   *priv   = this->private; + +	GF_OPTION_RECONF ("log_dir", priv->log_dir, options, path, out); +        /* TBD: react if it changed */ + +out: +        return 0; +} + +int32_t +mem_acct_init (xlator_t *this) +{ +        int     ret = -1; + +        GF_VALIDATE_OR_GOTO ("fdl", this, out); + +        ret = xlator_mem_acct_init (this, gf_fdl_mt_end + 1); + +        if (ret != 0) { +                gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" +                        "failed"); +                return ret; +        } +out: +        return ret; +} + +class_methods_t class_methods = { +        .init           = fdl_init, +        .fini           = fdl_fini, +        .reconfigure    = fdl_reconfigure, +        .notify         = default_notify, +}; + +struct volume_options options[] = { +        { 
.key = {"log-path"}, +          .type = GF_OPTION_TYPE_PATH, +          .default_value = DEFAULT_LOG_FILE_DIRECTORY, +          .description = "Directory for FDL files." +        }, +        { .key  = {NULL} }, +}; + +struct xlator_cbks cbks = { +        .release        = default_release, +        .releasedir     = default_releasedir, +        .forget         = default_forget, +}; diff --git a/xlators/experimental/fdl/src/gen_dumper.py b/xlators/experimental/fdl/src/gen_dumper.py new file mode 100755 index 00000000000..42db55d2cb3 --- /dev/null +++ b/xlators/experimental/fdl/src/gen_dumper.py @@ -0,0 +1,116 @@ +#!/usr/bin/python + +import os +import re +import sys + +curdir = os.path.dirname (sys.argv[0]) +gendir = os.path.join (curdir, '../../../../libglusterfs/src') +sys.path.append (gendir) +from generator import ops, fop_subs, cbk_subs, generate + +# See the big header comment at the start of gen_fdl.py to see how the stages +# fit together.  The big difference here is that *all* of the C code is in the +# template file as labelled fragments, instead of as Python strings.  That +# makes it much easier to edit in one place, with proper syntax highlighting +# and indentation. +# +#   Stage 1 uses type-specific fragments to generate FUNCTION_BODY, instead of +#   LEN_*_TEMPLATE and SERLZ_*_TEMPLATE to generate LEN_CODE and SER_CODE. +# +#   Stage 2 uses the FOP and CASE fragments instead of RECON_TEMPLATE and +#   FOP_TEMPLATE.  The expanded FOP code (including FUNCTION_BODY substitution +#   in the middle of each function) is emitted immediately; the expanded CASE +#   code is saved for the next stage. +# +#   Stage 3 uses the PROLOG and EPILOG fragments, with the expanded CASE code +#   in the middle of EPILOG, to generate the whole output file. +# +# Another way of looking at it is to consider how the fragments appear in +# the final output: +# +#   PROLOG +#   FOP (expanded for CREATE) +#       FOP before FUNCTION_BODY +#       LOC, INTEGER, GFID, etc. 
(one per arg, by type) +#       FOP after FUNCTION_BODY +#   FOP (expanded for WRITEV) +#       FOP before FUNCTION_BODY +#       GFID, VECTOR, etc. (on per arg, by type) +#       FOP after FUNCTION_BODY +#   (more FOPs) +#   EPILOG +#       EPILOG before CASE +#       CASE statements (one per fop) +#       EPILOG after CASE + +typemap = { +	'dict_t *':				( "DICT",		""), +	'fd_t *':				( "GFID",		""), +	'dev_t':				( "DOUBLE",		"%ld (0x%lx)"), +	'gf_xattrop_flags_t':	( "INTEGER",	"%d (0x%x)"), +	'int32_t':				( "INTEGER",	"%d (0x%x)"), +	'mode_t':				( "INTEGER",	"%d (0x%x)"), +	'off_t':				( "DOUBLE",		"%ld (0x%lx)"), +	'size_t':				( "DOUBLE",		"%ld (0x%lx)"), +	'uint32_t':				( "INTEGER",	"%d (0x%x)"), +	'loc_t *':				( "LOC",		""), +	'const char *':			( "STRING",		""), +	'struct iovec *':		( "VECTOR",		""), +	'struct iatt *':		( "IATT",		""), +} + +def get_special_subs (args): +	code = "" +	for arg in args: +		if (arg[0] != 'fop-arg') or (len(arg) < 4): +			continue +		recon_type, recon_fmt = typemap[arg[2]] +		code += fragments[recon_type].replace("@ARGNAME@",arg[3])		\ +									 .replace("@FORMAT@",recon_fmt) +	return code + +def gen_functions (): +	code = "" +	for name, value in ops.iteritems(): +		if "journal" not in [ x[0] for x in value ]: +			continue +		fop_subs[name]["@FUNCTION_BODY@"] = get_special_subs(value) +		# Print the FOP fragment with @FUNCTION_BODY@ in the middle. +		code += generate(fragments["FOP"],name,fop_subs) +	return code + +def gen_cases (): +	code = "" +	for name, value in ops.iteritems(): +		if "journal" not in [ x[0] for x in value ]: +			continue +		# Add the CASE fragment for this fop. 
+		code += generate(fragments["CASE"],name,fop_subs) +	return code + +def load_fragments (path="recon-tmpl.c"): +	pragma_re = re.compile('pragma fragment (.*)') +	cur_symbol = None +	cur_value = "" +	result = {} +	for line in open(path,"r").readlines(): +		m = pragma_re.search(line) +		if m: +			if cur_symbol: +				result[cur_symbol] = cur_value +			cur_symbol = m.group(1) +			cur_value = "" +		else: +			cur_value += line +	if cur_symbol: +		result[cur_symbol] = cur_value +	return result + +if __name__ == "__main__": +	fragments = load_fragments(sys.argv[1]) +	print "/* BEGIN GENERATED CODE - DO NOT MODIFY */" +	print fragments["PROLOG"] +	print gen_functions() +	print fragments["EPILOG"].replace("@SWITCH_BODY@",gen_cases()) +	print "/* END GENERATED CODE */" diff --git a/xlators/experimental/fdl/src/gen_fdl.py b/xlators/experimental/fdl/src/gen_fdl.py new file mode 100755 index 00000000000..7f6b1aaaeaa --- /dev/null +++ b/xlators/experimental/fdl/src/gen_fdl.py @@ -0,0 +1,328 @@ +#!/usr/bin/python + +import os +import sys + +curdir = os.path.dirname (sys.argv[0]) +gendir = os.path.join (curdir, '../../../../libglusterfs/src') +sys.path.append (gendir) +from generator import ops, fop_subs, cbk_subs, generate + +# Generation occurs in three stages.  In this case, it actually makes more +# sense to discuss them in the *opposite* order of that in which they +# actually happen. +# +#   Stage 3 is to insert all of the generated code into a file, replacing the +#   "#pragma generate" that's already there.  The file can thus contain all +#   sorts of stuff that's not specific to one fop, either before or after the +#   generated code as appropriate. +# +#   Stage 2 is to generate all of the code *for a particular fop*, using a +#   string-valued template plus a table of substitution values.  Most of these +#   are built in to the generator itself.  However, we also add a couple that +#   are specific to this particular translator - LEN_CODE and SER_CODE.  
These +#   are per-fop functions to get the length or the contents (respectively) of +#   what we'll put in the log.  As with stage 3 allowing per-file boilerplate +#   before and after generated code, this allows per-fop boilerplate before and +#   after generated code. +# +#   Stage 1, therefore, is to create the LEN_CODE and SER_CODE substitutions for +#   each fop, and put them in the same table where e.g. NAME and SHORT_ARGS +#   already are.  We do this by looking at the fop-description table in the +#   generator module, then doing our own template substitution to plug each +#   specific argument name into another string-valued template. +# +# So, what does this leave us with in terms of variables and files? +# +#   For stage 1, we have a series of LEN_*_TEMPLATE and SERLZ_*_TEMPLATE +#   strings, which are used to generate the length and serialization code for +#   each argument type. +# +#   For stage 2, we have a bunch of *_TEMPLATE strings (no LEN_ or SERLZ_ +#   prefix), which are used (along with the output from stage 1) to generate +#   whole functions. +# +#   For stage 3, we have a whole separate file (fdl-tmpl.c) into which we insert +#   the collection of all functions defined in stage 2. + + +LEN_TEMPLATE = """ +void +fdl_len_@NAME@ (call_stub_t *stub) +{ +        uint32_t    meta_len    = sizeof (event_header_t); +		uint32_t	data_len	= 0; + +        /* TBD: global stuff, e.g. 
uid/gid */ +@LEN_CODE@ + +		/* TBD: pad extension length */ +		stub->jnl_meta_len = meta_len; +		stub->jnl_data_len = data_len; +} +""" + +SER_TEMPLATE = """ +void +fdl_serialize_@NAME@ (call_stub_t *stub, char *meta_buf, char *data_buf) +{ +		event_header_t	*eh; +		unsigned long	offset = 0; + +        /* TBD: word size/endianness */ +		eh = (event_header_t *)meta_buf; +		eh->event_type = NEW_REQUEST; +		eh->fop_type = GF_FOP_@UPNAME@; +		eh->request_id = 0;	// TBD +		meta_buf += sizeof (*eh); +@SER_CODE@ +		/* TBD: pad extension length */ +		eh->ext_length = offset; +} +""" + +CBK_TEMPLATE = """ +int32_t +fdl_@NAME@_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                int32_t op_ret, int32_t op_errno, +                @LONG_ARGS@) +{ +        STACK_UNWIND_STRICT (@NAME@, frame, op_ret, op_errno, +                             @SHORT_ARGS@); +        return 0; +} +""" + +CONTINUE_TEMPLATE = """ +int32_t +fdl_@NAME@_continue (call_frame_t *frame, xlator_t *this, +                     @LONG_ARGS@) +{ +        STACK_WIND (frame, fdl_@NAME@_cbk, +                    FIRST_CHILD(this), FIRST_CHILD(this)->fops->@NAME@, +                    @SHORT_ARGS@); +        return 0; +} + +""" + +FOP_TEMPLATE = """ +int32_t +fdl_@NAME@ (call_frame_t *frame, xlator_t *this, +            @LONG_ARGS@) +{ +        call_stub_t     *stub; + +        stub = fop_@NAME@_stub (frame, default_@NAME@, +                                @SHORT_ARGS@); +		fdl_len_@NAME@ (stub); +        stub->serialize = fdl_serialize_@NAME@; +        fdl_enqueue (this, stub); + +        return 0; +} +""" + +LEN_DICT_TEMPLATE = """ +		if (@SRC@) { +			data_pair_t *memb; +			for (memb = @SRC@->members_list; memb; memb = memb->next) { +				meta_len += sizeof(int); +				meta_len += strlen(memb->key) + 1; +				meta_len += sizeof(int); +				meta_len += memb->value->len; +			} +		} +		meta_len += sizeof(int); +""" + +LEN_GFID_TEMPLATE = """ +        meta_len += 16; +""" + +LEN_INTEGER_TEMPLATE = """ + 
       meta_len += sizeof (@SRC@); +""" + +# 16 for gfid, 16 for pargfid, 1 for flag, 0/1 for terminating NUL +LEN_LOC_TEMPLATE = """ +        if (@SRC@.name) { +                meta_len += (strlen (@SRC@.name) + 34); +        } else { +                meta_len += 33; +        } +""" + +LEN_STRING_TEMPLATE = """ +        if (@SRC@) { +                meta_len += (strlen (@SRC@) + 1); +        } else { +                meta_len += 1; +        } +""" + +LEN_VECTOR_TEMPLATE = """ +        meta_len += sizeof(size_t); +        data_len += iov_length (@VEC@, @CNT@); +""" + +LEN_IATT_TEMPLATE = """ +		meta_len += sizeof(@SRC@.ia_prot); +		meta_len += sizeof(@SRC@.ia_uid); +		meta_len += sizeof(@SRC@.ia_gid); +		meta_len += sizeof(@SRC@.ia_atime); +		meta_len += sizeof(@SRC@.ia_atime_nsec); +		meta_len += sizeof(@SRC@.ia_mtime); +		meta_len += sizeof(@SRC@.ia_mtime_nsec); +""" + +SERLZ_DICT_TEMPLATE = """ +        if (@SRC@) { +			data_pair_t *memb; +			for (memb = @SRC@->members_list; memb; memb = memb->next) { +				*((int *)(meta_buf+offset)) = strlen(memb->key) + 1; +				offset += sizeof(int); +				strcpy (meta_buf+offset, memb->key); +				offset += strlen(memb->key) + 1; +				*((int *)(meta_buf+offset)) = memb->value->len; +				offset += sizeof(int); +				memcpy (meta_buf+offset, memb->value->data, memb->value->len); +				offset += memb->value->len; +			} +        } +		*((int *)(meta_buf+offset)) = 0; +		offset += sizeof(int); +""" + +SERLZ_GFID_TEMPLATE = """ +        memcpy (meta_buf+offset, @SRC@->inode->gfid, 16); +        offset += 16; +""" + +SERLZ_INTEGER_TEMPLATE = """ +        memcpy (meta_buf+offset, &@SRC@, sizeof(@SRC@)); +        offset += sizeof(@SRC@); +""" + +SERLZ_LOC_TEMPLATE = """ +        memcpy (meta_buf+offset, @SRC@.gfid, 16); +        offset += 16; +        memcpy (meta_buf+offset, @SRC@.pargfid, 16); +        offset += 16; +        if (@SRC@.name) { +                *(meta_buf+offset) = 1; +				++offset; +                strcpy 
(meta_buf+offset, @SRC@.name); +                offset += (strlen (@SRC@.name) + 1); +        } else { +                *(meta_buf+offset) = 0; +				++offset; +        } +""" + +SERLZ_STRING_TEMPLATE = """ +        if (@SRC@) { +                *(meta_buf+offset) = 1; +				++offset; +                strcpy (meta_buf+offset, @SRC@); +                offset += strlen(@SRC@); +        } else { +                *(meta_buf+offset) = 0; +				++offset; +        } +""" + +SERLZ_VECTOR_TEMPLATE = """ +        *((size_t *)(meta_buf+offset)) = iov_length (@VEC@, @CNT@); +        offset += sizeof(size_t); +        int32_t i; +        for (i = 0; i < @CNT@; ++i) { +                memcpy (data_buf, @VEC@[i].iov_base, @VEC@[i].iov_len); +                data_buf += @VEC@[i].iov_len; +        } +""" + +# We don't need to save all of the fields - only those affected by chown, +# chgrp, chmod, and utime. +SERLZ_IATT_TEMPLATE = """ +		*((ia_prot_t *)(meta_buf+offset)) = @SRC@.ia_prot; +		offset += sizeof(@SRC@.ia_prot); +		*((uint32_t *)(meta_buf+offset)) = @SRC@.ia_uid; +		offset += sizeof(@SRC@.ia_uid); +		*((uint32_t *)(meta_buf+offset)) = @SRC@.ia_gid; +		offset += sizeof(@SRC@.ia_gid); +		*((uint32_t *)(meta_buf+offset)) = @SRC@.ia_atime; +		offset += sizeof(@SRC@.ia_atime); +		*((uint32_t *)(meta_buf+offset)) = @SRC@.ia_atime_nsec; +		offset += sizeof(@SRC@.ia_atime_nsec); +		*((uint32_t *)(meta_buf+offset)) = @SRC@.ia_mtime; +		offset += sizeof(@SRC@.ia_mtime); +		*((uint32_t *)(meta_buf+offset)) = @SRC@.ia_mtime_nsec; +		offset += sizeof(@SRC@.ia_mtime_nsec); +""" + +typemap = { +	'dict_t *':				( LEN_DICT_TEMPLATE,	SERLZ_DICT_TEMPLATE), +	'fd_t *':				( LEN_GFID_TEMPLATE,	SERLZ_GFID_TEMPLATE), +	'dev_t':				( LEN_INTEGER_TEMPLATE,	SERLZ_INTEGER_TEMPLATE), +	'gf_xattrop_flags_t':	( LEN_INTEGER_TEMPLATE,	SERLZ_INTEGER_TEMPLATE), +	'int32_t':				( LEN_INTEGER_TEMPLATE,	SERLZ_INTEGER_TEMPLATE), +	'mode_t':				( LEN_INTEGER_TEMPLATE, SERLZ_INTEGER_TEMPLATE), +	'off_t':				( 
LEN_INTEGER_TEMPLATE,	SERLZ_INTEGER_TEMPLATE), +	'size_t':				( LEN_INTEGER_TEMPLATE,	SERLZ_INTEGER_TEMPLATE), +	'uint32_t':				( LEN_INTEGER_TEMPLATE,	SERLZ_INTEGER_TEMPLATE), +	'loc_t *':				( LEN_LOC_TEMPLATE,		SERLZ_LOC_TEMPLATE), +	'const char *':			( LEN_STRING_TEMPLATE,	SERLZ_STRING_TEMPLATE), +	'struct iatt *':		( LEN_IATT_TEMPLATE,	SERLZ_IATT_TEMPLATE), +} + +def get_special_subs (args): +	len_code = "" +	ser_code = "" +	for arg in args: +		if (arg[0] != 'fop-arg') or (len(arg) < 4): +			continue +		# Let this throw an exception if we get an unknown field name.  The +		# broken build will remind whoever messed with the stub code that a +		# corresponding update is needed here. +		if arg[3] == "vector": +			# Make it as obvious as possible that this is a special case. +			len_code += LEN_VECTOR_TEMPLATE \ +				.replace("@VEC@","stub->args.vector") \ +				.replace("@CNT@","stub->args.count") +			ser_code += SERLZ_VECTOR_TEMPLATE \ +				.replace("@VEC@","stub->args.vector") \ +				.replace("@CNT@","stub->args.count") +		else: +			len_tmpl, ser_tmpl = typemap[arg[2]] +			src = "stub->args.%s" % arg[3] +			len_code += len_tmpl.replace("@SRC@",src) +			ser_code += ser_tmpl.replace("@SRC@",src) +	return len_code, ser_code + +def gen_fdl (): +	entrypoints = [] +	for name, value in ops.iteritems(): +		if "journal" not in [ x[0] for x in value ]: +			continue +		len_code, ser_code = get_special_subs(value) +		fop_subs[name]["@LEN_CODE@"] = len_code[:-1] +		fop_subs[name]["@SER_CODE@"] = ser_code[:-1] +		print generate(LEN_TEMPLATE,name,fop_subs) +		print generate(SER_TEMPLATE,name,fop_subs) +		print generate(CBK_TEMPLATE,name,cbk_subs) +		print generate(CONTINUE_TEMPLATE,name,fop_subs) +		print generate(FOP_TEMPLATE,name,fop_subs) +		entrypoints.append(name) +	print "struct xlator_fops fops = {" +	for ep in entrypoints: +		print "\t.%s = fdl_%s," % (ep, ep) +	print "};" + +for l in open(sys.argv[1],'r').readlines(): +	if l.find('#pragma generate') != -1: +		print 
"/* BEGIN GENERATED CODE - DO NOT MODIFY */" +		gen_fdl() +		print "/* END GENERATED CODE */" +	else: +		print l[:-1] diff --git a/xlators/experimental/fdl/src/gen_recon.py b/xlators/experimental/fdl/src/gen_recon.py new file mode 100755 index 00000000000..26318f92d88 --- /dev/null +++ b/xlators/experimental/fdl/src/gen_recon.py @@ -0,0 +1,191 @@ +#!/usr/bin/python + +import os +import re +import string +import sys + +curdir = os.path.dirname (sys.argv[0]) +gendir = os.path.join (curdir, '../../../../libglusterfs/src') +sys.path.append (gendir) +from generator import ops, fop_subs, cbk_subs, generate + +# See the big header comment at the start of gen_fdl.py to see how the stages +# fit together.  The big difference here is that *all* of the C code is in the +# template file as labelled fragments, instead of as Python strings.  That +# makes it much easier to edit in one place, with proper syntax highlighting +# and indentation. +# +#   Stage 1 uses type-specific fragments to generate FUNCTION_BODY, instead of +#   LEN_*_TEMPLATE and SERLZ_*_TEMPLATE to generate LEN_CODE and SER_CODE. +# +#   Stage 2 uses the FOP and CASE fragments instead of RECON_TEMPLATE and +#   FOP_TEMPLATE.  The expanded FOP code (including FUNCTION_BODY substitution +#   in the middle of each function) is emitted immediately; the expanded CASE +#   code is saved for the next stage. +# +#   Stage 3 uses the PROLOG and EPILOG fragments, with the expanded CASE code +#   in the middle of EPILOG, to generate the whole output file. +# +# Another way of looking at it is to consider how the fragments appear in +# the final output: +# +#   PROLOG +#   FOP (expanded for CREATE) +#       FOP before FUNCTION_BODY +#       LOC, INTEGER, GFID, etc. (one per arg, by type) +#       FOP after FUNCTION_BODY +#   FOP (expanded for WRITEV) +#       FOP before FUNCTION_BODY +#       GFID, VECTOR, etc. 
(one per arg, by type) +#       FOP after FUNCTION_BODY +#   (more FOPs) +#   EPILOG +#       EPILOG before CASE +#       CASE statements (one per fop) +#       EPILOG after CASE + +typemap = { +	'dict_t *':				"DICT", +	'fd_t *':				"FD", +	'dev_t':				"DOUBLE", +	'gf_xattrop_flags_t':	"INTEGER", +	'int32_t':				"INTEGER", +	'mode_t':				"INTEGER", +	'off_t':				"DOUBLE", +	'size_t':				"DOUBLE", +	'uint32_t':				"INTEGER", +	'loc_t *':				"LOC", +	'const char *':			"STRING", +	'struct iovec *':		"VECTOR", +	'struct iatt *':		"IATT", +	'struct iobref *':		"IOBREF", +} + +def get_special_subs (name, args, fop_type): +	code = "" +	cleanups = "" +	links = "" +	s_args = [] +	for arg in args: +		if arg[0] == 'extra': +			code += "\t%s %s;\n\n" % (arg[2], arg[1]) +			s_args.append(arg[3]) +			continue +		if arg[0] == 'link': +			links += fragments["LINK"].replace("@INODE_ARG@",arg[1])	\ +									  .replace("@IATT_ARG@",arg[2]) +			continue +		if arg[0] != 'fop-arg': +			continue +		if (name, arg[1]) == ('writev', 'count'): +			# Special case: just skip this.  We can't mark it as 'nosync' +			# because of the way the translator and dumper generators look for +			# that after 'stub-name' which we don't define.  Instead of adding a +			# bunch of generic infrastructure for this one case, just pound it +			# here. +			continue +		recon_type = typemap[arg[2]] +		# print "/* %s.%s => %s (%s)*/" % (name, arg[1], recon_type, fop_type) +		if (name == "create") and (arg[1] == "fd"): +			# Special case: fd for create is new, not looked up. +			# print "/* change to NEW_FD */" +			recon_type = "NEW_FD" +		elif (recon_type == "LOC") and (fop_type == "entry-op"): +			# Need to treat this differently for inode vs. entry ops. +			# Special case: link source is treated like inode-op. 
+			if (name != "link") or (arg[1] != "oldloc"): +				# print "/* change to PARENT_LOC */" +				recon_type = "PARENT_LOC" +		code += fragments[recon_type].replace("@ARGNAME@",arg[1])		\ +									 .replace("@ARGTYPE@",arg[2]) +		cleanup_key = recon_type + "_CLEANUP" +		if fragments.has_key(cleanup_key): +			cleanups += fragments[cleanup_key].replace("@ARGNAME@",arg[1]) +		if 'nosync' in arg[4:]: +			code += "\t(void)%s;\n" % arg[1]; +			continue +		if arg[2] in ("loc_t *", "struct iatt *"): +			# These are passed as pointers to the syncop, but they're actual +			# structures in the generated code. +			s_args.append("&"+arg[1]); +		else: +			s_args.append(arg[1]) +	# We have to handle a couple of special cases here, because some n00b +	# defined the syncops with a different argument order than the fops they're +	# based on. +	if name == 'writev': +		# Swap 'flags' and 'iobref'.  Also, we need to add the iov count, which +		# is not stored in or read from the journal.  There are other ways to +		# do that, but this is the only place we need anything similar and we +		# already have to treat it as a special case so this is simplest. +		s_args_str = 'fd, &vector, 1, off, iobref, flags, xdata' +	elif name == 'symlink': +		# Swap 'linkpath' and 'loc'. +		s_args_str = '&loc, linkpath, &iatt, xdata' +	else: +		s_args_str = string.join (s_args, ", ") +	return code, links, s_args_str, cleanups + +# TBD: probably need to generate type-specific cleanup code as well - e.g. +# fd_unref for an fd_t, loc_wipe for a loc_t, and so on.  All of these +# generated CLEANUP fragments will go at the end of the function, with goto +# labels.  Meanwhile, the error-checking part of each type-specific fragment +# (e.g. LOC or FD) will need to update the indirect label that we jump to when +# an error is detected.  This will probably get messy. 
+def gen_functions (): +	code = "" +	for name, value in ops.iteritems(): +		fop_type = [ x[1] for x in value if x[0] == "journal" ] +		if not fop_type: +			continue +		body, links, syncop_args, cleanups = get_special_subs (name, value, +															   fop_type[0]) +		fop_subs[name]["@FUNCTION_BODY@"] = body +		fop_subs[name]["@LINKS@"] = links +		fop_subs[name]["@SYNCOP_ARGS@"] = syncop_args +		fop_subs[name]["@CLEANUPS@"] = cleanups +		if name == "writev": +			# Take advantage of the fact that, *during reconciliation*, the +			# vector is always a single element.  In normal I/O it's not. +			fop_subs[name]["@SUCCESS_VALUE@"] = "vector.iov_len" +		else: +			fop_subs[name]["@SUCCESS_VALUE@"] = "GFAPI_SUCCESS" +		# Print the FOP fragment with @FUNCTION_BODY@ in the middle. +		code += generate(fragments["FOP"],name,fop_subs) +	return code + +def gen_cases (): +	code = "" +	for name, value in ops.iteritems(): +		if "journal" not in [ x[0] for x in value ]: +			continue +		# Add the CASE fragment for this fop. 
+		code += generate(fragments["CASE"],name,fop_subs) +	return code + +def load_fragments (path="recon-tmpl.c"): +	pragma_re = re.compile('pragma fragment (.*)') +	cur_symbol = None +	cur_value = "" +	result = {} +	for line in open(path,"r").readlines(): +		m = pragma_re.search(line) +		if m: +			if cur_symbol: +				result[cur_symbol] = cur_value +			cur_symbol = m.group(1) +			cur_value = "" +		else: +			cur_value += line +	if cur_symbol: +		result[cur_symbol] = cur_value +	return result + +if __name__ == "__main__": +	fragments = load_fragments(sys.argv[1]) +	print "/* BEGIN GENERATED CODE - DO NOT MODIFY */" +	print fragments["PROLOG"] +	print gen_functions() +	print fragments["EPILOG"].replace("@SWITCH_BODY@",gen_cases()) +	print "/* END GENERATED CODE */" diff --git a/xlators/experimental/fdl/src/jnl-types.h b/xlators/experimental/fdl/src/jnl-types.h new file mode 100644 index 00000000000..8cb39d01a25 --- /dev/null +++ b/xlators/experimental/fdl/src/jnl-types.h @@ -0,0 +1,14 @@ +#define NEW_REQUEST     (uint8_t)'N' + +typedef struct { +        uint8_t         event_type;     /* e.g. NEW_REQUEST */ +        uint8_t         fop_type;       /* e.g. GF_FOP_SETATTR */ +        uint16_t        request_id; +        uint32_t        ext_length; +} event_header_t; + +enum { +        FDL_IPC_BASE = 0xfeedbee5,       /* ... 
and they make honey */ +        FDL_IPC_CHANGE_TERM, +        FDL_IPC_GET_TERMS, +}; diff --git a/xlators/experimental/fdl/src/logdump.c b/xlators/experimental/fdl/src/logdump.c new file mode 100644 index 00000000000..7c979c32a04 --- /dev/null +++ b/xlators/experimental/fdl/src/logdump.c @@ -0,0 +1,50 @@ +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/mman.h> + +extern int fdl_dump (char **, char **); + +int +main (int argc, char **argv) +{ +        int     meta_fd         = (-1); +        char    *meta_buf       = NULL; +        int     data_fd         = (-1); +        char    *data_buf       = NULL; + +        meta_fd = open (argv[1], O_RDONLY); +        if (meta_fd < 0) { +                perror ("open"); +                return EXIT_FAILURE; +        } + +        /* TBD: get proper length */ +        meta_buf = mmap (NULL, 1048576, PROT_READ, MAP_PRIVATE, meta_fd, 0); +        if (meta_buf == MAP_FAILED) { +                perror ("mmap"); +                return EXIT_FAILURE; +        } + +        data_fd = open (argv[2], O_RDONLY); +        if (data_fd < 0) { +                perror ("open"); +                return EXIT_FAILURE; +        } + +        /* TBD: get proper length */ +        data_buf = mmap (NULL, 1048576, PROT_READ, MAP_PRIVATE, data_fd, 0); +        if (data_buf == MAP_FAILED) { +                perror ("mmap"); +                return EXIT_FAILURE; +        } + +        for (;;) { +                if (!fdl_dump(&meta_buf,&data_buf)) { +                        break; +                } +        } + +        return EXIT_SUCCESS; +} diff --git a/xlators/experimental/fdl/src/recon-tmpl.c b/xlators/experimental/fdl/src/recon-tmpl.c new file mode 100644 index 00000000000..523bda39418 --- /dev/null +++ b/xlators/experimental/fdl/src/recon-tmpl.c @@ -0,0 +1,305 @@ +#pragma fragment PROLOG +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "fd.h" 
+#include "iatt.h" +#include "syncop.h" +#include "xlator.h" +#include "glfs-internal.h" + +#include "jnl-types.h" + +#define GFAPI_SUCCESS 0 + +inode_t * +recon_get_inode (glfs_t *fs, uuid_t gfid) +{ +        inode_t         *inode; +        loc_t           loc     = {NULL,}; +        struct iatt     iatt; +        int             ret; +        inode_t         *newinode; + +        inode = inode_find (fs->active_subvol->itable, gfid); +        if (inode) { +                printf ("=== FOUND %s IN TABLE\n", uuid_utoa(gfid)); +                return inode; +        } + +        loc.inode = inode_new (fs->active_subvol->itable); +        if (!loc.inode) { +                return NULL; +        } +        gf_uuid_copy (loc.inode->gfid, gfid); +        gf_uuid_copy (loc.gfid, gfid); + +        printf ("=== DOING LOOKUP FOR %s\n", uuid_utoa(gfid)); + +        ret = syncop_lookup (fs->active_subvol, &loc, &iatt, +                             NULL, NULL, NULL); +        if (ret != GFAPI_SUCCESS) { +                fprintf (stderr, "syncop_lookup failed (%d)\n", ret); +                return NULL; +        } + +        newinode = inode_link (loc.inode, NULL, NULL, &iatt); +        if (newinode) { +                inode_lookup (newinode); +        } + +        return newinode; +} + +#pragma fragment DICT +        dict_t  *@ARGNAME@; + +        @ARGNAME@ = dict_new(); +        if (!@ARGNAME@) { +                goto *err_label; +        } +        err_label = &&cleanup_@ARGNAME@; + +        { +                int     key_len, data_len; +                char    *key_ptr; +                int     garbage; +                for (;;) { +                        key_len = *((int *)new_meta); +                        new_meta += sizeof(int); +                        if (!key_len) { +                                break; +                        } +                        key_ptr = new_meta; +                        new_meta += key_len; +                        data_len = *((int 
*)new_meta); +                        new_meta += sizeof(int); +                        garbage = dict_set_static_bin (@ARGNAME@, key_ptr, +                                                       new_meta, data_len); +                        /* TBD: check error from dict_set_static_bin */ +                        (void)garbage; +                        new_meta += data_len; +                } +        } + +#pragma fragment DICT_CLEANUP +cleanup_@ARGNAME@: +        dict_unref (@ARGNAME@); + +#pragma fragment DOUBLE +        @ARGTYPE@       @ARGNAME@       = *((@ARGTYPE@ *)new_meta); +        new_meta += sizeof(uint64_t); + +#pragma fragment FD +        inode_t *@ARGNAME@_ino; +        fd_t    *@ARGNAME@; + +        @ARGNAME@_ino = recon_get_inode (fs, *((uuid_t *)new_meta)); +        new_meta += 16; +        if (!@ARGNAME@_ino) { +                goto *err_label; +        } +        err_label = &&cleanup_@ARGNAME@_ino; + +        @ARGNAME@ = fd_anonymous (@ARGNAME@_ino); +        if (!@ARGNAME@) { +                goto *err_label; +        } +        err_label = &&cleanup_@ARGNAME@; + +#pragma fragment FD_CLEANUP +cleanup_@ARGNAME@: +        fd_unref (@ARGNAME@); +cleanup_@ARGNAME@_ino: +        inode_unref (@ARGNAME@_ino); + +#pragma fragment NEW_FD +        /* +         * This pseudo-type is only used for create, and in that case we know +         * we'll be using loc.inode, so it's not worth generalizing to take an +         * extra argument. 
+         */ +        fd_t    *@ARGNAME@      = fd_anonymous (loc.inode); + +        if (!@ARGNAME@) { +                goto *err_label; +        } +        err_label = &&cleanup_@ARGNAME@; +        new_meta += 16; + +#pragma fragment NEW_FD_CLEANUP +cleanup_@ARGNAME@: +        fd_unref (@ARGNAME@); + +#pragma fragment INTEGER +        @ARGTYPE@       @ARGNAME@       = *((@ARGTYPE@ *)new_meta); + +        new_meta += sizeof(@ARGTYPE@); + +#pragma fragment LOC +        loc_t           @ARGNAME@       = { NULL, }; + +        @ARGNAME@.inode = recon_get_inode (fs, *((uuid_t *)new_meta)); +        if (!@ARGNAME@.inode) { +                goto *err_label; +        } +        err_label = &&cleanup_@ARGNAME@; +        gf_uuid_copy (@ARGNAME@.gfid, @ARGNAME@.inode->gfid); +        new_meta += 16; +        new_meta += 16; /* skip over pargfid */ +        if (*(new_meta++)) { +                @ARGNAME@.name = new_meta; +                new_meta += strlen(new_meta) + 1; +        } + +#pragma fragment LOC_CLEANUP +cleanup_@ARGNAME@: +        loc_wipe (&@ARGNAME@); + +#pragma fragment PARENT_LOC +        loc_t           @ARGNAME@       = { NULL, }; + +        new_meta += 16; /* skip over gfid */ +        @ARGNAME@.parent = recon_get_inode (fs, *((uuid_t *)new_meta)); +        if (!@ARGNAME@.parent) { +                goto *err_label; +        } +        err_label = &&cleanup_@ARGNAME@; +        gf_uuid_copy (@ARGNAME@.pargfid, @ARGNAME@.parent->gfid); +        new_meta += 16; +        if (!*(new_meta++)) { +                goto *err_label; +        } +        @ARGNAME@.name = new_meta; +        new_meta += strlen(new_meta) + 1; + +        @ARGNAME@.inode = inode_new (fs->active_subvol->itable); +        if (!@ARGNAME@.inode) { +                goto *err_label; +        } + +#pragma fragment PARENT_LOC_CLEANUP +cleanup_@ARGNAME@: +        loc_wipe (&@ARGNAME@); + +#pragma fragment STRING +        char    *@ARGNAME@; +        if (*(new_meta++)) { +                @ARGNAME@ = new_meta; + 
               new_meta += (strlen(new_meta) + 1); +        } +        else { +                goto *err_label; +        } + +#pragma fragment VECTOR +        struct iovec    @ARGNAME@; + +        @ARGNAME@.iov_len = *((size_t *)new_meta); +        new_meta += sizeof(@ARGNAME@.iov_len); +        @ARGNAME@.iov_base = new_data; +        new_data += @ARGNAME@.iov_len; + +#pragma fragment IATT +        struct iatt     @ARGNAME@; +        { +                @ARGNAME@.ia_prot = *((ia_prot_t *)new_meta); +                new_meta += sizeof(ia_prot_t); +                uint32_t *myints = (uint32_t *)new_meta; +                @ARGNAME@.ia_uid = myints[0]; +                @ARGNAME@.ia_gid = myints[1]; +                @ARGNAME@.ia_atime = myints[2]; +                @ARGNAME@.ia_atime_nsec = myints[3]; +                @ARGNAME@.ia_mtime = myints[4]; +                @ARGNAME@.ia_mtime_nsec = myints[5]; +                new_meta += sizeof(*myints) * 6; +        } + +#pragma fragment IOBREF +        struct iobref   *@ARGNAME@; + +        @ARGNAME@ = iobref_new(); +        if (!@ARGNAME@) { +                goto *err_label; +        } +        err_label = &&cleanup_@ARGNAME@; + +#pragma fragment IOBREF_CLEANUP +cleanup_@ARGNAME@: +        iobref_unref (@ARGNAME@); + +#pragma fragment LINK +        /* TBD: check error */ +        inode_t *new_inode = inode_link (@INODE_ARG@, NULL, NULL, @IATT_ARG@); +        if (new_inode) { +                inode_lookup (new_inode); +        } + +#pragma fragment FOP +int +fdl_replay_@NAME@ (glfs_t *fs, char **old_meta, char **old_data) +{ +        char    *new_meta	= *old_meta; +        char	*new_data	= *old_data; +        int     ret; +        int     status          = 0xbad; +        void    *err_label      = &&done; + +@FUNCTION_BODY@ + +        ret = syncop_@NAME@ (fs->active_subvol, @SYNCOP_ARGS@, NULL); +        if (ret != @SUCCESS_VALUE@) { +                fprintf (stderr, "syncop_@NAME@ returned %d", ret); +                goto 
*err_label; +        } + +@LINKS@ + +        status = 0; + +@CLEANUPS@ + +done: +        *old_meta = new_meta; +        *old_data = new_data; +        return status; +} + +#pragma fragment CASE +        case GF_FOP_@UPNAME@: +                printf ("=== GF_FOP_@UPNAME@\n"); +                if (fdl_replay_@NAME@ (fs, &new_meta, &new_data) != 0) { +                        goto done; +                } +                recognized = 1; +                break; + +#pragma fragment EPILOG +int +recon_execute (glfs_t *fs, char **old_meta, char **old_data) +{ +        char            *new_meta       = *old_meta; +        char            *new_data       = *old_data; +        int             recognized      = 0; +        event_header_t  *eh; + +        eh = (event_header_t *)new_meta; +        new_meta += sizeof (*eh); + +        /* TBD: check event_type instead of assuming NEW_REQUEST */ + +        switch (eh->fop_type) { +@SWITCH_BODY@ + +        default: +                printf ("unknown fop %u\n", eh->fop_type); +        } + +done: +        *old_meta = new_meta; +        *old_data = new_data; +        return recognized; +} diff --git a/xlators/experimental/fdl/src/recon.c b/xlators/experimental/fdl/src/recon.c new file mode 100644 index 00000000000..14168a011e0 --- /dev/null +++ b/xlators/experimental/fdl/src/recon.c @@ -0,0 +1,89 @@ +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/mman.h> + +#include "glusterfs.h" +#include "fd.h" +#include "syncop.h" +#include "glfs-internal.h" + +#define GFAPI_SUCCESS 0 + +extern int recon_execute (glfs_t *, char **, char **); + +int +main (int argc, char **argv) +{ +        glfs_t  *fs; +        int     ret; +        int     meta_fd         = (-1); +        char    *meta_buf       = NULL; +        int     data_fd         = (-1); +        char    *data_buf       = NULL; + +        fs = glfs_new ("whocares"); +        if (!fs) { +                fprintf (stderr, "glfs_new failed\n"); +   
             return EXIT_FAILURE; +        } + +        if (getenv("RECON_DEBUG")) { +                ret = glfs_set_logging (fs, "/dev/stderr", 7); +        } +        else { +                ret = glfs_set_logging (fs, "/dev/null", 0); +        } + +        if (ret != GFAPI_SUCCESS) { +                fprintf (stderr, "glfs_set_logging failed (%d)\n", errno); +                return EXIT_FAILURE; +        } + +        ret = glfs_set_volfile (fs, argv[1]); +        if (ret != GFAPI_SUCCESS) { +                fprintf (stderr, "glfs_set_volfile failed (%d)\n", errno); +                return EXIT_FAILURE; +        } + +        ret = glfs_init (fs); +        if (ret != GFAPI_SUCCESS) { +                fprintf (stderr, "glfs_init failed (%d)\n", errno); +                return EXIT_FAILURE; +        } + +        meta_fd = open (argv[2], O_RDONLY); +        if (meta_fd < 0) { +                perror ("open"); +                return EXIT_FAILURE; +        } + +        /* TBD: get proper length */ +        meta_buf = mmap (NULL, 1048576, PROT_READ, MAP_PRIVATE, meta_fd, 0); +        if (meta_buf == MAP_FAILED) { +                perror ("mmap"); +                return EXIT_FAILURE; +        } + +        data_fd = open (argv[3], O_RDONLY); +        if (data_fd < 0) { +                perror ("open"); +                return EXIT_FAILURE; +        } + +        /* TBD: get proper length */ +        data_buf = mmap (NULL, 1048576, PROT_READ, MAP_PRIVATE, data_fd, 0); +        if (data_buf == MAP_FAILED) { +                perror ("mmap"); +                return EXIT_FAILURE; +        } + +        for (;;) { +                if (!recon_execute(fs,&meta_buf,&data_buf)) { +                        break; +                } +        } + +        return EXIT_SUCCESS; +} diff --git a/xlators/features/Makefile.am b/xlators/features/Makefile.am index 7e5783f4f30..649d9d8e9fa 100644 --- a/xlators/features/Makefile.am +++ b/xlators/features/Makefile.am @@ -1,5 +1,6 @@ -SUBDIRS = 
locks quota read-only mac-compat quiesce marker index barrier arbiter\ -          protect compress changelog changetimerecorder ganesha gfid-access $(GLUPY_SUBDIR) qemu-block \ -          upcall snapview-client snapview-server trash shard bit-rot #path-converter # filter +SUBDIRS = locks quota read-only mac-compat quiesce marker index barrier \ +	  arbiter protect compress changelog changetimerecorder ganesha \ +	  gfid-access $(GLUPY_SUBDIR) qemu-block upcall snapview-client \ +	  snapview-server trash shard bit-rot  CLEANFILES = diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c index 2c52cf72a3f..3df4b3556cf 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volgen.c +++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c @@ -1783,6 +1783,30 @@ out:          return ret;  } +/* Add this before (above) io-threads because it's not thread-safe yet. */ +static int +brick_graph_add_fdl (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, +                     dict_t *set_dict, glusterd_brickinfo_t *brickinfo) +{ + +        xlator_t        *xl = NULL; +        int             ret = -1; + +        if (!graph || !volinfo || !set_dict) +                goto out; + +        if (dict_get_str_boolean (set_dict, "features.fdl", 0)) { +                xl = volgen_graph_add (graph, "experimental/fdl", +                                       volinfo->volname); +                if (!xl) +                        goto out; +        } +        ret = 0; + +out: +        return ret; +} +  static int  brick_graph_add_iot (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,                        dict_t *set_dict, glusterd_brickinfo_t *brickinfo) @@ -2359,6 +2383,7 @@ static volgen_brick_xlator_t server_graph_table[] = {          {brick_graph_add_index, "index"},          {brick_graph_add_barrier, NULL},          {brick_graph_add_marker, "marker"}, +        {brick_graph_add_fdl, "fdl"},          {brick_graph_add_iot, "io-threads"},          
{brick_graph_add_upcall, "upcall"},          {brick_graph_add_pump, NULL}, diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index 1463ef72c71..c0059d83cfe 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -2711,6 +2711,15 @@ struct volopt_map_entry glusterd_volopt_map[] = {            .op_version  = GD_OP_VERSION_4_0_0,            .description = "percent of rep_count-1 bricks that must be up"          }, +        /* Full Data Logging */ +        { +          .key         = "features.fdl", +          .voltype     = "features/fdl", +          .option      = "!fdl", +          .op_version  = GD_OP_VERSION_4_0_0, +          .flags       = OPT_FLAG_XLATOR_OPT, +          .type        = NO_DOC, +        },          { .key         = NULL          }  };  | 
