From c458433041aafb48ae6d6e5fcf3e1e737dc3fda3 Mon Sep 17 00:00:00 2001
From: Jeff Darcy <jdarcy@redhat.com>
Date: Mon, 8 Feb 2016 13:30:49 -0500
Subject: experimental: add fdl (Full Data Logging) translator

NSR needs logging that is different than our existing changelog in
several ways:

 * Full data, not just metadata

 * Pre-op, not post-op

 * High performance

 * Supports the concept of time-bounded "terms"

Others (for example EC) might need the same thing.  This patch adds such
a translator.  It also adds code to dump the resulting journals, and to replay
them using syncops, plus (very rudimentary) tests for all of the above.

Change-Id: I29680a1b4e0a9e7d5a8497fef302c46434b86636
Signed-off-by: Jeff Darcy <jdarcy@redhat.com>
Reviewed-on: http://review.gluster.org/12450
Smoke: Gluster Build System <jenkins@build.gluster.com>
CentOS-regression: Gluster Build System <jenkins@build.gluster.com>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
---
 xlators/experimental/fdl/Makefile.am       |   3 +
 xlators/experimental/fdl/src/Makefile.am   |  42 +++
 xlators/experimental/fdl/src/dump-tmpl.c   | 156 +++++++++
 xlators/experimental/fdl/src/fdl-tmpl.c    | 506 +++++++++++++++++++++++++++++
 xlators/experimental/fdl/src/gen_dumper.py | 116 +++++++
 xlators/experimental/fdl/src/gen_fdl.py    | 328 +++++++++++++++++++
 xlators/experimental/fdl/src/gen_recon.py  | 191 +++++++++++
 xlators/experimental/fdl/src/jnl-types.h   |  14 +
 xlators/experimental/fdl/src/logdump.c     |  50 +++
 xlators/experimental/fdl/src/recon-tmpl.c  | 305 +++++++++++++++++
 xlators/experimental/fdl/src/recon.c       |  89 +++++
 11 files changed, 1800 insertions(+)
 create mode 100644 xlators/experimental/fdl/Makefile.am
 create mode 100644 xlators/experimental/fdl/src/Makefile.am
 create mode 100644 xlators/experimental/fdl/src/dump-tmpl.c
 create mode 100644 xlators/experimental/fdl/src/fdl-tmpl.c
 create mode 100755 xlators/experimental/fdl/src/gen_dumper.py
 create mode 100755 xlators/experimental/fdl/src/gen_fdl.py
 create mode 100755 xlators/experimental/fdl/src/gen_recon.py
 create mode 100644 xlators/experimental/fdl/src/jnl-types.h
 create mode 100644 xlators/experimental/fdl/src/logdump.c
 create mode 100644 xlators/experimental/fdl/src/recon-tmpl.c
 create mode 100644 xlators/experimental/fdl/src/recon.c

(limited to 'xlators/experimental/fdl')

diff --git a/xlators/experimental/fdl/Makefile.am b/xlators/experimental/fdl/Makefile.am
new file mode 100644
index 00000000000..a985f42a877
--- /dev/null
+++ b/xlators/experimental/fdl/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/experimental/fdl/src/Makefile.am b/xlators/experimental/fdl/src/Makefile.am
new file mode 100644
index 00000000000..a05fc797b0a
--- /dev/null
+++ b/xlators/experimental/fdl/src/Makefile.am
@@ -0,0 +1,42 @@
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/experimental
+xlator_LTLIBRARIES = fdl.la
+
+noinst_HEADERS = jnl-types.h
+
+nodist_fdl_la_SOURCES = fdl.c
+fdl_la_LDFLAGS = -module -avoid-version
+fdl_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+sbin_PROGRAMS = gf_logdump gf_recon
+gf_logdump_SOURCES = logdump.c
+nodist_gf_logdump_SOURCES = libfdl.c
+gf_logdump_LDADD = $(top_builddir)/libglusterfs/src/libglusterfs.la\
+                   $(top_builddir)/api/src/libgfapi.la
+
+# Eventually recon(ciliation) code will move elsewhere, but for now it's
+# easier to have it next to the similar logdump code.
+gf_recon_SOURCES = recon.c
+nodist_gf_recon_SOURCES = librecon.c
+gf_recon_LDADD = $(top_builddir)/libglusterfs/src/libglusterfs.la\
+                   $(top_builddir)/api/src/libgfapi.la
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	      -I$(top_srcdir)/api/src -fPIC \
+	      -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -D$(GF_HOST_OS) \
+	      -DDATADIR=\"$(localstatedir)\"
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+noinst_PYTHON = gen_fdl.py gen_dumper.py gen_recon.py
+EXTRA_DIST = fdl-tmpl.c dump-tmpl.c recon-tmpl.c
+
+# All three generated sources (fdl.c, libfdl.c, librecon.c) must be removed
+# by "make clean"; none of them is distributed.
+CLEANFILES = $(nodist_fdl_la_SOURCES) $(nodist_gf_logdump_SOURCES) \
+	     $(nodist_gf_recon_SOURCES)
+
+fdl.c: fdl-tmpl.c gen_fdl.py
+	$(PYTHON) $(srcdir)/gen_fdl.py $(srcdir)/fdl-tmpl.c > $@
+
+libfdl.c: dump-tmpl.c gen_dumper.py
+	$(PYTHON) $(srcdir)/gen_dumper.py $(srcdir)/dump-tmpl.c > $@
+
+librecon.c: recon-tmpl.c gen_recon.py
+	$(PYTHON) $(srcdir)/gen_recon.py $(srcdir)/recon-tmpl.c > $@
diff --git a/xlators/experimental/fdl/src/dump-tmpl.c b/xlators/experimental/fdl/src/dump-tmpl.c
new file mode 100644
index 00000000000..cac1071a9c1
--- /dev/null
+++ b/xlators/experimental/fdl/src/dump-tmpl.c
@@ -0,0 +1,156 @@
+#pragma fragment PROLOG
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glfs.h"
+#include "iatt.h"
+#include "xlator.h"
+#include "jnl-types.h"
+
+#pragma fragment DICT
+        /*
+         * A dict is read as a run of
+         *      <int key_len><key bytes><int data_len><data bytes>
+         * records, ended by a zero key_len.  Only each key and the size of
+         * its value are printed (the key is printed with %s, so it is
+         * presumably stored NUL-terminated - TODO confirm against the writer).
+         * The length words follow variable-sized items and can therefore sit
+         * at any alignment, so they are copied out with memcpy instead of
+         * being dereferenced through a cast pointer.
+         */
+        {
+                int key_len, data_len;
+                char *key_ptr;
+                printf ("@ARGNAME@ = dict {\n");
+                for (;;) {
+                        memcpy (&key_len, new_meta, sizeof(key_len));
+                        new_meta += sizeof(key_len);
+                        if (!key_len) {
+                                break;
+                        }
+                        key_ptr = new_meta;
+                        new_meta += key_len;
+                        memcpy (&data_len, new_meta, sizeof(data_len));
+                        new_meta += sizeof(data_len) + data_len;
+                        printf (" %s = <%d bytes>\n", key_ptr, data_len);
+                }
+                printf ("}\n");
+        }
+
+#pragma fragment DOUBLE
+        /*
+         * 64-bit scalar stored inline in the meta stream.  It is copied out
+         * with memcpy because earlier variable-length items can leave it
+         * misaligned.  The value is passed twice because the substituted
+         * format consumes two arguments (decimal and hex).
+         */
+        {
+                uint64_t val64;
+                memcpy (&val64, new_meta, sizeof(val64));
+                printf ("@ARGNAME@ = @FORMAT@\n", val64, val64);
+                new_meta += sizeof(val64);
+        }
+
+#pragma fragment GFID
+        /* A gfid is a raw uuid_t in the meta stream; print it and step over it. */
+        {
+                uuid_t  *gfid_p = (uuid_t *)new_meta;
+
+                printf ("@ARGNAME@ = <gfid %s>\n", uuid_utoa (*gfid_p));
+                new_meta += sizeof (*gfid_p);
+        }
+
+#pragma fragment INTEGER
+        /*
+         * 32-bit scalar stored inline in the meta stream.  It is copied out
+         * with memcpy because earlier variable-length items can leave it
+         * misaligned.  The value is passed twice because the substituted
+         * format consumes two arguments (decimal and hex).
+         */
+        {
+                uint32_t val32;
+                memcpy (&val32, new_meta, sizeof(val32));
+                printf ("@ARGNAME@ = @FORMAT@\n", val32, val32);
+                new_meta += sizeof(val32);
+        }
+
+#pragma fragment LOC
+        /*
+         * loc record: gfid, parent gfid, then a one-byte "name present" flag
+         * which, when non-zero, is followed by the NUL-terminated name.
+         */
+        {
+                char    name_present;
+
+                printf ("@ARGNAME@ = loc {\n");
+
+                printf ("  gfid = %s\n", uuid_utoa (*((uuid_t *)new_meta)));
+                new_meta += sizeof (uuid_t);
+
+                printf ("  pargfid = %s\n", uuid_utoa (*((uuid_t *)new_meta)));
+                new_meta += sizeof (uuid_t);
+
+                name_present = *new_meta;
+                new_meta += 1;
+                if (name_present) {
+                        printf ("  name = %s\n", new_meta);
+                        new_meta += strlen (new_meta) + 1;
+                }
+
+                printf ("}\n");
+        }
+
+#pragma fragment STRING
+        /*
+         * Optional string: a one-byte presence flag, followed (only when the
+         * flag is non-zero) by the NUL-terminated text.  An absent string
+         * prints nothing.
+         */
+        {
+                char    str_present     = *new_meta;
+
+                new_meta += 1;
+                if (str_present) {
+                        printf ("@ARGNAME@ = %s\n", new_meta);
+                        new_meta += strlen (new_meta) + 1;
+                }
+        }
+
+#pragma fragment VECTOR
+        /*
+         * Only the payload length is kept in the meta stream; the payload
+         * itself is in the data stream and is skipped rather than printed.
+         * The length is copied out with memcpy because it may be misaligned.
+         */
+        {
+                size_t len;
+                memcpy (&len, new_meta, sizeof(len));
+                new_meta += sizeof(len);
+                printf ("@ARGNAME@ = <%zu bytes>\n", len);
+                new_data += len;
+        }
+
+#pragma fragment IATT
+        /*
+         * iatt record: an ia_prot_t followed by six 32-bit integers, printed
+         * as uid, gid, and two-integer atime and mtime values.
+         */
+        {
+                ia_prot_t       *prot   = (ia_prot_t *)new_meta;
+                uint32_t        *vals;
+                char            perms[13];
+
+                /* Build the whole permission string first, print it once. */
+                perms[0]  = prot->suid        ? 'S' : '-';
+                perms[1]  = prot->sgid        ? 'S' : '-';
+                perms[2]  = prot->sticky      ? 'T' : '-';
+                perms[3]  = prot->owner.read  ? 'r' : '-';
+                perms[4]  = prot->owner.write ? 'w' : '-';
+                perms[5]  = prot->owner.exec  ? 'x' : '-';
+                perms[6]  = prot->group.read  ? 'r' : '-';
+                perms[7]  = prot->group.write ? 'w' : '-';
+                perms[8]  = prot->group.exec  ? 'x' : '-';
+                perms[9]  = prot->other.read  ? 'r' : '-';
+                perms[10] = prot->other.write ? 'w' : '-';
+                perms[11] = prot->other.exec  ? 'x' : '-';
+                perms[12] = '\0';
+
+                printf ("@ARGNAME@ = iatt {\n");
+                printf ("  ia_prot = %s\n", perms);
+                new_meta += sizeof(ia_prot_t);
+
+                vals = (uint32_t *)new_meta;
+                printf ("  ia_uid = %u\n", vals[0]);
+                printf ("  ia_gid = %u\n", vals[1]);
+                printf ("  ia_atime = %u.%09u\n", vals[2], vals[3]);
+                printf ("  ia_mtime = %u.%09u\n", vals[4], vals[5]);
+                new_meta += sizeof(*vals) * 6;
+        }
+
+#pragma fragment FOP
+/*
+ * Print the journaled arguments of one fop.  Both cursors are advanced past
+ * whatever the generated body consumed and handed back to the caller.
+ */
+void
+fdl_dump_@NAME@ (char **old_meta, char **old_data)
+{
+        /* The generated body refers to these two locals by name. */
+        char    *new_meta       = *old_meta;
+        char    *new_data       = *old_data;
+
+        /* TBD: word size/endianness */
+@FUNCTION_BODY@
+
+        *old_data = new_data;
+        *old_meta = new_meta;
+}
+
+#pragma fragment CASE
+        case GF_FOP_@UPNAME@:   /* one switch arm is emitted per journaled fop */
+                printf ("=== GF_FOP_@UPNAME@\n");       /* banner before the args */
+                fdl_dump_@NAME@ (&new_meta, &new_data); /* advances both cursors */
+                break;
+
+#pragma fragment EPILOG
+/*
+ * Print one journal event.  Reads the event header found at *old_meta,
+ * dispatches on its fop type to the matching generated dumper, and stores the
+ * advanced meta/data cursors back through the arguments.
+ *
+ * Returns 1 when the fop type was recognized, 0 otherwise.
+ */
+int
+fdl_dump (char **old_meta, char **old_data)
+{
+        static glfs_t   *fs             = NULL;
+        char            *new_meta       = *old_meta;
+        char            *new_data       = *old_data;
+        event_header_t  *hdr            = (event_header_t *)new_meta;
+        int             known           = 1;
+
+        /*
+         * Nothing else from GFAPI is called here; creating a dummy glfs_t
+         * once is simply the most convenient way to satisfy the spurious
+         * dependencies on how it or glusterfsd initialize (e.g. setting up
+         * THIS).
+         */
+        if (fs == NULL) {
+                fs = glfs_new ("dummy");
+        }
+
+        new_meta += sizeof (*hdr);
+
+        /* TBD: check event_type instead of assuming NEW_REQUEST */
+
+        switch (hdr->fop_type) {
+@SWITCH_BODY@
+
+        default:
+                printf ("unknown fop %u\n", hdr->fop_type);
+                known = 0;
+                break;
+        }
+
+        *old_meta = new_meta;
+        *old_data = new_data;
+        return known;
+}
diff --git a/xlators/experimental/fdl/src/fdl-tmpl.c b/xlators/experimental/fdl/src/fdl-tmpl.c
new file mode 100644
index 00000000000..8fcc6a8d6ff
--- /dev/null
+++ b/xlators/experimental/fdl/src/fdl-tmpl.c
@@ -0,0 +1,506 @@
+/*
+  Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include "call-stub.h"
+#include "iatt.h"
+#include "defaults.h"
+#include "syscall.h"
+#include "xlator.h"
+#include "jnl-types.h"
+
+/* TBD: make tunable */
+#define META_FILE_SIZE  (1 << 20)
+#define DATA_FILE_SIZE  (1 << 24)
+
+enum gf_fdl {   /* memory-accounting type ids private to this translator */
+        gf_fdl_mt_fdl_private_t = gf_common_mt_end + 1, /* tag for fdl_private_t */
+        gf_fdl_mt_end   /* last id; mem_acct_init passes gf_fdl_mt_end + 1 */
+};
+
+typedef struct {                        /* one mmap'ed journal file */
+        char            *type;          /* "meta" or "data"; part of the file name */
+        off_t           size;           /* size the file is extended/mapped to */
+        char            *path;          /* gf_asprintf'ed file name; NULL when closed */
+        int             fd;             /* open descriptor, or -1 */
+        void *          ptr;            /* mapping base, or NULL when unmapped */
+        off_t           max_offset;     /* bytes used; file is truncated here on close */
+} log_obj_t;
+
+typedef struct {                                /* per-translator state */
+        struct list_head        reqs;           /* call stubs waiting to be journaled */
+        pthread_mutex_t         req_lock;       /* taken around reqs by enqueue/worker */
+        pthread_cond_t          req_cond;       /* signalled on enqueue, stop, change-term */
+        char                    *log_dir;       /* value of the "log-path" option */
+        pthread_t               worker;         /* thread running fdl_worker */
+        gf_boolean_t            should_stop;    /* set by fdl_fini to end the worker */
+        gf_boolean_t            change_term;    /* set by fdl_ipc to request a new term */
+        log_obj_t               meta_log;       /* journal of serialized fop metadata */
+        log_obj_t               data_log;       /* journal of serialized fop data */
+        int                     term;           /* current term number */
+        int                     first_term;     /* first term opened by this worker */
+} fdl_private_t;
+
+/*
+ * Append a call stub to the tail of the request queue and wake the journal
+ * worker.  The queue is only touched with req_lock held; the condition is
+ * signalled after the lock has been dropped.
+ */
+void
+fdl_enqueue (xlator_t *this, call_stub_t *stub)
+{
+        fdl_private_t   *fdl    = this->private;
+        pthread_mutex_t *qlock  = &fdl->req_lock;
+
+        pthread_mutex_lock (qlock);
+        {
+                list_add_tail (&stub->list, &fdl->reqs);
+        }
+        pthread_mutex_unlock (qlock);
+
+        pthread_cond_signal (&fdl->req_cond);
+}
+
+#pragma generate
+
+/*
+ * Create, size and map the journal file of one log object for a given term.
+ *
+ * The file is <log_dir>/<ident>-<type>-<term>.jnl, where <ident> is the
+ * command-line log identity or "fubar" when none was given.  It is created
+ * (or truncated), extended to obj->size and mapped shared and writable.
+ *
+ * On success the mapping address is stored in obj->ptr, obj->max_offset is
+ * reset to zero and the address is returned.  On any failure the descriptor
+ * and path are released and NULL is returned, so callers can rely on a plain
+ * NULL check (mmap's MAP_FAILED is never handed back).
+ */
+char *
+fdl_open_term_log (xlator_t *this, log_obj_t *obj, int term)
+{
+        fdl_private_t   *priv   = this->private;
+        int             ret;
+        char *          ptr     = NULL;
+
+        /*
+         * Use .jnl instead of .log so that we don't get test info (mistakenly)
+         * appended to our journal files.
+         */
+        if (this->ctx->cmd_args.log_ident) {
+                ret = gf_asprintf (&obj->path, "%s/%s-%s-%d.jnl",
+                                   priv->log_dir, this->ctx->cmd_args.log_ident,
+                                   obj->type, term);
+        }
+        else {
+                ret = gf_asprintf (&obj->path, "%s/fubar-%s-%d.jnl",
+                                   priv->log_dir, obj->type, term);
+        }
+        if ((ret <= 0) || !obj->path) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to construct log-file path");
+                goto err;
+        }
+
+        gf_log (this->name, GF_LOG_INFO, "opening %s (size %ld)",
+                obj->path, obj->size);
+
+        obj->fd = open (obj->path, O_RDWR|O_CREAT|O_TRUNC, 0666);
+        if (obj->fd < 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to open log file (%s)", strerror(errno));
+                goto err;
+        }
+
+#if !defined(GF_BSD_HOST_OS)
+        /*
+         * NetBSD can just go die in a fire.  Even though it claims to support
+         * fallocate/posix_fallocate they don't actually *do* anything so the
+         * file size remains zero.  Then mmap succeeds anyway, but any access
+         * to the mmap'ed region will segfault.  It would be acceptable for
+         * fallocate to do what it says, for mmap to fail, or for access to
+         * extend the file.  NetBSD managed to hit the trifecta of Getting
+         * Everything Wrong, and debugging in that environment to get this far
+         * has already been painful enough (systems I worked on in 1990 were
+         * better that way).  We'll fall through to the lseek/write method, and
+         * performance will be worse, and TOO BAD.
+         */
+        if (sys_fallocate(obj->fd,0,0,obj->size) < 0)
+#endif
+        {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "failed to fallocate space for log file");
+                /*
+                 * Have to do this the ugly page-faulty way.  If even this
+                 * fails the file stays short, and touching the mapping beyond
+                 * its end would kill the process, so give up cleanly instead.
+                 */
+                if ((sys_lseek (obj->fd, obj->size-1, SEEK_SET) < 0) ||
+                    (sys_write (obj->fd, "", 1) != 1)) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to extend log file (%s)",
+                                strerror(errno));
+                        goto err;
+                }
+        }
+
+        ptr = mmap (NULL, obj->size, PROT_WRITE, MAP_SHARED, obj->fd, 0);
+        if (ptr == MAP_FAILED) {
+                gf_log (this->name, GF_LOG_ERROR, "failed to mmap log (%s)",
+                        strerror(errno));
+                /* MAP_FAILED is non-NULL; callers only test for NULL. */
+                ptr = NULL;
+                goto err;
+        }
+
+        obj->ptr = ptr;
+        obj->max_offset = 0;
+        return ptr;
+
+err:
+        if (obj->fd >= 0) {
+                sys_close (obj->fd);
+                obj->fd = (-1);
+        }
+        if (obj->path) {
+                GF_FREE (obj->path);
+                obj->path = NULL;
+        }
+        return ptr;
+}
+
+/*
+ * Release everything fdl_open_term_log set up for one log object: unmap the
+ * journal, truncate the file down to the bytes actually used (max_offset),
+ * close it and free the path.  Each step is skipped when its resource is not
+ * held, so calling this on an already-closed object is harmless.
+ */
+void
+fdl_close_term_log (xlator_t *this, log_obj_t *obj)
+{
+        fdl_private_t   *fdl    = this->private;
+        int             rc;
+
+        if (obj->ptr != NULL) {
+                (void) munmap (obj->ptr, obj->size);
+                obj->ptr = NULL;
+        }
+
+        if (obj->fd >= 0) {
+                gf_log (this->name, GF_LOG_INFO,
+                        "truncating term %d %s journal to %ld",
+                        fdl->term, obj->type, obj->max_offset);
+                rc = sys_ftruncate (obj->fd, obj->max_offset);
+                if (rc < 0) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "failed to truncate journal (%s)",
+                                strerror (errno));
+                }
+                sys_close (obj->fd);
+                obj->fd = (-1);
+        }
+
+        if (obj->path != NULL) {
+                GF_FREE (obj->path);
+                obj->path = NULL;
+        }
+}
+
+/*
+ * Move on to the next term: close both current journals, bump the term
+ * number, then open a fresh meta journal followed by a fresh data journal.
+ * The new mapping addresses are stored through meta_ptr and data_ptr.
+ *
+ * Returns _gf_true when both journals were opened, _gf_false as soon as one
+ * of them could not be.
+ */
+gf_boolean_t
+fdl_change_term (xlator_t *this, char **meta_ptr, char **data_ptr)
+{
+        fdl_private_t   *fdl    = this->private;
+        log_obj_t       *meta   = &fdl->meta_log;
+        log_obj_t       *data   = &fdl->data_log;
+
+        fdl_close_term_log (this, meta);
+        fdl_close_term_log (this, data);
+
+        fdl->term += 1;
+
+        *meta_ptr = fdl_open_term_log (this, meta, fdl->term);
+        if (*meta_ptr == NULL) {
+                return _gf_false;
+        }
+
+        *data_ptr = fdl_open_term_log (this, data, fdl->term);
+        return (*data_ptr != NULL) ? _gf_true : _gf_false;
+}
+
+/*
+ * Journal writer thread (arg is the xlator).
+ *
+ * Opens the meta and data journals for the first term, then loops: take the
+ * oldest queued stub, start a new term if either journal lacks room for it,
+ * serialize it into the mmap'ed journals, msync the touched range and resume
+ * the stub.  While the queue is empty the thread reacts to should_stop (exit)
+ * and change_term (rotate the journals), sleeping on req_cond in between.
+ *
+ * err_label is a computed-goto target that tracks whether req_lock is held,
+ * so each failure path unlocks exactly when it has to.  Both journals are
+ * closed before the thread returns (always NULL).
+ */
+void *
+fdl_worker (void *arg)
+{
+        xlator_t        *this           = arg;
+        fdl_private_t   *priv           = this->private;
+        call_stub_t     *stub;
+        char *          meta_ptr        = NULL;
+        off_t           *meta_offset    = &priv->meta_log.max_offset;
+        char *          data_ptr        = NULL;
+        off_t           *data_offset    = &priv->data_log.max_offset;
+        unsigned long   base_as_ul;
+        void *          msync_ptr;
+        size_t          msync_len;
+        gf_boolean_t    recycle;
+        void            *err_label      = &&err_unlocked;
+
+        priv->meta_log.type = "meta";
+        priv->meta_log.size = META_FILE_SIZE;
+        priv->meta_log.path = NULL;
+        priv->meta_log.fd = (-1);
+        priv->meta_log.ptr = NULL;
+
+        priv->data_log.type = "data";
+        priv->data_log.size = DATA_FILE_SIZE;
+        priv->data_log.path = NULL;
+        priv->data_log.fd = (-1);
+        priv->data_log.ptr = NULL;
+
+        /* TBD: initial term should come from persistent storage (e.g. etcd) */
+        priv->first_term = ++(priv->term);
+        meta_ptr = fdl_open_term_log (this, &priv->meta_log, priv->term);
+        if (!meta_ptr) {
+                goto *err_label;
+        }
+        data_ptr = fdl_open_term_log (this, &priv->data_log, priv->term);
+        if (!data_ptr) {
+                fdl_close_term_log (this, &priv->meta_log);
+                goto *err_label;
+        }
+
+        for (;;) {
+                pthread_mutex_lock (&priv->req_lock);
+                err_label = &&err_locked;
+                while (list_empty(&priv->reqs)) {
+                        /*
+                         * Sleep only when there is nothing to react to.  A
+                         * stop or change-term request raised while we were
+                         * busy with the previous stub found no waiter to
+                         * signal; sleeping unconditionally here would miss it
+                         * and could leave fdl_fini stuck in pthread_join.
+                         */
+                        if (!priv->should_stop && !priv->change_term) {
+                                pthread_cond_wait (&priv->req_cond,
+                                                   &priv->req_lock);
+                        }
+                        if (priv->should_stop) {
+                                goto *err_label;
+                        }
+                        if (priv->change_term) {
+                                if (!fdl_change_term(this, &meta_ptr,
+                                                           &data_ptr)) {
+                                        goto *err_label;
+                                }
+                                priv->change_term = _gf_false;
+                                continue;
+                        }
+                }
+                stub = list_entry (priv->reqs.next, call_stub_t, list);
+                list_del_init (&stub->list);
+                pthread_mutex_unlock (&priv->req_lock);
+                err_label = &&err_unlocked;
+                /*
+                 * TBD: batch requests
+                 *
+                 * What we should do here is gather up *all* of the requests
+                 * that have accumulated since we were last at this point,
+                 * blast them all out in one big writev, and then dispatch them
+                 * all before coming back for more.  That maximizes throughput,
+                 * at some cost to latency (due to queuing effects at the log
+                 * stage).  Note that we're likely to be above io-threads, so
+                 * the dispatch itself will be parallelized (at further cost to
+                 * latency).  For now, we just do the simplest thing and handle
+                 * one request all the way through before fetching the next.
+                 *
+                 * So, why mmap/msync instead of writev/fdatasync?  Because it's
+                 * faster.  Much faster.  So much faster that I half-suspect
+                 * cheating, but it's more convenient for now than having to
+                 * ensure that everything's page-aligned for O_DIRECT (the only
+                 * alternative that still might avoid ridiculous levels of
+                 * local-FS overhead).
+                 *
+                 * TBD: check that msync really does get our data to disk.
+                 */
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "logging %u+%u bytes for op %d",
+                        stub->jnl_meta_len, stub->jnl_data_len, stub->fop);
+                /* Start a new term if either journal cannot hold this stub. */
+                recycle = _gf_false;
+                if ((*meta_offset + stub->jnl_meta_len) > priv->meta_log.size) {
+                        recycle = _gf_true;
+                }
+                if ((*data_offset + stub->jnl_data_len) > priv->data_log.size) {
+                        recycle = _gf_true;
+                }
+                if (recycle && !fdl_change_term(this,&meta_ptr,&data_ptr)) {
+                        goto *err_label;
+                }
+                meta_ptr = priv->meta_log.ptr;
+                data_ptr = priv->data_log.ptr;
+                gf_log (this->name, GF_LOG_DEBUG, "serializing to %p/%p",
+                        meta_ptr + *meta_offset, data_ptr + *data_offset);
+                stub->serialize (stub, meta_ptr + *meta_offset,
+                                       data_ptr + *data_offset);
+                /*
+                 * msync wants a page-aligned start: round the address down
+                 * (the 0x0fff mask assumes 4KB pages) and lengthen the range
+                 * by the amount rounded off.
+                 */
+                if (stub->jnl_meta_len > 0) {
+                        base_as_ul = (unsigned long) (meta_ptr + *meta_offset);
+                        msync_ptr = (void *) (base_as_ul & ~0x0fff);
+                        msync_len = (size_t) (base_as_ul &  0x0fff);
+                        if (msync (msync_ptr, msync_len+stub->jnl_meta_len,
+                                              MS_SYNC) < 0) {
+                                gf_log (this->name, GF_LOG_WARNING,
+                                        "failed to log request meta (%s)",
+                                        strerror(errno));
+                        }
+                        *meta_offset += stub->jnl_meta_len;
+                }
+                if (stub->jnl_data_len > 0) {
+                        base_as_ul = (unsigned long) (data_ptr + *data_offset);
+                        msync_ptr = (void *) (base_as_ul & ~0x0fff);
+                        msync_len = (size_t) (base_as_ul &  0x0fff);
+                        if (msync (msync_ptr, msync_len+stub->jnl_data_len,
+                                              MS_SYNC) < 0) {
+                                gf_log (this->name, GF_LOG_WARNING,
+                                        "failed to log request data (%s)",
+                                        strerror(errno));
+                        }
+                        *data_offset += stub->jnl_data_len;
+                }
+                call_resume (stub);
+        }
+
+err_locked:
+        pthread_mutex_unlock (&priv->req_lock);
+err_unlocked:
+        fdl_close_term_log (this, &priv->meta_log);
+        fdl_close_term_log (this, &priv->data_log);
+        return NULL;
+}
+
+/*
+ * ipc fop handler.
+ *
+ *   FDL_IPC_CHANGE_TERM  asks the worker to switch to a new term and unwinds
+ *                        with success straight away (the switch itself
+ *                        happens later, in the worker).
+ *   FDL_IPC_GET_TERMS    unwinds with a dict holding "first" and "last", the
+ *                        first and current term numbers; on failure unwinds
+ *                        with -1 and ENOMEM or EIO.
+ *   anything else        is wound to the first child unchanged.
+ *
+ * Always returns 0.
+ */
+int32_t
+fdl_ipc (call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata)
+{
+        fdl_private_t   *priv   = this->private;
+        dict_t          *tdict  = NULL;
+        int32_t         gt_err  = EIO;
+
+        switch (op) {
+
+        case FDL_IPC_CHANGE_TERM:
+                gf_log (this->name, GF_LOG_INFO, "got CHANGE_TERM op");
+                /*
+                 * The worker examines change_term with req_lock held, so
+                 * set it under the same lock before signalling.
+                 */
+                pthread_mutex_lock (&priv->req_lock);
+                priv->change_term = _gf_true;
+                pthread_mutex_unlock (&priv->req_lock);
+                pthread_cond_signal (&priv->req_cond);
+                STACK_UNWIND_STRICT (ipc, frame, 0, 0, NULL);
+                break;
+
+        case FDL_IPC_GET_TERMS:
+                gf_log (this->name, GF_LOG_INFO, "got GET_TERMS op");
+                tdict = dict_new ();
+                if (!tdict) {
+                        gt_err = ENOMEM;
+                        goto gt_done;
+                }
+                if (dict_set_int32(tdict,"first",priv->first_term) != 0) {
+                        goto gt_done;
+                }
+                if (dict_set_int32(tdict,"last",priv->term) != 0) {
+                        goto gt_done;
+                }
+                gt_err = 0;
+        gt_done:
+                if (gt_err) {
+                        STACK_UNWIND_STRICT (ipc, frame, -1, gt_err, NULL);
+                } else {
+                        STACK_UNWIND_STRICT (ipc, frame, 0, 0, tdict);
+                }
+                /* Drop our own reference whether or not it was handed on. */
+                if (tdict) {
+                        dict_unref (tdict);
+                }
+                break;
+
+        default:
+                STACK_WIND_TAIL (frame,
+                                 FIRST_CHILD(this),
+                                 FIRST_CHILD(this)->fops->ipc,
+                                 op, xdata);
+        }
+
+        return 0;
+}
+
+/*
+ * Translator init: allocate the private state, set up the request queue with
+ * its lock and condition, read the "log-path" option, start the journal
+ * worker and install the hand-written ipc fop.
+ *
+ * Returns 0 on success.  On failure everything set up so far is torn down,
+ * this->private is left NULL and -1 is returned.
+ */
+int
+fdl_init (xlator_t *this)
+{
+        fdl_private_t   *priv           = NULL;
+        gf_boolean_t    lock_ready      = _gf_false;
+        gf_boolean_t    cond_ready      = _gf_false;
+
+        priv = GF_CALLOC (1, sizeof (*priv), gf_fdl_mt_fdl_private_t);
+        if (!priv) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to allocate fdl_private");
+                goto err;
+        }
+
+        INIT_LIST_HEAD (&priv->reqs);
+        if (pthread_mutex_init (&priv->req_lock, NULL) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to initialize req_lock");
+                goto err;
+        }
+        lock_ready = _gf_true;
+        if (pthread_cond_init (&priv->req_cond, NULL) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to initialize req_cond");
+                goto err;
+        }
+        cond_ready = _gf_true;
+
+        GF_OPTION_INIT ("log-path", priv->log_dir, path, err);
+
+        /*
+         * fdl_worker reads this->private as its very first action, so the
+         * pointer has to be published before the thread exists.
+         */
+        this->private = priv;
+        if (pthread_create(&priv->worker,NULL,fdl_worker,this) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to start fdl_worker");
+                this->private = NULL;
+                goto err;
+        }
+
+        /*
+         * The rest of the fop table is automatically generated, so this is a
+         * bit cleaner than messing with the generation to add a hand-written
+         * exception.
+         */
+        this->fops->ipc = fdl_ipc;
+
+        return 0;
+
+err:
+        if (priv) {
+                if (cond_ready) {
+                        pthread_cond_destroy (&priv->req_cond);
+                }
+                if (lock_ready) {
+                        pthread_mutex_destroy (&priv->req_lock);
+                }
+                GF_FREE(priv);
+        }
+        return -1;
+}
+
+/*
+ * Translator teardown: ask the worker to stop, wait for it to exit, then
+ * destroy the synchronization objects and free the private state.  Does
+ * nothing when init never completed (this->private is NULL).
+ */
+void
+fdl_fini (xlator_t *this)
+{
+        fdl_private_t   *priv   = this->private;
+
+        if (!priv) {
+                return;
+        }
+
+        /*
+         * The worker tests should_stop with req_lock held; setting it under
+         * the same lock means the flag cannot change between the worker's
+         * check and its going to sleep.
+         */
+        pthread_mutex_lock (&priv->req_lock);
+        priv->should_stop = _gf_true;
+        pthread_mutex_unlock (&priv->req_lock);
+        pthread_cond_signal (&priv->req_cond);
+
+        pthread_join (priv->worker, NULL);
+
+        /* The worker is gone, so nothing can be using these any more. */
+        pthread_cond_destroy (&priv->req_cond);
+        pthread_mutex_destroy (&priv->req_lock);
+
+        this->private = NULL;
+        GF_FREE(priv);
+}
+
+/*
+ * Re-read the journal directory after a volume option change.  The option
+ * name must be the one declared in options[] and read by fdl_init, i.e.
+ * "log-path".  Journals that are already open are not moved.
+ *
+ * Always returns 0, including when the option could not be re-read.
+ */
+int
+fdl_reconfigure (xlator_t *this, dict_t *options)
+{
+        fdl_private_t   *priv   = this->private;
+
+        GF_OPTION_RECONF ("log-path", priv->log_dir, options, path, out);
+        /* TBD: react if it changed */
+
+out:
+        return 0;
+}
+
+/*
+ * Set up memory accounting for this translator's allocation types.
+ *
+ * Returns the result of xlator_mem_acct_init (0 on success), or -1 when
+ * 'this' is NULL.
+ */
+int32_t
+mem_acct_init (xlator_t *this)
+{
+        int     ret = -1;
+
+        GF_VALIDATE_OR_GOTO ("fdl", this, out);
+
+        ret = xlator_mem_acct_init (this, gf_fdl_mt_end + 1);
+        if (ret != 0) {
+                /* One literal: split pieces used to join as "initfailed". */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Memory accounting init failed");
+        }
+out:
+        return ret;
+}
+
+class_methods_t class_methods = {       /* entry points of this translator */
+        .init           = fdl_init,             /* allocates state, starts the worker */
+        .fini           = fdl_fini,             /* stops the worker, frees the state */
+        .reconfigure    = fdl_reconfigure,      /* re-reads the journal directory */
+        .notify         = default_notify,       /* no translator-specific handling */
+};
+
+struct volume_options options[] = {     /* options accepted by this translator */
+        { .key = {"log-path"},          /* read into priv->log_dir by fdl_init */
+          .type = GF_OPTION_TYPE_PATH,
+          .default_value = DEFAULT_LOG_FILE_DIRECTORY,
+          .description = "Directory for FDL files."
+        },
+        { .key  = {NULL} },             /* NULL key; presumably the table terminator */
+};
+
+struct xlator_cbks cbks = {             /* all callbacks use the default_* handlers */
+        .release        = default_release,
+        .releasedir     = default_releasedir,
+        .forget         = default_forget,
+};
diff --git a/xlators/experimental/fdl/src/gen_dumper.py b/xlators/experimental/fdl/src/gen_dumper.py
new file mode 100755
index 00000000000..42db55d2cb3
--- /dev/null
+++ b/xlators/experimental/fdl/src/gen_dumper.py
@@ -0,0 +1,116 @@
+#!/usr/bin/python
+
+import os
+import re
+import sys
+
+curdir = os.path.dirname (sys.argv[0])
+gendir = os.path.join (curdir, '../../../../libglusterfs/src')
+sys.path.append (gendir)
+from generator import ops, fop_subs, cbk_subs, generate
+
+# See the big header comment at the start of gen_fdl.py to see how the stages
+# fit together.  The big difference here is that *all* of the C code is in the
+# template file as labelled fragments, instead of as Python strings.  That
+# makes it much easier to edit in one place, with proper syntax highlighting
+# and indentation.
+#
+#   Stage 1 uses type-specific fragments to generate FUNCTION_BODY, instead of
+#   LEN_*_TEMPLATE and SERLZ_*_TEMPLATE to generate LEN_CODE and SER_CODE.
+#
+#   Stage 2 uses the FOP and CASE fragments instead of RECON_TEMPLATE and
+#   FOP_TEMPLATE.  The expanded FOP code (including FUNCTION_BODY substitution
+#   in the middle of each function) is emitted immediately; the expanded CASE
+#   code is saved for the next stage.
+#
+#   Stage 3 uses the PROLOG and EPILOG fragments, with the expanded CASE code
+#   in the middle of EPILOG, to generate the whole output file.
+#
+# Another way of looking at it is to consider how the fragments appear in
+# the final output:
+#
+#   PROLOG
+#   FOP (expanded for CREATE)
+#       FOP before FUNCTION_BODY
+#       LOC, INTEGER, GFID, etc. (one per arg, by type)
+#       FOP after FUNCTION_BODY
+#   FOP (expanded for WRITEV)
+#       FOP before FUNCTION_BODY
+#       GFID, VECTOR, etc. (one per arg, by type)
+#       FOP after FUNCTION_BODY
+#   (more FOPs)
+#   EPILOG
+#       EPILOG before CASE
+#       CASE statements (one per fop)
+#       EPILOG after CASE
+
+typemap = {	# C type of a fop argument -> (fragment name, printf format for @FORMAT@)
+	'dict_t *':				( "DICT",		""),
+	'fd_t *':				( "GFID",		""),	# dumped with the GFID fragment
+	'dev_t':				( "DOUBLE",		"%ld (0x%lx)"),
+	'gf_xattrop_flags_t':	( "INTEGER",	"%d (0x%x)"),
+	'int32_t':				( "INTEGER",	"%d (0x%x)"),
+	'mode_t':				( "INTEGER",	"%d (0x%x)"),
+	'off_t':				( "DOUBLE",		"%ld (0x%lx)"),
+	'size_t':				( "DOUBLE",		"%ld (0x%lx)"),
+	'uint32_t':				( "INTEGER",	"%d (0x%x)"),
+	'loc_t *':				( "LOC",		""),
+	'const char *':			( "STRING",		""),
+	'struct iovec *':		( "VECTOR",		""),
+	'struct iatt *':		( "IATT",		""),
+}
+
+def get_special_subs (args):
+	code = ""
+	for arg in args:
+		if (arg[0] != 'fop-arg') or (len(arg) < 4):
+			continue
+		recon_type, recon_fmt = typemap[arg[2]]
+		code += fragments[recon_type].replace("@ARGNAME@",arg[3])		\
+									 .replace("@FORMAT@",recon_fmt)
+	return code
+
+def gen_functions ():
+	code = ""
+	for name, value in ops.iteritems():
+		if "journal" not in [ x[0] for x in value ]:
+			continue
+		fop_subs[name]["@FUNCTION_BODY@"] = get_special_subs(value)
+		# Print the FOP fragment with @FUNCTION_BODY@ in the middle.
+		code += generate(fragments["FOP"],name,fop_subs)
+	return code
+
+def gen_cases ():
+	code = ""
+	for name, value in ops.iteritems():
+		if "journal" not in [ x[0] for x in value ]:
+			continue
+		# Add the CASE fragment for this fop.
+		code += generate(fragments["CASE"],name,fop_subs)
+	return code
+
+def load_fragments (path="recon-tmpl.c"):
+	pragma_re = re.compile('pragma fragment (.*)')
+	cur_symbol = None
+	cur_value = ""
+	result = {}
+	for line in open(path,"r").readlines():
+		m = pragma_re.search(line)
+		if m:
+			if cur_symbol:
+				result[cur_symbol] = cur_value
+			cur_symbol = m.group(1)
+			cur_value = ""
+		else:
+			cur_value += line
+	if cur_symbol:
+		result[cur_symbol] = cur_value
+	return result
+
+if __name__ == "__main__":	# usage: gen_dumper.py <template file>; output goes to stdout
+	fragments = load_fragments(sys.argv[1])	# module global read by the gen_* helpers
+	print "/* BEGIN GENERATED CODE - DO NOT MODIFY */"
+	print fragments["PROLOG"]
+	print gen_functions()	# one dump function per journaled fop
+	print fragments["EPILOG"].replace("@SWITCH_BODY@",gen_cases())	# dispatcher with one case per fop
+	print "/* END GENERATED CODE */"
diff --git a/xlators/experimental/fdl/src/gen_fdl.py b/xlators/experimental/fdl/src/gen_fdl.py
new file mode 100755
index 00000000000..7f6b1aaaeaa
--- /dev/null
+++ b/xlators/experimental/fdl/src/gen_fdl.py
@@ -0,0 +1,328 @@
+#!/usr/bin/python
+
+import os
+import sys
+
+curdir = os.path.dirname (sys.argv[0])
+gendir = os.path.join (curdir, '../../../../libglusterfs/src')
+sys.path.append (gendir)
+from generator import ops, fop_subs, cbk_subs, generate
+
+# Generation occurs in three stages.  In this case, it actually makes more
+# sense to discuss them in the *opposite* order of that in which they
+# actually happen.
+#
+#   Stage 3 is to insert all of the generated code into a file, replacing the
+#   "#pragma generate" that's already there.  The file can thus contain all
+#   sorts of stuff that's not specific to one fop, either before or after the
+#   generated code as appropriate.
+#
+#   Stage 2 is to generate all of the code *for a particular fop*, using a
+#   string-valued template plus a table of substitution values.  Most of these
+#   are built in to the generator itself.  However, we also add a couple that
+#   are specific to this particular translator - LEN_CODE and SER_CODE.  These
+#   are per-fop functions to get the length or the contents (respectively) of
+#   what we'll put in the log.  As with stage 3 allowing per-file boilerplate
+#   before and after generated code, this allows per-fop boilerplate before and
+#   after generated code.
+#
+#   Stage 1, therefore, is to create the LEN_CODE and SER_CODE substitutions for
+#   each fop, and put them in the same table where e.g. NAME and SHORT_ARGS
+#   already are.  We do this by looking at the fop-description table in the
+#   generator module, then doing our own template substitution to plug each
+#   specific argument name into another string-valued template.
+#
+# So, what does this leave us with in terms of variables and files?
+#
+#   For stage 1, we have a series of LEN_*_TEMPLATE and SERLZ_*_TEMPLATE
+#   strings, which are used to generate the length and serialization code for
+#   each argument type.
+#
+#   For stage 2, we have a bunch of *_TEMPLATE strings (no LEN_ or SERLZ_
+#   prefix), which are used (along with the output from stage 1) to generate
+#   whole functions.
+#
+#   For stage 3, we have a whole separate file (fdl-tmpl.c) into which we insert
+#   the collection of all functions defined in stage 2.
+
+# Stage 2: per-fop function that totals the journal lengths and records them in the stub.
+LEN_TEMPLATE = """
+void
+fdl_len_@NAME@ (call_stub_t *stub)
+{
+        uint32_t    meta_len    = sizeof (event_header_t);
+		uint32_t	data_len	= 0;
+
+        /* TBD: global stuff, e.g. uid/gid */
+@LEN_CODE@
+
+		/* TBD: pad extension length */
+		stub->jnl_meta_len = meta_len;
+		stub->jnl_data_len = data_len;
+}
+"""
+# Stage 2: per-fop function that writes the event header, then @SER_CODE@, into meta_buf.
+SER_TEMPLATE = """
+void
+fdl_serialize_@NAME@ (call_stub_t *stub, char *meta_buf, char *data_buf)
+{
+		event_header_t	*eh;
+		unsigned long	offset = 0;
+
+        /* TBD: word size/endianness */
+		eh = (event_header_t *)meta_buf;
+		eh->event_type = NEW_REQUEST;
+		eh->fop_type = GF_FOP_@UPNAME@;
+		eh->request_id = 0;	// TBD
+		meta_buf += sizeof (*eh);
+@SER_CODE@
+		/* TBD: pad extension length */
+		eh->ext_length = offset;
+}
+"""
+# Stage 2: per-fop callback; it only unwinds with the values it was given.
+CBK_TEMPLATE = """
+int32_t
+fdl_@NAME@_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno,
+                @LONG_ARGS@)
+{
+        STACK_UNWIND_STRICT (@NAME@, frame, op_ret, op_errno,
+                             @SHORT_ARGS@);
+        return 0;
+}
+"""
+# Stage 2: per-fop "continue" function; winds the fop to the first child with fdl_@NAME@_cbk.
+CONTINUE_TEMPLATE = """
+int32_t
+fdl_@NAME@_continue (call_frame_t *frame, xlator_t *this,
+                     @LONG_ARGS@)
+{
+        STACK_WIND (frame, fdl_@NAME@_cbk,
+                    FIRST_CHILD(this), FIRST_CHILD(this)->fops->@NAME@,
+                    @SHORT_ARGS@);
+        return 0;
+}
+
+"""
+# Stage 2: fop entry point: make a stub, size it, enqueue it.  NOTE(review): stub is never NULL-checked.
+FOP_TEMPLATE = """
+int32_t
+fdl_@NAME@ (call_frame_t *frame, xlator_t *this,
+            @LONG_ARGS@)
+{
+        call_stub_t     *stub;
+
+        stub = fop_@NAME@_stub (frame, default_@NAME@,
+                                @SHORT_ARGS@);
+		fdl_len_@NAME@ (stub);
+        stub->serialize = fdl_serialize_@NAME@;
+        fdl_enqueue (this, stub);
+
+        return 0;
+}
+"""
+# Length of a dict: per pair an int, the key with its NUL, an int, and the value; then one final int.
+LEN_DICT_TEMPLATE = """
+		if (@SRC@) {
+			data_pair_t *memb;
+			for (memb = @SRC@->members_list; memb; memb = memb->next) {
+				meta_len += sizeof(int);
+				meta_len += strlen(memb->key) + 1;
+				meta_len += sizeof(int);
+				meta_len += memb->value->len;
+			}
+		}
+		meta_len += sizeof(int);
+"""
+# An fd is logged as its inode's 16-byte gfid (see SERLZ_GFID_TEMPLATE).
+LEN_GFID_TEMPLATE = """
+        meta_len += 16;
+"""
+# Scalars are logged at the native size of the stub member.
+LEN_INTEGER_TEMPLATE = """
+        meta_len += sizeof (@SRC@);
+"""
+
+# 34 = 16 (gfid) + 16 (pargfid) + 1 (name flag) + 1 (NUL after name); 33 when there is no name
+LEN_LOC_TEMPLATE = """
+        if (@SRC@.name) {
+                meta_len += (strlen (@SRC@.name) + 34);
+        } else {
+                meta_len += 33;
+        }
+"""
+# 1 for the flag byte, plus strlen + 1 for the string and its NUL when there is one.
+LEN_STRING_TEMPLATE = """
+        if (@SRC@) {
+                meta_len += (strlen (@SRC@) + 2);
+        } else {
+                meta_len += 1;
+        }
+"""
+
+LEN_VECTOR_TEMPLATE = """
+        meta_len += sizeof(size_t);
+        data_len += iov_length (@VEC@, @CNT@);
+"""
+
+LEN_IATT_TEMPLATE = """
+		meta_len += sizeof(@SRC@.ia_prot);
+		meta_len += sizeof(@SRC@.ia_uid);
+		meta_len += sizeof(@SRC@.ia_gid);
+		meta_len += sizeof(@SRC@.ia_atime);
+		meta_len += sizeof(@SRC@.ia_atime_nsec);
+		meta_len += sizeof(@SRC@.ia_mtime);
+		meta_len += sizeof(@SRC@.ia_mtime_nsec);
+"""
+
+SERLZ_DICT_TEMPLATE = """
+        if (@SRC@) {
+			data_pair_t *memb;
+			for (memb = @SRC@->members_list; memb; memb = memb->next) {
+				*((int *)(meta_buf+offset)) = strlen(memb->key) + 1;
+				offset += sizeof(int);
+				strcpy (meta_buf+offset, memb->key);
+				offset += strlen(memb->key) + 1;
+				*((int *)(meta_buf+offset)) = memb->value->len;
+				offset += sizeof(int);
+				memcpy (meta_buf+offset, memb->value->data, memb->value->len);
+				offset += memb->value->len;
+			}
+        }
+		*((int *)(meta_buf+offset)) = 0;
+		offset += sizeof(int);
+"""
+
+SERLZ_GFID_TEMPLATE = """
+        memcpy (meta_buf+offset, @SRC@->inode->gfid, 16);
+        offset += 16;
+"""
+
+SERLZ_INTEGER_TEMPLATE = """
+        memcpy (meta_buf+offset, &@SRC@, sizeof(@SRC@));
+        offset += sizeof(@SRC@);
+"""
+
+SERLZ_LOC_TEMPLATE = """
+        memcpy (meta_buf+offset, @SRC@.gfid, 16);
+        offset += 16;
+        memcpy (meta_buf+offset, @SRC@.pargfid, 16);
+        offset += 16;
+        if (@SRC@.name) {
+                *(meta_buf+offset) = 1;
+				++offset;
+                strcpy (meta_buf+offset, @SRC@.name);
+                offset += (strlen (@SRC@.name) + 1);
+        } else {
+                *(meta_buf+offset) = 0;
+				++offset;
+        }
+"""
+# Step past the NUL that strcpy wrote, as SERLZ_LOC_TEMPLATE does; readers skip strlen + 1 bytes.
+SERLZ_STRING_TEMPLATE = """
+        if (@SRC@) {
+                *(meta_buf+offset) = 1;
+				++offset;
+                strcpy (meta_buf+offset, @SRC@);
+                offset += (strlen (@SRC@) + 1);
+        } else {
+                *(meta_buf+offset) = 0;
+				++offset;
+        }
+"""
+# Only the total length goes into the metadata; the bytes themselves are copied to data_buf.
+SERLZ_VECTOR_TEMPLATE = """
+        *((size_t *)(meta_buf+offset)) = iov_length (@VEC@, @CNT@);
+        offset += sizeof(size_t);
+        int32_t i;
+        for (i = 0; i < @CNT@; ++i) {
+                memcpy (data_buf, @VEC@[i].iov_base, @VEC@[i].iov_len);
+                data_buf += @VEC@[i].iov_len;
+        }
+"""
+# NOTE(review): the stores below cast to uint32_t but advance by sizeof(field); confirm the widths.
+# We don't need to save all of the fields - only those affected by chown,
+# chgrp, chmod, and utime.
+SERLZ_IATT_TEMPLATE = """
+		*((ia_prot_t *)(meta_buf+offset)) = @SRC@.ia_prot;
+		offset += sizeof(@SRC@.ia_prot);
+		*((uint32_t *)(meta_buf+offset)) = @SRC@.ia_uid;
+		offset += sizeof(@SRC@.ia_uid);
+		*((uint32_t *)(meta_buf+offset)) = @SRC@.ia_gid;
+		offset += sizeof(@SRC@.ia_gid);
+		*((uint32_t *)(meta_buf+offset)) = @SRC@.ia_atime;
+		offset += sizeof(@SRC@.ia_atime);
+		*((uint32_t *)(meta_buf+offset)) = @SRC@.ia_atime_nsec;
+		offset += sizeof(@SRC@.ia_atime_nsec);
+		*((uint32_t *)(meta_buf+offset)) = @SRC@.ia_mtime;
+		offset += sizeof(@SRC@.ia_mtime);
+		*((uint32_t *)(meta_buf+offset)) = @SRC@.ia_mtime_nsec;
+		offset += sizeof(@SRC@.ia_mtime_nsec);
+"""
+
+typemap = {	# fop argument C type -> (length template, serialization template)
+	'dict_t *':				( LEN_DICT_TEMPLATE,	SERLZ_DICT_TEMPLATE),
+	'fd_t *':				( LEN_GFID_TEMPLATE,	SERLZ_GFID_TEMPLATE),
+	'dev_t':				( LEN_INTEGER_TEMPLATE,	SERLZ_INTEGER_TEMPLATE),
+	'gf_xattrop_flags_t':	( LEN_INTEGER_TEMPLATE,	SERLZ_INTEGER_TEMPLATE),
+	'int32_t':				( LEN_INTEGER_TEMPLATE,	SERLZ_INTEGER_TEMPLATE),
+	'mode_t':				( LEN_INTEGER_TEMPLATE, SERLZ_INTEGER_TEMPLATE),
+	'off_t':				( LEN_INTEGER_TEMPLATE,	SERLZ_INTEGER_TEMPLATE),
+	'size_t':				( LEN_INTEGER_TEMPLATE,	SERLZ_INTEGER_TEMPLATE),
+	'uint32_t':				( LEN_INTEGER_TEMPLATE,	SERLZ_INTEGER_TEMPLATE),
+	'loc_t *':				( LEN_LOC_TEMPLATE,		SERLZ_LOC_TEMPLATE),
+	'const char *':			( LEN_STRING_TEMPLATE,	SERLZ_STRING_TEMPLATE),
+	'struct iatt *':		( LEN_IATT_TEMPLATE,	SERLZ_IATT_TEMPLATE),
+}
+
+def get_special_subs (args):
+	"""Build the (LEN_CODE, SER_CODE) text for one fop's argument list.
+
+	Only 'fop-arg' entries of four or more fields contribute; the fourth
+	field names the stub->args member to log."""
+	lengths = []
+	serializers = []
+	for arg in args:
+		if (arg[0] == 'fop-arg') and (len(arg) >= 4):
+			if arg[3] == "vector":
+				# The one special case: a vector needs its count as well.
+				len_tmpl, ser_tmpl = LEN_VECTOR_TEMPLATE, SERLZ_VECTOR_TEMPLATE
+				subs = (("@VEC@","stub->args.vector"), ("@CNT@","stub->args.count"))
+			else:
+				# An unknown type raises KeyError on purpose, so that the
+				# build breaks until this table is updated to match.
+				len_tmpl, ser_tmpl = typemap[arg[2]]
+				subs = (("@SRC@","stub->args.%s" % arg[3]),)
+			for key, text in subs:
+				len_tmpl, ser_tmpl = len_tmpl.replace(key,text), ser_tmpl.replace(key,text)
+			lengths.append(len_tmpl)
+			serializers.append(ser_tmpl)
+	return "".join(lengths), "".join(serializers)
+
+def gen_fdl ():
+	"""Print every journaled fop's functions, then the xlator_fops table."""
+	stages = ((LEN_TEMPLATE, fop_subs), (SER_TEMPLATE, fop_subs),
+			  (CBK_TEMPLATE, cbk_subs), (CONTINUE_TEMPLATE, fop_subs),
+			  (FOP_TEMPLATE, fop_subs))
+	table = ["struct xlator_fops fops = {"]
+	for name, value in ops.iteritems():
+		if "journal" in [ entry[0] for entry in value ]:
+			len_code, ser_code = get_special_subs(value)
+			# Drop the last character of each (the templates end in a newline).
+			fop_subs[name]["@LEN_CODE@"] = len_code[:-1]
+			fop_subs[name]["@SER_CODE@"] = ser_code[:-1]
+			for tmpl, subs in stages:
+				print generate(tmpl,name,subs)
+			table.append("\t.%s = fdl_%s," % (name, name))
+	table.append("};")
+	for row in table:
+		print row
+
+for l in open(sys.argv[1],'r').readlines():	# stage 3: copy the template, expanding the pragma
+	if l.find('#pragma generate') == -1:
+		print l.rstrip('\n')	# unlike l[:-1], safe when the last line has no newline
+		continue
+	print "/* BEGIN GENERATED CODE - DO NOT MODIFY */"
+	gen_fdl()
+	print "/* END GENERATED CODE */"
diff --git a/xlators/experimental/fdl/src/gen_recon.py b/xlators/experimental/fdl/src/gen_recon.py
new file mode 100755
index 00000000000..26318f92d88
--- /dev/null
+++ b/xlators/experimental/fdl/src/gen_recon.py
@@ -0,0 +1,191 @@
+#!/usr/bin/python
+
+import os
+import re
+import string
+import sys
+
+curdir = os.path.dirname (sys.argv[0])
+gendir = os.path.join (curdir, '../../../../libglusterfs/src')
+sys.path.append (gendir)
+from generator import ops, fop_subs, cbk_subs, generate
+
+# See the big header comment at the start of gen_fdl.py to see how the stages
+# fit together.  The big difference here is that *all* of the C code is in the
+# template file as labelled fragments, instead of as Python strings.  That
+# makes it much easier to edit in one place, with proper syntax highlighting
+# and indentation.
+#
+#   Stage 1 uses type-specific fragments to generate FUNCTION_BODY, instead of
+#   LEN_*_TEMPLATE and SERLZ_*_TEMPLATE to generate LEN_CODE and SER_CODE.
+#
+#   Stage 2 uses the FOP and CASE fragments instead of RECON_TEMPLATE and
+#   FOP_TEMPLATE.  The expanded FOP code (including FUNCTION_BODY substitution
+#   in the middle of each function) is emitted immediately; the expanded CASE
+#   code is saved for the next stage.
+#
+#   Stage 3 uses the PROLOG and EPILOG fragments, with the expanded CASE code
+#   in the middle of EPILOG, to generate the whole output file.
+#
+# Another way of looking at it is to consider how the fragments appear in
+# the final output:
+#
+#   PROLOG
+#   FOP (expanded for CREATE)
+#       FOP before FUNCTION_BODY
+#       LOC, INTEGER, GFID, etc. (one per arg, by type)
+#       FOP after FUNCTION_BODY
+#   FOP (expanded for WRITEV)
+#       FOP before FUNCTION_BODY
+#       GFID, VECTOR, etc. (one per arg, by type)
+#       FOP after FUNCTION_BODY
+#   (more FOPs)
+#   EPILOG
+#       EPILOG before CASE
+#       CASE statements (one per fop)
+#       EPILOG after CASE
+
+typemap = {	# fop argument C type -> name of the template fragment that decodes it
+	'dict_t *':				"DICT",
+	'fd_t *':				"FD",
+	'dev_t':				"DOUBLE",
+	'gf_xattrop_flags_t':	"INTEGER",
+	'int32_t':				"INTEGER",
+	'mode_t':				"INTEGER",
+	'off_t':				"DOUBLE",
+	'size_t':				"DOUBLE",
+	'uint32_t':				"INTEGER",
+	'loc_t *':				"LOC",
+	'const char *':			"STRING",
+	'struct iovec *':		"VECTOR",
+	'struct iatt *':		"IATT",
+	'struct iobref *':		"IOBREF",
+}
+
+def get_special_subs (name, args, fop_type):
+	"""Return (body code, link code, syncop argument string, cleanup code) for one fop."""
+	code = ""
+	cleanups = ""
+	links = ""
+	s_args = []
+	for arg in args:
+		if arg[0] == 'extra':
+			code += "\t%s %s;\n\n" % (arg[2], arg[1])
+			s_args.append(arg[3])
+			continue
+		if arg[0] == 'link':
+			links += fragments["LINK"].replace("@INODE_ARG@",arg[1])	\
+									  .replace("@IATT_ARG@",arg[2])
+			continue
+		if arg[0] != 'fop-arg':
+			continue
+		if (name, arg[1]) == ('writev', 'count'):
+			# Special case: just skip this.  We can't mark it as 'nosync'
+			# because of the way the translator and dumper generators look for
+			# that after 'stub-name' which we don't define.  Instead of adding a
+			# bunch of generic infrastructure for this one case, just pound it
+			# here.
+			continue
+		recon_type = typemap[arg[2]]
+		if (name == "create") and (arg[1] == "fd"):
+			# Special case: fd for create is new, not looked up.
+			recon_type = "NEW_FD"
+		elif (recon_type == "LOC") and (fop_type == "entry-op"):
+			# Need to treat this differently for inode vs. entry ops.
+			# Special case: link source is treated like inode-op.
+			if (name != "link") or (arg[1] != "oldloc"):
+				recon_type = "PARENT_LOC"
+		code += fragments[recon_type].replace("@ARGNAME@",arg[1])		\
+									 .replace("@ARGTYPE@",arg[2])
+		cleanup_key = recon_type + "_CLEANUP"
+		if cleanup_key in fragments:
+			# Prepend: a failure jumps to the previous argument's label, so each
+			# label must be followed only by cleanups for earlier arguments.
+			cleanups = fragments[cleanup_key].replace("@ARGNAME@",arg[1]) + cleanups
+		if 'nosync' in arg[4:]:
+			code += "\t(void)%s;\n" % arg[1]
+			continue
+		if arg[2] in ("loc_t *", "struct iatt *"):
+			# These are passed as pointers to the syncop, but they're actual
+			# structures in the generated code.
+			s_args.append("&"+arg[1])
+		else:
+			s_args.append(arg[1])
+	# We have to handle a couple of special cases here, because the syncops
+	# were defined with a different argument order than the fops they're
+	# based on.
+	if name == 'writev':
+		# Swap 'flags' and 'iobref'.  Also, we need to add the iov count, which
+		# is not stored in or read from the journal.  There are other ways to
+		# do that, but this is the only place we need anything similar and we
+		# already have to treat it as a special case so this is simplest.
+		s_args_str = 'fd, &vector, 1, off, iobref, flags, xdata'
+	elif name == 'symlink':
+		# Swap 'linkpath' and 'loc'.
+		s_args_str = '&loc, linkpath, &iatt, xdata'
+	else:
+		s_args_str = ", ".join (s_args)
+	return code, links, s_args_str, cleanups
+
+# TBD: probably need to generate type-specific cleanup code as well - e.g.
+# fd_unref for an fd_t, loc_wipe for a loc_t, and so on.  All of these
+# generated CLEANUP fragments will go at the end of the function, with goto
+# labels.  Meanwhile, the error-checking part of each type-specific fragment
+# (e.g. LOC or FD) will need to update the indirect label that we jump to when
+# an error is detected.  This will probably get messy.
+def gen_functions ():
+	"""Expand the FOP fragment into one replay function per journaled fop."""
+	keys = ("@FUNCTION_BODY@", "@LINKS@", "@SYNCOP_ARGS@", "@CLEANUPS@")
+	chunks = []
+	for name, value in ops.iteritems():
+		journal_kinds = [ entry[1] for entry in value if entry[0] == "journal" ]
+		if journal_kinds:
+			parts = get_special_subs (name, value, journal_kinds[0])
+			subs = fop_subs[name]
+			# get_special_subs returns its four strings in the same order as keys.
+			for key, text in zip(keys, parts):
+				subs[key] = text
+			# During reconciliation the writev vector is always a single
+			# element (in normal I/O it's not), so the value that means
+			# success is that element's length.
+			if name == "writev":
+				subs["@SUCCESS_VALUE@"] = "vector.iov_len"
+			else:
+				subs["@SUCCESS_VALUE@"] = "GFAPI_SUCCESS"
+			chunks.append(generate(fragments["FOP"],name,fop_subs))
+	return "".join(chunks)
+
+def gen_cases ():
+	"""Expand the CASE fragment once for each fop that has a "journal" entry."""
+	journaled = []
+	for name, value in ops.iteritems():
+		if "journal" in [ entry[0] for entry in value ]:
+			journaled.append(name)
+	# One CASE expansion per journaled fop, in ops iteration order.
+	return "".join([generate(fragments["CASE"],fop,fop_subs) for fop in journaled])
+
+def load_fragments (path="recon-tmpl.c"):
+	"""Parse a template into {name: text}; a line containing "pragma fragment
+	NAME" starts fragment NAME, and text before the first pragma is dropped."""
+	pragma_re = re.compile('pragma fragment (.*)')
+	result = {}
+	cur_symbol = None
+	with open(path,"r") as tmpl:		# closed even if parsing raises
+		for line in tmpl:
+			m = pragma_re.search(line)
+			if m:
+				# strip() so trailing blanks or a CR can't corrupt the key
+				cur_symbol = m.group(1).strip()
+				if cur_symbol:
+					result[cur_symbol] = ""
+			elif cur_symbol:
+				result[cur_symbol] += line
+	return result
+
+if __name__ == "__main__":
+	fragments = load_fragments(sys.argv[1])	# global: the gen_* helpers read it
+	print "/* BEGIN GENERATED CODE - DO NOT MODIFY */"
+	print fragments["PROLOG"]
+	print gen_functions()
+	print fragments["EPILOG"].replace("@SWITCH_BODY@",gen_cases())	# cases land inside EPILOG
+	print "/* END GENERATED CODE */"
diff --git a/xlators/experimental/fdl/src/jnl-types.h b/xlators/experimental/fdl/src/jnl-types.h
new file mode 100644
index 00000000000..8cb39d01a25
--- /dev/null
+++ b/xlators/experimental/fdl/src/jnl-types.h
@@ -0,0 +1,14 @@
+#define NEW_REQUEST     ((uint8_t)'N')
+/* Fixed-size header that the serializer writes at the start of a record. */
+typedef struct {
+        uint8_t         event_type;     /* e.g. NEW_REQUEST */
+        uint8_t         fop_type;       /* e.g. GF_FOP_SETATTR */
+        uint16_t        request_id;
+        uint32_t        ext_length;     /* metadata bytes after this header */
+} event_header_t;
+
+enum {
+        FDL_IPC_BASE = 0xfeedbee5,       /* ... and they make honey */
+        FDL_IPC_CHANGE_TERM,
+        FDL_IPC_GET_TERMS,
+};
diff --git a/xlators/experimental/fdl/src/logdump.c b/xlators/experimental/fdl/src/logdump.c
new file mode 100644
index 00000000000..7c979c32a04
--- /dev/null
+++ b/xlators/experimental/fdl/src/logdump.c
@@ -0,0 +1,50 @@
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/mman.h>
+
+extern int fdl_dump (char **, char **);
+
+/* Dump the journal in argv[1] (meta) and argv[2] (data) until fdl_dump returns 0. */
+int
+main (int argc, char **argv)
+{
+        int     meta_fd, data_fd;
+        char    *meta_buf, *data_buf;
+
+        if (argc < 3) {
+                fprintf (stderr, "Expected: <meta-file> <data-file>\n");
+                return EXIT_FAILURE;
+        }
+
+        meta_fd = open (argv[1], O_RDONLY);
+        if (meta_fd < 0) {
+                perror ("open");
+                return EXIT_FAILURE;
+        }
+        /* TBD: get proper length */
+        meta_buf = mmap (NULL, 1048576, PROT_READ, MAP_PRIVATE, meta_fd, 0);
+        if (meta_buf == MAP_FAILED) {
+                perror ("mmap");
+                return EXIT_FAILURE;
+        }
+
+        data_fd = open (argv[2], O_RDONLY);
+        if (data_fd < 0) {
+                perror ("open");
+                return EXIT_FAILURE;
+        }
+        /* TBD: get proper length */
+        data_buf = mmap (NULL, 1048576, PROT_READ, MAP_PRIVATE, data_fd, 0);
+        if (data_buf == MAP_FAILED) {
+                perror ("mmap");
+                return EXIT_FAILURE;
+        }
+
+        while (fdl_dump (&meta_buf, &data_buf)) {
+                continue;
+        }
+
+        return EXIT_SUCCESS;
+}
diff --git a/xlators/experimental/fdl/src/recon-tmpl.c b/xlators/experimental/fdl/src/recon-tmpl.c
new file mode 100644
index 00000000000..523bda39418
--- /dev/null
+++ b/xlators/experimental/fdl/src/recon-tmpl.c
@@ -0,0 +1,305 @@
+#pragma fragment PROLOG
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "fd.h"
+#include "iatt.h"
+#include "syncop.h"
+#include "xlator.h"
+#include "glfs-internal.h"
+
+#include "jnl-types.h"
+
+#define GFAPI_SUCCESS 0
+
+/* Find gfid in the active subvol's inode table or, failing that, look it up
+ * and link a new inode for it.  Returns NULL on failure. */
+inode_t *
+recon_get_inode (glfs_t *fs, uuid_t gfid)
+{
+        inode_t         *inode;
+        loc_t           loc     = {NULL,};
+        struct iatt     iatt;
+        int             ret;
+
+        inode = inode_find (fs->active_subvol->itable, gfid);
+        if (inode) {
+                printf ("=== FOUND %s IN TABLE\n", uuid_utoa(gfid));
+                return inode;
+        }
+        loc.inode = inode_new (fs->active_subvol->itable);
+        if (!loc.inode) {
+                return NULL;
+        }
+        gf_uuid_copy (loc.inode->gfid, gfid);
+        gf_uuid_copy (loc.gfid, gfid);
+
+        printf ("=== DOING LOOKUP FOR %s\n", uuid_utoa(gfid));
+        ret = syncop_lookup (fs->active_subvol, &loc, &iatt,
+                             NULL, NULL, NULL);
+        if (ret != GFAPI_SUCCESS) {
+                fprintf (stderr, "syncop_lookup failed (%d)\n", ret);
+        } else {
+                inode = inode_link (loc.inode, NULL, NULL, &iatt);
+                if (inode) {
+                        inode_lookup (inode);
+                }
+        }
+        /* Release the inode_new ref on every path; it used to be leaked. */
+        loc_wipe (&loc);
+
+        return inode;
+}
+
+#pragma fragment DICT
+        dict_t  *@ARGNAME@;
+        /* Reads { int key_len; key; int data_len; data } until key_len is 0. */
+        @ARGNAME@ = dict_new();
+        if (!@ARGNAME@) {
+                goto *err_label;
+        }
+        err_label = &&cleanup_@ARGNAME@;
+
+        {
+                int     key_len, data_len;
+                char    *key_ptr;
+                int     garbage;
+                for (;;) {
+                        key_len = *((int *)new_meta);
+                        new_meta += sizeof(int);
+                        if (!key_len) {
+                                break;
+                        }
+                        key_ptr = new_meta;
+                        new_meta += key_len;
+                        data_len = *((int *)new_meta);
+                        new_meta += sizeof(int);
+                        garbage = dict_set_static_bin (@ARGNAME@, key_ptr,
+                                                       new_meta, data_len);
+                        /* TBD: check error from dict_set_static_bin */
+                        (void)garbage;
+                        new_meta += data_len;
+                }
+        }
+
+#pragma fragment DICT_CLEANUP
+cleanup_@ARGNAME@:
+        dict_unref (@ARGNAME@);
+
+#pragma fragment DOUBLE
+        @ARGTYPE@       @ARGNAME@       = *((@ARGTYPE@ *)new_meta);
+        new_meta += sizeof(uint64_t);   /* always steps over a 64-bit slot */
+
+#pragma fragment FD
+        inode_t *@ARGNAME@_ino;
+        fd_t    *@ARGNAME@;
+        /* The journal holds a 16-byte gfid; replay uses an anonymous fd on it. */
+        @ARGNAME@_ino = recon_get_inode (fs, *((uuid_t *)new_meta));
+        new_meta += 16;
+        if (!@ARGNAME@_ino) {
+                goto *err_label;
+        }
+        err_label = &&cleanup_@ARGNAME@_ino;
+
+        @ARGNAME@ = fd_anonymous (@ARGNAME@_ino);
+        if (!@ARGNAME@) {
+                goto *err_label;
+        }
+        err_label = &&cleanup_@ARGNAME@;
+
+#pragma fragment FD_CLEANUP
+cleanup_@ARGNAME@:
+        fd_unref (@ARGNAME@);
+cleanup_@ARGNAME@_ino:          /* entered directly when fd_anonymous failed */
+        inode_unref (@ARGNAME@_ino);
+
+#pragma fragment NEW_FD
+        /*
+         * This pseudo-type is only used for create, and in that case we know
+         * we'll be using loc.inode, so it's not worth generalizing to take an
+         * extra argument.
+         */
+        fd_t    *@ARGNAME@      = fd_anonymous (loc.inode);
+
+        if (!@ARGNAME@) {
+                goto *err_label;
+        }
+        err_label = &&cleanup_@ARGNAME@;
+        new_meta += 16;         /* step over the 16 bytes journaled for the fd */
+
+#pragma fragment NEW_FD_CLEANUP
+cleanup_@ARGNAME@:
+        fd_unref (@ARGNAME@);
+
+#pragma fragment INTEGER
+        @ARGTYPE@       @ARGNAME@       = *((@ARGTYPE@ *)new_meta);
+
+        new_meta += sizeof(@ARGTYPE@);
+
+#pragma fragment LOC
+        loc_t           @ARGNAME@       = { NULL, };
+        /* Resolves by the first gfid; the second (pargfid) is only skipped. */
+        @ARGNAME@.inode = recon_get_inode (fs, *((uuid_t *)new_meta));
+        if (!@ARGNAME@.inode) {
+                goto *err_label;
+        }
+        err_label = &&cleanup_@ARGNAME@;
+        gf_uuid_copy (@ARGNAME@.gfid, @ARGNAME@.inode->gfid);
+        new_meta += 16;
+        new_meta += 16; /* skip over pargfid */
+        if (*(new_meta++)) {    /* flag byte: non-zero means a name follows */
+                @ARGNAME@.name = new_meta;
+                new_meta += strlen(new_meta) + 1;
+        }
+
+#pragma fragment LOC_CLEANUP
+cleanup_@ARGNAME@:
+        loc_wipe (&@ARGNAME@);
+
+#pragma fragment PARENT_LOC
+        loc_t           @ARGNAME@       = { NULL, };
+        /* Resolves the parent gfid, requires a name, and uses a new inode. */
+        new_meta += 16; /* skip over gfid */
+        @ARGNAME@.parent = recon_get_inode (fs, *((uuid_t *)new_meta));
+        if (!@ARGNAME@.parent) {
+                goto *err_label;
+        }
+        err_label = &&cleanup_@ARGNAME@;
+        gf_uuid_copy (@ARGNAME@.pargfid, @ARGNAME@.parent->gfid);
+        new_meta += 16;
+        if (!*(new_meta++)) {   /* flag byte: zero means no name was logged */
+                goto *err_label;
+        }
+        @ARGNAME@.name = new_meta;
+        new_meta += strlen(new_meta) + 1;
+
+        @ARGNAME@.inode = inode_new (fs->active_subvol->itable);
+        if (!@ARGNAME@.inode) {
+                goto *err_label;
+        }
+
+#pragma fragment PARENT_LOC_CLEANUP
+cleanup_@ARGNAME@:
+        loc_wipe (&@ARGNAME@);
+
+#pragma fragment STRING
+        char    *@ARGNAME@;
+        if (*(new_meta++)) {    /* flag byte: non-zero means a string follows */
+                @ARGNAME@ = new_meta;
+                new_meta += (strlen(new_meta) + 1);
+        }
+        else {
+                goto *err_label;
+        }
+
+#pragma fragment VECTOR
+        struct iovec    @ARGNAME@;
+        /* The length is read from the metadata; the bytes come from new_data. */
+        @ARGNAME@.iov_len = *((size_t *)new_meta);
+        new_meta += sizeof(@ARGNAME@.iov_len);
+        @ARGNAME@.iov_base = new_data;
+        new_data += @ARGNAME@.iov_len;
+
+#pragma fragment IATT
+        struct iatt     @ARGNAME@;
+        {       /* ia_prot first, then six 32-bit values in the order below */
+                @ARGNAME@.ia_prot = *((ia_prot_t *)new_meta);
+                new_meta += sizeof(ia_prot_t);
+                uint32_t *myints = (uint32_t *)new_meta;
+                @ARGNAME@.ia_uid = myints[0];
+                @ARGNAME@.ia_gid = myints[1];
+                @ARGNAME@.ia_atime = myints[2];
+                @ARGNAME@.ia_atime_nsec = myints[3];
+                @ARGNAME@.ia_mtime = myints[4];
+                @ARGNAME@.ia_mtime_nsec = myints[5];
+                new_meta += sizeof(*myints) * 6;
+        }
+
+#pragma fragment IOBREF
+        struct iobref   *@ARGNAME@;
+        /* Nothing is read from the journal here; the iobref is newly made. */
+        @ARGNAME@ = iobref_new();
+        if (!@ARGNAME@) {
+                goto *err_label;
+        }
+        err_label = &&cleanup_@ARGNAME@;
+
+#pragma fragment IOBREF_CLEANUP
+cleanup_@ARGNAME@:
+        iobref_unref (@ARGNAME@);
+
+#pragma fragment LINK
+        /* TBD: check error */
+        inode_t *new_inode = inode_link (@INODE_ARG@, NULL, NULL, @IATT_ARG@);
+        if (new_inode) {        /* a NULL result is silently ignored for now */
+                inode_lookup (new_inode);
+        }
+
+#pragma fragment FOP
+/* Replay one journaled @NAME@: decode its arguments, run the syncop, clean up. */
+int
+fdl_replay_@NAME@ (glfs_t *fs, char **old_meta, char **old_data)
+{
+        char    *new_meta       = *old_meta;
+        char    *new_data       = *old_data;
+        int     ret, status     = 0xbad;
+        void    *err_label      = &&done;
+        /* err_label tracks the cleanup label of the last argument set up. */
+@FUNCTION_BODY@
+
+        ret = syncop_@NAME@ (fs->active_subvol, @SYNCOP_ARGS@, NULL);
+        if (ret != @SUCCESS_VALUE@) {
+                fprintf (stderr, "syncop_@NAME@ returned %d\n", ret);
+                goto *err_label;
+        }
+
+@LINKS@
+
+        status = 0;
+        /* Success falls through the same cleanup code the failures jump to. */
+@CLEANUPS@
+
+done:
+        *old_meta = new_meta;
+        *old_data = new_data;
+        return status;
+}
+
+#pragma fragment CASE
+        case GF_FOP_@UPNAME@:
+                printf ("=== GF_FOP_@UPNAME@\n");
+                if (fdl_replay_@NAME@ (fs, &new_meta, &new_data) != 0) {
+                        goto done;      /* replay failed: recognized stays 0 */
+                }
+                recognized = 1;
+                break;
+
+#pragma fragment EPILOG
+int
+recon_execute (glfs_t *fs, char **old_meta, char **old_data)
+{
+        char            *new_meta       = *old_meta;
+        char            *new_data       = *old_data;
+        int             recognized      = 0;
+        event_header_t  *eh;
+        /* Returns 1 only if the fop was recognized and its replay returned 0. */
+        eh = (event_header_t *)new_meta;
+        new_meta += sizeof (*eh);
+
+        /* TBD: check event_type instead of assuming NEW_REQUEST */
+
+        switch (eh->fop_type) {
+@SWITCH_BODY@
+
+        default:
+                printf ("unknown fop %u\n", eh->fop_type);
+        }
+
+done:
+        *old_meta = new_meta;   /* both cursors are handed back to the caller */
+        *old_data = new_data;
+        return recognized;
+}
diff --git a/xlators/experimental/fdl/src/recon.c b/xlators/experimental/fdl/src/recon.c
new file mode 100644
index 00000000000..14168a011e0
--- /dev/null
+++ b/xlators/experimental/fdl/src/recon.c
@@ -0,0 +1,89 @@
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/mman.h>
+
+#include "glusterfs.h"
+#include "fd.h"
+#include "syncop.h"
+#include "glfs-internal.h"
+
+#define GFAPI_SUCCESS 0
+
+extern int recon_execute (glfs_t *, char **, char **);
+
+/* Replay journal files argv[2] (meta) and argv[3] (data) using volfile argv[1]. */
+int
+main (int argc, char **argv)
+{
+        glfs_t  *fs;
+        int     ret, meta_fd, data_fd;
+        char    *meta_buf, *data_buf;
+
+        if (argc < 4) {
+                fprintf (stderr, "Expected: <volfile> <meta-file> <data-file>\n");
+                return EXIT_FAILURE;
+        }
+
+        fs = glfs_new ("whocares");
+        if (!fs) {
+                fprintf (stderr, "glfs_new failed\n");
+                return EXIT_FAILURE;
+        }
+
+        if (getenv("RECON_DEBUG")) {
+                ret = glfs_set_logging (fs, "/dev/stderr", 7);
+        } else {
+                ret = glfs_set_logging (fs, "/dev/null", 0);
+        }
+
+        if (ret != GFAPI_SUCCESS) {
+                fprintf (stderr, "glfs_set_logging failed (%d)\n", errno);
+                return EXIT_FAILURE;
+        }
+
+        ret = glfs_set_volfile (fs, argv[1]);
+        if (ret != GFAPI_SUCCESS) {
+                fprintf (stderr, "glfs_set_volfile failed (%d)\n", errno);
+                return EXIT_FAILURE;
+        }
+
+        ret = glfs_init (fs);
+        if (ret != GFAPI_SUCCESS) {
+                fprintf (stderr, "glfs_init failed (%d)\n", errno);
+                return EXIT_FAILURE;
+        }
+
+        meta_fd = open (argv[2], O_RDONLY);
+        if (meta_fd < 0) {
+                perror ("open");
+                return EXIT_FAILURE;
+        }
+
+        /* TBD: get proper length */
+        meta_buf = mmap (NULL, 1048576, PROT_READ, MAP_PRIVATE, meta_fd, 0);
+        if (meta_buf == MAP_FAILED) {
+                perror ("mmap");
+                return EXIT_FAILURE;
+        }
+
+        data_fd = open (argv[3], O_RDONLY);
+        if (data_fd < 0) {
+                perror ("open");
+                return EXIT_FAILURE;
+        }
+
+        /* TBD: get proper length */
+        data_buf = mmap (NULL, 1048576, PROT_READ, MAP_PRIVATE, data_fd, 0);
+        if (data_buf == MAP_FAILED) {
+                perror ("mmap");
+                return EXIT_FAILURE;
+        }
+
+        while (recon_execute (fs, &meta_buf, &data_buf)) {
+                continue;
+        }
+
+        return EXIT_SUCCESS;
+}
-- 
cgit