summaryrefslogtreecommitdiffstats
path: root/xlators/experimental
diff options
context:
space:
mode:
authorJeff Darcy <jdarcy@redhat.com>2016-02-08 13:30:49 -0500
committerJeff Darcy <jdarcy@redhat.com>2016-02-13 05:13:07 -0800
commitc458433041aafb48ae6d6e5fcf3e1e737dc3fda3 (patch)
tree33a03ca0c1f5faf58419de2c4ff4532752ddfb07 /xlators/experimental
parentda33097c3d6492e3b468b4347e47c70828fb4320 (diff)
experimental: add fdl (Full Data Logging) translator
NSR needs logging that is different than our existing changelog in several ways: * Full data, not just metadata * Pre-op, not post-op * High performance * Supports the concept of time-bounded "terms" Others (for example EC) might need the same thing. This patch adds such a translator. It also adds code to dump the resulting journals, and to replay them using syncops, plus (very rudimentary) tests for all of the above. Change-Id: I29680a1b4e0a9e7d5a8497fef302c46434b86636 Signed-off-by: Jeff Darcy <jdarcy@redhat.com> Reviewed-on: http://review.gluster.org/12450 Smoke: Gluster Build System <jenkins@build.gluster.com> CentOS-regression: Gluster Build System <jenkins@build.gluster.com> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
Diffstat (limited to 'xlators/experimental')
-rw-r--r--xlators/experimental/Makefile.am2
-rw-r--r--xlators/experimental/fdl/Makefile.am3
-rw-r--r--xlators/experimental/fdl/src/Makefile.am42
-rw-r--r--xlators/experimental/fdl/src/dump-tmpl.c156
-rw-r--r--xlators/experimental/fdl/src/fdl-tmpl.c506
-rwxr-xr-xxlators/experimental/fdl/src/gen_dumper.py116
-rwxr-xr-xxlators/experimental/fdl/src/gen_fdl.py328
-rwxr-xr-xxlators/experimental/fdl/src/gen_recon.py191
-rw-r--r--xlators/experimental/fdl/src/jnl-types.h14
-rw-r--r--xlators/experimental/fdl/src/logdump.c50
-rw-r--r--xlators/experimental/fdl/src/recon-tmpl.c305
-rw-r--r--xlators/experimental/fdl/src/recon.c89
12 files changed, 1801 insertions, 1 deletions
diff --git a/xlators/experimental/Makefile.am b/xlators/experimental/Makefile.am
index 06f04a193c8..a31512203f6 100644
--- a/xlators/experimental/Makefile.am
+++ b/xlators/experimental/Makefile.am
@@ -1,3 +1,3 @@
-SUBDIRS = nsr-client nsr-server
+SUBDIRS = nsr-client nsr-server fdl
CLEANFILES =
diff --git a/xlators/experimental/fdl/Makefile.am b/xlators/experimental/fdl/Makefile.am
new file mode 100644
index 00000000000..a985f42a877
--- /dev/null
+++ b/xlators/experimental/fdl/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/experimental/fdl/src/Makefile.am b/xlators/experimental/fdl/src/Makefile.am
new file mode 100644
index 00000000000..a05fc797b0a
--- /dev/null
+++ b/xlators/experimental/fdl/src/Makefile.am
@@ -0,0 +1,42 @@
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/experimental
+xlator_LTLIBRARIES = fdl.la
+
+noinst_HEADERS = jnl-types.h
+
+nodist_fdl_la_SOURCES = fdl.c
+fdl_la_LDFLAGS = -module -avoid-version
+fdl_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+sbin_PROGRAMS = gf_logdump gf_recon
+gf_logdump_SOURCES = logdump.c
+nodist_gf_logdump_SOURCES = libfdl.c
+gf_logdump_LDADD = $(top_builddir)/libglusterfs/src/libglusterfs.la\
+ $(top_builddir)/api/src/libgfapi.la
+
+# Eventually recon(ciliation) code will move elsewhere, but for now it's
+# easier to have it next to the similar logdump code.
+gf_recon_SOURCES = recon.c
+nodist_gf_recon_SOURCES = librecon.c
+gf_recon_LDADD = $(top_builddir)/libglusterfs/src/libglusterfs.la\
+ $(top_builddir)/api/src/libgfapi.la
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+ -I$(top_srcdir)/api/src -fPIC \
+ -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -D$(GF_HOST_OS) \
+ -DDATADIR=\"$(localstatedir)\"
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+noinst_PYTHON = gen_fdl.py gen_dumper.py gen_recon.py
+EXTRA_DIST = fdl-tmpl.c dump-tmpl.c recon-tmpl.c
+
+# Every generated (nodist) source must be listed here, otherwise "make clean"
+# and "make distcheck" leave the generated file behind.
+CLEANFILES = $(nodist_fdl_la_SOURCES) $(nodist_gf_logdump_SOURCES) \
+ $(nodist_gf_recon_SOURCES)
+
+fdl.c: fdl-tmpl.c gen_fdl.py
+ $(PYTHON) $(srcdir)/gen_fdl.py $(srcdir)/fdl-tmpl.c > $@
+
+libfdl.c: dump-tmpl.c gen_dumper.py
+ $(PYTHON) $(srcdir)/gen_dumper.py $(srcdir)/dump-tmpl.c > $@
+
+librecon.c: recon-tmpl.c gen_recon.py
+ $(PYTHON) $(srcdir)/gen_recon.py $(srcdir)/recon-tmpl.c > $@
diff --git a/xlators/experimental/fdl/src/dump-tmpl.c b/xlators/experimental/fdl/src/dump-tmpl.c
new file mode 100644
index 00000000000..cac1071a9c1
--- /dev/null
+++ b/xlators/experimental/fdl/src/dump-tmpl.c
@@ -0,0 +1,156 @@
+#pragma fragment PROLOG
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glfs.h"
+#include "iatt.h"
+#include "xlator.h"
+#include "jnl-types.h"
+
+#pragma fragment DICT
+ {
+ int key_len, data_len;
+ char *key_ptr;
+ printf ("@ARGNAME@ = dict {\n");
+ for (;;) {
+ key_len = *((int *)new_meta);
+ new_meta += sizeof(int);
+ if (!key_len) {
+ break;
+ }
+ key_ptr = new_meta;
+ new_meta += key_len;
+ data_len = *((int *)new_meta);
+ new_meta += sizeof(int) + data_len;
+ printf (" %s = <%d bytes>\n", key_ptr, data_len);
+ }
+ printf ("}\n");
+ }
+
+#pragma fragment DOUBLE
+ printf ("@ARGNAME@ = @FORMAT@\n", *((uint64_t *)new_meta),
+ *((uint64_t *)new_meta));
+ new_meta += sizeof(uint64_t);
+
+#pragma fragment GFID
+ printf ("@ARGNAME@ = <gfid %s>\n", uuid_utoa(*((uuid_t *)new_meta)));
+ new_meta += 16;
+
+#pragma fragment INTEGER
+ printf ("@ARGNAME@ = @FORMAT@\n", *((uint32_t *)new_meta),
+ *((uint32_t *)new_meta));
+ new_meta += sizeof(uint32_t);
+
+#pragma fragment LOC
+ printf ("@ARGNAME@ = loc {\n");
+ printf (" gfid = %s\n", uuid_utoa(*((uuid_t *)new_meta)));
+ new_meta += 16;
+ printf (" pargfid = %s\n", uuid_utoa(*((uuid_t *)new_meta)));
+ new_meta += 16;
+ if (*(new_meta++)) {
+ printf (" name = %s\n", new_meta);
+ new_meta += (strlen(new_meta) + 1);
+ }
+ printf ("}\n");
+
+#pragma fragment STRING
+ if (*(new_meta++)) {
+ printf ("@ARGNAME@ = %s\n", new_meta);
+ new_meta += (strlen(new_meta) + 1);
+ }
+
+#pragma fragment VECTOR
+ {
+ size_t len = *((size_t *)new_meta);
+ new_meta += sizeof(len);
+ printf ("@ARGNAME@ = <%zu bytes>\n", len);
+ new_data += len;
+ }
+
+#pragma fragment IATT
+ {
+ ia_prot_t *myprot = ((ia_prot_t *)new_meta);
+ printf ("@ARGNAME@ = iatt {\n");
+ printf (" ia_prot = %c%c%c",
+ myprot->suid ? 'S' : '-',
+ myprot->sgid ? 'S' : '-',
+ myprot->sticky ? 'T' : '-');
+ printf ("%c%c%c",
+ myprot->owner.read ? 'r' : '-',
+ myprot->owner.write ? 'w' : '-',
+ myprot->owner.exec ? 'x' : '-');
+ printf ("%c%c%c",
+ myprot->group.read ? 'r' : '-',
+ myprot->group.write ? 'w' : '-',
+ myprot->group.exec ? 'x' : '-');
+ printf ("%c%c%c\n",
+ myprot->other.read ? 'r' : '-',
+ myprot->other.write ? 'w' : '-',
+ myprot->other.exec ? 'x' : '-');
+ new_meta += sizeof(ia_prot_t);
+ uint32_t *myints = (uint32_t *)new_meta;
+ printf (" ia_uid = %u\n", myints[0]);
+ printf (" ia_gid = %u\n", myints[1]);
+ printf (" ia_atime = %u.%09u\n", myints[2], myints[3]);
+ printf (" ia_mtime = %u.%09u\n", myints[4], myints[5]);
+ new_meta += sizeof(*myints) * 6;
+ }
+
+#pragma fragment FOP
+void
+fdl_dump_@NAME@ (char **old_meta, char **old_data)
+{
+ char *new_meta = *old_meta;
+ char *new_data = *old_data;
+
+ /* TBD: word size/endianness */
+@FUNCTION_BODY@
+
+ *old_meta = new_meta;
+ *old_data = new_data;
+}
+
+#pragma fragment CASE
+ case GF_FOP_@UPNAME@:
+ printf ("=== GF_FOP_@UPNAME@\n");
+ fdl_dump_@NAME@ (&new_meta, &new_data);
+ break;
+
+#pragma fragment EPILOG
+/*
+ * Print one journal event in human-readable form.
+ *
+ * *old_meta points at the event_header_t of the event to decode and
+ * *old_data at the current position in the data journal.  The header's
+ * fop_type selects one of the generated fdl_dump_<fop> functions (the
+ * cases substituted for SWITCH_BODY below), which consume that fop's
+ * arguments.  Both cursors are written back, advanced past whatever was
+ * consumed.
+ *
+ * Returns 1 if the fop type was recognized, 0 otherwise.  In the latter
+ * case only the header has been skipped.
+ */
+int
+fdl_dump (char **old_meta, char **old_data)
+{
+ char *new_meta = *old_meta;
+ char *new_data = *old_data;
+ static glfs_t *fs = NULL;
+ int recognized = 1;
+ event_header_t *eh;
+
+ /*
+ * We don't really call anything else in GFAPI, but this is the most
+ * convenient way to satisfy all of the spurious dependencies on how it
+ * or glusterfsd initialize (e.g. setting up THIS).
+ */
+ if (!fs) {
+ fs = glfs_new ("dummy");
+ }
+
+ /* The header is read in place; no copy or byte-order conversion. */
+ eh = (event_header_t *)new_meta;
+ new_meta += sizeof (*eh);
+
+ /* TBD: check event_type instead of assuming NEW_REQUEST */
+
+ switch (eh->fop_type) {
+@SWITCH_BODY@
+
+ default:
+ printf ("unknown fop %u\n", eh->fop_type);
+ recognized = 0;
+ }
+
+ *old_meta = new_meta;
+ *old_data = new_data;
+ return recognized;
+}
diff --git a/xlators/experimental/fdl/src/fdl-tmpl.c b/xlators/experimental/fdl/src/fdl-tmpl.c
new file mode 100644
index 00000000000..8fcc6a8d6ff
--- /dev/null
+++ b/xlators/experimental/fdl/src/fdl-tmpl.c
@@ -0,0 +1,506 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include "call-stub.h"
+#include "iatt.h"
+#include "defaults.h"
+#include "syscall.h"
+#include "xlator.h"
+#include "jnl-types.h"
+
+/* TBD: make tunable */
+#define META_FILE_SIZE (1 << 20)
+#define DATA_FILE_SIZE (1 << 24)
+
+enum gf_fdl {
+ gf_fdl_mt_fdl_private_t = gf_common_mt_end + 1,
+ gf_fdl_mt_end
+};
+
+/* State for one journal file (meta or data) of the current term. */
+typedef struct {
+ char *type; /* "meta" or "data"; becomes part of the file name */
+ off_t size; /* preallocated and mmap'ed length of the file */
+ char *path; /* allocated file name, NULL while closed */
+ int fd; /* open descriptor, -1 while closed */
+ void * ptr; /* base of the mapping, NULL while unmapped */
+ off_t max_offset; /* bytes journaled so far; file is truncated to this on close */
+} log_obj_t;
+
+/* Per-translator state, shared between fop threads and the worker thread. */
+typedef struct {
+ struct list_head reqs; /* queue of call stubs waiting to be journaled */
+ pthread_mutex_t req_lock; /* held around accesses to reqs */
+ pthread_cond_t req_cond; /* signalled on enqueue, stop and term change */
+ char *log_dir; /* value of the "log-path" option */
+ pthread_t worker; /* thread running fdl_worker */
+ gf_boolean_t should_stop; /* set by fdl_fini to make the worker exit */
+ gf_boolean_t change_term; /* set by FDL_IPC_CHANGE_TERM, cleared by the worker */
+ log_obj_t meta_log;
+ log_obj_t data_log;
+ int term; /* current term number; part of the journal file names */
+ int first_term; /* term number in effect when the worker started */
+} fdl_private_t;
+
+/*
+ * Hand a call stub to the journal worker: append it to the request queue
+ * under req_lock, then wake the worker.
+ */
+void
+fdl_enqueue (xlator_t *this, call_stub_t *stub)
+{
+        fdl_private_t   *conf = this->private;
+
+        pthread_mutex_lock (&conf->req_lock);
+        {
+                list_add_tail (&stub->list, &conf->reqs);
+        }
+        pthread_mutex_unlock (&conf->req_lock);
+
+        /* Signalled after dropping the lock, as the worker re-tests the list. */
+        pthread_cond_signal (&conf->req_cond);
+}
+
+#pragma generate
+
+/*
+ * Create, size and map the journal file of one log object for a term.
+ *
+ * The file is <log_dir>/<ident>-<type>-<term>.jnl, where <ident> is the
+ * process's log_ident if one is set and "fubar" otherwise.  It is created
+ * (or truncated), extended to obj->size and mapped shared for writing.
+ *
+ * On success obj->path, obj->fd and obj->ptr are filled in, obj->max_offset
+ * is reset to zero and the mapping address is returned.  On any failure
+ * the descriptor and path are released and NULL is returned.
+ */
+char *
+fdl_open_term_log (xlator_t *this, log_obj_t *obj, int term)
+{
+        fdl_private_t   *priv   = this->private;
+        int             ret;
+        char *          ptr     = NULL;
+
+        /*
+         * Use .jnl instead of .log so that we don't get test info (mistakenly)
+         * appended to our journal files.
+         */
+        if (this->ctx->cmd_args.log_ident) {
+                ret = gf_asprintf (&obj->path, "%s/%s-%s-%d.jnl",
+                                   priv->log_dir, this->ctx->cmd_args.log_ident,
+                                   obj->type, term);
+        }
+        else {
+                ret = gf_asprintf (&obj->path, "%s/fubar-%s-%d.jnl",
+                                   priv->log_dir, obj->type, term);
+        }
+        if ((ret <= 0) || !obj->path) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to construct log-file path");
+                goto err;
+        }
+
+        gf_log (this->name, GF_LOG_INFO, "opening %s (size %ld)",
+                obj->path, obj->size);
+
+        obj->fd = open (obj->path, O_RDWR|O_CREAT|O_TRUNC, 0666);
+        if (obj->fd < 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to open log file (%s)", strerror(errno));
+                goto err;
+        }
+
+#if !defined(GF_BSD_HOST_OS)
+        /*
+         * NetBSD can just go die in a fire.  Even though it claims to support
+         * fallocate/posix_fallocate they don't actually *do* anything so the
+         * file size remains zero.  Then mmap succeeds anyway, but any access
+         * to the mmap'ed region will segfault.  It would be acceptable for
+         * fallocate to do what it says, for mmap to fail, or for access to
+         * extend the file.  NetBSD managed to hit the trifecta of Getting
+         * Everything Wrong, and debugging in that environment to get this far
+         * has already been painful enough (systems I worked on in 1990 were
+         * better that way).  We'll fall through to the lseek/write method, and
+         * performance will be worse, and TOO BAD.
+         */
+        if (sys_fallocate(obj->fd,0,0,obj->size) < 0)
+#endif
+        {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "failed to fallocate space for log file");
+                /* Have to do this the ugly page-faulty way. */
+                (void) sys_lseek (obj->fd, obj->size-1, SEEK_SET);
+                (void) sys_write (obj->fd, "", 1);
+        }
+
+        ptr = mmap (NULL, obj->size, PROT_WRITE, MAP_SHARED, obj->fd, 0);
+        if (ptr == MAP_FAILED) {
+                gf_log (this->name, GF_LOG_ERROR, "failed to mmap log (%s)",
+                        strerror(errno));
+                goto err;
+        }
+
+        obj->ptr = ptr;
+        obj->max_offset = 0;
+        return ptr;
+
+err:
+        if (obj->fd >= 0) {
+                sys_close (obj->fd);
+                obj->fd = (-1);
+        }
+        if (obj->path) {
+                GF_FREE (obj->path);
+                obj->path = NULL;
+        }
+        /*
+         * Always report failure as NULL.  After a failed mmap "ptr" holds
+         * MAP_FAILED, which is not NULL; callers only test for NULL and
+         * would otherwise go on to write through an invalid address.
+         */
+        return NULL;
+}
+
+/*
+ * Release everything fdl_open_term_log set up for one log object: the
+ * mapping, the descriptor (after truncating the file to the number of bytes
+ * actually journaled) and the path string.  Each piece is skipped if it is
+ * not currently held, so this may be called on a closed object.
+ */
+void
+fdl_close_term_log (xlator_t *this, log_obj_t *obj)
+{
+        fdl_private_t   *conf = this->private;
+        int             rc;
+
+        if (obj->ptr != NULL) {
+                (void) munmap (obj->ptr, obj->size);
+                obj->ptr = NULL;
+        }
+
+        if (obj->fd >= 0) {
+                gf_log (this->name, GF_LOG_INFO,
+                        "truncating term %d %s journal to %ld",
+                        conf->term, obj->type, obj->max_offset);
+                /* Cut the file back from its preallocated size. */
+                rc = sys_ftruncate (obj->fd, obj->max_offset);
+                if (rc < 0) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "failed to truncate journal (%s)",
+                                strerror(errno));
+                }
+                sys_close (obj->fd);
+                obj->fd = (-1);
+        }
+
+        if (obj->path != NULL) {
+                GF_FREE (obj->path);
+                obj->path = NULL;
+        }
+}
+
+/*
+ * Switch to the next term: close both journals of the current term, bump
+ * the term number and open a fresh meta/data pair.
+ *
+ * *meta_ptr and *data_ptr receive the new mapping addresses.  Returns
+ * _gf_true if both journals were opened.  If the meta journal cannot be
+ * opened, the data journal is not attempted and *data_ptr is left alone.
+ */
+gf_boolean_t
+fdl_change_term (xlator_t *this, char **meta_ptr, char **data_ptr)
+{
+        fdl_private_t   *conf = this->private;
+
+        fdl_close_term_log (this, &conf->meta_log);
+        fdl_close_term_log (this, &conf->data_log);
+
+        conf->term += 1;
+
+        *meta_ptr = fdl_open_term_log (this, &conf->meta_log, conf->term);
+        if (*meta_ptr == NULL) {
+                return _gf_false;
+        }
+
+        *data_ptr = fdl_open_term_log (this, &conf->data_log, conf->term);
+        return (*data_ptr != NULL) ? _gf_true : _gf_false;
+}
+
+/*
+ * Journal worker thread (arg is the xlator).
+ *
+ * Opens the meta and data journals for the first term, then loops: take one
+ * queued call stub, serialize it into the mapped journals, msync the bytes
+ * just written, and resume the stub so the fop continues down the stack.
+ * When a record would not fit in either journal, a new term is started
+ * first.
+ *
+ * While the queue is empty the worker also honours two flags in the private
+ * structure: should_stop (exit) and change_term (start a new term).  Both
+ * are tested *before* sleeping on the condition variable, with req_lock
+ * held, so a flag raised while the worker was busy is not missed.  Requests
+ * already queued are journaled before should_stop is acted upon.
+ *
+ * err_label tracks whether req_lock is held, so that every exit path can
+ * jump to the matching cleanup label.  Returns NULL.
+ */
+void *
+fdl_worker (void *arg)
+{
+        xlator_t        *this = arg;
+        fdl_private_t   *priv = this->private;
+        call_stub_t     *stub;
+        char *          meta_ptr = NULL;
+        off_t           *meta_offset = &priv->meta_log.max_offset;
+        char *          data_ptr = NULL;
+        off_t           *data_offset = &priv->data_log.max_offset;
+        unsigned long   base_as_ul;
+        void *          msync_ptr;
+        size_t          msync_len;
+        gf_boolean_t    recycle;
+        void            *err_label = &&err_unlocked;
+
+        priv->meta_log.type = "meta";
+        priv->meta_log.size = META_FILE_SIZE;
+        priv->meta_log.path = NULL;
+        priv->meta_log.fd = (-1);
+        priv->meta_log.ptr = NULL;
+
+        priv->data_log.type = "data";
+        priv->data_log.size = DATA_FILE_SIZE;
+        priv->data_log.path = NULL;
+        priv->data_log.fd = (-1);
+        priv->data_log.ptr = NULL;
+
+        /* TBD: initial term should come from persistent storage (e.g. etcd) */
+        priv->first_term = ++(priv->term);
+        meta_ptr = fdl_open_term_log (this, &priv->meta_log, priv->term);
+        if (!meta_ptr) {
+                goto *err_label;
+        }
+        data_ptr = fdl_open_term_log (this, &priv->data_log, priv->term);
+        if (!data_ptr) {
+                fdl_close_term_log (this, &priv->meta_log);
+                goto *err_label;
+        }
+
+        for (;;) {
+                pthread_mutex_lock (&priv->req_lock);
+                err_label = &&err_locked;
+                while (list_empty(&priv->reqs)) {
+                        /*
+                         * Look at the flags before going to sleep.  They
+                         * may have been set (and the condition signalled)
+                         * while we were busy journaling, in which case
+                         * waiting first would sleep through the request.
+                         */
+                        if (priv->should_stop) {
+                                goto *err_label;
+                        }
+                        if (priv->change_term) {
+                                if (!fdl_change_term(this, &meta_ptr,
+                                                     &data_ptr)) {
+                                        goto *err_label;
+                                }
+                                priv->change_term = _gf_false;
+                                continue;
+                        }
+                        pthread_cond_wait (&priv->req_cond, &priv->req_lock);
+                }
+                stub = list_entry (priv->reqs.next, call_stub_t, list);
+                list_del_init (&stub->list);
+                pthread_mutex_unlock (&priv->req_lock);
+                err_label = &&err_unlocked;
+                /*
+                 * TBD: batch requests
+                 *
+                 * What we should do here is gather up *all* of the requests
+                 * that have accumulated since we were last at this point,
+                 * blast them all out in one big writev, and then dispatch them
+                 * all before coming back for more.  That maximizes throughput,
+                 * at some cost to latency (due to queuing effects at the log
+                 * stage).  Note that we're likely to be above io-threads, so
+                 * the dispatch itself will be parallelized (at further cost to
+                 * latency).  For now, we just do the simplest thing and handle
+                 * one request all the way through before fetching the next.
+                 *
+                 * So, why mmap/msync instead of writev/fdatasync?  Because it's
+                 * faster.  Much faster.  So much faster that I half-suspect
+                 * cheating, but it's more convenient for now than having to
+                 * ensure that everything's page-aligned for O_DIRECT (the only
+                 * alternative that still might avoid ridiculous levels of
+                 * local-FS overhead).
+                 *
+                 * TBD: check that msync really does get our data to disk.
+                 */
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "logging %u+%u bytes for op %d",
+                        stub->jnl_meta_len, stub->jnl_data_len, stub->fop);
+                /* Start a new term if this record would overflow a journal. */
+                recycle = _gf_false;
+                if ((*meta_offset + stub->jnl_meta_len) > priv->meta_log.size) {
+                        recycle = _gf_true;
+                }
+                if ((*data_offset + stub->jnl_data_len) > priv->data_log.size) {
+                        recycle = _gf_true;
+                }
+                if (recycle && !fdl_change_term(this,&meta_ptr,&data_ptr)) {
+                        goto *err_label;
+                }
+                meta_ptr = priv->meta_log.ptr;
+                data_ptr = priv->data_log.ptr;
+                gf_log (this->name, GF_LOG_DEBUG, "serializing to %p/%p",
+                        meta_ptr + *meta_offset, data_ptr + *data_offset);
+                stub->serialize (stub, meta_ptr + *meta_offset,
+                                 data_ptr + *data_offset);
+                if (stub->jnl_meta_len > 0) {
+                        /*
+                         * msync wants a page-aligned start: round the address
+                         * down with a 0x0fff mask (i.e. assuming 4KB pages)
+                         * and extend the length by the amount we backed up.
+                         */
+                        base_as_ul = (unsigned long) (meta_ptr + *meta_offset);
+                        msync_ptr = (void *) (base_as_ul & ~0x0fff);
+                        msync_len = (size_t) (base_as_ul & 0x0fff);
+                        if (msync (msync_ptr, msync_len+stub->jnl_meta_len,
+                                   MS_SYNC) < 0) {
+                                gf_log (this->name, GF_LOG_WARNING,
+                                        "failed to log request meta (%s)",
+                                        strerror(errno));
+                        }
+                        *meta_offset += stub->jnl_meta_len;
+                }
+                if (stub->jnl_data_len > 0) {
+                        /* Same page rounding as for the meta journal. */
+                        base_as_ul = (unsigned long) (data_ptr + *data_offset);
+                        msync_ptr = (void *) (base_as_ul & ~0x0fff);
+                        msync_len = (size_t) (base_as_ul & 0x0fff);
+                        if (msync (msync_ptr, msync_len+stub->jnl_data_len,
+                                   MS_SYNC) < 0) {
+                                gf_log (this->name, GF_LOG_WARNING,
+                                        "failed to log request data (%s)",
+                                        strerror(errno));
+                        }
+                        *data_offset += stub->jnl_data_len;
+                }
+                call_resume (stub);
+        }
+
+err_locked:
+        pthread_mutex_unlock (&priv->req_lock);
+err_unlocked:
+        fdl_close_term_log (this, &priv->meta_log);
+        fdl_close_term_log (this, &priv->data_log);
+        return NULL;
+}
+
+/*
+ * IPC fop handler.
+ *
+ * FDL_IPC_CHANGE_TERM: flag a term change for the worker thread, wake it,
+ * and unwind with success straight away (the change itself happens
+ * asynchronously in the worker).
+ *
+ * FDL_IPC_GET_TERMS: unwind with a dict holding the int32 keys "first" and
+ * "last" (first and current term numbers), or with -1/ENOMEM or -1/EIO if
+ * the dict cannot be built.
+ *
+ * Any other op is passed to the first child unchanged.  Always returns 0.
+ */
+int32_t
+fdl_ipc (call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata)
+{
+        fdl_private_t   *priv   = this->private;
+        dict_t          *tdict  = NULL;
+        int32_t         gt_err  = EIO;
+
+        switch (op) {
+
+        case FDL_IPC_CHANGE_TERM:
+                gf_log (this->name, GF_LOG_INFO, "got CHANGE_TERM op");
+                /*
+                 * The worker examines change_term with req_lock held, so
+                 * set it under the same lock before signalling.
+                 */
+                pthread_mutex_lock (&priv->req_lock);
+                priv->change_term = _gf_true;
+                pthread_mutex_unlock (&priv->req_lock);
+                pthread_cond_signal (&priv->req_cond);
+                STACK_UNWIND_STRICT (ipc, frame, 0, 0, NULL);
+                break;
+
+        case FDL_IPC_GET_TERMS:
+                gf_log (this->name, GF_LOG_INFO, "got GET_TERMS op");
+                tdict = dict_new ();
+                if (!tdict) {
+                        gt_err = ENOMEM;
+                        goto gt_done;
+                }
+                if (dict_set_int32(tdict,"first",priv->first_term) != 0) {
+                        goto gt_done;
+                }
+                if (dict_set_int32(tdict,"last",priv->term) != 0) {
+                        goto gt_done;
+                }
+                gt_err = 0;
+        gt_done:
+                if (gt_err) {
+                        STACK_UNWIND_STRICT (ipc, frame, -1, gt_err, NULL);
+                } else {
+                        STACK_UNWIND_STRICT (ipc, frame, 0, 0, tdict);
+                }
+                /* Drop our reference whether or not the dict was sent. */
+                if (tdict) {
+                        dict_unref (tdict);
+                }
+                break;
+
+        default:
+                STACK_WIND_TAIL (frame,
+                                 FIRST_CHILD(this),
+                                 FIRST_CHILD(this)->fops->ipc,
+                                 op, xdata);
+        }
+
+        return 0;
+}
+
+/*
+ * Translator init: allocate the private state, set up the request queue and
+ * its mutex/condition variable, read the "log-path" option, start the
+ * journal worker thread and install the hand-written ipc fop.
+ *
+ * Returns 0 on success.  On failure returns -1 with everything that was set
+ * up torn down again and this->private left NULL.
+ */
+int
+fdl_init (xlator_t *this)
+{
+        fdl_private_t   *priv           = NULL;
+        gf_boolean_t    lock_ready      = _gf_false;
+        gf_boolean_t    cond_ready      = _gf_false;
+
+        priv = GF_CALLOC (1, sizeof (*priv), gf_fdl_mt_fdl_private_t);
+        if (!priv) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to allocate fdl_private");
+                goto err;
+        }
+
+        INIT_LIST_HEAD (&priv->reqs);
+        if (pthread_mutex_init (&priv->req_lock, NULL) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to initialize req_lock");
+                goto err;
+        }
+        lock_ready = _gf_true;
+        if (pthread_cond_init (&priv->req_cond, NULL) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to initialize req_cond");
+                goto err;
+        }
+        cond_ready = _gf_true;
+
+        GF_OPTION_INIT ("log-path", priv->log_dir, path, err);
+
+        /*
+         * The worker reads this->private as its very first action, so the
+         * pointer must be published before the thread is created.
+         */
+        this->private = priv;
+        if (pthread_create(&priv->worker,NULL,fdl_worker,this) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to start fdl_worker");
+                this->private = NULL;
+                goto err;
+        }
+
+        /*
+         * The rest of the fop table is automatically generated, so this is a
+         * bit cleaner than messing with the generation to add a hand-written
+         * exception.
+         */
+        this->fops->ipc = fdl_ipc;
+
+        return 0;
+
+err:
+        if (priv) {
+                /* Only destroy what was successfully initialized. */
+                if (cond_ready) {
+                        (void) pthread_cond_destroy (&priv->req_cond);
+                }
+                if (lock_ready) {
+                        (void) pthread_mutex_destroy (&priv->req_lock);
+                }
+                GF_FREE(priv);
+        }
+        return -1;
+}
+
+/*
+ * Translator teardown: ask the worker thread to stop, wait for it to exit,
+ * then release the synchronization objects and the private state.  Does
+ * nothing if init never completed (this->private is NULL).
+ */
+void
+fdl_fini (xlator_t *this)
+{
+        fdl_private_t   *priv = this->private;
+
+        if (!priv) {
+                return;
+        }
+
+        /*
+         * The worker examines should_stop with req_lock held, so set it
+         * under the same lock before signalling.
+         */
+        pthread_mutex_lock (&priv->req_lock);
+        priv->should_stop = _gf_true;
+        pthread_mutex_unlock (&priv->req_lock);
+        pthread_cond_signal (&priv->req_cond);
+        pthread_join (priv->worker, NULL);
+
+        /* The worker has exited, so nothing else uses these any more. */
+        (void) pthread_cond_destroy (&priv->req_cond);
+        (void) pthread_mutex_destroy (&priv->req_lock);
+
+        /* Don't leave a dangling pointer behind. */
+        this->private = NULL;
+        GF_FREE(priv);
+}
+
+/*
+ * Pick up a changed journal directory on reconfigure.
+ *
+ * The option must be looked up under the key it is declared with in the
+ * options table ("log-path"); no option named "log_dir" exists.  Only the
+ * stored directory is updated; journals that are already open are not
+ * moved.  Always returns 0.
+ */
+int
+fdl_reconfigure (xlator_t *this, dict_t *options)
+{
+        fdl_private_t   *priv = this->private;
+
+        GF_OPTION_RECONF ("log-path", priv->log_dir, options, path, out);
+        /* TBD: react if it changed */
+
+out:
+        return 0;
+}
+
+/*
+ * Register this translator's memory-accounting types.
+ *
+ * Returns the result of xlator_mem_acct_init (0 on success), or -1 if
+ * "this" is NULL.
+ */
+int32_t
+mem_acct_init (xlator_t *this)
+{
+        int     ret = -1;
+
+        GF_VALIDATE_OR_GOTO ("fdl", this, out);
+
+        ret = xlator_mem_acct_init (this, gf_fdl_mt_end + 1);
+
+        if (ret != 0) {
+                /* One literal: split literals here lost the space. */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Memory accounting init failed");
+        }
+out:
+        return ret;
+}
+
+class_methods_t class_methods = {
+ .init = fdl_init,
+ .fini = fdl_fini,
+ .reconfigure = fdl_reconfigure,
+ .notify = default_notify,
+};
+
+struct volume_options options[] = {
+ { .key = {"log-path"},
+ .type = GF_OPTION_TYPE_PATH,
+ .default_value = DEFAULT_LOG_FILE_DIRECTORY,
+ .description = "Directory for FDL files."
+ },
+ { .key = {NULL} },
+};
+
+struct xlator_cbks cbks = {
+ .release = default_release,
+ .releasedir = default_releasedir,
+ .forget = default_forget,
+};
diff --git a/xlators/experimental/fdl/src/gen_dumper.py b/xlators/experimental/fdl/src/gen_dumper.py
new file mode 100755
index 00000000000..42db55d2cb3
--- /dev/null
+++ b/xlators/experimental/fdl/src/gen_dumper.py
@@ -0,0 +1,116 @@
+#!/usr/bin/python
+
+import os
+import re
+import sys
+
+curdir = os.path.dirname (sys.argv[0])
+gendir = os.path.join (curdir, '../../../../libglusterfs/src')
+sys.path.append (gendir)
+from generator import ops, fop_subs, cbk_subs, generate
+
+# See the big header comment at the start of gen_fdl.py to see how the stages
+# fit together. The big difference here is that *all* of the C code is in the
+# template file as labelled fragments, instead of as Python strings. That
+# makes it much easier to edit in one place, with proper syntax highlighting
+# and indentation.
+#
+# Stage 1 uses type-specific fragments to generate FUNCTION_BODY, instead of
+# LEN_*_TEMPLATE and SERLZ_*_TEMPLATE to generate LEN_CODE and SER_CODE.
+#
+# Stage 2 uses the FOP and CASE fragments instead of RECON_TEMPLATE and
+# FOP_TEMPLATE. The expanded FOP code (including FUNCTION_BODY substitution
+# in the middle of each function) is emitted immediately; the expanded CASE
+# code is saved for the next stage.
+#
+# Stage 3 uses the PROLOG and EPILOG fragments, with the expanded CASE code
+# in the middle of EPILOG, to generate the whole output file.
+#
+# Another way of looking at it is to consider how the fragments appear in
+# the final output:
+#
+# PROLOG
+# FOP (expanded for CREATE)
+# FOP before FUNCTION_BODY
+# LOC, INTEGER, GFID, etc. (one per arg, by type)
+# FOP after FUNCTION_BODY
+# FOP (expanded for WRITEV)
+# FOP before FUNCTION_BODY
+# GFID, VECTOR, etc. (one per arg, by type)
+# FOP after FUNCTION_BODY
+# (more FOPs)
+# EPILOG
+# EPILOG before CASE
+# CASE statements (one per fop)
+# EPILOG after CASE
+
+typemap = {
+ 'dict_t *': ( "DICT", ""),
+ 'fd_t *': ( "GFID", ""),
+ 'dev_t': ( "DOUBLE", "%ld (0x%lx)"),
+ 'gf_xattrop_flags_t': ( "INTEGER", "%d (0x%x)"),
+ 'int32_t': ( "INTEGER", "%d (0x%x)"),
+ 'mode_t': ( "INTEGER", "%d (0x%x)"),
+ 'off_t': ( "DOUBLE", "%ld (0x%lx)"),
+ 'size_t': ( "DOUBLE", "%ld (0x%lx)"),
+ 'uint32_t': ( "INTEGER", "%d (0x%x)"),
+ 'loc_t *': ( "LOC", ""),
+ 'const char *': ( "STRING", ""),
+ 'struct iovec *': ( "VECTOR", ""),
+ 'struct iatt *': ( "IATT", ""),
+}
+
+def get_special_subs (args):
+ code = ""
+ for arg in args:
+ if (arg[0] != 'fop-arg') or (len(arg) < 4):
+ continue
+ recon_type, recon_fmt = typemap[arg[2]]
+ code += fragments[recon_type].replace("@ARGNAME@",arg[3]) \
+ .replace("@FORMAT@",recon_fmt)
+ return code
+
+def gen_functions ():
+ code = ""
+ for name, value in ops.iteritems():
+ if "journal" not in [ x[0] for x in value ]:
+ continue
+ fop_subs[name]["@FUNCTION_BODY@"] = get_special_subs(value)
+ # Print the FOP fragment with @FUNCTION_BODY@ in the middle.
+ code += generate(fragments["FOP"],name,fop_subs)
+ return code
+
+def gen_cases ():
+ code = ""
+ for name, value in ops.iteritems():
+ if "journal" not in [ x[0] for x in value ]:
+ continue
+ # Add the CASE fragment for this fop.
+ code += generate(fragments["CASE"],name,fop_subs)
+ return code
+
+def load_fragments (path="recon-tmpl.c"):
+    """Split a template file into its named fragments.
+
+    A line containing "pragma fragment NAME" starts fragment NAME; every
+    following line up to the next such pragma (or end of file) is that
+    fragment's text.  The pragma lines themselves, and any text before the
+    first pragma, are dropped.  Returns a dict mapping names to text.
+
+    The default path is kept for compatibility; the command-line entry
+    point always passes the template path explicitly.
+    """
+    pragma_re = re.compile('pragma fragment (.*)')
+    cur_symbol = None
+    cur_value = ""
+    result = {}
+    # "with" closes the template even if parsing raises.
+    with open(path,"r") as tmpl:
+        for line in tmpl:
+            m = pragma_re.search(line)
+            if m:
+                if cur_symbol:
+                    result[cur_symbol] = cur_value
+                cur_symbol = m.group(1)
+                cur_value = ""
+            else:
+                cur_value += line
+    # Flush the last fragment, which has no following pragma.
+    if cur_symbol:
+        result[cur_symbol] = cur_value
+    return result
+
+if __name__ == "__main__":
+ fragments = load_fragments(sys.argv[1])
+ print "/* BEGIN GENERATED CODE - DO NOT MODIFY */"
+ print fragments["PROLOG"]
+ print gen_functions()
+ print fragments["EPILOG"].replace("@SWITCH_BODY@",gen_cases())
+ print "/* END GENERATED CODE */"
diff --git a/xlators/experimental/fdl/src/gen_fdl.py b/xlators/experimental/fdl/src/gen_fdl.py
new file mode 100755
index 00000000000..7f6b1aaaeaa
--- /dev/null
+++ b/xlators/experimental/fdl/src/gen_fdl.py
@@ -0,0 +1,328 @@
+#!/usr/bin/python
+
+import os
+import sys
+
+curdir = os.path.dirname (sys.argv[0])
+gendir = os.path.join (curdir, '../../../../libglusterfs/src')
+sys.path.append (gendir)
+from generator import ops, fop_subs, cbk_subs, generate
+
+# Generation occurs in three stages. In this case, it actually makes more
+# sense to discuss them in the *opposite* order of that in which they
+# actually happen.
+#
+# Stage 3 is to insert all of the generated code into a file, replacing the
+# "#pragma generate" that's already there. The file can thus contain all
+# sorts of stuff that's not specific to one fop, either before or after the
+# generated code as appropriate.
+#
+# Stage 2 is to generate all of the code *for a particular fop*, using a
+# string-valued template plus a table of substitution values. Most of these
+# are built in to the generator itself. However, we also add a couple that
+# are specific to this particular translator - LEN_CODE and SER_CODE. These
+# are per-fop functions to get the length or the contents (respectively) of
+# what we'll put in the log. As with stage 3 allowing per-file boilerplate
+# before and after generated code, this allows per-fop boilerplate before and
+# after generated code.
+#
+# Stage 1, therefore, is to create the LEN_CODE and SER_CODE substitutions for
+# each fop, and put them in the same table where e.g. NAME and SHORT_ARGS
+# already are. We do this by looking at the fop-description table in the
+# generator module, then doing our own template substitution to plug each
+# specific argument name into another string-valued template.
+#
+# So, what does this leave us with in terms of variables and files?
+#
+# For stage 1, we have a series of LEN_*_TEMPLATE and SERLZ_*_TEMPLATE
+# strings, which are used to generate the length and serialization code for
+# each argument type.
+#
+# For stage 2, we have a bunch of *_TEMPLATE strings (no LEN_ or SERLZ_
+# prefix), which are used (along with the output from stage 1) to generate
+# whole functions.
+#
+# For stage 3, we have a whole separate file (fdl-tmpl.c) into which we insert
+# the collection of all functions defined in stage 2.
+
+
+LEN_TEMPLATE = """
+void
+fdl_len_@NAME@ (call_stub_t *stub)
+{
+ uint32_t meta_len = sizeof (event_header_t);
+ uint32_t data_len = 0;
+
+ /* TBD: global stuff, e.g. uid/gid */
+@LEN_CODE@
+
+ /* TBD: pad extension length */
+ stub->jnl_meta_len = meta_len;
+ stub->jnl_data_len = data_len;
+}
+"""
+
+SER_TEMPLATE = """
+void
+fdl_serialize_@NAME@ (call_stub_t *stub, char *meta_buf, char *data_buf)
+{
+ event_header_t *eh;
+ unsigned long offset = 0;
+
+ /* TBD: word size/endianness */
+ eh = (event_header_t *)meta_buf;
+ eh->event_type = NEW_REQUEST;
+ eh->fop_type = GF_FOP_@UPNAME@;
+ eh->request_id = 0; // TBD
+ meta_buf += sizeof (*eh);
+@SER_CODE@
+ /* TBD: pad extension length */
+ eh->ext_length = offset;
+}
+"""
+
+CBK_TEMPLATE = """
+int32_t
+fdl_@NAME@_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ @LONG_ARGS@)
+{
+ STACK_UNWIND_STRICT (@NAME@, frame, op_ret, op_errno,
+ @SHORT_ARGS@);
+ return 0;
+}
+"""
+
+CONTINUE_TEMPLATE = """
+int32_t
+fdl_@NAME@_continue (call_frame_t *frame, xlator_t *this,
+ @LONG_ARGS@)
+{
+ STACK_WIND (frame, fdl_@NAME@_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->@NAME@,
+ @SHORT_ARGS@);
+ return 0;
+}
+
+"""
+
+FOP_TEMPLATE = """
+int32_t
+fdl_@NAME@ (call_frame_t *frame, xlator_t *this,
+ @LONG_ARGS@)
+{
+ call_stub_t *stub;
+
+ stub = fop_@NAME@_stub (frame, default_@NAME@,
+ @SHORT_ARGS@);
+ fdl_len_@NAME@ (stub);
+ stub->serialize = fdl_serialize_@NAME@;
+ fdl_enqueue (this, stub);
+
+ return 0;
+}
+"""
+
+LEN_DICT_TEMPLATE = """
+ if (@SRC@) {
+ data_pair_t *memb;
+ for (memb = @SRC@->members_list; memb; memb = memb->next) {
+ meta_len += sizeof(int);
+ meta_len += strlen(memb->key) + 1;
+ meta_len += sizeof(int);
+ meta_len += memb->value->len;
+ }
+ }
+ meta_len += sizeof(int);
+"""
+
+LEN_GFID_TEMPLATE = """
+ meta_len += 16;
+"""
+
+LEN_INTEGER_TEMPLATE = """
+ meta_len += sizeof (@SRC@);
+"""
+
+# 16 for gfid, 16 for pargfid, 1 for flag, 0/1 for terminating NUL
+LEN_LOC_TEMPLATE = """
+ if (@SRC@.name) {
+ meta_len += (strlen (@SRC@.name) + 34);
+ } else {
+ meta_len += 33;
+ }
+"""
+
+# 1 for flag, plus the string and its terminating NUL when it is present.
+# This must match SERLZ_STRING_TEMPLATE below byte for byte.
+LEN_STRING_TEMPLATE = """
+ if (@SRC@) {
+ meta_len += (strlen (@SRC@) + 2);
+ } else {
+ meta_len += 1;
+ }
+"""
+
+LEN_VECTOR_TEMPLATE = """
+ meta_len += sizeof(size_t);
+ data_len += iov_length (@VEC@, @CNT@);
+"""
+
+LEN_IATT_TEMPLATE = """
+ meta_len += sizeof(@SRC@.ia_prot);
+ meta_len += sizeof(@SRC@.ia_uid);
+ meta_len += sizeof(@SRC@.ia_gid);
+ meta_len += sizeof(@SRC@.ia_atime);
+ meta_len += sizeof(@SRC@.ia_atime_nsec);
+ meta_len += sizeof(@SRC@.ia_mtime);
+ meta_len += sizeof(@SRC@.ia_mtime_nsec);
+"""
+
+SERLZ_DICT_TEMPLATE = """
+ if (@SRC@) {
+ data_pair_t *memb;
+ for (memb = @SRC@->members_list; memb; memb = memb->next) {
+ *((int *)(meta_buf+offset)) = strlen(memb->key) + 1;
+ offset += sizeof(int);
+ strcpy (meta_buf+offset, memb->key);
+ offset += strlen(memb->key) + 1;
+ *((int *)(meta_buf+offset)) = memb->value->len;
+ offset += sizeof(int);
+ memcpy (meta_buf+offset, memb->value->data, memb->value->len);
+ offset += memb->value->len;
+ }
+ }
+ *((int *)(meta_buf+offset)) = 0;
+ offset += sizeof(int);
+"""
+
+SERLZ_GFID_TEMPLATE = """
+ memcpy (meta_buf+offset, @SRC@->inode->gfid, 16);
+ offset += 16;
+"""
+
+SERLZ_INTEGER_TEMPLATE = """
+ memcpy (meta_buf+offset, &@SRC@, sizeof(@SRC@));
+ offset += sizeof(@SRC@);
+"""
+
+SERLZ_LOC_TEMPLATE = """
+ memcpy (meta_buf+offset, @SRC@.gfid, 16);
+ offset += 16;
+ memcpy (meta_buf+offset, @SRC@.pargfid, 16);
+ offset += 16;
+ if (@SRC@.name) {
+ *(meta_buf+offset) = 1;
+ ++offset;
+ strcpy (meta_buf+offset, @SRC@.name);
+ offset += (strlen (@SRC@.name) + 1);
+ } else {
+ *(meta_buf+offset) = 0;
+ ++offset;
+ }
+"""
+
+# Flag byte, then (if present) the string *including* its terminating NUL.
+# The readers skip strlen+1 bytes after the flag, so the NUL has to stay in
+# the record: advancing by strlen alone lets the next field overwrite it.
+SERLZ_STRING_TEMPLATE = """
+ if (@SRC@) {
+ *(meta_buf+offset) = 1;
+ ++offset;
+ strcpy (meta_buf+offset, @SRC@);
+ offset += (strlen (@SRC@) + 1);
+ } else {
+ *(meta_buf+offset) = 0;
+ ++offset;
+ }
+"""
+
+SERLZ_VECTOR_TEMPLATE = """
+ *((size_t *)(meta_buf+offset)) = iov_length (@VEC@, @CNT@);
+ offset += sizeof(size_t);
+ int32_t i;
+ for (i = 0; i < @CNT@; ++i) {
+ memcpy (data_buf, @VEC@[i].iov_base, @VEC@[i].iov_len);
+ data_buf += @VEC@[i].iov_len;
+ }
+"""
+
+# We don't need to save all of the fields - only those affected by chown,
+# chgrp, chmod, and utime.
+SERLZ_IATT_TEMPLATE = """
+ *((ia_prot_t *)(meta_buf+offset)) = @SRC@.ia_prot;
+ offset += sizeof(@SRC@.ia_prot);
+ *((uint32_t *)(meta_buf+offset)) = @SRC@.ia_uid;
+ offset += sizeof(@SRC@.ia_uid);
+ *((uint32_t *)(meta_buf+offset)) = @SRC@.ia_gid;
+ offset += sizeof(@SRC@.ia_gid);
+ *((uint32_t *)(meta_buf+offset)) = @SRC@.ia_atime;
+ offset += sizeof(@SRC@.ia_atime);
+ *((uint32_t *)(meta_buf+offset)) = @SRC@.ia_atime_nsec;
+ offset += sizeof(@SRC@.ia_atime_nsec);
+ *((uint32_t *)(meta_buf+offset)) = @SRC@.ia_mtime;
+ offset += sizeof(@SRC@.ia_mtime);
+ *((uint32_t *)(meta_buf+offset)) = @SRC@.ia_mtime_nsec;
+ offset += sizeof(@SRC@.ia_mtime_nsec);
+"""
+
+typemap = {
+ 'dict_t *': ( LEN_DICT_TEMPLATE, SERLZ_DICT_TEMPLATE),
+ 'fd_t *': ( LEN_GFID_TEMPLATE, SERLZ_GFID_TEMPLATE),
+ 'dev_t': ( LEN_INTEGER_TEMPLATE, SERLZ_INTEGER_TEMPLATE),
+ 'gf_xattrop_flags_t': ( LEN_INTEGER_TEMPLATE, SERLZ_INTEGER_TEMPLATE),
+ 'int32_t': ( LEN_INTEGER_TEMPLATE, SERLZ_INTEGER_TEMPLATE),
+ 'mode_t': ( LEN_INTEGER_TEMPLATE, SERLZ_INTEGER_TEMPLATE),
+ 'off_t': ( LEN_INTEGER_TEMPLATE, SERLZ_INTEGER_TEMPLATE),
+ 'size_t': ( LEN_INTEGER_TEMPLATE, SERLZ_INTEGER_TEMPLATE),
+ 'uint32_t': ( LEN_INTEGER_TEMPLATE, SERLZ_INTEGER_TEMPLATE),
+ 'loc_t *': ( LEN_LOC_TEMPLATE, SERLZ_LOC_TEMPLATE),
+ 'const char *': ( LEN_STRING_TEMPLATE, SERLZ_STRING_TEMPLATE),
+ 'struct iatt *': ( LEN_IATT_TEMPLATE, SERLZ_IATT_TEMPLATE),
+}
+
+def get_special_subs (args):
+ len_code = ""
+ ser_code = ""
+ for arg in args:
+ if (arg[0] != 'fop-arg') or (len(arg) < 4):
+ continue
+ # Let this throw an exception if we get an unknown field name. The
+ # broken build will remind whoever messed with the stub code that a
+ # corresponding update is needed here.
+ if arg[3] == "vector":
+ # Make it as obvious as possible that this is a special case.
+ len_code += LEN_VECTOR_TEMPLATE \
+ .replace("@VEC@","stub->args.vector") \
+ .replace("@CNT@","stub->args.count")
+ ser_code += SERLZ_VECTOR_TEMPLATE \
+ .replace("@VEC@","stub->args.vector") \
+ .replace("@CNT@","stub->args.count")
+ else:
+ len_tmpl, ser_tmpl = typemap[arg[2]]
+ src = "stub->args.%s" % arg[3]
+ len_code += len_tmpl.replace("@SRC@",src)
+ ser_code += ser_tmpl.replace("@SRC@",src)
+ return len_code, ser_code
+
+def gen_fdl ():
+ entrypoints = []
+ for name, value in ops.iteritems():
+ if "journal" not in [ x[0] for x in value ]:
+ continue
+ len_code, ser_code = get_special_subs(value)
+ fop_subs[name]["@LEN_CODE@"] = len_code[:-1]
+ fop_subs[name]["@SER_CODE@"] = ser_code[:-1]
+ print generate(LEN_TEMPLATE,name,fop_subs)
+ print generate(SER_TEMPLATE,name,fop_subs)
+ print generate(CBK_TEMPLATE,name,cbk_subs)
+ print generate(CONTINUE_TEMPLATE,name,fop_subs)
+ print generate(FOP_TEMPLATE,name,fop_subs)
+ entrypoints.append(name)
+ print "struct xlator_fops fops = {"
+ for ep in entrypoints:
+ print "\t.%s = fdl_%s," % (ep, ep)
+ print "};"
+
+for l in open(sys.argv[1],'r').readlines():
+ if l.find('#pragma generate') != -1:
+ print "/* BEGIN GENERATED CODE - DO NOT MODIFY */"
+ gen_fdl()
+ print "/* END GENERATED CODE */"
+ else:
+ print l[:-1]
diff --git a/xlators/experimental/fdl/src/gen_recon.py b/xlators/experimental/fdl/src/gen_recon.py
new file mode 100755
index 00000000000..26318f92d88
--- /dev/null
+++ b/xlators/experimental/fdl/src/gen_recon.py
@@ -0,0 +1,191 @@
+#!/usr/bin/python
+
+import os
+import re
+import string
+import sys
+
+curdir = os.path.dirname (sys.argv[0])
+gendir = os.path.join (curdir, '../../../../libglusterfs/src')
+sys.path.append (gendir)
+from generator import ops, fop_subs, cbk_subs, generate
+
+# See the big header comment at the start of gen_fdl.py to see how the stages
+# fit together. The big difference here is that *all* of the C code is in the
+# template file as labelled fragments, instead of as Python strings. That
+# makes it much easier to edit in one place, with proper syntax highlighting
+# and indentation.
+#
+# Stage 1 uses type-specific fragments to generate FUNCTION_BODY, instead of
+# LEN_*_TEMPLATE and SERLZ_*_TEMPLATE to generate LEN_CODE and SER_CODE.
+#
+# Stage 2 uses the FOP and CASE fragments instead of RECON_TEMPLATE and
+# FOP_TEMPLATE. The expanded FOP code (including FUNCTION_BODY substitution
+# in the middle of each function) is emitted immediately; the expanded CASE
+# code is saved for the next stage.
+#
+# Stage 3 uses the PROLOG and EPILOG fragments, with the expanded CASE code
+# in the middle of EPILOG, to generate the whole output file.
+#
+# Another way of looking at it is to consider how the fragments appear in
+# the final output:
+#
+# PROLOG
+# FOP (expanded for CREATE)
+# FOP before FUNCTION_BODY
+# LOC, INTEGER, GFID, etc. (one per arg, by type)
+# FOP after FUNCTION_BODY
+# FOP (expanded for WRITEV)
+# FOP before FUNCTION_BODY
+# GFID, VECTOR, etc. (one per arg, by type)
+# FOP after FUNCTION_BODY
+# (more FOPs)
+# EPILOG
+# EPILOG before CASE
+# CASE statements (one per fop)
+# EPILOG after CASE
+
+# Maps the C type of a fop argument (element 2 of a 'fop-arg' tuple) to the
+# name of the template fragment that decodes that argument from the journal.
+# get_special_subs looks the result up in "fragments" and may override it
+# (NEW_FD, PARENT_LOC) for particular fops.  Types mapped to DOUBLE use a
+# fragment that advances the metadata cursor by sizeof(uint64_t); types
+# mapped to INTEGER advance it by the size of the type itself.
+typemap = {
+ 'dict_t *': "DICT",
+ 'fd_t *': "FD",
+ 'dev_t': "DOUBLE",
+ 'gf_xattrop_flags_t': "INTEGER",
+ 'int32_t': "INTEGER",
+ 'mode_t': "INTEGER",
+ 'off_t': "DOUBLE",
+ 'size_t': "DOUBLE",
+ 'uint32_t': "INTEGER",
+ 'loc_t *': "LOC",
+ 'const char *': "STRING",
+ 'struct iovec *': "VECTOR",
+ 'struct iatt *': "IATT",
+ 'struct iobref *': "IOBREF",
+}
+
+def get_special_subs (name, args, fop_type):
+ code = ""
+ cleanups = ""
+ links = ""
+ s_args = []
+ for arg in args:
+ if arg[0] == 'extra':
+ code += "\t%s %s;\n\n" % (arg[2], arg[1])
+ s_args.append(arg[3])
+ continue
+ if arg[0] == 'link':
+ links += fragments["LINK"].replace("@INODE_ARG@",arg[1]) \
+ .replace("@IATT_ARG@",arg[2])
+ continue
+ if arg[0] != 'fop-arg':
+ continue
+ if (name, arg[1]) == ('writev', 'count'):
+ # Special case: just skip this. We can't mark it as 'nosync'
+ # because of the way the translator and dumper generators look for
+ # that after 'stub-name' which we don't define. Instead of adding a
+ # bunch of generic infrastructure for this one case, just pound it
+ # here.
+ continue
+ recon_type = typemap[arg[2]]
+ # print "/* %s.%s => %s (%s)*/" % (name, arg[1], recon_type, fop_type)
+ if (name == "create") and (arg[1] == "fd"):
+ # Special case: fd for create is new, not looked up.
+ # print "/* change to NEW_FD */"
+ recon_type = "NEW_FD"
+ elif (recon_type == "LOC") and (fop_type == "entry-op"):
+ # Need to treat this differently for inode vs. entry ops.
+ # Special case: link source is treated like inode-op.
+ if (name != "link") or (arg[1] != "oldloc"):
+ # print "/* change to PARENT_LOC */"
+ recon_type = "PARENT_LOC"
+ code += fragments[recon_type].replace("@ARGNAME@",arg[1]) \
+ .replace("@ARGTYPE@",arg[2])
+ cleanup_key = recon_type + "_CLEANUP"
+ if fragments.has_key(cleanup_key):
+ cleanups += fragments[cleanup_key].replace("@ARGNAME@",arg[1])
+ if 'nosync' in arg[4:]:
+ code += "\t(void)%s;\n" % arg[1];
+ continue
+ if arg[2] in ("loc_t *", "struct iatt *"):
+ # These are passed as pointers to the syncop, but they're actual
+ # structures in the generated code.
+ s_args.append("&"+arg[1]);
+ else:
+ s_args.append(arg[1])
+ # We have to handle a couple of special cases here, because some n00b
+ # defined the syncops with a different argument order than the fops they're
+ # based on.
+ if name == 'writev':
+ # Swap 'flags' and 'iobref'. Also, we need to add the iov count, which
+ # is not stored in or read from the journal. There are other ways to
+ # do that, but this is the only place we need anything similar and we
+ # already have to treat it as a special case so this is simplest.
+ s_args_str = 'fd, &vector, 1, off, iobref, flags, xdata'
+ elif name == 'symlink':
+ # Swap 'linkpath' and 'loc'.
+ s_args_str = '&loc, linkpath, &iatt, xdata'
+ else:
+ s_args_str = string.join (s_args, ", ")
+ return code, links, s_args_str, cleanups
+
+# TBD: probably need to generate type-specific cleanup code as well - e.g.
+# fd_unref for an fd_t, loc_wipe for a loc_t, and so on. All of these
+# generated CLEANUP fragments will go at the end of the function, with goto
+# labels. Meanwhile, the error-checking part of each type-specific fragment
+# (e.g. LOC or FD) will need to update the indirect label that we jump to when
+# an error is detected. This will probably get messy.
+def gen_functions ():
+ code = ""
+ for name, value in ops.iteritems():
+ fop_type = [ x[1] for x in value if x[0] == "journal" ]
+ if not fop_type:
+ continue
+ body, links, syncop_args, cleanups = get_special_subs (name, value,
+ fop_type[0])
+ fop_subs[name]["@FUNCTION_BODY@"] = body
+ fop_subs[name]["@LINKS@"] = links
+ fop_subs[name]["@SYNCOP_ARGS@"] = syncop_args
+ fop_subs[name]["@CLEANUPS@"] = cleanups
+ if name == "writev":
+ # Take advantage of the fact that, *during reconciliation*, the
+ # vector is always a single element. In normal I/O it's not.
+ fop_subs[name]["@SUCCESS_VALUE@"] = "vector.iov_len"
+ else:
+ fop_subs[name]["@SUCCESS_VALUE@"] = "GFAPI_SUCCESS"
+ # Print the FOP fragment with @FUNCTION_BODY@ in the middle.
+ code += generate(fragments["FOP"],name,fop_subs)
+ return code
+
+def gen_cases ():
+ code = ""
+ for name, value in ops.iteritems():
+ if "journal" not in [ x[0] for x in value ]:
+ continue
+ # Add the CASE fragment for this fop.
+ code += generate(fragments["CASE"],name,fop_subs)
+ return code
+
+def load_fragments (path="recon-tmpl.c"):
+ pragma_re = re.compile('pragma fragment (.*)')
+ cur_symbol = None
+ cur_value = ""
+ result = {}
+ for line in open(path,"r").readlines():
+ m = pragma_re.search(line)
+ if m:
+ if cur_symbol:
+ result[cur_symbol] = cur_value
+ cur_symbol = m.group(1)
+ cur_value = ""
+ else:
+ cur_value += line
+ if cur_symbol:
+ result[cur_symbol] = cur_value
+ return result
+
+# Script entry point: sys.argv[1] is the template file to read fragments
+# from.  "fragments" is assigned at module level here because
+# get_special_subs, gen_functions and gen_cases all read it as a global.
+if __name__ == "__main__":
+ fragments = load_fragments(sys.argv[1])
+ print "/* BEGIN GENERATED CODE - DO NOT MODIFY */"
+ # Fixed preamble from the template.
+ print fragments["PROLOG"]
+ # One replay function per journaled fop.
+ print gen_functions()
+ # Dispatcher, with the per-fop case statements spliced into its switch.
+ print fragments["EPILOG"].replace("@SWITCH_BODY@",gen_cases())
+ print "/* END GENERATED CODE */"
diff --git a/xlators/experimental/fdl/src/jnl-types.h b/xlators/experimental/fdl/src/jnl-types.h
new file mode 100644
index 00000000000..8cb39d01a25
--- /dev/null
+++ b/xlators/experimental/fdl/src/jnl-types.h
@@ -0,0 +1,14 @@
+#ifndef JNL_TYPES_H
+#define JNL_TYPES_H
+
+#include <stdint.h>
+
+/* Value stored in event_header_t.event_type for a newly journaled request. */
+#define NEW_REQUEST ((uint8_t)'N')
+
+/* Fixed-size header found at the start of each journal event. */
+typedef struct {
+        uint8_t         event_type;     /* e.g. NEW_REQUEST */
+        uint8_t         fop_type;       /* e.g. GF_FOP_SETATTR */
+        uint16_t        request_id;
+        uint32_t        ext_length;
+} event_header_t;
+
+/* Operation codes; the named operations count up from FDL_IPC_BASE. */
+enum {
+        FDL_IPC_BASE = 0xfeedbee5, /* ... and they make honey */
+        FDL_IPC_CHANGE_TERM,
+        FDL_IPC_GET_TERMS,
+};
+
+#endif /* JNL_TYPES_H */
diff --git a/xlators/experimental/fdl/src/logdump.c b/xlators/experimental/fdl/src/logdump.c
new file mode 100644
index 00000000000..7c979c32a04
--- /dev/null
+++ b/xlators/experimental/fdl/src/logdump.c
@@ -0,0 +1,50 @@
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/mman.h>
+
+extern int fdl_dump (char **, char **);
+
+/*
+ * Dump a journal: argv[1] is the metadata file, argv[2] the data file.
+ *
+ * Both files are mapped read-only (currently with a fixed 1 MiB length) and
+ * fdl_dump is called repeatedly, advancing through both buffers, until it
+ * returns zero.  Returns EXIT_FAILURE on a usage, open or mmap error.
+ */
+int
+main (int argc, char **argv)
+{
+        int     meta_fd = (-1);
+        char    *meta_buf = NULL;
+        int     data_fd = (-1);
+        char    *data_buf = NULL;
+
+        /* Without this, a missing argument would pass NULL to open(). */
+        if (argc < 3) {
+                fprintf (stderr, "Usage: %s <meta-file> <data-file>\n",
+                         (argc > 0) ? argv[0] : "logdump");
+                return EXIT_FAILURE;
+        }
+
+        meta_fd = open (argv[1], O_RDONLY);
+        if (meta_fd < 0) {
+                perror ("open");
+                return EXIT_FAILURE;
+        }
+
+        /* TBD: get proper length */
+        meta_buf = mmap (NULL, 1048576, PROT_READ, MAP_PRIVATE, meta_fd, 0);
+        if (meta_buf == MAP_FAILED) {
+                perror ("mmap");
+                return EXIT_FAILURE;
+        }
+
+        data_fd = open (argv[2], O_RDONLY);
+        if (data_fd < 0) {
+                perror ("open");
+                return EXIT_FAILURE;
+        }
+
+        /* TBD: get proper length */
+        data_buf = mmap (NULL, 1048576, PROT_READ, MAP_PRIVATE, data_fd, 0);
+        if (data_buf == MAP_FAILED) {
+                perror ("mmap");
+                return EXIT_FAILURE;
+        }
+
+        /* fdl_dump advances both cursors; zero means nothing more to dump. */
+        for (;;) {
+                if (!fdl_dump(&meta_buf,&data_buf)) {
+                        break;
+                }
+        }
+
+        return EXIT_SUCCESS;
+}
diff --git a/xlators/experimental/fdl/src/recon-tmpl.c b/xlators/experimental/fdl/src/recon-tmpl.c
new file mode 100644
index 00000000000..523bda39418
--- /dev/null
+++ b/xlators/experimental/fdl/src/recon-tmpl.c
@@ -0,0 +1,305 @@
+#pragma fragment PROLOG
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "fd.h"
+#include "iatt.h"
+#include "syncop.h"
+#include "xlator.h"
+#include "glfs-internal.h"
+
+#include "jnl-types.h"
+
+#define GFAPI_SUCCESS 0
+
+/*
+ * Return a referenced inode for the given gfid, or NULL on failure.
+ *
+ * The inode table of the active subvolume is consulted first.  On a miss, a
+ * fresh inode is created, looked up by gfid through syncop_lookup, and then
+ * linked into the table.
+ */
+inode_t *
+recon_get_inode (glfs_t *fs, uuid_t gfid)
+{
+        inode_t         *inode;
+        loc_t           loc = {NULL,};
+        struct iatt     iatt;
+        int             ret;
+        inode_t         *newinode;
+
+        inode = inode_find (fs->active_subvol->itable, gfid);
+        if (inode) {
+                printf ("=== FOUND %s IN TABLE\n", uuid_utoa(gfid));
+                return inode;
+        }
+
+        loc.inode = inode_new (fs->active_subvol->itable);
+        if (!loc.inode) {
+                return NULL;
+        }
+        gf_uuid_copy (loc.inode->gfid, gfid);
+        gf_uuid_copy (loc.gfid, gfid);
+
+        printf ("=== DOING LOOKUP FOR %s\n", uuid_utoa(gfid));
+
+        ret = syncop_lookup (fs->active_subvol, &loc, &iatt,
+                             NULL, NULL, NULL);
+        if (ret != GFAPI_SUCCESS) {
+                fprintf (stderr, "syncop_lookup failed (%d)\n", ret);
+                /* Drop the reference taken by inode_new. */
+                inode_unref (loc.inode);
+                return NULL;
+        }
+
+        newinode = inode_link (loc.inode, NULL, NULL, &iatt);
+        if (newinode) {
+                inode_lookup (newinode);
+        }
+
+        /*
+         * inode_link hands back its own reference on the linked inode, so
+         * the reference from inode_new is no longer needed either way.
+         */
+        inode_unref (loc.inode);
+
+        return newinode;
+}
+
+#pragma fragment DICT
+ /*
+  * Rebuild a dict_t argument from the metadata cursor.  The encoding read
+  * here is a sequence of records, each an int key length, the key bytes,
+  * an int data length and the data bytes; a zero key length ends the
+  * sequence.
+  */
+ dict_t *@ARGNAME@;
+
+ @ARGNAME@ = dict_new();
+ if (!@ARGNAME@) {
+ goto *err_label;
+ }
+ /* From here on, a failure must release the dict. */
+ err_label = &&cleanup_@ARGNAME@;
+
+ {
+ int key_len, data_len;
+ char *key_ptr;
+ int garbage;
+ for (;;) {
+ key_len = *((int *)new_meta);
+ new_meta += sizeof(int);
+ if (!key_len) {
+ break;
+ }
+ /* NOTE(review): presumably key_len covers a terminating NUL - confirm against the serializer. */
+ key_ptr = new_meta;
+ new_meta += key_len;
+ data_len = *((int *)new_meta);
+ new_meta += sizeof(int);
+ /* Key and value pointers passed here point into the journal buffer. */
+ garbage = dict_set_static_bin (@ARGNAME@, key_ptr,
+ new_meta, data_len);
+ /* TBD: check error from dict_set_static_bin */
+ (void)garbage;
+ new_meta += data_len;
+ }
+ }
+
+#pragma fragment DICT_CLEANUP
+/* Release the dict created by the matching decode fragment. */
+cleanup_@ARGNAME@:
+ dict_unref (@ARGNAME@);
+
+#pragma fragment DOUBLE
+ /*
+  * Read the argument in place using its own C type, then advance the
+  * metadata cursor by sizeof(uint64_t) regardless of that type's size.
+  */
+ @ARGTYPE@ @ARGNAME@ = *((@ARGTYPE@ *)new_meta);
+ new_meta += sizeof(uint64_t);
+
+#pragma fragment FD
+ /*
+  * An fd argument is read as a 16-byte gfid.  The inode is resolved with
+  * recon_get_inode and an anonymous fd is opened on it.  err_label is
+  * moved forward after each successful step so that a later failure
+  * releases only what has been acquired so far.
+  */
+ inode_t *@ARGNAME@_ino;
+ fd_t *@ARGNAME@;
+
+ @ARGNAME@_ino = recon_get_inode (fs, *((uuid_t *)new_meta));
+ new_meta += 16;
+ if (!@ARGNAME@_ino) {
+ goto *err_label;
+ }
+ err_label = &&cleanup_@ARGNAME@_ino;
+
+ @ARGNAME@ = fd_anonymous (@ARGNAME@_ino);
+ if (!@ARGNAME@) {
+ goto *err_label;
+ }
+ err_label = &&cleanup_@ARGNAME@;
+
+#pragma fragment FD_CLEANUP
+/* Two entry points: the first releases the fd and falls through to the inode. */
+cleanup_@ARGNAME@:
+ fd_unref (@ARGNAME@);
+cleanup_@ARGNAME@_ino:
+ inode_unref (@ARGNAME@_ino);
+
+#pragma fragment NEW_FD
+        /*
+         * This pseudo-type is only used for create, and in that case we know
+         * we'll be using loc.inode, so it's not worth generalizing to take an
+         * extra argument.
+         *
+         * The null check uses the substituted argument name rather than a
+         * hard-coded "fd", so it tests the variable declared just above
+         * whatever the argument is called.
+         */
+        fd_t    *@ARGNAME@ = fd_anonymous (loc.inode);
+
+        if (!@ARGNAME@) {
+                goto *err_label;
+        }
+        err_label = &&cleanup_@ARGNAME@;
+        /* Skip the 16 bytes the journal stores for this argument. */
+        new_meta += 16;
+
+#pragma fragment NEW_FD_CLEANUP
+/* Release the anonymous fd opened by the matching decode fragment. */
+cleanup_@ARGNAME@:
+        fd_unref (@ARGNAME@);
+
+#pragma fragment INTEGER
+ /*
+  * Read the argument in place from the metadata cursor and advance the
+  * cursor by exactly the size of the argument's own C type.
+  */
+ @ARGTYPE@ @ARGNAME@ = *((@ARGTYPE@ *)new_meta);
+
+ new_meta += sizeof(@ARGTYPE@);
+
+#pragma fragment LOC
+ /*
+  * Inode-based loc.  The record read here is a 16-byte gfid, a 16-byte
+  * parent gfid (skipped), one flag byte and, when the flag is non-zero, a
+  * NUL-terminated name.  Only the inode, gfid and name fields are filled.
+  */
+ loc_t @ARGNAME@ = { NULL, };
+
+ @ARGNAME@.inode = recon_get_inode (fs, *((uuid_t *)new_meta));
+ if (!@ARGNAME@.inode) {
+ goto *err_label;
+ }
+ err_label = &&cleanup_@ARGNAME@;
+ gf_uuid_copy (@ARGNAME@.gfid, @ARGNAME@.inode->gfid);
+ new_meta += 16;
+ new_meta += 16; /* skip over pargfid */
+ if (*(new_meta++)) {
+ /* The name is used in place; it points into the journal buffer. */
+ @ARGNAME@.name = new_meta;
+ new_meta += strlen(new_meta) + 1;
+ }
+
+#pragma fragment LOC_CLEANUP
+/* Release whatever the matching decode fragment stored in the loc. */
+cleanup_@ARGNAME@:
+ loc_wipe (&@ARGNAME@);
+
+#pragma fragment PARENT_LOC
+ /*
+  * Entry-op loc: identifies an entry by parent directory plus name.  The
+  * record layout is the same as for an inode-based loc, but here the
+  * first gfid is skipped, the parent gfid is resolved, and a name is
+  * mandatory (a zero flag byte is treated as an error).  The loc's own
+  * inode is a freshly created one.
+  */
+ loc_t @ARGNAME@ = { NULL, };
+
+ new_meta += 16; /* skip over gfid */
+ @ARGNAME@.parent = recon_get_inode (fs, *((uuid_t *)new_meta));
+ if (!@ARGNAME@.parent) {
+ goto *err_label;
+ }
+ /* The parent reference is now held, so later failures go through loc_wipe. */
+ err_label = &&cleanup_@ARGNAME@;
+ gf_uuid_copy (@ARGNAME@.pargfid, @ARGNAME@.parent->gfid);
+ new_meta += 16;
+ if (!*(new_meta++)) {
+ goto *err_label;
+ }
+ /* The name is used in place; it points into the journal buffer. */
+ @ARGNAME@.name = new_meta;
+ new_meta += strlen(new_meta) + 1;
+
+ @ARGNAME@.inode = inode_new (fs->active_subvol->itable);
+ if (!@ARGNAME@.inode) {
+ goto *err_label;
+ }
+
+#pragma fragment PARENT_LOC_CLEANUP
+/* Release whatever the matching decode fragment stored in the loc. */
+cleanup_@ARGNAME@:
+ loc_wipe (&@ARGNAME@);
+
+#pragma fragment STRING
+ /*
+  * A string is read as one flag byte followed, when the flag is non-zero,
+  * by NUL-terminated text that is used in place from the journal buffer.
+  * A zero flag is treated as an error.
+  */
+ char *@ARGNAME@;
+ if (*(new_meta++)) {
+ @ARGNAME@ = new_meta;
+ new_meta += (strlen(new_meta) + 1);
+ }
+ else {
+ goto *err_label;
+ }
+
+#pragma fragment VECTOR
+ /*
+  * Single-element I/O vector.  The length (a size_t) comes from the
+  * metadata cursor; the payload is used in place at the data cursor,
+  * which is then advanced past it.
+  */
+ struct iovec @ARGNAME@;
+
+ @ARGNAME@.iov_len = *((size_t *)new_meta);
+ new_meta += sizeof(@ARGNAME@.iov_len);
+ @ARGNAME@.iov_base = new_data;
+ new_data += @ARGNAME@.iov_len;
+
+#pragma fragment IATT
+ /*
+  * Partial iatt.  The record read here is an ia_prot_t followed by six
+  * uint32_t values: uid, gid, atime, atime_nsec, mtime, mtime_nsec.
+  * No other member of the structure is assigned by this fragment.
+  */
+ struct iatt @ARGNAME@;
+ {
+ @ARGNAME@.ia_prot = *((ia_prot_t *)new_meta);
+ new_meta += sizeof(ia_prot_t);
+ uint32_t *myints = (uint32_t *)new_meta;
+ @ARGNAME@.ia_uid = myints[0];
+ @ARGNAME@.ia_gid = myints[1];
+ @ARGNAME@.ia_atime = myints[2];
+ @ARGNAME@.ia_atime_nsec = myints[3];
+ @ARGNAME@.ia_mtime = myints[4];
+ @ARGNAME@.ia_mtime_nsec = myints[5];
+ new_meta += sizeof(*myints) * 6;
+ }
+
+#pragma fragment IOBREF
+ /*
+  * Nothing is read from the journal for this argument; a fresh, empty
+  * iobref is allocated to pass to the syncop.
+  */
+ struct iobref *@ARGNAME@;
+
+ @ARGNAME@ = iobref_new();
+ if (!@ARGNAME@) {
+ goto *err_label;
+ }
+ err_label = &&cleanup_@ARGNAME@;
+
+#pragma fragment IOBREF_CLEANUP
+/* Release the iobref allocated by the matching decode fragment. */
+cleanup_@ARGNAME@:
+ iobref_unref (@ARGNAME@);
+
+#pragma fragment LINK
+ /*
+  * Emitted after a successful syncop for fops that have a 'link' entry:
+  * links the given inode into the inode table using the given iatt.
+  * NOTE(review): no release of new_inode is visible in this fragment -
+  * confirm whether the reference is meant to be kept.
+  */
+ /* TBD: check error */
+ inode_t *new_inode = inode_link (@INODE_ARG@, NULL, NULL, @IATT_ARG@);
+ if (new_inode) {
+ inode_lookup (new_inode);
+ }
+
+#pragma fragment FOP
+/*
+ * Replay one journaled request of this fop type.
+ *
+ * The arguments are decoded from the metadata and data cursors, the matching
+ * syncop is issued on the active subvolume, and the per-argument cleanups
+ * then run.  On success execution simply falls through the cleanup labels;
+ * on failure it jumps to err_label, which always names the cleanup for the
+ * most recently acquired resource (or "done" if there is none yet).
+ *
+ * Returns 0 on success and 0xbad on any failure.  The advanced cursors are
+ * written back through old_meta and old_data in both cases.
+ */
+int
+fdl_replay_@NAME@ (glfs_t *fs, char **old_meta, char **old_data)
+{
+        char    *new_meta = *old_meta;
+        char    *new_data = *old_data;
+        int     ret;
+        int     status = 0xbad;
+        void    *err_label = &&done;
+
+@FUNCTION_BODY@
+
+        ret = syncop_@NAME@ (fs->active_subvol, @SYNCOP_ARGS@, NULL);
+        if (ret != @SUCCESS_VALUE@) {
+                /* Newline-terminated like the other diagnostics. */
+                fprintf (stderr, "syncop_@NAME@ returned %d\n", ret);
+                goto *err_label;
+        }
+
+@LINKS@
+
+        status = 0;
+
+@CLEANUPS@
+
+done:
+        *old_meta = new_meta;
+        *old_data = new_data;
+        return status;
+}
+
+#pragma fragment CASE
+ /*
+  * One switch arm per journaled fop.  A non-zero return from the replay
+  * function jumps to "done" before "recognized" is set.
+  */
+ case GF_FOP_@UPNAME@:
+ printf ("=== GF_FOP_@UPNAME@\n");
+ if (fdl_replay_@NAME@ (fs, &new_meta, &new_data) != 0) {
+ goto done;
+ }
+ recognized = 1;
+ break;
+
+#pragma fragment EPILOG
+/*
+ * Replay the journal event at *old_meta / *old_data.
+ *
+ * The event header is consumed and its fop_type selects the per-fop replay
+ * function.  Returns 1 when the fop was recognized and replayed without
+ * error, 0 when it is unknown or its replay failed.  The advanced cursors
+ * are written back through old_meta and old_data in every case.
+ */
+int
+recon_execute (glfs_t *fs, char **old_meta, char **old_data)
+{
+ char *new_meta = *old_meta;
+ char *new_data = *old_data;
+ int recognized = 0;
+ event_header_t *eh;
+
+ /* The header is read in place; the cursor moves on to the fop arguments. */
+ eh = (event_header_t *)new_meta;
+ new_meta += sizeof (*eh);
+
+ /* TBD: check event_type instead of assuming NEW_REQUEST */
+
+ switch (eh->fop_type) {
+@SWITCH_BODY@
+
+ default:
+ printf ("unknown fop %u\n", eh->fop_type);
+ }
+
+done:
+ *old_meta = new_meta;
+ *old_data = new_data;
+ return recognized;
+}
diff --git a/xlators/experimental/fdl/src/recon.c b/xlators/experimental/fdl/src/recon.c
new file mode 100644
index 00000000000..14168a011e0
--- /dev/null
+++ b/xlators/experimental/fdl/src/recon.c
@@ -0,0 +1,89 @@
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/mman.h>
+
+#include "glusterfs.h"
+#include "fd.h"
+#include "syncop.h"
+#include "glfs-internal.h"
+
+#define GFAPI_SUCCESS 0
+
+extern int recon_execute (glfs_t *, char **, char **);
+
+/*
+ * Replay a journal against a volume: argv[1] is the volfile, argv[2] the
+ * journal metadata file and argv[3] the journal data file.
+ *
+ * A glfs instance is initialized from the volfile, both journal files are
+ * mapped read-only (currently with a fixed 1 MiB length), and recon_execute
+ * is called repeatedly until it returns zero.  Setting RECON_DEBUG in the
+ * environment sends full logging to stderr; otherwise logging is discarded.
+ * Returns EXIT_FAILURE on a usage, setup, open or mmap error.
+ */
+int
+main (int argc, char **argv)
+{
+        glfs_t  *fs;
+        int     ret;
+        int     meta_fd = (-1);
+        char    *meta_buf = NULL;
+        int     data_fd = (-1);
+        char    *data_buf = NULL;
+
+        /* Without this, a missing argument would be passed on as NULL. */
+        if (argc < 4) {
+                fprintf (stderr,
+                         "Usage: %s <volfile> <meta-file> <data-file>\n",
+                         (argc > 0) ? argv[0] : "recon");
+                return EXIT_FAILURE;
+        }
+
+        fs = glfs_new ("whocares");
+        if (!fs) {
+                fprintf (stderr, "glfs_new failed\n");
+                return EXIT_FAILURE;
+        }
+
+        if (getenv("RECON_DEBUG")) {
+                ret = glfs_set_logging (fs, "/dev/stderr", 7);
+        }
+        else {
+                ret = glfs_set_logging (fs, "/dev/null", 0);
+        }
+
+        if (ret != GFAPI_SUCCESS) {
+                fprintf (stderr, "glfs_set_logging failed (%d)\n", errno);
+                return EXIT_FAILURE;
+        }
+
+        ret = glfs_set_volfile (fs, argv[1]);
+        if (ret != GFAPI_SUCCESS) {
+                fprintf (stderr, "glfs_set_volfile failed (%d)\n", errno);
+                return EXIT_FAILURE;
+        }
+
+        ret = glfs_init (fs);
+        if (ret != GFAPI_SUCCESS) {
+                fprintf (stderr, "glfs_init failed (%d)\n", errno);
+                return EXIT_FAILURE;
+        }
+
+        meta_fd = open (argv[2], O_RDONLY);
+        if (meta_fd < 0) {
+                perror ("open");
+                return EXIT_FAILURE;
+        }
+
+        /* TBD: get proper length */
+        meta_buf = mmap (NULL, 1048576, PROT_READ, MAP_PRIVATE, meta_fd, 0);
+        if (meta_buf == MAP_FAILED) {
+                perror ("mmap");
+                return EXIT_FAILURE;
+        }
+
+        data_fd = open (argv[3], O_RDONLY);
+        if (data_fd < 0) {
+                perror ("open");
+                return EXIT_FAILURE;
+        }
+
+        /* TBD: get proper length */
+        data_buf = mmap (NULL, 1048576, PROT_READ, MAP_PRIVATE, data_fd, 0);
+        if (data_buf == MAP_FAILED) {
+                perror ("mmap");
+                return EXIT_FAILURE;
+        }
+
+        /* recon_execute advances both cursors; zero ends the replay. */
+        for (;;) {
+                if (!recon_execute(fs,&meta_buf,&data_buf)) {
+                        break;
+                }
+        }
+
+        return EXIT_SUCCESS;
+}