summaryrefslogtreecommitdiffstats
path: root/xlators/experimental/fdl/src/fdl-tmpl.c
diff options
context:
space:
mode:
authorJeff Darcy <jdarcy@redhat.com>2016-02-08 13:30:49 -0500
committerJeff Darcy <jdarcy@redhat.com>2016-02-13 05:13:07 -0800
commitc458433041aafb48ae6d6e5fcf3e1e737dc3fda3 (patch)
tree33a03ca0c1f5faf58419de2c4ff4532752ddfb07 /xlators/experimental/fdl/src/fdl-tmpl.c
parentda33097c3d6492e3b468b4347e47c70828fb4320 (diff)
experimental: add fdl (Full Data Logging) translator
NSR needs logging that is different than our existing changelog in several ways: * Full data, not just metadata * Pre-op, not post-op * High performance * Supports the concept of time-bounded "terms" Others (for example EC) might need the same thing. This patch adds such a translator. It also adds code to dump the resulting journals, and to replay them using syncops, plus (very rudimentary) tests for all of the above. Change-Id: I29680a1b4e0a9e7d5a8497fef302c46434b86636 Signed-off-by: Jeff Darcy <jdarcy@redhat.com> Reviewed-on: http://review.gluster.org/12450 Smoke: Gluster Build System <jenkins@build.gluster.com> CentOS-regression: Gluster Build System <jenkins@build.gluster.com> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
Diffstat (limited to 'xlators/experimental/fdl/src/fdl-tmpl.c')
-rw-r--r--xlators/experimental/fdl/src/fdl-tmpl.c506
1 files changed, 506 insertions, 0 deletions
diff --git a/xlators/experimental/fdl/src/fdl-tmpl.c b/xlators/experimental/fdl/src/fdl-tmpl.c
new file mode 100644
index 00000000000..8fcc6a8d6ff
--- /dev/null
+++ b/xlators/experimental/fdl/src/fdl-tmpl.c
@@ -0,0 +1,506 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include "call-stub.h"
+#include "iatt.h"
+#include "defaults.h"
+#include "syscall.h"
+#include "xlator.h"
+#include "jnl-types.h"
+
+/* TBD: make tunable */
+#define META_FILE_SIZE (1 << 20)
+#define DATA_FILE_SIZE (1 << 24)
+
+enum gf_fdl {
+ gf_fdl_mt_fdl_private_t = gf_common_mt_end + 1,
+ gf_fdl_mt_end
+};
+
+typedef struct {
+ char *type;
+ off_t size;
+ char *path;
+ int fd;
+ void * ptr;
+ off_t max_offset;
+} log_obj_t;
+
+typedef struct {
+ struct list_head reqs;
+ pthread_mutex_t req_lock;
+ pthread_cond_t req_cond;
+ char *log_dir;
+ pthread_t worker;
+ gf_boolean_t should_stop;
+ gf_boolean_t change_term;
+ log_obj_t meta_log;
+ log_obj_t data_log;
+ int term;
+ int first_term;
+} fdl_private_t;
+
+void
+fdl_enqueue (xlator_t *this, call_stub_t *stub)
+{
+ fdl_private_t *priv = this->private;
+
+ pthread_mutex_lock (&priv->req_lock);
+ list_add_tail (&stub->list, &priv->reqs);
+ pthread_mutex_unlock (&priv->req_lock);
+
+ pthread_cond_signal (&priv->req_cond);
+}
+
+#pragma generate
+
+char *
+fdl_open_term_log (xlator_t *this, log_obj_t *obj, int term)
+{
+ fdl_private_t *priv = this->private;
+ int ret;
+ char * ptr = NULL;
+
+ /*
+ * Use .jnl instead of .log so that we don't get test info (mistakenly)
+ * appended to our journal files.
+ */
+ if (this->ctx->cmd_args.log_ident) {
+ ret = gf_asprintf (&obj->path, "%s/%s-%s-%d.jnl",
+ priv->log_dir, this->ctx->cmd_args.log_ident,
+ obj->type, term);
+ }
+ else {
+ ret = gf_asprintf (&obj->path, "%s/fubar-%s-%d.jnl",
+ priv->log_dir, obj->type, term);
+ }
+ if ((ret <= 0) || !obj->path) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to construct log-file path");
+ goto err;
+ }
+
+ gf_log (this->name, GF_LOG_INFO, "opening %s (size %ld)",
+ obj->path, obj->size);
+
+ obj->fd = open (obj->path, O_RDWR|O_CREAT|O_TRUNC, 0666);
+ if (obj->fd < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to open log file (%s)", strerror(errno));
+ goto err;
+ }
+
+#if !defined(GF_BSD_HOST_OS)
+ /*
+ * NetBSD can just go die in a fire. Even though it claims to support
+ * fallocate/posix_fallocate they don't actually *do* anything so the
+ * file size remains zero. Then mmap succeeds anyway, but any access
+ * to the mmap'ed region will segfault. It would be acceptable for
+ * fallocate to do what it says, for mmap to fail, or for access to
+ * extend the file. NetBSD managed to hit the trifecta of Getting
+ * Everything Wrong, and debugging in that environment to get this far
+ * has already been painful enough (systems I worked on in 1990 were
+ * better that way). We'll fall through to the lseek/write method, and
+ * performance will be worse, and TOO BAD.
+ */
+ if (sys_fallocate(obj->fd,0,0,obj->size) < 0)
+#endif
+ {
+ gf_log (this->name, GF_LOG_WARNING,
+ "failed to fallocate space for log file");
+ /* Have to do this the ugly page-faulty way. */
+ (void) sys_lseek (obj->fd, obj->size-1, SEEK_SET);
+ (void) sys_write (obj->fd, "", 1);
+ }
+
+ ptr = mmap (NULL, obj->size, PROT_WRITE, MAP_SHARED, obj->fd, 0);
+ if (ptr == MAP_FAILED) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to mmap log (%s)",
+ strerror(errno));
+ goto err;
+ }
+
+ obj->ptr = ptr;
+ obj->max_offset = 0;
+ return ptr;
+
+err:
+ if (obj->fd >= 0) {
+ sys_close (obj->fd);
+ obj->fd = (-1);
+ }
+ if (obj->path) {
+ GF_FREE (obj->path);
+ obj->path = NULL;
+ }
+ return ptr;
+}
+
+void
+fdl_close_term_log (xlator_t *this, log_obj_t *obj)
+{
+ fdl_private_t *priv = this->private;
+
+ if (obj->ptr) {
+ (void) munmap (obj->ptr, obj->size);
+ obj->ptr = NULL;
+ }
+
+ if (obj->fd >= 0) {
+ gf_log (this->name, GF_LOG_INFO,
+ "truncating term %d %s journal to %ld",
+ priv->term, obj->type, obj->max_offset);
+ if (sys_ftruncate(obj->fd,obj->max_offset) < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "failed to truncate journal (%s)",
+ strerror(errno));
+ }
+ sys_close (obj->fd);
+ obj->fd = (-1);
+ }
+
+ if (obj->path) {
+ GF_FREE (obj->path);
+ obj->path = NULL;
+ }
+}
+
+gf_boolean_t
+fdl_change_term (xlator_t *this, char **meta_ptr, char **data_ptr)
+{
+ fdl_private_t *priv = this->private;
+
+ fdl_close_term_log (this, &priv->meta_log);
+ fdl_close_term_log (this, &priv->data_log);
+
+ ++(priv->term);
+
+ *meta_ptr = fdl_open_term_log (this, &priv->meta_log, priv->term);
+ if (!*meta_ptr) {
+ return _gf_false;
+ }
+
+ *data_ptr = fdl_open_term_log (this, &priv->data_log, priv->term);
+ if (!*data_ptr) {
+ return _gf_false;
+ }
+
+ return _gf_true;
+}
+
+void *
+fdl_worker (void *arg)
+{
+ xlator_t *this = arg;
+ fdl_private_t *priv = this->private;
+ call_stub_t *stub;
+ char * meta_ptr = NULL;
+ off_t *meta_offset = &priv->meta_log.max_offset;
+ char * data_ptr = NULL;
+ off_t *data_offset = &priv->data_log.max_offset;
+ unsigned long base_as_ul;
+ void * msync_ptr;
+ size_t msync_len;
+ gf_boolean_t recycle;
+ void *err_label = &&err_unlocked;
+
+ priv->meta_log.type = "meta";
+ priv->meta_log.size = META_FILE_SIZE;
+ priv->meta_log.path = NULL;
+ priv->meta_log.fd = (-1);
+ priv->meta_log.ptr = NULL;
+
+ priv->data_log.type = "data";
+ priv->data_log.size = DATA_FILE_SIZE;
+ priv->data_log.path = NULL;
+ priv->data_log.fd = (-1);
+ priv->data_log.ptr = NULL;
+
+ /* TBD: initial term should come from persistent storage (e.g. etcd) */
+ priv->first_term = ++(priv->term);
+ meta_ptr = fdl_open_term_log (this, &priv->meta_log, priv->term);
+ if (!meta_ptr) {
+ goto *err_label;
+ }
+ data_ptr = fdl_open_term_log (this, &priv->data_log, priv->term);
+ if (!data_ptr) {
+ fdl_close_term_log (this, &priv->meta_log);
+ goto *err_label;
+ }
+
+ for (;;) {
+ pthread_mutex_lock (&priv->req_lock);
+ err_label = &&err_locked;
+ while (list_empty(&priv->reqs)) {
+ pthread_cond_wait (&priv->req_cond, &priv->req_lock);
+ if (priv->should_stop) {
+ goto *err_label;
+ }
+ if (priv->change_term) {
+ if (!fdl_change_term(this, &meta_ptr,
+ &data_ptr)) {
+ goto *err_label;
+ }
+ priv->change_term = _gf_false;
+ continue;
+ }
+ }
+ stub = list_entry (priv->reqs.next, call_stub_t, list);
+ list_del_init (&stub->list);
+ pthread_mutex_unlock (&priv->req_lock);
+ err_label = &&err_unlocked;
+ /*
+ * TBD: batch requests
+ *
+ * What we should do here is gather up *all* of the requests
+ * that have accumulated since we were last at this point,
+ * blast them all out in one big writev, and then dispatch them
+ * all before coming back for more. That maximizes throughput,
+ * at some cost to latency (due to queuing effects at the log
+ * stage). Note that we're likely to be above io-threads, so
+ * the dispatch itself will be parallelized (at further cost to
+ * latency). For now, we just do the simplest thing and handle
+ * one request all the way through before fetching the next.
+ *
+ * So, why mmap/msync instead of writev/fdatasync? Because it's
+ * faster. Much faster. So much faster that I half-suspect
+ * cheating, but it's more convenient for now than having to
+ * ensure that everything's page-aligned for O_DIRECT (the only
+ * alternative that still might avoid ridiculous levels of
+ * local-FS overhead).
+ *
+ * TBD: check that msync really does get our data to disk.
+ */
+ gf_log (this->name, GF_LOG_DEBUG,
+ "logging %u+%u bytes for op %d",
+ stub->jnl_meta_len, stub->jnl_data_len, stub->fop);
+ recycle = _gf_false;
+ if ((*meta_offset + stub->jnl_meta_len) > priv->meta_log.size) {
+ recycle = _gf_true;
+ }
+ if ((*data_offset + stub->jnl_data_len) > priv->data_log.size) {
+ recycle = _gf_true;
+ }
+ if (recycle && !fdl_change_term(this,&meta_ptr,&data_ptr)) {
+ goto *err_label;
+ }
+ meta_ptr = priv->meta_log.ptr;
+ data_ptr = priv->data_log.ptr;
+ gf_log (this->name, GF_LOG_DEBUG, "serializing to %p/%p",
+ meta_ptr + *meta_offset, data_ptr + *data_offset);
+ stub->serialize (stub, meta_ptr + *meta_offset,
+ data_ptr + *data_offset);
+ if (stub->jnl_meta_len > 0) {
+ base_as_ul = (unsigned long) (meta_ptr + *meta_offset);
+ msync_ptr = (void *) (base_as_ul & ~0x0fff);
+ msync_len = (size_t) (base_as_ul & 0x0fff);
+ if (msync (msync_ptr, msync_len+stub->jnl_meta_len,
+ MS_SYNC) < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "failed to log request meta (%s)",
+ strerror(errno));
+ }
+ *meta_offset += stub->jnl_meta_len;
+ }
+ if (stub->jnl_data_len > 0) {
+ base_as_ul = (unsigned long) (data_ptr + *data_offset);
+ msync_ptr = (void *) (base_as_ul & ~0x0fff);
+ msync_len = (size_t) (base_as_ul & 0x0fff);
+ if (msync (msync_ptr, msync_len+stub->jnl_data_len,
+ MS_SYNC) < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "failed to log request data (%s)",
+ strerror(errno));
+ }
+ *data_offset += stub->jnl_data_len;
+ }
+ call_resume (stub);
+ }
+
+err_locked:
+ pthread_mutex_unlock (&priv->req_lock);
+err_unlocked:
+ fdl_close_term_log (this, &priv->meta_log);
+ fdl_close_term_log (this, &priv->data_log);
+ return NULL;
+}
+
+int32_t
+fdl_ipc (call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata)
+{
+ fdl_private_t *priv = this->private;
+ dict_t *tdict;
+ int32_t gt_err = EIO;
+
+ switch (op) {
+
+ case FDL_IPC_CHANGE_TERM:
+ gf_log (this->name, GF_LOG_INFO, "got CHANGE_TERM op");
+ priv->change_term = _gf_true;
+ pthread_cond_signal (&priv->req_cond);
+ STACK_UNWIND_STRICT (ipc, frame, 0, 0, NULL);
+ break;
+
+ case FDL_IPC_GET_TERMS:
+ gf_log (this->name, GF_LOG_INFO, "got GET_TERMS op");
+ tdict = dict_new ();
+ if (!tdict) {
+ gt_err = ENOMEM;
+ goto gt_done;
+ }
+ if (dict_set_int32(tdict,"first",priv->first_term) != 0) {
+ goto gt_done;
+ }
+ if (dict_set_int32(tdict,"last",priv->term) != 0) {
+ goto gt_done;
+ }
+ gt_err = 0;
+ gt_done:
+ if (gt_err) {
+ STACK_UNWIND_STRICT (ipc, frame, -1, gt_err, NULL);
+ } else {
+ STACK_UNWIND_STRICT (ipc, frame, 0, 0, tdict);
+ }
+ if (tdict) {
+ dict_unref (tdict);
+ }
+ break;
+
+ default:
+ STACK_WIND_TAIL (frame,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->ipc,
+ op, xdata);
+ }
+
+ return 0;
+}
+
+int
+fdl_init (xlator_t *this)
+{
+ fdl_private_t *priv = NULL;
+
+ priv = GF_CALLOC (1, sizeof (*priv), gf_fdl_mt_fdl_private_t);
+ if (!priv) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to allocate fdl_private");
+ goto err;
+ }
+
+ INIT_LIST_HEAD (&priv->reqs);
+ if (pthread_mutex_init (&priv->req_lock, NULL) != 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to initialize req_lock");
+ goto err;
+ }
+ if (pthread_cond_init (&priv->req_cond, NULL) != 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to initialize req_cond");
+ goto err;
+ }
+
+ GF_OPTION_INIT ("log-path", priv->log_dir, path, err);
+
+ if (pthread_create(&priv->worker,NULL,fdl_worker,this) != 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to start fdl_worker");
+ goto err;
+ }
+
+ /*
+ * The rest of the fop table is automatically generated, so this is a
+ * bit cleaner than messing with the generation to add a hand-written
+ * exception.
+ */
+ this->fops->ipc = fdl_ipc;
+
+ this->private = priv;
+ return 0;
+
+err:
+ if (priv) {
+ GF_FREE(priv);
+ }
+ return -1;
+}
+
+void
+fdl_fini (xlator_t *this)
+{
+ fdl_private_t *priv = this->private;
+
+ if (priv) {
+ priv->should_stop = _gf_true;
+ pthread_cond_signal (&priv->req_cond);
+ pthread_join (priv->worker, NULL);
+ GF_FREE(priv);
+ }
+}
+
+int
+fdl_reconfigure (xlator_t *this, dict_t *options)
+{
+ fdl_private_t *priv = this->private;
+
+ GF_OPTION_RECONF ("log_dir", priv->log_dir, options, path, out);
+ /* TBD: react if it changed */
+
+out:
+ return 0;
+}
+
+int32_t
+mem_acct_init (xlator_t *this)
+{
+ int ret = -1;
+
+ GF_VALIDATE_OR_GOTO ("fdl", this, out);
+
+ ret = xlator_mem_acct_init (this, gf_fdl_mt_end + 1);
+
+ if (ret != 0) {
+ gf_log (this->name, GF_LOG_ERROR, "Memory accounting init"
+ "failed");
+ return ret;
+ }
+out:
+ return ret;
+}
+
+class_methods_t class_methods = {
+ .init = fdl_init,
+ .fini = fdl_fini,
+ .reconfigure = fdl_reconfigure,
+ .notify = default_notify,
+};
+
+struct volume_options options[] = {
+ { .key = {"log-path"},
+ .type = GF_OPTION_TYPE_PATH,
+ .default_value = DEFAULT_LOG_FILE_DIRECTORY,
+ .description = "Directory for FDL files."
+ },
+ { .key = {NULL} },
+};
+
+struct xlator_cbks cbks = {
+ .release = default_release,
+ .releasedir = default_releasedir,
+ .forget = default_forget,
+};