Diffstat (limited to 'xlators/cluster/nsr-server/src/nsr.c')
-rw-r--r--  xlators/cluster/nsr-server/src/nsr.c | 682
1 file changed, 682 insertions(+), 0 deletions(-)
diff --git a/xlators/cluster/nsr-server/src/nsr.c b/xlators/cluster/nsr-server/src/nsr.c
new file mode 100644
index 000000000..3707b3003
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/nsr.c
@@ -0,0 +1,682 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "call-stub.h"
+#include "defaults.h"
+#include "xlator.h"
+#include "api/src/glfs.h"
+#include "api/src/glfs-internal.h"
+#include "run.h"
+#include "common-utils.h"
+
+
+#include "etcd-api.h"
+#include "nsr-internal.h"
+#include "../../nsr-recon/src/recon_driver.h"
+#include "../../nsr-recon/src/recon_xlator.h"
+
+
+#define GLUSTERD_DEFAULT_WORKDIR "/var/lib/glusterd"
+#define GLUSTERD_VOLUME_DIR_PREFIX "vols"
+#define GLUSTERD_BRICK_INFO_DIR "bricks"
+
+#define NSR_FLUSH_INTERVAL 5
+
+nsr_inode_ctx_t *
+nsr_get_inode_ctx (xlator_t *this, inode_t *inode)
+{
+ uint64_t ctx_int = 0LL;
+ nsr_inode_ctx_t *ctx_ptr;
+
+ if (__inode_ctx_get(inode,this,&ctx_int) == 0) {
+ ctx_ptr = (nsr_inode_ctx_t *)(long)ctx_int;
+ }
+ else {
+ ctx_ptr = GF_CALLOC (1, sizeof(*ctx_ptr),
+ gf_mt_nsr_inode_ctx_t);
+ if (ctx_ptr) {
+ ctx_int = (uint64_t)(long)ctx_ptr;
+ if (__inode_ctx_set(inode,this,&ctx_int) == 0) {
+ LOCK_INIT(&ctx_ptr->lock);
+ INIT_LIST_HEAD(&ctx_ptr->aqueue);
+ INIT_LIST_HEAD(&ctx_ptr->pqueue);
+ }
+ else {
+ GF_FREE(ctx_ptr);
+ ctx_ptr = NULL;
+ }
+ }
+
+ }
+
+ return ctx_ptr;
+}
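+
+/*
+ * Note that nsr_get_inode_ctx uses the unlocked __inode_ctx_* variants, so
+ * the caller is expected to hold inode->lock, just as nsr_get_fd_ctx below
+ * is called under fd->lock.  A minimal call-site sketch (the surrounding
+ * fop is hypothetical):
+ *
+ *      nsr_inode_ctx_t *ictx;
+ *
+ *      LOCK(&inode->lock);
+ *      ictx = nsr_get_inode_ctx(this, inode);
+ *      UNLOCK(&inode->lock);
+ *      if (!ictx) {
+ *              [unwind with ENOMEM]
+ *      }
+ */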
+
+nsr_fd_ctx_t *
+nsr_get_fd_ctx (xlator_t *this, fd_t *fd)
+{
+ uint64_t ctx_int = 0LL;
+ nsr_fd_ctx_t *ctx_ptr;
+
+ if (__fd_ctx_get(fd,this,&ctx_int) == 0) {
+ ctx_ptr = (nsr_fd_ctx_t *)(long)ctx_int;
+ }
+ else {
+ ctx_ptr = GF_CALLOC (1, sizeof(*ctx_ptr), gf_mt_nsr_fd_ctx_t);
+ if (ctx_ptr) {
+                        if (__fd_ctx_set(fd,this,(uint64_t)(long)ctx_ptr) == 0) {
+ INIT_LIST_HEAD(&ctx_ptr->dirty_list);
+ INIT_LIST_HEAD(&ctx_ptr->fd_list);
+ }
+ else {
+ GF_FREE(ctx_ptr);
+ ctx_ptr = NULL;
+ }
+ }
+
+ }
+
+ return ctx_ptr;
+}
+
+void
+nsr_mark_fd_dirty (xlator_t *this, nsr_local_t *local)
+{
+ fd_t *fd = local->fd;
+ nsr_fd_ctx_t *ctx_ptr;
+ nsr_dirty_list_t *dirty;
+ nsr_private_t *priv = this->private;
+
+ /*
+ * TBD: don't do any of this for O_SYNC/O_DIRECT writes.
+ * Unfortunately, that optimization requires that we distinguish
+ * between writev and other "write" calls, saving the original flags
+ * and checking them in the callback. Too much work for too little
+ * gain right now.
+ */
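+
+        /*
+         * One possible shape for that optimization, sketched here for
+         * illustration only (local->write_flags is hypothetical; no such
+         * field exists yet):
+         *
+         *      if (local->write_flags & (O_SYNC|O_DIRECT)) {
+         *              return;         [data is already durable]
+         *      }
+         */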
+
+ LOCK(&fd->lock);
+ ctx_ptr = nsr_get_fd_ctx(this,fd);
+ dirty = GF_CALLOC(1,sizeof(*dirty),gf_mt_nsr_dirty_t);
+ if (ctx_ptr && dirty) {
+ gf_log (this->name, GF_LOG_TRACE,
+ "marking fd %p as dirty (%p)", fd, dirty);
+ /* TBD: fill dirty->id from what changelog gave us */
+ list_add_tail(&dirty->links,&ctx_ptr->dirty_list);
+ if (list_empty(&ctx_ptr->fd_list)) {
+ /* Add a ref so _release doesn't get called. */
+ ctx_ptr->fd = fd_ref(fd);
+ LOCK(&priv->dirty_lock);
+ list_add_tail (&ctx_ptr->fd_list,
+ &priv->dirty_fds);
+ UNLOCK(&priv->dirty_lock);
+ }
+ }
+        else {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "could not mark %p dirty", fd);
+                /*
+                 * Don't free ctx_ptr here even on failure: if it's
+                 * non-NULL it has already been installed in the fd
+                 * context by nsr_get_fd_ctx, and nsr_release will free
+                 * it.  Freeing it now would leave a dangling pointer
+                 * behind in the fd context.
+                 */
+                if (dirty) {
+                        GF_FREE(dirty);
+                }
+        }
+ UNLOCK(&fd->lock);
+}
+
+#define NSR_TERM_XATTR "trusted.nsr.term"
+#define RECON_TERM_XATTR "trusted.nsr.recon-term"
+#define RECON_INDEX_XATTR "trusted.nsr.recon-index"
+#define NSR_REP_COUNT_XATTR "trusted.nsr.rep-count"
+#include "nsr-cg.c"
+
+uint8_t
+nsr_count_up_kids (nsr_private_t *priv)
+{
+ uint8_t retval = 0;
+ uint8_t i;
+
+ for (i = 0; i < priv->n_children; ++i) {
+ if (priv->kid_state & (1 << i)) {
+ ++retval;
+ }
+ }
+
+ return retval;
+}
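+
+/*
+ * Example: with n_children = 3 and kid_state = 0x5 (bits 0 and 2 set),
+ * the first and third children are up and this returns 2.
+ */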
+
+/*
+ * The fsync machinery looks a lot like that for any write call, but there are
+ * some important differences that are easy to miss. First, we don't care
+ * about the xdata that shows whether the call came from a leader or
+ * reconciliation process. If we're the leader we fan out; if we're not we
+ * don't. Second, we don't wait for followers before we issue the local call.
+ * The code generation system could be updated to handle this, and still might
+ * if we need to implement other "almost identical" paths (e.g. for open), but
+ * a copy is more readable as long as it's just one.
+ */
+
+int32_t
+nsr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ nsr_local_t *local = frame->local;
+ gf_boolean_t unwind;
+
+ LOCK(&frame->lock);
+ unwind = !--(local->call_count);
+ UNLOCK(&frame->lock);
+
+ if (unwind) {
+ STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf,
+ postbuf, xdata);
+ }
+ return 0;
+}
+
+int32_t
+nsr_fsync_local_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ nsr_dirty_list_t *dirty;
+ nsr_dirty_list_t *dtmp;
+ nsr_local_t *local = frame->local;
+
+ list_for_each_entry_safe (dirty, dtmp, &local->qlinks, links) {
+ gf_log (this->name, GF_LOG_TRACE,
+ "sending post-op on %p (%p)", local->fd, dirty);
+ GF_FREE(dirty);
+ }
+
+ return nsr_fsync_cbk (frame, cookie, this, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+}
+
+int32_t
+nsr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+ dict_t *xdata)
+{
+ nsr_private_t *priv = this->private;
+ nsr_local_t *local;
+ uint64_t ctx_int = 0LL;
+ nsr_fd_ctx_t *ctx_ptr;
+ xlator_list_t *trav;
+
+ local = mem_get0(this->local_pool);
+ if (!local) {
+ STACK_UNWIND_STRICT(fsync,frame,-1,ENOMEM,NULL,NULL,xdata);
+ return 0;
+ }
+ INIT_LIST_HEAD(&local->qlinks);
+ frame->local = local;
+
+ /* Move the dirty list from the fd to the fsync request. */
+ LOCK(&fd->lock);
+ if (__fd_ctx_get(fd,this,&ctx_int) == 0) {
+ ctx_ptr = (nsr_fd_ctx_t *)(long)ctx_int;
+ list_splice_init (&ctx_ptr->dirty_list,
+ &local->qlinks);
+ }
+ UNLOCK(&fd->lock);
+
+ /* Issue the local call. */
+ local->call_count = priv->leader ? priv->n_children : 1;
+ STACK_WIND (frame, nsr_fsync_local_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync,
+ fd, flags, xdata);
+
+ /* Issue remote calls if we're the leader. */
+ if (priv->leader) {
+ for (trav = this->children->next; trav; trav = trav->next) {
+ STACK_WIND (frame, nsr_fsync_cbk,
+                                    trav->xlator,
+                                    trav->xlator->fops->fsync,
+ fd, flags, xdata);
+ }
+ }
+
+ return 0;
+}
+
+int32_t
+nsr_getxattr_special (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
+{
+ dict_t *result;
+ uint8_t up;
+ nsr_private_t *priv = this->private;
+
+ if (!priv->leader) {
+ STACK_UNWIND_STRICT (getxattr, frame, -1, EREMOTE, NULL, NULL);
+ return 0;
+ }
+
+ if (!name || (strcmp(name,NSR_REP_COUNT_XATTR) != 0)) {
+ STACK_WIND_TAIL (frame,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->getxattr,
+ loc, name, xdata);
+ return 0;
+ }
+
+ result = dict_new();
+ if (!result) {
+ goto dn_failed;
+ }
+
+        up = nsr_count_up_kids(priv);
+ if (dict_set_uint32(result,NSR_REP_COUNT_XATTR,up) != 0) {
+ goto dsu_failed;
+ }
+
+        STACK_UNWIND_STRICT (getxattr, frame, 0, 0, result, NULL);
+        dict_unref(result);
+        return 0;
+
+dsu_failed:
+        dict_unref(result);
+dn_failed:
+ STACK_UNWIND_STRICT (getxattr, frame, -1, ENOMEM, NULL, NULL);
+ return 0;
+}
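+
+/*
+ * Sketch of a client-side query for the xattr above via gfapi (the fs
+ * handle and mount are hypothetical).  In this tree dict_set_uint32
+ * stores the value in its decimal-string form, so the buffer comes back
+ * as text:
+ *
+ *      char    buf[16];
+ *      ssize_t len;
+ *
+ *      len = glfs_getxattr (fs, "/", NSR_REP_COUNT_XATTR,
+ *                           buf, sizeof(buf) - 1);
+ *      if (len >= 0) {
+ *              buf[len] = '\0';
+ *      }
+ */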
+
+void
+nsr_flush_fd (xlator_t *this, nsr_fd_ctx_t *fd_ctx)
+{
+ nsr_dirty_list_t *dirty;
+ nsr_dirty_list_t *dtmp;
+
+ list_for_each_entry_safe (dirty, dtmp, &fd_ctx->dirty_list, links) {
+ gf_log (this->name, GF_LOG_TRACE,
+ "sending post-op on %p (%p)", fd_ctx->fd, dirty);
+ GF_FREE(dirty);
+ }
+
+ INIT_LIST_HEAD(&fd_ctx->dirty_list);
+}
+
+void *
+nsr_flush_thread (void *ctx)
+{
+ xlator_t *this = ctx;
+ nsr_private_t *priv = this->private;
+ struct list_head dirty_fds;
+ nsr_fd_ctx_t *fd_ctx;
+ nsr_fd_ctx_t *fd_tmp;
+
+ for (;;) {
+ /*
+ * We have to be very careful to avoid lock inversions here, so
+ * we can't just hold priv->dirty_lock while we take and
+ * release locks for each fd. Instead, we only hold dirty_lock
+ * at the beginning of each iteration, as we (effectively) make
+ * a copy of the current list head and then clear the original.
+ * This leads to four scenarios for adding the first entry to
+ * an fd and potentially putting it on the global list.
+ *
+ * (1) While we're asleep. No lock contention, it just gets
+ * added and will be processed on the next iteration.
+ *
+ * (2) After we've made a local copy, but before we've started
+ * processing that fd. The new entry will be added to the
+ * fd (under its lock), and we'll process it on the current
+ * iteration.
+ *
+ * (3) While we're processing the fd. They'll block on the fd
+ * lock, then see that the list is empty and put it on the
+ * global list. We'll process it here on the next
+ * iteration.
+ *
+ * (4) While we're working, but after we've processed that fd.
+ * Same as (1) as far as that fd is concerned.
+ */
+ INIT_LIST_HEAD(&dirty_fds);
+ LOCK(&priv->dirty_lock);
+ list_splice_init(&priv->dirty_fds,&dirty_fds);
+ UNLOCK(&priv->dirty_lock);
+
+ list_for_each_entry_safe (fd_ctx, fd_tmp, &dirty_fds, fd_list) {
+ LOCK(&fd_ctx->fd->lock);
+ nsr_flush_fd(this,fd_ctx);
+ list_del_init(&fd_ctx->fd_list);
+ UNLOCK(&fd_ctx->fd->lock);
+ fd_unref(fd_ctx->fd);
+ }
+
+ sleep(NSR_FLUSH_INTERVAL);
+ }
+
+ return NULL;
+}
+
+int32_t
+nsr_forget (xlator_t *this, inode_t *inode)
+{
+ uint64_t ctx = 0LL;
+
+ if ((inode_ctx_del(inode,this,&ctx) == 0) && ctx) {
+ GF_FREE((void *)(long)ctx);
+ }
+
+ return 0;
+}
+
+int32_t
+nsr_release (xlator_t *this, fd_t *fd)
+{
+ uint64_t ctx = 0LL;
+
+ if ((fd_ctx_del(fd,this,&ctx) == 0) && ctx) {
+ GF_FREE((void *)(long)ctx);
+ }
+
+ return 0;
+}
+
+struct xlator_cbks cbks = {
+ .forget = nsr_forget,
+ .release = nsr_release,
+};
+
+int
+nsr_reconfigure (xlator_t *this, dict_t *options)
+{
+ nsr_private_t *priv = this->private;
+
+ GF_OPTION_RECONF ("leader", priv->leader, options, bool, err);
+ return 0;
+
+err:
+ return -1;
+}
+
+int
+nsr_get_child_index (xlator_t *this, xlator_t *kid)
+{
+ xlator_list_t *trav;
+ int retval = -1;
+
+ for (trav = this->children; trav; trav = trav->next) {
+ ++retval;
+ if (trav->xlator == kid) {
+ return retval;
+ }
+ }
+
+ return -1;
+}
+
+/*
+ * Child notify handling is unreasonably FUBAR. Sometimes we'll get a
+ * CHILD_DOWN for a protocol/client child before we ever got a CHILD_UP for it.
+ * Other times we won't. Because it's effectively random (probably racy), we
+ * can't just maintain a count. We actually have to keep track of the state
+ * for each child separately, to filter out the bogus CHILD_DOWN events, and
+ * then generate counts on demand.
+ */
+int
+nsr_notify (xlator_t *this, int event, void *data, ...)
+{
+ nsr_private_t *priv = this->private;
+ int index;
+
+ switch (event) {
+ case GF_EVENT_CHILD_UP:
+ index = nsr_get_child_index(this,data);
+ if (index >= 0) {
+ priv->kid_state |= (1 << index);
+ gf_log (this->name, GF_LOG_INFO,
+ "got CHILD_UP for %s, now %u kids",
+ ((xlator_t *)data)->name,
+ nsr_count_up_kids(priv));
+ }
+ break;
+ case GF_EVENT_CHILD_DOWN:
+ index = nsr_get_child_index(this,data);
+ if (index >= 0) {
+ priv->kid_state &= ~(1 << index);
+ gf_log (this->name, GF_LOG_INFO,
+ "got CHILD_DOWN for %s, now %u kids",
+ ((xlator_t *)data)->name,
+ nsr_count_up_kids(priv));
+ }
+ break;
+ default:
+ ;
+ }
+
+ return default_notify(this,event,data);
+}
+
+
+extern void *nsr_leader_thread (void *);
+
+int32_t
+nsr_init (xlator_t *this)
+{
+ xlator_list_t *remote;
+ xlator_list_t *local;
+ nsr_private_t *priv = NULL;
+ xlator_list_t *trav;
+ pthread_t kid;
+ uuid_t tmp_uuid;
+        char            *my_name = NULL;
+        char            *recon_file = NULL;
+        char            *recon_pid_file = NULL;
+        char            *ptr = NULL;
+ char *volname;
+ extern xlator_t global_xlator;
+ glusterfs_ctx_t *oldctx = global_xlator.ctx;
+ runner_t runner = {0,};
+ int32_t ret = -1;
+ struct stat buf;
+
+ /*
+ * Any fop that gets special treatment has to be patched in here,
+ * because the compiled-in table is produced by the code generator and
+ * only contains generated functions. Note that we have to go through
+ * this->fops because of some dynamic-linking strangeness; modifying
+ * the static table doesn't work.
+ */
+ this->fops->getxattr = nsr_getxattr_special;
+ this->fops->fsync = nsr_fsync;
+
+ local = this->children;
+ if (!local) {
+ gf_log (this->name, GF_LOG_ERROR, "no local subvolume");
+ goto err;
+ }
+
+ remote = local->next;
+ if (!remote) {
+ gf_log (this->name, GF_LOG_ERROR, "no remote subvolumes");
+ goto err;
+ }
+
+ this->local_pool = mem_pool_new (nsr_local_t, 128);
+ if (!this->local_pool) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to create nsr_local_t pool");
+ goto err;
+ }
+
+ priv = GF_CALLOC (1, sizeof(*priv), gf_mt_nsr_private_t);
+ if (!priv) {
+ gf_log (this->name, GF_LOG_ERROR, "could not allocate priv");
+ goto err;
+ }
+
+        /* Fence I/O until leader election has completed. */
+ priv->fence_io = 1;
+
+ for (trav = this->children; trav; trav = trav->next) {
+ ++(priv->n_children);
+ }
+
+ LOCK_INIT(&priv->dirty_lock);
+ INIT_LIST_HEAD(&priv->dirty_fds);
+
+ this->private = priv;
+
+ GF_OPTION_INIT ("etcd-servers", priv->etcd_servers, str, err);
+ if (!priv->etcd_servers) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "etcd-servers option not set");
+ goto err;
+ }
+ priv->vol_uuid = "temporary";
+ uuid_generate(tmp_uuid);
+ priv->brick_uuid = strdup(uuid_utoa(tmp_uuid));
+ priv->term_uuid = "nsr-term";
+ gf_log (this->name, GF_LOG_INFO,
+ "brick_uuid = %s\n", priv->brick_uuid);
+
+ GF_OPTION_INIT ("my-name", my_name, str, err);
+ if (!my_name) {
+                gf_log (this->name, GF_LOG_ERROR, "my-name option not set");
+ goto err;
+ }
+ GF_OPTION_INIT ("vol-name", volname, str, err);
+ if (!volname) {
+                gf_log (this->name, GF_LOG_ERROR, "vol-name option not set");
+ goto err;
+ }
+
+        recon_file = GF_CALLOC (1, PATH_MAX + strlen(my_name) + strlen("con") + 1,
+                                gf_mt_nsr_private_t);
+        recon_pid_file = GF_CALLOC (1, PATH_MAX + strlen(my_name) + strlen("recon") + 1,
+                                    gf_mt_nsr_private_t);
+ if ((!recon_file) || (!recon_pid_file)) {
+ gf_log (this->name, GF_LOG_ERROR, "could not allocate reconciliation file name");
+ goto err;
+ }
+        for (ptr = strchr (my_name, '/'); ptr; ptr = strchr (ptr + 1, '/')) {
+                *ptr = '-';
+        }
+
+        sprintf(recon_file,"%s/%s/%s/%s/",GLUSTERD_DEFAULT_WORKDIR,
+ GLUSTERD_VOLUME_DIR_PREFIX,
+ volname,
+ GLUSTERD_BRICK_INFO_DIR);
+ strcat(recon_file, my_name);
+ strcat(recon_file, "-nsr-recon.vol");
+
+        sprintf(recon_pid_file,"%s/%s/%s/%s/",GLUSTERD_DEFAULT_WORKDIR,
+ GLUSTERD_VOLUME_DIR_PREFIX,
+ volname,
+ "run");
+ strcat(recon_pid_file, my_name);
+ strcat(recon_pid_file, "-recon.pid");
+
+ priv->vol_file = GF_CALLOC (1,PATH_MAX + strlen(my_name) + strlen("con") +1, gf_mt_nsr_private_t);
+ if (!priv->vol_file) {
+                gf_log (this->name, GF_LOG_ERROR, "could not allocate volfile name");
+ goto err;
+ }
+ sprintf(priv->vol_file,"%s/%s/%s/%s/",
+ GLUSTERD_DEFAULT_WORKDIR,
+ GLUSTERD_VOLUME_DIR_PREFIX,
+ volname,
+ GLUSTERD_BRICK_INFO_DIR);
+ strcat(priv->vol_file, "con:");
+ strcat(priv->vol_file, my_name);
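+
+        /*
+         * Example (hypothetical brick "host1:/bricks/b1" in volume
+         * "testvol"; the loop above has already rewritten the '/'s to
+         * '-'s): the names built here come out as
+         *
+         *      recon_file:     /var/lib/glusterd/vols/testvol/bricks/host1:-bricks-b1-nsr-recon.vol
+         *      recon_pid_file: /var/lib/glusterd/vols/testvol/run/host1:-bricks-b1-recon.pid
+         *      vol_file:       /var/lib/glusterd/vols/testvol/bricks/con:host1:-bricks-b1
+         */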
+
+ if (pthread_create(&kid,NULL,nsr_flush_thread,this) != 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "could not start flush thread");
+ /* TBD: treat this as a fatal error? */
+ }
+
+        /*
+         * Start the recon process, then start the leader thread.
+         *
+         * REVIEW: logs belong in /var/log, not /tmp.
+         */
+ if (!stat(priv->vol_file, &buf)) {
+ runinit (&runner);
+ runner_add_args(&runner, "/usr/local/sbin/glusterfs",
+ "-f", recon_file,
+ "-p", recon_pid_file,
+ "-l", "/tmp/reconciliation.log",
+ NULL);
+ ret = runner_run (&runner);
+ if (ret != 0) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "could not start reconciliation process");
+ goto err;
+ }
+
+                /* TBD: replace this sleep with a real check that the
+                 * recon process is running. */
+ sleep(2);
+ priv->nsr_recon_start = _gf_true;
+ }
+
+
+ (void)pthread_create(&kid,NULL,nsr_leader_thread,this);
+ while (priv->leader_inited == 0) {
+ sleep(1);
+ }
+        /*
+         * Calling glfs_new changes global_xlator.ctx, even though THIS
+         * still points to global_xlator.  That causes problems later in
+         * the main thread, when gf_log_dump_graph tries to use the FILE
+         * after we've mucked with it and gets a segfault in
+         * __fprintf_chk.  We can avoid all that by undoing the damage
+         * before we continue.
+         */
+ global_xlator.ctx = oldctx;
+
+        GF_FREE(recon_file);
+        GF_FREE(recon_pid_file);
+
+        return 0;
+
+err:
+        GF_FREE(recon_file);
+        GF_FREE(recon_pid_file);
+        if (priv) {
+                GF_FREE(priv);
+        }
+        return -1;
+}
+
+
+void
+nsr_fini (xlator_t *this)
+{
+}
+
+class_methods_t class_methods = {
+ .init = nsr_init,
+ .fini = nsr_fini,
+ .reconfigure = nsr_reconfigure,
+ .notify = nsr_notify,
+};
+
+struct volume_options options[] = {
+ { .key = {"leader"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "false",
+ .description = "Start in the leader role. This is only for "
+ "bootstrapping the code, and should go away when we "
+ "have real leader election."
+ },
+ { .key ={"vol-name"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "volume name"
+ },
+ { .key = {"my-name"},
+ .type = GF_OPTION_TYPE_STR,
+          .description = "brick name, in the form host:/path"
+ },
+ { .key = {"etcd-servers"},
+ .type = GF_OPTION_TYPE_STR,
+          .description = "comma-separated list of etcd servers"
+ },
+ { .key = {NULL} },
+};
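+
+/*
+ * Illustrative volfile fragment (all names hypothetical) showing how this
+ * translator is expected to be wired up: the local subvolume first, then
+ * one entry per remote replica, matching the local/remote split checked
+ * in nsr_init:
+ *
+ *      volume testvol-nsr-server
+ *          type cluster/nsr-server
+ *          option leader true
+ *          option vol-name testvol
+ *          option my-name host1:/bricks/b1
+ *          option etcd-servers http://127.0.0.1:2379,http://127.0.0.2:2379
+ *          subvolumes testvol-local testvol-client-1 testvol-client-2
+ *      end-volume
+ */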