/* Copyright (c) 2013 Red Hat, Inc. This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser General Public License, version 3 or any later version (LGPLv3 or later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free Software Foundation. */ #ifndef _CONFIG_H #define _CONFIG_H #include "config.h" #endif #include "call-stub.h" #include "defaults.h" #include "xlator.h" #include "api/src/glfs.h" #include "api/src/glfs-internal.h" #include "run.h" #include "common-utils.h" #include "syncop.h" #include "etcd-api.h" #include "nsr-internal.h" #include "../../nsr-recon/src/recon_driver.h" #include "../../nsr-recon/src/recon_xlator.h" #define GLUSTERD_DEFAULT_WORKDIR "/var/lib/glusterd" #define GLUSTERD_VOLUME_DIR_PREFIX "vols" #define GLUSTERD_BRICK_INFO_DIR "bricks" #define NSR_FLUSH_INTERVAL 5 nsr_inode_ctx_t * nsr_get_inode_ctx (xlator_t *this, inode_t *inode) { uint64_t ctx_int = 0LL; nsr_inode_ctx_t *ctx_ptr; if (__inode_ctx_get(inode,this,&ctx_int) == 0) { ctx_ptr = (nsr_inode_ctx_t *)(long)ctx_int; } else { ctx_ptr = GF_CALLOC (1, sizeof(*ctx_ptr), gf_mt_nsr_inode_ctx_t); if (ctx_ptr) { ctx_int = (uint64_t)(long)ctx_ptr; if (__inode_ctx_set(inode,this,&ctx_int) == 0) { LOCK_INIT(&ctx_ptr->lock); INIT_LIST_HEAD(&ctx_ptr->aqueue); INIT_LIST_HEAD(&ctx_ptr->pqueue); } else { GF_FREE(ctx_ptr); ctx_ptr = NULL; } } } return ctx_ptr; } nsr_fd_ctx_t * nsr_get_fd_ctx (xlator_t *this, fd_t *fd) { uint64_t ctx_int = 0LL; nsr_fd_ctx_t *ctx_ptr; if (__fd_ctx_get(fd,this,&ctx_int) == 0) { ctx_ptr = (nsr_fd_ctx_t *)(long)ctx_int; } else { ctx_ptr = GF_CALLOC (1, sizeof(*ctx_ptr), gf_mt_nsr_fd_ctx_t); if (ctx_ptr) { if (__fd_ctx_set(fd,this,(uint64_t)ctx_ptr) == 0) { INIT_LIST_HEAD(&ctx_ptr->dirty_list); INIT_LIST_HEAD(&ctx_ptr->fd_list); } else { GF_FREE(ctx_ptr); ctx_ptr = NULL; } } } return ctx_ptr; } void nsr_mark_fd_dirty (xlator_t *this, nsr_local_t *local) { fd_t *fd = local->fd; nsr_fd_ctx_t *ctx_ptr; nsr_dirty_list_t *dirty; nsr_private_t *priv = this->private; /* * TBD: don't do any of this for O_SYNC/O_DIRECT writes. * Unfortunately, that optimization requires that we distinguish * between writev and other "write" calls, saving the original flags * and checking them in the callback. Too much work for too little * gain right now. */ LOCK(&fd->lock); ctx_ptr = nsr_get_fd_ctx(this,fd); dirty = GF_CALLOC(1,sizeof(*dirty),gf_mt_nsr_dirty_t); if (ctx_ptr && dirty) { gf_log (this->name, GF_LOG_TRACE, "marking fd %p as dirty (%p)", fd, dirty); /* TBD: fill dirty->id from what changelog gave us */ list_add_tail(&dirty->links,&ctx_ptr->dirty_list); if (list_empty(&ctx_ptr->fd_list)) { /* Add a ref so _release doesn't get called. */ ctx_ptr->fd = fd_ref(fd); LOCK(&priv->dirty_lock); list_add_tail (&ctx_ptr->fd_list, &priv->dirty_fds); UNLOCK(&priv->dirty_lock); } } else { gf_log (this->name, GF_LOG_ERROR, "could not mark %p dirty", fd); if (ctx_ptr) { GF_FREE(ctx_ptr); } if (dirty) { GF_FREE(dirty); } } UNLOCK(&fd->lock); } #define NSR_TERM_XATTR "trusted.nsr.term" #define NSR_INDEX_XATTR "trusted.nsr.index" #define RECON_TERM_XATTR "trusted.nsr.recon-term" #define RECON_INDEX_XATTR "trusted.nsr.recon-index" #define NSR_REP_COUNT_XATTR "trusted.nsr.rep-count" #include "nsr-cg.c" uint8_t nsr_count_up_kids (nsr_private_t *priv) { uint8_t retval = 0; uint8_t i; for (i = 0; i < priv->n_children; ++i) { if (priv->kid_state & (1 << i)) { ++retval; } } return retval; } /* * The fsync machinery looks a lot like that for any write call, but there are * some important differences that are easy to miss. First, we don't care * about the xdata that shows whether the call came from a leader or * reconciliation process. If we're the leader we fan out; if we're not we * don't. Second, we don't wait for followers before we issue the local call. * The code generation system could be updated to handle this, and still might * if we need to implement other "almost identical" paths (e.g. for open), but * a copy is more readable as long as it's just one. */ int32_t nsr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { nsr_local_t *local = frame->local; gf_boolean_t unwind; LOCK(&frame->lock); unwind = !--(local->call_count); UNLOCK(&frame->lock); if (unwind) { STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata); } return 0; } int32_t nsr_fsync_local_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { nsr_dirty_list_t *dirty; nsr_dirty_list_t *dtmp; nsr_local_t *local = frame->local; list_for_each_entry_safe (dirty, dtmp, &local->qlinks, links) { gf_log (this->name, GF_LOG_TRACE, "sending post-op on %p (%p)", local->fd, dirty); GF_FREE(dirty); } return nsr_fsync_cbk (frame, cookie, this, op_ret, op_errno, prebuf, postbuf, xdata); } int32_t nsr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, dict_t *xdata) { nsr_private_t *priv = this->private; nsr_local_t *local; uint64_t ctx_int = 0LL; nsr_fd_ctx_t *ctx_ptr; xlator_list_t *trav; local = mem_get0(this->local_pool); if (!local) { STACK_UNWIND_STRICT(fsync,frame,-1,ENOMEM,NULL,NULL,xdata); return 0; } INIT_LIST_HEAD(&local->qlinks); frame->local = local; /* Move the dirty list from the fd to the fsync request. */ LOCK(&fd->lock); if (__fd_ctx_get(fd,this,&ctx_int) == 0) { ctx_ptr = (nsr_fd_ctx_t *)(long)ctx_int; list_splice_init (&ctx_ptr->dirty_list, &local->qlinks); } UNLOCK(&fd->lock); /* Issue the local call. */ local->call_count = priv->leader ? priv->n_children : 1; STACK_WIND (frame, nsr_fsync_local_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync, fd, flags, xdata); /* Issue remote calls if we're the leader. */ if (priv->leader) { for (trav = this->children->next; trav; trav = trav->next) { STACK_WIND (frame, nsr_fsync_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync, fd, flags, xdata); } } return 0; } int32_t nsr_getxattr_special (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name, dict_t *xdata) { dict_t *result; nsr_private_t *priv = this->private; if (!priv->leader) { STACK_UNWIND_STRICT (getxattr, frame, -1, EREMOTE, NULL, NULL); return 0; } if (!name || (strcmp(name,NSR_REP_COUNT_XATTR) != 0)) { STACK_WIND_TAIL (frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->getxattr, loc, name, xdata); return 0; } result = dict_new(); if (!result) { goto dn_failed; } priv->up_children = nsr_count_up_kids(this->private); if (dict_set_uint32(result,NSR_REP_COUNT_XATTR,priv->up_children) != 0) { goto dsu_failed; } STACK_UNWIND_STRICT (getxattr, frame, 0, 0, result, NULL); dict_destroy(result); return 0; dsu_failed: dict_destroy(result); dn_failed: STACK_UNWIND_STRICT (getxattr, frame, -1, ENOMEM, NULL, NULL); return 0; } void nsr_flush_fd (xlator_t *this, nsr_fd_ctx_t *fd_ctx) { nsr_dirty_list_t *dirty; nsr_dirty_list_t *dtmp; list_for_each_entry_safe (dirty, dtmp, &fd_ctx->dirty_list, links) { gf_log (this->name, GF_LOG_TRACE, "sending post-op on %p (%p)", fd_ctx->fd, dirty); GF_FREE(dirty); } INIT_LIST_HEAD(&fd_ctx->dirty_list); } void * nsr_flush_thread (void *ctx) { xlator_t *this = ctx; nsr_private_t *priv = this->private; struct list_head dirty_fds; nsr_fd_ctx_t *fd_ctx; nsr_fd_ctx_t *fd_tmp; int ret; for (;;) { /* * We have to be very careful to avoid lock inversions here, so * we can't just hold priv->dirty_lock while we take and * release locks for each fd. Instead, we only hold dirty_lock * at the beginning of each iteration, as we (effectively) make * a copy of the current list head and then clear the original. * This leads to four scenarios for adding the first entry to * an fd and potentially putting it on the global list. * * (1) While we're asleep. No lock contention, it just gets * added and will be processed on the next iteration. * * (2) After we've made a local copy, but before we've started * processing that fd. The new entry will be added to the * fd (under its lock), and we'll process it on the current * iteration. * * (3) While we're processing the fd. They'll block on the fd * lock, then see that the list is empty and put it on the * global list. We'll process it here on the next * iteration. * * (4) While we're working, but after we've processed that fd. * Same as (1) as far as that fd is concerned. */ INIT_LIST_HEAD(&dirty_fds); LOCK(&priv->dirty_lock); list_splice_init(&priv->dirty_fds,&dirty_fds); UNLOCK(&priv->dirty_lock); list_for_each_entry_safe (fd_ctx, fd_tmp, &dirty_fds, fd_list) { ret = syncop_fsync(FIRST_CHILD(this),fd_ctx->fd,0); if (ret) { gf_log (this->name, GF_LOG_WARNING, "failed to fsync %p (%d)", fd_ctx->fd, -ret); } LOCK(&fd_ctx->fd->lock); nsr_flush_fd(this,fd_ctx); list_del_init(&fd_ctx->fd_list); UNLOCK(&fd_ctx->fd->lock); fd_unref(fd_ctx->fd); } sleep(NSR_FLUSH_INTERVAL); } return NULL; } int32_t nsr_forget (xlator_t *this, inode_t *inode) { uint64_t ctx = 0LL; if ((inode_ctx_del(inode,this,&ctx) == 0) && ctx) { GF_FREE((void *)(long)ctx); } return 0; } int32_t nsr_release (xlator_t *this, fd_t *fd) { uint64_t ctx = 0LL; if ((fd_ctx_del(fd,this,&ctx) == 0) && ctx) { GF_FREE((void *)(long)ctx); } return 0; } struct xlator_cbks cbks = { .forget = nsr_forget, .release = nsr_release, }; int nsr_reconfigure (xlator_t *this, dict_t *options) { nsr_private_t *priv = this->private; GF_OPTION_RECONF ("leader", priv->leader, options, bool, err); gf_log (this->name, GF_LOG_INFO, "reconfigure called. setting priv->leader to %d\n", priv->leader); return 0; err: return -1; } int nsr_get_child_index (xlator_t *this, xlator_t *kid) { xlator_list_t *trav; int retval = -1; for (trav = this->children; trav; trav = trav->next) { ++retval; if (trav->xlator == kid) { return retval; } } return -1; } /* * Child notify handling is unreasonably FUBAR. Sometimes we'll get a * CHILD_DOWN for a protocol/client child before we ever got a CHILD_UP for it. * Other times we won't. Because it's effectively random (probably racy), we * can't just maintain a count. We actually have to keep track of the state * for each child separately, to filter out the bogus CHILD_DOWN events, and * then generate counts on demand. */ int nsr_notify (xlator_t *this, int event, void *data, ...) { nsr_private_t *priv = this->private; int index; switch (event) { case GF_EVENT_CHILD_UP: index = nsr_get_child_index(this,data); if (index >= 0) { priv->kid_state |= (1 << index); priv->up_children = nsr_count_up_kids(priv); gf_log (this->name, GF_LOG_INFO, "got CHILD_UP for %s, now %u kids", ((xlator_t *)data)->name, priv->up_children); if (priv->nsr_recon_start == _gf_true) { nsr_recon_notify_event_add_child(priv, index); } } break; case GF_EVENT_CHILD_DOWN: index = nsr_get_child_index(this,data); if (index >= 0) { priv->kid_state &= ~(1 << index); priv->up_children = nsr_count_up_kids(priv); gf_log (this->name, GF_LOG_INFO, "got CHILD_DOWN for %s, now %u kids", ((xlator_t *)data)->name, priv->up_children); } break; default: ; } return default_notify(this,event,data); } int32_t mem_acct_init (xlator_t *this) { int ret = -1; GF_VALIDATE_OR_GOTO ("nsr", this, out); ret = xlator_mem_acct_init (this, gf_mt_nsr_end + 1); if (ret != 0) { gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" "failed"); return ret; } out: return ret; } extern void *nsr_leader_thread (void *); void nsr_deallocate_priv (nsr_private_t *priv) { if (!priv) { return; } if (priv->leader_key) { GF_FREE(priv->leader_key); } if (priv->term_key) { GF_FREE(priv->term_key); } GF_FREE(priv); } int32_t nsr_init (xlator_t *this) { xlator_list_t *remote; xlator_list_t *local; nsr_private_t *priv = NULL; xlator_list_t *trav; pthread_t kid; uuid_t tmp_uuid; char *my_name = NULL, *morph_name = NULL, *recon_file = NULL, *recon_pid_file = NULL, *ptr = NULL; char *volname; extern xlator_t global_xlator; glusterfs_ctx_t *oldctx = global_xlator.ctx; runner_t runner = {0,}; int32_t ret = -1; struct stat buf; char *recon_log = NULL, *recon_log_dir = NULL; /* * Any fop that gets special treatment has to be patched in here, * because the compiled-in table is produced by the code generator and * only contains generated functions. Note that we have to go through * this->fops because of some dynamic-linking strangeness; modifying * the static table doesn't work. */ this->fops->getxattr = nsr_getxattr_special; this->fops->fsync = nsr_fsync; local = this->children; if (!local) { gf_log (this->name, GF_LOG_ERROR, "no local subvolume"); goto err; } remote = local->next; if (!remote) { gf_log (this->name, GF_LOG_ERROR, "no remote subvolumes"); goto err; } this->local_pool = mem_pool_new (nsr_local_t, 128); if (!this->local_pool) { gf_log (this->name, GF_LOG_ERROR, "failed to create nsr_local_t pool"); goto err; } priv = GF_CALLOC (1, sizeof(*priv), gf_mt_nsr_private_t); if (!priv) { gf_log (this->name, GF_LOG_ERROR, "could not allocate priv"); goto err; } // set this so that unless leader election is done, IO is fenced priv->fence_io = 1; for (trav = this->children; trav; trav = trav->next) { ++(priv->n_children); } LOCK_INIT(&priv->dirty_lock); LOCK_INIT(&priv->index_lock); INIT_LIST_HEAD(&priv->dirty_fds); this->private = priv; GF_OPTION_INIT ("etcd-servers", priv->etcd_servers, str, err); if (!priv->etcd_servers) { gf_log (this->name, GF_LOG_ERROR, "etcd servers not generated. ???"); goto err; } GF_OPTION_INIT ("quorum-percent", priv->quorum_pct, percent, err); GF_OPTION_INIT ("subvol-uuid", priv->subvol_uuid, str, err); gf_log (this->name, GF_LOG_INFO, "subvol_uuid = %s", priv->subvol_uuid); if (gf_asprintf(&priv->leader_key,"%s:leader",priv->subvol_uuid) <= 0) { gf_log (this->name, GF_LOG_ERROR, "could not generate leader key"); goto err; } if (gf_asprintf(&priv->term_key,"%s:term",priv->subvol_uuid) <= 0) { gf_log (this->name, GF_LOG_ERROR, "could not generate term key"); goto err; } uuid_generate(tmp_uuid); priv->brick_uuid = strdup(uuid_utoa(tmp_uuid)); gf_log (this->name, GF_LOG_INFO, "brick_uuid = %s\n", priv->brick_uuid); GF_OPTION_INIT ("my-name", my_name, str, err); if (!my_name) { gf_log (this->name, GF_LOG_ERROR, "brick name not generated. ???"); goto err; } GF_OPTION_INIT ("vol-name", volname, str, err); if (!volname) { gf_log (this->name, GF_LOG_ERROR, "vol name not generated. ???"); goto err; } morph_name = GF_CALLOC (1, strlen(my_name) + 1, gf_mt_nsr_private_t); strcpy(morph_name, my_name); recon_file = GF_CALLOC (1,PATH_MAX + strlen(morph_name) + strlen("con") +1, gf_mt_nsr_private_t); recon_pid_file = GF_CALLOC (1,PATH_MAX + strlen(morph_name) + strlen("recon") +1, gf_mt_nsr_private_t); if ((!recon_file) || (!recon_pid_file)) { gf_log (this->name, GF_LOG_ERROR, "could not allocate reconciliation file name"); goto err; } ptr = strchr (morph_name, '/'); while (ptr) { *ptr = '-'; ptr = strchr (morph_name, '/'); } sprintf(recon_file,"/%s/%s/%s/%s/",GLUSTERD_DEFAULT_WORKDIR, GLUSTERD_VOLUME_DIR_PREFIX, volname, GLUSTERD_BRICK_INFO_DIR); strcat(recon_file, morph_name); strcat(recon_file, "-nsr-recon.vol"); sprintf(recon_pid_file,"/%s/%s/%s/%s/",GLUSTERD_DEFAULT_WORKDIR, GLUSTERD_VOLUME_DIR_PREFIX, volname, "run"); strcat(recon_pid_file, morph_name); strcat(recon_pid_file, "-recon.pid"); priv->vol_file = GF_CALLOC (1,PATH_MAX + strlen(morph_name) + strlen("con") +1, gf_mt_nsr_private_t); if (!priv->vol_file) { gf_log (this->name, GF_LOG_ERROR, "could not allocate reconciliation file name"); goto err; } sprintf(priv->vol_file,"%s/%s/%s/%s/", GLUSTERD_DEFAULT_WORKDIR, GLUSTERD_VOLUME_DIR_PREFIX, volname, GLUSTERD_BRICK_INFO_DIR); strcat(priv->vol_file, "con:"); strcat(priv->vol_file, morph_name); if (pthread_create(&kid,NULL,nsr_flush_thread,this) != 0) { gf_log (this->name, GF_LOG_ERROR, "could not start flush thread"); /* TBD: treat this as a fatal error? */ } // Start the recon process. Then start the leader thread. /* * REVIEW * Logs belong in /var/log not /tmp. */ ret = mkdir (NSR_LOG_DIR, 0777); if (ret != 0) { if (errno != EEXIST) { gf_log (this->name, GF_LOG_ERROR, "Couldn't create" " nsr log directory (%s)", strerror (errno)); goto err; } } recon_log_dir = GF_CALLOC (1, strlen (NSR_LOG_DIR) + strlen(morph_name) + 2, gf_mt_nsr_private_t); if (!recon_log_dir) { gf_log (this->name, GF_LOG_ERROR, "Couldn't allocate recon log " "dir name"); goto err; } sprintf (recon_log_dir, "%s/%s", NSR_LOG_DIR, morph_name); ret = mkdir (recon_log_dir, 0777); if (ret != 0){ if (errno != EEXIST) { gf_log (this->name, GF_LOG_ERROR, "Couldn't create brick log dir (%s)", strerror (errno)); goto err; } } recon_log = GF_CALLOC (1, strlen (recon_log_dir)+ strlen ("reconciliation.log") + 2, gf_mt_nsr_private_t); if (!recon_log) { gf_log (this->name, GF_LOG_ERROR, "Couldn't allocate recon log" " file name"); goto err; } sprintf (recon_log, "%s/reconciliation.log", recon_log_dir); if (!stat(priv->vol_file, &buf)) { runinit (&runner); runner_add_args(&runner, SBIN_DIR "/glusterfs", "-f", recon_file, "-p", recon_pid_file, "-l", recon_log, NULL); ret = runner_run (&runner); if (ret != 0) { gf_log (this->name, GF_LOG_ERROR, "could not exec reconciliation process %s", SBIN_DIR "/glusterfs"); goto err; } // TBD - convert this to make sure recon process runs sleep(2); priv->nsr_recon_start = _gf_true; } (void)pthread_create(&kid,NULL,nsr_recon_notify_thread,this); while (priv->recon_notify_inited == 0) { sleep(1); } if (pthread_create(&kid,NULL,nsr_leader_thread,this) != 0) { gf_log (this->name, GF_LOG_ERROR, "failed to start leader thread"); } while (priv->leader_inited == 0) { sleep(1); } /* * Calling glfs_new changes old->ctx, even if THIS still points * to global_xlator. That causes problems later in the main * thread, when gf_log_dump_graph tries to use the FILE after * we've mucked with it and gets a segfault in __fprintf_chk. * We can avoid all that by undoing the damage before we * continue. */ global_xlator.ctx = oldctx; return 0; err: nsr_deallocate_priv(priv); return -1; } void nsr_fini (xlator_t *this) { nsr_deallocate_priv(this->private); } class_methods_t class_methods = { .init = nsr_init, .fini = nsr_fini, .reconfigure = nsr_reconfigure, .notify = nsr_notify, }; struct volume_options options[] = { { .key = {"leader"}, .type = GF_OPTION_TYPE_BOOL, .default_value = "false", .description = "Start in the leader role. This is only for " "bootstrapping the code, and should go away when we " "have real leader election." }, { .key ={"vol-name"}, .type = GF_OPTION_TYPE_STR, .description = "volume name" }, { .key = {"my-name"}, .type = GF_OPTION_TYPE_STR, .description = "brick name in form of host:/path" }, { .key = {"etcd-servers"}, .type = GF_OPTION_TYPE_STR, .description = "list of comma seperated etc servers" }, { .key = {"subvol-uuid"}, .type = GF_OPTION_TYPE_STR, .description = "UUID for this NSR (sub)volume" }, { .key = {"quorum-percent"}, .type = GF_OPTION_TYPE_PERCENT, .default_value = "50.0", .description = "percentage of rep_count-1 that must be up" }, { .key = {NULL} }, };