diff options
Diffstat (limited to 'xlators')
166 files changed, 88606 insertions, 0 deletions
diff --git a/xlators/Makefile.am b/xlators/Makefile.am new file mode 100644 index 000000000..2abb52194 --- /dev/null +++ b/xlators/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = cluster storage protocol performance debug features encryption mount + +CLEANFILES = diff --git a/xlators/bindings/Makefile.am b/xlators/bindings/Makefile.am new file mode 100644 index 000000000..f77665802 --- /dev/null +++ b/xlators/bindings/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = $(BINDINGS_SUBDIRS) diff --git a/xlators/bindings/python/Makefile.am b/xlators/bindings/python/Makefile.am new file mode 100644 index 000000000..af437a64d --- /dev/null +++ b/xlators/bindings/python/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = src diff --git a/xlators/bindings/python/src/Makefile.am b/xlators/bindings/python/src/Makefile.am new file mode 100644 index 000000000..c0b9141c6 --- /dev/null +++ b/xlators/bindings/python/src/Makefile.am @@ -0,0 +1,19 @@ + +xlator_PROGRAMS = python.so + +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/bindings + +python_PYTHON = gluster.py glustertypes.py glusterstack.py + +pythondir = $(xlatordir)/python + +python_so_SOURCES = python.c + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall \ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles \ + $(PYTHON_CPPLAGS) -DGLUSTER_PYTHON_PATH=\"$(pythondir)\" + +AM_LDFLAGS = $(PYTHON_LDFLAGS) + +CLEANFILES = + diff --git a/xlators/bindings/python/src/gluster.py b/xlators/bindings/python/src/gluster.py new file mode 100644 index 000000000..ee0eb1310 --- /dev/null +++ b/xlators/bindings/python/src/gluster.py @@ -0,0 +1,47 @@ +# Copyright (c) 2007 Chris AtLee <chris@atlee.ca> +# This file is part of GlusterFS. +# +# GlusterFS is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published +# by the Free Software Foundation; either version 3 of the License, +# or (at your option) any later version. +# +# GlusterFS is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see +# <http://www.gnu.org/licenses/>. +from ctypes import * +from glustertypes import * +from glusterstack import * +import sys +import inspect + +libglusterfs = CDLL("libglusterfs.so") +_gf_log = libglusterfs._gf_log +_gf_log.restype = c_int32 +_gf_log.argtypes = [c_char_p, c_char_p, c_char_p, c_int32, c_int, c_char_p] + +gf_log_loglevel = c_int.in_dll(libglusterfs, "gf_log_loglevel") + +GF_LOG_NONE = 0 +GF_LOG_CRITICAL = 1 +GF_LOG_ERROR = 2 +GF_LOG_WARNING = 3 +GF_LOG_DEBUG = 4 + +def gf_log(module, level, fmt, *params): + if level <= gf_log_loglevel: + frame = sys._getframe(1) + _gf_log(module, frame.f_code.co_filename, frame.f_code.co_name, + frame.f_lineno, level, fmt, *params) + +class ComplexTranslator(object): + def __init__(self, xlator): + self.xlator = xlator_t.from_address(xlator) + + def __getattr__(self, item): + return getattr(self.xlator, item) diff --git a/xlators/bindings/python/src/glusterstack.py b/xlators/bindings/python/src/glusterstack.py new file mode 100644 index 000000000..ba24c8165 --- /dev/null +++ b/xlators/bindings/python/src/glusterstack.py @@ -0,0 +1,55 @@ +# Copyright (c) 2007 Chris AtLee <chris@atlee.ca> +# This file is part of GlusterFS. +# +# GlusterFS is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published +# by the Free Software Foundation; either version 3 of the License, +# or (at your option) any later version. +# +# GlusterFS is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see +# <http://www.gnu.org/licenses/>. +from ctypes import * +from glustertypes import * + +libc = CDLL("libc.so.6") +calloc = libc.calloc +calloc.argtypes = [c_int, c_int] +calloc.restype = c_void_p + +# TODO: Can these be done in C somehow? +def stack_wind(frame, rfn, obj, fn, *params): + """Frame is a frame object""" + _new = cast(calloc(1, sizeof(call_frame_t)), POINTER(call_frame_t)) + _new[0].root = frame.root + _new[0].next = frame.root[0].frames.next + _new[0].prev = pointer(frame.root[0].frames) + if frame.root[0].frames.next: + frame.root[0].frames.next[0].prev = _new + frame.root[0].frames.next = _new + _new[0].this = obj + # TODO: Type checking like tmp_cbk? + _new[0].ret = rfn + _new[0].parent = pointer(frame) + _new[0].cookie = cast(_new, c_void_p) + # TODO: Initialize lock + #_new.lock.init() + frame.ref_count += 1 + fn(_new, obj, *params) + +def stack_unwind(frame, *params): + """Frame is a frame object""" + fn = frame[0].ret + parent = frame[0].parent[0] + parent.ref_count -= 1 + + op_ret = params[0] + op_err = params[1] + params = params[2:] + fn(parent, call_frame_t.from_address(frame[0].cookie), parent.this, + op_ret, op_err, *params) diff --git a/xlators/bindings/python/src/glustertypes.py b/xlators/bindings/python/src/glustertypes.py new file mode 100644 index 000000000..e9069d07c --- /dev/null +++ b/xlators/bindings/python/src/glustertypes.py @@ -0,0 +1,167 @@ +# Copyright (c) 2007 Chris AtLee <chris@atlee.ca> +# This file is part of GlusterFS. +# +# GlusterFS is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published +# by the Free Software Foundation; either version 3 of the License, +# or (at your option) any later version. +# +# GlusterFS is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see +# <http://www.gnu.org/licenses/>. +from ctypes import * +import collections + +# +# Forward declaration of some gluster types +# +class call_frame_t(Structure): + pass + +class call_ctx_t(Structure): + pass + +class call_pool_t(Structure): + pass + +class xlator_t(Structure): + def _getFirstChild(self): + return self.children[0].xlator + firstChild = property(_getFirstChild) + +class xlator_list_t(Structure): + pass + +class xlator_fops(Structure): + pass + +class xlator_mops(Structure): + pass + +class glusterfs_ctx_t(Structure): + pass + +class list_head(Structure): + pass + +class dict_t(Structure): + pass + +class inode_table_t(Structure): + pass + +class fd_t(Structure): + pass + +class iovec(Structure): + _fields_ = [ + ("iov_base", c_void_p), + ("iov_len", c_size_t), + ] + + def __init__(self, s): + self.iov_base = cast(c_char_p(s), c_void_p) + self.iov_len = len(s) + + def getBytes(self): + return string_at(self.iov_base, self.iov_len) + +# This is a pthread_spinlock_t +# TODO: what happens to volatile-ness? +gf_lock_t = c_int + +uid_t = c_uint32 +gid_t = c_uint32 +pid_t = c_int32 + +off_t = c_int64 + +# +# Function pointer types +# +ret_fn_t = CFUNCTYPE(c_int32, POINTER(call_frame_t), POINTER(call_frame_t), + POINTER(xlator_t), c_int32, c_int32) + +fini_fn_t = CFUNCTYPE(None, POINTER(xlator_t)) +init_fn_t = CFUNCTYPE(c_int32, POINTER(xlator_t)) +event_notify_fn_t = CFUNCTYPE(c_int32, POINTER(xlator_t), c_int32, c_void_p) + +list_head._fields_ = [ + ("next", POINTER(list_head)), + ("prev", POINTER(list_head)), + ] + +call_frame_t._fields_ = [ + ("root", POINTER(call_ctx_t)), + ("parent", POINTER(call_frame_t)), + ("next", POINTER(call_frame_t)), + ("prev", POINTER(call_frame_t)), + ("local", c_void_p), + ("this", POINTER(xlator_t)), + ("ret", ret_fn_t), + ("ref_count", c_int32), + ("lock", gf_lock_t), + ("cookie", c_void_p), + ("op", c_int32), + ("type", c_int8), + ] + +call_ctx_t._fields_ = [ + ("all_frames", list_head), + ("trans", c_void_p), + ("pool", call_pool_t), + ("unique", c_uint64), + ("state", c_void_p), + ("uid", uid_t), + ("gid", gid_t), + ("pid", pid_t), + ("frames", call_frame_t), + ("req_refs", POINTER(dict_t)), + ("rsp_refs", POINTER(dict_t)), + ] + +xlator_t._fields_ = [ + ("name", c_char_p), + ("type", c_char_p), + ("next", POINTER(xlator_t)), + ("prev", POINTER(xlator_t)), + ("parent", POINTER(xlator_t)), + ("children", POINTER(xlator_list_t)), + ("fops", POINTER(xlator_fops)), + ("mops", POINTER(xlator_mops)), + ("fini", fini_fn_t), + ("init", init_fn_t), + ("notify", event_notify_fn_t), + ("options", POINTER(dict_t)), + ("ctx", POINTER(glusterfs_ctx_t)), + ("itable", POINTER(inode_table_t)), + ("ready", c_char), + ("private", c_void_p), + ] + +xlator_list_t._fields_ = [ + ("xlator", POINTER(xlator_t)), + ("next", POINTER(xlator_list_t)), + ] + +fop_functions = collections.defaultdict(lambda: c_void_p) +fop_function_names = ['lookup', 'forget', 'stat', 'fstat', 'chmod', 'fchmod', + 'chown', 'fchown', 'truncate', 'ftruncate', 'utimens', 'access', + 'readlink', 'mknod', 'mkdir', 'unlink', 'rmdir', 'symlink', + 'rename', 'link', 'create', 'open', 'readv', 'writev', 'flush', + 'close', 'fsync', 'opendir', 'readdir', 'closedir', 'fsyncdir', + 'statfs', 'setxattr', 'getxattr', 'removexattr', 'lk', 'writedir', + # TODO: Call backs? + ] + +fop_writev_t = CFUNCTYPE(c_int32, POINTER(call_frame_t), POINTER(xlator_t), + POINTER(fd_t), POINTER(iovec), c_int32, + off_t) + +fop_functions['writev'] = fop_writev_t +xlator_fops._fields_ = [(f, fop_functions[f]) for f in fop_function_names] diff --git a/xlators/bindings/python/src/python.c b/xlators/bindings/python/src/python.c new file mode 100644 index 000000000..739ef7329 --- /dev/null +++ b/xlators/bindings/python/src/python.c @@ -0,0 +1,235 @@ +/* + Copyright (c) 2007 Chris AtLee <chris@atlee.ca> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#include <Python.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "xlator.h" +#include "logging.h" +#include "defaults.h" + +typedef struct +{ + char *scriptname; + PyObject *pXlator; + PyObject *pScriptModule; + PyObject *pGlusterModule; + PyThreadState *pInterp; + + PyObject *pFrameType, *pVectorType, *pFdType; +} python_private_t; + +int32_t +python_writev (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + struct iovec *vector, + int32_t count, + off_t offset) +{ + python_private_t *priv = (python_private_t *)this->private; + gf_log("python", GF_LOG_DEBUG, "In writev"); + if (PyObject_HasAttrString(priv->pXlator, "writev")) + { + + PyObject *retval = PyObject_CallMethod(priv->pXlator, "writev", + "O O O i l", + PyObject_CallMethod(priv->pFrameType, "from_address", "O&", PyLong_FromVoidPtr, frame), + PyObject_CallMethod(priv->pFdType, "from_address", "O&", PyLong_FromVoidPtr, fd), + PyObject_CallMethod(priv->pVectorType, "from_address", "O&", PyLong_FromVoidPtr, vector), + count, + offset); + if (PyErr_Occurred()) + { + PyErr_Print(); + } + Py_XDECREF(retval); + } + else + { + return default_writev(frame, this, fd, vector, count, offset); + } + return 0; +} + +struct xlator_fops fops = { + .writev = python_writev +}; + +struct xlator_mops mops = { +}; + +static PyObject * +AnonModule_FromFile (const char* fname) +{ + // Get the builtins + PyThreadState* pThread = PyThreadState_Get(); + PyObject *pBuiltins = pThread->interp->builtins; + + if (PyErr_Occurred()) + { + PyErr_Print(); + return NULL; + } + + // Create a new dictionary for running code in + PyObject *pModuleDict = PyDict_New(); + PyDict_SetItemString(pModuleDict, "__builtins__", pBuiltins); + Py_INCREF(pBuiltins); + + // Run the file in the new context + FILE* fp = fopen(fname, "r"); + PyRun_File(fp, fname, Py_file_input, pModuleDict, pModuleDict); + fclose(fp); + if (PyErr_Occurred()) + { + PyErr_Print(); + Py_DECREF(pModuleDict); + Py_DECREF(pBuiltins); + return NULL; + } + + // Create an object to hold the new context + PyRun_String("class ModuleWrapper(object):\n\tpass\n", Py_single_input, pModuleDict, pModuleDict); + if (PyErr_Occurred()) + { + PyErr_Print(); + Py_DECREF(pModuleDict); + Py_DECREF(pBuiltins); + return NULL; + } + PyObject *pModule = PyRun_String("ModuleWrapper()", Py_eval_input, pModuleDict, pModuleDict); + if (PyErr_Occurred()) + { + PyErr_Print(); + Py_DECREF(pModuleDict); + Py_DECREF(pBuiltins); + Py_XDECREF(pModule); + return NULL; + } + + // Set the new context's dictionary to the one we used to run the code + // inside + PyObject_SetAttrString(pModule, "__dict__", pModuleDict); + if (PyErr_Occurred()) + { + PyErr_Print(); + Py_DECREF(pModuleDict); + Py_DECREF(pBuiltins); + Py_DECREF(pModule); + return NULL; + } + + return pModule; +} + +int32_t +init (xlator_t *this) +{ + // This is ok to call more than once per process + Py_InitializeEx(0); + + if (!this->children) { + gf_log ("python", GF_LOG_ERROR, + "FATAL: python should have exactly one child"); + return -1; + } + + python_private_t *priv = CALLOC (sizeof (python_private_t), 1); + ERR_ABORT (priv); + + data_t *scriptname = dict_get (this->options, "scriptname"); + if (scriptname) { + priv->scriptname = data_to_str(scriptname); + } else { + gf_log("python", GF_LOG_ERROR, + "FATAL: python requires the scriptname parameter"); + return -1; + } + + priv->pInterp = Py_NewInterpreter(); + + // Adjust python's path + PyObject *syspath = PySys_GetObject("path"); + PyObject *path = PyString_FromString(GLUSTER_PYTHON_PATH); + PyList_Append(syspath, path); + Py_DECREF(path); + + gf_log("python", GF_LOG_DEBUG, + "Loading gluster module"); + + priv->pGlusterModule = PyImport_ImportModule("gluster"); + if (PyErr_Occurred()) + { + PyErr_Print(); + return -1; + } + + priv->pFrameType = PyObject_GetAttrString(priv->pGlusterModule, "call_frame_t"); + priv->pFdType = PyObject_GetAttrString(priv->pGlusterModule, "fd_t"); + priv->pVectorType = PyObject_GetAttrString(priv->pGlusterModule, "iovec"); + + gf_log("python", GF_LOG_DEBUG, "Loading script...%s", priv->scriptname); + + priv->pScriptModule = AnonModule_FromFile(priv->scriptname); + if (!priv->pScriptModule || PyErr_Occurred()) + { + gf_log("python", GF_LOG_ERROR, "Error loading %s", priv->scriptname); + PyErr_Print(); + return -1; + } + + if (!PyObject_HasAttrString(priv->pScriptModule, "xlator")) + { + gf_log("python", GF_LOG_ERROR, "%s does not have a xlator attribute", priv->scriptname); + return -1; + } + gf_log("python", GF_LOG_DEBUG, "Instantiating translator"); + priv->pXlator = PyObject_CallMethod(priv->pScriptModule, "xlator", "O&", + PyLong_FromVoidPtr, this); + if (PyErr_Occurred() || !priv->pXlator) + { + PyErr_Print(); + return -1; + } + + this->private = priv; + + gf_log ("python", GF_LOG_DEBUG, "python xlator loaded"); + return 0; +} + +void +fini (xlator_t *this) +{ + python_private_t *priv = (python_private_t*)(this->private); + Py_DECREF(priv->pXlator); + Py_DECREF(priv->pScriptModule); + Py_DECREF(priv->pGlusterModule); + Py_DECREF(priv->pFrameType); + Py_DECREF(priv->pFdType); + Py_DECREF(priv->pVectorType); + Py_EndInterpreter(priv->pInterp); + return; +} diff --git a/xlators/bindings/python/src/testxlator.py b/xlators/bindings/python/src/testxlator.py new file mode 100644 index 000000000..507455c85 --- /dev/null +++ b/xlators/bindings/python/src/testxlator.py @@ -0,0 +1,56 @@ +# Copyright (c) 2007 Chris AtLee <chris@atlee.ca> +# This file is part of GlusterFS. +# +# GlusterFS is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published +# by the Free Software Foundation; either version 3 of the License, +# or (at your option) any later version. +# +# GlusterFS is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see +# <http://www.gnu.org/licenses/>. + +""" +This is a test translator written in python. + +Important things to note: + This file must be import-able from glusterfsd. This probably means + setting PYTHONPATH to where this file is located. + + This file must have a top-level xlator class object that will be + used to instantiate individual translators. +""" +from gluster import * + +class MyXlator(ComplexTranslator): + name = "MyXlator" + def writev_cbk(self, frame, cookie, op_ret, op_errno, buf): + stack_unwind(frame, op_ret, op_errno, buf) + return 0 + + def writev(self, frame, fd, vector, count, offset): + gf_log(self.name, GF_LOG_WARNING, "writev %i bytes", vector.iov_len) + # TODO: Use cookie to pass this to writev_cbk + old_count = vector.iov_len + + data = vector.getBytes().encode("zlib") + + vector = iovec(data) + gf_log(self.name, GF_LOG_WARNING, "writev %i bytes", vector.iov_len) + + @ret_fn_t + def rfn(frame, prev, this, op_ret, op_errno, *params): + if len(params) == 0: + params = [0] + return self.writev_cbk(frame, prev, old_count, op_errno, *params) + + stack_wind(frame, rfn, self.firstChild, + self.firstChild[0].fops[0].writev, fd, vector, count, offset) + return 0 + +xlator = MyXlator diff --git a/xlators/cluster/Makefile.am b/xlators/cluster/Makefile.am new file mode 100644 index 000000000..a6ddb3564 --- /dev/null +++ b/xlators/cluster/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = unify stripe afr dht ha map + +CLEANFILES = diff --git a/xlators/cluster/afr/Makefile.am b/xlators/cluster/afr/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/xlators/cluster/afr/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/cluster/afr/src/Makefile.am b/xlators/cluster/afr/src/Makefile.am new file mode 100644 index 000000000..1bde9e5ba --- /dev/null +++ b/xlators/cluster/afr/src/Makefile.am @@ -0,0 +1,20 @@ +xlator_LTLIBRARIES = afr.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster + +afr_la_LDFLAGS = -module -avoidversion + +afr_la_SOURCES = afr.c afr-dir-read.c afr-dir-write.c afr-inode-read.c afr-inode-write.c afr-transaction.c afr-self-heal-data.c afr-self-heal-common.c afr-self-heal-metadata.c afr-self-heal-entry.c +afr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-self-heal-common.h + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = + +uninstall-local: + rm -f $(DESTDIR)$(xlatordir)/replicate.so + +install-data-hook: + ln -sf afr.so $(DESTDIR)$(xlatordir)/replicate.so
\ No newline at end of file diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c new file mode 100644 index 000000000..0c65ca852 --- /dev/null +++ b/xlators/cluster/afr/src/afr-dir-read.c @@ -0,0 +1,345 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" + +#include "afr.h" + + +int32_t +afr_opendir_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + fd_t *fd) +{ + afr_local_t * local = NULL; + + int call_count = -1; + + LOCK (&frame->lock); + { + local = frame->local; + + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + AFR_STACK_UNWIND (frame, local->op_ret, + local->op_errno, local->fd); + } + + return 0; +} + + +int32_t +afr_opendir (call_frame_t *frame, xlator_t *this, + loc_t *loc, fd_t *fd) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + + int child_count = 0; + int i = 0; + + int ret = -1; + int call_count = -1; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + child_count = priv->child_count; + + ALLOC_OR_GOTO (local, afr_local_t, out); + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + frame->local = local; + local->fd = fd_ref (fd); + + call_count = local->call_count; + + for (i = 0; i < child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_opendir_cbk, + priv->children[i], + priv->children[i]->fops->opendir, + loc, fd); + + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, fd); + } + + return 0; +} + + +/** + * Common algorithm for directory read calls: + * + * - Try the fop on the first child that is up + * - if we have failed due to ENOTCONN: + * try the next child + * + * Applicable to: readdir + */ + +int32_t +afr_readdir_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + gf_dirent_t *buf) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int unwind = 1; + int last_tried = -1; + int this_try = -1; + + priv = this->private; + children = priv->children; + + local = frame->local; + + if (op_ret == -1) { + last_tried = local->cont.readdir.last_tried; + + if (all_tried (last_tried, priv->child_count)) { + goto out; + } + + this_try = ++local->cont.readdir.last_tried; + unwind = 0; + + STACK_WIND (frame, afr_readdir_cbk, + children[this_try], + children[this_try]->fops->readdir, + local->fd, local->cont.readdir.size, + local->cont.readdir.offset); + } + +out: + if (unwind) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, buf); + } + + return 0; +} + + +int32_t +afr_readdir (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset) +{ + afr_private_t * priv = NULL; + xlator_t ** children = NULL; + int call_child = 0; + afr_local_t *local = NULL; + + int ret = -1; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + children = priv->children; + + ALLOC_OR_GOTO (local, afr_local_t, out); + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + frame->local = local; + + call_child = afr_first_up_child (priv); + if (call_child == -1) { + op_errno = ENOTCONN; + gf_log (this->name, GF_LOG_ERROR, + "no child is up :("); + goto out; + } + + local->cont.readdir.last_tried = call_child; + + local->fd = fd_ref (fd); + local->cont.readdir.size = size; + local->cont.readdir.offset = offset; + + STACK_WIND (frame, afr_readdir_cbk, + children[call_child], children[call_child]->fops->readdir, + fd, size, offset); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + return 0; +} + + +int32_t +afr_getdents_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dir_entry_t *entry, int32_t count) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int unwind = 1; + int last_tried = -1; + int this_try = -1; + + priv = this->private; + children = priv->children; + + local = frame->local; + + if (op_ret == -1) { + last_tried = local->cont.getdents.last_tried; + + if (all_tried (last_tried, priv->child_count)) { + goto out; + } + + this_try = ++local->cont.getdents.last_tried; + unwind = 0; + + STACK_WIND (frame, afr_getdents_cbk, + children[this_try], + children[this_try]->fops->getdents, + local->fd, local->cont.getdents.size, + local->cont.getdents.offset, local->cont.getdents.flag); + } + +out: + if (unwind) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, entry, count); + } + + return 0; +} + + +int32_t +afr_getdents (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset, int32_t flag) +{ + afr_private_t * priv = NULL; + xlator_t ** children = NULL; + int call_child = 0; + afr_local_t *local = NULL; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + children = priv->children; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + call_child = afr_first_up_child (priv); + if (call_child == -1) { + op_errno = ENOTCONN; + gf_log (this->name, GF_LOG_ERROR, + "no child is up :("); + goto out; + } + + local->cont.getdents.last_tried = call_child; + + local->fd = fd_ref (fd); + + local->cont.getdents.size = size; + local->cont.getdents.offset = offset; + local->cont.getdents.flag = flag; + + frame->local = local; + + STACK_WIND (frame, afr_getdents_cbk, + children[call_child], children[call_child]->fops->getdents, + fd, size, offset, flag); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + + diff --git a/xlators/cluster/afr/src/afr-dir-read.h b/xlators/cluster/afr/src/afr-dir-read.h new file mode 100644 index 000000000..172ec3c90 --- /dev/null +++ b/xlators/cluster/afr/src/afr-dir-read.h @@ -0,0 +1,47 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __DIR_READ_H__ +#define __DIR_READ_H__ + + +int32_t +afr_opendir (call_frame_t *frame, xlator_t *this, + loc_t *loc, fd_t *fd); + +int32_t +afr_closedir (call_frame_t *frame, xlator_t *this, + fd_t *fd); + +int32_t +afr_readdir (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset); + + +int32_t +afr_getdents (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset, int32_t flag); + + +int32_t +afr_checksum (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags); + + +#endif /* __DIR_READ_H__ */ diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c new file mode 100644 index 000000000..87a6e09b5 --- /dev/null +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -0,0 +1,1786 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" + +#include "afr.h" +#include "afr-transaction.h" + + +void +afr_build_parent_loc (loc_t *parent, loc_t *child) +{ + char *tmp = NULL; + + if (!child->parent) { + loc_copy (parent, child); + return; + } + + tmp = strdup (child->path); + parent->path = strdup (dirname (tmp)); + FREE (tmp); + + parent->name = strrchr (parent->path, '/'); + if (parent->name) + parent->name++; + + parent->inode = inode_ref (child->parent); + parent->parent = inode_parent (parent->inode, 0, NULL); + parent->ino = parent->inode->ino; +} + + +/* {{{ create */ + +int +afr_create_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + local->cont.create.fd, + local->cont.create.inode, + &local->cont.create.buf); + return 0; +} + + +int +afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + fd_t *fd, inode_t *inode, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = -1; + + local = frame->local; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + local->op_ret = op_ret; + + if ((local->success_count == 0) + || (child_index == priv->read_child)) { + local->cont.create.buf = *buf; + local->cont.create.buf.st_ino = + afr_itransform (buf->st_ino, + priv->child_count, + child_index); + } + local->cont.create.inode = inode; + + local->success_count++; + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_create_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_create_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->create, + &local->loc, + local->cont.create.flags, + local->cont.create.mode, + local->cont.create.fd); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_create_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_create (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, mode_t mode, fd_t *fd) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + loc_copy (&local->loc, loc); + + local->cont.create.flags = flags; + local->cont.create.mode = mode; + local->cont.create.fd = fd_ref (fd); + + local->transaction.fop = afr_create_wind; + local->transaction.done = afr_create_done; + local->transaction.unwind = afr_create_unwind; + + afr_build_parent_loc (&local->transaction.parent_loc, loc); + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ mknod */ + +int +afr_mknod_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + local->cont.mknod.inode, + &local->cont.mknod.buf); + return 0; +} + + +int +afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = -1; + + local = frame->local; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + local->op_ret = op_ret; + + if ((local->success_count == 0) + || (child_index == priv->read_child)) { + local->cont.mknod.buf = *buf; + local->cont.mknod.buf.st_ino = + afr_itransform (buf->st_ino, + priv->child_count, + child_index); + } + local->cont.mknod.inode = inode; + + local->success_count++; + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int32_t +afr_mknod_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->mknod, + &local->loc, local->cont.mknod.mode, + local->cont.mknod.dev); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_mknod_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_mknod (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode, dev_t dev) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + loc_copy (&local->loc, loc); + + local->cont.mknod.mode = mode; + local->cont.mknod.dev = dev; + + local->transaction.fop = afr_mknod_wind; + local->transaction.done = afr_mknod_done; + local->transaction.unwind = afr_mknod_unwind; + + afr_build_parent_loc (&local->transaction.parent_loc, loc); + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ mkdir */ + + +int +afr_mkdir_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + local->cont.mkdir.inode, + &local->cont.mkdir.buf); + return 0; +} + + +int +afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = -1; + + local = frame->local; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + local->op_ret = op_ret; + + if ((local->success_count == 0) + || (child_index == priv->read_child)) { + local->cont.mkdir.buf = *buf; + local->cont.mkdir.buf.st_ino = + afr_itransform (buf->st_ino, priv->child_count, + child_index); + } + local->cont.mkdir.inode = inode; + + local->success_count++; + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_mkdir_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->mkdir, + &local->loc, local->cont.mkdir.mode); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_mkdir_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_mkdir (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + loc_copy (&local->loc, loc); + + local->cont.mkdir.mode = mode; + + local->transaction.fop = afr_mkdir_wind; + local->transaction.done = afr_mkdir_done; + local->transaction.unwind = afr_mkdir_unwind; + + afr_build_parent_loc (&local->transaction.parent_loc, loc); + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ link */ + + +int +afr_link_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.link.buf.st_ino = local->cont.link.ino; + + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + local->cont.link.inode, + &local->cont.link.buf); + } + + return 0; +} + + +int +afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = -1; + + local = frame->local; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + local->op_ret = op_ret; + + if ((local->success_count == 0) + || (child_index == priv->read_child)) { + local->cont.link.buf = *buf; + local->cont.link.buf.st_ino = + afr_itransform (buf->st_ino, priv->child_count, + child_index); + } + local->cont.link.inode = inode; + + local->success_count++; + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_link_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_link_wind_cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->link, + &local->loc, + &local->newloc); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_link_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_link (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + loc_copy (&local->loc, oldloc); + loc_copy (&local->newloc, newloc); + + local->cont.link.ino = oldloc->inode->ino; + + local->transaction.fop = afr_link_wind; + local->transaction.done = afr_link_done; + local->transaction.unwind = afr_link_unwind; + + afr_build_parent_loc (&local->transaction.parent_loc, oldloc); + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (oldloc->path); + local->transaction.new_basename = AFR_BASENAME (newloc->path); + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + + return 0; +} + +/* }}} */ + +/* {{{ symlink */ + + +int +afr_symlink_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + local->cont.symlink.inode, + &local->cont.symlink.buf); + return 0; +} + + +int +afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = -1; + + local = frame->local; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + local->op_ret = op_ret; + + if ((local->success_count == 0) + || (child_index == priv->read_child)) { + local->cont.symlink.buf = *buf; + local->cont.symlink.buf.st_ino = + afr_itransform (buf->st_ino, priv->child_count, + child_index); + } + local->cont.symlink.inode = inode; + + local->success_count++; + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_symlink_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->symlink, + local->cont.symlink.linkpath, + &local->loc); + + if (!--call_count) + break; + + } + } + + return 0; +} + + +int +afr_symlink_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_symlink (call_frame_t *frame, xlator_t *this, + const char *linkpath, loc_t *loc) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + loc_copy (&local->loc, loc); + + local->cont.symlink.ino = loc->inode->ino; + local->cont.symlink.linkpath = strdup (linkpath); + + local->transaction.fop = afr_symlink_wind; + local->transaction.done = afr_symlink_done; + local->transaction.unwind = afr_symlink_unwind; + + afr_build_parent_loc (&local->transaction.parent_loc, loc); + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ rename */ + +int +afr_rename_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.rename.buf.st_ino = local->cont.rename.ino; + + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.rename.buf); + } + + return 0; +} + + +int +afr_rename_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = -1; + + local = frame->local; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if ((op_ret != -1) && (local->success_count == 0)) { + local->op_ret = op_ret; + + if (buf) { + local->cont.rename.buf = *buf; + local->cont.rename.buf.st_ino = + afr_itransform (buf->st_ino, priv->child_count, + child_index); + } + local->success_count++; + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int32_t +afr_rename_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_rename_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->rename, + &local->loc, + &local->newloc); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_rename_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_rename (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + loc_copy (&local->loc, oldloc); + loc_copy (&local->newloc, newloc); + + local->cont.rename.ino = oldloc->inode->ino; + + local->transaction.fop = afr_rename_wind; + local->transaction.done = afr_rename_done; + local->transaction.unwind = afr_rename_unwind; + + afr_build_parent_loc (&local->transaction.parent_loc, oldloc); + afr_build_parent_loc (&local->transaction.new_parent_loc, newloc); + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (oldloc->path); + local->transaction.new_basename = AFR_BASENAME (newloc->path); + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ unlink */ + +int +afr_unlink_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno); + + return 0; +} + + +int +afr_unlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = (long) cookie; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int32_t +afr_unlink_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->unlink, + &local->loc); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int32_t +afr_unlink_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int32_t +afr_unlink (call_frame_t *frame, xlator_t *this, + loc_t *loc) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + loc_copy (&local->loc, loc); + + local->transaction.fop = afr_unlink_wind; + local->transaction.done = afr_unlink_done; + local->transaction.unwind = afr_unlink_unwind; + + afr_build_parent_loc (&local->transaction.parent_loc, loc); + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + + return 0; +} + +/* }}} */ + +/* {{{ rmdir */ + + + +int +afr_rmdir_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno); + + return 0; +} + + +int +afr_rmdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = (long) cookie; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + } + local->success_count++; + + if (local->success_count == priv->wait_count) + need_unwind = 1; + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_rmdir_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->rmdir, + &local->loc); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_rmdir_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_rmdir (call_frame_t *frame, xlator_t *this, + loc_t *loc) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + loc_copy (&local->loc, loc); + + local->transaction.fop = afr_rmdir_wind; + local->transaction.done = afr_rmdir_done; + local->transaction.unwind = afr_rmdir_unwind; + + afr_build_parent_loc (&local->transaction.parent_loc, loc); + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + + return 0; +} + +/* }}} */ + +/* {{{ setdents */ + +int32_t +afr_setdents_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = (long) cookie; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if ((op_ret != -1) && (local->success_count == 0)) { + local->op_ret = op_ret; + local->success_count++; + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int32_t +afr_setdents_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_setdents_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->setdents, + local->fd, local->cont.setdents.flags, + local->cont.setdents.entries, + local->cont.setdents.count); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int32_t +afr_setdents_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = frame->local; + + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int32_t +afr_setdents (call_frame_t *frame, xlator_t *this, + fd_t *fd, int32_t flags, dir_entry_t *entries, int32_t count) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + frame->local = local; + + local->fd = fd_ref (fd); + + local->cont.setdents.flags = flags; + local->cont.setdents.entries = entries; + local->cont.setdents.count = count; + + local->transaction.fop = afr_setdents_wind; + local->transaction.done = afr_setdents_done; + + local->transaction.basename = NULL; + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (frame, this, AFR_ENTRY_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + + return 0; +} + +/* }}} */ diff --git a/xlators/cluster/afr/src/afr-dir-write.h b/xlators/cluster/afr/src/afr-dir-write.h new file mode 100644 index 000000000..e6e8a5e79 --- /dev/null +++ b/xlators/cluster/afr/src/afr-dir-write.h @@ -0,0 +1,59 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __DIR_WRITE_H__ +#define __DIR_WRITE_H__ + +int32_t +afr_create (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, mode_t mode, fd_t *fd); + +int32_t +afr_mknod (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode, dev_t dev); + +int32_t +afr_mkdir (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode); + +int32_t +afr_unlink (call_frame_t *frame, xlator_t *this, + loc_t *loc); + +int32_t +afr_rmdir (call_frame_t *frame, xlator_t *this, + loc_t *loc); + +int32_t +afr_link (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc); + +int32_t +afr_rename (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc); + +int32_t +afr_symlink (call_frame_t *frame, xlator_t *this, + const char *linkpath, loc_t *oldloc); + +int32_t +afr_setdents (call_frame_t *frame, xlator_t *this, + fd_t *fd, int32_t flags, dir_entry_t *entries, int32_t count); + +#endif /* __DIR_WRITE_H__ */ diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c new file mode 100644 index 000000000..a6c99ec05 --- /dev/null +++ b/xlators/cluster/afr/src/afr-inode-read.c @@ -0,0 +1,721 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" + +#include "afr.h" + + +/** + * Common algorithm for inode read calls: + * + * - Try the fop on the first child that is up + * - if we have failed due to ENOTCONN: + * try the next child + * + * Applicable to: access, stat, fstat, readlink, getxattr + */ + +/* {{{ access */ + +int32_t +afr_access_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int unwind = 1; + int last_tried = -1; + int this_try = -1; + + priv = this->private; + children = priv->children; + + local = frame->local; + + if (op_ret == -1) { + last_tried = local->cont.access.last_tried; + + if (all_tried (last_tried, priv->child_count)) { + goto out; + } + this_try = ++local->cont.access.last_tried; + + unwind = 0; + + STACK_WIND_COOKIE (frame, afr_access_cbk, + (void *) (long) this_try, + children[this_try], + children[this_try]->fops->access, + &local->loc, local->cont.access.mask); + } + +out: + if (unwind) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + + return 0; +} + + +int32_t +afr_access (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t mask) +{ + afr_private_t * priv = NULL; + xlator_t ** children = NULL; + int call_child = 0; + afr_local_t *local = NULL; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + VALIDATE_OR_GOTO (priv->children, out); + + children = priv->children; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + call_child = afr_first_up_child (priv); + if (call_child == -1) { + op_errno = ENOTCONN; + gf_log (this->name, GF_LOG_ERROR, + "no child is up :("); + goto out; + } + + local->cont.access.last_tried = call_child; + loc_copy (&local->loc, loc); + local->cont.access.mask = mask; + + STACK_WIND_COOKIE (frame, afr_access_cbk, + (void *) (long) call_child, + children[call_child], children[call_child]->fops->access, + loc, mask); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + + +/* }}} */ + +/* {{{ stat */ + +int32_t +afr_stat_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + struct stat *buf) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int deitransform_child = -1; + + int unwind = 1; + int last_tried = -1; + int this_try = -1; + + priv = this->private; + children = priv->children; + + deitransform_child = (long) cookie; + + local = frame->local; + + if (op_ret == -1) { + retry: + last_tried = local->cont.stat.last_tried; + + if (all_tried (last_tried, priv->child_count)) { + goto out; + } + this_try = ++local->cont.stat.last_tried; + + if (this_try == deitransform_child) { + goto retry; + } + + unwind = 0; + + STACK_WIND_COOKIE (frame, afr_stat_cbk, + (void *) (long) deitransform_child, + children[this_try], + children[this_try]->fops->stat, + &local->loc); + } + +out: + if (unwind) { + if (op_ret != -1) + buf->st_ino = local->cont.stat.ino; + + AFR_STACK_UNWIND (frame, op_ret, op_errno, buf); + } + + return 0; +} + + +int32_t +afr_stat (call_frame_t *frame, xlator_t *this, + loc_t *loc) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int call_child = 0; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + VALIDATE_OR_GOTO (priv->children, out); + + children = priv->children; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + frame->local = local; + + call_child = afr_deitransform (loc->inode->ino, priv->child_count); + loc_copy (&local->loc, loc); + + /* + if stat fails from the deitranform'd child, we try + all children starting with the first one + */ + local->cont.stat.last_tried = -1; + local->cont.stat.ino = loc->inode->ino; + + STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) call_child, + children[call_child], + children[call_child]->fops->stat, + loc); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + + +/* }}} */ + +/* {{{ fstat */ + +int32_t +afr_fstat_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + struct stat *buf) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int deitransform_child = -1; + + int unwind = 1; + int last_tried = -1; + int this_try = -1; + + priv = this->private; + children = priv->children; + + deitransform_child = (long) cookie; + + local = frame->local; + + if (op_ret == -1) { + retry: + last_tried = local->cont.fstat.last_tried; + + if (all_tried (last_tried, priv->child_count)) { + goto out; + } + this_try = ++local->cont.fstat.last_tried; + + if (this_try == deitransform_child) { + /* + skip the deitransform'd child since if we are here + we must have already tried that child + */ + goto retry; + } + + + unwind = 0; + + STACK_WIND_COOKIE (frame, afr_fstat_cbk, + (void *) (long) deitransform_child, + children[this_try], + children[this_try]->fops->fstat, + local->fd); + } + +out: + if (unwind) { + if (op_ret != -1) + buf->st_ino = local->cont.fstat.ino; + + AFR_STACK_UNWIND (frame, op_ret, op_errno, buf); + } + + return 0; +} + + +int32_t +afr_fstat (call_frame_t *frame, xlator_t *this, + fd_t *fd) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int call_child = 0; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + VALIDATE_OR_GOTO (priv->children, out); + + children = priv->children; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + frame->local = local; + + VALIDATE_OR_GOTO (fd->inode, out); + + call_child = afr_deitransform (fd->inode->ino, priv->child_count); + + /* + if fstat fails from the deitranform'd child, we try + all children starting with the first one + */ + local->cont.fstat.last_tried = -1; + local->cont.fstat.ino = fd->inode->ino; + local->fd = fd_ref (fd); + + STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) call_child, + children[call_child], + children[call_child]->fops->fstat, + fd); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ readlink */ + +int32_t +afr_readlink_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + const char *buf) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int unwind = 1; + int last_tried = -1; + int this_try = -1; + + priv = this->private; + children = priv->children; + + local = frame->local; + + if (op_ret == -1) { + last_tried = local->cont.readlink.last_tried; + + if (all_tried (last_tried, priv->child_count)) { + goto out; + } + this_try = ++local->cont.readlink.last_tried; + + unwind = 0; + STACK_WIND_COOKIE (frame, afr_readlink_cbk, + (void *) (long) this_try, + children[this_try], + children[this_try]->fops->readlink, + &local->loc, + local->cont.readlink.size); + } + +out: + if (unwind) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, buf); + } + + return 0; +} + + +int32_t +afr_readlink (call_frame_t *frame, xlator_t *this, + loc_t *loc, size_t size) +{ + afr_private_t * priv = NULL; + xlator_t ** children = NULL; + int call_child = 0; + afr_local_t *local = NULL; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + VALIDATE_OR_GOTO (priv->children, out); + + children = priv->children; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + frame->local = local; + + call_child = afr_first_up_child (priv); + if (call_child == -1) { + op_errno = ENOTCONN; + gf_log (this->name, GF_LOG_ERROR, + "no child is up :("); + goto out; + } + + local->cont.readlink.last_tried = call_child; + loc_copy (&local->loc, loc); + local->cont.readlink.size = size; + + STACK_WIND_COOKIE (frame, afr_readlink_cbk, + (void *) (long) call_child, + children[call_child], children[call_child]->fops->readlink, + loc, size); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + return 0; +} + + +/* }}} */ + +/* {{{ getxattr */ + +int32_t +afr_getxattr_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int unwind = 1; + int last_tried = -1; + int this_try = -1; + + priv = this->private; + children = priv->children; + + local = frame->local; + + if (op_ret == -1) { + last_tried = local->cont.getxattr.last_tried; + + if (all_tried (last_tried, priv->child_count)) { + goto out; + } + this_try = ++local->cont.getxattr.last_tried; + + unwind = 0; + STACK_WIND_COOKIE (frame, afr_getxattr_cbk, + (void *) (long) this_try, + children[this_try], + children[this_try]->fops->getxattr, + &local->loc, + local->cont.getxattr.name); + } + +out: + if (unwind) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, dict); + } + + return 0; +} + + +int32_t +afr_getxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name) +{ + afr_private_t * priv = NULL; + xlator_t ** children = NULL; + int call_child = 0; + afr_local_t * local = NULL; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + VALIDATE_OR_GOTO (priv->children, out); + + children = priv->children; + + ALLOC_OR_GOTO (local, afr_local_t, out); + frame->local = local; + + call_child = afr_first_up_child (priv); + if (call_child == -1) { + op_errno = ENOTCONN; + gf_log (this->name, GF_LOG_ERROR, + "no child is up :("); + goto out; + } + + local->cont.getxattr.last_tried = call_child; + loc_copy (&local->loc, loc); + if (name) + local->cont.getxattr.name = strdup (name); + + STACK_WIND_COOKIE (frame, afr_getxattr_cbk, + (void *) (long) call_child, + children[call_child], children[call_child]->fops->getxattr, + loc, name); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + return 0; +} + + +/* }}} */ + +/* {{{ readv */ + +/** + * read algorithm: + * + * if the user has specified a read subvolume, use it + * otherwise - + * use the inode number to hash it to one of the subvolumes, and + * read from there (to balance read load) + * + * if any of the above read's fail, try the children in sequence + * beginning at the beginning + */ + +int32_t +afr_readv_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + struct iovec *vector, int32_t count, struct stat *buf) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int unwind = 1; + int last_tried = -1; + int this_try = -1; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + VALIDATE_OR_GOTO (priv->children, out); + + children = priv->children; + + local = frame->local; + + if (op_ret == -1) { + retry: + last_tried = local->cont.readv.last_tried; + + if (all_tried (last_tried, priv->child_count)) { + goto out; + } + this_try = ++local->cont.readv.last_tried; + + if (this_try == priv->read_child) { + /* + skip the read child since if we are here + we must have already tried that child + */ + goto retry; + } + + unwind = 0; + + STACK_WIND_COOKIE (frame, afr_readv_cbk, + (void *) (long) this_try, + children[this_try], + children[this_try]->fops->readv, + local->fd, local->cont.readv.size, + local->cont.readv.offset); + } + +out: + if (unwind) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, vector, count, buf); + } + + return 0; +} + + +int32_t +afr_readv (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int call_child = 0; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (fd, out); + + priv = this->private; + children = priv->children; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + frame->local = local; + + if (priv->read_child != -1) { + call_child = priv->read_child; + + /* + if read fails from the read child, we try + all children starting with the first one + */ + local->cont.readv.last_tried = -1; + } else { + call_child = afr_first_up_child (priv); + if (call_child == -1) { + op_errno = ENOTCONN; + gf_log (this->name, GF_LOG_ERROR, + "no child is up :("); + goto out; + } + + local->cont.readv.last_tried = call_child; + } + + local->fd = fd_ref (fd); + + local->cont.readv.size = size; + local->cont.readv.offset = offset; + + STACK_WIND_COOKIE (frame, afr_readv_cbk, + (void *) (long) call_child, + children[call_child], + children[call_child]->fops->readv, + fd, size, offset); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL, 0, NULL); + } + return 0; +} + +/* }}} */ diff --git a/xlators/cluster/afr/src/afr-inode-read.h b/xlators/cluster/afr/src/afr-inode-read.h new file mode 100644 index 000000000..6b3bd2da8 --- /dev/null +++ b/xlators/cluster/afr/src/afr-inode-read.h @@ -0,0 +1,47 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __INODE_READ_H__ +#define __INODE_READ_H__ + +int32_t +afr_access (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t mask); + +int32_t +afr_stat (call_frame_t *frame, xlator_t *this, + loc_t *loc); + +int32_t +afr_fstat (call_frame_t *frame, xlator_t *this, + fd_t *fd); + +int32_t +afr_readlink (call_frame_t *frame, xlator_t *this, + loc_t *loc, size_t size); + +int32_t +afr_readv (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset); + +int32_t +afr_getxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name); + +#endif /* __INODE_READ_H__ */ diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c new file mode 100644 index 000000000..267350b2c --- /dev/null +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -0,0 +1,2024 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" + +#include "afr.h" +#include "afr-transaction.h" + + +/* {{{ chmod */ + + +int +afr_chmod_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.chmod.buf.st_ino = local->cont.chmod.ino; + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.chmod.buf); + } + return 0; +} + + +int +afr_chmod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = (long) cookie; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.chmod.buf = *buf; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) + afr_chmod_unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_chmod_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int i = 0; + int call_count = -1; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_chmod_wind_cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->chmod, + &local->loc, + local->cont.chmod.mode); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_chmod_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int32_t +afr_chmod (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->cont.chmod.mode = mode; + local->cont.chmod.ino = loc->inode->ino; + + local->transaction.fop = afr_chmod_wind; + local->transaction.done = afr_chmod_done; + local->transaction.unwind = afr_chmod_unwind; + + loc_copy (&local->loc, loc); + + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = 0; + local->transaction.pending = AFR_METADATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + + +/* {{{ fchmod */ + +int +afr_fchmod_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.fchmod.buf.st_ino = local->cont.fchmod.ino; + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.fchmod.buf); + } + return 0; +} + + +int +afr_fchmod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = (long) cookie; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.fchmod.buf = *buf; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) + afr_fchmod_unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_fchmod_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int i = 0; + int call_count = -1; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_fchmod_wind_cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fchmod, + local->fd, + local->cont.fchmod.mode); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_fchmod_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int32_t +afr_fchmod (call_frame_t *frame, xlator_t *this, + fd_t *fd, mode_t mode) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->cont.fchmod.mode = mode; + local->cont.fchmod.ino = fd->inode->ino; + + local->transaction.fop = afr_fchmod_wind; + local->transaction.done = afr_fchmod_done; + local->transaction.unwind = afr_fchmod_unwind; + + local->fd = fd_ref (fd); + + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = 0; + local->transaction.pending = AFR_METADATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ chown */ + +int +afr_chown_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.chown.buf.st_ino = local->cont.chown.ino; + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.chown.buf); + } + return 0; +} + + +int +afr_chown_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = (long) cookie; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.chown.buf = *buf; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) { + local->transaction.unwind (frame, this); + } + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_chown_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_chown_wind_cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->chown, + &local->loc, local->cont.chown.uid, + local->cont.chown.gid); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_chown_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_chown (call_frame_t *frame, xlator_t *this, + loc_t *loc, uid_t uid, gid_t gid) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->cont.chown.uid = uid; + local->cont.chown.gid = gid; + local->cont.chown.ino = loc->inode->ino; + + local->transaction.fop = afr_chown_wind; + local->transaction.done = afr_chown_done; + local->transaction.unwind = afr_chown_unwind; + + loc_copy (&local->loc, loc); + + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = 0; + local->transaction.pending = AFR_METADATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + + +/* }}} */ + +/* {{{ chown */ + +int +afr_fchown_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.fchown.buf.st_ino = local->cont.fchown.ino; + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.fchown.buf); + } + return 0; +} + + +int +afr_fchown_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = (long) cookie; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.fchown.buf = *buf; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) { + local->transaction.unwind (frame, this); + } + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_fchown_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_fchown_wind_cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fchown, + local->fd, local->cont.fchown.uid, + local->cont.fchown.gid); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_fchown_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_fchown (call_frame_t *frame, xlator_t *this, + fd_t *fd, uid_t uid, gid_t gid) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->cont.fchown.uid = uid; + local->cont.fchown.gid = gid; + local->cont.fchown.ino = fd->inode->ino; + + local->transaction.fop = afr_fchown_wind; + local->transaction.done = afr_fchown_done; + local->transaction.unwind = afr_fchown_unwind; + + local->fd = fd_ref (fd); + + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = 0; + local->transaction.pending = AFR_METADATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ writev */ + +int +afr_writev_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.writev.buf.st_ino = local->cont.writev.ino; + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.writev.buf); + } + return 0; +} + + +int +afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int child_index = (long) cookie; + int call_count = -1; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.writev.buf = *buf; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_writev_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int i = 0; + int call_count = -1; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_writev_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->writev, + local->fd, + local->cont.writev.vector, + local->cont.writev.count, + local->cont.writev.offset); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_writev_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + if (local->cont.writev.refs) + dict_unref (local->cont.writev.refs); + local->cont.writev.refs = NULL; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->op = GF_FOP_WRITE; + local->cont.writev.vector = iov_dup (vector, count); + local->cont.writev.count = count; + local->cont.writev.offset = offset; + local->cont.writev.ino = fd->inode->ino; + + if (frame->root->req_refs) + local->cont.writev.refs = dict_ref (frame->root->req_refs); + + local->transaction.fop = afr_writev_wind; + local->transaction.done = afr_writev_done; + local->transaction.unwind = afr_writev_unwind; + + local->fd = fd_ref (fd); + + local->transaction.main_frame = frame; + if (fd->flags & O_APPEND) { + local->transaction.start = 0; + local->transaction.len = 0; + } else { + local->transaction.start = offset; + local->transaction.len = iov_length (vector, count); + } + + local->transaction.pending = AFR_DATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + + +/* }}} */ + +/* {{{ truncate */ + +int +afr_truncate_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.truncate.buf.st_ino = local->cont.truncate.ino; + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.truncate.buf); + } + return 0; +} + + +int +afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int child_index = (long) cookie; + int call_count = -1; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.truncate.buf = *buf; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int32_t +afr_truncate_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->truncate, + &local->loc, + local->cont.truncate.offset); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_truncate_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_truncate (call_frame_t *frame, xlator_t *this, + loc_t *loc, off_t offset) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->op_ret = -1; + + local->cont.truncate.offset = offset; + local->cont.truncate.ino = loc->inode->ino; + + local->transaction.fop = afr_truncate_wind; + local->transaction.done = afr_truncate_done; + local->transaction.unwind = afr_truncate_unwind; + + loc_copy (&local->loc, loc); + + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = offset; + local->transaction.pending = AFR_DATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + + +/* }}} */ + +/* {{{ ftruncate */ + + +int +afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.ftruncate.buf.st_ino = local->cont.ftruncate.ino; + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.ftruncate.buf); + } + return 0; +} + + +int +afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int child_index = (long) cookie; + int call_count = -1; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.ftruncate.buf = *buf; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_ftruncate_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->ftruncate, + local->fd, local->cont.ftruncate.offset); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_ftruncate_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_ftruncate (call_frame_t *frame, xlator_t *this, + fd_t *fd, off_t offset) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->op = GF_FOP_FTRUNCATE; + local->op_ret = -1; + + local->cont.ftruncate.offset = offset; + local->cont.ftruncate.ino = fd->inode->ino; + + local->transaction.fop = afr_ftruncate_wind; + local->transaction.done = afr_ftruncate_done; + local->transaction.unwind = afr_ftruncate_unwind; + + local->fd = fd_ref (fd); + + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = offset; + local->transaction.pending = AFR_DATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ utimens */ + + +int +afr_utimens_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.utimens.buf.st_ino = local->cont.utimens.ino; + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.utimens.buf); + } + return 0; +} + + +int +afr_utimens_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int child_index = (long) cookie; + int call_count = -1; + int need_unwind = 1; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.utimens.buf = *buf; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_utimens_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_utimens_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->utimens, + &local->loc, + local->cont.utimens.tv); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_utimens_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_utimens (call_frame_t *frame, xlator_t *this, + loc_t *loc, struct timespec tv[2]) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->op_ret = -1; + + local->cont.utimens.tv[0] = tv[0]; + local->cont.utimens.tv[1] = tv[1]; + + local->cont.utimens.ino = loc->inode->ino; + + local->transaction.fop = afr_utimens_wind; + local->transaction.done = afr_utimens_done; + local->transaction.unwind = afr_utimens_unwind; + + loc_copy (&local->loc, loc); + + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = 0; + local->transaction.pending = AFR_METADATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ setxattr */ + + +int +afr_setxattr_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno) + } + return 0; +} + + +int +afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_setxattr_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->setxattr, + &local->loc, + local->cont.setxattr.dict, + local->cont.setxattr.flags); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_setxattr_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_setxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, dict_t *dict, int32_t flags) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->op_ret = -1; + + local->cont.setxattr.dict = dict_ref (dict); + local->cont.setxattr.flags = flags; + + local->transaction.fop = afr_setxattr_wind; + local->transaction.done = afr_setxattr_done; + local->transaction.unwind = afr_setxattr_unwind; + + loc_copy (&local->loc, loc); + + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = 0; + local->transaction.pending = AFR_METADATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + + return 0; +} + +/* }}} */ + +/* {{{ removexattr */ + + +int +afr_removexattr_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno) + } + return 0; +} + + +int +afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int32_t +afr_removexattr_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->removexattr, + &local->loc, + local->cont.removexattr.name); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_removexattr_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_removexattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (loc, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->op_ret = -1; + + local->cont.removexattr.name = strdup (name); + + local->transaction.fop = afr_removexattr_wind; + local->transaction.done = afr_removexattr_done; + local->transaction.unwind = afr_removexattr_unwind; + + loc_copy (&local->loc, loc); + + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = 0; + local->transaction.pending = AFR_METADATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + + return 0; +} diff --git a/xlators/cluster/afr/src/afr-inode-write.h b/xlators/cluster/afr/src/afr-inode-write.h new file mode 100644 index 000000000..9c0b5cad3 --- /dev/null +++ b/xlators/cluster/afr/src/afr-inode-write.h @@ -0,0 +1,63 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __INODE_WRITE_H__ +#define __INODE_WRITE_H__ + +int32_t +afr_chmod (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode); + +int32_t +afr_chown (call_frame_t *frame, xlator_t *this, + loc_t *loc, uid_t uid, gid_t gid); + +int +afr_fchown (call_frame_t *frame, xlator_t *this, + fd_t *fd, uid_t uid, gid_t gid); + +int32_t +afr_fchmod (call_frame_t *frame, xlator_t *this, + fd_t *fd, mode_t mode); + +int32_t +afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset); + +int32_t +afr_truncate (call_frame_t *frame, xlator_t *this, + loc_t *loc, off_t offset); + +int32_t +afr_ftruncate (call_frame_t *frame, xlator_t *this, + fd_t *fd, off_t offset); + +int32_t +afr_utimens (call_frame_t *frame, xlator_t *this, + loc_t *loc, struct timespec tv[2]); + +int32_t +afr_setxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, dict_t *dict, int32_t flags); + +int32_t +afr_removexattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name); + +#endif /* __INODE_WRITE_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c new file mode 100644 index 000000000..45d065169 --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -0,0 +1,1073 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#include "glusterfs.h" +#include "xlator.h" +#include "byte-order.h" + +#include "afr.h" +#include "afr-transaction.h" +#include "afr-self-heal-common.h" +#include "afr-self-heal.h" + + +/** + * select_source - select a source and return it + * TODO: take into account option 'favorite-child' + */ + +int +afr_sh_select_source (int sources[], int child_count) +{ + int i; + for (i = 0; i < child_count; i++) + if (sources[i]) + return i; + + return -1; +} + + +/** + * sink_count - return number of sinks in sources array + */ + +int +afr_sh_sink_count (int sources[], int child_count) +{ + int i; + int sinks = 0; + for (i = 0; i < child_count; i++) + if (!sources[i]) + sinks++; + return sinks; +} + +int +afr_sh_source_count (int sources[], int child_count) +{ + int i; + int nsource = 0; + + for (i = 0; i < child_count; i++) + if (sources[i]) + nsource++; + return nsource; +} + + +int +afr_sh_supress_errenous_children (int sources[], int child_errno[], + int child_count) +{ + int i = 0; + + for (i = 0; i < child_count; i++) { + if (child_errno[i] && sources[i]) { + sources[i] = 0; + } + } + + return 0; +} + + +int +afr_sh_supress_empty_children (int sources[], dict_t *xattr[], + struct stat *buf, + int child_count, const char *key) +{ + int i = 0; + int32_t *pending = NULL; + int ret = 0; + int all_xattr_missing = 1; + + /* if the file was created by afr with xattrs */ + for (i = 0; i < child_count; i++) { + if (!xattr[i]) + continue; + + ret = dict_get_ptr (xattr[i], (char *)key, VOID(&pending)); + if (ret != 0) { + continue; + } + + all_xattr_missing = 0; + break; + } + + if (all_xattr_missing) { + /* supress 0byte files.. this avoids empty file created + by dir selfheal to overwrite the 'good' file */ + for (i = 0; i < child_count; i++) { + if (!buf[i].st_size) + sources[i] = 0; + } + goto out; + } + + + for (i = 0; i < child_count; i++) { + if (!xattr[i]) { + sources[i] = 0; + continue; + } + + ret = dict_get_ptr (xattr[i], (char *)key, VOID(&pending)); + if (ret != 0) { + sources[i] = 0; + continue; + } + + if (!pending) { + sources[i] = 0; + continue; + } + } + +out: + return 0; +} + + +void +afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this) +{ + afr_private_t * priv = this->private; + + char *buf = NULL; + char *ptr = NULL; + + int i, j; + + /* 10 digits per entry + 1 space + '[' and ']' */ + buf = MALLOC (priv->child_count * 11 + 8); + + for (i = 0; i < priv->child_count; i++) { + ptr = buf; + ptr += sprintf (ptr, "[ "); + for (j = 0; j < priv->child_count; j++) { + ptr += sprintf (ptr, "%d ", pending_matrix[i][j]); + } + ptr += sprintf (ptr, "]"); + gf_log (this->name, GF_LOG_DEBUG, + "pending_matrix: %s", buf); + } + + FREE (buf); +} + + +void +afr_sh_build_pending_matrix (int32_t *pending_matrix[], dict_t *xattr[], + int child_count, const char *key) +{ + int i = 0; + int j = 0; + int32_t *pending = NULL; + int ret = -1; + + /* start clean */ + for (i = 0; i < child_count; i++) { + for (j = 0; j < child_count; j++) { + pending_matrix[i][j] = 0; + } + } + + for (i = 0; i < child_count; i++) { + if (!xattr[i]) + continue; + + pending = NULL; + + ret = dict_get_ptr (xattr[i], (char *) key, + VOID(&pending)); + if (ret != 0) + continue; + + for (j = 0; j < child_count; j++) { + pending_matrix[i][j] = ntoh32 (pending[j]); + } + } +} + + +/** + * mark_sources: Mark all 'source' nodes and return number of source + * nodes found + */ + +int +afr_sh_mark_sources (int32_t *pending_matrix[], int sources[], int child_count) +{ + int i = 0; + int j = 0; + + int nsources = 0; + + + /* start clean */ + for (i = 0; i < child_count; i++) { + sources[i] = 0; + } + + /* + Let's 'normalize' the pending matrix first, + by disregarding all pending entries that refer + to themselves + */ + for (i = 0; i < child_count; i++) { + pending_matrix[i][i] = 0; + } + + for (i = 0; i < child_count; i++) { + for (j = 0; j < child_count; j++) { + if (pending_matrix[j][i]) + break; + } + + if (j == child_count) { + nsources++; + sources[i] = 1; + } + } + + return nsources; +} + + +void +afr_sh_pending_to_delta (int32_t *pending_matrix[], int32_t *delta_matrix[], + int success[], int child_count) +{ + int i = 0; + int j = 0; + + /* start clean */ + for (i = 0; i < child_count; i++) { + for (j = 0; j < child_count; j++) { + delta_matrix[i][j] = 0; + } + } + + for (i = 0; i < child_count; i++) { + for (j = 0; j < child_count; j++) { + if (!success[j]) + continue; + delta_matrix[i][j] = -pending_matrix[i][j]; + } + } +} + + +int +afr_sh_delta_to_xattr (int32_t *delta_matrix[], dict_t *xattr[], + int child_count, const char *key) +{ + int i = 0; + int j = 0; + + int ret = 0; + + int32_t *pending = 0; + + for (i = 0; i < child_count; i++) { + if (!xattr[i]) + continue; + + pending = CALLOC (sizeof (int32_t), child_count); + for (j = 0; j < child_count; j++) { + pending[j] = hton32 (delta_matrix[i][j]); + } + + ret = dict_set_bin (xattr[i], (char *) key, pending, + child_count * sizeof (int32_t)); + } + + return 0; +} + + +int +afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this) +{ + afr_private_t *priv = NULL; + int32_t *pending = NULL; + void *tmp_pending = NULL; /* This is required to remove 'type-punned' warnings from gcc */ + + int ret = -1; + int i = 0; + + priv = this->private; + + ret = dict_get_ptr (xattr, AFR_METADATA_PENDING, &tmp_pending); + + if (ret != 0) + return 0; + + pending = tmp_pending; + for (i = 0; i < priv->child_count; i++) { + if (i == child_count) + continue; + if (pending[i]) + return 1; + } + + return 0; +} + + +int +afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this) +{ + afr_private_t *priv = NULL; + int32_t *pending = NULL; + void *tmp_pending = NULL; /* This is required to remove 'type-punned' warnings from gcc */ + + int ret = -1; + int i = 0; + + priv = this->private; + + ret = dict_get_ptr (xattr, AFR_DATA_PENDING, &tmp_pending); + + if (ret != 0) + return 0; + + pending = tmp_pending; + for (i = 0; i < priv->child_count; i++) { + if (i == child_count) + continue; + if (pending[i]) + return 1; + } + + return 0; +} + + +int +afr_sh_has_entry_pending (dict_t *xattr, int child_count, xlator_t *this) +{ + afr_private_t *priv = NULL; + int32_t *pending = NULL; + void *tmp_pending = NULL; /* This is required to remove 'type-punned' warnings from gcc */ + + int ret = -1; + int i = 0; + + priv = this->private; + + ret = dict_get_ptr (xattr, AFR_ENTRY_PENDING, &tmp_pending); + + if (ret != 0) + return 0; + + pending = tmp_pending; + for (i = 0; i < priv->child_count; i++) { + if (i == child_count) + continue; + if (pending[i]) + return 1; + } + + return 0; +} + + + +/** + * is_matrix_zero - return true if pending matrix is all zeroes + */ + +int +afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count) +{ + int i, j; + + for (i = 0; i < child_count; i++) + for (j = 0; j < child_count; j++) + if (pending_matrix[i][j]) + return 0; + return 1; +} + + +int +afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + +// memset (sh->child_errno, 0, sizeof (int) * priv->child_count); + memset (sh->buf, 0, sizeof (struct stat) * priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (sh->xattr[i]) + dict_unref (sh->xattr[i]); + sh->xattr[i] = NULL; + } + + if (local->govinda_gOvinda) { + gf_log (this->name, GF_LOG_WARNING, + "aborting selfheal of %s", + local->loc.path); + sh->completion_cbk (frame, this); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "proceeding to metadata check on %s", + local->loc.path); + afr_self_heal_metadata (frame, this); + } + + return 0; +} + + +int +sh_missing_entries_unlck_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + afr_sh_missing_entries_done (frame, this); + } + + return 0; +} + + +static int +sh_missing_entries_finish (call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int call_count = 0; + afr_self_heal_t *sh = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = local->child_count; + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "unlocking %"PRId64"/%s on subvolume %s", + sh->parent_loc.inode->ino, local->loc.name, + priv->children[i]->name); + + STACK_WIND (frame, sh_missing_entries_unlck_cbk, + priv->children[i], + priv->children[i]->fops->entrylk, + &sh->parent_loc, local->loc.name, + ENTRYLK_UNLOCK, ENTRYLK_WRLCK); + + if (!--call_count) + break; + } + } + return 0; +} + + +static int +sh_destroy_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int op_errno, struct stat *stbuf) +{ + STACK_DESTROY (frame->root); + return 0; +} + + +static int +sh_missing_entries_newentry_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *stbuf) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + call_frame_t *chown_frame = NULL; + int call_count = 0; + int child_index = 0; + struct stat *buf = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + buf = &sh->buf[sh->source]; + child_index = (long) cookie; + + if (op_ret == 0) { + chown_frame = copy_frame (frame); + + gf_log (this->name, GF_LOG_DEBUG, + "chown %s to %d %d on subvolume %s", + local->loc.path, buf->st_uid, buf->st_gid, + priv->children[child_index]->name); + + STACK_WIND (chown_frame, sh_destroy_cbk, + priv->children[child_index], + priv->children[child_index]->fops->chown, + &local->loc, + buf->st_uid, buf->st_gid); + } + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + sh_missing_entries_finish (frame, this); + } + + return 0; +} + + +static int +sh_missing_entries_mknod (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + int enoent_count = 0; + int call_count = 0; + mode_t st_mode = 0; + dev_t st_dev = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) + if (sh->child_errno[i] == ENOENT) + enoent_count++; + + call_count = enoent_count; + local->call_count = call_count; + + st_mode = sh->buf[sh->source].st_mode; + st_dev = sh->buf[sh->source].st_dev; + + gf_log (this->name, GF_LOG_DEBUG, + "mknod %s mode 0%o on %d subvolumes", + local->loc.path, st_mode, enoent_count); + + for (i = 0; i < priv->child_count; i++) { + if (sh->child_errno[i] == ENOENT) { + STACK_WIND_COOKIE (frame, + sh_missing_entries_newentry_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->mknod, + &local->loc, st_mode, st_dev); + if (!--call_count) + break; + } + } + + return 0; +} + + +static int +sh_missing_entries_mkdir (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + int enoent_count = 0; + int call_count = 0; + mode_t st_mode = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) + if (sh->child_errno[i] == ENOENT) + enoent_count++; + + call_count = enoent_count; + local->call_count = call_count; + + st_mode = sh->buf[sh->source].st_mode; + + gf_log (this->name, GF_LOG_DEBUG, + "mkdir %s mode 0%o on %d subvolumes", + local->loc.path, st_mode, enoent_count); + + for (i = 0; i < priv->child_count; i++) { + if (sh->child_errno[i] == ENOENT) { + STACK_WIND_COOKIE (frame, + sh_missing_entries_newentry_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->mkdir, + &local->loc, st_mode); + if (!--call_count) + break; + } + } + + return 0; +} + + +static int +sh_missing_entries_symlink (call_frame_t *frame, xlator_t *this, + const char *link) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + int enoent_count = 0; + int call_count = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) + if (sh->child_errno[i] == ENOENT) + enoent_count++; + + call_count = enoent_count; + local->call_count = call_count; + + gf_log (this->name, GF_LOG_DEBUG, + "symlink %s -> %s on %d subvolumes", + local->loc.path, link, enoent_count); + + for (i = 0; i < priv->child_count; i++) { + if (sh->child_errno[i] == ENOENT) { + STACK_WIND_COOKIE (frame, + sh_missing_entries_newentry_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->symlink, + link, &local->loc); + if (!--call_count) + break; + } + } + + return 0; +} + + +static int +sh_missing_entries_readlink_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + const char *link) +{ + if (op_ret > 0) + sh_missing_entries_symlink (frame, this, link); + else + sh_missing_entries_finish (frame, this); + + return 0; +} + + +static int +sh_missing_entries_readlink (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + STACK_WIND (frame, sh_missing_entries_readlink_cbk, + priv->children[sh->source], + priv->children[sh->source]->fops->readlink, + &local->loc, 4096); + + return 0; +} + + +static int +sh_missing_entries_create (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int type = 0; + int i = 0; + afr_private_t *priv = NULL; + int enoent_count = 0; + int govinda_gOvinda = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (sh->child_errno[i]) { + if (sh->child_errno[i] == ENOENT) + enoent_count++; + } else { + if (type) { + if (type != (sh->buf[i].st_mode & S_IFMT)) + govinda_gOvinda = 1; + } else { + sh->source = i; + type = sh->buf[i].st_mode & S_IFMT; + } + } + } + + if (govinda_gOvinda) { + gf_log (this->name, GF_LOG_ERROR, + "conflicing filetypes exist for path %s. returning.", + local->loc.path); + + local->govinda_gOvinda = 1; + sh_missing_entries_finish (frame, this); + return 0; + } + + if (!type) { + gf_log (this->name, GF_LOG_ERROR, + "no source found for %s. all nodes down?. returning.", + local->loc.path); + /* subvolumes down and/or file does not exist */ + sh_missing_entries_finish (frame, this); + return 0; + } + + if (enoent_count == 0) { + gf_log (this->name, GF_LOG_ERROR, + "no missing files - %s. proceeding to metadata check", + local->loc.path); + /* proceed to next step - metadata self-heal */ + sh_missing_entries_finish (frame, this); + return 0; + } + + switch (type) { + case S_IFSOCK: + case S_IFREG: + case S_IFBLK: + case S_IFCHR: + case S_IFIFO: + sh_missing_entries_mknod (frame, this); + break; + case S_IFLNK: + sh_missing_entries_readlink (frame, this); + break; + case S_IFDIR: + sh_missing_entries_mkdir (frame, this); + break; + default: + gf_log (this->name, GF_LOG_ERROR, + "unknown file type: 0%o", type); + local->govinda_gOvinda = 1; + sh_missing_entries_finish (frame, this); + } + + return 0; +} + + +static int +sh_missing_entries_lookup_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, dict_t *xattr) +{ + int child_index = 0; + afr_local_t *local = NULL; + int call_count = 0; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (op_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "path %s on subvolume %s is of mode 0%o", + local->loc.path, + priv->children[child_index]->name, + buf->st_mode); + + local->self_heal.buf[child_index] = *buf; + } else { + gf_log (this->name, GF_LOG_WARNING, + "path %s on subvolume %s => -1 (%s)", + local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + + local->self_heal.child_errno[child_index] = op_errno; + } + + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + sh_missing_entries_create (frame, this); + } + + return 0; +} + + +static int +sh_missing_entries_lookup (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + int i = 0; + int call_count = 0; + afr_private_t *priv = NULL; + dict_t *xattr_req = NULL; + int ret = -1; + + local = frame->local; + call_count = local->child_count; + priv = this->private; + + local->call_count = call_count; + + xattr_req = dict_new(); + + if (xattr_req) + ret = dict_set_uint64 (xattr_req, AFR_ENTRY_PENDING, + priv->child_count * sizeof(int32_t)); + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "looking up %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, + sh_missing_entries_lookup_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + &local->loc, xattr_req); + + if (!--call_count) + break; + } + } + + if (xattr_req) + dict_unref (xattr_req); + + return 0; +} + + +static int +sh_missing_entries_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + int child_index = (long) cookie; + + + local = frame->local; + sh = &local->self_heal; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + sh->op_failed = 1; + + gf_log (this->name, + (op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR), + "locking inode of %s on child %d failed: %s", + local->loc.path, child_index, + strerror (op_errno)); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "inode of %s on child %d locked", + local->loc.path, child_index); + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if (sh->op_failed == 1) { + sh_missing_entries_finish (frame, this); + return 0; + } + + sh_missing_entries_lookup (frame, this); + } + + return 0; +} + + +static int +afr_self_heal_missing_entries (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + int call_count = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + gf_log (this->name, GF_LOG_DEBUG, + "attempting to recreate missing entries for path=%s", + local->loc.path); + + afr_build_parent_loc (&sh->parent_loc, &local->loc); + + call_count = local->child_count; + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, sh_missing_entries_lk_cbk, + priv->children[i], + priv->children[i]->fops->entrylk, + &sh->parent_loc, local->loc.name, + ENTRYLK_LOCK_NB, ENTRYLK_WRLCK); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_self_heal (call_frame_t *frame, xlator_t *this, + int (*completion_cbk) (call_frame_t *, xlator_t *)) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + gf_log (this->name, GF_LOG_DEBUG, + "performing self heal on %s (metadata=%d data=%d entry=%d)", + local->loc.path, + local->need_metadata_self_heal, + local->need_data_self_heal, + local->need_entry_self_heal); + + sh->completion_cbk = completion_cbk; + + sh->buf = CALLOC (priv->child_count, sizeof (struct stat)); + sh->child_errno = CALLOC (priv->child_count, sizeof (int)); + sh->success = CALLOC (priv->child_count, sizeof (int)); + sh->xattr = CALLOC (priv->child_count, sizeof (dict_t *)); + sh->sources = CALLOC (sizeof (*sh->sources), priv->child_count); + + sh->pending_matrix = CALLOC (sizeof (int32_t *), priv->child_count); + for (i = 0; i < priv->child_count; i++) { + sh->pending_matrix[i] = CALLOC (sizeof (int32_t), + priv->child_count); + } + + sh->delta_matrix = CALLOC (sizeof (int32_t *), priv->child_count); + for (i = 0; i < priv->child_count; i++) { + sh->delta_matrix[i] = CALLOC (sizeof (int32_t), + priv->child_count); + } + + if (local->success_count && local->enoent_count) { + afr_self_heal_missing_entries (frame, this); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "proceeding to metadata check on %s", + local->loc.path); + afr_sh_missing_entries_done (frame, this); + } + + return 0; +} diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h new file mode 100644 index 000000000..9dd597f07 --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heal-common.h @@ -0,0 +1,66 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __AFR_SELF_HEAL_COMMON_H__ +#define __AFR_SELF_HEAL_COMMON_H__ + +#define FILE_HAS_HOLES(buf) (((buf)->st_size) > ((buf)->st_blocks * 512)) + +int +afr_sh_select_source (int sources[], int child_count); + +int +afr_sh_sink_count (int sources[], int child_count); + +int +afr_sh_source_count (int sources[], int child_count); + +int +afr_sh_supress_errenous_children (int sources[], int child_errno[], + int child_count); + +int +afr_sh_supress_empty_children (int sources[], dict_t *xattr[], + struct stat *buf, + int child_count, const char *key); + +void +afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this); + +void +afr_sh_build_pending_matrix (int32_t *pending_matrix[], dict_t *xattr[], + int child_count, const char *key); + +void +afr_sh_pending_to_delta (int32_t *pending_matrix[], int32_t *delta_matrix[], + int32_t success[], int child_count); + +int +afr_sh_mark_sources (int32_t *pending_matrix[], int sources[], + int child_count); + +int +afr_sh_delta_to_xattr (int32_t *delta_matrix[], dict_t *xattr[], + int child_count, const char *key); + +int +afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count); + + +#endif /* __AFR_SELF_HEAL_COMMON_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c new file mode 100644 index 000000000..3a48da485 --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -0,0 +1,1030 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" +#include "byte-order.h" + +#include "afr-transaction.h" +#include "afr-self-heal.h" +#include "afr-self-heal-common.h" + + + +int +afr_sh_data_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + /* + TODO: cleanup sh->* + */ + + gf_log (this->name, GF_LOG_DEBUG, + "self heal of %s completed", + local->loc.path); + + sh->completion_cbk (frame, this); + + return 0; +} + + +int +afr_sh_data_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + fd_unref (sh->healing_fd); + sh->healing_fd = NULL; + afr_sh_data_done (frame, this); + } + + return 0; +} + + +int +afr_sh_data_close (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_self_heal_t *sh = NULL; + int i = 0; + int call_count = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + if (!sh->healing_fd) { + afr_sh_data_done (frame, this); + return 0; + } + + call_count = sh->active_sinks + 1; + local->call_count = call_count; + + + /* closed source */ + gf_log (this->name, GF_LOG_DEBUG, + "closing fd of %s on %s", + local->loc.path, priv->children[sh->source]->name); + + STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk, + (void *) (long) sh->source, + priv->children[sh->source], + priv->children[sh->source]->fops->flush, + sh->healing_fd); + call_count--; + + for (i = 0; i < priv->child_count; i++) { + if (sh->sources[i] || !local->child_up[i]) + continue; + + gf_log (this->name, GF_LOG_DEBUG, + "closing fd of %s on %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->flush, + sh->healing_fd); + if (!--call_count) + break; + } + + return 0; +} + + +int +afr_sh_data_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t * local = NULL; + int call_count = 0; + int child_index = (long) cookie; + + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "locking inode of %s on child %d failed: %s", + local->loc.path, child_index, + strerror (op_errno)); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "inode of %s on child %d locked", + local->loc.path, child_index); + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + afr_sh_data_close (frame, this); + } + + return 0; +} + + +int +afr_sh_data_unlock (call_frame_t *frame, xlator_t *this) +{ + struct flock flock; + int i = 0; + int call_count = 0; + + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + afr_self_heal_t * sh = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = local->child_count; + + local->call_count = call_count; + + flock.l_start = 0; + flock.l_len = 0; + flock.l_type = F_UNLCK; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "unlocking %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_data_unlck_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->inodelk, + &local->loc, F_SETLK, &flock); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_sh_data_finish (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + gf_log (this->name, GF_LOG_DEBUG, + "finishing data selfheal of %s", local->loc.path); + + afr_sh_data_unlock (frame, this); + + return 0; +} + + +int +afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xattr) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_sh_data_finish (frame, this); + + return 0; +} + + +int +afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int i = 0; + dict_t **erase_xattr = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + + afr_sh_pending_to_delta (sh->pending_matrix, sh->delta_matrix, + sh->success, priv->child_count); + + erase_xattr = CALLOC (sizeof (*erase_xattr), priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (sh->xattr[i]) { + call_count++; + + erase_xattr[i] = get_new_dict(); + dict_ref (erase_xattr[i]); + } + } + + afr_sh_delta_to_xattr (sh->delta_matrix, erase_xattr, + priv->child_count, AFR_DATA_PENDING); + + local->call_count = call_count; + for (i = 0; i < priv->child_count; i++) { + if (!erase_xattr[i]) + continue; + + gf_log (this->name, GF_LOG_DEBUG, + "erasing pending flags from %s on %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_data_erase_pending_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->loc, + GF_XATTROP_ADD_ARRAY, erase_xattr[i]); + if (!--call_count) + break; + } + + for (i = 0; i < priv->child_count; i++) { + if (erase_xattr[i]) { + dict_unref (erase_xattr[i]); + } + } + FREE (erase_xattr); + + return 0; +} + + +int +afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + int child_index = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (op_ret == -1) + gf_log (this->name, GF_LOG_ERROR, + "ftruncate of %s on subvolume %s failed (%s)", + local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + else + gf_log (this->name, GF_LOG_DEBUG, + "ftruncate of %s on subvolume %s completed", + local->loc.path, + priv->children[child_index]->name); + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + afr_sh_data_erase_pending (frame, this); + } + + return 0; +} + + +int +afr_sh_data_trim_sinks (call_frame_t *frame, xlator_t *this) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_self_heal_t *sh = NULL; + int *sources = NULL; + int call_count = 0; + int i = 0; + + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + sources = sh->sources; + call_count = sh->active_sinks; + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (sources[i] || !local->child_up[i]) + continue; + + STACK_WIND_COOKIE (frame, afr_sh_data_trim_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->ftruncate, + sh->healing_fd, sh->file_size); + + if (!--call_count) + break; + } + + return 0; +} + + +int +afr_sh_data_read_write_iter (call_frame_t *frame, xlator_t *this); + +int +afr_sh_data_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_self_heal_t *sh = NULL; + + int child_index = (long) cookie; + int call_count = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + gf_log (this->name, GF_LOG_DEBUG, + "wrote %d bytes of data from %s to child %d, offset %"PRId64"", + op_ret, local->loc.path, child_index, sh->offset - op_ret); + + LOCK (&frame->lock); + { + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "write to %s failed on subvolume %s (%s)", + local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + sh->op_failed = 1; + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + afr_sh_data_read_write_iter (frame, this); + } + + return 0; +} + + +int +afr_sh_data_read_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + struct iovec *vector, int32_t count, struct stat *buf) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_self_heal_t *sh = NULL; + + int child_index = (long) cookie; + int i = 0; + int call_count = 0; + + off_t offset; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + call_count = sh->active_sinks; + + local->call_count = call_count; + + gf_log (this->name, GF_LOG_DEBUG, + "read %d bytes of data from %s on child %d, offset %"PRId64"", + op_ret, local->loc.path, child_index, sh->offset); + + if (op_ret <= 0) { + afr_sh_data_trim_sinks (frame, this); + return 0; + } + + /* what if we read less than block size? */ + offset = sh->offset; + sh->offset += op_ret; + + frame->root->req_refs = frame->root->rsp_refs; + + if (sh->file_has_holes) { + if (iov_0filled (vector, count) == 0) { + /* the iter function depends on the + sh->offset already being updated + above + */ + afr_sh_data_read_write_iter (frame, this); + goto out; + } + } + + for (i = 0; i < priv->child_count; i++) { + if (sh->sources[i] || !local->child_up[i]) + continue; + + /* this is a sink, so write to it */ + STACK_WIND_COOKIE (frame, afr_sh_data_write_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->writev, + sh->healing_fd, vector, count, offset); + + if (!--call_count) + break; + } + +out: + return 0; +} + + +int +afr_sh_data_read_write (call_frame_t *frame, xlator_t *this) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_self_heal_t *sh = NULL; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + STACK_WIND_COOKIE (frame, afr_sh_data_read_cbk, + (void *) (long) sh->source, + priv->children[sh->source], + priv->children[sh->source]->fops->readv, + sh->healing_fd, sh->block_size, + sh->offset); + + return 0; +} + + +int +afr_sh_data_read_write_iter (call_frame_t *frame, xlator_t *this) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_self_heal_t *sh = NULL; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + if (sh->op_failed) { + afr_sh_data_finish (frame, this); + goto out; + } + + if (sh->offset >= sh->file_size) { + gf_log (this->name, GF_LOG_DEBUG, + "closing fd's of %s", + local->loc.path); + afr_sh_data_trim_sinks (frame, this); + + goto out; + } + + afr_sh_data_read_write (frame, this); + +out: + return 0; +} + + +int +afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int child_index = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + child_index = (long) cookie; + + /* TODO: some of the open's might fail. + In that case, modify cleanup fn to send flush on those + fd's which are already open */ + + LOCK (&frame->lock); + { + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "open of %s failed on child %s (%s)", + local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + sh->op_failed = 1; + } + + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if (sh->op_failed) { + afr_sh_data_finish (frame, this); + return 0; + } + gf_log (this->name, GF_LOG_DEBUG, + "fd for %s opened, commencing sync", + local->loc.path); + + gf_log (this->name, GF_LOG_WARNING, + "sourcing file %s from %s to other sinks", + local->loc.path, priv->children[sh->source]->name); + + afr_sh_data_read_write (frame, this); + } + + return 0; +} + + +int +afr_sh_data_open (call_frame_t *frame, xlator_t *this) +{ + int i = 0; + int call_count = 0; + + int source = -1; + int *sources = NULL; + + fd_t *fd = NULL; + + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = sh->active_sinks + 1; + local->call_count = call_count; + + fd = fd_create (local->loc.inode, frame->root->pid); + sh->healing_fd = fd; + + source = local->self_heal.source; + sources = local->self_heal.sources; + + sh->block_size = 65536; + sh->file_size = sh->buf[source].st_size; + + if (FILE_HAS_HOLES (&sh->buf[source])) + sh->file_has_holes = 1; + + /* open source */ + STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk, + (void *) (long) source, + priv->children[source], + priv->children[source]->fops->open, + &local->loc, O_RDONLY|O_LARGEFILE, fd); + call_count--; + + /* open sinks */ + for (i = 0; i < priv->child_count; i++) { + if(sources[i] || !local->child_up[i]) + continue; + + STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->open, + &local->loc, + O_WRONLY|O_LARGEFILE, fd); + + if (!--call_count) + break; + } + + return 0; +} + + +int +afr_sh_data_sync_prepare (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int active_sinks = 0; + int source = 0; + int i = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + source = sh->source; + + for (i = 0; i < priv->child_count; i++) { + if (sh->sources[i] == 0 && local->child_up[i] == 1) { + active_sinks++; + sh->success[i] = 1; + } + } + sh->success[source] = 1; + + if (active_sinks == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "no active sinks for performing self-heal on file %s", + local->loc.path); + afr_sh_data_finish (frame, this); + return 0; + } + sh->active_sinks = active_sinks; + + gf_log (this->name, GF_LOG_DEBUG, + "syncing data of %s from subvolume %s to %d active sinks", + local->loc.path, priv->children[source]->name, active_sinks); + + afr_sh_data_open (frame, this); + + return 0; +} + + +int +afr_sh_data_fix (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int nsources = 0; + int source = 0; + int i = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + afr_sh_build_pending_matrix (sh->pending_matrix, sh->xattr, + priv->child_count, AFR_DATA_PENDING); + + afr_sh_print_pending_matrix (sh->pending_matrix, this); + + + afr_sh_mark_sources (sh->pending_matrix, sh->sources, + priv->child_count); + + afr_sh_supress_empty_children (sh->sources, sh->xattr, sh->buf, + priv->child_count, AFR_DATA_PENDING); + + afr_sh_supress_errenous_children (sh->sources, sh->child_errno, + priv->child_count); + + nsources = afr_sh_source_count (sh->sources, priv->child_count); + + if ((nsources == 0) + && (priv->favorite_child != -1) + && (sh->child_errno[priv->favorite_child] == 0)) { + + gf_log (this->name, GF_LOG_WARNING, + "Picking favorite child %s as authentic source to resolve conflicting data of %s", + priv->children[priv->favorite_child]->name, + local->loc.path); + + sh->sources[priv->favorite_child] = 1; + + nsources = afr_sh_source_count (sh->sources, + priv->child_count); + } + + if (nsources == 0) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to resolve conflicting data of %s. " + "Please resolve manually by deleting the file %s " + "from all but the preferred subvolume. " + "Please consider 'option favorite-child <>'", + local->loc.path, local->loc.path); + + local->govinda_gOvinda = 1; + + afr_sh_data_finish (frame, this); + return 0; + } + + source = afr_sh_select_source (sh->sources, priv->child_count); + sh->source = source; + + /* detect changes not visible through pending flags -- JIC */ + for (i = 0; i < priv->child_count; i++) { + if (i == source || sh->child_errno[i]) + continue; + + if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[source])) + sh->sources[i] = 0; + } + + afr_sh_data_sync_prepare (frame, this); + + return 0; +} + + +int +afr_sh_data_lookup_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, dict_t *xattr) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + int call_count = -1; + int child_index = (long) cookie; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + LOCK (&frame->lock); + { + if (op_ret != -1) { + sh->xattr[child_index] = dict_ref (xattr); + sh->buf[child_index] = *buf; + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + afr_sh_data_fix (frame, this); + } + + return 0; +} + + +int +afr_sh_data_lookup (call_frame_t *frame, xlator_t *this) +{ + afr_self_heal_t *sh = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + dict_t *xattr_req = NULL; + + int call_count = 0; + int i = 0; + int ret = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + call_count = local->child_count; + + local->call_count = call_count; + + xattr_req = dict_new(); + if (xattr_req) + ret = dict_set_uint64 (xattr_req, AFR_DATA_PENDING, + priv->child_count * sizeof(int32_t)); + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_sh_data_lookup_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + &local->loc, xattr_req); + if (!--call_count) + break; + } + } + + if (xattr_req) + dict_unref (xattr_req); + + return 0; +} + + +int +afr_sh_data_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + int child_index = (long) cookie; + + /* TODO: what if lock fails? */ + + local = frame->local; + sh = &local->self_heal; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + sh->op_failed = 1; + + gf_log (this->name, + (op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR), + "locking of %s on child %d failed: %s", + local->loc.path, child_index, + strerror (op_errno)); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "inode of %s on child %d locked", + local->loc.path, child_index); + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if (sh->op_failed) { + afr_sh_data_finish (frame, this); + return 0; + } + + afr_sh_data_lookup (frame, this); + } + + return 0; +} + + +int +afr_sh_data_lock (call_frame_t *frame, xlator_t *this) +{ + struct flock flock; + int i = 0; + int call_count = 0; + + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + afr_self_heal_t * sh = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = local->child_count; + + local->call_count = call_count; + + flock.l_start = 0; + flock.l_len = 0; + flock.l_type = F_WRLCK; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "locking %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_data_lock_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->inodelk, + &local->loc, F_SETLK, &flock); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_self_heal_data (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = this->private; + + + local = frame->local; + sh = &local->self_heal; + + if (local->need_data_self_heal && priv->data_self_heal) { + afr_sh_data_lock (frame, this); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "not doing data self heal on %s", + local->loc.path); + afr_sh_data_done (frame, this); + } + + return 0; +} + diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c new file mode 100644 index 000000000..ec341922e --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -0,0 +1,2038 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" +#include "byte-order.h" + +#include "afr-transaction.h" +#include "afr-self-heal.h" +#include "afr-self-heal-common.h" + + + +int +afr_sh_entry_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + /* + TODO: cleanup sh->* + */ + + gf_log (this->name, GF_LOG_DEBUG, + "self heal of %s completed", + local->loc.path); + + sh->completion_cbk (frame, this); + + return 0; +} + + +int +afr_sh_entry_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + int child_index = (long) cookie; + + /* TODO: what if lock fails? */ + + local = frame->local; + sh = &local->self_heal; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "unlocking inode of %s on child %d failed: %s", + local->loc.path, child_index, + strerror (op_errno)); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "unlocked inode of %s on child %d", + local->loc.path, child_index); + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if (sh->healing_fd) + fd_unref (sh->healing_fd); + sh->healing_fd = NULL; + afr_sh_entry_done (frame, this); + } + + return 0; +} + + +int +afr_sh_entry_unlock (call_frame_t *frame, xlator_t *this) +{ + int i = 0; + int call_count = 0; + + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + afr_self_heal_t * sh = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = local->child_count; + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "unlocking %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_entry_unlck_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->entrylk, + &local->loc, NULL, + ENTRYLK_UNLOCK, ENTRYLK_WRLCK); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_sh_entry_finish (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + gf_log (this->name, GF_LOG_DEBUG, + "finishing entry selfheal of %s", local->loc.path); + + afr_sh_entry_unlock (frame, this); + + return 0; +} + + +int +afr_sh_entry_erase_pending_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xattr) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_sh_entry_finish (frame, this); + + return 0; +} + + +int +afr_sh_entry_erase_pending (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int i = 0; + dict_t **erase_xattr = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + + afr_sh_pending_to_delta (sh->pending_matrix, sh->delta_matrix, + sh->success, priv->child_count); + + erase_xattr = CALLOC (sizeof (*erase_xattr), priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (sh->xattr[i]) { + call_count++; + + erase_xattr[i] = get_new_dict(); + dict_ref (erase_xattr[i]); + } + } + + afr_sh_delta_to_xattr (sh->delta_matrix, erase_xattr, + priv->child_count, AFR_ENTRY_PENDING); + + local->call_count = call_count; + for (i = 0; i < priv->child_count; i++) { + if (!erase_xattr[i]) + continue; + + gf_log (this->name, GF_LOG_DEBUG, + "erasing pending flags from %s on %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_entry_erase_pending_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->loc, + GF_XATTROP_ADD_ARRAY, erase_xattr[i]); + if (!--call_count) + break; + } + + for (i = 0; i < priv->child_count; i++) { + if (erase_xattr[i]) { + dict_unref (erase_xattr[i]); + } + } + FREE (erase_xattr); + + return 0; +} + + + +static int +next_active_source (call_frame_t *frame, xlator_t *this, + int current_active_source) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int source = -1; + int next_active_source = -1; + int i = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + source = sh->source; + + if (source != -1) { + if (current_active_source != source) + next_active_source = source; + goto out; + } + + /* + the next active sink becomes the source for the + 'conservative decision' of merging all entries + */ + + for (i = 0; i < priv->child_count; i++) { + if ((sh->sources[i] == 0) + && (local->child_up[i] == 1) + && (i > current_active_source)) { + + next_active_source = i; + break; + } + } +out: + return next_active_source; +} + + + +static int +next_active_sink (call_frame_t *frame, xlator_t *this, + int current_active_sink) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int next_active_sink = -1; + int i = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + /* + the next active sink becomes the source for the + 'conservative decision' of merging all entries + */ + + for (i = 0; i < priv->child_count; i++) { + if ((sh->sources[i] == 0) + && (local->child_up[i] == 1) + && (i > current_active_sink)) { + + next_active_sink = i; + break; + } + } + + return next_active_sink; +} + + +int +build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name) +{ + int ret = -1; + + if (!child) { + goto out; + } + + if (strcmp (parent->path, "/") == 0) + asprintf ((char **)&child->path, "/%s", name); + else + asprintf ((char **)&child->path, "%s/%s", parent->path, name); + + if (!child->path) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + child->name = strrchr (child->path, '/'); + if (child->name) + child->name++; + + child->parent = inode_ref (parent->inode); + child->inode = inode_new (parent->inode->table); + + if (!child->inode) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ret = 0; +out: + if (ret == -1) + loc_wipe (child); + + return ret; +} + + +int +afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this); + +int +afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this, + int active_src); + +int +afr_sh_entry_expunge_entry_done (call_frame_t *frame, xlator_t *this, + int active_src) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_sh_entry_expunge_subvol (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_expunge_remove_cbk (call_frame_t *expunge_frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; + afr_self_heal_t *expunge_sh = NULL; + int active_src = 0; + call_frame_t *frame = NULL; + + + priv = this->private; + expunge_local = expunge_frame->local; + expunge_sh = &expunge_local->self_heal; + frame = expunge_sh->sh_frame; + + active_src = (long) cookie; + + if (op_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "removed %s on %s", + expunge_local->loc.path, + priv->children[active_src]->name); + } else { + gf_log (this->name, GF_LOG_ERROR, + "removing %s on %s failed (%s)", + expunge_local->loc.path, + priv->children[active_src]->name, + strerror (op_errno)); + } + + AFR_STACK_DESTROY (expunge_frame); + afr_sh_entry_expunge_entry_done (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_expunge_rmdir (call_frame_t *expunge_frame, xlator_t *this, + int active_src) +{ + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; + + priv = this->private; + expunge_local = expunge_frame->local; + + gf_log (this->name, GF_LOG_WARNING, + "removing directory %s on %s", + expunge_local->loc.path, priv->children[active_src]->name); + + STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk, + (void *) (long) active_src, + priv->children[active_src], + priv->children[active_src]->fops->rmdir, + &expunge_local->loc); + + return 0; +} + + +int +afr_sh_entry_expunge_unlink (call_frame_t *expunge_frame, xlator_t *this, + int active_src) +{ + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; + + priv = this->private; + expunge_local = expunge_frame->local; + + gf_log (this->name, GF_LOG_WARNING, + "unlinking file %s on %s", + expunge_local->loc.path, priv->children[active_src]->name); + + STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk, + (void *) (long) active_src, + priv->children[active_src], + priv->children[active_src]->fops->unlink, + &expunge_local->loc); + + return 0; +} + + +int +afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this, + int active_src, struct stat *buf) +{ + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; + afr_self_heal_t *expunge_sh = NULL; + int source = 0; + call_frame_t *frame = NULL; + int type = 0; + + priv = this->private; + expunge_local = expunge_frame->local; + expunge_sh = &expunge_local->self_heal; + frame = expunge_sh->sh_frame; + source = expunge_sh->source; + + type = (buf->st_mode & S_IFMT); + + switch (type) { + case S_IFSOCK: + case S_IFREG: + case S_IFBLK: + case S_IFCHR: + case S_IFIFO: + case S_IFLNK: + afr_sh_entry_expunge_unlink (expunge_frame, this, active_src); + + break; + case S_IFDIR: + afr_sh_entry_expunge_rmdir (expunge_frame, this, active_src); + break; + default: + gf_log (this->name, GF_LOG_ERROR, + "%s has unknown file type on %s: 0%o", + expunge_local->loc.path, + priv->children[source]->name, type); + goto out; + break; + } + + return 0; +out: + AFR_STACK_DESTROY (expunge_frame); + afr_sh_entry_expunge_entry_done (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_expunge_lookup_cbk (call_frame_t *expunge_frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, dict_t *x) +{ + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; + afr_self_heal_t *expunge_sh = NULL; + call_frame_t *frame = NULL; + int active_src = 0; + + priv = this->private; + expunge_local = expunge_frame->local; + expunge_sh = &expunge_local->self_heal; + frame = expunge_sh->sh_frame; + active_src = (long) cookie; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "lookup of %s on %s failed (%s)", + expunge_local->loc.path, + priv->children[active_src]->name, + strerror (op_errno)); + goto out; + } + + afr_sh_entry_expunge_remove (expunge_frame, this, active_src, buf); + + return 0; +out: + AFR_STACK_DESTROY (expunge_frame); + afr_sh_entry_expunge_entry_done (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_expunge_purge (call_frame_t *expunge_frame, xlator_t *this, + int active_src) +{ + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; + + priv = this->private; + expunge_local = expunge_frame->local; + + gf_log (this->name, GF_LOG_DEBUG, + "looking up %s on %s", + expunge_local->loc.path, priv->children[active_src]->name); + + STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_lookup_cbk, + (void *) (long) active_src, + priv->children[active_src], + priv->children[active_src]->fops->lookup, + &expunge_local->loc, 0); + + return 0; +} + + +int +afr_sh_entry_expunge_entry_cbk (call_frame_t *expunge_frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, dict_t *x) +{ + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; + afr_self_heal_t *expunge_sh = NULL; + int source = 0; + call_frame_t *frame = NULL; + int active_src = 0; + + + priv = this->private; + expunge_local = expunge_frame->local; + expunge_sh = &expunge_local->self_heal; + frame = expunge_sh->sh_frame; + active_src = expunge_sh->active_source; + source = (long) cookie; + + if (op_ret == -1 && op_errno == ENOENT) { + + gf_log (this->name, GF_LOG_DEBUG, + "missing entry %s on %s", + expunge_local->loc.path, + priv->children[source]->name); + + afr_sh_entry_expunge_purge (expunge_frame, this, active_src); + + return 0; + } + + if (op_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "%s exists under %s", + expunge_local->loc.path, + priv->children[source]->name); + } else { + gf_log (this->name, GF_LOG_ERROR, + "looking up %s under %s failed (%s)", + expunge_local->loc.path, + priv->children[source]->name, + strerror (op_errno)); + } + + AFR_STACK_DESTROY (expunge_frame); + afr_sh_entry_expunge_entry_done (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this, + char *name) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int ret = -1; + call_frame_t *expunge_frame = NULL; + afr_local_t *expunge_local = NULL; + afr_self_heal_t *expunge_sh = NULL; + int active_src = 0; + int source = 0; + int op_errno = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + active_src = sh->active_source; + source = sh->source; + + if ((strcmp (name, ".") == 0) + || (strcmp (name, "..") == 0)) { + gf_log (this->name, GF_LOG_DEBUG, + "skipping inspection of %s under %s", + name, local->loc.path); + goto out; + } + + gf_log (this->name, GF_LOG_DEBUG, + "inspecting existance of %s under %s", + name, local->loc.path); + + expunge_frame = copy_frame (frame); + if (!expunge_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (expunge_local, afr_local_t, out); + + expunge_frame->local = expunge_local; + expunge_sh = &expunge_local->self_heal; + expunge_sh->sh_frame = frame; + expunge_sh->active_source = active_src; + + ret = build_child_loc (this, &expunge_local->loc, &local->loc, name); + if (ret != 0) { + goto out; + } + + gf_log (this->name, GF_LOG_DEBUG, + "looking up %s on %s", expunge_local->loc.path, + priv->children[source]->name); + + STACK_WIND_COOKIE (expunge_frame, + afr_sh_entry_expunge_entry_cbk, + (void *) (long) source, + priv->children[source], + priv->children[source]->fops->lookup, + &expunge_local->loc, 0); + + ret = 0; +out: + if (ret == -1) + afr_sh_entry_expunge_entry_done (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_expunge_readdir_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + gf_dirent_t *entries) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + gf_dirent_t *entry = NULL; + off_t last_offset = 0; + int active_src = 0; + int entry_count = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + active_src = sh->active_source; + + if (op_ret <= 0) { + if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "readdir of %s on subvolume %s failed (%s)", + local->loc.path, + priv->children[active_src]->name, + strerror (op_errno)); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "readdir of %s on subvolume %s complete", + local->loc.path, + priv->children[active_src]->name); + } + + afr_sh_entry_expunge_all (frame, this); + return 0; + } + + list_for_each_entry (entry, &entries->list, list) { + last_offset = entry->d_off; + entry_count++; + } + + gf_log (this->name, GF_LOG_DEBUG, + "readdir'ed %d entries from %s", + entry_count, priv->children[active_src]->name); + + sh->offset = last_offset; + local->call_count = entry_count; + + list_for_each_entry (entry, &entries->list, list) { + afr_sh_entry_expunge_entry (frame, this, entry->d_name); + } + + return 0; +} + +int +afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this, + int active_src) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + STACK_WIND (frame, afr_sh_entry_expunge_readdir_cbk, + priv->children[active_src], + priv->children[active_src]->fops->readdir, + sh->healing_fd, sh->block_size, sh->offset); + + return 0; +} + + +int +afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int active_src = -1; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + sh->offset = 0; + + if (sh->source == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "no active sources for %s to expunge entries", + local->loc.path); + goto out; + } + + active_src = next_active_sink (frame, this, sh->active_source); + sh->active_source = active_src; + + if (sh->op_failed) { + goto out; + } + + if (active_src == -1) { + /* completed creating missing files on all subvolumes */ + goto out; + } + + gf_log (this->name, GF_LOG_DEBUG, + "expunging entries of %s on %s to other sinks", + local->loc.path, priv->children[active_src]->name); + + afr_sh_entry_expunge_subvol (frame, this, active_src); + + return 0; +out: + afr_sh_entry_erase_pending (frame, this); + return 0; + +} + + +int +afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this); + +int +afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this, + int active_src); + +int +afr_sh_entry_impunge_entry_done (call_frame_t *frame, xlator_t *this, + int active_src) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_sh_entry_impunge_subvol (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_impunge_utimens_cbk (call_frame_t *impunge_frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, struct stat *stbuf) +{ + int call_count = 0; + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + call_frame_t *frame = NULL; + int active_src = 0; + int child_index = 0; + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + frame = impunge_sh->sh_frame; + child_index = (long) cookie; + + if (op_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "utimes set for %s on %s", + impunge_local->loc.path, + priv->children[child_index]->name); + } else { + gf_log (this->name, GF_LOG_ERROR, + "setting utimes of %s on %s failed (%s)", + impunge_local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + } + + LOCK (&impunge_frame->lock); + { + call_count = --impunge_local->call_count; + } + UNLOCK (&impunge_frame->lock); + + if (call_count == 0) { + AFR_STACK_DESTROY (impunge_frame); + afr_sh_entry_impunge_entry_done (frame, this, active_src); + } + + return 0; +} + + +int +afr_sh_entry_impunge_chown_cbk (call_frame_t *impunge_frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, struct stat *stbuf) +{ + int call_count = 0; + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + call_frame_t *frame = NULL; + int active_src = 0; + int child_index = 0; + struct timespec ts[2]; + + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + frame = impunge_sh->sh_frame; + child_index = (long) cookie; + + if (op_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "ownership of %s on %s changed", + impunge_local->loc.path, + priv->children[child_index]->name); + } else { + gf_log (this->name, GF_LOG_ERROR, + "setting ownership of %s on %s failed (%s)", + impunge_local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + goto out; + } + +#ifdef HAVE_STRUCT_STAT_ST_ATIM_TV_NSEC + ts[0] = impunge_local->cont.lookup.buf.st_atim; + ts[1] = impunge_local->cont.lookup.buf.st_mtim; +#elif HAVE_STRUCT_STAT_ST_ATIMESPEC_TV_NSEC + ts[0] = impunge_local->cont.lookup.buf.st_atimespec; + ts[1] = impunge_local->cont.lookup.buf.st_mtimespec; +#else + ts[0].tv_sec = impunge_local->cont.lookup.buf.st_atime; + ts[1].tv_sec = impunge_local->cont.lookup.buf.st_mtime; +#endif + STACK_WIND_COOKIE (impunge_frame, + afr_sh_entry_impunge_utimens_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->utimens, + &impunge_local->loc, ts); + + return 0; + +out: + LOCK (&impunge_frame->lock); + { + call_count = --impunge_local->call_count; + } + UNLOCK (&impunge_frame->lock); + + if (call_count == 0) { + AFR_STACK_DESTROY (impunge_frame); + afr_sh_entry_impunge_entry_done (frame, this, active_src); + } + + return 0; +} + + +int +afr_sh_entry_impunge_newfile_cbk (call_frame_t *impunge_frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *stbuf) +{ + int call_count = 0; + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + call_frame_t *frame = NULL; + int active_src = 0; + int child_index = 0; + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + frame = impunge_sh->sh_frame; + + child_index = (long) cookie; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "creation of %s on %s failed (%s)", + impunge_local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + goto out; + } + + gf_log (this->name, GF_LOG_DEBUG, + "setting ownership of %s on %s to %d/%d", + impunge_local->loc.path, + priv->children[child_index]->name, + impunge_local->cont.lookup.buf.st_uid, + impunge_local->cont.lookup.buf.st_gid); + + inode->st_mode = stbuf->st_mode; + + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_chown_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->chown, + &impunge_local->loc, + impunge_local->cont.lookup.buf.st_uid, + impunge_local->cont.lookup.buf.st_gid); + return 0; + +out: + LOCK (&impunge_frame->lock); + { + call_count = --impunge_local->call_count; + } + UNLOCK (&impunge_frame->lock); + + if (call_count == 0) { + AFR_STACK_DESTROY (impunge_frame); + afr_sh_entry_impunge_entry_done (frame, this, active_src); + } + + return 0; +} + + +int +afr_sh_entry_impunge_mknod (call_frame_t *impunge_frame, xlator_t *this, + int child_index, struct stat *stbuf) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + + gf_log (this->name, GF_LOG_WARNING, + "creating file %s mode=0%o dev=0x%"GF_PRI_DEV" on %s", + impunge_local->loc.path, + stbuf->st_mode, stbuf->st_rdev, + priv->children[child_index]->name); + + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->mknod, + &impunge_local->loc, + stbuf->st_mode, stbuf->st_rdev); + + return 0; +} + + + +int +afr_sh_entry_impunge_mkdir (call_frame_t *impunge_frame, xlator_t *this, + int child_index, struct stat *stbuf) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + + gf_log (this->name, GF_LOG_WARNING, + "creating directory %s mode=0%o on %s", + impunge_local->loc.path, + stbuf->st_mode, + priv->children[child_index]->name); + + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->mkdir, + &impunge_local->loc, stbuf->st_mode); + + return 0; +} + + +int +afr_sh_entry_impunge_symlink (call_frame_t *impunge_frame, xlator_t *this, + int child_index, const char *linkname) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + + gf_log (this->name, GF_LOG_WARNING, + "creating symlink %s -> %s on %s", + impunge_local->loc.path, linkname, + priv->children[child_index]->name); + + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->symlink, + linkname, &impunge_local->loc); + + return 0; +} + + +int +afr_sh_entry_impunge_readlink_cbk (call_frame_t *impunge_frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + const char *linkname) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + int child_index = -1; + call_frame_t *frame = NULL; + int call_count = -1; + int active_src = -1; + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + frame = impunge_sh->sh_frame; + active_src = impunge_sh->active_source; + + child_index = (long) cookie; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "readlink of %s on %s failed (%s)", + impunge_local->loc.path, + priv->children[active_src]->name, + strerror (op_errno)); + goto out; + } + + afr_sh_entry_impunge_symlink (impunge_frame, this, child_index, + linkname); + return 0; + +out: + LOCK (&impunge_frame->lock); + { + call_count = --impunge_local->call_count; + } + UNLOCK (&impunge_frame->lock); + + if (call_count == 0) { + AFR_STACK_DESTROY (impunge_frame); + afr_sh_entry_impunge_entry_done (frame, this, active_src); + } + + return 0; +} + + +int +afr_sh_entry_impunge_readlink (call_frame_t *impunge_frame, xlator_t *this, + int child_index, struct stat *stbuf) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + int active_src = -1; + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + active_src = impunge_sh->active_source; + + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_cbk, + (void *) (long) child_index, + priv->children[active_src], + priv->children[active_src]->fops->readlink, + &impunge_local->loc, 4096); + + return 0; +} + + +int +afr_sh_entry_impunge_recreate_lookup_cbk (call_frame_t *impunge_frame, + void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, + dict_t *xattr) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + int active_src = 0; + int type = 0; + int child_index = 0; + call_frame_t *frame = NULL; + int call_count = 0; + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + frame = impunge_sh->sh_frame; + + child_index = (long) cookie; + + active_src = impunge_sh->active_source; + + if (op_ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "looking up %s on %s (for %s) failed (%s)", + impunge_local->loc.path, + priv->children[active_src]->name, + priv->children[child_index]->name, + strerror (op_errno)); + goto out; + } + + impunge_local->cont.lookup.buf = *buf; + type = (buf->st_mode & S_IFMT); + + switch (type) { + case S_IFSOCK: + case S_IFREG: + case S_IFBLK: + case S_IFCHR: + case S_IFIFO: + afr_sh_entry_impunge_mknod (impunge_frame, this, + child_index, buf); + break; + case S_IFLNK: + afr_sh_entry_impunge_readlink (impunge_frame, this, + child_index, buf); + break; + case S_IFDIR: + afr_sh_entry_impunge_mkdir (impunge_frame, this, + child_index, buf); + break; + default: + gf_log (this->name, GF_LOG_ERROR, + "%s has unknown file type on %s: 0%o", + impunge_local->loc.path, + priv->children[active_src]->name, type); + goto out; + break; + } + + return 0; + +out: + LOCK (&impunge_frame->lock); + { + call_count = --impunge_local->call_count; + } + UNLOCK (&impunge_frame->lock); + + if (call_count == 0) { + AFR_STACK_DESTROY (impunge_frame); + afr_sh_entry_impunge_entry_done (frame, this, active_src); + } + + return 0; +} + + +int +afr_sh_entry_impunge_recreate (call_frame_t *impunge_frame, xlator_t *this, + int child_index) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + int active_src = 0; + + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + + active_src = impunge_sh->active_source; + + STACK_WIND_COOKIE (impunge_frame, + afr_sh_entry_impunge_recreate_lookup_cbk, + (void *) (long) child_index, + priv->children[active_src], + priv->children[active_src]->fops->lookup, + &impunge_local->loc, 0); + + return 0; +} + + +int +afr_sh_entry_impunge_entry_cbk (call_frame_t *impunge_frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, dict_t *x) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + int call_count = 0; + int child_index = 0; + call_frame_t *frame = NULL; + int active_src = 0; + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + frame = impunge_sh->sh_frame; + child_index = (long) cookie; + active_src = impunge_sh->active_source; + + if (op_ret == -1 && op_errno == ENOENT) { + /* decrease call_count in recreate-callback */ + gf_log (this->name, GF_LOG_DEBUG, + "missing entry %s on %s", + impunge_local->loc.path, + priv->children[child_index]->name); + + afr_sh_entry_impunge_recreate (impunge_frame, this, + child_index); + return 0; + } + + if (op_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "%s exists under %s", + impunge_local->loc.path, + priv->children[child_index]->name); + } else { + gf_log (this->name, GF_LOG_ERROR, + "looking up %s under %s failed (%s)", + impunge_local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + } + + LOCK (&impunge_frame->lock); + { + call_count = --impunge_local->call_count; + } + UNLOCK (&impunge_frame->lock); + + if (call_count == 0) { + AFR_STACK_DESTROY (impunge_frame); + afr_sh_entry_impunge_entry_done (frame, this, active_src); + } + + return 0; +} + + +int +afr_sh_entry_impunge_entry (call_frame_t *frame, xlator_t *this, + char *name) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int ret = -1; + call_frame_t *impunge_frame = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + int active_src = 0; + int i = 0; + int call_count = 0; + int op_errno = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + active_src = sh->active_source; + + if ((strcmp (name, ".") == 0) + || (strcmp (name, "..") == 0)) { + gf_log (this->name, GF_LOG_DEBUG, + "skipping inspection of %s under %s", + name, local->loc.path); + goto out; + } + + gf_log (this->name, GF_LOG_DEBUG, + "inspecting existance of %s under %s", + name, local->loc.path); + + impunge_frame = copy_frame (frame); + if (!impunge_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (impunge_local, afr_local_t, out); + + impunge_frame->local = impunge_local; + impunge_sh = &impunge_local->self_heal; + impunge_sh->sh_frame = frame; + impunge_sh->active_source = active_src; + + ret = build_child_loc (this, &impunge_local->loc, &local->loc, name); + if (ret != 0) { + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (i == active_src) + continue; + if (local->child_up[i] == 0) + continue; + if (sh->sources[i] == 1) + continue; + call_count++; + } + + impunge_local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (i == active_src) + continue; + if (local->child_up[i] == 0) + continue; + if (sh->sources[i] == 1) + continue; + + gf_log (this->name, GF_LOG_DEBUG, + "looking up %s on %s", impunge_local->loc.path, + priv->children[i]->name); + + STACK_WIND_COOKIE (impunge_frame, + afr_sh_entry_impunge_entry_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + &impunge_local->loc, 0); + + if (!--call_count) + break; + } + + ret = 0; +out: + if (ret == -1) + afr_sh_entry_impunge_entry_done (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + gf_dirent_t *entries) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + gf_dirent_t *entry = NULL; + off_t last_offset = 0; + int active_src = 0; + int entry_count = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + active_src = sh->active_source; + + if (op_ret <= 0) { + if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "readdir of %s on subvolume %s failed (%s)", + local->loc.path, + priv->children[active_src]->name, + strerror (op_errno)); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "readdir of %s on subvolume %s complete", + local->loc.path, + priv->children[active_src]->name); + } + + afr_sh_entry_impunge_all (frame, this); + return 0; + } + + list_for_each_entry (entry, &entries->list, list) { + last_offset = entry->d_off; + entry_count++; + } + + gf_log (this->name, GF_LOG_DEBUG, + "readdir'ed %d entries from %s", + entry_count, priv->children[active_src]->name); + + sh->offset = last_offset; + local->call_count = entry_count; + + list_for_each_entry (entry, &entries->list, list) { + afr_sh_entry_impunge_entry (frame, this, entry->d_name); + } + + return 0; +} + + +int +afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this, + int active_src) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + STACK_WIND (frame, afr_sh_entry_impunge_readdir_cbk, + priv->children[active_src], + priv->children[active_src]->fops->readdir, + sh->healing_fd, sh->block_size, sh->offset); + + return 0; +} + + +int +afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int active_src = -1; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + sh->offset = 0; + + active_src = next_active_source (frame, this, sh->active_source); + sh->active_source = active_src; + + if (sh->op_failed) { + afr_sh_entry_finish (frame, this); + return 0; + } + + if (active_src == -1) { + /* completed creating missing files on all subvolumes */ + afr_sh_entry_expunge_all (frame, this); + return 0; + } + + gf_log (this->name, GF_LOG_DEBUG, + "impunging entries of %s on %s to other sinks", + local->loc.path, priv->children[active_src]->name); + + afr_sh_entry_impunge_subvol (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int child_index = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + child_index = (long) cookie; + + /* TODO: some of the open's might fail. + In that case, modify cleanup fn to send flush on those + fd's which are already open */ + + LOCK (&frame->lock); + { + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "opendir of %s failed on child %s (%s)", + local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + sh->op_failed = 1; + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if (sh->op_failed) { + afr_sh_entry_finish (frame, this); + return 0; + } + gf_log (this->name, GF_LOG_DEBUG, + "fd for %s opened, commencing sync", + local->loc.path); + + sh->active_source = -1; + afr_sh_entry_impunge_all (frame, this); + } + + return 0; +} + + +int +afr_sh_entry_open (call_frame_t *frame, xlator_t *this) +{ + int i = 0; + int call_count = 0; + + int source = -1; + int *sources = NULL; + + fd_t *fd = NULL; + + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + source = local->self_heal.source; + sources = local->self_heal.sources; + + sh->block_size = 131072; + sh->offset = 0; + + call_count = sh->active_sinks; + if (source != -1) + call_count++; + + local->call_count = call_count; + + fd = fd_create (local->loc.inode, frame->root->pid); + sh->healing_fd = fd; + + if (source != -1) { + gf_log (this->name, GF_LOG_DEBUG, + "opening directory %s on subvolume %s (source)", + local->loc.path, priv->children[source]->name); + + /* open source */ + STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk, + (void *) (long) source, + priv->children[source], + priv->children[source]->fops->opendir, + &local->loc, fd); + call_count--; + } + + /* open sinks */ + for (i = 0; i < priv->child_count; i++) { + if (sources[i] || !local->child_up[i]) + continue; + + gf_log (this->name, GF_LOG_DEBUG, + "opening directory %s on subvolume %s (sink)", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->opendir, + &local->loc, fd); + + if (!--call_count) + break; + } + + return 0; +} + + +int +afr_sh_entry_sync_prepare (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int active_sinks = 0; + int source = 0; + int i = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + source = sh->source; + + for (i = 0; i < priv->child_count; i++) { + if (sh->sources[i] == 0 && local->child_up[i] == 1) { + active_sinks++; + sh->success[i] = 1; + } + } + if (source != -1) + sh->success[source] = 1; + + if (active_sinks == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "no active sinks for self-heal on dir %s", + local->loc.path); + afr_sh_entry_finish (frame, this); + return 0; + } + if (source == -1 && active_sinks < 2) { + gf_log (this->name, GF_LOG_WARNING, + "cannot sync with 0 sources and 1 sink on dir %s", + local->loc.path); + afr_sh_entry_finish (frame, this); + return 0; + } + sh->active_sinks = active_sinks; + + if (source != -1) + gf_log (this->name, GF_LOG_DEBUG, + "syncing %s from subvolume %s to %d active sinks", + local->loc.path, priv->children[source]->name, + active_sinks); + else + gf_log (this->name, GF_LOG_DEBUG, + "no active sources for %s found. " + "merging all entries as a conservative decision", + local->loc.path); + + afr_sh_entry_open (frame, this); + + return 0; +} + + +int +afr_sh_entry_fix (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int source = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + afr_sh_build_pending_matrix (sh->pending_matrix, sh->xattr, + priv->child_count, AFR_ENTRY_PENDING); + + afr_sh_print_pending_matrix (sh->pending_matrix, this); + + + afr_sh_mark_sources (sh->pending_matrix, sh->sources, + priv->child_count); + + afr_sh_supress_errenous_children (sh->sources, sh->child_errno, + priv->child_count); + + source = afr_sh_select_source (sh->sources, priv->child_count); + sh->source = source; + + afr_sh_entry_sync_prepare (frame, this); + + return 0; +} + + + +int +afr_sh_entry_lookup_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, dict_t *xattr) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + int call_count = -1; + int child_index = (long) cookie; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + LOCK (&frame->lock); + { + if (op_ret != -1) { + sh->xattr[child_index] = dict_ref (xattr); + sh->buf[child_index] = *buf; + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + afr_sh_entry_fix (frame, this); + } + + return 0; +} + + + +int +afr_sh_entry_lookup (call_frame_t *frame, xlator_t *this) +{ + afr_self_heal_t * sh = NULL; + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + dict_t *xattr_req = NULL; + int ret = 0; + int call_count = 0; + int i = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + call_count = local->child_count; + + local->call_count = call_count; + + xattr_req = dict_new(); + if (xattr_req) + ret = dict_set_uint64 (xattr_req, AFR_ENTRY_PENDING, + priv->child_count * sizeof(int32_t)); + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, + afr_sh_entry_lookup_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + &local->loc, xattr_req); + if (!--call_count) + break; + } + } + + if (xattr_req) + dict_unref (xattr_req); + + return 0; +} + + + +int +afr_sh_entry_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + int child_index = (long) cookie; + + /* TODO: what if lock fails? */ + + local = frame->local; + sh = &local->self_heal; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + sh->op_failed = 1; + + gf_log (this->name, + (op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR), + "locking inode of %s on child %d failed: %s", + local->loc.path, child_index, + strerror (op_errno)); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "inode of %s on child %d locked", + local->loc.path, child_index); + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if (sh->op_failed == 1) { + afr_sh_entry_finish (frame, this); + return 0; + } + + afr_sh_entry_lookup (frame, this); + } + + return 0; +} + + +int +afr_sh_entry_lock (call_frame_t *frame, xlator_t *this) +{ + int i = 0; + int call_count = 0; + + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + afr_self_heal_t * sh = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = local->child_count; + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "locking %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_entry_lock_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->entrylk, + &local->loc, NULL, + ENTRYLK_LOCK_NB, ENTRYLK_WRLCK); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_self_heal_entry (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + if (local->need_entry_self_heal && priv->entry_self_heal) { + afr_sh_entry_lock (frame, this); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "proceeding to completion on %s", + local->loc.path); + afr_sh_entry_done (frame, this); + } + + return 0; +} + diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c new file mode 100644 index 000000000..e65a426db --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -0,0 +1,791 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" +#include "byte-order.h" + +#include "afr-transaction.h" +#include "afr-self-heal.h" +#include "afr-self-heal-common.h" + + +int +afr_sh_metadata_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + +// memset (sh->child_errno, 0, sizeof (int) * priv->child_count); + memset (sh->buf, 0, sizeof (struct stat) * priv->child_count); + memset (sh->success, 0, sizeof (int) * priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (sh->xattr[i]) + dict_unref (sh->xattr[i]); + sh->xattr[i] = NULL; + } + + if (local->govinda_gOvinda) { + gf_log (this->name, GF_LOG_WARNING, + "aborting selfheal of %s", + local->loc.path); + sh->completion_cbk (frame, this); + } else { + if (S_ISREG (local->cont.lookup.buf.st_mode)) { + gf_log (this->name, GF_LOG_DEBUG, + "proceeding to data check on %s", + local->loc.path); + afr_self_heal_data (frame, this); + return 0; + } + + if (S_ISDIR (local->cont.lookup.buf.st_mode)) { + gf_log (this->name, GF_LOG_DEBUG, + "proceeding to entry check on %s", + local->loc.path); + afr_self_heal_entry (frame, this); + return 0; + } + gf_log (this->name, GF_LOG_DEBUG, + "completed self heal of %s", + local->loc.path); + + sh->completion_cbk (frame, this); + } + + return 0; +} + + +int +afr_sh_metadata_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + int call_count = 0; + + + local = frame->local; + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_sh_metadata_done (frame, this); + + return 0; +} + + +int +afr_sh_metadata_finish (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + int call_count = 0; + struct flock flock = {0, }; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = local->child_count; + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + flock.l_start = 0; + flock.l_len = 0; + flock.l_type = F_UNLCK; + + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "unlocking %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND (frame, afr_sh_metadata_unlck_cbk, + priv->children[i], + priv->children[i]->fops->inodelk, + &local->loc, F_SETLK, &flock); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_sh_metadata_erase_pending_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xattr) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_sh_metadata_finish (frame, this); + + return 0; +} + + +int +afr_sh_metadata_erase_pending (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int i = 0; + dict_t **erase_xattr = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + + afr_sh_pending_to_delta (sh->pending_matrix, sh->delta_matrix, + sh->success, priv->child_count); + + erase_xattr = CALLOC (sizeof (*erase_xattr), priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (sh->xattr[i]) { + call_count++; + + erase_xattr[i] = get_new_dict(); + dict_ref (erase_xattr[i]); + } + } + + afr_sh_delta_to_xattr (sh->delta_matrix, erase_xattr, + priv->child_count, AFR_METADATA_PENDING); + + local->call_count = call_count; + + if (call_count == 0) { + gf_log (this->name, GF_LOG_WARNING, + "metadata of %s not healed on any subvolume", + local->loc.path); + + afr_sh_metadata_finish (frame, this); + } + + for (i = 0; i < priv->child_count; i++) { + if (!erase_xattr[i]) + continue; + + gf_log (this->name, GF_LOG_DEBUG, + "erasing pending flags from %s on %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_metadata_erase_pending_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->loc, + GF_XATTROP_ADD_ARRAY, erase_xattr[i]); + if (!--call_count) + break; + } + + for (i = 0; i < priv->child_count; i++) { + if (erase_xattr[i]) { + dict_unref (erase_xattr[i]); + } + } + FREE (erase_xattr); + + return 0; +} + + +int +afr_sh_metadata_sync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int child_index = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "setting attributes failed for %s on %s (%s)", + local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + + sh->success[child_index] = 0; + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_sh_metadata_erase_pending (frame, this); + + return 0; +} + + +int +afr_sh_metadata_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno); + + return 0; +} + + +int +afr_sh_metadata_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno); + + return 0; +} + + +int +afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int source = 0; + int active_sinks = 0; + int call_count = 0; + int i = 0; + struct timespec ts[2]; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + source = sh->source; + active_sinks = sh->active_sinks; + + /* + * 4 calls per sink - chown, chmod, utimes, setxattr + */ + if (xattr) + call_count = active_sinks * 4; + else + call_count = active_sinks * 3; + + local->call_count = call_count; + +#ifdef HAVE_STRUCT_STAT_ST_ATIM_TV_NSEC + ts[0] = sh->buf[source].st_atim; + ts[1] = sh->buf[source].st_mtim; +#elif HAVE_STRUCT_STAT_ST_ATIMESPEC_TV_NSEC + ts[0] = sh->buf[source].st_atimespec; + ts[1] = sh->buf[source].st_mtimespec; +#else + ts[0].tv_sec = sh->buf[source].st_atime; + ts[1].tv_sec = sh->buf[source].st_mtime; +#endif + + for (i = 0; i < priv->child_count; i++) { + if (call_count == 0) { + break; + } + if (sh->sources[i] || !local->child_up[i]) + continue; + + gf_log (this->name, GF_LOG_DEBUG, + "syncing metadata of %s from %s to %s", + local->loc.path, priv->children[source]->name, + priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->chown, + &local->loc, + sh->buf[source].st_uid, + sh->buf[source].st_gid); + + STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->chmod, + &local->loc, sh->buf[source].st_mode); + + STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->utimens, + &local->loc, ts); + + call_count = call_count - 3; + + if (!xattr) + continue; + + STACK_WIND_COOKIE (frame, afr_sh_metadata_xattr_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->setxattr, + &local->loc, xattr, 0); + call_count--; + } + + return 0; +} + + +int +afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int source = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + source = sh->source; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "getxattr of %s failed on subvolume %s (%s). proceeding without xattr", + local->loc.path, priv->children[source]->name, + strerror (op_errno)); + + afr_sh_metadata_sync (frame, this, NULL); + } else { + dict_del (xattr, AFR_DATA_PENDING); + dict_del (xattr, AFR_METADATA_PENDING); + dict_del (xattr, AFR_ENTRY_PENDING); + afr_sh_metadata_sync (frame, this, xattr); + } + + return 0; +} + + +int +afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int active_sinks = 0; + int source = 0; + int i = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + source = sh->source; + + for (i = 0; i < priv->child_count; i++) { + if (sh->sources[i] == 0 && local->child_up[i] == 1) { + active_sinks++; + sh->success[i] = 1; + } + } + sh->success[source] = 1; + + if (active_sinks == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "no active sinks for performing self-heal on file %s", + local->loc.path); + afr_sh_metadata_finish (frame, this); + return 0; + } + sh->active_sinks = active_sinks; + + gf_log (this->name, GF_LOG_DEBUG, + "syncing metadata of %s from subvolume %s to %d active sinks", + local->loc.path, priv->children[source]->name, active_sinks); + + STACK_WIND (frame, afr_sh_metadata_getxattr_cbk, + priv->children[source], + priv->children[source]->fops->getxattr, + &local->loc, NULL); + + return 0; +} + + +int +afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int nsources = 0; + int source = 0; + int i = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + afr_sh_build_pending_matrix (sh->pending_matrix, sh->xattr, + priv->child_count, AFR_METADATA_PENDING); + + afr_sh_print_pending_matrix (sh->pending_matrix, this); + + afr_sh_mark_sources (sh->pending_matrix, sh->sources, + priv->child_count); + + afr_sh_supress_errenous_children (sh->sources, sh->child_errno, + priv->child_count); + + nsources = afr_sh_source_count (sh->sources, priv->child_count); + + if ((nsources == 0) + && (priv->favorite_child != -1) + && (sh->child_errno[priv->favorite_child] == 0)) { + + gf_log (this->name, GF_LOG_WARNING, + "Picking favorite child %s as authentic source to resolve conflicting metadata of %s", + priv->children[priv->favorite_child]->name, + local->loc.path); + + sh->sources[priv->favorite_child] = 1; + + nsources = afr_sh_source_count (sh->sources, + priv->child_count); + } + + if (nsources == 0) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to resolve conflicting metadata of %s. " + "Please resolve manually by fixing the " + "permissions/ownership of %s on your subvolumes. " + "You can also consider 'option favorite-child <>'", + local->loc.path, local->loc.path); + + local->govinda_gOvinda = 1; + + afr_sh_metadata_finish (frame, this); + return 0; + } + + source = afr_sh_select_source (sh->sources, priv->child_count); + sh->source = source; + + /* detect changes not visible through pending flags -- JIC */ + for (i = 0; i < priv->child_count; i++) { + if (i == source || sh->child_errno[i]) + continue; + + if (PERMISSION_DIFFERS (&sh->buf[i], &sh->buf[source])) + sh->sources[i] = 0; + + if (OWNERSHIP_DIFFERS (&sh->buf[i], &sh->buf[source])) + sh->sources[i] = 0; + } + + afr_sh_metadata_sync_prepare (frame, this); + + return 0; +} + + +int +afr_sh_metadata_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, dict_t *xattr) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int child_index = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (op_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "path %s on subvolume %s is of mode 0%o", + local->loc.path, + priv->children[child_index]->name, + buf->st_mode); + + sh->buf[child_index] = *buf; + if (xattr) + sh->xattr[child_index] = dict_ref (xattr); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "path %s on subvolume %s => -1 (%s)", + local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + + sh->child_errno[child_index] = op_errno; + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_sh_metadata_fix (frame, this); + + return 0; +} + + +int +afr_sh_metadata_lookup (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + int call_count = 0; + dict_t *xattr_req = NULL; + int ret = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = local->child_count; + local->call_count = call_count; + + xattr_req = dict_new(); + + if (xattr_req) + ret = dict_set_uint64 (xattr_req, AFR_METADATA_PENDING, + priv->child_count * sizeof(int32_t)); + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "looking up %s on %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_metadata_lookup_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + &local->loc, xattr_req); + if (!--call_count) + break; + } + } + + if (xattr_req) + dict_unref (xattr_req); + + return 0; +} + + +int +afr_sh_metadata_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int child_index = (long) cookie; + + /* TODO: what if lock fails? */ + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + sh->op_failed = 1; + + gf_log (this->name, + (op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR), + "locking of %s on child %d failed: %s", + local->loc.path, child_index, + strerror (op_errno)); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "inode of %s on child %d locked", + local->loc.path, child_index); + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if (sh->op_failed) { + afr_sh_metadata_finish (frame, this); + return 0; + } + + afr_sh_metadata_lookup (frame, this); + } + + return 0; +} + + +int +afr_sh_metadata_lock (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + int call_count = 0; + struct flock flock = {0, }; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = local->child_count; + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + flock.l_start = 0; + flock.l_len = 0; + flock.l_type = F_WRLCK; + + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "locking %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_metadata_lk_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->inodelk, + &local->loc, F_SETLK, &flock); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_self_heal_metadata (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = this->private; + + + local = frame->local; + sh = &local->self_heal; + + if (local->need_metadata_self_heal && priv->metadata_self_heal) { + afr_sh_metadata_lock (frame, this); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "proceeding to data check on %s", + local->loc.path); + afr_sh_metadata_done (frame, this); + } + + return 0; +} + diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h new file mode 100644 index 000000000..1c97a9bc1 --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -0,0 +1,52 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __AFR_SELF_HEAL_H__ +#define __AFR_SELF_HEAL_H__ + +#include <sys/stat.h> + +#define FILETYPE_DIFFERS(buf1,buf2) ((S_IFMT & ((struct stat *)buf1)->st_mode) != (S_IFMT & ((struct stat *)buf2)->st_mode)) +#define PERMISSION_DIFFERS(buf1,buf2) ((((struct stat *)buf1)->st_mode) != (((struct stat *)buf2)->st_mode)) +#define OWNERSHIP_DIFFERS(buf1,buf2) ((((struct stat *)buf1)->st_uid) != (((struct stat *)buf2)->st_uid) || (((struct stat *)buf1)->st_gid != (((struct stat *)buf2)->st_gid))) +#define SIZE_DIFFERS(buf1,buf2) ((((struct stat *)buf1)->st_size) != (((struct stat *)buf2)->st_size)) + + + +int +afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this); +int +afr_sh_has_entry_pending (dict_t *xattr, int child_count, xlator_t *this); +int +afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this); + +int +afr_self_heal_entry (call_frame_t *frame, xlator_t *this); + +int +afr_self_heal_data (call_frame_t *frame, xlator_t *this); + +int +afr_self_heal_metadata (call_frame_t *frame, xlator_t *this); + +int +afr_self_heal (call_frame_t *frame, xlator_t *this, + int (*completion_cbk) (call_frame_t *, xlator_t *)); + +#endif /* __AFR_SELF_HEAL_H__ */ diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c new file mode 100644 index 000000000..3df9f07e5 --- /dev/null +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -0,0 +1,957 @@ +/* + Copyright (c) 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#include "dict.h" +#include "byte-order.h" + +#include "afr.h" +#include "afr-transaction.h" + +#include <signal.h> + + +static void +__mark_all_pending (int32_t *pending, int child_count) +{ + int i; + + for (i = 0; i < child_count; i++) + pending[i] = hton32 (1); +} + + +static void +__mark_child_dead (int32_t *pending, int child_count, int child) +{ + pending[child] = 0; +} + + +static void +__mark_down_children (int32_t *pending, int child_count, unsigned char *child_up) +{ + int i; + + for (i = 0; i < child_count; i++) + if (!child_up[i]) + pending[i] = 0; +} + + +static void +__mark_all_success (int32_t *pending, int child_count) +{ + int i; + + for (i = 0; i < child_count; i++) + pending[i] = hton32 (-1); +} + + +static int +__is_first_write_on_fd (xlator_t *this, fd_t *fd) +{ + int op_ret = 0; + int _ret = -1; + + _ret = fd_ctx_get (fd, this, NULL); + if (_ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "first writev() on fd=%p, writing changelog", + fd); + + _ret = fd_ctx_set (fd, this, 0xaf1); + op_ret = 1; + } + + return op_ret; +} + + +static int +__changelog_enabled (afr_private_t *priv, afr_transaction_type type) +{ + int ret = 0; + + switch (type) { + case AFR_DATA_TRANSACTION: + if (priv->data_change_log) + ret = 1; + + break; + + case AFR_METADATA_TRANSACTION: + if (priv->metadata_change_log) + ret = 1; + + break; + + case AFR_ENTRY_TRANSACTION: + case AFR_ENTRY_RENAME_TRANSACTION: + if (priv->entry_change_log) + ret = 1; + + break; + + case AFR_FLUSH_TRANSACTION: + ret = 1; + } + + return ret; +} + + +static int +__changelog_needed_pre_op (call_frame_t *frame, xlator_t *this) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + fd_t * fd = NULL; + + int op_ret = 0; + + priv = this->private; + local = frame->local; + + if (__changelog_enabled (priv, local->transaction.type)) { + switch (local->op) { + + case GF_FOP_WRITE: + case GF_FOP_FTRUNCATE: + /* + if it's a data transaction, we write the changelog + only on the first write on an fd + */ + + fd = local->fd; + if (!fd || __is_first_write_on_fd (this, fd)) + op_ret = 1; + + break; + + case GF_FOP_FLUSH: + /* only do post-op on flush() */ + + op_ret = 0; + break; + + default: + op_ret = 1; + } + } + + return op_ret; +} + + +static int +__changelog_needed_post_op (call_frame_t *frame, xlator_t *this) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + + int ret = 0; + afr_transaction_type type = -1; + + priv = this->private; + local = frame->local; + type = local->transaction.type; + + if (__changelog_enabled (priv, type) + && (local->op != GF_FOP_WRITE) + && (local->op != GF_FOP_FTRUNCATE)) + ret = 1; + + return ret; +} + + +static int +afr_lock_server_count (afr_private_t *priv, afr_transaction_type type) +{ + int ret = 0; + + switch (type) { + case AFR_FLUSH_TRANSACTION: + case AFR_DATA_TRANSACTION: + ret = priv->data_lock_server_count; + break; + + case AFR_METADATA_TRANSACTION: + ret = priv->metadata_lock_server_count; + break; + + case AFR_ENTRY_TRANSACTION: + case AFR_ENTRY_RENAME_TRANSACTION: + ret = priv->entry_lock_server_count; + break; + } + + return ret; +} + + +/* {{{ unlock */ + +int32_t +afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local; + int call_count = 0; + + local = frame->local; + + LOCK (&frame->lock); + { + call_count = --local->call_count; + } + UNLOCK (&frame->lock); + + if (call_count == 0) { + local->transaction.done (frame, this); + } + + return 0; +} + + +int +afr_unlock (call_frame_t *frame, xlator_t *this) +{ + struct flock flock; + + int i = 0; + int call_count = 0; + + afr_local_t *local = NULL; + afr_private_t * priv = this->private; + + local = frame->local; + + call_count = afr_locked_nodes_count (local->transaction.locked_nodes, + priv->child_count); + + if (call_count == 0) { + local->transaction.done (frame, this); + return 0; + } + + if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) + call_count *= 2; + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + flock.l_start = local->transaction.start; + flock.l_len = local->transaction.len; + flock.l_type = F_UNLCK; + + if (local->transaction.locked_nodes[i]) { + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + case AFR_FLUSH_TRANSACTION: + + if (local->fd) { + STACK_WIND (frame, afr_unlock_common_cbk, + priv->children[i], + priv->children[i]->fops->finodelk, + local->fd, F_SETLK, &flock); + } else { + STACK_WIND (frame, afr_unlock_common_cbk, + priv->children[i], + priv->children[i]->fops->inodelk, + &local->loc, F_SETLK, &flock); + } + + break; + + case AFR_ENTRY_RENAME_TRANSACTION: + + STACK_WIND (frame, afr_unlock_common_cbk, + priv->children[i], + priv->children[i]->fops->entrylk, + &local->transaction.new_parent_loc, + local->transaction.new_basename, + ENTRYLK_UNLOCK, ENTRYLK_WRLCK); + + call_count--; + + /* fall through */ + + case AFR_ENTRY_TRANSACTION: + if (local->fd) { + STACK_WIND (frame, afr_unlock_common_cbk, + priv->children[i], + priv->children[i]->fops->fentrylk, + local->fd, + local->transaction.basename, + ENTRYLK_UNLOCK, ENTRYLK_WRLCK); + } else { + STACK_WIND (frame, afr_unlock_common_cbk, + priv->children[i], + priv->children[i]->fops->entrylk, + &local->transaction.parent_loc, + local->transaction.basename, + ENTRYLK_UNLOCK, ENTRYLK_WRLCK); + + } + break; + } + + if (!--call_count) + break; + } + } + + return 0; +} + +/* }}} */ + + +/* {{{ pending */ + +int32_t +afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + + int call_count = -1; + + priv = this->private; + local = frame->local; + + LOCK (&frame->lock); + { + call_count = --local->call_count; + } + UNLOCK (&frame->lock); + + if (call_count == 0) { + if (afr_lock_server_count (priv, local->transaction.type) == 0) { + local->transaction.done (frame, this); + } else { + afr_unlock (frame, this); + } + } + + return 0; +} + + +int +afr_changelog_post_op (call_frame_t *frame, xlator_t *this) +{ + afr_private_t * priv = this->private; + + int ret = 0; + int i = 0; + int call_count = 0; + + afr_local_t * local = NULL; + dict_t * xattr = dict_ref (get_new_dict ()); + + local = frame->local; + + __mark_all_success (local->pending_array, priv->child_count); + __mark_down_children (local->pending_array, priv->child_count, local->child_up); + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { + call_count *= 2; + } + + local->call_count = call_count; + + if (call_count == 0) { + /* no child is up */ + dict_unref (xattr); + afr_unlock (frame, this); + return 0; + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + ret = dict_set_static_bin (xattr, local->transaction.pending, + local->pending_array, + priv->child_count * sizeof (int32_t)); + if (ret < 0) + gf_log (this->name, GF_LOG_ERROR, + "failed to set pending entry"); + + + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + case AFR_FLUSH_TRANSACTION: + { + if (local->fd) + STACK_WIND (frame, afr_changelog_post_op_cbk, + priv->children[i], + priv->children[i]->fops->fxattrop, + local->fd, + GF_XATTROP_ADD_ARRAY, xattr); + else + STACK_WIND (frame, afr_changelog_post_op_cbk, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->loc, + GF_XATTROP_ADD_ARRAY, xattr); + } + break; + + case AFR_ENTRY_RENAME_TRANSACTION: + { + STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->transaction.new_parent_loc, + GF_XATTROP_ADD_ARRAY, xattr); + + call_count--; + } + + /* + set it again because previous stack_wind + might have already returned (think of case + where subvolume is posix) and would have + used the dict as placeholder for return + value + */ + ret = dict_set_static_bin (xattr, local->transaction.pending, + local->pending_array, + priv->child_count * sizeof (int32_t)); + if (ret < 0) + gf_log (this->name, GF_LOG_ERROR, + "failed to set pending entry"); + + /* fall through */ + + case AFR_ENTRY_TRANSACTION: + { + if (local->fd) + STACK_WIND (frame, afr_changelog_post_op_cbk, + priv->children[i], + priv->children[i]->fops->fxattrop, + local->fd, + GF_XATTROP_ADD_ARRAY, xattr); + else + STACK_WIND (frame, afr_changelog_post_op_cbk, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->transaction.parent_loc, + GF_XATTROP_ADD_ARRAY, xattr); + } + break; + } + + if (!--call_count) + break; + } + } + + dict_unref (xattr); + return 0; +} + + +int32_t +afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr) +{ + afr_local_t * local = NULL; + afr_private_t * priv = this->private; + loc_t * loc = NULL; + + int call_count = -1; + int child_index = (long) cookie; + + local = frame->local; + loc = &local->loc; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->child_up[child_index] = 0; + + if (op_errno == ENOTSUP) { + gf_log (this->name, GF_LOG_ERROR, + "xattrop not supported by %s", + priv->children[child_index]->name); + local->op_ret = -1; + } else if (!child_went_down (op_ret, op_errno)) { + gf_log (this->name, GF_LOG_ERROR, + "xattrop failed on child %s: %s", + priv->children[child_index]->name, + strerror (op_errno)); + } + local->op_errno = op_errno; + } + + call_count = --local->call_count; + } + UNLOCK (&frame->lock); + + if (call_count == 0) { + if ((local->op_ret == -1) && + (local->op_errno == ENOTSUP)) { + local->transaction.resume (frame, this); + } else { + local->transaction.fop (frame, this); + } + } + + return 0; +} + + +int +afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) +{ + afr_private_t * priv = this->private; + + int i = 0; + int ret = 0; + int call_count = 0; + dict_t *xattr = NULL; + + afr_local_t *local = NULL; + + local = frame->local; + xattr = get_new_dict (); + dict_ref (xattr); + + call_count = afr_up_children_count (priv->child_count, + local->child_up); + + if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { + call_count *= 2; + } + + if (call_count == 0) { + /* no child is up */ + dict_unref (xattr); + afr_unlock (frame, this); + return 0; + } + + local->call_count = call_count; + + __mark_all_pending (local->pending_array, priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + ret = dict_set_static_bin (xattr, + local->transaction.pending, + local->pending_array, + (priv->child_count * + sizeof (int32_t))); + if (ret < 0) + gf_log (this->name, GF_LOG_ERROR, + "failed to set pending entry"); + + + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + case AFR_FLUSH_TRANSACTION: + { + if (local->fd) + STACK_WIND_COOKIE (frame, + afr_changelog_pre_op_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fxattrop, + local->fd, + GF_XATTROP_ADD_ARRAY, xattr); + else + STACK_WIND_COOKIE (frame, + afr_changelog_pre_op_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &(local->loc), + GF_XATTROP_ADD_ARRAY, xattr); + } + break; + + case AFR_ENTRY_RENAME_TRANSACTION: + { + STACK_WIND_COOKIE (frame, + afr_changelog_pre_op_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->transaction.new_parent_loc, + GF_XATTROP_ADD_ARRAY, xattr); + + call_count--; + } + + + /* + set it again because previous stack_wind + might have already returned (think of case + where subvolume is posix) and would have + used the dict as placeholder for return + value + */ + + ret = dict_set_static_bin (xattr, local->transaction.pending, + local->pending_array, + priv->child_count * sizeof (int32_t)); + if (ret < 0) + gf_log (this->name, GF_LOG_ERROR, + "failed to set pending entry"); + + /* fall through */ + + case AFR_ENTRY_TRANSACTION: + { + if (local->fd) + STACK_WIND_COOKIE (frame, + afr_changelog_pre_op_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fxattrop, + local->fd, + GF_XATTROP_ADD_ARRAY, xattr); + else + STACK_WIND_COOKIE (frame, + afr_changelog_pre_op_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->transaction.parent_loc, + GF_XATTROP_ADD_ARRAY, xattr); + } + + break; + } + + if (!--call_count) + break; + } + } + + dict_unref (xattr); + return 0; +} + +/* }}} */ + +/* {{{ lock */ + +static +int afr_lock_rec (call_frame_t *frame, xlator_t *this, int child_index); + +int32_t +afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + int done = 0; + int child_index = (long) cookie; + + int call_count = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { + /* wait for the other lock to return */ + call_count = --local->call_count; + } + + if (op_ret == -1) { + if (op_errno == ENOSYS) { + /* return ENOTSUP */ + gf_log (this->name, GF_LOG_ERROR, + "subvolume does not support locking. " + "please load features/posix-locks xlator on server"); + local->op_ret = op_ret; + done = 1; + } + + local->child_up[child_index] = 0; + local->op_errno = op_errno; + } + } + UNLOCK (&frame->lock); + + if (call_count == 0) { + if ((local->op_ret == -1) && + (local->op_errno == ENOSYS)) { + afr_unlock (frame, this); + } else { + local->transaction.locked_nodes[child_index] = 1; + local->transaction.lock_count++; + afr_lock_rec (frame, this, child_index + 1); + } + } + + return 0; +} + + +static loc_t * +lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2) +{ + int ret = 0; + + ret = strcmp (l1->path, l2->path); + + if (ret == 0) + ret = strcmp (b1, b2); + + if (ret <= 0) + return l1; + else + return l2; +} + + +static +int afr_lock_rec (call_frame_t *frame, xlator_t *this, int child_index) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + struct flock flock; + + loc_t * lower = NULL; + loc_t * higher = NULL; + + const char *lower_name = NULL; + const char *higher_name = NULL; + + local = frame->local; + priv = this->private; + + flock.l_start = local->transaction.start; + flock.l_len = local->transaction.len; + flock.l_type = F_WRLCK; + + /* skip over children that are down */ + while ((child_index < priv->child_count) + && !local->child_up[child_index]) + child_index++; + + if ((child_index == priv->child_count) && + local->transaction.lock_count == 0) { + + gf_log (this->name, GF_LOG_DEBUG, + "unable to lock on even one child"); + + local->op_ret = -1; + local->op_errno = EAGAIN; + + local->transaction.done (frame, this); + + return 0; + + } + + if ((child_index == priv->child_count) + || (local->transaction.lock_count == + afr_lock_server_count (priv, local->transaction.type))) { + + /* we're done locking */ + + if (__changelog_needed_pre_op (frame, this)) { + afr_changelog_pre_op (frame, this); + } else { + local->transaction.fop (frame, this); + } + + return 0; + } + + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + case AFR_FLUSH_TRANSACTION: + + if (local->fd) { + STACK_WIND_COOKIE (frame, afr_lock_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->finodelk, + local->fd, F_SETLKW, &flock); + + } else { + STACK_WIND_COOKIE (frame, afr_lock_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->inodelk, + &local->loc, F_SETLKW, &flock); + } + + break; + + case AFR_ENTRY_RENAME_TRANSACTION: + { + local->call_count = 2; + + lower = lower_path (&local->transaction.parent_loc, + local->transaction.basename, + &local->transaction.new_parent_loc, + local->transaction.new_basename); + + lower_name = (lower == &local->transaction.parent_loc ? + local->transaction.basename : + local->transaction.new_basename); + + higher = (lower == &local->transaction.parent_loc ? + &local->transaction.new_parent_loc : + &local->transaction.parent_loc); + + higher_name = (higher == &local->transaction.parent_loc ? + local->transaction.basename : + local->transaction.new_basename); + + + /* TODO: these locks should be blocking */ + + STACK_WIND_COOKIE (frame, afr_lock_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->entrylk, + lower, lower_name, + ENTRYLK_LOCK, ENTRYLK_WRLCK); + + STACK_WIND_COOKIE (frame, afr_lock_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->entrylk, + higher, higher_name, + ENTRYLK_LOCK, ENTRYLK_WRLCK); + + break; + } + + case AFR_ENTRY_TRANSACTION: + if (local->fd) { + STACK_WIND_COOKIE (frame, afr_lock_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->fentrylk, + local->fd, + local->transaction.basename, + ENTRYLK_LOCK, ENTRYLK_WRLCK); + } else { + STACK_WIND_COOKIE (frame, afr_lock_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->entrylk, + &local->transaction.parent_loc, + local->transaction.basename, + ENTRYLK_LOCK, ENTRYLK_WRLCK); + } + + break; + } + + return 0; +} + + +int32_t afr_lock (call_frame_t *frame, xlator_t *this) +{ + return afr_lock_rec (frame, this, 0); +} + + +/* }}} */ + +int32_t +afr_transaction_resume (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + local = frame->local; + priv = this->private; + + if (__changelog_needed_post_op (frame, this)) { + afr_changelog_post_op (frame, this); + } else { + if (afr_lock_server_count (priv, local->transaction.type) == 0) { + local->transaction.done (frame, this); + } else { + afr_unlock (frame, this); + } + } + + return 0; +} + + +/** + * afr_transaction_child_died - inform that a child died during an fop + */ + +void +afr_transaction_child_died (call_frame_t *frame, xlator_t *this, int child_index) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + local = frame->local; + priv = this->private; + + __mark_child_dead (local->pending_array, priv->child_count, child_index); +} + + +int32_t +afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + local = frame->local; + priv = this->private; + + afr_transaction_local_init (local, priv); + + local->transaction.resume = afr_transaction_resume; + local->transaction.type = type; + + if (afr_lock_server_count (priv, local->transaction.type) == 0) { + if (__changelog_needed_pre_op (frame, this)) { + afr_changelog_pre_op (frame, this); + } else { + local->transaction.fop (frame, this); + } + } else { + afr_lock (frame, this); + } + + return 0; +} diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h new file mode 100644 index 000000000..49cdd219f --- /dev/null +++ b/xlators/cluster/afr/src/afr-transaction.h @@ -0,0 +1,36 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __TRANSACTION_H__ +#define __TRANSACTION_H__ + +#define AFR_METADATA_PENDING "trusted.glusterfs.afr.metadata-pending" + +#define AFR_DATA_PENDING "trusted.glusterfs.afr.data-pending" + +#define AFR_ENTRY_PENDING "trusted.glusterfs.afr.entry-pending" + +void +afr_transaction_child_died (call_frame_t *frame, xlator_t *this, + int child_index); + +int32_t +afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type); + +#endif /* __TRANSACTION_H__ */ diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c new file mode 100644 index 000000000..e4c1a8479 --- /dev/null +++ b/xlators/cluster/afr/src/afr.c @@ -0,0 +1,2338 @@ +/* + Copyright (c) 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" +#include "byte-order.h" + +#include "afr-inode-read.h" +#include "afr-inode-write.h" +#include "afr-dir-read.h" +#include "afr-dir-write.h" +#include "afr-transaction.h" + +#include "afr-self-heal.h" + + +/** + * afr_local_cleanup - cleanup everything in frame->local + */ + +void +afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) +{ + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + + + sh = &local->self_heal; + priv = this->private; + + if (sh->buf) + FREE (sh->buf); + + if (sh->xattr) { + for (i = 0; i < priv->child_count; i++) { + if (sh->xattr[i]) { + dict_unref (sh->xattr[i]); + sh->xattr[i] = NULL; + } + } + FREE (sh->xattr); + } + + if (sh->child_errno) + FREE (sh->child_errno); + + if (sh->pending_matrix) { + for (i = 0; i < priv->child_count; i++) { + FREE (sh->pending_matrix[i]); + } + FREE (sh->pending_matrix); + } + + if (sh->delta_matrix) { + for (i = 0; i < priv->child_count; i++) { + FREE (sh->delta_matrix[i]); + } + FREE (sh->delta_matrix); + } + + if (sh->sources) + FREE (sh->sources); + + if (sh->success) + FREE (sh->success); + + if (sh->healing_fd) { + fd_unref (sh->healing_fd); + sh->healing_fd = NULL; + } + + loc_wipe (&sh->parent_loc); +} + + +void +afr_local_cleanup (afr_local_t *local, xlator_t *this) +{ + if (!local) + return; + + afr_local_sh_cleanup (local, this); + + FREE (local->child_errno); + FREE (local->pending_array); + + loc_wipe (&local->loc); + loc_wipe (&local->newloc); + + FREE (local->transaction.locked_nodes); + FREE (local->transaction.child_errno); + + FREE (local->transaction.basename); + FREE (local->transaction.new_basename); + + loc_wipe (&local->transaction.parent_loc); + loc_wipe (&local->transaction.new_parent_loc); + + if (local->fd) + fd_unref (local->fd); + + if (local->xattr_req) + dict_unref (local->xattr_req); + + FREE (local->child_up); + + { /* lookup */ + if (local->cont.lookup.xattr) + dict_unref (local->cont.lookup.xattr); + } + + { /* getxattr */ + if (local->cont.getxattr.name) + FREE (local->cont.getxattr.name); + } + + { /* lk */ + if (local->cont.lk.locked_nodes) + FREE (local->cont.lk.locked_nodes); + } + + { /* checksum */ + if (local->cont.checksum.file_checksum) + FREE (local->cont.checksum.file_checksum); + if (local->cont.checksum.dir_checksum) + FREE (local->cont.checksum.dir_checksum); + } + + { /* create */ + if (local->cont.create.fd) + fd_unref (local->cont.create.fd); + } + + { /* writev */ + FREE (local->cont.writev.vector); + } + + { /* setxattr */ + if (local->cont.setxattr.dict) + dict_unref (local->cont.setxattr.dict); + } + + { /* removexattr */ + FREE (local->cont.removexattr.name); + } + + { /* symlink */ + FREE (local->cont.symlink.linkpath); + } +} + + +int +afr_frame_return (call_frame_t *frame) +{ + afr_local_t *local = NULL; + int call_count = 0; + + local = frame->local; + + LOCK (&frame->lock); + { + call_count = --local->call_count; + } + UNLOCK (&frame->lock); + + return call_count; +} + +/** + * first_up_child - return the index of the first child that is up + */ + +int +afr_first_up_child (afr_private_t *priv) +{ + xlator_t ** children = NULL; + int ret = -1; + int i = 0; + + LOCK (&priv->lock); + { + children = priv->children; + for (i = 0; i < priv->child_count; i++) { + if (priv->child_up[i]) { + ret = i; + break; + } + } + } + UNLOCK (&priv->lock); + + return ret; +} + + +/** + * up_children_count - return the number of children that are up + */ + +int +afr_up_children_count (int child_count, unsigned char *child_up) +{ + int i = 0; + int ret = 0; + + for (i = 0; i < child_count; i++) + if (child_up[i]) + ret++; + return ret; +} + + +int +afr_locked_nodes_count (unsigned char *locked_nodes, int child_count) +{ + int ret = 0; + int i; + + for (i = 0; i < child_count; i++) + if (locked_nodes[i]) + ret++; + + return ret; +} + + +ino64_t +afr_itransform (ino64_t ino, int child_count, int child_index) +{ + ino64_t scaled_ino = -1; + + if (ino == ((uint64_t) -1)) { + scaled_ino = ((uint64_t) -1); + goto out; + } + + scaled_ino = (ino * child_count) + child_index; + +out: + return scaled_ino; +} + + +int +afr_deitransform_orig (ino64_t ino, int child_count) +{ + int index = -1; + + index = ino % child_count; + + return index; +} + + +int +afr_deitransform (ino64_t ino, int child_count) +{ + return 0; +} + + +int +afr_self_heal_cbk (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + int ret = -1; + + local = frame->local; + + if (local->govinda_gOvinda) { + ret = inode_ctx_put (local->cont.lookup.inode, this, 1); + + if (ret < 0) { + local->op_ret = -1; + local->op_errno = -ret; + } + } else { + inode_ctx_del (local->cont.lookup.inode, this, NULL); + } + + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->cont.lookup.inode, + &local->cont.lookup.buf, + local->cont.lookup.xattr); + + return 0; +} + + +int +afr_lookup_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, dict_t *xattr) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + struct stat * lookup_buf = NULL; + int call_count = -1; + int child_index = -1; + int prev_child_index = -1; + uint32_t open_fd_count = 0; + int ret = 0; + + child_index = (long) cookie; + priv = this->private; + + LOCK (&frame->lock); + { + local = frame->local; + + lookup_buf = &local->cont.lookup.buf; + + if (op_ret == -1) { + if (op_errno == ENOENT) + local->enoent_count++; + + if (op_errno != ENOTCONN) + local->op_errno = op_errno; + + goto unlock; + } + + if (afr_sh_has_metadata_pending (xattr, child_index, this)) + local->need_metadata_self_heal = 1; + + if (afr_sh_has_entry_pending (xattr, child_index, this)) + local->need_entry_self_heal = 1; + + if (afr_sh_has_data_pending (xattr, child_index, this)) + local->need_data_self_heal = 1; + + ret = dict_get_uint32 (xattr, GLUSTERFS_OPEN_FD_COUNT, + &open_fd_count); + local->open_fd_count += open_fd_count; + + /* in case of revalidate, we need to send stat of the + * child whose stat was sent during the first lookup. + * (so that time stamp does not vary with revalidate. + * in case it is down, stat of the fist success will + * be replied */ + + /* inode number should be preserved across revalidates */ + + if (local->success_count == 0) { + local->op_ret = op_ret; + + local->cont.lookup.inode = inode; + local->cont.lookup.xattr = dict_ref (xattr); + + *lookup_buf = *buf; + lookup_buf->st_ino = afr_itransform (buf->st_ino, + priv->child_count, + child_index); + } else { + if (FILETYPE_DIFFERS (buf, lookup_buf)) { + /* mismatching filetypes with same name + -- Govinda !! GOvinda !!! + */ + local->govinda_gOvinda = 1; + } + + if (PERMISSION_DIFFERS (buf, lookup_buf)) { + /* mismatching permissions */ + local->need_metadata_self_heal = 1; + } + + if (OWNERSHIP_DIFFERS (buf, lookup_buf)) { + /* mismatching permissions */ + local->need_metadata_self_heal = 1; + } + + if (SIZE_DIFFERS (buf, lookup_buf) + && S_ISREG (buf->st_mode)) { + local->need_data_self_heal = 1; + } + + prev_child_index = afr_deitransform_orig (lookup_buf->st_ino, + priv->child_count); + if (child_index < prev_child_index) { + *lookup_buf = *buf; + lookup_buf->st_ino = afr_itransform (buf->st_ino, + priv->child_count, + child_index); + } + } + + local->success_count++; + } +unlock: + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if (local->op_ret == 0) { + /* KLUDGE: assuming DHT will not itransform in + revalidate */ + if (local->cont.lookup.inode->ino) + lookup_buf->st_ino = + local->cont.lookup.inode->ino; + } + + if (local->success_count && local->enoent_count) { + local->need_metadata_self_heal = 1; + local->need_data_self_heal = 1; + local->need_entry_self_heal = 1; + } + + if (local->success_count) { + /* check for govinda_gOvinda case in previous lookup */ + if (!inode_ctx_get (local->cont.lookup.inode, + this, NULL)) + local->need_data_self_heal = 1; + } + + if ((local->need_metadata_self_heal + || local->need_data_self_heal + || local->need_entry_self_heal) + && (!local->open_fd_count)) { + + if (!local->cont.lookup.inode->st_mode) { + /* fix for RT #602 */ + local->cont.lookup.inode->st_mode = + lookup_buf->st_mode; + } + + afr_self_heal (frame, this, afr_self_heal_cbk); + } else { + AFR_STACK_UNWIND (frame, local->op_ret, + local->op_errno, + local->cont.lookup.inode, + &local->cont.lookup.buf, + local->cont.lookup.xattr); + } + } + + return 0; +} + + +int +afr_lookup (call_frame_t *frame, xlator_t *this, + loc_t *loc, dict_t *xattr_req) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int ret = -1; + int i = 0; + int32_t op_errno = 0; + + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + local->op_ret = -1; + + frame->local = local; + + loc_copy (&local->loc, loc); + + local->reval_child_index = 0; + + local->call_count = priv->child_count; + + local->child_up = memdup (priv->child_up, priv->child_count); + local->child_count = afr_up_children_count (priv->child_count, + local->child_up); + + /* By default assume ENOTCONN. On success it will be set to 0. */ + local->op_errno = ENOTCONN; + + if ((xattr_req == NULL) + && (priv->metadata_self_heal + || priv->data_self_heal + || priv->entry_self_heal)) + local->xattr_req = dict_new (); + else + local->xattr_req = dict_ref (xattr_req); + + if (priv->metadata_self_heal) { + ret = dict_set_uint64 (local->xattr_req, AFR_METADATA_PENDING, + priv->child_count * sizeof(int32_t)); + } + + if (priv->data_self_heal) { + ret = dict_set_uint64 (local->xattr_req, AFR_DATA_PENDING, + priv->child_count * sizeof(int32_t)); + } + + if (priv->entry_self_heal) { + ret = dict_set_uint64 (local->xattr_req, AFR_ENTRY_PENDING, + priv->child_count * sizeof(int32_t)); + } + + ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_OPEN_FD_COUNT, 0); + + for (i = 0; i < priv->child_count; i++) { + STACK_WIND_COOKIE (frame, afr_lookup_cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + loc, local->xattr_req); + } + + ret = 0; +out: + if (ret == -1) + AFR_STACK_UNWIND (frame, -1, ENOMEM, NULL, NULL, NULL); + + return 0; +} + + +/* {{{ open */ + +int +afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = frame->local; + + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->fd); + return 0; +} + + +int +afr_open_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + fd_t *fd) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + + priv = this->private; + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + } + + if (op_ret >= 0) { + local->op_ret = op_ret; + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if ((local->cont.open.flags & O_TRUNC) + && (local->op_ret >= 0)) { + STACK_WIND (frame, afr_open_ftruncate_cbk, + this, this->fops->ftruncate, + fd, 0); + } else { + AFR_STACK_UNWIND (frame, local->op_ret, + local->op_errno, local->fd); + } + } + + return 0; +} + + +int +afr_open (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, fd_t *fd) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + + int i = 0; + int ret = -1; + + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + int32_t wind_flags = flags & (~O_TRUNC); + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (loc, out); + + priv = this->private; + + ret = inode_ctx_get (loc->inode, this, NULL); + if (ret == 0) { + /* if ctx is set it means self-heal failed */ + + gf_log (this->name, GF_LOG_WARNING, + "returning EIO, file has to be manually corrected " + "in backend"); + op_errno = EIO; + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + frame->local = local; + call_count = local->call_count; + + local->cont.open.flags = flags; + local->fd = fd_ref (fd); + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->open, + loc, wind_flags, fd); + + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, fd); + } + + return 0; +} + +/* }}} */ + +/* {{{ flush */ + +int +afr_flush_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t * local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_flush_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int i = 0; + int call_count = -1; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_flush_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->flush, + local->fd); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_flush_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int +afr_simple_flush_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +static int +__is_fd_ctx_set (xlator_t *this, fd_t *fd) +{ + int _ret = 0; + int op_ret = 0; + + _ret = fd_ctx_get (fd, this, NULL); + if (_ret == 0) + op_ret = 1; + + return op_ret; +} + + +int +afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + + int ret = -1; + int i = 0; + int call_count = 0; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + frame->local = local; + + if (__is_fd_ctx_set (this, fd)) { + local->op = GF_FOP_FLUSH; + local->transaction.fop = afr_flush_wind; + local->transaction.done = afr_flush_done; + + local->fd = fd_ref (fd); + + local->transaction.start = 0; + local->transaction.len = 0; + + local->transaction.pending = AFR_DATA_PENDING; + + afr_transaction (frame, this, AFR_FLUSH_TRANSACTION); + } else { + /* + * if fd's ctx is not set, then there is no need + * to erase changelog. So just send the flush + */ + + call_count = local->call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_simple_flush_cbk, + priv->children[i], + priv->children[i]->fops->flush, + fd); + + if (!--call_count) + break; + } + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ fsync */ + +int +afr_fsync_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int +afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t datasync) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_fsync_cbk, + priv->children[i], + priv->children[i]->fops->fsync, + fd, datasync); + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + +/* }}} */ + +/* {{{ fsync */ + +int32_t +afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int32_t +afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t datasync) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_fsync_cbk, + priv->children[i], + priv->children[i]->fops->fsyncdir, + fd, datasync); + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + +/* }}} */ + +/* {{{ xattrop */ + +int32_t +afr_xattrop_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *xattr) +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, xattr); + + return 0; +} + + +int32_t +afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t optype, dict_t *xattr) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_xattrop_cbk, + priv->children[i], + priv->children[i]->fops->xattrop, + loc, optype, xattr); + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + +/* }}} */ + +/* {{{ fxattrop */ + +int32_t +afr_fxattrop_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *xattr) +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, xattr); + + return 0; +} + + +int32_t +afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_fxattrop_cbk, + priv->children[i], + priv->children[i]->fops->fxattrop, + fd, optype, xattr); + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + +/* }}} */ + + +int32_t +afr_inodelk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int32_t +afr_inodelk (call_frame_t *frame, xlator_t *this, loc_t *loc, + int32_t cmd, struct flock *flock) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_inodelk_cbk, + priv->children[i], + priv->children[i]->fops->inodelk, + loc, cmd, flock); + + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + + +int32_t +afr_finodelk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int32_t +afr_finodelk (call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t cmd, struct flock *flock) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_finodelk_cbk, + priv->children[i], + priv->children[i]->fops->finodelk, + fd, cmd, flock); + + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + + +int32_t +afr_entrylk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int32_t +afr_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *basename, entrylk_cmd cmd, entrylk_type type) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_entrylk_cbk, + priv->children[i], + priv->children[i]->fops->entrylk, + loc, basename, cmd, type); + + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + + + +int32_t +afr_fentrylk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int32_t +afr_fentrylk (call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_fentrylk_cbk, + priv->children[i], + priv->children[i]->fops->fentrylk, + fd, basename, cmd, type); + + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + + +int32_t +afr_checksum_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + uint8_t *file_checksum, uint8_t *dir_checksum) + +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0 && (local->op_ret != 0)) { + local->op_ret = 0; + + local->cont.checksum.file_checksum = MALLOC (ZR_FILENAME_MAX); + memcpy (local->cont.checksum.file_checksum, file_checksum, + ZR_FILENAME_MAX); + + local->cont.checksum.dir_checksum = MALLOC (ZR_FILENAME_MAX); + memcpy (local->cont.checksum.dir_checksum, dir_checksum, + ZR_FILENAME_MAX); + + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->cont.checksum.file_checksum, + local->cont.checksum.dir_checksum); + + return 0; +} + + +int32_t +afr_checksum (call_frame_t *frame, xlator_t *this, loc_t *loc, + int32_t flag) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_checksum_cbk, + priv->children[i], + priv->children[i]->fops->checksum, + loc, flag); + + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + + +int32_t +afr_statfs_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + struct statvfs *statvfs) +{ + afr_local_t *local = NULL; + + int call_count = 0; + + LOCK (&frame->lock); + { + local = frame->local; + + if (op_ret == 0) { + local->op_ret = op_ret; + + if (local->cont.statfs.buf_set) { + if (statvfs->f_bavail < local->cont.statfs.buf.f_bavail) + local->cont.statfs.buf = *statvfs; + } else { + local->cont.statfs.buf = *statvfs; + local->cont.statfs.buf_set = 1; + } + } + + if (op_ret == -1) + local->op_errno = op_errno; + + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->cont.statfs.buf); + + return 0; +} + + +int32_t +afr_statfs (call_frame_t *frame, xlator_t *this, + loc_t *loc) +{ + afr_private_t * priv = NULL; + int child_count = 0; + afr_local_t * local = NULL; + int i = 0; + + int ret = -1; + int call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (loc, out); + + priv = this->private; + child_count = priv->child_count; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + frame->local = local; + call_count = local->call_count; + + for (i = 0; i < child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_statfs_cbk, + priv->children[i], + priv->children[i]->fops->statfs, + loc); + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + return 0; +} + + +int32_t +afr_lk_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct flock *lock) +{ + afr_local_t * local = NULL; + + int call_count = -1; + + local = frame->local; + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, + lock); + + return 0; +} + + +int32_t +afr_lk_unlock (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int i; + int call_count = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_locked_nodes_count (local->cont.lk.locked_nodes, + priv->child_count); + + if (call_count == 0) { + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->cont.lk.flock); + return 0; + } + + local->call_count = call_count; + + local->cont.lk.flock.l_type = F_UNLCK; + + for (i = 0; i < priv->child_count; i++) { + if (local->cont.lk.locked_nodes[i]) { + STACK_WIND (frame, afr_lk_unlock_cbk, + priv->children[i], + priv->children[i]->fops->lk, + local->fd, F_SETLK, + &local->cont.lk.flock); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int32_t +afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct flock *lock) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int child_index = -1; + + local = frame->local; + priv = this->private; + + child_index = (long) cookie; + + call_count = --local->call_count; + + if (!child_went_down (op_ret, op_errno) && (op_ret == -1)) { + local->op_ret = -1; + local->op_errno = op_errno; + + afr_lk_unlock (frame, this); + return 0; + } + + if (op_ret == 0) { + local->op_ret = 0; + local->op_errno = 0; + local->cont.lk.flock = *lock; + local->cont.lk.locked_nodes[child_index] = 1; + } + + child_index++; + + if (child_index < priv->child_count) { + STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->lk, + local->fd, local->cont.lk.cmd, + &local->cont.lk.flock); + } else if (local->op_ret == -1) { + /* all nodes have gone down */ + + AFR_STACK_UNWIND (frame, -1, ENOTCONN, &local->cont.lk.flock); + } else { + /* locking has succeeded on all nodes that are up */ + + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->cont.lk.flock); + } + + return 0; +} + + +int +afr_lk (call_frame_t *frame, xlator_t *this, + fd_t *fd, int32_t cmd, + struct flock *flock) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int i = 0; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_INIT (local, priv); + + frame->local = local; + + local->cont.lk.locked_nodes = CALLOC (priv->child_count, + sizeof (*local->cont.lk.locked_nodes)); + + if (!local->cont.lk.locked_nodes) { + gf_log (this->name, GF_LOG_ERROR, "out of memory :("); + op_errno = ENOMEM; + goto out; + } + + local->fd = fd_ref (fd); + local->cont.lk.cmd = cmd; + local->cont.lk.flock = *flock; + + STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) 0, + priv->children[i], + priv->children[i]->fops->lk, + fd, cmd, flock); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + return 0; +} + + +/** + * find_child_index - find the child's index in the array of subvolumes + * @this: AFR + * @child: child + */ + +static int +find_child_index (xlator_t *this, xlator_t *child) +{ + afr_private_t *priv = NULL; + + int i = -1; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if ((xlator_t *) child == priv->children[i]) + break; + } + + return i; +} + + +int32_t +notify (xlator_t *this, int32_t event, + void *data, ...) +{ + afr_private_t * priv = NULL; + unsigned char * child_up = NULL; + + int i = -1; + int up_children = 0; + + priv = this->private; + + if (!priv) + return 0; + + child_up = priv->child_up; + + switch (event) { + case GF_EVENT_CHILD_UP: + i = find_child_index (this, data); + + child_up[i] = 1; + + /* + if all the children were down, and one child came up, + send notify to parent + */ + + for (i = 0; i < priv->child_count; i++) + if (child_up[i]) + up_children++; + + if (up_children == 1) + default_notify (this, event, data); + + break; + + case GF_EVENT_CHILD_DOWN: + i = find_child_index (this, data); + + child_up[i] = 0; + + /* + if all children are down, and this was the last to go down, + send notify to parent + */ + + for (i = 0; i < priv->child_count; i++) + if (child_up[i]) + up_children++; + + if (up_children == 0) + default_notify (this, event, data); + + break; + + default: + default_notify (this, event, data); + } + + return 0; +} + + +static const char *favorite_child_warning_str = "You have specified subvolume '%s' " + "as the 'favorite child'. This means that if a discrepancy in the content " + "or attributes (ownership, permission, etc.) of a file is detected among " + "the subvolumes, the file on '%s' will be considered the definitive " + "version and its contents will OVERWRITE the contents of the file on other " + "subvolumes. All versions of the file except that on '%s' " + "WILL BE LOST."; + +static const char *no_lock_servers_warning_str = "You have set lock-server-count = 0. " + "This means correctness is NO LONGER GUARANTEED in all cases. If two or more " + "applications write to the same region of a file, there is a possibility that " + "its copies will be INCONSISTENT. Set it to a value greater than 0 unless you " + "are ABSOLUTELY SURE of what you are doing and WILL NOT HOLD GlusterFS " + "RESPOSIBLE for inconsistent data. If you are in doubt, set it to a value " + "greater than 0."; + +int32_t +init (xlator_t *this) +{ + afr_private_t * priv = NULL; + int child_count = 0; + xlator_list_t * trav = NULL; + int i = 0; + int ret = -1; + int op_errno = 0; + + char * read_subvol = NULL; + char * fav_child = NULL; + char * self_heal = NULL; + char * change_log = NULL; + + int32_t lock_server_count = 1; + + int fav_ret = -1; + int read_ret = -1; + int dict_ret = -1; + + if (!this->children) { + gf_log (this->name, GF_LOG_ERROR, + "AFR needs more than one child defined"); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. check volfile "); + } + + ALLOC_OR_GOTO (this->private, afr_private_t, out); + + priv = this->private; + + read_ret = dict_get_str (this->options, "read-subvolume", &read_subvol); + priv->read_child = -1; + + fav_ret = dict_get_str (this->options, "favorite-child", &fav_child); + priv->favorite_child = -1; + + /* Default values */ + + priv->data_self_heal = 1; + priv->metadata_self_heal = 1; + priv->entry_self_heal = 1; + + dict_ret = dict_get_str (this->options, "data-self-heal", &self_heal); + if (dict_ret == 0) { + ret = gf_string2boolean (self_heal, &priv->data_self_heal); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "invalid 'option data-self-heal %s' " + "defaulting to data-self-heal as 'on'", + self_heal); + priv->data_self_heal = 1; + } + } + + dict_ret = dict_get_str (this->options, "metadata-self-heal", + &self_heal); + if (dict_ret == 0) { + ret = gf_string2boolean (self_heal, &priv->metadata_self_heal); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "invalid 'option metadata-self-heal %s' " + "defaulting to metadata-self-heal as 'on'", + self_heal); + priv->metadata_self_heal = 1; + } + } + + dict_ret = dict_get_str (this->options, "entry-self-heal", &self_heal); + if (dict_ret == 0) { + ret = gf_string2boolean (self_heal, &priv->entry_self_heal); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "invalid 'option entry-self-heal %s' " + "defaulting to entry-self-heal as 'on'", + self_heal); + priv->entry_self_heal = 1; + } + } + + /* Change log options */ + + priv->data_change_log = 1; + priv->metadata_change_log = 0; + priv->entry_change_log = 1; + + dict_ret = dict_get_str (this->options, "data-change-log", + &change_log); + if (dict_ret == 0) { + ret = gf_string2boolean (change_log, &priv->data_change_log); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "invalid 'option data-change-log %s'. " + "defaulting to data-change-log as 'on'", + change_log); + priv->data_change_log = 1; + } + } + + dict_ret = dict_get_str (this->options, "metadata-change-log", + &change_log); + if (dict_ret == 0) { + ret = gf_string2boolean (change_log, + &priv->metadata_change_log); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "invalid 'option metadata-change-log %s'. " + "defaulting to metadata-change-log as 'off'", + change_log); + priv->metadata_change_log = 0; + } + } + + dict_ret = dict_get_str (this->options, "entry-change-log", + &change_log); + if (dict_ret == 0) { + ret = gf_string2boolean (change_log, &priv->entry_change_log); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "invalid 'option entry-change-log %s'. " + "defaulting to entry-change-log as 'on'", + change_log); + priv->entry_change_log = 1; + } + } + + /* Locking options */ + + priv->data_lock_server_count = 1; + priv->metadata_lock_server_count = 0; + priv->entry_lock_server_count = 1; + + dict_ret = dict_get_int32 (this->options, "data-lock-server-count", + &lock_server_count); + if (dict_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "setting data lock server count to %d", + lock_server_count); + + if (lock_server_count == 0) + gf_log (this->name, GF_LOG_WARNING, + no_lock_servers_warning_str); + + priv->data_lock_server_count = lock_server_count; + } + + + dict_ret = dict_get_int32 (this->options, + "metadata-lock-server-count", + &lock_server_count); + if (dict_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "setting metadata lock server count to %d", + lock_server_count); + priv->metadata_lock_server_count = lock_server_count; + } + + + dict_ret = dict_get_int32 (this->options, "entry-lock-server-count", + &lock_server_count); + if (dict_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "setting entry lock server count to %d", + lock_server_count); + + priv->entry_lock_server_count = lock_server_count; + } + + + trav = this->children; + while (trav) { + if (!read_ret && !strcmp (read_subvol, trav->xlator->name)) { + gf_log (this->name, GF_LOG_DEBUG, + "subvolume '%s' specified as read child", + trav->xlator->name); + + priv->read_child = child_count; + } + + if (fav_ret == 0 && !strcmp (fav_child, trav->xlator->name)) { + gf_log (this->name, GF_LOG_WARNING, + favorite_child_warning_str, trav->xlator->name, + trav->xlator->name, trav->xlator->name); + priv->favorite_child = child_count; + } + + child_count++; + trav = trav->next; + } + + /* XXX: return inode numbers from 1st subvolume till + afr supports read-subvolume based on inode's ctx + (and not itransform) for this reason afr_deitransform() + returns 0 always + */ + priv->read_child = 0; + + priv->wait_count = 1; + + priv->child_count = child_count; + LOCK_INIT (&priv->lock); + + priv->child_up = CALLOC (sizeof (unsigned char), child_count); + if (!priv->child_up) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + op_errno = ENOMEM; + goto out; + } + + priv->children = CALLOC (sizeof (xlator_t *), child_count); + if (!priv->children) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + op_errno = ENOMEM; + goto out; + } + + trav = this->children; + i = 0; + while (i < child_count) { + priv->children[i] = trav->xlator; + + trav = trav->next; + i++; + } + + ret = 0; +out: + return ret; +} + + +int +fini (xlator_t *this) +{ + return 0; +} + + +struct xlator_fops fops = { + .lookup = afr_lookup, + .open = afr_open, + .lk = afr_lk, + .flush = afr_flush, + .statfs = afr_statfs, + .fsync = afr_fsync, + .fsyncdir = afr_fsyncdir, + .xattrop = afr_xattrop, + .fxattrop = afr_fxattrop, + .inodelk = afr_inodelk, + .finodelk = afr_finodelk, + .entrylk = afr_entrylk, + .fentrylk = afr_fentrylk, + .checksum = afr_checksum, + + /* inode read */ + .access = afr_access, + .stat = afr_stat, + .fstat = afr_fstat, + .readlink = afr_readlink, + .getxattr = afr_getxattr, + .readv = afr_readv, + + /* inode write */ + .chmod = afr_chmod, + .chown = afr_chown, + .fchmod = afr_fchmod, + .fchown = afr_fchown, + .writev = afr_writev, + .truncate = afr_truncate, + .ftruncate = afr_ftruncate, + .utimens = afr_utimens, + .setxattr = afr_setxattr, + .removexattr = afr_removexattr, + + /* dir read */ + .opendir = afr_opendir, + .readdir = afr_readdir, + .getdents = afr_getdents, + + /* dir write */ + .create = afr_create, + .mknod = afr_mknod, + .mkdir = afr_mkdir, + .unlink = afr_unlink, + .rmdir = afr_rmdir, + .link = afr_link, + .symlink = afr_symlink, + .rename = afr_rename, + .setdents = afr_setdents, +}; + + +struct xlator_mops mops = { +}; + + +struct xlator_cbks cbks = { +}; + +struct volume_options options[] = { + { .key = {"read-subvolume" }, + .type = GF_OPTION_TYPE_XLATOR + }, + { .key = {"favorite-child"}, + .type = GF_OPTION_TYPE_XLATOR + }, + { .key = {"data-self-heal"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {"metadata-self-heal"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {"entry-self-heal"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {"data-change-log"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {"metadata-change-log"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {"entry-change-log"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {"data-lock-server-count"}, + .type = GF_OPTION_TYPE_INT, + .min = 0 + }, + { .key = {"metadata-lock-server-count"}, + .type = GF_OPTION_TYPE_INT, + .min = 0 + }, + { .key = {"entry-lock-server-count"}, + .type = GF_OPTION_TYPE_INT, + .min = 0 + }, + { .key = {NULL} }, +}; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h new file mode 100644 index 000000000..4cf6cdf9d --- /dev/null +++ b/xlators/cluster/afr/src/afr.h @@ -0,0 +1,523 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + + +#ifndef __AFR_H__ +#define __AFR_H__ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "scheduler.h" +#include "call-stub.h" +#include "compat-errno.h" + + +typedef struct _afr_private { + gf_lock_t lock; /* to guard access to child_count, etc */ + unsigned int child_count; /* total number of children */ + + xlator_t **children; + + unsigned char *child_up; + + gf_boolean_t data_self_heal; /* on/off */ + gf_boolean_t metadata_self_heal; /* on/off */ + gf_boolean_t entry_self_heal; /* on/off */ + + + gf_boolean_t data_change_log; /* on/off */ + gf_boolean_t metadata_change_log; /* on/off */ + gf_boolean_t entry_change_log; /* on/off */ + + unsigned int read_child; /* read-subvolume */ + unsigned int favorite_child; /* subvolume to be preferred in resolving + split-brain cases */ + + unsigned int data_lock_server_count; + unsigned int metadata_lock_server_count; + unsigned int entry_lock_server_count; + + unsigned int wait_count; /* # of servers to wait for success */ +} afr_private_t; + +typedef struct { + /* array of stat's, one for each child */ + struct stat *buf; + + /* array of xattr's, one for each child */ + dict_t **xattr; + + /* array of errno's, one for each child */ + int *child_errno; + + int32_t **pending_matrix; + int32_t **delta_matrix; + + int *sources; + int source; + int active_source; + int active_sinks; + int *success; + + fd_t *healing_fd; + int op_failed; + + int file_has_holes; + blksize_t block_size; + off_t file_size; + off_t offset; + + loc_t parent_loc; + int (*completion_cbk) (call_frame_t *frame, xlator_t *this); + call_frame_t *sh_frame; +} afr_self_heal_t; + + +typedef enum { + AFR_DATA_TRANSACTION, /* truncate, write, ... */ + AFR_METADATA_TRANSACTION, /* chmod, chown, ... */ + AFR_ENTRY_TRANSACTION, /* create, rmdir, ... */ + AFR_ENTRY_RENAME_TRANSACTION, /* rename */ + AFR_FLUSH_TRANSACTION, /* flush */ +} afr_transaction_type; + +typedef struct _afr_local { + unsigned int call_count; + unsigned int success_count; + unsigned int enoent_count; + + unsigned int need_metadata_self_heal; + unsigned int need_entry_self_heal; + unsigned int need_data_self_heal; + unsigned int govinda_gOvinda; + + unsigned int reval_child_index; + int32_t op_ret; + int32_t op_errno; + + int32_t *pending_array; + + loc_t loc; + loc_t newloc; + + fd_t *fd; + + glusterfs_fop_t fop; + + unsigned char *child_up; + int child_count; + + int32_t *child_errno; + + dict_t *xattr_req; + int open_fd_count; + /* + This struct contains the arguments for the "continuation" + (scheme-like) of fops + */ + + int op; + struct { + struct { + unsigned char buf_set; + struct statvfs buf; + } statfs; + + struct { + inode_t *inode; + struct stat buf; + dict_t *xattr; + } lookup; + + struct { + int32_t flags; + } open; + + struct { + int32_t cmd; + struct flock flock; + unsigned char *locked_nodes; + } lk; + + struct { + uint8_t *file_checksum; + uint8_t *dir_checksum; + } checksum; + + /* inode read */ + + struct { + int32_t mask; + int last_tried; /* index of the child we tried previously */ + } access; + + struct { + int last_tried; + ino_t ino; + } stat; + + struct { + int last_tried; + ino_t ino; + } fstat; + + struct { + size_t size; + int last_tried; + } readlink; + + struct { + const char *name; + int last_tried; + } getxattr; + + struct { + size_t size; + off_t offset; + int last_tried; + } readv; + + /* dir read */ + + struct { + int success_count; + int32_t op_ret; + int32_t op_errno; + } opendir; + + struct { + int32_t op_ret; + int32_t op_errno; + size_t size; + off_t offset; + + int last_tried; + } readdir; + + struct { + int32_t op_ret; + int32_t op_errno; + + size_t size; + off_t offset; + int32_t flag; + + int last_tried; + } getdents; + + /* inode write */ + + struct { + ino_t ino; + mode_t mode; + struct stat buf; + } chmod; + + struct { + ino_t ino; + mode_t mode; + struct stat buf; + } fchmod; + + struct { + ino_t ino; + uid_t uid; + gid_t gid; + struct stat buf; + } chown; + + struct { + ino_t ino; + uid_t uid; + gid_t gid; + struct stat buf; + } fchown; + + struct { + ino_t ino; + struct stat buf; + + int32_t op_ret; + + struct iovec *vector; + dict_t *refs; + int32_t count; + off_t offset; + } writev; + + struct { + ino_t ino; + off_t offset; + struct stat buf; + } truncate; + + struct { + ino_t ino; + off_t offset; + struct stat buf; + } ftruncate; + + struct { + ino_t ino; + struct timespec tv[2]; + struct stat buf; + } utimens; + + struct { + dict_t *dict; + int32_t flags; + } setxattr; + + struct { + const char *name; + } removexattr; + + /* dir write */ + + struct { + ino_t ino; + fd_t *fd; + int32_t flags; + mode_t mode; + inode_t *inode; + struct stat buf; + } create; + + struct { + ino_t ino; + dev_t dev; + mode_t mode; + inode_t *inode; + struct stat buf; + } mknod; + + struct { + ino_t ino; + int32_t mode; + inode_t *inode; + struct stat buf; + } mkdir; + + struct { + int32_t op_ret; + int32_t op_errno; + } unlink; + + struct { + int32_t op_ret; + int32_t op_errno; + } rmdir; + + struct { + ino_t ino; + struct stat buf; + } rename; + + struct { + ino_t ino; + inode_t *inode; + struct stat buf; + } link; + + struct { + ino_t ino; + inode_t *inode; + struct stat buf; + char *linkpath; + } symlink; + + struct { + int32_t flags; + dir_entry_t *entries; + int32_t count; + } setdents; + } cont; + + struct { + off_t start, len; + + unsigned char *locked_nodes; + int lock_count; + + const char *basename; + const char *new_basename; + + char *pending; + + loc_t parent_loc; + loc_t new_parent_loc; + + afr_transaction_type type; + + int success_count; + int erase_pending; + int failure_count; + + int last_tried; + int32_t *child_errno; + + call_frame_t *main_frame; + + int (*fop) (call_frame_t *frame, xlator_t *this); + + int (*done) (call_frame_t *frame, xlator_t *this); + + int (*resume) (call_frame_t *frame, xlator_t *this); + + int (*unwind) (call_frame_t *frame, xlator_t *this); + } transaction; + + afr_self_heal_t self_heal; +} afr_local_t; + +/* try alloc and if it fails, goto label */ +#define ALLOC_OR_GOTO(var, type, label) do { \ + var = CALLOC (sizeof (type), 1); \ + if (!var) { \ + gf_log (this->name, GF_LOG_ERROR, \ + "out of memory :("); \ + op_errno = ENOMEM; \ + goto label; \ + } \ + } while (0); + + +/* did a call fail due to a child failing? */ +#define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \ + ((op_errno == ENOTCONN) || \ + (op_errno == EBADFD))) + +/* have we tried all children? */ +#define all_tried(i, count) ((i) == (count) - 1) + +void +afr_build_parent_loc (loc_t *parent, loc_t *child); + +int +afr_up_children_count (int child_count, unsigned char *child_up); + +int +afr_locked_nodes_count (unsigned char *locked_nodes, int child_count); + +int +afr_first_up_child (afr_private_t *priv); + +ino64_t +afr_itransform (ino64_t ino, int child_count, int child_index); + +int +afr_deitransform (ino64_t ino, int child_count); + +void +afr_local_cleanup (afr_local_t *local, xlator_t *this); + +int +afr_frame_return (call_frame_t *frame); + +#define AFR_STACK_UNWIND(frame, params ...) \ + do { \ + afr_local_t *__local = NULL; \ + xlator_t *__this = NULL; \ + __local = frame->local; \ + __this = frame->this; \ + frame->local = NULL; \ + STACK_UNWIND (frame, params); \ + afr_local_cleanup (__local, __this); \ + free (__local); \ +} while (0); + +#define AFR_STACK_DESTROY(frame) \ + do { \ + afr_local_t *__local = NULL; \ + xlator_t *__this = NULL; \ + __local = frame->local; \ + __this = frame->this; \ + frame->local = NULL; \ + STACK_DESTROY (frame->root); \ + afr_local_cleanup (__local, __this); \ + free (__local); \ +} while (0); + +/* allocate and return a string that is the basename of argument */ +static inline char * +AFR_BASENAME (const char *str) +{ + char *__tmp_str = NULL; + char *__basename_str = NULL; + __tmp_str = strdup (str); + __basename_str = strdup (basename (__tmp_str)); + FREE (__tmp_str); + return __basename_str; +} + +/* initialize local_t */ +static inline int +AFR_LOCAL_INIT (afr_local_t *local, afr_private_t *priv) +{ + local->child_up = CALLOC (sizeof (*local->child_up), + priv->child_count); + if (!local->child_up) { + return -ENOMEM; + } + + memcpy (local->child_up, priv->child_up, + sizeof (*local->child_up) * priv->child_count); + + + local->call_count = afr_up_children_count (priv->child_count, local->child_up); + if (local->call_count == 0) + return -ENOTCONN; + + local->transaction.erase_pending = 1; + + local->op_ret = -1; + local->op_errno = EUCLEAN; + + return 0; +} + + +static inline int +afr_transaction_local_init (afr_local_t *local, afr_private_t *priv) +{ + local->child_errno = CALLOC (sizeof (*local->child_errno), + priv->child_count); + if (!local->child_errno) { + return -ENOMEM; + } + + local->pending_array = CALLOC (sizeof (*local->pending_array), + priv->child_count); + if (!local->pending_array) { + return -ENOMEM; + } + + local->transaction.locked_nodes = CALLOC (sizeof (*local->transaction.locked_nodes), + priv->child_count); + + local->transaction.child_errno = CALLOC (sizeof (*local->transaction.child_errno), + priv->child_count); + + return 0; +} + +#endif /* __AFR_H__ */ diff --git a/xlators/cluster/dht/Makefile.am b/xlators/cluster/dht/Makefile.am new file mode 100644 index 000000000..f963effea --- /dev/null +++ b/xlators/cluster/dht/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = src
\ No newline at end of file diff --git a/xlators/cluster/dht/src/Makefile.am b/xlators/cluster/dht/src/Makefile.am new file mode 100644 index 000000000..b7d07d137 --- /dev/null +++ b/xlators/cluster/dht/src/Makefile.am @@ -0,0 +1,30 @@ + +xlator_LTLIBRARIES = dht.la nufa.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster + + +dht_common_source = dht-layout.c dht-helper.c dht-linkfile.c \ + dht-selfheal.c dht-rename.c dht-hashfn.c dht-hashfn-tea.c + +dht_la_SOURCES = $(dht_common_source) dht.c + +nufa_la_SOURCES = $(dht_common_source) nufa.c + +dht_la_LDFLAGS = -module -avoidversion +dht_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +nufa_la_LDFLAGS = -module -avoidversion +nufa_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = dht-common.h dht-common.c + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = + +uninstall-local: + rm -f $(DESTDIR)$(xlatordir)/distribute.so + +install-data-hook: + ln -sf dht.so $(DESTDIR)$(xlatordir)/distribute.so
\ No newline at end of file diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c new file mode 100644 index 000000000..5e4979e31 --- /dev/null +++ b/xlators/cluster/dht/src/dht-common.c @@ -0,0 +1,3470 @@ +/* + Copyright (c) 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +/* TODO: add NS locking */ + +#include "glusterfs.h" +#include "xlator.h" +#include "dht-common.h" +#include "defaults.h" + + +/* TODO: + - use volumename in xattr instead of "dht" + - use NS locks + - handle all cases in self heal layout reconstruction + - complete linkfile selfheal +*/ + +int +dht_lookup_selfheal_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int op_ret, int op_errno) +{ + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int ret = 0; + + local = frame->local; + ret = op_ret; + + if (ret == 0) { + layout = local->selfheal.layout; + ret = inode_ctx_put (local->inode, this, (uint64_t)(long)layout); + + if (ret == 0) + local->selfheal.layout = NULL; + + if (local->st_ino) { + local->stbuf.st_ino = local->st_ino; + } else { + gf_log (this->name, GF_LOG_WARNING, + "could not find hashed subvolume for %s", + local->loc.path); + } + } + + DHT_STACK_UNWIND (frame, ret, local->op_errno, local->inode, + &local->stbuf, local->xattr); + + return 0; +} + + +int +dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf, dict_t *xattr) +{ + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + dht_layout_t *layout = NULL; + int ret = 0; + int is_dir = 0; + + conf = this->private; + local = frame->local; + prev = cookie; + + layout = local->layout; + + LOCK (&frame->lock); + { + /* TODO: assert equal mode on stbuf->st_mode and + local->stbuf->st_mode + + else mkdir/chmod/chown and fix + */ + /* TODO: assert equal hash type in xattr, local->xattr */ + + /* TODO: always ensure same subvolume is in layout->list[0] */ + + ret = dht_layout_merge (this, layout, prev->this, + op_ret, op_errno, xattr); + + if (op_ret == -1) { + local->op_errno = ENOENT; + gf_log (this->name, GF_LOG_WARNING, + "lookup of %s on %s returned error (%s)", + local->loc.path, prev->this->name, + strerror (op_errno)); + + goto unlock; + } + + is_dir = check_is_dir (inode, stbuf, xattr); + if (!is_dir) + goto unlock; + + local->op_ret = 0; + if (local->xattr == NULL) + local->xattr = dict_ref (xattr); + if (local->inode == NULL) + local->inode = inode_ref (inode); + + dht_stat_merge (this, &local->stbuf, stbuf, prev->this); + + if (prev->this == local->hashed_subvol) + local->st_ino = local->stbuf.st_ino; + + } +unlock: + UNLOCK (&frame->lock); + + + this_call_cnt = dht_frame_return (frame); + + if (is_last_call (this_call_cnt)) { + if (local->op_ret == 0) { + ret = dht_layout_normalize (this, &local->loc, layout); + + local->layout = NULL; + + if (ret != 0) { + layout->gen = conf->gen; + + gf_log (this->name, GF_LOG_WARNING, + "fixing assignment on %s", + local->loc.path); + goto selfheal; + } + + inode_ctx_put (local->inode, this, (uint64_t)(long)layout); + + if (local->st_ino) { + local->stbuf.st_ino = local->st_ino; + } else { + gf_log (this->name, GF_LOG_WARNING, + "could not find hashed subvolume for %s", + local->loc.path); + } + } + + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr); + } + + return 0; + +selfheal: + ret = dht_selfheal_directory (frame, dht_lookup_selfheal_cbk, + &local->loc, layout); + + return 0; +} + +int +dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf, dict_t *xattr) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + dht_layout_t *layout = NULL; + int ret = -1; + int is_dir = 0; + int is_linkfile = 0; + + local = frame->local; + prev = cookie; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + + if (op_errno != ENOTCONN && op_errno != ENOENT) { + gf_log (this->name, GF_LOG_WARNING, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + } + + goto unlock; + } + + if (S_IFMT & (stbuf->st_mode ^ local->inode->st_mode)) { + gf_log (this->name, GF_LOG_WARNING, + "mismatching filetypes 0%o v/s 0%o for %s", + (stbuf->st_mode & S_IFMT), + (local->inode->st_mode & S_IFMT), + local->loc.path); + + local->op_ret = -1; + local->op_errno = EINVAL; + + goto unlock; + } + + layout = dht_layout_get (this, inode); + + is_dir = check_is_dir (inode, stbuf, xattr); + is_linkfile = check_is_linkfile (inode, stbuf, xattr); + + if (is_linkfile) { + gf_log (this->name, GF_LOG_WARNING, + "linkfile found in revalidate for %s", + local->loc.path); + local->layout_mismatch = 1; + + goto unlock; + } + + if (is_dir) { + ret = dht_layout_dir_mismatch (this, layout, + prev->this, &local->loc, + xattr); + if (ret != 0) { + gf_log (this->name, GF_LOG_WARNING, + "mismatching layouts for %s", + local->loc.path); + + local->layout_mismatch = 1; + + goto unlock; + } + } + + dht_stat_merge (this, &local->stbuf, stbuf, prev->this); + + local->op_ret = 0; + local->stbuf.st_ino = local->st_ino; + + if (!local->xattr) + local->xattr = dict_ref (xattr); + } +unlock: + UNLOCK (&frame->lock); + + this_call_cnt = dht_frame_return (frame); + + if (is_last_call (this_call_cnt)) { + if (!S_ISDIR (local->stbuf.st_mode) + && (local->hashed_subvol != local->cached_subvol) + && (local->stbuf.st_nlink == 1)) + local->stbuf.st_mode |= S_ISVTX; + + if (local->layout_mismatch) { + local->op_ret = -1; + local->op_errno = ESTALE; + } + + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr); + } + + return 0; +} + + +int +dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *stbuf) +{ + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + xlator_t *cached_subvol = NULL; + + local = frame->local; + cached_subvol = local->cached_subvol; + + layout = dht_layout_for_subvol (this, local->cached_subvol); + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no pre-set layout for subvolume %s", + cached_subvol ? cached_subvol->name : "<nil>"); + local->op_ret = -1; + local->op_errno = EINVAL; + goto unwind; + } + + inode_ctx_put (local->inode, this, (uint64_t)(long)layout); + local->op_ret = 0; + if (local->stbuf.st_nlink == 1) + local->stbuf.st_mode |= S_ISVTX; + +unwind: + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr); + return 0; +} + + +int +dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, dict_t *xattr) +{ + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + int is_linkfile = 0; + int is_dir = 0; + xlator_t *subvol = NULL; + loc_t *loc = NULL; + xlator_t *link_subvol = NULL; + xlator_t *hashed_subvol = NULL; + xlator_t *cached_subvol = NULL; + + conf = this->private; + + local = frame->local; + loc = &local->loc; + + prev = cookie; + subvol = prev->this; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + if (op_errno != ENOENT) + local->op_errno = op_errno; + goto unlock; + } + + is_linkfile = check_is_linkfile (inode, buf, xattr); + is_dir = check_is_dir (inode, buf, xattr); + + if (is_linkfile) { + link_subvol = dht_linkfile_subvol (this, inode, buf, + xattr); + gf_log (this->name, GF_LOG_DEBUG, + "found on %s linkfile %s (-> %s)", + subvol->name, loc->path, + link_subvol ? link_subvol->name : "''"); + goto unlock; + } else { + gf_log (this->name, GF_LOG_DEBUG, + "found on %s file %s", + subvol->name, loc->path); + } + + if (!local->cached_subvol) { + /* found one file */ + dht_stat_merge (this, &local->stbuf, buf, subvol); + local->xattr = dict_ref (xattr); + local->cached_subvol = subvol; + } else { + gf_log (this->name, GF_LOG_WARNING, + "multiple subvolumes (%s and %s atleast) have " + "file %s", local->cached_subvol->name, + subvol->name, local->loc.path); + } + } +unlock: + UNLOCK (&frame->lock); + + if (is_linkfile) { + gf_log (this->name, GF_LOG_WARNING, + "deleting stale linkfile %s on %s", + loc->path, subvol->name); + dht_linkfile_unlink (frame, this, subvol, loc); + } + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + hashed_subvol = local->hashed_subvol; + cached_subvol = local->cached_subvol; + + if (!cached_subvol) { + DHT_STACK_UNWIND (frame, -1, ENOENT, NULL, NULL, NULL); + return 0; + } + + gf_log (this->name, GF_LOG_WARNING, + "linking file %s existing on %s to %s (hash)", + loc->path, cached_subvol->name, hashed_subvol->name); + + dht_linkfile_create (frame, dht_lookup_linkfile_create_cbk, + cached_subvol, hashed_subvol, loc); + } + + return 0; +} + + +int +dht_lookup_everywhere (call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + int i = 0; + int call_cnt = 0; + + conf = this->private; + local = frame->local; + + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; + + if (!local->inode) + local->inode = inode_ref (loc->inode); + + for (i = 0; i < call_cnt; i++) { + STACK_WIND (frame, dht_lookup_everywhere_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, + loc, local->xattr_req); + } + + return 0; +} + + +int +dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf, dict_t *xattr) +{ + call_frame_t *prev = NULL; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + xlator_t *subvol = NULL; + loc_t *loc = NULL; + + prev = cookie; + subvol = prev->this; + + local = frame->local; + loc = &local->loc; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "lookup of %s on %s (following linkfile) failed (%s)", + local->loc.path, subvol->name, strerror (op_errno)); + + dht_lookup_everywhere (frame, this, loc); + return 0; + } + + /* TODO: assert type is non-dir and non-linkfile */ + + if (stbuf->st_nlink == 1) + stbuf->st_mode |= S_ISVTX; + dht_itransform (this, prev->this, stbuf->st_ino, &stbuf->st_ino); + + layout = dht_layout_for_subvol (this, prev->this); + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no pre-set layout for subvolume %s", + prev->this->name); + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + inode_ctx_put (inode, this, (uint64_t)(long)layout); + +out: + DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf, xattr); + + return 0; +} + + +int +dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf, dict_t *xattr) +{ + dht_layout_t *layout = NULL; + char is_linkfile = 0; + char is_dir = 0; + xlator_t *subvol = NULL; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + loc_t *loc = NULL; + int i = 0; + call_frame_t *prev = NULL; + int call_cnt = 0; + + + conf = this->private; + + prev = cookie; + local = frame->local; + loc = &local->loc; + + if (ENTRY_MISSING (op_ret, op_errno)) { + if (conf->search_unhashed) { + local->op_errno = ENOENT; + dht_lookup_everywhere (frame, this, loc); + return 0; + } + } + + if (op_ret == 0) { + is_dir = check_is_dir (inode, stbuf, xattr); + if (is_dir) { + local->inode = inode_ref (inode); + local->xattr = dict_ref (xattr); + } + } + + if (is_dir || (op_ret == -1 && op_errno == ENOTCONN)) { + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; + + local->layout = dht_layout_new (this, conf->subvolume_cnt); + if (!local->layout) { + op_ret = -1; + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto out; + } + + for (i = 0; i < call_cnt; i++) { + STACK_WIND (frame, dht_lookup_dir_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, + &local->loc, local->xattr_req); + } + return 0; + } + + if (op_ret == -1) + goto out; + + is_linkfile = check_is_linkfile (inode, stbuf, xattr); + is_dir = check_is_dir (inode, stbuf, xattr); + + if (!is_dir && !is_linkfile) { + /* non-directory and not a linkfile */ + + dht_itransform (this, prev->this, stbuf->st_ino, + &stbuf->st_ino); + + layout = dht_layout_for_subvol (this, prev->this); + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no pre-set layout for subvolume %s", + prev->this->name); + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + inode_ctx_put (inode, this, (uint64_t)(long)layout); + goto out; + } + + if (is_linkfile) { + subvol = dht_linkfile_subvol (this, inode, stbuf, xattr); + + if (!subvol) { + gf_log (this->name, GF_LOG_WARNING, + "linkfile not having link subvolume. path=%s", + loc->path); + dht_lookup_everywhere (frame, this, loc); + return 0; + } + + STACK_WIND (frame, dht_lookup_linkfile_cbk, + subvol, subvol->fops->lookup, + &local->loc, local->xattr_req); + } + + return 0; + +out: + DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf, xattr); + return 0; +} + + +int +dht_lookup (call_frame_t *frame, xlator_t *this, + loc_t *loc, dict_t *xattr_req) +{ + xlator_t *subvol = NULL; + xlator_t *hashed_subvol = NULL; + xlator_t *cached_subvol = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int ret = -1; + int op_errno = -1; + dht_layout_t *layout = NULL; + int i = 0; + int call_cnt = 0; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + conf = this->private; + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + ret = loc_dup (loc, &local->loc); + if (ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "copying location failed for path=%s", + loc->path); + goto err; + } + + if (xattr_req) { + local->xattr_req = dict_ref (xattr_req); + } else { + local->xattr_req = dict_new (); + } + + hashed_subvol = dht_subvol_get_hashed (this, loc); + cached_subvol = dht_subvol_get_cached (this, loc->inode); + + local->cached_subvol = cached_subvol; + local->hashed_subvol = hashed_subvol; + + if (is_revalidate (loc)) { + layout = dht_layout_get (this, loc->inode); + + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "revalidate without cache. path=%s", + loc->path); + op_errno = EINVAL; + goto err; + } + + if (layout->gen && (layout->gen < conf->gen)) { + gf_log (this->name, GF_LOG_WARNING, + "incomplete layout failure for path=%s", + loc->path); + op_errno = EAGAIN; + goto err; + } + + local->inode = inode_ref (loc->inode); + local->st_ino = loc->inode->ino; + + local->call_cnt = layout->cnt; + call_cnt = local->call_cnt; + + /* NOTE: we don't require 'trusted.glusterfs.dht.linkto' attribute, + * revalidates directly go to the cached-subvolume. + */ + ret = dict_set_uint32 (local->xattr_req, + "trusted.glusterfs.dht", 4 * 4); + + for (i = 0; i < layout->cnt; i++) { + subvol = layout->list[i].xlator; + + STACK_WIND (frame, dht_revalidate_cbk, + subvol, subvol->fops->lookup, + loc, local->xattr_req); + + if (!--call_cnt) + break; + } + } else { + /* TODO: remove the hard-coding */ + ret = dict_set_uint32 (local->xattr_req, + "trusted.glusterfs.dht", 4 * 4); + + ret = dict_set_uint32 (local->xattr_req, + "trusted.glusterfs.dht.linkto", 256); + + if (!hashed_subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no subvolume in layout for path=%s, " + "checking on all the subvols to see if " + "it is a directory", loc->path); + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; + + local->layout = dht_layout_new (this, conf->subvolume_cnt); + if (!local->layout) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + for (i = 0; i < call_cnt; i++) { + STACK_WIND (frame, dht_lookup_dir_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, + &local->loc, local->xattr_req); + } + return 0; + } + + STACK_WIND (frame, dht_lookup_cbk, + hashed_subvol, hashed_subvol->fops->lookup, + loc, local->xattr_req); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + + +int +dht_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct stat *stbuf) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + + + local = frame->local; + prev = cookie; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_ERROR, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + goto unlock; + } + + dht_stat_merge (this, &local->stbuf, stbuf, prev->this); + + if (local->inode) + local->stbuf.st_ino = local->inode->ino; + local->op_ret = 0; + } +unlock: + UNLOCK (&frame->lock); + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->stbuf); + + return 0; +} + + +int +dht_stat (call_frame_t *frame, xlator_t *this, + loc_t *loc) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int i = 0; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + layout = dht_layout_get (this, loc->inode); + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no layout for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->inode = inode_ref (loc->inode); + local->call_cnt = layout->cnt; + + for (i = 0; i < layout->cnt; i++) { + subvol = layout->list[i].xlator; + + STACK_WIND (frame, dht_attr_cbk, + subvol, subvol->fops->stat, + loc); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_fstat (call_frame_t *frame, xlator_t *this, + fd_t *fd) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int i = 0; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + layout = dht_layout_get (this, fd->inode); + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no layout for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "local allocation failed :("); + goto err; + } + + local->inode = inode_ref (fd->inode); + local->call_cnt = layout->cnt;; + + for (i = 0; i < layout->cnt; i++) { + subvol = layout->list[i].xlator; + STACK_WIND (frame, dht_attr_cbk, + subvol, subvol->fops->fstat, + fd); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_chmod (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode) +{ + dht_layout_t *layout = NULL; + dht_local_t *local = NULL; + int op_errno = -1; + int i = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + layout = dht_layout_get (this, loc->inode); + + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no layout for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + if (!layout_is_sane (layout)) { + gf_log (this->name, GF_LOG_ERROR, + "layout is not sane for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->inode = inode_ref (loc->inode); + local->call_cnt = layout->cnt; + + for (i = 0; i < layout->cnt; i++) { + STACK_WIND (frame, dht_attr_cbk, + layout->list[i].xlator, + layout->list[i].xlator->fops->chmod, + loc, mode); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_chown (call_frame_t *frame, xlator_t *this, + loc_t *loc, uid_t uid, gid_t gid) +{ + dht_layout_t *layout = NULL; + dht_local_t *local = NULL; + int op_errno = -1; + int i = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + layout = dht_layout_get (this, loc->inode); + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no layout for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + if (!layout_is_sane (layout)) { + gf_log (this->name, GF_LOG_ERROR, + "layout is not sane for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->inode = inode_ref (loc->inode); + local->call_cnt = layout->cnt; + + for (i = 0; i < layout->cnt; i++) { + STACK_WIND (frame, dht_attr_cbk, + layout->list[i].xlator, + layout->list[i].xlator->fops->chown, + loc, uid, gid); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_fchmod (call_frame_t *frame, xlator_t *this, + fd_t *fd, mode_t mode) +{ + dht_layout_t *layout = NULL; + dht_local_t *local = NULL; + int op_errno = -1; + int i = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + + layout = dht_layout_get (this, fd->inode); + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no layout for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + if (!layout_is_sane (layout)) { + gf_log (this->name, GF_LOG_ERROR, + "layout is not sane for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->inode = inode_ref (fd->inode); + local->call_cnt = layout->cnt; + + for (i = 0; i < layout->cnt; i++) { + STACK_WIND (frame, dht_attr_cbk, + layout->list[i].xlator, + layout->list[i].xlator->fops->fchmod, + fd, mode); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_fchown (call_frame_t *frame, xlator_t *this, + fd_t *fd, uid_t uid, gid_t gid) +{ + dht_layout_t *layout = NULL; + dht_local_t *local = NULL; + int op_errno = -1; + int i = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + layout = dht_layout_get (this, fd->inode); + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no layout for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + if (!layout_is_sane (layout)) { + gf_log (this->name, GF_LOG_ERROR, + "layout is not sane for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->inode = inode_ref (fd->inode); + local->call_cnt = layout->cnt; + + for (i = 0; i < layout->cnt; i++) { + STACK_WIND (frame, dht_attr_cbk, + layout->list[i].xlator, + layout->list[i].xlator->fops->fchown, + fd, uid, gid); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_utimens (call_frame_t *frame, xlator_t *this, + loc_t *loc, struct timespec tv[2]) +{ + dht_layout_t *layout = NULL; + dht_local_t *local = NULL; + int op_errno = -1; + int i = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + layout = dht_layout_get (this, loc->inode); + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no layout for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + if (!layout_is_sane (layout)) { + gf_log (this->name, GF_LOG_ERROR, + "layout is not sane for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->inode = inode_ref (loc->inode); + local->call_cnt = layout->cnt; + + for (i = 0; i < layout->cnt; i++) { + STACK_WIND (frame, dht_attr_cbk, + layout->list[i].xlator, + layout->list[i].xlator->fops->utimens, + loc, tv); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_truncate (call_frame_t *frame, xlator_t *this, + loc_t *loc, off_t offset) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = dht_subvol_get_cached (this, loc->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->inode = inode_ref (loc->inode); + local->call_cnt = 1; + + STACK_WIND (frame, dht_attr_cbk, + subvol, subvol->fops->truncate, + loc, offset); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_ftruncate (call_frame_t *frame, xlator_t *this, + fd_t *fd, off_t offset) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->inode = inode_ref (fd->inode); + local->call_cnt = 1; + + STACK_WIND (frame, dht_attr_cbk, + subvol, subvol->fops->ftruncate, + fd, offset); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_err_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + + + local = frame->local; + prev = cookie; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_ERROR, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + goto unlock; + } + + local->op_ret = 0; + } +unlock: + UNLOCK (&frame->lock); + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int +dht_access (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t mask) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = dht_subvol_get_cached (this, loc->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->call_cnt = 1; + + STACK_WIND (frame, dht_err_cbk, + subvol, subvol->fops->access, + loc, mask); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno); + + return 0; +} + + +int +dht_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, const char *path) +{ + DHT_STACK_UNWIND (frame, op_ret, op_errno, path); + + return 0; +} + + +int +dht_readlink (call_frame_t *frame, xlator_t *this, + loc_t *loc, size_t size) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = dht_subvol_get_cached (this, loc->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_readlink_cbk, + subvol, subvol->fops->readlink, + loc, size); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr) +{ + DHT_STACK_UNWIND (frame, op_ret, op_errno, xattr); + + return 0; +} + + +int +dht_getxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *key) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = dht_subvol_get_cached (this, loc->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_getxattr_cbk, + subvol, subvol->fops->getxattr, + loc, key); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_setxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, dict_t *xattr, int flags) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = dht_subvol_get_cached (this, loc->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->call_cnt = 1; + + STACK_WIND (frame, dht_err_cbk, + subvol, subvol->fops->setxattr, + loc, xattr, flags); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_removexattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *key) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = dht_subvol_get_cached (this, loc->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->call_cnt = 1; + + STACK_WIND (frame, dht_err_cbk, + subvol, subvol->fops->removexattr, + loc, key); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, fd_t *fd) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + + + local = frame->local; + prev = cookie; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_ERROR, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + goto unlock; + } + + local->op_ret = 0; + } +unlock: + UNLOCK (&frame->lock); + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->fd); + + return 0; +} + + +int +dht_open (call_frame_t *frame, xlator_t *this, + loc_t *loc, int flags, fd_t *fd) +{ + xlator_t *subvol = NULL; + int ret = -1; + int op_errno = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->fd = fd_ref (fd); + ret = loc_dup (loc, &local->loc); + if (ret == -1) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->call_cnt = 1; + + STACK_WIND (frame, dht_fd_cbk, + subvol, subvol->fops->open, + loc, flags, fd); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + struct iovec *vector, int count, struct stat *stbuf) +{ + DHT_STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf); + + return 0; +} + + +int +dht_readv (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t off) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_readv_cbk, + subvol, subvol->fops->readv, + fd, size, off); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, 0, NULL); + + return 0; +} + + +int +dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct stat *stbuf) +{ + DHT_STACK_UNWIND (frame, op_ret, op_errno, stbuf); + + return 0; +} + + +int +dht_writev (call_frame_t *frame, xlator_t *this, + fd_t *fd, struct iovec *vector, int count, off_t off) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_writev_cbk, + subvol, subvol->fops->writev, + fd, vector, count, off); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, 0); + + return 0; +} + + +int +dht_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->fd = fd_ref (fd); + local->call_cnt = 1; + + STACK_WIND (frame, dht_err_cbk, + subvol, subvol->fops->flush, fd); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno); + + return 0; +} + + +int +dht_fsync (call_frame_t *frame, xlator_t *this, + fd_t *fd, int datasync) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocatoin failed :("); + goto err; + } + local->call_cnt = 1; + + STACK_WIND (frame, dht_err_cbk, + subvol, subvol->fops->fsync, + fd, datasync); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno); + + return 0; +} + + +int +dht_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct flock *flock) +{ + DHT_STACK_UNWIND (frame, op_ret, op_errno, flock); + + return 0; +} + + +int +dht_lk (call_frame_t *frame, xlator_t *this, + fd_t *fd, int cmd, struct flock *flock) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_lk_cbk, + subvol, subvol->fops->lk, + fd, cmd, flock); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + +/* gf_lk no longer exists +int +dht_gf_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct flock *flock) +{ + DHT_STACK_UNWIND (frame, op_ret, op_errno, flock); + + return 0; +} + + +int +dht_gf_lk (call_frame_t *frame, xlator_t *this, + loc_t *loc, int cmd, struct flock *flock) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_gf_lk_cbk, + subvol, subvol->fops->gf_lk, + fd, cmd, flock); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} +*/ + +int +dht_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct statvfs *statvfs) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + goto unlock; + } + local->op_ret = 0; + + /* TODO: normalize sizes */ + local->statvfs.f_bsize = statvfs->f_bsize; + local->statvfs.f_frsize = statvfs->f_frsize; + + local->statvfs.f_blocks += statvfs->f_blocks; + local->statvfs.f_bfree += statvfs->f_bfree; + local->statvfs.f_bavail += statvfs->f_bavail; + local->statvfs.f_files += statvfs->f_files; + local->statvfs.f_ffree += statvfs->f_ffree; + local->statvfs.f_favail += statvfs->f_favail; + local->statvfs.f_fsid = statvfs->f_fsid; + local->statvfs.f_flag = statvfs->f_flag; + local->statvfs.f_namemax = statvfs->f_namemax; + + } +unlock: + UNLOCK (&frame->lock); + + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->statvfs); + + return 0; +} + + +int +dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int op_errno = -1; + int i = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + conf = this->private; + + local = dht_local_init (frame); + local->call_cnt = conf->subvolume_cnt; + + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND (frame, dht_statfs_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->statfs, loc); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int ret = -1; + int op_errno = -1; + int i = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + conf = this->private; + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->fd = fd_ref (fd); + ret = loc_dup (loc, &local->loc); + if (ret == -1) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->call_cnt = conf->subvolume_cnt; + + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND (frame, dht_fd_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->opendir, + loc, fd); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, gf_dirent_t *orig_entries) +{ + dht_local_t *local = NULL; + gf_dirent_t entries; + gf_dirent_t *orig_entry = NULL; + gf_dirent_t *entry = NULL; + call_frame_t *prev = NULL; + xlator_t *subvol = NULL; + xlator_t *next = NULL; + dht_layout_t *layout = NULL; + int count = 0; + + + INIT_LIST_HEAD (&entries.list); + prev = cookie; + local = frame->local; + + if (op_ret < 0) + goto done; + + layout = dht_layout_get (this, local->fd->inode); + + list_for_each_entry (orig_entry, &orig_entries->list, list) { + subvol = dht_layout_search (this, layout, orig_entry->d_name); + + if (!subvol || subvol == prev->this) { + entry = gf_dirent_for_name (orig_entry->d_name); + if (!entry) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto unwind; + } + + dht_itransform (this, subvol, orig_entry->d_ino, + &entry->d_ino); + dht_itransform (this, subvol, orig_entry->d_off, + &entry->d_off); + + entry->d_type = orig_entry->d_type; + entry->d_len = orig_entry->d_len; + + list_add_tail (&entry->list, &entries.list); + count++; + } + } + op_ret = count; + +done: + if (count == 0) { + next = dht_subvol_next (this, prev->this); + if (!next) { + goto unwind; + } + + STACK_WIND (frame, dht_readdir_cbk, + next, next->fops->readdir, + local->fd, local->size, 0); + return 0; + } + +unwind: + if (op_ret < 0) + op_ret = 0; + + DHT_STACK_UNWIND (frame, op_ret, op_errno, &entries); + + gf_dirent_free (&entries); + + return 0; +} + + +int +dht_readdir (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t yoff) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int op_errno = -1; + xlator_t *xvol = NULL; + off_t xoff = 0; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + conf = this->private; + + local = dht_local_init (frame); + if (!local) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + op_errno = ENOMEM; + goto err; + } + + local->fd = fd_ref (fd); + local->size = size; + + dht_deitransform (this, yoff, &xvol, (uint64_t *)&xoff); + + /* TODO: do proper readdir */ + STACK_WIND (frame, dht_readdir_cbk, + xvol, xvol->fops->readdir, + fd, size, xoff); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == -1) + local->op_errno = op_errno; + + if (op_ret == 0) + local->op_ret = 0; + } + UNLOCK (&frame->lock); + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int +dht_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int op_errno = -1; + int i = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + conf = this->private; + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->fd = fd_ref (fd); + local->call_cnt = conf->subvolume_cnt; + + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND (frame, dht_fsyncdir_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->fsyncdir, + fd, datasync); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno); + + return 0; +} + + +int +dht_newfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf) +{ + call_frame_t *prev = NULL; + dht_layout_t *layout = NULL; + int ret = -1; + + + if (op_ret == -1) + goto out; + + prev = cookie; + + dht_itransform (this, prev->this, stbuf->st_ino, &stbuf->st_ino); + layout = dht_layout_for_subvol (this, prev->this); + + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no pre-set layout for subvolume %s", + prev->this->name); + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + ret = inode_ctx_put (inode, this, (uint64_t)(long)layout); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "could not set inode context"); + op_ret = -1; + op_errno = EINVAL; + goto out; + } + +out: + DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf); + return 0; +} + + +int +dht_mknod (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode, dev_t rdev) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + + subvol = dht_subvol_get_hashed (this, loc); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no subvolume in layout for path=%s", + loc->path); + op_errno = ENOENT; + goto err; + } + + gf_log (this->name, GF_LOG_DEBUG, + "creating %s on %s", loc->path, subvol->name); + + STACK_WIND (frame, dht_newfile_cbk, + subvol, subvol->fops->mknod, + loc, mode, rdev); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int +dht_symlink (call_frame_t *frame, xlator_t *this, + const char *linkname, loc_t *loc) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + + subvol = dht_subvol_get_hashed (this, loc); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no subvolume in layout for path=%s", + loc->path); + op_errno = ENOENT; + goto err; + } + + gf_log (this->name, GF_LOG_DEBUG, + "creating %s on %s", loc->path, subvol->name); + + STACK_WIND (frame, dht_newfile_cbk, + subvol, subvol->fops->symlink, + linkname, loc); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int +dht_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + xlator_t *cached_subvol = NULL; + xlator_t *hashed_subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + + cached_subvol = dht_subvol_get_cached (this, loc->inode); + if (!cached_subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + hashed_subvol = dht_subvol_get_hashed (this, loc); + if (!hashed_subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no subvolume in layout for path=%s", + loc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->call_cnt = 1; + if (hashed_subvol != cached_subvol) + local->call_cnt++; + + STACK_WIND (frame, dht_err_cbk, + cached_subvol, cached_subvol->fops->unlink, loc); + + if (hashed_subvol != cached_subvol) + STACK_WIND (frame, dht_err_cbk, + hashed_subvol, hashed_subvol->fops->unlink, loc); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno); + + return 0; +} + + +int +dht_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf) +{ + call_frame_t *prev = NULL; + dht_layout_t *layout = NULL; + dht_local_t *local = NULL; + + prev = cookie; + local = frame->local; + + if (op_ret == -1) + goto out; + + layout = dht_layout_for_subvol (this, prev->this); + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no pre-set layout for subvolume %s", + prev->this->name); + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + stbuf->st_ino = local->loc.inode->ino; + +out: + DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf); + + return 0; +} + + +int +dht_link_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf) +{ + dht_local_t *local = NULL; + xlator_t *srcvol = NULL; + + + if (op_ret == -1) + goto err; + + local = frame->local; + srcvol = local->linkfile.srcvol; + + STACK_WIND (frame, dht_link_cbk, + srcvol, srcvol->fops->link, + &local->loc, &local->loc2); + + return 0; + +err: + DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf); + + return 0; +} + + +int +dht_link (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc) +{ + xlator_t *cached_subvol = NULL; + xlator_t *hashed_subvol = NULL; + int op_errno = -1; + int ret = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (oldloc, err); + VALIDATE_OR_GOTO (newloc, err); + + cached_subvol = dht_subvol_get_cached (this, oldloc->inode); + if (!cached_subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for path=%s", oldloc->path); + op_errno = EINVAL; + goto err; + } + + hashed_subvol = dht_subvol_get_hashed (this, newloc); + if (!hashed_subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no subvolume in layout for path=%s", + newloc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + ret = loc_copy (&local->loc, oldloc); + if (ret == -1) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + ret = loc_copy (&local->loc2, newloc); + if (ret == -1) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + if (hashed_subvol != cached_subvol) { + dht_linkfile_create (frame, dht_link_linkfile_cbk, + cached_subvol, hashed_subvol, newloc); + } else { + STACK_WIND (frame, dht_link_cbk, + cached_subvol, cached_subvol->fops->link, + oldloc, newloc); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int +dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + fd_t *fd, inode_t *inode, struct stat *stbuf) +{ + call_frame_t *prev = NULL; + dht_layout_t *layout = NULL; + int ret = -1; + + + if (op_ret == -1) + goto out; + + prev = cookie; + + dht_itransform (this, prev->this, stbuf->st_ino, &stbuf->st_ino); + layout = dht_layout_for_subvol (this, prev->this); + + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no pre-set layout for subvolume %s", + prev->this->name); + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + ret = inode_ctx_put (inode, this, (uint64_t)(long)layout); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "could not set inode context"); + op_ret = -1; + op_errno = EINVAL; + goto out; + } + +out: + DHT_STACK_UNWIND (frame, op_ret, op_errno, fd, inode, stbuf); + return 0; +} + + +int +dht_create (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, mode_t mode, fd_t *fd) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + + subvol = dht_subvol_get_hashed (this, loc); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no subvolume in layout for path=%s", + loc->path); + op_errno = ENOENT; + goto err; + } + + gf_log (this->name, GF_LOG_DEBUG, + "creating %s on %s", loc->path, subvol->name); + + STACK_WIND (frame, dht_create_cbk, + subvol, subvol->fops->create, + loc, flags, mode, fd); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + + +int +dht_mkdir_selfheal_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + + + local = frame->local; + layout = local->selfheal.layout; + + if (op_ret == 0) { + inode_ctx_put (local->inode, this, (uint64_t)(long)layout); + local->selfheal.layout = NULL; + local->stbuf.st_ino = local->st_ino; + } + + DHT_STACK_UNWIND (frame, op_ret, op_errno, + local->inode, &local->stbuf); + + return 0; +} + + +int +dht_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, struct stat *stbuf) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + int ret = -1; + call_frame_t *prev = NULL; + dht_layout_t *layout = NULL; + + local = frame->local; + prev = cookie; + layout = local->layout; + + LOCK (&frame->lock); + { + ret = dht_layout_merge (this, layout, prev->this, + op_ret, op_errno, NULL); + + if (op_ret == -1) { + local->op_errno = op_errno; + goto unlock; + } + dht_stat_merge (this, &local->stbuf, stbuf, prev->this); + } +unlock: + UNLOCK (&frame->lock); + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + local->layout = NULL; + dht_selfheal_directory (frame, dht_mkdir_selfheal_cbk, + &local->loc, layout); + } + + return 0; +} + +int +dht_mkdir_hashed_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf) +{ + dht_local_t *local = NULL; + int ret = -1; + call_frame_t *prev = NULL; + dht_layout_t *layout = NULL; + dht_conf_t *conf = NULL; + int i = 0; + xlator_t *hashed_subvol = NULL; + + local = frame->local; + prev = cookie; + layout = local->layout; + conf = this->private; + hashed_subvol = local->hashed_subvol; + + ret = dht_layout_merge (this, layout, prev->this, + op_ret, op_errno, NULL); + + if (op_ret == -1) { + local->op_errno = op_errno; + goto err; + } + local->op_ret = 0; + + dht_stat_merge (this, &local->stbuf, stbuf, prev->this); + + local->st_ino = local->stbuf.st_ino; + + local->call_cnt = conf->subvolume_cnt - 1; + + if (local->call_cnt == 0) { + local->layout = NULL; + dht_selfheal_directory (frame, dht_mkdir_selfheal_cbk, + &local->loc, layout); + } + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == hashed_subvol) + continue; + STACK_WIND (frame, dht_mkdir_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->mkdir, + &local->loc, local->mode); + } + return 0; +err: + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + return 0; +} + +int +dht_mkdir (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int op_errno = -1; + int ret = -1; + xlator_t *hashed_subvol = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + conf = this->private; + + local = dht_local_init (frame); + if (!local) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + op_errno = ENOMEM; + goto err; + } + + hashed_subvol = dht_subvol_get_hashed (this, loc); + + if (hashed_subvol == NULL) { + gf_log (this->name, GF_LOG_ERROR, + "hashed subvol not found"); + op_errno = EINVAL; + goto err; + } + + local->hashed_subvol = hashed_subvol; + local->inode = inode_ref (loc->inode); + ret = loc_copy (&local->loc, loc); + local->mode = mode; + + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + op_errno = ENOMEM; + goto err; + } + + local->layout = dht_layout_new (this, conf->subvolume_cnt); + if (!local->layout) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + op_errno = ENOMEM; + goto err; + } + + STACK_WIND (frame, dht_mkdir_hashed_cbk, + hashed_subvol, + hashed_subvol->fops->mkdir, + loc, mode); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int +dht_rmdir_selfheal_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno) +{ + dht_local_t *local = NULL; + + local = frame->local; + local->layout = NULL; + + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int +dht_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno) +{ + uint64_t tmp_layout = 0; + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + dht_layout_t *layout = NULL; + + local = frame->local; + prev = cookie; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + local->op_ret = -1; + + if (op_errno != ENOENT) + local->need_selfheal = 1; + + gf_log (this->name, GF_LOG_ERROR, + "rmdir on %s for %s failed (%s)", + prev->this->name, local->loc.path, + strerror (op_errno)); + goto unlock; + } + } +unlock: + UNLOCK (&frame->lock); + + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + if (local->need_selfheal) { + inode_ctx_get (local->loc.inode, this, + &tmp_layout); + layout = (dht_layout_t *)(long)tmp_layout; + + /* TODO: neater interface needed below */ + local->stbuf.st_mode = local->loc.inode->st_mode; + + dht_selfheal_restore (frame, dht_rmdir_selfheal_cbk, + &local->loc, layout); + } else { + DHT_STACK_UNWIND (frame, local->op_ret, + local->op_errno); + } + } + + return 0; +} + + +int +dht_rmdir_do (call_frame_t *frame, xlator_t *this) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int i = 0; + + conf = this->private; + local = frame->local; + + if (local->op_ret == -1) + goto err; + + local->call_cnt = conf->subvolume_cnt; + + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND (frame, dht_rmdir_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->rmdir, + &local->loc); + } + + return 0; + +err: + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno); + return 0; +} + + +int +dht_rmdir_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, gf_dirent_t *entries) +{ + dht_local_t *local = NULL; + int this_call_cnt = -1; + call_frame_t *prev = NULL; + + local = frame->local; + prev = cookie; + + if (op_ret > 2) { + gf_log (this->name, GF_LOG_DEBUG, + "readdir on %s for %s returned %d entries", + prev->this->name, local->loc.path, op_ret); + local->op_ret = -1; + local->op_errno = ENOTEMPTY; + } + + this_call_cnt = dht_frame_return (frame); + + if (is_last_call (this_call_cnt)) { + dht_rmdir_do (frame, this); + } + + return 0; +} + + +int +dht_rmdir_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, fd_t *fd) +{ + dht_local_t *local = NULL; + int this_call_cnt = -1; + call_frame_t *prev = NULL; + + + local = frame->local; + prev = cookie; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "opendir on %s for %s failed (%s)", + prev->this->name, local->loc.path, + strerror (op_errno)); + goto err; + } + + STACK_WIND (frame, dht_rmdir_readdir_cbk, + prev->this, prev->this->fops->readdir, + local->fd, 4096, 0); + + return 0; + +err: + this_call_cnt = dht_frame_return (frame); + + if (is_last_call (this_call_cnt)) { + dht_rmdir_do (frame, this); + } + + return 0; +} + + +int +dht_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int op_errno = -1; + int i = -1; + int ret = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + conf = this->private; + + local = dht_local_init (frame); + if (!local) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + op_errno = ENOMEM; + goto err; + } + + local->call_cnt = conf->subvolume_cnt; + local->op_ret = 0; + + ret = loc_copy (&local->loc, loc); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + op_errno = ENOMEM; + goto err; + } + + local->fd = fd_create (local->loc.inode, frame->root->pid); + if (!local->fd) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + op_errno = ENOMEM; + goto err; + } + + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND (frame, dht_rmdir_opendir_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->opendir, + loc, local->fd); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno); + + return 0; +} + + +static int32_t +dht_xattrop_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *dict) +{ + DHT_STACK_UNWIND (frame, op_ret, op_errno, dict); + return 0; +} + +int32_t +dht_xattrop (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + gf_xattrop_flags_t flags, + dict_t *dict) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = dht_subvol_get_cached (this, loc->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->inode = inode_ref (loc->inode); + local->call_cnt = 1; + + STACK_WIND (frame, + dht_xattrop_cbk, + subvol, subvol->fops->xattrop, + loc, flags, dict); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + +static int32_t +dht_fxattrop_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *dict) +{ + DHT_STACK_UNWIND (frame, op_ret, op_errno, dict); + return 0; +} + +int32_t +dht_fxattrop (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + gf_xattrop_flags_t flags, + dict_t *dict) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + dht_fxattrop_cbk, + subvol, subvol->fops->fxattrop, + fd, flags, dict); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +static int32_t +dht_inodelk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ + DHT_STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + + +int32_t +dht_inodelk (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t cmd, struct flock *lock) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = dht_subvol_get_cached (this, loc->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->inode = inode_ref (loc->inode); + local->call_cnt = 1; + + STACK_WIND (frame, + dht_inodelk_cbk, + subvol, subvol->fops->inodelk, + loc, cmd, lock); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno); + + return 0; +} + + +static int32_t +dht_finodelk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ + DHT_STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + + +int32_t +dht_finodelk (call_frame_t *frame, xlator_t *this, + fd_t *fd, int32_t cmd, struct flock *lock) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + + STACK_WIND (frame, + dht_finodelk_cbk, + subvol, subvol->fops->finodelk, + fd, cmd, lock); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno); + + return 0; +} + + +static int32_t +dht_entrylk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ + DHT_STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +int32_t +dht_entrylk (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *basename, + entrylk_cmd cmd, entrylk_type type) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = dht_subvol_get_cached (this, loc->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->inode = inode_ref (loc->inode); + local->call_cnt = 1; + + STACK_WIND (frame, dht_entrylk_cbk, + subvol, subvol->fops->entrylk, + loc, basename, cmd, type); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno); + + return 0; +} + +static int32_t +dht_fentrylk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ + DHT_STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +int32_t +dht_fentrylk (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *basename, + entrylk_cmd cmd, entrylk_type type) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_fentrylk_cbk, + subvol, subvol->fops->fentrylk, + fd, basename, cmd, type); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno); + + return 0; +} + + +int +dht_forget (xlator_t *this, inode_t *inode) +{ + uint64_t tmp_layout = 0; + dht_layout_t *layout = NULL; + + inode_ctx_get (inode, this, &tmp_layout); + + if (!layout) + return 0; + layout = (dht_layout_t *)(long)tmp_layout; + if (!layout->preset) + FREE (layout); + + return 0; +} + + + +static int +dht_init_subvolumes (xlator_t *this, dht_conf_t *conf) +{ + xlator_list_t *subvols = NULL; + int cnt = 0; + + + for (subvols = this->children; subvols; subvols = subvols->next) + cnt++; + + conf->subvolumes = CALLOC (cnt, sizeof (xlator_t *)); + if (!conf->subvolumes) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + return -1; + } + conf->subvolume_cnt = cnt; + + cnt = 0; + for (subvols = this->children; subvols; subvols = subvols->next) + conf->subvolumes[cnt++] = subvols->xlator; + + conf->subvolume_status = CALLOC (cnt, sizeof (char)); + if (!conf->subvolume_status) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + return -1; + } + + return 0; +} + + +int +dht_notify (xlator_t *this, int event, void *data, ...) +{ + xlator_t *subvol = NULL; + int cnt = -1; + int i = -1; + dht_conf_t *conf = NULL; + int ret = -1; + + + conf = this->private; + + switch (event) { + case GF_EVENT_CHILD_UP: + subvol = data; + + conf->gen++; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (subvol == conf->subvolumes[i]) { + cnt = i; + break; + } + } + + if (cnt == -1) { + gf_log (this->name, GF_LOG_ERROR, + "got GF_EVENT_CHILD_UP bad subvolume %s", + subvol->name); + break; + } + + LOCK (&conf->subvolume_lock); + { + conf->subvolume_status[cnt] = 1; + } + UNLOCK (&conf->subvolume_lock); + + break; + + case GF_EVENT_CHILD_DOWN: + subvol = data; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (subvol == conf->subvolumes[i]) { + cnt = i; + break; + } + } + + if (cnt == -1) { + gf_log (this->name, GF_LOG_ERROR, + "got GF_EVENT_CHILD_DOWN bad subvolume %s", + subvol->name); + break; + } + + LOCK (&conf->subvolume_lock); + { + conf->subvolume_status[cnt] = 0; + } + UNLOCK (&conf->subvolume_lock); + + break; + } + + ret = default_notify (this, event, data); + + return ret; +} + diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h new file mode 100644 index 000000000..17017381b --- /dev/null +++ b/xlators/cluster/dht/src/dht-common.h @@ -0,0 +1,212 @@ +/* + Copyright (c) 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#ifndef _DHT_H +#define _DHT_H + + +typedef int (*dht_selfheal_dir_cbk_t) (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno); + + +struct dht_layout { + int cnt; + int preset; + int gen; + int type; + struct { + int err; /* 0 = normal + -1 = dir exists and no xattr + >0 = dir lookup failed with errno + */ + uint32_t start; + uint32_t stop; + xlator_t *xlator; + } list[0]; +}; +typedef struct dht_layout dht_layout_t; + + +struct dht_local { + int call_cnt; + loc_t loc; + loc_t loc2; + int op_ret; + int op_errno; + int layout_mismatch; + struct stat stbuf; + struct statvfs statvfs; + fd_t *fd; + inode_t *inode; + dict_t *xattr; + dict_t *xattr_req; + dht_layout_t *layout; + size_t size; + ino_t st_ino; + xlator_t *src_hashed, *src_cached; + xlator_t *dst_hashed, *dst_cached; + xlator_t *cached_subvol; + xlator_t *hashed_subvol; + char need_selfheal; + struct { + fop_mknod_cbk_t linkfile_cbk; + struct stat stbuf; + loc_t loc; + inode_t *inode; + dict_t *xattr; + xlator_t *srcvol; + } linkfile; + struct { + uint32_t hole_cnt; + uint32_t overlaps_cnt; + uint32_t missing; + uint32_t down; + uint32_t misc; + dht_selfheal_dir_cbk_t dir_cbk; + dht_layout_t *layout; + } selfheal; + + /* needed by nufa */ + int32_t flags; + mode_t mode; + dev_t rdev; +}; +typedef struct dht_local dht_local_t; + + +struct dht_conf { + gf_lock_t subvolume_lock; + int subvolume_cnt; + xlator_t **subvolumes; + xlator_t *local_volume; /* Needed by NUFA */ + char *subvolume_status; + dht_layout_t **file_layouts; + dht_layout_t **dir_layouts; + dht_layout_t *default_dir_layout; + gf_boolean_t search_unhashed; + int gen; +}; +typedef struct dht_conf dht_conf_t; + + +struct dht_disk_layout { + uint32_t cnt; + uint32_t type; + struct { + uint32_t start; + uint32_t stop; + } list[1]; +}; +typedef struct dht_disk_layout dht_disk_layout_t; + +#define ENTRY_MISSING(op_ret, op_errno) (op_ret == -1 && op_errno == ENOENT) + +#define is_fs_root(loc) (strcmp (loc->path, "/") == 0) + +#define is_revalidate(loc) (inode_ctx_get (loc->inode, this, NULL) == 0) + +#define is_last_call(cnt) (cnt == 0) + +#define DHT_LINKFILE_MODE (S_ISVTX) +#define check_is_linkfile(i,s,x) ((s->st_mode & ~S_IFMT) == DHT_LINKFILE_MODE) + +#define check_is_dir(i,s,x) (S_ISDIR(s->st_mode)) + +#define layout_is_sane(layout) ((layout) && (layout->cnt > 0)) + +#define DHT_STACK_UNWIND(frame, params ...) do { \ + dht_local_t *__local = NULL; \ + __local = frame->local; \ + frame->local = NULL; \ + STACK_UNWIND (frame, params); \ + dht_local_wipe (__local); \ + } while (0) + +#define DHT_STACK_DESTROY(frame) do { \ + dht_local_t *__local = NULL; \ + __local = frame->local; \ + frame->local = NULL; \ + STACK_DESTROY (frame->root); \ + dht_local_wipe (__local); \ + } while (0) + +dht_layout_t *dht_layout_new (xlator_t *this, int cnt); +dht_layout_t *dht_layout_get (xlator_t *this, inode_t *inode); +dht_layout_t *dht_layout_for_subvol (xlator_t *this, xlator_t *subvol); +xlator_t *dht_layout_search (xlator_t *this, dht_layout_t *layout, + const char *name); +int dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout); +int dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout, + uint32_t *holes_p, uint32_t *overlaps_p, + uint32_t *missing_p, uint32_t *down_p, + uint32_t *misc_p); +int dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, + xlator_t *subvol, loc_t *loc, dict_t *xattr); + +xlator_t *dht_linkfile_subvol (xlator_t *this, inode_t *inode, + struct stat *buf, dict_t *xattr); +int dht_linkfile_unlink (call_frame_t *frame, xlator_t *this, + xlator_t *subvol, loc_t *loc); + +int dht_layouts_init (xlator_t *this, dht_conf_t *conf); +int dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, + int op_ret, int op_errno, dict_t *xattr); + +int dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout, + int pos, int32_t **disk_layout_p); +int dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout, + int pos, int32_t *disk_layout); + + +int dht_frame_return (call_frame_t *frame); + +int dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y); +int dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol, + uint64_t *x); + +void dht_local_wipe (dht_local_t *local); +dht_local_t *dht_local_init (call_frame_t *frame); +int dht_stat_merge (xlator_t *this, struct stat *to, struct stat *from, + xlator_t *subvol); + +xlator_t *dht_subvol_get_hashed (xlator_t *this, loc_t *loc); +xlator_t *dht_subvol_get_cached (xlator_t *this, inode_t *inode); +xlator_t *dht_subvol_next (xlator_t *this, xlator_t *prev); +int dht_subvol_cnt (xlator_t *this, xlator_t *subvol); + +int dht_hash_compute (int type, const char *name, uint32_t *hash_p); + +int dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk, + xlator_t *tovol, xlator_t *fromvol, loc_t *loc); +int +dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk, + loc_t *loc, dht_layout_t *layout); +int +dht_selfheal_restore (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk, + loc_t *loc, dht_layout_t *layout); + +int dht_rename (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc); +#endif /* _DHT_H */ diff --git a/xlators/cluster/dht/src/dht-hashfn-tea.c b/xlators/cluster/dht/src/dht-hashfn-tea.c new file mode 100644 index 000000000..8437b4955 --- /dev/null +++ b/xlators/cluster/dht/src/dht-hashfn-tea.c @@ -0,0 +1,146 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + + +#include <stdint.h> +#include <stdio.h> +#include <string.h> + + +#define DELTA 0x9E3779B9 +#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */ +#define PARTROUNDS 6 /* 6 gets complete mixing */ + + +static int +tearound (int rounds, uint32_t *array, uint32_t *h0, uint32_t *h1) +{ + uint32_t sum = 0; + int n = 0; + uint32_t b0 = 0; + uint32_t b1 = 0; + + b0 = *h0; + b1 = *h1; + + n = rounds; + + do { + sum += DELTA; + b0 += ((b1 << 4) + array[0]) + ^ (b1 + sum) + ^ ((b1 >> 5) + array[1]); + b1 += ((b0 << 4) + array[2]) + ^ (b0 + sum) + ^ ((b0 >> 5) + array[3]); + } while (--n); + + *h0 += b0; + *h1 += b1; + + return 0; +} + + +uint32_t +__pad (int len) +{ + uint32_t pad = 0; + + pad = (uint32_t) len | ((uint32_t) len << 8); + pad |= pad << 16; + + return pad; +} + + +uint32_t +dht_hashfn_tea (const char *msg, int len) +{ + uint32_t h0 = 0x9464a485; + uint32_t h1 = 0x542e1a94; + uint32_t array[4]; + uint32_t pad = 0; + int i = 0; + int j = 0; + int full_quads = 0; + int full_words = 0; + int full_bytes = 0; + uint32_t *intmsg = NULL; + int word = 0; + + + intmsg = (uint32_t *) msg; + pad = __pad (len); + + full_bytes = len; + full_words = len / 4; + full_quads = len / 16; + + for (i = 0; i < full_quads; i++) { + for (j = 0; j < 4; j++) { + word = *intmsg; + array[j] = word; + intmsg++; + full_words--; + full_bytes -= 4; + } + tearound (PARTROUNDS, &array[0], &h0, &h1); + } + + if ((len % 16) == 0) { + goto done; + } + + for (j = 0; j < 4; j++) { + if (full_words) { + word = *intmsg; + array[j] = word; + intmsg++; + full_words--; + full_bytes -= 4; + } else { + array[j] = pad; + while (full_bytes) { + array[j] <<= 8; + array[j] |= msg[len - full_bytes]; + full_bytes--; + } + } + } + tearound (FULLROUNDS, &array[0], &h0, &h1); + +done: + return h0 ^ h1; +} + + +#if 0 +int +main (int argc, char *argv[]) +{ + int i = 0; + int hashval = 0; + + for (i = 1; i < argc; i++) { + hashval = tea (argv[i], strlen (argv[i])); + printf ("%s: %x\n", argv[i], hashval); + } +} +#endif diff --git a/xlators/cluster/dht/src/dht-hashfn.c b/xlators/cluster/dht/src/dht-hashfn.c new file mode 100644 index 000000000..9e321a43c --- /dev/null +++ b/xlators/cluster/dht/src/dht-hashfn.c @@ -0,0 +1,88 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + + +#include "glusterfs.h" +#include "xlator.h" +#include "dht-common.h" + + +uint32_t dht_hashfn_tea (const char *name, int len); + + +typedef enum { + DHT_HASH_TYPE_TEA, +} dht_hashfn_type_t; + + +int +dht_hash_compute_internal (int type, const char *name, uint32_t *hash_p) +{ + int ret = 0; + uint32_t hash = 0; + + switch (type) { + case DHT_HASH_TYPE_TEA: + hash = dht_hashfn_tea (name, strlen (name)); + break; + default: + ret = -1; + break; + } + + if (ret == 0) { + *hash_p = hash; + } + + return ret; +} + + +#define MAKE_RSYNC_FRIENDLY_NAME(rsync_frndly_name, name) do { \ + rsync_frndly_name = (char *) name; \ + if (name[0] == '.') { \ + char *dot = 0; \ + int namelen = 0; \ + \ + dot = strrchr (name, '.'); \ + if (dot && dot > (name + 1) && *(dot + 1)) { \ + namelen = (dot - name); \ + rsync_frndly_name = alloca (namelen); \ + strncpy (rsync_frndly_name, name + 1, \ + namelen); \ + rsync_frndly_name[namelen - 1] = 0; \ + } \ + } \ + } while (0); + + +int +dht_hash_compute (int type, const char *name, uint32_t *hash_p) +{ + char *rsync_friendly_name = NULL; + + MAKE_RSYNC_FRIENDLY_NAME (rsync_friendly_name, name); + + return dht_hash_compute_internal (type, rsync_friendly_name, hash_p); +} diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c new file mode 100644 index 000000000..52d072002 --- /dev/null +++ b/xlators/cluster/dht/src/dht-helper.c @@ -0,0 +1,326 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + + +#include "glusterfs.h" +#include "xlator.h" +#include "dht-common.h" + + +int +dht_frame_return (call_frame_t *frame) +{ + dht_local_t *local = NULL; + int this_call_cnt = -1; + + if (!frame) + return -1; + + local = frame->local; + + LOCK (&frame->lock); + { + this_call_cnt = --local->call_cnt; + } + UNLOCK (&frame->lock); + + return this_call_cnt; +} + + +int +dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p) +{ + dht_conf_t *conf = NULL; + int cnt = 0; + int max = 0; + uint64_t y = 0; + + + if (x == ((uint64_t) -1)) { + y = (uint64_t) -1; + goto out; + } + + conf = this->private; + + max = conf->subvolume_cnt; + cnt = dht_subvol_cnt (this, subvol); + + y = ((x * max) + cnt); + +out: + if (y_p) + *y_p = y; + + return 0; +} + + +int +dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol_p, + uint64_t *x_p) +{ + dht_conf_t *conf = NULL; + int cnt = 0; + int max = 0; + uint64_t x = 0; + xlator_t *subvol = 0; + + + conf = this->private; + max = conf->subvolume_cnt; + + cnt = y % max; + x = y / max; + + subvol = conf->subvolumes[cnt]; + + if (subvol_p) + *subvol_p = subvol; + + if (x_p) + *x_p = x; + + return 0; +} + + +void +dht_local_wipe (dht_local_t *local) +{ + if (!local) + return; + + loc_wipe (&local->loc); + loc_wipe (&local->loc2); + + if (local->xattr) + dict_unref (local->xattr); + + if (local->inode) + inode_unref (local->inode); + + if (local->layout) + FREE (local->layout); + + loc_wipe (&local->linkfile.loc); + + if (local->linkfile.xattr) + dict_unref (local->linkfile.xattr); + + if (local->linkfile.inode) + inode_unref (local->linkfile.inode); + + if (local->fd) { + fd_unref (local->fd); + local->fd = NULL; + } + + if (local->xattr_req) + dict_unref (local->xattr_req); + + FREE (local); +} + + +dht_local_t * +dht_local_init (call_frame_t *frame) +{ + dht_local_t *local = NULL; + + /* TODO: use mem-pool */ + local = CALLOC (1, sizeof (*local)); + + if (!local) + return NULL; + + local->op_ret = -1; + local->op_errno = EUCLEAN; + + frame->local = local; + + return local; +} + + +char * +basestr (const char *str) +{ + char *basestr = NULL; + + basestr = strrchr (str, '/'); + if (basestr) + basestr ++; + + return basestr; +} + +xlator_t * +dht_first_up_child (xlator_t *this) +{ + dht_conf_t *conf = NULL; + xlator_t *child = NULL; + int i = 0; + + conf = this->private; + + LOCK (&conf->subvolume_lock); + { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolume_status[i]) { + child = conf->subvolumes[i]; + break; + } + } + } + UNLOCK (&conf->subvolume_lock); + + return child; +} + +xlator_t * +dht_subvol_get_hashed (xlator_t *this, loc_t *loc) +{ + dht_layout_t *layout = NULL; + xlator_t *subvol = NULL; + + if (is_fs_root (loc)) { + subvol = dht_first_up_child (this); + goto out; + } + + layout = dht_layout_get (this, loc->parent); + + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "layout missing path=%s parent=%"PRId64, + loc->path, loc->parent->ino); + goto out; + } + + subvol = dht_layout_search (this, layout, loc->name); + + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "could not find subvolume for path=%s", + loc->path); + goto out; + } + +out: + return subvol; +} + + +xlator_t * +dht_subvol_get_cached (xlator_t *this, inode_t *inode) +{ + dht_layout_t *layout = NULL; + xlator_t *subvol = NULL; + + + layout = dht_layout_get (this, inode); + + if (!layout) { + goto out; + } + + subvol = layout->list[0].xlator; + +out: + return subvol; +} + + +xlator_t * +dht_subvol_next (xlator_t *this, xlator_t *prev) +{ + dht_conf_t *conf = NULL; + int i = 0; + xlator_t *next = NULL; + + conf = this->private; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == prev) { + if ((i + 1) < conf->subvolume_cnt) + next = conf->subvolumes[i + 1]; + break; + } + } + + return next; +} + + +int +dht_subvol_cnt (xlator_t *this, xlator_t *subvol) +{ + int i = 0; + int ret = -1; + dht_conf_t *conf = NULL; + + + conf = this->private; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (subvol == conf->subvolumes[i]) { + ret = i; + break; + } + } + + return ret; +} + + +#define set_if_greater(a, b) do { \ + if ((a) < (b)) \ + (a) = (b); \ + } while (0) + +int +dht_stat_merge (xlator_t *this, struct stat *to, + struct stat *from, xlator_t *subvol) +{ + to->st_dev = from->st_dev; + + dht_itransform (this, subvol, from->st_ino, &to->st_ino); + + to->st_mode = from->st_mode; + to->st_nlink = from->st_nlink; + to->st_uid = from->st_uid; + to->st_gid = from->st_gid; + to->st_rdev = from->st_rdev; + to->st_size += from->st_size; + to->st_blksize = from->st_blksize; + to->st_blocks += from->st_blocks; + + set_if_greater (to->st_atime, from->st_atime); + set_if_greater (to->st_mtime, from->st_mtime); + set_if_greater (to->st_ctime, from->st_ctime); + + return 0; +} diff --git a/xlators/cluster/dht/src/dht-layout.c b/xlators/cluster/dht/src/dht-layout.c new file mode 100644 index 000000000..08b4a2746 --- /dev/null +++ b/xlators/cluster/dht/src/dht-layout.c @@ -0,0 +1,543 @@ +/* + Copyright (c) 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + + +#include "glusterfs.h" +#include "xlator.h" +#include "dht-common.h" +#include "byte-order.h" + +#define layout_base_size (sizeof (dht_layout_t)) + +#define layout_entry_size (sizeof ((dht_layout_t *)NULL)->list[0]) + +#define layout_size(cnt) (layout_base_size + (cnt * layout_entry_size)) + + +dht_layout_t * +dht_layout_new (xlator_t *this, int cnt) +{ + dht_layout_t *layout = NULL; + + + layout = CALLOC (1, layout_size (cnt)); + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto out; + } + + layout->cnt = cnt; + +out: + return layout; +} + + +dht_layout_t * +dht_layout_get (xlator_t *this, inode_t *inode) +{ + uint64_t layout = 0; + int ret = -1; + + ret = inode_ctx_get (inode, this, &layout); + + return (dht_layout_t *)(long)layout; +} + + +xlator_t * +dht_layout_search (xlator_t *this, dht_layout_t *layout, const char *name) +{ + uint32_t hash = 0; + xlator_t *subvol = NULL; + int i = 0; + int ret = 0; + + + ret = dht_hash_compute (layout->type, name, &hash); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "hash computation failed for type=%d name=%s", + layout->type, name); + goto out; + } + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].start <= hash + && layout->list[i].stop >= hash) { + subvol = layout->list[i].xlator; + break; + } + } + + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no subvolume for hash (value) = %u", hash); + } + +out: + return subvol; +} + + +dht_layout_t * +dht_layout_for_subvol (xlator_t *this, xlator_t *subvol) +{ + dht_conf_t *conf = NULL; + dht_layout_t *layout = NULL; + int i = 0; + + + conf = this->private; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == subvol) { + layout = conf->file_layouts[i]; + break; + } + } + + return layout; +} + + +int +dht_layouts_init (xlator_t *this, dht_conf_t *conf) +{ + dht_layout_t *layout = NULL; + int i = 0; + int ret = -1; + + + conf->file_layouts = CALLOC (conf->subvolume_cnt, + sizeof (dht_layout_t *)); + if (!conf->file_layouts) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto out; + } + + for (i = 0; i < conf->subvolume_cnt; i++) { + layout = dht_layout_new (this, 1); + + if (!layout) { + goto out; + } + + layout->preset = 1; + + layout->list[0].xlator = conf->subvolumes[i]; + + conf->file_layouts[i] = layout; + } + + ret = 0; +out: + return ret; +} + + +int +dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout, + int pos, int32_t **disk_layout_p) +{ + int ret = -1; + int32_t *disk_layout = NULL; + + disk_layout = CALLOC (5, sizeof (int)); + if (!disk_layout) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto out; + } + + disk_layout[0] = hton32 (1); + disk_layout[1] = hton32 (layout->type); + disk_layout[2] = hton32 (layout->list[pos].start); + disk_layout[3] = hton32 (layout->list[pos].stop); + + if (disk_layout_p) + *disk_layout_p = disk_layout; + ret = 0; + +out: + return ret; +} + + +int +dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout, + int pos, int32_t *disk_layout) +{ + int cnt = 0; + int type = 0; + int start_off = 0; + int stop_off = 0; + + + /* TODO: assert disk_layout_ptr is of required length */ + + cnt = ntoh32 (disk_layout[0]); + if (cnt != 1) { + gf_log (this->name, GF_LOG_ERROR, + "disk layout has invalid count %d", cnt); + return -1; + } + + /* TODO: assert type is compatible */ + type = ntoh32 (disk_layout[1]); + start_off = ntoh32 (disk_layout[2]); + stop_off = ntoh32 (disk_layout[3]); + + layout->list[pos].start = start_off; + layout->list[pos].stop = stop_off; + + gf_log (this->name, GF_LOG_DEBUG, + "merged to layout: %u - %u (type %d) from %s", + start_off, stop_off, type, + layout->list[pos].xlator->name); + + return 0; +} + + +int +dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, + int op_ret, int op_errno, dict_t *xattr) +{ + int i = 0; + int ret = -1; + int err = -1; + int32_t *disk_layout = NULL; + + + if (op_ret != 0) { + err = op_errno; + } + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].xlator == NULL) { + layout->list[i].err = err; + layout->list[i].xlator = subvol; + break; + } + } + + if (op_ret != 0) { + ret = 0; + goto out; + } + + if (xattr) { + /* during lookup and not mkdir */ + ret = dict_get_ptr (xattr, "trusted.glusterfs.dht", + VOID(&disk_layout)); + } + + if (ret != 0) { + layout->list[i].err = -1; + gf_log (this->name, GF_LOG_DEBUG, + "missing disk layout on %s. err = %d", + subvol->name, err); + ret = 0; + goto out; + } + + ret = dht_disk_layout_merge (this, layout, i, disk_layout); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "layout merge from subvolume %s failed", + subvol->name); + goto out; + } + layout->list[i].err = 0; + +out: + return ret; +} + + +void +dht_layout_entry_swap (dht_layout_t *layout, int i, int j) +{ + uint32_t start_swap = 0; + uint32_t stop_swap = 0; + xlator_t *xlator_swap = 0; + int err_swap = 0; + + + start_swap = layout->list[i].start; + stop_swap = layout->list[i].stop; + xlator_swap = layout->list[i].xlator; + err_swap = layout->list[i].err; + + layout->list[i].start = layout->list[j].start; + layout->list[i].stop = layout->list[j].stop; + layout->list[i].xlator = layout->list[j].xlator; + layout->list[i].err = layout->list[j].err; + + layout->list[j].start = start_swap; + layout->list[j].stop = stop_swap; + layout->list[j].xlator = xlator_swap; + layout->list[j].err = err_swap; +} + + +int64_t +dht_layout_entry_cmp (dht_layout_t *layout, int i, int j) +{ + int64_t diff = 0; + + if (layout->list[i].err || layout->list[j].err) + diff = layout->list[i].err - layout->list[j].err; + else + diff = (int64_t) layout->list[i].start + - (int64_t) layout->list[j].start; + + return diff; +} + + +int +dht_layout_sort (dht_layout_t *layout) +{ + int i = 0; + int j = 0; + int64_t ret = 0; + + /* TODO: O(n^2) -- bad bad */ + + for (i = 0; i < layout->cnt - 1; i++) { + for (j = i + 1; j < layout->cnt; j++) { + ret = dht_layout_entry_cmp (layout, i, j); + if (ret > 0) + dht_layout_entry_swap (layout, i, j); + } + } + + return 0; +} + + +int +dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout, + uint32_t *holes_p, uint32_t *overlaps_p, + uint32_t *missing_p, uint32_t *down_p, uint32_t *misc_p) +{ + dht_conf_t *conf = NULL; + uint32_t holes = 0; + uint32_t overlaps = 0; + uint32_t missing = 0; + uint32_t down = 0; + uint32_t misc = 0; + uint32_t hole_cnt = 0; + uint32_t overlap_cnt = 0; + int i = 0; + int ret = 0; + uint32_t prev_stop = 0; + uint32_t last_stop = 0; + char is_virgin = 1; + + + conf = this->private; + + /* TODO: explain WTF is happening */ + + last_stop = layout->list[0].start - 1; + prev_stop = last_stop; + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].err) { + switch (layout->list[i].err) { + case -1: + case ENOENT: + missing++; + break; + case ENOTCONN: + down++; + break; + default: + misc++; + } + continue; + } + + is_virgin = 0; + + if ((prev_stop + 1) < layout->list[i].start) { + hole_cnt++; + holes += (layout->list[i].start - (prev_stop + 1)); + } + + if ((prev_stop + 1) > layout->list[i].start) { + overlap_cnt++; + overlaps += ((prev_stop + 1) - layout->list[i].start); + } + prev_stop = layout->list[i].stop; + } + + if ((last_stop - prev_stop) || is_virgin) + hole_cnt++; + holes += (last_stop - prev_stop); + + if (holes_p) + *holes_p = hole_cnt; + + if (overlaps_p) + *overlaps_p = overlap_cnt; + + if (missing_p) + *missing_p = missing; + + if (down_p) + *down_p = down; + + if (misc_p) + *misc_p = misc; + + return ret; +} + + +int +dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout) +{ + int ret = 0; + uint32_t holes = 0; + uint32_t overlaps = 0; + uint32_t missing = 0; + uint32_t down = 0; + uint32_t misc = 0; + + + ret = dht_layout_sort (layout); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "sort failed?! how the ...."); + goto out; + } + + ret = dht_layout_anomalies (this, loc, layout, + &holes, &overlaps, + &missing, &down, &misc); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "error while finding anomalies in %s -- not good news", + loc->path); + goto out; + } + + if (holes || overlaps) { + if (missing == layout->cnt) { + gf_log (this->name, GF_LOG_WARNING, + "directory %s looked up first time", + loc->path); + } else { + gf_log (this->name, GF_LOG_ERROR, + "found anomalies in %s. holes=%d overlaps=%d", + loc->path, holes, overlaps); + } + ret = 1; + } + +out: + return ret; +} + + +int +dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, + loc_t *loc, dict_t *xattr) +{ + int idx = 0; + int pos = -1; + int ret = -1; + int32_t *disk_layout = NULL; + int32_t count = -1; + uint32_t start_off = -1; + uint32_t stop_off = -1; + + + for (idx = 0; idx < layout->cnt; idx++) { + if (layout->list[idx].xlator == subvol) { + pos = idx; + break; + } + } + + if (pos == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "%s - no layout info for subvolume %s", + loc->path, subvol->name); + ret = 1; + goto out; + } + + if (xattr == NULL) { + gf_log (this->name, GF_LOG_ERROR, + "%s - xattr dictionary is NULL", + loc->path); + ret = -1; + goto out; + } + + ret = dict_get_ptr (xattr, "trusted.glusterfs.dht", + VOID(&disk_layout)); + + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "%s - disk layout missing", loc->path); + ret = -1; + goto out; + } + + count = ntoh32 (disk_layout[0]); + if (count != 1) { + gf_log (this->name, GF_LOG_ERROR, + "%s - disk layout has invalid count %d", + loc->path, count); + ret = -1; + goto out; + } + + start_off = ntoh32 (disk_layout[2]); + stop_off = ntoh32 (disk_layout[3]); + + if ((layout->list[pos].start != start_off) + || (layout->list[pos].stop != stop_off)) { + gf_log (this->name, GF_LOG_DEBUG, + "subvol: %s; inode layout - %"PRId32" - %"PRId32"; " + "disk layout - %"PRId32" - %"PRId32, + layout->list[pos].xlator->name, + layout->list[pos].start, layout->list[pos].stop, + start_off, stop_off); + ret = 1; + } else { + ret = 0; + } +out: + return ret; +} + diff --git a/xlators/cluster/dht/src/dht-linkfile.c b/xlators/cluster/dht/src/dht-linkfile.c new file mode 100644 index 000000000..9cc24ccf6 --- /dev/null +++ b/xlators/cluster/dht/src/dht-linkfile.c @@ -0,0 +1,224 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + + +#include "glusterfs.h" +#include "xlator.h" +#include "compat.h" +#include "dht-common.h" + + + +int +dht_linkfile_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno) +{ + dht_local_t *local = NULL; + + + local = frame->local; + local->linkfile.linkfile_cbk (frame, cookie, this, op_ret, op_errno, + local->linkfile.inode, + &local->linkfile.stbuf); + + return 0; +} + + +int +dht_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + dict_t *xattr = NULL; + data_t *str_data = NULL; + int ret = -1; + + local = frame->local; + prev = cookie; + + if (op_ret == -1) + goto err; + + xattr = get_new_dict (); + if (!xattr) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + op_errno = ENOMEM; + goto err; + } + + local->linkfile.xattr = dict_ref (xattr); + local->linkfile.inode = inode_ref (inode); + + str_data = str_to_data (local->linkfile.srcvol->name); + if (!str_data) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + op_errno = ENOMEM; + goto err; + } + + ret = dict_set (xattr, "trusted.glusterfs.dht.linkto", str_data); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to initialize linkfile data"); + op_errno = EINVAL; + } + str_data = NULL; + + local->linkfile.stbuf = *stbuf; + + STACK_WIND (frame, dht_linkfile_xattr_cbk, + prev->this, prev->this->fops->setxattr, + &local->linkfile.loc, local->linkfile.xattr, 0); + + return 0; + +err: + if (str_data) { + data_destroy (str_data); + str_data = NULL; + } + + local->linkfile.linkfile_cbk (frame, cookie, this, + op_ret, op_errno, inode, stbuf); + return 0; +} + + +int +dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk, + xlator_t *tovol, xlator_t *fromvol, loc_t *loc) +{ + dht_local_t *local = NULL; + + + local = frame->local; + local->linkfile.linkfile_cbk = linkfile_cbk; + local->linkfile.srcvol = tovol; + loc_copy (&local->linkfile.loc, loc); + + STACK_WIND (frame, dht_linkfile_create_cbk, + fromvol, fromvol->fops->mknod, loc, + S_IFREG | DHT_LINKFILE_MODE, 0); + + return 0; +} + + +int +dht_linkfile_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + xlator_t *subvol = NULL; + + local = frame->local; + prev = cookie; + subvol = prev->this; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "unlinking linkfile %s on %s failed (%s)", + local->loc.path, subvol->name, strerror (op_errno)); + } + + DHT_STACK_DESTROY (frame); + + return 0; +} + + +int +dht_linkfile_unlink (call_frame_t *frame, xlator_t *this, + xlator_t *subvol, loc_t *loc) +{ + call_frame_t *unlink_frame = NULL; + dht_local_t *unlink_local = NULL; + + unlink_frame = copy_frame (frame); + if (!unlink_frame) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + unlink_local = dht_local_init (unlink_frame); + if (!unlink_local) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + loc_copy (&unlink_local->loc, loc); + + STACK_WIND (unlink_frame, dht_linkfile_unlink_cbk, + subvol, subvol->fops->unlink, + &unlink_local->loc); + + return 0; +err: + if (unlink_frame) + DHT_STACK_DESTROY (unlink_frame); + + return -1; +} + + +xlator_t * +dht_linkfile_subvol (xlator_t *this, inode_t *inode, struct stat *stbuf, + dict_t *xattr) +{ + dht_conf_t *conf = NULL; + xlator_t *subvol = NULL; + void *volname = NULL; + int i = 0, ret = 0; + + + conf = this->private; + + if (!xattr) + goto out; + + ret = dict_get_ptr (xattr, "trusted.glusterfs.dht.linkto", &volname); + + if ((-1 == ret) || !volname) + goto out; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (strcmp (conf->subvolumes[i]->name, (char *)volname) == 0) { + subvol = conf->subvolumes[i]; + break; + } + } + +out: + return subvol; +} + + diff --git a/xlators/cluster/dht/src/dht-rename.c b/xlators/cluster/dht/src/dht-rename.c new file mode 100644 index 000000000..e5532f1bc --- /dev/null +++ b/xlators/cluster/dht/src/dht-rename.c @@ -0,0 +1,562 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +/* TODO: link(oldpath, newpath) fails if newpath already exists. DHT should + * delete the newpath if it gets EEXISTS from link() call. + */ +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "xlator.h" +#include "dht-common.h" +#include "defaults.h" + + +int +dht_rename_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *stbuf) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + + + local = frame->local; + prev = cookie; + + if (op_ret == -1) { + /* TODO: undo the damage */ + + gf_log (this->name, GF_LOG_ERROR, + "rename %s -> %s on %s failed (%s)", + local->loc.path, local->loc2.path, + prev->this->name, strerror (op_errno)); + + local->op_ret = op_ret; + local->op_errno = op_errno; + } else { + /* TODO: construct proper stbuf for dir */ + local->stbuf = *stbuf; + } + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->stbuf); + } + + return 0; +} + + + +int +dht_rename_dir_do (call_frame_t *frame, xlator_t *this) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int i = 0; + + conf = this->private; + local = frame->local; + + if (local->op_ret == -1) + goto err; + + local->call_cnt = conf->subvolume_cnt; + local->op_ret = 0; + + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND (frame, dht_rename_dir_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->rename, + &local->loc, &local->loc2); + } + + return 0; + +err: + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno); + return 0; +} + + +int +dht_rename_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, gf_dirent_t *entries) +{ + dht_local_t *local = NULL; + int this_call_cnt = -1; + call_frame_t *prev = NULL; + + local = frame->local; + prev = cookie; + + if (op_ret > 2) { + gf_log (this->name, GF_LOG_DEBUG, + "readdir on %s for %s returned %d entries", + prev->this->name, local->loc.path, op_ret); + local->op_ret = -1; + local->op_errno = ENOTEMPTY; + } + + this_call_cnt = dht_frame_return (frame); + + if (is_last_call (this_call_cnt)) { + dht_rename_dir_do (frame, this); + } + + return 0; +} + + +int +dht_rename_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, fd_t *fd) +{ + dht_local_t *local = NULL; + int this_call_cnt = -1; + call_frame_t *prev = NULL; + + + local = frame->local; + prev = cookie; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "opendir on %s for %s failed (%s)", + prev->this->name, local->loc.path, + strerror (op_errno)); + goto err; + } + + STACK_WIND (frame, dht_rename_readdir_cbk, + prev->this, prev->this->fops->readdir, + local->fd, 4096, 0); + + return 0; + +err: + this_call_cnt = dht_frame_return (frame); + + if (is_last_call (this_call_cnt)) { + dht_rename_dir_do (frame, this); + } + + return 0; +} + + +int +dht_rename_dir (call_frame_t *frame, xlator_t *this) +{ + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + int i = 0; + int op_errno = -1; + + + conf = frame->this->private; + local = frame->local; + + local->call_cnt = conf->subvolume_cnt; + + local->fd = fd_create (local->loc.inode, frame->root->pid); + if (!local->fd) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + op_errno = ENOMEM; + goto err; + } + + local->op_ret = 0; + + if (!local->dst_cached) { + dht_rename_dir_do (frame, this); + return 0; + } + + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND (frame, dht_rename_opendir_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->opendir, + &local->loc2, local->fd); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; +} + + +int +dht_rename_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int this_call_cnt = 0; + + local = frame->local; + prev = cookie; + + this_call_cnt = dht_frame_return (frame); + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "unlink on %s failed (%s)", + prev->this->name, strerror (op_errno)); + } + + if (is_last_call (this_call_cnt)) + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->stbuf); + + return 0; +} + + +int +dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *stbuf) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + xlator_t *src_hashed = NULL; + xlator_t *src_cached = NULL; + xlator_t *dst_hashed = NULL; + xlator_t *dst_cached = NULL; + xlator_t *rename_subvol = NULL; + + local = frame->local; + prev = cookie; + + src_hashed = local->src_hashed; + src_cached = local->src_cached; + dst_hashed = local->dst_hashed; + dst_cached = local->dst_cached; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "rename on %s failed (%s)", prev->this->name, + strerror (op_errno)); + local->op_ret = op_ret; + local->op_errno = op_errno; + goto unwind; + } + + /* NOTE: rename_subvol is the same subvolume from which dht_rename_cbk + * is called. since rename has already happened on rename_subvol, + * unlink should not be sent for oldpath (either linkfile or cached-file) + * on rename_subvol. */ + if (src_cached == dst_cached) + rename_subvol = src_cached; + else + rename_subvol = dst_hashed; + + /* TODO: delete files in background */ + + if (src_cached != dst_hashed && src_cached != dst_cached) + local->call_cnt++; + + if (src_hashed != rename_subvol && src_hashed != src_cached) + local->call_cnt++; + + if (dst_cached && dst_cached != dst_hashed && dst_cached != src_cached) + local->call_cnt++; + + if (local->call_cnt == 0) + goto unwind; + + if (src_cached != dst_hashed && src_cached != dst_cached) { + gf_log (this->name, GF_LOG_DEBUG, + "deleting old src datafile %s @ %s", + local->loc.path, src_cached->name); + + STACK_WIND (frame, dht_rename_unlink_cbk, + src_cached, src_cached->fops->unlink, + &local->loc); + } + + if (src_hashed != rename_subvol && src_hashed != src_cached) { + gf_log (this->name, GF_LOG_DEBUG, + "deleting old src linkfile %s @ %s", + local->loc.path, src_hashed->name); + + STACK_WIND (frame, dht_rename_unlink_cbk, + src_hashed, src_hashed->fops->unlink, + &local->loc); + } + + if (dst_cached + && (dst_cached != dst_hashed) + && (dst_cached != src_cached)) { + gf_log (this->name, GF_LOG_DEBUG, + "deleting old dst datafile %s @ %s", + local->loc2.path, dst_cached->name); + + STACK_WIND (frame, dht_rename_unlink_cbk, + dst_cached, dst_cached->fops->unlink, + &local->loc2); + } + return 0; + +unwind: + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->stbuf); + + return 0; +} + + +int +dht_do_rename (call_frame_t *frame) +{ + dht_local_t *local = NULL; + xlator_t *dst_hashed = NULL; + xlator_t *src_cached = NULL; + xlator_t *dst_cached = NULL; + xlator_t *this = NULL; + xlator_t *rename_subvol = NULL; + + + local = frame->local; + this = frame->this; + + dst_hashed = local->dst_hashed; + dst_cached = local->dst_cached; + src_cached = local->src_cached; + + if (src_cached == dst_cached) + rename_subvol = src_cached; + else + rename_subvol = dst_hashed; + + gf_log (this->name, GF_LOG_DEBUG, + "renaming %s => %s (%s)", + local->loc.path, local->loc2.path, rename_subvol->name); + + STACK_WIND (frame, dht_rename_cbk, + rename_subvol, rename_subvol->fops->rename, + &local->loc, &local->loc2); + + return 0; +} + + +int +dht_rename_links_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *stbuf) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int this_call_cnt = 0; + + + local = frame->local; + prev = cookie; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "link/file on %s failed (%s)", + prev->this->name, strerror (op_errno)); + local->op_ret = -1; + local->op_errno = op_errno; + } + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + if (local->op_ret == -1) + goto unwind; + + dht_do_rename (frame); + } + + return 0; + +unwind: + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->stbuf); + + return 0; +} + + +int +dht_rename_create_links (call_frame_t *frame) +{ + dht_local_t *local = NULL; + xlator_t *this = NULL; + xlator_t *src_hashed = NULL; + xlator_t *src_cached = NULL; + xlator_t *dst_hashed = NULL; + xlator_t *dst_cached = NULL; + int call_cnt = 0; + + + local = frame->local; + this = frame->this; + + src_hashed = local->src_hashed; + src_cached = local->src_cached; + dst_hashed = local->dst_hashed; + dst_cached = local->dst_cached; + + if (src_cached == dst_cached) + goto nolinks; + + if (dst_hashed != src_hashed && dst_hashed != src_cached) + call_cnt++; + + if (src_cached != dst_hashed) + call_cnt++; + + local->call_cnt = call_cnt; + + if (dst_hashed != src_hashed && dst_hashed != src_cached) { + gf_log (this->name, GF_LOG_DEBUG, + "linkfile %s @ %s => %s", + local->loc.path, dst_hashed->name, src_cached->name); + dht_linkfile_create (frame, dht_rename_links_cbk, + src_cached, dst_hashed, &local->loc); + } + + if (src_cached != dst_hashed) { + gf_log (this->name, GF_LOG_DEBUG, + "link %s => %s (%s)", local->loc.path, + local->loc2.path, src_cached->name); + STACK_WIND (frame, dht_rename_links_cbk, + src_cached, src_cached->fops->link, + &local->loc, &local->loc2); + } + +nolinks: + if (!call_cnt) { + /* skip to next step */ + dht_do_rename (frame); + } + + return 0; +} + + +int +dht_rename (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc) +{ + xlator_t *src_cached = NULL; + xlator_t *src_hashed = NULL; + xlator_t *dst_cached = NULL; + xlator_t *dst_hashed = NULL; + int op_errno = -1; + int ret = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (oldloc, err); + VALIDATE_OR_GOTO (newloc, err); + + src_hashed = dht_subvol_get_hashed (this, oldloc); + if (!src_hashed) { + gf_log (this->name, GF_LOG_ERROR, + "no subvolume in layout for path=%s", + oldloc->path); + op_errno = EINVAL; + goto err; + } + + src_cached = dht_subvol_get_cached (this, oldloc->inode); + if (!src_cached) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for path=%s", oldloc->path); + op_errno = EINVAL; + goto err; + } + + dst_hashed = dht_subvol_get_hashed (this, newloc); + if (!dst_hashed) { + gf_log (this->name, GF_LOG_ERROR, + "no subvolume in layout for path=%s", + newloc->path); + op_errno = EINVAL; + goto err; + } + + if (newloc->inode) + dst_cached = dht_subvol_get_cached (this, newloc->inode); + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + ret = loc_copy (&local->loc, oldloc); + if (ret == -1) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + ret = loc_copy (&local->loc2, newloc); + if (ret == -1) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->src_hashed = src_hashed; + local->src_cached = src_cached; + local->dst_hashed = dst_hashed; + local->dst_cached = dst_cached; + + gf_log (this->name, GF_LOG_DEBUG, + "renaming %s (hash=%s/cache=%s) => %s (hash=%s/cache=%s)", + oldloc->path, src_hashed->name, src_cached->name, + newloc->path, dst_hashed->name, + dst_cached ? dst_cached->name : "<nul>"); + + if (S_ISDIR (oldloc->inode->st_mode)) { + dht_rename_dir (frame, this); + } else { + local->op_ret = 0; + dht_rename_create_links (frame); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c new file mode 100644 index 000000000..ee32b2253 --- /dev/null +++ b/xlators/cluster/dht/src/dht-selfheal.c @@ -0,0 +1,460 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + + +#include "glusterfs.h" +#include "xlator.h" +#include "dht-common.h" + + +int +dht_selfheal_dir_finish (call_frame_t *frame, xlator_t *this, int ret) +{ + dht_local_t *local = NULL; + + + local = frame->local; + local->selfheal.dir_cbk (frame, NULL, frame->this, ret, + local->op_errno); + + return 0; +} + + +int +dht_selfheal_dir_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + xlator_t *subvol = NULL; + int i = 0; + dht_layout_t *layout = NULL; + int err = 0; + int this_call_cnt = 0; + + local = frame->local; + layout = local->selfheal.layout; + prev = cookie; + subvol = prev->this; + + if (op_ret == 0) + err = 0; + else + err = op_errno; + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].xlator == subvol) { + layout->list[i].err = err; + break; + } + } + + this_call_cnt = dht_frame_return (frame); + + if (is_last_call (this_call_cnt)) { + dht_selfheal_dir_finish (frame, this, 0); + } + + return 0; +} + + +int +dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc, + dht_layout_t *layout, int i) +{ + xlator_t *subvol = NULL; + dict_t *xattr = NULL; + int ret = 0; + xlator_t *this = NULL; + int32_t *disk_layout = NULL; + + + subvol = layout->list[i].xlator; + this = frame->this; + + xattr = get_new_dict (); + if (!xattr) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + ret = dht_disk_layout_extract (this, layout, i, &disk_layout); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "failed to extract disk layout"); + goto err; + } + + ret = dict_set_bin (xattr, "trusted.glusterfs.dht", + disk_layout, 4 * 4); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set xattr dictionary"); + goto err; + } + disk_layout = NULL; + + gf_log (this->name, GF_LOG_DEBUG, + "setting hash range %u - %u (type %d) on subvolume %s for %s", + layout->list[i].start, layout->list[i].stop, + layout->type, subvol->name, loc->path); + + dict_ref (xattr); + + STACK_WIND (frame, dht_selfheal_dir_xattr_cbk, + subvol, subvol->fops->setxattr, + loc, xattr, 0); + + dict_unref (xattr); + + return 0; + +err: + if (xattr) + dict_destroy (xattr); + + if (disk_layout) + FREE (disk_layout); + + dht_selfheal_dir_xattr_cbk (frame, subvol, frame->this, + -1, ENOMEM); + return 0; +} + + +int +dht_selfheal_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) +{ + dht_local_t *local = NULL; + int missing_xattr = 0; + int i = 0; + int ret = 0; + xlator_t *this = NULL; + + local = frame->local; + this = frame->this; + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].err != -1 || !layout->list[i].stop) + continue; + /* attr missing and layout present */ + missing_xattr++; + } + + gf_log (this->name, GF_LOG_DEBUG, + "%d subvolumes missing xattr for %s", + missing_xattr, loc->path); + + if (missing_xattr == 0) { + dht_selfheal_dir_finish (frame, this, 0); + return 0; + } + + local->call_cnt = missing_xattr; + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].err != -1 || !layout->list[i].stop) + continue; + + ret = dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i); + + if (--missing_xattr == 0) + break; + } + return 0; +} + + +int +dht_selfheal_dir_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf) +{ + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + call_frame_t *prev = NULL; + xlator_t *subvol = NULL; + int i = 0; + int this_call_cnt = 0; + + + local = frame->local; + layout = local->selfheal.layout; + prev = cookie; + subvol = prev->this; + + if ((op_ret == 0) || (op_errno == EEXIST)) { + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].xlator == subvol) { + layout->list[i].err = -1; + break; + } + } + } + + this_call_cnt = dht_frame_return (frame); + + if (is_last_call (this_call_cnt)) { + dht_selfheal_dir_xattr (frame, &local->loc, layout); + } + + return 0; +} + + +int +dht_selfheal_dir_mkdir (call_frame_t *frame, loc_t *loc, + dht_layout_t *layout, int force) +{ + int missing_dirs = 0; + int i = 0; + dht_local_t *local = NULL; + xlator_t *this = NULL; + + + local = frame->local; + this = frame->this; + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].err == ENOENT || force) + missing_dirs++; + } + + if (missing_dirs == 0) { + dht_selfheal_dir_xattr (frame, loc, layout); + return 0; + } + + local->call_cnt = missing_dirs; + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].err == ENOENT || force) { + gf_log (this->name, GF_LOG_DEBUG, + "creating directory %s on subvol %s", + loc->path, layout->list[i].xlator->name); + + STACK_WIND (frame, dht_selfheal_dir_mkdir_cbk, + layout->list[i].xlator, + layout->list[i].xlator->fops->mkdir, + loc, local->stbuf.st_mode); + } + } + + return 0; +} + +void +dht_selfheal_fix_this_virgin (call_frame_t *frame, loc_t *loc, + dht_layout_t *layout) +{ + dht_conf_t *conf = NULL; + xlator_t *this = NULL; + uint32_t chunk = 0; + int i = 0; + uint32_t start = 0; + int cnt = 0; + int err = 0; + + this = frame->this; + conf = this->private; + + for (i = 0; i < layout->cnt; i++) { + err = layout->list[i].err; + if (err == -1) { + cnt++; + } + } + + chunk = ((unsigned long) 0xffffffff) / cnt; + + start = 0; + for (i = 0; i < layout->cnt; i++) { + err = layout->list[i].err; + if (err == -1) { + layout->list[i].start = start; + layout->list[i].stop = start + chunk - 1; + + start = start + chunk; + + gf_log (this->name, GF_LOG_DEBUG, + "gave fix: %u - %u on %s for %s", + layout->list[i].start, layout->list[i].stop, + layout->list[i].xlator->name, loc->path); + if (--cnt == 0) { + layout->list[i].stop = 0xffffffff; + break; + } + } + } +} + + +int +dht_selfheal_dir_getafix (call_frame_t *frame, loc_t *loc, + dht_layout_t *layout) +{ + dht_conf_t *conf = NULL; + xlator_t *this = NULL; + dht_local_t *local = NULL; + int missing = -1; + int down = -1; + int holes = -1; + int ret = -1; + int i = -1; + + this = frame->this; + conf = this->private; + local = frame->local; + + missing = local->selfheal.missing; + down = local->selfheal.down; + holes = local->selfheal.hole_cnt; + + if ((missing + down) == conf->subvolume_cnt) { + dht_selfheal_fix_this_virgin (frame, loc, layout); + ret = 0; + } + + if (holes <= down) { + /* the down subvol might fill up the holes */ + ret = 0; + } + + for (i = 0; i < layout->cnt; i++) { + /* directory not present */ + if (layout->list[i].err == ENOENT) { + ret = 0; + break; + } + } + + /* TODO: give a fix to these non-virgins */ + + return ret; +} + + +int +dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, + loc_t *loc, dht_layout_t *layout) +{ + dht_local_t *local = NULL; + uint32_t holes = 0; + uint32_t overlaps = 0; + uint32_t missing = 0; + uint32_t down = 0; + uint32_t misc = 0; + int ret = 0; + xlator_t *this = NULL; + + + local = frame->local; + this = frame->this; + + ret = dht_layout_anomalies (this, loc, layout, + &local->selfheal.hole_cnt, + &local->selfheal.overlaps_cnt, + &local->selfheal.missing, + &local->selfheal.down, + &local->selfheal.misc); + + holes = local->selfheal.hole_cnt; + overlaps = local->selfheal.overlaps_cnt; + missing = local->selfheal.missing; + down = local->selfheal.down; + misc = local->selfheal.misc; + + local->selfheal.dir_cbk = dir_cbk; + local->selfheal.layout = layout; + +/* + if (down) { + gf_log (this->name, GF_LOG_ERROR, + "%d subvolumes down -- not fixing", down); + ret = 0; + goto sorry_no_fix; + } + + if (overlaps) { + gf_log (this->name, GF_LOG_ERROR, + "not fixing overlaps in %s", loc->path); + local->op_errno = EINVAL; + ret = -1; + goto sorry_no_fix; + } + + if (misc) { + gf_log (this->name, GF_LOG_ERROR, + "%d subvolumes have unrecoverable errors", misc); + ret = 0; + goto sorry_no_fix; + } + + if (holes > missing) { + gf_log (this->name, GF_LOG_ERROR, + "%d holes and %d pigeons -- not fixing", + holes, missing); + ret = 0; + goto sorry_no_fix; + } +*/ + ret = dht_selfheal_dir_getafix (frame, loc, layout); + + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "the directory is not a virgin"); + goto sorry_no_fix; + } + + dht_selfheal_dir_mkdir (frame, loc, layout, 0); + + return 0; + +sorry_no_fix: + /* TODO: need to put appropriate local->op_errno */ + dht_selfheal_dir_finish (frame, this, ret); + + return 0; +} + + +int +dht_selfheal_restore (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, + loc_t *loc, dht_layout_t *layout) +{ + int ret = 0; + dht_local_t *local = NULL; + + + local = frame->local; + + local->selfheal.dir_cbk = dir_cbk; + local->selfheal.layout = layout; + + ret = dht_selfheal_dir_mkdir (frame, loc, layout, 1); + + return 0; +} diff --git a/xlators/cluster/dht/src/dht.c b/xlators/cluster/dht/src/dht.c new file mode 100644 index 000000000..836e7a4e8 --- /dev/null +++ b/xlators/cluster/dht/src/dht.c @@ -0,0 +1,222 @@ +/* + Copyright (c) 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +/* TODO: add NS locking */ + +#include "dht-common.c" + +/* TODO: + - use volumename in xattr instead of "dht" + - use NS locks + - handle all cases in self heal layout reconstruction + - complete linkfile selfheal +*/ + + + +int +notify (xlator_t *this, int event, void *data, ...) +{ + int ret = -1; + + ret = dht_notify (this, event, data); + + return ret; +} + +void +fini (xlator_t *this) +{ + int i = 0; + dht_conf_t *conf = NULL; + + conf = this->private; + + if (conf) { + if (conf->file_layouts) { + for (i = 0; i < conf->subvolume_cnt; i++) { + FREE (conf->file_layouts[i]); + } + FREE (conf->file_layouts); + } + + if (conf->default_dir_layout) + FREE (conf->default_dir_layout); + + if (conf->subvolumes) + FREE (conf->subvolumes); + + if (conf->subvolume_status) + FREE (conf->subvolume_status); + + FREE (conf); + } + + return; +} + +int +init (xlator_t *this) +{ + dht_conf_t *conf = NULL; + char *lookup_unhashed_str = NULL; + int ret = -1; + int i = 0; + + if (!this->children) { + gf_log (this->name, GF_LOG_ERROR, + "DHT needs more than one child defined"); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. check volfile "); + } + + conf = CALLOC (1, sizeof (*conf)); + if (!conf) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + conf->search_unhashed = 0; + + if (dict_get_str (this->options, "lookup-unhashed", + &lookup_unhashed_str) == 0) { + gf_string2boolean (lookup_unhashed_str, + &conf->search_unhashed); + } + + ret = dht_init_subvolumes (this, conf); + if (ret == -1) { + goto err; + } + + ret = dht_layouts_init (this, conf); + if (ret == -1) { + goto err; + } + + LOCK_INIT (&conf->subvolume_lock); + + conf->gen = 1; + + this->private = conf; + + return 0; + +err: + if (conf) { + if (conf->file_layouts) { + for (i = 0; i < conf->subvolume_cnt; i++) { + FREE (conf->file_layouts[i]); + } + FREE (conf->file_layouts); + } + + if (conf->default_dir_layout) + FREE (conf->default_dir_layout); + + if (conf->subvolumes) + FREE (conf->subvolumes); + + if (conf->subvolume_status) + FREE (conf->subvolume_status); + + FREE (conf); + } + + return -1; +} + + +struct xlator_fops fops = { + .lookup = dht_lookup, + .mknod = dht_mknod, + .create = dht_create, + + .stat = dht_stat, + .chmod = dht_chmod, + .chown = dht_chown, + .fchown = dht_fchown, + .fchmod = dht_fchmod, + .fstat = dht_fstat, + .utimens = dht_utimens, + .truncate = dht_truncate, + .ftruncate = dht_ftruncate, + .access = dht_access, + .readlink = dht_readlink, + .setxattr = dht_setxattr, + .getxattr = dht_getxattr, + .removexattr = dht_removexattr, + .open = dht_open, |