summaryrefslogtreecommitdiffstats
path: root/xlators
diff options
context:
space:
mode:
Diffstat (limited to 'xlators')
-rw-r--r--xlators/Makefile.am3
-rw-r--r--xlators/bindings/Makefile.am1
-rw-r--r--xlators/bindings/python/Makefile.am1
-rw-r--r--xlators/bindings/python/src/Makefile.am19
-rw-r--r--xlators/bindings/python/src/gluster.py47
-rw-r--r--xlators/bindings/python/src/glusterstack.py55
-rw-r--r--xlators/bindings/python/src/glustertypes.py167
-rw-r--r--xlators/bindings/python/src/python.c235
-rw-r--r--xlators/bindings/python/src/testxlator.py56
-rw-r--r--xlators/cluster/Makefile.am3
-rw-r--r--xlators/cluster/afr/Makefile.am3
-rw-r--r--xlators/cluster/afr/src/Makefile.am20
-rw-r--r--xlators/cluster/afr/src/afr-dir-read.c345
-rw-r--r--xlators/cluster/afr/src/afr-dir-read.h47
-rw-r--r--xlators/cluster/afr/src/afr-dir-write.c1786
-rw-r--r--xlators/cluster/afr/src/afr-dir-write.h59
-rw-r--r--xlators/cluster/afr/src/afr-inode-read.c721
-rw-r--r--xlators/cluster/afr/src/afr-inode-read.h47
-rw-r--r--xlators/cluster/afr/src/afr-inode-write.c2024
-rw-r--r--xlators/cluster/afr/src/afr-inode-write.h63
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.c1073
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.h66
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-data.c1030
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-entry.c2038
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-metadata.c791
-rw-r--r--xlators/cluster/afr/src/afr-self-heal.h52
-rw-r--r--xlators/cluster/afr/src/afr-transaction.c957
-rw-r--r--xlators/cluster/afr/src/afr-transaction.h36
-rw-r--r--xlators/cluster/afr/src/afr.c2338
-rw-r--r--xlators/cluster/afr/src/afr.h523
-rw-r--r--xlators/cluster/dht/Makefile.am1
-rw-r--r--xlators/cluster/dht/src/Makefile.am30
-rw-r--r--xlators/cluster/dht/src/dht-common.c3470
-rw-r--r--xlators/cluster/dht/src/dht-common.h212
-rw-r--r--xlators/cluster/dht/src/dht-hashfn-tea.c146
-rw-r--r--xlators/cluster/dht/src/dht-hashfn.c88
-rw-r--r--xlators/cluster/dht/src/dht-helper.c326
-rw-r--r--xlators/cluster/dht/src/dht-layout.c543
-rw-r--r--xlators/cluster/dht/src/dht-linkfile.c224
-rw-r--r--xlators/cluster/dht/src/dht-rename.c562
-rw-r--r--xlators/cluster/dht/src/dht-selfheal.c460
-rw-r--r--xlators/cluster/dht/src/dht.c222
-rw-r--r--xlators/cluster/dht/src/nufa.c684
-rw-r--r--xlators/cluster/ha/Makefile.am3
-rw-r--r--xlators/cluster/ha/src/Makefile.am15
-rw-r--r--xlators/cluster/ha/src/ha-helpers.c191
-rw-r--r--xlators/cluster/ha/src/ha.c3479
-rw-r--r--xlators/cluster/ha/src/ha.h59
-rw-r--r--xlators/cluster/map/Makefile.am3
-rw-r--r--xlators/cluster/map/src/Makefile.am15
-rw-r--r--xlators/cluster/map/src/map-helper.c357
-rw-r--r--xlators/cluster/map/src/map.c2193
-rw-r--r--xlators/cluster/map/src/map.h76
-rw-r--r--xlators/cluster/stripe/Makefile.am3
-rw-r--r--xlators/cluster/stripe/src/Makefile.am14
-rw-r--r--xlators/cluster/stripe/src/stripe.c3286
-rw-r--r--xlators/cluster/unify/Makefile.am3
-rw-r--r--xlators/cluster/unify/src/Makefile.am16
-rw-r--r--xlators/cluster/unify/src/unify-self-heal.c1225
-rw-r--r--xlators/cluster/unify/src/unify.c4451
-rw-r--r--xlators/cluster/unify/src/unify.h132
-rw-r--r--xlators/debug/Makefile.am3
-rw-r--r--xlators/debug/error-gen/Makefile.am3
-rw-r--r--xlators/debug/error-gen/src/Makefile.am14
-rw-r--r--xlators/debug/error-gen/src/error-gen.c1780
-rw-r--r--xlators/debug/trace/Makefile.am3
-rw-r--r--xlators/debug/trace/src/Makefile.am14
-rw-r--r--xlators/debug/trace/src/trace.c2321
-rw-r--r--xlators/encryption/Makefile.am3
-rw-r--r--xlators/encryption/rot-13/Makefile.am3
-rw-r--r--xlators/encryption/rot-13/src/Makefile.am14
-rw-r--r--xlators/encryption/rot-13/src/rot-13.c200
-rw-r--r--xlators/encryption/rot-13/src/rot-13.h33
-rw-r--r--xlators/features/Makefile.am3
-rw-r--r--xlators/features/filter/Makefile.am3
-rw-r--r--xlators/features/filter/src/Makefile.am13
-rw-r--r--xlators/features/filter/src/filter.c1768
-rw-r--r--xlators/features/locks/Makefile.am3
-rw-r--r--xlators/features/locks/src/Makefile.am20
-rw-r--r--xlators/features/locks/src/common.c561
-rw-r--r--xlators/features/locks/src/common.h59
-rw-r--r--xlators/features/locks/src/internal.c762
-rw-r--r--xlators/features/locks/src/locks.h111
-rw-r--r--xlators/features/locks/src/posix.c834
-rw-r--r--xlators/features/locks/tests/unit-test.c75
-rw-r--r--xlators/features/path-convertor/Makefile.am3
-rw-r--r--xlators/features/path-convertor/src/Makefile.am14
-rw-r--r--xlators/features/path-convertor/src/path.c1217
-rw-r--r--xlators/features/quota/Makefile.am3
-rw-r--r--xlators/features/quota/src/Makefile.am13
-rw-r--r--xlators/features/quota/src/quota.c1056
-rw-r--r--xlators/features/trash/Makefile.am3
-rw-r--r--xlators/features/trash/src/Makefile.am13
-rw-r--r--xlators/features/trash/src/trash.c596
-rw-r--r--xlators/meta/Makefile.am1
-rw-r--r--xlators/meta/src/Makefile.am10
-rw-r--r--xlators/meta/src/meta.c1285
-rw-r--r--xlators/meta/src/meta.h48
-rw-r--r--xlators/meta/src/misc.c67
-rw-r--r--xlators/meta/src/misc.h31
-rw-r--r--xlators/meta/src/tree.c176
-rw-r--r--xlators/meta/src/tree.h35
-rw-r--r--xlators/meta/src/view.c258
-rw-r--r--xlators/meta/src/view.h32
-rw-r--r--xlators/mount/Makefile.am3
-rw-r--r--xlators/mount/fuse/Makefile.am3
-rw-r--r--xlators/mount/fuse/src/Makefile.am14
-rw-r--r--xlators/mount/fuse/src/fuse-bridge.c2859
-rw-r--r--xlators/mount/fuse/src/fuse-extra.c137
-rw-r--r--xlators/mount/fuse/src/fuse-extra.h42
-rw-r--r--xlators/mount/fuse/utils/Makefile.am10
-rwxr-xr-xxlators/mount/fuse/utils/mount.glusterfs.in152
-rwxr-xr-xxlators/mount/fuse/utils/mount_glusterfs.in181
-rw-r--r--xlators/performance/Makefile.am3
-rw-r--r--xlators/performance/io-cache/Makefile.am3
-rw-r--r--xlators/performance/io-cache/src/Makefile.am14
-rw-r--r--xlators/performance/io-cache/src/io-cache.c1478
-rw-r--r--xlators/performance/io-cache/src/io-cache.h330
-rw-r--r--xlators/performance/io-cache/src/ioc-inode.c201
-rw-r--r--xlators/performance/io-cache/src/page.c778
-rw-r--r--xlators/performance/io-threads/Makefile.am3
-rw-r--r--xlators/performance/io-threads/src/Makefile.am14
-rw-r--r--xlators/performance/io-threads/src/io-threads.c1254
-rw-r--r--xlators/performance/io-threads/src/io-threads.h99
-rw-r--r--xlators/performance/read-ahead/Makefile.am3
-rw-r--r--xlators/performance/read-ahead/src/Makefile.am14
-rw-r--r--xlators/performance/read-ahead/src/page.c487
-rw-r--r--xlators/performance/read-ahead/src/read-ahead.c890
-rw-r--r--xlators/performance/read-ahead/src/read-ahead.h194
-rw-r--r--xlators/performance/stat-prefetch/Makefile.am1
-rw-r--r--xlators/performance/stat-prefetch/src/Makefile.am11
-rw-r--r--xlators/performance/stat-prefetch/src/stat-prefetch.c508
-rw-r--r--xlators/performance/stat-prefetch/src/stat-prefetch.h32
-rw-r--r--xlators/performance/symlink-cache/Makefile.am3
-rw-r--r--xlators/performance/symlink-cache/src/Makefile.am12
-rw-r--r--xlators/performance/symlink-cache/src/symlink-cache.c399
-rw-r--r--xlators/performance/write-behind/Makefile.am3
-rw-r--r--xlators/performance/write-behind/src/Makefile.am12
-rw-r--r--xlators/performance/write-behind/src/write-behind.c1444
-rw-r--r--xlators/protocol/Makefile.am3
-rw-r--r--xlators/protocol/client/Makefile.am3
-rw-r--r--xlators/protocol/client/src/Makefile.am16
-rw-r--r--xlators/protocol/client/src/client-protocol.c6671
-rw-r--r--xlators/protocol/client/src/client-protocol.h173
-rw-r--r--xlators/protocol/client/src/saved-frames.c178
-rw-r--r--xlators/protocol/client/src/saved-frames.h74
-rw-r--r--xlators/protocol/server/Makefile.am3
-rw-r--r--xlators/protocol/server/src/Makefile.am18
-rw-r--r--xlators/protocol/server/src/server-dentry.c413
-rw-r--r--xlators/protocol/server/src/server-helpers.c586
-rw-r--r--xlators/protocol/server/src/server-helpers.h77
-rw-r--r--xlators/protocol/server/src/server-protocol.c7984
-rw-r--r--xlators/protocol/server/src/server-protocol.h143
-rw-r--r--xlators/storage/Makefile.am3
-rw-r--r--xlators/storage/bdb/Makefile.am3
-rw-r--r--xlators/storage/bdb/src/Makefile.am18
-rw-r--r--xlators/storage/bdb/src/bctx.c394
-rw-r--r--xlators/storage/bdb/src/bdb-ll.c1455
-rw-r--r--xlators/storage/bdb/src/bdb.c3371
-rw-r--r--xlators/storage/bdb/src/bdb.h439
-rw-r--r--xlators/storage/posix/Makefile.am3
-rw-r--r--xlators/storage/posix/src/Makefile.am17
-rw-r--r--xlators/storage/posix/src/posix.c3715
-rw-r--r--xlators/storage/posix/src/posix.h110
-rw-r--r--xlators/storage/posix/src/xattr-cache.c521
-rw-r--r--xlators/storage/posix/src/xattr-cache.h65
166 files changed, 88606 insertions, 0 deletions
diff --git a/xlators/Makefile.am b/xlators/Makefile.am
new file mode 100644
index 00000000000..2abb5219488
--- /dev/null
+++ b/xlators/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = cluster storage protocol performance debug features encryption mount
+
+CLEANFILES =
diff --git a/xlators/bindings/Makefile.am b/xlators/bindings/Makefile.am
new file mode 100644
index 00000000000..f7766580257
--- /dev/null
+++ b/xlators/bindings/Makefile.am
@@ -0,0 +1 @@
+SUBDIRS = $(BINDINGS_SUBDIRS)
diff --git a/xlators/bindings/python/Makefile.am b/xlators/bindings/python/Makefile.am
new file mode 100644
index 00000000000..af437a64d6d
--- /dev/null
+++ b/xlators/bindings/python/Makefile.am
@@ -0,0 +1 @@
+SUBDIRS = src
diff --git a/xlators/bindings/python/src/Makefile.am b/xlators/bindings/python/src/Makefile.am
new file mode 100644
index 00000000000..c0b9141c667
--- /dev/null
+++ b/xlators/bindings/python/src/Makefile.am
@@ -0,0 +1,19 @@
+
+xlator_PROGRAMS = python.so
+
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/bindings
+
+python_PYTHON = gluster.py glustertypes.py glusterstack.py
+
+pythondir = $(xlatordir)/python
+
+python_so_SOURCES = python.c
+
+AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall \
+ -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles \
+ $(PYTHON_CPPLAGS) -DGLUSTER_PYTHON_PATH=\"$(pythondir)\"
+
+AM_LDFLAGS = $(PYTHON_LDFLAGS)
+
+CLEANFILES =
+
diff --git a/xlators/bindings/python/src/gluster.py b/xlators/bindings/python/src/gluster.py
new file mode 100644
index 00000000000..ee0eb131011
--- /dev/null
+++ b/xlators/bindings/python/src/gluster.py
@@ -0,0 +1,47 @@
+# Copyright (c) 2007 Chris AtLee <chris@atlee.ca>
+# This file is part of GlusterFS.
+#
+# GlusterFS is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published
+# by the Free Software Foundation; either version 3 of the License,
+# or (at your option) any later version.
+#
+# GlusterFS is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see
+# <http://www.gnu.org/licenses/>.
+from ctypes import *
+from glustertypes import *
+from glusterstack import *
+import sys
+import inspect
+
+libglusterfs = CDLL("libglusterfs.so")
+_gf_log = libglusterfs._gf_log
+_gf_log.restype = c_int32
+_gf_log.argtypes = [c_char_p, c_char_p, c_char_p, c_int32, c_int, c_char_p]
+
+gf_log_loglevel = c_int.in_dll(libglusterfs, "gf_log_loglevel")
+
+GF_LOG_NONE = 0
+GF_LOG_CRITICAL = 1
+GF_LOG_ERROR = 2
+GF_LOG_WARNING = 3
+GF_LOG_DEBUG = 4
+
+def gf_log(module, level, fmt, *params):
+ if level <= gf_log_loglevel:
+ frame = sys._getframe(1)
+ _gf_log(module, frame.f_code.co_filename, frame.f_code.co_name,
+ frame.f_lineno, level, fmt, *params)
+
+class ComplexTranslator(object):
+ def __init__(self, xlator):
+ self.xlator = xlator_t.from_address(xlator)
+
+ def __getattr__(self, item):
+ return getattr(self.xlator, item)
diff --git a/xlators/bindings/python/src/glusterstack.py b/xlators/bindings/python/src/glusterstack.py
new file mode 100644
index 00000000000..ba24c81652e
--- /dev/null
+++ b/xlators/bindings/python/src/glusterstack.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2007 Chris AtLee <chris@atlee.ca>
+# This file is part of GlusterFS.
+#
+# GlusterFS is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published
+# by the Free Software Foundation; either version 3 of the License,
+# or (at your option) any later version.
+#
+# GlusterFS is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see
+# <http://www.gnu.org/licenses/>.
+from ctypes import *
+from glustertypes import *
+
+libc = CDLL("libc.so.6")
+calloc = libc.calloc
+calloc.argtypes = [c_int, c_int]
+calloc.restype = c_void_p
+
+# TODO: Can these be done in C somehow?
+def stack_wind(frame, rfn, obj, fn, *params):
+ """Frame is a frame object"""
+ _new = cast(calloc(1, sizeof(call_frame_t)), POINTER(call_frame_t))
+ _new[0].root = frame.root
+ _new[0].next = frame.root[0].frames.next
+ _new[0].prev = pointer(frame.root[0].frames)
+ if frame.root[0].frames.next:
+ frame.root[0].frames.next[0].prev = _new
+ frame.root[0].frames.next = _new
+ _new[0].this = obj
+ # TODO: Type checking like tmp_cbk?
+ _new[0].ret = rfn
+ _new[0].parent = pointer(frame)
+ _new[0].cookie = cast(_new, c_void_p)
+ # TODO: Initialize lock
+ #_new.lock.init()
+ frame.ref_count += 1
+ fn(_new, obj, *params)
+
+def stack_unwind(frame, *params):
+ """Frame is a frame object"""
+ fn = frame[0].ret
+ parent = frame[0].parent[0]
+ parent.ref_count -= 1
+
+ op_ret = params[0]
+ op_err = params[1]
+ params = params[2:]
+ fn(parent, call_frame_t.from_address(frame[0].cookie), parent.this,
+ op_ret, op_err, *params)
diff --git a/xlators/bindings/python/src/glustertypes.py b/xlators/bindings/python/src/glustertypes.py
new file mode 100644
index 00000000000..e9069d07c72
--- /dev/null
+++ b/xlators/bindings/python/src/glustertypes.py
@@ -0,0 +1,167 @@
+# Copyright (c) 2007 Chris AtLee <chris@atlee.ca>
+# This file is part of GlusterFS.
+#
+# GlusterFS is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published
+# by the Free Software Foundation; either version 3 of the License,
+# or (at your option) any later version.
+#
+# GlusterFS is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see
+# <http://www.gnu.org/licenses/>.
+from ctypes import *
+import collections
+
+#
+# Forward declaration of some gluster types
+#
+class call_frame_t(Structure):
+ pass
+
+class call_ctx_t(Structure):
+ pass
+
+class call_pool_t(Structure):
+ pass
+
+class xlator_t(Structure):
+ def _getFirstChild(self):
+ return self.children[0].xlator
+ firstChild = property(_getFirstChild)
+
+class xlator_list_t(Structure):
+ pass
+
+class xlator_fops(Structure):
+ pass
+
+class xlator_mops(Structure):
+ pass
+
+class glusterfs_ctx_t(Structure):
+ pass
+
+class list_head(Structure):
+ pass
+
+class dict_t(Structure):
+ pass
+
+class inode_table_t(Structure):
+ pass
+
+class fd_t(Structure):
+ pass
+
+class iovec(Structure):
+ _fields_ = [
+ ("iov_base", c_void_p),
+ ("iov_len", c_size_t),
+ ]
+
+ def __init__(self, s):
+ self.iov_base = cast(c_char_p(s), c_void_p)
+ self.iov_len = len(s)
+
+ def getBytes(self):
+ return string_at(self.iov_base, self.iov_len)
+
+# This is a pthread_spinlock_t
+# TODO: what happens to volatile-ness?
+gf_lock_t = c_int
+
+uid_t = c_uint32
+gid_t = c_uint32
+pid_t = c_int32
+
+off_t = c_int64
+
+#
+# Function pointer types
+#
+ret_fn_t = CFUNCTYPE(c_int32, POINTER(call_frame_t), POINTER(call_frame_t),
+ POINTER(xlator_t), c_int32, c_int32)
+
+fini_fn_t = CFUNCTYPE(None, POINTER(xlator_t))
+init_fn_t = CFUNCTYPE(c_int32, POINTER(xlator_t))
+event_notify_fn_t = CFUNCTYPE(c_int32, POINTER(xlator_t), c_int32, c_void_p)
+
+list_head._fields_ = [
+ ("next", POINTER(list_head)),
+ ("prev", POINTER(list_head)),
+ ]
+
+call_frame_t._fields_ = [
+ ("root", POINTER(call_ctx_t)),
+ ("parent", POINTER(call_frame_t)),
+ ("next", POINTER(call_frame_t)),
+ ("prev", POINTER(call_frame_t)),
+ ("local", c_void_p),
+ ("this", POINTER(xlator_t)),
+ ("ret", ret_fn_t),
+ ("ref_count", c_int32),
+ ("lock", gf_lock_t),
+ ("cookie", c_void_p),
+ ("op", c_int32),
+ ("type", c_int8),
+ ]
+
+call_ctx_t._fields_ = [
+ ("all_frames", list_head),
+ ("trans", c_void_p),
+ ("pool", call_pool_t),
+ ("unique", c_uint64),
+ ("state", c_void_p),
+ ("uid", uid_t),
+ ("gid", gid_t),
+ ("pid", pid_t),
+ ("frames", call_frame_t),
+ ("req_refs", POINTER(dict_t)),
+ ("rsp_refs", POINTER(dict_t)),
+ ]
+
+xlator_t._fields_ = [
+ ("name", c_char_p),
+ ("type", c_char_p),
+ ("next", POINTER(xlator_t)),
+ ("prev", POINTER(xlator_t)),
+ ("parent", POINTER(xlator_t)),
+ ("children", POINTER(xlator_list_t)),
+ ("fops", POINTER(xlator_fops)),
+ ("mops", POINTER(xlator_mops)),
+ ("fini", fini_fn_t),
+ ("init", init_fn_t),
+ ("notify", event_notify_fn_t),
+ ("options", POINTER(dict_t)),
+ ("ctx", POINTER(glusterfs_ctx_t)),
+ ("itable", POINTER(inode_table_t)),
+ ("ready", c_char),
+ ("private", c_void_p),
+ ]
+
+xlator_list_t._fields_ = [
+ ("xlator", POINTER(xlator_t)),
+ ("next", POINTER(xlator_list_t)),
+ ]
+
+fop_functions = collections.defaultdict(lambda: c_void_p)
+fop_function_names = ['lookup', 'forget', 'stat', 'fstat', 'chmod', 'fchmod',
+ 'chown', 'fchown', 'truncate', 'ftruncate', 'utimens', 'access',
+ 'readlink', 'mknod', 'mkdir', 'unlink', 'rmdir', 'symlink',
+ 'rename', 'link', 'create', 'open', 'readv', 'writev', 'flush',
+ 'close', 'fsync', 'opendir', 'readdir', 'closedir', 'fsyncdir',
+ 'statfs', 'setxattr', 'getxattr', 'removexattr', 'lk', 'writedir',
+ # TODO: Call backs?
+ ]
+
+fop_writev_t = CFUNCTYPE(c_int32, POINTER(call_frame_t), POINTER(xlator_t),
+ POINTER(fd_t), POINTER(iovec), c_int32,
+ off_t)
+
+fop_functions['writev'] = fop_writev_t
+xlator_fops._fields_ = [(f, fop_functions[f]) for f in fop_function_names]
diff --git a/xlators/bindings/python/src/python.c b/xlators/bindings/python/src/python.c
new file mode 100644
index 00000000000..739ef732900
--- /dev/null
+++ b/xlators/bindings/python/src/python.c
@@ -0,0 +1,235 @@
+/*
+ Copyright (c) 2007 Chris AtLee <chris@atlee.ca>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#include <Python.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "xlator.h"
+#include "logging.h"
+#include "defaults.h"
+
+typedef struct
+{
+ char *scriptname;
+ PyObject *pXlator;
+ PyObject *pScriptModule;
+ PyObject *pGlusterModule;
+ PyThreadState *pInterp;
+
+ PyObject *pFrameType, *pVectorType, *pFdType;
+} python_private_t;
+
+int32_t
+python_writev (call_frame_t *frame,
+ xlator_t *this,
+ fd_t *fd,
+ struct iovec *vector,
+ int32_t count,
+ off_t offset)
+{
+ python_private_t *priv = (python_private_t *)this->private;
+ gf_log("python", GF_LOG_DEBUG, "In writev");
+ if (PyObject_HasAttrString(priv->pXlator, "writev"))
+ {
+
+ PyObject *retval = PyObject_CallMethod(priv->pXlator, "writev",
+ "O O O i l",
+ PyObject_CallMethod(priv->pFrameType, "from_address", "O&", PyLong_FromVoidPtr, frame),
+ PyObject_CallMethod(priv->pFdType, "from_address", "O&", PyLong_FromVoidPtr, fd),
+ PyObject_CallMethod(priv->pVectorType, "from_address", "O&", PyLong_FromVoidPtr, vector),
+ count,
+ offset);
+ if (PyErr_Occurred())
+ {
+ PyErr_Print();
+ }
+ Py_XDECREF(retval);
+ }
+ else
+ {
+ return default_writev(frame, this, fd, vector, count, offset);
+ }
+ return 0;
+}
+
+struct xlator_fops fops = {
+ .writev = python_writev
+};
+
+struct xlator_mops mops = {
+};
+
+static PyObject *
+AnonModule_FromFile (const char* fname)
+{
+ // Get the builtins
+ PyThreadState* pThread = PyThreadState_Get();
+ PyObject *pBuiltins = pThread->interp->builtins;
+
+ if (PyErr_Occurred())
+ {
+ PyErr_Print();
+ return NULL;
+ }
+
+ // Create a new dictionary for running code in
+ PyObject *pModuleDict = PyDict_New();
+ PyDict_SetItemString(pModuleDict, "__builtins__", pBuiltins);
+ Py_INCREF(pBuiltins);
+
+ // Run the file in the new context
+ FILE* fp = fopen(fname, "r");
+ PyRun_File(fp, fname, Py_file_input, pModuleDict, pModuleDict);
+ fclose(fp);
+ if (PyErr_Occurred())
+ {
+ PyErr_Print();
+ Py_DECREF(pModuleDict);
+ Py_DECREF(pBuiltins);
+ return NULL;
+ }
+
+ // Create an object to hold the new context
+ PyRun_String("class ModuleWrapper(object):\n\tpass\n", Py_single_input, pModuleDict, pModuleDict);
+ if (PyErr_Occurred())
+ {
+ PyErr_Print();
+ Py_DECREF(pModuleDict);
+ Py_DECREF(pBuiltins);
+ return NULL;
+ }
+ PyObject *pModule = PyRun_String("ModuleWrapper()", Py_eval_input, pModuleDict, pModuleDict);
+ if (PyErr_Occurred())
+ {
+ PyErr_Print();
+ Py_DECREF(pModuleDict);
+ Py_DECREF(pBuiltins);
+ Py_XDECREF(pModule);
+ return NULL;
+ }
+
+ // Set the new context's dictionary to the one we used to run the code
+ // inside
+ PyObject_SetAttrString(pModule, "__dict__", pModuleDict);
+ if (PyErr_Occurred())
+ {
+ PyErr_Print();
+ Py_DECREF(pModuleDict);
+ Py_DECREF(pBuiltins);
+ Py_DECREF(pModule);
+ return NULL;
+ }
+
+ return pModule;
+}
+
+int32_t
+init (xlator_t *this)
+{
+ // This is ok to call more than once per process
+ Py_InitializeEx(0);
+
+ if (!this->children) {
+ gf_log ("python", GF_LOG_ERROR,
+ "FATAL: python should have exactly one child");
+ return -1;
+ }
+
+ python_private_t *priv = CALLOC (sizeof (python_private_t), 1);
+ ERR_ABORT (priv);
+
+ data_t *scriptname = dict_get (this->options, "scriptname");
+ if (scriptname) {
+ priv->scriptname = data_to_str(scriptname);
+ } else {
+ gf_log("python", GF_LOG_ERROR,
+ "FATAL: python requires the scriptname parameter");
+ return -1;
+ }
+
+ priv->pInterp = Py_NewInterpreter();
+
+ // Adjust python's path
+ PyObject *syspath = PySys_GetObject("path");
+ PyObject *path = PyString_FromString(GLUSTER_PYTHON_PATH);
+ PyList_Append(syspath, path);
+ Py_DECREF(path);
+
+ gf_log("python", GF_LOG_DEBUG,
+ "Loading gluster module");
+
+ priv->pGlusterModule = PyImport_ImportModule("gluster");
+ if (PyErr_Occurred())
+ {
+ PyErr_Print();
+ return -1;
+ }
+
+ priv->pFrameType = PyObject_GetAttrString(priv->pGlusterModule, "call_frame_t");
+ priv->pFdType = PyObject_GetAttrString(priv->pGlusterModule, "fd_t");
+ priv->pVectorType = PyObject_GetAttrString(priv->pGlusterModule, "iovec");
+
+ gf_log("python", GF_LOG_DEBUG, "Loading script...%s", priv->scriptname);
+
+ priv->pScriptModule = AnonModule_FromFile(priv->scriptname);
+ if (!priv->pScriptModule || PyErr_Occurred())
+ {
+ gf_log("python", GF_LOG_ERROR, "Error loading %s", priv->scriptname);
+ PyErr_Print();
+ return -1;
+ }
+
+ if (!PyObject_HasAttrString(priv->pScriptModule, "xlator"))
+ {
+ gf_log("python", GF_LOG_ERROR, "%s does not have a xlator attribute", priv->scriptname);
+ return -1;
+ }
+ gf_log("python", GF_LOG_DEBUG, "Instantiating translator");
+ priv->pXlator = PyObject_CallMethod(priv->pScriptModule, "xlator", "O&",
+ PyLong_FromVoidPtr, this);
+ if (PyErr_Occurred() || !priv->pXlator)
+ {
+ PyErr_Print();
+ return -1;
+ }
+
+ this->private = priv;
+
+ gf_log ("python", GF_LOG_DEBUG, "python xlator loaded");
+ return 0;
+}
+
+void
+fini (xlator_t *this)
+{
+ python_private_t *priv = (python_private_t*)(this->private);
+ Py_DECREF(priv->pXlator);
+ Py_DECREF(priv->pScriptModule);
+ Py_DECREF(priv->pGlusterModule);
+ Py_DECREF(priv->pFrameType);
+ Py_DECREF(priv->pFdType);
+ Py_DECREF(priv->pVectorType);
+ Py_EndInterpreter(priv->pInterp);
+ return;
+}
diff --git a/xlators/bindings/python/src/testxlator.py b/xlators/bindings/python/src/testxlator.py
new file mode 100644
index 00000000000..507455c856a
--- /dev/null
+++ b/xlators/bindings/python/src/testxlator.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2007 Chris AtLee <chris@atlee.ca>
+# This file is part of GlusterFS.
+#
+# GlusterFS is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published
+# by the Free Software Foundation; either version 3 of the License,
+# or (at your option) any later version.
+#
+# GlusterFS is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see
+# <http://www.gnu.org/licenses/>.
+
+"""
+This is a test translator written in python.
+
+Important things to note:
+ This file must be import-able from glusterfsd. This probably means
+ setting PYTHONPATH to where this file is located.
+
+ This file must have a top-level xlator class object that will be
+ used to instantiate individual translators.
+"""
+from gluster import *
+
+class MyXlator(ComplexTranslator):
+ name = "MyXlator"
+ def writev_cbk(self, frame, cookie, op_ret, op_errno, buf):
+ stack_unwind(frame, op_ret, op_errno, buf)
+ return 0
+
+ def writev(self, frame, fd, vector, count, offset):
+ gf_log(self.name, GF_LOG_WARNING, "writev %i bytes", vector.iov_len)
+ # TODO: Use cookie to pass this to writev_cbk
+ old_count = vector.iov_len
+
+ data = vector.getBytes().encode("zlib")
+
+ vector = iovec(data)
+ gf_log(self.name, GF_LOG_WARNING, "writev %i bytes", vector.iov_len)
+
+ @ret_fn_t
+ def rfn(frame, prev, this, op_ret, op_errno, *params):
+ if len(params) == 0:
+ params = [0]
+ return self.writev_cbk(frame, prev, old_count, op_errno, *params)
+
+ stack_wind(frame, rfn, self.firstChild,
+ self.firstChild[0].fops[0].writev, fd, vector, count, offset)
+ return 0
+
+xlator = MyXlator
diff --git a/xlators/cluster/Makefile.am b/xlators/cluster/Makefile.am
new file mode 100644
index 00000000000..a6ddb3564a9
--- /dev/null
+++ b/xlators/cluster/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = unify stripe afr dht ha map
+
+CLEANFILES =
diff --git a/xlators/cluster/afr/Makefile.am b/xlators/cluster/afr/Makefile.am
new file mode 100644
index 00000000000..d471a3f9243
--- /dev/null
+++ b/xlators/cluster/afr/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/cluster/afr/src/Makefile.am b/xlators/cluster/afr/src/Makefile.am
new file mode 100644
index 00000000000..1bde9e5bad7
--- /dev/null
+++ b/xlators/cluster/afr/src/Makefile.am
@@ -0,0 +1,20 @@
+xlator_LTLIBRARIES = afr.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
+
+afr_la_LDFLAGS = -module -avoidversion
+
+afr_la_SOURCES = afr.c afr-dir-read.c afr-dir-write.c afr-inode-read.c afr-inode-write.c afr-transaction.c afr-self-heal-data.c afr-self-heal-common.c afr-self-heal-metadata.c afr-self-heal-entry.c
+afr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-self-heal-common.h
+
+AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \
+ -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+
+CLEANFILES =
+
+uninstall-local:
+ rm -f $(DESTDIR)$(xlatordir)/replicate.so
+
+install-data-hook:
+ ln -sf afr.so $(DESTDIR)$(xlatordir)/replicate.so \ No newline at end of file
diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c
new file mode 100644
index 00000000000..0c65ca8528d
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-dir-read.c
@@ -0,0 +1,345 @@
+/*
+ Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+
+#include <libgen.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include <sys/time.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "dict.h"
+#include "xlator.h"
+#include "hashfn.h"
+#include "logging.h"
+#include "stack.h"
+#include "list.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "common-utils.h"
+#include "compat-errno.h"
+#include "compat.h"
+
+#include "afr.h"
+
+
+int32_t
+afr_opendir_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ fd_t *fd)
+{
+ afr_local_t * local = NULL;
+
+ int call_count = -1;
+
+ LOCK (&frame->lock);
+ {
+ local = frame->local;
+
+ if (op_ret == 0)
+ local->op_ret = 0;
+
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ AFR_STACK_UNWIND (frame, local->op_ret,
+ local->op_errno, local->fd);
+ }
+
+ return 0;
+}
+
+
+int32_t
+afr_opendir (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, fd_t *fd)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+
+ int child_count = 0;
+ int i = 0;
+
+ int ret = -1;
+ int call_count = -1;
+
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ child_count = priv->child_count;
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+ ret = AFR_LOCAL_INIT (local, priv);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ frame->local = local;
+ local->fd = fd_ref (fd);
+
+ call_count = local->call_count;
+
+ for (i = 0; i < child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND (frame, afr_opendir_cbk,
+ priv->children[i],
+ priv->children[i]->fops->opendir,
+ loc, fd);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ AFR_STACK_UNWIND (frame, op_ret, op_errno, fd);
+ }
+
+ return 0;
+}
+
+
+/**
+ * Common algorithm for directory read calls:
+ *
+ * - Try the fop on the first child that is up
+ * - if we have failed due to ENOTCONN:
+ * try the next child
+ *
+ * Applicable to: readdir
+ */
+
+int32_t
+afr_readdir_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ gf_dirent_t *buf)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ xlator_t ** children = NULL;
+
+ int unwind = 1;
+ int last_tried = -1;
+ int this_try = -1;
+
+ priv = this->private;
+ children = priv->children;
+
+ local = frame->local;
+
+ if (op_ret == -1) {
+ last_tried = local->cont.readdir.last_tried;
+
+ if (all_tried (last_tried, priv->child_count)) {
+ goto out;
+ }
+
+ this_try = ++local->cont.readdir.last_tried;
+ unwind = 0;
+
+ STACK_WIND (frame, afr_readdir_cbk,
+ children[this_try],
+ children[this_try]->fops->readdir,
+ local->fd, local->cont.readdir.size,
+ local->cont.readdir.offset);
+ }
+
+out:
+ if (unwind) {
+ AFR_STACK_UNWIND (frame, op_ret, op_errno, buf);
+ }
+
+ return 0;
+}
+
+
+int32_t
+afr_readdir (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, size_t size, off_t offset)
+{
+ afr_private_t * priv = NULL;
+ xlator_t ** children = NULL;
+ int call_child = 0;
+ afr_local_t *local = NULL;
+
+ int ret = -1;
+
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+ children = priv->children;
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+ ret = AFR_LOCAL_INIT (local, priv);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ frame->local = local;
+
+ call_child = afr_first_up_child (priv);
+ if (call_child == -1) {
+ op_errno = ENOTCONN;
+ gf_log (this->name, GF_LOG_ERROR,
+ "no child is up :(");
+ goto out;
+ }
+
+ local->cont.readdir.last_tried = call_child;
+
+ local->fd = fd_ref (fd);
+ local->cont.readdir.size = size;
+ local->cont.readdir.offset = offset;
+
+ STACK_WIND (frame, afr_readdir_cbk,
+ children[call_child], children[call_child]->fops->readdir,
+ fd, size, offset);
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+ }
+ return 0;
+}
+
+
+int32_t
+afr_getdents_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ dir_entry_t *entry, int32_t count)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ xlator_t ** children = NULL;
+
+ int unwind = 1;
+ int last_tried = -1;
+ int this_try = -1;
+
+ priv = this->private;
+ children = priv->children;
+
+ local = frame->local;
+
+ if (op_ret == -1) {
+ last_tried = local->cont.getdents.last_tried;
+
+ if (all_tried (last_tried, priv->child_count)) {
+ goto out;
+ }
+
+ this_try = ++local->cont.getdents.last_tried;
+ unwind = 0;
+
+ STACK_WIND (frame, afr_getdents_cbk,
+ children[this_try],
+ children[this_try]->fops->getdents,
+ local->fd, local->cont.getdents.size,
+ local->cont.getdents.offset, local->cont.getdents.flag);
+ }
+
+out:
+ if (unwind) {
+ AFR_STACK_UNWIND (frame, op_ret, op_errno, entry, count);
+ }
+
+ return 0;
+}
+
+
+int32_t
+afr_getdents (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, size_t size, off_t offset, int32_t flag)
+{
+ afr_private_t * priv = NULL;
+ xlator_t ** children = NULL;
+ int call_child = 0;
+ afr_local_t *local = NULL;
+
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+ children = priv->children;
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ call_child = afr_first_up_child (priv);
+ if (call_child == -1) {
+ op_errno = ENOTCONN;
+ gf_log (this->name, GF_LOG_ERROR,
+ "no child is up :(");
+ goto out;
+ }
+
+ local->cont.getdents.last_tried = call_child;
+
+ local->fd = fd_ref (fd);
+
+ local->cont.getdents.size = size;
+ local->cont.getdents.offset = offset;
+ local->cont.getdents.flag = flag;
+
+ frame->local = local;
+
+ STACK_WIND (frame, afr_getdents_cbk,
+ children[call_child], children[call_child]->fops->getdents,
+ fd, size, offset, flag);
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+ }
+
+ return 0;
+}
+
+
diff --git a/xlators/cluster/afr/src/afr-dir-read.h b/xlators/cluster/afr/src/afr-dir-read.h
new file mode 100644
index 00000000000..172ec3c90c4
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-dir-read.h
@@ -0,0 +1,47 @@
+/*
+ Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __DIR_READ_H__
+#define __DIR_READ_H__
+
+
+int32_t
+afr_opendir (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, fd_t *fd);
+
+int32_t
+afr_closedir (call_frame_t *frame, xlator_t *this,
+ fd_t *fd);
+
+int32_t
+afr_readdir (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, size_t size, off_t offset);
+
+
+int32_t
+afr_getdents (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, size_t size, off_t offset, int32_t flag);
+
+
+int32_t
+afr_checksum (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int32_t flags);
+
+
+#endif /* __DIR_READ_H__ */
diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c
new file mode 100644
index 00000000000..87a6e09b5be
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-dir-write.c
@@ -0,0 +1,1786 @@
+/*
+ Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+
+#include <libgen.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include <sys/time.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "afr.h"
+#include "dict.h"
+#include "xlator.h"
+#include "hashfn.h"
+#include "logging.h"
+#include "stack.h"
+#include "list.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "common-utils.h"
+#include "compat-errno.h"
+#include "compat.h"
+
+#include "afr.h"
+#include "afr-transaction.h"
+
+
+void
+afr_build_parent_loc (loc_t *parent, loc_t *child)
+{
+ char *tmp = NULL;
+
+ if (!child->parent) {
+ loc_copy (parent, child);
+ return;
+ }
+
+ tmp = strdup (child->path);
+ parent->path = strdup (dirname (tmp));
+ FREE (tmp);
+
+ parent->name = strrchr (parent->path, '/');
+ if (parent->name)
+ parent->name++;
+
+ parent->inode = inode_ref (child->parent);
+ parent->parent = inode_parent (parent->inode, 0, NULL);
+ parent->ino = parent->inode->ino;
+}
+
+
+/* {{{ create */
+
+int
+afr_create_unwind (call_frame_t *frame, xlator_t *this)
+{
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (local->transaction.main_frame) {
+ main_frame = local->transaction.main_frame;
+ }
+ local->transaction.main_frame = NULL;
+ }
+ UNLOCK (&frame->lock);
+
+ if (main_frame)
+ AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
+ local->cont.create.fd,
+ local->cont.create.inode,
+ &local->cont.create.buf);
+ return 0;
+}
+
+
+int
+afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ fd_t *fd, inode_t *inode, struct stat *buf)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int call_count = -1;
+ int child_index = -1;
+
+ local = frame->local;
+ priv = this->private;
+
+ child_index = (long) cookie;
+
+ LOCK (&frame->lock);
+ {
+ if (child_went_down (op_ret, op_errno))
+ afr_transaction_child_died (frame, this, child_index);
+
+ if (op_ret != -1) {
+ local->op_ret = op_ret;
+
+ if ((local->success_count == 0)
+ || (child_index == priv->read_child)) {
+ local->cont.create.buf = *buf;
+ local->cont.create.buf.st_ino =
+ afr_itransform (buf->st_ino,
+ priv->child_count,
+ child_index);
+ }
+ local->cont.create.inode = inode;
+
+ local->success_count++;
+ }
+
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.unwind (frame, this);
+
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+
+int
+afr_create_wind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ int call_count = -1;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ call_count = afr_up_children_count (priv->child_count, local->child_up);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_create_wind_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->create,
+ &local->loc,
+ local->cont.create.flags,
+ local->cont.create.mode,
+ local->cont.create.fd);
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+int
+afr_create_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = NULL;
+
+ local = frame->local;
+
+ local->transaction.unwind (frame, this);
+
+ AFR_STACK_DESTROY (frame);
+
+ return 0;
+}
+
+
+int
+afr_create (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int32_t flags, mode_t mode, fd_t *fd)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ call_frame_t * transaction_frame = NULL;
+
+ int ret = -1;
+
+ int op_ret = -1;
+ int op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "out of memory :(");
+ goto out;
+ }
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ ret = AFR_LOCAL_INIT (local, priv);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ transaction_frame->local = local;
+
+ loc_copy (&local->loc, loc);
+
+ local->cont.create.flags = flags;
+ local->cont.create.mode = mode;
+ local->cont.create.fd = fd_ref (fd);
+
+ local->transaction.fop = afr_create_wind;
+ local->transaction.done = afr_create_done;
+ local->transaction.unwind = afr_create_unwind;
+
+ afr_build_parent_loc (&local->transaction.parent_loc, loc);
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME (loc->path);
+ local->transaction.pending = AFR_ENTRY_PENDING;
+
+ afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+ }
+
+ return 0;
+}
+
+/* }}} */
+
+/* {{{ mknod */
+
+int
+afr_mknod_unwind (call_frame_t *frame, xlator_t *this)
+{
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (local->transaction.main_frame) {
+ main_frame = local->transaction.main_frame;
+ }
+ local->transaction.main_frame = NULL;
+ }
+ UNLOCK (&frame->lock);
+
+ if (main_frame)
+ AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
+ local->cont.mknod.inode,
+ &local->cont.mknod.buf);
+ return 0;
+}
+
+
+int
+afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ inode_t *inode, struct stat *buf)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int call_count = -1;
+ int child_index = -1;
+
+ local = frame->local;
+ priv = this->private;
+
+ child_index = (long) cookie;
+
+ LOCK (&frame->lock);
+ {
+ if (child_went_down (op_ret, op_errno))
+ afr_transaction_child_died (frame, this, child_index);
+
+ if (op_ret != -1) {
+ local->op_ret = op_ret;
+
+ if ((local->success_count == 0)
+ || (child_index == priv->read_child)) {
+ local->cont.mknod.buf = *buf;
+ local->cont.mknod.buf.st_ino =
+ afr_itransform (buf->st_ino,
+ priv->child_count,
+ child_index);
+ }
+ local->cont.mknod.inode = inode;
+
+ local->success_count++;
+ }
+
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.unwind (frame, this);
+
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+
+int32_t
+afr_mknod_wind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ int call_count = -1;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ call_count = afr_up_children_count (priv->child_count, local->child_up);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->mknod,
+ &local->loc, local->cont.mknod.mode,
+ local->cont.mknod.dev);
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+int
+afr_mknod_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = NULL;
+
+ local = frame->local;
+
+ local->transaction.unwind (frame, this);
+ AFR_STACK_DESTROY (frame);
+
+ return 0;
+}
+
+
+int
+afr_mknod (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, mode_t mode, dev_t dev)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ call_frame_t * transaction_frame = NULL;
+
+ int ret = -1;
+
+ int op_ret = -1;
+ int op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "out of memory :(");
+ goto out;
+ }
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ ret = AFR_LOCAL_INIT (local, priv);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ transaction_frame->local = local;
+
+ loc_copy (&local->loc, loc);
+
+ local->cont.mknod.mode = mode;
+ local->cont.mknod.dev = dev;
+
+ local->transaction.fop = afr_mknod_wind;
+ local->transaction.done = afr_mknod_done;
+ local->transaction.unwind = afr_mknod_unwind;
+
+ afr_build_parent_loc (&local->transaction.parent_loc, loc);
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME (loc->path);
+ local->transaction.pending = AFR_ENTRY_PENDING;
+
+ afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+ }
+
+ return 0;
+}
+
+/* }}} */
+
+/* {{{ mkdir */
+
+
+int
+afr_mkdir_unwind (call_frame_t *frame, xlator_t *this)
+{
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (local->transaction.main_frame) {
+ main_frame = local->transaction.main_frame;
+ }
+ local->transaction.main_frame = NULL;
+ }
+ UNLOCK (&frame->lock);
+
+ if (main_frame)
+ AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
+ local->cont.mkdir.inode,
+ &local->cont.mkdir.buf);
+ return 0;
+}
+
+
+int
+afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ inode_t *inode, struct stat *buf)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int call_count = -1;
+ int child_index = -1;
+
+ local = frame->local;
+ priv = this->private;
+
+ child_index = (long) cookie;
+
+ LOCK (&frame->lock);
+ {
+ if (child_went_down (op_ret, op_errno))
+ afr_transaction_child_died (frame, this, child_index);
+
+ if (op_ret != -1) {
+ local->op_ret = op_ret;
+
+ if ((local->success_count == 0)
+ || (child_index == priv->read_child)) {
+ local->cont.mkdir.buf = *buf;
+ local->cont.mkdir.buf.st_ino =
+ afr_itransform (buf->st_ino, priv->child_count,
+ child_index);
+ }
+ local->cont.mkdir.inode = inode;
+
+ local->success_count++;
+ }
+
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.unwind (frame, this);
+
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+
+int
+afr_mkdir_wind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ int call_count = -1;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ call_count = afr_up_children_count (priv->child_count, local->child_up);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->mkdir,
+ &local->loc, local->cont.mkdir.mode);
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+int
+afr_mkdir_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = NULL;
+
+ local = frame->local;
+
+ local->transaction.unwind (frame, this);
+
+ AFR_STACK_DESTROY (frame);
+
+ return 0;
+}
+
+
+int
+afr_mkdir (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, mode_t mode)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ call_frame_t * transaction_frame = NULL;
+
+ int ret = -1;
+
+ int op_ret = -1;
+ int op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "out of memory :(");
+ goto out;
+ }
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ ret = AFR_LOCAL_INIT (local, priv);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ transaction_frame->local = local;
+
+ loc_copy (&local->loc, loc);
+
+ local->cont.mkdir.mode = mode;
+
+ local->transaction.fop = afr_mkdir_wind;
+ local->transaction.done = afr_mkdir_done;
+ local->transaction.unwind = afr_mkdir_unwind;
+
+ afr_build_parent_loc (&local->transaction.parent_loc, loc);
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME (loc->path);
+ local->transaction.pending = AFR_ENTRY_PENDING;
+
+ afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+
+ AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+ }
+
+ return 0;
+}
+
+/* }}} */
+
+/* {{{ link */
+
+
+int
+afr_link_unwind (call_frame_t *frame, xlator_t *this)
+{
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (local->transaction.main_frame) {
+ main_frame = local->transaction.main_frame;
+ }
+ local->transaction.main_frame = NULL;
+ }
+ UNLOCK (&frame->lock);
+
+ if (main_frame) {
+ local->cont.link.buf.st_ino = local->cont.link.ino;
+
+ AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
+ local->cont.link.inode,
+ &local->cont.link.buf);
+ }
+
+ return 0;
+}
+
+
+int
+afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct stat *buf)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int call_count = -1;
+ int child_index = -1;
+
+ local = frame->local;
+ priv = this->private;
+
+ child_index = (long) cookie;
+
+ LOCK (&frame->lock);
+ {
+ if (child_went_down (op_ret, op_errno))
+ afr_transaction_child_died (frame, this, child_index);
+
+ if (op_ret != -1) {
+ local->op_ret = op_ret;
+
+ if ((local->success_count == 0)
+ || (child_index == priv->read_child)) {
+ local->cont.link.buf = *buf;
+ local->cont.link.buf.st_ino =
+ afr_itransform (buf->st_ino, priv->child_count,
+ child_index);
+ }
+ local->cont.link.inode = inode;
+
+ local->success_count++;
+ }
+
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.unwind (frame, this);
+
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+
+int
+afr_link_wind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ int call_count = -1;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ call_count = afr_up_children_count (priv->child_count, local->child_up);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_link_wind_cbk, (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->link,
+ &local->loc,
+ &local->newloc);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+int
+afr_link_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = frame->local;
+
+ local->transaction.unwind (frame, this);
+
+ AFR_STACK_DESTROY (frame);
+
+ return 0;
+}
+
+
+int
+afr_link (call_frame_t *frame, xlator_t *this,
+ loc_t *oldloc, loc_t *newloc)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ call_frame_t * transaction_frame = NULL;
+
+ int ret = -1;
+
+ int op_ret = -1;
+ int op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "out of memory :(");
+ goto out;
+ }
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ ret = AFR_LOCAL_INIT (local, priv);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ transaction_frame->local = local;
+
+ loc_copy (&local->loc, oldloc);
+ loc_copy (&local->newloc, newloc);
+
+ local->cont.link.ino = oldloc->inode->ino;
+
+ local->transaction.fop = afr_link_wind;
+ local->transaction.done = afr_link_done;
+ local->transaction.unwind = afr_link_unwind;
+
+ afr_build_parent_loc (&local->transaction.parent_loc, oldloc);
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME (oldloc->path);
+ local->transaction.new_basename = AFR_BASENAME (newloc->path);
+ local->transaction.pending = AFR_ENTRY_PENDING;
+
+ afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (frame, op_ret, op_errno);
+ }
+
+ return 0;
+}
+
+/* }}} */
+
+/* {{{ symlink */
+
+
+int
+afr_symlink_unwind (call_frame_t *frame, xlator_t *this)
+{
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (local->transaction.main_frame) {
+ main_frame = local->transaction.main_frame;
+ }
+ local->transaction.main_frame = NULL;
+ }
+ UNLOCK (&frame->lock);
+
+ if (main_frame)
+ AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
+ local->cont.symlink.inode,
+ &local->cont.symlink.buf);
+ return 0;
+}
+
+
+int
+afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct stat *buf)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int call_count = -1;
+ int child_index = -1;
+
+ local = frame->local;
+ priv = this->private;
+
+ child_index = (long) cookie;
+
+ LOCK (&frame->lock);
+ {
+ if (child_went_down (op_ret, op_errno))
+ afr_transaction_child_died (frame, this, child_index);
+
+ if (op_ret != -1) {
+ local->op_ret = op_ret;
+
+ if ((local->success_count == 0)
+ || (child_index == priv->read_child)) {
+ local->cont.symlink.buf = *buf;
+ local->cont.symlink.buf.st_ino =
+ afr_itransform (buf->st_ino, priv->child_count,
+ child_index);
+ }
+ local->cont.symlink.inode = inode;
+
+ local->success_count++;
+ }
+
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.unwind (frame, this);
+
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+
+int
+afr_symlink_wind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ int call_count = -1;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ call_count = afr_up_children_count (priv->child_count, local->child_up);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->symlink,
+ local->cont.symlink.linkpath,
+ &local->loc);
+
+ if (!--call_count)
+ break;
+
+ }
+ }
+
+ return 0;
+}
+
+
+int
+afr_symlink_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = frame->local;
+
+ local->transaction.unwind (frame, this);
+
+ AFR_STACK_DESTROY (frame);
+
+ return 0;
+}
+
+
+int
+afr_symlink (call_frame_t *frame, xlator_t *this,
+ const char *linkpath, loc_t *loc)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ call_frame_t * transaction_frame = NULL;
+
+ int ret = -1;
+
+ int op_ret = -1;
+ int op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "out of memory :(");
+ goto out;
+ }
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ ret = AFR_LOCAL_INIT (local, priv);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ transaction_frame->local = local;
+
+ loc_copy (&local->loc, loc);
+
+ local->cont.symlink.ino = loc->inode->ino;
+ local->cont.symlink.linkpath = strdup (linkpath);
+
+ local->transaction.fop = afr_symlink_wind;
+ local->transaction.done = afr_symlink_done;
+ local->transaction.unwind = afr_symlink_unwind;
+
+ afr_build_parent_loc (&local->transaction.parent_loc, loc);
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME (loc->path);
+ local->transaction.pending = AFR_ENTRY_PENDING;
+
+ afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL, NULL);
+ }
+
+ return 0;
+}
+
+/* }}} */
+
+/* {{{ rename */
+
+int
+afr_rename_unwind (call_frame_t *frame, xlator_t *this)
+{
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (local->transaction.main_frame) {
+ main_frame = local->transaction.main_frame;
+ }
+ local->transaction.main_frame = NULL;
+ }
+ UNLOCK (&frame->lock);
+
+ if (main_frame) {
+ local->cont.rename.buf.st_ino = local->cont.rename.ino;
+
+ AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
+ &local->cont.rename.buf);
+ }
+
+ return 0;
+}
+
+
+int
+afr_rename_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int call_count = -1;
+ int child_index = -1;
+
+ local = frame->local;
+ priv = this->private;
+
+ child_index = (long) cookie;
+
+ LOCK (&frame->lock);
+ {
+ if (child_went_down (op_ret, op_errno))
+ afr_transaction_child_died (frame, this, child_index);
+
+ if ((op_ret != -1) && (local->success_count == 0)) {
+ local->op_ret = op_ret;
+
+ if (buf) {
+ local->cont.rename.buf = *buf;
+ local->cont.rename.buf.st_ino =
+ afr_itransform (buf->st_ino, priv->child_count,
+ child_index);
+ }
+ local->success_count++;
+ }
+
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.unwind (frame, this);
+
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+
+int32_t
+afr_rename_wind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ int call_count = -1;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ call_count = afr_up_children_count (priv->child_count, local->child_up);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_rename_wind_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->rename,
+ &local->loc,
+ &local->newloc);
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+int
+afr_rename_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = frame->local;
+
+ local->transaction.unwind (frame, this);
+
+ AFR_STACK_DESTROY (frame);
+
+ return 0;
+}
+
+
+int
+afr_rename (call_frame_t *frame, xlator_t *this,
+ loc_t *oldloc, loc_t *newloc)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ call_frame_t * transaction_frame = NULL;
+
+ int ret = -1;
+
+ int op_ret = -1;
+ int op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "out of memory :(");
+ goto out;
+ }
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ ret = AFR_LOCAL_INIT (local, priv);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ transaction_frame->local = local;
+
+ loc_copy (&local->loc, oldloc);
+ loc_copy (&local->newloc, newloc);
+
+ local->cont.rename.ino = oldloc->inode->ino;
+
+ local->transaction.fop = afr_rename_wind;
+ local->transaction.done = afr_rename_done;
+ local->transaction.unwind = afr_rename_unwind;
+
+ afr_build_parent_loc (&local->transaction.parent_loc, oldloc);
+ afr_build_parent_loc (&local->transaction.new_parent_loc, newloc);
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME (oldloc->path);
+ local->transaction.new_basename = AFR_BASENAME (newloc->path);
+ local->transaction.pending = AFR_ENTRY_PENDING;
+
+ afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION);
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+
+ AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+ }
+
+ return 0;
+}
+
+/* }}} */
+
+/* {{{ unlink */
+
+int
+afr_unlink_unwind (call_frame_t *frame, xlator_t *this)
+{
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (local->transaction.main_frame) {
+ main_frame = local->transaction.main_frame;
+ }
+ local->transaction.main_frame = NULL;
+ }
+ UNLOCK (&frame->lock);
+
+ if (main_frame)
+ AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno);
+
+ return 0;
+}
+
+
+int
+afr_unlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int call_count = -1;
+ int child_index = (long) cookie;
+ int need_unwind = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ if (child_went_down (op_ret, op_errno))
+ afr_transaction_child_died (frame, this, child_index);
+
+ if (op_ret != -1) {
+ if (local->success_count == 0) {
+ local->op_ret = op_ret;
+ }
+ local->success_count++;
+
+ if (local->success_count == priv->wait_count) {
+ need_unwind = 1;
+ }
+ }
+
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.unwind (frame, this);
+
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+
+int32_t
+afr_unlink_wind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ int call_count = -1;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ call_count = afr_up_children_count (priv->child_count, local->child_up);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->unlink,
+ &local->loc);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+int32_t
+afr_unlink_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = frame->local;
+
+ local->transaction.unwind (frame, this);
+
+ AFR_STACK_DESTROY (frame);
+
+ return 0;
+}
+
+
+int32_t
+afr_unlink (call_frame_t *frame, xlator_t *this,
+ loc_t *loc)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ call_frame_t * transaction_frame = NULL;
+
+ int ret = -1;
+
+ int op_ret = -1;
+ int op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "out of memory :(");
+ goto out;
+ }
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ ret = AFR_LOCAL_INIT (local, priv);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ transaction_frame->local = local;
+
+ loc_copy (&local->loc, loc);
+
+ local->transaction.fop = afr_unlink_wind;
+ local->transaction.done = afr_unlink_done;
+ local->transaction.unwind = afr_unlink_unwind;
+
+ afr_build_parent_loc (&local->transaction.parent_loc, loc);
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME (loc->path);
+ local->transaction.pending = AFR_ENTRY_PENDING;
+
+ afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (frame, op_ret, op_errno);
+ }
+
+ return 0;
+}
+
+/* }}} */
+
+/* {{{ rmdir */
+
+
+
+int
+afr_rmdir_unwind (call_frame_t *frame, xlator_t *this)
+{
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (local->transaction.main_frame) {
+ main_frame = local->transaction.main_frame;
+ }
+ local->transaction.main_frame = NULL;
+ }
+ UNLOCK (&frame->lock);
+
+ if (main_frame)
+ AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno);
+
+ return 0;
+}
+
+
+int
+afr_rmdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int call_count = -1;
+ int child_index = (long) cookie;
+ int need_unwind = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ if (child_went_down (op_ret, op_errno))
+ afr_transaction_child_died (frame, this, child_index);
+
+ if (op_ret != -1) {
+ if (local->success_count == 0) {
+ local->op_ret = op_ret;
+ }
+ local->success_count++;
+
+ if (local->success_count == priv->wait_count)
+ need_unwind = 1;
+ }
+
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.unwind (frame, this);
+
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+
+int
+afr_rmdir_wind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ int call_count = -1;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ call_count = afr_up_children_count (priv->child_count, local->child_up);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->rmdir,
+ &local->loc);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+int
+afr_rmdir_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = frame->local;
+
+ local->transaction.unwind (frame, this);
+
+ AFR_STACK_DESTROY (frame);
+
+ return 0;
+}
+
+
+int
+afr_rmdir (call_frame_t *frame, xlator_t *this,
+ loc_t *loc)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ call_frame_t * transaction_frame = NULL;
+
+ int ret = -1;
+
+ int op_ret = -1;
+ int op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "out of memory :(");
+ goto out;
+ }
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ ret = AFR_LOCAL_INIT (local, priv);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ transaction_frame->local = local;
+
+ loc_copy (&local->loc, loc);
+
+ local->transaction.fop = afr_rmdir_wind;
+ local->transaction.done = afr_rmdir_done;
+ local->transaction.unwind = afr_rmdir_unwind;
+
+ afr_build_parent_loc (&local->transaction.parent_loc, loc);
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME (loc->path);
+ local->transaction.pending = AFR_ENTRY_PENDING;
+
+ afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (frame, op_ret, op_errno);
+ }
+
+ return 0;
+}
+
+/* }}} */
+
+/* {{{ setdents */
+
+int32_t
+afr_setdents_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int call_count = -1;
+ int child_index = (long) cookie;
+
+ local = frame->local;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ if (child_went_down (op_ret, op_errno))
+ afr_transaction_child_died (frame, this, child_index);
+
+ if ((op_ret != -1) && (local->success_count == 0)) {
+ local->op_ret = op_ret;
+ local->success_count++;
+ }
+
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+
+int32_t
+afr_setdents_wind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ int call_count = -1;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ call_count = afr_up_children_count (priv->child_count, local->child_up);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_setdents_wind_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->setdents,
+ local->fd, local->cont.setdents.flags,
+ local->cont.setdents.entries,
+ local->cont.setdents.count);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+int32_t
+afr_setdents_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = frame->local;
+
+ AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+
+ return 0;
+}
+
+
+int32_t
+afr_setdents (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, int32_t flags, dir_entry_t *entries, int32_t count)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+
+ int ret = -1;
+
+ int op_ret = -1;
+ int op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ ret = AFR_LOCAL_INIT (local, priv);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ frame->local = local;
+
+ local->fd = fd_ref (fd);
+
+ local->cont.setdents.flags = flags;
+ local->cont.setdents.entries = entries;
+ local->cont.setdents.count = count;
+
+ local->transaction.fop = afr_setdents_wind;
+ local->transaction.done = afr_setdents_done;
+
+ local->transaction.basename = NULL;
+ local->transaction.pending = AFR_ENTRY_PENDING;
+
+ afr_transaction (frame, this, AFR_ENTRY_TRANSACTION);
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ AFR_STACK_UNWIND (frame, op_ret, op_errno);
+ }
+
+ return 0;
+}
+
+/* }}} */
diff --git a/xlators/cluster/afr/src/afr-dir-write.h b/xlators/cluster/afr/src/afr-dir-write.h
new file mode 100644
index 00000000000..e6e8a5e797c
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-dir-write.h
@@ -0,0 +1,59 @@
+/*
+ Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __DIR_WRITE_H__
+#define __DIR_WRITE_H__
+
+int32_t
+afr_create (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int32_t flags, mode_t mode, fd_t *fd);
+
+int32_t
+afr_mknod (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, mode_t mode, dev_t dev);
+
+int32_t
+afr_mkdir (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, mode_t mode);
+
+int32_t
+afr_unlink (call_frame_t *frame, xlator_t *this,
+ loc_t *loc);
+
+int32_t
+afr_rmdir (call_frame_t *frame, xlator_t *this,
+ loc_t *loc);
+
+int32_t
+afr_link (call_frame_t *frame, xlator_t *this,
+ loc_t *oldloc, loc_t *newloc);
+
+int32_t
+afr_rename (call_frame_t *frame, xlator_t *this,
+ loc_t *oldloc, loc_t *newloc);
+
+int32_t
+afr_symlink (call_frame_t *frame, xlator_t *this,
+ const char *linkpath, loc_t *oldloc);
+
+int32_t
+afr_setdents (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, int32_t flags, dir_entry_t *entries, int32_t count);
+
+#endif /* __DIR_WRITE_H__ */
diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c
new file mode 100644
index 00000000000..a6c99ec0576
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-inode-read.c
@@ -0,0 +1,721 @@
+/*
+ Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+
+#include <libgen.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include <sys/time.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "afr.h"
+#include "dict.h"
+#include "xlator.h"
+#include "hashfn.h"
+#include "logging.h"
+#include "stack.h"
+#include "list.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "common-utils.h"
+#include "compat-errno.h"
+#include "compat.h"
+
+#include "afr.h"
+
+
+/**
+ * Common algorithm for inode read calls:
+ *
+ * - Try the fop on the first child that is up
+ * - if we have failed due to ENOTCONN:
+ * try the next child
+ *
+ * Applicable to: access, stat, fstat, readlink, getxattr
+ */
+
+/* {{{ access */
+
+int32_t
+afr_access_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ xlator_t ** children = NULL;
+
+ int unwind = 1;
+ int last_tried = -1;
+ int this_try = -1;
+
+ priv = this->private;
+ children = priv->children;
+
+ local = frame->local;
+
+ if (op_ret == -1) {
+ last_tried = local->cont.access.last_tried;
+
+ if (all_tried (last_tried, priv->child_count)) {
+ goto out;
+ }
+ this_try = ++local->cont.access.last_tried;
+
+ unwind = 0;
+
+ STACK_WIND_COOKIE (frame, afr_access_cbk,
+ (void *) (long) this_try,
+ children[this_try],
+ children[this_try]->fops->access,
+ &local->loc, local->cont.access.mask);
+ }
+
+out:
+ if (unwind) {
+ AFR_STACK_UNWIND (frame, op_ret, op_errno);
+ }
+
+ return 0;
+}
+
+
+int32_t
+afr_access (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int32_t mask)
+{
+ afr_private_t * priv = NULL;
+ xlator_t ** children = NULL;
+ int call_child = 0;
+ afr_local_t *local = NULL;
+
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+ VALIDATE_OR_GOTO (priv->children, out);
+
+ children = priv->children;
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ call_child = afr_first_up_child (priv);
+ if (call_child == -1) {
+ op_errno = ENOTCONN;
+ gf_log (this->name, GF_LOG_ERROR,
+ "no child is up :(");
+ goto out;
+ }
+
+ local->cont.access.last_tried = call_child;
+ loc_copy (&local->loc, loc);
+ local->cont.access.mask = mask;
+
+ STACK_WIND_COOKIE (frame, afr_access_cbk,
+ (void *) (long) call_child,
+ children[call_child], children[call_child]->fops->access,
+ loc, mask);
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ AFR_STACK_UNWIND (frame, op_ret, op_errno);
+ }
+ return 0;
+}
+
+
+/* }}} */
+
+/* {{{ stat */
+
+int32_t
+afr_stat_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ struct stat *buf)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ xlator_t ** children = NULL;
+
+ int deitransform_child = -1;
+
+ int unwind = 1;
+ int last_tried = -1;
+ int this_try = -1;
+
+ priv = this->private;
+ children = priv->children;
+
+ deitransform_child = (long) cookie;
+
+ local = frame->local;
+
+ if (op_ret == -1) {
+ retry:
+ last_tried = local->cont.stat.last_tried;
+
+ if (all_tried (last_tried, priv->child_count)) {
+ goto out;
+ }
+ this_try = ++local->cont.stat.last_tried;
+
+ if (this_try == deitransform_child) {
+ goto retry;
+ }
+
+ unwind = 0;
+
+ STACK_WIND_COOKIE (frame, afr_stat_cbk,
+ (void *) (long) deitransform_child,
+ children[this_try],
+ children[this_try]->fops->stat,
+ &local->loc);
+ }
+
+out:
+ if (unwind) {
+ if (op_ret != -1)
+ buf->st_ino = local->cont.stat.ino;
+
+ AFR_STACK_UNWIND (frame, op_ret, op_errno, buf);
+ }
+
+ return 0;
+}
+
+
+int32_t
+afr_stat (call_frame_t *frame, xlator_t *this,
+ loc_t *loc)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ xlator_t ** children = NULL;
+
+ int call_child = 0;
+
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+ VALIDATE_OR_GOTO (priv->children, out);
+
+ children = priv->children;
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ frame->local = local;
+
+ call_child = afr_deitransform (loc->inode->ino, priv->child_count);
+ loc_copy (&local->loc, loc);
+
+ /*
+ if stat fails from the deitranform'd child, we try
+ all children starting with the first one
+ */
+ local->cont.stat.last_tried = -1;
+ local->cont.stat.ino = loc->inode->ino;
+
+ STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) call_child,
+ children[call_child],
+ children[call_child]->fops->stat,
+ loc);
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+ }
+
+ return 0;
+}
+
+
+/* }}} */
+
+/* {{{ fstat */
+
+int32_t
+afr_fstat_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ struct stat *buf)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ xlator_t ** children = NULL;
+
+ int deitransform_child = -1;
+
+ int unwind = 1;
+ int last_tried = -1;
+ int this_try = -1;
+
+ priv = this->private;
+ children = priv->children;
+
+ deitransform_child = (long) cookie;
+
+ local = frame->local;
+
+ if (op_ret == -1) {
+ retry:
+ last_tried = local->cont.fstat.last_tried;
+
+ if (all_tried (last_tried, priv->child_count)) {
+ goto out;
+ }
+ this_try = ++local->cont.fstat.last_tried;
+
+ if (this_try == deitransform_child) {
+ /*
+ skip the deitransform'd child since if we are here
+ we must have already tried that child
+ */
+ goto retry;
+ }
+
+
+ unwind = 0;
+
+ STACK_WIND_COOKIE (frame, afr_fstat_cbk,
+ (void *) (long) deitransform_child,
+ children[this_try],
+ children[this_try]->fops->fstat,
+ local->fd);
+ }
+
+out:
+ if (unwind) {
+ if (op_ret != -1)
+ buf->st_ino = local->cont.fstat.ino;
+
+ AFR_STACK_UNWIND (frame, op_ret, op_errno, buf);
+ }
+
+ return 0;
+}
+
+
+int32_t
+afr_fstat (call_frame_t *frame, xlator_t *this,
+ fd_t *fd)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ xlator_t ** children = NULL;
+
+ int call_child = 0;
+
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (fd, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+ VALIDATE_OR_GOTO (priv->children, out);
+
+ children = priv->children;
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ frame->local = local;
+
+ VALIDATE_OR_GOTO (fd->inode, out);
+
+ call_child = afr_deitransform (fd->inode->ino, priv->child_count);
+
+ /*
+ if fstat fails from the deitranform'd child, we try
+ all children starting with the first one
+ */
+ local->cont.fstat.last_tried = -1;
+ local->cont.fstat.ino = fd->inode->ino;
+ local->fd = fd_ref (fd);
+
+ STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) call_child,
+ children[call_child],
+ children[call_child]->fops->fstat,
+ fd);
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+ }
+
+ return 0;
+}
+
+/* }}} */
+
+/* {{{ readlink */
+
+int32_t
+afr_readlink_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ const char *buf)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ xlator_t ** children = NULL;
+
+ int unwind = 1;
+ int last_tried = -1;
+ int this_try = -1;
+
+ priv = this->private;
+ children = priv->children;
+
+ local = frame->local;
+
+ if (op_ret == -1) {
+ last_tried = local->cont.readlink.last_tried;
+
+ if (all_tried (last_tried, priv->child_count)) {
+ goto out;
+ }
+ this_try = ++local->cont.readlink.last_tried;
+
+ unwind = 0;
+ STACK_WIND_COOKIE (frame, afr_readlink_cbk,
+ (void *) (long) this_try,
+ children[this_try],
+ children[this_try]->fops->readlink,
+ &local->loc,
+ local->cont.readlink.size);
+ }
+
+out:
+ if (unwind) {
+ AFR_STACK_UNWIND (frame, op_ret, op_errno, buf);
+ }
+
+ return 0;
+}
+
+
+int32_t
+afr_readlink (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, size_t size)
+{
+ afr_private_t * priv = NULL;
+ xlator_t ** children = NULL;
+ int call_child = 0;
+ afr_local_t *local = NULL;
+
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+ VALIDATE_OR_GOTO (priv->children, out);
+
+ children = priv->children;
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ frame->local = local;
+
+ call_child = afr_first_up_child (priv);
+ if (call_child == -1) {
+ op_errno = ENOTCONN;
+ gf_log (this->name, GF_LOG_ERROR,
+ "no child is up :(");
+ goto out;
+ }
+
+ local->cont.readlink.last_tried = call_child;
+ loc_copy (&local->loc, loc);
+ local->cont.readlink.size = size;
+
+ STACK_WIND_COOKIE (frame, afr_readlink_cbk,
+ (void *) (long) call_child,
+ children[call_child], children[call_child]->fops->readlink,
+ loc, size);
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+ }
+ return 0;
+}
+
+
+/* }}} */
+
+/* {{{ getxattr */
+
+int32_t
+afr_getxattr_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ dict_t *dict)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ xlator_t ** children = NULL;
+
+ int unwind = 1;
+ int last_tried = -1;
+ int this_try = -1;
+
+ priv = this->private;
+ children = priv->children;
+
+ local = frame->local;
+
+ if (op_ret == -1) {
+ last_tried = local->cont.getxattr.last_tried;
+
+ if (all_tried (last_tried, priv->child_count)) {
+ goto out;
+ }
+ this_try = ++local->cont.getxattr.last_tried;
+
+ unwind = 0;
+ STACK_WIND_COOKIE (frame, afr_getxattr_cbk,
+ (void *) (long) this_try,
+ children[this_try],
+ children[this_try]->fops->getxattr,
+ &local->loc,
+ local->cont.getxattr.name);
+ }
+
+out:
+ if (unwind) {
+ AFR_STACK_UNWIND (frame, op_ret, op_errno, dict);
+ }
+
+ return 0;
+}
+
+
+int32_t
+afr_getxattr (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, const char *name)
+{
+ afr_private_t * priv = NULL;
+ xlator_t ** children = NULL;
+ int call_child = 0;
+ afr_local_t * local = NULL;
+
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+ VALIDATE_OR_GOTO (priv->children, out);
+
+ children = priv->children;
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+ frame->local = local;
+
+ call_child = afr_first_up_child (priv);
+ if (call_child == -1) {
+ op_errno = ENOTCONN;
+ gf_log (this->name, GF_LOG_ERROR,
+ "no child is up :(");
+ goto out;
+ }
+
+ local->cont.getxattr.last_tried = call_child;
+ loc_copy (&local->loc, loc);
+ if (name)
+ local->cont.getxattr.name = strdup (name);
+
+ STACK_WIND_COOKIE (frame, afr_getxattr_cbk,
+ (void *) (long) call_child,
+ children[call_child], children[call_child]->fops->getxattr,
+ loc, name);
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+ }
+ return 0;
+}
+
+
+/* }}} */
+
+/* {{{ readv */
+
+/**
+ * read algorithm:
+ *
+ * if the user has specified a read subvolume, use it
+ * otherwise -
+ * use the inode number to hash it to one of the subvolumes, and
+ * read from there (to balance read load)
+ *
+ * if any of the above read's fail, try the children in sequence
+ * beginning at the beginning
+ */
+
+int32_t
+afr_readv_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ struct iovec *vector, int32_t count, struct stat *buf)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ xlator_t ** children = NULL;
+
+ int unwind = 1;
+ int last_tried = -1;
+ int this_try = -1;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+ VALIDATE_OR_GOTO (priv->children, out);
+
+ children = priv->children;
+
+ local = frame->local;
+
+ if (op_ret == -1) {
+ retry:
+ last_tried = local->cont.readv.last_tried;
+
+ if (all_tried (last_tried, priv->child_count)) {
+ goto out;
+ }
+ this_try = ++local->cont.readv.last_tried;
+
+ if (this_try == priv->read_child) {
+ /*
+ skip the read child since if we are here
+ we must have already tried that child
+ */
+ goto retry;
+ }
+
+ unwind = 0;
+
+ STACK_WIND_COOKIE (frame, afr_readv_cbk,
+ (void *) (long) this_try,
+ children[this_try],
+ children[this_try]->fops->readv,
+ local->fd, local->cont.readv.size,
+ local->cont.readv.offset);
+ }
+
+out:
+ if (unwind) {
+ AFR_STACK_UNWIND (frame, op_ret, op_errno, vector, count, buf);
+ }
+
+ return 0;
+}
+
+
+int32_t
+afr_readv (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, size_t size, off_t offset)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ xlator_t ** children = NULL;
+
+ int call_child = 0;
+
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+ VALIDATE_OR_GOTO (fd, out);
+
+ priv = this->private;
+ children = priv->children;
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ frame->local = local;
+
+ if (priv->read_child != -1) {
+ call_child = priv->read_child;
+
+ /*
+ if read fails from the read child, we try
+ all children starting with the first one
+ */
+ local->cont.readv.last_tried = -1;
+ } else {
+ call_child = afr_first_up_child (priv);
+ if (call_child == -1) {
+ op_errno = ENOTCONN;
+ gf_log (this->name, GF_LOG_ERROR,
+ "no child is up :(");
+ goto out;
+ }
+
+ local->cont.readv.last_tried = call_child;
+ }
+
+ local->fd = fd_ref (fd);
+
+ local->cont.readv.size = size;
+ local->cont.readv.offset = offset;
+
+ STACK_WIND_COOKIE (frame, afr_readv_cbk,
+ (void *) (long) call_child,
+ children[call_child],
+ children[call_child]->fops->readv,
+ fd, size, offset);
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL, 0, NULL);
+ }
+ return 0;
+}
+
+/* }}} */
diff --git a/xlators/cluster/afr/src/afr-inode-read.h b/xlators/cluster/afr/src/afr-inode-read.h
new file mode 100644
index 00000000000..6b3bd2da850
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-inode-read.h
@@ -0,0 +1,47 @@
+/*
+ Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __INODE_READ_H__
+#define __INODE_READ_H__
+
+int32_t
+afr_access (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int32_t mask);
+
+int32_t
+afr_stat (call_frame_t *frame, xlator_t *this,
+ loc_t *loc);
+
+int32_t
+afr_fstat (call_frame_t *frame, xlator_t *this,
+ fd_t *fd);
+
+int32_t
+afr_readlink (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, size_t size);
+
+int32_t
+afr_readv (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, size_t size, off_t offset);
+
+int32_t
+afr_getxattr (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, const char *name);
+
+#endif /* __INODE_READ_H__ */
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
new file mode 100644
index 00000000000..267350b2c4a
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -0,0 +1,2024 @@
+/*
+ Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+
+#include <libgen.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include <sys/time.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "afr.h"
+#include "dict.h"
+#include "xlator.h"
+#include "hashfn.h"
+#include "logging.h"
+#include "stack.h"
+#include "list.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "common-utils.h"
+#include "compat-errno.h"
+#include "compat.h"
+
+#include "afr.h"
+#include "afr-transaction.h"
+
+
+/* {{{ chmod */
+
+
+int
+afr_chmod_unwind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+ call_frame_t *main_frame = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ if (local->transaction.main_frame)
+ main_frame = local->transaction.main_frame;
+ local->transaction.main_frame = NULL;
+ }
+ UNLOCK (&frame->lock);
+
+ if (main_frame) {
+ local->cont.chmod.buf.st_ino = local->cont.chmod.ino;
+ AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
+ &local->cont.chmod.buf);
+ }
+ return 0;
+}
+
+
+int
+afr_chmod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int call_count = -1;
+ int child_index = (long) cookie;
+ int need_unwind = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ if (child_went_down (op_ret, op_errno))
+ afr_transaction_child_died (frame, this, child_index);
+
+ if (op_ret != -1) {
+ if (local->success_count == 0) {
+ local->op_ret = op_ret;
+ local->cont.chmod.buf = *buf;
+ }
+ local->success_count++;
+
+ if (local->success_count == priv->wait_count) {
+ need_unwind = 1;
+ }
+ }
+
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ if (need_unwind)
+ afr_chmod_unwind (frame, this);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+
+int
+afr_chmod_wind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int i = 0;
+ int call_count = -1;
+
+ local = frame->local;
+ priv = this->private;
+
+ call_count = afr_up_children_count (priv->child_count, local->child_up);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_chmod_wind_cbk, (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->chmod,
+ &local->loc,
+ local->cont.chmod.mode);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+int
+afr_chmod_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = NULL;
+
+ local = frame->local;
+
+ local->transaction.unwind (frame, this);
+
+ AFR_STACK_DESTROY (frame);
+
+ return 0;
+}
+
+
+int32_t
+afr_chmod (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, mode_t mode)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ call_frame_t * transaction_frame = NULL;
+
+ int ret = -1;
+
+ int op_ret = -1;
+ int op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "out of memory :(");
+ goto out;
+ }
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+ ret = AFR_LOCAL_INIT (local, priv);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ transaction_frame->local = local;
+
+ local->cont.chmod.mode = mode;
+ local->cont.chmod.ino = loc->inode->ino;
+
+ local->transaction.fop = afr_chmod_wind;
+ local->transaction.done = afr_chmod_done;
+ local->transaction.unwind = afr_chmod_unwind;
+
+ loc_copy (&local->loc, loc);
+
+ local->transaction.main_frame = frame;
+ local->transaction.start = 0;
+ local->transaction.len = 0;
+ local->transaction.pending = AFR_METADATA_PENDING;
+
+ afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+ }
+
+ return 0;
+}
+
+/* }}} */
+
+
+/* {{{ fchmod */
+
+int
+afr_fchmod_unwind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+ call_frame_t *main_frame = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ if (local->transaction.main_frame)
+ main_frame = local->transaction.main_frame;
+ local->transaction.main_frame = NULL;
+ }
+ UNLOCK (&frame->lock);
+
+ if (main_frame) {
+ local->cont.fchmod.buf.st_ino = local->cont.fchmod.ino;
+ AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
+ &local->cont.fchmod.buf);
+ }
+ return 0;
+}
+
+
+int
+afr_fchmod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int call_count = -1;
+ int child_index = (long) cookie;
+ int need_unwind = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ if (child_went_down (op_ret, op_errno))
+ afr_transaction_child_died (frame, this, child_index);
+
+ if (op_ret != -1) {
+ if (local->success_count == 0) {
+ local->op_ret = op_ret;
+ local->cont.fchmod.buf = *buf;
+ }
+ local->success_count++;
+
+ if (local->success_count == priv->wait_count) {
+ need_unwind = 1;
+ }
+ }
+
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ if (need_unwind)
+ afr_fchmod_unwind (frame, this);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+
+int
+afr_fchmod_wind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int i = 0;
+ int call_count = -1;
+
+ local = frame->local;
+ priv = this->private;
+
+ call_count = afr_up_children_count (priv->child_count, local->child_up);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_fchmod_wind_cbk, (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->fchmod,
+ local->fd,
+ local->cont.fchmod.mode);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+int
+afr_fchmod_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = NULL;
+
+ local = frame->local;
+
+ local->transaction.unwind (frame, this);
+
+ AFR_STACK_DESTROY (frame);
+
+ return 0;
+}
+
+
+int32_t
+afr_fchmod (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, mode_t mode)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ call_frame_t * transaction_frame = NULL;
+
+ int ret = -1;
+
+ int op_ret = -1;
+ int op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "out of memory :(");
+ goto out;
+ }
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+ ret = AFR_LOCAL_INIT (local, priv);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ transaction_frame->local = local;
+
+ local->cont.fchmod.mode = mode;
+ local->cont.fchmod.ino = fd->inode->ino;
+
+ local->transaction.fop = afr_fchmod_wind;
+ local->transaction.done = afr_fchmod_done;
+ local->transaction.unwind = afr_fchmod_unwind;
+
+ local->fd = fd_ref (fd);
+
+ local->transaction.main_frame = frame;
+ local->transaction.start = 0;
+ local->transaction.len = 0;
+ local->transaction.pending = AFR_METADATA_PENDING;
+
+ afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+ }
+
+ return 0;
+}
+
+/* }}} */
+
+/* {{{ chown */
+
+int
+afr_chown_unwind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+ call_frame_t *main_frame = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ if (local->transaction.main_frame)
+ main_frame = local->transaction.main_frame;
+ local->transaction.main_frame = NULL;
+ }
+ UNLOCK (&frame->lock);
+
+ if (main_frame) {
+ local->cont.chown.buf.st_ino = local->cont.chown.ino;
+ AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
+ &local->cont.chown.buf);
+ }
+ return 0;
+}
+
+
+int
+afr_chown_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int call_count = -1;
+ int child_index = (long) cookie;
+ int need_unwind = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ if (child_went_down (op_ret, op_errno))
+ afr_transaction_child_died (frame, this, child_index);
+
+ if (op_ret != -1) {
+ if (local->success_count == 0) {
+ local->op_ret = op_ret;
+ local->cont.chown.buf = *buf;
+ }
+ local->success_count++;
+
+ if (local->success_count == priv->wait_count) {
+ need_unwind = 1;
+ }
+ }
+
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ if (need_unwind) {
+ local->transaction.unwind (frame, this);
+ }
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+
+int
+afr_chown_wind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int call_count = -1;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ call_count = afr_up_children_count (priv->child_count, local->child_up);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_chown_wind_cbk, (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->chown,
+ &local->loc, local->cont.chown.uid,
+ local->cont.chown.gid);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+int
+afr_chown_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ local->transaction.unwind (frame, this);
+
+ AFR_STACK_DESTROY (frame);
+
+ return 0;
+}
+
+
+int
+afr_chown (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, uid_t uid, gid_t gid)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ call_frame_t *transaction_frame = NULL;
+
+ int ret = -1;
+
+ int op_ret = -1;
+ int op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "out of memory :(");
+ goto out;
+ }
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ ret = AFR_LOCAL_INIT (local, priv);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ transaction_frame->local = local;
+
+ local->cont.chown.uid = uid;
+ local->cont.chown.gid = gid;
+ local->cont.chown.ino = loc->inode->ino;
+
+ local->transaction.fop = afr_chown_wind;
+ local->transaction.done = afr_chown_done;
+ local->transaction.unwind = afr_chown_unwind;
+
+ loc_copy (&local->loc, loc);
+
+ local->transaction.main_frame = frame;
+ local->transaction.start = 0;
+ local->transaction.len = 0;
+ local->transaction.pending = AFR_METADATA_PENDING;
+
+ afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+ }
+
+ return 0;
+}
+
+
+/* }}} */
+
+/* {{{ chown */
+
+int
+afr_fchown_unwind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+ call_frame_t *main_frame = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ if (local->transaction.main_frame)
+ main_frame = local->transaction.main_frame;
+ local->transaction.main_frame = NULL;
+ }
+ UNLOCK (&frame->lock);
+
+ if (main_frame) {
+ local->cont.fchown.buf.st_ino = local->cont.fchown.ino;
+ AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
+ &local->cont.fchown.buf);
+ }
+ return 0;
+}
+
+
+int
+afr_fchown_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int call_count = -1;
+ int child_index = (long) cookie;
+ int need_unwind = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ if (child_went_down (op_ret, op_errno))
+ afr_transaction_child_died (frame, this, child_index);
+
+ if (op_ret != -1) {
+ if (local->success_count == 0) {
+ local->op_ret = op_ret;
+ local->cont.fchown.buf = *buf;
+ }
+ local->success_count++;
+
+ if (local->success_count == priv->wait_count) {
+ need_unwind = 1;
+ }
+ }
+
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ if (need_unwind) {
+ local->transaction.unwind (frame, this);
+ }
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+
+int
+afr_fchown_wind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int call_count = -1;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ call_count = afr_up_children_count (priv->child_count, local->child_up);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_fchown_wind_cbk, (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->fchown,
+ local->fd, local->cont.fchown.uid,
+ local->cont.fchown.gid);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+int
+afr_fchown_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ local->transaction.unwind (frame, this);
+
+ AFR_STACK_DESTROY (frame);
+
+ return 0;
+}
+
+
+int
+afr_fchown (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, uid_t uid, gid_t gid)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ call_frame_t *transaction_frame = NULL;
+
+ int ret = -1;
+
+ int op_ret = -1;
+ int op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "out of memory :(");
+ goto out;
+ }
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ ret = AFR_LOCAL_INIT (local, priv);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ transaction_frame->local = local;
+
+ local->cont.fchown.uid = uid;
+ local->cont.fchown.gid = gid;
+ local->cont.fchown.ino = fd->inode->ino;
+
+ local->transaction.fop = afr_fchown_wind;
+ local->transaction.done = afr_fchown_done;
+ local->transaction.unwind = afr_fchown_unwind;
+
+ local->fd = fd_ref (fd);
+
+ local->transaction.main_frame = frame;
+ local->transaction.start = 0;
+ local->transaction.len = 0;
+ local->transaction.pending = AFR_METADATA_PENDING;
+
+ afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+ }
+
+ return 0;
+}
+
+/* }}} */
+
+/* {{{ writev */
+
+int
+afr_writev_unwind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+ call_frame_t *main_frame = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ if (local->transaction.main_frame)
+ main_frame = local->transaction.main_frame;
+ local->transaction.main_frame = NULL;
+ }
+ UNLOCK (&frame->lock);
+
+ if (main_frame) {
+ local->cont.writev.buf.st_ino = local->cont.writev.ino;
+ AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
+ &local->cont.writev.buf);
+ }
+ return 0;
+}
+
+
+int
+afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int child_index = (long) cookie;
+ int call_count = -1;
+ int need_unwind = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ if (child_went_down (op_ret, op_errno))
+ afr_transaction_child_died (frame, this, child_index);
+
+ if (op_ret != -1) {
+ if (local->success_count == 0) {
+ local->op_ret = op_ret;
+ local->cont.writev.buf = *buf;
+ }
+ local->success_count++;
+
+ if (local->success_count == priv->wait_count) {
+ need_unwind = 1;
+ }
+ }
+
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.unwind (frame, this);
+
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+
+int
+afr_writev_wind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ int i = 0;
+ int call_count = -1;
+
+ local = frame->local;
+ priv = this->private;
+
+ call_count = afr_up_children_count (priv->child_count, local->child_up);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_writev_wind_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->writev,
+ local->fd,
+ local->cont.writev.vector,
+ local->cont.writev.count,
+ local->cont.writev.offset);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+int
+afr_writev_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (local->cont.writev.refs)
+ dict_unref (local->cont.writev.refs);
+ local->cont.writev.refs = NULL;
+
+ local->transaction.unwind (frame, this);
+
+ AFR_STACK_DESTROY (frame);
+
+ return 0;
+}
+
+
+int
+afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iovec *vector, int32_t count, off_t offset)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ call_frame_t *transaction_frame = NULL;
+
+ int ret = -1;
+
+ int op_ret = -1;
+ int op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "out of memory :(");
+ goto out;
+ }
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ ret = AFR_LOCAL_INIT (local, priv);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ transaction_frame->local = local;
+
+ local->op = GF_FOP_WRITE;
+ local->cont.writev.vector = iov_dup (vector, count);
+ local->cont.writev.count = count;
+ local->cont.writev.offset = offset;
+ local->cont.writev.ino = fd->inode->ino;
+
+ if (frame->root->req_refs)
+ local->cont.writev.refs = dict_ref (frame->root->req_refs);
+
+ local->transaction.fop = afr_writev_wind;
+ local->transaction.done = afr_writev_done;
+ local->transaction.unwind = afr_writev_unwind;
+
+ local->fd = fd_ref (fd);
+
+ local->transaction.main_frame = frame;
+ if (fd->flags & O_APPEND) {
+ local->transaction.start = 0;
+ local->transaction.len = 0;
+ } else {
+ local->transaction.start = offset;
+ local->transaction.len = iov_length (vector, count);
+ }
+
+ local->transaction.pending = AFR_DATA_PENDING;
+
+ afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+ }
+
+ return 0;
+}
+
+
+/* }}} */
+
+/* {{{ truncate */
+
+int
+afr_truncate_unwind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+ call_frame_t *main_frame = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ if (local->transaction.main_frame)
+ main_frame = local->transaction.main_frame;
+ local->transaction.main_frame = NULL;
+ }
+ UNLOCK (&frame->lock);
+
+ if (main_frame) {
+ local->cont.truncate.buf.st_ino = local->cont.truncate.ino;
+ AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
+ &local->cont.truncate.buf);
+ }
+ return 0;
+}
+
+
+int
+afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int child_index = (long) cookie;
+ int call_count = -1;
+ int need_unwind = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ if (child_went_down (op_ret, op_errno))
+ afr_transaction_child_died (frame, this, child_index);
+
+ if (op_ret != -1) {
+ if (local->success_count == 0) {
+ local->op_ret = op_ret;
+ local->cont.truncate.buf = *buf;
+ }
+ local->success_count++;
+
+ if (local->success_count == priv->wait_count) {
+ need_unwind = 1;
+ }
+ }
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ if (need_unwind)
+ local->transaction.unwind (frame, this);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+
+int32_t
+afr_truncate_wind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ int call_count = -1;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ call_count = afr_up_children_count (priv->child_count, local->child_up);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->truncate,
+ &local->loc,
+ local->cont.truncate.offset);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+int
+afr_truncate_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ local->transaction.unwind (frame, this);
+
+ AFR_STACK_DESTROY (frame);
+
+ return 0;
+}
+
+
+int
+afr_truncate (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, off_t offset)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ call_frame_t *transaction_frame = NULL;
+
+ int ret = -1;
+
+ int op_ret = -1;
+ int op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "out of memory :(");
+ goto out;
+ }
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ ret = AFR_LOCAL_INIT (local, priv);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ transaction_frame->local = local;
+
+ local->op_ret = -1;
+
+ local->cont.truncate.offset = offset;
+ local->cont.truncate.ino = loc->inode->ino;
+
+ local->transaction.fop = afr_truncate_wind;
+ local->transaction.done = afr_truncate_done;
+ local->transaction.unwind = afr_truncate_unwind;
+
+ loc_copy (&local->loc, loc);
+
+ local->transaction.main_frame = frame;
+ local->transaction.start = 0;
+ local->transaction.len = offset;
+ local->transaction.pending = AFR_DATA_PENDING;
+
+ afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+ }
+
+ return 0;
+}
+
+
+/* }}} */
+
+/* {{{ ftruncate */
+
+
+int
+afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+ call_frame_t *main_frame = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ if (local->transaction.main_frame)
+ main_frame = local->transaction.main_frame;
+ local->transaction.main_frame = NULL;
+ }
+ UNLOCK (&frame->lock);
+
+ if (main_frame) {
+ local->cont.ftruncate.buf.st_ino = local->cont.ftruncate.ino;
+ AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
+ &local->cont.ftruncate.buf);
+ }
+ return 0;
+}
+
+
+int
+afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int child_index = (long) cookie;
+ int call_count = -1;
+ int need_unwind = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ if (child_went_down (op_ret, op_errno))
+ afr_transaction_child_died (frame, this, child_index);
+
+ if (op_ret != -1) {
+ if (local->success_count == 0) {
+ local->op_ret = op_ret;
+ local->cont.ftruncate.buf = *buf;
+ }
+ local->success_count++;
+
+ if (local->success_count == priv->wait_count) {
+ need_unwind = 1;
+ }
+ }
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ if (need_unwind)
+ local->transaction.unwind (frame, this);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+
+int
+afr_ftruncate_wind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ int call_count = -1;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ call_count = afr_up_children_count (priv->child_count, local->child_up);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->ftruncate,
+ local->fd, local->cont.ftruncate.offset);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+int
+afr_ftruncate_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ local->transaction.unwind (frame, this);
+
+ AFR_STACK_DESTROY (frame);
+
+ return 0;
+}
+
+
+int
+afr_ftruncate (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, off_t offset)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ call_frame_t *transaction_frame = NULL;
+
+ int ret = -1;
+
+ int op_ret = -1;
+ int op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "out of memory :(");
+ goto out;
+ }
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ ret = AFR_LOCAL_INIT (local, priv);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ transaction_frame->local = local;
+
+ local->op = GF_FOP_FTRUNCATE;
+ local->op_ret = -1;
+
+ local->cont.ftruncate.offset = offset;
+ local->cont.ftruncate.ino = fd->inode->ino;
+
+ local->transaction.fop = afr_ftruncate_wind;
+ local->transaction.done = afr_ftruncate_done;
+ local->transaction.unwind = afr_ftruncate_unwind;
+
+ local->fd = fd_ref (fd);
+
+ local->transaction.main_frame = frame;
+ local->transaction.start = 0;
+ local->transaction.len = offset;
+ local->transaction.pending = AFR_DATA_PENDING;
+
+ afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+ }
+
+ return 0;
+}
+
+/* }}} */
+
+/* {{{ utimens */
+
+
+int
+afr_utimens_unwind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+ call_frame_t *main_frame = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ if (local->transaction.main_frame)
+ main_frame = local->transaction.main_frame;
+ local->transaction.main_frame = NULL;
+ }
+ UNLOCK (&frame->lock);
+
+ if (main_frame) {
+ local->cont.utimens.buf.st_ino = local->cont.utimens.ino;
+ AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
+ &local->cont.utimens.buf);
+ }
+ return 0;
+}
+
+
+int
+afr_utimens_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int child_index = (long) cookie;
+ int call_count = -1;
+ int need_unwind = 1;
+
+ local = frame->local;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ if (child_went_down (op_ret, op_errno))
+ afr_transaction_child_died (frame, this, child_index);
+
+ if (op_ret != -1) {
+ if (local->success_count == 0) {
+ local->op_ret = op_ret;
+ local->cont.utimens.buf = *buf;
+ }
+ local->success_count++;
+
+ if (local->success_count == priv->wait_count) {
+ need_unwind = 1;
+ }
+ }
+
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ if (need_unwind)
+ local->transaction.unwind (frame, this);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+
+int
+afr_utimens_wind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ int call_count = -1;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ call_count = afr_up_children_count (priv->child_count, local->child_up);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_utimens_wind_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->utimens,
+ &local->loc,
+ local->cont.utimens.tv);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+int
+afr_utimens_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = NULL;
+
+ local = frame->local;
+
+ local->transaction.unwind (frame, this);
+
+ AFR_STACK_DESTROY (frame);
+
+ return 0;
+}
+
+
+int
+afr_utimens (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, struct timespec tv[2])
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ call_frame_t *transaction_frame = NULL;
+
+ int ret = -1;
+
+ int op_ret = -1;
+ int op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "out of memory :(");
+ goto out;
+ }
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ ret = AFR_LOCAL_INIT (local, priv);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ transaction_frame->local = local;
+
+ local->op_ret = -1;
+
+ local->cont.utimens.tv[0] = tv[0];
+ local->cont.utimens.tv[1] = tv[1];
+
+ local->cont.utimens.ino = loc->inode->ino;
+
+ local->transaction.fop = afr_utimens_wind;
+ local->transaction.done = afr_utimens_done;
+ local->transaction.unwind = afr_utimens_unwind;
+
+ loc_copy (&local->loc, loc);
+
+ local->transaction.main_frame = frame;
+ local->transaction.start = 0;
+ local->transaction.len = 0;
+ local->transaction.pending = AFR_METADATA_PENDING;
+
+ afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+ }
+
+ return 0;
+}
+
+/* }}} */
+
+/* {{{ setxattr */
+
+
+int
+afr_setxattr_unwind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+ call_frame_t *main_frame = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ if (local->transaction.main_frame)
+ main_frame = local->transaction.main_frame;
+ local->transaction.main_frame = NULL;
+ }
+ UNLOCK (&frame->lock);
+
+ if (main_frame) {
+ AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno)
+ }
+ return 0;
+}
+
+
+int
+afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int call_count = -1;
+ int need_unwind = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret != -1) {
+ if (local->success_count == 0) {
+ local->op_ret = op_ret;
+ }
+ local->success_count++;
+
+ if (local->success_count == priv->wait_count) {
+ need_unwind = 1;
+ }
+ }
+
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ if (need_unwind)
+ local->transaction.unwind (frame, this);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+
+int
+afr_setxattr_wind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ int call_count = -1;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ call_count = afr_up_children_count (priv->child_count, local->child_up);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->setxattr,
+ &local->loc,
+ local->cont.setxattr.dict,
+ local->cont.setxattr.flags);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+int
+afr_setxattr_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = frame->local;
+
+ local->transaction.unwind (frame, this);
+
+ AFR_STACK_DESTROY (frame);
+
+ return 0;
+}
+
+
+int
+afr_setxattr (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, dict_t *dict, int32_t flags)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ call_frame_t *transaction_frame = NULL;
+
+ int ret = -1;
+
+ int op_ret = -1;
+ int op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "out of memory :(");
+ goto out;
+ }
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ ret = AFR_LOCAL_INIT (local, priv);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ transaction_frame->local = local;
+
+ local->op_ret = -1;
+
+ local->cont.setxattr.dict = dict_ref (dict);
+ local->cont.setxattr.flags = flags;
+
+ local->transaction.fop = afr_setxattr_wind;
+ local->transaction.done = afr_setxattr_done;
+ local->transaction.unwind = afr_setxattr_unwind;
+
+ loc_copy (&local->loc, loc);
+
+ local->transaction.main_frame = frame;
+ local->transaction.start = 0;
+ local->transaction.len = 0;
+ local->transaction.pending = AFR_METADATA_PENDING;
+
+ afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (frame, op_ret, op_errno);
+ }
+
+ return 0;
+}
+
+/* }}} */
+
+/* {{{ removexattr */
+
+
+int
+afr_removexattr_unwind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+ call_frame_t *main_frame = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ if (local->transaction.main_frame)
+ main_frame = local->transaction.main_frame;
+ local->transaction.main_frame = NULL;
+ }
+ UNLOCK (&frame->lock);
+
+ if (main_frame) {
+ AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno)
+ }
+ return 0;
+}
+
+
+int
+afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int call_count = -1;
+ int need_unwind = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret != -1) {
+ if (local->success_count == 0) {
+ local->op_ret = op_ret;
+ }
+ local->success_count++;
+
+ if (local->success_count == priv->wait_count) {
+ need_unwind = 1;
+ }
+ }
+
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ if (need_unwind)
+ local->transaction.unwind (frame, this);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+
+int32_t
+afr_removexattr_wind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ int call_count = -1;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ call_count = afr_up_children_count (priv->child_count, local->child_up);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->removexattr,
+ &local->loc,
+ local->cont.removexattr.name);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+int
+afr_removexattr_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = frame->local;
+
+ local->transaction.unwind (frame, this);
+
+ AFR_STACK_DESTROY (frame);
+
+ return 0;
+}
+
+
+int
+afr_removexattr (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, const char *name)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ call_frame_t *transaction_frame = NULL;
+
+ int ret = -1;
+
+ int op_ret = -1;
+ int op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+ VALIDATE_OR_GOTO (loc, out);
+
+ priv = this->private;
+
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "out of memory :(");
+ goto out;
+ }
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ ret = AFR_LOCAL_INIT (local, priv);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ transaction_frame->local = local;
+
+ local->op_ret = -1;
+
+ local->cont.removexattr.name = strdup (name);
+
+ local->transaction.fop = afr_removexattr_wind;
+ local->transaction.done = afr_removexattr_done;
+ local->transaction.unwind = afr_removexattr_unwind;
+
+ loc_copy (&local->loc, loc);
+
+ local->transaction.main_frame = frame;
+ local->transaction.start = 0;
+ local->transaction.len = 0;
+ local->transaction.pending = AFR_METADATA_PENDING;
+
+ afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (frame, op_ret, op_errno);
+ }
+
+ return 0;
+}
diff --git a/xlators/cluster/afr/src/afr-inode-write.h b/xlators/cluster/afr/src/afr-inode-write.h
new file mode 100644
index 00000000000..9c0b5cad314
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-inode-write.h
@@ -0,0 +1,63 @@
+/*
+ Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __INODE_WRITE_H__
+#define __INODE_WRITE_H__
+
+int32_t
+afr_chmod (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, mode_t mode);
+
+int32_t
+afr_chown (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, uid_t uid, gid_t gid);
+
+int
+afr_fchown (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, uid_t uid, gid_t gid);
+
+int32_t
+afr_fchmod (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, mode_t mode);
+
+int32_t
+afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iovec *vector, int32_t count, off_t offset);
+
+int32_t
+afr_truncate (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, off_t offset);
+
+int32_t
+afr_ftruncate (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, off_t offset);
+
+int32_t
+afr_utimens (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, struct timespec tv[2]);
+
+int32_t
+afr_setxattr (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, dict_t *dict, int32_t flags);
+
+int32_t
+afr_removexattr (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, const char *name);
+
+#endif /* __INODE_WRITE_H__ */
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
new file mode 100644
index 00000000000..45d06516965
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -0,0 +1,1073 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#include "glusterfs.h"
+#include "xlator.h"
+#include "byte-order.h"
+
+#include "afr.h"
+#include "afr-transaction.h"
+#include "afr-self-heal-common.h"
+#include "afr-self-heal.h"
+
+
+/**
+ * select_source - select a source and return it
+ * TODO: take into account option 'favorite-child'
+ */
+
+int
+afr_sh_select_source (int sources[], int child_count)
+{
+ int i;
+ for (i = 0; i < child_count; i++)
+ if (sources[i])
+ return i;
+
+ return -1;
+}
+
+
+/**
+ * sink_count - return number of sinks in sources array
+ */
+
+int
+afr_sh_sink_count (int sources[], int child_count)
+{
+ int i;
+ int sinks = 0;
+ for (i = 0; i < child_count; i++)
+ if (!sources[i])
+ sinks++;
+ return sinks;
+}
+
+int
+afr_sh_source_count (int sources[], int child_count)
+{
+ int i;
+ int nsource = 0;
+
+ for (i = 0; i < child_count; i++)
+ if (sources[i])
+ nsource++;
+ return nsource;
+}
+
+
+int
+afr_sh_supress_errenous_children (int sources[], int child_errno[],
+ int child_count)
+{
+ int i = 0;
+
+ for (i = 0; i < child_count; i++) {
+ if (child_errno[i] && sources[i]) {
+ sources[i] = 0;
+ }
+ }
+
+ return 0;
+}
+
+
+int
+afr_sh_supress_empty_children (int sources[], dict_t *xattr[],
+ struct stat *buf,
+ int child_count, const char *key)
+{
+ int i = 0;
+ int32_t *pending = NULL;
+ int ret = 0;
+ int all_xattr_missing = 1;
+
+ /* if the file was created by afr with xattrs */
+ for (i = 0; i < child_count; i++) {
+ if (!xattr[i])
+ continue;
+
+ ret = dict_get_ptr (xattr[i], (char *)key, VOID(&pending));
+ if (ret != 0) {
+ continue;
+ }
+
+ all_xattr_missing = 0;
+ break;
+ }
+
+ if (all_xattr_missing) {
+ /* supress 0byte files.. this avoids empty file created
+ by dir selfheal to overwrite the 'good' file */
+ for (i = 0; i < child_count; i++) {
+ if (!buf[i].st_size)
+ sources[i] = 0;
+ }
+ goto out;
+ }
+
+
+ for (i = 0; i < child_count; i++) {
+ if (!xattr[i]) {
+ sources[i] = 0;
+ continue;
+ }
+
+ ret = dict_get_ptr (xattr[i], (char *)key, VOID(&pending));
+ if (ret != 0) {
+ sources[i] = 0;
+ continue;
+ }
+
+ if (!pending) {
+ sources[i] = 0;
+ continue;
+ }
+ }
+
+out:
+ return 0;
+}
+
+
+void
+afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this)
+{
+ afr_private_t * priv = this->private;
+
+ char *buf = NULL;
+ char *ptr = NULL;
+
+ int i, j;
+
+ /* 10 digits per entry + 1 space + '[' and ']' */
+ buf = MALLOC (priv->child_count * 11 + 8);
+
+ for (i = 0; i < priv->child_count; i++) {
+ ptr = buf;
+ ptr += sprintf (ptr, "[ ");
+ for (j = 0; j < priv->child_count; j++) {
+ ptr += sprintf (ptr, "%d ", pending_matrix[i][j]);
+ }
+ ptr += sprintf (ptr, "]");
+ gf_log (this->name, GF_LOG_DEBUG,
+ "pending_matrix: %s", buf);
+ }
+
+ FREE (buf);
+}
+
+
+void
+afr_sh_build_pending_matrix (int32_t *pending_matrix[], dict_t *xattr[],
+ int child_count, const char *key)
+{
+ int i = 0;
+ int j = 0;
+ int32_t *pending = NULL;
+ int ret = -1;
+
+ /* start clean */
+ for (i = 0; i < child_count; i++) {
+ for (j = 0; j < child_count; j++) {
+ pending_matrix[i][j] = 0;
+ }
+ }
+
+ for (i = 0; i < child_count; i++) {
+ if (!xattr[i])
+ continue;
+
+ pending = NULL;
+
+ ret = dict_get_ptr (xattr[i], (char *) key,
+ VOID(&pending));
+ if (ret != 0)
+ continue;
+
+ for (j = 0; j < child_count; j++) {
+ pending_matrix[i][j] = ntoh32 (pending[j]);
+ }
+ }
+}
+
+
+/**
+ * mark_sources: Mark all 'source' nodes and return number of source
+ * nodes found
+ */
+
+int
+afr_sh_mark_sources (int32_t *pending_matrix[], int sources[], int child_count)
+{
+ int i = 0;
+ int j = 0;
+
+ int nsources = 0;
+
+
+ /* start clean */
+ for (i = 0; i < child_count; i++) {
+ sources[i] = 0;
+ }
+
+ /*
+ Let's 'normalize' the pending matrix first,
+ by disregarding all pending entries that refer
+ to themselves
+ */
+ for (i = 0; i < child_count; i++) {
+ pending_matrix[i][i] = 0;
+ }
+
+ for (i = 0; i < child_count; i++) {
+ for (j = 0; j < child_count; j++) {
+ if (pending_matrix[j][i])
+ break;
+ }
+
+ if (j == child_count) {
+ nsources++;
+ sources[i] = 1;
+ }
+ }
+
+ return nsources;
+}
+
+
+void
+afr_sh_pending_to_delta (int32_t *pending_matrix[], int32_t *delta_matrix[],
+ int success[], int child_count)
+{
+ int i = 0;
+ int j = 0;
+
+ /* start clean */
+ for (i = 0; i < child_count; i++) {
+ for (j = 0; j < child_count; j++) {
+ delta_matrix[i][j] = 0;
+ }
+ }
+
+ for (i = 0; i < child_count; i++) {
+ for (j = 0; j < child_count; j++) {
+ if (!success[j])
+ continue;
+ delta_matrix[i][j] = -pending_matrix[i][j];
+ }
+ }
+}
+
+
+int
+afr_sh_delta_to_xattr (int32_t *delta_matrix[], dict_t *xattr[],
+ int child_count, const char *key)
+{
+ int i = 0;
+ int j = 0;
+
+ int ret = 0;
+
+ int32_t *pending = 0;
+
+ for (i = 0; i < child_count; i++) {
+ if (!xattr[i])
+ continue;
+
+ pending = CALLOC (sizeof (int32_t), child_count);
+ for (j = 0; j < child_count; j++) {
+ pending[j] = hton32 (delta_matrix[i][j]);
+ }
+
+ ret = dict_set_bin (xattr[i], (char *) key, pending,
+ child_count * sizeof (int32_t));
+ }
+
+ return 0;
+}
+
+
+int
+afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ int32_t *pending = NULL;
+ void *tmp_pending = NULL; /* This is required to remove 'type-punned' warnings from gcc */
+
+ int ret = -1;
+ int i = 0;
+
+ priv = this->private;
+
+ ret = dict_get_ptr (xattr, AFR_METADATA_PENDING, &tmp_pending);
+
+ if (ret != 0)
+ return 0;
+
+ pending = tmp_pending;
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == child_count)
+ continue;
+ if (pending[i])
+ return 1;
+ }
+
+ return 0;
+}
+
+
+int
+afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ int32_t *pending = NULL;
+ void *tmp_pending = NULL; /* This is required to remove 'type-punned' warnings from gcc */
+
+ int ret = -1;
+ int i = 0;
+
+ priv = this->private;
+
+ ret = dict_get_ptr (xattr, AFR_DATA_PENDING, &tmp_pending);
+
+ if (ret != 0)
+ return 0;
+
+ pending = tmp_pending;
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == child_count)
+ continue;
+ if (pending[i])
+ return 1;
+ }
+
+ return 0;
+}
+
+
+int
+afr_sh_has_entry_pending (dict_t *xattr, int child_count, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ int32_t *pending = NULL;
+ void *tmp_pending = NULL; /* This is required to remove 'type-punned' warnings from gcc */
+
+ int ret = -1;
+ int i = 0;
+
+ priv = this->private;
+
+ ret = dict_get_ptr (xattr, AFR_ENTRY_PENDING, &tmp_pending);
+
+ if (ret != 0)
+ return 0;
+
+ pending = tmp_pending;
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == child_count)
+ continue;
+ if (pending[i])
+ return 1;
+ }
+
+ return 0;
+}
+
+
+
+/**
+ * is_matrix_zero - return true if pending matrix is all zeroes
+ */
+
+int
+afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count)
+{
+ int i, j;
+
+ for (i = 0; i < child_count; i++)
+ for (j = 0; j < child_count; j++)
+ if (pending_matrix[i][j])
+ return 0;
+ return 1;
+}
+
+
+int
+afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+// memset (sh->child_errno, 0, sizeof (int) * priv->child_count);
+ memset (sh->buf, 0, sizeof (struct stat) * priv->child_count);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (sh->xattr[i])
+ dict_unref (sh->xattr[i]);
+ sh->xattr[i] = NULL;
+ }
+
+ if (local->govinda_gOvinda) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "aborting selfheal of %s",
+ local->loc.path);
+ sh->completion_cbk (frame, this);
+ } else {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "proceeding to metadata check on %s",
+ local->loc.path);
+ afr_self_heal_metadata (frame, this);
+ }
+
+ return 0;
+}
+
+
+int
+sh_missing_entries_unlck_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this,
+ int32_t op_ret, int32_t op_errno)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ afr_sh_missing_entries_done (frame, this);
+ }
+
+ return 0;
+}
+
+
+static int
+sh_missing_entries_finish (call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+ int call_count = 0;
+ afr_self_heal_t *sh = NULL;
+
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ call_count = local->child_count;
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "unlocking %"PRId64"/%s on subvolume %s",
+ sh->parent_loc.inode->ino, local->loc.name,
+ priv->children[i]->name);
+
+ STACK_WIND (frame, sh_missing_entries_unlck_cbk,
+ priv->children[i],
+ priv->children[i]->fops->entrylk,
+ &sh->parent_loc, local->loc.name,
+ ENTRYLK_UNLOCK, ENTRYLK_WRLCK);
+
+ if (!--call_count)
+ break;
+ }
+ }
+ return 0;
+}
+
+
+static int
+sh_destroy_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int op_errno, struct stat *stbuf)
+{
+ STACK_DESTROY (frame->root);
+ return 0;
+}
+
+
+static int
+sh_missing_entries_newentry_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ inode_t *inode, struct stat *stbuf)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ call_frame_t *chown_frame = NULL;
+ int call_count = 0;
+ int child_index = 0;
+ struct stat *buf = NULL;
+
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ buf = &sh->buf[sh->source];
+ child_index = (long) cookie;
+
+ if (op_ret == 0) {
+ chown_frame = copy_frame (frame);
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "chown %s to %d %d on subvolume %s",
+ local->loc.path, buf->st_uid, buf->st_gid,
+ priv->children[child_index]->name);
+
+ STACK_WIND (chown_frame, sh_destroy_cbk,
+ priv->children[child_index],
+ priv->children[child_index]->fops->chown,
+ &local->loc,
+ buf->st_uid, buf->st_gid);
+ }
+
+ LOCK (&frame->lock);
+ {
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ sh_missing_entries_finish (frame, this);
+ }
+
+ return 0;
+}
+
+
+static int
+sh_missing_entries_mknod (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int enoent_count = 0;
+ int call_count = 0;
+ mode_t st_mode = 0;
+ dev_t st_dev = 0;
+
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++)
+ if (sh->child_errno[i] == ENOENT)
+ enoent_count++;
+
+ call_count = enoent_count;
+ local->call_count = call_count;
+
+ st_mode = sh->buf[sh->source].st_mode;
+ st_dev = sh->buf[sh->source].st_dev;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "mknod %s mode 0%o on %d subvolumes",
+ local->loc.path, st_mode, enoent_count);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (sh->child_errno[i] == ENOENT) {
+ STACK_WIND_COOKIE (frame,
+ sh_missing_entries_newentry_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->mknod,
+ &local->loc, st_mode, st_dev);
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+static int
+sh_missing_entries_mkdir (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int enoent_count = 0;
+ int call_count = 0;
+ mode_t st_mode = 0;
+
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++)
+ if (sh->child_errno[i] == ENOENT)
+ enoent_count++;
+
+ call_count = enoent_count;
+ local->call_count = call_count;
+
+ st_mode = sh->buf[sh->source].st_mode;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "mkdir %s mode 0%o on %d subvolumes",
+ local->loc.path, st_mode, enoent_count);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (sh->child_errno[i] == ENOENT) {
+ STACK_WIND_COOKIE (frame,
+ sh_missing_entries_newentry_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->mkdir,
+ &local->loc, st_mode);
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+static int
+sh_missing_entries_symlink (call_frame_t *frame, xlator_t *this,
+ const char *link)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int enoent_count = 0;
+ int call_count = 0;
+
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++)
+ if (sh->child_errno[i] == ENOENT)
+ enoent_count++;
+
+ call_count = enoent_count;
+ local->call_count = call_count;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "symlink %s -> %s on %d subvolumes",
+ local->loc.path, link, enoent_count);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (sh->child_errno[i] == ENOENT) {
+ STACK_WIND_COOKIE (frame,
+ sh_missing_entries_newentry_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->symlink,
+ link, &local->loc);
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+static int
+sh_missing_entries_readlink_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ const char *link)
+{
+ if (op_ret > 0)
+ sh_missing_entries_symlink (frame, this, link);
+ else
+ sh_missing_entries_finish (frame, this);
+
+ return 0;
+}
+
+
+static int
+sh_missing_entries_readlink (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ STACK_WIND (frame, sh_missing_entries_readlink_cbk,
+ priv->children[sh->source],
+ priv->children[sh->source]->fops->readlink,
+ &local->loc, 4096);
+
+ return 0;
+}
+
+
+static int
+sh_missing_entries_create (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ int type = 0;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ int enoent_count = 0;
+ int govinda_gOvinda = 0;
+
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (sh->child_errno[i]) {
+ if (sh->child_errno[i] == ENOENT)
+ enoent_count++;
+ } else {
+ if (type) {
+ if (type != (sh->buf[i].st_mode & S_IFMT))
+ govinda_gOvinda = 1;
+ } else {
+ sh->source = i;
+ type = sh->buf[i].st_mode & S_IFMT;
+ }
+ }
+ }
+
+ if (govinda_gOvinda) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "conflicing filetypes exist for path %s. returning.",
+ local->loc.path);
+
+ local->govinda_gOvinda = 1;
+ sh_missing_entries_finish (frame, this);
+ return 0;
+ }
+
+ if (!type) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no source found for %s. all nodes down?. returning.",
+ local->loc.path);
+ /* subvolumes down and/or file does not exist */
+ sh_missing_entries_finish (frame, this);
+ return 0;
+ }
+
+ if (enoent_count == 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no missing files - %s. proceeding to metadata check",
+ local->loc.path);
+ /* proceed to next step - metadata self-heal */
+ sh_missing_entries_finish (frame, this);
+ return 0;
+ }
+
+ switch (type) {
+ case S_IFSOCK:
+ case S_IFREG:
+ case S_IFBLK:
+ case S_IFCHR:
+ case S_IFIFO:
+ sh_missing_entries_mknod (frame, this);
+ break;
+ case S_IFLNK:
+ sh_missing_entries_readlink (frame, this);
+ break;
+ case S_IFDIR:
+ sh_missing_entries_mkdir (frame, this);
+ break;
+ default:
+ gf_log (this->name, GF_LOG_ERROR,
+ "unknown file type: 0%o", type);
+ local->govinda_gOvinda = 1;
+ sh_missing_entries_finish (frame, this);
+ }
+
+ return 0;
+}
+
+
+static int
+sh_missing_entries_lookup_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ inode_t *inode, struct stat *buf, dict_t *xattr)
+{
+ int child_index = 0;
+ afr_local_t *local = NULL;
+ int call_count = 0;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ child_index = (long) cookie;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "path %s on subvolume %s is of mode 0%o",
+ local->loc.path,
+ priv->children[child_index]->name,
+ buf->st_mode);
+
+ local->self_heal.buf[child_index] = *buf;
+ } else {
+ gf_log (this->name, GF_LOG_WARNING,
+ "path %s on subvolume %s => -1 (%s)",
+ local->loc.path,
+ priv->children[child_index]->name,
+ strerror (op_errno));
+
+ local->self_heal.child_errno[child_index] = op_errno;
+ }
+
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ sh_missing_entries_create (frame, this);
+ }
+
+ return 0;
+}
+
+
+static int
+sh_missing_entries_lookup (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ int i = 0;
+ int call_count = 0;
+ afr_private_t *priv = NULL;
+ dict_t *xattr_req = NULL;
+ int ret = -1;
+
+ local = frame->local;
+ call_count = local->child_count;
+ priv = this->private;
+
+ local->call_count = call_count;
+
+ xattr_req = dict_new();
+
+ if (xattr_req)
+ ret = dict_set_uint64 (xattr_req, AFR_ENTRY_PENDING,
+ priv->child_count * sizeof(int32_t));
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "looking up %s on subvolume %s",
+ local->loc.path, priv->children[i]->name);
+
+ STACK_WIND_COOKIE (frame,
+ sh_missing_entries_lookup_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->lookup,
+ &local->loc, xattr_req);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ if (xattr_req)
+ dict_unref (xattr_req);
+
+ return 0;
+}
+
+
+static int
+sh_missing_entries_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ int call_count = 0;
+ int child_index = (long) cookie;
+
+
+ local = frame->local;
+ sh = &local->self_heal;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == -1) {
+ sh->op_failed = 1;
+
+ gf_log (this->name,
+ (op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR),
+ "locking inode of %s on child %d failed: %s",
+ local->loc.path, child_index,
+ strerror (op_errno));
+ } else {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "inode of %s on child %d locked",
+ local->loc.path, child_index);
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ if (sh->op_failed == 1) {
+ sh_missing_entries_finish (frame, this);
+ return 0;
+ }
+
+ sh_missing_entries_lookup (frame, this);
+ }
+
+ return 0;
+}
+
+
+static int
+afr_self_heal_missing_entries (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int call_count = 0;
+
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "attempting to recreate missing entries for path=%s",
+ local->loc.path);
+
+ afr_build_parent_loc (&sh->parent_loc, &local->loc);
+
+ call_count = local->child_count;
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND (frame, sh_missing_entries_lk_cbk,
+ priv->children[i],
+ priv->children[i]->fops->entrylk,
+ &sh->parent_loc, local->loc.name,
+ ENTRYLK_LOCK_NB, ENTRYLK_WRLCK);
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+int
+afr_self_heal (call_frame_t *frame, xlator_t *this,
+ int (*completion_cbk) (call_frame_t *, xlator_t *))
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "performing self heal on %s (metadata=%d data=%d entry=%d)",
+ local->loc.path,
+ local->need_metadata_self_heal,
+ local->need_data_self_heal,
+ local->need_entry_self_heal);
+
+ sh->completion_cbk = completion_cbk;
+
+ sh->buf = CALLOC (priv->child_count, sizeof (struct stat));
+ sh->child_errno = CALLOC (priv->child_count, sizeof (int));
+ sh->success = CALLOC (priv->child_count, sizeof (int));
+ sh->xattr = CALLOC (priv->child_count, sizeof (dict_t *));
+ sh->sources = CALLOC (sizeof (*sh->sources), priv->child_count);
+
+ sh->pending_matrix = CALLOC (sizeof (int32_t *), priv->child_count);
+ for (i = 0; i < priv->child_count; i++) {
+ sh->pending_matrix[i] = CALLOC (sizeof (int32_t),
+ priv->child_count);
+ }
+
+ sh->delta_matrix = CALLOC (sizeof (int32_t *), priv->child_count);
+ for (i = 0; i < priv->child_count; i++) {
+ sh->delta_matrix[i] = CALLOC (sizeof (int32_t),
+ priv->child_count);
+ }
+
+ if (local->success_count && local->enoent_count) {
+ afr_self_heal_missing_entries (frame, this);
+ } else {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "proceeding to metadata check on %s",
+ local->loc.path);
+ afr_sh_missing_entries_done (frame, this);
+ }
+
+ return 0;
+}
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h
new file mode 100644
index 00000000000..9dd597f0787
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-self-heal-common.h
@@ -0,0 +1,66 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __AFR_SELF_HEAL_COMMON_H__
+#define __AFR_SELF_HEAL_COMMON_H__
+
+#define FILE_HAS_HOLES(buf) (((buf)->st_size) > ((buf)->st_blocks * 512))
+
+int
+afr_sh_select_source (int sources[], int child_count);
+
+int
+afr_sh_sink_count (int sources[], int child_count);
+
+int
+afr_sh_source_count (int sources[], int child_count);
+
+int
+afr_sh_supress_errenous_children (int sources[], int child_errno[],
+ int child_count);
+
+int
+afr_sh_supress_empty_children (int sources[], dict_t *xattr[],
+ struct stat *buf,
+ int child_count, const char *key);
+
+void
+afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this);
+
+void
+afr_sh_build_pending_matrix (int32_t *pending_matrix[], dict_t *xattr[],
+ int child_count, const char *key);
+
+void
+afr_sh_pending_to_delta (int32_t *pending_matrix[], int32_t *delta_matrix[],
+ int32_t success[], int child_count);
+
+int
+afr_sh_mark_sources (int32_t *pending_matrix[], int sources[],
+ int child_count);
+
+int
+afr_sh_delta_to_xattr (int32_t *delta_matrix[], dict_t *xattr[],
+ int child_count, const char *key);
+
+int
+afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count);
+
+
+#endif /* __AFR_SELF_HEAL_COMMON_H__ */
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
new file mode 100644
index 00000000000..3a48da48587
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -0,0 +1,1030 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#include <libgen.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include <sys/time.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "afr.h"
+#include "dict.h"
+#include "xlator.h"
+#include "hashfn.h"
+#include "logging.h"
+#include "stack.h"
+#include "list.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "common-utils.h"
+#include "compat-errno.h"
+#include "compat.h"
+#include "byte-order.h"
+
+#include "afr-transaction.h"
+#include "afr-self-heal.h"
+#include "afr-self-heal-common.h"
+
+
+
+int
+afr_sh_data_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ /*
+ TODO: cleanup sh->*
+ */
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "self heal of %s completed",
+ local->loc.path);
+
+ sh->completion_cbk (frame, this);
+
+ return 0;
+}
+
+
+int
+afr_sh_data_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_self_heal_t *sh = NULL;
+ int call_count = 0;
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ fd_unref (sh->healing_fd);
+ sh->healing_fd = NULL;
+ afr_sh_data_done (frame, this);
+ }
+
+ return 0;
+}
+
+
+int
+afr_sh_data_close (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_self_heal_t *sh = NULL;
+ int i = 0;
+ int call_count = 0;
+
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ if (!sh->healing_fd) {
+ afr_sh_data_done (frame, this);
+ return 0;
+ }
+
+ call_count = sh->active_sinks + 1;
+ local->call_count = call_count;
+
+
+ /* closed source */
+ gf_log (this->name, GF_LOG_DEBUG,
+ "closing fd of %s on %s",
+ local->loc.path, priv->children[sh->source]->name);
+
+ STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk,
+ (void *) (long) sh->source,
+ priv->children[sh->source],
+ priv->children[sh->source]->fops->flush,
+ sh->healing_fd);
+ call_count--;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (sh->sources[i] || !local->child_up[i])
+ continue;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "closing fd of %s on %s",
+ local->loc.path, priv->children[i]->name);
+
+ STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->flush,
+ sh->healing_fd);
+ if (!--call_count)
+ break;
+ }
+
+ return 0;
+}
+
+
+int
+afr_sh_data_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno)
+{
+ afr_local_t * local = NULL;
+ int call_count = 0;
+ int child_index = (long) cookie;
+
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "locking inode of %s on child %d failed: %s",
+ local->loc.path, child_index,
+ strerror (op_errno));
+ } else {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "inode of %s on child %d locked",
+ local->loc.path, child_index);
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ afr_sh_data_close (frame, this);
+ }
+
+ return 0;
+}
+
+
+int
+afr_sh_data_unlock (call_frame_t *frame, xlator_t *this)
+{
+ struct flock flock;
+ int i = 0;
+ int call_count = 0;
+
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+ afr_self_heal_t * sh = NULL;
+
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ call_count = local->child_count;
+
+ local->call_count = call_count;
+
+ flock.l_start = 0;
+ flock.l_len = 0;
+ flock.l_type = F_UNLCK;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "unlocking %s on subvolume %s",
+ local->loc.path, priv->children[i]->name);
+
+ STACK_WIND_COOKIE (frame, afr_sh_data_unlck_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->inodelk,
+ &local->loc, F_SETLK, &flock);
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+int
+afr_sh_data_finish (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "finishing data selfheal of %s", local->loc.path);
+
+ afr_sh_data_unlock (frame, this);
+
+ return 0;
+}
+
+
+int
+afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xattr)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ afr_sh_data_finish (frame, this);
+
+ return 0;
+}
+
+
+int
+afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+ int i = 0;
+ dict_t **erase_xattr = NULL;
+
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+
+ afr_sh_pending_to_delta (sh->pending_matrix, sh->delta_matrix,
+ sh->success, priv->child_count);
+
+ erase_xattr = CALLOC (sizeof (*erase_xattr), priv->child_count);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (sh->xattr[i]) {
+ call_count++;
+
+ erase_xattr[i] = get_new_dict();
+ dict_ref (erase_xattr[i]);
+ }
+ }
+
+ afr_sh_delta_to_xattr (sh->delta_matrix, erase_xattr,
+ priv->child_count, AFR_DATA_PENDING);
+
+ local->call_count = call_count;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!erase_xattr[i])
+ continue;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "erasing pending flags from %s on %s",
+ local->loc.path, priv->children[i]->name);
+
+ STACK_WIND_COOKIE (frame, afr_sh_data_erase_pending_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->loc,
+ GF_XATTROP_ADD_ARRAY, erase_xattr[i]);
+ if (!--call_count)
+ break;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (erase_xattr[i]) {
+ dict_unref (erase_xattr[i]);
+ }
+ }
+ FREE (erase_xattr);
+
+ return 0;
+}
+
+
+int
+afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ afr_self_heal_t *sh = NULL;
+ int call_count = 0;
+ int child_index = 0;
+
+ priv = this->private;
+ local = frame->local;
+ sh = &local->self_heal;
+
+ child_index = (long) cookie;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == -1)
+ gf_log (this->name, GF_LOG_ERROR,
+ "ftruncate of %s on subvolume %s failed (%s)",
+ local->loc.path,
+ priv->children[child_index]->name,
+ strerror (op_errno));
+ else
+ gf_log (this->name, GF_LOG_DEBUG,
+ "ftruncate of %s on subvolume %s completed",
+ local->loc.path,
+ priv->children[child_index]->name);
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ afr_sh_data_erase_pending (frame, this);
+ }
+
+ return 0;
+}
+
+
+int
+afr_sh_data_trim_sinks (call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ afr_self_heal_t *sh = NULL;
+ int *sources = NULL;
+ int call_count = 0;
+ int i = 0;
+
+
+ priv = this->private;
+ local = frame->local;
+ sh = &local->self_heal;
+
+ sources = sh->sources;
+ call_count = sh->active_sinks;
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i] || !local->child_up[i])
+ continue;
+
+ STACK_WIND_COOKIE (frame, afr_sh_data_trim_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->ftruncate,
+ sh->healing_fd, sh->file_size);
+
+ if (!--call_count)
+ break;
+ }
+
+ return 0;
+}
+
+
+int
+afr_sh_data_read_write_iter (call_frame_t *frame, xlator_t *this);
+
+int
+afr_sh_data_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ afr_self_heal_t *sh = NULL;
+
+ int child_index = (long) cookie;
+ int call_count = 0;
+
+ priv = this->private;
+ local = frame->local;
+ sh = &local->self_heal;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "wrote %d bytes of data from %s to child %d, offset %"PRId64"",
+ op_ret, local->loc.path, child_index, sh->offset - op_ret);
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "write to %s failed on subvolume %s (%s)",
+ local->loc.path,
+ priv->children[child_index]->name,
+ strerror (op_errno));
+ sh->op_failed = 1;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ afr_sh_data_read_write_iter (frame, this);
+ }
+
+ return 0;
+}
+
+
+int
+afr_sh_data_read_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ struct iovec *vector, int32_t count, struct stat *buf)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ afr_self_heal_t *sh = NULL;
+
+ int child_index = (long) cookie;
+ int i = 0;
+ int call_count = 0;
+
+ off_t offset;
+
+ priv = this->private;
+ local = frame->local;
+ sh = &local->self_heal;
+
+ call_count = sh->active_sinks;
+
+ local->call_count = call_count;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "read %d bytes of data from %s on child %d, offset %"PRId64"",
+ op_ret, local->loc.path, child_index, sh->offset);
+
+ if (op_ret <= 0) {
+ afr_sh_data_trim_sinks (frame, this);
+ return 0;
+ }
+
+ /* what if we read less than block size? */
+ offset = sh->offset;
+ sh->offset += op_ret;
+
+ frame->root->req_refs = frame->root->rsp_refs;
+
+ if (sh->file_has_holes) {
+ if (iov_0filled (vector, count) == 0) {
+ /* the iter function depends on the
+ sh->offset already being updated
+ above
+ */
+ afr_sh_data_read_write_iter (frame, this);
+ goto out;
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (sh->sources[i] || !local->child_up[i])
+ continue;
+
+ /* this is a sink, so write to it */
+ STACK_WIND_COOKIE (frame, afr_sh_data_write_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->writev,
+ sh->healing_fd, vector, count, offset);
+
+ if (!--call_count)
+ break;
+ }
+
+out:
+ return 0;
+}
+
+
+int
+afr_sh_data_read_write (call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ afr_self_heal_t *sh = NULL;
+
+ priv = this->private;
+ local = frame->local;
+ sh = &local->self_heal;
+
+ STACK_WIND_COOKIE (frame, afr_sh_data_read_cbk,
+ (void *) (long) sh->source,
+ priv->children[sh->source],
+ priv->children[sh->source]->fops->readv,
+ sh->healing_fd, sh->block_size,
+ sh->offset);
+
+ return 0;
+}
+
+
+int
+afr_sh_data_read_write_iter (call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ afr_self_heal_t *sh = NULL;
+
+ priv = this->private;
+ local = frame->local;
+ sh = &local->self_heal;
+
+ if (sh->op_failed) {
+ afr_sh_data_finish (frame, this);
+ goto out;
+ }
+
+ if (sh->offset >= sh->file_size) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "closing fd's of %s",
+ local->loc.path);
+ afr_sh_data_trim_sinks (frame, this);
+
+ goto out;
+ }
+
+ afr_sh_data_read_write (frame, this);
+
+out:
+ return 0;
+}
+
+
+int
+afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+ int child_index = 0;
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ child_index = (long) cookie;
+
+ /* TODO: some of the open's might fail.
+ In that case, modify cleanup fn to send flush on those
+ fd's which are already open */
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "open of %s failed on child %s (%s)",
+ local->loc.path,
+ priv->children[child_index]->name,
+ strerror (op_errno));
+ sh->op_failed = 1;
+ }
+
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ if (sh->op_failed) {
+ afr_sh_data_finish (frame, this);
+ return 0;
+ }
+ gf_log (this->name, GF_LOG_DEBUG,
+ "fd for %s opened, commencing sync",
+ local->loc.path);
+
+ gf_log (this->name, GF_LOG_WARNING,
+ "sourcing file %s from %s to other sinks",
+ local->loc.path, priv->children[sh->source]->name);
+
+ afr_sh_data_read_write (frame, this);
+ }
+
+ return 0;
+}
+
+
+int
+afr_sh_data_open (call_frame_t *frame, xlator_t *this)
+{
+ int i = 0;
+ int call_count = 0;
+
+ int source = -1;
+ int *sources = NULL;
+
+ fd_t *fd = NULL;
+
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+ afr_self_heal_t *sh = NULL;
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ call_count = sh->active_sinks + 1;
+ local->call_count = call_count;
+
+ fd = fd_create (local->loc.inode, frame->root->pid);
+ sh->healing_fd = fd;
+
+ source = local->self_heal.source;
+ sources = local->self_heal.sources;
+
+ sh->block_size = 65536;
+ sh->file_size = sh->buf[source].st_size;
+
+ if (FILE_HAS_HOLES (&sh->buf[source]))
+ sh->file_has_holes = 1;
+
+ /* open source */
+ STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk,
+ (void *) (long) source,
+ priv->children[source],
+ priv->children[source]->fops->open,
+ &local->loc, O_RDONLY|O_LARGEFILE, fd);
+ call_count--;
+
+ /* open sinks */
+ for (i = 0; i < priv->child_count; i++) {
+ if(sources[i] || !local->child_up[i])
+ continue;
+
+ STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->open,
+ &local->loc,
+ O_WRONLY|O_LARGEFILE, fd);
+
+ if (!--call_count)
+ break;
+ }
+
+ return 0;
+}
+
+
+int
+afr_sh_data_sync_prepare (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int active_sinks = 0;
+ int source = 0;
+ int i = 0;
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ source = sh->source;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (sh->sources[i] == 0 && local->child_up[i] == 1) {
+ active_sinks++;
+ sh->success[i] = 1;
+ }
+ }
+ sh->success[source] = 1;
+
+ if (active_sinks == 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "no active sinks for performing self-heal on file %s",
+ local->loc.path);
+ afr_sh_data_finish (frame, this);
+ return 0;
+ }
+ sh->active_sinks = active_sinks;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "syncing data of %s from subvolume %s to %d active sinks",
+ local->loc.path, priv->children[source]->name, active_sinks);
+
+ afr_sh_data_open (frame, this);
+
+ return 0;
+}
+
+
+int
+afr_sh_data_fix (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int nsources = 0;
+ int source = 0;
+ int i = 0;
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ afr_sh_build_pending_matrix (sh->pending_matrix, sh->xattr,
+ priv->child_count, AFR_DATA_PENDING);
+
+ afr_sh_print_pending_matrix (sh->pending_matrix, this);
+
+
+ afr_sh_mark_sources (sh->pending_matrix, sh->sources,
+ priv->child_count);
+
+ afr_sh_supress_empty_children (sh->sources, sh->xattr, sh->buf,
+ priv->child_count, AFR_DATA_PENDING);
+
+ afr_sh_supress_errenous_children (sh->sources, sh->child_errno,
+ priv->child_count);
+
+ nsources = afr_sh_source_count (sh->sources, priv->child_count);
+
+ if ((nsources == 0)
+ && (priv->favorite_child != -1)
+ && (sh->child_errno[priv->favorite_child] == 0)) {
+
+ gf_log (this->name, GF_LOG_WARNING,
+ "Picking favorite child %s as authentic source to resolve conflicting data of %s",
+ priv->children[priv->favorite_child]->name,
+ local->loc.path);
+
+ sh->sources[priv->favorite_child] = 1;
+
+ nsources = afr_sh_source_count (sh->sources,
+ priv->child_count);
+ }
+
+ if (nsources == 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to resolve conflicting data of %s. "
+ "Please resolve manually by deleting the file %s "
+ "from all but the preferred subvolume. "
+ "Please consider 'option favorite-child <>'",
+ local->loc.path, local->loc.path);
+
+ local->govinda_gOvinda = 1;
+
+ afr_sh_data_finish (frame, this);
+ return 0;
+ }
+
+ source = afr_sh_select_source (sh->sources, priv->child_count);
+ sh->source = source;
+
+ /* detect changes not visible through pending flags -- JIC */
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == source || sh->child_errno[i])
+ continue;
+
+ if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[source]))
+ sh->sources[i] = 0;
+ }
+
+ afr_sh_data_sync_prepare (frame, this);
+
+ return 0;
+}
+
+
+int
+afr_sh_data_lookup_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ inode_t *inode, struct stat *buf, dict_t *xattr)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+
+ int call_count = -1;
+ int child_index = (long) cookie;
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret != -1) {
+ sh->xattr[child_index] = dict_ref (xattr);
+ sh->buf[child_index] = *buf;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ afr_sh_data_fix (frame, this);
+ }
+
+ return 0;
+}
+
+
+int
+afr_sh_data_lookup (call_frame_t *frame, xlator_t *this)
+{
+ afr_self_heal_t *sh = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ dict_t *xattr_req = NULL;
+
+ int call_count = 0;
+ int i = 0;
+ int ret = 0;
+
+ priv = this->private;
+ local = frame->local;
+ sh = &local->self_heal;
+
+ call_count = local->child_count;
+
+ local->call_count = call_count;
+
+ xattr_req = dict_new();
+ if (xattr_req)
+ ret = dict_set_uint64 (xattr_req, AFR_DATA_PENDING,
+ priv->child_count * sizeof(int32_t));
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_sh_data_lookup_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->lookup,
+ &local->loc, xattr_req);
+ if (!--call_count)
+ break;
+ }
+ }
+
+ if (xattr_req)
+ dict_unref (xattr_req);
+
+ return 0;
+}
+
+
+int
+afr_sh_data_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ int call_count = 0;
+ int child_index = (long) cookie;
+
+ /* TODO: what if lock fails? */
+
+ local = frame->local;
+ sh = &local->self_heal;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == -1) {
+ sh->op_failed = 1;
+
+ gf_log (this->name,
+ (op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR),
+ "locking of %s on child %d failed: %s",
+ local->loc.path, child_index,
+ strerror (op_errno));
+ } else {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "inode of %s on child %d locked",
+ local->loc.path, child_index);
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ if (sh->op_failed) {
+ afr_sh_data_finish (frame, this);
+ return 0;
+ }
+
+ afr_sh_data_lookup (frame, this);
+ }
+
+ return 0;
+}
+
+
+int
+afr_sh_data_lock (call_frame_t *frame, xlator_t *this)
+{
+ struct flock flock;
+ int i = 0;
+ int call_count = 0;
+
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+ afr_self_heal_t * sh = NULL;
+
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ call_count = local->child_count;
+
+ local->call_count = call_count;
+
+ flock.l_start = 0;
+ flock.l_len = 0;
+ flock.l_type = F_WRLCK;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "locking %s on subvolume %s",
+ local->loc.path, priv->children[i]->name);
+
+ STACK_WIND_COOKIE (frame, afr_sh_data_lock_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->inodelk,
+ &local->loc, F_SETLK, &flock);
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+int
+afr_self_heal_data (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = this->private;
+
+
+ local = frame->local;
+ sh = &local->self_heal;
+
+ if (local->need_data_self_heal && priv->data_self_heal) {
+ afr_sh_data_lock (frame, this);
+ } else {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "not doing data self heal on %s",
+ local->loc.path);
+ afr_sh_data_done (frame, this);
+ }
+
+ return 0;
+}
+
diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c
new file mode 100644
index 00000000000..ec341922ee7
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-self-heal-entry.c
@@ -0,0 +1,2038 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#include <libgen.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include <sys/time.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "afr.h"
+#include "dict.h"
+#include "xlator.h"
+#include "hashfn.h"
+#include "logging.h"
+#include "stack.h"
+#include "list.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "common-utils.h"
+#include "compat-errno.h"
+#include "compat.h"
+#include "byte-order.h"
+
+#include "afr-transaction.h"
+#include "afr-self-heal.h"
+#include "afr-self-heal-common.h"
+
+
+
+int
+afr_sh_entry_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ /*
+ TODO: cleanup sh->*
+ */
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "self heal of %s completed",
+ local->loc.path);
+
+ sh->completion_cbk (frame, this);
+
+ return 0;
+}
+
+
+int
+afr_sh_entry_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ int call_count = 0;
+ int child_index = (long) cookie;
+
+ /* TODO: what if lock fails? */
+
+ local = frame->local;
+ sh = &local->self_heal;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "unlocking inode of %s on child %d failed: %s",
+ local->loc.path, child_index,
+ strerror (op_errno));
+ } else {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "unlocked inode of %s on child %d",
+ local->loc.path, child_index);
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ if (sh->healing_fd)
+ fd_unref (sh->healing_fd);
+ sh->healing_fd = NULL;
+ afr_sh_entry_done (frame, this);
+ }
+
+ return 0;
+}
+
+
+int
+afr_sh_entry_unlock (call_frame_t *frame, xlator_t *this)
+{
+ int i = 0;
+ int call_count = 0;
+
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+ afr_self_heal_t * sh = NULL;
+
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ call_count = local->child_count;
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "unlocking %s on subvolume %s",
+ local->loc.path, priv->children[i]->name);
+
+ STACK_WIND_COOKIE (frame, afr_sh_entry_unlck_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->entrylk,
+ &local->loc, NULL,
+ ENTRYLK_UNLOCK, ENTRYLK_WRLCK);
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+int
+afr_sh_entry_finish (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "finishing entry selfheal of %s", local->loc.path);
+
+ afr_sh_entry_unlock (frame, this);
+
+ return 0;
+}
+
+
+int
+afr_sh_entry_erase_pending_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xattr)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ afr_sh_entry_finish (frame, this);
+
+ return 0;
+}
+
+
+int
+afr_sh_entry_erase_pending (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+ int i = 0;
+ dict_t **erase_xattr = NULL;
+
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+
+ afr_sh_pending_to_delta (sh->pending_matrix, sh->delta_matrix,
+ sh->success, priv->child_count);
+
+ erase_xattr = CALLOC (sizeof (*erase_xattr), priv->child_count);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (sh->xattr[i]) {
+ call_count++;
+
+ erase_xattr[i] = get_new_dict();
+ dict_ref (erase_xattr[i]);
+ }
+ }
+
+ afr_sh_delta_to_xattr (sh->delta_matrix, erase_xattr,
+ priv->child_count, AFR_ENTRY_PENDING);
+
+ local->call_count = call_count;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!erase_xattr[i])
+ continue;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "erasing pending flags from %s on %s",
+ local->loc.path, priv->children[i]->name);
+
+ STACK_WIND_COOKIE (frame, afr_sh_entry_erase_pending_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->loc,
+ GF_XATTROP_ADD_ARRAY, erase_xattr[i]);
+ if (!--call_count)
+ break;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (erase_xattr[i]) {
+ dict_unref (erase_xattr[i]);
+ }
+ }
+ FREE (erase_xattr);
+
+ return 0;
+}
+
+
+
+static int
+next_active_source (call_frame_t *frame, xlator_t *this,
+ int current_active_source)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ int source = -1;
+ int next_active_source = -1;
+ int i = 0;
+
+ priv = this->private;
+ local = frame->local;
+ sh = &local->self_heal;
+
+ source = sh->source;
+
+ if (source != -1) {
+ if (current_active_source != source)
+ next_active_source = source;
+ goto out;
+ }
+
+ /*
+ the next active sink becomes the source for the
+ 'conservative decision' of merging all entries
+ */
+
+ for (i = 0; i < priv->child_count; i++) {
+ if ((sh->sources[i] == 0)
+ && (local->child_up[i] == 1)
+ && (i > current_active_source)) {
+
+ next_active_source = i;
+ break;
+ }
+ }
+out:
+ return next_active_source;
+}
+
+
+
+static int
+next_active_sink (call_frame_t *frame, xlator_t *this,
+ int current_active_sink)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ int next_active_sink = -1;
+ int i = 0;
+
+ priv = this->private;
+ local = frame->local;
+ sh = &local->self_heal;
+
+ /*
+ the next active sink becomes the source for the
+ 'conservative decision' of merging all entries
+ */
+
+ for (i = 0; i < priv->child_count; i++) {
+ if ((sh->sources[i] == 0)
+ && (local->child_up[i] == 1)
+ && (i > current_active_sink)) {
+
+ next_active_sink = i;
+ break;
+ }
+ }
+
+ return next_active_sink;
+}
+
+
+int
+build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name)
+{
+ int ret = -1;
+
+ if (!child) {
+ goto out;
+ }
+
+ if (strcmp (parent->path, "/") == 0)
+ asprintf ((char **)&child->path, "/%s", name);
+ else
+ asprintf ((char **)&child->path, "%s/%s", parent->path, name);
+
+ if (!child->path) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "out of memory :(");
+ goto out;
+ }
+
+ child->name = strrchr (child->path, '/');
+ if (child->name)
+ child->name++;
+
+ child->parent = inode_ref (parent->inode);
+ child->inode = inode_new (parent->inode->table);
+
+ if (!child->inode) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "out of memory :(");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ if (ret == -1)
+ loc_wipe (child);
+
+ return ret;
+}
+
+
+int
+afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this);
+
+int
+afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this,
+ int active_src);
+
+int
+afr_sh_entry_expunge_entry_done (call_frame_t *frame, xlator_t *this,
+ int active_src)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ int call_count = 0;
+
+ priv = this->private;
+ local = frame->local;
+ sh = &local->self_heal;
+
+ LOCK (&frame->lock);
+ {
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ afr_sh_entry_expunge_subvol (frame, this, active_src);
+
+ return 0;
+}
+
+
+int
+afr_sh_entry_expunge_remove_cbk (call_frame_t *expunge_frame, void *cookie,
+ xlator_t *this,
+ int32_t op_ret, int32_t op_errno)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *expunge_local = NULL;
+ afr_self_heal_t *expunge_sh = NULL;
+ int active_src = 0;
+ call_frame_t *frame = NULL;
+
+
+ priv = this->private;
+ expunge_local = expunge_frame->local;
+ expunge_sh = &expunge_local->self_heal;
+ frame = expunge_sh->sh_frame;
+
+ active_src = (long) cookie;
+
+ if (op_ret == 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "removed %s on %s",
+ expunge_local->loc.path,
+ priv->children[active_src]->name);
+ } else {
+ gf_log (this->name, GF_LOG_ERROR,
+ "removing %s on %s failed (%s)",
+ expunge_local->loc.path,
+ priv->children[active_src]->name,
+ strerror (op_errno));
+ }
+
+ AFR_STACK_DESTROY (expunge_frame);
+ afr_sh_entry_expunge_entry_done (frame, this, active_src);
+
+ return 0;
+}
+
+
+int
+afr_sh_entry_expunge_rmdir (call_frame_t *expunge_frame, xlator_t *this,
+ int active_src)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *expunge_local = NULL;
+
+ priv = this->private;
+ expunge_local = expunge_frame->local;
+
+ gf_log (this->name, GF_LOG_WARNING,
+ "removing directory %s on %s",
+ expunge_local->loc.path, priv->children[active_src]->name);
+
+ STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk,
+ (void *) (long) active_src,
+ priv->children[active_src],
+ priv->children[active_src]->fops->rmdir,
+ &expunge_local->loc);
+
+ return 0;
+}
+
+
+int
+afr_sh_entry_expunge_unlink (call_frame_t *expunge_frame, xlator_t *this,
+ int active_src)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *expunge_local = NULL;
+
+ priv = this->private;
+ expunge_local = expunge_frame->local;
+
+ gf_log (this->name, GF_LOG_WARNING,
+ "unlinking file %s on %s",
+ expunge_local->loc.path, priv->children[active_src]->name);
+
+ STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk,
+ (void *) (long) active_src,
+ priv->children[active_src],
+ priv->children[active_src]->fops->unlink,
+ &expunge_local->loc);
+
+ return 0;
+}
+
+
+int
+afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this,
+ int active_src, struct stat *buf)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *expunge_local = NULL;
+ afr_self_heal_t *expunge_sh = NULL;
+ int source = 0;
+ call_frame_t *frame = NULL;
+ int type = 0;
+
+ priv = this->private;
+ expunge_local = expunge_frame->local;
+ expunge_sh = &expunge_local->self_heal;
+ frame = expunge_sh->sh_frame;
+ source = expunge_sh->source;
+
+ type = (buf->st_mode & S_IFMT);
+
+ switch (type) {
+ case S_IFSOCK:
+ case S_IFREG:
+ case S_IFBLK:
+ case S_IFCHR:
+ case S_IFIFO:
+ case S_IFLNK:
+ afr_sh_entry_expunge_unlink (expunge_frame, this, active_src);
+
+ break;
+ case S_IFDIR:
+ afr_sh_entry_expunge_rmdir (expunge_frame, this, active_src);
+ break;
+ default:
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s has unknown file type on %s: 0%o",
+ expunge_local->loc.path,
+ priv->children[source]->name, type);
+ goto out;
+ break;
+ }
+
+ return 0;
+out:
+ AFR_STACK_DESTROY (expunge_frame);
+ afr_sh_entry_expunge_entry_done (frame, this, active_src);
+
+ return 0;
+}
+
+
+int
+afr_sh_entry_expunge_lookup_cbk (call_frame_t *expunge_frame, void *cookie,
+ xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ inode_t *inode, struct stat *buf, dict_t *x)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *expunge_local = NULL;
+ afr_self_heal_t *expunge_sh = NULL;
+ call_frame_t *frame = NULL;
+ int active_src = 0;
+
+ priv = this->private;
+ expunge_local = expunge_frame->local;
+ expunge_sh = &expunge_local->self_heal;
+ frame = expunge_sh->sh_frame;
+ active_src = (long) cookie;
+
+ if (op_ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "lookup of %s on %s failed (%s)",
+ expunge_local->loc.path,
+ priv->children[active_src]->name,
+ strerror (op_errno));
+ goto out;
+ }
+
+ afr_sh_entry_expunge_remove (expunge_frame, this, active_src, buf);
+
+ return 0;
+out:
+ AFR_STACK_DESTROY (expunge_frame);
+ afr_sh_entry_expunge_entry_done (frame, this, active_src);
+
+ return 0;
+}
+
+
+int
+afr_sh_entry_expunge_purge (call_frame_t *expunge_frame, xlator_t *this,
+ int active_src)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *expunge_local = NULL;
+
+ priv = this->private;
+ expunge_local = expunge_frame->local;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "looking up %s on %s",
+ expunge_local->loc.path, priv->children[active_src]->name);
+
+ STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_lookup_cbk,
+ (void *) (long) active_src,
+ priv->children[active_src],
+ priv->children[active_src]->fops->lookup,
+ &expunge_local->loc, 0);
+
+ return 0;
+}
+
+
+int
+afr_sh_entry_expunge_entry_cbk (call_frame_t *expunge_frame, void *cookie,
+ xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ inode_t *inode, struct stat *buf, dict_t *x)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *expunge_local = NULL;
+ afr_self_heal_t *expunge_sh = NULL;
+ int source = 0;
+ call_frame_t *frame = NULL;
+ int active_src = 0;
+
+
+ priv = this->private;
+ expunge_local = expunge_frame->local;
+ expunge_sh = &expunge_local->self_heal;
+ frame = expunge_sh->sh_frame;
+ active_src = expunge_sh->active_source;
+ source = (long) cookie;
+
+ if (op_ret == -1 && op_errno == ENOENT) {
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "missing entry %s on %s",
+ expunge_local->loc.path,
+ priv->children[source]->name);
+
+ afr_sh_entry_expunge_purge (expunge_frame, this, active_src);
+
+ return 0;
+ }
+
+ if (op_ret == 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "%s exists under %s",
+ expunge_local->loc.path,
+ priv->children[source]->name);
+ } else {
+ gf_log (this->name, GF_LOG_ERROR,
+ "looking up %s under %s failed (%s)",
+ expunge_local->loc.path,
+ priv->children[source]->name,
+ strerror (op_errno));
+ }
+
+ AFR_STACK_DESTROY (expunge_frame);
+ afr_sh_entry_expunge_entry_done (frame, this, active_src);
+
+ return 0;
+}
+
+
+int
+afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this,
+ char *name)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ int ret = -1;
+ call_frame_t *expunge_frame = NULL;
+ afr_local_t *expunge_local = NULL;
+ afr_self_heal_t *expunge_sh = NULL;
+ int active_src = 0;
+ int source = 0;
+ int op_errno = 0;
+
+ priv = this->private;
+ local = frame->local;
+ sh = &local->self_heal;
+
+ active_src = sh->active_source;
+ source = sh->source;
+
+ if ((strcmp (name, ".") == 0)
+ || (strcmp (name, "..") == 0)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "skipping inspection of %s under %s",
+ name, local->loc.path);
+ goto out;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "inspecting existance of %s under %s",
+ name, local->loc.path);
+
+ expunge_frame = copy_frame (frame);
+ if (!expunge_frame) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "out of memory :(");
+ goto out;
+ }
+
+ ALLOC_OR_GOTO (expunge_local, afr_local_t, out);
+
+ expunge_frame->local = expunge_local;
+ expunge_sh = &expunge_local->self_heal;
+ expunge_sh->sh_frame = frame;
+ expunge_sh->active_source = active_src;
+
+ ret = build_child_loc (this, &expunge_local->loc, &local->loc, name);
+ if (ret != 0) {
+ goto out;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "looking up %s on %s", expunge_local->loc.path,
+ priv->children[source]->name);
+
+ STACK_WIND_COOKIE (expunge_frame,
+ afr_sh_entry_expunge_entry_cbk,
+ (void *) (long) source,
+ priv->children[source],
+ priv->children[source]->fops->lookup,
+ &expunge_local->loc, 0);
+
+ ret = 0;
+out:
+ if (ret == -1)
+ afr_sh_entry_expunge_entry_done (frame, this, active_src);
+
+ return 0;
+}
+
+
+int
+afr_sh_entry_expunge_readdir_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ gf_dirent_t *entries)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ gf_dirent_t *entry = NULL;
+ off_t last_offset = 0;
+ int active_src = 0;
+ int entry_count = 0;
+
+ priv = this->private;
+ local = frame->local;
+ sh = &local->self_heal;
+
+ active_src = sh->active_source;
+
+ if (op_ret <= 0) {
+ if (op_ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "readdir of %s on subvolume %s failed (%s)",
+ local->loc.path,
+ priv->children[active_src]->name,
+ strerror (op_errno));
+ } else {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "readdir of %s on subvolume %s complete",
+ local->loc.path,
+ priv->children[active_src]->name);
+ }
+
+ afr_sh_entry_expunge_all (frame, this);
+ return 0;
+ }
+
+ list_for_each_entry (entry, &entries->list, list) {
+ last_offset = entry->d_off;
+ entry_count++;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "readdir'ed %d entries from %s",
+ entry_count, priv->children[active_src]->name);
+
+ sh->offset = last_offset;
+ local->call_count = entry_count;
+
+ list_for_each_entry (entry, &entries->list, list) {
+ afr_sh_entry_expunge_entry (frame, this, entry->d_name);
+ }
+
+ return 0;
+}
+
+int
+afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this,
+ int active_src)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+
+ priv = this->private;
+ local = frame->local;
+ sh = &local->self_heal;
+
+ STACK_WIND (frame, afr_sh_entry_expunge_readdir_cbk,
+ priv->children[active_src],
+ priv->children[active_src]->fops->readdir,
+ sh->healing_fd, sh->block_size, sh->offset);
+
+ return 0;
+}
+
+
+int
+afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ int active_src = -1;
+
+ priv = this->private;
+ local = frame->local;
+ sh = &local->self_heal;
+
+ sh->offset = 0;
+
+ if (sh->source == -1) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "no active sources for %s to expunge entries",
+ local->loc.path);
+ goto out;
+ }
+
+ active_src = next_active_sink (frame, this, sh->active_source);
+ sh->active_source = active_src;
+
+ if (sh->op_failed) {
+ goto out;
+ }
+
+ if (active_src == -1) {
+ /* completed creating missing files on all subvolumes */
+ goto out;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "expunging entries of %s on %s to other sinks",
+ local->loc.path, priv->children[active_src]->name);
+
+ afr_sh_entry_expunge_subvol (frame, this, active_src);
+
+ return 0;
+out:
+ afr_sh_entry_erase_pending (frame, this);
+ return 0;
+
+}
+
+
+int
+afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this);
+
+int
+afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this,
+ int active_src);
+
+int
+afr_sh_entry_impunge_entry_done (call_frame_t *frame, xlator_t *this,
+ int active_src)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ int call_count = 0;
+
+ priv = this->private;
+ local = frame->local;
+ sh = &local->self_heal;
+
+ LOCK (&frame->lock);
+ {
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ afr_sh_entry_impunge_subvol (frame, this, active_src);
+
+ return 0;
+}
+
+
+int
+afr_sh_entry_impunge_utimens_cbk (call_frame_t *impunge_frame, void *cookie,
+ xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct stat *stbuf)
+{
+ int call_count = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *impunge_local = NULL;
+ afr_self_heal_t *impunge_sh = NULL;
+ call_frame_t *frame = NULL;
+ int active_src = 0;
+ int child_index = 0;
+
+ priv = this->private;
+ impunge_local = impunge_frame->local;
+ impunge_sh = &impunge_local->self_heal;
+ frame = impunge_sh->sh_frame;
+ child_index = (long) cookie;
+
+ if (op_ret == 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "utimes set for %s on %s",
+ impunge_local->loc.path,
+ priv->children[child_index]->name);
+ } else {
+ gf_log (this->name, GF_LOG_ERROR,
+ "setting utimes of %s on %s failed (%s)",
+ impunge_local->loc.path,
+ priv->children[child_index]->name,
+ strerror (op_errno));
+ }
+
+ LOCK (&impunge_frame->lock);
+ {
+ call_count = --impunge_local->call_count;
+ }
+ UNLOCK (&impunge_frame->lock);
+
+ if (call_count == 0) {
+ AFR_STACK_DESTROY (impunge_frame);
+ afr_sh_entry_impunge_entry_done (frame, this, active_src);
+ }
+
+ return 0;
+}
+
+
+int
+afr_sh_entry_impunge_chown_cbk (call_frame_t *impunge_frame, void *cookie,
+ xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct stat *stbuf)
+{
+ int call_count = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *impunge_local = NULL;
+ afr_self_heal_t *impunge_sh = NULL;
+ call_frame_t *frame = NULL;
+ int active_src = 0;
+ int child_index = 0;
+ struct timespec ts[2];
+
+
+ priv = this->private;
+ impunge_local = impunge_frame->local;
+ impunge_sh = &impunge_local->self_heal;
+ frame = impunge_sh->sh_frame;
+ child_index = (long) cookie;
+
+ if (op_ret == 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "ownership of %s on %s changed",
+ impunge_local->loc.path,
+ priv->children[child_index]->name);
+ } else {
+ gf_log (this->name, GF_LOG_ERROR,
+ "setting ownership of %s on %s failed (%s)",
+ impunge_local->loc.path,
+ priv->children[child_index]->name,
+ strerror (op_errno));
+ goto out;
+ }
+
+#ifdef HAVE_STRUCT_STAT_ST_ATIM_TV_NSEC
+ ts[0] = impunge_local->cont.lookup.buf.st_atim;
+ ts[1] = impunge_local->cont.lookup.buf.st_mtim;
+#elif HAVE_STRUCT_STAT_ST_ATIMESPEC_TV_NSEC
+ ts[0] = impunge_local->cont.lookup.buf.st_atimespec;
+ ts[1] = impunge_local->cont.lookup.buf.st_mtimespec;
+#else
+ ts[0].tv_sec = impunge_local->cont.lookup.buf.st_atime;
+ ts[1].tv_sec = impunge_local->cont.lookup.buf.st_mtime;
+#endif
+ STACK_WIND_COOKIE (impunge_frame,
+ afr_sh_entry_impunge_utimens_cbk,
+ (void *) (long) child_index,
+ priv->children[child_index],
+ priv->children[child_index]->fops->utimens,
+ &impunge_local->loc, ts);
+
+ return 0;
+
+out:
+ LOCK (&impunge_frame->lock);
+ {
+ call_count = --impunge_local->call_count;
+ }
+ UNLOCK (&impunge_frame->lock);
+
+ if (call_count == 0) {
+ AFR_STACK_DESTROY (impunge_frame);
+ afr_sh_entry_impunge_entry_done (frame, this, active_src);
+ }
+
+ return 0;
+}
+
+
+int
+afr_sh_entry_impunge_newfile_cbk (call_frame_t *impunge_frame, void *cookie,
+ xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ inode_t *inode, struct stat *stbuf)
+{
+ int call_count = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *impunge_local = NULL;
+ afr_self_heal_t *impunge_sh = NULL;
+ call_frame_t *frame = NULL;
+ int active_src = 0;
+ int child_index = 0;
+
+ priv = this->private;
+ impunge_local = impunge_frame->local;
+ impunge_sh = &impunge_local->self_heal;
+ frame = impunge_sh->sh_frame;
+
+ child_index = (long) cookie;
+
+ if (op_ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "creation of %s on %s failed (%s)",
+ impunge_local->loc.path,
+ priv->children[child_index]->name,
+ strerror (op_errno));
+ goto out;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "setting ownership of %s on %s to %d/%d",
+ impunge_local->loc.path,
+ priv->children[child_index]->name,
+ impunge_local->cont.lookup.buf.st_uid,
+ impunge_local->cont.lookup.buf.st_gid);
+
+ inode->st_mode = stbuf->st_mode;
+
+ STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_chown_cbk,
+ (void *) (long) child_index,
+ priv->children[child_index],
+ priv->children[child_index]->fops->chown,
+ &impunge_local->loc,
+ impunge_local->cont.lookup.buf.st_uid,
+ impunge_local->cont.lookup.buf.st_gid);
+ return 0;
+
+out:
+ LOCK (&impunge_frame->lock);
+ {
+ call_count = --impunge_local->call_count;
+ }
+ UNLOCK (&impunge_frame->lock);
+
+ if (call_count == 0) {
+ AFR_STACK_DESTROY (impunge_frame);
+ afr_sh_entry_impunge_entry_done (frame, this, active_src);
+ }
+
+ return 0;
+}
+
+
+int
+afr_sh_entry_impunge_mknod (call_frame_t *impunge_frame, xlator_t *this,
+ int child_index, struct stat *stbuf)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *impunge_local = NULL;
+ afr_self_heal_t *impunge_sh = NULL;
+
+
+ priv = this->private;
+ impunge_local = impunge_frame->local;
+ impunge_sh = &impunge_local->self_heal;
+
+ gf_log (this->name, GF_LOG_WARNING,
+ "creating file %s mode=0%o dev=0x%"GF_PRI_DEV" on %s",
+ impunge_local->loc.path,
+ stbuf->st_mode, stbuf->st_rdev,
+ priv->children[child_index]->name);
+
+ STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk,
+ (void *) (long) child_index,
+ priv->children[child_index],
+ priv->children[child_index]->fops->mknod,
+ &impunge_local->loc,
+ stbuf->st_mode, stbuf->st_rdev);
+
+ return 0;
+}
+
+
+
+int
+afr_sh_entry_impunge_mkdir (call_frame_t *impunge_frame, xlator_t *this,
+ int child_index, struct stat *stbuf)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *impunge_local = NULL;
+ afr_self_heal_t *impunge_sh = NULL;
+
+
+ priv = this->private;
+ impunge_local = impunge_frame->local;
+ impunge_sh = &impunge_local->self_heal;
+
+ gf_log (this->name, GF_LOG_WARNING,
+ "creating directory %s mode=0%o on %s",
+ impunge_local->loc.path,
+ stbuf->st_mode,
+ priv->children[child_index]->name);
+
+ STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk,
+ (void *) (long) child_index,
+ priv->children[child_index],
+ priv->children[child_index]->fops->mkdir,
+ &impunge_local->loc, stbuf->st_mode);
+
+ return 0;
+}
+
+
+int
+afr_sh_entry_impunge_symlink (call_frame_t *impunge_frame, xlator_t *this,
+ int child_index, const char *linkname)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *impunge_local = NULL;
+ afr_self_heal_t *impunge_sh = NULL;
+
+
+ priv = this->private;
+ impunge_local = impunge_frame->local;
+ impunge_sh = &impunge_local->self_heal;
+
+ gf_log (this->name, GF_LOG_WARNING,
+ "creating symlink %s -> %s on %s",
+ impunge_local->loc.path, linkname,
+ priv->children[child_index]->name);
+
+ STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk,
+ (void *) (long) child_index,
+ priv->children[child_index],
+ priv->children[child_index]->fops->symlink,
+ linkname, &impunge_local->loc);
+
+ return 0;
+}
+
+
+int
+afr_sh_entry_impunge_readlink_cbk (call_frame_t *impunge_frame, void *cookie,
+ xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ const char *linkname)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *impunge_local = NULL;
+ afr_self_heal_t *impunge_sh = NULL;
+ int child_index = -1;
+ call_frame_t *frame = NULL;
+ int call_count = -1;
+ int active_src = -1;
+
+ priv = this->private;
+ impunge_local = impunge_frame->local;
+ impunge_sh = &impunge_local->self_heal;
+ frame = impunge_sh->sh_frame;
+ active_src = impunge_sh->active_source;
+
+ child_index = (long) cookie;
+
+ if (op_ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "readlink of %s on %s failed (%s)",
+ impunge_local->loc.path,
+ priv->children[active_src]->name,
+ strerror (op_errno));
+ goto out;
+ }
+
+ afr_sh_entry_impunge_symlink (impunge_frame, this, child_index,
+ linkname);
+ return 0;
+
+out:
+ LOCK (&impunge_frame->lock);
+ {
+ call_count = --impunge_local->call_count;
+ }
+ UNLOCK (&impunge_frame->lock);
+
+ if (call_count == 0) {
+ AFR_STACK_DESTROY (impunge_frame);
+ afr_sh_entry_impunge_entry_done (frame, this, active_src);
+ }
+
+ return 0;
+}
+
+
+int
+afr_sh_entry_impunge_readlink (call_frame_t *impunge_frame, xlator_t *this,
+ int child_index, struct stat *stbuf)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *impunge_local = NULL;
+ afr_self_heal_t *impunge_sh = NULL;
+ int active_src = -1;
+
+ priv = this->private;
+ impunge_local = impunge_frame->local;
+ impunge_sh = &impunge_local->self_heal;
+ active_src = impunge_sh->active_source;
+
+ STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_cbk,
+ (void *) (long) child_index,
+ priv->children[active_src],
+ priv->children[active_src]->fops->readlink,
+ &impunge_local->loc, 4096);
+
+ return 0;
+}
+
+
+int
+afr_sh_entry_impunge_recreate_lookup_cbk (call_frame_t *impunge_frame,
+ void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ inode_t *inode, struct stat *buf,
+ dict_t *xattr)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *impunge_local = NULL;
+ afr_self_heal_t *impunge_sh = NULL;
+ int active_src = 0;
+ int type = 0;
+ int child_index = 0;
+ call_frame_t *frame = NULL;
+ int call_count = 0;
+
+ priv = this->private;
+ impunge_local = impunge_frame->local;
+ impunge_sh = &impunge_local->self_heal;
+ frame = impunge_sh->sh_frame;
+
+ child_index = (long) cookie;
+
+ active_src = impunge_sh->active_source;
+
+ if (op_ret != 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "looking up %s on %s (for %s) failed (%s)",
+ impunge_local->loc.path,
+ priv->children[active_src]->name,
+ priv->children[child_index]->name,
+ strerror (op_errno));
+ goto out;
+ }
+
+ impunge_local->cont.lookup.buf = *buf;
+ type = (buf->st_mode & S_IFMT);
+
+ switch (type) {
+ case S_IFSOCK:
+ case S_IFREG:
+ case S_IFBLK:
+ case S_IFCHR:
+ case S_IFIFO:
+ afr_sh_entry_impunge_mknod (impunge_frame, this,
+ child_index, buf);
+ break;
+ case S_IFLNK:
+ afr_sh_entry_impunge_readlink (impunge_frame, this,
+ child_index, buf);
+ break;
+ case S_IFDIR:
+ afr_sh_entry_impunge_mkdir (impunge_frame, this,
+ child_index, buf);
+ break;
+ default:
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s has unknown file type on %s: 0%o",
+ impunge_local->loc.path,
+ priv->children[active_src]->name, type);
+ goto out;
+ break;
+ }
+
+ return 0;
+
+out:
+ LOCK (&impunge_frame->lock);
+ {
+ call_count = --impunge_local->call_count;
+ }
+ UNLOCK (&impunge_frame->lock);
+
+ if (call_count == 0) {
+ AFR_STACK_DESTROY (impunge_frame);
+ afr_sh_entry_impunge_entry_done (frame, this, active_src);
+ }
+
+ return 0;
+}
+
+
+int
+afr_sh_entry_impunge_recreate (call_frame_t *impunge_frame, xlator_t *this,
+ int child_index)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *impunge_local = NULL;
+ afr_self_heal_t *impunge_sh = NULL;
+ int active_src = 0;
+
+
+ priv = this->private;
+ impunge_local = impunge_frame->local;
+ impunge_sh = &impunge_local->self_heal;
+
+ active_src = impunge_sh->active_source;
+
+ STACK_WIND_COOKIE (impunge_frame,
+ afr_sh_entry_impunge_recreate_lookup_cbk,
+ (void *) (long) child_index,
+ priv->children[active_src],
+ priv->children[active_src]->fops->lookup,
+ &impunge_local->loc, 0);
+
+ return 0;
+}
+
+
+int
+afr_sh_entry_impunge_entry_cbk (call_frame_t *impunge_frame, void *cookie,
+ xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ inode_t *inode, struct stat *buf, dict_t *x)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *impunge_local = NULL;
+ afr_self_heal_t *impunge_sh = NULL;
+ int call_count = 0;
+ int child_index = 0;
+ call_frame_t *frame = NULL;
+ int active_src = 0;
+
+ priv = this->private;
+ impunge_local = impunge_frame->local;
+ impunge_sh = &impunge_local->self_heal;
+ frame = impunge_sh->sh_frame;
+ child_index = (long) cookie;
+ active_src = impunge_sh->active_source;
+
+ if (op_ret == -1 && op_errno == ENOENT) {
+ /* decrease call_count in recreate-callback */
+ gf_log (this->name, GF_LOG_DEBUG,
+ "missing entry %s on %s",
+ impunge_local->loc.path,
+ priv->children[child_index]->name);
+
+ afr_sh_entry_impunge_recreate (impunge_frame, this,
+ child_index);
+ return 0;
+ }
+
+ if (op_ret == 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "%s exists under %s",
+ impunge_local->loc.path,
+ priv->children[child_index]->name);
+ } else {
+ gf_log (this->name, GF_LOG_ERROR,
+ "looking up %s under %s failed (%s)",
+ impunge_local->loc.path,
+ priv->children[child_index]->name,
+ strerror (op_errno));
+ }
+
+ LOCK (&impunge_frame->lock);
+ {
+ call_count = --impunge_local->call_count;
+ }
+ UNLOCK (&impunge_frame->lock);
+
+ if (call_count == 0) {
+ AFR_STACK_DESTROY (impunge_frame);
+ afr_sh_entry_impunge_entry_done (frame, this, active_src);
+ }
+
+ return 0;
+}
+
+
+int
+afr_sh_entry_impunge_entry (call_frame_t *frame, xlator_t *this,
+ char *name)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ int ret = -1;
+ call_frame_t *impunge_frame = NULL;
+ afr_local_t *impunge_local = NULL;
+ afr_self_heal_t *impunge_sh = NULL;
+ int active_src = 0;
+ int i = 0;
+ int call_count = 0;
+ int op_errno = 0;
+
+ priv = this->private;
+ local = frame->local;
+ sh = &local->self_heal;
+
+ active_src = sh->active_source;
+
+ if ((strcmp (name, ".") == 0)
+ || (strcmp (name, "..") == 0)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "skipping inspection of %s under %s",
+ name, local->loc.path);
+ goto out;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "inspecting existance of %s under %s",
+ name, local->loc.path);
+
+ impunge_frame = copy_frame (frame);
+ if (!impunge_frame) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "out of memory :(");
+ goto out;
+ }
+
+ ALLOC_OR_GOTO (impunge_local, afr_local_t, out);
+
+ impunge_frame->local = impunge_local;
+ impunge_sh = &impunge_local->self_heal;
+ impunge_sh->sh_frame = frame;
+ impunge_sh->active_source = active_src;
+
+ ret = build_child_loc (this, &impunge_local->loc, &local->loc, name);
+ if (ret != 0) {
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == active_src)
+ continue;
+ if (local->child_up[i] == 0)
+ continue;
+ if (sh->sources[i] == 1)
+ continue;
+ call_count++;
+ }
+
+ impunge_local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == active_src)
+ continue;
+ if (local->child_up[i] == 0)
+ continue;
+ if (sh->sources[i] == 1)
+ continue;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "looking up %s on %s", impunge_local->loc.path,
+ priv->children[i]->name);
+
+ STACK_WIND_COOKIE (impunge_frame,
+ afr_sh_entry_impunge_entry_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->lookup,
+ &impunge_local->loc, 0);
+
+ if (!--call_count)
+ break;
+ }
+
+ ret = 0;
+out:
+ if (ret == -1)
+ afr_sh_entry_impunge_entry_done (frame, this, active_src);
+
+ return 0;
+}
+
+
+int
+afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ gf_dirent_t *entries)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ gf_dirent_t *entry = NULL;
+ off_t last_offset = 0;
+ int active_src = 0;
+ int entry_count = 0;
+
+ priv = this->private;
+ local = frame->local;
+ sh = &local->self_heal;
+
+ active_src = sh->active_source;
+
+ if (op_ret <= 0) {
+ if (op_ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "readdir of %s on subvolume %s failed (%s)",
+ local->loc.path,
+ priv->children[active_src]->name,
+ strerror (op_errno));
+ } else {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "readdir of %s on subvolume %s complete",
+ local->loc.path,
+ priv->children[active_src]->name);
+ }
+
+ afr_sh_entry_impunge_all (frame, this);
+ return 0;
+ }
+
+ list_for_each_entry (entry, &entries->list, list) {
+ last_offset = entry->d_off;
+ entry_count++;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "readdir'ed %d entries from %s",
+ entry_count, priv->children[active_src]->name);
+
+ sh->offset = last_offset;
+ local->call_count = entry_count;
+
+ list_for_each_entry (entry, &entries->list, list) {
+ afr_sh_entry_impunge_entry (frame, this, entry->d_name);
+ }
+
+ return 0;
+}
+
+
+int
+afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this,
+ int active_src)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+
+ priv = this->private;
+ local = frame->local;
+ sh = &local->self_heal;
+
+ STACK_WIND (frame, afr_sh_entry_impunge_readdir_cbk,
+ priv->children[active_src],
+ priv->children[active_src]->fops->readdir,
+ sh->healing_fd, sh->block_size, sh->offset);
+
+ return 0;
+}
+
+
+int
+afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ int active_src = -1;
+
+ priv = this->private;
+ local = frame->local;
+ sh = &local->self_heal;
+
+ sh->offset = 0;
+
+ active_src = next_active_source (frame, this, sh->active_source);
+ sh->active_source = active_src;
+
+ if (sh->op_failed) {
+ afr_sh_entry_finish (frame, this);
+ return 0;
+ }
+
+ if (active_src == -1) {
+ /* completed creating missing files on all subvolumes */
+ afr_sh_entry_expunge_all (frame, this);
+ return 0;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "impunging entries of %s on %s to other sinks",
+ local->loc.path, priv->children[active_src]->name);
+
+ afr_sh_entry_impunge_subvol (frame, this, active_src);
+
+ return 0;
+}
+
+
+int
+afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+ int child_index = 0;
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ child_index = (long) cookie;
+
+ /* TODO: some of the open's might fail.
+ In that case, modify cleanup fn to send flush on those
+ fd's which are already open */
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "opendir of %s failed on child %s (%s)",
+ local->loc.path,
+ priv->children[child_index]->name,
+ strerror (op_errno));
+ sh->op_failed = 1;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ if (sh->op_failed) {
+ afr_sh_entry_finish (frame, this);
+ return 0;
+ }
+ gf_log (this->name, GF_LOG_DEBUG,
+ "fd for %s opened, commencing sync",
+ local->loc.path);
+
+ sh->active_source = -1;
+ afr_sh_entry_impunge_all (frame, this);
+ }
+
+ return 0;
+}
+
+
+int
+afr_sh_entry_open (call_frame_t *frame, xlator_t *this)
+{
+ int i = 0;
+ int call_count = 0;
+
+ int source = -1;
+ int *sources = NULL;
+
+ fd_t *fd = NULL;
+
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+ afr_self_heal_t *sh = NULL;
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ source = local->self_heal.source;
+ sources = local->self_heal.sources;
+
+ sh->block_size = 131072;
+ sh->offset = 0;
+
+ call_count = sh->active_sinks;
+ if (source != -1)
+ call_count++;
+
+ local->call_count = call_count;
+
+ fd = fd_create (local->loc.inode, frame->root->pid);
+ sh->healing_fd = fd;
+
+ if (source != -1) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "opening directory %s on subvolume %s (source)",
+ local->loc.path, priv->children[source]->name);
+
+ /* open source */
+ STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk,
+ (void *) (long) source,
+ priv->children[source],
+ priv->children[source]->fops->opendir,
+ &local->loc, fd);
+ call_count--;
+ }
+
+ /* open sinks */
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i] || !local->child_up[i])
+ continue;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "opening directory %s on subvolume %s (sink)",
+ local->loc.path, priv->children[i]->name);
+
+ STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->opendir,
+ &local->loc, fd);
+
+ if (!--call_count)
+ break;
+ }
+
+ return 0;
+}
+
+
+int
+afr_sh_entry_sync_prepare (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int active_sinks = 0;
+ int source = 0;
+ int i = 0;
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ source = sh->source;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (sh->sources[i] == 0 && local->child_up[i] == 1) {
+ active_sinks++;
+ sh->success[i] = 1;
+ }
+ }
+ if (source != -1)
+ sh->success[source] = 1;
+
+ if (active_sinks == 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "no active sinks for self-heal on dir %s",
+ local->loc.path);
+ afr_sh_entry_finish (frame, this);
+ return 0;
+ }
+ if (source == -1 && active_sinks < 2) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "cannot sync with 0 sources and 1 sink on dir %s",
+ local->loc.path);
+ afr_sh_entry_finish (frame, this);
+ return 0;
+ }
+ sh->active_sinks = active_sinks;
+
+ if (source != -1)
+ gf_log (this->name, GF_LOG_DEBUG,
+ "syncing %s from subvolume %s to %d active sinks",
+ local->loc.path, priv->children[source]->name,
+ active_sinks);
+ else
+ gf_log (this->name, GF_LOG_DEBUG,
+ "no active sources for %s found. "
+ "merging all entries as a conservative decision",
+ local->loc.path);
+
+ afr_sh_entry_open (frame, this);
+
+ return 0;
+}
+
+
+int
+afr_sh_entry_fix (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int source = 0;
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ afr_sh_build_pending_matrix (sh->pending_matrix, sh->xattr,
+ priv->child_count, AFR_ENTRY_PENDING);
+
+ afr_sh_print_pending_matrix (sh->pending_matrix, this);
+
+
+ afr_sh_mark_sources (sh->pending_matrix, sh->sources,
+ priv->child_count);
+
+ afr_sh_supress_errenous_children (sh->sources, sh->child_errno,
+ priv->child_count);
+
+ source = afr_sh_select_source (sh->sources, priv->child_count);
+ sh->source = source;
+
+ afr_sh_entry_sync_prepare (frame, this);
+
+ return 0;
+}
+
+
+
+int
+afr_sh_entry_lookup_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ inode_t *inode, struct stat *buf, dict_t *xattr)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+
+ int call_count = -1;
+ int child_index = (long) cookie;
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret != -1) {
+ sh->xattr[child_index] = dict_ref (xattr);
+ sh->buf[child_index] = *buf;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ afr_sh_entry_fix (frame, this);
+ }
+
+ return 0;
+}
+
+
+
+int
+afr_sh_entry_lookup (call_frame_t *frame, xlator_t *this)
+{
+ afr_self_heal_t * sh = NULL;
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+ dict_t *xattr_req = NULL;
+ int ret = 0;
+ int call_count = 0;
+ int i = 0;
+
+ priv = this->private;
+ local = frame->local;
+ sh = &local->self_heal;
+
+ call_count = local->child_count;
+
+ local->call_count = call_count;
+
+ xattr_req = dict_new();
+ if (xattr_req)
+ ret = dict_set_uint64 (xattr_req, AFR_ENTRY_PENDING,
+ priv->child_count * sizeof(int32_t));
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame,
+ afr_sh_entry_lookup_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->lookup,
+ &local->loc, xattr_req);
+ if (!--call_count)
+ break;
+ }
+ }
+
+ if (xattr_req)
+ dict_unref (xattr_req);
+
+ return 0;
+}
+
+
+
+int
+afr_sh_entry_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ int call_count = 0;
+ int child_index = (long) cookie;
+
+ /* TODO: what if lock fails? */
+
+ local = frame->local;
+ sh = &local->self_heal;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == -1) {
+ sh->op_failed = 1;
+
+ gf_log (this->name,
+ (op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR),
+ "locking inode of %s on child %d failed: %s",
+ local->loc.path, child_index,
+ strerror (op_errno));
+ } else {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "inode of %s on child %d locked",
+ local->loc.path, child_index);
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ if (sh->op_failed == 1) {
+ afr_sh_entry_finish (frame, this);
+ return 0;
+ }
+
+ afr_sh_entry_lookup (frame, this);
+ }
+
+ return 0;
+}
+
+
+int
+afr_sh_entry_lock (call_frame_t *frame, xlator_t *this)
+{
+ int i = 0;
+ int call_count = 0;
+
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+ afr_self_heal_t * sh = NULL;
+
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ call_count = local->child_count;
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "locking %s on subvolume %s",
+ local->loc.path, priv->children[i]->name);
+
+ STACK_WIND_COOKIE (frame, afr_sh_entry_lock_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->entrylk,
+ &local->loc, NULL,
+ ENTRYLK_LOCK_NB, ENTRYLK_WRLCK);
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+int
+afr_self_heal_entry (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+
+
+ priv = this->private;
+ local = frame->local;
+ sh = &local->self_heal;
+
+ if (local->need_entry_self_heal && priv->entry_self_heal) {
+ afr_sh_entry_lock (frame, this);
+ } else {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "proceeding to completion on %s",
+ local->loc.path);
+ afr_sh_entry_done (frame, this);
+ }
+
+ return 0;
+}
+
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
new file mode 100644
index 00000000000..e65a426db6c
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -0,0 +1,791 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#include <libgen.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include <sys/time.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "afr.h"
+#include "dict.h"
+#include "xlator.h"
+#include "hashfn.h"
+#include "logging.h"
+#include "stack.h"
+#include "list.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "common-utils.h"
+#include "compat-errno.h"
+#include "compat.h"
+#include "byte-order.h"
+
+#include "afr-transaction.h"
+#include "afr-self-heal.h"
+#include "afr-self-heal-common.h"
+
+
+int
+afr_sh_metadata_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+// memset (sh->child_errno, 0, sizeof (int) * priv->child_count);
+ memset (sh->buf, 0, sizeof (struct stat) * priv->child_count);
+ memset (sh->success, 0, sizeof (int) * priv->child_count);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (sh->xattr[i])
+ dict_unref (sh->xattr[i]);
+ sh->xattr[i] = NULL;
+ }
+
+ if (local->govinda_gOvinda) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "aborting selfheal of %s",
+ local->loc.path);
+ sh->completion_cbk (frame, this);
+ } else {
+ if (S_ISREG (local->cont.lookup.buf.st_mode)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "proceeding to data check on %s",
+ local->loc.path);
+ afr_self_heal_data (frame, this);
+ return 0;
+ }
+
+ if (S_ISDIR (local->cont.lookup.buf.st_mode)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "proceeding to entry check on %s",
+ local->loc.path);
+ afr_self_heal_entry (frame, this);
+ return 0;
+ }
+ gf_log (this->name, GF_LOG_DEBUG,
+ "completed self heal of %s",
+ local->loc.path);
+
+ sh->completion_cbk (frame, this);
+ }
+
+ return 0;
+}
+
+
+int
+afr_sh_metadata_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno)
+{
+ afr_local_t *local = NULL;
+ int call_count = 0;
+
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ afr_sh_metadata_done (frame, this);
+
+ return 0;
+}
+
+
+int
+afr_sh_metadata_finish (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int call_count = 0;
+ struct flock flock = {0, };
+
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ call_count = local->child_count;
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ flock.l_start = 0;
+ flock.l_len = 0;
+ flock.l_type = F_UNLCK;
+
+ if (local->child_up[i]) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "unlocking %s on subvolume %s",
+ local->loc.path, priv->children[i]->name);
+
+ STACK_WIND (frame, afr_sh_metadata_unlck_cbk,
+ priv->children[i],
+ priv->children[i]->fops->inodelk,
+ &local->loc, F_SETLK, &flock);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+int
+afr_sh_metadata_erase_pending_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xattr)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ afr_sh_metadata_finish (frame, this);
+
+ return 0;
+}
+
+
+int
+afr_sh_metadata_erase_pending (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+ int i = 0;
+ dict_t **erase_xattr = NULL;
+
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+
+ afr_sh_pending_to_delta (sh->pending_matrix, sh->delta_matrix,
+ sh->success, priv->child_count);
+
+ erase_xattr = CALLOC (sizeof (*erase_xattr), priv->child_count);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (sh->xattr[i]) {
+ call_count++;
+
+ erase_xattr[i] = get_new_dict();
+ dict_ref (erase_xattr[i]);
+ }
+ }
+
+ afr_sh_delta_to_xattr (sh->delta_matrix, erase_xattr,
+ priv->child_count, AFR_METADATA_PENDING);
+
+ local->call_count = call_count;
+
+ if (call_count == 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "metadata of %s not healed on any subvolume",
+ local->loc.path);
+
+ afr_sh_metadata_finish (frame, this);
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!erase_xattr[i])
+ continue;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "erasing pending flags from %s on %s",
+ local->loc.path, priv->children[i]->name);
+
+ STACK_WIND_COOKIE (frame, afr_sh_metadata_erase_pending_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->loc,
+ GF_XATTROP_ADD_ARRAY, erase_xattr[i]);
+ if (!--call_count)
+ break;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (erase_xattr[i]) {
+ dict_unref (erase_xattr[i]);
+ }
+ }
+ FREE (erase_xattr);
+
+ return 0;
+}
+
+
+int
+afr_sh_metadata_sync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+ int child_index = 0;
+
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ child_index = (long) cookie;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "setting attributes failed for %s on %s (%s)",
+ local->loc.path,
+ priv->children[child_index]->name,
+ strerror (op_errno));
+
+ sh->success[child_index] = 0;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ afr_sh_metadata_erase_pending (frame, this);
+
+ return 0;
+}
+
+
+int
+afr_sh_metadata_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+ afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno);
+
+ return 0;
+}
+
+
+int
+afr_sh_metadata_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno)
+{
+ afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno);
+
+ return 0;
+}
+
+
+int
+afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int source = 0;
+ int active_sinks = 0;
+ int call_count = 0;
+ int i = 0;
+ struct timespec ts[2];
+
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ source = sh->source;
+ active_sinks = sh->active_sinks;
+
+ /*
+ * 4 calls per sink - chown, chmod, utimes, setxattr
+ */
+ if (xattr)
+ call_count = active_sinks * 4;
+ else
+ call_count = active_sinks * 3;
+
+ local->call_count = call_count;
+
+#ifdef HAVE_STRUCT_STAT_ST_ATIM_TV_NSEC
+ ts[0] = sh->buf[source].st_atim;
+ ts[1] = sh->buf[source].st_mtim;
+#elif HAVE_STRUCT_STAT_ST_ATIMESPEC_TV_NSEC
+ ts[0] = sh->buf[source].st_atimespec;
+ ts[1] = sh->buf[source].st_mtimespec;
+#else
+ ts[0].tv_sec = sh->buf[source].st_atime;
+ ts[1].tv_sec = sh->buf[source].st_mtime;
+#endif
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (call_count == 0) {
+ break;
+ }
+ if (sh->sources[i] || !local->child_up[i])
+ continue;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "syncing metadata of %s from %s to %s",
+ local->loc.path, priv->children[source]->name,
+ priv->children[i]->name);
+
+ STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->chown,
+ &local->loc,
+ sh->buf[source].st_uid,
+ sh->buf[source].st_gid);
+
+ STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->chmod,
+ &local->loc, sh->buf[source].st_mode);
+
+ STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->utimens,
+ &local->loc, ts);
+
+ call_count = call_count - 3;
+
+ if (!xattr)
+ continue;
+
+ STACK_WIND_COOKIE (frame, afr_sh_metadata_xattr_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->setxattr,
+ &local->loc, xattr, 0);
+ call_count--;
+ }
+
+ return 0;
+}
+
+
+int
+afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xattr)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int source = 0;
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ source = sh->source;
+
+ if (op_ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "getxattr of %s failed on subvolume %s (%s). proceeding without xattr",
+ local->loc.path, priv->children[source]->name,
+ strerror (op_errno));
+
+ afr_sh_metadata_sync (frame, this, NULL);
+ } else {
+ dict_del (xattr, AFR_DATA_PENDING);
+ dict_del (xattr, AFR_METADATA_PENDING);
+ dict_del (xattr, AFR_ENTRY_PENDING);
+ afr_sh_metadata_sync (frame, this, xattr);
+ }
+
+ return 0;
+}
+
+
+int
+afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int active_sinks = 0;
+ int source = 0;
+ int i = 0;
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ source = sh->source;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (sh->sources[i] == 0 && local->child_up[i] == 1) {
+ active_sinks++;
+ sh->success[i] = 1;
+ }
+ }
+ sh->success[source] = 1;
+
+ if (active_sinks == 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "no active sinks for performing self-heal on file %s",
+ local->loc.path);
+ afr_sh_metadata_finish (frame, this);
+ return 0;
+ }
+ sh->active_sinks = active_sinks;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "syncing metadata of %s from subvolume %s to %d active sinks",
+ local->loc.path, priv->children[source]->name, active_sinks);
+
+ STACK_WIND (frame, afr_sh_metadata_getxattr_cbk,
+ priv->children[source],
+ priv->children[source]->fops->getxattr,
+ &local->loc, NULL);
+
+ return 0;
+}
+
+
+int
+afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int nsources = 0;
+ int source = 0;
+ int i = 0;
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ afr_sh_build_pending_matrix (sh->pending_matrix, sh->xattr,
+ priv->child_count, AFR_METADATA_PENDING);
+
+ afr_sh_print_pending_matrix (sh->pending_matrix, this);
+
+ afr_sh_mark_sources (sh->pending_matrix, sh->sources,
+ priv->child_count);
+
+ afr_sh_supress_errenous_children (sh->sources, sh->child_errno,
+ priv->child_count);
+
+ nsources = afr_sh_source_count (sh->sources, priv->child_count);
+
+ if ((nsources == 0)
+ && (priv->favorite_child != -1)
+ && (sh->child_errno[priv->favorite_child] == 0)) {
+
+ gf_log (this->name, GF_LOG_WARNING,
+ "Picking favorite child %s as authentic source to resolve conflicting metadata of %s",
+ priv->children[priv->favorite_child]->name,
+ local->loc.path);
+
+ sh->sources[priv->favorite_child] = 1;
+
+ nsources = afr_sh_source_count (sh->sources,
+ priv->child_count);
+ }
+
+ if (nsources == 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to resolve conflicting metadata of %s. "
+ "Please resolve manually by fixing the "
+ "permissions/ownership of %s on your subvolumes. "
+ "You can also consider 'option favorite-child <>'",
+ local->loc.path, local->loc.path);
+
+ local->govinda_gOvinda = 1;
+
+ afr_sh_metadata_finish (frame, this);
+ return 0;
+ }
+
+ source = afr_sh_select_source (sh->sources, priv->child_count);
+ sh->source = source;
+
+ /* detect changes not visible through pending flags -- JIC */
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == source || sh->child_errno[i])
+ continue;
+
+ if (PERMISSION_DIFFERS (&sh->buf[i], &sh->buf[source]))
+ sh->sources[i] = 0;
+
+ if (OWNERSHIP_DIFFERS (&sh->buf[i], &sh->buf[source]))
+ sh->sources[i] = 0;
+ }
+
+ afr_sh_metadata_sync_prepare (frame, this);
+
+ return 0;
+}
+
+
+int
+afr_sh_metadata_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ inode_t *inode, struct stat *buf, dict_t *xattr)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+ int child_index = 0;
+
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ child_index = (long) cookie;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "path %s on subvolume %s is of mode 0%o",
+ local->loc.path,
+ priv->children[child_index]->name,
+ buf->st_mode);
+
+ sh->buf[child_index] = *buf;
+ if (xattr)
+ sh->xattr[child_index] = dict_ref (xattr);
+ } else {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "path %s on subvolume %s => -1 (%s)",
+ local->loc.path,
+ priv->children[child_index]->name,
+ strerror (op_errno));
+
+ sh->child_errno[child_index] = op_errno;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ afr_sh_metadata_fix (frame, this);
+
+ return 0;
+}
+
+
+int
+afr_sh_metadata_lookup (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int call_count = 0;
+ dict_t *xattr_req = NULL;
+ int ret = 0;
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ call_count = local->child_count;
+ local->call_count = call_count;
+
+ xattr_req = dict_new();
+
+ if (xattr_req)
+ ret = dict_set_uint64 (xattr_req, AFR_METADATA_PENDING,
+ priv->child_count * sizeof(int32_t));
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "looking up %s on %s",
+ local->loc.path, priv->children[i]->name);
+
+ STACK_WIND_COOKIE (frame, afr_sh_metadata_lookup_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->lookup,
+ &local->loc, xattr_req);
+ if (!--call_count)
+ break;
+ }
+ }
+
+ if (xattr_req)
+ dict_unref (xattr_req);
+
+ return 0;
+}
+
+
+int
+afr_sh_metadata_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+ int child_index = (long) cookie;
+
+ /* TODO: what if lock fails? */
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == -1) {
+ sh->op_failed = 1;
+
+ gf_log (this->name,
+ (op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR),
+ "locking of %s on child %d failed: %s",
+ local->loc.path, child_index,
+ strerror (op_errno));
+ } else {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "inode of %s on child %d locked",
+ local->loc.path, child_index);
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ if (sh->op_failed) {
+ afr_sh_metadata_finish (frame, this);
+ return 0;
+ }
+
+ afr_sh_metadata_lookup (frame, this);
+ }
+
+ return 0;
+}
+
+
+int
+afr_sh_metadata_lock (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int call_count = 0;
+ struct flock flock = {0, };
+
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ call_count = local->child_count;
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ flock.l_start = 0;
+ flock.l_len = 0;
+ flock.l_type = F_WRLCK;
+
+ if (local->child_up[i]) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "locking %s on subvolume %s",
+ local->loc.path, priv->children[i]->name);
+
+ STACK_WIND_COOKIE (frame, afr_sh_metadata_lk_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->inodelk,
+ &local->loc, F_SETLK, &flock);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+int
+afr_self_heal_metadata (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = this->private;
+
+
+ local = frame->local;
+ sh = &local->self_heal;
+
+ if (local->need_metadata_self_heal && priv->metadata_self_heal) {
+ afr_sh_metadata_lock (frame, this);
+ } else {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "proceeding to data check on %s",
+ local->loc.path);
+ afr_sh_metadata_done (frame, this);
+ }
+
+ return 0;
+}
+
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
new file mode 100644
index 00000000000..1c97a9bc11b
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -0,0 +1,52 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __AFR_SELF_HEAL_H__
+#define __AFR_SELF_HEAL_H__
+
+#include <sys/stat.h>
+
+#define FILETYPE_DIFFERS(buf1,buf2) ((S_IFMT & ((struct stat *)buf1)->st_mode) != (S_IFMT & ((struct stat *)buf2)->st_mode))
+#define PERMISSION_DIFFERS(buf1,buf2) ((((struct stat *)buf1)->st_mode) != (((struct stat *)buf2)->st_mode))
+#define OWNERSHIP_DIFFERS(buf1,buf2) ((((struct stat *)buf1)->st_uid) != (((struct stat *)buf2)->st_uid) || (((struct stat *)buf1)->st_gid != (((struct stat *)buf2)->st_gid)))
+#define SIZE_DIFFERS(buf1,buf2) ((((struct stat *)buf1)->st_size) != (((struct stat *)buf2)->st_size))
+
+
+
+int
+afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this);
+int
+afr_sh_has_entry_pending (dict_t *xattr, int child_count, xlator_t *this);
+int
+afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this);
+
+int
+afr_self_heal_entry (call_frame_t *frame, xlator_t *this);
+
+int
+afr_self_heal_data (call_frame_t *frame, xlator_t *this);
+
+int
+afr_self_heal_metadata (call_frame_t *frame, xlator_t *this);
+
+int
+afr_self_heal (call_frame_t *frame, xlator_t *this,
+ int (*completion_cbk) (call_frame_t *, xlator_t *));
+
+#endif /* __AFR_SELF_HEAL_H__ */
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
new file mode 100644
index 00000000000..3df9f07e5a3
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -0,0 +1,957 @@
+/*
+ Copyright (c) 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#include "dict.h"
+#include "byte-order.h"
+
+#include "afr.h"
+#include "afr-transaction.h"
+
+#include <signal.h>
+
+
+static void
+__mark_all_pending (int32_t *pending, int child_count)
+{
+ int i;
+
+ for (i = 0; i < child_count; i++)
+ pending[i] = hton32 (1);
+}
+
+
+static void
+__mark_child_dead (int32_t *pending, int child_count, int child)
+{
+ pending[child] = 0;
+}
+
+
+static void
+__mark_down_children (int32_t *pending, int child_count, unsigned char *child_up)
+{
+ int i;
+
+ for (i = 0; i < child_count; i++)
+ if (!child_up[i])
+ pending[i] = 0;
+}
+
+
+static void
+__mark_all_success (int32_t *pending, int child_count)
+{
+ int i;
+
+ for (i = 0; i < child_count; i++)
+ pending[i] = hton32 (-1);
+}
+
+
+static int
+__is_first_write_on_fd (xlator_t *this, fd_t *fd)
+{
+ int op_ret = 0;
+ int _ret = -1;
+
+ _ret = fd_ctx_get (fd, this, NULL);
+ if (_ret < 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "first writev() on fd=%p, writing changelog",
+ fd);
+
+ _ret = fd_ctx_set (fd, this, 0xaf1);
+ op_ret = 1;
+ }
+
+ return op_ret;
+}
+
+
+static int
+__changelog_enabled (afr_private_t *priv, afr_transaction_type type)
+{
+ int ret = 0;
+
+ switch (type) {
+ case AFR_DATA_TRANSACTION:
+ if (priv->data_change_log)
+ ret = 1;
+
+ break;
+
+ case AFR_METADATA_TRANSACTION:
+ if (priv->metadata_change_log)
+ ret = 1;
+
+ break;
+
+ case AFR_ENTRY_TRANSACTION:
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ if (priv->entry_change_log)
+ ret = 1;
+
+ break;
+
+ case AFR_FLUSH_TRANSACTION:
+ ret = 1;
+ }
+
+ return ret;
+}
+
+
+static int
+__changelog_needed_pre_op (call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ fd_t * fd = NULL;
+
+ int op_ret = 0;
+
+ priv = this->private;
+ local = frame->local;
+
+ if (__changelog_enabled (priv, local->transaction.type)) {
+ switch (local->op) {
+
+ case GF_FOP_WRITE:
+ case GF_FOP_FTRUNCATE:
+ /*
+ if it's a data transaction, we write the changelog
+ only on the first write on an fd
+ */
+
+ fd = local->fd;
+ if (!fd || __is_first_write_on_fd (this, fd))
+ op_ret = 1;
+
+ break;
+
+ case GF_FOP_FLUSH:
+ /* only do post-op on flush() */
+
+ op_ret = 0;
+ break;
+
+ default:
+ op_ret = 1;
+ }
+ }
+
+ return op_ret;
+}
+
+
+static int
+__changelog_needed_post_op (call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+
+ int ret = 0;
+ afr_transaction_type type = -1;
+
+ priv = this->private;
+ local = frame->local;
+ type = local->transaction.type;
+
+ if (__changelog_enabled (priv, type)
+ && (local->op != GF_FOP_WRITE)
+ && (local->op != GF_FOP_FTRUNCATE))
+ ret = 1;
+
+ return ret;
+}
+
+
+static int
+afr_lock_server_count (afr_private_t *priv, afr_transaction_type type)
+{
+ int ret = 0;
+
+ switch (type) {
+ case AFR_FLUSH_TRANSACTION:
+ case AFR_DATA_TRANSACTION:
+ ret = priv->data_lock_server_count;
+ break;
+
+ case AFR_METADATA_TRANSACTION:
+ ret = priv->metadata_lock_server_count;
+ break;
+
+ case AFR_ENTRY_TRANSACTION:
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ ret = priv->entry_lock_server_count;
+ break;
+ }
+
+ return ret;
+}
+
+
+/* {{{ unlock */
+
+int32_t
+afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno)
+{
+ afr_local_t *local;
+ int call_count = 0;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ call_count = --local->call_count;
+ }
+ UNLOCK (&frame->lock);
+
+ if (call_count == 0) {
+ local->transaction.done (frame, this);
+ }
+
+ return 0;
+}
+
+
+int
+afr_unlock (call_frame_t *frame, xlator_t *this)
+{
+ struct flock flock;
+
+ int i = 0;
+ int call_count = 0;
+
+ afr_local_t *local = NULL;
+ afr_private_t * priv = this->private;
+
+ local = frame->local;
+
+ call_count = afr_locked_nodes_count (local->transaction.locked_nodes,
+ priv->child_count);
+
+ if (call_count == 0) {
+ local->transaction.done (frame, this);
+ return 0;
+ }
+
+ if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION)
+ call_count *= 2;
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ flock.l_start = local->transaction.start;
+ flock.l_len = local->transaction.len;
+ flock.l_type = F_UNLCK;
+
+ if (local->transaction.locked_nodes[i]) {
+ switch (local->transaction.type) {
+ case AFR_DATA_TRANSACTION:
+ case AFR_METADATA_TRANSACTION:
+ case AFR_FLUSH_TRANSACTION:
+
+ if (local->fd) {
+ STACK_WIND (frame, afr_unlock_common_cbk,
+ priv->children[i],
+ priv->children[i]->fops->finodelk,
+ local->fd, F_SETLK, &flock);
+ } else {
+ STACK_WIND (frame, afr_unlock_common_cbk,
+ priv->children[i],
+ priv->children[i]->fops->inodelk,
+ &local->loc, F_SETLK, &flock);
+ }
+
+ break;
+
+ case AFR_ENTRY_RENAME_TRANSACTION:
+
+ STACK_WIND (frame, afr_unlock_common_cbk,
+ priv->children[i],
+ priv->children[i]->fops->entrylk,
+ &local->transaction.new_parent_loc,
+ local->transaction.new_basename,
+ ENTRYLK_UNLOCK, ENTRYLK_WRLCK);
+
+ call_count--;
+
+ /* fall through */
+
+ case AFR_ENTRY_TRANSACTION:
+ if (local->fd) {
+ STACK_WIND (frame, afr_unlock_common_cbk,
+ priv->children[i],
+ priv->children[i]->fops->fentrylk,
+ local->fd,
+ local->transaction.basename,
+ ENTRYLK_UNLOCK, ENTRYLK_WRLCK);
+ } else {
+ STACK_WIND (frame, afr_unlock_common_cbk,
+ priv->children[i],
+ priv->children[i]->fops->entrylk,
+ &local->transaction.parent_loc,
+ local->transaction.basename,
+ ENTRYLK_UNLOCK, ENTRYLK_WRLCK);
+
+ }
+ break;
+ }
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+/* }}} */
+
+
+/* {{{ pending */
+
+int32_t
+afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xattr)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+
+ int call_count = -1;
+
+ priv = this->private;
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ call_count = --local->call_count;
+ }
+ UNLOCK (&frame->lock);
+
+ if (call_count == 0) {
+ if (afr_lock_server_count (priv, local->transaction.type) == 0) {
+ local->transaction.done (frame, this);
+ } else {
+ afr_unlock (frame, this);
+ }
+ }
+
+ return 0;
+}
+
+
+int
+afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t * priv = this->private;
+
+ int ret = 0;
+ int i = 0;
+ int call_count = 0;
+
+ afr_local_t * local = NULL;
+ dict_t * xattr = dict_ref (get_new_dict ());
+
+ local = frame->local;
+
+ __mark_all_success (local->pending_array, priv->child_count);
+ __mark_down_children (local->pending_array, priv->child_count, local->child_up);
+
+ call_count = afr_up_children_count (priv->child_count, local->child_up);
+
+ if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
+ call_count *= 2;
+ }
+
+ local->call_count = call_count;
+
+ if (call_count == 0) {
+ /* no child is up */
+ dict_unref (xattr);
+ afr_unlock (frame, this);
+ return 0;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ ret = dict_set_static_bin (xattr, local->transaction.pending,
+ local->pending_array,
+ priv->child_count * sizeof (int32_t));
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to set pending entry");
+
+
+ switch (local->transaction.type) {
+ case AFR_DATA_TRANSACTION:
+ case AFR_METADATA_TRANSACTION:
+ case AFR_FLUSH_TRANSACTION:
+ {
+ if (local->fd)
+ STACK_WIND (frame, afr_changelog_post_op_cbk,
+ priv->children[i],
+ priv->children[i]->fops->fxattrop,
+ local->fd,
+ GF_XATTROP_ADD_ARRAY, xattr);
+ else
+ STACK_WIND (frame, afr_changelog_post_op_cbk,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->loc,
+ GF_XATTROP_ADD_ARRAY, xattr);
+ }
+ break;
+
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ {
+ STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.new_parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr);
+
+ call_count--;
+ }
+
+ /*
+ set it again because previous stack_wind
+ might have already returned (think of case
+ where subvolume is posix) and would have
+ used the dict as placeholder for return
+ value
+ */
+ ret = dict_set_static_bin (xattr, local->transaction.pending,
+ local->pending_array,
+ priv->child_count * sizeof (int32_t));
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to set pending entry");
+
+ /* fall through */
+
+ case AFR_ENTRY_TRANSACTION:
+ {
+ if (local->fd)
+ STACK_WIND (frame, afr_changelog_post_op_cbk,
+ priv->children[i],
+ priv->children[i]->fops->fxattrop,
+ local->fd,
+ GF_XATTROP_ADD_ARRAY, xattr);
+ else
+ STACK_WIND (frame, afr_changelog_post_op_cbk,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr);
+ }
+ break;
+ }
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ dict_unref (xattr);
+ return 0;
+}
+
+
+int32_t
+afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xattr)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = this->private;
+ loc_t * loc = NULL;
+
+ int call_count = -1;
+ int child_index = (long) cookie;
+
+ local = frame->local;
+ loc = &local->loc;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == -1) {
+ local->child_up[child_index] = 0;
+
+ if (op_errno == ENOTSUP) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "xattrop not supported by %s",
+ priv->children[child_index]->name);
+ local->op_ret = -1;
+ } else if (!child_went_down (op_ret, op_errno)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "xattrop failed on child %s: %s",
+ priv->children[child_index]->name,
+ strerror (op_errno));
+ }
+ local->op_errno = op_errno;
+ }
+
+ call_count = --local->call_count;
+ }
+ UNLOCK (&frame->lock);
+
+ if (call_count == 0) {
+ if ((local->op_ret == -1) &&
+ (local->op_errno == ENOTSUP)) {
+ local->transaction.resume (frame, this);
+ } else {
+ local->transaction.fop (frame, this);
+ }
+ }
+
+ return 0;
+}
+
+
+int
+afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t * priv = this->private;
+
+ int i = 0;
+ int ret = 0;
+ int call_count = 0;
+ dict_t *xattr = NULL;
+
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+ xattr = get_new_dict ();
+ dict_ref (xattr);
+
+ call_count = afr_up_children_count (priv->child_count,
+ local->child_up);
+
+ if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
+ call_count *= 2;
+ }
+
+ if (call_count == 0) {
+ /* no child is up */
+ dict_unref (xattr);
+ afr_unlock (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ __mark_all_pending (local->pending_array, priv->child_count);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ ret = dict_set_static_bin (xattr,
+ local->transaction.pending,
+ local->pending_array,
+ (priv->child_count *
+ sizeof (int32_t)));
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to set pending entry");
+
+
+ switch (local->transaction.type) {
+ case AFR_DATA_TRANSACTION:
+ case AFR_METADATA_TRANSACTION:
+ case AFR_FLUSH_TRANSACTION:
+ {
+ if (local->fd)
+ STACK_WIND_COOKIE (frame,
+ afr_changelog_pre_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->fxattrop,
+ local->fd,
+ GF_XATTROP_ADD_ARRAY, xattr);
+ else
+ STACK_WIND_COOKIE (frame,
+ afr_changelog_pre_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &(local->loc),
+ GF_XATTROP_ADD_ARRAY, xattr);
+ }
+ break;
+
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ {
+ STACK_WIND_COOKIE (frame,
+ afr_changelog_pre_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.new_parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr);
+
+ call_count--;
+ }
+
+
+ /*
+ set it again because previous stack_wind
+ might have already returned (think of case
+ where subvolume is posix) and would have
+ used the dict as placeholder for return
+ value
+ */
+
+ ret = dict_set_static_bin (xattr, local->transaction.pending,
+ local->pending_array,
+ priv->child_count * sizeof (int32_t));
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to set pending entry");
+
+ /* fall through */
+
+ case AFR_ENTRY_TRANSACTION:
+ {
+ if (local->fd)
+ STACK_WIND_COOKIE (frame,
+ afr_changelog_pre_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->fxattrop,
+ local->fd,
+ GF_XATTROP_ADD_ARRAY, xattr);
+ else
+ STACK_WIND_COOKIE (frame,
+ afr_changelog_pre_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr);
+ }
+
+ break;
+ }
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ dict_unref (xattr);
+ return 0;
+}
+
+/* }}} */
+
+/* {{{ lock */
+
+static
+int afr_lock_rec (call_frame_t *frame, xlator_t *this, int child_index);
+
+int32_t
+afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+ int done = 0;
+ int child_index = (long) cookie;
+
+ int call_count = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
+ /* wait for the other lock to return */
+ call_count = --local->call_count;
+ }
+
+ if (op_ret == -1) {
+ if (op_errno == ENOSYS) {
+ /* return ENOTSUP */
+ gf_log (this->name, GF_LOG_ERROR,
+ "subvolume does not support locking. "
+ "please load features/posix-locks xlator on server");
+ local->op_ret = op_ret;
+ done = 1;
+ }
+
+ local->child_up[child_index] = 0;
+ local->op_errno = op_errno;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ if (call_count == 0) {
+ if ((local->op_ret == -1) &&
+ (local->op_errno == ENOSYS)) {
+ afr_unlock (frame, this);
+ } else {
+ local->transaction.locked_nodes[child_index] = 1;
+ local->transaction.lock_count++;
+ afr_lock_rec (frame, this, child_index + 1);
+ }
+ }
+
+ return 0;
+}
+
+
+static loc_t *
+lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2)
+{
+ int ret = 0;
+
+ ret = strcmp (l1->path, l2->path);
+
+ if (ret == 0)
+ ret = strcmp (b1, b2);
+
+ if (ret <= 0)
+ return l1;
+ else
+ return l2;
+}
+
+
+static
+int afr_lock_rec (call_frame_t *frame, xlator_t *this, int child_index)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ struct flock flock;
+
+ loc_t * lower = NULL;
+ loc_t * higher = NULL;
+
+ const char *lower_name = NULL;
+ const char *higher_name = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ flock.l_start = local->transaction.start;
+ flock.l_len = local->transaction.len;
+ flock.l_type = F_WRLCK;
+
+ /* skip over children that are down */
+ while ((child_index < priv->child_count)
+ && !local->child_up[child_index])
+ child_index++;
+
+ if ((child_index == priv->child_count) &&
+ local->transaction.lock_count == 0) {
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "unable to lock on even one child");
+
+ local->op_ret = -1;
+ local->op_errno = EAGAIN;
+
+ local->transaction.done (frame, this);
+
+ return 0;
+
+ }
+
+ if ((child_index == priv->child_count)
+ || (local->transaction.lock_count ==
+ afr_lock_server_count (priv, local->transaction.type))) {
+
+ /* we're done locking */
+
+ if (__changelog_needed_pre_op (frame, this)) {
+ afr_changelog_pre_op (frame, this);
+ } else {
+ local->transaction.fop (frame, this);
+ }
+
+ return 0;
+ }
+
+ switch (local->transaction.type) {
+ case AFR_DATA_TRANSACTION:
+ case AFR_METADATA_TRANSACTION:
+ case AFR_FLUSH_TRANSACTION:
+
+ if (local->fd) {
+ STACK_WIND_COOKIE (frame, afr_lock_cbk,
+ (void *) (long) child_index,
+ priv->children[child_index],
+ priv->children[child_index]->fops->finodelk,
+ local->fd, F_SETLKW, &flock);
+
+ } else {
+ STACK_WIND_COOKIE (frame, afr_lock_cbk,
+ (void *) (long) child_index,
+ priv->children[child_index],
+ priv->children[child_index]->fops->inodelk,
+ &local->loc, F_SETLKW, &flock);
+ }
+
+ break;
+
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ {
+ local->call_count = 2;
+
+ lower = lower_path (&local->transaction.parent_loc,
+ local->transaction.basename,
+ &local->transaction.new_parent_loc,
+ local->transaction.new_basename);
+
+ lower_name = (lower == &local->transaction.parent_loc ?
+ local->transaction.basename :
+ local->transaction.new_basename);
+
+ higher = (lower == &local->transaction.parent_loc ?
+ &local->transaction.new_parent_loc :
+ &local->transaction.parent_loc);
+
+ higher_name = (higher == &local->transaction.parent_loc ?
+ local->transaction.basename :
+ local->transaction.new_basename);
+
+
+ /* TODO: these locks should be blocking */
+
+ STACK_WIND_COOKIE (frame, afr_lock_cbk,
+ (void *) (long) child_index,
+ priv->children[child_index],
+ priv->children[child_index]->fops->entrylk,
+ lower, lower_name,
+ ENTRYLK_LOCK, ENTRYLK_WRLCK);
+
+ STACK_WIND_COOKIE (frame, afr_lock_cbk,
+ (void *) (long) child_index,
+ priv->children[child_index],
+ priv->children[child_index]->fops->entrylk,
+ higher, higher_name,
+ ENTRYLK_LOCK, ENTRYLK_WRLCK);
+
+ break;
+ }
+
+ case AFR_ENTRY_TRANSACTION:
+ if (local->fd) {
+ STACK_WIND_COOKIE (frame, afr_lock_cbk,
+ (void *) (long) child_index,
+ priv->children[child_index],
+ priv->children[child_index]->fops->fentrylk,
+ local->fd,
+ local->transaction.basename,
+ ENTRYLK_LOCK, ENTRYLK_WRLCK);
+ } else {
+ STACK_WIND_COOKIE (frame, afr_lock_cbk,
+ (void *) (long) child_index,
+ priv->children[child_index],
+ priv->children[child_index]->fops->entrylk,
+ &local->transaction.parent_loc,
+ local->transaction.basename,
+ ENTRYLK_LOCK, ENTRYLK_WRLCK);
+ }
+
+ break;
+ }
+
+ return 0;
+}
+
+
+int32_t afr_lock (call_frame_t *frame, xlator_t *this)
+{
+ return afr_lock_rec (frame, this, 0);
+}
+
+
+/* }}} */
+
+int32_t
+afr_transaction_resume (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (__changelog_needed_post_op (frame, this)) {
+ afr_changelog_post_op (frame, this);
+ } else {
+ if (afr_lock_server_count (priv, local->transaction.type) == 0) {
+ local->transaction.done (frame, this);
+ } else {
+ afr_unlock (frame, this);
+ }
+ }
+
+ return 0;
+}
+
+
+/**
+ * afr_transaction_child_died - inform that a child died during an fop
+ */
+
+void
+afr_transaction_child_died (call_frame_t *frame, xlator_t *this, int child_index)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ __mark_child_dead (local->pending_array, priv->child_count, child_index);
+}
+
+
+int32_t
+afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ afr_transaction_local_init (local, priv);
+
+ local->transaction.resume = afr_transaction_resume;
+ local->transaction.type = type;
+
+ if (afr_lock_server_count (priv, local->transaction.type) == 0) {
+ if (__changelog_needed_pre_op (frame, this)) {
+ afr_changelog_pre_op (frame, this);
+ } else {
+ local->transaction.fop (frame, this);
+ }
+ } else {
+ afr_lock (frame, this);
+ }
+
+ return 0;
+}
diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h
new file mode 100644
index 00000000000..49cdd219f25
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-transaction.h
@@ -0,0 +1,36 @@
+/*
+ Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __TRANSACTION_H__
+#define __TRANSACTION_H__
+
+#define AFR_METADATA_PENDING "trusted.glusterfs.afr.metadata-pending"
+
+#define AFR_DATA_PENDING "trusted.glusterfs.afr.data-pending"
+
+#define AFR_ENTRY_PENDING "trusted.glusterfs.afr.entry-pending"
+
+void
+afr_transaction_child_died (call_frame_t *frame, xlator_t *this,
+ int child_index);
+
+int32_t
+afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type);
+
+#endif /* __TRANSACTION_H__ */
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
new file mode 100644
index 00000000000..e4c1a847985
--- /dev/null
+++ b/xlators/cluster/afr/src/afr.c
@@ -0,0 +1,2338 @@
+/*
+ Copyright (c) 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#include <libgen.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include <sys/time.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "afr.h"
+#include "dict.h"
+#include "xlator.h"
+#include "hashfn.h"
+#include "logging.h"
+#include "stack.h"
+#include "list.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "common-utils.h"
+#include "compat-errno.h"
+#include "compat.h"
+#include "byte-order.h"
+
+#include "afr-inode-read.h"
+#include "afr-inode-write.h"
+#include "afr-dir-read.h"
+#include "afr-dir-write.h"
+#include "afr-transaction.h"
+
+#include "afr-self-heal.h"
+
+
+/**
+ * afr_local_cleanup - cleanup everything in frame->local
+ */
+
+void
+afr_local_sh_cleanup (afr_local_t *local, xlator_t *this)
+{
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+
+
+ sh = &local->self_heal;
+ priv = this->private;
+
+ if (sh->buf)
+ FREE (sh->buf);
+
+ if (sh->xattr) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (sh->xattr[i]) {
+ dict_unref (sh->xattr[i]);
+ sh->xattr[i] = NULL;
+ }
+ }
+ FREE (sh->xattr);
+ }
+
+ if (sh->child_errno)
+ FREE (sh->child_errno);
+
+ if (sh->pending_matrix) {
+ for (i = 0; i < priv->child_count; i++) {
+ FREE (sh->pending_matrix[i]);
+ }
+ FREE (sh->pending_matrix);
+ }
+
+ if (sh->delta_matrix) {
+ for (i = 0; i < priv->child_count; i++) {
+ FREE (sh->delta_matrix[i]);
+ }
+ FREE (sh->delta_matrix);
+ }
+
+ if (sh->sources)
+ FREE (sh->sources);
+
+ if (sh->success)
+ FREE (sh->success);
+
+ if (sh->healing_fd) {
+ fd_unref (sh->healing_fd);
+ sh->healing_fd = NULL;
+ }
+
+ loc_wipe (&sh->parent_loc);
+}
+
+
+void
+afr_local_cleanup (afr_local_t *local, xlator_t *this)
+{
+ if (!local)
+ return;
+
+ afr_local_sh_cleanup (local, this);
+
+ FREE (local->child_errno);
+ FREE (local->pending_array);
+
+ loc_wipe (&local->loc);
+ loc_wipe (&local->newloc);
+
+ FREE (local->transaction.locked_nodes);
+ FREE (local->transaction.child_errno);
+
+ FREE (local->transaction.basename);
+ FREE (local->transaction.new_basename);
+
+ loc_wipe (&local->transaction.parent_loc);
+ loc_wipe (&local->transaction.new_parent_loc);
+
+ if (local->fd)
+ fd_unref (local->fd);
+
+ if (local->xattr_req)
+ dict_unref (local->xattr_req);
+
+ FREE (local->child_up);
+
+ { /* lookup */
+ if (local->cont.lookup.xattr)
+ dict_unref (local->cont.lookup.xattr);
+ }
+
+ { /* getxattr */
+ if (local->cont.getxattr.name)
+ FREE (local->cont.getxattr.name);
+ }
+
+ { /* lk */
+ if (local->cont.lk.locked_nodes)
+ FREE (local->cont.lk.locked_nodes);
+ }
+
+ { /* checksum */
+ if (local->cont.checksum.file_checksum)
+ FREE (local->cont.checksum.file_checksum);
+ if (local->cont.checksum.dir_checksum)
+ FREE (local->cont.checksum.dir_checksum);
+ }
+
+ { /* create */
+ if (local->cont.create.fd)
+ fd_unref (local->cont.create.fd);
+ }
+
+ { /* writev */
+ FREE (local->cont.writev.vector);
+ }
+
+ { /* setxattr */
+ if (local->cont.setxattr.dict)
+ dict_unref (local->cont.setxattr.dict);
+ }
+
+ { /* removexattr */
+ FREE (local->cont.removexattr.name);
+ }
+
+ { /* symlink */
+ FREE (local->cont.symlink.linkpath);
+ }
+}
+
+
+int
+afr_frame_return (call_frame_t *frame)
+{
+ afr_local_t *local = NULL;
+ int call_count = 0;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ call_count = --local->call_count;
+ }
+ UNLOCK (&frame->lock);
+
+ return call_count;
+}
+
+/**
+ * first_up_child - return the index of the first child that is up
+ */
+
+int
+afr_first_up_child (afr_private_t *priv)
+{
+ xlator_t ** children = NULL;
+ int ret = -1;
+ int i = 0;
+
+ LOCK (&priv->lock);
+ {
+ children = priv->children;
+ for (i = 0; i < priv->child_count; i++) {
+ if (priv->child_up[i]) {
+ ret = i;
+ break;
+ }
+ }
+ }
+ UNLOCK (&priv->lock);
+
+ return ret;
+}
+
+
+/**
+ * up_children_count - return the number of children that are up
+ */
+
+int
+afr_up_children_count (int child_count, unsigned char *child_up)
+{
+ int i = 0;
+ int ret = 0;
+
+ for (i = 0; i < child_count; i++)
+ if (child_up[i])
+ ret++;
+ return ret;
+}
+
+
+int
+afr_locked_nodes_count (unsigned char *locked_nodes, int child_count)
+{
+ int ret = 0;
+ int i;
+
+ for (i = 0; i < child_count; i++)
+ if (locked_nodes[i])
+ ret++;
+
+ return ret;
+}
+
+
+ino64_t
+afr_itransform (ino64_t ino, int child_count, int child_index)
+{
+ ino64_t scaled_ino = -1;
+
+ if (ino == ((uint64_t) -1)) {
+ scaled_ino = ((uint64_t) -1);
+ goto out;
+ }
+
+ scaled_ino = (ino * child_count) + child_index;
+
+out:
+ return scaled_ino;
+}
+
+
+int
+afr_deitransform_orig (ino64_t ino, int child_count)
+{
+ int index = -1;
+
+ index = ino % child_count;
+
+ return index;
+}
+
+
+int
+afr_deitransform (ino64_t ino, int child_count)
+{
+ return 0;
+}
+
+
+int
+afr_self_heal_cbk (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ int ret = -1;
+
+ local = frame->local;
+
+ if (local->govinda_gOvinda) {
+ ret = inode_ctx_put (local->cont.lookup.inode, this, 1);
+
+ if (ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = -ret;
+ }
+ } else {
+ inode_ctx_del (local->cont.lookup.inode, this, NULL);
+ }
+
+ AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+ local->cont.lookup.inode,
+ &local->cont.lookup.buf,
+ local->cont.lookup.xattr);
+
+ return 0;
+}
+
+
+int
+afr_lookup_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ inode_t *inode, struct stat *buf, dict_t *xattr)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+ struct stat * lookup_buf = NULL;
+ int call_count = -1;
+ int child_index = -1;
+ int prev_child_index = -1;
+ uint32_t open_fd_count = 0;
+ int ret = 0;
+
+ child_index = (long) cookie;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ local = frame->local;
+
+ lookup_buf = &local->cont.lookup.buf;
+
+ if (op_ret == -1) {
+ if (op_errno == ENOENT)
+ local->enoent_count++;
+
+ if (op_errno != ENOTCONN)
+ local->op_errno = op_errno;
+
+ goto unlock;
+ }
+
+ if (afr_sh_has_metadata_pending (xattr, child_index, this))
+ local->need_metadata_self_heal = 1;
+
+ if (afr_sh_has_entry_pending (xattr, child_index, this))
+ local->need_entry_self_heal = 1;
+
+ if (afr_sh_has_data_pending (xattr, child_index, this))
+ local->need_data_self_heal = 1;
+
+ ret = dict_get_uint32 (xattr, GLUSTERFS_OPEN_FD_COUNT,
+ &open_fd_count);
+ local->open_fd_count += open_fd_count;
+
+ /* in case of revalidate, we need to send stat of the
+ * child whose stat was sent during the first lookup.
+ * (so that time stamp does not vary with revalidate.
+ * in case it is down, stat of the fist success will
+ * be replied */
+
+ /* inode number should be preserved across revalidates */
+
+ if (local->success_count == 0) {
+ local->op_ret = op_ret;
+
+ local->cont.lookup.inode = inode;
+ local->cont.lookup.xattr = dict_ref (xattr);
+
+ *lookup_buf = *buf;
+ lookup_buf->st_ino = afr_itransform (buf->st_ino,
+ priv->child_count,
+ child_index);
+ } else {
+ if (FILETYPE_DIFFERS (buf, lookup_buf)) {
+ /* mismatching filetypes with same name
+ -- Govinda !! GOvinda !!!
+ */
+ local->govinda_gOvinda = 1;
+ }
+
+ if (PERMISSION_DIFFERS (buf, lookup_buf)) {
+ /* mismatching permissions */
+ local->need_metadata_self_heal = 1;
+ }
+
+ if (OWNERSHIP_DIFFERS (buf, lookup_buf)) {
+ /* mismatching permissions */
+ local->need_metadata_self_heal = 1;
+ }
+
+ if (SIZE_DIFFERS (buf, lookup_buf)
+ && S_ISREG (buf->st_mode)) {
+ local->need_data_self_heal = 1;
+ }
+
+ prev_child_index = afr_deitransform_orig (lookup_buf->st_ino,
+ priv->child_count);
+ if (child_index < prev_child_index) {
+ *lookup_buf = *buf;
+ lookup_buf->st_ino = afr_itransform (buf->st_ino,
+ priv->child_count,
+ child_index);
+ }
+ }
+
+ local->success_count++;
+ }
+unlock:
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ if (local->op_ret == 0) {
+ /* KLUDGE: assuming DHT will not itransform in
+ revalidate */
+ if (local->cont.lookup.inode->ino)
+ lookup_buf->st_ino =
+ local->cont.lookup.inode->ino;
+ }
+
+ if (local->success_count && local->enoent_count) {
+ local->need_metadata_self_heal = 1;
+ local->need_data_self_heal = 1;
+ local->need_entry_self_heal = 1;
+ }
+
+ if (local->success_count) {
+ /* check for govinda_gOvinda case in previous lookup */
+ if (!inode_ctx_get (local->cont.lookup.inode,
+ this, NULL))
+ local->need_data_self_heal = 1;
+ }
+
+ if ((local->need_metadata_self_heal
+ || local->need_data_self_heal
+ || local->need_entry_self_heal)
+ && (!local->open_fd_count)) {
+
+ if (!local->cont.lookup.inode->st_mode) {
+ /* fix for RT #602 */
+ local->cont.lookup.inode->st_mode =
+ lookup_buf->st_mode;
+ }
+
+ afr_self_heal (frame, this, afr_self_heal_cbk);
+ } else {
+ AFR_STACK_UNWIND (frame, local->op_ret,
+ local->op_errno,
+ local->cont.lookup.inode,
+ &local->cont.lookup.buf,
+ local->cont.lookup.xattr);
+ }
+ }
+
+ return 0;
+}
+
+
+int
+afr_lookup (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, dict_t *xattr_req)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int ret = -1;
+ int i = 0;
+ int32_t op_errno = 0;
+
+
+ priv = this->private;
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ local->op_ret = -1;
+
+ frame->local = local;
+
+ loc_copy (&local->loc, loc);
+
+ local->reval_child_index = 0;
+
+ local->call_count = priv->child_count;
+
+ local->child_up = memdup (priv->child_up, priv->child_count);
+ local->child_count = afr_up_children_count (priv->child_count,
+ local->child_up);
+
+ /* By default assume ENOTCONN. On success it will be set to 0. */
+ local->op_errno = ENOTCONN;
+
+ if ((xattr_req == NULL)
+ && (priv->metadata_self_heal
+ || priv->data_self_heal
+ || priv->entry_self_heal))
+ local->xattr_req = dict_new ();
+ else
+ local->xattr_req = dict_ref (xattr_req);
+
+ if (priv->metadata_self_heal) {
+ ret = dict_set_uint64 (local->xattr_req, AFR_METADATA_PENDING,
+ priv->child_count * sizeof(int32_t));
+ }
+
+ if (priv->data_self_heal) {
+ ret = dict_set_uint64 (local->xattr_req, AFR_DATA_PENDING,
+ priv->child_count * sizeof(int32_t));
+ }
+
+ if (priv->entry_self_heal) {
+ ret = dict_set_uint64 (local->xattr_req, AFR_ENTRY_PENDING,
+ priv->child_count * sizeof(int32_t));
+ }
+
+ ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_OPEN_FD_COUNT, 0);
+
+ for (i = 0; i < priv->child_count; i++) {
+ STACK_WIND_COOKIE (frame, afr_lookup_cbk, (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->lookup,
+ loc, local->xattr_req);
+ }
+
+ ret = 0;
+out:
+ if (ret == -1)
+ AFR_STACK_UNWIND (frame, -1, ENOMEM, NULL, NULL, NULL);
+
+ return 0;
+}
+
+
+/* {{{ open */
+
+int
+afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+ afr_local_t * local = frame->local;
+
+ AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+ local->fd);
+ return 0;
+}
+
+
+int
+afr_open_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ fd_t *fd)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int call_count = -1;
+
+ priv = this->private;
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ }
+
+ if (op_ret >= 0) {
+ local->op_ret = op_ret;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ if ((local->cont.open.flags & O_TRUNC)
+ && (local->op_ret >= 0)) {
+ STACK_WIND (frame, afr_open_ftruncate_cbk,
+ this, this->fops->ftruncate,
+ fd, 0);
+ } else {
+ AFR_STACK_UNWIND (frame, local->op_ret,
+ local->op_errno, local->fd);
+ }
+ }
+
+ return 0;
+}
+
+
+int
+afr_open (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int32_t flags, fd_t *fd)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+
+ int i = 0;
+ int ret = -1;
+
+ int32_t call_count = 0;
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+ int32_t wind_flags = flags & (~O_TRUNC);
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+ VALIDATE_OR_GOTO (loc, out);
+
+ priv = this->private;
+
+ ret = inode_ctx_get (loc->inode, this, NULL);
+ if (ret == 0) {
+ /* if ctx is set it means self-heal failed */
+
+ gf_log (this->name, GF_LOG_WARNING,
+ "returning EIO, file has to be manually corrected "
+ "in backend");
+ op_errno = EIO;
+ goto out;
+ }
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ ret = AFR_LOCAL_INIT (local, priv);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ frame->local = local;
+ call_count = local->call_count;
+
+ local->cont.open.flags = flags;
+ local->fd = fd_ref (fd);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->open,
+ loc, wind_flags, fd);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ AFR_STACK_UNWIND (frame, op_ret, op_errno, fd);
+ }
+
+ return 0;
+}
+
+/* }}} */
+
+/* {{{ flush */
+
+int
+afr_flush_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno)
+{
+ afr_local_t * local = NULL;
+
+ int call_count = -1;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == 0)
+ local->op_ret = 0;
+
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+
+int
+afr_flush_wind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ int i = 0;
+ int call_count = -1;
+
+ local = frame->local;
+ priv = this->private;
+
+ call_count = afr_up_children_count (priv->child_count, local->child_up);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_flush_wind_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->flush,
+ local->fd);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+int
+afr_flush_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+
+ return 0;
+}
+
+
+int
+afr_simple_flush_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno)
+{
+ afr_local_t *local = NULL;
+
+ int call_count = -1;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == 0)
+ local->op_ret = 0;
+
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+
+ return 0;
+}
+
+
+static int
+__is_fd_ctx_set (xlator_t *this, fd_t *fd)
+{
+ int _ret = 0;
+ int op_ret = 0;
+
+ _ret = fd_ctx_get (fd, this, NULL);
+ if (_ret == 0)
+ op_ret = 1;
+
+ return op_ret;
+}
+
+
+int
+afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+
+ int ret = -1;
+ int i = 0;
+ int call_count = 0;
+
+ int op_ret = -1;
+ int op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ ret = AFR_LOCAL_INIT (local, priv);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ frame->local = local;
+
+ if (__is_fd_ctx_set (this, fd)) {
+ local->op = GF_FOP_FLUSH;
+ local->transaction.fop = afr_flush_wind;
+ local->transaction.done = afr_flush_done;
+
+ local->fd = fd_ref (fd);
+
+ local->transaction.start = 0;
+ local->transaction.len = 0;
+
+ local->transaction.pending = AFR_DATA_PENDING;
+
+ afr_transaction (frame, this, AFR_FLUSH_TRANSACTION);
+ } else {
+ /*
+ * if fd's ctx is not set, then there is no need
+ * to erase changelog. So just send the flush
+ */
+
+ call_count = local->call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND (frame, afr_simple_flush_cbk,
+ priv->children[i],
+ priv->children[i]->fops->flush,
+ fd);
+
+ if (!--call_count)
+ break;
+ }
+ }
+ }
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+ }
+
+ return 0;
+}
+
+/* }}} */
+
+/* {{{ fsync */
+
+int
+afr_fsync_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno)
+{
+ afr_local_t *local = NULL;
+
+ int call_count = -1;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == 0)
+ local->op_ret = 0;
+
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+
+ return 0;
+}
+
+
+int
+afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int32_t datasync)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+
+ int ret = -1;
+
+ int i = 0;
+ int32_t call_count = 0;
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ ret = AFR_LOCAL_INIT (local, priv);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ call_count = local->call_count;
+ frame->local = local;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND (frame, afr_fsync_cbk,
+ priv->children[i],
+ priv->children[i]->fops->fsync,
+ fd, datasync);
+ if (!--call_count)
+ break;
+ }
+ }
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ AFR_STACK_UNWIND (frame, op_ret, op_errno);
+ }
+ return 0;
+}
+
+/* }}} */
+
+/* {{{ fsync */
+
+int32_t
+afr_fsyncdir_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno)
+{
+ afr_local_t *local = NULL;
+
+ int call_count = -1;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == 0)
+ local->op_ret = 0;
+
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+
+ return 0;
+}
+
+
+int32_t
+afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int32_t datasync)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+
+ int ret = -1;
+
+ int i = 0;
+ int32_t call_count = 0;
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ ret = AFR_LOCAL_INIT (local, priv);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ call_count = local->call_count;
+ frame->local = local;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND (frame, afr_fsync_cbk,
+ priv->children[i],
+ priv->children[i]->fops->fsyncdir,
+ fd, datasync);
+ if (!--call_count)
+ break;
+ }
+ }
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ AFR_STACK_UNWIND (frame, op_ret, op_errno);
+ }
+ return 0;
+}
+
+/* }}} */
+
+/* {{{ xattrop */
+
+int32_t
+afr_xattrop_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ dict_t *xattr)
+{
+ afr_local_t *local = NULL;
+
+ int call_count = -1;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == 0)
+ local->op_ret = 0;
+
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, xattr);
+
+ return 0;
+}
+
+
+int32_t
+afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ gf_xattrop_flags_t optype, dict_t *xattr)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+
+ int ret = -1;
+
+ int i = 0;
+ int32_t call_count = 0;
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ ret = AFR_LOCAL_INIT (local, priv);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ call_count = local->call_count;
+ frame->local = local;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND (frame, afr_xattrop_cbk,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ loc, optype, xattr);
+ if (!--call_count)
+ break;
+ }
+ }
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ AFR_STACK_UNWIND (frame, op_ret, op_errno);
+ }
+ return 0;
+}
+
+/* }}} */
+
+/* {{{ fxattrop */
+
+int32_t
+afr_fxattrop_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ dict_t *xattr)
+{
+ afr_local_t *local = NULL;
+
+ int call_count = -1;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == 0)
+ local->op_ret = 0;
+
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, xattr);
+
+ return 0;
+}
+
+
+int32_t
+afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ gf_xattrop_flags_t optype, dict_t *xattr)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+
+ int ret = -1;
+
+ int i = 0;
+ int32_t call_count = 0;
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ ret = AFR_LOCAL_INIT (local, priv);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ call_count = local->call_count;
+ frame->local = local;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND (frame, afr_fxattrop_cbk,
+ priv->children[i],
+ priv->children[i]->fops->fxattrop,
+ fd, optype, xattr);
+ if (!--call_count)
+ break;
+ }
+ }
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ AFR_STACK_UNWIND (frame, op_ret, op_errno);
+ }
+ return 0;
+}
+
+/* }}} */
+
+
+int32_t
+afr_inodelk_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno)
+
+{
+ afr_local_t *local = NULL;
+
+ int call_count = -1;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == 0)
+ local->op_ret = 0;
+
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+
+ return 0;
+}
+
+
+int32_t
+afr_inodelk (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ int32_t cmd, struct flock *flock)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+
+ int ret = -1;
+
+ int i = 0;
+ int32_t call_count = 0;
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ ret = AFR_LOCAL_INIT (local, priv);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ call_count = local->call_count;
+ frame->local = local;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND (frame, afr_inodelk_cbk,
+ priv->children[i],
+ priv->children[i]->fops->inodelk,
+ loc, cmd, flock);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ AFR_STACK_UNWIND (frame, op_ret, op_errno);
+ }
+ return 0;
+}
+
+
+int32_t
+afr_finodelk_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno)
+
+{
+ afr_local_t *local = NULL;
+
+ int call_count = -1;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == 0)
+ local->op_ret = 0;
+
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+
+ return 0;
+}
+
+
+int32_t
+afr_finodelk (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int32_t cmd, struct flock *flock)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+
+ int ret = -1;
+
+ int i = 0;
+ int32_t call_count = 0;
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ ret = AFR_LOCAL_INIT (local, priv);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ call_count = local->call_count;
+ frame->local = local;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND (frame, afr_finodelk_cbk,
+ priv->children[i],
+ priv->children[i]->fops->finodelk,
+ fd, cmd, flock);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ AFR_STACK_UNWIND (frame, op_ret, op_errno);
+ }
+ return 0;
+}
+
+
+int32_t
+afr_entrylk_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno)
+
+{
+ afr_local_t *local = NULL;
+
+ int call_count = -1;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == 0)
+ local->op_ret = 0;
+
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+
+ return 0;
+}
+
+
+int32_t
+afr_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *basename, entrylk_cmd cmd, entrylk_type type)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+
+ int ret = -1;
+
+ int i = 0;
+ int32_t call_count = 0;
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ ret = AFR_LOCAL_INIT (local, priv);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ call_count = local->call_count;
+ frame->local = local;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND (frame, afr_entrylk_cbk,
+ priv->children[i],
+ priv->children[i]->fops->entrylk,
+ loc, basename, cmd, type);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ AFR_STACK_UNWIND (frame, op_ret, op_errno);
+ }
+ return 0;
+}
+
+
+
+int32_t
+afr_fentrylk_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno)
+
+{
+ afr_local_t *local = NULL;
+
+ int call_count = -1;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == 0)
+ local->op_ret = 0;
+
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+
+ return 0;
+}
+
+
+int32_t
+afr_fentrylk (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *basename, entrylk_cmd cmd, entrylk_type type)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+
+ int ret = -1;
+
+ int i = 0;
+ int32_t call_count = 0;
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ ret = AFR_LOCAL_INIT (local, priv);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ call_count = local->call_count;
+ frame->local = local;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND (frame, afr_fentrylk_cbk,
+ priv->children[i],
+ priv->children[i]->fops->fentrylk,
+ fd, basename, cmd, type);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ AFR_STACK_UNWIND (frame, op_ret, op_errno);
+ }
+ return 0;
+}
+
+
+int32_t
+afr_checksum_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ uint8_t *file_checksum, uint8_t *dir_checksum)
+
+{
+ afr_local_t *local = NULL;
+
+ int call_count = -1;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == 0 && (local->op_ret != 0)) {
+ local->op_ret = 0;
+
+ local->cont.checksum.file_checksum = MALLOC (ZR_FILENAME_MAX);
+ memcpy (local->cont.checksum.file_checksum, file_checksum,
+ ZR_FILENAME_MAX);
+
+ local->cont.checksum.dir_checksum = MALLOC (ZR_FILENAME_MAX);
+ memcpy (local->cont.checksum.dir_checksum, dir_checksum,
+ ZR_FILENAME_MAX);
+
+ }
+
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+ local->cont.checksum.file_checksum,
+ local->cont.checksum.dir_checksum);
+
+ return 0;
+}
+
+
+int32_t
+afr_checksum (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ int32_t flag)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+
+ int ret = -1;
+
+ int i = 0;
+ int32_t call_count = 0;
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ ret = AFR_LOCAL_INIT (local, priv);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ call_count = local->call_count;
+ frame->local = local;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND (frame, afr_checksum_cbk,
+ priv->children[i],
+ priv->children[i]->fops->checksum,
+ loc, flag);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ AFR_STACK_UNWIND (frame, op_ret, op_errno);
+ }
+ return 0;
+}
+
+
+int32_t
+afr_statfs_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ struct statvfs *statvfs)
+{
+ afr_local_t *local = NULL;
+
+ int call_count = 0;
+
+ LOCK (&frame->lock);
+ {
+ local = frame->local;
+
+ if (op_ret == 0) {
+ local->op_ret = op_ret;
+
+ if (local->cont.statfs.buf_set) {
+ if (statvfs->f_bavail < local->cont.statfs.buf.f_bavail)
+ local->cont.statfs.buf = *statvfs;
+ } else {
+ local->cont.statfs.buf = *statvfs;
+ local->cont.statfs.buf_set = 1;
+ }
+ }
+
+ if (op_ret == -1)
+ local->op_errno = op_errno;
+
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+ &local->cont.statfs.buf);
+
+ return 0;
+}
+
+
+int32_t
+afr_statfs (call_frame_t *frame, xlator_t *this,
+ loc_t *loc)
+{
+ afr_private_t * priv = NULL;
+ int child_count = 0;
+ afr_local_t * local = NULL;
+ int i = 0;
+
+ int ret = -1;
+ int call_count = 0;
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+ VALIDATE_OR_GOTO (loc, out);
+
+ priv = this->private;
+ child_count = priv->child_count;
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ ret = AFR_LOCAL_INIT (local, priv);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ frame->local = local;
+ call_count = local->call_count;
+
+ for (i = 0; i < child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND (frame, afr_statfs_cbk,
+ priv->children[i],
+ priv->children[i]->fops->statfs,
+ loc);
+ if (!--call_count)
+ break;
+ }
+ }
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+ }
+ return 0;
+}
+
+
+int32_t
+afr_lk_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct flock *lock)
+{
+ afr_local_t * local = NULL;
+
+ int call_count = -1;
+
+ local = frame->local;
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+ lock);
+
+ return 0;
+}
+
+
+int32_t
+afr_lk_unlock (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int i;
+ int call_count = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ call_count = afr_locked_nodes_count (local->cont.lk.locked_nodes,
+ priv->child_count);
+
+ if (call_count == 0) {
+ AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+ &local->cont.lk.flock);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ local->cont.lk.flock.l_type = F_UNLCK;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->cont.lk.locked_nodes[i]) {
+ STACK_WIND (frame, afr_lk_unlock_cbk,
+ priv->children[i],
+ priv->children[i]->fops->lk,
+ local->fd, F_SETLK,
+ &local->cont.lk.flock);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+int32_t
+afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct flock *lock)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ int call_count = -1;
+ int child_index = -1;
+
+ local = frame->local;
+ priv = this->private;
+
+ child_index = (long) cookie;
+
+ call_count = --local->call_count;
+
+ if (!child_went_down (op_ret, op_errno) && (op_ret == -1)) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+
+ afr_lk_unlock (frame, this);
+ return 0;
+ }
+
+ if (op_ret == 0) {
+ local->op_ret = 0;
+ local->op_errno = 0;
+ local->cont.lk.flock = *lock;
+ local->cont.lk.locked_nodes[child_index] = 1;
+ }
+
+ child_index++;
+
+ if (child_index < priv->child_count) {
+ STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) child_index,
+ priv->children[child_index],
+ priv->children[child_index]->fops->lk,
+ local->fd, local->cont.lk.cmd,
+ &local->cont.lk.flock);
+ } else if (local->op_ret == -1) {
+ /* all nodes have gone down */
+
+ AFR_STACK_UNWIND (frame, -1, ENOTCONN, &local->cont.lk.flock);
+ } else {
+ /* locking has succeeded on all nodes that are up */
+
+ AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+ &local->cont.lk.flock);
+ }
+
+ return 0;
+}
+
+
+int
+afr_lk (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, int32_t cmd,
+ struct flock *flock)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+
+ int i = 0;
+
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+ AFR_LOCAL_INIT (local, priv);
+
+ frame->local = local;
+
+ local->cont.lk.locked_nodes = CALLOC (priv->child_count,
+ sizeof (*local->cont.lk.locked_nodes));
+
+ if (!local->cont.lk.locked_nodes) {
+ gf_log (this->name, GF_LOG_ERROR, "out of memory :(");
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ local->fd = fd_ref (fd);
+ local->cont.lk.cmd = cmd;
+ local->cont.lk.flock = *flock;
+
+ STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) 0,
+ priv->children[i],
+ priv->children[i]->fops->lk,
+ fd, cmd, flock);
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+ }
+ return 0;
+}
+
+
+/**
+ * find_child_index - find the child's index in the array of subvolumes
+ * @this: AFR
+ * @child: child
+ */
+
+static int
+find_child_index (xlator_t *this, xlator_t *child)
+{
+ afr_private_t *priv = NULL;
+
+ int i = -1;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if ((xlator_t *) child == priv->children[i])
+ break;
+ }
+
+ return i;
+}
+
+
+int32_t
+notify (xlator_t *this, int32_t event,
+ void *data, ...)
+{
+ afr_private_t * priv = NULL;
+ unsigned char * child_up = NULL;
+
+ int i = -1;
+ int up_children = 0;
+
+ priv = this->private;
+
+ if (!priv)
+ return 0;
+
+ child_up = priv->child_up;
+
+ switch (event) {
+ case GF_EVENT_CHILD_UP:
+ i = find_child_index (this, data);
+
+ child_up[i] = 1;
+
+ /*
+ if all the children were down, and one child came up,
+ send notify to parent
+ */
+
+ for (i = 0; i < priv->child_count; i++)
+ if (child_up[i])
+ up_children++;
+
+ if (up_children == 1)
+ default_notify (this, event, data);
+
+ break;
+
+ case GF_EVENT_CHILD_DOWN:
+ i = find_child_index (this, data);
+
+ child_up[i] = 0;
+
+ /*
+ if all children are down, and this was the last to go down,
+ send notify to parent
+ */
+
+ for (i = 0; i < priv->child_count; i++)
+ if (child_up[i])
+ up_children++;
+
+ if (up_children == 0)
+ default_notify (this, event, data);
+
+ break;
+
+ default:
+ default_notify (this, event, data);
+ }
+
+ return 0;
+}
+
+
+static const char *favorite_child_warning_str = "You have specified subvolume '%s' "
+ "as the 'favorite child'. This means that if a discrepancy in the content "
+ "or attributes (ownership, permission, etc.) of a file is detected among "
+ "the subvolumes, the file on '%s' will be considered the definitive "
+ "version and its contents will OVERWRITE the contents of the file on other "
+ "subvolumes. All versions of the file except that on '%s' "
+ "WILL BE LOST.";
+
+static const char *no_lock_servers_warning_str = "You have set lock-server-count = 0. "
+ "This means correctness is NO LONGER GUARANTEED in all cases. If two or more "
+ "applications write to the same region of a file, there is a possibility that "
+ "its copies will be INCONSISTENT. Set it to a value greater than 0 unless you "
+ "are ABSOLUTELY SURE of what you are doing and WILL NOT HOLD GlusterFS "
+ "RESPOSIBLE for inconsistent data. If you are in doubt, set it to a value "
+ "greater than 0.";
+
+int32_t
+init (xlator_t *this)
+{
+ afr_private_t * priv = NULL;
+ int child_count = 0;
+ xlator_list_t * trav = NULL;
+ int i = 0;
+ int ret = -1;
+ int op_errno = 0;
+
+ char * read_subvol = NULL;
+ char * fav_child = NULL;
+ char * self_heal = NULL;
+ char * change_log = NULL;
+
+ int32_t lock_server_count = 1;
+
+ int fav_ret = -1;
+ int read_ret = -1;
+ int dict_ret = -1;
+
+ if (!this->children) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "AFR needs more than one child defined");
+ return -1;
+ }
+
+ if (!this->parents) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "dangling volume. check volfile ");
+ }
+
+ ALLOC_OR_GOTO (this->private, afr_private_t, out);
+
+ priv = this->private;
+
+ read_ret = dict_get_str (this->options, "read-subvolume", &read_subvol);
+ priv->read_child = -1;
+
+ fav_ret = dict_get_str (this->options, "favorite-child", &fav_child);
+ priv->favorite_child = -1;
+
+ /* Default values */
+
+ priv->data_self_heal = 1;
+ priv->metadata_self_heal = 1;
+ priv->entry_self_heal = 1;
+
+ dict_ret = dict_get_str (this->options, "data-self-heal", &self_heal);
+ if (dict_ret == 0) {
+ ret = gf_string2boolean (self_heal, &priv->data_self_heal);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "invalid 'option data-self-heal %s' "
+ "defaulting to data-self-heal as 'on'",
+ self_heal);
+ priv->data_self_heal = 1;
+ }
+ }
+
+ dict_ret = dict_get_str (this->options, "metadata-self-heal",
+ &self_heal);
+ if (dict_ret == 0) {
+ ret = gf_string2boolean (self_heal, &priv->metadata_self_heal);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "invalid 'option metadata-self-heal %s' "
+ "defaulting to metadata-self-heal as 'on'",
+ self_heal);
+ priv->metadata_self_heal = 1;
+ }
+ }
+
+ dict_ret = dict_get_str (this->options, "entry-self-heal", &self_heal);
+ if (dict_ret == 0) {
+ ret = gf_string2boolean (self_heal, &priv->entry_self_heal);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "invalid 'option entry-self-heal %s' "
+ "defaulting to entry-self-heal as 'on'",
+ self_heal);
+ priv->entry_self_heal = 1;
+ }
+ }
+
+ /* Change log options */
+
+ priv->data_change_log = 1;
+ priv->metadata_change_log = 0;
+ priv->entry_change_log = 1;
+
+ dict_ret = dict_get_str (this->options, "data-change-log",
+ &change_log);
+ if (dict_ret == 0) {
+ ret = gf_string2boolean (change_log, &priv->data_change_log);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "invalid 'option data-change-log %s'. "
+ "defaulting to data-change-log as 'on'",
+ change_log);
+ priv->data_change_log = 1;
+ }
+ }
+
+ dict_ret = dict_get_str (this->options, "metadata-change-log",
+ &change_log);
+ if (dict_ret == 0) {
+ ret = gf_string2boolean (change_log,
+ &priv->metadata_change_log);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "invalid 'option metadata-change-log %s'. "
+ "defaulting to metadata-change-log as 'off'",
+ change_log);
+ priv->metadata_change_log = 0;
+ }
+ }
+
+ dict_ret = dict_get_str (this->options, "entry-change-log",
+ &change_log);
+ if (dict_ret == 0) {
+ ret = gf_string2boolean (change_log, &priv->entry_change_log);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "invalid 'option entry-change-log %s'. "
+ "defaulting to entry-change-log as 'on'",
+ change_log);
+ priv->entry_change_log = 1;
+ }
+ }
+
+ /* Locking options */
+
+ priv->data_lock_server_count = 1;
+ priv->metadata_lock_server_count = 0;
+ priv->entry_lock_server_count = 1;
+
+ dict_ret = dict_get_int32 (this->options, "data-lock-server-count",
+ &lock_server_count);
+ if (dict_ret == 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "setting data lock server count to %d",
+ lock_server_count);
+
+ if (lock_server_count == 0)
+ gf_log (this->name, GF_LOG_WARNING,
+ no_lock_servers_warning_str);
+
+ priv->data_lock_server_count = lock_server_count;
+ }
+
+
+ dict_ret = dict_get_int32 (this->options,
+ "metadata-lock-server-count",
+ &lock_server_count);
+ if (dict_ret == 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "setting metadata lock server count to %d",
+ lock_server_count);
+ priv->metadata_lock_server_count = lock_server_count;
+ }
+
+
+ dict_ret = dict_get_int32 (this->options, "entry-lock-server-count",
+ &lock_server_count);
+ if (dict_ret == 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "setting entry lock server count to %d",
+ lock_server_count);
+
+ priv->entry_lock_server_count = lock_server_count;
+ }
+
+
+ trav = this->children;
+ while (trav) {
+ if (!read_ret && !strcmp (read_subvol, trav->xlator->name)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "subvolume '%s' specified as read child",
+ trav->xlator->name);
+
+ priv->read_child = child_count;
+ }
+
+ if (fav_ret == 0 && !strcmp (fav_child, trav->xlator->name)) {
+ gf_log (this->name, GF_LOG_WARNING,
+ favorite_child_warning_str, trav->xlator->name,
+ trav->xlator->name, trav->xlator->name);
+ priv->favorite_child = child_count;
+ }
+
+ child_count++;
+ trav = trav->next;
+ }
+
+ /* XXX: return inode numbers from 1st subvolume till
+ afr supports read-subvolume based on inode's ctx
+ (and not itransform) for this reason afr_deitransform()
+ returns 0 always
+ */
+ priv->read_child = 0;
+
+ priv->wait_count = 1;
+
+ priv->child_count = child_count;
+ LOCK_INIT (&priv->lock);
+
+ priv->child_up = CALLOC (sizeof (unsigned char), child_count);
+ if (!priv->child_up) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "out of memory :(");
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ priv->children = CALLOC (sizeof (xlator_t *), child_count);
+ if (!priv->children) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "out of memory :(");
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ trav = this->children;
+ i = 0;
+ while (i < child_count) {
+ priv->children[i] = trav->xlator;
+
+ trav = trav->next;
+ i++;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+
+int
+fini (xlator_t *this)
+{
+ return 0;
+}
+
+
+struct xlator_fops fops = {
+ .lookup = afr_lookup,
+ .open = afr_open,
+ .lk = afr_lk,
+ .flush = afr_flush,
+ .statfs = afr_statfs,
+ .fsync = afr_fsync,
+ .fsyncdir = afr_fsyncdir,
+ .xattrop = afr_xattrop,
+ .fxattrop = afr_fxattrop,
+ .inodelk = afr_inodelk,
+ .finodelk = afr_finodelk,
+ .entrylk = afr_entrylk,
+ .fentrylk = afr_fentrylk,
+ .checksum = afr_checksum,
+
+ /* inode read */
+ .access = afr_access,
+ .stat = afr_stat,
+ .fstat = afr_fstat,
+ .readlink = afr_readlink,
+ .getxattr = afr_getxattr,
+ .readv = afr_readv,
+
+ /* inode write */
+ .chmod = afr_chmod,
+ .chown = afr_chown,
+ .fchmod = afr_fchmod,
+ .fchown = afr_fchown,
+ .writev = afr_writev,
+ .truncate = afr_truncate,
+ .ftruncate = afr_ftruncate,
+ .utimens = afr_utimens,
+ .setxattr = afr_setxattr,
+ .removexattr = afr_removexattr,
+
+ /* dir read */
+ .opendir = afr_opendir,
+ .readdir = afr_readdir,
+ .getdents = afr_getdents,
+
+ /* dir write */
+ .create = afr_create,
+ .mknod = afr_mknod,
+ .mkdir = afr_mkdir,
+ .unlink = afr_unlink,
+ .rmdir = afr_rmdir,
+ .link = afr_link,
+ .symlink = afr_symlink,
+ .rename = afr_rename,
+ .setdents = afr_setdents,
+};
+
+
+struct xlator_mops mops = {
+};
+
+
+struct xlator_cbks cbks = {
+};
+
+struct volume_options options[] = {
+ { .key = {"read-subvolume" },
+ .type = GF_OPTION_TYPE_XLATOR
+ },
+ { .key = {"favorite-child"},
+ .type = GF_OPTION_TYPE_XLATOR
+ },
+ { .key = {"data-self-heal"},
+ .type = GF_OPTION_TYPE_BOOL
+ },
+ { .key = {"metadata-self-heal"},
+ .type = GF_OPTION_TYPE_BOOL
+ },
+ { .key = {"entry-self-heal"},
+ .type = GF_OPTION_TYPE_BOOL
+ },
+ { .key = {"data-change-log"},
+ .type = GF_OPTION_TYPE_BOOL
+ },
+ { .key = {"metadata-change-log"},
+ .type = GF_OPTION_TYPE_BOOL
+ },
+ { .key = {"entry-change-log"},
+ .type = GF_OPTION_TYPE_BOOL
+ },
+ { .key = {"data-lock-server-count"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0
+ },
+ { .key = {"metadata-lock-server-count"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0
+ },
+ { .key = {"entry-lock-server-count"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0
+ },
+ { .key = {NULL} },
+};
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
new file mode 100644
index 00000000000..4cf6cdf9dfe
--- /dev/null
+++ b/xlators/cluster/afr/src/afr.h
@@ -0,0 +1,523 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+
+#ifndef __AFR_H__
+#define __AFR_H__
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "scheduler.h"
+#include "call-stub.h"
+#include "compat-errno.h"
+
+
+typedef struct _afr_private {
+ gf_lock_t lock; /* to guard access to child_count, etc */
+ unsigned int child_count; /* total number of children */
+
+ xlator_t **children;
+
+ unsigned char *child_up;
+
+ gf_boolean_t data_self_heal; /* on/off */
+ gf_boolean_t metadata_self_heal; /* on/off */
+ gf_boolean_t entry_self_heal; /* on/off */
+
+
+ gf_boolean_t data_change_log; /* on/off */
+ gf_boolean_t metadata_change_log; /* on/off */
+ gf_boolean_t entry_change_log; /* on/off */
+
+ unsigned int read_child; /* read-subvolume */
+ unsigned int favorite_child; /* subvolume to be preferred in resolving
+ split-brain cases */
+
+ unsigned int data_lock_server_count;
+ unsigned int metadata_lock_server_count;
+ unsigned int entry_lock_server_count;
+
+ unsigned int wait_count; /* # of servers to wait for success */
+} afr_private_t;
+
+typedef struct {
+ /* array of stat's, one for each child */
+ struct stat *buf;
+
+ /* array of xattr's, one for each child */
+ dict_t **xattr;
+
+ /* array of errno's, one for each child */
+ int *child_errno;
+
+ int32_t **pending_matrix;
+ int32_t **delta_matrix;
+
+ int *sources;
+ int source;
+ int active_source;
+ int active_sinks;
+ int *success;
+
+ fd_t *healing_fd;
+ int op_failed;
+
+ int file_has_holes;
+ blksize_t block_size;
+ off_t file_size;
+ off_t offset;
+
+ loc_t parent_loc;
+ int (*completion_cbk) (call_frame_t *frame, xlator_t *this);
+ call_frame_t *sh_frame;
+} afr_self_heal_t;
+
+
+typedef enum {
+ AFR_DATA_TRANSACTION, /* truncate, write, ... */
+ AFR_METADATA_TRANSACTION, /* chmod, chown, ... */
+ AFR_ENTRY_TRANSACTION, /* create, rmdir, ... */
+ AFR_ENTRY_RENAME_TRANSACTION, /* rename */
+ AFR_FLUSH_TRANSACTION, /* flush */
+} afr_transaction_type;
+
+typedef struct _afr_local {
+ unsigned int call_count;
+ unsigned int success_count;
+ unsigned int enoent_count;
+
+ unsigned int need_metadata_self_heal;
+ unsigned int need_entry_self_heal;
+ unsigned int need_data_self_heal;
+ unsigned int govinda_gOvinda;
+
+ unsigned int reval_child_index;
+ int32_t op_ret;
+ int32_t op_errno;
+
+ int32_t *pending_array;
+
+ loc_t loc;
+ loc_t newloc;
+
+ fd_t *fd;
+
+ glusterfs_fop_t fop;
+
+ unsigned char *child_up;
+ int child_count;
+
+ int32_t *child_errno;
+
+ dict_t *xattr_req;
+ int open_fd_count;
+ /*
+ This struct contains the arguments for the "continuation"
+ (scheme-like) of fops
+ */
+
+ int op;
+ struct {
+ struct {
+ unsigned char buf_set;
+ struct statvfs buf;
+ } statfs;
+
+ struct {
+ inode_t *inode;
+ struct stat buf;
+ dict_t *xattr;
+ } lookup;
+
+ struct {
+ int32_t flags;
+ } open;
+
+ struct {
+ int32_t cmd;
+ struct flock flock;
+ unsigned char *locked_nodes;
+ } lk;
+
+ struct {
+ uint8_t *file_checksum;
+ uint8_t *dir_checksum;
+ } checksum;
+
+ /* inode read */
+
+ struct {
+ int32_t mask;
+ int last_tried; /* index of the child we tried previously */
+ } access;
+
+ struct {
+ int last_tried;
+ ino_t ino;
+ } stat;
+
+ struct {
+ int last_tried;
+ ino_t ino;
+ } fstat;
+
+ struct {
+ size_t size;
+ int last_tried;
+ } readlink;
+
+ struct {
+ const char *name;
+ int last_tried;
+ } getxattr;
+
+ struct {
+ size_t size;
+ off_t offset;
+ int last_tried;
+ } readv;
+
+ /* dir read */
+
+ struct {
+ int success_count;
+ int32_t op_ret;
+ int32_t op_errno;
+ } opendir;
+
+ struct {
+ int32_t op_ret;
+ int32_t op_errno;
+ size_t size;
+ off_t offset;
+
+ int last_tried;
+ } readdir;
+
+ struct {
+ int32_t op_ret;
+ int32_t op_errno;
+
+ size_t size;
+ off_t offset;
+ int32_t flag;
+
+ int last_tried;
+ } getdents;
+
+ /* inode write */
+
+ struct {
+ ino_t ino;
+ mode_t mode;
+ struct stat buf;
+ } chmod;
+
+ struct {
+ ino_t ino;
+ mode_t mode;
+ struct stat buf;
+ } fchmod;
+
+ struct {
+ ino_t ino;
+ uid_t uid;
+ gid_t gid;
+ struct stat buf;
+ } chown;
+
+ struct {
+ ino_t ino;
+ uid_t uid;
+ gid_t gid;
+ struct stat buf;
+ } fchown;
+
+ struct {
+ ino_t ino;
+ struct stat buf;
+
+ int32_t op_ret;
+
+ struct iovec *vector;
+ dict_t *refs;
+ int32_t count;
+ off_t offset;
+ } writev;
+
+ struct {
+ ino_t ino;
+ off_t offset;
+ struct stat buf;
+ } truncate;
+
+ struct {
+ ino_t ino;
+ off_t offset;
+ struct stat buf;
+ } ftruncate;
+
+ struct {
+ ino_t ino;
+ struct timespec tv[2];
+ struct stat buf;
+ } utimens;
+
+ struct {
+ dict_t *dict;
+ int32_t flags;
+ } setxattr;
+
+ struct {
+ const char *name;
+ } removexattr;
+
+ /* dir write */
+
+ struct {
+ ino_t ino;
+ fd_t *fd;
+ int32_t flags;
+ mode_t mode;
+ inode_t *inode;
+ struct stat buf;
+ } create;
+
+ struct {
+ ino_t ino;
+ dev_t dev;
+ mode_t mode;
+ inode_t *inode;
+ struct stat buf;
+ } mknod;
+
+ struct {
+ ino_t ino;
+ int32_t mode;
+ inode_t *inode;
+ struct stat buf;
+ } mkdir;
+
+ struct {
+ int32_t op_ret;
+ int32_t op_errno;
+ } unlink;
+
+ struct {
+ int32_t op_ret;
+ int32_t op_errno;
+ } rmdir;
+
+ struct {
+ ino_t ino;
+ struct stat buf;
+ } rename;
+
+ struct {
+ ino_t ino;
+ inode_t *inode;
+ struct stat buf;
+ } link;
+
+ struct {
+ ino_t ino;
+ inode_t *inode;
+ struct stat buf;
+ char *linkpath;
+ } symlink;
+
+ struct {
+ int32_t flags;
+ dir_entry_t *entries;
+ int32_t count;
+ } setdents;
+ } cont;
+
+ struct {
+ off_t start, len;
+
+ unsigned char *locked_nodes;
+ int lock_count;
+
+ const char *basename;
+ const char *new_basename;
+
+ char *pending;
+
+ loc_t parent_loc;
+ loc_t new_parent_loc;
+
+ afr_transaction_type type;
+
+ int success_count;
+ int erase_pending;
+ int failure_count;
+
+ int last_tried;
+ int32_t *child_errno;
+
+ call_frame_t *main_frame;
+
+ int (*fop) (call_frame_t *frame, xlator_t *this);
+
+ int (*done) (call_frame_t *frame, xlator_t *this);
+
+ int (*resume) (call_frame_t *frame, xlator_t *this);
+
+ int (*unwind) (call_frame_t *frame, xlator_t *this);
+ } transaction;
+
+ afr_self_heal_t self_heal;
+} afr_local_t;
+
+/* try alloc and if it fails, goto label */
+#define ALLOC_OR_GOTO(var, type, label) do { \
+ var = CALLOC (sizeof (type), 1); \
+ if (!var) { \
+ gf_log (this->name, GF_LOG_ERROR, \
+ "out of memory :("); \
+ op_errno = ENOMEM; \
+ goto label; \
+ } \
+ } while (0);
+
+
+/* did a call fail due to a child failing? */
+#define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \
+ ((op_errno == ENOTCONN) || \
+ (op_errno == EBADFD)))
+
+/* have we tried all children? */
+#define all_tried(i, count) ((i) == (count) - 1)
+
+void
+afr_build_parent_loc (loc_t *parent, loc_t *child);
+
+int
+afr_up_children_count (int child_count, unsigned char *child_up);
+
+int
+afr_locked_nodes_count (unsigned char *locked_nodes, int child_count);
+
+int
+afr_first_up_child (afr_private_t *priv);
+
+ino64_t
+afr_itransform (ino64_t ino, int child_count, int child_index);
+
+int
+afr_deitransform (ino64_t ino, int child_count);
+
+void
+afr_local_cleanup (afr_local_t *local, xlator_t *this);
+
+int
+afr_frame_return (call_frame_t *frame);
+
+#define AFR_STACK_UNWIND(frame, params ...) \
+ do { \
+ afr_local_t *__local = NULL; \
+ xlator_t *__this = NULL; \
+ __local = frame->local; \
+ __this = frame->this; \
+ frame->local = NULL; \
+ STACK_UNWIND (frame, params); \
+ afr_local_cleanup (__local, __this); \
+ free (__local); \
+} while (0);
+
+#define AFR_STACK_DESTROY(frame) \
+ do { \
+ afr_local_t *__local = NULL; \
+ xlator_t *__this = NULL; \
+ __local = frame->local; \
+ __this = frame->this; \
+ frame->local = NULL; \
+ STACK_DESTROY (frame->root); \
+ afr_local_cleanup (__local, __this); \
+ free (__local); \
+} while (0);
+
+/* allocate and return a string that is the basename of argument */
+static inline char *
+AFR_BASENAME (const char *str)
+{
+ char *__tmp_str = NULL;
+ char *__basename_str = NULL;
+ __tmp_str = strdup (str);
+ __basename_str = strdup (basename (__tmp_str));
+ FREE (__tmp_str);
+ return __basename_str;
+}
+
+/* initialize local_t */
+static inline int
+AFR_LOCAL_INIT (afr_local_t *local, afr_private_t *priv)
+{
+ local->child_up = CALLOC (sizeof (*local->child_up),
+ priv->child_count);
+ if (!local->child_up) {
+ return -ENOMEM;
+ }
+
+ memcpy (local->child_up, priv->child_up,
+ sizeof (*local->child_up) * priv->child_count);
+
+
+ local->call_count = afr_up_children_count (priv->child_count, local->child_up);
+ if (local->call_count == 0)
+ return -ENOTCONN;
+
+ local->transaction.erase_pending = 1;
+
+ local->op_ret = -1;
+ local->op_errno = EUCLEAN;
+
+ return 0;
+}
+
+
+static inline int
+afr_transaction_local_init (afr_local_t *local, afr_private_t *priv)
+{
+ local->child_errno = CALLOC (sizeof (*local->child_errno),
+ priv->child_count);
+ if (!local->child_errno) {
+ return -ENOMEM;
+ }
+
+ local->pending_array = CALLOC (sizeof (*local->pending_array),
+ priv->child_count);
+ if (!local->pending_array) {
+ return -ENOMEM;
+ }
+
+ local->transaction.locked_nodes = CALLOC (sizeof (*local->transaction.locked_nodes),
+ priv->child_count);
+
+ local->transaction.child_errno = CALLOC (sizeof (*local->transaction.child_errno),
+ priv->child_count);
+
+ return 0;
+}
+
+#endif /* __AFR_H__ */
diff --git a/xlators/cluster/dht/Makefile.am b/xlators/cluster/dht/Makefile.am
new file mode 100644
index 00000000000..f963effea22
--- /dev/null
+++ b/xlators/cluster/dht/Makefile.am
@@ -0,0 +1 @@
+SUBDIRS = src \ No newline at end of file
diff --git a/xlators/cluster/dht/src/Makefile.am b/xlators/cluster/dht/src/Makefile.am
new file mode 100644
index 00000000000..b7d07d137a6
--- /dev/null
+++ b/xlators/cluster/dht/src/Makefile.am
@@ -0,0 +1,30 @@
+
+xlator_LTLIBRARIES = dht.la nufa.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
+
+
+dht_common_source = dht-layout.c dht-helper.c dht-linkfile.c \
+ dht-selfheal.c dht-rename.c dht-hashfn.c dht-hashfn-tea.c
+
+dht_la_SOURCES = $(dht_common_source) dht.c
+
+nufa_la_SOURCES = $(dht_common_source) nufa.c
+
+dht_la_LDFLAGS = -module -avoidversion
+dht_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+nufa_la_LDFLAGS = -module -avoidversion
+nufa_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = dht-common.h dht-common.c
+
+AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \
+ -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+
+CLEANFILES =
+
+uninstall-local:
+ rm -f $(DESTDIR)$(xlatordir)/distribute.so
+
+install-data-hook:
+ ln -sf dht.so $(DESTDIR)$(xlatordir)/distribute.so \ No newline at end of file
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
new file mode 100644
index 00000000000..5e4979e31b0
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -0,0 +1,3470 @@
+/*
+ Copyright (c) 2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+/* TODO: add NS locking */
+
+#include "glusterfs.h"
+#include "xlator.h"
+#include "dht-common.h"
+#include "defaults.h"
+
+
+/* TODO:
+ - use volumename in xattr instead of "dht"
+ - use NS locks
+ - handle all cases in self heal layout reconstruction
+ - complete linkfile selfheal
+*/
+
+int
+dht_lookup_selfheal_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this,
+ int op_ret, int op_errno)
+{
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
+ int ret = 0;
+
+ local = frame->local;
+ ret = op_ret;
+
+ if (ret == 0) {
+ layout = local->selfheal.layout;
+ ret = inode_ctx_put (local->inode, this, (uint64_t)(long)layout);
+
+ if (ret == 0)
+ local->selfheal.layout = NULL;
+
+ if (local->st_ino) {
+ local->stbuf.st_ino = local->st_ino;
+ } else {
+ gf_log (this->name, GF_LOG_WARNING,
+ "could not find hashed subvolume for %s",
+ local->loc.path);
+ }
+ }
+
+ DHT_STACK_UNWIND (frame, ret, local->op_errno, local->inode,
+ &local->stbuf, local->xattr);
+
+ return 0;
+}
+
+
+int
+dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno,
+ inode_t *inode, struct stat *stbuf, dict_t *xattr)
+{
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ call_frame_t *prev = NULL;
+ dht_layout_t *layout = NULL;
+ int ret = 0;
+ int is_dir = 0;
+
+ conf = this->private;
+ local = frame->local;
+ prev = cookie;
+
+ layout = local->layout;
+
+ LOCK (&frame->lock);
+ {
+ /* TODO: assert equal mode on stbuf->st_mode and
+ local->stbuf->st_mode
+
+ else mkdir/chmod/chown and fix
+ */
+ /* TODO: assert equal hash type in xattr, local->xattr */
+
+ /* TODO: always ensure same subvolume is in layout->list[0] */
+
+ ret = dht_layout_merge (this, layout, prev->this,
+ op_ret, op_errno, xattr);
+
+ if (op_ret == -1) {
+ local->op_errno = ENOENT;
+ gf_log (this->name, GF_LOG_WARNING,
+ "lookup of %s on %s returned error (%s)",
+ local->loc.path, prev->this->name,
+ strerror (op_errno));
+
+ goto unlock;
+ }
+
+ is_dir = check_is_dir (inode, stbuf, xattr);
+ if (!is_dir)
+ goto unlock;
+
+ local->op_ret = 0;
+ if (local->xattr == NULL)
+ local->xattr = dict_ref (xattr);
+ if (local->inode == NULL)
+ local->inode = inode_ref (inode);
+
+ dht_stat_merge (this, &local->stbuf, stbuf, prev->this);
+
+ if (prev->this == local->hashed_subvol)
+ local->st_ino = local->stbuf.st_ino;
+
+ }
+unlock:
+ UNLOCK (&frame->lock);
+
+
+ this_call_cnt = dht_frame_return (frame);
+
+ if (is_last_call (this_call_cnt)) {
+ if (local->op_ret == 0) {
+ ret = dht_layout_normalize (this, &local->loc, layout);
+
+ local->layout = NULL;
+
+ if (ret != 0) {
+ layout->gen = conf->gen;
+
+ gf_log (this->name, GF_LOG_WARNING,
+ "fixing assignment on %s",
+ local->loc.path);
+ goto selfheal;
+ }
+
+ inode_ctx_put (local->inode, this, (uint64_t)(long)layout);
+
+ if (local->st_ino) {
+ local->stbuf.st_ino = local->st_ino;
+ } else {
+ gf_log (this->name, GF_LOG_WARNING,
+ "could not find hashed subvolume for %s",
+ local->loc.path);
+ }
+ }
+
+ DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+ local->inode, &local->stbuf, local->xattr);
+ }
+
+ return 0;
+
+selfheal:
+ ret = dht_selfheal_directory (frame, dht_lookup_selfheal_cbk,
+ &local->loc, layout);
+
+ return 0;
+}
+
+int
+dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno,
+ inode_t *inode, struct stat *stbuf, dict_t *xattr)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ call_frame_t *prev = NULL;
+ dht_layout_t *layout = NULL;
+ int ret = -1;
+ int is_dir = 0;
+ int is_linkfile = 0;
+
+ local = frame->local;
+ prev = cookie;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+
+ if (op_errno != ENOTCONN && op_errno != ENOENT) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "subvolume %s returned -1 (%s)",
+ prev->this->name, strerror (op_errno));
+ }
+
+ goto unlock;
+ }
+
+ if (S_IFMT & (stbuf->st_mode ^ local->inode->st_mode)) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "mismatching filetypes 0%o v/s 0%o for %s",
+ (stbuf->st_mode & S_IFMT),
+ (local->inode->st_mode & S_IFMT),
+ local->loc.path);
+
+ local->op_ret = -1;
+ local->op_errno = EINVAL;
+
+ goto unlock;
+ }
+
+ layout = dht_layout_get (this, inode);
+
+ is_dir = check_is_dir (inode, stbuf, xattr);
+ is_linkfile = check_is_linkfile (inode, stbuf, xattr);
+
+ if (is_linkfile) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "linkfile found in revalidate for %s",
+ local->loc.path);
+ local->layout_mismatch = 1;
+
+ goto unlock;
+ }
+
+ if (is_dir) {
+ ret = dht_layout_dir_mismatch (this, layout,
+ prev->this, &local->loc,
+ xattr);
+ if (ret != 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "mismatching layouts for %s",
+ local->loc.path);
+
+ local->layout_mismatch = 1;
+
+ goto unlock;
+ }
+ }
+
+ dht_stat_merge (this, &local->stbuf, stbuf, prev->this);
+
+ local->op_ret = 0;
+ local->stbuf.st_ino = local->st_ino;
+
+ if (!local->xattr)
+ local->xattr = dict_ref (xattr);
+ }
+unlock:
+ UNLOCK (&frame->lock);
+
+ this_call_cnt = dht_frame_return (frame);
+
+ if (is_last_call (this_call_cnt)) {
+ if (!S_ISDIR (local->stbuf.st_mode)
+ && (local->hashed_subvol != local->cached_subvol)
+ && (local->stbuf.st_nlink == 1))
+ local->stbuf.st_mode |= S_ISVTX;
+
+ if (local->layout_mismatch) {
+ local->op_ret = -1;
+ local->op_errno = ESTALE;
+ }
+
+ DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+ local->inode, &local->stbuf, local->xattr);
+ }
+
+ return 0;
+}
+
+
+int
+dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ inode_t *inode, struct stat *stbuf)
+{
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
+ xlator_t *cached_subvol = NULL;
+
+ local = frame->local;
+ cached_subvol = local->cached_subvol;
+
+ layout = dht_layout_for_subvol (this, local->cached_subvol);
+ if (!layout) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no pre-set layout for subvolume %s",
+ cached_subvol ? cached_subvol->name : "<nil>");
+ local->op_ret = -1;
+ local->op_errno = EINVAL;
+ goto unwind;
+ }
+
+ inode_ctx_put (local->inode, this, (uint64_t)(long)layout);
+ local->op_ret = 0;
+ if (local->stbuf.st_nlink == 1)
+ local->stbuf.st_mode |= S_ISVTX;
+
+unwind:
+ DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+ local->inode, &local->stbuf, local->xattr);
+ return 0;
+}
+
+
+int
+dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ inode_t *inode, struct stat *buf, dict_t *xattr)
+{
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ call_frame_t *prev = NULL;
+ int is_linkfile = 0;
+ int is_dir = 0;
+ xlator_t *subvol = NULL;
+ loc_t *loc = NULL;
+ xlator_t *link_subvol = NULL;
+ xlator_t *hashed_subvol = NULL;
+ xlator_t *cached_subvol = NULL;
+
+ conf = this->private;
+
+ local = frame->local;
+ loc = &local->loc;
+
+ prev = cookie;
+ subvol = prev->this;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == -1) {
+ if (op_errno != ENOENT)
+ local->op_errno = op_errno;
+ goto unlock;
+ }
+
+ is_linkfile = check_is_linkfile (inode, buf, xattr);
+ is_dir = check_is_dir (inode, buf, xattr);
+
+ if (is_linkfile) {
+ link_subvol = dht_linkfile_subvol (this, inode, buf,
+ xattr);
+ gf_log (this->name, GF_LOG_DEBUG,
+ "found on %s linkfile %s (-> %s)",
+ subvol->name, loc->path,
+ link_subvol ? link_subvol->name : "''");
+ goto unlock;
+ } else {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "found on %s file %s",
+ subvol->name, loc->path);
+ }
+
+ if (!local->cached_subvol) {
+ /* found one file */
+ dht_stat_merge (this, &local->stbuf, buf, subvol);
+ local->xattr = dict_ref (xattr);
+ local->cached_subvol = subvol;
+ } else {
+ gf_log (this->name, GF_LOG_WARNING,
+ "multiple subvolumes (%s and %s atleast) have "
+ "file %s", local->cached_subvol->name,
+ subvol->name, local->loc.path);
+ }
+ }
+unlock:
+ UNLOCK (&frame->lock);
+
+ if (is_linkfile) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "deleting stale linkfile %s on %s",
+ loc->path, subvol->name);
+ dht_linkfile_unlink (frame, this, subvol, loc);
+ }
+
+ this_call_cnt = dht_frame_return (frame);
+ if (is_last_call (this_call_cnt)) {
+ hashed_subvol = local->hashed_subvol;
+ cached_subvol = local->cached_subvol;
+
+ if (!cached_subvol) {
+ DHT_STACK_UNWIND (frame, -1, ENOENT, NULL, NULL, NULL);
+ return 0;
+ }
+
+ gf_log (this->name, GF_LOG_WARNING,
+ "linking file %s existing on %s to %s (hash)",
+ loc->path, cached_subvol->name, hashed_subvol->name);
+
+ dht_linkfile_create (frame, dht_lookup_linkfile_create_cbk,
+ cached_subvol, hashed_subvol, loc);
+ }
+
+ return 0;
+}
+
+
+int
+dht_lookup_everywhere (call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ int i = 0;
+ int call_cnt = 0;
+
+ conf = this->private;
+ local = frame->local;
+
+ call_cnt = conf->subvolume_cnt;
+ local->call_cnt = call_cnt;
+
+ if (!local->inode)
+ local->inode = inode_ref (loc->inode);
+
+ for (i = 0; i < call_cnt; i++) {
+ STACK_WIND (frame, dht_lookup_everywhere_cbk,
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->lookup,
+ loc, local->xattr_req);
+ }
+
+ return 0;
+}
+
+
+int
+dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret, int op_errno,
+ inode_t *inode, struct stat *stbuf, dict_t *xattr)
+{
+ call_frame_t *prev = NULL;
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
+ xlator_t *subvol = NULL;
+ loc_t *loc = NULL;
+
+ prev = cookie;
+ subvol = prev->this;
+
+ local = frame->local;
+ loc = &local->loc;
+
+ if (op_ret == -1) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "lookup of %s on %s (following linkfile) failed (%s)",
+ local->loc.path, subvol->name, strerror (op_errno));
+
+ dht_lookup_everywhere (frame, this, loc);
+ return 0;
+ }
+
+ /* TODO: assert type is non-dir and non-linkfile */
+
+ if (stbuf->st_nlink == 1)
+ stbuf->st_mode |= S_ISVTX;
+ dht_itransform (this, prev->this, stbuf->st_ino, &stbuf->st_ino);
+
+ layout = dht_layout_for_subvol (this, prev->this);
+ if (!layout) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no pre-set layout for subvolume %s",
+ prev->this->name);
+ op_ret = -1;
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ inode_ctx_put (inode, this, (uint64_t)(long)layout);
+
+out:
+ DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf, xattr);
+
+ return 0;
+}
+
+
+int
+dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno,
+ inode_t *inode, struct stat *stbuf, dict_t *xattr)
+{
+ dht_layout_t *layout = NULL;
+ char is_linkfile = 0;
+ char is_dir = 0;
+ xlator_t *subvol = NULL;
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ loc_t *loc = NULL;
+ int i = 0;
+ call_frame_t *prev = NULL;
+ int call_cnt = 0;
+
+
+ conf = this->private;
+
+ prev = cookie;
+ local = frame->local;
+ loc = &local->loc;
+
+ if (ENTRY_MISSING (op_ret, op_errno)) {
+ if (conf->search_unhashed) {
+ local->op_errno = ENOENT;
+ dht_lookup_everywhere (frame, this, loc);
+ return 0;
+ }
+ }
+
+ if (op_ret == 0) {
+ is_dir = check_is_dir (inode, stbuf, xattr);
+ if (is_dir) {
+ local->inode = inode_ref (inode);
+ local->xattr = dict_ref (xattr);
+ }
+ }
+
+ if (is_dir || (op_ret == -1 && op_errno == ENOTCONN)) {
+ call_cnt = conf->subvolume_cnt;
+ local->call_cnt = call_cnt;
+
+ local->layout = dht_layout_new (this, conf->subvolume_cnt);
+ if (!local->layout) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto out;
+ }
+
+ for (i = 0; i < call_cnt; i++) {
+ STACK_WIND (frame, dht_lookup_dir_cbk,
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->lookup,
+ &local->loc, local->xattr_req);
+ }
+ return 0;
+ }
+
+ if (op_ret == -1)
+ goto out;
+
+ is_linkfile = check_is_linkfile (inode, stbuf, xattr);
+ is_dir = check_is_dir (inode, stbuf, xattr);
+
+ if (!is_dir && !is_linkfile) {
+ /* non-directory and not a linkfile */
+
+ dht_itransform (this, prev->this, stbuf->st_ino,
+ &stbuf->st_ino);
+
+ layout = dht_layout_for_subvol (this, prev->this);
+ if (!layout) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no pre-set layout for subvolume %s",
+ prev->this->name);
+ op_ret = -1;
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ inode_ctx_put (inode, this, (uint64_t)(long)layout);
+ goto out;
+ }
+
+ if (is_linkfile) {
+ subvol = dht_linkfile_subvol (this, inode, stbuf, xattr);
+
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "linkfile not having link subvolume. path=%s",
+ loc->path);
+ dht_lookup_everywhere (frame, this, loc);
+ return 0;
+ }
+
+ STACK_WIND (frame, dht_lookup_linkfile_cbk,
+ subvol, subvol->fops->lookup,
+ &local->loc, local->xattr_req);
+ }
+
+ return 0;
+
+out:
+ DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf, xattr);
+ return 0;
+}
+
+
+int
+dht_lookup (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, dict_t *xattr_req)
+{
+ xlator_t *subvol = NULL;
+ xlator_t *hashed_subvol = NULL;
+ xlator_t *cached_subvol = NULL;
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
+ int op_errno = -1;
+ dht_layout_t *layout = NULL;
+ int i = 0;
+ int call_cnt = 0;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (loc, err);
+ VALIDATE_OR_GOTO (loc->inode, err);
+ VALIDATE_OR_GOTO (loc->path, err);
+
+ conf = this->private;
+
+ local = dht_local_init (frame);
+ if (!local) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto err;
+ }
+
+ ret = loc_dup (loc, &local->loc);
+ if (ret == -1) {
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "copying location failed for path=%s",
+ loc->path);
+ goto err;
+ }
+
+ if (xattr_req) {
+ local->xattr_req = dict_ref (xattr_req);
+ } else {
+ local->xattr_req = dict_new ();
+ }
+
+ hashed_subvol = dht_subvol_get_hashed (this, loc);
+ cached_subvol = dht_subvol_get_cached (this, loc->inode);
+
+ local->cached_subvol = cached_subvol;
+ local->hashed_subvol = hashed_subvol;
+
+ if (is_revalidate (loc)) {
+ layout = dht_layout_get (this, loc->inode);
+
+ if (!layout) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "revalidate without cache. path=%s",
+ loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ if (layout->gen && (layout->gen < conf->gen)) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "incomplete layout failure for path=%s",
+ loc->path);
+ op_errno = EAGAIN;
+ goto err;
+ }
+
+ local->inode = inode_ref (loc->inode);
+ local->st_ino = loc->inode->ino;
+
+ local->call_cnt = layout->cnt;
+ call_cnt = local->call_cnt;
+
+ /* NOTE: we don't require 'trusted.glusterfs.dht.linkto' attribute,
+ * revalidates directly go to the cached-subvolume.
+ */
+ ret = dict_set_uint32 (local->xattr_req,
+ "trusted.glusterfs.dht", 4 * 4);
+
+ for (i = 0; i < layout->cnt; i++) {
+ subvol = layout->list[i].xlator;
+
+ STACK_WIND (frame, dht_revalidate_cbk,
+ subvol, subvol->fops->lookup,
+ loc, local->xattr_req);
+
+ if (!--call_cnt)
+ break;
+ }
+ } else {
+ /* TODO: remove the hard-coding */
+ ret = dict_set_uint32 (local->xattr_req,
+ "trusted.glusterfs.dht", 4 * 4);
+
+ ret = dict_set_uint32 (local->xattr_req,
+ "trusted.glusterfs.dht.linkto", 256);
+
+ if (!hashed_subvol) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no subvolume in layout for path=%s, "
+ "checking on all the subvols to see if "
+ "it is a directory", loc->path);
+ call_cnt = conf->subvolume_cnt;
+ local->call_cnt = call_cnt;
+
+ local->layout = dht_layout_new (this, conf->subvolume_cnt);
+ if (!local->layout) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto err;
+ }
+
+ for (i = 0; i < call_cnt; i++) {
+ STACK_WIND (frame, dht_lookup_dir_cbk,
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->lookup,
+ &local->loc, local->xattr_req);
+ }
+ return 0;
+ }
+
+ STACK_WIND (frame, dht_lookup_cbk,
+ hashed_subvol, hashed_subvol->fops->lookup,
+ loc, local->xattr_req);
+ }
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
+
+
+int
+dht_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct stat *stbuf)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ call_frame_t *prev = NULL;
+
+
+ local = frame->local;
+ prev = cookie;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "subvolume %s returned -1 (%s)",
+ prev->this->name, strerror (op_errno));
+ goto unlock;
+ }
+
+ dht_stat_merge (this, &local->stbuf, stbuf, prev->this);
+
+ if (local->inode)
+ local->stbuf.st_ino = local->inode->ino;
+ local->op_ret = 0;
+ }
+unlock:
+ UNLOCK (&frame->lock);
+
+ this_call_cnt = dht_frame_return (frame);
+ if (is_last_call (this_call_cnt))
+ DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+ &local->stbuf);
+
+ return 0;
+}
+
+
+int
+dht_stat (call_frame_t *frame, xlator_t *this,
+ loc_t *loc)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
+ int i = 0;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (loc, err);
+ VALIDATE_OR_GOTO (loc->inode, err);
+ VALIDATE_OR_GOTO (loc->path, err);
+
+ layout = dht_layout_get (this, loc->inode);
+ if (!layout) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no layout for path=%s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local = dht_local_init (frame);
+ if (!local) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto err;
+ }
+
+ local->inode = inode_ref (loc->inode);
+ local->call_cnt = layout->cnt;
+
+ for (i = 0; i < layout->cnt; i++) {
+ subvol = layout->list[i].xlator;
+
+ STACK_WIND (frame, dht_attr_cbk,
+ subvol, subvol->fops->stat,
+ loc);
+ }
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+
+ return 0;
+}
+
+
+int
+dht_fstat (call_frame_t *frame, xlator_t *this,
+ fd_t *fd)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
+ int i = 0;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+
+ layout = dht_layout_get (this, fd->inode);
+ if (!layout) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no layout for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local = dht_local_init (frame);
+ if (!local) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "local allocation failed :(");
+ goto err;
+ }
+
+ local->inode = inode_ref (fd->inode);
+ local->call_cnt = layout->cnt;;
+
+ for (i = 0; i < layout->cnt; i++) {
+ subvol = layout->list[i].xlator;
+ STACK_WIND (frame, dht_attr_cbk,
+ subvol, subvol->fops->fstat,
+ fd);
+ }
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+
+ return 0;
+}
+
+
+int
+dht_chmod (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, mode_t mode)
+{
+ dht_layout_t *layout = NULL;
+ dht_local_t *local = NULL;
+ int op_errno = -1;
+ int i = -1;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (loc, err);
+ VALIDATE_OR_GOTO (loc->inode, err);
+ VALIDATE_OR_GOTO (loc->path, err);
+
+ layout = dht_layout_get (this, loc->inode);
+
+ if (!layout) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no layout for path=%s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ if (!layout_is_sane (layout)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "layout is not sane for path=%s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local = dht_local_init (frame);
+ if (!local) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto err;
+ }
+
+ local->inode = inode_ref (loc->inode);
+ local->call_cnt = layout->cnt;
+
+ for (i = 0; i < layout->cnt; i++) {
+ STACK_WIND (frame, dht_attr_cbk,
+ layout->list[i].xlator,
+ layout->list[i].xlator->fops->chmod,
+ loc, mode);
+ }
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+
+ return 0;
+}
+
+
+int
+dht_chown (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, uid_t uid, gid_t gid)
+{
+ dht_layout_t *layout = NULL;
+ dht_local_t *local = NULL;
+ int op_errno = -1;
+ int i = -1;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (loc, err);
+ VALIDATE_OR_GOTO (loc->inode, err);
+ VALIDATE_OR_GOTO (loc->path, err);
+
+ layout = dht_layout_get (this, loc->inode);
+ if (!layout) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no layout for path=%s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ if (!layout_is_sane (layout)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "layout is not sane for path=%s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local = dht_local_init (frame);
+ if (!local) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto err;
+ }
+
+ local->inode = inode_ref (loc->inode);
+ local->call_cnt = layout->cnt;
+
+ for (i = 0; i < layout->cnt; i++) {
+ STACK_WIND (frame, dht_attr_cbk,
+ layout->list[i].xlator,
+ layout->list[i].xlator->fops->chown,
+ loc, uid, gid);
+ }
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+
+ return 0;
+}
+
+
+int
+dht_fchmod (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, mode_t mode)
+{
+ dht_layout_t *layout = NULL;
+ dht_local_t *local = NULL;
+ int op_errno = -1;
+ int i = -1;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+
+
+ layout = dht_layout_get (this, fd->inode);
+ if (!layout) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no layout for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ if (!layout_is_sane (layout)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "layout is not sane for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local = dht_local_init (frame);
+ if (!local) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto err;
+ }
+
+ local->inode = inode_ref (fd->inode);
+ local->call_cnt = layout->cnt;
+
+ for (i = 0; i < layout->cnt; i++) {
+ STACK_WIND (frame, dht_attr_cbk,
+ layout->list[i].xlator,
+ layout->list[i].xlator->fops->fchmod,
+ fd, mode);
+ }
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+
+ return 0;
+}
+
+
+int
+dht_fchown (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, uid_t uid, gid_t gid)
+{
+ dht_layout_t *layout = NULL;
+ dht_local_t *local = NULL;
+ int op_errno = -1;
+ int i = -1;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+
+ layout = dht_layout_get (this, fd->inode);
+ if (!layout) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no layout for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ if (!layout_is_sane (layout)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "layout is not sane for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local = dht_local_init (frame);
+ if (!local) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto err;
+ }
+
+ local->inode = inode_ref (fd->inode);
+ local->call_cnt = layout->cnt;
+
+ for (i = 0; i < layout->cnt; i++) {
+ STACK_WIND (frame, dht_attr_cbk,
+ layout->list[i].xlator,
+ layout->list[i].xlator->fops->fchown,
+ fd, uid, gid);
+ }
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+
+ return 0;
+}
+
+
+int
+dht_utimens (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, struct timespec tv[2])
+{
+ dht_layout_t *layout = NULL;
+ dht_local_t *local = NULL;
+ int op_errno = -1;
+ int i = -1;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (loc, err);
+ VALIDATE_OR_GOTO (loc->inode, err);
+ VALIDATE_OR_GOTO (loc->path, err);
+
+ layout = dht_layout_get (this, loc->inode);
+ if (!layout) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no layout for path=%s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ if (!layout_is_sane (layout)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "layout is not sane for path=%s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local = dht_local_init (frame);
+ if (!local) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto err;
+ }
+
+ local->inode = inode_ref (loc->inode);
+ local->call_cnt = layout->cnt;
+
+ for (i = 0; i < layout->cnt; i++) {
+ STACK_WIND (frame, dht_attr_cbk,
+ layout->list[i].xlator,
+ layout->list[i].xlator->fops->utimens,
+ loc, tv);
+ }
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+
+ return 0;
+}
+
+
+int
+dht_truncate (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, off_t offset)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (loc, err);
+ VALIDATE_OR_GOTO (loc->inode, err);
+ VALIDATE_OR_GOTO (loc->path, err);
+
+ subvol = dht_subvol_get_cached (this, loc->inode);
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no cached subvolume for path=%s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local = dht_local_init (frame);
+ if (!local) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto err;
+ }
+
+ local->inode = inode_ref (loc->inode);
+ local->call_cnt = 1;
+
+ STACK_WIND (frame, dht_attr_cbk,
+ subvol, subvol->fops->truncate,
+ loc, offset);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+
+ return 0;
+}
+
+
+int
+dht_ftruncate (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, off_t offset)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+
+ subvol = dht_subvol_get_cached (this, fd->inode);
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local = dht_local_init (frame);
+ if (!local) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto err;
+ }
+
+ local->inode = inode_ref (fd->inode);
+ local->call_cnt = 1;
+
+ STACK_WIND (frame, dht_attr_cbk,
+ subvol, subvol->fops->ftruncate,
+ fd, offset);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+
+ return 0;
+}
+
+
+int
+dht_err_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ call_frame_t *prev = NULL;
+
+
+ local = frame->local;
+ prev = cookie;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "subvolume %s returned -1 (%s)",
+ prev->this->name, strerror (op_errno));
+ goto unlock;
+ }
+
+ local->op_ret = 0;
+ }
+unlock:
+ UNLOCK (&frame->lock);
+
+ this_call_cnt = dht_frame_return (frame);
+ if (is_last_call (this_call_cnt))
+ DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+
+ return 0;
+}
+
+
+int
+dht_access (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int32_t mask)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (loc, err);
+ VALIDATE_OR_GOTO (loc->inode, err);
+ VALIDATE_OR_GOTO (loc->path, err);
+
+ subvol = dht_subvol_get_cached (this, loc->inode);
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no cached subvolume for path=%s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local = dht_local_init (frame);
+ if (!local) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto err;
+ }
+
+ local->call_cnt = 1;
+
+ STACK_WIND (frame, dht_err_cbk,
+ subvol, subvol->fops->access,
+ loc, mask);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno);
+
+ return 0;
+}
+
+
+int
+dht_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, const char *path)
+{
+ DHT_STACK_UNWIND (frame, op_ret, op_errno, path);
+
+ return 0;
+}
+
+
+int
+dht_readlink (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, size_t size)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (loc, err);
+ VALIDATE_OR_GOTO (loc->inode, err);
+ VALIDATE_OR_GOTO (loc->path, err);
+
+ subvol = dht_subvol_get_cached (this, loc->inode);
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no cached subvolume for path=%s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ STACK_WIND (frame, dht_readlink_cbk,
+ subvol, subvol->fops->readlink,
+ loc, size);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+
+ return 0;
+}
+
+
+int
+dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr)
+{
+ DHT_STACK_UNWIND (frame, op_ret, op_errno, xattr);
+
+ return 0;
+}
+
+
+int
+dht_getxattr (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, const char *key)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (loc, err);
+ VALIDATE_OR_GOTO (loc->inode, err);
+ VALIDATE_OR_GOTO (loc->path, err);
+
+ subvol = dht_subvol_get_cached (this, loc->inode);
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no cached subvolume for path=%s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ STACK_WIND (frame, dht_getxattr_cbk,
+ subvol, subvol->fops->getxattr,
+ loc, key);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+
+ return 0;
+}
+
+
+int
+dht_setxattr (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, dict_t *xattr, int flags)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (loc, err);
+ VALIDATE_OR_GOTO (loc->inode, err);
+ VALIDATE_OR_GOTO (loc->path, err);
+
+ subvol = dht_subvol_get_cached (this, loc->inode);
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no cached subvolume for path=%s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local = dht_local_init (frame);
+ if (!local) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto err;
+ }
+
+ local->call_cnt = 1;
+
+ STACK_WIND (frame, dht_err_cbk,
+ subvol, subvol->fops->setxattr,
+ loc, xattr, flags);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+
+ return 0;
+}
+
+
+int
+dht_removexattr (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, const char *key)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (loc, err);
+ VALIDATE_OR_GOTO (loc->inode, err);
+ VALIDATE_OR_GOTO (loc->path, err);
+
+ subvol = dht_subvol_get_cached (this, loc->inode);
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no cached subvolume for path=%s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local = dht_local_init (frame);
+ if (!local) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto err;
+ }
+
+ local->call_cnt = 1;
+
+ STACK_WIND (frame, dht_err_cbk,
+ subvol, subvol->fops->removexattr,
+ loc, key);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+
+ return 0;
+}
+
+
+int
+dht_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, fd_t *fd)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ call_frame_t *prev = NULL;
+
+
+ local = frame->local;
+ prev = cookie;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "subvolume %s returned -1 (%s)",
+ prev->this->name, strerror (op_errno));
+ goto unlock;
+ }
+
+ local->op_ret = 0;
+ }
+unlock:
+ UNLOCK (&frame->lock);
+
+ this_call_cnt = dht_frame_return (frame);
+ if (is_last_call (this_call_cnt))
+ DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+ local->fd);
+
+ return 0;
+}
+
+
+int
+dht_open (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int flags, fd_t *fd)
+{
+ xlator_t *subvol = NULL;
+ int ret = -1;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+
+ subvol = dht_subvol_get_cached (this, fd->inode);
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local = dht_local_init (frame);
+ if (!local) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto err;
+ }
+
+ local->fd = fd_ref (fd);
+ ret = loc_dup (loc, &local->loc);
+ if (ret == -1) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto err;
+ }
+
+ local->call_cnt = 1;
+
+ STACK_WIND (frame, dht_fd_cbk,
+ subvol, subvol->fops->open,
+ loc, flags, fd);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+
+ return 0;
+}
+
+
+int
+dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno,
+ struct iovec *vector, int count, struct stat *stbuf)
+{
+ DHT_STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf);
+
+ return 0;
+}
+
+
+int
+dht_readv (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, size_t size, off_t off)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+
+ subvol = dht_subvol_get_cached (this, fd->inode);
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ STACK_WIND (frame, dht_readv_cbk,
+ subvol, subvol->fops->readv,
+ fd, size, off);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno, NULL, 0, NULL);
+
+ return 0;
+}
+
+
+int
+dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct stat *stbuf)
+{
+ DHT_STACK_UNWIND (frame, op_ret, op_errno, stbuf);
+
+ return 0;
+}
+
+
+int
+dht_writev (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, struct iovec *vector, int count, off_t off)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+
+ subvol = dht_subvol_get_cached (this, fd->inode);
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ STACK_WIND (frame, dht_writev_cbk,
+ subvol, subvol->fops->writev,
+ fd, vector, count, off);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno, NULL, 0);
+
+ return 0;
+}
+
+
+int
+dht_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+
+ subvol = dht_subvol_get_cached (this, fd->inode);
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local = dht_local_init (frame);
+ if (!local) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto err;
+ }
+
+ local->fd = fd_ref (fd);
+ local->call_cnt = 1;
+
+ STACK_WIND (frame, dht_err_cbk,
+ subvol, subvol->fops->flush, fd);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno);
+
+ return 0;
+}
+
+
+int
+dht_fsync (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, int datasync)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+
+ subvol = dht_subvol_get_cached (this, fd->inode);
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local = dht_local_init (frame);
+ if (!local) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocatoin failed :(");
+ goto err;
+ }
+ local->call_cnt = 1;
+
+ STACK_WIND (frame, dht_err_cbk,
+ subvol, subvol->fops->fsync,
+ fd, datasync);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno);
+
+ return 0;
+}
+
+
+int
+dht_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct flock *flock)
+{
+ DHT_STACK_UNWIND (frame, op_ret, op_errno, flock);
+
+ return 0;
+}
+
+
+int
+dht_lk (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, int cmd, struct flock *flock)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+
+ subvol = dht_subvol_get_cached (this, fd->inode);
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ STACK_WIND (frame, dht_lk_cbk,
+ subvol, subvol->fops->lk,
+ fd, cmd, flock);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+
+ return 0;
+}
+
+/* gf_lk no longer exists
+int
+dht_gf_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct flock *flock)
+{
+ DHT_STACK_UNWIND (frame, op_ret, op_errno, flock);
+
+ return 0;
+}
+
+
+int
+dht_gf_lk (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int cmd, struct flock *flock)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+
+ subvol = dht_subvol_get_cached (this, fd->inode);
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ STACK_WIND (frame, dht_gf_lk_cbk,
+ subvol, subvol->fops->gf_lk,
+ fd, cmd, flock);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+
+ return 0;
+}
+*/
+
+int
+dht_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct statvfs *statvfs)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ goto unlock;
+ }
+ local->op_ret = 0;
+
+ /* TODO: normalize sizes */
+ local->statvfs.f_bsize = statvfs->f_bsize;
+ local->statvfs.f_frsize = statvfs->f_frsize;
+
+ local->statvfs.f_blocks += statvfs->f_blocks;
+ local->statvfs.f_bfree += statvfs->f_bfree;
+ local->statvfs.f_bavail += statvfs->f_bavail;
+ local->statvfs.f_files += statvfs->f_files;
+ local->statvfs.f_ffree += statvfs->f_ffree;
+ local->statvfs.f_favail += statvfs->f_favail;
+ local->statvfs.f_fsid = statvfs->f_fsid;
+ local->statvfs.f_flag = statvfs->f_flag;
+ local->statvfs.f_namemax = statvfs->f_namemax;
+
+ }
+unlock:
+ UNLOCK (&frame->lock);
+
+
+ this_call_cnt = dht_frame_return (frame);
+ if (is_last_call (this_call_cnt))
+ DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+ &local->statvfs);
+
+ return 0;
+}
+
+
+int
+dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int op_errno = -1;
+ int i = -1;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (loc, err);
+ VALIDATE_OR_GOTO (loc->inode, err);
+ VALIDATE_OR_GOTO (loc->path, err);
+
+ conf = this->private;
+
+ local = dht_local_init (frame);
+ local->call_cnt = conf->subvolume_cnt;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ STACK_WIND (frame, dht_statfs_cbk,
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->statfs, loc);
+ }
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+
+ return 0;
+}
+
+
+int
+dht_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd)
+{
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
+ int op_errno = -1;
+ int i = -1;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+
+ conf = this->private;
+
+ local = dht_local_init (frame);
+ if (!local) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto err;
+ }
+
+ local->fd = fd_ref (fd);
+ ret = loc_dup (loc, &local->loc);
+ if (ret == -1) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto err;
+ }
+
+ local->call_cnt = conf->subvolume_cnt;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ STACK_WIND (frame, dht_fd_cbk,
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->opendir,
+ loc, fd);
+ }
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+
+ return 0;
+}
+
+
+int
+dht_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, gf_dirent_t *orig_entries)
+{
+ dht_local_t *local = NULL;
+ gf_dirent_t entries;
+ gf_dirent_t *orig_entry = NULL;
+ gf_dirent_t *entry = NULL;
+ call_frame_t *prev = NULL;
+ xlator_t *subvol = NULL;
+ xlator_t *next = NULL;
+ dht_layout_t *layout = NULL;
+ int count = 0;
+
+
+ INIT_LIST_HEAD (&entries.list);
+ prev = cookie;
+ local = frame->local;
+
+ if (op_ret < 0)
+ goto done;
+
+ layout = dht_layout_get (this, local->fd->inode);
+
+ list_for_each_entry (orig_entry, &orig_entries->list, list) {
+ subvol = dht_layout_search (this, layout, orig_entry->d_name);
+
+ if (!subvol || subvol == prev->this) {
+ entry = gf_dirent_for_name (orig_entry->d_name);
+ if (!entry) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto unwind;
+ }
+
+ dht_itransform (this, subvol, orig_entry->d_ino,
+ &entry->d_ino);
+ dht_itransform (this, subvol, orig_entry->d_off,
+ &entry->d_off);
+
+ entry->d_type = orig_entry->d_type;
+ entry->d_len = orig_entry->d_len;
+
+ list_add_tail (&entry->list, &entries.list);
+ count++;
+ }
+ }
+ op_ret = count;
+
+done:
+ if (count == 0) {
+ next = dht_subvol_next (this, prev->this);
+ if (!next) {
+ goto unwind;
+ }
+
+ STACK_WIND (frame, dht_readdir_cbk,
+ next, next->fops->readdir,
+ local->fd, local->size, 0);
+ return 0;
+ }
+
+unwind:
+ if (op_ret < 0)
+ op_ret = 0;
+
+ DHT_STACK_UNWIND (frame, op_ret, op_errno, &entries);
+
+ gf_dirent_free (&entries);
+
+ return 0;
+}
+
+
+int
+dht_readdir (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, size_t size, off_t yoff)
+{
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int op_errno = -1;
+ xlator_t *xvol = NULL;
+ off_t xoff = 0;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+
+ conf = this->private;
+
+ local = dht_local_init (frame);
+ if (!local) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->fd = fd_ref (fd);
+ local->size = size;
+
+ dht_deitransform (this, yoff, &xvol, (uint64_t *)&xoff);
+
+ /* TODO: do proper readdir */
+ STACK_WIND (frame, dht_readdir_cbk,
+ xvol, xvol->fops->readdir,
+ fd, size, xoff);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+
+ return 0;
+}
+
+
+int
+dht_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == -1)
+ local->op_errno = op_errno;
+
+ if (op_ret == 0)
+ local->op_ret = 0;
+ }
+ UNLOCK (&frame->lock);
+
+ this_call_cnt = dht_frame_return (frame);
+ if (is_last_call (this_call_cnt))
+ DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+
+ return 0;
+}
+
+
+int
+dht_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync)
+{
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int op_errno = -1;
+ int i = -1;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+
+ conf = this->private;
+
+ local = dht_local_init (frame);
+ if (!local) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto err;
+ }
+
+ local->fd = fd_ref (fd);
+ local->call_cnt = conf->subvolume_cnt;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ STACK_WIND (frame, dht_fsyncdir_cbk,
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->fsyncdir,
+ fd, datasync);
+ }
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno);
+
+ return 0;
+}
+
+
+int
+dht_newfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno,
+ inode_t *inode, struct stat *stbuf)
+{
+ call_frame_t *prev = NULL;
+ dht_layout_t *layout = NULL;
+ int ret = -1;
+
+
+ if (op_ret == -1)
+ goto out;
+
+ prev = cookie;
+
+ dht_itransform (this, prev->this, stbuf->st_ino, &stbuf->st_ino);
+ layout = dht_layout_for_subvol (this, prev->this);
+
+ if (!layout) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no pre-set layout for subvolume %s",
+ prev->this->name);
+ op_ret = -1;
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ ret = inode_ctx_put (inode, this, (uint64_t)(long)layout);
+ if (ret != 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "could not set inode context");
+ op_ret = -1;
+ op_errno = EINVAL;
+ goto out;
+ }
+
+out:
+ DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf);
+ return 0;
+}
+
+
+int
+dht_mknod (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, mode_t mode, dev_t rdev)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (loc, err);
+
+ subvol = dht_subvol_get_hashed (this, loc);
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no subvolume in layout for path=%s",
+ loc->path);
+ op_errno = ENOENT;
+ goto err;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "creating %s on %s", loc->path, subvol->name);
+
+ STACK_WIND (frame, dht_newfile_cbk,
+ subvol, subvol->fops->mknod,
+ loc, mode, rdev);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+
+ return 0;
+}
+
+
+int
+dht_symlink (call_frame_t *frame, xlator_t *this,
+ const char *linkname, loc_t *loc)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (loc, err);
+
+ subvol = dht_subvol_get_hashed (this, loc);
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no subvolume in layout for path=%s",
+ loc->path);
+ op_errno = ENOENT;
+ goto err;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "creating %s on %s", loc->path, subvol->name);
+
+ STACK_WIND (frame, dht_newfile_cbk,
+ subvol, subvol->fops->symlink,
+ linkname, loc);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+
+ return 0;
+}
+
+
+int
+dht_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+ xlator_t *cached_subvol = NULL;
+ xlator_t *hashed_subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (loc, err);
+
+ cached_subvol = dht_subvol_get_cached (this, loc->inode);
+ if (!cached_subvol) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no cached subvolume for path=%s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ hashed_subvol = dht_subvol_get_hashed (this, loc);
+ if (!hashed_subvol) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no subvolume in layout for path=%s",
+ loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local = dht_local_init (frame);
+ if (!local) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto err;
+ }
+
+ local->call_cnt = 1;
+ if (hashed_subvol != cached_subvol)
+ local->call_cnt++;
+
+ STACK_WIND (frame, dht_err_cbk,
+ cached_subvol, cached_subvol->fops->unlink, loc);
+
+ if (hashed_subvol != cached_subvol)
+ STACK_WIND (frame, dht_err_cbk,
+ hashed_subvol, hashed_subvol->fops->unlink, loc);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno);
+
+ return 0;
+}
+
+
+int
+dht_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno,
+ inode_t *inode, struct stat *stbuf)
+{
+ call_frame_t *prev = NULL;
+ dht_layout_t *layout = NULL;
+ dht_local_t *local = NULL;
+
+ prev = cookie;
+ local = frame->local;
+
+ if (op_ret == -1)
+ goto out;
+
+ layout = dht_layout_for_subvol (this, prev->this);
+ if (!layout) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no pre-set layout for subvolume %s",
+ prev->this->name);
+ op_ret = -1;
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ stbuf->st_ino = local->loc.inode->ino;
+
+out:
+ DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf);
+
+ return 0;
+}
+
+
+int
+dht_link_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno,
+ inode_t *inode, struct stat *stbuf)
+{
+ dht_local_t *local = NULL;
+ xlator_t *srcvol = NULL;
+
+
+ if (op_ret == -1)
+ goto err;
+
+ local = frame->local;
+ srcvol = local->linkfile.srcvol;
+
+ STACK_WIND (frame, dht_link_cbk,
+ srcvol, srcvol->fops->link,
+ &local->loc, &local->loc2);
+
+ return 0;
+
+err:
+ DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf);
+
+ return 0;
+}
+
+
+int
+dht_link (call_frame_t *frame, xlator_t *this,
+ loc_t *oldloc, loc_t *newloc)
+{
+ xlator_t *cached_subvol = NULL;
+ xlator_t *hashed_subvol = NULL;
+ int op_errno = -1;
+ int ret = -1;
+ dht_local_t *local = NULL;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (oldloc, err);
+ VALIDATE_OR_GOTO (newloc, err);
+
+ cached_subvol = dht_subvol_get_cached (this, oldloc->inode);
+ if (!cached_subvol) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no cached subvolume for path=%s", oldloc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ hashed_subvol = dht_subvol_get_hashed (this, newloc);
+ if (!hashed_subvol) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no subvolume in layout for path=%s",
+ newloc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local = dht_local_init (frame);
+ if (!local) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto err;
+ }
+
+ ret = loc_copy (&local->loc, oldloc);
+ if (ret == -1) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto err;
+ }
+
+ ret = loc_copy (&local->loc2, newloc);
+ if (ret == -1) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto err;
+ }
+
+ if (hashed_subvol != cached_subvol) {
+ dht_linkfile_create (frame, dht_link_linkfile_cbk,
+ cached_subvol, hashed_subvol, newloc);
+ } else {
+ STACK_WIND (frame, dht_link_cbk,
+ cached_subvol, cached_subvol->fops->link,
+ oldloc, newloc);
+ }
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+
+ return 0;
+}
+
+
+int
+dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno,
+ fd_t *fd, inode_t *inode, struct stat *stbuf)
+{
+ call_frame_t *prev = NULL;
+ dht_layout_t *layout = NULL;
+ int ret = -1;
+
+
+ if (op_ret == -1)
+ goto out;
+
+ prev = cookie;
+
+ dht_itransform (this, prev->this, stbuf->st_ino, &stbuf->st_ino);
+ layout = dht_layout_for_subvol (this, prev->this);
+
+ if (!layout) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no pre-set layout for subvolume %s",
+ prev->this->name);
+ op_ret = -1;
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ ret = inode_ctx_put (inode, this, (uint64_t)(long)layout);
+ if (ret != 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "could not set inode context");
+ op_ret = -1;
+ op_errno = EINVAL;
+ goto out;
+ }
+
+out:
+ DHT_STACK_UNWIND (frame, op_ret, op_errno, fd, inode, stbuf);
+ return 0;
+}
+
+
+int
+dht_create (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int32_t flags, mode_t mode, fd_t *fd)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (loc, err);
+
+ subvol = dht_subvol_get_hashed (this, loc);
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no subvolume in layout for path=%s",
+ loc->path);
+ op_errno = ENOENT;
+ goto err;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "creating %s on %s", loc->path, subvol->name);
+
+ STACK_WIND (frame, dht_create_cbk,
+ subvol, subvol->fops->create,
+ loc, flags, mode, fd);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL);
+
+ return 0;
+}
+
+
+int
+dht_mkdir_selfheal_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this,
+ int32_t op_ret, int32_t op_errno)
+{
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
+
+
+ local = frame->local;
+ layout = local->selfheal.layout;
+
+ if (op_ret == 0) {
+ inode_ctx_put (local->inode, this, (uint64_t)(long)layout);
+ local->selfheal.layout = NULL;
+ local->stbuf.st_ino = local->st_ino;
+ }
+
+ DHT_STACK_UNWIND (frame, op_ret, op_errno,
+ local->inode, &local->stbuf);
+
+ return 0;
+}
+
+
+int
+dht_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode, struct stat *stbuf)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ int ret = -1;
+ call_frame_t *prev = NULL;
+ dht_layout_t *layout = NULL;
+
+ local = frame->local;
+ prev = cookie;
+ layout = local->layout;
+
+ LOCK (&frame->lock);
+ {
+ ret = dht_layout_merge (this, layout, prev->this,
+ op_ret, op_errno, NULL);
+
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ goto unlock;
+ }
+ dht_stat_merge (this, &local->stbuf, stbuf, prev->this);
+ }
+unlock:
+ UNLOCK (&frame->lock);
+
+ this_call_cnt = dht_frame_return (frame);
+ if (is_last_call (this_call_cnt)) {
+ local->layout = NULL;
+ dht_selfheal_directory (frame, dht_mkdir_selfheal_cbk,
+ &local->loc, layout);
+ }
+
+ return 0;
+}
+
+int
+dht_mkdir_hashed_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret, int op_errno,
+ inode_t *inode, struct stat *stbuf)
+{
+ dht_local_t *local = NULL;
+ int ret = -1;
+ call_frame_t *prev = NULL;
+ dht_layout_t *layout = NULL;
+ dht_conf_t *conf = NULL;
+ int i = 0;
+ xlator_t *hashed_subvol = NULL;
+
+ local = frame->local;
+ prev = cookie;
+ layout = local->layout;
+ conf = this->private;
+ hashed_subvol = local->hashed_subvol;
+
+ ret = dht_layout_merge (this, layout, prev->this,
+ op_ret, op_errno, NULL);
+
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ goto err;
+ }
+ local->op_ret = 0;
+
+ dht_stat_merge (this, &local->stbuf, stbuf, prev->this);
+
+ local->st_ino = local->stbuf.st_ino;
+
+ local->call_cnt = conf->subvolume_cnt - 1;
+
+ if (local->call_cnt == 0) {
+ local->layout = NULL;
+ dht_selfheal_directory (frame, dht_mkdir_selfheal_cbk,
+ &local->loc, layout);
+ }
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->subvolumes[i] == hashed_subvol)
+ continue;
+ STACK_WIND (frame, dht_mkdir_cbk,
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->mkdir,
+ &local->loc, local->mode);
+ }
+ return 0;
+err:
+ DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+ return 0;
+}
+
+int
+dht_mkdir (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, mode_t mode)
+{
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int op_errno = -1;
+ int ret = -1;
+ xlator_t *hashed_subvol = NULL;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (loc, err);
+ VALIDATE_OR_GOTO (loc->inode, err);
+ VALIDATE_OR_GOTO (loc->path, err);
+
+ conf = this->private;
+
+ local = dht_local_init (frame);
+ if (!local) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ hashed_subvol = dht_subvol_get_hashed (this, loc);
+
+ if (hashed_subvol == NULL) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "hashed subvol not found");
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local->hashed_subvol = hashed_subvol;
+ local->inode = inode_ref (loc->inode);
+ ret = loc_copy (&local->loc, loc);
+ local->mode = mode;
+
+ if (ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->layout = dht_layout_new (this, conf->subvolume_cnt);
+ if (!local->layout) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ STACK_WIND (frame, dht_mkdir_hashed_cbk,
+ hashed_subvol,
+ hashed_subvol->fops->mkdir,
+ loc, mode);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+
+ return 0;
+}
+
+
+int
+dht_rmdir_selfheal_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno)
+{
+ dht_local_t *local = NULL;
+
+ local = frame->local;
+ local->layout = NULL;
+
+ DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+
+ return 0;
+}
+
+
+int
+dht_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno)
+{
+ uint64_t tmp_layout = 0;
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ call_frame_t *prev = NULL;
+ dht_layout_t *layout = NULL;
+
+ local = frame->local;
+ prev = cookie;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ local->op_ret = -1;
+
+ if (op_errno != ENOENT)
+ local->need_selfheal = 1;
+
+ gf_log (this->name, GF_LOG_ERROR,
+ "rmdir on %s for %s failed (%s)",
+ prev->this->name, local->loc.path,
+ strerror (op_errno));
+ goto unlock;
+ }
+ }
+unlock:
+ UNLOCK (&frame->lock);
+
+
+ this_call_cnt = dht_frame_return (frame);
+ if (is_last_call (this_call_cnt)) {
+ if (local->need_selfheal) {
+ inode_ctx_get (local->loc.inode, this,
+ &tmp_layout);
+ layout = (dht_layout_t *)(long)tmp_layout;
+
+ /* TODO: neater interface needed below */
+ local->stbuf.st_mode = local->loc.inode->st_mode;
+
+ dht_selfheal_restore (frame, dht_rmdir_selfheal_cbk,
+ &local->loc, layout);
+ } else {
+ DHT_STACK_UNWIND (frame, local->op_ret,
+ local->op_errno);
+ }
+ }
+
+ return 0;
+}
+
+
+int
+dht_rmdir_do (call_frame_t *frame, xlator_t *this)
+{
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int i = 0;
+
+ conf = this->private;
+ local = frame->local;
+
+ if (local->op_ret == -1)
+ goto err;
+
+ local->call_cnt = conf->subvolume_cnt;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ STACK_WIND (frame, dht_rmdir_cbk,
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->rmdir,
+ &local->loc);
+ }
+
+ return 0;
+
+err:
+ DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+ return 0;
+}
+
+
+int
+dht_rmdir_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, gf_dirent_t *entries)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = -1;
+ call_frame_t *prev = NULL;
+
+ local = frame->local;
+ prev = cookie;
+
+ if (op_ret > 2) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "readdir on %s for %s returned %d entries",
+ prev->this->name, local->loc.path, op_ret);
+ local->op_ret = -1;
+ local->op_errno = ENOTEMPTY;
+ }
+
+ this_call_cnt = dht_frame_return (frame);
+
+ if (is_last_call (this_call_cnt)) {
+ dht_rmdir_do (frame, this);
+ }
+
+ return 0;
+}
+
+
+int
+dht_rmdir_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, fd_t *fd)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = -1;
+ call_frame_t *prev = NULL;
+
+
+ local = frame->local;
+ prev = cookie;
+
+ if (op_ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "opendir on %s for %s failed (%s)",
+ prev->this->name, local->loc.path,
+ strerror (op_errno));
+ goto err;
+ }
+
+ STACK_WIND (frame, dht_rmdir_readdir_cbk,
+ prev->this, prev->this->fops->readdir,
+ local->fd, 4096, 0);
+
+ return 0;
+
+err:
+ this_call_cnt = dht_frame_return (frame);
+
+ if (is_last_call (this_call_cnt)) {
+ dht_rmdir_do (frame, this);
+ }
+
+ return 0;
+}
+
+
+int
+dht_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int op_errno = -1;
+ int i = -1;
+ int ret = -1;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (loc, err);
+ VALIDATE_OR_GOTO (loc->inode, err);
+ VALIDATE_OR_GOTO (loc->path, err);
+
+ conf = this->private;
+
+ local = dht_local_init (frame);
+ if (!local) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->call_cnt = conf->subvolume_cnt;
+ local->op_ret = 0;
+
+ ret = loc_copy (&local->loc, loc);
+ if (ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->fd = fd_create (local->loc.inode, frame->root->pid);
+ if (!local->fd) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ STACK_WIND (frame, dht_rmdir_opendir_cbk,
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->opendir,
+ loc, local->fd);
+ }
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno);
+
+ return 0;
+}
+
+
+static int32_t
+dht_xattrop_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ dict_t *dict)
+{
+ DHT_STACK_UNWIND (frame, op_ret, op_errno, dict);
+ return 0;
+}
+
+int32_t
+dht_xattrop (call_frame_t *frame,
+ xlator_t *this,
+ loc_t *loc,
+ gf_xattrop_flags_t flags,
+ dict_t *dict)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (loc, err);
+ VALIDATE_OR_GOTO (loc->inode, err);
+ VALIDATE_OR_GOTO (loc->path, err);
+
+ subvol = dht_subvol_get_cached (this, loc->inode);
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no cached subvolume for path=%s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local = dht_local_init (frame);
+ if (!local) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto err;
+ }
+
+ local->inode = inode_ref (loc->inode);
+ local->call_cnt = 1;
+
+ STACK_WIND (frame,
+ dht_xattrop_cbk,
+ subvol, subvol->fops->xattrop,
+ loc, flags, dict);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+
+ return 0;
+}
+
+static int32_t
+dht_fxattrop_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ dict_t *dict)
+{
+ DHT_STACK_UNWIND (frame, op_ret, op_errno, dict);
+ return 0;
+}
+
+int32_t
+dht_fxattrop (call_frame_t *frame,
+ xlator_t *this,
+ fd_t *fd,
+ gf_xattrop_flags_t flags,
+ dict_t *dict)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+
+ subvol = dht_subvol_get_cached (this, fd->inode);
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ STACK_WIND (frame,
+ dht_fxattrop_cbk,
+ subvol, subvol->fops->fxattrop,
+ fd, flags, dict);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+
+ return 0;
+}
+
+
+static int32_t
+dht_inodelk_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno)
+
+{
+ DHT_STACK_UNWIND (frame, op_ret, op_errno);
+ return 0;
+}
+
+
+int32_t
+dht_inodelk (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int32_t cmd, struct flock *lock)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (loc, err);
+ VALIDATE_OR_GOTO (loc->inode, err);
+ VALIDATE_OR_GOTO (loc->path, err);
+
+ subvol = dht_subvol_get_cached (this, loc->inode);
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no cached subvolume for path=%s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local = dht_local_init (frame);
+ if (!local) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto err;
+ }
+
+ local->inode = inode_ref (loc->inode);
+ local->call_cnt = 1;
+
+ STACK_WIND (frame,
+ dht_inodelk_cbk,
+ subvol, subvol->fops->inodelk,
+ loc, cmd, lock);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno);
+
+ return 0;
+}
+
+
+static int32_t
+dht_finodelk_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno)
+
+{
+ DHT_STACK_UNWIND (frame, op_ret, op_errno);
+ return 0;
+}
+
+
+int32_t
+dht_finodelk (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, int32_t cmd, struct flock *lock)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+
+ subvol = dht_subvol_get_cached (this, fd->inode);
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+
+ STACK_WIND (frame,
+ dht_finodelk_cbk,
+ subvol, subvol->fops->finodelk,
+ fd, cmd, lock);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno);
+
+ return 0;
+}
+
+
+static int32_t
+dht_entrylk_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno)
+
+{
+ DHT_STACK_UNWIND (frame, op_ret, op_errno);
+ return 0;
+}
+
+int32_t
+dht_entrylk (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, const char *basename,
+ entrylk_cmd cmd, entrylk_type type)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (loc, err);
+ VALIDATE_OR_GOTO (loc->inode, err);
+ VALIDATE_OR_GOTO (loc->path, err);
+
+ subvol = dht_subvol_get_cached (this, loc->inode);
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no cached subvolume for path=%s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local = dht_local_init (frame);
+ if (!local) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto err;
+ }
+
+ local->inode = inode_ref (loc->inode);
+ local->call_cnt = 1;
+
+ STACK_WIND (frame, dht_entrylk_cbk,
+ subvol, subvol->fops->entrylk,
+ loc, basename, cmd, type);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno);
+
+ return 0;
+}
+
+static int32_t
+dht_fentrylk_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno)
+
+{
+ DHT_STACK_UNWIND (frame, op_ret, op_errno);
+ return 0;
+}
+
+int32_t
+dht_fentrylk (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, const char *basename,
+ entrylk_cmd cmd, entrylk_type type)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+
+ subvol = dht_subvol_get_cached (this, fd->inode);
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ STACK_WIND (frame, dht_fentrylk_cbk,
+ subvol, subvol->fops->fentrylk,
+ fd, basename, cmd, type);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno);
+
+ return 0;
+}
+
+
+int
+dht_forget (xlator_t *this, inode_t *inode)
+{
+ uint64_t tmp_layout = 0;
+ dht_layout_t *layout = NULL;
+
+ inode_ctx_get (inode, this, &tmp_layout);
+
+ if (!layout)
+ return 0;
+ layout = (dht_layout_t *)(long)tmp_layout;
+ if (!layout->preset)
+ FREE (layout);
+
+ return 0;
+}
+
+
+
+static int
+dht_init_subvolumes (xlator_t *this, dht_conf_t *conf)
+{
+ xlator_list_t *subvols = NULL;
+ int cnt = 0;
+
+
+ for (subvols = this->children; subvols; subvols = subvols->next)
+ cnt++;
+
+ conf->subvolumes = CALLOC (cnt, sizeof (xlator_t *));
+ if (!conf->subvolumes) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ return -1;
+ }
+ conf->subvolume_cnt = cnt;
+
+ cnt = 0;
+ for (subvols = this->children; subvols; subvols = subvols->next)
+ conf->subvolumes[cnt++] = subvols->xlator;
+
+ conf->subvolume_status = CALLOC (cnt, sizeof (char));
+ if (!conf->subvolume_status) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ return -1;
+ }
+
+ return 0;
+}
+
+
+int
+dht_notify (xlator_t *this, int event, void *data, ...)
+{
+ xlator_t *subvol = NULL;
+ int cnt = -1;
+ int i = -1;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
+
+
+ conf = this->private;
+
+ switch (event) {
+ case GF_EVENT_CHILD_UP:
+ subvol = data;
+
+ conf->gen++;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (subvol == conf->subvolumes[i]) {
+ cnt = i;
+ break;
+ }
+ }
+
+ if (cnt == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "got GF_EVENT_CHILD_UP bad subvolume %s",
+ subvol->name);
+ break;
+ }
+
+ LOCK (&conf->subvolume_lock);
+ {
+ conf->subvolume_status[cnt] = 1;
+ }
+ UNLOCK (&conf->subvolume_lock);
+
+ break;
+
+ case GF_EVENT_CHILD_DOWN:
+ subvol = data;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (subvol == conf->subvolumes[i]) {
+ cnt = i;
+ break;
+ }
+ }
+
+ if (cnt == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "got GF_EVENT_CHILD_DOWN bad subvolume %s",
+ subvol->name);
+ break;
+ }
+
+ LOCK (&conf->subvolume_lock);
+ {
+ conf->subvolume_status[cnt] = 0;
+ }
+ UNLOCK (&conf->subvolume_lock);
+
+ break;
+ }
+
+ ret = default_notify (this, event, data);
+
+ return ret;
+}
+
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
new file mode 100644
index 00000000000..17017381b08
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -0,0 +1,212 @@
+/*
+ Copyright (c) 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#ifndef _DHT_H
+#define _DHT_H
+
+
+typedef int (*dht_selfheal_dir_cbk_t) (call_frame_t *frame, void *cookie,
+ xlator_t *this,
+ int32_t op_ret, int32_t op_errno);
+
+
+struct dht_layout {
+ int cnt;
+ int preset;
+ int gen;
+ int type;
+ struct {
+ int err; /* 0 = normal
+ -1 = dir exists and no xattr
+ >0 = dir lookup failed with errno
+ */
+ uint32_t start;
+ uint32_t stop;
+ xlator_t *xlator;
+ } list[0];
+};
+typedef struct dht_layout dht_layout_t;
+
+
+struct dht_local {
+ int call_cnt;
+ loc_t loc;
+ loc_t loc2;
+ int op_ret;
+ int op_errno;
+ int layout_mismatch;
+ struct stat stbuf;
+ struct statvfs statvfs;
+ fd_t *fd;
+ inode_t *inode;
+ dict_t *xattr;
+ dict_t *xattr_req;
+ dht_layout_t *layout;
+ size_t size;
+ ino_t st_ino;
+ xlator_t *src_hashed, *src_cached;
+ xlator_t *dst_hashed, *dst_cached;
+ xlator_t *cached_subvol;
+ xlator_t *hashed_subvol;
+ char need_selfheal;
+ struct {
+ fop_mknod_cbk_t linkfile_cbk;
+ struct stat stbuf;
+ loc_t loc;
+ inode_t *inode;
+ dict_t *xattr;
+ xlator_t *srcvol;
+ } linkfile;
+ struct {
+ uint32_t hole_cnt;
+ uint32_t overlaps_cnt;
+ uint32_t missing;
+ uint32_t down;
+ uint32_t misc;
+ dht_selfheal_dir_cbk_t dir_cbk;
+ dht_layout_t *layout;
+ } selfheal;
+
+ /* needed by nufa */
+ int32_t flags;
+ mode_t mode;
+ dev_t rdev;
+};
+typedef struct dht_local dht_local_t;
+
+
+struct dht_conf {
+ gf_lock_t subvolume_lock;
+ int subvolume_cnt;
+ xlator_t **subvolumes;
+ xlator_t *local_volume; /* Needed by NUFA */
+ char *subvolume_status;
+ dht_layout_t **file_layouts;
+ dht_layout_t **dir_layouts;
+ dht_layout_t *default_dir_layout;
+ gf_boolean_t search_unhashed;
+ int gen;
+};
+typedef struct dht_conf dht_conf_t;
+
+
+struct dht_disk_layout {
+ uint32_t cnt;
+ uint32_t type;
+ struct {
+ uint32_t start;
+ uint32_t stop;
+ } list[1];
+};
+typedef struct dht_disk_layout dht_disk_layout_t;
+
+#define ENTRY_MISSING(op_ret, op_errno) (op_ret == -1 && op_errno == ENOENT)
+
+#define is_fs_root(loc) (strcmp (loc->path, "/") == 0)
+
+#define is_revalidate(loc) (inode_ctx_get (loc->inode, this, NULL) == 0)
+
+#define is_last_call(cnt) (cnt == 0)
+
+#define DHT_LINKFILE_MODE (S_ISVTX)
+#define check_is_linkfile(i,s,x) ((s->st_mode & ~S_IFMT) == DHT_LINKFILE_MODE)
+
+#define check_is_dir(i,s,x) (S_ISDIR(s->st_mode))
+
+#define layout_is_sane(layout) ((layout) && (layout->cnt > 0))
+
+#define DHT_STACK_UNWIND(frame, params ...) do { \
+ dht_local_t *__local = NULL; \
+ __local = frame->local; \
+ frame->local = NULL; \
+ STACK_UNWIND (frame, params); \
+ dht_local_wipe (__local); \
+ } while (0)
+
+#define DHT_STACK_DESTROY(frame) do { \
+ dht_local_t *__local = NULL; \
+ __local = frame->local; \
+ frame->local = NULL; \
+ STACK_DESTROY (frame->root); \
+ dht_local_wipe (__local); \
+ } while (0)
+
+dht_layout_t *dht_layout_new (xlator_t *this, int cnt);
+dht_layout_t *dht_layout_get (xlator_t *this, inode_t *inode);
+dht_layout_t *dht_layout_for_subvol (xlator_t *this, xlator_t *subvol);
+xlator_t *dht_layout_search (xlator_t *this, dht_layout_t *layout,
+ const char *name);
+int dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout);
+int dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout,
+ uint32_t *holes_p, uint32_t *overlaps_p,
+ uint32_t *missing_p, uint32_t *down_p,
+ uint32_t *misc_p);
+int dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout,
+ xlator_t *subvol, loc_t *loc, dict_t *xattr);
+
+xlator_t *dht_linkfile_subvol (xlator_t *this, inode_t *inode,
+ struct stat *buf, dict_t *xattr);
+int dht_linkfile_unlink (call_frame_t *frame, xlator_t *this,
+ xlator_t *subvol, loc_t *loc);
+
+int dht_layouts_init (xlator_t *this, dht_conf_t *conf);
+int dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
+ int op_ret, int op_errno, dict_t *xattr);
+
+int dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout,
+ int pos, int32_t **disk_layout_p);
+int dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout,
+ int pos, int32_t *disk_layout);
+
+
+int dht_frame_return (call_frame_t *frame);
+
+int dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y);
+int dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol,
+ uint64_t *x);
+
+void dht_local_wipe (dht_local_t *local);
+dht_local_t *dht_local_init (call_frame_t *frame);
+int dht_stat_merge (xlator_t *this, struct stat *to, struct stat *from,
+ xlator_t *subvol);
+
+xlator_t *dht_subvol_get_hashed (xlator_t *this, loc_t *loc);
+xlator_t *dht_subvol_get_cached (xlator_t *this, inode_t *inode);
+xlator_t *dht_subvol_next (xlator_t *this, xlator_t *prev);
+int dht_subvol_cnt (xlator_t *this, xlator_t *subvol);
+
+int dht_hash_compute (int type, const char *name, uint32_t *hash_p);
+
+int dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk,
+ xlator_t *tovol, xlator_t *fromvol, loc_t *loc);
+int
+dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk,
+ loc_t *loc, dht_layout_t *layout);
+int
+dht_selfheal_restore (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk,
+ loc_t *loc, dht_layout_t *layout);
+
+int dht_rename (call_frame_t *frame, xlator_t *this,
+ loc_t *oldloc, loc_t *newloc);
+#endif /* _DHT_H */
diff --git a/xlators/cluster/dht/src/dht-hashfn-tea.c b/xlators/cluster/dht/src/dht-hashfn-tea.c
new file mode 100644
index 00000000000..8437b495541
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-hashfn-tea.c
@@ -0,0 +1,146 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+
+#define DELTA 0x9E3779B9
+#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */
+#define PARTROUNDS 6 /* 6 gets complete mixing */
+
+
+static int
+tearound (int rounds, uint32_t *array, uint32_t *h0, uint32_t *h1)
+{
+ uint32_t sum = 0;
+ int n = 0;
+ uint32_t b0 = 0;
+ uint32_t b1 = 0;
+
+ b0 = *h0;
+ b1 = *h1;
+
+ n = rounds;
+
+ do {
+ sum += DELTA;
+ b0 += ((b1 << 4) + array[0])
+ ^ (b1 + sum)
+ ^ ((b1 >> 5) + array[1]);
+ b1 += ((b0 << 4) + array[2])
+ ^ (b0 + sum)
+ ^ ((b0 >> 5) + array[3]);
+ } while (--n);
+
+ *h0 += b0;
+ *h1 += b1;
+
+ return 0;
+}
+
+
+uint32_t
+__pad (int len)
+{
+ uint32_t pad = 0;
+
+ pad = (uint32_t) len | ((uint32_t) len << 8);
+ pad |= pad << 16;
+
+ return pad;
+}
+
+
+uint32_t
+dht_hashfn_tea (const char *msg, int len)
+{
+ uint32_t h0 = 0x9464a485;
+ uint32_t h1 = 0x542e1a94;
+ uint32_t array[4];
+ uint32_t pad = 0;
+ int i = 0;
+ int j = 0;
+ int full_quads = 0;
+ int full_words = 0;
+ int full_bytes = 0;
+ uint32_t *intmsg = NULL;
+ int word = 0;
+
+
+ intmsg = (uint32_t *) msg;
+ pad = __pad (len);
+
+ full_bytes = len;
+ full_words = len / 4;
+ full_quads = len / 16;
+
+ for (i = 0; i < full_quads; i++) {
+ for (j = 0; j < 4; j++) {
+ word = *intmsg;
+ array[j] = word;
+ intmsg++;
+ full_words--;
+ full_bytes -= 4;
+ }
+ tearound (PARTROUNDS, &array[0], &h0, &h1);
+ }
+
+ if ((len % 16) == 0) {
+ goto done;
+ }
+
+ for (j = 0; j < 4; j++) {
+ if (full_words) {
+ word = *intmsg;
+ array[j] = word;
+ intmsg++;
+ full_words--;
+ full_bytes -= 4;
+ } else {
+ array[j] = pad;
+ while (full_bytes) {
+ array[j] <<= 8;
+ array[j] |= msg[len - full_bytes];
+ full_bytes--;
+ }
+ }
+ }
+ tearound (FULLROUNDS, &array[0], &h0, &h1);
+
+done:
+ return h0 ^ h1;
+}
+
+
+#if 0
+int
+main (int argc, char *argv[])
+{
+ int i = 0;
+ int hashval = 0;
+
+ for (i = 1; i < argc; i++) {
+ hashval = tea (argv[i], strlen (argv[i]));
+ printf ("%s: %x\n", argv[i], hashval);
+ }
+}
+#endif
diff --git a/xlators/cluster/dht/src/dht-hashfn.c b/xlators/cluster/dht/src/dht-hashfn.c
new file mode 100644
index 00000000000..9e321a43cec
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-hashfn.c
@@ -0,0 +1,88 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+
+#include "glusterfs.h"
+#include "xlator.h"
+#include "dht-common.h"
+
+
+uint32_t dht_hashfn_tea (const char *name, int len);
+
+
+typedef enum {
+ DHT_HASH_TYPE_TEA,
+} dht_hashfn_type_t;
+
+
+int
+dht_hash_compute_internal (int type, const char *name, uint32_t *hash_p)
+{
+ int ret = 0;
+ uint32_t hash = 0;
+
+ switch (type) {
+ case DHT_HASH_TYPE_TEA:
+ hash = dht_hashfn_tea (name, strlen (name));
+ break;
+ default:
+ ret = -1;
+ break;
+ }
+
+ if (ret == 0) {
+ *hash_p = hash;
+ }
+
+ return ret;
+}
+
+
+#define MAKE_RSYNC_FRIENDLY_NAME(rsync_frndly_name, name) do { \
+ rsync_frndly_name = (char *) name; \
+ if (name[0] == '.') { \
+ char *dot = 0; \
+ int namelen = 0; \
+ \
+ dot = strrchr (name, '.'); \
+ if (dot && dot > (name + 1) && *(dot + 1)) { \
+ namelen = (dot - name); \
+ rsync_frndly_name = alloca (namelen); \
+ strncpy (rsync_frndly_name, name + 1, \
+ namelen); \
+ rsync_frndly_name[namelen - 1] = 0; \
+ } \
+ } \
+ } while (0);
+
+
+int
+dht_hash_compute (int type, const char *name, uint32_t *hash_p)
+{
+ char *rsync_friendly_name = NULL;
+
+ MAKE_RSYNC_FRIENDLY_NAME (rsync_friendly_name, name);
+
+ return dht_hash_compute_internal (type, rsync_friendly_name, hash_p);
+}
diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c
new file mode 100644
index 00000000000..52d0720025f
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-helper.c
@@ -0,0 +1,326 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+
+#include "glusterfs.h"
+#include "xlator.h"
+#include "dht-common.h"
+
+
+int
+dht_frame_return (call_frame_t *frame)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = -1;
+
+ if (!frame)
+ return -1;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ this_call_cnt = --local->call_cnt;
+ }
+ UNLOCK (&frame->lock);
+
+ return this_call_cnt;
+}
+
+
+int
+dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p)
+{
+ dht_conf_t *conf = NULL;
+ int cnt = 0;
+ int max = 0;
+ uint64_t y = 0;
+
+
+ if (x == ((uint64_t) -1)) {
+ y = (uint64_t) -1;
+ goto out;
+ }
+
+ conf = this->private;
+
+ max = conf->subvolume_cnt;
+ cnt = dht_subvol_cnt (this, subvol);
+
+ y = ((x * max) + cnt);
+
+out:
+ if (y_p)
+ *y_p = y;
+
+ return 0;
+}
+
+
+int
+dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol_p,
+ uint64_t *x_p)
+{
+ dht_conf_t *conf = NULL;
+ int cnt = 0;
+ int max = 0;
+ uint64_t x = 0;
+ xlator_t *subvol = 0;
+
+
+ conf = this->private;
+ max = conf->subvolume_cnt;
+
+ cnt = y % max;
+ x = y / max;
+
+ subvol = conf->subvolumes[cnt];
+
+ if (subvol_p)
+ *subvol_p = subvol;
+
+ if (x_p)
+ *x_p = x;
+
+ return 0;
+}
+
+
+void
+dht_local_wipe (dht_local_t *local)
+{
+ if (!local)
+ return;
+
+ loc_wipe (&local->loc);
+ loc_wipe (&local->loc2);
+
+ if (local->xattr)
+ dict_unref (local->xattr);
+
+ if (local->inode)
+ inode_unref (local->inode);
+
+ if (local->layout)
+ FREE (local->layout);
+
+ loc_wipe (&local->linkfile.loc);
+
+ if (local->linkfile.xattr)
+ dict_unref (local->linkfile.xattr);
+
+ if (local->linkfile.inode)
+ inode_unref (local->linkfile.inode);
+
+ if (local->fd) {
+ fd_unref (local->fd);
+ local->fd = NULL;
+ }
+
+ if (local->xattr_req)
+ dict_unref (local->xattr_req);
+
+ FREE (local);
+}
+
+
+dht_local_t *
+dht_local_init (call_frame_t *frame)
+{
+ dht_local_t *local = NULL;
+
+ /* TODO: use mem-pool */
+ local = CALLOC (1, sizeof (*local));
+
+ if (!local)
+ return NULL;
+
+ local->op_ret = -1;
+ local->op_errno = EUCLEAN;
+
+ frame->local = local;
+
+ return local;
+}
+
+
+char *
+basestr (const char *str)
+{
+ char *basestr = NULL;
+
+ basestr = strrchr (str, '/');
+ if (basestr)
+ basestr ++;
+
+ return basestr;
+}
+
+xlator_t *
+dht_first_up_child (xlator_t *this)
+{
+ dht_conf_t *conf = NULL;
+ xlator_t *child = NULL;
+ int i = 0;
+
+ conf = this->private;
+
+ LOCK (&conf->subvolume_lock);
+ {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->subvolume_status[i]) {
+ child = conf->subvolumes[i];
+ break;
+ }
+ }
+ }
+ UNLOCK (&conf->subvolume_lock);
+
+ return child;
+}
+
+xlator_t *
+dht_subvol_get_hashed (xlator_t *this, loc_t *loc)
+{
+ dht_layout_t *layout = NULL;
+ xlator_t *subvol = NULL;
+
+ if (is_fs_root (loc)) {
+ subvol = dht_first_up_child (this);
+ goto out;
+ }
+
+ layout = dht_layout_get (this, loc->parent);
+
+ if (!layout) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "layout missing path=%s parent=%"PRId64,
+ loc->path, loc->parent->ino);
+ goto out;
+ }
+
+ subvol = dht_layout_search (this, layout, loc->name);
+
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "could not find subvolume for path=%s",
+ loc->path);
+ goto out;
+ }
+
+out:
+ return subvol;
+}
+
+
+xlator_t *
+dht_subvol_get_cached (xlator_t *this, inode_t *inode)
+{
+ dht_layout_t *layout = NULL;
+ xlator_t *subvol = NULL;
+
+
+ layout = dht_layout_get (this, inode);
+
+ if (!layout) {
+ goto out;
+ }
+
+ subvol = layout->list[0].xlator;
+
+out:
+ return subvol;
+}
+
+
+xlator_t *
+dht_subvol_next (xlator_t *this, xlator_t *prev)
+{
+ dht_conf_t *conf = NULL;
+ int i = 0;
+ xlator_t *next = NULL;
+
+ conf = this->private;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->subvolumes[i] == prev) {
+ if ((i + 1) < conf->subvolume_cnt)
+ next = conf->subvolumes[i + 1];
+ break;
+ }
+ }
+
+ return next;
+}
+
+
+int
+dht_subvol_cnt (xlator_t *this, xlator_t *subvol)
+{
+ int i = 0;
+ int ret = -1;
+ dht_conf_t *conf = NULL;
+
+
+ conf = this->private;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (subvol == conf->subvolumes[i]) {
+ ret = i;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+
+#define set_if_greater(a, b) do { \
+ if ((a) < (b)) \
+ (a) = (b); \
+ } while (0)
+
+int
+dht_stat_merge (xlator_t *this, struct stat *to,
+ struct stat *from, xlator_t *subvol)
+{
+ to->st_dev = from->st_dev;
+
+ dht_itransform (this, subvol, from->st_ino, &to->st_ino);
+
+ to->st_mode = from->st_mode;
+ to->st_nlink = from->st_nlink;
+ to->st_uid = from->st_uid;
+ to->st_gid = from->st_gid;
+ to->st_rdev = from->st_rdev;
+ to->st_size += from->st_size;
+ to->st_blksize = from->st_blksize;
+ to->st_blocks += from->st_blocks;
+
+ set_if_greater (to->st_atime, from->st_atime);
+ set_if_greater (to->st_mtime, from->st_mtime);
+ set_if_greater (to->st_ctime, from->st_ctime);
+
+ return 0;
+}
diff --git a/xlators/cluster/dht/src/dht-layout.c b/xlators/cluster/dht/src/dht-layout.c
new file mode 100644
index 00000000000..08b4a2746f8
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-layout.c
@@ -0,0 +1,543 @@
+/*
+ Copyright (c) 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+
+#include "glusterfs.h"
+#include "xlator.h"
+#include "dht-common.h"
+#include "byte-order.h"
+
+#define layout_base_size (sizeof (dht_layout_t))
+
+#define layout_entry_size (sizeof ((dht_layout_t *)NULL)->list[0])
+
+#define layout_size(cnt) (layout_base_size + (cnt * layout_entry_size))
+
+
+dht_layout_t *
+dht_layout_new (xlator_t *this, int cnt)
+{
+ dht_layout_t *layout = NULL;
+
+
+ layout = CALLOC (1, layout_size (cnt));
+ if (!layout) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto out;
+ }
+
+ layout->cnt = cnt;
+
+out:
+ return layout;
+}
+
+
+dht_layout_t *
+dht_layout_get (xlator_t *this, inode_t *inode)
+{
+ uint64_t layout = 0;
+ int ret = -1;
+
+ ret = inode_ctx_get (inode, this, &layout);
+
+ return (dht_layout_t *)(long)layout;
+}
+
+
+xlator_t *
+dht_layout_search (xlator_t *this, dht_layout_t *layout, const char *name)
+{
+ uint32_t hash = 0;
+ xlator_t *subvol = NULL;
+ int i = 0;
+ int ret = 0;
+
+
+ ret = dht_hash_compute (layout->type, name, &hash);
+ if (ret != 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "hash computation failed for type=%d name=%s",
+ layout->type, name);
+ goto out;
+ }
+
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].start <= hash
+ && layout->list[i].stop >= hash) {
+ subvol = layout->list[i].xlator;
+ break;
+ }
+ }
+
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "no subvolume for hash (value) = %u", hash);
+ }
+
+out:
+ return subvol;
+}
+
+
+dht_layout_t *
+dht_layout_for_subvol (xlator_t *this, xlator_t *subvol)
+{
+ dht_conf_t *conf = NULL;
+ dht_layout_t *layout = NULL;
+ int i = 0;
+
+
+ conf = this->private;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->subvolumes[i] == subvol) {
+ layout = conf->file_layouts[i];
+ break;
+ }
+ }
+
+ return layout;
+}
+
+
+int
+dht_layouts_init (xlator_t *this, dht_conf_t *conf)
+{
+ dht_layout_t *layout = NULL;
+ int i = 0;
+ int ret = -1;
+
+
+ conf->file_layouts = CALLOC (conf->subvolume_cnt,
+ sizeof (dht_layout_t *));
+ if (!conf->file_layouts) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto out;
+ }
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ layout = dht_layout_new (this, 1);
+
+ if (!layout) {
+ goto out;
+ }
+
+ layout->preset = 1;
+
+ layout->list[0].xlator = conf->subvolumes[i];
+
+ conf->file_layouts[i] = layout;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+
+int
+dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout,
+ int pos, int32_t **disk_layout_p)
+{
+ int ret = -1;
+ int32_t *disk_layout = NULL;
+
+ disk_layout = CALLOC (5, sizeof (int));
+ if (!disk_layout) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto out;
+ }
+
+ disk_layout[0] = hton32 (1);
+ disk_layout[1] = hton32 (layout->type);
+ disk_layout[2] = hton32 (layout->list[pos].start);
+ disk_layout[3] = hton32 (layout->list[pos].stop);
+
+ if (disk_layout_p)
+ *disk_layout_p = disk_layout;
+ ret = 0;
+
+out:
+ return ret;
+}
+
+
+int
+dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout,
+ int pos, int32_t *disk_layout)
+{
+ int cnt = 0;
+ int type = 0;
+ int start_off = 0;
+ int stop_off = 0;
+
+
+ /* TODO: assert disk_layout_ptr is of required length */
+
+ cnt = ntoh32 (disk_layout[0]);
+ if (cnt != 1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "disk layout has invalid count %d", cnt);
+ return -1;
+ }
+
+ /* TODO: assert type is compatible */
+ type = ntoh32 (disk_layout[1]);
+ start_off = ntoh32 (disk_layout[2]);
+ stop_off = ntoh32 (disk_layout[3]);
+
+ layout->list[pos].start = start_off;
+ layout->list[pos].stop = stop_off;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "merged to layout: %u - %u (type %d) from %s",
+ start_off, stop_off, type,
+ layout->list[pos].xlator->name);
+
+ return 0;
+}
+
+
+int
+dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
+ int op_ret, int op_errno, dict_t *xattr)
+{
+ int i = 0;
+ int ret = -1;
+ int err = -1;
+ int32_t *disk_layout = NULL;
+
+
+ if (op_ret != 0) {
+ err = op_errno;
+ }
+
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].xlator == NULL) {
+ layout->list[i].err = err;
+ layout->list[i].xlator = subvol;
+ break;
+ }
+ }
+
+ if (op_ret != 0) {
+ ret = 0;
+ goto out;
+ }
+
+ if (xattr) {
+ /* during lookup and not mkdir */
+ ret = dict_get_ptr (xattr, "trusted.glusterfs.dht",
+ VOID(&disk_layout));
+ }
+
+ if (ret != 0) {
+ layout->list[i].err = -1;
+ gf_log (this->name, GF_LOG_DEBUG,
+ "missing disk layout on %s. err = %d",
+ subvol->name, err);
+ ret = 0;
+ goto out;
+ }
+
+ ret = dht_disk_layout_merge (this, layout, i, disk_layout);
+ if (ret != 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "layout merge from subvolume %s failed",
+ subvol->name);
+ goto out;
+ }
+ layout->list[i].err = 0;
+
+out:
+ return ret;
+}
+
+
+void
+dht_layout_entry_swap (dht_layout_t *layout, int i, int j)
+{
+ uint32_t start_swap = 0;
+ uint32_t stop_swap = 0;
+ xlator_t *xlator_swap = 0;
+ int err_swap = 0;
+
+
+ start_swap = layout->list[i].start;
+ stop_swap = layout->list[i].stop;
+ xlator_swap = layout->list[i].xlator;
+ err_swap = layout->list[i].err;
+
+ layout->list[i].start = layout->list[j].start;
+ layout->list[i].stop = layout->list[j].stop;
+ layout->list[i].xlator = layout->list[j].xlator;
+ layout->list[i].err = layout->list[j].err;
+
+ layout->list[j].start = start_swap;
+ layout->list[j].stop = stop_swap;
+ layout->list[j].xlator = xlator_swap;
+ layout->list[j].err = err_swap;
+}
+
+
+int64_t
+dht_layout_entry_cmp (dht_layout_t *layout, int i, int j)
+{
+ int64_t diff = 0;
+
+ if (layout->list[i].err || layout->list[j].err)
+ diff = layout->list[i].err - layout->list[j].err;
+ else
+ diff = (int64_t) layout->list[i].start
+ - (int64_t) layout->list[j].start;
+
+ return diff;
+}
+
+
+int
+dht_layout_sort (dht_layout_t *layout)
+{
+ int i = 0;
+ int j = 0;
+ int64_t ret = 0;
+
+ /* TODO: O(n^2) -- bad bad */
+
+ for (i = 0; i < layout->cnt - 1; i++) {
+ for (j = i + 1; j < layout->cnt; j++) {
+ ret = dht_layout_entry_cmp (layout, i, j);
+ if (ret > 0)
+ dht_layout_entry_swap (layout, i, j);
+ }
+ }
+
+ return 0;
+}
+
+
+int
+dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout,
+ uint32_t *holes_p, uint32_t *overlaps_p,
+ uint32_t *missing_p, uint32_t *down_p, uint32_t *misc_p)
+{
+ dht_conf_t *conf = NULL;
+ uint32_t holes = 0;
+ uint32_t overlaps = 0;
+ uint32_t missing = 0;
+ uint32_t down = 0;
+ uint32_t misc = 0;
+ uint32_t hole_cnt = 0;
+ uint32_t overlap_cnt = 0;
+ int i = 0;
+ int ret = 0;
+ uint32_t prev_stop = 0;
+ uint32_t last_stop = 0;
+ char is_virgin = 1;
+
+
+ conf = this->private;
+
+ /* TODO: explain WTF is happening */
+
+ last_stop = layout->list[0].start - 1;
+ prev_stop = last_stop;
+
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].err) {
+ switch (layout->list[i].err) {
+ case -1:
+ case ENOENT:
+ missing++;
+ break;
+ case ENOTCONN:
+ down++;
+ break;
+ default:
+ misc++;
+ }
+ continue;
+ }
+
+ is_virgin = 0;
+
+ if ((prev_stop + 1) < layout->list[i].start) {
+ hole_cnt++;
+ holes += (layout->list[i].start - (prev_stop + 1));
+ }
+
+ if ((prev_stop + 1) > layout->list[i].start) {
+ overlap_cnt++;
+ overlaps += ((prev_stop + 1) - layout->list[i].start);
+ }
+ prev_stop = layout->list[i].stop;
+ }
+
+ if ((last_stop - prev_stop) || is_virgin)
+ hole_cnt++;
+ holes += (last_stop - prev_stop);
+
+ if (holes_p)
+ *holes_p = hole_cnt;
+
+ if (overlaps_p)
+ *overlaps_p = overlap_cnt;
+
+ if (missing_p)
+ *missing_p = missing;
+
+ if (down_p)
+ *down_p = down;
+
+ if (misc_p)
+ *misc_p = misc;
+
+ return ret;
+}
+
+
+int
+dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout)
+{
+ int ret = 0;
+ uint32_t holes = 0;
+ uint32_t overlaps = 0;
+ uint32_t missing = 0;
+ uint32_t down = 0;
+ uint32_t misc = 0;
+
+
+ ret = dht_layout_sort (layout);
+ if (ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "sort failed?! how the ....");
+ goto out;
+ }
+
+ ret = dht_layout_anomalies (this, loc, layout,
+ &holes, &overlaps,
+ &missing, &down, &misc);
+ if (ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "error while finding anomalies in %s -- not good news",
+ loc->path);
+ goto out;
+ }
+
+ if (holes || overlaps) {
+ if (missing == layout->cnt) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "directory %s looked up first time",
+ loc->path);
+ } else {
+ gf_log (this->name, GF_LOG_ERROR,
+ "found anomalies in %s. holes=%d overlaps=%d",
+ loc->path, holes, overlaps);
+ }
+ ret = 1;
+ }
+
+out:
+ return ret;
+}
+
+
+int
+dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
+ loc_t *loc, dict_t *xattr)
+{
+ int idx = 0;
+ int pos = -1;
+ int ret = -1;
+ int32_t *disk_layout = NULL;
+ int32_t count = -1;
+ uint32_t start_off = -1;
+ uint32_t stop_off = -1;
+
+
+ for (idx = 0; idx < layout->cnt; idx++) {
+ if (layout->list[idx].xlator == subvol) {
+ pos = idx;
+ break;
+ }
+ }
+
+ if (pos == -1) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "%s - no layout info for subvolume %s",
+ loc->path, subvol->name);
+ ret = 1;
+ goto out;
+ }
+
+ if (xattr == NULL) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s - xattr dictionary is NULL",
+ loc->path);
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_get_ptr (xattr, "trusted.glusterfs.dht",
+ VOID(&disk_layout));
+
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s - disk layout missing", loc->path);
+ ret = -1;
+ goto out;
+ }
+
+ count = ntoh32 (disk_layout[0]);
+ if (count != 1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s - disk layout has invalid count %d",
+ loc->path, count);
+ ret = -1;
+ goto out;
+ }
+
+ start_off = ntoh32 (disk_layout[2]);
+ stop_off = ntoh32 (disk_layout[3]);
+
+ if ((layout->list[pos].start != start_off)
+ || (layout->list[pos].stop != stop_off)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "subvol: %s; inode layout - %"PRId32" - %"PRId32"; "
+ "disk layout - %"PRId32" - %"PRId32,
+ layout->list[pos].xlator->name,
+ layout->list[pos].start, layout->list[pos].stop,
+ start_off, stop_off);
+ ret = 1;
+ } else {
+ ret = 0;
+ }
+out:
+ return ret;
+}
+
diff --git a/xlators/cluster/dht/src/dht-linkfile.c b/xlators/cluster/dht/src/dht-linkfile.c
new file mode 100644
index 00000000000..9cc24ccf6b3
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-linkfile.c
@@ -0,0 +1,224 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+
+#include "glusterfs.h"
+#include "xlator.h"
+#include "compat.h"
+#include "dht-common.h"
+
+
+
+int
+dht_linkfile_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno)
+{
+ dht_local_t *local = NULL;
+
+
+ local = frame->local;
+ local->linkfile.linkfile_cbk (frame, cookie, this, op_ret, op_errno,
+ local->linkfile.inode,
+ &local->linkfile.stbuf);
+
+ return 0;
+}
+
+
+int
+dht_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno,
+ inode_t *inode, struct stat *stbuf)
+{
+ dht_local_t *local = NULL;
+ call_frame_t *prev = NULL;
+ dict_t *xattr = NULL;
+ data_t *str_data = NULL;
+ int ret = -1;
+
+ local = frame->local;
+ prev = cookie;
+
+ if (op_ret == -1)
+ goto err;
+
+ xattr = get_new_dict ();
+ if (!xattr) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->linkfile.xattr = dict_ref (xattr);
+ local->linkfile.inode = inode_ref (inode);
+
+ str_data = str_to_data (local->linkfile.srcvol->name);
+ if (!str_data) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ ret = dict_set (xattr, "trusted.glusterfs.dht.linkto", str_data);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to initialize linkfile data");
+ op_errno = EINVAL;
+ }
+ str_data = NULL;
+
+ local->linkfile.stbuf = *stbuf;
+
+ STACK_WIND (frame, dht_linkfile_xattr_cbk,
+ prev->this, prev->this->fops->setxattr,
+ &local->linkfile.loc, local->linkfile.xattr, 0);
+
+ return 0;
+
+err:
+ if (str_data) {
+ data_destroy (str_data);
+ str_data = NULL;
+ }
+
+ local->linkfile.linkfile_cbk (frame, cookie, this,
+ op_ret, op_errno, inode, stbuf);
+ return 0;
+}
+
+
+int
+dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk,
+ xlator_t *tovol, xlator_t *fromvol, loc_t *loc)
+{
+ dht_local_t *local = NULL;
+
+
+ local = frame->local;
+ local->linkfile.linkfile_cbk = linkfile_cbk;
+ local->linkfile.srcvol = tovol;
+ loc_copy (&local->linkfile.loc, loc);
+
+ STACK_WIND (frame, dht_linkfile_create_cbk,
+ fromvol, fromvol->fops->mknod, loc,
+ S_IFREG | DHT_LINKFILE_MODE, 0);
+
+ return 0;
+}
+
+
+int
+dht_linkfile_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno)
+{
+ dht_local_t *local = NULL;
+ call_frame_t *prev = NULL;
+ xlator_t *subvol = NULL;
+
+ local = frame->local;
+ prev = cookie;
+ subvol = prev->this;
+
+ if (op_ret == -1) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "unlinking linkfile %s on %s failed (%s)",
+ local->loc.path, subvol->name, strerror (op_errno));
+ }
+
+ DHT_STACK_DESTROY (frame);
+
+ return 0;
+}
+
+
+int
+dht_linkfile_unlink (call_frame_t *frame, xlator_t *this,
+ xlator_t *subvol, loc_t *loc)
+{
+ call_frame_t *unlink_frame = NULL;
+ dht_local_t *unlink_local = NULL;
+
+ unlink_frame = copy_frame (frame);
+ if (!unlink_frame) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto err;
+ }
+
+ unlink_local = dht_local_init (unlink_frame);
+ if (!unlink_local) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto err;
+ }
+
+ loc_copy (&unlink_local->loc, loc);
+
+ STACK_WIND (unlink_frame, dht_linkfile_unlink_cbk,
+ subvol, subvol->fops->unlink,
+ &unlink_local->loc);
+
+ return 0;
+err:
+ if (unlink_frame)
+ DHT_STACK_DESTROY (unlink_frame);
+
+ return -1;
+}
+
+
+xlator_t *
+dht_linkfile_subvol (xlator_t *this, inode_t *inode, struct stat *stbuf,
+ dict_t *xattr)
+{
+ dht_conf_t *conf = NULL;
+ xlator_t *subvol = NULL;
+ void *volname = NULL;
+ int i = 0, ret = 0;
+
+
+ conf = this->private;
+
+ if (!xattr)
+ goto out;
+
+ ret = dict_get_ptr (xattr, "trusted.glusterfs.dht.linkto", &volname);
+
+ if ((-1 == ret) || !volname)
+ goto out;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (strcmp (conf->subvolumes[i]->name, (char *)volname) == 0) {
+ subvol = conf->subvolumes[i];
+ break;
+ }
+ }
+
+out:
+ return subvol;
+}
+
+
diff --git a/xlators/cluster/dht/src/dht-rename.c b/xlators/cluster/dht/src/dht-rename.c
new file mode 100644
index 00000000000..e5532f1bc87
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-rename.c
@@ -0,0 +1,562 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+/* TODO: link(oldpath, newpath) fails if newpath already exists. DHT should
+ * delete the newpath if it gets EEXISTS from link() call.
+ */
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "xlator.h"
+#include "dht-common.h"
+#include "defaults.h"
+
+
+int
+dht_rename_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct stat *stbuf)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ call_frame_t *prev = NULL;
+
+
+ local = frame->local;
+ prev = cookie;
+
+ if (op_ret == -1) {
+ /* TODO: undo the damage */
+
+ gf_log (this->name, GF_LOG_ERROR,
+ "rename %s -> %s on %s failed (%s)",
+ local->loc.path, local->loc2.path,
+ prev->this->name, strerror (op_errno));
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ } else {
+ /* TODO: construct proper stbuf for dir */
+ local->stbuf = *stbuf;
+ }
+
+ this_call_cnt = dht_frame_return (frame);
+ if (is_last_call (this_call_cnt)) {
+ DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+ &local->stbuf);
+ }
+
+ return 0;
+}
+
+
+
+int
+dht_rename_dir_do (call_frame_t *frame, xlator_t *this)
+{
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int i = 0;
+
+ conf = this->private;
+ local = frame->local;
+
+ if (local->op_ret == -1)
+ goto err;
+
+ local->call_cnt = conf->subvolume_cnt;
+ local->op_ret = 0;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ STACK_WIND (frame, dht_rename_dir_cbk,
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->rename,
+ &local->loc, &local->loc2);
+ }
+
+ return 0;
+
+err:
+ DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+ return 0;
+}
+
+
+int
+dht_rename_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, gf_dirent_t *entries)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = -1;
+ call_frame_t *prev = NULL;
+
+ local = frame->local;
+ prev = cookie;
+
+ if (op_ret > 2) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "readdir on %s for %s returned %d entries",
+ prev->this->name, local->loc.path, op_ret);
+ local->op_ret = -1;
+ local->op_errno = ENOTEMPTY;
+ }
+
+ this_call_cnt = dht_frame_return (frame);
+
+ if (is_last_call (this_call_cnt)) {
+ dht_rename_dir_do (frame, this);
+ }
+
+ return 0;
+}
+
+
+int
+dht_rename_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, fd_t *fd)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = -1;
+ call_frame_t *prev = NULL;
+
+
+ local = frame->local;
+ prev = cookie;
+
+ if (op_ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "opendir on %s for %s failed (%s)",
+ prev->this->name, local->loc.path,
+ strerror (op_errno));
+ goto err;
+ }
+
+ STACK_WIND (frame, dht_rename_readdir_cbk,
+ prev->this, prev->this->fops->readdir,
+ local->fd, 4096, 0);
+
+ return 0;
+
+err:
+ this_call_cnt = dht_frame_return (frame);
+
+ if (is_last_call (this_call_cnt)) {
+ dht_rename_dir_do (frame, this);
+ }
+
+ return 0;
+}
+
+
+int
+dht_rename_dir (call_frame_t *frame, xlator_t *this)
+{
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ int i = 0;
+ int op_errno = -1;
+
+
+ conf = frame->this->private;
+ local = frame->local;
+
+ local->call_cnt = conf->subvolume_cnt;
+
+ local->fd = fd_create (local->loc.inode, frame->root->pid);
+ if (!local->fd) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->op_ret = 0;
+
+ if (!local->dst_cached) {
+ dht_rename_dir_do (frame, this);
+ return 0;
+ }
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ STACK_WIND (frame, dht_rename_opendir_cbk,
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->opendir,
+ &local->loc2, local->fd);
+ }
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+ return 0;
+}
+
+
+int
+dht_rename_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno)
+{
+ dht_local_t *local = NULL;
+ call_frame_t *prev = NULL;
+ int this_call_cnt = 0;
+
+ local = frame->local;
+ prev = cookie;
+
+ this_call_cnt = dht_frame_return (frame);
+
+ if (op_ret == -1) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "unlink on %s failed (%s)",
+ prev->this->name, strerror (op_errno));
+ }
+
+ if (is_last_call (this_call_cnt))
+ DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+ &local->stbuf);
+
+ return 0;
+}
+
+
+int
+dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct stat *stbuf)
+{
+ dht_local_t *local = NULL;
+ call_frame_t *prev = NULL;
+ xlator_t *src_hashed = NULL;
+ xlator_t *src_cached = NULL;
+ xlator_t *dst_hashed = NULL;
+ xlator_t *dst_cached = NULL;
+ xlator_t *rename_subvol = NULL;
+
+ local = frame->local;
+ prev = cookie;
+
+ src_hashed = local->src_hashed;
+ src_cached = local->src_cached;
+ dst_hashed = local->dst_hashed;
+ dst_cached = local->dst_cached;
+
+ if (op_ret == -1) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "rename on %s failed (%s)", prev->this->name,
+ strerror (op_errno));
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ goto unwind;
+ }
+
+ /* NOTE: rename_subvol is the same subvolume from which dht_rename_cbk
+ * is called. since rename has already happened on rename_subvol,
+ * unlink should not be sent for oldpath (either linkfile or cached-file)
+ * on rename_subvol. */
+ if (src_cached == dst_cached)
+ rename_subvol = src_cached;
+ else
+ rename_subvol = dst_hashed;
+
+ /* TODO: delete files in background */
+
+ if (src_cached != dst_hashed && src_cached != dst_cached)
+ local->call_cnt++;
+
+ if (src_hashed != rename_subvol && src_hashed != src_cached)
+ local->call_cnt++;
+
+ if (dst_cached && dst_cached != dst_hashed && dst_cached != src_cached)
+ local->call_cnt++;
+
+ if (local->call_cnt == 0)
+ goto unwind;
+
+ if (src_cached != dst_hashed && src_cached != dst_cached) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "deleting old src datafile %s @ %s",
+ local->loc.path, src_cached->name);
+
+ STACK_WIND (frame, dht_rename_unlink_cbk,
+ src_cached, src_cached->fops->unlink,
+ &local->loc);
+ }
+
+ if (src_hashed != rename_subvol && src_hashed != src_cached) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "deleting old src linkfile %s @ %s",
+ local->loc.path, src_hashed->name);
+
+ STACK_WIND (frame, dht_rename_unlink_cbk,
+ src_hashed, src_hashed->fops->unlink,
+ &local->loc);
+ }
+
+ if (dst_cached
+ && (dst_cached != dst_hashed)
+ && (dst_cached != src_cached)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "deleting old dst datafile %s @ %s",
+ local->loc2.path, dst_cached->name);
+
+ STACK_WIND (frame, dht_rename_unlink_cbk,
+ dst_cached, dst_cached->fops->unlink,
+ &local->loc2);
+ }
+ return 0;
+
+unwind:
+ DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+ &local->stbuf);
+
+ return 0;
+}
+
+
+int
+dht_do_rename (call_frame_t *frame)
+{
+ dht_local_t *local = NULL;
+ xlator_t *dst_hashed = NULL;
+ xlator_t *src_cached = NULL;
+ xlator_t *dst_cached = NULL;
+ xlator_t *this = NULL;
+ xlator_t *rename_subvol = NULL;
+
+
+ local = frame->local;
+ this = frame->this;
+
+ dst_hashed = local->dst_hashed;
+ dst_cached = local->dst_cached;
+ src_cached = local->src_cached;
+
+ if (src_cached == dst_cached)
+ rename_subvol = src_cached;
+ else
+ rename_subvol = dst_hashed;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "renaming %s => %s (%s)",
+ local->loc.path, local->loc2.path, rename_subvol->name);
+
+ STACK_WIND (frame, dht_rename_cbk,
+ rename_subvol, rename_subvol->fops->rename,
+ &local->loc, &local->loc2);
+
+ return 0;
+}
+
+
+int
+dht_rename_links_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ inode_t *inode, struct stat *stbuf)
+{
+ dht_local_t *local = NULL;
+ call_frame_t *prev = NULL;
+ int this_call_cnt = 0;
+
+
+ local = frame->local;
+ prev = cookie;
+
+ if (op_ret == -1) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "link/file on %s failed (%s)",
+ prev->this->name, strerror (op_errno));
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ }
+
+ this_call_cnt = dht_frame_return (frame);
+ if (is_last_call (this_call_cnt)) {
+ if (local->op_ret == -1)
+ goto unwind;
+
+ dht_do_rename (frame);
+ }
+
+ return 0;
+
+unwind:
+ DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+ &local->stbuf);
+
+ return 0;
+}
+
+
+int
+dht_rename_create_links (call_frame_t *frame)
+{
+ dht_local_t *local = NULL;
+ xlator_t *this = NULL;
+ xlator_t *src_hashed = NULL;
+ xlator_t *src_cached = NULL;
+ xlator_t *dst_hashed = NULL;
+ xlator_t *dst_cached = NULL;
+ int call_cnt = 0;
+
+
+ local = frame->local;
+ this = frame->this;
+
+ src_hashed = local->src_hashed;
+ src_cached = local->src_cached;
+ dst_hashed = local->dst_hashed;
+ dst_cached = local->dst_cached;
+
+ if (src_cached == dst_cached)
+ goto nolinks;
+
+ if (dst_hashed != src_hashed && dst_hashed != src_cached)
+ call_cnt++;
+
+ if (src_cached != dst_hashed)
+ call_cnt++;
+
+ local->call_cnt = call_cnt;
+
+ if (dst_hashed != src_hashed && dst_hashed != src_cached) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "linkfile %s @ %s => %s",
+ local->loc.path, dst_hashed->name, src_cached->name);
+ dht_linkfile_create (frame, dht_rename_links_cbk,
+ src_cached, dst_hashed, &local->loc);
+ }
+
+ if (src_cached != dst_hashed) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "link %s => %s (%s)", local->loc.path,
+ local->loc2.path, src_cached->name);
+ STACK_WIND (frame, dht_rename_links_cbk,
+ src_cached, src_cached->fops->link,
+ &local->loc, &local->loc2);
+ }
+
+nolinks:
+ if (!call_cnt) {
+ /* skip to next step */
+ dht_do_rename (frame);
+ }
+
+ return 0;
+}
+
+
+int
+dht_rename (call_frame_t *frame, xlator_t *this,
+ loc_t *oldloc, loc_t *newloc)
+{
+ xlator_t *src_cached = NULL;
+ xlator_t *src_hashed = NULL;
+ xlator_t *dst_cached = NULL;
+ xlator_t *dst_hashed = NULL;
+ int op_errno = -1;
+ int ret = -1;
+ dht_local_t *local = NULL;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (oldloc, err);
+ VALIDATE_OR_GOTO (newloc, err);
+
+ src_hashed = dht_subvol_get_hashed (this, oldloc);
+ if (!src_hashed) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no subvolume in layout for path=%s",
+ oldloc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ src_cached = dht_subvol_get_cached (this, oldloc->inode);
+ if (!src_cached) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no cached subvolume for path=%s", oldloc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ dst_hashed = dht_subvol_get_hashed (this, newloc);
+ if (!dst_hashed) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no subvolume in layout for path=%s",
+ newloc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ if (newloc->inode)
+ dst_cached = dht_subvol_get_cached (this, newloc->inode);
+
+ local = dht_local_init (frame);
+ if (!local) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto err;
+ }
+
+ ret = loc_copy (&local->loc, oldloc);
+ if (ret == -1) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto err;
+ }
+
+ ret = loc_copy (&local->loc2, newloc);
+ if (ret == -1) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto err;
+ }
+
+ local->src_hashed = src_hashed;
+ local->src_cached = src_cached;
+ local->dst_hashed = dst_hashed;
+ local->dst_cached = dst_cached;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "renaming %s (hash=%s/cache=%s) => %s (hash=%s/cache=%s)",
+ oldloc->path, src_hashed->name, src_cached->name,
+ newloc->path, dst_hashed->name,
+ dst_cached ? dst_cached->name : "<nul>");
+
+ if (S_ISDIR (oldloc->inode->st_mode)) {
+ dht_rename_dir (frame, this);
+ } else {
+ local->op_ret = 0;
+ dht_rename_create_links (frame);
+ }
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+
+ return 0;
+}
diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c
new file mode 100644
index 00000000000..ee32b2253ed
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-selfheal.c
@@ -0,0 +1,460 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+
+#include "glusterfs.h"
+#include "xlator.h"
+#include "dht-common.h"
+
+
+int
+dht_selfheal_dir_finish (call_frame_t *frame, xlator_t *this, int ret)
+{
+ dht_local_t *local = NULL;
+
+
+ local = frame->local;
+ local->selfheal.dir_cbk (frame, NULL, frame->this, ret,
+ local->op_errno);
+
+ return 0;
+}
+
+
+int
+dht_selfheal_dir_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno)
+{
+ dht_local_t *local = NULL;
+ call_frame_t *prev = NULL;
+ xlator_t *subvol = NULL;
+ int i = 0;
+ dht_layout_t *layout = NULL;
+ int err = 0;
+ int this_call_cnt = 0;
+
+ local = frame->local;
+ layout = local->selfheal.layout;
+ prev = cookie;
+ subvol = prev->this;
+
+ if (op_ret == 0)
+ err = 0;
+ else
+ err = op_errno;
+
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].xlator == subvol) {
+ layout->list[i].err = err;
+ break;
+ }
+ }
+
+ this_call_cnt = dht_frame_return (frame);
+
+ if (is_last_call (this_call_cnt)) {
+ dht_selfheal_dir_finish (frame, this, 0);
+ }
+
+ return 0;
+}
+
+
+int
+dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc,
+ dht_layout_t *layout, int i)
+{
+ xlator_t *subvol = NULL;
+ dict_t *xattr = NULL;
+ int ret = 0;
+ xlator_t *this = NULL;
+ int32_t *disk_layout = NULL;
+
+
+ subvol = layout->list[i].xlator;
+ this = frame->this;
+
+ xattr = get_new_dict ();
+ if (!xattr) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto err;
+ }
+
+ ret = dht_disk_layout_extract (this, layout, i, &disk_layout);
+ if (ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to extract disk layout");
+ goto err;
+ }
+
+ ret = dict_set_bin (xattr, "trusted.glusterfs.dht",
+ disk_layout, 4 * 4);
+ if (ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to set xattr dictionary");
+ goto err;
+ }
+ disk_layout = NULL;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "setting hash range %u - %u (type %d) on subvolume %s for %s",
+ layout->list[i].start, layout->list[i].stop,
+ layout->type, subvol->name, loc->path);
+
+ dict_ref (xattr);
+
+ STACK_WIND (frame, dht_selfheal_dir_xattr_cbk,
+ subvol, subvol->fops->setxattr,
+ loc, xattr, 0);
+
+ dict_unref (xattr);
+
+ return 0;
+
+err:
+ if (xattr)
+ dict_destroy (xattr);
+
+ if (disk_layout)
+ FREE (disk_layout);
+
+ dht_selfheal_dir_xattr_cbk (frame, subvol, frame->this,
+ -1, ENOMEM);
+ return 0;
+}
+
+
+int
+dht_selfheal_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout)
+{
+ dht_local_t *local = NULL;
+ int missing_xattr = 0;
+ int i = 0;
+ int ret = 0;
+ xlator_t *this = NULL;
+
+ local = frame->local;
+ this = frame->this;
+
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].err != -1 || !layout->list[i].stop)
+ continue;
+ /* attr missing and layout present */
+ missing_xattr++;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "%d subvolumes missing xattr for %s",
+ missing_xattr, loc->path);
+
+ if (missing_xattr == 0) {
+ dht_selfheal_dir_finish (frame, this, 0);
+ return 0;
+ }
+
+ local->call_cnt = missing_xattr;
+
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].err != -1 || !layout->list[i].stop)
+ continue;
+
+ ret = dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i);
+
+ if (--missing_xattr == 0)
+ break;
+ }
+ return 0;
+}
+
+
+int
+dht_selfheal_dir_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno,
+ inode_t *inode, struct stat *stbuf)
+{
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
+ call_frame_t *prev = NULL;
+ xlator_t *subvol = NULL;
+ int i = 0;
+ int this_call_cnt = 0;
+
+
+ local = frame->local;
+ layout = local->selfheal.layout;
+ prev = cookie;
+ subvol = prev->this;
+
+ if ((op_ret == 0) || (op_errno == EEXIST)) {
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].xlator == subvol) {
+ layout->list[i].err = -1;
+ break;
+ }
+ }
+ }
+
+ this_call_cnt = dht_frame_return (frame);
+
+ if (is_last_call (this_call_cnt)) {
+ dht_selfheal_dir_xattr (frame, &local->loc, layout);
+ }
+
+ return 0;
+}
+
+
+int
+dht_selfheal_dir_mkdir (call_frame_t *frame, loc_t *loc,
+ dht_layout_t *layout, int force)
+{
+ int missing_dirs = 0;
+ int i = 0;
+ dht_local_t *local = NULL;
+ xlator_t *this = NULL;
+
+
+ local = frame->local;
+ this = frame->this;
+
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].err == ENOENT || force)
+ missing_dirs++;
+ }
+
+ if (missing_dirs == 0) {
+ dht_selfheal_dir_xattr (frame, loc, layout);
+ return 0;
+ }
+
+ local->call_cnt = missing_dirs;
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].err == ENOENT || force) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "creating directory %s on subvol %s",
+ loc->path, layout->list[i].xlator->name);
+
+ STACK_WIND (frame, dht_selfheal_dir_mkdir_cbk,
+ layout->list[i].xlator,
+ layout->list[i].xlator->fops->mkdir,
+ loc, local->stbuf.st_mode);
+ }
+ }
+
+ return 0;
+}
+
+void
+dht_selfheal_fix_this_virgin (call_frame_t *frame, loc_t *loc,
+ dht_layout_t *layout)
+{
+ dht_conf_t *conf = NULL;
+ xlator_t *this = NULL;
+ uint32_t chunk = 0;
+ int i = 0;
+ uint32_t start = 0;
+ int cnt = 0;
+ int err = 0;
+
+ this = frame->this;
+ conf = this->private;
+
+ for (i = 0; i < layout->cnt; i++) {
+ err = layout->list[i].err;
+ if (err == -1) {
+ cnt++;
+ }
+ }
+
+ chunk = ((unsigned long) 0xffffffff) / cnt;
+
+ start = 0;
+ for (i = 0; i < layout->cnt; i++) {
+ err = layout->list[i].err;
+ if (err == -1) {
+ layout->list[i].start = start;
+ layout->list[i].stop = start + chunk - 1;
+
+ start = start + chunk;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "gave fix: %u - %u on %s for %s",
+ layout->list[i].start, layout->list[i].stop,
+ layout->list[i].xlator->name, loc->path);
+ if (--cnt == 0) {
+ layout->list[i].stop = 0xffffffff;
+ break;
+ }
+ }
+ }
+}
+
+
+int
+dht_selfheal_dir_getafix (call_frame_t *frame, loc_t *loc,
+ dht_layout_t *layout)
+{
+ dht_conf_t *conf = NULL;
+ xlator_t *this = NULL;
+ dht_local_t *local = NULL;
+ int missing = -1;
+ int down = -1;
+ int holes = -1;
+ int ret = -1;
+ int i = -1;
+
+ this = frame->this;
+ conf = this->private;
+ local = frame->local;
+
+ missing = local->selfheal.missing;
+ down = local->selfheal.down;
+ holes = local->selfheal.hole_cnt;
+
+ if ((missing + down) == conf->subvolume_cnt) {
+ dht_selfheal_fix_this_virgin (frame, loc, layout);
+ ret = 0;
+ }
+
+ if (holes <= down) {
+ /* the down subvol might fill up the holes */
+ ret = 0;
+ }
+
+ for (i = 0; i < layout->cnt; i++) {
+ /* directory not present */
+ if (layout->list[i].err == ENOENT) {
+ ret = 0;
+ break;
+ }
+ }
+
+ /* TODO: give a fix to these non-virgins */
+
+ return ret;
+}
+
+
+int
+dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk,
+ loc_t *loc, dht_layout_t *layout)
+{
+ dht_local_t *local = NULL;
+ uint32_t holes = 0;
+ uint32_t overlaps = 0;
+ uint32_t missing = 0;
+ uint32_t down = 0;
+ uint32_t misc = 0;
+ int ret = 0;
+ xlator_t *this = NULL;
+
+
+ local = frame->local;
+ this = frame->this;
+
+ ret = dht_layout_anomalies (this, loc, layout,
+ &local->selfheal.hole_cnt,
+ &local->selfheal.overlaps_cnt,
+ &local->selfheal.missing,
+ &local->selfheal.down,
+ &local->selfheal.misc);
+
+ holes = local->selfheal.hole_cnt;
+ overlaps = local->selfheal.overlaps_cnt;
+ missing = local->selfheal.missing;
+ down = local->selfheal.down;
+ misc = local->selfheal.misc;
+
+ local->selfheal.dir_cbk = dir_cbk;
+ local->selfheal.layout = layout;
+
+/*
+ if (down) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%d subvolumes down -- not fixing", down);
+ ret = 0;
+ goto sorry_no_fix;
+ }
+
+ if (overlaps) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "not fixing overlaps in %s", loc->path);
+ local->op_errno = EINVAL;
+ ret = -1;
+ goto sorry_no_fix;
+ }
+
+ if (misc) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%d subvolumes have unrecoverable errors", misc);
+ ret = 0;
+ goto sorry_no_fix;
+ }
+
+ if (holes > missing) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%d holes and %d pigeons -- not fixing",
+ holes, missing);
+ ret = 0;
+ goto sorry_no_fix;
+ }
+*/
+ ret = dht_selfheal_dir_getafix (frame, loc, layout);
+
+ if (ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "the directory is not a virgin");
+ goto sorry_no_fix;
+ }
+
+ dht_selfheal_dir_mkdir (frame, loc, layout, 0);
+
+ return 0;
+
+sorry_no_fix:
+ /* TODO: need to put appropriate local->op_errno */
+ dht_selfheal_dir_finish (frame, this, ret);
+
+ return 0;
+}
+
+
+int
+dht_selfheal_restore (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk,
+ loc_t *loc, dht_layout_t *layout)
+{
+ int ret = 0;
+ dht_local_t *local = NULL;
+
+
+ local = frame->local;
+
+ local->selfheal.dir_cbk = dir_cbk;
+ local->selfheal.layout = layout;
+
+ ret = dht_selfheal_dir_mkdir (frame, loc, layout, 1);
+
+ return 0;
+}
diff --git a/xlators/cluster/dht/src/dht.c b/xlators/cluster/dht/src/dht.c
new file mode 100644
index 00000000000..836e7a4e81f
--- /dev/null
+++ b/xlators/cluster/dht/src/dht.c
@@ -0,0 +1,222 @@
+/*
+ Copyright (c) 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+/* TODO: add NS locking */
+
+#include "dht-common.c"
+
+/* TODO:
+ - use volumename in xattr instead of "dht"
+ - use NS locks
+ - handle all cases in self heal layout reconstruction
+ - complete linkfile selfheal
+*/
+
+
+
+int
+notify (xlator_t *this, int event, void *data, ...)
+{
+ int ret = -1;
+
+ ret = dht_notify (this, event, data);
+
+ return ret;
+}
+
+void
+fini (xlator_t *this)
+{
+ int i = 0;
+ dht_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (conf) {
+ if (conf->file_layouts) {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ FREE (conf->file_layouts[i]);
+ }
+ FREE (conf->file_layouts);
+ }
+
+ if (conf->default_dir_layout)
+ FREE (conf->default_dir_layout);
+
+ if (conf->subvolumes)
+ FREE (conf->subvolumes);
+
+ if (conf->subvolume_status)
+ FREE (conf->subvolume_status);
+
+ FREE (conf);
+ }
+
+ return;
+}
+
+int
+init (xlator_t *this)
+{
+ dht_conf_t *conf = NULL;