summaryrefslogtreecommitdiffstats
path: root/xlators/cluster/afr/src/afr.h
diff options
context:
space:
mode:
Diffstat (limited to 'xlators/cluster/afr/src/afr.h')
-rw-r--r--xlators/cluster/afr/src/afr.h1256
1 files changed, 655 insertions, 601 deletions
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 9593fec3c..36042f7b2 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -1,20 +1,11 @@
/*
- Copyright (c) 2008-2009 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
@@ -30,153 +21,115 @@
#include "compat-errno.h"
#include "afr-mem-types.h"
+#include "libxlator.h"
+#include "timer.h"
+#include "syncop.h"
+
+#include "afr-self-heald.h"
+
#define AFR_XATTR_PREFIX "trusted.afr"
+#define AFR_PATHINFO_HEADER "REPLICATE:"
+#define AFR_SH_READDIR_SIZE_KEY "self-heal-readdir-size"
+#define AFR_SH_DATA_DOMAIN_FMT "%s:self-heal"
+#define AFR_DIRTY_DEFAULT AFR_XATTR_PREFIX ".dirty"
+#define AFR_DIRTY (((afr_private_t *) (THIS->private))->afr_dirty)
-struct _pump_private;
+#define AFR_LOCKEE_COUNT_MAX 3
+#define AFR_DOM_COUNT_MAX 3
+#define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/
-typedef struct _afr_private {
- gf_lock_t lock; /* to guard access to child_count, etc */
- unsigned int child_count; /* total number of children */
+typedef int (*afr_lock_cbk_t) (call_frame_t *frame, xlator_t *this);
+
+typedef int (*afr_read_txn_wind_t) (call_frame_t *frame, xlator_t *this, int subvol);
- unsigned int read_child_rr; /* round-robin index of the read_child */
- gf_lock_t read_child_lock; /* lock to protect above */
+typedef int (*afr_inode_refresh_cbk_t) (call_frame_t *frame, xlator_t *this, int err);
+
+typedef int (*afr_changelog_resume_t) (call_frame_t *frame, xlator_t *this);
+
+#define alloca0(size) ({void *__ptr; __ptr = alloca(size); memset(__ptr, 0, size); __ptr;})
+#define AFR_COUNT(array,max) ({int __i; int __res = 0; for (__i = 0; __i < max; __i++) if (array[__i]) __res++; __res;})
+#define AFR_INTERSECT(dst,src1,src2,max) ({int __i; for (__i = 0; __i < max; __i++) dst[__i] = src1[__i] && src2[__i];})
+
+typedef struct _afr_private {
+ gf_lock_t lock; /* to guard access to child_count, etc */
+ unsigned int child_count; /* total number of children */
- xlator_t **children;
+ xlator_t **children;
- gf_lock_t root_inode_lk;
- int first_lookup;
inode_t *root_inode;
- unsigned char *child_up;
+ unsigned char *child_up;
char **pending_key;
- gf_boolean_t data_self_heal; /* on/off */
+ char *data_self_heal; /* on/off/open */
char * data_self_heal_algorithm; /* name of algorithm */
unsigned int data_self_heal_window_size; /* max number of pipelined
read/writes */
unsigned int background_self_heal_count;
unsigned int background_self_heals_started;
- gf_boolean_t metadata_self_heal; /* on/off */
- gf_boolean_t entry_self_heal; /* on/off */
+ gf_boolean_t metadata_self_heal; /* on/off */
+ gf_boolean_t entry_self_heal; /* on/off */
- gf_boolean_t data_change_log; /* on/off */
- gf_boolean_t metadata_change_log; /* on/off */
- gf_boolean_t entry_change_log; /* on/off */
+ gf_boolean_t data_change_log; /* on/off */
+ gf_boolean_t metadata_change_log; /* on/off */
+ gf_boolean_t entry_change_log; /* on/off */
- int read_child; /* read-subvolume */
- unsigned int favorite_child; /* subvolume to be preferred in resolving
- split-brain cases */
+ gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */
+ int read_child; /* read-subvolume */
+ unsigned int hash_mode; /* for when read_child is not set */
+ int favorite_child; /* subvolume to be preferred in resolving
+ split-brain cases */
- unsigned int data_lock_server_count;
- unsigned int metadata_lock_server_count;
- unsigned int entry_lock_server_count;
+ gf_boolean_t inodelk_trace;
+ gf_boolean_t entrylk_trace;
- gf_boolean_t inodelk_trace;
- gf_boolean_t entrylk_trace;
-
- gf_boolean_t strict_readdir;
-
- unsigned int wait_count; /* # of servers to wait for success */
+ unsigned int wait_count; /* # of servers to wait for success */
uint64_t up_count; /* number of CHILD_UPs we have seen */
uint64_t down_count; /* number of CHILD_DOWNs we have seen */
- struct _pump_private *pump_private; /* Set if we are loaded as pump */
- int use_afr_in_pump;
-
- pthread_mutex_t mutex;
- struct list_head saved_fds; /* list of fds on which locks have succeeded */
-} afr_private_t;
-
-typedef struct {
- /* External interface: These are variables (some optional) that
- are set by whoever has triggered self-heal */
-
- gf_boolean_t need_data_self_heal;
- gf_boolean_t need_metadata_self_heal;
- gf_boolean_t need_entry_self_heal;
-
- gf_boolean_t forced_merge; /* Is this a self-heal triggered to
- forcibly merge the directories? */
-
- gf_boolean_t healing_fd_opened; /* true if caller has already
- opened fd */
-
- gf_boolean_t data_lock_held; /* true if caller has already
- acquired 0-0 lock */
-
- fd_t *healing_fd; /* set if callers has opened fd */
-
- gf_boolean_t background; /* do self-heal in background
- if possible */
-
- ia_type_t type; /* st_mode of the entry we're doing
- self-heal on */
-
- /* Function to call to unwind. If self-heal is being done in the
- background, this function will be called as soon as possible. */
-
- int (*unwind) (call_frame_t *frame, xlator_t *this);
-
- /* End of external interface members */
-
-
- /* array of stat's, one for each child */
- struct iatt *buf;
- struct iatt parentbuf;
-
- /* array of xattr's, one for each child */
- dict_t **xattr;
-
- /* array of errno's, one for each child */
- int *child_errno;
-
- int32_t **pending_matrix;
- int32_t **delta_matrix;
-
- int *sources;
- int source;
- int active_source;
- int active_sinks;
- int *success;
- unsigned char *locked_nodes;
- int lock_count;
-
- mode_t impunging_entry_mode;
- const char *linkname;
-
- int op_failed;
-
- int file_has_holes;
- blksize_t block_size;
- off_t file_size;
- off_t offset;
-
- loc_t parent_loc;
-
- call_frame_t *orig_frame;
- gf_boolean_t unwound;
-
- /* private data for the particular self-heal algorithm */
- void *private;
+ gf_boolean_t optimistic_change_log;
+ gf_boolean_t eager_lock;
+ gf_boolean_t pre_op_compat; /* on/off */
+ uint32_t post_op_delay_secs;
+ unsigned int quorum_count;
+
+ char vol_uuid[UUID_SIZE + 1];
+ int32_t *last_event;
+
+ /* @event_generation: Keeps count of number of events received which can
+ potentially impact consistency decisions. The events are CHILD_UP
+ and CHILD_DOWN, when we have to recalculate the freshness/staleness
+ of copies to detect if changes had happened while the other server
+ was down. CHILD_DOWN and CHILD_UP can also be received on network
+ disconnect/reconnects and not necessarily server going down/up.
+ Recalculating freshness/staleness on network events is equally
+ important as we might have had a network split brain.
+ */
+ uint32_t event_generation;
- int (*flush_self_heal_cbk) (call_frame_t *frame, xlator_t *this);
+ gf_boolean_t choose_local;
+ gf_boolean_t did_discovery;
+ uint64_t sh_readdir_size;
+ gf_boolean_t ensure_durability;
+ char *sh_domain;
+ char *afr_dirty;
- int (*completion_cbk) (call_frame_t *frame, xlator_t *this);
- int (*algo_completion_cbk) (call_frame_t *frame, xlator_t *this);
- int (*algo_abort_cbk) (call_frame_t *frame, xlator_t *this);
+ afr_self_heald_t shd;
- call_frame_t *sh_frame;
-} afr_self_heal_t;
+ /* pump dependencies */
+ void *pump_private;
+ gf_boolean_t use_afr_in_pump;
+} afr_private_t;
typedef enum {
- AFR_DATA_TRANSACTION, /* truncate, write, ... */
- AFR_METADATA_TRANSACTION, /* chmod, chown, ... */
- AFR_ENTRY_TRANSACTION, /* create, rmdir, ... */
- AFR_ENTRY_RENAME_TRANSACTION, /* rename */
+ AFR_DATA_TRANSACTION, /* truncate, write, ... */
+ AFR_METADATA_TRANSACTION, /* chmod, chown, ... */
+ AFR_ENTRY_TRANSACTION, /* create, rmdir, ... */
+ AFR_ENTRY_RENAME_TRANSACTION, /* rename */
} afr_transaction_type;
typedef enum {
@@ -232,11 +185,31 @@ afr_index_for_transaction_type (afr_transaction_type type)
return -1; /* make gcc happy */
}
+typedef struct {
+ loc_t loc;
+ char *basename;
+ unsigned char *locked_nodes;
+ int locked_count;
+
+} afr_entry_lockee_t;
+
+int
+afr_entry_lockee_cmp (const void *l1, const void *l2);
+
+typedef struct {
+ char *domain; /* Domain on which inodelk is taken */
+ struct gf_flock flock;
+ unsigned char *locked_nodes;
+ int32_t lock_count;
+} afr_inodelk_t;
typedef struct {
loc_t *lk_loc;
- struct gf_flock lk_flock;
+ int lockee_count;
+ afr_entry_lockee_t lockee[AFR_LOCKEE_COUNT_MAX];
+
+ afr_inodelk_t inodelk[AFR_DOM_COUNT_MAX];
const char *lk_basename;
const char *lower_basename;
const char *higher_basename;
@@ -245,451 +218,564 @@ typedef struct {
unsigned char *locked_nodes;
unsigned char *lower_locked_nodes;
- unsigned char *inode_locked_nodes;
- unsigned char *entry_locked_nodes;
selfheal_lk_type_t selfheal_lk_type;
transaction_lk_type_t transaction_lk_type;
int32_t lock_count;
- int32_t inodelk_lock_count;
int32_t entrylk_lock_count;
uint64_t lock_number;
int32_t lk_call_count;
+ int32_t lk_expected_count;
+ int32_t lk_attempted_count;
int32_t lock_op_ret;
int32_t lock_op_errno;
+ afr_lock_cbk_t lock_cbk;
+ char *domain; /* Domain on which inode/entry lock/unlock in progress.*/
+} afr_internal_lock_t;
- int (*lock_cbk) (call_frame_t *, xlator_t *);
+struct afr_reply {
+ int valid;
+ int32_t op_ret;
+ int32_t op_errno;
+ dict_t *xdata;
+ struct iatt poststat;
+ struct iatt postparent;
+ struct iatt prestat;
+ struct iatt preparent;
+ struct iatt preparent2;
+ struct iatt postparent2;
+ uint8_t checksum[MD5_DIGEST_LENGTH];
+};
-} afr_internal_lock_t;
+typedef enum {
+ AFR_FD_NOT_OPENED,
+ AFR_FD_OPENED,
+ AFR_FD_OPENING
+} afr_fd_open_status_t;
+
+typedef struct {
+ unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS];
+ int inherited[AFR_NUM_CHANGE_LOGS];
+ int on_disk[AFR_NUM_CHANGE_LOGS];
+ afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */
+
+ unsigned int *lock_piggyback;
+ unsigned int *lock_acquired;
+
+ int flags;
+
+ /* used for delayed-post-op optimization */
+ pthread_mutex_t delay_lock;
+ gf_timer_t *delay_timer;
+ call_frame_t *delay_frame;
+
+ /* set if any write on this fd was a non stable write
+ (i.e, without O_SYNC or O_DSYNC)
+ */
+ gf_boolean_t witnessed_unstable_write;
+
+ /* @open_fd_count:
+ Number of open FDs queried from the server, as queried through
+ xdata in FOPs. Currently, used to decide if eager-locking must be
+ temporarily disabled.
+ */
+ uint32_t open_fd_count;
+
+
+ /* list of frames currently in progress */
+ struct list_head eager_locked;
+} afr_fd_ctx_t;
-typedef struct _afr_locked_fd {
- fd_t *fd;
- struct list_head list;
-} afr_locked_fd_t;
typedef struct _afr_local {
- unsigned int call_count;
- unsigned int success_count;
- unsigned int enoent_count;
+ glusterfs_fop_t op;
+ unsigned int call_count;
+ /* @event_generation: copy of priv->event_generation taken at the
+ time of starting the transaction. The copy is made so that we
+ have a stable value through the various phases of the transaction.
+ */
+ unsigned int event_generation;
- unsigned int govinda_gOvinda;
+ uint32_t open_fd_count;
+ gf_boolean_t update_open_fd_count;
- unsigned int read_child_index;
- unsigned char read_child_returned;
- unsigned int first_up_child;
+ gf_lkowner_t saved_lk_owner;
- pid_t saved_pid;
+ int32_t op_ret;
+ int32_t op_errno;
- int32_t op_ret;
- int32_t op_errno;
+ int32_t **pending;
- int32_t **pending;
+ int dirty[AFR_NUM_CHANGE_LOGS];
- loc_t loc;
- loc_t newloc;
+ loc_t loc;
+ loc_t newloc;
- fd_t *fd;
+ fd_t *fd;
+ afr_fd_ctx_t *fd_ctx;
- glusterfs_fop_t fop;
+ /* @child_up: copy of priv->child_up taken at the time of transaction
+ start. The copy is taken so that we have a stable child_up array
+ through the phases of the transaction as priv->child_up[i] can keep
+ changing through time.
+ */
+ unsigned char *child_up;
+
+ /* @read_attempted:
+ array of flags representing subvolumes where read operations of
+ the read transaction have already been attempted. The array is
+ first pre-filled with down subvolumes, and as reads are performed
+ on other subvolumes, those are set as well. This way if the read
+ operation fails we do not retry on that subvolume again.
+ */
+ unsigned char *read_attempted;
- unsigned char *child_up;
+ /* @readfn:
- int32_t *child_errno;
+ pointer to function which will perform the read operation on a given
+ subvolume. Used in read transactions.
+ */
- dict_t *xattr_req;
+ afr_read_txn_wind_t readfn;
- int32_t inodelk_count;
- int32_t entrylk_count;
+ /* @refreshed:
- afr_internal_lock_t internal_lock;
+ the inode was "refreshed" (i.e, pending xattrs from all subvols
+ freshly inspected and inode ctx updated accordingly) as part of
+ this transaction already.
+ */
+ gf_boolean_t refreshed;
- afr_locked_fd_t *locked_fd;
- int32_t source_child;
- int32_t lock_recovery_child;
+ /* @inode:
- dict_t *dict;
+ the inode on which the read txn is performed on. ref'ed and copied
+ from either fd->inode or loc.inode
+ */
- int (*openfd_flush_cbk) (call_frame_t *frame, xlator_t *this);
+ inode_t *inode;
- /*
- This struct contains the arguments for the "continuation"
- (scheme-like) of fops
+ /* @parent[2]:
+
+ parent inode[s] on which directory transactions are performed.
*/
- int op;
- struct {
- struct {
- unsigned char buf_set;
- struct statvfs buf;
- } statfs;
+ inode_t *parent;
+ inode_t *parent2;
- struct {
- inode_t *inode;
- struct iatt buf;
- struct iatt read_child_buf;
- struct iatt postparent;
- ino_t ino;
- uint64_t gen;
- ino_t parent_ino;
- dict_t *xattr;
- dict_t **xattrs;
- gf_boolean_t is_revalidate;
- } lookup;
+ /* @readable:
- struct {
- int32_t flags;
- int32_t wbflags;
- } open;
+ array of flags representing servers from which a read can be
+ performed. This is the output of afr_inode_refresh()
+ */
+ unsigned char *readable;
- struct {
- int32_t cmd;
- struct gf_flock user_flock;
- struct gf_flock ret_flock;
- unsigned char *locked_nodes;
- } lk;
+ afr_inode_refresh_cbk_t refreshfn;
- /* inode read */
+ /* @refreshinode:
- struct {
- int32_t mask;
- int last_tried; /* index of the child we tried previously */
- } access;
+ Inode currently getting refreshed.
+ */
+ inode_t *refreshinode;
- struct {
- int last_tried;
- ino_t ino;
- } stat;
+ /*
+ @pre_op_compat:
- struct {
- int last_tried;
- ino_t ino;
- } fstat;
+ compatibility mode of pre-op. send a separate pre-op and
+ op operations as part of transaction, rather than combining
+ */
- struct {
- size_t size;
- int last_tried;
- ino_t ino;
- } readlink;
+ gf_boolean_t pre_op_compat;
- struct {
- char *name;
- int last_tried;
- } getxattr;
+ dict_t *xattr_req;
- struct {
- ino_t ino;
- size_t size;
- off_t offset;
- int last_tried;
- } readv;
+ afr_internal_lock_t internal_lock;
- /* dir read */
+ dict_t *dict;
- struct {
- int success_count;
- int32_t op_ret;
- int32_t op_errno;
+ int optimistic_change_log;
+ gf_boolean_t delayed_post_op;
- uint32_t *checksum;
- } opendir;
+ /* Is the current writev() going to perform a stable write?
+ i.e, is fd->flags or @flags writev param have O_SYNC or
+ O_DSYNC?
+ */
+ gf_boolean_t stable_write;
- struct {
- int32_t op_ret;
- int32_t op_errno;
- size_t size;
- off_t offset;
+ /* This write appended to the file. Nnot necessarily O_APPEND,
+ just means the offset of write was at the end of file.
+ */
+ gf_boolean_t append_write;
- gf_boolean_t failed;
- int last_tried;
- } readdir;
+ /*
+ This struct contains the arguments for the "continuation"
+ (scheme-like) of fops
+ */
- struct {
- int32_t op_ret;
- int32_t op_errno;
+ struct {
+ struct {
+ unsigned char buf_set;
+ struct statvfs buf;
+ } statfs;
- size_t size;
- off_t offset;
- int32_t flag;
+ struct {
+ int32_t flags;
+ } open;
- int last_tried;
- } getdents;
+ struct {
+ int32_t cmd;
+ struct gf_flock user_flock;
+ struct gf_flock ret_flock;
+ unsigned char *locked_nodes;
+ } lk;
- /* inode write */
+ /* inode read */
- struct {
- ino_t ino;
- struct iatt prebuf;
- struct iatt postbuf;
+ struct {
+ int32_t mask;
+ int last_index; /* index of the child we tried previously */
+ } access;
- int32_t op_ret;
+ struct {
+ int last_index;
+ } stat;
- struct iovec *vector;
- struct iobref *iobref;
- int32_t count;
- off_t offset;
- } writev;
+ struct {
+ int last_index;
+ } fstat;
+
+ struct {
+ size_t size;
+ int last_index;
+ } readlink;
+
+ struct {
+ char *name;
+ int last_index;
+ long xattr_len;
+ } getxattr;
+
+ struct {
+ size_t size;
+ off_t offset;
+ int last_index;
+ uint32_t flags;
+ } readv;
+
+ /* dir read */
+
+ struct {
+ int success_count;
+ int32_t op_ret;
+ int32_t op_errno;
+
+ uint32_t *checksum;
+ } opendir;
+
+ struct {
+ int32_t op_ret;
+ int32_t op_errno;
+ size_t size;
+ off_t offset;
+ dict_t *dict;
+ gf_boolean_t failed;
+ int last_index;
+ } readdir;
+ /* inode write */
struct {
- ino_t ino;
struct iatt prebuf;
struct iatt postbuf;
- } fsync;
+ } inode_wfop; //common structure for all inode-write-fops
- struct {
- ino_t ino;
- off_t offset;
- struct iatt prebuf;
- struct iatt postbuf;
- } truncate;
+ struct {
+ int32_t op_ret;
- struct {
- ino_t ino;
- off_t offset;
- struct iatt prebuf;
- struct iatt postbuf;
- } ftruncate;
+ struct iovec *vector;
+ struct iobref *iobref;
+ int32_t count;
+ off_t offset;
+ uint32_t flags;
+ } writev;
- struct {
- ino_t ino;
- struct iatt in_buf;
+ struct {
+ off_t offset;
+ } truncate;
+
+ struct {
+ off_t offset;
+ } ftruncate;
+
+ struct {
+ struct iatt in_buf;
int32_t valid;
- struct iatt preop_buf;
- struct iatt postop_buf;
- } setattr;
+ } setattr;
- struct {
- ino_t ino;
- struct iatt in_buf;
+ struct {
+ struct iatt in_buf;
int32_t valid;
- struct iatt preop_buf;
- struct iatt postop_buf;
- } fsetattr;
+ } fsetattr;
- struct {
- dict_t *dict;
- int32_t flags;
- } setxattr;
+ struct {
+ dict_t *dict;
+ int32_t flags;
+ } setxattr;
- struct {
- char *name;
- } removexattr;
+ struct {
+ dict_t *dict;
+ int32_t flags;
+ } fsetxattr;
- /* dir write */
+ struct {
+ char *name;
+ } removexattr;
- struct {
- ino_t ino;
- uint64_t gen;
- ino_t parent_ino;
- fd_t *fd;
- dict_t *params;
- int32_t flags;
- mode_t mode;
- inode_t *inode;
- struct iatt buf;
+ struct {
+ dict_t *xattr;
+ } xattrop;
+
+ struct {
+ dict_t *xattr;
+ } fxattrop;
+
+ /* dir write */
+
+ struct {
+ inode_t *inode;
+ struct iatt buf;
struct iatt preparent;
struct iatt postparent;
- struct iatt read_child_buf;
- } create;
+ struct iatt prenewparent;
+ struct iatt postnewparent;
+ } dir_fop; //common structure for all dir fops
- struct {
- ino_t ino;
- uint64_t gen;
- ino_t parent_ino;
- dev_t dev;
- mode_t mode;
+ struct {
+ fd_t *fd;
dict_t *params;
- inode_t *inode;
- struct iatt buf;
- struct iatt preparent;
- struct iatt postparent;
- struct iatt read_child_buf;
- } mknod;
+ int32_t flags;
+ mode_t mode;
+ } create;
- struct {
- ino_t ino;
- uint64_t gen;
- ino_t parent_ino;
- int32_t mode;
+ struct {
+ dev_t dev;
+ mode_t mode;
dict_t *params;
- inode_t *inode;
- struct iatt buf;
- struct iatt read_child_buf;
- struct iatt preparent;
- struct iatt postparent;
- } mkdir;
+ } mknod;
- struct {
- ino_t parent_ino;
- int32_t op_ret;
- int32_t op_errno;
- struct iatt preparent;
- struct iatt postparent;
- } unlink;
+ struct {
+ int32_t mode;
+ dict_t *params;
+ } mkdir;
- struct {
- int flags;
- ino_t parent_ino;
- int32_t op_ret;
- int32_t op_errno;
- struct iatt preparent;
- struct iatt postparent;
- } rmdir;
+ struct {
+ int flags;
+ } rmdir;
- struct {
- ino_t oldparent_ino;
- ino_t newparent_ino;
- ino_t ino;
- struct iatt buf;
- struct iatt read_child_buf;
- struct iatt preoldparent;
- struct iatt prenewparent;
- struct iatt postoldparent;
- struct iatt postnewparent;
- } rename;
+ struct {
+ dict_t *params;
+ char *linkpath;
+ } symlink;
struct {
- ino_t ino;
- uint64_t gen;
- ino_t parent_ino;
- inode_t *inode;
- struct iatt buf;
- struct iatt read_child_buf;
- struct iatt preparent;
- struct iatt postparent;
- } link;
+ int32_t mode;
+ off_t offset;
+ size_t len;
+ } fallocate;
struct {
- ino_t ino;
- uint64_t gen;
- ino_t parent_ino;
- inode_t *inode;
- dict_t *params;
- struct iatt buf;
- struct iatt read_child_buf;
- char *linkpath;
- struct iatt preparent;
- struct iatt postparent;
- } symlink;
+ off_t offset;
+ size_t len;
+ } discard;
- struct {
- int32_t flags;
- dir_entry_t *entries;
- int32_t count;
- } setdents;
- } cont;
+ struct {
+ off_t offset;
+ off_t len;
+ struct iatt prebuf;
+ struct iatt postbuf;
+ } zerofill;
- struct {
- off_t start, len;
- char *basename;
- char *new_basename;
+ } cont;
- loc_t parent_loc;
- loc_t new_parent_loc;
+ struct {
+ off_t start, len;
- afr_transaction_type type;
+ gf_boolean_t eager_lock_on;
+ int *eager_lock;
- int success_count;
- int erase_pending;
- int failure_count;
+ char *basename;
+ char *new_basename;
- int last_tried;
- int32_t *child_errno;
+ loc_t parent_loc;
+ loc_t new_parent_loc;
- call_frame_t *main_frame;
+ afr_transaction_type type;
- int (*fop) (call_frame_t *frame, xlator_t *this);
+ /* stub to resume on destruction
+ of the transaction frame */
+ call_stub_t *resume_stub;
- int (*done) (call_frame_t *frame, xlator_t *this);
+ struct list_head eager_locked;
- int (*resume) (call_frame_t *frame, xlator_t *this);
+ unsigned char *pre_op;
- int (*unwind) (call_frame_t *frame, xlator_t *this);
+ /* @fop_subvols: subvolumes on which FOP will be attempted */
+ unsigned char *fop_subvols;
- /* post-op hook */
- } transaction;
+ /* @failed_subvols: subvolumes on which FOP failed. Always
+ a subset of @fop_subvols */
+ unsigned char *failed_subvols;
- afr_self_heal_t self_heal;
-} afr_local_t;
+ /* @dirtied: flag which indicates whether we set dirty flag
+ in the OP. Typically true when we are performing operation
+ on more than one subvol and optimistic changelog is disabled
+ A 'true' value set in @dirtied flag means an 'undirtying'
+ has to be done in POST-OP phase.
+ */
+ gf_boolean_t dirtied;
-typedef struct {
- unsigned int *pre_op_done;
- unsigned int *opened_on; /* which subvolumes the fd is open on */
- unsigned int *pre_op_piggyback;
+ /* @inherited: flag which indicates that the dirty flags
+ of the previous transaction were inherited
+ */
+ gf_boolean_t inherited;
- int flags;
- int32_t wbflags;
- uint64_t up_count; /* number of CHILD_UPs this fd has seen */
- uint64_t down_count; /* number of CHILD_DOWNs this fd has seen */
+ /*
+ @no_uninherit: flag which indicates that a pre_op_uninherit()
+ must _not_ be attempted (and returned as failure) always. This
+ flag is set when a hard pre-op is performed, but not accounted
+ for it in fd_ctx->on_disk[]. Such transactions are "isolated"
+ from the pre-op piggybacking entirely and therefore uninherit
+ must not be attempted.
+ */
+ gf_boolean_t no_uninherit;
- int32_t last_tried;
+ /* @uninherit_done:
+ @uninherit_value:
- int hit, miss;
- gf_boolean_t failed_over;
- struct list_head entries; /* needed for readdir failover */
+ The above pair variables make pre_op_uninherit() idempotent.
+ Both are FALSE initially. The first call to pre_op_uninherit
+ sets @uninherit_done to TRUE and the return value to
+ @uninherit_value. Further calls will check for @uninherit_done
+ to be TRUE and if so will simply return @uninherit_value.
+ */
+ gf_boolean_t uninherit_done;
+ gf_boolean_t uninherit_value;
- unsigned char *locked_on; /* which subvolumes locks have been successful */
-} afr_fd_ctx_t;
+ /* @changelog_resume: function to be called after changlogging
+ (either pre-op or post-op) is done
+ */
+
+ afr_changelog_resume_t changelog_resume;
+
+ call_frame_t *main_frame;
+
+ int (*wind) (call_frame_t *frame, xlator_t *this, int subvol);
+
+ int (*fop) (call_frame_t *frame, xlator_t *this);
+ int (*done) (call_frame_t *frame, xlator_t *this);
+
+ int (*resume) (call_frame_t *frame, xlator_t *this);
+
+ int (*unwind) (call_frame_t *frame, xlator_t *this);
+
+ /* post-op hook */
+ } transaction;
-/* try alloc and if it fails, goto label */
-#define ALLOC_OR_GOTO(var, type, label) do { \
- var = GF_CALLOC (sizeof (type), 1, \
- gf_afr_mt_##type); \
- if (!var) { \
- gf_log (this->name, GF_LOG_ERROR, \
- "out of memory :("); \
- op_errno = ENOMEM; \
- goto label; \
- } \
- } while (0);
+ syncbarrier_t barrier;
+
+ struct marker_str marker;
+
+ /* extra data for fops */
+ dict_t *xdata_req;
+ dict_t *xdata_rsp;
+
+ mode_t umask;
+ int xflag;
+ gf_boolean_t do_discovery;
+ struct afr_reply *replies;
+} afr_local_t;
/* did a call fail due to a child failing? */
-#define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \
- ((op_errno == ENOTCONN) || \
- (op_errno == EBADFD)))
+#define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \
+ ((op_errno == ENOTCONN) || \
+ (op_errno == EBADFD)))
+
+int
+afr_inode_read_subvol_get (inode_t *inode, xlator_t *this,
+ unsigned char *data_subvols,
+ unsigned char *metadata_subvols,
+ int *event_generation);
+int
+__afr_inode_read_subvol_get (inode_t *inode, xlator_t *this,
+ unsigned char *data_subvols,
+ unsigned char *metadata_subvols,
+ int *event_generation);
+
+int
+__afr_inode_read_subvol_set (inode_t *inode, xlator_t *this,
+ unsigned char *data_subvols,
+ unsigned char *metadata_subvol,
+ int event_generation);
+int
+afr_inode_read_subvol_set (inode_t *inode, xlator_t *this,
+ unsigned char *data_subvols,
+ unsigned char *metadata_subvols,
+ int event_generation);
-#define afr_fop_failed(op_ret, op_errno) ((op_ret) == -1)
+int
+afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this);
-/* have we tried all children? */
-#define all_tried(i, count) ((i) == (count) - 1)
+int
+afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this,
+ unsigned char *readable);
-int32_t
-afr_set_dict_gfid (dict_t *dict, uuid_t gfid);
+int
+afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this,
+ unsigned char *readable, int *event_p,
+ int type);
+int
+afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p,
+ int *event_p, afr_transaction_type type);
+
+#define afr_data_subvol_get(i, t, s, e) \
+ afr_read_subvol_get(i, t, s, e, AFR_DATA_TRANSACTION)
+
+#define afr_metadata_subvol_get(i, t, s, e) \
+ afr_read_subvol_get(i, t, s, e, AFR_METADATA_TRANSACTION)
int
-pump_command_reply (call_frame_t *frame, xlator_t *this);
+afr_inode_refresh (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ afr_inode_refresh_cbk_t cbk);
int32_t
-afr_notify (xlator_t *this, int32_t event,
- void *data, ...);
+afr_notify (xlator_t *this, int32_t event, void *data, void *data2);
int
-afr_attempt_lock_recovery (xlator_t *this, int32_t child_index);
+afr_init_entry_lockee (afr_entry_lockee_t *lockee, afr_local_t *local,
+ loc_t *loc, char *basename, int child_count);
+
+void
+afr_entry_lockee_cleanup (afr_internal_lock_t *int_lock);
int
-afr_save_locked_fd (xlator_t *this, fd_t *fd);
+afr_attempt_lock_recovery (xlator_t *this, int32_t child_index);
int
afr_mark_locked_nodes (xlator_t *this, fd_t *fd,
unsigned char *locked_nodes);
void
-afr_set_lk_owner (call_frame_t *frame, xlator_t *this);
+afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner);
int
afr_set_lock_number (call_frame_t *frame, xlator_t *this);
-
-loc_t *
-lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2);
-
int32_t
afr_unlock (call_frame_t *frame, xlator_t *this);
@@ -705,32 +791,30 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this);
int
afr_internal_lock_finish (call_frame_t *frame, xlator_t *this);
+int
+afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom,
+ unsigned int child_count);
-int pump_start (call_frame_t *frame, xlator_t *this);
+int
+__afr_fd_ctx_set (xlator_t *this, fd_t *fd);
int
afr_fd_ctx_set (xlator_t *this, fd_t *fd);
-uint64_t
-afr_read_child (xlator_t *this, inode_t *inode);
-
-void
-afr_set_read_child (xlator_t *this, inode_t *inode, int32_t read_child);
-
-void
-afr_build_parent_loc (loc_t *parent, loc_t *child);
+afr_fd_ctx_t *
+afr_fd_ctx_get (fd_t *fd, xlator_t *this);
int
-afr_up_children_count (int child_count, unsigned char *child_up);
+afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno);
int
afr_locked_nodes_count (unsigned char *locked_nodes, int child_count);
-ino64_t
-afr_itransform (ino64_t ino, int child_count, int child_index);
-
int
-afr_deitransform (ino64_t ino, int child_count);
+afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode);
+
+void
+afr_replies_wipe (afr_local_t *local, afr_private_t *priv);
void
afr_local_cleanup (afr_local_t *local, xlator_t *this);
@@ -738,21 +822,9 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this);
int
afr_frame_return (call_frame_t *frame);
-uint64_t
-afr_is_split_brain (xlator_t *this, inode_t *inode);
-
-void
-afr_set_split_brain (xlator_t *this, inode_t *inode, gf_boolean_t set);
-
int
afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
- fd_t *fd, int32_t wbflags);
-
-void
-afr_set_opendir_done (xlator_t *this, inode_t *inode);
-
-uint64_t
-afr_is_opendir_done (xlator_t *this, inode_t *inode);
+ fd_t *fd, dict_t *xdata);
void
afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this);
@@ -760,163 +832,145 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this);
int
afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd);
-int
-afr_openfd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd);
-
-#define AFR_STACK_UNWIND(fop, frame, params ...) \
- do { \
- afr_local_t *__local = NULL; \
- xlator_t *__this = NULL; \
- if (frame) { \
- __local = frame->local; \
- __this = frame->this; \
- frame->local = NULL; \
+#define AFR_STACK_UNWIND(fop, frame, params ...) \
+ do { \
+ afr_local_t *__local = NULL; \
+ xlator_t *__this = NULL; \
+ if (frame) { \
+ __local = frame->local; \
+ __this = frame->this; \
+ frame->local = NULL; \
+ } \
+ STACK_UNWIND_STRICT (fop, frame, params); \
+ if (__local) { \
+ afr_local_cleanup (__local, __this); \
+ mem_put (__local); \
+ } \
+ } while (0)
+
+#define AFR_STACK_DESTROY(frame) \
+ do { \
+ afr_local_t *__local = NULL; \
+ xlator_t *__this = NULL; \
+ __local = frame->local; \
+ __this = frame->this; \
+ frame->local = NULL; \
+ STACK_DESTROY (frame->root); \
+ if (__local) { \
+ afr_local_cleanup (__local, __this); \
+ mem_put (__local); \
} \
- STACK_UNWIND_STRICT (fop, frame, params); \
- afr_local_cleanup (__local, __this); \
- GF_FREE (__local); \
} while (0);
-#define AFR_STACK_DESTROY(frame) \
- do { \
- afr_local_t *__local = NULL; \
- xlator_t *__this = NULL; \
- __local = frame->local; \
- __this = frame->this; \
- frame->local = NULL; \
- STACK_DESTROY (frame->root); \
- afr_local_cleanup (__local, __this); \
- GF_FREE (__local); \
- } while (0);
+#define AFR_FRAME_INIT(frame, op_errno) \
+ ({frame->local = mem_get0 (THIS->local_pool); \
+ if (afr_local_init (frame->local, THIS->private, &op_errno)) { \
+ afr_local_cleanup (frame->local, THIS); \
+ mem_put (frame->local); \
+ frame->local = NULL; }; \
+ frame->local;})
+
+#define AFR_STACK_RESET(frame) do { int opr; STACK_RESET (frame->root); AFR_FRAME_INIT(frame, opr);} while (0)
/* allocate and return a string that is the basename of argument */
static inline char *
AFR_BASENAME (const char *str)
{
- char *__tmp_str = NULL;
- char *__basename_str = NULL;
- __tmp_str = gf_strdup (str);
- __basename_str = gf_strdup (basename (__tmp_str));
- GF_FREE (__tmp_str);
- return __basename_str;
+ char *__tmp_str = NULL;
+ char *__basename_str = NULL;
+ __tmp_str = gf_strdup (str);
+ __basename_str = gf_strdup (basename (__tmp_str));
+ GF_FREE (__tmp_str);
+ return __basename_str;
}
-/* initialize local_t */
-static inline int
-AFR_LOCAL_INIT (afr_local_t *local, afr_private_t *priv)
-{
- local->child_up = GF_CALLOC (sizeof (*local->child_up),
- priv->child_count,
- gf_afr_mt_char);
- if (!local->child_up) {
- return -ENOMEM;
- }
+call_frame_t *
+afr_copy_frame (call_frame_t *base);
- memcpy (local->child_up, priv->child_up,
- sizeof (*local->child_up) * priv->child_count);
+int
+afr_transaction_local_init (afr_local_t *local, xlator_t *this);
+int32_t
+afr_marker_getxattr (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, const char *name,afr_local_t *local, afr_private_t *priv );
- local->call_count = afr_up_children_count (priv->child_count, local->child_up);
- if (local->call_count == 0)
- return -ENOTCONN;
+int
+afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno);
- local->transaction.erase_pending = 1;
+int
+afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count,
+ transaction_lk_type_t lk_type);
- local->op_ret = -1;
- local->op_errno = EUCLEAN;
+int
+afr_higher_errno (int32_t old_errno, int32_t new_errno);
- local->internal_lock.lock_op_ret = -1;
- local->internal_lock.lock_op_errno = EUCLEAN;
+int
+afr_final_errno (afr_local_t *local, afr_private_t *priv);
+int
+afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req);
- return 0;
-}
+void
+afr_fix_open (fd_t *fd, xlator_t *this);
+afr_fd_ctx_t *
+afr_fd_ctx_get (fd_t *fd, xlator_t *this);
-/**
- * first_up_child - return the index of the first child that is up
- */
+void
+afr_set_low_priority (call_frame_t *frame);
+int
+afr_child_fd_ctx_set (xlator_t *this, fd_t *fd, int32_t child,
+ int flags);
-static inline int
-afr_first_up_child (afr_private_t *priv)
-{
- xlator_t ** children = NULL;
- int ret = -1;
- int i = 0;
-
- LOCK (&priv->lock);
- {
- children = priv->children;
- for (i = 0; i < priv->child_count; i++) {
- if (priv->child_up[i]) {
- ret = i;
- break;
- }
- }
- }
- UNLOCK (&priv->lock);
-
- return ret;
-}
+gf_boolean_t
+afr_have_quorum (char *logname, afr_private_t *priv);
+void
+afr_matrix_cleanup (int32_t **pending, unsigned int m);
-static inline int
-afr_transaction_local_init (afr_local_t *local, afr_private_t *priv)
-{
- int i;
-
- local->first_up_child = afr_first_up_child (priv);
-
- local->child_errno = GF_CALLOC (sizeof (*local->child_errno),
- priv->child_count,
- gf_afr_mt_int32_t);
- if (!local->child_errno) {
- return -ENOMEM;
- }
-
- local->pending = GF_CALLOC (sizeof (*local->pending),
- priv->child_count,
- gf_afr_mt_int32_t);
-
- if (!local->pending) {
- return -ENOMEM;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- local->pending[i] = GF_CALLOC (sizeof (*local->pending[i]),
- 3, /* data + metadata + entry */
- gf_afr_mt_int32_t);
- if (!local->pending[i])
- return -ENOMEM;
- }
+int32_t**
+afr_matrix_create (unsigned int m, unsigned int n);
- local->internal_lock.inode_locked_nodes =
- GF_CALLOC (sizeof (*local->internal_lock.inode_locked_nodes),
- priv->child_count,
- gf_afr_mt_char);
+void
+afr_filter_xattrs (dict_t *xattr);
- local->internal_lock.entry_locked_nodes =
- GF_CALLOC (sizeof (*local->internal_lock.entry_locked_nodes),
- priv->child_count,
- gf_afr_mt_char);
+/*
+ * Special value indicating we should use the "auto" quorum method instead of
+ * a fixed value (including zero to turn off quorum enforcement).
+ */
+#define AFR_QUORUM_AUTO INT_MAX
- local->internal_lock.locked_nodes =
- GF_CALLOC (sizeof (*local->internal_lock.locked_nodes),
- priv->child_count,
- gf_afr_mt_char);
+/*
+ * Having this as a macro will make debugging a bit weirder, but does reduce
+ * the probability of functions handling this check inconsistently.
+ */
+#define QUORUM_CHECK(_func,_label) do { \
+ if (priv->quorum_count && !afr_have_quorum(this->name,priv)) { \
+ gf_log(this->name,GF_LOG_WARNING, \
+ "failing "#_func" due to lack of quorum"); \
+ op_errno = EROFS; \
+ goto _label; \
+ } \
+} while (0);
- local->internal_lock.lower_locked_nodes
- = GF_CALLOC (sizeof (*local->internal_lock.lower_locked_nodes),
- priv->child_count,
- gf_afr_mt_char);
+int
+afr_fd_report_unstable_write (xlator_t *this, fd_t *fd);
- local->transaction.child_errno = GF_CALLOC (sizeof (*local->transaction.child_errno),
- priv->child_count,
- gf_afr_mt_int32_t);
+gf_boolean_t
+afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd);
- local->internal_lock.transaction_lk_type = AFR_TRANSACTION_LK;
+void
+afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub);
- return 0;
-}
+int
+afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count);
+
+void
+afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this);
+
+int
+afr_local_pathinfo (char *pathinfo, gf_boolean_t *is_local);
+void
+afr_remove_eager_lock_stub (afr_local_t *local);
#endif /* __AFR_H__ */