summaryrefslogtreecommitdiffstats
path: root/xlators/cluster/afr/src/afr.h
diff options
context:
space:
mode:
Diffstat (limited to 'xlators/cluster/afr/src/afr.h')
-rw-r--r--xlators/cluster/afr/src/afr.h989
1 files changed, 511 insertions, 478 deletions
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index a2da671a7..36042f7b2 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -1,20 +1,11 @@
/*
- Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
@@ -31,29 +22,47 @@
#include "afr-mem-types.h"
#include "libxlator.h"
+#include "timer.h"
+#include "syncop.h"
+
+#include "afr-self-heald.h"
#define AFR_XATTR_PREFIX "trusted.afr"
+#define AFR_PATHINFO_HEADER "REPLICATE:"
+#define AFR_SH_READDIR_SIZE_KEY "self-heal-readdir-size"
+#define AFR_SH_DATA_DOMAIN_FMT "%s:self-heal"
+#define AFR_DIRTY_DEFAULT AFR_XATTR_PREFIX ".dirty"
+#define AFR_DIRTY (((afr_private_t *) (THIS->private))->afr_dirty)
+
+#define AFR_LOCKEE_COUNT_MAX 3
+#define AFR_DOM_COUNT_MAX 3
+#define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/
+
+typedef int (*afr_lock_cbk_t) (call_frame_t *frame, xlator_t *this);
-struct _pump_private;
+typedef int (*afr_read_txn_wind_t) (call_frame_t *frame, xlator_t *this, int subvol);
+
+typedef int (*afr_inode_refresh_cbk_t) (call_frame_t *frame, xlator_t *this, int err);
+
+typedef int (*afr_changelog_resume_t) (call_frame_t *frame, xlator_t *this);
+
+#define alloca0(size) ({void *__ptr; __ptr = alloca(size); memset(__ptr, 0, size); __ptr;})
+#define AFR_COUNT(array,max) ({int __i; int __res = 0; for (__i = 0; __i < max; __i++) if (array[__i]) __res++; __res;})
+#define AFR_INTERSECT(dst,src1,src2,max) ({int __i; for (__i = 0; __i < max; __i++) dst[__i] = src1[__i] && src2[__i];})
typedef struct _afr_private {
gf_lock_t lock; /* to guard access to child_count, etc */
unsigned int child_count; /* total number of children */
- unsigned int read_child_rr; /* round-robin index of the read_child */
- gf_lock_t read_child_lock; /* lock to protect above */
-
xlator_t **children;
- gf_lock_t root_inode_lk;
- int first_lookup;
inode_t *root_inode;
unsigned char *child_up;
char **pending_key;
- gf_boolean_t data_self_heal; /* on/off */
+ char *data_self_heal; /* on/off/open */
char * data_self_heal_algorithm; /* name of algorithm */
unsigned int data_self_heal_window_size; /* max number of pipelined
read/writes */
@@ -67,116 +76,54 @@ typedef struct _afr_private {
gf_boolean_t metadata_change_log; /* on/off */
gf_boolean_t entry_change_log; /* on/off */
+ gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */
int read_child; /* read-subvolume */
- unsigned int favorite_child; /* subvolume to be preferred in resolving
+ unsigned int hash_mode; /* for when read_child is not set */
+ int favorite_child; /* subvolume to be preferred in resolving
split-brain cases */
- unsigned int data_lock_server_count;
- unsigned int metadata_lock_server_count;
- unsigned int entry_lock_server_count;
-
gf_boolean_t inodelk_trace;
gf_boolean_t entrylk_trace;
- gf_boolean_t strict_readdir;
-
unsigned int wait_count; /* # of servers to wait for success */
uint64_t up_count; /* number of CHILD_UPs we have seen */
uint64_t down_count; /* number of CHILD_DOWNs we have seen */
- struct _pump_private *pump_private; /* Set if we are loaded as pump */
- int use_afr_in_pump;
-
- pthread_mutex_t mutex;
- struct list_head saved_fds; /* list of fds on which locks have succeeded */
- gf_boolean_t optimistic_change_log;
+ gf_boolean_t optimistic_change_log;
+ gf_boolean_t eager_lock;
+ gf_boolean_t pre_op_compat; /* on/off */
+ uint32_t post_op_delay_secs;
+ unsigned int quorum_count;
char vol_uuid[UUID_SIZE + 1];
+ int32_t *last_event;
+
+ /* @event_generation: Keeps count of number of events received which can
+ potentially impact consistency decisions. The events are CHILD_UP
+ and CHILD_DOWN, when we have to recalculate the freshness/staleness
+ of copies to detect if changes had happened while the other server
+ was down. CHILD_DOWN and CHILD_UP can also be received on network
+ disconnect/reconnects and not necessarily server going down/up.
+ Recalculating freshness/staleness on network events is equally
+ important as we might have had a network split brain.
+ */
+ uint32_t event_generation;
+
+ gf_boolean_t choose_local;
+ gf_boolean_t did_discovery;
+ uint64_t sh_readdir_size;
+ gf_boolean_t ensure_durability;
+ char *sh_domain;
+ char *afr_dirty;
+
+ afr_self_heald_t shd;
+
+ /* pump dependencies */
+ void *pump_private;
+ gf_boolean_t use_afr_in_pump;
} afr_private_t;
-typedef struct {
- /* External interface: These are variables (some optional) that
- are set by whoever has triggered self-heal */
-
- gf_boolean_t need_data_self_heal;
- gf_boolean_t need_metadata_self_heal;
- gf_boolean_t need_entry_self_heal;
-
- gf_boolean_t forced_merge; /* Is this a self-heal triggered to
- forcibly merge the directories? */
-
- gf_boolean_t healing_fd_opened; /* true if caller has already
- opened fd */
-
- gf_boolean_t data_lock_held; /* true if caller has already
- acquired 0-0 lock */
-
- fd_t *healing_fd; /* set if callers has opened fd */
-
- gf_boolean_t background; /* do self-heal in background
- if possible */
-
- ia_type_t type; /* st_mode of the entry we're doing
- self-heal on */
-
- /* Function to call to unwind. If self-heal is being done in the
- background, this function will be called as soon as possible. */
-
- int (*unwind) (call_frame_t *frame, xlator_t *this);
-
- /* End of external interface members */
-
-
- /* array of stat's, one for each child */
- struct iatt *buf;
- struct iatt parentbuf;
- struct iatt entrybuf;
-
- /* array of xattr's, one for each child */
- dict_t **xattr;
-
- /* array of errno's, one for each child */
- int *child_errno;
-
- int32_t **pending_matrix;
- int32_t **delta_matrix;
-
- int *sources;
- int source;
- int active_source;
- int active_sinks;
- int *success;
- unsigned char *locked_nodes;
- int lock_count;
-
- mode_t impunging_entry_mode;
- const char *linkname;
-
- int op_failed;
-
- int file_has_holes;
- blksize_t block_size;
- off_t file_size;
- off_t offset;
-
- loc_t parent_loc;
-
- call_frame_t *orig_frame;
- gf_boolean_t unwound;
-
- /* private data for the particular self-heal algorithm */
- void *private;
-
- int (*flush_self_heal_cbk) (call_frame_t *frame, xlator_t *this);
-
- int (*completion_cbk) (call_frame_t *frame, xlator_t *this);
- int (*algo_completion_cbk) (call_frame_t *frame, xlator_t *this);
- int (*algo_abort_cbk) (call_frame_t *frame, xlator_t *this);
-
- call_frame_t *sh_frame;
-} afr_self_heal_t;
-
typedef enum {
AFR_DATA_TRANSACTION, /* truncate, write, ... */
@@ -238,11 +185,31 @@ afr_index_for_transaction_type (afr_transaction_type type)
return -1; /* make gcc happy */
}
+typedef struct {
+ loc_t loc;
+ char *basename;
+ unsigned char *locked_nodes;
+ int locked_count;
+
+} afr_entry_lockee_t;
+
+int
+afr_entry_lockee_cmp (const void *l1, const void *l2);
+
+typedef struct {
+ char *domain; /* Domain on which inodelk is taken */
+ struct gf_flock flock;
+ unsigned char *locked_nodes;
+ int32_t lock_count;
+} afr_inodelk_t;
typedef struct {
loc_t *lk_loc;
- struct gf_flock lk_flock;
+ int lockee_count;
+ afr_entry_lockee_t lockee[AFR_LOCKEE_COUNT_MAX];
+
+ afr_inodelk_t inodelk[AFR_DOM_COUNT_MAX];
const char *lk_basename;
const char *lower_basename;
const char *higher_basename;
@@ -251,85 +218,203 @@ typedef struct {
unsigned char *locked_nodes;
unsigned char *lower_locked_nodes;
- unsigned char *inode_locked_nodes;
- unsigned char *entry_locked_nodes;
selfheal_lk_type_t selfheal_lk_type;
transaction_lk_type_t transaction_lk_type;
int32_t lock_count;
- int32_t inodelk_lock_count;
int32_t entrylk_lock_count;
uint64_t lock_number;
int32_t lk_call_count;
+ int32_t lk_expected_count;
+ int32_t lk_attempted_count;
int32_t lock_op_ret;
int32_t lock_op_errno;
+ afr_lock_cbk_t lock_cbk;
+ char *domain; /* Domain on which inode/entry lock/unlock in progress.*/
+} afr_internal_lock_t;
- int (*lock_cbk) (call_frame_t *, xlator_t *);
+struct afr_reply {
+ int valid;
+ int32_t op_ret;
+ int32_t op_errno;
+ dict_t *xdata;
+ struct iatt poststat;
+ struct iatt postparent;
+ struct iatt prestat;
+ struct iatt preparent;
+ struct iatt preparent2;
+ struct iatt postparent2;
+ uint8_t checksum[MD5_DIGEST_LENGTH];
+};
-} afr_internal_lock_t;
+typedef enum {
+ AFR_FD_NOT_OPENED,
+ AFR_FD_OPENED,
+ AFR_FD_OPENING
+} afr_fd_open_status_t;
+
+typedef struct {
+ unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS];
+ int inherited[AFR_NUM_CHANGE_LOGS];
+ int on_disk[AFR_NUM_CHANGE_LOGS];
+ afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */
+
+ unsigned int *lock_piggyback;
+ unsigned int *lock_acquired;
+
+ int flags;
+
+ /* used for delayed-post-op optimization */
+ pthread_mutex_t delay_lock;
+ gf_timer_t *delay_timer;
+ call_frame_t *delay_frame;
+
+ /* set if any write on this fd was a non stable write
+ (i.e, without O_SYNC or O_DSYNC)
+ */
+ gf_boolean_t witnessed_unstable_write;
+
+ /* @open_fd_count:
+ Number of open FDs queried from the server, as queried through
+ xdata in FOPs. Currently, used to decide if eager-locking must be
+ temporarily disabled.
+ */
+ uint32_t open_fd_count;
+
+
+ /* list of frames currently in progress */
+ struct list_head eager_locked;
+} afr_fd_ctx_t;
-typedef struct _afr_locked_fd {
- fd_t *fd;
- struct list_head list;
-} afr_locked_fd_t;
typedef struct _afr_local {
- int uid;
- int gid;
+ glusterfs_fop_t op;
unsigned int call_count;
- unsigned int success_count;
- unsigned int enoent_count;
+ /* @event_generation: copy of priv->event_generation taken at the
+ time of starting the transaction. The copy is made so that we
+ have a stable value through the various phases of the transaction.
+ */
+ unsigned int event_generation;
- unsigned int govinda_gOvinda;
+ uint32_t open_fd_count;
+ gf_boolean_t update_open_fd_count;
- unsigned int read_child_index;
- unsigned char read_child_returned;
- unsigned int first_up_child;
-
- pid_t saved_pid;
+ gf_lkowner_t saved_lk_owner;
int32_t op_ret;
int32_t op_errno;
int32_t **pending;
+ int dirty[AFR_NUM_CHANGE_LOGS];
+
loc_t loc;
loc_t newloc;
fd_t *fd;
+ afr_fd_ctx_t *fd_ctx;
- glusterfs_fop_t fop;
-
+ /* @child_up: copy of priv->child_up taken at the time of transaction
+ start. The copy is taken so that we have a stable child_up array
+ through the phases of the transaction as priv->child_up[i] can keep
+ changing through time.
+ */
unsigned char *child_up;
- int32_t *child_errno;
+ /* @read_attempted:
+ array of flags representing subvolumes where read operations of
+ the read transaction have already been attempted. The array is
+ first pre-filled with down subvolumes, and as reads are performed
+ on other subvolumes, those are set as well. This way if the read
+ operation fails we do not retry on that subvolume again.
+ */
+ unsigned char *read_attempted;
- dict_t *xattr_req;
+ /* @readfn:
- int32_t inodelk_count;
- int32_t entrylk_count;
+ pointer to function which will perform the read operation on a given
+ subvolume. Used in read transactions.
+ */
- afr_internal_lock_t internal_lock;
+ afr_read_txn_wind_t readfn;
+
+ /* @refreshed:
+
+ the inode was "refreshed" (i.e, pending xattrs from all subvols
+ freshly inspected and inode ctx updated accordingly) as part of
+ this transaction already.
+ */
+ gf_boolean_t refreshed;
+
+ /* @inode:
+
+ the inode on which the read txn is performed on. ref'ed and copied
+ from either fd->inode or loc.inode
+ */
+
+ inode_t *inode;
+
+ /* @parent[2]:
- afr_locked_fd_t *locked_fd;
- int32_t source_child;
- int32_t lock_recovery_child;
+ parent inode[s] on which directory transactions are performed.
+ */
+
+ inode_t *parent;
+ inode_t *parent2;
+
+ /* @readable:
+
+ array of flags representing servers from which a read can be
+ performed. This is the output of afr_inode_refresh()
+ */
+ unsigned char *readable;
+
+ afr_inode_refresh_cbk_t refreshfn;
+
+ /* @refreshinode:
+
+ Inode currently getting refreshed.
+ */
+ inode_t *refreshinode;
+
+ /*
+ @pre_op_compat:
+
+ compatibility mode of pre-op. send a separate pre-op and
+ op operations as part of transaction, rather than combining
+ */
+
+ gf_boolean_t pre_op_compat;
+
+ dict_t *xattr_req;
+
+ afr_internal_lock_t internal_lock;
dict_t *dict;
+
int optimistic_change_log;
+ gf_boolean_t delayed_post_op;
+
+ /* Is the current writev() going to perform a stable write?
+ i.e, is fd->flags or @flags writev param have O_SYNC or
+ O_DSYNC?
+ */
+ gf_boolean_t stable_write;
- int (*openfd_flush_cbk) (call_frame_t *frame, xlator_t *this);
+ /* This write appended to the file. Nnot necessarily O_APPEND,
+ just means the offset of write was at the end of file.
+ */
+ gf_boolean_t append_write;
/*
This struct contains the arguments for the "continuation"
(scheme-like) of fops
*/
- int op;
struct {
struct {
unsigned char buf_set;
@@ -337,21 +422,7 @@ typedef struct _afr_local {
} statfs;
struct {
- inode_t *inode;
- struct iatt buf;
- struct iatt read_child_buf;
- struct iatt postparent;
- ino_t ino;
- uint64_t gen;
- ino_t parent_ino;
- dict_t *xattr;
- dict_t **xattrs;
- gf_boolean_t is_revalidate;
- } lookup;
-
- struct {
int32_t flags;
- int32_t wbflags;
} open;
struct {
@@ -365,35 +436,33 @@ typedef struct _afr_local {
struct {
int32_t mask;
- int last_tried; /* index of the child we tried previously */
+ int last_index; /* index of the child we tried previously */
} access;
struct {
- int last_tried;
- ino_t ino;
+ int last_index;
} stat;
struct {
- int last_tried;
- ino_t ino;
+ int last_index;
} fstat;
struct {
size_t size;
- int last_tried;
- ino_t ino;
+ int last_index;
} readlink;
struct {
char *name;
- int last_tried;
+ int last_index;
+ long xattr_len;
} getxattr;
struct {
- ino_t ino;
size_t size;
off_t offset;
- int last_tried;
+ int last_index;
+ uint32_t flags;
} readv;
/* dir read */
@@ -411,71 +480,43 @@ typedef struct _afr_local {
int32_t op_errno;
size_t size;
off_t offset;
-
+ dict_t *dict;
gf_boolean_t failed;
- int last_tried;
+ int last_index;
} readdir;
-
- struct {
- int32_t op_ret;
- int32_t op_errno;
-
- size_t size;
- off_t offset;
- int32_t flag;
-
- int last_tried;
- } getdents;
-
/* inode write */
struct {
- ino_t ino;
struct iatt prebuf;
struct iatt postbuf;
+ } inode_wfop; //common structure for all inode-write-fops
+ struct {
int32_t op_ret;
struct iovec *vector;
struct iobref *iobref;
int32_t count;
off_t offset;
+ uint32_t flags;
} writev;
struct {
- ino_t ino;
- struct iatt prebuf;
- struct iatt postbuf;
- } fsync;
-
- struct {
- ino_t ino;
off_t offset;
- struct iatt prebuf;
- struct iatt postbuf;
} truncate;
struct {
- ino_t ino;
off_t offset;
- struct iatt prebuf;
- struct iatt postbuf;
} ftruncate;
struct {
- ino_t ino;
struct iatt in_buf;
int32_t valid;
- struct iatt preop_buf;
- struct iatt postop_buf;
} setattr;
struct {
- ino_t ino;
struct iatt in_buf;
int32_t valid;
- struct iatt preop_buf;
- struct iatt postop_buf;
} fsetattr;
struct {
@@ -484,116 +525,87 @@ typedef struct _afr_local {
} setxattr;
struct {
+ dict_t *dict;
+ int32_t flags;
+ } fsetxattr;
+
+ struct {
char *name;
} removexattr;
+ struct {
+ dict_t *xattr;
+ } xattrop;
+
+ struct {
+ dict_t *xattr;
+ } fxattrop;
+
/* dir write */
struct {
- ino_t ino;
- uint64_t gen;
- ino_t parent_ino;
- fd_t *fd;
- dict_t *params;
- int32_t flags;
- mode_t mode;
inode_t *inode;
struct iatt buf;
struct iatt preparent;
struct iatt postparent;
- struct iatt read_child_buf;
+ struct iatt prenewparent;
+ struct iatt postnewparent;
+ } dir_fop; //common structure for all dir fops
+
+ struct {
+ fd_t *fd;
+ dict_t *params;
+ int32_t flags;
+ mode_t mode;
} create;
struct {
- ino_t ino;
- uint64_t gen;
- ino_t parent_ino;
dev_t dev;
mode_t mode;
dict_t *params;
- inode_t *inode;
- struct iatt buf;
- struct iatt preparent;
- struct iatt postparent;
- struct iatt read_child_buf;
} mknod;
struct {
- ino_t ino;
- uint64_t gen;
- ino_t parent_ino;
int32_t mode;
dict_t *params;
- inode_t *inode;
- struct iatt buf;
- struct iatt read_child_buf;
- struct iatt preparent;
- struct iatt postparent;
} mkdir;
struct {
- ino_t parent_ino;
- int32_t op_ret;
- int32_t op_errno;
- struct iatt preparent;
- struct iatt postparent;
- } unlink;
-
- struct {
- int flags;
- ino_t parent_ino;
- int32_t op_ret;
- int32_t op_errno;
- struct iatt preparent;
- struct iatt postparent;
+ int flags;
} rmdir;
struct {
- ino_t oldparent_ino;
- ino_t newparent_ino;
- ino_t ino;
- struct iatt buf;
- struct iatt read_child_buf;
- struct iatt preoldparent;
- struct iatt prenewparent;
- struct iatt postoldparent;
- struct iatt postnewparent;
- } rename;
-
- struct {
- ino_t ino;
- uint64_t gen;
- ino_t parent_ino;
- inode_t *inode;
- struct iatt buf;
- struct iatt read_child_buf;
- struct iatt preparent;
- struct iatt postparent;
- } link;
-
- struct {
- ino_t ino;
- uint64_t gen;
- ino_t parent_ino;
- inode_t *inode;
dict_t *params;
- struct iatt buf;
- struct iatt read_child_buf;
char *linkpath;
- struct iatt preparent;
- struct iatt postparent;
} symlink;
+ struct {
+ int32_t mode;
+ off_t offset;
+ size_t len;
+ } fallocate;
+
+ struct {
+ off_t offset;
+ size_t len;
+ } discard;
+
struct {
- int32_t flags;
- dir_entry_t *entries;
- int32_t count;
- } setdents;
+ off_t offset;
+ off_t len;
+ struct iatt prebuf;
+ struct iatt postbuf;
+ } zerofill;
+
+
} cont;
struct {
off_t start, len;
+ gf_boolean_t eager_lock_on;
+ int *eager_lock;
+
char *basename;
char *new_basename;
@@ -602,15 +614,67 @@ typedef struct _afr_local {
afr_transaction_type type;
- int success_count;
- int erase_pending;
- int failure_count;
+ /* stub to resume on destruction
+ of the transaction frame */
+ call_stub_t *resume_stub;
+
+ struct list_head eager_locked;
+
+ unsigned char *pre_op;
+
+ /* @fop_subvols: subvolumes on which FOP will be attempted */
+ unsigned char *fop_subvols;
+
+ /* @failed_subvols: subvolumes on which FOP failed. Always
+ a subset of @fop_subvols */
+ unsigned char *failed_subvols;
+
+ /* @dirtied: flag which indicates whether we set dirty flag
+ in the OP. Typically true when we are performing operation
+ on more than one subvol and optimistic changelog is disabled
- int last_tried;
- int32_t *child_errno;
+ A 'true' value set in @dirtied flag means an 'undirtying'
+ has to be done in POST-OP phase.
+ */
+ gf_boolean_t dirtied;
+
+ /* @inherited: flag which indicates that the dirty flags
+ of the previous transaction were inherited
+ */
+ gf_boolean_t inherited;
+
+ /*
+ @no_uninherit: flag which indicates that a pre_op_uninherit()
+ must _not_ be attempted (and returned as failure) always. This
+ flag is set when a hard pre-op is performed, but not accounted
+ for it in fd_ctx->on_disk[]. Such transactions are "isolated"
+ from the pre-op piggybacking entirely and therefore uninherit
+ must not be attempted.
+ */
+ gf_boolean_t no_uninherit;
+
+ /* @uninherit_done:
+ @uninherit_value:
+
+ The above pair variables make pre_op_uninherit() idempotent.
+ Both are FALSE initially. The first call to pre_op_uninherit
+ sets @uninherit_done to TRUE and the return value to
+ @uninherit_value. Further calls will check for @uninherit_done
+ to be TRUE and if so will simply return @uninherit_value.
+ */
+ gf_boolean_t uninherit_done;
+ gf_boolean_t uninherit_value;
+
+ /* @changelog_resume: function to be called after changlogging
+ (either pre-op or post-op) is done
+ */
+
+ afr_changelog_resume_t changelog_resume;
call_frame_t *main_frame;
+ int (*wind) (call_frame_t *frame, xlator_t *this, int subvol);
+
int (*fop) (call_frame_t *frame, xlator_t *this);
int (*done) (call_frame_t *frame, xlator_t *this);
@@ -622,85 +686,96 @@ typedef struct _afr_local {
/* post-op hook */
} transaction;
- afr_self_heal_t self_heal;
+ syncbarrier_t barrier;
struct marker_str marker;
-} afr_local_t;
+ /* extra data for fops */
+ dict_t *xdata_req;
+ dict_t *xdata_rsp;
-typedef struct {
- unsigned int *pre_op_done;
- unsigned int *opened_on; /* which subvolumes the fd is open on */
- unsigned int *pre_op_piggyback;
+ mode_t umask;
+ int xflag;
+ gf_boolean_t do_discovery;
+ struct afr_reply *replies;
+} afr_local_t;
- int flags;
- int32_t wbflags;
- uint64_t up_count; /* number of CHILD_UPs this fd has seen */
- uint64_t down_count; /* number of CHILD_DOWNs this fd has seen */
- int32_t last_tried;
+/* did a call fail due to a child failing? */
+#define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \
+ ((op_errno == ENOTCONN) || \
+ (op_errno == EBADFD)))
- int hit, miss;
- gf_boolean_t failed_over;
- struct list_head entries; /* needed for readdir failover */
+int
+afr_inode_read_subvol_get (inode_t *inode, xlator_t *this,
+ unsigned char *data_subvols,
+ unsigned char *metadata_subvols,
+ int *event_generation);
+int
+__afr_inode_read_subvol_get (inode_t *inode, xlator_t *this,
+ unsigned char *data_subvols,
+ unsigned char *metadata_subvols,
+ int *event_generation);
- unsigned char *locked_on; /* which subvolumes locks have been successful */
-} afr_fd_ctx_t;
+int
+__afr_inode_read_subvol_set (inode_t *inode, xlator_t *this,
+ unsigned char *data_subvols,
+ unsigned char *metadata_subvol,
+ int event_generation);
+int
+afr_inode_read_subvol_set (inode_t *inode, xlator_t *this,
+ unsigned char *data_subvols,
+ unsigned char *metadata_subvols,
+ int event_generation);
+int
+afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this);
-/* try alloc and if it fails, goto label */
-#define ALLOC_OR_GOTO(var, type, label) do { \
- var = GF_CALLOC (sizeof (type), 1, \
- gf_afr_mt_##type); \
- if (!var) { \
- gf_log (this->name, GF_LOG_ERROR, \
- "out of memory :("); \
- op_errno = ENOMEM; \
- goto label; \
- } \
- } while (0);
+int
+afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this,
+ unsigned char *readable);
+int
+afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this,
+ unsigned char *readable, int *event_p,
+ int type);
+int
+afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p,
+ int *event_p, afr_transaction_type type);
-/* did a call fail due to a child failing? */
-#define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \
- ((op_errno == ENOTCONN) || \
- (op_errno == EBADFD)))
+#define afr_data_subvol_get(i, t, s, e) \
+ afr_read_subvol_get(i, t, s, e, AFR_DATA_TRANSACTION)
-#define afr_fop_failed(op_ret, op_errno) ((op_ret) == -1)
+#define afr_metadata_subvol_get(i, t, s, e) \
+ afr_read_subvol_get(i, t, s, e, AFR_METADATA_TRANSACTION)
-/* have we tried all children? */
-#define all_tried(i, count) ((i) == (count) - 1)
+int
+afr_inode_refresh (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ afr_inode_refresh_cbk_t cbk);
int32_t
-afr_set_dict_gfid (dict_t *dict, uuid_t gfid);
+afr_notify (xlator_t *this, int32_t event, void *data, void *data2);
int
-pump_command_reply (call_frame_t *frame, xlator_t *this);
+afr_init_entry_lockee (afr_entry_lockee_t *lockee, afr_local_t *local,
+ loc_t *loc, char *basename, int child_count);
-int32_t
-afr_notify (xlator_t *this, int32_t event,
- void *data, ...);
+void
+afr_entry_lockee_cleanup (afr_internal_lock_t *int_lock);
int
afr_attempt_lock_recovery (xlator_t *this, int32_t child_index);
int
-afr_save_locked_fd (xlator_t *this, fd_t *fd);
-
-int
afr_mark_locked_nodes (xlator_t *this, fd_t *fd,
unsigned char *locked_nodes);
void
-afr_set_lk_owner (call_frame_t *frame, xlator_t *this);
+afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner);
int
afr_set_lock_number (call_frame_t *frame, xlator_t *this);
-
-loc_t *
-lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2);
-
int32_t
afr_unlock (call_frame_t *frame, xlator_t *this);
@@ -716,32 +791,30 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this);
int
afr_internal_lock_finish (call_frame_t *frame, xlator_t *this);
+int
+afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom,
+ unsigned int child_count);
-int pump_start (call_frame_t *frame, xlator_t *this);
+int
+__afr_fd_ctx_set (xlator_t *this, fd_t *fd);
int
afr_fd_ctx_set (xlator_t *this, fd_t *fd);
-uint64_t
-afr_read_child (xlator_t *this, inode_t *inode);
-
-void
-afr_set_read_child (xlator_t *this, inode_t *inode, int32_t read_child);
-
-void
-afr_build_parent_loc (loc_t *parent, loc_t *child);
+afr_fd_ctx_t *
+afr_fd_ctx_get (fd_t *fd, xlator_t *this);
int
-afr_up_children_count (int child_count, unsigned char *child_up);
+afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno);
int
afr_locked_nodes_count (unsigned char *locked_nodes, int child_count);
-ino64_t
-afr_itransform (ino64_t ino, int child_count, int child_index);
-
int
-afr_deitransform (ino64_t ino, int child_count);
+afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode);
+
+void
+afr_replies_wipe (afr_local_t *local, afr_private_t *priv);
void
afr_local_cleanup (afr_local_t *local, xlator_t *this);
@@ -749,21 +822,9 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this);
int
afr_frame_return (call_frame_t *frame);
-uint64_t
-afr_is_split_brain (xlator_t *this, inode_t *inode);
-
-void
-afr_set_split_brain (xlator_t *this, inode_t *inode, gf_boolean_t set);
-
int
afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
- fd_t *fd, int32_t wbflags);
-
-void
-afr_set_opendir_done (xlator_t *this, inode_t *inode);
-
-uint64_t
-afr_is_opendir_done (xlator_t *this, inode_t *inode);
+ fd_t *fd, dict_t *xdata);
void
afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this);
@@ -771,9 +832,6 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this);
int
afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd);
-int
-afr_openfd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd);
-
#define AFR_STACK_UNWIND(fop, frame, params ...) \
do { \
afr_local_t *__local = NULL; \
@@ -784,22 +842,36 @@ afr_openfd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd);
frame->local = NULL; \
} \
STACK_UNWIND_STRICT (fop, frame, params); \
- afr_local_cleanup (__local, __this); \
- GF_FREE (__local); \
- } while (0);
+ if (__local) { \
+ afr_local_cleanup (__local, __this); \
+ mem_put (__local); \
+ } \
+ } while (0)
-#define AFR_STACK_DESTROY(frame) \
- do { \
- afr_local_t *__local = NULL; \
- xlator_t *__this = NULL; \
- __local = frame->local; \
- __this = frame->this; \
- frame->local = NULL; \
- STACK_DESTROY (frame->root); \
- afr_local_cleanup (__local, __this); \
- GF_FREE (__local); \
+#define AFR_STACK_DESTROY(frame) \
+ do { \
+ afr_local_t *__local = NULL; \
+ xlator_t *__this = NULL; \
+ __local = frame->local; \
+ __this = frame->this; \
+ frame->local = NULL; \
+ STACK_DESTROY (frame->root); \
+ if (__local) { \
+ afr_local_cleanup (__local, __this); \
+ mem_put (__local); \
+ } \
} while (0);
+#define AFR_FRAME_INIT(frame, op_errno) \
+ ({frame->local = mem_get0 (THIS->local_pool); \
+ if (afr_local_init (frame->local, THIS->private, &op_errno)) { \
+ afr_local_cleanup (frame->local, THIS); \
+ mem_put (frame->local); \
+ frame->local = NULL; }; \
+ frame->local;})
+
+#define AFR_STACK_RESET(frame) do { int opr; STACK_RESET (frame->root); AFR_FRAME_INIT(frame, opr);} while (0)
+
/* allocate and return a string that is the basename of argument */
static inline char *
AFR_BASENAME (const char *str)
@@ -812,132 +884,93 @@ AFR_BASENAME (const char *str)
return __basename_str;
}
-/* initialize local_t */
-static inline int
-AFR_LOCAL_INIT (afr_local_t *local, afr_private_t *priv)
-{
- int child_up_count = 0;
-
- local->child_up = GF_CALLOC (sizeof (*local->child_up),
- priv->child_count,
- gf_afr_mt_char);
- if (!local->child_up) {
- return -ENOMEM;
- }
-
- memcpy (local->child_up, priv->child_up,
- sizeof (*local->child_up) * priv->child_count);
-
- child_up_count = afr_up_children_count (priv->child_count, local->child_up);
-
- if (priv->optimistic_change_log && child_up_count == priv->child_count)
- local->optimistic_change_log = 1;
-
- local->call_count = afr_up_children_count (priv->child_count, local->child_up);
- if (local->call_count == 0)
- return -ENOTCONN;
+call_frame_t *
+afr_copy_frame (call_frame_t *base);
- local->transaction.erase_pending = 1;
-
- local->op_ret = -1;
- local->op_errno = EUCLEAN;
-
- local->internal_lock.lock_op_ret = -1;
- local->internal_lock.lock_op_errno = EUCLEAN;
+int
+afr_transaction_local_init (afr_local_t *local, xlator_t *this);
+int32_t
+afr_marker_getxattr (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, const char *name,afr_local_t *local, afr_private_t *priv );
- return 0;
-}
+int
+afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno);
+int
+afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count,
+ transaction_lk_type_t lk_type);
-/**
- * first_up_child - return the index of the first child that is up
- */
+int
+afr_higher_errno (int32_t old_errno, int32_t new_errno);
-static inline int
-afr_first_up_child (afr_private_t *priv)
-{
- xlator_t ** children = NULL;
- int ret = -1;
- int i = 0;
-
- LOCK (&priv->lock);
- {
- children = priv->children;
- for (i = 0; i < priv->child_count; i++) {
- if (priv->child_up[i]) {
- ret = i;
- break;
- }
- }
- }
- UNLOCK (&priv->lock);
+int
+afr_final_errno (afr_local_t *local, afr_private_t *priv);
- return ret;
-}
+int
+afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req);
+void
+afr_fix_open (fd_t *fd, xlator_t *this);
-static inline int
-afr_transaction_local_init (afr_local_t *local, afr_private_t *priv)
-{
- int i;
+afr_fd_ctx_t *
+afr_fd_ctx_get (fd_t *fd, xlator_t *this);
- local->first_up_child = afr_first_up_child (priv);
+void
+afr_set_low_priority (call_frame_t *frame);
+int
+afr_child_fd_ctx_set (xlator_t *this, fd_t *fd, int32_t child,
+ int flags);
- local->child_errno = GF_CALLOC (sizeof (*local->child_errno),
- priv->child_count,
- gf_afr_mt_int32_t);
- if (!local->child_errno) {
- return -ENOMEM;
- }
+gf_boolean_t
+afr_have_quorum (char *logname, afr_private_t *priv);
- local->pending = GF_CALLOC (sizeof (*local->pending),
- priv->child_count,
- gf_afr_mt_int32_t);
+void
+afr_matrix_cleanup (int32_t **pending, unsigned int m);
- if (!local->pending) {
- return -ENOMEM;
- }
+int32_t**
+afr_matrix_create (unsigned int m, unsigned int n);
- for (i = 0; i < priv->child_count; i++) {
- local->pending[i] = GF_CALLOC (sizeof (*local->pending[i]),
- 3, /* data + metadata + entry */
- gf_afr_mt_int32_t);
- if (!local->pending[i])
- return -ENOMEM;
- }
+void
+afr_filter_xattrs (dict_t *xattr);
- local->internal_lock.inode_locked_nodes =
- GF_CALLOC (sizeof (*local->internal_lock.inode_locked_nodes),
- priv->child_count,
- gf_afr_mt_char);
+/*
+ * Special value indicating we should use the "auto" quorum method instead of
+ * a fixed value (including zero to turn off quorum enforcement).
+ */
+#define AFR_QUORUM_AUTO INT_MAX
- local->internal_lock.entry_locked_nodes =
- GF_CALLOC (sizeof (*local->internal_lock.entry_locked_nodes),
- priv->child_count,
- gf_afr_mt_char);
+/*
+ * Having this as a macro will make debugging a bit weirder, but does reduce
+ * the probability of functions handling this check inconsistently.
+ */
+#define QUORUM_CHECK(_func,_label) do { \
+ if (priv->quorum_count && !afr_have_quorum(this->name,priv)) { \
+ gf_log(this->name,GF_LOG_WARNING, \
+ "failing "#_func" due to lack of quorum"); \
+ op_errno = EROFS; \
+ goto _label; \
+ } \
+} while (0);
- local->internal_lock.locked_nodes =
- GF_CALLOC (sizeof (*local->internal_lock.locked_nodes),
- priv->child_count,
- gf_afr_mt_char);
+int
+afr_fd_report_unstable_write (xlator_t *this, fd_t *fd);
- local->internal_lock.lower_locked_nodes
- = GF_CALLOC (sizeof (*local->internal_lock.lower_locked_nodes),
- priv->child_count,
- gf_afr_mt_char);
+gf_boolean_t
+afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd);
- local->transaction.child_errno = GF_CALLOC (sizeof (*local->transaction.child_errno),
- priv->child_count,
- gf_afr_mt_int32_t);
+void
+afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub);
- local->internal_lock.transaction_lk_type = AFR_TRANSACTION_LK;
+int
+afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count);
- return 0;
-}
+void
+afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this);
-int32_t
-afr_marker_getxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *name,afr_local_t *local, afr_private_t *priv );
+int
+afr_local_pathinfo (char *pathinfo, gf_boolean_t *is_local);
+void
+afr_remove_eager_lock_stub (afr_local_t *local);
#endif /* __AFR_H__ */