diff options
Diffstat (limited to 'xlators/cluster/afr/src/afr.h')
| -rw-r--r-- | xlators/cluster/afr/src/afr.h | 1256 |
1 files changed, 655 insertions, 601 deletions
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 9593fec3c..36042f7b2 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2009 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -30,153 +21,115 @@ #include "compat-errno.h" #include "afr-mem-types.h" +#include "libxlator.h" +#include "timer.h" +#include "syncop.h" + +#include "afr-self-heald.h" + #define AFR_XATTR_PREFIX "trusted.afr" +#define AFR_PATHINFO_HEADER "REPLICATE:" +#define AFR_SH_READDIR_SIZE_KEY "self-heal-readdir-size" +#define AFR_SH_DATA_DOMAIN_FMT "%s:self-heal" +#define AFR_DIRTY_DEFAULT AFR_XATTR_PREFIX ".dirty" +#define AFR_DIRTY (((afr_private_t *) (THIS->private))->afr_dirty) -struct _pump_private; +#define AFR_LOCKEE_COUNT_MAX 3 +#define AFR_DOM_COUNT_MAX 3 +#define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/ -typedef struct _afr_private { - gf_lock_t lock; /* to guard access to child_count, etc */ - unsigned int child_count; /* total number of children */ +typedef int (*afr_lock_cbk_t) (call_frame_t *frame, xlator_t *this); + +typedef int (*afr_read_txn_wind_t) (call_frame_t *frame, xlator_t *this, int subvol); - unsigned int read_child_rr; /* round-robin index of the read_child */ - gf_lock_t read_child_lock; /* lock to protect above */ +typedef int (*afr_inode_refresh_cbk_t) (call_frame_t *frame, xlator_t *this, int err); + +typedef int (*afr_changelog_resume_t) (call_frame_t *frame, xlator_t *this); + +#define alloca0(size) ({void *__ptr; __ptr = alloca(size); memset(__ptr, 0, size); __ptr;}) +#define AFR_COUNT(array,max) ({int __i; int __res = 0; for (__i = 0; __i < max; __i++) if (array[__i]) __res++; __res;}) +#define AFR_INTERSECT(dst,src1,src2,max) ({int __i; for (__i = 0; __i < max; __i++) dst[__i] = src1[__i] && src2[__i];}) + +typedef struct _afr_private { + gf_lock_t lock; /* to guard access to child_count, etc */ + unsigned int child_count; /* total number of children */ - xlator_t **children; + xlator_t **children; - gf_lock_t root_inode_lk; - int first_lookup; inode_t *root_inode; - unsigned char *child_up; + unsigned char *child_up; char **pending_key; - gf_boolean_t data_self_heal; /* on/off */ + char *data_self_heal; /* on/off/open */ char * data_self_heal_algorithm; /* name of algorithm */ unsigned int data_self_heal_window_size; /* max number of pipelined read/writes */ unsigned int background_self_heal_count; unsigned int background_self_heals_started; - gf_boolean_t metadata_self_heal; /* on/off */ - gf_boolean_t entry_self_heal; /* on/off */ + gf_boolean_t metadata_self_heal; /* on/off */ + gf_boolean_t entry_self_heal; /* on/off */ - gf_boolean_t data_change_log; /* on/off */ - gf_boolean_t metadata_change_log; /* on/off */ - gf_boolean_t entry_change_log; /* on/off */ + gf_boolean_t data_change_log; /* on/off */ + gf_boolean_t metadata_change_log; /* on/off */ + gf_boolean_t entry_change_log; /* on/off */ - int read_child; /* read-subvolume */ - unsigned int favorite_child; /* subvolume to be preferred in resolving - split-brain cases */ + gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */ + int read_child; /* read-subvolume */ + unsigned int hash_mode; /* for when read_child is not set */ + int favorite_child; /* subvolume to be preferred in resolving + split-brain cases */ - unsigned int data_lock_server_count; - unsigned int metadata_lock_server_count; - unsigned int entry_lock_server_count; + gf_boolean_t inodelk_trace; + gf_boolean_t entrylk_trace; - gf_boolean_t inodelk_trace; - gf_boolean_t entrylk_trace; - - gf_boolean_t strict_readdir; - - unsigned int wait_count; /* # of servers to wait for success */ + unsigned int wait_count; /* # of servers to wait for success */ uint64_t up_count; /* number of CHILD_UPs we have seen */ uint64_t down_count; /* number of CHILD_DOWNs we have seen */ - struct _pump_private *pump_private; /* Set if we are loaded as pump */ - int use_afr_in_pump; - - pthread_mutex_t mutex; - struct list_head saved_fds; /* list of fds on which locks have succeeded */ -} afr_private_t; - -typedef struct { - /* External interface: These are variables (some optional) that - are set by whoever has triggered self-heal */ - - gf_boolean_t need_data_self_heal; - gf_boolean_t need_metadata_self_heal; - gf_boolean_t need_entry_self_heal; - - gf_boolean_t forced_merge; /* Is this a self-heal triggered to - forcibly merge the directories? */ - - gf_boolean_t healing_fd_opened; /* true if caller has already - opened fd */ - - gf_boolean_t data_lock_held; /* true if caller has already - acquired 0-0 lock */ - - fd_t *healing_fd; /* set if callers has opened fd */ - - gf_boolean_t background; /* do self-heal in background - if possible */ - - ia_type_t type; /* st_mode of the entry we're doing - self-heal on */ - - /* Function to call to unwind. If self-heal is being done in the - background, this function will be called as soon as possible. */ - - int (*unwind) (call_frame_t *frame, xlator_t *this); - - /* End of external interface members */ - - - /* array of stat's, one for each child */ - struct iatt *buf; - struct iatt parentbuf; - - /* array of xattr's, one for each child */ - dict_t **xattr; - - /* array of errno's, one for each child */ - int *child_errno; - - int32_t **pending_matrix; - int32_t **delta_matrix; - - int *sources; - int source; - int active_source; - int active_sinks; - int *success; - unsigned char *locked_nodes; - int lock_count; - - mode_t impunging_entry_mode; - const char *linkname; - - int op_failed; - - int file_has_holes; - blksize_t block_size; - off_t file_size; - off_t offset; - - loc_t parent_loc; - - call_frame_t *orig_frame; - gf_boolean_t unwound; - - /* private data for the particular self-heal algorithm */ - void *private; + gf_boolean_t optimistic_change_log; + gf_boolean_t eager_lock; + gf_boolean_t pre_op_compat; /* on/off */ + uint32_t post_op_delay_secs; + unsigned int quorum_count; + + char vol_uuid[UUID_SIZE + 1]; + int32_t *last_event; + + /* @event_generation: Keeps count of number of events received which can + potentially impact consistency decisions. The events are CHILD_UP + and CHILD_DOWN, when we have to recalculate the freshness/staleness + of copies to detect if changes had happened while the other server + was down. CHILD_DOWN and CHILD_UP can also be received on network + disconnect/reconnects and not necessarily server going down/up. + Recalculating freshness/staleness on network events is equally + important as we might have had a network split brain. + */ + uint32_t event_generation; - int (*flush_self_heal_cbk) (call_frame_t *frame, xlator_t *this); + gf_boolean_t choose_local; + gf_boolean_t did_discovery; + uint64_t sh_readdir_size; + gf_boolean_t ensure_durability; + char *sh_domain; + char *afr_dirty; - int (*completion_cbk) (call_frame_t *frame, xlator_t *this); - int (*algo_completion_cbk) (call_frame_t *frame, xlator_t *this); - int (*algo_abort_cbk) (call_frame_t *frame, xlator_t *this); + afr_self_heald_t shd; - call_frame_t *sh_frame; -} afr_self_heal_t; + /* pump dependencies */ + void *pump_private; + gf_boolean_t use_afr_in_pump; +} afr_private_t; typedef enum { - AFR_DATA_TRANSACTION, /* truncate, write, ... */ - AFR_METADATA_TRANSACTION, /* chmod, chown, ... */ - AFR_ENTRY_TRANSACTION, /* create, rmdir, ... */ - AFR_ENTRY_RENAME_TRANSACTION, /* rename */ + AFR_DATA_TRANSACTION, /* truncate, write, ... */ + AFR_METADATA_TRANSACTION, /* chmod, chown, ... */ + AFR_ENTRY_TRANSACTION, /* create, rmdir, ... */ + AFR_ENTRY_RENAME_TRANSACTION, /* rename */ } afr_transaction_type; typedef enum { @@ -232,11 +185,31 @@ afr_index_for_transaction_type (afr_transaction_type type) return -1; /* make gcc happy */ } +typedef struct { + loc_t loc; + char *basename; + unsigned char *locked_nodes; + int locked_count; + +} afr_entry_lockee_t; + +int +afr_entry_lockee_cmp (const void *l1, const void *l2); + +typedef struct { + char *domain; /* Domain on which inodelk is taken */ + struct gf_flock flock; + unsigned char *locked_nodes; + int32_t lock_count; +} afr_inodelk_t; typedef struct { loc_t *lk_loc; - struct gf_flock lk_flock; + int lockee_count; + afr_entry_lockee_t lockee[AFR_LOCKEE_COUNT_MAX]; + + afr_inodelk_t inodelk[AFR_DOM_COUNT_MAX]; const char *lk_basename; const char *lower_basename; const char *higher_basename; @@ -245,451 +218,564 @@ typedef struct { unsigned char *locked_nodes; unsigned char *lower_locked_nodes; - unsigned char *inode_locked_nodes; - unsigned char *entry_locked_nodes; selfheal_lk_type_t selfheal_lk_type; transaction_lk_type_t transaction_lk_type; int32_t lock_count; - int32_t inodelk_lock_count; int32_t entrylk_lock_count; uint64_t lock_number; int32_t lk_call_count; + int32_t lk_expected_count; + int32_t lk_attempted_count; int32_t lock_op_ret; int32_t lock_op_errno; + afr_lock_cbk_t lock_cbk; + char *domain; /* Domain on which inode/entry lock/unlock in progress.*/ +} afr_internal_lock_t; - int (*lock_cbk) (call_frame_t *, xlator_t *); +struct afr_reply { + int valid; + int32_t op_ret; + int32_t op_errno; + dict_t *xdata; + struct iatt poststat; + struct iatt postparent; + struct iatt prestat; + struct iatt preparent; + struct iatt preparent2; + struct iatt postparent2; + uint8_t checksum[MD5_DIGEST_LENGTH]; +}; -} afr_internal_lock_t; +typedef enum { + AFR_FD_NOT_OPENED, + AFR_FD_OPENED, + AFR_FD_OPENING +} afr_fd_open_status_t; + +typedef struct { + unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS]; + int inherited[AFR_NUM_CHANGE_LOGS]; + int on_disk[AFR_NUM_CHANGE_LOGS]; + afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */ + + unsigned int *lock_piggyback; + unsigned int *lock_acquired; + + int flags; + + /* used for delayed-post-op optimization */ + pthread_mutex_t delay_lock; + gf_timer_t *delay_timer; + call_frame_t *delay_frame; + + /* set if any write on this fd was a non stable write + (i.e, without O_SYNC or O_DSYNC) + */ + gf_boolean_t witnessed_unstable_write; + + /* @open_fd_count: + Number of open FDs queried from the server, as queried through + xdata in FOPs. Currently, used to decide if eager-locking must be + temporarily disabled. + */ + uint32_t open_fd_count; + + + /* list of frames currently in progress */ + struct list_head eager_locked; +} afr_fd_ctx_t; -typedef struct _afr_locked_fd { - fd_t *fd; - struct list_head list; -} afr_locked_fd_t; typedef struct _afr_local { - unsigned int call_count; - unsigned int success_count; - unsigned int enoent_count; + glusterfs_fop_t op; + unsigned int call_count; + /* @event_generation: copy of priv->event_generation taken at the + time of starting the transaction. The copy is made so that we + have a stable value through the various phases of the transaction. + */ + unsigned int event_generation; - unsigned int govinda_gOvinda; + uint32_t open_fd_count; + gf_boolean_t update_open_fd_count; - unsigned int read_child_index; - unsigned char read_child_returned; - unsigned int first_up_child; + gf_lkowner_t saved_lk_owner; - pid_t saved_pid; + int32_t op_ret; + int32_t op_errno; - int32_t op_ret; - int32_t op_errno; + int32_t **pending; - int32_t **pending; + int dirty[AFR_NUM_CHANGE_LOGS]; - loc_t loc; - loc_t newloc; + loc_t loc; + loc_t newloc; - fd_t *fd; + fd_t *fd; + afr_fd_ctx_t *fd_ctx; - glusterfs_fop_t fop; + /* @child_up: copy of priv->child_up taken at the time of transaction + start. The copy is taken so that we have a stable child_up array + through the phases of the transaction as priv->child_up[i] can keep + changing through time. + */ + unsigned char *child_up; + + /* @read_attempted: + array of flags representing subvolumes where read operations of + the read transaction have already been attempted. The array is + first pre-filled with down subvolumes, and as reads are performed + on other subvolumes, those are set as well. This way if the read + operation fails we do not retry on that subvolume again. + */ + unsigned char *read_attempted; - unsigned char *child_up; + /* @readfn: - int32_t *child_errno; + pointer to function which will perform the read operation on a given + subvolume. Used in read transactions. + */ - dict_t *xattr_req; + afr_read_txn_wind_t readfn; - int32_t inodelk_count; - int32_t entrylk_count; + /* @refreshed: - afr_internal_lock_t internal_lock; + the inode was "refreshed" (i.e, pending xattrs from all subvols + freshly inspected and inode ctx updated accordingly) as part of + this transaction already. + */ + gf_boolean_t refreshed; - afr_locked_fd_t *locked_fd; - int32_t source_child; - int32_t lock_recovery_child; + /* @inode: - dict_t *dict; + the inode on which the read txn is performed on. ref'ed and copied + from either fd->inode or loc.inode + */ - int (*openfd_flush_cbk) (call_frame_t *frame, xlator_t *this); + inode_t *inode; - /* - This struct contains the arguments for the "continuation" - (scheme-like) of fops + /* @parent[2]: + + parent inode[s] on which directory transactions are performed. */ - int op; - struct { - struct { - unsigned char buf_set; - struct statvfs buf; - } statfs; + inode_t *parent; + inode_t *parent2; - struct { - inode_t *inode; - struct iatt buf; - struct iatt read_child_buf; - struct iatt postparent; - ino_t ino; - uint64_t gen; - ino_t parent_ino; - dict_t *xattr; - dict_t **xattrs; - gf_boolean_t is_revalidate; - } lookup; + /* @readable: - struct { - int32_t flags; - int32_t wbflags; - } open; + array of flags representing servers from which a read can be + performed. This is the output of afr_inode_refresh() + */ + unsigned char *readable; - struct { - int32_t cmd; - struct gf_flock user_flock; - struct gf_flock ret_flock; - unsigned char *locked_nodes; - } lk; + afr_inode_refresh_cbk_t refreshfn; - /* inode read */ + /* @refreshinode: - struct { - int32_t mask; - int last_tried; /* index of the child we tried previously */ - } access; + Inode currently getting refreshed. + */ + inode_t *refreshinode; - struct { - int last_tried; - ino_t ino; - } stat; + /* + @pre_op_compat: - struct { - int last_tried; - ino_t ino; - } fstat; + compatibility mode of pre-op. send a separate pre-op and + op operations as part of transaction, rather than combining + */ - struct { - size_t size; - int last_tried; - ino_t ino; - } readlink; + gf_boolean_t pre_op_compat; - struct { - char *name; - int last_tried; - } getxattr; + dict_t *xattr_req; - struct { - ino_t ino; - size_t size; - off_t offset; - int last_tried; - } readv; + afr_internal_lock_t internal_lock; - /* dir read */ + dict_t *dict; - struct { - int success_count; - int32_t op_ret; - int32_t op_errno; + int optimistic_change_log; + gf_boolean_t delayed_post_op; - uint32_t *checksum; - } opendir; + /* Is the current writev() going to perform a stable write? + i.e, is fd->flags or @flags writev param have O_SYNC or + O_DSYNC? + */ + gf_boolean_t stable_write; - struct { - int32_t op_ret; - int32_t op_errno; - size_t size; - off_t offset; + /* This write appended to the file. Nnot necessarily O_APPEND, + just means the offset of write was at the end of file. + */ + gf_boolean_t append_write; - gf_boolean_t failed; - int last_tried; - } readdir; + /* + This struct contains the arguments for the "continuation" + (scheme-like) of fops + */ - struct { - int32_t op_ret; - int32_t op_errno; + struct { + struct { + unsigned char buf_set; + struct statvfs buf; + } statfs; - size_t size; - off_t offset; - int32_t flag; + struct { + int32_t flags; + } open; - int last_tried; - } getdents; + struct { + int32_t cmd; + struct gf_flock user_flock; + struct gf_flock ret_flock; + unsigned char *locked_nodes; + } lk; - /* inode write */ + /* inode read */ - struct { - ino_t ino; - struct iatt prebuf; - struct iatt postbuf; + struct { + int32_t mask; + int last_index; /* index of the child we tried previously */ + } access; - int32_t op_ret; + struct { + int last_index; + } stat; - struct iovec *vector; - struct iobref *iobref; - int32_t count; - off_t offset; - } writev; + struct { + int last_index; + } fstat; + + struct { + size_t size; + int last_index; + } readlink; + + struct { + char *name; + int last_index; + long xattr_len; + } getxattr; + + struct { + size_t size; + off_t offset; + int last_index; + uint32_t flags; + } readv; + + /* dir read */ + + struct { + int success_count; + int32_t op_ret; + int32_t op_errno; + + uint32_t *checksum; + } opendir; + + struct { + int32_t op_ret; + int32_t op_errno; + size_t size; + off_t offset; + dict_t *dict; + gf_boolean_t failed; + int last_index; + } readdir; + /* inode write */ struct { - ino_t ino; struct iatt prebuf; struct iatt postbuf; - } fsync; + } inode_wfop; //common structure for all inode-write-fops - struct { - ino_t ino; - off_t offset; - struct iatt prebuf; - struct iatt postbuf; - } truncate; + struct { + int32_t op_ret; - struct { - ino_t ino; - off_t offset; - struct iatt prebuf; - struct iatt postbuf; - } ftruncate; + struct iovec *vector; + struct iobref *iobref; + int32_t count; + off_t offset; + uint32_t flags; + } writev; - struct { - ino_t ino; - struct iatt in_buf; + struct { + off_t offset; + } truncate; + + struct { + off_t offset; + } ftruncate; + + struct { + struct iatt in_buf; int32_t valid; - struct iatt preop_buf; - struct iatt postop_buf; - } setattr; + } setattr; - struct { - ino_t ino; - struct iatt in_buf; + struct { + struct iatt in_buf; int32_t valid; - struct iatt preop_buf; - struct iatt postop_buf; - } fsetattr; + } fsetattr; - struct { - dict_t *dict; - int32_t flags; - } setxattr; + struct { + dict_t *dict; + int32_t flags; + } setxattr; - struct { - char *name; - } removexattr; + struct { + dict_t *dict; + int32_t flags; + } fsetxattr; - /* dir write */ + struct { + char *name; + } removexattr; - struct { - ino_t ino; - uint64_t gen; - ino_t parent_ino; - fd_t *fd; - dict_t *params; - int32_t flags; - mode_t mode; - inode_t *inode; - struct iatt buf; + struct { + dict_t *xattr; + } xattrop; + + struct { + dict_t *xattr; + } fxattrop; + + /* dir write */ + + struct { + inode_t *inode; + struct iatt buf; struct iatt preparent; struct iatt postparent; - struct iatt read_child_buf; - } create; + struct iatt prenewparent; + struct iatt postnewparent; + } dir_fop; //common structure for all dir fops - struct { - ino_t ino; - uint64_t gen; - ino_t parent_ino; - dev_t dev; - mode_t mode; + struct { + fd_t *fd; dict_t *params; - inode_t *inode; - struct iatt buf; - struct iatt preparent; - struct iatt postparent; - struct iatt read_child_buf; - } mknod; + int32_t flags; + mode_t mode; + } create; - struct { - ino_t ino; - uint64_t gen; - ino_t parent_ino; - int32_t mode; + struct { + dev_t dev; + mode_t mode; dict_t *params; - inode_t *inode; - struct iatt buf; - struct iatt read_child_buf; - struct iatt preparent; - struct iatt postparent; - } mkdir; + } mknod; - struct { - ino_t parent_ino; - int32_t op_ret; - int32_t op_errno; - struct iatt preparent; - struct iatt postparent; - } unlink; + struct { + int32_t mode; + dict_t *params; + } mkdir; - struct { - int flags; - ino_t parent_ino; - int32_t op_ret; - int32_t op_errno; - struct iatt preparent; - struct iatt postparent; - } rmdir; + struct { + int flags; + } rmdir; - struct { - ino_t oldparent_ino; - ino_t newparent_ino; - ino_t ino; - struct iatt buf; - struct iatt read_child_buf; - struct iatt preoldparent; - struct iatt prenewparent; - struct iatt postoldparent; - struct iatt postnewparent; - } rename; + struct { + dict_t *params; + char *linkpath; + } symlink; struct { - ino_t ino; - uint64_t gen; - ino_t parent_ino; - inode_t *inode; - struct iatt buf; - struct iatt read_child_buf; - struct iatt preparent; - struct iatt postparent; - } link; + int32_t mode; + off_t offset; + size_t len; + } fallocate; struct { - ino_t ino; - uint64_t gen; - ino_t parent_ino; - inode_t *inode; - dict_t *params; - struct iatt buf; - struct iatt read_child_buf; - char *linkpath; - struct iatt preparent; - struct iatt postparent; - } symlink; + off_t offset; + size_t len; + } discard; - struct { - int32_t flags; - dir_entry_t *entries; - int32_t count; - } setdents; - } cont; + struct { + off_t offset; + off_t len; + struct iatt prebuf; + struct iatt postbuf; + } zerofill; - struct { - off_t start, len; - char *basename; - char *new_basename; + } cont; - loc_t parent_loc; - loc_t new_parent_loc; + struct { + off_t start, len; - afr_transaction_type type; + gf_boolean_t eager_lock_on; + int *eager_lock; - int success_count; - int erase_pending; - int failure_count; + char *basename; + char *new_basename; - int last_tried; - int32_t *child_errno; + loc_t parent_loc; + loc_t new_parent_loc; - call_frame_t *main_frame; + afr_transaction_type type; - int (*fop) (call_frame_t *frame, xlator_t *this); + /* stub to resume on destruction + of the transaction frame */ + call_stub_t *resume_stub; - int (*done) (call_frame_t *frame, xlator_t *this); + struct list_head eager_locked; - int (*resume) (call_frame_t *frame, xlator_t *this); + unsigned char *pre_op; - int (*unwind) (call_frame_t *frame, xlator_t *this); + /* @fop_subvols: subvolumes on which FOP will be attempted */ + unsigned char *fop_subvols; - /* post-op hook */ - } transaction; + /* @failed_subvols: subvolumes on which FOP failed. Always + a subset of @fop_subvols */ + unsigned char *failed_subvols; - afr_self_heal_t self_heal; -} afr_local_t; + /* @dirtied: flag which indicates whether we set dirty flag + in the OP. Typically true when we are performing operation + on more than one subvol and optimistic changelog is disabled + A 'true' value set in @dirtied flag means an 'undirtying' + has to be done in POST-OP phase. + */ + gf_boolean_t dirtied; -typedef struct { - unsigned int *pre_op_done; - unsigned int *opened_on; /* which subvolumes the fd is open on */ - unsigned int *pre_op_piggyback; + /* @inherited: flag which indicates that the dirty flags + of the previous transaction were inherited + */ + gf_boolean_t inherited; - int flags; - int32_t wbflags; - uint64_t up_count; /* number of CHILD_UPs this fd has seen */ - uint64_t down_count; /* number of CHILD_DOWNs this fd has seen */ + /* + @no_uninherit: flag which indicates that a pre_op_uninherit() + must _not_ be attempted (and returned as failure) always. This + flag is set when a hard pre-op is performed, but not accounted + for it in fd_ctx->on_disk[]. Such transactions are "isolated" + from the pre-op piggybacking entirely and therefore uninherit + must not be attempted. + */ + gf_boolean_t no_uninherit; - int32_t last_tried; + /* @uninherit_done: + @uninherit_value: - int hit, miss; - gf_boolean_t failed_over; - struct list_head entries; /* needed for readdir failover */ + The above pair variables make pre_op_uninherit() idempotent. + Both are FALSE initially. The first call to pre_op_uninherit + sets @uninherit_done to TRUE and the return value to + @uninherit_value. Further calls will check for @uninherit_done + to be TRUE and if so will simply return @uninherit_value. + */ + gf_boolean_t uninherit_done; + gf_boolean_t uninherit_value; - unsigned char *locked_on; /* which subvolumes locks have been successful */ -} afr_fd_ctx_t; + /* @changelog_resume: function to be called after changlogging + (either pre-op or post-op) is done + */ + + afr_changelog_resume_t changelog_resume; + + call_frame_t *main_frame; + + int (*wind) (call_frame_t *frame, xlator_t *this, int subvol); + + int (*fop) (call_frame_t *frame, xlator_t *this); + int (*done) (call_frame_t *frame, xlator_t *this); + + int (*resume) (call_frame_t *frame, xlator_t *this); + + int (*unwind) (call_frame_t *frame, xlator_t *this); + + /* post-op hook */ + } transaction; -/* try alloc and if it fails, goto label */ -#define ALLOC_OR_GOTO(var, type, label) do { \ - var = GF_CALLOC (sizeof (type), 1, \ - gf_afr_mt_##type); \ - if (!var) { \ - gf_log (this->name, GF_LOG_ERROR, \ - "out of memory :("); \ - op_errno = ENOMEM; \ - goto label; \ - } \ - } while (0); + syncbarrier_t barrier; + + struct marker_str marker; + + /* extra data for fops */ + dict_t *xdata_req; + dict_t *xdata_rsp; + + mode_t umask; + int xflag; + gf_boolean_t do_discovery; + struct afr_reply *replies; +} afr_local_t; /* did a call fail due to a child failing? */ -#define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \ - ((op_errno == ENOTCONN) || \ - (op_errno == EBADFD))) +#define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \ + ((op_errno == ENOTCONN) || \ + (op_errno == EBADFD))) + +int +afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, + unsigned char *data_subvols, + unsigned char *metadata_subvols, + int *event_generation); +int +__afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, + unsigned char *data_subvols, + unsigned char *metadata_subvols, + int *event_generation); + +int +__afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, + unsigned char *data_subvols, + unsigned char *metadata_subvol, + int event_generation); +int +afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, + unsigned char *data_subvols, + unsigned char *metadata_subvols, + int event_generation); -#define afr_fop_failed(op_ret, op_errno) ((op_ret) == -1) +int +afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this); -/* have we tried all children? */ -#define all_tried(i, count) ((i) == (count) - 1) +int +afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this, + unsigned char *readable); -int32_t -afr_set_dict_gfid (dict_t *dict, uuid_t gfid); +int +afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this, + unsigned char *readable, int *event_p, + int type); +int +afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p, + int *event_p, afr_transaction_type type); + +#define afr_data_subvol_get(i, t, s, e) \ + afr_read_subvol_get(i, t, s, e, AFR_DATA_TRANSACTION) + +#define afr_metadata_subvol_get(i, t, s, e) \ + afr_read_subvol_get(i, t, s, e, AFR_METADATA_TRANSACTION) int -pump_command_reply (call_frame_t *frame, xlator_t *this); +afr_inode_refresh (call_frame_t *frame, xlator_t *this, inode_t *inode, + afr_inode_refresh_cbk_t cbk); int32_t -afr_notify (xlator_t *this, int32_t event, - void *data, ...); +afr_notify (xlator_t *this, int32_t event, void *data, void *data2); int -afr_attempt_lock_recovery (xlator_t *this, int32_t child_index); +afr_init_entry_lockee (afr_entry_lockee_t *lockee, afr_local_t *local, + loc_t *loc, char *basename, int child_count); + +void +afr_entry_lockee_cleanup (afr_internal_lock_t *int_lock); int -afr_save_locked_fd (xlator_t *this, fd_t *fd); +afr_attempt_lock_recovery (xlator_t *this, int32_t child_index); int afr_mark_locked_nodes (xlator_t *this, fd_t *fd, unsigned char *locked_nodes); void -afr_set_lk_owner (call_frame_t *frame, xlator_t *this); +afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner); int afr_set_lock_number (call_frame_t *frame, xlator_t *this); - -loc_t * -lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2); - int32_t afr_unlock (call_frame_t *frame, xlator_t *this); @@ -705,32 +791,30 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this); int afr_internal_lock_finish (call_frame_t *frame, xlator_t *this); +int +afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom, + unsigned int child_count); -int pump_start (call_frame_t *frame, xlator_t *this); +int +__afr_fd_ctx_set (xlator_t *this, fd_t *fd); int afr_fd_ctx_set (xlator_t *this, fd_t *fd); -uint64_t -afr_read_child (xlator_t *this, inode_t *inode); - -void -afr_set_read_child (xlator_t *this, inode_t *inode, int32_t read_child); - -void -afr_build_parent_loc (loc_t *parent, loc_t *child); +afr_fd_ctx_t * +afr_fd_ctx_get (fd_t *fd, xlator_t *this); int -afr_up_children_count (int child_count, unsigned char *child_up); +afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno); int afr_locked_nodes_count (unsigned char *locked_nodes, int child_count); -ino64_t -afr_itransform (ino64_t ino, int child_count, int child_index); - int -afr_deitransform (ino64_t ino, int child_count); +afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode); + +void +afr_replies_wipe (afr_local_t *local, afr_private_t *priv); void afr_local_cleanup (afr_local_t *local, xlator_t *this); @@ -738,21 +822,9 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this); int afr_frame_return (call_frame_t *frame); -uint64_t -afr_is_split_brain (xlator_t *this, inode_t *inode); - -void -afr_set_split_brain (xlator_t *this, inode_t *inode, gf_boolean_t set); - int afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - fd_t *fd, int32_t wbflags); - -void -afr_set_opendir_done (xlator_t *this, inode_t *inode); - -uint64_t -afr_is_opendir_done (xlator_t *this, inode_t *inode); + fd_t *fd, dict_t *xdata); void afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this); @@ -760,163 +832,145 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this); int afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd); -int -afr_openfd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd); - -#define AFR_STACK_UNWIND(fop, frame, params ...) \ - do { \ - afr_local_t *__local = NULL; \ - xlator_t *__this = NULL; \ - if (frame) { \ - __local = frame->local; \ - __this = frame->this; \ - frame->local = NULL; \ +#define AFR_STACK_UNWIND(fop, frame, params ...) \ + do { \ + afr_local_t *__local = NULL; \ + xlator_t *__this = NULL; \ + if (frame) { \ + __local = frame->local; \ + __this = frame->this; \ + frame->local = NULL; \ + } \ + STACK_UNWIND_STRICT (fop, frame, params); \ + if (__local) { \ + afr_local_cleanup (__local, __this); \ + mem_put (__local); \ + } \ + } while (0) + +#define AFR_STACK_DESTROY(frame) \ + do { \ + afr_local_t *__local = NULL; \ + xlator_t *__this = NULL; \ + __local = frame->local; \ + __this = frame->this; \ + frame->local = NULL; \ + STACK_DESTROY (frame->root); \ + if (__local) { \ + afr_local_cleanup (__local, __this); \ + mem_put (__local); \ } \ - STACK_UNWIND_STRICT (fop, frame, params); \ - afr_local_cleanup (__local, __this); \ - GF_FREE (__local); \ } while (0); -#define AFR_STACK_DESTROY(frame) \ - do { \ - afr_local_t *__local = NULL; \ - xlator_t *__this = NULL; \ - __local = frame->local; \ - __this = frame->this; \ - frame->local = NULL; \ - STACK_DESTROY (frame->root); \ - afr_local_cleanup (__local, __this); \ - GF_FREE (__local); \ - } while (0); +#define AFR_FRAME_INIT(frame, op_errno) \ + ({frame->local = mem_get0 (THIS->local_pool); \ + if (afr_local_init (frame->local, THIS->private, &op_errno)) { \ + afr_local_cleanup (frame->local, THIS); \ + mem_put (frame->local); \ + frame->local = NULL; }; \ + frame->local;}) + +#define AFR_STACK_RESET(frame) do { int opr; STACK_RESET (frame->root); AFR_FRAME_INIT(frame, opr);} while (0) /* allocate and return a string that is the basename of argument */ static inline char * AFR_BASENAME (const char *str) { - char *__tmp_str = NULL; - char *__basename_str = NULL; - __tmp_str = gf_strdup (str); - __basename_str = gf_strdup (basename (__tmp_str)); - GF_FREE (__tmp_str); - return __basename_str; + char *__tmp_str = NULL; + char *__basename_str = NULL; + __tmp_str = gf_strdup (str); + __basename_str = gf_strdup (basename (__tmp_str)); + GF_FREE (__tmp_str); + return __basename_str; } -/* initialize local_t */ -static inline int -AFR_LOCAL_INIT (afr_local_t *local, afr_private_t *priv) -{ - local->child_up = GF_CALLOC (sizeof (*local->child_up), - priv->child_count, - gf_afr_mt_char); - if (!local->child_up) { - return -ENOMEM; - } +call_frame_t * +afr_copy_frame (call_frame_t *base); - memcpy (local->child_up, priv->child_up, - sizeof (*local->child_up) * priv->child_count); +int +afr_transaction_local_init (afr_local_t *local, xlator_t *this); +int32_t +afr_marker_getxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name,afr_local_t *local, afr_private_t *priv ); - local->call_count = afr_up_children_count (priv->child_count, local->child_up); - if (local->call_count == 0) - return -ENOTCONN; +int +afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno); - local->transaction.erase_pending = 1; +int +afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count, + transaction_lk_type_t lk_type); - local->op_ret = -1; - local->op_errno = EUCLEAN; +int +afr_higher_errno (int32_t old_errno, int32_t new_errno); - local->internal_lock.lock_op_ret = -1; - local->internal_lock.lock_op_errno = EUCLEAN; +int +afr_final_errno (afr_local_t *local, afr_private_t *priv); +int +afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req); - return 0; -} +void +afr_fix_open (fd_t *fd, xlator_t *this); +afr_fd_ctx_t * +afr_fd_ctx_get (fd_t *fd, xlator_t *this); -/** - * first_up_child - return the index of the first child that is up - */ +void +afr_set_low_priority (call_frame_t *frame); +int +afr_child_fd_ctx_set (xlator_t *this, fd_t *fd, int32_t child, + int flags); -static inline int -afr_first_up_child (afr_private_t *priv) -{ - xlator_t ** children = NULL; - int ret = -1; - int i = 0; - - LOCK (&priv->lock); - { - children = priv->children; - for (i = 0; i < priv->child_count; i++) { - if (priv->child_up[i]) { - ret = i; - break; - } - } - } - UNLOCK (&priv->lock); - - return ret; -} +gf_boolean_t +afr_have_quorum (char *logname, afr_private_t *priv); +void +afr_matrix_cleanup (int32_t **pending, unsigned int m); -static inline int -afr_transaction_local_init (afr_local_t *local, afr_private_t *priv) -{ - int i; - - local->first_up_child = afr_first_up_child (priv); - - local->child_errno = GF_CALLOC (sizeof (*local->child_errno), - priv->child_count, - gf_afr_mt_int32_t); - if (!local->child_errno) { - return -ENOMEM; - } - - local->pending = GF_CALLOC (sizeof (*local->pending), - priv->child_count, - gf_afr_mt_int32_t); - - if (!local->pending) { - return -ENOMEM; - } - - for (i = 0; i < priv->child_count; i++) { - local->pending[i] = GF_CALLOC (sizeof (*local->pending[i]), - 3, /* data + metadata + entry */ - gf_afr_mt_int32_t); - if (!local->pending[i]) - return -ENOMEM; - } +int32_t** +afr_matrix_create (unsigned int m, unsigned int n); - local->internal_lock.inode_locked_nodes = - GF_CALLOC (sizeof (*local->internal_lock.inode_locked_nodes), - priv->child_count, - gf_afr_mt_char); +void +afr_filter_xattrs (dict_t *xattr); - local->internal_lock.entry_locked_nodes = - GF_CALLOC (sizeof (*local->internal_lock.entry_locked_nodes), - priv->child_count, - gf_afr_mt_char); +/* + * Special value indicating we should use the "auto" quorum method instead of + * a fixed value (including zero to turn off quorum enforcement). + */ +#define AFR_QUORUM_AUTO INT_MAX - local->internal_lock.locked_nodes = - GF_CALLOC (sizeof (*local->internal_lock.locked_nodes), - priv->child_count, - gf_afr_mt_char); +/* + * Having this as a macro will make debugging a bit weirder, but does reduce + * the probability of functions handling this check inconsistently. + */ +#define QUORUM_CHECK(_func,_label) do { \ + if (priv->quorum_count && !afr_have_quorum(this->name,priv)) { \ + gf_log(this->name,GF_LOG_WARNING, \ + "failing "#_func" due to lack of quorum"); \ + op_errno = EROFS; \ + goto _label; \ + } \ +} while (0); - local->internal_lock.lower_locked_nodes - = GF_CALLOC (sizeof (*local->internal_lock.lower_locked_nodes), - priv->child_count, - gf_afr_mt_char); +int +afr_fd_report_unstable_write (xlator_t *this, fd_t *fd); - local->transaction.child_errno = GF_CALLOC (sizeof (*local->transaction.child_errno), - priv->child_count, - gf_afr_mt_int32_t); +gf_boolean_t +afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd); - local->internal_lock.transaction_lk_type = AFR_TRANSACTION_LK; +void +afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub); - return 0; -} +int +afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count); + +void +afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this); + +int +afr_local_pathinfo (char *pathinfo, gf_boolean_t *is_local); +void +afr_remove_eager_lock_stub (afr_local_t *local); #endif /* __AFR_H__ */ |
