summaryrefslogtreecommitdiffstats
path: root/xlators/cluster/afr/src/afr.h
diff options
context:
space:
mode:
authorPranith Kumar K <pkarampu@redhat.com>2018-01-31 16:41:14 +0530
committerPranith Kumar Karampuri <pkarampu@redhat.com>2018-03-14 13:32:35 +0000
commit346714305f9de30d5f78494091770c1555c601bb (patch)
treeebdb744fd1858a98495e77069cb4e670b2ef87c6 /xlators/cluster/afr/src/afr.h
parentf32f85c4e6c8128643e1f88fe981a63680e79fe0 (diff)
cluster/afr: Make AFR eager-locking similar to EC
Problem: 1) Afr's eager-lock only works for data transactions. 2) When there are conflicting writes, a write with a conflicting region initiates unlock of the eager-lock, leading to extra pre-ops and post-ops on the file. When eager-lock goes off, it leads to extra fsyncs for random-write workloads in afr. Solution (that is modeled after EC): In EC, when there is a conflicting write, it waits for the current write to complete before it winds the conflicted write. This leads to better utilization of network and disk, because we will not be doing extra xattrops and FSYNCs and inodelk/unlock. Moved fd based counters to inode based counters. I tried to model the solution based on EC's locking, but it is not similar to AFR because we had to keep backward compatibility. Lifecycle of lock: ================== The first transaction is added to the inode->owners list and an inodelk will be sent on the wire. All the next transactions will be put in the inode->waiters list until the first transaction completes inodelk and [f]xattrop completely. Once [f]xattrop also completes, all the requests in the inode->waiters list are checked to see whether they conflict with any of the existing locks which are in the inode->owners list and, if not, are added to the inode->owners list and resumed with doing the transaction. When these transactions complete the fop phase they will be moved to the inode->post_op list and resume the transactions that were paused because of conflicts. Post-op and unlock will not be issued on the wire until that is the last transaction on that inode. The last transaction, when it has to perform post-op, can choose to sleep for the delayed-post-op-secs value. During that time if any other transaction comes, it will wake up the sleeping transaction and take over the ownership of the lock, and the cycle continues. If the delayed-post-op-secs expires, then the timer thread will wake up the sleeping transaction and it will set lock->release to true and start doing post-op and then unlock. 
During this time if any other transactions come, they will be put in the inode->frozen list. Once the previous unlock comes it will move the frozen list to the waiters list, move the first element from this waiters list to the owners list and attempt the lock, and the cycle continues. This is the general idea. There is logic at the time of delaying and at the time of a new transaction or in the flush fop to wake up existing sleeping transactions or to choose whether to delay a transaction etc, which is subject to change based on future enhancements etc. Fixes: #418 BUG: 1549606 Change-Id: I88b570bbcf332a27c82d2767dfa82472f60055dc Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
Diffstat (limited to 'xlators/cluster/afr/src/afr.h')
-rw-r--r--xlators/cluster/afr/src/afr.h97
1 files changed, 40 insertions, 57 deletions
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index dcaf2887173..b2f3af136bd 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -229,19 +229,12 @@ int
afr_entry_lockee_cmp (const void *l1, const void *l2);
typedef struct {
- char *domain; /* Domain on which inodelk is taken */
- struct gf_flock flock;
- unsigned char *locked_nodes;
- int32_t lock_count;
-} afr_inodelk_t;
-
-typedef struct {
loc_t *lk_loc;
int lockee_count;
afr_entry_lockee_t lockee[AFR_LOCKEE_COUNT_MAX];
- afr_inodelk_t inodelk[AFR_DOM_COUNT_MAX];
+ struct gf_flock flock;
const char *lk_basename;
const char *lower_basename;
const char *higher_basename;
@@ -254,7 +247,6 @@ typedef struct {
int32_t lock_count;
int32_t entrylk_lock_count;
- uint64_t lock_number;
int32_t lk_call_count;
int32_t lk_expected_count;
int32_t lk_attempted_count;
@@ -292,37 +284,9 @@ typedef enum {
} afr_fd_open_status_t;
typedef struct {
- unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS];
- int inherited[AFR_NUM_CHANGE_LOGS];
- int on_disk[AFR_NUM_CHANGE_LOGS];
afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */
-
- unsigned int *lock_piggyback;
- unsigned int *lock_acquired;
-
int flags;
- /* used for delayed-post-op optimization */
- pthread_mutex_t delay_lock;
- gf_timer_t *delay_timer;
- call_frame_t *delay_frame;
-
- /* set if any write on this fd was a non stable write
- (i.e, without O_SYNC or O_DSYNC)
- */
- gf_boolean_t witnessed_unstable_write;
-
- /* @open_fd_count:
- Number of open FDs queried from the server, as queried through
- xdata in FOPs. Currently, used to decide if eager-locking must be
- temporarily disabled.
- */
- uint32_t open_fd_count;
-
-
- /* list of frames currently in progress */
- struct list_head eager_locked;
-
/* the subvolume on which the latest sequence of readdirs (starting
at offset 0) has begun. Till the next readdir request with 0 offset
arrives, we continue to read off this subvol.
@@ -336,6 +300,20 @@ typedef enum {
AFR_FOP_LOCK_QUORUM_FAILED,
} afr_fop_lock_state_t;
+typedef struct _afr_inode_lock_t {
+ unsigned int event_generation;
+ gf_boolean_t release;
+ gf_boolean_t acquired;
+ gf_timer_t *delay_timer;
+ struct list_head owners; /*Transactions that are performing fop*/
+ struct list_head post_op;/*Transactions that are done with the fop
+ *So can not conflict with the fops*/
+ struct list_head waiting;/*Transaction that are waiting for
+ *conflicting transactions to complete*/
+ struct list_head frozen;/*Transactions that need to go as part of
+ * next batch of eager-lock*/
+} afr_lock_t;
+
typedef struct _afr_inode_ctx {
uint64_t read_subvol;
uint64_t write_subvol;
@@ -343,6 +321,23 @@ typedef struct _afr_inode_ctx {
int spb_choice;
gf_timer_t *timer;
gf_boolean_t need_refresh;
+ unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS];
+ int inherited[AFR_NUM_CHANGE_LOGS];
+ int on_disk[AFR_NUM_CHANGE_LOGS];
+
+ /* set if any write on this fd was a non stable write
+ (i.e, without O_SYNC or O_DSYNC)
+ */
+ gf_boolean_t witnessed_unstable_write;
+
+ /* @open_fd_count:
+ Number of open FDs queried from the server, as queried through
+ xdata in FOPs. Currently, used to decide if eager-locking must be
+ temporarily disabled.
+ */
+ uint32_t open_fd_count;
+ /*Only 2 types of transactions support eager-locks now. DATA/METADATA*/
+ afr_lock_t lock[2];
} afr_inode_ctx_t;
@@ -457,7 +452,6 @@ typedef struct _afr_local {
dict_t *dict;
int optimistic_change_log;
- gf_boolean_t delayed_post_op;
/* Is the current writev() going to perform a stable write?
i.e, is fd->flags or @flags writev param have O_SYNC or
@@ -693,7 +687,7 @@ typedef struct _afr_local {
off_t start, len;
gf_boolean_t eager_lock_on;
- int *eager_lock;
+ gf_boolean_t do_eager_unlock;
char *basename;
char *new_basename;
@@ -707,7 +701,8 @@ typedef struct _afr_local {
of the transaction frame */
call_stub_t *resume_stub;
- struct list_head eager_locked;
+ struct list_head owner_list;
+ struct list_head wait_list;
unsigned char *pre_op;
@@ -768,7 +763,8 @@ typedef struct _afr_local {
*/
afr_changelog_resume_t changelog_resume;
- call_frame_t *main_frame;
+ call_frame_t *main_frame; /*Fop frame*/
+ call_frame_t *frame; /*Transaction frame*/
int (*wind) (call_frame_t *frame, xlator_t *this, int subvol);
@@ -1009,7 +1005,7 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd);
afr_local_cleanup (frame->local, THIS); \
mem_put (frame->local); \
frame->local = NULL; }; \
- frame->local;})
+ frame->local; })
#define AFR_STACK_RESET(frame) \
do { \
@@ -1096,22 +1092,10 @@ afr_filter_xattrs (dict_t *xattr);
#define AFR_QUORUM_AUTO INT_MAX
int
-afr_fd_report_unstable_write (xlator_t *this, fd_t *fd);
+afr_fd_report_unstable_write (xlator_t *this, afr_local_t *local);
gf_boolean_t
-afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd);
-
-void
-afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub);
-
-int
-afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count);
-
-void
-afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this);
-
-void
-afr_remove_eager_lock_stub (afr_local_t *local);
+afr_fd_has_witnessed_unstable_write (xlator_t *this, inode_t *inode);
void
afr_reply_wipe (struct afr_reply *reply);
@@ -1225,5 +1209,4 @@ afr_write_subvol_reset (call_frame_t *frame, xlator_t *this);
int
afr_set_inode_local (xlator_t *this, afr_local_t *local, inode_t *inode);
-
#endif /* __AFR_H__ */