From 78d67da17356b48cf1d5a6595764650d5b200ba7 Mon Sep 17 00:00:00 2001 From: Sunil Kumar Acharya Date: Thu, 23 Mar 2017 12:50:41 +0530 Subject: cluster/ec: OpenFD heal implementation for EC Existing EC code doesn't try to heal the OpenFD to avoid unnecessary healing of the data later. Fix implements the healing of open FDs before carrying out file operations on them by making an attempt to open the FDs on required up nodes. BUG: 1431955 Change-Id: Ib696f59c41ffd8d5678a484b23a00bb02764ed15 Signed-off-by: Sunil Kumar Acharya --- xlators/cluster/ec/src/ec-common.c | 113 +++++++++++++++++++++ xlators/cluster/ec/src/ec-common.h | 3 + xlators/cluster/ec/src/ec-dir-read.c | 8 +- xlators/cluster/ec/src/ec-dir-write.c | 1 + xlators/cluster/ec/src/ec-helpers.c | 29 +++--- xlators/cluster/ec/src/ec-inode-read.c | 3 + xlators/cluster/ec/src/ec-types.h | 177 ++++++++++++++++++--------------- 7 files changed, 243 insertions(+), 91 deletions(-) (limited to 'xlators') diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c index b7088e54724..cb627a92c9c 100644 --- a/xlators/cluster/ec/src/ec-common.c +++ b/xlators/cluster/ec/src/ec-common.c @@ -26,6 +26,114 @@ EC_FLAG_WAITING_DATA_DIRTY |\ EC_FLAG_WAITING_METADATA_DIRTY) +void +ec_update_fd_status (fd_t *fd, xlator_t *xl, int idx, + int32_t ret_status) +{ + ec_fd_t *fd_ctx; + + if (fd == NULL) + return; + + LOCK (&fd->lock); + { + fd_ctx = __ec_fd_get(fd, xl); + if (fd_ctx) { + if (ret_status >= 0) + fd_ctx->fd_status[idx] = EC_FD_OPENED; + else + fd_ctx->fd_status[idx] = EC_FD_NOT_OPENED; + } + } + UNLOCK (&fd->lock); +} + +static int +ec_fd_ctx_need_open (fd_t *fd, xlator_t *this, uintptr_t *need_open) +{ + int i = 0; + int count = 0; + ec_t *ec = NULL; + ec_fd_t *fd_ctx = NULL; + + ec = this->private; + *need_open = 0; + + fd_ctx = ec_fd_get (fd, this); + if (!fd_ctx) + return count; + + LOCK (&fd->lock); + { + for (i = 0; i < ec->nodes; i++) { + if ((fd_ctx->fd_status[i] == EC_FD_NOT_OPENED) && + 
(ec->xl_up & (1<<i))) { + fd_ctx->fd_status[i] = EC_FD_OPENING; + *need_open |= (1<<i); + count++; + } + } + } + UNLOCK (&fd->lock); + + /* If fd needs to open on minimum number of nodes + * then ignore fixing the fd as it has been + * requested from heal operation. + */ + if (count >= ec->fragments) + count = 0; + + return count; +} + +static gf_boolean_t +ec_is_fd_fixable (fd_t *fd) +{ + if (!fd || !fd->inode) + return _gf_false; + else if (fd_is_anonymous (fd)) + return _gf_false; + else if (gf_uuid_is_null (fd->inode->gfid)) + return _gf_false; + + return _gf_true; +} + +static void +ec_fix_open (ec_fop_data_t *fop) +{ + int call_count = 0; + uintptr_t need_open = 0; + int ret = 0; + loc_t loc = {0, }; + + if (!ec_is_fd_fixable (fop->fd)) + goto out; + + /* Evaluate how many remote fd's to be opened */ + call_count = ec_fd_ctx_need_open (fop->fd, fop->xl, &need_open); + if (!call_count) + goto out; + + loc.inode = inode_ref (fop->fd->inode); + gf_uuid_copy (loc.gfid, fop->fd->inode->gfid); + ret = loc_path (&loc, NULL); + if (ret < 0) { + goto out; + } + + if (IA_IFDIR == fop->fd->inode->ia_type) { + ec_opendir(fop->frame, fop->xl, need_open, EC_MINIMUM_ONE, + NULL, NULL, &fop->loc[0], fop->fd, NULL); + } else{ + ec_open(fop->frame, fop->xl, need_open, EC_MINIMUM_ONE, + NULL, NULL, &loc, fop->fd->flags, fop->fd, NULL); + } + +out: + loc_wipe (&loc); +} + off_t ec_range_end_get (off_t fl_start, size_t fl_size) { @@ -1677,6 +1785,11 @@ void ec_lock_acquired(ec_lock_link_t *link) ec_lock_apply(link); + if (fop->use_fd && + (link->update[EC_DATA_TXN] || link->update[EC_METADATA_TXN])) { + ec_fix_open(fop); + } + ec_lock_resume_shared(&list); } diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h index 35c6a8107c2..c3e291585ef 100644 --- a/xlators/cluster/ec/src/ec-common.h +++ b/xlators/cluster/ec/src/ec-common.h @@ -140,4 +140,7 @@ int32_t ec_lock_unlocked(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata); +void +ec_update_fd_status (fd_t *fd,
xlator_t *xl, + int child_index, int32_t ret_status); #endif /* __EC_COMMON_H__ */ diff --git a/xlators/cluster/ec/src/ec-dir-read.c b/xlators/cluster/ec/src/ec-dir-read.c index 48afe54460f..b44bb4239b1 100644 --- a/xlators/cluster/ec/src/ec-dir-read.c +++ b/xlators/cluster/ec/src/ec-dir-read.c @@ -19,7 +19,11 @@ #include "ec-method.h" #include "ec-fops.h" -/* FOP: opendir */ +/**************************************************************** + * + * File Operation: opendir + * + ***************************************************************/ int32_t ec_combine_opendir(ec_fop_data_t * fop, ec_cbk_data_t * dst, ec_cbk_data_t * src) @@ -88,6 +92,8 @@ int32_t ec_opendir_cbk(call_frame_t * frame, void * cookie, xlator_t * this, } ec_combine(cbk, ec_combine_opendir); + + ec_update_fd_status (fd, this, idx, op_ret); } out: diff --git a/xlators/cluster/ec/src/ec-dir-write.c b/xlators/cluster/ec/src/ec-dir-write.c index 150dc66f21b..7779d4849f3 100644 --- a/xlators/cluster/ec/src/ec-dir-write.c +++ b/xlators/cluster/ec/src/ec-dir-write.c @@ -71,6 +71,7 @@ ec_dir_write_cbk (call_frame_t *frame, xlator_t *this, out: if (cbk) ec_combine (cbk, ec_combine_write); + if (fop) ec_complete (fop); return 0; diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c index fe610748f0f..83f96ba6cb2 100644 --- a/xlators/cluster/ec/src/ec-helpers.c +++ b/xlators/cluster/ec/src/ec-helpers.c @@ -764,27 +764,32 @@ ec_inode_t * ec_inode_get(inode_t * inode, xlator_t * xl) ec_fd_t * __ec_fd_get(fd_t * fd, xlator_t * xl) { + int i = 0; ec_fd_t * ctx = NULL; uint64_t value = 0; + ec_t *ec = xl->private; - if ((__fd_ctx_get(fd, xl, &value) != 0) || (value == 0)) - { - ctx = GF_MALLOC(sizeof(*ctx), ec_mt_ec_fd_t); - if (ctx != NULL) - { + if ((__fd_ctx_get(fd, xl, &value) != 0) || (value == 0)) { + ctx = GF_MALLOC(sizeof(*ctx) + (sizeof (ec_fd_status_t) * ec->nodes), + ec_mt_ec_fd_t); + if (ctx != NULL) { memset(ctx, 0, sizeof(*ctx)); - value = 
(uint64_t)(uintptr_t)ctx; - if (__fd_ctx_set(fd, xl, value) != 0) - { - GF_FREE(ctx); + for (i = 0; i < ec->nodes; i++) { + if (fd_is_anonymous (fd)) { + ctx->fd_status[i] = EC_FD_OPENED; + } else { + ctx->fd_status[i] = EC_FD_NOT_OPENED; + } + } + value = (uint64_t)(uintptr_t)ctx; + if (__fd_ctx_set(fd, xl, value) != 0) { + GF_FREE (ctx); return NULL; } } - } - else - { + } else { ctx = (ec_fd_t *)(uintptr_t)value; } diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c index 03690ab8e96..d58ed9e5795 100644 --- a/xlators/cluster/ec/src/ec-inode-read.c +++ b/xlators/cluster/ec/src/ec-inode-read.c @@ -749,6 +749,9 @@ int32_t ec_open_cbk(call_frame_t * frame, void * cookie, xlator_t * this, } ec_combine(cbk, ec_combine_open); + + ec_update_fd_status (fd, this, idx, op_ret); + } out: diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h index 7b2b7b8247d..23dc434bc42 100644 --- a/xlators/cluster/ec/src/ec-types.h +++ b/xlators/cluster/ec/src/ec-types.h @@ -145,6 +145,13 @@ enum _ec_stripe_part { EC_STRIPE_TAIL }; +/* Enumerations to indicate FD status.
*/ +typedef enum { + EC_FD_NOT_OPENED, + EC_FD_OPENED, + EC_FD_OPENING +} ec_fd_status_t; + struct _ec_config { uint32_t version; uint8_t algorithm; @@ -158,6 +165,7 @@ struct _ec_fd { loc_t loc; uintptr_t open; int32_t flags; + ec_fd_status_t fd_status[0]; }; struct _ec_stripe { @@ -309,75 +317,82 @@ struct _ec_fragment_range { the bricks (offset on brick) */ }; +/* EC xlator data structure to collect all the data required to perform + * the file operation.*/ struct _ec_fop_data { - int32_t id; - int32_t refs; - int32_t state; - int32_t minimum; - int32_t expected; - int32_t winds; - int32_t jobs; - int32_t error; - ec_fop_data_t *parent; - xlator_t *xl; - call_frame_t *req_frame; /* frame of the calling xlator */ - call_frame_t *frame; /* frame used by this fop */ - struct list_head cbk_list; /* sorted list of groups of answers */ - struct list_head answer_list; /* list of answers */ - struct list_head pending_list; /* member of ec_t.pending_fops */ - ec_cbk_data_t *answer; /* accepted answer */ - int32_t lock_count; - int32_t locked; - ec_lock_link_t locks[2]; - int32_t first_lock; - gf_lock_t lock; - - uint32_t flags; - uint32_t first; - uintptr_t mask; - uintptr_t healing; /*Dispatch is done but call is successful - only if fop->minimum number of subvolumes - succeed which are not healing*/ - uintptr_t remaining; - uintptr_t received; /* Mask of responses */ - uintptr_t good; - - uid_t uid; - gid_t gid; - - ec_wind_f wind; - ec_handler_f handler; - ec_resume_f resume; - ec_cbk_t cbks; - void *data; - ec_heal_t *heal; - struct list_head healer; - - uint64_t user_size; - uint32_t head; - - int32_t use_fd; - - dict_t *xdata; - dict_t *dict; - int32_t int32; - uint32_t uint32; - uint64_t size; - off_t offset; - mode_t mode[2]; - entrylk_cmd entrylk_cmd; - entrylk_type entrylk_type; - gf_xattrop_flags_t xattrop_flags; - dev_t dev; - inode_t *inode; - fd_t *fd; - struct iatt iatt; - char *str[2]; - loc_t loc[2]; - struct gf_flock flock; - struct iovec *vector; - 
struct iobref *buffers; - gf_seek_what_t seek; + int32_t id; /* ID of the file operation */ + int32_t refs; + int32_t state; + int32_t minimum; /* Minimum number of successful + operations required to conclude a + fop as successful */ + int32_t expected; + int32_t winds; + int32_t jobs; + int32_t error; + ec_fop_data_t *parent; + xlator_t *xl; /* points to EC xlator */ + call_frame_t *req_frame; /* frame of the calling xlator */ + call_frame_t *frame; /* frame used by this fop */ + struct list_head cbk_list; /* sorted list of groups of answers */ + struct list_head answer_list; /* list of answers */ + struct list_head pending_list; /* member of ec_t.pending_fops */ + ec_cbk_data_t *answer; /* accepted answer */ + int32_t lock_count; + int32_t locked; + ec_lock_link_t locks[2]; + int32_t first_lock; + gf_lock_t lock; + + uint32_t flags; + uint32_t first; + uintptr_t mask; + uintptr_t healing; /*Dispatch is done but call is successful only + if fop->minimum number of subvolumes succeed + which are not healing*/ + uintptr_t remaining; + uintptr_t received; /* Mask of responses */ + uintptr_t good; + + uid_t uid; + gid_t gid; + + ec_wind_f wind; /* Function to wind to */ + ec_handler_f handler; /* FOP manager function */ + ec_resume_f resume; + ec_cbk_t cbks; /* Callback function for this FOP */ + void *data; + ec_heal_t *heal; + struct list_head healer; + + uint64_t user_size; + uint32_t head; + + int32_t use_fd; /* Indicates whether this FOP uses FD or + not */ + + dict_t *xdata; + dict_t *dict; + int32_t int32; + uint32_t uint32; + uint64_t size; + off_t offset; + mode_t mode[2]; + entrylk_cmd entrylk_cmd; + entrylk_type entrylk_type; + gf_xattrop_flags_t xattrop_flags; + dev_t dev; + inode_t *inode; + fd_t *fd; /* FD of the file on which FOP is + being carried out */ + struct iatt iatt; + char *str[2]; + loc_t loc[2]; /* Holds the location details for + the file */ + struct gf_flock flock; + struct iovec *vector; + struct iobref *buffers; + gf_seek_what_t seek;
ec_fragment_range_t frag_range; /* This will hold the range of stripes affected by the fop. */ }; @@ -623,18 +638,24 @@ struct _ec { xlator_t *xl; int32_t healers; int32_t heal_waiters; - int32_t nodes; + int32_t nodes; /* Total number of bricks(n) */ int32_t bits_for_nodes; - int32_t fragments; - int32_t redundancy; - uint32_t fragment_size; - uint32_t stripe_size; - int32_t up; + int32_t fragments; /* Data bricks(k) */ + int32_t redundancy; /* Redundant bricks(m) */ + uint32_t fragment_size; /* Size of fragment/chunk on a + brick. */ + uint32_t stripe_size; /* (fragment_size * fragments) + maximum size of user data + stored in one stripe. */ + int32_t up; /* Represents whether EC volume is + up or not. */ uint32_t idx; - uint32_t xl_up_count; - uintptr_t xl_up; - uint32_t xl_notify_count; - uintptr_t xl_notify; + uint32_t xl_up_count; /* Number of UP bricks. */ + uintptr_t xl_up; /* Bit flag representing UP + bricks */ + uint32_t xl_notify_count; /* Number of notifications. */ + uintptr_t xl_notify; /* Bit flag representing + notification for bricks. */ uintptr_t node_mask; xlator_t **xl_list; gf_lock_t lock; -- cgit