diff options
Diffstat (limited to 'xlators/cluster/dht/src')
| -rw-r--r-- | xlators/cluster/dht/src/dht-common.c | 59 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-common.h | 41 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-helper.c | 154 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-inode-read.c | 58 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-inode-write.c | 197 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-layout.c | 35 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-rebalance.c | 78 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-selfheal.c | 33 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht.c | 1 |
9 files changed, 418 insertions, 238 deletions
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index b1fee655b..8f61339e6 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -22,6 +22,7 @@ #include "dht-common.h" #include "defaults.h" #include "byte-order.h" +#include "glusterfs-acl.h" #include <sys/time.h> #include <libgen.h> @@ -154,7 +155,6 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame) int ret = -1; dht_layout_t *layout = NULL; dht_conf_t *conf = NULL; - uint32_t missing = 0; local = discover_frame->local; layout = local->layout; @@ -191,33 +191,20 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame) goto out; } } else { - ret = dht_layout_normalize (this, &local->loc, layout, - &missing); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "normalizing failed on %s (internal error)", - local->loc.path); - op_errno = EIO; - goto out; - } - if (missing == conf->subvolume_cnt) { + ret = dht_layout_normalize (this, &local->loc, layout); + if ((ret < 0) || ((ret > 0) && (local->op_ret != 0))) { + /* either the layout is incorrect or the directory is + * not found even in one subvolume. + */ gf_log (this->name, GF_LOG_DEBUG, - "normalizing failed on %s, ENOENT errors: %u)", - local->loc.path, missing); - op_errno = ENOENT; - goto out; - } - if (ret != 0) { - gf_log (this->name, GF_LOG_WARNING, "normalizing failed on %s " - "(overlaps/holes present)", local->loc.path); - /* We may need to do the lookup again */ - /* in discover call, parent is not know, and basename - * of entry is also not available. Without which we - * cannot build a layout correctly to heal it. Hence - * returning ESTALE */ - op_errno = ESTALE; - goto out; + "(overlaps/holes present: %s, " + "ENOENT errors: %d)", local->loc.path, + (ret < 0) ? "yes" : "no", (ret > 0) ? ret : 0); + if ((ret > 0) && (ret == conf->subvolume_cnt)) { + op_errno = ESTALE; + goto out; + } } if (local->inode) @@ -421,7 +408,6 @@ dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, dht_layout_t *layout = NULL; int ret = -1; int is_dir = 0; - uint32_t missing = 0; GF_VALIDATE_OR_GOTO ("dht", frame, out); GF_VALIDATE_OR_GOTO ("dht", this, out); @@ -455,7 +441,7 @@ dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, op_ret, op_errno, xattr); if (op_ret == -1) { - local->op_errno = ENOENT; + local->op_errno = op_errno; gf_log (this->name, GF_LOG_DEBUG, "lookup of %s on %s returned error (%s)", local->loc.path, prev->this->name, @@ -502,16 +488,9 @@ unlock: } if (local->op_ret == 0) { - ret = dht_layout_normalize (this, &local->loc, layout, - &missing); - - /* - * Arguably, we shouldn't do self-heal just because - * bricks are missing as long as there are no other - * anomalies. For now, though, just preserve the - * existing behavior. - */ - if ((ret != 0) || (missing != 0)) { + ret = dht_layout_normalize (this, &local->loc, layout); + + if (ret != 0) { gf_log (this->name, GF_LOG_DEBUG, "fixing assignment on %s", local->loc.path); @@ -5194,8 +5173,8 @@ unlock: * not need to handle CHILD_DOWN event here. */ if (conf->defrag) { - ret = pthread_create (&conf->defrag->th, NULL, - gf_defrag_start, this); + ret = gf_thread_create (&conf->defrag->th, NULL, + gf_defrag_start, this); if (ret) { conf->defrag = NULL; GF_FREE (conf->defrag); diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index 6ba18339b..5ccd66799 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -212,6 +212,10 @@ enum gf_defrag_status_t { GF_DEFRAG_STATUS_STOPPED, GF_DEFRAG_STATUS_COMPLETE, GF_DEFRAG_STATUS_FAILED, + GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED, + GF_DEFRAG_STATUS_LAYOUT_FIX_STOPPED, + GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE, + GF_DEFRAG_STATUS_LAYOUT_FIX_FAILED, }; typedef enum gf_defrag_status_t gf_defrag_status_t; @@ -396,26 +400,18 @@ typedef enum { } while (0) #define is_greater_time(a, an, b, bn) (((a) < (b)) || (((a) == (b)) && ((an) < (bn)))) - -dht_layout_t *dht_layout_new (xlator_t *this, int cnt); -dht_layout_t *dht_layout_get (xlator_t *this, inode_t *inode); -dht_layout_t *dht_layout_for_subvol (xlator_t *this, xlator_t *subvol); -xlator_t *dht_layout_search (xlator_t *this, dht_layout_t *layout, - const char *name); -int dht_layout_normalize (xlator_t *this, loc_t *loc, - dht_layout_t *layout, - uint32_t *missing_p); -int dht_layout_anomalies (xlator_t *this, loc_t *loc, - dht_layout_t *layout, - uint32_t *holes_p, - uint32_t *overlaps_p, - uint32_t *missing_p, - uint32_t *down_p, - uint32_t *misc_p, - uint32_t *no_space_p); -int dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, - xlator_t *subvol, loc_t *loc, - dict_t *xattr); +dht_layout_t *dht_layout_new (xlator_t *this, int cnt); +dht_layout_t *dht_layout_get (xlator_t *this, inode_t *inode); +dht_layout_t *dht_layout_for_subvol (xlator_t *this, xlator_t *subvol); +xlator_t *dht_layout_search (xlator_t *this, dht_layout_t *layout, + const char *name); +int dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout); +int dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout, + uint32_t *holes_p, uint32_t *overlaps_p, + uint32_t *missing_p, uint32_t *down_p, + uint32_t *misc_p, uint32_t *no_space_p); +int dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, + xlator_t *subvol, loc_t *loc, dict_t *xattr); xlator_t *dht_linkfile_subvol (xlator_t *this, inode_t *inode, struct iatt *buf, dict_t *xattr); @@ -699,6 +695,8 @@ int32_t dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, off_t offset, size_t len, dict_t *xdata); int32_t dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, size_t len, dict_t *xdata); +int32_t dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, size_t len, dict_t *xdata); int32_t dht_init (xlator_t *this); void dht_fini (xlator_t *this); @@ -783,4 +781,7 @@ dht_priv_dump (xlator_t *this); int32_t dht_inodectx_dump (xlator_t *this, inode_t *inode); +int +dht_inode_ctx_get1 (xlator_t *this, inode_t *inode, xlator_t **subvol); + #endif/* _DHT_H */ diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c index 1cf343d24..311a48112 100644 --- a/xlators/cluster/dht/src/dht-helper.c +++ b/xlators/cluster/dht/src/dht-helper.c @@ -18,6 +18,28 @@ #include "xlator.h" #include "dht-common.h" +static inline int +dht_inode_ctx_set1 (xlator_t *this, inode_t *inode, xlator_t *subvol) +{ + uint64_t tmp_subvol = 0; + + tmp_subvol = (long)subvol; + return inode_ctx_set1 (inode, this, &tmp_subvol); +} + +int +dht_inode_ctx_get1 (xlator_t *this, inode_t *inode, xlator_t **subvol) +{ + int ret = -1; + uint64_t tmp_subvol = 0; + + ret = inode_ctx_get1 (inode, this, &tmp_subvol); + if (tmp_subvol && subvol) + *subvol = (xlator_t *)tmp_subvol; + + return ret; +} + int dht_frame_return (call_frame_t *frame) @@ -340,20 +362,6 @@ out: return local; } - -char * -basestr (const char *str) -{ - char *basestr = NULL; - - basestr = strrchr (str, '/'); - if (basestr) - basestr ++; - - return basestr; -} - - xlator_t * dht_first_up_subvol (xlator_t *this) { @@ -727,6 +735,10 @@ dht_migration_complete_check_task (void *data) loc_t tmp_loc = {0,}; char *path = NULL; dht_conf_t *conf = NULL; + inode_t *inode = NULL; + fd_t *iter_fd = NULL; + uint64_t tmp_subvol = 0; + int open_failed = 0; this = THIS; frame = data; @@ -738,6 +750,8 @@ dht_migration_complete_check_task (void *data) if (!local->loc.inode && !local->fd) goto out; + inode = (!local->fd) ? local->loc.inode : local->fd->inode; + /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr * as root:root. If a fd is already open, access check wont be done*/ @@ -800,10 +814,7 @@ dht_migration_complete_check_task (void *data) /* update inode ctx (the layout) */ dht_layout_unref (this, local->layout); - if (!local->loc.inode) - ret = dht_layout_preset (this, dst_node, local->fd->inode); - else - ret = dht_layout_preset (this, dst_node, local->loc.inode); + ret = dht_layout_preset (this, dst_node, inode); if (ret != 0) { gf_log (this->name, GF_LOG_DEBUG, "%s: could not set preset layout for subvol %s", @@ -821,10 +832,7 @@ dht_migration_complete_check_task (void *data) goto out; } - if (!local->loc.inode) - ret = dht_layout_set (this, local->fd->inode, layout); - else - ret = dht_layout_set (this, local->loc.inode, layout); + ret = dht_layout_set (this, inode, layout); if (ret) { gf_log (this->name, GF_LOG_ERROR, "%s: failed to set the new layout", @@ -835,41 +843,46 @@ dht_migration_complete_check_task (void *data) local->cached_subvol = dst_node; ret = 0; - if (!local->fd) + /* once we detect the migration complete, the inode-ctx2 is no more + required.. delete the ctx and also, it means, open() already + done on all the fd of inode */ + ret = inode_ctx_reset1 (inode, this, &tmp_subvol); + if (tmp_subvol) goto out; - /* once we detect the migration complete, the fd-ctx is no more - required.. delete the ctx */ - ret = fd_ctx_del (local->fd, this, NULL); - if (!ret) + + if (list_empty (&inode->fd_list)) goto out; /* perform open as root:root. There is window between linkfile * creation(root:root) and setattr with the correct uid/gid */ SYNCTASK_SETID(0, 0); - /* if 'local->fd' (ie, fd based operation), send a 'open()' on - destination if not already done */ - if (local->loc.inode) { - ret = syncop_open (dst_node, &local->loc, - local->fd->flags, local->fd); - } else { - tmp_loc.inode = local->fd->inode; - inode_path (local->fd->inode, NULL, &path); - if (path) - tmp_loc.path = path; - ret = syncop_open (dst_node, &tmp_loc, - local->fd->flags, local->fd); - GF_FREE (path); + /* perform 'open()' on all the fd's present on the inode */ + tmp_loc.inode = inode; + inode_path (inode, NULL, &path); + if (path) + tmp_loc.path = path; + list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { + if (fd_is_anonymous (iter_fd)) + continue; + + ret = syncop_open (dst_node, &tmp_loc, + iter_fd->flags, iter_fd); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, "failed to open " + "the fd (%p, flags=0%o) on file %s @ %s", + iter_fd, iter_fd->flags, path, dst_node->name); + open_failed = 1; + } } + GF_FREE (path); + SYNCTASK_SETID (frame->root->uid, frame->root->gid); - if (ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to send open() on target file at %s", - local->loc.path, dst_node->name); + if (open_failed) { + ret = -1; goto out; } - ret = 0; out: @@ -914,6 +927,9 @@ dht_rebalance_inprogress_task (void *data) struct iatt stbuf = {0,}; loc_t tmp_loc = {0,}; dht_conf_t *conf = NULL; + inode_t *inode = NULL; + fd_t *iter_fd = NULL; + int open_failed = 0; this = THIS; frame = data; @@ -925,6 +941,8 @@ dht_rebalance_inprogress_task (void *data) if (!local->loc.inode && !local->fd) goto out; + inode = (!local->fd) ? local->loc.inode : local->fd->inode; + /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr * as root:root. If a fd is already open, access check wont be done*/ if (local->loc.inode) { @@ -976,35 +994,47 @@ dht_rebalance_inprogress_task (void *data) } ret = 0; + + if (list_empty (&inode->fd_list)) + goto done; + /* perform open as root:root. There is window between linkfile * creation(root:root) and setattr with the correct uid/gid */ SYNCTASK_SETID (0, 0); - if (local->loc.inode) { - ret = syncop_open (dst_node, &local->loc, - local->fd->flags, local->fd); - } else { - tmp_loc.inode = local->fd->inode; - inode_path (local->fd->inode, NULL, &path); - if (path) - tmp_loc.path = path; + + tmp_loc.inode = inode; + inode_path (inode, NULL, &path); + if (path) + tmp_loc.path = path; + + list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { + if (fd_is_anonymous (iter_fd)) + continue; + ret = syncop_open (dst_node, &tmp_loc, - local->fd->flags, local->fd); - GF_FREE (path); + iter_fd->flags, iter_fd); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, "failed to send open " + "the fd (%p, flags=0%o) on file %s @ %s", + iter_fd, iter_fd->flags, path, dst_node->name); + open_failed = 1; + } } + GF_FREE (path); - if (ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to send open() on target file at %s", - local->loc.path, dst_node->name); + SYNCTASK_SETID (frame->root->uid, frame->root->gid); + + if (open_failed) { + ret = -1; goto out; } - SYNCTASK_SETID (frame->root->uid, frame->root->gid); - ret = fd_ctx_set (local->fd, this, (uint64_t)(long)dst_node); +done: + ret = dht_inode_ctx_set1 (this, inode, dst_node); if (ret) { gf_log (this->name, GF_LOG_ERROR, - "%s: failed to set fd-ctx target file at %s", + "%s: failed to set inode-ctx target file at %s", local->loc.path, dst_node->name); goto out; } diff --git a/xlators/cluster/dht/src/dht-inode-read.c b/xlators/cluster/dht/src/dht-inode-read.c index 10f59ab26..ece84151a 100644 --- a/xlators/cluster/dht/src/dht-inode-read.c +++ b/xlators/cluster/dht/src/dht-inode-read.c @@ -130,10 +130,11 @@ int dht_file_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, struct iatt *stbuf, dict_t *xdata) { - uint64_t tmp_subvol = 0; + xlator_t *subvol = 0; dht_local_t *local = NULL; call_frame_t *prev = NULL; int ret = -1; + inode_t *inode = NULL; GF_VALIDATE_OR_GOTO ("dht", frame, err); GF_VALIDATE_OR_GOTO ("dht", this, out); @@ -157,19 +158,20 @@ dht_file_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->op_errno = op_errno; /* Check if the rebalance phase2 is true */ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (stbuf)) { - if (local->fd) - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (ret) { + inode = (local->fd) ? local->fd->inode : local->loc.inode; + ret = dht_inode_ctx_get1 (this, inode, &subvol); + if (!subvol) { /* Phase 2 of migration */ local->rebalance.target_op_fn = dht_attr2; ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; } else { /* value is already set in fd_ctx, that means no need to check for whether its complete or not. */ dht_attr2 (this, frame, 0); - } - if (!ret) return 0; + } } out: @@ -382,6 +384,8 @@ dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, { dht_local_t *local = NULL; int ret = 0; + inode_t *inode = NULL; + xlator_t *subvol = 0; local = frame->local; if (!local) { @@ -400,17 +404,18 @@ dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->op_errno = op_errno; if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (stbuf)) { /* File would be migrated to other node */ - ret = fd_ctx_get (local->fd, this, NULL); - if (ret) { + ret = dht_inode_ctx_get1 (this, inode, &subvol); + if (!subvol) { local->rebalance.target_op_fn = dht_readv2; ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; } else { /* value is already set in fd_ctx, that means no need to check for whether its complete or not. */ dht_readv2 (this, frame, 0); - } - if (!ret) return 0; + } } out: @@ -616,8 +621,9 @@ int dht_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, dict_t *xdata) { - dht_local_t *local = NULL; - int ret = -1; + dht_local_t *local = NULL; + inode_t *inode = NULL; + xlator_t *subvol = 0; local = frame->local; @@ -627,8 +633,8 @@ dht_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; /* If context is set, then send flush() it to the destination */ - ret = fd_ctx_get (local->fd, this, NULL); - if (!ret) { + dht_inode_ctx_get1 (this, inode, &subvol); + if (subvol) { dht_flush2 (this, frame, 0); return 0; } @@ -644,14 +650,10 @@ dht_flush2 (xlator_t *this, call_frame_t *frame, int op_ret) { dht_local_t *local = NULL; xlator_t *subvol = NULL; - uint64_t tmp_subvol = 0; - int ret = -1; local = frame->local; - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (!ret) - subvol = (xlator_t *)(long)tmp_subvol; + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); if (!subvol) subvol = local->cached_subvol; @@ -713,6 +715,8 @@ dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, dht_local_t *local = NULL; call_frame_t *prev = NULL; int ret = -1; + inode_t *inode = NULL; + xlator_t *subvol = 0; local = frame->local; prev = cookie; @@ -734,8 +738,8 @@ dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, } local->op_errno = op_errno; - ret = fd_ctx_get (local->fd, this, NULL); - if (ret) { + dht_inode_ctx_get1 (this, inode, &subvol); + if (!subvol) { local->rebalance.target_op_fn = dht_fsync2; /* Check if the rebalance phase1 is true */ @@ -750,11 +754,12 @@ dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, if (IS_DHT_MIGRATION_PHASE2 (postbuf)) { ret = dht_rebalance_complete_check (this, frame); } + if (!ret) + return 0; } else { dht_fsync2 (this, frame, 0); - } - if (!ret) return 0; + } out: DHT_STRIP_PHASE1_FLAGS (postbuf); @@ -770,15 +775,10 @@ dht_fsync2 (xlator_t *this, call_frame_t *frame, int op_ret) { dht_local_t *local = NULL; xlator_t *subvol = NULL; - uint64_t tmp_subvol = 0; - int ret = -1; local = frame->local; - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (!ret) - subvol = (xlator_t *)(long)tmp_subvol; - + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); if (!subvol) subvol = local->cached_subvol; diff --git a/xlators/cluster/dht/src/dht-inode-write.c b/xlators/cluster/dht/src/dht-inode-write.c index 9bcd84ae1..4b3f3a049 100644 --- a/xlators/cluster/dht/src/dht-inode-write.c +++ b/xlators/cluster/dht/src/dht-inode-write.c @@ -21,6 +21,7 @@ int dht_truncate2 (xlator_t *this, call_frame_t *frame, int ret); int dht_setattr2 (xlator_t *this, call_frame_t *frame, int ret); int dht_fallocate2(xlator_t *this, call_frame_t *frame, int op_ret); int dht_discard2(xlator_t *this, call_frame_t *frame, int op_ret); +int dht_zerofill2(xlator_t *this, call_frame_t *frame, int op_ret); int dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, @@ -29,6 +30,7 @@ dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, { dht_local_t *local = NULL; int ret = -1; + xlator_t *subvol = NULL; if (op_ret == -1 && (op_errno != ENOENT)) { goto out; @@ -65,8 +67,8 @@ dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, dht_iatt_merge (this, &local->stbuf, postbuf, NULL); dht_iatt_merge (this, &local->prebuf, prebuf, NULL); - ret = fd_ctx_get (local->fd, this, NULL); - if (!ret) { + ret = dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + if (subvol) { dht_writev2 (this, frame, 0); return 0; } @@ -90,14 +92,10 @@ dht_writev2 (xlator_t *this, call_frame_t *frame, int op_ret) { dht_local_t *local = NULL; xlator_t *subvol = NULL; - uint64_t tmp_subvol = 0; - int ret = -1; local = frame->local; - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (!ret) - subvol = (xlator_t *)(long)tmp_subvol; + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); if (!subvol) subvol = local->cached_subvol; @@ -172,6 +170,8 @@ dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, dht_local_t *local = NULL; call_frame_t *prev = NULL; int ret = -1; + xlator_t *subvol = NULL; + inode_t *inode = NULL; GF_VALIDATE_OR_GOTO ("dht", frame, err); GF_VALIDATE_OR_GOTO ("dht", this, out); @@ -213,8 +213,9 @@ dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { dht_iatt_merge (this, &local->stbuf, postbuf, NULL); dht_iatt_merge (this, &local->prebuf, prebuf, NULL); - ret = fd_ctx_get (local->fd, this, NULL); - if (!ret || !local->fd) { + inode = (local->fd) ? local->fd->inode : local->loc.inode; + dht_inode_ctx_get1 (this, inode, &subvol); + if (subvol) { dht_truncate2 (this, frame, 0); return 0; } @@ -238,16 +239,13 @@ dht_truncate2 (xlator_t *this, call_frame_t *frame, int op_ret) { dht_local_t *local = NULL; xlator_t *subvol = NULL; - uint64_t tmp_subvol = 0; - int ret = -1; + inode_t *inode = NULL; local = frame->local; - if (local->fd) - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (!ret) - subvol = (xlator_t *)(long)tmp_subvol; + inode = local->fd ? local->fd->inode : local->loc.inode; + dht_inode_ctx_get1 (this, inode, &subvol); if (!subvol) subvol = local->cached_subvol; @@ -359,6 +357,7 @@ dht_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, dht_local_t *local = NULL; call_frame_t *prev = NULL; int ret = -1; + xlator_t *subvol = NULL; GF_VALIDATE_OR_GOTO ("dht", frame, err); GF_VALIDATE_OR_GOTO ("dht", this, out); @@ -398,8 +397,8 @@ dht_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { dht_iatt_merge (this, &local->stbuf, postbuf, NULL); dht_iatt_merge (this, &local->prebuf, prebuf, NULL); - ret = fd_ctx_get (local->fd, this, NULL); - if (!ret) { + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + if (subvol) { dht_fallocate2 (this, frame, 0); return 0; } @@ -422,15 +421,10 @@ dht_fallocate2(xlator_t *this, call_frame_t *frame, int op_ret) { dht_local_t *local = NULL; xlator_t *subvol = NULL; - uint64_t tmp_subvol = 0; - int ret = -1; local = frame->local; - if (local->fd) - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (!ret) - subvol = (xlator_t *)(long)tmp_subvol; + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); if (!subvol) subvol = local->cached_subvol; @@ -497,6 +491,7 @@ dht_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, dht_local_t *local = NULL; call_frame_t *prev = NULL; int ret = -1; + xlator_t *subvol = NULL; GF_VALIDATE_OR_GOTO ("dht", frame, err); GF_VALIDATE_OR_GOTO ("dht", this, out); @@ -536,8 +531,8 @@ dht_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { dht_iatt_merge (this, &local->stbuf, postbuf, NULL); dht_iatt_merge (this, &local->prebuf, prebuf, NULL); - ret = fd_ctx_get (local->fd, this, NULL); - if (!ret) { + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + if (subvol) { dht_discard2 (this, frame, 0); return 0; } @@ -560,15 +555,10 @@ dht_discard2(xlator_t *this, call_frame_t *frame, int op_ret) { dht_local_t *local = NULL; xlator_t *subvol = NULL; - uint64_t tmp_subvol = 0; - int ret = -1; local = frame->local; - if (local->fd) - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (!ret) - subvol = (xlator_t *)(long)tmp_subvol; + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); if (!subvol) subvol = local->cached_subvol; @@ -624,6 +614,141 @@ err: return 0; } +int +dht_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int ret = -1; + + GF_VALIDATE_OR_GOTO ("dht", frame, err); + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO ("dht", cookie, out); + + local = frame->local; + prev = cookie; + + if ((op_ret == -1) && (op_errno != ENOENT)) { + local->op_errno = op_errno; + local->op_ret = -1; + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + goto out; + } + + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge (this, postbuf, &local->stbuf, NULL); + dht_iatt_merge (this, prebuf, &local->prebuf, NULL); + } + goto out; + } + local->rebalance.target_op_fn = dht_zerofill2; + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) { + ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { + dht_iatt_merge (this, &local->stbuf, postbuf, NULL); + dht_iatt_merge (this, &local->prebuf, prebuf, NULL); + ret = fd_ctx_get (local->fd, this, NULL); + if (!ret) { + dht_zerofill2 (this, frame, 0); + return 0; + } + ret = dht_rebalance_in_progress_check (this, frame); + if (!ret) + return 0; + } + +out: + DHT_STRIP_PHASE1_FLAGS (postbuf); + DHT_STRIP_PHASE1_FLAGS (prebuf); + DHT_STACK_UNWIND (zerofill, frame, op_ret, op_errno, + prebuf, postbuf, xdata); +err: + return 0; +} + +int +dht_zerofill2(xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + uint64_t tmp_subvol = 0; + int ret = -1; + + local = frame->local; + + if (local->fd) + ret = fd_ctx_get (local->fd, this, &tmp_subvol); + if (!ret) + subvol = (xlator_t *)(long)tmp_subvol; + + if (!subvol) + subvol = local->cached_subvol; + + local->call_cnt = 2; /* This is the second attempt */ + + STACK_WIND(frame, dht_zerofill_cbk, subvol, subvol->fops->zerofill, + local->fd, local->rebalance.offset, local->rebalance.size, + NULL); + + return 0; +} + +int +dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_ZEROFILL); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.offset = offset; + local->rebalance.size = len; + + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_zerofill_cbk, subvol, subvol->fops->zerofill, + fd, offset, len, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + + + /* handle cases of migration here for 'setattr()' calls */ int dht_file_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, @@ -675,15 +800,13 @@ dht_setattr2 (xlator_t *this, call_frame_t *frame, int op_ret) { dht_local_t *local = NULL; xlator_t *subvol = NULL; - uint64_t tmp_subvol = 0; - int ret = -1; + inode_t *inode = NULL; local = frame->local; - if (local->fd) - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (!ret) - subvol = (xlator_t *)(long)tmp_subvol; + inode = (local->fd) ? local->fd->inode : local->loc.inode; + + dht_inode_ctx_get1 (this, inode, &subvol); if (!subvol) subvol = local->cached_subvol; diff --git a/xlators/cluster/dht/src/dht-layout.c b/xlators/cluster/dht/src/dht-layout.c index 07e8cbae4..38e9970a7 100644 --- a/xlators/cluster/dht/src/dht-layout.c +++ b/xlators/cluster/dht/src/dht-layout.c @@ -454,12 +454,19 @@ dht_layout_entry_cmp (dht_layout_t *layout, int i, int j) { int64_t diff = 0; + /* swap zero'ed out layouts to front, if needed */ + if (!layout->list[j].start && !layout->list[j].stop) { + diff = (int64_t) layout->list[i].stop + - (int64_t) layout->list[j].stop; + goto out; + } if (layout->list[i].err || layout->list[j].err) diff = layout->list[i].err - layout->list[j].err; else diff = (int64_t) layout->list[i].start - (int64_t) layout->list[j].start; +out: return diff; } @@ -534,13 +541,13 @@ dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout, case -1: case ENOENT: missing++; - break; + continue; case ENOTCONN: down++; - break; + continue; case ENOSPC: no_space++; - break; + continue; case 0: /* if err == 0 and start == stop, then it is a non misc++; * participating subvolume(spread-cnt). Then, do not @@ -552,6 +559,7 @@ dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout, break; default: misc++; + continue; } is_virgin = 0; @@ -593,8 +601,7 @@ dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout, int -dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout, - uint32_t *missing_p) +dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout) { int ret = 0; int i = 0; @@ -606,7 +613,6 @@ dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout, ret = dht_layout_sort (layout); if (ret == -1) { - /* defensive coding; this can't happen currently */ gf_log (this->name, GF_LOG_WARNING, "sort failed?! how the ...."); goto out; @@ -616,26 +622,23 @@ dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout, &holes, &overlaps, &missing, &down, &misc, NULL); if (ret == -1) { - /* defensive coding; this can't happen currently */ gf_log (this->name, GF_LOG_WARNING, "error while finding anomalies in %s -- not good news", loc->path); goto out; } - ret = holes + overlaps; - if (ret) { + if (holes || overlaps) { if (missing == layout->cnt) { gf_log (this->name, GF_LOG_DEBUG, "directory %s looked up first time", loc->path); } else { gf_log (this->name, GF_LOG_INFO, - "found anomalies in %s. holes=%d overlaps=%d" - " missing=%d down=%d misc=%d", - loc->path, holes, overlaps, missing, down, - misc); + "found anomalies in %s. holes=%d overlaps=%d", + loc->path, holes, overlaps); } + ret = -1; } for (i = 0; i < layout->cnt; i++) { @@ -650,14 +653,14 @@ dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout, (layout->list[i].xlator ? layout->list[i].xlator->name : "<>")); + if ((layout->list[i].err == ENOENT) && (ret >= 0)) { + ret++; + } } } out: - if (missing_p) { - *missing_p = missing; - } return ret; } diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c index 5300ff2fe..bcb19f23e 100644 --- a/xlators/cluster/dht/src/dht-rebalance.c +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -243,7 +243,7 @@ out: static inline int __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struct iatt *stbuf, - dict_t *dict, fd_t **dst_fd) + dict_t *dict, fd_t **dst_fd, dict_t *xattr) { xlator_t *this = NULL; int ret = -1; @@ -307,6 +307,12 @@ __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struc goto out; } + ret = syncop_fsetxattr (to, fd, xattr, 0); + if (ret == -1) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set xattr on %s (%s)", + loc->path, to->name, strerror (errno)); + ret = syncop_ftruncate (to, fd, stbuf->ia_size); if (ret < 0) gf_log (this->name, GF_LOG_ERROR, @@ -340,6 +346,9 @@ __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc, int ret = -1; xlator_t *this = NULL; + uint64_t src_statfs_blocks = 1; + uint64_t dst_statfs_blocks = 1; + this = THIS; ret = syncop_statfs (from, loc, &src_statfs); @@ -363,22 +372,34 @@ __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc, if (flag != GF_DHT_MIGRATE_DATA) goto check_avail_space; - if (((dst_statfs.f_bavail * - dst_statfs.f_bsize) / GF_DISK_SECTOR_SIZE) < - (((src_statfs.f_bavail * src_statfs.f_bsize) / - GF_DISK_SECTOR_SIZE) - stbuf->ia_blocks)) { - gf_log (this->name, GF_LOG_WARNING, - "data movement attempted from node (%s) with" - " higher disk space to a node (%s) with " - "lesser disk space (%s)", from->name, - to->name, loc->path); - - /* this is not a 'failure', but we don't want to - consider this as 'success' too :-/ */ - ret = 1; - goto out; + /* Check: + During rebalance `migrate-data` - Destination subvol experiences + a `reduction` in 'blocks' of free space, at the same time source + subvol gains certain 'blocks' of free space. A valid check is + necessary here to avoid errorneous move to destination where + the space could be scantily available. + */ + if (stbuf) { + dst_statfs_blocks = ((dst_statfs.f_bavail * + dst_statfs.f_bsize) / + GF_DISK_SECTOR_SIZE); + src_statfs_blocks = ((src_statfs.f_bavail * + src_statfs.f_bsize) / + GF_DISK_SECTOR_SIZE); + if ((dst_statfs_blocks - stbuf->ia_blocks) < + (src_statfs_blocks + stbuf->ia_blocks)) { + gf_log (this->name, GF_LOG_WARNING, + "data movement attempted from node (%s) with" + " higher disk space to a node (%s) with " + "lesser disk space (%s)", from->name, + to->name, loc->path); + + /* this is not a 'failure', but we don't want to + consider this as 'success' too :-/ */ + ret = 1; + goto out; + } } - check_avail_space: if (((dst_statfs.f_bavail * dst_statfs.f_bsize) / GF_DISK_SECTOR_SIZE) < stbuf->ia_blocks) { @@ -708,9 +729,16 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, goto out; } + /* TODO: move all xattr related operations to fd based operations */ + ret = syncop_listxattr (from, loc, &xattr); + if (ret == -1) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to get xattr from %s (%s)", + loc->path, from->name, strerror (errno)); + /* create the destination, with required modes/xattr */ ret = __dht_rebalance_create_dst_file (to, from, loc, &stbuf, - dict, &dst_fd); + dict, &dst_fd, xattr); if (ret) goto out; @@ -727,6 +755,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, goto out; } + ret = syncop_fstat (from, src_fd, &stbuf); if (ret) { gf_log (this->name, GF_LOG_ERROR, "failed to lookup %s on %s (%s)", @@ -756,19 +785,6 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, goto out; } - /* TODO: move all xattr related operations to fd based operations */ - ret = syncop_listxattr (from, loc, &xattr); - if (ret == -1) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to get xattr from %s (%s)", - loc->path, from->name, strerror (errno)); - - ret = syncop_setxattr (to, loc, xattr, 0); - if (ret == -1) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to set xattr on %s (%s)", - loc->path, to->name, strerror (errno)); - /* TODO: Sync the locks */ ret = syncop_fsync (to, dst_fd, 0); @@ -1760,6 +1776,8 @@ log: case GF_DEFRAG_STATUS_FAILED: status = "failed"; break; + default: + break; } gf_log (THIS->name, GF_LOG_INFO, "Rebalance is %s. Time taken is %.2f " diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c index b220a0e25..3fe96b1c7 100644 --- a/xlators/cluster/dht/src/dht-selfheal.c +++ b/xlators/cluster/dht/src/dht-selfheal.c @@ -17,6 +17,7 @@ #include "glusterfs.h" #include "xlator.h" #include "dht-common.h" +#include "glusterfs-acl.h" #define DHT_SET_LAYOUT_RANGE(layout,i,srt,chunk,cnt,path) do { \ layout->list[i].start = srt; \ @@ -564,9 +565,33 @@ dht_get_layout_count (xlator_t *this, dht_layout_t *layout, int new_layout) for (i = 0; i < layout->cnt; i++) { err = layout->list[i].err; - if (err == -1 || err == 0) { - layout->list[i].err = -1; + if (err == -1 || err == 0 || err == ENOENT) { + /* Setting list[i].err = -1 is an indication for + dht_selfheal_layout_new_directory() to assign + a range. We set it to -1 based on any one of + the three criteria: + + - err == -1 already, which means directory + existed but layout was not set on it. + + - err == 0, which means directory exists and + has an old layout piece which will be + overwritten now. + + - err == ENOENT, which means directory does + not exist (possibly racing with mkdir or + finishing half done mkdir). The missing + directory will be attempted to be recreated. + + It is important to note that it is safe + to race with mkdir() as self-heal and + mkdir are idempotent operations. Both will + strive to set the directory and layouts to + the same final state. + */ count++; + if (!err) + layout->list[i].err = -1; } } @@ -776,7 +801,7 @@ dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc, DHT_RESET_LAYOUT_RANGE (layout); for (i = start_subvol; i < layout->cnt; i++) { err = layout->list[i].err; - if (err == -1) { + if (err == -1 || err == ENOENT) { DHT_SET_LAYOUT_RANGE(layout, i, start, chunk, cnt, loc->path); if (--cnt == 0) { @@ -789,7 +814,7 @@ dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc, for (i = 0; i < start_subvol; i++) { err = layout->list[i].err; - if (err == -1) { + if (err == -1 || err == ENOENT) { DHT_SET_LAYOUT_RANGE(layout, i, start, chunk, cnt, loc->path); if (--cnt == 0) { diff --git a/xlators/cluster/dht/src/dht.c b/xlators/cluster/dht/src/dht.c index 0349b63a9..fc0ca2f77 100644 --- a/xlators/cluster/dht/src/dht.c +++ b/xlators/cluster/dht/src/dht.c @@ -72,6 +72,7 @@ struct xlator_fops fops = { .fsetattr = dht_fsetattr, .fallocate = dht_fallocate, .discard = dht_discard, + .zerofill = dht_zerofill, }; struct xlator_dumpops dumpops = { |
