diff options
Diffstat (limited to 'xlators/cluster/afr/src/afr-self-heal-algorithm.c')
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-algorithm.c | 308 |
1 file changed, 205 insertions, 103 deletions
diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.c b/xlators/cluster/afr/src/afr-self-heal-algorithm.c index 48399b5e9..83846f152 100644 --- a/xlators/cluster/afr/src/afr-self-heal-algorithm.c +++ b/xlators/cluster/afr/src/afr-self-heal-algorithm.c @@ -1,23 +1,15 @@ /* - Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ +#include <openssl/md5.h> #include "glusterfs.h" #include "afr.h" #include "xlator.h" @@ -33,7 +25,6 @@ #include "compat-errno.h" #include "compat.h" #include "byte-order.h" -#include "md5.h" #include "afr-transaction.h" #include "afr-self-heal.h" @@ -72,8 +63,7 @@ sh_private_cleanup (call_frame_t *frame, xlator_t *this) sh = &local->self_heal; sh_priv = sh->private; - if (sh_priv) - GF_FREE (sh_priv); + GF_FREE (sh_priv); } static int @@ -104,14 +94,16 @@ sh_loop_driver_done (call_frame_t *sh_frame, xlator_t *this, local = sh_frame->local; sh = &local->self_heal; sh_priv = sh->private; - total_blocks = sh_priv->total_blocks; - diff_blocks = sh_priv->diff_blocks; + if (sh_priv) { + total_blocks = sh_priv->total_blocks; + diff_blocks = sh_priv->diff_blocks; + } sh_private_cleanup (sh_frame, this); - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { GF_ASSERT (!last_loop_frame); //loop_finish should have happened and the old_loop should be NULL - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_DEBUG, "self-heal aborting on %s", local->loc.path); @@ -119,20 +111,17 @@ sh_loop_driver_done (call_frame_t *sh_frame, xlator_t *this, } else { GF_ASSERT (last_loop_frame); if (diff_blocks == total_blocks) { - gf_log (this->name, GF_LOG_INFO, "full self-heal " + gf_log (this->name, GF_LOG_DEBUG, "full self-heal " "completed on %s",local->loc.path); } else { - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_DEBUG, "diff self-heal on %s: completed. 
" "(%d blocks of %d were different (%.2f%%))", local->loc.path, diff_blocks, total_blocks, ((diff_blocks * 1.0)/total_blocks) * 100); } - if (sh_frame == last_loop_frame) - sh->old_loop_frame = NULL; - else - sh->old_loop_frame = last_loop_frame; + sh->old_loop_frame = last_loop_frame; local->self_heal.algo_completion_cbk (sh_frame, this); } @@ -153,17 +142,10 @@ sh_loop_finish (call_frame_t *loop_frame, xlator_t *this) loop_sh = &loop_local->self_heal; } - if (loop_sh && loop_sh->loop_completion_cbk) { - if (loop_sh->data_lock_held) { - afr_sh_data_unlock (loop_frame, this, - loop_sh->loop_completion_cbk); - } else { - loop_sh->loop_completion_cbk (loop_frame, this); - } + if (loop_sh && loop_sh->data_lock_held) { + afr_sh_data_unlock (loop_frame, this, this->name, + sh_destroy_frame); } else { - //default loop_completion_cbk destroys the loop_frame - if (loop_sh && !loop_sh->loop_completion_cbk) - GF_ASSERT (!loop_sh->data_lock_held); sh_destroy_frame (loop_frame, this); } out: @@ -182,7 +164,7 @@ sh_loop_lock_success (call_frame_t *loop_frame, xlator_t *this) sh_loop_finish (loop_sh->old_loop_frame, this); loop_sh->old_loop_frame = NULL; - gf_log (this->name, GF_LOG_DEBUG, "Aquired lock for range %"PRIu64 + gf_log (this->name, GF_LOG_DEBUG, "Acquired lock for range %"PRIu64 " %"PRIu64, loop_sh->offset, loop_sh->block_size); loop_sh->data_lock_held = _gf_true; loop_sh->sh_data_algo_start (loop_frame, this); @@ -209,8 +191,8 @@ sh_loop_lock_failure (call_frame_t *loop_frame, xlator_t *this) } static int -sh_loop_start (call_frame_t *sh_frame, xlator_t *this, off_t offset, - call_frame_t *old_loop_frame) +sh_loop_frame_create (call_frame_t *sh_frame, xlator_t *this, + call_frame_t *old_loop_frame, call_frame_t **loop_frame) { call_frame_t *new_loop_frame = NULL; afr_local_t *local = NULL; @@ -220,7 +202,9 @@ sh_loop_start (call_frame_t *sh_frame, xlator_t *this, off_t offset, afr_private_t *priv = NULL; GF_ASSERT (sh_frame); + GF_ASSERT (loop_frame); + *loop_frame 
= NULL; local = sh_frame->local; sh = &local->self_heal; priv = this->private; @@ -228,8 +212,9 @@ sh_loop_start (call_frame_t *sh_frame, xlator_t *this, off_t offset, new_loop_frame = copy_frame (sh_frame); if (!new_loop_frame) goto out; - //We want the frame to have same lk_oner as sh_frame - new_loop_local = afr_local_copy (local, this); + //We want the frame to have same lk_owner as sh_frame + //so that locks translator allows conflicting locks + new_loop_local = afr_self_heal_local_init (local, this); if (!new_loop_local) goto out; new_loop_frame->local = new_loop_local; @@ -244,29 +229,54 @@ sh_loop_start (call_frame_t *sh_frame, xlator_t *this, off_t offset, gf_afr_mt_char); if (!new_loop_sh->write_needed) goto out; - new_loop_sh->checksum = GF_CALLOC (priv->child_count, MD5_DIGEST_LEN, + new_loop_sh->checksum = GF_CALLOC (priv->child_count, MD5_DIGEST_LENGTH, gf_afr_mt_uint8_t); if (!new_loop_sh->checksum) goto out; - new_loop_sh->offset = offset; - new_loop_sh->block_size = sh->block_size; - new_loop_sh->old_loop_frame = old_loop_frame; - new_loop_sh->sh_frame = sh_frame; new_loop_sh->inode = inode_ref (sh->inode); new_loop_sh->sh_data_algo_start = sh->sh_data_algo_start; new_loop_sh->source = sh->source; new_loop_sh->active_sinks = sh->active_sinks; new_loop_sh->healing_fd = fd_ref (sh->healing_fd); new_loop_sh->file_has_holes = sh->file_has_holes; - new_loop_sh->loop_completion_cbk = sh_destroy_frame; + new_loop_sh->old_loop_frame = old_loop_frame; + new_loop_sh->sh_frame = sh_frame; + *loop_frame = new_loop_frame; + return 0; +out: + sh_destroy_frame (new_loop_frame, this); + return -ENOMEM; +} + +static int +sh_loop_start (call_frame_t *sh_frame, xlator_t *this, off_t offset, + call_frame_t *old_loop_frame) +{ + call_frame_t *new_loop_frame = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_local_t *new_loop_local = NULL; + afr_self_heal_t *new_loop_sh = NULL; + int ret = 0; + + GF_ASSERT (sh_frame); + + local = sh_frame->local; + 
sh = &local->self_heal; + + ret = sh_loop_frame_create (sh_frame, this, old_loop_frame, + &new_loop_frame); + if (ret) + goto out; + new_loop_local = new_loop_frame->local; + new_loop_sh = &new_loop_local->self_heal; + new_loop_sh->offset = offset; + new_loop_sh->block_size = sh->block_size; afr_sh_data_lock (new_loop_frame, this, offset, new_loop_sh->block_size, - sh_loop_lock_success, sh_loop_lock_failure); + _gf_true, this->name, sh_loop_lock_success, sh_loop_lock_failure); return 0; out: - sh->op_failed = 1; - if (new_loop_frame) { - new_loop_frame->local = new_loop_local; - } + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); if (old_loop_frame) sh_loop_finish (old_loop_frame, this); sh_loop_return (sh_frame, this, new_loop_frame, -1, ENOMEM); @@ -277,7 +287,6 @@ static int sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, gf_boolean_t is_first_call, call_frame_t *old_loop_frame) { - afr_private_t * priv = NULL; afr_local_t * local = NULL; afr_self_heal_t * sh = NULL; afr_sh_algo_private_t *sh_priv = NULL; @@ -285,6 +294,7 @@ sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, blksize_t block_size = 0; int loop = 0; off_t offset = 0; + afr_private_t *priv = NULL; priv = this->private; local = sh_frame->local; @@ -293,19 +303,20 @@ sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, LOCK (&sh_priv->lock); { - if (_gf_false == is_first_call) + if (!is_first_call) sh_priv->loops_running--; offset = sh_priv->offset; block_size = sh->block_size; - while ((!sh->eof_reached) && (0 == sh->op_failed) && - (sh_priv->loops_running < priv->data_self_heal_window_size) + while ((!sh->eof_reached) && + (!is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) && + (sh_priv->loops_running < priv->data_self_heal_window_size) && (sh_priv->offset < sh->file_size)) { loop++; sh_priv->offset += block_size; sh_priv->loops_running++; - if (_gf_false == is_first_call) + if (!is_first_call) break; } if (0 == sh_priv->loops_running) { @@ -317,7 +328,8 @@ sh_loop_driver 
(call_frame_t *sh_frame, xlator_t *this, if (0 == loop) { //loop finish does unlock, but the erasing of the pending //xattrs needs to happen before that so do not finish the loop - if (is_driver_done && !sh->op_failed) + if (is_driver_done && + !is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) goto driver_done; if (old_loop_frame) { sh_loop_finish (old_loop_frame, this); @@ -328,7 +340,7 @@ sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, //If we have more loops to form we should finish previous loop after //the next loop lock while (loop--) { - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { // op failed in other loop, stop spawning more loops if (old_loop_frame) { sh_loop_finish (old_loop_frame, this); @@ -374,7 +386,7 @@ sh_loop_return (call_frame_t *sh_frame, xlator_t *this, call_frame_t *loop_frame } if (op_ret == -1) { - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_set_error (sh, op_errno); if (loop_frame) { sh_loop_finish (loop_frame, this); @@ -390,7 +402,7 @@ sh_loop_return (call_frame_t *sh_frame, xlator_t *this, call_frame_t *loop_frame static int sh_loop_write_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *buf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * loop_local = NULL; @@ -422,13 +434,22 @@ sh_loop_write_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, priv->children[child_index]->name, strerror (op_errno)); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_set_error (loop_sh, op_errno); + } else if (op_ret < loop_local->cont.writev.vector->iov_len) { + gf_log (this->name, GF_LOG_ERROR, + "incomplete write to %s on subvolume %s " + "(expected %lu, returned %d)", sh_local->loc.path, + priv->children[child_index]->name, + loop_local->cont.writev.vector->iov_len, op_ret); + afr_set_self_heal_status (sh, 
AFR_SELF_HEAL_FAILED); } call_count = afr_frame_return (loop_frame); if (call_count == 0) { + iobref_unref(loop_local->cont.writev.iobref); + sh_loop_return (sh_frame, this, loop_frame, loop_sh->op_ret, loop_sh->op_errno); } @@ -436,12 +457,41 @@ sh_loop_write_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, return 0; } +static void +sh_prune_writes_needed (call_frame_t *sh_frame, call_frame_t *loop_frame, + afr_private_t *priv) +{ + afr_local_t *sh_local = NULL; + afr_self_heal_t *sh = NULL; + afr_local_t *loop_local = NULL; + afr_self_heal_t *loop_sh = NULL; + int i = 0; + + sh_local = sh_frame->local; + sh = &sh_local->self_heal; + + if (!strcmp (sh->algo->name, "diff")) + return; + + loop_local = loop_frame->local; + loop_sh = &loop_local->self_heal; + + /* full self-heal guarantees there exists atleast 1 file with size 0 + * That means for other files we can preserve holes that come after + * its size before 'trim' + */ + for (i = 0; i < priv->child_count; i++) { + if (loop_sh->write_needed[i] && + ((loop_sh->offset + 1) > sh->buf[i].ia_size)) + loop_sh->write_needed[i] = 0; + } +} static int sh_loop_read_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iovec *vector, int32_t count, struct iatt *buf, - struct iobref *iobref) + struct iobref *iobref, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * loop_local = NULL; @@ -466,7 +516,7 @@ sh_loop_read_cbk (call_frame_t *loop_frame, void *cookie, if (op_ret <= 0) { if (op_ret < 0) { - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); gf_log (this->name, GF_LOG_ERROR, "read failed on %d " "for %s reason :%s", sh->source, sh_local->loc.path, strerror (errno)); @@ -479,18 +529,26 @@ sh_loop_read_cbk (call_frame_t *loop_frame, void *cookie, goto out; } - if (loop_sh->file_has_holes && iov_0filled (vector, count) == 0) { - gf_log (this->name, GF_LOG_DEBUG, "0 filled block"); - sh_loop_return (sh_frame, this, loop_frame, 
- op_ret, op_errno); - goto out; - } + if (loop_sh->file_has_holes && iov_0filled (vector, count) == 0) + sh_prune_writes_needed (sh_frame, loop_frame, priv); call_count = sh_number_of_writes_needed (loop_sh->write_needed, priv->child_count); - GF_ASSERT (call_count > 0); + if (call_count == 0) { + sh_loop_return (sh_frame, this, loop_frame, 0, 0); + goto out; + } + loop_local->call_count = call_count; + /* + * We only really need the request size at the moment, but the buffer + * is required if we want to issue a retry in the event of a short write. + * Therefore, we duplicate the vector and ref the iobref here... + */ + loop_local->cont.writev.vector = iov_dup(vector, count); + loop_local->cont.writev.iobref = iobref_ref(iobref); + for (i = 0; i < priv->child_count; i++) { if (!loop_sh->write_needed[i]) continue; @@ -499,7 +557,7 @@ sh_loop_read_cbk (call_frame_t *loop_frame, void *cookie, priv->children[i], priv->children[i]->fops->writev, loop_sh->healing_fd, vector, count, - loop_sh->offset, iobref); + loop_sh->offset, 0, iobref, NULL); if (!--call_count) break; @@ -526,7 +584,7 @@ sh_loop_read (call_frame_t *loop_frame, xlator_t *this) priv->children[loop_sh->source], priv->children[loop_sh->source]->fops->readv, loop_sh->healing_fd, loop_sh->block_size, - loop_sh->offset); + loop_sh->offset, 0, NULL); return 0; } @@ -535,7 +593,8 @@ sh_loop_read (call_frame_t *loop_frame, xlator_t *this) static int sh_diff_checksum_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - uint32_t weak_checksum, uint8_t *strong_checksum) + uint32_t weak_checksum, uint8_t *strong_checksum, + dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *loop_local = NULL; @@ -567,10 +626,10 @@ sh_diff_checksum_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, "checksum on %s failed on subvolume %s (%s)", sh_local->loc.path, priv->children[child_index]->name, strerror (op_errno)); - sh->op_failed = 1; + afr_set_self_heal_status (sh, 
AFR_SELF_HEAL_FAILED); } else { - memcpy (loop_sh->checksum + child_index * MD5_DIGEST_LEN, - strong_checksum, MD5_DIGEST_LEN); + memcpy (loop_sh->checksum + child_index * MD5_DIGEST_LENGTH, + strong_checksum, MD5_DIGEST_LENGTH); } call_count = afr_frame_return (loop_frame); @@ -580,9 +639,9 @@ sh_diff_checksum_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, if (sh->sources[i] || !sh_local->child_up[i]) continue; - if (memcmp (loop_sh->checksum + (i * MD5_DIGEST_LEN), - loop_sh->checksum + (sh->source * MD5_DIGEST_LEN), - MD5_DIGEST_LEN)) { + if (memcmp (loop_sh->checksum + (i * MD5_DIGEST_LENGTH), + loop_sh->checksum + (sh->source * MD5_DIGEST_LENGTH), + MD5_DIGEST_LENGTH)) { /* Checksums differ, so this block must be written to this sink @@ -605,7 +664,8 @@ sh_diff_checksum_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, } UNLOCK (&sh_priv->lock); - if (write_needed && !sh->op_failed) { + if (write_needed && + !is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { sh_loop_read (loop_frame, this); } else { sh_loop_return (sh_frame, this, loop_frame, @@ -638,7 +698,7 @@ sh_diff_checksum (call_frame_t *loop_frame, xlator_t *this) priv->children[loop_sh->source], priv->children[loop_sh->source]->fops->rchecksum, loop_sh->healing_fd, - loop_sh->offset, loop_sh->block_size); + loop_sh->offset, loop_sh->block_size, NULL); for (i = 0; i < priv->child_count; i++) { if (loop_sh->sources[i] || !loop_local->child_up[i]) @@ -649,7 +709,7 @@ sh_diff_checksum (call_frame_t *loop_frame, xlator_t *this) priv->children[i], priv->children[i]->fops->rchecksum, loop_sh->healing_fd, - loop_sh->offset, loop_sh->block_size); + loop_sh->offset, loop_sh->block_size, NULL); if (!--call_count) break; @@ -679,38 +739,80 @@ sh_full_read_write_to_sinks (call_frame_t *loop_frame, xlator_t *this) return 0; } -static int -sh_do_nothing (call_frame_t *frame, xlator_t *this) +afr_sh_algo_private_t* +afr_sh_priv_init () { - return 0; -} - -int -afr_sh_start_loops (call_frame_t 
*sh_frame, xlator_t *this, - afr_sh_algo_fn sh_data_algo_start) -{ - afr_local_t *sh_local = NULL; - afr_self_heal_t *sh = NULL; afr_sh_algo_private_t *sh_priv = NULL; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - sh_priv = GF_CALLOC (1, sizeof (*sh_priv), gf_afr_mt_afr_private_t); if (!sh_priv) goto out; LOCK_INIT (&sh_priv->lock); +out: + return sh_priv; +} - sh->private = sh_priv; - sh->sh_data_algo_start = sh_data_algo_start; +int +afr_sh_transfer_lock (call_frame_t *dst, call_frame_t *src, char *dom, + unsigned int child_count) +{ + afr_local_t *dst_local = NULL; + afr_self_heal_t *dst_sh = NULL; + afr_local_t *src_local = NULL; + afr_self_heal_t *src_sh = NULL; + int ret = -1; + + dst_local = dst->local; + dst_sh = &dst_local->self_heal; + src_local = src->local; + src_sh = &src_local->self_heal; + GF_ASSERT (src_sh->data_lock_held); + GF_ASSERT (!dst_sh->data_lock_held); + ret = afr_lk_transfer_datalock (dst, src, dom, child_count); + if (ret) + return ret; + src_sh->data_lock_held = _gf_false; + dst_sh->data_lock_held = _gf_true; + return 0; +} - sh_local->call_count = 0; +int +afr_sh_start_loops (call_frame_t *sh_frame, xlator_t *this, + afr_sh_algo_fn sh_data_algo_start) +{ + call_frame_t *first_loop_frame = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int ret = 0; + afr_private_t *priv = NULL; + + local = sh_frame->local; + sh = &local->self_heal; + priv = this->private; - sh->loop_completion_cbk = sh_do_nothing; - sh_loop_driver (sh_frame, this, _gf_true, sh_frame); + sh->sh_data_algo_start = sh_data_algo_start; + local->call_count = 0; + ret = sh_loop_frame_create (sh_frame, this, NULL, &first_loop_frame); + if (ret) + goto out; + ret = afr_sh_transfer_lock (first_loop_frame, sh_frame, this->name, + priv->child_count); + if (ret) + goto out; + sh->private = afr_sh_priv_init (); + if (!sh->private) { + ret = -1; + goto out; + } + sh_loop_driver (sh_frame, this, _gf_true, first_loop_frame); + ret = 0; out: + if (ret) 
{ + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + sh_loop_driver_done (sh_frame, this, NULL); + } return 0; } |
