summaryrefslogtreecommitdiffstats
path: root/xlators/cluster/ec/src/ec-heal.c
diff options
context:
space:
mode:
authorXavier Hernandez <xhernandez@datalab.es>2014-11-07 12:12:19 +0100
committerVijay Bellur <vbellur@redhat.com>2014-12-04 11:34:38 -0800
commitbc91dd4de39ffd481a52b837f322f6782c14e9f1 (patch)
treea7a550c659bdee92b5d29839f87f5473d0b367bc /xlators/cluster/ec/src/ec-heal.c
parent7319b01ffa3d4ff7b1405873c8caeaf8a1f7b5d6 (diff)
ec: Fix self-healing issues.
Three problems have been detected: 1. Self healing is executed in background, allowing the fop that detected the problem to continue without blocks nor delays. While this is quite interesting to avoid unnecessary delays, it can cause spurious failures of self-heal because it may try to recover a file inside a directory that a previous self-heal has not recovered yet, causing the file self-heal to fail. 2. When a partial self-heal is being executed on a directory, if a full self-heal is attempted, it won't be executed because another self-heal is already in process, so the directory won't be fully repaired. 3. Information contained in loc's of some fop's is not enough to do a complete self-heal. To solve these problems, I've made some changes: * Improved ec_loc_from_loc() to add all available information to a loc. * Before healing an entry, it's parent is checked and partially healed if necessary to avoid failures. * All heal requests received for the same inode while another self-heal is being processed are queued. When the first heal completes, all pending requests are answered using the results of the first heal (without full execution), unless the first heal was a partial heal. In this case all partial heals are answered, and the first full heal is processed normally. * An special virtual xattr (not physically stored on bricks) named 'trusted.ec.heal' has been created to allow synchronous self-heal of files. Now, the recommended way to heal an entire volume is this: find <mount> -d -exec getfattr -h -n trusted.ec.heal {} \; Some minor changes: * ec_loc_prepare() has been renamed to ec_loc_update(). * All loc management functions return 0 on success and -1 on error. * Do not delay fop unlocks if heal is needed. * Added basic ec xattrs initially on create, mkdir and mknod fops. * Some coding style changes Change-Id: I2a5fd9c57349a153710880d6ac4b1fa0c1475985 BUG: 1161588 Signed-off-by: Xavier Hernandez <xhernandez@datalab.es> Reviewed-on: http://review.gluster.org/9072 Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Dan Lambright <dlambrig@redhat.com>
Diffstat (limited to 'xlators/cluster/ec/src/ec-heal.c')
-rw-r--r--xlators/cluster/ec/src/ec-heal.c182
1 files changed, 137 insertions, 45 deletions
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
index 042f24e7d4a..da5f5947de3 100644
--- a/xlators/cluster/ec/src/ec-heal.c
+++ b/xlators/cluster/ec/src/ec-heal.c
@@ -62,8 +62,8 @@ void ec_heal_lookup_resume(ec_fop_data_t * fop)
heal->fop->post_size = cbk->iatt[0].ia_size;
heal->fop->have_size = 1;
- if (!ec_loc_prepare(heal->xl, &heal->loc, cbk->inode,
- &cbk->iatt[0]))
+ if (ec_loc_update(heal->xl, &heal->loc, cbk->inode,
+ &cbk->iatt[0]) != 0)
{
fop->answer = NULL;
fop->error = EIO;
@@ -383,6 +383,41 @@ int32_t ec_heal_create(ec_heal_t * heal, uintptr_t mask, int32_t try_link)
return 0;
}
+int32_t ec_heal_parent_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
+ int32_t op_ret, int32_t op_errno, uintptr_t mask,
+ uintptr_t good, uintptr_t bad, dict_t *xdata)
+{
+ ec_fop_data_t *fop = cookie;
+ ec_heal_t *heal = fop->data;
+
+ /* Even if parent self-heal has failed, we try to heal the current entry */
+ ec_heal_create(heal, fop->mask, 0);
+
+ return 0;
+}
+
+void ec_heal_parent(ec_heal_t *heal, uintptr_t mask)
+{
+ loc_t parent;
+ int32_t healing = 0;
+
+ /* First we try to do a partial heal of the parent directory to avoid
+ * ENOENT/ENOTDIR errors caused by missing parents */
+ if (ec_loc_parent(heal->xl, &heal->loc, &parent) == 0) {
+ if (!__is_root_gfid(parent.gfid)) {
+ ec_heal(heal->fop->frame, heal->xl, mask, EC_MINIMUM_ONE,
+ ec_heal_parent_cbk, heal, &parent, 1, NULL);
+
+ healing = 1;
+ }
+ loc_wipe(&parent);
+ }
+
+ if (!healing) {
+ ec_heal_create(heal, mask, 0);
+ }
+}
+
void ec_heal_recreate(ec_fop_data_t * fop)
{
ec_cbk_data_t * cbk;
@@ -405,7 +440,7 @@ void ec_heal_recreate(ec_fop_data_t * fop)
if (mask != 0)
{
- ec_heal_create(heal, mask, 0);
+ ec_heal_parent(heal, mask);
}
}
@@ -458,8 +493,7 @@ int32_t ec_heal_init(ec_fop_data_t * fop)
memset(heal, 0, sizeof(ec_heal_t));
- if (!ec_loc_from_loc(fop->xl, &heal->loc, &fop->loc[0]))
- {
+ if (ec_loc_from_loc(fop->xl, &heal->loc, &fop->loc[0]) != 0) {
error = ENOMEM;
goto out;
@@ -472,6 +506,7 @@ int32_t ec_heal_init(ec_fop_data_t * fop)
pool = fop->xl->ctx->iobuf_pool;
heal->size = iobpool_default_pagesize(pool) * ec->fragments;
heal->partial = fop->int32;
+ fop->heal = heal;
LOCK(&inode->lock);
@@ -483,20 +518,30 @@ int32_t ec_heal_init(ec_fop_data_t * fop)
goto unlock;
}
- if (ctx->heal != NULL)
- {
+ if (list_empty(&ctx->heal)) {
+ gf_log("ec", GF_LOG_INFO, "Healing '%s', gfid %s", heal->loc.path,
+ uuid_utoa(heal->loc.gfid));
+ } else {
error = EEXIST;
-
- goto unlock;
}
- fop->data = heal;
-
- ctx->heal = heal;
+ list_add_tail(&heal->list, &ctx->heal);
heal = NULL;
unlock:
UNLOCK(&inode->lock);
+
+ if (error == EEXIST) {
+ LOCK(&fop->lock);
+
+ fop->jobs++;
+ fop->refs++;
+
+ UNLOCK(&fop->lock);
+
+ error = 0;
+ }
+
out:
GF_FREE(heal);
@@ -506,12 +551,9 @@ out:
void ec_heal_entrylk(ec_heal_t * heal, entrylk_cmd cmd)
{
loc_t loc;
- int32_t error;
- error = ec_loc_parent(heal->xl, &heal->loc, &loc);
- if (error != 0)
- {
- ec_fop_set_error(heal->fop, error);
+ if (ec_loc_parent(heal->xl, &heal->loc, &loc) != 0) {
+ ec_fop_set_error(heal->fop, EIO);
return;
}
@@ -605,7 +647,8 @@ void ec_heal_remove_others(ec_heal_t * heal)
if (cbk->op_ret < 0)
{
- if ((cbk->op_errno != ENOENT) && (cbk->op_errno != ENOTDIR))
+ if ((cbk->op_errno != ENOENT) && (cbk->op_errno != ENOTDIR) &&
+ (cbk->op_errno != ESTALE))
{
gf_log(heal->xl->name, GF_LOG_WARNING, "Don't know how to "
"remove inode with "
@@ -635,7 +678,7 @@ void ec_heal_prepare_others(ec_heal_t * heal)
if (cbk->op_ret < 0)
{
- if (cbk->op_errno == ENOENT)
+ if ((cbk->op_errno == ENOENT) || (cbk->op_errno == ESTALE))
{
ec_heal_create(heal, cbk->mask, 1);
}
@@ -1061,35 +1104,61 @@ void ec_heal_data(ec_heal_t * heal)
}
}
-void ec_heal_dispatch(ec_heal_t * heal)
+void ec_heal_dispatch(ec_heal_t *heal)
{
- ec_fop_data_t * fop = heal->fop;
- ec_cbk_data_t * cbk;
- inode_t * inode;
- ec_inode_t * ctx;
+ ec_fop_data_t *fop;
+ ec_cbk_data_t *cbk;
+ inode_t *inode;
+ ec_inode_t *ctx;
+ ec_heal_t *next = NULL;
+ struct list_head list;
int32_t error;
inode = heal->loc.inode;
+ INIT_LIST_HEAD(&list);
+
LOCK(&inode->lock);
- ctx = __ec_inode_get(inode, heal->xl);
- if (ctx != NULL)
- {
- ctx->bad &= ~heal->good;
- ctx->heal = NULL;
- }
+ /* A heal object not belonging to any list means that it has not been fully
+ * executed. It got its information from a previous heal that was executing
+ * when this heal started. */
+ if (!list_empty(&heal->list)) {
+ list_del_init(&heal->list);
+ ctx = __ec_inode_get(inode, heal->xl);
+ if (ctx != NULL) {
+ ctx->bad &= ~heal->good;
- fop->data = NULL;
+ if (heal->partial) {
+ /* Collect all partial heal requests. All of them will receive
+ * the same answer. 'next' will contain a pointer to the first
+ * full request (if any) after this partial heal request.*/
+ while (!list_empty(&ctx->heal)) {
+ next = list_entry(ctx->heal.next, ec_heal_t, list);
+ if (!next->partial) {
+ break;
+ }
+ list_move_tail(&next->list, &list);
+ }
+ if (list_empty(&ctx->heal)) {
+ next = NULL;
+ }
+ } else {
+ /* This is a full heal request, so take all received heal
+ * requests to answer them now. */
+ list_splice_init(&ctx->heal, &list);
+ }
+ }
+ }
UNLOCK(&inode->lock);
+ fop = heal->fop;
error = fop->error;
cbk = ec_cbk_data_allocate(fop->frame, heal->xl, fop, fop->id, 0,
error == 0 ? 0 : -1, error);
- if (cbk != NULL)
- {
+ if (cbk != NULL) {
cbk->uintptr[0] = heal->available;
cbk->uintptr[1] = heal->good;
cbk->uintptr[2] = heal->fixed;
@@ -1097,9 +1166,7 @@ void ec_heal_dispatch(ec_heal_t * heal)
ec_combine(cbk, NULL);
fop->answer = cbk;
- }
- else if (error == 0)
- {
+ } else if (error == 0) {
error = ENOMEM;
}
@@ -1119,16 +1186,38 @@ void ec_heal_dispatch(ec_heal_t * heal)
GF_FREE(heal);
ec_fop_set_error(fop, error);
+
+ /* Resume all pending heal requests, setting the same data obtained by
+ * this heal execution. */
+ while (!list_empty(&list)) {
+ heal = list_entry(list.next, ec_heal_t, list);
+ list_del_init(&heal->list);
+
+ heal->available = cbk->uintptr[0];
+ heal->good = cbk->uintptr[1];
+ heal->fixed = cbk->uintptr[2];
+
+ /* Setting 'done' to 1 avoids executing all heal logic and directly
+ * reports the result to the caller. */
+ heal->done = 1;
+
+ ec_resume(heal->fop, error);
+ }
+
+ /* If there is a pending full request, resume it. */
+ if (next != NULL) {
+ ec_resume(next->fop, 0);
+ }
}
void ec_wind_heal(ec_t * ec, ec_fop_data_t * fop, int32_t idx)
{
ec_cbk_data_t * cbk;
- ec_heal_t * heal = fop->data;
+ ec_heal_t *heal = fop->heal;
ec_trace("WIND", fop, "idx=%d", idx);
- cbk = ec_cbk_data_allocate(fop->req_frame, fop->xl, fop, EC_FOP_HEAL, idx,
+ cbk = ec_cbk_data_allocate(fop->frame, fop->xl, fop, EC_FOP_HEAL, idx,
fop->error == 0 ? 0 : -1, fop->error);
if (cbk != NULL)
{
@@ -1145,7 +1234,7 @@ void ec_wind_heal(ec_t * ec, ec_fop_data_t * fop, int32_t idx)
int32_t ec_manager_heal(ec_fop_data_t * fop, int32_t state)
{
ec_cbk_data_t * cbk;
- ec_heal_t * heal = fop->data;
+ ec_heal_t *heal = fop->heal;
switch (state)
{
@@ -1158,10 +1247,14 @@ int32_t ec_manager_heal(ec_fop_data_t * fop, int32_t state)
return EC_STATE_REPORT;
}
- /* Fall through */
+ return EC_STATE_DISPATCH;
case EC_STATE_DISPATCH:
- ec_heal_entrylk(fop->data, ENTRYLK_LOCK);
+ if (heal->done) {
+ return EC_STATE_HEAL_DISPATCH;
+ }
+
+ ec_heal_entrylk(heal, ENTRYLK_LOCK);
return EC_STATE_HEAL_ENTRY_LOOKUP;
@@ -1405,10 +1498,9 @@ void ec_heal(call_frame_t * frame, xlator_t * this, uintptr_t target,
gf_log("ec", GF_LOG_TRACE, "EC(HEAL) %p", frame);
VALIDATE_OR_GOTO(this, out);
- GF_VALIDATE_OR_GOTO(this->name, frame, out);
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
- fop = ec_fop_data_allocate(NULL, this, EC_FOP_HEAL,
+ fop = ec_fop_data_allocate(frame, this, EC_FOP_HEAL,
EC_FLAG_UPDATE_LOC_INODE, target, minimum,
ec_wind_heal, ec_manager_heal, callback, data);
if (fop == NULL)
@@ -1457,11 +1549,11 @@ out:
void ec_wind_fheal(ec_t * ec, ec_fop_data_t * fop, int32_t idx)
{
ec_cbk_data_t * cbk;
- ec_heal_t * heal = fop->data;
+ ec_heal_t *heal = fop->heal;
ec_trace("WIND", fop, "idx=%d", idx);
- cbk = ec_cbk_data_allocate(fop->req_frame, fop->xl, fop, EC_FOP_FHEAL, idx,
+ cbk = ec_cbk_data_allocate(fop->frame, fop->xl, fop, EC_FOP_FHEAL, idx,
fop->error == 0 ? 0 : -1, fop->error);
if (cbk != NULL)
{