diff options
| -rw-r--r-- | tests/basic/ec/ec-common | 5 | ||||
| -rw-r--r-- | tests/basic/ec/ec.t | 4 | ||||
| -rw-r--r-- | tests/basic/ec/self-heal.t | 4 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-combine.c | 6 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-common.c | 292 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-data.h | 1 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-dir-write.c | 30 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-inode-read.c | 2 | 
8 files changed, 231 insertions, 113 deletions
diff --git a/tests/basic/ec/ec-common b/tests/basic/ec/ec-common index 65422d72095..19818355f04 100644 --- a/tests/basic/ec/ec-common +++ b/tests/basic/ec/ec-common @@ -81,6 +81,11 @@ for dir in . dir1; do      EXPECT "$cs_big" echo $(sha1sum $dir/big | awk '{ print $1 }') +# Give enough time for current operations to complete. Otherwise the +# following kill_brick can cause data corruption and self-heal will be +# needed, but this script is not prepared to handle self-healing. +    sleep 2 +      for idx in `seq 0 $LAST_BRICK`; do          TEST kill_brick $V0 $H0 $B0/$V0$idx diff --git a/tests/basic/ec/ec.t b/tests/basic/ec/ec.t index 91bad52499a..77c6460f11a 100644 --- a/tests/basic/ec/ec.t +++ b/tests/basic/ec/ec.t @@ -190,6 +190,8 @@ TEST touch $M0/setxattr  TEST touch $M0/removexattr  TEST setfattr -n user.bar -v "ash_nazg_gimbatul" $M0/removexattr +sleep 2 +  # Kill a couple of bricks and allow some time for things to settle.  TEST kill_brick $V0 $H0 $B0/${V0}3  TEST kill_brick $V0 $H0 $B0/${V0}8 @@ -216,6 +218,8 @@ TEST setfattr -x user.bar $M0/removexattr  # Test uid/gid behavior  TEST setup_perm_file $M0 +sleep 2 +  # Unmount/remount so that create/write and truncate don't see cached data.  TEST umount $M0  TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0 diff --git a/tests/basic/ec/self-heal.t b/tests/basic/ec/self-heal.t index 3df19184169..2eb2663f8ea 100644 --- a/tests/basic/ec/self-heal.t +++ b/tests/basic/ec/self-heal.t @@ -178,6 +178,8 @@ for idx1 in {0..4}; do      done  done +sleep 2 +  TEST kill_brick $V0 $H0 $B0/${V0}0  TEST kill_brick $V0 $H0 $B0/${V0}1  TEST cp $tmp/test test2 @@ -196,6 +198,8 @@ TEST [ -f test4 ]  EXPECT "2" stat -c "%h" test2  EXPECT "2" stat -c "%h" test4 +sleep 2 +  TEST $CLI volume start $V0 force  # Wait until the killed bricks have been started and recognized by the ec  # xlator diff --git a/xlators/cluster/ec/src/ec-combine.c b/xlators/cluster/ec/src/ec-combine.c index 0fa5ac068de..93593fb8b6a 100644 --- a/xlators/cluster/ec/src/ec-combine.c +++ b/xlators/cluster/ec/src/ec-combine.c @@ -326,8 +326,10 @@ int32_t ec_dict_data_concat(const char * fmt, ec_cbk_data_t * cbk,      len = prelen;      for (i = 0; i < num; i++)      { -        memcpy(str + len, sep, seplen); -        len += seplen; +        if (i > 0) { +            memcpy(str + len, sep, seplen); +            len += seplen; +        }          tmp = data[i]->len - 1;          memcpy(str + len, data[i]->data, tmp);          len += tmp; diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c index 561871cee93..2ba17305411 100644 --- a/xlators/cluster/ec/src/ec-common.c +++ b/xlators/cluster/ec/src/ec-common.c @@ -631,9 +631,11 @@ int32_t ec_lock_compare(ec_lock_t * lock1, ec_lock_t * lock2)      return uuid_compare(lock1->loc.gfid, lock2->loc.gfid);  } -void ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, int32_t update) +ec_lock_link_t *ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, +                               int32_t update)  {      ec_lock_t * tmp; +    ec_lock_link_t *link = NULL;      int32_t tmp_update;      if ((fop->lock_count > 0) && @@ -654,13 +656,23 @@ void ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, int32_t update)      fop->lock_count++; -    lock->refs++; +    if (lock->timer != NULL) { +        link = lock->timer->data; +        ec_trace("UNLOCK_CANCELLED", link->fop, "lock=%p", lock); +        gf_timer_call_cancel(fop->xl->ctx, lock->timer); +        lock->timer = NULL; +    } else { +        lock->refs++; +    } + +    return link;  }  void ec_lock_prepare_entry(ec_fop_data_t *fop, loc_t *loc, int32_t update)  {      ec_lock_t * lock = NULL;      ec_inode_t * ctx = NULL; +    ec_lock_link_t *link = NULL;      loc_t tmp;      int32_t error; @@ -724,16 +736,21 @@ void ec_lock_prepare_entry(ec_fop_data_t *fop, loc_t *loc, int32_t update)      ctx->entry_lock = lock;  insert: -    ec_lock_insert(fop, lock, update); +    link = ec_lock_insert(fop, lock, update);  unlock:      UNLOCK(&tmp.inode->lock);      loc_wipe(&tmp); + +    if (link != NULL) { +        ec_resume(link->fop, 0); +    }  }  void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, int32_t update)  { +    ec_lock_link_t *link = NULL;      ec_lock_t * lock;      ec_inode_t * ctx; @@ -778,10 +795,14 @@ void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, int32_t update)      ctx->inode_lock = lock;  insert: -    ec_lock_insert(fop, lock, update); +    link = ec_lock_insert(fop, lock, update);  unlock:      UNLOCK(&loc->inode->lock); + +    if (link != NULL) { +        ec_resume(link->fop, 0); +    }  }  void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, int32_t update) @@ -898,90 +919,6 @@ void ec_lock(ec_fop_data_t * fop)      }  } -int32_t ec_unlocked(call_frame_t * frame, void * cookie, xlator_t * this, -                    int32_t op_ret, int32_t op_errno, dict_t * xdata) -{ -    ec_fop_data_t * fop = cookie; - -    if (op_ret < 0) -    { -        gf_log(this->name, GF_LOG_WARNING, "entry/inode unlocking failed (%s)", -               ec_fop_name(fop->parent->id)); -    } -    else -    { -        ec_trace("UNLOCKED", fop->parent, "lock=%p", fop->data); -    } - -    return 0; -} - -void ec_unlock(ec_fop_data_t * fop) -{ -    ec_lock_t * lock; -    int32_t i, refs; - -    for (i = 0; i < fop->lock_count; i++) -    { -        lock = fop->locks[i].lock; - -        LOCK(&lock->loc.inode->lock); - -        ec_trace("UNLOCK", fop, "lock=%p", lock); - -        refs = --lock->refs; -        if (refs == 0) -        { -            *lock->plock = NULL; -        } - -        UNLOCK(&lock->loc.inode->lock); - -        if (refs == 0) -        { -            if (lock->mask != 0) -            { -                ec_owner_set(fop->frame, lock); - -                switch (lock->kind) -                { -                    case EC_LOCK_ENTRY: -                        ec_trace("UNLOCK_ENTRYLK", fop, "lock=%p, inode=%p, " -                                                        "path=%s", -                                 lock, lock->loc.inode, lock->loc.path); - -                        ec_entrylk(fop->frame, fop->xl, lock->mask, -                                   EC_MINIMUM_ALL, ec_unlocked, lock, -                                   fop->xl->name, &lock->loc, NULL, -                                   ENTRYLK_UNLOCK, lock->type, NULL); - -                        break; - -                    case EC_LOCK_INODE: -                        lock->flock.l_type = F_UNLCK; -                        ec_trace("UNLOCK_INODELK", fop, "lock=%p, inode=%p", -                                 lock, lock->loc.inode); - -                        ec_inodelk(fop->frame, fop->xl, lock->mask, -                                   EC_MINIMUM_ALL, ec_unlocked, lock, -                                   fop->xl->name, &lock->loc, F_SETLK, -                                   &lock->flock, NULL); - -                        break; - -                    default: -                        gf_log(fop->xl->name, GF_LOG_ERROR, "Invalid lock " -                                                            "type"); -                } -            } - -            ec_trace("LOCK_DESTROY", fop, "lock=%p", lock); - -            ec_lock_destroy(lock); -        } -    } -} -  int32_t ec_get_size_version_set(call_frame_t * frame, void * cookie,                                  xlator_t * this, int32_t op_ret,                                  int32_t op_errno, inode_t * inode, @@ -991,7 +928,7 @@ int32_t ec_get_size_version_set(call_frame_t * frame, void * cookie,      ec_t * ec;      ec_fop_data_t * fop = cookie;      ec_inode_t * ctx; -    ec_lock_t * lock; +    ec_lock_t *lock = NULL;      if (op_ret >= 0)      { @@ -1192,6 +1129,58 @@ out:      ec_fop_set_error(fop, error);  } +int32_t ec_unlocked(call_frame_t *frame, void *cookie, xlator_t *this, +                    int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ +    ec_fop_data_t *fop = cookie; + +    if (op_ret < 0) { +        gf_log(this->name, GF_LOG_WARNING, "entry/inode unlocking failed (%s)", +               ec_fop_name(fop->parent->id)); +    } else { +        ec_trace("UNLOCKED", fop->parent, "lock=%p", fop->data); +    } + +    return 0; +} + +void ec_unlock_lock(ec_fop_data_t *fop, ec_lock_t *lock) +{ +    if (lock->mask != 0) { +        ec_owner_set(fop->frame, lock); + +        switch (lock->kind) { +        case EC_LOCK_ENTRY: +            ec_trace("UNLOCK_ENTRYLK", fop, "lock=%p, inode=%p, path=%s", lock, +                     lock->loc.inode, lock->loc.path); + +            ec_entrylk(fop->frame, fop->xl, lock->mask, EC_MINIMUM_ALL, +                       ec_unlocked, lock, fop->xl->name, &lock->loc, NULL, +                       ENTRYLK_UNLOCK, lock->type, NULL); + +            break; + +        case EC_LOCK_INODE: +            lock->flock.l_type = F_UNLCK; +            ec_trace("UNLOCK_INODELK", fop, "lock=%p, inode=%p", lock, +                     lock->loc.inode); + +            ec_inodelk(fop->frame, fop->xl, lock->mask, EC_MINIMUM_ALL, +                       ec_unlocked, lock, fop->xl->name, &lock->loc, F_SETLK, +                       &lock->flock, NULL); + +            break; + +        default: +            gf_log(fop->xl->name, GF_LOG_ERROR, "Invalid lock type"); +        } +    } + +    ec_trace("LOCK_DESTROY", fop, "lock=%p", lock); + +    ec_lock_destroy(lock); +} +  int32_t ec_update_size_version_done(call_frame_t * frame, void * cookie,                                      xlator_t * this, int32_t op_ret,                                      int32_t op_errno, dict_t * xattr, @@ -1209,11 +1198,15 @@ int32_t ec_update_size_version_done(call_frame_t * frame, void * cookie,          fop->parent->mask &= fop->good;      } +    if (fop->data != NULL) { +        ec_unlock_lock(fop->parent, fop->data); +    } +      return 0;  }  void ec_update_size_version(ec_fop_data_t *fop, loc_t *loc, uint64_t version, -                            uint64_t size) +                            uint64_t size, ec_lock_t *lock)  {      dict_t * dict;      uid_t uid; @@ -1253,7 +1246,7 @@ void ec_update_size_version(ec_fop_data_t *fop, loc_t *loc, uint64_t version,      fop->frame->root->gid = 0;      ec_xattrop(fop->frame, fop->xl, fop->mask, EC_MINIMUM_MIN, -               ec_update_size_version_done, NULL, loc, +               ec_update_size_version_done, lock, loc,                 GF_XATTROP_ADD_ARRAY64, dict, NULL);      fop->frame->root->uid = uid; @@ -1274,6 +1267,103 @@ out:      gf_log(fop->xl->name, GF_LOG_ERROR, "Unable to update version and size");  } +void ec_unlock_now(ec_fop_data_t *fop, ec_lock_t *lock) +{ +    ec_trace("UNLOCK_NOW", fop, "lock=%p", lock); + +    if (lock->version_delta != 0) { +        ec_update_size_version(fop, &lock->loc, lock->version_delta, +                               lock->size_delta, lock); +    } else { +        ec_unlock_lock(fop, lock); +    } + +    ec_resume(fop, 0); +} + +void ec_unlock_timer_cbk(void *data) +{ +    ec_lock_link_t *link = data; +    ec_lock_t *lock = link->lock; +    ec_fop_data_t *fop = NULL; + +    LOCK(&lock->loc.inode->lock); + +    if (lock->timer != NULL) { +        fop = link->fop; + +        ec_trace("UNLOCK_DELAYED", fop, "lock=%p", lock); + +        GF_ASSERT(lock->refs == 1); + +        gf_timer_call_cancel(fop->xl->ctx, lock->timer); +        lock->timer = NULL; +        *lock->plock = NULL; +    } + +    UNLOCK(&lock->loc.inode->lock); + +    if (fop != NULL) { +        ec_unlock_now(fop, lock); +    } +} + +void ec_unlock_timer_add(ec_lock_link_t *link) +{ +    struct timespec delay; +    ec_fop_data_t *fop = link->fop; +    ec_lock_t *lock = link->lock; +    int32_t refs = 1; + +    LOCK(&lock->loc.inode->lock); + +    GF_ASSERT(lock->timer == NULL); + +    if (lock->refs != 1) { +        ec_trace("UNLOCK_SKIP", fop, "lock=%p", lock); + +        lock->refs--; + +        UNLOCK(&lock->loc.inode->lock); +    } else { +        ec_trace("UNLOCK_DELAY", fop, "lock=%p", lock); + +        delay.tv_sec = 1; +        delay.tv_nsec = 0; + +        LOCK(&fop->lock); + +        fop->jobs++; +        fop->refs++; + +        UNLOCK(&fop->lock); + +        lock->timer = gf_timer_call_after(fop->xl->ctx, delay, +                                          ec_unlock_timer_cbk, link); +        if (lock->timer == NULL) { +            gf_log(fop->xl->name, GF_LOG_WARNING, "Unable to delay an unlock"); + +            *lock->plock = NULL; +            refs = 0; +        } + +        UNLOCK(&lock->loc.inode->lock); + +        if (refs == 0) { +            ec_unlock_now(fop, lock); +        } +    } +} + +void ec_unlock(ec_fop_data_t *fop) +{ +    int32_t i; + +    for (i = 0; i < fop->lock_count; i++) { +        ec_unlock_timer_add(&fop->locks[i]); +    } +} +  void ec_flush_size_version(ec_fop_data_t * fop)  {      ec_lock_t * lock; @@ -1296,7 +1386,7 @@ void ec_flush_size_version(ec_fop_data_t * fop)      if (version > 0)      { -        ec_update_size_version(fop, &lock->loc, version, delta); +        ec_update_size_version(fop, &lock->loc, version, delta, NULL);      }  } @@ -1305,16 +1395,10 @@ void ec_lock_reuse(ec_fop_data_t *fop)      ec_fop_data_t * wait_fop;      ec_lock_t * lock;      ec_lock_link_t * link; -    uint64_t version = 0, delta = 0; -    int32_t refs = 0;      int32_t i;      for (i = 0; i < fop->lock_count; i++)      { -        refs = 0; -        delta = 0; -        version = 0; -          wait_fop = NULL;          lock = fop->locks[i].lock; @@ -1338,14 +1422,6 @@ void ec_lock_reuse(ec_fop_data_t *fop)              }          } -        version = lock->version_delta; -        delta = lock->size_delta; -        refs = lock->refs; -        if (refs == 1) { -            lock->version_delta = 0; -            lock->size_delta = 0; -        } -          lock->good_mask &= fop->mask;          if (!list_empty(&lock->waiting)) @@ -1371,10 +1447,6 @@ void ec_lock_reuse(ec_fop_data_t *fop)              ec_resume(wait_fop, 0);          } - -        if ((refs == 1) && (version > 0)) { -            ec_update_size_version(fop, &lock->loc, version, delta); -        }      }  } diff --git a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h index ac197fe7f0b..97ec58469ad 100644 --- a/xlators/cluster/ec/src/ec-data.h +++ b/xlators/cluster/ec/src/ec-data.h @@ -142,6 +142,7 @@ union _ec_cbk  struct _ec_lock  {      ec_lock_t        **plock; +    gf_timer_t        *timer;      struct list_head   waiting;      uintptr_t          mask;      uintptr_t          good_mask; diff --git a/xlators/cluster/ec/src/ec-dir-write.c b/xlators/cluster/ec/src/ec-dir-write.c index 02961acb8bd..0bd10d4e27d 100644 --- a/xlators/cluster/ec/src/ec-dir-write.c +++ b/xlators/cluster/ec/src/ec-dir-write.c @@ -1016,11 +1016,41 @@ void ec_wind_mknod(ec_t * ec, ec_fop_data_t * fop, int32_t idx)  int32_t ec_manager_mknod(ec_fop_data_t * fop, int32_t state)  { +    ec_t *ec;      ec_cbk_data_t * cbk;      switch (state)      {          case EC_STATE_INIT: +            if (S_ISREG(fop->mode[0])) { +                if (fop->xdata == NULL) { +                    fop->xdata = dict_new(); +                    if (fop->xdata == NULL) { +                        fop->error = EIO; + +                        return EC_STATE_REPORT; +                    } +                } + +                ec = fop->xl->private; + +                fop->config.version = EC_CONFIG_VERSION; +                fop->config.algorithm = EC_CONFIG_ALGORITHM; +                fop->config.gf_word_size = EC_GF_BITS; +                fop->config.bricks = ec->nodes; +                fop->config.redundancy = ec->redundancy; +                fop->config.chunk_size = EC_METHOD_CHUNK_SIZE; + +                if (ec_dict_set_config(fop->xdata, EC_XATTR_CONFIG, +                                       &fop->config) < 0) { +                    fop->error = EIO; + +                    return EC_STATE_REPORT; +                } +            } + +        /* Fall through */ +          case EC_STATE_LOCK:              ec_lock_prepare_entry(fop, &fop->loc[0], 1);              ec_lock(fop); diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c index 78a189bc325..c31f0d97674 100644 --- a/xlators/cluster/ec/src/ec-inode-read.c +++ b/xlators/cluster/ec/src/ec-inode-read.c @@ -81,7 +81,7 @@ int32_t ec_manager_access(ec_fop_data_t * fop, int32_t state)          case EC_STATE_DISPATCH:              ec_dispatch_one(fop); -            return EC_STATE_PREPARE_ANSWER; +            return EC_STATE_REPORT;          case -EC_STATE_REPORT:              if (fop->cbks.access != NULL)  | 
