diff options
Diffstat (limited to 'xlators/cluster')
| -rw-r--r-- | xlators/cluster/ec/src/ec-combine.c | 35 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-common.c | 1509 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-common.h | 36 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-data.c | 82 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-data.h | 84 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-dir-read.c | 8 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-dir-write.c | 96 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-fops.h | 3 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-generic.c | 86 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-heal.c | 103 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-helpers.c | 38 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-helpers.h | 7 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-inode-read.c | 51 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-inode-write.c | 159 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-locks.c | 13 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec.c | 97 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec.h | 5 | 
17 files changed, 1347 insertions, 1065 deletions
diff --git a/xlators/cluster/ec/src/ec-combine.c b/xlators/cluster/ec/src/ec-combine.c index 9d4a18999f1..4617a0430f1 100644 --- a/xlators/cluster/ec/src/ec-combine.c +++ b/xlators/cluster/ec/src/ec-combine.c @@ -171,8 +171,10 @@ void ec_iatt_rebuild(ec_t * ec, struct iatt * iatt, int32_t count,  gf_boolean_t  ec_xattr_match (dict_t *dict, char *key, data_t *value, void *arg)  { -        if (fnmatch(GF_XATTR_STIME_PATTERN, key, 0) == 0) +        if ((fnmatch(GF_XATTR_STIME_PATTERN, key, 0) == 0) || +            (strcmp(key, GLUSTERFS_OPEN_FD_COUNT) == 0)) {                  return _gf_false; +        }          return _gf_true;  } @@ -185,6 +187,8 @@ ec_value_ignore (char *key)              (strcmp(key, GF_XATTR_USER_PATHINFO_KEY) == 0) ||              (strcmp(key, GF_XATTR_LOCKINFO_KEY) == 0) ||              (strcmp(key, GLUSTERFS_OPEN_FD_COUNT) == 0) || +            (strcmp(key, GLUSTERFS_INODELK_COUNT) == 0) || +            (strcmp(key, GLUSTERFS_ENTRYLK_COUNT) == 0) ||              (strncmp(key, GF_XATTR_CLRLK_CMD,                       strlen (GF_XATTR_CLRLK_CMD)) == 0) ||              (strncmp(key, EC_QUOTA_PREFIX, strlen(EC_QUOTA_PREFIX)) == 0) || @@ -225,15 +229,9 @@ int32_t ec_dict_list(data_t ** list, int32_t * count, ec_cbk_data_t * cbk,          dict = (which == EC_COMBINE_XDATA) ? ans->xdata : ans->dict;          list[i] = dict_get(dict, key); -        if (list[i] == NULL) -        { -            gf_log(cbk->fop->xl->name, GF_LOG_ERROR, "Unexpected missing " -                                                     "dictionary entry"); - -            return 0; +        if (list[i] != NULL) { +            i++;          } - -        i++;      }      *count = i; @@ -471,11 +469,6 @@ int32_t ec_dict_data_max32(ec_cbk_data_t *cbk, int32_t which, char *key)          return -1;      } -    if (num <= 1) -    { -        return 0; -    } -      max = data_to_uint32(data[0]);      for (i = 1; i < num; i++)      { @@ -507,10 +500,6 @@ int32_t ec_dict_data_max64(ec_cbk_data_t *cbk, int32_t which, char *key)          return -1;      } -    if (num <= 1) { -        return 0; -    } -      max = data_to_uint64(data[0]);      for (i = 1; i < num; i++) {          tmp = data_to_uint64(data[i]); @@ -630,6 +619,10 @@ int32_t ec_dict_data_combine(dict_t * dict, char * key, data_t * value,      {          return ec_dict_data_max32(data->cbk, data->which, key);      } +    if ((strcmp(key, GLUSTERFS_INODELK_COUNT) == 0) || +        (strcmp(key, GLUSTERFS_ENTRYLK_COUNT) == 0)) { +        return ec_dict_data_max32(data->cbk, data->which, key); +    }      if (strcmp(key, QUOTA_SIZE_KEY) == 0) {          return ec_dict_data_quota(data->cbk, data->which, key); @@ -831,6 +824,8 @@ void ec_combine (ec_cbk_data_t *newcbk, ec_combine_f combine)      LOCK(&fop->lock); +    fop->received |= newcbk->mask; +      item = fop->cbk_list.prev;      list_for_each_entry(cbk, &fop->cbk_list, list)      { @@ -868,7 +863,9 @@ void ec_combine (ec_cbk_data_t *newcbk, ec_combine_f combine)      }      cbk = list_entry(fop->cbk_list.next, ec_cbk_data_t, list); -    needed = fop->minimum - cbk->count - fop->winds + 1; +    if ((fop->mask ^ fop->remaining) == fop->received) { +        needed = fop->minimum - cbk->count; +    }      UNLOCK(&fop->lock); diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c index 393d9142797..ba81fc7313f 100644 --- a/xlators/cluster/ec/src/ec-common.c +++ b/xlators/cluster/ec/src/ec-common.c @@ -368,24 +368,34 @@ int32_t ec_child_select(ec_fop_data_t * fop)      uintptr_t mask = 0;      int32_t first = 0, num = 0; +    ec_fop_cleanup(fop); +      fop->mask &= ec->node_mask;      mask = ec->xl_up;      if (fop->parent == NULL)      { -        if (fop->loc[0].inode != NULL) { +        if ((fop->flags & EC_FLAG_UPDATE_LOC_PARENT) && fop->loc[0].parent) +            mask &= ec_inode_good(fop->loc[0].parent, fop->xl); + +        if ((fop->flags & EC_FLAG_UPDATE_LOC_INODE) && fop->loc[0].inode) {              mask &= ec_inode_good(fop->loc[0].inode, fop->xl);          } -        if (fop->loc[1].inode != NULL) { + +        if ((fop->flags & EC_FLAG_UPDATE_LOC_INODE) && fop->loc[1].inode) {              mask &= ec_inode_good(fop->loc[1].inode, fop->xl);          } -        if (fop->fd != NULL) { -            if (fop->fd->inode != NULL) { + +        if (fop->fd) { +            if ((fop->flags & EC_FLAG_UPDATE_FD_INODE) && fop->fd->inode) {                  mask &= ec_inode_good(fop->fd->inode, fop->xl);              } -            mask &= ec_fd_good(fop->fd, fop->xl); +            if (fop->flags & fop->flags & EC_FLAG_UPDATE_FD) { +                    mask &= ec_fd_good(fop->fd, fop->xl); +            }          }      } +      if ((fop->mask & ~mask) != 0)      {          gf_log(fop->xl->name, GF_LOG_WARNING, "Executing operation with " @@ -420,6 +430,7 @@ int32_t ec_child_select(ec_fop_data_t * fop)      /*Unconditionally wind on healing subvolumes*/      fop->mask |= fop->healing;      fop->remaining = fop->mask; +    fop->received = 0;      ec_trace("SELECT", fop, ""); @@ -585,7 +596,7 @@ void ec_dispatch_min(ec_fop_data_t * fop)      }  } -ec_lock_t * ec_lock_allocate(xlator_t * xl, int32_t kind, loc_t * loc) +ec_lock_t *ec_lock_allocate(xlator_t *xl, loc_t *loc)  {      ec_t * ec = xl->private;      ec_lock_t * lock; @@ -602,9 +613,9 @@ ec_lock_t * ec_lock_allocate(xlator_t * xl, int32_t kind, loc_t * loc)      lock = mem_get0(ec->lock_pool);      if (lock != NULL)      { -        lock->kind = kind;          lock->good_mask = -1ULL;          INIT_LIST_HEAD(&lock->waiting); +        INIT_LIST_HEAD(&lock->frozen);          if (ec_loc_from_loc(xl, &lock->loc, loc) != 0)          {              mem_put(lock); @@ -618,6 +629,9 @@ ec_lock_t * ec_lock_allocate(xlator_t * xl, int32_t kind, loc_t * loc)  void ec_lock_destroy(ec_lock_t * lock)  {      loc_wipe(&lock->loc); +    if (lock->fd != NULL) { +        fd_unref(lock->fd); +    }      mem_put(lock);  } @@ -627,166 +641,96 @@ int32_t ec_lock_compare(ec_lock_t * lock1, ec_lock_t * lock2)      return gf_uuid_compare(lock1->loc.gfid, lock2->loc.gfid);  } -ec_lock_link_t *ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, -                               int32_t update) +void ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, uint32_t flags, +                    loc_t *base)  { -    ec_lock_t *new_lock, *tmp; -    ec_lock_link_t *link = NULL; -    int32_t tmp_update; +    ec_lock_link_t *link; -    new_lock = lock; +    /* This check is only prepared for up to 2 locks per fop. If more locks +     * are needed this must be changed. */      if ((fop->lock_count > 0) && -        (ec_lock_compare(fop->locks[0].lock, new_lock) > 0)) -    { -        tmp = fop->locks[0].lock; -        fop->locks[0].lock = new_lock; -        new_lock = tmp; - -        tmp_update = fop->locks_update; -        fop->locks_update = update; -        update = tmp_update; -    } -    fop->locks[fop->lock_count].lock = new_lock; -    fop->locks[fop->lock_count].fop = fop; - -    fop->locks_update |= update << fop->lock_count; - -    fop->lock_count++; - -    if (lock->timer != NULL) { -        link = lock->timer->data; -        ec_trace("UNLOCK_CANCELLED", link->fop, "lock=%p", lock); -        gf_timer_call_cancel(fop->xl->ctx, lock->timer); -        lock->timer = NULL; +        (ec_lock_compare(fop->locks[0].lock, lock) < 0)) { +        fop->first_lock = fop->lock_count;      } else { -        lock->refs++; -    } - -    return link; -} - -void ec_lock_prepare_entry(ec_fop_data_t *fop, loc_t *loc, int32_t update) -{ -    ec_lock_t * lock = NULL; -    ec_inode_t * ctx = NULL; -    ec_lock_link_t *link = NULL; -    loc_t tmp; - -    if ((fop->parent != NULL) || (fop->error != 0)) -    { -        return; -    } - -    /* update is only 0 for 'opendir', which needs to lock the entry pointed -     * by loc instead of its parent. -     */ -    if (update) -    { -        if (ec_loc_parent(fop->xl, loc, &tmp) != 0) { -            ec_fop_set_error(fop, EIO); - -            return; -        } - -        /* If there's another lock, make sure that it's not the same. Otherwise -         * do not insert it. -         * -         * This can only happen on renames where source and target names are -         * in the same directory. */ -        if ((fop->lock_count > 0) && -            (fop->locks[0].lock->loc.inode == tmp.inode)) { -            goto wipe; +        /* When the first lock is added to the current fop, request lock +         * counts from locks xlator to be able to determine if there is +         * contention and release the lock sooner. */ +        if (fop->xdata == NULL) { +            fop->xdata = dict_new(); +            if (fop->xdata == NULL) { +                ec_fop_set_error(fop, ENOMEM); +                return; +            }          } -    } else { -        if (ec_loc_from_loc(fop->xl, &tmp, loc) != 0) { -            ec_fop_set_error(fop, EIO); - +        if (dict_set_str(fop->xdata, GLUSTERFS_INODELK_DOM_COUNT, +                         fop->xl->name) != 0) { +            ec_fop_set_error(fop, ENOMEM);              return;          }      } -    LOCK(&tmp.inode->lock); - -    ctx = __ec_inode_get(tmp.inode, fop->xl); -    if (ctx == NULL) -    { -        __ec_fop_set_error(fop, EIO); - -        goto unlock; -    } - -    if (ctx->entry_lock != NULL) -    { -        lock = ctx->entry_lock; -        ec_trace("LOCK_ENTRYLK", fop, "lock=%p, inode=%p, path=%s" -                                      "Lock already acquired", -                 lock, tmp.inode, tmp.path); +    link = &fop->locks[fop->lock_count++]; -        goto insert; -    } +    link->lock = lock; +    link->fop = fop; +    link->update[EC_DATA_TXN] = (flags & EC_UPDATE_DATA) != 0; +    link->update[EC_METADATA_TXN] = (flags & EC_UPDATE_META) != 0; +    link->base = base; -    lock = ec_lock_allocate(fop->xl, EC_LOCK_ENTRY, &tmp); -    if (lock == NULL) -    { -        __ec_fop_set_error(fop, EIO); - -        goto unlock; -    } - -    ec_trace("LOCK_CREATE", fop, "lock=%p", lock); - -    lock->type = ENTRYLK_WRLCK; - -    lock->plock = &ctx->entry_lock; -    ctx->entry_lock = lock; - -insert: -    link = ec_lock_insert(fop, lock, update); - -unlock: -    UNLOCK(&tmp.inode->lock); - -wipe: -    loc_wipe(&tmp); - -    if (link != NULL) { -        ec_resume(link->fop, 0); -    } +    lock->refs++; +    lock->inserted++;  } -void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, int32_t update) +void ec_lock_prepare_inode_internal(ec_fop_data_t *fop, loc_t *loc, +                                    uint32_t flags, loc_t *base)  { -    ec_lock_link_t *link = NULL; -    ec_lock_t * lock; -    ec_inode_t * ctx; +    ec_lock_t *lock = NULL; +    ec_inode_t *ctx; -    if ((fop->parent != NULL) || (fop->error != 0) || (loc->inode == NULL)) -    { +    if ((fop->parent != NULL) || (fop->error != 0) || (loc->inode == NULL)) {          return;      }      LOCK(&loc->inode->lock);      ctx = __ec_inode_get(loc->inode, fop->xl); -    if (ctx == NULL) -    { +    if (ctx == NULL) {          __ec_fop_set_error(fop, EIO);          goto unlock;      } -    if (ctx->inode_lock != NULL) -    { +    if (ctx->inode_lock != NULL) {          lock = ctx->inode_lock; + +        /* If there's another lock, make sure that it's not the same. Otherwise +         * do not insert it. +         * +         * This can only happen on renames where source and target names are +         * in the same directory. */ +        if ((fop->lock_count > 0) && (fop->locks[0].lock == lock)) { +            /* Combine data/meta updates */ +            fop->locks[0].update[EC_DATA_TXN] |= (flags & EC_UPDATE_DATA) != 0; +            fop->locks[0].update[EC_METADATA_TXN] |= +                                                 (flags & EC_UPDATE_META) != 0; + +            /* Only one base inode is allowed per fop, so there shouldn't be +             * overwrites here. */ +            if (base != NULL) { +                fop->locks[0].base = base; +            } + +            goto update_query; +        } +          ec_trace("LOCK_INODELK", fop, "lock=%p, inode=%p. Lock already "                                        "acquired", lock, loc->inode);          goto insert;      } -    lock = ec_lock_allocate(fop->xl, EC_LOCK_INODE, loc); -    if (lock == NULL) -    { +    lock = ec_lock_allocate(fop->xl, loc); +    if (lock == NULL) {          __ec_fop_set_error(fop, EIO);          goto unlock; @@ -797,154 +741,78 @@ void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, int32_t update)      lock->flock.l_type = F_WRLCK;      lock->flock.l_whence = SEEK_SET; -    lock->plock = &ctx->inode_lock; +    lock->ctx = ctx;      ctx->inode_lock = lock;  insert: -    link = ec_lock_insert(fop, lock, update); - +    ec_lock_insert(fop, lock, flags, base); +update_query: +    lock->query |= (flags & EC_QUERY_INFO) != 0;  unlock:      UNLOCK(&loc->inode->lock); +} -    if (link != NULL) { -        ec_resume(link->fop, 0); -    } +void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, uint32_t flags) +{ +    ec_lock_prepare_inode_internal(fop, loc, flags, NULL);  } -void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, int32_t update) +void ec_lock_prepare_parent_inode(ec_fop_data_t *fop, loc_t *loc, +                                  uint32_t flags)  { -    loc_t loc; +    loc_t tmp, *base = NULL; -    if ((fop->parent != NULL) || (fop->error != 0)) -    { +    if (fop->error != 0) {          return;      } -    if (ec_loc_from_fd(fop->xl, &loc, fd) == 0) -    { -        ec_lock_prepare_inode(fop, &loc, update); - -        loc_wipe(&loc); -    } -    else -    { +    if (ec_loc_parent(fop->xl, loc, &tmp) != 0) {          ec_fop_set_error(fop, EIO); -    } -} - -int32_t ec_locked(call_frame_t * frame, void * cookie, xlator_t * this, -                  int32_t op_ret, int32_t op_errno, dict_t * xdata) -{ -    ec_fop_data_t * fop = cookie; -    ec_lock_t * lock = NULL; - -    if (op_ret >= 0) -    { -        lock = fop->data; -        lock->mask = fop->good; -        lock->acquired = 1; -        fop->parent->mask &= fop->good; -        fop->parent->locked++; - -        ec_trace("LOCKED", fop->parent, "lock=%p", lock); - -        ec_lock(fop->parent); +        return;      } -    else -    { -        gf_log(this->name, GF_LOG_WARNING, "Failed to complete preop lock"); + +    if ((flags & EC_INODE_SIZE) != 0) { +        base = loc; +        flags ^= EC_INODE_SIZE;      } -    return 0; +    ec_lock_prepare_inode_internal(fop, &tmp, flags, base); + +    loc_wipe(&tmp);  } -void ec_lock(ec_fop_data_t * fop) +void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags)  { -    ec_lock_t * lock; - -    while (fop->locked < fop->lock_count) -    { -        lock = fop->locks[fop->locked].lock; - -        LOCK(&lock->loc.inode->lock); - -        if (lock->owner != NULL) -        { -            ec_trace("LOCK_WAIT", fop, "lock=%p", lock); - -            list_add_tail(&fop->locks[fop->locked].wait_list, &lock->waiting); - -            ec_sleep(fop); - -            UNLOCK(&lock->loc.inode->lock); - -            break; -        } -        lock->owner = fop; - -        UNLOCK(&lock->loc.inode->lock); - -        if (!lock->acquired) -        { -            ec_owner_set(fop->frame, lock); - -            if (lock->kind == EC_LOCK_ENTRY) -            { -                ec_trace("LOCK_ACQUIRE", fop, "lock=%p, inode=%p, path=%s", -                         lock, lock->loc.inode, lock->loc.path); - -                ec_entrylk(fop->frame, fop->xl, -1, EC_MINIMUM_ALL, ec_locked, -                           lock, fop->xl->name, &lock->loc, NULL, -                           ENTRYLK_LOCK, lock->type, NULL); -            } -            else -            { -                ec_trace("LOCK_ACQUIRE", fop, "lock=%p, inode=%p", lock, -                         lock->loc.inode); +    loc_t loc; -                ec_inodelk(fop->frame, fop->xl, -1, EC_MINIMUM_ALL, ec_locked, -                           lock, fop->xl->name, &lock->loc, F_SETLKW, -                           &lock->flock, NULL); -            } +    if (fop->error != 0) { +        return; +    } -            break; -        } +    if (ec_loc_from_fd(fop->xl, &loc, fd) != 0) { +        ec_fop_set_error(fop, EIO); -        ec_trace("LOCK_REUSE", fop, "lock=%p", lock); +        return; +    } -        if (lock->have_size) -        { -            fop->pre_size = fop->post_size = lock->size; -            fop->have_size = 1; -        } -        fop->mask &= lock->good_mask; +    ec_lock_prepare_inode_internal(fop, &loc, flags, NULL); -        fop->locked++; -    } +    loc_wipe(&loc);  }  gf_boolean_t -ec_config_check (ec_fop_data_t *fop, dict_t *xdata) +ec_config_check (ec_fop_data_t *fop, ec_config_t *config)  {      ec_t *ec; -    if (ec_dict_del_config(xdata, EC_XATTR_CONFIG, &fop->config) < 0) { -        gf_log(fop->xl->name, GF_LOG_ERROR, "Failed to get a valid " -                                            "config"); - -        ec_fop_set_error(fop, EIO); - -        return _gf_false; -    } -      ec = fop->xl->private; -    if ((fop->config.version != EC_CONFIG_VERSION) || -        (fop->config.algorithm != EC_CONFIG_ALGORITHM) || -        (fop->config.gf_word_size != EC_GF_BITS) || -        (fop->config.bricks != ec->nodes) || -        (fop->config.redundancy != ec->redundancy) || -        (fop->config.chunk_size != EC_METHOD_CHUNK_SIZE)) { +    if ((config->version != EC_CONFIG_VERSION) || +        (config->algorithm != EC_CONFIG_ALGORITHM) || +        (config->gf_word_size != EC_GF_BITS) || +        (config->bricks != ec->nodes) || +        (config->redundancy != ec->redundancy) || +        (config->chunk_size != EC_METHOD_CHUNK_SIZE)) {          uint32_t data_bricks;          /* This combination of version/algorithm requires the following @@ -957,271 +825,201 @@ ec_config_check (ec_fop_data_t *fop, dict_t *xdata)               chunk_size (in bits) must be a multiple of gf_word_size *                   (bricks - redundancy) */ -        data_bricks = fop->config.bricks - fop->config.redundancy; -        if ((fop->config.redundancy < 1) || -            (fop->config.redundancy * 2 >= fop->config.bricks) || -            !ec_is_power_of_2(fop->config.gf_word_size) || -            ((fop->config.chunk_size * 8) % (fop->config.gf_word_size * -                                             data_bricks) != 0)) { +        data_bricks = config->bricks - config->redundancy; +        if ((config->redundancy < 1) || +            (config->redundancy * 2 >= config->bricks) || +            !ec_is_power_of_2(config->gf_word_size) || +            ((config->chunk_size * 8) % (config->gf_word_size * data_bricks) +                                                                       != 0)) {              gf_log(fop->xl->name, GF_LOG_ERROR, "Invalid or corrupted config");          } else {              gf_log(fop->xl->name, GF_LOG_ERROR, "Unsupported config "                                                  "(V=%u, A=%u, W=%u, "                                                  "N=%u, R=%u, S=%u)", -                   fop->config.version, fop->config.algorithm, -                   fop->config.gf_word_size, fop->config.bricks, -                   fop->config.redundancy, fop->config.chunk_size); +                   config->version, config->algorithm, +                   config->gf_word_size, config->bricks, +                   config->redundancy, config->chunk_size);          } -        ec_fop_set_error(fop, EIO); -          return _gf_false;      }      return _gf_true;  } -int32_t ec_get_size_version_set(call_frame_t * frame, void * cookie, -                                xlator_t * this, int32_t op_ret, -                                int32_t op_errno, inode_t * inode, -                                struct iatt * buf, dict_t * xdata, -                                struct iatt * postparent) +int32_t +ec_prepare_update_cbk (call_frame_t *frame, void *cookie, +                       xlator_t *this, int32_t op_ret, int32_t op_errno, +                       dict_t *dict, dict_t *xdata)  { -    ec_fop_data_t * fop = cookie; -    ec_inode_t * ctx; +    ec_fop_data_t *fop = cookie, *parent; +    ec_lock_link_t *link = fop->data;      ec_lock_t *lock = NULL; +    ec_inode_t *ctx; -    if (op_ret >= 0) -    { -        if ((buf->ia_type == IA_IFREG) && !ec_config_check(fop, xdata)) { -            return 0; -        } +    lock = link->lock; +    parent = link->fop; +    ctx = lock->ctx; -        LOCK(&inode->lock); +    if (op_ret < 0) { +        gf_log(this->name, GF_LOG_WARNING, +               "Failed to get size and version (error %d: %s)", op_errno, +               strerror (op_errno)); -        ctx = __ec_inode_get(inode, this); -        if (ctx != NULL) { -            if (ctx->inode_lock != NULL) { -                lock = ctx->inode_lock; -                lock->version[0] = fop->answer->version[0]; -                lock->version[1] = fop->answer->version[1]; +        goto out; +    } -                if (buf->ia_type == IA_IFREG) { -                    lock->have_size = 1; -                    lock->size = buf->ia_size; -                } -            } -            if (ctx->entry_lock != NULL) { -                lock = ctx->entry_lock; -                lock->version[0] = fop->answer->version[0]; -                lock->version[1] = fop->answer->version[1]; -            } -        } +    op_errno = EIO; -        UNLOCK(&inode->lock); +    LOCK(&lock->loc.inode->lock); -        if (lock != NULL) -        { -            // Only update parent mask if the lookup has been made with -            // inode locked. -            fop->parent->mask &= fop->good; -        } +    if (ec_dict_del_array(dict, EC_XATTR_VERSION, ctx->pre_version, +                          EC_VERSION_SIZE) != 0) { +        gf_log(this->name, GF_LOG_ERROR, "Unable to get version xattr"); -        if (buf->ia_type == IA_IFREG) { -            fop->parent->pre_size = fop->parent->post_size = buf->ia_size; -            fop->parent->have_size = 1; -        } -    } -    else -    { -        gf_log(this->name, GF_LOG_WARNING, "Failed to get size and version " -                                           "(error %d)", op_errno); -        ec_fop_set_error(fop, op_errno); +        goto unlock;      } +    ctx->post_version[0] += ctx->pre_version[0]; +    ctx->post_version[1] += ctx->pre_version[1]; -    return 0; -} +    ctx->have_version = _gf_true; -gf_boolean_t -ec_is_data_fop (glusterfs_fop_t fop) -{ -        switch (fop) { -        case GF_FOP_WRITE: -        case GF_FOP_TRUNCATE: -        case GF_FOP_FTRUNCATE: -        case GF_FOP_FALLOCATE: -        case GF_FOP_DISCARD: -        case GF_FOP_ZEROFILL: -                return _gf_true; -        default: -                return _gf_false; -        } -        return _gf_false; -} +    if (ec_dict_del_array(dict, EC_XATTR_DIRTY, ctx->pre_dirty, +                          EC_VERSION_SIZE) == 0) { +        ctx->post_dirty[0] += ctx->pre_dirty[0]; +        ctx->post_dirty[1] += ctx->pre_dirty[1]; -gf_boolean_t -ec_is_metadata_fop (glusterfs_fop_t fop) -{ -        switch (fop) { -        case GF_FOP_SETATTR: -        case GF_FOP_FSETATTR: -        case GF_FOP_SETXATTR: -        case GF_FOP_FSETXATTR: -        case GF_FOP_REMOVEXATTR: -        case GF_FOP_FREMOVEXATTR: -                return _gf_true; -        default: -                return _gf_false; -        } -        return _gf_false; -} +        ctx->have_dirty = _gf_true; +    } -int32_t -ec_prepare_update_cbk (call_frame_t *frame, void *cookie, -                       xlator_t *this, int32_t op_ret, int32_t op_errno, -                       dict_t *dict, dict_t *xdata) -{ -    ec_fop_data_t *fop = cookie, *parent; -    ec_lock_t *lock = NULL; -    uint64_t size = 0; -    uint64_t version[EC_VERSION_SIZE] = {0, 0}; +    if (lock->loc.inode->ia_type == IA_IFREG) { +        if (ec_dict_del_number(dict, EC_XATTR_SIZE, &ctx->pre_size) != 0) { +            gf_log(this->name, GF_LOG_ERROR, "Unable to get size xattr"); -    if (op_ret >= 0) { -        parent = fop->parent; -        while ((parent != NULL) && (parent->locks[0].lock == NULL)) { -            parent = parent->parent; -        } -        if (parent == NULL) { -            return 0; +            goto unlock;          } +        ctx->post_size = ctx->pre_size; -        lock = parent->locks[0].lock; -        if (ec_is_metadata_fop (fop->parent->id)) -                lock->is_dirty[EC_METADATA_TXN] = _gf_true; -        else -                lock->is_dirty[EC_DATA_TXN] = _gf_true; - -	if (lock->loc.inode->ia_type == IA_IFREG) { -            if (!ec_config_check(fop, dict) || -                (ec_dict_del_number(dict, EC_XATTR_SIZE, &size) != 0)) { -                ec_fop_set_error(fop, EIO); -                return 0; -            } -        } +        ctx->have_size = _gf_true; -        if (ec_dict_del_array(dict, EC_XATTR_VERSION, version, -                               EC_VERSION_SIZE) != 0) { -            ec_fop_set_error(fop, EIO); -            return 0; +        if ((ec_dict_del_config(dict, EC_XATTR_CONFIG, &ctx->config) != 0) || +            !ec_config_check(parent, &ctx->config)) { +            gf_log(this->name, GF_LOG_ERROR, "Unable to get config xattr"); + +            goto unlock;          } -        LOCK(&lock->loc.inode->lock); +        ctx->have_config = _gf_true; +    } -        if (lock->loc.inode->ia_type == IA_IFREG) { -            lock->size = size; -            fop->parent->pre_size = fop->parent->post_size = size; -            fop->parent->have_size = lock->have_size = 1; -        } -        lock->version[0] = version[0]; -        lock->version[1] = version[1]; +    ctx->have_info = _gf_true; -        UNLOCK(&lock->loc.inode->lock); +    op_errno = 0; + +unlock: +    UNLOCK(&lock->loc.inode->lock); +out: +    if (op_errno == 0) { +        parent->mask &= fop->good; -        fop->parent->mask &= fop->good;          /*As of now only data healing marks bricks as healing*/ -        if (ec_is_data_fop (fop->parent->id)) -                fop->parent->healing |= fop->healing; +        if (ec_is_data_fop (parent->id)) { +            parent->healing |= fop->healing; +        }      } else { -        gf_log(this->name, GF_LOG_WARNING, -               "Failed to get size and version (error %d: %s)", op_errno, -               strerror (op_errno)); -        ec_fop_set_error(fop, op_errno); +        ec_fop_set_error(parent, op_errno);      }      return 0;  } -void ec_get_size_version(ec_fop_data_t * fop) +void ec_get_size_version(ec_lock_link_t *link)  {      loc_t loc; -    dict_t * xdata; +    ec_lock_t *lock; +    ec_inode_t *ctx; +    ec_fop_data_t *fop; +    dict_t *dict = NULL;      uid_t uid;      gid_t gid;      int32_t error = ENOMEM;      uint64_t allzero[EC_VERSION_SIZE] = {0, 0}; -    if (fop->have_size) -    { +    lock = link->lock; +    ctx = lock->ctx; + +    /* If ec metadata has already been retrieved, do not try again. */ +    if (ctx->have_info) {          return;      } -    if ((fop->parent != NULL) && fop->parent->have_size) -    { -        fop->pre_size = fop->parent->pre_size; -        fop->post_size = fop->parent->post_size; - -        fop->have_size = 1; - +    /* Determine if there's something we need to retrieve for the current +     * operation. */ +    if (!lock->query && (lock->loc.inode->ia_type != IA_IFREG)) {          return;      } + +    fop = link->fop; +      uid = fop->frame->root->uid;      gid = fop->frame->root->gid; -    fop->frame->root->uid = 0; -    fop->frame->root->gid = 0; -      memset(&loc, 0, sizeof(loc)); -    xdata = dict_new(); -    if (xdata == NULL) -    { +    dict = dict_new(); +    if (dict == NULL) {          goto out;      } -    if ((ec_dict_set_array(xdata, EC_XATTR_VERSION, -                           allzero, EC_VERSION_SIZE) != 0) || -        (ec_dict_set_number(xdata, EC_XATTR_SIZE, 0) != 0) || -        (ec_dict_set_number(xdata, EC_XATTR_CONFIG, 0) != 0) || -        (ec_dict_set_array(xdata, EC_XATTR_DIRTY, allzero, -                           EC_VERSION_SIZE) != 0)) -    { + +    /* Once we know that an xattrop will be needed, we try to get all available +     * information in a single call. */ +    if ((ec_dict_set_array(dict, EC_XATTR_VERSION, allzero, +                          EC_VERSION_SIZE) != 0) || +        (ec_dict_set_array(dict, EC_XATTR_DIRTY, allzero, +                           EC_VERSION_SIZE) != 0)) {          goto out;      } -    error = EIO; +    if (lock->loc.inode->ia_type == IA_IFREG) { +        if ((ec_dict_set_number(dict, EC_XATTR_SIZE, 0) != 0) || +            (ec_dict_set_number(dict, EC_XATTR_CONFIG, 0) != 0)) { +            goto out; +        } +    } -    if (!fop->use_fd) -    { -        if (ec_loc_from_loc(fop->xl, &loc, &fop->loc[0]) != 0) -        { +    fop->frame->root->uid = 0; +    fop->frame->root->gid = 0; + +    /* For normal fops, ec_[f]xattrop() must succeed on at least +     * EC_MINIMUM_MIN bricks, however when this is called as part of a +     * self-heal operation the mask of target bricks (fop->mask) could +     * contain less than EC_MINIMUM_MIN bricks, causing the lookup to +     * always fail. Thus we always use the same minimum used for the main +     * fop. +     */ +    if (lock->fd == NULL) { +        if (ec_loc_from_loc(fop->xl, &loc, &lock->loc) != 0) {              goto out;          } -        if (gf_uuid_is_null(loc.pargfid)) -        { -            if (loc.parent != NULL) -            { +        if (gf_uuid_is_null(loc.pargfid)) { +            if (loc.parent != NULL) {                  inode_unref(loc.parent);                  loc.parent = NULL;              }              GF_FREE((char *)loc.path); -	    loc.path = NULL; -	    loc.name = NULL; +            loc.path = NULL; +            loc.name = NULL;          } -	/* For normal fops, ec_lookup() must succeed on at least EC_MINIMUM_MIN -	 * bricks, however when this is called as part of a self-heal operation -	 * the mask of target bricks (fop->mask) could contain less than -	 * EC_MINIMUM_MIN bricks, causing the lookup to always fail. Thus we -	 * always use the same minimum used for the main fop. -	 */ -	 ec_lookup(fop->frame, fop->xl, fop->mask, fop->minimum, -                      ec_get_size_version_set, NULL, &loc, xdata); + +        ec_xattrop (fop->frame, fop->xl, fop->mask, fop->minimum, +                    ec_prepare_update_cbk, link, &loc, +                    GF_XATTROP_ADD_ARRAY64, dict, NULL);      } else { -        if (ec_loc_from_fd(fop->xl, &loc, fop->fd) != 0) { -	    goto out; -        }          ec_fxattrop(fop->frame, fop->xl, fop->mask, fop->minimum, -                ec_prepare_update_cbk, NULL, fop->fd, -                GF_XATTROP_ADD_ARRAY64, xdata, NULL); +                ec_prepare_update_cbk, link, lock->fd, +                GF_XATTROP_ADD_ARRAY64, dict, NULL);      } +      error = 0;  out: @@ -1230,9 +1028,8 @@ out:      loc_wipe(&loc); -    if (xdata != NULL) -    { -        dict_unref(xdata); +    if (dict != NULL) { +        dict_unref(dict);      }      if (error != 0) { @@ -1240,147 +1037,408 @@ out:      }  } -void ec_prepare_update(ec_fop_data_t *fop) +gf_boolean_t ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode, +                               uint64_t *size)  { -    loc_t loc; -    dict_t *xdata; -    ec_fop_data_t *tmp; -    ec_lock_t *lock; -    ec_t *ec; -    uid_t uid; -    gid_t gid; -    uint64_t version[2] = {0, 0}; -    uint64_t dirty[2] = {0, 0}; -    int32_t error = ENOMEM; +    ec_inode_t *ctx; +    gf_boolean_t found = _gf_false; -    tmp = fop; -    while ((tmp != NULL) && (tmp->locks[0].lock == NULL)) { -        tmp = tmp->parent; -    } -    if ((tmp != NULL) && -        (tmp->locks[0].lock->is_dirty[0] || tmp->locks[0].lock->is_dirty[1])) { -        lock = tmp->locks[0].lock; +    LOCK(&inode->lock); -        fop->pre_size = fop->post_size = lock->size; -        fop->have_size = 1; +    ctx = __ec_inode_get(inode, fop->xl); +    if (ctx == NULL) { +        goto unlock; +    } -        return; +    if (ctx->have_size) { +        *size = ctx->post_size; +        found = _gf_true;      } -    uid = fop->frame->root->uid; -    gid = fop->frame->root->gid; -    fop->frame->root->uid = 0; -    fop->frame->root->gid = 0; +unlock: +    UNLOCK(&inode->lock); -    memset(&loc, 0, sizeof(loc)); +    return found; +} -    ec = fop->xl->private; -    if (ec_bits_count (fop->mask) >= ec->fragments) { -            /* It is changing data only if the update happens on at least -             * fragment number of bricks. Otherwise it probably is healing*/ -            if (ec_is_metadata_fop (fop->id)) -                    dirty[EC_METADATA_TXN] = 1; -            else -                    dirty[EC_DATA_TXN] = 1; +gf_boolean_t ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode, +                               uint64_t size) +{ +    ec_inode_t *ctx; +    gf_boolean_t found = _gf_false; + +    LOCK(&inode->lock); + +    ctx = __ec_inode_get(inode, fop->xl); +    if (ctx == NULL) { +        goto unlock;      } -    xdata = dict_new(); -    if (xdata == NULL) { -        goto out; +    /* Normal fops always have ctx->have_size set. However self-heal calls this +     * to prepare the inode, so ctx->have_size will be false. In this case we +     * prepare both pre_size and post_size, and set have_size and have_info to +     * true. */ +    if (!ctx->have_size) { +        ctx->pre_size = size; +        ctx->have_size = ctx->have_info = _gf_true;      } -    if ((ec_dict_set_array(xdata, EC_XATTR_VERSION, -                           version, EC_VERSION_SIZE) != 0) || -        (ec_dict_set_number(xdata, EC_XATTR_SIZE, 0) != 0) || -        (ec_dict_set_number(xdata, EC_XATTR_CONFIG, 0) != 0) || -        (ec_dict_set_array(xdata, EC_XATTR_DIRTY, dirty, -                           EC_VERSION_SIZE) != 0)) { -            goto out; +    ctx->post_size = size; + +    found = _gf_true; + +unlock: +    UNLOCK(&inode->lock); + +    return found; +} + +void ec_clear_inode_info(ec_fop_data_t *fop, inode_t *inode) +{ +    ec_inode_t *ctx; + +    LOCK(&inode->lock); + +    ctx = __ec_inode_get(inode, fop->xl); +    if (ctx == NULL) { +        goto unlock;      } -    error = EIO; +    ctx->have_info = _gf_false; +    ctx->have_config = _gf_false; +    ctx->have_version = _gf_false; +    ctx->have_size = _gf_false; +    ctx->have_dirty = _gf_false; -    if (!fop->use_fd) { -        if (ec_loc_from_loc(fop->xl, &loc, &fop->loc[0]) != 0) { -            goto out; -        } +    memset(&ctx->config, 0, sizeof(ctx->config)); +    memset(ctx->pre_version, 0, sizeof(ctx->pre_version)); +    memset(ctx->post_version, 0, sizeof(ctx->post_version)); +    ctx->pre_size = ctx->post_size = 0; +    memset(ctx->pre_dirty, 0, sizeof(ctx->pre_dirty)); +    memset(ctx->post_dirty, 0, sizeof(ctx->post_dirty)); + +unlock: +    UNLOCK(&inode->lock); +} -        ec_xattrop(fop->frame, fop->xl, fop->mask, fop->minimum, -                   ec_prepare_update_cbk, NULL, &loc, GF_XATTROP_ADD_ARRAY64, -                   xdata, NULL); +int32_t ec_get_real_size_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +                             int32_t op_ret, int32_t op_errno, inode_t *inode, +                             struct iatt *buf, dict_t *xdata, +                             struct iatt *postparent) +{ +    ec_fop_data_t *fop = cookie; +    ec_lock_link_t *link; + +    if (op_ret >= 0) { +        link = fop->data; +        if (ec_dict_del_number(xdata, EC_XATTR_SIZE, &link->size) != 0) { +            gf_log(this->name, GF_LOG_WARNING, +                   "Unable to determine real file size"); +        }      } else { -        ec_fxattrop(fop->frame, fop->xl, fop->mask, fop->minimum, -                   ec_prepare_update_cbk, NULL, fop->fd, -                   GF_XATTROP_ADD_ARRAY64, xdata, NULL); +        /* Prevent failure of parent fop. */ +        fop->error = 0;      } -    error = 0; +    return 0; +} -out: +/* This function is used to get the trusted.ec.size xattr from a file when + * no lock is needed on the inode. This is only required to maintan iatt + * structs on fops that manipulate directory entries but do not operate + * directly on the inode, like link, rename, ... + * + * Any error processing this request is ignored. In the worst case, an invalid + * or not up to date value in the iatt could cause some cache invalidation. + */ +void ec_get_real_size(ec_lock_link_t *link) +{ +    ec_fop_data_t *fop; +    dict_t *xdata; -    fop->frame->root->uid = uid; -    fop->frame->root->gid = gid; +    if (link->base == NULL) { +        return; +    } -    loc_wipe(&loc); +    if (link->base->inode->ia_type != IA_IFREG) { +        return; +    } +    fop = link->fop; + +    if (ec_get_inode_size(fop, link->base->inode, &link->size)) { +        return; +    } + +    xdata = dict_new(); +    if (xdata == NULL) { +        return; +    } +    if (ec_dict_set_number(xdata, EC_XATTR_SIZE, 0) != 0) { +        goto out; +    } + +    /* Send a simple lookup. A single answer is considered ok since this value +     * is only used to return an iatt struct related to an inode that is not +     * locked and have not suffered any operation. */ +    ec_lookup(fop->frame, fop->xl, fop->mask, 1, ec_get_real_size_cbk, link, +              link->base, xdata); + +out:      if (xdata != NULL) {          dict_unref(xdata);      } +} -    if (error != 0) { -        ec_fop_set_error(fop, error); +void ec_lock_acquired(ec_lock_link_t *link) +{ +    ec_lock_t *lock; +    ec_fop_data_t *fop; + +    lock = link->lock; +    fop = link->fop; + +    ec_trace("LOCKED", link->fop, "lock=%p", lock); + +    /* If the fop has an fd available, attach it to the lock structure to be +     * able to do fxattrop calls instead of xattrop. It's safe to change this +     * here because no xattrop using the fd can start concurrently at this +     * point. */ +    if (fop->use_fd) { +        if (lock->fd != NULL) { +            fd_unref(lock->fd); +        } +        lock->fd = fd_ref(fop->fd);      } +    lock->acquired = _gf_true; + +    fop->mask &= lock->good_mask; + +    fop->locked++; + +    ec_get_size_version(link); +    ec_get_real_size(link);  } -int32_t ec_unlocked(call_frame_t *frame, void *cookie, xlator_t *this, -                    int32_t op_ret, int32_t op_errno, dict_t *xdata) +int32_t ec_locked(call_frame_t *frame, void *cookie, xlator_t *this, +                  int32_t op_ret, int32_t op_errno, dict_t *xdata)  {      ec_fop_data_t *fop = cookie; +    ec_lock_link_t *link = NULL; +    ec_lock_t *lock = NULL; -    if (op_ret < 0) { -        gf_log(this->name, GF_LOG_WARNING, "entry/inode unlocking failed (%s)", -               ec_fop_name(fop->parent->id)); +    if (op_ret >= 0) { +        link = fop->data; +        lock = link->lock; +        lock->mask = lock->good_mask = fop->good; + +        ec_lock_acquired(link); +        ec_lock(fop->parent);      } else { -        ec_trace("UNLOCKED", fop->parent, "lock=%p", fop->data); +        gf_log(this->name, GF_LOG_WARNING, "Failed to complete preop lock");      }      return 0;  } -void ec_unlock_lock(ec_fop_data_t *fop, ec_lock_t *lock) +gf_boolean_t ec_lock_acquire(ec_lock_link_t *link)  { -    if ((lock->mask != 0) && lock->acquired) { +    ec_lock_t *lock; +    ec_fop_data_t *fop; + +    lock = link->lock; +    fop = link->fop; +    if (!lock->acquired) {          ec_owner_set(fop->frame, lock); -        switch (lock->kind) { -        case EC_LOCK_ENTRY: -            ec_trace("UNLOCK_ENTRYLK", fop, "lock=%p, inode=%p, path=%s", lock, -                     lock->loc.inode, lock->loc.path); +        ec_trace("LOCK_ACQUIRE", fop, "lock=%p, inode=%p", lock, +                 lock->loc.inode); + +        lock->flock.l_type = F_WRLCK; +        ec_inodelk(fop->frame, fop->xl, -1, EC_MINIMUM_ALL, ec_locked, +                   link, fop->xl->name, &lock->loc, F_SETLKW, &lock->flock, +                   NULL); + +        return _gf_false; +    } + +    ec_trace("LOCK_REUSE", fop, "lock=%p", lock); + +    ec_lock_acquired(link); + +    return _gf_true; +} + +void ec_lock(ec_fop_data_t *fop) +{ +    ec_lock_link_t *link; +    ec_lock_link_t *timer_link = NULL; +    ec_lock_t *lock; + +    /* There is a chance that ec_resume is called on fop even before ec_sleep. +     * Which can result in refs == 0 for fop leading to use after free in this +     * function when it calls ec_sleep so do ec_sleep at start and end of this +     * function.*/ +    ec_sleep (fop); +    while (fop->locked < fop->lock_count) { +        /* Since there are only up to 2 locks per fop, this xor will change +         * the order of the locks if fop->first_lock is 1. */ +        link = &fop->locks[fop->locked ^ fop->first_lock]; +        lock = link->lock; + +        timer_link = NULL; + +        LOCK(&lock->loc.inode->lock); +        GF_ASSERT (lock->inserted > 0); +        lock->inserted--; + +        if (lock->timer != NULL) { +            GF_ASSERT (lock->release == _gf_false); +            timer_link = lock->timer->data; +            ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock); +            gf_timer_call_cancel(fop->xl->ctx, lock->timer); +            lock->timer = NULL; + +            lock->refs--; +            /* There should remain at least 1 ref, the current one. */ +            GF_ASSERT(lock->refs > 0); +        } + +        GF_ASSERT(list_empty(&link->wait_list)); + +        if ((lock->owner != NULL) || lock->release) { +            if (lock->release) { +                ec_trace("LOCK_QUEUE_FREEZE", fop, "lock=%p", lock); -            ec_entrylk(fop->frame, fop->xl, lock->mask, EC_MINIMUM_ALL, -                       ec_unlocked, lock, fop->xl->name, &lock->loc, NULL, -                       ENTRYLK_UNLOCK, lock->type, NULL); +                list_add_tail(&link->wait_list, &lock->frozen); + +                /* The lock is frozen, so we move the current reference to +                 * refs_frozen. After that, there should remain at least one +                 * ref belonging to the lock that is processing the release. */ +                lock->refs--; +                GF_ASSERT(lock->refs > 0); +                lock->refs_frozen++; +            } else { +                ec_trace("LOCK_QUEUE_WAIT", fop, "lock=%p", lock); + +                list_add_tail(&link->wait_list, &lock->waiting); +            } + +            UNLOCK(&lock->loc.inode->lock); + +            ec_sleep(fop);              break; +        } -        case EC_LOCK_INODE: -            lock->flock.l_type = F_UNLCK; -            ec_trace("UNLOCK_INODELK", fop, "lock=%p, inode=%p", lock, -                     lock->loc.inode); +        lock->owner = fop; -            ec_inodelk(fop->frame, fop->xl, lock->mask, EC_MINIMUM_ALL, -                       ec_unlocked, lock, fop->xl->name, &lock->loc, F_SETLK, -                       &lock->flock, NULL); +        UNLOCK(&lock->loc.inode->lock); +        if (!ec_lock_acquire(link)) {              break; +        } -        default: -            gf_log(fop->xl->name, GF_LOG_ERROR, "Invalid lock type"); +        if (timer_link != NULL) { +            ec_resume(timer_link->fop, 0); +            timer_link = NULL;          }      } +    ec_resume (fop, 0); + +    if (timer_link != NULL) { +        ec_resume(timer_link->fop, 0); +    } +} + +void +ec_lock_unfreeze(ec_lock_link_t *link) +{ +    ec_lock_t *lock; -    ec_trace("LOCK_DESTROY", fop, "lock=%p", lock); +    lock = link->lock; + +    LOCK(&lock->loc.inode->lock); + +    lock->acquired = _gf_false; +    lock->release = _gf_false; + +    lock->refs--; +    GF_ASSERT (lock->refs == lock->inserted); + +    GF_ASSERT(list_empty(&lock->waiting) && (lock->owner == NULL)); + +    list_splice_init(&lock->frozen, &lock->waiting); +    lock->refs += lock->refs_frozen; +    lock->refs_frozen = 0; + +    if (!list_empty(&lock->waiting)) { +        link = list_entry(lock->waiting.next, ec_lock_link_t, wait_list); +        list_del_init(&link->wait_list); + +        lock->owner = link->fop; + +        UNLOCK(&lock->loc.inode->lock); + +        ec_trace("LOCK_UNFREEZE", link->fop, "lock=%p", lock); + +        if (ec_lock_acquire(link)) { +            ec_lock(link->fop); +        } +        ec_resume(link->fop, 0); +    } else if (lock->refs == 0) { +        ec_trace("LOCK_DESTROY", link->fop, "lock=%p", lock); + +        lock->ctx->inode_lock = NULL; + +        UNLOCK(&lock->loc.inode->lock); -    ec_lock_destroy(lock); +        ec_lock_destroy(lock); +    } else { +        UNLOCK(&lock->loc.inode->lock); +    } +} + +int32_t ec_unlocked(call_frame_t *frame, void *cookie, xlator_t *this, +                    int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ +    ec_fop_data_t *fop = cookie; +    ec_lock_link_t *link = fop->data; + +    if (op_ret < 0) { +        gf_log(this->name, GF_LOG_WARNING, "entry/inode unlocking failed (%s)", +               ec_fop_name(link->fop->id)); +    } else { +        ec_trace("UNLOCKED", link->fop, "lock=%p", link->lock); +    } + +    ec_lock_unfreeze(link); + +    return 0; +} + +void ec_unlock_lock(ec_lock_link_t *link) +{ +    ec_lock_t *lock; +    ec_fop_data_t *fop; + +    lock = link->lock; +    fop = link->fop; + +    ec_clear_inode_info(fop, lock->loc.inode); + +    if ((lock->mask != 0) && lock->acquired) { +        ec_owner_set(fop->frame, lock); + +        lock->flock.l_type = F_UNLCK; +        ec_trace("UNLOCK_INODELK", fop, "lock=%p, inode=%p", lock, +                 lock->loc.inode); + +        ec_inodelk(fop->frame, fop->xl, lock->mask, EC_MINIMUM_ALL, +                   ec_unlocked, link, fop->xl->name, &lock->loc, F_SETLK, +                   &lock->flock, NULL); +    } else { +        ec_lock_unfreeze(link); +    }  }  int32_t ec_update_size_version_done(call_frame_t * frame, void * cookie, @@ -1388,111 +1446,128 @@ int32_t ec_update_size_version_done(call_frame_t * frame, void * cookie,                                      int32_t op_errno, dict_t * xattr,                                      dict_t * xdata)  { -    ec_fop_data_t * fop = cookie; +    ec_fop_data_t *fop = cookie; +    ec_lock_link_t *link; +    ec_lock_t *lock; +    ec_inode_t *ctx; -    if (op_ret < 0) -    { +    if (op_ret < 0) {          gf_log(fop->xl->name, GF_LOG_ERROR, "Failed to update version and "                                              "size (error %d)", op_errno); -    } -    else -    { +    } else {          fop->parent->mask &= fop->good; +        link = fop->data; +        lock = link->lock; +        ctx = lock->ctx; + +        if (ec_dict_del_array(xattr, EC_XATTR_VERSION, ctx->post_version, +                              EC_VERSION_SIZE) == 0) { +            ctx->pre_version[0] = ctx->post_version[0]; +            ctx->pre_version[1] = ctx->post_version[1]; + +            ctx->have_version = _gf_true; +        } +        if (ec_dict_del_number(xattr, EC_XATTR_SIZE, &ctx->post_size) == 0) { +            ctx->pre_size = ctx->post_size; + +            ctx->have_size = _gf_true; +        } +        if (ec_dict_del_array(xattr, EC_XATTR_DIRTY, ctx->post_dirty, +                              EC_VERSION_SIZE) == 0) { +            ctx->pre_dirty[0] = ctx->post_dirty[0]; +            ctx->pre_dirty[1] = ctx->post_dirty[1]; + +            ctx->have_dirty = _gf_true; +        } +        if ((ec_dict_del_config(xdata, EC_XATTR_CONFIG, &ctx->config) == 0) && +            ec_config_check(fop->parent, &ctx->config)) { +            ctx->have_config = _gf_true; +        } + +        ctx->have_info = _gf_true;      } -    if (fop->data != NULL) { -        ec_unlock_lock(fop->parent, fop->data); +    if ((fop->parent->id != GF_FOP_FLUSH) && +        (fop->parent->id != GF_FOP_FSYNC) && +        (fop->parent->id != GF_FOP_FSYNCDIR)) { +        ec_unlock_lock(fop->data);      }      return 0;  } -uint64_t -ec_get_dirty_value (ec_t *ec, uintptr_t fop_mask, uint64_t version_delta, -                     gf_boolean_t dirty) -{ -        uint64_t dirty_val = 0; - -        if (version_delta) { -                if (~fop_mask & ec->node_mask) { -                        /* fop didn't succeed on all subvols so 'dirty' xattr -                         * shouldn't be cleared */ -                        if (!dirty) -                                dirty_val = 1; -                } else { -                        /* fop succeed on all subvols so 'dirty' xattr -                         * should be cleared */ -                        if (dirty) -                                dirty_val = -1; -                } -        } -        return dirty_val; -} -  void -ec_update_size_version(ec_fop_data_t *fop, loc_t *loc, uint64_t version[2], -                       uint64_t size, gf_boolean_t dirty[2], ec_lock_t *lock) +ec_update_size_version(ec_lock_link_t *link, uint64_t *version, +                       uint64_t size, uint64_t *dirty)  { -    ec_t *ec = fop->xl->private; +    ec_fop_data_t *fop; +    ec_lock_t *lock; +    ec_inode_t *ctx;      dict_t * dict;      uid_t uid;      gid_t gid; -    uint64_t dirty_values[2] = {0}; -    int i = 0; -    if (fop->parent != NULL) -    { -        fop->parent->post_size = fop->post_size; +    fop = link->fop; -        return; -    } - -    ec_trace("UPDATE", fop, "version=%ld, size=%ld, dirty=%u", version, size, -             dirty); +    ec_trace("UPDATE", fop, "version=%ld/%ld, size=%ld, dirty=%ld/%ld", +             version[0], version[1], size, dirty[0], dirty[1]);      dict = dict_new(); -    if (dict == NULL) -    { +    if (dict == NULL) {          goto out;      } -    if (version[0] != 0 || version[1] != 0) { +    lock = link->lock; +    ctx = lock->ctx; + +    /* If we don't have version information or it has been modified, we +     * update it. */ +    if (!ctx->have_version || (version[0] != 0) || (version[1] != 0)) {          if (ec_dict_set_array(dict, EC_XATTR_VERSION,                                version, EC_VERSION_SIZE) != 0) {              goto out;          }      } +      if (size != 0) { +        /* If size has been changed, we should already know the previous size +         * of the file. */ +        GF_ASSERT(ctx->have_size); +          if (ec_dict_set_number(dict, EC_XATTR_SIZE, size) != 0) {              goto out;          }      } -    for (i = 0; i < sizeof (dirty_values)/sizeof (dirty_values[0]); i++) { -            dirty_values[i] = ec_get_dirty_value (ec, fop->mask, version[i], -                                                  dirty[i]); -    } - -    if (dirty_values[0] || dirty_values[1]) { -        if (ec_dict_set_array(dict, EC_XATTR_DIRTY, dirty_values, +    /* If we don't have dirty information or it has been modified, we update +     * it. */ +    if (!ctx->have_dirty || (dirty[0] != 0) || (dirty[1] != 0)) { +        if (ec_dict_set_array(dict, EC_XATTR_DIRTY, dirty,                                EC_VERSION_SIZE) != 0) {              goto out;          }      } +    /* If config information is not know, we request it now. */ +    if ((lock->loc.inode->ia_type == IA_IFREG) && !ctx->have_config) { +        /* A failure requesting this xattr is ignored because it's not +         * absolutely required right now. */ +        ec_dict_set_number(dict, EC_XATTR_CONFIG, 0); +    } +      uid = fop->frame->root->uid;      gid = fop->frame->root->gid;      fop->frame->root->uid = 0;      fop->frame->root->gid = 0; -    if (fop->use_fd) { -            ec_fxattrop(fop->frame, fop->xl, fop->mask, EC_MINIMUM_MIN, -                       ec_update_size_version_done, lock, fop->fd, +    if (link->lock->fd == NULL) { +            ec_xattrop(fop->frame, fop->xl, fop->mask, EC_MINIMUM_MIN, +                       ec_update_size_version_done, link, &link->lock->loc,                         GF_XATTROP_ADD_ARRAY64, dict, NULL);      } else { -            ec_xattrop(fop->frame, fop->xl, fop->mask, EC_MINIMUM_MIN, -                       ec_update_size_version_done, lock, loc, +            ec_fxattrop(fop->frame, fop->xl, fop->mask, EC_MINIMUM_MIN, +                       ec_update_size_version_done, link, link->lock->fd,                         GF_XATTROP_ADD_ARRAY64, dict, NULL);      } @@ -1504,8 +1579,7 @@ ec_update_size_version(ec_fop_data_t *fop, loc_t *loc, uint64_t version[2],      return;  out: -    if (dict != NULL) -    { +    if (dict != NULL) {          dict_unref(dict);      } @@ -1514,46 +1588,99 @@ out:      gf_log(fop->xl->name, GF_LOG_ERROR, "Unable to update version and size");  } -void ec_unlock_now(ec_fop_data_t *fop, ec_lock_t *lock) +gf_boolean_t +ec_update_info(ec_lock_link_t *link)  { -    ec_trace("UNLOCK_NOW", fop, "lock=%p", lock); +    ec_lock_t *lock; +    ec_inode_t *ctx; +    uint64_t version[2]; +    uint64_t dirty[2]; +    uint64_t size; -    if ((lock->version_delta[0] != 0) || (lock->version_delta[1] != 0) || -         lock->is_dirty[0] || lock->is_dirty[1]) { -        ec_update_size_version(fop, &lock->loc, lock->version_delta, -                               lock->size_delta, lock->is_dirty, lock); -    } else { -        ec_unlock_lock(fop, lock); +    lock = link->lock; +    ctx = lock->ctx; + +    /* pre_version[*] will be 0 if have_version is false */ +    version[0] = ctx->post_version[0] - ctx->pre_version[0]; +    version[1] = ctx->post_version[1] - ctx->pre_version[1]; + +    size = ctx->post_size - ctx->pre_size; + +    /* pre_dirty[*] will be 0 if have_dirty is false */ +    dirty[0] = ctx->post_dirty[0] - ctx->pre_dirty[0]; +    dirty[1] = ctx->post_dirty[1] - ctx->pre_dirty[1]; + +    if ((version[0] != 0) || (version[1] != 0) || +        (dirty[0] != 0) || (dirty[1] != 0)) { +        ec_update_size_version(link, version, size, dirty); + +        return _gf_true;      } -    ec_resume(fop, 0); +    return _gf_false;  } -void ec_unlock_timer_cbk(void *data) +void +ec_unlock_now(ec_lock_link_t *link)  { -    ec_lock_link_t *link = data; -    ec_lock_t *lock = link->lock; -    ec_fop_data_t *fop = NULL; +    ec_trace("UNLOCK_NOW", link->fop, "lock=%p", link->lock); -    LOCK(&lock->loc.inode->lock); +    if (!ec_update_info(link)) { +        ec_unlock_lock(link); +    } -    if (lock->timer != NULL) { -        fop = link->fop; +    ec_resume(link->fop, 0); +} -        ec_trace("UNLOCK_DELAYED", fop, "lock=%p", lock); +void +ec_unlock_timer_del(ec_lock_link_t *link) +{ +        int32_t before = 0; +        ec_lock_t *lock; +        inode_t *inode; +        gf_boolean_t now = _gf_false; + +        lock = link->lock; + +        /* A race condition can happen if timer expires, calls this function +         * and the lock is released (lock->loc is wiped) but the fop is not +         * fully completed yet (it's still on the list of pending fops). In +         * this case, this function can also be called if ec_unlock_force() is +         * called. */ +        inode = lock->loc.inode; +        if (inode == NULL) { +                return; +        } -        GF_ASSERT(lock->refs == 1); +        LOCK(&inode->lock); -        gf_timer_call_cancel(fop->xl->ctx, lock->timer); -        lock->timer = NULL; -        *lock->plock = NULL; -    } +        if (lock->timer != NULL) { +                ec_trace("UNLOCK_DELAYED", link->fop, "lock=%p", lock); -    UNLOCK(&lock->loc.inode->lock); +                gf_timer_call_cancel(link->fop->xl->ctx, lock->timer); +                lock->timer = NULL; -    if (fop != NULL) { -        ec_unlock_now(fop, lock); -    } +                lock->release = now = _gf_true; + +                before = lock->refs + lock->refs_frozen; +                list_splice_init(&lock->waiting, &lock->frozen); +                lock->refs_frozen += lock->refs - lock->inserted - 1; +                lock->refs = 1 + lock->inserted; +                /* We moved around the locks, so total number of locks shouldn't +                 * change by this operation*/ +                GF_ASSERT (before == (lock->refs + lock->refs_frozen)); +        } + +        UNLOCK(&inode->lock); + +        if (now) { +                ec_unlock_now(link); +        } +} + +void ec_unlock_timer_cbk(void *data) +{ +        ec_unlock_timer_del(data);  }  void ec_unlock_timer_add(ec_lock_link_t *link) @@ -1561,28 +1688,28 @@ void ec_unlock_timer_add(ec_lock_link_t *link)      struct timespec delay;      ec_fop_data_t *fop = link->fop;      ec_lock_t *lock = link->lock; -    int32_t refs = 1; +    gf_boolean_t now = _gf_false;      LOCK(&lock->loc.inode->lock);      GF_ASSERT(lock->timer == NULL); -    if (lock->refs != 1) { +    if ((lock->refs - lock->inserted) > 1) {          ec_trace("UNLOCK_SKIP", fop, "lock=%p", lock);          lock->refs--;          UNLOCK(&lock->loc.inode->lock);      } else if (lock->acquired) { -        delay.tv_sec = 1; -        delay.tv_nsec = 0; +        ec_t *ec = fop->xl->private;          ec_sleep(fop); -        /* If healing is needed, do not delay lock release to let self-heal -         * start working as soon as possible. */ -        if (!ec_fop_needs_heal(fop)) { -            ec_trace("UNLOCK_DELAY", fop, "lock=%p", lock); +        /* If healing is needed, the lock needs to be released due to +         * contention, or ec is shutting down, do not delay lock release. */ +        if (!lock->release && !ec_fop_needs_heal(fop) && !ec->shutdown) { +            ec_trace("UNLOCK_DELAY", fop, "lock=%p, release=%d", lock, +                     lock->release);              delay.tv_sec = 1;              delay.tv_nsec = 0; @@ -1592,26 +1719,25 @@ void ec_unlock_timer_add(ec_lock_link_t *link)                  gf_log(fop->xl->name, GF_LOG_WARNING, "Unable to delay an "                                                        "unlock"); -                *lock->plock = NULL; -                refs = 0; +                lock->release = now = _gf_true;              }          } else { -            ec_trace("UNLOCK_FORCE", fop, "lock=%p", lock); -            *lock->plock = NULL; -            refs = 0; +            ec_trace("UNLOCK_FORCE", fop, "lock=%p, release=%d", lock, +                     lock->release); +            lock->release = now = _gf_true;          }          UNLOCK(&lock->loc.inode->lock); -        if (refs == 0) { -            ec_unlock_now(fop, lock); +        if (now) { +            ec_unlock_now(link);          }      } else { -        *lock->plock = NULL; +        lock->release = _gf_true;          UNLOCK(&lock->loc.inode->lock); -        ec_lock_destroy(lock); +        ec_lock_unfreeze(link);      }  } @@ -1624,52 +1750,60 @@ void ec_unlock(ec_fop_data_t *fop)      }  } -void ec_flush_size_version(ec_fop_data_t * fop) +void +ec_unlock_force(ec_fop_data_t *fop)  { -    ec_lock_t * lock; -    uint64_t version[2], delta; -    gf_boolean_t dirty[2] = {_gf_false, _gf_false}; +        int32_t i; -    GF_ASSERT(fop->lock_count == 1); +        for (i = 0; i < fop->lock_count; i++) { +                ec_trace("UNLOCK_FORCED", fop, "lock=%p", &fop->locks[i]); -    lock = fop->locks[0].lock; - -    LOCK(&lock->loc.inode->lock); - -    GF_ASSERT(lock->owner == fop); - -    version[0] = lock->version_delta[0]; -    version[1] = lock->version_delta[1]; -    dirty[0] = lock->is_dirty[0]; -    dirty[1] = lock->is_dirty[1]; -    delta = lock->size_delta; -    lock->version_delta[0] = 0; -    lock->version_delta[1] = 0; -    lock->size_delta = 0; -    lock->is_dirty[0] = _gf_false; -    lock->is_dirty[1] = _gf_false; +                ec_unlock_timer_del(&fop->locks[i]); +        } +} -    UNLOCK(&lock->loc.inode->lock); +void ec_flush_size_version(ec_fop_data_t *fop) +{ +    GF_ASSERT(fop->lock_count == 1); -    if (version[0] > 0 || version[1] > 0 || dirty[0] || dirty[1]) -    { -        ec_update_size_version(fop, &lock->loc, version, delta, dirty, -                               NULL); -    } +    ec_update_info(&fop->locks[0]);  }  void ec_lock_reuse(ec_fop_data_t *fop)  { -    ec_fop_data_t * wait_fop; -    ec_lock_t * lock; -    ec_lock_link_t * link; -    int32_t i; +    ec_t *ec; +    ec_cbk_data_t *cbk; +    ec_lock_t *lock; +    ec_lock_link_t *link; +    ec_inode_t *ctx; +    int32_t i, count; +    gf_boolean_t release = _gf_false; + +    cbk = fop->answer; +    if (cbk != NULL) { +        if (cbk->xdata != NULL) { +            if ((dict_get_int32(cbk->xdata, GLUSTERFS_INODELK_COUNT, +                                &count) == 0) && (count > 1)) { +                release = _gf_true; +            } +            if (release) { +                gf_log(fop->xl->name, GF_LOG_DEBUG, +                       "Lock contention detected"); +            } +        } +    } else { +        /* If we haven't get an answer with enough quorum, we always release +         * the lock. */ +        release = _gf_true; +    } + +    ec = fop->xl->private;      for (i = 0; i < fop->lock_count; i++)      { -        wait_fop = NULL; - -        lock = fop->locks[i].lock; +        link = &fop->locks[i]; +        lock = link->lock; +        ctx = lock->ctx;          LOCK(&lock->loc.inode->lock); @@ -1677,47 +1811,42 @@ void ec_lock_reuse(ec_fop_data_t *fop)          GF_ASSERT(lock->owner == fop);          lock->owner = NULL; +        lock->release |= release; -        if (((fop->locks_update >> i) & 1) != 0) { -            if (fop->error == 0) -            { -		if (ec_is_metadata_fop (fop->id)) { -                    lock->version_delta[1]++; -		} else { -                    lock->version_delta[0]++; -		} -                lock->size_delta += fop->post_size - fop->pre_size; -                if (fop->have_size) { -                    lock->size = fop->post_size; -                    lock->have_size = 1; +        if ((fop->error == 0) && (cbk != NULL) && (cbk->op_ret >= 0)) { +            if (link->update[0]) { +                ctx->post_version[0]++; +                if (ec->node_mask & ~fop->mask) { +                    ctx->post_dirty[0]++; +                } +            } +            if (link->update[1]) { +                ctx->post_version[1]++; +                if (ec->node_mask & ~fop->mask) { +                    ctx->post_dirty[1]++;                  }              }          }          lock->good_mask &= fop->mask; +        link = NULL;          if (!list_empty(&lock->waiting))          {              link = list_entry(lock->waiting.next, ec_lock_link_t, wait_list);              list_del_init(&link->wait_list); -            wait_fop = link->fop; - -            if (lock->kind == EC_LOCK_INODE) -            { -                wait_fop->pre_size = wait_fop->post_size = fop->post_size; -                wait_fop->have_size = fop->have_size; -            } -            wait_fop->mask &= fop->mask; +            lock->owner = link->fop;          }          UNLOCK(&lock->loc.inode->lock); -        if (wait_fop != NULL) +        if (link != NULL)          { -            ec_lock(wait_fop); - -            ec_resume(wait_fop, 0); +            if (ec_lock_acquire(link)) { +                ec_lock(link->fop); +            } +            ec_resume(link->fop, 0);          }      }  } diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h index 9e0aaa0f079..c0db0218699 100644 --- a/xlators/cluster/ec/src/ec-common.h +++ b/xlators/cluster/ec/src/ec-common.h @@ -20,6 +20,9 @@ typedef enum {          EC_METADATA_TXN  } ec_txn_t; +#define EC_FOP_HEAL     -1 +#define EC_FOP_FHEAL    -2 +  #define EC_CONFIG_VERSION 0  #define EC_CONFIG_ALGORITHM 0 @@ -35,19 +38,20 @@ typedef enum {  #define EC_MINIMUM_MIN   -2  #define EC_MINIMUM_ALL   -3 -#define EC_LOCK_ENTRY   0 -#define EC_LOCK_INODE   1 +#define EC_UPDATE_DATA   1 +#define EC_UPDATE_META   2 +#define EC_QUERY_INFO    4 +#define EC_INODE_SIZE    8  #define EC_STATE_START                        0  #define EC_STATE_END                          0  #define EC_STATE_INIT                         1  #define EC_STATE_LOCK                         2 -#define EC_STATE_GET_SIZE_AND_VERSION         3 -#define EC_STATE_DISPATCH                     4 -#define EC_STATE_PREPARE_ANSWER               5 -#define EC_STATE_REPORT                       6 -#define EC_STATE_LOCK_REUSE                   7 -#define EC_STATE_UNLOCK                       8 +#define EC_STATE_DISPATCH                     3 +#define EC_STATE_PREPARE_ANSWER               4 +#define EC_STATE_REPORT                       5 +#define EC_STATE_LOCK_REUSE                   6 +#define EC_STATE_UNLOCK                       7  #define EC_STATE_DELAYED_START              100 @@ -81,15 +85,21 @@ void ec_update_bad(ec_fop_data_t * fop, uintptr_t good);  void ec_fop_set_error(ec_fop_data_t * fop, int32_t error); -void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, int32_t update); -void ec_lock_prepare_entry(ec_fop_data_t *fop, loc_t *loc, int32_t update); -void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, int32_t update); +void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, uint32_t flags); +void ec_lock_prepare_parent_inode(ec_fop_data_t *fop, loc_t *loc, +                                  uint32_t flags); +void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags);  void ec_lock(ec_fop_data_t * fop);  void ec_lock_reuse(ec_fop_data_t *fop);  void ec_unlock(ec_fop_data_t * fop); +void ec_unlock_force(ec_fop_data_t *fop); + +gf_boolean_t ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode, +                               uint64_t *size); +gf_boolean_t ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode, +                               uint64_t size); +void ec_clear_inode_info(ec_fop_data_t *fop, inode_t *inode); -void ec_get_size_version(ec_fop_data_t * fop); -void ec_prepare_update(ec_fop_data_t *fop);  void ec_flush_size_version(ec_fop_data_t * fop);  void ec_dispatch_all(ec_fop_data_t * fop); diff --git a/xlators/cluster/ec/src/ec-data.c b/xlators/cluster/ec/src/ec-data.c index fb47aea90a8..047ccd5ff31 100644 --- a/xlators/cluster/ec/src/ec-data.c +++ b/xlators/cluster/ec/src/ec-data.c @@ -96,6 +96,19 @@ void ec_cbk_data_destroy(ec_cbk_data_t * cbk)      mem_put(cbk);  } +/* PARENT_DOWN will be notified to children only after these fops are complete + * when graph switch happens.  We do not want graph switch to be waiting on + * heal to complete as healing big file/directory could take a while. Which + * will lead to hang on the mount. + */ +static inline gf_boolean_t +ec_needs_graceful_completion (ec_fop_data_t *fop) +{ +        if ((fop->id != EC_FOP_HEAL) && (fop->id != EC_FOP_FHEAL)) +                return _gf_true; +        return _gf_false; +} +  ec_fop_data_t * ec_fop_data_allocate(call_frame_t * frame, xlator_t * this,                                       int32_t id, uint32_t flags,                                       uintptr_t target, int32_t minimum, @@ -114,6 +127,12 @@ ec_fop_data_t * ec_fop_data_allocate(call_frame_t * frame, xlator_t * this,          return NULL;      } +    INIT_LIST_HEAD(&fop->cbk_list); +    INIT_LIST_HEAD(&fop->answer_list); +    INIT_LIST_HEAD(&fop->pending_list); +    INIT_LIST_HEAD(&fop->locks[0].wait_list); +    INIT_LIST_HEAD(&fop->locks[1].wait_list); +      fop->xl = this;      fop->req_frame = frame; @@ -148,9 +167,6 @@ ec_fop_data_t * ec_fop_data_allocate(call_frame_t * frame, xlator_t * this,      fop->minimum = minimum;      fop->mask = target; -    INIT_LIST_HEAD(&fop->cbk_list); -    INIT_LIST_HEAD(&fop->answer_list); -      fop->wind = wind;      fop->handler = handler;      fop->cbks = cbks; @@ -165,17 +181,20 @@ ec_fop_data_t * ec_fop_data_allocate(call_frame_t * frame, xlator_t * this,          parent = frame->local;          if (parent != NULL)          { -            LOCK(&parent->lock); - -            parent->jobs++; -            parent->refs++; - -            UNLOCK(&parent->lock); +            ec_sleep(parent);          }          fop->parent = parent;      } +    if (ec_needs_graceful_completion (fop)) { +            LOCK(&ec->lock); + +            list_add_tail(&fop->pending_list, &ec->pending_fops); + +            UNLOCK(&ec->lock); +    } +      return fop;  } @@ -190,10 +209,41 @@ void ec_fop_data_acquire(ec_fop_data_t * fop)      UNLOCK(&fop->lock);  } +static void +ec_handle_last_pending_fop_completion (ec_fop_data_t *fop, gf_boolean_t *notify) +{ +        ec_t *ec = fop->xl->private; + +        if (!list_empty (&fop->pending_list)) { +                LOCK(&ec->lock); +                { +                        list_del_init (&fop->pending_list); +                        *notify = list_empty (&ec->pending_fops); +                } +                UNLOCK(&ec->lock); +        } +} + +void +ec_fop_cleanup(ec_fop_data_t *fop) +{ +        ec_cbk_data_t *cbk, *tmp; + +        list_for_each_entry_safe(cbk, tmp, &fop->answer_list, answer_list) { +            list_del_init(&cbk->answer_list); + +            ec_cbk_data_destroy(cbk); +        } +        INIT_LIST_HEAD(&fop->cbk_list); + +        fop->answer = NULL; +} +  void ec_fop_data_release(ec_fop_data_t * fop)  { -    ec_cbk_data_t * cbk, * tmp; +    ec_t *ec = NULL;      int32_t refs; +    gf_boolean_t notify = _gf_false;      LOCK(&fop->lock); @@ -238,13 +288,13 @@ void ec_fop_data_release(ec_fop_data_t * fop)          ec_resume_parent(fop, fop->error); -        list_for_each_entry_safe(cbk, tmp, &fop->answer_list, answer_list) -        { -            list_del_init(&cbk->answer_list); - -            ec_cbk_data_destroy(cbk); -        } +        ec_fop_cleanup(fop); +        ec = fop->xl->private; +        ec_handle_last_pending_fop_completion (fop, ¬ify);          mem_put(fop); +        if (notify) { +            ec_pending_fops_completed(ec); +        }      }  } diff --git a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h index 9e5c92dd5b8..8204cf087de 100644 --- a/xlators/cluster/ec/src/ec-data.h +++ b/xlators/cluster/ec/src/ec-data.h @@ -67,10 +67,20 @@ struct _ec_fd  struct _ec_inode  {      uintptr_t         bad; -    ec_lock_t        *entry_lock;      ec_lock_t        *inode_lock; +    gf_boolean_t      have_info; +    gf_boolean_t      have_config; +    gf_boolean_t      have_version; +    gf_boolean_t      have_size; +    gf_boolean_t      have_dirty; +    ec_config_t       config; +    uint64_t          pre_version[2]; +    uint64_t          post_version[2]; +    uint64_t          pre_size; +    uint64_t          post_size; +    uint64_t          pre_dirty[2]; +    uint64_t          post_dirty[2];      struct list_head  heal; -  };  typedef int32_t (* fop_heal_cbk_t)(call_frame_t *, void * cookie, xlator_t *, @@ -80,7 +90,6 @@ typedef int32_t (* fop_fheal_cbk_t)(call_frame_t *, void * cookie, xlator_t *,                                      int32_t, int32_t, uintptr_t, uintptr_t,                                      uintptr_t, dict_t *); -  union _ec_cbk  {      fop_access_cbk_t       access; @@ -132,21 +141,21 @@ union _ec_cbk  struct _ec_lock  { -    ec_lock_t        **plock; +    ec_inode_t        *ctx;      gf_timer_t        *timer; -    struct list_head   waiting; +    struct list_head   waiting; /* Queue of requests being serviced. */ +    struct list_head   frozen;  /* Queue of requests that will be serviced in +                                   the next unlock/lock cycle. */      uintptr_t          mask;      uintptr_t          good_mask; -    int32_t            kind;      int32_t            refs; -    int32_t            acquired; -    int32_t            have_size; -    uint64_t           size; -    uint64_t           size_delta; -    uint64_t           version[2]; -    uint64_t           version_delta[2]; -    gf_boolean_t       is_dirty[2]; +    int32_t            refs_frozen; +    int32_t            inserted; +    gf_boolean_t       acquired; +    gf_boolean_t       release; +    gf_boolean_t       query;      ec_fop_data_t     *owner; +    fd_t              *fd;      loc_t              loc;      union      { @@ -157,9 +166,12 @@ struct _ec_lock  struct _ec_lock_link  { -    ec_lock_t *      lock; -    ec_fop_data_t *  fop; -    struct list_head wait_list; +    ec_lock_t        *lock; +    ec_fop_data_t    *fop; +    struct list_head  wait_list; +    gf_boolean_t      update[2]; +    loc_t            *base; +    uint64_t          size;  };  struct _ec_fop_data @@ -172,22 +184,19 @@ struct _ec_fop_data      int32_t            winds;      int32_t            jobs;      int32_t            error; -    ec_fop_data_t *    parent; -    xlator_t *         xl; -    call_frame_t *     req_frame;   // frame of the calling xlator -    call_frame_t *     frame;       // frame used by this fop -    struct list_head   cbk_list;    // sorted list of groups of answers -    struct list_head   answer_list; // list of answers -    ec_cbk_data_t *    answer;      // accepted answer +    ec_fop_data_t     *parent; +    xlator_t          *xl; +    call_frame_t      *req_frame;    /* frame of the calling xlator */ +    call_frame_t      *frame;        /* frame used by this fop */ +    struct list_head   cbk_list;     /* sorted list of groups of answers */ +    struct list_head   answer_list;  /* list of answers */ +    struct list_head   pending_list; /* member of ec_t.pending_fops */ +    ec_cbk_data_t     *answer;       /* accepted answer */      int32_t            lock_count;      int32_t            locked;      ec_lock_link_t     locks[2]; -    int32_t            locks_update; -    int32_t            have_size; -    uint64_t           pre_size; -    uint64_t           post_size; +    int32_t            first_lock;      gf_lock_t          lock; -    ec_config_t        config;      uint32_t           flags;      uint32_t           first; @@ -196,6 +205,7 @@ struct _ec_fop_data                                    if fop->minimum number of subvolumes succeed                                    which are not healing*/      uintptr_t          remaining; +    uintptr_t          received; /* Mask of responses */      uintptr_t          good;      uintptr_t          bad; @@ -203,7 +213,7 @@ struct _ec_fop_data      ec_handler_f       handler;      ec_resume_f        resume;      ec_cbk_t           cbks; -    void *             data; +    void              *data;      ec_heal_t         *heal;      uint64_t           user_size; @@ -211,8 +221,8 @@ struct _ec_fop_data      int32_t            use_fd; -    dict_t *           xdata; -    dict_t *           dict; +    dict_t            *xdata; +    dict_t            *dict;      int32_t            int32;      uint32_t           uint32;      uint64_t           size; @@ -222,14 +232,14 @@ struct _ec_fop_data      entrylk_type       entrylk_type;      gf_xattrop_flags_t xattrop_flags;      dev_t              dev; -    inode_t *          inode; -    fd_t *             fd; +    inode_t           *inode; +    fd_t              *fd;      struct iatt        iatt; -    char *             str[2]; +    char              *str[2];      loc_t              loc[2];      struct gf_flock    flock; -    struct iovec *     vector; -    struct iobref *    buffers; +    struct iovec      *vector; +    struct iobref     *buffers;  };  struct _ec_cbk_data @@ -299,4 +309,6 @@ ec_fop_data_t * ec_fop_data_allocate(call_frame_t * frame, xlator_t * this,  void ec_fop_data_acquire(ec_fop_data_t * fop);  void ec_fop_data_release(ec_fop_data_t * fop); +void ec_fop_cleanup(ec_fop_data_t *fop); +  #endif /* __EC_DATA_H__ */ diff --git a/xlators/cluster/ec/src/ec-dir-read.c b/xlators/cluster/ec/src/ec-dir-read.c index ffc3ed5a7cd..354c63d3683 100644 --- a/xlators/cluster/ec/src/ec-dir-read.c +++ b/xlators/cluster/ec/src/ec-dir-read.c @@ -128,14 +128,9 @@ int32_t ec_manager_opendir(ec_fop_data_t * fop, int32_t state)              /* Fall through */          case EC_STATE_LOCK: -            ec_lock_prepare_entry(fop, &fop->loc[0], 0); +            ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO);              ec_lock(fop); -            return EC_STATE_GET_SIZE_AND_VERSION; - -        case EC_STATE_GET_SIZE_AND_VERSION: -            ec_get_size_version(fop); -              return EC_STATE_DISPATCH;          case EC_STATE_DISPATCH: @@ -195,7 +190,6 @@ int32_t ec_manager_opendir(ec_fop_data_t * fop, int32_t state)          case -EC_STATE_INIT:          case -EC_STATE_LOCK: -        case -EC_STATE_GET_SIZE_AND_VERSION:          case -EC_STATE_DISPATCH:          case -EC_STATE_PREPARE_ANSWER:          case -EC_STATE_REPORT: diff --git a/xlators/cluster/ec/src/ec-dir-write.c b/xlators/cluster/ec/src/ec-dir-write.c index ffc96bf4351..ce09138fb7a 100644 --- a/xlators/cluster/ec/src/ec-dir-write.c +++ b/xlators/cluster/ec/src/ec-dir-write.c @@ -98,11 +98,10 @@ void ec_wind_create(ec_t * ec, ec_fop_data_t * fop, int32_t idx)  int32_t ec_manager_create(ec_fop_data_t * fop, int32_t state)  { - - -    ec_t * ec; -    ec_cbk_data_t * cbk; -    ec_fd_t * ctx; +    ec_config_t config; +    ec_t *ec; +    ec_cbk_data_t *cbk; +    ec_fd_t *ctx;      uint64_t version[2] = {0, 0};      switch (state) @@ -137,16 +136,15 @@ int32_t ec_manager_create(ec_fop_data_t * fop, int32_t state)              ec = fop->xl->private; -            fop->config.version = EC_CONFIG_VERSION; -            fop->config.algorithm = EC_CONFIG_ALGORITHM; -            fop->config.gf_word_size = EC_GF_BITS; -            fop->config.bricks = ec->nodes; -            fop->config.redundancy = ec->redundancy; -            fop->config.chunk_size = EC_METHOD_CHUNK_SIZE; +            config.version = EC_CONFIG_VERSION; +            config.algorithm = EC_CONFIG_ALGORITHM; +            config.gf_word_size = EC_GF_BITS; +            config.bricks = ec->nodes; +            config.redundancy = ec->redundancy; +            config.chunk_size = EC_METHOD_CHUNK_SIZE;              if (ec_dict_set_config(fop->xdata, EC_XATTR_CONFIG, -                                   &fop->config) < 0) -            { +                                   &config) < 0) {                  fop->error = EIO;                  return EC_STATE_REPORT; @@ -172,7 +170,8 @@ int32_t ec_manager_create(ec_fop_data_t * fop, int32_t state)          /* Fall through */          case EC_STATE_LOCK: -            ec_lock_prepare_entry(fop, &fop->loc[0], 1); +            ec_lock_prepare_parent_inode(fop, &fop->loc[0], +                                         EC_UPDATE_DATA | EC_UPDATE_META);              ec_lock(fop);              return EC_STATE_DISPATCH; @@ -376,17 +375,11 @@ int32_t ec_manager_link(ec_fop_data_t * fop, int32_t state)      {          case EC_STATE_INIT:          case EC_STATE_LOCK: -            // Parent entry of fop->loc[0] should be locked, but I don't -            // receive enough information to do it (fop->loc[0].parent is -            // NULL). -            ec_lock_prepare_entry(fop, &fop->loc[1], 1); +            ec_lock_prepare_parent_inode(fop, &fop->loc[1], EC_UPDATE_DATA | +                                                            EC_UPDATE_META | +                                                            EC_INODE_SIZE);              ec_lock(fop); -            return EC_STATE_GET_SIZE_AND_VERSION; - -        case EC_STATE_GET_SIZE_AND_VERSION: -            ec_get_size_version(fop); -              return EC_STATE_DISPATCH;          case EC_STATE_DISPATCH: @@ -410,7 +403,7 @@ int32_t ec_manager_link(ec_fop_data_t * fop, int32_t state)                      ec_iatt_rebuild(fop->xl->private, cbk->iatt, 3,                                      cbk->count);                      if (cbk->iatt[0].ia_type == IA_IFREG) { -                        cbk->iatt[0].ia_size = fop->pre_size; +                        cbk->iatt[0].ia_size = fop->locks[0].size;                      }                      if (ec_loc_update(fop->xl, &fop->loc[0], cbk->inode, @@ -446,7 +439,6 @@ int32_t ec_manager_link(ec_fop_data_t * fop, int32_t state)          case -EC_STATE_INIT:          case -EC_STATE_LOCK: -        case -EC_STATE_GET_SIZE_AND_VERSION:          case -EC_STATE_DISPATCH:          case -EC_STATE_PREPARE_ANSWER:          case -EC_STATE_REPORT: @@ -589,7 +581,8 @@ int32_t ec_manager_mkdir(ec_fop_data_t * fop, int32_t state)          /* Fall through */          case EC_STATE_LOCK: -            ec_lock_prepare_entry(fop, &fop->loc[0], 1); +            ec_lock_prepare_parent_inode(fop, &fop->loc[0], +                                         EC_UPDATE_DATA | EC_UPDATE_META);              ec_lock(fop);              return EC_STATE_DISPATCH; @@ -764,6 +757,7 @@ void ec_wind_mknod(ec_t * ec, ec_fop_data_t * fop, int32_t idx)  int32_t ec_manager_mknod(ec_fop_data_t * fop, int32_t state)  { +    ec_config_t config;      ec_t *ec;      ec_cbk_data_t * cbk;      uint64_t version[2] = {0, 0}; @@ -783,15 +777,15 @@ int32_t ec_manager_mknod(ec_fop_data_t * fop, int32_t state)                  ec = fop->xl->private; -                fop->config.version = EC_CONFIG_VERSION; -                fop->config.algorithm = EC_CONFIG_ALGORITHM; -                fop->config.gf_word_size = EC_GF_BITS; -                fop->config.bricks = ec->nodes; -                fop->config.redundancy = ec->redundancy; -                fop->config.chunk_size = EC_METHOD_CHUNK_SIZE; +                config.version = EC_CONFIG_VERSION; +                config.algorithm = EC_CONFIG_ALGORITHM; +                config.gf_word_size = EC_GF_BITS; +                config.bricks = ec->nodes; +                config.redundancy = ec->redundancy; +                config.chunk_size = EC_METHOD_CHUNK_SIZE;                  if (ec_dict_set_config(fop->xdata, EC_XATTR_CONFIG, -                                       &fop->config) < 0) { +                                       &config) < 0) {                      fop->error = EIO;                      return EC_STATE_REPORT; @@ -814,7 +808,8 @@ int32_t ec_manager_mknod(ec_fop_data_t * fop, int32_t state)          /* Fall through */          case EC_STATE_LOCK: -            ec_lock_prepare_entry(fop, &fop->loc[0], 1); +            ec_lock_prepare_parent_inode(fop, &fop->loc[0], +                                         EC_UPDATE_DATA | EC_UPDATE_META);              ec_lock(fop);              return EC_STATE_DISPATCH; @@ -997,15 +992,13 @@ int32_t ec_manager_rename(ec_fop_data_t * fop, int32_t state)      {          case EC_STATE_INIT:          case EC_STATE_LOCK: -            ec_lock_prepare_entry(fop, &fop->loc[0], 1); -            ec_lock_prepare_entry(fop, &fop->loc[1], 1); +            ec_lock_prepare_parent_inode(fop, &fop->loc[0], EC_UPDATE_DATA | +                                                            EC_UPDATE_META | +                                                            EC_INODE_SIZE); +            ec_lock_prepare_parent_inode(fop, &fop->loc[1], +                                         EC_UPDATE_DATA | EC_UPDATE_META);              ec_lock(fop); -            return EC_STATE_GET_SIZE_AND_VERSION; - -        case EC_STATE_GET_SIZE_AND_VERSION: -            ec_get_size_version(fop); -              return EC_STATE_DISPATCH;          case EC_STATE_DISPATCH: @@ -1034,9 +1027,8 @@ int32_t ec_manager_rename(ec_fop_data_t * fop, int32_t state)                      ec_iatt_rebuild(fop->xl->private, cbk->iatt, 5,                                      cbk->count); -                    if (cbk->iatt[0].ia_type == IA_IFREG) -                    { -                        cbk->iatt[0].ia_size = fop->pre_size; +                    if (cbk->iatt[0].ia_type == IA_IFREG) { +                        cbk->iatt[0].ia_size = fop->locks[0].size;                      }                  }              } @@ -1064,7 +1056,6 @@ int32_t ec_manager_rename(ec_fop_data_t * fop, int32_t state)          case -EC_STATE_INIT:          case -EC_STATE_LOCK: -        case -EC_STATE_GET_SIZE_AND_VERSION:          case -EC_STATE_DISPATCH:          case -EC_STATE_PREPARE_ANSWER:          case -EC_STATE_REPORT: @@ -1191,7 +1182,8 @@ int32_t ec_manager_rmdir(ec_fop_data_t * fop, int32_t state)      {          case EC_STATE_INIT:          case EC_STATE_LOCK: -            ec_lock_prepare_entry(fop, &fop->loc[0], 1); +            ec_lock_prepare_parent_inode(fop, &fop->loc[0], +                                         EC_UPDATE_DATA | EC_UPDATE_META);              ec_lock(fop);              return EC_STATE_DISPATCH; @@ -1361,7 +1353,8 @@ int32_t ec_manager_symlink(ec_fop_data_t * fop, int32_t state)      {          case EC_STATE_INIT:          case EC_STATE_LOCK: -            ec_lock_prepare_entry(fop, &fop->loc[0], 1); +            ec_lock_prepare_parent_inode(fop, &fop->loc[0], +                                         EC_UPDATE_DATA | EC_UPDATE_META);              ec_lock(fop);              return EC_STATE_DISPATCH; @@ -1552,14 +1545,10 @@ int32_t ec_manager_unlink(ec_fop_data_t * fop, int32_t state)      {          case EC_STATE_INIT:          case EC_STATE_LOCK: -            ec_lock_prepare_entry(fop, &fop->loc[0], 1); +            ec_lock_prepare_parent_inode(fop, &fop->loc[0], +                                         EC_UPDATE_DATA | EC_UPDATE_META);              ec_lock(fop); -            return EC_STATE_GET_SIZE_AND_VERSION; - -        case EC_STATE_GET_SIZE_AND_VERSION: -            ec_get_size_version(fop); -              return EC_STATE_DISPATCH;          case EC_STATE_DISPATCH: @@ -1607,7 +1596,6 @@ int32_t ec_manager_unlink(ec_fop_data_t * fop, int32_t state)          case -EC_STATE_INIT:          case -EC_STATE_LOCK: -        case -EC_STATE_GET_SIZE_AND_VERSION:          case -EC_STATE_DISPATCH:          case -EC_STATE_PREPARE_ANSWER:          case -EC_STATE_REPORT: diff --git a/xlators/cluster/ec/src/ec-fops.h b/xlators/cluster/ec/src/ec-fops.h index d6b9770f720..7661077cca3 100644 --- a/xlators/cluster/ec/src/ec-fops.h +++ b/xlators/cluster/ec/src/ec-fops.h @@ -16,9 +16,6 @@  #include "ec-data.h"  #include "ec-common.h" -#define EC_FOP_HEAL     -1 -#define EC_FOP_FHEAL    -2 -  void ec_access(call_frame_t * frame, xlator_t * this, uintptr_t target,                 int32_t minimum, fop_access_cbk_t func, void *data, loc_t * loc,                 int32_t mask, dict_t * xdata); diff --git a/xlators/cluster/ec/src/ec-generic.c b/xlators/cluster/ec/src/ec-generic.c index d957bf6533d..f50c7a70560 100644 --- a/xlators/cluster/ec/src/ec-generic.c +++ b/xlators/cluster/ec/src/ec-generic.c @@ -320,15 +320,10 @@ int32_t ec_manager_fsync(ec_fop_data_t * fop, int32_t state)      {          case EC_STATE_INIT:          case EC_STATE_LOCK: -            ec_lock_prepare_fd(fop, fop->fd, 0); +            ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO);              ec_lock(fop); -            return EC_STATE_GET_SIZE_AND_VERSION; - -        case EC_STATE_GET_SIZE_AND_VERSION: -	    ec_get_size_version(fop); - -	    return EC_STATE_DISPATCH; +            return EC_STATE_DISPATCH;          case EC_STATE_DISPATCH:              ec_flush_size_version(fop); @@ -361,8 +356,10 @@ int32_t ec_manager_fsync(ec_fop_data_t * fop, int32_t state)                      ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2,                                      cbk->count); -                    cbk->iatt[0].ia_size = fop->pre_size; -                    cbk->iatt[1].ia_size = fop->post_size; +                    /* This shouldn't fail because we have the inode locked. */ +                    GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode, +                                                &cbk->iatt[0].ia_size)); +                    cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;                  }              }              else @@ -388,7 +385,6 @@ int32_t ec_manager_fsync(ec_fop_data_t * fop, int32_t state)          case -EC_STATE_INIT:          case -EC_STATE_LOCK: -        case -EC_STATE_GET_SIZE_AND_VERSION:          case -EC_STATE_DISPATCH:          case -EC_STATE_PREPARE_ANSWER:          case -EC_STATE_REPORT: @@ -705,7 +701,6 @@ void ec_lookup_rebuild(ec_t * ec, ec_fop_data_t * fop, ec_cbk_data_t * cbk)  {      ec_cbk_data_t * ans = NULL;      ec_inode_t * ctx = NULL; -    ec_lock_t * lock = NULL;      data_t * data = NULL;      uint8_t * buff = NULL;      uint64_t size = 0; @@ -729,14 +724,14 @@ void ec_lookup_rebuild(ec_t * ec, ec_fop_data_t * fop, ec_cbk_data_t * cbk)      LOCK(&cbk->inode->lock);      ctx = __ec_inode_get(cbk->inode, fop->xl); -    if ((ctx != NULL) && (ctx->inode_lock != NULL)) +    if (ctx != NULL)      { -        lock = ctx->inode_lock; -        cbk->version[0] = lock->version[0]; -        cbk->version[1] = lock->version[1]; -        if (lock->have_size) -        { -            size = lock->size; +        if (ctx->have_version) { +            cbk->version[0] = ctx->post_version[0]; +            cbk->version[1] = ctx->post_version[1]; +        } +        if (ctx->have_size) { +            size = ctx->post_size;              have_size = 1;          }      } @@ -964,9 +959,20 @@ int32_t ec_manager_lookup(ec_fop_data_t * fop, int32_t state)              return EC_STATE_PREPARE_ANSWER;          case EC_STATE_PREPARE_ANSWER: +            /* +             * Lookup happens without any lock, so there is a chance that it +             * will have answers before modification happened and after +             * modification happened in the same response. So choose the next +             * best answer when the answers don't match for EC_MINIMUM_MIN +             */ + +            if (!fop->answer && !list_empty(&fop->cbk_list)) { +                fop->answer = list_entry (fop->cbk_list.next, ec_cbk_data_t, +                                          list); +            } +              cbk = fop->answer; -            if (cbk != NULL) -            { +            if (cbk != NULL) {                  if (!ec_dict_combine(cbk, EC_COMBINE_XDATA))                  {                      if (cbk->op_ret >= 0) @@ -986,9 +992,7 @@ int32_t ec_manager_lookup(ec_fop_data_t * fop, int32_t state)                      ec_lookup_rebuild(fop->xl->private, fop, cbk);                  } -            } -            else -            { +            } else {                  ec_fop_set_error(fop, EIO);              } @@ -1295,8 +1299,8 @@ out:  /* FOP: xattrop */ -int32_t ec_combine_xattrop(ec_fop_data_t * fop, ec_cbk_data_t * dst, -                           ec_cbk_data_t * src) +int32_t ec_combine_xattrop(ec_fop_data_t *fop, ec_cbk_data_t *dst, +                           ec_cbk_data_t *src)  {      if (!ec_dict_compare(dst->dict, src->dict))      { @@ -1316,9 +1320,9 @@ ec_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  {          ec_fop_data_t *fop = NULL;          ec_cbk_data_t *cbk = NULL; +        data_t *data; +        uint64_t *version;          int32_t idx = (int32_t)(uintptr_t)cookie; -        uint64_t version = 0; -        uint64_t *version_xattr = 0;          VALIDATE_OR_GOTO (this, out);          GF_VALIDATE_OR_GOTO (this->name, frame, out); @@ -1338,12 +1342,19 @@ ec_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          if (op_ret >= 0) {                  cbk->dict = dict_ref (xattr); -                if (dict_get_bin (xattr, EC_XATTR_VERSION, -                                  (void **)&version_xattr) == 0) { -                        version = ntoh64(version_xattr[0]); -                        if ((version >> EC_SELFHEAL_BIT) & 1) -                                fop->healing |= (1ULL<<idx); +                data = dict_get(cbk->dict, EC_XATTR_VERSION); +                if ((data != NULL) && (data->len >= sizeof(uint64_t))) { +                    version = (uint64_t *)data->data; + +                    if (((ntoh64(version[0]) >> EC_SELFHEAL_BIT) & 1) != 0) { +                            LOCK(&fop->lock); + +                            fop->healing |= 1ULL << idx; + +                            UNLOCK(&fop->lock); +                    }                  } +                  ec_dict_del_array (xattr, EC_XATTR_DIRTY, cbk->dirty,                                     EC_VERSION_SIZE);          } @@ -1377,13 +1388,10 @@ int32_t ec_manager_xattrop(ec_fop_data_t * fop, int32_t state)      {          case EC_STATE_INIT:          case EC_STATE_LOCK: -            if (fop->fd == NULL) -            { -                ec_lock_prepare_inode(fop, &fop->loc[0], 1); -            } -            else -            { -                ec_lock_prepare_fd(fop, fop->fd, 1); +            if (fop->fd == NULL) { +                ec_lock_prepare_inode(fop, &fop->loc[0], EC_UPDATE_META); +            } else { +                ec_lock_prepare_fd(fop, fop->fd, EC_UPDATE_META);              }              ec_lock(fop); diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c index ceddfeb6ac7..80725e5a9fa 100644 --- a/xlators/cluster/ec/src/ec-heal.c +++ b/xlators/cluster/ec/src/ec-heal.c @@ -119,9 +119,8 @@ void ec_heal_lookup_resume(ec_fop_data_t * fop)                  heal->version[0] = cbk->version[0];                  heal->version[1] = cbk->version[1];                  heal->raw_size = cbk->size; -                heal->fop->pre_size = cbk->iatt[0].ia_size; -                heal->fop->post_size = cbk->iatt[0].ia_size; -                heal->fop->have_size = 1; + +                GF_ASSERT(ec_set_inode_size(fop, cbk->inode, cbk->size));                  if (ec_loc_update(heal->xl, &heal->loc, cbk->inode,                                    &cbk->iatt[0]) != 0) @@ -532,12 +531,7 @@ ec_heal_init (ec_fop_data_t * fop)          gf_log("ec", GF_LOG_INFO, "Healing '%s', gfid %s", heal->loc.path,                 uuid_utoa(heal->loc.gfid));      } else { -        LOCK(&fop->lock); - -        fop->jobs++; -        fop->refs++; - -        UNLOCK(&fop->lock); +        ec_sleep(fop);      }      list_add_tail(&heal->list, &ctx->heal); @@ -552,25 +546,8 @@ out:      return error;  } -void ec_heal_entrylk(ec_heal_t * heal, entrylk_cmd cmd) -{ -    loc_t loc; - -    if (ec_loc_parent(heal->xl, &heal->loc, &loc) != 0) { -        gf_log("ec", GF_LOG_NOTICE, "ec_loc_parent() failed"); -        ec_fop_set_error(heal->fop, EIO); - -        return; -    } - -    ec_entrylk(heal->fop->frame, heal->xl, -1, EC_MINIMUM_ALL, NULL, NULL, -               heal->xl->name, &loc, NULL, cmd, ENTRYLK_WRLCK, NULL); - -    loc_wipe(&loc); -} - -void ec_heal_inodelk(ec_heal_t * heal, int32_t type, int32_t use_fd, -                     off_t offset, size_t size) +void ec_heal_lock(ec_heal_t *heal, int32_t type, fd_t *fd, loc_t *loc, +                  off_t offset, size_t size)  {      struct gf_flock flock; @@ -581,20 +558,47 @@ void ec_heal_inodelk(ec_heal_t * heal, int32_t type, int32_t use_fd,      flock.l_pid = 0;      flock.l_owner.len = 0; -    if (use_fd) +    /* Remove inode size information before unlocking it. */ +    if ((type == F_UNLCK) && (heal->loc.inode != NULL)) { +        ec_clear_inode_info(heal->fop, heal->loc.inode); +    } + +    if (fd != NULL)      {          ec_finodelk(heal->fop->frame, heal->xl, heal->fop->mask, -                    EC_MINIMUM_ALL, NULL, NULL, heal->xl->name, heal->fd, +                    EC_MINIMUM_ALL, NULL, NULL, heal->xl->name, fd,                      F_SETLKW, &flock, NULL);      }      else      {          ec_inodelk(heal->fop->frame, heal->xl, heal->fop->mask, EC_MINIMUM_ALL, -                   NULL, NULL, heal->xl->name, &heal->loc, F_SETLKW, &flock, +                   NULL, NULL, heal->xl->name, loc, F_SETLKW, &flock,                     NULL);      }  } +void ec_heal_entrylk(ec_heal_t *heal, int32_t type) +{ +    loc_t loc; + +    if (ec_loc_parent(heal->xl, &heal->loc, &loc) != 0) { +        ec_fop_set_error(heal->fop, EIO); + +        return; +    } + +    ec_heal_lock(heal, type, NULL, &loc, 0, 0); + +    loc_wipe(&loc); +} + +void ec_heal_inodelk(ec_heal_t *heal, int32_t type, int32_t use_fd, +                     off_t offset, size_t size) +{ +    ec_heal_lock(heal, type, use_fd ? heal->fd : NULL, &heal->loc, offset, +                 size); +} +  void ec_heal_lookup(ec_heal_t *heal, uintptr_t mask)  {      dict_t * xdata; @@ -1302,13 +1306,10 @@ ec_manager_heal (ec_fop_data_t * fop, int32_t state)          case EC_STATE_DISPATCH:              if (heal->done != 0) { -                gf_log("ec", GF_LOG_NOTICE, "heal already done");                  return EC_STATE_HEAL_DISPATCH;              } -            gf_log("ec", GF_LOG_NOTICE, "heal before entrylk"); -            ec_heal_entrylk(heal, ENTRYLK_LOCK); -            gf_log("ec", GF_LOG_NOTICE, "heal after entrylk"); +            ec_heal_entrylk(heal, F_WRLCK);              return EC_STATE_HEAL_ENTRY_LOOKUP; @@ -1336,7 +1337,7 @@ ec_manager_heal (ec_fop_data_t * fop, int32_t state)              /* Only heal data/metadata if enough information is supplied. */              if (gf_uuid_is_null(heal->loc.gfid))              { -                ec_heal_entrylk(heal, ENTRYLK_UNLOCK); +                ec_heal_entrylk(heal, F_UNLCK);                  return EC_STATE_HEAL_DISPATCH;              } @@ -1392,7 +1393,7 @@ ec_manager_heal (ec_fop_data_t * fop, int32_t state)          case -EC_STATE_HEAL_UNLOCK_ENTRY:          case EC_STATE_HEAL_UNLOCK_ENTRY:              if (heal->nameheal) -                    ec_heal_entrylk(heal, ENTRYLK_UNLOCK); +                    ec_heal_entrylk(heal, F_UNLCK);              heal->bad = ec_heal_needs_data_rebuild(heal);              if (heal->bad != 0) @@ -2562,9 +2563,9 @@ ec_heal_name (call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,          EC_REPLIES_ALLOC (replies, ec->nodes);          output = alloca0 (ec->nodes);          locked_on = alloca0 (ec->nodes); -        ret = cluster_entrylk (ec->xl_list, participants, ec->nodes, replies, +        ret = cluster_inodelk (ec->xl_list, participants, ec->nodes, replies,                                 locked_on, frame, ec->xl, ec->xl->name, parent, -                               NULL); +                               0, 0);          {                  if (ret <= ec->fragments) {                          gf_log (ec->xl->name, GF_LOG_DEBUG, "%s/%s: Skipping " @@ -2578,8 +2579,8 @@ ec_heal_name (call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,                  ret = __ec_heal_name (frame, ec, parent, name, participants);          }  unlock: -        cluster_unentrylk (ec->xl_list, locked_on, ec->nodes, replies, output, -                           frame, ec->xl, ec->xl->name, parent, NULL); +        cluster_uninodelk (ec->xl_list, locked_on, ec->nodes, replies, output, +                           frame, ec->xl, ec->xl->name, parent, 0, 0);  out:          cluster_replies_wipe (replies, ec->nodes);          loc_wipe (&loc); @@ -2663,9 +2664,9 @@ __ec_heal_entry (call_frame_t *frame, ec_t *ec, inode_t *inode,          dirty = alloca0 (ec->nodes * sizeof (*dirty));          EC_REPLIES_ALLOC (replies, ec->nodes); -        ret = cluster_entrylk (ec->xl_list, heal_on, ec->nodes, replies, +        ret = cluster_inodelk (ec->xl_list, heal_on, ec->nodes, replies,                                 locked_on, frame, ec->xl, ec->xl->name, inode, -                               NULL); +                               0, 0);          {                  if (ret <= ec->fragments) {                          gf_log (ec->xl->name, GF_LOG_DEBUG, "%s: Skipping heal " @@ -2680,8 +2681,8 @@ __ec_heal_entry (call_frame_t *frame, ec_t *ec, inode_t *inode,                  source = ret;          }  unlock: -        cluster_unentrylk (ec->xl_list, locked_on, ec->nodes, replies, output, -                           frame, ec->xl, ec->xl->name, inode, NULL); +        cluster_uninodelk (ec->xl_list, locked_on, ec->nodes, replies, output, +                           frame, ec->xl, ec->xl->name, inode, 0, 0);          if (ret < 0)                  goto out; @@ -2728,9 +2729,9 @@ ec_heal_entry (call_frame_t *frame, ec_t *ec, inode_t *inode,          sprintf (selfheal_domain, "%s:self-heal", ec->xl->name);          ec_mask_to_char_array (ec->xl_up, up_subvols, ec->nodes);          /*If other processes are already doing the heal, don't block*/ -        ret = cluster_entrylk (ec->xl_list, up_subvols, ec->nodes, replies, +        ret = cluster_inodelk (ec->xl_list, up_subvols, ec->nodes, replies,                                 locked_on, frame, ec->xl, selfheal_domain, inode, -                               NULL); +                               0, 0);          {                  if (ret <= ec->fragments) {                          gf_log (ec->xl->name, GF_LOG_DEBUG, "%s: Skipping heal " @@ -2743,8 +2744,8 @@ ec_heal_entry (call_frame_t *frame, ec_t *ec, inode_t *inode,                                         sources, healed_sinks);          }  unlock: -        cluster_unentrylk (ec->xl_list, locked_on, ec->nodes, replies, output, -                           frame, ec->xl, selfheal_domain, inode, NULL); +        cluster_uninodelk (ec->xl_list, locked_on, ec->nodes, replies, output, +                           frame, ec->xl, selfheal_domain, inode, 0, 0);          cluster_replies_wipe (replies, ec->nodes);          return ret;  } @@ -3086,8 +3087,8 @@ ec_heal_block (call_frame_t *frame, xlator_t *this, uintptr_t target,      if (fop == NULL)          goto out; -    fop->pre_size = fop->post_size = heal->total_size; -    fop->have_size = 1; +    GF_ASSERT(ec_set_inode_size(fop, heal->fd->inode, heal->total_size)); +      error = 0;  out: diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c index 8ce3087d5a6..48251c84bac 100644 --- a/xlators/cluster/ec/src/ec-helpers.c +++ b/xlators/cluster/ec/src/ec-helpers.c @@ -738,3 +738,41 @@ ec_filter_internal_xattrs (dict_t *xattr)          dict_foreach_match (xattr, ec_is_internal_xattr, NULL,                              dict_remove_foreach_fn, NULL);  } + +gf_boolean_t +ec_is_data_fop (glusterfs_fop_t fop) +{ +        switch (fop) { +        case GF_FOP_WRITE: +        case GF_FOP_TRUNCATE: +        case GF_FOP_FTRUNCATE: +        case GF_FOP_FALLOCATE: +        case GF_FOP_DISCARD: +        case GF_FOP_ZEROFILL: +                return _gf_true; +        default: +                return _gf_false; +        } +        return _gf_false; +} +/* +gf_boolean_t +ec_is_metadata_fop (int32_t lock_kind, glusterfs_fop_t fop) +{ +        if (lock_kind == EC_LOCK_ENTRY) { +                return _gf_false; +        } + +        switch (fop) { +        case GF_FOP_SETATTR: +        case GF_FOP_FSETATTR: +        case GF_FOP_SETXATTR: +        case GF_FOP_FSETXATTR: +        case GF_FOP_REMOVEXATTR: +        case GF_FOP_FREMOVEXATTR: +                return _gf_true; +        default: +                return _gf_false; +        } +        return _gf_false; +}*/ diff --git a/xlators/cluster/ec/src/ec-helpers.h b/xlators/cluster/ec/src/ec-helpers.h index df4495138fe..14243df54f3 100644 --- a/xlators/cluster/ec/src/ec-helpers.h +++ b/xlators/cluster/ec/src/ec-helpers.h @@ -59,4 +59,11 @@ ec_is_internal_xattr (dict_t *dict, char *key, data_t *value, void *data);  void  ec_filter_internal_xattrs (dict_t *xattr); + +gf_boolean_t +ec_is_data_fop (glusterfs_fop_t fop); +/* +gf_boolean_t +ec_is_metadata_fop (glusterfs_fop_t fop); +*/  #endif /* __EC_HELPERS_H__ */ diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c index 7372c0a0599..853d914148b 100644 --- a/xlators/cluster/ec/src/ec-inode-read.c +++ b/xlators/cluster/ec/src/ec-inode-read.c @@ -267,9 +267,9 @@ int32_t ec_manager_getxattr(ec_fop_data_t * fop, int32_t state)                  (strncmp(fop->str[0], GF_XATTR_CLRLK_CMD,                           strlen(GF_XATTR_CLRLK_CMD)) != 0)) {                  if (fop->fd == NULL) { -                    ec_lock_prepare_inode(fop, &fop->loc[0], 0); +                    ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO);                  } else { -                    ec_lock_prepare_fd(fop, fop->fd, 0); +                    ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO);                  }                  ec_lock(fop);              } @@ -1094,12 +1094,12 @@ int32_t ec_readv_rebuild(ec_t * ec, ec_fop_data_t * fop, ec_cbk_data_t * cbk)      size_t fsize = 0, size = 0, max = 0;      int32_t i = 0; -    if (cbk->op_ret < 0) -    { +    if (cbk->op_ret < 0) {          goto out;      } -    cbk->iatt[0].ia_size = fop->pre_size; +    /* This shouldn't fail because we have the inode locked. */ +    GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode, &cbk->iatt[0].ia_size));      if (cbk->op_ret > 0)      { @@ -1331,15 +1331,10 @@ int32_t ec_manager_readv(ec_fop_data_t * fop, int32_t state)          /* Fall through */          case EC_STATE_LOCK: -            ec_lock_prepare_fd(fop, fop->fd, 0); +            ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO);              ec_lock(fop); -            return EC_STATE_GET_SIZE_AND_VERSION; - -        case EC_STATE_GET_SIZE_AND_VERSION: -	    ec_get_size_version(fop); - -	    return EC_STATE_DISPATCH; +            return EC_STATE_DISPATCH;          case EC_STATE_DISPATCH:              ec_dispatch_min(fop); @@ -1396,7 +1391,6 @@ int32_t ec_manager_readv(ec_fop_data_t * fop, int32_t state)          case -EC_STATE_INIT:          case -EC_STATE_LOCK: -        case -EC_STATE_GET_SIZE_AND_VERSION:          case -EC_STATE_DISPATCH:          case -EC_STATE_PREPARE_ANSWER:          case -EC_STATE_REPORT: @@ -1580,22 +1574,14 @@ int32_t ec_manager_stat(ec_fop_data_t * fop, int32_t state)      {          case EC_STATE_INIT:          case EC_STATE_LOCK: -            if (fop->fd == NULL) -            { -                ec_lock_prepare_inode(fop, &fop->loc[0], 0); -            } -            else -            { -                ec_lock_prepare_fd(fop, fop->fd, 0); +            if (fop->fd == NULL) { +                ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO); +            } else { +                ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO);              }              ec_lock(fop); -            return EC_STATE_GET_SIZE_AND_VERSION; - -        case EC_STATE_GET_SIZE_AND_VERSION: -	    ec_get_size_version(fop); - -	    return EC_STATE_DISPATCH; +            return EC_STATE_DISPATCH;          case EC_STATE_DISPATCH:              ec_dispatch_all(fop); @@ -1614,16 +1600,16 @@ int32_t ec_manager_stat(ec_fop_data_t * fop, int32_t state)                          cbk->op_errno = EIO;                      }                  } -                if (cbk->op_ret < 0) -                { +                if (cbk->op_ret < 0) {                      ec_fop_set_error(fop, cbk->op_errno); -                } -                else -                { +                } else if (cbk->iatt[0].ia_type == IA_IFREG) {                      ec_iatt_rebuild(fop->xl->private, cbk->iatt, 1,                                      cbk->count); -                    cbk->iatt[0].ia_size = fop->pre_size; +                    /* This shouldn't fail because we have the inode locked. */ +                    GF_ASSERT(ec_get_inode_size(fop, +                                                fop->locks[0].lock->loc.inode, +                                                &cbk->iatt[0].ia_size));                  }              }              else @@ -1659,7 +1645,6 @@ int32_t ec_manager_stat(ec_fop_data_t * fop, int32_t state)          case -EC_STATE_INIT:          case -EC_STATE_LOCK: -        case -EC_STATE_GET_SIZE_AND_VERSION:          case -EC_STATE_DISPATCH:          case -EC_STATE_PREPARE_ANSWER:          case -EC_STATE_REPORT: diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c index 6b485a26fbc..368b3ae5edf 100644 --- a/xlators/cluster/ec/src/ec-inode-write.c +++ b/xlators/cluster/ec/src/ec-inode-write.c @@ -123,11 +123,13 @@ ec_manager_xattr (ec_fop_data_t *fop, int32_t state)          switch (state) {          case EC_STATE_INIT:          case EC_STATE_LOCK: -                if (fop->fd == NULL) -                    ec_lock_prepare_inode(fop, &fop->loc[0], 1); -                else -                    ec_lock_prepare_fd(fop, fop->fd, 1); - +                if (fop->fd == NULL) { +                        ec_lock_prepare_inode(fop, &fop->loc[0], +                                              EC_UPDATE_META | EC_QUERY_INFO); +                } else { +                        ec_lock_prepare_fd(fop, fop->fd, +                                           EC_UPDATE_META | EC_QUERY_INFO); +                }                  ec_lock(fop);                  return EC_STATE_DISPATCH; @@ -378,21 +380,15 @@ int32_t ec_manager_setattr(ec_fop_data_t * fop, int32_t state)      {          case EC_STATE_INIT:          case EC_STATE_LOCK: -            if (fop->fd == NULL) -            { -                ec_lock_prepare_inode(fop, &fop->loc[0], 1); -            } -            else -            { -                ec_lock_prepare_fd(fop, fop->fd, 1); +            if (fop->fd == NULL) { +                ec_lock_prepare_inode(fop, &fop->loc[0], +                                      EC_UPDATE_META | EC_QUERY_INFO); +            } else { +                ec_lock_prepare_fd(fop, fop->fd, +                                   EC_UPDATE_META | EC_QUERY_INFO);              }              ec_lock(fop); -            return EC_STATE_GET_SIZE_AND_VERSION; - -        case EC_STATE_GET_SIZE_AND_VERSION: -            ec_get_size_version(fop); -              return EC_STATE_DISPATCH;          case EC_STATE_DISPATCH: @@ -412,17 +408,17 @@ int32_t ec_manager_setattr(ec_fop_data_t * fop, int32_t state)                          cbk->op_errno = EIO;                      }                  } -                if (cbk->op_ret < 0) -                { +                if (cbk->op_ret < 0) {                      ec_fop_set_error(fop, cbk->op_errno); -                } -                else -                { +                } else if (cbk->iatt[0].ia_type == IA_IFREG) {                      ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2,                                      cbk->count); -                    cbk->iatt[0].ia_size = fop->pre_size; -                    cbk->iatt[1].ia_size = fop->pre_size; +                    /* This shouldn't fail because we have the inode locked. */ +                    GF_ASSERT(ec_get_inode_size(fop, +                                                fop->locks[0].lock->loc.inode, +                                                &cbk->iatt[0].ia_size)); +                    cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;                  }              }              else @@ -462,7 +458,6 @@ int32_t ec_manager_setattr(ec_fop_data_t * fop, int32_t state)          case -EC_STATE_INIT:          case -EC_STATE_LOCK: -        case -EC_STATE_GET_SIZE_AND_VERSION:          case -EC_STATE_DISPATCH:          case -EC_STATE_PREPARE_ANSWER:          case -EC_STATE_REPORT: @@ -992,21 +987,17 @@ int32_t ec_manager_truncate(ec_fop_data_t * fop, int32_t state)          /* Fall through */          case EC_STATE_LOCK: -            if (fop->id == GF_FOP_TRUNCATE) -            { -                ec_lock_prepare_inode(fop, &fop->loc[0], 1); -            } -            else -            { -                ec_lock_prepare_fd(fop, fop->fd, 1); +            if (fop->id == GF_FOP_TRUNCATE) { +                ec_lock_prepare_inode(fop, &fop->loc[0], +                                      EC_UPDATE_DATA | EC_UPDATE_META | +                                      EC_QUERY_INFO); +            } else { +                ec_lock_prepare_fd(fop, fop->fd, +                                   EC_UPDATE_DATA | EC_UPDATE_META | +                                   EC_QUERY_INFO);              }              ec_lock(fop); -            return EC_STATE_GET_SIZE_AND_VERSION; - -        case EC_STATE_GET_SIZE_AND_VERSION: -            ec_prepare_update(fop); -              return EC_STATE_DISPATCH;          case EC_STATE_DISPATCH: @@ -1035,14 +1026,18 @@ int32_t ec_manager_truncate(ec_fop_data_t * fop, int32_t state)                      ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2,                                      cbk->count); -                    cbk->iatt[0].ia_size = fop->pre_size; +                    /* This shouldn't fail because we have the inode locked. */ +                    GF_ASSERT(ec_get_inode_size(fop, +                                                fop->locks[0].lock->loc.inode, +                                                &cbk->iatt[0].ia_size));                      cbk->iatt[1].ia_size = fop->user_size; -                    fop->post_size = fop->user_size; -                    if ((fop->pre_size > fop->post_size) && -                        (fop->user_size != fop->offset)) -                    { -                        if (!ec_truncate_clean(fop)) -                        { +                    /* This shouldn't fail because we have the inode locked. */ +                    GF_ASSERT(ec_set_inode_size(fop, +                                                fop->locks[0].lock->loc.inode, +                                                fop->user_size)); +                    if ((cbk->iatt[0].ia_size > cbk->iatt[1].ia_size) && +                        (fop->user_size != fop->offset)) { +                        if (!ec_truncate_clean(fop)) {                              ec_fop_set_error(fop, EIO);                          }                      } @@ -1085,7 +1080,6 @@ int32_t ec_manager_truncate(ec_fop_data_t * fop, int32_t state)          case -EC_STATE_INIT:          case -EC_STATE_LOCK: -        case -EC_STATE_GET_SIZE_AND_VERSION:          case -EC_STATE_DISPATCH:          case -EC_STATE_PREPARE_ANSWER:          case -EC_STATE_REPORT: @@ -1355,9 +1349,13 @@ void ec_writev_start(ec_fop_data_t *fop)      ec_fd_t *ctx;      fd_t *fd;      size_t tail; +    uint64_t current;      uid_t uid;      gid_t gid; +    /* This shouldn't fail because we have the inode locked. */ +    GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode, ¤t)); +      fd = fd_anonymous(fop->fd->inode);      if (fd == NULL) {          ec_fop_set_error(fop, EIO); @@ -1373,7 +1371,7 @@ void ec_writev_start(ec_fop_data_t *fop)      ctx = ec_fd_get(fop->fd, fop->xl);      if (ctx != NULL) {          if ((ctx->flags & O_APPEND) != 0) { -            fop->offset = fop->pre_size; +            fop->offset = current;          }      } @@ -1404,22 +1402,17 @@ void ec_writev_start(ec_fop_data_t *fop)      iobref_unref(fop->buffers);      fop->buffers = iobref; -    if (fop->head > 0) -    { +    if (fop->head > 0) {          ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN, ec_writev_merge_head,                   NULL, fd, ec->stripe_size, fop->offset, 0, NULL);      }      tail = fop->size - fop->user_size - fop->head; -    if ((tail > 0) && ((fop->head == 0) || (fop->size > ec->stripe_size))) -    { -        if (fop->pre_size > fop->offset + fop->head + fop->user_size) -        { +    if ((tail > 0) && ((fop->head == 0) || (fop->size > ec->stripe_size))) { +        if (current > fop->offset + fop->head + fop->user_size) {              ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN,                       ec_writev_merge_tail, NULL, fd, ec->stripe_size,                       fop->offset + fop->size - ec->stripe_size, 0, NULL); -        } -        else -        { +        } else {              memset(fop->vector[0].iov_base + fop->size - tail, 0, tail);          }      } @@ -1530,14 +1523,11 @@ int32_t ec_manager_writev(ec_fop_data_t * fop, int32_t state)      {          case EC_STATE_INIT:          case EC_STATE_LOCK: -            ec_lock_prepare_fd(fop, fop->fd, 1); +            ec_lock_prepare_fd(fop, fop->fd, +                               EC_UPDATE_DATA | EC_UPDATE_META | +                               EC_QUERY_INFO);              ec_lock(fop); -            return EC_STATE_GET_SIZE_AND_VERSION; - -        case EC_STATE_GET_SIZE_AND_VERSION: -            ec_prepare_update(fop); -              return EC_STATE_DISPATCH;          case EC_STATE_DISPATCH: @@ -1574,27 +1564,34 @@ int32_t ec_manager_writev(ec_fop_data_t * fop, int32_t state)                      ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2,                                      cbk->count); +                    /* This shouldn't fail because we have the inode locked. */ +                    GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode, +                                                &cbk->iatt[0].ia_size)); +                    cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;                      size = fop->offset + fop->head + fop->user_size; -                    if (size > fop->pre_size) -                    { -                        fop->post_size = size; -                    } - -                    cbk->iatt[0].ia_size = fop->pre_size; -                    cbk->iatt[1].ia_size = fop->post_size; - -                    cbk->op_ret *= ec->fragments; -                    if (cbk->op_ret < fop->head) -                    { -                        cbk->op_ret = 0; -                    } -                    else -                    { -                        cbk->op_ret -= fop->head; +                    if (size > cbk->iatt[0].ia_size) { +                        /* Only update inode size if this is a top level fop. +                         * Otherwise this is an internal write and the top +                         * level fop should take care of the real inode size. +                         */ +                        if (fop->parent == NULL) { +                            /* This shouldn't fail because we have the inode +                             * locked. */ +                            GF_ASSERT(ec_set_inode_size(fop, fop->fd->inode, +                                                        size)); +                        } +                        cbk->iatt[1].ia_size = size;                      } -                    if (cbk->op_ret > fop->user_size) -                    { -                        cbk->op_ret = fop->user_size; +                    if (fop->error == 0) { +                        cbk->op_ret *= ec->fragments; +                        if (cbk->op_ret < fop->head) { +                            cbk->op_ret = 0; +                        } else { +                            cbk->op_ret -= fop->head; +                        } +                        if (cbk->op_ret > fop->user_size) { +                            cbk->op_ret = fop->user_size; +                        }                      }                  }              } @@ -1621,8 +1618,8 @@ int32_t ec_manager_writev(ec_fop_data_t * fop, int32_t state)          case -EC_STATE_INIT:          case -EC_STATE_LOCK: -        case -EC_STATE_GET_SIZE_AND_VERSION:          case -EC_STATE_DISPATCH: +        case -EC_STATE_DELAYED_START:          case -EC_STATE_PREPARE_ANSWER:          case -EC_STATE_REPORT:              GF_ASSERT(fop->error != 0); diff --git a/xlators/cluster/ec/src/ec-locks.c b/xlators/cluster/ec/src/ec-locks.c index 10572037932..22b6fa4d6e5 100644 --- a/xlators/cluster/ec/src/ec-locks.c +++ b/xlators/cluster/ec/src/ec-locks.c @@ -37,13 +37,22 @@ int32_t ec_lock_check(ec_fop_data_t *fop, uintptr_t *mask)              locked |= ans->mask;              cbk = ans;          } else { -            notlocked |= ans->mask; +                if (ans->op_errno == EAGAIN) { +                        switch (fop->uint32) { +                        case EC_LOCK_MODE_NONE: +                        case EC_LOCK_MODE_ALL: +                                /* Goal is to treat non-blocking lock as failure +                                 * even if there is a signle EAGAIN*/ +                                notlocked |= ans->mask; +                                break; +                        } +                }          }      }      if (error == -1) {          if (ec_bits_count(locked | notlocked) >= ec->fragments) { -            if (ec_bits_count (locked) >= ec->fragments) { +            if (notlocked == 0) {                  if (fop->answer == NULL) {                      fop->answer = cbk;                  } diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c index 3dd04299541..4028aa4d2bb 100644 --- a/xlators/cluster/ec/src/ec.c +++ b/xlators/cluster/ec/src/ec.c @@ -278,6 +278,7 @@ ec_notify_cbk (void *data)  {          ec_t *ec = data;          glusterfs_event_t event = GF_EVENT_MAXVAL; +        gf_boolean_t propagate = _gf_false;          LOCK(&ec->lock);          { @@ -309,10 +310,14 @@ ec_notify_cbk (void *data)                  /* CHILD_DOWN should not come here as no grace period is given                   * for notifying CHILD_DOWN. */ -                default_notify (ec->xl, event, NULL); +                propagate = _gf_true;          }  unlock:          UNLOCK(&ec->lock); + +        if (propagate) { +                default_notify (ec->xl, event, NULL); +        }  }  void @@ -360,6 +365,49 @@ ec_handle_down (xlator_t *this, ec_t *ec, int32_t idx)          }  } +gf_boolean_t +ec_force_unlocks(ec_t *ec) +{ +        struct list_head list; +        ec_fop_data_t *fop; + +        if (list_empty(&ec->pending_fops)) { +                return _gf_true; +        } + +        INIT_LIST_HEAD(&list); + +        /* All pending fops when GF_EVENT_PARENT_DOWN is received should only +         * be fops waiting for a delayed unlock. However the unlock can +         * generate new fops. We don't want to trverse these new fops while +         * forcing unlocks, so we move all fops to a temporal list. To process +         * them without interferences.*/ +        list_splice_init(&ec->pending_fops, &list); + +        while (!list_empty(&list)) { +                fop = list_entry(list.next, ec_fop_data_t, pending_list); +                list_move_tail(&fop->pending_list, &ec->pending_fops); + +                UNLOCK(&ec->lock); + +                ec_unlock_force(fop); + +                LOCK(&ec->lock); +        } + +        ec->shutdown = _gf_true; + +        return list_empty(&ec->pending_fops); +} + +void +ec_pending_fops_completed(ec_t *ec) +{ +        if (ec->shutdown) { +                default_notify(ec->xl, GF_EVENT_PARENT_DOWN, NULL); +        } +} +  int32_t  ec_notify (xlator_t *this, int32_t event, void *data, void *data2)  { @@ -367,14 +415,16 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2)          int32_t           idx       = 0;          int32_t           error     = 0;          glusterfs_event_t old_event = GF_EVENT_MAXVAL; -        glusterfs_event_t new_event = GF_EVENT_MAXVAL;          dict_t            *input    = NULL;          dict_t            *output   = NULL; +        gf_boolean_t      propagate = _gf_true; + +        gf_log (this->name, GF_LOG_TRACE, "NOTIFY(%d): %p, %p", +                event, data, data2);          if (event == GF_EVENT_TRANSLATOR_OP) {                  if (!ec->up) {                          error = -1; -                        goto out;                  } else {                          input = data;                          output = data2; @@ -400,13 +450,14 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2)                   */                  ec_launch_notify_timer (this, ec);                  goto unlock; +        } else if (event == GF_EVENT_PARENT_DOWN) { +                /* If there aren't pending fops running after we have waken up +                 * them, we immediately propagate the notification. */ +                propagate = ec_force_unlocks(ec); +                goto unlock;          } -        gf_log (this->name, GF_LOG_TRACE, "NOTIFY(%d): %p, %d", -                event, data, idx); -          if (idx < ec->nodes) { /* CHILD_* events */ -                  old_event = ec_get_event_from_state (ec);                  if (event == GF_EVENT_CHILD_UP) { @@ -415,28 +466,30 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2)                          ec_handle_down (this, ec, idx);                  } -                new_event = ec_get_event_from_state (ec); +                event = ec_get_event_from_state (ec); -                if (new_event == GF_EVENT_CHILD_UP && !ec->up) { +                if (event == GF_EVENT_CHILD_UP && !ec->up) {                          ec_up (this, ec); -                } else if (new_event == GF_EVENT_CHILD_DOWN && ec->up) { +                } else if (event == GF_EVENT_CHILD_DOWN && ec->up) {                          ec_down (this, ec);                  } -                if ((new_event == old_event) && (new_event != GF_EVENT_MAXVAL)) -                        new_event = GF_EVENT_CHILD_MODIFIED; - -                event = GF_EVENT_MAXVAL;/* Take care of notifying inside lock */ -                if (new_event != GF_EVENT_MAXVAL) -                        error = default_notify (this, new_event, data); +                if (event != GF_EVENT_MAXVAL) { +                        if (event == old_event) { +                                event = GF_EVENT_CHILD_MODIFIED; +                        } +                } else { +                        propagate = _gf_false; +                }          } -    unlock: -            UNLOCK (&ec->lock); +unlock: +        UNLOCK (&ec->lock); -            if (event != GF_EVENT_MAXVAL) -                    return default_notify (this, event, data); +        if (propagate) { +                error = default_notify (this, event, data); +        }  out: -            return error; +        return error;  }  int32_t @@ -478,6 +531,8 @@ init (xlator_t *this)      ec->xl = this;      LOCK_INIT(&ec->lock); +    INIT_LIST_HEAD(&ec->pending_fops); +      ec->fop_pool = mem_pool_new(ec_fop_data_t, 1024);      ec->cbk_pool = mem_pool_new(ec_cbk_data_t, 4096);      ec->lock_pool = mem_pool_new(ec_lock_t, 1024); diff --git a/xlators/cluster/ec/src/ec.h b/xlators/cluster/ec/src/ec.h index b8f3e288197..fdedb89ec18 100644 --- a/xlators/cluster/ec/src/ec.h +++ b/xlators/cluster/ec/src/ec.h @@ -44,6 +44,8 @@ struct _ec      xlator_t **       xl_list;      gf_lock_t         lock;      gf_timer_t *      timer; +    gf_boolean_t      shutdown; +    struct list_head  pending_fops;      struct mem_pool * fop_pool;      struct mem_pool * cbk_pool;      struct mem_pool * lock_pool; @@ -51,4 +53,7 @@ struct _ec      char              vol_uuid[UUID_SIZE + 1];      dict_t           *leaf_to_subvolid;  }; + +void ec_pending_fops_completed(ec_t *ec); +  #endif /* __EC_H__ */  | 
