/* Copyright (c) 2012-2014 DataLab, s.l. This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser General Public License, version 3 or any later version (LGPLv3 or later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free Software Foundation. */ #include "ec-messages.h" #include "ec-helpers.h" #include "ec-common.h" #include "ec-combine.h" #include "ec-method.h" #include "ec-fops.h" #include "ec-mem-types.h" int32_t ec_update_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { ec_fop_data_t *fop = cookie; ec_cbk_data_t *cbk = NULL; ec_fop_data_t *parent = fop->parent; int i = 0; ec_trace("UPDATE_WRITEV_CBK", cookie, "ret=%d, errno=%d, parent-fop=%s", op_ret, op_errno, ec_fop_name(parent->id)); if (op_ret < 0) { ec_fop_set_error(parent, op_errno); goto out; } cbk = ec_cbk_data_allocate(parent->frame, this, parent, parent->id, 0, op_ret, op_errno); if (!cbk) { ec_fop_set_error(parent, ENOMEM); goto out; } if (xdata) cbk->xdata = dict_ref(xdata); if (prebuf) cbk->iatt[i++] = *prebuf; if (postbuf) cbk->iatt[i++] = *postbuf; LOCK(&parent->lock); { parent->good &= fop->good; if (gf_bits_count(parent->good) < parent->minimum) { __ec_fop_set_error(parent, EIO); } else if (fop->error == 0 && parent->answer == NULL) { parent->answer = cbk; } } UNLOCK(&parent->lock); out: return 0; } static int32_t ec_update_write(ec_fop_data_t *fop, uintptr_t mask, off_t offset, uint64_t size) { struct iobref *iobref = NULL; struct iobuf *iobuf = NULL; struct iovec vector; int32_t err = -ENOMEM; iobref = iobref_new(); if (iobref == NULL) { goto out; } iobuf = iobuf_get(fop->xl->ctx->iobuf_pool); if (iobuf == NULL) { goto out; } err = iobref_add(iobref, iobuf); if (err != 0) { goto out; } if (fop->locks[0].lock) ec_lock_update_good(fop->locks[0].lock, fop); vector.iov_base = iobuf->ptr; vector.iov_len = size; memset(vector.iov_base, 0, vector.iov_len); ec_writev(fop->frame, fop->xl, mask, fop->minimum, ec_update_writev_cbk, NULL, fop->fd, &vector, 1, offset, 0, iobref, NULL); err = 0; out: if (iobuf != NULL) { iobuf_unref(iobuf); } if (iobref != NULL) { iobref_unref(iobref); } return err; } int ec_inode_write_cbk(call_frame_t *frame, xlator_t *this, void *cookie, int op_ret, int op_errno, struct iatt *prestat, struct iatt *poststat, dict_t *xdata) { ec_fop_data_t *fop = NULL; ec_cbk_data_t *cbk = NULL; int i = 0; int idx = 0; VALIDATE_OR_GOTO(this, out); GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, frame->local, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); fop = frame->local; idx = (int32_t)(uintptr_t)cookie; ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame, op_ret, op_errno); cbk = ec_cbk_data_allocate(frame, this, fop, fop->id, idx, op_ret, op_errno); if (!cbk) goto out; if (op_ret < 0) goto out; if (xdata) cbk->xdata = dict_ref(xdata); if (prestat) cbk->iatt[i++] = *prestat; if (poststat) cbk->iatt[i++] = *poststat; out: if (cbk) ec_combine(cbk, ec_combine_write); if (fop) ec_complete(fop); return 0; } /* FOP: removexattr */ int32_t ec_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { return ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, NULL, NULL, xdata); } void ec_wind_removexattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx) { ec_trace("WIND", fop, "idx=%d", idx); STACK_WIND_COOKIE(fop->frame, ec_removexattr_cbk, (void *)(uintptr_t)idx, ec->xl_list[idx], ec->xl_list[idx]->fops->removexattr, &fop->loc[0], fop->str[0], fop->xdata); } void ec_xattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { ec_fop_data_t *fop = cookie; switch (fop->id) { case GF_FOP_SETXATTR: if (fop->cbks.setxattr) { QUORUM_CBK(fop->cbks.setxattr, fop, frame, cookie, this, op_ret, op_errno, xdata); } break; case GF_FOP_REMOVEXATTR: if (fop->cbks.removexattr) { QUORUM_CBK(fop->cbks.removexattr, fop, frame, cookie, this, op_ret, op_errno, xdata); } break; case GF_FOP_FSETXATTR: if (fop->cbks.fsetxattr) { QUORUM_CBK(fop->cbks.fsetxattr, fop, frame, cookie, this, op_ret, op_errno, xdata); } break; case GF_FOP_FREMOVEXATTR: if (fop->cbks.fremovexattr) { QUORUM_CBK(fop->cbks.fremovexattr, fop, frame, cookie, this, op_ret, op_errno, xdata); } break; } } int32_t ec_manager_xattr(ec_fop_data_t *fop, int32_t state) { ec_cbk_data_t *cbk; switch (state) { case EC_STATE_INIT: case EC_STATE_LOCK: if (fop->fd == NULL) { ec_lock_prepare_inode(fop, &fop->loc[0], EC_UPDATE_META | EC_QUERY_INFO, 0, EC_RANGE_FULL); } else { ec_lock_prepare_fd(fop, fop->fd, EC_UPDATE_META | EC_QUERY_INFO, 0, EC_RANGE_FULL); } ec_lock(fop); return EC_STATE_DISPATCH; case EC_STATE_DISPATCH: ec_dispatch_all(fop); return EC_STATE_PREPARE_ANSWER; case EC_STATE_PREPARE_ANSWER: ec_fop_prepare_answer(fop, _gf_false); return EC_STATE_REPORT; case EC_STATE_REPORT: cbk = fop->answer; GF_ASSERT(cbk != NULL); ec_xattr_cbk(fop->req_frame, fop, fop->xl, cbk->op_ret, cbk->op_errno, cbk->xdata); return EC_STATE_LOCK_REUSE; case -EC_STATE_INIT: case -EC_STATE_LOCK: case -EC_STATE_DISPATCH: case -EC_STATE_PREPARE_ANSWER: case -EC_STATE_REPORT: GF_ASSERT(fop->error != 0); ec_xattr_cbk(fop->req_frame, fop, fop->xl, -1, fop->error, NULL); return EC_STATE_LOCK_REUSE; case -EC_STATE_LOCK_REUSE: case EC_STATE_LOCK_REUSE: ec_lock_reuse(fop); return EC_STATE_UNLOCK; case -EC_STATE_UNLOCK: case EC_STATE_UNLOCK: ec_unlock(fop); return EC_STATE_END; default: gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, "Unhandled state %d for %s", state, ec_fop_name(fop->id)); return EC_STATE_END; } } void ec_removexattr(call_frame_t *frame, xlator_t *this, uintptr_t target, uint32_t fop_flags, fop_removexattr_cbk_t func, void *data, loc_t *loc, const char *name, dict_t *xdata) { ec_cbk_t callback = {.removexattr = func}; ec_fop_data_t *fop = NULL; int32_t error = ENOMEM; gf_msg_trace("ec", 0, "EC(REMOVEXATTR) %p", frame); VALIDATE_OR_GOTO(this, out); GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); fop = ec_fop_data_allocate(frame, this, GF_FOP_REMOVEXATTR, 0, target, fop_flags, ec_wind_removexattr, ec_manager_xattr, callback, data); if (fop == NULL) { goto out; } if (loc != NULL) { if (loc_copy(&fop->loc[0], loc) != 0) { gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL, "Failed to copy a location."); goto out; } } if (name != NULL) { fop->str[0] = gf_strdup(name); if (fop->str[0] == NULL) { gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY, "Failed to duplicate a string."); goto out; } } if (xdata != NULL) { fop->xdata = dict_copy_with_ref(xdata, NULL); if (fop->xdata == NULL) { gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, "Failed to reference a " "dictionary."); goto out; } } error = 0; out: if (fop != NULL) { ec_manager(fop, error); } else { func(frame, NULL, this, -1, error, NULL); } } /* FOP: fremovexattr */ int32_t ec_fremovexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { return ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, NULL, NULL, xdata); } void ec_wind_fremovexattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx) { ec_trace("WIND", fop, "idx=%d", idx); STACK_WIND_COOKIE(fop->frame, ec_fremovexattr_cbk, (void *)(uintptr_t)idx, ec->xl_list[idx], ec->xl_list[idx]->fops->fremovexattr, fop->fd, fop->str[0], fop->xdata); } void ec_fremovexattr(call_frame_t *frame, xlator_t *this, uintptr_t target, uint32_t fop_flags, fop_fremovexattr_cbk_t func, void *data, fd_t *fd, const char *name, dict_t *xdata) { ec_cbk_t callback = {.fremovexattr = func}; ec_fop_data_t *fop = NULL; int32_t error = ENOMEM; gf_msg_trace("ec", 0, "EC(FREMOVEXATTR) %p", frame); VALIDATE_OR_GOTO(this, out); GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); fop = ec_fop_data_allocate(frame, this, GF_FOP_FREMOVEXATTR, 0, target, fop_flags, ec_wind_fremovexattr, ec_manager_xattr, callback, data); if (fop == NULL) { goto out; } fop->use_fd = 1; if (fd != NULL) { fop->fd = fd_ref(fd); if (fop->fd == NULL) { gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL, "Failed to reference a " "file descriptor."); goto out; } } if (name != NULL) { fop->str[0] = gf_strdup(name); if (fop->str[0] == NULL) { gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY, "Failed to duplicate a string."); goto out; } } if (xdata != NULL) { fop->xdata = dict_copy_with_ref(xdata, NULL); if (fop->xdata == NULL) { gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, "Failed to reference a " "dictionary."); goto out; } } error = 0; out: if (fop != NULL) { ec_manager(fop, error); } else { func(frame, NULL, this, -1, error, NULL); } } /* FOP: setattr */ int32_t ec_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prestat, struct iatt *poststat, dict_t *xdata) { return ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, prestat, poststat, xdata); } void ec_wind_setattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx) { ec_trace("WIND", fop, "idx=%d", idx); STACK_WIND_COOKIE(fop->frame, ec_setattr_cbk, (void *)(uintptr_t)idx, ec->xl_list[idx], ec->xl_list[idx]->fops->setattr, &fop->loc[0], &fop->iatt, fop->int32, fop->xdata); } int32_t ec_manager_setattr(ec_fop_data_t *fop, int32_t state) { ec_cbk_data_t *cbk; switch (state) { case EC_STATE_INIT: case EC_STATE_LOCK: if (fop->fd == NULL) { ec_lock_prepare_inode(fop, &fop->loc[0], EC_UPDATE_META | EC_QUERY_INFO, 0, EC_RANGE_FULL); } else { ec_lock_prepare_fd(fop, fop->fd, EC_UPDATE_META | EC_QUERY_INFO, 0, EC_RANGE_FULL); } ec_lock(fop); return EC_STATE_DISPATCH; case EC_STATE_DISPATCH: ec_dispatch_all(fop); return EC_STATE_PREPARE_ANSWER; case EC_STATE_PREPARE_ANSWER: cbk = ec_fop_prepare_answer(fop, _gf_false); if (cbk != NULL) { if (cbk->iatt[0].ia_type == IA_IFREG) { ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2, cbk->count); /* This shouldn't fail because we have the inode locked. */ GF_ASSERT(ec_get_inode_size(fop, fop->locks[0].lock->loc.inode, &cbk->iatt[0].ia_size)); cbk->iatt[1].ia_size = cbk->iatt[0].ia_size; } } return EC_STATE_REPORT; case EC_STATE_REPORT: cbk = fop->answer; GF_ASSERT(cbk != NULL); if (fop->id == GF_FOP_SETATTR) { if (fop->cbks.setattr != NULL) { QUORUM_CBK(fop->cbks.setattr, fop, fop->req_frame, fop, fop->xl, cbk->op_ret, cbk->op_errno, &cbk->iatt[0], &cbk->iatt[1], cbk->xdata); } } else { if (fop->cbks.fsetattr != NULL) { QUORUM_CBK(fop->cbks.fsetattr, fop, fop->req_frame, fop, fop->xl, cbk->op_ret, cbk->op_errno, &cbk->iatt[0], &cbk->iatt[1], cbk->xdata); } } return EC_STATE_LOCK_REUSE; case -EC_STATE_INIT: case -EC_STATE_LOCK: case -EC_STATE_DISPATCH: case -EC_STATE_PREPARE_ANSWER: case -EC_STATE_REPORT: GF_ASSERT(fop->error != 0); if (fop->id == GF_FOP_SETATTR) { if (fop->cbks.setattr != NULL) { fop->cbks.setattr(fop->req_frame, fop, fop->xl, -1, fop->error, NULL, NULL, NULL); } } else { if (fop->cbks.fsetattr != NULL) { fop->cbks.fsetattr(fop->req_frame, fop, fop->xl, -1, fop->error, NULL, NULL, NULL); } } return EC_STATE_LOCK_REUSE; case -EC_STATE_LOCK_REUSE: case EC_STATE_LOCK_REUSE: ec_lock_reuse(fop); return EC_STATE_UNLOCK; case -EC_STATE_UNLOCK: case EC_STATE_UNLOCK: ec_unlock(fop); return EC_STATE_END; default: gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, "Unhandled state %d for %s", state, ec_fop_name(fop->id)); return EC_STATE_END; } } void ec_setattr(call_frame_t *frame, xlator_t *this, uintptr_t target, uint32_t fop_flags, fop_setattr_cbk_t func, void *data, loc_t *loc, struct iatt *stbuf, int32_t valid, dict_t *xdata) { ec_cbk_t callback = {.setattr = func}; ec_fop_data_t *fop = NULL; int32_t error = ENOMEM; gf_msg_trace("ec", 0, "EC(SETATTR) %p", frame); VALIDATE_OR_GOTO(this, out); GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); fop = ec_fop_data_allocate(frame, this, GF_FOP_SETATTR, 0, target, fop_flags, ec_wind_setattr, ec_manager_setattr, callback, data); if (fop == NULL) { goto out; } fop->int32 = valid; if (loc != NULL) { if (loc_copy(&fop->loc[0], loc) != 0) { gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL, "Failed to copy a location."); goto out; } } if (stbuf != NULL) { fop->iatt = *stbuf; } if (xdata != NULL) { fop->xdata = dict_copy_with_ref(xdata, NULL); if (fop->xdata == NULL) { gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, "Failed to reference a " "dictionary."); goto out; } } error = 0; out: if (fop != NULL) { ec_manager(fop, error); } else { func(frame, NULL, this, -1, error, NULL, NULL, NULL); } } /* FOP: fsetattr */ int32_t ec_fsetattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prestat, struct iatt *poststat, dict_t *xdata) { return ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, prestat, poststat, xdata); } void ec_wind_fsetattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx) { ec_trace("WIND", fop, "idx=%d", idx); STACK_WIND_COOKIE(fop->frame, ec_fsetattr_cbk, (void *)(uintptr_t)idx, ec->xl_list[idx], ec->xl_list[idx]->fops->fsetattr, fop->fd, &fop->iatt, fop->int32, fop->xdata); } void ec_fsetattr(call_frame_t *frame, xlator_t *this, uintptr_t target, uint32_t fop_flags, fop_fsetattr_cbk_t func, void *data, fd_t *fd, struct iatt *stbuf, int32_t valid, dict_t *xdata) { ec_cbk_t callback = {.fsetattr = func}; ec_fop_data_t *fop = NULL; int32_t error = ENOMEM; gf_msg_trace("ec", 0, "EC(FSETATTR) %p", frame); VALIDATE_OR_GOTO(this, out); GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); fop = ec_fop_data_allocate(frame, this, GF_FOP_FSETATTR, 0, target, fop_flags, ec_wind_fsetattr, ec_manager_setattr, callback, data); if (fop == NULL) { goto out; } fop->use_fd = 1; fop->int32 = valid; if (fd != NULL) { fop->fd = fd_ref(fd); if (fop->fd == NULL) { gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL, "Failed to reference a " "file descriptor."); goto out; } } if (stbuf != NULL) { fop->iatt = *stbuf; } if (xdata != NULL) { fop->xdata = dict_copy_with_ref(xdata, NULL); if (fop->xdata == NULL) { gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, "Failed to reference a " "dictionary."); goto out; } } error = 0; out: if (fop != NULL) { ec_manager(fop, error); } else { func(frame, NULL, this, -1, error, NULL, NULL, NULL); } } /* FOP: setxattr */ int32_t ec_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { return ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, NULL, NULL, xdata); } void ec_wind_setxattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx) { ec_trace("WIND", fop, "idx=%d", idx); STACK_WIND_COOKIE(fop->frame, ec_setxattr_cbk, (void *)(uintptr_t)idx, ec->xl_list[idx], ec->xl_list[idx]->fops->setxattr, &fop->loc[0], fop->dict, fop->int32, fop->xdata); } void ec_setxattr(call_frame_t *frame, xlator_t *this, uintptr_t target, uint32_t fop_flags, fop_setxattr_cbk_t func, void *data, loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata) { ec_cbk_t callback = {.setxattr = func}; ec_fop_data_t *fop = NULL; int32_t error = ENOMEM; gf_msg_trace("ec", 0, "EC(SETXATTR) %p", frame); VALIDATE_OR_GOTO(this, out); GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); fop = ec_fop_data_allocate(frame, this, GF_FOP_SETXATTR, 0, target, fop_flags, ec_wind_setxattr, ec_manager_xattr, callback, data); if (fop == NULL) { goto out; } fop->int32 = flags; if (loc != NULL) { if (loc_copy(&fop->loc[0], loc) != 0) { gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL, "Failed to copy a location."); goto out; } } if (dict != NULL) { fop->dict = dict_copy_with_ref(dict, NULL); if (fop->dict == NULL) { gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, "Failed to reference a " "dictionary."); goto out; } } if (xdata != NULL) { fop->xdata = dict_copy_with_ref(xdata, NULL); if (fop->xdata == NULL) { gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, "Failed to reference a " "dictionary."); goto out; } } error = 0; out: if (fop != NULL) { ec_manager(fop, error); } else { func(frame, NULL, this, -1, error, NULL); } } /* FOP: fsetxattr */ int32_t ec_fsetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { ec_fop_data_t *fop = NULL; ec_cbk_data_t *cbk = NULL; int32_t idx = (int32_t)(uintptr_t)cookie; VALIDATE_OR_GOTO(this, out); GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, frame->local, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); fop = frame->local; ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame, op_ret, op_errno); cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_FSETXATTR, idx, op_ret, op_errno); if (cbk != NULL) { if (xdata != NULL) { cbk->xdata = dict_ref(xdata); if (cbk->xdata == NULL) { gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, "Failed to reference a " "dictionary."); goto out; } } ec_combine(cbk, NULL); } out: if (fop != NULL) { ec_complete(fop); } return 0; } void ec_wind_fsetxattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx) { ec_trace("WIND", fop, "idx=%d", idx); STACK_WIND_COOKIE(fop->frame, ec_fsetxattr_cbk, (void *)(uintptr_t)idx, ec->xl_list[idx], ec->xl_list[idx]->fops->fsetxattr, fop->fd, fop->dict, fop->int32, fop->xdata); } void ec_fsetxattr(call_frame_t *frame, xlator_t *this, uintptr_t target, uint32_t fop_flags, fop_fsetxattr_cbk_t func, void *data, fd_t *fd, dict_t *dict, int32_t flags, dict_t *xdata) { ec_cbk_t callback = {.fsetxattr = func}; ec_fop_data_t *fop = NULL; int32_t error = ENOMEM; gf_msg_trace("ec", 0, "EC(FSETXATTR) %p", frame); VALIDATE_OR_GOTO(this, out); GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); fop = ec_fop_data_allocate(frame, this, GF_FOP_FSETXATTR, 0, target, fop_flags, ec_wind_fsetxattr, ec_manager_xattr, callback, data); if (fop == NULL) { goto out; } fop->use_fd = 1; fop->int32 = flags; if (fd != NULL) { fop->fd = fd_ref(fd); if (fop->fd == NULL) { gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL, "Failed to reference a " "file descriptor."); goto out; } } if (dict != NULL) { fop->dict = dict_copy_with_ref(dict, NULL); if (fop->dict == NULL) { gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, "Failed to reference a " "dictionary."); goto out; } } if (xdata != NULL) { fop->xdata = dict_copy_with_ref(xdata, NULL); if (fop->xdata == NULL) { gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, "Failed to reference a " "dictionary."); goto out; } } error = 0; out: if (fop != NULL) { ec_manager(fop, error); } else { func(frame, NULL, this, -1, error, NULL); } } /********************************************************************* * * File Operation : fallocate * *********************************************************************/ int32_t ec_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { return ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, prebuf, postbuf, xdata); } void ec_wind_fallocate(ec_t *ec, ec_fop_data_t *fop, int32_t idx) { ec_trace("WIND", fop, "idx=%d", idx); STACK_WIND_COOKIE(fop->frame, ec_fallocate_cbk, (void *)(uintptr_t)idx, ec->xl_list[idx], ec->xl_list[idx]->fops->fallocate, fop->fd, fop->int32, fop->offset, fop->size, fop->xdata); } int32_t ec_manager_fallocate(ec_fop_data_t *fop, int32_t state) { ec_cbk_data_t *cbk = NULL; switch (state) { case EC_STATE_INIT: if (fop->size == 0) { ec_fop_set_error(fop, EINVAL); return EC_STATE_REPORT; } if (fop->int32 & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE | FALLOC_FL_ZERO_RANGE | FALLOC_FL_PUNCH_HOLE)) { ec_fop_set_error(fop, ENOTSUP); return EC_STATE_REPORT; } fop->user_size = fop->offset + fop->size; fop->head = ec_adjust_offset_down(fop->xl->private, &fop->offset, _gf_true); fop->size += fop->head; ec_adjust_size_up(fop->xl->private, &fop->size, _gf_true); /* Fall through */ case EC_STATE_LOCK: ec_lock_prepare_fd(fop, fop->fd, EC_UPDATE_DATA | EC_UPDATE_META | EC_QUERY_INFO, fop->offset, fop->size); ec_lock(fop); return EC_STATE_DISPATCH; case EC_STATE_DISPATCH: ec_dispatch_all(fop); return EC_STATE_PREPARE_ANSWER; case EC_STATE_PREPARE_ANSWER: cbk = ec_fop_prepare_answer(fop, _gf_false); if (cbk != NULL) { ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2, cbk->count); /* This shouldn't fail because we have the inode locked. */ LOCK(&fop->locks[0].lock->loc.inode->lock); { GF_ASSERT(__ec_get_inode_size(fop, fop->locks[0].lock->loc.inode, &cbk->iatt[0].ia_size)); /*If mode has FALLOC_FL_KEEP_SIZE keep the size */ if (fop->int32 & FALLOC_FL_KEEP_SIZE) { cbk->iatt[1].ia_size = cbk->iatt[0].ia_size; } else if (fop->user_size > cbk->iatt[0].ia_size) { cbk->iatt[1].ia_size = fop->user_size; /* This shouldn't fail because we have the inode * locked. */ GF_ASSERT(__ec_set_inode_size( fop, fop->locks[0].lock->loc.inode, cbk->iatt[1].ia_size)); } else { cbk->iatt[1].ia_size = cbk->iatt[0].ia_size; } } UNLOCK(&fop->locks[0].lock->loc.inode->lock); } return EC_STATE_REPORT; case EC_STATE_REPORT: cbk = fop->answer; GF_ASSERT(cbk != NULL); if (fop->cbks.fallocate != NULL) { QUORUM_CBK(fop->cbks.fallocate, fop, fop->req_frame, fop, fop->xl, cbk->op_ret, cbk->op_errno, &cbk->iatt[0], &cbk->iatt[1], cbk->xdata); } return EC_STATE_LOCK_REUSE; case -EC_STATE_INIT: case -EC_STATE_LOCK: case -EC_STATE_DISPATCH: case -EC_STATE_PREPARE_ANSWER: case -EC_STATE_REPORT: GF_ASSERT(fop->error != 0); if (fop->cbks.fallocate != NULL) { fop->cbks.fallocate(fop->req_frame, fop, fop->xl, -1, fop->error, NULL, NULL, NULL); } return EC_STATE_LOCK_REUSE; case -EC_STATE_LOCK_REUSE: case EC_STATE_LOCK_REUSE: ec_lock_reuse(fop); return EC_STATE_UNLOCK; case -EC_STATE_UNLOCK: case EC_STATE_UNLOCK: ec_unlock(fop); return EC_STATE_END; default: gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, "Unhandled state %d for %s", state, ec_fop_name(fop->id)); return EC_STATE_END; } } void ec_fallocate(call_frame_t *frame, xlator_t *this, uintptr_t target, uint32_t fop_flags, fop_fallocate_cbk_t func, void *data, fd_t *fd, int32_t mode, off_t offset, size_t len, dict_t *xdata) { ec_cbk_t callback = {.fallocate = func}; ec_fop_data_t *fop = NULL; int32_t error = ENOMEM; gf_msg_trace("ec", 0, "EC(FALLOCATE) %p", frame); VALIDATE_OR_GOTO(this, out); GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); fop = ec_fop_data_allocate(frame, this, GF_FOP_FALLOCATE, 0, target, fop_flags, ec_wind_fallocate, ec_manager_fallocate, callback, data); if (fop == NULL) { goto out; } fop->use_fd = 1; fop->int32 = mode; fop->offset = offset; fop->size = len; if (fd != NULL) { fop->fd = fd_ref(fd); if (fop->fd == NULL) { gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL, "Failed to reference a " "file descriptor."); goto out; } } if (xdata != NULL) { fop->xdata = dict_ref(xdata); if (fop->xdata == NULL) { gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, "Failed to reference a " "dictionary."); goto out; } } error = 0; out: if (fop != NULL) { ec_manager(fop, error); } else { func(frame, NULL, this, -1, error, NULL, NULL, NULL); } } /********************************************************************* * * File Operation : Discard * *********************************************************************/ void ec_update_discard_write(ec_fop_data_t *fop, uintptr_t mask) { ec_t *ec = fop->xl->private; off_t off_head = 0; off_t off_tail = 0; uint64_t size_head = 0; uint64_t size_tail = 0; int error = 0; off_head = fop->offset * ec->fragments - fop->int32; if (fop->size == 0) { error = ec_update_write(fop, mask, off_head, fop->user_size); } else { size_head = fop->int32; size_tail = (off_head + fop->user_size) % ec->stripe_size; off_tail = off_head + fop->user_size - size_tail; if (size_head) { error = ec_update_write(fop, mask, off_head, size_head); if (error) { goto out; } } if (size_tail) { error = ec_update_write(fop, mask, off_tail, size_tail); } } out: if (error) ec_fop_set_error(fop, -error); } void ec_discard_adjust_offset_size(ec_fop_data_t *fop) { ec_t *ec = fop->xl->private; fop->user_size = fop->size; /* If discard length covers at least a fragment on brick, we will * perform discard operation(when fop->size is non-zero) else we just * write zeros. */ fop->int32 = ec_adjust_offset_up(ec, &fop->offset, _gf_true); fop->frag_range.first = fop->offset; if (fop->size < fop->int32) { fop->size = 0; } else { fop->size -= fop->int32; ec_adjust_size_down(ec, &fop->size, _gf_true); } fop->frag_range.last = fop->offset + fop->size; } int32_t ec_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { return ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, prebuf, postbuf, xdata); } void ec_wind_discard(ec_t *ec, ec_fop_data_t *fop, int32_t idx) { ec_trace("WIND", fop, "idx=%d", idx); STACK_WIND_COOKIE(fop->frame, ec_discard_cbk, (void *)(uintptr_t)idx, ec->xl_list[idx], ec->xl_list[idx]->fops->discard, fop->fd, fop->offset, fop->size, fop->xdata); } int32_t ec_manager_discard(ec_fop_data_t *fop, int32_t state) { ec_cbk_data_t *cbk = NULL; off_t fl_start = 0; uint64_t fl_size = 0; switch (state) { case EC_STATE_INIT: if ((fop->size <= 0) || (fop->offset < 0)) { ec_fop_set_error(fop, EINVAL); return EC_STATE_REPORT; } /* Because of the head/tail writes, "discard" happens on the * remaining regions, but we need to compute region including * head/tail writes so compute them separately*/ fl_start = fop->offset; fl_size = fop->size; fl_size += ec_adjust_offset_down(fop->xl->private, &fl_start, _gf_true); ec_adjust_size_up(fop->xl->private, &fl_size, _gf_true); ec_discard_adjust_offset_size(fop); /* Fall through */ case EC_STATE_LOCK: ec_lock_prepare_fd(fop, fop->fd, EC_UPDATE_DATA | EC_UPDATE_META | EC_QUERY_INFO, fl_start, fl_size); ec_lock(fop); return EC_STATE_DISPATCH; case EC_STATE_DISPATCH: /* Dispatch discard fop only if we have whole fragment * to deallocate */ if (fop->size) { ec_dispatch_all(fop); return EC_STATE_DELAYED_START; } else { /* Assume discard to have succeeded on all bricks */ ec_succeed_all(fop); } /* Fall through */ case EC_STATE_DELAYED_START: if (fop->size) { if (fop->answer && fop->answer->op_ret == 0) ec_update_discard_write(fop, fop->answer->mask); } else { ec_update_discard_write(fop, fop->mask); } return EC_STATE_PREPARE_ANSWER; case EC_STATE_PREPARE_ANSWER: cbk = ec_fop_prepare_answer(fop, _gf_false); if (cbk != NULL) { ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2, cbk->count); /* This shouldn't fail because we have the inode locked. */ GF_ASSERT(ec_get_inode_size(fop, fop->locks[0].lock->loc.inode, &cbk->iatt[0].ia_size)); cbk->iatt[1].ia_size = cbk->iatt[0].ia_size; } return EC_STATE_REPORT; case EC_STATE_REPORT: cbk = fop->answer; GF_ASSERT(cbk != NULL); if (fop->cbks.discard != NULL) { QUORUM_CBK(fop->cbks.discard, fop, fop->req_frame, fop, fop->xl, cbk->op_ret, cbk->op_errno, &cbk->iatt[0], &cbk->iatt[1], cbk->xdata); } return EC_STATE_LOCK_REUSE; case -EC_STATE_INIT: case -EC_STATE_LOCK: case -EC_STATE_DISPATCH: case -EC_STATE_DELAYED_START: case -EC_STATE_PREPARE_ANSWER: case -EC_STATE_REPORT: GF_ASSERT(fop->error != 0); if (fop->cbks.discard != NULL) { fop->cbks.discard(fop->req_frame, fop, fop->xl, -1, fop->error, NULL, NULL, NULL); } return EC_STATE_LOCK_REUSE; case -EC_STATE_LOCK_REUSE: case EC_STATE_LOCK_REUSE: ec_lock_reuse(fop); return EC_STATE_UNLOCK; case -EC_STATE_UNLOCK: case EC_STATE_UNLOCK: ec_unlock(fop); return EC_STATE_END; default: gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, "Unhandled state %d for %s", state, ec_fop_name(fop->id)); return EC_STATE_END; } } void ec_discard(call_frame_t *frame, xlator_t *this, uintptr_t target, uint32_t fop_flags, fop_discard_cbk_t func, void *data, fd_t *fd, off_t offset, size_t len, dict_t *xdata) { ec_cbk_t callback = {.discard = func}; ec_fop_data_t *fop = NULL; int32_t error = ENOMEM; gf_msg_trace("ec", 0, "EC(DISCARD) %p", frame); VALIDATE_OR_GOTO(this, out); GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); fop = ec_fop_data_allocate(frame, this, GF_FOP_DISCARD, 0, target, fop_flags, ec_wind_discard, ec_manager_discard, callback, data); if (fop == NULL) { goto out; } fop->use_fd = 1; fop->offset = offset; fop->size = len; if (fd != NULL) { fop->fd = fd_ref(fd); } if (xdata != NULL) { fop->xdata = dict_ref(xdata); } error = 0; out: if (fop != NULL) { ec_manager(fop, error); } else { func(frame, NULL, this, -1, error, NULL, NULL, NULL); } } /********************************************************************* * * File Operation : truncate * *********************************************************************/ int32_t ec_update_truncate_write(ec_fop_data_t *fop, uintptr_t mask) { ec_t *ec = fop->xl->private; uint64_t size = fop->offset * ec->fragments - fop->user_size; return ec_update_write(fop, mask, fop->user_size, size); } int32_t ec_truncate_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) { ec_fop_data_t *fop = cookie; int32_t err; fop->parent->good &= fop->good; if (op_ret >= 0) { fd_bind(fd); err = ec_update_truncate_write(fop->parent, fop->answer->mask); if (err != 0) { ec_fop_set_error(fop->parent, -err); } } return 0; } int32_t ec_truncate_clean(ec_fop_data_t *fop) { if (fop->fd == NULL) { fop->fd = fd_create(fop->loc[0].inode, fop->frame->root->pid); if (fop->fd == NULL) { return -ENOMEM; } ec_open(fop->frame, fop->xl, fop->answer->mask, fop->minimum, ec_truncate_open_cbk, fop, &fop->loc[0], O_RDWR, fop->fd, NULL); return 0; } else { return ec_update_truncate_write(fop, fop->answer->mask); } } int32_t ec_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prestat, struct iatt *poststat, dict_t *xdata) { return ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, prestat, poststat, xdata); } void ec_wind_truncate(ec_t *ec, ec_fop_data_t *fop, int32_t idx) { ec_trace("WIND", fop, "idx=%d", idx); STACK_WIND_COOKIE(fop->frame, ec_truncate_cbk, (void *)(uintptr_t)idx, ec->xl_list[idx], ec->xl_list[idx]->fops->truncate, &fop->loc[0], fop->offset, fop->xdata); } int32_t ec_manager_truncate(ec_fop_data_t *fop, int32_t state) { ec_cbk_data_t *cbk; off_t offset_down; switch (state) { case EC_STATE_INIT: fop->user_size = fop->offset; ec_adjust_offset_up(fop->xl->private, &fop->offset, _gf_true); fop->frag_range.first = fop->offset; fop->frag_range.last = UINT64_MAX; /* Fall through */ case EC_STATE_LOCK: offset_down = fop->user_size; ec_adjust_offset_down(fop->xl->private, &offset_down, _gf_true); if (fop->id == GF_FOP_TRUNCATE) { ec_lock_prepare_inode( fop, &fop->loc[0], EC_UPDATE_DATA | EC_UPDATE_META | EC_QUERY_INFO, offset_down, EC_RANGE_FULL); } else { ec_lock_prepare_fd( fop, fop->fd, EC_UPDATE_DATA | EC_UPDATE_META | EC_QUERY_INFO, offset_down, EC_RANGE_FULL); } ec_lock(fop); return EC_STATE_DISPATCH; case EC_STATE_DISPATCH: ec_dispatch_all(fop); return EC_STATE_PREPARE_ANSWER; case EC_STATE_PREPARE_ANSWER: cbk = ec_fop_prepare_answer(fop, _gf_false); if (cbk != NULL) { int32_t err; ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2, cbk->count); /* This shouldn't fail because we have the inode locked. */ /* Inode size doesn't need to be updated under locks, because * conflicting operations won't be in-flight */ GF_ASSERT(ec_get_inode_size(fop, fop->locks[0].lock->loc.inode, &cbk->iatt[0].ia_size)); cbk->iatt[1].ia_size = fop->user_size; /* This shouldn't fail because we have the inode locked. */ GF_ASSERT(ec_set_inode_size(fop, fop->locks[0].lock->loc.inode, fop->user_size)); if ((cbk->iatt[0].ia_size > cbk->iatt[1].ia_size) && (fop->user_size != fop->offset)) { err = ec_truncate_clean(fop); if (err != 0) { ec_cbk_set_error(cbk, -err, _gf_false); } } } return EC_STATE_REPORT; case EC_STATE_REPORT: cbk = fop->answer; GF_ASSERT(cbk != NULL); if (fop->id == GF_FOP_TRUNCATE) { if (fop->cbks.truncate != NULL) { QUORUM_CBK(fop->cbks.truncate, fop, fop->req_frame, fop, fop->xl, cbk->op_ret, cbk->op_errno, &cbk->iatt[0], &cbk->iatt[1], cbk->xdata); } } else { if (fop->cbks.ftruncate != NULL) { QUORUM_CBK(fop->cbks.ftruncate, fop, fop->req_frame, fop, fop->xl, cbk->op_ret, cbk->op_errno, &cbk->iatt[0], &cbk->iatt[1], cbk->xdata); } } return EC_STATE_LOCK_REUSE; case -EC_STATE_INIT: case -EC_STATE_LOCK: case -EC_STATE_DISPATCH: case -EC_STATE_PREPARE_ANSWER: case -EC_STATE_REPORT: GF_ASSERT(fop->error != 0); if (fop->id == GF_FOP_TRUNCATE) { if (fop->cbks.truncate != NULL) { fop->cbks.truncate(fop->req_frame, fop, fop->xl, -1, fop->error, NULL, NULL, NULL); } } else { if (fop->cbks.ftruncate != NULL) { fop->cbks.ftruncate(fop->req_frame, fop, fop->xl, -1, fop->error, NULL, NULL, NULL); } } return EC_STATE_LOCK_REUSE; case -EC_STATE_LOCK_REUSE: case EC_STATE_LOCK_REUSE: ec_lock_reuse(fop); return EC_STATE_UNLOCK; case -EC_STATE_UNLOCK: case EC_STATE_UNLOCK: ec_unlock(fop); return EC_STATE_END; default: gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, "Unhandled state %d for %s", state, ec_fop_name(fop->id)); return EC_STATE_END; } } void ec_truncate(call_frame_t *frame, xlator_t *this, uintptr_t target, uint32_t fop_flags, fop_truncate_cbk_t func, void *data, loc_t *loc, off_t offset, dict_t *xdata) { ec_cbk_t callback = {.truncate = func}; ec_fop_data_t *fop = NULL; int32_t error = ENOMEM; gf_msg_trace("ec", 0, "EC(TRUNCATE) %p", frame); VALIDATE_OR_GOTO(this, out); GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); fop = ec_fop_data_allocate(frame, this, GF_FOP_TRUNCATE, 0, target, fop_flags, ec_wind_truncate, ec_manager_truncate, callback, data); if (fop == NULL) { goto out; } fop->offset = offset; if (loc != NULL) { if (loc_copy(&fop->loc[0], loc) != 0) { gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL, "Failed to copy a location."); goto out; } } if (xdata != NULL) { fop->xdata = dict_copy_with_ref(xdata, NULL); if (fop->xdata == NULL) { gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, "Failed to reference a " "dictionary."); goto out; } } error = 0; out: if (fop != NULL) { ec_manager(fop, error); } else { func(frame, NULL, this, -1, error, NULL, NULL, NULL); } } /* FOP: ftruncate */ int32_t ec_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prestat, struct iatt *poststat, dict_t *xdata) { return ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, prestat, poststat, xdata); } void ec_wind_ftruncate(ec_t *ec, ec_fop_data_t *fop, int32_t idx) { ec_trace("WIND", fop, "idx=%d", idx); STACK_WIND_COOKIE(fop->frame, ec_ftruncate_cbk, (void *)(uintptr_t)idx, ec->xl_list[idx], ec->xl_list[idx]->fops->ftruncate, fop->fd, fop->offset, fop->xdata); } void ec_ftruncate(call_frame_t *frame, xlator_t *this, uintptr_t target, uint32_t fop_flags, fop_ftruncate_cbk_t func, void *data, fd_t *fd, off_t offset, dict_t *xdata) { ec_cbk_t callback = {.ftruncate = func}; ec_fop_data_t *fop = NULL; int32_t error = ENOMEM; gf_msg_trace("ec", 0, "EC(FTRUNCATE) %p", frame); VALIDATE_OR_GOTO(this, out); GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); fop = ec_fop_data_allocate(frame, this, GF_FOP_FTRUNCATE, 0, target, fop_flags, ec_wind_ftruncate, ec_manager_truncate, callback, data); if (fop == NULL) { goto out; } fop->use_fd = 1; fop->offset = offset; if (fd != NULL) { fop->fd = fd_ref(fd); if (fop->fd == NULL) { gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL, "Failed to reference a " "file descriptor."); goto out; } } if (xdata != NULL) { fop->xdata = dict_copy_with_ref(xdata, NULL); if (fop->xdata == NULL) { gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, "Failed to reference a " "dictionary."); goto out; } } error = 0; out: if (fop != NULL) { ec_manager(fop, error); } else { func(frame, NULL, this, -1, error, NULL, NULL, NULL); } } /* FOP: writev */ static ec_stripe_t * ec_allocate_stripe(ec_t *ec, ec_stripe_list_t *stripe_cache) { ec_stripe_t *stripe = NULL; if (stripe_cache->count >= stripe_cache->max) { GF_ASSERT(!list_empty(&stripe_cache->lru)); stripe = list_first_entry(&stripe_cache->lru, ec_stripe_t, lru); list_move_tail(&stripe->lru, &stripe_cache->lru); GF_ATOMIC_INC(ec->stats.stripe_cache.evicts); } else { stripe = GF_MALLOC(sizeof(ec_stripe_t) + ec->stripe_size, ec_mt_ec_stripe_t); if (stripe != NULL) { stripe_cache->count++; list_add_tail(&stripe->lru, &stripe_cache->lru); GF_ATOMIC_INC(ec->stats.stripe_cache.allocs); } else { GF_ATOMIC_INC(ec->stats.stripe_cache.errors); } } return stripe; } static void ec_write_stripe_data(ec_t *ec, ec_fop_data_t *fop, ec_stripe_t *stripe) { off_t base; base = fop->size - ec->stripe_size; memcpy(stripe->data, fop->vector[0].iov_base + base, ec->stripe_size); stripe->frag_offset = fop->frag_range.last - ec->fragment_size; } static void ec_add_stripe_in_cache(ec_t *ec, ec_fop_data_t *fop) { ec_inode_t *ctx = NULL; ec_stripe_t *stripe = NULL; ec_stripe_list_t *stripe_cache = NULL; gf_boolean_t failed = _gf_true; LOCK(&fop->fd->inode->lock); ctx = __ec_inode_get(fop->fd->inode, fop->xl); if (ctx == NULL) { goto out; } stripe_cache = &ctx->stripe_cache; if (stripe_cache->max > 0) { stripe = ec_allocate_stripe(ec, stripe_cache); if (stripe == NULL) { goto out; } ec_write_stripe_data(ec, fop, stripe); } failed = _gf_false; out: UNLOCK(&fop->fd->inode->lock); if (failed) { gf_msg(ec->xl->name, GF_LOG_DEBUG, ENOMEM, EC_MSG_FILE_DESC_REF_FAIL, "Failed to create and add stripe in cache"); } } int32_t ec_writev_merge_tail(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iovec *vector, int32_t count, struct iatt *stbuf, struct iobref *iobref, dict_t *xdata) { ec_t *ec = this->private; ec_fop_data_t *fop = frame->local; uint64_t size, base, tmp; if (op_ret >= 0) { tmp = 0; size = fop->size - fop->user_size - fop->head; base = ec->stripe_size - size; if (op_ret > base) { tmp = min(op_ret - base, size); ec_iov_copy_to(fop->vector[0].iov_base + fop->size - size, vector, count, base, tmp); size -= tmp; } if (size > 0) { memset(fop->vector[0].iov_base + fop->size - size, 0, size); } if (ec->stripe_cache) { ec_add_stripe_in_cache(ec, fop); } } return 0; } int32_t ec_writev_merge_head(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iovec *vector, int32_t count, struct iatt *stbuf, struct iobref *iobref, dict_t *xdata) { ec_t *ec = this->private; ec_fop_data_t *fop = frame->local; uint64_t size, base; if (op_ret >= 0) { size = fop->head; base = 0; if (op_ret > 0) { base = min(op_ret, size); ec_iov_copy_to(fop->vector[0].iov_base, vector, count, 0, base); size -= base; } if (size > 0) { memset(fop->vector[0].iov_base + base, 0, size); } size = fop->size - fop->user_size - fop->head; if ((size > 0) && (fop->size == ec->stripe_size)) { ec_writev_merge_tail(frame, cookie, this, op_ret, op_errno, vector, count, stbuf, iobref, xdata); } } return 0; } static int ec_make_internal_fop_xdata(dict_t **xdata) { dict_t *dict = NULL; if (*xdata) return 0; dict = dict_new(); if (!dict) goto out; if (dict_set_str(dict, GLUSTERFS_INTERNAL_FOP_KEY, "yes")) goto out; *xdata = dict; return 0; out: if (dict) dict_unref(dict); return -1; } static int32_t ec_writev_prepare_buffers(ec_t *ec, ec_fop_data_t *fop) { struct iobref *iobref = NULL; struct iovec *iov; void *ptr; int32_t err; fop->user_size = iov_length(fop->vector, fop->int32); fop->head = ec_adjust_offset_down(ec, &fop->offset, _gf_false); fop->frag_range.first = fop->offset / ec->fragments; fop->size = fop->user_size + fop->head; ec_adjust_size_up(ec, &fop->size, _gf_false); fop->frag_range.last = fop->frag_range.first + fop->size / ec->fragments; if ((fop->int32 != 1) || (fop->head != 0) || (fop->size > fop->user_size) || !EC_ALIGN_CHECK(fop->vector[0].iov_base, EC_METHOD_WORD_SIZE)) { err = ec_buffer_alloc(ec->xl, fop->size, &iobref, &ptr); if (err != 0) { goto out; } ec_iov_copy_to(ptr + fop->head, fop->vector, fop->int32, 0, fop->user_size); fop->vector[0].iov_base = ptr; fop->vector[0].iov_len = fop->size; iobref_unref(fop->buffers); fop->buffers = iobref; } if (fop->int32 != 2) { iov = GF_MALLOC(VECTORSIZE(2), gf_common_mt_iovec); if (iov == NULL) { err = -ENOMEM; goto out; } iov[0].iov_base = fop->vector[0].iov_base; iov[0].iov_len = fop->vector[0].iov_len; GF_FREE(fop->vector); fop->vector = iov; } fop->vector[1].iov_len = fop->size / ec->fragments; err = ec_buffer_alloc(ec->xl, fop->vector[1].iov_len * ec->nodes, &fop->buffers, &fop->vector[1].iov_base); if (err != 0) { goto out; } err = 0; out: return err; } static void ec_merge_stripe_head_locked(ec_t *ec, ec_fop_data_t *fop, ec_stripe_t *stripe) { uint32_t head, size; head = fop->head; memcpy(fop->vector[0].iov_base, stripe->data, head); size = ec->stripe_size - head; if (size > fop->user_size) { head += fop->user_size; size = ec->stripe_size - head; memcpy(fop->vector[0].iov_base + head, stripe->data + head, size); } } static void ec_merge_stripe_tail_locked(ec_t *ec, ec_fop_data_t *fop, ec_stripe_t *stripe) { uint32_t head, tail; off_t offset; offset = fop->user_size + fop->head; tail = fop->size - offset; head = ec->stripe_size - tail; memcpy(fop->vector[0].iov_base + offset, stripe->data + head, tail); } static ec_stripe_t * ec_get_stripe_from_cache_locked(ec_t *ec, ec_fop_data_t *fop, uint64_t frag_offset) { ec_inode_t *ctx = NULL; ec_stripe_t *stripe = NULL; ec_stripe_list_t *stripe_cache = NULL; ctx = __ec_inode_get(fop->fd->inode, fop->xl); if (ctx == NULL) { GF_ATOMIC_INC(ec->stats.stripe_cache.errors); return NULL; } stripe_cache = &ctx->stripe_cache; list_for_each_entry(stripe, &stripe_cache->lru, lru) { if (stripe->frag_offset == frag_offset) { list_move_tail(&stripe->lru, &stripe_cache->lru); GF_ATOMIC_INC(ec->stats.stripe_cache.hits); return stripe; } } GF_ATOMIC_INC(ec->stats.stripe_cache.misses); return NULL; } static gf_boolean_t ec_get_and_merge_stripe(ec_t *ec, ec_fop_data_t *fop, ec_stripe_part_t which) { uint64_t frag_offset; ec_stripe_t *stripe = NULL; gf_boolean_t found = _gf_false; if (!ec->stripe_cache) { return found; } LOCK(&fop->fd->inode->lock); if (which == EC_STRIPE_HEAD) { frag_offset = fop->frag_range.first; stripe = ec_get_stripe_from_cache_locked(ec, fop, frag_offset); if (stripe) { ec_merge_stripe_head_locked(ec, fop, stripe); found = _gf_true; } } if (which == EC_STRIPE_TAIL) { frag_offset = fop->frag_range.last - ec->fragment_size; stripe = ec_get_stripe_from_cache_locked(ec, fop, frag_offset); if (stripe) { ec_merge_stripe_tail_locked(ec, fop, stripe); found = _gf_true; } } UNLOCK(&fop->fd->inode->lock); return found; } static uintptr_t ec_get_lock_good_mask(inode_t *inode, xlator_t *xl) { ec_lock_t *lock = NULL; ec_inode_t *ictx = NULL; LOCK(&inode->lock); { ictx = __ec_inode_get(inode, xl); if (ictx) lock = ictx->inode_lock; } UNLOCK(&inode->lock); if (lock) return lock->good_mask; return 0; } void ec_writev_start(ec_fop_data_t *fop) { ec_t *ec = fop->xl->private; ec_fd_t *ctx; fd_t *fd; dict_t *xdata = NULL; uint64_t tail, current; int32_t err = -ENOMEM; gf_boolean_t found_stripe = _gf_false; /* This shouldn't fail because we have the inode locked. */ GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode, ¤t)); fd = fd_anonymous(fop->fd->inode); if (fd == NULL) { goto failed; } fop->frame->root->uid = 0; fop->frame->root->gid = 0; ctx = ec_fd_get(fop->fd, fop->xl); if (ctx != NULL) { if ((ctx->flags & O_APPEND) != 0) { /* Appending writes take full locks so size won't change because * of any parallel operations */ fop->offset = current; } } err = ec_writev_prepare_buffers(ec, fop); if (err != 0) { goto failed_fd; } tail = fop->size - fop->user_size - fop->head; if (fop->head > 0) { if (current > fop->offset) { found_stripe = ec_get_and_merge_stripe(ec, fop, EC_STRIPE_HEAD); if (!found_stripe) { if (ec_make_internal_fop_xdata(&xdata)) { err = -ENOMEM; goto failed_xdata; } ec_readv(fop->frame, fop->xl, ec_get_lock_good_mask(fop->fd->inode, fop->xl), EC_MINIMUM_MIN, ec_writev_merge_head, NULL, fd, ec->stripe_size, fop->offset, 0, xdata); } } else { memset(fop->vector[0].iov_base, 0, fop->head); memset(fop->vector[0].iov_base + fop->size - tail, 0, tail); if (ec->stripe_cache && (fop->size <= ec->stripe_size)) { ec_add_stripe_in_cache(ec, fop); } } } if ((tail > 0) && ((fop->head == 0) || (fop->size > ec->stripe_size))) { /* Current locking scheme will make sure the 'current' below will * never decrease while the fop is in progress, so the checks will * work as expected */ if (current > fop->offset + fop->head + fop->user_size) { found_stripe = ec_get_and_merge_stripe(ec, fop, EC_STRIPE_TAIL); if (!found_stripe) { if (ec_make_internal_fop_xdata(&xdata)) { err = -ENOMEM; goto failed_xdata; } ec_readv(fop->frame, fop->xl, ec_get_lock_good_mask(fop->fd->inode, fop->xl), EC_MINIMUM_MIN, ec_writev_merge_tail, NULL, fd, ec->stripe_size, fop->offset + fop->size - ec->stripe_size, 0, xdata); } } else { memset(fop->vector[0].iov_base + fop->size - tail, 0, tail); if (ec->stripe_cache) { ec_add_stripe_in_cache(ec, fop); } } } err = 0; failed_xdata: if (xdata) { dict_unref(xdata); } failed_fd: fd_unref(fd); failed: ec_fop_set_error(fop, -err); } int32_t ec_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prestat, struct iatt *poststat, dict_t *xdata) { ec_t *ec = NULL; if (this && this->private) { ec = this->private; if ((op_ret > 0) && ((op_ret % ec->fragment_size) != 0)) { op_ret = -1; op_errno = EIO; } } return ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, prestat, poststat, xdata); } void ec_wind_writev(ec_t *ec, ec_fop_data_t *fop, int32_t idx) { ec_trace("WIND", fop, "idx=%d", idx); struct iovec vector[1]; size_t size; size = fop->vector[1].iov_len; vector[0].iov_base = fop->vector[1].iov_base + idx * size; vector[0].iov_len = size; STACK_WIND_COOKIE(fop->frame, ec_writev_cbk, (void *)(uintptr_t)idx, ec->xl_list[idx], ec->xl_list[idx]->fops->writev, fop->fd, vector, 1, fop->offset / ec->fragments, fop->uint32, fop->buffers, fop->xdata); } static void ec_writev_encode(ec_fop_data_t *fop) { ec_t *ec = fop->xl->private; void *blocks[ec->nodes]; uint32_t i; blocks[0] = fop->vector[1].iov_base; for (i = 1; i < ec->nodes; i++) { blocks[i] = blocks[i - 1] + fop->vector[1].iov_len; } ec_method_encode(&ec->matrix, fop->vector[0].iov_len, fop->vector[0].iov_base, blocks); } int32_t ec_manager_writev(ec_fop_data_t *fop, int32_t state) { ec_cbk_data_t *cbk; ec_fd_t *ctx = NULL; ec_t *ec = fop->xl->private; off_t fl_start = 0; uint64_t fl_size = LONG_MAX; switch (state) { case EC_STATE_INIT: case EC_STATE_LOCK: ctx = ec_fd_get(fop->fd, fop->xl); if (ctx != NULL) { if ((ctx->flags & O_APPEND) == 0) { off_t user_size = 0; off_t head = 0; fl_start = fop->offset; user_size = iov_length(fop->vector, fop->int32); head = ec_adjust_offset_down(ec, &fl_start, _gf_true); fl_size = user_size + head; ec_adjust_size_up(ec, &fl_size, _gf_true); } } ec_lock_prepare_fd(fop, fop->fd, EC_UPDATE_DATA | EC_UPDATE_META | EC_QUERY_INFO, fl_start, fl_size); ec_lock(fop); return EC_STATE_DISPATCH; case EC_STATE_DISPATCH: ec_writev_start(fop); return EC_STATE_DELAYED_START; case EC_STATE_DELAYED_START: /* Restore uid, gid if they were changed to do some partial * reads. */ fop->frame->root->uid = fop->uid; fop->frame->root->gid = fop->gid; ec_writev_encode(fop); ec_dispatch_all(fop); return EC_STATE_PREPARE_ANSWER; case EC_STATE_PREPARE_ANSWER: cbk = ec_fop_prepare_answer(fop, _gf_false); if (cbk != NULL) { ec_t *ec = fop->xl->private; uint64_t size; ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2, cbk->count); /* This shouldn't fail because we have the inode locked. */ LOCK(&fop->fd->inode->lock); { GF_ASSERT(__ec_get_inode_size(fop, fop->fd->inode, &cbk->iatt[0].ia_size)); cbk->iatt[1].ia_size = cbk->iatt[0].ia_size; size = fop->offset + fop->head + fop->user_size; if (size > cbk->iatt[0].ia_size) { /* Only update inode size if this is a top level fop. * Otherwise this is an internal write and the top * level fop should take care of the real inode size. */ if (fop->parent == NULL) { /* This shouldn't fail because we have the inode * locked. */ GF_ASSERT( __ec_set_inode_size(fop, fop->fd->inode, size)); } cbk->iatt[1].ia_size = size; } } UNLOCK(&fop->fd->inode->lock); if (fop->error == 0) { cbk->op_ret *= ec->fragments; if (cbk->op_ret < fop->head) { cbk->op_ret = 0; } else { cbk->op_ret -= fop->head; } if (cbk->op_ret > fop->user_size) { cbk->op_ret = fop->user_size; } } } return EC_STATE_REPORT; case EC_STATE_REPORT: cbk = fop->answer; GF_ASSERT(cbk != NULL); if (fop->cbks.writev != NULL) { QUORUM_CBK(fop->cbks.writev, fop, fop->req_frame, fop, fop->xl, cbk->op_ret, cbk->op_errno, &cbk->iatt[0], &cbk->iatt[1], cbk->xdata); } return EC_STATE_LOCK_REUSE; case -EC_STATE_DELAYED_START: /* We have failed while doing partial reads. We need to restore * original uid, gid. */ fop->frame->root->uid = fop->uid; fop->frame->root->gid = fop->gid; /* Fall through */ case -EC_STATE_INIT: case -EC_STATE_LOCK: case -EC_STATE_DISPATCH: case -EC_STATE_PREPARE_ANSWER: case -EC_STATE_REPORT: GF_ASSERT(fop->error != 0); if (fop->cbks.writev != NULL) { fop->cbks.writev(fop->req_frame, fop, fop->xl, -1, fop->error, NULL, NULL, NULL); } return EC_STATE_LOCK_REUSE; case -EC_STATE_LOCK_REUSE: case EC_STATE_LOCK_REUSE: ec_lock_reuse(fop); return EC_STATE_UNLOCK; case -EC_STATE_UNLOCK: case EC_STATE_UNLOCK: ec_unlock(fop); return EC_STATE_END; default: gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, "Unhandled state %d for %s", state, ec_fop_name(fop->id)); return EC_STATE_END; } } void ec_writev(call_frame_t *frame, xlator_t *this, uintptr_t target, uint32_t fop_flags, fop_writev_cbk_t func, void *data, fd_t *fd, struct iovec *vector, int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, dict_t *xdata) { ec_cbk_t callback = {.writev = func}; ec_fop_data_t *fop = NULL; int32_t error = ENOMEM; gf_msg_trace("ec", 0, "EC(WRITE) %p", frame); VALIDATE_OR_GOTO(this, out); GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); fop = ec_fop_data_allocate(frame, this, GF_FOP_WRITE, 0, target, fop_flags, ec_wind_writev, ec_manager_writev, callback, data); if (fop == NULL) { goto out; } fop->int32 = count; fop->offset = offset; fop->uint32 = flags; fop->use_fd = 1; if (fd != NULL) { fop->fd = fd_ref(fd); if (fop->fd == NULL) { gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL, "Failed to reference a " "file descriptor."); goto out; } } if (count > 0) { fop->vector = iov_dup(vector, count); if (fop->vector == NULL) { gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY, "Failed to duplicate a " "vector list."); goto out; } fop->int32 = count; } if (iobref != NULL) { fop->buffers = iobref_ref(iobref); if (fop->buffers == NULL) { gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_BUF_REF_FAIL, "Failed to reference a " "buffer."); goto out; } } if (xdata != NULL) { fop->xdata = dict_copy_with_ref(xdata, NULL); if (fop->xdata == NULL) { gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, "Failed to reference a " "dictionary."); goto out; } } error = 0; out: if (fop != NULL) { ec_manager(fop, error); } else { func(frame, NULL, this, -1, error, NULL, NULL, NULL); } }