summaryrefslogtreecommitdiffstats
path: root/xlators/features/bit-rot/src/stub/bit-rot-stub.c
diff options
context:
space:
mode:
authorRaghavendra Bhat <raghavendra@redhat.com>2015-04-09 15:38:47 +0530
committerVijay Bellur <vbellur@redhat.com>2015-05-08 11:27:42 -0700
commitc93c433a44770de931f837be179c5ccdba958cad (patch)
treef6c5f513e8b2ccadc3b53d73db609f731d54ba8b /xlators/features/bit-rot/src/stub/bit-rot-stub.c
parent680b3bf629f0fef038470baab62c6d6d8f5988ce (diff)
features/bit-rot-stub: versioning of objects in write/truncate fop instead of open
* This patch brings in the changes where object versioning is done in write and truncate fops instead of tracking them in open and create fops. This model works for both regular and anonymous fds. It also removes the race associated with open calls, create and lookups. This patch follows the below method for object versioning and notifications: Before sending writev on the fd, increase the ongoing version first. This makes anonymous fd write similar to the regular fd write by having the ongoing version increased before doing the write. Do following steps to do versioning: 1) For anonymous fds set the fd context (so that release is invoked) and add the fd context to the list maintained in the inode context. For regular fds the above think would have been done in open itself. 2) Increase the on-disk ongoing version 3) Increase the in memory ongoing version and mark inode as non-dirty 3) Once versioning is successfully done send write operation. If versioning fails, then fail the write fop. 5) In writev_cbk mark inode as modified. Change-Id: I7104391bbe076d8fc49b68745d2ec29a6e92476c BUG: 1207979 Signed-off-by: Raghavendra Bhat <raghavendra@redhat.com> Reviewed-on: http://review.gluster.org/10233 Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Vijay Bellur <vbellur@redhat.com>
Diffstat (limited to 'xlators/features/bit-rot/src/stub/bit-rot-stub.c')
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-stub.c1070
1 files changed, 804 insertions, 266 deletions
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub.c b/xlators/features/bit-rot/src/stub/bit-rot-stub.c
index f9c3886..93db072 100644
--- a/xlators/features/bit-rot/src/stub/bit-rot-stub.c
+++ b/xlators/features/bit-rot/src/stub/bit-rot-stub.c
@@ -198,14 +198,15 @@ br_stub_init_inode_versions (xlator_t *this, fd_t *fd, inode_t *inode,
if (!ctx)
goto error_return;
+ INIT_LIST_HEAD (&ctx->fd_list);
(markdirty) ? __br_stub_mark_inode_dirty (ctx)
: __br_stub_mark_inode_synced (ctx);
__br_stub_set_ongoing_version (ctx, version);
- __br_stub_reset_release_counters (ctx);
if (fd) {
- br_stub_require_release_call (this, fd);
- __br_stub_track_openfd (fd, ctx);
+ ret = br_stub_add_fd_to_inode (this, fd, ctx);
+ if (ret)
+ goto free_ctx;
}
ret = br_stub_set_inode_ctx (this, inode, ctx);
if (ret)
@@ -238,7 +239,6 @@ br_stub_mod_inode_versions (xlator_t *this,
__br_stub_mark_inode_synced (ctx);
}
- __br_stub_track_openfd (fd, ctx);
ret = 0;
}
unblock:
@@ -250,19 +250,16 @@ br_stub_mod_inode_versions (xlator_t *this,
static inline void
br_stub_fill_local (br_stub_local_t *local,
call_stub_t *stub, fd_t *fd, inode_t *inode, uuid_t gfid,
- int versioningtype, unsigned long memversion, int dirty)
+ int versioningtype, unsigned long memversion)
{
local->fopstub = stub;
local->versioningtype = versioningtype;
local->u.context.version = memversion;
- if (fd)
+ if (fd && !local->u.context.fd)
local->u.context.fd = fd_ref (fd);
if (inode)
local->u.context.inode = inode_ref (inode);
gf_uuid_copy (local->u.context.gfid, gfid);
-
- /* mark inode dirty/fresh according to durability */
- local->u.context.markdirty = (dirty) ? _gf_true : _gf_false;
}
static inline void
@@ -279,57 +276,13 @@ br_stub_cleanup_local (br_stub_local_t *local)
inode_unref (local->u.context.inode);
local->u.context.inode = NULL;
}
- local->u.context.markdirty = _gf_true;
memset (local->u.context.gfid, '\0', sizeof (uuid_t));
}
/**
- * callback for inode/fd full versioning
+ * callback for inode/fd versioning
*/
int
-br_stub_inode_fullversioning_cbk (call_frame_t *frame,
- void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xdata)
-{
- fd_t *fd = NULL;
- inode_t *inode = NULL;
- unsigned long version = 0;
- gf_boolean_t dirty = _gf_true;
- br_stub_local_t *local = NULL;
-
- local = (br_stub_local_t *)frame->local;
-
- /* be graceful to EEXIST */
- if ((op_ret < 0) && (op_errno == EEXIST)) {
- op_ret = 0;
- goto done;
- }
-
- if (op_ret < 0)
- goto done;
-
- fd = local->u.context.fd;
- inode = local->u.context.inode;
- version = local->u.context.version;
- dirty = local->u.context.markdirty;
-
- op_ret = br_stub_init_inode_versions (this, fd, inode, version, dirty);
- if (op_ret < 0)
- op_errno = EINVAL;
-
- done:
- frame->local = NULL;
- if (op_ret < 0)
- call_unwind_error (local->fopstub, op_ret, op_errno);
- else
- call_resume (local->fopstub);
- br_stub_cleanup_local (local);
- br_stub_dealloc_local (local);
-
- return 0;
-}
-
-int
br_stub_fd_incversioning_cbk (call_frame_t *frame,
void *cookie, xlator_t *this,
int op_ret, int op_errno, dict_t *xdata)
@@ -351,14 +304,14 @@ br_stub_fd_incversioning_cbk (call_frame_t *frame,
op_errno = EINVAL;
done:
- frame->local = NULL;
- if (op_ret < 0)
+ if (op_ret < 0) {
+ frame->local = NULL;
call_unwind_error (local->fopstub, -1, op_errno);
- else
+ br_stub_cleanup_local (local);
+ br_stub_dealloc_local (local);
+ } else {
call_resume (local->fopstub);
- br_stub_cleanup_local (local);
- br_stub_dealloc_local (local);
-
+ }
return 0;
}
@@ -366,28 +319,27 @@ br_stub_fd_incversioning_cbk (call_frame_t *frame,
* Initial object versioning
*
* Version persists two (2) extended attributes as explained below:
- * 1. Current (ongoing) version: This is incremented on an open()
- * or creat() and is the running version for an object.
+ * 1. Current (ongoing) version: This is incremented on an writev ()
+ * or truncate () and is the running version for an object.
* 2. Signing version: This is the version against which an object
* was signed (checksummed).
*
* During initial versioning, both ongoing and signing versions are
- * set of one and zero respectively. An open() call increments the
+ * set of one and zero respectively. A write() call increments the
* ongoing version as an indication of modification to the object.
* Additionally this needs to be persisted on disk and needs to be
* durable: fsync().. :-/
- * As an optimization only the first open() synchronizes the ongoing
- * version to disk, subsequent open()s before the *last* release()
+ * As an optimization only the first write() synchronizes the ongoing
+ * version to disk, subsequent write()s before the *last* release()
* are no-op's.
*
* create(), just like lookup() initializes the object versions to
- * the default, but persists the version to disk. As an optimization
- * this is not a durable operation: in case of a crash, hard reboot
- * etc.. absence of versioning xattrs is ignored in scrubber along
- * with the one time crawler explicitly triggering signing for such
- * objects.
+ * the default. As an optimization this is not a durable operation:
+ * in case of a crash, hard reboot etc.. absence of versioning xattrs
+ * is ignored in scrubber along with the one time crawler explicitly
+ * triggering signing for such objects.
*
- * c.f. br_stub_open_cbk() / br_stub_create_cbk()
+ * c.f. br_stub_writev() / br_stub_truncate()
*/
/**
@@ -400,7 +352,7 @@ int
br_stub_fd_versioning (xlator_t *this, call_frame_t *frame,
call_stub_t *stub, dict_t *dict, fd_t *fd,
br_stub_version_cbk *callback, unsigned long memversion,
- int versioningtype, int durable, int dirty)
+ int versioningtype, int durable)
{
int32_t ret = -1;
int flags = 0;
@@ -421,18 +373,11 @@ br_stub_fd_versioning (xlator_t *this, call_frame_t *frame,
goto dealloc_xdata;
}
- local = br_stub_alloc_local (this);
- if (!local) {
- ret = -1;
- goto dealloc_xdata;
- }
-
- if (versioningtype == BR_STUB_FULL_VERSIONING)
- flags |= XATTR_CREATE;
+ local = frame->local;
br_stub_fill_local (local, stub, fd,
fd->inode, fd->inode->gfid,
- versioningtype, memversion, dirty);
+ versioningtype, memversion);
frame->local = local;
STACK_WIND (frame, callback,
@@ -448,82 +393,21 @@ br_stub_fd_versioning (xlator_t *this, call_frame_t *frame,
}
static inline int
-br_stub_perform_fullversioning (xlator_t *this, call_frame_t *frame,
- call_stub_t *stub, fd_t *fd)
-{
- int32_t ret = -1;
- dict_t *dict = NULL;
- br_version_t *obuf = NULL;
- int op_errno = 0;
-
- op_errno = ENOMEM;
- dict = dict_new ();
- if (!dict)
- goto done;
- ret = br_stub_alloc_versions (&obuf, NULL, 0);
- if (ret)
- goto dealloc_dict;
-
- op_errno = EINVAL;
- ret = br_stub_prepare_version_request (this, dict, obuf,
- BITROT_DEFAULT_CURRENT_VERSION);
- if (ret)
- goto dealloc_versions;
-
- /**
- * Version extended attributes need not be durable at this point of
- * time. If the objects (inode) data gets persisted on disk but the
- * version extended attributes are lost due to a crash/power failure,
- * a subsequent lookup marks the objects signature as stale. This way,
- * dentry operation times do not shoot up.
- */
- ret = br_stub_fd_versioning (this, frame, stub, dict, fd,
- br_stub_inode_fullversioning_cbk,
- BITROT_DEFAULT_CURRENT_VERSION,
- BR_STUB_FULL_VERSIONING, !WRITEBACK_DURABLE, 0);
-
- dealloc_versions:
- br_stub_dealloc_versions (obuf);
- dealloc_dict:
- dict_unref (dict);
- done:
- if (ret)
- call_unwind_error (stub, -1, op_errno);
- return ret;
-}
-
-static inline int
br_stub_perform_incversioning (xlator_t *this,
call_frame_t *frame, call_stub_t *stub,
fd_t *fd, br_stub_inode_ctx_t *ctx)
{
- int32_t ret = -1;
- dict_t *dict = NULL;
- inode_t *inode = NULL;
- br_version_t *obuf = NULL;
- unsigned long writeback_version = 0;
- int op_errno = 0;
-
- inode = fd->inode;
+ int32_t ret = -1;
+ dict_t *dict = NULL;
+ br_version_t *obuf = NULL;
+ unsigned long writeback_version = 0;
+ int op_errno = 0;
+ br_stub_local_t *local = NULL;
op_errno = EINVAL;
- ret = br_stub_require_release_call (this, fd);
- if (ret)
- goto done;
-
- LOCK (&inode->lock);
- {
- if (__br_stub_is_inode_dirty (ctx))
- writeback_version = __br_stub_writeback_version (ctx);
- else
- __br_stub_track_openfd (fd, ctx);
- }
- UNLOCK (&inode->lock);
+ local = frame->local;
- if (!writeback_version) {
- ret = 0;
- goto done;
- }
+ writeback_version = __br_stub_writeback_version (ctx);
/* inode requires writeback to disk */
op_errno = ENOMEM;
@@ -541,17 +425,23 @@ br_stub_perform_incversioning (xlator_t *this,
ret = br_stub_fd_versioning
(this, frame, stub, dict,
fd, br_stub_fd_incversioning_cbk, writeback_version,
- BR_STUB_INCREMENTAL_VERSIONING, WRITEBACK_DURABLE, 0);
+ BR_STUB_INCREMENTAL_VERSIONING, !WRITEBACK_DURABLE);
dealloc_versions:
br_stub_dealloc_versions (obuf);
dealloc_dict:
dict_unref (dict);
done:
- if (!ret && !writeback_version)
- call_resume (stub);
- if (ret)
+ if (ret) {
+ if (local)
+ frame->local = NULL;
call_unwind_error (stub, -1, op_errno);
+ if (local) {
+ br_stub_cleanup_local (local);
+ br_stub_dealloc_local (local);
+ }
+ }
+
return ret;
}
@@ -560,6 +450,44 @@ br_stub_perform_incversioning (xlator_t *this,
/* fsetxattr() */
static inline int
+br_stub_compare_sign_version (xlator_t *this, inode_t *inode,
+ br_signature_t *sbuf, dict_t *dict)
+{
+ int32_t ret = -1;
+ br_stub_inode_ctx_t *ctx = NULL;
+ uint64_t tmp_ctx = 0;
+
+ GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, out);
+ GF_VALIDATE_OR_GOTO (this->name, inode, out);
+ GF_VALIDATE_OR_GOTO (this->name, sbuf, out);
+ GF_VALIDATE_OR_GOTO (this->name, dict, out);
+
+ ret = br_stub_get_inode_ctx (this, inode, &tmp_ctx);
+ if (ret) {
+ dict_del (dict, BITROT_SIGNING_VERSION_KEY);
+ goto out;
+ }
+
+ ret = -1;
+ ctx = (br_stub_inode_ctx_t *)(long)tmp_ctx;
+
+ LOCK (&inode->lock);
+ {
+ if (ctx->currentversion == sbuf->signedversion)
+ ret = 0;
+ else
+ gf_log (this->name, GF_LOG_WARNING, "current version "
+ "%lu and version of the signature %lu are not "
+ "same", ctx->currentversion,
+ sbuf->signedversion);
+ }
+ UNLOCK (&inode->lock);
+
+out:
+ return ret;
+}
+
+static inline int
br_stub_prepare_signature (xlator_t *this, dict_t *dict,
inode_t *inode, br_isignature_t *sign)
{
@@ -577,6 +505,11 @@ br_stub_prepare_signature (xlator_t *this, dict_t *dict,
ret = br_stub_prepare_signing_request (dict, sbuf, sign, signaturelen);
if (ret)
goto dealloc_versions;
+
+ ret = br_stub_compare_sign_version (this, inode, sbuf, dict);
+ if (ret)
+ goto dealloc_versions;
+
return 0;
dealloc_versions:
@@ -620,6 +553,8 @@ br_stub_fsetxattr (call_frame_t *frame, xlator_t *this,
if (ret)
goto unwind;
+ gf_log (this->name, GF_LOG_DEBUG, "SIGNED VERSION: %lu",
+ sign->signedversion);
wind:
STACK_WIND (frame, default_setxattr_cbk,
FIRST_CHILD (this), FIRST_CHILD (this)->fops->fsetxattr, fd,
@@ -865,77 +800,598 @@ br_stub_fgetxattr (call_frame_t *frame, xlator_t *this,
return 0;
}
-/** }}} */
+/**
+ * The first write response on the first fd in the list of fds will set
+ * the flag to indicate that the inode is modified. The subsequent write
+ * respnses coming on either the first fd or some other fd will not change
+ * the fd. The inode-modified flag is unset only upon release of all the
+ * fds.
+ */
+int32_t
+br_stub_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ int32_t ret = 0;
+ uint64_t ctx_addr = 0;
+ br_stub_inode_ctx_t *ctx = NULL;
+ br_stub_local_t *local = NULL;
+
+ if (frame->local) {
+ local = frame->local;
+ frame->local = NULL;
+ }
+ if (op_ret < 0)
+ goto unwind;
-/** {{{ */
+ ret = br_stub_get_inode_ctx (this, local->u.context.fd->inode,
+ &ctx_addr);
+ if (ret < 0)
+ goto unwind;
-/* open() */
+ ctx = (br_stub_inode_ctx_t *) (long) ctx_addr;
-int
-br_stub_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, fd_t *fd, dict_t *xdata)
+ /* Mark the flag to indicate the inode has been modified */
+ LOCK (&local->u.context.fd->inode->lock);
+ {
+ if (!__br_stub_is_inode_modified (ctx))
+ __br_stub_set_inode_modified (ctx);
+ }
+ UNLOCK (&local->u.context.fd->inode->lock);
+
+
+unwind:
+ STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+ br_stub_cleanup_local (local);
+ br_stub_dealloc_local (local);
+ return 0;
+}
+
+/**
+ * Ongoing version is increased only for the first modify operation.
+ * First modify version means the first write or truncate call coming on the
+ * first fd in the list of inodes.
+ * For anonymous fds open would not have come, so check if its the first write
+ * by doing both inode dirty check and ensuring list of fds is empty
+ */
+static inline gf_boolean_t
+br_stub_inc_version (xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx)
{
- int32_t ret = 0;
- uint64_t ctx_addr = 0;
- br_stub_inode_ctx_t *ctx = NULL;
- call_stub_t *stub = NULL;
+ gf_boolean_t inc_version = _gf_false;
+
+ GF_VALIDATE_OR_GOTO (this->name, fd, out);
+ GF_VALIDATE_OR_GOTO (this->name, ctx, out);
+
+ LOCK (&fd->inode->lock);
+ {
+ if (__br_stub_is_inode_dirty (ctx))
+ inc_version = _gf_true;
+ }
+ UNLOCK (&fd->inode->lock);
+
+out:
+ return inc_version;
+}
+
+/**
+ * Since NFS does not do open, writes from NFS are sent over an anonymous
+ * fd. It means each write fop might come on a different anonymous fd and
+ * will lead to very large number of notifications being sent. It might
+ * affect the perfromance as, there will too many sign requests.
+ * To avoid that whenever the last fd released from an inode (logical release)
+ * is an anonymous fd the release notification is sent with a flag being set
+ * __br_stub_anon_release (ctx);
+ * BitD checks for the flag and if set, it will send a dummy write request
+ * (again on an anonymous fd) instead of triggering sign.
+ * Bit-rot-stub should identify such dummy writes and should send success to
+ * them instead of winding them downwards.
+ */
+gf_boolean_t
+br_stub_dummy_write (call_frame_t *frame)
+{
+ return (frame->root->pid == GF_CLIENT_PID_BITD)
+ ? _gf_true : _gf_false;
+}
+
+int32_t
+br_stub_anon_fd_ctx (xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx)
+{
+ int32_t ret = -1;
+ br_stub_fd_t *br_stub_fd = NULL;
+
+ br_stub_fd = br_stub_fd_ctx_get (this, fd);
+ if (!br_stub_fd) {
+ ret = br_stub_add_fd_to_inode (this, fd, ctx);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to "
+ "add fd to the inode (gfid: %s)",
+ uuid_utoa (fd->inode->gfid));
+ goto out;
+ }
+ }
+
+ ret = 0;
+
+out:
+ return ret;
+}
+
+int32_t
+br_stub_writev_resume (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iovec *vector, int32_t count, off_t offset,
+ uint32_t flags, struct iobref *iobref, dict_t *xdata)
+{
+ if (frame->root->pid == GF_CLIENT_PID_BITD)
+ br_stub_writev_cbk (frame, NULL, this, vector->iov_len, 0,
+ NULL, NULL, NULL);
+ else
+ STACK_WIND (frame, br_stub_writev_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->writev, fd, vector, count,
+ offset, flags, iobref, xdata);
+ return 0;
+}
+
+/**
+ TODO: If possible add pictorial represention of below comment.
+
+ Before sending writev on the ANONYMOUS FD, increase the ongoing
+ version first. This brings anonymous fd write closer to the regular
+ fd write by having the ongoing version increased before doing the
+ write (In regular fd, after open the ongoing version is incremented).
+ Do following steps to handle writes on anonymous fds:
+ 1) Increase the on-disk ongoing version
+ 2) Once versioning is successfully done send write operation. If versioning
+ fails, then fail the write fop.
+ 3) In writev_cbk do below things:
+ a) Increase in-memory version
+ b) set the fd context (so that br_stub_release is invoked)
+ c) add the fd to the list of fds maintained in the inode context of
+ bitrot-stub.
+ d) Mark inode as non dirty
+ e) Mard inode as modified (in the inode context)
+**/
+int32_t
+br_stub_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iovec *vector, int32_t count, off_t offset,
+ uint32_t flags, struct iobref *iobref, dict_t *xdata)
+{
+ br_stub_local_t *local = NULL;
+ call_stub_t *stub = NULL;
+ int32_t op_ret = -1;
+ int32_t op_errno = EINVAL;
+ gf_boolean_t inc_version = _gf_false;
+ br_stub_inode_ctx_t *ctx = NULL;
+ uint64_t ctx_addr = 0;
+ int32_t ret = -1;
+
+ GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, frame, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, fd, unwind);
+
+ local = br_stub_alloc_local (this);
+ if (!local) {
+ gf_log (this->name, GF_LOG_ERROR, "local allocation failed "
+ "(gfid: %s)", uuid_utoa (fd->inode->gfid));
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+
+ local->u.context.fd = fd_ref (fd);
+ frame->local = local;
+
+ ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to get the inode "
+ "context for the inode %s",
+ uuid_utoa (fd->inode->gfid));
+ goto unwind;
+ }
+
+ ctx = (br_stub_inode_ctx_t *) (long) ctx_addr;
+ if (fd_is_anonymous (fd)) {
+ ret = br_stub_anon_fd_ctx (this, fd, ctx);
+ if (ret)
+ goto unwind;
+ }
+
+ /* TODO: Better to do a dummy fsetxattr instead of write. Keep write
+ simple */
+ if (br_stub_dummy_write (frame)) {
+ LOCK (&fd->inode->lock);
+ {
+ (void) __br_stub_inode_sign_state
+ (ctx, GF_FOP_WRITE, fd);
+ }
+ UNLOCK (&fd->inode->lock);
+
+ if (xdata && dict_get (xdata, "br-fd-reopen")) {
+ op_ret = vector->iov_len;
+ op_errno = 0;
+ goto unwind;
+ }
+ }
+
+ /**
+ * Check whether this is the first write on this inode since the last
+ * sign notification has been sent. If so, do versioning. Otherwise
+ * go ahead with the fop.
+ */
+ inc_version = br_stub_inc_version (this, fd, ctx);
+ if (!inc_version)
+ goto wind;
+
+ /* Create the stub for the write fop */
+ stub = fop_writev_stub (frame, br_stub_writev_resume, fd, vector, count,
+ offset, flags, iobref, xdata);
+
+ if (!stub) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to allocate stub for "
+ "write fop (gfid: %s), unwinding",
+ uuid_utoa (fd->inode->gfid));
+ goto unwind;
+ }
+
+ /* Perform Versioning */
+ return br_stub_perform_incversioning (this, frame, stub, fd, ctx);
+
+wind:
+ STACK_WIND (frame, br_stub_writev_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->writev, fd, vector, count, offset,
+ flags, iobref, xdata);
+ return 0;
+
+unwind:
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, NULL, NULL,
+ NULL);
+ br_stub_cleanup_local (local);
+ br_stub_dealloc_local (local);
+ return 0;
+}
+
+int32_t
+br_stub_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ int32_t ret = 0;
+ uint64_t ctx_addr = 0;
+ br_stub_inode_ctx_t *ctx = NULL;
+ br_stub_local_t *local = NULL;
+
+ if (frame->local) {
+ local = frame->local;
+ frame->local = NULL;
+ }
if (op_ret < 0)
goto unwind;
- if (cookie != (void *) BR_STUB_REQUEST_COOKIE)
+
+ ret = br_stub_get_inode_ctx (this, local->u.context.fd->inode,
+ &ctx_addr);
+ if (ret < 0)
goto unwind;
+ ctx = (br_stub_inode_ctx_t *) (long) ctx_addr;
+
+ /* Mark the flag to indicate the inode has been modified */
+ LOCK (&local->u.context.fd->inode->lock);
+ {
+ if (!__br_stub_is_inode_modified (ctx))
+ __br_stub_set_inode_modified (ctx);
+ }
+ UNLOCK (&local->u.context.fd->inode->lock);
+
+
+unwind:
+ STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+ br_stub_cleanup_local (local);
+ br_stub_dealloc_local (local);
+ return 0;
+}
+
+int32_t
+br_stub_ftruncate_resume (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ off_t offset, dict_t *xdata)
+{
+ STACK_WIND (frame, br_stub_ftruncate_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+ return 0;
+}
+
+int32_t
+br_stub_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ off_t offset, dict_t *xdata)
+{
+ br_stub_local_t *local = NULL;
+ call_stub_t *stub = NULL;
+ int32_t op_ret = -1;
+ int32_t op_errno = EINVAL;
+ gf_boolean_t inc_version = _gf_false;
+ br_stub_inode_ctx_t *ctx = NULL;
+ uint64_t ctx_addr = 0;
+ int32_t ret = -1;
+
+ GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, frame, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, fd, unwind);
+
+ local = br_stub_alloc_local (this);
+ if (!local) {
+ gf_log (this->name, GF_LOG_ERROR, "local allocation failed "
+ "(gfid: %s)", uuid_utoa (fd->inode->gfid));
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+
+ local->u.context.fd = fd_ref (fd);
+ frame->local = local;
+
ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr);
- if (ret < 0)
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to get the inode "
+ "context for the inode %s",
+ uuid_utoa (fd->inode->gfid));
goto unwind;
+ }
+
+ ctx = (br_stub_inode_ctx_t *) (long) ctx_addr;
+ if (fd_is_anonymous (fd)) {
+ ret = br_stub_anon_fd_ctx (this, fd, ctx);
+ if (ret)
+ goto unwind;
+ }
- stub = fop_open_cbk_stub (frame, NULL, op_ret, op_errno, fd, xdata);
+ /**
+ * c.f. br_stub_writev()
+ */
+ inc_version = br_stub_inc_version (this, fd, ctx);
+ if (!inc_version)
+ goto wind;
+
+ /* Create the stub for the ftruncate fop */
+ stub = fop_ftruncate_stub (frame, br_stub_ftruncate_resume, fd, offset,
+ xdata);
if (!stub) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to allocate stub for "
+ "ftruncate fop (gfid: %s), unwinding",
+ uuid_utoa (fd->inode->gfid));
+ goto unwind;
+ }
+
+ /* Perform Versioning */
+ return br_stub_perform_incversioning (this, frame, stub, fd, ctx);
+
+wind:
+ STACK_WIND (frame, br_stub_ftruncate_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+ return 0;
+
+unwind:
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, NULL, NULL,
+ NULL);
+ br_stub_cleanup_local (local);
+ br_stub_dealloc_local (local);
+ return 0;
+}
+
+int32_t
+br_stub_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ int32_t ret = 0;
+ uint64_t ctx_addr = 0;
+ br_stub_inode_ctx_t *ctx = NULL;
+ br_stub_local_t *local = NULL;
+
+ if (frame->local) {
+ local = frame->local;
+ frame->local = NULL;
+ }
+
+ if (op_ret < 0)
+ goto unwind;
+
+ ret = br_stub_get_inode_ctx (this, local->u.context.fd->inode,
+ &ctx_addr);
+ if (ret < 0)
+ goto unwind;
+
+ ctx = (br_stub_inode_ctx_t *) (long) ctx_addr;
+
+ /* Mark the flag to indicate the inode has been modified */
+ LOCK (&local->u.context.fd->inode->lock);
+ {
+ if (!__br_stub_is_inode_modified (ctx))
+ __br_stub_set_inode_modified (ctx);
+ }
+ UNLOCK (&local->u.context.fd->inode->lock);
+
+
+unwind:
+ STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+ br_stub_cleanup_local (local);
+ br_stub_dealloc_local (local);
+ return 0;
+}
+
+int32_t
+br_stub_truncate_resume (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ off_t offset, dict_t *xdata)
+{
+ STACK_WIND (frame, br_stub_ftruncate_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+ return 0;
+}
+
+/**
+ * Bit-rot-stub depends heavily on the fd based operations to for doing
+ * versioning and sending notification. It starts tracking the operation
+ * upon getting first fd based modify operation by doing versioning and
+ * sends notification when last fd using which the inode was modified is
+ * released.
+ * But for truncate there is no fd and hence it becomes difficult to do
+ * the versioning and send notification. It is handled by doing versioning
+ * on an anonymous fd. The fd will be valid till the completion of the
+ * truncate call. It guarantees that release on this anonymous fd will happen
+ * after the truncate call and notification is sent after the truncate call.
+ */
+int32_t
+br_stub_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ off_t offset, dict_t *xdata)
+{
+ br_stub_local_t *local = NULL;
+ call_stub_t *stub = NULL;
+ int32_t op_ret = -1;
+ int32_t op_errno = EINVAL;
+ gf_boolean_t inc_version = _gf_false;
+ br_stub_inode_ctx_t *ctx = NULL;
+ uint64_t ctx_addr = 0;
+ int32_t ret = -1;
+ fd_t *fd = NULL;
+
+ GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, frame, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, loc, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, loc->inode, unwind);
+
+ fd = fd_anonymous (loc->inode);
+ if (!fd) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to create anonymous "
+ "fd for the inode %s", uuid_utoa (loc->inode->gfid));
+ goto unwind;
+ }
+
+ local = br_stub_alloc_local (this);
+ if (!local) {
+ gf_log (this->name, GF_LOG_ERROR, "local allocation failed "
+ "(gfid: %s)", uuid_utoa (loc->inode->gfid));
op_ret = -1;
- op_errno = EINVAL;
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+
+ local->u.context.fd = fd;
+ frame->local = local;
+
+ ret = br_stub_get_inode_ctx (this, loc->inode, &ctx_addr);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to get the inode "
+ "context for the inode %s",
+ uuid_utoa (fd->inode->gfid));
goto unwind;
}
+ ctx = (br_stub_inode_ctx_t *) (long) ctx_addr;
+ ret = br_stub_anon_fd_ctx (this, local->u.context.fd, ctx);
+ if (ret)
+ goto unwind;
+
/**
- * Ongoing version needs to be incremented. If the inode is not dirty,
- * things are simple: increment the ongoing version safely and be done.
- * If inode is dirty, a writeback to disk is required. This is tricky in
- * case of multiple open()'s as ongoing version needs to be incremented
- * on a successful writeback. It's probably safe to remember the ongoing
- * version before writeback and *assigning* it in the callback, but that
- * may lead to a trustable checksum to be treated as stale by scrubber
- * (the case where the in-memory ongoing version is lesser than the
- * on-disk version). Therefore, *all* open() calls (which might have
- * come in parallel) try to synchronize the next ongoing version to
- * disk. In the callback path, the winner marks the inode as synced
- * therby loosing open() calls become no-op's.
+ * c.f. br_stub_writev()
*/
- ctx = (br_stub_inode_ctx_t *) (long) ctx_addr;
- return br_stub_perform_incversioning (this, frame, stub, fd, ctx);
+ inc_version = br_stub_inc_version (this, fd, ctx);
+ if (!inc_version)
+ goto wind;
- unwind:
- STACK_UNWIND_STRICT (open, frame,
- op_ret, op_errno, fd, xdata);
+ /* Create the stub for the truncate fop */
+ stub = fop_truncate_stub (frame, br_stub_truncate_resume, loc, offset,
+ xdata);
+ if (!stub) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to allocate stub for "
+ "truncate fop (gfid: %s), unwinding",
+ uuid_utoa (fd->inode->gfid));
+ goto unwind;
+ }
+
+ /* Perform Versioning */
+ return br_stub_perform_incversioning (this, frame, stub,
+ local->u.context.fd, ctx);
+
+wind:
+ STACK_WIND (frame, br_stub_truncate_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+ return 0;
+
+unwind:
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, NULL, NULL,
+ NULL);
+ br_stub_cleanup_local (local);
+ br_stub_dealloc_local (local);
return 0;
}
+/** }}} */
+
+
+/** {{{ */
+
+/* open() */
+
+/**
+ * It's probably worth mentioning a bit about why some of the housekeeping
+ * work is done in open() call path, rather than the callback path.
+ * Two (or more) open()'s in parallel can race and lead to a situation
+ * where a release() gets triggered (possibly after a series of write()
+ * calls) when *other* open()'s have still not reached callback path
+ * thereby having an active fd on an inode that is in process of getting
+ * signed with the current version.
+ *
+ * Maintaining fd list in the call path ensures that a release() would
+ * not be triggered if an open() call races ahead (followed by a close())
+ * threby finding non-empty fd list.
+ */
+
int
br_stub_open (call_frame_t *frame, xlator_t *this,
loc_t *loc, int32_t flags, fd_t *fd, dict_t *xdata)
{
- void *cookie = NULL;
+ int32_t ret = -1;
+ br_stub_inode_ctx_t *ctx = NULL;
+ uint64_t ctx_addr = 0;
+
+ GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, loc, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, fd, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, fd->inode, unwind);
- if (!flags)
- goto wind;
if (frame->root->pid == GF_CLIENT_PID_SCRUB)
goto wind;
- cookie = (void *) BR_STUB_REQUEST_COOKIE;
- wind:
- STACK_WIND_COOKIE (frame, br_stub_open_cbk, cookie,
- FIRST_CHILD (this), FIRST_CHILD (this)->fops->open,
- loc, flags, fd, xdata);
+ ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to get the inode "
+ "context for the file %s (gfid: %s)", loc->path,
+ uuid_utoa (fd->inode->gfid));
+ goto unwind;
+ }
+
+ ctx = (br_stub_inode_ctx_t *)(long)ctx_addr;
+ if (flags == O_RDONLY)
+ goto wind;
+
+ ret = br_stub_add_fd_to_inode (this, fd, ctx);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "failed add fd to the list "
+ "(gfid: %s)", uuid_utoa (fd->inode->gfid));
+ goto unwind;
+ }
+
+wind:
+ STACK_WIND (frame, default_open_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->open, loc, flags, fd, xdata);
+ return 0;
+unwind:
+ STACK_UNWIND_STRICT (open, frame, -1, EINVAL, NULL, NULL);
return 0;
}
@@ -946,39 +1402,60 @@ br_stub_open (call_frame_t *frame, xlator_t *this,
/* creat() */
+/**
+ * This routine registers a release callback for the given fd and adds the
+ * fd to the inode context fd tracking list.
+ */
+int32_t
+br_stub_add_fd_to_inode (xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx)
+{
+ int32_t ret = -1;
+ br_stub_fd_t *br_stub_fd = NULL;
+
+ ret = br_stub_require_release_call (this, fd, &br_stub_fd);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to set the fd "
+ "context for the file (gfid: %s)",
+ uuid_utoa (fd->inode->gfid));
+ goto out;
+ }
+
+ LOCK (&fd->inode->lock);
+ {
+ list_add_tail (&ctx->fd_list, &br_stub_fd->list);
+ }
+ UNLOCK (&fd->inode->lock);
+
+ ret = 0;
+
+out:
+ return ret;
+}
+
int
br_stub_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, fd_t *fd, inode_t *inode,
struct iatt *stbuf, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
{
- int32_t ret = 0;
- uint64_t ctx_addr = 0;
- call_stub_t *stub = NULL;
- br_stub_inode_ctx_t *ctx = NULL;
+ int32_t ret = 0;
+ uint64_t ctx_addr = 0;
+ br_stub_inode_ctx_t *ctx = NULL;
+ unsigned long version = BITROT_DEFAULT_CURRENT_VERSION;
if (op_ret < 0)
goto unwind;
- stub = fop_create_cbk_stub (frame, NULL, op_ret, op_errno, fd, inode,
- stbuf, preparent, postparent, xdata);
- if (!stub) {
- op_ret = -1;
- op_errno = EINVAL;
- goto unwind;
- }
-
ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr);
- if (ret < 0)
- ctx_addr = 0;
- ctx = (br_stub_inode_ctx_t *) (long) ctx_addr;
-
- /* see comment in br_stub_open_cbk().. */
- return (ctx)
- ? br_stub_perform_incversioning (this, frame, stub, fd, ctx)
- : br_stub_perform_fullversioning (this, frame, stub, fd);
+ if (ret < 0) {
+ ret = br_stub_init_inode_versions (this, fd, inode, version,
+ _gf_true);
+ } else {
+ ctx = (br_stub_inode_ctx_t *)(long)ctx_addr;
+ ret = br_stub_add_fd_to_inode (this, fd, ctx);
+ }
- unwind:
+unwind:
STACK_UNWIND_STRICT (create, frame, op_ret, op_errno,
fd, inode, stbuf, preparent, postparent, xdata);
return 0;
@@ -989,10 +1466,20 @@ br_stub_create (call_frame_t *frame,
xlator_t *this, loc_t *loc, int32_t flags,
mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
{
+ GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, loc, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, loc->inode, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, fd, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, fd->inode, unwind);
+
STACK_WIND (frame, br_stub_create_cbk, FIRST_CHILD (this),
FIRST_CHILD (this)->fops->create,
loc, flags, mode, umask, fd, xdata);
return 0;
+unwind:
+ STACK_UNWIND_STRICT (create, frame, -1, EINVAL, NULL, NULL, NULL, NULL,
+ NULL, NULL);
+ return 0;
}
/** }}} */
@@ -1011,21 +1498,11 @@ br_stub_lookup_version (xlator_t *this,
* out the correct version to use in the inode context (start with
* the default version if unavailable). As of now versions are not
* persisted on-disk. The inode is marked dirty, so that the first
- * operation (such as open(), etc..) would trigger synchronization
- * to disk.
+ * operation (such as write(), etc..) triggers synchronization to
+ * disk.
*/
status = br_version_xattr_state (xattr, &obuf, &sbuf);
- /**
- * stub does not know how to handle presence of signature but not
- * the object version, therefore, in such cases, bail out..
- */
- if (status == BR_VXATTR_STATUS_INVALID) {
- gf_log (this->name, GF_LOG_ERROR, "Invalid versioning xattrs. "
- "Bailing out [GFID: %s]", uuid_utoa (gfid));
- return -1;
- }
-
version = ((status == BR_VXATTR_STATUS_FULL)
|| (status == BR_VXATTR_STATUS_UNSIGNED))
? obuf->ongoingversion : BITROT_DEFAULT_CURRENT_VERSION;
@@ -1259,8 +1736,8 @@ br_stub_noop (call_frame_t *frame, void *cookie, xlator_t *this,
}
static inline void
-br_stub_send_ipc_fop (xlator_t *this,
- fd_t *fd, unsigned long releaseversion, int32_t flags)
+br_stub_send_ipc_fop (xlator_t *this, fd_t *fd, unsigned long releaseversion,
+ int sign_info)
{
int32_t op = 0;
int32_t ret = 0;
@@ -1269,8 +1746,8 @@ br_stub_send_ipc_fop (xlator_t *this,
changelog_event_t ev = {0,};
ev.ev_type = CHANGELOG_OP_TYPE_BR_RELEASE;
- ev.u.releasebr.flags = flags;
ev.u.releasebr.version = releaseversion;
+ ev.u.releasebr.sign_info = sign_info;
gf_uuid_copy (ev.u.releasebr.gfid, fd->inode->gfid);
xdata = dict_new ();
@@ -1305,14 +1782,67 @@ br_stub_send_ipc_fop (xlator_t *this,
return;
}
+/**
+ * This is how the state machine of sign info works:
+ * 3 states:
+ * 1) BR_SIGN_NORMAL => The default State of the inode
+ * 2) BR_SIGN_REOPEN_WAIT => A release has been sent and is waiting for reopen
+ * 3) BR_SIGN_QUICK => reopen has happened and this release should trigger sign
+ * 2 events:
+ * 1) GF_FOP_RELEASE
+ * 2) GF_FOP_WRITE (actually a dummy write fro BitD)
+ *
+ * This is how states are changed based on events:
+ * EVENT: GF_FOP_RELEASE:
+ * if (state == BR_SIGN_NORMAL) ; then
+ * set state = BR_SIGN_REOPEN_WAIT;
+ * if (state == BR_SIGN_QUICK); then
+ * set state = BR_SIGN_NORMAL;
+ * EVENT: GF_FOP_WRITE:
+ * if (state == BR_SIGN_REOPEN_WAIT); then
+ * set state = BR_SIGN_QUICK;
+ */
+br_sign_state_t
+__br_stub_inode_sign_state (br_stub_inode_ctx_t *ctx,
+ glusterfs_fop_t fop, fd_t *fd)
+{
+ br_sign_state_t sign_info = BR_SIGN_INVALID;
+
+ switch (fop) {
+
+ case GF_FOP_WRITE:
+ sign_info = ctx->info_sign = BR_SIGN_QUICK;
+ break;
+
+ case GF_FOP_RELEASE:
+ GF_ASSERT (ctx->info_sign != BR_SIGN_REOPEN_WAIT);
+
+ if (ctx->info_sign == BR_SIGN_NORMAL) {
+ sign_info = ctx->info_sign = BR_SIGN_REOPEN_WAIT;
+ } else {
+ sign_info = ctx->info_sign;
+ ctx->info_sign = BR_SIGN_NORMAL;
+ }
+
+ break;
+ default:
+ break;
+ }
+
+ return sign_info;
+}
+
int32_t
br_stub_release (xlator_t *this, fd_t *fd)
{
- int32_t ret = 0;
- int32_t flags = 0;
- inode_t *inode = NULL;
- unsigned long releaseversion = 0;
- br_stub_inode_ctx_t *ctx = NULL;
+ int32_t ret = 0;
+ int32_t flags = 0;
+ inode_t *inode = NULL;
+ unsigned long releaseversion = 0;
+ br_stub_inode_ctx_t *ctx = NULL;
+ uint64_t tmp = 0;
+ br_stub_fd_t *br_stub_fd = NULL;
+ int32_t signinfo = 0;
inode = fd->inode;
@@ -1321,12 +1851,23 @@ br_stub_release (xlator_t *this, fd_t *fd)
ctx = __br_stub_get_ongoing_version_ctx (this, inode, NULL);
if (ctx == NULL)
goto unblock;
- __br_stub_track_release (ctx);
+ br_stub_fd = br_stub_fd_ctx_get (this, fd);
+ if (br_stub_fd) {
+ list_del_init (&br_stub_fd->list);
+ }
+
ret = __br_stub_can_trigger_release
- (inode, ctx, &releaseversion, &flags);
- if (ret) {
- GF_ASSERT (__br_stub_is_inode_dirty (ctx) == 0);
+ (inode, ctx, &releaseversion);
+ if (!ret)
+ goto unblock;
+
+ signinfo = __br_stub_inode_sign_state (ctx, GF_FOP_RELEASE, fd);
+ signinfo = htonl (signinfo);
+
+ /* inode back to initital state: mark dirty */
+ if (ctx->info_sign == BR_SIGN_NORMAL) {
__br_stub_mark_inode_dirty (ctx);
+ __br_stub_unset_inode_modified (ctx);
}
}
unblock:
@@ -1334,10 +1875,17 @@ br_stub_release (xlator_t *this, fd_t *fd)
if (ret) {
gf_log (this->name, GF_LOG_DEBUG,
- "releaseversion: %lu|flags: %d", releaseversion, flags);
- br_stub_send_ipc_fop (this, fd, releaseversion, flags);
+ "releaseversion: %lu | flags: %d | signinfo: %d",
+ (unsigned long) ntohl (releaseversion),
+ flags, ntohl(signinfo));
+ br_stub_send_ipc_fop (this, fd, releaseversion, signinfo);
}
+ ret = fd_ctx_del (fd, this, &tmp);
+ br_stub_fd = (br_stub_fd_t *)(long)tmp;
+
+ GF_FREE (br_stub_fd);
+
return 0;
}
@@ -1351,11 +1899,12 @@ void
br_stub_ictxmerge (xlator_t *this, fd_t *fd,
inode_t *inode, inode_t *linked_inode)
{
- int32_t ret = 0;
- uint64_t ctxaddr = 0;
- uint64_t lctxaddr = 0;
- br_stub_inode_ctx_t *ctx = NULL;
- br_stub_inode_ctx_t *lctx = NULL;
+ int32_t ret = 0;
+ uint64_t ctxaddr = 0;
+ uint64_t lctxaddr = 0;
+ br_stub_inode_ctx_t *ctx = NULL;
+ br_stub_inode_ctx_t *lctx = NULL;
+ br_stub_fd_t *br_stub_fd = NULL;
ret = br_stub_get_inode_ctx (this, inode, &ctxaddr);
if (ret < 0)
@@ -1369,29 +1918,15 @@ br_stub_ictxmerge (xlator_t *this, fd_t *fd,
goto unblock;
lctx = (br_stub_inode_ctx_t *) lctxaddr;
- if (__br_stub_is_inode_dirty (lctx)) {
- /**
- * RACY code: An inode can end up in this situation
- * after a lookup() or after a create() followed by
- * a release(). Even if we distinguish b/w the two,
- * there needs to be more infrastructure built up
- * in stub to handle these races. Note, that it's
- * probably OK to ignore the race iff the version
- * was initialized on the very first lookup(), i.e.,
- * [ongoingversion: default].
- *
- * FIXME: fixup races [create(1..n)/lookup(1..n)].
- */
- GF_ASSERT (lctx->currentversion
- == BITROT_DEFAULT_CURRENT_VERSION);
- __br_stub_track_openfd (fd, lctx);
- __br_stub_mark_inode_synced (lctx);
- } else {
- GF_ASSERT (ctx->currentversion <= lctx->currentversion);
- __br_stub_track_openfd (fd, lctx);
+ GF_ASSERT (list_is_singular (&ctx->fd_list));
+ br_stub_fd = list_first_entry (&ctx->fd_list, br_stub_fd_t,
+ list);
+ if (br_stub_fd) {
+ GF_ASSERT (br_stub_fd->fd == fd);
+ list_move_tail (&br_stub_fd->list, &lctx->fd_list);
}
}
- unblock:
+unblock:
UNLOCK (&linked_inode->lock);
done:
@@ -1409,6 +1944,9 @@ struct xlator_fops fops = {
.getxattr = br_stub_getxattr,
.fgetxattr = br_stub_fgetxattr,
.fsetxattr = br_stub_fsetxattr,
+ .writev = br_stub_writev,
+ .truncate = br_stub_truncate,
+ .ftruncate = br_stub_ftruncate,
};
struct xlator_cbks cbks = {