summaryrefslogtreecommitdiffstats
path: root/xlators/features/bit-rot/src
diff options
context:
space:
mode:
Diffstat (limited to 'xlators/features/bit-rot/src')
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot.c138
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot.h6
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-common.h7
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h3
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-stub.c1070
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-stub.h216
6 files changed, 1101 insertions, 339 deletions
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.c b/xlators/features/bit-rot/src/bitd/bit-rot.c
index 5638b0f348b..61d461f897b 100644
--- a/xlators/features/bit-rot/src/bitd/bit-rot.c
+++ b/xlators/features/bit-rot/src/bitd/bit-rot.c
@@ -171,11 +171,11 @@ bitd_is_bad_file (xlator_t *this, br_child_t *child, loc_t *loc, fd_t *fd)
if (fd)
ret = syncop_fgetxattr (child->xl, fd, &xattr,
- BITROT_OBJECT_BAD_KEY, NULL,
+ "trusted.glusterfs.bad-file", NULL,
NULL);
else if (loc)
ret = syncop_getxattr (child->xl, loc, &xattr,
- BITROT_OBJECT_BAD_KEY, NULL,
+ "trusted.glusterfs.bad-file", NULL,
NULL);
if (!ret) {
@@ -484,6 +484,98 @@ br_log_object_path (xlator_t *this, char *op,
op, path, strerror (op_errno));
}
+static void
+br_send_dummy_write (xlator_t *this, fd_t *fd, br_child_t *child,
+ dict_t *xdata)
+{
+ struct iovec iov = {0, };
+ struct iobref *iobref = NULL;
+ struct iobuf *iobuf = NULL;
+ char *msg = NULL;
+ size_t size = 0;
+ int32_t ret = -1;
+
+ GF_VALIDATE_OR_GOTO ("bit-rot", this, out);
+ GF_VALIDATE_OR_GOTO (this->name, fd, out);
+ GF_VALIDATE_OR_GOTO (this->name, child, out);
+
+ msg = gf_strdup ("GLUSTERFS");
+ if (!msg)
+ goto out;
+
+ size = strlen (msg);
+
+ iov.iov_base = msg;
+ iov.iov_len = size;
+
+ iobref = iobref_new ();
+ if (!iobref)
+ goto free_msg;
+
+ iobuf = iobuf_get2 (this->ctx->iobuf_pool, size);
+ if (!iobuf)
+ goto free_iobref;
+
+ iobref_add (iobref, iobuf);
+
+ iov_unload (iobuf_ptr (iobuf), &iov, 1); /* FIXME!!! */
+
+ iov.iov_base = iobuf_ptr (iobuf);
+ iov.iov_len = size;
+
+ ret = syncop_writev (child->xl, fd, &iov, 1, 0, iobref, 0, xdata, NULL);
+ if (ret <= 0) {
+ ret = -1;
+ gf_log (this->name, GF_LOG_ERROR,
+ "dummy write failed (%s)", strerror (errno));
+ goto free_iobuf;
+ }
+
+ /* iobref_unbref() takes care of iobuf unref */
+ ret = 0;
+
+ free_iobuf:
+ iobuf_unref (iobuf);
+ free_iobref:
+ iobref_unref (iobref);
+ free_msg:
+ GF_FREE (msg);
+ out:
+ return;
+}
+
+static void
+br_object_handle_reopen (xlator_t *this,
+ br_object_t *object, inode_t *linked_inode)
+{
+ int32_t ret = -1;
+ dict_t *dict = NULL;
+ loc_t loc = {0, };
+
+ /**
+ * Here dict is purposefully not checked for NULL, because at any cost
+ * sending a re-open should not be missed. This re-open is an indication
+ * for the stub to properly mark inode's status.
+ */
+ dict = dict_new ();
+ if (dict) {
+ /* TODO: Make it a #define */
+ ret = dict_set_int32 (dict, "br-fd-reopen", 1);
+ if (ret)
+ gf_log (this->name, GF_LOG_WARNING,
+ "Object reopen would trigger versioning.");
+ }
+
+ loc.inode = inode_ref (linked_inode);
+ gf_uuid_copy (loc.gfid, linked_inode->gfid);
+
+ br_trigger_sign (this, object->child, linked_inode, &loc, dict);
+
+ if (dict)
+ dict_unref (dict);
+ loc_wipe (&loc);
+}
+
/**
* Sign a given object. This routine runs full throttle. There needs to be
* some form of priority scheduling and/or read burstness to avoid starving
@@ -497,6 +589,7 @@ static inline int32_t br_sign_object (br_object_t *object)
fd_t *fd = NULL;
struct iatt iatt = {0, };
pid_t pid = GF_CLIENT_PID_BITD;
+ br_sign_state_t sign_info = BR_SIGN_NORMAL;
GF_VALIDATE_OR_GOTO ("bit-rot", object, out);
@@ -515,6 +608,20 @@ static inline int32_t br_sign_object (br_object_t *object)
goto out;
}
+ /* sanity check */
+ sign_info = ntohl (object->sign_info);
+ GF_ASSERT (sign_info != BR_SIGN_NORMAL);
+
+ /**
+ * For fd's that have notified for reopening, we send an explicit
+ * open() followed by a dummy write() call. This triggers the
+ * actual signing of the object.
+ */
+ if (sign_info == BR_SIGN_REOPEN_WAIT) {
+ br_object_handle_reopen (this, object, linked_inode);
+ goto unref_inode;
+ }
+
ret = br_object_open (this, object, linked_inode, &fd);
if (!fd) {
br_log_object (this, "open", object->gfid, -ret);
@@ -648,6 +755,7 @@ br_initialize_object (xlator_t *this, br_child_t *child, changelog_event_t *ev)
/* NOTE: it's BE, but no worry */
object->signedversion = ev->u.releasebr.version;
+ object->sign_info = ev->u.releasebr.sign_info;
out:
return object;
@@ -693,7 +801,6 @@ br_brick_callback (void *xl, char *brick,
xlator_t *this = NULL;
br_object_t *object = NULL;
br_child_t *child = NULL;
- int32_t flags = 0;
struct gf_tw_timer_list *timer = NULL;
this = xl;
@@ -710,14 +817,6 @@ br_brick_callback (void *xl, char *brick,
gf_log (this->name, GF_LOG_DEBUG,
"RELEASE EVENT [GFID %s]", uuid_utoa (gfid));
- flags = (int32_t)ntohl (ev->u.releasebr.flags);
- if (flags == O_RDONLY) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Read only fd [GFID: %s], ignoring signing..",
- uuid_utoa (gfid));
- goto out;
- }
-
child = br_get_child_from_brick_path (this, brick);
if (!child) {
gf_log (this->name, GF_LOG_ERROR, "failed to get the subvolume "
@@ -804,12 +903,15 @@ out:
return need_sign;
}
-static inline void
+void
br_trigger_sign (xlator_t *this, br_child_t *child, inode_t *linked_inode,
- loc_t *loc)
+ loc_t *loc, dict_t *xdata)
{
fd_t *fd = NULL;
int32_t ret = -1;
+ pid_t pid = GF_CLIENT_PID_BITD;
+
+ syncopctx_setfspid (&pid);
fd = fd_create (linked_inode, 0);
if (!fd) {
@@ -828,8 +930,10 @@ br_trigger_sign (xlator_t *this, br_child_t *child, inode_t *linked_inode,
fd_bind (fd);
}
- if (fd)
+ if (fd) {
+ br_send_dummy_write (this, fd, child, xdata);
syncop_close (fd);
+ }
out:
return;
@@ -972,7 +1076,7 @@ bitd_oneshot_crawl (xlator_t *subvol,
gf_log (this->name, GF_LOG_INFO,
"Triggering signing for %s [GFID: %s | Brick: %s]",
loc.path, uuid_utoa (linked_inode->gfid), child->brick_path);
- br_trigger_sign (this, child, linked_inode, &loc);
+ br_trigger_sign (this, child, linked_inode, &loc, NULL);
ret = 0;
@@ -1600,7 +1704,9 @@ struct xlator_cbks cbks;
struct volume_options options[] = {
{ .key = {"expiry-time"},
.type = GF_OPTION_TYPE_INT,
- .default_value = "120",
+ /* Let the default timer be half the value of the wait time for
+ * sining (which is 120 as of now) */
+ .default_value = "60",
.description = "default time duration for which an object waits "
"before it is signed",
},
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.h b/xlators/features/bit-rot/src/bitd/bit-rot.h
index 66515e3213c..bbaf86fa65f 100644
--- a/xlators/features/bit-rot/src/bitd/bit-rot.h
+++ b/xlators/features/bit-rot/src/bitd/bit-rot.h
@@ -152,6 +152,8 @@ struct br_object {
be signed */
br_child_t *child; /* object's subvolume */
+ int sign_info;
+
struct list_head list; /* hook to add to the queue once the
object is expired from timer wheel */
void *data;
@@ -175,4 +177,8 @@ br_prepare_loc (xlator_t *, br_child_t *, loc_t *, gf_dirent_t *, loc_t *);
gf_boolean_t
bitd_is_bad_file (xlator_t *, br_child_t *, loc_t *, fd_t *);
+void
+br_trigger_sign (xlator_t *this, br_child_t *child, inode_t *linked_inode,
+ loc_t *loc, dict_t *xdata);
+
#endif /* __BIT_ROT_H__ */
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-common.h b/xlators/features/bit-rot/src/stub/bit-rot-common.h
index 699323170d3..7fd584e5970 100644
--- a/xlators/features/bit-rot/src/stub/bit-rot-common.h
+++ b/xlators/features/bit-rot/src/stub/bit-rot-common.h
@@ -33,6 +33,13 @@ typedef enum br_vxattr_state {
BR_VXATTR_STATUS_INVALID = 3,
} br_vxattr_status_t;
+typedef enum br_sign_state {
+ BR_SIGN_INVALID = -1,
+ BR_SIGN_NORMAL = 0,
+ BR_SIGN_REOPEN_WAIT = 1,
+ BR_SIGN_QUICK = 2,
+} br_sign_state_t;
+
static inline br_vxattr_status_t
br_version_xattr_state (dict_t *xattr,
br_version_t **obuf, br_signature_t **sbuf)
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h
index 46271407219..9f6da89032f 100644
--- a/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h
+++ b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h
@@ -28,7 +28,8 @@ enum br_mem_types {
gf_br_mt_br_tbf_opspec_t,
gf_br_mt_br_scrubber_t,
gf_br_mt_br_fsscan_entry_t,
- gf_br_stub_mt_end
+ gf_br_stub_mt_br_stub_fd_t,
+ gf_br_stub_mt_end,
};
#endif
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub.c b/xlators/features/bit-rot/src/stub/bit-rot-stub.c
index f9c3886948a..93db072f671 100644
--- a/xlators/features/bit-rot/src/stub/bit-rot-stub.c
+++ b/xlators/features/bit-rot/src/stub/bit-rot-stub.c
@@ -198,14 +198,15 @@ br_stub_init_inode_versions (xlator_t *this, fd_t *fd, inode_t *inode,
if (!ctx)
goto error_return;
+ INIT_LIST_HEAD (&ctx->fd_list);
(markdirty) ? __br_stub_mark_inode_dirty (ctx)
: __br_stub_mark_inode_synced (ctx);
__br_stub_set_ongoing_version (ctx, version);
- __br_stub_reset_release_counters (ctx);
if (fd) {
- br_stub_require_release_call (this, fd);
- __br_stub_track_openfd (fd, ctx);
+ ret = br_stub_add_fd_to_inode (this, fd, ctx);
+ if (ret)
+ goto free_ctx;
}
ret = br_stub_set_inode_ctx (this, inode, ctx);
if (ret)
@@ -238,7 +239,6 @@ br_stub_mod_inode_versions (xlator_t *this,
__br_stub_mark_inode_synced (ctx);
}
- __br_stub_track_openfd (fd, ctx);
ret = 0;
}
unblock:
@@ -250,19 +250,16 @@ br_stub_mod_inode_versions (xlator_t *this,
static inline void
br_stub_fill_local (br_stub_local_t *local,
call_stub_t *stub, fd_t *fd, inode_t *inode, uuid_t gfid,
- int versioningtype, unsigned long memversion, int dirty)
+ int versioningtype, unsigned long memversion)
{
local->fopstub = stub;
local->versioningtype = versioningtype;
local->u.context.version = memversion;
- if (fd)
+ if (fd && !local->u.context.fd)
local->u.context.fd = fd_ref (fd);
if (inode)
local->u.context.inode = inode_ref (inode);
gf_uuid_copy (local->u.context.gfid, gfid);
-
- /* mark inode dirty/fresh according to durability */
- local->u.context.markdirty = (dirty) ? _gf_true : _gf_false;
}
static inline void
@@ -279,57 +276,13 @@ br_stub_cleanup_local (br_stub_local_t *local)
inode_unref (local->u.context.inode);
local->u.context.inode = NULL;
}
- local->u.context.markdirty = _gf_true;
memset (local->u.context.gfid, '\0', sizeof (uuid_t));
}
/**
- * callback for inode/fd full versioning
+ * callback for inode/fd versioning
*/
int
-br_stub_inode_fullversioning_cbk (call_frame_t *frame,
- void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xdata)
-{
- fd_t *fd = NULL;
- inode_t *inode = NULL;
- unsigned long version = 0;
- gf_boolean_t dirty = _gf_true;
- br_stub_local_t *local = NULL;
-
- local = (br_stub_local_t *)frame->local;
-
- /* be graceful to EEXIST */
- if ((op_ret < 0) && (op_errno == EEXIST)) {
- op_ret = 0;
- goto done;
- }
-
- if (op_ret < 0)
- goto done;
-
- fd = local->u.context.fd;
- inode = local->u.context.inode;
- version = local->u.context.version;
- dirty = local->u.context.markdirty;
-
- op_ret = br_stub_init_inode_versions (this, fd, inode, version, dirty);
- if (op_ret < 0)
- op_errno = EINVAL;
-
- done:
- frame->local = NULL;
- if (op_ret < 0)
- call_unwind_error (local->fopstub, op_ret, op_errno);
- else
- call_resume (local->fopstub);
- br_stub_cleanup_local (local);
- br_stub_dealloc_local (local);
-
- return 0;
-}
-
-int
br_stub_fd_incversioning_cbk (call_frame_t *frame,
void *cookie, xlator_t *this,
int op_ret, int op_errno, dict_t *xdata)
@@ -351,14 +304,14 @@ br_stub_fd_incversioning_cbk (call_frame_t *frame,
op_errno = EINVAL;
done:
- frame->local = NULL;
- if (op_ret < 0)
+ if (op_ret < 0) {
+ frame->local = NULL;
call_unwind_error (local->fopstub, -1, op_errno);
- else
+ br_stub_cleanup_local (local);
+ br_stub_dealloc_local (local);
+ } else {
call_resume (local->fopstub);
- br_stub_cleanup_local (local);
- br_stub_dealloc_local (local);
-
+ }
return 0;
}
@@ -366,28 +319,27 @@ br_stub_fd_incversioning_cbk (call_frame_t *frame,
* Initial object versioning
*
* Version persists two (2) extended attributes as explained below:
- * 1. Current (ongoing) version: This is incremented on an open()
- * or creat() and is the running version for an object.
+ * 1. Current (ongoing) version: This is incremented on an writev ()
+ * or truncate () and is the running version for an object.
* 2. Signing version: This is the version against which an object
* was signed (checksummed).
*
* During initial versioning, both ongoing and signing versions are
- * set of one and zero respectively. An open() call increments the
+ * set of one and zero respectively. A write() call increments the
* ongoing version as an indication of modification to the object.
* Additionally this needs to be persisted on disk and needs to be
* durable: fsync().. :-/
- * As an optimization only the first open() synchronizes the ongoing
- * version to disk, subsequent open()s before the *last* release()
+ * As an optimization only the first write() synchronizes the ongoing
+ * version to disk, subsequent write()s before the *last* release()
* are no-op's.
*
* create(), just like lookup() initializes the object versions to
- * the default, but persists the version to disk. As an optimization
- * this is not a durable operation: in case of a crash, hard reboot
- * etc.. absence of versioning xattrs is ignored in scrubber along
- * with the one time crawler explicitly triggering signing for such
- * objects.
+ * the default. As an optimization this is not a durable operation:
+ * in case of a crash, hard reboot etc.. absence of versioning xattrs
+ * is ignored in scrubber along with the one time crawler explicitly
+ * triggering signing for such objects.
*
- * c.f. br_stub_open_cbk() / br_stub_create_cbk()
+ * c.f. br_stub_writev() / br_stub_truncate()
*/
/**
@@ -400,7 +352,7 @@ int
br_stub_fd_versioning (xlator_t *this, call_frame_t *frame,
call_stub_t *stub, dict_t *dict, fd_t *fd,
br_stub_version_cbk *callback, unsigned long memversion,
- int versioningtype, int durable, int dirty)
+ int versioningtype, int durable)
{
int32_t ret = -1;
int flags = 0;
@@ -421,18 +373,11 @@ br_stub_fd_versioning (xlator_t *this, call_frame_t *frame,
goto dealloc_xdata;
}
- local = br_stub_alloc_local (this);
- if (!local) {
- ret = -1;
- goto dealloc_xdata;
- }
-
- if (versioningtype == BR_STUB_FULL_VERSIONING)
- flags |= XATTR_CREATE;
+ local = frame->local;
br_stub_fill_local (local, stub, fd,
fd->inode, fd->inode->gfid,
- versioningtype, memversion, dirty);
+ versioningtype, memversion);
frame->local = local;
STACK_WIND (frame, callback,
@@ -448,82 +393,21 @@ br_stub_fd_versioning (xlator_t *this, call_frame_t *frame,
}
static inline int
-br_stub_perform_fullversioning (xlator_t *this, call_frame_t *frame,
- call_stub_t *stub, fd_t *fd)
-{
- int32_t ret = -1;
- dict_t *dict = NULL;
- br_version_t *obuf = NULL;
- int op_errno = 0;
-
- op_errno = ENOMEM;
- dict = dict_new ();
- if (!dict)
- goto done;
- ret = br_stub_alloc_versions (&obuf, NULL, 0);
- if (ret)
- goto dealloc_dict;
-
- op_errno = EINVAL;
- ret = br_stub_prepare_version_request (this, dict, obuf,
- BITROT_DEFAULT_CURRENT_VERSION);
- if (ret)
- goto dealloc_versions;
-
- /**
- * Version extended attributes need not be durable at this point of
- * time. If the objects (inode) data gets persisted on disk but the
- * version extended attributes are lost due to a crash/power failure,
- * a subsequent lookup marks the objects signature as stale. This way,
- * dentry operation times do not shoot up.
- */
- ret = br_stub_fd_versioning (this, frame, stub, dict, fd,
- br_stub_inode_fullversioning_cbk,
- BITROT_DEFAULT_CURRENT_VERSION,
- BR_STUB_FULL_VERSIONING, !WRITEBACK_DURABLE, 0);
-
- dealloc_versions:
- br_stub_dealloc_versions (obuf);
- dealloc_dict:
- dict_unref (dict);
- done:
- if (ret)
- call_unwind_error (stub, -1, op_errno);
- return ret;
-}
-
-static inline int
br_stub_perform_incversioning (xlator_t *this,
call_frame_t *frame, call_stub_t *stub,
fd_t *fd, br_stub_inode_ctx_t *ctx)
{
- int32_t ret = -1;
- dict_t *dict = NULL;
- inode_t *inode = NULL;
- br_version_t *obuf = NULL;
- unsigned long writeback_version = 0;
- int op_errno = 0;
-
- inode = fd->inode;
+ int32_t ret = -1;
+ dict_t *dict = NULL;
+ br_version_t *obuf = NULL;
+ unsigned long writeback_version = 0;
+ int op_errno = 0;
+ br_stub_local_t *local = NULL;
op_errno = EINVAL;
- ret = br_stub_require_release_call (this, fd);
- if (ret)
- goto done;
-
- LOCK (&inode->lock);
- {
- if (__br_stub_is_inode_dirty (ctx))
- writeback_version = __br_stub_writeback_version (ctx);
- else
- __br_stub_track_openfd (fd, ctx);
- }
- UNLOCK (&inode->lock);
+ local = frame->local;
- if (!writeback_version) {
- ret = 0;
- goto done;
- }
+ writeback_version = __br_stub_writeback_version (ctx);
/* inode requires writeback to disk */
op_errno = ENOMEM;
@@ -541,17 +425,23 @@ br_stub_perform_incversioning (xlator_t *this,
ret = br_stub_fd_versioning
(this, frame, stub, dict,
fd, br_stub_fd_incversioning_cbk, writeback_version,
- BR_STUB_INCREMENTAL_VERSIONING, WRITEBACK_DURABLE, 0);
+ BR_STUB_INCREMENTAL_VERSIONING, !WRITEBACK_DURABLE);
dealloc_versions:
br_stub_dealloc_versions (obuf);
dealloc_dict:
dict_unref (dict);
done:
- if (!ret && !writeback_version)
- call_resume (stub);
- if (ret)
+ if (ret) {
+ if (local)
+ frame->local = NULL;
call_unwind_error (stub, -1, op_errno);
+ if (local) {
+ br_stub_cleanup_local (local);
+ br_stub_dealloc_local (local);
+ }
+ }
+
return ret;
}
@@ -560,6 +450,44 @@ br_stub_perform_incversioning (xlator_t *this,
/* fsetxattr() */
static inline int
+br_stub_compare_sign_version (xlator_t *this, inode_t *inode,
+ br_signature_t *sbuf, dict_t *dict)
+{
+ int32_t ret = -1;
+ br_stub_inode_ctx_t *ctx = NULL;
+ uint64_t tmp_ctx = 0;
+
+ GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, out);
+ GF_VALIDATE_OR_GOTO (this->name, inode, out);
+ GF_VALIDATE_OR_GOTO (this->name, sbuf, out);
+ GF_VALIDATE_OR_GOTO (this->name, dict, out);
+
+ ret = br_stub_get_inode_ctx (this, inode, &tmp_ctx);
+ if (ret) {
+ dict_del (dict, BITROT_SIGNING_VERSION_KEY);
+ goto out;
+ }
+
+ ret = -1;
+ ctx = (br_stub_inode_ctx_t *)(long)tmp_ctx;
+
+ LOCK (&inode->lock);
+ {
+ if (ctx->currentversion == sbuf->signedversion)
+ ret = 0;
+ else
+ gf_log (this->name, GF_LOG_WARNING, "current version "
+ "%lu and version of the signature %lu are not "
+ "same", ctx->currentversion,
+ sbuf->signedversion);
+ }
+ UNLOCK (&inode->lock);
+
+out:
+ return ret;
+}
+
+static inline int
br_stub_prepare_signature (xlator_t *this, dict_t *dict,
inode_t *inode, br_isignature_t *sign)
{
@@ -577,6 +505,11 @@ br_stub_prepare_signature (xlator_t *this, dict_t *dict,
ret = br_stub_prepare_signing_request (dict, sbuf, sign, signaturelen);
if (ret)
goto dealloc_versions;
+
+ ret = br_stub_compare_sign_version (this, inode, sbuf, dict);
+ if (ret)
+ goto dealloc_versions;
+
return 0;
dealloc_versions:
@@ -620,6 +553,8 @@ br_stub_fsetxattr (call_frame_t *frame, xlator_t *this,
if (ret)
goto unwind;
+ gf_log (this->name, GF_LOG_DEBUG, "SIGNED VERSION: %lu",
+ sign->signedversion);
wind:
STACK_WIND (frame, default_setxattr_cbk,
FIRST_CHILD (this), FIRST_CHILD (this)->fops->fsetxattr, fd,
@@ -865,77 +800,598 @@ br_stub_fgetxattr (call_frame_t *frame, xlator_t *this,
return 0;
}
-/** }}} */
+/**
+ * The first write response on the first fd in the list of fds will set
+ * the flag to indicate that the inode is modified. The subsequent write
+ * respnses coming on either the first fd or some other fd will not change
+ * the fd. The inode-modified flag is unset only upon release of all the
+ * fds.
+ */
+int32_t
+br_stub_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ int32_t ret = 0;
+ uint64_t ctx_addr = 0;
+ br_stub_inode_ctx_t *ctx = NULL;
+ br_stub_local_t *local = NULL;
+
+ if (frame->local) {
+ local = frame->local;
+ frame->local = NULL;
+ }
+ if (op_ret < 0)
+ goto unwind;
-/** {{{ */
+ ret = br_stub_get_inode_ctx (this, local->u.context.fd->inode,
+ &ctx_addr);
+ if (ret < 0)
+ goto unwind;
-/* open() */
+ ctx = (br_stub_inode_ctx_t *) (long) ctx_addr;
-int
-br_stub_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, fd_t *fd, dict_t *xdata)
+ /* Mark the flag to indicate the inode has been modified */
+ LOCK (&local->u.context.fd->inode->lock);
+ {
+ if (!__br_stub_is_inode_modified (ctx))
+ __br_stub_set_inode_modified (ctx);
+ }
+ UNLOCK (&local->u.context.fd->inode->lock);
+
+
+unwind:
+ STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+ br_stub_cleanup_local (local);
+ br_stub_dealloc_local (local);
+ return 0;
+}
+
+/**
+ * Ongoing version is increased only for the first modify operation.
+ * First modify version means the first write or truncate call coming on the
+ * first fd in the list of inodes.
+ * For anonymous fds open would not have come, so check if its the first write
+ * by doing both inode dirty check and ensuring list of fds is empty
+ */
+static inline gf_boolean_t
+br_stub_inc_version (xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx)
{
- int32_t ret = 0;
- uint64_t ctx_addr = 0;
- br_stub_inode_ctx_t *ctx = NULL;
- call_stub_t *stub = NULL;
+ gf_boolean_t inc_version = _gf_false;
+
+ GF_VALIDATE_OR_GOTO (this->name, fd, out);
+ GF_VALIDATE_OR_GOTO (this->name, ctx, out);
+
+ LOCK (&fd->inode->lock);
+ {
+ if (__br_stub_is_inode_dirty (ctx))
+ inc_version = _gf_true;
+ }
+ UNLOCK (&fd->inode->lock);
+
+out:
+ return inc_version;
+}
+
+/**
+ * Since NFS does not do open, writes from NFS are sent over an anonymous
+ * fd. It means each write fop might come on a different anonymous fd and
+ * will lead to very large number of notifications being sent. It might
+ * affect the perfromance as, there will too many sign requests.
+ * To avoid that whenever the last fd released from an inode (logical release)
+ * is an anonymous fd the release notification is sent with a flag being set
+ * __br_stub_anon_release (ctx);
+ * BitD checks for the flag and if set, it will send a dummy write request
+ * (again on an anonymous fd) instead of triggering sign.
+ * Bit-rot-stub should identify such dummy writes and should send success to
+ * them instead of winding them downwards.
+ */
+gf_boolean_t
+br_stub_dummy_write (call_frame_t *frame)
+{
+ return (frame->root->pid == GF_CLIENT_PID_BITD)
+ ? _gf_true : _gf_false;
+}
+
+int32_t
+br_stub_anon_fd_ctx (xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx)
+{
+ int32_t ret = -1;
+ br_stub_fd_t *br_stub_fd = NULL;
+
+ br_stub_fd = br_stub_fd_ctx_get (this, fd);
+ if (!br_stub_fd) {
+ ret = br_stub_add_fd_to_inode (this, fd, ctx);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to "
+ "add fd to the inode (gfid: %s)",
+ uuid_utoa (fd->inode->gfid));
+ goto out;
+ }
+ }
+
+ ret = 0;
+
+out:
+ return ret;
+}
+
+int32_t
+br_stub_writev_resume (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iovec *vector, int32_t count, off_t offset,
+ uint32_t flags, struct iobref *iobref, dict_t *xdata)
+{
+ if (frame->root->pid == GF_CLIENT_PID_BITD)
+ br_stub_writev_cbk (frame, NULL, this, vector->iov_len, 0,
+ NULL, NULL, NULL);
+ else
+ STACK_WIND (frame, br_stub_writev_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->writev, fd, vector, count,
+ offset, flags, iobref, xdata);
+ return 0;
+}
+
+/**
+ TODO: If possible add pictorial represention of below comment.
+
+ Before sending writev on the ANONYMOUS FD, increase the ongoing
+ version first. This brings anonymous fd write closer to the regular
+ fd write by having the ongoing version increased before doing the
+ write (In regular fd, after open the ongoing version is incremented).
+ Do following steps to handle writes on anonymous fds:
+ 1) Increase the on-disk ongoing version
+ 2) Once versioning is successfully done send write operation. If versioning
+ fails, then fail the write fop.
+ 3) In writev_cbk do below things:
+ a) Increase in-memory version
+ b) set the fd context (so that br_stub_release is invoked)
+ c) add the fd to the list of fds maintained in the inode context of
+ bitrot-stub.
+ d) Mark inode as non dirty
+ e) Mard inode as modified (in the inode context)
+**/
+int32_t
+br_stub_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iovec *vector, int32_t count, off_t offset,
+ uint32_t flags, struct iobref *iobref, dict_t *xdata)
+{
+ br_stub_local_t *local = NULL;
+ call_stub_t *stub = NULL;
+ int32_t op_ret = -1;
+ int32_t op_errno = EINVAL;
+ gf_boolean_t inc_version = _gf_false;
+ br_stub_inode_ctx_t *ctx = NULL;
+ uint64_t ctx_addr = 0;
+ int32_t ret = -1;
+
+ GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, frame, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, fd, unwind);
+
+ local = br_stub_alloc_local (this);
+ if (!local) {
+ gf_log (this->name, GF_LOG_ERROR, "local allocation failed "
+ "(gfid: %s)", uuid_utoa (fd->inode->gfid));
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+
+ local->u.context.fd = fd_ref (fd);
+ frame->local = local;
+
+ ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to get the inode "
+ "context for the inode %s",
+ uuid_utoa (fd->inode->gfid));
+ goto unwind;
+ }
+
+ ctx = (br_stub_inode_ctx_t *) (long) ctx_addr;
+ if (fd_is_anonymous (fd)) {
+ ret = br_stub_anon_fd_ctx (this, fd, ctx);
+ if (ret)
+ goto unwind;
+ }
+
+ /* TODO: Better to do a dummy fsetxattr instead of write. Keep write
+ simple */
+ if (br_stub_dummy_write (frame)) {
+ LOCK (&fd->inode->lock);
+ {
+ (void) __br_stub_inode_sign_state
+ (ctx, GF_FOP_WRITE, fd);
+ }
+ UNLOCK (&fd->inode->lock);
+
+ if (xdata && dict_get (xdata, "br-fd-reopen")) {
+ op_ret = vector->iov_len;
+ op_errno = 0;
+ goto unwind;
+ }
+ }
+
+ /**
+ * Check whether this is the first write on this inode since the last
+ * sign notification has been sent. If so, do versioning. Otherwise
+ * go ahead with the fop.
+ */
+ inc_version = br_stub_inc_version (this, fd, ctx);
+ if (!inc_version)
+ goto wind;
+
+ /* Create the stub for the write fop */
+ stub = fop_writev_stub (frame, br_stub_writev_resume, fd, vector, count,
+ offset, flags, iobref, xdata);
+
+ if (!stub) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to allocate stub for "
+ "write fop (gfid: %s), unwinding",
+ uuid_utoa (fd->inode->gfid));
+ goto unwind;
+ }
+
+ /* Perform Versioning */
+ return br_stub_perform_incversioning (this, frame, stub, fd, ctx);
+
+wind:
+ STACK_WIND (frame, br_stub_writev_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->writev, fd, vector, count, offset,
+ flags, iobref, xdata);
+ return 0;
+
+unwind:
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, NULL, NULL,
+ NULL);
+ br_stub_cleanup_local (local);
+ br_stub_dealloc_local (local);
+ return 0;
+}
+
+int32_t
+br_stub_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ int32_t ret = 0;
+ uint64_t ctx_addr = 0;
+ br_stub_inode_ctx_t *ctx = NULL;
+ br_stub_local_t *local = NULL;
+
+ if (frame->local) {
+ local = frame->local;
+ frame->local = NULL;
+ }
if (op_ret < 0)
goto unwind;
- if (cookie != (void *) BR_STUB_REQUEST_COOKIE)
+
+ ret = br_stub_get_inode_ctx (this, local->u.context.fd->inode,
+ &ctx_addr);
+ if (ret < 0)
goto unwind;
+ ctx = (br_stub_inode_ctx_t *) (long) ctx_addr;
+
+ /* Mark the flag to indicate the inode has been modified */
+ LOCK (&local->u.context.fd->inode->lock);
+ {
+ if (!__br_stub_is_inode_modified (ctx))
+ __br_stub_set_inode_modified (ctx);
+ }
+ UNLOCK (&local->u.context.fd->inode->lock);
+
+
+unwind:
+ STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+ br_stub_cleanup_local (local);
+ br_stub_dealloc_local (local);
+ return 0;
+}
+
+int32_t
+br_stub_ftruncate_resume (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ off_t offset, dict_t *xdata)
+{
+ STACK_WIND (frame, br_stub_ftruncate_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+ return 0;
+}
+
+int32_t
+br_stub_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ off_t offset, dict_t *xdata)
+{
+ br_stub_local_t *local = NULL;
+ call_stub_t *stub = NULL;
+ int32_t op_ret = -1;
+ int32_t op_errno = EINVAL;
+ gf_boolean_t inc_version = _gf_false;
+ br_stub_inode_ctx_t *ctx = NULL;
+ uint64_t ctx_addr = 0;
+ int32_t ret = -1;
+
+ GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, frame, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, fd, unwind);
+
+ local = br_stub_alloc_local (this);
+ if (!local) {
+ gf_log (this->name, GF_LOG_ERROR, "local allocation failed "
+ "(gfid: %s)", uuid_utoa (fd->inode->gfid));
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+
+ local->u.context.fd = fd_ref (fd);
+ frame->local = local;
+
ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr);
- if (ret < 0)
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to get the inode "
+ "context for the inode %s",
+ uuid_utoa (fd->inode->gfid));
goto unwind;
+ }
+
+ ctx = (br_stub_inode_ctx_t *) (long) ctx_addr;
+ if (fd_is_anonymous (fd)) {
+ ret = br_stub_anon_fd_ctx (this, fd, ctx);
+ if (ret)
+ goto unwind;
+ }
- stub = fop_open_cbk_stub (frame, NULL, op_ret, op_errno, fd, xdata);
+ /**
+ * c.f. br_stub_writev()
+ */
+ inc_version = br_stub_inc_version (this, fd, ctx);
+ if (!inc_version)
+ goto wind;
+
+ /* Create the stub for the ftruncate fop */
+ stub = fop_ftruncate_stub (frame, br_stub_ftruncate_resume, fd, offset,
+ xdata);
if (!stub) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to allocate stub for "
+ "ftruncate fop (gfid: %s), unwinding",
+ uuid_utoa (fd->inode->gfid));
+ goto unwind;
+ }
+
+ /* Perform Versioning */
+ return br_stub_perform_incversioning (this, frame, stub, fd, ctx);
+
+wind:
+ STACK_WIND (frame, br_stub_ftruncate_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+ return 0;
+
+unwind:
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, NULL, NULL,
+ NULL);
+ br_stub_cleanup_local (local);
+ br_stub_dealloc_local (local);
+ return 0;
+}
+
+int32_t
+br_stub_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ int32_t ret = 0;
+ uint64_t ctx_addr = 0;
+ br_stub_inode_ctx_t *ctx = NULL;
+ br_stub_local_t *local = NULL;
+
+ if (frame->local) {
+ local = frame->local;
+ frame->local = NULL;
+ }
+
+ if (op_ret < 0)
+ goto unwind;
+
+ ret = br_stub_get_inode_ctx (this, local->u.context.fd->inode,
+ &ctx_addr);
+ if (ret < 0)
+ goto unwind;
+
+ ctx = (br_stub_inode_ctx_t *) (long) ctx_addr;
+
+ /* Mark the flag to indicate the inode has been modified */
+ LOCK (&local->u.context.fd->inode->lock);
+ {
+ if (!__br_stub_is_inode_modified (ctx))
+ __br_stub_set_inode_modified (ctx);
+ }
+ UNLOCK (&local->u.context.fd->inode->lock);
+
+
+unwind:
+ STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+ br_stub_cleanup_local (local);
+ br_stub_dealloc_local (local);
+ return 0;
+}
+
+int32_t
+br_stub_truncate_resume (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ off_t offset, dict_t *xdata)
+{
+ STACK_WIND (frame, br_stub_ftruncate_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+ return 0;
+}
+
+/**
+ * Bit-rot-stub depends heavily on the fd based operations to for doing
+ * versioning and sending notification. It starts tracking the operation
+ * upon getting first fd based modify operation by doing versioning and
+ * sends notification when last fd using which the inode was modified is
+ * released.
+ * But for truncate there is no fd and hence it becomes difficult to do
+ * the versioning and send notification. It is handled by doing versioning
+ * on an anonymous fd. The fd will be valid till the completion of the
+ * truncate call. It guarantees that release on this anonymous fd will happen
+ * after the truncate call and notification is sent after the truncate call.
+ */
+int32_t
+br_stub_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ off_t offset, dict_t *xdata)
+{
+ br_stub_local_t *local = NULL;
+ call_stub_t *stub = NULL;
+ int32_t op_ret = -1;
+ int32_t op_errno = EINVAL;
+ gf_boolean_t inc_version = _gf_false;
+ br_stub_inode_ctx_t *ctx = NULL;
+ uint64_t ctx_addr = 0;
+ int32_t ret = -1;
+ fd_t *fd = NULL;
+
+ GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, frame, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, loc, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, loc->inode, unwind);
+
+ fd = fd_anonymous (loc->inode);
+ if (!fd) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to create anonymous "
+ "fd for the inode %s", uuid_utoa (loc->inode->gfid));
+ goto unwind;
+ }
+
+ local = br_stub_alloc_local (this);
+ if (!local) {
+ gf_log (this->name, GF_LOG_ERROR, "local allocation failed "
+ "(gfid: %s)", uuid_utoa (loc->inode->gfid));
op_ret = -1;
- op_errno = EINVAL;
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+
+ local->u.context.fd = fd;
+ frame->local = local;
+
+ ret = br_stub_get_inode_ctx (this, loc->inode, &ctx_addr);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to get the inode "
+ "context for the inode %s",
+ uuid_utoa (fd->inode->gfid));
goto unwind;
}
+ ctx = (br_stub_inode_ctx_t *) (long) ctx_addr;
+ ret = br_stub_anon_fd_ctx (this, local->u.context.fd, ctx);
+ if (ret)
+ goto unwind;
+
/**
- * Ongoing version needs to be incremented. If the inode is not dirty,
- * things are simple: increment the ongoing version safely and be done.
- * If inode is dirty, a writeback to disk is required. This is tricky in
- * case of multiple open()'s as ongoing version needs to be incremented
- * on a successful writeback. It's probably safe to remember the ongoing
- * version before writeback and *assigning* it in the callback, but that
- * may lead to a trustable checksum to be treated as stale by scrubber
- * (the case where the in-memory ongoing version is lesser than the
- * on-disk version). Therefore, *all* open() calls (which might have
- * come in parallel) try to synchronize the next ongoing version to
- * disk. In the callback path, the winner marks the inode as synced
- * therby loosing open() calls become no-op's.
+ * c.f. br_stub_writev()
*/
- ctx = (br_stub_inode_ctx_t *) (long) ctx_addr;
- return br_stub_perform_incversioning (this, frame, stub, fd, ctx);
+ inc_version = br_stub_inc_version (this, fd, ctx);
+ if (!inc_version)
+ goto wind;
- unwind:
- STACK_UNWIND_STRICT (open, frame,
- op_ret, op_errno, fd, xdata);
+ /* Create the stub for the truncate fop */
+ stub = fop_truncate_stub (frame, br_stub_truncate_resume, loc, offset,
+ xdata);
+ if (!stub) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to allocate stub for "
+ "truncate fop (gfid: %s), unwinding",
+ uuid_utoa (fd->inode->gfid));
+ goto unwind;
+ }
+
+ /* Perform Versioning */
+ return br_stub_perform_incversioning (this, frame, stub,
+ local->u.context.fd, ctx);
+
+wind:
+ STACK_WIND (frame, br_stub_truncate_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+ return 0;
+
+unwind:
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, NULL, NULL,
+ NULL);
+ br_stub_cleanup_local (local);
+ br_stub_dealloc_local (local);
return 0;
}
+/** }}} */
+
+
+/** {{{ */
+
+/* open() */
+
+/**
+ * It's probably worth mentioning a bit about why some of the housekeeping
+ * work is done in open() call path, rather than the callback path.
+ * Two (or more) open()'s in parallel can race and lead to a situation
+ * where a release() gets triggered (possibly after a series of write()
+ * calls) when *other* open()'s have still not reached callback path
+ * thereby having an active fd on an inode that is in process of getting
+ * signed with the current version.
+ *
+ * Maintaining fd list in the call path ensures that a release() would
+ * not be triggered if an open() call races ahead (followed by a close())
+ * threby finding non-empty fd list.
+ */
+
int
br_stub_open (call_frame_t *frame, xlator_t *this,
loc_t *loc, int32_t flags, fd_t *fd, dict_t *xdata)
{
- void *cookie = NULL;
+ int32_t ret = -1;
+ br_stub_inode_ctx_t *ctx = NULL;
+ uint64_t ctx_addr = 0;
+
+ GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, loc, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, fd, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, fd->inode, unwind);
- if (!flags)
- goto wind;
if (frame->root->pid == GF_CLIENT_PID_SCRUB)
goto wind;
- cookie = (void *) BR_STUB_REQUEST_COOKIE;
- wind:
- STACK_WIND_COOKIE (frame, br_stub_open_cbk, cookie,
- FIRST_CHILD (this), FIRST_CHILD (this)->fops->open,
- loc, flags, fd, xdata);
+ ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to get the inode "
+ "context for the file %s (gfid: %s)", loc->path,
+ uuid_utoa (fd->inode->gfid));
+ goto unwind;
+ }
+
+ ctx = (br_stub_inode_ctx_t *)(long)ctx_addr;
+ if (flags == O_RDONLY)
+ goto wind;
+
+ ret = br_stub_add_fd_to_inode (this, fd, ctx);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "failed add fd to the list "
+ "(gfid: %s)", uuid_utoa (fd->inode->gfid));
+ goto unwind;
+ }
+
+wind:
+ STACK_WIND (frame, default_open_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->open, loc, flags, fd, xdata);
+ return 0;
+unwind:
+ STACK_UNWIND_STRICT (open, frame, -1, EINVAL, NULL, NULL);
return 0;
}
@@ -946,39 +1402,60 @@ br_stub_open (call_frame_t *frame, xlator_t *this,
/* creat() */
+/**
+ * This routine registers a release callback for the given fd and adds the
+ * fd to the inode context fd tracking list.
+ */
+int32_t
+br_stub_add_fd_to_inode (xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx)
+{
+ int32_t ret = -1;
+ br_stub_fd_t *br_stub_fd = NULL;
+
+ ret = br_stub_require_release_call (this, fd, &br_stub_fd);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to set the fd "
+ "context for the file (gfid: %s)",
+ uuid_utoa (fd->inode->gfid));
+ goto out;
+ }
+
+ LOCK (&fd->inode->lock);
+ {
+ list_add_tail (&ctx->fd_list, &br_stub_fd->list);
+ }
+ UNLOCK (&fd->inode->lock);
+
+ ret = 0;
+
+out:
+ return ret;
+}
+
int
br_stub_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, fd_t *fd, inode_t *inode,
struct iatt *stbuf, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
{
- int32_t ret = 0;
- uint64_t ctx_addr = 0;
- call_stub_t *stub = NULL;
- br_stub_inode_ctx_t *ctx = NULL;
+ int32_t ret = 0;
+ uint64_t ctx_addr = 0;
+ br_stub_inode_ctx_t *ctx = NULL;
+ unsigned long version = BITROT_DEFAULT_CURRENT_VERSION;
if (op_ret < 0)
goto unwind;
- stub = fop_create_cbk_stub (frame, NULL, op_ret, op_errno, fd, inode,
- stbuf, preparent, postparent, xdata);
- if (!stub) {
- op_ret = -1;
- op_errno = EINVAL;
- goto unwind;
- }
-
ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr);
- if (ret < 0)
- ctx_addr = 0;
- ctx = (br_stub_inode_ctx_t *) (long) ctx_addr;
-
- /* see comment in br_stub_open_cbk().. */
- return (ctx)
- ? br_stub_perform_incversioning (this, frame, stub, fd, ctx)
- : br_stub_perform_fullversioning (this, frame, stub, fd);
+ if (ret < 0) {
+ ret = br_stub_init_inode_versions (this, fd, inode, version,
+ _gf_true);
+ } else {
+ ctx = (br_stub_inode_ctx_t *)(long)ctx_addr;
+ ret = br_stub_add_fd_to_inode (this, fd, ctx);
+ }
- unwind:
+unwind:
STACK_UNWIND_STRICT (create, frame, op_ret, op_errno,
fd, inode, stbuf, preparent, postparent, xdata);
return 0;
@@ -989,10 +1466,20 @@ br_stub_create (call_frame_t *frame,
xlator_t *this, loc_t *loc, int32_t flags,
mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
{
+ GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, loc, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, loc->inode, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, fd, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, fd->inode, unwind);
+
STACK_WIND (frame, br_stub_create_cbk, FIRST_CHILD (this),
FIRST_CHILD (this)->fops->create,
loc, flags, mode, umask, fd, xdata);
return 0;
+unwind:
+ STACK_UNWIND_STRICT (create, frame, -1, EINVAL, NULL, NULL, NULL, NULL,
+ NULL, NULL);
+ return 0;
}
/** }}} */
@@ -1011,21 +1498,11 @@ br_stub_lookup_version (xlator_t *this,
* out the correct version to use in the inode context (start with
* the default version if unavailable). As of now versions are not
* persisted on-disk. The inode is marked dirty, so that the first
- * operation (such as open(), etc..) would trigger synchronization
- * to disk.
+ * operation (such as write(), etc..) triggers synchronization to
+ * disk.
*/
status = br_version_xattr_state (xattr, &obuf, &sbuf);
- /**
- * stub does not know how to handle presence of signature but not
- * the object version, therefore, in such cases, bail out..
- */
- if (status == BR_VXATTR_STATUS_INVALID) {
- gf_log (this->name, GF_LOG_ERROR, "Invalid versioning xattrs. "
- "Bailing out [GFID: %s]", uuid_utoa (gfid));
- return -1;
- }
-
version = ((status == BR_VXATTR_STATUS_FULL)
|| (status == BR_VXATTR_STATUS_UNSIGNED))
? obuf->ongoingversion : BITROT_DEFAULT_CURRENT_VERSION;
@@ -1259,8 +1736,8 @@ br_stub_noop (call_frame_t *frame, void *cookie, xlator_t *this,
}
static inline void
-br_stub_send_ipc_fop (xlator_t *this,
- fd_t *fd, unsigned long releaseversion, int32_t flags)
+br_stub_send_ipc_fop (xlator_t *this, fd_t *fd, unsigned long releaseversion,
+ int sign_info)
{
int32_t op = 0;
int32_t ret = 0;
@@ -1269,8 +1746,8 @@ br_stub_send_ipc_fop (xlator_t *this,
changelog_event_t ev = {0,};
ev.ev_type = CHANGELOG_OP_TYPE_BR_RELEASE;
- ev.u.releasebr.flags = flags;
ev.u.releasebr.version = releaseversion;
+ ev.u.releasebr.sign_info = sign_info;
gf_uuid_copy (ev.u.releasebr.gfid, fd->inode->gfid);
xdata = dict_new ();
@@ -1305,14 +1782,67 @@ br_stub_send_ipc_fop (xlator_t *this,
return;
}
+/**
+ * This is how the state machine of sign info works:
+ * 3 states:
+ * 1) BR_SIGN_NORMAL => The default State of the inode
+ * 2) BR_SIGN_REOPEN_WAIT => A release has been sent and is waiting for reopen
+ * 3) BR_SIGN_QUICK => reopen has happened and this release should trigger sign
+ * 2 events:
+ * 1) GF_FOP_RELEASE
+ * 2) GF_FOP_WRITE (actually a dummy write fro BitD)
+ *
+ * This is how states are changed based on events:
+ * EVENT: GF_FOP_RELEASE:
+ * if (state == BR_SIGN_NORMAL) ; then
+ * set state = BR_SIGN_REOPEN_WAIT;
+ * if (state == BR_SIGN_QUICK); then
+ * set state = BR_SIGN_NORMAL;
+ * EVENT: GF_FOP_WRITE:
+ * if (state == BR_SIGN_REOPEN_WAIT); then
+ * set state = BR_SIGN_QUICK;
+ */
+br_sign_state_t
+__br_stub_inode_sign_state (br_stub_inode_ctx_t *ctx,
+ glusterfs_fop_t fop, fd_t *fd)
+{
+ br_sign_state_t sign_info = BR_SIGN_INVALID;
+
+ switch (fop) {
+
+ case GF_FOP_WRITE:
+ sign_info = ctx->info_sign = BR_SIGN_QUICK;
+ break;
+
+ case GF_FOP_RELEASE:
+ GF_ASSERT (ctx->info_sign != BR_SIGN_REOPEN_WAIT);
+
+ if (ctx->info_sign == BR_SIGN_NORMAL) {
+ sign_info = ctx->info_sign = BR_SIGN_REOPEN_WAIT;
+ } else {
+ sign_info = ctx->info_sign;
+ ctx->info_sign = BR_SIGN_NORMAL;
+ }
+
+ break;
+ default:
+ break;
+ }
+
+ return sign_info;
+}
+
int32_t
br_stub_release (xlator_t *this, fd_t *fd)
{
- int32_t ret = 0;
- int32_t flags = 0;
- inode_t *inode = NULL;
- unsigned long releaseversion = 0;
- br_stub_inode_ctx_t *ctx = NULL;
+ int32_t ret = 0;
+ int32_t flags = 0;
+ inode_t *inode = NULL;
+ unsigned long releaseversion = 0;
+ br_stub_inode_ctx_t *ctx = NULL;
+ uint64_t tmp = 0;
+ br_stub_fd_t *br_stub_fd = NULL;
+ int32_t signinfo = 0;
inode = fd->inode;
@@ -1321,12 +1851,23 @@ br_stub_release (xlator_t *this, fd_t *fd)
ctx = __br_stub_get_ongoing_version_ctx (this, inode, NULL);
if (ctx == NULL)
goto unblock;
- __br_stub_track_release (ctx);
+ br_stub_fd = br_stub_fd_ctx_get (this, fd);
+ if (br_stub_fd) {
+ list_del_init (&br_stub_fd->list);
+ }
+
ret = __br_stub_can_trigger_release
- (inode, ctx, &releaseversion, &flags);
- if (ret) {
- GF_ASSERT (__br_stub_is_inode_dirty (ctx) == 0);
+ (inode, ctx, &releaseversion);
+ if (!ret)
+ goto unblock;
+
+ signinfo = __br_stub_inode_sign_state (ctx, GF_FOP_RELEASE, fd);
+ signinfo = htonl (signinfo);
+
+ /* inode back to initital state: mark dirty */
+ if (ctx->info_sign == BR_SIGN_NORMAL) {
__br_stub_mark_inode_dirty (ctx);
+ __br_stub_unset_inode_modified (ctx);
}
}
unblock:
@@ -1334,10 +1875,17 @@ br_stub_release (xlator_t *this, fd_t *fd)
if (ret) {
gf_log (this->name, GF_LOG_DEBUG,
- "releaseversion: %lu|flags: %d", releaseversion, flags);
- br_stub_send_ipc_fop (this, fd, releaseversion, flags);
+ "releaseversion: %lu | flags: %d | signinfo: %d",
+ (unsigned long) ntohl (releaseversion),
+ flags, ntohl(signinfo));
+ br_stub_send_ipc_fop (this, fd, releaseversion, signinfo);
}
+ ret = fd_ctx_del (fd, this, &tmp);
+ br_stub_fd = (br_stub_fd_t *)(long)tmp;
+
+ GF_FREE (br_stub_fd);
+
return 0;
}
@@ -1351,11 +1899,12 @@ void
br_stub_ictxmerge (xlator_t *this, fd_t *fd,
inode_t *inode, inode_t *linked_inode)
{
- int32_t ret = 0;
- uint64_t ctxaddr = 0;
- uint64_t lctxaddr = 0;
- br_stub_inode_ctx_t *ctx = NULL;
- br_stub_inode_ctx_t *lctx = NULL;
+ int32_t ret = 0;
+ uint64_t ctxaddr = 0;
+ uint64_t lctxaddr = 0;
+ br_stub_inode_ctx_t *ctx = NULL;
+ br_stub_inode_ctx_t *lctx = NULL;
+ br_stub_fd_t *br_stub_fd = NULL;
ret = br_stub_get_inode_ctx (this, inode, &ctxaddr);
if (ret < 0)
@@ -1369,29 +1918,15 @@ br_stub_ictxmerge (xlator_t *this, fd_t *fd,
goto unblock;
lctx = (br_stub_inode_ctx_t *) lctxaddr;
- if (__br_stub_is_inode_dirty (lctx)) {
- /**
- * RACY code: An inode can end up in this situation
- * after a lookup() or after a create() followed by
- * a release(). Even if we distinguish b/w the two,
- * there needs to be more infrastructure built up
- * in stub to handle these races. Note, that it's
- * probably OK to ignore the race iff the version
- * was initialized on the very first lookup(), i.e.,
- * [ongoingversion: default].
- *
- * FIXME: fixup races [create(1..n)/lookup(1..n)].
- */
- GF_ASSERT (lctx->currentversion
- == BITROT_DEFAULT_CURRENT_VERSION);
- __br_stub_track_openfd (fd, lctx);
- __br_stub_mark_inode_synced (lctx);
- } else {
- GF_ASSERT (ctx->currentversion <= lctx->currentversion);
- __br_stub_track_openfd (fd, lctx);
+ GF_ASSERT (list_is_singular (&ctx->fd_list));
+ br_stub_fd = list_first_entry (&ctx->fd_list, br_stub_fd_t,
+ list);
+ if (br_stub_fd) {
+ GF_ASSERT (br_stub_fd->fd == fd);
+ list_move_tail (&br_stub_fd->list, &lctx->fd_list);
}
}
- unblock:
+unblock:
UNLOCK (&linked_inode->lock);
done:
@@ -1409,6 +1944,9 @@ struct xlator_fops fops = {
.getxattr = br_stub_getxattr,
.fgetxattr = br_stub_fgetxattr,
.fsetxattr = br_stub_fsetxattr,
+ .writev = br_stub_writev,
+ .truncate = br_stub_truncate,
+ .ftruncate = br_stub_ftruncate,
};
struct xlator_cbks cbks = {
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub.h b/xlators/features/bit-rot/src/stub/bit-rot-stub.h
index 86090bfb877..69e212bb81f 100644
--- a/xlators/features/bit-rot/src/stub/bit-rot-stub.h
+++ b/xlators/features/bit-rot/src/stub/bit-rot-stub.h
@@ -21,6 +21,7 @@
#include "xlator.h"
#include "defaults.h"
#include "call-stub.h"
+#include "bit-rot-stub-mem-types.h"
#include "bit-rot-common.h"
@@ -32,17 +33,18 @@ typedef struct br_stub_inode_ctx {
a writeback to disk? */
unsigned long currentversion; /* ongoing version */
- struct release {
- int32_t ordflags;
- unsigned long opencount; /* number of open()s before
- final release() */
- unsigned long releasecount; /* number of release()s */
- } releasectx;
-#define BR_STUB_REQUIRE_RELEASE_CBK 0x0E0EA0E
+ int info_sign;
+ struct list_head fd_list; /* list of open fds or fds participating in
+ write operations */
} br_stub_inode_ctx_t;
+typedef struct br_stub_fd {
+ fd_t *fd;
+ struct list_head list;
+} br_stub_fd_t;
#define I_DIRTY (1<<0) /* inode needs writeback */
+#define I_MODIFIED (1<<1)
#define WRITEBACK_DURABLE 1 /* writeback is durable */
/**
@@ -60,12 +62,10 @@ typedef struct br_stub_local {
uuid_t gfid;
inode_t *inode;
unsigned long version;
- gf_boolean_t markdirty;
} context;
} u;
} br_stub_local_t;
-#define BR_STUB_FULL_VERSIONING (1<<0)
#define BR_STUB_INCREMENTAL_VERSIONING (1<<1)
typedef struct br_stub_private {
@@ -96,16 +96,131 @@ __br_stub_is_inode_dirty (br_stub_inode_ctx_t *ctx)
return (ctx->need_writeback & I_DIRTY);
}
+/* inode mofification markers */
+static inline void
+__br_stub_set_inode_modified (br_stub_inode_ctx_t *ctx)
+{
+ ctx->need_writeback |= I_MODIFIED;
+}
+
+static inline void
+__br_stub_unset_inode_modified (br_stub_inode_ctx_t *ctx)
+{
+ ctx->need_writeback &= ~I_MODIFIED;
+}
+
+static inline int
+__br_stub_is_inode_modified (br_stub_inode_ctx_t *ctx)
+{
+ return (ctx->need_writeback & I_MODIFIED);
+}
+
+br_stub_fd_t *
+br_stub_fd_new (void)
+{
+ br_stub_fd_t *br_stub_fd = NULL;
+
+ br_stub_fd = GF_CALLOC (1, sizeof (*br_stub_fd),
+ gf_br_stub_mt_br_stub_fd_t);
+
+ return br_stub_fd;
+}
+
+int
+__br_stub_fd_ctx_set (xlator_t *this, fd_t *fd, br_stub_fd_t *br_stub_fd)
+{
+ uint64_t value = 0;
+ int ret = -1;
+
+ GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, out);
+ GF_VALIDATE_OR_GOTO (this->name, fd, out);
+ GF_VALIDATE_OR_GOTO (this->name, br_stub_fd, out);
+
+ value = (uint64_t)(long) br_stub_fd;
+
+ ret = __fd_ctx_set (fd, this, value);
+
+out:
+ return ret;
+}
+
+br_stub_fd_t *
+__br_stub_fd_ctx_get (xlator_t *this, fd_t *fd)
+{
+ br_stub_fd_t *br_stub_fd = NULL;
+ uint64_t value = 0;
+ int ret = -1;
+
+ GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, out);
+ GF_VALIDATE_OR_GOTO (this->name, fd, out);
+
+ ret = __fd_ctx_get (fd, this, &value);
+ if (ret)
+ return NULL;
+
+ br_stub_fd = (br_stub_fd_t *) ((long) value);
+
+out:
+ return br_stub_fd;
+}
+
+br_stub_fd_t *
+br_stub_fd_ctx_get (xlator_t *this, fd_t *fd)
+{
+ br_stub_fd_t *br_stub_fd = NULL;
+
+ GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, out);
+ GF_VALIDATE_OR_GOTO (this->name, fd, out);
+
+ LOCK (&fd->lock);
+ {
+ br_stub_fd = __br_stub_fd_ctx_get (this, fd);
+ }
+ UNLOCK (&fd->lock);
+
+out:
+ return br_stub_fd;
+}
+
+int32_t
+br_stub_fd_ctx_set (xlator_t *this, fd_t *fd, br_stub_fd_t *br_stub_fd)
+{
+ int32_t ret = -1;
+
+ GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, out);
+ GF_VALIDATE_OR_GOTO (this->name, fd, out);
+ GF_VALIDATE_OR_GOTO (this->name, br_stub_fd, out);
+
+ LOCK (&fd->lock);
+ {
+ ret = __br_stub_fd_ctx_set (this, fd, br_stub_fd);
+ }
+ UNLOCK (&fd->lock);
+
+out:
+ return ret;
+}
+
static inline int
-br_stub_require_release_call (xlator_t *this, fd_t *fd)
+br_stub_require_release_call (xlator_t *this, fd_t *fd, br_stub_fd_t **fd_ctx)
{
int32_t ret = 0;
+ br_stub_fd_t *br_stub_fd = NULL;
+
+ br_stub_fd = br_stub_fd_new ();
+ if (!br_stub_fd)
+ return -1;
+
+ br_stub_fd->fd = fd;
+ INIT_LIST_HEAD (&br_stub_fd->list);
- ret = fd_ctx_set (fd, this,
- (uint64_t)(long)BR_STUB_REQUIRE_RELEASE_CBK);
+ ret = br_stub_fd_ctx_set (this, fd, br_stub_fd);
if (ret)
gf_log (this->name, GF_LOG_WARNING,
"could not set fd context (for release callback");
+ else
+ *fd_ctx = br_stub_fd;
+
return ret;
}
@@ -122,7 +237,15 @@ static inline int
br_stub_get_inode_ctx (xlator_t *this,
inode_t *inode, uint64_t *ctx)
{
- return inode_ctx_get (inode, this, ctx);
+ int ret = -1;
+
+ LOCK (&inode->lock);
+ {
+ ret = __br_stub_get_inode_ctx (this, inode, ctx);
+ }
+ UNLOCK (&inode->lock);
+
+ return ret;
}
static inline int
@@ -144,55 +267,31 @@ __br_stub_writeback_version (br_stub_inode_ctx_t *ctx)
static inline void
__br_stub_set_ongoing_version (br_stub_inode_ctx_t *ctx, unsigned long version)
{
- ctx->currentversion = version;
-}
-
-static inline void
-__br_stub_reset_release_counters (br_stub_inode_ctx_t *ctx)
-{
- ctx->releasectx.ordflags = 0;
- ctx->releasectx.opencount = 0;
- ctx->releasectx.releasecount = 0;
-}
-
-static inline void
-__br_stub_track_release (br_stub_inode_ctx_t *ctx)
-{
- ++ctx->releasectx.releasecount;
-}
-
-static inline void
-___br_stub_track_open (br_stub_inode_ctx_t *ctx)
-{
- ++ctx->releasectx.opencount;
-}
-
-static inline void
-___br_stub_track_open_flags (fd_t *fd, br_stub_inode_ctx_t *ctx)
-{
- ctx->releasectx.ordflags |= fd->flags;
-}
-
-static inline void
-__br_stub_track_openfd (fd_t *fd, br_stub_inode_ctx_t *ctx)
-{
- ___br_stub_track_open (ctx);
- ___br_stub_track_open_flags (fd, ctx);
+ if (ctx->currentversion < version)
+ ctx->currentversion = version;
+ else
+ gf_log ("bit-rot-stub", GF_LOG_WARNING, "current version: %lu"
+ "new version: %lu", ctx->currentversion, version);
}
static inline int
__br_stub_can_trigger_release (inode_t *inode,
- br_stub_inode_ctx_t *ctx,
- unsigned long *version, int32_t *flags)
+ br_stub_inode_ctx_t *ctx, unsigned long *version)
{
- if (list_empty (&inode->fd_list)
- && (ctx->releasectx.releasecount == ctx->releasectx.opencount)) {
- if (flags)
- *flags = htonl (ctx->releasectx.ordflags);
+ /**
+ * If the inode is modified, then it has to be dirty. An inode is
+ * marked dirty once version is increased. Its marked as modified
+ * when the modification call (write/truncate) which triggered
+ * the versioning is successful.
+ */
+ if (__br_stub_is_inode_modified (ctx)
+ && list_empty (&ctx->fd_list)
+ && (ctx->info_sign != BR_SIGN_REOPEN_WAIT)) {
+
+ GF_ASSERT (__br_stub_is_inode_dirty (ctx) == 0);
+
if (version)
*version = htonl (ctx->currentversion);
-
- __br_stub_reset_release_counters (ctx);
return 1;
}
@@ -261,11 +360,16 @@ static inline void
br_stub_remove_vxattrs (dict_t *xattr)
{
if (xattr) {
- dict_del (xattr, BITROT_OBJECT_BAD_KEY);
dict_del (xattr, BITROT_CURRENT_VERSION_KEY);
dict_del (xattr, BITROT_SIGNING_VERSION_KEY);
dict_del (xattr, BITROT_SIGNING_XATTR_SIZE_KEY);
}
}
+int32_t
+br_stub_add_fd_to_inode (xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx);
+
+br_sign_state_t
+__br_stub_inode_sign_state (br_stub_inode_ctx_t *ctx, glusterfs_fop_t fop,
+ fd_t *fd);
#endif /* __BIT_ROT_STUB_H__ */