author    Pranith Kumar K <pkarampu@redhat.com>  2015-04-25 15:58:09 +0530
committer Vijay Bellur <vbellur@redhat.com>      2015-05-08 05:14:38 -0700
commit    fc199e1b6f9423b4dc0c9b34bf894ad66ffafabf (patch)
tree      0b7621ff9143bfbeeb2fca62c6a7032775381c01
parent    33fdc310700da74a4142dab48d00c4753100904b (diff)
cluster/ec: data heal implementation for ec
Data self-heal:
1)  Take an inode lock in domain 'this->name:self-heal' on the 0-0 range (full file), so that no other process tries to self-heal the file at the same time.
2)  Take an inode lock in domain 'this->name' on the 0-0 range (full file).
3)  Perform fxattrop+fstat and get the xattrs on all the bricks.
4)  Choose as source a brick whose version is shared by at least ec->fragments bricks.
5)  Truncate the sinks.
6)  Unlock the lock taken in 2).
7)  For each block: take a full-file lock, read from the sources, write to the sinks, and unlock.
8)  Take a full-file lock and verify that the file is still a sane copy, i.e. it did not become unusable while bricks were offline. Restore the mtime to its pre-heal value.
9)  Do an xattrop with negative values for 'dirty' and, for the version xattr, the difference between the highest version and each brick's own version.
10) Unlock the lock acquired in 8).
11) Unlock the lock acquired in 1).

Change-Id: I6f4d42cd5423c767262c9d7bb5ca7767adb3e5fd
BUG: 1215265
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
Reviewed-on: http://review.gluster.org/10384
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Vijay Bellur <vbellur@redhat.com>
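Below is a minimal, self-contained sketch of the lock/copy/unlock ordering described in the steps above. Every name in it (lock_full_file, heal_data_sketch, the domain strings, the fixed block count) is a hypothetical stand-in, not the real ec or cluster-syncop API; the actual implementation is in the ec-heal.c hunk of this patch.

/* Sketch of the data self-heal ordering; helpers are hypothetical stubs. */
#include <stdio.h>
#include <stdbool.h>

static bool lock_full_file(const char *domain)   { printf("lock   %s 0-0\n", domain); return true; }
static void unlock_full_file(const char *domain) { printf("unlock %s 0-0\n", domain); }

int heal_data_sketch(void)
{
    /* 1) serialize healers in the "<xl>:self-heal" domain */
    if (!lock_full_file("ec:self-heal"))
        return -1;

    /* 2)-6) inspect state under the "<xl>" domain, pick sources, truncate sinks */
    lock_full_file("ec");
    printf("fxattrop+fstat on all bricks, pick source set, truncate sinks\n");
    unlock_full_file("ec");

    /* 7) copy block by block, taking a full-file lock per block */
    for (int block = 0; block < 4; block++) {
        lock_full_file("ec");
        printf("read block %d from sources, write to sinks\n", block);
        unlock_full_file("ec");
    }

    /* 8)-10) re-check the copy, restore mtime, undo version/dirty deltas */
    lock_full_file("ec");
    printf("verify copy still sane, restore mtime, xattrop version/dirty deltas\n");
    unlock_full_file("ec");

    /* 11) release the self-heal domain lock */
    unlock_full_file("ec:self-heal");
    return 0;
}

int main(void) { return heal_data_sketch(); }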
-rw-r--r--  libglusterfs/src/cluster-syncop.h    |  35
-rw-r--r--  xlators/cluster/ec/src/ec-common.h   |   2
-rw-r--r--  xlators/cluster/ec/src/ec-generic.c  |   1
-rw-r--r--  xlators/cluster/ec/src/ec-heal.c     | 793
4 files changed, 823 insertions, 8 deletions
diff --git a/libglusterfs/src/cluster-syncop.h b/libglusterfs/src/cluster-syncop.h
index 2c94246ff1f..a681951c27d 100644
--- a/libglusterfs/src/cluster-syncop.h
+++ b/libglusterfs/src/cluster-syncop.h
@@ -121,6 +121,41 @@ cluster_tryentrylk (xlator_t **subvols, unsigned char *on, int numsubvols,
call_frame_t *frame, xlator_t *this, char *dom,
inode_t *inode, const char *name);
+int32_t
+cluster_fxattrop (xlator_t **subvols, unsigned char *on, int numsubvols,
+ default_args_cbk_t *replies, unsigned char *output,
+ call_frame_t *frame, xlator_t *this, fd_t *fd,
+ gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata);
+
+int32_t
+cluster_fstat (xlator_t **subvols, unsigned char *on, int numsubvols,
+ default_args_cbk_t *replies, unsigned char *output,
+ call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata);
+
+int32_t
+cluster_ftruncate (xlator_t **subvols, unsigned char *on, int numsubvols,
+ default_args_cbk_t *replies, unsigned char *output,
+ call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ dict_t *xdata);
+
+int32_t
+cluster_open (xlator_t **subvols, unsigned char *on, int numsubvols,
+ default_args_cbk_t *replies, unsigned char *output,
+ call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ fd_t *fd, dict_t *xdata);
+
+int
+cluster_tryinodelk (xlator_t **subvols, unsigned char *on, int numsubvols,
+ default_args_cbk_t *replies, unsigned char *locked_on,
+ call_frame_t *frame, xlator_t *this, char *dom,
+ inode_t *inode, off_t off, size_t size);
+
+int32_t
+cluster_fsetattr (xlator_t **subvols, unsigned char *on, int numsubvols,
+ default_args_cbk_t *replies, unsigned char *output,
+ call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata);
+
void
cluster_replies_wipe (default_args_cbk_t *replies, int num_subvols);
#endif /* !_CLUSTER_SYNCOP_H */
diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h
index ba009040b71..04f85a43f16 100644
--- a/xlators/cluster/ec/src/ec-common.h
+++ b/xlators/cluster/ec/src/ec-common.h
@@ -31,6 +31,8 @@ typedef enum {
#define EC_FLAG_WAITING_WINDS 0x0010
+#define EC_SELFHEAL_BIT 62
+
#define EC_MINIMUM_ONE -1
#define EC_MINIMUM_MIN -2
#define EC_MINIMUM_ALL -3
diff --git a/xlators/cluster/ec/src/ec-generic.c b/xlators/cluster/ec/src/ec-generic.c
index 4cf5a50ecbd..50169771476 100644
--- a/xlators/cluster/ec/src/ec-generic.c
+++ b/xlators/cluster/ec/src/ec-generic.c
@@ -18,7 +18,6 @@
#include "ec-fops.h"
#include "byte-order.h"
-#define EC_SELFHEAL_BIT 62
/* FOP: flush */
int32_t ec_flush_cbk(call_frame_t * frame, void * cookie, xlator_t * this,
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
index 1e19cf57e1b..b7b910502f8 100644
--- a/xlators/cluster/ec/src/ec-heal.c
+++ b/xlators/cluster/ec/src/ec-heal.c
@@ -1052,10 +1052,11 @@ void ec_heal_reopen_fd(ec_heal_t * heal)
UNLOCK(&inode->lock);
}
-int32_t ec_heal_writev_cbk(call_frame_t * frame, void * cookie,
- xlator_t * this, int32_t op_ret, int32_t op_errno,
- struct iatt * prebuf, struct iatt * postbuf,
- dict_t * xdata)
+int32_t
+ec_heal_writev_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata)
{
ec_trace("WRITE_CBK", cookie, "ret=%d, errno=%d", op_ret, op_errno);
@@ -1091,7 +1092,7 @@ int32_t ec_heal_readv_cbk(call_frame_t * frame, void * cookie, xlator_t * this,
return 0;
}
-void ec_heal_data(ec_heal_t * heal)
+void ec_heal_data_block(ec_heal_t *heal)
{
ec_trace("DATA", heal->fop, "good=%lX, bad=%lX", heal->good, heal->bad);
@@ -1393,7 +1394,7 @@ ec_manager_heal (ec_fop_data_t * fop, int32_t state)
return EC_STATE_HEAL_DATA_COPY;
case EC_STATE_HEAL_DATA_COPY:
- ec_heal_data(heal);
+ ec_heal_data_block(heal);
return EC_STATE_HEAL_DATA_UNLOCK;
@@ -1633,6 +1634,18 @@ ec_mask_to_char_array (uintptr_t mask, unsigned char *array, int numsubvols)
array[i] = ((mask >> i) & 1);
}
+uintptr_t
+ec_char_array_to_mask (unsigned char *array, int numsubvols)
+{
+ int i = 0;
+ uintptr_t mask = 0;
+
+ for (i = 0; i < numsubvols; i++)
+ if (array[i])
+ mask |= (1ULL<<i);
+ return mask;
+}
+
int
ec_heal_find_direction (ec_t *ec, ec_txn_t type, default_args_cbk_t *replies,
uint64_t *versions, uint64_t *dirty,
@@ -1640,8 +1653,8 @@ ec_heal_find_direction (ec_t *ec, ec_txn_t type, default_args_cbk_t *replies,
{
void *ptr = NULL;
uint64_t *value = NULL;
- uint64_t max_version = 0;
int source = -1;
+ uint64_t max_version = 0;
int32_t len = 0;
int ret = 0;
int i = 0;
@@ -2643,3 +2656,769 @@ out:
STACK_DESTROY (frame->root);
return ret;
}
+
+/*Data heal*/
+int
+ec_heal_data_find_direction (ec_t *ec, default_args_cbk_t *replies,
+ uint64_t *versions, uint64_t *dirty,
+ uint64_t *size, unsigned char *sources,
+ unsigned char *healed_sinks)
+{
+ char version_size[64] = {0};
+ uint64_t *value = NULL;
+ dict_t *version_size_db = NULL;
+ unsigned char *same = NULL;
+ void *ptr = NULL;
+ int len = 0;
+ int max_same_count = 0;
+ int source = 0;
+ int i = 0;
+ int ret = 0;
+
+ version_size_db = dict_new ();
+ if (!version_size_db) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (!replies[i].valid)
+ continue;
+ if (replies[i].op_ret < 0)
+ continue;
+ ret = dict_get_ptr_and_len (replies[i].xattr, EC_XATTR_VERSION,
+ &ptr, &len);
+ if (ret == 0) {
+ value = ptr;
+ versions[i] = ntoh64(value[EC_DATA_TXN]);
+ }
+
+ ret = dict_get_ptr_and_len (replies[i].xattr, EC_XATTR_DIRTY,
+ &ptr, &len);
+ if (ret == 0) {
+ value = ptr;
+ dirty[i] = ntoh64(value[EC_DATA_TXN]);
+ }
+ ret = dict_get_ptr_and_len (replies[i].xattr, EC_XATTR_SIZE,
+ &ptr, &len);
+ if (ret == 0) {
+ value = ptr;
+ size[i] = ntoh64(*value);
+ }
+ /*Build a db of same version, size*/
+ snprintf (version_size, sizeof (version_size),
+ "%"PRIu64"-%"PRIu64, versions[i], size[i]);
+ ret = dict_get_bin (version_size_db, version_size,
+ (void **)&same);
+ if (ret < 0) {
+ same = alloca0 (ec->nodes);
+ }
+
+ same[i] = 1;
+ if (max_same_count < EC_COUNT (same, ec->nodes)) {
+ max_same_count = EC_COUNT (same, ec->nodes);
+ source = i;
+ }
+
+ if (ret < 0) {
+ ret = dict_set_static_bin (version_size_db,
+ version_size, same, ec->nodes);
+ }
+
+ if (ret < 0) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ /* If we don't have ec->fragments number of same version,size it is not
+ * recoverable*/
+ if (max_same_count < ec->fragments) {
+ ret = -EIO;
+ goto out;
+ } else {
+ snprintf (version_size, sizeof (version_size),
+ "%"PRIu64"-%"PRIu64, versions[source], size[source]);
+ ret = dict_get_bin (version_size_db, version_size,
+ (void **)&same);
+ if (ret < 0)
+ goto out;
+ memcpy (sources, same, ec->nodes);
+ for (i = 0; i < ec->nodes; i++) {
+ if (replies[i].valid && (replies[i].op_ret == 0) &&
+ !sources[i])
+ healed_sinks[i] = 1;
+ }
+ }
+ if (EC_COUNT (healed_sinks, ec->nodes) == 0) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+ ret = source;
+out:
+ if (version_size_db)
+ dict_unref (version_size_db);
+ return ret;
+}
+
+int
+__ec_heal_data_prepare (call_frame_t *frame, ec_t *ec, fd_t *fd,
+ unsigned char *locked_on, uint64_t *versions,
+ uint64_t *dirty, uint64_t *size, unsigned char *sources,
+ unsigned char *healed_sinks, unsigned char *trim,
+ struct iatt *stbuf)
+{
+ default_args_cbk_t *replies = NULL;
+ unsigned char *output = NULL;
+ dict_t *xattrs = NULL;
+ uint64_t zero_array[2] = {0};
+ int source = 0;
+ int ret = 0;
+ uint64_t zero_value = 0;
+ uint64_t source_size = 0;
+ int i = 0;
+
+ EC_REPLIES_ALLOC (replies, ec->nodes);
+ output = alloca0(ec->nodes);
+ xattrs = dict_new ();
+ if (!xattrs ||
+ dict_set_static_bin (xattrs, EC_XATTR_VERSION, zero_array,
+ sizeof (zero_array)) ||
+ dict_set_static_bin (xattrs, EC_XATTR_DIRTY, zero_array,
+ sizeof (zero_array)) ||
+ dict_set_static_bin (xattrs, EC_XATTR_SIZE, &zero_value,
+ sizeof (zero_value))) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = cluster_fxattrop (ec->xl_list, locked_on, ec->nodes,
+ replies, output, frame, ec->xl, fd,
+ GF_XATTROP_ADD_ARRAY64, xattrs, NULL);
+ if (EC_COUNT (output, ec->nodes) <= ec->fragments) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+
+ source = ec_heal_data_find_direction (ec, replies, versions, dirty,
+ size, sources, healed_sinks);
+ ret = source;
+ if (ret < 0)
+ goto out;
+
+ /* There could be files with versions, size same but on disk ia_size
+ * could be different because of disk crashes, mark them as sinks as
+ * well*/
+ ret = cluster_fstat (ec->xl_list, locked_on, ec->nodes, replies,
+ output, frame, ec->xl, fd, NULL);
+ EC_INTERSECT (sources, sources, output, ec->nodes);
+ EC_INTERSECT (healed_sinks, healed_sinks, output, ec->nodes);
+ if ((EC_COUNT (sources, ec->nodes) < ec->fragments) ||
+ (EC_COUNT (healed_sinks, ec->nodes) == 0)) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+
+ source_size = ec_adjust_size (ec, size[source], 1);
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (sources[i]) {
+ if (replies[i].stat.ia_size != source_size) {
+ sources[i] = 0;
+ healed_sinks[i] = 1;
+ } else if (stbuf) {
+ *stbuf = replies[i].stat;
+ }
+ }
+
+ if (healed_sinks[i]) {
+ if (replies[i].stat.ia_size)
+ trim[i] = 1;
+ }
+ }
+
+ if (EC_COUNT(sources, ec->nodes) < ec->fragments) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+
+ ret = source;
+out:
+ if (xattrs)
+ dict_unref (xattrs);
+ cluster_replies_wipe (replies, ec->nodes);
+ return ret;
+}
+
+int
+__ec_heal_mark_sinks (call_frame_t *frame, ec_t *ec, fd_t *fd,
+ uint64_t *versions, unsigned char *healed_sinks)
+{
+ int i = 0;
+ int ret = 0;
+ unsigned char *mark = NULL;
+ dict_t *xattrs = NULL;
+ default_args_cbk_t *replies = NULL;
+ unsigned char *output = NULL;
+ uint64_t versions_xattr[2] = {0};
+
+ EC_REPLIES_ALLOC (replies, ec->nodes);
+ xattrs = dict_new ();
+ if (!xattrs) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ mark = alloca0 (ec->nodes);
+ for (i = 0; i < ec->nodes; i++) {
+ if (!healed_sinks[i])
+ continue;
+ if ((versions[i] >> EC_SELFHEAL_BIT) & 1)
+ continue;
+ mark[i] = 1;
+ }
+
+ if (EC_COUNT (mark, ec->nodes) == 0)
+ return 0;
+
+ versions_xattr[EC_DATA_TXN] = hton64(1ULL<<EC_SELFHEAL_BIT);
+ if (dict_set_static_bin (xattrs, EC_XATTR_VERSION, versions_xattr,
+ sizeof (versions_xattr))) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ output = alloca0 (ec->nodes);
+ ret = cluster_fxattrop (ec->xl_list, mark, ec->nodes,
+ replies, output, frame, ec->xl, fd,
+ GF_XATTROP_ADD_ARRAY64, xattrs, NULL);
+ for (i = 0; i < ec->nodes; i++) {
+ if (!output[i]) {
+ if (mark[i])
+ healed_sinks[i] = 0;
+ continue;
+ }
+ versions[i] |= (1ULL<<EC_SELFHEAL_BIT);
+ }
+
+ if (EC_COUNT (healed_sinks, ec->nodes) == 0) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+ ret = 0;
+
+out:
+ cluster_replies_wipe (replies, ec->nodes);
+ if (xattrs)
+ dict_unref (xattrs);
+ return ret;
+}
+
+int32_t
+ec_manager_heal_block (ec_fop_data_t *fop, int32_t state)
+{
+ ec_heal_t *heal = fop->data;
+ heal->fop = fop;
+
+ switch (state) {
+ case EC_STATE_INIT:
+ ec_owner_set(fop->frame, fop->frame->root);
+
+ ec_heal_inodelk(heal, F_WRLCK, 1, 0, 0);
+
+ return EC_STATE_HEAL_DATA_COPY;
+
+ case EC_STATE_HEAL_DATA_COPY:
+ ec_heal_data_block (heal);
+
+ return EC_STATE_HEAL_DATA_UNLOCK;
+
+ case -EC_STATE_HEAL_DATA_COPY:
+ case -EC_STATE_HEAL_DATA_UNLOCK:
+ case EC_STATE_HEAL_DATA_UNLOCK:
+ ec_heal_inodelk(heal, F_UNLCK, 1, 0, 0);
+
+ if (state < 0)
+ return -EC_STATE_REPORT;
+ else
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ if (fop->cbks.heal) {
+ fop->cbks.heal (fop->req_frame, fop, fop->xl, 0,
+ 0, (heal->good | heal->bad),
+ heal->good, heal->bad, NULL);
+ }
+
+ return EC_STATE_END;
+ case -EC_STATE_REPORT:
+ if (fop->cbks.heal) {
+ fop->cbks.heal (fop->req_frame, fop, fop->xl, -1,
+ EIO, 0, 0, 0, NULL);
+ }
+
+ return -EC_STATE_END;
+ default:
+ gf_log(fop->xl->name, GF_LOG_ERROR, "Unhandled state %d for %s",
+ state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
+ }
+}
+
+/*Takes lock */
+void
+ec_heal_block (call_frame_t *frame, xlator_t *this, uintptr_t target,
+ int32_t minimum, fop_heal_cbk_t func, ec_heal_t *heal)
+{
+ ec_cbk_t callback = { .heal = func };
+ ec_fop_data_t *fop = NULL;
+ int32_t error = EIO;
+
+ gf_log("ec", GF_LOG_TRACE, "EC(HEAL) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate (frame, this, EC_FOP_HEAL,
+ EC_FLAG_UPDATE_LOC_INODE, target, minimum,
+ ec_wind_heal, ec_manager_heal_block, callback,
+ heal);
+ if (fop == NULL)
+ goto out;
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, EIO, 0, 0, 0, NULL);
+ }
+}
+
+int32_t
+ec_heal_block_done (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, uintptr_t mask,
+ uintptr_t good, uintptr_t bad, dict_t *xdata)
+{
+ ec_fop_data_t *fop = cookie;
+ ec_heal_t *heal = fop->data;
+
+ fop->heal = NULL;
+ heal->fop = NULL;
+ syncbarrier_wake (heal->data);
+ return 0;
+}
+
+int
+ec_sync_heal_block (call_frame_t *frame, xlator_t *this, ec_heal_t *heal)
+{
+ ec_heal_block (frame, this, heal->bad|heal->good, EC_MINIMUM_ONE,
+ ec_heal_block_done, heal);
+ syncbarrier_wait (heal->data, 1);
+ if (heal->bad == 0)
+ return -ENOTCONN;
+ return 0;
+}
+
+int
+ec_rebuild_data (call_frame_t *frame, ec_t *ec, fd_t *fd, uint64_t size,
+ unsigned char *sources, unsigned char *healed_sinks)
+{
+ ec_heal_t *heal = NULL;
+ int ret = 0;
+ syncbarrier_t barrier;
+ struct iobuf_pool *pool = NULL;
+
+ if (syncbarrier_init (&barrier))
+ return -ENOMEM;
+
+ heal = alloca0(sizeof (*heal));
+ heal->fd = fd_ref (fd);
+ heal->xl = ec->xl;
+ heal->data = &barrier;
+ syncbarrier_init (heal->data);
+ pool = ec->xl->ctx->iobuf_pool;
+ heal->size = iobpool_default_pagesize (pool);
+ heal->bad = ec_char_array_to_mask (healed_sinks, ec->nodes);
+ heal->good = ec_char_array_to_mask (sources, ec->nodes);
+ heal->iatt.ia_type = IA_IFREG;
+ LOCK_INIT(&heal->lock);
+
+ for (heal->offset = 0; (heal->offset < size) && !heal->done;
+ heal->offset += heal->size) {
+ ret = ec_sync_heal_block (frame, ec->xl, heal);
+ if (ret < 0)
+ break;
+
+ }
+ fd_unref (heal->fd);
+ LOCK_DESTROY (&heal->lock);
+ syncbarrier_destroy (heal->data);
+ return ret;
+}
+
+int
+__ec_heal_trim_sinks (call_frame_t *frame, ec_t *ec, fd_t *fd,
+ unsigned char *healed_sinks, unsigned char *trim)
+{
+ default_args_cbk_t *replies = NULL;
+ unsigned char *output = NULL;
+ int ret = 0;
+ int i = 0;
+
+ EC_REPLIES_ALLOC (replies, ec->nodes);
+ output = alloca0 (ec->nodes);
+
+ if (EC_COUNT (trim, ec->nodes) == 0) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = cluster_ftruncate (ec->xl_list, trim, ec->nodes, replies, output,
+ frame, ec->xl, fd, 0, NULL);
+ for (i = 0; i < ec->nodes; i++) {
+ if (!output[i] && trim[i])
+ healed_sinks[i] = 0;
+ }
+
+ if (EC_COUNT (healed_sinks, ec->nodes) == 0) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+
+out:
+ cluster_replies_wipe (replies, ec->nodes);
+ return ret;
+}
+
+int
+ec_data_undo_pending (call_frame_t *frame, ec_t *ec, fd_t *fd, dict_t *xattr,
+ uint64_t *versions, uint64_t *dirty, uint64_t *size,
+ int source, gf_boolean_t erase_dirty, int idx)
+{
+ uint64_t versions_xattr[2] = {0};
+ uint64_t dirty_xattr[2] = {0};
+ uint64_t allzero[2] = {0};
+ uint64_t size_xattr = 0;
+ int ret = 0;
+
+ versions_xattr[EC_DATA_TXN] = hton64(versions[source] - versions[idx]);
+ ret = dict_set_static_bin (xattr, EC_XATTR_VERSION,
+ versions_xattr,
+ sizeof (versions_xattr));
+ if (ret < 0)
+ goto out;
+
+ size_xattr = hton64(size[source] - size[idx]);
+ ret = dict_set_static_bin (xattr, EC_XATTR_SIZE,
+ &size_xattr, sizeof (size_xattr));
+ if (ret < 0)
+ goto out;
+
+ if (erase_dirty) {
+ dirty_xattr[EC_DATA_TXN] = hton64(-dirty[idx]);
+ ret = dict_set_static_bin (xattr, EC_XATTR_DIRTY,
+ dirty_xattr,
+ sizeof (dirty_xattr));
+ if (ret < 0)
+ goto out;
+ }
+
+ if ((memcmp (versions_xattr, allzero, sizeof (allzero)) == 0) &&
+ (memcmp (dirty_xattr, allzero, sizeof (allzero)) == 0) &&
+ (size_xattr == 0)) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = syncop_fxattrop (ec->xl_list[idx], fd,
+ GF_XATTROP_ADD_ARRAY64, xattr, NULL, NULL);
+out:
+ return ret;
+}
+
+int
+__ec_fd_data_adjust_versions (call_frame_t *frame, ec_t *ec, fd_t *fd,
+ unsigned char *sources, unsigned char *healed_sinks,
+ uint64_t *versions, uint64_t *dirty, uint64_t *size)
+{
+ dict_t *xattr = NULL;
+ int i = 0;
+ int ret = 0;
+ int op_ret = 0;
+ int source = -1;
+ gf_boolean_t erase_dirty = _gf_false;
+
+ xattr = dict_new ();
+ if (!xattr) {
+ op_ret = -ENOMEM;
+ goto out;
+ }
+
+ /* dirty xattr represents if the file needs heal. Unless all the
+ * copies are healed, don't erase it */
+ if (EC_COUNT (sources, ec->nodes) +
+ EC_COUNT (healed_sinks, ec->nodes) == ec->nodes)
+ erase_dirty = _gf_true;
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (sources[i]) {
+ source = i;
+ break;
+ }
+ }
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (healed_sinks[i]) {
+ ret = ec_data_undo_pending (frame, ec, fd, xattr,
+ versions, dirty, size,
+ source, erase_dirty, i);
+ if (ret < 0)
+ goto out;
+ }
+
+ }
+
+ if (!erase_dirty)
+ goto out;
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (sources[i]) {
+ ret = ec_data_undo_pending (frame, ec, fd, xattr,
+ versions, dirty, size,
+ source, erase_dirty, i);
+ if (ret < 0)
+ continue;
+ }
+
+ }
+out:
+ if (xattr)
+ dict_unref (xattr);
+ return op_ret;
+}
+
+int
+ec_restore_time_and_adjust_versions (call_frame_t *frame, ec_t *ec, fd_t *fd,
+ unsigned char *sources,
+ unsigned char *healed_sinks,
+ uint64_t *versions, uint64_t *dirty,
+ uint64_t *size)
+{
+ unsigned char *locked_on = NULL;
+ unsigned char *participants = NULL;
+ unsigned char *output = NULL;
+ default_args_cbk_t *replies = NULL;
+ unsigned char *postsh_sources = NULL;
+ unsigned char *postsh_healed_sinks = NULL;
+ unsigned char *postsh_trim = NULL;
+ uint64_t *postsh_versions = NULL;
+ uint64_t *postsh_dirty = NULL;
+ uint64_t *postsh_size = NULL;
+ int ret = 0;
+ int i = 0;
+ struct iatt source_buf = {0};
+ loc_t loc = {0};
+
+ locked_on = alloca0(ec->nodes);
+ output = alloca0(ec->nodes);
+ participants = alloca0(ec->nodes);
+ postsh_sources = alloca0(ec->nodes);
+ postsh_healed_sinks = alloca0(ec->nodes);
+ postsh_trim = alloca0(ec->nodes);
+ postsh_versions = alloca0(ec->nodes * sizeof (*postsh_versions));
+ postsh_dirty = alloca0(ec->nodes * sizeof (*postsh_dirty));
+ postsh_size = alloca0(ec->nodes * sizeof (*postsh_size));
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (healed_sinks[i] || sources[i])
+ participants[i] = 1;
+ }
+
+ EC_REPLIES_ALLOC (replies, ec->nodes);
+ ret = cluster_inodelk (ec->xl_list, participants, ec->nodes, replies,
+ locked_on, frame, ec->xl, ec->xl->name,
+ fd->inode, 0, 0);
+ {
+ if (ret <= ec->fragments) {
+ gf_log (ec->xl->name, GF_LOG_DEBUG, "%s: Skipping heal "
+ "as only %d number of subvolumes could "
+ "be locked", uuid_utoa (fd->inode->gfid), ret);
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+
+ ret = __ec_heal_data_prepare (frame, ec, fd, locked_on,
+ postsh_versions, postsh_dirty,
+ postsh_size, postsh_sources,
+ postsh_healed_sinks, postsh_trim,
+ &source_buf);
+ if (ret < 0)
+ goto unlock;
+
+ loc.inode = inode_ref (fd->inode);
+ gf_uuid_copy (loc.gfid, fd->inode->gfid);
+ ret = cluster_setattr (ec->xl_list, healed_sinks, ec->nodes,
+ replies, output, frame, ec->xl, &loc,
+ &source_buf,
+ GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME,
+ NULL);
+ EC_INTERSECT (healed_sinks, healed_sinks, output, ec->nodes);
+ if (EC_COUNT (healed_sinks, ec->nodes) == 0) {
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+ ret = __ec_fd_data_adjust_versions (frame, ec, fd, sources,
+ healed_sinks, versions, dirty, size);
+ }
+unlock:
+ cluster_uninodelk (ec->xl_list, locked_on, ec->nodes, replies, output,
+ frame, ec->xl, ec->xl->name, fd->inode, 0, 0);
+ cluster_replies_wipe (replies, ec->nodes);
+ loc_wipe (&loc);
+ return ret;
+}
+
+int
+__ec_heal_data (call_frame_t *frame, ec_t *ec, fd_t *fd, unsigned char *heal_on)
+{
+ unsigned char *locked_on = NULL;
+ unsigned char *output = NULL;
+ uint64_t *versions = NULL;
+ uint64_t *dirty = NULL;
+ uint64_t *size = NULL;
+ unsigned char *sources = NULL;
+ unsigned char *healed_sinks = NULL;
+ unsigned char *trim = NULL;
+ default_args_cbk_t *replies = NULL;
+ int ret = 0;
+ int source = 0;
+
+ locked_on = alloca0(ec->nodes);
+ output = alloca0(ec->nodes);
+ sources = alloca0 (ec->nodes);
+ healed_sinks = alloca0 (ec->nodes);
+ trim = alloca0 (ec->nodes);
+ versions = alloca0 (ec->nodes * sizeof (*versions));
+ dirty = alloca0 (ec->nodes * sizeof (*dirty));
+ size = alloca0 (ec->nodes * sizeof (*size));
+
+ EC_REPLIES_ALLOC (replies, ec->nodes);
+ ret = cluster_inodelk (ec->xl_list, heal_on, ec->nodes, replies,
+ locked_on, frame, ec->xl, ec->xl->name,
+ fd->inode, 0, 0);
+ {
+ if (ret <= ec->fragments) {
+ gf_log (ec->xl->name, GF_LOG_DEBUG, "%s: Skipping heal "
+ "as only %d number of subvolumes could "
+ "be locked", uuid_utoa (fd->inode->gfid), ret);
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+
+ ret = __ec_heal_data_prepare (frame, ec, fd, locked_on,
+ versions, dirty, size, sources,
+ healed_sinks, trim, NULL);
+ if (ret < 0)
+ goto unlock;
+
+ source = ret;
+ ret = __ec_heal_mark_sinks (frame, ec, fd, versions,
+ healed_sinks);
+ if (ret < 0)
+ goto unlock;
+
+ ret = __ec_heal_trim_sinks (frame, ec, fd, healed_sinks, trim);
+ }
+unlock:
+ cluster_uninodelk (ec->xl_list, locked_on, ec->nodes, replies, output,
+ frame, ec->xl, ec->xl->name, fd->inode, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ ret = ec_rebuild_data (frame, ec, fd, size[source], sources,
+ healed_sinks);
+ if (ret < 0)
+ goto out;
+
+ ret = ec_restore_time_and_adjust_versions (frame, ec, fd, sources,
+ healed_sinks, versions,
+ dirty, size);
+out:
+ cluster_replies_wipe (replies, ec->nodes);
+ return ret;
+}
+
+int
+ec_heal_data2 (call_frame_t *req_frame, ec_t *ec, inode_t *inode)
+{
+ unsigned char *locked_on = NULL;
+ unsigned char *up_subvols = NULL;
+ unsigned char *output = NULL;
+ default_args_cbk_t *replies = NULL;
+ call_frame_t *frame = NULL;
+ fd_t *fd = NULL;
+ loc_t loc = {0};
+ char selfheal_domain[1024] = {0};
+ int ret = 0;
+
+ EC_REPLIES_ALLOC (replies, ec->nodes);
+
+ locked_on = alloca0(ec->nodes);
+ output = alloca0(ec->nodes);
+ up_subvols = alloca0(ec->nodes);
+ loc.inode = inode_ref (inode);
+ gf_uuid_copy (loc.gfid, inode->gfid);
+
+ fd = fd_create (inode, 0);
+ if (!fd) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ec_mask_to_char_array (ec->xl_up, up_subvols, ec->nodes);
+ frame = copy_frame (req_frame);
+ if (!frame) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ /*Do heal as root*/
+ frame->root->uid = 0;
+ frame->root->gid = 0;
+
+ ret = cluster_open (ec->xl_list, up_subvols, ec->nodes, replies, output,
+ frame, ec->xl, &loc, O_RDWR|O_LARGEFILE, fd, NULL);
+ if (ret <= ec->fragments) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+
+ fd_bind (fd);
+ sprintf (selfheal_domain, "%s:self-heal", ec->xl->name);
+ /*If other processes are already doing the heal, don't block*/
+ ret = cluster_tryinodelk (ec->xl_list, output, ec->nodes, replies,
+ locked_on, frame, ec->xl, selfheal_domain, inode,
+ 0, 0);
+ {
+ if (ret <= ec->fragments) {
+ gf_log (ec->xl->name, GF_LOG_DEBUG, "%s: Skipping heal "
+ "as only %d number of subvolumes could "
+ "be locked", uuid_utoa (inode->gfid), ret);
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+ ret = __ec_heal_data (frame, ec, fd, locked_on);
+ }
+unlock:
+ cluster_uninodelk (ec->xl_list, locked_on, ec->nodes, replies, output,
+ frame, ec->xl, selfheal_domain, inode, 0, 0);
+out:
+ if (fd)
+ fd_unref (fd);
+ loc_wipe (&loc);
+ cluster_replies_wipe (replies, ec->nodes);
+ if (frame)
+ STACK_DESTROY (frame->root);
+ return ret;
+}