summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--tests/basic/ec/ec-common5
-rw-r--r--tests/basic/ec/ec.t4
-rw-r--r--tests/basic/ec/self-heal.t4
-rw-r--r--xlators/cluster/ec/src/ec-combine.c6
-rw-r--r--xlators/cluster/ec/src/ec-common.c292
-rw-r--r--xlators/cluster/ec/src/ec-data.h1
-rw-r--r--xlators/cluster/ec/src/ec-dir-write.c30
-rw-r--r--xlators/cluster/ec/src/ec-inode-read.c2
8 files changed, 231 insertions, 113 deletions
diff --git a/tests/basic/ec/ec-common b/tests/basic/ec/ec-common
index 65422d72095..19818355f04 100644
--- a/tests/basic/ec/ec-common
+++ b/tests/basic/ec/ec-common
@@ -81,6 +81,11 @@ for dir in . dir1; do
EXPECT "$cs_big" echo $(sha1sum $dir/big | awk '{ print $1 }')
+# Give enough time for current operations to complete. Otherwise the
+# following kill_brick can cause data corruption and self-heal will be
+# needed, but this script is not prepared to handle self-healing.
+ sleep 2
+
for idx in `seq 0 $LAST_BRICK`; do
TEST kill_brick $V0 $H0 $B0/$V0$idx
diff --git a/tests/basic/ec/ec.t b/tests/basic/ec/ec.t
index 91bad52499a..77c6460f11a 100644
--- a/tests/basic/ec/ec.t
+++ b/tests/basic/ec/ec.t
@@ -190,6 +190,8 @@ TEST touch $M0/setxattr
TEST touch $M0/removexattr
TEST setfattr -n user.bar -v "ash_nazg_gimbatul" $M0/removexattr
+sleep 2
+
# Kill a couple of bricks and allow some time for things to settle.
TEST kill_brick $V0 $H0 $B0/${V0}3
TEST kill_brick $V0 $H0 $B0/${V0}8
@@ -216,6 +218,8 @@ TEST setfattr -x user.bar $M0/removexattr
# Test uid/gid behavior
TEST setup_perm_file $M0
+sleep 2
+
# Unmount/remount so that create/write and truncate don't see cached data.
TEST umount $M0
TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0
diff --git a/tests/basic/ec/self-heal.t b/tests/basic/ec/self-heal.t
index 3df19184169..2eb2663f8ea 100644
--- a/tests/basic/ec/self-heal.t
+++ b/tests/basic/ec/self-heal.t
@@ -178,6 +178,8 @@ for idx1 in {0..4}; do
done
done
+sleep 2
+
TEST kill_brick $V0 $H0 $B0/${V0}0
TEST kill_brick $V0 $H0 $B0/${V0}1
TEST cp $tmp/test test2
@@ -196,6 +198,8 @@ TEST [ -f test4 ]
EXPECT "2" stat -c "%h" test2
EXPECT "2" stat -c "%h" test4
+sleep 2
+
TEST $CLI volume start $V0 force
# Wait until the killed bricks have been started and recognized by the ec
# xlator
diff --git a/xlators/cluster/ec/src/ec-combine.c b/xlators/cluster/ec/src/ec-combine.c
index 0fa5ac068de..93593fb8b6a 100644
--- a/xlators/cluster/ec/src/ec-combine.c
+++ b/xlators/cluster/ec/src/ec-combine.c
@@ -326,8 +326,10 @@ int32_t ec_dict_data_concat(const char * fmt, ec_cbk_data_t * cbk,
len = prelen;
for (i = 0; i < num; i++)
{
- memcpy(str + len, sep, seplen);
- len += seplen;
+ if (i > 0) {
+ memcpy(str + len, sep, seplen);
+ len += seplen;
+ }
tmp = data[i]->len - 1;
memcpy(str + len, data[i]->data, tmp);
len += tmp;
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index 561871cee93..2ba17305411 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -631,9 +631,11 @@ int32_t ec_lock_compare(ec_lock_t * lock1, ec_lock_t * lock2)
return uuid_compare(lock1->loc.gfid, lock2->loc.gfid);
}
-void ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, int32_t update)
+ec_lock_link_t *ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock,
+ int32_t update)
{
ec_lock_t * tmp;
+ ec_lock_link_t *link = NULL;
int32_t tmp_update;
if ((fop->lock_count > 0) &&
@@ -654,13 +656,23 @@ void ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, int32_t update)
fop->lock_count++;
- lock->refs++;
+ if (lock->timer != NULL) {
+ link = lock->timer->data;
+ ec_trace("UNLOCK_CANCELLED", link->fop, "lock=%p", lock);
+ gf_timer_call_cancel(fop->xl->ctx, lock->timer);
+ lock->timer = NULL;
+ } else {
+ lock->refs++;
+ }
+
+ return link;
}
void ec_lock_prepare_entry(ec_fop_data_t *fop, loc_t *loc, int32_t update)
{
ec_lock_t * lock = NULL;
ec_inode_t * ctx = NULL;
+ ec_lock_link_t *link = NULL;
loc_t tmp;
int32_t error;
@@ -724,16 +736,21 @@ void ec_lock_prepare_entry(ec_fop_data_t *fop, loc_t *loc, int32_t update)
ctx->entry_lock = lock;
insert:
- ec_lock_insert(fop, lock, update);
+ link = ec_lock_insert(fop, lock, update);
unlock:
UNLOCK(&tmp.inode->lock);
loc_wipe(&tmp);
+
+ if (link != NULL) {
+ ec_resume(link->fop, 0);
+ }
}
void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, int32_t update)
{
+ ec_lock_link_t *link = NULL;
ec_lock_t * lock;
ec_inode_t * ctx;
@@ -778,10 +795,14 @@ void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, int32_t update)
ctx->inode_lock = lock;
insert:
- ec_lock_insert(fop, lock, update);
+ link = ec_lock_insert(fop, lock, update);
unlock:
UNLOCK(&loc->inode->lock);
+
+ if (link != NULL) {
+ ec_resume(link->fop, 0);
+ }
}
void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, int32_t update)
@@ -898,90 +919,6 @@ void ec_lock(ec_fop_data_t * fop)
}
}
-int32_t ec_unlocked(call_frame_t * frame, void * cookie, xlator_t * this,
- int32_t op_ret, int32_t op_errno, dict_t * xdata)
-{
- ec_fop_data_t * fop = cookie;
-
- if (op_ret < 0)
- {
- gf_log(this->name, GF_LOG_WARNING, "entry/inode unlocking failed (%s)",
- ec_fop_name(fop->parent->id));
- }
- else
- {
- ec_trace("UNLOCKED", fop->parent, "lock=%p", fop->data);
- }
-
- return 0;
-}
-
-void ec_unlock(ec_fop_data_t * fop)
-{
- ec_lock_t * lock;
- int32_t i, refs;
-
- for (i = 0; i < fop->lock_count; i++)
- {
- lock = fop->locks[i].lock;
-
- LOCK(&lock->loc.inode->lock);
-
- ec_trace("UNLOCK", fop, "lock=%p", lock);
-
- refs = --lock->refs;
- if (refs == 0)
- {
- *lock->plock = NULL;
- }
-
- UNLOCK(&lock->loc.inode->lock);
-
- if (refs == 0)
- {
- if (lock->mask != 0)
- {
- ec_owner_set(fop->frame, lock);
-
- switch (lock->kind)
- {
- case EC_LOCK_ENTRY:
- ec_trace("UNLOCK_ENTRYLK", fop, "lock=%p, inode=%p, "
- "path=%s",
- lock, lock->loc.inode, lock->loc.path);
-
- ec_entrylk(fop->frame, fop->xl, lock->mask,
- EC_MINIMUM_ALL, ec_unlocked, lock,
- fop->xl->name, &lock->loc, NULL,
- ENTRYLK_UNLOCK, lock->type, NULL);
-
- break;
-
- case EC_LOCK_INODE:
- lock->flock.l_type = F_UNLCK;
- ec_trace("UNLOCK_INODELK", fop, "lock=%p, inode=%p",
- lock, lock->loc.inode);
-
- ec_inodelk(fop->frame, fop->xl, lock->mask,
- EC_MINIMUM_ALL, ec_unlocked, lock,
- fop->xl->name, &lock->loc, F_SETLK,
- &lock->flock, NULL);
-
- break;
-
- default:
- gf_log(fop->xl->name, GF_LOG_ERROR, "Invalid lock "
- "type");
- }
- }
-
- ec_trace("LOCK_DESTROY", fop, "lock=%p", lock);
-
- ec_lock_destroy(lock);
- }
- }
-}
-
int32_t ec_get_size_version_set(call_frame_t * frame, void * cookie,
xlator_t * this, int32_t op_ret,
int32_t op_errno, inode_t * inode,
@@ -991,7 +928,7 @@ int32_t ec_get_size_version_set(call_frame_t * frame, void * cookie,
ec_t * ec;
ec_fop_data_t * fop = cookie;
ec_inode_t * ctx;
- ec_lock_t * lock;
+ ec_lock_t *lock = NULL;
if (op_ret >= 0)
{
@@ -1192,6 +1129,58 @@ out:
ec_fop_set_error(fop, error);
}
+int32_t ec_unlocked(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ ec_fop_data_t *fop = cookie;
+
+ if (op_ret < 0) {
+ gf_log(this->name, GF_LOG_WARNING, "entry/inode unlocking failed (%s)",
+ ec_fop_name(fop->parent->id));
+ } else {
+ ec_trace("UNLOCKED", fop->parent, "lock=%p", fop->data);
+ }
+
+ return 0;
+}
+
+void ec_unlock_lock(ec_fop_data_t *fop, ec_lock_t *lock)
+{
+ if (lock->mask != 0) {
+ ec_owner_set(fop->frame, lock);
+
+ switch (lock->kind) {
+ case EC_LOCK_ENTRY:
+ ec_trace("UNLOCK_ENTRYLK", fop, "lock=%p, inode=%p, path=%s", lock,
+ lock->loc.inode, lock->loc.path);
+
+ ec_entrylk(fop->frame, fop->xl, lock->mask, EC_MINIMUM_ALL,
+ ec_unlocked, lock, fop->xl->name, &lock->loc, NULL,
+ ENTRYLK_UNLOCK, lock->type, NULL);
+
+ break;
+
+ case EC_LOCK_INODE:
+ lock->flock.l_type = F_UNLCK;
+ ec_trace("UNLOCK_INODELK", fop, "lock=%p, inode=%p", lock,
+ lock->loc.inode);
+
+ ec_inodelk(fop->frame, fop->xl, lock->mask, EC_MINIMUM_ALL,
+ ec_unlocked, lock, fop->xl->name, &lock->loc, F_SETLK,
+ &lock->flock, NULL);
+
+ break;
+
+ default:
+ gf_log(fop->xl->name, GF_LOG_ERROR, "Invalid lock type");
+ }
+ }
+
+ ec_trace("LOCK_DESTROY", fop, "lock=%p", lock);
+
+ ec_lock_destroy(lock);
+}
+
int32_t ec_update_size_version_done(call_frame_t * frame, void * cookie,
xlator_t * this, int32_t op_ret,
int32_t op_errno, dict_t * xattr,
@@ -1209,11 +1198,15 @@ int32_t ec_update_size_version_done(call_frame_t * frame, void * cookie,
fop->parent->mask &= fop->good;
}
+ if (fop->data != NULL) {
+ ec_unlock_lock(fop->parent, fop->data);
+ }
+
return 0;
}
void ec_update_size_version(ec_fop_data_t *fop, loc_t *loc, uint64_t version,
- uint64_t size)
+ uint64_t size, ec_lock_t *lock)
{
dict_t * dict;
uid_t uid;
@@ -1253,7 +1246,7 @@ void ec_update_size_version(ec_fop_data_t *fop, loc_t *loc, uint64_t version,
fop->frame->root->gid = 0;
ec_xattrop(fop->frame, fop->xl, fop->mask, EC_MINIMUM_MIN,
- ec_update_size_version_done, NULL, loc,
+ ec_update_size_version_done, lock, loc,
GF_XATTROP_ADD_ARRAY64, dict, NULL);
fop->frame->root->uid = uid;
@@ -1274,6 +1267,103 @@ out:
gf_log(fop->xl->name, GF_LOG_ERROR, "Unable to update version and size");
}
+void ec_unlock_now(ec_fop_data_t *fop, ec_lock_t *lock)
+{
+ ec_trace("UNLOCK_NOW", fop, "lock=%p", lock);
+
+ if (lock->version_delta != 0) {
+ ec_update_size_version(fop, &lock->loc, lock->version_delta,
+ lock->size_delta, lock);
+ } else {
+ ec_unlock_lock(fop, lock);
+ }
+
+ ec_resume(fop, 0);
+}
+
+void ec_unlock_timer_cbk(void *data)
+{
+ ec_lock_link_t *link = data;
+ ec_lock_t *lock = link->lock;
+ ec_fop_data_t *fop = NULL;
+
+ LOCK(&lock->loc.inode->lock);
+
+ if (lock->timer != NULL) {
+ fop = link->fop;
+
+ ec_trace("UNLOCK_DELAYED", fop, "lock=%p", lock);
+
+ GF_ASSERT(lock->refs == 1);
+
+ gf_timer_call_cancel(fop->xl->ctx, lock->timer);
+ lock->timer = NULL;
+ *lock->plock = NULL;
+ }
+
+ UNLOCK(&lock->loc.inode->lock);
+
+ if (fop != NULL) {
+ ec_unlock_now(fop, lock);
+ }
+}
+
+void ec_unlock_timer_add(ec_lock_link_t *link)
+{
+ struct timespec delay;
+ ec_fop_data_t *fop = link->fop;
+ ec_lock_t *lock = link->lock;
+ int32_t refs = 1;
+
+ LOCK(&lock->loc.inode->lock);
+
+ GF_ASSERT(lock->timer == NULL);
+
+ if (lock->refs != 1) {
+ ec_trace("UNLOCK_SKIP", fop, "lock=%p", lock);
+
+ lock->refs--;
+
+ UNLOCK(&lock->loc.inode->lock);
+ } else {
+ ec_trace("UNLOCK_DELAY", fop, "lock=%p", lock);
+
+ delay.tv_sec = 1;
+ delay.tv_nsec = 0;
+
+ LOCK(&fop->lock);
+
+ fop->jobs++;
+ fop->refs++;
+
+ UNLOCK(&fop->lock);
+
+ lock->timer = gf_timer_call_after(fop->xl->ctx, delay,
+ ec_unlock_timer_cbk, link);
+ if (lock->timer == NULL) {
+ gf_log(fop->xl->name, GF_LOG_WARNING, "Unable to delay an unlock");
+
+ *lock->plock = NULL;
+ refs = 0;
+ }
+
+ UNLOCK(&lock->loc.inode->lock);
+
+ if (refs == 0) {
+ ec_unlock_now(fop, lock);
+ }
+ }
+}
+
+void ec_unlock(ec_fop_data_t *fop)
+{
+ int32_t i;
+
+ for (i = 0; i < fop->lock_count; i++) {
+ ec_unlock_timer_add(&fop->locks[i]);
+ }
+}
+
void ec_flush_size_version(ec_fop_data_t * fop)
{
ec_lock_t * lock;
@@ -1296,7 +1386,7 @@ void ec_flush_size_version(ec_fop_data_t * fop)
if (version > 0)
{
- ec_update_size_version(fop, &lock->loc, version, delta);
+ ec_update_size_version(fop, &lock->loc, version, delta, NULL);
}
}
@@ -1305,16 +1395,10 @@ void ec_lock_reuse(ec_fop_data_t *fop)
ec_fop_data_t * wait_fop;
ec_lock_t * lock;
ec_lock_link_t * link;
- uint64_t version = 0, delta = 0;
- int32_t refs = 0;
int32_t i;
for (i = 0; i < fop->lock_count; i++)
{
- refs = 0;
- delta = 0;
- version = 0;
-
wait_fop = NULL;
lock = fop->locks[i].lock;
@@ -1338,14 +1422,6 @@ void ec_lock_reuse(ec_fop_data_t *fop)
}
}
- version = lock->version_delta;
- delta = lock->size_delta;
- refs = lock->refs;
- if (refs == 1) {
- lock->version_delta = 0;
- lock->size_delta = 0;
- }
-
lock->good_mask &= fop->mask;
if (!list_empty(&lock->waiting))
@@ -1371,10 +1447,6 @@ void ec_lock_reuse(ec_fop_data_t *fop)
ec_resume(wait_fop, 0);
}
-
- if ((refs == 1) && (version > 0)) {
- ec_update_size_version(fop, &lock->loc, version, delta);
- }
}
}
diff --git a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h
index ac197fe7f0b..97ec58469ad 100644
--- a/xlators/cluster/ec/src/ec-data.h
+++ b/xlators/cluster/ec/src/ec-data.h
@@ -142,6 +142,7 @@ union _ec_cbk
struct _ec_lock
{
ec_lock_t **plock;
+ gf_timer_t *timer;
struct list_head waiting;
uintptr_t mask;
uintptr_t good_mask;
diff --git a/xlators/cluster/ec/src/ec-dir-write.c b/xlators/cluster/ec/src/ec-dir-write.c
index 02961acb8bd..0bd10d4e27d 100644
--- a/xlators/cluster/ec/src/ec-dir-write.c
+++ b/xlators/cluster/ec/src/ec-dir-write.c
@@ -1016,11 +1016,41 @@ void ec_wind_mknod(ec_t * ec, ec_fop_data_t * fop, int32_t idx)
int32_t ec_manager_mknod(ec_fop_data_t * fop, int32_t state)
{
+ ec_t *ec;
ec_cbk_data_t * cbk;
switch (state)
{
case EC_STATE_INIT:
+ if (S_ISREG(fop->mode[0])) {
+ if (fop->xdata == NULL) {
+ fop->xdata = dict_new();
+ if (fop->xdata == NULL) {
+ fop->error = EIO;
+
+ return EC_STATE_REPORT;
+ }
+ }
+
+ ec = fop->xl->private;
+
+ fop->config.version = EC_CONFIG_VERSION;
+ fop->config.algorithm = EC_CONFIG_ALGORITHM;
+ fop->config.gf_word_size = EC_GF_BITS;
+ fop->config.bricks = ec->nodes;
+ fop->config.redundancy = ec->redundancy;
+ fop->config.chunk_size = EC_METHOD_CHUNK_SIZE;
+
+ if (ec_dict_set_config(fop->xdata, EC_XATTR_CONFIG,
+ &fop->config) < 0) {
+ fop->error = EIO;
+
+ return EC_STATE_REPORT;
+ }
+ }
+
+ /* Fall through */
+
case EC_STATE_LOCK:
ec_lock_prepare_entry(fop, &fop->loc[0], 1);
ec_lock(fop);
diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c
index 78a189bc325..c31f0d97674 100644
--- a/xlators/cluster/ec/src/ec-inode-read.c
+++ b/xlators/cluster/ec/src/ec-inode-read.c
@@ -81,7 +81,7 @@ int32_t ec_manager_access(ec_fop_data_t * fop, int32_t state)
case EC_STATE_DISPATCH:
ec_dispatch_one(fop);
- return EC_STATE_PREPARE_ANSWER;
+ return EC_STATE_REPORT;
case -EC_STATE_REPORT:
if (fop->cbks.access != NULL)