summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rwxr-xr-xrun-tests.sh6
-rw-r--r--tests/bugs/disperse/bug-1188145.t50
-rw-r--r--xlators/cluster/ec/src/ec-combine.c35
-rw-r--r--xlators/cluster/ec/src/ec-common.c1509
-rw-r--r--xlators/cluster/ec/src/ec-common.h36
-rw-r--r--xlators/cluster/ec/src/ec-data.c82
-rw-r--r--xlators/cluster/ec/src/ec-data.h84
-rw-r--r--xlators/cluster/ec/src/ec-dir-read.c8
-rw-r--r--xlators/cluster/ec/src/ec-dir-write.c96
-rw-r--r--xlators/cluster/ec/src/ec-fops.h3
-rw-r--r--xlators/cluster/ec/src/ec-generic.c86
-rw-r--r--xlators/cluster/ec/src/ec-heal.c103
-rw-r--r--xlators/cluster/ec/src/ec-helpers.c38
-rw-r--r--xlators/cluster/ec/src/ec-helpers.h7
-rw-r--r--xlators/cluster/ec/src/ec-inode-read.c51
-rw-r--r--xlators/cluster/ec/src/ec-inode-write.c159
-rw-r--r--xlators/cluster/ec/src/ec-locks.c13
-rw-r--r--xlators/cluster/ec/src/ec.c97
-rw-r--r--xlators/cluster/ec/src/ec.h5
19 files changed, 1397 insertions, 1071 deletions
diff --git a/run-tests.sh b/run-tests.sh
index 0bfa5c6230c..ef6568c30b0 100755
--- a/run-tests.sh
+++ b/run-tests.sh
@@ -201,13 +201,7 @@ function is_bad_test ()
./tests/basic/ec/quota.t \
./tests/bugs/distribute/bug-1161156.t \
./tests/basic/tier/tier.t \
- ./tests/basic/ec/ec-4-1.t \
- ./tests/basic/ec/ec.t \
./tests/basic/quota-nfs.t \
- ./tests/basic/ec/ec-6-2.t \
- ./tests/basic/ec/ec-3-1.t \
- ./tests/basic/ec/ec-5-1.t \
- ./tests/basic/ec/ec-12-4.t \
./tests/bugs/quota/bug-1035576.t \
./tests/bugs/glusterfs/bug-867253.t \
./tests/bugs/glusterd/bug-974007.t \
diff --git a/tests/bugs/disperse/bug-1188145.t b/tests/bugs/disperse/bug-1188145.t
new file mode 100644
index 00000000000..aa3a59bc62f
--- /dev/null
+++ b/tests/bugs/disperse/bug-1188145.t
@@ -0,0 +1,50 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+function create_dirs()
+{
+ local stop=$1
+ local idx
+ local res
+
+ res=0
+ idx=1
+ while [[ -f ${stop} ]]; do
+ mkdir $M0/${idx}
+ if [[ $? -ne 0 ]]; then
+ res=1
+ break;
+ fi
+ idx=$(( idx + 1 ))
+ done
+
+ return ${res}
+}
+
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 redundancy 2 $H0:$B0/${V0}{0..5}
+EXPECT "Created" volinfo_field $V0 'Status'
+TEST $CLI volume start $V0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Started" volinfo_field $V0 'Status'
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count $V0 0
+
+name=`mktemp -t ${0##*/}.XXXXXX`
+create_dirs ${name} &
+pid=$!
+
+sleep 2
+TEST $CLI volume set $V0 uss on
+sleep 5
+TEST $CLI volume set $V0 uss off
+sleep 5
+
+TEST rm -f ${name}
+TEST wait ${pid}
+
+cleanup
diff --git a/xlators/cluster/ec/src/ec-combine.c b/xlators/cluster/ec/src/ec-combine.c
index 9d4a18999f1..4617a0430f1 100644
--- a/xlators/cluster/ec/src/ec-combine.c
+++ b/xlators/cluster/ec/src/ec-combine.c
@@ -171,8 +171,10 @@ void ec_iatt_rebuild(ec_t * ec, struct iatt * iatt, int32_t count,
gf_boolean_t
ec_xattr_match (dict_t *dict, char *key, data_t *value, void *arg)
{
- if (fnmatch(GF_XATTR_STIME_PATTERN, key, 0) == 0)
+ if ((fnmatch(GF_XATTR_STIME_PATTERN, key, 0) == 0) ||
+ (strcmp(key, GLUSTERFS_OPEN_FD_COUNT) == 0)) {
return _gf_false;
+ }
return _gf_true;
}
@@ -185,6 +187,8 @@ ec_value_ignore (char *key)
(strcmp(key, GF_XATTR_USER_PATHINFO_KEY) == 0) ||
(strcmp(key, GF_XATTR_LOCKINFO_KEY) == 0) ||
(strcmp(key, GLUSTERFS_OPEN_FD_COUNT) == 0) ||
+ (strcmp(key, GLUSTERFS_INODELK_COUNT) == 0) ||
+ (strcmp(key, GLUSTERFS_ENTRYLK_COUNT) == 0) ||
(strncmp(key, GF_XATTR_CLRLK_CMD,
strlen (GF_XATTR_CLRLK_CMD)) == 0) ||
(strncmp(key, EC_QUOTA_PREFIX, strlen(EC_QUOTA_PREFIX)) == 0) ||
@@ -225,15 +229,9 @@ int32_t ec_dict_list(data_t ** list, int32_t * count, ec_cbk_data_t * cbk,
dict = (which == EC_COMBINE_XDATA) ? ans->xdata : ans->dict;
list[i] = dict_get(dict, key);
- if (list[i] == NULL)
- {
- gf_log(cbk->fop->xl->name, GF_LOG_ERROR, "Unexpected missing "
- "dictionary entry");
-
- return 0;
+ if (list[i] != NULL) {
+ i++;
}
-
- i++;
}
*count = i;
@@ -471,11 +469,6 @@ int32_t ec_dict_data_max32(ec_cbk_data_t *cbk, int32_t which, char *key)
return -1;
}
- if (num <= 1)
- {
- return 0;
- }
-
max = data_to_uint32(data[0]);
for (i = 1; i < num; i++)
{
@@ -507,10 +500,6 @@ int32_t ec_dict_data_max64(ec_cbk_data_t *cbk, int32_t which, char *key)
return -1;
}
- if (num <= 1) {
- return 0;
- }
-
max = data_to_uint64(data[0]);
for (i = 1; i < num; i++) {
tmp = data_to_uint64(data[i]);
@@ -630,6 +619,10 @@ int32_t ec_dict_data_combine(dict_t * dict, char * key, data_t * value,
{
return ec_dict_data_max32(data->cbk, data->which, key);
}
+ if ((strcmp(key, GLUSTERFS_INODELK_COUNT) == 0) ||
+ (strcmp(key, GLUSTERFS_ENTRYLK_COUNT) == 0)) {
+ return ec_dict_data_max32(data->cbk, data->which, key);
+ }
if (strcmp(key, QUOTA_SIZE_KEY) == 0) {
return ec_dict_data_quota(data->cbk, data->which, key);
@@ -831,6 +824,8 @@ void ec_combine (ec_cbk_data_t *newcbk, ec_combine_f combine)
LOCK(&fop->lock);
+ fop->received |= newcbk->mask;
+
item = fop->cbk_list.prev;
list_for_each_entry(cbk, &fop->cbk_list, list)
{
@@ -868,7 +863,9 @@ void ec_combine (ec_cbk_data_t *newcbk, ec_combine_f combine)
}
cbk = list_entry(fop->cbk_list.next, ec_cbk_data_t, list);
- needed = fop->minimum - cbk->count - fop->winds + 1;
+ if ((fop->mask ^ fop->remaining) == fop->received) {
+ needed = fop->minimum - cbk->count;
+ }
UNLOCK(&fop->lock);
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index 393d9142797..ba81fc7313f 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -368,24 +368,34 @@ int32_t ec_child_select(ec_fop_data_t * fop)
uintptr_t mask = 0;
int32_t first = 0, num = 0;
+ ec_fop_cleanup(fop);
+
fop->mask &= ec->node_mask;
mask = ec->xl_up;
if (fop->parent == NULL)
{
- if (fop->loc[0].inode != NULL) {
+ if ((fop->flags & EC_FLAG_UPDATE_LOC_PARENT) && fop->loc[0].parent)
+ mask &= ec_inode_good(fop->loc[0].parent, fop->xl);
+
+ if ((fop->flags & EC_FLAG_UPDATE_LOC_INODE) && fop->loc[0].inode) {
mask &= ec_inode_good(fop->loc[0].inode, fop->xl);
}
- if (fop->loc[1].inode != NULL) {
+
+ if ((fop->flags & EC_FLAG_UPDATE_LOC_INODE) && fop->loc[1].inode) {
mask &= ec_inode_good(fop->loc[1].inode, fop->xl);
}
- if (fop->fd != NULL) {
- if (fop->fd->inode != NULL) {
+
+ if (fop->fd) {
+ if ((fop->flags & EC_FLAG_UPDATE_FD_INODE) && fop->fd->inode) {
mask &= ec_inode_good(fop->fd->inode, fop->xl);
}
- mask &= ec_fd_good(fop->fd, fop->xl);
+ if (fop->flags & fop->flags & EC_FLAG_UPDATE_FD) {
+ mask &= ec_fd_good(fop->fd, fop->xl);
+ }
}
}
+
if ((fop->mask & ~mask) != 0)
{
gf_log(fop->xl->name, GF_LOG_WARNING, "Executing operation with "
@@ -420,6 +430,7 @@ int32_t ec_child_select(ec_fop_data_t * fop)
/*Unconditionally wind on healing subvolumes*/
fop->mask |= fop->healing;
fop->remaining = fop->mask;
+ fop->received = 0;
ec_trace("SELECT", fop, "");
@@ -585,7 +596,7 @@ void ec_dispatch_min(ec_fop_data_t * fop)
}
}
-ec_lock_t * ec_lock_allocate(xlator_t * xl, int32_t kind, loc_t * loc)
+ec_lock_t *ec_lock_allocate(xlator_t *xl, loc_t *loc)
{
ec_t * ec = xl->private;
ec_lock_t * lock;
@@ -602,9 +613,9 @@ ec_lock_t * ec_lock_allocate(xlator_t * xl, int32_t kind, loc_t * loc)
lock = mem_get0(ec->lock_pool);
if (lock != NULL)
{
- lock->kind = kind;
lock->good_mask = -1ULL;
INIT_LIST_HEAD(&lock->waiting);
+ INIT_LIST_HEAD(&lock->frozen);
if (ec_loc_from_loc(xl, &lock->loc, loc) != 0)
{
mem_put(lock);
@@ -618,6 +629,9 @@ ec_lock_t * ec_lock_allocate(xlator_t * xl, int32_t kind, loc_t * loc)
void ec_lock_destroy(ec_lock_t * lock)
{
loc_wipe(&lock->loc);
+ if (lock->fd != NULL) {
+ fd_unref(lock->fd);
+ }
mem_put(lock);
}
@@ -627,166 +641,96 @@ int32_t ec_lock_compare(ec_lock_t * lock1, ec_lock_t * lock2)
return gf_uuid_compare(lock1->loc.gfid, lock2->loc.gfid);
}
-ec_lock_link_t *ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock,
- int32_t update)
+void ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, uint32_t flags,
+ loc_t *base)
{
- ec_lock_t *new_lock, *tmp;
- ec_lock_link_t *link = NULL;
- int32_t tmp_update;
+ ec_lock_link_t *link;
- new_lock = lock;
+ /* This check is only prepared for up to 2 locks per fop. If more locks
+ * are needed this must be changed. */
if ((fop->lock_count > 0) &&
- (ec_lock_compare(fop->locks[0].lock, new_lock) > 0))
- {
- tmp = fop->locks[0].lock;
- fop->locks[0].lock = new_lock;
- new_lock = tmp;
-
- tmp_update = fop->locks_update;
- fop->locks_update = update;
- update = tmp_update;
- }
- fop->locks[fop->lock_count].lock = new_lock;
- fop->locks[fop->lock_count].fop = fop;
-
- fop->locks_update |= update << fop->lock_count;
-
- fop->lock_count++;
-
- if (lock->timer != NULL) {
- link = lock->timer->data;
- ec_trace("UNLOCK_CANCELLED", link->fop, "lock=%p", lock);
- gf_timer_call_cancel(fop->xl->ctx, lock->timer);
- lock->timer = NULL;
+ (ec_lock_compare(fop->locks[0].lock, lock) < 0)) {
+ fop->first_lock = fop->lock_count;
} else {
- lock->refs++;
- }
-
- return link;
-}
-
-void ec_lock_prepare_entry(ec_fop_data_t *fop, loc_t *loc, int32_t update)
-{
- ec_lock_t * lock = NULL;
- ec_inode_t * ctx = NULL;
- ec_lock_link_t *link = NULL;
- loc_t tmp;
-
- if ((fop->parent != NULL) || (fop->error != 0))
- {
- return;
- }
-
- /* update is only 0 for 'opendir', which needs to lock the entry pointed
- * by loc instead of its parent.
- */
- if (update)
- {
- if (ec_loc_parent(fop->xl, loc, &tmp) != 0) {
- ec_fop_set_error(fop, EIO);
-
- return;
- }
-
- /* If there's another lock, make sure that it's not the same. Otherwise
- * do not insert it.
- *
- * This can only happen on renames where source and target names are
- * in the same directory. */
- if ((fop->lock_count > 0) &&
- (fop->locks[0].lock->loc.inode == tmp.inode)) {
- goto wipe;
+ /* When the first lock is added to the current fop, request lock
+ * counts from locks xlator to be able to determine if there is
+ * contention and release the lock sooner. */
+ if (fop->xdata == NULL) {
+ fop->xdata = dict_new();
+ if (fop->xdata == NULL) {
+ ec_fop_set_error(fop, ENOMEM);
+ return;
+ }
}
- } else {
- if (ec_loc_from_loc(fop->xl, &tmp, loc) != 0) {
- ec_fop_set_error(fop, EIO);
-
+ if (dict_set_str(fop->xdata, GLUSTERFS_INODELK_DOM_COUNT,
+ fop->xl->name) != 0) {
+ ec_fop_set_error(fop, ENOMEM);
return;
}
}
- LOCK(&tmp.inode->lock);
-
- ctx = __ec_inode_get(tmp.inode, fop->xl);
- if (ctx == NULL)
- {
- __ec_fop_set_error(fop, EIO);
-
- goto unlock;
- }
-
- if (ctx->entry_lock != NULL)
- {
- lock = ctx->entry_lock;
- ec_trace("LOCK_ENTRYLK", fop, "lock=%p, inode=%p, path=%s"
- "Lock already acquired",
- lock, tmp.inode, tmp.path);
+ link = &fop->locks[fop->lock_count++];
- goto insert;
- }
+ link->lock = lock;
+ link->fop = fop;
+ link->update[EC_DATA_TXN] = (flags & EC_UPDATE_DATA) != 0;
+ link->update[EC_METADATA_TXN] = (flags & EC_UPDATE_META) != 0;
+ link->base = base;
- lock = ec_lock_allocate(fop->xl, EC_LOCK_ENTRY, &tmp);
- if (lock == NULL)
- {
- __ec_fop_set_error(fop, EIO);
-
- goto unlock;
- }
-
- ec_trace("LOCK_CREATE", fop, "lock=%p", lock);
-
- lock->type = ENTRYLK_WRLCK;
-
- lock->plock = &ctx->entry_lock;
- ctx->entry_lock = lock;
-
-insert:
- link = ec_lock_insert(fop, lock, update);
-
-unlock:
- UNLOCK(&tmp.inode->lock);
-
-wipe:
- loc_wipe(&tmp);
-
- if (link != NULL) {
- ec_resume(link->fop, 0);
- }
+ lock->refs++;
+ lock->inserted++;
}
-void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, int32_t update)
+void ec_lock_prepare_inode_internal(ec_fop_data_t *fop, loc_t *loc,
+ uint32_t flags, loc_t *base)
{
- ec_lock_link_t *link = NULL;
- ec_lock_t * lock;
- ec_inode_t * ctx;
+ ec_lock_t *lock = NULL;
+ ec_inode_t *ctx;
- if ((fop->parent != NULL) || (fop->error != 0) || (loc->inode == NULL))
- {
+ if ((fop->parent != NULL) || (fop->error != 0) || (loc->inode == NULL)) {
return;
}
LOCK(&loc->inode->lock);
ctx = __ec_inode_get(loc->inode, fop->xl);
- if (ctx == NULL)
- {
+ if (ctx == NULL) {
__ec_fop_set_error(fop, EIO);
goto unlock;
}
- if (ctx->inode_lock != NULL)
- {
+ if (ctx->inode_lock != NULL) {
lock = ctx->inode_lock;
+
+ /* If there's another lock, make sure that it's not the same. Otherwise
+ * do not insert it.
+ *
+ * This can only happen on renames where source and target names are
+ * in the same directory. */
+ if ((fop->lock_count > 0) && (fop->locks[0].lock == lock)) {
+ /* Combine data/meta updates */
+ fop->locks[0].update[EC_DATA_TXN] |= (flags & EC_UPDATE_DATA) != 0;
+ fop->locks[0].update[EC_METADATA_TXN] |=
+ (flags & EC_UPDATE_META) != 0;
+
+ /* Only one base inode is allowed per fop, so there shouldn't be
+ * overwrites here. */
+ if (base != NULL) {
+ fop->locks[0].base = base;
+ }
+
+ goto update_query;
+ }
+
ec_trace("LOCK_INODELK", fop, "lock=%p, inode=%p. Lock already "
"acquired", lock, loc->inode);
goto insert;
}
- lock = ec_lock_allocate(fop->xl, EC_LOCK_INODE, loc);
- if (lock == NULL)
- {
+ lock = ec_lock_allocate(fop->xl, loc);
+ if (lock == NULL) {
__ec_fop_set_error(fop, EIO);
goto unlock;
@@ -797,154 +741,78 @@ void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, int32_t update)
lock->flock.l_type = F_WRLCK;
lock->flock.l_whence = SEEK_SET;
- lock->plock = &ctx->inode_lock;
+ lock->ctx = ctx;
ctx->inode_lock = lock;
insert:
- link = ec_lock_insert(fop, lock, update);
-
+ ec_lock_insert(fop, lock, flags, base);
+update_query:
+ lock->query |= (flags & EC_QUERY_INFO) != 0;
unlock:
UNLOCK(&loc->inode->lock);
+}
- if (link != NULL) {
- ec_resume(link->fop, 0);
- }
+void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, uint32_t flags)
+{
+ ec_lock_prepare_inode_internal(fop, loc, flags, NULL);
}
-void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, int32_t update)
+void ec_lock_prepare_parent_inode(ec_fop_data_t *fop, loc_t *loc,
+ uint32_t flags)
{
- loc_t loc;
+ loc_t tmp, *base = NULL;
- if ((fop->parent != NULL) || (fop->error != 0))
- {
+ if (fop->error != 0) {
return;
}
- if (ec_loc_from_fd(fop->xl, &loc, fd) == 0)
- {
- ec_lock_prepare_inode(fop, &loc, update);
-
- loc_wipe(&loc);
- }
- else
- {
+ if (ec_loc_parent(fop->xl, loc, &tmp) != 0) {
ec_fop_set_error(fop, EIO);
- }
-}
-
-int32_t ec_locked(call_frame_t * frame, void * cookie, xlator_t * this,
- int32_t op_ret, int32_t op_errno, dict_t * xdata)
-{
- ec_fop_data_t * fop = cookie;
- ec_lock_t * lock = NULL;
-
- if (op_ret >= 0)
- {
- lock = fop->data;
- lock->mask = fop->good;
- lock->acquired = 1;
- fop->parent->mask &= fop->good;
- fop->parent->locked++;
-
- ec_trace("LOCKED", fop->parent, "lock=%p", lock);
-
- ec_lock(fop->parent);
+ return;
}
- else
- {
- gf_log(this->name, GF_LOG_WARNING, "Failed to complete preop lock");
+
+ if ((flags & EC_INODE_SIZE) != 0) {
+ base = loc;
+ flags ^= EC_INODE_SIZE;
}
- return 0;
+ ec_lock_prepare_inode_internal(fop, &tmp, flags, base);
+
+ loc_wipe(&tmp);
}
-void ec_lock(ec_fop_data_t * fop)
+void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags)
{
- ec_lock_t * lock;
-
- while (fop->locked < fop->lock_count)
- {
- lock = fop->locks[fop->locked].lock;
-
- LOCK(&lock->loc.inode->lock);
-
- if (lock->owner != NULL)
- {
- ec_trace("LOCK_WAIT", fop, "lock=%p", lock);
-
- list_add_tail(&fop->locks[fop->locked].wait_list, &lock->waiting);
-
- ec_sleep(fop);
-
- UNLOCK(&lock->loc.inode->lock);
-
- break;
- }
- lock->owner = fop;
-
- UNLOCK(&lock->loc.inode->lock);
-
- if (!lock->acquired)
- {
- ec_owner_set(fop->frame, lock);
-
- if (lock->kind == EC_LOCK_ENTRY)
- {
- ec_trace("LOCK_ACQUIRE", fop, "lock=%p, inode=%p, path=%s",
- lock, lock->loc.inode, lock->loc.path);
-
- ec_entrylk(fop->frame, fop->xl, -1, EC_MINIMUM_ALL, ec_locked,
- lock, fop->xl->name, &lock->loc, NULL,
- ENTRYLK_LOCK, lock->type, NULL);
- }
- else
- {
- ec_trace("LOCK_ACQUIRE", fop, "lock=%p, inode=%p", lock,
- lock->loc.inode);
+ loc_t loc;
- ec_inodelk(fop->frame, fop->xl, -1, EC_MINIMUM_ALL, ec_locked,
- lock, fop->xl->name, &lock->loc, F_SETLKW,
- &lock->flock, NULL);
- }
+ if (fop->error != 0) {
+ return;
+ }
- break;
- }
+ if (ec_loc_from_fd(fop->xl, &loc, fd) != 0) {
+ ec_fop_set_error(fop, EIO);
- ec_trace("LOCK_REUSE", fop, "lock=%p", lock);
+ return;
+ }
- if (lock->have_size)
- {
- fop->pre_size = fop->post_size = lock->size;
- fop->have_size = 1;
- }
- fop->mask &= lock->good_mask;
+ ec_lock_prepare_inode_internal(fop, &loc, flags, NULL);
- fop->locked++;
- }
+ loc_wipe(&loc);
}
gf_boolean_t
-ec_config_check (ec_fop_data_t *fop, dict_t *xdata)
+ec_config_check (ec_fop_data_t *fop, ec_config_t *config)
{
ec_t *ec;
- if (ec_dict_del_config(xdata, EC_XATTR_CONFIG, &fop->config) < 0) {
- gf_log(fop->xl->name, GF_LOG_ERROR, "Failed to get a valid "
- "config");
-
- ec_fop_set_error(fop, EIO);
-
- return _gf_false;
- }
-
ec = fop->xl->private;
- if ((fop->config.version != EC_CONFIG_VERSION) ||
- (fop->config.algorithm != EC_CONFIG_ALGORITHM) ||
- (fop->config.gf_word_size != EC_GF_BITS) ||
- (fop->config.bricks != ec->nodes) ||
- (fop->config.redundancy != ec->redundancy) ||
- (fop->config.chunk_size != EC_METHOD_CHUNK_SIZE)) {
+ if ((config->version != EC_CONFIG_VERSION) ||
+ (config->algorithm != EC_CONFIG_ALGORITHM) ||
+ (config->gf_word_size != EC_GF_BITS) ||
+ (config->bricks != ec->nodes) ||
+ (config->redundancy != ec->redundancy) ||
+ (config->chunk_size != EC_METHOD_CHUNK_SIZE)) {
uint32_t data_bricks;
/* This combination of version/algorithm requires the following
@@ -957,271 +825,201 @@ ec_config_check (ec_fop_data_t *fop, dict_t *xdata)
chunk_size (in bits) must be a multiple of gf_word_size *
(bricks - redundancy) */
- data_bricks = fop->config.bricks - fop->config.redundancy;
- if ((fop->config.redundancy < 1) ||
- (fop->config.redundancy * 2 >= fop->config.bricks) ||
- !ec_is_power_of_2(fop->config.gf_word_size) ||
- ((fop->config.chunk_size * 8) % (fop->config.gf_word_size *
- data_bricks) != 0)) {
+ data_bricks = config->bricks - config->redundancy;
+ if ((config->redundancy < 1) ||
+ (config->redundancy * 2 >= config->bricks) ||
+ !ec_is_power_of_2(config->gf_word_size) ||
+ ((config->chunk_size * 8) % (config->gf_word_size * data_bricks)
+ != 0)) {
gf_log(fop->xl->name, GF_LOG_ERROR, "Invalid or corrupted config");
} else {
gf_log(fop->xl->name, GF_LOG_ERROR, "Unsupported config "
"(V=%u, A=%u, W=%u, "
"N=%u, R=%u, S=%u)",
- fop->config.version, fop->config.algorithm,
- fop->config.gf_word_size, fop->config.bricks,
- fop->config.redundancy, fop->config.chunk_size);
+ config->version, config->algorithm,
+ config->gf_word_size, config->bricks,
+ config->redundancy, config->chunk_size);
}
- ec_fop_set_error(fop, EIO);
-
return _gf_false;
}
return _gf_true;
}
-int32_t ec_get_size_version_set(call_frame_t * frame, void * cookie,
- xlator_t * this, int32_t op_ret,
- int32_t op_errno, inode_t * inode,
- struct iatt * buf, dict_t * xdata,
- struct iatt * postparent)
+int32_t
+ec_prepare_update_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ dict_t *dict, dict_t *xdata)
{
- ec_fop_data_t * fop = cookie;
- ec_inode_t * ctx;
+ ec_fop_data_t *fop = cookie, *parent;
+ ec_lock_link_t *link = fop->data;
ec_lock_t *lock = NULL;
+ ec_inode_t *ctx;
- if (op_ret >= 0)
- {
- if ((buf->ia_type == IA_IFREG) && !ec_config_check(fop, xdata)) {
- return 0;
- }
+ lock = link->lock;
+ parent = link->fop;
+ ctx = lock->ctx;
- LOCK(&inode->lock);
+ if (op_ret < 0) {
+ gf_log(this->name, GF_LOG_WARNING,
+ "Failed to get size and version (error %d: %s)", op_errno,
+ strerror (op_errno));
- ctx = __ec_inode_get(inode, this);
- if (ctx != NULL) {
- if (ctx->inode_lock != NULL) {
- lock = ctx->inode_lock;
- lock->version[0] = fop->answer->version[0];
- lock->version[1] = fop->answer->version[1];
+ goto out;
+ }
- if (buf->ia_type == IA_IFREG) {
- lock->have_size = 1;
- lock->size = buf->ia_size;
- }
- }
- if (ctx->entry_lock != NULL) {
- lock = ctx->entry_lock;
- lock->version[0] = fop->answer->version[0];
- lock->version[1] = fop->answer->version[1];
- }
- }
+ op_errno = EIO;
- UNLOCK(&inode->lock);
+ LOCK(&lock->loc.inode->lock);
- if (lock != NULL)
- {
- // Only update parent mask if the lookup has been made with
- // inode locked.
- fop->parent->mask &= fop->good;
- }
+ if (ec_dict_del_array(dict, EC_XATTR_VERSION, ctx->pre_version,
+ EC_VERSION_SIZE) != 0) {
+ gf_log(this->name, GF_LOG_ERROR, "Unable to get version xattr");
- if (buf->ia_type == IA_IFREG) {
- fop->parent->pre_size = fop->parent->post_size = buf->ia_size;
- fop->parent->have_size = 1;
- }
- }
- else
- {
- gf_log(this->name, GF_LOG_WARNING, "Failed to get size and version "
- "(error %d)", op_errno);
- ec_fop_set_error(fop, op_errno);
+ goto unlock;
}
+ ctx->post_version[0] += ctx->pre_version[0];
+ ctx->post_version[1] += ctx->pre_version[1];
- return 0;
-}
+ ctx->have_version = _gf_true;
-gf_boolean_t
-ec_is_data_fop (glusterfs_fop_t fop)
-{
- switch (fop) {
- case GF_FOP_WRITE:
- case GF_FOP_TRUNCATE:
- case GF_FOP_FTRUNCATE:
- case GF_FOP_FALLOCATE:
- case GF_FOP_DISCARD:
- case GF_FOP_ZEROFILL:
- return _gf_true;
- default:
- return _gf_false;
- }
- return _gf_false;
-}
+ if (ec_dict_del_array(dict, EC_XATTR_DIRTY, ctx->pre_dirty,
+ EC_VERSION_SIZE) == 0) {
+ ctx->post_dirty[0] += ctx->pre_dirty[0];
+ ctx->post_dirty[1] += ctx->pre_dirty[1];
-gf_boolean_t
-ec_is_metadata_fop (glusterfs_fop_t fop)
-{
- switch (fop) {
- case GF_FOP_SETATTR:
- case GF_FOP_FSETATTR:
- case GF_FOP_SETXATTR:
- case GF_FOP_FSETXATTR:
- case GF_FOP_REMOVEXATTR:
- case GF_FOP_FREMOVEXATTR:
- return _gf_true;
- default:
- return _gf_false;
- }
- return _gf_false;
-}
+ ctx->have_dirty = _gf_true;
+ }
-int32_t
-ec_prepare_update_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *dict, dict_t *xdata)
-{
- ec_fop_data_t *fop = cookie, *parent;
- ec_lock_t *lock = NULL;
- uint64_t size = 0;
- uint64_t version[EC_VERSION_SIZE] = {0, 0};
+ if (lock->loc.inode->ia_type == IA_IFREG) {
+ if (ec_dict_del_number(dict, EC_XATTR_SIZE, &ctx->pre_size) != 0) {
+ gf_log(this->name, GF_LOG_ERROR, "Unable to get size xattr");
- if (op_ret >= 0) {
- parent = fop->parent;
- while ((parent != NULL) && (parent->locks[0].lock == NULL)) {
- parent = parent->parent;
- }
- if (parent == NULL) {
- return 0;
+ goto unlock;
}
+ ctx->post_size = ctx->pre_size;
- lock = parent->locks[0].lock;
- if (ec_is_metadata_fop (fop->parent->id))
- lock->is_dirty[EC_METADATA_TXN] = _gf_true;
- else
- lock->is_dirty[EC_DATA_TXN] = _gf_true;
-
- if (lock->loc.inode->ia_type == IA_IFREG) {
- if (!ec_config_check(fop, dict) ||
- (ec_dict_del_number(dict, EC_XATTR_SIZE, &size) != 0)) {
- ec_fop_set_error(fop, EIO);
- return 0;
- }
- }
+ ctx->have_size = _gf_true;
- if (ec_dict_del_array(dict, EC_XATTR_VERSION, version,
- EC_VERSION_SIZE) != 0) {
- ec_fop_set_error(fop, EIO);
- return 0;
+ if ((ec_dict_del_config(dict, EC_XATTR_CONFIG, &ctx->config) != 0) ||
+ !ec_config_check(parent, &ctx->config)) {
+ gf_log(this->name, GF_LOG_ERROR, "Unable to get config xattr");
+
+ goto unlock;
}
- LOCK(&lock->loc.inode->lock);
+ ctx->have_config = _gf_true;
+ }
- if (lock->loc.inode->ia_type == IA_IFREG) {
- lock->size = size;
- fop->parent->pre_size = fop->parent->post_size = size;
- fop->parent->have_size = lock->have_size = 1;
- }
- lock->version[0] = version[0];
- lock->version[1] = version[1];
+ ctx->have_info = _gf_true;
- UNLOCK(&lock->loc.inode->lock);
+ op_errno = 0;
+
+unlock:
+ UNLOCK(&lock->loc.inode->lock);
+out:
+ if (op_errno == 0) {
+ parent->mask &= fop->good;
- fop->parent->mask &= fop->good;
/*As of now only data healing marks bricks as healing*/
- if (ec_is_data_fop (fop->parent->id))
- fop->parent->healing |= fop->healing;
+ if (ec_is_data_fop (parent->id)) {
+ parent->healing |= fop->healing;
+ }
} else {
- gf_log(this->name, GF_LOG_WARNING,
- "Failed to get size and version (error %d: %s)", op_errno,
- strerror (op_errno));
- ec_fop_set_error(fop, op_errno);
+ ec_fop_set_error(parent, op_errno);
}
return 0;
}
-void ec_get_size_version(ec_fop_data_t * fop)
+void ec_get_size_version(ec_lock_link_t *link)
{
loc_t loc;
- dict_t * xdata;
+ ec_lock_t *lock;
+ ec_inode_t *ctx;
+ ec_fop_data_t *fop;
+ dict_t *dict = NULL;
uid_t uid;
gid_t gid;
int32_t error = ENOMEM;
uint64_t allzero[EC_VERSION_SIZE] = {0, 0};
- if (fop->have_size)
- {
+ lock = link->lock;
+ ctx = lock->ctx;
+
+ /* If ec metadata has already been retrieved, do not try again. */
+ if (ctx->have_info) {
return;
}
- if ((fop->parent != NULL) && fop->parent->have_size)
- {
- fop->pre_size = fop->parent->pre_size;
- fop->post_size = fop->parent->post_size;
-
- fop->have_size = 1;
-
+ /* Determine if there's something we need to retrieve for the current
+ * operation. */
+ if (!lock->query && (lock->loc.inode->ia_type != IA_IFREG)) {
return;
}
+
+ fop = link->fop;
+
uid = fop->frame->root->uid;
gid = fop->frame->root->gid;
- fop->frame->root->uid = 0;
- fop->frame->root->gid = 0;
-
memset(&loc, 0, sizeof(loc));
- xdata = dict_new();
- if (xdata == NULL)
- {
+ dict = dict_new();
+ if (dict == NULL) {
goto out;
}
- if ((ec_dict_set_array(xdata, EC_XATTR_VERSION,
- allzero, EC_VERSION_SIZE) != 0) ||
- (ec_dict_set_number(xdata, EC_XATTR_SIZE, 0) != 0) ||
- (ec_dict_set_number(xdata, EC_XATTR_CONFIG, 0) != 0) ||
- (ec_dict_set_array(xdata, EC_XATTR_DIRTY, allzero,
- EC_VERSION_SIZE) != 0))
- {
+
+ /* Once we know that an xattrop will be needed, we try to get all available
+ * information in a single call. */
+ if ((ec_dict_set_array(dict, EC_XATTR_VERSION, allzero,
+ EC_VERSION_SIZE) != 0) ||
+ (ec_dict_set_array(dict, EC_XATTR_DIRTY, allzero,
+ EC_VERSION_SIZE) != 0)) {
goto out;
}
- error = EIO;
+ if (lock->loc.inode->ia_type == IA_IFREG) {
+ if ((ec_dict_set_number(dict, EC_XATTR_SIZE, 0) != 0) ||
+ (ec_dict_set_number(dict, EC_XATTR_CONFIG, 0) != 0)) {
+ goto out;
+ }
+ }
- if (!fop->use_fd)
- {
- if (ec_loc_from_loc(fop->xl, &loc, &fop->loc[0]) != 0)
- {
+ fop->frame->root->uid = 0;
+ fop->frame->root->gid = 0;
+
+ /* For normal fops, ec_[f]xattrop() must succeed on at least
+ * EC_MINIMUM_MIN bricks, however when this is called as part of a
+ * self-heal operation the mask of target bricks (fop->mask) could
+ * contain less than EC_MINIMUM_MIN bricks, causing the lookup to
+ * always fail. Thus we always use the same minimum used for the main
+ * fop.
+ */
+ if (lock->fd == NULL) {
+ if (ec_loc_from_loc(fop->xl, &loc, &lock->loc) != 0) {
goto out;
}
- if (gf_uuid_is_null(loc.pargfid))
- {
- if (loc.parent != NULL)
- {
+ if (gf_uuid_is_null(loc.pargfid)) {
+ if (loc.parent != NULL) {
inode_unref(loc.parent);
loc.parent = NULL;
}
GF_FREE((char *)loc.path);
- loc.path = NULL;
- loc.name = NULL;
+ loc.path = NULL;
+ loc.name = NULL;
}
- /* For normal fops, ec_lookup() must succeed on at least EC_MINIMUM_MIN
- * bricks, however when this is called as part of a self-heal operation
- * the mask of target bricks (fop->mask) could contain less than
- * EC_MINIMUM_MIN bricks, causing the lookup to always fail. Thus we
- * always use the same minimum used for the main fop.
- */
- ec_lookup(fop->frame, fop->xl, fop->mask, fop->minimum,
- ec_get_size_version_set, NULL, &loc, xdata);
+
+ ec_xattrop (fop->frame, fop->xl, fop->mask, fop->minimum,
+ ec_prepare_update_cbk, link, &loc,
+ GF_XATTROP_ADD_ARRAY64, dict, NULL);
} else {
- if (ec_loc_from_fd(fop->xl, &loc, fop->fd) != 0) {
- goto out;
- }
ec_fxattrop(fop->frame, fop->xl, fop->mask, fop->minimum,
- ec_prepare_update_cbk, NULL, fop->fd,
- GF_XATTROP_ADD_ARRAY64, xdata, NULL);
+ ec_prepare_update_cbk, link, lock->fd,
+ GF_XATTROP_ADD_ARRAY64, dict, NULL);
}
+
error = 0;
out:
@@ -1230,9 +1028,8 @@ out:
loc_wipe(&loc);
- if (xdata != NULL)
- {
- dict_unref(xdata);
+ if (dict != NULL) {
+ dict_unref(dict);
}
if (error != 0) {
@@ -1240,147 +1037,408 @@ out:
}
}
-void ec_prepare_update(ec_fop_data_t *fop)
+gf_boolean_t ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode,
+ uint64_t *size)
{
- loc_t loc;
- dict_t *xdata;
- ec_fop_data_t *tmp;
- ec_lock_t *lock;
- ec_t *ec;
- uid_t uid;
- gid_t gid;
- uint64_t version[2] = {0, 0};
- uint64_t dirty[2] = {0, 0};
- int32_t error = ENOMEM;
+ ec_inode_t *ctx;
+ gf_boolean_t found = _gf_false;
- tmp = fop;
- while ((tmp != NULL) && (tmp->locks[0].lock == NULL)) {
- tmp = tmp->parent;
- }
- if ((tmp != NULL) &&
- (tmp->locks[0].lock->is_dirty[0] || tmp->locks[0].lock->is_dirty[1])) {
- lock = tmp->locks[0].lock;
+ LOCK(&inode->lock);
- fop->pre_size = fop->post_size = lock->size;
- fop->have_size = 1;
+ ctx = __ec_inode_get(inode, fop->xl);
+ if (ctx == NULL) {
+ goto unlock;
+ }
- return;
+ if (ctx->have_size) {
+ *size = ctx->post_size;
+ found = _gf_true;
}
- uid = fop->frame->root->uid;
- gid = fop->frame->root->gid;
- fop->frame->root->uid = 0;
- fop->frame->root->gid = 0;
+unlock:
+ UNLOCK(&inode->lock);
- memset(&loc, 0, sizeof(loc));
+ return found;
+}
- ec = fop->xl->private;
- if (ec_bits_count (fop->mask) >= ec->fragments) {
- /* It is changing data only if the update happens on at least
- * fragment number of bricks. Otherwise it probably is healing*/
- if (ec_is_metadata_fop (fop->id))
- dirty[EC_METADATA_TXN] = 1;
- else
- dirty[EC_DATA_TXN] = 1;
+gf_boolean_t ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode,
+ uint64_t size)
+{
+ ec_inode_t *ctx;
+ gf_boolean_t found = _gf_false;
+
+ LOCK(&inode->lock);
+
+ ctx = __ec_inode_get(inode, fop->xl);
+ if (ctx == NULL) {
+ goto unlock;
}
- xdata = dict_new();
- if (xdata == NULL) {
- goto out;
+ /* Normal fops always have ctx->have_size set. However self-heal calls this
+ * to prepare the inode, so ctx->have_size will be false. In this case we
+ * prepare both pre_size and post_size, and set have_size and have_info to
+ * true. */
+ if (!ctx->have_size) {
+ ctx->pre_size = size;
+ ctx->have_size = ctx->have_info = _gf_true;
}
- if ((ec_dict_set_array(xdata, EC_XATTR_VERSION,
- version, EC_VERSION_SIZE) != 0) ||
- (ec_dict_set_number(xdata, EC_XATTR_SIZE, 0) != 0) ||
- (ec_dict_set_number(xdata, EC_XATTR_CONFIG, 0) != 0) ||
- (ec_dict_set_array(xdata, EC_XATTR_DIRTY, dirty,
- EC_VERSION_SIZE) != 0)) {
- goto out;
+ ctx->post_size = size;
+
+ found = _gf_true;
+
+unlock:
+ UNLOCK(&inode->lock);
+
+ return found;
+}
+
+void ec_clear_inode_info(ec_fop_data_t *fop, inode_t *inode)
+{
+ ec_inode_t *ctx;
+
+ LOCK(&inode->lock);
+
+ ctx = __ec_inode_get(inode, fop->xl);
+ if (ctx == NULL) {
+ goto unlock;
}
- error = EIO;
+ ctx->have_info = _gf_false;
+ ctx->have_config = _gf_false;
+ ctx->have_version = _gf_false;
+ ctx->have_size = _gf_false;
+ ctx->have_dirty = _gf_false;
- if (!fop->use_fd) {
- if (ec_loc_from_loc(fop->xl, &loc, &fop->loc[0]) != 0) {
- goto out;
- }
+ memset(&ctx->config, 0, sizeof(ctx->config));
+ memset(ctx->pre_version, 0, sizeof(ctx->pre_version));
+ memset(ctx->post_version, 0, sizeof(ctx->post_version));
+ ctx->pre_size = ctx->post_size = 0;
+ memset(ctx->pre_dirty, 0, sizeof(ctx->pre_dirty));
+ memset(ctx->post_dirty, 0, sizeof(ctx->post_dirty));
+
+unlock:
+ UNLOCK(&inode->lock);
+}
- ec_xattrop(fop->frame, fop->xl, fop->mask, fop->minimum,
- ec_prepare_update_cbk, NULL, &loc, GF_XATTROP_ADD_ARRAY64,
- xdata, NULL);
+int32_t ec_get_real_size_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata,
+ struct iatt *postparent)
+{
+ ec_fop_data_t *fop = cookie;
+ ec_lock_link_t *link;
+
+ if (op_ret >= 0) {
+ link = fop->data;
+ if (ec_dict_del_number(xdata, EC_XATTR_SIZE, &link->size) != 0) {
+ gf_log(this->name, GF_LOG_WARNING,
+ "Unable to determine real file size");
+ }
} else {
- ec_fxattrop(fop->frame, fop->xl, fop->mask, fop->minimum,
- ec_prepare_update_cbk, NULL, fop->fd,
- GF_XATTROP_ADD_ARRAY64, xdata, NULL);
+ /* Prevent failure of parent fop. */
+ fop->error = 0;
}
- error = 0;
+ return 0;
+}
-out:
+/* This function is used to get the trusted.ec.size xattr from a file when
+ * no lock is needed on the inode. This is only required to maintan iatt
+ * structs on fops that manipulate directory entries but do not operate
+ * directly on the inode, like link, rename, ...
+ *
+ * Any error processing this request is ignored. In the worst case, an invalid
+ * or not up to date value in the iatt could cause some cache invalidation.
+ */
+void ec_get_real_size(ec_lock_link_t *link)
+{
+ ec_fop_data_t *fop;
+ dict_t *xdata;
- fop->frame->root->uid = uid;
- fop->frame->root->gid = gid;
+ if (link->base == NULL) {
+ return;
+ }
- loc_wipe(&loc);
+ if (link->base->inode->ia_type != IA_IFREG) {
+ return;
+ }
+ fop = link->fop;
+
+ if (ec_get_inode_size(fop, link->base->inode, &link->size)) {
+ return;
+ }
+
+ xdata = dict_new();
+ if (xdata == NULL) {
+ return;
+ }
+ if (ec_dict_set_number(xdata, EC_XATTR_SIZE, 0) != 0) {
+ goto out;
+ }
+
+ /* Send a simple lookup. A single answer is considered ok since this value
+ * is only used to return an iatt struct related to an inode that is not
+ * locked and have not suffered any operation. */
+ ec_lookup(fop->frame, fop->xl, fop->mask, 1, ec_get_real_size_cbk, link,
+ link->base, xdata);
+
+out:
if (xdata != NULL) {
dict_unref(xdata);
}
+}
- if (error != 0) {
- ec_fop_set_error(fop, error);
+void ec_lock_acquired(ec_lock_link_t *link)
+{
+ ec_lock_t *lock;
+ ec_fop_data_t *fop;
+
+ lock = link->lock;
+ fop = link->fop;
+
+ ec_trace("LOCKED", link->fop, "lock=%p", lock);
+
+ /* If the fop has an fd available, attach it to the lock structure to be
+ * able to do fxattrop calls instead of xattrop. It's safe to change this
+ * here because no xattrop using the fd can start concurrently at this
+ * point. */
+ if (fop->use_fd) {
+ if (lock->fd != NULL) {
+ fd_unref(lock->fd);
+ }
+ lock->fd = fd_ref(fop->fd);
}
+ lock->acquired = _gf_true;
+
+ fop->mask &= lock->good_mask;
+
+ fop->locked++;
+
+ ec_get_size_version(link);
+ ec_get_real_size(link);
}
-int32_t ec_unlocked(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
+int32_t ec_locked(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
ec_fop_data_t *fop = cookie;
+ ec_lock_link_t *link = NULL;
+ ec_lock_t *lock = NULL;
- if (op_ret < 0) {
- gf_log(this->name, GF_LOG_WARNING, "entry/inode unlocking failed (%s)",
- ec_fop_name(fop->parent->id));
+ if (op_ret >= 0) {
+ link = fop->data;
+ lock = link->lock;
+ lock->mask = lock->good_mask = fop->good;
+
+ ec_lock_acquired(link);
+ ec_lock(fop->parent);
} else {
- ec_trace("UNLOCKED", fop->parent, "lock=%p", fop->data);
+ gf_log(this->name, GF_LOG_WARNING, "Failed to complete preop lock");
}
return 0;
}
-void ec_unlock_lock(ec_fop_data_t *fop, ec_lock_t *lock)
+gf_boolean_t ec_lock_acquire(ec_lock_link_t *link)
{
- if ((lock->mask != 0) && lock->acquired) {
+ ec_lock_t *lock;
+ ec_fop_data_t *fop;
+
+ lock = link->lock;
+ fop = link->fop;
+ if (!lock->acquired) {
ec_owner_set(fop->frame, lock);
- switch (lock->kind) {
- case EC_LOCK_ENTRY:
- ec_trace("UNLOCK_ENTRYLK", fop, "lock=%p, inode=%p, path=%s", lock,
- lock->loc.inode, lock->loc.path);
+ ec_trace("LOCK_ACQUIRE", fop, "lock=%p, inode=%p", lock,
+ lock->loc.inode);
+
+ lock->flock.l_type = F_WRLCK;
+ ec_inodelk(fop->frame, fop->xl, -1, EC_MINIMUM_ALL, ec_locked,
+ link, fop->xl->name, &lock->loc, F_SETLKW, &lock->flock,
+ NULL);
+
+ return _gf_false;
+ }
+
+ ec_trace("LOCK_REUSE", fop, "lock=%p", lock);
+
+ ec_lock_acquired(link);
+
+ return _gf_true;
+}
+
+void ec_lock(ec_fop_data_t *fop)
+{
+ ec_lock_link_t *link;
+ ec_lock_link_t *timer_link = NULL;
+ ec_lock_t *lock;
+
+ /* There is a chance that ec_resume is called on fop even before ec_sleep.
+ * Which can result in refs == 0 for fop leading to use after free in this
+ * function when it calls ec_sleep so do ec_sleep at start and end of this
+ * function.*/
+ ec_sleep (fop);
+ while (fop->locked < fop->lock_count) {
+ /* Since there are only up to 2 locks per fop, this xor will change
+ * the order of the locks if fop->first_lock is 1. */
+ link = &fop->locks[fop->locked ^ fop->first_lock];
+ lock = link->lock;
+
+ timer_link = NULL;
+
+ LOCK(&lock->loc.inode->lock);
+ GF_ASSERT (lock->inserted > 0);
+ lock->inserted--;
+
+ if (lock->timer != NULL) {
+ GF_ASSERT (lock->release == _gf_false);
+ timer_link = lock->timer->data;
+ ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock);
+ gf_timer_call_cancel(fop->xl->ctx, lock->timer);
+ lock->timer = NULL;
+
+ lock->refs--;
+ /* There should remain at least 1 ref, the current one. */
+ GF_ASSERT(lock->refs > 0);
+ }
+
+ GF_ASSERT(list_empty(&link->wait_list));
+
+ if ((lock->owner != NULL) || lock->release) {
+ if (lock->release) {
+ ec_trace("LOCK_QUEUE_FREEZE", fop, "lock=%p", lock);
- ec_entrylk(fop->frame, fop->xl, lock->mask, EC_MINIMUM_ALL,
- ec_unlocked, lock, fop->xl->name, &lock->loc, NULL,
- ENTRYLK_UNLOCK, lock->type, NULL);
+ list_add_tail(&link->wait_list, &lock->frozen);
+
+ /* The lock is frozen, so we move the current reference to
+ * refs_frozen. After that, there should remain at least one
+ * ref belonging to the lock that is processing the release. */
+ lock->refs--;
+ GF_ASSERT(lock->refs > 0);
+ lock->refs_frozen++;
+ } else {
+ ec_trace("LOCK_QUEUE_WAIT", fop, "lock=%p", lock);
+
+ list_add_tail(&link->wait_list, &lock->waiting);
+ }
+
+ UNLOCK(&lock->loc.inode->lock);
+
+ ec_sleep(fop);
break;
+ }
- case EC_LOCK_INODE:
- lock->flock.l_type = F_UNLCK;
- ec_trace("UNLOCK_INODELK", fop, "lock=%p, inode=%p", lock,
- lock->loc.inode);
+ lock->owner = fop;
- ec_inodelk(fop->frame, fop->xl, lock->mask, EC_MINIMUM_ALL,
- ec_unlocked, lock, fop->xl->name, &lock->loc, F_SETLK,
- &lock->flock, NULL);
+ UNLOCK(&lock->loc.inode->lock);
+ if (!ec_lock_acquire(link)) {
break;
+ }
- default:
- gf_log(fop->xl->name, GF_LOG_ERROR, "Invalid lock type");
+ if (timer_link != NULL) {
+ ec_resume(timer_link->fop, 0);
+ timer_link = NULL;
}
}
+ ec_resume (fop, 0);
+
+ if (timer_link != NULL) {
+ ec_resume(timer_link->fop, 0);
+ }
+}
+
+void
+ec_lock_unfreeze(ec_lock_link_t *link)
+{
+ ec_lock_t *lock;
- ec_trace("LOCK_DESTROY", fop, "lock=%p", lock);
+ lock = link->lock;
+
+ LOCK(&lock->loc.inode->lock);
+
+ lock->acquired = _gf_false;
+ lock->release = _gf_false;
+
+ lock->refs--;
+ GF_ASSERT (lock->refs == lock->inserted);
+
+ GF_ASSERT(list_empty(&lock->waiting) && (lock->owner == NULL));
+
+ list_splice_init(&lock->frozen, &lock->waiting);
+ lock->refs += lock->refs_frozen;
+ lock->refs_frozen = 0;
+
+ if (!list_empty(&lock->waiting)) {
+ link = list_entry(lock->waiting.next, ec_lock_link_t, wait_list);
+ list_del_init(&link->wait_list);
+
+ lock->owner = link->fop;
+
+ UNLOCK(&lock->loc.inode->lock);
+
+ ec_trace("LOCK_UNFREEZE", link->fop, "lock=%p", lock);
+
+ if (ec_lock_acquire(link)) {
+ ec_lock(link->fop);
+ }
+ ec_resume(link->fop, 0);
+ } else if (lock->refs == 0) {
+ ec_trace("LOCK_DESTROY", link->fop, "lock=%p", lock);
+
+ lock->ctx->inode_lock = NULL;
+
+ UNLOCK(&lock->loc.inode->lock);
- ec_lock_destroy(lock);
+ ec_lock_destroy(lock);
+ } else {
+ UNLOCK(&lock->loc.inode->lock);
+ }
+}
+
+int32_t ec_unlocked(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ ec_fop_data_t *fop = cookie;
+ ec_lock_link_t *link = fop->data;
+
+ if (op_ret < 0) {
+ gf_log(this->name, GF_LOG_WARNING, "entry/inode unlocking failed (%s)",
+ ec_fop_name(link->fop->id));
+ } else {
+ ec_trace("UNLOCKED", link->fop, "lock=%p", link->lock);
+ }
+
+ ec_lock_unfreeze(link);
+
+ return 0;
+}
+
+void ec_unlock_lock(ec_lock_link_t *link)
+{
+ ec_lock_t *lock;
+ ec_fop_data_t *fop;
+
+ lock = link->lock;
+ fop = link->fop;
+
+ ec_clear_inode_info(fop, lock->loc.inode);
+
+ if ((lock->mask != 0) && lock->acquired) {
+ ec_owner_set(fop->frame, lock);
+
+ lock->flock.l_type = F_UNLCK;
+ ec_trace("UNLOCK_INODELK", fop, "lock=%p, inode=%p", lock,
+ lock->loc.inode);
+
+ ec_inodelk(fop->frame, fop->xl, lock->mask, EC_MINIMUM_ALL,
+ ec_unlocked, link, fop->xl->name, &lock->loc, F_SETLK,
+ &lock->flock, NULL);
+ } else {
+ ec_lock_unfreeze(link);
+ }
}
int32_t ec_update_size_version_done(call_frame_t * frame, void * cookie,
@@ -1388,111 +1446,128 @@ int32_t ec_update_size_version_done(call_frame_t * frame, void * cookie,
int32_t op_errno, dict_t * xattr,
dict_t * xdata)
{
- ec_fop_data_t * fop = cookie;
+ ec_fop_data_t *fop = cookie;
+ ec_lock_link_t *link;
+ ec_lock_t *lock;
+ ec_inode_t *ctx;
- if (op_ret < 0)
- {
+ if (op_ret < 0) {
gf_log(fop->xl->name, GF_LOG_ERROR, "Failed to update version and "
"size (error %d)", op_errno);
- }
- else
- {
+ } else {
fop->parent->mask &= fop->good;
+ link = fop->data;
+ lock = link->lock;
+ ctx = lock->ctx;
+
+ if (ec_dict_del_array(xattr, EC_XATTR_VERSION, ctx->post_version,
+ EC_VERSION_SIZE) == 0) {
+ ctx->pre_version[0] = ctx->post_version[0];
+ ctx->pre_version[1] = ctx->post_version[1];
+
+ ctx->have_version = _gf_true;
+ }
+ if (ec_dict_del_number(xattr, EC_XATTR_SIZE, &ctx->post_size) == 0) {
+ ctx->pre_size = ctx->post_size;
+
+ ctx->have_size = _gf_true;
+ }
+ if (ec_dict_del_array(xattr, EC_XATTR_DIRTY, ctx->post_dirty,
+ EC_VERSION_SIZE) == 0) {
+ ctx->pre_dirty[0] = ctx->post_dirty[0];
+ ctx->pre_dirty[1] = ctx->post_dirty[1];
+
+ ctx->have_dirty = _gf_true;
+ }
+ if ((ec_dict_del_config(xdata, EC_XATTR_CONFIG, &ctx->config) == 0) &&
+ ec_config_check(fop->parent, &ctx->config)) {
+ ctx->have_config = _gf_true;
+ }
+
+ ctx->have_info = _gf_true;
}
- if (fop->data != NULL) {
- ec_unlock_lock(fop->parent, fop->data);
+ if ((fop->parent->id != GF_FOP_FLUSH) &&
+ (fop->parent->id != GF_FOP_FSYNC) &&
+ (fop->parent->id != GF_FOP_FSYNCDIR)) {
+ ec_unlock_lock(fop->data);
}
return 0;
}
-uint64_t
-ec_get_dirty_value (ec_t *ec, uintptr_t fop_mask, uint64_t version_delta,
- gf_boolean_t dirty)
-{
- uint64_t dirty_val = 0;
-
- if (version_delta) {
- if (~fop_mask & ec->node_mask) {
- /* fop didn't succeed on all subvols so 'dirty' xattr
- * shouldn't be cleared */
- if (!dirty)
- dirty_val = 1;
- } else {
- /* fop succeed on all subvols so 'dirty' xattr
- * should be cleared */
- if (dirty)
- dirty_val = -1;
- }
- }
- return dirty_val;
-}
-
void
-ec_update_size_version(ec_fop_data_t *fop, loc_t *loc, uint64_t version[2],
- uint64_t size, gf_boolean_t dirty[2], ec_lock_t *lock)
+ec_update_size_version(ec_lock_link_t *link, uint64_t *version,
+ uint64_t size, uint64_t *dirty)
{
- ec_t *ec = fop->xl->private;
+ ec_fop_data_t *fop;
+ ec_lock_t *lock;
+ ec_inode_t *ctx;
dict_t * dict;
uid_t uid;
gid_t gid;
- uint64_t dirty_values[2] = {0};
- int i = 0;
- if (fop->parent != NULL)
- {
- fop->parent->post_size = fop->post_size;
+ fop = link->fop;
- return;
- }
-
- ec_trace("UPDATE", fop, "version=%ld, size=%ld, dirty=%u", version, size,
- dirty);
+ ec_trace("UPDATE", fop, "version=%ld/%ld, size=%ld, dirty=%ld/%ld",
+ version[0], version[1], size, dirty[0], dirty[1]);
dict = dict_new();
- if (dict == NULL)
- {
+ if (dict == NULL) {
goto out;
}
- if (version[0] != 0 || version[1] != 0) {
+ lock = link->lock;
+ ctx = lock->ctx;
+
+ /* If we don't have version information or it has been modified, we
+ * update it. */
+ if (!ctx->have_version || (version[0] != 0) || (version[1] != 0)) {
if (ec_dict_set_array(dict, EC_XATTR_VERSION,
version, EC_VERSION_SIZE) != 0) {
goto out;
}
}
+
if (size != 0) {
+ /* If size has been changed, we should already know the previous size
+ * of the file. */
+ GF_ASSERT(ctx->have_size);
+
if (ec_dict_set_number(dict, EC_XATTR_SIZE, size) != 0) {
goto out;
}
}
- for (i = 0; i < sizeof (dirty_values)/sizeof (dirty_values[0]); i++) {
- dirty_values[i] = ec_get_dirty_value (ec, fop->mask, version[i],
- dirty[i]);
- }
-
- if (dirty_values[0] || dirty_values[1]) {
- if (ec_dict_set_array(dict, EC_XATTR_DIRTY, dirty_values,
+ /* If we don't have dirty information or it has been modified, we update
+ * it. */
+ if (!ctx->have_dirty || (dirty[0] != 0) || (dirty[1] != 0)) {
+ if (ec_dict_set_array(dict, EC_XATTR_DIRTY, dirty,
EC_VERSION_SIZE) != 0) {
goto out;
}
}
+ /* If config information is not know, we request it now. */
+ if ((lock->loc.inode->ia_type == IA_IFREG) && !ctx->have_config) {
+ /* A failure requesting this xattr is ignored because it's not
+ * absolutely required right now. */
+ ec_dict_set_number(dict, EC_XATTR_CONFIG, 0);
+ }
+
uid = fop->frame->root->uid;
gid = fop->frame->root->gid;
fop->frame->root->uid = 0;
fop->frame->root->gid = 0;
- if (fop->use_fd) {
- ec_fxattrop(fop->frame, fop->xl, fop->mask, EC_MINIMUM_MIN,
- ec_update_size_version_done, lock, fop->fd,
+ if (link->lock->fd == NULL) {
+ ec_xattrop(fop->frame, fop->xl, fop->mask, EC_MINIMUM_MIN,
+ ec_update_size_version_done, link, &link->lock->loc,
GF_XATTROP_ADD_ARRAY64, dict, NULL);
} else {
- ec_xattrop(fop->frame, fop->xl, fop->mask, EC_MINIMUM_MIN,
- ec_update_size_version_done, lock, loc,
+ ec_fxattrop(fop->frame, fop->xl, fop->mask, EC_MINIMUM_MIN,
+ ec_update_size_version_done, link, link->lock->fd,
GF_XATTROP_ADD_ARRAY64, dict, NULL);
}
@@ -1504,8 +1579,7 @@ ec_update_size_version(ec_fop_data_t *fop, loc_t *loc, uint64_t version[2],
return;
out:
- if (dict != NULL)
- {
+ if (dict != NULL) {
dict_unref(dict);
}
@@ -1514,46 +1588,99 @@ out:
gf_log(fop->xl->name, GF_LOG_ERROR, "Unable to update version and size");
}
-void ec_unlock_now(ec_fop_data_t *fop, ec_lock_t *lock)
+gf_boolean_t
+ec_update_info(ec_lock_link_t *link)
{
- ec_trace("UNLOCK_NOW", fop, "lock=%p", lock);
+ ec_lock_t *lock;
+ ec_inode_t *ctx;
+ uint64_t version[2];
+ uint64_t dirty[2];
+ uint64_t size;
- if ((lock->version_delta[0] != 0) || (lock->version_delta[1] != 0) ||
- lock->is_dirty[0] || lock->is_dirty[1]) {
- ec_update_size_version(fop, &lock->loc, lock->version_delta,
- lock->size_delta, lock->is_dirty, lock);
- } else {
- ec_unlock_lock(fop, lock);
+ lock = link->lock;
+ ctx = lock->ctx;
+
+ /* pre_version[*] will be 0 if have_version is false */
+ version[0] = ctx->post_version[0] - ctx->pre_version[0];
+ version[1] = ctx->post_version[1] - ctx->pre_version[1];
+
+ size = ctx->post_size - ctx->pre_size;
+
+ /* pre_dirty[*] will be 0 if have_dirty is false */
+ dirty[0] = ctx->post_dirty[0] - ctx->pre_dirty[0];
+ dirty[1] = ctx->post_dirty[1] - ctx->pre_dirty[1];
+
+ if ((version[0] != 0) || (version[1] != 0) ||
+ (dirty[0] != 0) || (dirty[1] != 0)) {
+ ec_update_size_version(link, version, size, dirty);
+
+ return _gf_true;
}
- ec_resume(fop, 0);
+ return _gf_false;
}
-void ec_unlock_timer_cbk(void *data)
+void
+ec_unlock_now(ec_lock_link_t *link)
{
- ec_lock_link_t *link = data;
- ec_lock_t *lock = link->lock;
- ec_fop_data_t *fop = NULL;
+ ec_trace("UNLOCK_NOW", link->fop, "lock=%p", link->lock);
- LOCK(&lock->loc.inode->lock);
+ if (!ec_update_info(link)) {
+ ec_unlock_lock(link);
+ }
- if (lock->timer != NULL) {
- fop = link->fop;
+ ec_resume(link->fop, 0);
+}
- ec_trace("UNLOCK_DELAYED", fop, "lock=%p", lock);
+void
+ec_unlock_timer_del(ec_lock_link_t *link)
+{
+ int32_t before = 0;
+ ec_lock_t *lock;
+ inode_t *inode;
+ gf_boolean_t now = _gf_false;
+
+ lock = link->lock;
+
+ /* A race condition can happen if timer expires, calls this function
+ * and the lock is released (lock->loc is wiped) but the fop is not
+ * fully completed yet (it's still on the list of pending fops). In
+ * this case, this function can also be called if ec_unlock_force() is
+ * called. */
+ inode = lock->loc.inode;
+ if (inode == NULL) {
+ return;
+ }
- GF_ASSERT(lock->refs == 1);
+ LOCK(&inode->lock);
- gf_timer_call_cancel(fop->xl->ctx, lock->timer);
- lock->timer = NULL;
- *lock->plock = NULL;
- }
+ if (lock->timer != NULL) {
+ ec_trace("UNLOCK_DELAYED", link->fop, "lock=%p", lock);
- UNLOCK(&lock->loc.inode->lock);
+ gf_timer_call_cancel(link->fop->xl->ctx, lock->timer);
+ lock->timer = NULL;
- if (fop != NULL) {
- ec_unlock_now(fop, lock);
- }
+ lock->release = now = _gf_true;
+
+ before = lock->refs + lock->refs_frozen;
+ list_splice_init(&lock->waiting, &lock->frozen);
+ lock->refs_frozen += lock->refs - lock->inserted - 1;
+ lock->refs = 1 + lock->inserted;
+ /* We moved around the locks, so total number of locks shouldn't
+ * change by this operation*/
+ GF_ASSERT (before == (lock->refs + lock->refs_frozen));
+ }
+
+ UNLOCK(&inode->lock);
+
+ if (now) {
+ ec_unlock_now(link);
+ }
+}
+
+void ec_unlock_timer_cbk(void *data)
+{
+ ec_unlock_timer_del(data);
}
void ec_unlock_timer_add(ec_lock_link_t *link)
@@ -1561,28 +1688,28 @@ void ec_unlock_timer_add(ec_lock_link_t *link)
struct timespec delay;
ec_fop_data_t *fop = link->fop;
ec_lock_t *lock = link->lock;
- int32_t refs = 1;
+ gf_boolean_t now = _gf_false;
LOCK(&lock->loc.inode->lock);
GF_ASSERT(lock->timer == NULL);
- if (lock->refs != 1) {
+ if ((lock->refs - lock->inserted) > 1) {
ec_trace("UNLOCK_SKIP", fop, "lock=%p", lock);
lock->refs--;
UNLOCK(&lock->loc.inode->lock);
} else if (lock->acquired) {
- delay.tv_sec = 1;
- delay.tv_nsec = 0;
+ ec_t *ec = fop->xl->private;
ec_sleep(fop);
- /* If healing is needed, do not delay lock release to let self-heal
- * start working as soon as possible. */
- if (!ec_fop_needs_heal(fop)) {
- ec_trace("UNLOCK_DELAY", fop, "lock=%p", lock);
+ /* If healing is needed, the lock needs to be released due to
+ * contention, or ec is shutting down, do not delay lock release. */
+ if (!lock->release && !ec_fop_needs_heal(fop) && !ec->shutdown) {
+ ec_trace("UNLOCK_DELAY", fop, "lock=%p, release=%d", lock,
+ lock->release);
delay.tv_sec = 1;
delay.tv_nsec = 0;
@@ -1592,26 +1719,25 @@ void ec_unlock_timer_add(ec_lock_link_t *link)
gf_log(fop->xl->name, GF_LOG_WARNING, "Unable to delay an "
"unlock");
- *lock->plock = NULL;
- refs = 0;
+ lock->release = now = _gf_true;
}
} else {
- ec_trace("UNLOCK_FORCE", fop, "lock=%p", lock);
- *lock->plock = NULL;
- refs = 0;
+ ec_trace("UNLOCK_FORCE", fop, "lock=%p, release=%d", lock,
+ lock->release);
+ lock->release = now = _gf_true;
}
UNLOCK(&lock->loc.inode->lock);
- if (refs == 0) {
- ec_unlock_now(fop, lock);
+ if (now) {
+ ec_unlock_now(link);
}
} else {
- *lock->plock = NULL;
+ lock->release = _gf_true;
UNLOCK(&lock->loc.inode->lock);
- ec_lock_destroy(lock);
+ ec_lock_unfreeze(link);
}
}
@@ -1624,52 +1750,60 @@ void ec_unlock(ec_fop_data_t *fop)
}
}
-void ec_flush_size_version(ec_fop_data_t * fop)
+void
+ec_unlock_force(ec_fop_data_t *fop)
{
- ec_lock_t * lock;
- uint64_t version[2], delta;
- gf_boolean_t dirty[2] = {_gf_false, _gf_false};
+ int32_t i;
- GF_ASSERT(fop->lock_count == 1);
+ for (i = 0; i < fop->lock_count; i++) {
+ ec_trace("UNLOCK_FORCED", fop, "lock=%p", &fop->locks[i]);
- lock = fop->locks[0].lock;
-
- LOCK(&lock->loc.inode->lock);
-
- GF_ASSERT(lock->owner == fop);
-
- version[0] = lock->version_delta[0];
- version[1] = lock->version_delta[1];
- dirty[0] = lock->is_dirty[0];
- dirty[1] = lock->is_dirty[1];
- delta = lock->size_delta;
- lock->version_delta[0] = 0;
- lock->version_delta[1] = 0;
- lock->size_delta = 0;
- lock->is_dirty[0] = _gf_false;
- lock->is_dirty[1] = _gf_false;
+ ec_unlock_timer_del(&fop->locks[i]);
+ }
+}
- UNLOCK(&lock->loc.inode->lock);
+void ec_flush_size_version(ec_fop_data_t *fop)
+{
+ GF_ASSERT(fop->lock_count == 1);
- if (version[0] > 0 || version[1] > 0 || dirty[0] || dirty[1])
- {
- ec_update_size_version(fop, &lock->loc, version, delta, dirty,
- NULL);
- }
+ ec_update_info(&fop->locks[0]);
}
void ec_lock_reuse(ec_fop_data_t *fop)
{
- ec_fop_data_t * wait_fop;
- ec_lock_t * lock;
- ec_lock_link_t * link;
- int32_t i;
+ ec_t *ec;
+ ec_cbk_data_t *cbk;
+ ec_lock_t *lock;
+ ec_lock_link_t *link;
+ ec_inode_t *ctx;
+ int32_t i, count;
+ gf_boolean_t release = _gf_false;
+
+ cbk = fop->answer;
+ if (cbk != NULL) {
+ if (cbk->xdata != NULL) {
+ if ((dict_get_int32(cbk->xdata, GLUSTERFS_INODELK_COUNT,
+ &count) == 0) && (count > 1)) {
+ release = _gf_true;
+ }
+ if (release) {
+ gf_log(fop->xl->name, GF_LOG_DEBUG,
+ "Lock contention detected");
+ }
+ }
+ } else {
+ /* If we haven't get an answer with enough quorum, we always release
+ * the lock. */
+ release = _gf_true;
+ }
+
+ ec = fop->xl->private;
for (i = 0; i < fop->lock_count; i++)
{
- wait_fop = NULL;
-
- lock = fop->locks[i].lock;
+ link = &fop->locks[i];
+ lock = link->lock;
+ ctx = lock->ctx;
LOCK(&lock->loc.inode->lock);
@@ -1677,47 +1811,42 @@ void ec_lock_reuse(ec_fop_data_t *fop)
GF_ASSERT(lock->owner == fop);
lock->owner = NULL;
+ lock->release |= release;
- if (((fop->locks_update >> i) & 1) != 0) {
- if (fop->error == 0)
- {
- if (ec_is_metadata_fop (fop->id)) {
- lock->version_delta[1]++;
- } else {
- lock->version_delta[0]++;
- }
- lock->size_delta += fop->post_size - fop->pre_size;
- if (fop->have_size) {
- lock->size = fop->post_size;
- lock->have_size = 1;
+ if ((fop->error == 0) && (cbk != NULL) && (cbk->op_ret >= 0)) {
+ if (link->update[0]) {
+ ctx->post_version[0]++;
+ if (ec->node_mask & ~fop->mask) {
+ ctx->post_dirty[0]++;
+ }
+ }
+ if (link->update[1]) {
+ ctx->post_version[1]++;
+ if (ec->node_mask & ~fop->mask) {
+ ctx->post_dirty[1]++;
}
}
}
lock->good_mask &= fop->mask;
+ link = NULL;
if (!list_empty(&lock->waiting))
{
link = list_entry(lock->waiting.next, ec_lock_link_t, wait_list);
list_del_init(&link->wait_list);
- wait_fop = link->fop;
-
- if (lock->kind == EC_LOCK_INODE)
- {
- wait_fop->pre_size = wait_fop->post_size = fop->post_size;
- wait_fop->have_size = fop->have_size;
- }
- wait_fop->mask &= fop->mask;
+ lock->owner = link->fop;
}
UNLOCK(&lock->loc.inode->lock);
- if (wait_fop != NULL)
+ if (link != NULL)
{
- ec_lock(wait_fop);
-
- ec_resume(wait_fop, 0);
+ if (ec_lock_acquire(link)) {
+ ec_lock(link->fop);
+ }
+ ec_resume(link->fop, 0);
}
}
}
diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h
index 9e0aaa0f079..c0db0218699 100644
--- a/xlators/cluster/ec/src/ec-common.h
+++ b/xlators/cluster/ec/src/ec-common.h
@@ -20,6 +20,9 @@ typedef enum {
EC_METADATA_TXN
} ec_txn_t;
+#define EC_FOP_HEAL -1
+#define EC_FOP_FHEAL -2
+
#define EC_CONFIG_VERSION 0
#define EC_CONFIG_ALGORITHM 0
@@ -35,19 +38,20 @@ typedef enum {
#define EC_MINIMUM_MIN -2
#define EC_MINIMUM_ALL -3
-#define EC_LOCK_ENTRY 0
-#define EC_LOCK_INODE 1
+#define EC_UPDATE_DATA 1
+#define EC_UPDATE_META 2
+#define EC_QUERY_INFO 4
+#define EC_INODE_SIZE 8
#define EC_STATE_START 0
#define EC_STATE_END 0
#define EC_STATE_INIT 1
#define EC_STATE_LOCK 2
-#define EC_STATE_GET_SIZE_AND_VERSION 3
-#define EC_STATE_DISPATCH 4
-#define EC_STATE_PREPARE_ANSWER 5
-#define EC_STATE_REPORT 6
-#define EC_STATE_LOCK_REUSE 7
-#define EC_STATE_UNLOCK 8
+#define EC_STATE_DISPATCH 3
+#define EC_STATE_PREPARE_ANSWER 4
+#define EC_STATE_REPORT 5
+#define EC_STATE_LOCK_REUSE 6
+#define EC_STATE_UNLOCK 7
#define EC_STATE_DELAYED_START 100
@@ -81,15 +85,21 @@ void ec_update_bad(ec_fop_data_t * fop, uintptr_t good);
void ec_fop_set_error(ec_fop_data_t * fop, int32_t error);
-void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, int32_t update);
-void ec_lock_prepare_entry(ec_fop_data_t *fop, loc_t *loc, int32_t update);
-void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, int32_t update);
+void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, uint32_t flags);
+void ec_lock_prepare_parent_inode(ec_fop_data_t *fop, loc_t *loc,
+ uint32_t flags);
+void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags);
void ec_lock(ec_fop_data_t * fop);
void ec_lock_reuse(ec_fop_data_t *fop);
void ec_unlock(ec_fop_data_t * fop);
+void ec_unlock_force(ec_fop_data_t *fop);
+
+gf_boolean_t ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode,
+ uint64_t *size);
+gf_boolean_t ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode,
+ uint64_t size);
+void ec_clear_inode_info(ec_fop_data_t *fop, inode_t *inode);
-void ec_get_size_version(ec_fop_data_t * fop);
-void ec_prepare_update(ec_fop_data_t *fop);
void ec_flush_size_version(ec_fop_data_t * fop);
void ec_dispatch_all(ec_fop_data_t * fop);
diff --git a/xlators/cluster/ec/src/ec-data.c b/xlators/cluster/ec/src/ec-data.c
index fb47aea90a8..047ccd5ff31 100644
--- a/xlators/cluster/ec/src/ec-data.c
+++ b/xlators/cluster/ec/src/ec-data.c
@@ -96,6 +96,19 @@ void ec_cbk_data_destroy(ec_cbk_data_t * cbk)
mem_put(cbk);
}
+/* PARENT_DOWN will be notified to children only after these fops are complete
+ * when graph switch happens. We do not want graph switch to be waiting on
+ * heal to complete as healing big file/directory could take a while. Which
+ * will lead to hang on the mount.
+ */
+static inline gf_boolean_t
+ec_needs_graceful_completion (ec_fop_data_t *fop)
+{
+ if ((fop->id != EC_FOP_HEAL) && (fop->id != EC_FOP_FHEAL))
+ return _gf_true;
+ return _gf_false;
+}
+
ec_fop_data_t * ec_fop_data_allocate(call_frame_t * frame, xlator_t * this,
int32_t id, uint32_t flags,
uintptr_t target, int32_t minimum,
@@ -114,6 +127,12 @@ ec_fop_data_t * ec_fop_data_allocate(call_frame_t * frame, xlator_t * this,
return NULL;
}
+ INIT_LIST_HEAD(&fop->cbk_list);
+ INIT_LIST_HEAD(&fop->answer_list);
+ INIT_LIST_HEAD(&fop->pending_list);
+ INIT_LIST_HEAD(&fop->locks[0].wait_list);
+ INIT_LIST_HEAD(&fop->locks[1].wait_list);
+
fop->xl = this;
fop->req_frame = frame;
@@ -148,9 +167,6 @@ ec_fop_data_t * ec_fop_data_allocate(call_frame_t * frame, xlator_t * this,
fop->minimum = minimum;
fop->mask = target;
- INIT_LIST_HEAD(&fop->cbk_list);
- INIT_LIST_HEAD(&fop->answer_list);
-
fop->wind = wind;
fop->handler = handler;
fop->cbks = cbks;
@@ -165,17 +181,20 @@ ec_fop_data_t * ec_fop_data_allocate(call_frame_t * frame, xlator_t * this,
parent = frame->local;
if (parent != NULL)
{
- LOCK(&parent->lock);
-
- parent->jobs++;
- parent->refs++;
-
- UNLOCK(&parent->lock);
+ ec_sleep(parent);
}
fop->parent = parent;
}
+ if (ec_needs_graceful_completion (fop)) {
+ LOCK(&ec->lock);
+
+ list_add_tail(&fop->pending_list, &ec->pending_fops);
+
+ UNLOCK(&ec->lock);
+ }
+
return fop;
}
@@ -190,10 +209,41 @@ void ec_fop_data_acquire(ec_fop_data_t * fop)
UNLOCK(&fop->lock);
}
+static void
+ec_handle_last_pending_fop_completion (ec_fop_data_t *fop, gf_boolean_t *notify)
+{
+ ec_t *ec = fop->xl->private;
+
+ if (!list_empty (&fop->pending_list)) {
+ LOCK(&ec->lock);
+ {
+ list_del_init (&fop->pending_list);
+ *notify = list_empty (&ec->pending_fops);
+ }
+ UNLOCK(&ec->lock);
+ }
+}
+
+void
+ec_fop_cleanup(ec_fop_data_t *fop)
+{
+ ec_cbk_data_t *cbk, *tmp;
+
+ list_for_each_entry_safe(cbk, tmp, &fop->answer_list, answer_list) {
+ list_del_init(&cbk->answer_list);
+
+ ec_cbk_data_destroy(cbk);
+ }
+ INIT_LIST_HEAD(&fop->cbk_list);
+
+ fop->answer = NULL;
+}
+
void ec_fop_data_release(ec_fop_data_t * fop)
{
- ec_cbk_data_t * cbk, * tmp;
+ ec_t *ec = NULL;
int32_t refs;
+ gf_boolean_t notify = _gf_false;
LOCK(&fop->lock);
@@ -238,13 +288,13 @@ void ec_fop_data_release(ec_fop_data_t * fop)
ec_resume_parent(fop, fop->error);
- list_for_each_entry_safe(cbk, tmp, &fop->answer_list, answer_list)
- {
- list_del_init(&cbk->answer_list);
-
- ec_cbk_data_destroy(cbk);
- }
+ ec_fop_cleanup(fop);
+ ec = fop->xl->private;
+ ec_handle_last_pending_fop_completion (fop, &notify);
mem_put(fop);
+ if (notify) {
+ ec_pending_fops_completed(ec);
+ }
}
}
diff --git a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h
index 9e5c92dd5b8..8204cf087de 100644
--- a/xlators/cluster/ec/src/ec-data.h
+++ b/xlators/cluster/ec/src/ec-data.h
@@ -67,10 +67,20 @@ struct _ec_fd
struct _ec_inode
{
uintptr_t bad;
- ec_lock_t *entry_lock;
ec_lock_t *inode_lock;
+ gf_boolean_t have_info;
+ gf_boolean_t have_config;
+ gf_boolean_t have_version;
+ gf_boolean_t have_size;
+ gf_boolean_t have_dirty;
+ ec_config_t config;
+ uint64_t pre_version[2];
+ uint64_t post_version[2];
+ uint64_t pre_size;
+ uint64_t post_size;
+ uint64_t pre_dirty[2];
+ uint64_t post_dirty[2];
struct list_head heal;
-
};
typedef int32_t (* fop_heal_cbk_t)(call_frame_t *, void * cookie, xlator_t *,
@@ -80,7 +90,6 @@ typedef int32_t (* fop_fheal_cbk_t)(call_frame_t *, void * cookie, xlator_t *,
int32_t, int32_t, uintptr_t, uintptr_t,
uintptr_t, dict_t *);
-
union _ec_cbk
{
fop_access_cbk_t access;
@@ -132,21 +141,21 @@ union _ec_cbk
struct _ec_lock
{
- ec_lock_t **plock;
+ ec_inode_t *ctx;
gf_timer_t *timer;
- struct list_head waiting;
+ struct list_head waiting; /* Queue of requests being serviced. */
+ struct list_head frozen; /* Queue of requests that will be serviced in
+ the next unlock/lock cycle. */
uintptr_t mask;
uintptr_t good_mask;
- int32_t kind;
int32_t refs;
- int32_t acquired;
- int32_t have_size;
- uint64_t size;
- uint64_t size_delta;
- uint64_t version[2];
- uint64_t version_delta[2];
- gf_boolean_t is_dirty[2];
+ int32_t refs_frozen;
+ int32_t inserted;
+ gf_boolean_t acquired;
+ gf_boolean_t release;
+ gf_boolean_t query;
ec_fop_data_t *owner;
+ fd_t *fd;
loc_t loc;
union
{
@@ -157,9 +166,12 @@ struct _ec_lock
struct _ec_lock_link
{
- ec_lock_t * lock;
- ec_fop_data_t * fop;
- struct list_head wait_list;
+ ec_lock_t *lock;
+ ec_fop_data_t *fop;
+ struct list_head wait_list;
+ gf_boolean_t update[2];
+ loc_t *base;
+ uint64_t size;
};
struct _ec_fop_data
@@ -172,22 +184,19 @@ struct _ec_fop_data
int32_t winds;
int32_t jobs;
int32_t error;
- ec_fop_data_t * parent;
- xlator_t * xl;
- call_frame_t * req_frame; // frame of the calling xlator
- call_frame_t * frame; // frame used by this fop
- struct list_head cbk_list; // sorted list of groups of answers
- struct list_head answer_list; // list of answers
- ec_cbk_data_t * answer; // accepted answer
+ ec_fop_data_t *parent;
+ xlator_t *xl;
+ call_frame_t *req_frame; /* frame of the calling xlator */
+ call_frame_t *frame; /* frame used by this fop */
+ struct list_head cbk_list; /* sorted list of groups of answers */
+ struct list_head answer_list; /* list of answers */
+ struct list_head pending_list; /* member of ec_t.pending_fops */
+ ec_cbk_data_t *answer; /* accepted answer */
int32_t lock_count;
int32_t locked;
ec_lock_link_t locks[2];
- int32_t locks_update;
- int32_t have_size;
- uint64_t pre_size;
- uint64_t post_size;
+ int32_t first_lock;
gf_lock_t lock;
- ec_config_t config;
uint32_t flags;
uint32_t first;
@@ -196,6 +205,7 @@ struct _ec_fop_data
if fop->minimum number of subvolumes succeed
which are not healing*/
uintptr_t remaining;
+ uintptr_t received; /* Mask of responses */
uintptr_t good;
uintptr_t bad;
@@ -203,7 +213,7 @@ struct _ec_fop_data
ec_handler_f handler;
ec_resume_f resume;
ec_cbk_t cbks;
- void * data;
+ void *data;
ec_heal_t *heal;
uint64_t user_size;
@@ -211,8 +221,8 @@ struct _ec_fop_data
int32_t use_fd;
- dict_t * xdata;
- dict_t * dict;
+ dict_t *xdata;
+ dict_t *dict;
int32_t int32;
uint32_t uint32;
uint64_t size;
@@ -222,14 +232,14 @@ struct _ec_fop_data
entrylk_type entrylk_type;
gf_xattrop_flags_t xattrop_flags;
dev_t dev;
- inode_t * inode;
- fd_t * fd;
+ inode_t *inode;
+ fd_t *fd;
struct iatt iatt;
- char * str[2];
+ char *str[2];
loc_t loc[2];
struct gf_flock flock;
- struct iovec * vector;
- struct iobref * buffers;
+ struct iovec *vector;
+ struct iobref *buffers;
};
struct _ec_cbk_data
@@ -299,4 +309,6 @@ ec_fop_data_t * ec_fop_data_allocate(call_frame_t * frame, xlator_t * this,
void ec_fop_data_acquire(ec_fop_data_t * fop);
void ec_fop_data_release(ec_fop_data_t * fop);
+void ec_fop_cleanup(ec_fop_data_t *fop);
+
#endif /* __EC_DATA_H__ */
diff --git a/xlators/cluster/ec/src/ec-dir-read.c b/xlators/cluster/ec/src/ec-dir-read.c
index ffc3ed5a7cd..354c63d3683 100644
--- a/xlators/cluster/ec/src/ec-dir-read.c
+++ b/xlators/cluster/ec/src/ec-dir-read.c
@@ -128,14 +128,9 @@ int32_t ec_manager_opendir(ec_fop_data_t * fop, int32_t state)
/* Fall through */
case EC_STATE_LOCK:
- ec_lock_prepare_entry(fop, &fop->loc[0], 0);
+ ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO);
ec_lock(fop);
- return EC_STATE_GET_SIZE_AND_VERSION;
-
- case EC_STATE_GET_SIZE_AND_VERSION:
- ec_get_size_version(fop);
-
return EC_STATE_DISPATCH;
case EC_STATE_DISPATCH:
@@ -195,7 +190,6 @@ int32_t ec_manager_opendir(ec_fop_data_t * fop, int32_t state)
case -EC_STATE_INIT:
case -EC_STATE_LOCK:
- case -EC_STATE_GET_SIZE_AND_VERSION:
case -EC_STATE_DISPATCH:
case -EC_STATE_PREPARE_ANSWER:
case -EC_STATE_REPORT:
diff --git a/xlators/cluster/ec/src/ec-dir-write.c b/xlators/cluster/ec/src/ec-dir-write.c
index ffc96bf4351..ce09138fb7a 100644
--- a/xlators/cluster/ec/src/ec-dir-write.c
+++ b/xlators/cluster/ec/src/ec-dir-write.c
@@ -98,11 +98,10 @@ void ec_wind_create(ec_t * ec, ec_fop_data_t * fop, int32_t idx)
int32_t ec_manager_create(ec_fop_data_t * fop, int32_t state)
{
-
-
- ec_t * ec;
- ec_cbk_data_t * cbk;
- ec_fd_t * ctx;
+ ec_config_t config;
+ ec_t *ec;
+ ec_cbk_data_t *cbk;
+ ec_fd_t *ctx;
uint64_t version[2] = {0, 0};
switch (state)
@@ -137,16 +136,15 @@ int32_t ec_manager_create(ec_fop_data_t * fop, int32_t state)
ec = fop->xl->private;
- fop->config.version = EC_CONFIG_VERSION;
- fop->config.algorithm = EC_CONFIG_ALGORITHM;
- fop->config.gf_word_size = EC_GF_BITS;
- fop->config.bricks = ec->nodes;
- fop->config.redundancy = ec->redundancy;
- fop->config.chunk_size = EC_METHOD_CHUNK_SIZE;
+ config.version = EC_CONFIG_VERSION;
+ config.algorithm = EC_CONFIG_ALGORITHM;
+ config.gf_word_size = EC_GF_BITS;
+ config.bricks = ec->nodes;
+ config.redundancy = ec->redundancy;
+ config.chunk_size = EC_METHOD_CHUNK_SIZE;
if (ec_dict_set_config(fop->xdata, EC_XATTR_CONFIG,
- &fop->config) < 0)
- {
+ &config) < 0) {
fop->error = EIO;
return EC_STATE_REPORT;
@@ -172,7 +170,8 @@ int32_t ec_manager_create(ec_fop_data_t * fop, int32_t state)
/* Fall through */
case EC_STATE_LOCK:
- ec_lock_prepare_entry(fop, &fop->loc[0], 1);
+ ec_lock_prepare_parent_inode(fop, &fop->loc[0],
+ EC_UPDATE_DATA | EC_UPDATE_META);
ec_lock(fop);
return EC_STATE_DISPATCH;
@@ -376,17 +375,11 @@ int32_t ec_manager_link(ec_fop_data_t * fop, int32_t state)
{
case EC_STATE_INIT:
case EC_STATE_LOCK:
- // Parent entry of fop->loc[0] should be locked, but I don't
- // receive enough information to do it (fop->loc[0].parent is
- // NULL).
- ec_lock_prepare_entry(fop, &fop->loc[1], 1);
+ ec_lock_prepare_parent_inode(fop, &fop->loc[1], EC_UPDATE_DATA |
+ EC_UPDATE_META |
+ EC_INODE_SIZE);
ec_lock(fop);
- return EC_STATE_GET_SIZE_AND_VERSION;
-
- case EC_STATE_GET_SIZE_AND_VERSION:
- ec_get_size_version(fop);
-
return EC_STATE_DISPATCH;
case EC_STATE_DISPATCH:
@@ -410,7 +403,7 @@ int32_t ec_manager_link(ec_fop_data_t * fop, int32_t state)
ec_iatt_rebuild(fop->xl->private, cbk->iatt, 3,
cbk->count);
if (cbk->iatt[0].ia_type == IA_IFREG) {
- cbk->iatt[0].ia_size = fop->pre_size;
+ cbk->iatt[0].ia_size = fop->locks[0].size;
}
if (ec_loc_update(fop->xl, &fop->loc[0], cbk->inode,
@@ -446,7 +439,6 @@ int32_t ec_manager_link(ec_fop_data_t * fop, int32_t state)
case -EC_STATE_INIT:
case -EC_STATE_LOCK:
- case -EC_STATE_GET_SIZE_AND_VERSION:
case -EC_STATE_DISPATCH:
case -EC_STATE_PREPARE_ANSWER:
case -EC_STATE_REPORT:
@@ -589,7 +581,8 @@ int32_t ec_manager_mkdir(ec_fop_data_t * fop, int32_t state)
/* Fall through */
case EC_STATE_LOCK:
- ec_lock_prepare_entry(fop, &fop->loc[0], 1);
+ ec_lock_prepare_parent_inode(fop, &fop->loc[0],
+ EC_UPDATE_DATA | EC_UPDATE_META);
ec_lock(fop);
return EC_STATE_DISPATCH;
@@ -764,6 +757,7 @@ void ec_wind_mknod(ec_t * ec, ec_fop_data_t * fop, int32_t idx)
int32_t ec_manager_mknod(ec_fop_data_t * fop, int32_t state)
{
+ ec_config_t config;
ec_t *ec;
ec_cbk_data_t * cbk;
uint64_t version[2] = {0, 0};
@@ -783,15 +777,15 @@ int32_t ec_manager_mknod(ec_fop_data_t * fop, int32_t state)
ec = fop->xl->private;
- fop->config.version = EC_CONFIG_VERSION;
- fop->config.algorithm = EC_CONFIG_ALGORITHM;
- fop->config.gf_word_size = EC_GF_BITS;
- fop->config.bricks = ec->nodes;
- fop->config.redundancy = ec->redundancy;
- fop->config.chunk_size = EC_METHOD_CHUNK_SIZE;
+ config.version = EC_CONFIG_VERSION;
+ config.algorithm = EC_CONFIG_ALGORITHM;
+ config.gf_word_size = EC_GF_BITS;
+ config.bricks = ec->nodes;
+ config.redundancy = ec->redundancy;
+ config.chunk_size = EC_METHOD_CHUNK_SIZE;
if (ec_dict_set_config(fop->xdata, EC_XATTR_CONFIG,
- &fop->config) < 0) {
+ &config) < 0) {
fop->error = EIO;
return EC_STATE_REPORT;
@@ -814,7 +808,8 @@ int32_t ec_manager_mknod(ec_fop_data_t * fop, int32_t state)
/* Fall through */
case EC_STATE_LOCK:
- ec_lock_prepare_entry(fop, &fop->loc[0], 1);
+ ec_lock_prepare_parent_inode(fop, &fop->loc[0],
+ EC_UPDATE_DATA | EC_UPDATE_META);
ec_lock(fop);
return EC_STATE_DISPATCH;
@@ -997,15 +992,13 @@ int32_t ec_manager_rename(ec_fop_data_t * fop, int32_t state)
{
case EC_STATE_INIT:
case EC_STATE_LOCK:
- ec_lock_prepare_entry(fop, &fop->loc[0], 1);
- ec_lock_prepare_entry(fop, &fop->loc[1], 1);
+ ec_lock_prepare_parent_inode(fop, &fop->loc[0], EC_UPDATE_DATA |
+ EC_UPDATE_META |
+ EC_INODE_SIZE);
+ ec_lock_prepare_parent_inode(fop, &fop->loc[1],
+ EC_UPDATE_DATA | EC_UPDATE_META);
ec_lock(fop);
- return EC_STATE_GET_SIZE_AND_VERSION;
-
- case EC_STATE_GET_SIZE_AND_VERSION:
- ec_get_size_version(fop);
-
return EC_STATE_DISPATCH;
case EC_STATE_DISPATCH:
@@ -1034,9 +1027,8 @@ int32_t ec_manager_rename(ec_fop_data_t * fop, int32_t state)
ec_iatt_rebuild(fop->xl->private, cbk->iatt, 5,
cbk->count);
- if (cbk->iatt[0].ia_type == IA_IFREG)
- {
- cbk->iatt[0].ia_size = fop->pre_size;
+ if (cbk->iatt[0].ia_type == IA_IFREG) {
+ cbk->iatt[0].ia_size = fop->locks[0].size;
}
}
}
@@ -1064,7 +1056,6 @@ int32_t ec_manager_rename(ec_fop_data_t * fop, int32_t state)
case -EC_STATE_INIT:
case -EC_STATE_LOCK:
- case -EC_STATE_GET_SIZE_AND_VERSION:
case -EC_STATE_DISPATCH:
case -EC_STATE_PREPARE_ANSWER:
case -EC_STATE_REPORT:
@@ -1191,7 +1182,8 @@ int32_t ec_manager_rmdir(ec_fop_data_t * fop, int32_t state)
{
case EC_STATE_INIT:
case EC_STATE_LOCK:
- ec_lock_prepare_entry(fop, &fop->loc[0], 1);
+ ec_lock_prepare_parent_inode(fop, &fop->loc[0],
+ EC_UPDATE_DATA | EC_UPDATE_META);
ec_lock(fop);
return EC_STATE_DISPATCH;
@@ -1361,7 +1353,8 @@ int32_t ec_manager_symlink(ec_fop_data_t * fop, int32_t state)
{
case EC_STATE_INIT:
case EC_STATE_LOCK:
- ec_lock_prepare_entry(fop, &fop->loc[0], 1);
+ ec_lock_prepare_parent_inode(fop, &fop->loc[0],
+ EC_UPDATE_DATA | EC_UPDATE_META);
ec_lock(fop);
return EC_STATE_DISPATCH;
@@ -1552,14 +1545,10 @@ int32_t ec_manager_unlink(ec_fop_data_t * fop, int32_t state)
{
case EC_STATE_INIT:
case EC_STATE_LOCK:
- ec_lock_prepare_entry(fop, &fop->loc[0], 1);
+ ec_lock_prepare_parent_inode(fop, &fop->loc[0],
+ EC_UPDATE_DATA | EC_UPDATE_META);
ec_lock(fop);
- return EC_STATE_GET_SIZE_AND_VERSION;
-
- case EC_STATE_GET_SIZE_AND_VERSION:
- ec_get_size_version(fop);
-
return EC_STATE_DISPATCH;
case EC_STATE_DISPATCH:
@@ -1607,7 +1596,6 @@ int32_t ec_manager_unlink(ec_fop_data_t * fop, int32_t state)
case -EC_STATE_INIT:
case -EC_STATE_LOCK:
- case -EC_STATE_GET_SIZE_AND_VERSION:
case -EC_STATE_DISPATCH:
case -EC_STATE_PREPARE_ANSWER:
case -EC_STATE_REPORT:
diff --git a/xlators/cluster/ec/src/ec-fops.h b/xlators/cluster/ec/src/ec-fops.h
index d6b9770f720..7661077cca3 100644
--- a/xlators/cluster/ec/src/ec-fops.h
+++ b/xlators/cluster/ec/src/ec-fops.h
@@ -16,9 +16,6 @@
#include "ec-data.h"
#include "ec-common.h"
-#define EC_FOP_HEAL -1
-#define EC_FOP_FHEAL -2
-
void ec_access(call_frame_t * frame, xlator_t * this, uintptr_t target,
int32_t minimum, fop_access_cbk_t func, void *data, loc_t * loc,
int32_t mask, dict_t * xdata);
diff --git a/xlators/cluster/ec/src/ec-generic.c b/xlators/cluster/ec/src/ec-generic.c
index d957bf6533d..f50c7a70560 100644
--- a/xlators/cluster/ec/src/ec-generic.c
+++ b/xlators/cluster/ec/src/ec-generic.c
@@ -320,15 +320,10 @@ int32_t ec_manager_fsync(ec_fop_data_t * fop, int32_t state)
{
case EC_STATE_INIT:
case EC_STATE_LOCK:
- ec_lock_prepare_fd(fop, fop->fd, 0);
+ ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO);
ec_lock(fop);
- return EC_STATE_GET_SIZE_AND_VERSION;
-
- case EC_STATE_GET_SIZE_AND_VERSION:
- ec_get_size_version(fop);
-
- return EC_STATE_DISPATCH;
+ return EC_STATE_DISPATCH;
case EC_STATE_DISPATCH:
ec_flush_size_version(fop);
@@ -361,8 +356,10 @@ int32_t ec_manager_fsync(ec_fop_data_t * fop, int32_t state)
ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2,
cbk->count);
- cbk->iatt[0].ia_size = fop->pre_size;
- cbk->iatt[1].ia_size = fop->post_size;
+ /* This shouldn't fail because we have the inode locked. */
+ GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode,
+ &cbk->iatt[0].ia_size));
+ cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
}
}
else
@@ -388,7 +385,6 @@ int32_t ec_manager_fsync(ec_fop_data_t * fop, int32_t state)
case -EC_STATE_INIT:
case -EC_STATE_LOCK:
- case -EC_STATE_GET_SIZE_AND_VERSION:
case -EC_STATE_DISPATCH:
case -EC_STATE_PREPARE_ANSWER:
case -EC_STATE_REPORT:
@@ -705,7 +701,6 @@ void ec_lookup_rebuild(ec_t * ec, ec_fop_data_t * fop, ec_cbk_data_t * cbk)
{
ec_cbk_data_t * ans = NULL;
ec_inode_t * ctx = NULL;
- ec_lock_t * lock = NULL;
data_t * data = NULL;
uint8_t * buff = NULL;
uint64_t size = 0;
@@ -729,14 +724,14 @@ void ec_lookup_rebuild(ec_t * ec, ec_fop_data_t * fop, ec_cbk_data_t * cbk)
LOCK(&cbk->inode->lock);
ctx = __ec_inode_get(cbk->inode, fop->xl);
- if ((ctx != NULL) && (ctx->inode_lock != NULL))
+ if (ctx != NULL)
{
- lock = ctx->inode_lock;
- cbk->version[0] = lock->version[0];
- cbk->version[1] = lock->version[1];
- if (lock->have_size)
- {
- size = lock->size;
+ if (ctx->have_version) {
+ cbk->version[0] = ctx->post_version[0];
+ cbk->version[1] = ctx->post_version[1];
+ }
+ if (ctx->have_size) {
+ size = ctx->post_size;
have_size = 1;
}
}
@@ -964,9 +959,20 @@ int32_t ec_manager_lookup(ec_fop_data_t * fop, int32_t state)
return EC_STATE_PREPARE_ANSWER;
case EC_STATE_PREPARE_ANSWER:
+ /*
+ * Lookup happens without any lock, so there is a chance that it
+ * will have answers before modification happened and after
+ * modification happened in the same response. So choose the next
+ * best answer when the answers don't match for EC_MINIMUM_MIN
+ */
+
+ if (!fop->answer && !list_empty(&fop->cbk_list)) {
+ fop->answer = list_entry (fop->cbk_list.next, ec_cbk_data_t,
+ list);
+ }
+
cbk = fop->answer;
- if (cbk != NULL)
- {
+ if (cbk != NULL) {
if (!ec_dict_combine(cbk, EC_COMBINE_XDATA))
{
if (cbk->op_ret >= 0)
@@ -986,9 +992,7 @@ int32_t ec_manager_lookup(ec_fop_data_t * fop, int32_t state)
ec_lookup_rebuild(fop->xl->private, fop, cbk);
}
- }
- else
- {
+ } else {
ec_fop_set_error(fop, EIO);
}
@@ -1295,8 +1299,8 @@ out:
/* FOP: xattrop */
-int32_t ec_combine_xattrop(ec_fop_data_t * fop, ec_cbk_data_t * dst,
- ec_cbk_data_t * src)
+int32_t ec_combine_xattrop(ec_fop_data_t *fop, ec_cbk_data_t *dst,
+ ec_cbk_data_t *src)
{
if (!ec_dict_compare(dst->dict, src->dict))
{
@@ -1316,9 +1320,9 @@ ec_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
{
ec_fop_data_t *fop = NULL;
ec_cbk_data_t *cbk = NULL;
+ data_t *data;
+ uint64_t *version;
int32_t idx = (int32_t)(uintptr_t)cookie;
- uint64_t version = 0;
- uint64_t *version_xattr = 0;
VALIDATE_OR_GOTO (this, out);
GF_VALIDATE_OR_GOTO (this->name, frame, out);
@@ -1338,12 +1342,19 @@ ec_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (op_ret >= 0) {
cbk->dict = dict_ref (xattr);
- if (dict_get_bin (xattr, EC_XATTR_VERSION,
- (void **)&version_xattr) == 0) {
- version = ntoh64(version_xattr[0]);
- if ((version >> EC_SELFHEAL_BIT) & 1)
- fop->healing |= (1ULL<<idx);
+ data = dict_get(cbk->dict, EC_XATTR_VERSION);
+ if ((data != NULL) && (data->len >= sizeof(uint64_t))) {
+ version = (uint64_t *)data->data;
+
+ if (((ntoh64(version[0]) >> EC_SELFHEAL_BIT) & 1) != 0) {
+ LOCK(&fop->lock);
+
+ fop->healing |= 1ULL << idx;
+
+ UNLOCK(&fop->lock);
+ }
}
+
ec_dict_del_array (xattr, EC_XATTR_DIRTY, cbk->dirty,
EC_VERSION_SIZE);
}
@@ -1377,13 +1388,10 @@ int32_t ec_manager_xattrop(ec_fop_data_t * fop, int32_t state)
{
case EC_STATE_INIT:
case EC_STATE_LOCK:
- if (fop->fd == NULL)
- {
- ec_lock_prepare_inode(fop, &fop->loc[0], 1);
- }
- else
- {
- ec_lock_prepare_fd(fop, fop->fd, 1);
+ if (fop->fd == NULL) {
+ ec_lock_prepare_inode(fop, &fop->loc[0], EC_UPDATE_META);
+ } else {
+ ec_lock_prepare_fd(fop, fop->fd, EC_UPDATE_META);
}
ec_lock(fop);
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
index ceddfeb6ac7..80725e5a9fa 100644
--- a/xlators/cluster/ec/src/ec-heal.c
+++ b/xlators/cluster/ec/src/ec-heal.c
@@ -119,9 +119,8 @@ void ec_heal_lookup_resume(ec_fop_data_t * fop)
heal->version[0] = cbk->version[0];
heal->version[1] = cbk->version[1];
heal->raw_size = cbk->size;
- heal->fop->pre_size = cbk->iatt[0].ia_size;
- heal->fop->post_size = cbk->iatt[0].ia_size;
- heal->fop->have_size = 1;
+
+ GF_ASSERT(ec_set_inode_size(fop, cbk->inode, cbk->size));
if (ec_loc_update(heal->xl, &heal->loc, cbk->inode,
&cbk->iatt[0]) != 0)
@@ -532,12 +531,7 @@ ec_heal_init (ec_fop_data_t * fop)
gf_log("ec", GF_LOG_INFO, "Healing '%s', gfid %s", heal->loc.path,
uuid_utoa(heal->loc.gfid));
} else {
- LOCK(&fop->lock);
-
- fop->jobs++;
- fop->refs++;
-
- UNLOCK(&fop->lock);
+ ec_sleep(fop);
}
list_add_tail(&heal->list, &ctx->heal);
@@ -552,25 +546,8 @@ out:
return error;
}
-void ec_heal_entrylk(ec_heal_t * heal, entrylk_cmd cmd)
-{
- loc_t loc;
-
- if (ec_loc_parent(heal->xl, &heal->loc, &loc) != 0) {
- gf_log("ec", GF_LOG_NOTICE, "ec_loc_parent() failed");
- ec_fop_set_error(heal->fop, EIO);
-
- return;
- }
-
- ec_entrylk(heal->fop->frame, heal->xl, -1, EC_MINIMUM_ALL, NULL, NULL,
- heal->xl->name, &loc, NULL, cmd, ENTRYLK_WRLCK, NULL);
-
- loc_wipe(&loc);
-}
-
-void ec_heal_inodelk(ec_heal_t * heal, int32_t type, int32_t use_fd,
- off_t offset, size_t size)
+void ec_heal_lock(ec_heal_t *heal, int32_t type, fd_t *fd, loc_t *loc,
+ off_t offset, size_t size)
{
struct gf_flock flock;
@@ -581,20 +558,47 @@ void ec_heal_inodelk(ec_heal_t * heal, int32_t type, int32_t use_fd,
flock.l_pid = 0;
flock.l_owner.len = 0;
- if (use_fd)
+ /* Remove inode size information before unlocking it. */
+ if ((type == F_UNLCK) && (heal->loc.inode != NULL)) {
+ ec_clear_inode_info(heal->fop, heal->loc.inode);
+ }
+
+ if (fd != NULL)
{
ec_finodelk(heal->fop->frame, heal->xl, heal->fop->mask,
- EC_MINIMUM_ALL, NULL, NULL, heal->xl->name, heal->fd,
+ EC_MINIMUM_ALL, NULL, NULL, heal->xl->name, fd,
F_SETLKW, &flock, NULL);
}
else
{
ec_inodelk(heal->fop->frame, heal->xl, heal->fop->mask, EC_MINIMUM_ALL,
- NULL, NULL, heal->xl->name, &heal->loc, F_SETLKW, &flock,
+ NULL, NULL, heal->xl->name, loc, F_SETLKW, &flock,
NULL);
}
}
+void ec_heal_entrylk(ec_heal_t *heal, int32_t type)
+{
+ loc_t loc;
+
+ if (ec_loc_parent(heal->xl, &heal->loc, &loc) != 0) {
+ ec_fop_set_error(heal->fop, EIO);
+
+ return;
+ }
+
+ ec_heal_lock(heal, type, NULL, &loc, 0, 0);
+
+ loc_wipe(&loc);
+}
+
+void ec_heal_inodelk(ec_heal_t *heal, int32_t type, int32_t use_fd,
+ off_t offset, size_t size)
+{
+ ec_heal_lock(heal, type, use_fd ? heal->fd : NULL, &heal->loc, offset,
+ size);
+}
+
void ec_heal_lookup(ec_heal_t *heal, uintptr_t mask)
{
dict_t * xdata;
@@ -1302,13 +1306,10 @@ ec_manager_heal (ec_fop_data_t * fop, int32_t state)
case EC_STATE_DISPATCH:
if (heal->done != 0) {
- gf_log("ec", GF_LOG_NOTICE, "heal already done");
return EC_STATE_HEAL_DISPATCH;
}
- gf_log("ec", GF_LOG_NOTICE, "heal before entrylk");
- ec_heal_entrylk(heal, ENTRYLK_LOCK);
- gf_log("ec", GF_LOG_NOTICE, "heal after entrylk");
+ ec_heal_entrylk(heal, F_WRLCK);
return EC_STATE_HEAL_ENTRY_LOOKUP;
@@ -1336,7 +1337,7 @@ ec_manager_heal (ec_fop_data_t * fop, int32_t state)
/* Only heal data/metadata if enough information is supplied. */
if (gf_uuid_is_null(heal->loc.gfid))
{
- ec_heal_entrylk(heal, ENTRYLK_UNLOCK);
+ ec_heal_entrylk(heal, F_UNLCK);
return EC_STATE_HEAL_DISPATCH;
}
@@ -1392,7 +1393,7 @@ ec_manager_heal (ec_fop_data_t * fop, int32_t state)
case -EC_STATE_HEAL_UNLOCK_ENTRY:
case EC_STATE_HEAL_UNLOCK_ENTRY:
if (heal->nameheal)
- ec_heal_entrylk(heal, ENTRYLK_UNLOCK);
+ ec_heal_entrylk(heal, F_UNLCK);
heal->bad = ec_heal_needs_data_rebuild(heal);
if (heal->bad != 0)
@@ -2562,9 +2563,9 @@ ec_heal_name (call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
EC_REPLIES_ALLOC (replies, ec->nodes);
output = alloca0 (ec->nodes);
locked_on = alloca0 (ec->nodes);
- ret = cluster_entrylk (ec->xl_list, participants, ec->nodes, replies,
+ ret = cluster_inodelk (ec->xl_list, participants, ec->nodes, replies,
locked_on, frame, ec->xl, ec->xl->name, parent,
- NULL);
+ 0, 0);
{
if (ret <= ec->fragments) {
gf_log (ec->xl->name, GF_LOG_DEBUG, "%s/%s: Skipping "
@@ -2578,8 +2579,8 @@ ec_heal_name (call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
ret = __ec_heal_name (frame, ec, parent, name, participants);
}
unlock:
- cluster_unentrylk (ec->xl_list, locked_on, ec->nodes, replies, output,
- frame, ec->xl, ec->xl->name, parent, NULL);
+ cluster_uninodelk (ec->xl_list, locked_on, ec->nodes, replies, output,
+ frame, ec->xl, ec->xl->name, parent, 0, 0);
out:
cluster_replies_wipe (replies, ec->nodes);
loc_wipe (&loc);
@@ -2663,9 +2664,9 @@ __ec_heal_entry (call_frame_t *frame, ec_t *ec, inode_t *inode,
dirty = alloca0 (ec->nodes * sizeof (*dirty));
EC_REPLIES_ALLOC (replies, ec->nodes);
- ret = cluster_entrylk (ec->xl_list, heal_on, ec->nodes, replies,
+ ret = cluster_inodelk (ec->xl_list, heal_on, ec->nodes, replies,
locked_on, frame, ec->xl, ec->xl->name, inode,
- NULL);
+ 0, 0);
{
if (ret <= ec->fragments) {
gf_log (ec->xl->name, GF_LOG_DEBUG, "%s: Skipping heal "
@@ -2680,8 +2681,8 @@ __ec_heal_entry (call_frame_t *frame, ec_t *ec, inode_t *inode,
source = ret;
}
unlock:
- cluster_unentrylk (ec->xl_list, locked_on, ec->nodes, replies, output,
- frame, ec->xl, ec->xl->name, inode, NULL);
+ cluster_uninodelk (ec->xl_list, locked_on, ec->nodes, replies, output,
+ frame, ec->xl, ec->xl->name, inode, 0, 0);
if (ret < 0)
goto out;
@@ -2728,9 +2729,9 @@ ec_heal_entry (call_frame_t *frame, ec_t *ec, inode_t *inode,
sprintf (selfheal_domain, "%s:self-heal", ec->xl->name);
ec_mask_to_char_array (ec->xl_up, up_subvols, ec->nodes);
/*If other processes are already doing the heal, don't block*/
- ret = cluster_entrylk (ec->xl_list, up_subvols, ec->nodes, replies,
+ ret = cluster_inodelk (ec->xl_list, up_subvols, ec->nodes, replies,
locked_on, frame, ec->xl, selfheal_domain, inode,
- NULL);
+ 0, 0);
{
if (ret <= ec->fragments) {
gf_log (ec->xl->name, GF_LOG_DEBUG, "%s: Skipping heal "
@@ -2743,8 +2744,8 @@ ec_heal_entry (call_frame_t *frame, ec_t *ec, inode_t *inode,
sources, healed_sinks);
}
unlock:
- cluster_unentrylk (ec->xl_list, locked_on, ec->nodes, replies, output,
- frame, ec->xl, selfheal_domain, inode, NULL);
+ cluster_uninodelk (ec->xl_list, locked_on, ec->nodes, replies, output,
+ frame, ec->xl, selfheal_domain, inode, 0, 0);
cluster_replies_wipe (replies, ec->nodes);
return ret;
}
@@ -3086,8 +3087,8 @@ ec_heal_block (call_frame_t *frame, xlator_t *this, uintptr_t target,
if (fop == NULL)
goto out;
- fop->pre_size = fop->post_size = heal->total_size;
- fop->have_size = 1;
+ GF_ASSERT(ec_set_inode_size(fop, heal->fd->inode, heal->total_size));
+
error = 0;
out:
diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c
index 8ce3087d5a6..48251c84bac 100644
--- a/xlators/cluster/ec/src/ec-helpers.c
+++ b/xlators/cluster/ec/src/ec-helpers.c
@@ -738,3 +738,41 @@ ec_filter_internal_xattrs (dict_t *xattr)
dict_foreach_match (xattr, ec_is_internal_xattr, NULL,
dict_remove_foreach_fn, NULL);
}
+
+gf_boolean_t
+ec_is_data_fop (glusterfs_fop_t fop)
+{
+ switch (fop) {
+ case GF_FOP_WRITE:
+ case GF_FOP_TRUNCATE:
+ case GF_FOP_FTRUNCATE:
+ case GF_FOP_FALLOCATE:
+ case GF_FOP_DISCARD:
+ case GF_FOP_ZEROFILL:
+ return _gf_true;
+ default:
+ return _gf_false;
+ }
+ return _gf_false;
+}
+/*
+gf_boolean_t
+ec_is_metadata_fop (int32_t lock_kind, glusterfs_fop_t fop)
+{
+ if (lock_kind == EC_LOCK_ENTRY) {
+ return _gf_false;
+ }
+
+ switch (fop) {
+ case GF_FOP_SETATTR:
+ case GF_FOP_FSETATTR:
+ case GF_FOP_SETXATTR:
+ case GF_FOP_FSETXATTR:
+ case GF_FOP_REMOVEXATTR:
+ case GF_FOP_FREMOVEXATTR:
+ return _gf_true;
+ default:
+ return _gf_false;
+ }
+ return _gf_false;
+}*/
diff --git a/xlators/cluster/ec/src/ec-helpers.h b/xlators/cluster/ec/src/ec-helpers.h
index df4495138fe..14243df54f3 100644
--- a/xlators/cluster/ec/src/ec-helpers.h
+++ b/xlators/cluster/ec/src/ec-helpers.h
@@ -59,4 +59,11 @@ ec_is_internal_xattr (dict_t *dict, char *key, data_t *value, void *data);
void
ec_filter_internal_xattrs (dict_t *xattr);
+
+gf_boolean_t
+ec_is_data_fop (glusterfs_fop_t fop);
+/*
+gf_boolean_t
+ec_is_metadata_fop (glusterfs_fop_t fop);
+*/
#endif /* __EC_HELPERS_H__ */
diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c
index 7372c0a0599..853d914148b 100644
--- a/xlators/cluster/ec/src/ec-inode-read.c
+++ b/xlators/cluster/ec/src/ec-inode-read.c
@@ -267,9 +267,9 @@ int32_t ec_manager_getxattr(ec_fop_data_t * fop, int32_t state)
(strncmp(fop->str[0], GF_XATTR_CLRLK_CMD,
strlen(GF_XATTR_CLRLK_CMD)) != 0)) {
if (fop->fd == NULL) {
- ec_lock_prepare_inode(fop, &fop->loc[0], 0);
+ ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO);
} else {
- ec_lock_prepare_fd(fop, fop->fd, 0);
+ ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO);
}
ec_lock(fop);
}
@@ -1094,12 +1094,12 @@ int32_t ec_readv_rebuild(ec_t * ec, ec_fop_data_t * fop, ec_cbk_data_t * cbk)
size_t fsize = 0, size = 0, max = 0;
int32_t i = 0;
- if (cbk->op_ret < 0)
- {
+ if (cbk->op_ret < 0) {
goto out;
}
- cbk->iatt[0].ia_size = fop->pre_size;
+ /* This shouldn't fail because we have the inode locked. */
+ GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode, &cbk->iatt[0].ia_size));
if (cbk->op_ret > 0)
{
@@ -1331,15 +1331,10 @@ int32_t ec_manager_readv(ec_fop_data_t * fop, int32_t state)
/* Fall through */
case EC_STATE_LOCK:
- ec_lock_prepare_fd(fop, fop->fd, 0);
+ ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO);
ec_lock(fop);
- return EC_STATE_GET_SIZE_AND_VERSION;
-
- case EC_STATE_GET_SIZE_AND_VERSION:
- ec_get_size_version(fop);
-
- return EC_STATE_DISPATCH;
+ return EC_STATE_DISPATCH;
case EC_STATE_DISPATCH:
ec_dispatch_min(fop);
@@ -1396,7 +1391,6 @@ int32_t ec_manager_readv(ec_fop_data_t * fop, int32_t state)
case -EC_STATE_INIT:
case -EC_STATE_LOCK:
- case -EC_STATE_GET_SIZE_AND_VERSION:
case -EC_STATE_DISPATCH:
case -EC_STATE_PREPARE_ANSWER:
case -EC_STATE_REPORT:
@@ -1580,22 +1574,14 @@ int32_t ec_manager_stat(ec_fop_data_t * fop, int32_t state)
{
case EC_STATE_INIT:
case EC_STATE_LOCK:
- if (fop->fd == NULL)
- {
- ec_lock_prepare_inode(fop, &fop->loc[0], 0);
- }
- else
- {
- ec_lock_prepare_fd(fop, fop->fd, 0);
+ if (fop->fd == NULL) {
+ ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO);
+ } else {
+ ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO);
}
ec_lock(fop);
- return EC_STATE_GET_SIZE_AND_VERSION;
-
- case EC_STATE_GET_SIZE_AND_VERSION:
- ec_get_size_version(fop);
-
- return EC_STATE_DISPATCH;
+ return EC_STATE_DISPATCH;
case EC_STATE_DISPATCH:
ec_dispatch_all(fop);
@@ -1614,16 +1600,16 @@ int32_t ec_manager_stat(ec_fop_data_t * fop, int32_t state)
cbk->op_errno = EIO;
}
}
- if (cbk->op_ret < 0)
- {
+ if (cbk->op_ret < 0) {
ec_fop_set_error(fop, cbk->op_errno);
- }
- else
- {
+ } else if (cbk->iatt[0].ia_type == IA_IFREG) {
ec_iatt_rebuild(fop->xl->private, cbk->iatt, 1,
cbk->count);
- cbk->iatt[0].ia_size = fop->pre_size;
+ /* This shouldn't fail because we have the inode locked. */
+ GF_ASSERT(ec_get_inode_size(fop,
+ fop->locks[0].lock->loc.inode,
+ &cbk->iatt[0].ia_size));
}
}
else
@@ -1659,7 +1645,6 @@ int32_t ec_manager_stat(ec_fop_data_t * fop, int32_t state)
case -EC_STATE_INIT:
case -EC_STATE_LOCK:
- case -EC_STATE_GET_SIZE_AND_VERSION:
case -EC_STATE_DISPATCH:
case -EC_STATE_PREPARE_ANSWER:
case -EC_STATE_REPORT:
diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c
index 6b485a26fbc..368b3ae5edf 100644
--- a/xlators/cluster/ec/src/ec-inode-write.c
+++ b/xlators/cluster/ec/src/ec-inode-write.c
@@ -123,11 +123,13 @@ ec_manager_xattr (ec_fop_data_t *fop, int32_t state)
switch (state) {
case EC_STATE_INIT:
case EC_STATE_LOCK:
- if (fop->fd == NULL)
- ec_lock_prepare_inode(fop, &fop->loc[0], 1);
- else
- ec_lock_prepare_fd(fop, fop->fd, 1);
-
+ if (fop->fd == NULL) {
+ ec_lock_prepare_inode(fop, &fop->loc[0],
+ EC_UPDATE_META | EC_QUERY_INFO);
+ } else {
+ ec_lock_prepare_fd(fop, fop->fd,
+ EC_UPDATE_META | EC_QUERY_INFO);
+ }
ec_lock(fop);
return EC_STATE_DISPATCH;
@@ -378,21 +380,15 @@ int32_t ec_manager_setattr(ec_fop_data_t * fop, int32_t state)
{
case EC_STATE_INIT:
case EC_STATE_LOCK:
- if (fop->fd == NULL)
- {
- ec_lock_prepare_inode(fop, &fop->loc[0], 1);
- }
- else
- {
- ec_lock_prepare_fd(fop, fop->fd, 1);
+ if (fop->fd == NULL) {
+ ec_lock_prepare_inode(fop, &fop->loc[0],
+ EC_UPDATE_META | EC_QUERY_INFO);
+ } else {
+ ec_lock_prepare_fd(fop, fop->fd,
+ EC_UPDATE_META | EC_QUERY_INFO);
}
ec_lock(fop);
- return EC_STATE_GET_SIZE_AND_VERSION;
-
- case EC_STATE_GET_SIZE_AND_VERSION:
- ec_get_size_version(fop);
-
return EC_STATE_DISPATCH;
case EC_STATE_DISPATCH:
@@ -412,17 +408,17 @@ int32_t ec_manager_setattr(ec_fop_data_t * fop, int32_t state)
cbk->op_errno = EIO;
}
}
- if (cbk->op_ret < 0)
- {
+ if (cbk->op_ret < 0) {
ec_fop_set_error(fop, cbk->op_errno);
- }
- else
- {
+ } else if (cbk->iatt[0].ia_type == IA_IFREG) {
ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2,
cbk->count);
- cbk->iatt[0].ia_size = fop->pre_size;
- cbk->iatt[1].ia_size = fop->pre_size;
+ /* This shouldn't fail because we have the inode locked. */
+ GF_ASSERT(ec_get_inode_size(fop,
+ fop->locks[0].lock->loc.inode,
+ &cbk->iatt[0].ia_size));
+ cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
}
}
else
@@ -462,7 +458,6 @@ int32_t ec_manager_setattr(ec_fop_data_t * fop, int32_t state)
case -EC_STATE_INIT:
case -EC_STATE_LOCK:
- case -EC_STATE_GET_SIZE_AND_VERSION:
case -EC_STATE_DISPATCH:
case -EC_STATE_PREPARE_ANSWER:
case -EC_STATE_REPORT:
@@ -992,21 +987,17 @@ int32_t ec_manager_truncate(ec_fop_data_t * fop, int32_t state)
/* Fall through */
case EC_STATE_LOCK:
- if (fop->id == GF_FOP_TRUNCATE)
- {
- ec_lock_prepare_inode(fop, &fop->loc[0], 1);
- }
- else
- {
- ec_lock_prepare_fd(fop, fop->fd, 1);
+ if (fop->id == GF_FOP_TRUNCATE) {
+ ec_lock_prepare_inode(fop, &fop->loc[0],
+ EC_UPDATE_DATA | EC_UPDATE_META |
+ EC_QUERY_INFO);
+ } else {
+ ec_lock_prepare_fd(fop, fop->fd,
+ EC_UPDATE_DATA | EC_UPDATE_META |
+ EC_QUERY_INFO);
}
ec_lock(fop);
- return EC_STATE_GET_SIZE_AND_VERSION;
-
- case EC_STATE_GET_SIZE_AND_VERSION:
- ec_prepare_update(fop);
-
return EC_STATE_DISPATCH;
case EC_STATE_DISPATCH:
@@ -1035,14 +1026,18 @@ int32_t ec_manager_truncate(ec_fop_data_t * fop, int32_t state)
ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2,
cbk->count);
- cbk->iatt[0].ia_size = fop->pre_size;
+ /* This shouldn't fail because we have the inode locked. */
+ GF_ASSERT(ec_get_inode_size(fop,
+ fop->locks[0].lock->loc.inode,
+ &cbk->iatt[0].ia_size));
cbk->iatt[1].ia_size = fop->user_size;
- fop->post_size = fop->user_size;
- if ((fop->pre_size > fop->post_size) &&
- (fop->user_size != fop->offset))
- {
- if (!ec_truncate_clean(fop))
- {
+ /* This shouldn't fail because we have the inode locked. */
+ GF_ASSERT(ec_set_inode_size(fop,
+ fop->locks[0].lock->loc.inode,
+ fop->user_size));
+ if ((cbk->iatt[0].ia_size > cbk->iatt[1].ia_size) &&
+ (fop->user_size != fop->offset)) {
+ if (!ec_truncate_clean(fop)) {
ec_fop_set_error(fop, EIO);
}
}
@@ -1085,7 +1080,6 @@ int32_t ec_manager_truncate(ec_fop_data_t * fop, int32_t state)
case -EC_STATE_INIT:
case -EC_STATE_LOCK:
- case -EC_STATE_GET_SIZE_AND_VERSION:
case -EC_STATE_DISPATCH:
case -EC_STATE_PREPARE_ANSWER:
case -EC_STATE_REPORT:
@@ -1355,9 +1349,13 @@ void ec_writev_start(ec_fop_data_t *fop)
ec_fd_t *ctx;
fd_t *fd;
size_t tail;
+ uint64_t current;
uid_t uid;
gid_t gid;
+ /* This shouldn't fail because we have the inode locked. */
+ GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode, &current));
+
fd = fd_anonymous(fop->fd->inode);
if (fd == NULL) {
ec_fop_set_error(fop, EIO);
@@ -1373,7 +1371,7 @@ void ec_writev_start(ec_fop_data_t *fop)
ctx = ec_fd_get(fop->fd, fop->xl);
if (ctx != NULL) {
if ((ctx->flags & O_APPEND) != 0) {
- fop->offset = fop->pre_size;
+ fop->offset = current;
}
}
@@ -1404,22 +1402,17 @@ void ec_writev_start(ec_fop_data_t *fop)
iobref_unref(fop->buffers);
fop->buffers = iobref;
- if (fop->head > 0)
- {
+ if (fop->head > 0) {
ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN, ec_writev_merge_head,
NULL, fd, ec->stripe_size, fop->offset, 0, NULL);
}
tail = fop->size - fop->user_size - fop->head;
- if ((tail > 0) && ((fop->head == 0) || (fop->size > ec->stripe_size)))
- {
- if (fop->pre_size > fop->offset + fop->head + fop->user_size)
- {
+ if ((tail > 0) && ((fop->head == 0) || (fop->size > ec->stripe_size))) {
+ if (current > fop->offset + fop->head + fop->user_size) {
ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN,
ec_writev_merge_tail, NULL, fd, ec->stripe_size,
fop->offset + fop->size - ec->stripe_size, 0, NULL);
- }
- else
- {
+ } else {
memset(fop->vector[0].iov_base + fop->size - tail, 0, tail);
}
}
@@ -1530,14 +1523,11 @@ int32_t ec_manager_writev(ec_fop_data_t * fop, int32_t state)
{
case EC_STATE_INIT:
case EC_STATE_LOCK:
- ec_lock_prepare_fd(fop, fop->fd, 1);
+ ec_lock_prepare_fd(fop, fop->fd,
+ EC_UPDATE_DATA | EC_UPDATE_META |
+ EC_QUERY_INFO);
ec_lock(fop);
- return EC_STATE_GET_SIZE_AND_VERSION;
-
- case EC_STATE_GET_SIZE_AND_VERSION:
- ec_prepare_update(fop);
-
return EC_STATE_DISPATCH;
case EC_STATE_DISPATCH:
@@ -1574,27 +1564,34 @@ int32_t ec_manager_writev(ec_fop_data_t * fop, int32_t state)
ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2,
cbk->count);
+ /* This shouldn't fail because we have the inode locked. */
+ GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode,
+ &cbk->iatt[0].ia_size));
+ cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
size = fop->offset + fop->head + fop->user_size;
- if (size > fop->pre_size)
- {
- fop->post_size = size;
- }
-
- cbk->iatt[0].ia_size = fop->pre_size;
- cbk->iatt[1].ia_size = fop->post_size;
-
- cbk->op_ret *= ec->fragments;
- if (cbk->op_ret < fop->head)
- {
- cbk->op_ret = 0;
- }
- else
- {
- cbk->op_ret -= fop->head;
+ if (size > cbk->iatt[0].ia_size) {
+ /* Only update inode size if this is a top level fop.
+ * Otherwise this is an internal write and the top
+ * level fop should take care of the real inode size.
+ */
+ if (fop->parent == NULL) {
+ /* This shouldn't fail because we have the inode
+ * locked. */
+ GF_ASSERT(ec_set_inode_size(fop, fop->fd->inode,
+ size));
+ }
+ cbk->iatt[1].ia_size = size;
}
- if (cbk->op_ret > fop->user_size)
- {
- cbk->op_ret = fop->user_size;
+ if (fop->error == 0) {
+ cbk->op_ret *= ec->fragments;
+ if (cbk->op_ret < fop->head) {
+ cbk->op_ret = 0;
+ } else {
+ cbk->op_ret -= fop->head;
+ }
+ if (cbk->op_ret > fop->user_size) {
+ cbk->op_ret = fop->user_size;
+ }
}
}
}
@@ -1621,8 +1618,8 @@ int32_t ec_manager_writev(ec_fop_data_t * fop, int32_t state)
case -EC_STATE_INIT:
case -EC_STATE_LOCK:
- case -EC_STATE_GET_SIZE_AND_VERSION:
case -EC_STATE_DISPATCH:
+ case -EC_STATE_DELAYED_START:
case -EC_STATE_PREPARE_ANSWER:
case -EC_STATE_REPORT:
GF_ASSERT(fop->error != 0);
diff --git a/xlators/cluster/ec/src/ec-locks.c b/xlators/cluster/ec/src/ec-locks.c
index 10572037932..22b6fa4d6e5 100644
--- a/xlators/cluster/ec/src/ec-locks.c
+++ b/xlators/cluster/ec/src/ec-locks.c
@@ -37,13 +37,22 @@ int32_t ec_lock_check(ec_fop_data_t *fop, uintptr_t *mask)
locked |= ans->mask;
cbk = ans;
} else {
- notlocked |= ans->mask;
+ if (ans->op_errno == EAGAIN) {
+ switch (fop->uint32) {
+ case EC_LOCK_MODE_NONE:
+ case EC_LOCK_MODE_ALL:
+ /* Goal is to treat non-blocking lock as failure
+ * even if there is a signle EAGAIN*/
+ notlocked |= ans->mask;
+ break;
+ }
+ }
}
}
if (error == -1) {
if (ec_bits_count(locked | notlocked) >= ec->fragments) {
- if (ec_bits_count (locked) >= ec->fragments) {
+ if (notlocked == 0) {
if (fop->answer == NULL) {
fop->answer = cbk;
}
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
index 3dd04299541..4028aa4d2bb 100644
--- a/xlators/cluster/ec/src/ec.c
+++ b/xlators/cluster/ec/src/ec.c
@@ -278,6 +278,7 @@ ec_notify_cbk (void *data)
{
ec_t *ec = data;
glusterfs_event_t event = GF_EVENT_MAXVAL;
+ gf_boolean_t propagate = _gf_false;
LOCK(&ec->lock);
{
@@ -309,10 +310,14 @@ ec_notify_cbk (void *data)
/* CHILD_DOWN should not come here as no grace period is given
* for notifying CHILD_DOWN. */
- default_notify (ec->xl, event, NULL);
+ propagate = _gf_true;
}
unlock:
UNLOCK(&ec->lock);
+
+ if (propagate) {
+ default_notify (ec->xl, event, NULL);
+ }
}
void
@@ -360,6 +365,49 @@ ec_handle_down (xlator_t *this, ec_t *ec, int32_t idx)
}
}
+gf_boolean_t
+ec_force_unlocks(ec_t *ec)
+{
+ struct list_head list;
+ ec_fop_data_t *fop;
+
+ if (list_empty(&ec->pending_fops)) {
+ return _gf_true;
+ }
+
+ INIT_LIST_HEAD(&list);
+
+ /* All pending fops when GF_EVENT_PARENT_DOWN is received should only
+ * be fops waiting for a delayed unlock. However the unlock can
+ * generate new fops. We don't want to trverse these new fops while
+ * forcing unlocks, so we move all fops to a temporal list. To process
+ * them without interferences.*/
+ list_splice_init(&ec->pending_fops, &list);
+
+ while (!list_empty(&list)) {
+ fop = list_entry(list.next, ec_fop_data_t, pending_list);
+ list_move_tail(&fop->pending_list, &ec->pending_fops);
+
+ UNLOCK(&ec->lock);
+
+ ec_unlock_force(fop);
+
+ LOCK(&ec->lock);
+ }
+
+ ec->shutdown = _gf_true;
+
+ return list_empty(&ec->pending_fops);
+}
+
+void
+ec_pending_fops_completed(ec_t *ec)
+{
+ if (ec->shutdown) {
+ default_notify(ec->xl, GF_EVENT_PARENT_DOWN, NULL);
+ }
+}
+
int32_t
ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
{
@@ -367,14 +415,16 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
int32_t idx = 0;
int32_t error = 0;
glusterfs_event_t old_event = GF_EVENT_MAXVAL;
- glusterfs_event_t new_event = GF_EVENT_MAXVAL;
dict_t *input = NULL;
dict_t *output = NULL;
+ gf_boolean_t propagate = _gf_true;
+
+ gf_log (this->name, GF_LOG_TRACE, "NOTIFY(%d): %p, %p",
+ event, data, data2);
if (event == GF_EVENT_TRANSLATOR_OP) {
if (!ec->up) {
error = -1;
- goto out;
} else {
input = data;
output = data2;
@@ -400,13 +450,14 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
*/
ec_launch_notify_timer (this, ec);
goto unlock;
+ } else if (event == GF_EVENT_PARENT_DOWN) {
+ /* If there aren't pending fops running after we have waken up
+ * them, we immediately propagate the notification. */
+ propagate = ec_force_unlocks(ec);
+ goto unlock;
}
- gf_log (this->name, GF_LOG_TRACE, "NOTIFY(%d): %p, %d",
- event, data, idx);
-
if (idx < ec->nodes) { /* CHILD_* events */
-
old_event = ec_get_event_from_state (ec);
if (event == GF_EVENT_CHILD_UP) {
@@ -415,28 +466,30 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
ec_handle_down (this, ec, idx);
}
- new_event = ec_get_event_from_state (ec);
+ event = ec_get_event_from_state (ec);
- if (new_event == GF_EVENT_CHILD_UP && !ec->up) {
+ if (event == GF_EVENT_CHILD_UP && !ec->up) {
ec_up (this, ec);
- } else if (new_event == GF_EVENT_CHILD_DOWN && ec->up) {
+ } else if (event == GF_EVENT_CHILD_DOWN && ec->up) {
ec_down (this, ec);
}
- if ((new_event == old_event) && (new_event != GF_EVENT_MAXVAL))
- new_event = GF_EVENT_CHILD_MODIFIED;
-
- event = GF_EVENT_MAXVAL;/* Take care of notifying inside lock */
- if (new_event != GF_EVENT_MAXVAL)
- error = default_notify (this, new_event, data);
+ if (event != GF_EVENT_MAXVAL) {
+ if (event == old_event) {
+ event = GF_EVENT_CHILD_MODIFIED;
+ }
+ } else {
+ propagate = _gf_false;
+ }
}
- unlock:
- UNLOCK (&ec->lock);
+unlock:
+ UNLOCK (&ec->lock);
- if (event != GF_EVENT_MAXVAL)
- return default_notify (this, event, data);
+ if (propagate) {
+ error = default_notify (this, event, data);
+ }
out:
- return error;
+ return error;
}
int32_t
@@ -478,6 +531,8 @@ init (xlator_t *this)
ec->xl = this;
LOCK_INIT(&ec->lock);
+ INIT_LIST_HEAD(&ec->pending_fops);
+
ec->fop_pool = mem_pool_new(ec_fop_data_t, 1024);
ec->cbk_pool = mem_pool_new(ec_cbk_data_t, 4096);
ec->lock_pool = mem_pool_new(ec_lock_t, 1024);
diff --git a/xlators/cluster/ec/src/ec.h b/xlators/cluster/ec/src/ec.h
index b8f3e288197..fdedb89ec18 100644
--- a/xlators/cluster/ec/src/ec.h
+++ b/xlators/cluster/ec/src/ec.h
@@ -44,6 +44,8 @@ struct _ec
xlator_t ** xl_list;
gf_lock_t lock;
gf_timer_t * timer;
+ gf_boolean_t shutdown;
+ struct list_head pending_fops;
struct mem_pool * fop_pool;
struct mem_pool * cbk_pool;
struct mem_pool * lock_pool;
@@ -51,4 +53,7 @@ struct _ec
char vol_uuid[UUID_SIZE + 1];
dict_t *leaf_to_subvolid;
};
+
+void ec_pending_fops_completed(ec_t *ec);
+
#endif /* __EC_H__ */