summaryrefslogtreecommitdiffstats
path: root/xlators/cluster/ec/src/ec-common.c
diff options
context:
space:
mode:
authorXavier Hernandez <xhernandez@datalab.es>2014-07-14 17:34:04 +0200
committerVijay Bellur <vbellur@redhat.com>2014-09-15 23:12:16 -0700
commitd97863562bb0d2f685df3d2e3aa4bef1299c8307 (patch)
treeb8d9455cdface5425e2452e98751ac75dac358e4 /xlators/cluster/ec/src/ec-common.c
parent2be54585002cd1c9d02928b89a02047b58dd6aed (diff)
ec: Optimize read/write performance
This patch significantly improves performance of read/write operations on a dispersed volume by reusing previous inodelk/ entrylk operations on the same inode/entry. This reduces the latency of each individual operation considerably. Inode version and size are also updated when needed instead of on each request. This gives an additional boost. Change-Id: I4b98d5508c86b53032e16e295f72a3f83fd8fcac BUG: 1122586 Signed-off-by: Xavier Hernandez <xhernandez@datalab.es> Reviewed-on: http://review.gluster.org/8369 Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Jeff Darcy <jdarcy@redhat.com> Reviewed-by: Dan Lambright <dlambrig@redhat.com>
Diffstat (limited to 'xlators/cluster/ec/src/ec-common.c')
-rw-r--r--xlators/cluster/ec/src/ec-common.c545
1 files changed, 397 insertions, 148 deletions
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index a4423d94aa9..ad04e646d68 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -316,20 +316,10 @@ void ec_resume_parent(ec_fop_data_t * fop, int32_t error)
}
}
-void ec_report(ec_fop_data_t * fop, int32_t error)
-{
- if (!list_empty(&fop->lock_list))
- {
- ec_owner_set(fop->frame, fop->frame->root);
- }
-
- ec_resume(fop, error);
-}
-
void ec_complete(ec_fop_data_t * fop)
{
ec_cbk_data_t * cbk = NULL;
- int32_t ready = 0, report = 0;
+ int32_t resume = 0;
LOCK(&fop->lock);
@@ -351,21 +341,17 @@ void ec_complete(ec_fop_data_t * fop)
}
}
- report = 1;
+ resume = 1;
}
else if ((fop->flags & EC_FLAG_WAITING_WINDS) != 0)
{
- ready = 1;
+ resume = 1;
}
}
UNLOCK(&fop->lock);
- if (report)
- {
- ec_report(fop, 0);
- }
- if (ready)
+ if (resume)
{
ec_resume(fop, 0);
}
@@ -518,7 +504,7 @@ void ec_dispatch_start(ec_fop_data_t * fop)
INIT_LIST_HEAD(&fop->cbk_list);
- if (!list_empty(&fop->lock_list))
+ if (fop->lock_count > 0)
{
ec_owner_copy(fop->frame, &fop->req_frame->root->lk_owner);
}
@@ -602,6 +588,7 @@ void ec_dispatch_min(ec_fop_data_t * fop)
ec_lock_t * ec_lock_allocate(xlator_t * xl, int32_t kind, loc_t * loc)
{
+ ec_t * ec = xl->private;
ec_lock_t * lock;
if ((loc->inode == NULL) ||
@@ -613,15 +600,15 @@ ec_lock_t * ec_lock_allocate(xlator_t * xl, int32_t kind, loc_t * loc)
return NULL;
}
- lock = GF_MALLOC(sizeof(*lock), ec_mt_ec_lock_t);
+ lock = mem_get0(ec->lock_pool);
if (lock != NULL)
{
- memset(lock, 0, sizeof(*lock));
-
lock->kind = kind;
+ lock->good_mask = -1ULL;
+ INIT_LIST_HEAD(&lock->waiting);
if (!ec_loc_from_loc(xl, &lock->loc, loc))
{
- GF_FREE(lock);
+ mem_put(lock);
lock = NULL;
}
}
@@ -634,34 +621,55 @@ void ec_lock_destroy(ec_lock_t * lock)
GF_FREE(lock->basename);
loc_wipe(&lock->loc);
- GF_FREE(lock);
+ mem_put(lock);
}
-int32_t ec_locked(call_frame_t * frame, void * cookie, xlator_t * this,
- int32_t op_ret, int32_t op_errno, dict_t * xdata)
+int32_t ec_lock_compare(ec_lock_t * lock1, ec_lock_t * lock2)
{
- ec_fop_data_t * fop = cookie;
- ec_lock_t * lock = NULL;
+ int32_t res;
- if (op_ret >= 0)
+ res = uuid_compare(lock1->loc.gfid, lock2->loc.gfid);
+ if (res != 0)
{
- lock = fop->data;
- lock->mask = fop->good;
- fop->parent->mask &= fop->good;
-
- ec_trace("LOCKED", fop->parent, "lock=%p", lock);
+ return res;
}
- else
+ if (lock1->basename == NULL)
{
- gf_log(this->name, GF_LOG_WARNING, "Failed to complete preop lock");
+ if (lock2->basename == NULL)
+ {
+ return 0;
+ }
+ return 1;
+ }
+ if (lock2->basename == NULL)
+ {
+ return -1;
}
+ return strcmp(lock1->basename, lock2->basename);
+}
- return 0;
+void ec_lock_insert(ec_fop_data_t * fop, ec_lock_t * lock)
+{
+ ec_lock_t * tmp;
+
+ if ((fop->lock_count > 0) &&
+ (ec_lock_compare(fop->locks[0].lock, lock) > 0))
+ {
+ tmp = fop->locks[0].lock;
+ fop->locks[0].lock = lock;
+ lock = tmp;
+ }
+ fop->locks[fop->lock_count].lock = lock;
+ fop->locks[fop->lock_count].fop = fop;
+ fop->lock_count++;
+
+ lock->refs++;
}
-void ec_lock_entry(ec_fop_data_t * fop, loc_t * loc)
+void ec_lock_prepare_entry(ec_fop_data_t * fop, loc_t * loc)
{
ec_lock_t * lock = NULL;
+ ec_inode_t * ctx = NULL;
char * name = NULL;
loc_t tmp;
int32_t error;
@@ -679,116 +687,106 @@ void ec_lock_entry(ec_fop_data_t * fop, loc_t * loc)
return;
}
- LOCK(&fop->lock);
+ LOCK(&tmp.inode->lock);
- list_for_each_entry(lock, &fop->lock_list, list)
+ ctx = __ec_inode_get(tmp.inode, fop->xl);
+ if (ctx == NULL)
{
- if ((lock->kind == EC_LOCK_ENTRY) &&
- (lock->loc.inode == tmp.inode) &&
- (strcmp(lock->basename, name) == 0))
- {
- ec_trace("LOCK_ENTRYLK", fop, "lock=%p, parent=%p, path=%s, "
- "name=%s. Lock already acquired",
- lock, loc->parent, loc->path, name);
-
- lock = NULL;
+ __ec_fop_set_error(fop, EIO);
- goto unlock;
- }
+ goto unlock;
}
- lock = ec_lock_allocate(fop->xl, EC_LOCK_ENTRY, &tmp);
- if (lock != NULL)
+ list_for_each_entry(lock, &ctx->entry_locks, list)
{
- lock->type = ENTRYLK_WRLCK;
- lock->basename = name;
-
- if (list_empty(&fop->lock_list))
+ if (strcmp(lock->basename, name) == 0)
{
- ec_owner_set(fop->frame, fop->frame->root);
+ ec_trace("LOCK_ENTRYLK", fop, "lock=%p, inode=%p, path=%s, "
+ "name=%s. Lock already acquired",
+ lock, tmp.inode, tmp.path, name);
+
+ goto insert;
}
- list_add_tail(&lock->list, &fop->lock_list);
}
- else
+
+ lock = ec_lock_allocate(fop->xl, EC_LOCK_ENTRY, &tmp);
+ if (lock == NULL)
{
__ec_fop_set_error(fop, EIO);
+
+ goto unlock;
}
-unlock:
- UNLOCK(&fop->lock);
+ ec_trace("LOCK_CREATE", fop, "lock=%p", lock);
- loc_wipe(&tmp);
+ lock->type = ENTRYLK_WRLCK;
+ lock->basename = name;
+ name = NULL;
- if (lock != NULL)
- {
- ec_trace("LOCK_ENTRYLK", fop, "lock=%p, parent=%p, path=%s, "
- "basename=%s", lock, lock->loc.inode,
- lock->loc.path, lock->basename);
+ list_add_tail(&lock->list, &ctx->entry_locks);
- ec_entrylk(fop->frame, fop->xl, -1, EC_MINIMUM_ALL, ec_locked, lock,
- fop->xl->name, &lock->loc, lock->basename, ENTRYLK_LOCK,
- lock->type, NULL);
- }
- else
- {
- GF_FREE(name);
- }
+insert:
+ ec_lock_insert(fop, lock);
+
+unlock:
+ UNLOCK(&tmp.inode->lock);
+
+ loc_wipe(&tmp);
+ GF_FREE(name);
}
-void ec_lock_inode(ec_fop_data_t * fop, loc_t * loc)
+void ec_lock_prepare_inode(ec_fop_data_t * fop, loc_t * loc)
{
ec_lock_t * lock;
+ ec_inode_t * ctx;
if ((fop->parent != NULL) || (fop->error != 0) || (loc->inode == NULL))
{
return;
}
- LOCK(&fop->lock);
+ LOCK(&loc->inode->lock);
- list_for_each_entry(lock, &fop->lock_list, list)
+ ctx = __ec_inode_get(loc->inode, fop->xl);
+ if (ctx == NULL)
{
- if ((lock->kind == EC_LOCK_INODE) && (lock->loc.inode == loc->inode))
- {
- UNLOCK(&fop->lock);
-
- ec_trace("LOCK_INODELK", fop, "lock=%p, inode=%p. Lock already "
- "acquired", lock, loc->inode);
+ __ec_fop_set_error(fop, EIO);
- return;
- }
+ goto unlock;
}
- lock = ec_lock_allocate(fop->xl, EC_LOCK_INODE, loc);
- if (lock != NULL)
+ if (!list_empty(&ctx->inode_locks))
{
- lock->flock.l_type = F_WRLCK;
- lock->flock.l_whence = SEEK_SET;
+ lock = list_entry(ctx->inode_locks.next, ec_lock_t, list);
+ ec_trace("LOCK_INODELK", fop, "lock=%p, inode=%p. Lock already "
+ "acquired", lock, loc->inode);
- if (list_empty(&fop->lock_list))
- {
- ec_owner_set(fop->frame, fop->frame->root);
- }
- list_add_tail(&lock->list, &fop->lock_list);
+ goto insert;
}
- else
+
+ lock = ec_lock_allocate(fop->xl, EC_LOCK_INODE, loc);
+ if (lock == NULL)
{
__ec_fop_set_error(fop, EIO);
+
+ goto unlock;
}
- UNLOCK(&fop->lock);
+ ec_trace("LOCK_CREATE", fop, "lock=%p", lock);
- if (lock != NULL)
- {
- ec_trace("LOCK_INODELK", fop, "lock=%p, inode=%p, owner=%p", lock,
- lock->loc.inode, fop->frame->root);
+ lock->flock.l_type = F_WRLCK;
+ lock->flock.l_whence = SEEK_SET;
- ec_inodelk(fop->frame, fop->xl, -1, EC_MINIMUM_ALL, ec_locked, lock,
- fop->xl->name, &lock->loc, F_SETLKW, &lock->flock, NULL);
- }
+ list_add_tail(&lock->list, &ctx->inode_locks);
+
+insert:
+ ec_lock_insert(fop, lock);
+
+unlock:
+ UNLOCK(&loc->inode->lock);
}
-void ec_lock_fd(ec_fop_data_t * fop, fd_t * fd)
+void ec_lock_prepare_fd(ec_fop_data_t * fop, fd_t * fd)
{
loc_t loc;
@@ -799,7 +797,7 @@ void ec_lock_fd(ec_fop_data_t * fop, fd_t * fd)
if (ec_loc_from_fd(fop->xl, &loc, fd))
{
- ec_lock_inode(fop, &loc);
+ ec_lock_prepare_inode(fop, &loc);
loc_wipe(&loc);
}
@@ -809,6 +807,100 @@ void ec_lock_fd(ec_fop_data_t * fop, fd_t * fd)
}
}
+int32_t ec_locked(call_frame_t * frame, void * cookie, xlator_t * this,
+ int32_t op_ret, int32_t op_errno, dict_t * xdata)
+{
+ ec_fop_data_t * fop = cookie;
+ ec_lock_t * lock = NULL;
+
+ if (op_ret >= 0)
+ {
+ lock = fop->data;
+ lock->mask = fop->good;
+ lock->acquired = 1;
+
+ fop->parent->mask &= fop->good;
+ fop->parent->locked++;
+
+ ec_trace("LOCKED", fop->parent, "lock=%p", lock);
+
+ ec_lock(fop->parent);
+ }
+ else
+ {
+ gf_log(this->name, GF_LOG_WARNING, "Failed to complete preop lock");
+ }
+
+ return 0;
+}
+
+void ec_lock(ec_fop_data_t * fop)
+{
+ ec_lock_t * lock;
+
+ while (fop->locked < fop->lock_count)
+ {
+ lock = fop->locks[fop->locked].lock;
+
+ LOCK(&lock->loc.inode->lock);
+
+ if (lock->owner != NULL)
+ {
+ ec_trace("LOCK_WAIT", fop, "lock=%p", lock);
+
+ list_add_tail(&fop->locks[fop->locked].wait_list, &lock->waiting);
+
+ fop->jobs++;
+ fop->refs++;
+
+ UNLOCK(&lock->loc.inode->lock);
+
+ break;
+ }
+ lock->owner = fop;
+
+ UNLOCK(&lock->loc.inode->lock);
+
+ if (!lock->acquired)
+ {
+ ec_owner_set(fop->frame, lock);
+
+ if (lock->kind == EC_LOCK_ENTRY)
+ {
+ ec_trace("LOCK_ACQUIRE", fop, "lock=%p, inode=%p, path=%s, "
+ "name=%s", lock, lock->loc.inode, lock->loc.path,
+ lock->basename);
+
+ ec_entrylk(fop->frame, fop->xl, -1, EC_MINIMUM_ALL, ec_locked,
+ lock, fop->xl->name, &lock->loc, lock->basename,
+ ENTRYLK_LOCK, lock->type, NULL);
+ }
+ else
+ {
+ ec_trace("LOCK_ACQUIRE", fop, "lock=%p, inode=%p", lock,
+ lock->loc.inode);
+
+ ec_inodelk(fop->frame, fop->xl, -1, EC_MINIMUM_ALL, ec_locked,
+ lock, fop->xl->name, &lock->loc, F_SETLKW,
+ &lock->flock, NULL);
+ }
+
+ break;
+ }
+
+ ec_trace("LOCK_REUSE", fop, "lock=%p", lock);
+
+ if (lock->have_size)
+ {
+ fop->pre_size = fop->post_size = lock->size;
+ fop->have_size = 1;
+ }
+ fop->mask &= lock->good_mask;
+
+ fop->locked++;
+ }
+}
+
int32_t ec_unlocked(call_frame_t * frame, void * cookie, xlator_t * this,
int32_t op_ret, int32_t op_errno, dict_t * xdata)
{
@@ -829,50 +921,68 @@ int32_t ec_unlocked(call_frame_t * frame, void * cookie, xlator_t * this,
void ec_unlock(ec_fop_data_t * fop)
{
- ec_lock_t * lock, * item;
-
- ec_trace("UNLOCK", fop, "");
+ ec_lock_t * lock;
+ int32_t i, refs;
- list_for_each_entry_safe(lock, item, &fop->lock_list, list)
+ for (i = 0; i < fop->lock_count; i++)
{
- list_del(&lock->list);
+ lock = fop->locks[i].lock;
- if (lock->mask != 0)
- {
- switch (lock->kind)
- {
- case EC_LOCK_ENTRY:
- ec_trace("UNLOCK_ENTRYLK", fop, "lock=%p, parent=%p, "
- "path=%s, basename=%s",
- lock, lock->loc.inode, lock->loc.path,
- lock->basename);
+ LOCK(&lock->loc.inode->lock);
- ec_entrylk(fop->frame, fop->xl, lock->mask, EC_MINIMUM_ALL,
- ec_unlocked, lock, fop->xl->name, &lock->loc,
- lock->basename, ENTRYLK_UNLOCK, lock->type,
- NULL);
+ ec_trace("UNLOCK", fop, "lock=%p", lock);
- break;
-
- case EC_LOCK_INODE:
- lock->flock.l_type = F_UNLCK;
- ec_trace("UNLOCK_INODELK", fop, "lock=%p, inode=%p", lock,
- lock->loc.inode);
+ refs = --lock->refs;
+ if (refs == 0)
+ {
+ list_del_init(&lock->list);
+ }
- ec_inodelk(fop->frame, fop->xl, lock->mask, EC_MINIMUM_ALL,
- ec_unlocked, lock, fop->xl->name, &lock->loc,
- F_SETLK, &lock->flock, NULL);
+ UNLOCK(&lock->loc.inode->lock);
- break;
+ if (refs == 0)
+ {
+ if (lock->mask != 0)
+ {
+ ec_owner_set(fop->frame, lock);
- default:
- gf_log(fop->xl->name, GF_LOG_ERROR, "Invalid lock type");
+ switch (lock->kind)
+ {
+ case EC_LOCK_ENTRY:
+ ec_trace("UNLOCK_ENTRYLK", fop, "lock=%p, inode=%p, "
+ "path=%s, basename=%s",
+ lock, lock->loc.inode, lock->loc.path,
+ lock->basename);
+
+ ec_entrylk(fop->frame, fop->xl, lock->mask,
+ EC_MINIMUM_ALL, ec_unlocked, lock,
+ fop->xl->name, &lock->loc, lock->basename,
+ ENTRYLK_UNLOCK, lock->type, NULL);
+
+ break;
+
+ case EC_LOCK_INODE:
+ lock->flock.l_type = F_UNLCK;
+ ec_trace("UNLOCK_INODELK", fop, "lock=%p, inode=%p",
+ lock, lock->loc.inode);
+
+ ec_inodelk(fop->frame, fop->xl, lock->mask,
+ EC_MINIMUM_ALL, ec_unlocked, lock,
+ fop->xl->name, &lock->loc, F_SETLK,
+ &lock->flock, NULL);
+
+ break;
+
+ default:
+ gf_log(fop->xl->name, GF_LOG_ERROR, "Invalid lock "
+ "type");
+ }
}
- }
- loc_wipe(&lock->loc);
+ ec_trace("LOCK_DESTROY", fop, "lock=%p", lock);
- GF_FREE(lock);
+ ec_lock_destroy(lock);
+ }
}
}
@@ -883,11 +993,36 @@ int32_t ec_get_size_version_set(call_frame_t * frame, void * cookie,
struct iatt * postparent)
{
ec_fop_data_t * fop = cookie;
+ ec_inode_t * ctx;
+ ec_lock_t * lock;
if (op_ret >= 0)
{
- fop->parent->mask &= fop->good;
+ LOCK(&inode->lock);
+
+ ctx = __ec_inode_get(inode, this);
+ if ((ctx != NULL) && !list_empty(&ctx->inode_locks))
+ {
+ lock = list_entry(ctx->inode_locks.next, ec_lock_t, list);
+
+ lock->have_size = 1;
+ lock->size = buf->ia_size;
+ lock->version = fop->answer->version;
+ }
+
+ UNLOCK(&inode->lock);
+
+ if (lock != NULL)
+ {
+ // Only update parent mask if the lookup has been made with
+ // inode locked.
+ fop->parent->mask &= fop->good;
+ }
+
fop->parent->pre_size = fop->parent->post_size = buf->ia_size;
+
+ fop->parent->have_size = 1;
+
}
else
{
@@ -907,11 +1042,18 @@ void ec_get_size_version(ec_fop_data_t * fop)
gid_t gid;
int32_t error = ENOMEM;
- if (fop->parent != NULL)
+ if (fop->have_size)
+ {
+ return;
+ }
+
+ if ((fop->parent != NULL) && fop->parent->have_size)
{
fop->pre_size = fop->parent->pre_size;
fop->post_size = fop->parent->post_size;
+ fop->have_size = 1;
+
return;
}
@@ -998,10 +1140,10 @@ int32_t ec_update_size_version_done(call_frame_t * frame, void * cookie,
return 0;
}
-void ec_update_size_version(ec_fop_data_t * fop)
+void ec_update_size_version(ec_fop_data_t * fop, uint64_t version,
+ size_t size)
{
dict_t * dict;
- size_t size;
uid_t uid;
gid_t gid;
@@ -1012,20 +1154,20 @@ void ec_update_size_version(ec_fop_data_t * fop)
return;
}
+ ec_trace("UPDATE", fop, "version=%ld, size=%ld", version, size);
+
dict = dict_new();
if (dict == NULL)
{
goto out;
}
- if (ec_dict_set_number(dict, EC_XATTR_VERSION, 1) != 0)
+ if (ec_dict_set_number(dict, EC_XATTR_VERSION, version) != 0)
{
goto out;
}
- size = fop->post_size;
- if (fop->pre_size != size)
+ if (size != 0)
{
- size -= fop->pre_size;
if (ec_dict_set_number(dict, EC_XATTR_SIZE, size) != 0)
{
goto out;
@@ -1069,6 +1211,113 @@ out:
gf_log(fop->xl->name, GF_LOG_ERROR, "Unable to update version and size");
}
+void ec_flush_size_version(ec_fop_data_t * fop)
+{
+ ec_lock_t * lock;
+ uint64_t version;
+ size_t delta;
+
+ GF_ASSERT(fop->lock_count == 1);
+
+ lock = fop->locks[0].lock;
+
+ GF_ASSERT(lock->kind == EC_LOCK_INODE);
+
+ LOCK(&lock->loc.inode->lock);
+
+ GF_ASSERT(lock->owner == fop);
+
+ version = lock->version_delta;
+ delta = lock->size_delta;
+ lock->version_delta = 0;
+ lock->size_delta = 0;
+
+ UNLOCK(&lock->loc.inode->lock);
+
+ if (version > 0)
+ {
+ ec_update_size_version(fop, version, delta);
+ }
+}
+
+void ec_lock_reuse(ec_fop_data_t * fop, int32_t update)
+{
+ ec_fop_data_t * wait_fop;
+ ec_lock_t * lock;
+ ec_lock_link_t * link;
+ size_t delta = 0;
+ uint64_t version = 0;
+ int32_t refs = 0;
+ int32_t i;
+
+ for (i = 0; i < fop->lock_count; i++)
+ {
+ wait_fop = NULL;
+
+ lock = fop->locks[i].lock;
+
+ LOCK(&lock->loc.inode->lock);
+
+ ec_trace("LOCK_DONE", fop, "lock=%p", lock);
+
+ GF_ASSERT(lock->owner == fop);
+ lock->owner = NULL;
+
+ if (lock->kind == EC_LOCK_INODE)
+ {
+ if (update && (fop->error == 0))
+ {
+ lock->version_delta++;
+ lock->size_delta += fop->post_size - fop->pre_size;
+ }
+ version = lock->version_delta;
+ delta = lock->size_delta;
+ refs = lock->refs;
+ if (refs == 1)
+ {
+ lock->version_delta = 0;
+ lock->size_delta = 0;
+ }
+
+ if (fop->have_size)
+ {
+ lock->size = fop->post_size;
+ lock->have_size = 1;
+ }
+ }
+ lock->good_mask &= fop->mask;
+
+ if (!list_empty(&lock->waiting))
+ {
+ link = list_entry(lock->waiting.next, ec_lock_link_t, wait_list);
+ list_del_init(&link->wait_list);
+
+ wait_fop = link->fop;
+
+ if (lock->kind == EC_LOCK_INODE)
+ {
+ wait_fop->pre_size = wait_fop->post_size = fop->post_size;
+ wait_fop->have_size = fop->have_size;
+ }
+ wait_fop->mask &= fop->mask;
+ }
+
+ UNLOCK(&lock->loc.inode->lock);
+
+ if (wait_fop != NULL)
+ {
+ ec_lock(wait_fop);
+
+ ec_resume(wait_fop, 0);
+ }
+ }
+
+ if ((refs == 1) && (version > 0))
+ {
+ ec_update_size_version(fop, version, delta);
+ }
+}
+
void __ec_manager(ec_fop_data_t * fop, int32_t error)
{
do