summaryrefslogtreecommitdiffstats
path: root/xlators/cluster/ec/src/ec-dir-write.c
diff options
context:
space:
mode:
authorPranith Kumar K <pkarampu@redhat.com>2015-05-13 16:57:49 +0530
committerPranith Kumar Karampuri <pkarampu@redhat.com>2015-05-28 04:12:06 -0700
commit3a57ca8ee29ea8e3d3c5bbf28a56a821bfa99d99 (patch)
tree7e919238192422d4bb1f8a86a950013b286b41b3 /xlators/cluster/ec/src/ec-dir-write.c
parent2b8fdde926532014f19d850b1321a4c7046dc001 (diff)
cluster/ec: Fix all EIO errors in EC
Backport of http://review.gluster.org/10770 Backport of http://review.gluster.org/10806 Backport of http://review.gluster.org/10787 Backport of http://review.gluster.org/10868 Backport of http://review.gluster.com/10852 - When a blocking lock is requested, lock request is succeeded even when ec->fragment number of locks are acquired successfully in non-blocking locking phase. This will lead to fop succeeding only on the bricks where the locks are acquired, leading to the necessity of self-heals. To prevent these un-necessary self-heals, if the remaining locks fail with EAGAIN in non-blocking lock phase try blocking locking phase instead. - Handle lookup failures while op in progress - cluster/ec: Correctly cleanup delayed locks When a delayed lock is pending, a graph switch doesn't correctly terminate it. This means that the update of version and size xattrs is lost, causing EIO errors. This patch handles GF_EVENT_PARENT_DOWN event to correctly finish pending udpdates before completing the graph switch. - Fix use after free crash ec_heal creates ec_fop_data but doesn't run ec_manager. ec_fop_data_allocate adds this fop to ec->pending_fops, because ec_manager is not run on this heal fop it is never removed from ec->pending_fops. When it is accessed after free it leads to crash. It is better to not to add HEAL fops to ec->pending_fops because we don't want graph switch to hang the mount because of a BIG file/directory heal. - Forced unlock when lock contention is detected EC uses an eager lock mechanism to optimize multiple read/write requests on the same entry or inode. This increases performance but can have adverse results when other clients try to access the same entry/inode. To solve this, this patch adds a functionality to detect when this happens and force an earlier release to not block other clients. The method consists on requesting GF_GLUSTERFS_INODELK_COUNT and GF_GLUSTERFS_ENTRYLK_COUNT for all fops that take a lock. When this count is greater than one, the lock is marked to be released. All fops already waiting for this lock will be executed normally before releasing the lock, but new requests that also require it will be blocked and restarted after the lock has been released and reacquired again. Another problem was that some operations did correctly lock the parent of an entry when needed, but got the size and version xattrs from the entry instead of the parent. This patch solves this problem by binding all queries of size and version to each lock and replacing all entrylk calls by inodelk ones to remove concurrent updates on directory metadata. This also allows rename to correctly update source and destination directories. BUG: 1225279 Change-Id: I02a6084b138dd38e018a462347cd9ce38610c7ef Reviewed-on: http://review.gluster.org/10926 Tested-by: NetBSD Build System Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
Diffstat (limited to 'xlators/cluster/ec/src/ec-dir-write.c')
-rw-r--r--xlators/cluster/ec/src/ec-dir-write.c96
1 files changed, 42 insertions, 54 deletions
diff --git a/xlators/cluster/ec/src/ec-dir-write.c b/xlators/cluster/ec/src/ec-dir-write.c
index ffc96bf4351..ce09138fb7a 100644
--- a/xlators/cluster/ec/src/ec-dir-write.c
+++ b/xlators/cluster/ec/src/ec-dir-write.c
@@ -98,11 +98,10 @@ void ec_wind_create(ec_t * ec, ec_fop_data_t * fop, int32_t idx)
int32_t ec_manager_create(ec_fop_data_t * fop, int32_t state)
{
-
-
- ec_t * ec;
- ec_cbk_data_t * cbk;
- ec_fd_t * ctx;
+ ec_config_t config;
+ ec_t *ec;
+ ec_cbk_data_t *cbk;
+ ec_fd_t *ctx;
uint64_t version[2] = {0, 0};
switch (state)
@@ -137,16 +136,15 @@ int32_t ec_manager_create(ec_fop_data_t * fop, int32_t state)
ec = fop->xl->private;
- fop->config.version = EC_CONFIG_VERSION;
- fop->config.algorithm = EC_CONFIG_ALGORITHM;
- fop->config.gf_word_size = EC_GF_BITS;
- fop->config.bricks = ec->nodes;
- fop->config.redundancy = ec->redundancy;
- fop->config.chunk_size = EC_METHOD_CHUNK_SIZE;
+ config.version = EC_CONFIG_VERSION;
+ config.algorithm = EC_CONFIG_ALGORITHM;
+ config.gf_word_size = EC_GF_BITS;
+ config.bricks = ec->nodes;
+ config.redundancy = ec->redundancy;
+ config.chunk_size = EC_METHOD_CHUNK_SIZE;
if (ec_dict_set_config(fop->xdata, EC_XATTR_CONFIG,
- &fop->config) < 0)
- {
+ &config) < 0) {
fop->error = EIO;
return EC_STATE_REPORT;
@@ -172,7 +170,8 @@ int32_t ec_manager_create(ec_fop_data_t * fop, int32_t state)
/* Fall through */
case EC_STATE_LOCK:
- ec_lock_prepare_entry(fop, &fop->loc[0], 1);
+ ec_lock_prepare_parent_inode(fop, &fop->loc[0],
+ EC_UPDATE_DATA | EC_UPDATE_META);
ec_lock(fop);
return EC_STATE_DISPATCH;
@@ -376,17 +375,11 @@ int32_t ec_manager_link(ec_fop_data_t * fop, int32_t state)
{
case EC_STATE_INIT:
case EC_STATE_LOCK:
- // Parent entry of fop->loc[0] should be locked, but I don't
- // receive enough information to do it (fop->loc[0].parent is
- // NULL).
- ec_lock_prepare_entry(fop, &fop->loc[1], 1);
+ ec_lock_prepare_parent_inode(fop, &fop->loc[1], EC_UPDATE_DATA |
+ EC_UPDATE_META |
+ EC_INODE_SIZE);
ec_lock(fop);
- return EC_STATE_GET_SIZE_AND_VERSION;
-
- case EC_STATE_GET_SIZE_AND_VERSION:
- ec_get_size_version(fop);
-
return EC_STATE_DISPATCH;
case EC_STATE_DISPATCH:
@@ -410,7 +403,7 @@ int32_t ec_manager_link(ec_fop_data_t * fop, int32_t state)
ec_iatt_rebuild(fop->xl->private, cbk->iatt, 3,
cbk->count);
if (cbk->iatt[0].ia_type == IA_IFREG) {
- cbk->iatt[0].ia_size = fop->pre_size;
+ cbk->iatt[0].ia_size = fop->locks[0].size;
}
if (ec_loc_update(fop->xl, &fop->loc[0], cbk->inode,
@@ -446,7 +439,6 @@ int32_t ec_manager_link(ec_fop_data_t * fop, int32_t state)
case -EC_STATE_INIT:
case -EC_STATE_LOCK:
- case -EC_STATE_GET_SIZE_AND_VERSION:
case -EC_STATE_DISPATCH:
case -EC_STATE_PREPARE_ANSWER:
case -EC_STATE_REPORT:
@@ -589,7 +581,8 @@ int32_t ec_manager_mkdir(ec_fop_data_t * fop, int32_t state)
/* Fall through */
case EC_STATE_LOCK:
- ec_lock_prepare_entry(fop, &fop->loc[0], 1);
+ ec_lock_prepare_parent_inode(fop, &fop->loc[0],
+ EC_UPDATE_DATA | EC_UPDATE_META);
ec_lock(fop);
return EC_STATE_DISPATCH;
@@ -764,6 +757,7 @@ void ec_wind_mknod(ec_t * ec, ec_fop_data_t * fop, int32_t idx)
int32_t ec_manager_mknod(ec_fop_data_t * fop, int32_t state)
{
+ ec_config_t config;
ec_t *ec;
ec_cbk_data_t * cbk;
uint64_t version[2] = {0, 0};
@@ -783,15 +777,15 @@ int32_t ec_manager_mknod(ec_fop_data_t * fop, int32_t state)
ec = fop->xl->private;
- fop->config.version = EC_CONFIG_VERSION;
- fop->config.algorithm = EC_CONFIG_ALGORITHM;
- fop->config.gf_word_size = EC_GF_BITS;
- fop->config.bricks = ec->nodes;
- fop->config.redundancy = ec->redundancy;
- fop->config.chunk_size = EC_METHOD_CHUNK_SIZE;
+ config.version = EC_CONFIG_VERSION;
+ config.algorithm = EC_CONFIG_ALGORITHM;
+ config.gf_word_size = EC_GF_BITS;
+ config.bricks = ec->nodes;
+ config.redundancy = ec->redundancy;
+ config.chunk_size = EC_METHOD_CHUNK_SIZE;
if (ec_dict_set_config(fop->xdata, EC_XATTR_CONFIG,
- &fop->config) < 0) {
+ &config) < 0) {
fop->error = EIO;
return EC_STATE_REPORT;
@@ -814,7 +808,8 @@ int32_t ec_manager_mknod(ec_fop_data_t * fop, int32_t state)
/* Fall through */
case EC_STATE_LOCK:
- ec_lock_prepare_entry(fop, &fop->loc[0], 1);
+ ec_lock_prepare_parent_inode(fop, &fop->loc[0],
+ EC_UPDATE_DATA | EC_UPDATE_META);
ec_lock(fop);
return EC_STATE_DISPATCH;
@@ -997,15 +992,13 @@ int32_t ec_manager_rename(ec_fop_data_t * fop, int32_t state)
{
case EC_STATE_INIT:
case EC_STATE_LOCK:
- ec_lock_prepare_entry(fop, &fop->loc[0], 1);
- ec_lock_prepare_entry(fop, &fop->loc[1], 1);
+ ec_lock_prepare_parent_inode(fop, &fop->loc[0], EC_UPDATE_DATA |
+ EC_UPDATE_META |
+ EC_INODE_SIZE);
+ ec_lock_prepare_parent_inode(fop, &fop->loc[1],
+ EC_UPDATE_DATA | EC_UPDATE_META);
ec_lock(fop);
- return EC_STATE_GET_SIZE_AND_VERSION;
-
- case EC_STATE_GET_SIZE_AND_VERSION:
- ec_get_size_version(fop);
-
return EC_STATE_DISPATCH;
case EC_STATE_DISPATCH:
@@ -1034,9 +1027,8 @@ int32_t ec_manager_rename(ec_fop_data_t * fop, int32_t state)
ec_iatt_rebuild(fop->xl->private, cbk->iatt, 5,
cbk->count);
- if (cbk->iatt[0].ia_type == IA_IFREG)
- {
- cbk->iatt[0].ia_size = fop->pre_size;
+ if (cbk->iatt[0].ia_type == IA_IFREG) {
+ cbk->iatt[0].ia_size = fop->locks[0].size;
}
}
}
@@ -1064,7 +1056,6 @@ int32_t ec_manager_rename(ec_fop_data_t * fop, int32_t state)
case -EC_STATE_INIT:
case -EC_STATE_LOCK:
- case -EC_STATE_GET_SIZE_AND_VERSION:
case -EC_STATE_DISPATCH:
case -EC_STATE_PREPARE_ANSWER:
case -EC_STATE_REPORT:
@@ -1191,7 +1182,8 @@ int32_t ec_manager_rmdir(ec_fop_data_t * fop, int32_t state)
{
case EC_STATE_INIT:
case EC_STATE_LOCK:
- ec_lock_prepare_entry(fop, &fop->loc[0], 1);
+ ec_lock_prepare_parent_inode(fop, &fop->loc[0],
+ EC_UPDATE_DATA | EC_UPDATE_META);
ec_lock(fop);
return EC_STATE_DISPATCH;
@@ -1361,7 +1353,8 @@ int32_t ec_manager_symlink(ec_fop_data_t * fop, int32_t state)
{
case EC_STATE_INIT:
case EC_STATE_LOCK:
- ec_lock_prepare_entry(fop, &fop->loc[0], 1);
+ ec_lock_prepare_parent_inode(fop, &fop->loc[0],
+ EC_UPDATE_DATA | EC_UPDATE_META);
ec_lock(fop);
return EC_STATE_DISPATCH;
@@ -1552,14 +1545,10 @@ int32_t ec_manager_unlink(ec_fop_data_t * fop, int32_t state)
{
case EC_STATE_INIT:
case EC_STATE_LOCK:
- ec_lock_prepare_entry(fop, &fop->loc[0], 1);
+ ec_lock_prepare_parent_inode(fop, &fop->loc[0],
+ EC_UPDATE_DATA | EC_UPDATE_META);
ec_lock(fop);
- return EC_STATE_GET_SIZE_AND_VERSION;
-
- case EC_STATE_GET_SIZE_AND_VERSION:
- ec_get_size_version(fop);
-
return EC_STATE_DISPATCH;
case EC_STATE_DISPATCH:
@@ -1607,7 +1596,6 @@ int32_t ec_manager_unlink(ec_fop_data_t * fop, int32_t state)
case -EC_STATE_INIT:
case -EC_STATE_LOCK:
- case -EC_STATE_GET_SIZE_AND_VERSION:
case -EC_STATE_DISPATCH:
case -EC_STATE_PREPARE_ANSWER:
case -EC_STATE_REPORT: