diff options
| -rw-r--r-- | libglusterfs/src/globals.h | 2 | ||||
| -rw-r--r-- | tests/basic/ec/ec-optimistic-changelog.t | 152 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-common.c | 49 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-generic.c | 14 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-types.h | 4 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec.c | 22 | ||||
| -rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-volume-set.c | 6 | 
7 files changed, 243 insertions, 6 deletions
diff --git a/libglusterfs/src/globals.h b/libglusterfs/src/globals.h index a058a0f4ecd..1c8547265d1 100644 --- a/libglusterfs/src/globals.h +++ b/libglusterfs/src/globals.h @@ -78,6 +78,8 @@  #define GD_OP_VERSION_3_10_0   31000 /* Op-version for GlusterFS 3.10.0 */ +#define GD_OP_VERSION_3_10_1   31001 /* Op-version for GlusterFS 3.10.1 */ +  #define GD_OP_VERSION_4_0_0    40000 /* Op-version for GlusterFS 4.0.0 */  #define GD_OP_VER_PERSISTENT_AFR_XATTRS GD_OP_VERSION_3_6_0 diff --git a/tests/basic/ec/ec-optimistic-changelog.t b/tests/basic/ec/ec-optimistic-changelog.t new file mode 100644 index 00000000000..1277da6ca1b --- /dev/null +++ b/tests/basic/ec/ec-optimistic-changelog.t @@ -0,0 +1,152 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +# This test checks optimistic-change-log option + +cleanup +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 disperse 3 redundancy 1 $H0:$B0/${V0}{0..2} +TEST $CLI volume heal $V0 disable + +TEST $CLI volume set $V0 performance.stat-prefetch off +TEST $CLI volume set $V0 performance.write-behind off +TEST $CLI volume set $V0 performance.quick-read off +TEST $CLI volume set $V0 performance.read-ahead off +TEST $CLI volume set $V0 performance.io-cache off +TEST $CLI volume set $V0 disperse.background-heals 0 +TEST $CLI volume set $V0 disperse.optimistic-change-log off +TEST $CLI volume set $V0 disperse.eager-lock off +TEST $CLI volume start $V0 + +TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0; +EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 +EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "0" mount_get_option_value $M0 $V0-disperse-0 background-heals +EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "0" mount_get_option_value $M0 $V0-disperse-0 heal-wait-qlength + +TEST $CLI volume set $V0 disperse.background-heals 1 +TEST touch $M0/a +EXPECT_WITHIN $HEAL_TIMEOUT "^0$" count_sh_entries $B0/${V0}0 +EXPECT_WITHIN $HEAL_TIMEOUT "^0$" count_sh_entries $B0/${V0}1 +EXPECT_WITHIN 
$HEAL_TIMEOUT "^0$" count_sh_entries $B0/${V0}2 + + + +### optimistic-change-log = off ; All bricks good. Test file operation +echo abc > $M0/a +EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 + +############################################################ + +## optimistic-change-log = off ; Kill one brick . Test file operation +TEST kill_brick $V0 $H0 $B0/${V0}2 +echo abc > $M0/a +EXPECT 2 get_pending_heal_count $V0 #One for each active brick +$CLI volume start $V0 force +EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 +#Accessing file should heal the file now +EXPECT "abc" cat $M0/a +EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 + +############################################################ + +## optimistic-change-log = off ; All bricks good. Test entry operation +TEST touch $M0/b +EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 + +############################################################ + +## optimistic-change-log = off ; All bricks good. Test metadata operation +TEST chmod 0777 $M0/b +EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 + +############################################################ + +## optimistic-change-log = off ; Kill one brick. Test entry operation + +TEST kill_brick $V0 $H0 $B0/${V0}2 +TEST touch $M0/c +EXPECT 4 get_pending_heal_count $V0 #two for each active brick +$CLI volume start $V0 force +EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 +getfattr -d -m. -e hex $M0 2>&1 > /dev/null +getfattr -d -m. -e hex $M0/c 2>&1 > /dev/null +EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 + +############################################################ + +## optimistic-change-log = off ; Kill one brick. Test metadata operation +TEST kill_brick $V0 $H0 $B0/${V0}2 +TEST chmod 0777 $M0/c +EXPECT 2 get_pending_heal_count $V0 #One for each active brick +$CLI volume start $V0 force +EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 +getfattr -d -m. 
-e hex $M0/c 2>&1 > /dev/null +EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 + +############################################################ + +TEST $CLI volume set $V0 disperse.optimistic-change-log on + +### optimistic-change-log = on ; All bricks good. Test file operation + +echo abc > $M0/aa +EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 + +############################################################ + +## optimistic-change-log = on ; Kill one brick. Test file operation + +TEST kill_brick $V0 $H0 $B0/${V0}2 +echo abc > $M0/aa +EXPECT 2 get_pending_heal_count $V0 #One for each active brick +$CLI volume start $V0 force +EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 +#Accessing file should heal the file now +getfattr -d -m. -e hex $M0/aa 2>&1 > /dev/null +EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 + +############################################################ + +## optimistic-change-log = on ; All bricks good. Test entry operation + +TEST touch $M0/bb +EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 + +############################################################ + +## optimistic-change-log = on ; All bricks good. Test metadata operation + +TEST chmod 0777 $M0/bb +EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 + +############################################################ + +## optimistic-change-log = on ; Kill one brick. Test entry operation + +TEST kill_brick $V0 $H0 $B0/${V0}2 +TEST touch $M0/cc +EXPECT 4 get_pending_heal_count $V0 #two for each active brick +$CLI volume start $V0 force +EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 +getfattr -d -m. -e hex $M0 2>&1 > /dev/null +getfattr -d -m. -e hex $M0/cc 2>&1 > /dev/null +EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 + +############################################################ + +## optimistic-change-log = on ; Kill one brick. 
Test metadata operation + +TEST kill_brick $V0 $H0 $B0/${V0}2 +TEST chmod 0777 $M0/cc +EXPECT 2 get_pending_heal_count $V0 #One for each active brick +$CLI volume start $V0 force +EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 +getfattr -d -m. -e hex $M0/cc 2>&1 > /dev/null +EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 + +############################################################ + +cleanup diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c index 823922542a0..3ae7f110d99 100644 --- a/xlators/cluster/ec/src/ec-common.c +++ b/xlators/cluster/ec/src/ec-common.c @@ -932,16 +932,19 @@ ec_config_check (ec_fop_data_t *fop, ec_config_t *config)  }  gf_boolean_t -ec_set_dirty_flag (ec_lock_link_t *link, ec_inode_t *ctx, uint64_t *dirty) +ec_set_dirty_flag (ec_lock_link_t *link, ec_inode_t *ctx, +                   uint64_t *dirty)  {      gf_boolean_t set_dirty = _gf_false;      if (link->update[EC_DATA_TXN] && !ctx->dirty[EC_DATA_TXN]) { +            if (!link->optimistic_changelog)                  dirty[EC_DATA_TXN] = 1;      }      if (link->update[EC_METADATA_TXN] && !ctx->dirty[EC_METADATA_TXN]) { +            if (!link->optimistic_changelog)                  dirty[EC_METADATA_TXN] = 1;      } @@ -962,6 +965,7 @@ ec_prepare_update_cbk (call_frame_t *frame, void *cookie,      ec_lock_link_t *link = fop->data;      ec_lock_t *lock = NULL;      ec_inode_t *ctx; +    gf_boolean_t release = _gf_false;      lock = link->lock;      parent = link->fop; @@ -1055,6 +1059,26 @@ unlock:      UNLOCK(&lock->loc.inode->lock);      if (op_errno == 0) { +        /* If the fop fails on any of the good bricks, it is important to mark +         * it dirty and update versions right away if dirty was not set before. 
+         */ +        if (lock->good_mask & ~(fop->good | fop->remaining)) { +                release = _gf_true; +        } + +        /* lock->release is a critical field that is checked and modified most +         * of the time inside a locked region. This use here is safe because we +         * are in a modifying fop and we currently don't allow two modifying +         * fops to be processed concurrently, so no one else could be checking +         * or modifying it.*/ +        if (link->update[0] && !link->dirty[0]) { +                lock->release |= release; +        } + +        if (link->update[1] && !link->dirty[1]) { +                lock->release |= release; +        } +          /* We don't allow the main fop to be executed on bricks that have not           * succeeded the initial xattrop. */          parent->mask &= fop->good; @@ -1097,6 +1121,7 @@ void ec_get_size_version(ec_lock_link_t *link)      ec_inode_t *ctx;      ec_fop_data_t *fop;      dict_t *dict = NULL; +    ec_t   *ec = NULL;      int32_t error = 0;      gf_boolean_t getting_xattr;      gf_boolean_t set_dirty = _gf_false; @@ -1105,6 +1130,17 @@ void ec_get_size_version(ec_lock_link_t *link)      lock = link->lock;      ctx = lock->ctx;      fop = link->fop; +    ec  = fop->xl->private; + +    if (ec->optimistic_changelog && +        !(ec->node_mask & ~link->lock->good_mask) && !ec_is_data_fop (fop->id)) +            link->optimistic_changelog = _gf_true; + +    /* If ctx->have_info is false and lock->query is true, it means that we'll +     * send the xattrop anyway, so we can use it to update dirty counts, even +     * if it's not necessary to do it right now. 
*/ +    if (!ctx->have_info && lock->query) +            link->optimistic_changelog = _gf_false;      set_dirty = ec_set_dirty_flag (link, ctx, dirty); @@ -1714,6 +1750,13 @@ ec_lock_next_owner(ec_lock_link_t *link, ec_cbk_data_t *cbk,          if (link->update[1]) {              ctx->post_version[1]++;          } +        /* If the fop fails on any of the good bricks, it is important to mark +         * it dirty and update versions right away. */ +        if (link->update[0] || link->update[1]) { +                if (lock->good_mask & ~(fop->good | fop->remaining)) { +                        lock->release = _gf_true; +                } +        }      }      ec_lock_update_good(lock, fop); @@ -2028,9 +2071,13 @@ ec_update_info(ec_lock_link_t *link)                      if (ctx->dirty[1] != 0) {                          dirty[1] = -1;                      } +            } else { +                    link->optimistic_changelog = _gf_false; +                    ec_set_dirty_flag (link, ctx, dirty);              }              memset(ctx->dirty, 0, sizeof(ctx->dirty));      } +      if ((version[0] != 0) || (version[1] != 0) ||          (dirty[0] != 0) || (dirty[1] != 0)) {          ec_update_size_version(link, version, size, dirty); diff --git a/xlators/cluster/ec/src/ec-generic.c b/xlators/cluster/ec/src/ec-generic.c index 3ce3c2ab02a..ddb90ce39cc 100644 --- a/xlators/cluster/ec/src/ec-generic.c +++ b/xlators/cluster/ec/src/ec-generic.c @@ -697,6 +697,7 @@ int32_t ec_lookup_cbk(call_frame_t * frame, void * cookie, xlator_t * this,      ec_fop_data_t * fop = NULL;      ec_cbk_data_t * cbk = NULL;      int32_t idx = (int32_t)(uintptr_t)cookie; +    uint64_t       dirty[2] = {0};      VALIDATE_OR_GOTO(this, out);      GF_VALIDATE_OR_GOTO(this->name, frame, out); @@ -746,8 +747,7 @@ int32_t ec_lookup_cbk(call_frame_t * frame, void * cookie, xlator_t * this,                  goto out;              } -            ec_dict_del_array (xdata, EC_XATTR_DIRTY, cbk->dirty, -     
                          EC_VERSION_SIZE); +            ec_dict_del_array (xdata, EC_XATTR_DIRTY, dirty, EC_VERSION_SIZE);          }          ec_combine(cbk, ec_combine_lookup); @@ -1142,7 +1142,9 @@ ec_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                  dict_t *xdata)  {          ec_fop_data_t *fop = NULL; +        ec_lock_link_t *link = NULL;          ec_cbk_data_t *cbk = NULL; +        uint64_t       dirty[2] = {0};          data_t *data;          uint64_t *version;          int32_t idx = (int32_t)(uintptr_t)cookie; @@ -1178,8 +1180,14 @@ ec_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                      }                  } -                ec_dict_del_array (xattr, EC_XATTR_DIRTY, cbk->dirty, +                ec_dict_del_array (xattr, EC_XATTR_DIRTY, dirty,                                     EC_VERSION_SIZE); +                link = fop->data; +                if (link) { +                        /*Keep a note of if the dirty is already set or not*/ +                        link->dirty[0] |= (dirty[0] != 0); +                        link->dirty[1] |= (dirty[1] != 0); +                }          }          if (xdata) diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h index d701fe7d25e..f184f459c2e 100644 --- a/xlators/cluster/ec/src/ec-types.h +++ b/xlators/cluster/ec/src/ec-types.h @@ -246,6 +246,8 @@ struct _ec_lock_link {      struct list_head  owner_list;      struct list_head  wait_list;      gf_boolean_t      update[2]; +    gf_boolean_t      dirty[2]; +    gf_boolean_t      optimistic_changelog;      loc_t            *base;      uint64_t          size;  }; @@ -331,7 +333,6 @@ struct _ec_cbk_data {      int32_t           op_errno;      int32_t           count;      uintptr_t         mask; -    uint64_t          dirty[2];      dict_t           *xdata;      dict_t           *dict; @@ -561,6 +562,7 @@ struct _ec {      gf_timer_t        *timer;      gf_boolean_t       
shutdown;      gf_boolean_t       eager_lock; +    gf_boolean_t       optimistic_changelog;      uint32_t           background_heals;      uint32_t           heal_wait_qlen;      struct list_head   pending_fops; diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c index e467fea28b8..01f1473f96d 100644 --- a/xlators/cluster/ec/src/ec.c +++ b/xlators/cluster/ec/src/ec.c @@ -301,6 +301,8 @@ reconfigure (xlator_t *this, dict_t *options)                  ret = -1;          } +        GF_OPTION_RECONF ("optimistic-change-log", ec->optimistic_changelog, +                          options, bool, failed);  failed:          return ret;  } @@ -611,6 +613,7 @@ init (xlator_t *this)      this->private = ec;      ec->xl = this; +    ec->optimistic_changelog = _gf_true;      LOCK_INIT(&ec->lock);      INIT_LIST_HEAD(&ec->pending_fops); @@ -669,6 +672,7 @@ init (xlator_t *this)      GF_OPTION_INIT ("shd-max-threads", ec->shd.max_threads, uint32, failed);      GF_OPTION_INIT ("shd-wait-qlength", ec->shd.wait_qlength, uint32, failed); +    GF_OPTION_INIT ("optimistic-change-log", ec->optimistic_changelog, bool, failed);      this->itable = inode_table_new (EC_SHD_INODE_LRU_LIMIT, this);      if (!this->itable) @@ -1463,5 +1467,21 @@ struct volume_options options[] =          .description = "force the cpu extensions to be used to accelerate the "                         "galois field computations."      }, -    { } +    {   .key = {"optimistic-change-log"}, +        .type = GF_OPTION_TYPE_BOOL, +        .default_value = "on", +        .description =  "Set/Unset dirty flag for every update fop at the start " +                        "of the fop. If OFF, this option impacts performance of " +                        "entry operations or metadata operations as it will " +                        "set dirty flag at the start and unset it at the end of " +                        "ALL update fop.
If ON and all the bricks are good, " +                        "dirty flag will be set at the start only for file fops. " +                        "For metadata and entry fops dirty flag will not be set " +                        "at the start, if all the bricks are good. This does " +                        "not impact performance for metadata operations and " +                        "entry operation but has a very small window to miss " +                        "marking entry as dirty in case it is required to be " +                        "healed" +    }, +    { .key = {NULL} }  }; diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index 5cb4b6c9702..b3f6c40d7f0 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -3153,6 +3153,12 @@ struct volopt_map_entry glusterd_volopt_map[] = {            .op_version  = GD_OP_VERSION_3_10_0,            .validate_fn = validate_boolean          }, +        { .key        = "disperse.optimistic-change-log", +          .voltype    = "cluster/disperse", +          .type       = NO_DOC, +          .op_version = GD_OP_VERSION_3_10_1, +          .flags      = OPT_FLAG_CLIENT_OPT +        },          { .key         = NULL          }  };  |
