diff options
| author | Ravishankar N <ravishankar@redhat.com> | 2015-12-23 13:49:14 +0530 | 
|---|---|---|
| committer | Pranith Kumar Karampuri <pkarampu@redhat.com> | 2016-04-27 21:44:50 -0700 | 
| commit | 17ddeb5cfed4029db65d6432511ddff28c866129 (patch) | |
| tree | 6732c46b567f3847fd60c61db07c62d4f04553d2 | |
| parent | c54be67f438c71b4cdd63eb1702acfe86e765412 (diff) | |
afr: propagate child up event after timeout
Backport of: http://review.gluster.org/11113
Problem: During mount, afr waits for response from all its children before
notifying the parent xlator. In a 1x2 replica volume , if one of the nodes is
down, the mount will hang for more than a minute until child down is received
from the client xlator for that node.
Fix:
When parent up is received by afr, start a 10 second timer. In the timer call
back, if we receive a successful child up from atleast one brick, propagate the
event to the parent xlator.
Change-Id: I31e57c8802c1a03a4a5d581ee4ab82f3a9c8799d
BUG: 1330855
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
Reviewed-on: http://review.gluster.org/14088
Smoke: Gluster Build System <jenkins@build.gluster.com>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
CentOS-regression: Gluster Build System <jenkins@build.gluster.com>
| -rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 280 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-messages.h | 10 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.c | 6 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.h | 3 | 
4 files changed, 200 insertions, 99 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 1345d4d0bcc..9ff374048ed 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -3922,6 +3922,117 @@ find_child_index (xlator_t *this, xlator_t *child)          return i;  } +static int +__afr_get_up_children_count (afr_private_t *priv) +{ +        int             up_children         = 0; +        int             i = 0; + +        for (i = 0; i < priv->child_count; i++) +                if (priv->child_up[i] == 1) +                        up_children++; + +        return up_children; +} + +glusterfs_event_t +__afr_transform_event_from_state (afr_private_t *priv) +{ +        int i = 0; +        int up_children = 0; + +        if (AFR_COUNT (priv->last_event, priv->child_count) == +                       priv->child_count) +                /* have_heard_from_all. Let afr_notify() do the propagation. */ +                return GF_EVENT_MAXVAL; + +        up_children = __afr_get_up_children_count (priv); +        if (up_children) { +                /* We received at least one child up and there are pending +                 * notifications from some children. Treat these children as +                 * having sent a GF_EVENT_CHILD_DOWN. i.e. set the event as +                 * GF_EVENT_CHILD_MODIFIED, as done in afr_notify() */ +                for (i = 0; i < priv->child_count; i++) { +                        if (priv->last_event[i]) +                                continue; +                        priv->last_event[i] = GF_EVENT_CHILD_MODIFIED; +                        priv->child_up[i] = 0; +                } +                return GF_EVENT_CHILD_UP; +        } else { +                for (i = 0; i < priv->child_count; i++) { +                        if (priv->last_event[i]) +                                continue; +                        priv->last_event[i] = GF_EVENT_SOME_CHILD_DOWN; +                        priv->child_up[i] = 0; +                } +                return GF_EVENT_CHILD_DOWN; +        } + +        return GF_EVENT_MAXVAL; +} + +static void +afr_notify_cbk (void *data) +{ +        xlator_t *this = data; +        afr_private_t *priv = this->private; +        glusterfs_event_t event = GF_EVENT_MAXVAL; +        gf_boolean_t propagate = _gf_false; + +        LOCK (&priv->lock); +        { +                if (!priv->timer) { +                        /* +                         * Either child_up/child_down is already sent to parent. +                         * This is a spurious wake up. +                         */ +                        goto unlock; +                } +                priv->timer = NULL; +                event = __afr_transform_event_from_state (priv); +                if (event != GF_EVENT_MAXVAL) +                        propagate = _gf_true; +        } +unlock: +        UNLOCK (&priv->lock); +        if (propagate) +                default_notify (this, event, NULL); +} + +static void +__afr_launch_notify_timer (xlator_t *this, afr_private_t *priv) +{ + +        struct timespec delay = {0, }; + +        gf_msg_debug (this->name, 0, "Initiating child-down timer"); +        delay.tv_sec = 10; +        delay.tv_nsec = 0; +        priv->timer = gf_timer_call_after (this->ctx, delay, +                                           afr_notify_cbk, this); +        if (priv->timer == NULL) { +                gf_msg (this->name, GF_LOG_ERROR, 0, AFR_MSG_TIMER_CREATE_FAIL, +                        "Cannot create timer for delayed initialization"); +        } +} + +int +__get_heard_from_all_status (xlator_t *this) +{ +        afr_private_t *priv          = this->private; +        int           heard_from_all = 1; +        int           i              = 0; + +        for (i = 0; i < priv->child_count; i++) { +                if (!priv->last_event[i]) { +                        heard_from_all = 0; +                        break; +                } +        } +        return heard_from_all; +} +  int32_t  afr_notify (xlator_t *this, int32_t event,              void *data, void *data2) @@ -3954,12 +4065,6 @@ afr_notify (xlator_t *this, int32_t event,           */          priv->did_discovery = _gf_false; -        had_heard_from_all = 1; -        for (i = 0; i < priv->child_count; i++) { -                if (!priv->last_event[i]) { -                        had_heard_from_all = 0; -                } -        }          /* parent xlators dont need to know about every child_up, child_down           * because of afr ha. If all subvolumes go down, child_down has @@ -3979,10 +4084,32 @@ afr_notify (xlator_t *this, int32_t event,          had_quorum = priv->quorum_count && afr_has_quorum (priv->child_up,                                                             this); -        switch (event) { -        case GF_EVENT_CHILD_UP: +        if (event == GF_EVENT_TRANSLATOR_OP) {                  LOCK (&priv->lock);                  { +                        had_heard_from_all = __get_heard_from_all_status (this); +                } +                UNLOCK (&priv->lock); + +                if (!had_heard_from_all) { +                        ret = -1; +                } else { +                        input = data; +                        output = data2; +                        ret = afr_xl_op (this, input, output); +                } +                goto out; +        } + +        LOCK (&priv->lock); +        { +                had_heard_from_all = __get_heard_from_all_status (this); +                switch (event) { +                case GF_EVENT_PARENT_UP: +                        __afr_launch_notify_timer (this, priv); +                        propagate = 1; +                        break; +                case GF_EVENT_CHILD_UP:                          /*                           * This only really counts if the child was never up                           * (value = -1) or had been down (value = 0).  See @@ -3990,48 +4117,28 @@ afr_notify (xlator_t *this, int32_t event,                           * explanation.                           */                          if (priv->child_up[idx] != 1) { -                                priv->up_count++; -				priv->event_generation++; +                                priv->event_generation++;                          }                          priv->child_up[idx] = 1;                          call_psh = 1; -                        for (i = 0; i < priv->child_count; i++) -                                if (priv->child_up[i] == 1) -                                        up_children++; +                        up_children = __afr_get_up_children_count (priv);                          if (up_children == 1) {                                  gf_msg (this->name, GF_LOG_INFO, 0,                                          AFR_MSG_SUBVOL_UP,                                          "Subvolume '%s' came back up; " -                                        "going online.", ((xlator_t *)data)->name); +                                     "going online.", ((xlator_t *)data)->name);                          } else {                                  event = GF_EVENT_CHILD_MODIFIED;                          }                          priv->last_event[idx] = event; -                } -                UNLOCK (&priv->lock); -                break; +                        break; -        case GF_EVENT_CHILD_DOWN: -                LOCK (&priv->lock); -                { -                        /* -                         * If a brick is down when we start, we'll get a -                         * CHILD_DOWN to indicate its initial state.  There -                         * was never a CHILD_UP in this case, so if we -                         * increment "down_count" the difference between than -                         * and "up_count" will no longer be the number of -                         * children that are currently up.  This has serious -                         * implications e.g. for quorum enforcement, so we -                         * don't increment these values unless the event -                         * represents an actual state transition between "up" -                         * (value = 1) and anything else. -                         */ +                case GF_EVENT_CHILD_DOWN:                          if (priv->child_up[idx] == 1) { -                                priv->down_count++; -				priv->event_generation++; +                                priv->event_generation++;                          }                          priv->child_up[idx] = 0; @@ -4041,42 +4148,56 @@ afr_notify (xlator_t *this, int32_t event,                          if (down_children == priv->child_count) {                                  gf_msg (this->name, GF_LOG_ERROR, 0,                                          AFR_MSG_ALL_SUBVOLS_DOWN, -                                        "All subvolumes are down. Going offline " -                                        "until atleast one of them comes back up."); +                                       "All subvolumes are down. Going offline " +                                    "until atleast one of them comes back up.");                          } else {                                  event = GF_EVENT_SOME_CHILD_DOWN;                          }                          priv->last_event[idx] = event; -                } -                UNLOCK (&priv->lock); -                break; +                        break; -        case GF_EVENT_CHILD_CONNECTING: -                LOCK (&priv->lock); -                { +                case GF_EVENT_CHILD_CONNECTING:                          priv->last_event[idx] = event; -                } -                UNLOCK (&priv->lock); -                break; +                        break; -        case GF_EVENT_TRANSLATOR_OP: -                input = data; -                output = data2; -                if (!had_heard_from_all) { -                        ret = -1; -                        goto out; +                case GF_EVENT_SOME_CHILD_DOWN: +                        priv->last_event[idx] = event; +                        break; + +                default: +                        propagate = 1; +                        break;                  } -                ret = afr_xl_op (this, input, output); -                goto out; -                break; +                have_heard_from_all = __get_heard_from_all_status (this); +                if (!had_heard_from_all && have_heard_from_all) { +                        if (priv->timer) { +                                gf_timer_call_cancel (this->ctx, priv->timer); +                                priv->timer = NULL; +                        } +                        /* This is the first event which completes aggregation +                           of events from all subvolumes. If at least one subvol +                           had come up, propagate CHILD_UP, but only this time +                        */ +                        event = GF_EVENT_CHILD_DOWN; +                        up_children = __afr_get_up_children_count (priv); +                        for (i = 0; i < priv->child_count; i++) { +                                if (priv->last_event[i] == GF_EVENT_CHILD_UP) { +                                        event = GF_EVENT_CHILD_UP; +                                        break; +                                } -        default: -                propagate = 1; -                break; +                                if (priv->last_event[i] == +                                                GF_EVENT_CHILD_CONNECTING) { +                                        event = GF_EVENT_CHILD_CONNECTING; +                               /* continue to check other events for CHILD_UP */ +                                } +                        } +                }          } +        UNLOCK (&priv->lock);          if (priv->quorum_count) {                  has_quorum = afr_has_quorum (priv->child_up, this); @@ -4089,44 +4210,11 @@ afr_notify (xlator_t *this, int32_t event,                                  "Client-quorum is not met");          } -        /* have all subvolumes reported status once by now? */ -        have_heard_from_all = 1; -        for (i = 0; i < priv->child_count; i++) { -                if (!priv->last_event[i]) -                        have_heard_from_all = 0; -        } -          /* if all subvols have reported status, no need to hide anything             or wait for anything else. Just propagate blindly */          if (have_heard_from_all)                  propagate = 1; -        if (!had_heard_from_all && have_heard_from_all) { -                /* This is the first event which completes aggregation -                   of events from all subvolumes. If at least one subvol -                   had come up, propagate CHILD_UP, but only this time -                */ -                event = GF_EVENT_CHILD_DOWN; - -                LOCK (&priv->lock); -                { -                        up_children = AFR_COUNT (priv->child_up, priv->child_count); -                        for (i = 0; i < priv->child_count; i++) { -                                if (priv->last_event[i] == GF_EVENT_CHILD_UP) { -                                        event = GF_EVENT_CHILD_UP; -                                        break; -                                } - -                                if (priv->last_event[i] == -                                                GF_EVENT_CHILD_CONNECTING) { -                                        event = GF_EVENT_CHILD_CONNECTING; -                                        /* continue to check other events for CHILD_UP */ -                                } -                        } -                } -                UNLOCK (&priv->lock); -        } -          ret = 0;          if (propagate)                  ret = default_notify (this, event, data); @@ -4371,18 +4459,18 @@ gf_boolean_t  afr_have_quorum (char *logname, afr_private_t *priv)  {          unsigned int        quorum = 0; +        unsigned int        up_children = 0;          GF_VALIDATE_OR_GOTO(logname,priv,out); +        up_children = __afr_get_up_children_count (priv);          quorum = priv->quorum_count; -        if (quorum != AFR_QUORUM_AUTO) { -                return (priv->up_count >= (priv->down_count + quorum)); -        } +        if (quorum != AFR_QUORUM_AUTO) +                return up_children >= quorum;          quorum = priv->child_count / 2 + 1; -        if (priv->up_count >= (priv->down_count + quorum)) { +        if (up_children >= quorum)                  return _gf_true; -        }          /*           * Special case for even numbers of nodes: if we have exactly half @@ -4393,7 +4481,7 @@ afr_have_quorum (char *logname, afr_private_t *priv)           */          if ((priv->child_count % 2) == 0) {                  quorum = priv->child_count / 2; -                if (priv->up_count >= (priv->down_count + quorum)) { +                if (up_children >= quorum) {                          if (priv->child_up[0]) {                                  return _gf_true;                          } diff --git a/xlators/cluster/afr/src/afr-messages.h b/xlators/cluster/afr/src/afr-messages.h index 1fb5b51b20a..f2cd8300099 100644 --- a/xlators/cluster/afr/src/afr-messages.h +++ b/xlators/cluster/afr/src/afr-messages.h @@ -45,7 +45,7 @@   */  #define GLFS_COMP_BASE_AFR      GLFS_MSGID_COMP_AFR -#define GLFS_NUM_MESSAGES       39 +#define GLFS_NUM_MESSAGES       41  #define GLFS_MSGID_END          (GLFS_COMP_BASE_AFR + GLFS_NUM_MESSAGES + 1)  #define glfs_msg_start_x GLFS_COMP_BASE_AFR, "Invalid: Start of messages" @@ -357,5 +357,13 @@  */  #define AFR_MSG_NO_CHANGELOG  (GLFS_COMP_BASE_AFR + 40) +/*! + * @messageid 108041 + * @diagnosis Unable to create timer thread for delayed initialization. + * @recommendedaction Possibly check process's log file for messages from + * timer infra. +*/ +#define AFR_MSG_TIMER_CREATE_FAIL               (GLFS_COMP_BASE_AFR + 41) +  #define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages"  #endif /* !_AFR_MESSAGES_H_ */ diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index ad92eaf0c3e..dafacee2996 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -498,6 +498,12 @@ fini (xlator_t *this)          afr_private_t *priv = NULL;          priv = this->private; +        LOCK (&priv->lock); +        if (priv->timer != NULL) { +                gf_timer_call_cancel(this->ctx, priv->timer); +                priv->timer = NULL; +        } +        UNLOCK (&priv->lock);          this->private = NULL;          afr_priv_destroy (priv);          //if (this->itable);//I dont see any destroy func diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index d3c1e1066b6..c553a1ffc4c 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -104,8 +104,7 @@ typedef struct _afr_private {          unsigned int wait_count;      /* # of servers to wait for success */ -        uint64_t up_count;      /* number of CHILD_UPs we have seen */ -        uint64_t down_count;    /* number of CHILD_DOWNs we have seen */ +        gf_timer_t *timer;      /* launched when parent up is received */          gf_boolean_t      optimistic_change_log;          gf_boolean_t      eager_lock;  | 
