error-gen: fix randomness

This generates errors in a truly random distribution, including streaks and dry spells, instead of just setting a constant interval between errors. It also supports very small error probabilities, down to 1ppm instead of 1%. Also cleaned up some bad logic in the "random-failure" case. Change-Id: I17b058c4b86b0aa17dd5706250e6d44f242fcd9d Signed-off-by: Jeff Darcy <jdarcy@fb.com> Reviewed-on: https://review.gluster.org/17886 Smoke: Gluster Build System <jenkins@build.gluster.org> Tested-by: Jeff Darcy <jeff@pl.atyp.us> CentOS-regression: Gluster Build System <jenkins@build.gluster.org> Reviewed-by: Shyamsundar Ranganathan <srangana@redhat.com> Reviewed-by: Jeff Darcy <jeff@pl.atyp.us>
author: Jeff Darcy <jdarcy@fb.com> 2017-07-26 11:56:04 -0700
committer: Jeff Darcy <jeff@pl.atyp.us> 2017-07-31 21:23:43 +0000
commit: 18cb1a87bb7c88657b499b22894b81f1f694b6ad (patch)
tree: 3a3b2732939533c9aaa2e8d2e266309021a79828 /xlators/debug
parent: 91c9f4a19fde4894576b398252c77f730832a26a (diff)
2 files changed, 67 insertions, 27 deletions
diff --git a/xlators/debug/error-gen/src/error-gen.c b/xlators/debug/error-gen/src/error-gen.c
index 80f46dec065..0ce88a92918 100644
--- a/xlators/debug/error-gen/src/error-gen.c
+++ b/xlators/debug/error-gen/src/error-gen.c
@@ -12,6 +12,22 @@
 #include "statedump.h"
 #include "defaults.h"
 
+/*
+ * The user can specify an error probability as a float percentage, but we
+ * store it internally as a numerator with this as the denominator.  When it's
+ * used, it's like this:
+ *
+ *    (rand() % FAILURE_GRANULARITY) < error_rate
+ *
+ * To minimize rounding errors from the modulo operation, it's good for this to
+ * be a power of two.
+ *
+ * (BTW this is just the normal case.  If "random-failure" is set, that does
+ * something completely different and this number is irrelevant.  See error_gen
+ * for the legacy code.)
+ */
+#define FAILURE_GRANULARITY     (1 << 20)
+
 sys_error_t error_no_list[] = {
         [GF_FOP_LOOKUP]            = { .error_no_count = 4,
                                     .error_no = {ENOENT,ENOTDIR,
@@ -246,32 +262,52 @@ error_gen (xlator_t *this, int op_no)
 {
         eg_t             *egp = NULL;
         int              count = 0;
-        int              failure_iter_no = GF_FAILURE_DEFAULT;
         int              error_no_int = 0;
         int              rand_no = 0;
         int              ret = 0;
+        gf_boolean_t     should_err = _gf_false;
 
         egp = this->private;
 
-        LOCK (&egp->lock);
-        {
-                count = ++egp->op_count;
-                failure_iter_no = egp->failure_iter_no;
-                error_no_int = egp->error_no_int;
-        }
-        UNLOCK (&egp->lock);
-
-        if((count % failure_iter_no) == 0) {
+        if (egp->random_failure) {
+                /*
+                 * I honestly don't know why anyone would use this "feature"
+                 * but I'll try to preserve its functionality anyway.  Without
+                 * locking twice to update failure_iter_no and egp->op_count
+                 * separately, then not locking at all to update
+                 * egp->failure_iter_no.  That's not needed for compatibility,
+                 * and it's abhorrently wrong.  I have *some* standards.
+                 */
                 LOCK (&egp->lock);
                 {
-                        egp->op_count = 0;
+                        count = ++(egp->op_count);
+                        error_no_int = egp->error_no_int;
+                        if ((count % egp->failure_iter_no) == 0) {
+                                egp->op_count = 0;
+                                /* coverty[DC.WEAK_CRYPTO] */
+                                egp->failure_iter_no = 3
+                                        + (rand () % GF_UNIVERSAL_ANSWER);
+                                should_err = _gf_true;
+                        }
                 }
                 UNLOCK (&egp->lock);
+        } else {
+                /*
+                 * It turns out that rand() is almost universally implemented
+                 * as a linear congruential PRNG, which is about as cheap as
+                 * it gets.  This gets us real random behavior, including
+                 * phenomena like streaks and dry spells, with controllable
+                 * long-term probability, cheaply.
+                 */
+                if ((rand () % FAILURE_GRANULARITY) < egp->failure_iter_no) {
+                        should_err = _gf_true;
+                }
+        }
 
+        if (should_err) {
                 if (error_no_int)
                         ret = error_no_int;
                 else {
-
                         rand_no = generate_rand_no (op_no);
                         if (op_no >= GF_FOP_MAXVALUE)
                                 op_no = 0;
@@ -279,10 +315,8 @@ error_gen (xlator_t *this, int op_no)
                                 rand_no = 0;
                         ret = error_no_list[op_no].error_no[rand_no];
                 }
-                if (egp->random_failure == _gf_true)
-                        /* coverty[DC.WEAK_CRYPTO] */
-                        egp->failure_iter_no = 3 + (rand () % GF_UNIVERSAL_ANSWER);
         }
+
         return ret;
 }
 
@@ -1379,14 +1413,14 @@ error_gen_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
 }
 
 static void
-error_gen_set_failure (eg_t *pvt, int percent)
+error_gen_set_failure (eg_t *pvt, double percent)
 {
+        double  ppm;
+
         GF_ASSERT (pvt);
 
-        if (percent)
-                pvt->failure_iter_no = 100/percent;
-        else
-                pvt->failure_iter_no = 100/GF_FAILURE_DEFAULT;
+        ppm = (percent / 100.0) * (double)FAILURE_GRANULARITY;
+        pvt->failure_iter_no = (int)ppm;
 }
 
 static void
@@ -1482,7 +1516,7 @@ reconfigure (xlator_t *this, dict_t *options)
         eg_t            *pvt = NULL;
         int32_t          ret = 0;
         char            *error_enable_fops = NULL;
-        int32_t          failure_percent_int = 0;
+        double           failure_percent_dbl = 0.0;
 
         if (!this || !this->private)
                 goto out;
@@ -1496,7 +1530,7 @@ reconfigure (xlator_t *this, dict_t *options)
         if (pvt->error_no)
                 pvt->error_no_int = conv_errno_to_int (&pvt->error_no);
 
-        GF_OPTION_RECONF ("failure", failure_percent_int, options, int32,
+        GF_OPTION_RECONF ("failure", failure_percent_dbl, options, percent,
                           out);
 
         GF_OPTION_RECONF ("enable", error_enable_fops, options, str, out);
@@ -1505,7 +1539,7 @@ reconfigure (xlator_t *this, dict_t *options)
                           bool, out);
 
         error_gen_parse_fill_fops (pvt, error_enable_fops);
-        error_gen_set_failure (pvt, failure_percent_int);
+        error_gen_set_failure (pvt, failure_percent_dbl);
 
         ret = 0;
 out:
@@ -1519,7 +1553,7 @@ init (xlator_t *this)
         eg_t            *pvt = NULL;
         int32_t          ret = 0;
         char            *error_enable_fops = NULL;
-        int32_t          failure_percent_int = 0;
+        double          failure_percent_dbl = 0.0;
 
         if (!this->children || this->children->next) {
                 gf_log (this->name, GF_LOG_ERROR,
@@ -1549,7 +1583,7 @@ init (xlator_t *this)
         if (pvt->error_no)
                 pvt->error_no_int = conv_errno_to_int (&pvt->error_no);
 
-        GF_OPTION_INIT ("failure", failure_percent_int, int32, out);
+        GF_OPTION_INIT ("failure", failure_percent_dbl, percent, out);
 
         GF_OPTION_INIT ("enable", error_enable_fops, str, out);
 
@@ -1557,7 +1591,7 @@ init (xlator_t *this)
 
 
         error_gen_parse_fill_fops (pvt, error_enable_fops);
-        error_gen_set_failure (pvt, failure_percent_int);
+        error_gen_set_failure (pvt, failure_percent_dbl);
 
         this->private = pvt;
 
@@ -1641,7 +1675,7 @@ struct xlator_fops fops = {
 
 struct volume_options options[] = {
         { .key  = {"failure"},
-          .type = GF_OPTION_TYPE_INT,
+          .type = GF_OPTION_TYPE_PERCENT,
           .description = "Percentage failure of operations when enabled.",
         },
 
diff --git a/xlators/debug/error-gen/src/error-gen.h b/xlators/debug/error-gen/src/error-gen.h
index 6ef6b232e6a..2cd95de335a 100644
--- a/xlators/debug/error-gen/src/error-gen.h
+++ b/xlators/debug/error-gen/src/error-gen.h
@@ -29,6 +29,12 @@ enum GF_PSEUDO_ERRORS {
 typedef struct {
         int enable[GF_FOP_MAXVALUE];
         int op_count;
+        /*
+         * This is only an iteration number in the random-failure case.  For
+         * the normal controlled-probability case, it's actually a numerator
+         * for the failure probability (see FAILURE_GRANULARITY declaration).
+         * It's just not worth blowing up the diff by changing it.
+         */
         int failure_iter_no;
         char *error_no;
         int error_no_int;
author	Jeff Darcy <jdarcy@fb.com>	2017-07-26 11:56:04 -0700
committer	Jeff Darcy <jeff@pl.atyp.us>	2017-07-31 21:23:43 +0000
commit	18cb1a87bb7c88657b499b22894b81f1f694b6ad (patch)
tree	3a3b2732939533c9aaa2e8d2e266309021a79828 /xlators/debug
parent	91c9f4a19fde4894576b398252c77f730832a26a (diff)