From fd2f9c0be1a5e170ca71079b2da2c9f3d64341ae Mon Sep 17 00:00:00 2001
From: Raghavendra G <raghavendra@gluster.com>
Date: Fri, 19 Aug 2011 12:08:15 +0530
Subject: performance/write-behind: preserve lk-owner while syncing writes.

  - This patch also makes syncing of non-overlapping but consecutive
    writes parallel. Till now only contiguous writes were synced
    parallely.

Change-Id: Icf0d5ea373f30c79fcdc90ba44b7e7a1bc5f0111
BUG: 765141
Signed-off-by: Raghavendra G <raghavendra@gluster.com>
Reviewed-on: http://review.gluster.com/269
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Anand Avati <avati@redhat.com>
---
 .../performance/write-behind/src/write-behind.c    | 209 ++++++++++++++-------
 1 file changed, 143 insertions(+), 66 deletions(-)

(limited to 'xlators/performance')

diff --git a/xlators/performance/write-behind/src/write-behind.c b/xlators/performance/write-behind/src/write-behind.c
index 81a254df6a0..704bde416bf 100644
--- a/xlators/performance/write-behind/src/write-behind.c
+++ b/xlators/performance/write-behind/src/write-behind.c
@@ -28,9 +28,9 @@
 #include "statedump.h"
 #include "write-behind-mem-types.h"
 
-#define MAX_VECTOR_COUNT  8
-#define WB_AGGREGATE_SIZE 131072 /* 128 KB */
-#define WB_WINDOW_SIZE    1048576 /* 1MB */
+#define MAX_VECTOR_COUNT          8
+#define WB_AGGREGATE_SIZE         131072 /* 128 KB */
+#define WB_WINDOW_SIZE            1048576 /* 1MB */
 
 typedef struct list_head list_head_t;
 struct wb_conf;
@@ -55,22 +55,23 @@ typedef struct wb_file {
 }wb_file_t;
 
 typedef struct wb_request {
-        list_head_t     list;
-        list_head_t     winds;
-        list_head_t     unwinds;
-        list_head_t     other_requests;
-        call_stub_t    *stub;
-        size_t          write_size;
-        int32_t         refcount;
-        wb_file_t      *file;
-        glusterfs_fop_t fop;
+        list_head_t           list;
+        list_head_t           winds;
+        list_head_t           unwinds;
+        list_head_t           other_requests;
+        call_stub_t          *stub;
+        size_t                write_size;
+        int32_t               refcount;
+        wb_file_t            *file;
+        glusterfs_fop_t       fop;
+        gf_lkowner_t          lk_owner;
         union {
                 struct  {
-                        char write_behind;
-                        char stack_wound;
-                        char got_reply;
-                        char virgin;
-                        char flush_all;     /* while trying to sync to back-end,
+                        char  write_behind;
+                        char  stack_wound;
+                        char  got_reply;
+                        char  virgin;
+                        char  flush_all;     /* while trying to sync to back-end,
                                              * don't wait till a data of size
                                              * equal to configured aggregate-size
                                              * is accumulated, instead sync
@@ -87,12 +88,12 @@ typedef struct wb_request {
 } wb_request_t;
 
 struct wb_conf {
-        uint64_t     aggregate_size;
-        uint64_t     window_size;
-        uint64_t     disable_till;
-        gf_boolean_t enable_O_SYNC;
-        gf_boolean_t flush_behind;
-        gf_boolean_t enable_trickling_writes;
+        uint64_t         aggregate_size;
+        uint64_t         window_size;
+        uint64_t         disable_till;
+        gf_boolean_t     enable_O_SYNC;
+        gf_boolean_t     flush_behind;
+        gf_boolean_t     enable_trickling_writes;
 };
 
 typedef struct wb_local {
@@ -119,6 +120,71 @@ ssize_t
 __wb_mark_winds (list_head_t *list, list_head_t *winds, size_t aggregate_size,
                  char enable_trickling_writes);
 
+/*
+  Below is a succinct explanation of the code deciding whether two regions
+  overlap, from Pavan <tcp@gluster.com>.
+
+  For any two ranges to be non-overlapping, either the end of the first
+  range is lesser than the start of the second, or vice versa. Example -
+
+  <--------->       <-------------->
+  p         q       x              y
+
+  ( q < x ) or (y < p) = > No overlap.
+
+  To check for *overlap*, we can negate this (using de morgan's laws), and
+  it becomes -
+
+  (q >= x ) and (y >= p)
+
+  Either that, or you write the negation using -
+
+  if (! ((q < x) or (y < p)) ) {
+  "Overlap"
+  }
+*/
+
+static inline char
+wb_requests_overlap (wb_request_t *request1, wb_request_t *request2)
+{
+        off_t r1_start = 0, r1_end = 0, r2_start = 0, r2_end = 0;
+
+        r1_start = request1->stub->args.writev.off;
+        r1_end = r1_start + iov_length (request1->stub->args.writev.vector,
+                                        request1->stub->args.writev.count);
+
+        r2_start = request2->stub->args.writev.off;
+        r2_end = r2_start + iov_length (request2->stub->args.writev.vector,
+                                        request2->stub->args.writev.count);
+
+        return ((r1_end >= r2_start) && (r2_end >= r1_start));
+}
+
+
+static inline char
+wb_overlap (list_head_t *list, wb_request_t *request)
+{
+        char          overlap = 0;
+        wb_request_t *tmp     = NULL;
+
+        GF_VALIDATE_OR_GOTO ("write-behind", list, out);
+        GF_VALIDATE_OR_GOTO ("write-behind", request, out);
+
+        list_for_each_entry (tmp, list, list) {
+                if (tmp == request) {
+                        break;
+                }
+
+                overlap = wb_requests_overlap (tmp, request);
+                if (overlap) {
+                        break;
+                }
+        }
+
+out:
+        return overlap;
+}
+
 
 static int
 __wb_request_unref (wb_request_t *this)
@@ -252,6 +318,8 @@ wb_enqueue (wb_file_t *file, call_stub_t *stub)
                 request->flags.write_request.virgin = 1;
         }
 
+        request->lk_owner = frame->root->lk_owner;
+
         LOCK (&file->lock);
         {
                 list_add_tail (&request->list, &file->request);
@@ -421,19 +489,21 @@ out:
 ssize_t
 wb_sync (call_frame_t *frame, wb_file_t *file, list_head_t *winds)
 {
-        wb_request_t   *dummy         = NULL, *request = NULL;
-        wb_request_t   *first_request = NULL, *next = NULL;
-        size_t          total_count   = 0, count = 0;
-        size_t          copied        = 0;
-        call_frame_t   *sync_frame    = NULL;
-        struct iobref  *iobref        = NULL;
-        wb_local_t     *local         = NULL;
-        struct iovec   *vector        = NULL;
-        ssize_t         current_size  = 0, bytes = 0;
-        size_t          bytecount     = 0;
-        wb_conf_t      *conf          = NULL;
-        fd_t           *fd            = NULL;
-        int32_t         op_errno      = -1;
+        wb_request_t  *dummy                = NULL, *request = NULL;
+        wb_request_t  *first_request        = NULL, *next = NULL;
+        size_t         total_count          = 0, count = 0;
+        size_t         copied               = 0;
+        call_frame_t  *sync_frame           = NULL;
+        struct iobref *iobref               = NULL;
+        wb_local_t    *local                = NULL;
+        struct iovec  *vector               = NULL;
+        ssize_t        current_size         = 0, bytes = 0;
+        size_t         bytecount            = 0;
+        wb_conf_t     *conf                 = NULL;
+        fd_t          *fd                   = NULL;
+        int32_t        op_errno             = -1;
+        off_t          next_offset_expected = 0;
+        gf_lkowner_t   lk_owner             = {0, };
 
         GF_VALIDATE_OR_GOTO_WITH_ERROR ((file ? file->this->name
                                          : "write-behind"), frame,
@@ -485,6 +555,10 @@ wb_sync (call_frame_t *frame, wb_file_t *file, list_head_t *winds)
 
                         first_request = request;
                         current_size = 0;
+
+                        next_offset_expected = request->stub->args.writev.off
+                                + request->write_size;
+                        lk_owner = request->lk_owner;
                 }
 
                 count += request->stub->args.writev.count;
@@ -514,8 +588,9 @@ wb_sync (call_frame_t *frame, wb_file_t *file, list_head_t *winds)
                     || ((count + next->stub->args.writev.count)
                         > MAX_VECTOR_COUNT)
                     || ((current_size + next->write_size)
-                        > conf->aggregate_size)) {
-
+                        > conf->aggregate_size)
+                    || (next_offset_expected != next->stub->args.writev.off)
+                    || (!is_same_lkowner (&lk_owner, &next->lk_owner))) {
                         sync_frame = copy_frame (frame);
                         if (sync_frame == NULL) {
                                 bytes = -1;
@@ -523,6 +598,8 @@ wb_sync (call_frame_t *frame, wb_file_t *file, list_head_t *winds)
                                 goto out;
                         }
 
+                        frame->root->lk_owner = lk_owner;
+
                         sync_frame->local = local;
                         local->file = file;
 
@@ -1518,12 +1595,11 @@ unwind:
 size_t
 __wb_mark_wind_all (wb_file_t *file, list_head_t *list, list_head_t *winds)
 {
-        wb_request_t *request         = NULL;
-        size_t        size            = 0;
-        char          first_request   = 1;
-        off_t         offset_expected = 0;
-        wb_conf_t    *conf            = NULL;
-        int           count           = 0;
+        wb_request_t *request           = NULL;
+        size_t        size              = 0;
+        char          first_request     = 1, overlap = 0;
+        wb_conf_t    *conf              = NULL;
+        int           count             = 0;
 
         GF_VALIDATE_OR_GOTO ("write-behind", file, out);
         GF_VALIDATE_OR_GOTO (file->this->name, list, out);
@@ -1541,12 +1617,11 @@ __wb_mark_wind_all (wb_file_t *file, list_head_t *list, list_head_t *winds)
                 if (!request->flags.write_request.stack_wound) {
                         if (first_request) {
                                 first_request = 0;
-                                offset_expected
-                                        = request->stub->args.writev.off;
-                        }
-
-                        if (request->stub->args.writev.off != offset_expected) {
-                                break;
+                        } else {
+                                overlap = wb_overlap (list, request);
+                                if (overlap) {
+                                        goto out;
+                                }
                         }
 
                         if ((file->flags & O_APPEND)
@@ -1554,12 +1629,10 @@ __wb_mark_wind_all (wb_file_t *file, list_head_t *list, list_head_t *winds)
                                  > conf->aggregate_size)
                                 || ((count + request->stub->args.writev.count)
                                     > MAX_VECTOR_COUNT))) {
-                                break;
+                                goto out;
                         }
 
                         size += request->write_size;
-                        offset_expected += request->write_size;
-                        file->aggregate_current -= request->write_size;
                         count += request->stub->args.writev.count;
 
                         request->flags.write_request.stack_wound = 1;
@@ -1568,19 +1641,23 @@ __wb_mark_wind_all (wb_file_t *file, list_head_t *list, list_head_t *winds)
         }
 
 out:
+        if (file != NULL) {
+                file->aggregate_current -= size;
+        }
+
         return size;
 }
 
 
 int32_t
 __wb_can_wind (list_head_t *list, char *other_fop_in_queue,
-               char *non_contiguous_writes, char *incomplete_writes,
+               char *overlapping_writes, char *incomplete_writes,
                char *wind_all)
 {
         wb_request_t *request         = NULL;
         char          first_request   = 1;
-        off_t         offset_expected = 0;
         int32_t       ret             = -1;
+        char          overlap         = 0;
 
         GF_VALIDATE_OR_GOTO ("write-behind", list, out);
 
@@ -1605,8 +1682,6 @@ __wb_can_wind (list_head_t *list, char *other_fop_in_queue,
                         if (first_request) {
                                 char flush = 0;
                                 first_request = 0;
-                                offset_expected
-                                        = request->stub->args.writev.off;
 
                                 flush = request->flags.write_request.flush_all;
                                 if (wind_all != NULL) {
@@ -1614,14 +1689,14 @@ __wb_can_wind (list_head_t *list, char *other_fop_in_queue,
                                 }
                         }
 
-                        if (offset_expected != request->stub->args.writev.off) {
-                                if (non_contiguous_writes) {
-                                        *non_contiguous_writes = 1;
+                        overlap = wb_overlap (list, request);
+                        if (overlap) {
+                                if (overlapping_writes != NULL) {
+                                        *overlapping_writes = 1;
                                 }
+
                                 break;
                         }
-
-                        offset_expected += request->write_size;
                 }
         }
 
@@ -1638,7 +1713,7 @@ __wb_mark_winds (list_head_t *list, list_head_t *winds, size_t aggregate_conf,
         size_t        size                   = 0;
         char          other_fop_in_queue     = 0;
         char          incomplete_writes      = 0;
-        char          non_contiguous_writes  = 0;
+        char          overlapping_writes     = 0;
         wb_request_t *request                = NULL;
         wb_file_t    *file                   = NULL;
         char          wind_all               = 0;
@@ -1655,7 +1730,7 @@ __wb_mark_winds (list_head_t *list, list_head_t *winds, size_t aggregate_conf,
         file = request->file;
 
         ret = __wb_can_wind (list, &other_fop_in_queue,
-                             &non_contiguous_writes, &incomplete_writes,
+                             &overlapping_writes, &incomplete_writes,
                              &wind_all);
         if (ret == -1) {
                 gf_log (file->this->name, GF_LOG_WARNING,
@@ -1664,7 +1739,7 @@ __wb_mark_winds (list_head_t *list, list_head_t *winds, size_t aggregate_conf,
         }
 
         if (!incomplete_writes && ((enable_trickling_writes)
-                                   || (wind_all) || (non_contiguous_writes)
+                                   || (wind_all) || (overlapping_writes)
                                    || (other_fop_in_queue)
                                    || (file->aggregate_current
                                        >= aggregate_conf))) {
@@ -1988,7 +2063,9 @@ __wb_collapse_write_bufs (list_head_t *requests, size_t page_size)
                         offset_expected = holder->stub->args.writev.off
                                 + holder->write_size;
 
-                        if (request->stub->args.writev.off != offset_expected) {
+                        if ((request->stub->args.writev.off != offset_expected)
+                            || (!is_same_lkowner (&request->lk_owner,
+                                                  &holder->lk_owner))) {
                                 holder = request;
                                 continue;
                         }
-- 
cgit