diff options
| author | Xavier Hernandez <xhernandez@datalab.es> | 2014-11-08 21:46:41 +0100 | 
|---|---|---|
| committer | Raghavendra Bhat <raghavendra@redhat.com> | 2015-02-11 01:37:55 -0800 | 
| commit | 22035d5e37db748cbdee2596d99006cea6b5282e (patch) | |
| tree | 3f84db610a611e52ee8b17b9977a81ed78ea44b4 | |
| parent | 8c9526a8626dc1a7b7aac5f96ecb33991dc44237 (diff) | |
ec: Fix posix compliance failures
This patch solves some problems that caused dispersed volumes to not
pass posix smoke tests:
* Problems in open/create with O_WRONLY
    Opening files with -w- permissions using O_WRONLY returned an EACCES
    error because internally O_WRONLY was replaced with O_RDWR.
* Problems with entrylk on renames.
    When source and destination were the same, ec tried to acquire
    the same entrylk twice, causing a deadlock.
* Overwrite of a variable when reordering locks.
    On a rename, if the second lock needed to be placed at the beggining
    of the list, the 'lock' variable was overwritten and later its timer
    was cancelled, cancelling the incorrect one.
* Handle O_TRUNC in open.
    When O_TRUNC was received in an open call, it was blindly propagated
    to child subvolumes. This caused a discrepancy between real file
    size and the size stored into trusted.ec.size xattr. This has been
    solved by removing O_TRUNC from open and later calling ftruncate.
This is a backport of http://review.gluster.org/9420
Change-Id: I20c3d6e1c11be314be86879be54b728e01013798
BUG: 1159471
Signed-off-by: Xavier Hernandez <xhernandez@datalab.es>
Reviewed-on: http://review.gluster.org/9501
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Dan Lambright <dlambrig@redhat.com>
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
Reviewed-by: Raghavendra Bhat <raghavendra@redhat.com>
| -rw-r--r-- | tests/bugs/bug-1161886/bug-1161886.c | 53 | ||||
| -rw-r--r-- | tests/bugs/bug-1161886/bug-1161886.t | 44 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-common.c | 54 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-common.h | 1 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-dir-write.c | 8 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-heal.c | 13 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-inode-read.c | 50 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-inode-write.c | 62 | 
8 files changed, 198 insertions, 87 deletions
diff --git a/tests/bugs/bug-1161886/bug-1161886.c b/tests/bugs/bug-1161886/bug-1161886.c new file mode 100644 index 00000000000..036f4ad1cc7 --- /dev/null +++ b/tests/bugs/bug-1161886/bug-1161886.c @@ -0,0 +1,53 @@ + +#include <stdio.h> +#include "glfs.h" +#include "glfs-handles.h" + +int +main (int argc, char *argv[]) +{ +        glfs_t    *fs = NULL; +        glfs_fd_t *fd = NULL; +        int        ret = 1; + +        if (argc != 4) { +                fprintf (stderr, "Syntax: %s <host> <volname> <file>\n", argv[0]); +                return 1; +        } + +        fs = glfs_new (argv[2]); +        if (!fs) { +                fprintf (stderr, "glfs_new: returned NULL\n"); +                return 1; +        } + +        ret = glfs_set_volfile_server (fs, "tcp", argv[1], 24007); +        if (ret != 0) { +                fprintf (stderr, "glfs_set_volfile_server: retuned %d\n", ret); +                goto out; +        } +        ret = glfs_set_logging (fs, "/dev/null", 7); +        if (ret != 0) { +                fprintf (stderr, "glfs_set_logging: returned %d\n", ret); +                goto out; +        } +        ret = glfs_init (fs); +        if (ret != 0) { +                fprintf (stderr, "glfs_init: returned %d\n", ret); +                goto out; +        } + +        fd = glfs_open (fs, argv[3], O_RDWR | O_TRUNC); +        if (fd == NULL) { +                fprintf (stderr, "glfs_open: returned NULL\n"); +                goto out; +        } +        glfs_close(fd); + +        ret = 0; + +out: +        glfs_fini (fs); + +        return ret; +} diff --git a/tests/bugs/bug-1161886/bug-1161886.t b/tests/bugs/bug-1161886/bug-1161886.t new file mode 100644 index 00000000000..3911038cd5f --- /dev/null +++ b/tests/bugs/bug-1161886/bug-1161886.t @@ -0,0 +1,44 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +function check_ec_size +{ +    local res + +    for i in {0..2}; do +        res=$(( `getfattr -n trusted.ec.size -e hex $B0/$V0$i/$1 | sed -n "s/^trusted.ec.size=//p"` )) +        if [ "x$res" == "x" -o "$res" != "$2" ]; then +            echo "N" +            return 0 +        fi +    done +    echo "Y" +    return 0 +} + +cleanup + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 redundancy 1 $H0:$B0/${V0}{0..2} +EXPECT "Created" volinfo_field $V0 'Status' +TEST $CLI volume start $V0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Started" volinfo_field $V0 'Status' +TEST glusterfs --entry-timeout=0 --attribute-timeout=0 --volfile-id=/$V0 --volfile-server=$H0 $M0 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 + +TEST dd if=/dev/zero of=$M0/file1 bs=4k count=1 +TEST mv $M0/file1 $M0/file2 +TEST ! stat $M0/file1 +TEST stat $M0/file2 + +TEST gcc -Wall -O2 -I api/src -o $(dirname $0)/bug-1161886 $(dirname $0)/bug-1161886.c -lgfapi +TEST $(dirname $0)/bug-1161886 $H0 $V0 /file2 +EXPECT "^0$" stat -c "%s" $M0/file2 +EXPECT "^Y$" check_ec_size file2 0 + +rm -f $(dirname $0)/bug-1161886 + +cleanup diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c index dbf2238ea8e..bf34206d00e 100644 --- a/xlators/cluster/ec/src/ec-common.c +++ b/xlators/cluster/ec/src/ec-common.c @@ -232,6 +232,16 @@ void ec_fop_set_error(ec_fop_data_t * fop, int32_t error)      UNLOCK(&fop->lock);  } +void ec_sleep(ec_fop_data_t *fop) +{ +    LOCK(&fop->lock); + +    fop->refs++; +    fop->jobs++; + +    UNLOCK(&fop->lock); +} +  int32_t ec_check_complete(ec_fop_data_t * fop, ec_resume_f resume)  {      int32_t error = -1; @@ -435,12 +445,7 @@ int32_t ec_child_select(ec_fop_data_t * fop)          return 0;      } -    LOCK(&fop->lock); - -    fop->jobs++; -    fop->refs++; - -    UNLOCK(&fop->lock); +    ec_sleep(fop);      return 1;  } @@ -637,22 +642,23 @@ int32_t ec_lock_compare(ec_lock_t * lock1, ec_lock_t * lock2)  ec_lock_link_t *ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock,                                 int32_t update)  { -    ec_lock_t * tmp; +    ec_lock_t *new_lock, *tmp;      ec_lock_link_t *link = NULL;      int32_t tmp_update; +    new_lock = lock;      if ((fop->lock_count > 0) && -        (ec_lock_compare(fop->locks[0].lock, lock) > 0)) +        (ec_lock_compare(fop->locks[0].lock, new_lock) > 0))      {          tmp = fop->locks[0].lock; -        fop->locks[0].lock = lock; -        lock = tmp; +        fop->locks[0].lock = new_lock; +        new_lock = tmp;          tmp_update = fop->locks_update;          fop->locks_update = update;          update = tmp_update;      } -    fop->locks[fop->lock_count].lock = lock; +    fop->locks[fop->lock_count].lock = new_lock;      fop->locks[fop->lock_count].fop = fop;      fop->locks_update |= update << fop->lock_count; @@ -693,6 +699,16 @@ void ec_lock_prepare_entry(ec_fop_data_t *fop, loc_t *loc, int32_t update)              return;          } + +        /* If there's another lock, make sure that it's not the same. Otherwise +         * do not insert it. +         * +         * This can only happen on renames where source and target names are +         * in the same directory. */ +        if ((fop->lock_count > 0) && +            (fop->locks[0].lock->loc.inode == tmp.inode)) { +            goto wipe; +        }      } else {          if (ec_loc_from_loc(fop->xl, &tmp, loc) != 0) {              ec_fop_set_error(fop, EIO); @@ -742,6 +758,7 @@ insert:  unlock:      UNLOCK(&tmp.inode->lock); +wipe:      loc_wipe(&tmp);      if (link != NULL) { @@ -870,12 +887,7 @@ void ec_lock(ec_fop_data_t * fop)              list_add_tail(&fop->locks[fop->locked].wait_list, &lock->waiting); -            LOCK(&fop->lock); - -            fop->jobs++; -            fop->refs++; - -            UNLOCK(&fop->lock); +            ec_sleep(fop);              UNLOCK(&lock->loc.inode->lock); @@ -1329,12 +1341,7 @@ void ec_unlock_timer_add(ec_lock_link_t *link)          UNLOCK(&lock->loc.inode->lock);      } else if (lock->acquired) { -        LOCK(&fop->lock); - -        fop->jobs++; -        fop->refs++; - -        UNLOCK(&fop->lock); +        ec_sleep(fop);          /* If healing is needed, do not delay lock release to let self-heal           * start working as soon as possible. */ @@ -1353,6 +1360,7 @@ void ec_unlock_timer_add(ec_lock_link_t *link)                  refs = 0;              }          } else { +            ec_trace("UNLOCK_FORCE", fop, "lock=%p", lock);              *lock->plock = NULL;              refs = 0;          } diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h index a0f5c0bc290..2b1d9574cdf 100644 --- a/xlators/cluster/ec/src/ec-common.h +++ b/xlators/cluster/ec/src/ec-common.h @@ -93,6 +93,7 @@ void ec_dispatch_one(ec_fop_data_t * fop);  void ec_wait_winds(ec_fop_data_t * fop); +void ec_sleep(ec_fop_data_t *fop);  void ec_resume(ec_fop_data_t * fop, int32_t error);  void ec_resume_parent(ec_fop_data_t * fop, int32_t error); diff --git a/xlators/cluster/ec/src/ec-dir-write.c b/xlators/cluster/ec/src/ec-dir-write.c index e8d96272987..d48be20470d 100644 --- a/xlators/cluster/ec/src/ec-dir-write.c +++ b/xlators/cluster/ec/src/ec-dir-write.c @@ -158,10 +158,7 @@ int32_t ec_manager_create(ec_fop_data_t * fop, int32_t state)                  return EC_STATE_REPORT;              } -            if (ctx->flags == 0) -            { -                ctx->flags = fop->int32; -            } +            ctx->flags = fop->int32;              UNLOCK(&fop->fd->lock); @@ -207,8 +204,7 @@ int32_t ec_manager_create(ec_fop_data_t * fop, int32_t state)              /* We need to write to specific offsets on the bricks, so we               * need to remove O_APPEND from flags (if present) */ -            fop->int32 &= ~(O_ACCMODE | O_APPEND); -            fop->int32 |= O_RDWR; +            fop->int32 &= ~O_APPEND;          /* Fall through */ diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c index 949170db495..862d45d0cf3 100644 --- a/xlators/cluster/ec/src/ec-heal.c +++ b/xlators/cluster/ec/src/ec-heal.c @@ -783,7 +783,6 @@ ec_cbk_data_t * ec_heal_lookup_check(ec_heal_t * heal, uintptr_t * pgood,  void ec_heal_prepare(ec_heal_t * heal)  {      ec_cbk_data_t * cbk; -    ec_fd_t * ctx;      int32_t error = ENOMEM;      heal->available = heal->good; @@ -814,13 +813,6 @@ void ec_heal_prepare(ec_heal_t * heal)                  goto out;              } -            ctx = ec_fd_get(heal->fd, heal->xl); -            if ((ctx == NULL) || (loc_copy(&ctx->loc, &heal->loc) != 0)) -            { -                goto out; -            } - -            ctx->flags = O_RDWR;          }          if (heal->iatt.ia_type == IA_IFLNK) @@ -1057,11 +1049,6 @@ void ec_heal_reopen_fd(ec_heal_t * heal)                  else                  {                      flags = ctx_fd->flags & ~(O_TRUNC | O_APPEND); -                    if ((flags & O_ACCMODE) == O_WRONLY) -                    { -                        flags &= ~O_ACCMODE; -                        flags |= O_RDWR; -                    }                      ec_open(heal->fop->frame, heal->xl, mask, EC_MINIMUM_ONE,                              ec_heal_reopen_cbk, NULL, &heal->loc, flags, fd, diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c index a962264fde9..c87f6ea1d01 100644 --- a/xlators/cluster/ec/src/ec-inode-read.c +++ b/xlators/cluster/ec/src/ec-inode-read.c @@ -694,6 +694,26 @@ void ec_wind_open(ec_t * ec, ec_fop_data_t * fop, int32_t idx)                        &fop->loc[0], fop->int32, fop->fd, fop->xdata);  } +int32_t ec_open_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +                             int32_t op_ret, int32_t op_errno, +                             struct iatt *prebuf, struct iatt *postbuf, +                             dict_t *xdata) +{ +    ec_fop_data_t *fop = cookie; +    int32_t error = 0; + +    fop = fop->data; +    if (op_ret >= 0) { +        fop->answer->iatt[0] = *postbuf; +    } else { +        error = op_errno; +    } + +    ec_resume(fop, error); + +    return 0; +} +  int32_t ec_manager_open(ec_fop_data_t * fop, int32_t state)  {      ec_cbk_data_t * cbk; @@ -714,21 +734,18 @@ int32_t ec_manager_open(ec_fop_data_t * fop, int32_t state)                  return EC_STATE_REPORT;              } -            if (ctx->flags == 0) -            { -                ctx->flags = fop->int32; -            } +            ctx->flags = fop->int32;              UNLOCK(&fop->fd->lock); -            if ((fop->int32 & O_ACCMODE) == O_WRONLY) -            { -                fop->int32 &= ~O_ACCMODE; -                fop->int32 |= O_RDWR; -            }              /* We need to write to specific offsets on the bricks, so we -             * need to remove O_APPEND from flags (if present) */ -            fop->int32 &= ~O_APPEND; +               need to remove O_APPEND from flags (if present). +               If O_TRUNC is specified, we remove it from open and an +               ftruncate will be executed later, which will correctly update +               the file size taking appropriate locks. O_TRUNC flag is saved +               into fop->uint32 to use it later.*/ +            fop->uint32 = fop->int32 & O_TRUNC; +            fop->int32 &= ~(O_APPEND | O_TRUNC);          /* Fall through */ @@ -763,6 +780,17 @@ int32_t ec_manager_open(ec_fop_data_t * fop, int32_t state)                          }                          UNLOCK(&fop->fd->lock); + +                        /* If O_TRUNC was specified, call ftruncate to +                           effectively trunc the file with appropriate locks +                           acquired. We don't use ctx->flags because self-heal +                           can use the same fd with different flags. */ +                        if (fop->uint32 != 0) { +                            ec_sleep(fop); +                            ec_ftruncate(fop->req_frame, fop->xl, cbk->mask, +                                         fop->minimum, ec_open_truncate_cbk, +                                         fop, cbk->fd, 0, NULL); +                        }                      }                  }                  if (cbk->op_ret < 0) { diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c index a48ea09926a..140d59f5f20 100644 --- a/xlators/cluster/ec/src/ec-inode-write.c +++ b/xlators/cluster/ec/src/ec-inode-write.c @@ -1248,8 +1248,6 @@ int32_t ec_truncate_open_cbk(call_frame_t * frame, void * cookie,  int32_t ec_truncate_clean(ec_fop_data_t * fop)  { -    ec_fd_t * ctx; -      if (fop->fd == NULL)      {          fop->fd = fd_create(fop->loc[0].inode, fop->frame->root->pid); @@ -1257,13 +1255,6 @@ int32_t ec_truncate_clean(ec_fop_data_t * fop)          {              return 0;          } -        ctx = ec_fd_get(fop->fd, fop->xl); -        if ((ctx == NULL) || (loc_copy(&ctx->loc, &fop->loc[0]) != 0)) -        { -            return 0; -        } - -        ctx->flags = O_RDWR;          ec_open(fop->frame, fop->xl, fop->answer->mask, fop->minimum,                  ec_truncate_open_cbk, fop, &fop->loc[0], O_RDWR, fop->fd, @@ -1701,20 +1692,6 @@ out:  /* FOP: writev */ -int32_t ec_writev_init(ec_fop_data_t * fop) -{ -    ec_fd_t * ctx; - -    ctx = ec_fd_get(fop->fd, fop->xl); -    if (ctx != NULL) { -        if ((ctx->flags & O_ACCMODE) == O_RDONLY) { -            return EBADF; -        } -    } - -    return 0; -} -  int32_t ec_writev_merge_tail(call_frame_t * frame, void * cookie,                               xlator_t * this, int32_t op_ret, int32_t op_errno,                               struct iovec * vector, int32_t count, @@ -1787,14 +1764,29 @@ int32_t ec_writev_merge_head(call_frame_t * frame, void * cookie,      return 0;  } -void ec_writev_start(ec_fop_data_t * fop) +void ec_writev_start(ec_fop_data_t *fop)  {      ec_t *ec = fop->xl->private;      struct iobref *iobref = NULL;      struct iobuf *iobuf = NULL;      void *ptr = NULL;      ec_fd_t *ctx; +    fd_t *fd;      size_t tail; +    uid_t uid; +    gid_t gid; + +    fd = fd_anonymous(fop->fd->inode); +    if (fd == NULL) { +        ec_fop_set_error(fop, EIO); + +        return; +    } + +    uid = fop->frame->root->uid; +    fop->frame->root->uid = 0; +    gid = fop->frame->root->gid; +    fop->frame->root->gid = 0;      ctx = ec_fd_get(fop->fd, fop->xl);      if (ctx != NULL) { @@ -1833,7 +1825,7 @@ void ec_writev_start(ec_fop_data_t * fop)      if (fop->head > 0)      {          ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN, ec_writev_merge_head, -                 NULL, fop->fd, ec->stripe_size, fop->offset, 0, NULL); +                 NULL, fd, ec->stripe_size, fop->offset, 0, NULL);      }      tail = fop->size - fop->user_size - fop->head;      if ((tail > 0) && ((fop->head == 0) || (fop->size > ec->stripe_size))) @@ -1841,7 +1833,7 @@ void ec_writev_start(ec_fop_data_t * fop)          if (fop->pre_size > fop->offset + fop->head + fop->user_size)          {              ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN, -                     ec_writev_merge_tail, NULL, fop->fd, ec->stripe_size, +                     ec_writev_merge_tail, NULL, fd, ec->stripe_size,                       fop->offset + fop->size - ec->stripe_size, 0, NULL);          }          else @@ -1850,6 +1842,11 @@ void ec_writev_start(ec_fop_data_t * fop)          }      } +    fop->frame->root->uid = uid; +    fop->frame->root->gid = gid; + +    fd_unref(fd); +      return;  out: @@ -1860,6 +1857,11 @@ out:          iobref_unref(iobref);      } +    fop->frame->root->uid = uid; +    fop->frame->root->gid = gid; + +    fd_unref(fd); +      ec_fop_set_error(fop, EIO);  } @@ -2007,14 +2009,6 @@ int32_t ec_manager_writev(ec_fop_data_t * fop, int32_t state)      switch (state)      {          case EC_STATE_INIT: -            fop->error = ec_writev_init(fop); -            if (fop->error != 0) -            { -                return EC_STATE_REPORT; -            } - -        /* Fall through */ -          case EC_STATE_LOCK:              ec_lock_prepare_fd(fop, fop->fd, 1);              ec_lock(fop);  | 
