diff options
| -rw-r--r-- | libglusterfs/src/glusterfs.h | 9 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-common.c | 559 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-common.h | 20 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-rebalance.c | 97 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix.c | 123 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix.h | 10 | 
6 files changed, 742 insertions, 76 deletions
diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h index c6fc469ee4c..3e7aedde13d 100644 --- a/libglusterfs/src/glusterfs.h +++ b/libglusterfs/src/glusterfs.h @@ -149,6 +149,15 @@  #define GF_REBALANCE_TID_KEY     "rebalance-id"  #define GF_REMOVE_BRICK_TID_KEY  "remove-brick-id"  #define GF_REPLACE_BRICK_TID_KEY "replace-brick-id" +#define DHT_SKIP_NON_LINKTO_UNLINK "unlink-only-if-dht-linkto-file" +#define DHT_SKIP_OPEN_FD_UNLINK "dont-unlink-for-open-fd" + +#define DHT_LINKFILE_MODE        (S_ISVTX) + +#define IS_DHT_LINKFILE_MODE(iabuf) ((st_mode_from_ia ((iabuf)->ia_prot, \ +                                                       (iabuf)->ia_type) \ +                                      & ~S_IFMT)                        \ +                                     == DHT_LINKFILE_MODE)  /* NOTE: add members ONLY at the end (just before _MAXVALUE) */  typedef enum { diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index 5f7996a9ad6..c5105d27b91 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -442,7 +442,8 @@ dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                  is_dir = check_is_dir (inode, stbuf, xattr);                  if (!is_dir) {                          gf_log (this->name, GF_LOG_DEBUG, -                                "lookup of %s on %s returned non dir 0%o", +                                "lookup of %s on %s returned non dir 0%o " +                                "calling lookup_everywhere",                                  local->loc.path, prev->this->name,                                  stbuf->ia_type);                          local->need_selfheal = 1; @@ -541,6 +542,12 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          LOCK (&frame->lock);          { + +                gf_log (this->name, GF_LOG_DEBUG, +                        "revalidate lookup of %s " +                     
   "returned with op_ret %d and op_errno %d", +                        local->loc.path, op_ret, op_errno); +                  if (op_ret == -1) {                          local->op_errno = op_errno; @@ -564,6 +571,14 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                           * the file is not migrated */                          if (op_errno == ENOENT) {                                  if (IA_ISREG (local->loc.inode->ia_type)) { + +                                        gf_log (this->name, GF_LOG_DEBUG, +                                                "found ENOENT for %s. " +                                                "Setting " +                                                "need_lookup_everywhere" +                                                " flag to 1", +                                                local->loc.path); +                                          local->need_lookup_everywhere = 1;                                  }                          } @@ -760,9 +775,16 @@ dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie,          }  unwind: +        gf_log (this->name, GF_LOG_DEBUG, +                "creation of linkto on hashed subvol:%s, " +                "returned with op_ret %d and op_errno %d: %s", +                local->hashed_subvol->name, +                op_ret, op_errno, uuid_utoa (local->loc.gfid)); +          if (local->linked == _gf_true)                  dht_linkfile_attr_heal (frame, this); +          DHT_STRIP_PHASE1_FLAGS (&local->stbuf);          DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,                            local->inode, &local->stbuf, local->xattr, @@ -771,6 +793,176 @@ out:          return ret;  } +int +dht_lookup_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                       int op_ret, int op_errno, +                       struct iatt *preparent, struct iatt *postparent, +                       dict_t *xdata) 
+{ +        int             this_call_cnt = 0; +        dht_local_t     *local = NULL; +        const char      *path =  NULL; + +        local =  (dht_local_t*)frame->local; +        path = local->loc.path; + +        gf_log (this->name, GF_LOG_INFO, "lookup_unlink returned with " +                "op_ret -> %d and op-errno -> %d for %s", op_ret, op_errno, +                ((path == NULL)? "null" : path )); + +        this_call_cnt = dht_frame_return (frame); +        if (is_last_call (this_call_cnt)) { +                dht_lookup_everywhere_done (frame, this); +        } + +        return 0; +} + +int +dht_lookup_unlink_of_false_linkto_cbk (call_frame_t *frame, void *cookie, +                                       xlator_t *this, int op_ret, int op_errno, +                                       struct iatt *preparent, +                                       struct iatt *postparent, dict_t *xdata) +{ +        int             this_call_cnt = 0; +        dht_local_t     *local = NULL; +        const char      *path =  NULL; + +        local =  (dht_local_t*)frame->local; +        path = local->loc.path; + +        gf_log (this->name, GF_LOG_INFO, "lookup_unlink returned with " +                "op_ret -> %d and op-errno -> %d for %s", op_ret, op_errno, +                ((path == NULL)? "null" : path )); + +        this_call_cnt = dht_frame_return (frame); +        if (is_last_call (this_call_cnt)) { + +                if (op_ret == 0) { +                        dht_lookup_everywhere_done (frame, this); +                } else { +                       /*When dht_lookup_everywhere is performed, one cached +                         *and one hashed file was found and hashed file does +                         *not point to the above mentioned cached node. So it +                         *was considered as stale and an unlink was performed. +                         *But unlink fails. So may be rebalance is in progress. 
+                        *now ideally we have two data-files. One obtained during +                         *lookup_everywhere and one where unlink-failed. So +                         *at this point in time we cannot decide which one to +                         *choose because there are chances of first cached +                         *file is truncated after rebalance and if it is choosen +                        *as cached node, application will fail. So return EIO.*/ + +                        if (op_errno == EBUSY) { + +                                gf_log (this->name, GF_LOG_ERROR, +                                        "Could not unlink the linkto file as " +                                        "either fd is open and/or linkto xattr " +                                        "is set for %s", +                                        ((path == NULL)? "null":path)); + +                        } +                        DHT_STACK_UNWIND (lookup, frame, -1, EIO, NULL, NULL, +                                          NULL, NULL); + +                } +        } + +        return 0; +} + +int +dht_lookup_unlink_stale_linkto_cbk (call_frame_t *frame, void *cookie, +                                    xlator_t *this, int op_ret, int op_errno, +                                    struct iatt *preparent, +                                    struct iatt *postparent, dict_t *xdata) +{ + +        dht_local_t     *local = NULL; +        const char      *path  = NULL; + +        /* NOTE: +         * If stale file unlink fails either there is an open-fd or is not an +         * dht-linkto-file then posix_unlink returns EBUSY, which is overwritten +         *  to ENOENT +         */ + +        local = frame->local; + +        if (local && local->loc.path) +                path = local->loc.path; + +        gf_log (this->name, GF_LOG_INFO, "Returned with op_ret %d and " +                "op_errno %d for %s", op_ret, op_errno, +                
((path==NULL)?"null":path)); + +        DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, NULL, NULL, NULL, +                          NULL); + +        return 0; +} + +int +dht_fill_dict_to_avoid_unlink_of_migrating_file (dict_t *dict) { + +        int ret = 0; + +        ret = dict_set_int32 (dict, DHT_SKIP_NON_LINKTO_UNLINK, 1); + +        if (ret) +                goto err; + +        ret =  dict_set_int32 (dict, DHT_SKIP_OPEN_FD_UNLINK, 1); + +        if (ret) +                goto err; + + +        return 0; + +err: +        return -1; + +} +/* Rebalance is performed from cached_node to hashed_node. Initial cached_node + * contains a non-linkto file. After migration it is converted to linkto and + * then unlinked. And at hashed_subvolume, first a linkto file is present, + * then after migration it is converted to a non-linkto file. + * + * Lets assume a file is present on cached subvolume and a new brick is added + * and new brick is the new_hashed subvolume. So fresh lookup on newly added + * hashed subvolume will fail and dht_lookup_everywhere gets called.  
If just + * before sending the dht_lookup_everywhere request rebalance is in progress, + * + * from cached subvolume it may see: Nonlinkto or linkto or No file + * from hashed subvolume it may see: No file or linkto file or non-linkto file + * + * So this boils down to 9 cases: + *   at cached_subvol            at hashed_subvol + *   ----------------           ----------------- + * + *a)   No file                     No file + *    [request reached after    [Request reached before + *       migration]                Migration] + * + *b)   No file                     Linkto File + * + *c)   No file                     Non-Linkto File + * + *d)   Linkto                      No-File + * + *e)   Linkto                      Linkto + * + *f)   Linkto                      Non-Linkto + * + *g)   NonLinkto                   No-File + * + *h)   NonLinkto                   Linkto + * + *i)   NonLinkto                   NonLinkto + * + * dht_lookup_everywhere_done takes decision based on any of the above case + */  int  dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this) @@ -780,6 +972,7 @@ dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this)          xlator_t     *hashed_subvol = NULL;          xlator_t     *cached_subvol = NULL;          dht_layout_t *layout = NULL; +        gf_boolean_t  found_non_linkto_on_hashed = _gf_false;          local = frame->local;          hashed_subvol = local->hashed_subvol; @@ -801,19 +994,210 @@ dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this)                  return 0;          } +        gf_log (this->name, GF_LOG_INFO, "STATUS: hashed_subvol %s " +                "cached_subvol %s", +                (hashed_subvol == NULL)?"null":hashed_subvol->name, +                (cached_subvol == NULL)?"null":cached_subvol->name); +          if (!cached_subvol) { -                DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, NULL, NULL, NULL, -                                  NULL); + +                if 
(local->skip_unlink.handle_valid_link && hashed_subvol) { + +                        /*Purpose of "DHT_SKIP_NON_LINKTO_UNLINK": +                         * If this lookup is performed by rebalance and this +                         * rebalance process detected hashed file and by +                         * the time it sends the lookup request to cached node, +                         * file got migrated and now at intial hashed_node, +                         * final migrated file is present. With current logic, +                         * because this process fails to find the cached_node, +                         * it will unlink the file at initial hashed_node. +                         * +                         * So we avoid this by setting key, and checking at the +                         * posix_unlink that unlink the file only if file is a +                         * linkto file and not a migrated_file. +                         */ + + +                        ret = dht_fill_dict_to_avoid_unlink_of_migrating_file +                              (local->xattr_req); + +                        if (ret) { +                                /* If for some reason, setting key in the dict +                                 * fails, return with ENOENT, as with respect to +                                 * this process, it detected only a stale link +                                 * file. +                                 * +                                 * Next lookup will delete it. +                                 * +                                 * Performing deletion of stale link file when +                                 * setting key in dict fails, may cause the data +                                 * loss becase of the above mentioned race. 
+                                 */ + + +                                DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, +                                                  NULL, NULL, NULL, NULL); +                        } else { +                               local->skip_unlink.handle_valid_link = _gf_false; + +                               gf_log (this->name, GF_LOG_DEBUG, +                                       "No Cached was found and " +                                       "unlink on hashed was skipped" +                                       " so performing now: %s", +                                       local->loc.path); + +                               STACK_WIND (frame, +                                            dht_lookup_unlink_stale_linkto_cbk, +                                            hashed_subvol, +                                            hashed_subvol->fops->unlink, +                                            &local->loc, 0, local->xattr_req); +                        } + +                } else  { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "There was no cached file and  " +                                "unlink on hashed is not skipped %s", +                                local->loc.path); + +                        DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, NULL, NULL, +                                          NULL, NULL); +                }                  return 0;          } -        if (local->need_lookup_everywhere) { -                if (uuid_compare (local->gfid, local->inode->gfid)) { -                        /* GFID different, return error */ -                        DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, NULL, -                                          NULL, NULL, NULL); -                        return 0; +        /* At the time of dht_lookup, no file was found on hashed and that is +         * why dht_lookup_everywhere is called, but by the time +         * 
dht_lookup_everywhere +         * reached to server, file might have already migrated. In that case we +         * will find a migrated file at the hashed_node. In this case store the +         * layout in context and return successfully. +         */ + +        if (hashed_subvol || local->need_lookup_everywhere) { + +                if (local->need_lookup_everywhere) { + +                        found_non_linkto_on_hashed = _gf_true; + +                } else if ((local->file_count == 1) && +                            (hashed_subvol == cached_subvol)) { + +                        gf_log (this->name, GF_LOG_DEBUG, +                                "found cached file on hashed subvolume " +                                "so store in context and return for %s", +                                local->loc.path); + +                        found_non_linkto_on_hashed = _gf_true;                  } + +                if (found_non_linkto_on_hashed) +                        goto preset_layout; + +        } + + +        if (hashed_subvol) { +                if (local->skip_unlink.handle_valid_link == _gf_true) { +                        if (cached_subvol == local->skip_unlink.hash_links_to) { + +                             if (uuid_compare (local->skip_unlink.cached_gfid, +                                               local->skip_unlink.hashed_gfid)){ + +                                        /*GFID different, return error*/ +                                     DHT_STACK_UNWIND (lookup, frame, -1, +                                                       ESTALE, NULL, NULL, NULL, +                                                       NULL); + + +                                } + +                                ret = dht_layout_preset (this, cached_subvol, +                                                         local->loc.inode); +                                if (ret) { +                                        gf_log (this->name, GF_LOG_INFO, +                   
                             "Could not set pre-set layout " +                                                "for subvolume %s", +                                                cached_subvol->name); +                                } + +                                local->op_ret = (ret == 0) ? ret : -1; +                                local->op_errno = (ret == 0) ? ret : EINVAL; + +                                /* Presence of local->cached_subvol validates +                                 * that lookup from cached node is successful +                                 */ + +                                if (!local->op_ret && local->loc.parent) { +                                        dht_inode_ctx_time_update +                                                (local->loc.parent, this, +                                                 &local->postparent, 1); +                                } + +                                gf_log (this->name, GF_LOG_DEBUG, +                                        "Skipped unlinking linkto file " +                                        "on the hashed subvolume. " +                                        "Returning success as it is a " +                                        "valid linkto file. Path:%s" +                                        ,local->loc.path); + +                                goto unwind_hashed_and_cached; +                        } else { + +                               local->skip_unlink.handle_valid_link = _gf_false; + +                               gf_log (this->name, GF_LOG_DEBUG, +                                       "Linkto file found on hashed " +                                       "subvol " +                                       "and data file found on cached " +                                       "subvolume. 
But linkto points to " +                                       "different cached subvolume (%s) " +                                       "path %s", +                                       local->skip_unlink.hash_links_to->name, +                                       local->loc.path); + +                               if (local->skip_unlink.opend_fd_count == 0) { + + +                          ret = dht_fill_dict_to_avoid_unlink_of_migrating_file +                                  (local->xattr_req); + + +                                        if (ret) { +                                          DHT_STACK_UNWIND (lookup, frame, -1, +                                                            EIO, NULL, NULL, +                                                            NULL, NULL); +                                        } else { +                                                local->call_cnt = 1; +                                                STACK_WIND (frame, +                                          dht_lookup_unlink_of_false_linkto_cbk, +                                                    hashed_subvol, +                                                    hashed_subvol->fops->unlink, +                                                    &local->loc, 0, +                                                    local->xattr_req); +                                        } + +                                        return 0; + +                                } +                        } + +                } +        } + + +preset_layout: + +        if (found_non_linkto_on_hashed) { + +                if (local->need_lookup_everywhere) { +                        if (uuid_compare (local->gfid, local->inode->gfid)) { +                                /* GFID different, return error */ +                                DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, +                                                  NULL, NULL, NULL, NULL); +                             
   return 0; +                        } +                } +                  local->op_ret = 0;                  local->op_errno = 0;                  layout = dht_layout_for_subvol (this, cached_subvol); @@ -890,26 +1274,15 @@ dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this)                                     cached_subvol, hashed_subvol, &local->loc);          return ret; -} - - -int -dht_lookup_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -                       int op_ret, int op_errno, -                       struct iatt *preparent, struct iatt *postparent, -                       dict_t *xdata) -{ -        int  this_call_cnt = 0; - -        this_call_cnt = dht_frame_return (frame); -        if (is_last_call (this_call_cnt)) { -                dht_lookup_everywhere_done (frame, this); -        } +unwind_hashed_and_cached: +        DHT_STRIP_PHASE1_FLAGS (&local->stbuf); +        DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, +                          local->loc.inode, &local->stbuf, local->xattr, +                          &local->postparent);          return 0;  } -  int  dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                             int32_t op_ret, int32_t op_errno, @@ -924,8 +1297,9 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          xlator_t     *subvol        = NULL;          loc_t        *loc           = NULL;          xlator_t     *link_subvol   = NULL; -        int           ret = -1; -        int32_t       fd_count = 0; +        int           ret           = -1; +        int32_t       fd_count      = 0; +        dict_t       *dict_req      = {0};          GF_VALIDATE_OR_GOTO ("dht", frame, out);          GF_VALIDATE_OR_GOTO ("dht", this, out); @@ -939,6 +1313,11 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          prev   = cookie;          subvol = prev->this; +        gf_log 
(this->name, GF_LOG_DEBUG, +                "returned with op_ret %d and op_errno %d (%s) " +                "from subvol %s", op_ret, op_errno, loc->path, +                subvol->name); +          LOCK (&frame->lock);          {                  if (op_ret == -1) { @@ -957,6 +1336,13 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                  }                  is_linkfile = check_is_linkfile (inode, buf, xattr); +                if (is_linkfile) { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "Found linktofile on %s for %s", +                                subvol->name, loc->path); + +                } +                  is_dir = check_is_dir (inode, buf, xattr);                  if (is_linkfile) { @@ -981,18 +1367,26 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                  } else {                          local->file_count++; +                        gf_log (this->name, GF_LOG_DEBUG, +                                "found cached file on %s for %s", +                                subvol->name, loc->path); +                          if (!local->cached_subvol) {                                  /* found one file */                                  dht_iatt_merge (this, &local->stbuf, buf,                                                  subvol);                                  local->xattr = dict_ref (xattr);                                  local->cached_subvol = subvol; +                                  gf_log (this->name, GF_LOG_DEBUG, -                                        "found on %s file %s", +                                        "datafile found on %s file %s",                                          subvol->name, loc->path);                                  dht_iatt_merge (this, &local->postparent,                                                  postparent, subvol); + +                                uuid_copy 
(local->skip_unlink.cached_gfid, +                                           buf->ia_gfid);                          } else {                                  /* This is where we need 'rename' both entries logic */                                  gf_log (this->name, GF_LOG_WARNING, @@ -1009,15 +1403,68 @@ unlock:          if (is_linkfile) {                  ret = dict_get_int32 (xattr, GLUSTERFS_OPEN_FD_COUNT, &fd_count); -                /* Delete the linkfile only if there are no open fds on it. -                   if there is a open-fd, it may be in migration */ -                if (!ret && (fd_count == 0)) { -                        gf_log (this->name, GF_LOG_INFO, -                                "deleting stale linkfile %s on %s", -                                loc->path, subvol->name); -                        STACK_WIND (frame, dht_lookup_unlink_cbk, -                                    subvol, subvol->fops->unlink, loc, 0, NULL); -                        return 0; + +                /*  Any linkto file found on the non-hashed subvolume should +                 *  be unlinked (performed in the "else if" block below) +                 * +                 *  But if a linkto file is found on hashed subvolume, it may be +                 *  pointing to vaild cached node. So unlinking of linkto +                 *  file on hashed subvolume is skipped and inside +                 *  dht_lookup_everywhere_done, checks are performed. If this +                 *  linkto file is found as stale linkto file, it is deleted +                 *  otherwise unlink is skipped. 
+                 */ + +                if (local->hashed_subvol && local->hashed_subvol == subvol) { + +                        local->skip_unlink.handle_valid_link = _gf_true; +                        local->skip_unlink.opend_fd_count = fd_count; +                        local->skip_unlink.hash_links_to = link_subvol; +                        uuid_copy (local->skip_unlink.hashed_gfid, +                                   buf->ia_gfid); + +                        gf_log (this->name, GF_LOG_DEBUG, "Found" +                                " one linkto file on hashed subvol %s " +                                "for %s: Skipping unlinking till " +                                "everywhere_done", subvol->name, +                                loc->path); + +                } else if (!ret && (fd_count == 0)) { + +                        dict_req = dict_new (); + +                        ret = dht_fill_dict_to_avoid_unlink_of_migrating_file +                              (dict_req); + +                        if (ret) { + +                                /* Skip unlinking for dict_failure +                                 *File is found as a linkto file on non-hashed, +                                 *subvolume. In the current implementation, +                                 *finding a linkto-file on non-hashed does not +                                 *always implies that it is stale. So deletion +                                 *of file should be done only when both fd is +                                 *closed and linkto-xattr is set. In case of +                                 *dict_set failure, avoid skipping of file. +                                 *NOTE: dht_frame_return should get called for +                                 *      this block. 
+                                 */ + +                                dict_unref (dict_req); + +                        } else { +                                gf_log (this->name, GF_LOG_INFO, +                                        "attempting deletion of stale linkfile " +                                        "%s on %s", loc->path, subvol->name); + +                                STACK_WIND (frame, dht_lookup_unlink_cbk, +                                            subvol, subvol->fops->unlink, loc, +                                            0, dict_req); + +                                dict_unref (dict_req); + +                                return 0; +                        }                  }          } @@ -1054,6 +1501,9 @@ dht_lookup_everywhere (call_frame_t *frame, xlator_t *this, loc_t *loc)          if (!local->inode)                  local->inode = inode_ref (loc->inode); +        gf_log (this->name, GF_LOG_DEBUG, +                "winding lookup call to %d subvols", call_cnt); +          for (i = 0; i < call_cnt; i++) {                  STACK_WIND (frame, dht_lookup_everywhere_cbk,                              conf->subvolumes[i], @@ -1252,9 +1702,14 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          if (!op_ret && uuid_is_null (local->gfid))                  memcpy (local->gfid, stbuf->ia_gfid, 16); +        gf_log (this->name, GF_LOG_DEBUG, +                      "fresh_lookup returned for %s with op_ret %d and " +                      "op_errno %d", loc->path, op_ret, op_errno); +          if (ENTRY_MISSING (op_ret, op_errno)) {                  gf_log (this->name, GF_LOG_TRACE, "Entry %s missing on subvol"                          " %s", loc->path, prev->this->name); +                  if (conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_ON) {                          local->op_errno = ENOENT;                          dht_lookup_everywhere (frame, this, loc); @@ -1313,13 +1768,17 @@ dht_lookup_cbk 
(call_frame_t *frame, void *cookie, xlator_t *this,          subvol = dht_linkfile_subvol (this, inode, stbuf, xattr);          if (!subvol) { -                gf_log (this->name, GF_LOG_DEBUG, -                        "linkfile not having link subvolume. path=%s", -                        loc->path); +                gf_log (this->name, GF_LOG_INFO, "linkfile not having link " +                        "subvol for %s", loc->path); +                  dht_lookup_everywhere (frame, this, loc);                  return 0;          } +        gf_log (this->name, GF_LOG_DEBUG, +                      "Calling lookup on linkto target %s for path %s", +                      subvol->name, loc->path); +          STACK_WIND (frame, dht_lookup_linkfile_cbk,                      subvol, subvol->fops->lookup,                      &local->loc, local->xattr_req); @@ -1465,6 +1924,13 @@ dht_lookup (call_frame_t *frame, xlator_t *this,                          dht_layout_unref (this, local->layout);                          local->layout = NULL;                          local->cached_subvol = NULL; + +                        gf_log (this->name, GF_LOG_WARNING, +                                "Called revalidate lookup for %s, " +                                "but layout->gen (%d) is less than " +                                "conf->gen (%d), calling fresh_lookup", +                                loc->path, layout->gen, conf->gen); +                          goto do_fresh_lookup;                  } @@ -1521,6 +1987,10 @@ dht_lookup (call_frame_t *frame, xlator_t *this,  		for (i = 0; i < call_cnt; i++) {  			subvol = layout->list[i].xlator; +                        gf_log (this->name, GF_LOG_DEBUG, "calling " +                                "revalidate lookup for %s at %s", +                                loc->path, subvol->name); +  			STACK_WIND (frame, dht_revalidate_cbk,  				    subvol, subvol->fops->lookup,  				    &local->loc, local->xattr_req); @@ -1565,6 +2035,7 @@ 
dht_lookup (call_frame_t *frame, xlator_t *this,                                  "no subvolume in layout for path=%s, "                                  "checking on all the subvols to see if "                                  "it is a directory", loc->path); +                          call_cnt        = conf->subvolume_cnt;                          local->call_cnt = call_cnt; @@ -1575,6 +2046,10 @@ dht_lookup (call_frame_t *frame, xlator_t *this,                                  goto err;                          } +                        gf_log (this->name, GF_LOG_DEBUG, +                                "Found null hashed subvol. Calling lookup" +                                " on all nodes."); +                          for (i = 0; i < call_cnt; i++) {                                  STACK_WIND (frame, dht_lookup_dir_cbk,                                              conf->subvolumes[i], @@ -1584,6 +2059,10 @@ dht_lookup (call_frame_t *frame, xlator_t *this,                          return 0;                  } +                gf_log (this->name, GF_LOG_DEBUG, +                        "Calling fresh lookup for %s on" +                        " %s", loc->path, hashed_subvol->name); +                  STACK_WIND (frame, dht_lookup_cbk,                              hashed_subvol, hashed_subvol->fops->lookup,                              loc, local->xattr_req); diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index 83725f09712..c7f20a28383 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -96,6 +96,15 @@ struct dht_rebalance_ {          dict_t              *xdata;  }; +struct dht_skip_linkto_unlink { + +        gf_boolean_t    handle_valid_link; +        int             opend_fd_count; +        xlator_t        *hash_links_to; +        uuid_t          cached_gfid; +        uuid_t          hashed_gfid; +}; +  struct dht_local {          int                      call_cnt;          
loc_t                    loc; @@ -184,6 +193,9 @@ struct dht_local {          xlator_t        *first_up_subvol;          gf_boolean_t     added_link; + +        struct dht_skip_linkto_unlink  skip_unlink; +  };  typedef struct dht_local dht_local_t; @@ -752,4 +764,12 @@ dht_inodectx_dump (xlator_t *this, inode_t *inode);  int  dht_subvol_status (dht_conf_t *conf, xlator_t *subvol); +void +dht_log_new_layout_for_dir_selfheal (xlator_t *this, loc_t *loc, +                                     dht_layout_t *layout); +int +dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this); + +int +dht_fill_dict_to_avoid_unlink_of_migrating_file (dict_t *dict);  #endif/* _DHT_H */ diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c index d6e34f92036..725e0c8c7b0 100644 --- a/xlators/cluster/dht/src/dht-rebalance.c +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -237,13 +237,15 @@ out:  }  static inline int -__dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struct iatt *stbuf, -                                 dict_t *dict, fd_t **dst_fd) +__dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, +                                 struct iatt *stbuf, dict_t *dict, +                                 fd_t **dst_fd)  { -        xlator_t *this = NULL; -        int       ret  = -1; -        fd_t     *fd   = NULL; -        struct iatt new_stbuf = {0,}; +        xlator_t    *this        = NULL; +        int          ret         = -1; +        fd_t        *fd          = NULL; +        struct iatt  new_stbuf   = {0,}; +        struct iatt  check_stbuf = {0,};          this = THIS; @@ -300,6 +302,46 @@ __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struc                  goto out;          } +        /*Reason of doing lookup after create again: +         *In the create, there is some time-gap between opening fd at the +         *server (posix_layer) and binding it in server 
(incrementing fd count), +         *so if another process sends an unlink in that time-gap, treating the +         *file as a linkto file, the file will be unlinked at the backend because +         *the inode's fd count is still 0. Since further operations are performed +         *on the fd, the migration would appear to succeed but would end with no +         *file at the backend. +         */ + + +        ret = syncop_lookup (to, loc, NULL, &check_stbuf, NULL, NULL); +        if (!ret) { +                if (uuid_compare (stbuf->ia_gfid, check_stbuf.ia_gfid) != 0) { +                        gf_log (this->name, GF_LOG_ERROR, +                                "file %s exists in %s with different gfid, " +                                "found in lookup after create", +                                loc->path, to->name); +                        ret = -1; +                        fd_unref (fd); +                        goto out; +                } + +        } +        /* ret holds a negated errno on failure, hence the -ret compare */ +        if (-ret == ENOENT) { +                gf_log (this->name, GF_LOG_ERROR, +                        "%s: file does not exists " +                        "on %s (%s)", loc->path, to->name, strerror (-ret)); +                ret = -1; +                fd_unref (fd); +                goto out; +        } + +        ret = syncop_fsetxattr (to, fd, dict, 0); +        if (ret < 0) +                gf_log (this->name, GF_LOG_WARNING, +                        "%s: failed to set xattr on %s (%s)", +                        loc->path, to->name, strerror (-ret)); +          ret = syncop_ftruncate (to, fd, stbuf->ia_size);          if (ret < 0)                  gf_log (this->name, GF_LOG_ERROR, @@ -650,17 +692,18 @@ int  dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,                    int flag)  { -        int             ret            = -1; -        struct iatt     new_stbuf      = {0,}; -        struct iatt     stbuf          = {0,}; -        struct iatt     empty_iatt     = {0,}; -        
ia_prot_t       src_ia_prot    = {0,}; -        fd_t           *src_fd         = NULL; -        fd_t           *dst_fd         = NULL; -        dict_t         *dict           = NULL; -        dict_t         *xattr          = NULL; -        dict_t         *xattr_rsp      = NULL; -        int             file_has_holes = 0; +        int          ret                  = -1; +        struct iatt  new_stbuf            = {0,}; +        struct iatt  stbuf                = {0,}; +        struct iatt  empty_iatt           = {0,}; +        ia_prot_t    src_ia_prot          = {0,}; +        fd_t        *src_fd               = NULL; +        fd_t        *dst_fd               = NULL; +        dict_t      *dict                 = NULL; +        dict_t      *xattr                = NULL; +        dict_t      *xattr_rsp            = NULL; +        int          file_has_holes       = 0; +        int          rcvd_enoent_from_src = 0;          gf_log (this->name, GF_LOG_INFO, "%s: attempting to move from %s to %s",                  loc->path, from->name, to->name); @@ -827,15 +870,31 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,          }          /* Do a stat and check the gfid before unlink */ + +        /* +         * Cached file changes its state from non-linkto to linkto file after +         * migrating data. If lookup from any other mount-point is performed, +         * converted-linkto-cached file will be treated as a stale and will be +         * unlinked. But by this time, file is already migrated. 
So further +         * failure because of ENOENT should not be treated as error +         */ +          ret = syncop_stat (from, loc, &empty_iatt);          if (ret) {                  gf_log (this->name, GF_LOG_WARNING,                          "%s: failed to do a stat on %s (%s)", -                        loc->path, from->name, strerror (errno)); -                goto out; +                        loc->path, from->name, strerror (-ret)); +                /* ret is a negated errno; only ENOENT is tolerated here */ +                if (-ret != ENOENT) { +                        ret = -1; +                        goto out; +                } + +                rcvd_enoent_from_src = 1;          } -        if (uuid_compare (empty_iatt.ia_gfid, loc->gfid) == 0) { +        if ((uuid_compare (empty_iatt.ia_gfid, loc->gfid) == 0 ) && +            (!rcvd_enoent_from_src)) {                  /* take out the source from namespace */                  ret = syncop_unlink (from, loc);                  if (ret) { diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c index dc3a709cd26..bf5c188e5ca 100644 --- a/xlators/storage/posix/src/posix.c +++ b/xlators/storage/posix/src/posix.c @@ -1019,20 +1019,60 @@ out:          return 0;  } +int32_t +posix_unlink_gfid_handle_and_entry (xlator_t *this, const char *real_path, +                                    struct iatt *stbuf, int32_t *op_errno) +{ +        int32_t             ret      =   0; + +        /*  Unlink the gfid_handle_first */ + +        if (stbuf && stbuf->ia_nlink == 1) { +                ret = posix_handle_unset (this, stbuf->ia_gfid, NULL); +                if (ret) { +                        gf_log (this->name, GF_LOG_ERROR, +                                "unlink of gfid handle failed for path:%s with " +                                "gfid %s with errno:%s", real_path, +                                uuid_utoa (stbuf->ia_gfid), strerror (errno)); +                } +        } + +        /* Unlink the actual file */ +        ret = sys_unlink (real_path); +        if (ret == -1) { +                if (op_errno) +            
            *op_errno = errno; + +                gf_log (this->name, GF_LOG_ERROR, +                        "unlink of %s failed: %s", real_path, +                        strerror (errno)); +                goto err; +        } + +        return 0; + +err: +        return -1; +}  int32_t  posix_unlink (call_frame_t *frame, xlator_t *this,                loc_t *loc, int xflag, dict_t *xdata)  { -        int32_t               op_ret     = -1; -        int32_t               op_errno   = 0; -        char                 *real_path  = NULL; -        char                 *par_path   = NULL; -        int32_t               fd         = -1; -        struct iatt           stbuf      = {0,}; -        struct posix_private *priv       = NULL; -        struct iatt           preparent  = {0,}; -        struct iatt           postparent = {0,}; +        int32_t                op_ret             = -1; +        int32_t                op_errno           = 0; +        char                   *real_path         = NULL; +        char                   *par_path          = NULL; +        int32_t                fd                 = -1; +        struct iatt            stbuf              = {0,}; +        struct posix_private  *priv               = NULL; +        struct iatt            preparent          = {0,}; +        struct iatt            postparent         = {0,}; +        int32_t                unlink_if_linkto   = 0; +        int32_t                check_open_fd      = 0; +        int32_t                skip_unlink        = 0; +        ssize_t                xattr_size         = -1; +        int32_t                is_dht_linkto_file = 0;          DECLARE_OLD_FS_ID_VAR; @@ -1052,10 +1092,62 @@ posix_unlink (call_frame_t *frame, xlator_t *this,                  goto out;          } -        if (stbuf.ia_nlink == 1) -                posix_handle_unset (this, stbuf.ia_gfid, NULL); -          priv = this->private; + +        op_ret = dict_get_int32 (xdata, DHT_SKIP_OPEN_FD_UNLINK, +        
                         &check_open_fd); + +        if (!op_ret && check_open_fd) { + +                LOCK (&loc->inode->lock); + +                if (loc->inode->fd_count) { +                        skip_unlink = 1; +                } + +                UNLOCK (&loc->inode->lock); + +                gf_log (this->name, GF_LOG_INFO, "open-fd-key-status: " +                        "%"PRIu32" for %s", skip_unlink, real_path); + +                if (skip_unlink) { +                        op_ret = -1; +                        op_errno = EBUSY; +                        goto out; +                } +        } + + +        op_ret = dict_get_int32 (xdata, DHT_SKIP_NON_LINKTO_UNLINK, +                                 &unlink_if_linkto); + +        if (!op_ret && unlink_if_linkto) { + +                LOCK (&loc->inode->lock); + +                xattr_size = sys_lgetxattr (real_path, LINKTO, NULL, 0); + +                if (xattr_size <= 0) { +                        skip_unlink = 1; +                } else { +                       is_dht_linkto_file =  IS_DHT_LINKFILE_MODE (&stbuf); +                       if (!is_dht_linkto_file) +                               skip_unlink = 1; +                } + +                UNLOCK (&loc->inode->lock); + +                gf_log (this->name, GF_LOG_INFO, "linkto_xattr status: " +                        "%"PRIu32" for %s", skip_unlink, real_path); + +                if (skip_unlink) { +                        op_ret = -1; +                        op_errno = EBUSY; +                        goto out; +                } +        } + +          if (priv->background_unlink) {                  if (IA_ISREG (loc->inode->ia_type)) {                          fd = open (real_path, O_RDONLY); @@ -1070,12 +1162,9 @@ posix_unlink (call_frame_t *frame, xlator_t *this,                  }          } -        op_ret = sys_unlink (real_path); +        op_ret =  posix_unlink_gfid_handle_and_entry (this, real_path, &stbuf, +                           
                           &op_errno);          if (op_ret == -1) { -                op_errno = errno; -                gf_log (this->name, GF_LOG_ERROR, -                        "unlink of %s failed: %s", real_path, -                        strerror (op_errno));                  goto out;          } diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h index 58f445c699a..80121c08c8f 100644 --- a/xlators/storage/posix/src/posix.h +++ b/xlators/storage/posix/src/posix.h @@ -49,6 +49,16 @@  #include "posix-aio.h"  #endif +#define VECTOR_SIZE (64 * 1024) /* vector size 64KB */ +#define MAX_NO_VECT 1024 +/* xattr key that posix_unlink probes before removing a DHT linkto file */ +#define LINKTO "trusted.glusterfs.dht.linkto" + +#define POSIX_GFID_HANDLE_SIZE(base_path_len) ((base_path_len) + SLEN("/") \ +                                               + SLEN(GF_HIDDEN_PATH) + SLEN("/") \ +                                               + SLEN("00/")            \ +                                               + SLEN("00/") + SLEN(UUID0_STR) + 1) /* '\0' */ +  /**   * posix_fd - internal structure common to file and directory fd's   */  | 
