diff options
Diffstat (limited to 'xlators')
122 files changed, 3923 insertions, 2152 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 8dbdb572abd..032ab5c8001 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -45,6 +45,41 @@ afr_quorum_errno(afr_private_t *priv) return ENOTCONN; } +gf_boolean_t +afr_is_private_directory(afr_private_t *priv, uuid_t pargfid, const char *name, + pid_t pid) +{ + if (!__is_root_gfid(pargfid)) { + return _gf_false; + } + + if (strcmp(name, GF_REPLICATE_TRASH_DIR) == 0) { + /*For backward compatibility /.landfill is private*/ + return _gf_true; + } + + if (pid == GF_CLIENT_PID_GSYNCD) { + /*geo-rep needs to create/sync private directory on slave because + * it appears in changelog*/ + return _gf_false; + } + + if (pid == GF_CLIENT_PID_GLFS_HEAL || pid == GF_CLIENT_PID_SELF_HEALD) { + if (strcmp(name, priv->anon_inode_name) == 0) { + /* anonymous-inode dir is private*/ + return _gf_true; + } + } else { + if (strncmp(name, AFR_ANON_DIR_PREFIX, strlen(AFR_ANON_DIR_PREFIX)) == + 0) { + /* anonymous-inode dir prefix is private for geo-rep to work*/ + return _gf_true; + } + } + + return _gf_false; +} + void afr_fill_success_replies(afr_local_t *local, afr_private_t *priv, unsigned char *replies) @@ -1197,12 +1232,11 @@ afr_inode_get_readable(call_frame_t *frame, inode_t *inode, xlator_t *this, return 0; } -int +static int afr_inode_split_brain_choice_get(inode_t *inode, xlator_t *this, int *spb_choice) { int ret = -1; - GF_VALIDATE_OR_GOTO(this->name, inode, out); LOCK(&inode->lock); @@ -1214,6 +1248,40 @@ out: return ret; } +/* + * frame is used to get the favourite policy. Since + * afr_inode_split_brain_choice_get was called with afr_open, it is possible to + * have a frame with out local->replies. So in that case, frame is passed as + * null, hence this function will handle the frame NULL case. 
+ */ +int +afr_split_brain_read_subvol_get(inode_t *inode, xlator_t *this, + call_frame_t *frame, int *spb_subvol) +{ + int ret = -1; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + GF_VALIDATE_OR_GOTO("afr", this, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); + GF_VALIDATE_OR_GOTO(this->name, spb_subvol, out); + + priv = this->private; + + ret = afr_inode_split_brain_choice_get(inode, this, spb_subvol); + if (*spb_subvol < 0 && priv->fav_child_policy && frame && frame->local) { + local = frame->local; + *spb_subvol = afr_sh_get_fav_by_policy(this, local->replies, inode, + NULL); + if (*spb_subvol >= 0) { + ret = 0; + } + } + +out: + return ret; +} int afr_inode_read_subvol_set(inode_t *inode, xlator_t *this, unsigned char *data, unsigned char *metadata, int event) @@ -2238,8 +2306,9 @@ afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv, * need is a low probability that multiple clients * won't converge on the same subvolume. 
*/ + gf_uuid_copy(gfid_copy, args->gfid); pid = getpid(); - memcpy(gfid_copy, &pid, sizeof(pid)); + *(pid_t *)gfid_copy ^= pid; } child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) % priv->child_count; @@ -2823,7 +2892,7 @@ afr_attempt_readsubvol_set(call_frame_t *frame, xlator_t *this, { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int spb_choice = -1; + int spb_subvol = -1; int child_count = -1; if (*read_subvol != -1) @@ -2833,10 +2902,10 @@ afr_attempt_readsubvol_set(call_frame_t *frame, xlator_t *this, local = frame->local; child_count = priv->child_count; - afr_inode_split_brain_choice_get(local->inode, this, &spb_choice); - if ((spb_choice >= 0) && + afr_split_brain_read_subvol_get(local->inode, this, frame, &spb_subvol); + if ((spb_subvol >= 0) && (AFR_COUNT(success_replies, child_count) == child_count)) { - *read_subvol = spb_choice; + *read_subvol = spb_subvol; } else if (!priv->quorum_count || frame->root->pid == GF_CLIENT_PID_GLFS_HEAL) { *read_subvol = afr_first_up_child(frame, this); @@ -3635,7 +3704,7 @@ afr_ta_id_file_check(void *opaque) this = opaque; priv = this->private; - ret = afr_fill_ta_loc(this, &loc); + ret = afr_fill_ta_loc(this, &loc, _gf_false); if (ret) { gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, "Failed to populate thin-arbiter loc for: %s.", loc.name); @@ -3945,11 +4014,10 @@ afr_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) return 0; } - if (__is_root_gfid(loc->parent->gfid)) { - if (!strcmp(loc->name, GF_REPLICATE_TRASH_DIR)) { - op_errno = EPERM; - goto out; - } + if (afr_is_private_directory(this->private, loc->parent->gfid, loc->name, + frame->root->pid)) { + op_errno = EPERM; + goto out; } local = AFR_FRAME_INIT(frame, op_errno); @@ -5627,6 +5695,7 @@ afr_priv_dump(xlator_t *this) priv->background_self_heal_count); gf_proc_dump_write("healers", "%d", priv->healers); gf_proc_dump_write("read-hash-mode", "%d", priv->hash_mode); + 
gf_proc_dump_write("use-anonymous-inode", "%d", priv->use_anon_inode); if (priv->quorum_count == AFR_QUORUM_AUTO) { gf_proc_dump_write("quorum-type", "auto"); } else if (priv->quorum_count == 0) { @@ -6620,6 +6689,7 @@ afr_priv_destroy(afr_private_t *priv) GF_FREE(priv->local); GF_FREE(priv->pending_key); GF_FREE(priv->children); + GF_FREE(priv->anon_inode); GF_FREE(priv->child_up); GF_FREE(priv->halo_child_up); GF_FREE(priv->child_latency); diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c index f69013f3e0a..f8bf8340dab 100644 --- a/xlators/cluster/afr/src/afr-dir-read.c +++ b/xlators/cluster/afr/src/afr-dir-read.c @@ -164,8 +164,8 @@ afr_validate_read_subvol(inode_t *inode, xlator_t *this, int par_read_subvol) } static void -afr_readdir_transform_entries(gf_dirent_t *subvol_entries, int subvol, - gf_dirent_t *entries, fd_t *fd) +afr_readdir_transform_entries(call_frame_t *frame, gf_dirent_t *subvol_entries, + int subvol, gf_dirent_t *entries, fd_t *fd) { int ret = -1; gf_dirent_t *entry = NULL; @@ -183,8 +183,8 @@ afr_readdir_transform_entries(gf_dirent_t *subvol_entries, int subvol, list_for_each_entry_safe(entry, tmp, &subvol_entries->list, list) { - if (__is_root_gfid(fd->inode->gfid) && - !strcmp(entry->d_name, GF_REPLICATE_TRASH_DIR)) { + if (afr_is_private_directory(priv, fd->inode->gfid, entry->d_name, + frame->root->pid)) { continue; } @@ -228,8 +228,8 @@ afr_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, } if (op_ret >= 0) - afr_readdir_transform_entries(subvol_entries, (long)cookie, &entries, - local->fd); + afr_readdir_transform_entries(frame, subvol_entries, (long)cookie, + &entries, local->fd); AFR_STACK_UNWIND(readdir, frame, op_ret, op_errno, &entries, xdata); diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c index a5b004f4258..64856042b65 100644 --- a/xlators/cluster/afr/src/afr-open.c +++ b/xlators/cluster/afr/src/afr-open.c @@ -137,7 +137,7 @@ 
afr_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int spb_choice = 0; + int spb_subvol = 0; int event_generation = 0; int ret = 0; int32_t op_errno = 0; @@ -179,9 +179,9 @@ afr_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, ret = afr_inode_get_readable(frame, local->inode, this, NULL, &event_generation, AFR_DATA_TRANSACTION); if ((ret < 0) && - (afr_inode_split_brain_choice_get(local->inode, this, &spb_choice) == - 0) && - spb_choice < 0) { + (afr_split_brain_read_subvol_get(local->inode, this, NULL, + &spb_subvol) == 0) && + spb_subvol < 0) { afr_inode_refresh(frame, this, local->inode, local->inode->gfid, afr_open_continue); } else { diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c index 772b59f9a2f..6fc2c75145c 100644 --- a/xlators/cluster/afr/src/afr-read-txn.c +++ b/xlators/cluster/afr/src/afr-read-txn.c @@ -164,7 +164,7 @@ afr_ta_read_txn(void *opaque) xdata_rsp = NULL; /* It doesn't. So query thin-arbiter to see if it blames any data brick. 
*/ - ret = afr_fill_ta_loc(this, &loc); + ret = afr_fill_ta_loc(this, &loc, _gf_true); if (ret) { gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, "Failed to populate thin-arbiter loc for: %s.", loc.name); @@ -272,7 +272,7 @@ afr_read_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err) int read_subvol = -1; inode_t *inode = NULL; int ret = -1; - int spb_choice = -1; + int spb_subvol = -1; local = frame->local; inode = local->inode; @@ -303,9 +303,9 @@ afr_read_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err) local->read_attempted[read_subvol] = 1; readfn: if (read_subvol == -1) { - ret = afr_inode_split_brain_choice_get(inode, this, &spb_choice); - if ((ret == 0) && spb_choice >= 0) - read_subvol = spb_choice; + ret = afr_split_brain_read_subvol_get(inode, this, frame, &spb_subvol); + if ((ret == 0) && spb_subvol >= 0) + read_subvol = spb_subvol; } if (read_subvol == -1) { diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index f35c41df274..a580a1584cc 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -2750,3 +2750,185 @@ afr_choose_source_by_policy(afr_private_t *priv, unsigned char *sources, out: return source; } + +static int +afr_anon_inode_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + afr_local_t *local = frame->local; + int i = (long)cookie; + + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (op_ret == 0) { + local->op_ret = 0; + local->replies[i].poststat = *buf; + local->replies[i].preparent = *preparent; + local->replies[i].postparent = *postparent; + } + if (xdata) { + local->replies[i].xdata = dict_ref(xdata); + } + + syncbarrier_wake(&local->barrier); + return 0; +} + +int 
+afr_anon_inode_create(xlator_t *this, int child, inode_t **linked_inode) +{ + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = this->private; + unsigned char *mkdir_on = alloca0(priv->child_count); + unsigned char *lookup_on = alloca0(priv->child_count); + loc_t loc = {0}; + int32_t op_errno = 0; + int32_t child_op_errno = 0; + struct iatt iatt = {0}; + dict_t *xdata = NULL; + uuid_t anon_inode_gfid = {0}; + int mkdir_count = 0; + int i = 0; + + /*Try to mkdir everywhere and return success if the dir exists on 'child' + */ + + if (!priv->use_anon_inode) { + op_errno = EINVAL; + goto out; + } + + frame = afr_frame_create(this, &op_errno); + if (op_errno) { + goto out; + } + local = frame->local; + if (!local->child_up[child]) { + /*Other bricks may need mkdir so don't error out yet*/ + child_op_errno = ENOTCONN; + } + gf_uuid_parse(priv->anon_gfid_str, anon_inode_gfid); + for (i = 0; i < priv->child_count; i++) { + if (!local->child_up[i]) + continue; + + if (priv->anon_inode[i]) { + mkdir_on[i] = 0; + } else { + mkdir_on[i] = 1; + mkdir_count++; + } + } + + if (mkdir_count == 0) { + *linked_inode = inode_find(this->itable, anon_inode_gfid); + if (*linked_inode) { + op_errno = 0; + goto out; + } + } + + loc.parent = inode_ref(this->itable->root); + loc.name = priv->anon_inode_name; + loc.inode = inode_new(this->itable); + if (!loc.inode) { + op_errno = ENOMEM; + goto out; + } + + xdata = dict_new(); + if (!xdata) { + op_errno = ENOMEM; + goto out; + } + + op_errno = -dict_set_gfuuid(xdata, "gfid-req", anon_inode_gfid, _gf_true); + if (op_errno) { + goto out; + } + + if (mkdir_count == 0) { + memcpy(lookup_on, local->child_up, priv->child_count); + goto lookup; + } + + AFR_ONLIST(mkdir_on, frame, afr_anon_inode_mkdir_cbk, mkdir, &loc, 0755, 0, + xdata); + + for (i = 0; i < priv->child_count; i++) { + if (!mkdir_on[i]) { + continue; + } + + if (local->replies[i].op_ret == 0) { + priv->anon_inode[i] = 1; + iatt = 
local->replies[i].poststat; + } else if (local->replies[i].op_ret < 0 && + local->replies[i].op_errno == EEXIST) { + lookup_on[i] = 1; + } else if (i == child) { + child_op_errno = local->replies[i].op_errno; + } + } + + if (AFR_COUNT(lookup_on, priv->child_count) == 0) { + goto link; + } + +lookup: + AFR_ONLIST(lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc, + xdata); + for (i = 0; i < priv->child_count; i++) { + if (!lookup_on[i]) { + continue; + } + + if (local->replies[i].op_ret == 0) { + if (gf_uuid_compare(anon_inode_gfid, + local->replies[i].poststat.ia_gfid) == 0) { + priv->anon_inode[i] = 1; + iatt = local->replies[i].poststat; + } else { + if (i == child) + child_op_errno = EINVAL; + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_DATA, + "%s has gfid: %s", priv->anon_inode_name, + uuid_utoa(local->replies[i].poststat.ia_gfid)); + } + } else if (i == child) { + child_op_errno = local->replies[i].op_errno; + } + } +link: + if (!gf_uuid_is_null(iatt.ia_gfid)) { + *linked_inode = inode_link(loc.inode, loc.parent, loc.name, &iatt); + if (*linked_inode) { + op_errno = 0; + inode_lookup(*linked_inode); + } else { + op_errno = ENOMEM; + } + goto out; + } + +out: + if (xdata) + dict_unref(xdata); + loc_wipe(&loc); + /*child_op_errno takes precedence*/ + if (child_op_errno == 0) { + child_op_errno = op_errno; + } + + if (child_op_errno && *linked_inode) { + inode_unref(*linked_inode); + *linked_inode = NULL; + } + if (frame) + AFR_STACK_DESTROY(frame); + return -child_op_errno; +} diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c index ac31751997f..64893f441e3 100644 --- a/xlators/cluster/afr/src/afr-self-heal-entry.c +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -16,54 +16,170 @@ #include <glusterfs/syncop-utils.h> #include <glusterfs/events.h> -static int -afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name, - inode_t *inode, int child, struct afr_reply 
*replies) +int +afr_selfheal_entry_anon_inode(xlator_t *this, inode_t *dir, const char *name, + inode_t *inode, int child, + struct afr_reply *replies, + gf_boolean_t *anon_inode) { afr_private_t *priv = NULL; + afr_local_t *local = NULL; xlator_t *subvol = NULL; int ret = 0; + int i = 0; + char g[64] = {0}; + unsigned char *lookup_success = NULL; + call_frame_t *frame = NULL; + loc_t loc2 = { + 0, + }; loc_t loc = { 0, }; - char g[64]; priv = this->private; - subvol = priv->children[child]; + lookup_success = alloca0(priv->child_count); + uuid_utoa_r(replies[child].poststat.ia_gfid, g); + loc.inode = inode_new(inode->table); + if (!loc.inode) { + ret = -ENOMEM; + goto out; + } + + if (replies[child].poststat.ia_type == IA_IFDIR) { + /* This directory may have sub-directory hierarchy which may need to + * be preserved for subsequent heals. So unconditionally move the + * directory to anonymous-inode directory*/ + *anon_inode = _gf_true; + goto anon_inode; + } + + frame = afr_frame_create(this, &ret); + if (!frame) { + ret = -ret; + goto out; + } + local = frame->local; + gf_uuid_copy(loc.gfid, replies[child].poststat.ia_gfid); + AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup, &loc, + NULL); + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].op_ret == 0) { + lookup_success[i] = 1; + } else if (local->replies[i].op_errno != ENOENT && + local->replies[i].op_errno != ESTALE) { + ret = -local->replies[i].op_errno; + } + } + + if (priv->quorum_count) { + if (afr_has_quorum(lookup_success, this, NULL)) { + *anon_inode = _gf_true; + } + } else if (AFR_COUNT(lookup_success, priv->child_count) > 1) { + *anon_inode = _gf_true; + } else if (ret) { + goto out; + } + +anon_inode: + if (!*anon_inode) { + ret = 0; + goto out; + } loc.parent = inode_ref(dir); gf_uuid_copy(loc.pargfid, dir->gfid); loc.name = name; - loc.inode = inode_ref(inode); - if (replies[child].valid && replies[child].op_ret == 0) { - switch 
(replies[child].poststat.ia_type) { - case IA_IFDIR: - gf_msg(this->name, GF_LOG_WARNING, 0, - AFR_MSG_EXPUNGING_FILE_OR_DIR, - "expunging dir %s/%s (%s) on %s", uuid_utoa(dir->gfid), - name, uuid_utoa_r(replies[child].poststat.ia_gfid, g), - subvol->name); - ret = syncop_rmdir(subvol, &loc, 1, NULL, NULL); - break; - default: - gf_msg(this->name, GF_LOG_WARNING, 0, - AFR_MSG_EXPUNGING_FILE_OR_DIR, - "expunging file %s/%s (%s) on %s", uuid_utoa(dir->gfid), - name, uuid_utoa_r(replies[child].poststat.ia_gfid, g), - subvol->name); - ret = syncop_unlink(subvol, &loc, NULL, NULL); - break; - } + ret = afr_anon_inode_create(this, child, &loc2.parent); + if (ret < 0) + goto out; + + loc2.name = g; + ret = syncop_rename(subvol, &loc, &loc2, NULL, NULL); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_EXPUNGING_FILE_OR_DIR, + "Rename to %s dir %s/%s (%s) on %s failed", + priv->anon_inode_name, uuid_utoa(dir->gfid), name, g, + subvol->name); + } else { + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR, + "Rename to %s dir %s/%s (%s) on %s successful", + priv->anon_inode_name, uuid_utoa(dir->gfid), name, g, + subvol->name); } +out: loc_wipe(&loc); + loc_wipe(&loc2); + if (frame) { + AFR_STACK_DESTROY(frame); + } return ret; } int +afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name, + inode_t *inode, int child, struct afr_reply *replies) +{ + char g[64] = {0}; + afr_private_t *priv = NULL; + xlator_t *subvol = NULL; + int ret = 0; + loc_t loc = { + 0, + }; + gf_boolean_t anon_inode = _gf_false; + + priv = this->private; + subvol = priv->children[child]; + + if ((!replies[child].valid) || (replies[child].op_ret < 0)) { + /*Nothing to do*/ + ret = 0; + goto out; + } + + if (priv->use_anon_inode) { + ret = afr_selfheal_entry_anon_inode(this, dir, name, inode, child, + replies, &anon_inode); + if (ret < 0 || anon_inode) + goto out; + } + + loc.parent = inode_ref(dir); + loc.inode = inode_new(inode->table); + if 
(!loc.inode) { + ret = -ENOMEM; + goto out; + } + loc.name = name; + switch (replies[child].poststat.ia_type) { + case IA_IFDIR: + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR, + "expunging dir %s/%s (%s) on %s", uuid_utoa(dir->gfid), name, + uuid_utoa_r(replies[child].poststat.ia_gfid, g), + subvol->name); + ret = syncop_rmdir(subvol, &loc, 1, NULL, NULL); + break; + default: + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR, + "expunging file %s/%s (%s) on %s", uuid_utoa(dir->gfid), + name, uuid_utoa_r(replies[child].poststat.ia_gfid, g), + subvol->name); + ret = syncop_unlink(subvol, &loc, NULL, NULL); + break; + } + +out: + loc_wipe(&loc); + return ret; +} + +int afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source, unsigned char *sources, inode_t *dir, const char *name, inode_t *inode, @@ -76,6 +192,9 @@ afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source, loc_t srcloc = { 0, }; + loc_t anonloc = { + 0, + }; xlator_t *this = frame->this; afr_private_t *priv = NULL; dict_t *xdata = NULL; @@ -86,15 +205,17 @@ afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source, 0, }; unsigned char *newentry = NULL; - char dir_uuid_str[64] = {0}, iatt_uuid_str[64] = {0}; + char iatt_uuid_str[64] = {0}; + char dir_uuid_str[64] = {0}; priv = this->private; iatt = &replies[source].poststat; + uuid_utoa_r(iatt->ia_gfid, iatt_uuid_str); if (iatt->ia_type == IA_INVAL || gf_uuid_is_null(iatt->ia_gfid)) { gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SELF_HEAL_FAILED, "Invalid ia_type (%d) or gfid(%s). 
source brick=%d, " "pargfid=%s, name=%s", - iatt->ia_type, uuid_utoa_r(iatt->ia_gfid, iatt_uuid_str), source, + iatt->ia_type, iatt_uuid_str, source, uuid_utoa_r(dir->gfid, dir_uuid_str), name); ret = -EINVAL; goto out; @@ -120,14 +241,24 @@ afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source, srcloc.inode = inode_ref(inode); gf_uuid_copy(srcloc.gfid, iatt->ia_gfid); - if (iatt->ia_type != IA_IFDIR) - ret = syncop_lookup(priv->children[dst], &srcloc, 0, 0, 0, 0); - if (iatt->ia_type == IA_IFDIR || ret == -ENOENT || ret == -ESTALE) { + ret = syncop_lookup(priv->children[dst], &srcloc, 0, 0, 0, 0); + if (ret == -ENOENT || ret == -ESTALE) { newentry[dst] = 1; ret = afr_selfheal_newentry_mark(frame, this, inode, source, replies, sources, newentry); if (ret) goto out; + } else if (ret == 0 && iatt->ia_type == IA_IFDIR && priv->use_anon_inode) { + // Try rename from hidden directory + ret = afr_anon_inode_create(this, dst, &anonloc.parent); + if (ret < 0) + goto out; + anonloc.inode = inode_ref(inode); + anonloc.name = iatt_uuid_str; + ret = syncop_rename(priv->children[dst], &anonloc, &loc, NULL, NULL); + if (ret == -ENOENT || ret == -ESTALE) + ret = -1; /*This sets 'mismatch' to true*/ + goto out; } mode = st_mode_from_ia(iatt->ia_prot, iatt->ia_type); @@ -166,6 +297,7 @@ out: GF_FREE(linkname); loc_wipe(&loc); loc_wipe(&srcloc); + loc_wipe(&anonloc); return ret; } @@ -578,6 +710,11 @@ afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd, priv = this->private; + if (afr_is_private_directory(priv, fd->inode->gfid, name, + GF_CLIENT_PID_SELF_HEALD)) { + return 0; + } + xattr = dict_new(); if (!xattr) return -ENOMEM; @@ -626,7 +763,7 @@ afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd, replies); if ((ret == 0) && (priv->esh_granular) && parent_idx_inode) { - ret = afr_shd_index_purge(subvol, parent_idx_inode, name, + ret = afr_shd_entry_purge(subvol, parent_idx_inode, name, inode->ia_type); /* Why is ret force-set 
to 0? We do not care about * index purge failing for full heal as it is quite @@ -756,10 +893,6 @@ afr_selfheal_entry_do_subvol(call_frame_t *frame, xlator_t *this, fd_t *fd, if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..")) continue; - if (__is_root_gfid(fd->inode->gfid) && - !strcmp(entry->d_name, GF_REPLICATE_TRASH_DIR)) - continue; - ret = afr_selfheal_entry_dirent(iter_frame, this, fd, entry->d_name, loc.inode, subvol, local->need_full_crawl); @@ -822,7 +955,7 @@ afr_selfheal_entry_granular_dirent(xlator_t *subvol, gf_dirent_t *entry, /* The name indices under the pgfid index dir are guaranteed * to be regular files. Hence the hardcoding. */ - afr_shd_index_purge(subvol, parent->inode, entry->d_name, IA_IFREG); + afr_shd_entry_purge(subvol, parent->inode, entry->d_name, IA_IFREG); ret = 0; goto out; } diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c index dd40c57ab12..834aac86d48 100644 --- a/xlators/cluster/afr/src/afr-self-heal-name.c +++ b/xlators/cluster/afr/src/afr-self-heal-name.c @@ -98,21 +98,12 @@ __afr_selfheal_name_expunge(xlator_t *this, inode_t *parent, uuid_t pargfid, const char *bname, inode_t *inode, struct afr_reply *replies) { - loc_t loc = { - 0, - }; int i = 0; afr_private_t *priv = NULL; - char g[64]; int ret = 0; priv = this->private; - loc.parent = inode_ref(parent); - gf_uuid_copy(loc.pargfid, pargfid); - loc.name = bname; - loc.inode = inode_ref(inode); - for (i = 0; i < priv->child_count; i++) { if (!replies[i].valid) continue; @@ -120,30 +111,10 @@ __afr_selfheal_name_expunge(xlator_t *this, inode_t *parent, uuid_t pargfid, if (replies[i].op_ret) continue; - switch (replies[i].poststat.ia_type) { - case IA_IFDIR: - gf_msg(this->name, GF_LOG_WARNING, 0, - AFR_MSG_EXPUNGING_FILE_OR_DIR, - "expunging dir %s/%s (%s) on %s", uuid_utoa(pargfid), - bname, uuid_utoa_r(replies[i].poststat.ia_gfid, g), - priv->children[i]->name); - - ret |= syncop_rmdir(priv->children[i], 
&loc, 1, NULL, NULL); - break; - default: - gf_msg(this->name, GF_LOG_WARNING, 0, - AFR_MSG_EXPUNGING_FILE_OR_DIR, - "expunging file %s/%s (%s) on %s", uuid_utoa(pargfid), - bname, uuid_utoa_r(replies[i].poststat.ia_gfid, g), - priv->children[i]->name); - - ret |= syncop_unlink(priv->children[i], &loc, NULL, NULL); - break; - } + ret |= afr_selfheal_entry_delete(this, parent, bname, inode, i, + replies); } - loc_wipe(&loc); - return ret; } diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h index 7a038fa7fe3..48e6dbcfb18 100644 --- a/xlators/cluster/afr/src/afr-self-heal.h +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -369,4 +369,9 @@ gf_boolean_t afr_is_file_empty_on_all_children(afr_private_t *priv, struct afr_reply *replies); +int +afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name, + inode_t *inode, int child, struct afr_reply *replies); +int +afr_anon_inode_create(xlator_t *this, int child, inode_t **linked_inode); #endif /* !_AFR_SELFHEAL_H */ diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c index 2219a53b277..109fd4b7421 100644 --- a/xlators/cluster/afr/src/afr-self-heald.c +++ b/xlators/cluster/afr/src/afr-self-heald.c @@ -94,7 +94,7 @@ __afr_shd_healer_wait(struct subvol_healer *healer) priv = healer->this->private; disabled_loop: - wait_till.tv_sec = time(NULL) + priv->shd.timeout; + wait_till.tv_sec = gf_time() + priv->shd.timeout; while (!healer->rerun) { ret = pthread_cond_timedwait(&healer->cond, &healer->mutex, &wait_till); @@ -222,7 +222,7 @@ out: } int -afr_shd_index_purge(xlator_t *subvol, inode_t *inode, char *name, +afr_shd_entry_purge(xlator_t *subvol, inode_t *inode, char *name, ia_type_t type) { int ret = 0; @@ -371,7 +371,7 @@ afr_shd_sweep_prepare(struct subvol_healer *healer) event->split_brain_count = 0; event->heal_failed_count = 0; - time(&event->start_time); + event->start_time = gf_time(); event->end_time = 0; 
_mask_cancellation(); } @@ -386,7 +386,7 @@ afr_shd_sweep_done(struct subvol_healer *healer) event = &healer->crawl_event; shd = &(((afr_private_t *)healer->this->private)->shd); - time(&event->end_time); + event->end_time = gf_time(); history = gf_memdup(event, sizeof(*event)); event->start_time = 0; @@ -424,7 +424,7 @@ afr_shd_index_heal(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent, ret = afr_shd_selfheal(healer, healer->subvol, gfid); if (ret == -ENOENT || ret == -ESTALE) - afr_shd_index_purge(subvol, parent->inode, entry->d_name, val); + afr_shd_entry_purge(subvol, parent->inode, entry->d_name, val); if (ret == 2) /* If bricks crashed in pre-op after creating indices/xattrop @@ -843,6 +843,176 @@ out: return need_heal; } +static int +afr_shd_anon_inode_cleaner(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent, + void *data) +{ + struct subvol_healer *healer = data; + afr_private_t *priv = healer->this->private; + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + int ret = 0; + loc_t loc = {0}; + int count = 0; + int i = 0; + int op_errno = 0; + struct iatt *iatt = NULL; + gf_boolean_t multiple_links = _gf_false; + unsigned char *gfid_present = alloca0(priv->child_count); + unsigned char *entry_present = alloca0(priv->child_count); + char *type = "file"; + + frame = afr_frame_create(healer->this, &ret); + if (!frame) { + ret = -ret; + goto out; + } + local = frame->local; + if (AFR_COUNT(local->child_up, priv->child_count) != priv->child_count) { + gf_msg_debug(healer->this->name, 0, + "Not all bricks are up. 
Skipping " + "cleanup of %s on %s", + entry->d_name, subvol->name); + ret = 0; + goto out; + } + + loc.inode = inode_new(parent->inode->table); + if (!loc.inode) { + ret = -ENOMEM; + goto out; + } + ret = gf_uuid_parse(entry->d_name, loc.gfid); + if (ret) { + ret = 0; + goto out; + } + AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup, &loc, + NULL); + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].op_ret == 0) { + count++; + gfid_present[i] = 1; + iatt = &local->replies[i].poststat; + if (iatt->ia_type == IA_IFDIR) { + type = "dir"; + } + + if (i == healer->subvol) { + if (local->replies[i].poststat.ia_nlink > 1) { + multiple_links = _gf_true; + } + } + } else if (local->replies[i].op_errno != ENOENT && + local->replies[i].op_errno != ESTALE) { + /*We don't have complete view. Skip the entry*/ + gf_msg_debug(healer->this->name, local->replies[i].op_errno, + "Skipping cleanup of %s on %s", entry->d_name, + subvol->name); + ret = 0; + goto out; + } + } + + /*Inode is deleted from subvol*/ + if (count == 1 || (iatt->ia_type != IA_IFDIR && multiple_links)) { + gf_msg(healer->this->name, GF_LOG_WARNING, 0, + AFR_MSG_EXPUNGING_FILE_OR_DIR, "expunging %s %s/%s on %s", type, + priv->anon_inode_name, entry->d_name, subvol->name); + ret = afr_shd_entry_purge(subvol, parent->inode, entry->d_name, + iatt->ia_type); + if (ret == -ENOENT || ret == -ESTALE) + ret = 0; + } else if (count > 1) { + loc_wipe(&loc); + loc.parent = inode_ref(parent->inode); + loc.name = entry->d_name; + loc.inode = inode_new(parent->inode->table); + if (!loc.inode) { + ret = -ENOMEM; + goto out; + } + AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup, + &loc, NULL); + count = 0; + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].op_ret == 0) { + count++; + entry_present[i] = 1; + iatt = &local->replies[i].poststat; + } else if (local->replies[i].op_errno != ENOENT && + local->replies[i].op_errno != ESTALE) { + /*We don't have 
complete view. Skip the entry*/ + gf_msg_debug(healer->this->name, local->replies[i].op_errno, + "Skipping cleanup of %s on %s", entry->d_name, + subvol->name); + ret = 0; + goto out; + } + } + for (i = 0; i < priv->child_count; i++) { + if (gfid_present[i] && !entry_present[i]) { + /*Entry is not anonymous on at least one subvol*/ + gf_msg_debug(healer->this->name, 0, + "Valid entry present on %s " + "Skipping cleanup of %s on %s", + priv->children[i]->name, entry->d_name, + subvol->name); + ret = 0; + goto out; + } + } + + gf_msg(healer->this->name, GF_LOG_WARNING, 0, + AFR_MSG_EXPUNGING_FILE_OR_DIR, + "expunging %s %s/%s on all subvols", type, priv->anon_inode_name, + entry->d_name); + ret = 0; + for (i = 0; i < priv->child_count; i++) { + op_errno = -afr_shd_entry_purge(priv->children[i], loc.parent, + entry->d_name, iatt->ia_type); + if (op_errno != ENOENT && op_errno != ESTALE) { + ret |= -op_errno; + } + } + } + +out: + if (frame) + AFR_STACK_DESTROY(frame); + loc_wipe(&loc); + return ret; +} + +static void +afr_cleanup_anon_inode_dir(struct subvol_healer *healer) +{ + int ret = 0; + call_frame_t *frame = NULL; + afr_private_t *priv = healer->this->private; + loc_t loc = {0}; + + ret = afr_anon_inode_create(healer->this, healer->subvol, &loc.inode); + if (ret) + goto out; + + frame = afr_frame_create(healer->this, &ret); + if (!frame) { + ret = -ret; + goto out; + } + + ret = syncop_mt_dir_scan(frame, priv->children[healer->subvol], &loc, + GF_CLIENT_PID_SELF_HEALD, healer, + afr_shd_anon_inode_cleaner, NULL, + priv->shd.max_threads, priv->shd.wait_qlength); +out: + if (frame) + AFR_STACK_DESTROY(frame); + loc_wipe(&loc); + return; +} + void * afr_shd_index_healer(void *data) { @@ -900,6 +1070,10 @@ afr_shd_index_healer(void *data) sleep(1); } while (ret > 0); + if (ret == 0) { + afr_cleanup_anon_inode_dir(healer); + } + if (ret == 0 && pre_crawl_xdata && !healer->crawl_event.heal_failed_count) { afr_shd_ta_check_and_unset_xattrs(this, &loc, healer, diff 
--git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h index 687c28e6472..18db728ea7b 100644 --- a/xlators/cluster/afr/src/afr-self-heald.h +++ b/xlators/cluster/afr/src/afr-self-heald.h @@ -70,6 +70,6 @@ afr_shd_gfid_to_path(xlator_t *this, xlator_t *subvol, uuid_t gfid, char **path_p); int -afr_shd_index_purge(xlator_t *subvol, inode_t *inode, char *name, +afr_shd_entry_purge(xlator_t *subvol, inode_t *inode, char *name, ia_type_t type); #endif /* !_AFR_SELF_HEALD_H */ diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index 67c3e0699e6..a51f79b1f43 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -124,9 +124,9 @@ afr_release_notify_lock_for_ta(void *opaque) this = (xlator_t *)opaque; priv = this->private; - ret = afr_fill_ta_loc(this, &loc); + ret = afr_fill_ta_loc(this, &loc, _gf_true); if (ret) { - gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, "Failed to populate loc for thin-arbiter."); goto out; } @@ -1029,7 +1029,7 @@ set_response: } int -afr_fill_ta_loc(xlator_t *this, loc_t *loc) +afr_fill_ta_loc(xlator_t *this, loc_t *loc, gf_boolean_t is_gfid_based_fop) { afr_private_t *priv = NULL; @@ -1037,6 +1037,11 @@ afr_fill_ta_loc(xlator_t *this, loc_t *loc) loc->parent = inode_ref(priv->root_inode); gf_uuid_copy(loc->pargfid, loc->parent->gfid); loc->name = priv->pending_key[THIN_ARBITER_BRICK_INDEX]; + if (is_gfid_based_fop && gf_uuid_is_null(priv->ta_gfid)) { + /* Except afr_ta_id_file_check() which is path based, all other gluster + * FOPS need gfid.*/ + return -EINVAL; + } gf_uuid_copy(loc->gfid, priv->ta_gfid); loc->inode = inode_new(loc->parent->table); if (!loc->inode) { @@ -1046,86 +1051,6 @@ afr_fill_ta_loc(xlator_t *this, loc_t *loc) return 0; } -int -afr_changelog_thin_arbiter_post_op(xlator_t *this, afr_local_t *local) -{ - int ret = 0; - 
afr_private_t *priv = NULL; - dict_t *xattr = NULL; - int failed_count = 0; - struct gf_flock flock = { - 0, - }; - loc_t loc = { - 0, - }; - int i = 0; - - priv = this->private; - if (!priv->thin_arbiter_count) - return 0; - - failed_count = AFR_COUNT(local->transaction.failed_subvols, - priv->child_count); - if (!failed_count) - return 0; - - GF_ASSERT(failed_count == 1); - ret = afr_fill_ta_loc(this, &loc); - if (ret) { - gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, - "Failed to populate thin-arbiter loc for: %s.", loc.name); - goto out; - } - - xattr = dict_new(); - if (!xattr) { - ret = -ENOMEM; - goto out; - } - for (i = 0; i < priv->child_count; i++) { - ret = dict_set_static_bin(xattr, priv->pending_key[i], - local->pending[i], - AFR_NUM_CHANGE_LOGS * sizeof(int)); - if (ret) - goto out; - } - - flock.l_type = F_WRLCK; - flock.l_start = 0; - flock.l_len = 0; - - /*TODO: Convert to two domain locking. */ - ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], - AFR_TA_DOM_NOTIFY, &loc, F_SETLKW, &flock, NULL, NULL); - if (ret) - goto out; - - ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], &loc, - GF_XATTROP_ADD_ARRAY, xattr, NULL, NULL, NULL); - - if (ret == -EINVAL) { - gf_msg(this->name, GF_LOG_INFO, -ret, AFR_MSG_THIN_ARB, - "Thin-arbiter has denied post-op on %s for gfid %s.", - priv->pending_key[THIN_ARBITER_BRICK_INDEX], - uuid_utoa(local->inode->gfid)); - - } else if (ret) { - gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, - "Post-op on thin-arbiter id file %s failed for gfid %s.", - priv->pending_key[THIN_ARBITER_BRICK_INDEX], - uuid_utoa(local->inode->gfid)); - } - flock.l_type = F_UNLCK; - syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], AFR_TA_DOM_NOTIFY, - &loc, F_SETLK, &flock, NULL, NULL); -out: - if (xattr) - dict_unref(xattr); - - return ret; -} - static int afr_ta_post_op_done(int ret, call_frame_t *frame, void *opaque) { @@ -1220,9 +1145,9 @@ afr_ta_post_op_do(void *opaque) this = 
local->transaction.frame->this; priv = this->private; - ret = afr_fill_ta_loc(this, &loc); + ret = afr_fill_ta_loc(this, &loc, _gf_true); if (ret) { - gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, "Failed to populate loc for thin-arbiter."); goto out; } diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 67e0a4d10be..df7366f0a65 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -135,6 +135,27 @@ set_data_self_heal_algorithm(afr_private_t *priv, char *algo) } } +void +afr_handle_anon_inode_options(afr_private_t *priv, dict_t *options) +{ + char *volfile_id_str = NULL; + uuid_t anon_inode_gfid = {0}; + + /*If volume id is not present don't enable anything*/ + if (dict_get_str(options, "volume-id", &volfile_id_str)) + return; + GF_ASSERT(strlen(AFR_ANON_DIR_PREFIX) + strlen(volfile_id_str) <= NAME_MAX); + /*anon_inode_name is not supposed to change once assigned*/ + if (!priv->anon_inode_name[0]) { + snprintf(priv->anon_inode_name, sizeof(priv->anon_inode_name), "%s-%s", + AFR_ANON_DIR_PREFIX, volfile_id_str); + gf_uuid_parse(volfile_id_str, anon_inode_gfid); + /*Flip a bit to make sure volfile-id and anon-gfid are not same*/ + anon_inode_gfid[0] ^= 1; + uuid_utoa_r(anon_inode_gfid, priv->anon_gfid_str); + } +} + int reconfigure(xlator_t *this, dict_t *options) { @@ -290,6 +311,10 @@ reconfigure(xlator_t *this, dict_t *options) consistent_io = _gf_false; priv->consistent_io = consistent_io; + afr_handle_anon_inode_options(priv, options); + + GF_OPTION_RECONF("use-anonymous-inode", priv->use_anon_inode, options, bool, + out); if (priv->shd.enabled) { if ((priv->shd.enabled != enabled_old) || (timeout_old != priv->shd.timeout)) @@ -541,7 +566,9 @@ init(xlator_t *this) GF_OPTION_INIT("consistent-metadata", priv->consistent_metadata, bool, out); GF_OPTION_INIT("consistent-io", priv->consistent_io, bool, out); + 
afr_handle_anon_inode_options(priv, this->options); + GF_OPTION_INIT("use-anonymous-inode", priv->use_anon_inode, bool, out); if (priv->quorum_count != 0) priv->consistent_io = _gf_false; @@ -553,6 +580,9 @@ init(xlator_t *this) goto out; } + priv->anon_inode = GF_CALLOC(sizeof(unsigned char), child_count, + gf_afr_mt_char); + priv->child_up = GF_CALLOC(sizeof(unsigned char), child_count, gf_afr_mt_char); @@ -561,7 +591,8 @@ init(xlator_t *this) priv->halo_child_up = GF_CALLOC(sizeof(unsigned char), child_count, gf_afr_mt_char); - if (!priv->child_up || !priv->child_latency || !priv->halo_child_up) { + if (!priv->child_up || !priv->child_latency || !priv->halo_child_up || + !priv->anon_inode) { ret = -ENOMEM; goto out; } @@ -1286,6 +1317,14 @@ struct volume_options options[] = { .tags = {"replicate"}, .description = "This option exists only for backward compatibility " "and configuring it doesn't have any effect"}, + {.key = {"use-anonymous-inode"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "no", + .op_version = {GD_OP_VERSION_8_0}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE, + .tags = {"replicate"}, + .description = "Setting this option heals directory renames efficiently"}, + {.key = {NULL}}, }; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 1fff5640940..d62f9a9caf2 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -42,6 +42,7 @@ #define AFR_LK_HEAL_DOM "afr.lock-heal.domain" #define AFR_HALO_MAX_LATENCY 99999 +#define AFR_ANON_DIR_PREFIX ".glusterfs-anonymous-inode" #define PFLAG_PENDING (1 << 0) #define PFLAG_SBRAIN (1 << 1) @@ -190,6 +191,7 @@ typedef struct _afr_private { struct list_head ta_waitq; struct list_head ta_onwireq; + unsigned char *anon_inode; unsigned char *child_up; unsigned char *halo_child_up; int64_t *child_latency; @@ -275,10 +277,15 @@ typedef struct _afr_private { gf_boolean_t esh_granular; gf_boolean_t consistent_io; gf_boolean_t data_self_heal; /* on/off */ 
+ gf_boolean_t use_anon_inode; /*For lock healing.*/ struct list_head saved_locks; struct list_head lk_healq; + + /*For anon-inode handling */ + char anon_inode_name[NAME_MAX + 1]; + char anon_gfid_str[UUID_SIZE + 1]; } afr_private_t; typedef enum { @@ -1271,8 +1278,8 @@ int afr_inode_split_brain_choice_set(inode_t *inode, xlator_t *this, int spb_choice); int -afr_inode_split_brain_choice_get(inode_t *inode, xlator_t *this, - int *spb_choice); +afr_split_brain_read_subvol_get(inode_t *inode, xlator_t *this, + call_frame_t *frame, int *spb_subvol); int afr_get_child_index_from_name(xlator_t *this, char *name); @@ -1357,7 +1364,7 @@ int afr_set_inode_local(xlator_t *this, afr_local_t *local, inode_t *inode); int -afr_fill_ta_loc(xlator_t *this, loc_t *loc); +afr_fill_ta_loc(xlator_t *this, loc_t *loc, gf_boolean_t is_gfid_based_fop); int afr_ta_post_op_lock(xlator_t *this, loc_t *loc); @@ -1409,4 +1416,8 @@ afr_dom_lock_release(call_frame_t *frame); void afr_fill_success_replies(afr_local_t *local, afr_private_t *priv, unsigned char *replies); + +gf_boolean_t +afr_is_private_directory(afr_private_t *priv, uuid_t pargfid, const char *name, + pid_t pid); #endif /* __AFR_H__ */ diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index be92236e3bd..8ba0cc4c732 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -382,7 +382,7 @@ out: /* Code to save hashed subvol on inode ctx as a mds subvol */ -static int +int dht_inode_ctx_mdsvol_set(inode_t *inode, xlator_t *this, xlator_t *mds_subvol) { dht_inode_ctx_t *ctx = NULL; @@ -2161,31 +2161,18 @@ static int dht_fill_dict_to_avoid_unlink_of_migrating_file(dict_t *dict) { int ret = 0; - xlator_t *this = NULL; - char *linktoskip_key = NULL; - this = THIS; - GF_VALIDATE_OR_GOTO("dht", this, err); - - if (dht_is_tier_xlator(this)) - linktoskip_key = TIER_SKIP_NON_LINKTO_UNLINK; - else - linktoskip_key = DHT_SKIP_NON_LINKTO_UNLINK; - - ret = 
dict_set_int32(dict, linktoskip_key, 1); + ret = dict_set_int32_sizen(dict, DHT_SKIP_NON_LINKTO_UNLINK, 1); if (ret) - goto err; + return -1; - ret = dict_set_int32(dict, DHT_SKIP_OPEN_FD_UNLINK, 1); + ret = dict_set_int32_sizen(dict, DHT_SKIP_OPEN_FD_UNLINK, 1); if (ret) - goto err; + return -1; return 0; - -err: - return -1; } static int32_t @@ -4314,6 +4301,8 @@ dht_find_local_subvol_cbk(call_frame_t *frame, void *cookie, xlator_t *this, index = conf->local_subvols_cnt; uuid_list_copy = gf_strdup(uuid_list); + if (!uuid_list_copy) + goto unlock; for (uuid_str = strtok_r(uuid_list, " ", &saveptr); uuid_str; uuid_str = next_uuid_str) { @@ -4604,18 +4593,8 @@ dht_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, dict_del(xattr, conf->xattr_name); dict_del(xattr, conf->mds_xattr_key); - /* filter out following two xattrs that need not - * be visible on the mount point for geo-rep - - * trusted.tier.fix.layout.complete and - * trusted.tier.tier-dht.commithash - */ - dict_del(xattr, conf->commithash_xattr_name); - if (frame->root->pid >= 0 && dht_is_tier_xlator(this)) { - dict_del(xattr, GF_XATTR_TIER_LAYOUT_FIXED_KEY); - } - if (frame->root->pid >= 0) { GF_REMOVE_INTERNAL_XATTR("trusted.glusterfs.quota*", xattr); GF_REMOVE_INTERNAL_XATTR("trusted.pgfid*", xattr); @@ -5893,22 +5872,7 @@ dht_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr, if (local->rebalance.target_node) { local->flags = forced_rebalance; - /* Flag to suggest its a tiering migration - * The reason for this dic key-value is that - * promotions and demotions are multithreaded - * so the original frame from gf_defrag_start() - * is not carried. A new frame will be created when - * we do syncop_setxattr(). This does not have the - * frame->root->pid of the original frame. 
So we pass - * this dic key-value when we do syncop_setxattr() to do - * data migration and set the frame->root->pid to - * GF_CLIENT_PID_TIER_DEFRAG in dht_setxattr() just before - * calling dht_start_rebalance_task() */ - tmp = dict_get(xattr, TIERING_MIGRATION_KEY); - if (tmp) - frame->root->pid = GF_CLIENT_PID_TIER_DEFRAG; - else - frame->root->pid = GF_CLIENT_PID_DEFRAG; + frame->root->pid = GF_CLIENT_PID_DEFRAG; ret = dht_start_rebalance_task(this, frame); if (!ret) @@ -6720,10 +6684,9 @@ dht_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, layout = local->layout; - /* We have seen crashes in while running "rm -rf" on tier volumes - when the layout was NULL on the hot tier. This will skip the - entries on the subvol without a layout, hence preventing the crash - but rmdir might fail with "directory not empty" errors*/ + /* This will skip the entries on the subvol without a layout, + * hence preventing the crash but rmdir might fail with + * "directory not empty" errors*/ if (layout == NULL) goto done; @@ -10850,23 +10813,17 @@ dht_notify(xlator_t *this, int event, void *data, ...) int had_heard_from_all = 0; int have_heard_from_all = 0; - struct timeval time = { - 0, - }; gf_defrag_info_t *defrag = NULL; dict_t *dict = NULL; gf_defrag_type cmd = 0; dict_t *output = NULL; va_list ap; - dht_methods_t *methods = NULL; struct gf_upcall *up_data = NULL; struct gf_upcall_cache_invalidation *up_ci = NULL; conf = this->private; GF_VALIDATE_OR_GOTO(this->name, conf, out); - methods = &(conf->methods); - /* had all subvolumes reported status once till now? */ had_heard_from_all = 1; for (i = 0; i < conf->subvolume_cnt; i++) { @@ -10896,12 +10853,11 @@ dht_notify(xlator_t *this, int event, void *data, ...) 
break; } - gettimeofday(&time, NULL); LOCK(&conf->subvolume_lock); { conf->subvolume_status[cnt] = 1; conf->last_event[cnt] = event; - conf->subvol_up_time[cnt] = time.tv_sec; + conf->subvol_up_time[cnt] = gf_time(); } UNLOCK(&conf->subvolume_lock); @@ -11090,15 +11046,13 @@ dht_notify(xlator_t *this, int event, void *data, ...) * thread has already started. */ if (conf->defrag && !run_defrag) { - if (methods->migration_needed(this)) { - run_defrag = 1; - ret = gf_thread_create(&conf->defrag->th, NULL, gf_defrag_start, - this, "dhtdg"); - if (ret) { - GF_FREE(conf->defrag); - conf->defrag = NULL; - kill(getpid(), SIGTERM); - } + run_defrag = 1; + ret = gf_thread_create(&conf->defrag->th, NULL, gf_defrag_start, + this, "dhtdg"); + if (ret) { + GF_FREE(conf->defrag); + conf->defrag = NULL; + kill(getpid(), SIGTERM); } } } @@ -11243,28 +11197,6 @@ out: return ret; } -int32_t -dht_migration_needed(xlator_t *this) -{ - gf_defrag_info_t *defrag = NULL; - dht_conf_t *conf = NULL; - int ret = 0; - - conf = this->private; - - GF_VALIDATE_OR_GOTO("dht", conf, out); - GF_VALIDATE_OR_GOTO("dht", conf->defrag, out); - - defrag = conf->defrag; - - if ((defrag->cmd != GF_DEFRAG_CMD_START_TIER) && - (defrag->cmd != GF_DEFRAG_CMD_START_DETACH_TIER)) - ret = 1; - -out: - return ret; -} - /* This function should not be called more then once during a FOP handling path. It is valid only for for ops on files @@ -11299,14 +11231,6 @@ dht_set_local_rebalance(xlator_t *this, dht_local_t *local, struct iatt *stbuf, return 0; } -gf_boolean_t -dht_is_tier_xlator(xlator_t *this) -{ - if (strcmp(this->type, "cluster/tier") == 0) - return _gf_true; - return _gf_false; -} - int32_t dht_release(xlator_t *this, fd_t *fd) { @@ -11465,117 +11389,3 @@ dht_dir_layout_error_check(xlator_t *this, inode_t *inode) /* Returning the first xlator error as all xlators have errors */ return layout->list[0].err; } - -/* Get brick paths from all the local subvols and store for use. 
- * - * TODO: Make sure newly added brick is not picked for migration. - * Otherwise there will be no rebalance as directory entries won't be present - * on a newly added brick */ -int -dht_get_brick_paths(xlator_t *this, dht_conf_t *conf, loc_t *loc) -{ - dict_t *dict = NULL; - gf_defrag_info_t *defrag = conf->defrag; - char *key = NULL; - char *tmp = NULL; - char *str = NULL; - char *token; - char *saveptr = NULL; - int i = 1; - int j = 0; - int ret = 0; - - key = gf_strdup("glusterfs.pathinfo"); - if (!key) { - gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0, - "failed to allocate " - "memory"); - ret = -1; - goto out; - } - - defrag->local_brick_paths = GF_CALLOC(conf->local_subvols_cnt, - sizeof(*defrag->local_brick_paths), - gf_common_mt_pointer); - - for (j = 0; j < conf->local_subvols_cnt; j++) { - ret = syncop_getxattr(conf->local_subvols[j], loc, &dict, key, NULL, - NULL); - if (ret == -1) { - gf_msg(this->name, GF_LOG_WARNING, 0, 0, - "failed to get path," - " errno %d", - ret); - /* TODO: We need not break out from here and can resume operation. - * We need a place holder in gf_defrag_info_t to mark which - * local_brick_paths we are working on. Right now, we blindly - * take defrag->local_brick_path[0]. This can be dynamic based on - * need */ - goto out; - } - - str = NULL; - ret = dict_get_str(dict, key, &str); - if (ret != 0) { - gf_msg(this->name, GF_LOG_ERROR, -ret, 0, "dict get failed for :%s", - key); - goto out; - } - if (str == NULL) { - gf_msg(this->name, GF_LOG_ERROR, 0, 0, "key:%s not found", key); - ret = -1; - goto out; - } - - if (!defrag->is_pure_distribute) { - tmp = strstr(str, "REPLICATE"); - if (tmp) { - defrag->is_pure_distribute = _gf_false; - break; - } - - /*TODO: fetching glusterfs.pathinfo on erasure volume is failing. 
- *Function the old way till we get it resolved */ - tmp = strstr(str, "ERASURE"); - if (tmp) { - defrag->is_pure_distribute = _gf_false; - break; - } - - defrag->is_pure_distribute = _gf_true; - } - - saveptr = NULL; - - for (token = strtok_r(str, ":", &saveptr), i = 1; token;) { - token = strtok_r(NULL, ":", &saveptr); - i++; - if (i == 3) { - token = strtok_r(token, ">", &saveptr); - break; - } else { - continue; - } - } - - defrag->local_brick_paths[j] = gf_strdup(token); - } - -out: - if (ret == -1) { - gf_msg(this->name, GF_LOG_INFO, 0, 0, - "failed to get brick path. " - "Will operate old way"); - for (j = 0; j < conf->local_subvols_cnt; j++) { - GF_FREE(defrag->local_brick_paths[j]); - } - defrag->is_pure_distribute = _gf_false; - } - - if (defrag->is_pure_distribute) { - gf_msg(this->name, GF_LOG_INFO, 0, 0, "volume type : pure distribute"); - } - - GF_FREE(key); - return ret; -} diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index 84891406c71..fe0dc3db34a 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -24,7 +24,6 @@ #define _DHT_H #define GF_XATTR_FIX_LAYOUT_KEY "distribute.fix.layout" -#define GF_XATTR_TIER_LAYOUT_FIXED_KEY "trusted.tier.fix.layout.complete" #define GF_XATTR_FILE_MIGRATE_KEY "trusted.distribute.migrate-data" #define DHT_MDS_STR "mds" #define GF_DHT_LOOKUP_UNHASHED_OFF 0 @@ -36,7 +35,6 @@ #define DHT_LAYOUT_HEAL_DOMAIN "dht.layout.heal" /* Namespace synchronization */ #define DHT_ENTRY_SYNC_DOMAIN "dht.entry.sync" -#define TIERING_MIGRATION_KEY "tiering.migration" #define DHT_LAYOUT_HASH_INVALID 1 #define MAX_REBAL_THREADS sysconf(_SC_NPROCESSORS_ONLN) @@ -242,19 +240,6 @@ typedef gf_boolean_t (*dht_need_heal_t)(call_frame_t *frame, dht_layout_t **inmem, dht_layout_t **ondisk); -typedef struct { - uint64_t blocks_used; - uint64_t pblocks_used; - uint64_t files_used; - uint64_t pfiles_used; - uint64_t unhashed_blocks_used; - uint64_t 
unhashed_pblocks_used; - uint64_t unhashed_files_used; - uint64_t unhashed_pfiles_used; - uint64_t unhashed_fsid; - uint64_t hashed_fsid; -} tier_statvfs_t; - struct dht_local { loc_t loc; loc_t loc2; @@ -272,7 +257,6 @@ struct dht_local { struct iatt preparent; struct iatt postparent; struct statvfs statvfs; - tier_statvfs_t tier_statvfs; fd_t *fd; inode_t *inode; dict_t *params; @@ -405,14 +389,7 @@ enum gf_defrag_type { GF_DEFRAG_CMD_STATUS = 1 + 2, GF_DEFRAG_CMD_START_LAYOUT_FIX = 1 + 3, GF_DEFRAG_CMD_START_FORCE = 1 + 4, - GF_DEFRAG_CMD_START_TIER = 1 + 5, - GF_DEFRAG_CMD_STATUS_TIER = 1 + 6, - GF_DEFRAG_CMD_START_DETACH_TIER = 1 + 7, - GF_DEFRAG_CMD_STOP_DETACH_TIER = 1 + 8, - GF_DEFRAG_CMD_PAUSE_TIER = 1 + 9, - GF_DEFRAG_CMD_RESUME_TIER = 1 + 10, GF_DEFRAG_CMD_DETACH_STATUS = 1 + 11, - GF_DEFRAG_CMD_STOP_TIER = 1 + 12, GF_DEFRAG_CMD_DETACH_START = 1 + 13, GF_DEFRAG_CMD_DETACH_COMMIT = 1 + 14, GF_DEFRAG_CMD_DETACH_COMMIT_FORCE = 1 + 15, @@ -463,75 +440,6 @@ struct dht_container { int local_subvol_index; }; -typedef enum tier_mode_ { - TIER_MODE_NONE = 0, - TIER_MODE_TEST, - TIER_MODE_WM -} tier_mode_t; - -typedef enum tier_pause_state_ { - TIER_RUNNING = 0, - TIER_REQUEST_PAUSE, - TIER_PAUSED -} tier_pause_state_t; - -/* This Structure is only used in tiering fixlayout */ -typedef struct gf_tier_fix_layout_arg { - xlator_t *this; - dict_t *fix_layout; - pthread_t thread_id; -} gf_tier_fix_layout_arg_t; - -typedef struct gf_tier_conf { - int is_tier; - int watermark_hi; - int watermark_low; - int watermark_last; - unsigned long block_size; - fsblkcnt_t blocks_total; - fsblkcnt_t blocks_used; - uint64_t max_migrate_bytes; - int max_migrate_files; - int query_limit; - tier_mode_t mode; - int percent_full; - /* These flags are only used for tier-compact */ - gf_boolean_t compact_active; - /* These 3 flags are set to true when the client changes the */ - /* compaction mode on the command line. 
*/ - /* When they are set, the daemon will trigger compaction as */ - /* soon as possible to activate or deactivate compaction. */ - /* If in the middle of a compaction, then the switches take */ - /* effect on the next compaction, not the current one. */ - /* If the user switches it off, we want to avoid needless */ - /* compactions. */ - /* If the user switches it on, they want to compact as soon */ - /* as possible. */ - gf_boolean_t compact_mode_switched; - gf_boolean_t compact_mode_switched_hot; - gf_boolean_t compact_mode_switched_cold; - int tier_max_promote_size; - int tier_promote_frequency; - int tier_demote_frequency; - int tier_compact_hot_frequency; - int tier_compact_cold_frequency; - uint64_t st_last_promoted_size; - uint64_t st_last_demoted_size; - struct synctask *pause_synctask; - gf_timer_t *pause_timer; - pthread_mutex_t pause_mutex; - int promote_in_progress; - int demote_in_progress; - /* This Structure is only used in tiering fixlayout */ - gf_tier_fix_layout_arg_t tier_fix_layout_arg; - /* Indicates the index of the first queryfile picked - * in the last cycle of promote or demote */ - int32_t last_promote_qfile_index; - int32_t last_demote_qfile_index; - tier_pause_state_t pause_state; - char volname[GD_VOLUME_NAME_MAX + 1]; -} gf_tier_conf_t; - typedef struct nodeuuid_info { char info; /* Set to 1 is this is my node's uuid*/ uuid_t uuid; /* Store the nodeuuid as well for debugging*/ @@ -559,17 +467,10 @@ struct gf_defrag_info_ { int cmd; inode_t *root_inode; uuid_t node_uuid; - struct timeval start_time; + time_t start_time; uint32_t new_commit_hash; gf_defrag_status_t defrag_status; gf_defrag_pattern_list_t *defrag_pattern; - gf_tier_conf_t tier_conf; - - /*Data Tiering params for scanner*/ - uint64_t total_files_promoted; - uint64_t total_files_demoted; - int write_freq_threshold; - int read_freq_threshold; pthread_cond_t parallel_migration_cond; pthread_mutex_t dfq_mutex; @@ -598,15 +499,6 @@ struct gf_defrag_info_ { gf_boolean_t stats; 
/* lock migration flag */ gf_boolean_t lock_migration_enabled; - - /* local system crawl */ - char **local_brick_paths; - - /* whether the volume is pure distribute */ - gf_boolean_t is_pure_distribute; - - /*TODO: Introduce a glusterd option to tune this behaviour*/ - gf_boolean_t operate_dist; }; typedef struct gf_defrag_info_ gf_defrag_info_t; @@ -614,7 +506,6 @@ typedef struct gf_defrag_info_ gf_defrag_info_t; struct dht_methods_s { int32_t (*migration_get_dst_subvol)(xlator_t *this, dht_local_t *local); int32_t (*migration_other)(xlator_t *this, gf_defrag_info_t *defrag); - int32_t (*migration_needed)(xlator_t *this); xlator_t *(*layout_search)(xlator_t *this, dht_layout_t *layout, const char *name); }; @@ -635,7 +526,7 @@ struct dht_conf { int subvolume_cnt; int32_t refresh_interval; gf_lock_t subvolume_lock; - struct timeval last_stat_fetch; + time_t last_stat_fetch; gf_lock_t layout_lock; dict_t *leaf_to_subvol; void *private; /* Can be used by wrapper xlators over @@ -1325,9 +1216,6 @@ dht_layout_missing_dirs(dht_layout_t *layout); int dht_refresh_layout(call_frame_t *frame); -gf_boolean_t -dht_is_tier_xlator(xlator_t *this); - int dht_build_parent_loc(xlator_t *this, loc_t *parent, loc_t *child, int32_t *op_errno); @@ -1492,5 +1380,5 @@ int dht_dir_layout_error_check(xlator_t *this, inode_t *inode); int -dht_get_brick_paths(xlator_t *this, dht_conf_t *conf, loc_t *loc); +dht_inode_ctx_mdsvol_set(inode_t *inode, xlator_t *this, xlator_t *mds_subvol); #endif /* _DHT_H */ diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c index 27097ca2475..c0588828fdb 100644 --- a/xlators/cluster/dht/src/dht-diskusage.c +++ b/xlators/cluster/dht/src/dht-diskusage.c @@ -151,22 +151,18 @@ dht_get_du_info(call_frame_t *frame, xlator_t *this, loc_t *loc) dht_conf_t *conf = NULL; call_frame_t *statfs_frame = NULL; dht_local_t *statfs_local = NULL; - struct timeval tv = { - 0, - }; loc_t tmp_loc = { 0, }; + time_t now; conf = 
this->private; - - gettimeofday(&tv, NULL); - + now = gf_time(); /* make it root gfid, should be enough to get the proper info back */ tmp_loc.gfid[15] = 1; - if (tv.tv_sec > (conf->refresh_interval + conf->last_stat_fetch.tv_sec)) { + if (now > (conf->refresh_interval + conf->last_stat_fetch)) { statfs_frame = copy_frame(frame); if (!statfs_frame) { goto err; @@ -198,7 +194,7 @@ dht_get_du_info(call_frame_t *frame, xlator_t *this, loc_t *loc) statfs_local->params); } - conf->last_stat_fetch.tv_sec = tv.tv_sec; + conf->last_stat_fetch = now; } return 0; err: diff --git a/xlators/cluster/dht/src/dht-inode-write.c b/xlators/cluster/dht/src/dht-inode-write.c index eda2491e0ff..2f23ce90fbd 100644 --- a/xlators/cluster/dht/src/dht-inode-write.c +++ b/xlators/cluster/dht/src/dht-inode-write.c @@ -93,30 +93,28 @@ dht_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, /* Check if the rebalance phase1 is true */ if (IS_DHT_MIGRATION_PHASE1(postbuf)) { - if (!dht_is_tier_xlator(this)) { + if (!local->xattr_req) { + local->xattr_req = dict_new(); if (!local->xattr_req) { - local->xattr_req = dict_new(); - if (!local->xattr_req) { - gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_NO_MEMORY, ENOMEM, - "insufficient memory"); - local->op_errno = ENOMEM; - local->op_ret = -1; - goto out; - } - } - - ret = dict_set_uint32(local->xattr_req, - GF_PROTECT_FROM_EXTERNAL_WRITES, 1); - if (ret) { - gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_DICT_SET_FAILED, 0, - "Failed to set key %s in dictionary", - GF_PROTECT_FROM_EXTERNAL_WRITES); + gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_NO_MEMORY, ENOMEM, + "insufficient memory"); local->op_errno = ENOMEM; local->op_ret = -1; goto out; } } + ret = dict_set_uint32(local->xattr_req, GF_PROTECT_FROM_EXTERNAL_WRITES, + 1); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_DICT_SET_FAILED, 0, + "Failed to set key %s in dictionary", + GF_PROTECT_FROM_EXTERNAL_WRITES); + local->op_errno = ENOMEM; + local->op_ret = -1; + goto out; + } 
+ dht_iatt_merge(this, &local->stbuf, postbuf); dht_iatt_merge(this, &local->prebuf, prebuf); diff --git a/xlators/cluster/dht/src/dht-mem-types.h b/xlators/cluster/dht/src/dht-mem-types.h index 33f9832395b..e3c4471334a 100644 --- a/xlators/cluster/dht/src/dht-mem-types.h +++ b/xlators/cluster/dht/src/dht-mem-types.h @@ -30,10 +30,7 @@ enum gf_dht_mem_types_ { gf_dht_mt_container_t, gf_dht_mt_octx_t, gf_dht_mt_miginfo_t, - gf_tier_mt_bricklist_t, - gf_tier_mt_ipc_ctr_params_t, gf_dht_mt_fd_ctx_t, - gf_tier_mt_qfile_array_t, gf_dht_ret_cache_t, gf_dht_nodeuuids_t, gf_dht_mt_end diff --git a/xlators/cluster/dht/src/dht-messages.h b/xlators/cluster/dht/src/dht-messages.h index 026879e14af..601f8dad78b 100644 --- a/xlators/cluster/dht/src/dht-messages.h +++ b/xlators/cluster/dht/src/dht-messages.h @@ -38,12 +38,11 @@ GLFS_MSGID( DHT_MSG_REBALANCE_STATUS, DHT_MSG_REBALANCE_STOPPED, DHT_MSG_RENAME_FAILED, DHT_MSG_SETATTR_FAILED, DHT_MSG_SUBVOL_INSUFF_INODES, DHT_MSG_SUBVOL_INSUFF_SPACE, DHT_MSG_UNLINK_FAILED, - DHT_MSG_LAYOUT_SET_FAILED, DHT_MSG_LOG_FIXED_LAYOUT, DHT_MSG_LOG_TIER_ERROR, - DHT_MSG_LOG_TIER_STATUS, DHT_MSG_GET_XATTR_FAILED, - DHT_MSG_FILE_LOOKUP_FAILED, DHT_MSG_OPEN_FD_FAILED, - DHT_MSG_SET_INODE_CTX_FAILED, DHT_MSG_UNLOCKING_FAILED, - DHT_MSG_DISK_LAYOUT_NULL, DHT_MSG_SUBVOL_INFO, DHT_MSG_CHUNK_SIZE_INFO, - DHT_MSG_LAYOUT_FORM_FAILED, DHT_MSG_SUBVOL_ERROR, + DHT_MSG_LAYOUT_SET_FAILED, DHT_MSG_LOG_FIXED_LAYOUT, + DHT_MSG_GET_XATTR_FAILED, DHT_MSG_FILE_LOOKUP_FAILED, + DHT_MSG_OPEN_FD_FAILED, DHT_MSG_SET_INODE_CTX_FAILED, + DHT_MSG_UNLOCKING_FAILED, DHT_MSG_DISK_LAYOUT_NULL, DHT_MSG_SUBVOL_INFO, + DHT_MSG_CHUNK_SIZE_INFO, DHT_MSG_LAYOUT_FORM_FAILED, DHT_MSG_SUBVOL_ERROR, DHT_MSG_LAYOUT_SORT_FAILED, DHT_MSG_REGEX_INFO, DHT_MSG_FOPEN_FAILED, DHT_MSG_SET_HOSTNAME_FAILED, DHT_MSG_BRICK_ERROR, DHT_MSG_SYNCOP_FAILED, DHT_MSG_MIGRATE_INFO, DHT_MSG_SOCKET_ERROR, DHT_MSG_CREATE_FD_FAILED, @@ -69,8 +68,7 @@ GLFS_MSGID( DHT_MSG_INIT_LOCAL_SUBVOL_FAILED, 
DHT_MSG_SYS_CALL_GET_TIME_FAILED, DHT_MSG_NO_DISK_USAGE_STATUS, DHT_MSG_SUBVOL_DOWN_ERROR, DHT_MSG_REBAL_THROTTLE_INFO, DHT_MSG_COMMIT_HASH_INFO, - DHT_MSG_REBAL_STRUCT_SET, DHT_MSG_HAS_MIGINFO, DHT_MSG_LOG_IPC_TIER_ERROR, - DHT_MSG_TIER_PAUSED, DHT_MSG_TIER_RESUME, DHT_MSG_SETTLE_HASH_FAILED, + DHT_MSG_REBAL_STRUCT_SET, DHT_MSG_HAS_MIGINFO, DHT_MSG_SETTLE_HASH_FAILED, DHT_MSG_DEFRAG_PROCESS_DIR_FAILED, DHT_MSG_FD_CTX_SET_FAILED, DHT_MSG_STALE_LOOKUP, DHT_MSG_PARENT_LAYOUT_CHANGED, DHT_MSG_LOCK_MIGRATION_FAILED, DHT_MSG_LOCK_INODE_UNREF_FAILED, @@ -96,15 +94,13 @@ GLFS_MSGID( DHT_MSG_UNLOCK_FILE_FAILED, DHT_MSG_REMOVE_XATTR_FAILED, DHT_MSG_DATA_MIGRATE_ABORT, DHT_MSG_DEFRAG_NULL, DHT_MSG_PARENT_NULL, DHT_MSG_GFID_NOT_PRESENT, DHT_MSG_CHILD_LOC_FAILED, - DHT_MSG_SET_LOOKUP_FAILED, DHT_MSG_DIR_REMOVED, - DHT_MSG_TIER_FIX_LAYOUT_STARTED, DHT_MSG_FIX_NOT_COMP, - DHT_MSG_REMOVE_TIER_FAILED, DHT_MSG_SUBVOL_DETER_FAILED, - DHT_MSG_LOCAL_SUBVOL, DHT_MSG_NODE_UUID, DHT_MSG_SIZE_FILE, - DHT_MSG_GET_DATA_SIZE_FAILED, DHT_MSG_PTHREAD_JOIN_FAILED, - DHT_MSG_COUNTER_THREAD_CREATE_FAILED, DHT_MSG_MIGRATION_INIT_QUEUE_FAILED, - DHT_MSG_PAUSED_TIMEOUT, DHT_MSG_WOKE, DHT_MSG_ABORT_REBALANCE, - DHT_MSG_CREATE_TASK_REBAL_FAILED, DHT_MSG_REBAL_ESTIMATE_NOT_AVAIL, - DHT_MSG_MIG_TIER_PAUSED, DHT_MSG_ADD_CHOICES_ERROR, + DHT_MSG_SET_LOOKUP_FAILED, DHT_MSG_DIR_REMOVED, DHT_MSG_FIX_NOT_COMP, + DHT_MSG_SUBVOL_DETER_FAILED, DHT_MSG_LOCAL_SUBVOL, DHT_MSG_NODE_UUID, + DHT_MSG_SIZE_FILE, DHT_MSG_GET_DATA_SIZE_FAILED, + DHT_MSG_PTHREAD_JOIN_FAILED, DHT_MSG_COUNTER_THREAD_CREATE_FAILED, + DHT_MSG_MIGRATION_INIT_QUEUE_FAILED, DHT_MSG_PAUSED_TIMEOUT, DHT_MSG_WOKE, + DHT_MSG_ABORT_REBALANCE, DHT_MSG_CREATE_TASK_REBAL_FAILED, + DHT_MSG_REBAL_ESTIMATE_NOT_AVAIL, DHT_MSG_ADD_CHOICES_ERROR, DHT_MSG_GET_CHOICES_ERROR, DHT_MSG_PREPARE_STATUS_ERROR, DHT_MSG_SET_CHOICE_FAILED, DHT_MSG_SET_HASHED_SUBVOL_FAILED, DHT_MSG_XATTR_HEAL_NOT_POSS, DHT_MSG_LINKTO_FILE_FAILED, @@ -180,7 +176,6 @@ GLFS_MSGID( "adding 
bricks" #define DHT_MSG_NEW_TARGET_FOUND_STR "New target found for file" #define DHT_MSG_INSUFF_MEMORY_STR "insufficient memory" -#define DHT_MSG_MIG_TIER_PAUSED_STR "Migrate file paused" #define DHT_MSG_SET_XATTR_FAILED_STR "failed to set xattr" #define DHT_MSG_SET_MODE_FAILED_STR "failed to set mode" #define DHT_MSG_FILE_EXISTS_IN_DEST_STR "file exists in destination" @@ -222,17 +217,14 @@ GLFS_MSGID( #define DHT_MSG_GFID_NOT_PRESENT_STR "gfid not present" #define DHT_MSG_CHILD_LOC_FAILED_STR "Child loc build failed" #define DHT_MSG_SET_LOOKUP_FAILED_STR "Failed to set lookup" -#define DHT_MSG_LOG_TIER_STATUS_STR "lookup to cold tier on attach heal failed" #define DHT_MSG_DIR_LOOKUP_FAILED_STR "lookup failed" #define DHT_MSG_DIR_REMOVED_STR "Dir renamed or removed. Skipping" #define DHT_MSG_READDIR_ERROR_STR "readdir failed, Aborting fix-layout" #define DHT_MSG_SETTLE_HASH_FAILED_STR "Settle hash failed" #define DHT_MSG_DEFRAG_PROCESS_DIR_FAILED_STR "gf_defrag_process_dir failed" -#define DHT_MSG_TIER_FIX_LAYOUT_STARTED_STR "Tiering fix layout started" #define DHT_MSG_FIX_NOT_COMP_STR \ "Unable to retrieve fixlayout xattr. Assume background fix layout not " \ "complete" -#define DHT_MSG_REMOVE_TIER_FAILED_STR "Failed removing tier fix layout xattr" #define DHT_MSG_SUBVOL_DETER_FAILED_STR \ "local subvolume determination failed with error" #define DHT_MSG_LOCAL_SUBVOL_STR "local subvol" @@ -248,8 +240,6 @@ GLFS_MSGID( #define DHT_MSG_MIGRATION_INIT_QUEUE_FAILED_STR \ "Failed to initialise migration queue" #define DHT_MSG_REBALANCE_STOPPED_STR "Received stop command on rebalance" -#define DHT_MSG_TIER_RESUME_STR "Pause end. 
Resume tiering" -#define DHT_MSG_TIER_PAUSED_STR "Pause tiering" #define DHT_MSG_PAUSED_TIMEOUT_STR "Request pause timer timeout" #define DHT_MSG_WOKE_STR "woken" #define DHT_MSG_ABORT_REBALANCE_STR "Aborting rebalance" diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c index d850eef62ab..8ba8082bd86 100644 --- a/xlators/cluster/dht/src/dht-rebalance.c +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -14,7 +14,6 @@ #include <signal.h> #include <glusterfs/events.h> #include "glusterfs/compat-errno.h" // for ENODATA on BSD -#include <string.h> #define GF_DISK_SECTOR_SIZE 512 #define DHT_REBALANCE_PID 4242 /* Change it if required */ @@ -610,26 +609,23 @@ __dht_rebalance_create_dst_file(xlator_t *this, xlator_t *to, xlator_t *from, goto out; } - if (!!dht_is_tier_xlator(this)) { - xdata = dict_new(); - if (!xdata) { - *fop_errno = ENOMEM; - ret = -1; - gf_msg(this->name, GF_LOG_ERROR, ENOMEM, - DHT_MSG_MIGRATE_FILE_FAILED, "%s: dict_new failed)", - loc->path); - goto out; - } + xdata = dict_new(); + if (!xdata) { + *fop_errno = ENOMEM; + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_MIGRATE_FILE_FAILED, + "%s: dict_new failed)", loc->path); + goto out; + } - ret = dict_set_int32(xdata, GF_CLEAN_WRITE_PROTECTION, 1); - if (ret) { - *fop_errno = ENOMEM; - ret = -1; - gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, - "%s: failed to set dictionary value: key = %s ", loc->path, - GF_CLEAN_WRITE_PROTECTION); - goto out; - } + ret = dict_set_int32_sizen(xdata, GF_CLEAN_WRITE_PROTECTION, 1); + if (ret) { + *fop_errno = ENOMEM; + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "%s: failed to set dictionary value: key = %s ", loc->path, + GF_CLEAN_WRITE_PROTECTION); + goto out; } ret = syncop_lookup(to, loc, &new_stbuf, NULL, xdata, NULL); @@ -1097,7 +1093,7 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag, break; } - if (!conf->force_migration && 
!dht_is_tier_xlator(this)) { + if (!conf->force_migration) { if (!xdata) { xdata = dict_new(); if (!xdata) { @@ -1537,21 +1533,6 @@ dht_migrate_file(xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, goto out; } - /* If defrag is NULL, it should be assumed that migration is triggered - * from client using the trusted.distribute.migrate-data virtual xattr - */ - defrag = conf->defrag; - - /* migration of files from clients is restricted to non-tiered clients - * for now */ - if (!defrag && dht_is_tier_xlator(this)) { - ret = ENOTSUP; - goto out; - } - - if (defrag && defrag->tier_conf.is_tier) - log_level = GF_LOG_TRACE; - gf_log(this->name, log_level, "%s: attempting to move from %s to %s", loc->path, from->name, to->name); @@ -2301,14 +2282,12 @@ out: } } - if (!dht_is_tier_xlator(this)) { - lk_ret = syncop_removexattr(to, loc, GF_PROTECT_FROM_EXTERNAL_WRITES, - NULL, NULL); - if (lk_ret && (lk_ret != -ENODATA) && (lk_ret != -ENOATTR)) { - gf_msg(this->name, GF_LOG_WARNING, -lk_ret, 0, - "%s: removexattr failed key %s", loc->path, - GF_PROTECT_FROM_EXTERNAL_WRITES); - } + lk_ret = syncop_removexattr(to, loc, GF_PROTECT_FROM_EXTERNAL_WRITES, NULL, + NULL); + if (lk_ret && (lk_ret != -ENODATA) && (lk_ret != -ENOATTR)) { + gf_msg(this->name, GF_LOG_WARNING, -lk_ret, 0, + "%s: removexattr failed key %s", loc->path, + GF_PROTECT_FROM_EXTERNAL_WRITES); } if (dict) @@ -2895,8 +2874,7 @@ gf_defrag_migrate_single_file(void *opaque) if (defrag->stats == _gf_true) { gettimeofday(&end, NULL); - elapsed = (end.tv_sec - start.tv_sec) * 1e6 + - (end.tv_usec - start.tv_usec); + elapsed = gf_tvdiff(&start, &end); gf_log(this->name, GF_LOG_INFO, "Migration of " "file:%s size:%" PRIu64 @@ -3075,7 +3053,7 @@ int static gf_defrag_get_entry(xlator_t *this, int i, dht_conf_t *conf, gf_defrag_info_t *defrag, fd_t *fd, dict_t *migrate_data, struct dir_dfmeta *dir_dfmeta, dict_t *xattr_req, - int *should_commit_hash, int *perrno) + int *perrno) { int ret = 0; char is_linkfile = 0; 
@@ -3279,7 +3257,7 @@ gf_defrag_process_dir(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, int dfc_index = 0; int throttle_up = 0; struct dir_dfmeta *dir_dfmeta = NULL; - int should_commit_hash = 1; + xlator_t *old_THIS = NULL; gf_log(this->name, GF_LOG_INFO, "migrate data called on %s", loc->path); gettimeofday(&dir_start, NULL); @@ -3292,6 +3270,9 @@ gf_defrag_process_dir(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, goto out; } + old_THIS = THIS; + THIS = this; + dir_dfmeta = GF_CALLOC(1, sizeof(*dir_dfmeta), gf_common_mt_pointer); if (!dir_dfmeta) { gf_log(this->name, GF_LOG_ERROR, "dir_dfmeta is NULL"); @@ -3456,7 +3437,7 @@ gf_defrag_process_dir(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, ret = gf_defrag_get_entry(this, dfc_index, &container, loc, conf, defrag, dir_dfmeta->lfd[dfc_index], migrate_data, dir_dfmeta, xattr_req, - &should_commit_hash, perrno); + perrno); if (defrag->defrag_status == GF_DEFRAG_STATUS_STOPPED) { goto out; @@ -3500,24 +3481,19 @@ gf_defrag_process_dir(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, } gettimeofday(&end, NULL); - elapsed = (end.tv_sec - dir_start.tv_sec) * 1e6 + - (end.tv_usec - dir_start.tv_usec); + elapsed = gf_tvdiff(&dir_start, &end); gf_log(this->name, GF_LOG_INFO, "Migration operation on dir %s took " "%.2f secs", loc->path, elapsed / 1e6); ret = 0; out: - + THIS = old_THIS; gf_defrag_free_dir_dfmeta(dir_dfmeta, local_subvols_cnt); if (xattr_req) dict_unref(xattr_req); - if (ret == 0 && should_commit_hash == 0) { - ret = 2; - } - /* It does not matter if it errored out - this number is * used to calculate rebalance estimated time to complete. * No locking required as dirs are processed by a single thread. 
@@ -3525,6 +3501,7 @@ out: defrag->num_dirs_processed++; return ret; } + int gf_defrag_settle_hash(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, dict_t *fix_layout) @@ -3539,7 +3516,6 @@ gf_defrag_settle_hash(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, * rebalance is complete. */ if (defrag->cmd == GF_DEFRAG_CMD_START_LAYOUT_FIX || - defrag->cmd == GF_DEFRAG_CMD_START_DETACH_TIER || defrag->cmd == GF_DEFRAG_CMD_DETACH_START) { return 0; } @@ -3585,114 +3561,6 @@ gf_defrag_settle_hash(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, return 0; } -/* Function for doing a named lookup on file inodes during an attach tier - * So that a hardlink lookup heal i.e gfid to parent gfid lookup heal - * happens on pre-existing data. This is required so that the ctr database has - * hardlinks of all the exisitng file in the volume. CTR xlator on the - * brick/server side does db update/insert of the hardlink on a namelookup. - * Currently the namedlookup is done synchronous to the fixlayout that is - * triggered by attach tier. This is not performant, adding more time to - * fixlayout. 
The performant approach is record the hardlinks on a compressed - * datastore and then do the namelookup asynchronously later, giving the ctr db - * eventual consistency - * */ -int -gf_fix_layout_tier_attach_lookup(xlator_t *this, loc_t *parent_loc, - gf_dirent_t *file_dentry) -{ - int ret = -1; - dict_t *lookup_xdata = NULL; - dht_conf_t *conf = NULL; - loc_t file_loc = { - 0, - }; - struct iatt iatt = { - 0, - }; - - GF_VALIDATE_OR_GOTO("tier", this, out); - - GF_VALIDATE_OR_GOTO(this->name, parent_loc, out); - - GF_VALIDATE_OR_GOTO(this->name, file_dentry, out); - - GF_VALIDATE_OR_GOTO(this->name, this->private, out); - - if (!parent_loc->inode) { - gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, - "%s/%s parent is NULL", parent_loc->path, file_dentry->d_name); - goto out; - } - - conf = this->private; - - loc_wipe(&file_loc); - - if (gf_uuid_is_null(file_dentry->d_stat.ia_gfid)) { - gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, - "%s/%s gfid not present", parent_loc->path, file_dentry->d_name); - goto out; - } - - gf_uuid_copy(file_loc.gfid, file_dentry->d_stat.ia_gfid); - - if (gf_uuid_is_null(parent_loc->gfid)) { - gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, - "%s/%s" - " gfid not present", - parent_loc->path, file_dentry->d_name); - goto out; - } - - gf_uuid_copy(file_loc.pargfid, parent_loc->gfid); - - ret = dht_build_child_loc(this, &file_loc, parent_loc, file_dentry->d_name); - if (ret) { - gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, - "Child loc build failed"); - ret = -1; - goto out; - } - - lookup_xdata = dict_new(); - if (!lookup_xdata) { - gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, - "Failed creating lookup dict for %s", file_dentry->d_name); - goto out; - } - - ret = dict_set_int32(lookup_xdata, CTR_ATTACH_TIER_LOOKUP, 1); - if (ret) { - gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, - "Failed to set lookup flag"); - goto out; - } - - 
gf_uuid_copy(file_loc.parent->gfid, parent_loc->gfid); - - /* Sending lookup to cold tier only */ - ret = syncop_lookup(conf->subvolumes[0], &file_loc, &iatt, NULL, - lookup_xdata, NULL); - if (ret) { - /* If the file does not exist on the cold tier than it must */ - /* have been discovered on the hot tier. This is not an error. */ - gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS, - "%s lookup to cold tier on attach heal failed", file_loc.path); - goto out; - } - - ret = 0; - -out: - - loc_wipe(&file_loc); - - if (lookup_xdata) - dict_unref(lookup_xdata); - - return ret; -} - int gf_defrag_fix_layout(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, dict_t *fix_layout, dict_t *migrate_data) @@ -3712,7 +3580,6 @@ gf_defrag_fix_layout(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, }; inode_t *linked_inode = NULL, *inode = NULL; dht_conf_t *conf = NULL; - int should_commit_hash = 1; int perrno = 0; conf = this->private; @@ -3815,16 +3682,6 @@ gf_defrag_fix_layout(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..")) continue; if (!IA_ISDIR(entry->d_stat.ia_type)) { - /* If its a fix layout during the attach - * tier operation do lookups on files - * on cold subvolume so that there is a - * CTR DB Lookup Heal triggered on existing - * data. 
- * */ - if (defrag->cmd == GF_DEFRAG_CMD_START_TIER) { - gf_fix_layout_tier_attach_lookup(this, loc, entry); - } - continue; } loc_wipe(&entry_loc); @@ -3841,8 +3698,6 @@ gf_defrag_fix_layout(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, goto out; } else { - should_commit_hash = 0; - continue; } } @@ -3905,7 +3760,6 @@ gf_defrag_fix_layout(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, ret = -1; goto out; } else { - should_commit_hash = 0; continue; } } @@ -3923,7 +3777,7 @@ gf_defrag_fix_layout(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, goto out; } - if (ret && ret != 2) { + if (ret) { gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LAYOUT_FIX_FAILED, "Fix layout failed for %s", entry_loc.path); @@ -3990,11 +3844,10 @@ gf_defrag_fix_layout(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, } } - if ((defrag->cmd != GF_DEFRAG_CMD_START_TIER) && - (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX)) { + if (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX) { ret = gf_defrag_process_dir(this, defrag, loc, migrate_data, &perrno); - if (ret && (ret != 2)) { + if (ret) { if (perrno == ENOENT || perrno == ESTALE) { ret = 0; goto out; @@ -4010,18 +3863,13 @@ gf_defrag_fix_layout(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, if (conf->decommission_in_progress) { goto out; } - - should_commit_hash = 0; } - } else if (ret == 2) { - should_commit_hash = 0; } } gf_msg_trace(this->name, 0, "fix layout called on %s", loc->path); - if (should_commit_hash && - gf_defrag_settle_hash(this, defrag, loc, fix_layout) != 0) { + if (gf_defrag_settle_hash(this, defrag, loc, fix_layout) != 0) { defrag->total_failures++; gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SETTLE_HASH_FAILED, @@ -4045,372 +3893,6 @@ out: if (fd) fd_unref(fd); - if (ret == 0 && should_commit_hash == 0) { - ret = 2; - } - - return ret; -} - -int -gf_defrag_fix_layout_puredist(xlator_t *this, gf_defrag_info_t *defrag, - loc_t *loc, dict_t *fix_layout, - dict_t *migrate_data) -{ - int ret = 
-1; - loc_t entry_loc = { - 0, - }; - fd_t *fd = NULL; - inode_t *linked_inode = NULL, *inode = NULL; - dht_conf_t *conf = NULL; - int should_commit_hash = 1; - int perrno = 0; - /* absolute brick path length */ - int brick_len = 0; - /* dir path length (relative to gluster mount) */ - int dir_len = 0; - /* absolute dir path length */ - int total_len = 0; - struct dirent *entry = NULL; - struct dirent scratch[2] = {{ - 0, - }}; - DIR *dirp = NULL; - int full_entry_length = 0; - int entry_len = 0; - char full_entry_path[4096] = { - 0, - }; - char full_dir_path[4096] = { - 0, - }; - ssize_t size = 0; - uuid_t tmp_gfid; - struct stat tmpbuf = { - 0, - }; - struct iatt iatt = { - 0, - }; - - struct stat lstatbuf = { - 0, - }; - struct iatt stbuf = { - 0, - }; - - conf = this->private; - if (!conf) { - ret = -1; - goto out; - } - - /* - * Since the primary target for the following lookup is to figure out if the - * entry still exists, going to do a direct stat call rather than going - * through the whole gluster stack. There are some benefits of doing gluster - * lookup, but this is redundant since we have done already one gluster - * lookup in the parent function. 
- * - * Randomly selecting the first local subvol to read, since it is expected - * that the directory structure is present in all the subvols identically - */ - - brick_len = strlen(defrag->local_brick_paths[0]); - /* discarding the first "/" */ - dir_len = strlen(loc->path) - 1; - /* Extra two: one for "/" at the end and one more for '\0'*/ - total_len = brick_len + dir_len + 2; - - snprintf(full_dir_path, total_len, "%s%s/", defrag->local_brick_paths[0], - loc->path + 1); - - ret = sys_lstat(full_dir_path, &tmpbuf); - if (ret == -1) { - gf_log(this->name, GF_LOG_ERROR, - "[absolutepath %s] directory " - "not found, path %s error %d", - full_dir_path, loc->path, errno); - goto out; - } - - dirp = sys_opendir(full_dir_path); - if (!dirp) { - ret = -1; - gf_msg(this->name, GF_LOG_ERROR, errno, 0, "failed to open dir : %s", - loc->path); - if (conf->decommission_subvols_cnt) { - defrag->total_failures++; - } - goto out; - } - - while ((entry = sys_readdir(dirp, scratch)) != NULL) { - if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) { - ret = 1; - goto out; - } - if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..") || - !strcmp(entry->d_name, ".glusterfs")) - continue; - - /* TODO: Need to add a check for _DIRENT_HAVE_D_TYPE flag to fall back - to stat in case d_type is not defined */ - if (entry->d_type != DT_DIR) { - continue; - } - - entry_len = strlen(entry->d_name); - full_entry_length = total_len + entry_len + 1; /* one more for "/"*/ - - snprintf(full_entry_path, full_entry_length, "%s%s/", full_dir_path, - entry->d_name); - - size = sys_lgetxattr(full_entry_path, GFID_XATTR_KEY, tmp_gfid, 16); - if (size != 16) { - gf_log(this->name, GF_LOG_ERROR, "gfid not found, path %s", - full_entry_path); - continue; - } - - loc_wipe(&entry_loc); - - ret = dht_build_child_loc(this, &entry_loc, loc, entry->d_name); - if (ret) { - gf_log(this->name, GF_LOG_ERROR, - "Child loc" - " build failed for entry: %s", - entry->d_name); - - if 
(conf->decommission_in_progress) { - defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; - - goto out; - } else { - should_commit_hash = 0; - - continue; - } - } - - if (gf_uuid_is_null(tmp_gfid)) { - gf_log(this->name, GF_LOG_ERROR, - "%s/%s" - " gfid not present", - loc->path, entry->d_name); - continue; - } - - gf_uuid_copy(entry_loc.gfid, tmp_gfid); - - /*In case the gfid stored in the inode by inode_link - *and the gfid obtained in the lookup differs, then - *client3_3_lookup_cbk will return ESTALE and proper - *error will be captured. - */ - memset(&lstatbuf, 0, sizeof(struct stat)); - ret = sys_lstat(full_entry_path, &lstatbuf); - if (ret == -1) { - gf_msg(this->name, GF_LOG_ERROR, errno, 0, "lstat failed for %s", - entry->d_name); - } - - memset(&stbuf, 0, sizeof(struct iatt)); - iatt_from_stat(&stbuf, &lstatbuf); - gf_uuid_copy(stbuf.ia_gfid, entry_loc.gfid); - linked_inode = inode_link(entry_loc.inode, loc->inode, entry->d_name, - &stbuf); - - inode = entry_loc.inode; - entry_loc.inode = linked_inode; - inode_unref(inode); - - if (gf_uuid_is_null(loc->gfid)) { - gf_log(this->name, GF_LOG_ERROR, - "%s/%s" - " gfid not present", - loc->path, entry->d_name); - continue; - } - - gf_uuid_copy(entry_loc.pargfid, loc->gfid); - - ret = syncop_lookup(this, &entry_loc, &iatt, NULL, NULL, NULL); - if (ret) { - if (-ret == ENOENT || -ret == ESTALE) { - gf_msg(this->name, GF_LOG_INFO, -ret, DHT_MSG_DIR_LOOKUP_FAILED, - "Dir:%s renamed or removed. 
" - "Skipping", - loc->path); - ret = 0; - if (conf->decommission_subvols_cnt) { - defrag->total_failures++; - } - continue; - } else { - gf_msg(this->name, GF_LOG_ERROR, -ret, - DHT_MSG_DIR_LOOKUP_FAILED, "lookup failed for:%s", - entry_loc.path); - - defrag->total_failures++; - - if (conf->decommission_in_progress) { - defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; - ret = -1; - goto out; - } else { - should_commit_hash = 0; - continue; - } - } - } - - /* A return value of 2 means, either process_dir or - * lookup of a dir failed. Hence, don't commit hash - * for the current directory*/ - - ret = gf_defrag_fix_layout_puredist(this, defrag, &entry_loc, - fix_layout, migrate_data); - - if (defrag->defrag_status == GF_DEFRAG_STATUS_STOPPED) { - goto out; - } - - if (ret && ret != 2) { - gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LAYOUT_FIX_FAILED, - "Fix layout failed for %s", entry_loc.path); - - defrag->total_failures++; - - if (conf->decommission_in_progress) { - defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; - - goto out; - } else { - /* Let's not commit-hash if - * gf_defrag_fix_layout failed*/ - continue; - } - } - } - - ret = sys_closedir(dirp); - if (ret) { - gf_msg_debug(this->name, 0, - "Failed to close dir %s. Reason :" - " %s", - full_dir_path, strerror(errno)); - ret = 0; - } - - dirp = NULL; - - /* A directory layout is fixed only after its subdirs are healed to - * any newly added bricks. If the layout is fixed before subdirs are - * healed, the newly added brick will get a non-null layout. - * Any subdirs which hash to that layout will no longer show up - * in a directory listing until they are healed. - */ - - ret = syncop_setxattr(this, loc, fix_layout, 0, NULL, NULL); - - /* In case of a race where the directory is deleted just before - * layout setxattr, the errors are updated in the layout structure. - * We can use this information to make a decision whether the directory - * is deleted entirely. 
- */ - if (ret == 0) { - ret = dht_dir_layout_error_check(this, loc->inode); - ret = -ret; - } - - if (ret) { - if (-ret == ENOENT || -ret == ESTALE) { - gf_msg(this->name, GF_LOG_INFO, -ret, DHT_MSG_LAYOUT_FIX_FAILED, - "Setxattr failed. Dir %s " - "renamed or removed", - loc->path); - if (conf->decommission_subvols_cnt) { - defrag->total_failures++; - } - ret = 0; - goto out; - } else { - gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LAYOUT_FIX_FAILED, - "Setxattr failed for %s", loc->path); - - defrag->total_failures++; - - if (conf->decommission_in_progress) { - defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; - ret = -1; - goto out; - } - } - } - - if ((defrag->cmd != GF_DEFRAG_CMD_START_TIER) && - (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX)) { - ret = gf_defrag_process_dir(this, defrag, loc, migrate_data, &perrno); - - if (ret && (ret != 2)) { - if (perrno == ENOENT || perrno == ESTALE) { - ret = 0; - goto out; - } else { - defrag->total_failures++; - - gf_msg(this->name, GF_LOG_ERROR, 0, - DHT_MSG_DEFRAG_PROCESS_DIR_FAILED, - "gf_defrag_process_dir failed for " - "directory: %s", - loc->path); - - if (conf->decommission_in_progress) { - goto out; - } - - should_commit_hash = 0; - } - } else if (ret == 2) { - should_commit_hash = 0; - } - } - - gf_msg_trace(this->name, 0, "fix layout called on %s", loc->path); - - if (should_commit_hash && - gf_defrag_settle_hash(this, defrag, loc, fix_layout) != 0) { - defrag->total_failures++; - - gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SETTLE_HASH_FAILED, - "Settle hash failed for %s", loc->path); - - ret = -1; - - if (conf->decommission_in_progress) { - defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; - goto out; - } - } - - ret = 0; -out: - loc_wipe(&entry_loc); - - if (fd) - fd_unref(fd); - - if (ret == 0 && should_commit_hash == 0) { - ret = 2; - } - - if (dirp) { - sys_closedir(dirp); - } - return ret; } @@ -4419,31 +3901,26 @@ dht_init_local_subvols_and_nodeuuids(xlator_t *this, dht_conf_t *conf, loc_t *loc) 
{ dict_t *dict = NULL; - gf_defrag_info_t *defrag = NULL; uuid_t *uuid_ptr = NULL; int ret = -1; int i = 0; int j = 0; - defrag = conf->defrag; - - if (defrag->cmd != GF_DEFRAG_CMD_START_TIER) { - /* Find local subvolumes */ - ret = syncop_getxattr(this, loc, &dict, GF_REBAL_FIND_LOCAL_SUBVOL, - NULL, NULL); - if (ret && (ret != -ENODATA)) { - gf_msg(this->name, GF_LOG_ERROR, -ret, 0, - "local " - "subvolume determination failed with error: %d", - -ret); - ret = -1; - goto out; - } - - if (!ret) - goto out; + /* Find local subvolumes */ + ret = syncop_getxattr(this, loc, &dict, GF_REBAL_FIND_LOCAL_SUBVOL, NULL, + NULL); + if (ret && (ret != -ENODATA)) { + gf_msg(this->name, GF_LOG_ERROR, -ret, 0, + "local " + "subvolume determination failed with error: %d", + -ret); + ret = -1; + goto out; } + if (!ret) + goto out; + ret = syncop_getxattr(this, loc, &dict, GF_REBAL_OLD_FIND_LOCAL_SUBVOL, NULL, NULL); if (ret) { @@ -4534,9 +4011,6 @@ dht_file_counter_thread(void *args) struct timespec time_to_wait = { 0, }; - struct timeval now = { - 0, - }; uint64_t tmp_size = 0; if (!args) @@ -4546,9 +4020,8 @@ dht_file_counter_thread(void *args) dht_build_root_loc(defrag->root_inode, &root_loc); while (defrag->defrag_status == GF_DEFRAG_STATUS_STARTED) { - gettimeofday(&now, NULL); - time_to_wait.tv_sec = now.tv_sec + 600; - time_to_wait.tv_nsec = 0; + timespec_now(&time_to_wait); + time_to_wait.tv_sec += 600; pthread_mutex_lock(&defrag->fc_mutex); pthread_cond_timedwait(&defrag->fc_wakeup_cond, &defrag->fc_mutex, @@ -4621,7 +4094,7 @@ gf_defrag_estimates_init(xlator_t *this, loc_t *loc, pthread_t *filecnt_thread) goto out; } - ret = gf_thread_create(filecnt_thread, NULL, &dht_file_counter_thread, + ret = gf_thread_create(filecnt_thread, NULL, dht_file_counter_thread, (void *)defrag, "dhtfcnt"); if (ret) { @@ -4678,7 +4151,7 @@ gf_defrag_parallel_migration_init(xlator_t *this, gf_defrag_info_t *defrag, /*Spawn Threads Here*/ while (index < thread_spawn_count) { - ret = 
gf_thread_create(&(tid[index]), NULL, &gf_defrag_task, + ret = gf_thread_create(&(tid[index]), NULL, gf_defrag_task, (void *)defrag, "dhtmig%d", (index + 1) & 0x3ff); if (ret != 0) { gf_msg("DHT", GF_LOG_ERROR, ret, 0, "Thread[%d] creation failed. ", @@ -4768,7 +4241,6 @@ gf_defrag_start_crawl(void *data) pthread_t *tid = NULL; pthread_t filecnt_thread; gf_boolean_t fc_thread_started = _gf_false; - int i = 0; this = data; if (!this) @@ -4786,7 +4258,8 @@ gf_defrag_start_crawl(void *data) if (!defrag) goto exit; - gettimeofday(&defrag->start_time, NULL); + defrag->start_time = gf_time(); + dht_build_root_inode(this, &defrag->root_inode); if (!defrag->root_inode) goto out; @@ -4903,12 +4376,6 @@ gf_defrag_start_crawl(void *data) goto out; } - ret = dht_get_brick_paths(this, conf, &loc); - if (ret) { - gf_log(this->name, GF_LOG_WARNING, "could not get brick path"); - ret = 0; - } - /* Initialise the structures required for parallel migration */ ret = gf_defrag_parallel_migration_init(this, defrag, &tid, &thread_index); @@ -4926,27 +4393,14 @@ gf_defrag_start_crawl(void *data) } } - /* TODO: Need to introduce a flag to safely operate in the old way */ - if (defrag->operate_dist && defrag->is_pure_distribute) { - ret = gf_defrag_fix_layout_puredist(this, defrag, &loc, fix_layout, - migrate_data); - if (ret && ret != 2) { - defrag->total_failures++; - ret = -1; - goto out; - } - } else { - ret = gf_defrag_fix_layout(this, defrag, &loc, fix_layout, - migrate_data); - if (ret && ret != 2) { - defrag->total_failures++; - ret = -1; - goto out; - } + ret = gf_defrag_fix_layout(this, defrag, &loc, fix_layout, migrate_data); + if (ret) { + defrag->total_failures++; + ret = -1; + goto out; } - if (ret != 2 && - gf_defrag_settle_hash(this, defrag, &loc, fix_layout) != 0) { + if (gf_defrag_settle_hash(this, defrag, &loc, fix_layout) != 0) { defrag->total_failures++; ret = -1; goto out; @@ -4988,14 +4442,6 @@ out: } UNLOCK(&defrag->lock); - for (i = 0; i < conf->local_subvols_cnt; 
i++) { - if (defrag->local_brick_paths[i]) { - GF_FREE(defrag->local_brick_paths[i]); - } - } - - GF_FREE(defrag->local_brick_paths); - GF_FREE(defrag); conf->defrag = NULL; @@ -5069,9 +4515,6 @@ gf_defrag_get_estimates_based_on_size(dht_conf_t *conf) uint64_t total_processed = 0; uint64_t tmp_count = 0; uint64_t time_to_complete = 0; - struct timeval now = { - 0, - }; double elapsed = 0; defrag = conf->defrag; @@ -5079,8 +4522,7 @@ gf_defrag_get_estimates_based_on_size(dht_conf_t *conf) if (!g_totalsize) goto out; - gettimeofday(&now, NULL); - elapsed = now.tv_sec - defrag->start_time.tv_sec; + elapsed = gf_time() - defrag->start_time; /* Don't calculate the estimates for the first 10 minutes. * It is unlikely to be accurate and estimates are not required @@ -5130,13 +4572,8 @@ gf_defrag_status_get(dht_conf_t *conf, dict_t *dict) uint64_t lookup = 0; uint64_t failures = 0; uint64_t skipped = 0; - uint64_t promoted = 0; - uint64_t demoted = 0; char *status = ""; double elapsed = 0; - struct timeval end = { - 0, - }; uint64_t time_to_complete = 0; uint64_t time_left = 0; gf_defrag_info_t *defrag = conf->defrag; @@ -5153,17 +4590,12 @@ gf_defrag_status_get(dht_conf_t *conf, dict_t *dict) lookup = defrag->num_files_lookedup; failures = defrag->total_failures; skipped = defrag->skipped; - promoted = defrag->total_files_promoted; - demoted = defrag->total_files_demoted; - gettimeofday(&end, NULL); - - elapsed = end.tv_sec - defrag->start_time.tv_sec; + elapsed = gf_time() - defrag->start_time; /* The rebalance is still in progress */ - if ((defrag->cmd != GF_DEFRAG_CMD_START_TIER) && - (defrag->defrag_status == GF_DEFRAG_STATUS_STARTED)) { + if (defrag->defrag_status == GF_DEFRAG_STATUS_STARTED) { time_to_complete = gf_defrag_get_estimates_based_on_size(conf); if (time_to_complete && (time_to_complete > elapsed)) @@ -5178,14 +4610,6 @@ gf_defrag_status_get(dht_conf_t *conf, dict_t *dict) if (!dict) goto log; - ret = dict_set_uint64(dict, "promoted", promoted); - if 
(ret) - gf_log(THIS->name, GF_LOG_WARNING, "failed to set promoted count"); - - ret = dict_set_uint64(dict, "demoted", demoted); - if (ret) - gf_log(THIS->name, GF_LOG_WARNING, "failed to set demoted count"); - ret = dict_set_uint64(dict, "files", files); if (ret) gf_log(THIS->name, GF_LOG_WARNING, "failed to set file count"); diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c index 1b6571cd43c..3e24065227c 100644 --- a/xlators/cluster/dht/src/dht-selfheal.c +++ b/xlators/cluster/dht/src/dht-selfheal.c @@ -1271,10 +1271,6 @@ dht_selfheal_dir_mkdir_lock_cbk(call_frame_t *frame, void *cookie, local->call_cnt = conf->subvolume_cnt; if (op_ret < 0) { - /* We get this error when the directory entry was not created - * on a newky attached tier subvol. Hence proceed and do mkdir - * on the tier subvol. - */ if (op_errno == EINVAL) { local->call_cnt = 1; dht_selfheal_dir_mkdir_lookup_done(frame, this); @@ -1330,9 +1326,11 @@ dht_selfheal_dir_mkdir(call_frame_t *frame, loc_t *loc, dht_layout_t *layout, int ret = -1; dht_local_t *local = NULL; xlator_t *this = NULL; + dht_conf_t *conf = NULL; local = frame->local; this = frame->this; + conf = this->private; local->selfheal.force_mkdir = force; local->selfheal.hole_cnt = 0; @@ -1372,15 +1370,44 @@ dht_selfheal_dir_mkdir(call_frame_t *frame, loc_t *loc, dht_layout_t *layout, return 0; } - if (local->hashed_subvol == NULL) - local->hashed_subvol = dht_subvol_get_hashed(this, loc); + /* MDS xattr is populated only while DHT is having more than one + subvol.In case of graph switch while adding more dht subvols need to + consider hash subvol as a MDS to avoid MDS check failure at the time + of running fop on directory + */ + if (!dict_get(local->xattr, conf->mds_xattr_key) && + (conf->subvolume_cnt > 1)) { + if (local->hashed_subvol == NULL) { + local->hashed_subvol = dht_subvol_get_hashed(this, loc); + if (local->hashed_subvol == NULL) { + local->op_errno = EINVAL; + gf_smsg(this->name, 
GF_LOG_WARNING, local->op_errno, + DHT_MSG_HASHED_SUBVOL_GET_FAILED, "gfid=%s", + loc->pargfid, "name=%s", loc->name, "path=%s", + loc->path, NULL); + goto err; + } + } + ret = dht_inode_ctx_mdsvol_set(local->inode, this, + local->hashed_subvol); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED, + "Failed to set hashed subvol for %s on inode vol is %s", + local->loc.path, + local->hashed_subvol ? local->hashed_subvol->name : "NULL"); + goto err; + } + } if (local->hashed_subvol == NULL) { - local->op_errno = EINVAL; - gf_smsg(this->name, GF_LOG_WARNING, local->op_errno, - DHT_MSG_HASHED_SUBVOL_GET_FAILED, "gfid=%s", loc->pargfid, - "name=%s", loc->name, "path=%s", loc->path, NULL); - goto err; + local->hashed_subvol = dht_subvol_get_hashed(this, loc); + if (local->hashed_subvol == NULL) { + local->op_errno = EINVAL; + gf_smsg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_HASHED_SUBVOL_GET_FAILED, "gfid=%s", loc->pargfid, + "name=%s", loc->name, "path=%s", loc->path, NULL); + goto err; + } } local->current = &local->lock[0]; diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c index 811bb55925f..bb72b0ffbb5 100644 --- a/xlators/cluster/dht/src/dht-shared.c +++ b/xlators/cluster/dht/src/dht-shared.c @@ -140,9 +140,9 @@ dht_priv_dump(xlator_t *this) } } - if (conf->last_stat_fetch.tv_sec) + if (conf->last_stat_fetch) gf_proc_dump_write("last_stat_fetch", "%s", - ctime(&conf->last_stat_fetch.tv_sec)); + ctime(&conf->last_stat_fetch)); UNLOCK(&conf->subvolume_lock); @@ -537,6 +537,8 @@ gf_defrag_pattern_list_fill(xlator_t *this, gf_defrag_info_t *defrag, pattern_str = strtok_r(data, ",", &tmp_str); while (pattern_str) { dup_str = gf_strdup(pattern_str); + if (!dup_str) + goto out; pattern_list = GF_CALLOC(1, sizeof(gf_defrag_pattern_list_t), 1); if (!pattern_list) { goto out; @@ -596,7 +598,6 @@ dht_init_methods(xlator_t *this) methods = &(conf->methods); methods->migration_get_dst_subvol = 
dht_migration_get_dst_subvol; - methods->migration_needed = dht_migration_needed; methods->migration_other = NULL; methods->layout_search = dht_layout_search; @@ -700,10 +701,6 @@ dht_init(xlator_t *this) pthread_cond_init(&defrag->fc_wakeup_cond, 0); defrag->global_error = 0; - - defrag->is_pure_distribute = _gf_false; - - defrag->operate_dist = _gf_true; } conf->use_fallocate = 1; @@ -1049,84 +1046,6 @@ struct volume_options dht_options[] = { /* NUFA option */ {.key = {"local-volume-name"}, .type = GF_OPTION_TYPE_XLATOR}, - /* tier options */ - { - .key = {"tier-pause"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "off", - }, - - { - .key = {"tier-promote-frequency"}, - .type = GF_OPTION_TYPE_INT, - .default_value = "120", - }, - - { - .key = {"tier-demote-frequency"}, - .type = GF_OPTION_TYPE_INT, - .default_value = "3600", - }, - - { - .key = {"write-freq-threshold"}, - .type = GF_OPTION_TYPE_INT, - .default_value = "0", - }, - - { - .key = {"read-freq-threshold"}, - .type = GF_OPTION_TYPE_INT, - .default_value = "0", - }, - { - .key = {"watermark-hi"}, - .type = GF_OPTION_TYPE_PERCENT, - .default_value = "90", - }, - { - .key = {"watermark-low"}, - .type = GF_OPTION_TYPE_PERCENT, - .default_value = "75", - }, - { - .key = {"tier-mode"}, - .type = GF_OPTION_TYPE_STR, - .default_value = "test", - }, - { - .key = {"tier-compact"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "off", - }, - {.key = {"tier-hot-compact-frequency"}, - .type = GF_OPTION_TYPE_INT, - .default_value = "604800", - .description = "Frequency to compact DBs on hot tier in system"}, - {.key = {"tier-cold-compact-frequency"}, - .type = GF_OPTION_TYPE_INT, - .default_value = "604800", - .description = "Frequency to compact DBs on cold tier in system"}, - { - .key = {"tier-max-mb"}, - .type = GF_OPTION_TYPE_INT, - .default_value = "4000", - }, - { - .key = {"tier-max-promote-file-size"}, - .type = GF_OPTION_TYPE_INT, - .default_value = "0", - }, - { - .key = {"tier-max-files"}, - .type 
= GF_OPTION_TYPE_INT, - .default_value = "10000", - }, - { - .key = {"tier-query-limit"}, - .type = GF_OPTION_TYPE_INT, - .default_value = "100", - }, /* switch option */ {.key = {"pattern.switch.case"}, .type = GF_OPTION_TYPE_ANY}, diff --git a/xlators/cluster/dht/src/nufa.c b/xlators/cluster/dht/src/nufa.c index 59313639c45..3648a564840 100644 --- a/xlators/cluster/dht/src/nufa.c +++ b/xlators/cluster/dht/src/nufa.c @@ -595,7 +595,6 @@ nufa_init(xlator_t *this) dht_methods_t dht_methods = { .migration_get_dst_subvol = dht_migration_get_dst_subvol, - .migration_needed = dht_migration_needed, .layout_search = dht_layout_search, }; diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c index 9abdcec3f78..b955efd8c2d 100644 --- a/xlators/cluster/ec/src/ec-common.c +++ b/xlators/cluster/ec/src/ec-common.c @@ -316,17 +316,19 @@ ec_check_status(ec_fop_data_t *fop) } } - gf_msg(fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_OP_FAIL_ON_SUBVOLS, - "Operation failed on %d of %d subvolumes.(up=%s, mask=%s, " - "remaining=%s, good=%s, bad=%s, %s)", - gf_bits_count(ec->xl_up & ~(fop->remaining | fop->good)), ec->nodes, - ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes), - ec_bin(str2, sizeof(str2), fop->mask, ec->nodes), - ec_bin(str3, sizeof(str3), fop->remaining, ec->nodes), - ec_bin(str4, sizeof(str4), fop->good, ec->nodes), - ec_bin(str5, sizeof(str5), ec->xl_up & ~(fop->remaining | fop->good), - ec->nodes), - ec_msg_str(fop)); + gf_msg( + fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_OP_FAIL_ON_SUBVOLS, + "Operation failed on %d of %d subvolumes.(up=%s, mask=%s, " + "remaining=%s, good=%s, bad=%s," + "(Least significant bit represents first client/brick of subvol), %s)", + gf_bits_count(ec->xl_up & ~(fop->remaining | fop->good)), ec->nodes, + ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes), + ec_bin(str2, sizeof(str2), fop->mask, ec->nodes), + ec_bin(str3, sizeof(str3), fop->remaining, ec->nodes), + ec_bin(str4, sizeof(str4), fop->good, ec->nodes), + 
ec_bin(str5, sizeof(str5), ec->xl_up & ~(fop->remaining | fop->good), + ec->nodes), + ec_msg_str(fop)); if (fop->use_fd) { if (fop->fd != NULL) { ec_fheal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL, @@ -614,10 +616,10 @@ ec_msg_str(ec_fop_data_t *fop) loc_t *loc2 = NULL; char gfid1[64] = {0}; char gfid2[64] = {0}; + ec_fop_data_t *parent = fop->parent; if (fop->errstr) return fop->errstr; - if (!fop->use_fd) { loc1 = &fop->loc[0]; loc2 = &fop->loc[1]; @@ -625,23 +627,45 @@ ec_msg_str(ec_fop_data_t *fop) if (fop->id == GF_FOP_RENAME) { gf_asprintf(&fop->errstr, "FOP : '%s' failed on '%s' and '%s' with gfids " - "%s and %s respectively", + "%s and %s respectively. Parent FOP: %s", ec_fop_name(fop->id), loc1->path, loc2->path, uuid_utoa_r(loc1->gfid, gfid1), - uuid_utoa_r(loc2->gfid, gfid2)); + uuid_utoa_r(loc2->gfid, gfid2), + parent ? ec_fop_name(parent->id) : "No Parent"); } else { - gf_asprintf(&fop->errstr, "FOP : '%s' failed on '%s' with gfid %s", - ec_fop_name(fop->id), loc1->path, - uuid_utoa_r(loc1->gfid, gfid1)); + gf_asprintf( + &fop->errstr, + "FOP : '%s' failed on '%s' with gfid %s. Parent FOP: %s", + ec_fop_name(fop->id), loc1->path, + uuid_utoa_r(loc1->gfid, gfid1), + parent ? ec_fop_name(parent->id) : "No Parent"); } } else { - gf_asprintf(&fop->errstr, "FOP : '%s' failed on gfid %s", - ec_fop_name(fop->id), - uuid_utoa_r(fop->fd->inode->gfid, gfid1)); + gf_asprintf( + &fop->errstr, "FOP : '%s' failed on gfid %s. Parent FOP: %s", + ec_fop_name(fop->id), uuid_utoa_r(fop->fd->inode->gfid, gfid1), + parent ? 
ec_fop_name(parent->id) : "No Parent"); } return fop->errstr; } +static void +ec_log_insufficient_vol(ec_fop_data_t *fop, int32_t have, uint32_t need, + int32_t loglevel) +{ + ec_t *ec = fop->xl->private; + char str1[32], str2[32], str3[32]; + + gf_msg(ec->xl->name, loglevel, 0, EC_MSG_CHILDS_INSUFFICIENT, + "Insufficient available children for this request: " + "Have : %d, Need : %u : Child UP : %s " + "Mask: %s, Healing : %s : %s ", + have, need, ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes), + ec_bin(str2, sizeof(str2), fop->mask, ec->nodes), + ec_bin(str3, sizeof(str3), fop->healing, ec->nodes), + ec_msg_str(fop)); +} + static int32_t ec_child_select(ec_fop_data_t *fop) { @@ -699,11 +723,7 @@ ec_child_select(ec_fop_data_t *fop) ec_trace("SELECT", fop, ""); if ((num < fop->minimum) && (num < ec->fragments)) { - gf_msg(ec->xl->name, GF_LOG_ERROR, 0, EC_MSG_CHILDS_INSUFFICIENT, - "Insufficient available children " - "for this request (have %d, need " - "%d). %s", - num, fop->minimum, ec_msg_str(fop)); + ec_log_insufficient_vol(fop, num, fop->minimum, GF_LOG_ERROR); return 0; } @@ -711,11 +731,7 @@ ec_child_select(ec_fop_data_t *fop) (fop->locks[0].update[EC_DATA_TXN] || fop->locks[0].update[EC_METADATA_TXN])) { if (ec->quorum_count && (num < ec->quorum_count)) { - gf_msg(ec->xl->name, GF_LOG_ERROR, 0, EC_MSG_CHILDS_INSUFFICIENT, - "Insufficient available children " - "for this request (have %d, need " - "%d). 
%s", - num, ec->quorum_count, ec_msg_str(fop)); + ec_log_insufficient_vol(fop, num, ec->quorum_count, GF_LOG_ERROR); return 0; } } diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c index 89a433e5b91..7d991f04aac 100644 --- a/xlators/cluster/ec/src/ec-heal.c +++ b/xlators/cluster/ec/src/ec-heal.c @@ -2498,7 +2498,7 @@ out: } int -ec_heal_set_dirty_without_lock(call_frame_t *frame, ec_t *ec, inode_t *inode) +ec_heal_purge_stale_index(call_frame_t *frame, ec_t *ec, inode_t *inode) { int i = 0; int ret = 0; @@ -2528,7 +2528,6 @@ ec_heal_set_dirty_without_lock(call_frame_t *frame, ec_t *ec, inode_t *inode) xattr[i] = dict; on[i] = 1; } - dirty_xattr[EC_METADATA_TXN] = hton64(1); ret = dict_set_static_bin(dict, EC_XATTR_DIRTY, dirty_xattr, (sizeof(*dirty_xattr) * EC_VERSION_SIZE)); if (ret < 0) { @@ -2629,13 +2628,11 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial) gf_msg(ec->xl->name, GF_LOG_INFO, 0, EC_MSG_HEAL_FAIL, "Index entry needs to be purged for: %s ", uuid_utoa(loc->gfid)); - /* We need to send xattrop to set dirty flag so that it can be - * healed and index entry could be removed. We need not to take lock - * on this entry to do so as we are just setting dirty flag which - * actually increases the trusted.ec.dirty count and does not set - * the new value. - * This will make sure that it is not interfering in other fops.*/ - ec_heal_set_dirty_without_lock(frame, ec, loc->inode); + /* We need to send zero-xattrop so that stale index entry could be + * removed. We need not take lock on this entry to do so as + * xattrop on a brick is atomic. 
*/ + ec_heal_purge_stale_index(frame, ec, loc->inode); + goto out; } else if (need_heal == EC_HEAL_NONEED) { gf_msg(ec->xl->name, GF_LOG_DEBUG, 0, EC_MSG_HEAL_FAIL, "Heal is not required for : %s ", uuid_utoa(loc->gfid)); diff --git a/xlators/cluster/ec/src/ec-heald.c b/xlators/cluster/ec/src/ec-heald.c index 63fe5d34e38..5c1586bc9c5 100644 --- a/xlators/cluster/ec/src/ec-heald.c +++ b/xlators/cluster/ec/src/ec-heald.c @@ -62,7 +62,7 @@ __ec_shd_healer_wait(struct subvol_healer *healer) ec = healer->this->private; disabled_loop: - wait_till.tv_sec = time(NULL) + ec->shd.timeout; + wait_till.tv_sec = gf_time() + ec->shd.timeout; while (!healer->rerun) { ret = pthread_cond_timedwait(&healer->cond, &healer->mutex, &wait_till); @@ -156,15 +156,58 @@ ec_shd_index_purge(xlator_t *subvol, inode_t *inode, char *name) return ret; } +static gf_boolean_t +ec_is_heal_completed(char *status) +{ + char *bad_pos = NULL; + char *zero_pos = NULL; + + if (!status) { + return _gf_false; + } + + /*Logic: + * Status will be of the form Good: <binary>, Bad: <binary> + * If heal completes, if we do strchr for '0' it should be present after + * 'Bad:' i.e. 
strRchr for ':' + * */ + + zero_pos = strchr(status, '0'); + bad_pos = strrchr(status, ':'); + if (!zero_pos || !bad_pos) { + /*malformed status*/ + return _gf_false; + } + + if (zero_pos > bad_pos) { + return _gf_true; + } + + return _gf_false; +} + int ec_shd_selfheal(struct subvol_healer *healer, int child, loc_t *loc, gf_boolean_t full) { dict_t *xdata = NULL; + dict_t *dict = NULL; uint32_t count; int32_t ret; + char *heal_status = NULL; + ec_t *ec = healer->this->private; + + GF_ATOMIC_INC(ec->stats.shd.attempted); + ret = syncop_getxattr(healer->this, loc, &dict, EC_XATTR_HEAL, NULL, + &xdata); + if (ret == 0) { + if (dict && (dict_get_str(dict, EC_XATTR_HEAL, &heal_status) == 0)) { + if (ec_is_heal_completed(heal_status)) { + GF_ATOMIC_INC(ec->stats.shd.completed); + } + } + } - ret = syncop_getxattr(healer->this, loc, NULL, EC_XATTR_HEAL, NULL, &xdata); if (!full && (loc->inode->ia_type == IA_IFDIR)) { /* If we have just healed a directory, it's possible that * other index entries have appeared to be healed. */ @@ -183,6 +226,10 @@ ec_shd_selfheal(struct subvol_healer *healer, int child, loc_t *loc, dict_unref(xdata); } + if (dict) { + dict_unref(dict); + } + return ret; } diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h index 90da328e641..de9b89bb2c9 100644 --- a/xlators/cluster/ec/src/ec-types.h +++ b/xlators/cluster/ec/src/ec-types.h @@ -626,6 +626,11 @@ struct _ec_statistics { requests. (Basically memory allocation errors). 
*/ } stripe_cache; + struct { + gf_atomic_t attempted; /*Number of heals attempted on + files/directories*/ + gf_atomic_t completed; /*Number of heals complted on files/directories*/ + } shd; }; struct _ec { diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c index 66b4e634911..7344be4968d 100644 --- a/xlators/cluster/ec/src/ec.c +++ b/xlators/cluster/ec/src/ec.c @@ -325,13 +325,18 @@ ec_get_event_from_state(ec_t *ec) void ec_up(xlator_t *this, ec_t *ec) { + char str1[32], str2[32]; + if (ec->timer != NULL) { gf_timer_call_cancel(this->ctx, ec->timer); ec->timer = NULL; } ec->up = 1; - gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_EC_UP, "Going UP"); + gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_EC_UP, + "Going UP : Child UP = %s Child Notify = %s", + ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes), + ec_bin(str2, sizeof(str2), ec->xl_notify, ec->nodes)); gf_event(EVENT_EC_MIN_BRICKS_UP, "subvol=%s", this->name); } @@ -339,13 +344,18 @@ ec_up(xlator_t *this, ec_t *ec) void ec_down(xlator_t *this, ec_t *ec) { + char str1[32], str2[32]; + if (ec->timer != NULL) { gf_timer_call_cancel(this->ctx, ec->timer); ec->timer = NULL; } ec->up = 0; - gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_EC_DOWN, "Going DOWN"); + gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_EC_DOWN, + "Going DOWN : Child UP = %s Child Notify = %s", + ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes), + ec_bin(str2, sizeof(str2), ec->xl_notify, ec->nodes)); gf_event(EVENT_EC_MIN_BRICKS_NOT_UP, "subvol=%s", this->name); } @@ -700,6 +710,8 @@ ec_statistics_init(ec_t *ec) GF_ATOMIC_INIT(ec->stats.stripe_cache.evicts, 0); GF_ATOMIC_INIT(ec->stats.stripe_cache.allocs, 0); GF_ATOMIC_INIT(ec->stats.stripe_cache.errors, 0); + GF_ATOMIC_INIT(ec->stats.shd.attempted, 0); + GF_ATOMIC_INIT(ec->stats.shd.completed, 0); } static int @@ -1569,6 +1581,10 @@ ec_dump_private(xlator_t *this) GF_ATOMIC_GET(ec->stats.stripe_cache.allocs)); gf_proc_dump_write("errors", "%" GF_PRI_ATOMIC, 
GF_ATOMIC_GET(ec->stats.stripe_cache.errors)); + gf_proc_dump_write("heals-attempted", "%" GF_PRI_ATOMIC, + GF_ATOMIC_GET(ec->stats.shd.attempted)); + gf_proc_dump_write("heals-completed", "%" GF_PRI_ATOMIC, + GF_ATOMIC_GET(ec->stats.shd.completed)); return 0; } diff --git a/xlators/debug/error-gen/src/error-gen.c b/xlators/debug/error-gen/src/error-gen.c index 0158e8d8546..d45655ef4c3 100644 --- a/xlators/debug/error-gen/src/error-gen.c +++ b/xlators/debug/error-gen/src/error-gen.c @@ -1507,8 +1507,8 @@ init(xlator_t *this) this->private = pvt; - /* Give some seed value here */ - srand(time(NULL)); + /* Give some seed value here. */ + srand(gf_time()); ret = 0; out: diff --git a/xlators/debug/io-stats/src/io-stats.c b/xlators/debug/io-stats/src/io-stats.c index 345dbe7e09c..aa00c446e5a 100644 --- a/xlators/debug/io-stats/src/io-stats.c +++ b/xlators/debug/io-stats/src/io-stats.c @@ -135,7 +135,7 @@ struct ios_global_stats { gf_atomic_t block_count_read[IOS_BLOCK_COUNT_SIZE]; gf_atomic_t fop_hits[GF_FOP_MAXVALUE]; gf_atomic_t upcall_hits[GF_UPCALL_FLAGS_MAXVALUE]; - struct timeval started_at; + time_t started_at; struct ios_lat latency[GF_FOP_MAXVALUE]; uint64_t nr_opens; uint64_t max_nr_opens; @@ -292,9 +292,7 @@ is_fop_latency_started(call_frame_t *frame) begin = &frame->begin; \ end = &frame->end; \ \ - elapsed = ((end->tv_sec - begin->tv_sec) * 1e9 + \ - (end->tv_nsec - begin->tv_nsec)) / \ - 1000; \ + elapsed = gf_tsdiff(begin, end) / 1000.0; \ throughput = op_ret / elapsed; \ \ conf = this->private; \ @@ -767,9 +765,8 @@ err: int io_stats_dump_global_to_json_logfp(xlator_t *this, - struct ios_global_stats *stats, - struct timeval *now, int interval, - FILE *logfp) + struct ios_global_stats *stats, time_t now, + int interval, FILE *logfp) { int i = 0; int j = 0; @@ -795,10 +792,7 @@ io_stats_dump_global_to_json_logfp(xlator_t *this, }; dict_t *xattr = NULL; - interval_sec = ((now->tv_sec * 1000000.0 + now->tv_usec) - - (stats->started_at.tv_sec * 1000000.0 + - 
stats->started_at.tv_usec)) / - 1000000.0; + interval_sec = (double)(now - stats->started_at); conf = this->private; @@ -950,8 +944,8 @@ io_stats_dump_global_to_json_logfp(xlator_t *this, } if (interval == -1) { - ios_log(this, logfp, "\"%s.%s.uptime\": %" PRId64 ",", key_prefix, - str_prefix, (uint64_t)(now->tv_sec - stats->started_at.tv_sec)); + ios_log(this, logfp, "\"%s.%s.uptime\": %" PRIu64 ",", key_prefix, + str_prefix, (uint64_t)(now - stats->started_at)); ios_log(this, logfp, "\"%s.%s.bytes_read\": " "%" GF_PRI_ATOMIC ",", @@ -1203,7 +1197,7 @@ out: int io_stats_dump_global_to_logfp(xlator_t *this, struct ios_global_stats *stats, - struct timeval *now, int interval, FILE *logfp) + time_t now, int interval, FILE *logfp) { int i = 0; int per_line = 0; @@ -1226,8 +1220,8 @@ io_stats_dump_global_to_logfp(xlator_t *this, struct ios_global_stats *stats, ios_log(this, logfp, "\n=== Cumulative stats ==="); else ios_log(this, logfp, "\n=== Interval %d stats ===", interval); - ios_log(this, logfp, " Duration : %" PRId64 " secs", - (uint64_t)(now->tv_sec - stats->started_at.tv_sec)); + ios_log(this, logfp, " Duration : %" PRIu64 " secs", + (uint64_t)(now - stats->started_at)); ios_log(this, logfp, " BytesRead : %" GF_PRI_ATOMIC, GF_ATOMIC_GET(stats->data_read)); ios_log(this, logfp, " BytesWritten : %" GF_PRI_ATOMIC "\n", @@ -1372,7 +1366,7 @@ io_stats_dump_global_to_logfp(xlator_t *this, struct ios_global_stats *stats, int io_stats_dump_global_to_dict(xlator_t *this, struct ios_global_stats *stats, - struct timeval *now, int interval, dict_t *dict) + time_t now, int interval, dict_t *dict) { int ret = 0; char key[64] = {0}; @@ -1398,7 +1392,7 @@ io_stats_dump_global_to_dict(xlator_t *this, struct ios_global_stats *stats, interval); snprintf(key, sizeof(key), "%d-duration", interval); - sec = (uint64_t)(now->tv_sec - stats->started_at.tv_sec); + sec = now - stats->started_at; ret = dict_set_uint64(dict, key, sec); if (ret) { gf_log(this->name, GF_LOG_ERROR, @@ 
-1521,9 +1515,8 @@ out: } int -io_stats_dump_global(xlator_t *this, struct ios_global_stats *stats, - struct timeval *now, int interval, - struct ios_dump_args *args) +io_stats_dump_global(xlator_t *this, struct ios_global_stats *stats, time_t now, + int interval, struct ios_dump_args *args) { int ret = -1; @@ -1581,13 +1574,13 @@ ios_dump_args_init(struct ios_dump_args *args, ios_dump_type_t type, } static void -ios_global_stats_clear(struct ios_global_stats *stats, struct timeval *now) +ios_global_stats_clear(struct ios_global_stats *stats, time_t now) { GF_ASSERT(stats); GF_ASSERT(now); memset(stats, 0, sizeof(*stats)); - stats->started_at = *now; + stats->started_at = now; } int @@ -1598,7 +1591,7 @@ io_stats_dump(xlator_t *this, struct ios_dump_args *args, ios_info_op_t op, struct ios_global_stats cumulative = {}; struct ios_global_stats incremental = {}; int increment = 0; - struct timeval now; + time_t now = 0; GF_ASSERT(this); GF_ASSERT(args); @@ -1606,8 +1599,8 @@ io_stats_dump(xlator_t *this, struct ios_dump_args *args, ios_info_op_t op, GF_ASSERT(args->type < IOS_DUMP_TYPE_MAX); conf = this->private; + now = gf_time(); - gettimeofday(&now, NULL); LOCK(&conf->lock); { if (op == GF_IOS_INFO_ALL || op == GF_IOS_INFO_CUMULATIVE) @@ -1620,17 +1613,17 @@ io_stats_dump(xlator_t *this, struct ios_dump_args *args, ios_info_op_t op, if (!is_peek) { increment = conf->increment++; - ios_global_stats_clear(&conf->incremental, &now); + ios_global_stats_clear(&conf->incremental, now); } } } UNLOCK(&conf->lock); if (op == GF_IOS_INFO_ALL || op == GF_IOS_INFO_CUMULATIVE) - io_stats_dump_global(this, &cumulative, &now, -1, args); + io_stats_dump_global(this, &cumulative, now, -1, args); if (op == GF_IOS_INFO_ALL || op == GF_IOS_INFO_INCREMENTAL) - io_stats_dump_global(this, &incremental, &now, increment, args); + io_stats_dump_global(this, &incremental, now, increment, args); return 0; } @@ -1640,9 +1633,8 @@ io_stats_dump_fd(xlator_t *this, struct ios_fd *iosfd) { struct 
ios_conf *conf = NULL; struct timeval now; - uint64_t sec = 0; - uint64_t usec = 0; int i = 0; + double usecs = 0; uint64_t data_read = 0; uint64_t data_written = 0; uint64_t block_count_read = 0; @@ -1657,23 +1649,15 @@ io_stats_dump_fd(xlator_t *this, struct ios_fd *iosfd) return 0; gettimeofday(&now, NULL); - - if (iosfd->opened_at.tv_usec > now.tv_usec) { - now.tv_usec += 1000000; - now.tv_usec--; - } - - sec = now.tv_sec - iosfd->opened_at.tv_sec; - usec = now.tv_usec - iosfd->opened_at.tv_usec; + usecs = gf_tvdiff(&iosfd->opened_at, &now); gf_log(this->name, GF_LOG_INFO, "--- fd stats ---"); if (iosfd->filename) gf_log(this->name, GF_LOG_INFO, " Filename : %s", iosfd->filename); - if (sec) - gf_log(this->name, GF_LOG_INFO, - " Lifetime : %" PRId64 "secs, %" PRId64 "usecs", sec, usec); + if (usecs) + gf_log(this->name, GF_LOG_INFO, " Lifetime : %lf secs", usecs); data_read = GF_ATOMIC_GET(iosfd->data_read); if (data_read) @@ -1776,9 +1760,7 @@ update_ios_latency(struct ios_conf *conf, call_frame_t *frame, begin = &frame->begin; end = &frame->end; - elapsed = ((end->tv_sec - begin->tv_sec) * 1e9 + - (end->tv_nsec - begin->tv_nsec)) / - 1000; + elapsed = gf_tsdiff(begin, end) / 1000.0; update_ios_latency_stats(&conf->cumulative, elapsed, op); update_ios_latency_stats(&conf->incremental, elapsed, op); @@ -3592,26 +3574,21 @@ ios_destroy_top_stats(struct ios_conf *conf) return; } -static int +static void io_stats_clear(struct ios_conf *conf) { - struct timeval now; - int ret = -1; + time_t now = 0; GF_ASSERT(conf); + now = gf_time(); - if (!gettimeofday(&now, NULL)) { - LOCK(&conf->lock); - { - ios_global_stats_clear(&conf->cumulative, &now); - ios_global_stats_clear(&conf->incremental, &now); - conf->increment = 0; - } - UNLOCK(&conf->lock); - ret = 0; + LOCK(&conf->lock); + { + ios_global_stats_clear(&conf->cumulative, now); + ios_global_stats_clear(&conf->incremental, now); + conf->increment = 0; } - - return ret; + UNLOCK(&conf->lock); } int32_t @@ -3852,7 
+3829,7 @@ ios_conf_destroy(struct ios_conf *conf) _ios_destroy_dump_thread(conf); ios_destroy_sample_buf(conf->ios_sample_buf); LOCK_DESTROY(&conf->lock); - GF_FREE(conf->dnscache); + gf_dnscache_deinit(conf->dnscache); GF_FREE(conf); } @@ -3875,7 +3852,7 @@ ios_init_stats(struct ios_global_stats *stats) for (i = 0; i < GF_UPCALL_FLAGS_MAXVALUE; i++) GF_ATOMIC_INIT(stats->upcall_hits[i], 0); - gettimeofday(&stats->started_at, NULL); + stats->started_at = gf_time(); } int @@ -3964,11 +3941,14 @@ init(xlator_t *this) gf_log(this->name, GF_LOG_ERROR, "Out of memory."); goto out; } - ret = -1; GF_OPTION_INIT("ios-dnscache-ttl-sec", conf->ios_dnscache_ttl_sec, int32, out); conf->dnscache = gf_dnscache_init(conf->ios_dnscache_ttl_sec); + if (!conf->dnscache) { + ret = -1; + goto out; + } GF_OPTION_INIT("sys-log-level", sys_log_str, str, out); if (sys_log_str) { @@ -4119,12 +4099,9 @@ notify(xlator_t *this, int32_t event, void *data, ...) } if (GF_IOS_INFO_CLEAR == op) { - ret = io_stats_clear(this->private); - if (ret) - gf_log(this->name, GF_LOG_ERROR, - "Failed to clear info stats"); + io_stats_clear(this->private); - ret = dict_set_int32(output, "stats-cleared", ret ? 
0 : 1); + ret = dict_set_int32(output, "stats-cleared", 1); if (ret) gf_log(this->name, GF_LOG_ERROR, "Failed to set stats-cleared" diff --git a/xlators/features/Makefile.am b/xlators/features/Makefile.am index 194634b003d..c57897f11ea 100644 --- a/xlators/features/Makefile.am +++ b/xlators/features/Makefile.am @@ -2,9 +2,13 @@ if BUILD_CLOUDSYNC CLOUDSYNC_DIR = cloudsync endif +if BUILD_METADISP + METADISP_DIR = metadisp +endif + SUBDIRS = locks quota read-only quiesce marker index barrier arbiter upcall \ compress changelog gfid-access snapview-client snapview-server trash \ shard bit-rot leases selinux sdfs namespace $(CLOUDSYNC_DIR) thin-arbiter \ - utime + utime $(METADISP_DIR) CLEANFILES = diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.c b/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.c index 34e20f9df11..5cef2ffa5e5 100644 --- a/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.c +++ b/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.c @@ -40,21 +40,21 @@ br_inc_scrubbed_file(br_scrub_stats_t *scrub_stat) } void -br_update_scrub_start_time(br_scrub_stats_t *scrub_stat, struct timeval *tv) +br_update_scrub_start_time(br_scrub_stats_t *scrub_stat, time_t time) { if (!scrub_stat) return; pthread_mutex_lock(&scrub_stat->lock); { - scrub_stat->scrub_start_tv.tv_sec = tv->tv_sec; + scrub_stat->scrub_start_time = time; } pthread_mutex_unlock(&scrub_stat->lock); } void br_update_scrub_finish_time(br_scrub_stats_t *scrub_stat, char *timestr, - struct timeval *tv) + time_t time) { int lst_size = 0; @@ -67,10 +67,10 @@ br_update_scrub_finish_time(br_scrub_stats_t *scrub_stat, char *timestr, pthread_mutex_lock(&scrub_stat->lock); { - scrub_stat->scrub_end_tv.tv_sec = tv->tv_sec; + scrub_stat->scrub_end_time = time; - scrub_stat->scrub_duration = scrub_stat->scrub_end_tv.tv_sec - - scrub_stat->scrub_start_tv.tv_sec; + scrub_stat->scrub_duration = scrub_stat->scrub_end_time - + scrub_stat->scrub_start_time; 
snprintf(scrub_stat->last_scrub_time, lst_size, "%s", timestr); } diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.h b/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.h index 24128b90a66..f022aa831eb 100644 --- a/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.h +++ b/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.h @@ -15,20 +15,22 @@ #include <sys/time.h> #include <pthread.h> +#include <glusterfs/common-utils.h> + struct br_scrub_stats { - uint64_t scrubbed_files; /* Total number of scrubbed file */ + uint64_t scrubbed_files; /* Total number of scrubbed files. */ - uint64_t unsigned_files; /* Total number of unsigned file */ + uint64_t unsigned_files; /* Total number of unsigned files. */ - uint64_t scrub_duration; /* Duration of last scrub */ + uint64_t scrub_duration; /* Duration of last scrub. */ - char last_scrub_time[1024]; /*last scrub completion time */ + char last_scrub_time[GF_TIMESTR_SIZE]; /* Last scrub completion time. */ - struct timeval scrub_start_tv; /* Scrubbing starting time*/ + time_t scrub_start_time; /* Scrubbing starting time. */ - struct timeval scrub_end_tv; /* Scrubbing finishing time */ + time_t scrub_end_time; /* Scrubbing finishing time. */ - int8_t scrub_running; /* Scrub running or not */ + int8_t scrub_running; /* Whether scrub running or not. 
*/ pthread_mutex_t lock; }; @@ -40,9 +42,9 @@ br_inc_unsigned_file_count(br_scrub_stats_t *scrub_stat); void br_inc_scrubbed_file(br_scrub_stats_t *scrub_stat); void -br_update_scrub_start_time(br_scrub_stats_t *scrub_stat, struct timeval *tv); +br_update_scrub_start_time(br_scrub_stats_t *scrub_stat, time_t time); void br_update_scrub_finish_time(br_scrub_stats_t *scrub_stat, char *timestr, - struct timeval *tv); + time_t time); #endif /* __BIT_ROT_SCRUB_STATUS_H__ */ diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-scrub.c b/xlators/features/bit-rot/src/bitd/bit-rot-scrub.c index c4607654365..289dd53f610 100644 --- a/xlators/features/bit-rot/src/bitd/bit-rot-scrub.c +++ b/xlators/features/bit-rot/src/bitd/bit-rot-scrub.c @@ -604,22 +604,20 @@ br_scrubber_log_time(xlator_t *this, const char *sfx) char timestr[GF_TIMESTR_SIZE] = { 0, }; - struct timeval tv = { - 0, - }; br_private_t *priv = NULL; + time_t now = 0; + now = gf_time(); priv = this->private; - gettimeofday(&tv, NULL); - gf_time_fmt(timestr, sizeof(timestr), tv.tv_sec, gf_timefmt_FT); + gf_time_fmt(timestr, sizeof(timestr), now, gf_timefmt_FT); if (strcasecmp(sfx, "started") == 0) { - br_update_scrub_start_time(&priv->scrub_stat, &tv); + br_update_scrub_start_time(&priv->scrub_stat, now); gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_START, "Scrubbing %s at %s", sfx, timestr); } else { - br_update_scrub_finish_time(&priv->scrub_stat, timestr, &tv); + br_update_scrub_finish_time(&priv->scrub_stat, timestr, now); gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_FINISH, "Scrubbing %s at %s", sfx, timestr); } @@ -631,12 +629,10 @@ br_fsscanner_log_time(xlator_t *this, br_child_t *child, const char *sfx) char timestr[GF_TIMESTR_SIZE] = { 0, }; - struct timeval tv = { - 0, - }; + time_t now = 0; - gettimeofday(&tv, NULL); - gf_time_fmt(timestr, sizeof(timestr), tv.tv_sec, gf_timefmt_FT); + now = gf_time(); + gf_time_fmt(timestr, sizeof(timestr), now, gf_timefmt_FT); if (strcasecmp(sfx, "started") == 
0) { gf_msg_debug(this->name, 0, "Scrubbing \"%s\" %s at %s", @@ -919,9 +915,6 @@ br_fsscan_schedule(xlator_t *this) { uint32_t timo = 0; br_private_t *priv = NULL; - struct timeval tv = { - 0, - }; char timestr[GF_TIMESTR_SIZE] = { 0, }; @@ -933,8 +926,7 @@ br_fsscan_schedule(xlator_t *this) fsscrub = &priv->fsscrub; scrub_monitor = &priv->scrub_monitor; - (void)gettimeofday(&tv, NULL); - scrub_monitor->boot = tv.tv_sec; + scrub_monitor->boot = gf_time(); timo = br_fsscan_calculate_timeout(fsscrub->frequency); if (timo == 0) { @@ -978,9 +970,7 @@ br_fsscan_activate(xlator_t *this) char timestr[GF_TIMESTR_SIZE] = { 0, }; - struct timeval now = { - 0, - }; + time_t now = 0; br_private_t *priv = NULL; struct br_scrubber *fsscrub = NULL; struct br_monitor *scrub_monitor = NULL; @@ -989,7 +979,7 @@ br_fsscan_activate(xlator_t *this) fsscrub = &priv->fsscrub; scrub_monitor = &priv->scrub_monitor; - (void)gettimeofday(&now, NULL); + now = gf_time(); timo = br_fsscan_calculate_timeout(fsscrub->frequency); if (timo == 0) { gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_ZERO_TIMEOUT_BUG, @@ -1003,7 +993,7 @@ br_fsscan_activate(xlator_t *this) } pthread_mutex_unlock(&scrub_monitor->donelock); - gf_time_fmt(timestr, sizeof(timestr), (now.tv_sec + timo), gf_timefmt_FT); + gf_time_fmt(timestr, sizeof(timestr), now + timo, gf_timefmt_FT); (void)gf_tw_mod_timer(priv->timer_wheel, scrub_monitor->timer, timo); _br_monitor_set_scrub_state(scrub_monitor, BR_SCRUB_STATE_PENDING); @@ -1023,9 +1013,7 @@ br_fsscan_reschedule(xlator_t *this) char timestr[GF_TIMESTR_SIZE] = { 0, }; - struct timeval now = { - 0, - }; + time_t now = 0; br_private_t *priv = NULL; struct br_scrubber *fsscrub = NULL; struct br_monitor *scrub_monitor = NULL; @@ -1037,7 +1025,7 @@ br_fsscan_reschedule(xlator_t *this) if (!fsscrub->frequency_reconf) return 0; - (void)gettimeofday(&now, NULL); + now = gf_time(); timo = br_fsscan_calculate_timeout(fsscrub->frequency); if (timo == 0) { gf_msg(this->name, GF_LOG_ERROR, 
0, BRB_MSG_ZERO_TIMEOUT_BUG, @@ -1045,7 +1033,7 @@ br_fsscan_reschedule(xlator_t *this) return -1; } - gf_time_fmt(timestr, sizeof(timestr), (now.tv_sec + timo), gf_timefmt_FT); + gf_time_fmt(timestr, sizeof(timestr), now + timo, gf_timefmt_FT); pthread_mutex_lock(&scrub_monitor->donelock); { @@ -1076,20 +1064,16 @@ br_fsscan_ondemand(xlator_t *this) char timestr[GF_TIMESTR_SIZE] = { 0, }; - struct timeval now = { - 0, - }; + time_t now = 0; br_private_t *priv = NULL; struct br_monitor *scrub_monitor = NULL; priv = this->private; scrub_monitor = &priv->scrub_monitor; - (void)gettimeofday(&now, NULL); - + now = gf_time(); timo = BR_SCRUB_ONDEMAND; - - gf_time_fmt(timestr, sizeof(timestr), (now.tv_sec + timo), gf_timefmt_FT); + gf_time_fmt(timestr, sizeof(timestr), now + timo, gf_timefmt_FT); pthread_mutex_lock(&scrub_monitor->donelock); { diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub-messages.h b/xlators/features/bit-rot/src/stub/bit-rot-stub-messages.h index 8d2b7f051da..6c15a166f18 100644 --- a/xlators/features/bit-rot/src/stub/bit-rot-stub-messages.h +++ b/xlators/features/bit-rot/src/stub/bit-rot-stub-messages.h @@ -44,7 +44,8 @@ GLFS_MSGID(BITROT_STUB, BRS_MSG_NO_MEMORY, BRS_MSG_SET_EVENT_FAILED, BRS_MSG_NON_BITD_PID, BRS_MSG_SIGN_PREPARE_FAIL, BRS_MSG_USING_DEFAULT_THREAD_SIZE, BRS_MSG_ALLOC_MEM_FAILED, BRS_MSG_DICT_ALLOC_FAILED, BRS_MSG_CREATE_GF_DIRENT_FAILED, - BRS_MSG_ALLOC_FAILED, BRS_MSG_PATH_XATTR_GET_FAILED); + BRS_MSG_ALLOC_FAILED, BRS_MSG_PATH_XATTR_GET_FAILED, + BRS_MSG_VERSION_PREPARE_FAIL); #define BRS_MSG_MEM_ACNT_FAILED_STR "Memory accounting init failed" #define BRS_MSG_BAD_OBJ_THREAD_FAIL_STR "pthread_init failed" @@ -68,6 +69,8 @@ GLFS_MSGID(BITROT_STUB, BRS_MSG_NO_MEMORY, BRS_MSG_SET_EVENT_FAILED, "daemon. Unwinding the fop" #define BRS_MSG_SIGN_PREPARE_FAIL_STR \ "failed to prepare the signature. Unwinding the fop" +#define BRS_MSG_VERSION_PREPARE_FAIL_STR \ + "failed to prepare the version. 
Unwinding the fop" #define BRS_MSG_STUB_ALLOC_FAILED_STR "failed to allocate stub fop, Unwinding" #define BRS_MSG_BAD_OBJ_MARK_FAIL_STR "failed to mark object as bad" #define BRS_MSG_NON_SCRUB_BAD_OBJ_MARK_STR \ diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub.c b/xlators/features/bit-rot/src/stub/bit-rot-stub.c index 605a5e4c3e4..447dd47ff41 100644 --- a/xlators/features/bit-rot/src/stub/bit-rot-stub.c +++ b/xlators/features/bit-rot/src/stub/bit-rot-stub.c @@ -424,8 +424,8 @@ br_stub_prepare_version_request(xlator_t *this, dict_t *dict, priv = this->private; br_set_ongoingversion(obuf, oversion, priv->boot); - return dict_set_static_bin(dict, BITROT_CURRENT_VERSION_KEY, (void *)obuf, - sizeof(br_version_t)); + return dict_set_bin(dict, BITROT_CURRENT_VERSION_KEY, (void *)obuf, + sizeof(br_version_t)); } static int @@ -436,8 +436,7 @@ br_stub_prepare_signing_request(dict_t *dict, br_signature_t *sbuf, br_set_signature(sbuf, sign, signaturelen, &size); - return dict_set_static_bin(dict, BITROT_SIGNING_VERSION_KEY, (void *)sbuf, - size); + return dict_set_bin(dict, BITROT_SIGNING_VERSION_KEY, (void *)sbuf, size); } /** @@ -854,23 +853,27 @@ br_stub_perform_incversioning(xlator_t *this, call_frame_t *frame, op_errno = ENOMEM; dict = dict_new(); if (!dict) - goto done; + goto out; ret = br_stub_alloc_versions(&obuf, NULL, 0); - if (ret) - goto dealloc_dict; + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_ALLOC_MEM_FAILED, + "gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + goto out; + } ret = br_stub_prepare_version_request(this, dict, obuf, writeback_version); - if (ret) - goto dealloc_versions; + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_VERSION_PREPARE_FAIL, + "gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + br_stub_dealloc_versions(obuf); + goto out; + } ret = br_stub_fd_versioning( this, frame, stub, dict, fd, br_stub_fd_incversioning_cbk, writeback_version, BR_STUB_INCREMENTAL_VERSIONING, !WRITEBACK_DURABLE); - 
-dealloc_versions: - br_stub_dealloc_versions(obuf); -dealloc_dict: - dict_unref(dict); -done: +out: + if (dict) + dict_unref(dict); if (ret) { if (local) frame->local = NULL; @@ -1025,31 +1028,36 @@ static int br_stub_prepare_signature(xlator_t *this, dict_t *dict, inode_t *inode, br_isignature_t *sign, int *fakesuccess) { - int32_t ret = 0; + int32_t ret = -1; size_t signaturelen = 0; br_signature_t *sbuf = NULL; if (!br_is_signature_type_valid(sign->signaturetype)) - goto error_return; + goto out; signaturelen = sign->signaturelen; ret = br_stub_alloc_versions(NULL, &sbuf, signaturelen); - if (ret) - goto error_return; + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_ALLOC_MEM_FAILED, + "gfid=%s", uuid_utoa(inode->gfid), NULL); + ret = -1; + goto out; + } ret = br_stub_prepare_signing_request(dict, sbuf, sign, signaturelen); - if (ret) - goto dealloc_versions; + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_SIGN_PREPARE_FAIL, + "gfid=%s", uuid_utoa(inode->gfid), NULL); + ret = -1; + br_stub_dealloc_versions(sbuf); + goto out; + } + /* At this point sbuf has been added to dict, so the memory will be freed + * when the data from the dict is destroyed + */ ret = br_stub_compare_sign_version(this, inode, sbuf, dict, fakesuccess); - if (ret) - goto dealloc_versions; - - return 0; - -dealloc_versions: - br_stub_dealloc_versions(sbuf); -error_return: - return -1; +out: + return ret; } static void diff --git a/xlators/features/changelog/src/changelog-helpers.c b/xlators/features/changelog/src/changelog-helpers.c index 71fe1f032a0..e561997d858 100644 --- a/xlators/features/changelog/src/changelog-helpers.c +++ b/xlators/features/changelog/src/changelog-helpers.c @@ -242,8 +242,7 @@ changelog_write(int fd, char *buffer, size_t len) } int -htime_update(xlator_t *this, changelog_priv_t *priv, unsigned long ts, - char *buffer) +htime_update(xlator_t *this, changelog_priv_t *priv, time_t ts, char *buffer) { char changelog_path[PATH_MAX + 1] = { 0, @@ 
-273,7 +272,7 @@ htime_update(xlator_t *this, changelog_priv_t *priv, unsigned long ts, goto out; } - len = snprintf(x_value, sizeof(x_value), "%lu:%d", ts, + len = snprintf(x_value, sizeof(x_value), "%ld:%d", ts, priv->rollover_count); if (len >= sizeof(x_value)) { ret = -1; @@ -382,8 +381,7 @@ out: } static int -changelog_rollover_changelog(xlator_t *this, changelog_priv_t *priv, - unsigned long ts) +changelog_rollover_changelog(xlator_t *this, changelog_priv_t *priv, time_t ts) { int ret = -1; int notify = 0; @@ -421,16 +419,14 @@ changelog_rollover_changelog(xlator_t *this, changelog_priv_t *priv, priv->changelog_fd = -1; } - time_t time = (time_t)ts; - - /* Get GMT time */ - gmt = gmtime(&time); + /* Get GMT time. */ + gmt = gmtime(&ts); strftime(yyyymmdd, sizeof(yyyymmdd), "%Y/%m/%d", gmt); (void)snprintf(ofile, PATH_MAX, "%s/" CHANGELOG_FILE_NAME, priv->changelog_dir); - (void)snprintf(nfile, PATH_MAX, "%s/%s/" CHANGELOG_FILE_NAME ".%lu", + (void)snprintf(nfile, PATH_MAX, "%s/%s/" CHANGELOG_FILE_NAME ".%ld", priv->changelog_dir, yyyymmdd, ts); (void)snprintf(nfile_dir, PATH_MAX, "%s/%s", priv->changelog_dir, yyyymmdd); @@ -593,7 +589,7 @@ out: * returns -1 on failure or error */ int -htime_open(xlator_t *this, changelog_priv_t *priv, unsigned long ts) +htime_open(xlator_t *this, changelog_priv_t *priv, time_t ts) { int ht_file_fd = -1; int ht_dir_fd = -1; @@ -723,7 +719,7 @@ out: * returns -1 on failure or error */ int -htime_create(xlator_t *this, changelog_priv_t *priv, unsigned long ts) +htime_create(xlator_t *this, changelog_priv_t *priv, time_t ts) { int ht_file_fd = -1; int ht_dir_fd = -1; @@ -741,12 +737,12 @@ htime_create(xlator_t *this, changelog_priv_t *priv, unsigned long ts) int32_t len = 0; gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_NEW_HTIME_FILE, - "name=%lu", ts, NULL); + "name=%ld", ts, NULL); CHANGELOG_FILL_HTIME_DIR(priv->changelog_dir, ht_dir_path); /* get the htime file name in ht_file_path */ - len = snprintf(ht_file_path, 
PATH_MAX, "%s/%s.%lu", ht_dir_path, + len = snprintf(ht_file_path, PATH_MAX, "%s/%s.%ld", ht_dir_path, HTIME_FILE_NAME, ts); if ((len < 0) || (len >= PATH_MAX)) { ret = -1; @@ -792,7 +788,7 @@ htime_create(xlator_t *this, changelog_priv_t *priv, unsigned long ts) goto out; } - (void)snprintf(ht_file_bname, sizeof(ht_file_bname), "%s.%lu", + (void)snprintf(ht_file_bname, sizeof(ht_file_bname), "%s.%ld", HTIME_FILE_NAME, ts); if (sys_fsetxattr(ht_dir_fd, HTIME_CURRENT, ht_file_bname, strlen(ht_file_bname), 0)) { @@ -963,8 +959,8 @@ out: } int -changelog_start_next_change(xlator_t *this, changelog_priv_t *priv, - unsigned long ts, gf_boolean_t finale) +changelog_start_next_change(xlator_t *this, changelog_priv_t *priv, time_t ts, + gf_boolean_t finale) { int ret = -1; @@ -985,21 +981,12 @@ changelog_entry_length() return sizeof(changelog_log_data_t); } -int +void changelog_fill_rollover_data(changelog_log_data_t *cld, gf_boolean_t is_last) { - struct timeval tv = { - 0, - }; - cld->cld_type = CHANGELOG_TYPE_ROLLOVER; - - if (gettimeofday(&tv, NULL)) - return -1; - - cld->cld_roll_time = (unsigned long)tv.tv_sec; + cld->cld_roll_time = gf_time(); cld->cld_finale = is_last; - return 0; } int @@ -1274,7 +1261,7 @@ changelog_rollover(void *data) while (1) { (void)pthread_testcancel(); - tv.tv_sec = time(NULL) + priv->rollover_time; + tv.tv_sec = gf_time() + priv->rollover_time; tv.tv_nsec = 0; ret = 0; /* Reset ret to zero */ @@ -1355,12 +1342,7 @@ changelog_rollover(void *data) if (priv->explicit_rollover == _gf_true) sleep(1); - ret = changelog_fill_rollover_data(&cld, _gf_false); - if (ret) { - gf_smsg(this->name, GF_LOG_ERROR, 0, - CHANGELOG_MSG_ROLLOVER_DATA_FILL_FAILED, NULL); - continue; - } + changelog_fill_rollover_data(&cld, _gf_false); _mask_cancellation(); diff --git a/xlators/features/changelog/src/changelog-helpers.h b/xlators/features/changelog/src/changelog-helpers.h index 0d06d98c9e1..38fa7590c32 100644 --- 
a/xlators/features/changelog/src/changelog-helpers.h +++ b/xlators/features/changelog/src/changelog-helpers.h @@ -31,7 +31,7 @@ */ typedef struct changelog_log_data { /* rollover related */ - unsigned long cld_roll_time; + time_t cld_roll_time; /* reopen changelog? */ gf_boolean_t cld_finale; @@ -97,12 +97,6 @@ struct changelog_encoder { typedef struct changelog_time_slice { /** - * just in case we need nanosecond granularity some day. - * field is unused as of now (maybe we'd need it later). - */ - struct timeval tv_start; - - /** * version of changelog file, incremented each time changes * rollover. */ @@ -423,11 +417,11 @@ changelog_local_t * changelog_local_init(xlator_t *this, inode_t *inode, uuid_t gfid, int xtra_records, gf_boolean_t update_flag); int -changelog_start_next_change(xlator_t *this, changelog_priv_t *priv, - unsigned long ts, gf_boolean_t finale); +changelog_start_next_change(xlator_t *this, changelog_priv_t *priv, time_t ts, + gf_boolean_t finale); int changelog_open_journal(xlator_t *this, changelog_priv_t *priv); -int +void changelog_fill_rollover_data(changelog_log_data_t *cld, gf_boolean_t is_last); int changelog_inject_single_event(xlator_t *this, changelog_priv_t *priv, @@ -451,12 +445,11 @@ changelog_fsync_thread(void *data); int changelog_forget(xlator_t *this, inode_t *inode); int -htime_update(xlator_t *this, changelog_priv_t *priv, unsigned long ts, - char *buffer); +htime_update(xlator_t *this, changelog_priv_t *priv, time_t ts, char *buffer); int -htime_open(xlator_t *this, changelog_priv_t *priv, unsigned long ts); +htime_open(xlator_t *this, changelog_priv_t *priv, time_t ts); int -htime_create(xlator_t *this, changelog_priv_t *priv, unsigned long ts); +htime_create(xlator_t *this, changelog_priv_t *priv, time_t ts); /* Geo-Rep snapshot dependency changes */ void diff --git a/xlators/features/changelog/src/changelog-messages.h b/xlators/features/changelog/src/changelog-messages.h index 4dd56b8ee97..cb0e16c85d8 100644 --- 
a/xlators/features/changelog/src/changelog-messages.h +++ b/xlators/features/changelog/src/changelog-messages.h @@ -59,12 +59,12 @@ GLFS_MSGID( CHANGELOG_MSG_NO_HTIME_CURRENT, CHANGELOG_MSG_HTIME_CURRENT, CHANGELOG_MSG_NEW_HTIME_FILE, CHANGELOG_MSG_MKDIR_ERROR, CHANGELOG_MSG_PATH_NOT_FOUND, CHANGELOG_MSG_XATTR_INIT_FAILED, - CHANGELOG_MSG_WROTE_TO_CSNAP, CHANGELOG_MSG_ROLLOVER_DATA_FILL_FAILED, + CHANGELOG_MSG_WROTE_TO_CSNAP, CHANGELOG_MSG_UNUSED_0, CHANGELOG_MSG_GET_BUFFER_FAILED, CHANGELOG_MSG_BARRIER_STATE_NOTIFY, CHANGELOG_MSG_BARRIER_DISABLED, CHANGELOG_MSG_BARRIER_ALREADY_DISABLED, CHANGELOG_MSG_BARRIER_ON_ERROR, CHANGELOG_MSG_BARRIER_ENABLE, CHANGELOG_MSG_BARRIER_KEY_NOT_FOUND, CHANGELOG_MSG_ERROR_IN_DICT_GET, - CHANGELOG_MSG_GET_TIME_FAILURE, CHANGELOG_MSG_HTIME_FETCH_FAILED, + CHANGELOG_MSG_UNUSED_1, CHANGELOG_MSG_UNUSED_2, CHANGELOG_MSG_DEQUEUING_BARRIER_FOPS, CHANGELOG_MSG_DEQUEUING_BARRIER_FOPS_FINISHED, CHANGELOG_MSG_BARRIER_TIMEOUT, CHANGELOG_MSG_TIMEOUT_ADD_FAILED, @@ -123,8 +123,6 @@ GLFS_MSGID( #define CHANGELOG_MSG_GET_TIME_OP_FAILED_STR "Problem rolling over changelog(s)" #define CHANGELOG_MSG_BARRIER_INFO_STR "Explicit wakeup on barrier notify" #define CHANGELOG_MSG_SELECT_FAILED_STR "pthread_cond_timedwait failed" -#define CHANGELOG_MSG_ROLLOVER_DATA_FILL_FAILED_STR \ - "failed to fill rollover data" #define CHANGELOG_MSG_INJECT_FSYNC_FAILED_STR "failed to inject fsync event" #define CHANGELOG_MSG_LOCAL_INIT_FAILED_STR \ "changelog local initialization failed" @@ -144,9 +142,7 @@ GLFS_MSGID( #define CHANGELOG_MSG_BARRIER_KEY_NOT_FOUND_STR "barrier key not found" #define CHANGELOG_MSG_ERROR_IN_DICT_GET_STR \ "Something went wrong in dict_get_str_boolean" -#define CHANGELOG_MSG_GET_TIME_FAILURE_STR "gettimeofday() failure" #define CHANGELOG_MSG_DIR_OPTIONS_NOT_SET_STR "changelog-dir option is not set" -#define CHANGELOG_MSG_HTIME_FETCH_FAILED_STR "unable to fetch htime" #define CHANGELOG_MSG_FREEUP_FAILED_STR "could not cleanup bootstrapper" 
#define CHANGELOG_MSG_CHILD_MISCONFIGURED_STR \ "translator needs a single subvolume" diff --git a/xlators/features/changelog/src/changelog.c b/xlators/features/changelog/src/changelog.c index 9a7d158cbf2..6a6e5af859e 100644 --- a/xlators/features/changelog/src/changelog.c +++ b/xlators/features/changelog/src/changelog.c @@ -2252,23 +2252,11 @@ static int changelog_init(xlator_t *this, changelog_priv_t *priv) { int i = 0; - int ret = -1; - struct timeval tv = { - 0, - }; + int ret = 0; changelog_log_data_t cld = { 0, }; - ret = gettimeofday(&tv, NULL); - if (ret) { - gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_GET_TIME_FAILURE, - NULL); - goto out; - } - - priv->slice.tv_start = tv; - priv->maps[CHANGELOG_TYPE_DATA] = "D "; priv->maps[CHANGELOG_TYPE_METADATA] = "M "; priv->maps[CHANGELOG_TYPE_METADATA_XATTR] = "M "; @@ -2287,9 +2275,7 @@ changelog_init(xlator_t *this, changelog_priv_t *priv) * in case there was an encoding change. so... things are kept * simple here. */ - ret = changelog_fill_rollover_data(&cld, _gf_false); - if (ret) - goto out; + changelog_fill_rollover_data(&cld, _gf_false); ret = htime_open(this, priv, cld.cld_roll_time); /* call htime open with cld's rollover_time */ @@ -2470,9 +2456,6 @@ reconfigure(xlator_t *this, dict_t *options) char csnap_dir[PATH_MAX] = { 0, }; - struct timeval tv = { - 0, - }; uint32_t timeout = 0; priv = this->private; @@ -2564,9 +2547,7 @@ reconfigure(xlator_t *this, dict_t *options) out); if (active_now || active_earlier) { - ret = changelog_fill_rollover_data(&cld, !active_now); - if (ret) - goto out; + changelog_fill_rollover_data(&cld, !active_now); slice = &priv->slice; @@ -2585,13 +2566,7 @@ reconfigure(xlator_t *this, dict_t *options) if (!active_earlier) { gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_RECONFIGURE, NULL); - if (gettimeofday(&tv, NULL)) { - gf_smsg(this->name, GF_LOG_ERROR, 0, - CHANGELOG_MSG_HTIME_FETCH_FAILED, NULL); - ret = -1; - goto out; - } - htime_create(this, priv, 
tv.tv_sec); + htime_create(this, priv, gf_time()); } ret = changelog_spawn_helper_threads(this, priv); } diff --git a/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3.c b/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3.c index 7680260988b..23c3599825a 100644 --- a/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3.c +++ b/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3.c @@ -237,7 +237,7 @@ aws_form_request(char *resource, char **date, char *reqtype, char *bucketid, int date_len = -1; int res_len = -1; - ctime = time(NULL); + ctime = gf_time(); gtime = gmtime(&ctime); date_len = strftime(httpdate, sizeof(httpdate), diff --git a/xlators/features/index/src/index.c b/xlators/features/index/src/index.c index 4ece7ff6fc8..4abb2c73ce5 100644 --- a/xlators/features/index/src/index.c +++ b/xlators/features/index/src/index.c @@ -2104,7 +2104,7 @@ index_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) worker_enqueue(this, stub); return 0; normal: - ret = dict_get_str(xattr_req, "link-count", &flag); + ret = dict_get_str_sizen(xattr_req, "link-count", &flag); if ((ret == 0) && (strcmp(flag, GF_XATTROP_INDEX_COUNT) == 0)) { STACK_WIND(frame, index_lookup_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->lookup, loc, xattr_req); @@ -2592,7 +2592,7 @@ notify(xlator_t *this, int event, void *data, ...) 
if ((event == GF_EVENT_PARENT_DOWN) && victim->cleanup_starting) { stub_cnt = GF_ATOMIC_GET(priv->stub_cnt); - clock_gettime(CLOCK_REALTIME, &sleep_till); + timespec_now_realtime(&sleep_till); sleep_till.tv_sec += 1; /* Wait for draining stub from queue before notify PARENT_DOWN */ diff --git a/xlators/features/leases/src/leases-internal.c b/xlators/features/leases/src/leases-internal.c index 67fdd53cee2..56dee244281 100644 --- a/xlators/features/leases/src/leases-internal.c +++ b/xlators/features/leases/src/leases-internal.c @@ -897,7 +897,7 @@ __recall_lease(xlator_t *this, lease_inode_ctx_t *lease_ctx) } priv = this->private; - recall_time = time(NULL); + recall_time = gf_time(); list_for_each_entry_safe(lease_entry, tmp, &lease_ctx->lease_id_list, lease_id_list) { @@ -1367,7 +1367,7 @@ expired_recall_cleanup(void *data) gf_msg_debug(this->name, 0, "Started the expired_recall_cleanup thread"); while (1) { - time_now = time(NULL); + time_now = gf_time(); pthread_mutex_lock(&priv->mutex); { if (priv->fini) { diff --git a/xlators/features/locks/src/common.c b/xlators/features/locks/src/common.c index 1fbbae3541f..a2c6be93e03 100644 --- a/xlators/features/locks/src/common.c +++ b/xlators/features/locks/src/common.c @@ -605,13 +605,11 @@ static void __insert_lock(pl_inode_t *pl_inode, posix_lock_t *lock) { if (lock->blocked) - gettimeofday(&lock->blkd_time, NULL); + lock->blkd_time = gf_time(); else - gettimeofday(&lock->granted_time, NULL); + lock->granted_time = gf_time(); list_add_tail(&lock->list, &pl_inode->ext_list); - - return; } /* Return true if the locks overlap, false otherwise */ diff --git a/xlators/features/locks/src/entrylk.c b/xlators/features/locks/src/entrylk.c index d5babcc325c..fd772c850dd 100644 --- a/xlators/features/locks/src/entrylk.c +++ b/xlators/features/locks/src/entrylk.c @@ -121,7 +121,6 @@ __stale_entrylk(xlator_t *this, pl_entry_lock_t *candidate_lock, pl_entry_lock_t *requested_lock, time_t *lock_age_sec) { posix_locks_private_t *priv 
= NULL; - struct timeval curr; priv = this->private; @@ -129,8 +128,7 @@ __stale_entrylk(xlator_t *this, pl_entry_lock_t *candidate_lock, * chance? Or just the locks we are attempting to acquire? */ if (names_conflict(candidate_lock->basename, requested_lock->basename)) { - gettimeofday(&curr, NULL); - *lock_age_sec = curr.tv_sec - candidate_lock->granted_time.tv_sec; + *lock_age_sec = gf_time() - candidate_lock->granted_time; if (*lock_age_sec > priv->revocation_secs) return _gf_true; } @@ -544,14 +542,10 @@ static int __lock_blocked_add(xlator_t *this, pl_inode_t *pinode, pl_dom_list_t *dom, pl_entry_lock_t *lock, int nonblock) { - struct timeval now; - if (nonblock) goto out; - gettimeofday(&now, NULL); - - lock->blkd_time = now; + lock->blkd_time = gf_time(); list_add_tail(&lock->blocked_locks, &dom->blocked_entrylks); gf_msg_trace(this->name, 0, "Blocking lock: {pinode=%p, basename=%s}", @@ -612,7 +606,7 @@ __lock_entrylk(xlator_t *this, pl_inode_t *pinode, pl_entry_lock_t *lock, } __pl_entrylk_ref(lock); - gettimeofday(&lock->granted_time, NULL); + lock->granted_time = gf_time(); list_add(&lock->domain_list, &dom->entrylk_list); ret = 0; diff --git a/xlators/features/locks/src/inodelk.c b/xlators/features/locks/src/inodelk.c index d4a24eb44be..d4e51d6e0a1 100644 --- a/xlators/features/locks/src/inodelk.c +++ b/xlators/features/locks/src/inodelk.c @@ -140,15 +140,13 @@ __stale_inodelk(xlator_t *this, pl_inode_lock_t *candidate_lock, pl_inode_lock_t *requested_lock, time_t *lock_age_sec) { posix_locks_private_t *priv = NULL; - struct timeval curr; priv = this->private; /* Question: Should we just prune them all given the * chance? Or just the locks we are attempting to acquire? 
*/ if (inodelk_conflict(candidate_lock, requested_lock)) { - gettimeofday(&curr, NULL); - *lock_age_sec = curr.tv_sec - candidate_lock->granted_time.tv_sec; + *lock_age_sec = gf_time() - candidate_lock->granted_time; if (*lock_age_sec > priv->revocation_secs) return _gf_true; } @@ -397,15 +395,11 @@ static int __lock_blocked_add(xlator_t *this, pl_dom_list_t *dom, pl_inode_lock_t *lock, int can_block) { - struct timeval now; - if (can_block == 0) { goto out; } - gettimeofday(&now, NULL); - - lock->blkd_time = now; + lock->blkd_time = gf_time(); list_add_tail(&lock->blocked_locks, &dom->blocked_inodelks); gf_msg_trace(this->name, 0, @@ -466,7 +460,7 @@ __lock_inodelk(xlator_t *this, pl_inode_t *pl_inode, pl_inode_lock_t *lock, return __lock_blocked_add(this, dom, lock, can_block); } __pl_inodelk_ref(lock); - gettimeofday(&lock->granted_time, NULL); + lock->granted_time = gf_time(); list_add(&lock->list, &dom->inodelk_list); return 0; diff --git a/xlators/features/locks/src/locks.h b/xlators/features/locks/src/locks.h index 93c2973b2b3..c868eb494a2 100644 --- a/xlators/features/locks/src/locks.h +++ b/xlators/features/locks/src/locks.h @@ -43,9 +43,8 @@ struct __posix_lock { fd_t *fd; call_frame_t *frame; - struct timeval blkd_time; /*time at which lock was queued into blkd list*/ - struct timeval - granted_time; /*time at which lock was queued into active list*/ + time_t blkd_time; /* time at which lock was queued into blkd list */ + time_t granted_time; /* time at which lock was queued into active list */ /* These two together serve to uniquely identify each process across nodes */ @@ -85,9 +84,9 @@ struct __pl_inode_lock { call_frame_t *frame; - struct timeval blkd_time; /*time at which lock was queued into blkd list*/ - struct timeval - granted_time; /*time at which lock was queued into active list*/ + time_t blkd_time; /* time at which lock was queued into blkd list */ + time_t granted_time; /* time at which lock was queued into active list */ + /*last time at 
which lock contention was detected and notified*/ struct timespec contention_time; @@ -139,9 +138,9 @@ struct __entry_lock { const char *basename; - struct timeval blkd_time; /*time at which lock was queued into blkd list*/ - struct timeval - granted_time; /*time at which lock was queued into active list*/ + time_t blkd_time; /* time at which lock was queued into blkd list */ + time_t granted_time; /* time at which lock was queued into active list */ + /*last time at which lock contention was detected and notified*/ struct timespec contention_time; diff --git a/xlators/features/locks/src/posix.c b/xlators/features/locks/src/posix.c index d781cde8969..cf0ae4c57dd 100644 --- a/xlators/features/locks/src/posix.c +++ b/xlators/features/locks/src/posix.c @@ -494,6 +494,9 @@ pl_inodelk_xattr_fill_multiple(dict_t *this, char *key, data_t *value, char *save_ptr = NULL; tmp_key = gf_strdup(key); + if (!tmp_key) + return -1; + strtok_r(tmp_key, ":", &save_ptr); if (!*save_ptr) { if (tmp_key) @@ -3684,10 +3687,10 @@ __dump_entrylks(pl_inode_t *pl_inode) list_for_each_entry(lock, &dom->entrylk_list, domain_list) { - gf_time_fmt(granted, sizeof(granted), lock->granted_time.tv_sec, + gf_time_fmt(granted, sizeof(granted), lock->granted_time, gf_timefmt_FT); gf_proc_dump_build_key(key, k, "entrylk[%d](ACTIVE)", count); - if (lock->blkd_time.tv_sec == 0) { + if (lock->blkd_time == 0) { snprintf(tmp, sizeof(tmp), ENTRY_GRNTD_FMT, lock->type == ENTRYLK_RDLCK ? "ENTRYLK_RDLCK" : "ENTRYLK_WRLCK", @@ -3695,7 +3698,7 @@ __dump_entrylks(pl_inode_t *pl_inode) lkowner_utoa(&lock->owner), lock->client, lock->connection_id, granted); } else { - gf_time_fmt(blocked, sizeof(blocked), lock->blkd_time.tv_sec, + gf_time_fmt(blocked, sizeof(blocked), lock->blkd_time, gf_timefmt_FT); snprintf(tmp, sizeof(tmp), ENTRY_BLKD_GRNTD_FMT, lock->type == ENTRYLK_RDLCK ? 
"ENTRYLK_RDLCK" @@ -3712,7 +3715,7 @@ __dump_entrylks(pl_inode_t *pl_inode) list_for_each_entry(lock, &dom->blocked_entrylks, blocked_locks) { - gf_time_fmt(blocked, sizeof(blocked), lock->blkd_time.tv_sec, + gf_time_fmt(blocked, sizeof(blocked), lock->blkd_time, gf_timefmt_FT); gf_proc_dump_build_key(key, k, "entrylk[%d](BLOCKED)", count); @@ -3764,9 +3767,8 @@ __dump_inodelks(pl_inode_t *pl_inode) SET_FLOCK_PID(&lock->user_flock, lock); pl_dump_lock(tmp, sizeof(tmp), &lock->user_flock, &lock->owner, - lock->client, lock->connection_id, - &lock->granted_time.tv_sec, &lock->blkd_time.tv_sec, - _gf_true); + lock->client, lock->connection_id, &lock->granted_time, + &lock->blkd_time, _gf_true); gf_proc_dump_write(key, "%s", tmp); count++; @@ -3778,8 +3780,8 @@ __dump_inodelks(pl_inode_t *pl_inode) count); SET_FLOCK_PID(&lock->user_flock, lock); pl_dump_lock(tmp, sizeof(tmp), &lock->user_flock, &lock->owner, - lock->client, lock->connection_id, 0, - &lock->blkd_time.tv_sec, _gf_false); + lock->client, lock->connection_id, 0, &lock->blkd_time, + _gf_false); gf_proc_dump_write(key, "%s", tmp); count++; @@ -3812,9 +3814,8 @@ __dump_posixlks(pl_inode_t *pl_inode) gf_proc_dump_build_key(key, "posixlk", "posixlk[%d](%s)", count, lock->blocked ? "BLOCKED" : "ACTIVE"); pl_dump_lock(tmp, sizeof(tmp), &lock->user_flock, &lock->owner, - lock->client, lock->client_uid, &lock->granted_time.tv_sec, - &lock->blkd_time.tv_sec, - (lock->blocked) ? _gf_false : _gf_true); + lock->client, lock->client_uid, &lock->granted_time, + &lock->blkd_time, (lock->blocked) ? 
_gf_false : _gf_true); gf_proc_dump_write(key, "%s", tmp); count++; diff --git a/xlators/features/metadisp/Makefile.am b/xlators/features/metadisp/Makefile.am new file mode 100644 index 00000000000..a985f42a877 --- /dev/null +++ b/xlators/features/metadisp/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/features/metadisp/src/Makefile.am b/xlators/features/metadisp/src/Makefile.am new file mode 100644 index 00000000000..1520ad8c424 --- /dev/null +++ b/xlators/features/metadisp/src/Makefile.am @@ -0,0 +1,38 @@ +noinst_PYTHON = gen-fops.py + +EXTRA_DIST = fops-tmpl.c + +xlator_LTLIBRARIES = metadisp.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +nodist_metadisp_la_SOURCES = fops.c + +BUILT_SOURCES = fops.c + +metadisp_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) + +metadisp_la_SOURCES = metadisp.c \ + metadisp-unlink.c \ + metadisp-stat.c \ + metadisp-lookup.c \ + metadisp-readdir.c \ + metadisp-create.c \ + metadisp-open.c \ + metadisp-fsync.c \ + metadisp-setattr.c \ + backend.c + +metadisp_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = metadisp.h metadisp-fops.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +fops.c: fops-tmpl.c $(top_srcdir)/libglusterfs/src/generator.py gen-fops.py + PYTHONPATH=$(top_srcdir)/libglusterfs/src \ + $(PYTHON) $(srcdir)/gen-fops.py $(srcdir)/fops-tmpl.c > $@ + +CLEANFILES = $(nodist_metadisp_la_SOURCES) diff --git a/xlators/features/metadisp/src/backend.c b/xlators/features/metadisp/src/backend.c new file mode 100644 index 00000000000..ee2c25bfaa7 --- /dev/null +++ b/xlators/features/metadisp/src/backend.c @@ -0,0 +1,45 @@ +#define GFID_STR_LEN 37 + +#include "metadisp.h" + +/* + * backend.c + * + * functions responsible for converting user-facing paths to backend-style + * "/$GFID" paths. 
+ */ + +int32_t +build_backend_loc(uuid_t gfid, loc_t *src_loc, loc_t *dst_loc) +{ + static uuid_t root = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; + char gfid_buf[GFID_STR_LEN + 1] = { + 0, + }; + char *path = NULL; + + GF_VALIDATE_OR_GOTO("metadisp", src_loc, out); + GF_VALIDATE_OR_GOTO("metadisp", dst_loc, out); + + loc_copy(dst_loc, src_loc); + memcpy(dst_loc->pargfid, root, sizeof(root)); + GF_FREE((char *)dst_loc->path); // we are overwriting path so nuke + // whatever loc_copy gave us + + uuid_utoa_r(gfid, gfid_buf); + + path = GF_CALLOC(GFID_STR_LEN + 1, sizeof(char), + gf_common_mt_char); // freed via loc_wipe + + path[0] = '/'; + strncpy(path + 1, gfid_buf, GFID_STR_LEN); + path[GFID_STR_LEN] = 0; + dst_loc->path = path; + if (src_loc->name) + dst_loc->name = strrchr(dst_loc->path, '/'); + if (dst_loc->name) + dst_loc->name++; + return 0; +out: + return -1; +} diff --git a/xlators/features/metadisp/src/fops-tmpl.c b/xlators/features/metadisp/src/fops-tmpl.c new file mode 100644 index 00000000000..4385b7dd5b7 --- /dev/null +++ b/xlators/features/metadisp/src/fops-tmpl.c @@ -0,0 +1,10 @@ +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <glusterfs/xlator.h> +#include "metadisp.h" +#include "metadisp-fops.h" + +#pragma generate diff --git a/xlators/features/metadisp/src/gen-fops.py b/xlators/features/metadisp/src/gen-fops.py new file mode 100644 index 00000000000..8b5e120fdec --- /dev/null +++ b/xlators/features/metadisp/src/gen-fops.py @@ -0,0 +1,160 @@ +#!/usr/bin/python + +import sys +from generator import fop_subs, generate + +FN_METADATA_CHILD_GENERIC = """ +int32_t +metadisp_@NAME@ (call_frame_t *frame, xlator_t *this, + @LONG_ARGS@) +{ + METADISP_TRACE("@NAME@ metadata"); + STACK_WIND (frame, default_@NAME@_cbk, + METADATA_CHILD(this), METADATA_CHILD(this)->fops->@NAME@, + @SHORT_ARGS@); + return 0; +} +""" + +FN_GENERIC_TEMPLATE = """ +int32_t +metadisp_@NAME@ (call_frame_t *frame, xlator_t *this, + @LONG_ARGS@) 
+{ + METADISP_TRACE("@NAME@ generic"); + STACK_WIND (frame, default_@NAME@_cbk, + DATA_CHILD(this), DATA_CHILD(this)->fops->@NAME@, + @SHORT_ARGS@); + return 0; +} +""" + +FN_DATAFD_TEMPLATE = """ +int32_t +metadisp_@NAME@ (call_frame_t *frame, xlator_t *this, + @LONG_ARGS@) +{ + METADISP_TRACE("@NAME@ datafd"); + xlator_t *child = NULL; + child = DATA_CHILD(this); + STACK_WIND (frame, default_@NAME@_cbk, + child, child->fops->@NAME@, + @SHORT_ARGS@); + return 0; +} +""" + +FN_DATALOC_TEMPLATE = """ +int32_t +metadisp_@NAME@ (call_frame_t *frame, xlator_t *this, + @LONG_ARGS@) +{ + METADISP_TRACE("@NAME@ dataloc"); + loc_t backend_loc = { + 0, + }; + if (build_backend_loc(loc->gfid, loc, &backend_loc)) { + goto unwind; + } + xlator_t *child = NULL; + child = DATA_CHILD(this); + STACK_WIND (frame, default_@NAME@_cbk, + child, child->fops->@NAME@, + @SHORT_ARGS@); + return 0; + +unwind: + STACK_UNWIND_STRICT(lookup, frame, -1, EINVAL, NULL, NULL, NULL, NULL); + return 0; +} +""" + +FOPS_LINE_TEMPLATE = "\t.@NAME@ = metadisp_@NAME@," + +skipped = [ + "readdir", + "readdirp", + "lookup", + "fsync", + "stat", + "open", + "create", + "unlink", + "setattr", + # TODO: implement "inodelk", +] + + +def gen_fops(): + done = skipped + + # + # these are fops that wind to the DATA_CHILD + # + # NOTE: re-written in order from google doc: + # https://docs.google.com/document/d/1KEwVtSNvDhs4qb63gWx2ulCp5GJjge77NGJk4p_Ms4Q + for name in [ + "writev", + "readv", + "ftruncate", + "zerofill", + "discard", + "seek", + "fstat", + ]: + done = done + [name] + print(generate(FN_DATAFD_TEMPLATE, name, fop_subs)) + + for name in ["truncate"]: + done = done + [name] + print(generate(FN_DATALOC_TEMPLATE, name, fop_subs)) + + # these are fops that operate solely on dentries, folders, + # or extended attributes. 
Therefore, they must always + # wind to METADATA_CHILD and should never perform + # any path rewriting + # + # NOTE: re-written in order from google doc: + # https://docs.google.com/document/d/1KEwVtSNvDhs4qb63gWx2ulCp5GJjge77NGJk4p_Ms4Q + for name in [ + "mkdir", + "symlink", + "link", + "rename", + "mknod", + "opendir", + # "readdir, # special-cased + # "readdirp, # special-cased + "fsyncdir", + # "setattr", # special-cased + "readlink", + "fentrylk", + "access", + # TODO: these wind to both, + # data for backend-attributes and metadata for the rest + "xattrop", + "setxattr", + "getxattr", + "removexattr", + "fgetxattr", + "fsetxattr", + "fremovexattr", + ]: + + done = done + [name] + print(generate(FN_METADATA_CHILD_GENERIC, name, fop_subs)) + + print("struct xlator_fops fops = {") + for name in done: + print(generate(FOPS_LINE_TEMPLATE, name, fop_subs)) + + print("};") + + +for l in open(sys.argv[1], "r").readlines(): + if l.find("#pragma generate") != -1: + print("/* BEGIN GENERATED CODE - DO NOT MODIFY */") + gen_fops() + print("/* END GENERATED CODE */") + else: + print(l[:-1]) diff --git a/xlators/features/metadisp/src/metadisp-create.c b/xlators/features/metadisp/src/metadisp-create.c new file mode 100644 index 00000000000..f8c9798dd59 --- /dev/null +++ b/xlators/features/metadisp/src/metadisp-create.c @@ -0,0 +1,101 @@ +#include "metadisp.h" +#include <glusterfs/call-stub.h> + +/** + * Create, like stat, is a two-step process. We send a create + * to the METADATA_CHILD, then send another create to the DATA_CHILD. + * + * We do the metadata child first to ensure that the ACLs are enforced. 
+ */ + +int32_t +metadisp_create_dentry_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, + inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + STACK_UNWIND_STRICT(create, frame, op_ret, op_errno, fd, inode, buf, + preparent, postparent, xdata); + return 0; +} + +int32_t +metadisp_create_resume(call_frame_t *frame, xlator_t *this, loc_t *loc, + int32_t flags, mode_t mode, mode_t umask, fd_t *fd, + dict_t *xdata) +{ + // create the backend data inode + STACK_WIND(frame, metadisp_create_dentry_cbk, DATA_CHILD(this), + DATA_CHILD(this)->fops->create, loc, flags, mode, umask, fd, + xdata); + return 0; +} + +int32_t +metadisp_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + METADISP_TRACE("%d %d", op_ret, op_errno); + call_stub_t *stub = cookie; + if (op_ret != 0) { + STACK_UNWIND_STRICT(create, frame, op_ret, op_errno, fd, inode, buf, + preparent, postparent, xdata); + return 0; + } + + if (stub == NULL) { + goto unwind; + } + + if (stub->poison) { + call_stub_destroy(stub); + return 0; + } + + call_resume(stub); + return 0; + +unwind: + STACK_UNWIND_STRICT(create, frame, -1, EINVAL, NULL, NULL, NULL, NULL, NULL, + NULL); + return 0; +} + +int32_t +metadisp_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +{ + METADISP_TRACE("."); + + loc_t backend_loc = { + 0, + }; + call_stub_t *stub = NULL; + uuid_t *gfid_req = NULL; + + RESOLVE_GFID_REQ(xdata, gfid_req, out); + + if (build_backend_loc(*gfid_req, loc, &backend_loc)) { + goto unwind; + } + + frame->local = loc; + + stub = fop_create_stub(frame, metadisp_create_resume, &backend_loc, flags, + mode, umask, fd, xdata); + + STACK_WIND_COOKIE(frame, metadisp_create_cbk, stub, 
METADATA_CHILD(this), + METADATA_CHILD(this)->fops->create, loc, flags, mode, + umask, fd, xdata); + return 0; + +unwind: + STACK_UNWIND_STRICT(create, frame, -1, EINVAL, NULL, NULL, NULL, NULL, NULL, + NULL); + return 0; +out: + return -1; +} diff --git a/xlators/features/metadisp/src/metadisp-fops.h b/xlators/features/metadisp/src/metadisp-fops.h new file mode 100644 index 00000000000..56dd427cf34 --- /dev/null +++ b/xlators/features/metadisp/src/metadisp-fops.h @@ -0,0 +1,51 @@ +#ifndef GF_METADISP_FOPS_H_ +#define GF_METADISP_FOPS_H_ + +#include <glusterfs/xlator.h> +#include <glusterfs/dict.h> +#include <glusterfs/glusterfs.h> + +#include <sys/types.h> + +/* fops in here are defined in their own file. Every other fop is just defined + * inline of fops.c */ + +int +metadisp_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *xdata); + +int +metadisp_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *dict); + +int +metadisp_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata); + +int +metadisp_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata); + +int +metadisp_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata); + +int +metadisp_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata); + +int +metadisp_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, + loc_t *loc, int32_t cmd, struct gf_flock *lock, dict_t *xdata); + +int +metadisp_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, + dict_t *xdata); + +int +metadisp_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata); + +int +metadisp_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata); + +#endif diff --git a/xlators/features/metadisp/src/metadisp-fsync.c 
b/xlators/features/metadisp/src/metadisp-fsync.c new file mode 100644 index 00000000000..2e46fa84eac --- /dev/null +++ b/xlators/features/metadisp/src/metadisp-fsync.c @@ -0,0 +1,54 @@ + +#include "metadisp.h" +#include <glusterfs/call-stub.h> + +int32_t +metadisp_fsync_resume(call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t flags, dict_t *xdata) +{ + STACK_WIND(frame, default_fsync_cbk, DATA_CHILD(this), + DATA_CHILD(this)->fops->fsync, fd, flags, xdata); + return 0; +} + +int32_t +metadisp_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + call_stub_t *stub = NULL; + if (cookie) { + stub = cookie; + } + + if (op_ret != 0) { + goto unwind; + } + + if (stub->poison) { + call_stub_destroy(stub); + stub = NULL; + return 0; + } + + call_resume(stub); + return 0; + +unwind: + if (stub) { + call_stub_destroy(stub); + } + STACK_UNWIND_STRICT(fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata); + return 0; +} + +int32_t +metadisp_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, + dict_t *xdata) +{ + call_stub_t *stub = NULL; + stub = fop_fsync_stub(frame, metadisp_fsync_resume, fd, flags, xdata); + STACK_WIND_COOKIE(frame, metadisp_fsync_cbk, stub, METADATA_CHILD(this), + METADATA_CHILD(this)->fops->fsync, fd, flags, xdata); + return 0; +} diff --git a/xlators/features/metadisp/src/metadisp-lookup.c b/xlators/features/metadisp/src/metadisp-lookup.c new file mode 100644 index 00000000000..27d90c9f746 --- /dev/null +++ b/xlators/features/metadisp/src/metadisp-lookup.c @@ -0,0 +1,90 @@ +#include "metadisp.h" +#include <glusterfs/call-stub.h> + +/** + * Lookup, like stat, is a two-step process for grabbing the metadata details + * as well as the data details. 
+ */ + +int32_t +metadisp_backend_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, + struct iatt *postparent) +{ + METADISP_TRACE("backend_lookup_cbk"); + if (op_errno == ENOENT) { + op_errno = ENODATA; + op_ret = -1; + } + STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, buf, xdata, + postparent); + return 0; +} + +int32_t +metadisp_backend_lookup_resume(call_frame_t *frame, xlator_t *this, loc_t *loc, + dict_t *xdata) +{ + METADISP_TRACE("backend_lookup_resume"); + loc_t backend_loc = { + 0, + }; + if (build_backend_loc(loc->gfid, loc, &backend_loc)) { + goto unwind; + } + + STACK_WIND(frame, metadisp_backend_lookup_cbk, DATA_CHILD(this), + DATA_CHILD(this)->fops->lookup, &backend_loc, xdata); + return 0; + +unwind: + STACK_UNWIND_STRICT(lookup, frame, -1, EINVAL, NULL, NULL, NULL, NULL); + return 0; +} + +int32_t +metadisp_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *postparent) +{ + METADISP_TRACE("%d %d", op_ret, op_errno); + call_stub_t *stub = NULL; + stub = cookie; + + if (op_ret != 0) { + goto unwind; + } + + if (!IA_ISREG(buf->ia_type)) { + goto unwind; + } else if (!stub) { + op_errno = EINVAL; + goto unwind; + } + + METADISP_TRACE("resuming stub"); + + // memcpy(stub->args.loc.gfid, buf->ia_gfid, sizeof(uuid_t)); + call_resume(stub); + return 0; +unwind: + METADISP_TRACE("unwinding %d %d", op_ret, op_errno); + STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, buf, xdata, + postparent); + if (stub) { + call_stub_destroy(stub); + } + return 0; +} + +int32_t +metadisp_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + METADISP_TRACE("lookup"); + call_stub_t *stub = NULL; + stub = fop_lookup_stub(frame, metadisp_backend_lookup_resume, loc, xdata); + STACK_WIND_COOKIE(frame, metadisp_lookup_cbk, 
stub, METADATA_CHILD(this), + METADATA_CHILD(this)->fops->lookup, loc, xdata); + return 0; +} diff --git a/xlators/features/metadisp/src/metadisp-open.c b/xlators/features/metadisp/src/metadisp-open.c new file mode 100644 index 00000000000..64814afe636 --- /dev/null +++ b/xlators/features/metadisp/src/metadisp-open.c @@ -0,0 +1,70 @@ +#include <glusterfs/call-stub.h> +#include "metadisp.h" + +int32_t +metadisp_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) +{ + METADISP_TRACE("got open results %d %d", op_ret, op_errno); + + call_stub_t *stub = NULL; + if (cookie) { + stub = cookie; + } + + if (op_ret != 0) { + goto unwind; + } + + if (!stub) { + goto unwind; + } + + if (stub->poison) { + call_stub_destroy(stub); + stub = NULL; + return 0; + } + + call_resume(stub); + return 0; + +unwind: + if (stub) { + call_stub_destroy(stub); + } + STACK_UNWIND_STRICT(open, frame, op_ret, op_errno, fd, xdata); + return 0; +} + +int32_t +metadisp_open_resume(call_frame_t *frame, xlator_t *this, loc_t *loc, + int32_t flags, fd_t *fd, dict_t *xdata) +{ + STACK_WIND_COOKIE(frame, metadisp_open_cbk, NULL, DATA_CHILD(this), + DATA_CHILD(this)->fops->open, loc, flags, fd, xdata); + return 0; +} + +int32_t +metadisp_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) +{ + call_stub_t *stub = NULL; + loc_t backend_loc = { + 0, + }; + + if (build_backend_loc(loc->gfid, loc, &backend_loc)) { + goto unwind; + } + + stub = fop_open_stub(frame, metadisp_open_resume, &backend_loc, flags, fd, + xdata); + STACK_WIND_COOKIE(frame, metadisp_open_cbk, stub, METADATA_CHILD(this), + METADATA_CHILD(this)->fops->open, loc, flags, fd, xdata); + return 0; +unwind: + STACK_UNWIND_STRICT(open, frame, -1, EINVAL, NULL, NULL); + return 0; +} diff --git a/xlators/features/metadisp/src/metadisp-readdir.c b/xlators/features/metadisp/src/metadisp-readdir.c new file mode 100644 index 
00000000000..5f840b1e88f --- /dev/null +++ b/xlators/features/metadisp/src/metadisp-readdir.c @@ -0,0 +1,65 @@ +#include "metadisp.h" + +/** + * With a change to the posix xlator, readdir and readdirp are shockingly + * simple. + * + * The issue with separating the backend data of the files + * with the metadata is that readdirs must now read from multiple sources + * to coalesce the directory entries. + * + * The way we do this is to tell the METADATA_CHILD that when it's + * running readdirp, each file entry should have a stat wound to + * 'stat-source-of-truth'. + * + * see metadisp_stat for how it handles winds _from_posix. + */ + +int32_t +metadisp_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *xdata) +{ + METADISP_TRACE("."); + /* + * Always use readdirp, even if the original was readdir. Why? Because NFS. + * There are multiple translations between Gluster, UNIX, and NFS stat + * structures in that path. One of them uses the type etc. from the stat + * structure, which is only filled in by readdirp. If we use readdir, the + * entries do actually go all the way back to the client and are visible in + * getdents, but then the readdir throws them away because of the + * uninitialized type. + */ + GF_UNUSED int32_t ret; + if (!xdata) { + xdata = dict_new(); + } + + // ret = dict_set_int32 (xdata, "list-xattr", 1); + + // I'm my own source of truth! + ret = dict_set_static_ptr(xdata, "stat-source-of-truth", (void *)this); + + STACK_WIND(frame, default_readdirp_cbk, METADATA_CHILD(this), + METADATA_CHILD(this)->fops->readdirp, fd, size, off, xdata); + + return 0; +} + +int32_t +metadisp_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *xdata) +{ + METADISP_TRACE("."); + if (!xdata) { + xdata = dict_new(); + } + GF_UNUSED int32_t ret; + // ret = dict_set_int32 (xdata, "list-xattr", 1); + + // I'm my own source of truth! 
+ ret = dict_set_static_ptr(xdata, "stat-source-of-truth", (void *)this); + + STACK_WIND(frame, default_readdirp_cbk, METADATA_CHILD(this), + METADATA_CHILD(this)->fops->readdirp, fd, size, off, xdata); + return 0; +} diff --git a/xlators/features/metadisp/src/metadisp-setattr.c b/xlators/features/metadisp/src/metadisp-setattr.c new file mode 100644 index 00000000000..6991cf644f3 --- /dev/null +++ b/xlators/features/metadisp/src/metadisp-setattr.c @@ -0,0 +1,90 @@ +#include "metadisp.h" +#include <glusterfs/call-stub.h> + +int32_t +metadisp_backend_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *statpre, struct iatt *statpost, + dict_t *xdata) + +{ + METADISP_TRACE("backend_setattr_cbk"); + if (op_errno == ENOENT) { + op_errno = ENODATA; + op_ret = -1; + } + STACK_UNWIND_STRICT(setattr, frame, op_ret, op_errno, statpre, statpost, + xdata); + return 0; +} + +int32_t +metadisp_backend_setattr_resume(call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, + dict_t *xdata) + +{ + METADISP_TRACE("backend_setattr_resume"); + loc_t backend_loc = { + 0, + }; + if (build_backend_loc(loc->gfid, loc, &backend_loc)) { + goto unwind; + } + + STACK_WIND(frame, metadisp_backend_setattr_cbk, DATA_CHILD(this), + DATA_CHILD(this)->fops->setattr, &backend_loc, stbuf, valid, + xdata); + return 0; + +unwind: + STACK_UNWIND_STRICT(setattr, frame, -1, EINVAL, NULL, NULL, NULL); + return 0; +} + +int32_t +metadisp_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *statpre, + struct iatt *statpost, dict_t *xdata) +{ + METADISP_TRACE("%d %d", op_ret, op_errno); + call_stub_t *stub = NULL; + stub = cookie; + + if (op_ret != 0) { + goto unwind; + } + + if (!IA_ISREG(statpost->ia_type)) { + goto unwind; + } else if (!stub) { + op_errno = EINVAL; + goto unwind; + } + + METADISP_TRACE("resuming stub"); + call_resume(stub); + return 0; 
+unwind: + METADISP_TRACE("unwinding %d %d", op_ret, op_errno); + STACK_UNWIND_STRICT(setattr, frame, op_ret, op_errno, statpre, statpost, + xdata); + if (stub) { + call_stub_destroy(stub); + } + return 0; +} + +int32_t +metadisp_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + METADISP_TRACE("setattr"); + call_stub_t *stub = NULL; + stub = fop_setattr_stub(frame, metadisp_backend_setattr_resume, loc, stbuf, + valid, xdata); + STACK_WIND_COOKIE(frame, metadisp_setattr_cbk, stub, METADATA_CHILD(this), + METADATA_CHILD(this)->fops->setattr, loc, stbuf, valid, + xdata); + return 0; +} diff --git a/xlators/features/metadisp/src/metadisp-stat.c b/xlators/features/metadisp/src/metadisp-stat.c new file mode 100644 index 00000000000..b06d0dbcddd --- /dev/null +++ b/xlators/features/metadisp/src/metadisp-stat.c @@ -0,0 +1,124 @@ +#include "metadisp.h" +#include <glusterfs/call-stub.h> + +/** + * The stat flow in METADISP is complicated because we must + * do ensure a few things: + * 1. stat, on the path within the metadata layer, + * MUST get the backend FD of the data layer. + * --- we wind to the metadata layer, then the data layer. + * + * 2. the metadata layer MUST be able to ask the data + * layer for stat information. + * --- this is 'syncop-internal-from-posix' + * + * 3. when the metadata exists BUT the data is missing, + * we MUST mark the backend file as bad and heal it. 
+ */ + +int32_t +metadisp_stat_backend_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata) +{ + METADISP_TRACE("got backend stat results %d %d", op_ret, op_errno); + if (op_errno == ENOENT) { + STACK_UNWIND_STRICT(open, frame, -1, ENODATA, NULL, NULL); + return 0; + } + STACK_UNWIND_STRICT(stat, frame, op_ret, op_errno, buf, xdata); + return 0; +} + +int32_t +metadisp_stat_resume(call_frame_t *frame, xlator_t *this, loc_t *loc, + dict_t *xdata) +{ + METADISP_TRACE("winding stat to path %s", loc->path); + if (gf_uuid_is_null(loc->gfid)) { + METADISP_TRACE("bad object, sending EUCLEAN"); + STACK_UNWIND_STRICT(open, frame, -1, EUCLEAN, NULL, NULL); + return 0; + } + + STACK_WIND(frame, metadisp_stat_backend_cbk, SECOND_CHILD(this), + SECOND_CHILD(this)->fops->stat, loc, xdata); + return 0; +} + +int32_t +metadisp_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata) +{ + call_stub_t *stub = NULL; + + METADISP_TRACE("got stat results %d %d", op_ret, op_errno); + + if (cookie) { + stub = cookie; + } + + if (op_ret != 0) { + goto unwind; + } + + // only use the stub for the files + if (!IA_ISREG(buf->ia_type)) { + goto unwind; + } + + if (stub->poison) { + call_stub_destroy(stub); + stub = NULL; + return 0; + } + + call_resume(stub); + return 0; + +unwind: + if (stub) { + call_stub_destroy(stub); + } + STACK_UNWIND_STRICT(stat, frame, op_ret, op_errno, buf, xdata); + return 0; +} + +int32_t +metadisp_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + call_stub_t *stub = NULL; + int32_t ret = 0; + loc_t backend_loc = { + 0, + }; + METADISP_FILTER_ROOT(stat, loc, xdata); + + if (build_backend_loc(loc->gfid, loc, &backend_loc)) { + goto unwind; + } + + if (dict_get_int32(xdata, "syncop-internal-from-posix", &ret) == 0) { + // if we've just been sent a stat from posix, then we know + // that we must send 
down a stat for a file to the second child. + // + // that means we can skip the stat for the first child and just + // send to the data disk. + METADISP_TRACE("got syncop-internal-from-posix"); + STACK_WIND(frame, default_stat_cbk, DATA_CHILD(this), + DATA_CHILD(this)->fops->stat, &backend_loc, xdata); + return 0; + } + + // we do not know if the request is for a file, folder, etc. wind + // to first child to find out. + stub = fop_stat_stub(frame, metadisp_stat_resume, &backend_loc, xdata); + METADISP_TRACE("winding stat to first child %s", loc->path); + STACK_WIND_COOKIE(frame, metadisp_stat_cbk, stub, METADATA_CHILD(this), + METADATA_CHILD(this)->fops->stat, loc, xdata); + return 0; +unwind: + STACK_UNWIND_STRICT(stat, frame, -1, EINVAL, NULL, NULL); + return 0; +} diff --git a/xlators/features/metadisp/src/metadisp-unlink.c b/xlators/features/metadisp/src/metadisp-unlink.c new file mode 100644 index 00000000000..1f6a8eb35ce --- /dev/null +++ b/xlators/features/metadisp/src/metadisp-unlink.c @@ -0,0 +1,160 @@ + +#include "metadisp.h" +#include <glusterfs/call-stub.h> + +/** + * The unlink flow in metadisp is complicated because we must + * do ensure that UNLINK causes both the metadata objects + * to get removed and the data objects to get removed. + */ + +int32_t +metadisp_unlink_resume(call_frame_t *frame, xlator_t *this, loc_t *loc, + int xflag, dict_t *xdata) +{ + METADISP_TRACE("winding backend unlink to path %s", loc->path); + STACK_WIND(frame, default_unlink_cbk, DATA_CHILD(this), + DATA_CHILD(this)->fops->unlink, loc, xflag, xdata); + return 0; +} + +int32_t +metadisp_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + METADISP_TRACE(". 
%d %d", op_ret, op_errno); + + int ret = 0; + call_stub_t *stub = NULL; + int nlink = 0; + + if (cookie) { + stub = cookie; + } + + if (op_ret != 0) { + goto unwind; + } + + if (stub->poison) { + call_stub_destroy(stub); + stub = NULL; + return 0; + } + + ret = dict_get_uint32(xdata, GF_RESPONSE_LINK_COUNT_XDATA, &nlink); + if (ret != 0) { + op_errno = EINVAL; + op_ret = -1; + goto unwind; + } + METADISP_TRACE("frontend hardlink count %d %d", ret, nlink); + if (nlink > 1) { + goto unwind; + } + + call_resume(stub); + return 0; + +unwind: + if (stub) { + call_stub_destroy(stub); + } + STACK_UNWIND_STRICT(unlink, frame, op_ret, op_errno, preparent, postparent, + xdata); + return 0; +} + +int32_t +metadisp_unlink_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, + struct iatt *postparent) +{ + call_stub_t *stub = NULL; + + if (cookie) { + stub = cookie; + } + + if (op_ret != 0) { + goto unwind; + } + + // fail fast on empty gfid so we don't loop forever + if (gf_uuid_is_null(buf->ia_gfid)) { + op_ret = -1; + op_errno = ENODATA; + goto unwind; + } + + // fill gfid since the stub is incomplete + memcpy(stub->args.loc.gfid, buf->ia_gfid, sizeof(uuid_t)); + memcpy(stub->args.loc.pargfid, postparent->ia_gfid, sizeof(uuid_t)); + + if (stub->poison) { + call_stub_destroy(stub); + stub = NULL; + return 0; + } + + call_resume(stub); + return 0; + +unwind: + if (stub) { + call_stub_destroy(stub); + } + STACK_UNWIND_STRICT(unlink, frame, op_ret, op_errno, NULL, NULL, NULL); + return 0; +} + +int32_t +metadisp_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata) +{ + call_stub_t *stub = NULL; + loc_t backend_loc = { + 0, + }; + + if (gf_uuid_is_null(loc->gfid)) { + METADISP_TRACE("winding lookup for unlink to path %s", loc->path); + + // loop back to ourselves after a lookup + stub = fop_unlink_stub(frame, metadisp_unlink, loc, xflag, xdata); + 
STACK_WIND_COOKIE(frame, metadisp_unlink_lookup_cbk, stub, + METADATA_CHILD(this), + METADATA_CHILD(this)->fops->lookup, loc, xdata); + return 0; + } + + if (build_backend_loc(loc->gfid, loc, &backend_loc)) { + goto unwind; + } + + // + // ensure we get the link count on the unlink response, so we can + // account for hardlinks before winding to the backend. + // NOTE: + // multiple xlators use GF_REQUEST_LINK_COUNT_XDATA. confirmation + // is needed to ensure that multiple requests will work in the same + // xlator stack. + // + if (!xdata) { + xdata = dict_new(); + } + dict_set_int32(xdata, GF_REQUEST_LINK_COUNT_XDATA, 1); + + METADISP_TRACE("winding frontend unlink to path %s", loc->path); + stub = fop_unlink_stub(frame, metadisp_unlink_resume, &backend_loc, xflag, + xdata); + + STACK_WIND_COOKIE(frame, metadisp_unlink_cbk, stub, METADATA_CHILD(this), + METADATA_CHILD(this)->fops->unlink, loc, xflag, xdata); + return 0; +unwind: + STACK_UNWIND_STRICT(unlink, frame, -1, EINVAL, NULL, NULL, NULL); + return 0; +} diff --git a/xlators/features/metadisp/src/metadisp.c b/xlators/features/metadisp/src/metadisp.c new file mode 100644 index 00000000000..3c8f150cebc --- /dev/null +++ b/xlators/features/metadisp/src/metadisp.c @@ -0,0 +1,46 @@ +#include <glusterfs/call-stub.h> + +#include "metadisp.h" +#include "metadisp-fops.h" + +int32_t +init(xlator_t *this) +{ + if (!this->children) { + gf_log(this->name, GF_LOG_ERROR, + "not configured with children. exiting"); + return -1; + } + + if (!this->parents) { + gf_log(this->name, GF_LOG_WARNING, "dangling volume. 
check volfile "); + } + + return 0; +} + +void +fini(xlator_t *this) +{ + return; +} + +/* defined in fops.c */ +struct xlator_fops fops; + +struct xlator_cbks cbks = {}; + +struct volume_options options[] = { + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .fops = &fops, + .cbks = &cbks, + .options = options, + .op_version = {1}, + .identifier = "metadisp", + .category = GF_EXPERIMENTAL, +}; diff --git a/xlators/features/metadisp/src/metadisp.h b/xlators/features/metadisp/src/metadisp.h new file mode 100644 index 00000000000..c8fd7a13c04 --- /dev/null +++ b/xlators/features/metadisp/src/metadisp.h @@ -0,0 +1,45 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef GF_METADISP_H_ +#define GF_METADISP_H_ + +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include <glusterfs/dict.h> +#include <glusterfs/xlator.h> +#include <glusterfs/defaults.h> + +#define METADATA_CHILD(_this) FIRST_CHILD(_this) +#define DATA_CHILD(_this) SECOND_CHILD(_this) + +int32_t +build_backend_loc(uuid_t gfid, loc_t *src_loc, loc_t *dst_loc); + +#define METADISP_TRACE(_args...) gf_log("metadisp", GF_LOG_INFO, _args) + +#define METADISP_FILTER_ROOT(_op, _args...) \ + if (strcmp(loc->path, "/") == 0) { \ + STACK_WIND(frame, default_##_op##_cbk, METADATA_CHILD(this), \ + METADATA_CHILD(this)->fops->_op, _args); \ + return 0; \ + } + +#define METADISP_FILTER_ROOT_BY_GFID(_op, _gfid, _args...) 
\ + if (__is_root_gfid(_gfid)) { \ + STACK_WIND(frame, default_##_op##_cbk, METADATA_CHILD(this), \ + METADATA_CHILD(this)->fops->_op, _args); \ + return 0; \ + } + +#define RESOLVE_GFID_REQ(_dict, _dest, _lbl) \ + VALIDATE_OR_GOTO(dict_get_ptr(_dict, "gfid-req", (void **)&_dest) == 0, \ + _lbl) + +#endif /* GF_METADISP_H_ */ diff --git a/xlators/features/quota/src/quota.c b/xlators/features/quota/src/quota.c index 73c008a2c00..18df9ae6d19 100644 --- a/xlators/features/quota/src/quota.c +++ b/xlators/features/quota/src/quota.c @@ -586,9 +586,6 @@ quota_validate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, quota_meta_t size = { 0, }; - struct timeval tv = { - 0, - }; local = frame->local; @@ -626,13 +623,12 @@ quota_validate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, * loop of validation and checking * limit when timeout is zero. */ - gettimeofday(&tv, NULL); LOCK(&ctx->lock); { ctx->size = size.size; + ctx->validate_time = gf_time(); ctx->file_count = size.file_count; ctx->dir_count = size.dir_count; - memcpy(&ctx->tv, &tv, sizeof(struct timeval)); } UNLOCK(&ctx->lock); @@ -644,27 +640,10 @@ unwind: return 0; } -static uint64_t -quota_time_elapsed(struct timeval *now, struct timeval *then) -{ - return (now->tv_sec - then->tv_sec); -} - -int32_t -quota_timeout(struct timeval *tv, int32_t timeout) +static inline gf_boolean_t +quota_timeout(time_t t, uint32_t timeout) { - struct timeval now = { - 0, - }; - int32_t timed_out = 0; - - gettimeofday(&now, NULL); - - if (quota_time_elapsed(&now, tv) >= timeout) { - timed_out = 1; - } - - return timed_out; + return (gf_time() - t) >= timeout; } /* Return: 1 if new entry added @@ -1128,7 +1107,7 @@ quota_check_object_limit(call_frame_t *frame, quota_inode_ctx_t *ctx, timeout = priv->hard_timeout; } - if (!just_validated && quota_timeout(&ctx->tv, timeout)) { + if (!just_validated && quota_timeout(ctx->validate_time, timeout)) { need_validate = 1; } else if ((object_aggr_count) > ctx->object_hard_lim) { 
hard_limit_exceeded = 1; @@ -1195,7 +1174,7 @@ quota_check_size_limit(call_frame_t *frame, quota_inode_ctx_t *ctx, timeout = priv->hard_timeout; } - if (!just_validated && quota_timeout(&ctx->tv, timeout)) { + if (!just_validated && quota_timeout(ctx->validate_time, timeout)) { need_validate = 1; } else if (wouldbe_size >= ctx->hard_lim) { hard_limit_exceeded = 1; @@ -4314,9 +4293,6 @@ quota_statfs_validate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, quota_meta_t size = { 0, }; - struct timeval tv = { - 0, - }; local = frame->local; @@ -4348,13 +4324,12 @@ quota_statfs_validate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, op_errno = EINVAL; } - gettimeofday(&tv, NULL); LOCK(&ctx->lock); { ctx->size = size.size; + ctx->validate_time = gf_time(); ctx->file_count = size.file_count; ctx->dir_count = size.dir_count; - memcpy(&ctx->tv, &tv, sizeof(struct timeval)); } UNLOCK(&ctx->lock); @@ -4873,7 +4848,7 @@ off: void quota_log_helper(char **usage_str, int64_t cur_size, inode_t *inode, - char **path, struct timeval *cur_time) + char **path, time_t *cur_time) { xlator_t *this = THIS; @@ -4892,7 +4867,7 @@ quota_log_helper(char **usage_str, int64_t cur_size, inode_t *inode, if (!(*path)) *path = uuid_utoa(inode->gfid); - gettimeofday(cur_time, NULL); + *cur_time = gf_time(); } /* Logs if @@ -4903,9 +4878,7 @@ void quota_log_usage(xlator_t *this, quota_inode_ctx_t *ctx, inode_t *inode, int64_t delta) { - struct timeval cur_time = { - 0, - }; + time_t cur_time = 0; char *usage_str = NULL; char *path = NULL; int64_t cur_size = 0; @@ -4931,12 +4904,12 @@ quota_log_usage(xlator_t *this, quota_inode_ctx_t *ctx, inode_t *inode, "path=%s", usage_str, priv->volume_uuid, path); - ctx->prev_log = cur_time; + ctx->prev_log_time = cur_time; } /* Usage is above soft limit */ else if (cur_size > ctx->soft_lim && - quota_timeout(&ctx->prev_log, priv->log_timeout)) { + quota_timeout(ctx->prev_log_time, priv->log_timeout)) { quota_log_helper(&usage_str, cur_size, inode, 
&path, &cur_time); gf_msg(this->name, GF_LOG_ALERT, 0, Q_MSG_CROSSED_SOFT_LIMIT, @@ -4947,7 +4920,7 @@ quota_log_usage(xlator_t *this, quota_inode_ctx_t *ctx, inode_t *inode, "path=%s", usage_str, priv->volume_uuid, path); - ctx->prev_log = cur_time; + ctx->prev_log_time = cur_time; } if (path) @@ -5184,9 +5157,9 @@ quota_priv_dump(xlator_t *this) if (ret) goto out; else { - gf_proc_dump_write("soft-timeout", "%d", priv->soft_timeout); - gf_proc_dump_write("hard-timeout", "%d", priv->hard_timeout); - gf_proc_dump_write("alert-time", "%d", priv->log_timeout); + gf_proc_dump_write("soft-timeout", "%u", priv->soft_timeout); + gf_proc_dump_write("hard-timeout", "%u", priv->hard_timeout); + gf_proc_dump_write("alert-time", "%u", priv->log_timeout); gf_proc_dump_write("quota-on", "%d", priv->is_quota_on); gf_proc_dump_write("statfs", "%d", priv->consider_statfs); gf_proc_dump_write("volume-uuid", "%s", priv->volume_uuid); diff --git a/xlators/features/quota/src/quota.h b/xlators/features/quota/src/quota.h index 8a3dc7a77f5..0395d78c9ef 100644 --- a/xlators/features/quota/src/quota.h +++ b/xlators/features/quota/src/quota.h @@ -153,8 +153,8 @@ struct quota_inode_ctx { int64_t object_soft_lim; struct iatt buf; struct list_head parents; - struct timeval tv; - struct timeval prev_log; + time_t validate_time; + time_t prev_log_time; gf_boolean_t ancestry_built; gf_lock_t lock; }; @@ -199,6 +199,7 @@ struct quota_local { typedef struct quota_local quota_local_t; struct quota_priv { + /* FIXME: consider time_t for timeouts. 
*/ uint32_t soft_timeout; uint32_t hard_timeout; uint32_t log_timeout; diff --git a/xlators/features/read-only/src/worm-helper.c b/xlators/features/read-only/src/worm-helper.c index 25fbd4aa748..df45f2a940b 100644 --- a/xlators/features/read-only/src/worm-helper.c +++ b/xlators/features/read-only/src/worm-helper.c @@ -41,7 +41,7 @@ worm_init_state(xlator_t *this, gf_boolean_t fop_with_fd, void *file_ptr) GF_VALIDATE_OR_GOTO("worm", this, out); GF_VALIDATE_OR_GOTO(this->name, file_ptr, out); - start_time = time(NULL); + start_time = gf_time(); dict = dict_new(); if (!dict) { gf_log(this->name, GF_LOG_ERROR, "Error creating the dict"); @@ -94,7 +94,7 @@ worm_set_state(xlator_t *this, gf_boolean_t fop_with_fd, void *file_ptr, if (ret) goto out; stbuf->ia_mtime = stpre.ia_mtime; - stbuf->ia_atime = time(NULL) + retention_state->ret_period; + stbuf->ia_atime = gf_time() + retention_state->ret_period; if (fop_with_fd) ret = syncop_fsetattr(this, (fd_t *)file_ptr, stbuf, GF_SET_ATTR_ATIME, @@ -286,6 +286,7 @@ gf_worm_state_transition(xlator_t *this, gf_boolean_t fop_with_fd, { int op_errno = EROFS; int ret = -1; + time_t now = 0; uint64_t com_period = 0; uint64_t start_time = 0; dict_t *dict = NULL; @@ -337,8 +338,10 @@ gf_worm_state_transition(xlator_t *this, gf_boolean_t fop_with_fd, goto out; } - if (ret == -1 && (time(NULL) - start_time) >= com_period) { - if ((time(NULL) - stbuf.ia_mtime) >= com_period) { + now = gf_time(); + + if (ret == -1 && (now - start_time) >= com_period) { + if ((now - stbuf.ia_mtime) >= com_period) { ret = worm_set_state(this, fop_with_fd, file_ptr, &reten_state, &stbuf); if (ret) { @@ -352,10 +355,10 @@ gf_worm_state_transition(xlator_t *this, gf_boolean_t fop_with_fd, op_errno = 0; goto out; } - } else if (ret == -1 && (time(NULL) - start_time) < com_period) { + } else if (ret == -1 && (now - start_time) < com_period) { op_errno = 0; goto out; - } else if (reten_state.retain && ((time(NULL) >= stbuf.ia_atime))) { + } else if 
(reten_state.retain && ((now >= stbuf.ia_atime))) { gf_worm_state_lookup(this, fop_with_fd, file_ptr, &reten_state, &stbuf); } if (reten_state.worm && !reten_state.retain && priv->worm_files_deletable && diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c index ff928d0df61..e5f93063943 100644 --- a/xlators/features/shard/src/shard.c +++ b/xlators/features/shard/src/shard.c @@ -1004,6 +1004,10 @@ shard_initiate_evicted_inode_fsync(xlator_t *this, inode_t *inode) } int +shard_common_inode_write_post_lookup_shards_handler(call_frame_t *frame, + xlator_t *this); + +int shard_common_resolve_shards(call_frame_t *frame, xlator_t *this, shard_post_resolve_fop_handler_t post_res_handler) { @@ -1020,21 +1024,47 @@ shard_common_resolve_shards(call_frame_t *frame, xlator_t *this, inode_t *fsync_inode = NULL; shard_priv_t *priv = NULL; shard_local_t *local = NULL; + uint64_t resolve_count = 0; priv = this->private; local = frame->local; local->call_count = 0; shard_idx_iter = local->first_block; res_inode = local->resolver_base_inode; + + if ((local->op_ret < 0) || (local->resolve_not)) + goto out; + + /* If this prealloc FOP is for fresh file creation, then the size of the + * file will be 0. Then there will be no shards associated with this file. + * So we can skip the lookup process for the shards which do not exist + * and directly issue mknod to create shards. 
+ * + * In case the prealloc fop is to extend the preallocated file to bigger + * size then just lookup and populate inodes of existing shards and + * update the create count + */ + if (local->fop == GF_FOP_FALLOCATE) { + if (!local->prebuf.ia_size) { + local->inode_list[0] = inode_ref(res_inode); + local->create_count = local->last_block; + shard_common_inode_write_post_lookup_shards_handler(frame, this); + return 0; + } + if (local->prebuf.ia_size < local->total_size) + local->create_count = local->last_block - + ((local->prebuf.ia_size - 1) / + local->block_size); + } + + resolve_count = local->last_block - local->create_count; + if (res_inode) gf_uuid_copy(gfid, res_inode->gfid); else gf_uuid_copy(gfid, local->base_gfid); - if ((local->op_ret < 0) || (local->resolve_not)) - goto out; - - while (shard_idx_iter <= local->last_block) { + while (shard_idx_iter <= resolve_count) { i++; if (shard_idx_iter == 0) { local->inode_list[i] = inode_ref(res_inode); @@ -2443,7 +2473,7 @@ shard_common_lookup_shards(call_frame_t *frame, xlator_t *this, inode_t *inode, int count = 0; int call_count = 0; int32_t shard_idx_iter = 0; - int last_block = 0; + int lookup_count = 0; char path[PATH_MAX] = { 0, }; @@ -2463,7 +2493,7 @@ shard_common_lookup_shards(call_frame_t *frame, xlator_t *this, inode_t *inode, local = frame->local; count = call_count = local->call_count; shard_idx_iter = local->first_block; - last_block = local->last_block; + lookup_count = local->last_block - local->create_count; local->pls_fop_handler = handler; if (local->lookup_shards_barriered) local->barrier.waitfor = local->call_count; @@ -2473,7 +2503,7 @@ shard_common_lookup_shards(call_frame_t *frame, xlator_t *this, inode_t *inode, else gf_uuid_copy(gfid, local->base_gfid); - while (shard_idx_iter <= last_block) { + while (shard_idx_iter <= lookup_count) { if (local->inode_list[i]) { i++; shard_idx_iter++; @@ -5656,6 +5686,8 @@ shard_common_inode_write_post_resolve_handler(call_frame_t *frame, 
shard_common_lookup_shards( frame, this, local->resolver_base_inode, shard_common_inode_write_post_lookup_shards_handler); + } else if (local->create_count) { + shard_common_inode_write_post_lookup_shards_handler(frame, this); } else { shard_common_inode_write_do(frame, this); } diff --git a/xlators/features/trash/src/trash.c b/xlators/features/trash/src/trash.c index a28aa1e09e0..7d09cba3e9c 100644 --- a/xlators/features/trash/src/trash.c +++ b/xlators/features/trash/src/trash.c @@ -216,7 +216,7 @@ append_time_stamp(char *name, size_t name_size) 0, }; - gf_time_fmt(timestr, sizeof(timestr), time(NULL), gf_timefmt_F_HMS); + gf_time_fmt(timestr, sizeof(timestr), gf_time(), gf_timefmt_F_HMS); /* removing white spaces in timestamp */ for (i = 0; i < strlen(timestr); i++) { diff --git a/xlators/features/upcall/src/upcall-internal.c b/xlators/features/upcall/src/upcall-internal.c index 978825f6b56..c641bd6f432 100644 --- a/xlators/features/upcall/src/upcall-internal.c +++ b/xlators/features/upcall/src/upcall-internal.c @@ -316,7 +316,7 @@ upcall_reaper_thread(void *data) priv = this->private; GF_ASSERT(priv); - time_now = time(NULL); + time_now = gf_time(); while (!priv->fini) { list_for_each_entry_safe(inode_ctx, tmp, &priv->inode_ctx_list, inode_ctx_list) @@ -344,7 +344,7 @@ upcall_reaper_thread(void *data) /* don't do a very busy loop */ timeout = get_cache_invalidation_timeout(this); sleep(timeout / 2); - time_now = time(NULL); + time_now = gf_time(); } return NULL; @@ -533,7 +533,7 @@ upcall_cache_invalidate(call_frame_t *frame, xlator_t *this, client_t *client, goto out; } - time_now = time(NULL); + time_now = gf_time(); pthread_mutex_lock(&up_inode_ctx->client_list_lock); { list_for_each_entry_safe(up_client_entry, tmp, @@ -670,13 +670,13 @@ upcall_cache_forget(xlator_t *this, inode_t *inode, return; } - time_now = time(NULL); + time_now = gf_time(); pthread_mutex_lock(&up_inode_ctx->client_list_lock); { list_for_each_entry_safe(up_client_entry, tmp, 
&up_inode_ctx->client_list, client_list) { - /* Set the access time to time(NULL) + /* Set the access time to gf_time() * to send notify */ up_client_entry->access_time = time_now; diff --git a/xlators/mgmt/glusterd/src/Makefile.am b/xlators/mgmt/glusterd/src/Makefile.am index eaa61c435e5..685beb42d27 100644 --- a/xlators/mgmt/glusterd/src/Makefile.am +++ b/xlators/mgmt/glusterd/src/Makefile.am @@ -25,13 +25,14 @@ glusterd_la_SOURCES = glusterd.c glusterd-handler.c glusterd-sm.c \ glusterd-conn-helper.c glusterd-snapd-svc.c glusterd-snapd-svc-helper.c \ glusterd-bitd-svc.c glusterd-scrub-svc.c glusterd-server-quorum.c \ glusterd-reset-brick.c glusterd-shd-svc.c glusterd-shd-svc-helper.c \ - glusterd-gfproxyd-svc.c glusterd-gfproxyd-svc-helper.c glusterd-ganesha.c + glusterd-gfproxyd-svc.c glusterd-gfproxyd-svc-helper.c glusterd-ganesha.c \ + $(CONTRIBDIR)/mount/mntent.c glusterd_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \ $(top_builddir)/libglusterd/src/libglusterd.la \ $(top_builddir)/rpc/xdr/src/libgfxdr.la \ $(top_builddir)/rpc/rpc-lib/src/libgfrpc.la \ - $(XML_LIBS) -lcrypto $(URCU_LIBS) $(URCU_CDS_LIBS) $(LIB_DL) + $(XML_LIBS) -lcrypto $(URCU_LIBS) $(URCU_CDS_LIBS) $(LIB_DL) $(GF_XLATOR_MGNT_LIBADD) noinst_HEADERS = glusterd.h glusterd-utils.h glusterd-op-sm.h \ glusterd-sm.h glusterd-store.h glusterd-mem-types.h \ @@ -46,7 +47,8 @@ noinst_HEADERS = glusterd.h glusterd-utils.h glusterd-op-sm.h \ glusterd-scrub-svc.h glusterd-server-quorum.h glusterd-errno.h \ glusterd-shd-svc.h glusterd-shd-svc-helper.h \ glusterd-gfproxyd-svc.h glusterd-gfproxyd-svc-helper.h \ - $(CONTRIBDIR)/userspace-rcu/rculist-extra.h + $(CONTRIBDIR)/userspace-rcu/rculist-extra.h \ + $(CONTRIBDIR)/mount/mntent_compat.h AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \ diff --git a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c index 
6d1a1e98848..e56cd0e6c74 100644 --- a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c @@ -1359,14 +1359,14 @@ glusterd_op_stage_add_brick(dict_t *dict, char **op_errstr, dict_t *rsp_dict) ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname); if (ret) { - gf_msg(THIS->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED, + gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED, "Unable to get volume name"); goto out; } ret = glusterd_volinfo_find(volname, &volinfo); if (ret) { - gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND, + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND, "Unable to find volume: %s", volname); goto out; } @@ -1378,13 +1378,7 @@ glusterd_op_stage_add_brick(dict_t *dict, char **op_errstr, dict_t *rsp_dict) ret = dict_get_int32n(dict, "replica-count", SLEN("replica-count"), &replica_count); if (ret) { - gf_msg_debug(THIS->name, 0, "Unable to get replica count"); - } - - ret = dict_get_int32n(dict, "arbiter-count", SLEN("arbiter-count"), - &arbiter_count); - if (ret) { - gf_msg_debug(THIS->name, 0, "No arbiter count present in the dict"); + gf_msg_debug(this->name, 0, "Unable to get replica count"); } if (replica_count > 0) { @@ -1400,18 +1394,18 @@ glusterd_op_stage_add_brick(dict_t *dict, char **op_errstr, dict_t *rsp_dict) glusterd_add_peers_to_auth_list(volname); - if (glusterd_is_volume_replicate(volinfo)) { + if (replica_count && glusterd_is_volume_replicate(volinfo)) { /* Do not allow add-brick for stopped volumes when replica-count * is being increased. 
*/ - if (conf->op_version >= GD_OP_VERSION_3_7_10 && replica_count && - GLUSTERD_STATUS_STOPPED == volinfo->status) { + if (GLUSTERD_STATUS_STOPPED == volinfo->status && + conf->op_version >= GD_OP_VERSION_3_7_10) { ret = -1; snprintf(msg, sizeof(msg), " Volume must not be in" " stopped state when replica-count needs to " " be increased."); - gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_ADD_FAIL, "%s", + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_ADD_FAIL, "%s", msg); *op_errstr = gf_strdup(msg); goto out; @@ -1419,25 +1413,31 @@ glusterd_op_stage_add_brick(dict_t *dict, char **op_errstr, dict_t *rsp_dict) /* op-version check for replica 2 to arbiter conversion. If we * don't have this check, an older peer added as arbiter brick * will not have the arbiter xlator in its volfile. */ - if ((conf->op_version < GD_OP_VERSION_3_8_0) && (arbiter_count == 1) && - (replica_count == 3)) { - ret = -1; - snprintf(msg, sizeof(msg), - "Cluster op-version must " - "be >= 30800 to add arbiter brick to a " - "replica 2 volume."); - gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_ADD_FAIL, "%s", - msg); - *op_errstr = gf_strdup(msg); - goto out; + if ((replica_count == 3) && (conf->op_version < GD_OP_VERSION_3_8_0)) { + ret = dict_get_int32n(dict, "arbiter-count", SLEN("arbiter-count"), + &arbiter_count); + if (ret) { + gf_msg_debug(this->name, 0, + "No arbiter count present in the dict"); + } else if (arbiter_count == 1) { + ret = -1; + snprintf(msg, sizeof(msg), + "Cluster op-version must " + "be >= 30800 to add arbiter brick to a " + "replica 2 volume."); + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_ADD_FAIL, "%s", + msg); + *op_errstr = gf_strdup(msg); + goto out; + } } /* Do not allow increasing replica count for arbiter volumes. 
*/ - if (replica_count && volinfo->arbiter_count) { + if (volinfo->arbiter_count) { ret = -1; snprintf(msg, sizeof(msg), "Increasing replica count " "for arbiter volumes is not supported."); - gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_ADD_FAIL, "%s", + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_ADD_FAIL, "%s", msg); *op_errstr = gf_strdup(msg); goto out; @@ -1451,7 +1451,7 @@ glusterd_op_stage_add_brick(dict_t *dict, char **op_errstr, dict_t *rsp_dict) * doing this check at the originator node is sufficient. */ - if (is_origin_glusterd(dict) && !is_force) { + if (!is_force && is_origin_glusterd(dict)) { ret = 0; if (volinfo->type == GF_CLUSTER_TYPE_REPLICATE) { gf_msg_debug(this->name, 0, @@ -1459,15 +1459,18 @@ glusterd_op_stage_add_brick(dict_t *dict, char **op_errstr, dict_t *rsp_dict) "found. Checking brick order."); if (replica_count) ret = glusterd_check_brick_order(dict, msg, volinfo->type, + &volname, &bricks, &count, replica_count); else ret = glusterd_check_brick_order(dict, msg, volinfo->type, + &volname, &bricks, &count, volinfo->replica_count); } else if (volinfo->type == GF_CLUSTER_TYPE_DISPERSE) { gf_msg_debug(this->name, 0, "Disperse cluster type" " found. Checking brick order."); - ret = glusterd_check_brick_order(dict, msg, volinfo->type, + ret = glusterd_check_brick_order(dict, msg, volinfo->type, &volname, + &bricks, &count, volinfo->disperse_count); } if (ret) { @@ -1496,7 +1499,7 @@ glusterd_op_stage_add_brick(dict_t *dict, char **op_errstr, dict_t *rsp_dict) if (len < 0) { strcpy(msg, "<error>"); } - gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_ADD_FAIL, "%s", + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_ADD_FAIL, "%s", msg); *op_errstr = gf_strdup(msg); goto out; @@ -1528,7 +1531,7 @@ glusterd_op_stage_add_brick(dict_t *dict, char **op_errstr, dict_t *rsp_dict) "Volume name %s rebalance is in " "progress. 
Please retry after completion", volname); - gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_OIP_RETRY_LATER, "%s", msg); + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_OIP_RETRY_LATER, "%s", msg); *op_errstr = gf_strdup(msg); ret = -1; goto out; @@ -1546,18 +1549,22 @@ glusterd_op_stage_add_brick(dict_t *dict, char **op_errstr, dict_t *rsp_dict) msg[0] = '\0'; } - ret = dict_get_int32n(dict, "count", SLEN("count"), &count); - if (ret) { - gf_msg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED, - "Unable to get count"); - goto out; + if (!count) { + ret = dict_get_int32n(dict, "count", SLEN("count"), &count); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED, + "Unable to get count"); + goto out; + } } - ret = dict_get_strn(dict, "bricks", SLEN("bricks"), &bricks); - if (ret) { - gf_msg(THIS->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED, - "Unable to get bricks"); - goto out; + if (!bricks) { + ret = dict_get_strn(dict, "bricks", SLEN("bricks"), &bricks); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED, + "Unable to get bricks"); + goto out; + } } if (bricks) { @@ -1576,7 +1583,7 @@ glusterd_op_stage_add_brick(dict_t *dict, char **op_errstr, dict_t *rsp_dict) "brick path %s is " "too long", brick); - gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_BRKPATH_TOO_LONG, "%s", + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRKPATH_TOO_LONG, "%s", msg); *op_errstr = gf_strdup(msg); @@ -1587,7 +1594,7 @@ glusterd_op_stage_add_brick(dict_t *dict, char **op_errstr, dict_t *rsp_dict) ret = glusterd_brickinfo_new_from_brick(brick, &brickinfo, _gf_true, NULL); if (ret) { - gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_NOT_FOUND, + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_NOT_FOUND, "Add-brick: Unable" " to get brickinfo"); goto out; @@ -1657,7 +1664,7 @@ out: GF_FREE(str_ret); GF_FREE(all_bricks); - gf_msg_debug(THIS->name, 0, "Returning %d", ret); + gf_msg_debug(this->name, 0, "Returning %d", ret); return ret; 
} @@ -2227,6 +2234,42 @@ out: } int +glusterd_post_commit_add_brick(dict_t *dict, char **op_errstr) +{ + int ret = 0; + char *volname = NULL; + + ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname); + + if (ret) { + gf_msg(THIS->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED, + "Unable to get volume name"); + goto out; + } + ret = glusterd_replace_old_auth_allow_list(volname); +out: + return ret; +} + +int +glusterd_post_commit_replace_brick(dict_t *dict, char **op_errstr) +{ + int ret = 0; + char *volname = NULL; + + ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname); + + if (ret) { + gf_msg(THIS->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED, + "Unable to get volume name"); + goto out; + } + ret = glusterd_replace_old_auth_allow_list(volname); +out: + return ret; +} + +int glusterd_set_rebalance_id_for_remove_brick(dict_t *req_dict, dict_t *rsp_dict) { int ret = -1; diff --git a/xlators/mgmt/glusterd/src/glusterd-ganesha.c b/xlators/mgmt/glusterd/src/glusterd-ganesha.c index 2d60daf180a..f08bd6cebee 100644 --- a/xlators/mgmt/glusterd/src/glusterd-ganesha.c +++ b/xlators/mgmt/glusterd/src/glusterd-ganesha.c @@ -421,6 +421,35 @@ check_host_list(void) } int +gd_ganesha_send_dbus(char *volname, char *value) +{ + runner_t runner = { + 0, + }; + int ret = -1; + runinit(&runner); + + GF_VALIDATE_OR_GOTO("glusterd-ganesha", volname, out); + GF_VALIDATE_OR_GOTO("glusterd-ganesha", value, out); + + ret = 0; + if (check_host_list()) { + /* Check whether ganesha is running on this node */ + if (manage_service("status")) { + gf_msg("glusterd-ganesha", GF_LOG_WARNING, 0, + GD_MSG_GANESHA_NOT_RUNNING, + "Export failed, NFS-Ganesha is not running"); + } else { + runner_add_args(&runner, GANESHA_PREFIX "/dbus-send.sh", CONFDIR, + value, volname, NULL); + ret = runner_run(&runner); + } + } +out: + return ret; +} + +int manage_export_config(char *volname, char *value, char **op_errstr) { runner_t runner = { @@ -447,9 +476,6 @@ int 
ganesha_manage_export(dict_t *dict, char *value, gf_boolean_t update_cache_invalidation, char **op_errstr) { - runner_t runner = { - 0, - }; int ret = -1; glusterd_volinfo_t *volinfo = NULL; dict_t *vol_opts = NULL; @@ -458,7 +484,6 @@ ganesha_manage_export(dict_t *dict, char *value, glusterd_conf_t *priv = NULL; gf_boolean_t option = _gf_false; - runinit(&runner); this = THIS; GF_ASSERT(this); priv = this->private; @@ -538,26 +563,13 @@ ganesha_manage_export(dict_t *dict, char *value, goto out; } } - - if (check_host_list()) { - /* Check whether ganesha is running on this node */ - if (manage_service("status")) { - gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_GANESHA_NOT_RUNNING, - "Export failed, NFS-Ganesha is not running"); - } else { - runner_add_args(&runner, GANESHA_PREFIX "/dbus-send.sh", CONFDIR, - value, volname, NULL); - ret = runner_run(&runner); - if (ret) { - gf_asprintf(op_errstr, - "Dynamic export" - " addition/deletion failed." - " Please see log file for details"); - goto out; - } - } + ret = gd_ganesha_send_dbus(volname, value); + if (ret) { + gf_asprintf(op_errstr, + "Dynamic export addition/deletion failed." 
+ " Please see log file for details"); + goto out; } - if (update_cache_invalidation) { vol_opts = volinfo->dict; ret = dict_set_dynstr_with_alloc(vol_opts, diff --git a/xlators/mgmt/glusterd/src/glusterd-gfproxyd-svc.c b/xlators/mgmt/glusterd/src/glusterd-gfproxyd-svc.c index b01fd4da24b..a0bfea41f0f 100644 --- a/xlators/mgmt/glusterd/src/glusterd-gfproxyd-svc.c +++ b/xlators/mgmt/glusterd/src/glusterd-gfproxyd-svc.c @@ -310,7 +310,7 @@ glusterd_gfproxydsvc_start(glusterd_svc_t *svc, int flags) } runinit(&runner); - if (this->ctx->cmd_args.valgrind) { + if (this->ctx->cmd_args.vgtool != _gf_none) { len = snprintf(valgrind_logfile, PATH_MAX, "%s/valgrind-%s", svc->proc.logdir, svc->proc.logfile); if ((len < 0) || (len >= PATH_MAX)) { @@ -318,8 +318,13 @@ glusterd_gfproxydsvc_start(glusterd_svc_t *svc, int flags) goto out; } - runner_add_args(&runner, "valgrind", "--leak-check=full", - "--trace-children=yes", "--track-origins=yes", NULL); + if (this->ctx->cmd_args.vgtool == _gf_memcheck) + runner_add_args(&runner, "valgrind", "--leak-check=full", + "--trace-children=yes", "--track-origins=yes", + NULL); + else + runner_add_args(&runner, "valgrind", "--tool=drd", NULL); + runner_argprintf(&runner, "--log-file=%s", valgrind_logfile); } diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c index 7d488ffd87a..1b21c40596d 100644 --- a/xlators/mgmt/glusterd/src/glusterd-handler.c +++ b/xlators/mgmt/glusterd/src/glusterd-handler.c @@ -5593,7 +5593,7 @@ glusterd_get_state(rpcsvc_request_t *req, dict_t *dict) ret = dict_get_strn(dict, "filename", SLEN("filename"), &tmp_str); if (ret) { - now = time(NULL); + now = gf_time(); strftime(timestamp, sizeof(timestamp), "%Y%m%d_%H%M%S", localtime(&now)); gf_asprintf(&filename, "%s_%s", "glusterd_state", timestamp); diff --git a/xlators/mgmt/glusterd/src/glusterd-hooks.c b/xlators/mgmt/glusterd/src/glusterd-hooks.c index d18eb6b2f5e..61c0f1c946f 100644 --- 
a/xlators/mgmt/glusterd/src/glusterd-hooks.c +++ b/xlators/mgmt/glusterd/src/glusterd-hooks.c @@ -206,11 +206,13 @@ glusterd_hooks_set_volume_args(dict_t *dict, runner_t *runner) int i = 0; int count = 0; int ret = -1; + int flag = 0; char query[1024] = { 0, }; char *key = NULL; char *value = NULL; + char *inet_family = NULL; xlator_t *this = NULL; this = THIS; GF_ASSERT(this); @@ -243,9 +245,23 @@ glusterd_hooks_set_volume_args(dict_t *dict, runner_t *runner) continue; runner_argprintf(runner, "%s=%s", key, value); + if ((strncmp(key, "cluster.enable-shared-storage", + SLEN("cluster.enable-shared-storage")) == 0 || + strncmp(key, "enable-shared-storage", + SLEN("enable-shared-storage")) == 0) && + strncmp(value, "enable", SLEN("enable")) == 0) + flag = 1; } glusterd_hooks_add_custom_args(dict, runner); + if (flag == 1) { + ret = dict_get_str_sizen(this->options, "transport.address-family", + &inet_family); + if (!ret) { + runner_argprintf(runner, "transport.address-family=%s", + inet_family); + } + } ret = 0; out: diff --git a/xlators/mgmt/glusterd/src/glusterd-log-ops.c b/xlators/mgmt/glusterd/src/glusterd-log-ops.c index a800d9543cf..34abf35cb00 100644 --- a/xlators/mgmt/glusterd/src/glusterd-log-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-log-ops.c @@ -76,7 +76,7 @@ __glusterd_handle_log_rotate(rpcsvc_request_t *req) "for volume %s", volname); - ret = dict_set_uint64(dict, "rotate-key", (uint64_t)time(NULL)); + ret = dict_set_uint64(dict, "rotate-key", (uint64_t)gf_time()); if (ret) goto out; diff --git a/xlators/mgmt/glusterd/src/glusterd-mem-types.h b/xlators/mgmt/glusterd/src/glusterd-mem-types.h index 17052cee263..d7257e1a7b5 100644 --- a/xlators/mgmt/glusterd/src/glusterd-mem-types.h +++ b/xlators/mgmt/glusterd/src/glusterd-mem-types.h @@ -27,6 +27,7 @@ typedef enum gf_gld_mem_types_ { gf_gld_mt_mop_stage_req_t, gf_gld_mt_probe_ctx_t, gf_gld_mt_glusterd_volinfo_t, + gf_gld_mt_volinfo_dict_data_t, gf_gld_mt_glusterd_brickinfo_t, 
gf_gld_mt_peer_hostname_t, gf_gld_mt_defrag_info, diff --git a/xlators/mgmt/glusterd/src/glusterd-messages.h b/xlators/mgmt/glusterd/src/glusterd-messages.h index c0891797fdf..3a1e600fb03 100644 --- a/xlators/mgmt/glusterd/src/glusterd-messages.h +++ b/xlators/mgmt/glusterd/src/glusterd-messages.h @@ -319,7 +319,8 @@ GLFS_MSGID( GD_MSG_SNAPSHOT_NOT_THIN_PROVISIONED, GD_MSG_VOL_STOP_ARGS_GET_FAILED, GD_MSG_LSTAT_FAIL, GD_MSG_VOLUME_NOT_IMPORTED, GD_MSG_ADD_BRICK_MNT_INFO_FAIL, GD_MSG_GET_MNT_ENTRY_INFO_FAIL, - GD_MSG_QUORUM_CLUSTER_COUNT_GET_FAIL); + GD_MSG_QUORUM_CLUSTER_COUNT_GET_FAIL, GD_MSG_POST_COMMIT_OP_FAIL, + GD_MSG_POST_COMMIT_FROM_UUID_REJCT, GD_MSG_POST_COMMIT_REQ_SEND_FAIL); #define GD_MSG_INVALID_ENTRY_STR "Invalid data entry" #define GD_MSG_INVALID_ARGUMENT_STR \ @@ -447,4 +448,4 @@ GLFS_MSGID( "Failed to allocate memory or get serialized length of dict" #define GD_MSG_GET_XATTR_FAIL_STR "Failed to get extended attribute" -#endif /* !_GLUSTERD_MESSAGES_H_ */
\ No newline at end of file +#endif /* !_GLUSTERD_MESSAGES_H_ */ diff --git a/xlators/mgmt/glusterd/src/glusterd-mgmt-handler.c b/xlators/mgmt/glusterd/src/glusterd-mgmt-handler.c index c170827eec0..1069688a89d 100644 --- a/xlators/mgmt/glusterd/src/glusterd-mgmt-handler.c +++ b/xlators/mgmt/glusterd/src/glusterd-mgmt-handler.c @@ -626,6 +626,136 @@ out: } static int +glusterd_mgmt_v3_post_commit_send_resp(rpcsvc_request_t *req, int32_t op, + int32_t status, char *op_errstr, + uint32_t op_errno, dict_t *rsp_dict) +{ + gd1_mgmt_v3_post_commit_rsp rsp = { + {0}, + }; + int ret = -1; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT(this); + GF_ASSERT(req); + + rsp.op_ret = status; + glusterd_get_uuid(&rsp.uuid); + rsp.op = op; + rsp.op_errno = op_errno; + if (op_errstr) + rsp.op_errstr = op_errstr; + else + rsp.op_errstr = ""; + + ret = dict_allocate_and_serialize(rsp_dict, &rsp.dict.dict_val, + &rsp.dict.dict_len); + if (ret < 0) { + gf_smsg(this->name, GF_LOG_ERROR, 0, + GD_MSG_DICT_ALLOC_AND_SERL_LENGTH_GET_FAIL, NULL); + goto out; + } + + ret = glusterd_submit_reply(req, &rsp, NULL, 0, NULL, + (xdrproc_t)xdr_gd1_mgmt_v3_post_commit_rsp); + + GF_FREE(rsp.dict.dict_val); +out: + gf_msg_debug(this->name, 0, "Responded to post commit, ret: %d", ret); + return ret; +} + +static int +glusterd_handle_post_commit_fn(rpcsvc_request_t *req) +{ + int32_t ret = -1; + gd1_mgmt_v3_post_commit_req op_req = { + {0}, + }; + xlator_t *this = NULL; + char *op_errstr = NULL; + dict_t *dict = NULL; + dict_t *rsp_dict = NULL; + uint32_t op_errno = 0; + + this = THIS; + GF_ASSERT(this); + GF_ASSERT(req); + + ret = xdr_to_generic(req->msg[0], &op_req, + (xdrproc_t)xdr_gd1_mgmt_v3_post_commit_req); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL, + "Failed to decode post commit " + "request received from peer"); + req->rpc_err = GARBAGE_ARGS; + goto out; + } + + if (glusterd_peerinfo_find_by_uuid(op_req.uuid) == NULL) { + gf_msg(this->name, 
GF_LOG_WARNING, 0, GD_MSG_PEER_NOT_FOUND, + "%s doesn't " + "belong to the cluster. Ignoring request.", + uuid_utoa(op_req.uuid)); + ret = -1; + goto out; + } + + dict = dict_new(); + if (!dict) { + gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_CREATE_FAIL, NULL); + goto out; + } + + ret = dict_unserialize(op_req.dict.dict_val, op_req.dict.dict_len, &dict); + if (ret) { + gf_smsg(this->name, GF_LOG_WARNING, 0, GD_MSG_DICT_UNSERIALIZE_FAIL, + NULL); + goto out; + } + + rsp_dict = dict_new(); + if (!rsp_dict) { + gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_CREATE_FAIL, NULL); + return -1; + } + + ret = gd_mgmt_v3_post_commit_fn(op_req.op, dict, &op_errstr, &op_errno, + rsp_dict); + + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_POST_COMMIT_OP_FAIL, + "post commit failed on operation %s", gd_op_list[op_req.op]); + } + + ret = glusterd_mgmt_v3_post_commit_send_resp(req, op_req.op, ret, op_errstr, + op_errno, rsp_dict); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_OP_RESP_FAIL, + "Failed to send post commit " + "response for operation %s", + gd_op_list[op_req.op]); + goto out; + } + +out: + if (op_errstr && (strcmp(op_errstr, ""))) + GF_FREE(op_errstr); + + free(op_req.dict.dict_val); + + if (dict) + dict_unref(dict); + + if (rsp_dict) + dict_unref(rsp_dict); + + /* Return 0 from handler to avoid double deletion of req obj */ + return 0; +} + +static int glusterd_mgmt_v3_post_validate_send_resp(rpcsvc_request_t *req, int32_t op, int32_t status, char *op_errstr, dict_t *rsp_dict) @@ -963,6 +1093,12 @@ glusterd_handle_commit(rpcsvc_request_t *req) } static int +glusterd_handle_post_commit(rpcsvc_request_t *req) +{ + return glusterd_big_locked_handler(req, glusterd_handle_post_commit_fn); +} + +static int glusterd_handle_post_validate(rpcsvc_request_t *req) { return glusterd_big_locked_handler(req, glusterd_handle_post_validate_fn); @@ -986,6 +1122,9 @@ static rpcsvc_actor_t gd_svc_mgmt_v3_actors[GLUSTERD_MGMT_V3_MAXVALUE] = { 
GLUSTERD_MGMT_V3_BRICK_OP, DRC_NA, 0}, [GLUSTERD_MGMT_V3_COMMIT] = {"COMMIT", glusterd_handle_commit, NULL, GLUSTERD_MGMT_V3_COMMIT, DRC_NA, 0}, + [GLUSTERD_MGMT_V3_POST_COMMIT] = {"POST_COMMIT", + glusterd_handle_post_commit, NULL, + GLUSTERD_MGMT_V3_POST_COMMIT, DRC_NA, 0}, [GLUSTERD_MGMT_V3_POST_VALIDATE] = {"POST_VAL", glusterd_handle_post_validate, NULL, GLUSTERD_MGMT_V3_POST_VALIDATE, DRC_NA, diff --git a/xlators/mgmt/glusterd/src/glusterd-mgmt.c b/xlators/mgmt/glusterd/src/glusterd-mgmt.c index b2128e10a04..bca7221062b 100644 --- a/xlators/mgmt/glusterd/src/glusterd-mgmt.c +++ b/xlators/mgmt/glusterd/src/glusterd-mgmt.c @@ -86,6 +86,11 @@ gd_mgmt_v3_collate_errors(struct syncargs *args, int op_ret, int op_errno, peer_str, err_string); break; } + case GLUSTERD_MGMT_V3_POST_COMMIT: { + snprintf(op_err, sizeof(op_err), "Post commit failed on %s. %s", + peer_str, err_string); + break; + } case GLUSTERD_MGMT_V3_POST_VALIDATE: { snprintf(op_err, sizeof(op_err), "Post Validation failed on %s. %s", peer_str, @@ -405,6 +410,47 @@ out: } int32_t +gd_mgmt_v3_post_commit_fn(glusterd_op_t op, dict_t *dict, char **op_errstr, + uint32_t *op_errno, dict_t *rsp_dict) +{ + int32_t ret = -1; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT(this); + GF_ASSERT(dict); + GF_ASSERT(op_errstr); + GF_VALIDATE_OR_GOTO(this->name, op_errno, out); + GF_ASSERT(rsp_dict); + + switch (op) { + case GD_OP_ADD_BRICK: + ret = glusterd_post_commit_add_brick(dict, op_errstr); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_POST_COMMIT_OP_FAIL, + "Add-brick post commit failed."); + goto out; + } + break; + case GD_OP_REPLACE_BRICK: + ret = glusterd_post_commit_replace_brick(dict, op_errstr); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_POST_COMMIT_OP_FAIL, + "Replace-brick post commit failed."); + goto out; + } + break; + default: + break; + } + + ret = 0; +out: + gf_msg_debug(this->name, 0, "OP = %d. 
Returning %d", op, ret); + return ret; +} + +int32_t gd_mgmt_v3_post_validate_fn(glusterd_op_t op, int32_t op_ret, dict_t *dict, char **op_errstr, dict_t *rsp_dict) { @@ -1720,6 +1766,274 @@ out: } int32_t +gd_mgmt_v3_post_commit_cbk_fn(struct rpc_req *req, struct iovec *iov, int count, + void *myframe) +{ + int32_t ret = -1; + struct syncargs *args = NULL; + gd1_mgmt_v3_post_commit_rsp rsp = { + {0}, + }; + call_frame_t *frame = NULL; + int32_t op_ret = -1; + int32_t op_errno = -1; + dict_t *rsp_dict = NULL; + xlator_t *this = NULL; + uuid_t *peerid = NULL; + + this = THIS; + GF_ASSERT(this); + GF_ASSERT(req); + GF_ASSERT(myframe); + + frame = myframe; + args = frame->local; + peerid = frame->cookie; + frame->local = NULL; + frame->cookie = NULL; + + if (-1 == req->rpc_status) { + op_errno = ENOTCONN; + goto out; + } + + GF_VALIDATE_OR_GOTO_WITH_ERROR(this->name, iov, out, op_errno, EINVAL); + + ret = xdr_to_generic(*iov, &rsp, + (xdrproc_t)xdr_gd1_mgmt_v3_post_commit_rsp); + if (ret < 0) + goto out; + + if (rsp.dict.dict_len) { + /* Unserialize the dictionary */ + rsp_dict = dict_new(); + + ret = dict_unserialize(rsp.dict.dict_val, rsp.dict.dict_len, &rsp_dict); + if (ret < 0) { + free(rsp.dict.dict_val); + goto out; + } else { + rsp_dict->extra_stdfree = rsp.dict.dict_val; + } + } + + gf_uuid_copy(args->uuid, rsp.uuid); + pthread_mutex_lock(&args->lock_dict); + { + ret = glusterd_syncop_aggr_rsp_dict(rsp.op, args->dict, rsp_dict); + } + pthread_mutex_unlock(&args->lock_dict); + + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RESP_AGGR_FAIL, "%s", + "Failed to aggregate response from " + " node/brick"); + if (!rsp.op_ret) + op_ret = ret; + else { + op_ret = rsp.op_ret; + op_errno = rsp.op_errno; + } + } else { + op_ret = rsp.op_ret; + op_errno = rsp.op_errno; + } + +out: + if (rsp_dict) + dict_unref(rsp_dict); + + gd_mgmt_v3_collate_errors(args, op_ret, op_errno, rsp.op_errstr, + GLUSTERD_MGMT_V3_POST_COMMIT, *peerid, rsp.uuid); + GF_FREE(peerid); + + 
if (rsp.op_errstr) + free(rsp.op_errstr); + + /* req->rpc_status set to -1 means, STACK_DESTROY will be called from + * the caller function. + */ + if (req->rpc_status != -1) + STACK_DESTROY(frame->root); + synctask_barrier_wake(args); + return 0; +} + +int32_t +gd_mgmt_v3_post_commit_cbk(struct rpc_req *req, struct iovec *iov, int count, + void *myframe) +{ + return glusterd_big_locked_cbk(req, iov, count, myframe, + gd_mgmt_v3_post_commit_cbk_fn); +} + +int +gd_mgmt_v3_post_commit_req(glusterd_op_t op, dict_t *op_ctx, + glusterd_peerinfo_t *peerinfo, struct syncargs *args, + uuid_t my_uuid, uuid_t recv_uuid) +{ + int32_t ret = -1; + gd1_mgmt_v3_post_commit_req req = { + {0}, + }; + xlator_t *this = NULL; + uuid_t *peerid = NULL; + + this = THIS; + GF_ASSERT(this); + GF_ASSERT(op_ctx); + GF_ASSERT(peerinfo); + GF_ASSERT(args); + + ret = dict_allocate_and_serialize(op_ctx, &req.dict.dict_val, + &req.dict.dict_len); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, errno, + GD_MSG_DICT_ALLOC_AND_SERL_LENGTH_GET_FAIL, NULL); + goto out; + } + + gf_uuid_copy(req.uuid, my_uuid); + req.op = op; + + GD_ALLOC_COPY_UUID(peerid, peerinfo->uuid, ret); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, errno, + GD_MSG_ALLOC_AND_COPY_UUID_FAIL, NULL); + goto out; + } + + ret = gd_syncop_submit_request( + peerinfo->rpc, &req, args, peerid, &gd_mgmt_v3_prog, + GLUSTERD_MGMT_V3_POST_COMMIT, gd_mgmt_v3_post_commit_cbk, + (xdrproc_t)xdr_gd1_mgmt_v3_post_commit_req); +out: + GF_FREE(req.dict.dict_val); + gf_msg_trace(this->name, 0, "Returning %d", ret); + return ret; +} + +int +glusterd_mgmt_v3_post_commit(glusterd_op_t op, dict_t *op_ctx, dict_t *req_dict, + char **op_errstr, uint32_t *op_errno, + uint32_t txn_generation) +{ + int32_t ret = -1; + int32_t peer_cnt = 0; + dict_t *rsp_dict = NULL; + glusterd_peerinfo_t *peerinfo = NULL; + struct syncargs args = {0}; + uuid_t peer_uuid = {0}; + xlator_t *this = NULL; + glusterd_conf_t *conf = NULL; + + this = THIS; + GF_ASSERT(this); + 
conf = this->private; + GF_ASSERT(conf); + + GF_ASSERT(op_ctx); + GF_ASSERT(req_dict); + GF_ASSERT(op_errstr); + GF_VALIDATE_OR_GOTO(this->name, op_errno, out); + + rsp_dict = dict_new(); + if (!rsp_dict) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_CREATE_FAIL, + "Failed to create response dictionary"); + goto out; + } + + /* Post commit on local node */ + ret = gd_mgmt_v3_post_commit_fn(op, req_dict, op_errstr, op_errno, + rsp_dict); + + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_POST_COMMIT_OP_FAIL, + "Post commit failed for " + "operation %s on local node", + gd_op_list[op]); + + if (*op_errstr == NULL) { + ret = gf_asprintf(op_errstr, + "Post commit failed " + "on localhost. Please " + "check log file for details."); + if (ret == -1) + *op_errstr = NULL; + + ret = -1; + } + goto out; + } + + ret = glusterd_syncop_aggr_rsp_dict(op, op_ctx, rsp_dict); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RESP_AGGR_FAIL, "%s", + "Failed to aggregate response from " + " node/brick"); + goto out; + } + + dict_unref(rsp_dict); + rsp_dict = NULL; + + /* Sending post commit req to other nodes in the cluster */ + gd_syncargs_init(&args, op_ctx); + ret = synctask_barrier_init((&args)); + if (ret) + goto out; + peer_cnt = 0; + + RCU_READ_LOCK; + cds_list_for_each_entry_rcu(peerinfo, &conf->peers, uuid_list) + { + /* Only send requests to peers who were available before the + * transaction started + */ + if (peerinfo->generation > txn_generation) + continue; + if (!peerinfo->connected) + continue; + + if (op != GD_OP_SYNC_VOLUME && + peerinfo->state.state != GD_FRIEND_STATE_BEFRIENDED) + continue; + + gd_mgmt_v3_post_commit_req(op, req_dict, peerinfo, &args, MY_UUID, + peer_uuid); + peer_cnt++; + } + RCU_READ_UNLOCK; + + if (0 == peer_cnt) { + ret = 0; + goto out; + } + + gd_synctask_barrier_wait((&args), peer_cnt); + + if (args.op_ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_POST_COMMIT_OP_FAIL, + "Post commit failed on peers"); + + if 
(args.errstr) + *op_errstr = gf_strdup(args.errstr); + } + + ret = args.op_ret; + *op_errno = args.op_errno; + + gf_msg_debug(this->name, 0, + "Sent post commit req for %s to %d " + "peers. Returning %d", + gd_op_list[op], peer_cnt, ret); +out: + glusterd_op_modify_op_ctx(op, op_ctx); + return ret; +} + +int32_t gd_mgmt_v3_post_validate_cbk_fn(struct rpc_req *req, struct iovec *iov, int count, void *myframe) { @@ -2408,6 +2722,15 @@ glusterd_mgmt_v3_initiate_all_phases(rpcsvc_request_t *req, glusterd_op_t op, goto out; } + /* POST COMMIT OP PHASE */ + ret = glusterd_mgmt_v3_post_commit(op, dict, req_dict, &op_errstr, + &op_errno, txn_generation); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_POST_COMMIT_OP_FAIL, + "Post commit Op Failed"); + goto out; + } + /* POST-COMMIT VALIDATE PHASE */ /* As of now, post_validate is not trying to cleanup any failed commands. So as of now, I am sending 0 (op_ret as 0). diff --git a/xlators/mgmt/glusterd/src/glusterd-mgmt.h b/xlators/mgmt/glusterd/src/glusterd-mgmt.h index 71f793d0397..27dd1849519 100644 --- a/xlators/mgmt/glusterd/src/glusterd-mgmt.h +++ b/xlators/mgmt/glusterd/src/glusterd-mgmt.h @@ -28,6 +28,10 @@ gd_mgmt_v3_commit_fn(glusterd_op_t op, dict_t *dict, char **op_errstr, uint32_t *op_errno, dict_t *rsp_dict); int32_t +gd_mgmt_v3_post_commit_fn(glusterd_op_t op, dict_t *dict, char **op_errstr, + uint32_t *op_errno, dict_t *rsp_dict); + +int32_t gd_mgmt_v3_post_validate_fn(glusterd_op_t op, int32_t op_ret, dict_t *dict, char **op_errstr, dict_t *rsp_dict); @@ -84,4 +88,10 @@ glusterd_reset_brick_prevalidate(dict_t *dict, char **op_errstr, dict_t *rsp_dict); int glusterd_op_reset_brick(dict_t *dict, dict_t *rsp_dict); + +int +glusterd_post_commit_add_brick(dict_t *dict, char **op_errstr); + +int +glusterd_post_commit_replace_brick(dict_t *dict, char **op_errstr); #endif /* _GLUSTERD_MGMT_H_ */ diff --git a/xlators/mgmt/glusterd/src/glusterd-rebalance.c b/xlators/mgmt/glusterd/src/glusterd-rebalance.c 
index 2afd0fe1b74..458bf168ede 100644 --- a/xlators/mgmt/glusterd/src/glusterd-rebalance.c +++ b/xlators/mgmt/glusterd/src/glusterd-rebalance.c @@ -219,6 +219,9 @@ glusterd_handle_defrag_start(glusterd_volinfo_t *volinfo, char *op_errstr, char valgrind_logfile[PATH_MAX] = { 0, }; + char msg[1024] = { + 0, + }; char *volfileserver = NULL; char *localtime_logging = NULL; @@ -270,12 +273,17 @@ glusterd_handle_defrag_start(glusterd_volinfo_t *volinfo, char *op_errstr, "rebalance"); runinit(&runner); - if (this->ctx->cmd_args.valgrind) { + if (this->ctx->cmd_args.vgtool != _gf_none) { snprintf(valgrind_logfile, PATH_MAX, "%s/valgrind-%s-rebalance.log", priv->logdir, volinfo->volname); - runner_add_args(&runner, "valgrind", "--leak-check=full", - "--trace-children=yes", "--track-origins=yes", NULL); + if (this->ctx->cmd_args.vgtool == _gf_memcheck) + runner_add_args(&runner, "valgrind", "--leak-check=full", + "--trace-children=yes", "--track-origins=yes", + NULL); + else + runner_add_args(&runner, "valgrind", "--tool=drd", NULL); + runner_argprintf(&runner, "--log-file=%s", valgrind_logfile); } @@ -316,6 +324,10 @@ glusterd_handle_defrag_start(glusterd_volinfo_t *volinfo, char *op_errstr, runner_add_arg(&runner, "--localtime-logging"); } + snprintf(msg, sizeof(msg), "Starting the rebalance service for volume %s", + volinfo->volname); + runner_log(&runner, this->name, GF_LOG_DEBUG, msg); + ret = runner_run_nowait(&runner); if (ret) { gf_msg_debug("glusterd", 0, "rebalance command failed"); diff --git a/xlators/mgmt/glusterd/src/glusterd-snapd-svc.c b/xlators/mgmt/glusterd/src/glusterd-snapd-svc.c index 1f3f4909cbb..d75f249b29e 100644 --- a/xlators/mgmt/glusterd/src/glusterd-snapd-svc.c +++ b/xlators/mgmt/glusterd/src/glusterd-snapd-svc.c @@ -304,7 +304,7 @@ glusterd_snapdsvc_start(glusterd_svc_t *svc, int flags) } runinit(&runner); - if (this->ctx->cmd_args.valgrind) { + if (this->ctx->cmd_args.vgtool != _gf_none) { len = snprintf(valgrind_logfile, PATH_MAX, 
"%s/valgrind-snapd.log", svc->proc.logdir); if ((len < 0) || (len >= PATH_MAX)) { @@ -313,8 +313,13 @@ glusterd_snapdsvc_start(glusterd_svc_t *svc, int flags) goto out; } - runner_add_args(&runner, "valgrind", "--leak-check=full", - "--trace-children=yes", "--track-origins=yes", NULL); + if (this->ctx->cmd_args.vgtool == _gf_memcheck) + runner_add_args(&runner, "valgrind", "--leak-check=full", + "--trace-children=yes", "--track-origins=yes", + NULL); + else + runner_add_args(&runner, "valgrind", "--tool=drd", NULL); + runner_argprintf(&runner, "--log-file=%s", valgrind_logfile); } diff --git a/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c b/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c index d96d5dd2cfc..995268b796d 100644 --- a/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c +++ b/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c @@ -2037,8 +2037,9 @@ glusterd_update_snaps_synctask(void *opaque) "Failed to remove snap %s", snap->snapname); goto out; } - if (dict) - dict_unref(dict); + + dict_unref(dict); + dict = NULL; } snprintf(buf, sizeof(buf), "%s.accept_peer_data", prefix); ret = dict_get_int32(peer_data, buf, &val); diff --git a/xlators/mgmt/glusterd/src/glusterd-snapshot.c b/xlators/mgmt/glusterd/src/glusterd-snapshot.c index c2428dc0de0..aeaa8d15214 100644 --- a/xlators/mgmt/glusterd/src/glusterd-snapshot.c +++ b/xlators/mgmt/glusterd/src/glusterd-snapshot.c @@ -3930,7 +3930,8 @@ glusterd_handle_snapshot_create(rpcsvc_request_t *req, glusterd_op_t op, goto out; } - ret = dict_set_int64(dict, "snap-time", (int64_t)time(&snap_time)); + snap_time = gf_time(); + ret = dict_set_int64(dict, "snap-time", (int64_t)snap_time); if (ret) { gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED, "Unable to set snap-time"); @@ -5322,6 +5323,48 @@ glusterd_do_snap_vol(glusterd_volinfo_t *origin_vol, glusterd_snap_t *snap, dict_deln(snap_vol->dict, "features.barrier", SLEN("features.barrier")); gd_update_volume_op_versions(snap_vol); + /* * + * 
Create the export file from the node where ganesha.enable "on" + * is executed + * */ + if (glusterd_is_ganesha_cluster() && + glusterd_check_ganesha_export(snap_vol)) { + if (is_origin_glusterd(dict)) { + ret = manage_export_config(clonename, "on", NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, + GD_MSG_EXPORT_FILE_CREATE_FAIL, + "Failed to create" + "export file for NFS-Ganesha\n"); + goto out; + } + } + + ret = dict_set_dynstr_with_alloc(snap_vol->dict, + "features.cache-invalidation", "on"); + ret = gd_ganesha_send_dbus(clonename, "on"); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_EXPORT_FILE_CREATE_FAIL, + "Dynamic export addition/deletion failed." + " Please see log file for details. Clone name = %s", + clonename); + goto out; + } + } + if (!glusterd_is_ganesha_cluster() && + glusterd_check_ganesha_export(snap_vol)) { + /* This happens when a snapshot was created when Ganesha was + * enabled globally. Then Ganesha disabled from the cluster. + * In such cases, we will have the volume level option set + * on dict, So we have to disable it as it doesn't make sense + * to keep the option. 
+ */ + + ret = dict_set_dynstr(snap_vol->dict, "ganesha.enable", "off"); + if (ret) + goto out; + } + ret = glusterd_store_volinfo(snap_vol, GLUSTERD_VOLINFO_VER_AC_INCREMENT); if (ret) { gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_SET_FAIL, @@ -5393,8 +5436,31 @@ out: for (i = 0; unsupported_opt[i].key; i++) GF_FREE(unsupported_opt[i].value); - if (snap_vol) + if (snap_vol) { + if (glusterd_is_ganesha_cluster() && + glusterd_check_ganesha_export(snap_vol)) { + if (is_origin_glusterd(dict)) { + ret = manage_export_config(clonename, "on", NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, + GD_MSG_EXPORT_FILE_CREATE_FAIL, + "Failed to create" + "export file for NFS-Ganesha\n"); + } + } + + ret = gd_ganesha_send_dbus(clonename, "off"); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, + GD_MSG_EXPORT_FILE_CREATE_FAIL, + "Dynamic export addition/deletion failed." + " Please see log file for details. Clone name = %s", + clonename); + } + } + glusterd_snap_volume_remove(rsp_dict, snap_vol, _gf_true, _gf_true); + } snap_vol = NULL; } diff --git a/xlators/mgmt/glusterd/src/glusterd-store.c b/xlators/mgmt/glusterd/src/glusterd-store.c index 465e41ef00b..d94dceb10b7 100644 --- a/xlators/mgmt/glusterd/src/glusterd-store.c +++ b/xlators/mgmt/glusterd/src/glusterd-store.c @@ -74,7 +74,7 @@ glusterd_replace_slash_with_hyphen(char *str) while (ptr) { *ptr = '-'; - ptr = strchr(str, '/'); + ptr = strchr(ptr, '/'); } } @@ -660,85 +660,72 @@ out: } static int -_storeslaves(dict_t *this, char *key, data_t *value, void *data) -{ - int32_t ret = 0; - gf_store_handle_t *shandle = NULL; - xlator_t *xl = NULL; - - xl = THIS; - GF_ASSERT(xl); - - shandle = (gf_store_handle_t *)data; - - GF_ASSERT(shandle); - GF_ASSERT(shandle->fd > 0); - GF_ASSERT(shandle->path); - GF_ASSERT(key); - GF_ASSERT(value); - GF_ASSERT(value->data); - - gf_msg_debug(xl->name, 0, "Storing in volinfo:key= %s, val=%s", key, - value->data); - - ret = gf_store_save_value(shandle->fd, key, (char 
*)value->data); - if (ret) { - gf_msg(xl->name, GF_LOG_ERROR, 0, GD_MSG_STORE_HANDLE_WRITE_FAIL, - "Unable to write into store" - " handle for path: %s", - shandle->path); - return -1; - } - return 0; -} - -int -_storeopts(dict_t *this, char *key, data_t *value, void *data) +_storeopts(dict_t *dict_value, char *key, data_t *value, void *data) { int32_t ret = 0; int32_t exists = 0; + int32_t option_len = 0; gf_store_handle_t *shandle = NULL; - xlator_t *xl = NULL; + glusterd_volinfo_data_store_t *dict_data = NULL; + xlator_t *this = NULL; - xl = THIS; - GF_ASSERT(xl); + this = THIS; + GF_ASSERT(this); - shandle = (gf_store_handle_t *)data; + dict_data = (glusterd_volinfo_data_store_t *)data; + shandle = dict_data->shandle; GF_ASSERT(shandle); GF_ASSERT(shandle->fd > 0); - GF_ASSERT(shandle->path); GF_ASSERT(key); GF_ASSERT(value); GF_ASSERT(value->data); - if (is_key_glusterd_hooks_friendly(key)) { - exists = 1; + if (dict_data->key_check == 1) { + if (is_key_glusterd_hooks_friendly(key)) { + exists = 1; - } else { - exists = glusterd_check_option_exists(key, NULL); + } else { + exists = glusterd_check_option_exists(key, NULL); + } } - - if (1 == exists) { - gf_msg_debug(xl->name, 0, - "Storing in volinfo:key= %s, " + if (exists == 1 || dict_data->key_check == 0) { + gf_msg_debug(this->name, 0, + "Storing in buffer for volinfo:key= %s, " "val=%s", key, value->data); - } else { - gf_msg_debug(xl->name, 0, "Discarding:key= %s, val=%s", key, + gf_msg_debug(this->name, 0, "Discarding:key= %s, val=%s", key, value->data); return 0; } - ret = gf_store_save_value(shandle->fd, key, (char *)value->data); - if (ret) { - gf_msg(xl->name, GF_LOG_ERROR, 0, GD_MSG_STORE_HANDLE_WRITE_FAIL, - "Unable to write into store" - " handle for path: %s", - shandle->path); + /* + * The option_len considers the length of the key value + * pair and along with that '=' and '\n', but as value->len + * already considers a NULL at the end of the data, adding + * just 1. 
+ */ + option_len = strlen(key) + value->len + 1; + + if ((VOLINFO_BUFFER_SIZE - dict_data->buffer_len - 1) < option_len) { + ret = gf_store_save_items(shandle->fd, dict_data->buffer); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_FILE_OP_FAILED, NULL); + return -1; + } + dict_data->buffer_len = 0; + dict_data->buffer[0] = '\0'; + } + ret = snprintf(dict_data->buffer + dict_data->buffer_len, option_len + 1, + "%s=%s\n", key, value->data); + if (ret < 0 || ret > option_len + 1) { + gf_smsg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_COPY_FAIL, NULL); return -1; } + + dict_data->buffer_len += ret; + return 0; } @@ -1013,7 +1000,7 @@ glusterd_store_create_snap_dir(glusterd_snap_t *snap) return ret; } -int32_t +static int32_t glusterd_store_volinfo_write(int fd, glusterd_volinfo_t *volinfo) { int32_t ret = -1; @@ -1021,19 +1008,47 @@ glusterd_store_volinfo_write(int fd, glusterd_volinfo_t *volinfo) GF_ASSERT(fd > 0); GF_ASSERT(volinfo); GF_ASSERT(volinfo->shandle); + xlator_t *this = NULL; + glusterd_volinfo_data_store_t *dict_data = NULL; + + this = THIS; + GF_ASSERT(this); shandle = volinfo->shandle; + + dict_data = GF_CALLOC(1, sizeof(glusterd_volinfo_data_store_t), + gf_gld_mt_volinfo_dict_data_t); + if (dict_data == NULL) { + gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_NO_MEMORY, NULL); + return -1; + } + ret = glusterd_volume_exclude_options_write(fd, volinfo); - if (ret) + if (ret) { goto out; + } + + dict_data->shandle = shandle; + dict_data->key_check = 1; shandle->fd = fd; - dict_foreach(volinfo->dict, _storeopts, shandle); + dict_foreach(volinfo->dict, _storeopts, (void *)dict_data); + + dict_data->key_check = 0; + dict_foreach(volinfo->gsync_slaves, _storeopts, (void *)dict_data); + + if (dict_data->buffer_len > 0) { + ret = gf_store_save_items(fd, dict_data->buffer); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_FILE_OP_FAILED, NULL); + goto out; + } + } - dict_foreach(volinfo->gsync_slaves, _storeslaves, shandle); shandle->fd = 0; 
out: - gf_msg_debug(THIS->name, 0, "Returning %d", ret); + GF_FREE(dict_data); + gf_msg_debug(this->name, 0, "Returning %d", ret); return ret; } @@ -1274,14 +1289,6 @@ out: return ret; } -static int -_gd_store_rebalance_dict(dict_t *dict, char *key, data_t *value, void *data) -{ - int fd = *(int *)data; - - return gf_store_save_value(fd, key, value->data); -} - int32_t glusterd_store_node_state_write(int fd, glusterd_volinfo_t *volinfo) { @@ -1289,6 +1296,12 @@ glusterd_store_node_state_write(int fd, glusterd_volinfo_t *volinfo) char buf[PATH_MAX]; char uuid[UUID_SIZE + 1]; uint total_len = 0; + glusterd_volinfo_data_store_t *dict_data = NULL; + gf_store_handle_t shandle; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT(this); GF_ASSERT(fd > 0); GF_ASSERT(volinfo); @@ -1328,14 +1341,33 @@ glusterd_store_node_state_write(int fd, glusterd_volinfo_t *volinfo) } ret = gf_store_save_items(fd, buf); - if (ret) + if (ret) { goto out; + } if (volinfo->rebal.dict) { - dict_foreach(volinfo->rebal.dict, _gd_store_rebalance_dict, &fd); + dict_data = GF_CALLOC(1, sizeof(glusterd_volinfo_data_store_t), + gf_gld_mt_volinfo_dict_data_t); + if (dict_data == NULL) { + gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_NO_MEMORY, NULL); + return -1; + } + dict_data->shandle = &shandle; + shandle.fd = fd; + dict_foreach(volinfo->rebal.dict, _storeopts, (void *)dict_data); + if (dict_data->buffer_len > 0) { + ret = gf_store_save_items(fd, dict_data->buffer); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_FILE_OP_FAILED, + NULL); + goto out; + ; + } + } } out: - gf_msg_debug(THIS->name, 0, "Returning %d", ret); + GF_FREE(dict_data); + gf_msg_debug(this->name, 0, "Returning %d", ret); return ret; } @@ -2309,7 +2341,7 @@ glusterd_store_retrieve_snapd(glusterd_volinfo_t *volinfo) ret = 0; out: - if (gf_store_iter_destroy(iter)) { + if (gf_store_iter_destroy(&iter)) { gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_ITER_DESTROY_FAIL, "Failed to destroy store iter"); ret = -1; @@ 
-2642,7 +2674,7 @@ glusterd_store_retrieve_bricks(glusterd_volinfo_t *volinfo) brick_count++; } - if (gf_store_iter_destroy(tmpiter)) { + if (gf_store_iter_destroy(&tmpiter)) { gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_ITER_DESTROY_FAIL, "Failed to destroy store iter"); ret = -1; @@ -2823,13 +2855,13 @@ glusterd_store_retrieve_bricks(glusterd_volinfo_t *volinfo) ret = 0; out: - if (gf_store_iter_destroy(tmpiter)) { + if (gf_store_iter_destroy(&tmpiter)) { gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_ITER_DESTROY_FAIL, "Failed to destroy store iter"); ret = -1; } - if (gf_store_iter_destroy(iter)) { + if (gf_store_iter_destroy(&iter)) { gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_ITER_DESTROY_FAIL, "Failed to destroy store iter"); ret = -1; @@ -2962,7 +2994,7 @@ glusterd_store_retrieve_node_state(glusterd_volinfo_t *volinfo) ret = 0; out: - if (gf_store_iter_destroy(iter)) { + if (gf_store_iter_destroy(&iter)) { gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_ITER_DESTROY_FAIL, "Failed to destroy store iter"); ret = -1; @@ -3238,7 +3270,7 @@ glusterd_store_update_volinfo(glusterd_volinfo_t *volinfo) ret = 0; out: - if (gf_store_iter_destroy(iter)) { + if (gf_store_iter_destroy(&iter)) { gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_ITER_DESTROY_FAIL, "Failed to destroy store iter"); ret = -1; @@ -3343,20 +3375,6 @@ glusterd_store_set_options_path(glusterd_conf_t *conf, char *path, size_t len) snprintf(path, len, "%s/options", conf->workdir); } -int -_store_global_opts(dict_t *this, char *key, data_t *value, void *data) -{ - gf_store_handle_t *shandle = data; - - if (gf_store_save_value(shandle->fd, key, (char *)value->data)) { - gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_STORE_HANDLE_WRITE_FAIL, - "Unable to write into store handle for key : %s, value %s", key, - (char *)value->data); - } - - return 0; -} - int32_t glusterd_store_options(xlator_t *this, dict_t *opts) { @@ -3365,13 +3383,15 @@ glusterd_store_options(xlator_t *this, dict_t *opts) 
char path[PATH_MAX] = {0}; int fd = -1; int32_t ret = -1; + glusterd_volinfo_data_store_t *dict_data = NULL; conf = this->private; glusterd_store_set_options_path(conf, path, sizeof(path)); ret = gf_store_handle_new(path, &shandle); - if (ret) + if (ret) { goto out; + } fd = gf_store_mkstemp(shandle); if (fd <= 0) { @@ -3379,15 +3399,30 @@ glusterd_store_options(xlator_t *this, dict_t *opts) goto out; } + dict_data = GF_CALLOC(1, sizeof(glusterd_volinfo_data_store_t), + gf_gld_mt_volinfo_dict_data_t); + if (dict_data == NULL) { + gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_NO_MEMORY, NULL); + return -1; + } + dict_data->shandle = shandle; shandle->fd = fd; - dict_foreach(opts, _store_global_opts, shandle); - shandle->fd = 0; + dict_foreach(opts, _storeopts, (void *)dict_data); + if (dict_data->buffer_len > 0) { + ret = gf_store_save_items(fd, dict_data->buffer); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_FILE_OP_FAILED, NULL); + goto out; + } + } + ret = gf_store_rename_tmppath(shandle); - if (ret) - goto out; out: - if ((ret < 0) && (fd > 0)) + shandle->fd = 0; + GF_FREE(dict_data); + if ((ret < 0) && (fd > 0)) { gf_store_unlink_tmppath(shandle); + } gf_store_handle_destroy(shandle); return ret; } @@ -3433,7 +3468,7 @@ glusterd_store_retrieve_options(xlator_t *this) goto out; ret = 0; out: - (void)gf_store_iter_destroy(iter); + (void)gf_store_iter_destroy(&iter); gf_store_handle_destroy(shandle); return ret; } @@ -3883,7 +3918,7 @@ glusterd_store_update_snap(glusterd_snap_t *snap) ret = 0; out: - if (gf_store_iter_destroy(iter)) { + if (gf_store_iter_destroy(&iter)) { gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_ITER_DESTROY_FAIL, "Failed to destroy store iter"); ret = -1; @@ -4625,7 +4660,7 @@ glusterd_store_retrieve_peers(xlator_t *this) is_ok = _gf_true; next: - (void)gf_store_iter_destroy(iter); + (void)gf_store_iter_destroy(&iter); if (!is_ok) { gf_log(this->name, GF_LOG_WARNING, diff --git a/xlators/mgmt/glusterd/src/glusterd-store.h 
b/xlators/mgmt/glusterd/src/glusterd-store.h index 04070549678..83f4df0783e 100644 --- a/xlators/mgmt/glusterd/src/glusterd-store.h +++ b/xlators/mgmt/glusterd/src/glusterd-store.h @@ -29,7 +29,7 @@ typedef enum glusterd_store_ver_ac_ { } glusterd_volinfo_ver_ac_t; #define UUID_SIZE 36 - +#define VOLINFO_BUFFER_SIZE 4093 #define GLUSTERD_STORE_UUID_KEY "UUID" #define GLUSTERD_STORE_KEY_VOL_TYPE "type" @@ -112,6 +112,19 @@ typedef enum glusterd_store_ver_ac_ { #define GLUSTERD_STORE_KEY_GANESHA_GLOBAL "nfs-ganesha" +/* + * The structure is responsible for handling the parameter for writes into + * the buffer before it is finally written to the file. The writes will be + * of the form of key-value pairs. + */ +struct glusterd_volinfo_data_store_ { + gf_store_handle_t *shandle; /*Contains fd and path of the file */ + int16_t buffer_len; + char key_check; /* flag to check if key is to be validated before write*/ + char buffer[VOLINFO_BUFFER_SIZE]; +}; +typedef struct glusterd_volinfo_data_store_ glusterd_volinfo_data_store_t; + int32_t glusterd_store_volinfo(glusterd_volinfo_t *volinfo, glusterd_volinfo_ver_ac_t ac); diff --git a/xlators/mgmt/glusterd/src/glusterd-svc-mgmt.c b/xlators/mgmt/glusterd/src/glusterd-svc-mgmt.c index 99119d69e45..18b3fb13630 100644 --- a/xlators/mgmt/glusterd/src/glusterd-svc-mgmt.c +++ b/xlators/mgmt/glusterd/src/glusterd-svc-mgmt.c @@ -162,6 +162,9 @@ glusterd_svc_start(glusterd_svc_t *svc, int flags, dict_t *cmdline) char *localtime_logging = NULL; char *log_level = NULL; char daemon_log_level[30] = {0}; + char msg[1024] = { + 0, + }; int32_t len = 0; this = THIS; @@ -187,7 +190,7 @@ glusterd_svc_start(glusterd_svc_t *svc, int flags, dict_t *cmdline) runinit(&runner); - if (this->ctx->cmd_args.valgrind) { + if (this->ctx->cmd_args.vgtool != _gf_none) { len = snprintf(valgrind_logfile, PATH_MAX, "%s/valgrind-%s.log", svc->proc.logdir, svc->name); if ((len < 0) || (len >= PATH_MAX)) { @@ -195,9 +198,13 @@ glusterd_svc_start(glusterd_svc_t 
*svc, int flags, dict_t *cmdline) goto unlock; } - runner_add_args(&runner, "valgrind", "--leak-check=full", - "--trace-children=yes", "--track-origins=yes", - NULL); + if (this->ctx->cmd_args.vgtool == _gf_memcheck) + runner_add_args(&runner, "valgrind", "--leak-check=full", + "--trace-children=yes", "--track-origins=yes", + NULL); + else + runner_add_args(&runner, "valgrind", "--tool=drd", NULL); + runner_argprintf(&runner, "--log-file=%s", valgrind_logfile); } @@ -226,8 +233,8 @@ glusterd_svc_start(glusterd_svc_t *svc, int flags, dict_t *cmdline) if (cmdline) dict_foreach(cmdline, svc_add_args, (void *)&runner); - gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_SVC_START_SUCCESS, - "Starting %s service", svc->name); + snprintf(msg, sizeof(msg), "Starting %s service", svc->name); + runner_log(&runner, this->name, GF_LOG_DEBUG, msg); if (flags == PROC_START_NO_WAIT) { ret = runner_run_nowait(&runner); diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c index 7d38b0a42d7..90ef2cf4c9c 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.c +++ b/xlators/mgmt/glusterd/src/glusterd-utils.c @@ -79,6 +79,14 @@ #include <sys/sockio.h> #endif +#ifdef __FreeBSD__ +#include <sys/sysctl.h> +#include <sys/param.h> +#include <sys/queue.h> +#include <libprocstat.h> +#include <libutil.h> +#endif + #define NFS_PROGRAM 100003 #define NFSV3_VERSION 3 @@ -1117,7 +1125,8 @@ glusterd_get_brick_mount_dir(char *brickpath, char *hostname, char *mount_dir) } brick_dir = &brickpath[strlen(mnt_pt)]; - brick_dir++; + if (brick_dir[0] == '/') + brick_dir++; snprintf(mount_dir, VALID_GLUSTERD_PATHMAX, "/%s", brick_dir); } @@ -2068,8 +2077,8 @@ glusterd_volume_start_glusterfs(glusterd_volinfo_t *volinfo, retry: runinit(&runner); - if (this->ctx->cmd_args.valgrind) { - /* Run bricks with valgrind */ + if (this->ctx->cmd_args.vgtool != _gf_none) { + /* Run bricks with valgrind. 
*/ if (volinfo->logdir) { len = snprintf(valgrind_logfile, PATH_MAX, "%s/valgrind-%s-%s.log", volinfo->logdir, volinfo->volname, exp_path); @@ -2083,8 +2092,13 @@ retry: goto out; } - runner_add_args(&runner, "valgrind", "--leak-check=full", - "--trace-children=yes", "--track-origins=yes", NULL); + if (this->ctx->cmd_args.vgtool == _gf_memcheck) + runner_add_args(&runner, "valgrind", "--leak-check=full", + "--trace-children=yes", "--track-origins=yes", + NULL); + else + runner_add_args(&runner, "valgrind", "--tool=drd", NULL); + runner_argprintf(&runner, "--log-file=%s", valgrind_logfile); } @@ -2197,7 +2211,7 @@ retry: if (is_brick_mx_enabled()) runner_add_arg(&runner, "--brick-mux"); - runner_log(&runner, "", 0, "Starting GlusterFS"); + runner_log(&runner, "", GF_LOG_DEBUG, "Starting GlusterFS"); brickinfo->port = port; brickinfo->rdma_port = rdma_port; @@ -2206,7 +2220,10 @@ retry: if (wait) { synclock_unlock(&priv->big_lock); + errno = 0; ret = runner_run(&runner); + if (errno != 0) + ret = errno; synclock_lock(&priv->big_lock); if (ret == EADDRINUSE) { @@ -2788,6 +2805,15 @@ glusterd_volume_compute_cksum(glusterd_volinfo_t *volinfo, char *cksum_path, ret = -1; goto out; } + } else if (priv->op_version < GD_OP_VERSION_7_0) { + ret = get_checksum_for_path(filepath, &cksum, priv->op_version); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_CKSUM_GET_FAIL, + "unable to get " + "checksum for path: %s", + filepath); + goto out; + } } ret = get_checksum_for_file(fd, &cksum, priv->op_version); @@ -6083,7 +6109,6 @@ send_attach_req(xlator_t *this, struct rpc_clnt *rpc, char *path, GF_ATOMIC_INC(conf->blockers); ret = rpc_clnt_submit(rpc, &gd_brick_prog, op, cbkfn, &iov, 1, NULL, 0, iobref, frame, NULL, 0, NULL, 0, NULL); - return ret; free_iobref: iobref_unref(iobref); @@ -6092,7 +6117,7 @@ maybe_free_iobuf: iobuf_unref(iobuf); } err: - return -1; + return ret; } extern size_t @@ -6420,7 +6445,6 @@ find_compatible_brick(glusterd_conf_t *conf, 
glusterd_volinfo_t *volinfo, int glusterd_get_sock_from_brick_pid(int pid, char *sockpath, size_t len) { - char fname[128] = ""; char buf[1024] = ""; char cmdline[2048] = ""; xlator_t *this = NULL; @@ -6435,6 +6459,22 @@ glusterd_get_sock_from_brick_pid(int pid, char *sockpath, size_t len) this = THIS; GF_ASSERT(this); +#ifdef __FreeBSD__ + blen = sizeof(buf); + int mib[4]; + + mib[0] = CTL_KERN; + mib[1] = KERN_PROC; + mib[2] = KERN_PROC_ARGS; + mib[3] = pid; + + if (sys_sysctl(mib, 4, buf, &blen, NULL, blen) != 0) { + gf_log(this->name, GF_LOG_ERROR, "brick process %d is not running", + pid); + return ret; + } +#else + char fname[128] = ""; snprintf(fname, sizeof(fname), "/proc/%d/cmdline", pid); if (sys_access(fname, R_OK) != 0) { @@ -6451,6 +6491,7 @@ glusterd_get_sock_from_brick_pid(int pid, char *sockpath, size_t len) strerror(errno), fname); return ret; } +#endif /* convert cmdline to single string */ for (i = 0, j = 0; i < blen; i++) { @@ -6499,6 +6540,43 @@ glusterd_get_sock_from_brick_pid(int pid, char *sockpath, size_t len) char * search_brick_path_from_proc(pid_t brick_pid, char *brickpath) { + char *brick_path = NULL; +#ifdef __FreeBSD__ + struct filestat *fst; + struct procstat *ps; + struct kinfo_proc *kp; + struct filestat_list *head; + + ps = procstat_open_sysctl(); + if (ps == NULL) + goto out; + + kp = kinfo_getproc(brick_pid); + if (kp == NULL) + goto out; + + head = procstat_getfiles(ps, (void *)kp, 0); + if (head == NULL) + goto out; + + STAILQ_FOREACH(fst, head, next) + { + if (fst->fs_fd < 0) + continue; + + if (!strcmp(fst->fs_path, brickpath)) { + brick_path = gf_strdup(fst->fs_path); + break; + } + } + +out: + if (head != NULL) + procstat_freefiles(ps, head); + if (kp != NULL) + free(kp); + procstat_close(ps); +#else struct dirent *dp = NULL; DIR *dirp = NULL; size_t len = 0; @@ -6509,7 +6587,6 @@ search_brick_path_from_proc(pid_t brick_pid, char *brickpath) 0, }, }; - char *brick_path = NULL; if (!brickpath) goto out; @@ -6547,6 +6624,7 
@@ search_brick_path_from_proc(pid_t brick_pid, char *brickpath) out: if (dirp) sys_closedir(dirp); +#endif return brick_path; } @@ -8417,7 +8495,8 @@ glusterd_sm_tr_log_transition_add(glusterd_sm_tr_log_t *log, int old_state, transitions[next].old_state = old_state; transitions[next].new_state = new_state; transitions[next].event = event; - time(&transitions[next].time); + transitions[next].time = gf_time(); + log->current = next; if (log->count < log->size) log->count++; @@ -14632,7 +14711,8 @@ glusterd_compare_addrinfo(struct addrinfo *first, struct addrinfo *next) */ int32_t glusterd_check_brick_order(dict_t *dict, char *err_str, int32_t type, - int32_t sub_count) + char **volname, char **brick_list, + int32_t *brick_count, int32_t sub_count) { int ret = -1; int i = 0; @@ -14643,12 +14723,9 @@ glusterd_check_brick_order(dict_t *dict, char *err_str, int32_t type, addrinfo_list_t *ai_list_tmp1 = NULL; addrinfo_list_t *ai_list_tmp2 = NULL; char *brick = NULL; - char *brick_list = NULL; char *brick_list_dup = NULL; char *brick_list_ptr = NULL; char *tmpptr = NULL; - char *volname = NULL; - int32_t brick_count = 0; struct addrinfo *ai_info = NULL; char brick_addr[128] = { 0, @@ -14676,32 +14753,38 @@ glusterd_check_brick_order(dict_t *dict, char *err_str, int32_t type, ai_list->info = NULL; CDS_INIT_LIST_HEAD(&ai_list->list); - ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname); - if (ret) { - gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, - "Unable to get volume name"); - goto out; + if (!(*volname)) { + ret = dict_get_strn(dict, "volname", SLEN("volname"), &(*volname)); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, + "Unable to get volume name"); + goto out; + } } - ret = dict_get_strn(dict, "bricks", SLEN("bricks"), &brick_list); - if (ret) { - gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, - "Bricks check : Could not " - "retrieve bricks list"); - goto out; + if (!(*brick_list)) { + ret = 
dict_get_strn(dict, "bricks", SLEN("bricks"), &(*brick_list)); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, + "Bricks check : Could not " + "retrieve bricks list"); + goto out; + } } - ret = dict_get_int32n(dict, "count", SLEN("count"), &brick_count); - if (ret) { - gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, - "Bricks check : Could not " - "retrieve brick count"); - goto out; + if (!(*brick_count)) { + ret = dict_get_int32n(dict, "count", SLEN("count"), &(*brick_count)); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, + "Bricks check : Could not " + "retrieve brick count"); + goto out; + } } - brick_list_dup = brick_list_ptr = gf_strdup(brick_list); + brick_list_dup = brick_list_ptr = gf_strdup(*brick_list); /* Resolve hostnames and get addrinfo */ - while (i < brick_count) { + while (i < *brick_count) { ++i; brick = strtok_r(brick_list_dup, " \n", &tmpptr); brick_list_dup = tmpptr; @@ -14737,8 +14820,12 @@ glusterd_check_brick_order(dict_t *dict, char *err_str, int32_t type, i = 0; ai_list_tmp1 = cds_list_entry(ai_list->list.next, addrinfo_list_t, list); + if (*brick_count < sub_count) { + sub_count = *brick_count; + } + /* Check for bad brick order */ - while (i < brick_count) { + while (i < *brick_count) { ++i; ai_info = ai_list_tmp1->info; ai_list_tmp1 = cds_list_entry(ai_list_tmp1->list.next, addrinfo_list_t, @@ -14901,3 +14988,59 @@ out: GF_FREE(new_auth_allow_list); return; } + +int +glusterd_replace_old_auth_allow_list(char *volname) +{ + int ret = 0; + glusterd_volinfo_t *volinfo = NULL; + xlator_t *this = NULL; + char *old_auth_allow_list = NULL; + + this = THIS; + GF_ASSERT(this); + + GF_VALIDATE_OR_GOTO(this->name, volname, out); + + ret = glusterd_volinfo_find(volname, &volinfo); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND, + "Unable to find volume: %s", volname); + goto out; + } + + ret = dict_get_str_sizen(volinfo->dict, "old.auth.allow", + 
&old_auth_allow_list); + if (ret) { + gf_msg(this->name, GF_LOG_INFO, errno, GD_MSG_DICT_GET_FAILED, + "old auth allow list is not set, no need to replace the list"); + ret = 0; + goto out; + } + + dict_del_sizen(volinfo->dict, "auth.allow"); + ret = dict_set_strn(volinfo->dict, "auth.allow", SLEN("auth.allow"), + old_auth_allow_list); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED, + "Unable to replace auth.allow list"); + goto out; + } + + dict_del_sizen(volinfo->dict, "old.auth.allow"); + + ret = glusterd_create_volfiles_and_notify_services(volinfo); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_VOLFILE_CREATE_FAIL, + "failed to create volfiles"); + goto out; + } + ret = glusterd_store_volinfo(volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_VOLINFO_STORE_FAIL, + "failed to store volinfo"); + goto out; + } +out: + return ret; +} diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.h b/xlators/mgmt/glusterd/src/glusterd-utils.h index 05346916968..bf6ac295e26 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.h +++ b/xlators/mgmt/glusterd/src/glusterd-utils.h @@ -859,6 +859,7 @@ glusterd_add_shd_to_dict(glusterd_volinfo_t *volinfo, dict_t *dict, int32_t count); int32_t glusterd_check_brick_order(dict_t *dict, char *err_str, int32_t type, + char **volname, char **bricks, int32_t *brick_count, int32_t sub_count); #endif diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c index 087be916c23..8d6fb5e0fac 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volgen.c +++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c @@ -3361,11 +3361,20 @@ volgen_link_bricks(volgen_graph_t *graph, glusterd_volinfo_t *volinfo, j); j++; } + if (!xl) { ret = -1; goto out; } + if (strncmp(xl_type, "performance/readdir-ahead", + SLEN("performance/readdir-ahead")) == 0) { + ret = xlator_set_fixed_option(xl, "performance.readdir-ahead", + "on"); 
+ if (ret) + goto out; + } + ret = volgen_xlator_link(xl, trav); if (ret) goto out; @@ -3593,13 +3602,13 @@ volgen_graph_build_readdir_ahead(volgen_graph_t *graph, int32_t clusters = 0; if (graph->type == GF_QUOTAD || graph->type == GF_SNAPD || - !glusterd_volinfo_get_boolean(volinfo, VKEY_PARALLEL_READDIR) || - !glusterd_volinfo_get_boolean(volinfo, VKEY_READDIR_AHEAD)) + !glusterd_volinfo_get_boolean(volinfo, VKEY_PARALLEL_READDIR)) goto out; clusters = volgen_link_bricks_from_list_tail( graph, volinfo, "performance/readdir-ahead", "%s-readdir-ahead-%d", child_count, 1); + out: return clusters; } @@ -3801,6 +3810,38 @@ out: } static int +set_volfile_id_option(volgen_graph_t *graph, glusterd_volinfo_t *volinfo, + int clusters) +{ + xlator_t *xlator = NULL; + int i = 0; + int ret = -1; + glusterd_conf_t *conf = NULL; + xlator_t *this = NULL; + + this = THIS; + GF_VALIDATE_OR_GOTO("glusterd", this, out); + conf = this->private; + GF_VALIDATE_OR_GOTO(this->name, conf, out); + + if (conf->op_version < GD_OP_VERSION_9_0) + return 0; + xlator = first_of(graph); + + for (i = 0; i < clusters; i++) { + ret = xlator_set_fixed_option(xlator, "volume-id", + uuid_utoa(volinfo->volume_id)); + if (ret) + goto out; + + xlator = xlator->next; + } + +out: + return ret; +} + +static int volgen_graph_build_afr_clusters(volgen_graph_t *graph, glusterd_volinfo_t *volinfo) { @@ -3842,6 +3883,13 @@ volgen_graph_build_afr_clusters(volgen_graph_t *graph, clusters = -1; goto out; } + + ret = set_volfile_id_option(graph, volinfo, clusters); + if (ret) { + clusters = -1; + goto out; + } + if (!volinfo->arbiter_count && !volinfo->thin_arbiter_count) goto out; diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c index cafdffb63c4..814ab14fb27 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c @@ -1002,7 +1002,8 @@ glusterd_op_stage_create_volume(dict_t *dict, char 
**op_errstr, gf_msg_debug(this->name, 0, "Replicate cluster type " "found. Checking brick order."); - ret = glusterd_check_brick_order(dict, msg, type, + ret = glusterd_check_brick_order(dict, msg, type, &volname, + &bricks, &brick_count, replica_count); } else if (type == GF_CLUSTER_TYPE_DISPERSE) { ret = dict_get_int32n(dict, "disperse-count", @@ -1016,7 +1017,8 @@ glusterd_op_stage_create_volume(dict_t *dict, char **op_errstr, gf_msg_debug(this->name, 0, "Disperse cluster type" " found. Checking brick order."); - ret = glusterd_check_brick_order(dict, msg, type, + ret = glusterd_check_brick_order(dict, msg, type, &volname, + &bricks, &brick_count, disperse_count); } if (ret) { diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index 3ac8e2a29d7..398b4d76f52 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -1813,7 +1813,7 @@ struct volopt_map_entry glusterd_volopt_map[] = { {.key = "performance.readdir-ahead", .voltype = "performance/readdir-ahead", .option = "!perf", - .value = "on", + .value = "off", .op_version = 3, .description = "enable/disable readdir-ahead translator in the volume.", .flags = VOLOPT_FLAG_CLIENT_OPT | VOLOPT_FLAG_XLATOR_OPT}, @@ -3138,4 +3138,9 @@ struct volopt_map_entry glusterd_volopt_map[] = { .type = NO_DOC, }, + {.key = "cluster.use-anonymous-inode", + .voltype = "cluster/replicate", + .op_version = GD_OP_VERSION_9_0, + .value = "yes", + .flags = VOLOPT_FLAG_CLIENT_OPT}, {.key = NULL}}; diff --git a/xlators/mgmt/glusterd/src/glusterd.c b/xlators/mgmt/glusterd/src/glusterd.c index 91c5f9ec5e3..7a86c2997b1 100644 --- a/xlators/mgmt/glusterd/src/glusterd.c +++ b/xlators/mgmt/glusterd/src/glusterd.c @@ -1423,7 +1423,7 @@ init(xlator_t *this) char *mountbroker_root = NULL; int i = 0; int total_transport = 0; - gf_boolean_t valgrind = _gf_false; + gf_valgrind_tool vgtool; char *valgrind_str = NULL; char 
*transport_type = NULL; char var_run_dir[PATH_MAX] = { @@ -1436,6 +1436,14 @@ init(xlator_t *this) int32_t len = 0; int op_version = 0; +#if defined(RUN_WITH_MEMCHECK) + vgtool = _gf_memcheck; +#elif defined(RUN_WITH_DRD) + vgtool = _gf_drd; +#else + vgtool = _gf_none; +#endif + #ifndef GF_DARWIN_HOST_OS { struct rlimit lim; @@ -1925,18 +1933,24 @@ init(xlator_t *this) } /* Set option to run bricks on valgrind if enabled in glusterd.vol */ - this->ctx->cmd_args.valgrind = valgrind; + this->ctx->cmd_args.vgtool = vgtool; ret = dict_get_str(this->options, "run-with-valgrind", &valgrind_str); if (ret < 0) { gf_msg_debug(this->name, 0, "cannot get run-with-valgrind value"); } if (valgrind_str) { - if (gf_string2boolean(valgrind_str, &valgrind)) { + gf_boolean_t vg = _gf_false; + + if (!strcmp(valgrind_str, "memcheck")) + this->ctx->cmd_args.vgtool = _gf_memcheck; + else if (!strcmp(valgrind_str, "drd")) + this->ctx->cmd_args.vgtool = _gf_drd; + else if (!gf_string2boolean(valgrind_str, &vg)) + this->ctx->cmd_args.vgtool = (vg ? 
_gf_memcheck : _gf_none); + else gf_msg(this->name, GF_LOG_WARNING, EINVAL, GD_MSG_INVALID_ENTRY, - "run-with-valgrind value not a boolean string"); - } else { - this->ctx->cmd_args.valgrind = valgrind; - } + "run-with-valgrind is neither boolean" + " nor one of 'memcheck' or 'drd'"); } /* Store ping-timeout in conf */ diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h index 2c8fab8f0e7..cc4f98ecf47 100644 --- a/xlators/mgmt/glusterd/src/glusterd.h +++ b/xlators/mgmt/glusterd/src/glusterd.h @@ -1197,6 +1197,8 @@ glusterd_op_set_ganesha(dict_t *dict, char **errstr); int ganesha_manage_export(dict_t *dict, char *value, gf_boolean_t update_cache_invalidation, char **op_errstr); +int +gd_ganesha_send_dbus(char *volname, char *value); gf_boolean_t glusterd_is_ganesha_cluster(); gf_boolean_t @@ -1367,4 +1369,7 @@ glusterd_recreate_volfiles(glusterd_conf_t *conf); void glusterd_add_peers_to_auth_list(char *volname); +int +glusterd_replace_old_auth_allow_list(char *volname); + #endif diff --git a/xlators/mount/fuse/src/fuse-bridge.c b/xlators/mount/fuse/src/fuse-bridge.c index f2eeac1d1ee..0e22fe411ee 100644 --- a/xlators/mount/fuse/src/fuse-bridge.c +++ b/xlators/mount/fuse/src/fuse-bridge.c @@ -179,7 +179,7 @@ fusedump_gettime(struct fusedump_timespec *fts) 0, }; - clock_gettime(CLOCK_REALTIME, &ts); + timespec_now_realtime(&ts); fts->sec = ts.tv_sec; fts->nsec = ts.tv_nsec; @@ -5899,7 +5899,9 @@ fuse_graph_sync(xlator_t *this) new_graph_id = priv->next_graph->id; priv->next_graph = NULL; need_first_lookup = 1; - priv->handle_graph_switch = _gf_true; + if (old_subvol) { + priv->handle_graph_switch = _gf_true; + } while (!priv->event_recvd) { ret = pthread_cond_wait(&priv->sync_cond, &priv->sync_mutex); @@ -5935,13 +5937,6 @@ unlock: if (winds_on_old_subvol == 0) { xlator_notify(old_subvol, GF_EVENT_PARENT_DOWN, old_subvol, NULL); } - } else { - pthread_mutex_lock(&priv->sync_mutex); - { - priv->handle_graph_switch = _gf_false; - 
pthread_cond_broadcast(&priv->migrate_cond); - } - pthread_mutex_unlock(&priv->sync_mutex); } return 0; diff --git a/xlators/mount/fuse/src/fuse-helpers.c b/xlators/mount/fuse/src/fuse-helpers.c index fd11f2ba652..a2b0ad11fe4 100644 --- a/xlators/mount/fuse/src/fuse-helpers.c +++ b/xlators/mount/fuse/src/fuse-helpers.c @@ -139,8 +139,6 @@ get_fuse_state(xlator_t *this, fuse_in_header_t *finh) return state; } -#define FUSE_MAX_AUX_GROUPS \ - 32 /* We can get only up to 32 aux groups from /proc */ void frame_fill_groups(call_frame_t *frame) { @@ -150,8 +148,6 @@ frame_fill_groups(call_frame_t *frame) char filename[32]; char line[4096]; char *ptr = NULL; - FILE *fp = NULL; - int idx = 0; long int id = 0; char *saveptr = NULL; char *endptr = NULL; @@ -191,45 +187,72 @@ frame_fill_groups(call_frame_t *frame) call_stack_set_groups(frame->root, ngroups, &mygroups); } else { + FILE *fp = NULL; + ret = snprintf(filename, sizeof filename, "/proc/%d/status", frame->root->pid); - if (ret >= sizeof filename) + if (ret >= sizeof filename) { + gf_log(this->name, GF_LOG_ERROR, "procfs path exceeds buffer size"); goto out; + } fp = fopen(filename, "r"); - if (!fp) + if (!fp) { + gf_log(this->name, GF_LOG_ERROR, "failed to open %s: %s", filename, + strerror(errno)); goto out; + } - if (call_stack_alloc_groups(frame->root, ngroups) != 0) - goto out; + for (;;) { + gf_boolean_t found_groups = _gf_false; + int idx = 0; - while ((ptr = fgets(line, sizeof line, fp))) { - if (strncmp(ptr, "Groups:", 7) != 0) - continue; + if (call_stack_alloc_groups(frame->root, ngroups) != 0) { + gf_log(this->name, GF_LOG_ERROR, + "failed to allocate gid buffer"); + goto out; + } + while ((ptr = fgets(line, sizeof line, fp))) { + if (strncmp(ptr, "Groups:", 7) == 0) { + found_groups = _gf_true; + break; + } + } + if (!found_groups) { + gf_log(this->name, GF_LOG_ERROR, "cannot find gid list in %s", + filename); + break; + } ptr = line + 8; for (ptr = strtok_r(ptr, " \t\r\n", &saveptr); ptr; ptr = 
strtok_r(NULL, " \t\r\n", &saveptr)) { errno = 0; id = strtol(ptr, &endptr, 0); - if (errno == ERANGE) - break; - if (!endptr || *endptr) + if (errno == ERANGE || !endptr || *endptr) { + gf_log(this->name, GF_LOG_ERROR, "failed to parse %s", + filename); break; - frame->root->groups[idx++] = id; - if (idx == FUSE_MAX_AUX_GROUPS) + } + if (idx < call_stack_groups_capacity(frame->root)) + frame->root->groups[idx] = id; + idx++; + if (idx == GF_MAX_AUX_GROUPS) break; } - - frame->root->ngrps = idx; - break; + if (idx > call_stack_groups_capacity(frame->root)) { + ngroups = idx; + rewind(fp); + } else { + frame->root->ngrps = idx; + break; + } } + out: + if (fp) + fclose(fp); } - -out: - if (fp) - fclose(fp); #elif defined(GF_SOLARIS_HOST_OS) char filename[32]; char scratch[128]; @@ -245,7 +268,7 @@ out: fp = fopen(filename, "r"); if (fp != NULL) { if (fgets(scratch, sizeof scratch, fp) != NULL) { - ngrps = MIN(prcred->pr_ngroups, FUSE_MAX_AUX_GROUPS); + ngrps = MIN(prcred->pr_ngroups, GF_MAX_AUX_GROUPS); if (call_stack_alloc_groups(frame->root, ngrps) != 0) { fclose(fp); return; diff --git a/xlators/mount/fuse/utils/mount_glusterfs.in b/xlators/mount/fuse/utils/mount_glusterfs.in index d43fc97d084..3a5feb606d7 100755 --- a/xlators/mount/fuse/utils/mount_glusterfs.in +++ b/xlators/mount/fuse/utils/mount_glusterfs.in @@ -469,6 +469,7 @@ parse_options() main () { +#if !defined(__FreeBSD__) ## `mount` on OSX specifies options as first argument echo $1|grep -q -- "-o" if [ $? 
-eq 0 ]; then @@ -478,7 +479,7 @@ main () volfile_loc=$1 mount_point=$2 fi - +#endif /* __FreeBSD__ */ while getopts "Vo:h" opt; do case "${opt}" in o) @@ -499,6 +500,12 @@ main () esac done +#ifdef __FreeBSD__ + shift $((OPTIND - 1)) + volfile_loc="$1" + mount_point="$2" +#endif /* __FreeBSD__ */ + [ -r "$volfile_loc" ] || { # '%' included to support ipv6 link local addresses server_ip=$(echo "$volfile_loc" | sed -n 's/\([a-zA-Z0-9:%.\-]*\):.*/\1/p'); diff --git a/xlators/nfs/server/src/acl3.c b/xlators/nfs/server/src/acl3.c index 3745188c7a5..7e3bbf16086 100644 --- a/xlators/nfs/server/src/acl3.c +++ b/xlators/nfs/server/src/acl3.c @@ -753,8 +753,7 @@ acl3svc_init(xlator_t *nfsx) goto err; } - ret = dict_set_dynstr(options, "transport.socket.listen-port", - GF_ACL3_PORT); + ret = dict_set_str(options, "transport.socket.listen-port", GF_ACL3_PORT); if (ret == -1) goto err; ret = dict_set_str(options, "transport-type", "socket"); diff --git a/xlators/nfs/server/src/auth-cache.c b/xlators/nfs/server/src/auth-cache.c index 64768646074..ffbf5b6cad6 100644 --- a/xlators/nfs/server/src/auth-cache.c +++ b/xlators/nfs/server/src/auth-cache.c @@ -189,7 +189,7 @@ out: static int _auth_cache_expired(struct auth_cache *cache, struct auth_cache_entry *entry) { - return ((time(NULL) - entry->timestamp) > cache->ttl_sec); + return ((gf_time() - entry->timestamp) > cache->ttl_sec); } /** @@ -474,7 +474,7 @@ cache_nfs_fh(struct auth_cache *cache, struct nfs3_fh *fh, goto out; } - entry->timestamp = time(NULL); + entry->timestamp = gf_time(); /* Update entry->item if it is pointing to a different export_item */ if (entry->item && entry->item != export_item) { GF_REF_PUT(entry->item); diff --git a/xlators/nfs/server/src/mount3udp_svc.c b/xlators/nfs/server/src/mount3udp_svc.c index 0688779eb65..1a2b0f85453 100644 --- a/xlators/nfs/server/src/mount3udp_svc.c +++ b/xlators/nfs/server/src/mount3udp_svc.c @@ -216,7 +216,7 @@ mount3udp_thread(void *argv) GF_ASSERT(nfsx); - 
glusterfs_this_set(nfsx); + THIS = nfsx; transp = svcudp_create(RPC_ANYSOCK); if (transp == NULL) { diff --git a/xlators/nfs/server/src/nfs3-helpers.c b/xlators/nfs/server/src/nfs3-helpers.c index 8a58977b53c..897fb42b071 100644 --- a/xlators/nfs/server/src/nfs3-helpers.c +++ b/xlators/nfs/server/src/nfs3-helpers.c @@ -1072,7 +1072,7 @@ nfs3_sattr3_to_setattr_valid(sattr3 *sattr, struct iatt *buf, mode_t *omode) if (sattr->atime.set_it == SET_TO_SERVER_TIME) { valid |= GF_SET_ATTR_ATIME; if (buf) - buf->ia_atime = time(NULL); + buf->ia_atime = gf_time(); } if (sattr->mtime.set_it == SET_TO_CLIENT_TIME) { @@ -1084,7 +1084,7 @@ nfs3_sattr3_to_setattr_valid(sattr3 *sattr, struct iatt *buf, mode_t *omode) if (sattr->mtime.set_it == SET_TO_SERVER_TIME) { valid |= GF_SET_ATTR_MTIME; if (buf) - buf->ia_mtime = time(NULL); + buf->ia_mtime = gf_time(); } return valid; diff --git a/xlators/nfs/server/src/nfs3.c b/xlators/nfs/server/src/nfs3.c index 7cfd75f9ed1..f9042bc3b3f 100644 --- a/xlators/nfs/server/src/nfs3.c +++ b/xlators/nfs/server/src/nfs3.c @@ -5651,7 +5651,7 @@ nfs3_init_state(xlator_t *nfsx) goto free_localpool; } - nfs3->serverstart = (uint64_t)time(NULL); + nfs3->serverstart = (uint64_t)gf_time(); INIT_LIST_HEAD(&nfs3->fdlru); LOCK_INIT(&nfs3->fdlrulock); nfs3->fdcount = 0; diff --git a/xlators/nfs/server/src/nlm4.c b/xlators/nfs/server/src/nlm4.c index c909e3bc093..577e8543966 100644 --- a/xlators/nfs/server/src/nlm4.c +++ b/xlators/nfs/server/src/nlm4.c @@ -1011,7 +1011,8 @@ nlm4_establish_callback(nfs3_call_state_t *cs, call_frame_t *cbk_frame) int port = -1; struct nlm4_notify_args *ncf = NULL; - glusterfs_this_set(cs->nfsx); + GF_ASSERT(cs->nfsx); + THIS = cs->nfsx; rpc_transport_get_peeraddr(cs->trans, NULL, 0, &sock_union.storage, sizeof(sock_union.storage)); @@ -2714,7 +2715,7 @@ nlm4svc_init(xlator_t *nfsx) goto err; } - (void)gf_thread_create(&thr, NULL, nsm_thread, (void *)NULL, "nfsnsm"); + (void)gf_thread_create(&thr, NULL, nsm_thread, nfsx, 
"nfsnsm"); timeout.tv_sec = nlm_grace_period; timeout.tv_nsec = 0; diff --git a/xlators/nfs/server/src/nlmcbk_svc.c b/xlators/nfs/server/src/nlmcbk_svc.c index d18b86ce8db..eaa7b916190 100644 --- a/xlators/nfs/server/src/nlmcbk_svc.c +++ b/xlators/nfs/server/src/nlmcbk_svc.c @@ -84,9 +84,14 @@ nlmcbk_program_0(struct svc_req *rqstp, register SVCXPRT *transp) void * nsm_thread(void *argv) { + xlator_t *nfsx = argv; register SVCXPRT *transp; int ret = 0; + GF_ASSERT(nfsx); + + THIS = nfsx; + ret = pmap_unset(NLMCBK_PROGRAM, NLMCBK_V1); if (ret == 0) { gf_msg(GF_NLM, GF_LOG_ERROR, 0, NFS_MSG_PMAP_UNSET_FAIL, diff --git a/xlators/performance/io-cache/src/io-cache.c b/xlators/performance/io-cache/src/io-cache.c index 2fefaef8de8..9375d29c17f 100644 --- a/xlators/performance/io-cache/src/io-cache.c +++ b/xlators/performance/io-cache/src/io-cache.c @@ -133,23 +133,17 @@ ioc_update_pages(call_frame_t *frame, ioc_inode_t *ioc_inode, return 0; } -int32_t +static gf_boolean_t ioc_inode_need_revalidate(ioc_inode_t *ioc_inode) { - int8_t need_revalidate = 0; - struct timeval tv = { - 0, - }; ioc_table_t *table = NULL; + GF_ASSERT(ioc_inode); table = ioc_inode->table; + GF_ASSERT(table); - gettimeofday(&tv, NULL); - - if (time_elapsed(&tv, &ioc_inode->cache.tv) >= table->cache_timeout) - need_revalidate = 1; - - return need_revalidate; + return (gf_time() - ioc_inode->cache.last_revalidate >= + table->cache_timeout); } /* @@ -411,9 +405,6 @@ ioc_cache_validate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ioc_inode_t *ioc_inode = NULL; size_t destroy_size = 0; struct iatt *local_stbuf = NULL; - struct timeval tv = { - 0, - }; local = frame->local; ioc_inode = local->inode; @@ -451,10 +442,9 @@ ioc_cache_validate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret < 0) local_stbuf = NULL; - gettimeofday(&tv, NULL); ioc_inode_lock(ioc_inode); { - memcpy(&ioc_inode->cache.tv, &tv, sizeof(struct timeval)); + ioc_inode->cache.last_revalidate = gf_time(); } 
ioc_inode_unlock(ioc_inode); @@ -1405,9 +1395,6 @@ ioc_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, { ioc_inode_t *ioc_inode = NULL; uint64_t tmp_inode = 0; - struct timeval tv = { - 0, - }; inode_ctx_get(fd->inode, this, &tmp_inode); ioc_inode = (ioc_inode_t *)(long)tmp_inode; @@ -1418,10 +1405,9 @@ ioc_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, return 0; } - gettimeofday(&tv, NULL); ioc_inode_lock(ioc_inode); { - memcpy(&ioc_inode->cache.tv, &tv, sizeof(struct timeval)); + ioc_inode->cache.last_revalidate = gf_time(); } ioc_inode_unlock(ioc_inode); @@ -1955,9 +1941,9 @@ __ioc_cache_dump(ioc_inode_t *ioc_inode, char *prefix) table = ioc_inode->table; - if (ioc_inode->cache.tv.tv_sec) { - gf_time_fmt_tv(timestr, sizeof timestr, &ioc_inode->cache.tv, - gf_timefmt_FT); + if (ioc_inode->cache.last_revalidate) { + gf_time_fmt(timestr, sizeof timestr, ioc_inode->cache.last_revalidate, + gf_timefmt_FT); gf_proc_dump_write("last-cache-validation-time", "%s", timestr); } diff --git a/xlators/performance/io-cache/src/io-cache.h b/xlators/performance/io-cache/src/io-cache.h index 4303c2fae13..14923c75edc 100644 --- a/xlators/performance/io-cache/src/io-cache.h +++ b/xlators/performance/io-cache/src/io-cache.h @@ -117,15 +117,13 @@ struct ioc_page { struct ioc_cache { rbthash_table_t *page_table; struct list_head page_lru; - time_t mtime; /* - * seconds component of file mtime - */ - time_t mtime_nsec; /* - * nanosecond component of file mtime - */ - struct timeval tv; /* - * time-stamp at last re-validate - */ + time_t mtime; /* + * seconds component of file mtime + */ + time_t mtime_nsec; /* + * nanosecond component of file mtime + */ + time_t last_revalidate; /* timestamp at last re-validate */ }; struct ioc_inode { @@ -270,17 +268,6 @@ ioc_frame_fill(ioc_page_t *page, call_frame_t *frame, off_t offset, size_t size, pthread_mutex_unlock(&page->page_lock); \ } while (0) -static inline uint64_t -time_elapsed(struct timeval *now, struct 
timeval *then) -{ - uint64_t sec = now->tv_sec - then->tv_sec; - - if (sec) - return sec; - - return 0; -} - ioc_inode_t * ioc_inode_search(ioc_table_t *table, inode_t *inode); diff --git a/xlators/performance/io-cache/src/page.c b/xlators/performance/io-cache/src/page.c index a8edbde23f2..84b1ae6cb20 100644 --- a/xlators/performance/io-cache/src/page.c +++ b/xlators/performance/io-cache/src/page.c @@ -413,9 +413,6 @@ ioc_fault_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, ioc_waitq_t *waitq = NULL; size_t iobref_page_size = 0; char zero_filled = 0; - struct timeval tv = { - 0, - }; GF_ASSERT(frame); @@ -431,7 +428,6 @@ ioc_fault_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, zero_filled = ((op_ret >= 0) && (stbuf->ia_mtime == 0)); - gettimeofday(&tv, NULL); ioc_inode_lock(ioc_inode); { if (op_ret == -1 || @@ -448,7 +444,7 @@ ioc_fault_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, ioc_inode->cache.mtime_nsec = stbuf->ia_mtime_nsec; } - memcpy(&ioc_inode->cache.tv, &tv, sizeof(struct timeval)); + ioc_inode->cache.last_revalidate = gf_time(); if (op_ret < 0) { /* error, readv returned -1 */ diff --git a/xlators/performance/io-threads/src/io-threads.c b/xlators/performance/io-threads/src/io-threads.c index 6fa4d88389c..3d24cc97f4b 100644 --- a/xlators/performance/io-threads/src/io-threads.c +++ b/xlators/performance/io-threads/src/io-threads.c @@ -1016,16 +1016,13 @@ static uint32_t THRESH_LIMIT = 1209600; /* SECONDS * (EVENTS-1) */ static void iot_apply_event(xlator_t *this, threshold_t *thresh) { - struct timespec now; - time_t delta; + time_t delta, now = gf_time(); /* Refresh for manual testing/debugging. It's cheap. */ THRESH_LIMIT = THRESH_SECONDS * (THRESH_EVENTS - 1); - timespec_now(&now); - if (thresh->value && thresh->update_time) { - delta = now.tv_sec - thresh->update_time; + delta = now - thresh->update_time; /* Be careful about underflow. 
*/ if (thresh->value <= delta) { thresh->value = 0; @@ -1046,7 +1043,7 @@ iot_apply_event(xlator_t *this, threshold_t *thresh) kill(getpid(), SIGTRAP); } - thresh->update_time = now.tv_sec; + thresh->update_time = now; } static void * @@ -1311,7 +1308,7 @@ notify(xlator_t *this, int32_t event, void *data, ...) /* Wait for draining stub from queue before notify PARENT_DOWN */ stub_cnt = GF_ATOMIC_GET(conf->stub_cnt); if (stub_cnt) { - clock_gettime(CLOCK_REALTIME, &sleep_till); + timespec_now_realtime(&sleep_till); sleep_till.tv_sec += 1; pthread_mutex_lock(&conf->mutex); { diff --git a/xlators/performance/md-cache/src/md-cache.c b/xlators/performance/md-cache/src/md-cache.c index 1313c2b55c3..a405be51f02 100644 --- a/xlators/performance/md-cache/src/md-cache.c +++ b/xlators/performance/md-cache/src/md-cache.c @@ -8,7 +8,6 @@ cases as published by the Free Software Foundation. */ -#include <glusterfs/timespec.h> #include <glusterfs/glusterfs.h> #include <glusterfs/defaults.h> #include <glusterfs/logging.h> @@ -33,8 +32,7 @@ struct mdc_statfs_cache { pthread_mutex_t lock; - gf_boolean_t initialized; - struct timespec last_refreshed; + time_t last_refreshed; /* (time_t)-1 if not yet initialized. 
*/ struct statvfs buf; }; @@ -61,7 +59,7 @@ struct mdc_statistics { }; struct mdc_conf { - int timeout; + uint32_t timeout; gf_boolean_t cache_posix_acl; gf_boolean_t cache_glusterfs_acl; gf_boolean_t cache_selinux; @@ -376,10 +374,9 @@ unlock: static gf_boolean_t __is_cache_valid(xlator_t *this, time_t mdc_time) { - time_t now = 0; gf_boolean_t ret = _gf_true; struct mdc_conf *conf = NULL; - int timeout = 0; + uint32_t timeout = 0; time_t last_child_down = 0; conf = this->private; @@ -393,15 +390,13 @@ __is_cache_valid(xlator_t *this, time_t mdc_time) last_child_down = conf->last_child_down; timeout = conf->timeout; - time(&now); - if ((mdc_time == 0) || ((last_child_down != 0) && (mdc_time < last_child_down))) { ret = _gf_false; goto out; } - if (now >= (mdc_time + timeout)) { + if (gf_time() >= (mdc_time + timeout)) { ret = _gf_false; } @@ -581,10 +576,9 @@ mdc_inode_iatt_set_validate(xlator_t *this, inode_t *inode, struct iatt *prebuf, mdc_from_iatt(mdc, iatt); mdc->valid = _gf_true; if (update_time) { - time(&mdc->ia_time); - + mdc->ia_time = gf_time(); if (mdc->xa_time && update_xa_time) - time(&mdc->xa_time); + mdc->xa_time = mdc->ia_time; } gf_msg_callingfn( @@ -785,7 +779,7 @@ mdc_inode_xatt_set(xlator_t *this, inode_t *inode, dict_t *dict) if (newdict) mdc->xattr = newdict; - time(&mdc->xa_time); + mdc->xa_time = gf_time(); gf_msg_trace("md-cache", 0, "xatt cache set for (%s) time:%lld", uuid_utoa(inode->gfid), (long long)mdc->xa_time); } @@ -1063,8 +1057,7 @@ mdc_cache_statfs(xlator_t *this, struct statvfs *buf) pthread_mutex_lock(&conf->statfs_cache.lock); { memcpy(&conf->statfs_cache.buf, buf, sizeof(struct statvfs)); - clock_gettime(CLOCK_MONOTONIC, &conf->statfs_cache.last_refreshed); - conf->statfs_cache.initialized = _gf_true; + conf->statfs_cache.last_refreshed = gf_time(); } pthread_mutex_unlock(&conf->statfs_cache.lock); } @@ -1073,8 +1066,7 @@ int mdc_load_statfs_info_from_cache(xlator_t *this, struct statvfs **buf) { struct mdc_conf *conf = 
this->private; - struct timespec now; - double cache_age = 0.0; + uint32_t cache_age = 0; int ret = 0; if (!buf || !conf) { @@ -1083,23 +1075,23 @@ mdc_load_statfs_info_from_cache(xlator_t *this, struct statvfs **buf) } *buf = NULL; - timespec_now(&now); pthread_mutex_lock(&conf->statfs_cache.lock); { - /* Skip if the cache is not initialized */ - if (!conf->statfs_cache.initialized) { + /* Skip if the cache is not initialized. */ + if (conf->statfs_cache.last_refreshed == (time_t)-1) { ret = -1; goto unlock; } - cache_age = (now.tv_sec - conf->statfs_cache.last_refreshed.tv_sec); + cache_age = (gf_time() - conf->statfs_cache.last_refreshed); - gf_log(this->name, GF_LOG_DEBUG, "STATFS cache age = %lf", cache_age); + gf_log(this->name, GF_LOG_DEBUG, "STATFS cache age = %u secs", + cache_age); if (cache_age > conf->timeout) { - /* Expire the cache */ + /* Expire the cache. */ gf_log(this->name, GF_LOG_DEBUG, - "Cache age %lf exceeded timeout %d", cache_age, + "Cache age %u secs exceeded timeout %u secs", cache_age, conf->timeout); ret = -1; goto unlock; @@ -3616,7 +3608,7 @@ int mdc_reconfigure(xlator_t *this, dict_t *options) { struct mdc_conf *conf = NULL; - int timeout = 0; + int timeout = 0, ret = 0; char *tmp_str = NULL; conf = this->private; @@ -3656,7 +3648,10 @@ mdc_reconfigure(xlator_t *this, dict_t *options) GF_OPTION_RECONF("md-cache-statfs", conf->cache_statfs, options, bool, out); GF_OPTION_RECONF("xattr-cache-list", tmp_str, options, str, out); - mdc_xattr_list_populate(conf, tmp_str); + + ret = mdc_xattr_list_populate(conf, tmp_str); + if (ret < 0) + goto out; /* If timeout is greater than 60s (default before the patch that added * cache invalidation support was added) then, cache invalidation @@ -3669,25 +3664,22 @@ mdc_reconfigure(xlator_t *this, dict_t *options) } conf->timeout = timeout; - (void)mdc_register_xattr_inval(this); + ret = mdc_register_xattr_inval(this); out: - return 0; + return ret; } int32_t mdc_mem_acct_init(xlator_t *this) { - int 
ret = -1; - - ret = xlator_mem_acct_init(this, gf_mdc_mt_end + 1); - return ret; + return xlator_mem_acct_init(this, gf_mdc_mt_end + 1); } int mdc_init(xlator_t *this) { struct mdc_conf *conf = NULL; - int timeout = 0; + uint32_t timeout = 0; char *tmp_str = NULL; conf = GF_CALLOC(sizeof(*conf), 1, gf_mdc_mt_mdc_conf_t); @@ -3699,7 +3691,7 @@ mdc_init(xlator_t *this) LOCK_INIT(&conf->lock); - GF_OPTION_INIT("md-cache-timeout", timeout, int32, out); + GF_OPTION_INIT("md-cache-timeout", timeout, uint32, out); GF_OPTION_INIT("cache-selinux", conf->cache_selinux, bool, out); @@ -3733,7 +3725,9 @@ mdc_init(xlator_t *this) GF_OPTION_INIT("xattr-cache-list", tmp_str, str, out); mdc_xattr_list_populate(conf, tmp_str); - time(&conf->last_child_down); + conf->last_child_down = gf_time(); + conf->statfs_cache.last_refreshed = (time_t)-1; + /* initialize gf_atomic_t counters */ GF_ATOMIC_INIT(conf->mdc_counter.stat_hit, 0); GF_ATOMIC_INIT(conf->mdc_counter.stat_miss, 0); @@ -3764,7 +3758,7 @@ out: } void -mdc_update_child_down_time(xlator_t *this, time_t *now) +mdc_update_child_down_time(xlator_t *this, time_t now) { struct mdc_conf *conf = NULL; @@ -3772,7 +3766,7 @@ mdc_update_child_down_time(xlator_t *this, time_t *now) LOCK(&conf->lock); { - conf->last_child_down = *now; + conf->last_child_down = now; } UNLOCK(&conf->lock); } @@ -3782,14 +3776,12 @@ mdc_notify(xlator_t *this, int event, void *data, ...) 
{ int ret = 0; struct mdc_conf *conf = NULL; - time_t now = 0; conf = this->private; switch (event) { case GF_EVENT_CHILD_DOWN: case GF_EVENT_SOME_DESCENDENT_DOWN: - time(&now); - mdc_update_child_down_time(this, &now); + mdc_update_child_down_time(this, gf_time()); break; case GF_EVENT_UPCALL: if (conf->mdc_invalidation) diff --git a/xlators/performance/nl-cache/src/nl-cache-helper.c b/xlators/performance/nl-cache/src/nl-cache-helper.c index 03dedf8ea08..29b99b5b8ea 100644 --- a/xlators/performance/nl-cache/src/nl-cache-helper.c +++ b/xlators/performance/nl-cache/src/nl-cache-helper.c @@ -113,7 +113,7 @@ out: } void -nlc_update_child_down_time(xlator_t *this, time_t *now) +nlc_update_child_down_time(xlator_t *this, time_t now) { nlc_conf_t *conf = NULL; @@ -121,7 +121,7 @@ nlc_update_child_down_time(xlator_t *this, time_t *now) LOCK(&conf->lock); { - conf->last_child_down = *now; + conf->last_child_down = now; } UNLOCK(&conf->lock); @@ -262,7 +262,7 @@ nlc_init_invalid_ctx(xlator_t *this, inode_t *inode, nlc_ctx_t *nlc_ctx) if (nlc_ctx->timer) { gf_tw_mod_timer_pending(conf->timer_wheel, nlc_ctx->timer, conf->cache_timeout); - time(&nlc_ctx->cache_time); + nlc_ctx->cache_time = gf_time(); goto unlock; } @@ -496,7 +496,7 @@ __nlc_inode_ctx_timer_start(xlator_t *this, inode_t *inode, nlc_ctx_t *nlc_ctx) nlc_ctx->timer_data = tmp; gf_tw_add_timer(conf->timer_wheel, timer); - time(&nlc_ctx->cache_time); + nlc_ctx->cache_time = gf_time(); gf_msg_trace(this->name, 0, "Registering timer:%p, inode:%p, " "gfid:%s", diff --git a/xlators/performance/nl-cache/src/nl-cache.c b/xlators/performance/nl-cache/src/nl-cache.c index cd0e1d195fd..33a7c471663 100644 --- a/xlators/performance/nl-cache/src/nl-cache.c +++ b/xlators/performance/nl-cache/src/nl-cache.c @@ -520,15 +520,13 @@ int nlc_notify(xlator_t *this, int event, void *data, ...) 
{ int ret = 0; - time_t now = 0; switch (event) { case GF_EVENT_CHILD_DOWN: case GF_EVENT_SOME_DESCENDENT_DOWN: case GF_EVENT_CHILD_UP: case GF_EVENT_SOME_DESCENDENT_UP: - time(&now); - nlc_update_child_down_time(this, &now); + nlc_update_child_down_time(this, gf_time()); /* TODO: nlc_clear_all_cache (this); else lru prune will lazily clear it*/ break; @@ -731,7 +729,7 @@ nlc_init(xlator_t *this) GF_ATOMIC_INIT(conf->nlc_counter.nlc_invals, 0); INIT_LIST_HEAD(&conf->lru); - time(&conf->last_child_down); + conf->last_child_down = gf_time(); conf->timer_wheel = glusterfs_ctx_tw_get(this->ctx); if (!conf->timer_wheel) { diff --git a/xlators/performance/nl-cache/src/nl-cache.h b/xlators/performance/nl-cache/src/nl-cache.h index 8b09972bb09..85fcc176342 100644 --- a/xlators/performance/nl-cache/src/nl-cache.h +++ b/xlators/performance/nl-cache/src/nl-cache.h @@ -155,7 +155,7 @@ nlc_local_init(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop, loc_t *loc, loc_t *loc2); void -nlc_update_child_down_time(xlator_t *this, time_t *now); +nlc_update_child_down_time(xlator_t *this, time_t now); void nlc_inode_clear_cache(xlator_t *this, inode_t *inode, int reason); diff --git a/xlators/performance/open-behind/src/open-behind.c b/xlators/performance/open-behind/src/open-behind.c index e43fe73bcca..600c3b62ffe 100644 --- a/xlators/performance/open-behind/src/open-behind.c +++ b/xlators/performance/open-behind/src/open-behind.c @@ -333,6 +333,15 @@ ob_stub_dispatch(xlator_t *xl, ob_inode_t *ob_inode, fd_t *fd, return 0; } +static void +ob_open_destroy(call_stub_t *stub, fd_t *fd) +{ + stub->frame->local = NULL; + STACK_DESTROY(stub->frame->root); + call_stub_destroy(stub); + fd_unref(fd); +} + static int32_t ob_open_dispatch(xlator_t *xl, ob_inode_t *ob_inode, fd_t *fd, call_stub_t *stub) @@ -355,8 +364,7 @@ ob_open_dispatch(xlator_t *xl, ob_inode_t *ob_inode, fd_t *fd, if (stub != NULL) { if (closed) { - call_stub_destroy(stub); - fd_unref(fd); + ob_open_destroy(stub, fd); 
} else { call_resume(stub); } @@ -509,6 +517,56 @@ ob_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, fd_t *fd, } static int32_t +ob_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +{ + ob_inode_t *ob_inode; + call_stub_t *stub; + fd_t *first_fd; + ob_state_t state; + + /* Create requests are never delayed. We always send them synchronously. */ + state = ob_open_and_resume_fd(this, fd, 1, true, true, &ob_inode, + &first_fd); + if (state == OB_STATE_READY) { + /* There's no pending open, but there are other file descriptors opened + * so we simply forward the request synchronously. */ + return default_create(frame, this, loc, flags, mode, umask, fd, xdata); + } + + if (state == OB_STATE_OPEN_TRIGGERED) { + /* The first open is in progress (either because it was already issued + * or because this request triggered it). We try to create a new stub + * to retry the operation once the initial open completes. */ + stub = fop_create_stub(frame, ob_create, loc, flags, mode, umask, fd, + xdata); + if (stub != NULL) { + return ob_stub_dispatch(this, ob_inode, first_fd, stub); + } + + state = -ENOMEM; + } + + /* Since we forced a synchronous request, OB_STATE_FIRST_OPEN will never + * be returned by ob_open_and_resume_fd(). If we are here it can only be + * because there has been a problem. */ + + /* In case of failure we need to decrement the number of open files because + * ob_fdclose() won't be called. 
*/ + + LOCK(&fd->inode->lock); + { + ob_inode->open_count--; + } + UNLOCK(&fd->inode->lock); + + gf_smsg(this->name, GF_LOG_ERROR, -state, OPEN_BEHIND_MSG_FAILED, "fop=%s", + "create", "path=%s", loc->path, NULL); + + return default_create_failure_cbk(frame, -state); +} + +static int32_t ob_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata) { @@ -776,8 +834,7 @@ ob_fdclose(xlator_t *this, fd_t *fd) UNLOCK(&fd->inode->lock); if (stub != NULL) { - call_stub_destroy(stub); - fd_unref(fd); + ob_open_destroy(stub, fd); } ob_resume_pending(&list); @@ -940,6 +997,7 @@ fini(xlator_t *this) struct xlator_fops fops = { .open = ob_open, + .create = ob_create, .readv = ob_readv, .writev = ob_writev, .flush = ob_flush, diff --git a/xlators/performance/quick-read/src/quick-read.c b/xlators/performance/quick-read/src/quick-read.c index 640c9ac51c6..7fe4b3c3a4b 100644 --- a/xlators/performance/quick-read/src/quick-read.c +++ b/xlators/performance/quick-read/src/quick-read.c @@ -421,9 +421,6 @@ qr_content_update(xlator_t *this, qr_inode_t *qr_inode, void *data, qr_private_t *priv = NULL; qr_inode_table_t *table = NULL; uint32_t rollover = 0; - struct timeval tv = { - 0, - }; rollover = gen >> 32; gen = gen & 0xffffffff; @@ -431,7 +428,6 @@ qr_content_update(xlator_t *this, qr_inode_t *qr_inode, void *data, priv = this->private; table = &priv->table; - gettimeofday(&tv, NULL); LOCK(&table->lock); { if ((rollover != qr_inode->gen_rollover) || @@ -453,8 +449,7 @@ qr_content_update(xlator_t *this, qr_inode_t *qr_inode, void *data, qr_inode->ia_ctime_nsec = buf->ia_ctime_nsec; qr_inode->buf = *buf; - - memcpy(&qr_inode->last_refresh, &tv, sizeof(struct timeval)); + qr_inode->last_refresh = gf_time(); __qr_inode_register(this, table, qr_inode); } @@ -524,9 +519,7 @@ __qr_content_refresh(xlator_t *this, qr_inode_t *qr_inode, struct iatt *buf, if (qr_size_fits(conf, buf) && qr_time_equal(conf, qr_inode, buf)) { qr_inode->buf = 
*buf; - - gettimeofday(&qr_inode->last_refresh, NULL); - + qr_inode->last_refresh = gf_time(); __qr_inode_register(this, table, qr_inode); } else { __qr_inode_prune(this, table, qr_inode, gen); @@ -558,20 +551,14 @@ __qr_cache_is_fresh(xlator_t *this, qr_inode_t *qr_inode) { qr_conf_t *conf = NULL; qr_private_t *priv = NULL; - struct timeval now; - struct timeval diff; priv = this->private; conf = &priv->conf; - gettimeofday(&now, NULL); - - timersub(&now, &qr_inode->last_refresh, &diff); - - if (qr_inode->last_refresh.tv_sec < priv->last_child_down) + if (qr_inode->last_refresh < priv->last_child_down) return _gf_false; - if (diff.tv_sec >= conf->cache_timeout) + if (gf_time() - qr_inode->last_refresh >= conf->cache_timeout) return _gf_false; return _gf_true; @@ -1049,9 +1036,8 @@ qr_inodectx_dump(xlator_t *this, inode_t *inode) gf_proc_dump_write("entire-file-cached", "%s", qr_inode->data ? "yes" : "no"); - if (qr_inode->last_refresh.tv_sec) { - gf_time_fmt_tv(buf, sizeof buf, &qr_inode->last_refresh, gf_timefmt_FT); - + if (qr_inode->last_refresh) { + gf_time_fmt(buf, sizeof buf, qr_inode->last_refresh, gf_timefmt_FT); gf_proc_dump_write("last-cache-validation-time", "%s", buf); } @@ -1404,7 +1390,7 @@ qr_init(xlator_t *this) ret = 0; - time(&priv->last_child_down); + priv->last_child_down = gf_time(); GF_ATOMIC_INIT(priv->generation, 0); this->private = priv; out: @@ -1454,7 +1440,7 @@ qr_conf_destroy(qr_conf_t *conf) } void -qr_update_child_down_time(xlator_t *this, time_t *now) +qr_update_child_down_time(xlator_t *this, time_t now) { qr_private_t *priv = NULL; @@ -1462,7 +1448,7 @@ qr_update_child_down_time(xlator_t *this, time_t *now) LOCK(&priv->lock); { - priv->last_child_down = *now; + priv->last_child_down = now; } UNLOCK(&priv->lock); } @@ -1508,7 +1494,6 @@ qr_notify(xlator_t *this, int event, void *data, ...) 
{ int ret = 0; qr_private_t *priv = NULL; - time_t now = 0; qr_conf_t *conf = NULL; priv = this->private; @@ -1517,8 +1502,7 @@ qr_notify(xlator_t *this, int event, void *data, ...) switch (event) { case GF_EVENT_CHILD_DOWN: case GF_EVENT_SOME_DESCENDENT_DOWN: - time(&now); - qr_update_child_down_time(this, &now); + qr_update_child_down_time(this, gf_time()); break; case GF_EVENT_UPCALL: if (conf->qr_invalidation) diff --git a/xlators/performance/quick-read/src/quick-read.h b/xlators/performance/quick-read/src/quick-read.h index 67850821b8e..20fcc70b3a7 100644 --- a/xlators/performance/quick-read/src/quick-read.h +++ b/xlators/performance/quick-read/src/quick-read.h @@ -39,7 +39,7 @@ struct qr_inode { uint32_t ia_ctime_nsec; uint32_t gen_rollover; struct iatt buf; - struct timeval last_refresh; + time_t last_refresh; struct list_head lru; uint64_t gen; uint64_t invalidation_time; diff --git a/xlators/protocol/server/src/server.c b/xlators/protocol/server/src/server.c index 48f4d0a7f87..721968004a0 100644 --- a/xlators/protocol/server/src/server.c +++ b/xlators/protocol/server/src/server.c @@ -267,6 +267,8 @@ server_priv(xlator_t *this) gf_proc_dump_build_key(key, "server", "total-bytes-write"); gf_proc_dump_write(key, "%" PRIu64, total_write); + rpcsvc_statedump(conf->rpc); + ret = 0; out: if (ret) diff --git a/xlators/storage/posix/src/posix-common.c b/xlators/storage/posix/src/posix-common.c index 609ddea2560..f10722ec3fb 100644 --- a/xlators/storage/posix/src/posix-common.c +++ b/xlators/storage/posix/src/posix-common.c @@ -140,6 +140,7 @@ posix_notify(xlator_t *this, int32_t event, void *data, ...) struct timespec sleep_till = { 0, }; + glusterfs_ctx_t *ctx = this->ctx; switch (event) { case GF_EVENT_PARENT_UP: { @@ -150,8 +151,6 @@ posix_notify(xlator_t *this, int32_t event, void *data, ...) 
case GF_EVENT_PARENT_DOWN: { if (!victim->cleanup_starting) break; - gf_log(this->name, GF_LOG_INFO, "Sending CHILD_DOWN for brick %s", - victim->name); if (priv->janitor) { pthread_mutex_lock(&priv->janitor_mutex); @@ -160,7 +159,7 @@ posix_notify(xlator_t *this, int32_t event, void *data, ...) ret = gf_tw_del_timer(this->ctx->tw->timer_wheel, priv->janitor); if (!ret) { - clock_gettime(CLOCK_REALTIME, &sleep_till); + timespec_now_realtime(&sleep_till); sleep_till.tv_sec += 1; /* Wait to set janitor_task flag to _gf_false by * janitor_task_done */ @@ -168,7 +167,7 @@ posix_notify(xlator_t *this, int32_t event, void *data, ...) (void)pthread_cond_timedwait(&priv->janitor_cond, &priv->janitor_mutex, &sleep_till); - clock_gettime(CLOCK_REALTIME, &sleep_till); + timespec_now_realtime(&sleep_till); sleep_till.tv_sec += 1; } } @@ -177,6 +176,16 @@ posix_notify(xlator_t *this, int32_t event, void *data, ...) GF_FREE(priv->janitor); } priv->janitor = NULL; + pthread_mutex_lock(&ctx->fd_lock); + { + while (priv->rel_fdcount > 0) { + pthread_cond_wait(&priv->fd_cond, &ctx->fd_lock); + } + } + pthread_mutex_unlock(&ctx->fd_lock); + + gf_log(this->name, GF_LOG_INFO, "Sending CHILD_DOWN for brick %s", + victim->name); default_notify(this->parents->xlator, GF_EVENT_CHILD_DOWN, data); } break; default: @@ -1084,7 +1093,13 @@ posix_init(xlator_t *this) pthread_cond_init(&_private->fsync_cond, NULL); pthread_mutex_init(&_private->janitor_mutex, NULL); pthread_cond_init(&_private->janitor_cond, NULL); + pthread_cond_init(&_private->fd_cond, NULL); INIT_LIST_HEAD(&_private->fsyncs); + _private->rel_fdcount = 0; + ret = posix_spawn_ctx_janitor_thread(this); + if (ret) + goto out; + ret = gf_thread_create(&_private->fsyncer, NULL, posix_fsyncer, this, "posixfsy"); if (ret) { @@ -1197,6 +1212,8 @@ posix_fini(xlator_t *this) { struct posix_private *priv = this->private; gf_boolean_t health_check = _gf_false; + glusterfs_ctx_t *ctx = this->ctx; + uint32_t count; int ret = 0; int i = 0; 
@@ -1243,6 +1260,19 @@ posix_fini(xlator_t *this) priv->janitor = NULL; } + pthread_mutex_lock(&ctx->fd_lock); + { + count = --ctx->pxl_count; + if (count == 0) { + pthread_cond_signal(&ctx->fd_cond); + } + } + pthread_mutex_unlock(&ctx->fd_lock); + + if (count == 0) { + pthread_join(ctx->janitor, NULL); + } + if (priv->fsyncer) { (void)gf_thread_cleanup_xint(priv->fsyncer); priv->fsyncer = 0; diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c index bb4a5309f45..67db3324083 100644 --- a/xlators/storage/posix/src/posix-helpers.c +++ b/xlators/storage/posix/src/posix-helpers.c @@ -1505,7 +1505,7 @@ posix_janitor_task(void *data) if (!priv) goto out; - time(&now); + now = gf_time(); if ((now - priv->last_landfill_check) > priv->janitor_sleep_duration) { if (priv->disable_landfill_purge) { gf_msg_debug(this->name, 0, @@ -1592,16 +1592,108 @@ unlock: return; } +static struct posix_fd * +janitor_get_next_fd(glusterfs_ctx_t *ctx) +{ + struct posix_fd *pfd = NULL; + + while (list_empty(&ctx->janitor_fds)) { + if (ctx->pxl_count == 0) { + return NULL; + } + + pthread_cond_wait(&ctx->fd_cond, &ctx->fd_lock); + } + + pfd = list_first_entry(&ctx->janitor_fds, struct posix_fd, list); + list_del_init(&pfd->list); + + return pfd; +} + +static void +posix_close_pfd(xlator_t *xl, struct posix_fd *pfd) +{ + THIS = xl; + + if (pfd->dir == NULL) { + gf_msg_trace(xl->name, 0, "janitor: closing file fd=%d", pfd->fd); + sys_close(pfd->fd); + } else { + gf_msg_debug(xl->name, 0, "janitor: closing dir fd=%p", pfd->dir); + sys_closedir(pfd->dir); + } + + GF_FREE(pfd); +} + +static void * +posix_ctx_janitor_thread_proc(void *data) +{ + xlator_t *xl; + struct posix_fd *pfd; + glusterfs_ctx_t *ctx = NULL; + struct posix_private *priv_fd; + + ctx = data; + + pthread_mutex_lock(&ctx->fd_lock); + + while ((pfd = janitor_get_next_fd(ctx)) != NULL) { + pthread_mutex_unlock(&ctx->fd_lock); + + xl = pfd->xl; + posix_close_pfd(xl, pfd); + + 
pthread_mutex_lock(&ctx->fd_lock); + + priv_fd = xl->private; + priv_fd->rel_fdcount--; + if (!priv_fd->rel_fdcount) + pthread_cond_signal(&priv_fd->fd_cond); + } + + pthread_mutex_unlock(&ctx->fd_lock); + + return NULL; +} + +int +posix_spawn_ctx_janitor_thread(xlator_t *this) +{ + int ret = 0; + glusterfs_ctx_t *ctx = NULL; + + ctx = this->ctx; + + pthread_mutex_lock(&ctx->fd_lock); + { + if (ctx->pxl_count++ == 0) { + ret = gf_thread_create(&ctx->janitor, NULL, + posix_ctx_janitor_thread_proc, ctx, + "posixctxjan"); + + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_THREAD_FAILED, + "spawning janitor thread failed"); + ctx->pxl_count--; + } + } + } + pthread_mutex_unlock(&ctx->fd_lock); + + return ret; +} + static int -is_fresh_file(int64_t sec, int64_t ns) +is_fresh_file(struct timespec *ts) { - struct timeval tv; + struct timespec now; int64_t elapsed; - gettimeofday(&tv, NULL); + timespec_now_realtime(&now); + elapsed = (int64_t)gf_tsdiff(ts, &now); - elapsed = (tv.tv_sec - sec) * 1000000L; - elapsed += tv.tv_usec - (ns / 1000L); if (elapsed < 0) { /* The file has been modified in the future !!! * Is it fresh ? previous implementation considered this as a @@ -1610,11 +1702,7 @@ is_fresh_file(int64_t sec, int64_t ns) } /* If the file is newer than a second, we consider it fresh. */ - if (elapsed < 1000000) { - return 1; - } - - return 0; + return elapsed < 1000000; } int @@ -1677,7 +1765,9 @@ posix_gfid_heal(xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req) if (ret != 16) { /* TODO: This is a very hacky way of doing this, and very prone to * errors and unexpected behavior. This should be changed. 
*/ - if (is_fresh_file(stbuf.ia_ctime, stbuf.ia_ctime_nsec)) { + struct timespec ts = {.tv_sec = stbuf.ia_ctime, + .tv_nsec = stbuf.ia_ctime_nsec}; + if (is_fresh_file(&ts)) { gf_msg(this->name, GF_LOG_ERROR, ENOENT, P_MSG_FRESHFILE, "Fresh file: %s", path); return -ENOENT; @@ -1691,7 +1781,7 @@ posix_gfid_heal(xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req) if (ret != 16) { /* TODO: This is a very hacky way of doing this, and very prone to * errors and unexpected behavior. This should be changed. */ - if (is_fresh_file(stat.st_ctim.tv_sec, stat.st_ctim.tv_nsec)) { + if (is_fresh_file(&stat.st_ctim)) { gf_msg(this->name, GF_LOG_ERROR, ENOENT, P_MSG_FRESHFILE, "Fresh file: %s", path); return -ENOENT; @@ -1950,7 +2040,7 @@ posix_fs_health_check(xlator_t *this, char *file_path) goto out; } - time_sec = time(NULL); + time_sec = gf_time(); gf_time_fmt(timestamp, sizeof timestamp, time_sec, gf_timefmt_FT); timelen = strlen(timestamp); diff --git a/xlators/storage/posix/src/posix-inode-fd-ops.c b/xlators/storage/posix/src/posix-inode-fd-ops.c index 762041b5831..6d54d37e5aa 100644 --- a/xlators/storage/posix/src/posix-inode-fd-ops.c +++ b/xlators/storage/posix/src/posix-inode-fd-ops.c @@ -1361,6 +1361,22 @@ out: return 0; } +static void +posix_add_fd_to_cleanup(xlator_t *this, struct posix_fd *pfd) +{ + glusterfs_ctx_t *ctx = this->ctx; + struct posix_private *priv = this->private; + + pfd->xl = this; + pthread_mutex_lock(&ctx->fd_lock); + { + list_add_tail(&pfd->list, &ctx->janitor_fds); + priv->rel_fdcount++; + pthread_cond_signal(&ctx->fd_cond); + } + pthread_mutex_unlock(&ctx->fd_lock); +} + int32_t posix_releasedir(xlator_t *this, fd_t *fd) { @@ -1383,11 +1399,7 @@ posix_releasedir(xlator_t *this, fd_t *fd) "pfd->dir is NULL for fd=%p", fd); goto out; } - - gf_msg_debug(this->name, 0, "janitor: closing dir fd=%p", pfd->dir); - - sys_closedir(pfd->dir); - GF_FREE(pfd); + posix_add_fd_to_cleanup(this, pfd); out: return 0; @@ -2510,7 +2522,6 @@ out: 
int32_t posix_release(xlator_t *this, fd_t *fd) { - struct posix_private *priv = NULL; struct posix_fd *pfd = NULL; int ret = -1; uint64_t tmp_pfd = 0; @@ -2518,8 +2529,6 @@ posix_release(xlator_t *this, fd_t *fd) VALIDATE_OR_GOTO(this, out); VALIDATE_OR_GOTO(fd, out); - priv = this->private; - ret = fd_ctx_del(fd, this, &tmp_pfd); if (ret < 0) { gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_PFD_NULL, @@ -2533,13 +2542,7 @@ posix_release(xlator_t *this, fd_t *fd) "pfd->dir is %p (not NULL) for file fd=%p", pfd->dir, fd); } - gf_msg_debug(this->name, 0, "janitor: closing dir fd=%p", pfd->dir); - - sys_close(pfd->fd); - GF_FREE(pfd); - - if (!priv) - goto out; + posix_add_fd_to_cleanup(this, pfd); out: return 0; diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h index 35be197c869..b8db146eef2 100644 --- a/xlators/storage/posix/src/posix.h +++ b/xlators/storage/posix/src/posix.h @@ -125,7 +125,7 @@ struct posix_fd { off_t dir_eof; /* offset at dir EOF */ struct list_head list; /* to add to the janitor list */ int odirect; - + xlator_t *xl; char _pad[4]; /* manual padding */ }; @@ -137,10 +137,6 @@ struct posix_private { gf_lock_t lock; char *hostname; - /* Statistics, provides activity of the server */ - - struct timeval prev_fetch_time; - struct timeval init_time; time_t last_landfill_check; @@ -170,6 +166,7 @@ struct posix_private { pthread_cond_t fsync_cond; pthread_mutex_t janitor_mutex; pthread_cond_t janitor_cond; + pthread_cond_t fd_cond; int fsync_queue_count; int32_t janitor_sleep_duration; @@ -254,8 +251,7 @@ struct posix_private { gf_boolean_t aio_configured; gf_boolean_t aio_init_done; gf_boolean_t aio_capable; - - char _pad[4]; /* manual padding */ + uint32_t rel_fdcount; }; typedef struct { @@ -662,6 +658,9 @@ posix_cs_maintenance(xlator_t *this, fd_t *fd, loc_t *loc, int *pfd, int posix_check_dev_file(xlator_t *this, inode_t *inode, char *fop, int *op_errno); +int +posix_spawn_ctx_janitor_thread(xlator_t *this); + void 
posix_update_iatt_buf(struct iatt *buf, int fd, char *loc, dict_t *xdata); |