diff options
Diffstat (limited to 'xlators/cluster/ec/src/ec-combine.c')
| -rw-r--r-- | xlators/cluster/ec/src/ec-combine.c | 1088 |
1 files changed, 644 insertions, 444 deletions
diff --git a/xlators/cluster/ec/src/ec-combine.c b/xlators/cluster/ec/src/ec-combine.c index 3d088d9be4a..703a30e2485 100644 --- a/xlators/cluster/ec/src/ec-combine.c +++ b/xlators/cluster/ec/src/ec-combine.c @@ -1,31 +1,28 @@ /* - Copyright (c) 2012 DataLab, s.l. <http://www.datalab.es> + Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. - This file is part of the cluster/ec translator for GlusterFS. - - The cluster/ec translator for GlusterFS is free software: you can - redistribute it and/or modify it under the terms of the GNU General - Public License as published by the Free Software Foundation, either - version 3 of the License, or (at your option) any later version. - - The cluster/ec translator for GlusterFS is distributed in the hope - that it will be useful, but WITHOUT ANY WARRANTY; without even the - implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR - PURPOSE. See the GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with the cluster/ec translator for GlusterFS. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #include <fnmatch.h> #include "libxlator.h" +#include <glusterfs/byte-order.h> -#include "ec-data.h" +#include "ec-types.h" #include "ec-helpers.h" #include "ec-common.h" #include "ec-combine.h" +#include "ec-messages.h" +#include <glusterfs/quota-common-utils.h> + +#define EC_QUOTA_PREFIX "trusted.glusterfs.quota." + +#define EC_MISSING_DATA ((data_t *)1ULL) struct _ec_dict_info; typedef struct _ec_dict_info ec_dict_info_t; @@ -33,361 +30,452 @@ typedef struct _ec_dict_info ec_dict_info_t; struct _ec_dict_combine; typedef struct _ec_dict_combine ec_dict_combine_t; -struct _ec_dict_info -{ - dict_t * dict; - int32_t count; +struct _ec_dict_info { + dict_t *dict; + int32_t count; }; -struct _ec_dict_combine -{ - ec_cbk_data_t * cbk; - int32_t which; +struct _ec_dict_combine { + ec_cbk_data_t *cbk; + int32_t which; }; -void ec_iatt_time_merge(uint32_t * dst_sec, uint32_t * dst_nsec, - uint32_t src_sec, uint32_t src_nsec) +int32_t +ec_combine_write(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src) +{ + int valid = 0; + + if (!fop || !dst || !src) + return 0; + + switch (fop->id) { + case GF_FOP_REMOVEXATTR: + case GF_FOP_FREMOVEXATTR: + case GF_FOP_SETXATTR: + case GF_FOP_FSETXATTR: + return 1; + + case GF_FOP_SYMLINK: + case GF_FOP_LINK: + case GF_FOP_CREATE: + case GF_FOP_MKNOD: + case GF_FOP_MKDIR: + valid = 3; + break; + case GF_FOP_UNLINK: + case GF_FOP_RMDIR: + case GF_FOP_SETATTR: + case GF_FOP_FSETATTR: + case GF_FOP_TRUNCATE: + case GF_FOP_FTRUNCATE: + case GF_FOP_WRITE: + case GF_FOP_FALLOCATE: + case GF_FOP_DISCARD: + case GF_FOP_ZEROFILL: + valid = 2; + break; + case GF_FOP_RENAME: + valid = 5; + break; + default: + gf_msg_callingfn(fop->xl->name, GF_LOG_WARNING, EINVAL, + EC_MSG_INVALID_FOP, "Invalid fop %d", fop->id); + return 0; + break; + } + + if (!ec_iatt_combine(fop, dst->iatt, src->iatt, valid)) { + gf_msg(fop->xl->name, GF_LOG_NOTICE, 0, EC_MSG_IATT_MISMATCH, + "Mismatching iatt in " + "answers of '%s'", + gf_fop_list[fop->id]); + return 0; + } + return 1; +} + +void +ec_iatt_time_merge(int64_t *dst_sec, uint32_t *dst_nsec, int64_t src_sec, + uint32_t src_nsec) { if ((*dst_sec < src_sec) || - ((*dst_sec == src_sec) && (*dst_nsec < src_nsec))) - { + ((*dst_sec == src_sec) && (*dst_nsec < src_nsec))) { *dst_sec = src_sec; *dst_nsec = src_nsec; } } -int32_t ec_iatt_combine(struct iatt * dst, struct iatt * src, int32_t count) +static gf_boolean_t +ec_iatt_is_trusted(ec_fop_data_t *fop, struct iatt *iatt) { + uint64_t ino; int32_t i; - for (i = 0; i < count; i++) - { - if ((dst->ia_ino != src->ia_ino) || - (dst->ia_uid != src->ia_uid) || - (dst->ia_gid != src->ia_gid) || - (((dst->ia_type == IA_IFBLK) || (dst->ia_type == IA_IFCHR)) && - (dst->ia_rdev != src->ia_rdev)) || - ((dst->ia_type == IA_IFREG) && (dst->ia_size != src->ia_size)) || - (st_mode_from_ia(dst->ia_prot, dst->ia_type) != - st_mode_from_ia(src->ia_prot, src->ia_type)) || - (uuid_compare(dst->ia_gfid, src->ia_gfid) != 0)) - { - gf_log(THIS->name, GF_LOG_WARNING, - "Failed to combine iatt (inode: %lu-%lu, links: %u-%u, " - "uid: %u-%u, gid: %u-%u, rdev: %lu-%lu, size: %lu-%lu, " - "mode: %o-%o)", - dst->ia_ino, src->ia_ino, dst->ia_nlink, src->ia_nlink, - dst->ia_uid, src->ia_uid, dst->ia_gid, src->ia_gid, - dst->ia_rdev, src->ia_rdev, dst->ia_size, src->ia_size, - st_mode_from_ia(dst->ia_prot, dst->ia_type), - st_mode_from_ia(src->ia_prot, dst->ia_type)); + /* Only the top level fop will have fop->locks filled. */ + while (fop->parent != NULL) { + fop = fop->parent; + } + + /* Lookups are special requests always done without locks taken but they + * require to be able to identify differences between bricks. Special + * handling of these differences is already done in lookup specific code + * so we shouldn't ignore any difference here and consider all iatt + * structures as trusted. */ + if (fop->id == GF_FOP_LOOKUP) { + return _gf_true; + } + + /* Check if the iatt references an inode locked by the current fop */ + for (i = 0; i < fop->lock_count; i++) { + ino = gfid_to_ino(fop->locks[i].lock->loc.inode->gfid); + if (iatt->ia_ino == ino) { + return _gf_true; + } + } + + return _gf_false; +} + +int32_t +ec_iatt_combine(ec_fop_data_t *fop, struct iatt *dst, struct iatt *src, + int32_t count) +{ + int32_t i; + gf_boolean_t failed = _gf_false; + + for (i = 0; i < count; i++) { + /* Check for basic fields. These fields must be equal always, even if + * the inode is not locked because in these cases the parent inode + * will be locked and differences in these fields require changes in + * the parent directory. */ + if ((dst[i].ia_ino != src[i].ia_ino) || + (((dst[i].ia_type == IA_IFBLK) || (dst[i].ia_type == IA_IFCHR)) && + (dst[i].ia_rdev != src[i].ia_rdev)) || + (gf_uuid_compare(dst[i].ia_gfid, src[i].ia_gfid) != 0)) { + failed = _gf_true; + } + /* Check for not so stable fields. These fields can change if the + * inode is not locked. */ + if (!failed && ((dst[i].ia_uid != src[i].ia_uid) || + (dst[i].ia_gid != src[i].ia_gid) || + (st_mode_from_ia(dst[i].ia_prot, dst[i].ia_type) != + st_mode_from_ia(src[i].ia_prot, src[i].ia_type)))) { + if (ec_iatt_is_trusted(fop, dst)) { + /* If the iatt contains information from an inode that is + * locked, these differences are real problems, so we need to + * report them. Otherwise we ignore them and don't care which + * data is returned. */ + failed = _gf_true; + } else { + gf_msg_debug(fop->xl->name, 0, + "Ignoring iatt differences because inode is not " + "locked"); + } + } + if (failed) { + gf_msg(fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_IATT_COMBINE_FAIL, + "Failed to combine iatt (inode: %" PRIu64 "-%" PRIu64 + ", " + "links: %u-%u, uid: %u-%u, gid: %u-%u, " + "rdev: %" PRIu64 "-%" PRIu64 ", size: %" PRIu64 "-%" PRIu64 + ", " + "mode: %o-%o), %s", + dst[i].ia_ino, src[i].ia_ino, dst[i].ia_nlink, + src[i].ia_nlink, dst[i].ia_uid, src[i].ia_uid, dst[i].ia_gid, + src[i].ia_gid, dst[i].ia_rdev, src[i].ia_rdev, + dst[i].ia_size, src[i].ia_size, + st_mode_from_ia(dst[i].ia_prot, dst[i].ia_type), + st_mode_from_ia(src[i].ia_prot, dst[i].ia_type), + ec_msg_str(fop)); return 0; } } - while (count-- > 0) - { - dst->ia_blocks += src->ia_blocks; - if (dst->ia_blksize < src->ia_blksize) - { - dst->ia_blksize = src->ia_blksize; + while (count-- > 0) { + dst[count].ia_blocks += src[count].ia_blocks; + if (dst[count].ia_blksize < src[count].ia_blksize) { + dst[count].ia_blksize = src[count].ia_blksize; } - ec_iatt_time_merge(&dst->ia_atime, &dst->ia_atime_nsec, src->ia_atime, - src->ia_atime_nsec); - ec_iatt_time_merge(&dst->ia_mtime, &dst->ia_mtime_nsec, src->ia_mtime, - src->ia_mtime_nsec); - ec_iatt_time_merge(&dst->ia_ctime, &dst->ia_ctime_nsec, src->ia_ctime, - src->ia_ctime_nsec); + ec_iatt_time_merge(&dst[count].ia_atime, &dst[count].ia_atime_nsec, + src[count].ia_atime, src[count].ia_atime_nsec); + ec_iatt_time_merge(&dst[count].ia_mtime, &dst[count].ia_mtime_nsec, + src[count].ia_mtime, src[count].ia_mtime_nsec); + ec_iatt_time_merge(&dst[count].ia_ctime, &dst[count].ia_ctime_nsec, + src[count].ia_ctime, src[count].ia_ctime_nsec); } return 1; } -void ec_iatt_rebuild(ec_t * ec, struct iatt * iatt, int32_t count, - int32_t answers) +void +ec_iatt_rebuild(ec_t *ec, struct iatt *iatt, int32_t count, int32_t answers) { - size_t blocks; + uint64_t blocks; - while (count-- > 0) - { + while (count-- > 0) { blocks = iatt[count].ia_blocks * ec->fragments + answers - 1; blocks /= answers; iatt[count].ia_blocks = blocks; } } -int32_t ec_dict_data_compare(dict_t * dict, char * key, data_t * value, - void * arg) +gf_boolean_t +ec_xattr_match(dict_t *dict, char *key, data_t *value, void *arg) { - ec_dict_info_t * info = arg; - data_t * data; - - data = dict_get(info->dict, key); - if (data == NULL) - { - gf_log("ec", GF_LOG_DEBUG, "key '%s' found only on one dict", key); - - return -1; + if ((fnmatch(GF_XATTR_STIME_PATTERN, key, 0) == 0) || + (strcmp(key, GET_LINK_COUNT) == 0) || + (strcmp(key, GLUSTERFS_INODELK_COUNT) == 0) || + (strcmp(key, GLUSTERFS_ENTRYLK_COUNT) == 0) || + (strcmp(key, GLUSTERFS_OPEN_FD_COUNT) == 0)) { + return _gf_false; } - info->count--; + return _gf_true; +} +gf_boolean_t +ec_value_ignore(char *key) +{ if ((strcmp(key, GF_CONTENT_KEY) == 0) || (strcmp(key, GF_XATTR_PATHINFO_KEY) == 0) || (strcmp(key, GF_XATTR_USER_PATHINFO_KEY) == 0) || (strcmp(key, GF_XATTR_LOCKINFO_KEY) == 0) || - (strcmp(key, GF_XATTR_CLRLK_CMD) == 0) || (strcmp(key, GLUSTERFS_OPEN_FD_COUNT) == 0) || - (fnmatch(GF_XATTR_STIME_PATTERN, key, 0) == 0) || - (XATTR_IS_NODE_UUID(key))) - { - return 0; - } - - if ((data->len != value->len) || - (memcmp(data->data, value->data, data->len) != 0)) - { - gf_log("ec", GF_LOG_DEBUG, "key '%s' is different (size: %u, %u)", - key, data->len, value->len); - - return -1; - } - - return 0; + (strcmp(key, GLUSTERFS_INODELK_COUNT) == 0) || + (strcmp(key, GLUSTERFS_ENTRYLK_COUNT) == 0) || + (strncmp(key, GF_XATTR_CLRLK_CMD, SLEN(GF_XATTR_CLRLK_CMD)) == 0) || + (strcmp(key, DHT_IATT_IN_XDATA_KEY) == 0) || + (strncmp(key, EC_QUOTA_PREFIX, SLEN(EC_QUOTA_PREFIX)) == 0) || + (fnmatch(MARKER_XATTR_PREFIX ".*." XTIME, key, 0) == 0) || + (fnmatch(GF_XATTR_MARKER_KEY ".*", key, 0) == 0) || + (XATTR_IS_NODE_UUID(key))) { + return _gf_true; + } + + return _gf_false; } -int32_t ec_dict_data_show(dict_t * dict, char * key, data_t * value, - void * arg) +int32_t +ec_dict_compare(dict_t *dict1, dict_t *dict2) { - if (dict_get(arg, key) == NULL) - { - gf_log("ec", GF_LOG_DEBUG, "key '%s' found only on one dict", key); - } - + if (are_dicts_equal(dict1, dict2, ec_xattr_match, ec_value_ignore)) + return 1; return 0; } -int32_t ec_dict_compare(dict_t * dict1, dict_t * dict2) +static uint32_t +ec_dict_list(data_t **list, ec_cbk_data_t *cbk, int32_t which, char *key, + gf_boolean_t global) { - ec_dict_info_t info; - dict_t * dict; - - if (dict1 != NULL) - { - info.dict = dict1; - info.count = dict1->count; - dict = dict2; - } - else if (dict2 != NULL) - { - info.dict = dict2; - info.count = dict2->count; - dict = dict1; - } - else - { - return 1; - } + ec_t *ec = cbk->fop->xl->private; + ec_cbk_data_t *ans = NULL; + dict_t *dict = NULL; + data_t *data; + uint32_t count; + int32_t i; - if (dict != NULL) - { - if (dict_foreach(dict, ec_dict_data_compare, &info) != 0) - { - return 0; + for (i = 0; i < ec->nodes; i++) { + /* We initialize the list with EC_MISSING_DATA if we are + * returning a global list or the current subvolume belongs + * to the group of the accepted answer. Note that if some + * subvolume is known to be down before issuing the request, + * we won't have any answer from it, so we set here the + * appropriate default value. */ + if (global || ((cbk->mask & (1ULL << i)) != 0)) { + list[i] = EC_MISSING_DATA; + } else { + list[i] = NULL; } } - if (info.count != 0) + count = 0; + list_for_each_entry(ans, &cbk->fop->answer_list, answer_list) { - dict_foreach(info.dict, ec_dict_data_show, dict); - } - - return (info.count == 0); -} - -int32_t ec_dict_list(data_t ** list, int32_t * count, ec_cbk_data_t * cbk, - int32_t which, char * key) -{ - ec_cbk_data_t * ans; - dict_t * dict; - int32_t i, max; - - max = *count; - i = 0; - for (ans = cbk; ans != NULL; ans = ans->next) - { - if (i >= max) - { - gf_log(cbk->fop->xl->name, GF_LOG_ERROR, "Unexpected number of " - "dictionaries"); - - return 0; - } - - dict = (which == EC_COMBINE_XDATA) ? ans->xdata : ans->dict; - list[i] = dict_get(dict, key); - if (list[i] == NULL) - { - gf_log(cbk->fop->xl->name, GF_LOG_ERROR, "Unexpected missing " - "dictionary entry"); - - return 0; + if (global || ((cbk->mask & ans->mask) != 0)) { + dict = (which == EC_COMBINE_XDATA) ? ans->xdata : ans->dict; + data = dict_get(dict, key); + if (data != NULL) { + list[ans->idx] = data; + count++; + } } - - i++; } - *count = i; - - return 1; + return count; } -char * ec_concat_prepare(xlator_t * xl, char ** sep, char ** post, - const char * fmt, va_list args) +int32_t +ec_concat_prepare(xlator_t *xl, char **str, char **sep, char **post, + const char *fmt, va_list args) { - char * str, * tmp; + char *tmp; int32_t len; - len = gf_vasprintf(&str, fmt, args); - if (len < 0) - { - return NULL; + len = gf_vasprintf(str, fmt, args); + if (len < 0) { + return -ENOMEM; } - tmp = strchr(str, '{'); - if (tmp == NULL) - { + tmp = strchr(*str, '{'); + if (tmp == NULL) { goto out; } *tmp++ = 0; *sep = tmp; tmp = strchr(tmp, '}'); - if (tmp == NULL) - { + if (tmp == NULL) { goto out; } *tmp++ = 0; *post = tmp; - return str; + return 0; out: - gf_log(xl->name, GF_LOG_ERROR, "Invalid concat format"); + gf_msg(xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_INVALID_FORMAT, + "Invalid concat format"); - GF_FREE(str); + GF_FREE(*str); - return NULL; + return -EINVAL; } -int32_t ec_dict_data_concat(const char * fmt, ec_cbk_data_t * cbk, - int32_t which, char * key, ...) +static int32_t +ec_dict_data_concat(ec_cbk_data_t *cbk, int32_t which, char *key, char *new_key, + const char *def, gf_boolean_t global, const char *fmt, ...) { - data_t * data[cbk->count]; - size_t len, tmp; - char * str = NULL, * pre = NULL, * sep, * post; - dict_t * dict; + ec_t *ec = cbk->fop->xl->private; + data_t *data[ec->nodes]; + char *str = NULL, *pre = NULL, *sep, *post; + dict_t *dict; va_list args; - int32_t i, num, prelen, postlen, seplen; - int32_t ret = -1; + int32_t i, num, len, deflen, prelen, postlen, seplen, tmp; + int32_t err; - num = cbk->count; - if (!ec_dict_list(data, &num, cbk, which, key)) - { - return -1; - } + ec_dict_list(data, cbk, which, key, global); - va_start(args, key); - pre = ec_concat_prepare(cbk->fop->xl, &sep, &post, fmt, args); + va_start(args, fmt); + err = ec_concat_prepare(cbk->fop->xl, &pre, &sep, &post, fmt, args); va_end(args); - if (pre == NULL) - { - return -1; + if (err != 0) { + return err; } prelen = strlen(pre); seplen = strlen(sep); postlen = strlen(post); - len = prelen + (num - 1) * seplen + postlen + 1; - for (i = 0; i < num; i++) - { - len += data[i]->len - 1; + deflen = 0; + if (def != NULL) { + deflen = strlen(def); } + len = prelen + postlen + 1; + num = -1; + for (i = 0; i < ec->nodes; i++) { + if (data[i] == NULL) { + continue; + } + if (data[i] == EC_MISSING_DATA) { + if (def == NULL) { + continue; + } + len += deflen; + } else { + len += data[i]->len - 1; + } + if (num >= 0) { + len += seplen; + } + num++; + } + + err = -ENOMEM; + str = GF_MALLOC(len, gf_common_mt_char); - if (str == NULL) - { + if (str == NULL) { goto out; } memcpy(str, pre, prelen); len = prelen; - for (i = 0; i < num; i++) - { - memcpy(str + len, sep, seplen); - len += seplen; - tmp = data[i]->len - 1; - memcpy(str + len, data[i]->data, tmp); + for (i = 0; i < ec->nodes; i++) { + if (data[i] == NULL) { + continue; + } + if (data[i] == EC_MISSING_DATA) { + if (deflen == 0) { + continue; + } + tmp = deflen; + memcpy(str + len, def, tmp); + } else { + tmp = data[i]->len - 1; + memcpy(str + len, data[i]->data, tmp); + } len += tmp; + if (i < num) { + memcpy(str + len, sep, seplen); + len += seplen; + } } memcpy(str + len, post, postlen + 1); dict = (which == EC_COMBINE_XDATA) ? cbk->xdata : cbk->dict; - if (dict_set_dynstr(dict, key, str) != 0) - { + if (new_key) { + key = new_key; + } + err = dict_set_dynstr(dict, key, str); + if (err != 0) { goto out; } str = NULL; - ret = 0; - out: GF_FREE(str); GF_FREE(pre); - return ret; + return err; } -int32_t ec_dict_data_merge(ec_cbk_data_t * cbk, int32_t which, char * key) +int32_t +ec_dict_data_merge(ec_cbk_data_t *cbk, int32_t which, char *key) { - data_t * data[cbk->count]; - dict_t * dict, * lockinfo, * tmp; - char * ptr = NULL; - int32_t i, num, len; - int32_t ret = -1; - - num = cbk->count; - if (!ec_dict_list(data, &num, cbk, which, key)) - { - return -1; - } + ec_t *ec = cbk->fop->xl->private; + data_t *data[ec->nodes]; + dict_t *dict, *lockinfo, *tmp = NULL; + char *ptr = NULL; + int32_t i, len; + int32_t err; + + ec_dict_list(data, cbk, which, key, _gf_false); lockinfo = dict_new(); - if (lockinfo == NULL) - { - return -1; + if (lockinfo == NULL) { + return -ENOMEM; } - if (dict_unserialize(data[0]->data, data[0]->len, &lockinfo) != 0) - { - goto out; - } + for (i = 0; i < ec->nodes; i++) { + if ((data[i] == NULL) || (data[i] == EC_MISSING_DATA)) { + continue; + } - for (i = 1; i < num; i++) - { tmp = dict_new(); - if (tmp == NULL) - { + if (tmp == NULL) { + err = -ENOMEM; + goto out; } - if ((dict_unserialize(data[i]->data, data[i]->len, &tmp) != 0) || - (dict_copy(tmp, lockinfo) == NULL)) - { - dict_unref(tmp); + err = dict_unserialize(data[i]->data, data[i]->len, &tmp); + if (err != 0) { + goto out; + } + if (dict_copy(tmp, lockinfo) == NULL) { + err = -ENOMEM; goto out; } @@ -395,238 +483,373 @@ int32_t ec_dict_data_merge(ec_cbk_data_t * cbk, int32_t which, char * key) dict_unref(tmp); } - len = dict_serialized_length(lockinfo); - if (len < 0) - { - goto out; - } - ptr = GF_MALLOC(len, gf_common_mt_char); - if (ptr == NULL) - { - goto out; - } - if (dict_serialize(lockinfo, ptr) != 0) - { + tmp = NULL; + + err = dict_allocate_and_serialize(lockinfo, (char **)&ptr, + (unsigned int *)&len); + if (err != 0) { goto out; } + dict = (which == EC_COMBINE_XDATA) ? cbk->xdata : cbk->dict; - if (dict_set_dynptr(dict, key, ptr, len) != 0) - { + err = dict_set_dynptr(dict, key, ptr, len); + if (err != 0) { goto out; } ptr = NULL; - ret = 0; - out: GF_FREE(ptr); dict_unref(lockinfo); + if (tmp != NULL) { + dict_unref(tmp); + } - return ret; + return err; } -int32_t ec_dict_data_uuid(ec_cbk_data_t * cbk, int32_t which, char * key) +int32_t +ec_dict_data_uuid(ec_cbk_data_t *cbk, int32_t which, char *key) { - ec_cbk_data_t * ans, * min; - dict_t * src, * dst; - data_t * data; + ec_cbk_data_t *ans, *min; + dict_t *src, *dst; + data_t *data; min = cbk; - for (ans = cbk->next; ans != NULL; ans = ans->next) - { - if (ans->idx < min->idx) - { + for (ans = cbk->next; ans != NULL; ans = ans->next) { + if (ans->idx < min->idx) { min = ans; } } - if (min != cbk) - { + if (min != cbk) { src = (which == EC_COMBINE_XDATA) ? min->xdata : min->dict; dst = (which == EC_COMBINE_XDATA) ? cbk->xdata : cbk->dict; data = dict_get(src, key); - if (data == NULL) - { - return -1; + if (data == NULL) { + return -ENOENT; } - if (dict_set(dst, key, data) != 0) - { - return -1; + if (dict_set(dst, key, data) != 0) { + return -ENOMEM; } } return 0; } -int32_t ec_dict_data_max(ec_cbk_data_t * cbk, int32_t which, char * key) +int32_t +ec_dict_data_iatt(ec_cbk_data_t *cbk, int32_t which, char *key) { - data_t * data[cbk->count]; - dict_t * dict; - int32_t i, num; - uint32_t max, tmp; + ec_t *ec = cbk->fop->xl->private; + data_t *data[ec->nodes]; + dict_t *dict; + struct iatt *stbuf, *tmp; + int32_t i, ret; + + ec_dict_list(data, cbk, which, key, _gf_false); + + stbuf = NULL; + for (i = 0; i < ec->nodes; i++) { + if ((data[i] == NULL) || (data[i] == EC_MISSING_DATA)) { + continue; + } + tmp = data_to_iatt(data[i], key); + if (tmp == NULL) { + ret = -EINVAL; + goto out; + } + if (stbuf == NULL) { + stbuf = GF_MALLOC(sizeof(struct iatt), gf_common_mt_char); + if (stbuf == NULL) { + ret = -ENOMEM; + goto out; + } + *stbuf = *tmp; + } else { + if (!ec_iatt_combine(cbk->fop, stbuf, tmp, 1)) { + ret = -EINVAL; + goto out; + } + } + } - num = cbk->count; - if (!ec_dict_list(data, &num, cbk, which, key)) - { - return -1; + if ((stbuf != NULL) && (stbuf->ia_type == IA_IFREG)) { + ec_iatt_rebuild(ec, stbuf, 1, cbk->count); + /* TODO: not sure if an iatt could come in xdata from a fop that takes + * no locks. */ + if (!ec_get_inode_size(cbk->fop, cbk->fop->locks[0].lock->loc.inode, + &stbuf->ia_size)) { + ret = -EINVAL; + goto out; + } } - if (num <= 1) - { - return 0; + dict = (which == EC_COMBINE_XDATA) ? cbk->xdata : cbk->dict; + ret = dict_set_iatt(dict, key, stbuf, false); + if (ret >= 0) { + stbuf = NULL; } - max = data_to_uint32(data[0]); - for (i = 1; i < num; i++) - { +out: + GF_FREE(stbuf); + + return ret; +} + +int32_t +ec_dict_data_max32(ec_cbk_data_t *cbk, int32_t which, char *key) +{ + ec_t *ec = cbk->fop->xl->private; + data_t *data[ec->nodes]; + dict_t *dict; + int32_t i; + uint32_t max, tmp; + + ec_dict_list(data, cbk, which, key, _gf_false); + + max = 0; + for (i = 0; i < ec->nodes; i++) { + if ((data[i] == NULL) || (data[i] == EC_MISSING_DATA)) { + continue; + } + tmp = data_to_uint32(data[i]); - if (max < tmp) - { + if (max < tmp) { max = tmp; } } dict = (which == EC_COMBINE_XDATA) ? cbk->xdata : cbk->dict; - if (dict_set_uint32(dict, key, max) != 0) - { - return -1; + return dict_set_uint32(dict, key, max); +} + +int32_t +ec_dict_data_max64(ec_cbk_data_t *cbk, int32_t which, char *key) +{ + ec_t *ec = cbk->fop->xl->private; + data_t *data[ec->nodes]; + dict_t *dict; + int32_t i; + uint64_t max, tmp; + + ec_dict_list(data, cbk, which, key, _gf_false); + + max = 0; + for (i = 0; i < ec->nodes; i++) { + if ((data[i] == NULL) || (data[i] == EC_MISSING_DATA)) { + continue; + } + + tmp = data_to_uint64(data[i]); + if (max < tmp) { + max = tmp; + } } - return 0; + dict = (which == EC_COMBINE_XDATA) ? cbk->xdata : cbk->dict; + return dict_set_uint64(dict, key, max); } -int32_t ec_dict_data_stime(ec_cbk_data_t * cbk, int32_t which, char * key) +int32_t +ec_dict_data_quota(ec_cbk_data_t *cbk, int32_t which, char *key) { - data_t * data[cbk->count]; - dict_t * dict; - int32_t i, num; + ec_t *ec = cbk->fop->xl->private; + data_t *data[ec->nodes]; + dict_t *dict = NULL; + int32_t i = 0; + quota_meta_t size = { + 0, + }; + quota_meta_t max_size = { + 0, + }; + + if (ec_dict_list(data, cbk, which, key, _gf_false) == 0) { + return 0; + } - num = cbk->count; - if (!ec_dict_list(data, &num, cbk, which, key)) - { - return -1; + /* Quota size xattr is managed outside of the control of the ec xlator. + * This means that it might not be updated at the same time on all + * bricks and we can receive slightly different values. If that's the + * case, we take the maximum of all received values. + */ + for (i = 0; i < ec->nodes; i++) { + if ((data[i] == NULL) || (data[i] == EC_MISSING_DATA) || + (quota_data_to_meta(data[i], &size) < 0)) { + continue; + } + + if (size.size > max_size.size) + max_size.size = size.size; + if (size.file_count > max_size.file_count) + max_size.file_count = size.file_count; + if (size.dir_count > max_size.dir_count) + max_size.dir_count = size.dir_count; } + max_size.size *= ec->fragments; + dict = (which == EC_COMBINE_XDATA) ? cbk->xdata : cbk->dict; - for (i = 1; i < num; i++) - { - if (gf_get_max_stime(cbk->fop->xl, dict, key, data[i]) != 0) - { - gf_log(cbk->fop->xl->name, GF_LOG_ERROR, "STIME combination " - "failed"); + return quota_dict_set_meta(dict, key, &max_size, IA_IFDIR); +} + +int32_t +ec_dict_data_stime(ec_cbk_data_t *cbk, int32_t which, char *key) +{ + ec_t *ec = cbk->fop->xl->private; + data_t *data[ec->nodes]; + dict_t *dict; + int32_t i, err; + + ec_dict_list(data, cbk, which, key, _gf_false); + + dict = (which == EC_COMBINE_XDATA) ? cbk->xdata : cbk->dict; + for (i = 0; i < ec->nodes; i++) { + if ((data[i] == NULL) || (data[i] == EC_MISSING_DATA)) { + continue; + } + err = gf_get_max_stime(cbk->fop->xl, dict, key, data[i]); + if (err != 0) { + gf_msg(cbk->fop->xl->name, GF_LOG_ERROR, -err, + EC_MSG_STIME_COMBINE_FAIL, "STIME combination failed"); - return -1; + return err; } } return 0; } -int32_t ec_dict_data_combine(dict_t * dict, char * key, data_t * value, - void * arg) +int32_t +ec_dict_data_combine(dict_t *dict, char *key, data_t *value, void *arg) { - ec_dict_combine_t * data = arg; + ec_dict_combine_t *data = arg; if ((strcmp(key, GF_XATTR_PATHINFO_KEY) == 0) || - (strcmp(key, GF_XATTR_USER_PATHINFO_KEY) == 0)) - { - return ec_dict_data_concat("(<EC:%s> { })", data->cbk, data->which, - key, data->cbk->fop->xl->name); + (strcmp(key, GF_XATTR_USER_PATHINFO_KEY) == 0)) { + return ec_dict_data_concat(data->cbk, data->which, key, NULL, NULL, + _gf_false, _gf_false, "(<EC:%s> { })", + data->cbk->fop->xl->name); } - if (strncmp(key, GF_XATTR_CLRLK_CMD, strlen(GF_XATTR_CLRLK_CMD)) == 0) - { - return ec_dict_data_concat("{\n}", data->cbk, data->which, key); + if (strncmp(key, GF_XATTR_CLRLK_CMD, SLEN(GF_XATTR_CLRLK_CMD)) == 0) { + return ec_dict_data_concat(data->cbk, data->which, key, NULL, NULL, + _gf_false, "{\n}"); } - if (strncmp(key, GF_XATTR_LOCKINFO_KEY, - strlen(GF_XATTR_LOCKINFO_KEY)) == 0) - { + if (strncmp(key, GF_XATTR_LOCKINFO_KEY, SLEN(GF_XATTR_LOCKINFO_KEY)) == 0) { return ec_dict_data_merge(data->cbk, data->which, key); } - if (strcmp(key, GLUSTERFS_OPEN_FD_COUNT) == 0) - { - return ec_dict_data_max(data->cbk, data->which, key); + if (strcmp(key, GET_LINK_COUNT) == 0) { + return ec_dict_data_max32(data->cbk, data->which, key); } - if (XATTR_IS_NODE_UUID(key)) - { - return ec_dict_data_uuid(data->cbk, data->which, key); + if (strcmp(key, GLUSTERFS_OPEN_FD_COUNT) == 0) { + return ec_dict_data_max32(data->cbk, data->which, key); + } + if ((strcmp(key, GLUSTERFS_INODELK_COUNT) == 0) || + (strcmp(key, GLUSTERFS_ENTRYLK_COUNT) == 0)) { + return ec_dict_data_max32(data->cbk, data->which, key); } - if (fnmatch(GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) - { + if (strcmp(key, QUOTA_SIZE_KEY) == 0) { + return ec_dict_data_quota(data->cbk, data->which, key); + } + /* Ignore all other quota attributes */ + if (strncmp(key, EC_QUOTA_PREFIX, SLEN(EC_QUOTA_PREFIX)) == 0) { + return 0; + } + + if (XATTR_IS_NODE_UUID(key)) { + if (data->cbk->fop->int32) { + /* List of node uuid is requested */ + return ec_dict_data_concat(data->cbk, data->which, key, + GF_XATTR_LIST_NODE_UUIDS_KEY, UUID0_STR, + _gf_true, "{ }"); + } else { + return ec_dict_data_uuid(data->cbk, data->which, key); + } + } + + if (fnmatch(GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) { return ec_dict_data_stime(data->cbk, data->which, key); } + if (fnmatch(MARKER_XATTR_PREFIX ".*." XTIME, key, FNM_NOESCAPE) == 0) { + return ec_dict_data_max64(data->cbk, data->which, key); + } + + if (strcmp(key, GF_PRESTAT) == 0 || strcmp(key, GF_POSTSTAT) == 0) { + return ec_dict_data_iatt(data->cbk, data->which, key); + } + return 0; } -int32_t ec_dict_combine(ec_cbk_data_t * cbk, int32_t which) +int32_t +ec_dict_combine(ec_cbk_data_t *cbk, int32_t which) { - dict_t * dict; + dict_t *dict = NULL; ec_dict_combine_t data; + int32_t err = 0; data.cbk = cbk; data.which = which; dict = (which == EC_COMBINE_XDATA) ? cbk->xdata : cbk->dict; - if ((dict != NULL) && - (dict_foreach(dict, ec_dict_data_combine, &data) != 0)) - { - gf_log(cbk->fop->xl->name, GF_LOG_ERROR, "Dictionary combination " - "failed"); + if (dict != NULL) { + err = dict_foreach(dict, ec_dict_data_combine, &data); + if (err != 0) { + gf_msg(cbk->fop->xl->name, GF_LOG_ERROR, -err, + EC_MSG_DICT_COMBINE_FAIL, "Dictionary combination failed"); - return 0; + return err; + } } - return 1; + return 0; } -int32_t ec_vector_compare(struct iovec * dst_vector, int32_t dst_count, - struct iovec * src_vector, int32_t src_count) +int32_t +ec_vector_compare(struct iovec *dst_vector, int32_t dst_count, + struct iovec *src_vector, int32_t src_count) { - size_t dst_size = 0, src_size = 0; + int32_t dst_size = 0, src_size = 0; - if (dst_count > 0) - { + if (dst_count > 0) { dst_size = iov_length(dst_vector, dst_count); } - if (src_count > 0) - { + if (src_count > 0) { src_size = iov_length(src_vector, src_count); } return (dst_size == src_size); } -int32_t ec_flock_compare(struct gf_flock * dst, struct gf_flock * src) +int32_t +ec_flock_compare(struct gf_flock *dst, struct gf_flock *src) { - if ((dst->l_type != src->l_type) || - (dst->l_whence != src->l_whence) || - (dst->l_start != src->l_start) || - (dst->l_len != src->l_len) || + if ((dst->l_type != src->l_type) || (dst->l_whence != src->l_whence) || + (dst->l_start != src->l_start) || (dst->l_len != src->l_len) || (dst->l_pid != src->l_pid) || - !is_same_lkowner(&dst->l_owner, &src->l_owner)) - { + !is_same_lkowner(&dst->l_owner, &src->l_owner)) { return 0; } return 1; } -void ec_statvfs_combine(struct statvfs * dst, struct statvfs * src) +void +ec_statvfs_combine(struct statvfs *dst, struct statvfs *src) { - if (dst->f_bsize < src->f_bsize) - { + if (dst->f_bsize < src->f_bsize) { dst->f_bsize = src->f_bsize; } - if (dst->f_frsize < src->f_frsize) - { + if (dst->f_frsize < src->f_frsize) { dst->f_blocks *= dst->f_frsize; dst->f_blocks /= src->f_frsize; @@ -637,9 +860,7 @@ void ec_statvfs_combine(struct statvfs * dst, struct statvfs * src) dst->f_bavail /= src->f_frsize; dst->f_frsize = src->f_frsize; - } - else if (dst->f_frsize > src->f_frsize) - { + } else if (dst->f_frsize > src->f_frsize) { src->f_blocks *= src->f_frsize; src->f_blocks /= dst->f_frsize; @@ -649,147 +870,126 @@ void ec_statvfs_combine(struct statvfs * dst, struct statvfs * src) src->f_bavail *= src->f_frsize; src->f_bavail /= dst->f_frsize; } - if (dst->f_blocks > src->f_blocks) - { + if (dst->f_blocks > src->f_blocks) { dst->f_blocks = src->f_blocks; } - if (dst->f_bfree > src->f_bfree) - { + if (dst->f_bfree > src->f_bfree) { dst->f_bfree = src->f_bfree; } - if (dst->f_bavail > src->f_bavail) - { + if (dst->f_bavail > src->f_bavail) { dst->f_bavail = src->f_bavail; } - if (dst->f_files < src->f_files) - { + if (dst->f_files < src->f_files) { dst->f_files = src->f_files; } - if (dst->f_ffree > src->f_ffree) - { + if (dst->f_ffree > src->f_ffree) { dst->f_ffree = src->f_ffree; } - if (dst->f_favail > src->f_favail) - { + if (dst->f_favail > src->f_favail) { dst->f_favail = src->f_favail; } - if (dst->f_namemax > src->f_namemax) - { + if (dst->f_namemax > src->f_namemax) { dst->f_namemax = src->f_namemax; } - if (dst->f_flag != src->f_flag) - { - gf_log(THIS->name, GF_LOG_DEBUG, "Mismatching file system flags " - "(%lX, %lX)", - dst->f_flag, src->f_flag); + if (dst->f_flag != src->f_flag) { + gf_msg_debug(THIS->name, 0, + "Mismatching file system flags " + "(%lX, %lX)", + dst->f_flag, src->f_flag); } dst->f_flag &= src->f_flag; } -int32_t ec_combine_check(ec_cbk_data_t * dst, ec_cbk_data_t * src, - ec_combine_f combine) +int32_t +ec_combine_check(ec_cbk_data_t *dst, ec_cbk_data_t *src, ec_combine_f combine) { - ec_fop_data_t * fop = dst->fop; + ec_fop_data_t *fop = dst->fop; - if (dst->op_ret != src->op_ret) - { - gf_log(fop->xl->name, GF_LOG_DEBUG, "Mismatching return code in " - "answers of '%s': %d <-> %d", - ec_fop_name(fop->id), dst->op_ret, src->op_ret); + if (dst->op_ret != src->op_ret) { + gf_msg_debug(fop->xl->name, 0, + "Mismatching return code in " + "answers of '%s': %d <-> %d", + ec_fop_name(fop->id), dst->op_ret, src->op_ret); return 0; } - if (dst->op_ret < 0) - { - if (dst->op_errno != src->op_errno) - { - gf_log(fop->xl->name, GF_LOG_DEBUG, "Mismatching errno code in " - "answers of '%s': %d <-> %d", - ec_fop_name(fop->id), dst->op_errno, src->op_errno); + if (dst->op_ret < 0) { + if (dst->op_errno != src->op_errno) { + gf_msg_debug(fop->xl->name, 0, + "Mismatching errno code in " + "answers of '%s': %d <-> %d", + ec_fop_name(fop->id), dst->op_errno, src->op_errno); return 0; } } - if (!ec_dict_compare(dst->xdata, src->xdata)) - { - gf_log(fop->xl->name, GF_LOG_WARNING, "Mismatching xdata in answers " - "of '%s'", + if (!ec_dict_compare(dst->xdata, src->xdata)) { + gf_msg(fop->xl->name, GF_LOG_DEBUG, 0, EC_MSG_XDATA_MISMATCH, + "Mismatching xdata in answers " + "of '%s'", ec_fop_name(fop->id)); return 0; } - if ((dst->op_ret >= 0) && (combine != NULL)) - { + if ((dst->op_ret >= 0) && (combine != NULL)) { return combine(fop, dst, src); } return 1; } -void ec_combine(ec_cbk_data_t * cbk, ec_combine_f combine) +void +ec_combine(ec_cbk_data_t *newcbk, ec_combine_f combine) { - ec_fop_data_t * fop = cbk->fop; - ec_cbk_data_t * ans = NULL, * tmp = NULL; - struct list_head * item = NULL; - int32_t needed = 0, report = 0; + ec_fop_data_t *fop = newcbk->fop; + ec_cbk_data_t *cbk = NULL, *tmp = NULL; + struct list_head *item = NULL; + int32_t needed = 0; char str[32]; LOCK(&fop->lock); + fop->received |= newcbk->mask; + item = fop->cbk_list.prev; - list_for_each_entry(ans, &fop->cbk_list, list) + list_for_each_entry(cbk, &fop->cbk_list, list) { - if (ec_combine_check(cbk, ans, combine)) - { - cbk->count += ans->count; - cbk->mask |= ans->mask; - - item = ans->list.prev; - while (item != &fop->cbk_list) - { + if (ec_combine_check(newcbk, cbk, combine)) { + newcbk->count += cbk->count; + newcbk->mask |= cbk->mask; + + item = cbk->list.prev; + while (item != &fop->cbk_list) { tmp = list_entry(item, ec_cbk_data_t, list); - if (tmp->count >= cbk->count) - { + if (tmp->count >= newcbk->count) { break; } item = item->prev; } - list_del(&ans->list); + list_del(&cbk->list); - cbk->next = ans; + newcbk->next = cbk; break; } } - list_add(&cbk->list, item); + list_add(&newcbk->list, item); ec_trace("ANSWER", fop, "combine=%s[%d]", - ec_bin(str, sizeof(str), cbk->mask, 0), cbk->count); + ec_bin(str, sizeof(str), newcbk->mask, 0), newcbk->count); - if ((cbk->count == fop->expected) && (fop->answer == NULL)) - { - fop->answer = cbk; - - ec_update_bad(fop, cbk->mask); - - report = 1; + cbk = list_entry(fop->cbk_list.next, ec_cbk_data_t, list); + if ((fop->mask ^ fop->remaining) == fop->received) { + needed = fop->minimum - cbk->count; } - ans = list_entry(fop->cbk_list.next, ec_cbk_data_t, list); - needed = fop->minimum - ans->count - fop->winds + 1; - UNLOCK(&fop->lock); - if (needed > 0) - { - ec_dispatch_next(fop, cbk->idx); - } - else if (report) - { - ec_report(fop, 0); + if (needed > 0) { + ec_dispatch_next(fop, newcbk->idx); } } |
