summaryrefslogtreecommitdiffstats
path: root/xlators/storage/posix/src/posix-helpers.c
diff options
context:
space:
mode:
Diffstat (limited to 'xlators/storage/posix/src/posix-helpers.c')
-rw-r--r--xlators/storage/posix/src/posix-helpers.c614
1 files changed, 540 insertions, 74 deletions
diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c
index 58708a347..5725cad7d 100644
--- a/xlators/storage/posix/src/posix-helpers.c
+++ b/xlators/storage/posix/src/posix-helpers.c
@@ -22,6 +22,7 @@
#include <pthread.h>
#include <ftw.h>
#include <sys/stat.h>
+#include <signal.h>
#ifndef GF_BSD_HOST_OS
#include <alloca.h>
@@ -44,25 +45,22 @@
#include "timer.h"
#include "glusterfs3-xdr.h"
#include "hashfn.h"
+#include "glusterfs-acl.h"
#include <fnmatch.h>
-typedef struct {
- xlator_t *this;
- const char *real_path;
- dict_t *xattr;
- struct iatt *stbuf;
- loc_t *loc;
-} posix_xattr_filler_t;
-
char *marker_xattrs[] = {"trusted.glusterfs.quota.*",
"trusted.glusterfs.*.xtime",
NULL};
+char *marker_contri_key = "trusted.*.*.contri";
+
static char* posix_ignore_xattrs[] = {
"gfid-req",
GLUSTERFS_ENTRYLK_COUNT,
GLUSTERFS_INODELK_COUNT,
GLUSTERFS_POSIXLK_COUNT,
+ GLUSTERFS_PARENT_ENTRYLK,
+ GF_GFIDLESS_LOOKUP,
NULL
};
@@ -108,14 +106,142 @@ out:
}
static int
+_posix_xattr_get_set_from_backend (posix_xattr_filler_t *filler, char *key)
+{
+ ssize_t xattr_size = -1;
+ int ret = 0;
+ char *value = NULL;
+
+ xattr_size = sys_lgetxattr (filler->real_path, key, NULL, 0);
+
+ if (xattr_size > 0) {
+ value = GF_CALLOC (1, xattr_size + 1,
+ gf_posix_mt_char);
+ if (!value)
+ goto out;
+
+ xattr_size = sys_lgetxattr (filler->real_path, key, value,
+ xattr_size);
+ if (xattr_size <= 0) {
+ gf_log (filler->this->name, GF_LOG_WARNING,
+ "getxattr failed. path: %s, key: %s",
+ filler->real_path, key);
+ GF_FREE (value);
+ goto out;
+ }
+
+ value[xattr_size] = '\0';
+ ret = dict_set_bin (filler->xattr, key,
+ value, xattr_size);
+ if (ret < 0) {
+ gf_log (filler->this->name, GF_LOG_DEBUG,
+ "dict set failed. path: %s, key: %s",
+ filler->real_path, key);
+ GF_FREE (value);
+ goto out;
+ }
+ }
+ ret = 0;
+out:
+ return ret;
+}
+
+static int gf_posix_xattr_enotsup_log;
+
+static int
+_posix_get_marker_all_contributions (posix_xattr_filler_t *filler)
+{
+ ssize_t size = -1, remaining_size = -1, list_offset = 0;
+ int ret = -1;
+ char *list = NULL, key[4096] = {0, };
+
+ size = sys_llistxattr (filler->real_path, NULL, 0);
+ if (size == -1) {
+ if ((errno == ENOTSUP) || (errno == ENOSYS)) {
+ GF_LOG_OCCASIONALLY (gf_posix_xattr_enotsup_log,
+ THIS->name, GF_LOG_WARNING,
+ "Extended attributes not "
+ "supported (try remounting brick"
+ " with 'user_xattr' flag)");
+
+ } else {
+ gf_log (THIS->name, GF_LOG_WARNING,
+ "listxattr failed on %s: %s",
+ filler->real_path, strerror (errno));
+
+ }
+
+ goto out;
+ }
+
+ if (size == 0) {
+ ret = 0;
+ goto out;
+ }
+
+ list = alloca (size + 1);
+ if (!list) {
+ goto out;
+ }
+
+ size = sys_llistxattr (filler->real_path, list, size);
+ if (size <= 0) {
+ ret = size;
+ goto out;
+ }
+
+ remaining_size = size;
+ list_offset = 0;
+
+ while (remaining_size > 0) {
+ if (*(list + list_offset) == '\0')
+ break;
+ strcpy (key, list + list_offset);
+ if (fnmatch (marker_contri_key, key, 0) == 0) {
+ ret = _posix_xattr_get_set_from_backend (filler, key);
+ }
+
+ remaining_size -= strlen (key) + 1;
+ list_offset += strlen (key) + 1;
+ }
+
+ ret = 0;
+
+out:
+ return ret;
+}
+
+static int
+_posix_get_marker_quota_contributions (posix_xattr_filler_t *filler, char *key)
+{
+ char *saveptr = NULL, *token = NULL, *tmp_key = NULL;
+ char *ptr = NULL;
+ int i = 0, ret = 0;
+
+ tmp_key = ptr = gf_strdup (key);
+ for (i = 0; i < 4; i++) {
+ token = strtok_r (tmp_key, ".", &saveptr);
+ tmp_key = NULL;
+ }
+
+ if (strncmp (token, "contri", strlen ("contri")) == 0) {
+ ret = _posix_get_marker_all_contributions (filler);
+ } else {
+ ret = _posix_xattr_get_set_from_backend (filler, key);
+ }
+
+ GF_FREE (ptr);
+
+ return ret;
+}
+
+static int
_posix_xattr_get_set (dict_t *xattr_req,
char *key,
data_t *data,
void *xattrargs)
{
posix_xattr_filler_t *filler = xattrargs;
- char *value = NULL;
- ssize_t xattr_size = -1;
int ret = -1;
char *databuf = NULL;
int _fd = -1;
@@ -140,6 +266,16 @@ _posix_xattr_get_set (dict_t *xattr_req,
goto err;
}
+ /*
+ * There could be a situation where the ia_size is
+ * zero. GF_CALLOC will return a pointer to the
+ * memory initialized by gf_mem_set_acct_info.
+ * This function adds a header and a footer to
+ * the allocated memory. The returned pointer
+ * points to the memory just after the header, but
+ * when size is zero, there is no space for user
+ * data. The memory can be freed by calling GF_FREE.
+ */
databuf = GF_CALLOC (1, filler->stbuf->ia_size,
gf_posix_mt_char);
if (!databuf) {
@@ -181,48 +317,34 @@ _posix_xattr_get_set (dict_t *xattr_req,
}
} else if (!strcmp (key, GLUSTERFS_OPEN_FD_COUNT)) {
loc = filler->loc;
- if (loc && !list_empty (&loc->inode->fd_list)) {
- ret = dict_set_uint32 (filler->xattr, key, 1);
- if (ret < 0)
- gf_log (filler->this->name, GF_LOG_WARNING,
- "Failed to set dictionary value for %s",
- key);
- } else {
- ret = dict_set_uint32 (filler->xattr, key, 0);
+ if (loc) {
+ ret = dict_set_uint32 (filler->xattr, key,
+ loc->inode->fd_count);
if (ret < 0)
gf_log (filler->this->name, GF_LOG_WARNING,
"Failed to set dictionary value for %s",
key);
}
- } else {
- xattr_size = sys_lgetxattr (filler->real_path, key, NULL, 0);
-
- if (xattr_size > 0) {
- value = GF_CALLOC (1, xattr_size + 1,
- gf_posix_mt_char);
- if (!value)
- return -1;
-
- xattr_size = sys_lgetxattr (filler->real_path, key, value,
- xattr_size);
- if (xattr_size <= 0) {
- gf_log (filler->this->name, GF_LOG_WARNING,
- "getxattr failed. path: %s, key: %s",
- filler->real_path, key);
- GF_FREE (value);
- return -1;
- }
+ } else if (!strcmp (key, GET_ANCESTRY_PATH_KEY)) {
+ char *path = NULL;
+ ret = posix_get_ancestry (filler->this, filler->loc->inode,
+ NULL, &path, POSIX_ANCESTRY_PATH,
+ &filler->op_errno, xattr_req);
+ if (ret < 0) {
+ goto out;
+ }
- value[xattr_size] = '\0';
- ret = dict_set_bin (filler->xattr, key,
- value, xattr_size);
- if (ret < 0) {
- gf_log (filler->this->name, GF_LOG_DEBUG,
- "dict set failed. path: %s, key: %s",
- filler->real_path, key);
- GF_FREE (value);
- }
+ ret = dict_set_dynstr (filler->xattr, GET_ANCESTRY_PATH_KEY,
+ path);
+ if (ret < 0) {
+ GF_FREE (path);
+ goto out;
}
+
+ } else if (fnmatch (marker_contri_key, key, 0) == 0) {
+ ret = _posix_get_marker_quota_contributions (filler, key);
+ } else {
+ ret = _posix_xattr_get_set_from_backend (filler, key);
}
out:
return 0;
@@ -281,7 +403,7 @@ posix_fill_ino_from_gfid (xlator_t *this, struct iatt *buf)
goto out;
}
for (i = 15; i > (15 - 8); i--) {
- temp_ino += (uint64_t)(buf->ia_gfid[i]) << j;
+ temp_ino += (uint64_t)(buf->ia_gfid[i]) << j;
j += 8;
}
buf->ia_ino = temp_ino;
@@ -671,7 +793,10 @@ posix_handle_pair (xlator_t *this, const char *real_path,
int sys_ret = -1;
int ret = 0;
- if (ZR_FILE_CONTENT_REQUEST(key)) {
+ if (XATTR_IS_PATHINFO (key)) {
+ ret = -EACCES;
+ goto out;
+ } else if (ZR_FILE_CONTENT_REQUEST(key)) {
ret = posix_set_file_contents (this, real_path, key, value,
flags);
} else {
@@ -679,6 +804,7 @@ posix_handle_pair (xlator_t *this, const char *real_path,
value->len, flags);
if (sys_ret < 0) {
+ ret = -errno;
if (errno == ENOTSUP) {
GF_LOG_OCCASIONALLY(gf_xattr_enotsup_log,
this->name,GF_LOG_WARNING,
@@ -710,7 +836,6 @@ posix_handle_pair (xlator_t *this, const char *real_path,
#endif /* DARWIN */
}
- ret = -errno;
goto out;
}
}
@@ -725,10 +850,16 @@ posix_fhandle_pair (xlator_t *this, int fd,
int sys_ret = -1;
int ret = 0;
+ if (XATTR_IS_PATHINFO (key)) {
+ ret = -EACCES;
+ goto out;
+ }
+
sys_ret = sys_fsetxattr (fd, key, value->data,
value->len, flags);
if (sys_ret < 0) {
+ ret = -errno;
if (errno == ENOTSUP) {
GF_LOG_OCCASIONALLY(gf_xattr_enotsup_log,
this->name,GF_LOG_WARNING,
@@ -755,7 +886,6 @@ posix_fhandle_pair (xlator_t *this, int fd,
#endif /* DARWIN */
}
- ret = -errno;
goto out;
}
@@ -896,8 +1026,8 @@ posix_spawn_janitor_thread (xlator_t *this)
LOCK (&priv->lock);
{
if (!priv->janitor_present) {
- ret = pthread_create (&priv->janitor, NULL,
- posix_janitor_thread_proc, this);
+ ret = gf_thread_create (&priv->janitor, NULL,
+ posix_janitor_thread_proc, this);
if (ret < 0) {
gf_log (this->name, GF_LOG_ERROR,
@@ -913,6 +1043,74 @@ unlock:
UNLOCK (&priv->lock);
}
+static int
+is_fresh_file (struct stat *stat)
+{
+ struct timeval tv;
+
+ gettimeofday (&tv, NULL);
+
+ if ((stat->st_ctime >= (tv.tv_sec - 1))
+ && (stat->st_ctime <= tv.tv_sec))
+ return 1;
+
+ return 0;
+}
+
+
+int
+posix_gfid_heal (xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req)
+{
+ /* The purpose of this function is to prevent a race
+ where an inode creation FOP (like mkdir/mknod/create etc)
+ races with lookup in the following way:
+
+ {create thread} | {lookup thread}
+ |
+ t0
+ mkdir ("name") |
+ t1
+ | posix_gfid_set ("name", 2);
+ t2
+ posix_gfid_set ("name", 1); |
+ t3
+ lstat ("name"); | lstat ("name");
+
+ In the above case mkdir FOP would have resulted with GFID 2 while
+ it should have been GFID 1. It matters in the case where GFID would
+ have gotten set to 1 on other subvolumes of replciate/distribute
+
+ The "solution" here is that, if we detect lookup is attempting to
+ set a GFID on a file which is created very recently, but does not
+ yet have a GFID (i.e, between t1 and t2), then "fake" it as though
+ posix_gfid_heal was called at t0 instead.
+ */
+
+ uuid_t uuid_curr;
+ int ret = 0;
+ struct stat stat = {0, };
+
+ if (!xattr_req)
+ goto out;
+
+ if (sys_lstat (path, &stat) != 0)
+ goto out;
+
+ ret = sys_lgetxattr (path, GFID_XATTR_KEY, uuid_curr, 16);
+ if (ret != 16) {
+ if (is_fresh_file (&stat)) {
+ ret = -1;
+ errno = ENOENT;
+ goto out;
+ }
+ }
+
+ ret = posix_gfid_set (this, path, loc, xattr_req);
+out:
+ return ret;
+}
+
+
int
posix_acl_xattr_set (xlator_t *this, const char *path, dict_t *xattr_req)
{
@@ -926,17 +1124,17 @@ posix_acl_xattr_set (xlator_t *this, const char *path, dict_t *xattr_req)
if (sys_lstat (path, &stat) != 0)
goto out;
- data = dict_get (xattr_req, "system.posix_acl_access");
+ data = dict_get (xattr_req, POSIX_ACL_ACCESS_XATTR);
if (data) {
- ret = sys_lsetxattr (path, "system.posix_acl_access",
+ ret = sys_lsetxattr (path, POSIX_ACL_ACCESS_XATTR,
data->data, data->len, 0);
if (ret != 0)
goto out;
}
- data = dict_get (xattr_req, "system.posix_acl_default");
+ data = dict_get (xattr_req, POSIX_ACL_DEFAULT_XATTR);
if (data) {
- ret = sys_lsetxattr (path, "system.posix_acl_default",
+ ret = sys_lsetxattr (path, POSIX_ACL_DEFAULT_XATTR,
data->data, data->len, 0);
if (ret != 0)
goto out;
@@ -946,35 +1144,47 @@ out:
return ret;
}
+static int
+_handle_entry_create_keyvalue_pair (dict_t *d, char *k, data_t *v,
+ void *tmp)
+{
+ int ret = -1;
+ posix_xattr_filler_t *filler = NULL;
+
+ filler = tmp;
+
+ if (!strcmp (GFID_XATTR_KEY, k) ||
+ !strcmp ("gfid-req", k) ||
+ !strcmp (POSIX_ACL_DEFAULT_XATTR, k) ||
+ !strcmp (POSIX_ACL_ACCESS_XATTR, k) ||
+ ZR_FILE_CONTENT_REQUEST(k)) {
+ return 0;
+ }
+
+ ret = posix_handle_pair (filler->this, filler->real_path, k, v,
+ XATTR_CREATE);
+ if (ret < 0) {
+ errno = -ret;
+ return -1;
+ }
+ return 0;
+}
+
int
posix_entry_create_xattr_set (xlator_t *this, const char *path,
dict_t *dict)
{
int ret = -1;
+ posix_xattr_filler_t filler = {0,};
+
if (!dict)
goto out;
- int _handle_keyvalue_pair (dict_t *d, char *k, data_t *v,
- void *tmp)
- {
- if (!strcmp (GFID_XATTR_KEY, k) ||
- !strcmp ("gfid-req", k) ||
- !strcmp ("system.posix_acl_default", k) ||
- !strcmp ("system.posix_acl_access", k) ||
- ZR_FILE_CONTENT_REQUEST(k)) {
- return 0;
- }
+ filler.this = this;
+ filler.real_path = path;
- ret = posix_handle_pair (this, path, k, v, XATTR_CREATE);
- if (ret < 0) {
- errno = -ret;
- return -1;
- }
- return 0;
- }
-
- ret = dict_foreach (dict, _handle_keyvalue_pair, NULL);
+ ret = dict_foreach (dict, _handle_entry_create_keyvalue_pair, &filler);
out:
return ret;
@@ -1064,3 +1274,259 @@ posix_fd_ctx_get (fd_t *fd, xlator_t *this, struct posix_fd **pfd)
return ret;
}
+
+static void *
+posix_health_check_thread_proc (void *data)
+{
+ xlator_t *this = NULL;
+ struct posix_private *priv = NULL;
+ uint32_t interval = 0;
+ int ret = -1;
+ struct stat sb = {0, };
+
+ this = data;
+ priv = this->private;
+
+ /* prevent races when the interval is updated */
+ interval = priv->health_check_interval;
+ if (interval == 0)
+ goto out;
+
+ gf_log (this->name, GF_LOG_DEBUG, "health-check thread started, "
+ "interval = %d seconds", interval);
+
+ while (1) {
+ /* aborting sleep() is a request to exit this thread, sleep()
+ * will normally not return when cancelled */
+ ret = sleep (interval);
+ if (ret > 0)
+ break;
+
+ /* prevent thread errors while doing the health-check(s) */
+ pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, NULL);
+
+ /* Do the health-check, it should be moved to its own function
+ * in case it gets more complex. */
+ ret = stat (priv->base_path, &sb);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "stat() on %s returned: %s", priv->base_path,
+ strerror (errno));
+ goto abort;
+ }
+
+ pthread_setcancelstate (PTHREAD_CANCEL_ENABLE, NULL);
+ }
+
+out:
+ gf_log (this->name, GF_LOG_DEBUG, "health-check thread exiting");
+
+ LOCK (&priv->lock);
+ {
+ priv->health_check_active = _gf_false;
+ }
+ UNLOCK (&priv->lock);
+
+ return NULL;
+
+abort:
+ /* health-check failed */
+ gf_log (this->name, GF_LOG_EMERG, "health-check failed, going down");
+ xlator_notify (this->parents->xlator, GF_EVENT_CHILD_DOWN, this);
+
+ ret = sleep (30);
+ if (ret == 0) {
+ gf_log (this->name, GF_LOG_EMERG, "still alive! -> SIGTERM");
+ kill (getpid(), SIGTERM);
+ }
+
+ ret = sleep (30);
+ if (ret == 0) {
+ gf_log (this->name, GF_LOG_EMERG, "still alive! -> SIGKILL");
+ kill (getpid(), SIGKILL);
+ }
+
+ return NULL;
+}
+
+void
+posix_spawn_health_check_thread (xlator_t *xl)
+{
+ struct posix_private *priv = NULL;
+ int ret = -1;
+
+ priv = xl->private;
+
+ LOCK (&priv->lock);
+ {
+ /* cancel the running thread */
+ if (priv->health_check_active == _gf_true) {
+ pthread_cancel (priv->health_check);
+ priv->health_check_active = _gf_false;
+ }
+
+ /* prevent scheduling a check in a tight loop */
+ if (priv->health_check_interval == 0)
+ goto unlock;
+
+ ret = gf_thread_create (&priv->health_check, NULL,
+ posix_health_check_thread_proc, xl);
+ if (ret < 0) {
+ priv->health_check_interval = 0;
+ priv->health_check_active = _gf_false;
+ gf_log (xl->name, GF_LOG_ERROR,
+ "unable to setup health-check thread: %s",
+ strerror (errno));
+ goto unlock;
+ }
+
+ /* run the thread detached, resources will be freed on exit */
+ pthread_detach (priv->health_check);
+ priv->health_check_active = _gf_true;
+ }
+unlock:
+ UNLOCK (&priv->lock);
+}
+
+int
+posix_fsyncer_pick (xlator_t *this, struct list_head *head)
+{
+ struct posix_private *priv = NULL;
+ int count = 0;
+
+ priv = this->private;
+ pthread_mutex_lock (&priv->fsync_mutex);
+ {
+ while (list_empty (&priv->fsyncs))
+ pthread_cond_wait (&priv->fsync_cond,
+ &priv->fsync_mutex);
+
+ count = priv->fsync_queue_count;
+ priv->fsync_queue_count = 0;
+ list_splice_init (&priv->fsyncs, head);
+ }
+ pthread_mutex_unlock (&priv->fsync_mutex);
+
+ return count;
+}
+
+
+void
+posix_fsyncer_process (xlator_t *this, call_stub_t *stub, gf_boolean_t do_fsync)
+{
+ struct posix_fd *pfd = NULL;
+ int ret = -1;
+ struct posix_private *priv = NULL;
+
+ priv = this->private;
+
+ ret = posix_fd_ctx_get (stub->args.fd, this, &pfd);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "could not get fdctx for fd(%s)",
+ uuid_utoa (stub->args.fd->inode->gfid));
+ call_unwind_error (stub, -1, EINVAL);
+ return;
+ }
+
+ if (do_fsync) {
+#ifdef HAVE_FDATASYNC
+ if (stub->args.datasync)
+ ret = fdatasync (pfd->fd);
+ else
+#endif
+ ret = fsync (pfd->fd);
+ } else {
+ ret = 0;
+ }
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "could not fstat fd(%s)",
+ uuid_utoa (stub->args.fd->inode->gfid));
+ call_unwind_error (stub, -1, errno);
+ return;
+ }
+
+ call_unwind_error (stub, 0, 0);
+}
+
+
+static void
+posix_fsyncer_syncfs (xlator_t *this, struct list_head *head)
+{
+ call_stub_t *stub = NULL;
+ struct posix_fd *pfd = NULL;
+ int ret = -1;
+
+ stub = list_entry (head->prev, call_stub_t, list);
+ ret = posix_fd_ctx_get (stub->args.fd, this, &pfd);
+ if (ret)
+ return;
+
+#ifdef GF_LINUX_HOST_OS
+ /* syncfs() is not "declared" in RHEL's glibc even though
+ the kernel has support.
+ */
+#include <sys/syscall.h>
+#include <unistd.h>
+#ifdef SYS_syncfs
+ syscall (SYS_syncfs, pfd->fd);
+#else
+ sync();
+#endif
+#else
+ sync();
+#endif
+}
+
+
+void *
+posix_fsyncer (void *d)
+{
+ xlator_t *this = d;
+ struct posix_private *priv = NULL;
+ call_stub_t *stub = NULL;
+ call_stub_t *tmp = NULL;
+ struct list_head list;
+ int count = 0;
+ gf_boolean_t do_fsync = _gf_true;
+
+ priv = this->private;
+
+ for (;;) {
+ INIT_LIST_HEAD (&list);
+
+ count = posix_fsyncer_pick (this, &list);
+
+ usleep (priv->batch_fsync_delay_usec);
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "picked %d fsyncs", count);
+
+ switch (priv->batch_fsync_mode) {
+ case BATCH_NONE:
+ case BATCH_REVERSE_FSYNC:
+ break;
+ case BATCH_SYNCFS:
+ case BATCH_SYNCFS_SINGLE_FSYNC:
+ case BATCH_SYNCFS_REVERSE_FSYNC:
+ posix_fsyncer_syncfs (this, &list);
+ break;
+ }
+
+ if (priv->batch_fsync_mode == BATCH_SYNCFS)
+ do_fsync = _gf_false;
+ else
+ do_fsync = _gf_true;
+
+ list_for_each_entry_safe_reverse (stub, tmp, &list, list) {
+ list_del_init (&stub->list);
+
+ posix_fsyncer_process (this, stub, do_fsync);
+
+ if (priv->batch_fsync_mode == BATCH_SYNCFS_SINGLE_FSYNC)
+ do_fsync = _gf_false;
+ }
+ }
+}