diff options
Diffstat (limited to 'xlators/storage')
26 files changed, 17751 insertions, 11219 deletions
diff --git a/xlators/storage/Makefile.am b/xlators/storage/Makefile.am index 9cb9ded3035..5e3ed0eb93b 100644 --- a/xlators/storage/Makefile.am +++ b/xlators/storage/Makefile.am @@ -1,3 +1,3 @@ SUBDIRS = posix -CLEANFILES = +CLEANFILES = diff --git a/xlators/storage/bdb/Makefile.am b/xlators/storage/bdb/Makefile.am deleted file mode 100644 index d471a3f9243..00000000000 --- a/xlators/storage/bdb/Makefile.am +++ /dev/null @@ -1,3 +0,0 @@ -SUBDIRS = src - -CLEANFILES = diff --git a/xlators/storage/bdb/src/Makefile.am b/xlators/storage/bdb/src/Makefile.am deleted file mode 100644 index 7e2376979ce..00000000000 --- a/xlators/storage/bdb/src/Makefile.am +++ /dev/null @@ -1,18 +0,0 @@ - -xlator_LTLIBRARIES = bdb.la -xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/testing/storage - -bdb_la_LDFLAGS = -module -avoidversion - -bdb_la_SOURCES = bctx.c bdb-ll.c bdb.c -bdb_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la - -noinst_HEADERS = bdb.h - -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D__USE_FILE_OFFSET64 -D_GNU_SOURCE -D$(GF_HOST_OS) -Wall \ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) - -AM_LDFLAGS = -ldb - -CLEANFILES = - diff --git a/xlators/storage/bdb/src/bctx.c b/xlators/storage/bdb/src/bctx.c deleted file mode 100644 index 361ce75fe9c..00000000000 --- a/xlators/storage/bdb/src/bctx.c +++ /dev/null @@ -1,341 +0,0 @@ -/* - Copyright (c) 2008-2009 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - -#include <list.h> -#include <bdb.h> -#include <libgen.h> /* for dirname */ - -static void -__destroy_bctx (bctx_t *bctx) -{ - if (bctx->directory) - FREE (bctx->directory); - - if (bctx->db_path) - FREE (bctx->db_path); - - FREE (bctx); -} - -static void -__unhash_bctx (bctx_t *bctx) -{ - list_del_init (&bctx->b_hash); -} - -static int32_t -bctx_table_prune (bctx_table_t *table) -{ - int32_t ret = 0; - struct list_head purge = {0,}; - struct list_head *next = NULL; - bctx_t *entry = NULL; - bctx_t *del = NULL, *tmp = NULL; - - if (!table) - return 0; - - INIT_LIST_HEAD (&purge); - - LOCK (&table->lock); - { - if ((table->lru_limit) && - (table->lru_size > table->lru_limit)) { - while (table->lru_size > table->lru_limit) { - next = table->b_lru.next; - entry = list_entry (next, bctx_t, list); - - list_move_tail (next, &table->purge); - __unhash_bctx (entry); - - table->lru_size--; - ret++; - } - } - list_move_tail (&purge, &table->purge); - list_del_init (&table->purge); - } - UNLOCK (&table->lock); - - list_for_each_entry_safe (del, tmp, &purge, list) { - list_del_init (&del->list); - if (del->primary) { - ret = del->primary->close (del->primary, 0); - if (ret != 0) { - gf_log (table->this->name, GF_LOG_DEBUG, - "_BCTX_TABLE_PRUNE %s: %s " - "(failed to close primary database)", - del->directory, db_strerror (ret)); - } else { - gf_log (table->this->name, GF_LOG_DEBUG, - "_BCTX_TABLE_PRUNE %s (lru=%d)" - "(closed primary database)", - del->directory, table->lru_size); - } - } - if (del->secondary) { - ret = del->secondary->close (del->secondary, 0); - if (ret != 0) { - gf_log (table->this->name, GF_LOG_DEBUG, - "_BCTX_TABLE_PRUNE %s: %s " - "(failed to close secondary database)", - del->directory, db_strerror (ret)); - } else { - gf_log (table->this->name, GF_LOG_DEBUG, - "_BCTX_TABLE_PRUNE %s (lru=%d)" - "(closed 
secondary database)", - del->directory, table->lru_size); - } - } - __destroy_bctx (del); - } - - return ret; -} - - -/* struct bdb_ctx related */ -static inline uint32_t -bdb_key_hash (char *key, uint32_t hash_size) -{ - uint32_t hash = 0; - - hash = *key; - - if (hash) { - for (key += 1; *key != '\0'; key++) { - hash = (hash << 5) - hash + *key; - } - } - - return (hash + *key) % hash_size; -} - -static void -__hash_bctx (bctx_t *bctx) -{ - bctx_table_t *table = NULL; - char *key = NULL; - - table = bctx->table; - - MAKE_KEY_FROM_PATH (key, bctx->directory); - bctx->key_hash = bdb_key_hash (key, table->hash_size); - - list_del_init (&bctx->b_hash); - list_add (&bctx->b_hash, &table->b_hash[bctx->key_hash]); -} - -static inline bctx_t * -__bctx_passivate (bctx_t *bctx) -{ - if (bctx->primary) { - list_move_tail (&bctx->list, &(bctx->table->b_lru)); - bctx->table->lru_size++; - } else { - list_move_tail (&bctx->list, &bctx->table->purge); - __unhash_bctx (bctx); - } - return bctx; -} - -static inline bctx_t * -__bctx_activate (bctx_t *bctx) -{ - list_move (&bctx->list, &bctx->table->active); - bctx->table->lru_size--; - - return bctx; -} - -static bctx_t * -__bdb_ctx_unref (bctx_t *bctx) -{ - assert (bctx->ref); - - --bctx->ref; - - if (!bctx->ref) - bctx = __bctx_passivate (bctx); - - return bctx; -} - - -bctx_t * -bctx_unref (bctx_t *bctx) -{ - bctx_table_t *table = NULL; - - if (!bctx && !bctx->table) - return NULL; - - table = bctx->table; - - LOCK (&table->lock); - { - bctx = __bdb_ctx_unref (bctx); - } - UNLOCK (&table->lock); - - bctx_table_prune (table); - - return bctx; -} - -/* - * NOTE: __bdb_ctx_ref() is called only after holding table->lock and - * bctx->lock, in that order - */ -static inline bctx_t * -__bctx_ref (bctx_t *bctx) -{ - if (!bctx->ref) - __bctx_activate (bctx); - - bctx->ref++; - - return bctx; -} - -bctx_t * -bctx_ref (bctx_t *bctx) -{ - LOCK (&(bctx->table->lock)); - { - __bctx_ref (bctx); - } - UNLOCK (&(bctx->table->lock)); - - return 
bctx; -} - - -#define BDB_THIS(table) (table->this) - -static inline bctx_t * -__create_bctx (bctx_table_t *table, - const char *path) -{ - bctx_t *bctx = NULL; - char *db_path = NULL; - - bctx = CALLOC (1, sizeof (*bctx)); - GF_VALIDATE_OR_GOTO ("bctx", bctx, out); - - bctx->table = table; - bctx->directory = strdup (path); - GF_VALIDATE_OR_GOTO ("bctx", bctx->directory, out); - - MAKE_REAL_PATH_TO_STORAGE_DB (db_path, BDB_THIS (table), path); - - bctx->db_path = strdup (db_path); - GF_VALIDATE_OR_GOTO ("bctx", bctx->directory, out); - - INIT_LIST_HEAD (&bctx->c_list); - INIT_LIST_HEAD (&bctx->list); - INIT_LIST_HEAD (&bctx->b_hash); - - LOCK_INIT (&bctx->lock); - - __hash_bctx (bctx); - - list_add (&bctx->list, &table->b_lru); - table->lru_size++; - -out: - return bctx; -} - -/* bctx_lookup - lookup bctx_t for the directory @directory. - * (see description of bctx_t in bdb.h) - * - * @table: bctx_table_t for this instance of bdb. - * @directory: directory for which bctx_t is being looked up. 
- */ -bctx_t * -bctx_lookup (bctx_table_t *table, - const char *directory) -{ - char *key = NULL; - uint32_t key_hash = 0; - bctx_t *trav = NULL, *bctx = NULL, *tmp = NULL; - int32_t need_break = 0; - - GF_VALIDATE_OR_GOTO ("bctx", table, out); - GF_VALIDATE_OR_GOTO ("bctx", directory, out); - - MAKE_KEY_FROM_PATH (key, directory); - key_hash = bdb_key_hash (key, table->hash_size); - - LOCK (&table->lock); - { - if (list_empty (&table->b_hash[key_hash])) { - goto creat_bctx; - } - - list_for_each_entry_safe (trav, tmp, &table->b_hash[key_hash], - b_hash) { - LOCK(&trav->lock); - { - if (!strcmp(trav->directory, directory)) { - bctx = __bctx_ref (trav); - need_break = 1; - } - } - UNLOCK(&trav->lock); - - if (need_break) - break; - } - - creat_bctx: - if (!bctx) { - bctx = __create_bctx (table, directory); - bctx = __bctx_ref (bctx); - } - } - UNLOCK (&table->lock); -out: - return bctx; -} - - -bctx_t * -bctx_parent (bctx_table_t *table, - const char *path) -{ - char *pathname = NULL, *directory = NULL; - bctx_t *bctx = NULL; - - GF_VALIDATE_OR_GOTO ("bctx", table, out); - GF_VALIDATE_OR_GOTO ("bctx", path, out); - - pathname = strdup (path); - GF_VALIDATE_OR_GOTO ("bctx", pathname, out); - directory = dirname (pathname); - - bctx = bctx_lookup (table, directory); - GF_VALIDATE_OR_GOTO ("bctx", bctx, out); - -out: - if (pathname) - free (pathname); - return bctx; -} diff --git a/xlators/storage/bdb/src/bdb-ll.c b/xlators/storage/bdb/src/bdb-ll.c deleted file mode 100644 index 290b29710dd..00000000000 --- a/xlators/storage/bdb/src/bdb-ll.c +++ /dev/null @@ -1,1460 +0,0 @@ -/* - Copyright (c) 2008-2009 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. 
- - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - -#include <libgen.h> -#include "bdb.h" -#include <list.h> -#include "hashfn.h" -/* - * implement the procedures to interact with bdb */ - -/**************************************************************** - * - * General wrappers and utility procedures for bdb xlator - * - ****************************************************************/ - -ino_t -bdb_inode_transform (ino_t parent, - const char *name, - size_t namelen) -{ - ino_t ino = -1; - uint64_t hash = 0; - - hash = gf_dm_hashfn (name, namelen); - - ino = (((parent << 32) | 0x00000000ffffffffULL) - & (hash | 0xffffffff00000000ULL)); - - return ino; -} - -static int -bdb_generate_secondary_hash (DB *secondary, - const DBT *pkey, - const DBT *data, - DBT *skey) -{ - char *primary = NULL; - uint32_t *hash = NULL; - - primary = pkey->data; - - hash = calloc (1, sizeof (uint32_t)); - - *hash = gf_dm_hashfn (primary, pkey->size); - - skey->data = hash; - skey->size = sizeof (hash); - skey->flags = DB_DBT_APPMALLOC; - - return 0; -} - -/*********************************************************** - * - * bdb storage database utilities - * - **********************************************************/ - -/* - * bdb_db_open - opens a storage db. - * - * @ctx: context specific to the directory for which we are supposed to open db - * - * see, if we have empty slots to open a db. 
- * if (no-empty-slots), then prune open dbs and close as many as possible - * if (empty-slot-available), tika muchkonDu db open maaDu - * - */ -static int -bdb_db_open (bctx_t *bctx) -{ - DB *primary = NULL; - DB *secondary = NULL; - int32_t ret = -1; - bctx_table_t *table = NULL; - - GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out); - - table = bctx->table; - GF_VALIDATE_OR_GOTO ("bdb-ll", table, out); - - /* we have to do the following, we can't deny someone of db_open ;) */ - ret = db_create (&primary, table->dbenv, 0); - if (ret < 0) { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_DB_OPEN %s: %s (failed to create database object" - " for primary database)", - bctx->directory, db_strerror (ret)); - ret = -ENOMEM; - goto out; - } - - if (table->page_size) { - ret = primary->set_pagesize (primary, - table->page_size); - if (ret < 0) { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_DB_OPEN %s: %s (failed to set page-size " - "to %"PRIu64")", - bctx->directory, db_strerror (ret), - table->page_size); - } else { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_DB_OPEN %s: page-size set to %"PRIu64, - bctx->directory, table->page_size); - } - } - - ret = primary->open (primary, NULL, bctx->db_path, "primary", - table->access_mode, table->dbflags, 0); - if (ret < 0) { - gf_log ("bdb-ll", GF_LOG_ERROR, - "_BDB_DB_OPEN %s: %s " - "(failed to open primary database)", - bctx->directory, db_strerror (ret)); - ret = -1; - goto cleanup; - } - - ret = db_create (&secondary, table->dbenv, 0); - if (ret < 0) { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_DB_OPEN %s: %s (failed to create database object" - " for secondary database)", - bctx->directory, db_strerror (ret)); - ret = -ENOMEM; - goto cleanup; - } - - ret = secondary->open (secondary, NULL, bctx->db_path, "secondary", - table->access_mode, table->dbflags, 0); - if (ret != 0 ) { - gf_log ("bdb-ll", GF_LOG_ERROR, - "_BDB_DB_OPEN %s: %s " - "(failed to open secondary database)", - bctx->directory, db_strerror (ret)); - ret = -1; - goto cleanup; - 
} - - ret = primary->associate (primary, NULL, secondary, - bdb_generate_secondary_hash, -#ifdef DB_IMMUTABLE_KEY - DB_IMMUTABLE_KEY); -#else - 0); -#endif - if (ret != 0 ) { - gf_log ("bdb-ll", GF_LOG_ERROR, - "_BDB_DB_OPEN %s: %s " - "(failed to associate primary database with " - "secondary database)", - bctx->directory, db_strerror (ret)); - ret = -1; - goto cleanup; - } - -out: - bctx->primary = primary; - bctx->secondary = secondary; - - return ret; -cleanup: - if (primary) - primary->close (primary, 0); - if (secondary) - secondary->close (secondary, 0); - - return ret; -} - -int32_t -bdb_cursor_close (bctx_t *bctx, - DBC *cursorp) -{ - int32_t ret = -1; - - GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out); - GF_VALIDATE_OR_GOTO ("bdb-ll", cursorp, out); - - LOCK (&bctx->lock); - { -#ifdef HAVE_BDB_CURSOR_GET - ret = cursorp->close (cursorp); -#else - ret = cursorp->c_close (cursorp); -#endif - if (ret < 0) { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_CURSOR_CLOSE %s: %s " - "(failed to close database cursor)", - bctx->directory, db_strerror (ret)); - } - } - UNLOCK (&bctx->lock); - -out: - return ret; -} - - -int32_t -bdb_cursor_open (bctx_t *bctx, - DBC **cursorpp) -{ - int32_t ret = -1; - - GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out); - GF_VALIDATE_OR_GOTO ("bdb-ll", cursorpp, out); - - LOCK (&bctx->lock); - { - if (bctx->secondary) { - /* do nothing, just continue */ - ret = 0; - } else { - ret = bdb_db_open (bctx); - if (ret < 0) { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_CURSOR_OPEN %s: ENOMEM " - "(failed to open secondary database)", - bctx->directory); - ret = -ENOMEM; - } else { - ret = 0; - } - } - - if (ret == 0) { - /* all set, open cursor */ - ret = bctx->secondary->cursor (bctx->secondary, - NULL, cursorpp, 0); - if (ret < 0) { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_CURSOR_OPEN %s: %s " - "(failed to open a cursor to database)", - bctx->directory, db_strerror (ret)); - } - } - } - UNLOCK (&bctx->lock); - -out: - return ret; -} - - -/* cache related 
*/ -static bdb_cache_t * -bdb_cache_lookup (bctx_t *bctx, - char *path) -{ - bdb_cache_t *bcache = NULL; - bdb_cache_t *trav = NULL; - char *key = NULL; - - GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out); - GF_VALIDATE_OR_GOTO ("bdb-ll", path, out); - - MAKE_KEY_FROM_PATH (key, path); - - LOCK (&bctx->lock); - { - list_for_each_entry (trav, &bctx->c_list, c_list) { - if (!strcmp (trav->key, key)){ - bcache = trav; - break; - } - } - } - UNLOCK (&bctx->lock); - -out: - return bcache; -} - -static int32_t -bdb_cache_insert (bctx_t *bctx, - DBT *key, - DBT *data) -{ - bdb_cache_t *bcache = NULL; - int32_t ret = -1; - - GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out); - GF_VALIDATE_OR_GOTO ("bdb-ll", key, out); - GF_VALIDATE_OR_GOTO ("bdb-ll", data, out); - - LOCK (&bctx->lock); - { - if (bctx->c_count > 5) { - /* most of the times, we enter here */ - /* FIXME: ugly, not supposed to disect any of the - * 'struct list_head' directly */ - if (!list_empty (&bctx->c_list)) { - bcache = list_entry (bctx->c_list.prev, - bdb_cache_t, c_list); - list_del_init (&bcache->c_list); - } - if (bcache->key) { - free (bcache->key); - bcache->key = calloc (key->size + 1, - sizeof (char)); - GF_VALIDATE_OR_GOTO ("bdb-ll", - bcache->key, unlock); - memcpy (bcache->key, (char *)key->data, - key->size); - } else { - /* should never come here */ - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_CACHE_INSERT %s (%s) " - "(found a cache entry with empty key)", - bctx->directory, (char *)key->data); - } /* if(bcache->key)...else */ - if (bcache->data) { - free (bcache->data); - bcache->data = memdup (data->data, data->size); - GF_VALIDATE_OR_GOTO ("bdb-ll", bcache->data, - unlock); - bcache->size = data->size; - } else { - /* should never come here */ - gf_log ("bdb-ll", GF_LOG_CRITICAL, - "_BDB_CACHE_INSERT %s (%s) " - "(found a cache entry with no data)", - bctx->directory, (char *)key->data); - } /* if(bcache->data)...else */ - list_add (&bcache->c_list, &bctx->c_list); - ret = 0; - } else { - /* we will be 
entering here very rarely */ - bcache = CALLOC (1, sizeof (*bcache)); - GF_VALIDATE_OR_GOTO ("bdb-ll", bcache, unlock); - - bcache->key = calloc (key->size + 1, sizeof (char)); - GF_VALIDATE_OR_GOTO ("bdb-ll", bcache->key, unlock); - memcpy (bcache->key, key->data, key->size); - - bcache->data = memdup (data->data, data->size); - GF_VALIDATE_OR_GOTO ("bdb-ll", bcache->data, unlock); - - bcache->size = data->size; - list_add (&bcache->c_list, &bctx->c_list); - bctx->c_count++; - ret = 0; - } /* if(private->c_count < 5)...else */ - } -unlock: - UNLOCK (&bctx->lock); -out: - return ret; -} - -static int32_t -bdb_cache_delete (bctx_t *bctx, - const char *key) -{ - bdb_cache_t *bcache = NULL; - bdb_cache_t *trav = NULL; - - GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out); - GF_VALIDATE_OR_GOTO ("bdb-ll", key, out); - - LOCK (&bctx->lock); - { - list_for_each_entry (trav, &bctx->c_list, c_list) { - if (!strcmp (trav->key, key)){ - bctx->c_count--; - bcache = trav; - break; - } - } - - if (bcache) { - list_del_init (&bcache->c_list); - free (bcache->key); - free (bcache->data); - free (bcache); - } - } - UNLOCK (&bctx->lock); - -out: - return 0; -} - -void * -bdb_db_stat (bctx_t *bctx, - DB_TXN *txnid, - uint32_t flags) -{ - DB *storage = NULL; - void *stat = NULL; - int32_t ret = -1; - - LOCK (&bctx->lock); - { - if (bctx->primary == NULL) { - ret = bdb_db_open (bctx); - storage = bctx->primary; - } else { - /* we are just fine, lets continue */ - storage = bctx->primary; - } /* if(bctx->dbp==NULL)...else */ - } - UNLOCK (&bctx->lock); - - GF_VALIDATE_OR_GOTO ("bdb-ll", storage, out); - - ret = storage->stat (storage, txnid, &stat, flags); - - if (ret < 0) { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_DB_STAT %s: %s " - "(failed to do stat database)", - bctx->directory, db_strerror (ret)); - } -out: - return stat; - -} - -/* bdb_storage_get - retrieve a key/value pair corresponding to @path from the - * corresponding db file. 
- * - * @bctx: bctx_t * corresponding to the parent directory of @path. (should - * always be a valid bctx). bdb_storage_get should never be called if - * @bctx = NULL. - * @txnid: NULL if bdb_storage_get is not embedded in an explicit transaction - * or a valid DB_TXN *, when embedded in an explicit transaction. - * @path: path of the file to read from (translated to a database key using - * MAKE_KEY_FROM_PATH) - * @buf: char ** - pointer to a pointer to char. a read buffer is created in - * this procedure and pointer to the buffer is passed through @buf to the - * caller. - * @size: size of the file content to be read. - * @offset: offset from which the file content to be read. - * - * NOTE: bdb_storage_get tries to open DB, if @bctx->dbp == NULL - * (@bctx->dbp == NULL, nobody has opened DB till now or DB was closed by - * bdb_table_prune()). - * - * NOTE: if private->cache is set (bdb xlator's internal caching enabled), then - * bdb_storage_get first looks up the cache for key/value pair. if - * bdb_lookup_cache fails, then only DB->get() is called. also, inserts a - * newly read key/value pair to cache through bdb_insert_to_cache. - * - * return: 'number of bytes read' on success or -1 on error. - * - * also see: bdb_lookup_cache, bdb_insert_to_cache for details about bdb - * xlator's internal cache. - */ -static int32_t -bdb_db_get (bctx_t *bctx, - DB_TXN *txnid, - const char *path, - char *buf, - size_t size, - off_t offset) -{ - DB *storage = NULL; - DBT key = {0,}; - DBT value = {0,}; - int32_t ret = -1; - size_t copy_size = 0; - char *key_string = NULL; - bdb_cache_t *bcache = NULL; - int32_t db_flags = 0; - uint8_t need_break = 0; - int32_t retries = 1; - - GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out); - GF_VALIDATE_OR_GOTO ("bdb-ll", path, out); - - MAKE_KEY_FROM_PATH (key_string, path); - - if (bctx->cache && - ((bcache = bdb_cache_lookup (bctx, key_string)) != NULL)) { - if (buf) { - copy_size = ((bcache->size - offset) < size)? 
- (bcache->size - offset) : size; - - memcpy (buf, (bcache->data + offset), copy_size); - ret = copy_size; - } else { - ret = bcache->size; - } - - goto out; - } - - LOCK (&bctx->lock); - { - if (bctx->primary == NULL) { - ret = bdb_db_open (bctx); - storage = bctx->primary; - } else { - /* we are just fine, lets continue */ - storage = bctx->primary; - } /* if(bctx->dbp==NULL)...else */ - } - UNLOCK (&bctx->lock); - - GF_VALIDATE_OR_GOTO ("bdb-ll", storage, out); - - key.data = (char *)key_string; - key.size = strlen (key_string); - key.flags = DB_DBT_USERMEM; - - if (bctx->cache){ - value.flags = DB_DBT_MALLOC; - } else { - if (size) { - value.data = buf; - value.ulen = size; - value.flags = DB_DBT_USERMEM | DB_DBT_PARTIAL; - } else { - value.flags = DB_DBT_MALLOC; - } - value.dlen = size; - value.doff = offset; - } - - do { - /* TODO: we prefer to give our own buffer to value.data - * and ask bdb to fill in it */ - ret = storage->get (storage, txnid, &key, &value, - db_flags); - - if (ret == DB_NOTFOUND) { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_DB_GET %s - %s: ENOENT" - "(specified key not found in database)", - bctx->directory, key_string); - ret = -1; - need_break = 1; - } else if (ret == DB_LOCK_DEADLOCK) { - retries++; - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_DB_GET %s - %s" - "(deadlock detected, retrying for %d " - "time)", - bctx->directory, key_string, retries); - } else if (ret == 0) { - /* successfully read data, lets set everything - * in place and return */ - if (bctx->cache) { - if (buf) { - copy_size = ((value.size - offset) < size) ? 
- (value.size - offset) : size; - - memcpy (buf, (value.data + offset), - copy_size); - ret = copy_size; - } - - bdb_cache_insert (bctx, &key, &value); - } else { - ret = value.size; - } - - if (size == 0) - free (value.data); - - need_break = 1; - } else { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_DB_GET %s - %s: %s" - "(failed to retrieve specified key from" - " database)", - bctx->directory, key_string, - db_strerror (ret)); - ret = -1; - need_break = 1; - } - } while (!need_break); - -out: - return ret; -}/* bdb_db_get */ - -/* TODO: handle errors here and log. propogate only the errno to caller */ -int32_t -bdb_db_fread (struct bdb_fd *bfd, char *buf, size_t size, off_t offset) -{ - return bdb_db_get (bfd->ctx, NULL, bfd->key, buf, size, offset); -} - -int32_t -bdb_db_iread (struct bdb_ctx *bctx, const char *key, char **bufp) -{ - char *buf = NULL; - size_t size = 0; - int64_t ret = 0; - - ret = bdb_db_get (bctx, NULL, key, NULL, 0, 0); - size = ret; - - if (bufp) { - buf = calloc (size, sizeof (char)); - *bufp = buf; - ret = bdb_db_get (bctx, NULL, key, buf, size, 0); - } - - return ret; -} - -/* bdb_storage_put - insert a key/value specified to the corresponding DB. - * - * @bctx: bctx_t * corresponding to the parent directory of @path. - * (should always be a valid bctx). bdb_storage_put should never be - * called if @bctx = NULL. - * @txnid: NULL if bdb_storage_put is not embedded in an explicit transaction - * or a valid DB_TXN *, when embedded in an explicit transaction. - * @key_string: key of the database entry. - * @buf: pointer to the buffer data to be written as data for @key_string. - * @size: size of @buf. - * @offset: offset in the key's data to be modified with provided data. - * @flags: valid flags are BDB_TRUNCATE_RECORD (to reduce the data of - * @key_string to 0 size). - * - * NOTE: bdb_storage_put tries to open DB, if @bctx->dbp == NULL - * (@bctx->dbp == NULL, nobody has opened DB till now or DB was closed by - * bdb_table_prune()). 
- * - * NOTE: bdb_storage_put deletes the key/value from bdb xlator's internal cache. - * - * return: 0 on success or -1 on error. - * - * also see: bdb_cache_delete for details on how a cached key/value pair is - * removed. - */ -static int32_t -bdb_db_put (bctx_t *bctx, - DB_TXN *txnid, - const char *key_string, - const char *buf, - size_t size, - off_t offset, - int32_t flags) -{ - DB *storage = NULL; - DBT key = {0,}, value = {0,}; - int32_t ret = -1; - int32_t db_flags = DB_AUTO_COMMIT; - uint8_t need_break = 0; - int32_t retries = 1; - - LOCK (&bctx->lock); - { - if (bctx->primary == NULL) { - ret = bdb_db_open (bctx); - storage = bctx->primary; - } else { - /* we are just fine, lets continue */ - storage = bctx->primary; - } - } - UNLOCK (&bctx->lock); - - GF_VALIDATE_OR_GOTO ("bdb-ll", storage, out); - - if (bctx->cache) { - ret = bdb_cache_delete (bctx, (char *)key_string); - GF_VALIDATE_OR_GOTO ("bdb-ll", (ret == 0), out); - } - - key.data = (void *)key_string; - key.size = strlen (key_string); - - /* NOTE: bdb lets us expand the file, suppose value.size > value.len, - * then value.len bytes from value.doff offset and value.size bytes - * will be written from value.doff and data from - * value.doff + value.dlen will be pushed value.doff + value.size - */ - value.data = (void *)buf; - - if (flags & BDB_TRUNCATE_RECORD) { - value.size = size; - value.doff = 0; - value.dlen = offset; - } else { - value.size = size; - value.dlen = size; - value.doff = offset; - } - value.flags = DB_DBT_PARTIAL; - if (buf == NULL && size == 0) - /* truncate called us */ - value.flags = 0; - - do { - ret = storage->put (storage, txnid, &key, &value, db_flags); - if (ret == DB_LOCK_DEADLOCK) { - retries++; - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_DB_PUT %s - %s" - "(deadlock detected, retying for %d time)", - bctx->directory, key_string, retries); - } else if (ret) { - /* write failed */ - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_DB_PUT %s - %s: %s" - "(failed to put specified 
entry into database)", - bctx->directory, key_string, db_strerror (ret)); - need_break = 1; - } else { - /* successfully wrote */ - ret = 0; - need_break = 1; - } - } while (!need_break); -out: - return ret; -}/* bdb_db_put */ - -int32_t -bdb_db_icreate (struct bdb_ctx *bctx, const char *key) -{ - return bdb_db_put (bctx, NULL, key, NULL, 0, 0, 0); -} - -/* TODO: handle errors here and log. propogate only the errno to caller */ -int32_t -bdb_db_fwrite (struct bdb_fd *bfd, char *buf, size_t size, off_t offset) -{ - return bdb_db_put (bfd->ctx, NULL, bfd->key, buf, size, offset, 0); -} - -/* TODO: handle errors here and log. propogate only the errno to caller */ -int32_t -bdb_db_iwrite (struct bdb_ctx *bctx, const char *key, char *buf, size_t size) -{ - return bdb_db_put (bctx, NULL, key, buf, size, 0, 0); -} - -int32_t -bdb_db_itruncate (struct bdb_ctx *bctx, const char *key) -{ - return bdb_db_put (bctx, NULL, key, NULL, 0, 1, 0); -} - -/* bdb_storage_del - delete a key/value pair corresponding to @path from - * corresponding db file. - * - * @bctx: bctx_t * corresponding to the parent directory of @path. - * (should always be a valid bctx). bdb_storage_del should never be called - * if @bctx = NULL. - * @txnid: NULL if bdb_storage_del is not embedded in an explicit transaction - * or a valid DB_TXN *, when embedded in an explicit transaction. - * @path: path to the file, whose key/value pair has to be deleted. - * - * NOTE: bdb_storage_del tries to open DB, if @bctx->dbp == NULL - * (@bctx->dbp == NULL, nobody has opened DB till now or DB was closed by - * bdb_table_prune()). - * - * return: 0 on success or -1 on error. 
- */ -static int32_t -bdb_db_del (bctx_t *bctx, - DB_TXN *txnid, - const char *key_string) -{ - DB *storage = NULL; - DBT key = {0,}; - int32_t ret = -1; - int32_t db_flags = 0; - uint8_t need_break = 0; - int32_t retries = 1; - - LOCK (&bctx->lock); - { - if (bctx->primary == NULL) { - ret = bdb_db_open (bctx); - storage = bctx->primary; - } else { - /* we are just fine, lets continue */ - storage = bctx->primary; - } - } - UNLOCK (&bctx->lock); - - GF_VALIDATE_OR_GOTO ("bdb-ll", storage, out); - - ret = bdb_cache_delete (bctx, key_string); - GF_VALIDATE_OR_GOTO ("bdb-ll", (ret == 0), out); - - key.data = (char *)key_string; - key.size = strlen (key_string); - key.flags = DB_DBT_USERMEM; - - do { - ret = storage->del (storage, txnid, &key, db_flags); - - if (ret == DB_NOTFOUND) { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_DB_DEL %s - %s: ENOENT" - "(failed to delete entry, could not be " - "found in the database)", - bctx->directory, key_string); - need_break = 1; - } else if (ret == DB_LOCK_DEADLOCK) { - retries++; - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_DB_DEL %s - %s" - "(deadlock detected, retying for %d time)", - bctx->directory, key_string, retries); - } else if (ret == 0) { - /* successfully deleted the entry */ - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_DB_DEL %s - %s" - "(successfully deleted entry from database)", - bctx->directory, key_string); - ret = 0; - need_break = 1; - } else { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_DB_DEL %s - %s: %s" - "(failed to delete entry from database)", - bctx->directory, key_string, db_strerror (ret)); - ret = -1; - need_break = 1; - } - } while (!need_break); -out: - return ret; -} - -int32_t -bdb_db_iremove (bctx_t *bctx, - const char *key) -{ - return bdb_db_del (bctx, NULL, key); -} - -/* NOTE: bdb version compatibility wrapper */ -int32_t -bdb_cursor_get (DBC *cursorp, - DBT *sec, DBT *pri, - DBT *val, - int32_t flags) -{ - int32_t ret = -1; - - GF_VALIDATE_OR_GOTO ("bdb-ll", cursorp, out); - -#ifdef 
HAVE_BDB_CURSOR_GET - ret = cursorp->pget (cursorp, sec, pri, val, flags); -#else - ret = cursorp->c_pget (cursorp, sec, pri, val, flags); -#endif - if ((ret != 0) && (ret != DB_NOTFOUND)) { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_CURSOR_GET: %s" - "(failed to retrieve entry from database cursor)", - db_strerror (ret)); - } - -out: - return ret; -}/* bdb_cursor_get */ - -int32_t -bdb_dirent_size (DBT *key) -{ - return ALIGN (24 /* FIX MEEEE!!! */ + key->size); -} - - - -/* bdb_dbenv_init - initialize DB_ENV - * - * initialization includes: - * 1. opening DB_ENV (db_env_create(), DB_ENV->open()). - * NOTE: see private->envflags for flags used. - * 2. DB_ENV->set_lg_dir - set log directory to be used for storing log files - * (log files are the files in which transaction logs are written by db). - * 3. DB_ENV->set_flags (DB_LOG_AUTOREMOVE) - set DB_ENV to automatically - * clear the unwanted log files (flushed at each checkpoint). - * 4. DB_ENV->set_errfile - set errfile to be used by db to report detailed - * error logs. used only for debbuging purpose. - * - * return: returns a valid DB_ENV * on success or NULL on error. - * - */ -static DB_ENV * -bdb_dbenv_init (xlator_t *this, - char *directory) -{ - /* Create a DB environment */ - DB_ENV *dbenv = NULL; - int32_t ret = 0; - bdb_private_t *private = NULL; - int32_t fatal_flags = 0; - - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (directory, err); - - private = this->private; - VALIDATE_OR_GOTO (private, err); - - ret = db_env_create (&dbenv, 0); - VALIDATE_OR_GOTO ((ret == 0), err); - - /* NOTE: set_errpfx returns 'void' */ - dbenv->set_errpfx(dbenv, this->name); - - ret = dbenv->set_lk_detect (dbenv, DB_LOCK_DEFAULT); - VALIDATE_OR_GOTO ((ret == 0), err); - - ret = dbenv->open(dbenv, directory, - private->envflags, - S_IRUSR | S_IWUSR); - if ((ret != 0) && (ret != DB_RUNRECOVERY)) { - gf_log (this->name, GF_LOG_CRITICAL, - "failed to join Berkeley DB environment at %s: %s." 
- "please run manual recovery and retry running " - "glusterfs", - directory, db_strerror (ret)); - dbenv = NULL; - goto err; - } else if (ret == DB_RUNRECOVERY) { - fatal_flags = ((private->envflags & (~DB_RECOVER)) - | DB_RECOVER_FATAL); - ret = dbenv->open(dbenv, directory, fatal_flags, - S_IRUSR | S_IWUSR); - if (ret != 0) { - gf_log (this->name, GF_LOG_CRITICAL, - "failed to join Berkeley DB environment in " - "recovery mode at %s: %s. please run manual " - "recovery and retry running glusterfs", - directory, db_strerror (ret)); - dbenv = NULL; - goto err; - } - } - - ret = 0; -#if (DB_VERSION_MAJOR == 4 && \ - DB_VERSION_MINOR == 7) - if (private->log_auto_remove) { - ret = dbenv->log_set_config (dbenv, DB_LOG_AUTO_REMOVE, 1); - } else { - ret = dbenv->log_set_config (dbenv, DB_LOG_AUTO_REMOVE, 0); - } -#else - if (private->log_auto_remove) { - ret = dbenv->set_flags (dbenv, DB_LOG_AUTOREMOVE, 1); - } else { - ret = dbenv->set_flags (dbenv, DB_LOG_AUTOREMOVE, 0); - } -#endif - if (ret < 0) { - gf_log ("bdb-ll", GF_LOG_ERROR, - "autoremoval of transactional log files could not be " - "configured (%s). you may have to do a manual " - "monitoring of transactional log files and remove " - "periodically.", - db_strerror (ret)); - goto err; - } - - if (private->transaction) { - ret = dbenv->set_flags(dbenv, DB_AUTO_COMMIT, 1); - - if (ret != 0) { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "configuration of auto-commit failed for " - "database environment at %s. none of the " - "operations will be embedded in transaction " - "unless explicitly done so.", - db_strerror (ret)); - goto err; - } - - if (private->txn_timeout) { - ret = dbenv->set_timeout (dbenv, private->txn_timeout, - DB_SET_TXN_TIMEOUT); - if (ret != 0) { - gf_log ("bdb-ll", GF_LOG_ERROR, - "could not configure Berkeley DB " - "transaction timeout to %d (%s). 
please" - " review 'option transaction-timeout %d" - "' option.", - private->txn_timeout, - db_strerror (ret), - private->txn_timeout); - goto err; - } - } - - if (private->lock_timeout) { - ret = dbenv->set_timeout(dbenv, - private->txn_timeout, - DB_SET_LOCK_TIMEOUT); - if (ret < 0) { - gf_log ("bdb-ll", GF_LOG_ERROR, - "could not configure Berkeley DB " - "lock timeout to %d (%s). please" - " review 'option lock-timeout %d" - "' option.", - private->lock_timeout, - db_strerror (ret), - private->lock_timeout); - goto err; - } - } - - ret = dbenv->set_lg_dir (dbenv, private->logdir); - if (ret < 0) { - gf_log ("bdb-ll", GF_LOG_ERROR, - "failed to configure libdb transaction log " - "directory at %s. please review the " - "'option logdir %s' option.", - db_strerror (ret), private->logdir); - goto err; - } - } - - if (private->errfile) { - private->errfp = fopen (private->errfile, "a+"); - if (private->errfp) { - dbenv->set_errfile (dbenv, private->errfp); - } else { - gf_log ("bdb-ll", GF_LOG_ERROR, - "failed to open error logging file for " - "libdb (Berkeley DB) internal logging (%s)." - "please review the 'option errfile %s' option.", - strerror (errno), private->errfile); - goto err; - } - } - - return dbenv; -err: - if (dbenv) { - dbenv->close (dbenv, 0); - } - - return NULL; -} - -#define BDB_ENV(this) ((((struct bdb_private *)this->private)->b_table)->dbenv) - -/* bdb_checkpoint - during transactional usage, db does not directly write the - * data to db files, instead db writes a 'log' (similar to a journal entry) - * into a log file. db normally clears the log files during opening of an - * environment. since we expect a filesystem server to run for a pretty long - * duration and flushing 'log's during dbenv->open would prove very costly, if - * we accumulate the log entries for one complete run of glusterfs server. to - * flush the logs frequently, db provides a mechanism called 'checkpointing'. 
- * when we do a checkpoint, db flushes the logs to disk (writes changes to db - * files) and we can also clear the accumulated log files after checkpointing. - * NOTE: removing unwanted log files is not part of dbenv->txn_checkpoint() - * call. - * - * @data: xlator_t of the current instance of bdb xlator. - * - * bdb_checkpoint is called in a different thread from the main glusterfs - * thread. bdb xlator creates the checkpoint thread after successfully opening - * the db environment. - * NOTE: bdb_checkpoint thread shares the DB_ENV handle with the filesystem - * thread. - * - * db environment checkpointing frequency is controlled by - * 'option checkpoint-timeout <time-in-seconds>' in volfile. - * - * NOTE: checkpointing thread is started only if 'option transaction on' - * specified in volfile. checkpointing is not valid for non-transactional - * environments. - * - */ -static void * -bdb_checkpoint (void *data) -{ - xlator_t *this = NULL; - struct bdb_private *private = NULL; - DB_ENV *dbenv = NULL; - int32_t ret = 0; - uint32_t active = 0; - - this = (xlator_t *) data; - dbenv = BDB_ENV(this); - private = this->private; - - for (;;sleep (private->checkpoint_interval)) { - LOCK (&private->active_lock); - active = private->active; - UNLOCK (&private->active_lock); - - if (active) { - ret = dbenv->txn_checkpoint (dbenv, 1024, 0, 0); - if (ret) { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_CHECKPOINT: %s" - "(failed to checkpoint environment)", - db_strerror (ret)); - } else { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_CHECKPOINT: successfully " - "checkpointed"); - } - } else { - ret = dbenv->txn_checkpoint (dbenv, 1024, 0, 0); - if (ret) { - gf_log ("bdb-ll", GF_LOG_ERROR, - "_BDB_CHECKPOINT: %s" - "(final checkpointing failed. 
might " - "need to run recovery tool manually on " - "next usage of this database " - "environment)", - db_strerror (ret)); - } else { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_CHECKPOINT: final successfully " - "checkpointed"); - } - break; - } - } - - return NULL; -} - - -/* bdb_db_init - initialize bdb xlator - * - * reads the options from @options dictionary and sets appropriate values in - * @this->private. also initializes DB_ENV. - * - * return: 0 on success or -1 on error - * (with logging the error through gf_log()). - */ -int -bdb_db_init (xlator_t *this, - dict_t *options) -{ - /* create a db entry for root */ - int32_t op_ret = 0; - bdb_private_t *private = NULL; - bctx_table_t *table = NULL; - - char *checkpoint_interval_str = NULL; - char *page_size_str = NULL; - char *lru_limit_str = NULL; - char *timeout_str = NULL; - char *access_mode = NULL; - char *endptr = NULL; - char *errfile = NULL; - char *directory = NULL; - char *logdir = NULL; - char *mode = NULL; - char *mode_str = NULL; - int ret = -1; - int idx = 0; - struct stat stbuf = {0,}; - - private = this->private; - - /* cache is always on */ - private->cache = ON; - - ret = dict_get_str (options, "access-mode", &access_mode); - if ((ret == 0) - && (!strcmp (access_mode, "btree"))) { - gf_log (this->name, GF_LOG_DEBUG, - "using BTREE access mode to access libdb " - "(Berkeley DB)"); - private->access_mode = DB_BTREE; - } else { - gf_log (this->name, GF_LOG_DEBUG, - "using HASH access mode to access libdb (Berkeley DB)"); - private->access_mode = DB_HASH; - } - - ret = dict_get_str (options, "mode", &mode); - if ((ret == 0) - && (!strcmp (mode, "cache"))) { - gf_log (this->name, GF_LOG_DEBUG, - "cache data mode selected for 'storage/bdb'. 
filesystem" - " operations are not transactionally protected and " - "system crash does not guarantee recoverability of " - "data"); - private->envflags = DB_CREATE | DB_INIT_LOG | - DB_INIT_MPOOL | DB_THREAD; - private->dbflags = DB_CREATE | DB_THREAD; - private->transaction = OFF; - } else { - gf_log (this->name, GF_LOG_DEBUG, - "persistent data mode selected for 'storage/bdb'. each" - "filesystem operation is guaranteed to be Berkeley DB " - "transaction protected."); - private->transaction = ON; - private->envflags = DB_CREATE | DB_INIT_LOCK | DB_INIT_LOG | - DB_INIT_MPOOL | DB_INIT_TXN | DB_RECOVER | DB_THREAD; - private->dbflags = DB_CREATE | DB_THREAD; - - - ret = dict_get_str (options, "lock-timeout", &timeout_str); - - if (ret == 0) { - ret = gf_string2time (timeout_str, - &private->lock_timeout); - - if (private->lock_timeout > 4260000) { - /* db allows us to DB_SET_LOCK_TIMEOUT to be - * set to a maximum of 71 mins - * (4260000 milliseconds) */ - gf_log (this->name, GF_LOG_DEBUG, - "Berkeley DB lock-timeout parameter " - "(%d) is out of range. please specify" - " a valid timeout value for " - "lock-timeout and retry.", - private->lock_timeout); - goto err; - } - } - ret = dict_get_str (options, "transaction-timeout", - &timeout_str); - if (ret == 0) { - ret = gf_string2time (timeout_str, - &private->txn_timeout); - - if (private->txn_timeout > 4260000) { - /* db allows us to DB_SET_TXN_TIMEOUT to be set - * to a maximum of 71 mins - * (4260000 milliseconds) */ - gf_log (this->name, GF_LOG_DEBUG, - "Berkeley DB lock-timeout parameter " - "(%d) is out of range. 
please specify" - " a valid timeout value for " - "lock-timeout and retry.", - private->lock_timeout); - goto err; - } - } - - private->checkpoint_interval = BDB_DEFAULT_CHECKPOINT_INTERVAL; - ret = dict_get_str (options, "checkpoint-interval", - &checkpoint_interval_str); - if (ret == 0) { - ret = gf_string2time (checkpoint_interval_str, - &private->checkpoint_interval); - - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "'%"PRIu32"' is not a valid parameter " - "for checkpoint-interval option. " - "please specify a valid " - "checkpoint-interval and retry", - private->checkpoint_interval); - goto err; - } - } - } - - ret = dict_get_str (options, "file-mode", &mode_str); - if (ret == 0) { - private->file_mode = strtol (mode_str, &endptr, 8); - - if ((*endptr) || - (!IS_VALID_FILE_MODE(private->file_mode))) { - gf_log (this->name, GF_LOG_DEBUG, - "'%o' is not a valid parameter for file-mode " - "option. please specify a valid parameter for " - "file-mode and retry.", - private->file_mode); - goto err; - } - } else { - private->file_mode = DEFAULT_FILE_MODE; - } - private->symlink_mode = private->file_mode | S_IFLNK; - private->file_mode = private->file_mode | S_IFREG; - - ret = dict_get_str (options, "dir-mode", &mode_str); - if (ret == 0) { - private->dir_mode = strtol (mode_str, &endptr, 8); - if ((*endptr) || - (!IS_VALID_FILE_MODE(private->dir_mode))) { - gf_log (this->name, GF_LOG_DEBUG, - "'%o' is not a valid parameter for dir-mode " - "option. 
please specify a valid parameter for " - "dir-mode and retry.", - private->dir_mode); - goto err; - } - } else { - private->dir_mode = DEFAULT_DIR_MODE; - } - - private->dir_mode = private->dir_mode | S_IFDIR; - - table = CALLOC (1, sizeof (*table)); - if (table == NULL) { - gf_log ("bdb-ll", GF_LOG_CRITICAL, - "memory allocation for 'storage/bdb' internal " - "context table failed."); - goto err; - } - - INIT_LIST_HEAD(&(table->b_lru)); - INIT_LIST_HEAD(&(table->active)); - INIT_LIST_HEAD(&(table->purge)); - - LOCK_INIT (&table->lock); - LOCK_INIT (&table->checkpoint_lock); - - table->transaction = private->transaction; - table->access_mode = private->access_mode; - table->dbflags = private->dbflags; - table->this = this; - - ret = dict_get_str (options, "lru-limit", - &lru_limit_str); - - /* TODO: set max lockers and max txns to accomodate - * for more than lru_limit */ - if (ret == 0) { - ret = gf_string2uint32 (lru_limit_str, - &table->lru_limit); - gf_log ("bdb-ll", GF_LOG_DEBUG, - "setting lru limit of 'storage/bdb' internal context" - "table to %d. maximum of %d unused databases can be " - "open at any given point of time.", - table->lru_limit, table->lru_limit); - } else { - table->lru_limit = BDB_DEFAULT_LRU_LIMIT; - } - - ret = dict_get_str (options, "page-size", - &page_size_str); - - if (ret == 0) { - ret = gf_string2bytesize (page_size_str, - &table->page_size); - if (ret < 0) { - gf_log ("bdb-ll", GF_LOG_ERROR, - "\"%s\" is an invalid parameter to " - "\"option page-size\". please specify a valid " - "size and retry.", - page_size_str); - goto err; - } - - if (!PAGE_SIZE_IN_RANGE(table->page_size)) { - gf_log ("bdb-ll", GF_LOG_ERROR, - "\"%s\" is out of range for Berkeley DB " - "page-size. allowed page-size range is %d to " - "%d. 
please specify a page-size value in the " - "range and retry.", - page_size_str, BDB_LL_PAGE_SIZE_MIN, - BDB_LL_PAGE_SIZE_MAX); - goto err; - } - } else { - table->page_size = BDB_LL_PAGE_SIZE_DEFAULT; - } - - table->hash_size = BDB_DEFAULT_HASH_SIZE; - table->b_hash = CALLOC (BDB_DEFAULT_HASH_SIZE, - sizeof (struct list_head)); - - for (idx = 0; idx < table->hash_size; idx++) - INIT_LIST_HEAD(&(table->b_hash[idx])); - - private->b_table = table; - - ret = dict_get_str (options, "errfile", &errfile); - if (ret == 0) { - private->errfile = strdup (errfile); - gf_log (this->name, GF_LOG_DEBUG, - "using %s as error logging file for libdb (Berkeley DB " - "library) internal logging.", private->errfile); - } - - ret = dict_get_str (options, "directory", &directory); - - if (ret == 0) { - ret = dict_get_str (options, "logdir", &logdir); - - if (ret < 0) { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "using the database environment home " - "directory (%s) itself as transaction log " - "directory", directory); - private->logdir = strdup (directory); - - } else { - private->logdir = strdup (logdir); - - op_ret = stat (private->logdir, &stbuf); - if ((op_ret != 0) - || (!S_ISDIR (stbuf.st_mode))) { - gf_log ("bdb-ll", GF_LOG_ERROR, - "specified logdir %s does not exist. 
" - "please provide a valid existing " - "directory as parameter to 'option " - "logdir'", - private->logdir); - goto err; - } - } - - private->b_table->dbenv = bdb_dbenv_init (this, directory); - if (private->b_table->dbenv == NULL) { - gf_log ("bdb-ll", GF_LOG_ERROR, - "initialization of database environment " - "failed"); - goto err; - } else { - if (private->transaction) { - /* all well, start the checkpointing thread */ - LOCK_INIT (&private->active_lock); - - LOCK (&private->active_lock); - { - private->active = 1; - } - UNLOCK (&private->active_lock); - pthread_create (&private->checkpoint_thread, - NULL, bdb_checkpoint, this); - } - } - } - - return op_ret; -err: - if (table) { - FREE (table->b_hash); - FREE (table); - } - if (private) { - if (private->errfile) - FREE (private->errfile); - - if (private->logdir) - FREE (private->logdir); - } - - return -1; -} diff --git a/xlators/storage/bdb/src/bdb.c b/xlators/storage/bdb/src/bdb.c deleted file mode 100644 index 68e5227a17b..00000000000 --- a/xlators/storage/bdb/src/bdb.c +++ /dev/null @@ -1,3585 +0,0 @@ -/* - Copyright (c) 2008-2009 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - -/* bdb based storage translator - named as 'bdb' translator - * - * - * There can be only two modes for files existing on bdb translator: - * 1. 
DIRECTORY - directories are stored by bdb as regular directories on - * back-end file-system. directories also have an entry in the ns_db.db of - * their parent directory. - * 2. REGULAR FILE - regular files are stored as records in the storage_db.db - * present in the directory. regular files also have an entry in ns_db.db - * - * Internally bdb has a maximum of three different types of logical files - * associated with each directory: - * 1. storage_db.db - storage database, used to store the data corresponding to - * regular files in the form of key/value pair. file-name is the 'key' and - * data is 'value'. - * 2. directory (all subdirectories) - any subdirectory will have a regular - * directory entry. - */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#define __XOPEN_SOURCE 500 - -#include <stdint.h> -#include <sys/time.h> -#include <errno.h> -#include <ftw.h> -#include <libgen.h> - -#include "glusterfs.h" -#include "dict.h" -#include "logging.h" -#include "bdb.h" -#include "xlator.h" -#include "defaults.h" -#include "common-utils.h" - -/* to be used only by fops, nobody else */ -#define BDB_ENV(this) ((((struct bdb_private *)this->private)->b_table)->dbenv) -#define B_TABLE(this) (((struct bdb_private *)this->private)->b_table) - - -int32_t -bdb_mknod (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - mode_t mode, - dev_t dev) -{ - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - char *key_string = NULL; /* after translating path to DB key */ - char *db_path = NULL; - bctx_t *bctx = NULL; - struct stat stbuf = {0,}; - - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - - if (!S_ISREG(mode)) { - gf_log (this->name, GF_LOG_DEBUG, - "MKNOD %"PRId64"/%s (%s): EPERM" - "(mknod supported only for regular files. 
" - "file mode '%o' not supported)", - loc->parent->ino, loc->name, loc->path, mode); - op_ret = -1; - op_errno = EPERM; - goto out; - } /* if(!S_ISREG(mode)) */ - - bctx = bctx_parent (B_TABLE(this), loc->path); - if (bctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "MKNOD %"PRId64"/%s (%s): ENOMEM" - "(failed to lookup database handle)", - loc->parent->ino, loc->name, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bctx->directory); - - op_ret = lstat (db_path, &stbuf); - if (op_ret != 0) { - op_errno = EINVAL; - gf_log (this->name, GF_LOG_DEBUG, - "MKNOD %"PRId64"/%s (%s): EINVAL" - "(failed to lookup database handle)", - loc->parent->ino, loc->name, loc->path); - goto out; - } - - MAKE_KEY_FROM_PATH (key_string, loc->path); - op_ret = bdb_db_icreate (bctx, key_string); - if (op_ret > 0) { - /* create successful */ - stbuf.st_ino = bdb_inode_transform (loc->parent->ino, - key_string, - strlen (key_string)); - stbuf.st_mode = mode; - stbuf.st_size = 0; - stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, \ - stbuf.st_blksize); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "MKNOD %"PRId64"/%s (%s): ENOMEM" - "(failed to create database entry)", - loc->parent->ino, loc->name, loc->path); - op_ret = -1; - op_errno = EINVAL; /* TODO: errno sari illa */ - goto out; - }/* if (!op_ret)...else */ - -out: - if (bctx) { - /* NOTE: bctx_unref always returns success, - * see description of bctx_unref for more details */ - bctx_unref (bctx); - } - - STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf); - return 0; -} - -static inline int32_t -is_dir_empty (xlator_t *this, - loc_t *loc) -{ - int32_t ret = 1; - bctx_t *bctx = NULL; - DIR *dir = NULL; - char *real_path = NULL; - void *dbstat = NULL; - struct dirent *entry = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - - bctx = bctx_lookup (B_TABLE(this), loc->path); - if (bctx == NULL) { - ret = 
-ENOMEM; - goto out; - } - - dbstat = bdb_db_stat (bctx, NULL, 0); - if (dbstat) { - switch (bctx->table->access_mode) - { - case DB_HASH: - ret = (((DB_HASH_STAT *)dbstat)->hash_nkeys == 0); - break; - case DB_BTREE: - case DB_RECNO: - ret = (((DB_BTREE_STAT *)dbstat)->bt_nkeys == 0); - break; - case DB_QUEUE: - ret = (((DB_QUEUE_STAT *)dbstat)->qs_nkeys == 0); - break; - case DB_UNKNOWN: - gf_log (this->name, GF_LOG_CRITICAL, - "unknown access-mode set for database"); - ret = 0; - } - } else { - ret = -EBUSY; - goto out; - } - - MAKE_REAL_PATH (real_path, this, loc->path); - dir = opendir (real_path); - if (dir == NULL) { - ret = -errno; - goto out; - } - - while ((entry = readdir (dir))) { - if ((!IS_BDB_PRIVATE_FILE(entry->d_name)) && - (!IS_DOT_DOTDOT(entry->d_name))) { - ret = 0; - break; - }/* if(!IS_BDB_PRIVATE_FILE()) */ - } /* while(true) */ - closedir (dir); -out: - if (bctx) { - /* NOTE: bctx_unref always returns success, - * see description of bctx_unref for more details */ - bctx_unref (bctx); - } - - return ret; -} - -int32_t -bdb_rename (call_frame_t *frame, - xlator_t *this, - loc_t *oldloc, - loc_t *newloc) -{ - STACK_UNWIND (frame, -1, EXDEV, NULL); - return 0; -} - -int32_t -bdb_link (call_frame_t *frame, - xlator_t *this, - loc_t *oldloc, - loc_t *newloc) -{ - STACK_UNWIND (frame, -1, EXDEV, NULL, NULL); - return 0; -} - -int32_t -is_space_left (xlator_t *this, - size_t size) -{ - struct bdb_private *private = this->private; - struct statvfs stbuf = {0,}; - int32_t ret = -1; - fsblkcnt_t req_blocks = 0; - fsblkcnt_t usable_blocks = 0; - - ret = statvfs (private->export_path, &stbuf); - if (ret != 0) { - ret = 0; - } else { - req_blocks = (size / stbuf.f_frsize) + 1; - - usable_blocks = (stbuf.f_bfree - BDB_ENOSPC_THRESHOLD); - - if (req_blocks < usable_blocks) - ret = 1; - else - ret = 0; - } - - return ret; -} - -int32_t -bdb_create (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t flags, - mode_t mode, - fd_t *fd) -{ - int32_t 
op_ret = -1; - int32_t op_errno = EPERM; - char *db_path = NULL; - struct stat stbuf = {0,}; - bctx_t *bctx = NULL; - struct bdb_private *private = NULL; - char *key_string = NULL; - struct bdb_fd *bfd = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - GF_VALIDATE_OR_GOTO (this->name, fd, out); - - private = this->private; - - bctx = bctx_parent (B_TABLE(this), loc->path); - if (bctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "CREATE %"PRId64"/%s (%s): ENOMEM" - "(failed to lookup database handle)", - loc->parent->ino, loc->name, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bctx->directory); - op_ret = lstat (db_path, &stbuf); - if (op_ret != 0) { - op_errno = EINVAL; - gf_log (this->name, GF_LOG_DEBUG, - "CREATE %"PRId64"/%s (%s): EINVAL" - "(database file missing)", - loc->parent->ino, loc->name, loc->path); - goto out; - } - - MAKE_KEY_FROM_PATH (key_string, loc->path); - op_ret = bdb_db_icreate (bctx, key_string); - if (op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "CREATE %"PRId64"/%s (%s): ENOMEM" - "(failed to create database entry)", - loc->parent->ino, loc->name, loc->path); - op_errno = EINVAL; /* TODO: errno sari illa */ - goto out; - } - - /* create successful */ - bfd = CALLOC (1, sizeof (*bfd)); - if (bfd == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "CREATE %"PRId64"/%s (%s): ENOMEM" - "(failed to allocate memory for internal fd context)", - loc->parent->ino, loc->name, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - /* NOTE: bdb_get_bctx_from () returns bctx with a ref */ - bfd->ctx = bctx; - bfd->key = strdup (key_string); - if (bfd->key == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "CREATE %"PRId64" (%s): ENOMEM" - "(failed to allocate memory for internal fd->key)", - loc->ino, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - 
BDB_FCTX_SET (fd, this, bfd); - - stbuf.st_ino = bdb_inode_transform (loc->parent->ino, - key_string, - strlen (key_string)); - stbuf.st_mode = private->file_mode; - stbuf.st_size = 0; - stbuf.st_nlink = 1; - stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize); - op_ret = 0; - op_errno = 0; -out: - STACK_UNWIND (frame, op_ret, op_errno, fd, loc->inode, &stbuf); - - return 0; -} - - -/* bdb_open - * - * as input parameters bdb_open gets the file name, i.e key. bdb_open should - * effectively - * do: store key, open storage db, store storage-db pointer. - * - */ -int32_t -bdb_open (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t flags, - fd_t *fd, - int32_t wbflags) -{ - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - bctx_t *bctx = NULL; - char *key_string = NULL; - struct bdb_fd *bfd = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - GF_VALIDATE_OR_GOTO (this->name, fd, out); - - bctx = bctx_parent (B_TABLE(this), loc->path); - if (bctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "OPEN %"PRId64" (%s): ENOMEM" - "(failed to lookup database handle)", - loc->ino, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - bfd = CALLOC (1, sizeof (*bfd)); - if (bfd == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "OPEN %"PRId64" (%s): ENOMEM" - "(failed to allocate memory for internal fd context)", - loc->ino, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - /* NOTE: bctx_parent () returns bctx with a ref */ - bfd->ctx = bctx; - - MAKE_KEY_FROM_PATH (key_string, loc->path); - bfd->key = strdup (key_string); - if (bfd->key == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "OPEN %"PRId64" (%s): ENOMEM" - "(failed to allocate memory for internal fd->key)", - loc->ino, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - BDB_FCTX_SET (fd, this, bfd); - op_ret = 0; -out: - STACK_UNWIND (frame, op_ret, 
op_errno, fd); - - return 0; -} - -int32_t -bdb_readv (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, - off_t offset) -{ - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - struct iovec vec = {0,}; - struct stat stbuf = {0,}; - struct bdb_fd *bfd = NULL; - char *db_path = NULL; - int32_t read_size = 0; - struct iobref *iobref = NULL; - struct iobuf *iobuf = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, fd, out); - - BDB_FCTX_GET (fd, this, &bfd); - if (bfd == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "READV %"PRId64" - %"GF_PRI_SIZET",%"PRId64": EBADFD" - "(internal fd not found through fd)", - fd->inode->ino, size, offset); - op_errno = EBADFD; - op_ret = -1; - goto out; - } - - MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bfd->ctx->directory); - op_ret = lstat (db_path, &stbuf); - if (op_ret != 0) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "READV %"PRId64" - %"GF_PRI_SIZET",%"PRId64": EINVAL" - "(database file missing)", - fd->inode->ino, size, offset); - goto out; - } - - iobuf = iobuf_get (this->ctx->iobuf_pool); - if (!iobuf) { - gf_log (this->name, GF_LOG_ERROR, - "out of memory :("); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - /* we are ready to go */ - op_ret = bdb_db_fread (bfd, iobuf->ptr, size, offset); - read_size = op_ret; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "READV %"PRId64" - %"GF_PRI_SIZET",%"PRId64": EBADFD" - "(failed to find entry in database)", - fd->inode->ino, size, offset); - op_ret = -1; - op_errno = ENOENT; - goto out; - } else if (op_ret == 0) { - goto out; - } - - iobref = iobref_new (); - if (iobref == NULL) { - gf_log (this->name, GF_LOG_ERROR, - "out of memory :("); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - if (size < read_size) { - op_ret = size; - read_size = size; - } - - iobref_add (iobref, iobuf); - - vec.iov_base = iobuf->ptr; - vec.iov_len = read_size; - - 
stbuf.st_ino = fd->inode->ino; - stbuf.st_size = bdb_db_fread (bfd, NULL, 0, 0); - stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize); - op_ret = size; -out: - STACK_UNWIND (frame, op_ret, op_errno, &vec, 1, &stbuf, iobuf); - - if (iobref) - iobref_unref (iobref); - - if (iobuf) - iobuf_unref (iobuf); - - return 0; -} - - -int32_t -bdb_writev (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - struct iovec *vector, - int32_t count, - off_t offset, - struct iobref *iobref) -{ - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - struct stat stbuf = {0,}; - struct bdb_fd *bfd = NULL; - int32_t idx = 0; - off_t c_off = offset; - int32_t c_ret = -1; - char *db_path = NULL; - size_t total_size = 0; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, fd, out); - GF_VALIDATE_OR_GOTO (this->name, vector, out); - - BDB_FCTX_GET (fd, this, &bfd); - if (bfd == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "WRITEV %"PRId64" - %"PRId32",%"PRId64": EBADFD" - "(internal fd not found through fd)", - fd->inode->ino, count, offset); - op_ret = -1; - op_errno = EBADFD; - goto out; - } - - MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bfd->ctx->directory); - op_ret = lstat (db_path, &stbuf); - if (op_ret != 0) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "WRITEV %"PRId64" - %"PRId32",%"PRId64": EINVAL" - "(database file missing)", - fd->inode->ino, count, offset); - goto out; - } - - for (idx = 0; idx < count; idx++) - total_size += vector[idx].iov_len; - - if (!is_space_left (this, total_size)) { - gf_log (this->name, GF_LOG_ERROR, - "WRITEV %"PRId64" - %"PRId32" (%"GF_PRI_SIZET"),%" - PRId64": ENOSPC " - "(not enough space after internal measurement)", - fd->inode->ino, count, total_size, offset); - op_ret = -1; - op_errno = ENOSPC; - goto out; - } - - /* we are ready to go */ - for (idx = 0; idx < count; idx++) { - c_ret = bdb_db_fwrite (bfd, vector[idx].iov_base, - 
vector[idx].iov_len, c_off); - if (c_ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "WRITEV %"PRId64" - %"PRId32",%"PRId64": EINVAL" - "(database write at %"PRId64" failed)", - fd->inode->ino, count, offset, c_off); - break; - } else { - c_off += vector[idx].iov_len; - } - op_ret += vector[idx].iov_len; - } /* for(idx=0;...)... */ - - if (c_ret) { - /* write failed after a point, not an error */ - stbuf.st_size = bdb_db_fread (bfd, NULL, 0, 0); - stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, - stbuf.st_blksize); - goto out; - } - - /* NOTE: we want to increment stbuf->st_size, as stored in db */ - stbuf.st_size = op_ret; - stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize); - op_errno = 0; - -out: - STACK_UNWIND (frame, op_ret, op_errno, &stbuf); - return 0; -} - -int32_t -bdb_flush (call_frame_t *frame, - xlator_t *this, - fd_t *fd) -{ - int32_t op_ret = -1; - int32_t op_errno = EPERM; - struct bdb_fd *bfd = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, fd, out); - - BDB_FCTX_GET (fd, this, &bfd); - if (bfd == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "FLUSH %"PRId64": EBADFD" - "(internal fd not found through fd)", - fd->inode->ino); - op_ret = -1; - op_errno = EBADFD; - goto out; - } - - /* do nothing */ - op_ret = 0; - op_errno = 0; - -out: - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - -int32_t -bdb_release (xlator_t *this, - fd_t *fd) -{ - int32_t op_ret = -1; - int32_t op_errno = EBADFD; - struct bdb_fd *bfd = NULL; - - BDB_FCTX_GET (fd, this, &bfd); - if (bfd == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "RELEASE %"PRId64": EBADFD" - "(internal fd not found through fd)", - fd->inode->ino); - op_ret = -1; - op_errno = EBADFD; - goto out; - } - - bctx_unref (bfd->ctx); - bfd->ctx = NULL; - - if (bfd->key) - FREE (bfd->key); /* we did strdup() in bdb_open() */ - FREE (bfd); - op_ret = 0; - op_errno = 0; - -out: - return 0; -}/* 
bdb_release */ - - -int32_t -bdb_fsync (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t datasync) -{ - STACK_UNWIND (frame, 0, 0); - return 0; -}/* bdb_fsync */ - -static int gf_bdb_lk_log; - -int32_t -bdb_lk (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t cmd, - struct flock *lock) -{ - struct flock nullock = {0, }; - - if (BDB_TIMED_LOG (ENOTSUP, gf_bdb_lk_log)) { - gf_log (this->name, GF_LOG_DEBUG, - "LK %"PRId64": ENOTSUP " - "(load \"features/locks\" translator to enable " - "lock support)", - fd->inode->ino); - } - - STACK_UNWIND (frame, -1, ENOTSUP, &nullock); - return 0; -}/* bdb_lk */ - -/* bdb_lookup - * - * there are four possibilities for a file being looked up: - * 1. file exists and is a directory. - * 2. file exists and is a symlink. - * 3. file exists and is a regular file. - * 4. file does not exist. - * case 1 and 2 are handled by doing lstat() on the @loc. if the file is a - * directory or symlink, lstat() succeeds. lookup continues to check if the - * @loc belongs to case-3 only if lstat() fails. - * to check for case 3, bdb_lookup does a bdb_db_iread() for the given @loc. - * (see description of bdb_db_iread() for more details on how @loc is transformed - * into db handle and key). if check for case 1, 2 and 3 fail, we proceed to - * conclude that file doesn't exist (case 4). - * - * @frame: call frame. - * @this: xlator_t of this instance of bdb xlator. - * @loc: loc_t specifying the file to operate upon. - * @need_xattr: if need_xattr != 0, we are asked to return all the extended - * attributed of @loc, if any exist, in a dictionary. if @loc is a regular - * file and need_xattr is set, then we look for value of need_xattr. if - * need_xattr > sizo-of-the-file @loc, then the file content of @loc is - * returned in dictionary of xattr with 'glusterfs.content' as dictionary key. - * - * NOTE: bdb currently supports only directories, symlinks and regular files. 
- * - * NOTE: bdb_lookup returns the 'struct stat' of underlying file itself, in - * case of directory and symlink (st_ino is modified as bdb allocates its own - * set of inodes of all files). for regular files, bdb uses 'struct stat' of - * the database file in which the @loc is stored as templete and modifies - * st_ino (see bdb_inode_transform for more details), st_mode (can be set in - * volfile 'option file-mode <mode>'), st_size (exact size of the @loc - * contents), st_blocks (block count on the underlying filesystem to - * accomodate st_size, see BDB_COUNT_BLOCKS in bdb.h for more details). - */ -int32_t -bdb_lookup (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - dict_t *xattr_req) -{ - struct stat stbuf = {0, }; - int32_t op_ret = -1; - int32_t op_errno = ENOENT; - dict_t *xattr = NULL; - char *pathname = NULL; - char *directory = NULL; - char *real_path = NULL; - bctx_t *bctx = NULL; - char *db_path = NULL; - struct bdb_private *private = NULL; - char *key_string = NULL; - int32_t entry_size = 0; - char *file_content = NULL; - uint64_t need_xattr = 0; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - - private = this->private; - - MAKE_REAL_PATH (real_path, this, loc->path); - - pathname = strdup (loc->path); - GF_VALIDATE_OR_GOTO (this->name, pathname, out); - - directory = dirname (pathname); - GF_VALIDATE_OR_GOTO (this->name, directory, out); - - if (!strcmp (directory, loc->path)) { - /* SPECIAL CASE: looking up root */ - op_ret = lstat (real_path, &stbuf); - if (op_ret != 0) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "LOOKUP %"PRId64" (%s): %s", - loc->ino, loc->path, strerror (op_errno)); - goto out; - } - - /* bctx_lookup() returns NULL only when its time to wind up, - * we should shutdown functioning */ - bctx = bctx_lookup (B_TABLE(this), (char *)loc->path); - if (bctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "LOOKUP %"PRId64" 
(%s): ENOMEM" - "(failed to lookup database handle)", - loc->ino, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - stbuf.st_ino = 1; - stbuf.st_mode = private->dir_mode; - - op_ret = 0; - goto out; - } - - MAKE_KEY_FROM_PATH (key_string, loc->path); - op_ret = lstat (real_path, &stbuf); - if ((op_ret == 0) && (S_ISDIR (stbuf.st_mode))){ - bctx = bctx_lookup (B_TABLE(this), (char *)loc->path); - if (bctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "LOOKUP %"PRId64"/%s (%s): ENOMEM" - "(failed to lookup database handle)", - loc->parent->ino, loc->name, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - if (loc->ino) { - /* revalidating directory inode */ - stbuf.st_ino = loc->ino; - } else { - stbuf.st_ino = bdb_inode_transform (loc->parent->ino, - key_string, - strlen (key_string)); - } - stbuf.st_mode = private->dir_mode; - - op_ret = 0; - goto out; - - } else if (op_ret == 0) { - /* a symlink */ - bctx = bctx_parent (B_TABLE(this), loc->path); - if (bctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "LOOKUP %"PRId64"/%s (%s): ENOMEM" - "(failed to lookup database handle)", - loc->parent->ino, loc->name, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - if (loc->ino) { - stbuf.st_ino = loc->ino; - } else { - stbuf.st_ino = bdb_inode_transform (loc->parent->ino, - key_string, - strlen (key_string)); - } - - stbuf.st_mode = private->symlink_mode; - - op_ret = 0; - goto out; - - } - - /* for regular files */ - bctx = bctx_parent (B_TABLE(this), loc->path); - if (bctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "LOOKUP %"PRId64"/%s (%s): ENOMEM" - "(failed to lookup database handle for parent)", - loc->parent->ino, loc->name, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - if (GF_FILE_CONTENT_REQUESTED(xattr_req, &need_xattr)) { - entry_size = bdb_db_iread (bctx, key_string, &file_content); - } else { - entry_size = bdb_db_iread (bctx, key_string, NULL); - } - - op_ret = entry_size; 
- if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "LOOKUP %"PRId64"/%s (%s): ENOENT" - "(database entry not found)", - loc->parent->ino, loc->name, loc->path); - op_errno = ENOENT; - goto out; - } - - MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bctx->directory); - op_ret = lstat (db_path, &stbuf); - if (op_ret != 0) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "LOOKUP %"PRId64"/%s (%s): %s", - loc->parent->ino, loc->name, loc->path, - strerror (op_errno)); - goto out; - } - - if (entry_size - && (need_xattr >= entry_size) - && (file_content)) { - xattr = dict_new (); - op_ret = dict_set_dynptr (xattr, "glusterfs.content", - file_content, entry_size); - if (op_ret < 0) { - /* continue without giving file contents */ - FREE (file_content); - } - } else { - if (file_content) - FREE (file_content); - } - - if (loc->ino) { - /* revalidate */ - stbuf.st_ino = loc->ino; - stbuf.st_size = entry_size; - stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, - stbuf.st_blksize); - } else { - /* fresh lookup, create an inode number */ - stbuf.st_ino = bdb_inode_transform (loc->parent->ino, - key_string, - strlen (key_string)); - stbuf.st_size = entry_size; - stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, - stbuf.st_blksize); - }/* if(inode->ino)...else */ - stbuf.st_nlink = 1; - stbuf.st_mode = private->file_mode; - - op_ret = 0; -out: - if (bctx) { - /* NOTE: bctx_unref always returns success, - * see description of bctx_unref for more details */ - bctx_unref (bctx); - } - - if (pathname) - free (pathname); - - if (xattr) - dict_ref (xattr); - - STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf, xattr); - - if (xattr) - dict_unref (xattr); - - return 0; - -}/* bdb_lookup */ - -int32_t -bdb_stat (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - - struct stat stbuf = {0,}; - char *real_path = NULL; - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - struct bdb_private *private = NULL; - char *db_path = NULL; - bctx_t *bctx = NULL; - 
- GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - - private = this->private; - GF_VALIDATE_OR_GOTO (this->name, private, out); - - MAKE_REAL_PATH (real_path, this, loc->path); - - op_ret = lstat (real_path, &stbuf); - op_errno = errno; - if (op_ret == 0) { - /* directory or symlink */ - stbuf.st_ino = loc->inode->ino; - if (S_ISDIR(stbuf.st_mode)) - stbuf.st_mode = private->dir_mode; - else - stbuf.st_mode = private->symlink_mode; - /* we are done, lets unwind the stack */ - goto out; - } - - bctx = bctx_parent (B_TABLE(this), loc->path); - if (bctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "STAT %"PRId64" (%s): ENOMEM" - "(no database handle for parent)", - loc->ino, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bctx->directory); - op_ret = lstat (db_path, &stbuf); - if (op_ret < 0) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "STAT %"PRId64" (%s): %s" - "(failed to stat on database file)", - loc->ino, loc->path, strerror (op_errno)); - goto out; - } - - stbuf.st_size = bdb_db_iread (bctx, loc->path, NULL); - stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize); - stbuf.st_ino = loc->inode->ino; - -out: - if (bctx) { - /* NOTE: bctx_unref always returns success, - * see description of bctx_unref for more details */ - bctx_unref (bctx); - } - - STACK_UNWIND (frame, op_ret, op_errno, &stbuf); - - return 0; -}/* bdb_stat */ - - - -/* bdb_opendir - in the world of bdb, open/opendir is all about opening - * correspondind databases. opendir in particular, opens the database for the - * directory which is to be opened. after opening the database, a cursor to - * the database is also created. cursor helps us get the dentries one after - * the other, and cursor maintains the state about current positions in - * directory. 
pack 'pointer to db', 'pointer to the cursor' into - * struct bdb_dir and store it in fd->ctx, we get from our parent xlator. - * - * @frame: call frame - * @this: our information, as we filled during init() - * @loc: location information - * @fd: file descriptor structure (glusterfs internal) - * - * return value - immaterial, async call. - * - */ -int32_t -bdb_opendir (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - fd_t *fd) -{ - char *real_path = NULL; - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - bctx_t *bctx = NULL; - struct bdb_dir *bfd = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - GF_VALIDATE_OR_GOTO (this->name, fd, out); - - MAKE_REAL_PATH (real_path, this, loc->path); - - bctx = bctx_lookup (B_TABLE(this), (char *)loc->path); - if (bctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "OPENDIR %"PRId64" (%s): ENOMEM" - "(no database handle for directory)", - loc->ino, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - bfd = CALLOC (1, sizeof (*bfd)); - if (bfd == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "OPENDIR %"PRId64" (%s): ENOMEM" - "(failed to allocate memory for internal fd)", - loc->ino, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto err; - } - - bfd->dir = opendir (real_path); - if (bfd->dir == NULL) { - op_ret = -1; - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "OPENDIR %"PRId64" (%s): %s", - loc->ino, loc->path, strerror (op_errno)); - goto err; - } - - /* NOTE: bctx_lookup() return bctx with ref */ - bfd->ctx = bctx; - - bfd->path = strdup (real_path); - if (bfd == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "OPENDIR %"PRId64" (%s): ENOMEM" - "(failed to allocate memory for internal fd->path)", - loc->ino, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto err; - } - - BDB_FCTX_SET (fd, this, bfd); - op_ret = 0; -out: - STACK_UNWIND (frame, op_ret, op_errno, fd); - return 0; -err: - if 
(bctx) - bctx_unref (bctx); - if (bfd) { - if (bfd->dir) - closedir (bfd->dir); - - FREE (bfd); - } - - return 0; -}/* bdb_opendir */ - -int32_t -bdb_getdents (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, - off_t off, - int32_t flag) -{ - struct bdb_dir *bfd = NULL; - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - size_t filled = 0; - dir_entry_t entries = {0, }; - dir_entry_t *this_entry = NULL; - char *entry_path = NULL; - struct dirent *dirent = NULL; - off_t in_case = 0; - int32_t this_size = 0; - DBC *cursorp = NULL; - int32_t ret = -1; - int32_t real_path_len = 0; - int32_t entry_path_len = 0; - int32_t count = 0; - off_t offset = 0; - size_t tmp_name_len = 0; - struct stat db_stbuf = {0,}; - struct stat buf = {0,}; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, fd, out); - - BDB_FCTX_GET (fd, this, &bfd); - if (bfd == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "GETDENTS %"PRId64" - %"GF_PRI_SIZET",%"PRId64 - " %o: EBADFD " - "(failed to find internal context in fd)", - fd->inode->ino, size, off, flag); - op_errno = EBADFD; - op_ret = -1; - goto out; - } - - op_ret = bdb_cursor_open (bfd->ctx, &cursorp); - if (op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "GETDENTS %"PRId64" - %"GF_PRI_SIZET",%"PRId64 - ": EBADFD " - "(failed to open cursor to database handle)", - fd->inode->ino, size, off); - op_errno = EBADFD; - goto out; - } - - if (off) { - DBT sec = {0,}, pri = {0,}, val = {0,}; - sec.data = &(off); - sec.size = sizeof (off); - sec.flags = DB_DBT_USERMEM; - val.dlen = 0; - val.doff = 0; - val.flags = DB_DBT_PARTIAL; - - op_ret = bdb_cursor_get (cursorp, &sec, &pri, &val, DB_SET); - if (op_ret == DB_NOTFOUND) { - offset = off; - goto dir_read; - } - } - - while (filled <= size) { - DBT sec = {0,}, pri = {0,}, val = {0,}; - - this_entry = NULL; - - sec.flags = DB_DBT_MALLOC; - pri.flags = DB_DBT_MALLOC; - val.dlen = 0; - val.doff = 0; - val.flags = 
DB_DBT_PARTIAL; - op_ret = bdb_cursor_get (cursorp, &sec, &pri, &val, DB_NEXT); - - if (op_ret == DB_NOTFOUND) { - /* we reached end of the directory */ - op_ret = 0; - op_errno = 0; - break; - } else if (op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "GETDENTS %"PRId64" - %"GF_PRI_SIZET - ",%"PRId64":" - "(failed to read the next entry from database)", - fd->inode->ino, size, off); - op_errno = ENOENT; - break; - } /* if (op_ret == DB_NOTFOUND)...else if...else */ - - if (pri.data == NULL) { - /* NOTE: currently ignore when we get key.data == NULL. - * FIXME: we should not get key.data = NULL */ - gf_log (this->name, GF_LOG_DEBUG, - "GETDENTS %"PRId64" - %"GF_PRI_SIZET - ",%"PRId64":" - "(null key read for entry from database)", - fd->inode->ino, size, off); - continue; - }/* if(key.data)...else */ - - this_entry = CALLOC (1, sizeof (*this_entry)); - if (this_entry == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "GETDENTS %"PRId64" - %"GF_PRI_SIZET",%"PRId64 - " - %s:" - "(failed to allocate memory for an entry)", - fd->inode->ino, size, off, strerror (errno)); - op_errno = ENOMEM; - op_ret = -1; - goto out; - } - - this_entry->name = CALLOC (pri.size + 1, sizeof (char)); - if (this_entry->name == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "GETDENTS %"PRId64" - %"GF_PRI_SIZET",%"PRId64 - " - %s:" - "(failed to allocate memory for an " - "entry->name)", - fd->inode->ino, size, off, strerror (errno)); - op_errno = ENOMEM; - op_ret = -1; - goto out; - } - - memcpy (this_entry->name, pri.data, pri.size); - this_entry->buf = db_stbuf; - this_entry->buf.st_size = bdb_db_iread (bfd->ctx, - this_entry->name, NULL); - this_entry->buf.st_blocks = BDB_COUNT_BLOCKS ( - this_entry->buf.st_size, - this_entry->buf.st_blksize); - - this_entry->buf.st_ino = bdb_inode_transform (fd->inode->ino, - pri.data, - pri.size); - count++; - - this_entry->next = entries.next; - this_entry->link = ""; - entries.next = this_entry; - /* if size is 0, count can never be = size, - * so 
entire dir is read */ - if (sec.data) - FREE (sec.data); - - if (pri.data) - FREE (pri.data); - - if (count == size) - break; - }/* while */ - bdb_cursor_close (bfd->ctx, cursorp); - op_ret = count; - op_errno = 0; - if (count >= size) - goto out; -dir_read: - /* hungry kyaa? */ - if (!offset) { - rewinddir (bfd->dir); - } else { - seekdir (bfd->dir, offset); - } - - while (filled <= size) { - this_entry = NULL; - this_size = 0; - - in_case = telldir (bfd->dir); - dirent = readdir (bfd->dir); - if (!dirent) - break; - - if (IS_BDB_PRIVATE_FILE(dirent->d_name)) - continue; - - tmp_name_len = strlen (dirent->d_name); - if (entry_path_len < (real_path_len + 1 + (tmp_name_len) + 1)) { - entry_path_len = real_path_len + tmp_name_len + 1024; - entry_path = realloc (entry_path, entry_path_len); - if (entry_path == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "GETDENTS %"PRId64" - %"GF_PRI_SIZET"," - "%"PRId64" - %s: (failed to allocate " - "memory for an entry_path)", - fd->inode->ino, size, off, - strerror (errno)); - op_errno = ENOMEM; - op_ret = -1; - goto out; - } - } - - strncpy (&entry_path[real_path_len+1], dirent->d_name, - tmp_name_len); - op_ret = stat (entry_path, &buf); - if (op_ret < 0) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "GETDENTS %"PRId64" - %"GF_PRI_SIZET",%"PRId64 - " - %s:" - " (failed to stat on an entry '%s')", - fd->inode->ino, size, off, - strerror (errno), entry_path); - goto out; /* FIXME: shouldn't we continue here */ - } - - if ((flag == GF_GET_DIR_ONLY) && - ((ret != -1) && (!S_ISDIR(buf.st_mode)))) { - continue; - } - - this_entry = CALLOC (1, sizeof (*this_entry)); - if (this_entry == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "GETDENTS %"PRId64" - %"GF_PRI_SIZET",%"PRId64 - " - %s:" - "(failed to allocate memory for an entry)", - fd->inode->ino, size, off, strerror (errno)); - op_errno = ENOMEM; - op_ret = -1; - goto out; - } - - this_entry->name = strdup (dirent->d_name); - if (this_entry->name == NULL) { - gf_log 
(this->name, GF_LOG_DEBUG, - "GETDENTS %"PRId64" - %"GF_PRI_SIZET",%"PRId64 - " - %s:" - "(failed to allocate memory for an " - "entry->name)", - fd->inode->ino, size, off, strerror (errno)); - op_errno = ENOMEM; - op_ret = -1; - goto out; - } - - this_entry->buf = buf; - - this_entry->buf.st_ino = -1; - if (S_ISLNK(this_entry->buf.st_mode)) { - char linkpath[ZR_PATH_MAX] = {0,}; - ret = readlink (entry_path, linkpath, ZR_PATH_MAX); - if (ret != -1) { - linkpath[ret] = '\0'; - this_entry->link = strdup (linkpath); - } - } else { - this_entry->link = ""; - } - - count++; - - this_entry->next = entries.next; - entries.next = this_entry; - - /* if size is 0, count can never be = size, - * so entire dir is read */ - if (count == size) - break; - } - op_ret = filled; - op_errno = 0; - -out: - gf_log (this->name, GF_LOG_DEBUG, - "GETDENTS %"PRId64" - %"GF_PRI_SIZET" (%"PRId32")" - "/%"GF_PRI_SIZET",%"PRId64":" - "(failed to read the next entry from database)", - fd->inode->ino, filled, count, size, off); - - STACK_UNWIND (frame, count, op_errno, &entries); - - while (entries.next) { - this_entry = entries.next; - entries.next = entries.next->next; - FREE (this_entry->name); - FREE (this_entry); - } - - return 0; -}/* bdb_getdents */ - - -int32_t -bdb_releasedir (xlator_t *this, - fd_t *fd) -{ - int32_t op_ret = 0; - int32_t op_errno = 0; - struct bdb_dir *bfd = NULL; - - BDB_FCTX_GET (fd, this, &bfd); - if (bfd == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "RELEASEDIR %"PRId64": EBADFD", - fd->inode->ino); - op_errno = EBADFD; - op_ret = -1; - goto out; - } - - if (bfd->path) { - free (bfd->path); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "RELEASEDIR %"PRId64": (bfd->path is NULL)", - fd->inode->ino); - } - - if (bfd->dir) { - closedir (bfd->dir); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "RELEASEDIR %"PRId64": (bfd->dir is NULL)", - fd->inode->ino); - } - - if (bfd->ctx) { - bctx_unref (bfd->ctx); - } else { - gf_log (this->name, GF_LOG_DEBUG, - 
"RELEASEDIR %"PRId64": (bfd->ctx is NULL)", - fd->inode->ino); - } - - free (bfd); - -out: - return 0; -}/* bdb_releasedir */ - - -int32_t -bdb_readlink (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - size_t size) -{ - char *dest = NULL; - int32_t op_ret = -1; - int32_t op_errno = EPERM; - char *real_path = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - - dest = alloca (size + 1); - GF_VALIDATE_OR_GOTO (this->name, dest, out); - - MAKE_REAL_PATH (real_path, this, loc->path); - - op_ret = readlink (real_path, dest, size); - - if (op_ret > 0) - dest[op_ret] = 0; - - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "READLINK %"PRId64" (%s): %s", - loc->ino, loc->path, strerror (op_errno)); - } -out: - STACK_UNWIND (frame, op_ret, op_errno, dest); - - return 0; -}/* bdb_readlink */ - - -int32_t -bdb_mkdir (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - mode_t mode) -{ - int32_t op_ret = -1; - int32_t ret = -1; - int32_t op_errno = EINVAL; - char *real_path = NULL; - struct stat stbuf = {0, }; - bctx_t *bctx = NULL; - char *key_string = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - - MAKE_KEY_FROM_PATH (key_string, loc->path); - MAKE_REAL_PATH (real_path, this, loc->path); - - op_ret = mkdir (real_path, mode); - if (op_ret < 0) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "MKDIR %"PRId64" (%s): %s", - loc->ino, loc->path, strerror (op_errno)); - goto out; - } - - op_ret = chown (real_path, frame->root->uid, frame->root->gid); - if (op_ret < 0) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "MKDIR %"PRId64" (%s): %s " - "(failed to do chmod)", - loc->ino, loc->path, strerror (op_errno)); - goto err; - } - - op_ret = lstat (real_path, &stbuf); - if (op_ret < 0) { - op_errno = errno; - gf_log (this->name, 
GF_LOG_DEBUG, - "MKDIR %"PRId64" (%s): %s " - "(failed to do lstat)", - loc->ino, loc->path, strerror (op_errno)); - goto err; - } - - bctx = bctx_lookup (B_TABLE(this), (char *)loc->path); - if (bctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "MKDIR %"PRId64" (%s): ENOMEM" - "(no database handle for parent)", - loc->ino, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto err; - } - - stbuf.st_ino = bdb_inode_transform (loc->parent->ino, key_string, - strlen (key_string)); - - goto out; - -err: - ret = rmdir (real_path); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "MKDIR %"PRId64" (%s): %s" - "(failed to do rmdir)", - loc->ino, loc->path, strerror (errno)); - } - -out: - if (bctx) { - /* NOTE: bctx_unref always returns success, - * see description of bctx_unref for more details */ - bctx_unref (bctx); - } - - STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf); - - return 0; -}/* bdb_mkdir */ - - -int32_t -bdb_unlink (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - bctx_t *bctx = NULL; - char *real_path = NULL; - char *key_string = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - - bctx = bctx_parent (B_TABLE(this), loc->path); - if (bctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "UNLINK %"PRId64" (%s): ENOMEM" - "(no database handle for parent)", - loc->ino, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - MAKE_KEY_FROM_PATH (key_string, loc->path); - op_ret = bdb_db_iremove (bctx, key_string); - if (op_ret == DB_NOTFOUND) { - MAKE_REAL_PATH (real_path, this, loc->path); - op_ret = unlink (real_path); - if (op_ret != 0) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "UNLINK %"PRId64" (%s): %s" - "(symlink unlink failed)", - loc->ino, loc->path, strerror (op_errno)); - goto out; - } - } else if (op_ret == 0) { - op_errno = 0; - } -out: - if (bctx) 
{ - /* NOTE: bctx_unref always returns success, - * see description of bctx_unref for more details */ - bctx_unref (bctx); - } - - STACK_UNWIND (frame, op_ret, op_errno); - - return 0; -}/* bdb_unlink */ - - - -static int32_t -bdb_do_rmdir (xlator_t *this, - loc_t *loc) -{ - char *real_path = NULL; - int32_t ret = -1; - bctx_t *bctx = NULL; - DB_ENV *dbenv = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - - dbenv = BDB_ENV(this); - GF_VALIDATE_OR_GOTO (this->name, dbenv, out); - - MAKE_REAL_PATH (real_path, this, loc->path); - - bctx = bctx_lookup (B_TABLE(this), loc->path); - if (bctx == NULL) { - ret = -ENOMEM; - goto out; - } - - LOCK(&bctx->lock); - { - if ((bctx->primary == NULL) - || (bctx->secondary == NULL)) { - goto unlock; - } - - ret = bctx->primary->close (bctx->primary, 0); - if (ret < 0) { - ret = -EINVAL; - } - - ret = bctx->secondary->close (bctx->secondary, 0); - if (ret < 0) { - ret = -EINVAL; - } - - ret = dbenv->dbremove (dbenv, NULL, bctx->db_path, - "primary", 0); - if (ret < 0) { - ret = -EBUSY; - } - - ret = dbenv->dbremove (dbenv, NULL, bctx->db_path, - "secondary", 0); - if (ret != 0) { - ret = -EBUSY; - } - } -unlock: - UNLOCK(&bctx->lock); - - if (ret) { - goto out; - } - ret = rmdir (real_path); - -out: - if (bctx) { - /* NOTE: bctx_unref always returns success, - * see description of bctx_unref for more details */ - bctx_unref (bctx); - } - - return ret; -} - -int32_t -bdb_rmdir (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - - op_ret = is_dir_empty (this, loc); - if (op_ret < 0) { - op_errno = -op_ret; - gf_log (this->name, GF_LOG_DEBUG, - "RMDIR %"PRId64" (%s): %s" - "(internal rmdir routine returned error)", - loc->ino, loc->path, strerror (op_errno)); - } else if (op_ret == 0) { - op_ret = -1; - op_errno = ENOTEMPTY; - gf_log (this->name, GF_LOG_DEBUG, - "RMDIR %"PRId64" (%s): ENOTEMPTY", - loc->ino, loc->path); - goto out; - 
} - - op_ret = bdb_do_rmdir (this, loc); - if (op_ret < 0) { - op_errno = -op_ret; - gf_log (this->name, GF_LOG_DEBUG, - "RMDIR %"PRId64" (%s): %s" - "(internal rmdir routine returned error)", - loc->ino, loc->path, strerror (op_errno)); - goto out; - } - -out: - STACK_UNWIND (frame, op_ret, op_errno); - - return 0; -} /* bdb_rmdir */ - -int32_t -bdb_symlink (call_frame_t *frame, - xlator_t *this, - const char *linkname, - loc_t *loc) -{ - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - char *real_path = NULL; - struct stat stbuf = {0,}; - struct bdb_private *private = NULL; - bctx_t *bctx = NULL; - char *key_string = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - GF_VALIDATE_OR_GOTO (this->name, linkname, out); - - private = this->private; - GF_VALIDATE_OR_GOTO (this->name, private, out); - - MAKE_KEY_FROM_PATH (key_string, loc->path); - - MAKE_REAL_PATH (real_path, this, loc->path); - op_ret = symlink (linkname, real_path); - op_errno = errno; - if (op_ret == 0) { - op_ret = lstat (real_path, &stbuf); - if (op_ret != 0) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "SYMLINK %"PRId64" (%s): %s", - loc->ino, loc->path, strerror (op_errno)); - goto err; - } - - bctx = bctx_parent (B_TABLE(this), loc->path); - if (bctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "SYMLINK %"PRId64" (%s): ENOMEM" - "(no database handle for parent)", - loc->ino, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto err; - } - - stbuf.st_ino = bdb_inode_transform (loc->parent->ino, - key_string, - strlen (key_string)); - stbuf.st_mode = private->symlink_mode; - - goto out; - } -err: - op_ret = unlink (real_path); - op_errno = errno; - if (op_ret != 0) { - gf_log (this->name, GF_LOG_DEBUG, - "SYMLINK %"PRId64" (%s): %s" - "(failed to unlink the created symlink)", - loc->ino, loc->path, strerror (op_errno)); - } - op_ret = -1; - op_errno = ENOENT; -out: - if (bctx) { - /* 
NOTE: bctx_unref always returns success, - * see description of bctx_unref for more details */ - bctx_unref (bctx); - } - - STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf); - - return 0; -} /* bdb_symlink */ - -static int -bdb_do_chmod (xlator_t *this, - const char *path, - struct stat *stbuf) -{ - int32_t ret = -1; - - ret = lchmod (path, stbuf->st_mode); - if ((ret == -1) && (errno == ENOSYS)) { - ret = chmod (path, stbuf->st_mode); - } - - return ret; -} - -static int -bdb_do_chown (xlator_t *this, - const char *path, - struct stat *stbuf, - int32_t valid) -{ - int32_t ret = -1; - uid_t uid = -1; - gid_t gid = -1; - - if (valid & GF_SET_ATTR_UID) - uid = stbuf->st_uid; - - if (valid & GF_SET_ATTR_GID) - gid = stbuf->st_gid; - - ret = lchown (path, uid, gid); - - return ret; -} - -static int -bdb_do_utimes (xlator_t *this, - const char *path, - struct stat *stbuf) -{ - int32_t ret = -1; - struct timeval tv[2] = {{0,},{0,}}; - - tv[0].tv_sec = stbuf->st_atime; - tv[0].tv_usec = ST_ATIM_NSEC (stbuf) / 1000; - tv[1].tv_sec = stbuf->st_mtime; - tv[1].tv_usec = ST_ATIM_NSEC (stbuf) / 1000; - - ret = lutimes (path, tv); - - return ret; -} - -int32_t -bdb_setattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - struct stat *stbuf, - int32_t valid) -{ - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - char *real_path = NULL; - struct stat preop = {0,}; - struct stat postop = {0,}; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - - MAKE_REAL_PATH (real_path, this, loc->path); - op_ret = lstat (real_path, &preop); - op_errno = errno; - if (op_ret != 0) { - if (op_errno == ENOENT) { - op_errno = EPERM; - } else { - gf_log (this->name, GF_LOG_DEBUG, - "CHMOD %"PRId64" (%s): %s" - "(pre-op lstat failed)", - loc->ino, loc->path, strerror (op_errno)); - } - goto out; - } - - /* directory or symlink */ - if (valid & GF_SET_ATTR_MODE) { - op_ret = bdb_do_chmod (this, 
real_path, stbuf); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "setattr (chmod) on %s failed: %s", loc->path, - strerror (op_errno)); - goto out; - } - } - - if (valid & (GF_SET_ATTR_UID | GF_SET_ATTR_GID)){ - op_ret = bdb_do_chown (this, real_path, stbuf, valid); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "setattr (chown) on %s failed: %s", loc->path, - strerror (op_errno)); - goto out; - } - } - - if (valid & (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME)) { - op_ret = bdb_do_utimes (this, real_path, stbuf); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "setattr (utimes) on %s failed: %s", loc->path, - strerror (op_errno)); - goto out; - } - } - - op_ret = lstat (real_path, &postop); - op_errno = errno; - if (op_ret != 0) { - gf_log (this->name, GF_LOG_DEBUG, - "CHMOD %"PRId64" (%s): %s" - "(post-op lstat failed)", - loc->ino, loc->path, strerror (op_errno)); - } - -out: - STACK_UNWIND (frame, op_ret, op_errno, &preop, &postop); - - return 0; -}/* bdb_setattr */ - -int32_t -bdb_fsetattr (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - struct stat *stbuf, - int32_t valid) -{ - int32_t op_ret = -1; - int32_t op_errno = EPERM; - struct stat preop = {0,}; - struct stat postop = {0,}; - - STACK_UNWIND (frame, op_ret, op_errno, &preop, &postop); - - return 0; -}/* bdb_fsetattr */ - - -int32_t -bdb_truncate (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - off_t offset) -{ - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - char *real_path = NULL; - struct stat stbuf = {0,}; - char *db_path = NULL; - bctx_t *bctx = NULL; - char *key_string = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - - bctx = bctx_parent (B_TABLE(this), loc->path); - if (bctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "TRUNCATE %"PRId64" (%s): ENOMEM" - "(no database handle for parent)", - 
loc->ino, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - MAKE_REAL_PATH (real_path, this, loc->path); - MAKE_KEY_FROM_PATH (key_string, loc->path); - - /* now truncate */ - MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bctx->directory); - op_ret = lstat (db_path, &stbuf); - if (op_ret != 0) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "TRUNCATE %"PRId64" (%s): %s" - "(lstat on database file failed)", - loc->ino, loc->path, strerror (op_errno)); - goto out; - } - - if (loc->inode->ino) { - stbuf.st_ino = loc->inode->ino; - }else { - stbuf.st_ino = bdb_inode_transform (loc->parent->ino, - key_string, - strlen (key_string)); - } - - op_ret = bdb_db_itruncate (bctx, key_string); - if (op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "TRUNCATE %"PRId64" (%s): EINVAL" - "(truncating entry in database failed - %s)", - loc->ino, loc->path, db_strerror (op_ret)); - op_errno = EINVAL; /* TODO: better errno */ - } - -out: - if (bctx) { - /* NOTE: bctx_unref always returns success, - * see description of bctx_unref for more details */ - bctx_unref (bctx); - } - - STACK_UNWIND (frame, op_ret, op_errno, &stbuf); - - return 0; -}/* bdb_truncate */ - - -int32_t -bdb_statfs (call_frame_t *frame, - xlator_t *this, - loc_t *loc) - -{ - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - char *real_path = NULL; - struct statvfs buf = {0, }; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - - MAKE_REAL_PATH (real_path, this, loc->path); - - op_ret = statvfs (real_path, &buf); - op_errno = errno; -out: - STACK_UNWIND (frame, op_ret, op_errno, &buf); - return 0; -}/* bdb_statfs */ - -static int gf_bdb_xattr_log; - -/* bdb_setxattr - set extended attributes. - * - * bdb allows setxattr operation only on directories. - * bdb reservers 'glusterfs.file.<attribute-name>' to operate on the content - * of the files under the specified directory. 
- * 'glusterfs.file.<attribute-name>' transforms to contents of file of name - * '<attribute-name>' under specified directory. - * - * @frame: call frame. - * @this: xlator_t of this instance of bdb xlator. - * @loc: loc_t specifying the file to operate upon. - * @dict: list of extended attributes to set on @loc. - * @flags: can be XATTR_REPLACE (replace an existing extended attribute only if - * it exists) or XATTR_CREATE (create an extended attribute only if it - * doesn't already exist). - * - * - */ -int32_t -bdb_setxattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - dict_t *dict, - int flags) -{ - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - data_pair_t *trav = dict->members_list; - bctx_t *bctx = NULL; - char *real_path = NULL; - char *key = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - GF_VALIDATE_OR_GOTO (this->name, dict, out); - - MAKE_REAL_PATH (real_path, this, loc->path); - if (!S_ISDIR (loc->inode->st_mode)) { - op_ret = -1; - op_errno = ENOATTR; - goto out; - } - - while (trav) { - if (GF_FILE_CONTENT_REQUEST(trav->key) ) { - key = BDB_KEY_FROM_FREQUEST_KEY(trav->key); - - bctx = bctx_lookup (B_TABLE(this), loc->path); - if (bctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "SETXATTR %"PRId64" (%s) - %s: ENOMEM" - "(no database handle for directory)", - loc->ino, loc->path, key); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - if (flags & XATTR_REPLACE) { - op_ret = bdb_db_itruncate (bctx, key); - if (op_ret == -1) { - /* key doesn't exist in database */ - gf_log (this->name, GF_LOG_DEBUG, - "SETXATTR %"PRId64" (%s) - %s:" - " (entry not present in " - "database)", - loc->ino, loc->path, key); - op_ret = -1; - op_errno = ENOATTR; - break; - } - op_ret = bdb_db_iwrite (bctx, key, - trav->value->data, - trav->value->len); - if (op_ret != 0) { - op_ret = -1; - op_errno = ENOATTR; - break; - } - } else { - /* fresh create */ - 
op_ret = bdb_db_iwrite (bctx, key, - trav->value->data, - trav->value->len); - if (op_ret != 0) { - op_ret = -1; - op_errno = EEXIST; - break; - } else { - op_ret = 0; - op_errno = 0; - } /* if(op_ret!=0)...else */ - } /* if(flags&XATTR_REPLACE)...else */ - if (bctx) { - /* NOTE: bctx_unref always returns success, see - * description of bctx_unref for more details */ - bctx_unref (bctx); - } - } else { - /* do plain setxattr */ - op_ret = lsetxattr (real_path, - trav->key, trav->value->data, - trav->value->len, - flags); - op_errno = errno; - - if ((op_errno == ENOATTR) || (op_errno == EEXIST)) { - /* don't log, normal behaviour */ - ; - } else if (BDB_TIMED_LOG (op_errno, gf_bdb_xattr_log)) { - gf_log (this->name, GF_LOG_DEBUG, - "SETXATTR %"PRId64" (%s) - %s: %s", - loc->ino, loc->path, trav->key, - strerror (op_errno)); - /* do not continue, break out */ - break; - } else { - gf_log (this->name, GF_LOG_DEBUG, - "SETXATTR %"PRId64" (%s) - %s: %s", - loc->ino, loc->path, trav->key, - strerror (op_errno)); - } - } /* if(ZR_FILE_CONTENT_REQUEST())...else */ - trav = trav->next; - }/* while(trav) */ -out: - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -}/* bdb_setxattr */ - - -/* bdb_gettxattr - get extended attributes. - * - * bdb allows getxattr operation only on directories. - * bdb_getxattr retrieves the whole content of the file, when - * glusterfs.file.<attribute-name> is specified. - * - * @frame: call frame. - * @this: xlator_t of this instance of bdb xlator. - * @loc: loc_t specifying the file to operate upon. - * @name: name of extended attributes to get for @loc. - * - * NOTE: see description of bdb_setxattr for details on how - * 'glusterfs.file.<attribute-name>' is handles by bdb. 
- */ -int32_t -bdb_getxattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - const char *name) -{ - int32_t op_ret = 0; - int32_t op_errno = 0; - dict_t *dict = NULL; - bctx_t *bctx = NULL; - char *buf = NULL; - char *key_string = NULL; - int32_t list_offset = 0; - size_t size = 0; - size_t remaining_size = 0; - char *real_path = NULL; - char key[1024] = {0,}; - char *value = NULL; - char *list = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - GF_VALIDATE_OR_GOTO (this->name, name, out); - - dict = dict_new (); - GF_VALIDATE_OR_GOTO (this->name, dict, out); - - if (!S_ISDIR (loc->inode->st_mode)) { - gf_log (this->name, GF_LOG_DEBUG, - "GETXATTR %"PRId64" (%s) - %s: ENOATTR " - "(not a directory)", - loc->ino, loc->path, name); - op_ret = -1; - op_errno = ENOATTR; - goto out; - } - - if (name && GF_FILE_CONTENT_REQUEST(name)) { - bctx = bctx_lookup (B_TABLE(this), loc->path); - if (bctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "GETXATTR %"PRId64" (%s) - %s: ENOMEM" - "(no database handle for directory)", - loc->ino, loc->path, name); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - key_string = BDB_KEY_FROM_FREQUEST_KEY(name); - - op_ret = bdb_db_iread (bctx, key_string, &buf); - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "GETXATTR %"PRId64" (%s) - %s: ENOATTR" - "(attribute not present in database)", - loc->ino, loc->path, name); - op_errno = ENOATTR; - goto out; - } - - op_ret = dict_set_dynptr (dict, (char *)name, buf, op_ret); - if (op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "GETXATTR %"PRId64" (%s) - %s: ENOATTR" - "(attribute present in database, " - "dict set failed)", - loc->ino, loc->path, name); - op_errno = ENODATA; - } - - goto out; - } - - MAKE_REAL_PATH (real_path, this, loc->path); - size = sys_llistxattr (real_path, NULL, 0); - op_errno = errno; - if (size < 0) { - if (BDB_TIMED_LOG (op_errno, gf_bdb_xattr_log)) 
{ - gf_log (this->name, GF_LOG_DEBUG, - "GETXATTR %"PRId64" (%s) - %s: %s", - loc->ino, loc->path, name, strerror (op_errno)); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "GETXATTR %"PRId64" (%s) - %s: %s", - loc->ino, loc->path, name, strerror (op_errno)); - } - op_ret = -1; - op_errno = ENOATTR; - - goto out; - } - - if (size == 0) - goto done; - - list = alloca (size + 1); - if (list == NULL) { - op_ret = -1; - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "GETXATTR %"PRId64" (%s) - %s: %s", - loc->ino, loc->path, name, strerror (op_errno)); - } - - size = sys_llistxattr (real_path, list, size); - op_ret = size; - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "GETXATTR %"PRId64" (%s) - %s: %s", - loc->ino, loc->path, name, strerror (op_errno)); - goto out; - } - - remaining_size = size; - list_offset = 0; - while (remaining_size > 0) { - if(*(list+list_offset) == '\0') - break; - - strcpy (key, list + list_offset); - - op_ret = sys_lgetxattr (real_path, key, NULL, 0); - if (op_ret == -1) - break; - - value = CALLOC (op_ret + 1, sizeof(char)); - GF_VALIDATE_OR_GOTO (this->name, value, out); - - op_ret = sys_lgetxattr (real_path, key, value, - op_ret); - if (op_ret == -1) - break; - value [op_ret] = '\0'; - op_ret = dict_set_dynptr (dict, key, - value, op_ret); - if (op_ret < 0) { - FREE (value); - gf_log (this->name, GF_LOG_DEBUG, - "GETXATTR %"PRId64" (%s) - %s: " - "(skipping key %s)", - loc->ino, loc->path, name, key); - continue; - } - remaining_size -= strlen (key) + 1; - list_offset += strlen (key) + 1; - } /* while(remaining_size>0) */ -done: -out: - if(bctx) { - /* NOTE: bctx_unref always returns success, - * see description of bctx_unref for more details */ - bctx_unref (bctx); - } - - STACK_UNWIND (frame, op_ret, op_errno, dict); - - if (dict) - dict_unref (dict); - - return 0; -}/* bdb_getxattr */ - - -int32_t -bdb_removexattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - const char *name) -{ - 
int32_t op_ret = -1; - int32_t op_errno = EINVAL; - bctx_t *bctx = NULL; - char *real_path = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - GF_VALIDATE_OR_GOTO (this->name, name, out); - - if (!S_ISDIR(loc->inode->st_mode)) { - gf_log (this->name, GF_LOG_DEBUG, - "REMOVEXATTR %"PRId64" (%s) - %s: ENOATTR " - "(not a directory)", - loc->ino, loc->path, name); - op_ret = -1; - op_errno = ENOATTR; - goto out; - } - - if (GF_FILE_CONTENT_REQUEST(name)) { - bctx = bctx_lookup (B_TABLE(this), loc->path); - if (bctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "REMOVEXATTR %"PRId64" (%s) - %s: ENOATTR" - "(no database handle for directory)", - loc->ino, loc->path, name); - op_ret = -1; - op_errno = ENOATTR; - goto out; - } - - op_ret = bdb_db_iremove (bctx, name); - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "REMOVEXATTR %"PRId64" (%s) - %s: ENOATTR" - "(no such attribute in database)", - loc->ino, loc->path, name); - op_errno = ENOATTR; - } - goto out; - } - - MAKE_REAL_PATH(real_path, this, loc->path); - op_ret = lremovexattr (real_path, name); - op_errno = errno; - if (op_ret == -1) { - if (BDB_TIMED_LOG (op_errno, gf_bdb_xattr_log)) { - gf_log (this->name, GF_LOG_DEBUG, - "REMOVEXATTR %"PRId64" (%s) - %s: %s", - loc->ino, loc->path, name, strerror (op_errno)); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "REMOVEXATTR %"PRId64" (%s) - %s: %s", - loc->ino, loc->path, name, strerror (op_errno)); - } - } /* if(op_ret == -1) */ -out: - if (bctx) { - /* NOTE: bctx_unref always returns success, - * see description of bctx_unref for more details */ - bctx_unref (bctx); - } - - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -}/* bdb_removexattr */ - - -int32_t -bdb_fsyncdir (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int datasync) -{ - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - struct bdb_fd *bfd = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", 
frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, fd, out); - - BDB_FCTX_GET (fd, this, &bfd); - if (bfd == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "FSYNCDIR %"PRId64": EBADFD" - "(failed to find internal context from fd)", - fd->inode->ino); - op_errno = EBADFD; - op_ret = -1; - } - -out: - STACK_UNWIND (frame, op_ret, op_errno); - - return 0; -}/* bdb_fsycndir */ - - -int32_t -bdb_access (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t mask) -{ - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - char *real_path = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - - MAKE_REAL_PATH (real_path, this, loc->path); - - op_ret = access (real_path, mask); - op_errno = errno; - /* TODO: implement for db entries */ -out: - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -}/* bdb_access */ - - -int32_t -bdb_ftruncate (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - off_t offset) -{ - int32_t op_ret = -1; - int32_t op_errno = EPERM; - struct stat buf = {0,}; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, fd, out); - /* TODO: impelement */ -out: - STACK_UNWIND (frame, op_ret, op_errno, &buf); - - return 0; -} - - - -int32_t -bdb_setdents (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t flags, - dir_entry_t *entries, - int32_t count) -{ - int32_t op_ret = -1, op_errno = EINVAL; - char *entry_path = NULL; - int32_t real_path_len = 0; - int32_t entry_path_len = 0; - int32_t ret = 0; - struct bdb_dir *bfd = NULL; - dir_entry_t *trav = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, fd, out); - GF_VALIDATE_OR_GOTO (this->name, entries, out); - - BDB_FCTX_GET (fd, this, &bfd); - if (bfd == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "SETDENTS %"PRId64": 
EBADFD", - fd->inode->ino); - op_errno = EBADFD; - op_ret = -1; - goto out; - } - - real_path_len = strlen (bfd->path); - entry_path_len = real_path_len + 256; - entry_path = CALLOC (1, entry_path_len); - GF_VALIDATE_OR_GOTO (this->name, entry_path, out); - - strcpy (entry_path, bfd->path); - entry_path[real_path_len] = '/'; - - trav = entries->next; - while (trav) { - char pathname[ZR_PATH_MAX] = {0,}; - strcpy (pathname, entry_path); - strcat (pathname, trav->name); - - if (S_ISDIR(trav->buf.st_mode)) { - /* If the entry is directory, create it by calling - * 'mkdir'. If directory is not present, it will be - * created, if its present, no worries even if it fails. - */ - ret = mkdir (pathname, trav->buf.st_mode); - if ((ret == -1) && (errno != EEXIST)) { - op_errno = errno; - op_ret = ret; - gf_log (this->name, GF_LOG_DEBUG, - "SETDENTS %"PRId64" - %s: %s " - "(mkdir failed)", - fd->inode->ino, pathname, - strerror (op_errno)); - goto loop; - } - - /* Change the mode - * NOTE: setdents tries its best to restore the state - * of storage. 
if chmod and chown fail, they can - * be ignored now */ - ret = chmod (pathname, trav->buf.st_mode); - if (ret < 0) { - op_ret = -1; - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "SETDENTS %"PRId64" - %s: %s " - "(chmod failed)", - fd->inode->ino, pathname, - strerror (op_errno)); - goto loop; - } - /* change the ownership */ - ret = chown (pathname, trav->buf.st_uid, - trav->buf.st_gid); - if (ret != 0) { - op_ret = -1; - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "SETDENTS %"PRId64" - %s: %s " - "(chown failed)", - fd->inode->ino, pathname, - strerror (op_errno)); - goto loop; - } - } else if ((flags == GF_SET_IF_NOT_PRESENT) || - (flags != GF_SET_DIR_ONLY)) { - /* Create a 0 byte file here */ - if (S_ISREG (trav->buf.st_mode)) { - op_ret = bdb_db_icreate (bfd->ctx, - trav->name); - if (op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "SETDENTS %"PRId64" (%s) - %s: " - "%s (database entry creation" - " failed)", - fd->inode->ino, - bfd->ctx->directory, trav->name, - strerror (op_errno)); - } - } else if (S_ISLNK (trav->buf.st_mode)) { - /* TODO: impelement */; - } else { - gf_log (this->name, GF_LOG_DEBUG, - "SETDENTS %"PRId64" (%s) - %s mode=%o: " - "(unsupported file type)", - fd->inode->ino, - bfd->ctx->directory, trav->name, - trav->buf.st_mode); - } /* if(S_ISREG())...else */ - } /* if(S_ISDIR())...else if */ - loop: - /* consider the next entry */ - trav = trav->next; - } /* while(trav) */ - -out: - STACK_UNWIND (frame, op_ret, op_errno); - - FREE (entry_path); - return 0; -} - -int32_t -bdb_fstat (call_frame_t *frame, - xlator_t *this, - fd_t *fd) -{ - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - struct stat stbuf = {0,}; - struct bdb_fd *bfd = NULL; - bctx_t *bctx = NULL; - char *db_path = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, fd, out); - - BDB_FCTX_GET (fd, this, &bfd); - if (bfd == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - 
"FSTAT %"PRId64": EBADFD " - "(failed to find internal context in fd)", - fd->inode->ino); - op_errno = EBADFD; - op_ret = -1; - goto out; - } - - bctx = bfd->ctx; - - MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bctx->directory); - op_ret = lstat (db_path, &stbuf); - op_errno = errno; - if (op_ret != 0) { - gf_log (this->name, GF_LOG_DEBUG, - "FSTAT %"PRId64": %s" - "(failed to stat database file %s)", - fd->inode->ino, strerror (op_errno), db_path); - goto out; - } - - stbuf.st_ino = fd->inode->ino; - stbuf.st_size = bdb_db_fread (bfd, NULL, 0, 0); - stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize); - -out: - STACK_UNWIND (frame, op_ret, op_errno, &stbuf); - return 0; -} - -gf_dirent_t * -gf_dirent_for_namen (const char *name, - size_t len) -{ - char *tmp_name = NULL; - - tmp_name = alloca (len + 1); - - memcpy (tmp_name, name, len); - - tmp_name[len] = 0; - - return gf_dirent_for_name (tmp_name); -} - -int32_t -bdb_readdir (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, - off_t off) -{ - struct bdb_dir *bfd = NULL; - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - size_t filled = 0; - gf_dirent_t *this_entry = NULL; - gf_dirent_t entries; - struct dirent *entry = NULL; - off_t in_case = 0; - int32_t this_size = 0; - DBC *cursorp = NULL; - int32_t count = 0; - off_t offset = 0; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, fd, out); - - INIT_LIST_HEAD (&entries.list); - - BDB_FCTX_GET (fd, this, &bfd); - if (bfd == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "READDIR %"PRId64" - %"GF_PRI_SIZET",%"PRId64": EBADFD " - "(failed to find internal context in fd)", - fd->inode->ino, size, off); - op_errno = EBADFD; - op_ret = -1; - goto out; - } - - op_ret = bdb_cursor_open (bfd->ctx, &cursorp); - if (op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "READDIR %"PRId64" - %"GF_PRI_SIZET",%"PRId64": EBADFD " - "(failed to open cursor to database 
handle)", - fd->inode->ino, size, off); - op_errno = EBADFD; - goto out; - } - - if (off) { - DBT sec = {0,}, pri = {0,}, val = {0,}; - sec.data = &(off); - sec.size = sizeof (off); - sec.flags = DB_DBT_USERMEM; - val.dlen = 0; - val.doff = 0; - val.flags = DB_DBT_PARTIAL; - - op_ret = bdb_cursor_get (cursorp, &sec, &pri, &val, DB_SET); - if (op_ret == DB_NOTFOUND) { - offset = off; - goto dir_read; - } - } - - while (filled <= size) { - DBT sec = {0,}, pri = {0,}, val = {0,}; - - this_entry = NULL; - - sec.flags = DB_DBT_MALLOC; - pri.flags = DB_DBT_MALLOC; - val.dlen = 0; - val.doff = 0; - val.flags = DB_DBT_PARTIAL; - op_ret = bdb_cursor_get (cursorp, &sec, &pri, &val, DB_NEXT); - - if (op_ret == DB_NOTFOUND) { - /* we reached end of the directory */ - op_ret = 0; - op_errno = 0; - break; - } else if (op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "READDIR %"PRId64" - %"GF_PRI_SIZET",%"PRId64":" - "(failed to read the next entry from database)", - fd->inode->ino, size, off); - op_errno = ENOENT; - break; - } /* if (op_ret == DB_NOTFOUND)...else if...else */ - - if (pri.data == NULL) { - /* NOTE: currently ignore when we get key.data == NULL. 
- * TODO: we should not get key.data = NULL */ - gf_log (this->name, GF_LOG_DEBUG, - "READDIR %"PRId64" - %"GF_PRI_SIZET",%"PRId64":" - "(null key read for entry from database)", - fd->inode->ino, size, off); - continue; - }/* if(key.data)...else */ - count++; - this_size = bdb_dirent_size (&pri); - if (this_size + filled > size) - break; - /* TODO - consider endianness here */ - this_entry = gf_dirent_for_namen ((const char *)pri.data, - pri.size); - - this_entry->d_ino = bdb_inode_transform (fd->inode->ino, - pri.data, - pri.size); - this_entry->d_off = *(uint32_t *)sec.data; - this_entry->d_type = 0; - this_entry->d_len = pri.size + 1; - - if (sec.data) { - FREE (sec.data); - } - - if (pri.data) - FREE (pri.data); - - list_add_tail (&this_entry->list, &entries.list); - - filled += this_size; - }/* while */ - bdb_cursor_close (bfd->ctx, cursorp); - op_ret = filled; - op_errno = 0; - if (filled >= size) { - goto out; - } -dir_read: - /* hungry kyaa? */ - if (!offset) { - rewinddir (bfd->dir); - } else { - seekdir (bfd->dir, offset); - } - - while (filled <= size) { - this_entry = NULL; - entry = NULL; - this_size = 0; - - in_case = telldir (bfd->dir); - entry = readdir (bfd->dir); - if (!entry) - break; - - if (IS_BDB_PRIVATE_FILE(entry->d_name)) - continue; - - this_size = dirent_size (entry); - - if (this_size + filled > size) { - seekdir (bfd->dir, in_case); - break; - } - - count++; - - this_entry = gf_dirent_for_name (entry->d_name); - this_entry->d_ino = entry->d_ino; - - this_entry->d_off = entry->d_off; - - this_entry->d_type = entry->d_type; - this_entry->d_len = entry->d_reclen; - - - list_add_tail (&this_entry->list, &entries.list); - - filled += this_size; - } - op_ret = filled; - op_errno = 0; - -out: - gf_log (this->name, GF_LOG_DEBUG, - "READDIR %"PRId64" - %"GF_PRI_SIZET" (%"PRId32")" - "/%"GF_PRI_SIZET",%"PRId64":" - "(failed to read the next entry from database)", - fd->inode->ino, filled, count, size, off); - - STACK_UNWIND (frame, count, 
op_errno, &entries); - - gf_dirent_free (&entries); - - return 0; -} - - -int32_t -bdb_stats (call_frame_t *frame, - xlator_t *this, - int32_t flags) - -{ - int32_t op_ret = 0; - int32_t op_errno = 0; - - struct xlator_stats xlstats = {0, }, *stats = NULL; - struct statvfs buf = {0,}; - struct timeval tv; - struct bdb_private *private = NULL; - int64_t avg_read = 0; - int64_t avg_write = 0; - int64_t _time_ms = 0; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - - private = (struct bdb_private *)(this->private); - stats = &xlstats; - - op_ret = statvfs (private->export_path, &buf); - if (op_ret != 0) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "STATS %s: %s", - private->export_path, strerror (op_errno)); - goto out; - } - - stats->nr_files = private->stats.nr_files; - - /* client info is maintained at FSd */ - stats->nr_clients = private->stats.nr_clients; - - /* Number of Free block in the filesystem. */ - stats->free_disk = buf.f_bfree * buf.f_bsize; - stats->total_disk_size = buf.f_blocks * buf.f_bsize; /* */ - stats->disk_usage = (buf.f_blocks - buf.f_bavail) * buf.f_bsize; - - /* Calculate read and write usage */ - gettimeofday (&tv, NULL); - - /* Read */ - _time_ms = (tv.tv_sec - private->init_time.tv_sec) * 1000 + - ((tv.tv_usec - private->init_time.tv_usec) / 1000); - - avg_read = (_time_ms) ? (private->read_value / _time_ms) : 0;/* KBps */ - avg_write = (_time_ms) ? 
(private->write_value / _time_ms) : 0; - - _time_ms = (tv.tv_sec - private->prev_fetch_time.tv_sec) * 1000 + - ((tv.tv_usec - private->prev_fetch_time.tv_usec) / 1000); - if (_time_ms - && ((private->interval_read / _time_ms) > private->max_read)) { - private->max_read = (private->interval_read / _time_ms); - } - if (_time_ms - && ((private->interval_write / _time_ms) > private->max_write)) { - private->max_write = private->interval_write / _time_ms; - } - - stats->read_usage = avg_read / private->max_read; - stats->write_usage = avg_write / private->max_write; - - gettimeofday (&(private->prev_fetch_time), NULL); - private->interval_read = 0; - private->interval_write = 0; - -out: - STACK_UNWIND (frame, op_ret, op_errno, stats); - return 0; -} - - -int32_t -bdb_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int32_t cmd, struct flock *lock) -{ - gf_log (this->name, GF_LOG_ERROR, - "glusterfs internal locking request. please load " - "'features/locks' translator to enable glusterfs " - "support"); - - STACK_UNWIND (frame, -1, ENOSYS); - return 0; -} - - -int32_t -bdb_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, struct flock *lock) -{ - gf_log (this->name, GF_LOG_ERROR, - "glusterfs internal locking request. please load " - "'features/locks' translator to enable glusterfs " - "support"); - - STACK_UNWIND (frame, -1, ENOSYS); - return 0; -} - - -int32_t -bdb_entrylk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type) -{ - gf_log (this->name, GF_LOG_ERROR, - "glusterfs internal locking request. 
please load " - "'features/locks' translator to enable glusterfs " - "support"); - - STACK_UNWIND (frame, -1, ENOSYS); - return 0; -} - - -int32_t -bdb_fentrylk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, const char *basename, - entrylk_cmd cmd, entrylk_type type) -{ - gf_log (this->name, GF_LOG_ERROR, - "glusterfs internal locking request. please load " - "'features/locks' translator to enable glusterfs " - "support"); - - STACK_UNWIND (frame, -1, ENOSYS); - return 0; -} - -int32_t -bdb_checksum (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t flag) -{ - char *real_path = NULL; - DIR *dir = NULL; - struct dirent *dirent = NULL; - uint8_t file_checksum[NAME_MAX] = {0,}; - uint8_t dir_checksum[NAME_MAX] = {0,}; - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - int32_t idx = 0, length = 0; - bctx_t *bctx = NULL; - DBC *cursorp = NULL; - char *data = NULL; - uint8_t no_break = 1; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - - MAKE_REAL_PATH (real_path, this, loc->path); - - { - dir = opendir (real_path); - op_errno = errno; - GF_VALIDATE_OR_GOTO (this->name, dir, out); - while ((dirent = readdir (dir))) { - if (!dirent) - break; - - if (IS_BDB_PRIVATE_FILE(dirent->d_name)) - continue; - - length = strlen (dirent->d_name); - for (idx = 0; idx < length; idx++) - dir_checksum[idx] ^= dirent->d_name[idx]; - } /* while((dirent...)) */ - closedir (dir); - } - - { - bctx = bctx_lookup (B_TABLE(this), (char *)loc->path); - if (bctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "CHECKSUM %"PRId64" (%s): ENOMEM" - "(failed to lookup database handle)", - loc->inode->ino, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - op_ret = bdb_cursor_open (bctx, &cursorp); - if (op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "CHECKSUM %"PRId64" (%s): EBADFD" - "(failed to open cursor to database handle)", - loc->inode->ino, 
loc->path); - op_ret = -1; - op_errno = EBADFD; - goto out; - } - - - do { - DBT key = {0,}, value = {0,}, sec = {0,}; - - key.flags = DB_DBT_MALLOC; - value.doff = 0; - value.dlen = 0; - op_ret = bdb_cursor_get (cursorp, &sec, &key, - &value, DB_NEXT); - - if (op_ret == DB_NOTFOUND) { - op_ret = 0; - op_errno = 0; - no_break = 0; - } else if (op_ret == 0){ - /* successfully read */ - data = key.data; - length = key.size; - for (idx = 0; idx < length; idx++) - file_checksum[idx] ^= data[idx]; - - FREE (key.data); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "CHECKSUM %"PRId64" (%s)", - loc->inode->ino, loc->path); - op_ret = -1; - op_errno = ENOENT; /* TODO: watch errno */ - no_break = 0; - }/* if(op_ret == DB_NOTFOUND)...else if...else */ - } while (no_break); - bdb_cursor_close (bctx, cursorp); - } -out: - if (bctx) { - /* NOTE: bctx_unref always returns success, - * see description of bctx_unref for more details */ - bctx_unref (bctx); - } - - STACK_UNWIND (frame, op_ret, op_errno, file_checksum, dir_checksum); - - return 0; -} - -/** - * notify - when parent sends PARENT_UP, send CHILD_UP event from here - */ -int32_t -notify (xlator_t *this, - int32_t event, - void *data, - ...) -{ - switch (event) - { - case GF_EVENT_PARENT_UP: - { - /* Tell the parent that bdb xlator is up */ - assert ((this->private != NULL) && - (BDB_ENV(this) != NULL)); - default_notify (this, GF_EVENT_CHILD_UP, data); - } - break; - default: - /* */ - break; - } - return 0; -} - - - -/** - * init - - */ -int32_t -init (xlator_t *this) -{ - int32_t ret = -1; - struct stat buf = {0,}; - struct bdb_private *_private = NULL; - char *directory = NULL; - bctx_t *bctx = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", this, out); - - if (this->children) { - gf_log (this->name, GF_LOG_ERROR, - "'storage/bdb' translator should be used as leaf node " - "in translator tree. 
please remove the subvolumes" - " specified and retry."); - goto err; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_ERROR, - "'storage/bdb' translator needs at least one among " - "'protocol/server' or 'mount/fuse' translator as " - "parent. please add 'protocol/server' or 'mount/fuse' " - "as parent of 'storage/bdb' and retry. or you can also" - " try specifying mount-point on command-line."); - goto err; - } - - _private = CALLOC (1, sizeof (*_private)); - if (_private == NULL) { - gf_log (this->name, GF_LOG_ERROR, - "could not allocate memory for 'storage/bdb' " - "configuration data-structure. cannot continue from " - "here"); - goto err; - } - - - ret = dict_get_str (this->options, "directory", &directory); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "'storage/bdb' needs at least " - "'option directory <path-to-export-directory>' as " - "minimal configuration option. please specify an " - "export directory using " - "'option directory <path-to-export-directory>' and " - "retry."); - goto err; - } - - umask (000); /* umask `masking' is done at the client side */ - - /* Check whether the specified directory exists, if not create it. */ - ret = stat (directory, &buf); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "specified export path '%s' does not exist. " - "please create the export path '%s' and retry.", - directory, directory); - goto err; - } else if (!S_ISDIR (buf.st_mode)) { - gf_log (this->name, GF_LOG_ERROR, - "specified export path '%s' is not a directory. " - "please specify a valid and existing directory as " - "export directory and retry.", - directory); - goto err; - } else { - ret = 0; - } - - - _private->export_path = strdup (directory); - if (_private->export_path == NULL) { - gf_log (this->name, GF_LOG_ERROR, - "could not allocate memory for 'storage/bdb' " - "configuration data-structure. 
cannot continue from " - "here"); - goto err; - } - - _private->export_path_length = strlen (_private->export_path); - - { - /* Stats related variables */ - gettimeofday (&_private->init_time, NULL); - gettimeofday (&_private->prev_fetch_time, NULL); - _private->max_read = 1; - _private->max_write = 1; - } - - this->private = (void *)_private; - - { - ret = bdb_db_init (this, this->options); - - if (ret < 0){ - gf_log (this->name, GF_LOG_ERROR, - "database environment initialisation failed. " - "manually run database recovery tool and " - "retry to run glusterfs"); - goto err; - } else { - bctx = bctx_lookup (_private->b_table, "/"); - /* NOTE: we are not doing bctx_unref() for root bctx, - * let it remain in active list forever */ - if (bctx == NULL) { - gf_log (this->name, GF_LOG_ERROR, - "could not allocate memory for " - "'storage/bdb' configuration data-" - "structure. cannot continue from " - "here"); - goto err; - } else { - ret = 0; - goto out; - } - } - } -err: - if (_private) { - if (_private->export_path) - FREE (_private->export_path); - - FREE (_private); - } -out: - return ret; -} - -void -bctx_cleanup (struct list_head *head) -{ - bctx_t *trav = NULL; - bctx_t *tmp = NULL; - DB *storage = NULL; - DB *secondary = NULL; - - list_for_each_entry_safe (trav, tmp, head, list) { - LOCK (&trav->lock); - { - storage = trav->primary; - trav->primary = NULL; - - secondary = trav->secondary; - trav->secondary = NULL; - - list_del_init (&trav->list); - } - UNLOCK (&trav->lock); - - if (storage) { - storage->close (storage, 0); - storage = NULL; - } - - if (secondary) { - secondary->close (secondary, 0); - secondary = NULL; - } - } - return; -} - -void -fini (xlator_t *this) -{ - struct bdb_private *private = NULL; - int32_t ret = 0; - - private = this->private; - - if (B_TABLE(this)) { - /* close all the dbs from lru list */ - bctx_cleanup (&(B_TABLE(this)->b_lru)); - bctx_cleanup (&(B_TABLE(this)->active)); - - if (BDB_ENV(this)) { - LOCK 
(&private->active_lock); - { - private->active = 0; - } - UNLOCK (&private->active_lock); - - ret = pthread_join (private->checkpoint_thread, NULL); - if (ret != 0) { - gf_log (this->name, GF_LOG_CRITICAL, - "could not complete checkpointing " - "database environment. this might " - "result in inconsistencies in few" - " recent data and meta-data " - "operations"); - } - - BDB_ENV(this)->close (BDB_ENV(this), 0); - } else { - /* impossible to reach here */ - } - - FREE (B_TABLE(this)); - } - FREE (private); - return; -} - -struct xlator_mops mops = { - .stats = bdb_stats, -}; - -struct xlator_fops fops = { - .lookup = bdb_lookup, - .stat = bdb_stat, - .opendir = bdb_opendir, - .readdir = bdb_readdir, - .readlink = bdb_readlink, - .mknod = bdb_mknod, - .mkdir = bdb_mkdir, - .unlink = bdb_unlink, - .rmdir = bdb_rmdir, - .symlink = bdb_symlink, - .rename = bdb_rename, - .link = bdb_link, - .truncate = bdb_truncate, - .create = bdb_create, - .open = bdb_open, - .readv = bdb_readv, - .writev = bdb_writev, - .statfs = bdb_statfs, - .flush = bdb_flush, - .fsync = bdb_fsync, - .setxattr = bdb_setxattr, - .getxattr = bdb_getxattr, - .removexattr = bdb_removexattr, - .fsyncdir = bdb_fsyncdir, - .access = bdb_access, - .ftruncate = bdb_ftruncate, - .fstat = bdb_fstat, - .lk = bdb_lk, - .inodelk = bdb_inodelk, - .finodelk = bdb_finodelk, - .entrylk = bdb_entrylk, - .fentrylk = bdb_fentrylk, - .setdents = bdb_setdents, - .getdents = bdb_getdents, - .checksum = bdb_checksum, - .setattr = bdb_setattr, - .fsetattr = bdb_fsetattr, -}; - -struct xlator_cbks cbks = { - .release = bdb_release, - .releasedir = bdb_releasedir -}; - - -struct volume_options options[] = { - { .key = { "directory" }, - .type = GF_OPTION_TYPE_PATH, - .description = "export directory" - }, - { .key = { "logdir" }, - .type = GF_OPTION_TYPE_PATH, - .description = "directory to be used by libdb for writing" - "transaction logs. 
NOTE: in absence of 'logdir' " - "export directory itself will be used as 'logdir' also" - }, - { .key = { "errfile" }, - .type = GF_OPTION_TYPE_PATH, - .description = "path to be used for libdb error logging. " - "NOTE: absence of 'errfile' will disable any " - "error logging by libdb." - }, - { .key = { "dir-mode" }, - .type = GF_OPTION_TYPE_ANY /* base 8 number */ - }, - { .key = { "file-mode" }, - .type = GF_OPTION_TYPE_ANY, - .description = "file mode for regular files. stat() on a regular file" - " returns the mode specified by this option. " - "NOTE: specify value in octal" - }, - { .key = { "page-size" }, - .type = GF_OPTION_TYPE_SIZET, - .min = 512, - .max = 16384, - .description = "size of pages used to hold data by libdb. set it to " - "block size of exported filesystem for " - "optimal performance" - }, - { .key = { "open-db-lru-limit" }, - .type = GF_OPTION_TYPE_INT, - .min = 1, - .max = 2048, - .description = "maximum number of per directory databases that can " - "be kept open. NOTE: for _advanced_ users only." - }, - { .key = { "lock-timeout" }, - .type = GF_OPTION_TYPE_TIME, - .min = 0, - .max = 4260000, - .description = "define the maximum time a lock request can " - "be blocked by libdb. NOTE: only for _advanced_ users." - " do not specify this option when not sure." - }, - { .key = { "checkpoint-interval" }, - .type = GF_OPTION_TYPE_TIME, - .min = 1, - .max = 86400, - .description = "define the time interval between two consecutive " - "libdb checpoints. setting to lower value will leave " - "bdb perform slowly, but guarantees that minimum data" - " will be lost in case of a crash. NOTE: this option " - "is valid only when " - "'option mode=\"persistent\"' is set." - }, - { .key = { "transaction-timeout" }, - .type = GF_OPTION_TYPE_TIME, - .min = 0, - .max = 4260000, - .description = "maximum time for which a transaction can block " - "waiting for required resources." 
- }, - { .key = { "mode" }, - .type = GF_OPTION_TYPE_BOOL, - .value = { "cache", "persistent" }, - .description = "cache: data recovery is not guaranteed in case " - "of crash. persistent: data recovery is guaranteed, " - "since all operations are transaction protected." - }, - { .key = { "access-mode" }, - .type = GF_OPTION_TYPE_STR, - .value = {"btree", "hash" }, - .description = "chose the db access method. " - "NOTE: for _advanced_ users. leave the choice to " - "glusterfs when in doubt." - }, - { .key = { NULL } } -}; diff --git a/xlators/storage/bdb/src/bdb.h b/xlators/storage/bdb/src/bdb.h deleted file mode 100644 index cfe1c9b5555..00000000000 --- a/xlators/storage/bdb/src/bdb.h +++ /dev/null @@ -1,530 +0,0 @@ -/* - Copyright (c) 2008-2009 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. 
-*/ - -#ifndef _BDB_H -#define _BDB_H - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include <stdio.h> -#include <dirent.h> -#include <unistd.h> -#include <sys/types.h> -#include <dirent.h> - -#include <db.h> - -#ifdef linux -#ifdef __GLIBC__ -#include <sys/fsuid.h> -#else -#include <unistd.h> -#endif -#endif - -#ifdef HAVE_SYS_XATTR_H -#include <sys/xattr.h> -#endif - -#ifdef HAVE_SYS_EXTATTR_H -#include <sys/extattr.h> -#endif - -#include <pthread.h> -#include "xlator.h" -#include "inode.h" -#include "compat.h" -#include "compat-errno.h" -#include "fd.h" -#include "syscall.h" - -#define BDB_STORAGE "/glusterfs_storage.db" - -/* numbers are not so reader-friendly, so lets have ON and OFF macros */ -#define ON 1 -#define OFF 0 - -#define BDB_DEFAULT_LRU_LIMIT 100 -#define BDB_DEFAULT_HASH_SIZE 100 - -#define BDB_ENOSPC_THRESHOLD 25600 - -#define BDB_DEFAULT_CHECKPOINT_INTERVAL 30 - -#define BCTX_ENV(bctx) (bctx->table->dbenv) - -#define BDB_EXPORT_PATH_LEN(_private) \ - (((struct bdb_private *)_private)->export_path_length) - -#define BDB_KEY_FROM_FREQUEST_KEY(_key) (&(key[15])) - -#define BDB_EXPORT_PATH(_private) \ - (((struct bdb_private *)_private)->export_path) -/* MAKE_REAL_PATH(var,this,path) - * make the real path on the underlying file-system - * - * @var: destination to hold the real path - * @this: pointer to xlator_t corresponding to bdb xlator - * @path: path, as seen from mount-point - */ -#define MAKE_REAL_PATH(var, this, path) do { \ - int base_len = BDB_EXPORT_PATH_LEN(this->private); \ - var = alloca (strlen (path) + base_len + 2); \ - strcpy (var, BDB_EXPORT_PATH(this->private)); \ - strcpy (&var[base_len], path); \ - } while (0) - - -#define BDB_TIMED_LOG(_errno,_counter) \ - ((_errno == ENOTSUP) && (((++_counter) % GF_UNIVERSAL_ANSWER) == 1)) - -#define GF_FILE_CONTENT_REQUEST ZR_FILE_CONTENT_REQUEST - -/* MAKE_REAL_PATH_TO_STORAGE_DB(var,this,path) - * make the real path to the storage-database file on file-system - * 
- * @var: destination to hold the real path - * @this: pointer to xlator_t corresponding to bdb xlator - * @path: path of the directory, as seen from mount-point - */ -#define MAKE_REAL_PATH_TO_STORAGE_DB(var, this, path) do { \ - int base_len = BDB_EXPORT_PATH_LEN(this->private); \ - var = alloca (strlen (path) + \ - base_len + \ - strlen (BDB_STORAGE)); \ - strcpy (var, BDB_EXPORT_PATH(this->private)); \ - strcpy (&var[base_len], path); \ - strcat (var, BDB_STORAGE); \ - } while (0) - -/* MAKE_KEY_FROM_PATH(key,path) - * make a 'key', which we use as key in the underlying database by using - * the path - * - * @key: destination to hold the key - * @path: path to file as seen from mount-point - */ -#define MAKE_KEY_FROM_PATH(key, path) do { \ - char *tmp = alloca (strlen (path)); \ - strcpy (tmp, path); \ - key = basename (tmp); \ - }while (0); - -/* IS_BDB_PRIVATE_FILE(name) - * check if a given 'name' is bdb xlator's internal file name - * - * @name: basename of a file. - * - * bdb xlator reserves file names 'glusterfs_storage.db', - * 'glusterfs_ns.db'(used by bdb xlator itself), 'log.*', '__db.*' - * (used by libdb) - */ -#define IS_BDB_PRIVATE_FILE(name) ((!strncmp(name, "__db.", 5)) || \ - (!strcmp(name, "glusterfs_storage.db")) || \ - (!strcmp(name, "glusterfs_ns.db")) || \ - (!strncmp(name, "log.0000", 8))) - -/* check if 'name' is '.' or '..' entry */ -#define IS_DOT_DOTDOT(name) \ - ((!strncmp(name,".", 1)) || (!strncmp(name,"..", 2))) - -/* BDB_ICTX_SET(this,inode,bctx) - * pointer to 'struct bdb_ctx' is stored in inode's ctx of all directories. - * this will happen either in lookup() or mkdir(). - * - * @this: pointer xlator_t of bdb xlator. - * @inode: inode where 'struct bdb_ctx *' has to be stored. 
- * @bctx: a 'struct bdb_ctx *' - */ -#define BDB_ICTX_SET(_inode,_this,_bctx) do{ \ - inode_ctx_put(_inode, _this, (uint64_t)(long)_bctx); \ - }while (0); - -#define BDB_ICTX_GET(_inode,_this,_bctxp) do { \ - uint64_t tmp_bctx = 0; \ - inode_ctx_get (_inode, _this, &tmp_bctx); \ - *_bctxp = tmp_bctx; \ - }while (0); - -/* BDB_FCTX_SET(this,fd,bctx) - * pointer to 'struct bdb_ctx' is stored in inode's ctx of all directories. - * this will happen either in lookup() or mkdir(). - * - * @this: pointer xlator_t of bdb xlator. - * @inode: inode where 'struct bdb_ctx *' has to be stored. - * @bctx: a 'struct bdb_ctx *' - */ -#define BDB_FCTX_SET(_fd,_this,_bfd) do{ \ - fd_ctx_set(_fd, _this, (uint64_t)(long)_bfd); \ - }while (0); - -#define BDB_FCTX_GET(_fd,_this,_bfdp) do { \ - uint64_t tmp_bfd = 0; \ - fd_ctx_get (_fd, _this, &tmp_bfd); \ - *_bfdp = (void *)(long)tmp_bfd; \ - }while (0); - - -/* maximum number of open dbs that bdb xlator will ever have */ -#define BDB_MAX_OPEN_DBS 100 - -/* convert file size to block-count */ -#define BDB_COUNT_BLOCKS(size,blksize) (((size + blksize - 1)/blksize) - 1) - -/* file permissions, again macros are more readable */ -#define RWXRWXRWX 0777 -#define DEFAULT_FILE_MODE 0644 -#define DEFAULT_DIR_MODE 0755 - -/* see, if have a valid file permissions specification in @mode */ -#define IS_VALID_FILE_MODE(mode) (!(mode & (~RWXRWXRWX))) -#define IS_VALID_DIR_MODE(mode) (!(mode & (~(RWXRWXRWX))) - -/* maximum retries for a failed transactional operation */ -#define BDB_MAX_RETRIES 10 - -#define BDB_LL_PAGE_SIZE_DEFAULT 4096 -#define BDB_LL_PAGE_SIZE_MIN 4096 -#define BDB_LL_PAGE_SIZE_MAX 65536 - -#define PAGE_SIZE_IN_RANGE(_page_size) \ - ((_page_size >= BDB_LL_PAGE_SIZE_MIN) \ - && (table->page_size <= BDB_LL_PAGE_SIZE_MAX)) - -typedef struct bctx_table bctx_table_t; -typedef struct bdb_ctx bctx_t; -typedef struct bdb_cache bdb_cache_t; -typedef struct bdb_private bdb_private_t; - -struct bctx_table { - /* flags to be used for opening 
each database */ - uint64_t dbflags; - - /* cache: can be either ON or OFF */ - uint64_t cache; - - /* used to lock the 'struct bctx_table *' */ - gf_lock_t lock; - - /* lock for checkpointing */ - gf_lock_t checkpoint_lock; - - /* hash table of 'struct bdb_ctx' */ - struct list_head *b_hash; - - /* list of active 'struct bdb_ctx' */ - struct list_head active; - - /* lru list of inactive 'struct bdb_ctx' */ - struct list_head b_lru; - struct list_head purge; - uint32_t lru_limit; - uint32_t lru_size; - uint32_t hash_size; - - /* access mode for accessing the databases, can be DB_HASH, DB_BTREE */ - DBTYPE access_mode; - - /* DB_ENV under which every db operation is carried over */ - DB_ENV *dbenv; - int32_t transaction; - xlator_t *this; - - /* page-size of DB, DB->set_pagesize(), should be set before DB->open */ - uint64_t page_size; -}; - -struct bdb_ctx { - /* controller members */ - - /* lru list of 'struct bdb_ctx's, a bdb_ctx can exist in one of - * b_hash or lru lists */ - struct list_head list; - - /* directory 'name' hashed list of 'struct bdb_ctx's */ - struct list_head b_hash; - - struct bctx_table *table; - int32_t ref; /* reference count */ - gf_lock_t lock; /* used to lock this 'struct bdb_ctx' */ - - char *directory; /* directory path */ - - /* pointer to open database, that resides inside this directory */ - DB *primary; - DB *secondary; - uint32_t cache; /* cache ON or OFF */ - - /* per directory cache, bdb xlator's internal cache */ - struct list_head c_list; /* linked list of cached records */ - int32_t c_count; /* number of cached records */ - - /* index to hash table list, to which this ctx belongs */ - int32_t key_hash; - char *db_path; /* absolute path to db file */ -}; - -struct bdb_fd { - /* pointer to bdb_ctx of the parent directory */ - struct bdb_ctx *ctx; - - /* name of the file. 
NOTE: basename, not the complete path */ - char *key; - int32_t flags; /* open flags */ -}; - -struct bdb_dir { - /* pointer to bdb_ctx of this directory */ - struct bdb_ctx *ctx; - - /* open directory pointer, as returned by opendir() */ - DIR *dir; - - char *path; /* path to this directory */ -}; - -/* cache */ -struct bdb_cache { - /* list of 'struct bdb_cache' under a 'struct bdb_ctx' */ - struct list_head c_list; - - /* name of the file this cache holds. NOTE: basename of file */ - char *key; - char *data; /* file content */ - - /* size of the file content that this cache holds */ - size_t size; -}; - - -struct bdb_private { - /* pointer to inode table that we use */ - inode_table_t *itable; - int32_t temp; /**/ - char is_stateless; /**/ - - /* path to the export directory - * (option directory <export-path>) */ - char *export_path; - - /* length of 'export_path' string */ - int32_t export_path_length; - - /* statistics */ - /* Statistics, provides activity of the server */ - struct xlator_stats stats; - - struct timeval prev_fetch_time; - struct timeval init_time; - int32_t max_read; /* */ - int32_t max_write; /* */ - - /* Used to calculate the max_read value */ - int64_t interval_read; - - /* Used to calculate the max_write value */ - int64_t interval_write; - int64_t read_value; /* Total read, from init */ - int64_t write_value; /* Total write, from init */ - - /* bdb xlator specific private data */ - - /* flags used for opening DB_ENV for this xlator */ - uint64_t envflags; - - /* flags to be used for opening each database */ - uint64_t dbflags; - - /* cache: can be either ON or OFF */ - uint64_t cache; - - /* transaction: can be either ON or OFF */ - uint32_t transaction; - uint32_t active; - gf_lock_t active_lock; - struct bctx_table *b_table; - - /* access mode for accessing the databases, can be DB_HASH, DB_BTREE - * (option access-mode <mode>) */ - DBTYPE access_mode; - - /* mode for each and every file stored on bdb - * (option file-mode <mode>) */ - 
mode_t file_mode; - - /* mode for each and every directory stored on bdb - * (option dir-mode <mode>) */ - mode_t dir_mode; - - /* mode for each and every symlink stored on bdb */ - mode_t symlink_mode; - - /* pthread_t object used for creating checkpoint thread */ - pthread_t checkpoint_thread; - - /* time duration between two consecutive checkpoint operations. - * (option checkpoint-interval <time-in-seconds>) */ - uint32_t checkpoint_interval; - - /* environment log directory (option logdir <directory>) */ - char *logdir; - - /* errfile path, used by environment to print detailed error log. - * (option errfile <errfile-path>) */ - char *errfile; - - /* DB_ENV->set_errfile() expects us to fopen - * the errfile before doing DB_ENV->set_errfile() */ - FILE *errfp; - - /* used by DB_ENV->set_timeout to set the timeout for - * a transactionally encapsulated DB->operation() to - * timeout before waiting for locks to be released. - * (option transaction-timeout <time-in-milliseconds>) - */ - uint32_t txn_timeout; - uint32_t lock_timeout; - - /* DB_AUTO_LOG_REMOVE flag for DB_ENV*/ - uint32_t log_auto_remove; - uint32_t log_region_max; -}; - - -static inline int32_t -bdb_txn_begin (DB_ENV *dbenv, - DB_TXN **ptxnid) -{ - return dbenv->txn_begin (dbenv, NULL, ptxnid, 0); -} - -static inline int32_t -bdb_txn_abort (DB_TXN *txnid) -{ - return txnid->abort (txnid); -} - -static inline int32_t -bdb_txn_commit (DB_TXN *txnid) -{ - return txnid->commit (txnid, 0); -} - -void * -bdb_db_stat (bctx_t *bctx, - DB_TXN *txnid, - uint32_t flags); - -/*int32_t -bdb_db_get(struct bdb_ctx *bctx, - DB_TXN *txnid, - const char *key_string, - char **buf, - size_t size, - off_t offset); -*/ -int32_t -bdb_db_fread (struct bdb_fd *bfd, char *bufp, size_t size, off_t offset); - -int32_t -bdb_db_iread (struct bdb_ctx *bctx, const char *key, char **bufp); - -#define BDB_TRUNCATE_RECORD 0xcafebabe - -/*int32_t -bdb_db_put (struct bdb_ctx *bctx, - DB_TXN *txnid, - const char *key_string, - const 
char *buf, - size_t size, - off_t offset, - int32_t flags); -*/ -int32_t -bdb_db_icreate (struct bdb_ctx *bctx, const char *key); - -int32_t -bdb_db_fwrite (struct bdb_fd *bfd, char *buf, size_t size, off_t offset); - -int32_t -bdb_db_iwrite (struct bdb_ctx *bctx, const char *key, char *buf, size_t size); - -int32_t -bdb_db_itruncate (struct bdb_ctx *bctx, const char *key); - -int32_t -bdb_db_iremove (struct bdb_ctx *bctx, - const char *key); - -ino_t -bdb_inode_transform (ino_t parent, - const char *name, - size_t namelen); - -int32_t -bdb_cursor_open (struct bdb_ctx *bctx, - DBC **cursorp); - -int32_t -bdb_cursor_get (DBC *cursorp, - DBT *sec, DBT *pri, - DBT *value, - int32_t flags); - - -int32_t -bdb_cursor_close (struct bdb_ctx *ctx, - DBC *cursorp); - - -int32_t -bdb_dirent_size (DBT *key); - -int32_t -dirent_size (struct dirent *entry); - -int -bdb_db_init (xlator_t *this, - dict_t *options); - -void -bdb_dbs_from_dict_close (dict_t *this, - char *key, - data_t *value, - void *data); - -bctx_t * -bctx_lookup (struct bctx_table *table, - const char *path); - -bctx_t * -bctx_parent -(struct bctx_table *table, - const char *path); - -bctx_t * -bctx_unref (bctx_t *ctx); - -bctx_t * -bctx_ref (bctx_t *ctx); - -#endif /* _BDB_H */ diff --git a/xlators/storage/posix/src/Makefile.am b/xlators/storage/posix/src/Makefile.am index 9acaad65185..c080a229ff3 100644 --- a/xlators/storage/posix/src/Makefile.am +++ b/xlators/storage/posix/src/Makefile.am @@ -1,17 +1,25 @@ - +if WITH_SERVER xlator_LTLIBRARIES = posix.la +endif xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/storage -posix_la_LDFLAGS = -module -avoidversion +posix_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) + +posix_la_SOURCES = posix.c posix-helpers.c posix-handle.c posix-aio.c \ + posix-gfid-path.c posix-entry-ops.c posix-inode-fd-ops.c \ + posix-common.c posix-metadata.c +posix_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(LIBAIO) \ + $(ACL_LIBS) -posix_la_SOURCES = posix.c 
-posix_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la +noinst_HEADERS = posix.h posix-mem-types.h posix-handle.h posix-aio.h \ + posix-messages.h posix-gfid-path.h posix-inode-handle.h \ + posix-metadata.h posix-metadata-disk.h -noinst_HEADERS = posix.h +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \ + -I$(top_srcdir)/rpc/rpc-lib/src -I$(CONTRIBDIR)/timer-wheel -AM_CFLAGS = -fPIC -fno-strict-aliasing -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -D$(GF_HOST_OS) -Wall \ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles \ - $(GF_CFLAGS) +AM_CFLAGS = -fno-strict-aliasing -Wall $(GF_CFLAGS) -I$(top_srcdir)/glusterfsd/src -CLEANFILES = +CLEANFILES = diff --git a/xlators/storage/posix/src/posix-aio.c b/xlators/storage/posix/src/posix-aio.c new file mode 100644 index 00000000000..d0cb0002bbf --- /dev/null +++ b/xlators/storage/posix/src/posix-aio.c @@ -0,0 +1,556 @@ +/* + Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ +#include "posix.h" +#include <sys/uio.h> +#include "posix-messages.h" + +#ifdef HAVE_LIBAIO +#include <libaio.h> + +void +__posix_fd_set_odirect(fd_t *fd, struct posix_fd *pfd, int opflags, + off_t offset, size_t size) +{ + int odirect = 0; + int flags = 0; + int ret = 0; + + odirect = pfd->odirect; + + if ((fd->flags | opflags) & O_DIRECT) { + /* if instructed, use O_DIRECT always */ + odirect = 1; + } else { + /* else use O_DIRECT when feasible */ + if ((offset | size) & 0xfff) + odirect = 0; + else + odirect = 1; + } + + if (!odirect && pfd->odirect) { + flags = fcntl(pfd->fd, F_GETFL); + ret = fcntl(pfd->fd, F_SETFL, (flags & (~O_DIRECT))); + pfd->odirect = 0; + } + + if (odirect && !pfd->odirect) { + flags = fcntl(pfd->fd, F_GETFL); + ret = fcntl(pfd->fd, F_SETFL, (flags | O_DIRECT)); + pfd->odirect = 1; + } + + if (ret) { + gf_msg(THIS->name, GF_LOG_WARNING, errno, P_MSG_FCNTL_FAILED, + "fcntl() failed. fd=%d flags=%d pfd->odirect=%d", pfd->fd, flags, + pfd->odirect); + } +} + +struct posix_aio_cb { + struct iocb iocb; + call_frame_t *frame; + struct iobuf *iobuf; + struct iobref *iobref; + struct iatt prebuf; + int _fd; + fd_t *fd; + int op; + off_t offset; +}; + +int +posix_aio_readv_complete(struct posix_aio_cb *paiocb, int res, int res2) +{ + call_frame_t *frame = NULL; + xlator_t *this = NULL; + struct iobuf *iobuf = NULL; + struct iatt postbuf = { + 0, + }; + int _fd = -1; + int op_ret = -1; + int op_errno = 0; + struct iovec iov; + struct iobref *iobref = NULL; + int ret = 0; + off_t offset = 0; + struct posix_private *priv = NULL; + fd_t *fd = NULL; + + frame = paiocb->frame; + this = frame->this; + priv = this->private; + iobuf = paiocb->iobuf; + fd = paiocb->fd; + _fd = paiocb->_fd; + offset = paiocb->offset; + + if (res < 0) { + op_ret = -1; + op_errno = -res; + gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_READV_FAILED, + "readv(async) failed fd=%d,size=%lu,offset=%llu (%d)", _fd, + paiocb->iocb.u.c.nbytes, (unsigned long 
long)paiocb->offset, + res); + goto out; + } + + ret = posix_fdstat(this, fd->inode, _fd, &postbuf); + if (ret != 0) { + op_ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_FSTAT_FAILED, + "fstat failed on fd=%d", _fd); + goto out; + } + + op_ret = res; + op_errno = 0; + + iobref = iobref_new(); + if (!iobref) { + op_ret = -1; + op_errno = ENOMEM; + goto out; + } + + iobref_add(iobref, iobuf); + + iov.iov_base = iobuf_ptr(iobuf); + iov.iov_len = op_ret; + + /* Hack to notify higher layers of EOF. */ + if (!postbuf.ia_size || (offset + iov.iov_len) >= postbuf.ia_size) + op_errno = ENOENT; + + GF_ATOMIC_ADD(priv->read_value, op_ret); + +out: + STACK_UNWIND_STRICT(readv, frame, op_ret, op_errno, &iov, 1, &postbuf, + iobref, NULL); + if (iobuf) + iobuf_unref(iobuf); + if (iobref) + iobref_unref(iobref); + + if (paiocb->fd) + fd_unref(paiocb->fd); + + GF_FREE(paiocb); + + return 0; +} + +int +posix_aio_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + int32_t op_errno = EINVAL; + int _fd = -1; + struct iobuf *iobuf = NULL; + struct posix_fd *pfd = NULL; + int ret = -1; + struct posix_aio_cb *paiocb = NULL; + struct posix_private *priv = NULL; + struct iocb *iocb = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + + priv = this->private; + + ret = posix_fd_ctx_get(fd, this, &pfd, &op_errno); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, op_errno, P_MSG_PFD_NULL, + "pfd is NULL from fd=%p", fd); + goto err; + } + _fd = pfd->fd; + + if (!size) { + op_errno = EINVAL; + gf_msg(this->name, GF_LOG_WARNING, op_errno, P_MSG_INVALID_ARGUMENT, + "size=%" GF_PRI_SIZET, size); + goto err; + } + + iobuf = iobuf_get2(this->ctx->iobuf_pool, size); + if (!iobuf) { + op_errno = ENOMEM; + goto err; + } + + paiocb = GF_CALLOC(1, sizeof(*paiocb), gf_posix_mt_paiocb); + if (!paiocb) { + op_errno = ENOMEM; + goto err; + } + + 
paiocb->frame = frame; + paiocb->iobuf = iobuf; + paiocb->offset = offset; + paiocb->fd = fd_ref(fd); + paiocb->_fd = _fd; + paiocb->op = GF_FOP_READ; + + paiocb->iocb.data = paiocb; + paiocb->iocb.aio_fildes = _fd; + paiocb->iocb.aio_lio_opcode = IO_CMD_PREAD; + paiocb->iocb.aio_reqprio = 0; + paiocb->iocb.u.c.buf = iobuf_ptr(iobuf); + paiocb->iocb.u.c.nbytes = size; + paiocb->iocb.u.c.offset = offset; + + iocb = &paiocb->iocb; + + LOCK(&fd->lock); + { + __posix_fd_set_odirect(fd, pfd, flags, offset, size); + + ret = io_submit(priv->ctxp, 1, &iocb); + } + UNLOCK(&fd->lock); + + if (ret != 1) { + op_errno = -ret; + gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_IO_SUBMIT_FAILED, + "io_submit() returned %d", ret); + goto err; + } + + return 0; +err: + STACK_UNWIND_STRICT(readv, frame, -1, op_errno, 0, 0, 0, 0, 0); + if (iobuf) + iobuf_unref(iobuf); + + if (paiocb) { + if (paiocb->fd) + fd_unref(paiocb->fd); + GF_FREE(paiocb); + } + + return 0; +} + +int +posix_aio_writev_complete(struct posix_aio_cb *paiocb, int res, int res2) +{ + call_frame_t *frame = NULL; + xlator_t *this = NULL; + struct iatt prebuf = { + 0, + }; + struct iatt postbuf = { + 0, + }; + int _fd = -1; + int op_ret = -1; + int op_errno = 0; + int ret = 0; + struct posix_private *priv = NULL; + fd_t *fd = NULL; + + if (!paiocb) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + frame = paiocb->frame; + this = frame->this; + priv = this->private; + prebuf = paiocb->prebuf; + fd = paiocb->fd; + _fd = paiocb->_fd; + + if (res < 0) { + op_ret = -1; + op_errno = -res; + gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_WRITEV_FAILED, + "writev(async) failed fd=%d,offset=%llu (%d)", _fd, + (unsigned long long)paiocb->offset, res); + + goto out; + } + + ret = posix_fdstat(this, fd->inode, _fd, &postbuf); + if (ret != 0) { + op_ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_FSTAT_FAILED, + "fstat failed on fd=%d", _fd); + goto out; + } + + op_ret = res; + op_errno = 
0; + + GF_ATOMIC_ADD(priv->write_value, op_ret); + +out: + STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, &prebuf, &postbuf, + NULL); + + if (paiocb) { + if (paiocb->iobref) + iobref_unref(paiocb->iobref); + if (paiocb->fd) + fd_unref(paiocb->fd); + GF_FREE(paiocb); + } + + return 0; +} + +int +posix_aio_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *iov, int count, off_t offset, uint32_t flags, + struct iobref *iobref, dict_t *xdata) +{ + int32_t op_errno = EINVAL; + int _fd = -1; + struct posix_fd *pfd = NULL; + int ret = -1; + struct posix_aio_cb *paiocb = NULL; + struct posix_private *priv = NULL; + struct iocb *iocb = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + + priv = this->private; + DISK_SPACE_CHECK_AND_GOTO(frame, priv, xdata, op_errno, op_errno, err); + + ret = posix_fd_ctx_get(fd, this, &pfd, &op_errno); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, op_errno, P_MSG_PFD_NULL, + "pfd is NULL from fd=%p", fd); + goto err; + } + _fd = pfd->fd; + + paiocb = GF_CALLOC(1, sizeof(*paiocb), gf_posix_mt_paiocb); + if (!paiocb) { + op_errno = ENOMEM; + goto err; + } + + paiocb->frame = frame; + paiocb->offset = offset; + paiocb->fd = fd_ref(fd); + paiocb->_fd = _fd; + paiocb->op = GF_FOP_WRITE; + + paiocb->iocb.data = paiocb; + paiocb->iocb.aio_fildes = _fd; + paiocb->iobref = iobref_ref(iobref); + paiocb->iocb.aio_lio_opcode = IO_CMD_PWRITEV; + paiocb->iocb.aio_reqprio = 0; + paiocb->iocb.u.v.vec = iov; + paiocb->iocb.u.v.nr = count; + paiocb->iocb.u.v.offset = offset; + + iocb = &paiocb->iocb; + + ret = posix_fdstat(this, fd->inode, _fd, &paiocb->prebuf); + if (ret != 0) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_FSTAT_FAILED, + "fstat failed on fd=%p", fd); + goto err; + } + + LOCK(&fd->lock); + { + __posix_fd_set_odirect(fd, pfd, flags, offset, iov_length(iov, count)); + + ret = io_submit(priv->ctxp, 1, &iocb); + } + UNLOCK(&fd->lock); + 
+ if (ret != 1) { + op_errno = -ret; + gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_IO_SUBMIT_FAILED, + "io_submit() returned %d,gfid=%s", ret, + uuid_utoa(fd->inode->gfid)); + goto err; + } + + return 0; +err: + STACK_UNWIND_STRICT(writev, frame, -1, op_errno, 0, 0, 0); + + if (paiocb) { + if (paiocb->iobref) + iobref_unref(paiocb->iobref); + if (paiocb->fd) + fd_unref(paiocb->fd); + GF_FREE(paiocb); + } + + return 0; +} + +void * +posix_aio_thread(void *data) +{ + xlator_t *this = NULL; + struct posix_private *priv = NULL; + int ret = 0; + int i = 0; + struct io_event events[POSIX_AIO_MAX_NR_GETEVENTS]; + struct io_event *event = NULL; + struct posix_aio_cb *paiocb = NULL; + + this = data; + THIS = this; + priv = this->private; + + for (;;) { + memset(&events[0], 0, sizeof(events)); + ret = io_getevents(priv->ctxp, 1, POSIX_AIO_MAX_NR_GETEVENTS, + &events[0], NULL); + if (ret <= 0) { + gf_msg(this->name, GF_LOG_ERROR, -ret, P_MSG_IO_GETEVENTS_FAILED, + "io_getevents() returned %d", ret); + if (ret == -EINTR) + continue; + break; + } + + for (i = 0; i < ret; i++) { + event = &events[i]; + + paiocb = event->data; + + switch (paiocb->op) { + case GF_FOP_READ: + posix_aio_readv_complete(paiocb, event->res, event->res2); + break; + case GF_FOP_WRITE: + posix_aio_writev_complete(paiocb, event->res, event->res2); + break; + default: + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_UNKNOWN_OP, + "unknown op %d found in piocb", paiocb->op); + break; + } + } + } + + return NULL; +} + +int +posix_aio_init(xlator_t *this) +{ + struct posix_private *priv = NULL; + int ret = 0; + + priv = this->private; + + ret = io_setup(POSIX_AIO_MAX_NR_EVENTS, &priv->ctxp); + if ((ret == -1 && errno == ENOSYS) || ret == -ENOSYS) { + gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_AIO_UNAVAILABLE, + "Linux AIO not available at run-time." 
+ " Continuing with synchronous IO"); + ret = 0; + goto out; + } + + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, -ret, P_MSG_IO_SETUP_FAILED, + "io_setup() failed. ret=%d", ret); + goto out; + } + + ret = gf_thread_create(&priv->aiothread, NULL, posix_aio_thread, this, + "posixaio"); + if (ret != 0) { + io_destroy(priv->ctxp); + goto out; + } + + this->fops->readv = posix_aio_readv; + this->fops->writev = posix_aio_writev; +out: + return ret; +} + +int +posix_aio_on(xlator_t *this) +{ + struct posix_private *priv = NULL; + int ret = 0; + + priv = this->private; + + if (!priv->aio_init_done) { + ret = posix_aio_init(this); + if (ret == 0) + priv->aio_capable = _gf_true; + else + priv->aio_capable = _gf_false; + priv->aio_init_done = _gf_true; + } + + if (priv->aio_capable) { + this->fops->readv = posix_aio_readv; + this->fops->writev = posix_aio_writev; + } + + return ret; +} + +int +posix_aio_off(xlator_t *this) +{ + this->fops->readv = posix_readv; + this->fops->writev = posix_writev; + + return 0; +} + +#else + +int +posix_aio_on(xlator_t *this) +{ + gf_msg(this->name, GF_LOG_INFO, 0, P_MSG_AIO_UNAVAILABLE, + "Linux AIO not available at build-time." + " Continuing with synchronous IO"); + return 0; +} + +int +posix_aio_off(xlator_t *this) +{ + gf_msg(this->name, GF_LOG_INFO, 0, P_MSG_AIO_UNAVAILABLE, + "Linux AIO not available at build-time." + " Continuing with synchronous IO"); + return 0; +} + +void +__posix_fd_set_odirect(fd_t *fd, struct posix_fd *pfd, int opflags, + off_t offset, size_t size) +{ + xlator_t *this = THIS; + gf_msg(this->name, GF_LOG_INFO, 0, P_MSG_AIO_UNAVAILABLE, + "Linux AIO not available at build-time." + " Continuing with synchronous IO"); + return; +} + +#endif diff --git a/xlators/storage/posix/src/posix-aio.h b/xlators/storage/posix/src/posix-aio.h new file mode 100644 index 00000000000..b316deb3229 --- /dev/null +++ b/xlators/storage/posix/src/posix-aio.h @@ -0,0 +1,34 @@ +/* + Copyright (c) 2006-2012 Red Hat, Inc. 
<http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef _POSIX_AIO_H +#define _POSIX_AIO_H + +// Maximum number of concurrently submitted IO events. The heaviest load +// GlusterFS has been able to handle had 60-80 concurrent calls +#define POSIX_AIO_MAX_NR_EVENTS 256 + +// Maximum number of completed IO operations to reap per getevents syscall +#define POSIX_AIO_MAX_NR_GETEVENTS 16 + +int +posix_aio_on(xlator_t *this); +int +posix_aio_off(xlator_t *this); + +int +posix_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata); + +int +posix_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, uint32_t flags, + struct iobref *iobref, dict_t *xdata); + +#endif /* !_POSIX_AIO_H */ diff --git a/xlators/storage/posix/src/posix-common.c b/xlators/storage/posix/src/posix-common.c new file mode 100644 index 00000000000..f10722ec3fb --- /dev/null +++ b/xlators/storage/posix/src/posix-common.c @@ -0,0 +1,1524 @@ +/* + Copyright (c) 2006-2017 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ +#define __XOPEN_SOURCE 500 + +/* for SEEK_HOLE and SEEK_DATA */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include <openssl/md5.h> +#include <stdint.h> +#include <sys/time.h> +#include <sys/resource.h> +#include <errno.h> +#include <libgen.h> +#include <pthread.h> +#include <ftw.h> +#include <sys/stat.h> +#include <signal.h> +#include <sys/uio.h> +#include <unistd.h> + +#ifndef GF_BSD_HOST_OS +#include <alloca.h> +#endif /* GF_BSD_HOST_OS */ + +#ifdef HAVE_LINKAT +#include <fcntl.h> +#endif /* HAVE_LINKAT */ + +#include "posix-inode-handle.h" +#include <glusterfs/compat-errno.h> +#include <glusterfs/compat.h> +#include <glusterfs/byte-order.h> +#include <glusterfs/syscall.h> +#include <glusterfs/statedump.h> +#include <glusterfs/locking.h> +#include <glusterfs/timer.h> +#include "glusterfs3-xdr.h" +#include "posix-aio.h" +#include <glusterfs/glusterfs-acl.h> +#include "posix-messages.h" +#include <glusterfs/events.h> +#include "posix-gfid-path.h" +#include <glusterfs/compat-uuid.h> +#include "timer-wheel.h" + +extern char *marker_xattrs[]; +#define ALIGN_SIZE 4096 + +#undef HAVE_SET_FSID +#ifdef HAVE_SET_FSID + +#define DECLARE_OLD_FS_ID_VAR \ + uid_t old_fsuid; \ + gid_t old_fsgid; + +#define SET_FS_ID(uid, gid) \ + do { \ + old_fsuid = setfsuid(uid); \ + old_fsgid = setfsgid(gid); \ + } while (0) + +#define SET_TO_OLD_FS_ID() \ + do { \ + setfsuid(old_fsuid); \ + setfsgid(old_fsgid); \ + } while (0) + +#else + +#define DECLARE_OLD_FS_ID_VAR +#define SET_FS_ID(uid, gid) +#define SET_TO_OLD_FS_ID() + +#endif + +/* Setting microseconds or nanoseconds depending on what's supported: + The passed in `tv` can be + struct timespec + if supported (better, because it supports nanosecond resolution) or + struct timeval + otherwise. 
*/ +#if HAVE_UTIMENSAT +#define SET_TIMESPEC_NSEC_OR_TIMEVAL_USEC(tv, nanosecs) tv.tv_nsec = nanosecs +#else +#define SET_TIMESPEC_NSEC_OR_TIMEVAL_USEC(tv, nanosecs) \ + tv.tv_usec = nanosecs / 1000 +#endif + +int32_t +posix_priv(xlator_t *this) +{ + struct posix_private *priv = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN]; + + if (!this) + return 0; + + (void)snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, + this->name); + gf_proc_dump_add_section("%s", key_prefix); + + priv = this->private; + + if (!priv) + return 0; + + gf_proc_dump_write("base_path", "%s", priv->base_path); + gf_proc_dump_write("base_path_length", "%d", priv->base_path_length); + gf_proc_dump_write("max_read", "%" PRId64, GF_ATOMIC_GET(priv->read_value)); + gf_proc_dump_write("max_write", "%" PRId64, + GF_ATOMIC_GET(priv->write_value)); + + return 0; +} + +int32_t +posix_inode(xlator_t *this) +{ + return 0; +} + +/** + * notify - when parent sends PARENT_UP, send CHILD_UP event from here + */ +int32_t +posix_notify(xlator_t *this, int32_t event, void *data, ...) 
+{ + xlator_t *victim = data; + struct posix_private *priv = this->private; + int ret = 0; + struct timespec sleep_till = { + 0, + }; + glusterfs_ctx_t *ctx = this->ctx; + + switch (event) { + case GF_EVENT_PARENT_UP: { + /* Notify the parent that posix xlator is up */ + default_notify(this, GF_EVENT_CHILD_UP, data); + } break; + + case GF_EVENT_PARENT_DOWN: { + if (!victim->cleanup_starting) + break; + + if (priv->janitor) { + pthread_mutex_lock(&priv->janitor_mutex); + { + priv->janitor_task_stop = _gf_true; + ret = gf_tw_del_timer(this->ctx->tw->timer_wheel, + priv->janitor); + if (!ret) { + timespec_now_realtime(&sleep_till); + sleep_till.tv_sec += 1; + /* Wait to set janitor_task flag to _gf_false by + * janitor_task_done */ + while (priv->janitor_task_stop) { + (void)pthread_cond_timedwait(&priv->janitor_cond, + &priv->janitor_mutex, + &sleep_till); + timespec_now_realtime(&sleep_till); + sleep_till.tv_sec += 1; + } + } + } + pthread_mutex_unlock(&priv->janitor_mutex); + GF_FREE(priv->janitor); + } + priv->janitor = NULL; + pthread_mutex_lock(&ctx->fd_lock); + { + while (priv->rel_fdcount > 0) { + pthread_cond_wait(&priv->fd_cond, &ctx->fd_lock); + } + } + pthread_mutex_unlock(&ctx->fd_lock); + + gf_log(this->name, GF_LOG_INFO, "Sending CHILD_DOWN for brick %s", + victim->name); + default_notify(this->parents->xlator, GF_EVENT_CHILD_DOWN, data); + } break; + default: + /* */ + break; + } + return 0; +} + +int32_t +mem_acct_init(xlator_t *this) +{ + int ret = -1; + + if (!this) + return ret; + + ret = xlator_mem_acct_init(this, gf_posix_mt_end + 1); + + if (ret != 0) { + return ret; + } + + return ret; +} + +static int +posix_set_owner(xlator_t *this, uid_t uid, gid_t gid) +{ + struct posix_private *priv = NULL; + int ret = -1; + struct stat st = { + 0, + }; + + priv = this->private; + + ret = sys_lstat(priv->base_path, &st); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_DIR_OPERATION_FAILED, + "Failed to stat " + "brick path %s", + 
priv->base_path); + return ret; + } + + if ((uid == -1 || st.st_uid == uid) && (gid == -1 || st.st_gid == gid)) + return 0; + + ret = sys_chown(priv->base_path, uid, gid); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_DIR_OPERATION_FAILED, + "Failed to set uid/gid for" + " brick path %s", + priv->base_path); + + return ret; +} +static int +set_gfid2path_separator(struct posix_private *priv, const char *str) +{ + int str_len = 0; + + str_len = strlen(str); + if (str_len > 0 && str_len < 8) { + strcpy(priv->gfid2path_sep, str); + return 0; + } + + return -1; +} + +static int +set_batch_fsync_mode(struct posix_private *priv, const char *str) +{ + if (strcmp(str, "none") == 0) + priv->batch_fsync_mode = BATCH_NONE; + else if (strcmp(str, "syncfs") == 0) + priv->batch_fsync_mode = BATCH_SYNCFS; + else if (strcmp(str, "syncfs-single-fsync") == 0) + priv->batch_fsync_mode = BATCH_SYNCFS_SINGLE_FSYNC; + else if (strcmp(str, "syncfs-reverse-fsync") == 0) + priv->batch_fsync_mode = BATCH_SYNCFS_REVERSE_FSYNC; + else if (strcmp(str, "reverse-fsync") == 0) + priv->batch_fsync_mode = BATCH_REVERSE_FSYNC; + else + return -1; + + return 0; +} + +#ifdef GF_DARWIN_HOST_OS +static int +set_xattr_user_namespace_mode(struct posix_private *priv, const char *str) +{ + if (strcmp(str, "none") == 0) + priv->xattr_user_namespace = XATTR_NONE; + else if (strcmp(str, "strip") == 0) + priv->xattr_user_namespace = XATTR_STRIP; + else if (strcmp(str, "append") == 0) + priv->xattr_user_namespace = XATTR_APPEND; + else if (strcmp(str, "both") == 0) + priv->xattr_user_namespace = XATTR_BOTH; + else + return -1; + return 0; +} +#endif + +int +posix_reconfigure(xlator_t *this, dict_t *options) +{ + int ret = -1; + struct posix_private *priv = NULL; + int32_t uid = -1; + int32_t gid = -1; + char *batch_fsync_mode_str = NULL; + char *gfid2path_sep = NULL; + int32_t force_create_mode = -1; + int32_t force_directory_mode = -1; + int32_t create_mask = -1; + int32_t create_directory_mask = 
-1; + + priv = this->private; + + GF_OPTION_RECONF("brick-uid", uid, options, int32, out); + GF_OPTION_RECONF("brick-gid", gid, options, int32, out); + if (uid != -1 || gid != -1) + posix_set_owner(this, uid, gid); + + GF_OPTION_RECONF("batch-fsync-delay-usec", priv->batch_fsync_delay_usec, + options, uint32, out); + + GF_OPTION_RECONF("batch-fsync-mode", batch_fsync_mode_str, options, str, + out); + + if (set_batch_fsync_mode(priv, batch_fsync_mode_str) != 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_INVALID_ARGUMENT, + "Unknown mode string: %s", batch_fsync_mode_str); + goto out; + } + + GF_OPTION_RECONF("gfid2path-separator", gfid2path_sep, options, str, out); + if (set_gfid2path_separator(priv, gfid2path_sep) != 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_INVALID_ARGUMENT, + "Length of separator exceeds 7: %s", gfid2path_sep); + goto out; + } + +#ifdef GF_DARWIN_HOST_OS + + char *xattr_user_namespace_mode_str = NULL; + + GF_OPTION_RECONF("xattr-user-namespace-mode", xattr_user_namespace_mode_str, + options, str, out); + + if (set_xattr_user_namespace_mode(priv, xattr_user_namespace_mode_str) != + 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_UNKNOWN_ARGUMENT, + "Unknown xattr user namespace mode string: %s", + xattr_user_namespace_mode_str); + goto out; + } + +#endif + + GF_OPTION_RECONF("linux-aio", priv->aio_configured, options, bool, out); + + if (priv->aio_configured) + posix_aio_on(this); + else + posix_aio_off(this); + + GF_OPTION_RECONF("update-link-count-parent", priv->update_pgfid_nlinks, + options, bool, out); + + GF_OPTION_RECONF("gfid2path", priv->gfid2path, options, bool, out); + + GF_OPTION_RECONF("node-uuid-pathinfo", priv->node_uuid_pathinfo, options, + bool, out); + + if (priv->node_uuid_pathinfo && (gf_uuid_is_null(priv->glusterd_uuid))) { + gf_msg(this->name, GF_LOG_INFO, 0, P_MSG_UUID_NULL, + "glusterd uuid is NULL, pathinfo xattr would" + " fallback to <hostname>:<export>"); + } + + GF_OPTION_RECONF("reserve", 
priv->disk_reserve, options, percent_or_size, + out); + /* option can be any one of percent or bytes */ + priv->disk_unit = 0; + if (priv->disk_reserve < 100.0) + priv->disk_unit = 'p'; + + if (priv->disk_reserve) { + ret = posix_spawn_disk_space_check_thread(this); + if (ret) { + gf_msg(this->name, GF_LOG_INFO, 0, P_MSG_DISK_SPACE_CHECK_FAILED, + "Getting disk space check from thread failed"); + goto out; + } + } + + GF_OPTION_RECONF("health-check-interval", priv->health_check_interval, + options, uint32, out); + GF_OPTION_RECONF("health-check-timeout", priv->health_check_timeout, + options, uint32, out); + if (priv->health_check_interval) { + ret = posix_spawn_health_check_thread(this); + if (ret) + goto out; + } + + GF_OPTION_RECONF("shared-brick-count", priv->shared_brick_count, options, + int32, out); + + GF_OPTION_RECONF("disable-landfill-purge", priv->disable_landfill_purge, + options, bool, out); + if (priv->disable_landfill_purge) { + gf_log(this->name, GF_LOG_WARNING, + "Janitor WILL NOT purge the landfill directory. 
" + "Your landfill directory" + " may fill up this brick."); + } else { + gf_msg_debug(this->name, 0, + "Janitor will purge the landfill " + "directory, which is default behavior"); + } + + GF_OPTION_RECONF("force-create-mode", force_create_mode, options, int32, + out); + priv->force_create_mode = force_create_mode; + + GF_OPTION_RECONF("force-directory-mode", force_directory_mode, options, + int32, out); + priv->force_directory_mode = force_directory_mode; + + GF_OPTION_RECONF("create-mask", create_mask, options, int32, out); + priv->create_mask = create_mask; + + GF_OPTION_RECONF("create-directory-mask", create_directory_mask, options, + int32, out); + priv->create_directory_mask = create_directory_mask; + + GF_OPTION_RECONF("max-hardlinks", priv->max_hardlinks, options, uint32, + out); + + GF_OPTION_RECONF("fips-mode-rchecksum", priv->fips_mode_rchecksum, options, + bool, out); + + GF_OPTION_RECONF("ctime", priv->ctime, options, bool, out); + + ret = 0; +out: + return ret; +} + +int32_t +posix_delete_unlink_entry(const char *fpath, const struct stat *sb, + int typeflag, struct FTW *ftwbuf) +{ + int ret = 0; + + if (!fpath) + goto out; + + switch (typeflag) { + case FTW_SL: + case FTW_NS: + case FTW_F: + case FTW_SLN: + ret = sys_unlink(fpath); + break; + case FTW_D: + case FTW_DP: + case FTW_DNR: + if (ftwbuf->level != 0) { + ret = sys_rmdir(fpath); + } + break; + default: + break; + } + if (ret) { + gf_msg("posix_delete_unlink_entry", GF_LOG_WARNING, errno, + P_MSG_HANDLE_CREATE, + "Deletion of entries %s failed" + "Please delete it manually", + fpath); + } +out: + return 0; +} + +int32_t +posix_delete_unlink(const char *unlink_path) +{ + int ret = -1; + int flags = 0; + + flags |= (FTW_DEPTH | FTW_PHYS); + + ret = nftw(unlink_path, posix_delete_unlink_entry, 2, flags); + if (ret) { + gf_msg("posix_delete_unlink", GF_LOG_ERROR, 0, P_MSG_HANDLE_CREATE, + "Deleting files from %s failed", unlink_path); + } + return ret; +} + +int32_t 
+posix_create_unlink_dir(xlator_t *this) +{ + struct posix_private *priv = NULL; + struct stat stbuf; + int ret = -1; + uuid_t gfid = {0}; + char gfid_str[64] = {0}; + char unlink_path[PATH_MAX] = { + 0, + }; + char landfill_path[PATH_MAX] = { + 0, + }; + + priv = this->private; + + (void)snprintf(unlink_path, sizeof(unlink_path), "%s/%s", priv->base_path, + GF_UNLINK_PATH); + + gf_uuid_generate(gfid); + uuid_utoa_r(gfid, gfid_str); + + (void)snprintf(landfill_path, sizeof(landfill_path), "%s/%s/%s", + priv->base_path, GF_LANDFILL_PATH, gfid_str); + + ret = sys_stat(unlink_path, &stbuf); + switch (ret) { + case -1: + if (errno != ENOENT) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_HANDLE_CREATE, + "Checking for %s failed", unlink_path); + return -1; + } + break; + case 0: + if (!S_ISDIR(stbuf.st_mode)) { + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_HANDLE_CREATE, + "Not a directory: %s", unlink_path); + return -1; + } + ret = posix_delete_unlink(unlink_path); + return 0; + default: + break; + } + ret = sys_mkdir(unlink_path, 0600); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_HANDLE_CREATE, + "Creating directory %s failed", unlink_path); + return -1; + } + + return 0; +} + +int +posix_create_open_directory_based_fd(xlator_t *this, int pdirfd, char *dir_name) +{ + int ret = -1; + + ret = sys_openat(pdirfd, dir_name, (O_DIRECTORY | O_RDONLY), 0); + if (ret < 0 && errno == ENOENT) { + ret = sys_mkdirat(pdirfd, dir_name, 0700); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_HANDLE_CREATE, + "Creating directory %s failed", dir_name); + goto out; + } + ret = sys_openat(pdirfd, dir_name, (O_DIRECTORY | O_RDONLY), 0); + if (ret < 0 && errno != EEXIST) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_HANDLE_CREATE, + "error mkdir hash-1 %s ", dir_name); + goto out; + } + } +out: + return ret; +} + +/** + * init - + */ +int +posix_init(xlator_t *this) +{ + struct posix_private *_private = NULL; + data_t *dir_data = NULL; + data_t 
*tmp_data = NULL; + struct stat buf = { + 0, + }; + gf_boolean_t tmp_bool = 0; + int ret = 0; + int op_ret = -1; + int op_errno = 0; + ssize_t size = -1; + uuid_t old_uuid = { + 0, + }; + uuid_t dict_uuid = { + 0, + }; + uuid_t gfid = { + 0, + }; + static uuid_t rootgfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; + char *guuid = NULL; + int32_t uid = -1; + int32_t gid = -1; + char *batch_fsync_mode_str; + char *gfid2path_sep = NULL; + int force_create = -1; + int force_directory = -1; + int create_mask = -1; + int create_directory_mask = -1; + char dir_handle[PATH_MAX] = { + 0, + }; + int i; + char fhash[4] = { + 0, + }; + int hdirfd = -1; + char value; + + dir_data = dict_get(this->options, "directory"); + + if (this->children) { + gf_msg(this->name, GF_LOG_CRITICAL, 0, P_MSG_SUBVOLUME_ERROR, + "FATAL: storage/posix cannot have subvolumes"); + ret = -1; + goto out; + } + + if (!this->parents) { + gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_VOLUME_DANGLING, + "Volume is dangling. Please check the volume file."); + } + + if (!dir_data) { + gf_msg(this->name, GF_LOG_CRITICAL, 0, P_MSG_EXPORT_DIR_MISSING, + "Export directory not specified in volume file."); + ret = -1; + goto out; + } + + umask(000); // umask `masking' is done at the client side + + /* Check whether the specified directory exists, if not log it. 
*/ + op_ret = sys_stat(dir_data->data, &buf); + if ((op_ret != 0) || !S_ISDIR(buf.st_mode)) { + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_DIR_OPERATION_FAILED, + "Directory '%s' doesn't exist, exiting.", dir_data->data); + ret = -1; + goto out; + } + + _private = GF_CALLOC(1, sizeof(*_private), gf_posix_mt_posix_private); + if (!_private) { + ret = -1; + goto out; + } + + _private->base_path = gf_strdup(dir_data->data); + _private->base_path_length = dir_data->len - 1; + + _private->dirfd = -1; + _private->mount_lock = -1; + for (i = 0; i < 256; i++) + _private->arrdfd[i] = -1; + + ret = dict_get_str(this->options, "hostname", &_private->hostname); + if (ret) { + _private->hostname = GF_CALLOC(256, sizeof(char), gf_common_mt_char); + if (!_private->hostname) { + goto out; + } + ret = gethostname(_private->hostname, 256); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_HOSTNAME_MISSING, + "could not find hostname "); + } + } + + /* Check for Extended attribute support, if not present, log it */ + size = sys_lgetxattr(dir_data->data, "user.x", &value, sizeof(value)); + + if ((size == -1) && (errno == EOPNOTSUPP)) { + gf_msg(this->name, GF_LOG_DEBUG, 0, P_MSG_XDATA_GETXATTR, + "getxattr returned %zd", size); + tmp_data = dict_get(this->options, "mandate-attribute"); + if (tmp_data) { + if (gf_string2boolean(tmp_data->data, &tmp_bool) == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_INVALID_OPTION, + "wrong option provided for key " + "\"mandate-attribute\""); + ret = -1; + goto out; + } + if (!tmp_bool) { + gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_XATTR_NOTSUP, + "Extended attribute not supported, " + "starting as per option"); + } else { + gf_msg(this->name, GF_LOG_CRITICAL, 0, P_MSG_XATTR_NOTSUP, + "Extended attribute not supported, " + "exiting."); + ret = -1; + goto out; + } + } else { + gf_msg(this->name, GF_LOG_CRITICAL, 0, P_MSG_XATTR_NOTSUP, + "Extended attribute not supported, exiting."); + ret = -1; + goto out; + } + } + + tmp_data 
= dict_get(this->options, "volume-id"); + if (tmp_data) { + op_ret = gf_uuid_parse(tmp_data->data, dict_uuid); + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_INVALID_VOLUME_ID, + "wrong volume-id (%s) set" + " in volume file", + tmp_data->data); + ret = -1; + goto out; + } + size = sys_lgetxattr(dir_data->data, "trusted.glusterfs.volume-id", + old_uuid, 16); + if (size == 16) { + if (gf_uuid_compare(old_uuid, dict_uuid)) { + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_INVALID_VOLUME_ID, + "mismatching volume-id (%s) received. " + "already is a part of volume %s ", + tmp_data->data, uuid_utoa(old_uuid)); + gf_event(EVENT_POSIX_ALREADY_PART_OF_VOLUME, + "volume-id=%s;brick=%s:%s", uuid_utoa(old_uuid), + _private->hostname, _private->base_path); + ret = -1; + goto out; + } + } else if ((size == -1) && (errno == ENODATA || errno == ENOATTR)) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_VOLUME_ID_ABSENT, + "Extended attribute trusted.glusterfs." + "volume-id is absent"); + gf_event(EVENT_POSIX_BRICK_NOT_IN_VOLUME, "brick=%s:%s", + _private->hostname, _private->base_path); + ret = -1; + goto out; + + } else if ((size == -1) && (errno != ENODATA) && (errno != ENOATTR)) { + /* Wrong 'volume-id' is set, it should be error */ + gf_event(EVENT_POSIX_BRICK_VERIFICATION_FAILED, "brick=%s:%s", + _private->hostname, _private->base_path); + gf_msg(this->name, GF_LOG_WARNING, errno, + P_MSG_VOLUME_ID_FETCH_FAILED, + "%s: failed to fetch volume-id", dir_data->data); + ret = -1; + goto out; + } else { + ret = -1; + gf_event(EVENT_POSIX_BRICK_VERIFICATION_FAILED, "brick=%s:%s", + _private->hostname, _private->base_path); + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_VOLUME_ID_FETCH_FAILED, + "failed to fetch proper volume id from export"); + goto out; + } + } + + /* Now check if the export directory has some other 'gfid', + other than that of root '/' */ + size = sys_lgetxattr(dir_data->data, "trusted.gfid", gfid, 16); + if (size == 16) { + if (!__is_root_gfid(gfid)) 
{ + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_GFID_SET_FAILED, + "%s: gfid (%s) is not that of glusterfs '/' ", + dir_data->data, uuid_utoa(gfid)); + ret = -1; + goto out; + } + } else if (size != -1) { + /* Wrong 'gfid' is set, it should be error */ + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_GFID_SET_FAILED, + "%s: wrong value set as gfid", dir_data->data); + ret = -1; + goto out; + } else if ((size == -1) && (errno != ENODATA) && (errno != ENOATTR)) { + /* Wrong 'gfid' is set, it should be error */ + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_GFID_SET_FAILED, + "%s: failed to fetch gfid", dir_data->data); + ret = -1; + goto out; + } else { + /* First time volume, set the GFID */ + size = sys_lsetxattr(dir_data->data, "trusted.gfid", rootgfid, 16, + XATTR_CREATE); + if (size == -1) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_GFID_SET_FAILED, + "%s: failed to set gfid", dir_data->data); + ret = -1; + goto out; + } + } + + ret = 0; + + size = sys_lgetxattr(dir_data->data, POSIX_ACL_ACCESS_XATTR, NULL, 0); + if ((size < 0) && (errno == ENOTSUP)) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_ACL_NOTSUP, + "Posix access control list is not supported."); + gf_event(EVENT_POSIX_ACL_NOT_SUPPORTED, "brick=%s:%s", + _private->hostname, _private->base_path); + } + + /* + * _XOPEN_PATH_MAX is the longest file path len we MUST + * support according to POSIX standard. When prepended + * by the brick base path it may exceed backed filesystem + * capacity (which MAY be bigger than _XOPEN_PATH_MAX). If + * this is the case, chdir() to the brick base path and + * use relative paths when they are too long. 
See also + * MAKE_REAL_PATH in posix-handle.h + */ + _private->path_max = pathconf(_private->base_path, _PC_PATH_MAX); + if (_private->path_max != -1 && + _XOPEN_PATH_MAX + _private->base_path_length > _private->path_max) { + ret = chdir(_private->base_path); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_BASEPATH_CHDIR_FAILED, + "chdir() to \"%s\" failed", _private->base_path); + goto out; + } +#ifdef __NetBSD__ + /* + * At least on NetBSD, the chdir() above uncovers a + * race condition which cause file lookup to fail + * with ENODATA for a few seconds. The volume quickly + * reaches a sane state, but regression tests are fast + * enough to choke on it. The reason is obscure (as + * often with race conditions), but sleeping here for + * a second seems to workaround the problem. + */ + sleep(1); +#endif + } + + LOCK_INIT(&_private->lock); + GF_ATOMIC_INIT(_private->read_value, 0); + GF_ATOMIC_INIT(_private->write_value, 0); + + _private->export_statfs = 1; + tmp_data = dict_get(this->options, "export-statfs-size"); + if (tmp_data) { + if (gf_string2boolean(tmp_data->data, &_private->export_statfs) == -1) { + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_INVALID_OPTION_VAL, + "'export-statfs-size' takes only boolean " + "options"); + goto out; + } + if (!_private->export_statfs) + gf_msg_debug(this->name, 0, "'statfs()' returns dummy size"); + } + + _private->background_unlink = 0; + tmp_data = dict_get(this->options, "background-unlink"); + if (tmp_data) { + if (gf_string2boolean(tmp_data->data, &_private->background_unlink) == + -1) { + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_INVALID_OPTION_VAL, + "'background-unlink'" + " takes only boolean options"); + goto out; + } + + if (_private->background_unlink) + gf_msg_debug(this->name, 0, + "unlinks will be performed in background"); + } + + tmp_data = dict_get(this->options, "o-direct"); + if (tmp_data) { + if (gf_string2boolean(tmp_data->data, &_private->o_direct) == -1) { + ret = -1; 
+ gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_INVALID_OPTION_VAL, + "wrong option provided for 'o-direct'"); + goto out; + } + if (_private->o_direct) + gf_msg_debug(this->name, 0, + "o-direct mode is enabled" + " (O_DIRECT for every open)"); + } + + tmp_data = dict_get(this->options, "update-link-count-parent"); + if (tmp_data) { + if (gf_string2boolean(tmp_data->data, &_private->update_pgfid_nlinks) == + -1) { + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_INVALID_OPTION, + "wrong value provided " + "for 'update-link-count-parent'"); + goto out; + } + if (_private->update_pgfid_nlinks) + gf_msg_debug(this->name, 0, + "update-link-count-parent" + " is enabled. Thus for each file an " + "extended attribute representing the " + "number of hardlinks for that file " + "within the same parent directory is" + " set."); + } + + ret = dict_get_str(this->options, "glusterd-uuid", &guuid); + if (!ret) { + if (gf_uuid_parse(guuid, _private->glusterd_uuid)) + gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_INVALID_NODE_UUID, + "Cannot parse " + "glusterd (node) UUID, node-uuid xattr " + "request would return - \"No such attribute\""); + } else { + gf_msg_debug(this->name, 0, + "No glusterd (node) UUID passed -" + " node-uuid xattr request will return \"No such" + " attribute\""); + } + ret = 0; + + GF_OPTION_INIT("janitor-sleep-duration", _private->janitor_sleep_duration, + int32, out); + + /* performing open dir on brick dir locks the brick dir + * and prevents it from being unmounted + */ + _private->mount_lock = sys_open(dir_data->data, (O_DIRECTORY | O_RDONLY), + 0); + if (_private->mount_lock < 0) { + ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_DIR_OPERATION_FAILED, + "Could not lock brick directory (%s)", strerror(op_errno)); + goto out; + } +#ifndef GF_DARWIN_HOST_OS + { + struct rlimit lim; + lim.rlim_cur = 1048576; + lim.rlim_max = 1048576; + + if (setrlimit(RLIMIT_NOFILE, &lim) == -1) { + gf_msg(this->name, GF_LOG_WARNING, errno, 
P_MSG_SET_ULIMIT_FAILED, + "Failed to set 'ulimit -n " + " 1048576'"); + lim.rlim_cur = 65536; + lim.rlim_max = 65536; + + if (setrlimit(RLIMIT_NOFILE, &lim) == -1) { + gf_msg(this->name, GF_LOG_WARNING, errno, + P_MSG_SET_FILE_MAX_FAILED, + "Failed to set maximum allowed open " + "file descriptors to 64k"); + } else { + gf_msg(this->name, GF_LOG_INFO, 0, P_MSG_MAX_FILE_OPEN, + "Maximum allowed " + "open file descriptors set to 65536"); + } + } + } +#endif + _private->shared_brick_count = 1; + ret = dict_get_int32(this->options, "shared-brick-count", + &_private->shared_brick_count); + if (ret == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_INVALID_OPTION_VAL, + "'shared-brick-count' takes only integer " + "values"); + goto out; + } + + this->private = (void *)_private; + snprintf(dir_handle, sizeof(dir_handle), "%s/%s", _private->base_path, + GF_HIDDEN_PATH); + hdirfd = posix_create_open_directory_based_fd(this, _private->mount_lock, + dir_handle); + if (hdirfd < 0) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_HANDLE_CREATE, + "error open directory failed for dir %s", dir_handle); + ret = -1; + goto out; + } + _private->dirfd = hdirfd; + for (i = 0; i < 256; i++) { + snprintf(fhash, sizeof(fhash), "%02x", i); + _private->arrdfd[i] = posix_create_open_directory_based_fd(this, hdirfd, + fhash); + if (_private->arrdfd[i] < 0) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_HANDLE_CREATE, + "error openat failed for file %s", fhash); + ret = -1; + goto out; + } + } + + op_ret = posix_handle_init(this); + if (op_ret == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_HANDLE_CREATE, + "Posix handle setup failed"); + ret = -1; + goto out; + } + + op_ret = posix_handle_trash_init(this); + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_HANDLE_CREATE_TRASH, + "Posix landfill setup failed"); + ret = -1; + goto out; + } + + op_ret = posix_create_unlink_dir(this); + if (op_ret == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_HANDLE_CREATE, + 
"Creation of unlink directory failed"); + ret = -1; + goto out; + } + + _private->aio_init_done = _gf_false; + _private->aio_capable = _gf_false; + + GF_OPTION_INIT("brick-uid", uid, int32, out); + GF_OPTION_INIT("brick-gid", gid, int32, out); + if (uid != -1 || gid != -1) + posix_set_owner(this, uid, gid); + + GF_OPTION_INIT("linux-aio", _private->aio_configured, bool, out); + + if (_private->aio_configured) { + op_ret = posix_aio_on(this); + + if (op_ret == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_POSIX_AIO, + "Posix AIO init failed"); + ret = -1; + goto out; + } + } + + GF_OPTION_INIT("node-uuid-pathinfo", _private->node_uuid_pathinfo, bool, + out); + if (_private->node_uuid_pathinfo && + (gf_uuid_is_null(_private->glusterd_uuid))) { + gf_msg(this->name, GF_LOG_INFO, 0, P_MSG_UUID_NULL, + "glusterd uuid is NULL, pathinfo xattr would" + " fallback to <hostname>:<export>"); + } + + _private->disk_space_check_active = _gf_false; + _private->disk_space_full = 0; + + GF_OPTION_INIT("reserve", _private->disk_reserve, percent_or_size, out); + + /* option can be any one of percent or bytes */ + _private->disk_unit = 0; + if (_private->disk_reserve < 100.0) + _private->disk_unit = 'p'; + + if (_private->disk_reserve) { + ret = posix_spawn_disk_space_check_thread(this); + if (ret) { + gf_msg(this->name, GF_LOG_INFO, 0, P_MSG_DISK_SPACE_CHECK_FAILED, + "Getting disk space check from thread failed "); + goto out; + } + } + + _private->health_check_active = _gf_false; + GF_OPTION_INIT("health-check-interval", _private->health_check_interval, + uint32, out); + GF_OPTION_INIT("health-check-timeout", _private->health_check_timeout, + uint32, out); + if (_private->health_check_interval) { + ret = posix_spawn_health_check_thread(this); + if (ret) + goto out; + } + posix_janitor_timer_start(this); + + pthread_mutex_init(&_private->fsync_mutex, NULL); + pthread_cond_init(&_private->fsync_cond, NULL); + pthread_mutex_init(&_private->janitor_mutex, NULL); + 
pthread_cond_init(&_private->janitor_cond, NULL); + pthread_cond_init(&_private->fd_cond, NULL); + INIT_LIST_HEAD(&_private->fsyncs); + _private->rel_fdcount = 0; + ret = posix_spawn_ctx_janitor_thread(this); + if (ret) + goto out; + + ret = gf_thread_create(&_private->fsyncer, NULL, posix_fsyncer, this, + "posixfsy"); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, errno, + P_MSG_FSYNCER_THREAD_CREATE_FAILED, + "fsyncer thread creation failed"); + goto out; + } + + GF_OPTION_INIT("batch-fsync-mode", batch_fsync_mode_str, str, out); + + if (set_batch_fsync_mode(_private, batch_fsync_mode_str) != 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_INVALID_ARGUMENT, + "Unknown mode string: %s", batch_fsync_mode_str); + goto out; + } + + GF_OPTION_INIT("gfid2path", _private->gfid2path, bool, out); + + GF_OPTION_INIT("gfid2path-separator", gfid2path_sep, str, out); + if (set_gfid2path_separator(_private, gfid2path_sep) != 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_INVALID_ARGUMENT, + "Length of separator exceeds 7: %s", gfid2path_sep); + goto out; + } + +#ifdef GF_DARWIN_HOST_OS + + char *xattr_user_namespace_mode_str = NULL; + + GF_OPTION_INIT("xattr-user-namespace-mode", xattr_user_namespace_mode_str, + str, out); + + if (set_xattr_user_namespace_mode(_private, + xattr_user_namespace_mode_str) != 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_INVALID_ARGUMENT, + "Unknown xattr user namespace mode string: %s", + xattr_user_namespace_mode_str); + goto out; + } +#endif + + GF_OPTION_INIT("batch-fsync-delay-usec", _private->batch_fsync_delay_usec, + uint32, out); + + GF_OPTION_INIT("disable-landfill-purge", _private->disable_landfill_purge, + bool, out); + if (_private->disable_landfill_purge) { + gf_msg(this->name, GF_LOG_WARNING, 0, 0, + "Janitor WILL NOT purge the landfill directory. 
" + "Your landfill directory" + " may fill up this brick."); + } + + GF_OPTION_INIT("force-create-mode", force_create, int32, out); + _private->force_create_mode = force_create; + + GF_OPTION_INIT("force-directory-mode", force_directory, int32, out); + _private->force_directory_mode = force_directory; + + GF_OPTION_INIT("create-mask", create_mask, int32, out); + _private->create_mask = create_mask; + + GF_OPTION_INIT("create-directory-mask", create_directory_mask, int32, out); + _private->create_directory_mask = create_directory_mask; + + GF_OPTION_INIT("max-hardlinks", _private->max_hardlinks, uint32, out); + + GF_OPTION_INIT("fips-mode-rchecksum", _private->fips_mode_rchecksum, bool, + out); + + GF_OPTION_INIT("ctime", _private->ctime, bool, out); + +out: + if (ret) { + if (_private) { + if (_private->dirfd >= 0) { + sys_close(_private->dirfd); + _private->dirfd = -1; + } + + for (i = 0; i < 256; i++) { + if (_private->arrdfd[i] >= 0) { + sys_close(_private->arrdfd[i]); + _private->arrdfd[i] = -1; + } + } + /*unlock brick dir*/ + if (_private->mount_lock >= 0) { + (void)sys_close(_private->mount_lock); + _private->mount_lock = -1; + } + + GF_FREE(_private->base_path); + + GF_FREE(_private->hostname); + + GF_FREE(_private->trash_path); + + GF_FREE(_private); + } + + this->private = NULL; + } + return ret; +} + +void +posix_fini(xlator_t *this) +{ + struct posix_private *priv = this->private; + gf_boolean_t health_check = _gf_false; + glusterfs_ctx_t *ctx = this->ctx; + uint32_t count; + int ret = 0; + int i = 0; + + if (!priv) + return; + LOCK(&priv->lock); + { + health_check = priv->health_check_active; + priv->health_check_active = _gf_false; + } + UNLOCK(&priv->lock); + + if (priv->dirfd >= 0) { + sys_close(priv->dirfd); + priv->dirfd = -1; + } + + for (i = 0; i < 256; i++) { + if (priv->arrdfd[i] >= 0) { + sys_close(priv->arrdfd[i]); + priv->arrdfd[i] = -1; + } + } + + if (health_check) { + (void)gf_thread_cleanup_xint(priv->health_check); + priv->health_check 
= 0; + } + + if (priv->disk_space_check) { + priv->disk_space_check_active = _gf_false; + (void)gf_thread_cleanup_xint(priv->disk_space_check); + priv->disk_space_check = 0; + } + + if (priv->janitor) { + /*TODO: Make sure the synctask is also complete */ + ret = gf_tw_del_timer(this->ctx->tw->timer_wheel, priv->janitor); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_TIMER_DELETE_FAILED, + "Failed to delete janitor timer"); + } + GF_FREE(priv->janitor); + priv->janitor = NULL; + } + + pthread_mutex_lock(&ctx->fd_lock); + { + count = --ctx->pxl_count; + if (count == 0) { + pthread_cond_signal(&ctx->fd_cond); + } + } + pthread_mutex_unlock(&ctx->fd_lock); + + if (count == 0) { + pthread_join(ctx->janitor, NULL); + } + + if (priv->fsyncer) { + (void)gf_thread_cleanup_xint(priv->fsyncer); + priv->fsyncer = 0; + } + /*unlock brick dir*/ + if (priv->mount_lock >= 0) { + (void)sys_close(priv->mount_lock); + priv->mount_lock = -1; + } + + GF_FREE(priv->base_path); + LOCK_DESTROY(&priv->lock); + pthread_mutex_destroy(&priv->fsync_mutex); + pthread_cond_destroy(&priv->fsync_cond); + pthread_mutex_destroy(&priv->janitor_mutex); + pthread_cond_destroy(&priv->janitor_cond); + GF_FREE(priv->hostname); + GF_FREE(priv->trash_path); + GF_FREE(priv); + this->private = NULL; + + return; +} + +struct volume_options posix_options[] = { + {.key = {"o-direct"}, .type = GF_OPTION_TYPE_BOOL}, + {.key = {"directory"}, + .type = GF_OPTION_TYPE_PATH, + .default_value = "{{brick.path}}"}, + {.key = {"hostname"}, .type = GF_OPTION_TYPE_ANY}, + {.key = {"export-statfs-size"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on"}, + {.key = {"mandate-attribute"}, .type = GF_OPTION_TYPE_BOOL}, + {.key = {"background-unlink"}, .type = GF_OPTION_TYPE_BOOL}, + {.key = {"janitor-sleep-duration"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .validate = GF_OPT_VALIDATE_MIN, + .default_value = "10", + .description = "Interval (in seconds) between times the internal " + "'landfill' 
directory is emptied."}, + {.key = {"volume-id"}, + .type = GF_OPTION_TYPE_ANY, + .default_value = "{{brick.volumeid}}"}, + {.key = {"glusterd-uuid"}, .type = GF_OPTION_TYPE_STR}, + {.key = {"linux-aio"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "Support for native Linux AIO", + .op_version = {1}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"brick-uid"}, + .type = GF_OPTION_TYPE_INT, + .min = -1, + .validate = GF_OPT_VALIDATE_MIN, + .default_value = "-1", + .description = "Support for setting uid of brick's owner", + .op_version = {1}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"brick-gid"}, + .type = GF_OPTION_TYPE_INT, + .min = -1, + .validate = GF_OPT_VALIDATE_MIN, + .default_value = "-1", + .description = "Support for setting gid of brick's owner", + .op_version = {1}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"node-uuid-pathinfo"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "return glusterd's node-uuid in pathinfo xattr" + " string instead of hostname", + .op_version = {3}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"health-check-interval"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .default_value = "30", + .validate = GF_OPT_VALIDATE_MIN, + .description = "Interval in seconds for a filesystem health check, " + "set to 0 to disable", + .op_version = {3}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"health-check-timeout"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .default_value = "20", + .validate = GF_OPT_VALIDATE_MIN, + .description = + "Interval in seconds to wait aio_write finish for health check, " + "set to 0 to disable", + .op_version = {GD_OP_VERSION_4_0_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"reserve"}, + .type = GF_OPTION_TYPE_PERCENT_OR_SIZET, + .min = 0, + .default_value = "1", + .validate = GF_OPT_VALIDATE_MIN, + .description = "Percentage/Size of disk space to be reserved." 
+ " Set to 0 to disable", + .op_version = {GD_OP_VERSION_3_13_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"batch-fsync-mode"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "reverse-fsync", + .description = + "Possible values:\n" + "\t- syncfs: Perform one syncfs() on behalf oa batch" + "of fsyncs.\n" + "\t- syncfs-single-fsync: Perform one syncfs() on behalf of a batch" + " of fsyncs and one fsync() per batch.\n" + "\t- syncfs-reverse-fsync: Perform one syncfs() on behalf of a batch" + " of fsyncs and fsync() each file in the batch in reverse order.\n" + " in reverse order.\n" + "\t- reverse-fsync: Perform fsync() of each file in the batch in" + " reverse order.", + .op_version = {3}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"batch-fsync-delay-usec"}, + .type = GF_OPTION_TYPE_INT, + .default_value = "0", + .description = "Num of usecs to wait for aggregating fsync" + " requests", + .op_version = {3}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"update-link-count-parent"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "Enable placeholders for gfid to path conversion", + .op_version = {GD_OP_VERSION_3_6_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"gfid2path"}, + .type = GF_OPTION_TYPE_BOOL, +#ifdef __NetBSD__ + /* + * NetBSD storage of extended attributes for UFS1 badly + * scales when the list of extended attributes names rises. + * This option can add as many extended attributes names + * as we have files, hence we keep it disabled for performance + * sake. 
+ */ + .default_value = "off", +#else + .default_value = "on", +#endif + .description = "Enable logging metadata for gfid to path conversion", + .op_version = {GD_OP_VERSION_3_12_0}, + .flags = OPT_FLAG_SETTABLE}, + {.key = {"gfid2path-separator"}, + .type = GF_OPTION_TYPE_STR, + .default_value = ":", + .description = "Path separator for glusterfs.gfidtopath virt xattr", + .op_version = {GD_OP_VERSION_3_12_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, +#if GF_DARWIN_HOST_OS + {.key = {"xattr-user-namespace-mode"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "none", + .description = + "Option to control XATTR user namespace on the raw filesystem: " + "\t- None: Will use the user namespace, so files will be exchangeable " + "with Linux.\n" + " The raw filesystem will not be compatible with OS X Finder.\n" + "\t- Strip: Will strip the user namespace before setting. The raw " + "filesystem will work in OS X.\n", + .op_version = {GD_OP_VERSION_3_6_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, +#endif + { + .key = {"shared-brick-count"}, + .type = GF_OPTION_TYPE_INT, + .default_value = "1", + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .description = + "Number of bricks sharing the same backend export." + " Useful for displaying the proper usable size through statvfs() " + "call (df command)", + }, + { + .key = {"disable-landfill-purge"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "Disable glusterfs/landfill purges. 
" + "WARNING: This can fill up a brick.", + .op_version = {GD_OP_VERSION_4_0_0}, + .tags = {"diagnosis"}, + }, + {.key = {"force-create-mode"}, + .type = GF_OPTION_TYPE_INT, + .min = 0000, + .max = 0777, + .default_value = "0000", + .validate = GF_OPT_VALIDATE_BOTH, + .description = "Mode bit permission that will always be set on a file."}, + {.key = {"force-directory-mode"}, + .type = GF_OPTION_TYPE_INT, + .min = 0000, + .max = 0777, + .default_value = "0000", + .validate = GF_OPT_VALIDATE_BOTH, + .description = "Mode bit permission that will be always set on directory"}, + {.key = {"create-mask"}, + .type = GF_OPTION_TYPE_INT, + .min = 0000, + .max = 0777, + .default_value = "0777", + .validate = GF_OPT_VALIDATE_BOTH, + .description = "Any bit not set here will be removed from the" + "modes set on a file when it is created"}, + {.key = {"create-directory-mask"}, + .type = GF_OPTION_TYPE_INT, + .min = 0000, + .max = 0777, + .default_value = "0777", + .validate = GF_OPT_VALIDATE_BOTH, + .description = "Any bit not set here will be removed from the" + "modes set on a directory when it is created"}, + {.key = {"max-hardlinks"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .default_value = "100", + .op_version = {GD_OP_VERSION_4_0_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"posix"}, + .validate = GF_OPT_VALIDATE_MIN, + .description = "max number of hardlinks allowed on any one inode.\n" + "0 is unlimited, 1 prevents any hardlinking at all."}, + {.key = {"fips-mode-rchecksum"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .op_version = {GD_OP_VERSION_4_0_0}, + .flags = OPT_FLAG_SETTABLE, + .tags = {"posix"}, + .description = "If enabled, posix_rchecksum uses the FIPS compliant" + "SHA256 checksum. 
MD5 otherwise."}, + {.key = {"ctime"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .op_version = {GD_OP_VERSION_4_1_0}, + .tags = {"ctime"}, + .description = + "When this option is enabled, time attributes (ctime,mtime,atime) " + "are stored in xattr to keep it consistent across replica and " + "distribute set. The time attributes stored at the backend are " + "not considered "}, + {.key = {NULL}}, +}; diff --git a/xlators/storage/posix/src/posix-entry-ops.c b/xlators/storage/posix/src/posix-entry-ops.c new file mode 100644 index 00000000000..8cc3ccf8c00 --- /dev/null +++ b/xlators/storage/posix/src/posix-entry-ops.c @@ -0,0 +1,2496 @@ +/* + Copyright (c) 2006-2017 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ +#define __XOPEN_SOURCE 500 + +/* for SEEK_HOLE and SEEK_DATA */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include <openssl/md5.h> +#include <stdint.h> +#include <sys/time.h> +#include <sys/resource.h> +#include <errno.h> +#include <libgen.h> +#include <pthread.h> +#include <ftw.h> +#include <sys/stat.h> +#include <signal.h> +#include <sys/uio.h> +#include <unistd.h> + +#ifndef GF_BSD_HOST_OS +#include <alloca.h> +#endif /* GF_BSD_HOST_OS */ + +#ifdef HAVE_LINKAT +#include <fcntl.h> +#endif /* HAVE_LINKAT */ + +#include <glusterfs/dict.h> +#include <glusterfs/logging.h> +#include "posix.h" +#include "posix-handle.h" +#include <glusterfs/compat-errno.h> +#include <glusterfs/compat.h> +#include <glusterfs/byte-order.h> +#include <glusterfs/syscall.h> +#include <glusterfs/statedump.h> +#include <glusterfs/locking.h> +#include <glusterfs/timer.h> +#include "glusterfs3-xdr.h" +#include "posix-aio.h" +#include <glusterfs/glusterfs-acl.h> +#include "posix-messages.h" +#include "posix-metadata.h" +#include <glusterfs/events.h> +#include "posix-gfid-path.h" +#include <glusterfs/compat-uuid.h> +#include <glusterfs/syncop.h> + +extern char *marker_xattrs[]; +#define ALIGN_SIZE 4096 + +#undef HAVE_SET_FSID +#ifdef HAVE_SET_FSID + +#define DECLARE_OLD_FS_ID_VAR \ + uid_t old_fsuid; \ + gid_t old_fsgid; + +#define SET_FS_ID(uid, gid) \ + do { \ + old_fsuid = setfsuid(uid); \ + old_fsgid = setfsgid(gid); \ + } while (0) + +#define SET_TO_OLD_FS_ID() \ + do { \ + setfsuid(old_fsuid); \ + setfsgid(old_fsgid); \ + } while (0) + +#else + +#define DECLARE_OLD_FS_ID_VAR +#define SET_FS_ID(uid, gid) +#define SET_TO_OLD_FS_ID() + +#endif + +static gf_boolean_t +posix_symlinks_match(xlator_t *this, loc_t *loc, uuid_t gfid) +{ + struct posix_private *priv = NULL; + char linkname_actual[PATH_MAX] = { + 0, + }; + char linkname_expected[PATH_MAX] = {0}; + char *dir_handle = NULL; + ssize_t len = 0; + size_t handle_size = 0; + gf_boolean_t ret = _gf_false; + + priv = 
this->private; + handle_size = POSIX_GFID_HANDLE_SIZE(priv->base_path_length); + dir_handle = alloca0(handle_size); + + snprintf(linkname_expected, PATH_MAX, "../../%02x/%02x/%s/%s", + loc->pargfid[0], loc->pargfid[1], uuid_utoa(loc->pargfid), + loc->name); + + MAKE_HANDLE_GFID_PATH(dir_handle, this, gfid); + len = sys_readlink(dir_handle, linkname_actual, PATH_MAX); + if (len < 0 || len == PATH_MAX) { + if (len == PATH_MAX) { + errno = EINVAL; + } + + if (errno != ENOENT) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "readlink[%s] failed", dir_handle); + } + goto out; + } + linkname_actual[len] = '\0'; + + if (!strcmp(linkname_actual, linkname_expected)) + ret = _gf_true; + +out: + return ret; +} + +static dict_t * +posix_dict_set_nlink(dict_t *req, dict_t *res, int32_t nlink) +{ + int ret = -1; + + if (req == NULL || !dict_get_sizen(req, GF_REQUEST_LINK_COUNT_XDATA)) + goto out; + + if (res == NULL) + res = dict_new(); + if (res == NULL) + goto out; + + ret = dict_set_uint32(res, GF_RESPONSE_LINK_COUNT_XDATA, nlink); + if (ret == -1) + gf_msg("posix", GF_LOG_WARNING, 0, P_MSG_SET_XDATA_FAIL, + "Failed to set GF_RESPONSE_LINK_COUNT_XDATA"); +out: + return res; +} + +/* Regular fops */ + +int32_t +posix_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + struct iatt buf = { + 0, + }; + int32_t op_ret = -1; + int32_t entry_ret = 0; + int32_t op_errno = 0; + dict_t *xattr = NULL; + char *real_path = NULL; + char *par_path = NULL; + char *gfid_path = NULL; + uuid_t gfid = {0}; + struct iatt postparent = { + 0, + }; + struct stat statbuf = {0}; + int32_t gfidless = 0; + char *pgfid_xattr_key = NULL; + int32_t nlink_samepgfid = 0; + struct posix_private *priv = NULL; + posix_inode_ctx_t *ctx = NULL; + int ret = 0; + int dfd = -1; + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(loc, out); + VALIDATE_OR_GOTO(this->private, out); + + priv = this->private; + + /* The Hidden directory should be 
for housekeeping purpose and it + should not get any gfid on it */ + if (__is_root_gfid(loc->pargfid) && loc->name && + (strcmp(loc->name, GF_HIDDEN_PATH) == 0)) { + gf_msg(this->name, GF_LOG_WARNING, EPERM, P_MSG_LOOKUP_NOT_PERMITTED, + "Lookup issued on %s," + " which is not permitted", + GF_HIDDEN_PATH); + op_errno = EPERM; + op_ret = -1; + goto out; + } + +#ifdef __NetBSD__ + /* Same for NetBSD's .attribute directory */ + if (__is_root_gfid(loc->pargfid) && loc->name && + (strcmp(loc->name, ".attribute") == 0)) { + gf_msg(this->name, GF_LOG_WARNING, EPERM, P_MSG_LOOKUP_NOT_PERMITTED, + "Lookup issued on .attribute," + " which is not permitted"); + op_errno = EPERM; + op_ret = -1; + goto out; + } +#endif /* __NetBSD__ */ + + op_ret = dict_get_int32_sizen(xdata, GF_GFIDLESS_LOOKUP, &gfidless); + op_ret = -1; + if (gf_uuid_is_null(loc->pargfid) || (loc->name == NULL)) { + /* nameless lookup */ + MAKE_INODE_HANDLE(real_path, this, loc, &buf); + } else { + MAKE_ENTRY_HANDLE(real_path, par_path, this, loc, &buf); + if (!real_path || !par_path) { + op_ret = -1; + op_errno = ESTALE; + goto out; + } + if (gf_uuid_is_null(loc->inode->gfid)) { + op_ret = posix_gfid_heal(this, real_path, loc, xdata); + if (op_ret < 0) { + op_errno = -op_ret; + op_ret = -1; + goto out; + } + MAKE_ENTRY_HANDLE(real_path, par_path, this, loc, &buf); + } + } + + op_errno = errno; + + if (op_ret == -1) { + if (op_errno != ENOENT) { + gf_msg(this->name, GF_LOG_WARNING, op_errno, P_MSG_LSTAT_FAILED, + "lstat on %s failed", real_path ? 
real_path : "null"); + } + entry_ret = -1; + if (loc_is_nameless(loc)) { + if (!op_errno) + op_errno = ESTALE; + loc_gfid(loc, gfid); + MAKE_HANDLE_ABSPATH_FD(gfid_path, this, gfid, dfd); + ret = sys_fstatat(dfd, gfid_path, &statbuf, 0); + if (ret == 0 && ((statbuf.st_mode & S_IFMT) == S_IFDIR)) + /*Don't unset if it was a symlink to a dir.*/ + goto parent; + ret = sys_fstatat(dfd, gfid_path, &statbuf, AT_SYMLINK_NOFOLLOW); + if (ret == 0 && statbuf.st_nlink == 1) { + gf_msg(this->name, GF_LOG_WARNING, op_errno, + P_MSG_HANDLE_DELETE, + "Found stale gfid " + "handle %s, removing it.", + gfid_path); + posix_handle_unset(this, gfid, NULL); + } + } + goto parent; + } + + if (xdata && (op_ret == 0)) { + xattr = posix_xattr_fill(this, real_path, loc, NULL, -1, xdata, &buf); + + posix_cs_maintenance(this, NULL, loc, NULL, &buf, real_path, xdata, + &xattr, _gf_true); + + if (dict_get_sizen(xdata, GF_CLEAN_WRITE_PROTECTION)) { + ret = sys_lremovexattr(real_path, GF_PROTECT_FROM_EXTERNAL_WRITES); + if (ret == -1 && (errno != ENODATA && errno != ENOATTR)) + gf_msg(this->name, GF_LOG_ERROR, P_MSG_XATTR_NOT_REMOVED, errno, + "removexattr failed. 
key %s path %s", + GF_PROTECT_FROM_EXTERNAL_WRITES, loc->path); + } + } + + posix_update_iatt_buf(&buf, -1, real_path, xdata); + if (priv->update_pgfid_nlinks) { + if (!gf_uuid_is_null(loc->pargfid) && !IA_ISDIR(buf.ia_type)) { + MAKE_PGFID_XATTR_KEY(pgfid_xattr_key, PGFID_XATTR_KEY_PREFIX, + loc->pargfid); + + op_ret = posix_inode_ctx_get_all(loc->inode, this, &ctx); + if (op_ret < 0) { + op_errno = ENOMEM; + goto out; + } + + pthread_mutex_lock(&ctx->pgfid_lock); + { + SET_PGFID_XATTR_IF_ABSENT(real_path, pgfid_xattr_key, + nlink_samepgfid, XATTR_CREATE, op_ret, + this, unlock); + } + unlock: + pthread_mutex_unlock(&ctx->pgfid_lock); + } + } + +parent: + if (par_path) { + op_ret = posix_pstat(this, loc->parent, loc->pargfid, par_path, + &postparent, _gf_false); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "post-operation lstat on" + " parent %s failed", + par_path); + if (op_errno == ENOENT) + /* If parent directory is missing in a lookup, + errno should be ESTALE (bad handle) and not + ENOENT (missing entry) + */ + op_errno = ESTALE; + goto out; + } + } + + op_ret = entry_ret; +out: + if (!op_ret && !gfidless && gf_uuid_is_null(buf.ia_gfid)) { + gf_msg(this->name, GF_LOG_ERROR, ENODATA, P_MSG_NULL_GFID, + "buf->ia_gfid is null for " + "%s", + (real_path) ? real_path : ""); + op_ret = -1; + op_errno = ENODATA; + } + + if (op_ret == 0) + op_errno = 0; + STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, + (loc) ? 
loc->inode : NULL, &buf, xattr, &postparent); + + if (xattr) + dict_unref(xattr); + + return 0; +} + +static int32_t +posix_set_gfid2path_xattr(xlator_t *this, const char *path, uuid_t pgfid, + const char *bname) +{ + char xxh64[GF_XXH64_DIGEST_LENGTH * 2 + 1] = { + 0, + }; + char pgfid_bname[1024] = { + 0, + }; + char *key = NULL; + const size_t key_size = GFID2PATH_XATTR_KEY_PREFIX_LENGTH + + GF_XXH64_DIGEST_LENGTH * 2 + 1; + int ret = 0; + int len; + + len = snprintf(pgfid_bname, sizeof(pgfid_bname), "%s/%s", uuid_utoa(pgfid), + bname); + gf_xxh64_wrapper((unsigned char *)pgfid_bname, len, + GF_XXHSUM64_DEFAULT_SEED, xxh64); + key = alloca(key_size); + snprintf(key, key_size, GFID2PATH_XATTR_KEY_PREFIX "%s", xxh64); + + ret = sys_lsetxattr(path, key, pgfid_bname, len, XATTR_CREATE); + if (ret == -1) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_PGFID_OP, + "setting gfid2path xattr failed on %s: key = %s ", path, key); + } + + return ret; +} + +int +posix_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t dev, mode_t umask, dict_t *xdata) +{ + int tmp_fd = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + char *real_path = 0; + char *par_path = 0; + struct iatt stbuf = { + 0, + }; + struct posix_private *priv = NULL; + gid_t gid = 0; + struct iatt preparent = { + 0, + }; + struct iatt postparent = { + 0, + }; + uuid_t uuid_req = { + 0, + }; + int32_t nlink_samepgfid = 0; + char *pgfid_xattr_key = NULL; + gf_boolean_t entry_created = _gf_false, gfid_set = _gf_false; + gf_boolean_t linked = _gf_false; + gf_loglevel_t level = GF_LOG_NONE; + mode_t mode_bit = 0; + posix_inode_ctx_t *ctx = NULL; + + DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(loc, out); + + priv = this->private; + VALIDATE_OR_GOTO(priv, out); + GFID_NULL_CHECK_AND_GOTO(frame, this, loc, xdata, op_ret, op_errno, + uuid_req, out); + MAKE_ENTRY_HANDLE(real_path, par_path, this, loc, NULL); + + mode_bit = 
(priv->create_mask & mode) | priv->force_create_mode; + mode = posix_override_umask(mode, mode_bit); + + gid = frame->root->gid; + + SET_FS_ID(frame->root->uid, gid); + DISK_SPACE_CHECK_AND_GOTO(frame, priv, xdata, op_ret, op_errno, out); + if (!real_path || !par_path) { + op_ret = -1; + op_errno = ESTALE; + goto out; + } + + op_ret = posix_pstat(this, loc->parent, loc->pargfid, par_path, &preparent, + _gf_false); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "pre-operation lstat on parent of %s failed", real_path); + goto out; + } + + if (preparent.ia_prot.sgid) { + gid = preparent.ia_gid; + } + + /* Check if the 'gfid' already exists, because this mknod may be an + internal call from distribute for creating 'linkfile', and that + linkfile may be for a hardlinked file */ + if (dict_get_sizen(xdata, GLUSTERFS_INTERNAL_FOP_KEY)) { + dict_del_sizen(xdata, GLUSTERFS_INTERNAL_FOP_KEY); + /* trash xlator did not bring the uuid_via the call + * to GFID_NULL_CHECK_AND_GOTO() above. + * Fetch it explicitly here. 
+ */ + if (frame->root->pid == GF_SERVER_PID_TRASH) { + op_ret = dict_get_gfuuid(xdata, "gfid-req", &uuid_req); + if (op_ret) { + gf_msg_debug(this->name, 0, + "failed to get the gfid from dict for %s", + loc->path); + goto real_op; + } + } + + op_ret = posix_create_link_if_gfid_exists(this, uuid_req, real_path, + loc->inode->table); + if (!op_ret) { + linked = _gf_true; + goto post_op; + } + } + +real_op: +#ifdef __NetBSD__ + if (S_ISFIFO(mode)) + op_ret = mkfifo(real_path, mode); + else +#endif /* __NetBSD__ */ + op_ret = sys_mknod(real_path, mode, dev); + + if (op_ret == -1) { + op_errno = errno; + if ((op_errno == EINVAL) && S_ISREG(mode)) { + /* Over Darwin, mknod with (S_IFREG|mode) + doesn't work */ + tmp_fd = sys_creat(real_path, mode); + if (tmp_fd == -1) { + /* Report why creat() failed instead of the stale EINVAL + left over from the mknod attempt. */ + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_CREATE_FAILED, + "create failed on " + "%s", + real_path); + goto out; + } + sys_close(tmp_fd); + } else { + if (op_errno == EEXIST) + level = GF_LOG_DEBUG; + else + level = GF_LOG_ERROR; + gf_msg(this->name, level, errno, P_MSG_MKNOD_FAILED, + "mknod on %s failed", real_path); + goto out; + } + } + + entry_created = _gf_true; + +#ifndef HAVE_SET_FSID + op_ret = sys_lchown(real_path, frame->root->uid, gid); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LCHOWN_FAILED, + "lchown on %s failed", real_path); + goto out; + } +#endif + +post_op: + op_ret = posix_acl_xattr_set(this, real_path, xdata); + if (op_ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_ACL_FAILED, + "setting ACLs on %s failed", real_path); + } + + if (priv->update_pgfid_nlinks) { + MAKE_PGFID_XATTR_KEY(pgfid_xattr_key, PGFID_XATTR_KEY_PREFIX, + loc->pargfid); + op_ret = posix_inode_ctx_get_all(loc->inode, this, &ctx); + if (op_ret < 0) { + op_errno = ENOMEM; + goto out; + } + + pthread_mutex_lock(&ctx->pgfid_lock); + { + LINK_MODIFY_PGFID_XATTR(real_path, pgfid_xattr_key, nlink_samepgfid, + 0, op_ret, this, unlock); + } + unlock: +
pthread_mutex_unlock(&ctx->pgfid_lock); + } + + if (priv->gfid2path) { + posix_set_gfid2path_xattr(this, real_path, loc->pargfid, loc->name); + } + + op_ret = posix_entry_create_xattr_set(this, loc, real_path, xdata); + if (op_ret) { + if (errno != EEXIST) + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_XATTR_FAILED, + "setting xattrs on %s failed", real_path); + else + gf_msg_debug(this->name, 0, "setting xattrs on %s failed", + real_path); + } + + if (!linked) { + op_ret = posix_gfid_set(this, real_path, loc, xdata, frame->root->pid, + &op_errno); + if (op_ret) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_GFID_FAILED, + "setting gfid on %s failed", real_path); + goto out; + } else { + gfid_set = _gf_true; + } + } + + op_ret = posix_pstat(this, loc->inode, NULL, real_path, &stbuf, _gf_false); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_MKNOD_FAILED, + "mknod on %s failed", real_path); + goto out; + } + + posix_set_ctime(frame, this, real_path, -1, loc->inode, &stbuf); + + op_ret = posix_pstat(this, loc->parent, loc->pargfid, par_path, &postparent, + _gf_false); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "post-operation lstat on parent %s failed", par_path); + goto out; + } + + posix_set_parent_ctime(frame, this, par_path, -1, loc->parent, &postparent); + + op_ret = 0; + +out: + SET_TO_OLD_FS_ID(); + + if (op_ret < 0) { + /* Undo the entry we created. Only a directory needs rmdir(); + FIFOs, device nodes and sockets must be unlinked like regular + files, otherwise the half-created entry is left behind. */ + if (entry_created) { + if (S_ISDIR(mode)) + sys_rmdir(real_path); + else + sys_unlink(real_path); + } + + if (gfid_set) + posix_gfid_unset(this, xdata); + } + + STACK_UNWIND_STRICT(mknod, frame, op_ret, op_errno, + (loc) ? 
loc->inode : NULL, &stbuf, &preparent, + &postparent, NULL); + + return 0; +} + +int +posix_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + char *real_path = NULL, *gfid_path = NULL; + char *par_path = NULL, *xattr_name = NULL; + int xattr_name_len; + struct iatt stbuf = { + 0, + }; + struct posix_private *priv = NULL; + gid_t gid = 0; + struct iatt preparent = { + 0, + }; + struct iatt postparent = { + 0, + }; + gf_boolean_t entry_created = _gf_false, gfid_set = _gf_false; + uuid_t uuid_req = { + 0, + }; + ssize_t size = 0; + dict_t *xdata_rsp = NULL; + char *disk_xattr = NULL; + data_t *arg_data = NULL; + char pgfid[GF_UUID_BUF_SIZE] = {0}; + mode_t mode_bit = 0; + + DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(loc, out); + + /* The Hidden directory should be for housekeeping purpose and it + should not get created from a user request. loc->name is checked + for NULL before strcmp(), the same way posix_lookup() guards it. */ + if (__is_root_gfid(loc->pargfid) && loc->name && + (strcmp(loc->name, GF_HIDDEN_PATH) == 0)) { + gf_msg(this->name, GF_LOG_WARNING, EPERM, P_MSG_MKDIR_NOT_PERMITTED, + "mkdir issued on %s, which " + "is not permitted", + GF_HIDDEN_PATH); + op_errno = EPERM; + op_ret = -1; + goto out; + } + +#ifdef __NetBSD__ + /* Same for NetBSD's .attribute directory */ + if (__is_root_gfid(loc->pargfid) && loc->name && + (strcmp(loc->name, ".attribute") == 0)) { + gf_msg(this->name, GF_LOG_WARNING, EPERM, P_MSG_MKDIR_NOT_PERMITTED, + "mkdir issued on .attribute, which " + "is not permitted"); + op_errno = EPERM; + op_ret = -1; + goto out; + } +#endif + + priv = this->private; + VALIDATE_OR_GOTO(priv, out); + GFID_NULL_CHECK_AND_GOTO(frame, this, loc, xdata, op_ret, op_errno, + uuid_req, out); + DISK_SPACE_CHECK_AND_GOTO(frame, priv, xdata, op_ret, op_errno, out); + + MAKE_ENTRY_HANDLE(real_path, par_path, this, loc, NULL); + if (!real_path || !par_path) { + op_ret = -1; + op_errno = ESTALE; + goto out; + } +
+ gid = frame->root->gid; + + op_ret = posix_pstat(this, loc->inode, NULL, real_path, &stbuf, _gf_false); + + SET_FS_ID(frame->root->uid, gid); + + mode_bit = (priv->create_directory_mask & mode) | + priv->force_directory_mode; + mode = posix_override_umask(mode, mode_bit); + + if (xdata) { + if (!gf_uuid_compare(stbuf.ia_gfid, uuid_req)) { + op_ret = -1; + op_errno = EEXIST; + goto out; + } + } + + if (!gf_uuid_is_null(uuid_req)) { + op_ret = posix_istat(this, loc->inode, uuid_req, NULL, &stbuf); + if ((op_ret == 0) && IA_ISDIR(stbuf.ia_type)) { + gfid_path = alloca(PATH_MAX); + size = posix_handle_path(this, uuid_req, NULL, gfid_path, PATH_MAX); + if (size <= 0) { + op_errno = ESTALE; + op_ret = -1; + goto out; + } + + if (frame->root->pid != GF_CLIENT_PID_SELF_HEALD) { + gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_DIR_OF_SAME_ID, + "mkdir (%s): " + "gfid (%s) is already associated with " + "directory (%s). Hence, both " + "directories will share same gfid and " + "this can lead to inconsistencies.", + loc->path, uuid_utoa(uuid_req), + gfid_path ? gfid_path : "<NULL>"); + + gf_event(EVENT_POSIX_SAME_GFID, + "gfid=%s;" + "path=%s;newpath=%s;brick=%s:%s", + uuid_utoa(uuid_req), gfid_path ? 
gfid_path : "<NULL>", + loc->path, priv->hostname, priv->base_path); + } + if (!posix_symlinks_match(this, loc, uuid_req)) + /* For afr selfheal of dir renames, we need to + * remove the old symlink in order for + * posix_gfid_set to set the symlink to the + * new dir.*/ + posix_handle_unset(this, stbuf.ia_gfid, NULL); + } + } else if (frame->root->pid != GF_SERVER_PID_TRASH) { + op_ret = -1; + op_errno = EPERM; + gf_msg_callingfn(this->name, GF_LOG_WARNING, op_errno, P_MSG_NULL_GFID, + "mkdir (%s): is issued without " + "gfid-req %p", + loc->path, xdata); + goto out; + } + + op_ret = posix_pstat(this, loc->parent, loc->pargfid, par_path, &preparent, + _gf_false); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "pre-operation lstat on parent %s failed", par_path); + goto out; + } + + if (preparent.ia_prot.sgid) { + gid = preparent.ia_gid; + mode |= S_ISGID; + } + + op_ret = dict_get_str_sizen(xdata, GF_PREOP_PARENT_KEY, &xattr_name); + if (xattr_name != NULL) { + xattr_name_len = strlen(xattr_name); + arg_data = dict_getn(xdata, xattr_name, xattr_name_len); + if (arg_data) { + if (loc->parent) + gf_uuid_unparse(loc->parent->gfid, pgfid); + else + gf_uuid_unparse(loc->pargfid, pgfid); + + size = 256; + disk_xattr = GF_MALLOC(size + 1, gf_posix_mt_char); + if (!disk_xattr) { + op_ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, + P_MSG_PREOP_CHECK_FAILED, + "mkdir (%s/%s): GF_MALLOC failed during" + " preop of mkdir (%s)", + pgfid, loc->name, real_path); + goto out; + } + disk_xattr[size] = '\0'; + + size = sys_lgetxattr(par_path, xattr_name, disk_xattr, size); + if (size == -1) { + if (disk_xattr) { + GF_FREE(disk_xattr); + disk_xattr = NULL; + } + if (errno != ERANGE) { + op_ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, + P_MSG_PREOP_CHECK_FAILED, + "mkdir (%s/%s): getxattr failed during" + " preop of mkdir (%s).", + pgfid, loc->name, real_path); + goto out; + 
} + gf_msg(this->name, GF_LOG_INFO, errno, P_MSG_PREOP_CHECK_FAILED, + "mkdir (%s/%s): getxattr on key " + "(%s) path (%s) failed due to " + " buffer overflow", + pgfid, loc->name, xattr_name, par_path); + size = sys_lgetxattr(par_path, xattr_name, NULL, 0); + if (size == -1) { + op_ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, + P_MSG_PREOP_CHECK_FAILED, + "mkdir (%s/%s): getxattr on key (%s)" + " path (%s) failed ", + pgfid, loc->name, xattr_name, par_path); + goto out; + } + disk_xattr = GF_MALLOC(size + 1, gf_posix_mt_char); + if (!disk_xattr) { + op_ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, + P_MSG_PREOP_CHECK_FAILED, + "mkdir (%s/%s): GF_MALLOC failed during" + " preop of mkdir (%s)", + pgfid, loc->name, real_path); + goto out; + } + disk_xattr[size] = '\0'; + size = sys_lgetxattr(par_path, xattr_name, disk_xattr, size); + if (size == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, + P_MSG_PREOP_CHECK_FAILED, + "mkdir (%s/%s): getxattr on " + " key (%s) path (%s) failed " + "(%s)", + pgfid, loc->name, xattr_name, par_path, + strerror(errno)); + goto out; + } + } + if ((arg_data->len != size) || + (memcmp(arg_data->data, disk_xattr, size))) { + gf_msg(this->name, GF_LOG_INFO, EIO, P_MSG_PREOP_CHECK_FAILED, + "mkdir (%s/%s): failing preop of " + "mkdir (%s) as on-disk" + " xattr value differs from argument " + "value for key %s", + pgfid, loc->name, real_path, xattr_name); + op_ret = -1; + op_errno = EIO; + + xdata_rsp = dict_new(); + if (xdata_rsp == NULL) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, + P_MSG_PREOP_CHECK_FAILED, + "mkdir (%s/%s): " + "dict allocation failed", + pgfid, loc->name); + op_errno = ENOMEM; + goto out; + } + + op_errno = dict_set_int8(xdata_rsp, GF_PREOP_CHECK_FAILED, 1); + if (op_errno < 0) + op_errno = errno; + goto out; + } + + dict_deln(xdata, xattr_name, xattr_name_len); + } + + dict_del_sizen(xdata, GF_PREOP_PARENT_KEY); + } + + op_ret = 
sys_mkdir(real_path, mode); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_MKDIR_FAILED, + "mkdir of %s failed", real_path); + goto out; + } + + entry_created = _gf_true; + +#ifndef HAVE_SET_FSID + op_ret = sys_chown(real_path, frame->root->uid, gid); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_CHOWN_FAILED, + "chown on %s failed", real_path); + goto out; + } +#endif + op_ret = posix_acl_xattr_set(this, real_path, xdata); + if (op_ret) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_ACL_FAILED, + "setting ACLs on %s failed ", real_path); + } + + op_ret = posix_entry_create_xattr_set(this, loc, real_path, xdata); + if (op_ret) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_XATTR_FAILED, + "setting xattrs on %s failed", real_path); + } + + op_ret = posix_gfid_set(this, real_path, loc, xdata, frame->root->pid, + &op_errno); + if (op_ret) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_GFID_FAILED, + "setting gfid on %s failed", real_path); + goto out; + } else { + gfid_set = _gf_true; + } + + op_ret = posix_pstat(this, loc->inode, NULL, real_path, &stbuf, _gf_false); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "lstat on %s failed", real_path); + goto out; + } + + posix_set_ctime(frame, this, real_path, -1, loc->inode, &stbuf); + + op_ret = posix_pstat(this, loc->parent, loc->pargfid, par_path, &postparent, + _gf_false); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "post-operation lstat on parent of %s failed", real_path); + goto out; + } + + posix_set_parent_ctime(frame, this, par_path, -1, loc->parent, &postparent); + + op_ret = 0; + +out: + SET_TO_OLD_FS_ID(); + + if (disk_xattr) + GF_FREE(disk_xattr); + + if (op_ret < 0) { + if (entry_created) + sys_rmdir(real_path); + + if (gfid_set) + posix_gfid_unset(this, xdata); + } + + 
STACK_UNWIND_STRICT(mkdir, frame, op_ret, op_errno, + (loc) ? loc->inode : NULL, &stbuf, &preparent, + &postparent, xdata_rsp); + + if (xdata_rsp) + dict_unref(xdata_rsp); + + return 0; +} + +static int +posix_add_unlink_to_ctx(inode_t *inode, xlator_t *this, char *unlink_path) +{ + uint64_t ctx = GF_UNLINK_FALSE; + int ret = 0; + + if (!unlink_path) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, P_MSG_UNLINK_FAILED, + "Creation of unlink entry failed"); + ret = -1; + goto out; + } + + ctx = GF_UNLINK_TRUE; + ret = posix_inode_ctx_set_unlink_flag(inode, this, ctx); + if (ret < 0) { + goto out; + } + +out: + return ret; +} + +static int32_t +posix_move_gfid_to_unlink(xlator_t *this, uuid_t gfid, loc_t *loc) +{ + char *unlink_path = NULL; + char *gfid_path = NULL; + int ret = -1; + struct posix_private *priv_posix = NULL; + + priv_posix = (struct posix_private *)this->private; + + MAKE_HANDLE_GFID_PATH(gfid_path, this, gfid); + + POSIX_GET_FILE_UNLINK_PATH(priv_posix->base_path, loc->inode->gfid, + unlink_path); + if (!unlink_path) { + ret = -1; + goto out; + } + gf_msg_debug(this->name, 0, "Moving gfid: %s to unlink_path : %s", + gfid_path, unlink_path); + ret = sys_rename(gfid_path, unlink_path); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_UNLINK_FAILED, + "Creation of unlink entry failed for gfid: %s", unlink_path); + goto out; + } + ret = posix_add_unlink_to_ctx(loc->inode, this, unlink_path); + if (ret < 0) + goto out; + +out: + return ret; +} + +static int32_t +posix_unlink_gfid_handle_and_entry(call_frame_t *frame, xlator_t *this, + const char *real_path, struct iatt *stbuf, + int32_t *op_errno, loc_t *loc, + gf_boolean_t get_link_count, + dict_t *rsp_dict) +{ + int32_t ret = 0; + struct iatt prebuf = { + 0, + }; + gf_boolean_t locked = _gf_false; + gf_boolean_t update_ctime = _gf_false; + + /* Unlink the gfid_handle_first */ + if (stbuf && stbuf->ia_nlink == 1) { + LOCK(&loc->inode->lock); + + if (loc->inode->fd_count == 0) { + 
UNLOCK(&loc->inode->lock); + ret = posix_handle_unset(this, stbuf->ia_gfid, NULL); + } else { + UNLOCK(&loc->inode->lock); + ret = posix_move_gfid_to_unlink(this, stbuf->ia_gfid, loc); + } + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_UNLINK_FAILED, + "unlink of gfid handle " + "failed for path:%s with gfid %s", + real_path, uuid_utoa(stbuf->ia_gfid)); + } + } else { + update_ctime = _gf_true; + } + + if (get_link_count) { + LOCK(&loc->inode->lock); + locked = _gf_true; + /* Since this stat is to get link count and not for time + * attributes, intentionally passing inode as NULL + */ + ret = posix_pstat(this, NULL, loc->gfid, real_path, &prebuf, _gf_true); + if (ret) { + UNLOCK(&loc->inode->lock); + locked = _gf_false; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "lstat on %s failed", real_path); + goto err; + } + } + + /* Unlink the actual file */ + ret = sys_unlink(real_path); + + if (locked) { + UNLOCK(&loc->inode->lock); + locked = _gf_false; + } + + if (ret == -1) { + if (op_errno) + *op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_UNLINK_FAILED, + "unlink of %s failed", real_path); + goto err; + } + + if (update_ctime) { + posix_set_ctime(frame, this, NULL, -1, loc->inode, stbuf); + } + + ret = dict_set_uint32(rsp_dict, GET_LINK_COUNT, prebuf.ia_nlink); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_SET_XDATA_FAIL, + "failed to set " GET_LINK_COUNT " for %s", real_path); + + return 0; + +err: + if (locked) { + UNLOCK(&loc->inode->lock); + locked = _gf_false; + } + return -1; +} + +static gf_boolean_t +posix_skip_non_linkto_unlink(dict_t *xdata, loc_t *loc, char *key, + const int keylen, const char *linkto_xattr, + struct iatt *stbuf, const char *real_path) +{ + gf_boolean_t skip_unlink = _gf_false; + gf_boolean_t is_dht_linkto_file = _gf_false; + int unlink_if_linkto = 0; + ssize_t xattr_size = -1; + int op_ret = -1; + + op_ret = dict_get_int32n(xdata, key, keylen, &unlink_if_linkto); + + if 
(!op_ret && unlink_if_linkto) { + is_dht_linkto_file = IS_DHT_LINKFILE_MODE(stbuf); + if (!is_dht_linkto_file) + return _gf_true; + + LOCK(&loc->inode->lock); + + xattr_size = sys_lgetxattr(real_path, linkto_xattr, NULL, 0); + + UNLOCK(&loc->inode->lock); + + if (xattr_size <= 0) + skip_unlink = _gf_true; + + gf_msg("posix", GF_LOG_INFO, 0, P_MSG_XATTR_STATUS, + "linkto_xattr status: %" PRIu32 " for %s", skip_unlink, + real_path); + } + return skip_unlink; +} + +static int32_t +posix_remove_gfid2path_xattr(xlator_t *this, const char *path, uuid_t pgfid, + const char *bname) +{ + char xxh64[GF_XXH64_DIGEST_LENGTH * 2 + 1] = { + 0, + }; + char pgfid_bname[1024] = { + 0, + }; + int ret = 0; + char *key = NULL; + const size_t key_size = GFID2PATH_XATTR_KEY_PREFIX_LENGTH + + GF_XXH64_DIGEST_LENGTH * 2 + 1; + int len; + + len = snprintf(pgfid_bname, sizeof(pgfid_bname), "%s/%s", uuid_utoa(pgfid), + bname); + gf_xxh64_wrapper((unsigned char *)pgfid_bname, len, + GF_XXHSUM64_DEFAULT_SEED, xxh64); + key = alloca(key_size); + snprintf(key, key_size, GFID2PATH_XATTR_KEY_PREFIX "%s", xxh64); + + ret = sys_lremovexattr(path, key); + if (ret == -1) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_PGFID_OP, + "removing gfid2path xattr failed on %s: key = %s", path, key); + } + + return ret; +} + +int32_t +posix_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + char *real_path = NULL; + char *par_path = NULL; + int32_t fd = -1; + int ret = -1; + struct iatt stbuf = { + 0, + }; + struct iatt postbuf = { + 0, + }; + struct posix_private *priv = NULL; + struct iatt preparent = { + 0, + }; + struct iatt postparent = { + 0, + }; + char *pgfid_xattr_key = NULL; + int32_t nlink_samepgfid = 0; + int32_t check_open_fd = 0; + int32_t skip_unlink = 0; + int32_t fdstat_requested = 0; + dict_t *unwind_dict = NULL; + gf_boolean_t get_link_count = _gf_false; + posix_inode_ctx_t *ctx = NULL; + + 
DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(this->private, out); + VALIDATE_OR_GOTO(loc, out); + + SET_FS_ID(frame->root->uid, frame->root->gid); + MAKE_ENTRY_HANDLE(real_path, par_path, this, loc, &stbuf); + if (!real_path || !par_path) { + op_ret = -1; + op_errno = ESTALE; + goto out; + } + + op_ret = posix_pstat(this, loc->parent, loc->pargfid, par_path, &preparent, + _gf_false); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "pre-operation lstat on parent %s failed", par_path); + goto out; + } + + priv = this->private; + + op_ret = dict_get_int32_sizen(xdata, DHT_SKIP_OPEN_FD_UNLINK, + &check_open_fd); + + if (!op_ret && check_open_fd) { + LOCK(&loc->inode->lock); + + if (loc->inode->fd_count) { + skip_unlink = 1; + } + + UNLOCK(&loc->inode->lock); + + gf_msg(this->name, GF_LOG_INFO, 0, P_MSG_KEY_STATUS_INFO, + "open-fd-key-status: %" PRIu32 " for %s", skip_unlink, + real_path); + + if (skip_unlink) { + op_ret = -1; + op_errno = EBUSY; + goto out; + } + } + /* + * If either of the function return true, skip_unlink. + * If first first function itself return true, + * we don't need to call second function, skip unlink. 
+ */ + skip_unlink = posix_skip_non_linkto_unlink( + xdata, loc, DHT_SKIP_NON_LINKTO_UNLINK, + SLEN(DHT_SKIP_NON_LINKTO_UNLINK), DHT_LINKTO, &stbuf, real_path); + if (skip_unlink) { + op_ret = -1; + op_errno = EBUSY; + goto out; + } + + if (IA_ISREG(loc->inode->ia_type) && xdata && + dict_get_sizen(xdata, DHT_IATT_IN_XDATA_KEY)) { + fdstat_requested = 1; + } + + if (fdstat_requested || + (priv->background_unlink && IA_ISREG(loc->inode->ia_type))) { + fd = sys_open(real_path, O_RDONLY, 0); + if (fd == -1) { + op_ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_OPEN_FAILED, + "open of %s failed", real_path); + goto out; + } + } + + if (priv->update_pgfid_nlinks && (stbuf.ia_nlink > 1)) { + MAKE_PGFID_XATTR_KEY(pgfid_xattr_key, PGFID_XATTR_KEY_PREFIX, + loc->pargfid); + op_ret = posix_inode_ctx_get_all(loc->inode, this, &ctx); + if (op_ret < 0) { + op_errno = ENOMEM; + goto out; + } + pthread_mutex_lock(&ctx->pgfid_lock); + { + UNLINK_MODIFY_PGFID_XATTR(real_path, pgfid_xattr_key, + nlink_samepgfid, 0, op_ret, this, unlock); + } + unlock: + pthread_mutex_unlock(&ctx->pgfid_lock); + + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_XATTR_FAILED, + "modification of " + "parent gfid xattr failed (path:%s gfid:%s)", + real_path, uuid_utoa(loc->inode->gfid)); + if (op_errno != ENOATTR) + /* Allow unlink if pgfid xattr is not set. */ + goto out; + } + } + + if (priv->gfid2path && (stbuf.ia_nlink > 1)) { + op_ret = posix_remove_gfid2path_xattr(this, real_path, loc->pargfid, + loc->name); + if (op_ret < 0) { + /* Allow unlink if pgfid xattr is not set. 
*/ + if (errno != ENOATTR) + goto out; + } + } + + unwind_dict = dict_new(); + if (!unwind_dict) { + op_errno = ENOMEM; + op_ret = -1; + goto out; + } + + if (xdata && dict_get_sizen(xdata, GF_GET_FILE_BLOCK_COUNT)) { + ret = dict_set_uint64(unwind_dict, GF_GET_FILE_BLOCK_COUNT, + stbuf.ia_blocks); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_SET_XDATA_FAIL, + "Failed to set %s in rsp dict", GF_GET_FILE_BLOCK_COUNT); + } + + if (xdata && dict_get_sizen(xdata, GET_LINK_COUNT)) + get_link_count = _gf_true; + op_ret = posix_unlink_gfid_handle_and_entry(frame, this, real_path, &stbuf, + &op_errno, loc, get_link_count, + unwind_dict); + if (op_ret == -1) { + goto out; + } + + if (fdstat_requested) { + op_ret = posix_fdstat(this, loc->inode, fd, &postbuf); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_FSTAT_FAILED, + "post operation " + "fstat failed on fd=%d", + fd); + goto out; + } + op_ret = posix_set_iatt_in_dict(unwind_dict, NULL, &postbuf); + if (op_ret == -1) { + op_errno = ENOMEM; + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, P_MSG_DICT_SET_FAILED, + "failed to set fdstat in dict"); + } + } + + op_ret = posix_pstat(this, loc->parent, loc->pargfid, par_path, &postparent, + _gf_false); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "post-operation lstat on parent %s failed", par_path); + goto out; + } + + posix_set_parent_ctime(frame, this, par_path, -1, loc->parent, &postparent); + + unwind_dict = posix_dict_set_nlink(xdata, unwind_dict, stbuf.ia_nlink); + op_ret = 0; +out: + SET_TO_OLD_FS_ID(); + + STACK_UNWIND_STRICT(unlink, frame, op_ret, op_errno, &preparent, + &postparent, unwind_dict); + + if (fd != -1) { + sys_close(fd); + } + + /* unref unwind_dict*/ + if (unwind_dict) { + dict_unref(unwind_dict); + } + + return 0; +} + +int +posix_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata) +{ + int32_t op_ret = -1; + 
int32_t op_errno = 0; + char *real_path = NULL; + char *par_path = NULL; + char *gfid_str = NULL; + struct iatt preparent = { + 0, + }; + struct iatt postparent = { + 0, + }; + struct iatt stbuf = { + 0, + }; + struct posix_private *priv = NULL; + char tmp_path[PATH_MAX] = { + 0, + }; + + DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(loc, out); + + SET_FS_ID(frame->root->uid, frame->root->gid); + + /* The Hidden directory should be for housekeeping purpose and it + should not get deleted from inside process */ + if (__is_root_gfid(loc->pargfid) && + (strcmp(loc->name, GF_HIDDEN_PATH) == 0)) { + gf_msg(this->name, GF_LOG_WARNING, EPERM, P_MSG_RMDIR_NOT_PERMITTED, + "rmdir issued on %s, which" + "is not permitted", + GF_HIDDEN_PATH); + op_errno = EPERM; + op_ret = -1; + goto out; + } + +#ifdef __NetBSD__ + /* Same for NetBSD's .attribute directory */ + if (__is_root_gfid(loc->pargfid) && + (strcmp(loc->name, ".attribute") == 0)) { + gf_msg(this->name, GF_LOG_WARNING, EPERM, P_MSG_RMDIR_NOT_PERMITTED, + "rmdir issued on .attribute, which" + "is not permitted"); + op_errno = EPERM; + op_ret = -1; + goto out; + } +#endif + + priv = this->private; + + MAKE_ENTRY_HANDLE(real_path, par_path, this, loc, &stbuf); + if (!real_path || !par_path) { + op_ret = -1; + op_errno = ESTALE; + goto out; + } + + op_ret = posix_pstat(this, loc->parent, loc->pargfid, par_path, &preparent, + _gf_false); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "pre-operation lstat on parent %s failed", par_path); + goto out; + } + + if (flags) { + op_ret = sys_mkdir(priv->trash_path, 0755); + if (errno != EEXIST && op_ret == -1) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_MKDIR_FAILED, + "mkdir of %s failed", priv->trash_path); + } else { + gfid_str = uuid_utoa(stbuf.ia_gfid); + (void)snprintf(tmp_path, sizeof(tmp_path), "%s/%s", + priv->trash_path, gfid_str); + 
gf_msg_debug(this->name, 0, "Moving %s to %s", real_path, tmp_path); + op_ret = sys_rename(real_path, tmp_path); + } + } else { + op_ret = sys_rmdir(real_path); + } + op_errno = errno; + + if (op_ret == 0) { + if (posix_symlinks_match(this, loc, stbuf.ia_gfid)) + posix_handle_unset(this, stbuf.ia_gfid, NULL); + } + + if (op_errno == EEXIST) + /* Solaris sets errno = EEXIST instead of ENOTEMPTY */ + op_errno = ENOTEMPTY; + + /* No need to log a common error as ENOTEMPTY */ + if (op_ret == -1 && op_errno != ENOTEMPTY) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_RMDIR_FAILED, + "rmdir of %s failed", real_path); + } + + if (op_ret == -1) { + if (op_errno == ENOTEMPTY) { + gf_msg_debug(this->name, 0, "%s on %s failed", + (flags) ? "rename" : "rmdir", real_path); + } else { + gf_msg(this->name, GF_LOG_ERROR, op_errno, + P_MSG_DIR_OPERATION_FAILED, "%s on %s failed", + (flags) ? "rename" : "rmdir", real_path); + } + goto out; + } + + op_ret = posix_pstat(this, loc->parent, loc->pargfid, par_path, &postparent, + _gf_false); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "post-operation lstat on parent of %s failed", par_path); + goto out; + } + + posix_set_parent_ctime(frame, this, par_path, -1, loc->parent, &postparent); + +out: + SET_TO_OLD_FS_ID(); + + STACK_UNWIND_STRICT(rmdir, frame, op_ret, op_errno, &preparent, &postparent, + NULL); + + return 0; +} + +int +posix_symlink(call_frame_t *frame, xlator_t *this, const char *linkname, + loc_t *loc, mode_t umask, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + char *real_path = 0; + char *par_path = 0; + struct iatt stbuf = { + 0, + }; + struct posix_private *priv = NULL; + gid_t gid = 0; + struct iatt preparent = { + 0, + }; + struct iatt postparent = { + 0, + }; + char *pgfid_xattr_key = NULL; + int32_t nlink_samepgfid = 0; + gf_boolean_t entry_created = _gf_false, gfid_set = _gf_false; + uuid_t uuid_req = { + 0, + }; + + 
DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(linkname, out); + VALIDATE_OR_GOTO(loc, out); + + priv = this->private; + VALIDATE_OR_GOTO(priv, out); + GFID_NULL_CHECK_AND_GOTO(frame, this, loc, xdata, op_ret, op_errno, + uuid_req, out); + DISK_SPACE_CHECK_AND_GOTO(frame, priv, xdata, op_ret, op_errno, out); + + MAKE_ENTRY_HANDLE(real_path, par_path, this, loc, &stbuf); + + gid = frame->root->gid; + if (!real_path || !par_path) { + op_ret = -1; + op_errno = ESTALE; + goto out; + } + + SET_FS_ID(frame->root->uid, gid); + + op_ret = posix_pstat(this, loc->parent, loc->pargfid, par_path, &preparent, + _gf_false); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "pre-operation lstat on parent %s failed", par_path); + goto out; + } + + if (preparent.ia_prot.sgid) { + gid = preparent.ia_gid; + } + + op_ret = sys_symlink(linkname, real_path); + + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_SYMLINK_FAILED, + "symlink of %s --> %s failed", real_path, linkname); + goto out; + } + + entry_created = _gf_true; + + posix_set_ctime(frame, this, real_path, -1, loc->inode, &stbuf); + +#ifndef HAVE_SET_FSID + op_ret = sys_lchown(real_path, frame->root->uid, gid); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LCHOWN_FAILED, + "lchown failed on %s", real_path); + goto out; + } +#endif + op_ret = posix_acl_xattr_set(this, real_path, xdata); + if (op_ret) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_ACL_FAILED, + "setting ACLs on %s failed", real_path); + } + + if (priv->update_pgfid_nlinks) { + MAKE_PGFID_XATTR_KEY(pgfid_xattr_key, PGFID_XATTR_KEY_PREFIX, + loc->pargfid); + nlink_samepgfid = 1; + SET_PGFID_XATTR(real_path, pgfid_xattr_key, nlink_samepgfid, + XATTR_CREATE, op_ret, this, ignore); + } + + if (priv->gfid2path) { + posix_set_gfid2path_xattr(this, real_path, 
loc->pargfid, loc->name); + } + +ignore: + op_ret = posix_entry_create_xattr_set(this, loc, real_path, xdata); + if (op_ret) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_XATTR_FAILED, + "setting xattrs on %s failed ", real_path); + } + + op_ret = posix_gfid_set(this, real_path, loc, xdata, frame->root->pid, + &op_errno); + if (op_ret) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_GFID_FAILED, + "setting gfid on %s failed", real_path); + goto out; + } else { + gfid_set = _gf_true; + } + + op_ret = posix_pstat(this, loc->inode, NULL, real_path, &stbuf, _gf_false); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "lstat failed on %s", real_path); + goto out; + } + + op_ret = posix_pstat(this, loc->parent, loc->pargfid, par_path, &postparent, + _gf_false); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "post-operation lstat on parent %s failed", par_path); + goto out; + } + + posix_set_parent_ctime(frame, this, par_path, -1, loc->parent, &postparent); + + op_ret = 0; + +out: + SET_TO_OLD_FS_ID(); + + if (op_ret < 0) { + if (entry_created) + sys_unlink(real_path); + + if (gfid_set) + posix_gfid_unset(this, xdata); + } + + STACK_UNWIND_STRICT(symlink, frame, op_ret, op_errno, + (loc) ? 
loc->inode : NULL, &stbuf, &preparent, + &postparent, NULL); + + return 0; +} + +int +posix_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + char *real_oldpath = NULL; + char *real_newpath = NULL; + char *par_oldpath = NULL; + char *par_newpath = NULL; + struct iatt stbuf = { + 0, + }; + struct posix_private *priv = NULL; + char was_present = 1; + struct iatt preoldparent = { + 0, + }; + struct iatt postoldparent = { + 0, + }; + struct iatt prenewparent = { + 0, + }; + struct iatt postnewparent = { + 0, + }; + char olddirid[64]; + char newdirid[64]; + uuid_t victim = {0}; + int was_dir = 0; + int nlink = 0; + char *pgfid_xattr_key = NULL; + int32_t nlink_samepgfid = 0; + char *gfid_path = NULL; + dict_t *unwind_dict = NULL; + gf_boolean_t locked = _gf_false; + gf_boolean_t get_link_count = _gf_false; + posix_inode_ctx_t *ctx_old = NULL; + posix_inode_ctx_t *ctx_new = NULL; + + DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(oldloc, out); + VALIDATE_OR_GOTO(newloc, out); + + priv = this->private; + VALIDATE_OR_GOTO(priv, out); + + SET_FS_ID(frame->root->uid, frame->root->gid); + MAKE_ENTRY_HANDLE(real_oldpath, par_oldpath, this, oldloc, NULL); + if (!real_oldpath || !par_oldpath) { + op_ret = -1; + op_errno = ESTALE; + goto out; + } + + MAKE_ENTRY_HANDLE(real_newpath, par_newpath, this, newloc, &stbuf); + if (!real_newpath || !par_newpath) { + op_ret = -1; + op_errno = ESTALE; + goto out; + } + + unwind_dict = dict_new(); + if (!unwind_dict) { + op_ret = -1; + op_errno = ENOMEM; + goto out; + } + + op_ret = posix_pstat(this, oldloc->parent, oldloc->pargfid, par_oldpath, + &preoldparent, _gf_false); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "pre-operation lstat on parent %s failed", par_oldpath); + goto out; + } + + op_ret = posix_pstat(this, 
newloc->parent, newloc->pargfid, par_newpath, + &prenewparent, _gf_false); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "pre-operation lstat on parent of %s failed", par_newpath); + goto out; + } + + op_ret = posix_pstat(this, newloc->inode, NULL, real_newpath, &stbuf, + _gf_false); + if ((op_ret == -1) && (errno == ENOENT)) { + was_present = 0; + } else { + gf_uuid_copy(victim, stbuf.ia_gfid); + if (IA_ISDIR(stbuf.ia_type)) + was_dir = 1; + nlink = stbuf.ia_nlink; + } + + if (was_present && IA_ISDIR(stbuf.ia_type) && !newloc->inode) { + gf_msg(this->name, GF_LOG_WARNING, EEXIST, P_MSG_DIR_FOUND, + "found directory at %s while expecting ENOENT", real_newpath); + op_ret = -1; + op_errno = EEXIST; + goto out; + } + + if (was_present && IA_ISDIR(stbuf.ia_type) && + gf_uuid_compare(newloc->inode->gfid, stbuf.ia_gfid)) { + gf_msg(this->name, GF_LOG_WARNING, EEXIST, P_MSG_DIR_FOUND, + "found directory %s at %s while renaming %s", + uuid_utoa_r(newloc->inode->gfid, olddirid), real_newpath, + uuid_utoa_r(stbuf.ia_gfid, newdirid)); + op_ret = -1; + op_errno = EEXIST; + goto out; + } + + op_ret = posix_inode_ctx_get_all(oldloc->inode, this, &ctx_old); + if (op_ret < 0) { + op_ret = -1; + op_errno = ENOMEM; + goto out; + } + + if (newloc->inode) { + op_ret = posix_inode_ctx_get_all(newloc->inode, this, &ctx_new); + if (op_ret < 0) { + op_ret = -1; + op_errno = ENOMEM; + goto out; + } + } + + if (IA_ISDIR(oldloc->inode->ia_type)) + posix_handle_unset(this, oldloc->inode->gfid, NULL); + + pthread_mutex_lock(&ctx_old->pgfid_lock); + { + if (!IA_ISDIR(oldloc->inode->ia_type) && priv->update_pgfid_nlinks) { + MAKE_PGFID_XATTR_KEY(pgfid_xattr_key, PGFID_XATTR_KEY_PREFIX, + oldloc->pargfid); + UNLINK_MODIFY_PGFID_XATTR(real_oldpath, pgfid_xattr_key, + nlink_samepgfid, 0, op_ret, this, unlock); + } + + if ((xdata) && (dict_get(xdata, GET_LINK_COUNT)) && (real_newpath) && + (was_present) && ctx_new) { + 
pthread_mutex_lock(&ctx_new->pgfid_lock); + locked = _gf_true; + get_link_count = _gf_true; + op_ret = posix_pstat(this, newloc->inode, newloc->gfid, + real_newpath, &stbuf, _gf_false); + if ((op_ret == -1) && (errno != ENOENT)) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "lstat on %s failed", real_newpath); + goto unlock; + } + } + + op_ret = sys_rename(real_oldpath, real_newpath); + if (op_ret == -1) { + op_errno = errno; + if (op_errno == ENOTEMPTY) { + gf_msg_debug(this->name, 0, + "rename of %s to" + " %s failed: %s", + real_oldpath, real_newpath, strerror(op_errno)); + } else { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_RENAME_FAILED, + "rename of %s to %s failed", real_oldpath, real_newpath); + } + + if (priv->update_pgfid_nlinks && + !IA_ISDIR(oldloc->inode->ia_type)) { + LINK_MODIFY_PGFID_XATTR(real_oldpath, pgfid_xattr_key, + nlink_samepgfid, 0, op_ret, this, + unlock); + } + + goto unlock; + } + + if (locked) { + pthread_mutex_unlock(&ctx_new->pgfid_lock); + locked = _gf_false; + } + + if ((get_link_count) && + (dict_set_uint32(unwind_dict, GET_LINK_COUNT, stbuf.ia_nlink))) + gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_SET_XDATA_FAIL, + "failed to set " GET_LINK_COUNT " for %s", real_newpath); + + if (!IA_ISDIR(oldloc->inode->ia_type) && priv->update_pgfid_nlinks) { + MAKE_PGFID_XATTR_KEY(pgfid_xattr_key, PGFID_XATTR_KEY_PREFIX, + newloc->pargfid); + LINK_MODIFY_PGFID_XATTR(real_newpath, pgfid_xattr_key, + nlink_samepgfid, 0, op_ret, this, unlock); + } + + if (!IA_ISDIR(oldloc->inode->ia_type) && priv->gfid2path) { + MAKE_HANDLE_ABSPATH(gfid_path, this, oldloc->inode->gfid); + + posix_remove_gfid2path_xattr(this, gfid_path, oldloc->pargfid, + oldloc->name); + posix_set_gfid2path_xattr(this, gfid_path, newloc->pargfid, + newloc->name); + } + } + +unlock: + if (locked) { + pthread_mutex_unlock(&ctx_new->pgfid_lock); + locked = _gf_false; + } + pthread_mutex_unlock(&ctx_old->pgfid_lock); + + if (op_ret < 0) { + 
gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_XATTR_FAILED, + "modification of " + "parent gfid xattr failed (gfid:%s)", + uuid_utoa(oldloc->inode->gfid)); + goto out; + } + + if (was_dir) + posix_handle_unset(this, victim, NULL); + + if (was_present && !was_dir && nlink == 1) + posix_handle_unset(this, victim, NULL); + + if (IA_ISDIR(oldloc->inode->ia_type)) { + posix_handle_soft(this, real_newpath, newloc, oldloc->inode->gfid, + NULL); + } + + op_ret = posix_pstat(this, newloc->inode, NULL, real_newpath, &stbuf, + _gf_false); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "lstat on %s failed", real_newpath); + goto out; + } + + /* Since the same inode is later used and dst inode is not present, + * update ctime on source inode. It can't use old path because it + * doesn't exist and xattr has to be stored on disk */ + posix_set_ctime(frame, this, real_newpath, -1, oldloc->inode, &stbuf); + + op_ret = posix_pstat(this, oldloc->parent, oldloc->pargfid, par_oldpath, + &postoldparent, _gf_false); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "post-operation lstat on parent %s failed", par_oldpath); + goto out; + } + + posix_set_parent_ctime(frame, this, par_oldpath, -1, oldloc->parent, + &postoldparent); + + op_ret = posix_pstat(this, newloc->parent, newloc->pargfid, par_newpath, + &postnewparent, _gf_false); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "post-operation lstat on parent %s failed", par_newpath); + goto out; + } + + posix_set_parent_ctime(frame, this, par_newpath, -1, newloc->parent, + &postnewparent); + + if (was_present) + unwind_dict = posix_dict_set_nlink(xdata, unwind_dict, nlink); + op_ret = 0; +out: + + SET_TO_OLD_FS_ID(); + + STACK_UNWIND_STRICT(rename, frame, op_ret, op_errno, &stbuf, &preoldparent, + &postoldparent, &prenewparent, &postnewparent, + unwind_dict); + + if 
(unwind_dict) + dict_unref(unwind_dict); + + return 0; +} + +int +posix_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + char *real_oldpath = 0; + char *real_newpath = 0; + char *par_newpath = 0; + struct iatt stbuf = { + 0, + }; + struct posix_private *priv = NULL; + struct iatt preparent = { + 0, + }; + struct iatt postparent = { + 0, + }; + int32_t nlink_samepgfid = 0; + char *pgfid_xattr_key = NULL; + gf_boolean_t entry_created = _gf_false; + posix_inode_ctx_t *ctx = NULL; + + DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(oldloc, out); + VALIDATE_OR_GOTO(newloc, out); + + priv = this->private; + VALIDATE_OR_GOTO(priv, out); + DISK_SPACE_CHECK_AND_GOTO(frame, priv, xdata, op_ret, op_errno, out); + + SET_FS_ID(frame->root->uid, frame->root->gid); + MAKE_INODE_HANDLE(real_oldpath, this, oldloc, &stbuf); + if (!real_oldpath) { + op_errno = errno; + goto out; + } + + if (priv->max_hardlinks && stbuf.ia_nlink >= priv->max_hardlinks) { + op_ret = -1; + op_errno = EMLINK; + gf_log(this->name, GF_LOG_ERROR, + "hardlink failed: %s exceeds max link count (%u/%u).", + real_oldpath, stbuf.ia_nlink, priv->max_hardlinks); + goto out; + } + + MAKE_ENTRY_HANDLE(real_newpath, par_newpath, this, newloc, &stbuf); + if (!real_newpath || !par_newpath) { + op_ret = -1; + op_errno = ESTALE; + goto out; + } + + op_ret = posix_pstat(this, newloc->parent, newloc->pargfid, par_newpath, + &preparent, _gf_false); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "lstat failed: %s", par_newpath); + goto out; + } + + op_ret = sys_link(real_oldpath, real_newpath); + + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LINK_FAILED, + "link %s to %s failed", real_oldpath, real_newpath); + goto out; + } + + entry_created = _gf_true; + + op_ret = 
posix_pstat(this, newloc->inode, NULL, real_newpath, &stbuf, + _gf_false); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "lstat on %s failed", real_newpath); + goto out; + } + + posix_set_ctime(frame, this, real_newpath, -1, newloc->inode, &stbuf); + + op_ret = posix_pstat(this, newloc->parent, newloc->pargfid, par_newpath, + &postparent, _gf_false); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "lstat failed: %s", par_newpath); + goto out; + } + + posix_set_parent_ctime(frame, this, par_newpath, -1, newloc->parent, + &postparent); + + if (priv->update_pgfid_nlinks) { + MAKE_PGFID_XATTR_KEY(pgfid_xattr_key, PGFID_XATTR_KEY_PREFIX, + newloc->pargfid); + + op_ret = posix_inode_ctx_get_all(newloc->inode, this, &ctx); + if (op_ret < 0) { + op_errno = ENOMEM; + goto out; + } + + pthread_mutex_lock(&ctx->pgfid_lock); + { + LINK_MODIFY_PGFID_XATTR(real_newpath, pgfid_xattr_key, + nlink_samepgfid, 0, op_ret, this, unlock); + } + unlock: + pthread_mutex_unlock(&ctx->pgfid_lock); + + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_XATTR_FAILED, + "modification of " + "parent gfid xattr failed (path:%s gfid:%s)", + real_newpath, uuid_utoa(newloc->inode->gfid)); + goto out; + } + } + + if (priv->gfid2path) { + if (stbuf.ia_nlink <= MAX_GFID2PATH_LINK_SUP) { + op_ret = posix_set_gfid2path_xattr(this, real_newpath, + newloc->pargfid, newloc->name); + if (op_ret) { + op_errno = errno; + goto out; + } + } else { + gf_msg(this->name, GF_LOG_INFO, 0, P_MSG_XATTR_NOTSUP, + "Link count exceeded. " + "gfid2path xattr not set (path:%s gfid:%s)", + real_newpath, uuid_utoa(newloc->inode->gfid)); + } + } + + op_ret = 0; + +out: + SET_TO_OLD_FS_ID(); + + STACK_UNWIND_STRICT(link, frame, op_ret, op_errno, + (oldloc) ? 
oldloc->inode : NULL, &stbuf, &preparent, + &postparent, NULL); + + if (op_ret < 0) { + if (entry_created) + sys_unlink(real_newpath); + } + + return 0; +} + +int +posix_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + int32_t _fd = -1; + int _flags = 0; + char *real_path = NULL; + char *par_path = NULL; + struct iatt stbuf = { + 0, + }; + struct posix_fd *pfd = NULL; + struct posix_private *priv = NULL; + char was_present = 1; + + gid_t gid = 0; + struct iatt preparent = { + 0, + }; + struct iatt postparent = { + 0, + }; + + int nlink_samepgfid = 0; + char *pgfid_xattr_key = NULL; + gf_boolean_t entry_created = _gf_false, gfid_set = _gf_false; + mode_t mode_bit = 0; + uuid_t uuid_req = { + 0, + }; + + dict_t *xdata_rsp = dict_ref(xdata); + + DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(this->private, out); + VALIDATE_OR_GOTO(loc, out); + VALIDATE_OR_GOTO(fd, out); + + priv = this->private; + VALIDATE_OR_GOTO(priv, out); + GFID_NULL_CHECK_AND_GOTO(frame, this, loc, xdata, op_ret, op_errno, + uuid_req, out); + DISK_SPACE_CHECK_AND_GOTO(frame, priv, xdata, op_ret, op_errno, out); + + MAKE_ENTRY_HANDLE(real_path, par_path, this, loc, &stbuf); + + gid = frame->root->gid; + + SET_FS_ID(frame->root->uid, gid); + if (!real_path || !par_path) { + op_ret = -1; + op_errno = ESTALE; + goto out; + } + + op_ret = posix_pstat(this, loc->parent, loc->pargfid, par_path, &preparent, + _gf_false); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "pre-operation lstat on parent %s failed", par_path); + goto out; + } + + if (preparent.ia_prot.sgid) { + gid = preparent.ia_gid; + } + + if (!flags) { + _flags = O_CREAT | O_RDWR | O_EXCL; + } else { + _flags = flags | O_CREAT; + } + + op_ret = posix_pstat(this, loc->inode, NULL, real_path, &stbuf, 
_gf_false); + if ((op_ret == -1) && (errno == ENOENT)) { + was_present = 0; + } + + if (!was_present) { + if (posix_is_layout_stale(xdata, par_path, this)) { + op_ret = -1; + op_errno = EIO; + if (!xdata_rsp) { + xdata_rsp = dict_new(); + if (!xdata_rsp) { + op_errno = ENOMEM; + goto out; + } + } + + if (dict_set_int32_sizen(xdata_rsp, GF_PREOP_CHECK_FAILED, 1) == + -1) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_DICT_SET_FAILED, + "setting key %s in dict failed", GF_PREOP_CHECK_FAILED); + } + + goto out; + } + } + + if (priv->o_direct) + _flags |= O_DIRECT; + + mode_bit = (priv->create_mask & mode) | priv->force_create_mode; + mode = posix_override_umask(mode, mode_bit); + _fd = sys_open(real_path, _flags, mode); + + if (_fd == -1) { + op_errno = errno; + op_ret = -1; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_OPEN_FAILED, + "open on %s failed", real_path); + goto out; + } + + if ((_flags & O_CREAT) && (_flags & O_EXCL)) { + entry_created = _gf_true; + } + + if (was_present) + goto fill_stat; + +#ifndef HAVE_SET_FSID + op_ret = sys_chown(real_path, frame->root->uid, gid); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_CHOWN_FAILED, + "chown on %s failed", real_path); + } +#endif + op_ret = posix_acl_xattr_set(this, real_path, xdata); + if (op_ret) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_ACL_FAILED, + "setting ACLs on %s failed", real_path); + } + + if (priv->update_pgfid_nlinks) { + MAKE_PGFID_XATTR_KEY(pgfid_xattr_key, PGFID_XATTR_KEY_PREFIX, + loc->pargfid); + nlink_samepgfid = 1; + SET_PGFID_XATTR(real_path, pgfid_xattr_key, nlink_samepgfid, + XATTR_CREATE, op_ret, this, ignore); + } + + if (priv->gfid2path) { + posix_set_gfid2path_xattr(this, real_path, loc->pargfid, loc->name); + } +ignore: + op_ret = posix_entry_create_xattr_set(this, loc, real_path, xdata); + if (op_ret) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_XATTR_FAILED, + "setting xattrs on %s failed ", real_path); + } + 
+fill_stat: + op_ret = posix_gfid_set(this, real_path, loc, xdata, frame->root->pid, + &op_errno); + if (op_ret) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_GFID_FAILED, + "setting gfid on %s failed", real_path); + goto out; + } else { + gfid_set = _gf_true; + } + + op_ret = posix_fdstat(this, loc->inode, _fd, &stbuf); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_FSTAT_FAILED, + "fstat on %d failed", _fd); + goto out; + } + + posix_set_ctime(frame, this, real_path, -1, loc->inode, &stbuf); + + op_ret = posix_pstat(this, loc->parent, loc->pargfid, par_path, &postparent, + _gf_false); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "post-operation lstat on parent %s failed", par_path); + goto out; + } + + posix_set_parent_ctime(frame, this, par_path, -1, loc->parent, &postparent); + + op_ret = -1; + pfd = GF_CALLOC(1, sizeof(*pfd), gf_posix_mt_posix_fd); + if (!pfd) { + op_errno = errno; + goto out; + } + + pfd->flags = flags; + pfd->fd = _fd; + + op_ret = fd_ctx_set(fd, this, (uint64_t)(long)pfd); + if (op_ret) + gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_FD_PATH_SETTING_FAILED, + "failed to set the fd context path=%s fd=%p", real_path, fd); + + op_ret = 0; + +out: + SET_TO_OLD_FS_ID(); + + if (op_ret < 0) { + if (_fd != -1) + sys_close(_fd); + + if (entry_created) + sys_unlink(real_path); + + if (gfid_set) + posix_gfid_unset(this, xdata); + } + + STACK_UNWIND_STRICT(create, frame, op_ret, op_errno, fd, + (loc) ? 
loc->inode : NULL, &stbuf, &preparent, + &postparent, xdata_rsp); + + if (xdata_rsp) + dict_unref(xdata_rsp); + + return 0; +} + +/* TODO: Ensure atomocity of put, and rollback in case of failure + * One of the ways, is to perform put in the hidden directory + * and rename it to the specified location, if the put was successful + */ +int32_t +posix_put(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, uint32_t flags, struct iovec *vector, int32_t count, + off_t offset, struct iobref *iobref, dict_t *xattr, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + fd_t *fd = NULL; + char *real_path = NULL; + char *par_path = NULL; + struct iatt stbuf = { + 0, + }; + struct iatt preparent = { + 0, + }; + struct iatt postparent = { + 0, + }; + + MAKE_ENTRY_HANDLE(real_path, par_path, this, loc, &stbuf); + + if (!real_path || !par_path) { + op_ret = -1; + op_errno = ESTALE; + goto out; + } + + op_ret = posix_pstat(this, loc->parent, loc->pargfid, par_path, &preparent, + _gf_false); + if (op_ret < 0) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "pre-operation lstat on parent %s failed", par_path); + goto out; + } + fd = fd_create(loc->inode, getpid()); + if (!fd) { + op_ret = -1; + op_errno = ENOMEM; + goto out; + } + fd->flags = flags; + + /* No xlators are expected below posix, but we cannot still call + * sys_create() directly here, as posix_create does many other things like + * chmod, setxattr etc. along with sys_create(). But we cannot also directly + * call posix_create() as it calls STACK_UNWIND. 
Hence using syncop() + */ + op_ret = syncop_create(this, loc, flags, mode, fd, &stbuf, xdata, NULL); + if (op_ret < 0) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_CREATE_FAILED, + "create of %s failed", loc->path); + goto out; + } + + op_ret = posix_pstat(this, loc->parent, loc->pargfid, par_path, &postparent, + _gf_false); + if (op_ret < 0) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "post-operation lstat on parent %s failed", par_path); + goto out; + } + + op_ret = syncop_writev(this, fd, vector, count, offset, iobref, flags, NULL, + NULL, xdata, NULL); + if (op_ret < 0) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_WRITE_FAILED, + "write on file %s failed", loc->path); + goto out; + } + + op_ret = syncop_fsetxattr(this, fd, xattr, flags, xdata, NULL); + if (op_ret < 0) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_XATTR_FAILED, + "setxattr on file %s failed", loc->path); + goto out; + } + + op_ret = syncop_flush(this, fd, xdata, NULL); + if (op_ret < 0) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_CLOSE_FAILED, + "setxattr on file %s failed", loc->path); + goto out; + } + + op_ret = posix_pstat(this, loc->inode, loc->gfid, real_path, &stbuf, + _gf_false); + if (op_ret < 0) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "post-operation lstat on %s failed", real_path); + goto out; + } +out: + STACK_UNWIND_STRICT(put, frame, op_ret, op_errno, loc->inode, &stbuf, + &preparent, &postparent, NULL); + + return 0; +} diff --git a/xlators/storage/posix/src/posix-gfid-path.c b/xlators/storage/posix/src/posix-gfid-path.c new file mode 100644 index 00000000000..1b38e9b0479 --- /dev/null +++ b/xlators/storage/posix/src/posix-gfid-path.c @@ -0,0 +1,243 @@ +/* + Copyright (c) 2017 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <stdint.h> + +#include <glusterfs/compat-errno.h> +#include <glusterfs/syscall.h> +#include <glusterfs/logging.h> +#include "posix-messages.h" +#include "posix-mem-types.h" +#include "posix-gfid-path.h" +#include "posix.h" + +gf_boolean_t +posix_is_gfid2path_xattr(const char *name) +{ + if (name && strncmp(GFID2PATH_XATTR_KEY_PREFIX, name, + GFID2PATH_XATTR_KEY_PREFIX_LENGTH) == 0) + return _gf_true; + + return _gf_false; +} + +static int gf_posix_xattr_enotsup_log; + +int32_t +posix_get_gfid2path(xlator_t *this, inode_t *inode, const char *real_path, + int *op_errno, dict_t *dict) +{ + int ret = 0; + char *path = NULL; + ssize_t size = 0; + char *list = NULL; + int32_t list_offset = 0; + int32_t i = 0; + int32_t j = 0; + char *paths[MAX_GFID2PATH_LINK_SUP] = { + NULL, + }; + char *value = NULL; + size_t remaining_size = 0; + size_t bytes = 0; + char keybuffer[4096] = { + 0, + }; + + uuid_t pargfid = { + 0, + }; + gf_boolean_t have_val = _gf_false; + struct posix_private *priv = NULL; + char pargfid_str[UUID_CANONICAL_FORM_LEN + 1] = { + 0, + }; + gf_boolean_t found = _gf_false; + int len; + + priv = this->private; + + if (IA_ISDIR(inode->ia_type)) { + ret = posix_resolve_dirgfid_to_path(inode->gfid, priv->base_path, NULL, + &path); + if (ret < 0) { + ret = -1; + goto err; + } + ret = dict_set_dynstr(dict, GFID2PATH_VIRT_XATTR_KEY, path); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, -ret, P_MSG_DICT_SET_FAILED, + "could not set " + "value for key (%s)", + GFID2PATH_VIRT_XATTR_KEY); + goto err; + } + found = _gf_true; + } else { + char value_buf[8192] = { + 0, + }; + char xattr_value[8192] = { + 0, + }; + have_val = _gf_false; + size = sys_llistxattr(real_path, value_buf, 
sizeof(value_buf) - 1); + if (size > 0) { + have_val = _gf_true; + } else { + if (errno == ERANGE) { + gf_msg(this->name, GF_LOG_DEBUG, errno, P_MSG_XATTR_FAILED, + "listxattr failed due to overflow of" + " buffer on %s ", + real_path); + size = sys_llistxattr(real_path, NULL, 0); + } + if (size == -1) { + *op_errno = errno; + if ((errno == ENOTSUP) || (errno == ENOSYS)) { + GF_LOG_OCCASIONALLY(gf_posix_xattr_enotsup_log, this->name, + GF_LOG_WARNING, + "Extended attributes not " + "supported (try remounting" + " brick with 'user_xattr' " + "flag)"); + } else { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_XATTR_FAILED, + "listxattr failed on %s", real_path); + } + goto err; + } + if (size == 0) + goto done; + } + list = alloca(size); + if (!list) { + *op_errno = errno; + goto err; + } + if (have_val) { + memcpy(list, value_buf, size); + } else { + size = sys_llistxattr(real_path, list, size); + if (size < 0) { + ret = -1; + *op_errno = errno; + goto err; + } + } + remaining_size = size; + list_offset = 0; + while (remaining_size > 0) { + len = snprintf(keybuffer, sizeof(keybuffer), "%s", + list + list_offset); + + if (!posix_is_gfid2path_xattr(keybuffer)) { + goto ignore; + } + + found = _gf_true; + size = sys_lgetxattr(real_path, keybuffer, xattr_value, + sizeof(xattr_value) - 1); + if (size == -1) { + ret = -1; + *op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_XATTR_FAILED, + "getxattr failed on" + " %s: key = %s ", + real_path, keybuffer); + break; + } + + /* Parse pargfid from xattr value*/ + strncpy(pargfid_str, xattr_value, 36); + pargfid_str[36] = '\0'; + gf_uuid_parse(pargfid_str, pargfid); + + /* Convert pargfid to path */ + ret = posix_resolve_dirgfid_to_path(pargfid, priv->base_path, + &xattr_value[37], &paths[i]); + i++; + + ignore: + remaining_size -= (len + 1); + list_offset += (len + 1); + } /* while (remaining_size > 0) */ + + /* gfid2path xattr is absent in the list of xattrs */ + if (!found) { + ret = -1; + /* + * ENODATA 
because xattr is not present in the + * list of xattrs. Thus the consumer should + * face error instead of a success and a empty + * string in the dict for the key. + */ + *op_errno = ENODATA; + goto err; + } + + /* + * gfid2path xattr is found in list of xattrs, but getxattr + * on the 1st gfid2path xattr itself failed and the while + * loop above broke. So there is nothing in the value. So + * it would be better not to send "" as the value for any + * key, as it is not true. + */ + if (found && !i) + goto err; /* both errno and ret are set before beak */ + + /* Calculate memory to be allocated */ + for (j = 0; j < i; j++) { + bytes += strlen(paths[j]); + if (j < i - 1) + bytes += strlen(priv->gfid2path_sep); + } + value = GF_CALLOC(bytes + 1, sizeof(char), gf_posix_mt_char); + if (!value) { + ret = -1; + *op_errno = errno; + goto err; + } + + for (j = 0; j < i; j++) { + strcat(value, paths[j]); + if (j != i - 1) + strcat(value, priv->gfid2path_sep); + } + value[bytes] = '\0'; + + ret = dict_set_dynptr(dict, GFID2PATH_VIRT_XATTR_KEY, value, bytes); + if (ret < 0) { + *op_errno = -ret; + gf_msg(this->name, GF_LOG_ERROR, *op_errno, P_MSG_DICT_SET_FAILED, + "dict set operation " + "on %s for the key %s failed.", + real_path, GFID2PATH_VIRT_XATTR_KEY); + GF_FREE(value); + goto err; + } + } + +done: + for (j = 0; j < i; j++) { + if (paths[j]) + GF_FREE(paths[j]); + } + ret = 0; + return ret; +err: + if (path) + GF_FREE(path); + for (j = 0; j < i; j++) { + if (paths[j]) + GF_FREE(paths[j]); + } + return ret; +} diff --git a/xlators/storage/posix/src/posix-gfid-path.h b/xlators/storage/posix/src/posix-gfid-path.h new file mode 100644 index 00000000000..79096e5893f --- /dev/null +++ b/xlators/storage/posix/src/posix-gfid-path.h @@ -0,0 +1,28 @@ +/* + Copyright (c) 2017 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _POSIX_GFID_PATH_H +#define _POSIX_GFID_PATH_H + +#include <glusterfs/compat-errno.h> + +#include <stdint.h> // for int32_t +#include "glusterfs/dict.h" // for dict_t +#include "glusterfs/glusterfs.h" // for gf_boolean_t +#include "glusterfs/inode.h" // for inode_t +#include "uuid.h" // for uuid_t +#define MAX_GFID2PATH_LINK_SUP 500 + +gf_boolean_t +posix_is_gfid2path_xattr(const char *name); +int32_t +posix_get_gfid2path(xlator_t *this, inode_t *inode, const char *real_path, + int *op_errno, dict_t *dict); +#endif /* _POSIX_GFID_PATH_H */ diff --git a/xlators/storage/posix/src/posix-handle.c b/xlators/storage/posix/src/posix-handle.c new file mode 100644 index 00000000000..410b38da8cb --- /dev/null +++ b/xlators/storage/posix/src/posix-handle.c @@ -0,0 +1,1020 @@ +/* + Copyright (c) 2011-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ +#include <errno.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> +#include <libgen.h> +#ifdef GF_LINUX_HOST_OS +#include <alloca.h> +#endif + +#include "posix-handle.h" +#include "posix.h" +#include <glusterfs/syscall.h> +#include "posix-messages.h" +#include "posix-metadata.h" + +#include <glusterfs/compat-errno.h> + +int +posix_handle_mkdir_hashes(xlator_t *this, int dfd, uuid_t gfid); + +inode_t * +posix_resolve(xlator_t *this, inode_table_t *itable, inode_t *parent, + char *bname, struct iatt *iabuf) +{ + inode_t *inode = NULL; + int ret = -1; + + ret = posix_istat(this, NULL, parent->gfid, bname, iabuf); + if (ret < 0) { + gf_log(this->name, GF_LOG_WARNING, + "gfid: %s, bname: %s " + "failed", + uuid_utoa(parent->gfid), bname); + goto out; + } + + if (__is_root_gfid(iabuf->ia_gfid) && !strcmp(bname, "/")) { + inode = itable->root; + } else { + inode = inode_find(itable, iabuf->ia_gfid); + if (inode == NULL) { + inode = inode_new(itable); + gf_uuid_copy(inode->gfid, iabuf->ia_gfid); + } + } + + /* posix_istat wouldn't have fetched posix_mdata_t i.e., + * time attributes as inode is passed as NULL, hence get + * here once you got the inode + */ + ret = posix_get_mdata_xattr(this, NULL, -1, inode, iabuf); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_GETMDATA_FAILED, + "posix get mdata failed on gfid:%s", uuid_utoa(inode->gfid)); + goto out; + } + + /* Linking an inode here, can cause a race in posix_acl. + Parent inode gets linked here, but before + it reaches posix_acl_readdirp_cbk, create/lookup can + come on a leaf-inode, as parent-inode-ctx not yet updated + in posix_acl_readdirp_cbk, create and lookup can fail + with EACCESS. 
So do the inode linking in the quota xlator + + if (__is_root_gfid (iabuf->ia_gfid) && !strcmp (bname, "/")) + linked_inode = itable->root; + else + linked_inode = inode_link (inode, parent, bname, iabuf); + + inode_unref (inode);*/ + +out: + return inode; +} + +int +posix_make_ancestral_node(const char *priv_base_path, char *path, int pathsize, + gf_dirent_t *head, char *dir_name, struct iatt *iabuf, + inode_t *inode, int type, dict_t *xdata) +{ + gf_dirent_t *entry = NULL; + char real_path[PATH_MAX + 1] = + { + 0, + }, + len = 0; + loc_t loc = { + 0, + }; + int ret = -1; + + len = strlen(path) + strlen(dir_name) + 1; + if (len > pathsize) { + goto out; + } + + strcat(path, dir_name); + if (*dir_name != '/') + strcat(path, "/"); + + if (type & POSIX_ANCESTRY_DENTRY) { + entry = gf_dirent_for_name(dir_name); + if (!entry) + goto out; + + entry->d_stat = *iabuf; + entry->inode = inode_ref(inode); + + list_add_tail(&entry->list, &head->list); + snprintf(real_path, sizeof(real_path), "%s/%s", priv_base_path, path); + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); + + entry->dict = posix_xattr_fill(THIS, real_path, &loc, NULL, -1, xdata, + iabuf); + loc_wipe(&loc); + } + + ret = 0; + +out: + return ret; +} + +int +posix_make_ancestryfromgfid(xlator_t *this, char *path, int pathsize, + gf_dirent_t *head, int type, uuid_t gfid, + const size_t handle_size, + const char *priv_base_path, inode_table_t *itable, + inode_t **parent, dict_t *xdata, int32_t *op_errno) +{ + char *linkname = NULL; /* "../../<gfid[0]>/<gfid[1]/" + "<gfidstr>/<NAME_MAX>" */ + char *dir_handle = NULL; + char *pgfidstr = NULL; + char *saveptr = NULL; + ssize_t len = 0; + inode_t *inode = NULL; + struct iatt iabuf = { + 0, + }; + int ret = -1; + uuid_t tmp_gfid = { + 0, + }; + char *dir_stack[PATH_MAX / 2 + 1]; /* Since PATH_MAX/2 also gives + an upper bound on depth of + directories tree */ + uuid_t gfid_stack[PATH_MAX / 2 + 1]; + + char *dir_name = NULL; + char *saved_dir = 
NULL; + int top = -1; + + if (!path || !parent || !priv_base_path || gf_uuid_is_null(gfid)) { + *op_errno = EINVAL; + goto out; + } + + dir_handle = alloca(handle_size); + linkname = alloca(PATH_MAX); + gf_uuid_copy(tmp_gfid, gfid); + + while (top < PATH_MAX / 2) { + gf_uuid_copy(gfid_stack[++top], tmp_gfid); + if (__is_root_gfid(tmp_gfid)) { + *parent = inode_ref(itable->root); + + saved_dir = alloca(sizeof("/")); + strcpy(saved_dir, "/"); + dir_stack[top] = saved_dir; + break; + } else { + snprintf(dir_handle, handle_size, "%s/%s/%02x/%02x/%s", + priv_base_path, GF_HIDDEN_PATH, tmp_gfid[0], tmp_gfid[1], + uuid_utoa(tmp_gfid)); + + len = sys_readlink(dir_handle, linkname, PATH_MAX); + if (len < 0) { + *op_errno = errno; + gf_msg(this->name, + (errno == ENOENT || errno == ESTALE) ? GF_LOG_DEBUG + : GF_LOG_ERROR, + errno, P_MSG_READLINK_FAILED, + "could not read" + " the link from the gfid handle %s ", + dir_handle); + ret = -1; + goto out; + } + + linkname[len] = '\0'; + + pgfidstr = strtok_r(linkname + SLEN("../../00/00/"), "/", &saveptr); + dir_name = strtok_r(NULL, "/", &saveptr); + saved_dir = alloca(strlen(dir_name) + 1); + gf_uuid_parse(pgfidstr, tmp_gfid); + strcpy(saved_dir, dir_name); + dir_stack[top] = saved_dir; + } + } + if (top == PATH_MAX / 2) { + gf_msg(this->name, GF_LOG_ERROR, P_MSG_ANCESTORY_FAILED, 0, + "build ancestry failed due to " + "deep directory hierarchy, depth: %d.", + top); + *op_errno = EINVAL; + ret = -1; + goto out; + } + + while (top >= 0) { + if (!*parent) { + /* There's no real "root" cause for how we end up here, + * so for now let's log this and bail out to prevent + * crashes. 
+ */ + gf_msg(this->name, GF_LOG_WARNING, P_MSG_INODE_RESOLVE_FAILED, 0, + "OOPS: *parent is null (path: %s), bailing!", path); + goto out; + } + + memset(&iabuf, 0, sizeof(iabuf)); + inode = posix_resolve(this, itable, *parent, dir_stack[top], &iabuf); + if (inode == NULL) { + gf_msg(this->name, GF_LOG_ERROR, P_MSG_INODE_RESOLVE_FAILED, 0, + "posix resolve on the inode %s failed", + uuid_utoa(gfid_stack[top])); + *op_errno = ESTALE; + ret = -1; + goto out; + } + + ret = posix_make_ancestral_node(priv_base_path, path, pathsize, head, + dir_stack[top], &iabuf, inode, type, + xdata); + if (ret < 0) { + *op_errno = ENOMEM; + goto out; + } + + inode_unref(*parent); + *parent = inode; + top--; + } +out: + return ret; +} + +int +posix_handle_relpath(xlator_t *this, uuid_t gfid, const char *basename, + char *buf, size_t buflen) +{ + char *uuid_str = NULL; + int len = 0; + + len = POSIX_GFID_HANDLE_RELSIZE; + + if (basename) { + len += (strlen(basename) + 1); + } + + if (buflen < len || !buf) + return len; + + uuid_str = uuid_utoa(gfid); + + if (basename) { + len = snprintf(buf, buflen, "../../%02x/%02x/%s/%s", gfid[0], gfid[1], + uuid_str, basename); + } else { + len = snprintf(buf, buflen, "../../%02x/%02x/%s", gfid[0], gfid[1], + uuid_str); + } + + return len; +} + +/* + TODO: explain how this pump fixes ELOOP +*/ +gf_boolean_t +posix_is_malformed_link(xlator_t *this, char *base_str, char *linkname, + size_t len) +{ + if ((len == 8) && strcmp(linkname, "../../..")) /*for root*/ + goto err; + + if (len < 50 || len >= 512) + goto err; + + if (memcmp(linkname, "../../", 6) != 0) + goto err; + + if ((linkname[2] != '/') || (linkname[5] != '/') || (linkname[8] != '/') || + (linkname[11] != '/') || (linkname[48] != '/')) { + goto err; + } + + if ((linkname[20] != '-') || (linkname[25] != '-') || + (linkname[30] != '-') || (linkname[35] != '-')) { + goto err; + } + + return _gf_false; + +err: + gf_log_callingfn(this->name, GF_LOG_ERROR, + "malformed internal link " + "%s for 
%s", + linkname, base_str); + return _gf_true; +} + +int +posix_handle_pump(xlator_t *this, char *buf, int len, int maxlen, + char *base_str, int base_len, int pfx_len) +{ + char linkname[512] = { + 0, + }; /* "../../<gfid>/<NAME_MAX>" */ + int ret = 0; + int blen = 0; + int link_len = 0; + char tmpstr[POSIX_GFID_HASH2_LEN] = { + 0, + }; + char d2[3] = { + 0, + }; + int index = 0; + int dirfd = 0; + struct posix_private *priv = this->private; + + strncpy(tmpstr, (base_str + pfx_len + 3), 40); + strncpy(d2, (base_str + pfx_len), 2); + index = strtoul(d2, NULL, 16); + dirfd = priv->arrdfd[index]; + + /* is a directory's symlink-handle */ + ret = readlinkat(dirfd, tmpstr, linkname, 512); + if (ret == -1) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_READLINK_FAILED, + "internal readlink failed on %s ", base_str); + goto err; + } + + if (ret < 512) + linkname[ret] = 0; + + link_len = ret; + + if ((ret == 8) && memcmp(linkname, "../../..", 8) == 0) { + if (strcmp(base_str, buf) == 0) { + strcpy(buf + pfx_len, ".."); + } + goto out; + } + + if (posix_is_malformed_link(this, base_str, linkname, ret)) + goto err; + + blen = link_len - 48; + + if (len + blen >= maxlen) { + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_HANDLEPATH_FAILED, + "Unable to form handle path for %s (maxlen = %d)", buf, maxlen); + goto err; + } + + memmove(buf + base_len + blen, buf + base_len, + (strlen(buf) - base_len) + 1); + + strncpy(base_str + pfx_len, linkname + 6, 42); + + strncpy(buf + pfx_len, linkname + 6, link_len - 6); +out: + return len + blen; +err: + return -1; +} + +/* + posix_handle_path differs from posix_handle_gfid_path in the way that the + path filled in @buf by posix_handle_path will return type IA_IFDIR when + an lstat() is performed on it, whereas posix_handle_gfid_path returns path + to the handle symlink (typically used for the purpose of unlinking it). 
+ + posix_handle_path also guarantees immunity to ELOOP on the path returned by it +*/ + +int +posix_handle_path(xlator_t *this, uuid_t gfid, const char *basename, char *ubuf, + size_t size) +{ + struct posix_private *priv = NULL; + char *uuid_str = NULL; + int len = 0; + int ret = -1; + struct stat stat; + char *base_str = NULL; + int base_len = 0; + int pfx_len; + int maxlen; + char *buf; + int index = 0; + int dfd = 0; + char newstr[POSIX_GFID_HASH2_LEN] = { + 0, + }; + + priv = this->private; + + uuid_str = uuid_utoa(gfid); + + if (ubuf) { + buf = ubuf; + maxlen = size; + } else { + maxlen = PATH_MAX; + buf = alloca(maxlen); + } + + index = gfid[0]; + dfd = priv->arrdfd[index]; + + base_len = (priv->base_path_length + SLEN(GF_HIDDEN_PATH) + 45); + base_str = alloca(base_len + 1); + base_len = snprintf(base_str, base_len + 1, "%s/%s/%02x/%02x/%s", + priv->base_path, GF_HIDDEN_PATH, gfid[0], gfid[1], + uuid_str); + pfx_len = priv->base_path_length + 1 + SLEN(GF_HIDDEN_PATH) + 1; + + if (basename) { + len = snprintf(buf, maxlen, "%s/%s", base_str, basename); + } else { + len = snprintf(buf, maxlen, "%s", base_str); + } + + snprintf(newstr, sizeof(newstr), "%02x/%s", gfid[1], uuid_str); + ret = sys_fstatat(dfd, newstr, &stat, AT_SYMLINK_NOFOLLOW); + + if (!(ret == 0 && S_ISLNK(stat.st_mode) && stat.st_nlink == 1)) + goto out; + + do { + errno = 0; + ret = posix_handle_pump(this, buf, len, maxlen, base_str, base_len, + pfx_len); + len = ret; + + if (ret == -1) + break; + ret = sys_lstat(buf, &stat); + } while ((ret == -1) && errno == ELOOP); + +out: + return len + 1; +} + +int +posix_handle_gfid_path(xlator_t *this, uuid_t gfid, char *buf, size_t buflen) +{ + struct posix_private *priv = NULL; + char *uuid_str = NULL; + int len = 0; + + priv = this->private; + + len = POSIX_GFID_HANDLE_SIZE(priv->base_path_length); + + len += 256; /* worst-case for directory's symlink-handle expansion */ + + if ((buflen < len) || !buf) + return len; + + uuid_str = uuid_utoa(gfid); + 
+ if (__is_root_gfid(gfid)) { + len = snprintf(buf, buflen, "%s", priv->base_path); + } else { + len = snprintf(buf, buflen, "%s/%s/%02x/%02x/%s", priv->base_path, + GF_HIDDEN_PATH, gfid[0], gfid[1], uuid_str); + } + + return len; +} + +int +posix_handle_init(xlator_t *this) +{ + struct posix_private *priv = NULL; + char *handle_pfx = NULL; + int ret = 0; + struct stat stbuf; + struct stat rootbuf; + struct stat exportbuf; + char *rootstr = NULL; + static uuid_t gfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; + int dfd = 0; + + priv = this->private; + + ret = sys_stat(priv->base_path, &exportbuf); + if (ret || !S_ISDIR(exportbuf.st_mode)) { + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_HANDLE_CREATE, + "Not a directory: %s", priv->base_path); + return -1; + } + + handle_pfx = alloca(priv->base_path_length + 1 + SLEN(GF_HIDDEN_PATH) + 1); + + sprintf(handle_pfx, "%s/%s", priv->base_path, GF_HIDDEN_PATH); + + ret = sys_stat(handle_pfx, &stbuf); + switch (ret) { + case -1: + if (errno == ENOENT) { + ret = sys_mkdir(handle_pfx, 0600); + if (ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_HANDLE_CREATE, + "Creating directory %s failed", handle_pfx); + return -1; + } + } else { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_HANDLE_CREATE, + "Checking for %s failed", handle_pfx); + return -1; + } + break; + case 0: + if (!S_ISDIR(stbuf.st_mode)) { + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_HANDLE_CREATE, + "Not a directory: %s", handle_pfx); + return -1; + } + break; + default: + break; + } + + ret = sys_stat(handle_pfx, &priv->handledir); + + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_HANDLE_CREATE, + "stat for %s failed", handle_pfx); + return -1; + } + + MAKE_HANDLE_ABSPATH_FD(rootstr, this, gfid, dfd); + ret = sys_fstatat(dfd, rootstr, &rootbuf, 0); + switch (ret) { + case -1: + if (errno != ENOENT) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_HANDLE_CREATE, + "%s", priv->base_path); + return -1; + } + ret = 
posix_handle_mkdir_hashes(this, dfd, gfid); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_HANDLE_CREATE, + "mkdir %s failed", rootstr); + return -1; + } + + ret = sys_symlinkat("../../..", dfd, rootstr); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_HANDLE_CREATE, + "symlink %s creation failed", rootstr); + return -1; + } + break; + case 0: + if ((exportbuf.st_ino == rootbuf.st_ino) && + (exportbuf.st_dev == rootbuf.st_dev)) + return 0; + + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_HANDLE_CREATE, + "Different dirs %s (%lld/%lld) != %s (%lld/%lld)", + priv->base_path, (long long)exportbuf.st_ino, + (long long)exportbuf.st_dev, rootstr, + (long long)rootbuf.st_ino, (long long)rootbuf.st_dev); + return -1; + + break; + } + + return 0; +} + +gf_boolean_t +posix_does_old_trash_exists(char *old_trash) +{ + uuid_t gfid = {0}; + gf_boolean_t exists = _gf_false; + struct stat stbuf = {0}; + int ret = 0; + + ret = sys_lstat(old_trash, &stbuf); + if ((ret == 0) && S_ISDIR(stbuf.st_mode)) { + ret = sys_lgetxattr(old_trash, "trusted.gfid", gfid, 16); + if ((ret < 0) && (errno == ENODATA || errno == ENOATTR)) + exists = _gf_true; + } + return exists; +} + +int +posix_handle_new_trash_init(xlator_t *this, char *trash) +{ + int ret = 0; + struct stat stbuf = {0}; + + ret = sys_lstat(trash, &stbuf); + switch (ret) { + case -1: + if (errno == ENOENT) { + ret = sys_mkdir(trash, 0755); + if (ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, errno, + P_MSG_HANDLE_TRASH_CREATE, + "Creating directory %s failed", trash); + } + } else { + gf_msg(this->name, GF_LOG_ERROR, errno, + P_MSG_HANDLE_TRASH_CREATE, "Checking for %s failed", + trash); + } + break; + case 0: + if (!S_ISDIR(stbuf.st_mode)) { + gf_msg(this->name, GF_LOG_ERROR, errno, + P_MSG_HANDLE_TRASH_CREATE, "Not a directory: %s", trash); + ret = -1; + } + break; + default: + break; + } + return ret; +} + +int +posix_mv_old_trash_into_new_trash(xlator_t *this, char *old, char *new) +{ + char 
dest_old[PATH_MAX] = {0}; + int ret = 0; + uuid_t dest_name = {0}; + + if (!posix_does_old_trash_exists(old)) + goto out; + gf_uuid_generate(dest_name); + snprintf(dest_old, sizeof(dest_old), "%s/%s", new, uuid_utoa(dest_name)); + ret = sys_rename(old, dest_old); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_HANDLE_TRASH_CREATE, + "Not able to move %s -> %s ", old, dest_old); + } +out: + return ret; +} + +int +posix_handle_trash_init(xlator_t *this) +{ + int ret = -1; + struct posix_private *priv = NULL; + char old_trash[PATH_MAX] = {0}; + + priv = this->private; + + priv->trash_path = GF_MALLOC(priv->base_path_length + SLEN("/") + + SLEN(GF_HIDDEN_PATH) + SLEN("/") + + SLEN(TRASH_DIR) + 1, + gf_posix_mt_trash_path); + + if (!priv->trash_path) + goto out; + + snprintf( + priv->trash_path, + priv->base_path_length + SLEN(GF_HIDDEN_PATH) + SLEN(TRASH_DIR) + 3, + "%s/%s/%s", priv->base_path, GF_HIDDEN_PATH, TRASH_DIR); + + ret = posix_handle_new_trash_init(this, priv->trash_path); + if (ret) + goto out; + snprintf(old_trash, sizeof(old_trash), "%s/.landfill", priv->base_path); + ret = posix_mv_old_trash_into_new_trash(this, old_trash, priv->trash_path); +out: + return ret; +} + +int +posix_handle_mkdir_hashes(xlator_t *this, int dirfd, uuid_t gfid) +{ + int ret = -1; + char d2[3] = { + 0, + }; + + snprintf(d2, sizeof(d2), "%02x", gfid[1]); + ret = sys_mkdirat(dirfd, d2, 0700); + if (ret == -1 && errno != EEXIST) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_HANDLE_CREATE, + "error mkdir hash-2 %s ", uuid_utoa(gfid)); + return -1; + } + + return 0; +} + +int +posix_handle_hard(xlator_t *this, const char *oldpath, uuid_t gfid, + struct stat *oldbuf) +{ + struct stat newbuf; + struct stat hashbuf; + int ret = -1; + gf_boolean_t link_exists = _gf_false; + char d2[3] = { + 0, + }; + int dfd = -1; + char *newstr = NULL; + + MAKE_HANDLE_ABSPATH_FD(newstr, this, gfid, dfd); + ret = sys_fstatat(dfd, newstr, &newbuf, AT_SYMLINK_NOFOLLOW); + + if (ret == 
-1 && errno != ENOENT) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_HANDLE_CREATE, "%s", + uuid_utoa(gfid)); + return -1; + } + + if (ret == -1 && errno == ENOENT) { + snprintf(d2, sizeof(d2), "%02x", gfid[1]); + ret = sys_fstatat(dfd, d2, &hashbuf, 0); + if (ret) { + ret = posix_handle_mkdir_hashes(this, dfd, gfid); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_HANDLE_CREATE, + "mkdir %s failed ", uuid_utoa(gfid)); + return -1; + } + } + ret = sys_linkat(AT_FDCWD, oldpath, dfd, newstr); + + if (ret) { + if (errno != EEXIST) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_HANDLE_CREATE, + "link %s -> %s" + "failed ", + oldpath, newstr); + return -1; + } else { + link_exists = _gf_true; + } + } + ret = sys_fstatat(dfd, newstr, &newbuf, AT_SYMLINK_NOFOLLOW); + + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_HANDLE_CREATE, + "lstat on %s failed", uuid_utoa(gfid)); + return -1; + } + if ((link_exists) && (!S_ISREG(newbuf.st_mode))) { + gf_msg(this->name, GF_LOG_ERROR, EINVAL, P_MSG_HANDLE_CREATE, + "%s - Expected regular file", uuid_utoa(gfid)); + return -1; + } + } + + if (newbuf.st_ino != oldbuf->st_ino || newbuf.st_dev != oldbuf->st_dev) { + gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_HANDLE_CREATE, + "mismatching ino/dev between file %s (%lld/%lld) " + "and handle %s (%lld/%lld)", + oldpath, (long long)oldbuf->st_ino, (long long)oldbuf->st_dev, + uuid_utoa(gfid), (long long)newbuf.st_ino, + (long long)newbuf.st_dev); + ret = -1; + } + + return ret; +} + +int +posix_handle_soft(xlator_t *this, const char *real_path, loc_t *loc, + uuid_t gfid, struct stat *oldbuf) +{ + char *oldpath = NULL; + char *newpath = NULL; + struct stat newbuf; + struct stat hashbuf; + int ret = -1; + char d2[3] = { + 0, + }; + int dfd = -1; + char *newstr = NULL; + + MAKE_HANDLE_ABSPATH(newpath, this, gfid); + MAKE_HANDLE_ABSPATH_FD(newstr, this, gfid, dfd); + MAKE_HANDLE_RELPATH(oldpath, this, loc->pargfid, loc->name); + + ret = sys_fstatat(dfd, 
newstr, &newbuf, AT_SYMLINK_NOFOLLOW); + + if (ret == -1 && errno != ENOENT) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_HANDLE_CREATE, "%s", + newstr); + return -1; + } + + if (ret == -1 && errno == ENOENT) { + if (posix_is_malformed_link(this, newpath, oldpath, strlen(oldpath))) { + GF_ASSERT(!"Malformed link"); + errno = EINVAL; + return -1; + } + + snprintf(d2, sizeof(d2), "%02x", gfid[1]); + ret = sys_fstatat(dfd, d2, &hashbuf, 0); + + if (ret) { + ret = posix_handle_mkdir_hashes(this, dfd, gfid); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_HANDLE_CREATE, + "mkdir %s failed ", newstr); + return -1; + } + } + ret = sys_symlinkat(oldpath, dfd, newstr); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_HANDLE_CREATE, + "symlink %s -> %s failed", oldpath, newstr); + return -1; + } + + ret = sys_fstatat(dfd, newstr, &newbuf, AT_SYMLINK_NOFOLLOW); + + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_HANDLE_CREATE, + "stat on %s failed ", newstr); + return -1; + } + } + + ret = sys_stat(real_path, &newbuf); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_HANDLE_CREATE, + "stat on %s failed ", real_path); + return -1; + } + + if (!oldbuf) + return ret; + + if (newbuf.st_ino != oldbuf->st_ino || newbuf.st_dev != oldbuf->st_dev) { + gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_HANDLE_CREATE, + "mismatching ino/dev between file %s (%lld/%lld) " + "and handle %s (%lld/%lld)", + oldpath, (long long)oldbuf->st_ino, (long long)oldbuf->st_dev, + newpath, (long long)newbuf.st_ino, (long long)newbuf.st_dev); + ret = -1; + } + + return ret; +} + +int +posix_handle_unset_gfid(xlator_t *this, uuid_t gfid) +{ + int ret = 0; + struct stat stat; + int index = 0; + int dfd = 0; + char newstr[POSIX_GFID_HASH2_LEN] = { + 0, + }; + struct posix_private *priv = this->private; + + index = gfid[0]; + dfd = priv->arrdfd[index]; + + snprintf(newstr, sizeof(newstr), "%02x/%s", gfid[1], uuid_utoa(gfid)); + ret = sys_fstatat(dfd, 
newstr, &stat, AT_SYMLINK_NOFOLLOW); + + if (ret == -1) { + if (errno != ENOENT) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_HANDLE_DELETE, "%s", + newstr); + } + goto out; + } + + ret = sys_unlinkat(dfd, newstr); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_HANDLE_DELETE, + "unlink %s is failed", newstr); + } + +out: + return ret; +} + +int +posix_handle_unset(xlator_t *this, uuid_t gfid, const char *basename) +{ + int ret; + struct iatt stat; + char *path = NULL; + + if (!basename) { + ret = posix_handle_unset_gfid(this, gfid); + return ret; + } + + MAKE_HANDLE_PATH(path, this, gfid, basename); + if (!path) { + gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_HANDLE_DELETE, + "Failed to create handle path for %s (%s)", basename, + uuid_utoa(gfid)); + return -1; + } + + /* stat is being used only for gfid, so passing a NULL inode + * doesn't fetch time attributes which is fine + */ + ret = posix_istat(this, NULL, gfid, basename, &stat); + if (ret == -1) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_HANDLE_DELETE, "%s", + path); + return -1; + } + + ret = posix_handle_unset_gfid(this, stat.ia_gfid); + + return ret; +} + +int +posix_create_link_if_gfid_exists(xlator_t *this, uuid_t gfid, char *real_path, + inode_table_t *itable) +{ + int ret = -1; + char *newpath = NULL; + char *unlink_path = NULL; + uint64_t ctx_int = 0; + inode_t *inode = NULL; + struct stat stbuf = { + 0, + }; + struct posix_private *priv = NULL; + posix_inode_ctx_t *ctx = NULL; + + priv = this->private; + + MAKE_HANDLE_PATH(newpath, this, gfid, NULL); + if (!newpath) { + gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_HANDLE_CREATE, + "Failed to create handle path (%s)", uuid_utoa(gfid)); + return ret; + } + + ret = sys_lstat(newpath, &stbuf); + if (!ret) { + ret = sys_link(newpath, real_path); + } else { + inode = inode_find(itable, gfid); + if (!inode) + return -1; + + LOCK(&inode->lock); + { + ret = __posix_inode_ctx_get_all(inode, this, &ctx); + if (ret) + goto unlock; 
+ + if (ctx->unlink_flag != GF_UNLINK_TRUE) { + ret = -1; + goto unlock; + } + + POSIX_GET_FILE_UNLINK_PATH(priv->base_path, gfid, unlink_path); + ret = sys_link(unlink_path, real_path); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_HANDLE_CREATE, + "Failed to link " + "%s with %s", + real_path, unlink_path); + goto unlock; + } + ret = sys_rename(unlink_path, newpath); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_HANDLE_CREATE, + "Failed to link " + "%s with %s", + real_path, unlink_path); + goto unlock; + } + ctx_int = GF_UNLINK_FALSE; + ret = __posix_inode_ctx_set_unlink_flag(inode, this, ctx_int); + } + unlock: + UNLOCK(&inode->lock); + + inode_unref(inode); + } + + return ret; +} diff --git a/xlators/storage/posix/src/posix-handle.h b/xlators/storage/posix/src/posix-handle.h new file mode 100644 index 00000000000..f33ed92620d --- /dev/null +++ b/xlators/storage/posix/src/posix-handle.h @@ -0,0 +1,221 @@ +/* + Copyright (c) 2011-2017 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ +#ifndef _POSIX_HANDLE_H +#define _POSIX_HANDLE_H + +#include "posix-inode-handle.h" + +#define HANDLE_ABSPATH_LEN(this) \ + (POSIX_BASE_PATH_LEN(this) + \ + SLEN("/" GF_HIDDEN_PATH "/00/00/" UUID0_STR) + 1) + +#define MAKE_PGFID_XATTR_KEY(var, prefix, pgfid) \ + do { \ + var = alloca(SLEN(prefix) + UUID_CANONICAL_FORM_LEN + 1); \ + strcpy(var, prefix); \ + strcat(var, uuid_utoa(pgfid)); \ + } while (0) + +#define SET_PGFID_XATTR(path, key, value, flags, op_ret, this, label) \ + do { \ + value = hton32(value); \ + op_ret = sys_lsetxattr(path, key, &value, sizeof(value), flags); \ + if (op_ret == -1) { \ + op_errno = errno; \ + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_PGFID_OP, \ + "setting xattr failed on %s: key = %s ", path, key); \ + goto label; \ + } \ + } while (0) + +#define SET_PGFID_XATTR_IF_ABSENT(path, key, value, flags, op_ret, this, \ + label) \ + do { \ + op_ret = sys_lgetxattr(path, key, &value, sizeof(value)); \ + if (op_ret == -1) { \ + op_errno = errno; \ + if (op_errno == ENOATTR) { \ + value = 1; \ + SET_PGFID_XATTR(path, key, value, flags, op_ret, this, label); \ + } else { \ + gf_msg(this->name, GF_LOG_WARNING, op_errno, P_MSG_PGFID_OP, \ + "getting xattr " \ + "failed on %s: key = %s ", \ + path, key); \ + } \ + } \ + } while (0) + +#define REMOVE_PGFID_XATTR(path, key, op_ret, this, label) \ + do { \ + op_ret = sys_lremovexattr(path, key); \ + if (op_ret == -1) { \ + op_errno = errno; \ + gf_msg(this->name, GF_LOG_WARNING, op_errno, P_MSG_PGFID_OP, \ + "removing xattr failed" \ + "on %s: key = %s", \ + path, key); \ + goto label; \ + } \ + } while (0) + +/* should be invoked holding a lock */ +#define LINK_MODIFY_PGFID_XATTR(path, key, value, flags, op_ret, this, label) \ + do { \ + op_ret = sys_lgetxattr(path, key, &value, sizeof(value)); \ + if (op_ret == -1) { \ + op_errno = errno; \ + if (op_errno == ENOATTR || op_errno == ENODATA) { \ + value = 1; \ + } else { \ + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_PGFID_OP, \ + 
"getting xattr " \ + "failed on %s: key = %s ", \ + path, key); \ + goto label; \ + } \ + } else { \ + value = ntoh32(value); \ + value++; \ + } \ + SET_PGFID_XATTR(path, key, value, flags, op_ret, this, label); \ + } while (0) + +/* should be invoked holding a lock */ +#define UNLINK_MODIFY_PGFID_XATTR(path, key, value, flags, op_ret, this, \ + label) \ + do { \ + op_ret = sys_lgetxattr(path, key, &value, sizeof(value)); \ + if (op_ret == -1) { \ + op_errno = errno; \ + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_PGFID_OP, \ + "getting xattr failed on " \ + "%s: key = %s ", \ + path, key); \ + goto label; \ + } else { \ + value = ntoh32(value); \ + value--; \ + if (value > 0) { \ + SET_PGFID_XATTR(path, key, value, flags, op_ret, this, label); \ + } else { \ + REMOVE_PGFID_XATTR(path, key, op_ret, this, label); \ + } \ + } \ + } while (0) + +#define MAKE_HANDLE_GFID_PATH(var, this, gfid) \ + do { \ + int __len = 0; \ + struct posix_private *__priv = this->private; \ + __len = POSIX_GFID_HANDLE_SIZE(__priv->base_path_length); \ + __len += 256; \ + var = alloca(__len); \ + __len = posix_handle_gfid_path(this, gfid, var, __len); \ + } while (0) + +#define MAKE_HANDLE_RELPATH(var, this, gfid, base) \ + do { \ + int __len; \ + __len = POSIX_GFID_HANDLE_RELSIZE; \ + if (base) { \ + __len += (strlen(base) + 1); \ + } \ + var = alloca(__len); \ + __len = posix_handle_relpath(this, gfid, base, var, __len); \ + } while (0) + +#define MAKE_HANDLE_ABSPATH(var, this, gfid) \ + do { \ + struct posix_private *__priv = this->private; \ + int __len = HANDLE_ABSPATH_LEN(this); \ + var = alloca(__len); \ + snprintf(var, __len, "%s/" GF_HIDDEN_PATH "/%02x/%02x/%s", \ + __priv->base_path, gfid[0], gfid[1], uuid_utoa(gfid)); \ + } while (0) + +#define MAKE_HANDLE_ABSPATH_FD(var, this, gfid, dfd) \ + do { \ + struct posix_private *__priv = this->private; \ + int findex = gfid[0]; \ + int __len = POSIX_GFID_HASH2_LEN; \ + var = alloca(__len); \ + snprintf(var, __len, "%02x/%s", 
gfid[1], uuid_utoa(gfid)); \ + dfd = __priv->arrdfd[findex]; \ + } while (0) + +#define MAKE_ENTRY_HANDLE(entp, parp, this, loc, ent_p) \ + do { \ + char *__parp; \ + \ + if (gf_uuid_is_null(loc->pargfid) || !loc->name) { \ + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_ENTRY_HANDLE_CREATE, \ + "null pargfid/name for path %s", loc->path); \ + break; \ + } \ + \ + if (strchr(loc->name, '/')) { \ + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_ENTRY_HANDLE_CREATE, \ + "'/' in name not allowed: (%s)", loc->name); \ + op_ret = -1; \ + break; \ + } \ + if (LOC_HAS_ABSPATH(loc)) { \ + MAKE_REAL_PATH(entp, this, loc->path); \ + __parp = strdupa(entp); \ + parp = dirname(__parp); \ + op_ret = posix_pstat(this, loc->inode, NULL, entp, ent_p, \ + _gf_false); \ + break; \ + } \ + errno = 0; \ + op_ret = posix_istat(this, loc->inode, loc->pargfid, loc->name, \ + ent_p); \ + if (errno != ELOOP) { \ + MAKE_HANDLE_PATH(parp, this, loc->pargfid, NULL); \ + MAKE_HANDLE_PATH(entp, this, loc->pargfid, loc->name); \ + if (!parp || !entp) { \ + gf_msg(this->name, GF_LOG_ERROR, errno, \ + P_MSG_ENTRY_HANDLE_CREATE, \ + "Failed to create entry handle " \ + "for path %s", \ + loc->path); \ + } \ + break; \ + } \ + /* __ret == -1 && errno == ELOOP */ \ + /* expand ELOOP */ \ + } while (0) + +#define POSIX_GFID_HASH2_LEN 45 +int +posix_handle_gfid_path(xlator_t *this, uuid_t gfid, char *buf, size_t len); + +int +posix_handle_hard(xlator_t *this, const char *path, uuid_t gfid, + struct stat *buf); + +int +posix_handle_soft(xlator_t *this, const char *real_path, loc_t *loc, + uuid_t gfid, struct stat *buf); + +int +posix_handle_unset(xlator_t *this, uuid_t gfid, const char *basename); + +int +posix_create_link_if_gfid_exists(xlator_t *this, uuid_t gfid, char *real_path, + inode_table_t *itable); + +int +posix_check_internal_writes(xlator_t *this, fd_t *fd, int sysfd, dict_t *xdata); + +void +posix_disk_space_check(xlator_t *this); +#endif /* !_POSIX_HANDLE_H */ diff --git 
a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c new file mode 100644 index 00000000000..67db3324083 --- /dev/null +++ b/xlators/storage/posix/src/posix-helpers.c @@ -0,0 +1,3666 @@ +/* + Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#define __XOPEN_SOURCE 500 + +#include <stdint.h> +#include <sys/time.h> +#include <sys/resource.h> +#include <errno.h> +#include <libgen.h> +#include <pthread.h> +#include <ftw.h> +#include <sys/stat.h> +#include <signal.h> +#include <aio.h> + +#ifdef HAVE_SYS_ACL_H +#ifdef HAVE_ACL_LIBACL_H /* for acl_to_any_text() */ +#include <acl/libacl.h> +#else /* FreeBSD and others */ +#include <sys/acl.h> +#endif +#endif + +#ifndef GF_BSD_HOST_OS +#include <alloca.h> +#endif /* GF_BSD_HOST_OS */ + +#include <fnmatch.h> +#include "posix.h" +#include "posix-messages.h" +#include "posix-metadata.h" +#include "posix-handle.h" +#include <glusterfs/compat-errno.h> +#include <glusterfs/compat.h> +#include <glusterfs/byte-order.h> +#include <glusterfs/syscall.h> +#include <glusterfs/statedump.h> +#include <glusterfs/locking.h> +#include <glusterfs/timer.h> +#include "glusterfs3-xdr.h" +#include <glusterfs/glusterfs-acl.h> +#include "posix-gfid-path.h" +#include <glusterfs/events.h> +#include "glusterfs/syncop.h" +#include "timer-wheel.h" +#include <sys/types.h> + +char *marker_xattrs[] = {"trusted.glusterfs.quota.*", + "trusted.glusterfs.*.xtime", NULL}; + +static char *marker_contri_key = "trusted.*.*.contri"; + +static char *posix_ignore_xattrs[] = {"gfid-req", + GLUSTERFS_INTERNAL_FOP_KEY, + GLUSTERFS_ENTRYLK_COUNT, + GLUSTERFS_INODELK_COUNT, + GLUSTERFS_POSIXLK_COUNT, + GLUSTERFS_PARENT_ENTRYLK, + 
GF_GFIDLESS_LOOKUP, + GLUSTERFS_INODELK_DOM_COUNT, + NULL}; + +static char *list_xattr_ignore_xattrs[] = {GFID_XATTR_KEY, GF_XATTR_VOL_ID_KEY, + GF_SELINUX_XATTR_KEY, NULL}; + +gf_boolean_t +posix_special_xattr(char **pattern, char *key) +{ + int i = 0; + gf_boolean_t flag = _gf_false; + + GF_VALIDATE_OR_GOTO("posix", pattern, out); + GF_VALIDATE_OR_GOTO("posix", key, out); + + for (i = 0; pattern[i]; i++) { + if (!fnmatch(pattern[i], key, 0)) { + flag = _gf_true; + break; + } + } +out: + return flag; +} + +int +posix_handle_mdata_xattr(call_frame_t *frame, const char *name, int *op_errno) +{ + int i = 0; + int ret = 0; + int pid = 1; + static const char *const internal_xattr[] = {GF_XATTR_MDATA_KEY, NULL}; + if (frame && frame->root) { + pid = frame->root->pid; + } + + if (!name || pid < GF_CLIENT_PID_MAX) { + /* No need to do anything here */ + ret = 0; + goto out; + } + + for (i = 0; internal_xattr[i]; i++) { + if (fnmatch(internal_xattr[i], name, FNM_PERIOD) == 0) { + ret = -1; + if (op_errno) { + *op_errno = ENOATTR; + } + + gf_msg_debug("posix", ENOATTR, + "Ignoring the key %s as an internal " + "xattrs.", + name); + goto out; + } + } + + ret = 0; +out: + return ret; +} + +int +posix_handle_georep_xattrs(call_frame_t *frame, const char *name, int *op_errno, + gf_boolean_t is_getxattr) +{ + int i = 0; + int ret = 0; + int pid = 1; + gf_boolean_t filter_xattr = _gf_true; + static const char *georep_xattr[] = { + "*.glusterfs.*.stime", "*.glusterfs.*.xtime", + "*.glusterfs.*.entry_stime", "*.glusterfs.volume-mark.*", NULL}; + + if (!name) { + /* No need to do anything here */ + ret = 0; + goto out; + } + + if (frame && frame->root) { + pid = frame->root->pid; + } + + if (pid == GF_CLIENT_PID_GSYNCD && is_getxattr) { + filter_xattr = _gf_false; + + /* getxattr from gsyncd process should return all the + * internal xattr. 
In other cases ignore such xattrs + */ + } + + for (i = 0; filter_xattr && georep_xattr[i]; i++) { + if (fnmatch(georep_xattr[i], name, FNM_PERIOD) == 0) { + ret = -1; + if (op_errno) + *op_errno = ENOATTR; + + gf_msg_debug("posix", ENOATTR, + "Ignoring the key %s as an internal " + "xattrs.", + name); + goto out; + } + } + + ret = 0; +out: + return ret; +} + +int32_t +posix_set_mode_in_dict(dict_t *in_dict, dict_t *out_dict, struct iatt *in_stbuf) +{ + int ret = -1; + mode_t mode = 0; + + if ((!in_dict) || (!in_stbuf) || (!out_dict)) { + goto out; + } + + /* We need this only for files */ + if (!(IA_ISREG(in_stbuf->ia_type))) { + ret = 0; + goto out; + } + + /* Nobody asked for this */ + if (!dict_get(in_dict, DHT_MODE_IN_XDATA_KEY)) { + ret = 0; + goto out; + } + mode = st_mode_from_ia(in_stbuf->ia_prot, in_stbuf->ia_type); + + ret = dict_set_int32(out_dict, DHT_MODE_IN_XDATA_KEY, mode); + +out: + return ret; +} + +static gf_boolean_t +posix_xattr_ignorable(char *key) +{ + return gf_get_index_by_elem(posix_ignore_xattrs, key) >= 0; +} + +static int +_posix_xattr_get_set_from_backend(posix_xattr_filler_t *filler, char *key) +{ + ssize_t xattr_size = 256; /* guesstimated initial size of xattr */ + int ret = -1; + char *value = NULL; + + if (!gf_is_valid_xattr_namespace(key)) { + goto out; + } + + /* Most of the gluster internal xattrs don't exceed 256 bytes. So try + * getxattr with ~256 bytes. If it gives ERANGE then go the old way + * of getxattr with NULL buf to find the length and then getxattr with + * allocated buf to fill the data. This way we reduce lot of getxattrs. 
+ */ + + value = GF_MALLOC(xattr_size + 1, gf_posix_mt_char); + if (!value) { + goto out; + } + + if (filler->real_path) + xattr_size = sys_lgetxattr(filler->real_path, key, value, xattr_size); + else + xattr_size = sys_fgetxattr(filler->fdnum, key, value, xattr_size); + + if (xattr_size == -1) { + if (value) { + GF_FREE(value); + value = NULL; + } + /* xattr_size == -1 - failed to fetch the xattr with + * current settings. + * If it was not because value was too small, abort + */ + if (errno != ERANGE) { + goto out; + } + + /* Get the real length needed */ + if (filler->real_path) { + xattr_size = sys_lgetxattr(filler->real_path, key, NULL, 0); + } else { + xattr_size = sys_fgetxattr(filler->fdnum, key, NULL, 0); + } + if (xattr_size == -1) { + goto out; + } + + value = GF_MALLOC(xattr_size + 1, gf_posix_mt_char); + if (!value) { + goto out; + } + + if (filler->real_path) { + xattr_size = sys_lgetxattr(filler->real_path, key, value, + xattr_size); + } else { + xattr_size = sys_fgetxattr(filler->fdnum, key, value, xattr_size); + } + if (xattr_size == -1) { + GF_FREE(value); + value = NULL; + if (filler->real_path) + gf_msg(filler->this->name, GF_LOG_WARNING, 0, + P_MSG_XATTR_FAILED, "getxattr failed. path: %s, key: %s", + filler->real_path, key); + else + gf_msg(filler->this->name, GF_LOG_WARNING, 0, + P_MSG_XATTR_FAILED, "getxattr failed. gfid: %s, key: %s", + uuid_utoa(filler->fd->inode->gfid), key); + goto out; + } + } + + value[xattr_size] = '\0'; + ret = dict_set_bin(filler->xattr, key, value, xattr_size); + + if (ret < 0) { + if (value) + GF_FREE(value); + if (filler->real_path) + gf_msg_debug(filler->this->name, 0, + "dict set failed. path: %s, key: %s", + filler->real_path, key); + else + gf_msg_debug(filler->this->name, 0, + "dict set failed. 
gfid: %s, key: %s", + uuid_utoa(filler->fd->inode->gfid), key); + goto out; + } + ret = 0; +out: + return ret; +} + +static int gf_posix_xattr_enotsup_log; + +static int +_posix_get_marker_all_contributions(posix_xattr_filler_t *filler) +{ + ssize_t size = -1, remaining_size = -1, list_offset = 0; + int ret = -1; + int len; + char *list = NULL, key[4096] = { + 0, + }; + + if (filler->real_path) + size = sys_llistxattr(filler->real_path, NULL, 0); + else + size = sys_flistxattr(filler->fdnum, NULL, 0); + if (size == -1) { + if ((errno == ENOTSUP) || (errno == ENOSYS)) { + GF_LOG_OCCASIONALLY(gf_posix_xattr_enotsup_log, THIS->name, + GF_LOG_WARNING, + "Extended attributes not " + "supported (try remounting brick" + " with 'user_xattr' flag)"); + } else { + if (filler->real_path) + gf_msg(THIS->name, GF_LOG_WARNING, errno, P_MSG_XATTR_FAILED, + "listxattr failed on %s", filler->real_path); + else + gf_msg(THIS->name, GF_LOG_WARNING, errno, P_MSG_XATTR_FAILED, + "listxattr failed on %s", + uuid_utoa(filler->fd->inode->gfid)); + } + goto out; + } + + if (size == 0) { + ret = 0; + goto out; + } + + list = alloca(size); + if (!list) { + goto out; + } + + if (filler->real_path) + size = sys_llistxattr(filler->real_path, list, size); + else + size = sys_flistxattr(filler->fdnum, list, size); + if (size <= 0) { + ret = size; + goto out; + } + + remaining_size = size; + list_offset = 0; + + while (remaining_size > 0) { + len = snprintf(key, sizeof(key), "%s", list + list_offset); + if (fnmatch(marker_contri_key, key, 0) == 0) { + (void)_posix_xattr_get_set_from_backend(filler, key); + } + remaining_size -= (len + 1); + list_offset += (len + 1); + } + + ret = 0; + +out: + return ret; +} + +static int +_posix_get_marker_quota_contributions(posix_xattr_filler_t *filler, char *key) +{ + char *saveptr = NULL, *token = NULL, *tmp_key = NULL; + char *ptr = NULL; + int i = 0, ret = 0; + + tmp_key = ptr = gf_strdup(key); + if (tmp_key == NULL) { + return -1; + } + for (i = 0; i < 4; 
i++) { + token = strtok_r(tmp_key, ".", &saveptr); + tmp_key = NULL; + } + + if (strncmp(token, "contri", SLEN("contri")) == 0) { + ret = _posix_get_marker_all_contributions(filler); + } else { + ret = _posix_xattr_get_set_from_backend(filler, key); + } + + GF_FREE(ptr); + + return ret; +} + +static inode_t * +_get_filler_inode(posix_xattr_filler_t *filler) +{ + if (filler->fd) + return filler->fd->inode; + else if (filler->loc && filler->loc->inode) + return filler->loc->inode; + else + return NULL; +} + +static int +_posix_xattr_get_set(dict_t *xattr_req, char *key, data_t *data, + void *xattrargs) +{ + posix_xattr_filler_t *filler = xattrargs; + int ret = -1; + int len = 0; + char *databuf = NULL; + int _fd = -1; + ssize_t req_size = 0; + int32_t list_offset = 0; + ssize_t remaining_size = 0; + char *xattr = NULL; + inode_t *inode = NULL; + char *value = NULL; + struct iatt stbuf = { + 0, + }; + + if (posix_xattr_ignorable(key)) + goto out; + + len = strlen(key); + /* should size be put into the data_t ? */ + if ((filler->stbuf != NULL && IA_ISREG(filler->stbuf->ia_type)) && + (len == SLEN(GF_CONTENT_KEY) && !strcmp(key, GF_CONTENT_KEY))) { + if (!filler->real_path) + goto out; + + /* file content request */ + req_size = data_to_uint64(data); + if (req_size >= filler->stbuf->ia_size) { + _fd = open(filler->real_path, O_RDONLY); + if (_fd == -1) { + gf_msg(filler->this->name, GF_LOG_ERROR, errno, + P_MSG_XDATA_GETXATTR, "Opening file %s failed", + filler->real_path); + goto err; + } + + /* + * There could be a situation where the ia_size is + * zero. GF_CALLOC will return a pointer to the + * memory initialized by gf_mem_set_acct_info. + * This function adds a header and a footer to + * the allocated memory. The returned pointer + * points to the memory just after the header, but + * when size is zero, there is no space for user + * data. The memory can be freed by calling GF_FREE. 
+ */ + databuf = GF_CALLOC(1, filler->stbuf->ia_size, gf_posix_mt_char); + if (!databuf) { + goto err; + } + + ret = sys_read(_fd, databuf, filler->stbuf->ia_size); + if (ret == -1) { + gf_msg(filler->this->name, GF_LOG_ERROR, errno, + P_MSG_XDATA_GETXATTR, "Read on file %s failed", + filler->real_path); + goto err; + } + + ret = sys_close(_fd); + _fd = -1; + if (ret == -1) { + gf_msg(filler->this->name, GF_LOG_ERROR, errno, + P_MSG_XDATA_GETXATTR, "Close on file %s failed", + filler->real_path); + goto err; + } + + ret = dict_set_bin(filler->xattr, key, databuf, + filler->stbuf->ia_size); + if (ret < 0) { + gf_msg(filler->this->name, GF_LOG_ERROR, 0, + P_MSG_XDATA_GETXATTR, + "failed to set dict value. key: %s," + "path: %s", + key, filler->real_path); + goto err; + } + + /* To avoid double free in cleanup below */ + databuf = NULL; + err: + if (_fd != -1) + sys_close(_fd); + GF_FREE(databuf); + } + } else if (len == SLEN(GLUSTERFS_OPEN_FD_COUNT) && + !strcmp(key, GLUSTERFS_OPEN_FD_COUNT)) { + inode = _get_filler_inode(filler); + if (!inode || gf_uuid_is_null(inode->gfid)) + goto out; + ret = dict_set_uint32(filler->xattr, key, inode->fd_count); + if (ret < 0) { + gf_msg(filler->this->name, GF_LOG_WARNING, 0, P_MSG_DICT_SET_FAILED, + "Failed to set dictionary value for %s", key); + } + } else if (len == SLEN(GLUSTERFS_ACTIVE_FD_COUNT) && + !strcmp(key, GLUSTERFS_ACTIVE_FD_COUNT)) { + inode = _get_filler_inode(filler); + if (!inode || gf_uuid_is_null(inode->gfid)) + goto out; + ret = dict_set_uint32(filler->xattr, key, inode->active_fd_count); + if (ret < 0) { + gf_msg(filler->this->name, GF_LOG_WARNING, 0, P_MSG_DICT_SET_FAILED, + "Failed to set dictionary value for %s", key); + } + } else if (len == SLEN(GET_ANCESTRY_PATH_KEY) && + !strcmp(key, GET_ANCESTRY_PATH_KEY)) { + /* As of now, the only consumers of POSIX_ANCESTRY_PATH attempt + * fetching it via path-based fops. Hence, leaving it as it is + * for now. 
+ */ + if (!filler->real_path) + goto out; + char *path = NULL; + ret = posix_get_ancestry(filler->this, filler->loc->inode, NULL, &path, + POSIX_ANCESTRY_PATH, &filler->op_errno, + xattr_req); + if (ret < 0) { + goto out; + } + + ret = dict_set_dynstr_sizen(filler->xattr, GET_ANCESTRY_PATH_KEY, path); + if (ret < 0) { + GF_FREE(path); + goto out; + } + + } else if (fnmatch(marker_contri_key, key, 0) == 0) { + ret = _posix_get_marker_quota_contributions(filler, key); + } else if (len == SLEN(GF_REQUEST_LINK_COUNT_XDATA) && + strcmp(key, GF_REQUEST_LINK_COUNT_XDATA) == 0) { + ret = dict_set_sizen(filler->xattr, GF_REQUEST_LINK_COUNT_XDATA, data); + } else if (len == SLEN(GF_GET_SIZE) && strcmp(key, GF_GET_SIZE) == 0) { + if (filler->stbuf && IA_ISREG(filler->stbuf->ia_type)) { + ret = dict_set_uint64(filler->xattr, GF_GET_SIZE, + filler->stbuf->ia_size); + } + } else if (GF_POSIX_ACL_REQUEST(key)) { + if (filler->real_path) + ret = posix_pstat(filler->this, NULL, NULL, filler->real_path, + &stbuf, _gf_false); + else + ret = posix_fdstat(filler->this, filler->fd->inode, filler->fdnum, + &stbuf); + if (ret < 0) { + gf_msg(filler->this->name, GF_LOG_ERROR, errno, + P_MSG_XDATA_GETXATTR, "lstat on %s failed", + filler->real_path ?: uuid_utoa(filler->fd->inode->gfid)); + goto out; + } + + /* Avoid link follow in virt_pacl_get, donot fill acl for symlink.*/ + if (IA_ISLNK(stbuf.ia_type)) + goto out; + + /* ACL_TYPE_DEFAULT is not supported for non-directory, skip */ + if (!IA_ISDIR(stbuf.ia_type) && + !strncmp(key, GF_POSIX_ACL_DEFAULT, SLEN(GF_POSIX_ACL_DEFAULT))) + goto out; + + ret = posix_pacl_get(filler->real_path, filler->fdnum, key, &value); + if (ret || !value) { + gf_msg(filler->this->name, GF_LOG_ERROR, errno, + P_MSG_XDATA_GETXATTR, "could not get acl (%s) for %s, %d", + key, filler->real_path ?: uuid_utoa(filler->fd->inode->gfid), + ret); + goto out; + } + + ret = dict_set_dynstrn(filler->xattr, (char *)key, len, value); + if (ret < 0) { + GF_FREE(value); + 
gf_msg(filler->this->name, GF_LOG_ERROR, errno, + P_MSG_XDATA_GETXATTR, + "could not set acl (%s) for %s in dictionary", key, + filler->real_path ?: uuid_utoa(filler->fd->inode->gfid)); + goto out; + } + } else { + remaining_size = filler->list_size; + while (remaining_size > 0) { + xattr = filler->list + list_offset; + if (fnmatch(key, xattr, 0) == 0) + ret = _posix_xattr_get_set_from_backend(filler, xattr); + len = strlen(xattr); + remaining_size -= (len + 1); + list_offset += (len + 1); + } + } +out: + return 0; +} + +int +posix_fill_gfid_path(xlator_t *this, const char *path, struct iatt *iatt) +{ + int ret = 0; + ssize_t size = 0; + + if (!iatt) + return 0; + + size = sys_lgetxattr(path, GFID_XATTR_KEY, iatt->ia_gfid, 16); + /* Return value of getxattr */ + if ((size == 16) || (size == -1)) + ret = 0; + else + ret = size; + + return ret; +} + +int +posix_fill_gfid_fd(xlator_t *this, int fd, struct iatt *iatt) +{ + int ret = 0; + ssize_t size = 0; + + if (!iatt) + return 0; + + size = sys_fgetxattr(fd, GFID_XATTR_KEY, iatt->ia_gfid, 16); + /* Return value of getxattr */ + if ((size == 16) || (size == -1)) + ret = 0; + else + ret = size; + + return ret; +} + +void +posix_fill_ino_from_gfid(xlator_t *this, struct iatt *buf) +{ + /* consider least significant 8 bytes of value out of gfid */ + if (gf_uuid_is_null(buf->ia_gfid)) { + buf->ia_ino = -1; + goto out; + } + buf->ia_ino = gfid_to_ino(buf->ia_gfid); + buf->ia_flags |= IATT_INO; +out: + return; +} + +int +posix_fdstat(xlator_t *this, inode_t *inode, int fd, struct iatt *stbuf_p) +{ + int ret = 0; + struct stat fstatbuf = { + 0, + }; + struct iatt stbuf = { + 0, + }; + struct posix_private *priv = NULL; + + priv = this->private; + + ret = sys_fstat(fd, &fstatbuf); + if (ret == -1) + goto out; + + if (fstatbuf.st_nlink && !S_ISDIR(fstatbuf.st_mode)) + fstatbuf.st_nlink--; + + iatt_from_stat(&stbuf, &fstatbuf); + + if (inode && priv->ctime) { + ret = posix_get_mdata_xattr(this, NULL, fd, inode, &stbuf); + if 
(ret) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_GETMDATA_FAILED, + "posix get mdata failed on gfid: %s", + uuid_utoa(inode->gfid)); + goto out; + } + } + ret = posix_fill_gfid_fd(this, fd, &stbuf); + stbuf.ia_flags |= IATT_GFID; + + posix_fill_ino_from_gfid(this, &stbuf); + + if (stbuf_p) + *stbuf_p = stbuf; + +out: + return ret; +} + +/* The inode here is expected to update posix_mdata stored on disk. + * Don't use it as a general purpose inode and don't expect it to + * be always exists + */ +int +posix_istat(xlator_t *this, inode_t *inode, uuid_t gfid, const char *basename, + struct iatt *buf_p) +{ + char *real_path = NULL; + struct stat lstatbuf = { + 0, + }; + struct iatt stbuf = { + 0, + }; + int ret = 0; + struct posix_private *priv = NULL; + + priv = this->private; + + MAKE_HANDLE_PATH(real_path, this, gfid, basename); + if (!real_path) { + gf_msg(this->name, GF_LOG_ERROR, ESTALE, P_MSG_HANDLE_PATH_CREATE, + "Failed to create handle path for %s/%s", uuid_utoa(gfid), + basename ? basename : ""); + errno = ESTALE; + ret = -1; + goto out; + } + + ret = sys_lstat(real_path, &lstatbuf); + + if (ret != 0) { + if (ret == -1) { + if (errno != ENOENT && errno != ELOOP) + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_LSTAT_FAILED, + "lstat failed on %s", real_path); + } else { + // may be some backend filesystem issue + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_LSTAT_FAILED, + "lstat failed on %s and return value is %d " + "instead of -1. 
Please see dmesg output to " + "check whether the failure is due to backend " + "filesystem issue", + real_path, ret); + ret = -1; + } + goto out; + } + + if ((lstatbuf.st_ino == priv->handledir.st_ino) && + (lstatbuf.st_dev == priv->handledir.st_dev)) { + errno = ENOENT; + return -1; + } + + if (!S_ISDIR(lstatbuf.st_mode)) + lstatbuf.st_nlink--; + + iatt_from_stat(&stbuf, &lstatbuf); + + if (inode && priv->ctime) { + ret = posix_get_mdata_xattr(this, real_path, -1, inode, &stbuf); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_GETMDATA_FAILED, + "posix get mdata failed on %s", real_path); + goto out; + } + } + + if (basename) + posix_fill_gfid_path(this, real_path, &stbuf); + else + gf_uuid_copy(stbuf.ia_gfid, gfid); + stbuf.ia_flags |= IATT_GFID; + + posix_fill_ino_from_gfid(this, &stbuf); + + if (buf_p) + *buf_p = stbuf; +out: + return ret; +} + +int +posix_pstat(xlator_t *this, inode_t *inode, uuid_t gfid, const char *path, + struct iatt *buf_p, gf_boolean_t inode_locked) +{ + struct stat lstatbuf = { + 0, + }; + struct iatt stbuf = { + 0, + }; + int ret = 0; + int op_errno = 0; + struct posix_private *priv = NULL; + + priv = this->private; + + if (gfid && !gf_uuid_is_null(gfid)) + gf_uuid_copy(stbuf.ia_gfid, gfid); + else + posix_fill_gfid_path(this, path, &stbuf); + stbuf.ia_flags |= IATT_GFID; + + ret = sys_lstat(path, &lstatbuf); + if (ret == -1) { + if (errno != ENOENT) { + op_errno = errno; + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_LSTAT_FAILED, + "lstat failed on %s", path); + errno = op_errno; /*gf_msg could have changed errno*/ + } else { + op_errno = errno; + gf_msg_debug(this->name, 0, "lstat failed on %s (%s)", path, + strerror(errno)); + errno = op_errno; /*gf_msg could have changed errno*/ + } + goto out; + } + + if ((lstatbuf.st_ino == priv->handledir.st_ino) && + (lstatbuf.st_dev == priv->handledir.st_dev)) { + errno = ENOENT; + return -1; + } + + if (!S_ISDIR(lstatbuf.st_mode)) + lstatbuf.st_nlink--; + + 
iatt_from_stat(&stbuf, &lstatbuf); + + if (priv->ctime) { + if (inode) { + if (!inode_locked) { + ret = posix_get_mdata_xattr(this, path, -1, inode, &stbuf); + } else { + ret = __posix_get_mdata_xattr(this, path, -1, inode, &stbuf); + } + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_GETMDATA_FAILED, + "posix get mdata failed on gfid: %s", + uuid_utoa(inode->gfid)); + goto out; + } + } else { + ret = __posix_get_mdata_xattr(this, path, -1, NULL, &stbuf); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_GETMDATA_FAILED, + "posix get mdata failed on path: %s", path); + goto out; + } + } + } + + posix_fill_ino_from_gfid(this, &stbuf); + + if (buf_p) + *buf_p = stbuf; +out: + return ret; +} + +static void +_get_list_xattr(posix_xattr_filler_t *filler) +{ + ssize_t size = 0; + + if ((!filler) || ((!filler->real_path) && (filler->fdnum < 0))) + goto out; + + if (filler->real_path) + size = sys_llistxattr(filler->real_path, NULL, 0); + else + size = sys_flistxattr(filler->fdnum, NULL, 0); + + if (size <= 0) + goto out; + + filler->list = GF_CALLOC(1, size, gf_posix_mt_char); + if (!filler->list) + goto out; + + if (filler->real_path) + size = sys_llistxattr(filler->real_path, filler->list, size); + else + size = sys_flistxattr(filler->fdnum, filler->list, size); + + filler->list_size = size; +out: + return; +} + +static void +_handle_list_xattr(posix_xattr_filler_t *filler) +{ + int32_t list_offset = 0; + ssize_t remaining_size = 0; + char *key = NULL; + int len; + + remaining_size = filler->list_size; + while (remaining_size > 0) { + key = filler->list + list_offset; + len = strlen(key); + + if (gf_get_index_by_elem(list_xattr_ignore_xattrs, key) >= 0) + goto next; + + if (posix_special_xattr(marker_xattrs, key)) + goto next; + + if (posix_handle_georep_xattrs(NULL, key, NULL, _gf_false)) + goto next; + + if (posix_is_gfid2path_xattr(key)) + goto next; + + if (dict_getn(filler->xattr, key, len)) + goto next; + + 
(void)_posix_xattr_get_set_from_backend(filler, key); + next: + remaining_size -= (len + 1); + list_offset += (len + 1); + + } /* while (remaining_size > 0) */ + return; +} + +dict_t * +posix_xattr_fill(xlator_t *this, const char *real_path, loc_t *loc, fd_t *fd, + int fdnum, dict_t *xattr_req, struct iatt *buf) +{ + dict_t *xattr = NULL; + posix_xattr_filler_t filler = { + 0, + }; + gf_boolean_t list = _gf_false; + + if (dict_get_sizen(xattr_req, "list-xattr")) { + dict_del_sizen(xattr_req, "list-xattr"); + list = _gf_true; + } + + xattr = dict_new(); + if (!xattr) { + goto out; + } + + filler.this = this; + filler.real_path = real_path; + filler.xattr = xattr; + filler.stbuf = buf; + filler.loc = loc; + filler.fd = fd; + filler.fdnum = fdnum; + + _get_list_xattr(&filler); + dict_foreach(xattr_req, _posix_xattr_get_set, &filler); + if (list) + _handle_list_xattr(&filler); + + GF_FREE(filler.list); +out: + return xattr; +} + +void +posix_gfid_unset(xlator_t *this, dict_t *xdata) +{ + uuid_t uuid = { + 0, + }; + int ret = 0; + + if (xdata == NULL) + goto out; + + ret = dict_get_gfuuid(xdata, "gfid-req", &uuid); + if (ret) { + goto out; + } + + posix_handle_unset(this, uuid, NULL); +out: + return; +} + +int +posix_gfid_set(xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req, + pid_t pid, int *op_errno) +{ + uuid_t uuid_req; + uuid_t uuid_curr; + int ret = 0; + ssize_t size = 0; + struct stat stat = { + 0, + }; + + *op_errno = 0; + + if (!xattr_req) { + if (pid != GF_SERVER_PID_TRASH) { + gf_msg(this->name, GF_LOG_ERROR, EINVAL, P_MSG_INVALID_ARGUMENT, + "xattr_req is null"); + *op_errno = EINVAL; + ret = -1; + } + goto out; + } + + if (sys_lstat(path, &stat) != 0) { + ret = -1; + *op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "lstat on %s failed", path); + goto out; + } + + size = sys_lgetxattr(path, GFID_XATTR_KEY, uuid_curr, 16); + if (size == 16) { + ret = 0; + goto verify_handle; + } + + ret = 
dict_get_gfuuid(xattr_req, "gfid-req", &uuid_req); + if (ret) { + gf_msg_debug(this->name, 0, "failed to get the gfid from dict for %s", + loc->path); + *op_errno = -ret; + ret = -1; + goto out; + } + if (gf_uuid_is_null(uuid_req)) { + gf_msg(this->name, GF_LOG_ERROR, EINVAL, P_MSG_NULL_GFID, + "gfid is null for %s", loc ? loc->path : ""); + ret = -1; + *op_errno = EINVAL; + goto out; + } + + ret = sys_lsetxattr(path, GFID_XATTR_KEY, uuid_req, 16, XATTR_CREATE); + if (ret == -1) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_GFID_FAILED, + "setting GFID on %s failed ", path); + goto out; + } + gf_uuid_copy(uuid_curr, uuid_req); + +verify_handle: + if (!S_ISDIR(stat.st_mode)) + ret = posix_handle_hard(this, path, uuid_curr, &stat); + else + ret = posix_handle_soft(this, path, loc, uuid_curr, &stat); + +out: + if (ret && !(*op_errno)) + *op_errno = errno; + return ret; +} + +#ifdef HAVE_SYS_ACL_H +static int +posix_pacl_set(const char *path, int fdnum, const char *key, const char *acl_s) +{ + int ret = -1; + acl_t acl = NULL; + acl_type_t type = 0; + + if ((!path) && (fdnum < 0)) { + errno = -EINVAL; + return -1; + } + + type = gf_posix_acl_get_type(key); + if (!type) + return -1; + + acl = acl_from_text(acl_s); + if (!acl) + return -1; + + if (path) + ret = acl_set_file(path, type, acl); + else if (type == ACL_TYPE_ACCESS) + ret = acl_set_fd(fdnum, acl); + else { + errno = -EINVAL; + return -1; + } + + if (ret) + /* posix_handle_pair expects ret to be the errno */ + ret = -errno; + + acl_free(acl); + + return ret; +} + +int +posix_pacl_get(const char *path, int fdnum, const char *key, char **acl_s) +{ + int ret = -1; + acl_t acl = NULL; + acl_type_t type = 0; + char *acl_tmp = NULL; + + if ((!path) && (fdnum < 0)) { + errno = -EINVAL; + return -1; + } + + type = gf_posix_acl_get_type(key); + if (!type) + return -1; + + if (path) + acl = acl_get_file(path, type); + else if (type == ACL_TYPE_ACCESS) + acl = acl_get_fd(fdnum); + else { + errno = -EINVAL; + return 
-1; + } + + if (!acl) + return -1; + +#ifdef HAVE_ACL_LIBACL_H + acl_tmp = acl_to_any_text(acl, NULL, ',', + TEXT_ABBREVIATE | TEXT_NUMERIC_IDS); +#else /* FreeBSD and the like */ + acl_tmp = acl_to_text_np(acl, NULL, ACL_TEXT_NUMERIC_IDS); +#endif + if (!acl_tmp) + goto free_acl; + + *acl_s = gf_strdup(acl_tmp); + if (*acl_s) + ret = 0; + + acl_free(acl_tmp); +free_acl: + acl_free(acl); + + return ret; +} +#else /* !HAVE_SYS_ACL_H (NetBSD) */ +int +posix_pacl_set(const char *path, int fdnum, const char *key, const char *acl_s) +{ + errno = ENOTSUP; + return -1; +} + +int +posix_pacl_get(const char *path, int fdnum, const char *key, char **acl_s) +{ + errno = ENOTSUP; + return -1; +} +#endif + +#ifdef GF_DARWIN_HOST_OS +static void +posix_dump_buffer(xlator_t *this, const char *real_path, const char *key, + data_t *value, int flags) +{ + char buffer[3 * value->len + 1]; + int index = 0; + buffer[0] = 0; + gf_loglevel_t log_level = gf_log_get_loglevel(); + if (log_level == GF_LOG_TRACE) { + char *data = (char *)value->data; + for (index = 0; index < value->len; index++) + sprintf(buffer + 3 * index, " %02x", data[index]); + } + gf_msg_debug(this->name, 0, "Dump %s: key:%s flags: %u length:%u data:%s ", + real_path, key, flags, value->len, + (log_level == GF_LOG_TRACE ? 
buffer : "<skipped in DEBUG>")); +} +#endif + +int +posix_handle_pair(xlator_t *this, loc_t *loc, const char *real_path, char *key, + data_t *value, int flags, struct iatt *stbuf) +{ + int sys_ret = -1; + int ret = 0; + int op_errno = 0; + struct mdata_iatt mdata_iatt = { + 0, + }; +#ifdef GF_DARWIN_HOST_OS + const int error_code = EINVAL; +#else + const int error_code = EEXIST; +#endif + + if (XATTR_IS_PATHINFO(key)) { + ret = -EACCES; + goto out; + } else if (posix_is_gfid2path_xattr(key)) { + ret = -ENOTSUP; + goto out; + } else if (GF_POSIX_ACL_REQUEST(key)) { + if (stbuf && IS_DHT_LINKFILE_MODE(stbuf)) + goto out; + ret = posix_pacl_set(real_path, -1, key, value->data); + } else if (!strncmp(key, POSIX_ACL_ACCESS_XATTR, + SLEN(POSIX_ACL_ACCESS_XATTR)) && + stbuf && IS_DHT_LINKFILE_MODE(stbuf)) { + goto out; + } else if (!strncmp(key, GF_INTERNAL_CTX_KEY, SLEN(GF_INTERNAL_CTX_KEY))) { + /* ignore this key value pair */ + ret = 0; + goto out; + } else if (!strncmp(key, GF_XATTR_MDATA_KEY, strlen(key))) { + /* This is either by rebalance or self heal. Create the xattr if it's + * not present. Compare and update the larger value if the xattr is + * already present. 
+ */ + if (loc == NULL) { + ret = -EINVAL; + goto out; + } + posix_mdata_iatt_from_disk(&mdata_iatt, + (posix_mdata_disk_t *)value->data); + ret = posix_set_mdata_xattr_legacy_files(this, loc->inode, real_path, + &mdata_iatt, &op_errno); + if (ret != 0) { + ret = -op_errno; + } + goto out; + } else { + sys_ret = sys_lsetxattr(real_path, key, value->data, value->len, flags); +#ifdef GF_DARWIN_HOST_OS + posix_dump_buffer(this, real_path, key, value, flags); +#endif + if (sys_ret < 0) { + ret = -errno; + if (errno == ENOENT) { + if (!posix_special_xattr(marker_xattrs, key)) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_XATTR_FAILED, + "setxattr on %s failed", real_path); + } + } else { + if (errno == error_code) { + gf_msg_debug(this->name, 0, + "%s: key:%s" + "flags: %u length:%d", + real_path, key, flags, value->len); + } else { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_XATTR_FAILED, + "%s: key:%s" + "flags: %u length:%d", + real_path, key, flags, value->len); + } + } + + goto out; + } + } +out: + return ret; +} + +int +posix_fhandle_pair(call_frame_t *frame, xlator_t *this, int fd, char *key, + data_t *value, int flags, struct iatt *stbuf, fd_t *_fd) +{ + int sys_ret = -1; + int ret = 0; + + if (XATTR_IS_PATHINFO(key)) { + ret = -EACCES; + goto out; + } else if (posix_is_gfid2path_xattr(key)) { + ret = -ENOTSUP; + goto out; + } else if (!strncmp(key, POSIX_ACL_ACCESS_XATTR, + SLEN(POSIX_ACL_ACCESS_XATTR)) && + stbuf && IS_DHT_LINKFILE_MODE(stbuf)) { + goto out; + } + + sys_ret = sys_fsetxattr(fd, key, value->data, value->len, flags); + + if (sys_ret < 0) { + ret = -errno; + if (errno == ENOENT) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_XATTR_FAILED, + "fsetxattr on fd=%d" + " failed", + fd); + } else { +#ifdef GF_DARWIN_HOST_OS + if (errno == EINVAL) { + gf_msg_debug(this->name, 0, + "fd=%d: key:%s " + "error:%s", + fd, key, strerror(errno)); + } else { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_XATTR_FAILED, + "fd=%d: key:%s", fd, key); + } 
+ +#else /* ! DARWIN */ + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_XATTR_FAILED, + "fd=%d: key:%s", fd, key); +#endif /* DARWIN */ + } + + goto out; + } else if (_fd) { + posix_set_ctime(frame, this, NULL, fd, _fd->inode, NULL); + } + +out: + return ret; +} + +static void +del_stale_dir_handle(xlator_t *this, uuid_t gfid) +{ + char newpath[PATH_MAX] = { + 0, + }; + uuid_t gfid_curr = { + 0, + }; + ssize_t size = -1; + gf_boolean_t stale = _gf_false; + char *hpath = NULL; + struct stat stbuf = { + 0, + }; + struct iatt iabuf = { + 0, + }; + + MAKE_HANDLE_GFID_PATH(hpath, this, gfid); + + /* check that it is valid directory handle */ + size = sys_lstat(hpath, &stbuf); + if (size < 0) { + gf_msg_debug(this->name, 0, + "%s: Handle stat failed: " + "%s", + hpath, strerror(errno)); + goto out; + } + + iatt_from_stat(&iabuf, &stbuf); + if (iabuf.ia_nlink != 1 || !IA_ISLNK(iabuf.ia_type)) { + gf_msg_debug(this->name, 0, "%s: Handle nlink %d %d", hpath, + iabuf.ia_nlink, IA_ISLNK(iabuf.ia_type)); + goto out; + } + + size = posix_handle_path(this, gfid, NULL, newpath, sizeof(newpath)); + if (size <= 0) { + if (errno == ENOENT) { + gf_msg_debug(this->name, 0, "%s: %s", newpath, strerror(ENOENT)); + stale = _gf_true; + } + goto out; + } + + size = sys_lgetxattr(newpath, GFID_XATTR_KEY, gfid_curr, 16); + if (size < 0 && errno == ENOENT) { + gf_msg_debug(this->name, 0, "%s: %s", newpath, strerror(ENOENT)); + stale = _gf_true; + } else if (size == 16 && gf_uuid_compare(gfid, gfid_curr)) { + gf_msg_debug(this->name, 0, + "%s: mismatching gfid: %s, " + "at %s", + hpath, uuid_utoa(gfid_curr), newpath); + stale = _gf_true; + } + +out: + if (stale) { + size = sys_unlink(hpath); + if (size < 0 && errno != ENOENT) + gf_msg(this->name, GF_LOG_ERROR, errno, + P_MSG_STALE_HANDLE_REMOVE_FAILED, + "%s: Failed" + "to remove handle to %s", + hpath, newpath); + } else if (size == 16) { + gf_msg_debug(this->name, 0, + "%s: Fresh handle for " + "%s with gfid %s", + hpath, newpath, 
uuid_utoa(gfid_curr)); + } + return; +} + +static int +janitor_walker(const char *fpath, const struct stat *sb, int typeflag, + struct FTW *ftwbuf) +{ + struct iatt stbuf = { + 0, + }; + xlator_t *this = NULL; + + this = THIS; + /* posix_mdata_t is not filled, no time or size attributes + * are being used, so fine. + */ + posix_pstat(this, NULL, NULL, fpath, &stbuf, _gf_false); + switch (sb->st_mode & S_IFMT) { + case S_IFREG: + case S_IFBLK: + case S_IFLNK: + case S_IFCHR: + case S_IFIFO: + case S_IFSOCK: + gf_msg_trace(THIS->name, 0, "unlinking %s", fpath); + sys_unlink(fpath); + if (stbuf.ia_nlink == 1) + posix_handle_unset(this, stbuf.ia_gfid, NULL); + break; + + case S_IFDIR: + if (ftwbuf->level) { /* don't remove top level dir */ + gf_msg_debug(THIS->name, 0, "removing directory %s", fpath); + + sys_rmdir(fpath); + del_stale_dir_handle(this, stbuf.ia_gfid); + } + break; + } + + return 0; /* 0 = FTW_CONTINUE */ +} + +void +__posix_janitor_timer_start(xlator_t *this); + +static int +posix_janitor_task_done(int ret, call_frame_t *frame, void *data) +{ + xlator_t *this = NULL; + struct posix_private *priv = NULL; + + this = data; + priv = this->private; + + pthread_mutex_lock(&priv->janitor_mutex); + { + if (priv->janitor_task_stop) { + priv->janitor_task_stop = _gf_false; + pthread_cond_signal(&priv->janitor_cond); + pthread_mutex_unlock(&priv->janitor_mutex); + goto out; + } + } + pthread_mutex_unlock(&priv->janitor_mutex); + + LOCK(&priv->lock); + { + __posix_janitor_timer_start(this); + } + UNLOCK(&priv->lock); + +out: + return 0; +} + +static int +posix_janitor_task(void *data) +{ + xlator_t *this = NULL; + struct posix_private *priv = NULL; + xlator_t *old_this = NULL; + + time_t now; + + this = data; + priv = this->private; + /* We need THIS to be set for janitor_walker */ + old_this = THIS; + THIS = this; + + if (!priv) + goto out; + + now = gf_time(); + if ((now - priv->last_landfill_check) > priv->janitor_sleep_duration) { + if 
(priv->disable_landfill_purge) { + gf_msg_debug(this->name, 0, + "Janitor would have " + "cleaned out %s, but purge" + "is disabled.", + priv->trash_path); + } else { + gf_msg_trace(this->name, 0, "janitor cleaning out %s", + priv->trash_path); + + nftw(priv->trash_path, janitor_walker, 32, FTW_DEPTH | FTW_PHYS); + } + priv->last_landfill_check = now; + } + + THIS = old_this; + +out: + return 0; +} + +static void +posix_janitor_task_initator(struct gf_tw_timer_list *timer, void *data, + unsigned long calltime) +{ + xlator_t *this = NULL; + int ret = 0; + + this = data; + + ret = synctask_new(this->ctx->env, posix_janitor_task, + posix_janitor_task_done, NULL, this); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_THREAD_FAILED, + "spawning janitor " + "thread failed"); + } + + return; +} + +void +__posix_janitor_timer_start(xlator_t *this) +{ + struct posix_private *priv = NULL; + struct gf_tw_timer_list *timer = NULL; + + priv = this->private; + timer = priv->janitor; + + INIT_LIST_HEAD(&timer->entry); + timer->expires = priv->janitor_sleep_duration; + timer->function = posix_janitor_task_initator; + timer->data = this; + gf_tw_add_timer(glusterfs_ctx_tw_get(this->ctx), timer); + + return; +} + +void +posix_janitor_timer_start(xlator_t *this) +{ + struct posix_private *priv = NULL; + struct gf_tw_timer_list *timer = NULL; + + priv = this->private; + + LOCK(&priv->lock); + { + if (!priv->janitor) { + timer = GF_CALLOC(1, sizeof(struct gf_tw_timer_list), + gf_common_mt_tw_timer_list); + if (!timer) { + goto unlock; + } + priv->janitor = timer; + __posix_janitor_timer_start(this); + } + } +unlock: + UNLOCK(&priv->lock); + + return; +} + +static struct posix_fd * +janitor_get_next_fd(glusterfs_ctx_t *ctx) +{ + struct posix_fd *pfd = NULL; + + while (list_empty(&ctx->janitor_fds)) { + if (ctx->pxl_count == 0) { + return NULL; + } + + pthread_cond_wait(&ctx->fd_cond, &ctx->fd_lock); + } + + pfd = list_first_entry(&ctx->janitor_fds, struct posix_fd, 
list); + list_del_init(&pfd->list); + + return pfd; +} + +static void +posix_close_pfd(xlator_t *xl, struct posix_fd *pfd) +{ + THIS = xl; + + if (pfd->dir == NULL) { + gf_msg_trace(xl->name, 0, "janitor: closing file fd=%d", pfd->fd); + sys_close(pfd->fd); + } else { + gf_msg_debug(xl->name, 0, "janitor: closing dir fd=%p", pfd->dir); + sys_closedir(pfd->dir); + } + + GF_FREE(pfd); +} + +static void * +posix_ctx_janitor_thread_proc(void *data) +{ + xlator_t *xl; + struct posix_fd *pfd; + glusterfs_ctx_t *ctx = NULL; + struct posix_private *priv_fd; + + ctx = data; + + pthread_mutex_lock(&ctx->fd_lock); + + while ((pfd = janitor_get_next_fd(ctx)) != NULL) { + pthread_mutex_unlock(&ctx->fd_lock); + + xl = pfd->xl; + posix_close_pfd(xl, pfd); + + pthread_mutex_lock(&ctx->fd_lock); + + priv_fd = xl->private; + priv_fd->rel_fdcount--; + if (!priv_fd->rel_fdcount) + pthread_cond_signal(&priv_fd->fd_cond); + } + + pthread_mutex_unlock(&ctx->fd_lock); + + return NULL; +} + +int +posix_spawn_ctx_janitor_thread(xlator_t *this) +{ + int ret = 0; + glusterfs_ctx_t *ctx = NULL; + + ctx = this->ctx; + + pthread_mutex_lock(&ctx->fd_lock); + { + if (ctx->pxl_count++ == 0) { + ret = gf_thread_create(&ctx->janitor, NULL, + posix_ctx_janitor_thread_proc, ctx, + "posixctxjan"); + + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_THREAD_FAILED, + "spawning janitor thread failed"); + ctx->pxl_count--; + } + } + } + pthread_mutex_unlock(&ctx->fd_lock); + + return ret; +} + +static int +is_fresh_file(struct timespec *ts) +{ + struct timespec now; + int64_t elapsed; + + timespec_now_realtime(&now); + elapsed = (int64_t)gf_tsdiff(ts, &now); + + if (elapsed < 0) { + /* The file has been modified in the future !!! + * Is it fresh ? previous implementation considered this as a + * non-fresh file, so maintaining the same behavior. */ + return 0; + } + + /* If the file is newer than a second, we consider it fresh. 
*/ + return elapsed < 1000000; +} + +int +posix_gfid_heal(xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req) +{ + /* The purpose of this function is to prevent a race + where an inode creation FOP (like mkdir/mknod/create etc) + races with lookup in the following way: + + {create thread} | {lookup thread} + | + t0 + mkdir ("name") | + t1 + | posix_gfid_set ("name", 2); + t2 + posix_gfid_set ("name", 1); | + t3 + lstat ("name"); | lstat ("name"); + + In the above case mkdir FOP would have resulted with GFID 2 while + it should have been GFID 1. It matters in the case where GFID would + have gotten set to 1 on other subvolumes of replciate/distribute + + The "solution" here is that, if we detect lookup is attempting to + set a GFID on a file which is created very recently, but does not + yet have a GFID (i.e, between t1 and t2), then "fake" it as though + posix_gfid_heal was called at t0 instead. + */ + + uuid_t uuid_curr; + int ret = 0; + struct stat stat = { + 0, + }; + struct iatt stbuf = { + 0, + }; + struct posix_private *priv = NULL; + + priv = this->private; + + if (!xattr_req) + return 0; + + if (loc->inode && priv->ctime) { + if (sys_lstat(path, &stat) != 0) { + return -errno; + } + /* stbuf is only to compare ctime, don't use it to access + * other fields as they are zero. */ + ret = posix_get_mdata_xattr(this, path, -1, loc->inode, &stbuf); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_GETMDATA_FAILED, + "posix get mdata failed on gfid: %s", + uuid_utoa(loc->inode->gfid)); + return -ENOENT; + } + ret = sys_lgetxattr(path, GFID_XATTR_KEY, uuid_curr, 16); + if (ret != 16) { + /* TODO: This is a very hacky way of doing this, and very prone to + * errors and unexpected behavior. This should be changed. 
*/ + struct timespec ts = {.tv_sec = stbuf.ia_ctime, + .tv_nsec = stbuf.ia_ctime_nsec}; + if (is_fresh_file(&ts)) { + gf_msg(this->name, GF_LOG_ERROR, ENOENT, P_MSG_FRESHFILE, + "Fresh file: %s", path); + return -ENOENT; + } + } + } else { + if (sys_lstat(path, &stat) != 0) { + return -errno; + } + ret = sys_lgetxattr(path, GFID_XATTR_KEY, uuid_curr, 16); + if (ret != 16) { + /* TODO: This is a very hacky way of doing this, and very prone to + * errors and unexpected behavior. This should be changed. */ + if (is_fresh_file(&stat.st_ctim)) { + gf_msg(this->name, GF_LOG_ERROR, ENOENT, P_MSG_FRESHFILE, + "Fresh file: %s", path); + return -ENOENT; + } + } + } + + (void)posix_gfid_set(this, path, loc, xattr_req, GF_CLIENT_PID_MAX, &ret); + return 0; +} + +int +posix_acl_xattr_set(xlator_t *this, const char *path, dict_t *xattr_req) +{ + int ret = 0; + data_t *data = NULL; + struct stat stat = { + 0, + }; + + if (!xattr_req) + goto out; + + if (sys_lstat(path, &stat) != 0) + goto out; + + data = dict_get(xattr_req, POSIX_ACL_ACCESS_XATTR); + if (data) { + ret = sys_lsetxattr(path, POSIX_ACL_ACCESS_XATTR, data->data, data->len, + 0); +#ifdef __FreeBSD__ + if (ret != -1) { + ret = 0; + } +#endif /* __FreeBSD__ */ + if (ret != 0) + goto out; + } + + data = dict_get(xattr_req, POSIX_ACL_DEFAULT_XATTR); + if (data) { + ret = sys_lsetxattr(path, POSIX_ACL_DEFAULT_XATTR, data->data, + data->len, 0); +#ifdef __FreeBSD__ + if (ret != -1) { + ret = 0; + } +#endif /* __FreeBSD__ */ + if (ret != 0) + goto out; + } + +out: + return ret; +} + +static int +_handle_entry_create_keyvalue_pair(dict_t *d, char *k, data_t *v, void *tmp) +{ + int ret = -1; + posix_xattr_filler_t *filler = NULL; + + filler = tmp; + + if (!strcmp(GFID_XATTR_KEY, k) || !strcmp("gfid-req", k) || + !strcmp(POSIX_ACL_DEFAULT_XATTR, k) || + !strcmp(POSIX_ACL_ACCESS_XATTR, k) || posix_xattr_ignorable(k)) { + return 0; + } + + ret = posix_handle_pair(filler->this, filler->loc, filler->real_path, k, v, + XATTR_CREATE, 
filler->stbuf); + if (ret < 0) { + errno = -ret; + return -1; + } + return 0; +} + +int +posix_entry_create_xattr_set(xlator_t *this, loc_t *loc, const char *path, + dict_t *dict) +{ + int ret = -1; + + posix_xattr_filler_t filler = { + 0, + }; + + if (!dict) + goto out; + + filler.this = this; + filler.real_path = path; + filler.stbuf = NULL; + filler.loc = loc; + + ret = dict_foreach(dict, _handle_entry_create_keyvalue_pair, &filler); + +out: + return ret; +} + +static int +__posix_fd_ctx_get(fd_t *fd, xlator_t *this, struct posix_fd **pfd_p, + int *op_errno_p) +{ + uint64_t tmp_pfd = 0; + struct posix_fd *pfd = NULL; + int ret = -1; + char *real_path = NULL; + char *unlink_path = NULL; + int _fd = -1; + int op_errno = 0; + DIR *dir = NULL; + + struct posix_private *priv = NULL; + + priv = this->private; + + ret = __fd_ctx_get(fd, this, &tmp_pfd); + if (ret == 0) { + pfd = (void *)(long)tmp_pfd; + goto out; + } + if (!fd_is_anonymous(fd)) { + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_READ_FAILED, + "Failed to get fd context for a non-anonymous fd, " + "gfid: %s", + uuid_utoa(fd->inode->gfid)); + op_errno = EINVAL; + goto out; + } + + MAKE_HANDLE_PATH(real_path, this, fd->inode->gfid, NULL); + if (!real_path) { + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_READ_FAILED, + "Failed to create handle path (%s)", uuid_utoa(fd->inode->gfid)); + ret = -1; + op_errno = EINVAL; + goto out; + } + pfd = GF_CALLOC(1, sizeof(*pfd), gf_posix_mt_posix_fd); + if (!pfd) { + op_errno = ENOMEM; + goto out; + } + pfd->fd = -1; + + if (fd->inode->ia_type == IA_IFDIR) { + dir = sys_opendir(real_path); + if (!dir) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_READ_FAILED, + "Failed to get anonymous fd for " + "real_path: %s.", + real_path); + GF_FREE(pfd); + pfd = NULL; + goto out; + } + _fd = dirfd(dir); + } + + /* Using fd->flags in case we choose to have anonymous + * fds with different flags some day. 
As of today it + * would be GF_ANON_FD_FLAGS and nothing else. + */ + if (fd->inode->ia_type == IA_IFREG) { + _fd = open(real_path, fd->flags); + if ((_fd == -1) && (errno == ENOENT)) { + POSIX_GET_FILE_UNLINK_PATH(priv->base_path, fd->inode->gfid, + unlink_path); + _fd = open(unlink_path, fd->flags); + } + if (_fd == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_READ_FAILED, + "Failed to get anonymous fd for " + "real_path: %s.", + real_path); + GF_FREE(pfd); + pfd = NULL; + goto out; + } + } + + pfd->fd = _fd; + pfd->dir = dir; + pfd->flags = fd->flags; + + ret = __fd_ctx_set(fd, this, (uint64_t)(long)pfd); + if (ret != 0) { + op_errno = ENOMEM; + if (_fd != -1) + sys_close(_fd); + if (dir) + sys_closedir(dir); + GF_FREE(pfd); + pfd = NULL; + goto out; + } + + ret = 0; +out: + if (ret < 0 && op_errno_p) + *op_errno_p = op_errno; + + if (pfd_p) + *pfd_p = pfd; + return ret; +} + +int +posix_fd_ctx_get(fd_t *fd, xlator_t *this, struct posix_fd **pfd, int *op_errno) +{ + int ret; + + LOCK(&fd->inode->lock); + { + ret = __posix_fd_ctx_get(fd, this, pfd, op_errno); + } + UNLOCK(&fd->inode->lock); + + return ret; +} + +static int +posix_fs_health_check(xlator_t *this, char *file_path) +{ + struct posix_private *priv = NULL; + int ret = -1; + char timestamp[GF_TIMESTR_SIZE] = { + 0, + }; + int fd = -1; + int timelen = -1; + time_t time_sec = { + 0, + }; + char buff[256] = {0}; + char *op = NULL; + int op_errno = 0; + int cnt; + int timeout = 0; + struct aiocb aiocb; + + priv = this->private; + + timeout = priv->health_check_timeout; + + fd = open(file_path, O_CREAT | O_WRONLY | O_TRUNC, 0644); + if (fd == -1) { + op_errno = errno; + op = "open_for_write"; + goto out; + } + + time_sec = gf_time(); + gf_time_fmt(timestamp, sizeof timestamp, time_sec, gf_timefmt_FT); + timelen = strlen(timestamp); + + memset(&aiocb, 0, sizeof(struct aiocb)); + aiocb.aio_fildes = fd; + aiocb.aio_buf = timestamp; + aiocb.aio_nbytes = timelen; + 
aiocb.aio_sigevent.sigev_notify = SIGEV_NONE; + if (aio_write(&aiocb) == -1) { + op_errno = errno; + op = "aio_write"; + goto out; + } + + cnt = 0; + /* Wait until write completion */ + while ((aio_error(&aiocb) == EINPROGRESS) && (++cnt <= timeout)) + sleep(1); + + ret = aio_error(&aiocb); + if (ret != 0) { + op_errno = errno; + op = "aio_write_error"; + goto out; + } + + ret = aio_return(&aiocb); + if (ret != timelen) { + op_errno = errno; + op = "aio_write_buf"; + ret = -1; + goto out; + } + + sys_close(fd); + + fd = open(file_path, O_RDONLY); + if (fd == -1) { + op_errno = errno; + op = "open_for_read"; + goto out; + } + + memset(&aiocb, 0, sizeof(struct aiocb)); + aiocb.aio_fildes = fd; + aiocb.aio_buf = buff; + aiocb.aio_nbytes = sizeof(buff); + if (aio_read(&aiocb) == -1) { + op_errno = errno; + op = "aio_read"; + goto out; + } + cnt = 0; + /* Wait until read completion */ + while ((aio_error(&aiocb) == EINPROGRESS) && (++cnt <= timeout)) + sleep(1); + + ret = aio_error(&aiocb); + if (ret != 0) { + op_errno = errno; + op = "aio_read_error"; + goto out; + } + + ret = aio_return(&aiocb); + if (ret != timelen) { + op_errno = errno; + op = "aio_read_buf"; + ret = -1; + goto out; + } + + if (memcmp(timestamp, buff, ret)) { + op_errno = EUCLEAN; + op = "aio_read_cmp_buf"; + ret = -1; + goto out; + } + ret = 0; +out: + if (fd != -1) { + sys_close(fd); + } + + if (ret && file_path[0]) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_HEALTHCHECK_FAILED, + "%s() on %s returned ret is %d error is %s", op, file_path, ret, + ret != -1 ? 
strerror(ret) : strerror(op_errno)); + + if ((op_errno == EAGAIN) || (ret == EAGAIN)) { + ret = 0; + } else { + gf_event(EVENT_POSIX_HEALTH_CHECK_FAILED, + "op=%s;path=%s;error=%s;brick=%s:%s timeout is %d", op, + file_path, strerror(op_errno), priv->hostname, + priv->base_path, timeout); + } + } + return ret; +} + +static void * +posix_health_check_thread_proc(void *data) +{ + xlator_t *this = data; + struct posix_private *priv = this->private; + uint32_t interval = priv->health_check_interval; + int ret = -1; + xlator_t *top = NULL; + xlator_t *victim = NULL; + xlator_list_t **trav_p = NULL; + int count = 0; + gf_boolean_t victim_found = _gf_false; + glusterfs_ctx_t *ctx = THIS->ctx; + char file_path[PATH_MAX]; + + /* prevent races when the interval is updated */ + if (interval == 0) + goto out; + + snprintf(file_path, sizeof(file_path) - 1, "%s/%s/health_check", + priv->base_path, GF_HIDDEN_PATH); + + gf_msg_debug(this->name, 0, + "health-check thread started, " + "on path %s, " + "interval = %d seconds", + file_path, interval); + while (1) { + /* aborting sleep() is a request to exit this thread, sleep() + * will normally not return when cancelled */ + ret = sleep(interval); + if (ret > 0) + break; + /* prevent thread errors while doing the health-check(s) */ + pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL); + + /* Do the health-check.*/ + ret = posix_fs_health_check(this, file_path); + if (ret < 0 && priv->health_check_active) + goto abort; + if (!priv->health_check_active) + goto out; + pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); + } + +out: + gf_msg_debug(this->name, 0, "health-check thread exiting"); + + LOCK(&priv->lock); + { + priv->health_check_active = _gf_false; + } + UNLOCK(&priv->lock); + + return NULL; + +abort: + LOCK(&priv->lock); + { + priv->health_check_active = _gf_false; + } + UNLOCK(&priv->lock); + + /* health-check failed */ + gf_msg(this->name, GF_LOG_EMERG, 0, P_MSG_HEALTHCHECK_FAILED, + "health-check failed, going down"); + 
+ xlator_notify(this->parents->xlator, GF_EVENT_CHILD_DOWN, this); + + /* Below code is use to ensure if brick multiplexing is enabled if + count is more than 1 it means brick mux has enabled + */ + if (this->ctx->active) { + top = this->ctx->active->first; + LOCK(&ctx->volfile_lock); + for (trav_p = &top->children; *trav_p; trav_p = &(*trav_p)->next) { + count++; + } + UNLOCK(&ctx->volfile_lock); + } + + if (count == 1) { + gf_msg(this->name, GF_LOG_EMERG, 0, P_MSG_HEALTHCHECK_FAILED, + "still alive! -> SIGTERM"); + ret = sleep(30); + + /* Need to kill the process only while brick mux has not enabled + */ + if (ret == 0) + kill(getpid(), SIGTERM); + + ret = sleep(30); + gf_msg(this->name, GF_LOG_EMERG, 0, P_MSG_HEALTHCHECK_FAILED, + "still alive! -> SIGKILL"); + if (ret == 0) + kill(getpid(), SIGKILL); + + } else if (top) { + LOCK(&ctx->volfile_lock); + for (trav_p = &top->children; *trav_p; trav_p = &(*trav_p)->next) { + victim = (*trav_p)->xlator; + if (!victim->call_cleanup && + strcmp(victim->name, priv->base_path) == 0) { + victim_found = _gf_true; + break; + } + } + UNLOCK(&ctx->volfile_lock); + if (victim_found && !victim->cleanup_starting) { + gf_log(THIS->name, GF_LOG_INFO, + "detaching not-only " + " child %s", + priv->base_path); + victim->cleanup_starting = 1; + top->notify(top, GF_EVENT_CLEANUP, victim); + } + } + + return NULL; +} + +int +posix_spawn_health_check_thread(xlator_t *xl) +{ + struct posix_private *priv = NULL; + int ret = -1; + + priv = xl->private; + + LOCK(&priv->lock); + { + /* cancel the running thread */ + if (priv->health_check_active == _gf_true) { + pthread_cancel(priv->health_check); + priv->health_check_active = _gf_false; + } + + /* prevent scheduling a check in a tight loop */ + if (priv->health_check_interval == 0) + goto unlock; + + ret = gf_thread_create(&priv->health_check, NULL, + posix_health_check_thread_proc, xl, "posixhc"); + if (ret) { + priv->health_check_interval = 0; + priv->health_check_active = _gf_false; + 
gf_msg(xl->name, GF_LOG_ERROR, errno, P_MSG_HEALTHCHECK_FAILED, + "unable to setup health-check thread"); + goto unlock; + } + + priv->health_check_active = _gf_true; + } +unlock: + UNLOCK(&priv->lock); + return ret; +} + +void +posix_disk_space_check(xlator_t *this) +{ + struct posix_private *priv = NULL; + char *subvol_path = NULL; + int op_ret = 0; + double size = 0; + double percent = 0; + struct statvfs buf = {0}; + double totsz = 0; + double freesz = 0; + + GF_VALIDATE_OR_GOTO("posix-helpers", this, out); + priv = this->private; + GF_VALIDATE_OR_GOTO(this->name, priv, out); + + subvol_path = priv->base_path; + + op_ret = sys_statvfs(subvol_path, &buf); + + if (op_ret == -1) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_STATVFS_FAILED, + "statvfs failed on %s", subvol_path); + goto out; + } + + if (priv->disk_unit == 'p') { + percent = priv->disk_reserve; + totsz = (buf.f_blocks * buf.f_bsize); + size = ((totsz * percent) / 100); + } else { + size = priv->disk_reserve; + } + + freesz = (buf.f_bfree * buf.f_bsize); + if (freesz <= size) { + priv->disk_space_full = 1; + } else { + priv->disk_space_full = 0; + } +out: + return; +} + +static void * +posix_disk_space_check_thread_proc(void *data) +{ + xlator_t *this = NULL; + struct posix_private *priv = NULL; + uint32_t interval = 0; + int ret = -1; + + this = data; + priv = this->private; + + interval = 5; + gf_msg_debug(this->name, 0, + "disk-space thread started, " + "interval = %d seconds", + interval); + while (1) { + /* aborting sleep() is a request to exit this thread, sleep() + * will normally not return when cancelled */ + ret = sleep(interval); + if (ret > 0) + break; + /* prevent thread errors while doing the health-check(s) */ + pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL); + + /* Do the disk-check.*/ + posix_disk_space_check(this); + if (!priv->disk_space_check_active) + goto out; + pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); + } + +out: + gf_msg_debug(this->name, 0, "disk space 
check thread exiting"); + LOCK(&priv->lock); + { + priv->disk_space_check_active = _gf_false; + } + UNLOCK(&priv->lock); + + return NULL; +} + +int +posix_spawn_disk_space_check_thread(xlator_t *xl) +{ + struct posix_private *priv = NULL; + int ret = -1; + + priv = xl->private; + + LOCK(&priv->lock); + { + /* cancel the running thread */ + if (priv->disk_space_check_active == _gf_true) { + pthread_cancel(priv->disk_space_check); + priv->disk_space_check_active = _gf_false; + } + + ret = gf_thread_create(&priv->disk_space_check, NULL, + posix_disk_space_check_thread_proc, xl, + "posixrsv"); + if (ret) { + priv->disk_space_check_active = _gf_false; + gf_msg(xl->name, GF_LOG_ERROR, errno, P_MSG_DISK_SPACE_CHECK_FAILED, + "unable to setup disk space check thread"); + goto unlock; + } + + priv->disk_space_check_active = _gf_true; + } +unlock: + UNLOCK(&priv->lock); + return ret; +} + +int +posix_fsyncer_pick(xlator_t *this, struct list_head *head) +{ + struct posix_private *priv = NULL; + int count = 0; + + priv = this->private; + pthread_mutex_lock(&priv->fsync_mutex); + { + while (list_empty(&priv->fsyncs)) + pthread_cond_wait(&priv->fsync_cond, &priv->fsync_mutex); + + count = priv->fsync_queue_count; + priv->fsync_queue_count = 0; + list_splice_init(&priv->fsyncs, head); + } + pthread_mutex_unlock(&priv->fsync_mutex); + + return count; +} + +void +posix_fsyncer_process(xlator_t *this, call_stub_t *stub, gf_boolean_t do_fsync) +{ + struct posix_fd *pfd = NULL; + int ret = -1; + int op_errno = 0; + + ret = posix_fd_ctx_get(stub->args.fd, this, &pfd, &op_errno); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_GET_FDCTX_FAILED, + "could not get fdctx for fd(%s)", + uuid_utoa(stub->args.fd->inode->gfid)); + call_unwind_error(stub, -1, op_errno); + return; + } + + if (do_fsync && pfd) { + if (stub->args.datasync) + ret = sys_fdatasync(pfd->fd); + else + ret = sys_fsync(pfd->fd); + } else { + ret = 0; + } + + if (ret) { + gf_msg(this->name, 
GF_LOG_ERROR, errno, P_MSG_FSTAT_FAILED, + "could not fstat fd(%s)", uuid_utoa(stub->args.fd->inode->gfid)); + call_unwind_error(stub, -1, errno); + return; + } + + call_unwind_error(stub, 0, 0); +} + +static void +posix_fsyncer_syncfs(xlator_t *this, struct list_head *head) +{ + call_stub_t *stub = NULL; + struct posix_fd *pfd = NULL; + int ret = -1; + + stub = list_entry(head->prev, call_stub_t, list); + ret = posix_fd_ctx_get(stub->args.fd, this, &pfd, NULL); + if (!ret) + (void)gf_syncfs(pfd->fd); +} + +void * +posix_fsyncer(void *d) +{ + xlator_t *this = d; + struct posix_private *priv = NULL; + call_stub_t *stub = NULL; + call_stub_t *tmp = NULL; + struct list_head list; + int count = 0; + gf_boolean_t do_fsync = _gf_true; + + priv = this->private; + + for (;;) { + INIT_LIST_HEAD(&list); + + count = posix_fsyncer_pick(this, &list); + + gf_nanosleep(priv->batch_fsync_delay_usec * GF_US_IN_NS); + + gf_msg_debug(this->name, 0, "picked %d fsyncs", count); + + switch (priv->batch_fsync_mode) { + case BATCH_NONE: + case BATCH_REVERSE_FSYNC: + break; + case BATCH_SYNCFS: + case BATCH_SYNCFS_SINGLE_FSYNC: + case BATCH_SYNCFS_REVERSE_FSYNC: + posix_fsyncer_syncfs(this, &list); + break; + } + + if (priv->batch_fsync_mode == BATCH_SYNCFS) + do_fsync = _gf_false; + else + do_fsync = _gf_true; + + list_for_each_entry_safe_reverse(stub, tmp, &list, list) + { + list_del_init(&stub->list); + + posix_fsyncer_process(this, stub, do_fsync); + + if (priv->batch_fsync_mode == BATCH_SYNCFS_SINGLE_FSYNC) + do_fsync = _gf_false; + } + } +} + +/** + * TODO: move fd/inode interfaces into a single routine.. 
+ */ +static int32_t +posix_fetch_signature_xattr(char *real_path, const char *key, dict_t *xattr, + size_t *xsize) +{ + int32_t ret = 0; + char *memptr = NULL; + ssize_t xattrsize = 0; + char val_buf[2048] = { + 0, + }; + gf_boolean_t have_val = _gf_false; + + xattrsize = sys_lgetxattr(real_path, key, val_buf, sizeof(val_buf) - 1); + if (xattrsize >= 0) { + have_val = _gf_true; + } else { + if (errno == ERANGE) + xattrsize = sys_lgetxattr(real_path, key, NULL, 0); + if ((errno == ENOATTR) || (errno == ENODATA)) + return 0; + if (xattrsize == -1) + goto error_return; + } + memptr = GF_MALLOC(xattrsize + 1, gf_posix_mt_char); + if (!memptr) + goto error_return; + if (have_val) { + memcpy(memptr, val_buf, xattrsize); + memptr[xattrsize] = '\0'; + } else { + bzero(memptr, xattrsize + 1); + ret = sys_lgetxattr(real_path, key, memptr, xattrsize); + if (ret == -1) + goto freemem; + } + ret = dict_set_dynptr(xattr, (char *)key, memptr, xattrsize); + if (ret) + goto freemem; + + if (xsize) + *xsize = xattrsize; + + return 0; + +freemem: + GF_FREE(memptr); +error_return: + return -1; +} + +static int32_t +posix_fd_fetch_signature_xattr(int fd, const char *key, dict_t *xattr, + size_t *xsize) +{ + int32_t ret = 0; + char *memptr = NULL; + ssize_t xattrsize = 0; + + xattrsize = sys_fgetxattr(fd, key, NULL, 0); + if ((xattrsize == -1) && ((errno == ENOATTR) || (errno == ENODATA))) + return 0; + if (xattrsize == -1) + goto error_return; + + memptr = GF_CALLOC(xattrsize + 1, sizeof(char), gf_posix_mt_char); + if (!memptr) + goto error_return; + ret = sys_fgetxattr(fd, key, memptr, xattrsize); + if (ret == -1) + goto freemem; + + ret = dict_set_dynptr(xattr, (char *)key, memptr, xattrsize); + if (ret) + goto freemem; + + if (xsize) + *xsize = xattrsize; + + return 0; + +freemem: + GF_FREE(memptr); +error_return: + return -1; +} + +/** + * Fetch on-disk ongoing version and object signature extended attribute. 
+ * Be generous to absence of xattrs (just *absence*, other errors are + * propagated up to the invoker), higher layer (br-stub) takes care of + * interpreting the xattrs for anomalies. + */ +int32_t +posix_get_objectsignature(char *real_path, dict_t *xattr) +{ + int32_t ret = 0; + size_t signsize = 0; + + ret = posix_fetch_signature_xattr(real_path, BITROT_CURRENT_VERSION_KEY, + xattr, NULL); + if (ret) + goto error_return; + + ret = posix_fetch_signature_xattr(real_path, BITROT_SIGNING_VERSION_KEY, + xattr, &signsize); + if (ret) + goto delkey1; + + ret = dict_set_uint32(xattr, BITROT_SIGNING_XATTR_SIZE_KEY, + (uint32_t)signsize); + if (ret) + goto delkey2; + + return 0; + +delkey2: + dict_del(xattr, BITROT_SIGNING_VERSION_KEY); +delkey1: + dict_del(xattr, BITROT_CURRENT_VERSION_KEY); +error_return: + return -EINVAL; +} + +int32_t +posix_fdget_objectsignature(int fd, dict_t *xattr) +{ + int32_t ret = 0; + size_t signsize = 0; + + ret = posix_fd_fetch_signature_xattr(fd, BITROT_CURRENT_VERSION_KEY, xattr, + NULL); + if (ret) + goto error_return; + + ret = posix_fd_fetch_signature_xattr(fd, BITROT_SIGNING_VERSION_KEY, xattr, + &signsize); + if (ret) + goto delkey1; + + ret = dict_set_uint32(xattr, BITROT_SIGNING_XATTR_SIZE_KEY, + (uint32_t)signsize); + if (ret) + goto delkey2; + + return 0; + +delkey2: + dict_del(xattr, BITROT_SIGNING_VERSION_KEY); +delkey1: + dict_del(xattr, BITROT_CURRENT_VERSION_KEY); +error_return: + return -EINVAL; +} + +/* + * posix_resolve_dirgfid_to_path: + * It converts given dirgfid to path by doing recursive readlinks at the + * backend. If bname is given, it suffixes bname to dir path to form the + * complete path else it doesn't. It allocates memory for the path and is + * caller's responsibility to free the same. 
If bname is NULL and pargfid + * is ROOT, then it returns "/" + **/ + +int32_t +posix_resolve_dirgfid_to_path(const uuid_t dirgfid, const char *brick_path, + const char *bname, char **path) +{ + char *linkname = NULL; + char *dir_handle = NULL; + char *pgfidstr = NULL; + char *saveptr = NULL; + ssize_t len = 0; + int ret = 0; + uuid_t tmp_gfid = { + 0, + }; + uuid_t pargfid = { + 0, + }; + char gpath[PATH_MAX] = { + 0, + }; + char result[PATH_MAX] = { + 0, + }; + char result1[PATH_MAX] = { + 0, + }; + char *dir_name = NULL; + char pre_dir_name[PATH_MAX] = { + 0, + }; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT(this); + + gf_uuid_copy(pargfid, dirgfid); + if (!path || gf_uuid_is_null(pargfid)) { + ret = -1; + goto out; + } + + if (__is_root_gfid(pargfid)) { + if (bname) { + snprintf(result, PATH_MAX, "/%s", bname); + *path = gf_strdup(result); + } else { + *path = gf_strdup("/"); + } + return ret; + } + + dir_handle = alloca(PATH_MAX); + linkname = alloca(PATH_MAX); + (void)snprintf(gpath, PATH_MAX, "%s/.glusterfs/", brick_path); + + while (!(__is_root_gfid(pargfid))) { + len = snprintf(dir_handle, PATH_MAX, "%s/%02x/%02x/%s", gpath, + pargfid[0], pargfid[1], uuid_utoa(pargfid)); + if ((len < 0) || (len >= PATH_MAX)) { + ret = -1; + goto out; + } + + len = sys_readlink(dir_handle, linkname, PATH_MAX); + if (len < 0) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_READLINK_FAILED, + "could not read the " + "link from the gfid handle %s", + dir_handle); + ret = -1; + goto out; + } + + linkname[len] = '\0'; + + pgfidstr = strtok_r(linkname + SLEN("../../00/00/"), "/", &saveptr); + dir_name = strtok_r(NULL, "/", &saveptr); + + if (pre_dir_name[0] != '\0') { /* Remove '/' at the end */ + len = snprintf(result, PATH_MAX, "%s/%s", dir_name, pre_dir_name); + } else { + len = snprintf(result, PATH_MAX, "%s", dir_name); + } + if ((len < 0) || (len >= PATH_MAX)) { + ret = -1; + goto out; + } + + snprintf(pre_dir_name, sizeof(pre_dir_name), "%s", result); + + 
gf_uuid_parse(pgfidstr, tmp_gfid); + gf_uuid_copy(pargfid, tmp_gfid); + } + + if (bname) { + len = snprintf(result1, PATH_MAX, "/%s/%s", result, bname); + } else { + len = snprintf(result1, PATH_MAX, "/%s", result); + } + if ((len < 0) || (len >= PATH_MAX)) { + ret = -1; + goto out; + } + + *path = gf_strdup(result1); + if (*path == NULL) { + ret = -1; + goto out; + } + +out: + return ret; +} + +posix_inode_ctx_t * +__posix_inode_ctx_get(inode_t *inode, xlator_t *this) +{ + int ret = -1; + uint64_t ctx_uint = 0; + posix_inode_ctx_t *ctx_p = NULL; + + ret = __inode_ctx_get(inode, this, &ctx_uint); + if (ret == 0) { + return (posix_inode_ctx_t *)(uintptr_t)ctx_uint; + } + + ctx_p = GF_CALLOC(1, sizeof(*ctx_p), gf_posix_mt_inode_ctx_t); + if (!ctx_p) + return NULL; + + pthread_mutex_init(&ctx_p->xattrop_lock, NULL); + pthread_mutex_init(&ctx_p->write_atomic_lock, NULL); + pthread_mutex_init(&ctx_p->pgfid_lock, NULL); + + ctx_uint = (uint64_t)(uintptr_t)ctx_p; + ret = __inode_ctx_set(inode, this, &ctx_uint); + if (ret < 0) { + pthread_mutex_destroy(&ctx_p->xattrop_lock); + pthread_mutex_destroy(&ctx_p->write_atomic_lock); + pthread_mutex_destroy(&ctx_p->pgfid_lock); + GF_FREE(ctx_p); + return NULL; + } + + return ctx_p; +} + +int +__posix_inode_ctx_set_unlink_flag(inode_t *inode, xlator_t *this, uint64_t ctx) +{ + posix_inode_ctx_t *ctx_p = NULL; + + ctx_p = __posix_inode_ctx_get(inode, this); + if (ctx_p == NULL) + return -1; + + ctx_p->unlink_flag = ctx; + + return 0; +} + +int +posix_inode_ctx_set_unlink_flag(inode_t *inode, xlator_t *this, uint64_t ctx) +{ + int ret = -1; + + LOCK(&inode->lock); + { + ret = __posix_inode_ctx_set_unlink_flag(inode, this, ctx); + } + UNLOCK(&inode->lock); + + return ret; +} + +int +__posix_inode_ctx_get_all(inode_t *inode, xlator_t *this, + posix_inode_ctx_t **ctx) +{ + posix_inode_ctx_t *ctx_p = NULL; + + ctx_p = __posix_inode_ctx_get(inode, this); + if (ctx_p == NULL) + return -1; + + *ctx = ctx_p; + + return 0; +} + +int 
+posix_inode_ctx_get_all(inode_t *inode, xlator_t *this, posix_inode_ctx_t **ctx) +{ + int ret = 0; + + LOCK(&inode->lock); + { + ret = __posix_inode_ctx_get_all(inode, this, ctx); + } + UNLOCK(&inode->lock); + + return ret; +} + +gf_boolean_t +posix_is_bulk_removexattr(char *name, dict_t *xdata) +{ + if (name && (name[0] == '\0') && xdata) + return _gf_true; + return _gf_false; +} + +int32_t +posix_set_iatt_in_dict(dict_t *dict, struct iatt *preop, struct iatt *postop) +{ + int ret = -1; + struct iatt *stbuf = NULL; + int32_t len = sizeof(struct iatt); + struct iatt *prebuf = NULL; + struct iatt *postbuf = NULL; + + if (!dict) + return ret; + + if (postop) { + stbuf = GF_MALLOC(len, gf_common_mt_char); + if (!stbuf) + goto out; + memcpy(stbuf, postop, len); + ret = dict_set_iatt(dict, DHT_IATT_IN_XDATA_KEY, stbuf, false); + if (ret < 0) { + GF_FREE(stbuf); + goto out; + } + } + + if (preop) { + prebuf = GF_MALLOC(len, gf_common_mt_char); + if (!prebuf) + goto out; + memcpy(prebuf, preop, len); + ret = dict_set_iatt(dict, GF_PRESTAT, prebuf, false); + if (ret < 0) { + GF_FREE(prebuf); + goto out; + } + } + + if (postop) { + postbuf = GF_MALLOC(len, gf_common_mt_char); + if (!postbuf) + goto out; + memcpy(postbuf, postop, len); + ret = dict_set_iatt(dict, GF_POSTSTAT, postbuf, false); + if (ret < 0) { + GF_FREE(postbuf); + goto out; + } + } + + ret = 0; +out: + return ret; +} + +mode_t +posix_override_umask(mode_t mode, mode_t mode_bit) +{ + gf_msg_debug("posix", 0, "The value of mode is %u", mode); + mode = mode >> 9; /* 3x3 (bits for each octal digit)*/ + mode = (mode << 9) | mode_bit; + gf_msg_debug("posix", 0, "The value of mode is %u", mode); + return mode; +} + +int +posix_check_internal_writes(xlator_t *this, fd_t *fd, int sysfd, dict_t *xdata) +{ + int ret = 0; + size_t xattrsize = 0; + data_t *val = NULL; + + if (!xdata) + return 0; + + LOCK(&fd->inode->lock); + { + val = dict_get_sizen(xdata, GF_PROTECT_FROM_EXTERNAL_WRITES); + if (val) { + ret = 
sys_fsetxattr(sysfd, GF_PROTECT_FROM_EXTERNAL_WRITES, + val->data, val->len, 0); + if (ret == -1) { + gf_msg(this->name, GF_LOG_ERROR, P_MSG_XATTR_FAILED, errno, + "setxattr failed key %s", + GF_PROTECT_FROM_EXTERNAL_WRITES); + } + + goto out; + } + + if (dict_get_sizen(xdata, GF_AVOID_OVERWRITE)) { + xattrsize = sys_fgetxattr(sysfd, GF_PROTECT_FROM_EXTERNAL_WRITES, + NULL, 0); + if ((xattrsize == -1) && + ((errno == ENOATTR) || (errno == ENODATA))) { + ret = 0; + } else { + ret = -1; + } + } + } +out: + UNLOCK(&fd->inode->lock); + return ret; +} + +gf_cs_obj_state +posix_cs_heal_state(xlator_t *this, const char *realpath, int *fd, + struct iatt *buf) +{ + gf_boolean_t remote = _gf_false; + gf_boolean_t downloading = _gf_false; + int ret = 0; + gf_cs_obj_state state = GF_CS_ERROR; + size_t xattrsize = 0; + + if (!buf) { + ret = -1; + goto out; + } + + if (fd) { + xattrsize = sys_fgetxattr(*fd, GF_CS_OBJECT_REMOTE, NULL, 0); + if ((xattrsize == -1) && ((errno == ENOATTR) || (errno == ENODATA))) { + remote = _gf_false; + } else if (xattrsize == -1) { + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, 0, errno, + "fgetxattr" + " failed"); + state = GF_CS_ERROR; + goto out; + } else { + remote = _gf_true; + } + + xattrsize = sys_fgetxattr(*fd, GF_CS_OBJECT_DOWNLOADING, NULL, 0); + if ((xattrsize == -1) && ((errno == ENOATTR) || (errno == ENODATA))) { + downloading = _gf_false; + } else if (xattrsize == -1) { + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, 0, errno, + "fgetxattr" + " failed"); + state = GF_CS_ERROR; + goto out; + } else { + downloading = _gf_true; + } + } else { + xattrsize = sys_lgetxattr(realpath, GF_CS_OBJECT_REMOTE, NULL, 0); + if ((xattrsize == -1) && ((errno == ENOATTR) || (errno == ENODATA))) { + remote = _gf_false; + } else if (xattrsize == -1) { + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, 0, errno, + "getxattr" + " failed"); + state = GF_CS_ERROR; + goto out; + } else { + remote = _gf_true; + } + + xattrsize = sys_lgetxattr(realpath, 
GF_CS_OBJECT_DOWNLOADING, NULL, 0); + if ((xattrsize == -1) && ((errno == ENOATTR) || (errno == ENODATA))) { + downloading = _gf_false; + } else if (xattrsize == -1) { + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, 0, errno, + "getxattr" + " failed"); + state = GF_CS_ERROR; + goto out; + } else { + downloading = _gf_true; + } + } + + if (remote && downloading) { + if (fd) { + ret = sys_fremovexattr(*fd, GF_CS_OBJECT_DOWNLOADING); + } else { + ret = sys_lremovexattr(realpath, GF_CS_OBJECT_DOWNLOADING); + } + + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, errno, + "failed to remove xattr, repair failed"); + state = GF_CS_ERROR; + goto out; + } + + if (buf->ia_size) { + if (fd) { + ret = sys_ftruncate(*fd, 0); + } else { + ret = sys_truncate(realpath, 0); + } + + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, errno, + "truncate failed. File is in inconsistent" + " state"); + state = GF_CS_ERROR; + goto out; + } + } + + state = GF_CS_REMOTE; + goto out; + + } else if (remote) { + if (buf->ia_size) { + if (fd) { + ret = sys_ftruncate(*fd, 0); + } else { + ret = sys_truncate(realpath, 0); + } + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, errno, + "truncate failed. 
File is in inconsistent" + " state"); + state = GF_CS_ERROR; + goto out; + } + } + + state = GF_CS_REMOTE; + goto out; + } else if (downloading) { + if (buf->ia_size) { + if (fd) { + ret = sys_fremovexattr(*fd, GF_CS_OBJECT_DOWNLOADING); + } else { + ret = sys_lremovexattr(realpath, GF_CS_OBJECT_DOWNLOADING); + } + + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, errno, + "failed to remove xattr, repair failed"); + state = GF_CS_ERROR; + goto out; + } + + state = GF_CS_LOCAL; + goto out; + } + } + + state = GF_CS_LOCAL; +out: + gf_msg_debug(this->name, 0, "heal state returned %d", state); + return state; +} + +gf_cs_obj_state +posix_cs_check_status(xlator_t *this, const char *realpath, int *fd, + struct iatt *buf) +{ + gf_boolean_t remote = _gf_false; + gf_boolean_t downloading = _gf_false; + int ret = 0; + gf_cs_obj_state state = GF_CS_LOCAL; + size_t xattrsize = 0; + int op_errno = 0; + + if (fd) { + xattrsize = sys_fgetxattr(*fd, GF_CS_OBJECT_REMOTE, NULL, 0); + if ((xattrsize == -1) && ((errno == ENOATTR) || (errno == ENODATA))) { + remote = _gf_false; + } else if (xattrsize == -1) { + ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "getxattr " + "failed err %d", + errno); + goto out; + } else { + remote = _gf_true; + } + + xattrsize = sys_fgetxattr(*fd, GF_CS_OBJECT_DOWNLOADING, NULL, 0); + if ((xattrsize == -1) && ((errno == ENOATTR) || (errno == ENODATA))) { + downloading = _gf_false; + } else if (xattrsize == -1) { + ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "getxattr " + "failed err : %d", + errno); + + goto out; + } else { + downloading = _gf_true; + } + } + + if (realpath) { + xattrsize = sys_lgetxattr(realpath, GF_CS_OBJECT_REMOTE, NULL, 0); + if ((xattrsize == -1) && ((errno == ENOATTR) || (errno == ENODATA))) { + remote = _gf_false; + } else if (xattrsize == -1) { + ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "getxattr " + "failed err : %d", + errno); + goto out; + 
} else { + remote = _gf_true; + } + + xattrsize = sys_lgetxattr(realpath, GF_CS_OBJECT_DOWNLOADING, NULL, 0); + if ((xattrsize == -1) && ((errno == ENOATTR) || (errno == ENODATA))) { + downloading = _gf_false; + } else if (xattrsize == -1) { + ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "getxattr " + "failed err : %d", + errno); + goto out; + } else { + downloading = _gf_true; + } + } + +out: + if (ret) { + gf_msg("POSIX", GF_LOG_ERROR, 0, op_errno, + "getxattr failed " + "with %d", + op_errno); + state = GF_CS_ERROR; + return state; + } + + if ((remote && downloading) || (remote && buf && buf->ia_size)) { + state = GF_CS_REPAIR; + gf_msg_debug(this->name, 0, "status is REPAIR"); + return state; + } + + if (remote) + state = GF_CS_REMOTE; + else if (downloading) + state = GF_CS_DOWNLOADING; + else + state = GF_CS_LOCAL; + + gf_msg_debug(this->name, 0, "state returned is %d", state); + return state; +} + +int +posix_cs_set_state(xlator_t *this, dict_t **rsp, gf_cs_obj_state state, + char const *path, int *fd) +{ + int ret = 0; + char *value = NULL; + size_t xattrsize = 0; + + if (!rsp) { + ret = -1; + goto out; + } + + if (!(*rsp)) { + *rsp = dict_new(); + if (!(*rsp)) { + gf_msg(this->name, GF_LOG_ERROR, 0, ENOMEM, + "failed to" + " create dict"); + ret = -1; + goto out; + } + } + + ret = dict_set_uint64(*rsp, GF_CS_OBJECT_STATUS, state); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, ENOMEM, + "failed to set " + "dict"); + ret = -1; + goto out; + } + + if (fd) { + xattrsize = sys_fgetxattr(*fd, GF_CS_OBJECT_REMOTE, NULL, 0); + if (xattrsize != -1) { + value = GF_CALLOC(1, xattrsize + 1, gf_posix_mt_char); + if (!value) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "no memory for value"); + ret = -1; + goto out; + } + /* TODO: Add check for ENODATA */ + xattrsize = sys_fgetxattr(*fd, GF_CS_OBJECT_REMOTE, value, + xattrsize + 1); + if (xattrsize == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, errno, + " getxattr failed for key %s", 
GF_CS_OBJECT_REMOTE); + goto out; + } else { + value[xattrsize] = '\0'; + } + } else { + gf_msg(this->name, GF_LOG_ERROR, 0, errno, + " getxattr failed for key %s", GF_CS_OBJECT_REMOTE); + goto out; + } + } else { + xattrsize = sys_lgetxattr(path, GF_CS_OBJECT_REMOTE, NULL, 0); + if (xattrsize != -1) { + value = GF_CALLOC(1, xattrsize + 1, gf_posix_mt_char); + if (!value) { + ret = -1; + goto out; + } + + xattrsize = sys_lgetxattr(path, GF_CS_OBJECT_REMOTE, value, + xattrsize + 1); + if (xattrsize == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, errno, + " getxattr failed for key %s", GF_CS_OBJECT_REMOTE); + goto out; + } else { + value[xattrsize] = '\0'; + } + } else { + gf_msg(this->name, GF_LOG_ERROR, 0, errno, + " getxattr failed for key %s", GF_CS_OBJECT_REMOTE); + goto out; + } + } + + if (ret == 0) { + ret = dict_set_str(*rsp, GF_CS_OBJECT_REMOTE, value); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "failed to set" + "value"); + } + } + +out: + return ret; +} + +/* This function checks the status of the file and updates the xattr response. + * Also it repairs the state of the file which could have been resulted from a + * crash or transient failures. 
+ */ +int +posix_cs_maintenance(xlator_t *this, fd_t *fd, loc_t *loc, int *pfd, + struct iatt *buf, const char *realpath, dict_t *xattr_req, + dict_t **xattr_rsp, gf_boolean_t ignore_failure) +{ + gf_cs_obj_state state = GF_CS_ERROR; + int ret = 0; + gf_boolean_t is_cs_obj_status = _gf_false; + gf_boolean_t is_cs_obj_repair = _gf_false; + + if (dict_get_sizen(xattr_req, GF_CS_OBJECT_STATUS)) + is_cs_obj_status = _gf_true; + if (dict_get_sizen(xattr_req, GF_CS_OBJECT_REPAIR)) + is_cs_obj_repair = _gf_true; + + if (!(is_cs_obj_status || is_cs_obj_repair)) + return 0; + + if (fd) { + LOCK(&fd->inode->lock); + if (is_cs_obj_status) { + state = posix_cs_check_status(this, NULL, pfd, buf); + gf_msg_debug(this->name, 0, "state : %d", state); + ret = posix_cs_set_state(this, xattr_rsp, state, NULL, pfd); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "posix_cs_set_state failed"); + } + + if (ignore_failure) { + ret = 0; + goto unlock; + } else { + if (state != GF_CS_LOCAL || ret != 0) { + ret = -1; + goto unlock; + } + } + } + + if (is_cs_obj_repair) { + state = posix_cs_check_status(this, NULL, pfd, buf); + gf_msg_debug(this->name, 0, "state : %d", state); + + if (state == GF_CS_REPAIR) { + state = posix_cs_heal_state(this, NULL, pfd, buf); + + if (state == GF_CS_ERROR) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "repair check failed"); + } + } + + ret = posix_cs_set_state(this, xattr_rsp, state, NULL, pfd); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "posix_cs_set_state failed"); + if (ignore_failure) + ret = 0; + else + ret = -1; + goto unlock; + } + } + } else { + if (!loc->inode) { + ret = 0; + goto out; + } + + LOCK(&loc->inode->lock); + if (is_cs_obj_status) { + state = posix_cs_check_status(this, realpath, NULL, buf); + gf_msg_debug(this->name, 0, "state : %d", state); + ret = posix_cs_set_state(this, xattr_rsp, state, realpath, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "posix_cs_set_state failed"); + } + + if 
(ignore_failure) { + ret = 0; + goto unlock; + } else { + if (state != GF_CS_LOCAL || ret != 0) { + ret = -1; + goto unlock; + } + } + } + + if (is_cs_obj_repair) { + state = posix_cs_check_status(this, realpath, NULL, buf); + gf_msg_debug(this->name, 0, "state : %d", state); + + if (state == GF_CS_REPAIR) { + state = posix_cs_heal_state(this, realpath, NULL, buf); + + if (state == GF_CS_ERROR) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "repair check failed"); + } + } + + ret = posix_cs_set_state(this, xattr_rsp, state, realpath, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "posix_cs_set_state failed"); + if (ignore_failure) + ret = 0; + else + ret = -1; + goto unlock; + } + } + } + +unlock: + if (fd) + UNLOCK(&fd->inode->lock); + else + UNLOCK(&loc->inode->lock); +out: + return ret; +} + +int +posix_check_dev_file(xlator_t *this, inode_t *inode, char *fop, int *op_errno) +{ + int ret = -1; + + if (inode->ia_type == IA_IFBLK || inode->ia_type == IA_IFCHR) { + *op_errno = EINVAL; + gf_msg(this->name, GF_LOG_ERROR, *op_errno, P_MSG_INVALID_ARGUMENT, + "%s received on %s file (%s)", fop, + (inode->ia_type == IA_IFBLK) ? 
"block" : "char", + uuid_utoa(inode->gfid)); + goto out; + } + + ret = 0; + +out: + return ret; +} + +void +posix_update_iatt_buf(struct iatt *buf, int fd, char *loc, dict_t *xattr_req) +{ + int ret = 0; + char val[4096] = { + 0, + }; + + if (!xattr_req) + return; + + if (!dict_get_sizen(xattr_req, GF_CS_OBJECT_STATUS)) + return; + + if (fd != -1) { + ret = sys_fgetxattr(fd, GF_CS_OBJECT_SIZE, &val, sizeof(val)); + if (ret > 0) { + buf->ia_size = atoll(val); + } else { + /* Safe to assume that the other 2 xattrs are also not set*/ + return; + } + ret = sys_fgetxattr(fd, GF_CS_BLOCK_SIZE, &val, sizeof(val)); + if (ret > 0) { + buf->ia_blksize = atoll(val); + } + ret = sys_fgetxattr(fd, GF_CS_NUM_BLOCKS, &val, sizeof(val)); + if (ret > 0) { + buf->ia_blocks = atoll(val); + } + } else { + ret = sys_lgetxattr(loc, GF_CS_OBJECT_SIZE, &val, sizeof(val)); + if (ret > 0) { + buf->ia_size = atoll(val); + } else { + /* Safe to assume that the other 2 xattrs are also not set*/ + return; + } + ret = sys_lgetxattr(loc, GF_CS_BLOCK_SIZE, &val, sizeof(val)); + if (ret > 0) { + buf->ia_blksize = atoll(val); + } + ret = sys_lgetxattr(loc, GF_CS_NUM_BLOCKS, &val, sizeof(val)); + if (ret > 0) { + buf->ia_blocks = atoll(val); + } + } +} + +gf_boolean_t +posix_is_layout_stale(dict_t *xdata, char *par_path, xlator_t *this) +{ + int op_ret = 0; + ssize_t size = 0; + char value_buf[4096] = { + 0, + }; + gf_boolean_t have_val = _gf_false; + data_t *arg_data = NULL; + char *xattr_name = NULL; + size_t xattr_len = 0; + gf_boolean_t is_stale = _gf_false; + + op_ret = dict_get_str_sizen(xdata, GF_PREOP_PARENT_KEY, &xattr_name); + if (xattr_name == NULL) { + op_ret = 0; + return is_stale; + } + + xattr_len = strlen(xattr_name); + arg_data = dict_getn(xdata, xattr_name, xattr_len); + if (!arg_data) { + op_ret = 0; + dict_del_sizen(xdata, GF_PREOP_PARENT_KEY); + return is_stale; + } + + size = sys_lgetxattr(par_path, xattr_name, value_buf, + sizeof(value_buf) - 1); + + if (size >= 0) { + have_val 
= _gf_true; + } else { + if (errno == ERANGE) { + gf_msg(this->name, GF_LOG_INFO, errno, P_MSG_PREOP_CHECK_FAILED, + "getxattr on key (%s) path (%s) failed due to" + " buffer overflow", + xattr_name, par_path); + size = sys_lgetxattr(par_path, xattr_name, NULL, 0); + } + if (size < 0) { + op_ret = -1; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_PREOP_CHECK_FAILED, + "getxattr on key (%s) failed, path : %s", xattr_name, + par_path); + goto out; + } + } + + if (!have_val) { + size = sys_lgetxattr(par_path, xattr_name, value_buf, size); + if (size < 0) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_PREOP_CHECK_FAILED, + "getxattr on key (%s) failed (%s)", xattr_name, + strerror(errno)); + goto out; + } + } + + if ((arg_data->len != size) || (memcmp(arg_data->data, value_buf, size))) { + gf_msg(this->name, GF_LOG_INFO, EIO, P_MSG_PREOP_CHECK_FAILED, + "failing preop as on-disk xattr value differs from argument " + "value for key %s", + xattr_name); + op_ret = -1; + } + +out: + dict_deln(xdata, xattr_name, xattr_len); + dict_del_sizen(xdata, GF_PREOP_PARENT_KEY); + + if (op_ret == -1) { + is_stale = _gf_true; + } + + return is_stale; +} + +/* Delete user xattr from the file at the file-path specified by data and from + * dict */ +int +posix_delete_user_xattr(dict_t *dict, char *k, data_t *v, void *data) +{ + int ret; + char *real_path = data; + + ret = sys_lremovexattr(real_path, k); + if (ret) { + gf_msg("posix-helpers", GF_LOG_ERROR, P_MSG_XATTR_NOT_REMOVED, errno, + "removexattr failed. key %s path %s", k, real_path); + } + + dict_del(dict, k); + + return ret; +} diff --git a/xlators/storage/posix/src/posix-inode-fd-ops.c b/xlators/storage/posix/src/posix-inode-fd-ops.c new file mode 100644 index 00000000000..6d54d37e5aa --- /dev/null +++ b/xlators/storage/posix/src/posix-inode-fd-ops.c @@ -0,0 +1,6004 @@ +/* + Copyright (c) 2006-2017 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#define __XOPEN_SOURCE 500 + +/* for SEEK_HOLE and SEEK_DATA */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include <openssl/md5.h> +#include <stdint.h> +#include <sys/time.h> +#include <sys/resource.h> +#include <errno.h> +#include <libgen.h> +#include <pthread.h> +#include <ftw.h> +#include <sys/stat.h> +#include <signal.h> +#include <sys/uio.h> +#include <unistd.h> +#include <regex.h> + +#ifndef GF_BSD_HOST_OS +#include <alloca.h> +#endif /* GF_BSD_HOST_OS */ + +#ifdef HAVE_LINKAT +#include <fcntl.h> +#endif /* HAVE_LINKAT */ + +#include <glusterfs/checksum.h> +#include <glusterfs/dict.h> +#include <glusterfs/logging.h> +#include "posix-handle.h" +#include <glusterfs/compat-errno.h> +#include <glusterfs/compat.h> +#include <glusterfs/byte-order.h> +#include <glusterfs/syscall.h> +#include <glusterfs/statedump.h> +#include <glusterfs/locking.h> +#include <glusterfs/timer.h> +#include "glusterfs3-xdr.h" +#include <glusterfs/glusterfs-acl.h> +#include "posix-messages.h" +#include "posix-metadata.h" +#include <glusterfs/events.h> +#include "posix-gfid-path.h" +#include <glusterfs/compat-uuid.h> +#include <glusterfs/common-utils.h> + +extern char *marker_xattrs[]; +#define ALIGN_SIZE 4096 + +#undef HAVE_SET_FSID +#ifdef HAVE_SET_FSID + +#define DECLARE_OLD_FS_ID_VAR \ + uid_t old_fsuid; \ + gid_t old_fsgid; + +#define SET_FS_ID(uid, gid) \ + do { \ + old_fsuid = setfsuid(uid); \ + old_fsgid = setfsgid(gid); \ + } while (0) + +#define SET_TO_OLD_FS_ID() \ + do { \ + setfsuid(old_fsuid); \ + setfsgid(old_fsgid); \ + } while (0) + +#else + +#define DECLARE_OLD_FS_ID_VAR +#define SET_FS_ID(uid, gid) +#define SET_TO_OLD_FS_ID() + +#endif + +/* Setting microseconds or nanoseconds depending on what's 
supported: + The passed in `tv` can be + struct timespec + if supported (better, because it supports nanosecond resolution) or + struct timeval + otherwise. */ +#if HAVE_UTIMENSAT +#define SET_TIMESPEC_NSEC_OR_TIMEVAL_USEC(tv, nanosecs) tv.tv_nsec = nanosecs +#else +#define SET_TIMESPEC_NSEC_OR_TIMEVAL_USEC(tv, nanosecs) \ + tv.tv_usec = nanosecs / 1000 +#endif + +static char *disallow_removexattrs[] = {GF_XATTR_VOL_ID_KEY, GFID_XATTR_KEY, + NULL}; + +void +posix_cs_build_xattr_rsp(xlator_t *this, dict_t **rsp, dict_t *req, int fd, + char *loc) +{ + int ret = 0; + uuid_t uuid; + + if (!dict_get_sizen(req, GF_CS_OBJECT_STATUS)) + return; + + if (!(*rsp)) { + *rsp = dict_new(); + if (!(*rsp)) { + return; + } + } + + if (fd != -1) { + if (dict_get_sizen(req, GF_CS_XATTR_ARCHIVE_UUID)) { + ret = sys_fgetxattr(fd, GF_CS_XATTR_ARCHIVE_UUID, uuid, 16); + if (ret > 0) { + ret = dict_set_gfuuid(*rsp, GF_CS_XATTR_ARCHIVE_UUID, uuid, + true); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_DICT_SET_FAILED, + "%s: Failed to set " + "dictionary value for %s for fd %d", + uuid_utoa(uuid), GF_CS_XATTR_ARCHIVE_UUID, fd); + } + } else { + gf_msg_debug(this->name, 0, "getxattr failed on %s for fd %d", + GF_CS_XATTR_ARCHIVE_UUID, fd); + } + } + } else { + if (dict_get_sizen(req, GF_CS_XATTR_ARCHIVE_UUID)) { + ret = sys_lgetxattr(loc, GF_CS_XATTR_ARCHIVE_UUID, uuid, 16); + if (ret > 0) { + ret = dict_set_gfuuid(*rsp, GF_CS_XATTR_ARCHIVE_UUID, uuid, + true); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_DICT_SET_FAILED, + "%s: Failed to set " + "dictionary value for %s for loc %s", + uuid_utoa(uuid), GF_CS_XATTR_ARCHIVE_UUID, loc); + } + } else { + gf_msg_debug(this->name, 0, "getxattr failed on %s for %s", + GF_CS_XATTR_ARCHIVE_UUID, loc); + } + } + } + return; +} + +int32_t +posix_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + struct iatt buf = { + 0, + }; + int32_t op_ret = -1; + int32_t op_errno = 0; + struct posix_private *priv = 
NULL; + char *real_path = NULL; + dict_t *xattr_rsp = NULL; + + DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(loc, out); + + priv = this->private; + VALIDATE_OR_GOTO(priv, out); + + SET_FS_ID(frame->root->uid, frame->root->gid); + + MAKE_INODE_HANDLE(real_path, this, loc, &buf); + + if (op_ret == -1) { + op_errno = errno; + if (op_errno == ENOENT) { + gf_msg_debug(this->name, 0, + "lstat on gfid-handle %s (path: %s)" + "failed: %s", + real_path ? real_path : "<null>", loc->path, + strerror(op_errno)); + } else { + gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_LSTAT_FAILED, + "lstat on gfid-handle %s (path: %s) failed", + real_path ? real_path : "<null>", loc->path); + } + goto out; + } + if (xdata) { + xattr_rsp = posix_xattr_fill(this, real_path, loc, NULL, -1, xdata, + &buf); + + posix_cs_maintenance(this, NULL, loc, NULL, &buf, real_path, xdata, + &xattr_rsp, _gf_true); + + posix_cs_build_xattr_rsp(this, &xattr_rsp, xdata, -1, real_path); + } + + posix_update_iatt_buf(&buf, -1, real_path, xdata); + op_ret = 0; + +out: + SET_TO_OLD_FS_ID(); + STACK_UNWIND_STRICT(stat, frame, op_ret, op_errno, &buf, xattr_rsp); + if (xattr_rsp) + dict_unref(xattr_rsp); + + return 0; +} + +static int +posix_do_chmod(xlator_t *this, const char *path, struct iatt *stbuf) +{ + int32_t ret = -1; + mode_t mode = 0; + mode_t mode_bit = 0; + struct posix_private *priv = NULL; + struct stat stat; + int is_symlink = 0; + + priv = this->private; + VALIDATE_OR_GOTO(priv, out); + ret = sys_lstat(path, &stat); + if (ret != 0) { + gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_LSTAT_FAILED, + "lstat failed: %s", path); + goto out; + } + + if (S_ISLNK(stat.st_mode)) + is_symlink = 1; + + if (S_ISDIR(stat.st_mode)) { + mode = st_mode_from_ia(stbuf->ia_prot, stbuf->ia_type); + mode_bit = (mode & priv->create_directory_mask) | + priv->force_directory_mode; + mode = posix_override_umask(mode, mode_bit); + } else { + mode = 
st_mode_from_ia(stbuf->ia_prot, stbuf->ia_type); + mode_bit = (mode & priv->create_mask) | priv->force_create_mode; + mode = posix_override_umask(mode, mode_bit); + } + ret = lchmod(path, mode); + if ((ret == -1) && (errno == ENOSYS)) { + /* in Linux symlinks are always in mode 0777 and no + such call as lchmod exists. + */ + gf_msg_debug(this->name, 0, "%s (%s)", path, strerror(errno)); + if (is_symlink) { + ret = 0; + goto out; + } + + ret = sys_chmod(path, mode); + } +out: + return ret; +} + +static int +posix_do_chown(xlator_t *this, const char *path, struct iatt *stbuf, + int32_t valid) +{ + int32_t ret = -1; + uid_t uid = -1; + gid_t gid = -1; + + if (valid & GF_SET_ATTR_UID) + uid = stbuf->ia_uid; + + if (valid & GF_SET_ATTR_GID) + gid = stbuf->ia_gid; + + ret = sys_lchown(path, uid, gid); + + return ret; +} + +static int +posix_do_utimes(xlator_t *this, const char *path, struct iatt *stbuf, int valid) +{ + int32_t ret = -1; +#if defined(HAVE_UTIMENSAT) + struct timespec tv[2] = {{ + 0, + }, + { + 0, + }}; +#else + struct timeval tv[2] = {{ + 0, + }, + { + 0, + }}; +#endif + struct stat stat; + int is_symlink = 0; + + ret = sys_lstat(path, &stat); + if (ret != 0) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_FILE_OP_FAILED, "%s", + path); + goto out; + } + + if (S_ISLNK(stat.st_mode)) + is_symlink = 1; + + if ((valid & GF_SET_ATTR_ATIME) == GF_SET_ATTR_ATIME) { + tv[0].tv_sec = stbuf->ia_atime; + SET_TIMESPEC_NSEC_OR_TIMEVAL_USEC(tv[0], stbuf->ia_atime_nsec); + } else { + /* atime is not given, use current values */ + tv[0].tv_sec = ST_ATIM_SEC(&stat); + SET_TIMESPEC_NSEC_OR_TIMEVAL_USEC(tv[0], ST_ATIM_NSEC(&stat)); + } + + if ((valid & GF_SET_ATTR_MTIME) == GF_SET_ATTR_MTIME) { + tv[1].tv_sec = stbuf->ia_mtime; + SET_TIMESPEC_NSEC_OR_TIMEVAL_USEC(tv[1], stbuf->ia_mtime_nsec); + } else { + /* mtime is not given, use current values */ + tv[1].tv_sec = ST_MTIM_SEC(&stat); + SET_TIMESPEC_NSEC_OR_TIMEVAL_USEC(tv[1], ST_MTIM_NSEC(&stat)); + } + + ret = 
PATH_SET_TIMESPEC_OR_TIMEVAL(path, tv); + if ((ret == -1) && (errno == ENOSYS)) { + gf_msg_debug(this->name, 0, "%s (%s)", path, strerror(errno)); + if (is_symlink) { + ret = 0; + goto out; + } + + ret = PATH_SET_TIMESPEC_OR_TIMEVAL(path, tv); + } + +out: + return ret; +} + +int +posix_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + char *real_path = 0; + struct iatt statpre = { + 0, + }; + struct iatt statpost = { + 0, + }; + dict_t *xattr_rsp = NULL; + struct posix_private *priv = NULL; + + priv = this->private; + + DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(loc, out); + + SET_FS_ID(frame->root->uid, frame->root->gid); + MAKE_INODE_HANDLE(real_path, this, loc, &statpre); + + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "setattr (lstat) on gfid-handle %s (path: %s) failed", + real_path ? 
real_path : "<null>", loc->path); + goto out; + } + + if (valid & (GF_SET_ATTR_UID | GF_SET_ATTR_GID)) { + op_ret = posix_do_chown(this, real_path, stbuf, valid); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_CHOWN_FAILED, + "setattr (chown) on %s " + "failed", + real_path); + goto out; + } + } + + if (valid & GF_SET_ATTR_MODE) { + op_ret = posix_do_chmod(this, real_path, stbuf); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_CHMOD_FAILED, + "setattr (chmod) on gfid-handle %s (path: %s) " + "failed", + real_path, loc->path); + goto out; + } + } + + if (valid & (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME)) { + op_ret = posix_do_utimes(this, real_path, stbuf, valid); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_UTIMES_FAILED, + "setattr (utimes) on gfid-handle %s (path: %s) " + "failed", + real_path, loc->path); + goto out; + } + posix_update_utime_in_mdata(this, real_path, -1, loc->inode, + &frame->root->ctime, stbuf, valid); + } + + if ((valid & GF_SET_ATTR_CTIME) && priv->ctime) { + posix_update_ctime_in_mdata(this, real_path, -1, loc->inode, + &frame->root->ctime, stbuf, valid); + } + + if (!valid) { + op_ret = sys_lchown(real_path, -1, -1); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LCHOWN_FAILED, + "lchown (gfid-handle: %s, path: %s, -1, -1) " + "failed", + real_path, loc->path); + + goto out; + } + } + + op_ret = posix_pstat(this, loc->inode, loc->gfid, real_path, &statpost, + _gf_false); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "setattr (lstat) on gfid-handle %s (path: %s) failed", real_path, + loc->path); + goto out; + } + + posix_set_ctime(frame, this, real_path, -1, loc->inode, &statpost); + + if (xdata) + xattr_rsp = posix_xattr_fill(this, real_path, loc, NULL, -1, xdata, + &statpost); + posix_update_iatt_buf(&statpre, 
-1, real_path, xdata); + posix_update_iatt_buf(&statpost, -1, real_path, xdata); + op_ret = 0; + +out: + SET_TO_OLD_FS_ID(); + + STACK_UNWIND_STRICT(setattr, frame, op_ret, op_errno, &statpre, &statpost, + xattr_rsp); + if (xattr_rsp) + dict_unref(xattr_rsp); + + return 0; +} + +int32_t +posix_do_fchown(xlator_t *this, int fd, struct iatt *stbuf, int32_t valid) +{ + int ret = -1; + uid_t uid = -1; + gid_t gid = -1; + + if (valid & GF_SET_ATTR_UID) + uid = stbuf->ia_uid; + + if (valid & GF_SET_ATTR_GID) + gid = stbuf->ia_gid; + + ret = sys_fchown(fd, uid, gid); + + return ret; +} + +int32_t +posix_do_fchmod(xlator_t *this, int fd, struct iatt *stbuf) +{ + int32_t ret = -1; + mode_t mode = 0; + mode_t mode_bit = 0; + struct posix_private *priv = NULL; + + priv = this->private; + VALIDATE_OR_GOTO(priv, out); + mode = st_mode_from_ia(stbuf->ia_prot, stbuf->ia_type); + mode_bit = (mode & priv->create_mask) | priv->force_create_mode; + mode = posix_override_umask(mode, mode_bit); + ret = sys_fchmod(fd, mode); +out: + return ret; +} + +static int +posix_do_futimes(xlator_t *this, int fd, struct iatt *stbuf, int valid) +{ + int32_t ret = -1; + struct timeval tv[2] = {{ + 0, + }, + { + 0, + }}; + struct stat stat = { + 0, + }; + gf_boolean_t fstat_executed = _gf_false; + + if ((valid & GF_SET_ATTR_ATIME) == GF_SET_ATTR_ATIME) { + tv[0].tv_sec = stbuf->ia_atime; + tv[0].tv_usec = stbuf->ia_atime_nsec / 1000; + } else { + ret = sys_fstat(fd, &stat); + if (ret != 0) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_FILE_OP_FAILED, + "%d", fd); + goto out; + } + fstat_executed = _gf_true; + /* atime is not given, use current values */ + tv[0].tv_sec = ST_ATIM_SEC(&stat); + tv[0].tv_usec = ST_ATIM_NSEC(&stat) / 1000; + } + + if ((valid & GF_SET_ATTR_MTIME) == GF_SET_ATTR_MTIME) { + tv[1].tv_sec = stbuf->ia_mtime; + tv[1].tv_usec = stbuf->ia_mtime_nsec / 1000; + } else { + if (!fstat_executed) { + ret = sys_fstat(fd, &stat); + if (ret != 0) { + gf_msg(this->name, 
GF_LOG_WARNING, errno, P_MSG_FILE_OP_FAILED, + "%d", fd); + goto out; + } + } + /* mtime is not given, use current values */ + tv[1].tv_sec = ST_MTIM_SEC(&stat); + tv[1].tv_usec = ST_MTIM_NSEC(&stat) / 1000; + } + + ret = sys_futimes(fd, tv); + if (ret == -1) + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_FUTIMES_FAILED, "%d", fd); + +out: + return ret; +} + +int +posix_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + struct iatt statpre = { + 0, + }; + struct iatt statpost = { + 0, + }; + struct posix_private *priv = NULL; + struct posix_fd *pfd = NULL; + dict_t *xattr_rsp = NULL; + int32_t ret = -1; + + DECLARE_OLD_FS_ID_VAR; + + SET_FS_ID(frame->root->uid, frame->root->gid); + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(fd, out); + + priv = this->private; + VALIDATE_OR_GOTO(priv, out); + + ret = posix_fd_ctx_get(fd, this, &pfd, &op_errno); + if (ret < 0) { + gf_msg_debug(this->name, 0, "pfd is NULL from fd=%p", fd); + goto out; + } + + op_ret = posix_fdstat(this, fd->inode, pfd->fd, &statpre); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_FSTAT_FAILED, + "fsetattr (fstat) failed on fd=%p", fd); + goto out; + } + + if (valid & (GF_SET_ATTR_UID | GF_SET_ATTR_GID)) { + op_ret = posix_do_fchown(this, pfd->fd, stbuf, valid); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_FCHOWN_FAILED, + "fsetattr (fchown) failed" + " on fd=%p", + fd); + goto out; + } + } + + if (valid & GF_SET_ATTR_MODE) { + op_ret = posix_do_fchmod(this, pfd->fd, stbuf); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_FCHMOD_FAILED, + "fsetattr (fchmod) failed" + " on fd=%p", + fd); + goto out; + } + } + + if (valid & (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME)) { + op_ret = posix_do_futimes(this, pfd->fd, stbuf, valid); + if 
(op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_FUTIMES_FAILED, + "fsetattr (futimes) on " + "failed fd=%p", + fd); + goto out; + } + posix_update_utime_in_mdata(this, NULL, pfd->fd, fd->inode, + &frame->root->ctime, stbuf, valid); + } + + if ((valid & GF_SET_ATTR_CTIME) && priv->ctime) { + posix_update_ctime_in_mdata(this, NULL, pfd->fd, fd->inode, + &frame->root->ctime, stbuf, valid); + } + + if (!valid) { + op_ret = sys_fchown(pfd->fd, -1, -1); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_FCHOWN_FAILED, + "fchown (%d, -1, -1) failed", pfd->fd); + + goto out; + } + } + + op_ret = posix_fdstat(this, fd->inode, pfd->fd, &statpost); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_FSTAT_FAILED, + "fsetattr (fstat) failed on fd=%p", fd); + goto out; + } + + posix_set_ctime(frame, this, NULL, pfd->fd, fd->inode, &statpost); + + if (xdata) + xattr_rsp = posix_xattr_fill(this, NULL, NULL, fd, pfd->fd, xdata, + &statpost); + op_ret = 0; + +out: + SET_TO_OLD_FS_ID(); + + STACK_UNWIND_STRICT(fsetattr, frame, op_ret, op_errno, &statpre, &statpost, + xattr_rsp); + if (xattr_rsp) + dict_unref(xattr_rsp); + + return 0; +} + +static int32_t +posix_do_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, + off_t offset, size_t len, struct iatt *statpre, + struct iatt *statpost, dict_t *xdata, dict_t **rsp_xdata) +{ + int32_t ret = -1; + int32_t op_errno = 0; + struct posix_fd *pfd = NULL; + gf_boolean_t locked = _gf_false; + posix_inode_ctx_t *ctx = NULL; + struct posix_private *priv = NULL; + gf_boolean_t check_space_error = _gf_false; + struct stat statbuf = { + 0, + }; + + DECLARE_OLD_FS_ID_VAR; + + SET_FS_ID(frame->root->uid, frame->root->gid); + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(fd, out); + + priv = this->private; + + /* fallocate case is special so call posix_disk_space_check separately 
+ for every fallocate fop instead of calling posix_disk_space with + thread after every 5 sec sleep to working correctly storage.reserve + option behaviour + */ + if (priv->disk_reserve) + posix_disk_space_check(this); + + DISK_SPACE_CHECK_AND_GOTO(frame, priv, xdata, ret, ret, unlock); + +overwrite: + check_space_error = _gf_true; + + ret = posix_fd_ctx_get(fd, this, &pfd, &op_errno); + if (ret < 0) { + gf_msg_debug(this->name, 0, "pfd is NULL from fd=%p", fd); + goto out; + } + + ret = posix_inode_ctx_get_all(fd->inode, this, &ctx); + if (ret < 0) { + ret = -ENOMEM; + goto out; + } + + if (xdata && dict_get(xdata, GLUSTERFS_WRITE_UPDATE_ATOMIC)) { + locked = _gf_true; + pthread_mutex_lock(&ctx->write_atomic_lock); + } + + ret = posix_fdstat(this, fd->inode, pfd->fd, statpre); + if (ret == -1) { + ret = -errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_FSTAT_FAILED, + "fallocate (fstat) failed on fd=%p", fd); + goto unlock; + } + + if (xdata) { + ret = posix_cs_maintenance(this, fd, NULL, &pfd->fd, statpre, NULL, + xdata, rsp_xdata, _gf_false); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "file state check failed, fd %p", fd); + ret = -EIO; + goto unlock; + } + } + + ret = sys_fallocate(pfd->fd, flags, offset, len); + if (ret == -1) { + ret = -errno; + gf_msg(this->name, GF_LOG_ERROR, -ret, P_MSG_FALLOCATE_FAILED, + "fallocate failed on %s offset: %jd, " + "len:%zu, flags: %d", + uuid_utoa(fd->inode->gfid), offset, len, flags); + goto unlock; + } + + ret = posix_fdstat(this, fd->inode, pfd->fd, statpost); + if (ret == -1) { + ret = -errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_FSTAT_FAILED, + "fallocate (fstat) failed on fd=%p", fd); + goto unlock; + } + + posix_set_ctime(frame, this, NULL, pfd->fd, fd->inode, statpost); + +unlock: + if (locked) { + pthread_mutex_unlock(&ctx->write_atomic_lock); + locked = _gf_false; + } + + if (op_errno == ENOSPC && priv->disk_space_full && !check_space_error) { +#ifdef FALLOC_FL_KEEP_SIZE + if 
(flags & FALLOC_FL_KEEP_SIZE) { + goto overwrite; + } +#endif + ret = posix_fd_ctx_get(fd, this, &pfd, &op_errno); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, ret, P_MSG_PFD_NULL, + "pfd is NULL from fd=%p", fd); + goto out; + } + + if (sys_fstat(pfd->fd, &statbuf) < 0) { + gf_msg(this->name, GF_LOG_WARNING, op_errno, P_MSG_FILE_OP_FAILED, + "%d", pfd->fd); + goto out; + } + + if (offset + len <= statbuf.st_size) { + gf_msg_debug(this->name, 0, + "io vector size will not" + " change disk size so allow overwrite for" + " fd %d", + pfd->fd); + goto overwrite; + } + } + +out: + SET_TO_OLD_FS_ID(); + if (ret == ENOSPC) + ret = -ENOSPC; + + return ret; +}

/*
 * Allocate a zeroed buffer of `size` + ALIGN_SIZE bytes and compute an
 * ALIGN_SIZE-aligned pointer inside it.
 *
 * On success the aligned pointer is stored in *aligned_buf and the raw
 * allocation is returned; the raw pointer (not the aligned one) is what the
 * callers in this file pass to GF_FREE(). On allocation failure NULL is
 * returned and *aligned_buf is left untouched.
 */
char *
_page_aligned_alloc(size_t size, char **aligned_buf)
{
    char *aligned = NULL;
    char *raw = GF_CALLOC(1, (size + ALIGN_SIZE), gf_posix_mt_char);

    if (raw == NULL)
        return NULL;

    /* page aligned buffer */
    aligned = GF_ALIGN_BUF(raw, ALIGN_SIZE);
    *aligned_buf = aligned;

    return raw;
}

+static int32_t +_posix_do_zerofill(int fd, off_t offset, off_t len, int o_direct) +{ + off_t num_vect = 0; + off_t num_loop = 1; + off_t idx = 0; + int32_t op_ret = -1; + int32_t vect_size = VECTOR_SIZE; + off_t remain = 0; + off_t extra = 0; + struct iovec *vector = NULL; + char *iov_base = NULL; + char *alloc_buf = NULL; + + if (len == 0) + return 0; + if (len < VECTOR_SIZE) + vect_size = len; + + num_vect = len / (vect_size); + remain = len % vect_size; + if (num_vect > MAX_NO_VECT) { + extra = num_vect % MAX_NO_VECT; + num_loop = num_vect / MAX_NO_VECT; + num_vect = MAX_NO_VECT; + } + + vector = GF_CALLOC(num_vect, sizeof(struct iovec), gf_common_mt_iovec); + if (!vector) + return -1; + if (o_direct) { + alloc_buf = _page_aligned_alloc(vect_size, &iov_base); + if (!alloc_buf) { + GF_FREE(vector); + return -1; + } + } else { + iov_base = GF_CALLOC(vect_size, sizeof(char), gf_common_mt_char); + if (!iov_base) { + GF_FREE(vector); + return -1; + } + } + + for (idx = 0; idx <
num_vect; idx++) { + vector[idx].iov_base = iov_base; + vector[idx].iov_len = vect_size; + } + if (sys_lseek(fd, offset, SEEK_SET) < 0) { + op_ret = -1; + goto err; + } + + for (idx = 0; idx < num_loop; idx++) { + op_ret = sys_writev(fd, vector, num_vect); + if (op_ret < 0) + goto err; + if (op_ret != (vect_size * num_vect)) { + op_ret = -1; + errno = ENOSPC; + goto err; + } + } + if (extra) { + op_ret = sys_writev(fd, vector, extra); + if (op_ret < 0) + goto err; + if (op_ret != (vect_size * extra)) { + op_ret = -1; + errno = ENOSPC; + goto err; + } + } + if (remain) { + vector[0].iov_len = remain; + op_ret = sys_writev(fd, vector, 1); + if (op_ret < 0) + goto err; + if (op_ret != remain) { + op_ret = -1; + errno = ENOSPC; + goto err; + } + } +err: + if (o_direct) + GF_FREE(alloc_buf); + else + GF_FREE(iov_base); + GF_FREE(vector); + return op_ret; +} + +static int32_t +posix_do_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, struct iatt *statpre, struct iatt *statpost, + dict_t *xdata, dict_t **rsp_xdata) +{ + int32_t ret = -1; + int32_t op_errno = 0; + int32_t flags = 0; + struct posix_fd *pfd = NULL; + gf_boolean_t locked = _gf_false; + posix_inode_ctx_t *ctx = NULL; + + DECLARE_OLD_FS_ID_VAR; + + SET_FS_ID(frame->root->uid, frame->root->gid); + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(fd, out); + + ret = posix_fd_ctx_get(fd, this, &pfd, &op_errno); + if (ret < 0) { + gf_msg_debug(this->name, 0, "pfd is NULL from fd=%p", fd); + goto out; + } + + ret = posix_inode_ctx_get_all(fd->inode, this, &ctx); + if (ret < 0) { + ret = -ENOMEM; + goto out; + } + + if (dict_get(xdata, GLUSTERFS_WRITE_UPDATE_ATOMIC)) { + locked = _gf_true; + pthread_mutex_lock(&ctx->write_atomic_lock); + } + + ret = posix_fdstat(this, fd->inode, pfd->fd, statpre); + if (ret == -1) { + ret = -errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_FSTAT_FAILED, + "pre-operation fstat failed on fd = %p", fd); + goto out; 
+ } + + if (xdata) { + ret = posix_cs_maintenance(this, fd, NULL, &pfd->fd, statpre, NULL, + xdata, rsp_xdata, _gf_false); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "file state " + "check failed, fd %p", + fd); + ret = -EIO; + goto out; + } + } + + posix_update_iatt_buf(statpre, pfd->fd, NULL, xdata); + /* See if we can use FALLOC_FL_ZERO_RANGE to perform the zero fill. + * If it fails, fall back to _posix_do_zerofill() and an optional fsync. + */ + flags = FALLOC_FL_ZERO_RANGE; + ret = sys_fallocate(pfd->fd, flags, offset, len); + if (ret == 0) { + goto fsync; + } else { + ret = -errno; + if ((ret != -ENOSYS) && (ret != -EOPNOTSUPP)) { + goto out; + } + } + + ret = _posix_do_zerofill(pfd->fd, offset, len, pfd->flags & O_DIRECT); + if (ret < 0) { + ret = -errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_ZEROFILL_FAILED, + "zerofill failed on fd %d length %" PRId64, pfd->fd, len); + goto out; + } + +fsync: + if (pfd->flags & (O_SYNC | O_DSYNC)) { + ret = sys_fsync(pfd->fd); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_WRITEV_FAILED, + "fsync() in writev on fd" + "%d failed", + pfd->fd); + ret = -errno; + goto out; + } + } + + ret = posix_fdstat(this, fd->inode, pfd->fd, statpost); + if (ret == -1) { + ret = -errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_FSTAT_FAILED, + "post operation fstat failed on fd=%p", fd); + goto out; + } + + posix_set_ctime(frame, this, NULL, pfd->fd, fd->inode, statpost); + +out: + if (locked) { + pthread_mutex_unlock(&ctx->write_atomic_lock); + locked = _gf_false; + } + SET_TO_OLD_FS_ID(); + + return ret; +} + +int32_t +posix_glfallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t keep_size, off_t offset, size_t len, dict_t *xdata) +{ + int32_t ret; + int32_t flags = 0; + struct iatt statpre = { + 0, + }; + struct iatt statpost = { + 0, + }; + dict_t *rsp_xdata = NULL; + +#ifdef FALLOC_FL_KEEP_SIZE + if (keep_size) + flags = FALLOC_FL_KEEP_SIZE; +#endif /* FALLOC_FL_KEEP_SIZE */ + + 
ret = posix_do_fallocate(frame, this, fd, flags, offset, len, &statpre, + &statpost, xdata, &rsp_xdata); + if (ret < 0) + goto err; + + STACK_UNWIND_STRICT(fallocate, frame, 0, 0, &statpre, &statpost, rsp_xdata); + return 0; + +err: + STACK_UNWIND_STRICT(fallocate, frame, -1, -ret, NULL, NULL, rsp_xdata); + return 0; +} + +int32_t +posix_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + int32_t ret; + dict_t *rsp_xdata = NULL; +#ifndef FALLOC_FL_KEEP_SIZE + ret = EOPNOTSUPP; + +#else /* FALLOC_FL_KEEP_SIZE */ + int32_t flags = FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE; + struct iatt statpre = { + 0, + }; + struct iatt statpost = { + 0, + }; + + ret = posix_do_fallocate(frame, this, fd, flags, offset, len, &statpre, + &statpost, xdata, &rsp_xdata); + if (ret < 0) + goto err; + + STACK_UNWIND_STRICT(discard, frame, 0, 0, &statpre, &statpost, rsp_xdata); + return 0; + +err: +#endif /* FALLOC_FL_KEEP_SIZE */ + STACK_UNWIND_STRICT(discard, frame, -1, -ret, NULL, NULL, rsp_xdata); + return 0; +} + +int32_t +posix_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) +{ + int32_t ret = 0; + struct iatt statpre = { + 0, + }; + struct iatt statpost = { + 0, + }; + struct posix_private *priv = NULL; + int op_ret = -1; + int op_errno = EINVAL; + dict_t *rsp_xdata = NULL; + gf_boolean_t check_space_error = _gf_false; + struct posix_fd *pfd = NULL; + struct stat statbuf = { + 0, + }; + + VALIDATE_OR_GOTO(frame, unwind); + VALIDATE_OR_GOTO(this, unwind); + + priv = this->private; + DISK_SPACE_CHECK_AND_GOTO(frame, priv, xdata, op_ret, op_errno, out); + +overwrite: + check_space_error = _gf_true; + ret = posix_do_zerofill(frame, this, fd, offset, len, &statpre, &statpost, + xdata, &rsp_xdata); + if (ret < 0) { + op_ret = -1; + op_errno = -ret; + goto unwind; + } + + STACK_UNWIND_STRICT(zerofill, frame, 0, 0, &statpre, &statpost, rsp_xdata); + return 0; + +out: + if (op_errno == 
ENOSPC && priv->disk_space_full && !check_space_error) { + ret = posix_fd_ctx_get(fd, this, &pfd, &op_errno); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, ret, P_MSG_PFD_NULL, + "pfd is NULL from fd=%p", fd); + goto out; + } + + if (sys_fstat(pfd->fd, &statbuf) < 0) { + gf_msg(this->name, GF_LOG_WARNING, op_errno, P_MSG_FILE_OP_FAILED, + "%d", pfd->fd); + goto out; + } + + if (offset + len <= statbuf.st_size) { + gf_msg_debug(this->name, 0, + "io vector size will not" + " change disk size so allow overwrite for" + " fd %d", + pfd->fd); + goto overwrite; + } + } + +unwind: + STACK_UNWIND_STRICT(zerofill, frame, op_ret, op_errno, NULL, NULL, + rsp_xdata); + return 0; +} + +int32_t +posix_ipc(call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata) +{ + /* + * IPC is for inter-translator communication. If one gets here, it + * means somebody sent one that nobody else recognized, which is an + * error much like an uncaught exception. + */ + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_IPC_NOT_HANDLE, + "GF_LOG_IPC(%d) not handled", op); + STACK_UNWIND_STRICT(ipc, frame, -1, EOPNOTSUPP, NULL); + return 0; +} + +int32_t +posix_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + gf_seek_what_t what, dict_t *xdata) +{ +#ifdef HAVE_SEEK_HOLE + struct posix_fd *pfd = NULL; + off_t ret = -1; + int err = 0; + int whence = 0; + struct iatt preop = { + 0, + }; + dict_t *rsp_xdata = NULL; + + DECLARE_OLD_FS_ID_VAR; + + SET_FS_ID(frame->root->uid, frame->root->gid); + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(fd, out); + + switch (what) { + case GF_SEEK_DATA: + whence = SEEK_DATA; + break; + case GF_SEEK_HOLE: + whence = SEEK_HOLE; + break; + default: + err = ENOTSUP; + gf_msg(this->name, GF_LOG_ERROR, ENOTSUP, P_MSG_SEEK_UNKOWN, + "don't know what to seek"); + goto out; + } + + ret = posix_fd_ctx_get(fd, this, &pfd, &err); + if (ret < 0) { + gf_msg_debug(this->name, 0, "pfd is NULL from fd=%p", fd); + goto out; 
+ } + + if (xdata) { + ret = posix_fdstat(this, fd->inode, pfd->fd, &preop); + if (ret == -1) { + ret = -errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_FSTAT_FAILED, + "pre-operation fstat failed on fd=%p", fd); + goto out; + } + + ret = posix_cs_maintenance(this, fd, NULL, &pfd->fd, &preop, NULL, + xdata, &rsp_xdata, _gf_false); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "file state check failed, fd %p", fd); + ret = -EIO; + goto out; + } + } + + ret = sys_lseek(pfd->fd, offset, whence); + if (ret == -1) { + err = errno; + gf_msg(this->name, fop_log_level(GF_FOP_SEEK, err), err, + P_MSG_SEEK_FAILED, "seek failed on fd %d length %" PRId64, + pfd->fd, offset); + goto out; + } + +out: + SET_TO_OLD_FS_ID(); + + STACK_UNWIND_STRICT(seek, frame, (ret == -1 ? -1 : 0), err, + (ret == -1 ? -1 : ret), rsp_xdata); +#else + STACK_UNWIND_STRICT(seek, frame, -1, EINVAL, 0, NULL); +#endif + return 0; +} + +int32_t +posix_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) +{ + char *real_path = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + DIR *dir = NULL; + struct posix_fd *pfd = NULL; + + DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(loc, out); + VALIDATE_OR_GOTO(fd, out); + + SET_FS_ID(frame->root->uid, frame->root->gid); + MAKE_INODE_HANDLE(real_path, this, loc, NULL); + if (!real_path) { + op_errno = ESTALE; + goto out; + } + + op_ret = -1; + dir = sys_opendir(real_path); + + if (dir == NULL) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_OPENDIR_FAILED, + "opendir failed on gfid-handle: %s (path: %s)", real_path, + loc->path); + goto out; + } + + op_ret = dirfd(dir); + if (op_ret < 0) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_DIRFD_FAILED, + "dirfd() failed (path: %s, gfid-handle: %s", loc->path, + real_path); + goto out; + } + + pfd = GF_CALLOC(1, sizeof(*pfd), gf_posix_mt_posix_fd); + if 
(!pfd) { + op_errno = errno; + goto out; + } + + pfd->dir = dir; + pfd->dir_eof = -1; + pfd->fd = op_ret; + + op_ret = fd_ctx_set(fd, this, (uint64_t)(long)pfd); + if (op_ret) + gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_FD_PATH_SETTING_FAILED, + "failed to set the fd" + "context path=%s " + "gfid-handle= %s,fd=%p", + loc->path, real_path, fd); + + posix_set_ctime(frame, this, NULL, pfd->fd, fd->inode, NULL); + + op_ret = 0; + +out: + if (op_ret == -1) { + if (dir) { + (void)sys_closedir(dir); + dir = NULL; + } + if (pfd) { + GF_FREE(pfd); + pfd = NULL; + } + } + + SET_TO_OLD_FS_ID(); + STACK_UNWIND_STRICT(opendir, frame, op_ret, op_errno, fd, NULL); + return 0; +} + +static void +posix_add_fd_to_cleanup(xlator_t *this, struct posix_fd *pfd) +{ + glusterfs_ctx_t *ctx = this->ctx; + struct posix_private *priv = this->private; + + pfd->xl = this; + pthread_mutex_lock(&ctx->fd_lock); + { + list_add_tail(&pfd->list, &ctx->janitor_fds); + priv->rel_fdcount++; + pthread_cond_signal(&ctx->fd_cond); + } + pthread_mutex_unlock(&ctx->fd_lock); +} + +int32_t +posix_releasedir(xlator_t *this, fd_t *fd) +{ + struct posix_fd *pfd = NULL; + uint64_t tmp_pfd = 0; + int ret = 0; + + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(fd, out); + + ret = fd_ctx_del(fd, this, &tmp_pfd); + if (ret < 0) { + gf_msg_debug(this->name, 0, "pfd from fd=%p is NULL", fd); + goto out; + } + + pfd = (struct posix_fd *)(long)tmp_pfd; + if (!pfd->dir) { + gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_PFD_NULL, + "pfd->dir is NULL for fd=%p", fd); + goto out; + } + posix_add_fd_to_cleanup(this, pfd); + +out: + return 0; +} + +int32_t +posix_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, + dict_t *xdata) +{ + char *dest = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + char *real_path = NULL; + struct iatt stbuf = { + 0, + }; + + DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(loc, out); + + SET_FS_ID(frame->root->uid, frame->root->gid); + 
+ dest = alloca(size + 1); + + MAKE_INODE_HANDLE(real_path, this, loc, &stbuf); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "lstat on %s failed", loc->path ? loc->path : "<null>"); + goto out; + } + + op_ret = sys_readlink(real_path, dest, size); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_READYLINK_FAILED, + "readlink on gfid-handle: %s (path: %s) failed", real_path, + loc->path); + goto out; + } + + dest[op_ret] = 0; +out: + SET_TO_OLD_FS_ID(); + + STACK_UNWIND_STRICT(readlink, frame, op_ret, op_errno, dest, &stbuf, NULL); + + return 0; +} + +int32_t +posix_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + char *real_path = 0; + struct posix_private *priv = NULL; + struct iatt prebuf = { + 0, + }; + struct iatt postbuf = { + 0, + }; + dict_t *rsp_xdata = NULL; + + DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(loc, out); + + priv = this->private; + VALIDATE_OR_GOTO(priv, out); + + SET_FS_ID(frame->root->uid, frame->root->gid); + + MAKE_INODE_HANDLE(real_path, this, loc, &prebuf); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "pre-operation lstat on (path: %s gfid-handle: %s) " + "failed", + loc->path, real_path ? 
real_path : "<null>"); + goto out; + } + + if (xdata) { + op_ret = posix_cs_maintenance(this, NULL, loc, NULL, &prebuf, real_path, + xdata, &rsp_xdata, _gf_false); + if (op_ret == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "file state check failed, path %s", loc->path); + op_errno = EIO; + goto out; + } + } + + posix_update_iatt_buf(&prebuf, -1, real_path, xdata); + op_ret = sys_truncate(real_path, offset); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_TRUNCATE_FAILED, + "truncate on gfid-handle: %s (path: %s) failed", real_path, + loc->path); + goto out; + } + + op_ret = posix_pstat(this, loc->inode, loc->gfid, real_path, &postbuf, + _gf_false); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "lstat on gfid-handle %s (path: %s) failed", real_path, + loc->path); + goto out; + } + + posix_set_ctime(frame, this, real_path, -1, loc->inode, &postbuf); + + op_ret = 0; +out: + SET_TO_OLD_FS_ID(); + + STACK_UNWIND_STRICT(truncate, frame, op_ret, op_errno, &prebuf, &postbuf, + NULL); + + return 0; +} + +int32_t +posix_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + char *real_path = NULL; + int32_t _fd = -1; + struct posix_fd *pfd = NULL; + struct posix_private *priv = NULL; + struct iatt preop = { + 0, + }; + dict_t *rsp_xdata = NULL; + struct iatt stbuf = { + 0, + }; + + DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(this->private, out); + VALIDATE_OR_GOTO(loc, out); + VALIDATE_OR_GOTO(fd, out); + + priv = this->private; + VALIDATE_OR_GOTO(priv, out); + + if (loc->inode && ((loc->inode->ia_type == IA_IFBLK) || + (loc->inode->ia_type == IA_IFCHR))) { + gf_msg(this->name, GF_LOG_ERROR, EINVAL, P_MSG_INVALID_ARGUMENT, + "open received on a block/char file (%s)", + uuid_utoa(loc->inode->gfid)); + op_errno = 
EINVAL; + goto out; + } + + if (flags & O_CREAT) + DISK_SPACE_CHECK_AND_GOTO(frame, priv, xdata, op_ret, op_errno, out); + + MAKE_INODE_HANDLE(real_path, this, loc, &stbuf); + if (!real_path) { + op_ret = -1; + op_errno = ESTALE; + goto out; + } + + if (IA_ISLNK(stbuf.ia_type)) { + op_ret = -1; + op_errno = ELOOP; + goto out; + } + + op_ret = -1; + SET_FS_ID(frame->root->uid, frame->root->gid); + + if (priv->o_direct) + flags |= O_DIRECT; + + _fd = sys_open(real_path, flags, priv->force_create_mode); + if (_fd == -1) { + op_ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_FILE_OP_FAILED, + "open on gfid-handle %s (path: %s), flags: %d", real_path, + loc->path, flags); + goto out; + } + + posix_set_ctime(frame, this, real_path, -1, loc->inode, &stbuf); + + pfd = GF_CALLOC(1, sizeof(*pfd), gf_posix_mt_posix_fd); + if (!pfd) { + op_errno = errno; + goto out; + } + + pfd->flags = flags; + pfd->fd = _fd; + + if (xdata) { + op_ret = posix_fdstat(this, fd->inode, pfd->fd, &preop); + if (op_ret == -1) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_FSTAT_FAILED, + "pre-operation fstat failed on fd=%p", fd); + GF_FREE(pfd); + goto out; + } + + posix_cs_maintenance(this, fd, NULL, &pfd->fd, &preop, NULL, xdata, + &rsp_xdata, _gf_true); + } + + op_ret = fd_ctx_set(fd, this, (uint64_t)(long)pfd); + if (op_ret) + gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_FD_PATH_SETTING_FAILED, + "failed to set the fd context gfid-handle=%s path=%s fd=%p", + real_path, loc->path, fd); + + op_ret = 0; + +out: + if (op_ret == -1) { + if (_fd != -1) { + sys_close(_fd); + } + } + + SET_TO_OLD_FS_ID(); + + STACK_UNWIND_STRICT(open, frame, op_ret, op_errno, fd, rsp_xdata); + + return 0; +} + +int +posix_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + int _fd = -1; + struct posix_private *priv = NULL; + struct iobuf *iobuf = NULL; + struct iobref *iobref 
= NULL; + struct iovec vec = { + 0, + }; + struct posix_fd *pfd = NULL; + struct iatt stbuf = { + 0, + }; + struct iatt preop = { + 0, + }; + int ret = -1; + dict_t *rsp_xdata = NULL; + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(fd, out); + VALIDATE_OR_GOTO(fd->inode, out); + VALIDATE_OR_GOTO(this->private, out); + + priv = this->private; + VALIDATE_OR_GOTO(priv, out); + + if ((fd->inode->ia_type == IA_IFBLK) || (fd->inode->ia_type == IA_IFCHR)) { + gf_msg(this->name, GF_LOG_ERROR, EINVAL, P_MSG_INVALID_ARGUMENT, + "readv received on a block/char file (%s)", + uuid_utoa(fd->inode->gfid)); + op_errno = EINVAL; + goto out; + } + + ret = posix_fd_ctx_get(fd, this, &pfd, &op_errno); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, op_errno, P_MSG_PFD_NULL, + "pfd is NULL from fd=%p", fd); + goto out; + } + + if (!size) { + op_errno = EINVAL; + gf_msg(this->name, GF_LOG_WARNING, EINVAL, P_MSG_INVALID_ARGUMENT, + "size=%" GF_PRI_SIZET, size); + goto out; + } + + iobuf = iobuf_get_page_aligned(this->ctx->iobuf_pool, size, ALIGN_SIZE); + if (!iobuf) { + op_errno = ENOMEM; + goto out; + } + + _fd = pfd->fd; + + if (xdata) { + op_ret = posix_fdstat(this, fd->inode, _fd, &preop); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_FSTAT_FAILED, + "pre-operation fstat failed on fd=%p", fd); + goto out; + } + op_ret = posix_cs_maintenance(this, fd, NULL, &_fd, &preop, NULL, xdata, + &rsp_xdata, _gf_false); + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "file state check failed, fd %p", fd); + op_errno = EIO; + goto out; + } + } + + posix_update_iatt_buf(&preop, _fd, NULL, xdata); + op_ret = sys_pread(_fd, iobuf->ptr, size, offset); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_READ_FAILED, + "read failed on gfid=%s, " + "fd=%p, offset=%" PRIu64 " size=%" GF_PRI_SIZET + ", " + "buf=%p", + uuid_utoa(fd->inode->gfid), fd, offset, size, 
iobuf->ptr); + goto out; + } + + GF_ATOMIC_ADD(priv->read_value, op_ret); + + vec.iov_base = iobuf->ptr; + vec.iov_len = op_ret; + + iobref = iobref_new(); + + iobref_add(iobref, iobuf); + + /* + * readv successful, and we need to get the stat of the file + * we read from + */ + + op_ret = posix_fdstat(this, fd->inode, _fd, &stbuf); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_FSTAT_FAILED, + "fstat failed on fd=%p", fd); + goto out; + } + + posix_set_ctime(frame, this, NULL, pfd->fd, fd->inode, &stbuf); + + /* Hack to notify higher layers of EOF. */ + if (!stbuf.ia_size || (offset + vec.iov_len) >= stbuf.ia_size) + op_errno = ENOENT; + + op_ret = vec.iov_len; + +out: + + STACK_UNWIND_STRICT(readv, frame, op_ret, op_errno, &vec, 1, &stbuf, iobref, + rsp_xdata); + + if (iobref) + iobref_unref(iobref); + if (iobuf) + iobuf_unref(iobuf); + + return 0; +}

/*
 * Write `count` iovec entries to `fd`, one sys_pwrite() per entry, starting
 * at file offset `offset`.
 *
 * The file offset for each entry is advanced by the number of bytes the
 * previous call reported as written.
 *
 * Returns the total number of bytes written, -EFAULT when `vector` is NULL,
 * or -errno of the first failing sys_pwrite() (bytes written by earlier
 * entries are not reported in that case).
 */
int32_t
__posix_pwritev(int fd, struct iovec *vector, int count, off_t offset)
{
    int32_t op_ret = 0;
    int idx = 0;
    int retval = 0;
    off_t internal_off = 0;

    if (!vector)
        return -EFAULT;

    internal_off = offset;
    for (idx = 0; idx < count; idx++) {
        retval = sys_pwrite(fd, vector[idx].iov_base, vector[idx].iov_len,
                            internal_off);
        if (retval == -1) {
            op_ret = -errno;
            goto err;
        }
        op_ret += retval;
        internal_off += retval;
    }

err:
    return op_ret;
}

/*
 * Write `count` iovec entries to `fd` starting at file offset `startoff`.
 *
 * When `odirect` is zero this is a plain __posix_pwritev(). Otherwise each
 * entry is first copied into a single buffer obtained from
 * _page_aligned_alloc() (sized for the largest entry) and written from
 * there.
 *
 * Returns the total number of bytes written, or a negative errno value on
 * failure.
 */
int32_t
__posix_writev(int fd, struct iovec *vector, int count, off_t startoff,
               int odirect)
{
    int32_t op_ret = 0;
    int idx = 0;
    size_t max_buf_size = 0; /* iov_len is size_t; avoid int truncation */
    int retval = 0;
    char *buf = NULL;
    char *alloc_buf = NULL;
    off_t internal_off = 0;

    /* Check for the O_DIRECT flag during open() */
    if (!odirect)
        return __posix_pwritev(fd, vector, count, startoff);

    for (idx = 0; idx < count; idx++) {
        if (max_buf_size < vector[idx].iov_len)
            max_buf_size = vector[idx].iov_len;
    }

    alloc_buf = _page_aligned_alloc(max_buf_size, &buf);
    if (!alloc_buf) {
        /* Do not rely on errno: if the allocator left it at 0, "-errno"
         * would be returned as "0 bytes written, no error". */
        op_ret = -ENOMEM;
        goto err;
    }

    internal_off = startoff;
    for (idx = 0; idx < count; idx++) {
        memcpy(buf, vector[idx].iov_base, vector[idx].iov_len);

        /* not sure whether writev works on O_DIRECT'd fd */
        retval = sys_pwrite(fd, buf, vector[idx].iov_len, internal_off);
        if (retval == -1) {
            op_ret = -errno;
            goto err;
        }

        op_ret += retval;
        internal_off += retval;
    }

err:
    GF_FREE(alloc_buf);

    return op_ret;
}

/*
 * Build the response dictionary for a write.
 *
 * For each of GLUSTERFS_OPEN_FD_COUNT, GLUSTERFS_ACTIVE_FD_COUNT and
 * GLUSTERFS_WRITE_IS_APPEND that is present in the request `xdata`, the
 * corresponding value (inode fd_count, inode active_fd_count, `is_append`)
 * is stored in a newly created dictionary. A failure to set a key is logged
 * and otherwise ignored.
 *
 * Returns the new dictionary, or NULL when `fd`, its inode or the inode's
 * gfid is missing, when `xdata` is NULL, or when dict_new() fails.
 */
dict_t *
_fill_writev_xdata(fd_t *fd, dict_t *xdata, xlator_t *this, int is_append)
{
    dict_t *rsp_xdata = NULL;
    int32_t ret = 0;
    inode_t *inode = NULL;

    if (fd)
        inode = fd->inode;

    if (!fd || !fd->inode || gf_uuid_is_null(fd->inode->gfid)) {
        gf_msg_callingfn(this->name, GF_LOG_ERROR, EINVAL, P_MSG_XATTR_FAILED,
                         "fd: %p inode: %p"
                         "gfid:%s",
                         fd, inode, inode ? uuid_utoa(inode->gfid) : "N/A");
        goto out;
    }

    if (!xdata)
        goto out;

    rsp_xdata = dict_new();
    if (!rsp_xdata)
        goto out;

    if (dict_get(xdata, GLUSTERFS_OPEN_FD_COUNT)) {
        ret = dict_set_uint32(rsp_xdata, GLUSTERFS_OPEN_FD_COUNT,
                              fd->inode->fd_count);
        if (ret < 0) {
            gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_DICT_SET_FAILED,
                   "%s: Failed to set "
                   "dictionary value for %s",
                   uuid_utoa(fd->inode->gfid), GLUSTERFS_OPEN_FD_COUNT);
        }
    }

    if (dict_get(xdata, GLUSTERFS_ACTIVE_FD_COUNT)) {
        ret = dict_set_uint32(rsp_xdata, GLUSTERFS_ACTIVE_FD_COUNT,
                              fd->inode->active_fd_count);
        if (ret < 0) {
            gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_DICT_SET_FAILED,
                   "%s: Failed to set "
                   "dictionary value for %s",
                   uuid_utoa(fd->inode->gfid), GLUSTERFS_ACTIVE_FD_COUNT);
        }
    }

    if (dict_get(xdata, GLUSTERFS_WRITE_IS_APPEND)) {
        ret = dict_set_uint32(rsp_xdata, GLUSTERFS_WRITE_IS_APPEND, is_append);
        if (ret < 0) {
            gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_DICT_SET_FAILED,
                   "%s: Failed to set "
                   "dictionary value for %s",
                   uuid_utoa(fd->inode->gfid), GLUSTERFS_WRITE_IS_APPEND);
        }
    }
out:
    return rsp_xdata;
}

+int32_t
+posix_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, uint32_t flags, + struct iobref *iobref, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + int _fd = -1; + struct posix_private *priv = NULL; + struct posix_fd *pfd = NULL; + struct iatt preop = { + 0, + }; + struct iatt postop = { + 0, + }; + int ret = -1; + dict_t *rsp_xdata = NULL; + int is_append = 0; + gf_boolean_t locked = _gf_false; + gf_boolean_t write_append = _gf_false; + gf_boolean_t update_atomic = _gf_false; + posix_inode_ctx_t *ctx = NULL; + gf_boolean_t check_space_error = _gf_false; + struct stat statbuf = { + 0, + }; + int totlen = 0; + int idx = 0; + + VALIDATE_OR_GOTO(frame, unwind); + VALIDATE_OR_GOTO(this, unwind); + VALIDATE_OR_GOTO(fd, unwind); + VALIDATE_OR_GOTO(fd->inode, unwind); + VALIDATE_OR_GOTO(vector, unwind); + VALIDATE_OR_GOTO(this->private, unwind); + + priv = this->private; + + VALIDATE_OR_GOTO(priv, unwind); + DISK_SPACE_CHECK_AND_GOTO(frame, priv, xdata, op_ret, op_errno, out); + +overwrite: + + check_space_error = _gf_true; + if ((fd->inode->ia_type == IA_IFBLK) || (fd->inode->ia_type == IA_IFCHR)) { + gf_msg(this->name, GF_LOG_ERROR, EINVAL, P_MSG_INVALID_ARGUMENT, + "writev received on a block/char file (%s)", + uuid_utoa(fd->inode->gfid)); + op_errno = EINVAL; + goto out; + } + + ret = posix_fd_ctx_get(fd, this, &pfd, &op_errno); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, ret, P_MSG_PFD_NULL, + "pfd is NULL from fd=%p", fd); + goto out; + } + + _fd = pfd->fd; + + ret = posix_check_internal_writes(this, fd, _fd, xdata); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "possible overwrite from internal client, fd=%p", fd); + op_ret = -1; + op_errno = EBUSY; + goto out; + } + + if (xdata) { + if (dict_get(xdata, GLUSTERFS_WRITE_IS_APPEND)) + write_append = _gf_true; + if (dict_get(xdata, GLUSTERFS_WRITE_UPDATE_ATOMIC)) + update_atomic = _gf_true; + } + + /* The write_is_append 
check and write must happen + atomically. Else another write can overtake this + write after the check and get written earlier. + + So lock before preop-stat and unlock after write. + */ + + /* + * The update_atomic option is to instruct posix to do prestat, + * write and poststat atomically. This is to prevent any modification to + * ia_size and ia_blocks until poststat and the diff in their values + * between pre and poststat could be of use for some translators (shard + * as of today). + */ + + op_ret = posix_inode_ctx_get_all(fd->inode, this, &ctx); + if (op_ret < 0) { + op_errno = ENOMEM; + goto out; + } + + if (write_append || update_atomic) { + locked = _gf_true; + pthread_mutex_lock(&ctx->write_atomic_lock); + } + + op_ret = posix_fdstat(this, fd->inode, _fd, &preop); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_FSTAT_FAILED, + "pre-operation fstat failed on fd=%p", fd); + goto out; + } + + if (xdata) { + op_ret = posix_cs_maintenance(this, fd, NULL, &_fd, &preop, NULL, xdata, + &rsp_xdata, _gf_false); + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "file state check failed, fd %p", fd); + op_errno = EIO; + goto out; + } + } + + posix_update_iatt_buf(&preop, _fd, NULL, xdata); + if (locked && write_append) { + if (preop.ia_size == offset || (fd->flags & O_APPEND)) + is_append = 1; + } + + op_ret = __posix_writev(_fd, vector, count, offset, + (pfd->flags & O_DIRECT)); + + if (locked && (!update_atomic)) { + pthread_mutex_unlock(&ctx->write_atomic_lock); + locked = _gf_false; + } + + if (op_ret < 0) { + op_errno = -op_ret; + op_ret = -1; + gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_WRITE_FAILED, + "write failed: offset %" PRIu64 ",", offset); + goto out; + } + + rsp_xdata = _fill_writev_xdata(fd, xdata, this, is_append); + /* writev successful, we also need to get the stat of + * the file we wrote to + */ + + ret = posix_fdstat(this, fd->inode, _fd, &postop); + if (ret == -1) { + op_ret = -1; + 
op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_FSTAT_FAILED, + "post-operation fstat failed on fd=%p", fd); + goto out; + } + + posix_set_ctime(frame, this, NULL, pfd->fd, fd->inode, &postop); + + if (locked) { + pthread_mutex_unlock(&ctx->write_atomic_lock); + locked = _gf_false; + } + + if (flags & (O_SYNC | O_DSYNC)) { + ret = sys_fsync(_fd); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_WRITEV_FAILED, + "fsync() in writev on fd %d failed", _fd); + op_ret = -1; + op_errno = errno; + goto out; + } + } + + GF_ATOMIC_ADD(priv->write_value, op_ret); + +out: + + if (locked) { + pthread_mutex_unlock(&ctx->write_atomic_lock); + locked = _gf_false; + } + + if (op_errno == ENOSPC && priv->disk_space_full && !check_space_error) { + ret = posix_fd_ctx_get(fd, this, &pfd, &op_errno); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, ret, P_MSG_PFD_NULL, + "pfd is NULL from fd=%p", fd); + goto unwind; + } + + if (sys_fstat(pfd->fd, &statbuf) < 0) { + gf_msg(this->name, GF_LOG_WARNING, op_errno, P_MSG_FILE_OP_FAILED, + "%d", pfd->fd); + goto unwind; + } + + for (idx = 0; idx < count; idx++) { + totlen = vector[idx].iov_len; + } + + if ((offset + totlen <= statbuf.st_size) && + !(statbuf.st_blocks * statbuf.st_blksize < statbuf.st_size)) { + gf_msg_debug(this->name, 0, + "io vector size will not" + " change disk size so allow overwrite for" + " fd %d", + pfd->fd); + goto overwrite; + } + } + +unwind: + STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, &preop, &postop, + rsp_xdata); + + if (rsp_xdata) + dict_unref(rsp_xdata); + return 0; +} + +int32_t +posix_copy_file_range(call_frame_t *frame, xlator_t *this, fd_t *fd_in, + off64_t off_in, fd_t *fd_out, off64_t off_out, size_t len, + uint32_t flags, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + int _fd_in = -1; + int _fd_out = -1; + struct posix_private *priv = NULL; + struct posix_fd *pfd_in = NULL; + struct posix_fd *pfd_out = NULL; + struct iatt preop_dst = { + 
0, + }; + struct iatt postop_dst = { + 0, + }; + struct iatt stbuf = { + 0, + }; + int ret = -1; + dict_t *rsp_xdata = NULL; + int is_append = 0; + gf_boolean_t locked = _gf_false; + gf_boolean_t update_atomic = _gf_false; + posix_inode_ctx_t *ctx = NULL; + char in_uuid_str[64] = {0}, out_uuid_str[64] = {0}; + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(fd_in, out); + VALIDATE_OR_GOTO(fd_in->inode, out); + VALIDATE_OR_GOTO(fd_out, out); + VALIDATE_OR_GOTO(fd_out->inode, out); + VALIDATE_OR_GOTO(this->private, out); + + priv = this->private; + + VALIDATE_OR_GOTO(priv, out); + DISK_SPACE_CHECK_AND_GOTO(frame, priv, xdata, op_ret, op_errno, out); + + if (posix_check_dev_file(this, fd_in->inode, "copy_file_range", &op_errno)) + goto out; + + if (posix_check_dev_file(this, fd_out->inode, "copy_file_range", &op_errno)) + goto out; + + ret = posix_fd_ctx_get(fd_in, this, &pfd_in, &op_errno); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, ret, P_MSG_PFD_NULL, + "pfd is NULL from fd=%p", fd_in); + goto out; + } + + _fd_in = pfd_in->fd; + + ret = posix_fd_ctx_get(fd_out, this, &pfd_out, &op_errno); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, ret, P_MSG_PFD_NULL, + "pfd is NULL from fd=%p", fd_out); + goto out; + } + + _fd_out = pfd_out->fd; + + /* + * Currently, the internal write is checked via xdata which + * is set by some xlator above. It could be due to several of + * the reasons such as healing or a snapshot operation happening + * using copy_file_range. As of now (i.e. writing the patch with + * this change) none of the xlators above posix are using the + * internal write with copy_file_range. In future it might + * change. Atleast as of now the hope is that, when that happens + * this functon or fop does not require additional changes for + * handling internal writes. 
+ */ + ret = posix_check_internal_writes(this, fd_out, _fd_out, xdata); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "possible overwrite from internal client, fd=%p", fd_out); + op_ret = -1; + op_errno = EBUSY; + goto out; + } + + if (xdata) { + if (dict_get(xdata, GLUSTERFS_WRITE_UPDATE_ATOMIC)) + update_atomic = _gf_true; + } + + /* + * The update_atomic option is to instruct posix to do prestat, + * write and poststat atomically. This is to prevent any modification to + * ia_size and ia_blocks until poststat and the diff in their values + * between pre and poststat could be of use for some translators. + * This is similar to the atomic write operation. atmoic write is + * (i.e. prestat + write + poststat) used by shard as of now. In case, + * some xlator needs copy_file_range to be atomic from prestat and postat + * prespective (i.e. prestat + copy_file_range + poststat) then it has + * to send "GLUSTERFS_WRITE_UPDATE_ATOMIC" key in xdata. + */ + + op_ret = posix_inode_ctx_get_all(fd_out->inode, this, &ctx); + if (op_ret < 0) { + op_errno = ENOMEM; + goto out; + } + + if (update_atomic) { + ret = pthread_mutex_lock(&ctx->write_atomic_lock); + if (!ret) + locked = _gf_true; + else { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_MUTEX_FAILED, + "failed to hold write atomic lock on %s", + uuid_utoa(fd_out->inode->gfid)); + goto out; + } + } + + op_ret = posix_fdstat(this, fd_out->inode, _fd_out, &preop_dst); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_FSTAT_FAILED, + "pre-operation fstat failed on fd=%p", fd_out); + goto out; + } + + /* + * Since, only the destination file (fd_out) is undergoing + * modification, the write related tests are done on that. + * i.e. this is treater similar to as if the destination file + * undergoing write fop from maintenance perspective. 
+ */ + if (xdata) { + op_ret = posix_cs_maintenance(this, fd_out, NULL, &_fd_out, &preop_dst, + NULL, xdata, &rsp_xdata, _gf_false); + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "file state check failed, fd %p", fd_out); + op_errno = EIO; + goto out; + } + } + + /* + * NOTE: This is just doing a single execution of copy_file_range + * system call. If the returned value of this system call is less + * than len, then should we keep doing it in a for loop until the + * copy_file_range of all the len bytes is done? + * Check the example program provided in the man page of + * copy_file_range. + * If so, then a separate variables for both off_in and off_out + * should be used which are initialized to off_in and off_out + * that this function call receives, but then advanced by the + * value returned by sys_copy_file_range and then use that as + * off_in and off_out for next instance of copy_file_range execution. + */ + op_ret = sys_copy_file_range(_fd_in, &off_in, _fd_out, &off_out, len, + flags); + + if (op_ret < 0) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_COPY_FILE_RANGE_FAILED, + "copy_file_range failed: fd_in: %p (gfid: %s) ," + " fd_out %p (gfid:%s)", + fd_in, uuid_utoa_r(fd_in->inode->gfid, in_uuid_str), fd_out, + uuid_utoa_r(fd_out->inode->gfid, out_uuid_str)); + goto out; + } + + /* + * Let this be as it is for now. This function collects + * infomration such as open fd count etc. So, even though + * is_append does not apply to copy_file_range, for now, + * allowing it to be recorded in the dict as _gf_false. + */ + rsp_xdata = _fill_writev_xdata(fd_out, xdata, this, is_append); + + /* copy_file_range successful, we also need to get the stat of + * the file we wrote to (i.e. destination file or fd_out). 
+ */ + ret = posix_fdstat(this, fd_out->inode, _fd_out, &postop_dst); + if (ret == -1) { + op_ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_FSTAT_FAILED, + "post-operation fstat failed on fd=%p", fd_out); + goto out; + } + + /* + * Also perform the stat on the source fd (i.e. fd_in). For now, + * allowing it to be done within the locked region if the request + * is for atomic operation (and update) of copy_file_range. + */ + ret = posix_fdstat(this, fd_in->inode, _fd_in, &stbuf); + if (ret == -1) { + op_ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_FSTAT_FAILED, + "post-operation fstat failed on fd=%p", fd_in); + goto out; + } + + /* + * The core logic of what time attributes are to be updated + * on a fop is decided at client side xlator utime. + * All the remaining fops call posix_set_ctime function + * to update the {a,m,c}time. But, for all the other fops, + * the operation is happening on only one file (or inode). + * But here, there are 2 fds (source and destination). Hence + * the new function below to update the appropriate times for + * both the source and the destination file. + * For the source file, if at all anything has to be updated, + * it would be atime (as that file is only read, not updated). + * For the destination file, the attributes that require the + * modification would be mtime and ctime. + * What times have to be changed is actually determined by + * utime xlator. But, all of them would be in frame->root->flags. + * So, currently posix assumes that, the atime flag is for + * the source file and the other 2 flags are for the destination + * file. Since, the assumption is rigid (i.e. atime for source + * and {m,c}time for destination), the below function is called + * posix_set_ctime_cfr (cfr standing for copy_file_range). + * FUTURE TODO: + * In future, some other functionality or fop might operate + * simultaneously on 2 files. 
Then, depending upon what that new + * fop does or what are its requirements, the below function might + * require changes to become generic for consumption in case of + * simultaneous operations on 2 files. + */ + posix_set_ctime_cfr(frame, this, NULL, pfd_in->fd, fd_in->inode, &stbuf, + NULL, pfd_out->fd, fd_out->inode, &postop_dst); + + if (locked) { + pthread_mutex_unlock(&ctx->write_atomic_lock); + locked = _gf_false; + } + + /* + * Record copy_file_range in priv->write_value for now. + * If not needed, remove below section of code along with + * this comment (or add comment to explain why it is not + * needed). + */ + GF_ATOMIC_ADD(priv->write_value, op_ret); + +out: + + if (locked) { + pthread_mutex_unlock(&ctx->write_atomic_lock); + locked = _gf_false; + } + + STACK_UNWIND_STRICT(copy_file_range, frame, op_ret, op_errno, &stbuf, + &preop_dst, &postop_dst, rsp_xdata); + + if (rsp_xdata) + dict_unref(rsp_xdata); + return 0; +} + +int32_t +posix_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + char *real_path = NULL; + int32_t op_ret = -1; + int32_t op_errno = 0; + struct statvfs buf = { + 0, + }; + struct posix_private *priv = NULL; + int shared_by = 1; + double percent = 0; + uint64_t reserved_blocks = 0; + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(loc, out); + VALIDATE_OR_GOTO(this->private, out); + + MAKE_INODE_HANDLE(real_path, this, loc, NULL); + if (!real_path) { + op_ret = -1; + op_errno = ESTALE; + goto out; + } + + priv = this->private; + + op_ret = sys_statvfs(real_path, &buf); + + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_STATVFS_FAILED, + "statvfs failed on gfid-handle %s (path: %s)", real_path, + loc->path); + goto out; + } + + if (priv->disk_unit == 'p') { + percent = priv->disk_reserve; + reserved_blocks = (((buf.f_blocks * percent) / 100) + 0.5); + } else { + if (buf.f_bsize) { + reserved_blocks = (priv->disk_reserve + buf.f_bsize - 
1) / + buf.f_bsize; + } + } + + if (buf.f_bfree > reserved_blocks) { + buf.f_bfree = (buf.f_bfree - reserved_blocks); + if (buf.f_bavail > buf.f_bfree) { + buf.f_bavail = buf.f_bfree; + } + } else { + buf.f_bfree = 0; + buf.f_bavail = 0; + } + + shared_by = priv->shared_brick_count; + if (shared_by > 1) { + buf.f_blocks /= shared_by; + buf.f_bfree /= shared_by; + buf.f_bavail /= shared_by; + buf.f_files /= shared_by; + buf.f_ffree /= shared_by; + buf.f_favail /= shared_by; + } + + if (!priv->export_statfs) { + buf.f_blocks = 0; + buf.f_bfree = 0; + buf.f_bavail = 0; + buf.f_files = 0; + buf.f_ffree = 0; + buf.f_favail = 0; + } + + op_ret = 0; + +out: + STACK_UNWIND_STRICT(statfs, frame, op_ret, op_errno, &buf, NULL); + return 0; +} + +int32_t +posix_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + int ret = -1; + struct posix_fd *pfd = NULL; + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(fd, out); + + ret = posix_fd_ctx_get(fd, this, &pfd, &op_errno); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, op_errno, P_MSG_PFD_NULL, + "pfd is NULL on fd=%p", fd); + goto out; + } + + op_ret = 0; + +out: + STACK_UNWIND_STRICT(flush, frame, op_ret, op_errno, NULL); + + return 0; +} + +int32_t +posix_release(xlator_t *this, fd_t *fd) +{ + struct posix_fd *pfd = NULL; + int ret = -1; + uint64_t tmp_pfd = 0; + + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(fd, out); + + ret = fd_ctx_del(fd, this, &tmp_pfd); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_PFD_NULL, + "pfd is NULL from fd=%p", fd); + goto out; + } + + pfd = (struct posix_fd *)(long)tmp_pfd; + if (pfd->dir) { + gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_DIR_NOT_NULL, + "pfd->dir is %p (not NULL) for file fd=%p", pfd->dir, fd); + } + + posix_add_fd_to_cleanup(this, pfd); + +out: + return 0; +} + +int +posix_batch_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync, + 
dict_t *xdata) +{ + call_stub_t *stub = NULL; + struct posix_private *priv = NULL; + + priv = this->private; + + stub = fop_fsync_stub(frame, default_fsync, fd, datasync, xdata); + if (!stub) { + STACK_UNWIND_STRICT(fsync, frame, -1, ENOMEM, 0, 0, 0); + return 0; + } + + pthread_mutex_lock(&priv->fsync_mutex); + { + list_add_tail(&stub->list, &priv->fsyncs); + priv->fsync_queue_count++; + pthread_cond_signal(&priv->fsync_cond); + } + pthread_mutex_unlock(&priv->fsync_mutex); + + return 0; +} + +int32_t +posix_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + int _fd = -1; + struct posix_fd *pfd = NULL; + int ret = -1; + struct iatt preop = { + 0, + }; + struct iatt postop = { + 0, + }; + struct posix_private *priv = NULL; + + DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(fd, out); + + SET_FS_ID(frame->root->uid, frame->root->gid); + +#ifdef GF_DARWIN_HOST_OS + /* Always return success in case of fsync in MAC OS X */ + op_ret = 0; + goto out; +#endif + + priv = this->private; + + if (priv->batch_fsync_mode && xdata && dict_get(xdata, "batch-fsync")) { + posix_batch_fsync(frame, this, fd, datasync, xdata); + return 0; + } + + ret = posix_fd_ctx_get(fd, this, &pfd, &op_errno); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, op_errno, P_MSG_PFD_NULL, + "pfd not found in fd's ctx"); + goto out; + } + + _fd = pfd->fd; + + op_ret = posix_fdstat(this, fd->inode, _fd, &preop); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_FSTAT_FAILED, + "pre-operation fstat failed on fd=%p", fd); + goto out; + } + + if (datasync) { + op_ret = sys_fdatasync(_fd); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_FSYNC_FAILED, + "fdatasync on fd=%p" + "failed:", + fd); + goto out; + } + } else { + op_ret = sys_fsync(_fd); + if (op_ret == -1) { + 
op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_FSYNC_FAILED, + "fsync on fd=%p " + "failed", + fd); + goto out; + } + } + + op_ret = posix_fdstat(this, fd->inode, _fd, &postop); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_FSTAT_FAILED, + "post-operation fstat failed on fd=%p", fd); + goto out; + } + + op_ret = 0; + +out: + SET_TO_OLD_FS_ID(); + + STACK_UNWIND_STRICT(fsync, frame, op_ret, op_errno, &preop, &postop, NULL); + + return 0; +} + +static int gf_posix_xattr_enotsup_log; +static int +_handle_setxattr_keyvalue_pair(dict_t *d, char *k, data_t *v, void *tmp) +{ + posix_xattr_filler_t *filler = NULL; + + filler = tmp; + + return posix_handle_pair(filler->this, filler->loc, filler->real_path, k, v, + filler->flags, filler->stbuf); +} + +#ifdef GF_DARWIN_HOST_OS +static int +map_xattr_flags(int flags) +{ + /* DARWIN has different defines on XATTR_ flags. + There do not seem to be a POSIX standard + Parse any other flags over. 
+ */ + int darwinflags = flags & + ~(GF_XATTR_CREATE | GF_XATTR_REPLACE | XATTR_REPLACE); + if (GF_XATTR_CREATE & flags) + darwinflags |= XATTR_CREATE; + if (GF_XATTR_REPLACE & flags) + darwinflags |= XATTR_REPLACE; + return darwinflags; +} +#endif + +int32_t +posix_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int flags, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + char *real_path = NULL; + char *acl_xattr = NULL; + struct iatt preop = {0}; + struct iatt postop = {0}; + int32_t ret = 0; + ssize_t acl_size = 0; + dict_t *xattr = NULL; + dict_t *subvol_xattrs = NULL; + posix_xattr_filler_t filler = { + 0, + }; + struct posix_private *priv = NULL; + struct iatt tmp_stbuf = { + 0, + }; + data_t *tdata = NULL; + char *cs_var = NULL; + gf_cs_obj_state state = -1; + int i = 0; + int len; + struct mdata_iatt mdata_iatt = { + 0, + }; + int8_t sync_backend_xattrs = _gf_false; + data_pair_t *custom_xattrs; + data_t *keyval = NULL; + char **xattrs_to_heal = get_xattrs_to_heal(); + + DECLARE_OLD_FS_ID_VAR; + SET_FS_ID(frame->root->uid, frame->root->gid); + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(this->private, out); + VALIDATE_OR_GOTO(loc, out); + VALIDATE_OR_GOTO(dict, out); + + priv = this->private; + DISK_SPACE_CHECK_AND_GOTO(frame, priv, xdata, op_ret, op_errno, out); + + MAKE_INODE_HANDLE(real_path, this, loc, NULL); + if (!real_path) { + op_ret = -1; + op_errno = ESTALE; + goto out; + } + + ret = dict_get_mdata(dict, CTIME_MDATA_XDATA_KEY, &mdata_iatt); + if (ret == 0) { + /* This is initiated by lookup when ctime feature is enabled to create + * "trusted.glusterfs.mdata" xattr if not present. These are the files + * which were created when ctime feature is disabled. 
+ */ + ret = posix_set_mdata_xattr_legacy_files(this, loc->inode, real_path, + &mdata_iatt, &op_errno); + if (ret != 0) { + op_ret = -1; + } + goto out; + } + + posix_pstat(this, loc->inode, loc->gfid, real_path, &preop, _gf_false); + + op_ret = -1; + + dict_del(dict, GFID_XATTR_KEY); + dict_del(dict, GF_XATTR_VOL_ID_KEY); + /* the io-stats-dump key should not reach disk */ + dict_del(dict, GF_XATTR_IOSTATS_DUMP_KEY); + + tdata = dict_get(dict, GF_CS_OBJECT_UPLOAD_COMPLETE); + if (tdata) { + /*TODO: move the following to a different function */ + LOCK(&loc->inode->lock); + { + state = posix_cs_check_status(this, real_path, NULL, &preop); + if (state != GF_CS_LOCAL) { + op_errno = EINVAL; + ret = posix_cs_set_state(this, &xattr, state, real_path, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "set state failed"); + } + goto unlock; + } + + ret = posix_pstat(this, loc->inode, loc->gfid, real_path, + &tmp_stbuf, _gf_true); + if (ret) { + op_errno = EINVAL; + goto unlock; + } + + cs_var = alloca(4096); + sprintf(cs_var, "%" PRId64, tmp_stbuf.ia_mtime); + + /*TODO: may be should consider nano-second also */ + if (strncmp(cs_var, tdata->data, tdata->len) > 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "mtime " + "passed is different from seen by file now." + " Will skip truncating the file"); + ret = -1; + op_errno = EINVAL; + goto unlock; + } + + len = sprintf(cs_var, "%" PRIu64, tmp_stbuf.ia_size); + + ret = sys_lsetxattr(real_path, GF_CS_OBJECT_SIZE, cs_var, len, + flags); + if (ret) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "setxattr failed. key %s err %d", GF_CS_OBJECT_SIZE, + ret); + goto unlock; + } + + len = sprintf(cs_var, "%" PRIu64, tmp_stbuf.ia_blocks); + + ret = sys_lsetxattr(real_path, GF_CS_NUM_BLOCKS, cs_var, len, + flags); + if (ret) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "setxattr failed. 
key %s err %d", GF_CS_NUM_BLOCKS, ret); + goto unlock; + } + + len = sprintf(cs_var, "%" PRIu32, tmp_stbuf.ia_blksize); + + ret = sys_lsetxattr(real_path, GF_CS_BLOCK_SIZE, cs_var, len, + flags); + if (ret) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "setxattr failed. key %s err %d", GF_CS_BLOCK_SIZE, ret); + goto unlock; + } + + memset(cs_var, 0, 4096); + if (loc->path[0] == '/') { + for (i = 1; i < strlen(loc->path); i++) { + cs_var[i - 1] = loc->path[i]; + } + + cs_var[i] = '\0'; + gf_msg_debug(this->name, GF_LOG_ERROR, "remotepath %s", cs_var); + } + + ret = sys_lsetxattr(real_path, GF_CS_OBJECT_REMOTE, cs_var, + strlen(cs_var), flags); + if (ret) { + op_errno = errno; + gf_log("POSIX", GF_LOG_ERROR, + "setxattr failed - %s" + " %d", + GF_CS_OBJECT_SIZE, ret); + goto unlock; + } + + ret = sys_truncate(real_path, 0); + if (ret) { + op_errno = errno; + gf_log("POSIX", GF_LOG_ERROR, + "truncate failed - %s" + " %d", + GF_CS_OBJECT_SIZE, ret); + ret = sys_lremovexattr(real_path, GF_CS_OBJECT_REMOTE); + if (ret) { + op_errno = errno; + gf_log("POSIX", GF_LOG_ERROR, + "removexattr " + "failed post processing- %s" + " %d", + GF_CS_OBJECT_SIZE, ret); + } + goto unlock; + } else { + state = GF_CS_REMOTE; + ret = posix_cs_set_state(this, &xattr, state, real_path, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "set state failed"); + } + } + } + unlock: + UNLOCK(&loc->inode->lock); + op_ret = ret; + goto out; + } + + filler.real_path = real_path; + filler.this = this; + filler.stbuf = &preop; + filler.loc = loc; + +#ifdef GF_DARWIN_HOST_OS + filler.flags = map_xattr_flags(flags); +#else + filler.flags = flags; +#endif + op_ret = dict_foreach(dict, _handle_setxattr_keyvalue_pair, &filler); + if (op_ret < 0) { + op_errno = -op_ret; + op_ret = -1; + goto out; + } + + ret = dict_get_int8(xdata, "sync_backend_xattrs", &sync_backend_xattrs); + if (ret) { + gf_msg_debug(this->name, -ret, "Unable to get sync_backend_xattrs"); + } + + if 
(sync_backend_xattrs) { + /* List all custom xattrs */ + subvol_xattrs = dict_new(); + if (!subvol_xattrs) + goto out; + + ret = dict_set_int32_sizen(xdata, "list-xattr", 1); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, ENOMEM, + "Unable to set list-xattr in dict "); + goto out; + } + + subvol_xattrs = posix_xattr_fill(this, real_path, loc, NULL, -1, xdata, + NULL); + + /* Remove all user xattrs from the file */ + dict_foreach_fnmatch(subvol_xattrs, "user.*", posix_delete_user_xattr, + real_path); + + /* Remove all custom xattrs from the file */ + for (i = 1; xattrs_to_heal[i]; i++) { + keyval = dict_get(subvol_xattrs, xattrs_to_heal[i]); + if (keyval) { + ret = sys_lremovexattr(real_path, xattrs_to_heal[i]); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, P_MSG_XATTR_NOT_REMOVED, + errno, "removexattr failed. key %s path %s", + xattrs_to_heal[i], loc->path); + goto out; + } + + dict_del(subvol_xattrs, xattrs_to_heal[i]); + keyval = NULL; + } + } + + /* Set custom xattrs based on info provided by DHT */ + custom_xattrs = dict->members_list; + + while (custom_xattrs != NULL) { + ret = sys_lsetxattr(real_path, custom_xattrs->key, + custom_xattrs->value->data, + custom_xattrs->value->len, flags); + if (ret) { + op_errno = errno; + gf_log(this->name, GF_LOG_ERROR, "setxattr failed - %s %d", + custom_xattrs->key, ret); + goto out; + } + + custom_xattrs = custom_xattrs->next; + } + } + + xattr = dict_new(); + if (!xattr) + goto out; + + /* + * FIXFIX: Send the stbuf info in the xdata for now + * This is used by DHT to redirect FOPs if the file is being migrated + * Ignore errors for now + */ + ret = posix_pstat(this, loc->inode, loc->gfid, real_path, &postop, + _gf_false); + if (ret) + goto out; + + ret = posix_set_iatt_in_dict(xattr, &preop, &postop); + + /* + * ACL can be set on a file/folder using GF_POSIX_ACL_*_KEY xattrs which + * won't aware of access-control xlator. 
To update its context correctly, + * POSIX_ACL_*_XATTR stored in xdata which is send in the call_back path. + */ + if (dict_get(dict, GF_POSIX_ACL_ACCESS)) { + /* + * The size of buffer will be know after calling sys_lgetxattr, + * so first we allocate buffer with large size(~4k), then we + * reduced into required size using GF_REALLO(). + */ + acl_xattr = GF_CALLOC(1, ACL_BUFFER_MAX, gf_posix_mt_char); + if (!acl_xattr) + goto out; + + acl_size = sys_lgetxattr(real_path, POSIX_ACL_ACCESS_XATTR, acl_xattr, + ACL_BUFFER_MAX); + + if (acl_size < 0) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_XATTR_FAILED, + "Posix acl is not set " + "properly at the backend"); + goto out; + } + + /* If acl_size is more than max buffer size, just ignore it */ + if (acl_size >= ACL_BUFFER_MAX) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, P_MSG_BUFFER_OVERFLOW, + "size of acl is more" + "than the buffer"); + goto out; + } + + acl_xattr = GF_REALLOC(acl_xattr, acl_size); + if (!acl_xattr) + goto out; + + ret = dict_set_bin(xattr, POSIX_ACL_ACCESS_XATTR, acl_xattr, acl_size); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_SET_XDATA_FAIL, + "failed to set" + "xdata for acl"); + GF_FREE(acl_xattr); + goto out; + } + } + + if (dict_get(dict, GF_POSIX_ACL_DEFAULT)) { + acl_xattr = GF_CALLOC(1, ACL_BUFFER_MAX, gf_posix_mt_char); + if (!acl_xattr) + goto out; + + acl_size = sys_lgetxattr(real_path, POSIX_ACL_DEFAULT_XATTR, acl_xattr, + ACL_BUFFER_MAX); + + if (acl_size < 0) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_XATTR_FAILED, + "Posix acl is not set " + "properly at the backend"); + goto out; + } + + if (acl_size >= ACL_BUFFER_MAX) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, P_MSG_BUFFER_OVERFLOW, + "size of acl is more" + "than the buffer"); + goto out; + } + + acl_xattr = GF_REALLOC(acl_xattr, acl_size); + if (!acl_xattr) + goto out; + + ret = dict_set_bin(xattr, POSIX_ACL_DEFAULT_XATTR, acl_xattr, acl_size); + if (ret) { + gf_msg(this->name, 
GF_LOG_WARNING, 0, P_MSG_SET_XDATA_FAIL, + "failed to set" + "xdata for acl"); + GF_FREE(acl_xattr); + goto out; + } + } + +out: + SET_TO_OLD_FS_ID(); + + STACK_UNWIND_STRICT(setxattr, frame, op_ret, op_errno, xattr); + + if (xattr) + dict_unref(xattr); + + if (subvol_xattrs) + dict_unref(subvol_xattrs); + + return 0; +} + +int +posix_xattr_get_real_filename(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *key, dict_t *dict, dict_t *xdata) +{ + int ret = -1; + int op_ret = -1; + const char *fname = NULL; + char *real_path = NULL; + char *found = NULL; + DIR *fd = NULL; + struct dirent *entry = NULL; + struct dirent scratch[2] = { + { + 0, + }, + }; + + MAKE_INODE_HANDLE(real_path, this, loc, NULL); + if (!real_path) { + return -ESTALE; + } + if (op_ret == -1) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_LSTAT_FAILED, + "posix_xattr_get_real_filename (lstat) on " + "gfid-handle %s (path: %s) failed", + real_path, loc->path); + return -errno; + } + + fd = sys_opendir(real_path); + if (!fd) + return -errno; + + fname = key + SLEN(GF_XATTR_GET_REAL_FILENAME_KEY); + + for (;;) { + errno = 0; + entry = sys_readdir(fd, scratch); + if (!entry || errno != 0) + break; + + if (strcasecmp(entry->d_name, fname) == 0) { + found = gf_strdup(entry->d_name); + if (!found) { + (void)sys_closedir(fd); + return -ENOMEM; + } + break; + } + } + + (void)sys_closedir(fd); + + if (!found) + return -ENOATTR; + + ret = dict_set_dynstr(dict, (char *)key, found); + if (ret) { + GF_FREE(found); + return -ENOMEM; + } + ret = strlen(found) + 1; + + return ret; +} + +int +posix_get_ancestry_directory(xlator_t *this, inode_t *leaf_inode, + gf_dirent_t *head, char **path, int type, + int32_t *op_errno, dict_t *xdata) +{ + ssize_t handle_size = 0; + struct posix_private *priv = NULL; + inode_t *inode = NULL; + int ret = -1; + char dirpath[PATH_MAX] = { + 0, + }; + + priv = this->private; + + handle_size = POSIX_GFID_HANDLE_SIZE(priv->base_path_length); + + ret = 
posix_make_ancestryfromgfid( + this, dirpath, PATH_MAX + 1, head, type | POSIX_ANCESTRY_PATH, + leaf_inode->gfid, handle_size, priv->base_path, leaf_inode->table, + &inode, xdata, op_errno); + if (ret < 0) + goto out; + + /* there is already a reference in loc->inode */ + inode_unref(inode); + + if ((type & POSIX_ANCESTRY_PATH) && (path != NULL)) { + if (strcmp(dirpath, "/")) + dirpath[strlen(dirpath) - 1] = '\0'; + + *path = gf_strdup(dirpath); + } + +out: + return ret; +} + +int32_t +posix_links_in_same_directory(char *dirpath, int count, inode_t *leaf_inode, + inode_t *parent, struct stat *stbuf, + gf_dirent_t *head, char **path, int type, + dict_t *xdata, int32_t *op_errno) +{ + int op_ret = -1; + gf_dirent_t *gf_entry = NULL; + xlator_t *this = NULL; + struct posix_private *priv = NULL; + DIR *dirp = NULL; + struct dirent *entry = NULL; + struct dirent scratch[2] = { + { + 0, + }, + }; + char temppath[PATH_MAX] = { + 0, + }; + char scr[PATH_MAX * 4] = { + 0, + }; + + this = THIS; + + priv = this->private; + + dirp = sys_opendir(dirpath); + if (!dirp) { + *op_errno = errno; + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_OPEN_FAILED, + "could not opendir %s", dirpath); + goto out; + } + + while (count > 0) { + errno = 0; + entry = sys_readdir(dirp, scratch); + if (!entry || errno != 0) + break; + + if (entry->d_ino != stbuf->st_ino) + continue; + + /* Linking an inode here, can cause a race in posix_acl. + Parent inode gets linked here, but before + it reaches posix_acl_readdirp_cbk, create/lookup can + come on a leaf-inode, as parent-inode-ctx not yet updated + in posix_acl_readdirp_cbk, create and lookup can fail + with EACCESS. 
So do the inode linking in the quota xlator + + linked_inode = inode_link (leaf_inode, parent, + entry->d_name, NULL); + + GF_ASSERT (linked_inode == leaf_inode); + inode_unref (linked_inode);*/ + + if (type & POSIX_ANCESTRY_DENTRY) { + loc_t loc = { + 0, + }; + + loc.inode = inode_ref(leaf_inode); + gf_uuid_copy(loc.gfid, leaf_inode->gfid); + + (void)snprintf(temppath, sizeof(temppath), "%s/%s", dirpath, + entry->d_name); + + gf_entry = gf_dirent_for_name(entry->d_name); + if (!gf_entry) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0, "gf_entry is NULL"); + op_ret = -1; + *op_errno = ENOMEM; + inode_unref(loc.inode); + goto out; + } + gf_entry->inode = inode_ref(leaf_inode); + gf_entry->dict = posix_xattr_fill(this, temppath, &loc, NULL, -1, + xdata, NULL); + iatt_from_stat(&(gf_entry->d_stat), stbuf); + + list_add_tail(&gf_entry->list, &head->list); + loc_wipe(&loc); + } + + if (type & POSIX_ANCESTRY_PATH) { + (void)snprintf(temppath, sizeof(temppath), "%s/%s", + &dirpath[priv->base_path_length], entry->d_name); + if (!*path) { + *path = gf_strdup(temppath); + } else { + /* creating a colon separated */ + /* list of hard links */ + (void)snprintf(scr, sizeof(scr), "%s:%s", *path, temppath); + + GF_FREE(*path); + *path = gf_strdup(scr); + } + if (!*path) { + op_ret = -1; + *op_errno = ENOMEM; + goto out; + } + } + + count--; + } + + op_ret = 0; +out: + if (dirp) { + op_ret = sys_closedir(dirp); + if (op_ret == -1) { + *op_errno = errno; + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_CLOSE_FAILED, + "closedir failed"); + } + } + + return op_ret; +} + +int +posix_get_ancestry_non_directory(xlator_t *this, inode_t *leaf_inode, + gf_dirent_t *head, char **path, int type, + int32_t *op_errno, dict_t *xdata) +{ + size_t remaining_size = 0; + int op_ret = -1, pathlen = -1; + ssize_t handle_size = 0; + uuid_t pgfid = { + 0, + }; + int nlink_samepgfid = 0; + struct stat stbuf = { + 0, + }; + char *list = NULL; + int32_t list_offset = 0; + struct posix_private *priv = 
NULL; + ssize_t size = 0; + inode_t *parent = NULL; + loc_t *loc = NULL; + char *leaf_path = NULL; + char key[4096] = { + 0, + }; + char dirpath[PATH_MAX] = { + 0, + }; + char pgfidstr[UUID_CANONICAL_FORM_LEN + 1] = { + 0, + }; + int len; + + priv = this->private; + + loc = GF_CALLOC(1, sizeof(*loc), gf_posix_mt_char); + if (loc == NULL) { + op_ret = -1; + *op_errno = ENOMEM; + goto out; + } + + gf_uuid_copy(loc->gfid, leaf_inode->gfid); + + MAKE_INODE_HANDLE(leaf_path, this, loc, NULL); + if (!leaf_path) { + GF_FREE(loc); + *op_errno = ESTALE; + goto out; + } + GF_FREE(loc); + + size = sys_llistxattr(leaf_path, NULL, 0); + if (size == -1) { + *op_errno = errno; + if ((errno == ENOTSUP) || (errno == ENOSYS)) { + GF_LOG_OCCASIONALLY(gf_posix_xattr_enotsup_log, this->name, + GF_LOG_WARNING, + "Extended attributes not " + "supported (try remounting brick" + " with 'user_xattr' flag)"); + + } else { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_XATTR_FAILED, + "listxattr failed on" + "%s", + leaf_path); + } + + goto out; + } + + if (size == 0) { + op_ret = 0; + goto out; + } + + list = alloca(size); + if (!list) { + *op_errno = errno; + goto out; + } + + size = sys_llistxattr(leaf_path, list, size); + if (size < 0) { + op_ret = -1; + *op_errno = errno; + goto out; + } + remaining_size = size; + list_offset = 0; + + op_ret = sys_lstat(leaf_path, &stbuf); + if (op_ret == -1) { + *op_errno = errno; + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_LSTAT_FAILED, + "lstat failed on %s", leaf_path); + goto out; + } + + while (remaining_size > 0) { + len = snprintf(key, sizeof(key), "%s", list + list_offset); + if (strncmp(key, PGFID_XATTR_KEY_PREFIX, + SLEN(PGFID_XATTR_KEY_PREFIX)) != 0) + goto next; + + op_ret = sys_lgetxattr(leaf_path, key, &nlink_samepgfid, + sizeof(nlink_samepgfid)); + if (op_ret == -1) { + *op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_XATTR_FAILED, + "getxattr failed on " + "%s: key = %s ", + leaf_path, key); + goto out; + } + + 
nlink_samepgfid = ntoh32(nlink_samepgfid); + + snprintf(pgfidstr, sizeof(pgfidstr), "%s", + key + SLEN(PGFID_XATTR_KEY_PREFIX)); + gf_uuid_parse(pgfidstr, pgfid); + + handle_size = POSIX_GFID_HANDLE_SIZE(priv->base_path_length); + + /* constructing the absolute real path of parent dir */ + snprintf(dirpath, sizeof(dirpath), "%s", priv->base_path); + pathlen = PATH_MAX + 1 - priv->base_path_length; + + op_ret = posix_make_ancestryfromgfid( + this, dirpath + priv->base_path_length, pathlen, head, + type | POSIX_ANCESTRY_PATH, pgfid, handle_size, priv->base_path, + leaf_inode->table, &parent, xdata, op_errno); + if (op_ret < 0) { + goto next; + } + + dirpath[strlen(dirpath) - 1] = '\0'; + + posix_links_in_same_directory(dirpath, nlink_samepgfid, leaf_inode, + parent, &stbuf, head, path, type, xdata, + op_errno); + + if (parent != NULL) { + inode_unref(parent); + parent = NULL; + } + + next: + remaining_size -= (len + 1); + list_offset += (len + 1); + } /* while (remaining_size > 0) */ + + op_ret = 0; + +out: + return op_ret; +} + +int +posix_get_ancestry(xlator_t *this, inode_t *leaf_inode, gf_dirent_t *head, + char **path, int type, int32_t *op_errno, dict_t *xdata) +{ + int ret = -1; + struct posix_private *priv = NULL; + + priv = this->private; + + if (IA_ISDIR(leaf_inode->ia_type)) { + ret = posix_get_ancestry_directory(this, leaf_inode, head, path, type, + op_errno, xdata); + } else { + if (!priv->update_pgfid_nlinks) + goto out; + ret = posix_get_ancestry_non_directory(this, leaf_inode, head, path, + type, op_errno, xdata); + } + +out: + if (ret && path && *path) { + GF_FREE(*path); + *path = NULL; + } + + return ret; +} + +/** + * posix_getxattr - this function returns a dictionary with all the + * key:value pair present as xattr. used for + * both 'listxattr' and 'getxattr'. 
+ */ +int32_t +posix_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + struct posix_private *priv = NULL; + int32_t op_ret = -1; + int32_t op_errno = 0; + char *value = NULL; + char *real_path = NULL; + dict_t *dict = NULL; + int ret = -1; + char *path = NULL; + char *rpath = NULL; + ssize_t size = 0; + char *list = NULL; + int32_t list_offset = 0; + size_t remaining_size = 0; + char *host_buf = NULL; + char *keybuffer = NULL; + int keybuff_len; + char *value_buf = NULL; + gf_boolean_t have_val = _gf_false; + struct iatt buf = { + 0, + }; + dict_t *xattr_rsp = NULL; + + DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(loc, out); + VALIDATE_OR_GOTO(this->private, out); + + SET_FS_ID(frame->root->uid, frame->root->gid); + MAKE_INODE_HANDLE(real_path, this, loc, NULL); + + op_ret = -1; + priv = this->private; + + ret = posix_handle_georep_xattrs(frame, name, &op_errno, _gf_true); + if (ret == -1) { + op_ret = -1; + /* errno should be set from the above function*/ + goto out; + } + + ret = posix_handle_mdata_xattr(frame, name, &op_errno); + if (ret == -1) { + op_ret = -1; + /* errno should be set from the above function*/ + goto out; + } + + if (name && posix_is_gfid2path_xattr(name)) { + op_ret = -1; + op_errno = ENOATTR; + goto out; + } + + dict = dict_new(); + if (!dict) { + op_errno = ENOMEM; + goto out; + } + + if (loc->inode && name && GF_POSIX_ACL_REQUEST(name)) { + ret = posix_pacl_get(real_path, -1, name, &value); + if (ret || !value) { + op_errno = errno; + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_ACL_FAILED, + "could not get acl (%s) for" + "gfid-handle %s (path: %s)", + name, real_path, loc->path); + op_ret = -1; + goto out; + } + + ret = dict_set_dynstr(dict, (char *)name, value); + if (ret < 0) { + GF_FREE(value); + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_ACL_FAILED, + "could not set acl (%s) for %s " + "(gfid-handle: %s) in 
dictionary", + name, loc->path, real_path); + op_ret = -1; + op_errno = ENOMEM; + goto out; + } + + size = ret; + goto done; + } + + if (loc->inode && name && + (strncmp(name, GF_XATTR_GET_REAL_FILENAME_KEY, + SLEN(GF_XATTR_GET_REAL_FILENAME_KEY)) == 0)) { + ret = posix_xattr_get_real_filename(frame, this, loc, name, dict, + xdata); + if (ret < 0) { + op_ret = -1; + op_errno = -ret; + if (op_errno == ENOATTR) { + gf_msg_debug(this->name, 0, + "Failed to get " + "real filename (%s, %s)", + loc->path, name); + } else { + gf_msg(this->name, GF_LOG_WARNING, op_errno, + P_MSG_GETTING_FILENAME_FAILED, + "Failed to get real filename (%s, %s):", loc->path, + name); + } + goto out; + } + + size = ret; + goto done; + } + + if (loc->inode && name && !strcmp(name, GLUSTERFS_OPEN_FD_COUNT)) { + if (!fd_list_empty(loc->inode)) { + ret = dict_set_uint32(dict, (char *)name, 1); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_DICT_SET_FAILED, + "Failed to set " + "dictionary value for %s", + name); + op_errno = ENOMEM; + goto out; + } + } else { + ret = dict_set_uint32(dict, (char *)name, 0); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_DICT_SET_FAILED, + "Failed to set " + "dictionary value for %s", + name); + op_errno = ENOMEM; + goto out; + } + } + goto done; + } + if (loc->inode && name && (XATTR_IS_PATHINFO(name))) { + VALIDATE_OR_GOTO(this->private, out); + if (LOC_HAS_ABSPATH(loc)) { + MAKE_REAL_PATH(rpath, this, loc->path); + } else { + rpath = real_path; + } + size = gf_asprintf( + &host_buf, "<POSIX(%s):%s:%s>", priv->base_path, + ((priv->node_uuid_pathinfo && !gf_uuid_is_null(priv->glusterd_uuid)) + ? 
uuid_utoa(priv->glusterd_uuid) + : priv->hostname), + rpath); + if (size < 0) { + op_errno = ENOMEM; + goto out; + } + ret = dict_set_dynstr(dict, (char *)name, host_buf); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_DICT_SET_FAILED, + "could not set value" + " (%s) in dictionary", + host_buf); + GF_FREE(host_buf); + op_errno = ENOMEM; + goto out; + } + + goto done; + } + + if (loc->inode && name && (strcmp(name, GF_XATTR_NODE_UUID_KEY) == 0) && + !gf_uuid_is_null(priv->glusterd_uuid)) { + size = gf_asprintf(&host_buf, "%s", uuid_utoa(priv->glusterd_uuid)); + if (size == -1) { + op_errno = ENOMEM; + goto out; + } + ret = dict_set_dynstr(dict, GF_XATTR_NODE_UUID_KEY, host_buf); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, -ret, P_MSG_DICT_SET_FAILED, + "could not set value" + "(%s) in dictionary", + host_buf); + GF_FREE(host_buf); + op_errno = -ret; + goto out; + } + goto done; + } + + if (loc->inode && name && (strcmp(name, GFID_TO_PATH_KEY) == 0)) { + ret = inode_path(loc->inode, NULL, &path); + if (ret < 0) { + op_errno = -ret; + gf_msg(this->name, GF_LOG_WARNING, op_errno, + P_MSG_INODE_PATH_GET_FAILED, + "%s: could not get " + "inode path", + uuid_utoa(loc->inode->gfid)); + goto out; + } + + size = ret; + ret = dict_set_dynstr(dict, GFID_TO_PATH_KEY, path); + if (ret < 0) { + op_errno = ENOMEM; + GF_FREE(path); + goto out; + } + goto done; + } + + if (loc->inode && name && (strcmp(name, GFID2PATH_VIRT_XATTR_KEY) == 0)) { + if (!priv->gfid2path) { + op_errno = ENOATTR; + op_ret = -1; + goto out; + } + ret = posix_get_gfid2path(this, loc->inode, real_path, &op_errno, dict); + if (ret < 0) { + op_ret = -1; + goto out; + } + size = ret; + goto done; + } + + if (loc->inode && name && (strcmp(name, GET_ANCESTRY_PATH_KEY) == 0)) { + int type = POSIX_ANCESTRY_PATH; + + op_ret = posix_get_ancestry(this, loc->inode, NULL, &path, type, + &op_errno, xdata); + if (op_ret < 0) { + op_ret = -1; + op_errno = ENODATA; + goto out; + } + size = op_ret; 
+ op_ret = dict_set_dynstr(dict, GET_ANCESTRY_PATH_KEY, path); + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, -op_ret, + P_MSG_GET_KEY_VALUE_FAILED, + "could not get " + "value for key (%s)", + GET_ANCESTRY_PATH_KEY); + GF_FREE(path); + op_errno = ENOMEM; + goto out; + } + + goto done; + } + + if (loc->inode && name && + (strncmp(name, GLUSTERFS_GET_OBJECT_SIGNATURE, + SLEN(GLUSTERFS_GET_OBJECT_SIGNATURE)) == 0)) { + op_ret = posix_get_objectsignature(real_path, dict); + if (op_ret < 0) { + op_errno = -op_ret; + goto out; + } + + goto done; + } + + /* here allocate value_buf of 8192 bytes to avoid one extra getxattr + call,If buffer size is small to hold the xattr result then it will + allocate a new buffer value of required size and call getxattr again + */ + + value_buf = alloca(XATTR_VAL_BUF_SIZE); + if (name) { + char *key = (char *)name; + + keybuffer = key; +#if defined(GF_DARWIN_HOST_OS_DISABLED) + if (priv->xattr_user_namespace == XATTR_STRIP) { + if (strncmp(key, "user.", 5) == 0) { + key += 5; + gf_msg_debug(this->name, 0, + "getxattr for file %s (gfid-handle: %s)" + " stripping user key: %s -> %s", + loc->path, real_path, keybuffer, key); + } + } +#endif + size = sys_lgetxattr(real_path, key, value_buf, XATTR_VAL_BUF_SIZE - 1); + if (size >= 0) { + have_val = _gf_true; + } else { + if (errno == ERANGE) { + gf_msg(this->name, GF_LOG_INFO, errno, P_MSG_XATTR_FAILED, + "getxattr failed due to overflow of buffer" + " on gfid-handle %s (path: %s) : %s ", + real_path, loc->path, key); + size = sys_lgetxattr(real_path, key, NULL, 0); + } + if (size == -1) { + op_errno = errno; + if ((op_errno == ENOTSUP) || (op_errno == ENOSYS)) { + GF_LOG_OCCASIONALLY(gf_posix_xattr_enotsup_log, this->name, + GF_LOG_WARNING, + "Extended attributes not " + "supported (try remounting" + " brick with 'user_xattr' " + "flag)"); + } + if ((op_errno == ENOATTR) || (op_errno == ENODATA)) { + gf_msg_debug(this->name, 0, + "No such attribute:%s for file %s (path: %s)", + key, 
real_path, loc->path); + } else { + gf_msg(this->name, GF_LOG_ERROR, op_errno, + P_MSG_XATTR_FAILED, + "getxattr failed on " + "%s (path: %s): %s ", + real_path, loc->path, key); + } + goto out; + } + } + value = GF_MALLOC(size + 1, gf_posix_mt_char); + if (!value) { + op_ret = -1; + op_errno = ENOMEM; + goto out; + } + if (have_val) { + memcpy(value, value_buf, size); + } else { + bzero(value, size + 1); + size = sys_lgetxattr(real_path, key, value, size); + if (size == -1) { + op_ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_XATTR_FAILED, + "getxattr failed on %s (path: %s): key = %s", real_path, + loc->path, key); + GF_FREE(value); + goto out; + } + } + value[size] = '\0'; + op_ret = dict_set_dynptr(dict, key, value, size); + if (op_ret < 0) { + op_errno = -op_ret; + gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_DICT_SET_FAILED, + "dict set operation " + "on %s (gfid-handle: %s) for the key %s failed.", + loc->path, real_path, key); + GF_FREE(value); + goto out; + } + + goto done; + } + + have_val = _gf_false; + size = sys_llistxattr(real_path, value_buf, XATTR_VAL_BUF_SIZE - 1); + if (size > 0) { + have_val = _gf_true; + } else { + if (errno == ERANGE) { + gf_msg(this->name, GF_LOG_INFO, errno, P_MSG_XATTR_FAILED, + "listxattr failed due to overflow of buffer" + " on %s (path: %s) ", + real_path, loc->path); + size = sys_llistxattr(real_path, NULL, 0); + } + if (size == -1) { + op_errno = errno; + if ((errno == ENOTSUP) || (errno == ENOSYS)) { + GF_LOG_OCCASIONALLY(gf_posix_xattr_enotsup_log, this->name, + GF_LOG_WARNING, + "Extended attributes not " + "supported (try remounting" + " brick with 'user_xattr' " + "flag)"); + } else { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_XATTR_FAILED, + "listxattr failed on %s (path: %s)", real_path, + loc->path); + } + goto out; + } + if (size == 0) + goto done; + } + list = alloca(size); + if (!list) { + op_errno = errno; + goto out; + } + if (have_val) { + memcpy(list, value_buf, 
size); + } else { + size = sys_llistxattr(real_path, list, size); + if (size < 0) { + op_ret = -1; + op_errno = errno; + goto out; + } + } + remaining_size = size; + list_offset = 0; + keybuffer = alloca(XATTR_KEY_BUF_SIZE); + while (remaining_size > 0) { + keybuff_len = snprintf(keybuffer, XATTR_KEY_BUF_SIZE, "%s", + list + list_offset); + + ret = posix_handle_georep_xattrs(frame, keybuffer, NULL, _gf_false); + if (ret == -1) + goto ignore; + + ret = posix_handle_mdata_xattr(frame, keybuffer, &op_errno); + if (ret == -1) { + goto ignore; + } + + if (posix_is_gfid2path_xattr(keybuffer)) { + goto ignore; + } + + have_val = _gf_false; + size = sys_lgetxattr(real_path, keybuffer, value_buf, + XATTR_VAL_BUF_SIZE - 1); + if (size >= 0) { + have_val = _gf_true; + } else { + if (errno == ERANGE) { + gf_msg(this->name, GF_LOG_INFO, op_errno, P_MSG_XATTR_FAILED, + "getxattr failed due to overflow of" + " buffer on %s (path: %s): %s ", + real_path, loc->path, keybuffer); + size = sys_lgetxattr(real_path, keybuffer, NULL, 0); + } + if (size == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_XATTR_FAILED, + "getxattr failed on" + " %s (path: %s): key = %s ", + real_path, loc->path, keybuffer); + goto out; + } + } + value = GF_MALLOC(size + 1, gf_posix_mt_char); + if (!value) { + op_errno = errno; + goto out; + } + if (have_val) { + memcpy(value, value_buf, size); + } else { + bzero(value, size + 1); + size = sys_lgetxattr(real_path, keybuffer, value, size); + if (size == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_XATTR_FAILED, + "getxattr failed on" + " %s (path: %s): key = %s ", + real_path, loc->path, keybuffer); + GF_FREE(value); + goto out; + } + } + value[size] = '\0'; +#ifdef GF_DARWIN_HOST_OS + /* The protocol expect namespace for now */ + char *newkey = NULL; + gf_add_prefix(XATTR_USER_PREFIX, keybuffer, &newkey); + keybuff_len = snprintf(keybuffer, sizeof(keybuffer), "%s", newkey); + GF_FREE(newkey); +#endif + 
op_ret = dict_set_dynptr(dict, keybuffer, value, size); + if (op_ret < 0) { + op_errno = -op_ret; + gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_DICT_SET_FAILED, + "dict set operation " + "on %s (gfid-handle: %s) for the key %s failed.", + loc->path, real_path, keybuffer); + GF_FREE(value); + goto out; + } + + ignore: + remaining_size -= keybuff_len + 1; + list_offset += keybuff_len + 1; + + } /* while (remaining_size > 0) */ + +done: + op_ret = size; + + if (xdata && (op_ret >= 0)) { + xattr_rsp = posix_xattr_fill(this, real_path, loc, NULL, -1, xdata, + &buf); + } + + if (dict) { + dict_del(dict, GFID_XATTR_KEY); + dict_del(dict, GF_XATTR_VOL_ID_KEY); + } + +out: + SET_TO_OLD_FS_ID(); + + STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, dict, xattr_rsp); + + if (xattr_rsp) + dict_unref(xattr_rsp); + + if (dict) { + dict_unref(dict); + } + + return 0; +} + +int32_t +posix_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + struct posix_fd *pfd = NULL; + int _fd = -1; + int32_t list_offset = 0; + ssize_t size = 0; + size_t remaining_size = 0; + char *value = NULL; + char *list = NULL; + dict_t *dict = NULL; + int ret = -1; + char key[4096] = { + 0, + }; + int key_len; + char *value_buf = NULL; + gf_boolean_t have_val = _gf_false; + struct iatt buf = { + 0, + }; + dict_t *xattr_rsp = NULL; + + DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(fd, out); + + SET_FS_ID(frame->root->uid, frame->root->gid); + + ret = posix_fd_ctx_get(fd, this, &pfd, &op_errno); + if (ret < 0) { + op_ret = -1; + gf_msg(this->name, GF_LOG_WARNING, op_errno, P_MSG_PFD_NULL, + "pfd is NULL from fd=%p", fd); + goto out; + } + + _fd = pfd->fd; + + /* Get the total size */ + dict = dict_new(); + if (!dict) { + op_ret = -1; + op_errno = ENOMEM; + goto out; + } + + if (name && !strcmp(name, GLUSTERFS_OPEN_FD_COUNT)) { + ret = 
dict_set_uint32(dict, (char *)name, 1); + if (ret < 0) { + op_ret = -1; + size = -1; + op_errno = ENOMEM; + gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_DICT_SET_FAILED, + "Failed to set " + "dictionary value for %s", + name); + goto out; + } + goto done; + } + + if (name && strncmp(name, GLUSTERFS_GET_OBJECT_SIGNATURE, + SLEN(GLUSTERFS_GET_OBJECT_SIGNATURE)) == 0) { + op_ret = posix_fdget_objectsignature(_fd, dict); + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "posix_fdget_objectsignature failed"); + op_errno = -op_ret; + op_ret = -1; + size = -1; + goto out; + } + + goto done; + } + + /* here allocate value_buf of 8192 bytes to avoid one extra getxattr + call,If buffer size is small to hold the xattr result then it will + allocate a new buffer value of required size and call getxattr again + */ + value_buf = alloca(XATTR_VAL_BUF_SIZE); + + if (name) { + key_len = snprintf(key, sizeof(key), "%s", name); +#ifdef GF_DARWIN_HOST_OS + struct posix_private *priv = NULL; + priv = this->private; + if (priv->xattr_user_namespace == XATTR_STRIP) { + char *newkey = NULL; + gf_add_prefix(XATTR_USER_PREFIX, key, &newkey); + key_len = snprintf(key, sizeof(key), "%s", newkey); + GF_FREE(newkey); + } +#endif + size = sys_fgetxattr(_fd, key, value_buf, XATTR_VAL_BUF_SIZE - 1); + if (size >= 0) { + have_val = _gf_true; + } else { + if (errno == ERANGE) { + gf_msg(this->name, GF_LOG_INFO, errno, P_MSG_XATTR_FAILED, + "fgetxattr failed due to overflow of" + "buffer on %s ", + key); + size = sys_fgetxattr(_fd, key, NULL, 0); + } + if (size == -1) { + op_errno = errno; + if (errno == ENODATA || errno == ENOATTR) { + gf_msg_debug(this->name, 0, + "fgetxattr" + " failed on key %s (%s)", + key, strerror(op_errno)); + } else { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_XATTR_FAILED, + "fgetxattr" + " failed on key %s", + key); + } + goto done; + } + } + value = GF_MALLOC(size + 1, gf_posix_mt_char); + if (!value) { + op_ret = -1; + op_errno = ENOMEM; + goto out; + } 
+ if (have_val) { + memcpy(value, value_buf, size); + } else { + bzero(value, size + 1); + size = sys_fgetxattr(_fd, key, value, size); + if (size == -1) { + op_ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_XATTR_FAILED, + "fgetxattr" + " failed on fd %p for the key %s ", + fd, key); + GF_FREE(value); + goto out; + } + } + + value[size] = '\0'; + op_ret = dict_set_dynptr(dict, key, value, size); + if (op_ret < 0) { + op_errno = -op_ret; + op_ret = -1; + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_DICT_SET_FAILED, + "dict set operation " + "on key %s failed", + key); + GF_FREE(value); + goto out; + } + + goto done; + } + size = sys_flistxattr(_fd, value_buf, XATTR_VAL_BUF_SIZE - 1); + if (size > 0) { + have_val = _gf_true; + } else { + if (errno == ERANGE) { + gf_msg(this->name, GF_LOG_INFO, errno, P_MSG_XATTR_FAILED, + "listxattr failed due to overflow of buffer" + " on %p ", + fd); + size = sys_flistxattr(_fd, NULL, 0); + } + if (size == -1) { + op_ret = -1; + op_errno = errno; + if ((errno == ENOTSUP) || (errno == ENOSYS)) { + GF_LOG_OCCASIONALLY(gf_posix_xattr_enotsup_log, this->name, + GF_LOG_WARNING, + "Extended attributes not " + "supported (try remounting " + "brick with 'user_xattr' flag)"); + } else { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_XATTR_FAILED, + "listxattr failed " + "on %p:", + fd); + } + goto out; + } + if (size == 0) + goto done; + } + list = alloca(size + 1); + if (!list) { + op_ret = -1; + op_errno = ENOMEM; + goto out; + } + if (have_val) + memcpy(list, value_buf, size); + else + size = sys_flistxattr(_fd, list, size); + + remaining_size = size; + list_offset = 0; + while (remaining_size > 0) { + if (*(list + list_offset) == '\0') + break; + + key_len = snprintf(key, sizeof(key), "%s", list + list_offset); + have_val = _gf_false; + size = sys_fgetxattr(_fd, key, value_buf, XATTR_VAL_BUF_SIZE - 1); + if (size >= 0) { + have_val = _gf_true; + } else { + if (errno == ERANGE) { + gf_msg(this->name, 
GF_LOG_INFO, errno, P_MSG_XATTR_FAILED, + "fgetxattr failed due to overflow of buffer" + " on fd %p: for the key %s ", + fd, key); + size = sys_fgetxattr(_fd, key, NULL, 0); + } + if (size == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_XATTR_FAILED, + "fgetxattr failed " + "on fd %p for the key %s ", + fd, key); + break; + } + } + value = GF_MALLOC(size + 1, gf_posix_mt_char); + if (!value) { + op_ret = -1; + op_errno = errno; + goto out; + } + if (have_val) { + memcpy(value, value_buf, size); + } else { + bzero(value, size + 1); + size = sys_fgetxattr(_fd, key, value, size); + if (size == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_XATTR_FAILED, + "fgetxattr failed o" + "n the fd %p for the key %s ", + fd, key); + GF_FREE(value); + break; + } + } + value[size] = '\0'; + + op_ret = dict_set_dynptr(dict, key, value, size); + if (op_ret) { + op_errno = -op_ret; + op_ret = -1; + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_DICT_SET_FAILED, + "dict set operation " + "failed on key %s", + key); + GF_FREE(value); + goto out; + } + remaining_size -= key_len + 1; + list_offset += key_len + 1; + + } /* while (remaining_size > 0) */ + +done: + op_ret = size; + + if (xdata && (op_ret >= 0)) { + xattr_rsp = posix_xattr_fill(this, NULL, NULL, fd, pfd->fd, xdata, + &buf); + } + + if (dict) { + dict_del(dict, GFID_XATTR_KEY); + dict_del(dict, GF_XATTR_VOL_ID_KEY); + } + +out: + SET_TO_OLD_FS_ID(); + + STACK_UNWIND_STRICT(fgetxattr, frame, op_ret, op_errno, dict, xattr_rsp); + + if (xattr_rsp) + dict_unref(xattr_rsp); + + if (dict) + dict_unref(dict); + + return 0; +} + +static int +_handle_fsetxattr_keyvalue_pair(dict_t *d, char *k, data_t *v, void *tmp) +{ + posix_xattr_filler_t *filler = NULL; + + filler = tmp; + + return posix_fhandle_pair(filler->frame, filler->this, filler->fdnum, k, v, + filler->flags, filler->stbuf, filler->fd); +} + +int32_t +posix_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t 
*dict, + int flags, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + struct posix_fd *pfd = NULL; + int _fd = -1; + int ret = -1; + struct iatt preop = { + 0, + }; + struct iatt postop = { + 0, + }; + dict_t *xattr = NULL; + posix_xattr_filler_t filler = { + 0, + }; + struct posix_private *priv = NULL; + + DECLARE_OLD_FS_ID_VAR; + SET_FS_ID(frame->root->uid, frame->root->gid); + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(fd, out); + VALIDATE_OR_GOTO(dict, out); + + priv = this->private; + DISK_SPACE_CHECK_AND_GOTO(frame, priv, xdata, op_ret, op_errno, out); + + ret = posix_fd_ctx_get(fd, this, &pfd, &op_errno); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, op_errno, P_MSG_PFD_NULL, + "pfd is NULL from fd=%p", fd); + goto out; + } + _fd = pfd->fd; + + ret = posix_fdstat(this, fd->inode, pfd->fd, &preop); + if (ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_FSTAT_FAILED, + "fsetxattr (fstat)" + "failed on fd=%p", + fd); + goto out; + } + + dict_del(dict, GFID_XATTR_KEY); + dict_del(dict, GF_XATTR_VOL_ID_KEY); + + filler.fdnum = _fd; + filler.this = this; + filler.frame = frame; + filler.stbuf = &preop; + filler.fd = fd; +#ifdef GF_DARWIN_HOST_OS + filler.flags = map_xattr_flags(flags); +#else + filler.flags = flags; +#endif + op_ret = dict_foreach(dict, _handle_fsetxattr_keyvalue_pair, &filler); + if (op_ret < 0) { + op_errno = -op_ret; + op_ret = -1; + } + + if (!ret && xdata && dict_get(xdata, GLUSTERFS_DURABLE_OP)) { + op_ret = sys_fsync(_fd); + if (op_ret < 0) { + op_ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_WARNING, errno, + P_MSG_DURABILITY_REQ_NOT_SATISFIED, + "could not satisfy durability request: " + "reason "); + } + } + + ret = posix_fdstat(this, fd->inode, pfd->fd, &postop); + if (ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_XATTR_FAILED, + "fsetxattr (fstat)" + "failed on fd=%p", + fd); + goto out; + 
} + xattr = dict_new(); + if (!xattr) + goto out; + + ret = posix_set_iatt_in_dict(xattr, &preop, &postop); + +out: + SET_TO_OLD_FS_ID(); + + STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, xattr); + + if (xattr) + dict_unref(xattr); + + return 0; +} + +int +_posix_remove_xattr(dict_t *dict, char *key, data_t *value, void *data) +{ + int32_t op_ret = 0; + xlator_t *this = NULL; + posix_xattr_filler_t *filler = NULL; + + filler = (posix_xattr_filler_t *)data; + this = filler->this; +#ifdef GF_DARWIN_HOST_OS + struct posix_private *priv = NULL; + priv = (struct posix_private *)this->private; + char *newkey = NULL; + if (priv->xattr_user_namespace == XATTR_STRIP) { + gf_remove_prefix(XATTR_USER_PREFIX, key, &newkey); + gf_msg_debug("remove_xattr", 0, "key %s => %s", key, newkey); + key = newkey; + } +#endif + /* Bulk remove xattr is internal fop in gluster. Some of the xattrs may + * have special behavior. Ex: removexattr("posix.system_acl_access"), + * removes more than one xattr on the file that could be present in the + * bulk-removal request. Removexattr of these deleted xattrs will fail + * with either ENODATA/ENOATTR. Since all this fop cares is removal of the + * xattrs in bulk-remove request and if they are already deleted, it can be + * treated as success. + */ + + if (filler->real_path) + op_ret = sys_lremovexattr(filler->real_path, key); + else + op_ret = sys_fremovexattr(filler->fdnum, key); + + if (op_ret == -1) { + if (errno == ENODATA || errno == ENOATTR) + op_ret = 0; + } + + if (op_ret == -1) { + filler->op_errno = errno; + if (errno != ENOATTR && errno != ENODATA && errno != EPERM) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_XATTR_FAILED, + "removexattr failed on " + "file/dir %s with gfid: %s (for %s)", + filler->real_path ? 
filler->real_path : "", + uuid_utoa(filler->inode->gfid), key); + } + } +#ifdef GF_DARWIN_HOST_OS + GF_FREE(newkey); +#endif + return op_ret; +} + +int +posix_common_removexattr(call_frame_t *frame, loc_t *loc, fd_t *fd, + const char *name, dict_t *xdata, int *op_errno, + dict_t **xdata_rsp) +{ + gf_boolean_t bulk_removexattr = _gf_false; + gf_boolean_t disallow = _gf_false; + char *real_path = NULL; + struct posix_fd *pfd = NULL; + int op_ret = 0; + struct iatt preop = { + 0, + }; + struct iatt postop = { + 0, + }; + int ret = 0; + int _fd = -1; + xlator_t *this = frame->this; + inode_t *inode = NULL; + posix_xattr_filler_t filler = {0}; + + DECLARE_OLD_FS_ID_VAR; + + SET_FS_ID(frame->root->uid, frame->root->gid); + + if (loc) { + MAKE_INODE_HANDLE(real_path, this, loc, NULL); + if (!real_path) { + op_ret = -1; + *op_errno = ESTALE; + goto out; + } + inode = loc->inode; + } else { + op_ret = posix_fd_ctx_get(fd, this, &pfd, op_errno); + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, *op_errno, P_MSG_PFD_NULL, + "pfd is NULL from fd=%p", fd); + goto out; + } + _fd = pfd->fd; + inode = fd->inode; + } + + if (posix_is_gfid2path_xattr(name)) { + op_ret = -1; + *op_errno = ENOATTR; + goto out; + } + + if (loc) { + ret = posix_pstat(this, inode, loc->gfid, real_path, &preop, _gf_false); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_PSTAT_FAILED, + "pstat operaton failed on %s", real_path); + } + } else { + ret = posix_fdstat(this, inode, _fd, &preop); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_FDSTAT_FAILED, + "fdstat operaton failed on %s", real_path ? real_path : ""); + } + } + + if (gf_get_index_by_elem(disallow_removexattrs, (char *)name) >= 0) { + gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_XATTR_NOT_REMOVED, + "Remove xattr called on %s for file/dir %s with gfid: " + "%s", + name, real_path ? 
real_path : "", uuid_utoa(inode->gfid)); + op_ret = -1; + *op_errno = EPERM; + goto out; + } else if (posix_is_bulk_removexattr((char *)name, xdata)) { + bulk_removexattr = _gf_true; + (void)dict_has_key_from_array(xdata, disallow_removexattrs, &disallow); + if (disallow) { + gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_XATTR_NOT_REMOVED, + "Bulk removexattr has keys that shouldn't be " + "removed for file/dir %s with gfid: %s", + real_path ? real_path : "", uuid_utoa(inode->gfid)); + op_ret = -1; + *op_errno = EPERM; + goto out; + } + } + + if (bulk_removexattr) { + /* Remove every key listed in xdata; the callback records errno in filler. */ + filler.real_path = real_path; + filler.this = this; + filler.fdnum = _fd; + filler.inode = inode; + op_ret = dict_foreach(xdata, _posix_remove_xattr, &filler); + if (op_ret) { + *op_errno = filler.op_errno; + goto out; + } + } else { + if (loc) + op_ret = sys_lremovexattr(real_path, name); + else + op_ret = sys_fremovexattr(_fd, name); + if (op_ret == -1) { + *op_errno = errno; + if (*op_errno != ENOATTR && *op_errno != ENODATA && + *op_errno != EPERM) { + /* real_path is NULL on the fd path; never pass NULL to %s. */ + gf_msg(this->name, GF_LOG_ERROR, *op_errno, P_MSG_XATTR_FAILED, + "removexattr on %s with gfid %s " + "(for %s)", + real_path ? real_path : "", uuid_utoa(inode->gfid), name); + } + goto out; + } + } + + if (loc) { + posix_set_ctime(frame, this, real_path, -1, inode, NULL); + ret = posix_pstat(this, inode, loc->gfid, real_path, &postop, + _gf_false); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_PSTAT_FAILED, + "pstat operaton failed on %s", real_path); + } + } else { + posix_set_ctime(frame, this, NULL, _fd, inode, NULL); + ret = posix_fdstat(this, inode, _fd, &postop); + if (ret) { + /* Same guard as the pre-op fdstat message: real_path is unset here. */ + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_FDSTAT_FAILED, + "fdstat operaton failed on %s", real_path ? real_path : ""); + } + } + if (ret) + goto out; + *xdata_rsp = dict_new(); + if (!*xdata_rsp) + goto out; + + ret = posix_set_iatt_in_dict(*xdata_rsp, &preop, &postop); + + op_ret = 0; +out: + SET_TO_OLD_FS_ID(); + return op_ret; +} + +int32_t +posix_removexattr(call_frame_t 
*frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + int op_ret = -1; + int op_errno = EINVAL; + dict_t *xdata_rsp = NULL; + + VALIDATE_OR_GOTO(loc, out); + + op_ret = posix_common_removexattr(frame, loc, NULL, name, xdata, &op_errno, + &xdata_rsp); +out: + STACK_UNWIND_STRICT(removexattr, frame, op_ret, op_errno, xdata_rsp); + + if (xdata_rsp) + dict_unref(xdata_rsp); + + return 0; +} + +int32_t +posix_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + dict_t *xdata_rsp = NULL; + + VALIDATE_OR_GOTO(fd, out); + + op_ret = posix_common_removexattr(frame, NULL, fd, name, xdata, &op_errno, + &xdata_rsp); +out: + STACK_UNWIND_STRICT(fremovexattr, frame, op_ret, op_errno, xdata_rsp); + + if (xdata_rsp) + dict_unref(xdata_rsp); + + return 0; +} + +int32_t +posix_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync, + dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + int ret = -1; + struct posix_fd *pfd = NULL; + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(fd, out); + + ret = posix_fd_ctx_get(fd, this, &pfd, &op_errno); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, op_errno, P_MSG_PFD_NULL, + "pfd is NULL, fd=%p", fd); + goto out; + } + + op_ret = 0; + +out: + STACK_UNWIND_STRICT(fsyncdir, frame, op_ret, op_errno, NULL); + + return 0; +} + +void +posix_print_xattr(dict_t *this, char *key, data_t *value, void *data) +{ + gf_msg_debug("posix", 0, "(key/val) = (%s/%d)", key, data_to_int32(value)); +} + +/** + * add_array - add two arrays of 32-bit numbers (stored in network byte order) + * dest = dest + src + * @count: number of 32-bit numbers + * FIXME: handle overflow + */ + +static void +__add_array(int32_t *dest, int32_t *src, int count) +{ + int i = 0; + int32_t destval = 0; + for (i = 0; i < count; i++) { + destval = ntoh32(dest[i]); + dest[i] = hton32(destval + 
ntoh32(src[i])); + } +} + +static void +__add_long_array(int64_t *dest, int64_t *src, int count) +{ + int i = 0; + for (i = 0; i < count; i++) { + dest[i] = hton64(ntoh64(dest[i]) + ntoh64(src[i])); + } +} + +/* functions: + __add_array_with_default + __add_long_array_with_default + + xattrop type: + GF_XATTROP_ADD_ARRAY_WITH_DEFAULT + GF_XATTROP_ADD_ARRAY64_WITH_DEFAULT + + These operations are similar to 'GF_XATTROP_ADD_ARRAY', + except that it adds a default value if xattr is missing + or its value is zero on disk. + + One use-case of this operation is in inode-quota. + When a new directory is created, its default dir_count + should be set to 1. So when a xattrop performed setting + inode-xattrs, it should account initial dir_count + 1 if the xattrs are not present + + Here is the usage of this operation + + value required in xdata for each key + struct array { + int32_t newvalue_1; + int32_t newvalue_2; + ... + int32_t newvalue_n; + int32_t default_1; + int32_t default_2; + ... + int32_t default_n; + }; + + or + + struct array { + int32_t value_1; + int32_t value_2; + ... + int32_t value_n; + } data[2]; + fill data[0] with new value to add + fill data[1] with default value + + xattrop GF_XATTROP_ADD_ARRAY_WITH_DEFAULT + for i from 1 to n + { + if (xattr (dest_i) is zero or not set in the disk) + dest_i = newvalue_i + default_i + else + dest_i = dest_i + newvalue_i + } + + value in xdata after xattrop is successful + struct array { + int32_t dest_1; + int32_t dest_2; + ... 
+ int32_t dest_n; + }; +*/ +static void +__add_array_with_default(int32_t *dest, int32_t *src, int count) +{ + int i = 0; + int32_t destval = 0; + + for (i = 0; i < count; i++) { + destval = ntoh32(dest[i]); + if (destval == 0) + dest[i] = hton32(ntoh32(src[i]) + ntoh32(src[count + i])); + else + dest[i] = hton32(destval + ntoh32(src[i])); + } +} + +static void +__add_long_array_with_default(int64_t *dest, int64_t *src, int count) +{ + int i = 0; + int64_t destval = 0; + + for (i = 0; i < count; i++) { + destval = ntoh64(dest[i]); + if (destval == 0) + dest[i] = hton64(ntoh64(src[i]) + ntoh64(src[i + count])); + else + dest[i] = hton64(destval + ntoh64(src[i])); + } +} + +static int +_posix_handle_xattr_keyvalue_pair(dict_t *d, char *k, data_t *v, void *tmp) +{ + int size = 0; + int count = 0; + int op_ret = 0; + int op_errno = 0; + gf_xattrop_flags_t optype = 0; + char *array = NULL; + char *dst_data = NULL; + inode_t *inode = NULL; + xlator_t *this = NULL; + posix_xattr_filler_t *filler = NULL; + posix_inode_ctx_t *ctx = NULL; + + filler = tmp; + + optype = (gf_xattrop_flags_t)(filler->flags); + this = filler->this; + inode = filler->inode; + count = v->len; + if (optype == GF_XATTROP_ADD_ARRAY_WITH_DEFAULT || + optype == GF_XATTROP_ADD_ARRAY64_WITH_DEFAULT) + count = count / 2; + + array = GF_CALLOC(count, sizeof(char), gf_posix_mt_char); + if (!array) { + /* Without a buffer the getxattr/add-array steps below would write through NULL. */ + op_ret = -1; + op_errno = ENOMEM; + goto out; + } + +#ifdef GF_DARWIN_HOST_OS + struct posix_private *priv = NULL; + priv = this->private; + if (priv->xattr_user_namespace == XATTR_STRIP) { + if (strncmp(k, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) == 0) { + k += XATTR_USER_PREFIX_LEN; + } + } +#endif + op_ret = posix_inode_ctx_get_all(inode, this, &ctx); + if (op_ret < 0) { + op_errno = ENOMEM; + goto out; + } + + pthread_mutex_lock(&ctx->xattrop_lock); + { + if (filler->real_path) { + size = sys_lgetxattr(filler->real_path, k, (char *)array, count); + } else { + size = sys_fgetxattr(filler->fdnum, k, (char *)array, count); + } + + op_errno = errno; + if ((size == -1) && 
(op_errno != ENODATA) && (op_errno != ENOATTR)) { + if (op_errno == ENOTSUP) { + GF_LOG_OCCASIONALLY(gf_posix_xattr_enotsup_log, this->name, + GF_LOG_WARNING, + "Extended attributes not " + "supported by filesystem"); + } else if (op_errno != ENOENT || + !posix_special_xattr(marker_xattrs, k)) { + if (filler->real_path) + gf_msg(this->name, fop_log_level(GF_FOP_XATTROP, op_errno), + op_errno, P_MSG_XATTR_FAILED, + "getxattr failed on %s while " + "doing xattrop: Key:%s ", + filler->real_path, k); + else + gf_msg( + this->name, GF_LOG_ERROR, op_errno, P_MSG_XATTR_FAILED, + "fgetxattr failed on gfid=%s " + "while doing xattrop: " + "Key:%s (%s)", + uuid_utoa(filler->inode->gfid), k, strerror(op_errno)); + } + + op_ret = -1; + goto unlock; + } + + if (size == -1 && optype == GF_XATTROP_GET_AND_SET) { + GF_FREE(array); + array = NULL; + } + + /* We only write back the xattr if it has been really modified + * (i.e. v->data is not all 0's). Otherwise we return its value + * but we don't update anything. + * + * If the xattr does not exist, a value of all 0's is returned + * without creating it. */ + size = count; + if (optype != GF_XATTROP_GET_AND_SET && + mem_0filled(v->data, v->len) == 0) + goto unlock; + + dst_data = array; + switch (optype) { + case GF_XATTROP_ADD_ARRAY: + __add_array((int32_t *)array, (int32_t *)v->data, count / 4); + break; + + case GF_XATTROP_ADD_ARRAY64: + __add_long_array((int64_t *)array, (int64_t *)v->data, + count / 8); + break; + + case GF_XATTROP_ADD_ARRAY_WITH_DEFAULT: + __add_array_with_default((int32_t *)array, (int32_t *)v->data, + count / 4); + break; + + case GF_XATTROP_ADD_ARRAY64_WITH_DEFAULT: + __add_long_array_with_default((int64_t *)array, + (int64_t *)v->data, count / 8); + break; + + case GF_XATTROP_GET_AND_SET: + dst_data = v->data; + break; + + default: + gf_msg(this->name, GF_LOG_ERROR, EINVAL, P_MSG_UNKNOWN_OP, + "Unknown xattrop type (%d)" + " on %s. 
Please send a bug report to " + "gluster-devel@gluster.org", + optype, filler->real_path); + op_ret = -1; + op_errno = EINVAL; + goto unlock; + } + + if (filler->real_path) { + size = sys_lsetxattr(filler->real_path, k, dst_data, count, 0); + } else { + size = sys_fsetxattr(filler->fdnum, k, (char *)dst_data, count, 0); + } + op_errno = errno; + } +unlock: + pthread_mutex_unlock(&ctx->xattrop_lock); + + if (op_ret == -1) + goto out; + + if (size == -1) { + if (filler->real_path) + gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_XATTR_FAILED, + "setxattr failed on %s " + "while doing xattrop: key=%s", + filler->real_path, k); + else + gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_XATTR_FAILED, + "fsetxattr failed on gfid=%s while doing " + "xattrop: key=%s (%s)", + uuid_utoa(filler->inode->gfid), k, strerror(op_errno)); + op_ret = -1; + goto out; + } else if (array) { + op_ret = dict_set_bin(filler->xattr, k, array, count); + if (op_ret) { + if (filler->real_path) + gf_msg_debug(this->name, 0, + "dict_set_bin failed (path=%s): " + "key=%s (%s)", + filler->real_path, k, strerror(-size)); + else + gf_msg_debug(this->name, 0, + "dict_set_bin failed (gfid=%s): " + "key=%s (%s)", + uuid_utoa(filler->inode->gfid), k, + strerror(-size)); + + op_ret = -1; + op_errno = EINVAL; + GF_FREE(array); + array = NULL; + goto out; + } + array = NULL; + } + +out: + if (op_ret < 0) + filler->op_errno = op_errno; + + if (array) + GF_FREE(array); + + return op_ret; +} + +/** + * xattrop - xattr operations - for internal use by GlusterFS + * @optype: ADD_ARRAY: + * dict should contain: + * "key" ==> array of 32-bit numbers + */ + +int +do_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) +{ + int op_ret = 0; + int op_errno = 0; + int _fd = -1; + char *real_path = NULL; + struct posix_fd *pfd = NULL; + inode_t *inode = NULL; + posix_xattr_filler_t filler = { + 0, + }; + dict_t *xattr_rsp = NULL; + dict_t 
*xdata_rsp = NULL; + struct iatt stbuf = {0}; + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(xattr, out); + VALIDATE_OR_GOTO(this, out); + + if (fd) { + op_ret = posix_fd_ctx_get(fd, this, &pfd, &op_errno); + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, + fop_log_level(GF_FOP_FXATTROP, op_errno), + P_MSG_PFD_GET_FAILED, + "failed to get pfd from" + " fd=%p", + fd); + goto out; + } + _fd = pfd->fd; + } + + if (loc && !gf_uuid_is_null(loc->gfid)) { + MAKE_INODE_HANDLE(real_path, this, loc, NULL); + if (!real_path) { + op_ret = -1; + op_errno = ESTALE; + goto out; + } + } + + if (real_path) { + inode = loc->inode; + } else if (fd) { + inode = fd->inode; + } + + xattr_rsp = dict_new(); + if (xattr_rsp == NULL) { + op_ret = -1; + op_errno = ENOMEM; + goto out; + } + + filler.this = this; + filler.fdnum = _fd; + filler.real_path = real_path; + filler.flags = (int)optype; + filler.inode = inode; + filler.xattr = xattr_rsp; + + op_ret = dict_foreach(xattr, _posix_handle_xattr_keyvalue_pair, &filler); + op_errno = filler.op_errno; + if (op_ret < 0) + goto out; + + if (!xdata) + goto out; + + if (fd) { + op_ret = posix_fdstat(this, inode, _fd, &stbuf); + } else { + op_ret = posix_pstat(this, inode, inode->gfid, real_path, &stbuf, + _gf_false); + } + if (op_ret < 0) { + op_errno = errno; + goto out; + } + xdata_rsp = posix_xattr_fill(this, real_path, loc, fd, _fd, xdata, &stbuf); + if (!xdata_rsp) { + op_ret = -1; + op_errno = ENOMEM; + } + posix_set_mode_in_dict(xdata, xdata_rsp, &stbuf); +out: + + STACK_UNWIND_STRICT(xattrop, frame, op_ret, op_errno, xattr_rsp, xdata_rsp); + + if (xattr_rsp) + dict_unref(xattr_rsp); + + if (xdata_rsp) + dict_unref(xdata_rsp); + return 0; +} + +int +posix_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) +{ + do_xattrop(frame, this, loc, NULL, optype, xattr, xdata); + return 0; +} + +int +posix_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd, + 
gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) +{ + do_xattrop(frame, this, NULL, fd, optype, xattr, xdata); + return 0; +} + +int +posix_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, + dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + char *real_path = NULL; + + DECLARE_OLD_FS_ID_VAR; + SET_FS_ID(frame->root->uid, frame->root->gid); + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(loc, out); + + MAKE_INODE_HANDLE(real_path, this, loc, NULL); + if (!real_path) { + op_ret = -1; + op_errno = errno; + goto out; + } + + op_ret = sys_access(real_path, mask & 07); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_ACCESS_FAILED, + "access failed on %s", real_path); + goto out; + } + op_ret = 0; + +out: + SET_TO_OLD_FS_ID(); + + STACK_UNWIND_STRICT(access, frame, op_ret, op_errno, NULL); + return 0; +} + +int32_t +posix_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + int _fd = -1; + struct iatt preop = { + 0, + }; + struct iatt postop = { + 0, + }; + struct posix_fd *pfd = NULL; + int ret = -1; + struct posix_private *priv = NULL; + dict_t *rsp_xdata = NULL; + + DECLARE_OLD_FS_ID_VAR; + SET_FS_ID(frame->root->uid, frame->root->gid); + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(fd, out); + + priv = this->private; + VALIDATE_OR_GOTO(priv, out); + + ret = posix_fd_ctx_get(fd, this, &pfd, &op_errno); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, op_errno, P_MSG_PFD_NULL, + "pfd is NULL, fd=%p", fd); + goto out; + } + + _fd = pfd->fd; + + op_ret = posix_fdstat(this, fd->inode, _fd, &preop); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_FSTAT_FAILED, + "pre-operation fstat failed on fd=%p", fd); + goto out; + } + + if (xdata) { + op_ret = posix_cs_maintenance(this, fd, NULL, 
&_fd, &preop, NULL, xdata, + &rsp_xdata, _gf_false); + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "file state check failed, fd %p", fd); + op_errno = EIO; + goto out; + } + } + + posix_update_iatt_buf(&preop, _fd, NULL, xdata); + op_ret = sys_ftruncate(_fd, offset); + + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_TRUNCATE_FAILED, + "ftruncate failed on fd=%p (%" PRId64 ")", fd, offset); + goto out; + } + + op_ret = posix_fdstat(this, fd->inode, _fd, &postop); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_FSTAT_FAILED, + "post-operation fstat failed on fd=%p", fd); + goto out; + } + + posix_set_ctime(frame, this, NULL, pfd->fd, fd->inode, &postop); + + op_ret = 0; + +out: + SET_TO_OLD_FS_ID(); + + STACK_UNWIND_STRICT(ftruncate, frame, op_ret, op_errno, &preop, &postop, + NULL); + + /* rsp_xdata may have been populated by posix_cs_maintenance(); it is not + * handed to the caller, so drop our reference here. */ + if (rsp_xdata) + dict_unref(rsp_xdata); + + return 0; +} + +int32_t +posix_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + int _fd = -1; + int32_t op_ret = -1; + int32_t op_errno = 0; + struct iatt buf = { + 0, + }; + struct posix_fd *pfd = NULL; + dict_t *xattr_rsp = NULL; + int ret = -1; + struct posix_private *priv = NULL; + + DECLARE_OLD_FS_ID_VAR; + SET_FS_ID(frame->root->uid, frame->root->gid); + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(fd, out); + + priv = this->private; + VALIDATE_OR_GOTO(priv, out); + + if (!xdata) + gf_msg_trace(this->name, 0, "null xdata passed, fd %p", fd); + + ret = posix_fd_ctx_get(fd, this, &pfd, &op_errno); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, op_errno, P_MSG_PFD_NULL, + "pfd is NULL, fd=%p", fd); + goto out; + } + + _fd = pfd->fd; + + op_ret = posix_fdstat(this, fd->inode, _fd, &buf); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_FSTAT_FAILED, + "fstat failed on fd=%p", fd); + goto out; + } + + if (xdata) { + xattr_rsp = posix_xattr_fill(this, NULL, NULL, fd, _fd, xdata, 
&buf); + + op_ret = posix_cs_maintenance(this, fd, NULL, &_fd, &buf, NULL, xdata, + &xattr_rsp, _gf_false); + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "file state check failed, fd %p", fd); + } + posix_cs_build_xattr_rsp(this, &xattr_rsp, xdata, _fd, NULL); + } + + posix_update_iatt_buf(&buf, _fd, NULL, xdata); + op_ret = 0; + +out: + SET_TO_OLD_FS_ID(); + + STACK_UNWIND_STRICT(fstat, frame, op_ret, op_errno, &buf, xattr_rsp); + if (xattr_rsp) + dict_unref(xattr_rsp); + return 0; +} + +int32_t +posix_lease(call_frame_t *frame, xlator_t *this, loc_t *loc, + struct gf_lease *lease, dict_t *xdata) +{ + struct gf_lease nullease = { + 0, + }; + + gf_msg(this->name, GF_LOG_CRITICAL, EINVAL, P_MSG_LEASE_DISABLED, + "\"features/leases\" translator is not loaded. You need" + "to use it for proper functioning of your application"); + + STACK_UNWIND_STRICT(lease, frame, -1, ENOSYS, &nullease, NULL); + return 0; +} + +static int gf_posix_lk_log; + +int32_t +posix_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, + struct gf_flock *lock, dict_t *xdata) +{ + struct gf_flock nullock = { + 0, + }; + + GF_LOG_OCCASIONALLY(gf_posix_lk_log, this->name, GF_LOG_CRITICAL, + "\"features/locks\" translator is " + "not loaded. You need to use it for proper " + "functioning of your application."); + + STACK_UNWIND_STRICT(lk, frame, -1, ENOSYS, &nullock, NULL); + return 0; +} + +int32_t +posix_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, + loc_t *loc, int32_t cmd, struct gf_flock *lock, dict_t *xdata) +{ + GF_LOG_OCCASIONALLY(gf_posix_lk_log, this->name, GF_LOG_CRITICAL, + "\"features/locks\" translator is " + "not loaded. 
You need to use it for proper " + "functioning of your application."); + + STACK_UNWIND_STRICT(inodelk, frame, -1, ENOSYS, NULL); + return 0; +} + +int32_t +posix_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, + fd_t *fd, int32_t cmd, struct gf_flock *lock, dict_t *xdata) +{ + GF_LOG_OCCASIONALLY(gf_posix_lk_log, this->name, GF_LOG_CRITICAL, + "\"features/locks\" translator is " + "not loaded. You need to use it for proper " + "functioning of your application."); + + STACK_UNWIND_STRICT(finodelk, frame, -1, ENOSYS, NULL); + return 0; +} + +int32_t +posix_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, + loc_t *loc, const char *basename, entrylk_cmd cmd, + entrylk_type type, dict_t *xdata) +{ + GF_LOG_OCCASIONALLY(gf_posix_lk_log, this->name, GF_LOG_CRITICAL, + "\"features/locks\" translator is " + "not loaded. You need to use it for proper " + "functioning of your application."); + + STACK_UNWIND_STRICT(entrylk, frame, -1, ENOSYS, NULL); + return 0; +} + +int32_t +posix_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, + fd_t *fd, const char *basename, entrylk_cmd cmd, + entrylk_type type, dict_t *xdata) +{ + GF_LOG_OCCASIONALLY(gf_posix_lk_log, this->name, GF_LOG_CRITICAL, + "\"features/locks\" translator is " + "not loaded. 
You need to use it for proper " + "functioning of your application."); + + STACK_UNWIND_STRICT(fentrylk, frame, -1, ENOSYS, NULL); + return 0; +} + +int +posix_fill_readdir(fd_t *fd, DIR *dir, off_t off, size_t size, + gf_dirent_t *entries, xlator_t *this, int32_t skip_dirs) +{ + off_t in_case = -1; + off_t last_off = 0; + size_t filled = 0; + int count = 0; + int32_t this_size = -1; + gf_dirent_t *this_entry = NULL; + struct posix_fd *pfd = NULL; + struct stat stbuf = { + 0, + }; + char *hpath = NULL; + int len = 0; + int ret = 0; + int op_errno = 0; + struct dirent *entry = NULL; + struct dirent scratch[2] = { + { + 0, + }, + }; + + ret = posix_fd_ctx_get(fd, this, &pfd, &op_errno); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, op_errno, P_MSG_PFD_NULL, + "pfd is NULL, fd=%p", fd); + count = -1; + errno = op_errno; + goto out; + } + + if (skip_dirs) { + hpath = alloca(PATH_MAX); + len = posix_handle_path(this, fd->inode->gfid, NULL, hpath, PATH_MAX); + if (len <= 0) { + errno = ESTALE; + count = -1; + goto out; + } + len = strlen(hpath); + hpath[len] = '/'; + } + + if (!off) { + rewinddir(dir); + } else { + seekdir(dir, off); +#ifndef GF_LINUX_HOST_OS + if ((u_long)telldir(dir) != off && off != pfd->dir_eof) { + gf_msg(THIS->name, GF_LOG_ERROR, EINVAL, P_MSG_DIR_OPERATION_FAILED, + "seekdir(0x%llx) failed on dir=%p: " + "Invalid argument (offset reused from " + "another DIR * structure?)", + off, dir); + errno = EINVAL; + count = -1; + goto out; + } +#endif /* GF_LINUX_HOST_OS */ + } + + while (filled <= size) { + in_case = (u_long)telldir(dir); + + if (in_case == -1) { + gf_msg(THIS->name, GF_LOG_ERROR, errno, P_MSG_DIR_OPERATION_FAILED, + "telldir failed on dir=%p", dir); + goto out; + } + + errno = 0; + + entry = sys_readdir(dir, scratch); + + if (!entry || errno != 0) { + if (errno == EBADF) { + gf_msg(THIS->name, GF_LOG_WARNING, errno, + P_MSG_DIR_OPERATION_FAILED, "readdir failed on dir=%p", + dir); + goto out; + } + break; + } + +#ifdef __NetBSD__ 
+ /* + * NetBSD with UFS1 backend uses backing files for + * extended attributes. They can be found in a + * .attribute file located at the root of the filesystem + * We hide it to glusterfs clients, since chaos will occur + * when the cluster/dht xlator decides to distribute + * exended attribute backing file across storage servers. + */ + if (__is_root_gfid(fd->inode->gfid) == 0 && + (!strcmp(entry->d_name, ".attribute"))) + continue; +#endif /* __NetBSD__ */ + + if (__is_root_gfid(fd->inode->gfid) && + (!strcmp(GF_HIDDEN_PATH, entry->d_name))) { + continue; + } + + if (skip_dirs) { + if (DT_ISDIR(entry->d_type)) { + continue; + } else if (hpath) { + strcpy(&hpath[len + 1], entry->d_name); + ret = sys_lstat(hpath, &stbuf); + if (!ret && S_ISDIR(stbuf.st_mode)) + continue; + } + } + + this_size = max(sizeof(gf_dirent_t), sizeof(gfs3_dirplist)) + + strlen(entry->d_name) + 1; + + if (this_size + filled > size) { + seekdir(dir, in_case); +#ifndef GF_LINUX_HOST_OS + if ((u_long)telldir(dir) != in_case && in_case != pfd->dir_eof) { + gf_msg(THIS->name, GF_LOG_ERROR, EINVAL, + P_MSG_DIR_OPERATION_FAILED, + "seekdir(0x%llx) failed on dir=%p: " + "Invalid argument (offset reused from " + "another DIR * structure?)", + in_case, dir); + errno = EINVAL; + count = -1; + goto out; + } +#endif /* GF_LINUX_HOST_OS */ + break; + } + + this_entry = gf_dirent_for_name(entry->d_name); + + if (!this_entry) { + gf_msg(THIS->name, GF_LOG_ERROR, errno, + P_MSG_GF_DIRENT_CREATE_FAILED, + "could not create " + "gf_dirent for entry %s", + entry->d_name); + goto out; + } + /* + * we store the offset of next entry here, which is + * probably not intended, but code using syncop_readdir() + * (glfs-heal.c, afr-self-heald.c, pump.c) rely on it + * for directory read resumption. 
+ */ + last_off = (u_long)telldir(dir); + this_entry->d_off = last_off; + this_entry->d_ino = entry->d_ino; + this_entry->d_type = entry->d_type; + + list_add_tail(&this_entry->list, &entries->list); + + filled += this_size; + count++; + } + + if ((!sys_readdir(dir, scratch) && (errno == 0))) { + /* Indicate EOF */ + errno = ENOENT; + /* Remember EOF offset for later detection */ + pfd->dir_eof = (u_long)last_off; + } +out: + return count; +} + +dict_t * +posix_entry_xattr_fill(xlator_t *this, inode_t *inode, fd_t *fd, + char *entry_path, dict_t *dict, struct iatt *stbuf) +{ + loc_t tmp_loc = { + 0, + }; + + /* if we don't send the 'loc', open-fd-count be a problem. */ + tmp_loc.inode = inode; + + return posix_xattr_fill(this, entry_path, &tmp_loc, NULL, -1, dict, stbuf); +} + +int +posix_readdirp_fill(xlator_t *this, fd_t *fd, gf_dirent_t *entries, + dict_t *dict) +{ + gf_dirent_t *entry = NULL; + inode_table_t *itable = NULL; + inode_t *inode = NULL; + char *hpath = NULL; + int len = 0; + struct iatt stbuf = { + 0, + }; + uuid_t gfid; + int ret = -1; + + if (list_empty(&entries->list)) + return 0; + + itable = fd->inode->table; + + hpath = alloca(PATH_MAX); + len = posix_handle_path(this, fd->inode->gfid, NULL, hpath, PATH_MAX); + if (len <= 0) { + gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_HANDLEPATH_FAILED, + "Failed to create handle path, fd=%p, gfid=%s", fd, + uuid_utoa(fd->inode->gfid)); + return -1; + } + len = strlen(hpath); + hpath[len] = '/'; + + list_for_each_entry(entry, &entries->list, list) + { + inode = inode_grep(fd->inode->table, fd->inode, entry->d_name); + if (inode) + gf_uuid_copy(gfid, inode->gfid); + else + bzero(gfid, 16); + + strcpy(&hpath[len + 1], entry->d_name); + + ret = posix_pstat(this, inode, gfid, hpath, &stbuf, _gf_false); + + if (ret == -1) { + if (inode) + inode_unref(inode); + continue; + } + + posix_update_iatt_buf(&stbuf, -1, hpath, dict); + + if (!inode) + inode = inode_find(itable, stbuf.ia_gfid); + + if (!inode) + inode = 
inode_new(itable); + + entry->inode = inode; + + if (dict) { + entry->dict = posix_entry_xattr_fill(this, entry->inode, fd, hpath, + dict, &stbuf); + } + + entry->d_stat = stbuf; + if (stbuf.ia_ino) + entry->d_ino = stbuf.ia_ino; + + if (entry->d_type == DT_UNKNOWN && !IA_ISINVAL(stbuf.ia_type)) { + /* The platform supports d_type but the underlying + filesystem doesn't. We set d_type to the correct + value from ia_type */ + entry->d_type = gf_d_type_from_ia_type(stbuf.ia_type); + } + + inode = NULL; + } + + return 0; +} + +int32_t +posix_do_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, int whichop, dict_t *dict) +{ + struct posix_fd *pfd = NULL; + DIR *dir = NULL; + int ret = -1; + int count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + gf_dirent_t entries; + int32_t skip_dirs = 0; + + /* Initialise the list head before any goto out: the out path unwinds with + * &entries and then calls gf_dirent_free() on it. */ + INIT_LIST_HEAD(&entries.list); + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(fd, out); + + ret = posix_fd_ctx_get(fd, this, &pfd, &op_errno); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, op_errno, P_MSG_PFD_NULL, + "pfd is NULL, fd=%p", fd); + goto out; + } + + dir = pfd->dir; + + if (!dir) { + gf_msg(this->name, GF_LOG_WARNING, EINVAL, P_MSG_PFD_NULL, + "dir is NULL for fd=%p", fd); + op_errno = EINVAL; + goto out; + } + + /* When READDIR_FILTER option is set to on, we can filter out + * directory's entry from the entry->list. + */ + ret = dict_get_int32(dict, GF_READDIR_SKIP_DIRS, &skip_dirs); + + LOCK(&fd->lock); + { + /* posix_fill_readdir performs multiple separate individual + readdir() calls to fill up the buffer. + + In case of NFS where the same anonymous FD is shared between + different applications, reading a common directory can + result in the anonymous fd getting re-used unsafely between + the two readdir requests (in two different io-threads). + + It would also help, in the future, to replace the loop + around readdir() with a single large getdents() call. 
+ */ + count = posix_fill_readdir(fd, dir, off, size, &entries, this, + skip_dirs); + } + UNLOCK(&fd->lock); + + /* pick ENOENT to indicate EOF */ + op_errno = errno; + op_ret = count; + + if (whichop != GF_FOP_READDIRP) + goto out; + + posix_readdirp_fill(this, fd, &entries, dict); + +out: + if (whichop == GF_FOP_READDIR) + STACK_UNWIND_STRICT(readdir, frame, op_ret, op_errno, &entries, NULL); + else + STACK_UNWIND_STRICT(readdirp, frame, op_ret, op_errno, &entries, NULL); + + gf_dirent_free(&entries); + + return 0; +} + +int32_t +posix_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *xdata) +{ + posix_do_readdir(frame, this, fd, size, off, GF_FOP_READDIR, xdata); + return 0; +} + +int32_t +posix_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *dict) +{ + gf_dirent_t entries; + int32_t op_ret = -1, op_errno = 0; + gf_dirent_t *entry = NULL; + + if ((dict != NULL) && (dict_get(dict, GET_ANCESTRY_DENTRY_KEY))) { + INIT_LIST_HEAD(&entries.list); + + op_ret = posix_get_ancestry(this, fd->inode, &entries, NULL, + POSIX_ANCESTRY_DENTRY, &op_errno, dict); + if (op_ret >= 0) { + op_ret = 0; + + list_for_each_entry(entry, &entries.list, list) { op_ret++; } + } + + STACK_UNWIND_STRICT(readdirp, frame, op_ret, op_errno, &entries, NULL); + + gf_dirent_free(&entries); + return 0; + } + + posix_do_readdir(frame, this, fd, size, off, GF_FOP_READDIRP, dict); + return 0; +} + +int32_t +posix_rchecksum(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + int32_t len, dict_t *xdata) +{ + char *alloc_buf = NULL; + char *buf = NULL; + int _fd = -1; + struct posix_fd *pfd = NULL; + int op_ret = -1; + int op_errno = 0; + int ret = 0; + ssize_t bytes_read = 0; + int32_t weak_checksum = 0; + int32_t zerofillcheck = 0; + /* Protocol version 4 uses 32 bytes i.e SHA256_DIGEST_LENGTH, + so this is used. 
*/ + unsigned char md5_checksum[SHA256_DIGEST_LENGTH] = {0}; + unsigned char strong_checksum[SHA256_DIGEST_LENGTH] = {0}; + unsigned char *checksum = NULL; + struct posix_private *priv = NULL; + dict_t *rsp_xdata = NULL; + gf_boolean_t buf_has_zeroes = _gf_false; + struct iatt preop = { + 0, + }; + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(fd, out); + + priv = this->private; + + alloc_buf = _page_aligned_alloc(len, &buf); + if (!alloc_buf) { + op_errno = ENOMEM; + goto out; + } + + rsp_xdata = dict_new(); + if (!rsp_xdata) { + op_errno = ENOMEM; + goto out; + } + + ret = posix_fd_ctx_get(fd, this, &pfd, &op_errno); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, -ret, P_MSG_PFD_NULL, + "pfd is NULL, fd=%p", fd); + goto out; + } + + _fd = pfd->fd; + + if (xdata) { + op_ret = posix_fdstat(this, fd->inode, _fd, &preop); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_FSTAT_FAILED, + "pre-operation fstat failed on fd=%p", fd); + goto out; + } + + op_ret = posix_cs_maintenance(this, fd, NULL, &_fd, &preop, NULL, xdata, + &rsp_xdata, _gf_false); + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "file state check failed, fd %p", fd); + op_errno = EIO; + goto out; + } + } + + LOCK(&fd->lock); + { + if (priv->aio_capable && priv->aio_init_done) + __posix_fd_set_odirect(fd, pfd, 0, offset, len); + + bytes_read = sys_pread(_fd, buf, len, offset); + if (bytes_read < 0) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_PREAD_FAILED, + "pread of %d bytes returned %zd", len, bytes_read); + + op_errno = errno; + } + } + UNLOCK(&fd->lock); + + if (bytes_read < 0) + goto out; + + if (xdata && + dict_get_int32(xdata, "check-zero-filled", &zerofillcheck) == 0) { + buf_has_zeroes = (mem_0filled(buf, bytes_read)) ? 
_gf_false : _gf_true; + ret = dict_set_uint32(rsp_xdata, "buf-has-zeroes", buf_has_zeroes); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, -ret, P_MSG_DICT_SET_FAILED, + "%s: Failed to set " + "dictionary value for key: %s", + uuid_utoa(fd->inode->gfid), "buf-has-zeroes"); + op_errno = -ret; + goto out; + } + } + weak_checksum = gf_rsync_weak_checksum((unsigned char *)buf, (size_t)ret); + + if (priv->fips_mode_rchecksum) { + ret = dict_set_int32(rsp_xdata, "fips-mode-rchecksum", 1); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, -ret, P_MSG_DICT_SET_FAILED, + "%s: Failed to set " + "dictionary value for key: %s", + uuid_utoa(fd->inode->gfid), "fips-mode-rchecksum"); + goto out; + } + checksum = strong_checksum; + gf_rsync_strong_checksum((unsigned char *)buf, (size_t)bytes_read, + (unsigned char *)checksum); + } else { + checksum = md5_checksum; + gf_rsync_md5_checksum((unsigned char *)buf, (size_t)bytes_read, + (unsigned char *)checksum); + } + op_ret = 0; + + posix_set_ctime(frame, this, NULL, _fd, fd->inode, NULL); + +out: + STACK_UNWIND_STRICT(rchecksum, frame, op_ret, op_errno, weak_checksum, + checksum, rsp_xdata); + if (rsp_xdata) + dict_unref(rsp_xdata); + GF_FREE(alloc_buf); + + return 0; +} + +int +posix_forget(xlator_t *this, inode_t *inode) +{ + int ret = 0; + char *unlink_path = NULL; + uint64_t ctx_uint1 = 0; + uint64_t ctx_uint2 = 0; + posix_inode_ctx_t *ctx = NULL; + posix_mdata_t *mdata = NULL; + struct posix_private *priv_posix = NULL; + + priv_posix = (struct posix_private *)this->private; + if (!priv_posix) + return 0; + + ret = inode_ctx_del2(inode, this, &ctx_uint1, &ctx_uint2); + if (!ctx_uint1) + goto check_ctx2; + + ctx = (posix_inode_ctx_t *)(uintptr_t)ctx_uint1; + + if (ctx->unlink_flag == GF_UNLINK_TRUE) { + POSIX_GET_FILE_UNLINK_PATH(priv_posix->base_path, inode->gfid, + unlink_path); + if (!unlink_path) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, P_MSG_UNLINK_FAILED, + "Failed to remove gfid :%s", uuid_utoa(inode->gfid)); + ret 
= -1; + goto ctx_free; + } + ret = sys_unlink(unlink_path); + } +ctx_free: + pthread_mutex_destroy(&ctx->xattrop_lock); + pthread_mutex_destroy(&ctx->write_atomic_lock); + pthread_mutex_destroy(&ctx->pgfid_lock); + GF_FREE(ctx); + +check_ctx2: + if (ctx_uint2) { + mdata = (posix_mdata_t *)(uintptr_t)ctx_uint2; + } + + GF_FREE(mdata); + return ret; +} diff --git a/xlators/storage/posix/src/posix-inode-handle.h b/xlators/storage/posix/src/posix-inode-handle.h new file mode 100644 index 00000000000..36c47f2bebc --- /dev/null +++ b/xlators/storage/posix/src/posix-inode-handle.h @@ -0,0 +1,118 @@ +/* + Copyright (c) 2011-2017 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef _POSIX_INODE_HANDLE_H +#define _POSIX_INODE_HANDLE_H + +#include <limits.h> +#include <sys/types.h> +#include <glusterfs/gf-dirent.h> +#include "posix.h" + +/* From Open Group Base Specifications Issue 6 */ +#ifndef _XOPEN_PATH_MAX +#define _XOPEN_PATH_MAX 1024 +#endif + +#define TRASH_DIR "landfill" + +#define UUID0_STR "00000000-0000-0000-0000-000000000000" +#define SLEN(str) (sizeof(str) - 1) + +#define LOC_HAS_ABSPATH(loc) (loc && (loc->path) && (loc->path[0] == '/')) +#define LOC_IS_DIR(loc) \ + (loc && (loc->inode) && (loc->inode->ia_type == IA_IFDIR)) +#define MAKE_REAL_PATH(var, this, path) \ + do { \ + size_t path_len = strlen(path); \ + size_t var_len = path_len + POSIX_BASE_PATH_LEN(this) + 1; \ + if (POSIX_PATH_MAX(this) != -1 && var_len >= POSIX_PATH_MAX(this)) { \ + var = alloca(path_len + 1); \ + strcpy(var, (path[0] == '/') ? 
path + 1 : path); \ + } else { \ + var = alloca(var_len); \ + strcpy(var, POSIX_BASE_PATH(this)); \ + strcpy(&var[POSIX_BASE_PATH_LEN(this)], path); \ + } \ + } while (0) + +#define MAKE_HANDLE_PATH(var, this, gfid, base) \ + do { \ + int __len = 0; \ + int tot = PATH_MAX; \ + var = alloca(tot); \ + __len = posix_handle_path(this, gfid, base, var, tot); \ + if (__len <= 0) { \ + var = NULL; \ + } \ + } while (0) + +/* TODO: it is not a good idea to change a variable which + is not passed to the macro.. Fix it later */ +#define MAKE_INODE_HANDLE(rpath, this, loc, iatt_p) \ + do { \ + if (!this->private) { \ + op_ret = -1; \ + gf_msg("make_inode_handle", GF_LOG_ERROR, 0, \ + P_MSG_INODE_HANDLE_CREATE, \ + "private is NULL, fini is already called"); \ + break; \ + } \ + if (gf_uuid_is_null(loc->gfid)) { \ + op_ret = -1; \ + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_INODE_HANDLE_CREATE, \ + "null gfid for path %s", (loc)->path); \ + break; \ + } \ + if (LOC_IS_DIR(loc) && LOC_HAS_ABSPATH(loc)) { \ + MAKE_REAL_PATH(rpath, this, (loc)->path); \ + op_ret = posix_pstat(this, (loc)->inode, (loc)->gfid, rpath, \ + iatt_p, _gf_false); \ + break; \ + } \ + errno = 0; \ + op_ret = posix_istat(this, loc->inode, loc->gfid, NULL, iatt_p); \ + if (errno != ELOOP) { \ + MAKE_HANDLE_PATH(rpath, this, (loc)->gfid, NULL); \ + if (!rpath) { \ + op_ret = -1; \ + gf_msg(this->name, GF_LOG_ERROR, errno, \ + P_MSG_INODE_HANDLE_CREATE, \ + "Failed to create inode handle " \ + "for path %s", \ + (loc)->path); \ + } \ + break; \ + } /* __ret == -1 && errno == ELOOP */ \ + else { \ + op_ret = -1; \ + } \ + } while (0) + +#define POSIX_ANCESTRY_PATH (1 << 0) +#define POSIX_ANCESTRY_DENTRY (1 << 1) + +int +posix_handle_path(xlator_t *this, uuid_t gfid, const char *basename, char *buf, + size_t len); + +int +posix_make_ancestryfromgfid(xlator_t *this, char *path, int pathsize, + gf_dirent_t *head, int type, uuid_t gfid, + const size_t handle_size, + const char *priv_base_path, inode_table_t 
*table, + inode_t **parent, dict_t *xdata, int32_t *op_errno); + +int +posix_handle_init(xlator_t *this); + +int +posix_handle_trash_init(xlator_t *this); + +#endif /* !_POSIX_INODE_HANDLE_H */ diff --git a/xlators/storage/posix/src/posix-mem-types.h b/xlators/storage/posix/src/posix-mem-types.h new file mode 100644 index 00000000000..2253f381ac5 --- /dev/null +++ b/xlators/storage/posix/src/posix-mem-types.h @@ -0,0 +1,25 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef __POSIX_MEM_TYPES_H__ +#define __POSIX_MEM_TYPES_H__ + +#include <glusterfs/mem-types.h> + +enum gf_posix_mem_types_ { + gf_posix_mt_posix_fd = gf_common_mt_end + 1, + gf_posix_mt_char, + gf_posix_mt_posix_private, + gf_posix_mt_trash_path, + gf_posix_mt_paiocb, + gf_posix_mt_inode_ctx_t, + gf_posix_mt_mdata_attr, + gf_posix_mt_end +}; +#endif diff --git a/xlators/storage/posix/src/posix-messages.h b/xlators/storage/posix/src/posix-messages.h new file mode 100644 index 00000000000..f5bede266da --- /dev/null +++ b/xlators/storage/posix/src/posix-messages.h @@ -0,0 +1,74 @@ +/* + Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _POSIX_MESSAGES_H_ +#define _POSIX_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. 
If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+GLFS_MSGID(POSIX, P_MSG_XATTR_FAILED, P_MSG_NULL_GFID, P_MSG_FCNTL_FAILED,
+           P_MSG_READV_FAILED, P_MSG_FSTAT_FAILED, P_MSG_PFD_NULL,
+           P_MSG_INVALID_ARGUMENT, P_MSG_IO_SUBMIT_FAILED, P_MSG_WRITEV_FAILED,
+           P_MSG_IO_GETEVENTS_FAILED, P_MSG_UNKNOWN_OP, P_MSG_AIO_UNAVAILABLE,
+           P_MSG_IO_SETUP_FAILED, P_MSG_ZEROFILL_FAILED, P_MSG_OPENDIR_FAILED,
+           P_MSG_DIRFD_FAILED, P_MSG_FD_PATH_SETTING_FAILED, P_MSG_LSTAT_FAILED,
+           P_MSG_READYLINK_FAILED, P_MSG_GFID_FAILED, P_MSG_CREATE_FAILED,
+           P_MSG_MKNOD_FAILED, P_MSG_LCHOWN_FAILED, P_MSG_ACL_FAILED,
+           P_MSG_MKDIR_NOT_PERMITTED, P_MSG_DIR_OF_SAME_ID, P_MSG_MKDIR_FAILED,
+           P_MSG_CHOWN_FAILED, P_MSG_UNLINK_FAILED, P_MSG_KEY_STATUS_INFO,
+           P_MSG_XATTR_STATUS, P_MSG_RMDIR_NOT_PERMITTED, P_MSG_RMDIR_FAILED,
+           P_MSG_DIR_OPERATION_FAILED, P_MSG_SYMLINK_FAILED, P_MSG_DIR_FOUND,
+           P_MSG_LINK_FAILED, P_MSG_TRUNCATE_FAILED, P_MSG_FILE_OP_FAILED,
+           P_MSG_READ_FAILED, P_MSG_DICT_SET_FAILED, P_MSG_STATVFS_FAILED,
+           P_MSG_DIR_NOT_NULL, P_MSG_FSYNC_FAILED, P_MSG_CLOSE_FAILED,
+           P_MSG_GETTING_FILENAME_FAILED, P_MSG_INODE_PATH_GET_FAILED,
+           P_MSG_GET_KEY_VALUE_FAILED, P_MSG_CHMOD_FAILED, P_MSG_FCHMOD_FAILED,
+           P_MSG_FCHOWN_FAILED, P_MSG_UTIMES_FAILED, P_MSG_FUTIMES_FAILED,
+           P_MSG_XATTR_NOT_REMOVED, P_MSG_PFD_GET_FAILED, P_MSG_ACCESS_FAILED,
+           P_MSG_PREAD_FAILED, P_MSG_UUID_NULL, P_MSG_EXPORT_DIR_MISSING,
+           P_MSG_SUBVOLUME_ERROR, P_MSG_VOLUME_DANGLING, P_MSG_INVALID_OPTION,
+           P_MSG_INVALID_VOLUME_ID, P_MSG_VOLUME_ID_ABSENT,
+           P_MSG_HOSTNAME_MISSING, P_MSG_SET_ULIMIT_FAILED,
+           P_MSG_SET_FILE_MAX_FAILED, P_MSG_MAX_FILE_OPEN, P_MSG_OPEN_FAILED,
+           P_MSG_LOOKUP_NOT_PERMITTED, P_MSG_RENAME_FAILED, P_MSG_WRITE_FAILED,
+           P_MSG_FILE_FAILED, P_MSG_THREAD_FAILED, P_MSG_HEALTHCHECK_FAILED,
+           P_MSG_GET_FDCTX_FAILED, P_MSG_HANDLEPATH_FAILED,
+           P_MSG_IPC_NOT_HANDLE, P_MSG_SET_XDATA_FAIL,
+           P_MSG_DURABILITY_REQ_NOT_SATISFIED, P_MSG_XATTR_NOTSUP,
+           P_MSG_GFID_SET_FAILED, P_MSG_ACL_NOTSUP, P_MSG_BASEPATH_CHDIR_FAILED,
+           P_MSG_INVALID_OPTION_VAL, P_MSG_INVALID_NODE_UUID,
+           P_MSG_FSYNCER_THREAD_CREATE_FAILED, P_MSG_GF_DIRENT_CREATE_FAILED,
+           P_MSG_VOLUME_ID_FETCH_FAILED, P_MSG_UNKNOWN_ARGUMENT,
+           P_MSG_INODE_HANDLE_CREATE, P_MSG_ENTRY_HANDLE_CREATE, P_MSG_PGFID_OP,
+           P_MSG_POSIX_AIO, P_MSG_HANDLE_CREATE_TRASH, P_MSG_HANDLE_CREATE,
+           P_MSG_HANDLE_PATH_CREATE, P_MSG_SET_FILE_CONTENTS,
+           P_MSG_XDATA_GETXATTR, P_MSG_STALE_HANDLE_REMOVE_FAILED,
+           P_MSG_HANDLE_PATH_CREATE_FAILED, P_MSG_HANDLE_TRASH_CREATE,
+           P_MSG_HANDLE_DELETE, P_MSG_READLINK_FAILED, P_MSG_BUFFER_OVERFLOW,
+           P_MSG_SEEK_UNKOWN, P_MSG_SEEK_FAILED, P_MSG_INODE_RESOLVE_FAILED,
+           P_MSG_PREOP_CHECK_FAILED, P_MSG_LEASE_DISABLED,
+           P_MSG_ANCESTORY_FAILED, P_MSG_DISK_SPACE_CHECK_FAILED,
+           P_MSG_FALLOCATE_FAILED, P_MSG_STOREMDATA_FAILED,
+           P_MSG_FETCHMDATA_FAILED, P_MSG_GETMDATA_FAILED,
+           P_MSG_SETMDATA_FAILED, P_MSG_FRESHFILE, P_MSG_MUTEX_FAILED,
+           P_MSG_COPY_FILE_RANGE_FAILED, P_MSG_TIMER_DELETE_FAILED, P_MSG_NOMEM,
+           P_MSG_PSTAT_FAILED, P_MSG_FDSTAT_FAILED);
+
+#endif /* !_POSIX_MESSAGES_H_ */
diff --git a/xlators/storage/posix/src/posix-metadata-disk.h b/xlators/storage/posix/src/posix-metadata-disk.h
new file mode 100644
index 00000000000..8833fbb5428
--- /dev/null
+++ b/xlators/storage/posix/src/posix-metadata-disk.h
@@ -0,0 +1,31 @@
+/*
+   Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef _POSIX_METADATA_DISK_H
+#define _POSIX_METADATA_DISK_H
+
+typedef struct gf_timespec_disk {
+    uint64_t tv_sec;  /* seconds part of the timestamp */
+    uint64_t tv_nsec; /* nanoseconds part of the timestamp */
+} gf_timespec_disk_t;
+
+/* posix_mdata_t on disk structure
+ *
+ * This is the value stored under GF_XATTR_MDATA_KEY. posix_mdata_to_disk()
+ * and posix_mdata_from_disk() in posix-metadata.c convert every 64-bit
+ * field with htobe64()/be64toh(), so the stored fields are big-endian. The
+ * struct is declared packed; NOTE(review): that implies no padding after
+ * the one-byte version field, and therefore a fixed xattr size -- confirm
+ * before changing any member.
+ */
+
+typedef struct __attribute__((__packed__)) posix_mdata_disk {
+    /* version of structure, bumped up if any new member is added */
+    uint8_t version;
+    /* flags indicates valid fields in the structure */
+    uint64_t flags;
+    gf_timespec_disk_t ctime;
+    gf_timespec_disk_t mtime;
+    gf_timespec_disk_t atime;
+} posix_mdata_disk_t;
+
+#endif /* _POSIX_METADATA_DISK_H */
diff --git a/xlators/storage/posix/src/posix-metadata.c b/xlators/storage/posix/src/posix-metadata.c
new file mode 100644
index 00000000000..b1889052f11
--- /dev/null
+++ b/xlators/storage/posix/src/posix-metadata.c
@@ -0,0 +1,916 @@
+/*
+   Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/ + +#include <glusterfs/xlator.h> +#include "posix-metadata.h" +#include "posix-metadata-disk.h" +#include "posix-handle.h" +#include "posix-messages.h" +#include <glusterfs/syscall.h> +#include <glusterfs/compat-errno.h> +#include <glusterfs/compat.h> + +static int gf_posix_xattr_enotsup_log; + +/* posix_mdata_to_disk converts posix_mdata_t into network byte order to + * save it on disk in machine independent format + */ +static inline void +posix_mdata_to_disk(posix_mdata_disk_t *out, posix_mdata_t *in) +{ + out->version = in->version; + out->flags = htobe64(in->flags); + + out->ctime.tv_sec = htobe64(in->ctime.tv_sec); + out->ctime.tv_nsec = htobe64(in->ctime.tv_nsec); + + out->mtime.tv_sec = htobe64(in->mtime.tv_sec); + out->mtime.tv_nsec = htobe64(in->mtime.tv_nsec); + + out->atime.tv_sec = htobe64(in->atime.tv_sec); + out->atime.tv_nsec = htobe64(in->atime.tv_nsec); +} + +/* posix_mdata_from_disk converts posix_mdata_disk_t into host byte order + */ +static inline void +posix_mdata_from_disk(posix_mdata_t *out, posix_mdata_disk_t *in) +{ + out->version = in->version; + out->flags = be64toh(in->flags); + + out->ctime.tv_sec = be64toh(in->ctime.tv_sec); + out->ctime.tv_nsec = be64toh(in->ctime.tv_nsec); + + out->mtime.tv_sec = be64toh(in->mtime.tv_sec); + out->mtime.tv_nsec = be64toh(in->mtime.tv_nsec); + + out->atime.tv_sec = be64toh(in->atime.tv_sec); + out->atime.tv_nsec = be64toh(in->atime.tv_nsec); +} + +void +posix_mdata_iatt_from_disk(struct mdata_iatt *out, posix_mdata_disk_t *in) +{ + out->ia_ctime = be64toh(in->ctime.tv_sec); + out->ia_ctime_nsec = be64toh(in->ctime.tv_nsec); + + out->ia_mtime = be64toh(in->mtime.tv_sec); + out->ia_mtime_nsec = be64toh(in->mtime.tv_nsec); + + out->ia_atime = be64toh(in->atime.tv_sec); + out->ia_atime_nsec = be64toh(in->atime.tv_nsec); +} + +/* posix_fetch_mdata_xattr fetches the posix_mdata_t from disk */ +static int +posix_fetch_mdata_xattr(xlator_t *this, const char *real_path_arg, int _fd, + inode_t *inode, 
posix_mdata_t *metadata, int *op_errno) +{ + size_t size = 256; + int op_ret = -1; + char *value = NULL; + gf_boolean_t fd_based_fop = _gf_false; + char gfid_str[64] = {0}; + char *real_path = NULL; + + if (!metadata) { + goto out; + } + + if (_fd != -1) { + fd_based_fop = _gf_true; + } + if (!(fd_based_fop || real_path_arg)) { + GF_VALIDATE_OR_GOTO(this->name, inode, out); + MAKE_HANDLE_PATH(real_path, this, inode->gfid, NULL); + if (!real_path) { + *op_errno = errno; + uuid_utoa_r(inode->gfid, gfid_str); + gf_msg(this->name, GF_LOG_WARNING, *op_errno, P_MSG_LSTAT_FAILED, + "lstat on gfid %s failed", gfid_str); + goto out; + } + } + + value = GF_MALLOC(size * sizeof(char), gf_posix_mt_char); + if (!value) { + *op_errno = ENOMEM; + goto out; + } + + if (fd_based_fop) { + size = sys_fgetxattr(_fd, GF_XATTR_MDATA_KEY, value, size); + } else if (real_path_arg) { + size = sys_lgetxattr(real_path_arg, GF_XATTR_MDATA_KEY, value, size); + } else if (real_path) { + size = sys_lgetxattr(real_path, GF_XATTR_MDATA_KEY, value, size); + } + + if (size == -1) { + *op_errno = errno; + if (value) { + GF_FREE(value); + value = NULL; + } + if ((*op_errno == ENOTSUP) || (*op_errno == ENOSYS)) { + GF_LOG_OCCASIONALLY(gf_posix_xattr_enotsup_log, this->name, + GF_LOG_WARNING, + "Extended attributes not supported" + " (try remounting brick with 'user xattr' " + "flag)"); + } else if (*op_errno == ENOATTR || *op_errno == ENODATA) { + gf_msg_debug(this->name, 0, + "No such attribute:%s for file %s gfid: %s", + GF_XATTR_MDATA_KEY, + real_path ? real_path + : (real_path_arg ? real_path_arg : "null"), + inode ? 
uuid_utoa(inode->gfid) : "null"); + goto out; + } + + if (fd_based_fop) { + size = sys_fgetxattr(_fd, GF_XATTR_MDATA_KEY, NULL, 0); + } else if (real_path_arg) { + size = sys_lgetxattr(real_path_arg, GF_XATTR_MDATA_KEY, NULL, 0); + } else if (real_path) { + size = sys_lgetxattr(real_path, GF_XATTR_MDATA_KEY, NULL, 0); + } + + if (size == -1) { /* give up now and exist with an error */ + *op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, *op_errno, P_MSG_XATTR_FAILED, + "getxattr failed on %s gfid: %s key: %s ", + real_path ? real_path + : (real_path_arg ? real_path_arg : "null"), + inode ? uuid_utoa(inode->gfid) : "null", GF_XATTR_MDATA_KEY); + goto out; + } + + value = GF_MALLOC(size * sizeof(char), gf_posix_mt_char); + if (!value) { + *op_errno = ENOMEM; + goto out; + } + + if (fd_based_fop) { + size = sys_fgetxattr(_fd, GF_XATTR_MDATA_KEY, value, size); + } else if (real_path_arg) { + size = sys_lgetxattr(real_path_arg, GF_XATTR_MDATA_KEY, value, + size); + } else if (real_path) { + size = sys_lgetxattr(real_path, GF_XATTR_MDATA_KEY, value, size); + } + if (size == -1) { + *op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, *op_errno, P_MSG_XATTR_FAILED, + "getxattr failed on %s gfid: %s key: %s ", + real_path ? real_path + : (real_path_arg ? real_path_arg : "null"), + inode ? 
uuid_utoa(inode->gfid) : "null", GF_XATTR_MDATA_KEY); + goto out; + } + } + posix_mdata_from_disk(metadata, (posix_mdata_disk_t *)value); + + op_ret = 0; +out: + if (value) + GF_FREE(value); + return op_ret; +} + +/* posix_store_mdata_xattr stores the posix_mdata_t on disk */ +static int +posix_store_mdata_xattr(xlator_t *this, const char *real_path_arg, int fd, + inode_t *inode, posix_mdata_t *metadata) +{ + char *real_path = NULL; + int op_ret = 0; + gf_boolean_t fd_based_fop = _gf_false; + char *key = GF_XATTR_MDATA_KEY; + char gfid_str[64] = {0}; + posix_mdata_disk_t disk_metadata; + + if (!metadata) { + op_ret = -1; + goto out; + } + + if (fd != -1) { + fd_based_fop = _gf_true; + } + if (!(fd_based_fop || real_path_arg)) { + MAKE_HANDLE_PATH(real_path, this, inode->gfid, NULL); + if (!real_path) { + uuid_utoa_r(inode->gfid, gfid_str); + gf_msg(this->name, GF_LOG_DEBUG, errno, P_MSG_LSTAT_FAILED, + "lstat on gfid %s failed", gfid_str); + op_ret = -1; + goto out; + } + } + + /* Set default version as 1 */ + posix_mdata_to_disk(&disk_metadata, metadata); + + if (fd_based_fop) { + op_ret = sys_fsetxattr(fd, key, (void *)&disk_metadata, + sizeof(posix_mdata_disk_t), 0); + } else if (real_path_arg) { + op_ret = sys_lsetxattr(real_path_arg, key, (void *)&disk_metadata, + sizeof(posix_mdata_disk_t), 0); + } else if (real_path) { + op_ret = sys_lsetxattr(real_path, key, (void *)&disk_metadata, + sizeof(posix_mdata_disk_t), 0); + } + +#ifdef GF_DARWIN_HOST_OS + if (real_path_arg) { + posix_dump_buffer(this, real_path_arg, key, value, 0); + } else if (real_path) { + posix_dump_buffer(this, real_path, key, value, 0); + } +#endif +out: + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_XATTR_FAILED, + "file: %s: gfid: %s key:%s ", + real_path ? real_path : (real_path_arg ? real_path_arg : "null"), + uuid_utoa(inode->gfid), key); + } + return op_ret; +} + +/* _posix_get_mdata_xattr gets posix_mdata_t from inode context. 
If it fails
+ * to get it from inode context, gets it from disk. This is without inode lock.
+ *
+ * When stbuf is given, its ctime/mtime/atime fields are overwritten with the
+ * values from the mdata. A missing xattr is not an error: the function then
+ * returns 0 and leaves stbuf untouched. Returns -1 only when the mdata
+ * buffer cannot be allocated.
+ */
+int
+__posix_get_mdata_xattr(xlator_t *this, const char *real_path, int _fd,
+                        inode_t *inode, struct iatt *stbuf)
+{
+    uint64_t ctx;
+    posix_mdata_t *mdata = NULL;
+    int ret = -1;
+    int op_errno = 0;
+
+    /* Handle readdirp: inode might be null, time attributes should be served
+     * from xattr not from backend's file attributes */
+    if (inode) {
+        ret = __inode_ctx_get1(inode, this, &ctx);
+        if (ret == 0) {
+            mdata = (posix_mdata_t *)(uintptr_t)ctx;
+        }
+    } else {
+        ret = -1;
+    }
+
+    if (ret == -1 || !mdata) {
+        mdata = GF_CALLOC(1, sizeof(posix_mdata_t), gf_posix_mt_mdata_attr);
+        if (!mdata) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, P_MSG_NOMEM,
+                   "Could not allocate mdata. file: %s: gfid: %s",
+                   real_path ? real_path : "null",
+                   inode ? uuid_utoa(inode->gfid) : "null");
+            ret = -1;
+            goto out;
+        }
+
+        ret = posix_fetch_mdata_xattr(this, real_path, _fd, inode, mdata,
+                                      &op_errno);
+
+        if (ret == 0) {
+            /* Got mdata from disk, set it in inode ctx. This case
+             * is hit when in-memory status is lost due to brick
+             * down scenario
+             */
+            if (inode) {
+                ctx = (uint64_t)(uintptr_t)mdata;
+                __inode_ctx_set1(inode, this, &ctx);
+            }
+        } else {
+            /* Failed to get mdata from disk, xattr missing.
+             * This happens when the file is created before
+             * ctime is enabled.
+             */
+            if (stbuf && op_errno != ENOENT) {
+                ret = 0;
+                GF_FREE(mdata);
+                goto out;
+            } else {
+                /* This case should not be hit. If it hits,
+                 * don't fail, log warning, free mdata and move
+                 * on
+                 */
+                gf_msg(this->name, GF_LOG_WARNING, op_errno,
+                       P_MSG_FETCHMDATA_FAILED, "file: %s: gfid: %s key:%s ",
+                       real_path ? real_path : "null",
+                       inode ? uuid_utoa(inode->gfid) : "null",
+                       GF_XATTR_MDATA_KEY);
+                GF_FREE(mdata);
+                ret = 0;
+                goto out;
+            }
+        }
+    }
+
+    ret = 0;
+
+    if (stbuf) {
+        stbuf->ia_ctime = mdata->ctime.tv_sec;
+        stbuf->ia_ctime_nsec = mdata->ctime.tv_nsec;
+        stbuf->ia_mtime = mdata->mtime.tv_sec;
+        stbuf->ia_mtime_nsec = mdata->mtime.tv_nsec;
+        stbuf->ia_atime = mdata->atime.tv_sec;
+        stbuf->ia_atime_nsec = mdata->atime.tv_nsec;
+    }
+    /* Not set in inode context, hence free mdata */
+    if (!inode) {
+        GF_FREE(mdata);
+    }
+
+out:
+    return ret;
+}
+
+/* posix_get_mdata_xattr gets posix_mdata_t from inode context. If it fails
+ * to get it from inode context, gets it from disk. This is with inode lock.
+ */
+int
+posix_get_mdata_xattr(xlator_t *this, const char *real_path, int _fd,
+                      inode_t *inode, struct iatt *stbuf)
+{
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO(this->name, inode, out);
+
+    LOCK(&inode->lock);
+    {
+        ret = __posix_get_mdata_xattr(this, real_path, _fd, inode, stbuf);
+    }
+    UNLOCK(&inode->lock);
+
+out:
+    return ret;
+}
+
+/* Three-way comparison of two timestamps: returns a positive value when
+ * *first is later than *second, a negative value when it is earlier and 0
+ * when both are equal.
+ *
+ * The fields are compared directly instead of being subtracted: the
+ * difference of two time_t values does not fit in the int return type, so
+ * a subtraction could come back with the wrong sign (or 0) for timestamps
+ * that are far apart.
+ */
+static int
+posix_compare_timespec(struct timespec *first, struct timespec *second)
+{
+    if (first->tv_sec != second->tv_sec)
+        return (first->tv_sec > second->tv_sec) ? 1 : -1;
+
+    if (first->tv_nsec != second->tv_nsec)
+        return (first->tv_nsec > second->tv_nsec) ? 1 : -1;
+
+    return 0;
+}
+
+/* Create or heal the mdata xattr of a file from the timestamps in
+ * mdata_iatt. When mdata already exists (in the inode context or on disk)
+ * only timestamps later than the stored ones are taken over; otherwise the
+ * mdata is initialised from mdata_iatt. The result is written to disk.
+ * Returns 0 on success; on failure returns non-zero and sets *op_errno.
+ */
+int
+posix_set_mdata_xattr_legacy_files(xlator_t *this, inode_t *inode,
+                                   const char *realpath,
+                                   struct mdata_iatt *mdata_iatt, int *op_errno)
+{
+    uint64_t ctx;
+    posix_mdata_t *mdata = NULL;
+    posix_mdata_t imdata = {
+        0,
+    };
+    int ret = 0;
+    gf_boolean_t mdata_already_set = _gf_false;
+
+    GF_VALIDATE_OR_GOTO("posix", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, inode, out);
+
+    LOCK(&inode->lock);
+    {
+        ret = __inode_ctx_get1(inode, this, &ctx);
+        if (ret == 0 && ctx) {
+            mdata = (posix_mdata_t *)(uintptr_t)ctx;
+            mdata_already_set = _gf_true;
+        } else {
+            mdata = GF_CALLOC(1, sizeof(posix_mdata_t), gf_posix_mt_mdata_attr);
+            if (!mdata) {
+                gf_msg(this->name, GF_LOG_ERROR, ENOMEM, P_MSG_NOMEM,
+                       "Could not allocate mdata. gfid: %s",
+                       uuid_utoa(inode->gfid));
+                ret = -1;
+                *op_errno = ENOMEM;
+                goto unlock;
+            }
+
+            ret = posix_fetch_mdata_xattr(this, realpath, -1, inode,
+                                          (void *)mdata, op_errno);
+            if (ret == 0) {
+                /* Got mdata from disk. This is a race, another client
+                 * has healed the xattr during lookup. So set it in inode
+                 * ctx */
+                ctx = (uint64_t)(uintptr_t)mdata;
+                __inode_ctx_set1(inode, this, &ctx);
+                mdata_already_set = _gf_true;
+            } else {
+                *op_errno = 0;
+                mdata->version = 1;
+                mdata->flags = 0;
+                mdata->ctime.tv_sec = mdata_iatt->ia_ctime;
+                mdata->ctime.tv_nsec = mdata_iatt->ia_ctime_nsec;
+                mdata->atime.tv_sec = mdata_iatt->ia_atime;
+                mdata->atime.tv_nsec = mdata_iatt->ia_atime_nsec;
+                mdata->mtime.tv_sec = mdata_iatt->ia_mtime;
+                mdata->mtime.tv_nsec = mdata_iatt->ia_mtime_nsec;
+
+                ctx = (uint64_t)(uintptr_t)mdata;
+                __inode_ctx_set1(inode, this, &ctx);
+            }
+        }
+
+        if (mdata_already_set) {
+            /* Compare and update the larger time */
+            imdata.ctime.tv_sec = mdata_iatt->ia_ctime;
+            imdata.ctime.tv_nsec = mdata_iatt->ia_ctime_nsec;
+            imdata.atime.tv_sec = mdata_iatt->ia_atime;
+            imdata.atime.tv_nsec = mdata_iatt->ia_atime_nsec;
+            imdata.mtime.tv_sec = mdata_iatt->ia_mtime;
+            imdata.mtime.tv_nsec = mdata_iatt->ia_mtime_nsec;
+
+            if (posix_compare_timespec(&imdata.ctime, &mdata->ctime) > 0) {
+                mdata->ctime = imdata.ctime;
+            }
+            if (posix_compare_timespec(&imdata.mtime, &mdata->mtime) > 0) {
+                mdata->mtime = imdata.mtime;
+            }
+            if (posix_compare_timespec(&imdata.atime, &mdata->atime) > 0) {
+                mdata->atime = imdata.atime;
+            }
+        }
+
+        ret = posix_store_mdata_xattr(this, realpath, -1, inode, mdata);
+        if (ret) {
+            /* capture errno before logging can overwrite it */
+            *op_errno = errno;
+            gf_msg(this->name, GF_LOG_ERROR, *op_errno, P_MSG_STOREMDATA_FAILED,
+                   "gfid: %s key:%s ", uuid_utoa(inode->gfid),
+                   GF_XATTR_MDATA_KEY);
+            goto unlock;
+        }
+    }
+unlock:
+    UNLOCK(&inode->lock);
+out:
+    return ret;
+}
+
+/* posix_set_mdata_xattr updates the posix_mdata_t based on the flag
+ * in inode context and stores it on disk
+ */
+static int +posix_set_mdata_xattr(xlator_t *this, const char *real_path, int fd, + inode_t *inode, struct timespec *time, + struct timespec *u_atime, struct timespec *u_mtime, + struct iatt *stbuf, posix_mdata_flag_t *flag, + gf_boolean_t update_utime) +{ + uint64_t ctx; + posix_mdata_t *mdata = NULL; + int ret = -1; + int op_errno = 0; + + GF_VALIDATE_OR_GOTO("posix", this, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); + GF_VALIDATE_OR_GOTO(this->name, time, out); + + if (update_utime && (flag->atime && !u_atime) && + (flag->mtime && !u_mtime)) { + goto out; + } + + LOCK(&inode->lock); + { + ret = __inode_ctx_get1(inode, this, &ctx); + if (ret == 0) { + mdata = (posix_mdata_t *)(uintptr_t)ctx; + } + if (ret == -1 || !mdata) { + /* + * Do we need to fetch the data from xattr + * If we does we can compare the value and store + * the largest data in inode ctx. + */ + mdata = GF_CALLOC(1, sizeof(posix_mdata_t), gf_posix_mt_mdata_attr); + if (!mdata) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, P_MSG_NOMEM, + "Could not allocate mdata. file: %s: gfid: %s", + real_path ? real_path : "null", uuid_utoa(inode->gfid)); + ret = -1; + goto unlock; + } + + ret = posix_fetch_mdata_xattr(this, real_path, fd, inode, + (void *)mdata, &op_errno); + if (ret == 0) { + /* Got mdata from disk, set it in inode ctx. This case + * is hit when in-memory status is lost due to brick + * down scenario + */ + ctx = (uint64_t)(uintptr_t)mdata; + __inode_ctx_set1(inode, this, &ctx); + } else { + /* + * This is the first time creating the time attr. This happens + * when you activate this feature. On this code path, only new + * files will create mdata xattr. The legacy files (files + * created before ctime enabled) will not have any xattr set. + * The xattr on legacy file will be set via lookup. + */ + + /* Don't create xattr with utimes/utimensat, only update if + * present. This otherwise causes issues during inservice + * upgrade. 
It causes inconsistent xattr values with in replica + * set. The scenario happens during upgrade where clients are + * older versions (without the ctime feature) and the server is + * upgraded to the new version (with the ctime feature which + * is enabled by default). + */ + + if (update_utime) { + UNLOCK(&inode->lock); + GF_FREE(mdata); + return 0; + } + + mdata->version = 1; + mdata->flags = 0; + mdata->ctime.tv_sec = time->tv_sec; + mdata->ctime.tv_nsec = time->tv_nsec; + mdata->atime.tv_sec = time->tv_sec; + mdata->atime.tv_nsec = time->tv_nsec; + mdata->mtime.tv_sec = time->tv_sec; + mdata->mtime.tv_nsec = time->tv_nsec; + + ctx = (uint64_t)(uintptr_t)mdata; + __inode_ctx_set1(inode, this, &ctx); + } + } + + /* In distributed systems, there could be races with fops + * updating mtime/atime which could result in different + * mtime/atime for same file. So this makes sure, only the + * highest time is retained. If the mtime/atime update comes + * from the explicit utime syscall, it is allowed to set to + * previous or future time but the ctime is always set to + * current time. + */ + if (update_utime) { + if (flag->ctime && + posix_compare_timespec(time, &mdata->ctime) > 0) { + mdata->ctime = *time; + } + if (flag->mtime) { + mdata->mtime = *u_mtime; + } + if (flag->atime) { + mdata->atime = *u_atime; + } + } else { + if (flag->ctime && + posix_compare_timespec(time, &mdata->ctime) > 0) { + mdata->ctime = *time; + } + if (flag->mtime && + posix_compare_timespec(time, &mdata->mtime) > 0) { + mdata->mtime = *time; + } + if (flag->atime && + posix_compare_timespec(time, &mdata->atime) > 0) { + mdata->atime = *time; + } + } + + if (inode->ia_type == IA_INVAL) { + /* + * TODO: This is non-linked inode. So we have to sync the + * data into backend. Because inode_link may return + * a different inode. 
+ */ + /* ret = posix_store_mdata_xattr (this, loc, fd, + * mdata); */ + } + /* + * With this patch set, we are setting the xattr for each update + * We should evaluate the performance, and based on that we can + * decide on asynchronous updation. + */ + ret = posix_store_mdata_xattr(this, real_path, fd, inode, mdata); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_STOREMDATA_FAILED, + "file: %s: gfid: %s key:%s ", real_path ? real_path : "null", + uuid_utoa(inode->gfid), GF_XATTR_MDATA_KEY); + goto unlock; + } + } +unlock: + UNLOCK(&inode->lock); +out: + if (ret == 0 && stbuf) { + stbuf->ia_ctime = mdata->ctime.tv_sec; + stbuf->ia_ctime_nsec = mdata->ctime.tv_nsec; + stbuf->ia_mtime = mdata->mtime.tv_sec; + stbuf->ia_mtime_nsec = mdata->mtime.tv_nsec; + stbuf->ia_atime = mdata->atime.tv_sec; + stbuf->ia_atime_nsec = mdata->atime.tv_nsec; + } + + return ret; +} + +/* posix_update_utime_in_mdata updates the posix_mdata_t when mtime/atime + * is modified using syscall + */ +void +posix_update_utime_in_mdata(xlator_t *this, const char *real_path, int fd, + inode_t *inode, struct timespec *ctime, + struct iatt *stbuf, int valid) +{ + int32_t ret = 0; +#if defined(HAVE_UTIMENSAT) + struct timespec tv_atime = { + 0, + }; + struct timespec tv_mtime = { + 0, + }; +#else + struct timeval tv_atime = { + 0, + }; + struct timeval tv_mtime = { + 0, + }; +#endif + posix_mdata_flag_t flag = { + 0, + }; + + struct posix_private *priv = NULL; + + priv = this->private; + + /* NOTE: + * This routine (utimes) is intentionally allowed for all internal and + * external clients even if ctime is not set. 
This is because AFR and + * WORM uses time attributes for it's internal operations + */ + if (inode && priv->ctime) { + if ((valid & GF_SET_ATTR_ATIME) == GF_SET_ATTR_ATIME) { + tv_atime.tv_sec = stbuf->ia_atime; + SET_TIMESPEC_NSEC_OR_TIMEVAL_USEC(tv_atime, stbuf->ia_atime_nsec); + + flag.ctime = 1; + flag.atime = 1; + } + + if ((valid & GF_SET_ATTR_MTIME) == GF_SET_ATTR_MTIME) { + tv_mtime.tv_sec = stbuf->ia_mtime; + SET_TIMESPEC_NSEC_OR_TIMEVAL_USEC(tv_mtime, stbuf->ia_mtime_nsec); + + flag.ctime = 1; + flag.mtime = 1; + } + + if (flag.mtime || flag.atime) { + ret = posix_set_mdata_xattr(this, real_path, -1, inode, ctime, + &tv_atime, &tv_mtime, NULL, &flag, + _gf_true); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_SETMDATA_FAILED, + "posix set mdata atime failed on file:" + " %s gfid:%s", + real_path, uuid_utoa(inode->gfid)); + } + } + } + return; +} + +/* posix_update_ctime_in_mdata updates the posix_mdata_t when ctime needs + * to be modified + */ +void +posix_update_ctime_in_mdata(xlator_t *this, const char *real_path, int fd, + inode_t *inode, struct timespec *ctime, + struct iatt *stbuf, int valid) +{ + int32_t ret = 0; +#if defined(HAVE_UTIMENSAT) + struct timespec tv_ctime = { + 0, + }; +#else + struct timeval tv_ctime = { + 0, + }; +#endif + posix_mdata_flag_t flag = { + 0, + }; + + struct posix_private *priv = NULL; + priv = this->private; + + if (inode && priv->ctime) { + tv_ctime.tv_sec = stbuf->ia_ctime; + SET_TIMESPEC_NSEC_OR_TIMEVAL_USEC(tv_ctime, stbuf->ia_ctime_nsec); + flag.ctime = 1; + + ret = posix_set_mdata_xattr(this, real_path, -1, inode, &tv_ctime, NULL, + NULL, NULL, &flag, _gf_true); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_SETMDATA_FAILED, + "posix set mdata atime failed on file:" + " %s gfid:%s", + real_path, uuid_utoa(inode->gfid)); + } + } + return; +} + +static void +posix_get_mdata_flag(uint64_t flags, posix_mdata_flag_t *flag) +{ + if (!flag) + return; + + flag->ctime = 0; + flag->atime = 
0; + flag->mtime = 0; + + if (flags & MDATA_CTIME) + flag->ctime = 1; + if (flags & MDATA_MTIME) + flag->mtime = 1; + if (flags & MDATA_ATIME) + flag->atime = 1; +} + +static void +posix_get_parent_mdata_flag(uint64_t flags, posix_mdata_flag_t *flag) +{ + if (!flag) + return; + + flag->ctime = 0; + flag->atime = 0; + flag->mtime = 0; + + if (flags & MDATA_PAR_CTIME) + flag->ctime = 1; + if (flags & MDATA_PAR_MTIME) + flag->mtime = 1; + if (flags & MDATA_PAR_ATIME) + flag->atime = 1; +} + +void +posix_set_ctime(call_frame_t *frame, xlator_t *this, const char *real_path, + int fd, inode_t *inode, struct iatt *stbuf) +{ + posix_mdata_flag_t flag = { + 0, + }; + int ret = 0; + struct posix_private *priv = NULL; + + priv = this->private; + + if (priv->ctime) { + (void)posix_get_mdata_flag(frame->root->flags, &flag); + if ((flag.ctime == 0) && (flag.mtime == 0) && (flag.atime == 0)) { + goto out; + } + ret = posix_set_mdata_xattr(this, real_path, fd, inode, + &frame->root->ctime, NULL, NULL, stbuf, + &flag, _gf_false); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_SETMDATA_FAILED, + "posix set mdata failed on file: %s gfid:%s", real_path, + inode ? 
uuid_utoa(inode->gfid) : "No inode"); + } + } +out: + return; +} + +void +posix_set_parent_ctime(call_frame_t *frame, xlator_t *this, + const char *real_path, int fd, inode_t *inode, + struct iatt *stbuf) +{ + posix_mdata_flag_t flag = { + 0, + }; + int ret = 0; + struct posix_private *priv = NULL; + + priv = this->private; + + if (inode && priv->ctime) { + (void)posix_get_parent_mdata_flag(frame->root->flags, &flag); + if ((flag.ctime == 0) && (flag.mtime == 0) && (flag.atime == 0)) { + goto out; + } + ret = posix_set_mdata_xattr(this, real_path, fd, inode, + &frame->root->ctime, NULL, NULL, stbuf, + &flag, _gf_false); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_SETMDATA_FAILED, + "posix set mdata failed on file: %s gfid:%s", real_path, + uuid_utoa(inode->gfid)); + } + } +out: + return; +} + +void +posix_set_ctime_cfr(call_frame_t *frame, xlator_t *this, + const char *real_path_in, int fd_in, inode_t *inode_in, + struct iatt *stbuf_in, const char *real_path_out, + int fd_out, inode_t *inode_out, struct iatt *stbuf_out) +{ + posix_mdata_flag_t flag = { + 0, + }; + posix_mdata_flag_t flag_dup = { + 0, + }; + int ret = 0; + struct posix_private *priv = NULL; + char in_uuid_str[64] = {0}, out_uuid_str[64] = {0}; + + priv = this->private; + + if (priv->ctime) { + (void)posix_get_mdata_flag(frame->root->flags, &flag); + if ((flag.ctime == 0) && (flag.mtime == 0) && (flag.atime == 0)) { + goto out; + } + + if (frame->root->ctime.tv_sec == 0) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_SETMDATA_FAILED, + "posix set mdata failed, No ctime : in: %s gfid_in:%s " + "out: %s gfid_out:%s", + real_path_in, + (inode_in ? uuid_utoa_r(inode_in->gfid, in_uuid_str) + : "No inode"), + real_path_out, + (inode_out ? uuid_utoa_r(inode_out->gfid, out_uuid_str) + : "No inode")); + goto out; + } + + flag_dup = flag; + + /* + * For the destination file, no need to update atime. + * It got modified. 
Hence the things that need to be + * changed are mtime and ctime (provided the utime + * xlator from the client has set those flags, which + * are just copied to flag_dup). + */ + if (flag.atime) + flag_dup.atime = 0; + + ret = posix_set_mdata_xattr(this, real_path_out, fd_out, inode_out, + &frame->root->ctime, NULL, NULL, stbuf_out, + &flag_dup, _gf_false); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_SETMDATA_FAILED, + "posix set mdata failed on file: %s gfid:%s", real_path_out, + inode_out ? uuid_utoa(inode_out->gfid) : "No inode"); + } + + /* + * For the source file, no need to change the mtime and ctime. + * For source file, it is only read operation. So, if at all + * anything needs to be updated, it is only the atime. + */ + if (flag.atime) + flag_dup.atime = flag.atime; + flag_dup.mtime = 0; + flag_dup.ctime = 0; + + ret = posix_set_mdata_xattr(this, real_path_in, fd_out, inode_out, + &frame->root->ctime, NULL, NULL, stbuf_out, + &flag_dup, _gf_false); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_SETMDATA_FAILED, + "posix set mdata failed on file: %s gfid:%s", real_path_in, + inode_in ? uuid_utoa(inode_in->gfid) : "No inode"); + } + } +out: + return; +} diff --git a/xlators/storage/posix/src/posix-metadata.h b/xlators/storage/posix/src/posix-metadata.h new file mode 100644 index 00000000000..d37014af93e --- /dev/null +++ b/xlators/storage/posix/src/posix-metadata.h @@ -0,0 +1,71 @@ +/* + Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#ifndef _POSIX_METADATA_H +#define _POSIX_METADATA_H + +#include "posix-metadata-disk.h" + +/* In memory representation posix metadata xattr */ +typedef struct { + /* flags indicates valid fields in the structure */ + uint64_t flags; + struct timespec ctime; + struct timespec mtime; + struct timespec atime; + /* version of structure, bumped up if any new member is added */ + uint8_t version; + + char _pad[7]; /* manual padding */ +} posix_mdata_t; + +typedef struct { + unsigned short ctime : 1; + unsigned short mtime : 1; + unsigned short atime : 1; +} posix_mdata_flag_t; + +/* With inode lock*/ +int +posix_get_mdata_xattr(xlator_t *this, const char *real_path, int _fd, + inode_t *inode, struct iatt *stbuf); +/* With out inode lock*/ +int +__posix_get_mdata_xattr(xlator_t *this, const char *real_path, int _fd, + inode_t *inode, struct iatt *stbuf); +void +posix_update_utime_in_mdata(xlator_t *this, const char *real_path, int fd, + inode_t *inode, struct timespec *ctime, + struct iatt *stbuf, int valid); +void +posix_update_ctime_in_mdata(xlator_t *this, const char *real_path, int fd, + inode_t *inode, struct timespec *ctime, + struct iatt *stbuf, int valid); +void +posix_set_ctime(call_frame_t *frame, xlator_t *this, const char *real_path, + int fd, inode_t *inode, struct iatt *stbuf); +void +posix_set_parent_ctime(call_frame_t *frame, xlator_t *this, + const char *real_path, int fd, inode_t *inode, + struct iatt *stbuf); +void +posix_set_ctime_cfr(call_frame_t *frame, xlator_t *this, + const char *real_path_in, int fd_in, inode_t *inode_in, + struct iatt *stbuf_in, const char *read_path_put, + int fd_out, inode_t *inode_out, struct iatt *stbuf_out); +int +posix_set_mdata_xattr_legacy_files(xlator_t *this, inode_t *inode, + const char *realpath, + struct mdata_iatt *mdata_iatt, + int *op_errno); +void +posix_mdata_iatt_from_disk(struct mdata_iatt *out, posix_mdata_disk_t *in); + +#endif /* _POSIX_METADATA_H */ diff --git a/xlators/storage/posix/src/posix.c 
b/xlators/storage/posix/src/posix.c index 0b7ab190c33..42b965434b9 100644 --- a/xlators/storage/posix/src/posix.c +++ b/xlators/storage/posix/src/posix.c @@ -1,5198 +1,101 @@ /* - Copyright (c) 2006-2009 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. + Copyright (c) 2006-2017 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - #define __XOPEN_SOURCE 500 -#include <stdint.h> -#include <sys/time.h> -#include <sys/resource.h> -#include <errno.h> -#include <libgen.h> -#include <pthread.h> -#include <ftw.h> - -#ifndef GF_BSD_HOST_OS -#include <alloca.h> -#endif /* GF_BSD_HOST_OS */ - -#include "glusterfs.h" -#include "md5.h" -#include "checksum.h" -#include "dict.h" -#include "logging.h" -#include "posix.h" -#include "xlator.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" -#include "syscall.h" -#include "statedump.h" -#include "locking.h" -#include "timer.h" - -#undef HAVE_SET_FSID -#ifdef HAVE_SET_FSID - -#define DECLARE_OLD_FS_ID_VAR uid_t old_fsuid; gid_t old_fsgid; - -#define SET_FS_ID(uid, gid) do { \ - old_fsuid = setfsuid (uid); \ - old_fsgid = setfsgid (gid); \ - } while (0) - -#define SET_TO_OLD_FS_ID() do { \ - setfsuid (old_fsuid); \ - setfsgid (old_fsgid); \ - } while (0) - -#else - -#define DECLARE_OLD_FS_ID_VAR -#define SET_FS_ID(uid, gid) -#define SET_TO_OLD_FS_ID() - -#endif - -typedef struct { - xlator_t *this; - const char *real_path; - dict_t *xattr; - struct stat *stbuf; - loc_t *loc; -} posix_xattr_filler_t; - -int -posix_forget (xlator_t *this, inode_t *inode) -{ - uint64_t tmp_cache = 0; - if (!inode_ctx_del (inode, this, &tmp_cache)) - dict_destroy ((dict_t *)(long)tmp_cache); - - return 0; -} - -static void -_posix_xattr_get_set (dict_t *xattr_req, - char *key, - data_t *data, - void *xattrargs) -{ - posix_xattr_filler_t *filler = xattrargs; - char *value = NULL; - ssize_t xattr_size = -1; - int ret = -1; - char *databuf = NULL; - int _fd = -1; - loc_t *loc = NULL; - ssize_t req_size = 0; - - - /* should size be put into the data_t ? 
*/ - if (!strcmp (key, "glusterfs.content") - && S_ISREG (filler->stbuf->st_mode)) { - - /* file content request */ - req_size = data_to_uint64 (data); - if (req_size >= filler->stbuf->st_size) { - _fd = open (filler->real_path, O_RDONLY); - - if (_fd == -1) { - gf_log (filler->this->name, GF_LOG_ERROR, - "Opening file %s failed: %s", - filler->real_path, strerror (errno)); - goto err; - } - - databuf = calloc (1, filler->stbuf->st_size); - - if (!databuf) { - gf_log (filler->this->name, GF_LOG_ERROR, - "Out of memory."); - goto err; - } - - ret = read (_fd, databuf, filler->stbuf->st_size); - if (ret == -1) { - gf_log (filler->this->name, GF_LOG_ERROR, - "Read on file %s failed: %s", - filler->real_path, strerror (errno)); - goto err; - } - - ret = close (_fd); - _fd = -1; - if (ret == -1) { - gf_log (filler->this->name, GF_LOG_ERROR, - "Close on file %s failed: %s", - filler->real_path, strerror (errno)); - goto err; - } - - ret = dict_set_bin (filler->xattr, key, - databuf, filler->stbuf->st_size); - if (ret < 0) { - goto err; - } - - /* To avoid double free in cleanup below */ - databuf = NULL; - err: - if (_fd != -1) - close (_fd); - if (databuf) - FREE (databuf); - } - } else if (!strcmp (key, GLUSTERFS_OPEN_FD_COUNT)) { - loc = filler->loc; - if (!list_empty (&loc->inode->fd_list)) { - ret = dict_set_uint32 (filler->xattr, key, 1); - } else { - ret = dict_set_uint32 (filler->xattr, key, 0); - } - } else { - xattr_size = sys_lgetxattr (filler->real_path, key, NULL, 0); - - if (xattr_size > 0) { - value = calloc (1, xattr_size + 1); - - sys_lgetxattr (filler->real_path, key, value, - xattr_size); - - value[xattr_size] = '\0'; - ret = dict_set_bin (filler->xattr, key, - value, xattr_size); - if (ret < 0) - gf_log (filler->this->name, GF_LOG_DEBUG, - "dict set failed. 
path: %s, key: %s", - filler->real_path, key); - } - } -} - - -static int -posix_scale_st_ino (struct posix_private *priv, struct stat *buf) -{ - int i = 0; - int ret = -1; - ino_t temp_ino = 0; - int r; - struct stat export_buf; - - for (i = 0; i < priv->num_devices_to_span; i++) { - if (buf->st_dev == priv->st_device[i]) { - break; - } - if (priv->st_device[i] == 0) { - priv->st_device[i] = buf->st_dev; - break; - } - } - - if (i == priv->num_devices_to_span) { - r = lstat (priv->base_path, &export_buf); - if ((r != 0) || (buf->st_dev != export_buf.st_dev)) { - goto out; - } - - gf_log (THIS->name, GF_LOG_WARNING, - "device number for exported volume %s has changed " - "since init --- assuming done by automount", - priv->base_path); - - priv->st_device[0] = export_buf.st_dev; - } - - temp_ino = (buf->st_ino * priv->num_devices_to_span) + i; - - buf->st_ino = temp_ino; - - ret = 0; -out: - return ret; -} - - -int -posix_lstat_with_gen (xlator_t *this, const char *path, struct stat *stbuf_p) -{ - struct posix_private *priv = NULL; - int ret = 0; - char gen_key[1024] = {0, }; - uint64_t gen_val_be = 0; - uint64_t gen_val = 0; - struct stat stbuf = {0, }; - - priv = this->private; - - ret = lstat (path, &stbuf); - if (ret == -1) - return -1; - - ret = posix_scale_st_ino (priv, &stbuf); - if ((ret == -1) && !strcmp (path, "..")) { - /* stat on ../ might land us outside the export directory, - so don't panic */ - - gf_log (this->name, GF_LOG_WARNING, - "Access to %s (on dev %lld) is crossing device (%lld)", - path, (unsigned long long) stbuf.st_dev, - (unsigned long long) priv->st_device[0]); - errno = EXDEV; - return -1; - } - -#ifndef GF_LINUX_HOST_OS - if (!S_ISDIR (stbuf.st_mode) && !S_ISREG (stbuf.st_mode)) { - stbuf.st_dev = (typeof(stbuf.st_dev))stbuf.st_mtime; - if (stbuf_p) - *stbuf_p = stbuf; - return 0; - } -#endif /* !GF_LINUX_HOST_OS */ - - ret = snprintf (gen_key, 1024, "trusted.%s.gen", this->name); - - if (ret == 1024) - return -1; - - ret = 
sys_lgetxattr (path, gen_key, (void *) &gen_val_be, - sizeof (gen_val_be)); - if (ret == -1) { - LOCK (&priv->gen_lock); - { - gen_val = ++priv->gen_seq; - } - UNLOCK (&priv->gen_lock); - - gen_val_be = hton64 (gen_val); - - ret = sys_lsetxattr (path, gen_key, &gen_val_be, - sizeof (gen_val_be), 0); - } else { - gen_val = ntoh64 (gen_val_be); - } - - if (ret >= 0) { - ret = 0; - stbuf.st_dev = (typeof(stbuf.st_dev))gen_val; - if (stbuf_p) - *stbuf_p = stbuf; - } - - return ret; -} - - -int -posix_fstat_with_gen (xlator_t *this, int fd, struct stat *stbuf_p) -{ - struct posix_private *priv = NULL; - int ret = 0; - char gen_key[1024] = {0, }; - uint64_t gen_val_be = 0; - uint64_t gen_val = 0; - struct stat stbuf = {0, }; - - priv = this->private; - - ret = fstat (fd, &stbuf); - if (ret == -1) - return -1; - - ret = posix_scale_st_ino (priv, &stbuf); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "Access to fd %d (on dev %lld) is crossing device (%lld)", - fd, (unsigned long long) stbuf.st_dev, - (unsigned long long) priv->st_device[0]); - errno = EXDEV; - return -1; - } - -#ifndef GF_LINUX_HOST_OS - if (!S_ISDIR (stbuf.st_mode) && !S_ISREG (stbuf.st_mode)) { - stbuf.st_dev = (typeof(stbuf.st_dev))stbuf.st_mtime; - return 0; - } -#endif /* !GF_LINUX_HOST_OS */ - - ret = snprintf (gen_key, 1024, "trusted.%s.gen", this->name); - - if (ret == 1024) - return -1; - - ret = sys_fgetxattr (fd, gen_key, (void *) &gen_val_be, - sizeof (gen_val_be)); - if (ret == -1) { - LOCK (&priv->gen_lock); - { - gen_val = ++priv->gen_seq; - } - UNLOCK (&priv->gen_lock); - - gen_val_be = hton64 (gen_val); - - ret = sys_fsetxattr (fd, gen_key, &gen_val_be, - sizeof (gen_val_be), 0); - } else { - gen_val = ntoh64 (gen_val_be); - } - - if (ret >= 0) { - ret = 0; - stbuf.st_dev = (typeof(stbuf.st_dev))gen_val; - if (stbuf_p) - *stbuf_p = stbuf; - } - - return ret; -} - - -dict_t * -posix_lookup_xattr_fill (xlator_t *this, const char *real_path, loc_t *loc, - dict_t *xattr_req, 
struct stat *buf) -{ - dict_t *xattr = NULL; - posix_xattr_filler_t filler = {0, }; - - xattr = get_new_dict(); - if (!xattr) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - filler.this = this; - filler.real_path = real_path; - filler.xattr = xattr; - filler.stbuf = buf; - filler.loc = loc; - - dict_foreach (xattr_req, _posix_xattr_get_set, &filler); -out: - return xattr; -} - - -/* - * If the parent directory of {real_path} has the setgid bit set, - * then set {gid} to the gid of the parent. Otherwise, - * leave {gid} unchanged. - */ - -int -setgid_override (xlator_t *this, char *real_path, gid_t *gid) -{ - char * tmp_path = NULL; - char * parent_path = NULL; - struct stat parent_stbuf; - - int op_ret = 0; - - tmp_path = strdup (real_path); - if (!tmp_path) { - op_ret = -ENOMEM; - gf_log ("[storage/posix]", GF_LOG_ERROR, - "Out of memory"); - goto out; - } - - parent_path = dirname (tmp_path); - - op_ret = posix_lstat_with_gen (this, parent_path, &parent_stbuf); - - if (op_ret == -1) { - op_ret = -errno; - gf_log ("[storage/posix]", GF_LOG_ERROR, - "lstat on parent directory (%s) failed: %s", - parent_path, strerror (errno)); - goto out; - } - - if (parent_stbuf.st_mode & S_ISGID) { - /* - Entries created inside a setgid directory - should inherit the gid from the parent - */ - - *gid = parent_stbuf.st_gid; - } -out: - - if (tmp_path) - FREE (tmp_path); - - return op_ret; -} - - -int32_t -posix_lookup (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xattr_req) -{ - struct stat buf = {0, }; - char * real_path = NULL; - int32_t op_ret = -1; - int32_t entry_ret = 0; - int32_t op_errno = 0; - dict_t * xattr = NULL; - char * pathdup = NULL; - char * parentpath = NULL; - struct stat postparent = {0,}; - struct posix_private *priv = NULL; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (loc, out); - VALIDATE_OR_GOTO (loc->path, out); - - MAKE_REAL_PATH (real_path, this, loc->path); - - priv = 
this->private; - - op_ret = posix_lstat_with_gen (this, real_path, &buf); - op_errno = errno; - - if (op_ret == -1) { - if (op_errno != ENOENT) { - gf_log (this->name, GF_LOG_ERROR, - "lstat on %s failed: %s", - loc->path, strerror (op_errno)); - } - - entry_ret = -1; - goto parent; - } - - if (xattr_req && (op_ret == 0)) { - xattr = posix_lookup_xattr_fill (this, real_path, loc, - xattr_req, &buf); - } - -parent: - if (loc->parent) { - pathdup = strdup (real_path); - GF_VALIDATE_OR_GOTO (this->name, pathdup, out); - - parentpath = dirname (pathdup); - - op_ret = posix_lstat_with_gen (this, parentpath, &postparent); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "post-operation lstat on parent of %s failed: %s", - loc->path, strerror (op_errno)); - goto out; - } - } - - op_ret = entry_ret; -out: - if (pathdup) - FREE (pathdup); - - if (xattr) - dict_ref (xattr); - - STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, - loc->inode, &buf, xattr, &postparent); - - if (xattr) - dict_unref (xattr); - - return 0; -} - - -int32_t -posix_stat (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - struct stat buf = {0,}; - char * real_path = NULL; - int32_t op_ret = -1; - int32_t op_errno = 0; - struct posix_private *priv = NULL; - - DECLARE_OLD_FS_ID_VAR; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (loc, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - SET_FS_ID (frame->root->uid, frame->root->gid); - MAKE_REAL_PATH (real_path, this, loc->path); - - op_ret = posix_lstat_with_gen (this, real_path, &buf); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "lstat on %s failed: %s", loc->path, - strerror (op_errno)); - goto out; - } - - op_ret = 0; - - out: - SET_TO_OLD_FS_ID(); - STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, &buf); - - return 0; -} - -static int -posix_do_chmod (xlator_t *this, - const char *path, - struct stat *stbuf) -{ - 
int32_t ret = -1; - - ret = lchmod (path, stbuf->st_mode); - if ((ret == -1) && (errno == ENOSYS)) { - ret = chmod (path, stbuf->st_mode); - } - - return ret; -} - -static int -posix_do_chown (xlator_t *this, - const char *path, - struct stat *stbuf, - int32_t valid) -{ - int32_t ret = -1; - uid_t uid = -1; - gid_t gid = -1; - - if (valid & GF_SET_ATTR_UID) - uid = stbuf->st_uid; - - if (valid & GF_SET_ATTR_GID) - gid = stbuf->st_gid; - - ret = lchown (path, uid, gid); - - return ret; -} - -static int -posix_do_utimes (xlator_t *this, - const char *path, - struct stat *stbuf) -{ - int32_t ret = -1; - struct timeval tv[2] = {{0,},{0,}}; - - tv[0].tv_sec = stbuf->st_atime; - tv[0].tv_usec = ST_ATIM_NSEC (stbuf) / 1000; - tv[1].tv_sec = stbuf->st_mtime; - tv[1].tv_usec = ST_MTIM_NSEC (stbuf) / 1000; - - ret = lutimes (path, tv); - if ((ret == -1) && (errno == ENOSYS)) { - ret = utimes (path, tv); - } - - return ret; -} - -int -posix_setattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, struct stat *stbuf, int32_t valid) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - char * real_path = 0; - struct stat statpre = {0,}; - struct stat statpost = {0,}; - - DECLARE_OLD_FS_ID_VAR; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (loc, out); - - SET_FS_ID (frame->root->uid, frame->root->gid); - MAKE_REAL_PATH (real_path, this, loc->path); - - op_ret = posix_lstat_with_gen (this, real_path, &statpre); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "setattr (lstat) on %s failed: %s", real_path, - strerror (op_errno)); - goto out; - } - - if (valid & GF_SET_ATTR_MODE) { - op_ret = posix_do_chmod (this, real_path, stbuf); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "setattr (chmod) on %s failed: %s", real_path, - strerror (op_errno)); - goto out; - } - } - - if (valid & (GF_SET_ATTR_UID | GF_SET_ATTR_GID)){ - op_ret = posix_do_chown (this, real_path, stbuf, valid); 
- if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "setattr (chown) on %s failed: %s", real_path, - strerror (op_errno)); - goto out; - } - } - - if (valid & (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME)) { - op_ret = posix_do_utimes (this, real_path, stbuf); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "setattr (utimes) on %s failed: %s", real_path, - strerror (op_errno)); - goto out; - } - } - - if (!valid) { - op_ret = lchown (real_path, -1, -1); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "lchown (%s, -1, -1) failed => (%s)", - real_path, strerror (op_errno)); - - goto out; - } - } - - op_ret = posix_lstat_with_gen (this, real_path, &statpost); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "setattr (lstat) on %s failed: %s", real_path, - strerror (op_errno)); - goto out; - } - - op_ret = 0; - -out: - SET_TO_OLD_FS_ID (); - - STACK_UNWIND_STRICT (setattr, frame, op_ret, op_errno, - &statpre, &statpost); - - return 0; -} - -int32_t -posix_do_fchown (xlator_t *this, - int fd, - struct stat *stbuf, - int32_t valid) -{ - int ret = -1; - uid_t uid = -1; - gid_t gid = -1; - - if (valid & GF_SET_ATTR_UID) - uid = stbuf->st_uid; - - if (valid & GF_SET_ATTR_GID) - gid = stbuf->st_gid; - - ret = fchown (fd, uid, gid); - - return ret; -} - - -int32_t -posix_do_fchmod (xlator_t *this, - int fd, struct stat *stbuf) -{ - return fchmod (fd, stbuf->st_mode); -} - -static int -posix_do_futimes (xlator_t *this, - int fd, - struct stat *stbuf) -{ - errno = ENOSYS; - return -1; -} - -int -posix_fsetattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, struct stat *stbuf, int32_t valid) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - struct stat statpre = {0,}; - struct stat statpost = {0,}; - struct posix_fd *pfd = NULL; - uint64_t tmp_pfd = 0; - int32_t ret = -1; - - DECLARE_OLD_FS_ID_VAR; - - SET_FS_ID (frame->root->uid, frame->root->gid); - - 
VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - - ret = fd_ctx_get (fd, this, &tmp_pfd); - if (ret < 0) { - op_errno = -ret; - gf_log (this->name, GF_LOG_DEBUG, - "pfd is NULL from fd=%p", fd); - goto out; - } - pfd = (struct posix_fd *)(long)tmp_pfd; - - op_ret = posix_fstat_with_gen (this, pfd->fd, &statpre); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "fsetattr (fstat) failed on fd=%p: %s", fd, - strerror (op_errno)); - goto out; - } - - if (valid & GF_SET_ATTR_MODE) { - op_ret = posix_do_fchmod (this, pfd->fd, stbuf); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "fsetattr (fchmod) failed on fd=%p: %s", - fd, strerror (op_errno)); - goto out; - } - } - - if (valid & (GF_SET_ATTR_UID | GF_SET_ATTR_GID)) { - op_ret = posix_do_fchown (this, pfd->fd, stbuf, valid); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "fsetattr (fchown) failed on fd=%p: %s", - fd, strerror (op_errno)); - goto out; - } - - } - - if (valid & (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME)) { - op_ret = posix_do_futimes (this, pfd->fd, stbuf); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "fsetattr (futimes) on failed fd=%p: %s", fd, - strerror (op_errno)); - goto out; - } - } - - if (!valid) { - op_ret = fchown (pfd->fd, -1, -1); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "fchown (%d, -1, -1) failed => (%s)", - pfd->fd, strerror (op_errno)); - - goto out; - } - } - - op_ret = posix_fstat_with_gen (this, pfd->fd, &statpost); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "fsetattr (fstat) failed on fd=%p: %s", fd, - strerror (op_errno)); - goto out; - } - - op_ret = 0; - -out: - SET_TO_OLD_FS_ID (); - - STACK_UNWIND_STRICT (fsetattr, frame, op_ret, op_errno, - &statpre, &statpost); - - return 0; -} - -int32_t -posix_opendir (call_frame_t *frame, 
xlator_t *this, - loc_t *loc, fd_t *fd) -{ - char * real_path = NULL; - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - DIR * dir = NULL; - struct posix_fd * pfd = NULL; - - DECLARE_OLD_FS_ID_VAR; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (loc, out); - VALIDATE_OR_GOTO (loc->path, out); - VALIDATE_OR_GOTO (fd, out); - - SET_FS_ID (frame->root->uid, frame->root->gid); - MAKE_REAL_PATH (real_path, this, loc->path); - - dir = opendir (real_path); - - if (dir == NULL) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "opendir failed on %s: %s", - loc->path, strerror (op_errno)); - goto out; - } - - op_ret = dirfd (dir); - if (op_ret < 0) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "dirfd() failed on %s: %s", - loc->path, strerror (op_errno)); - goto out; - } - - pfd = CALLOC (1, sizeof (*fd)); - if (!pfd) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - pfd->dir = dir; - pfd->fd = dirfd (dir); - pfd->path = strdup (real_path); - if (!pfd->path) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - fd_ctx_set (fd, this, (uint64_t)(long)pfd); - - op_ret = 0; - - out: - if (op_ret == -1) { - if (dir) { - closedir (dir); - dir = NULL; - } - if (pfd) { - if (pfd->path) - FREE (pfd->path); - FREE (pfd); - pfd = NULL; - } - } - - SET_TO_OLD_FS_ID (); - STACK_UNWIND_STRICT (opendir, frame, op_ret, op_errno, fd); - return 0; -} - - -int32_t -posix_getdents (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t off, int32_t flag) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - char *real_path = NULL; - dir_entry_t entries = {0, }; - dir_entry_t *tmp = NULL; - DIR *dir = NULL; - struct dirent *dirent = NULL; - int real_path_len = -1; - int entry_path_len = -1; - char *entry_path = NULL; - int count = 0; - struct posix_fd *pfd = NULL; - uint64_t tmp_pfd = 0; - struct stat buf = {0,}; - int ret = -1; - char 
tmp_real_path[ZR_PATH_MAX]; - char linkpath[ZR_PATH_MAX]; - struct posix_private *priv = NULL; - - DECLARE_OLD_FS_ID_VAR ; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - SET_FS_ID (frame->root->uid, frame->root->gid); - - ret = fd_ctx_get (fd, this, &tmp_pfd); - if (ret < 0) { - op_errno = -ret; - gf_log (this->name, GF_LOG_DEBUG, - "fd %p does not have context in %s", - fd, this->name); - goto out; - } - pfd = (struct posix_fd *)(long)tmp_pfd; - if (!pfd->path) { - op_errno = EBADFD; - gf_log (this->name, GF_LOG_DEBUG, - "pfd does not have path set (possibly file " - "fd, fd=%p)", fd); - goto out; - } - - real_path = pfd->path; - real_path_len = strlen (real_path); - - entry_path_len = real_path_len + NAME_MAX; - entry_path = CALLOC (1, entry_path_len); - - if (!entry_path) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - strncpy (entry_path, real_path, entry_path_len); - entry_path[real_path_len] = '/'; - - dir = pfd->dir; - - if (!dir) { - op_errno = EBADFD; - gf_log (this->name, GF_LOG_DEBUG, - "pfd does not have dir set (possibly file fd, " - "fd=%p, path=`%s'", - fd, real_path); - goto out; - } - - /* TODO: check for all the type of flag, and behave appropriately */ - - while ((dirent = readdir (dir))) { - if (!dirent) - break; - - /* This helps in self-heal, when only directories - needs to be replicated */ - - /* This is to reduce the network traffic, in case only - directory is needed from posix */ - - strncpy (tmp_real_path, real_path, ZR_PATH_MAX); - strncat (tmp_real_path, "/", - ZR_PATH_MAX - strlen (tmp_real_path)); - - strncat (tmp_real_path, dirent->d_name, - ZR_PATH_MAX - (strlen (tmp_real_path) + 1)); - - ret = posix_lstat_with_gen (this, tmp_real_path, &buf); - - if ((flag == GF_GET_DIR_ONLY) - && (ret != -1 && !S_ISDIR(buf.st_mode))) { - continue; - } - - tmp = CALLOC (1, sizeof 
(*tmp)); - - if (!tmp) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - tmp->name = strdup (dirent->d_name); - if (!tmp->name) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - if (entry_path_len < - (real_path_len + 1 + strlen (tmp->name) + 1)) { - entry_path_len = (real_path_len + - strlen (tmp->name) + 1024); - - entry_path = realloc (entry_path, entry_path_len); - } - - strcpy (&entry_path[real_path_len+1], tmp->name); - - tmp->buf = buf; - - if (S_ISLNK(tmp->buf.st_mode)) { - - ret = readlink (entry_path, linkpath, ZR_PATH_MAX); - if (ret != -1) { - linkpath[ret] = '\0'; - tmp->link = strdup (linkpath); - } - } else { - tmp->link = ""; - } - - count++; - - tmp->next = entries.next; - entries.next = tmp; - - /* if size is 0, count can never be = size, so entire - dir is read */ - if (count == size) - break; - } - - FREE (entry_path); - - op_ret = 0; - - out: - SET_TO_OLD_FS_ID (); - - if (op_ret == -1) { - if (entry_path) - FREE (entry_path); - } - - STACK_UNWIND_STRICT (getdents, frame, op_ret, op_errno, - &entries, count); - - if (op_ret == 0) { - while (entries.next) { - tmp = entries.next; - entries.next = entries.next->next; - FREE (tmp->name); - FREE (tmp); - } - } - - return 0; -} - - -int32_t -posix_releasedir (xlator_t *this, - fd_t *fd) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - struct posix_fd * pfd = NULL; - uint64_t tmp_pfd = 0; - int ret = 0; - - struct posix_private *priv = NULL; - - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - - ret = fd_ctx_del (fd, this, &tmp_pfd); - if (ret < 0) { - op_errno = -ret; - gf_log (this->name, GF_LOG_DEBUG, - "pfd from fd=%p is NULL", fd); - goto out; - } - - pfd = (struct posix_fd *)(long)tmp_pfd; - if (!pfd->dir) { - op_errno = EINVAL; - gf_log (this->name, GF_LOG_DEBUG, - "pfd->dir is NULL for fd=%p path=%s", - fd, pfd->path ? 
pfd->path : "<NULL>"); - goto out; - } - - priv = this->private; - - if (!pfd->path) { - op_errno = EBADFD; - gf_log (this->name, GF_LOG_DEBUG, - "pfd->path was NULL. fd=%p pfd=%p", - fd, pfd); - } - - pthread_mutex_lock (&priv->janitor_lock); - { - INIT_LIST_HEAD (&pfd->list); - list_add_tail (&pfd->list, &priv->janitor_fds); - pthread_cond_signal (&priv->janitor_cond); - } - pthread_mutex_unlock (&priv->janitor_lock); - - op_ret = 0; - - out: - return 0; -} - - -int32_t -posix_readlink (call_frame_t *frame, xlator_t *this, - loc_t *loc, size_t size) -{ - char * dest = NULL; - int32_t op_ret = -1; - int32_t lstat_ret = -1; - int32_t op_errno = 0; - char * real_path = NULL; - struct stat stbuf = {0,}; - - DECLARE_OLD_FS_ID_VAR; - - VALIDATE_OR_GOTO (frame, out); - - SET_FS_ID (frame->root->uid, frame->root->gid); - - dest = alloca (size + 1); - - MAKE_REAL_PATH (real_path, this, loc->path); - - op_ret = readlink (real_path, dest, size); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "readlink on %s failed: %s", loc->path, - strerror (op_errno)); - goto out; - } - - dest[op_ret] = 0; - - lstat_ret = posix_lstat_with_gen (this, real_path, &stbuf); - if (lstat_ret == -1) { - op_ret = -1; - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "lstat on %s failed: %s", loc->path, - strerror (op_errno)); - goto out; - } - - out: - SET_TO_OLD_FS_ID (); - - STACK_UNWIND_STRICT (readlink, frame, op_ret, op_errno, dest, &stbuf); - - return 0; -} - -int32_t -posix_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t dev) -{ - int tmp_fd = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - char *real_path = 0; - struct stat stbuf = { 0, }; - char was_present = 1; - struct posix_private *priv = NULL; - gid_t gid = 0; - char *pathdup = NULL; - struct stat preparent = {0,}; - struct stat postparent = {0,}; - char *parentpath = NULL; - - DECLARE_OLD_FS_ID_VAR; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, 
out); - VALIDATE_OR_GOTO (loc, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - MAKE_REAL_PATH (real_path, this, loc->path); - - gid = frame->root->gid; - - op_ret = posix_lstat_with_gen (this, real_path, &stbuf); - if ((op_ret == -1) && (errno == ENOENT)){ - was_present = 0; - } - - op_ret = setgid_override (this, real_path, &gid); - if (op_ret < 0) - goto out; - - SET_FS_ID (frame->root->uid, gid); - pathdup = strdup (real_path); - GF_VALIDATE_OR_GOTO (this->name, pathdup, out); - - parentpath = dirname (pathdup); - - op_ret = posix_lstat_with_gen (this, parentpath, &preparent); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "pre-operation lstat on parent of %s failed: %s", - loc->path, strerror (op_errno)); - goto out; - } - - op_ret = mknod (real_path, mode, dev); - - if (op_ret == -1) { - op_errno = errno; - if ((op_errno == EINVAL) && S_ISREG (mode)) { - /* Over Darwin, mknod with (S_IFREG|mode) - doesn't work */ - tmp_fd = creat (real_path, mode); - if (tmp_fd == -1) - goto out; - close (tmp_fd); - } else { - - gf_log (this->name, GF_LOG_ERROR, - "mknod on %s failed: %s", loc->path, - strerror (op_errno)); - goto out; - } - } - -#ifndef HAVE_SET_FSID - op_ret = lchown (real_path, frame->root->uid, gid); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "lchown on %s failed: %s", loc->path, - strerror (op_errno)); - goto out; - } -#endif - - op_ret = posix_lstat_with_gen (this, real_path, &stbuf); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "mknod on %s failed: %s", loc->path, - strerror (op_errno)); - goto out; - } - - op_ret = posix_lstat_with_gen (this, parentpath, &postparent); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "post-operation lstat on parent of %s failed: %s", - loc->path, strerror (op_errno)); - goto out; - } - - op_ret = 0; - - out: - if (pathdup) - FREE (pathdup); - - SET_TO_OLD_FS_ID (); - 
- STACK_UNWIND_STRICT (mknod, frame, op_ret, op_errno, - loc->inode, &stbuf, &preparent, &postparent); - - if ((op_ret == -1) && (!was_present)) { - unlink (real_path); - } - - return 0; -} - - -static int -janitor_walker (const char *fpath, const struct stat *sb, - int typeflag, struct FTW *ftwbuf) -{ - switch (sb->st_mode & S_IFMT) { - case S_IFREG: - case S_IFBLK: - case S_IFLNK: - case S_IFCHR: - case S_IFIFO: - case S_IFSOCK: - gf_log (THIS->name, GF_LOG_TRACE, - "unlinking %s", fpath); - unlink (fpath); - break; - - case S_IFDIR: - if (ftwbuf->level) { /* don't remove top level dir */ - gf_log (THIS->name, GF_LOG_TRACE, - "removing directory %s", fpath); - - rmdir (fpath); - } - break; - } - - return 0; /* 0 = FTW_CONTINUE */ -} - - -static struct posix_fd * -janitor_get_next_fd (xlator_t *this) -{ - struct posix_private *priv = NULL; - struct posix_fd *pfd = NULL; - - struct timespec timeout; - - priv = this->private; - - pthread_mutex_lock (&priv->janitor_lock); - { - if (list_empty (&priv->janitor_fds)) { - time (&timeout.tv_sec); - timeout.tv_sec += priv->janitor_sleep_duration; - timeout.tv_nsec = 0; - - pthread_cond_timedwait (&priv->janitor_cond, - &priv->janitor_lock, - &timeout); - goto unlock; - } - - pfd = list_entry (priv->janitor_fds.next, struct posix_fd, - list); - - list_del (priv->janitor_fds.next); - } -unlock: - pthread_mutex_unlock (&priv->janitor_lock); - - return pfd; -} - - -static void * -posix_janitor_thread_proc (void *data) -{ - xlator_t * this = NULL; - struct posix_private *priv = NULL; - struct posix_fd *pfd; - - time_t now; - - this = data; - priv = this->private; - - THIS = this; - - while (1) { - time (&now); - if ((now - priv->last_landfill_check) > priv->janitor_sleep_duration) { - gf_log (this->name, GF_LOG_TRACE, - "janitor cleaning out /" GF_REPLICATE_TRASH_DIR); - - nftw (priv->trash_path, - janitor_walker, - 32, - FTW_DEPTH | FTW_PHYS); - - priv->last_landfill_check = now; - } - - pfd = janitor_get_next_fd (this); - if 
(pfd) { - if (pfd->dir == NULL) { - gf_log (this->name, GF_LOG_TRACE, - "janitor: closing file fd=%d", pfd->fd); - close (pfd->fd); - } else { - gf_log (this->name, GF_LOG_TRACE, - "janitor: closing dir fd=%p", pfd->dir); - closedir (pfd->dir); - } - - if (pfd->path) - FREE (pfd->path); - - FREE (pfd); - } - } - - return NULL; -} - - -static void -posix_spawn_janitor_thread (xlator_t *this) -{ - struct posix_private *priv = NULL; - int ret = 0; - - priv = this->private; - - LOCK (&priv->lock); - { - if (!priv->janitor_present) { - ret = pthread_create (&priv->janitor, NULL, - posix_janitor_thread_proc, this); - - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "spawning janitor thread failed: %s", - strerror (errno)); - goto unlock; - } - - priv->janitor_present = _gf_true; - } - } -unlock: - UNLOCK (&priv->lock); -} - - -int32_t -posix_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - char *real_path = NULL; - struct stat stbuf = {0, }; - char was_present = 1; - struct posix_private *priv = NULL; - gid_t gid = 0; - char *pathdup = NULL; - char *parentpath = NULL; - struct stat preparent = {0,}; - struct stat postparent = {0,}; - - DECLARE_OLD_FS_ID_VAR; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (loc, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - MAKE_REAL_PATH (real_path, this, loc->path); - - gid = frame->root->gid; - - op_ret = posix_lstat_with_gen (this, real_path, &stbuf); - if ((op_ret == -1) && (errno == ENOENT)) { - was_present = 0; - } - - op_ret = setgid_override (this, real_path, &gid); - if (op_ret < 0) - goto out; - - SET_FS_ID (frame->root->uid, gid); - pathdup = strdup (real_path); - GF_VALIDATE_OR_GOTO (this->name, pathdup, out); - - parentpath = dirname (pathdup); - - op_ret = posix_lstat_with_gen (this, parentpath, &preparent); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - 
"pre-operation lstat on parent of %s failed: %s", - loc->path, strerror (op_errno)); - goto out; - } - - op_ret = mkdir (real_path, mode); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "mkdir of %s failed: %s", loc->path, - strerror (op_errno)); - goto out; - } - -#ifndef HAVE_SET_FSID - op_ret = chown (real_path, frame->root->uid, gid); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "chown on %s failed: %s", loc->path, - strerror (op_errno)); - goto out; - } -#endif - - op_ret = posix_lstat_with_gen (this, real_path, &stbuf); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "lstat on %s failed: %s", loc->path, - strerror (op_errno)); - goto out; - } - - op_ret = posix_lstat_with_gen (this, parentpath, &postparent); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "post-operation lstat on parent of %s failed: %s", - loc->path, strerror (op_errno)); - goto out; - } - - op_ret = 0; - - out: - if (pathdup) - FREE (pathdup); - - SET_TO_OLD_FS_ID (); - - STACK_UNWIND_STRICT (mkdir, frame, op_ret, op_errno, - loc->inode, &stbuf, &preparent, &postparent); - - if ((op_ret == -1) && (!was_present)) { - unlink (real_path); - } - - return 0; -} - - -int32_t -posix_unlink (call_frame_t *frame, xlator_t *this, - loc_t *loc) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - char *real_path = NULL; - char *pathdup = NULL; - char *parentpath = NULL; - int32_t fd = -1; - struct posix_private *priv = NULL; - struct stat preparent = {0,}; - struct stat postparent = {0,}; - - DECLARE_OLD_FS_ID_VAR; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (loc, out); - - SET_FS_ID (frame->root->uid, frame->root->gid); - MAKE_REAL_PATH (real_path, this, loc->path); - - pathdup = strdup (real_path); - GF_VALIDATE_OR_GOTO (this->name, pathdup, out); - - parentpath = dirname (pathdup); - - op_ret = posix_lstat_with_gen (this, 
parentpath, &preparent); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "pre-operation lstat on parent of %s failed: %s", - loc->path, strerror (op_errno)); - goto out; - } - - priv = this->private; - if (priv->background_unlink) { - if (S_ISREG (loc->inode->st_mode)) { - fd = open (real_path, O_RDONLY); - if (fd == -1) { - op_ret = -1; - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "open of %s failed: %s", loc->path, - strerror (op_errno)); - goto out; - } - } - } - - op_ret = unlink (real_path); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "unlink of %s failed: %s", loc->path, - strerror (op_errno)); - goto out; - } - - op_ret = posix_lstat_with_gen (this, parentpath, &postparent); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "post-operation lstat on parent of %s failed: %s", - loc->path, strerror (op_errno)); - goto out; - } - - op_ret = 0; - - out: - if (pathdup) - FREE (pathdup); - - SET_TO_OLD_FS_ID (); - - STACK_UNWIND_STRICT (unlink, frame, op_ret, op_errno, - &preparent, &postparent); - - if (fd != -1) { - close (fd); - } - - return 0; -} - -int32_t -posix_rmdir (call_frame_t *frame, xlator_t *this, - loc_t *loc) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - char * real_path = NULL; - char * pathdup = NULL; - char * parentpath = NULL; - struct stat preparent = {0,}; - struct stat postparent = {0,}; - - DECLARE_OLD_FS_ID_VAR; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (loc, out); - - SET_FS_ID (frame->root->uid, frame->root->gid); - MAKE_REAL_PATH (real_path, this, loc->path); - - pathdup = strdup (real_path); - GF_VALIDATE_OR_GOTO (this->name, pathdup, out); - - parentpath = dirname (pathdup); - - op_ret = posix_lstat_with_gen (this, parentpath, &preparent); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "pre-operation lstat on parent of %s failed: %s", - 
loc->path, strerror (op_errno)); - goto out; - } - - op_ret = rmdir (real_path); - op_errno = errno; - - if (op_errno == EEXIST) - /* Solaris sets errno = EEXIST instead of ENOTEMPTY */ - op_errno = ENOTEMPTY; - - /* No need to log a common error as ENOTEMPTY */ - if (op_ret == -1 && op_errno != ENOTEMPTY) { - gf_log (this->name, GF_LOG_ERROR, - "rmdir of %s failed: %s", loc->path, - strerror (op_errno)); - } - - if (op_ret == -1) - goto out; - - op_ret = posix_lstat_with_gen (this, parentpath, &postparent); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "post-operation lstat on parent of %s failed: %s", - loc->path, strerror (op_errno)); - goto out; - } - - out: - if (pathdup) - FREE (pathdup); - - SET_TO_OLD_FS_ID (); - - STACK_UNWIND_STRICT (rmdir, frame, op_ret, op_errno, - &preparent, &postparent); - - return 0; -} - -int32_t -posix_symlink (call_frame_t *frame, xlator_t *this, - const char *linkname, loc_t *loc) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - char * real_path = 0; - struct stat stbuf = { 0, }; - struct posix_private *priv = NULL; - gid_t gid = 0; - char was_present = 1; - char *pathdup = NULL; - char *parentpath = NULL; - struct stat preparent = {0,}; - struct stat postparent = {0,}; - - DECLARE_OLD_FS_ID_VAR; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (linkname, out); - VALIDATE_OR_GOTO (loc, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - MAKE_REAL_PATH (real_path, this, loc->path); - - op_ret = posix_lstat_with_gen (this, real_path, &stbuf); - if ((op_ret == -1) && (errno == ENOENT)){ - was_present = 0; - } - - gid = frame->root->gid; - - op_ret = setgid_override (this, real_path, &gid); - if (op_ret < 0) - goto out; - - SET_FS_ID (frame->root->uid, gid); - pathdup = strdup (real_path); - GF_VALIDATE_OR_GOTO (this->name, pathdup, out); - - parentpath = dirname (pathdup); - - op_ret = posix_lstat_with_gen (this, parentpath, &preparent); - if 
(op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "pre-operation lstat on parent of %s failed: %s", - loc->path, strerror (op_errno)); - goto out; - } - - op_ret = symlink (linkname, real_path); - - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "symlink of %s --> %s failed: %s", - loc->path, linkname, strerror (op_errno)); - goto out; - } - -#ifndef HAVE_SET_FSID - op_ret = lchown (real_path, frame->root->uid, gid); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "lchown failed on %s: %s", - loc->path, strerror (op_errno)); - goto out; - } -#endif - op_ret = posix_lstat_with_gen (this, real_path, &stbuf); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "lstat failed on %s: %s", - loc->path, strerror (op_errno)); - goto out; - } - - op_ret = posix_lstat_with_gen (this, parentpath, &postparent); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "post-operation lstat on parent of %s failed: %s", - loc->path, strerror (op_errno)); - goto out; - } - - op_ret = 0; - - out: - if (pathdup) - FREE (pathdup); - - SET_TO_OLD_FS_ID (); - - STACK_UNWIND_STRICT (symlink, frame, op_ret, op_errno, - loc->inode, &stbuf, &preparent, &postparent); - - if ((op_ret == -1) && (!was_present)) { - unlink (real_path); - } - - return 0; -} - - -int -posix_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - char *real_oldpath = NULL; - char *real_newpath = NULL; - struct stat stbuf = {0, }; - struct posix_private *priv = NULL; - char was_present = 1; - char *oldpathdup = NULL; - char *oldparentpath = NULL; - char *newpathdup = NULL; - char *newparentpath = NULL; - struct stat preoldparent = {0, }; - struct stat postoldparent = {0, }; - struct stat prenewparent = {0, }; - struct stat postnewparent = {0, }; - - DECLARE_OLD_FS_ID_VAR; - - VALIDATE_OR_GOTO (frame, out); - 
VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (oldloc, out); - VALIDATE_OR_GOTO (newloc, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - SET_FS_ID (frame->root->uid, frame->root->gid); - MAKE_REAL_PATH (real_oldpath, this, oldloc->path); - MAKE_REAL_PATH (real_newpath, this, newloc->path); - - oldpathdup = strdup (real_oldpath); - GF_VALIDATE_OR_GOTO (this->name, oldpathdup, out); - - oldparentpath = dirname (oldpathdup); - - op_ret = posix_lstat_with_gen (this, oldparentpath, &preoldparent); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "pre-operation lstat on parent of %s failed: %s", - oldloc->path, strerror (op_errno)); - goto out; - } - - newpathdup = strdup (real_newpath); - GF_VALIDATE_OR_GOTO (this->name, newpathdup, out); - - newparentpath = dirname (newpathdup); - - op_ret = posix_lstat_with_gen (this, newparentpath, &prenewparent); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "pre-operation lstat on parent of %s failed: %s", - newloc->path, strerror (op_errno)); - goto out; - } - - op_ret = posix_lstat_with_gen (this, real_newpath, &stbuf); - if ((op_ret == -1) && (errno == ENOENT)){ - was_present = 0; - } - - op_ret = rename (real_oldpath, real_newpath); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, - (op_errno == ENOTEMPTY ? 
GF_LOG_DEBUG : GF_LOG_ERROR), - "rename of %s to %s failed: %s", - oldloc->path, newloc->path, strerror (op_errno)); - goto out; - } - - op_ret = posix_lstat_with_gen (this, real_newpath, &stbuf); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "lstat on %s failed: %s", - real_newpath, strerror (op_errno)); - goto out; - } - - op_ret = posix_lstat_with_gen (this, oldparentpath, &postoldparent); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "post-operation lstat on parent of %s failed: %s", - oldloc->path, strerror (op_errno)); - goto out; - } - - op_ret = posix_lstat_with_gen (this, newparentpath, &postnewparent); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "post-operation lstat on parent of %s failed: %s", - newloc->path, strerror (op_errno)); - goto out; - } - - op_ret = 0; - - out: - if (oldpathdup) - FREE (oldpathdup); - - if (newpathdup) - FREE (newpathdup); - - - SET_TO_OLD_FS_ID (); - - STACK_UNWIND_STRICT (rename, frame, op_ret, op_errno, &stbuf, - &preoldparent, &postoldparent, - &prenewparent, &postnewparent); - - if ((op_ret == -1) && !was_present) { - unlink (real_newpath); - } - - return 0; -} - - -int -posix_link (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - char *real_oldpath = 0; - char *real_newpath = 0; - struct stat stbuf = {0, }; - struct posix_private *priv = NULL; - char was_present = 1; - char *newpathdup = NULL; - char *newparentpath = NULL; - struct stat preparent = {0,}; - struct stat postparent = {0,}; - - DECLARE_OLD_FS_ID_VAR; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (oldloc, out); - VALIDATE_OR_GOTO (newloc, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - SET_FS_ID (frame->root->uid, frame->root->gid); - MAKE_REAL_PATH (real_oldpath, this, oldloc->path); - MAKE_REAL_PATH (real_newpath, this, 
newloc->path); - - op_ret = posix_lstat_with_gen (this, real_newpath, &stbuf); - if ((op_ret == -1) && (errno == ENOENT)) { - was_present = 0; - } - - newpathdup = strdup (real_newpath); - if (!newpathdup) { - gf_log (this->name, GF_LOG_ERROR, "strdup failed"); - op_errno = ENOMEM; - goto out; - } - - newparentpath = dirname (newpathdup); - op_ret = posix_lstat_with_gen (this, newparentpath, &preparent); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, "lstat failed: %s: %s", - newparentpath, strerror (op_errno)); - goto out; - } - - op_ret = link (real_oldpath, real_newpath); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "link %s to %s failed: %s", - oldloc->path, newloc->path, strerror (op_errno)); - goto out; - } - - op_ret = posix_lstat_with_gen (this, real_newpath, &stbuf); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "lstat on %s failed: %s", - real_newpath, strerror (op_errno)); - goto out; - } - - op_ret = posix_lstat_with_gen (this, newparentpath, &postparent); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, "lstat failed: %s: %s", - newparentpath, strerror (op_errno)); - goto out; - } - - op_ret = 0; - - out: - if (newpathdup) - FREE (newpathdup); - SET_TO_OLD_FS_ID (); - - STACK_UNWIND_STRICT (link, frame, op_ret, op_errno, - oldloc->inode, &stbuf, &preparent, &postparent); - - if ((op_ret == -1) && (!was_present)) { - unlink (real_newpath); - } - - return 0; -} - - -int32_t -posix_truncate (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - off_t offset) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - char *real_path = 0; - struct posix_private *priv = NULL; - struct stat prebuf = {0,}; - struct stat postbuf = {0,}; - - DECLARE_OLD_FS_ID_VAR; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (loc, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - SET_FS_ID 
(frame->root->uid, frame->root->gid); - MAKE_REAL_PATH (real_path, this, loc->path); - - op_ret = posix_lstat_with_gen (this, real_path, &prebuf); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "pre-operation lstat on %s failed: %s", - loc->path, strerror (op_errno)); - goto out; - } - - op_ret = truncate (real_path, offset); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "truncate on %s failed: %s", - loc->path, strerror (op_errno)); - goto out; - } - - op_ret = posix_lstat_with_gen (this, real_path, &postbuf); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, "lstat on %s failed: %s", - real_path, strerror (op_errno)); - goto out; - } - - op_ret = 0; - - out: - SET_TO_OLD_FS_ID (); - - STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, - &prebuf, &postbuf); - - return 0; -} - - -int32_t -posix_create (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, mode_t mode, - fd_t *fd) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - int32_t _fd = -1; - int _flags = 0; - char * real_path = NULL; - struct stat stbuf = {0, }; - struct posix_fd * pfd = NULL; - struct posix_private * priv = NULL; - char was_present = 1; - - gid_t gid = 0; - char *pathdup = NULL; - char *parentpath = NULL; - struct stat preparent = {0,}; - struct stat postparent = {0,}; - - DECLARE_OLD_FS_ID_VAR; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); - VALIDATE_OR_GOTO (fd, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - MAKE_REAL_PATH (real_path, this, loc->path); - - gid = frame->root->gid; - - op_ret = setgid_override (this, real_path, &gid); - - if (op_ret < 0) { - goto out; - } - - SET_FS_ID (frame->root->uid, gid); - pathdup = strdup (real_path); - GF_VALIDATE_OR_GOTO (this->name, pathdup, out); - - parentpath = dirname (pathdup); - - op_ret = posix_lstat_with_gen (this, 
parentpath, &preparent); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "pre-operation lstat on parent of %s failed: %s", - loc->path, strerror (op_errno)); - goto out; - } - - if (!flags) { - _flags = O_CREAT | O_RDWR | O_EXCL; - } - else { - _flags = flags | O_CREAT; - } - - op_ret = posix_lstat_with_gen (this, real_path, &stbuf); - if ((op_ret == -1) && (errno == ENOENT)) { - was_present = 0; - } - - if (priv->o_direct) - _flags |= O_DIRECT; - - _fd = open (real_path, _flags, mode); - - if (_fd == -1) { - op_errno = errno; - op_ret = -1; - gf_log (this->name, GF_LOG_ERROR, - "open on %s failed: %s", loc->path, - strerror (op_errno)); - goto out; - } - -#ifndef HAVE_SET_FSID - op_ret = chown (real_path, frame->root->uid, gid); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "chown on %s failed: %s", - real_path, strerror (op_errno)); - } +/* for SEEK_HOLE and SEEK_DATA */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE #endif - op_ret = posix_fstat_with_gen (this, _fd, &stbuf); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "fstat on %d failed: %s", _fd, strerror (op_errno)); - goto out; - } - - op_ret = posix_lstat_with_gen (this, parentpath, &postparent); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "post-operation lstat on parent of %s failed: %s", - loc->path, strerror (op_errno)); - goto out; - } - - op_ret = -1; - pfd = CALLOC (1, sizeof (*pfd)); - - if (!pfd) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - pfd->flags = flags; - pfd->fd = _fd; - - fd_ctx_set (fd, this, (uint64_t)(long)pfd); - - LOCK (&priv->lock); - { - priv->stats.nr_files++; - } - UNLOCK (&priv->lock); - - op_ret = 0; - - out: - if (pathdup) - FREE (pathdup); - SET_TO_OLD_FS_ID (); - - if ((-1 == op_ret) && (_fd != -1)) { - close (_fd); - - if (!was_present) { - unlink (real_path); - } - } - - STACK_UNWIND_STRICT 
(create, frame, op_ret, op_errno, - fd, loc->inode, &stbuf, &preparent, &postparent); - - return 0; -} - -int32_t -posix_open (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, fd_t *fd, int wbflags) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - char *real_path = NULL; - int32_t _fd = -1; - struct posix_fd *pfd = NULL; - struct posix_private *priv = NULL; - char was_present = 1; - gid_t gid = 0; - struct stat stbuf = {0, }; - - DECLARE_OLD_FS_ID_VAR; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); - VALIDATE_OR_GOTO (fd, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - MAKE_REAL_PATH (real_path, this, loc->path); - - op_ret = setgid_override (this, real_path, &gid); - if (op_ret < 0) - goto out; - - SET_FS_ID (frame->root->uid, gid); - - if (priv->o_direct) - flags |= O_DIRECT; - - op_ret = posix_lstat_with_gen (this, real_path, &stbuf); - if ((op_ret == -1) && (errno == ENOENT)) { - was_present = 0; - } - - _fd = open (real_path, flags, 0); - if (_fd == -1) { - op_errno = errno; - op_ret = -1; - gf_log (this->name, GF_LOG_ERROR, - "open on %s: %s", real_path, strerror (op_errno)); - goto out; - } - - pfd = CALLOC (1, sizeof (*pfd)); - - if (!pfd) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - pfd->flags = flags; - pfd->fd = _fd; - if (wbflags == GF_OPEN_FSYNC) - pfd->flushwrites = 1; - - fd_ctx_set (fd, this, (uint64_t)(long)pfd); - -#ifndef HAVE_SET_FSID - if (flags & O_CREAT) { - op_ret = chown (real_path, frame->root->uid, gid); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "chown on %s failed: %s", - real_path, strerror (op_errno)); - goto out; - } - } -#endif - - if (flags & O_CREAT) { - op_ret = posix_lstat_with_gen (this, real_path, &stbuf); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, "lstat on (%s) " - 
"failed: %s", real_path, strerror (op_errno)); - goto out; - } - } - - LOCK (&priv->lock); - { - priv->stats.nr_files++; - } - UNLOCK (&priv->lock); - - op_ret = 0; - - out: - if (op_ret == -1) { - if (_fd != -1) { - close (_fd); - _fd = -1; - } - } - - SET_TO_OLD_FS_ID (); - - STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd); - - return 0; -} - -#define ALIGN_BUF(ptr,bound) ((void *)((unsigned long)(ptr + bound - 1) & \ - (unsigned long)(~(bound - 1)))) - -int -posix_readv (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset) -{ - uint64_t tmp_pfd = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - int _fd = -1; - struct posix_private * priv = NULL; - struct iobuf * iobuf = NULL; - struct iobref * iobref = NULL; - struct iovec vec = {0,}; - struct posix_fd * pfd = NULL; - struct stat stbuf = {0,}; - int align = 1; - int ret = -1; - off_t off_ret = -1; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - ret = fd_ctx_get (fd, this, &tmp_pfd); - if (ret < 0) { - op_errno = -ret; - gf_log (this->name, GF_LOG_DEBUG, - "pfd is NULL from fd=%p", fd); - goto out; - } - pfd = (struct posix_fd *)(long)tmp_pfd; - - if (!size) { - op_errno = EINVAL; - gf_log (this->name, GF_LOG_DEBUG, "size=%"GF_PRI_SIZET, size); - goto out; - } - - if (pfd->flags & O_DIRECT) { - align = 4096; /* align to page boundary */ - } - - iobuf = iobuf_get (this->ctx->iobuf_pool); - if (!iobuf) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - _fd = pfd->fd; - - off_ret = lseek (_fd, offset, SEEK_SET); - if (off_ret == -1) { - op_ret = -1; - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "lseek(%"PRId64") failed: %s", - offset, strerror (op_errno)); - goto out; - } - - op_ret = read (_fd, iobuf->ptr, size); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - 
"read failed on fd=%p: %s", fd, - strerror (op_errno)); - goto out; - } - - LOCK (&priv->lock); - { - priv->read_value += op_ret; - priv->interval_read += op_ret; - } - UNLOCK (&priv->lock); - - vec.iov_base = iobuf->ptr; - vec.iov_len = op_ret; - - op_ret = -1; - iobref = iobref_new (); - - iobref_add (iobref, iobuf); - - /* - * readv successful, and we need to get the stat of the file - * we read from - */ - - op_ret = posix_fstat_with_gen (this, _fd, &stbuf); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "fstat failed on fd=%p: %s", fd, - strerror (op_errno)); - goto out; - } - - op_ret = vec.iov_len; -out: - - STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, - &vec, 1, &stbuf, iobref); - - if (iobref) - iobref_unref (iobref); - if (iobuf) - iobuf_unref (iobuf); - - return 0; -} - - -int32_t -posix_writev (call_frame_t *frame, xlator_t *this, - fd_t *fd, struct iovec *vector, int32_t count, off_t offset, - struct iobref *iobref) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - int _fd = -1; - struct posix_private * priv = NULL; - struct posix_fd * pfd = NULL; - struct stat preop = {0,}; - struct stat postop = {0,}; - int ret = -1; - off_t off_ret = -1; - - int idx = 0; - int align = 4096; - int max_buf_size = 0; - int retval = 0; - char * buf = NULL; - char * alloc_buf = NULL; - uint64_t tmp_pfd = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - VALIDATE_OR_GOTO (vector, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - VALIDATE_OR_GOTO (priv, out); - - ret = fd_ctx_get (fd, this, &tmp_pfd); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "pfd is NULL from fd=%p", fd); - op_errno = -ret; - goto out; - } - pfd = (struct posix_fd *)(long)tmp_pfd; - - _fd = pfd->fd; - - op_ret = posix_fstat_with_gen (this, _fd, &preop); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "pre-operation fstat failed on fd=%p: 
%s", fd, - strerror (op_errno)); - goto out; - } - - off_ret = lseek (_fd, offset, SEEK_SET); - - if (off_ret == -1) { - op_ret = -1; - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "lseek(%"PRId64") on fd=%p failed: %s", - offset, fd, strerror (op_errno)); - goto out; - } - - /* Check for the O_DIRECT flag during open() */ - if (pfd->flags & O_DIRECT) { - /* This is O_DIRECT'd file */ - op_ret = -1; - for (idx = 0; idx < count; idx++) { - if (max_buf_size < vector[idx].iov_len) - max_buf_size = vector[idx].iov_len; - } - - alloc_buf = MALLOC (1 * (max_buf_size + align)); - if (!alloc_buf) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - for (idx = 0; idx < count; idx++) { - /* page aligned buffer */ - buf = ALIGN_BUF (alloc_buf, align); - - memcpy (buf, vector[idx].iov_base, - vector[idx].iov_len); - - /* not sure whether writev works on O_DIRECT'd fd */ - retval = write (_fd, buf, vector[idx].iov_len); - - if (retval == -1) { - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "O_DIRECT enabled on fd=%p: %s", - fd, strerror (op_errno)); - goto out; - } - - break; - } - if (op_ret == -1) - op_ret = 0; - op_ret += retval; - } - - } else /* if (O_DIRECT) */ { - - /* This is not O_DIRECT'd fd */ - op_ret = writev (_fd, vector, count); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "writev failed on fd=%p: %s", - fd, strerror (op_errno)); - goto out; - } - } - - LOCK (&priv->lock); - { - priv->write_value += op_ret; - priv->interval_write += op_ret; - } - UNLOCK (&priv->lock); - - if (op_ret >= 0) { - /* wiretv successful, we also need to get the stat of - * the file we wrote to - */ - - if (pfd->flushwrites) { - /* NOTE: ignore the error, if one occurs at this - * point */ - fsync (_fd); - } - - ret = posix_fstat_with_gen (this, _fd, &postop); - if (ret == -1) { - op_ret = -1; - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - 
"post-operation fstat failed on fd=%p: %s", - fd, strerror (op_errno)); - goto out; - } - } - - out: - if (alloc_buf) { - FREE (alloc_buf); - } - - STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, &preop, &postop); - - return 0; -} - - -int32_t -posix_statfs (call_frame_t *frame, xlator_t *this, - loc_t *loc) -{ - char * real_path = NULL; - int32_t op_ret = -1; - int32_t op_errno = 0; - struct statvfs buf = {0, }; - struct posix_private * priv = NULL; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (loc, out); - VALIDATE_OR_GOTO (this->private, out); - - MAKE_REAL_PATH (real_path, this, loc->path); - - priv = this->private; - - op_ret = statvfs (real_path, &buf); - - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "statvfs failed on %s: %s", - real_path, strerror (op_errno)); - goto out; - } - - if (!priv->export_statfs) { - buf.f_blocks = 0; - buf.f_bfree = 0; - buf.f_bavail = 0; - buf.f_files = 0; - buf.f_ffree = 0; - buf.f_favail = 0; - } - - op_ret = 0; - - out: - STACK_UNWIND_STRICT (statfs, frame, op_ret, op_errno, &buf); - return 0; -} - - -int32_t -posix_flush (call_frame_t *frame, xlator_t *this, - fd_t *fd) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - int _fd = -1; - struct posix_fd * pfd = NULL; - int ret = -1; - uint64_t tmp_pfd = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - - ret = fd_ctx_get (fd, this, &tmp_pfd); - if (ret < 0) { - op_errno = -ret; - gf_log (this->name, GF_LOG_DEBUG, - "pfd is NULL on fd=%p", fd); - goto out; - } - pfd = (struct posix_fd *)(long)tmp_pfd; - - _fd = pfd->fd; - - /* do nothing */ - - op_ret = 0; - - out: - STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno); - - return 0; -} - - -int32_t -posix_release (xlator_t *this, - fd_t *fd) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - int _fd = -1; - struct posix_private * priv = NULL; - struct posix_fd * pfd = NULL; - int ret = -1; - 
uint64_t tmp_pfd = 0; - - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - - priv = this->private; - - ret = fd_ctx_get (fd, this, &tmp_pfd); - if (ret < 0) { - op_errno = -ret; - gf_log (this->name, GF_LOG_DEBUG, - "pfd is NULL from fd=%p", fd); - goto out; - } - pfd = (struct posix_fd *)(long)tmp_pfd; - - _fd = pfd->fd; - - if (pfd->dir) { - op_ret = -1; - op_errno = EBADF; - gf_log (this->name, GF_LOG_DEBUG, - "pfd->dir is %p (not NULL) for file fd=%p", - pfd->dir, fd); - } - - pthread_mutex_lock (&priv->janitor_lock); - { - INIT_LIST_HEAD (&pfd->list); - list_add_tail (&pfd->list, &priv->janitor_fds); - pthread_cond_signal (&priv->janitor_cond); - } - pthread_mutex_unlock (&priv->janitor_lock); - - LOCK (&priv->lock); - { - priv->stats.nr_files--; - } - UNLOCK (&priv->lock); - - op_ret = 0; - - out: - return 0; -} - - -int32_t -posix_fsync (call_frame_t *frame, xlator_t *this, - fd_t *fd, int32_t datasync) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - int _fd = -1; - struct posix_fd * pfd = NULL; - int ret = -1; - uint64_t tmp_pfd = 0; - struct stat preop = {0,}; - struct stat postop = {0,}; - - DECLARE_OLD_FS_ID_VAR; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - - SET_FS_ID (frame->root->uid, frame->root->gid); - -#ifdef GF_DARWIN_HOST_OS - /* Always return success in case of fsync in MAC OS X */ - op_ret = 0; - goto out; -#endif - - ret = fd_ctx_get (fd, this, &tmp_pfd); - if (ret < 0) { - op_errno = -ret; - gf_log (this->name, GF_LOG_DEBUG, - "pfd not found in fd's ctx"); - goto out; - } - pfd = (struct posix_fd *)(long)tmp_pfd; - - _fd = pfd->fd; - - op_ret = posix_fstat_with_gen (this, _fd, &preop); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "pre-operation fstat failed on fd=%p: %s", fd, - strerror (op_errno)); - goto out; - } - - if (datasync) { - ; -#ifdef HAVE_FDATASYNC - op_ret = fdatasync (_fd); -#endif - } else { - op_ret = fsync (_fd); - if 
(op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "fsync on fd=%p failed: %s", - fd, strerror (op_errno)); - goto out; - } - } - - op_ret = posix_fstat_with_gen (this, _fd, &postop); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "post-operation fstat failed on fd=%p: %s", fd, - strerror (op_errno)); - goto out; - } - - op_ret = 0; - - out: - SET_TO_OLD_FS_ID (); - - STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, &preop, &postop); - - return 0; -} - -static int gf_posix_xattr_enotsup_log; - -int -set_file_contents (xlator_t *this, char *real_path, - data_pair_t *trav, int flags) -{ - char * key = NULL; - char real_filepath[ZR_PATH_MAX] = {0,}; - int32_t file_fd = -1; - int op_ret = 0; - int ret = -1; - - key = &(trav->key[15]); - sprintf (real_filepath, "%s/%s", real_path, key); - - if (flags & XATTR_REPLACE) { - /* if file exists, replace it - * else, error out */ - file_fd = open (real_filepath, O_TRUNC|O_WRONLY); - - if (file_fd == -1) { - goto create; - } - - if (trav->value->len) { - ret = write (file_fd, trav->value->data, - trav->value->len); - if (ret == -1) { - op_ret = -errno; - gf_log (this->name, GF_LOG_ERROR, - "write failed while doing setxattr " - "for key %s on path %s: %s", - key, real_filepath, strerror (errno)); - goto out; - } - - ret = close (file_fd); - if (ret == -1) { - op_ret = -errno; - gf_log (this->name, GF_LOG_ERROR, - "close failed on %s: %s", - real_filepath, strerror (errno)); - goto out; - } - } - - create: /* we know file doesn't exist, create it */ - - file_fd = open (real_filepath, O_CREAT|O_WRONLY, 0644); - - if (file_fd == -1) { - op_ret = -errno; - gf_log (this->name, GF_LOG_ERROR, - "failed to open file %s with O_CREAT: %s", - key, strerror (errno)); - goto out; - } - - ret = write (file_fd, trav->value->data, trav->value->len); - if (ret == -1) { - op_ret = -errno; - gf_log (this->name, GF_LOG_ERROR, - "write failed on %s while setxattr with " - "key %s: %s", - 
real_filepath, key, strerror (errno)); - goto out; - } - - ret = close (file_fd); - if (ret == -1) { - op_ret = -errno; - gf_log (this->name, GF_LOG_ERROR, - "close failed on %s while setxattr with " - "key %s: %s", - real_filepath, key, strerror (errno)); - goto out; - } - } - - out: - return op_ret; -} - -int -handle_pair (xlator_t *this, char *real_path, - data_pair_t *trav, int flags) -{ - int sys_ret = -1; - int ret = 0; - - if (ZR_FILE_CONTENT_REQUEST(trav->key)) { - ret = set_file_contents (this, real_path, trav, flags); - } else { - sys_ret = sys_lsetxattr (real_path, trav->key, - trav->value->data, - trav->value->len, flags); - - if (sys_ret < 0) { - if (errno == ENOTSUP) { - GF_LOG_OCCASIONALLY(gf_posix_xattr_enotsup_log, - this->name,GF_LOG_WARNING, - "Extended attributes not " - "supported"); - } else if (errno == ENOENT) { - gf_log (this->name, GF_LOG_ERROR, - "setxattr on %s failed: %s", real_path, - strerror (errno)); - } else { - -#ifdef GF_DARWIN_HOST_OS - gf_log (this->name, - ((errno == EINVAL) ? - GF_LOG_DEBUG : GF_LOG_ERROR), - "%s: key:%s error:%s", - real_path, trav->key, - strerror (errno)); -#else /* ! 
DARWIN */ - gf_log (this->name, GF_LOG_ERROR, - "%s: key:%s error:%s", - real_path, trav->key, - strerror (errno)); -#endif /* DARWIN */ - } - - ret = -errno; - goto out; - } - } - out: - return ret; -} - -int32_t -posix_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *dict, int flags) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - char * real_path = NULL; - data_pair_t * trav = NULL; - int ret = -1; - - DECLARE_OLD_FS_ID_VAR; - SET_FS_ID (frame->root->uid, frame->root->gid); - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (loc, out); - VALIDATE_OR_GOTO (dict, out); - - MAKE_REAL_PATH (real_path, this, loc->path); - - trav = dict->members_list; - - while (trav) { - ret = handle_pair (this, real_path, trav, flags); - if (ret < 0) { - op_errno = -ret; - goto out; - } - trav = trav->next; - } - - op_ret = 0; - - out: - SET_TO_OLD_FS_ID (); - - STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno); - - return 0; -} - -int -get_file_contents (xlator_t *this, char *real_path, - const char *name, char **contents) -{ - char real_filepath[ZR_PATH_MAX] = {0,}; - char * key = NULL; - int32_t file_fd = -1; - struct stat stbuf = {0,}; - int op_ret = 0; - int ret = -1; - - key = (char *) &(name[15]); - sprintf (real_filepath, "%s/%s", real_path, key); - - op_ret = posix_lstat_with_gen (this, real_filepath, &stbuf); - if (op_ret == -1) { - op_ret = -errno; - gf_log (this->name, GF_LOG_ERROR, "lstat failed on %s: %s", - real_filepath, strerror (errno)); - goto out; - } - - file_fd = open (real_filepath, O_RDONLY); - - if (file_fd == -1) { - op_ret = -errno; - gf_log (this->name, GF_LOG_ERROR, "open failed on %s: %s", - real_filepath, strerror (errno)); - goto out; - } - - *contents = CALLOC (stbuf.st_size + 1, sizeof(char)); - - if (! 
*contents) { - op_ret = -errno; - gf_log (this->name, GF_LOG_ERROR, "Out of memory."); - goto out; - } - - ret = read (file_fd, *contents, stbuf.st_size); - if (ret <= 0) { - op_ret = -1; - gf_log (this->name, GF_LOG_ERROR, "read on %s failed: %s", - real_filepath, strerror (errno)); - goto out; - } - - *contents[stbuf.st_size] = '\0'; - - op_ret = close (file_fd); - file_fd = -1; - if (op_ret == -1) { - op_ret = -errno; - gf_log (this->name, GF_LOG_ERROR, "close on %s failed: %s", - real_filepath, strerror (errno)); - goto out; - } - - out: - if (op_ret < 0) { - if (*contents) - FREE (*contents); - if (file_fd != -1) - close (file_fd); - } - - return op_ret; -} - -/** - * posix_getxattr - this function returns a dictionary with all the - * key:value pair present as xattr. used for - * both 'listxattr' and 'getxattr'. - */ -int32_t -posix_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name) -{ - struct posix_private *priv = NULL; - int32_t op_ret = -1; - int32_t op_errno = ENOENT; - int32_t list_offset = 0; - size_t size = 0; - size_t remaining_size = 0; - char key[1024] = {0,}; - char gen_key[1024] = {0,}; - char * value = NULL; - char * list = NULL; - char * real_path = NULL; - dict_t * dict = NULL; - char * file_contents = NULL; - int ret = -1; - - DECLARE_OLD_FS_ID_VAR; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (loc, out); - - SET_FS_ID (frame->root->uid, frame->root->gid); - MAKE_REAL_PATH (real_path, this, loc->path); - - priv = this->private; - - if (loc->inode && S_ISDIR(loc->inode->st_mode) && name && - ZR_FILE_CONTENT_REQUEST(name)) { - ret = get_file_contents (this, real_path, name, - &file_contents); - if (ret < 0) { - op_errno = -ret; - gf_log (this->name, GF_LOG_ERROR, - "getting file contents failed: %s", - strerror (op_errno)); - goto out; - } - } - - /* Get the total size */ - dict = get_new_dict (); - if (!dict) { - gf_log (this->name, GF_LOG_ERROR, "Out of memory."); - goto 
out; - } - - if (loc->inode && S_ISREG (loc->inode->st_mode) && name && - (strcmp (name, "trusted.glusterfs.location") == 0)) { - ret = dict_set_static_ptr (dict, - "trusted.glusterfs.location", - priv->hostname); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "could not set hostname (%s) in dictionary", - priv->hostname); - } - goto done; - } - - - size = sys_llistxattr (real_path, NULL, 0); - if (size == -1) { - op_errno = errno; - if ((errno == ENOTSUP) || (errno == ENOSYS)) { - GF_LOG_OCCASIONALLY (gf_posix_xattr_enotsup_log, - this->name, GF_LOG_WARNING, - "Extended attributes not " - "supported."); - } - else { - gf_log (this->name, GF_LOG_ERROR, - "listxattr failed on %s: %s", - real_path, strerror (op_errno)); - } - goto out; - } - - if (size == 0) - goto done; - - list = alloca (size + 1); - if (!list) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, "Out of memory."); - goto out; - } - - ret = snprintf (gen_key, 1023, "trusted.%s.gen", this->name); - - size = sys_llistxattr (real_path, list, size); - - remaining_size = size; - list_offset = 0; - while (remaining_size > 0) { - if(*(list + list_offset) == '\0') - break; - - strcpy (key, list + list_offset); - op_ret = sys_lgetxattr (real_path, key, NULL, 0); - if (op_ret == -1) - break; - - value = CALLOC (op_ret + 1, sizeof(char)); - if (!value) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, "Out of memory."); - goto out; - } - - op_ret = sys_lgetxattr (real_path, key, value, op_ret); - if (op_ret == -1) - break; - - value [op_ret] = '\0'; - if (strcmp (key, gen_key) != 0) - dict_set (dict, key, data_from_dynptr (value, op_ret)); - else - FREE (value); - - remaining_size -= strlen (key) + 1; - list_offset += strlen (key) + 1; - - } /* while (remaining_size > 0) */ - - done: - op_ret = size; - - if (dict) { - dict_ref (dict); - } - - out: - SET_TO_OLD_FS_ID (); - - STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict); - - if (dict) - dict_unref (dict); - - return 0; -} 
- - -int32_t -posix_fgetxattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, const char *name) -{ - int32_t op_ret = -1; - int32_t op_errno = ENOENT; - uint64_t tmp_pfd = 0; - struct posix_fd * pfd = NULL; - int _fd = -1; - int32_t list_offset = 0; - size_t size = 0; - size_t remaining_size = 0; - char key[1024] = {0,}; - char * value = NULL; - char * list = NULL; - dict_t * dict = NULL; - int ret = -1; - - DECLARE_OLD_FS_ID_VAR; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - - SET_FS_ID (frame->root->uid, frame->root->gid); - - ret = fd_ctx_get (fd, this, &tmp_pfd); - if (ret < 0) { - op_errno = -ret; - gf_log (this->name, GF_LOG_DEBUG, - "pfd is NULL from fd=%p", fd); - goto out; - } - pfd = (struct posix_fd *)(long)tmp_pfd; - - _fd = pfd->fd; - - /* Get the total size */ - dict = get_new_dict (); - if (!dict) { - gf_log (this->name, GF_LOG_ERROR, "Out of memory."); - goto out; - } - - size = sys_flistxattr (_fd, NULL, 0); - if (size == -1) { - op_errno = errno; - if ((errno == ENOTSUP) || (errno == ENOSYS)) { - GF_LOG_OCCASIONALLY (gf_posix_xattr_enotsup_log, - this->name, GF_LOG_WARNING, - "Extended attributes not " - "supported."); - } - else { - gf_log (this->name, GF_LOG_ERROR, - "listxattr failed on %p: %s", - fd, strerror (op_errno)); - } - goto out; - } - - if (size == 0) - goto done; - - list = alloca (size + 1); - if (!list) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, "Out of memory."); - goto out; - } - - size = sys_flistxattr (_fd, list, size); - - remaining_size = size; - list_offset = 0; - while (remaining_size > 0) { - if(*(list + list_offset) == '\0') - break; - - strcpy (key, list + list_offset); - op_ret = sys_fgetxattr (_fd, key, NULL, 0); - if (op_ret == -1) - break; - - value = CALLOC (op_ret + 1, sizeof(char)); - if (!value) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, "Out of memory."); - goto out; - } - - op_ret = sys_fgetxattr (_fd, key, value, op_ret); - if 
(op_ret == -1) - break; - - value [op_ret] = '\0'; - dict_set (dict, key, data_from_dynptr (value, op_ret)); - remaining_size -= strlen (key) + 1; - list_offset += strlen (key) + 1; - - } /* while (remaining_size > 0) */ - - done: - op_ret = size; - - if (dict) { - dict_ref (dict); - } - - out: - SET_TO_OLD_FS_ID (); - - STACK_UNWIND_STRICT (fgetxattr, frame, op_ret, op_errno, dict); - - if (dict) - dict_unref (dict); - - return 0; -} - - -int -fhandle_pair (xlator_t *this, int fd, - data_pair_t *trav, int flags) -{ - int sys_ret = -1; - int ret = 0; - - sys_ret = sys_fsetxattr (fd, trav->key, trav->value->data, - trav->value->len, flags); - - if (sys_ret < 0) { - if (errno == ENOTSUP) { - GF_LOG_OCCASIONALLY(gf_posix_xattr_enotsup_log, - this->name,GF_LOG_WARNING, - "Extended attributes not " - "supported"); - } else if (errno == ENOENT) { - gf_log (this->name, GF_LOG_ERROR, - "fsetxattr on fd=%d failed: %s", fd, - strerror (errno)); - } else { - -#ifdef GF_DARWIN_HOST_OS - gf_log (this->name, - ((errno == EINVAL) ? - GF_LOG_DEBUG : GF_LOG_ERROR), - "fd=%d: key:%s error:%s", - fd, trav->key, - strerror (errno)); -#else /* ! 
DARWIN */ - gf_log (this->name, GF_LOG_ERROR, - "fd=%d: key:%s error:%s", - fd, trav->key, - strerror (errno)); -#endif /* DARWIN */ - } - - ret = -errno; - goto out; - } - -out: - return ret; -} - - -int32_t -posix_fsetxattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, dict_t *dict, int flags) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - struct posix_fd * pfd = NULL; - uint64_t tmp_pfd = 0; - int _fd = -1; - data_pair_t * trav = NULL; - int ret = -1; - - DECLARE_OLD_FS_ID_VAR; - SET_FS_ID (frame->root->uid, frame->root->gid); - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - VALIDATE_OR_GOTO (dict, out); - - ret = fd_ctx_get (fd, this, &tmp_pfd); - if (ret < 0) { - op_errno = -ret; - gf_log (this->name, GF_LOG_DEBUG, - "pfd is NULL from fd=%p", fd); - goto out; - } - pfd = (struct posix_fd *)(long)tmp_pfd; - _fd = pfd->fd; - - trav = dict->members_list; - - while (trav) { - ret = fhandle_pair (this, _fd, trav, flags); - if (ret < 0) { - op_errno = -ret; - goto out; - } - trav = trav->next; - } - - op_ret = 0; - - out: - SET_TO_OLD_FS_ID (); - - STACK_UNWIND_STRICT (fsetxattr, frame, op_ret, op_errno); - - return 0; -} - - -int32_t -posix_removexattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - char * real_path = NULL; - - DECLARE_OLD_FS_ID_VAR; - - MAKE_REAL_PATH (real_path, this, loc->path); - - SET_FS_ID (frame->root->uid, frame->root->gid); - - op_ret = sys_lremovexattr (real_path, name); - - if (op_ret == -1) { - op_errno = errno; - if (op_errno != ENOATTR && op_errno != EPERM) - gf_log (this->name, GF_LOG_ERROR, - "removexattr on %s: %s", loc->path, - strerror (op_errno)); - goto out; - } - - op_ret = 0; - - out: - SET_TO_OLD_FS_ID (); - - STACK_UNWIND_STRICT (removexattr, frame, op_ret, op_errno); - return 0; -} - - -int32_t -posix_fsyncdir (call_frame_t *frame, xlator_t *this, - fd_t *fd, int datasync) -{ - int32_t op_ret 
= -1; - int32_t op_errno = 0; - struct posix_fd * pfd = NULL; - int _fd = -1; - int ret = -1; - uint64_t tmp_pfd = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - - ret = fd_ctx_get (fd, this, &tmp_pfd); - if (ret < 0) { - op_errno = -ret; - gf_log (this->name, GF_LOG_DEBUG, - "pfd is NULL, fd=%p", fd); - goto out; - } - pfd = (struct posix_fd *)(long)tmp_pfd; - - _fd = pfd->fd; - - op_ret = 0; - - out: - STACK_UNWIND_STRICT (fsyncdir, frame, op_ret, op_errno); - - return 0; -} - - -void -posix_print_xattr (dict_t *this, - char *key, - data_t *value, - void *data) -{ - gf_log ("posix", GF_LOG_DEBUG, - "(key/val) = (%s/%d)", key, data_to_int32 (value)); -} - - -/** - * add_array - add two arrays of 32-bit numbers (stored in network byte order) - * dest = dest + src - * @count: number of 32-bit numbers - * FIXME: handle overflow - */ - -static void -__add_array (int32_t *dest, int32_t *src, int count) -{ - int i = 0; - for (i = 0; i < count; i++) { - dest[i] = hton32 (ntoh32 (dest[i]) + ntoh32 (src[i])); - } -} - - -/** - * xattrop - xattr operations - for internal use by GlusterFS - * @optype: ADD_ARRAY: - * dict should contain: - * "key" ==> array of 32-bit numbers - */ - -int -do_xattrop (call_frame_t *frame, xlator_t *this, - loc_t *loc, fd_t *fd, gf_xattrop_flags_t optype, dict_t *xattr) -{ - char *real_path = NULL; - int32_t *array = NULL; - int size = 0; - int count = 0; - - int op_ret = 0; - int op_errno = 0; - - int ret = 0; - int _fd = -1; - uint64_t tmp_pfd = 0; - struct posix_fd *pfd = NULL; - - data_pair_t *trav = NULL; - - char * path = NULL; - inode_t * inode = NULL; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (xattr, out); - VALIDATE_OR_GOTO (this, out); - - trav = xattr->members_list; - - if (fd) { - ret = fd_ctx_get (fd, this, &tmp_pfd); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "failed to get pfd from fd=%p", - fd); - op_ret = -1; - op_errno = EBADFD; - goto out; - } - pfd 
= (struct posix_fd *)(long)tmp_pfd; - _fd = pfd->fd; - } - - if (loc && loc->path) - MAKE_REAL_PATH (real_path, this, loc->path); - - if (loc) { - path = strdup (loc->path); - inode = loc->inode; - } else { - inode = fd->inode; - } - - while (trav) { - count = trav->value->len / sizeof (int32_t); - array = CALLOC (count, sizeof (int32_t)); - - LOCK (&inode->lock); - { - if (loc) { - size = sys_lgetxattr (real_path, trav->key, (char *)array, - trav->value->len); - } else { - size = sys_fgetxattr (_fd, trav->key, (char *)array, - trav->value->len); - } - - op_errno = errno; - if ((size == -1) && (op_errno != ENODATA) && - (op_errno != ENOATTR)) { - if (op_errno == ENOTSUP) { - GF_LOG_OCCASIONALLY(gf_posix_xattr_enotsup_log, - this->name,GF_LOG_WARNING, - "Extended attributes not " - "supported by filesystem"); - } else { - if (loc) - gf_log (this->name, GF_LOG_ERROR, - "getxattr failed on %s while doing " - "xattrop: %s", path, - strerror (op_errno)); - else - gf_log (this->name, GF_LOG_ERROR, - "fgetxattr failed on fd=%d while doing " - "xattrop: %s", _fd, - strerror (op_errno)); - } - - op_ret = -1; - goto unlock; - } - - switch (optype) { - - case GF_XATTROP_ADD_ARRAY: - __add_array (array, (int32_t *) trav->value->data, - trav->value->len / 4); - break; - - default: - gf_log (this->name, GF_LOG_ERROR, - "Unknown xattrop type (%d) on %s. 
Please send " - "a bug report to gluster-devel@nongnu.org", - optype, path); - op_ret = -1; - op_errno = EINVAL; - goto unlock; - } - - if (loc) { - size = sys_lsetxattr (real_path, trav->key, array, - trav->value->len, 0); - } else { - size = sys_fsetxattr (_fd, trav->key, (char *)array, - trav->value->len, 0); - } - } - unlock: - UNLOCK (&inode->lock); - - if (op_ret == -1) - goto out; - - op_errno = errno; - if (size == -1) { - if (loc) - gf_log (this->name, GF_LOG_ERROR, - "setxattr failed on %s while doing xattrop: " - "key=%s (%s)", path, - trav->key, strerror (op_errno)); - else - gf_log (this->name, GF_LOG_ERROR, - "fsetxattr failed on fd=%d while doing xattrop: " - "key=%s (%s)", _fd, - trav->key, strerror (op_errno)); - - op_ret = -1; - goto out; - } else { - size = dict_set_bin (xattr, trav->key, array, - trav->value->len); - - if (size != 0) { - if (loc) - gf_log (this->name, GF_LOG_DEBUG, - "dict_set_bin failed (path=%s): " - "key=%s (%s)", path, - trav->key, strerror (-size)); - else - gf_log (this->name, GF_LOG_DEBUG, - "dict_set_bin failed (fd=%d): " - "key=%s (%s)", _fd, - trav->key, strerror (-size)); - - op_ret = -1; - op_errno = EINVAL; - goto out; - } - array = NULL; - } - - array = NULL; - trav = trav->next; - } - -out: - if (array) - FREE (array); - - if (path) - FREE (path); - - STACK_UNWIND_STRICT (xattrop, frame, op_ret, op_errno, xattr); - return 0; -} - - -int -posix_xattrop (call_frame_t *frame, xlator_t *this, - loc_t *loc, gf_xattrop_flags_t optype, dict_t *xattr) -{ - do_xattrop (frame, this, loc, NULL, optype, xattr); - return 0; -} - - -int -posix_fxattrop (call_frame_t *frame, xlator_t *this, - fd_t *fd, gf_xattrop_flags_t optype, dict_t *xattr) -{ - do_xattrop (frame, this, NULL, fd, optype, xattr); - return 0; -} - - -int -posix_access (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t mask) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - char *real_path = NULL; - - DECLARE_OLD_FS_ID_VAR; - SET_FS_ID 
(frame->root->uid, frame->root->gid); - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (loc, out); - - MAKE_REAL_PATH (real_path, this, loc->path); - - op_ret = access (real_path, mask & 07); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, "access failed on %s: %s", - loc->path, strerror (op_errno)); - goto out; - } - op_ret = 0; - - out: - SET_TO_OLD_FS_ID (); - - STACK_UNWIND_STRICT (access, frame, op_ret, op_errno); - return 0; -} - - -int32_t -posix_ftruncate (call_frame_t *frame, xlator_t *this, - fd_t *fd, off_t offset) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - int _fd = -1; - struct stat preop = {0,}; - struct stat postop = {0,}; - struct posix_fd *pfd = NULL; - int ret = -1; - uint64_t tmp_pfd = 0; - struct posix_private *priv = NULL; - - DECLARE_OLD_FS_ID_VAR; - SET_FS_ID (frame->root->uid, frame->root->gid); - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - ret = fd_ctx_get (fd, this, &tmp_pfd); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "pfd is NULL, fd=%p", fd); - op_errno = -ret; - goto out; - } - pfd = (struct posix_fd *)(long)tmp_pfd; - - _fd = pfd->fd; - - op_ret = posix_fstat_with_gen (this, _fd, &preop); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "pre-operation fstat failed on fd=%p: %s", fd, - strerror (op_errno)); - goto out; - } - - op_ret = ftruncate (_fd, offset); - - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "ftruncate failed on fd=%p: %s", - fd, strerror (errno)); - goto out; - } - - op_ret = posix_fstat_with_gen (this, _fd, &postop); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "post-operation fstat failed on fd=%p: %s", - fd, strerror (errno)); - goto out; - } - - op_ret = 0; - - out: - SET_TO_OLD_FS_ID (); - - STACK_UNWIND_STRICT 
(ftruncate, frame, op_ret, op_errno, &preop, &postop); - - return 0; -} - - -static int -same_file_type (mode_t m1, mode_t m2) -{ - return ((S_IFMT & (m1 ^ m2)) == 0); -} - - -static int -ensure_file_type (xlator_t *this, char *pathname, mode_t mode) -{ - struct stat stbuf = {0,}; - int op_ret = 0; - int ret = -1; - - ret = posix_lstat_with_gen (this, pathname, &stbuf); - if (ret == -1) { - op_ret = -errno; - gf_log (this->name, GF_LOG_ERROR, - "stat failed while trying to make sure entry %s " - "is a directory: %s", pathname, strerror (errno)); - goto out; - } - - if (!same_file_type (mode, stbuf.st_mode)) { - op_ret = -EEXIST; - gf_log (this->name, GF_LOG_ERROR, - "entry %s is a different type of file " - "than expected", pathname); - goto out; - } - out: - return op_ret; -} - -static int -create_entry (xlator_t *this, int32_t flags, - dir_entry_t *entry, char *pathname) -{ - int op_ret = 0; - int ret = -1; - struct timeval tv[2] = {{0,0},{0,0}}; - - if (S_ISDIR (entry->buf.st_mode)) { - /* - * If the entry is directory, create it by - * calling 'mkdir'. If the entry is already - * present, check if it is a directory, - * and issue a warning if otherwise. 
- */ - - ret = mkdir (pathname, entry->buf.st_mode); - if (ret == -1) { - if (errno == EEXIST) { - op_ret = ensure_file_type (this, pathname, - entry->buf.st_mode); - } - else { - op_ret = -errno; - gf_log (this->name, GF_LOG_ERROR, - "mkdir %s with mode (0%o) failed: %s", - pathname, entry->buf.st_mode, - strerror (errno)); - goto out; - } - } - - } else if ((flags & GF_SET_IF_NOT_PRESENT) - || !(flags & GF_SET_DIR_ONLY)) { - - /* create a 0-byte file here */ - - if (S_ISREG (entry->buf.st_mode)) { - ret = open (pathname, O_CREAT|O_EXCL, - entry->buf.st_mode); - - if (ret == -1) { - if (errno == EEXIST) { - op_ret = ensure_file_type (this, - pathname, - entry->buf.st_mode); - } - else { - op_ret = -errno; - gf_log (this->name, GF_LOG_ERROR, - "Error creating file %s with " - "mode (0%o): %s", - pathname, entry->buf.st_mode, - strerror (errno)); - goto out; - } - } - - close (ret); - - } else if (S_ISLNK (entry->buf.st_mode)) { - ret = symlink (entry->link, pathname); - - if (ret == -1) { - if (errno == EEXIST) { - op_ret = ensure_file_type (this, - pathname, - entry->buf.st_mode); - } - else { - op_ret = -errno; - gf_log (this->name, GF_LOG_ERROR, - "error creating symlink %s: %s" - , pathname, strerror (errno)); - goto out; - } - } - - } else if (S_ISBLK (entry->buf.st_mode) || - S_ISCHR (entry->buf.st_mode) || - S_ISFIFO (entry->buf.st_mode) || - S_ISSOCK (entry->buf.st_mode)) { - - ret = mknod (pathname, entry->buf.st_mode, - entry->buf.st_dev); - - if (ret == -1) { - if (errno == EEXIST) { - op_ret = ensure_file_type (this, - pathname, - entry->buf.st_mode); - } else { - op_ret = -errno; - gf_log (this->name, GF_LOG_ERROR, - "error creating device file " - "%s: %s", - pathname, strerror (errno)); - goto out; - } - } - } else { - gf_log (this->name, GF_LOG_ERROR, - "invalid mode 0%o for %s", entry->buf.st_mode, - pathname); - op_ret = -EINVAL; - goto out; - } - } - - /* - * Preserve atime and mtime - */ - - if (!S_ISLNK (entry->buf.st_mode)) { - tv[0].tv_sec = 
entry->buf.st_atime; - tv[1].tv_sec = entry->buf.st_mtime; - ret = utimes (pathname, tv); - if (ret == -1) { - op_ret = -errno; - gf_log (this->name, GF_LOG_ERROR, - "utimes %s failed: %s", - pathname, strerror (errno)); - goto out; - } - } - -out: - return op_ret; - -} - - -int -posix_setdents (call_frame_t *frame, xlator_t *this, - fd_t *fd, int32_t flags, dir_entry_t *entries, - int32_t count) -{ - char * real_path = NULL; - char * entry_path = NULL; - int32_t real_path_len = -1; - int32_t entry_path_len = -1; - int32_t ret = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - struct posix_fd * pfd = {0, }; - struct timeval tv[2] = {{0, }, {0, }}; - uint64_t tmp_pfd = 0; - char pathname[ZR_PATH_MAX] = {0,}; - dir_entry_t * trav = NULL; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - VALIDATE_OR_GOTO (entries, out); - - tv[0].tv_sec = tv[0].tv_usec = 0; - tv[1].tv_sec = tv[1].tv_usec = 0; - - ret = fd_ctx_get (fd, this, &tmp_pfd); - if (ret < 0) { - op_errno = -ret; - gf_log (this->name, GF_LOG_DEBUG, - "fd's ctx not found on fd=%p for %s", - fd, this->name); - goto out; - } - pfd = (struct posix_fd *)(long)tmp_pfd; - - real_path = pfd->path; - - if (!real_path) { - op_errno = EINVAL; - gf_log (this->name, GF_LOG_DEBUG, - "path is NULL on pfd=%p fd=%p", pfd, fd); - goto out; - } - - real_path_len = strlen (real_path); - entry_path_len = real_path_len + 256; - entry_path = CALLOC (1, entry_path_len); - - if (!entry_path) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, "Out of memory."); - goto out; - } - - strcpy (entry_path, real_path); - entry_path[real_path_len] = '/'; - - /* fd exists, and everything looks fine */ - /** - * create an entry for each one present in '@entries' - * - if flag is set (ie, if its namespace), create both directories - * and files - * - if not set, create only directories. 
- * - * after the entry is created, change the mode and ownership of the - * entry according to the stat present in entries->buf. - */ - - trav = entries->next; - while (trav) { - strcpy (pathname, entry_path); - strcat (pathname, trav->name); - - ret = create_entry (this, flags, trav, pathname); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - /* TODO: handle another flag, GF_SET_OVERWRITE */ - - /* Change the mode */ - if (!S_ISLNK (trav->buf.st_mode)) { - ret = chmod (pathname, trav->buf.st_mode); - if (ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "chmod on %s failed: %s", pathname, - strerror (op_errno)); - goto out; - } - } - - /* change the ownership */ - ret = lchown (pathname, trav->buf.st_uid, trav->buf.st_gid); - if (ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "chmod on %s failed: %s", pathname, - strerror (op_errno)); - goto out; - } - - if (flags & GF_SET_EPOCH_TIME) { - ret = utimes (pathname, tv); - if (ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "utimes on %s failed: %s", pathname, - strerror (op_errno)); - goto out; - } - } - - /* consider the next entry */ - trav = trav->next; - } - - op_ret = 0; - out: - STACK_UNWIND_STRICT (setdents, frame, op_ret, op_errno); - if (entry_path) - FREE (entry_path); - - return 0; -} - -int32_t -posix_fstat (call_frame_t *frame, xlator_t *this, - fd_t *fd) -{ - int _fd = -1; - int32_t op_ret = -1; - int32_t op_errno = 0; - struct stat buf = {0,}; - struct posix_fd *pfd = NULL; - uint64_t tmp_pfd = 0; - int ret = -1; - struct posix_private *priv = NULL; - - DECLARE_OLD_FS_ID_VAR; - SET_FS_ID (frame->root->uid, frame->root->gid); - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - ret = fd_ctx_get (fd, this, &tmp_pfd); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "pfd is NULL, fd=%p", fd); - op_errno = -ret; - goto 
out; - } - pfd = (struct posix_fd *)(long)tmp_pfd; - - _fd = pfd->fd; - - op_ret = posix_fstat_with_gen (this, _fd, &buf); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, "fstat failed on fd=%p: %s", - fd, strerror (op_errno)); - goto out; - } - - op_ret = 0; - -out: - SET_TO_OLD_FS_ID (); - - STACK_UNWIND_STRICT (fstat, frame, op_ret, op_errno, &buf); - return 0; -} - -static int gf_posix_lk_log; - -int32_t -posix_lk (call_frame_t *frame, xlator_t *this, - fd_t *fd, int32_t cmd, struct flock *lock) -{ - struct flock nullock = {0, }; - - gf_posix_lk_log++; - - GF_LOG_OCCASIONALLY (gf_posix_lk_log, this->name, GF_LOG_ERROR, - "\"features/locks\" translator is " - "not loaded. You need to use it for proper " - "functioning of your application."); - - STACK_UNWIND_STRICT (lk, frame, -1, ENOSYS, &nullock); - return 0; -} - -int32_t -posix_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int32_t cmd, struct flock *lock) -{ - gf_log (this->name, GF_LOG_CRITICAL, - "\"features/locks\" translator is not loaded. " - "You need to use it for proper functioning of GlusterFS"); - - STACK_UNWIND_STRICT (inodelk, frame, -1, ENOSYS); - return 0; -} - -int32_t -posix_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, struct flock *lock) -{ - gf_log (this->name, GF_LOG_CRITICAL, - "\"features/locks\" translator is not loaded. " - "You need to use it for proper functioning of GlusterFS"); - - STACK_UNWIND_STRICT (finodelk, frame, -1, ENOSYS); - return 0; -} - - -int32_t -posix_entrylk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type) -{ - gf_log (this->name, GF_LOG_CRITICAL, - "\"features/locks\" translator is not loaded. 
" - "You need to use it for proper functioning of GlusterFS"); - - STACK_UNWIND_STRICT (entrylk, frame, -1, ENOSYS); - return 0; -} - -int32_t -posix_fentrylk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, const char *basename, - entrylk_cmd cmd, entrylk_type type) -{ - gf_log (this->name, GF_LOG_CRITICAL, - "\"features/locks\" translator is not loaded. " - " You need to use it for proper functioning of GlusterFS"); - - STACK_UNWIND_STRICT (fentrylk, frame, -1, ENOSYS); - return 0; -} - - -int32_t -posix_do_readdir (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t off, int whichop) -{ - uint64_t tmp_pfd = 0; - struct posix_fd *pfd = NULL; - DIR *dir = NULL; - int ret = -1; - size_t filled = 0; - int count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - gf_dirent_t *this_entry = NULL; - gf_dirent_t entries; - struct dirent *entry = NULL; - off_t in_case = -1; - int32_t this_size = -1; - char *real_path = NULL; - int real_path_len = -1; - char *entry_path = NULL; - int entry_path_len = -1; - struct posix_private *priv = NULL; - struct stat stbuf = {0, }; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - - INIT_LIST_HEAD (&entries.list); - - priv = this->private; - - ret = fd_ctx_get (fd, this, &tmp_pfd); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "pfd is NULL, fd=%p", fd); - op_errno = -ret; - goto out; - } - pfd = (struct posix_fd *)(long)tmp_pfd; - if (!pfd->path) { - op_errno = EBADFD; - gf_log (this->name, GF_LOG_DEBUG, - "pfd does not have path set (possibly file " - "fd, fd=%p)", fd); - goto out; - } - - real_path = pfd->path; - real_path_len = strlen (real_path); - - entry_path_len = real_path_len + NAME_MAX; - entry_path = alloca (entry_path_len); - - if (!entry_path) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - strncpy (entry_path, real_path, entry_path_len); - entry_path[real_path_len] = '/'; - - 
dir = pfd->dir; - - if (!dir) { - gf_log (this->name, GF_LOG_DEBUG, - "dir is NULL for fd=%p", fd); - op_errno = EINVAL; - goto out; - } - - - if (!off) { - rewinddir (dir); - } else { - seekdir (dir, off); - } - - while (filled <= size) { - in_case = telldir (dir); - - if (in_case == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "telldir failed on dir=%p: %s", - dir, strerror (errno)); - goto out; - } - - errno = 0; - entry = readdir (dir); - - if (!entry) { - if (errno == EBADF) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "readdir failed on dir=%p: %s", - dir, strerror (op_errno)); - goto out; - } - break; - } - - this_size = dirent_size (entry); - - if (this_size + filled > size) { - seekdir (dir, in_case); - break; - } - - /* Device spanning requires that we have a stat buf for the - * file so we need to perform a stat on the two conditions - * below. - */ - if ((whichop == GF_FOP_READDIRP) || (priv->span_devices)) { - strcpy (entry_path + real_path_len + 1, entry->d_name); - op_ret = posix_lstat_with_gen (this, entry_path, &stbuf); - if (-1 == op_ret) - continue; - } else - stbuf.st_ino = entry->d_ino; - - /* So at this point stbuf ino is either: - * a. the original inode number got from entry, in case this - * was a readdir fop or if device spanning was disabled. - * - * b. the scaled inode number, if device spanning was enabled - * or this was a readdirp fop. 
- */ - entry->d_ino = stbuf.st_ino; - - this_entry = gf_dirent_for_name (entry->d_name); - - if (!this_entry) { - gf_log (this->name, GF_LOG_ERROR, - "could not create gf_dirent for entry %s: (%s)", - entry->d_name, strerror (errno)); - goto out; - } - this_entry->d_off = telldir (dir); - this_entry->d_ino = entry->d_ino; - this_entry->d_stat = stbuf; - - list_add_tail (&this_entry->list, &entries.list); - - filled += this_size; - count ++; - } - - op_ret = count; - - out: - STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, &entries); - - gf_dirent_free (&entries); - - return 0; -} - - -int32_t -posix_readdir (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t off) -{ - posix_do_readdir (frame, this, fd, size, off, GF_FOP_READDIR); - return 0; -} - - -int32_t -posix_readdirp (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t off) -{ - posix_do_readdir (frame, this, fd, size, off, GF_FOP_READDIRP); - return 0; -} - - -int32_t -posix_stats (call_frame_t *frame, xlator_t *this, - int32_t flags) - -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - - struct xlator_stats xlstats = {0, }; - struct xlator_stats * stats = NULL; - struct statvfs buf = {0,}; - struct timeval tv = {0,}; - struct posix_private * priv = (struct posix_private *)this->private; - - int64_t avg_read = 0; - int64_t avg_write = 0; - int64_t _time_ms = 0; - - DECLARE_OLD_FS_ID_VAR; - - SET_FS_ID (frame->root->uid, frame->root->gid); - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - - stats = &xlstats; - - op_ret = statvfs (priv->base_path, &buf); - - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, "statvfs failed: %s", - strerror (op_errno)); - goto out; - } - - /* client info is maintained at FSd */ - stats->nr_clients = priv->stats.nr_clients; - stats->nr_files = priv->stats.nr_files; - - /* number of free block in the filesystem. 
*/ - stats->free_disk = buf.f_bfree * buf.f_bsize; - - stats->total_disk_size = buf.f_blocks * buf.f_bsize; - stats->disk_usage = (buf.f_blocks - buf.f_bavail) * buf.f_bsize; - - /* Calculate read and write usage */ - op_ret = gettimeofday (&tv, NULL); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "gettimeofday failed: %s", strerror (errno)); - goto out; - } - - LOCK (&priv->lock); - { - /* Read */ - _time_ms = (tv.tv_sec - priv->init_time.tv_sec) * 1000 + - ((tv.tv_usec - priv->init_time.tv_usec) / 1000); - - avg_read = (_time_ms) ? (priv->read_value / _time_ms) : 0; /* KBps */ - avg_write = (_time_ms) ? (priv->write_value / _time_ms) : 0; /* KBps */ - - _time_ms = (tv.tv_sec - priv->prev_fetch_time.tv_sec) * 1000 + - ((tv.tv_usec - priv->prev_fetch_time.tv_usec) / 1000); - - if (_time_ms && ((priv->interval_read / _time_ms) > priv->max_read)) { - priv->max_read = (priv->interval_read / _time_ms); - } - - if (_time_ms && - ((priv->interval_write / _time_ms) > priv->max_write)) { - priv->max_write = priv->interval_write / _time_ms; - } - - stats->read_usage = avg_read / priv->max_read; - stats->write_usage = avg_write / priv->max_write; - } - UNLOCK (&priv->lock); - - op_ret = gettimeofday (&(priv->prev_fetch_time), NULL); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, "gettimeofday failed: %s", - strerror (op_errno)); - goto out; - } - - priv->interval_read = 0; - priv->interval_write = 0; - - op_ret = 0; - - out: - SET_TO_OLD_FS_ID (); - - STACK_UNWIND (frame, op_ret, op_errno, stats); - return 0; -} - -int32_t -posix_checksum (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flag) -{ - char * real_path = NULL; - DIR * dir = NULL; - struct dirent * dirent = NULL; - uint8_t file_checksum[NAME_MAX] = {0,}; - uint8_t dir_checksum[NAME_MAX] = {0,}; - int32_t op_ret = -1; - int32_t op_errno = 0; - int i = 0; - int length = 0; - - struct stat buf = {0,}; - char tmp_real_path[ZR_PATH_MAX] = {0,}; 
- int ret = -1; - - MAKE_REAL_PATH (real_path, this, loc->path); - - dir = opendir (real_path); - - if (!dir){ - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "opendir() failed on `%s': %s", - real_path, strerror (op_errno)); - goto out; - } - - while ((dirent = readdir (dir))) { - errno = 0; - if (!dirent) { - if (errno != 0) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "readdir() failed on dir=%p: %s", - dir, strerror (errno)); - goto out; - } - break; - } - - length = strlen (dirent->d_name); - - strcpy (tmp_real_path, real_path); - strcat (tmp_real_path, "/"); - strcat (tmp_real_path, dirent->d_name); - ret = posix_lstat_with_gen (this, tmp_real_path, &buf); - - if (ret == -1) - continue; - - if (S_ISDIR (buf.st_mode)) { - for (i = 0; i < length; i++) - dir_checksum[i] ^= dirent->d_name[i]; - } else { - for (i = 0; i < length; i++) - file_checksum[i] ^= dirent->d_name[i]; - } - } - closedir (dir); - - op_ret = 0; - - out: - STACK_UNWIND_STRICT (checksum, frame, op_ret, op_errno, - file_checksum, dir_checksum); - - return 0; -} - -int32_t -posix_priv (xlator_t *this) -{ - struct posix_private *priv = NULL; - char key_prefix[GF_DUMP_MAX_BUF_LEN]; - char key[GF_DUMP_MAX_BUF_LEN]; - - snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, - this->name); - gf_proc_dump_add_section(key_prefix); - - if (!this) - return 0; - - priv = this->private; - - if (!priv) - return 0; - - gf_proc_dump_build_key(key, key_prefix, "base_path"); - gf_proc_dump_write(key,"%s", priv->base_path); - gf_proc_dump_build_key(key, key_prefix, "base_path_length"); - gf_proc_dump_write(key,"%d", priv->base_path_length); - gf_proc_dump_build_key(key, key_prefix, "max_read"); - gf_proc_dump_write(key,"%d", priv->max_read); - gf_proc_dump_build_key(key, key_prefix, "max_write"); - gf_proc_dump_write(key,"%d", priv->max_write); - gf_proc_dump_build_key(key, key_prefix, "stats.nr_files"); - gf_proc_dump_write(key,"%ld", priv->stats.nr_files); - - return 0; -} - 
-int32_t -posix_inode (xlator_t *this) -{ - return 0; -} - - -int32_t -posix_rchecksum (call_frame_t *frame, xlator_t *this, - fd_t *fd, off_t offset, int32_t len) -{ - char *buf = NULL; - - int _fd = -1; - uint64_t tmp_pfd = 0; - - struct posix_fd *pfd = NULL; - - int op_ret = -1; - int op_errno = 0; - - int ret = 0; - - int32_t weak_checksum = 0; - uint8_t strong_checksum[MD5_DIGEST_LEN]; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - - memset (strong_checksum, 0, MD5_DIGEST_LEN); - - buf = CALLOC (1, len); - if (!buf) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - goto out; - } - - ret = fd_ctx_get (fd, this, &tmp_pfd); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "pfd is NULL, fd=%p", fd); - op_errno = -ret; - goto out; - } - pfd = (struct posix_fd *)(long) tmp_pfd; - - _fd = pfd->fd; - - ret = pread (_fd, buf, len, offset); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "pread of %d bytes returned %d (%s)", - len, ret, strerror (errno)); - - op_errno = errno; - goto out; - } - - weak_checksum = gf_rsync_weak_checksum (buf, len); - gf_rsync_strong_checksum (buf, len, strong_checksum); - - FREE (buf); - - op_ret = 0; -out: - STACK_UNWIND_STRICT (rchecksum, frame, op_ret, op_errno, - weak_checksum, strong_checksum); - return 0; -} - +#include <glusterfs/xlator.h> +#include "posix.h" -/** - * notify - when parent sends PARENT_UP, send CHILD_UP event from here - */ int32_t -notify (xlator_t *this, - int32_t event, - void *data, - ...) 
-{ - switch (event) - { - case GF_EVENT_PARENT_UP: - { - /* Tell the parent that posix xlator is up */ - default_notify (this, GF_EVENT_CHILD_UP, data); - } - break; - default: - /* */ - break; - } - return 0; -} - -/** - * init - - */ -int -init (xlator_t *this) -{ - int ret = 0; - int op_ret = -1; - gf_boolean_t tmp_bool = 0; - struct stat buf = {0,}; - struct posix_private * _private = NULL; - data_t * dir_data = NULL; - data_t * tmp_data = NULL; - uint64_t time64 = 0; - - int dict_ret = 0; - int32_t janitor_sleep; - - dir_data = dict_get (this->options, "directory"); - - if (this->children) { - gf_log (this->name, GF_LOG_CRITICAL, - "FATAL: storage/posix cannot have subvolumes"); - ret = -1; - goto out; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "Volume is dangling. Please check the volume file."); - } - - if (!dir_data) { - gf_log (this->name, GF_LOG_CRITICAL, - "Export directory not specified in volume file."); - ret = -1; - goto out; - } +mem_acct_init(xlator_t *this); - umask (000); // umask `masking' is done at the client side - - /* Check whether the specified directory exists, if not log it. 
*/ - op_ret = stat (dir_data->data, &buf); - if ((op_ret != 0) || !S_ISDIR (buf.st_mode)) { - gf_log (this->name, GF_LOG_ERROR, - "Directory '%s' doesn't exist, exiting.", - dir_data->data); - ret = -1; - goto out; - } - - - /* Check for Extended attribute support, if not present, log it */ - op_ret = sys_lsetxattr (dir_data->data, - "trusted.glusterfs.test", "working", 8, 0); - if (op_ret < 0) { - tmp_data = dict_get (this->options, - "mandate-attribute"); - if (tmp_data) { - if (gf_string2boolean (tmp_data->data, - &tmp_bool) == -1) { - gf_log (this->name, GF_LOG_ERROR, - "wrong option provided for key " - "\"mandate-attribute\""); - ret = -1; - goto out; - } - if (!tmp_bool) { - gf_log (this->name, GF_LOG_WARNING, - "Extended attribute not supported, " - "starting as per option"); - } else { - gf_log (this->name, GF_LOG_CRITICAL, - "Extended attribute not supported, " - "exiting."); - ret = -1; - goto out; - } - } else { - gf_log (this->name, GF_LOG_CRITICAL, - "Extended attribute not supported, exiting."); - ret = -1; - goto out; - } - } - - _private = CALLOC (1, sizeof (*_private)); - if (!_private) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - ret = -1; - goto out; - } - - _private->base_path = strdup (dir_data->data); - _private->base_path_length = strlen (_private->base_path); - - _private->trash_path = CALLOC (1, _private->base_path_length - + strlen ("/") - + strlen (GF_REPLICATE_TRASH_DIR) - + 1); - - if (!_private->trash_path) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - ret = -1; - goto out; - } - - strncpy (_private->trash_path, _private->base_path, _private->base_path_length); - strcat (_private->trash_path, "/" GF_REPLICATE_TRASH_DIR); - - LOCK_INIT (&_private->lock); - - ret = gethostname (_private->hostname, 256); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "could not find hostname (%s)", strerror (errno)); - } - - { - /* Stats related variables */ - gettimeofday (&_private->init_time, NULL); - 
gettimeofday (&_private->prev_fetch_time, NULL); - _private->max_read = 1; - _private->max_write = 1; - } - - _private->export_statfs = 1; - tmp_data = dict_get (this->options, "export-statfs-size"); - if (tmp_data) { - if (gf_string2boolean (tmp_data->data, - &_private->export_statfs) == -1) { - ret = -1; - gf_log (this->name, GF_LOG_ERROR, - "'export-statfs-size' takes only boolean " - "options"); - goto out; - } - if (!_private->export_statfs) - gf_log (this->name, GF_LOG_DEBUG, - "'statfs()' returns dummy size"); - } - - _private->background_unlink = 0; - tmp_data = dict_get (this->options, "background-unlink"); - if (tmp_data) { - if (gf_string2boolean (tmp_data->data, - &_private->background_unlink) == -1) { - ret = -1; - gf_log (this->name, GF_LOG_ERROR, - "'background-unlink' takes only boolean " - "options"); - goto out; - } - - if (_private->background_unlink) - gf_log (this->name, GF_LOG_DEBUG, - "unlinks will be performed in background"); - } - - tmp_data = dict_get (this->options, "o-direct"); - if (tmp_data) { - if (gf_string2boolean (tmp_data->data, - &_private->o_direct) == -1) { - ret = -1; - gf_log (this->name, GF_LOG_ERROR, - "wrong option provided for 'o-direct'"); - goto out; - } - if (_private->o_direct) - gf_log (this->name, GF_LOG_DEBUG, - "o-direct mode is enabled (O_DIRECT " - "for every open)"); - } - - _private->num_devices_to_span = 1; - - tmp_data = dict_get (this->options, "span-devices"); - if (tmp_data) { - if (gf_string2int32 (tmp_data->data, - &_private->num_devices_to_span) == -1) { - ret = -1; - gf_log (this->name, GF_LOG_ERROR, - "wrong option provided for 'span-devices'"); - goto out; - } - if (_private->num_devices_to_span > 1) { - gf_log (this->name, GF_LOG_NORMAL, - "spanning enabled accross %d mounts", - _private->num_devices_to_span); - _private->span_devices = 1; - } - if (_private->num_devices_to_span < 1) - _private->num_devices_to_span = 1; - } - _private->st_device = CALLOC (1, (sizeof (dev_t) * - 
_private->num_devices_to_span)); - - /* Start with the base */ - _private->st_device[0] = buf.st_dev; - - _private->janitor_sleep_duration = 600; - - dict_ret = dict_get_int32 (this->options, "janitor-sleep-duration", - &janitor_sleep); - if (dict_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Setting janitor sleep duration to %d.", - janitor_sleep); - - _private->janitor_sleep_duration = janitor_sleep; - } - - LOCK_INIT (&_private->gen_lock); - time64 = time (NULL); - _private->gen_seq = (time64 << 32); - -#ifndef GF_DARWIN_HOST_OS - { - struct rlimit lim; - lim.rlim_cur = 1048576; - lim.rlim_max = 1048576; - - if (setrlimit (RLIMIT_NOFILE, &lim) == -1) { - gf_log (this->name, GF_LOG_WARNING, - "Failed to set 'ulimit -n " - " 1048576': %s", strerror(errno)); - lim.rlim_cur = 65536; - lim.rlim_max = 65536; - - if (setrlimit (RLIMIT_NOFILE, &lim) == -1) { - gf_log (this->name, GF_LOG_WARNING, - "Failed to set maximum allowed open " - "file descriptors to 64k: %s", - strerror(errno)); - } - else { - gf_log (this->name, GF_LOG_NORMAL, - "Maximum allowed open file descriptors " - "set to 65536"); - } - } - } -#endif - this->private = (void *)_private; - - pthread_mutex_init (&_private->janitor_lock, NULL); - pthread_cond_init (&_private->janitor_cond, NULL); - INIT_LIST_HEAD (&_private->janitor_fds); - - posix_spawn_janitor_thread (this); - out: - return ret; -} - -void -fini (xlator_t *this) -{ - struct posix_private *priv = this->private; - sys_lremovexattr (priv->base_path, "trusted.glusterfs.test"); - FREE (priv); - return; -} +extern struct volume_options posix_options[]; struct xlator_dumpops dumpops = { - .priv = posix_priv, - .inode = posix_inode, -}; - -struct xlator_mops mops = { - .stats = posix_stats, + .priv = posix_priv, + .inode = posix_inode, }; struct xlator_fops fops = { - .lookup = posix_lookup, - .stat = posix_stat, - .opendir = posix_opendir, - .readdir = posix_readdir, - .readdirp = posix_readdirp, - .readlink = posix_readlink, - .mknod = 
posix_mknod, - .mkdir = posix_mkdir, - .unlink = posix_unlink, - .rmdir = posix_rmdir, - .symlink = posix_symlink, - .rename = posix_rename, - .link = posix_link, - .truncate = posix_truncate, - .create = posix_create, - .open = posix_open, - .readv = posix_readv, - .writev = posix_writev, - .statfs = posix_statfs, - .flush = posix_flush, - .fsync = posix_fsync, - .setxattr = posix_setxattr, - .fsetxattr = posix_fsetxattr, - .getxattr = posix_getxattr, - .fgetxattr = posix_fgetxattr, - .removexattr = posix_removexattr, - .fsyncdir = posix_fsyncdir, - .access = posix_access, - .ftruncate = posix_ftruncate, - .fstat = posix_fstat, - .lk = posix_lk, - .inodelk = posix_inodelk, - .finodelk = posix_finodelk, - .entrylk = posix_entrylk, - .fentrylk = posix_fentrylk, - .setdents = posix_setdents, - .getdents = posix_getdents, - .checksum = posix_checksum, - .rchecksum = posix_rchecksum, - .xattrop = posix_xattrop, - .fxattrop = posix_fxattrop, - .setattr = posix_setattr, - .fsetattr = posix_fsetattr, + .lookup = posix_lookup, + .stat = posix_stat, + .opendir = posix_opendir, + .readdir = posix_readdir, + .readdirp = posix_readdirp, + .readlink = posix_readlink, + .mknod = posix_mknod, + .mkdir = posix_mkdir, + .unlink = posix_unlink, + .rmdir = posix_rmdir, + .symlink = posix_symlink, + .rename = posix_rename, + .link = posix_link, + .truncate = posix_truncate, + .create = posix_create, + .open = posix_open, + .readv = posix_readv, + .writev = posix_writev, + .statfs = posix_statfs, + .flush = posix_flush, + .fsync = posix_fsync, + .setxattr = posix_setxattr, + .fsetxattr = posix_fsetxattr, + .getxattr = posix_getxattr, + .fgetxattr = posix_fgetxattr, + .removexattr = posix_removexattr, + .fremovexattr = posix_fremovexattr, + .fsyncdir = posix_fsyncdir, + .access = posix_access, + .ftruncate = posix_ftruncate, + .fstat = posix_fstat, + .lk = posix_lk, + .inodelk = posix_inodelk, + .finodelk = posix_finodelk, + .entrylk = posix_entrylk, + .fentrylk = posix_fentrylk, + 
.rchecksum = posix_rchecksum, + .xattrop = posix_xattrop, + .fxattrop = posix_fxattrop, + .setattr = posix_setattr, + .fsetattr = posix_fsetattr, + .fallocate = posix_glfallocate, + .discard = posix_discard, + .zerofill = posix_zerofill, + .ipc = posix_ipc, + .seek = posix_seek, + .lease = posix_lease, + .put = posix_put, + .copy_file_range = posix_copy_file_range, }; struct xlator_cbks cbks = { - .release = posix_release, - .releasedir = posix_releasedir, - .forget = posix_forget + .release = posix_release, + .releasedir = posix_releasedir, + .forget = posix_forget, }; -struct volume_options options[] = { - { .key = {"o-direct"}, - .type = GF_OPTION_TYPE_BOOL }, - { .key = {"directory"}, - .type = GF_OPTION_TYPE_PATH }, - { .key = {"export-statfs-size"}, - .type = GF_OPTION_TYPE_BOOL }, - { .key = {"mandate-attribute"}, - .type = GF_OPTION_TYPE_BOOL }, - { .key = {"span-devices"}, - .type = GF_OPTION_TYPE_INT }, - { .key = {"background-unlink"}, - .type = GF_OPTION_TYPE_BOOL }, - { .key = {"janitor-sleep-duration"}, - .type = GF_OPTION_TYPE_INT }, - { .key = {NULL} } +xlator_api_t xlator_api = { + .init = posix_init, + .fini = posix_fini, + .notify = posix_notify, + .reconfigure = posix_reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .dumpops = &dumpops, + .fops = &fops, + .cbks = &cbks, + .options = posix_options, + .identifier = "posix", + .category = GF_MAINTAINED, }; diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h index f92e256fbc0..b8db146eef2 100644 --- a/xlators/storage/posix/src/posix.h +++ b/xlators/storage/posix/src/posix.h @@ -1,42 +1,23 @@ /* - Copyright (c) 2006-2009 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. 
- GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ - #ifndef _POSIX_H #define _POSIX_H -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - #include <stdio.h> #include <unistd.h> #include <sys/types.h> #include <dirent.h> #include <time.h> -#ifdef linux -#ifdef __GLIBC__ +#ifdef HAVE_SET_FSID #include <sys/fsuid.h> -#else -#include <unistd.h> -#endif #endif #ifdef HAVE_SYS_XATTR_H @@ -47,96 +28,646 @@ #include <sys/extattr.h> #endif -#include "xlator.h" -#include "inode.h" -#include "compat.h" +#include <glusterfs/compat.h> +#include <glusterfs/timer.h> +#include "posix-mem-types.h" +#include <glusterfs/call-stub.h> + +#ifdef HAVE_LIBAIO +#include <libaio.h> +#include "posix-aio.h" +#endif + +#define VECTOR_SIZE 64 * 1024 /* vector size 64KB*/ +#define MAX_NO_VECT 1024 + +#define XATTR_KEY_BUF_SIZE 4096 +#define XATTR_VAL_BUF_SIZE 8192 + +#define ACL_BUFFER_MAX 4096 /* size of character buffer */ + +#define DHT_LINKTO "trusted.glusterfs.dht.linkto" + +#define POSIX_GFID_HANDLE_SIZE(base_path_len) \ + (base_path_len + SLEN("/") + SLEN(GF_HIDDEN_PATH) + SLEN("/") + \ + SLEN("00/") + SLEN("00/") + SLEN(UUID0_STR) + 1) /* '\0' 
*/; + +#define POSIX_GFID_HANDLE_RELSIZE \ + SLEN("../") + SLEN("../") + SLEN("00/") + SLEN("00/") + SLEN(UUID0_STR) + 1; + +#define GF_UNLINK_TRUE 0x0000000000000001 +#define GF_UNLINK_FALSE 0x0000000000000000 + +#define DISK_SPACE_CHECK_AND_GOTO(frame, priv, xdata, op_ret, op_errno, out) \ + do { \ + if (frame->root->pid >= 0 && priv->disk_space_full && \ + !dict_get_sizen(xdata, GLUSTERFS_INTERNAL_FOP_KEY)) { \ + op_ret = -1; \ + op_errno = ENOSPC; \ + gf_msg_debug("posix", ENOSPC, \ + "disk space utilization reached limits" \ + " for path %s ", \ + priv->base_path); \ + goto out; \ + } \ + } while (0) + +/* Setting microseconds or nanoseconds depending on what's supported: + The passed in `tv` can be + struct timespec + if supported (better, because it supports nanosecond resolution) or + struct timeval + otherwise. */ +#if HAVE_UTIMENSAT +#define SET_TIMESPEC_NSEC_OR_TIMEVAL_USEC(tv, nanosecs) tv.tv_nsec = nanosecs +#define PATH_SET_TIMESPEC_OR_TIMEVAL(path, tv) \ + (sys_utimensat(AT_FDCWD, path, tv, AT_SYMLINK_NOFOLLOW)) +#else +#define SET_TIMESPEC_NSEC_OR_TIMEVAL_USEC(tv, nanosecs) \ + tv.tv_usec = nanosecs / 1000 +#define PATH_SET_TIMESPEC_OR_TIMEVAL(path, tv) (lutimes(path, tv)) +#endif + +#define GFID_NULL_CHECK_AND_GOTO(frame, this, loc, xattr_req, op_ret, \ + op_errno, _uuid_req, out) \ + do { \ + int _ret = 0; \ + /* TODO: Remove pid check once trash implements client side \ + * logic to assign gfid for entry creations inside .trashcan \ + */ \ + if (frame->root->pid == GF_SERVER_PID_TRASH) \ + break; \ + _ret = dict_get_gfuuid(xattr_req, "gfid-req", &_uuid_req); \ + if (_ret) { \ + gf_msg(this->name, GF_LOG_ERROR, EINVAL, P_MSG_NULL_GFID, \ + "failed to get the gfid from dict for %s", loc->path); \ + op_ret = -1; \ + op_errno = EINVAL; \ + goto out; \ + } \ + if (gf_uuid_is_null(_uuid_req)) { \ + gf_msg(this->name, GF_LOG_ERROR, EINVAL, P_MSG_NULL_GFID, \ + "gfid is null for %s", loc->path); \ + op_ret = -1; \ + op_errno = EINVAL; \ + goto out; \ + } 
\ + } while (0) /** * posix_fd - internal structure common to file and directory fd's */ struct posix_fd { - int fd; /* fd returned by the kernel */ - int32_t flags; /* flags for open/creat */ - char * path; /* used by setdents/getdents */ - DIR * dir; /* handle returned by the kernel */ - int flushwrites; - struct list_head list; /* to add to the janitor list */ + int fd; /* fd returned by the kernel */ + int32_t flags; /* flags for open/creat */ + DIR *dir; /* handle returned by the kernel */ + off_t dir_eof; /* offset at dir EOF */ + struct list_head list; /* to add to the janitor list */ + int odirect; + xlator_t *xl; + char _pad[4]; /* manual padding */ }; - struct posix_private { - char *base_path; - int32_t base_path_length; - - gf_lock_t lock; - - char hostname[256]; - /* Statistics, provides activity of the server */ - struct xlator_stats stats; - - struct timeval prev_fetch_time; - struct timeval init_time; - - time_t last_landfill_check; - int32_t janitor_sleep_duration; - struct list_head janitor_fds; - pthread_cond_t janitor_cond; - pthread_mutex_t janitor_lock; - - int32_t max_read; /* */ - int32_t max_write; /* */ - int64_t interval_read; /* Used to calculate the max_read value */ - int64_t interval_write; /* Used to calculate the max_write value */ - int64_t read_value; /* Total read, from init */ - int64_t write_value; /* Total write, from init */ + char *base_path; + int32_t base_path_length; + int32_t path_max; -/* - In some cases, two exported volumes may reside on the same - partition on the server. Sending statvfs info for both - the volumes will lead to erroneous df output at the client, - since free space on the partition will be counted twice. + gf_lock_t lock; - In such cases, user can disable exporting statvfs info - on one of the volumes by setting this option. 
-*/ - gf_boolean_t export_statfs; + char *hostname; - gf_boolean_t o_direct; /* always open files in O_DIRECT mode */ + time_t last_landfill_check; - gf_boolean_t span_devices; + gf_atomic_t read_value; /* Total read, from init */ + gf_atomic_t write_value; /* Total write, from init */ -/* - decide whether posix_unlink does open (file), unlink (file), close (fd) - instead of just unlink (file). with the former approach there is no lockout - of access to parent directory during removal of very large files for the - entire duration of freeing of data blocks. -*/ - gf_boolean_t background_unlink; + /* janitor task which cleans up /.trash (created by replicate) */ + struct gf_tw_timer_list *janitor; - int num_devices_to_span; - dev_t *st_device; + char *trash_path; + /* lock for brick dir */ + int mount_lock; -/* a global generation number sequence is used to assign generation numbers in - sequence. -*/ - uint64_t gen_seq; - gf_lock_t gen_lock; + struct stat handledir; + + /* uuid of glusterd that swapned the brick process */ + uuid_t glusterd_uuid; + +#ifdef HAVE_LIBAIO + io_context_t ctxp; + pthread_t aiothread; +#endif + + pthread_t fsyncer; + struct list_head fsyncs; + pthread_mutex_t fsync_mutex; + pthread_cond_t fsync_cond; + pthread_mutex_t janitor_mutex; + pthread_cond_t janitor_cond; + pthread_cond_t fd_cond; + int fsync_queue_count; + int32_t janitor_sleep_duration; + + enum { + BATCH_NONE = 0, + BATCH_SYNCFS, + BATCH_SYNCFS_SINGLE_FSYNC, + BATCH_REVERSE_FSYNC, + BATCH_SYNCFS_REVERSE_FSYNC + } batch_fsync_mode; + + uint32_t batch_fsync_delay_usec; + char gfid2path_sep[8]; + + /* seconds to sleep between health checks */ + uint32_t health_check_interval; + /* seconds to sleep to wait for aio write finish for health checks */ + uint32_t health_check_timeout; + pthread_t health_check; + + double disk_reserve; + pthread_t disk_space_check; + uint32_t disk_space_full; -/* janitor thread which cleans up /.trash (created by replicate) */ - pthread_t janitor; - 
gf_boolean_t janitor_present; - char * trash_path; +#ifdef GF_DARWIN_HOST_OS + enum { + XATTR_NONE = 0, + XATTR_STRIP, + XATTR_APPEND, + XATTR_BOTH, + } xattr_user_namespace; +#endif + + /* Option to handle the cases of multiple bricks exported from + same backend. Very much usable in brick-splitting feature. */ + int32_t shared_brick_count; + + /*Option to set mode bit permission that will always be set on + file/directory. */ + mode_t force_create_mode; + mode_t force_directory_mode; + mode_t create_mask; + mode_t create_directory_mask; + uint32_t max_hardlinks; + int32_t arrdfd[256]; + int dirfd; + + /* This option is used for either to call a landfill_purge or not */ + gf_boolean_t disable_landfill_purge; + + gf_boolean_t fips_mode_rchecksum; + gf_boolean_t ctime; + gf_boolean_t janitor_task_stop; + + gf_boolean_t disk_space_check_active; + char disk_unit; + gf_boolean_t health_check_active; + gf_boolean_t update_pgfid_nlinks; + gf_boolean_t gfid2path; + /* node-uuid in pathinfo xattr */ + gf_boolean_t node_uuid_pathinfo; + /* + In some cases, two exported volumes may reside on the same + partition on the server. Sending statvfs info for both + the volumes will lead to erroneous df output at the client, + since free space on the partition will be counted twice. + + In such cases, user can disable exporting statvfs info + on one of the volumes by setting this option. + */ + gf_boolean_t export_statfs; + + gf_boolean_t o_direct; /* always open files in O_DIRECT mode */ + + /* + decide whether posix_unlink does open (file), unlink (file), close (fd) + instead of just unlink (file). with the former approach there is no + lockout of access to parent directory during removal of very large files + for the entire duration of freeing of data blocks. 
+ */ + gf_boolean_t background_unlink; + gf_boolean_t aio_configured; + gf_boolean_t aio_init_done; + gf_boolean_t aio_capable; + uint32_t rel_fdcount; }; -#define POSIX_BASE_PATH(this) (((struct posix_private *)this->private)->base_path) +typedef struct { + call_frame_t *frame; + xlator_t *this; + const char *real_path; + dict_t *xattr; + struct iatt *stbuf; + loc_t *loc; + inode_t *inode; /* for all do_xattrop() key handling */ + fd_t *fd; + int fdnum; + int flags; + char *list; + size_t list_size; + int32_t op_errno; + + char _pad[4]; /* manual padding */ +} posix_xattr_filler_t; + +typedef struct { + uint64_t unlink_flag; + pthread_mutex_t xattrop_lock; + pthread_mutex_t write_atomic_lock; + pthread_mutex_t pgfid_lock; +} posix_inode_ctx_t; + +#define POSIX_BASE_PATH(this) \ + (((struct posix_private *)this->private)->base_path) + +#define POSIX_BASE_PATH_LEN(this) \ + (((struct posix_private *)this->private)->base_path_length) + +#define POSIX_PATH_MAX(this) (((struct posix_private *)this->private)->path_max) + +#define POSIX_GET_FILE_UNLINK_PATH(base_path, gfid, unlink_path) \ + do { \ + int path_len = 0; \ + char gfid_str[64] = {0}; \ + uuid_utoa_r(gfid, gfid_str); \ + path_len = strlen(base_path) + 1 + SLEN(GF_UNLINK_PATH) + 1 + \ + UUID_CANONICAL_FORM_LEN + 1; \ + unlink_path = alloca(path_len); \ + if (!unlink_path) { \ + gf_msg("posix", GF_LOG_ERROR, ENOMEM, P_MSG_UNLINK_FAILED, \ + "Failed to get unlink_path"); \ + break; \ + } \ + sprintf(unlink_path, "%s/%s/%s", base_path, GF_UNLINK_PATH, gfid_str); \ + } while (0) + +/* Helper functions */ +int +posix_inode_ctx_set_unlink_flag(inode_t *inode, xlator_t *this, uint64_t ctx); + +int +posix_inode_ctx_get_all(inode_t *inode, xlator_t *this, + posix_inode_ctx_t **ctx); + +int +__posix_inode_ctx_set_unlink_flag(inode_t *inode, xlator_t *this, uint64_t ctx); + +int +__posix_inode_ctx_get_all(inode_t *inode, xlator_t *this, + posix_inode_ctx_t **ctx); + +int +posix_gfid_set(xlator_t *this, const char *path, 
loc_t *loc, dict_t *xattr_req, + pid_t pid, int *op_errno); +int +posix_fdstat(xlator_t *this, inode_t *inode, int fd, struct iatt *stbuf_p); +int +posix_istat(xlator_t *this, inode_t *inode, uuid_t gfid, const char *basename, + struct iatt *iatt); +int +posix_pstat(xlator_t *this, inode_t *inode, uuid_t gfid, const char *real_path, + struct iatt *iatt, gf_boolean_t inode_locked); + +dict_t * +posix_xattr_fill(xlator_t *this, const char *path, loc_t *loc, fd_t *fd, + int fdnum, dict_t *xattr, struct iatt *buf); +int +posix_handle_pair(xlator_t *this, loc_t *loc, const char *real_path, char *key, + data_t *value, int flags, struct iatt *stbuf); +int +posix_fhandle_pair(call_frame_t *frame, xlator_t *this, int fd, char *key, + data_t *value, int flags, struct iatt *stbuf, fd_t *_fd); +void +posix_janitor_timer_start(xlator_t *this); +int +posix_acl_xattr_set(xlator_t *this, const char *path, dict_t *xattr_req); +int +posix_gfid_heal(xlator_t *this, const char *path, loc_t *loc, + dict_t *xattr_req); +int +posix_entry_create_xattr_set(xlator_t *this, loc_t *loc, const char *path, + dict_t *dict); + +int +posix_fd_ctx_get(fd_t *fd, xlator_t *this, struct posix_fd **pfd, + int *op_errno); +void +posix_fill_ino_from_gfid(xlator_t *this, struct iatt *buf); + +gf_boolean_t +posix_special_xattr(char **pattern, char *key); + +void +__posix_fd_set_odirect(fd_t *fd, struct posix_fd *pfd, int opflags, + off_t offset, size_t size); +int +posix_spawn_health_check_thread(xlator_t *this); + +int +posix_spawn_disk_space_check_thread(xlator_t *this); + +void * +posix_fsyncer(void *); +int +posix_get_ancestry(xlator_t *this, inode_t *leaf_inode, gf_dirent_t *head, + char **path, int type, int32_t *op_errno, dict_t *xdata); +int +posix_handle_mdata_xattr(call_frame_t *frame, const char *name, int *op_errno); +int +posix_handle_georep_xattrs(call_frame_t *, const char *, int *, gf_boolean_t); +int32_t +posix_resolve_dirgfid_to_path(const uuid_t dirgfid, const char *brick_path, + const 
char *bname, char **path); +void +posix_gfid_unset(xlator_t *this, dict_t *xdata); + +int +posix_pacl_get(const char *path, int fdnum, const char *key, char **acl_s); + +int32_t +posix_get_objectsignature(char *, dict_t *); + +int32_t +posix_fdget_objectsignature(int, dict_t *); + +gf_boolean_t +posix_is_bulk_removexattr(char *name, dict_t *dict); + +int32_t +posix_set_iatt_in_dict(dict_t *, struct iatt *, struct iatt *); + +mode_t posix_override_umask(mode_t, mode_t); + +int32_t +posix_priv(xlator_t *this); + +int32_t +posix_inode(xlator_t *this); + +void +posix_fini(xlator_t *this); + +int +posix_init(xlator_t *this); + +int +posix_reconfigure(xlator_t *this, dict_t *options); + +int32_t +posix_notify(xlator_t *this, int32_t event, void *data, ...); + +/* posix-entry-ops.c FOP signatures */ +int32_t +posix_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata); + +int +posix_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata); + +int +posix_symlink(call_frame_t *frame, xlator_t *this, const char *linkname, + loc_t *loc, mode_t umask, dict_t *xdata); + +int +posix_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata); + +int +posix_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata); + +int +posix_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t dev, mode_t umask, dict_t *xdata); + +int +posix_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata); + +int32_t +posix_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata); + +int +posix_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata); + +/* posix-inode-fs-ops.c FOP signatures */ +int +posix_forget(xlator_t *this, inode_t *inode); + +int32_t +posix_discover(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata); + 
+int32_t +posix_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata); + +int +posix_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata); + +int +posix_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iatt *stbuf, int32_t valid, dict_t *xdata); + +int32_t +posix_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata); + +int32_t +posix_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata); + +int32_t +posix_glfallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t keep_size, off_t offset, size_t len, dict_t *xdata); + +int32_t +posix_ipc(call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata); + +int32_t +posix_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + gf_seek_what_t what, dict_t *xdata); + +int32_t +posix_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata); + +int32_t +posix_releasedir(xlator_t *this, fd_t *fd); + +int32_t +posix_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, + dict_t *xdata); + +int32_t +posix_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata); + +int32_t +posix_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata); + +int +posix_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata); + +int32_t +posix_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, uint32_t flags, + struct iobref *iobref, dict_t *xdata); + +int32_t +posix_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata); + +int32_t +posix_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata); + +int32_t +posix_release(xlator_t *this, fd_t *fd); + +int32_t +posix_fsync(call_frame_t *frame, xlator_t 
*this, fd_t *fd, int32_t datasync, + dict_t *xdata); + +int32_t +posix_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int flags, dict_t *xdata); + +int +posix_get_ancestry_non_directory(xlator_t *this, inode_t *leaf_inode, + gf_dirent_t *head, char **path, int type, + int32_t *op_errno, dict_t *xdata); + +int +posix_get_ancestry(xlator_t *this, inode_t *leaf_inode, gf_dirent_t *head, + char **path, int type, int32_t *op_errno, dict_t *xdata); + +int32_t +posix_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata); + +int32_t +posix_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata); + +int32_t +posix_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, + int flags, dict_t *xdata); + +int32_t +posix_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata); + +int32_t +posix_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata); + +int32_t +posix_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync, + dict_t *xdata); + +int +posix_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata); + +int +posix_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata); + +int +posix_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, + dict_t *xdata); + +int32_t +posix_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata); + +int32_t +posix_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata); + +int32_t +posix_lease(call_frame_t *frame, xlator_t *this, loc_t *loc, + struct gf_lease *lease, dict_t *xdata); + +int32_t +posix_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, + struct gf_flock *lock, dict_t *xdata); + +int32_t +posix_inodelk(call_frame_t *frame, xlator_t 
*this, const char *volume, + loc_t *loc, int32_t cmd, struct gf_flock *lock, dict_t *xdata); + +int32_t +posix_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, + fd_t *fd, int32_t cmd, struct gf_flock *lock, dict_t *xdata); + +int32_t +posix_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, + loc_t *loc, const char *basename, entrylk_cmd cmd, + entrylk_type type, dict_t *xdata); + +int32_t +posix_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, + fd_t *fd, const char *basename, entrylk_cmd cmd, + entrylk_type type, dict_t *xdata); + +int32_t +posix_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *xdata); + +int32_t +posix_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *dict); + +int32_t +posix_rchecksum(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + int32_t len, dict_t *xdata); + +int32_t +posix_put(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, uint32_t flags, struct iovec *vector, int32_t count, + off_t offset, struct iobref *iobref, dict_t *xattr, dict_t *xdata); + +int32_t +posix_copy_file_range(call_frame_t *frame, xlator_t *this, fd_t *fd_in, + off64_t off_in, fd_t *fd_out, off64_t off_out, size_t len, + uint32_t flags, dict_t *xdata); + +int32_t +posix_set_mode_in_dict(dict_t *in_dict, dict_t *out_dict, + struct iatt *in_stbuf); + +gf_cs_obj_state +posix_cs_check_status(xlator_t *this, const char *realpath, int *fd, + struct iatt *buf); + +int +posix_cs_set_state(xlator_t *this, dict_t **rsp, gf_cs_obj_state state, + char const *path, int *fd); + +gf_cs_obj_state +posix_cs_heal_state(xlator_t *this, const char *path, int *fd, + struct iatt *stbuf); +int +posix_cs_maintenance(xlator_t *this, fd_t *fd, loc_t *loc, int *pfd, + struct iatt *buf, const char *realpath, dict_t *xattr_req, + dict_t **xattr_rsp, gf_boolean_t ignore_failure); +int +posix_check_dev_file(xlator_t *this, inode_t 
*inode, char *fop, int *op_errno); + +int +posix_spawn_ctx_janitor_thread(xlator_t *this); + +void +posix_update_iatt_buf(struct iatt *buf, int fd, char *loc, dict_t *xdata); -#define POSIX_BASE_PATH_LEN(this) (((struct posix_private *)this->private)->base_path_length) +gf_boolean_t +posix_is_layout_stale(dict_t *xdata, char *par_path, xlator_t *this); -#define MAKE_REAL_PATH(var, this, path) do { \ - var = alloca (strlen (path) + POSIX_BASE_PATH_LEN(this) + 2); \ - strcpy (var, POSIX_BASE_PATH(this)); \ - strcpy (&var[POSIX_BASE_PATH_LEN(this)], path); \ - } while (0) +int +posix_delete_user_xattr(dict_t *dict, char *k, data_t *v, void *data); #endif /* _POSIX_H */ |
