summaryrefslogtreecommitdiffstats
path: root/xlators/storage/bdb
diff options
context:
space:
mode:
authorVikas Gorur <vikas@zresearch.com>2009-02-18 17:36:07 +0530
committerVikas Gorur <vikas@zresearch.com>2009-02-18 17:36:07 +0530
commit77adf4cd648dce41f89469dd185deec6b6b53a0b (patch)
tree02e155a5753b398ee572b45793f889b538efab6b /xlators/storage/bdb
parentf3b2e6580e5663292ee113c741343c8a43ee133f (diff)
Added all files
Diffstat (limited to 'xlators/storage/bdb')
-rw-r--r--xlators/storage/bdb/Makefile.am3
-rw-r--r--xlators/storage/bdb/src/Makefile.am18
-rw-r--r--xlators/storage/bdb/src/bctx.c394
-rw-r--r--xlators/storage/bdb/src/bdb-ll.c1455
-rw-r--r--xlators/storage/bdb/src/bdb.c3371
-rw-r--r--xlators/storage/bdb/src/bdb.h439
6 files changed, 5680 insertions, 0 deletions
diff --git a/xlators/storage/bdb/Makefile.am b/xlators/storage/bdb/Makefile.am
new file mode 100644
index 00000000000..d471a3f9243
--- /dev/null
+++ b/xlators/storage/bdb/Makefile.am
@@ -0,0 +1,3 @@
+# Recurse into src/, where the bdb translator sources are built.
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/storage/bdb/src/Makefile.am b/xlators/storage/bdb/src/Makefile.am
new file mode 100644
index 00000000000..c0ab394bc58
--- /dev/null
+++ b/xlators/storage/bdb/src/Makefile.am
@@ -0,0 +1,18 @@
+
+# bdb.la is built as a dlopen()able translator module installed under
+# the versioned glusterfs xlator/storage directory.
+xlator_LTLIBRARIES = bdb.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/storage
+
+# libtool's option is spelled -avoid-version; "-avoidversion" is not a
+# libtool link-mode flag.
+bdb_la_LDFLAGS = -module -avoid-version
+
+bdb_la_SOURCES = bctx.c bdb-ll.c bdb.c
+bdb_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = bdb.h
+
+AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D__USE_FILE_OFFSET64 -D_GNU_SOURCE -D$(GF_HOST_OS) -Wall \
+ -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+
+AM_LDFLAGS = -ldb
+
+CLEANFILES =
+
diff --git a/xlators/storage/bdb/src/bctx.c b/xlators/storage/bdb/src/bctx.c
new file mode 100644
index 00000000000..2bfa3ea8762
--- /dev/null
+++ b/xlators/storage/bdb/src/bctx.c
@@ -0,0 +1,394 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#include <list.h>
+#include <bdb.h>
+#include <libgen.h> /* for dirname */
+
+/* Free the two path strings owned by @bctx (when set) and then @bctx itself. */
+static void
+__destroy_bctx (bctx_t *bctx)
+{
+        if (bctx->db_path) {
+                FREE (bctx->db_path);
+        }
+
+        if (bctx->directory) {
+                FREE (bctx->directory);
+        }
+
+        FREE (bctx);
+}
+
+/* Unlink @bctx from its hash bucket; the b_hash node is left re-initialised. */
+static void
+__unhash_bctx (bctx_t *bctx)
+{
+        struct list_head *hash_node = &bctx->b_hash;
+
+        list_del_init (hash_node);
+}
+
+/*
+ * bctx_table_prune - shrink the LRU list to table->lru_limit and destroy
+ *                    everything that is waiting on table->purge.
+ *
+ * @table: bctx table to prune; a NULL table is a no-op.
+ *
+ * Under table->lock, entries are taken from the front of table->b_lru,
+ * unhashed and moved to table->purge until lru_size is within lru_limit
+ * (an lru_limit of 0 disables this step).  The whole purge list is then
+ * handed over to a local list head so that closing the databases and
+ * freeing the entries happens after the lock is dropped.
+ *
+ * return: number of entries taken off the LRU list.  The result of each
+ *         DB->close() is only logged; it is kept in a separate variable so
+ *         that it no longer overwrites the count.
+ */
+static int32_t
+bctx_table_prune (bctx_table_t *table)
+{
+        int32_t           pruned    = 0;
+        int32_t           close_ret = 0;
+        struct list_head  purge     = {0,};
+        struct list_head *next      = NULL;
+        bctx_t           *entry     = NULL;
+        bctx_t           *del       = NULL, *tmp = NULL;
+
+        if (!table)
+                return 0;
+
+        INIT_LIST_HEAD (&purge);
+
+        LOCK (&table->lock);
+        {
+                if ((table->lru_limit) &&
+                    (table->lru_size > table->lru_limit)) {
+                        while (table->lru_size > table->lru_limit) {
+                                next = table->b_lru.next;
+                                entry = list_entry (next, bctx_t, list);
+
+                                list_move_tail (next, &table->purge);
+                                __unhash_bctx (entry);
+
+                                table->lru_size--;
+                                pruned++;
+                        }
+                }
+                /* splice: link the local head into the purge ring, then
+                 * unlink table->purge, so the local head now owns every
+                 * entry and table->purge is left empty */
+                list_move_tail (&purge, &table->purge);
+                list_del_init (&table->purge);
+        }
+        UNLOCK (&table->lock);
+
+        {
+                list_for_each_entry_safe (del, tmp, &purge, list) {
+                        list_del_init (&del->list);
+                        if (del->dbp) {
+                                close_ret = del->dbp->close (del->dbp, 0);
+                                if (close_ret != 0) {
+                                        gf_log (table->this->name, GF_LOG_ERROR,
+                                                "failed to close db on path (%s): %s",
+                                                del->directory, db_strerror (close_ret));
+                                } else {
+                                        gf_log (table->this->name, GF_LOG_WARNING,
+                                                "close db for path %s; table->lru_count = %d",
+                                                del->directory, table->lru_size);
+                                }
+                        }
+                        __destroy_bctx (del);
+                }
+        }
+
+        return pruned;
+}
+
+
+/* struct bdb_ctx related */
+/*
+ * Hash the NUL-terminated string @key into the range [0, hash_size).
+ * The accumulator starts as the first character; every following character
+ * is folded in as (acc * 31 + c), written as a shift and a subtraction.
+ */
+static inline uint32_t
+bdb_key_hash (char *key, uint32_t hash_size)
+{
+        uint32_t  acc = *key;
+        char     *cur = key;
+
+        if (acc != 0) {
+                cur++;
+                while (*cur != '\0') {
+                        acc = (acc << 5) - acc + *cur;
+                        cur++;
+                }
+        }
+
+        /* cur rests on the terminating NUL here, so adding *cur adds zero */
+        return (acc + *cur) % hash_size;
+}
+
+/*
+ * Compute the hash of @bctx's directory key, store it in bctx->key_hash
+ * and link @bctx into the matching bucket of its table.
+ */
+static void
+__hash_bctx (bctx_t *bctx)
+{
+        char         *key   = NULL;
+        bctx_table_t *owner = bctx->table;
+
+        MAKE_KEY_FROM_PATH (key, bctx->directory);
+        bctx->key_hash = bdb_key_hash (key, owner->hash_size);
+
+        /* leave whatever bucket we were on before joining the new one */
+        list_del_init (&bctx->b_hash);
+        list_add (&bctx->b_hash, &owner->b_hash[bctx->key_hash]);
+}
+
+/*
+ * Park a bctx that is no longer referenced: with an open db it goes to the
+ * tail of the LRU list (and lru_size grows); without one it is unhashed and
+ * queued on the purge list.
+ */
+static inline bctx_t *
+__bctx_passivate (bctx_t *bctx)
+{
+        bctx_table_t *owner = bctx->table;
+
+        if (!bctx->dbp) {
+                list_move_tail (&bctx->list, &owner->purge);
+                __unhash_bctx (bctx);
+                return bctx;
+        }
+
+        list_move_tail (&bctx->list, &owner->b_lru);
+        owner->lru_size++;
+
+        return bctx;
+}
+
+/* Move @bctx to the head of the table's active list and decrement lru_size. */
+static inline bctx_t *
+__bctx_activate (bctx_t *bctx)
+{
+        bctx_table_t *owner = bctx->table;
+
+        owner->lru_size--;
+        list_move (&bctx->list, &owner->active);
+
+        return bctx;
+}
+
+/*
+ * Drop one reference from @bctx (which must currently hold at least one);
+ * when the count reaches zero the bctx is passivated.
+ */
+static bctx_t *
+__bdb_ctx_unref (bctx_t *bctx)
+{
+        assert (bctx->ref);
+
+        bctx->ref--;
+        if (bctx->ref == 0)
+                return __bctx_passivate (bctx);
+
+        return bctx;
+}
+
+
+/*
+ * bctx_unref - drop a reference on @bctx and prune the owning table.
+ *
+ * @bctx: context to release; NULL, or a bctx without a table, is rejected.
+ *
+ * The reference is dropped under table->lock; bctx_table_prune() is then
+ * called with the lock released, since it takes the lock itself.
+ *
+ * return: @bctx, or NULL when the argument was rejected.
+ */
+bctx_t *
+bctx_unref (bctx_t *bctx)
+{
+        bctx_table_t *table = NULL;
+
+        /* '||' rather than '&&': a NULL bctx must never be dereferenced */
+        if (!bctx || !bctx->table)
+                return NULL;
+
+        table = bctx->table;
+
+        LOCK (&table->lock);
+        {
+                bctx = __bdb_ctx_unref (bctx);
+        }
+        UNLOCK (&table->lock);
+
+        bctx_table_prune (table);
+
+        return bctx;
+}
+
+/*
+ * Take one reference on @bctx; the first reference also activates it.
+ *
+ * NOTE: __bctx_ref() is called only after holding table->lock and
+ *       bctx->lock, in that order.
+ */
+static inline bctx_t *
+__bctx_ref (bctx_t *bctx)
+{
+        if (bctx->ref == 0) {
+                __bctx_activate (bctx);
+        }
+
+        bctx->ref += 1;
+
+        return bctx;
+}
+
+/* Take a reference on @bctx while holding its table's lock; returns @bctx. */
+bctx_t *
+bctx_ref (bctx_t *bctx)
+{
+        bctx_table_t *owner = bctx->table;
+
+        LOCK (&owner->lock);
+        {
+                __bctx_ref (bctx);
+        }
+        UNLOCK (&owner->lock);
+
+        return bctx;
+}
+
+
+#define BDB_THIS(table) (table->this)
+
+/*
+ * __create_bctx - allocate a bctx for @path, hash it into @table and put it
+ *                 on the table's LRU list (lru_size is incremented).
+ *
+ * @table: table that will own the new bctx.
+ * @path:  directory path; copied with strdup().
+ *
+ * return: the new bctx, or NULL when any allocation fails.  On failure
+ *         nothing is linked into @table and every partial allocation is
+ *         released, so callers never see a half-initialised object.
+ */
+static inline bctx_t *
+__create_bctx (bctx_table_t *table,
+               const char *path)
+{
+        bctx_t *bctx    = NULL;
+        char   *db_path = NULL;
+
+        bctx = CALLOC (1, sizeof (*bctx));
+        GF_VALIDATE_OR_GOTO ("bctx", bctx, out);
+
+        bctx->table = table;
+        bctx->directory = strdup (path);
+        GF_VALIDATE_OR_GOTO ("bctx", bctx->directory, err);
+
+        MAKE_REAL_PATH_TO_STORAGE_DB (db_path, BDB_THIS (table), path);
+
+        bctx->db_path = strdup (db_path);
+        /* validate the string just duplicated, not ->directory again */
+        GF_VALIDATE_OR_GOTO ("bctx", bctx->db_path, err);
+
+        INIT_LIST_HEAD (&bctx->c_list);
+        INIT_LIST_HEAD (&bctx->list);
+        INIT_LIST_HEAD (&bctx->b_hash);
+
+        LOCK_INIT (&bctx->lock);
+
+        __hash_bctx (bctx);
+
+        list_add (&bctx->list, &table->b_lru);
+        table->lru_size++;
+
+        return bctx;
+
+err:
+        /* __destroy_bctx() copes with either string still being NULL */
+        __destroy_bctx (bctx);
+        bctx = NULL;
+out:
+        return bctx;
+}
+
+/* bctx_lookup - look up, or create, the bctx_t for the directory @directory
+ *               and return it with a reference taken.
+ *
+ * @table:     bctx_table_t for this instance of bdb.
+ * @directory: directory for which bctx_t is being looked up.
+ *
+ * The hash bucket for @directory is searched under table->lock; each
+ * candidate is compared while holding its own lock.  When nothing matches,
+ * a new bctx is created.
+ *
+ * return: referenced bctx_t, or NULL on invalid arguments or when a new
+ *         bctx could not be allocated.
+ */
+bctx_t *
+bctx_lookup (bctx_table_t *table,
+             const char *directory)
+{
+        char     *key      = NULL;
+        uint32_t  key_hash = 0;
+        bctx_t   *trav     = NULL, *bctx = NULL, *tmp = NULL;
+
+        GF_VALIDATE_OR_GOTO ("bctx", table, out);
+        GF_VALIDATE_OR_GOTO ("bctx", directory, out);
+
+        MAKE_KEY_FROM_PATH (key, directory);
+        key_hash = bdb_key_hash (key, table->hash_size);
+
+        LOCK (&table->lock);
+        {
+                list_for_each_entry_safe (trav, tmp, &table->b_hash[key_hash], b_hash) {
+                        LOCK(&trav->lock);
+                        if (!strcmp(trav->directory, directory)) {
+                                bctx = __bctx_ref (trav);
+                        }
+                        UNLOCK(&trav->lock);
+                        if (bctx)
+                                break;
+                }
+
+                if (!bctx) {
+                        bctx = __create_bctx (table, directory);
+                        /* creation fails on out-of-memory; do not
+                         * reference a NULL context */
+                        if (bctx)
+                                bctx = __bctx_ref (bctx);
+                }
+        }
+        UNLOCK (&table->lock);
+out:
+        return bctx;
+}
+
+
+/*
+ * bctx_parent - look up the bctx of the directory that contains @path.
+ *
+ * dirname() is run on a private copy of @path, which is freed before
+ * returning.  Returns what bctx_lookup() returns for the parent directory,
+ * or NULL when an argument is NULL or the copy cannot be allocated.
+ */
+bctx_t *
+bctx_parent (bctx_table_t *table,
+             const char *path)
+{
+        bctx_t *bctx       = NULL;
+        char   *pathname   = NULL;
+        char   *parent_dir = NULL;
+
+        GF_VALIDATE_OR_GOTO ("bctx", table, out);
+        GF_VALIDATE_OR_GOTO ("bctx", path, out);
+
+        pathname = strdup (path);
+        GF_VALIDATE_OR_GOTO ("bctx", pathname, out);
+
+        parent_dir = dirname (pathname);
+        bctx = bctx_lookup (table, parent_dir);
+        GF_VALIDATE_OR_GOTO ("bctx", bctx, out);
+
+out:
+        if (pathname != NULL) {
+                free (pathname);
+        }
+
+        return bctx;
+}
+
+/*
+ * bdb_db_rename - rename the database file @oldpath to @newpath through the
+ *                 table's DB_ENV, while holding table->lock.
+ *
+ * return: the DB_ENV->dbrename() result (0 on success), or -1 when an
+ *         argument or the table's dbenv is NULL.
+ */
+inline int32_t
+bdb_db_rename (bctx_table_t *table,
+               const char *oldpath,
+               const char *newpath)
+{
+        int32_t  ret   = -1;
+        DB_ENV  *dbenv = NULL;
+
+        GF_VALIDATE_OR_GOTO ("bctx", table, out);
+        GF_VALIDATE_OR_GOTO ("bctx", oldpath, out);
+        GF_VALIDATE_OR_GOTO ("bctx", newpath, out);
+
+        dbenv = table->dbenv;
+        GF_VALIDATE_OR_GOTO ("bctx", dbenv, out);
+
+        LOCK (&table->lock);
+        {
+                ret = dbenv->dbrename (dbenv, NULL, oldpath, NULL, newpath, 0);
+
+                if (ret == 0) {
+                        gf_log ("bctx",
+                                GF_LOG_DEBUG,
+                                "successfully renamed %s to %s: %s",
+                                oldpath, newpath, db_strerror (ret));
+                } else {
+                        gf_log ("bctx",
+                                GF_LOG_ERROR,
+                                "failed to rename %s to %s: %s",
+                                oldpath, newpath, db_strerror (ret));
+                }
+        }
+        UNLOCK (&table->lock);
+
+out:
+        return ret;
+}
+
+/*
+ * bctx_rename - detach @bctx from its table, close its db if open, and
+ *               rename its database file to @db_newpath.
+ *
+ * Under table->lock the bctx is unhashed and taken off whatever list it is
+ * on, and an open db handle is closed (a close failure is only logged).
+ * The rename itself runs after the lock is released, because
+ * bdb_db_rename() takes table->lock on its own.
+ *
+ * return: @bctx on success, NULL when the rename fails.
+ */
+bctx_t *
+bctx_rename (bctx_t *bctx,
+             const char *db_newpath)
+{
+        int32_t       ret   = -1;
+        bctx_table_t *table = bctx->table;
+
+        LOCK (&table->lock);
+        {
+                __unhash_bctx (bctx);
+                list_del_init (&bctx->list);
+
+                if (bctx->dbp != NULL) {
+                        ret = bctx->dbp->close (bctx->dbp, 0);
+                        if (ret != 0) {
+                                gf_log ("bdb-ll",
+                                        GF_LOG_ERROR,
+                                        "failed to close db for directory %s (%s)",
+                                        bctx->directory, db_strerror (ret));
+                        }
+                        bctx->dbp = NULL;
+                }
+        }
+        UNLOCK (&table->lock);
+
+        ret = bdb_db_rename (table, bctx->db_path, db_newpath);
+        if (ret == 0)
+                return bctx;
+
+        gf_log ("bctx",
+                GF_LOG_ERROR,
+                "bdb_db_rename failed for directory %s",
+                bctx->directory);
+
+        return NULL;
+}
diff --git a/xlators/storage/bdb/src/bdb-ll.c b/xlators/storage/bdb/src/bdb-ll.c
new file mode 100644
index 00000000000..40e7d187759
--- /dev/null
+++ b/xlators/storage/bdb/src/bdb-ll.c
@@ -0,0 +1,1455 @@
+/*
+ Copyright (c) 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#include <libgen.h>
+#include "bdb.h"
+#include <list.h>
+/*
+ * implement the procedures to interact with bdb */
+
+/****************************************************************
+ *
+ * General wrappers and utility procedures for bdb xlator
+ *
+ ****************************************************************/
+#define BDB_LL_PAGE_SIZE_DEFAULT 4096
+#define BDB_LL_PAGE_SIZE_MIN 4096
+#define BDB_LL_PAGE_SIZE_MAX 65536
+
+/*
+ * bdb_inode_transform - hand out the next inode number of this bdb instance.
+ *
+ * The counter private->next_ino is incremented under private->ino_lock and
+ * its new value returned.  @parent is not used.  Returns (ino_t)-1 when
+ * @bctx is NULL.
+ */
+ino_t
+bdb_inode_transform (ino_t parent,
+                     bctx_t *bctx)
+{
+        ino_t               ino  = -1;
+        struct bdb_private *priv = NULL;
+
+        GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out);
+
+        priv = bctx->table->this->private;
+
+        LOCK (&priv->ino_lock);
+        {
+                priv->next_ino++;
+                ino = priv->next_ino;
+        }
+        UNLOCK (&priv->ino_lock);
+out:
+        return ino;
+}
+
+
+/***********************************************************
+ *
+ * bdb storage database utilities
+ *
+ **********************************************************/
+
+/*
+ * bdb_db_open - opens a storage db.
+ *
+ * @bctx: context specific to the directory for which we are supposed to
+ *        open the db.
+ *
+ * A DB handle is created in the table's environment, the configured page
+ * size (if any) is applied, and the file bctx->db_path is opened with the
+ * table's access mode and flags.  A page-size failure is logged but does
+ * not abort the open.
+ *
+ * NOTE: the caller must already hold the lock before calling this.
+ *
+ * return: an open DB handle, or NULL on failure.  When DB->open() fails the
+ *         handle is discarded with DB->close() instead of being leaked.
+ */
+static DB *
+bdb_db_open (bctx_t *bctx)
+{
+        DB           *storage_dbp = NULL;
+        int32_t       op_ret      = -1;
+        bctx_table_t *table       = NULL;
+
+        GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out);
+
+        table = bctx->table;
+        GF_VALIDATE_OR_GOTO ("bdb-ll", table, out);
+
+        /* we have to do the following, we can't deny someone of db_open ;) */
+        op_ret = db_create (&storage_dbp, table->dbenv, 0);
+        if (op_ret != 0) {
+                gf_log ("bdb-ll", GF_LOG_ERROR,
+                        "failed to do db_create for directory %s (%s)",
+                        bctx->directory, db_strerror (op_ret));
+                storage_dbp = NULL;
+                goto out;
+        }
+
+        if (table->page_size) {
+                op_ret = storage_dbp->set_pagesize (storage_dbp,
+                                                    table->page_size);
+                if (op_ret != 0) {
+                        gf_log ("bdb-ll", GF_LOG_ERROR,
+                                "failed to set the page_size (%"PRIu64") for directory %s (%s)",
+                                table->page_size, bctx->directory, db_strerror (op_ret));
+                } else {
+                        gf_log ("bdb-ll", GF_LOG_DEBUG,
+                                "page-size (%"PRIu64") set on DB",
+                                table->page_size);
+                }
+        }
+
+        op_ret = storage_dbp->open (storage_dbp,
+                                    NULL,
+                                    bctx->db_path,
+                                    NULL,
+                                    table->access_mode,
+                                    table->dbflags,
+                                    0);
+        if (op_ret != 0 ) {
+                gf_log ("bdb-ll",
+                        GF_LOG_ERROR,
+                        "failed to open storage-db for directory %s (%s)",
+                        bctx->db_path, db_strerror (op_ret));
+                /* a handle whose open failed still has to be discarded
+                 * through DB->close(), otherwise it is leaked */
+                storage_dbp->close (storage_dbp, 0);
+                storage_dbp = NULL;
+        }
+
+out:
+        return storage_dbp;
+}
+
+
+
+/*
+ * bdb_cursor_close - close @cursorp while holding bctx->lock.
+ *
+ * The close method name depends on the Berkeley DB version, hence the
+ * HAVE_BDB_CURSOR_GET switch.  Returns the close result (0 on success), or
+ * -1 when an argument is NULL.
+ */
+int32_t
+bdb_cursor_close (bctx_t *bctx,
+                  DBC *cursorp)
+{
+        int32_t close_ret = -1;
+
+        GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out);
+        GF_VALIDATE_OR_GOTO ("bdb-ll", cursorp, out);
+
+        LOCK (&bctx->lock);
+        {
+#ifdef HAVE_BDB_CURSOR_GET
+                close_ret = cursorp->close (cursorp);
+#else
+                close_ret = cursorp->c_close (cursorp);
+#endif
+                if (close_ret != 0) {
+                        gf_log ("bdb-ll",
+                                GF_LOG_ERROR,
+                                "failed to close db cursor for directory %s (%s)",
+                                bctx->directory, db_strerror (close_ret));
+                }
+        }
+        UNLOCK (&bctx->lock);
+
+out:
+        return close_ret;
+}
+
+
+/*
+ * bdb_cursor_open - create a cursor on @bctx's database, opening the
+ *                   database first when bctx->dbp is not set.
+ *
+ * Everything runs under bctx->lock.  Returns 0 on success, -1 when an
+ * argument is NULL or the database cannot be opened, or the non-zero
+ * DB->cursor() result when cursor creation fails.
+ */
+int32_t
+bdb_cursor_open (bctx_t *bctx,
+                 DBC **cursorpp)
+{
+        int32_t ret = -1;
+
+        GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out);
+        GF_VALIDATE_OR_GOTO ("bdb-ll", cursorpp, out);
+
+        LOCK (&bctx->lock);
+        {
+                if (bctx->dbp == NULL) {
+                        bctx->dbp = bdb_db_open (bctx);
+                }
+
+                if (bctx->dbp == NULL) {
+                        gf_log ("bdb-ll",
+                                GF_LOG_ERROR,
+                                "failed to open storage db for %s",
+                                bctx->directory);
+                        ret = -1;
+                } else {
+                        /* database is available, create the cursor */
+                        ret = bctx->dbp->cursor (bctx->dbp, NULL, cursorpp, 0);
+                        if (ret != 0) {
+                                gf_log ("bdb-ll",
+                                        GF_LOG_ERROR,
+                                        "failed to create a cursor for %s (%s)",
+                                        bctx->directory, db_strerror (ret));
+                        }
+                }
+        }
+        UNLOCK (&bctx->lock);
+
+out:
+        return ret;
+}
+
+
+/* cache related */
+/*
+ * bdb_cache_lookup - find the cache entry of @bctx whose key matches the
+ *                    key derived from @path.
+ *
+ * The list is walked under bctx->lock.  Returns the matching entry, or NULL
+ * when there is none or an argument is NULL.
+ */
+static bdb_cache_t *
+bdb_cache_lookup (bctx_t *bctx,
+                  char *path)
+{
+        bdb_cache_t *hit  = NULL;
+        bdb_cache_t *iter = NULL;
+        char        *key  = NULL;
+
+        GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out);
+        GF_VALIDATE_OR_GOTO ("bdb-ll", path, out);
+
+        MAKE_KEY_FROM_PATH (key, path);
+
+        LOCK (&bctx->lock);
+        {
+                list_for_each_entry (iter, &bctx->c_list, c_list) {
+                        if (strcmp (iter->key, key) == 0) {
+                                hit = iter;
+                                break;
+                        }
+                }
+        }
+        UNLOCK (&bctx->lock);
+
+out:
+        return hit;
+}
+
+/*
+ * bdb_cache_insert - remember a key/value pair in @bctx's small cache.
+ *
+ * @bctx: context whose cache is updated.
+ * @key:  DBT whose data is a NUL-terminated key string.
+ * @data: DBT holding the value; data->size bytes are copied.
+ *
+ * Copies of the key and value are made before anything in the cache is
+ * touched, so an allocation failure leaves the cache exactly as it was.
+ * Once more than 5 entries are cached, the entry at the tail of the list
+ * is recycled; otherwise a new entry is allocated.  The entry is (re)added
+ * at the head of the list.
+ *
+ * return: 0 on success, -1 on invalid arguments or allocation failure.
+ */
+static int32_t
+bdb_cache_insert (bctx_t *bctx,
+                  DBT *key,
+                  DBT *data)
+{
+        bdb_cache_t *bcache   = NULL;
+        char        *new_key  = NULL;
+        void        *new_data = NULL;
+        int32_t      ret      = -1;
+
+        GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out);
+        GF_VALIDATE_OR_GOTO ("bdb-ll", key, out);
+        GF_VALIDATE_OR_GOTO ("bdb-ll", data, out);
+
+        new_key = strdup ((char *)key->data);
+        GF_VALIDATE_OR_GOTO ("bdb-ll", new_key, out);
+
+        new_data = memdup (data->data, data->size);
+        GF_VALIDATE_OR_GOTO ("bdb-ll", new_data, out);
+
+        LOCK (&bctx->lock);
+        {
+                if ((bctx->c_count > 5) && !list_empty (&bctx->c_list)) {
+                        /* cache is full: recycle the entry at the tail;
+                         * the list_empty() guard keeps us from treating
+                         * the list head itself as an entry */
+                        bcache = list_entry (bctx->c_list.prev, bdb_cache_t, c_list);
+                        list_del_init (&bcache->c_list);
+                        free (bcache->key);
+                        free (bcache->data);
+                } else {
+                        bcache = CALLOC (1, sizeof (*bcache));
+                        GF_VALIDATE_OR_GOTO ("bdb-ll", bcache, unlock);
+                        bctx->c_count++;
+                }
+
+                /* ownership of both copies moves to the cache entry */
+                bcache->key  = new_key;
+                bcache->data = new_data;
+                bcache->size = data->size;
+                new_key  = NULL;
+                new_data = NULL;
+
+                list_add (&bcache->c_list, &bctx->c_list);
+                ret = 0;
+        }
+unlock:
+        UNLOCK (&bctx->lock);
+out:
+        /* both are NULL after a successful insert */
+        free (new_key);
+        free (new_data);
+
+        return ret;
+}
+
+/*
+ * bdb_cache_delete - drop the cache entry of @bctx whose key equals @key.
+ *
+ * The search and removal happen under bctx->lock; a matching entry is
+ * unlinked and freed together with its key and data, and c_count is
+ * decremented.  Always returns 0, also when nothing matched or an argument
+ * is NULL.
+ */
+static int32_t
+bdb_cache_delete (bctx_t *bctx,
+                  char *key)
+{
+        bdb_cache_t *victim = NULL;
+        bdb_cache_t *iter   = NULL;
+
+        GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out);
+        GF_VALIDATE_OR_GOTO ("bdb-ll", key, out);
+
+        LOCK (&bctx->lock);
+        {
+                list_for_each_entry (iter, &bctx->c_list, c_list) {
+                        if (strcmp (iter->key, key) != 0)
+                                continue;
+
+                        bctx->c_count--;
+                        victim = iter;
+                        break;
+                }
+
+                if (victim != NULL) {
+                        list_del_init (&victim->c_list);
+                        free (victim->key);
+                        free (victim->data);
+                        free (victim);
+                }
+        }
+        UNLOCK (&bctx->lock);
+
+out:
+        return 0;
+}
+
+/*
+ * bdb_db_stat - call DB->stat() on @bctx's database, opening the database
+ *               first when bctx->dbp is not set.
+ *
+ * @txnid and @flags are passed through to DB->stat().  Returns the pointer
+ * DB->stat() stored, or NULL when the database could not be opened.
+ */
+void *
+bdb_db_stat (bctx_t *bctx,
+             DB_TXN *txnid,
+             uint32_t flags)
+{
+        void    *stat_buf = NULL;
+        DB      *storage  = NULL;
+        int32_t  ret      = -1;
+
+        LOCK (&bctx->lock);
+        {
+                if (bctx->dbp == NULL) {
+                        bctx->dbp = bdb_db_open (bctx);
+                }
+                storage = bctx->dbp;
+        }
+        UNLOCK (&bctx->lock);
+
+        GF_VALIDATE_OR_GOTO ("bdb-ll", storage, out);
+
+        ret = storage->stat (storage, txnid, &stat_buf, flags);
+
+        if (ret == 0) {
+                gf_log ("bdb-ll",
+                        GF_LOG_DEBUG,
+                        "successfully called DB->stat() on db file %s",
+                        bctx->db_path);
+        } else {
+                gf_log ("bdb-ll",
+                        GF_LOG_ERROR,
+                        "failed to do DB->stat() on db file %s: %s",
+                        bctx->db_path, db_strerror (ret));
+        }
+out:
+        return stat_buf;
+
+}
+
+/* bdb_db_get - retrieve the value stored for @path from the db file of @bctx.
+ *
+ * @bctx: bctx_t * corresponding to the parent directory of @path.
+ * @txnid: NULL if bdb_db_get is not embedded in an explicit transaction, or
+ *         a valid DB_TXN * when it is.
+ * @path: path of the file to read from (translated to a database key using
+ *        MAKE_KEY_FROM_PATH).
+ * @buf: may be NULL.  Otherwise a read buffer is allocated here and handed
+ *       to the caller through *@buf.
+ * @size: number of bytes to read; 0 asks for the whole record.  Only used
+ *        when bctx->cache is not set.
+ * @offset: offset in the record to read from.
+ *
+ * NOTE: bdb_db_get opens the DB if @bctx->dbp == NULL.
+ *
+ * NOTE: if bctx->cache is set, bdb_cache_lookup() is tried first and
+ *       DB->get() is only called on a miss; a whole record read that way is
+ *       then stored with bdb_cache_insert().
+ *
+ * NOTE: on a cache hit the buffer is bcache->size bytes long and zero
+ *       filled; the bytes from @offset to the end of the cached record are
+ *       copied into it.  The copy is bounded so it can no longer read past
+ *       the cached data.
+ *
+ * NOTE(review): DB_LOCK_DEADLOCK is retried without an upper bound.
+ *
+ * return: 'number of bytes read' on success or -1 on error.
+ */
+int32_t
+bdb_db_get (bctx_t *bctx,
+            DB_TXN *txnid,
+            const char *path,
+            char **buf,
+            size_t size,
+            off_t offset)
+{
+        DB          *storage    = NULL;
+        DBT          key        = {0,};
+        DBT          value      = {0,};
+        int32_t      ret        = -1;
+        char        *key_string = NULL;
+        bdb_cache_t *bcache     = NULL;
+        int32_t      db_flags   = 0;
+        uint8_t      need_break = 0;
+        int32_t      retries    = 1;
+
+        GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out);
+        GF_VALIDATE_OR_GOTO ("bdb-ll", path, out);
+
+        MAKE_KEY_FROM_PATH (key_string, path);
+
+        if (bctx->cache &&
+            ((bcache = bdb_cache_lookup (bctx, key_string)) != NULL)) {
+                if (buf) {
+                        *buf = CALLOC (1, bcache->size);
+                        /* check the allocation, not the caller's pointer */
+                        GF_VALIDATE_OR_GOTO ("bdb-ll", *buf, out);
+                        if ((offset >= 0) &&
+                            ((size_t)offset < (size_t)bcache->size)) {
+                                memcpy (*buf, (bcache->data + offset),
+                                        (size_t)bcache->size - (size_t)offset);
+                        }
+                }
+                ret = bcache->size;
+        } else {
+                LOCK (&bctx->lock);
+                {
+                        if (bctx->dbp == NULL) {
+                                bctx->dbp = bdb_db_open (bctx);
+                                storage = bctx->dbp;
+                        } else {
+                                /* we are just fine, lets continue */
+                                storage = bctx->dbp;
+                        } /* if(bctx->dbp==NULL)...else */
+                }
+                UNLOCK (&bctx->lock);
+
+                GF_VALIDATE_OR_GOTO ("bdb-ll", storage, out);
+
+                key.data = (char *)key_string;
+                key.size = strlen (key_string);
+                key.flags = DB_DBT_USERMEM;
+
+                if (bctx->cache){
+                        /* the whole record is fetched so it can be cached */
+                        value.flags = DB_DBT_MALLOC;
+                } else {
+                        if (size) {
+                                value.flags = DB_DBT_MALLOC | DB_DBT_PARTIAL;
+                        } else {
+                                value.flags = DB_DBT_MALLOC;
+                        }
+                        value.dlen = size;
+                        value.doff = offset;
+                }
+
+                do {
+                        /* TODO: we prefer to give our own buffer to value.data
+                         * and ask bdb to fill in it */
+                        ret = storage->get (storage, txnid, &key, &value, db_flags);
+
+                        if (ret == DB_NOTFOUND) {
+                                gf_log ("bdb-ll",
+                                        GF_LOG_DEBUG,
+                                        "failed to do DB->get() for key: %s."
+                                        " key not found in storage DB", key_string);
+                                ret = -1;
+                                need_break = 1;
+                        } else if (ret == DB_LOCK_DEADLOCK) {
+                                retries++;
+                                gf_log ("bdb-ll",
+                                        GF_LOG_ERROR,
+                                        "deadlock detected in DB->get. retrying DB->get (%d)",
+                                        retries);
+                        } else if (ret == 0) {
+                                /* successfully read data, lets set everything in place
+                                 * and return */
+                                if (buf) {
+                                        *buf = CALLOC (1, value.size);
+                                        ERR_ABORT (*buf);
+                                        memcpy (*buf, value.data, value.size);
+                                }
+                                ret = value.size;
+                                if (bctx->cache)
+                                        bdb_cache_insert (bctx, &key, &value);
+                                free (value.data);
+                                need_break = 1;
+                        } else {
+                                gf_log ("bdb-ll",
+                                        GF_LOG_ERROR,
+                                        "failed to do DB->get() for key %s: %s",
+                                        key_string, db_strerror (ret));
+                                ret = -1;
+                                need_break = 1;
+                        }
+                } while (!need_break);
+        }
+out:
+        return ret;
+}/* bdb_db_get */
+
+/* bdb_db_put - write @buf as (part of) the value of @key_string in the db of @bctx.
+ *
+ * @bctx: bctx_t * corresponding to the parent directory of the entry.
+ *        It is dereferenced without a NULL check, so it must be valid.
+ * @txnid: NULL if bdb_db_put is not embedded in an explicit transaction or a valid
+ *        DB_TXN *, when embedded in an explicit transaction.
+ * @key_string: key of the database entry.
+ * @buf: pointer to the buffer data to be written as data for @key_string.
+ * @size: size of @buf.
+ * @offset: offset in the key's data to be modified with provided data.
+ * @flags: with BDB_TRUNCATE_RECORD set, the bytes [0, @offset) of the record are
+ *        replaced by @buf.
+ *
+ * NOTE: bdb_db_put opens the DB if @bctx->dbp == NULL.
+ *
+ * NOTE: when bctx->cache is set, the key is first removed from bdb xlator's
+ *       internal cache with bdb_cache_delete().
+ *
+ * NOTE(review): DB_LOCK_DEADLOCK is retried without an upper bound.
+ *
+ * return: 0 on success; on failure -1 if the db could not be opened, otherwise
+ *         the non-zero DB->put() error code.
+ */
+int32_t
+bdb_db_put (bctx_t *bctx,
+ DB_TXN *txnid,
+ const char *key_string,
+ const char *buf,
+ size_t size,
+ off_t offset,
+ int32_t flags)
+{
+ DB *storage = NULL;
+ DBT key = {0,}, value = {0,};
+ int32_t ret = -1;
+ int32_t db_flags = DB_AUTO_COMMIT;
+ uint8_t need_break = 0;
+ int32_t retries = 1;
+
+ /* open the db on first use; dbp is read and set under bctx->lock */
+ LOCK (&bctx->lock);
+ {
+ if (bctx->dbp == NULL) {
+ bctx->dbp = bdb_db_open (bctx);
+ storage = bctx->dbp;
+ } else {
+ /* we are just fine, lets continue */
+ storage = bctx->dbp;
+ }
+ }
+ UNLOCK (&bctx->lock);
+
+ GF_VALIDATE_OR_GOTO ("bdb-ll", storage, out);
+
+ /* the cached copy is about to become stale, drop it */
+ if (bctx->cache) {
+ ret = bdb_cache_delete (bctx, (char *)key_string);
+ GF_VALIDATE_OR_GOTO ("bdb-ll", (ret == 0), out);
+ }
+
+ key.data = (void *)key_string;
+ key.size = strlen (key_string);
+
+ /* NOTE: this is a partial put: value.dlen bytes of the record starting at
+ * value.doff are replaced by the value.size bytes of value.data. when
+ * value.size > value.dlen the record grows and the data that followed
+ * value.doff + value.dlen is pushed to value.doff + value.size.
+ */
+ value.data = (void *)buf;
+
+ if (flags & BDB_TRUNCATE_RECORD) {
+ value.size = size;
+ value.doff = 0;
+ value.dlen = offset;
+ } else {
+ value.size = size;
+ value.dlen = size;
+ value.doff = offset;
+ }
+ value.flags = DB_DBT_PARTIAL;
+ if (buf == NULL && size == 0)
+ /* truncate called us */
+ value.flags = 0;
+
+ /* loop only repeats on DB_LOCK_DEADLOCK; every other outcome sets need_break */
+ do {
+ ret = storage->put (storage, txnid, &key, &value, db_flags);
+ if (ret == DB_LOCK_DEADLOCK) {
+ retries++;
+ gf_log ("bdb-ll",
+ GF_LOG_ERROR,
+ "deadlock detected in DB->put. retrying DB->put (%d)",
+ retries);
+ } else if (ret) {
+ /* write failed */
+ gf_log ("bdb-ll",
+ GF_LOG_ERROR,
+ "failed to do DB->put() for key %s: %s",
+ key_string, db_strerror (ret));
+ need_break = 1;
+ } else {
+ /* successfully wrote */
+ ret = 0;
+ need_break = 1;
+ }
+ } while (!need_break);
+out:
+ return ret;
+}/* bdb_db_put */
+
+
+/* bdb_db_del - delete the key/value pair corresponding to @path from the db
+ *              file of @bctx.
+ *
+ * @bctx: bctx_t * corresponding to the parent directory of @path.
+ *        It is dereferenced without a NULL check, so it must be valid.
+ * @txnid: NULL if bdb_db_del is not embedded in an explicit transaction or a
+ *        valid DB_TXN *, when embedded in an explicit transaction.
+ * @path: path to the file, whose key/value pair has to be deleted.
+ *
+ * NOTE: bdb_db_del opens the DB if @bctx->dbp == NULL.
+ *
+ * NOTE: the key is also removed from the internal cache.
+ *
+ * NOTE(review): DB_LOCK_DEADLOCK is retried without an upper bound.
+ *
+ * return: 0 on success, DB_NOTFOUND when the key does not exist, -1 on any
+ *         other error.
+ */
+int32_t
+bdb_db_del (bctx_t *bctx,
+            DB_TXN *txnid,
+            const char *path)
+{
+        DB      *storage    = NULL;
+        DBT      key        = {0,};
+        char    *key_string = NULL;
+        int32_t  ret        = -1;
+        int32_t  db_flags   = 0;
+        uint8_t  need_break = 0;
+        int32_t  retries    = 1;
+
+        MAKE_KEY_FROM_PATH (key_string, path);
+
+        LOCK (&bctx->lock);
+        {
+                if (bctx->dbp == NULL) {
+                        bctx->dbp = bdb_db_open (bctx);
+                        storage = bctx->dbp;
+                } else {
+                        /* we are just fine, lets continue */
+                        storage = bctx->dbp;
+                }
+        }
+        UNLOCK (&bctx->lock);
+
+        GF_VALIDATE_OR_GOTO ("bdb-ll", storage, out);
+
+        ret = bdb_cache_delete (bctx, key_string);
+        GF_VALIDATE_OR_GOTO ("bdb-ll", (ret == 0), out);
+
+        key.data = key_string;
+        key.size = strlen (key_string);
+        key.flags = DB_DBT_USERMEM;
+
+        do {
+                ret = storage->del (storage, txnid, &key, db_flags);
+
+                if (ret == DB_NOTFOUND) {
+                        gf_log ("bdb-ll",
+                                GF_LOG_DEBUG,
+                                "failed to delete %s from storage db, doesn't exist in storage DB",
+                                path);
+                        need_break = 1;
+                } else if (ret == DB_LOCK_DEADLOCK) {
+                        retries++;
+                        /* this is the delete path, so name DB->del in the log */
+                        gf_log ("bdb-ll",
+                                GF_LOG_ERROR,
+                                "deadlock detected in DB->del. retrying DB->del (%d)",
+                                retries);
+                } else if (ret == 0) {
+                        /* successfully deleted the entry */
+                        gf_log ("bdb-ll",
+                                GF_LOG_DEBUG,
+                                "deleted %s from storage db", path);
+                        ret = 0;
+                        need_break = 1;
+                } else {
+                        gf_log ("bdb-ll",
+                                GF_LOG_ERROR,
+                                "failed to delete %s from storage db: %s",
+                                path, db_strerror (ret));
+                        ret = -1;
+                        need_break = 1;
+                }
+        } while (!need_break);
+out:
+        return ret;
+}
+
+/* NOTE: bdb version compatibility wrapper */
+/*
+ * bdb_cursor_get - bdb version compatibility wrapper around the cursor
+ *                  'get' method (named get or c_get depending on the
+ *                  Berkeley DB version).
+ *
+ * Returns the cursor call's result, or -1 when @cursorp is NULL.  Any result
+ * other than 0 and DB_NOTFOUND is logged as an error.
+ */
+int32_t
+bdb_cursor_get (DBC *cursorp,
+                DBT *key,
+                DBT *value,
+                int32_t flags)
+{
+        int32_t get_ret = -1;
+
+        GF_VALIDATE_OR_GOTO ("bdb-ll", cursorp, out);
+
+#ifdef HAVE_BDB_CURSOR_GET
+        get_ret = cursorp->get (cursorp, key, value, flags);
+#else
+        get_ret = cursorp->c_get (cursorp, key, value, flags);
+#endif
+        if ((get_ret == 0) || (get_ret == DB_NOTFOUND))
+                goto out;
+
+        gf_log ("bdb-ll",
+                GF_LOG_ERROR,
+                "failed to CURSOR->get() for key %s (%s)",
+                (char *)key->data, db_strerror (get_ret));
+
+out:
+        return get_ret;
+}/* bdb_cursor_get */
+
+
+/*
+ * bdb_dirent_size - size to account for a directory entry named by @key:
+ * a fixed allowance of 24 bytes (a placeholder, per the original FIX ME
+ * marker) plus key->size, passed through ALIGN.
+ */
+int32_t
+bdb_dirent_size (DBT *key)
+{
+ return ALIGN (24 /* FIX MEEEE!!! */ + key->size);
+}
+
+
+/* bdb_extract_bfd - translate a fd_t to a bfd (either a 'struct bdb_bfd' or
+ *                   'struct bdb_dir').
+ *
+ * The bdb specific file handle is the value fd_ctx_get() yields for
+ * (@fd, @this); it is read into a uint64_t and converted back to a pointer.
+ *
+ * return: 'struct bdb_bfd *' or 'struct bdb_dir *' on success, or NULL when
+ *         an argument is NULL or no value was stored.
+ */
+inline void *
+bdb_extract_bfd (fd_t *fd,
+                 xlator_t *this)
+{
+        uint64_t  tmp_bfd = 0;
+        void     *bfd     = NULL;
+
+        GF_VALIDATE_OR_GOTO ("bdb-ll", fd, out);
+        GF_VALIDATE_OR_GOTO ("bdb-ll", this, out);
+
+        fd_ctx_get (fd, this, &tmp_bfd);
+        /* convert the fetched value; the old code cast 'bfd' onto itself
+         * and therefore always returned NULL */
+        bfd = (void *)(long)tmp_bfd;
+
+out:
+        return bfd;
+}
+
+/* bdb_dbenv_init - initialize DB_ENV
+ *
+ * @this:      bdb translator instance; this->private supplies the settings.
+ * @directory: home directory of the DB environment.
+ *
+ * initialization includes:
+ * 1. creating and opening DB_ENV (db_env_create(), DB_ENV->open()).
+ *    NOTE: see private->envflags for flags used.  When open reports
+ *    DB_RUNRECOVERY it is retried with DB_RECOVER_FATAL in place of
+ *    DB_RECOVER.
+ * 2. switching automatic removal of log files on or off according to
+ *    private->log_auto_remove.
+ * 3. when private->transaction is set: DB_AUTO_COMMIT, the transaction and
+ *    lock timeouts (if configured) and the log directory.
+ * 4. DB_ENV->set_errfile - set errfile to be used by db to report detailed
+ *    error logs.  used only for debugging purposes.
+ *
+ * Failures in steps 2-4 are logged and otherwise ignored.
+ *
+ * NOTE(review): when DB_ENV->open() fails the handle is dropped without
+ * DB_ENV->close(), and set_lg_dir is issued after the environment is open;
+ * both deserve a look against the Berkeley DB documentation.
+ *
+ * return: returns a valid DB_ENV * on success or NULL on error.
+ *
+ */
+static DB_ENV *
+bdb_dbenv_init (xlator_t *this,
+                char *directory)
+{
+        /* Create a DB environment */
+        DB_ENV        *dbenv       = NULL;
+        int32_t        ret         = 0;
+        bdb_private_t *private     = NULL;
+        int32_t        fatal_flags = 0;
+
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (directory, out);
+
+        private = this->private;
+        VALIDATE_OR_GOTO (private, out);
+
+        ret = db_env_create (&dbenv, 0);
+        VALIDATE_OR_GOTO ((ret == 0), out);
+
+        /* NOTE: set_errpfx returns 'void' */
+        dbenv->set_errpfx(dbenv, this->name);
+
+        ret = dbenv->set_lk_detect (dbenv, DB_LOCK_DEFAULT);
+        VALIDATE_OR_GOTO ((ret == 0), out);
+
+        ret = dbenv->open(dbenv, directory,
+                          private->envflags,
+                          S_IRUSR | S_IWUSR);
+        if ((ret != 0) && (ret != DB_RUNRECOVERY)) {
+                gf_log (this->name,
+                        GF_LOG_CRITICAL,
+                        "failed to open DB environment (%s)",
+                        db_strerror (ret));
+                dbenv = NULL;
+                goto out;
+        } else if (ret == DB_RUNRECOVERY) {
+                fatal_flags = ((private->envflags & (~DB_RECOVER)) | DB_RECOVER_FATAL);
+                ret = dbenv->open(dbenv, directory,
+                                  fatal_flags,
+                                  S_IRUSR | S_IWUSR);
+                if (ret != 0) {
+                        gf_log (this->name,
+                                GF_LOG_ERROR,
+                                "failed to open DB environment (%s) with DB_REOVER_FATAL",
+                                db_strerror (ret));
+                        dbenv = NULL;
+                        goto out;
+                } else {
+                        gf_log (this->name,
+                                GF_LOG_WARNING,
+                                "opened DB environment after DB_RECOVER_FATAL: %s",
+                                db_strerror (ret));
+                }
+        } else {
+                gf_log (this->name,
+                        GF_LOG_DEBUG,
+                        "DB environment successfull opened: %s",
+                        db_strerror (ret));
+        }
+
+
+
+#if (DB_VERSION_MAJOR == 4 && \
+     DB_VERSION_MINOR == 7)
+        if (private->log_auto_remove) {
+                ret = dbenv->log_set_config (dbenv, DB_LOG_AUTO_REMOVE, 1);
+        } else {
+                ret = dbenv->log_set_config (dbenv, DB_LOG_AUTO_REMOVE, 0);
+        }
+#else
+        if (private->log_auto_remove) {
+                ret = dbenv->set_flags (dbenv, DB_LOG_AUTOREMOVE, 1);
+        } else {
+                ret = dbenv->set_flags (dbenv, DB_LOG_AUTOREMOVE, 0);
+        }
+#endif
+        if (ret != 0) {
+                gf_log ("bctx",
+                        GF_LOG_ERROR,
+                        "failed to set DB_LOG_AUTOREMOVE on dbenv: %s", db_strerror (ret));
+        } else {
+                gf_log ("bctx",
+                        GF_LOG_DEBUG,
+                        "DB_LOG_AUTOREMOVE set on dbenv");
+        }
+
+        if (private->transaction) {
+                ret = dbenv->set_flags(dbenv, DB_AUTO_COMMIT, 1);
+
+                if (ret != 0) {
+                        gf_log ("bctx",
+                                GF_LOG_ERROR,
+                                "failed to set DB_AUTO_COMMIT on dbenv: %s",
+                                db_strerror (ret));
+                } else {
+                        gf_log ("bctx",
+                                GF_LOG_DEBUG,
+                                "DB_AUTO_COMMIT set on dbenv");
+                }
+
+                if (private->txn_timeout) {
+                        ret = dbenv->set_timeout (dbenv,
+                                                  private->txn_timeout,
+                                                  DB_SET_TXN_TIMEOUT);
+                        if (ret != 0) {
+                                gf_log ("bctx",
+                                        GF_LOG_ERROR,
+                                        "failed to set TXN_TIMEOUT to %d milliseconds "
+                                        "on dbenv: %s",
+                                        private->txn_timeout, db_strerror (ret));
+                        } else {
+                                gf_log ("bctx",
+                                        GF_LOG_DEBUG,
+                                        "TXN_TIMEOUT set to %d milliseconds",
+                                        private->txn_timeout);
+                        }
+                }
+
+                if (private->lock_timeout) {
+                        /* pass the lock timeout itself; the transaction
+                         * timeout was being applied here by mistake */
+                        ret = dbenv->set_timeout(dbenv,
+                                                 private->lock_timeout,
+                                                 DB_SET_LOCK_TIMEOUT);
+
+                        if (ret != 0) {
+                                gf_log ("bctx",
+                                        GF_LOG_ERROR,
+                                        "failed to set LOCK_TIMEOUT to %d milliseconds "
+                                        "on dbenv: %s",
+                                        private->lock_timeout, db_strerror (ret));
+                        } else {
+                                gf_log ("bctx",
+                                        GF_LOG_DEBUG,
+                                        "LOCK_TIMEOUT set to %d milliseconds",
+                                        private->lock_timeout);
+                        }
+                }
+
+                ret = dbenv->set_lg_dir (dbenv, private->logdir);
+
+                if (ret != 0) {
+                        gf_log ("bctx",
+                                GF_LOG_ERROR,
+                                "failed to set log directory for dbenv: %s", db_strerror (ret));
+                } else {
+                        gf_log ("bctx",
+                                GF_LOG_DEBUG,
+                                "set dbenv log dir to %s", private->logdir);
+                }
+
+        }
+
+        if (private->errfile) {
+                private->errfp = fopen (private->errfile, "a+");
+                if (private->errfp) {
+                        dbenv->set_errfile (dbenv, private->errfp);
+                } else {
+                        gf_log ("bctx",
+                                GF_LOG_ERROR,
+                                "failed to open errfile: %s", strerror (errno));
+                }
+        }
+
+out:
+        return dbenv;
+}
+
+#define BDB_ENV(this) ((((struct bdb_private *)this->private)->b_table)->dbenv)
+
+/* bdb_checkpoint - body of the checkpointing thread.
+ *
+ * In a transactional environment db does not write data straight into the db
+ * files; it appends 'log' records (similar to journal entries) to log files.
+ * A filesystem server is expected to run for a long time, so leaving all of
+ * that log to be processed at the next dbenv->open would be very costly.
+ * Checkpointing asks db to flush the logged changes into the db files at
+ * regular intervals instead.
+ * NOTE: removing log files that are no longer needed is not part of the
+ *       dbenv->txn_checkpoint() call.
+ *
+ * @data: xlator_t of the current instance of bdb xlator.
+ *
+ * The routine runs in its own thread and shares the DB_ENV handle with the
+ * filesystem thread. On every pass it samples private->active under
+ * private->active_lock and issues one txn_checkpoint(). While 'active' is
+ * non-zero it then sleeps private->checkpoint_timeout seconds
+ * ('option checkpoint-timeout <time-in-seconds>' in volfile) and repeats.
+ * Once 'active' is seen as zero the checkpoint just taken is the final one
+ * and the thread exits.
+ *
+ * NOTE: the thread is only started for transactional environments
+ *       ('option transaction on'); checkpointing is not valid otherwise.
+ *
+ * return: always NULL.
+ */
+static void *
+bdb_checkpoint (void *data)
+{
+        xlator_t           *this    = (xlator_t *) data;
+        DB_ENV             *dbenv   = BDB_ENV(this);
+        struct bdb_private *private = this->private;
+        int32_t             ret     = 0;
+        uint32_t            running = 0;
+
+        while (1) {
+                LOCK (&private->active_lock);
+                running = private->active;
+                UNLOCK (&private->active_lock);
+
+                /* the checkpoint call is identical for periodic and final runs */
+                ret = dbenv->txn_checkpoint (dbenv, 1024, 0, 0);
+
+                if (!running) {
+                        if (ret) {
+                                gf_log ("bctx",
+                                        GF_LOG_ERROR,
+                                        "failed to do final checkpoint environment: %s",
+                                        db_strerror (ret));
+                        } else {
+                                gf_log ("bctx",
+                                        GF_LOG_DEBUG,
+                                        "final checkpointing successful");
+                        }
+                        break;
+                }
+
+                if (ret) {
+                        gf_log ("bctx",
+                                GF_LOG_ERROR,
+                                "failed to checkpoint environment: %s", db_strerror (ret));
+                } else {
+                        gf_log ("bctx",
+                                GF_LOG_DEBUG,
+                                "checkpointing successful");
+                }
+
+                sleep (private->checkpoint_timeout);
+        }
+
+        return NULL;
+}
+
+/* BDB_CACHE_INIT - switch the cache on in @private.
+ *
+ * @this and @options are accepted for symmetry with the other *_INIT helpers
+ * but are not consulted: the cache is unconditionally enabled.
+ */
+static inline void
+BDB_CACHE_INIT (xlator_t *this, dict_t *options, struct bdb_private *private)
+{
+        private->cache = ON;
+}
+
+/* BDB_LOG_REMOVE_INIT - request automatic removal of db log files.
+ *
+ * Unconditionally sets private->log_auto_remove and logs the decision;
+ * @options is not consulted.
+ */
+static inline void
+BDB_LOG_REMOVE_INIT(xlator_t *this, dict_t *options, struct bdb_private *private)
+{
+        gf_log (this->name,
+                GF_LOG_DEBUG,
+                "DB_ENV will use DB_LOG_AUTO_REMOVE");
+        private->log_auto_remove = 1;
+}
+
+/* BDB_ERRFILE_INIT - record the db error-file path from 'option errfile'.
+ *
+ * When the option is present a private copy of its value is stored in
+ * private->errfile. When the option is absent, or the copy cannot be
+ * allocated, private->errfile is left untouched.
+ */
+static inline void
+BDB_ERRFILE_INIT (xlator_t *this,
+                  dict_t *options,
+                  struct bdb_private *private)
+{
+        data_t *errfile = NULL;
+
+        errfile = dict_get (options, "errfile");
+        if (errfile == NULL)
+                return;
+
+        private->errfile = strdup (errfile->data);
+        if (private->errfile == NULL) {
+                /* do not hand a NULL pointer to the "%s" below */
+                gf_log (this->name,
+                        GF_LOG_ERROR,
+                        "failed to copy errfile path \"%s\": out of memory",
+                        errfile->data);
+                return;
+        }
+
+        gf_log (this->name,
+                GF_LOG_DEBUG,
+                "using errfile: %s", private->errfile);
+}
+
+/* BDB_TABLE_INIT - allocate and initialize the bctx table of this xlator.
+ *
+ * The table inherits transaction, access_mode and dbflags from @private, so
+ * those must have been set before this helper runs. Two options are read:
+ *   'option lru-limit' - parsed with strtol(); BDB_DEFAULT_LRU_LIMIT if absent.
+ *   'option page-size' - parsed with gf_string2bytesize(). A value that cannot
+ *                        be parsed or lies outside
+ *                        [BDB_LL_PAGE_SIZE_MIN, BDB_LL_PAGE_SIZE_MAX] is
+ *                        reported and replaced by BDB_LL_PAGE_SIZE_DEFAULT.
+ *
+ * On success private->b_table points to the new table. If either the table or
+ * its hash array cannot be allocated, an error is logged and private->b_table
+ * is left untouched.
+ */
+static inline void
+BDB_TABLE_INIT (xlator_t *this,
+                dict_t *options,
+                struct bdb_private *private)
+{
+        bctx_table_t *table = NULL;
+        int32_t       idx   = 0;
+
+        data_t *lru_limit = NULL;
+        data_t *page_size = NULL;
+
+        table = CALLOC (1, sizeof (*table));
+        if (table == NULL) {
+                gf_log ("bdb-ll",
+                        GF_LOG_CRITICAL,
+                        "failed to allocate bctx table: out of memory");
+                return;
+        }
+
+        /* allocate the hash array up front so that a failure can be handled
+         * before any lock has been initialized */
+        table->hash_size = BDB_DEFAULT_HASH_SIZE;
+        table->b_hash = CALLOC (BDB_DEFAULT_HASH_SIZE, sizeof (struct list_head));
+        if (table->b_hash == NULL) {
+                gf_log ("bdb-ll",
+                        GF_LOG_CRITICAL,
+                        "failed to allocate bctx hash table: out of memory");
+                FREE (table);
+                return;
+        }
+
+        for (idx = 0; idx < table->hash_size; idx++)
+                INIT_LIST_HEAD(&(table->b_hash[idx]));
+
+        INIT_LIST_HEAD(&(table->b_lru));
+        INIT_LIST_HEAD(&(table->active));
+        INIT_LIST_HEAD(&(table->purge));
+
+        LOCK_INIT (&table->lock);
+        LOCK_INIT (&table->checkpoint_lock);
+
+        table->transaction = private->transaction;
+        table->access_mode = private->access_mode;
+        table->dbflags = private->dbflags;
+        table->this = this;
+
+        lru_limit = dict_get (options, "lru-limit");
+
+        /* TODO: set max lockers and max txns to accomodate
+         * for more than lru_limit */
+        if (lru_limit) {
+                table->lru_limit = strtol (lru_limit->data, NULL, 0);
+                gf_log ("bdb-ll",
+                        GF_LOG_DEBUG,
+                        "setting bctx lru limit to %d", table->lru_limit);
+        } else {
+                table->lru_limit = BDB_DEFAULT_LRU_LIMIT;
+        }
+
+        page_size = dict_get (options, "page-size");
+
+        if (page_size == NULL) {
+                table->page_size = BDB_LL_PAGE_SIZE_DEFAULT;
+        } else if (gf_string2bytesize (page_size->data,
+                                       &table->page_size) != 0) {
+                gf_log ("bdb-ll",
+                        GF_LOG_ERROR,
+                        "invalid number format \"%s\""
+                        " of \"option page-size\"",
+                        page_size->data);
+                /* never carry an unparsed value forward */
+                table->page_size = BDB_LL_PAGE_SIZE_DEFAULT;
+        } else if (!(table->page_size >= BDB_LL_PAGE_SIZE_MIN &&
+                     table->page_size <= BDB_LL_PAGE_SIZE_MAX)) {
+                gf_log ("bdb-ll",
+                        GF_LOG_ERROR,
+                        "pagesize %s is out of range."
+                        "Allowed pagesize is between %d and %d",
+                        page_size->data,
+                        BDB_LL_PAGE_SIZE_MIN,
+                        BDB_LL_PAGE_SIZE_MAX);
+                /* an out-of-range size was reported; use the default instead */
+                table->page_size = BDB_LL_PAGE_SIZE_DEFAULT;
+        }
+
+        gf_log ("bdb-ll",
+                GF_LOG_DEBUG, "using page-size %"PRIu64,
+                table->page_size);
+
+        private->b_table = table;
+}
+
+/* BDB_DIRECTORY_INIT - pick the log directory, open the db environment and,
+ *                      for transactional setups, start the checkpoint thread.
+ *
+ * 'option directory' names the environment home; nothing is done when it is
+ * missing. 'option logdir' is optional: when absent the home directory is used
+ * for logs. When present the directory is created if needed, and if it still
+ * is not a directory afterwards the home directory is used instead.
+ *
+ * The environment handle is stored in private->b_table->dbenv, so the bctx
+ * table must already exist; if it does not, an error is logged and the helper
+ * returns without opening anything.
+ *
+ * NOTE(review): on bdb_dbenv_init() failure @private is released here, as the
+ *               original code did, although the caller still holds the pointer
+ *               (this->private). Confirm ownership with the xlator's init().
+ * NOTE(review): umask (000) permanently changes the process umask - confirm
+ *               this is intended.
+ */
+static inline void
+BDB_DIRECTORY_INIT (xlator_t *this,
+                    dict_t *options,
+                    struct bdb_private *private)
+{
+        data_t      *directory = NULL;
+        data_t      *logdir    = NULL;
+        int32_t      op_ret    = -1;
+        struct stat  stbuf     = {0};
+
+        directory = dict_get (options, "directory");
+        if (directory == NULL)
+                return;
+
+        if (private->b_table == NULL) {
+                /* nowhere to store the DB_ENV handle */
+                gf_log ("bdb-ll", GF_LOG_ERROR,
+                        "bctx table not initialized, cannot open db environment");
+                return;
+        }
+
+        logdir = dict_get (options, "logdir");
+
+        if (logdir == NULL) {
+                gf_log ("bdb-ll",
+                        GF_LOG_DEBUG,
+                        "using default logdir as database home");
+                private->logdir = strdup (directory->data);
+        } else {
+                private->logdir = strdup (logdir->data);
+                gf_log ("bdb-ll",
+                        GF_LOG_DEBUG,
+                        "using logdir: %s", private->logdir);
+                umask (000);
+                if (mkdir (private->logdir, 0777) == 0) {
+                        gf_log ("bdb-ll", GF_LOG_WARNING,
+                                "logdir specified (%s) not exists, created",
+                                private->logdir);
+                }
+
+                op_ret = stat (private->logdir, &stbuf);
+                if ((op_ret != 0) || !S_ISDIR (stbuf.st_mode)) {
+                        gf_log ("bdb-ll",
+                                GF_LOG_ERROR,
+                                "specified logdir doesn't exist, "
+                                "using default (environment home directory: %s)",
+                                directory->data);
+                        /* drop the unusable copy before replacing it */
+                        FREE (private->logdir);
+                        private->logdir = strdup (directory->data);
+                }
+        }
+
+        private->b_table->dbenv = bdb_dbenv_init (this, directory->data);
+
+        if (!private->b_table->dbenv) {
+                gf_log ("bdb-ll", GF_LOG_ERROR,
+                        "failed to initialize db environment");
+                FREE (private);
+                return;
+        }
+
+        if (private->transaction) {
+                /* all well, start the checkpointing thread */
+                LOCK_INIT (&private->active_lock);
+
+                LOCK (&private->active_lock);
+                private->active = 1;
+                UNLOCK (&private->active_lock);
+
+                op_ret = pthread_create (&private->checkpoint_thread, NULL,
+                                         bdb_checkpoint, this);
+                if (op_ret != 0) {
+                        /* pthread_create returns the error number directly */
+                        gf_log ("bdb-ll", GF_LOG_ERROR,
+                                "failed to create checkpoint thread: %s",
+                                strerror (op_ret));
+                }
+        }
+}
+
+/* BDB_DIR_MODE_INIT - set the mode reported for directories.
+ *
+ * 'option dir-mode' is parsed as an octal number. If it has trailing
+ * characters or fails IS_VALID_FILE_MODE(), or if the option is absent,
+ * DEFAULT_DIR_MODE is used. S_IFDIR is always or-ed into the stored value.
+ */
+static inline void
+BDB_DIR_MODE_INIT (xlator_t *this,
+                   dict_t *options,
+                   struct bdb_private *private)
+{
+        char   *tail   = NULL;
+        data_t *option = dict_get (options, "dir-mode");
+
+        if (option == NULL) {
+                private->dir_mode = DEFAULT_DIR_MODE;
+        } else {
+                private->dir_mode = strtol (option->data, &tail, 8);
+
+                if (!(*tail) && IS_VALID_FILE_MODE(private->dir_mode)) {
+                        gf_log (this->name,
+                                GF_LOG_DEBUG,
+                                "setting dir-mode to %o", private->dir_mode);
+                } else {
+                        gf_log (this->name,
+                                GF_LOG_DEBUG,
+                                "invalid dir-mode %o. setting to default %o",
+                                private->dir_mode,
+                                DEFAULT_DIR_MODE);
+                        private->dir_mode = DEFAULT_DIR_MODE;
+                }
+        }
+
+        private->dir_mode |= S_IFDIR;
+}
+
+/* BDB_FILE_MODE_INIT - set the modes reported for regular files and symlinks.
+ *
+ * 'option file-mode' is parsed as an octal number. If it has trailing
+ * characters or fails IS_VALID_FILE_MODE(), or if the option is absent,
+ * DEFAULT_FILE_MODE is used. The permission bits are stored twice: with
+ * S_IFLNK in private->symlink_mode and with S_IFREG in private->file_mode.
+ */
+static inline void
+BDB_FILE_MODE_INIT (xlator_t *this,
+                    dict_t *options,
+                    struct bdb_private *private)
+{
+        char   *tail   = NULL;
+        data_t *option = dict_get (options, "file-mode");
+
+        if (option == NULL) {
+                private->file_mode = DEFAULT_FILE_MODE;
+        } else {
+                private->file_mode = strtol (option->data, &tail, 8);
+
+                if (!(*tail) && IS_VALID_FILE_MODE(private->file_mode)) {
+                        gf_log (this->name,
+                                GF_LOG_DEBUG,
+                                "setting file-mode to %o", private->file_mode);
+                } else {
+                        gf_log (this->name,
+                                GF_LOG_DEBUG,
+                                "invalid file-mode %o. setting to default %o",
+                                private->file_mode,
+                                DEFAULT_FILE_MODE);
+                        private->file_mode = DEFAULT_FILE_MODE;
+                }
+        }
+
+        /* derive the symlink mode before the file-type bit is added */
+        private->symlink_mode = private->file_mode | S_IFLNK;
+        private->file_mode |= S_IFREG;
+}
+
+/* BDB_CHECKPOINT_TIMEOUT_INIT - set the interval (in seconds) between two
+ *                               checkpoints.
+ *
+ * Starts from BDB_DEFAULT_CHECKPOINT_TIMEOUT and overrides it with
+ * 'option checkpoint-timeout' when given. Values outside 5..60 only produce a
+ * warning; the configured value is still the one stored.
+ * NOTE(review): a zero or negative value is therefore accepted as well -
+ *               confirm whether it should be rejected.
+ */
+static inline void
+BDB_CHECKPOINT_TIMEOUT_INIT (xlator_t *this,
+                             dict_t *options,
+                             struct bdb_private *private)
+{
+        data_t *option = dict_get (options, "checkpoint-timeout");
+
+        private->checkpoint_timeout = BDB_DEFAULT_CHECKPOINT_TIMEOUT;
+
+        if (option == NULL) {
+                gf_log (this->name,
+                        GF_LOG_DEBUG,
+                        "setting checkpoint-timeout to default: %d seconds",
+                        private->checkpoint_timeout);
+                return;
+        }
+
+        private->checkpoint_timeout = strtol (option->data, NULL, 0);
+
+        if (private->checkpoint_timeout >= 5 && private->checkpoint_timeout <= 60) {
+                gf_log (this->name,
+                        GF_LOG_DEBUG,
+                        "setting checkpoint-timeout to %d seconds",
+                        private->checkpoint_timeout);
+                return;
+        }
+
+        gf_log (this->name,
+                GF_LOG_WARNING,
+                "checkpoint-timeout %d seconds too %s",
+                private->checkpoint_timeout,
+                (private->checkpoint_timeout < 5)?"low":"high");
+}
+
+/* BDB_LOCK_TIMEOUT_INIT - read 'option lock-timeout' into private->lock_timeout.
+ *
+ * Nothing is changed when the option is absent. A value above 4260000 is
+ * logged as out of range and replaced by 0.
+ */
+static inline void
+BDB_LOCK_TIMEOUT_INIT (xlator_t *this,
+                       dict_t *options,
+                       struct bdb_private *private)
+{
+        data_t *option = dict_get (options, "lock-timeout");
+
+        if (option == NULL)
+                return;
+
+        private->lock_timeout = strtol (option->data, NULL, 0);
+
+        if (private->lock_timeout <= 4260000) {
+                gf_log (this->name,
+                        GF_LOG_DEBUG,
+                        "setting lock-timeout to %d milliseconds",
+                        private->lock_timeout);
+                return;
+        }
+
+        /* db allows us to DB_SET_LOCK_TIMEOUT to be set to a
+         * maximum of 71 mins (4260000 milliseconds) */
+        gf_log (this->name,
+                GF_LOG_DEBUG,
+                "lock-timeout %d, out of range",
+                private->lock_timeout);
+        private->lock_timeout = 0;
+}
+
+/* BDB_TRANSACTION_TIMEOUT_INIT - read 'option transaction-timeout' into
+ *                                private->txn_timeout.
+ *
+ * Nothing is changed when the option is absent. A value above 4260000 is
+ * logged as out of range and replaced by 0.
+ */
+static inline void
+BDB_TRANSACTION_TIMEOUT_INIT (xlator_t *this,
+                              dict_t *options,
+                              struct bdb_private *private)
+{
+        data_t *option = dict_get (options, "transaction-timeout");
+
+        if (option == NULL)
+                return;
+
+        private->txn_timeout = strtol (option->data, NULL, 0);
+
+        if (private->txn_timeout <= 4260000) {
+                gf_log (this->name,
+                        GF_LOG_DEBUG,
+                        "setting transaction-timeout to %d milliseconds",
+                        private->txn_timeout);
+                return;
+        }
+
+        /* db allows us to DB_SET_TXN_TIMEOUT to be set to a maximum
+         * of 71 mins (4260000 milliseconds) */
+        gf_log (this->name,
+                GF_LOG_DEBUG,
+                "transaction-timeout %d, out of range",
+                private->txn_timeout);
+        private->txn_timeout = 0;
+}
+
+/* BDB_TRANSACTION_INIT - choose between transactional and non-transactional
+ *                        operation based on 'option mode'.
+ *
+ * Only the literal value "off" disables transactions; a missing option or any
+ * other value selects the transactional setup. The choice determines
+ * private->transaction and private->envflags; private->dbflags is the same in
+ * both cases.
+ */
+static inline void
+BDB_TRANSACTION_INIT (xlator_t *this,
+                      dict_t *options,
+                      struct bdb_private *private)
+{
+        data_t *mode    = dict_get (options, "mode");
+        int     txn_off = (mode != NULL) && (strcmp (mode->data, "off") == 0);
+
+        if (!txn_off) {
+                gf_log (this->name,
+                        GF_LOG_DEBUG,
+                        "persistant mode selected");
+                private->transaction = ON;
+                private->envflags = DB_CREATE | DB_INIT_LOCK | DB_INIT_LOG |
+                        DB_INIT_MPOOL | DB_INIT_TXN | DB_RECOVER | DB_THREAD;
+        } else {
+                gf_log (this->name,
+                        GF_LOG_DEBUG,
+                        "cache mode selected");
+                private->transaction = OFF;
+                private->envflags = DB_CREATE | DB_INIT_LOG |
+                        DB_INIT_MPOOL | DB_THREAD;
+        }
+
+        private->dbflags = DB_CREATE | DB_THREAD;
+}
+
+/* BDB_ACCESS_MODE_INIT - select the db access method from 'option access-mode'.
+ *
+ * Only the literal value "btree" selects DB_BTREE; a missing option or any
+ * other value selects DB_HASH.
+ */
+static inline void
+BDB_ACCESS_MODE_INIT (xlator_t *this,
+                      dict_t *options,
+                      struct bdb_private *private)
+{
+        data_t *option = dict_get (options, "access-mode");
+
+        if ((option == NULL) || (strcmp (option->data, "btree") != 0)) {
+                gf_log (this->name,
+                        GF_LOG_DEBUG,
+                        "using access mode HASH");
+                private->access_mode = DB_HASH;
+                return;
+        }
+
+        gf_log (this->name,
+                GF_LOG_DEBUG,
+                "using access mode BTREE");
+        private->access_mode = DB_BTREE;
+}
+
+
+/* bdb_db_init - initialize bdb xlator
+ *
+ * reads the options from @options dictionary and sets appropriate values in
+ * @this->private, then initializes DB_ENV. The helpers run in a fixed order:
+ * the bctx table copies settings made by the access-mode and transaction
+ * helpers, and the directory helper stores the DB_ENV handle in that table.
+ *
+ * return: 0 on success, or -1 if the bctx table could not be set up (the
+ *         failure is logged through gf_log()).
+ *
+ * NOTE(review): the table check assumes this->private was zero-filled by the
+ *               caller, so that b_table is NULL when BDB_TABLE_INIT fails -
+ *               confirm in the xlator's init().
+ * NOTE(review): failures inside BDB_DIRECTORY_INIT cannot be reported from
+ *               here, because that helper returns nothing and may release
+ *               @this->private.
+ */
+int
+bdb_db_init (xlator_t *this,
+             dict_t *options)
+{
+        bdb_private_t *private = NULL;
+
+        private = this->private;
+
+        BDB_CACHE_INIT (this, options, private);
+
+        BDB_ACCESS_MODE_INIT (this, options, private);
+
+        BDB_TRANSACTION_INIT (this, options, private);
+
+        BDB_TRANSACTION_TIMEOUT_INIT (this, options, private);
+
+        BDB_LOCK_TIMEOUT_INIT (this, options, private);
+
+        {
+                LOCK_INIT (&private->ino_lock);
+                private->next_ino = 2;
+        }
+
+        BDB_CHECKPOINT_TIMEOUT_INIT (this, options, private);
+
+        BDB_FILE_MODE_INIT (this, options, private);
+
+        BDB_DIR_MODE_INIT (this, options, private);
+
+        BDB_TABLE_INIT (this, options, private);
+
+        if (private->b_table == NULL) {
+                /* BDB_DIRECTORY_INIT dereferences the table; stop here */
+                gf_log (this->name,
+                        GF_LOG_ERROR,
+                        "failed to initialize bctx table");
+                return -1;
+        }
+
+        BDB_ERRFILE_INIT (this, options, private);
+
+        BDB_LOG_REMOVE_INIT (this, options, private);
+
+        BDB_DIRECTORY_INIT (this, options, private);
+
+        return 0;
+}
diff --git a/xlators/storage/bdb/src/bdb.c b/xlators/storage/bdb/src/bdb.c
new file mode 100644
index 00000000000..e820e867a94
--- /dev/null
+++ b/xlators/storage/bdb/src/bdb.c
@@ -0,0 +1,3371 @@
+/*
+ Copyright (c) 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+/* bdb based storage translator - named as 'bdb' translator
+ *
+ *
+ * There can be only two modes for files existing on bdb translator:
+ * 1. DIRECTORY - directories are stored by bdb as regular directories on background
+ * file-system. directories also have an entry in the ns_db.db of their parent directory.
+ * 2. REGULAR FILE - regular files are stored as records in the storage_db.db present in
+ * the directory. regular files also have an entry in ns_db.db
+ *
+ * Internally bdb has a maximum of three different types of logical files associated with
+ * each directory:
+ * 1. storage_db.db - storage database, used to store the data corresponding to regular
+ * files in the form of key/value pair. file-name is the 'key' and data
+ * is 'value'.
+ * 2. directory (all subdirectories) - any subdirectory will have a regular directory entry.
+ */
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#define __XOPEN_SOURCE 500
+
+#include <stdint.h>
+#include <sys/time.h>
+#include <errno.h>
+#include <ftw.h>
+#include <libgen.h>
+
+#include "glusterfs.h"
+#include "dict.h"
+#include "logging.h"
+#include "bdb.h"
+#include "xlator.h"
+#include "defaults.h"
+#include "common-utils.h"
+
+/* to be used only by fops, nobody else */
+#define BDB_ENV(this) ((((struct bdb_private *)this->private)->b_table)->dbenv)
+#define B_TABLE(this) (((struct bdb_private *)this->private)->b_table)
+
+
+/* bdb_mknod - create an empty regular file as a record in the parent
+ *             directory's storage db.
+ *
+ * @mode: only regular files are supported; anything else unwinds with EPERM.
+ * @dev:  not used.
+ *
+ * The stat of the parent's storage db file is used as a template for the
+ * reply: inode number, mode, size and block count are then overwritten.
+ * bdb_db_put() returning 0 is treated as success, as in bdb_create().
+ *
+ * unwinds with (op_ret, op_errno, loc->inode, &stbuf):
+ *   0 / 0        - entry created
+ *   -1 / EPERM   - @mode is not a regular file
+ *   -1 / ENOENT  - no bctx for the parent directory, or the db put failed
+ *   -1 / errno   - lstat() on the storage db failed
+ */
+int32_t
+bdb_mknod (call_frame_t *frame,
+           xlator_t *this,
+           loc_t *loc,
+           mode_t mode,
+           dev_t dev)
+{
+        int32_t op_ret = -1;
+        int32_t op_errno = EINVAL;
+        char *key_string = NULL; /* after translating loc->path to DB key */
+        char *db_path = NULL;
+        bctx_t *bctx = NULL;
+        struct stat stbuf = {0,};
+
+        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
+        GF_VALIDATE_OR_GOTO ("bdb", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, loc, out);
+
+        if (!S_ISREG(mode)) {
+                gf_log (this->name,
+                        GF_LOG_DEBUG,
+                        "mknod for non-regular file");
+                op_ret = -1;
+                op_errno = EPERM;
+                goto out;
+        } /* if(!S_ISREG(mode)) */
+
+        bctx = bctx_parent (B_TABLE(this), loc->path);
+
+        if (bctx == NULL) {
+                gf_log (this->name,
+                        GF_LOG_ERROR,
+                        "failed to get bctx for path: %s", loc->path);
+                op_ret = -1;
+                op_errno = ENOENT;
+                goto out;
+        } /* if(bctx == NULL) */
+
+        MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bctx->directory);
+
+        op_ret = lstat (db_path, &stbuf);
+        op_errno = errno;
+        if (op_ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to lstat on %s (%s)",
+                        db_path, strerror (op_errno));
+                goto out;
+        }
+
+        MAKE_KEY_FROM_PATH (key_string, loc->path);
+        op_ret = bdb_db_put (bctx, NULL, key_string, NULL, 0, 0, 0);
+        if (op_ret == 0) {
+                /* create successful */
+                stbuf.st_ino = bdb_inode_transform (stbuf.st_ino, bctx);
+                stbuf.st_mode = mode;
+                stbuf.st_size = 0;
+                stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize);
+                op_errno = 0;
+        } else {
+                gf_log (this->name,
+                        GF_LOG_ERROR,
+                        "bdb_db_put() failed for path: %s", loc->path);
+                op_ret = -1;
+                op_errno = ENOENT;
+        }/* if (!op_ret)...else */
+
+out:
+        if (bctx) {
+                /* NOTE: bctx_unref always returns success,
+                 * see description of bctx_unref for more details */
+                bctx_unref (bctx);
+        }
+
+        frame->root->rsp_refs = NULL;
+
+        STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf);
+        return 0;
+}
+
+/* is_dir_empty - tell whether the directory at @loc holds no entries.
+ *
+ * Two things are checked: the key count reported by the directory's db, and
+ * the on-disk directory, where anything other than bdb private files and
+ * "."/".." counts as an entry.
+ *
+ * return: 1 if the directory is considered empty, 0 otherwise.
+ *         1 is also returned when no bctx or no db stat can be obtained;
+ *         0 is returned when opendir() fails or the access mode is DB_UNKNOWN.
+ *
+ * NOTE(review): the buffer returned by bdb_db_stat() is not released here -
+ *               confirm who owns it.
+ */
+static inline int32_t
+is_dir_empty (xlator_t *this,
+              loc_t *loc)
+{
+        int32_t        ret       = 1;
+        bctx_t        *bctx      = NULL;
+        DIR           *dir       = NULL;
+        char          *real_path = NULL;
+        void          *dbstat    = NULL;
+        struct dirent *entry     = NULL;
+
+        GF_VALIDATE_OR_GOTO ("bdb", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, loc, out);
+
+        bctx = bctx_lookup (B_TABLE(this), loc->path);
+        if (!bctx) {
+                gf_log (this->name,
+                        GF_LOG_DEBUG,
+                        "failed to get bctx from inode for dir: %s,"
+                        "assuming empty directory",
+                        loc->path);
+                ret = 1;
+                goto out;
+        }
+
+        dbstat = bdb_db_stat (bctx, NULL, 0);
+        if (!dbstat) {
+                gf_log (this->name,
+                        GF_LOG_ERROR,
+                        "failed to get db stat for db at path: %s", loc->path);
+                ret = 1;
+                goto out;
+        }
+
+        /* the layout of the stat buffer depends on the access method */
+        if (bctx->table->access_mode == DB_HASH) {
+                ret = (((DB_HASH_STAT *)dbstat)->hash_nkeys == 0);
+        } else if ((bctx->table->access_mode == DB_BTREE) ||
+                   (bctx->table->access_mode == DB_RECNO)) {
+                ret = (((DB_BTREE_STAT *)dbstat)->bt_nkeys == 0);
+        } else if (bctx->table->access_mode == DB_QUEUE) {
+                ret = (((DB_QUEUE_STAT *)dbstat)->qs_nkeys == 0);
+        } else if (bctx->table->access_mode == DB_UNKNOWN) {
+                gf_log (this->name,
+                        GF_LOG_CRITICAL,
+                        "unknown access-mode set for db");
+                ret = 0;
+        }
+
+        MAKE_REAL_PATH (real_path, this, loc->path);
+        dir = opendir (real_path);
+        if (!dir) {
+                gf_log (this->name,
+                        GF_LOG_DEBUG,
+                        "failed to opendir(%s)", loc->path);
+                ret = 0;
+                goto out;
+        }
+
+        for (entry = readdir (dir); entry != NULL; entry = readdir (dir)) {
+                if (IS_BDB_PRIVATE_FILE(entry->d_name) ||
+                    IS_DOT_DOTDOT(entry->d_name))
+                        continue;
+
+                gf_log (this->name,
+                        GF_LOG_DEBUG,
+                        "directory (%s) not empty, has a non-db entry",
+                        loc->path);
+                ret = 0;
+                break;
+        }
+        closedir (dir);
+out:
+        if (bctx) {
+                /* NOTE: bctx_unref always returns success,
+                 * see description of bctx_unref for more details */
+                bctx_unref (bctx);
+        }
+
+        return ret;
+}
+
+/* bdb_rename - rename @oldloc to @newloc.
+ *
+ * The work depends on the type recorded in oldloc->inode->st_mode:
+ *   regular file - the record is read from the old parent's db, deleted there
+ *                  and written under the new key in the new parent's db, all
+ *                  inside one db transaction. An on-disk entry already present
+ *                  at the destination is unlinked first; a directory there
+ *                  yields EISDIR.
+ *   symlink      - renamed on disk with rename(2); a db record under the new
+ *                  key is deleted first when the destination is not on disk.
+ *   directory    - the directory's db is moved aside to a temporary name, the
+ *                  directory is renamed on disk and the db is moved to the new
+ *                  location.
+ * Anything else unwinds with EPERM.
+ *
+ * unwinds with (op_ret, op_errno, &stbuf).
+ *
+ * NOTE(review): paths that 'goto out' after obtaining oldbctx/newbctx/tmpbctx
+ *               skip bctx_unref(), since 'out' only unwinds the frame.
+ * NOTE(review): 'buf' filled by bdb_db_get() is never released in this
+ *               function - confirm ownership.
+ */
+int32_t
+bdb_rename (call_frame_t *frame,
+ xlator_t *this,
+ loc_t *oldloc,
+ loc_t *newloc)
+{
+ struct bdb_private *private = NULL;
+ bctx_table_t *table = NULL;
+ bctx_t *oldbctx = NULL;
+ bctx_t *newbctx = NULL;
+ bctx_t *tmpbctx = NULL;
+ int32_t op_ret = -1;
+ int32_t op_errno = ENOENT;
+ int32_t read_size = 0;
+ struct stat stbuf = {0,};
+ struct stat old_stbuf = {0,};
+ DB_TXN *txnid = NULL;
+ char *real_newpath = NULL;
+ char *real_oldpath = NULL;
+ char *oldkey = NULL;
+ char *newkey = NULL;
+ char *buf = NULL; /* pointer to temporary buffer, where
+ * the contents of a file are read, if
+ * file being renamed is a regular file */
+ char *real_db_newpath = NULL;
+ char *tmp_db_newpath = NULL;
+
+ GF_VALIDATE_OR_GOTO ("bdb", frame, out);
+ GF_VALIDATE_OR_GOTO ("bdb", this, out);
+ GF_VALIDATE_OR_GOTO (this->name, newloc, out);
+ GF_VALIDATE_OR_GOTO (this->name, oldloc, out);
+
+ private = this->private;
+ table = private->b_table;
+
+ MAKE_REAL_PATH (real_oldpath, this, oldloc->path);
+
+ if (S_ISREG (oldloc->inode->st_mode)) {
+ /* NOTE(review): oldbctx is used below without a NULL check */
+ oldbctx = bctx_parent (B_TABLE(this), oldloc->path);
+ MAKE_REAL_PATH (real_newpath, this, newloc->path);
+
+ op_ret = lstat (real_newpath, &stbuf);
+
+ if ((op_ret == 0) && (S_ISDIR (stbuf.st_mode))) {
+ op_ret = -1;
+ op_errno = EISDIR;
+ goto out;
+ }
+ if (op_ret == 0) {
+ /* destination is a symlink */
+ MAKE_KEY_FROM_PATH (oldkey, oldloc->path);
+ MAKE_KEY_FROM_PATH (newkey, newloc->path);
+
+ op_ret = unlink (real_newpath);
+ op_errno = errno;
+ if (op_ret != 0) {
+ gf_log (this->name,
+ GF_LOG_ERROR,
+ "failed to unlink %s (%s)",
+ newloc->path, strerror (op_errno));
+ goto out;
+ }
+ newbctx = bctx_parent (B_TABLE (this), newloc->path);
+ GF_VALIDATE_OR_GOTO (this->name, newbctx, out);
+
+ /* NOTE(review): the result of bdb_txn_begin() is not checked; it is
+ * also what op_ret still holds if bdb_db_get() fails below */
+ op_ret = bdb_txn_begin (BDB_ENV(this), &txnid);
+
+ /* move the record: read, delete the old key, store under the new
+ * key; the first failing step aborts the transaction */
+ if ((read_size =
+ bdb_db_get (oldbctx, txnid, oldkey, &buf, 0, 0)) < 0) {
+ bdb_txn_abort (txnid);
+ } else if ((op_ret =
+ bdb_db_del (oldbctx, txnid, oldkey)) != 0) {
+ bdb_txn_abort (txnid);
+ } else if ((op_ret = bdb_db_put (newbctx, txnid,
+ newkey, buf,
+ read_size, 0, 0)) != 0) {
+ bdb_txn_abort (txnid);
+ } else {
+ bdb_txn_commit (txnid);
+ }
+
+ /* NOTE: bctx_unref always returns success,
+ * see description of bctx_unref for more details */
+ bctx_unref (newbctx);
+ } else {
+ /* destination doesn't exist or a regular file */
+ MAKE_KEY_FROM_PATH (oldkey, oldloc->path);
+ MAKE_KEY_FROM_PATH (newkey, newloc->path);
+
+ newbctx = bctx_parent (B_TABLE (this), newloc->path);
+ GF_VALIDATE_OR_GOTO (this->name, newbctx, out);
+
+ op_ret = bdb_txn_begin (BDB_ENV(this), &txnid);
+
+ /* same read / delete / put sequence as in the branch above */
+ if ((read_size = bdb_db_get (oldbctx, txnid,
+ oldkey, &buf,
+ 0, 0)) < 0) {
+ bdb_txn_abort (txnid);
+ } else if ((op_ret = bdb_db_del (oldbctx,
+ txnid, oldkey)) != 0) {
+ bdb_txn_abort (txnid);
+ } else if ((op_ret = bdb_db_put (newbctx, txnid,
+ newkey, buf,
+ read_size, 0, 0)) != 0) {
+ bdb_txn_abort (txnid);
+ } else {
+ bdb_txn_commit (txnid);
+ }
+
+ /* NOTE: bctx_unref always returns success,
+ * see description of bctx_unref for more details */
+ bctx_unref (newbctx);
+ }
+ /* NOTE: bctx_unref always returns success,
+ * see description of bctx_unref for more details */
+ bctx_unref (oldbctx);
+ } else if (S_ISLNK (oldloc->inode->st_mode)) {
+ MAKE_REAL_PATH (real_newpath, this, newloc->path);
+ op_ret = lstat (real_newpath, &stbuf);
+ if ((op_ret == 0) && (S_ISDIR (stbuf.st_mode))) {
+ op_ret = -1;
+ op_errno = EISDIR;
+ goto out;
+ }
+
+ if (op_ret == 0){
+ /* destination exists and is also a symlink */
+ MAKE_REAL_PATH (real_oldpath, this, oldloc->path);
+ op_ret = rename (real_oldpath, real_newpath);
+ op_errno = errno;
+
+ if (op_ret != 0) {
+ gf_log (this->name,
+ GF_LOG_ERROR,
+ "failed to rename symlink %s (%s)",
+ oldloc->path, strerror (op_errno));
+ }
+ goto out;
+ }
+
+ /* destination doesn't exist */
+ MAKE_REAL_PATH (real_oldpath, this, oldloc->path);
+ MAKE_KEY_FROM_PATH (newkey, newloc->path);
+ newbctx = bctx_parent (B_TABLE (this), newloc->path);
+ GF_VALIDATE_OR_GOTO (this->name, newbctx, out);
+
+ /* txnid is still NULL here: no transaction is begun in this branch */
+ op_ret = bdb_db_del (newbctx, txnid, newkey);
+ if (op_ret != 0) {
+ /* no problem */
+ }
+ op_ret = rename (real_oldpath, real_newpath);
+ op_errno = errno;
+ if (op_ret != 0) {
+ gf_log (this->name,
+ GF_LOG_ERROR,
+ "failed to rename %s to %s (%s)",
+ oldloc->path, newloc->path, strerror (op_errno));
+ goto out;
+ }
+ /* NOTE: bctx_unref always returns success,
+ * see description of bctx_unref for more details */
+ bctx_unref (newbctx);
+ /* NOTE(review): old_stbuf is zero-initialized above and nothing visible in
+ * this function fills it in, so st_nlink == 2 looks like it can never hold
+ * and directories would fall through to the EPERM branch - confirm */
+ } else if (S_ISDIR (oldloc->inode->st_mode) &&
+ (old_stbuf.st_nlink == 2)) {
+
+ tmp_db_newpath = tempnam (private->export_path, "rename_temp");
+ GF_VALIDATE_OR_GOTO (this->name, tmp_db_newpath, out);
+
+ MAKE_REAL_PATH (real_newpath, this, newloc->path);
+
+ MAKE_REAL_PATH_TO_STORAGE_DB (real_db_newpath, this, newloc->path);
+
+ oldbctx = bctx_lookup (B_TABLE(this), oldloc->path);
+ op_ret = -1;
+ op_errno = EINVAL;
+ GF_VALIDATE_OR_GOTO (this->name, oldbctx, out);
+
+ op_ret = lstat (real_newpath, &stbuf);
+ if ((op_ret == 0) &&
+ S_ISDIR (stbuf.st_mode) &&
+ is_dir_empty (this, newloc)) {
+
+ /* destination is an existing, empty directory */
+ tmpbctx = bctx_rename (oldbctx, tmp_db_newpath);
+ op_ret = -1;
+ op_errno = ENOENT;
+ GF_VALIDATE_OR_GOTO (this->name, tmpbctx, out);
+
+ op_ret = rename (real_oldpath, real_newpath);
+ op_errno = errno;
+ if (op_ret != 0) {
+ gf_log (this->name,
+ GF_LOG_ERROR,
+ "rename directory %s to %s failed: %s",
+ oldloc->path, newloc->path,
+ strerror (errno));
+ op_ret = bdb_db_rename (table,
+ tmp_db_newpath,
+ oldbctx->db_path);
+ if (op_ret != 0) {
+ gf_log (this->name,
+ GF_LOG_ERROR,
+ "renaming temp database back to old db failed"
+ " for directory %s", oldloc->path);
+ goto out;
+ } else {
+ /* this is a error case, set op_errno & op_ret */
+ op_ret = -1;
+ op_errno = ENOENT; /* TODO: errno */
+ }
+ }
+ /* NOTE(review): this also runs after a failed rename() whose db was
+ * just moved back, and overwrites the op_ret set above - confirm */
+ op_ret = bdb_db_rename (table, tmp_db_newpath, real_db_newpath);
+ if (op_ret != 0) {
+ gf_log (this->name,
+ GF_LOG_ERROR,
+ "renaming temp database to new db failed"
+ " for directory %s", oldloc->path);
+ goto out;
+ }
+ } else if ((op_ret != 0) && (errno == ENOENT)) {
+ /* destination does not exist */
+ /* NOTE(review): overwrites the tempnam() result obtained above without
+ * releasing it */
+ tmp_db_newpath = tempnam (private->export_path, "rename_temp");
+ GF_VALIDATE_OR_GOTO (this->name, tmp_db_newpath, out);
+
+ tmpbctx = bctx_rename (oldbctx, tmp_db_newpath);
+ op_ret = -1;
+ op_errno = ENOENT;
+ GF_VALIDATE_OR_GOTO (this->name, tmpbctx, out);
+
+ op_ret = rename (real_oldpath, real_newpath);
+ op_errno = errno;
+ if (op_ret != 0) {
+ gf_log (this->name,
+ GF_LOG_ERROR,
+ "rename directory %s to %s failed: %s",
+ oldloc->path, newloc->path,
+ strerror (errno));
+ op_ret = bdb_db_rename (table,
+ tmp_db_newpath,
+ oldbctx->db_path);
+ if (op_ret != 0) {
+ gf_log (this->name,
+ GF_LOG_ERROR,
+ "renaming temp database back to old db failed"
+ " for directory %s", oldloc->path);
+ goto out;
+ } else {
+ /* this is a error case, set op_errno & op_ret */
+ op_ret = -1;
+ op_errno = ENOENT; /* TODO: errno */
+ }
+ } else {
+ op_ret = bdb_db_rename (table,
+ tmp_db_newpath,
+ real_db_newpath);
+ if (op_ret != 0) {
+ gf_log (this->name,
+ GF_LOG_ERROR,
+ "renaming temp database to new db failed"
+ " for directory %s", oldloc->path);
+ goto out;
+ } else {
+ /* NOTE(review): both renames succeeded on this path, yet the result
+ * is reported as -1/ENOENT - confirm this is not a copy-paste slip */
+ /* this is a error case, set op_errno & op_ret */
+ op_ret = -1;
+ op_errno = ENOENT; /* TODO: errno */
+ }
+ }
+ }
+ } else {
+ gf_log (this->name,
+ GF_LOG_CRITICAL,
+ "rename called on non-existent file type");
+ op_ret = -1;
+ op_errno = EPERM;
+ }
+
+out:
+ frame->root->rsp_refs = NULL;
+ STACK_UNWIND (frame, op_ret, op_errno, &stbuf);
+ return 0;
+}
+
+/* bdb_link - hard links are not supported by this translator.
+ *
+ * @oldloc and @newloc are ignored; the call always unwinds with -1 / EPERM
+ * and no inode or stat data.
+ */
+int32_t
+bdb_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc)
+{
+        frame->root->rsp_refs = NULL;
+
+        STACK_UNWIND (frame, -1, EPERM, NULL, NULL);
+
+        return 0;
+}
+
+/* is_space_left - check whether @size more bytes fit on the export filesystem
+ *                 while keeping BDB_ENOSPC_THRESHOLD blocks in reserve.
+ *
+ * @size: number of bytes about to be written.
+ *
+ * The request is rounded up to whole fragments (f_frsize) and compared with
+ * the free blocks left after subtracting the reserve.
+ *
+ * return: 1 if the request fits, 0 if it does not or if statvfs() fails.
+ */
+int32_t
+is_space_left (xlator_t *this,
+               size_t size)
+{
+        struct bdb_private *private = this->private;
+        struct statvfs stbuf = {0,};
+        int32_t ret = -1;
+        fsblkcnt_t req_blocks = 0;
+        fsblkcnt_t usable_blocks = 0;
+
+        ret = statvfs (private->export_path, &stbuf);
+        if (ret != 0) {
+                gf_log (this->name,
+                        GF_LOG_ERROR,
+                        "failed to do statvfs on %s", private->export_path);
+                return 0;
+        }
+
+        req_blocks = (size / stbuf.f_frsize) + 1;
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "requested size: %"GF_PRI_SIZET"\nfree blocks: %"PRIu64"\nblock size: %lu\nfrag size: %lu",
+                size, stbuf.f_bfree, stbuf.f_bsize, stbuf.f_frsize);
+
+        /* fsblkcnt_t is unsigned: once the free count drops to the reserve
+         * the subtraction below would wrap to a huge value and report
+         * plenty of space */
+        if (stbuf.f_bfree <= BDB_ENOSPC_THRESHOLD)
+                return 0;
+
+        usable_blocks = (stbuf.f_bfree - BDB_ENOSPC_THRESHOLD);
+
+        return (req_blocks < usable_blocks) ? 1 : 0;
+}
+
+/* bdb_create - create an empty regular file and open it on @fd.
+ *
+ * An empty record keyed by the file name is stored in the parent directory's
+ * storage db, and a bdb_fd holding the parent's bctx and a copy of the key is
+ * attached to @fd. @flags and @mode are not consulted; the reported mode is
+ * private->file_mode.
+ *
+ * The bctx reference taken by bctx_parent() is handed over to the bdb_fd on
+ * success. On any failure the reference and a partially built bdb_fd are
+ * released before unwinding.
+ *
+ * unwinds with (op_ret, op_errno, fd, loc->inode, &stbuf); op_ret is 0 on
+ * success.
+ */
+int32_t
+bdb_create (call_frame_t *frame,
+            xlator_t *this,
+            loc_t *loc,
+            int32_t flags,
+            mode_t mode,
+            fd_t *fd)
+{
+        int32_t op_ret = -1;
+        int32_t op_errno = EPERM;
+        char *db_path = NULL;
+        struct stat stbuf = {0,};
+        bctx_t *bctx = NULL;
+        struct bdb_private *private = NULL;
+        char *key_string = NULL;
+        struct bdb_fd *bfd = NULL;
+
+        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
+        GF_VALIDATE_OR_GOTO ("bdb", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, loc, out);
+        GF_VALIDATE_OR_GOTO (this->name, fd, out);
+
+        private = this->private;
+
+        bctx = bctx_parent (B_TABLE(this), loc->path);
+        op_errno = ENOENT;
+        GF_VALIDATE_OR_GOTO (this->name, bctx, out);
+
+        MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bctx->directory);
+        op_ret = lstat (db_path, &stbuf);
+        op_errno = errno;
+        if (op_ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to lstat on %s (%s)",
+                        db_path, strerror (op_errno));
+                goto out;
+        }
+
+        MAKE_KEY_FROM_PATH (key_string, loc->path);
+        op_ret = bdb_db_put (bctx, NULL, key_string, NULL, 0, 0, 0);
+        op_errno = EINVAL;
+        GF_VALIDATE_OR_GOTO (this->name, (op_ret == 0), out);
+
+        /* create successful */
+        bfd = CALLOC (1, sizeof (*bfd));
+        op_ret = -1;
+        op_errno = ENOMEM;
+        GF_VALIDATE_OR_GOTO (this->name, bfd, out);
+
+        /* NOTE: bctx_parent () returns bctx with a ref */
+        bfd->ctx = bctx;
+        bfd->key = strdup (key_string);
+        op_ret = -1;
+        op_errno = ENOMEM;
+        GF_VALIDATE_OR_GOTO (this->name, bfd->key, out);
+
+        BDB_SET_BFD (this, fd, bfd);
+
+        stbuf.st_ino = bdb_inode_transform (stbuf.st_ino, bctx);
+        stbuf.st_mode = private->file_mode;
+        stbuf.st_size = 0;
+        stbuf.st_nlink = 1;
+        stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize);
+        op_ret = 0;
+        op_errno = 0;
+out:
+        if (op_ret != 0) {
+                /* the fd never took ownership: release what was acquired */
+                if (bfd) {
+                        FREE (bfd);
+                }
+                if (bctx) {
+                        /* NOTE: bctx_unref always returns success,
+                         * see description of bctx_unref for more details */
+                        bctx_unref (bctx);
+                }
+        }
+
+        frame->root->rsp_refs = NULL;
+        STACK_UNWIND (frame, op_ret, op_errno, fd, loc->inode, &stbuf);
+
+        return 0;
+}
+
+
+/* bdb_open
+ *
+ * as input parameters bdb_open gets the file name, i.e key. bdb_open
+ * effectively does: look up the parent directory's bctx, store a copy of the
+ * key, and attach both to @fd as a bdb_fd. @flags is not consulted.
+ *
+ * The bctx reference taken by bctx_parent() is handed over to the bdb_fd on
+ * success. On any failure the reference and a partially built bdb_fd are
+ * released before unwinding.
+ *
+ * unwinds with (op_ret, op_errno, fd):
+ *   0            - fd is ready
+ *   -1 / EBADFD  - no bctx for the parent directory
+ *   -1 / ENOMEM  - allocation of the bdb_fd or of the key copy failed
+ */
+int32_t
+bdb_open (call_frame_t *frame,
+          xlator_t *this,
+          loc_t *loc,
+          int32_t flags,
+          fd_t *fd)
+{
+        int32_t op_ret = -1;
+        int32_t op_errno = EINVAL;
+        bctx_t *bctx = NULL;
+        char *key_string = NULL;
+        struct bdb_fd *bfd = NULL;
+
+        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
+        GF_VALIDATE_OR_GOTO ("bdb", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, loc, out);
+        GF_VALIDATE_OR_GOTO (this->name, fd, out);
+
+        bctx = bctx_parent (B_TABLE(this), loc->path);
+        op_errno = EBADFD;
+        GF_VALIDATE_OR_GOTO (this->name, bctx, out);
+
+        bfd = CALLOC (1, sizeof (*bfd));
+        op_errno = ENOMEM;
+        GF_VALIDATE_OR_GOTO (this->name, bfd, out);
+
+        /* NOTE: bctx_parent () returns bctx with a ref */
+        bfd->ctx = bctx;
+
+        MAKE_KEY_FROM_PATH (key_string, loc->path);
+        bfd->key = strdup (key_string);
+        op_ret = -1;
+        op_errno = ENOMEM;
+        GF_VALIDATE_OR_GOTO (this->name, bfd->key, out);
+
+        BDB_SET_BFD (this, fd, bfd);
+        op_ret = 0;
+out:
+        if (op_ret != 0) {
+                /* the fd never took ownership: release what was acquired */
+                if (bfd) {
+                        FREE (bfd);
+                }
+                if (bctx) {
+                        /* NOTE: bctx_unref always returns success,
+                         * see description of bctx_unref for more details */
+                        bctx_unref (bctx);
+                }
+        }
+
+        frame->root->rsp_refs = NULL;
+        STACK_UNWIND (frame, op_ret, op_errno, fd);
+
+        return 0;
+}
+
+/* bdb_readv - read up to @size bytes at @offset from the record behind @fd.
+ *
+ * The data is fetched with bdb_db_get() using the bctx and key stored in the
+ * fd's bdb_fd. The buffer is wrapped in a data_t inside a reply dict that is
+ * referenced from frame->root->rsp_refs for the duration of the unwind.
+ *
+ * unwinds with (op_ret, op_errno, &vec, 1, &stbuf):
+ *   op_ret > 0   - number of bytes placed in vec (never more than @size)
+ *   op_ret == 0  - bdb_db_get() returned no data
+ *   -1 / EBADFD  - no bdb_fd attached to @fd
+ *   -1 / errno   - lstat() on the storage db failed
+ *   -1 / ENOENT  - bdb_db_get() failed
+ *   -1 / ENOMEM  - reply containers could not be allocated
+ *
+ * NOTE(review): stbuf.st_size is set to the number of bytes read, not to the
+ *               size of the whole record - confirm callers expect that.
+ * NOTE(review): 'buf' (and buf_data) are not released when a later
+ *               allocation fails - confirm ownership rules of bdb_db_get()
+ *               and get_new_data().
+ */
+int32_t
+bdb_readv (call_frame_t *frame,
+           xlator_t *this,
+           fd_t *fd,
+           size_t size,
+           off_t offset)
+{
+        int32_t op_ret = -1;
+        int32_t op_errno = EINVAL;
+        struct iovec vec = {0,};
+        struct stat stbuf = {0,};
+        struct bdb_fd *bfd = NULL;
+        dict_t *reply_dict = NULL;
+        char *buf = NULL;
+        data_t *buf_data = NULL;
+        char *db_path = NULL;
+        int32_t read_size = 0;
+
+        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
+        GF_VALIDATE_OR_GOTO ("bdb", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, fd, out);
+
+        bfd = bdb_extract_bfd (fd, this);
+        op_errno = EBADFD;
+        GF_VALIDATE_OR_GOTO (this->name, bfd, out);
+
+        MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bfd->ctx->directory);
+        op_ret = lstat (db_path, &stbuf);
+        op_errno = errno;
+        if (op_ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to lstat on %s (%s)",
+                        db_path, strerror (op_errno));
+                goto out;
+        }
+
+        /* we are ready to go */
+        op_ret = bdb_db_get (bfd->ctx, NULL,
+                             bfd->key, &buf,
+                             size, offset);
+        read_size = op_ret;
+        if (op_ret == -1) {
+                gf_log (this->name,
+                        GF_LOG_ERROR,
+                        "failed to do db_storage_get()");
+                op_ret = -1;
+                op_errno = ENOENT;
+                goto out;
+        } else if (op_ret == 0) {
+                goto out;
+        }
+
+        buf_data = get_new_data ();
+        op_ret = -1;
+        op_errno = ENOMEM;
+        GF_VALIDATE_OR_GOTO (this->name, buf_data, out);
+
+        reply_dict = get_new_dict ();
+        op_ret = -1;
+        op_errno = ENOMEM;
+        GF_VALIDATE_OR_GOTO (this->name, reply_dict, out);
+
+        buf_data->data = buf;
+
+        /* never hand back more than the caller asked for */
+        if (size < read_size) {
+                read_size = size;
+        }
+
+        buf_data->len = read_size;
+
+        dict_set (reply_dict, NULL, buf_data);
+
+        frame->root->rsp_refs = dict_ref (reply_dict);
+
+        vec.iov_base = buf;
+        vec.iov_len = read_size;
+
+        stbuf.st_ino = fd->inode->ino;
+        stbuf.st_size = read_size;
+        stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize);
+
+        /* report what is actually in vec; a short read must not be
+         * reported as @size bytes */
+        op_ret = read_size;
+out:
+        STACK_UNWIND (frame, op_ret, op_errno, &vec, 1, &stbuf);
+
+        if (reply_dict)
+                dict_unref (reply_dict);
+
+        return 0;
+}
+
+
+/* bdb_writev - write the @count buffers of @vector to the db entry behind
+ * @fd, starting at @offset.
+ *
+ * each iovec is written with its own bdb_db_put() call, the offset being
+ * advanced by the length of the previous buffer. before writing,
+ * is_space_left() is asked whether the total size can be stored.
+ *
+ * @frame:  call frame.
+ * @this:   xlator_t of this instance of bdb xlator.
+ * @fd:     fd_t prepared by bdb_open()/bdb_create().
+ * @vector: buffers to write.
+ * @count:  number of elements in @vector.
+ * @offset: offset of the first byte to write.
+ *
+ * unwinds with op_ret = total number of bytes written, or -1 with op_errno
+ * set (ENOSPC, EBADFD, or the errno of the failed lstat()).
+ */
+int32_t
+bdb_writev (call_frame_t *frame,
+            xlator_t     *this,
+            fd_t         *fd,
+            struct iovec *vector,
+            int32_t       count,
+            off_t         offset)
+{
+        int32_t        op_ret     = -1;
+        int32_t        op_errno   = EINVAL;
+        struct stat    stbuf      = {0,};
+        struct bdb_fd *bfd        = NULL;
+        int32_t        idx        = 0;
+        off_t          c_off      = offset;
+        /* starts at 0 so that a request with count == 0 is not mistaken
+         * for a failed bdb_db_put() */
+        int32_t        c_ret      = 0;
+        char          *db_path    = NULL;
+        size_t         total_size = 0;
+
+        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
+        GF_VALIDATE_OR_GOTO ("bdb", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, fd, out);
+        GF_VALIDATE_OR_GOTO (this->name, vector, out);
+
+        bfd = bdb_extract_bfd (fd, this);
+        op_errno = EBADFD;
+        GF_VALIDATE_OR_GOTO (this->name, bfd, out);
+
+        MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bfd->ctx->directory);
+        op_ret = lstat (db_path, &stbuf);
+        op_errno = errno;
+        if (op_ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to lstat on %s (%s)",
+                        db_path, strerror (op_errno));
+                goto out;
+        }
+
+        for (idx = 0; idx < count; idx++)
+                total_size += vector[idx].iov_len;
+
+        if (!is_space_left (this, total_size)) {
+                gf_log (this->name,
+                        GF_LOG_ERROR,
+                        "requested storage for %"GF_PRI_SIZET", ENOSPC", total_size);
+                op_ret = -1;
+                op_errno = ENOSPC;
+                goto out;
+        }
+
+        /* we are ready to go; op_ret accumulates the bytes written */
+        op_ret = 0;
+        for (idx = 0; idx < count; idx++) {
+                c_ret = bdb_db_put (bfd->ctx, NULL,
+                                    bfd->key, vector[idx].iov_base,
+                                    vector[idx].iov_len, c_off, 0);
+                if (c_ret != 0) {
+                        gf_log (this->name,
+                                GF_LOG_ERROR,
+                                "failed to do bdb_db_put at offset: %"PRIu64" for file: %s",
+                                (uint64_t) c_off, bfd->key);
+                        break;
+                }
+                c_off  += vector[idx].iov_len;
+                op_ret += vector[idx].iov_len;
+        } /* for(idx=0;...)... */
+
+        if (c_ret) {
+                /* write failed; c_ret, not the byte count, is the db error */
+                gf_log (this->name,
+                        GF_LOG_ERROR,
+                        "failed to do bdb_db_put(): %s",
+                        db_strerror (c_ret));
+                op_ret = -1;
+                op_errno = EBADFD; /* TODO: search for a more meaningful errno */
+                goto out;
+        }
+
+        /* NOTE(review): st_size is set to the number of bytes written by
+         * this call, not to the resulting size of the entry */
+        stbuf.st_size = op_ret;
+        stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize);
+        op_errno = 0;
+
+out:
+        frame->root->rsp_refs = NULL;
+        STACK_UNWIND (frame, op_ret, op_errno, &stbuf);
+        return 0;
+}
+
+/* bdb_flush - flush is a no-op for bdb.
+ *
+ * the only work done is checking that @fd carries a bdb_fd.
+ *
+ * @frame: call frame.
+ * @this:  xlator_t of this instance of bdb xlator.
+ * @fd:    fd_t to flush.
+ *
+ * unwinds with 0/0 when the bdb_fd is found, -1/EBADFD when it is not and
+ * -1/EPERM when one of the arguments is NULL.
+ */
+int32_t
+bdb_flush (call_frame_t *frame,
+           xlator_t     *this,
+           fd_t         *fd)
+{
+        struct bdb_fd *bfd      = NULL;
+        int32_t        op_errno = EPERM;
+        int32_t        op_ret   = -1;
+
+        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
+        GF_VALIDATE_OR_GOTO ("bdb", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, fd, out);
+
+        op_errno = EBADFD;
+        bfd = bdb_extract_bfd (fd, this);
+        GF_VALIDATE_OR_GOTO (this->name, bfd, out);
+
+        /* nothing to write back, just report success */
+        op_errno = 0;
+        op_ret   = 0;
+
+out:
+        frame->root->rsp_refs = NULL;
+        STACK_UNWIND (frame, op_ret, op_errno);
+        return 0;
+}
+
+/* bdb_release - free the bdb_fd attached to @fd.
+ *
+ * drops the bctx reference held by the bdb_fd, frees the key and then the
+ * bdb_fd itself. when @fd carries no bdb_fd, an error is logged and nothing
+ * else happens.
+ *
+ * @this: xlator_t of this instance of bdb xlator.
+ * @fd:   fd_t being released.
+ *
+ * always returns 0.
+ */
+int32_t
+bdb_release (xlator_t *this,
+             fd_t     *fd)
+{
+        struct bdb_fd *bfd = bdb_extract_bfd (fd, this);
+
+        if (bfd == NULL) {
+                gf_log (this->name,
+                        GF_LOG_ERROR,
+                        "failed to extract %s specific information from fd:%p", this->name, fd);
+                return 0;
+        }
+
+        bctx_unref (bfd->ctx);
+        bfd->ctx = NULL;
+
+        /* the key was strdup()ed when the bdb_fd was set up */
+        if (bfd->key)
+                free (bfd->key);
+        free (bfd);
+
+        return 0;
+}/* bdb_release */
+
+
+/* bdb_fsync - fsync is a no-op for bdb; always unwinds with success (0/0).
+ *
+ * @frame:    call frame.
+ * @this:     xlator_t of this instance of bdb xlator (unused).
+ * @fd:       fd_t to sync (unused).
+ * @datasync: unused.
+ */
+int32_t
+bdb_fsync (call_frame_t *frame,
+           xlator_t     *this,
+           fd_t         *fd,
+           int32_t       datasync)
+{
+        /* no reply dictionary is attached to this answer */
+        frame->root->rsp_refs = NULL;
+
+        STACK_UNWIND (frame, 0, 0);
+
+        return 0;
+}/* bdb_fsync */
+
+/* number of bdb_lk() calls seen so far, used to rate-limit the log below */
+static int gf_bdb_lk_log;
+
+/* bdb_lk - locking is not implemented by bdb.
+ *
+ * every call is answered with -1/ENOSYS and an all-zero struct flock. each
+ * time the call counter becomes a multiple of GF_UNIVERSAL_ANSWER, a hint
+ * about the missing "features/posix-locks" translator is logged.
+ *
+ * @frame: call frame.
+ * @this:  xlator_t of this instance of bdb xlator.
+ * @fd:    unused.
+ * @cmd:   unused.
+ * @lock:  unused.
+ */
+int32_t
+bdb_lk (call_frame_t *frame,
+        xlator_t     *this,
+        fd_t         *fd,
+        int32_t       cmd,
+        struct flock *lock)
+{
+        struct flock nullock = {0, };
+
+        if ((++gf_bdb_lk_log % GF_UNIVERSAL_ANSWER) == 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "\"features/posix-locks\" translator is not loaded, you need to use it");
+        }
+
+        frame->root->rsp_refs = NULL;
+        STACK_UNWIND (frame, -1, ENOSYS, &nullock);
+
+        return 0;
+}/* bdb_lk */
+
+/* bdb_lookup
+ *
+ * there are four possibilities for a file being looked up:
+ *  1. file exists and is a directory.
+ *  2. file exists and is a symlink.
+ *  3. file exists and is a regular file.
+ *  4. file does not exist.
+ * case 1 and 2 are handled by doing lstat() on the @loc. if the file is a
+ * directory or symlink, lstat() succeeds. lookup continues to check if the
+ * @loc belongs to case 3 only if lstat() fails. to check for case 3,
+ * bdb_lookup does a bdb_db_get() for the given @loc (see description of
+ * bdb_db_get() for more details on how @loc is transformed into db handle and
+ * key). if the checks for case 1, 2 and 3 fail, we conclude that the file
+ * doesn't exist (case 4).
+ *
+ * @frame:     call frame.
+ * @this:      xlator_t of this instance of bdb xlator.
+ * @loc:       loc_t specifying the file to operate upon.
+ * @xattr_req: request dictionary. GF_FILE_CONTENT_REQUESTED() extracts the
+ *             content-size limit (need_xattr) from it. if @loc is a regular
+ *             file whose size is non-zero and not larger than need_xattr, the
+ *             file content is returned in the reply dictionary under the key
+ *             'glusterfs.content'.
+ *
+ * NOTE: bdb currently supports only directories, symlinks and regular files.
+ *
+ * NOTE: bdb_lookup returns the 'struct stat' of the underlying file itself in
+ *       case of directory and symlink (st_ino is modified as bdb allocates
+ *       its own set of inodes for all files). for regular files, bdb uses the
+ *       'struct stat' of the database file in which the @loc is stored as a
+ *       template and modifies st_ino (see bdb_inode_transform for more
+ *       details), st_mode (can be set in volfile 'option file-mode <mode>'),
+ *       st_size (exact size of the @loc contents) and st_blocks (block count
+ *       on the underlying filesystem to accommodate st_size, see
+ *       BDB_COUNT_BLOCKS in bdb.h for more details).
+ */
+int32_t
+bdb_lookup (call_frame_t *frame,
+            xlator_t     *this,
+            loc_t        *loc,
+            dict_t       *xattr_req)
+{
+        struct stat         stbuf             = {0, };
+        int32_t             op_ret            = -1;
+        int32_t             op_errno          = ENOENT;
+        dict_t             *xattr             = NULL;
+        char               *pathname          = NULL;
+        char               *directory         = NULL;
+        char               *real_path         = NULL;
+        bctx_t             *bctx              = NULL;
+        char               *db_path           = NULL;
+        struct bdb_private *private           = NULL;
+        char               *key_string        = NULL;
+        int32_t             entry_size        = 0;
+        char               *file_content      = NULL;
+        data_t             *file_content_data = NULL;
+        uint64_t            need_xattr        = 0;
+
+        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
+        GF_VALIDATE_OR_GOTO ("bdb", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, loc, out);
+
+        private = this->private;
+
+        MAKE_REAL_PATH (real_path, this, loc->path);
+
+        /* dirname() may modify its argument, so work on a copy */
+        pathname = strdup (loc->path);
+        GF_VALIDATE_OR_GOTO (this->name, pathname, out);
+
+        directory = dirname (pathname);
+        GF_VALIDATE_OR_GOTO (this->name, directory, out);
+
+        if (!strcmp (directory, loc->path)) {
+                /* SPECIAL CASE: looking up root */
+                op_ret = lstat (real_path, &stbuf);
+                op_errno = errno;
+                if (op_ret != 0) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to lstat on %s (%s)",
+                                real_path, strerror (op_errno));
+                        goto out;
+                }
+
+                /* bctx_lookup() returns NULL only when its time to wind up,
+                 * we should shutdown functioning */
+                bctx = bctx_lookup (B_TABLE(this), (char *)loc->path);
+                op_ret = -1;
+                op_errno = EINVAL;
+                GF_VALIDATE_OR_GOTO (this->name, bctx, out);
+
+                stbuf.st_ino = 1;
+                stbuf.st_mode = private->dir_mode;
+        } else {
+                MAKE_KEY_FROM_PATH (key_string, loc->path);
+                op_ret = lstat (real_path, &stbuf);
+                if ((op_ret == 0) && (S_ISDIR (stbuf.st_mode))){
+                        bctx = bctx_lookup (B_TABLE(this), (char *)loc->path);
+                        op_ret = -1;
+                        op_errno = ENOMEM;
+                        GF_VALIDATE_OR_GOTO (this->name, bctx, out);
+
+                        if (loc->ino) {
+                                /* revalidating directory inode */
+                                gf_log (this->name,
+                                        GF_LOG_DEBUG,
+                                        "revalidating directory %s", (char *)loc->path);
+                                stbuf.st_ino = loc->ino;
+                        } else {
+                                stbuf.st_ino = bdb_inode_transform (stbuf.st_ino, bctx);
+                        }
+                        stbuf.st_mode = private->dir_mode;
+                        op_ret = 0;
+                        op_errno = 0;
+                        goto out;
+                } else if (op_ret == 0) {
+                        /* a symlink */
+                        gf_log (this->name,
+                                GF_LOG_DEBUG,
+                                "lookup called for symlink: %s", loc->path);
+                        bctx = bctx_parent (B_TABLE(this), loc->path);
+                        op_ret = -1;
+                        op_errno = ENOMEM;
+                        GF_VALIDATE_OR_GOTO (this->name, bctx, out);
+
+                        if (loc->ino) {
+                                stbuf.st_ino = loc->ino;
+                        } else {
+                                stbuf.st_ino = bdb_inode_transform (stbuf.st_ino, bctx);
+                        }
+                        stbuf.st_mode = private->symlink_mode;
+                        op_ret = 0;
+                        op_errno = 0;
+                        goto out;
+                }
+
+                /* for regular files */
+                bctx = bctx_parent (B_TABLE(this), loc->path);
+                op_ret = -1;
+                op_errno = ENOENT;
+                GF_VALIDATE_OR_GOTO (this->name, bctx, out);
+
+                /* fetch the content only when the caller asked for it,
+                 * otherwise bdb_db_get() is used just to learn the size */
+                if (GF_FILE_CONTENT_REQUESTED(xattr_req, &need_xattr)) {
+                        entry_size = bdb_db_get (bctx,
+                                                 NULL,
+                                                 loc->path,
+                                                 &file_content,
+                                                 0, 0);
+                } else {
+                        entry_size = bdb_db_get (bctx,
+                                                 NULL,
+                                                 loc->path,
+                                                 NULL,
+                                                 0, 0);
+                }
+
+                op_ret = entry_size;
+                op_errno = ENOENT;
+                if (op_ret == -1) {
+                        gf_log (this->name,
+                                GF_LOG_DEBUG,
+                                "returning ENOENT for %s", loc->path);
+                        goto out;
+                }
+
+                MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bctx->directory);
+                op_ret = lstat (db_path, &stbuf);
+                op_errno = errno;
+                if (op_ret != 0) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to lstat on %s (%s)",
+                                db_path, strerror (op_errno));
+                        /* the content was fetched but will never reach a
+                         * dictionary, so it has to be released here */
+                        if (file_content)
+                                free (file_content);
+                        goto out;
+                }
+
+                if ((need_xattr >= entry_size)
+                    && (entry_size) && (file_content)) {
+                        /* file_content is handed over to the data_t */
+                        file_content_data = data_from_dynptr (file_content,
+                                                              entry_size);
+                        xattr = get_new_dict ();
+                        dict_set (xattr, "glusterfs.content",
+                                  file_content_data);
+                } else {
+                        if (file_content)
+                                free (file_content);
+                }
+
+                if (loc->ino) {
+                        /* revalidate */
+                        stbuf.st_ino = loc->ino;
+                        stbuf.st_size = entry_size;
+                        stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize);
+                } else {
+                        /* fresh lookup, create an inode number */
+                        stbuf.st_ino = bdb_inode_transform (stbuf.st_ino, bctx);
+                        stbuf.st_size = entry_size;
+                        stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize);
+                }/* if(inode->ino)...else */
+                stbuf.st_nlink = 1;
+                stbuf.st_mode = private->file_mode;
+        }
+        op_ret = 0;
+out:
+        if (bctx) {
+                /* NOTE: bctx_unref always returns success,
+                 * see description of bctx_unref for more details */
+                bctx_unref (bctx);
+        }
+
+        if (pathname)
+                free (pathname);
+
+        /* hold the reply dictionary across the unwind */
+        if (xattr)
+                dict_ref (xattr);
+
+        frame->root->rsp_refs = NULL;
+        STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf, xattr);
+
+        if (xattr)
+                dict_unref (xattr);
+
+        return 0;
+
+}/* bdb_lookup */
+
+/* bdb_stat - stat the file named by @loc.
+ *
+ * if lstat() on the real path succeeds, @loc is a directory or a symlink and
+ * the lstat() result is returned with st_ino and st_mode replaced. otherwise
+ * @loc is looked for in the db of its bctx: the stat of the storage db file
+ * is used as template and st_size, st_blocks and st_ino are replaced.
+ *
+ * @frame: call frame.
+ * @this:  xlator_t of this instance of bdb xlator.
+ * @loc:   loc_t specifying the file to stat.
+ *
+ * unwinds with 0 and the stat buffer on success, -1 and op_errno otherwise
+ * (ENOENT when bdb_db_get() does not find @loc).
+ */
+int32_t
+bdb_stat (call_frame_t *frame,
+          xlator_t     *this,
+          loc_t        *loc)
+{
+
+        struct stat         stbuf      = {0,};
+        char               *real_path  = NULL;
+        int32_t             op_ret     = -1;
+        int32_t             op_errno   = EINVAL;
+        struct bdb_private *private    = NULL;
+        char               *db_path    = NULL;
+        bctx_t             *bctx       = NULL;
+        int32_t             entry_size = 0;
+
+        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
+        GF_VALIDATE_OR_GOTO ("bdb", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, loc, out);
+
+        private = this->private;
+        GF_VALIDATE_OR_GOTO (this->name, private, out);
+
+        MAKE_REAL_PATH (real_path, this, loc->path);
+
+        op_ret = lstat (real_path, &stbuf);
+        op_errno = errno;
+        if (op_ret == 0) {
+                /* directory or symlink */
+                stbuf.st_ino = loc->inode->ino;
+                if (S_ISDIR(stbuf.st_mode))
+                        stbuf.st_mode = private->dir_mode;
+                else
+                        stbuf.st_mode = private->symlink_mode;
+                /* we are done, lets unwind the stack */
+                goto out;
+        }
+
+        bctx = bctx_parent (B_TABLE(this), loc->path);
+        op_ret = -1;
+        op_errno = ENOENT;
+        GF_VALIDATE_OR_GOTO (this->name, bctx, out);
+
+        MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bctx->directory);
+        op_ret = lstat (db_path, &stbuf);
+        op_errno = errno;
+        if (op_ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to lstat on %s (%s)",
+                        db_path, strerror (op_errno));
+                goto out;
+        }
+
+        /* bdb_db_get() answers -1 when there is no such entry (this is how
+         * bdb_lookup() detects ENOENT); do not pass that off as a size */
+        entry_size = bdb_db_get (bctx, NULL, loc->path, NULL, 0, 0);
+        if (entry_size == -1) {
+                gf_log (this->name,
+                        GF_LOG_DEBUG,
+                        "returning ENOENT for %s", loc->path);
+                op_ret = -1;
+                op_errno = ENOENT;
+                goto out;
+        }
+
+        stbuf.st_size = entry_size;
+        stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize);
+        stbuf.st_ino = loc->inode->ino;
+
+out:
+        if (bctx) {
+                /* NOTE: bctx_unref always returns success,
+                 * see description of bctx_unref for more details */
+                bctx_unref (bctx);
+        }
+
+        frame->root->rsp_refs = NULL;
+        STACK_UNWIND (frame, op_ret, op_errno, &stbuf);
+
+        return 0;
+}/* bdb_stat */
+
+
+
+/* bdb_opendir - open the directory named by @loc.
+ *
+ * takes a referenced bctx for @loc->path from bctx_lookup(), opens the
+ * directory on the underlying filesystem with opendir() and remembers the
+ * real path. the three are packed into a struct bdb_dir which is attached to
+ * @fd with BDB_SET_BFD(); bdb_releasedir() releases them again.
+ *
+ * @frame: call frame
+ * @this:  our information, as we filled during init()
+ * @loc:   location information
+ * @fd:    file descriptor structure (glusterfs internal)
+ *
+ * return value - immaterial, async call. unwinds with op_ret 0 on success,
+ * -1 and op_errno (EINVAL, EBADFD, ENOMEM or the errno of opendir()) on
+ * failure.
+ */
+int32_t
+bdb_opendir (call_frame_t *frame,
+             xlator_t     *this,
+             loc_t        *loc,
+             fd_t         *fd)
+{
+        char           *real_path = NULL;
+        int32_t         op_ret    = -1;
+        int32_t         op_errno  = EINVAL;
+        bctx_t         *bctx      = NULL;
+        struct bdb_dir *bfd       = NULL;
+
+        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
+        GF_VALIDATE_OR_GOTO ("bdb", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, loc, out);
+        GF_VALIDATE_OR_GOTO (this->name, fd, out);
+
+        MAKE_REAL_PATH (real_path, this, loc->path);
+
+        /* NOTE: bctx_lookup() return bctx with ref */
+        bctx = bctx_lookup (B_TABLE(this), (char *)loc->path);
+        op_errno = EBADFD;
+        GF_VALIDATE_OR_GOTO (this->name, bctx, out);
+
+        bfd = CALLOC (1, sizeof (*bfd));
+        op_errno = ENOMEM;
+        GF_VALIDATE_OR_GOTO (this->name, bfd, out);
+
+        bfd->dir = opendir (real_path);
+        op_errno = errno;
+        GF_VALIDATE_OR_GOTO (this->name, bfd->dir, out);
+
+        bfd->path = strdup (real_path);
+        op_errno = ENOMEM;
+        GF_VALIDATE_OR_GOTO (this->name, bfd->path, out);
+
+        /* success: the bctx reference and the bfd now belong to @fd, so
+         * clear the locals to keep the cleanup below from touching them */
+        bfd->ctx = bctx;
+        bctx = NULL;
+
+        BDB_SET_BFD (this, fd, bfd);
+        bfd = NULL;
+
+        op_ret = 0;
+        op_errno = 0;
+out:
+        /* only non-NULL when we failed half way through the setup */
+        if (bfd) {
+                if (bfd->dir)
+                        closedir (bfd->dir);
+                free (bfd);
+        }
+        if (bctx)
+                bctx_unref (bctx);
+
+        frame->root->rsp_refs = NULL;
+        STACK_UNWIND (frame, op_ret, op_errno, fd);
+
+        return 0;
+}/* bdb_opendir */
+
+
+/* bdb_getdents - list the entries of the directory opened on @fd.
+ *
+ * the listing is built in two passes:
+ *  1. entries present on the underlying filesystem (directories, symlinks)
+ *     are read with readdir() and lstat()ed; files matching
+ *     IS_BDB_PRIVATE_FILE() are skipped.
+ *  2. unless @flag is GF_GET_DIR_ONLY, the keys stored in the db of the
+ *     directory are walked with a cursor. they are reported with the stat of
+ *     the storage db file as template, st_size being taken from bdb_db_get().
+ *
+ * @frame: call frame.
+ * @this:  xlator_t of this instance of bdb xlator.
+ * @fd:    fd_t prepared by bdb_opendir().
+ * @size:  maximum number of entries wanted, 0 meaning "no limit".
+ * @off:   not used.
+ * @flag:  GF_GET_DIR_ONLY restricts the listing to directories.
+ *
+ * unwinds with the list of entries and their count. the list is owned by this
+ * function and is freed once STACK_UNWIND returns.
+ */
+int32_t
+bdb_getdents (call_frame_t *frame,
+              xlator_t     *this,
+              fd_t         *fd,
+              size_t        size,
+              off_t         off,
+              int32_t       flag)
+{
+        int32_t         op_ret         = -1;
+        int32_t         op_errno       = EINVAL;
+        int32_t         ret            = -1;
+        int32_t         count          = 0;
+        size_t          real_path_len  = 0;
+        size_t          entry_path_len = 0;
+        size_t          tmp_name_len   = 0;
+        char           *real_path      = NULL;
+        char           *entry_path     = NULL;
+        char           *grown_path     = NULL;
+        char           *db_path        = NULL;
+        dir_entry_t     entries        = {0, };
+        dir_entry_t    *tmp            = NULL;
+        DIR            *dir            = NULL;
+        struct dirent  *dirent         = NULL;
+        struct bdb_dir *bfd            = NULL;
+        struct stat     db_stbuf       = {0,};
+        struct stat     buf            = {0,};
+        DBC            *cursorp        = NULL;
+
+        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
+        GF_VALIDATE_OR_GOTO ("bdb", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, fd, out);
+
+        bfd = bdb_extract_bfd (fd, this);
+        op_errno = EBADFD;
+        GF_VALIDATE_OR_GOTO (this->name, bfd, out);
+
+        /* bdb_opendir() stored the real (on-disk) path in bfd->path, it must
+         * not be run through MAKE_REAL_PATH a second time */
+        real_path = bfd->path;
+        GF_VALIDATE_OR_GOTO (this->name, real_path, out);
+
+        dir = bfd->dir;
+        GF_VALIDATE_OR_GOTO (this->name, dir, out);
+
+        /* entry_path is "<real_path>/<d_name>"; the prefix is written once,
+         * only the name part changes per entry */
+        real_path_len  = strlen (real_path);
+        entry_path_len = real_path_len + 1024;
+        entry_path = CALLOC (1, entry_path_len);
+        op_errno = ENOMEM;
+        GF_VALIDATE_OR_GOTO (this->name, entry_path, out);
+
+        memcpy (entry_path, real_path, real_path_len);
+        entry_path[real_path_len] = '/';
+
+        while ((dirent = readdir (dir))) {
+                if (IS_BDB_PRIVATE_FILE(dirent->d_name)) {
+                        continue;
+                }
+
+                tmp_name_len = strlen (dirent->d_name);
+                if (entry_path_len < (real_path_len + 1 + tmp_name_len + 1)) {
+                        entry_path_len = real_path_len + tmp_name_len + 1024;
+                        /* keep the old buffer reachable if realloc() fails */
+                        grown_path = realloc (entry_path, entry_path_len);
+                        op_ret = -1;
+                        op_errno = ENOMEM;
+                        GF_VALIDATE_OR_GOTO (this->name, grown_path, out);
+                        entry_path = grown_path;
+                }
+
+                /* + 1 copies the terminating NUL as well */
+                memcpy (&entry_path[real_path_len + 1], dirent->d_name,
+                        tmp_name_len + 1);
+
+                /* lstat(), so that symlinks are reported as symlinks */
+                op_ret = lstat (entry_path, &buf);
+                op_errno = errno;
+                if (op_ret != 0) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to lstat on %s (%s)",
+                                entry_path, strerror (op_errno));
+                        goto out;
+                }
+
+                if ((flag == GF_GET_DIR_ONLY) && !S_ISDIR(buf.st_mode)) {
+                        continue;
+                }
+
+                tmp = CALLOC (1, sizeof (*tmp));
+                op_ret = -1;
+                op_errno = ENOMEM;
+                GF_VALIDATE_OR_GOTO (this->name, tmp, out);
+
+                tmp->name = strdup (dirent->d_name);
+                if (tmp->name == NULL) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "out of memory");
+                        /* not linked into the list yet */
+                        free (tmp);
+                        goto out;
+                }
+
+                memcpy (&tmp->buf, &buf, sizeof (buf));
+
+                tmp->buf.st_ino = -1;
+                /* a non-empty link is always heap allocated, see the
+                 * cleanup loop at the end of this function */
+                tmp->link = "";
+                if (S_ISLNK(tmp->buf.st_mode)) {
+                        char linkpath[ZR_PATH_MAX] = {0,};
+                        /* leave room for the terminating NUL */
+                        ret = readlink (entry_path, linkpath, ZR_PATH_MAX - 1);
+                        if (ret != -1) {
+                                linkpath[ret] = '\0';
+                                tmp->link = strdup (linkpath);
+                                if (tmp->link == NULL)
+                                        tmp->link = "";
+                        }
+                }
+
+                count++;
+
+                tmp->next = entries.next;
+                entries.next = tmp;
+                /* if size is 0, count can never be = size, so entire dir is read */
+
+                if ((size_t) count == size)
+                        break;
+        }
+
+        if ((flag != GF_GET_DIR_ONLY) &&
+            ((size == 0) || ((size_t) count < size))) {
+                /* read from db */
+                ret = bdb_cursor_open (bfd->ctx, &cursorp);
+                op_ret = -1;
+                op_errno = EINVAL;
+                GF_VALIDATE_OR_GOTO (this->name, (ret == 0), out);
+
+                MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bfd->ctx->directory);
+                op_ret = lstat (db_path, &db_stbuf);
+                op_errno = errno;
+                if (op_ret != 0) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to lstat on %s (%s)",
+                                db_path, strerror (op_errno));
+                        goto out;
+                }
+
+                /* read all the entries in database, one after the other and put into dictionary */
+                while (1) {
+                        DBT key = {0,}, value = {0,};
+
+                        /* with DB_DBT_MALLOC both key.data and value.data
+                         * are allocated for us and have to be freed here */
+                        key.flags = DB_DBT_MALLOC;
+                        value.flags = DB_DBT_MALLOC;
+                        ret = bdb_cursor_get (cursorp, &key, &value, DB_NEXT);
+
+                        if (ret == DB_NOTFOUND) {
+                                gf_log (this->name,
+                                        GF_LOG_DEBUG,
+                                        "end of list of key/value pair in db for directory: %s",
+                                        bfd->ctx->directory);
+                                op_ret = 0;
+                                op_errno = 0;
+                                break;
+                        } else if (ret != 0){
+                                gf_log (this->name,
+                                        GF_LOG_ERROR,
+                                        "failed to do cursor get for directory %s: %s",
+                                        bfd->ctx->directory, db_strerror (ret));
+                                /* NOTE(review): as in the original code the
+                                 * entries collected so far are still returned
+                                 * with op_ret 0 (see below); only op_errno
+                                 * keeps a trace of the failure */
+                                op_ret = -1;
+                                op_errno = ENOENT;
+                                break;
+                        }
+
+                        /* successfully read */
+                        tmp = CALLOC (1, sizeof (*tmp));
+                        if (tmp != NULL)
+                                tmp->name = CALLOC (1, key.size + 1);
+                        if ((tmp == NULL) || (tmp->name == NULL)) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "out of memory");
+                                free (tmp);
+                                free (key.data);
+                                free (value.data);
+                                op_ret = -1;
+                                op_errno = ENOMEM;
+                                goto out;
+                        }
+
+                        memcpy (tmp->name, key.data, key.size);
+                        free (key.data);
+                        free (value.data);
+
+                        tmp->buf = db_stbuf;
+                        tmp->buf.st_size = bdb_db_get (bfd->ctx, NULL,
+                                                       tmp->name, NULL,
+                                                       0, 0);
+                        tmp->buf.st_blocks = BDB_COUNT_BLOCKS (tmp->buf.st_size,
+                                                               tmp->buf.st_blksize);
+                        /* FIXME: wat will be the effect of this? */
+                        tmp->buf.st_ino = -1;
+                        count++;
+
+                        tmp->next = entries.next;
+                        tmp->link = "";
+                        entries.next = tmp;
+                        /* if size is 0, count can never be = size, so entire dir is read */
+                        if ((size_t) count == size)
+                                break;
+                } /* while(1){ } */
+        }
+        op_ret = 0;
+
+out:
+        /* cursorp is only ever set after bfd was validated */
+        if (cursorp)
+                bdb_cursor_close (bfd->ctx, cursorp);
+
+        free (entry_path);
+
+        frame->root->rsp_refs = NULL;
+        STACK_UNWIND (frame, op_ret, op_errno, &entries, count);
+
+        while (entries.next) {
+                tmp = entries.next;
+                entries.next = entries.next->next;
+                /* "" is a string literal, anything else came from strdup() */
+                if (tmp->link && (tmp->link[0] != '\0'))
+                        free ((void *) tmp->link);
+                FREE (tmp->name);
+                FREE (tmp);
+        }
+        return 0;
+}/* bdb_getdents */
+
+
+/* bdb_releasedir - free the bdb_dir attached to @fd.
+ *
+ * frees the stored path, closes the DIR handle and drops the bctx reference,
+ * logging an error for each of the three that is missing, then frees the
+ * bdb_dir itself. when @fd carries no bdb_dir only an error is logged.
+ *
+ * @this: xlator_t of this instance of bdb xlator.
+ * @fd:   fd_t being released.
+ *
+ * always returns 0.
+ */
+int32_t
+bdb_releasedir (xlator_t *this,
+                fd_t     *fd)
+{
+        struct bdb_dir *bfd = bdb_extract_bfd (fd, this);
+
+        if (bfd == NULL) {
+                gf_log (this->name,
+                        GF_LOG_ERROR,
+                        "failed to extract fd data from fd=%p", fd);
+                return 0;
+        }
+
+        if (bfd->path == NULL) {
+                gf_log (this->name, GF_LOG_ERROR, "bfd->path was NULL. fd=%p bfd=%p",
+                        fd, bfd);
+        } else {
+                free (bfd->path);
+        }
+
+        if (bfd->dir == NULL) {
+                gf_log (this->name,
+                        GF_LOG_ERROR,
+                        "bfd->dir is NULL.");
+        } else {
+                closedir (bfd->dir);
+        }
+
+        if (bfd->ctx == NULL) {
+                gf_log (this->name,
+                        GF_LOG_ERROR,
+                        "bfd->ctx is NULL");
+        } else {
+                bctx_unref (bfd->ctx);
+        }
+
+        free (bfd);
+
+        return 0;
+}/* bdb_releasedir */
+
+
+/* bdb_readlink - read the target of the symlink named by @loc.
+ *
+ * symlinks live on the underlying filesystem, so this is a readlink() on the
+ * real path.
+ *
+ * @frame: call frame.
+ * @this:  xlator_t of this instance of bdb xlator.
+ * @loc:   loc_t specifying the symlink.
+ * @size:  maximum number of bytes of the target to return.
+ *
+ * unwinds with op_ret = length of the target and the NUL terminated target,
+ * or -1 and op_errno on failure. the buffer lives on the stack of this
+ * function (alloca) and is only valid during the unwind.
+ */
+int32_t
+bdb_readlink (call_frame_t *frame,
+              xlator_t     *this,
+              loc_t        *loc,
+              size_t        size)
+{
+        char    *dest      = NULL;
+        int32_t  op_ret    = -1;
+        int32_t  op_errno  = EPERM;
+        char    *real_path = NULL;
+
+        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
+        GF_VALIDATE_OR_GOTO ("bdb", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, loc, out);
+
+        /* one byte more than asked for, for the terminating NUL */
+        dest = alloca (size + 1);
+        GF_VALIDATE_OR_GOTO (this->name, dest, out);
+
+        /* never hand uninitialised stack memory to the unwind on failure */
+        dest[0] = '\0';
+
+        MAKE_REAL_PATH (real_path, this, loc->path);
+
+        op_ret = readlink (real_path, dest, size);
+        op_errno = errno;
+
+        if (op_ret == -1) {
+                gf_log (this->name,
+                        GF_LOG_DEBUG,
+                        "readlink failed on %s: %s",
+                        loc->path, strerror (op_errno));
+        } else {
+                /* readlink() does not terminate the buffer */
+                dest[op_ret] = '\0';
+        }
+out:
+        frame->root->rsp_refs = NULL;
+        STACK_UNWIND (frame, op_ret, op_errno, dest);
+
+        return 0;
+}/* bdb_readlink */
+
+
+/* bdb_mkdir - create the directory named by @loc.
+ *
+ * the directory is created on the underlying filesystem, chown()ed to the
+ * uid/gid of the caller (frame->root) and a bctx is looked up for it. if any
+ * step after mkdir() fails, the directory is removed again.
+ *
+ * @frame: call frame.
+ * @this:  xlator_t of this instance of bdb xlator.
+ * @loc:   loc_t specifying the directory to create.
+ * @mode:  mode passed to mkdir().
+ *
+ * unwinds with 0 and the lstat() of the new directory (st_ino transformed by
+ * bdb_inode_transform()) on success, -1 and op_errno on failure.
+ */
+int32_t
+bdb_mkdir (call_frame_t *frame,
+           xlator_t     *this,
+           loc_t        *loc,
+           mode_t        mode)
+{
+        int32_t      op_ret    = -1;
+        int32_t      ret       = -1;
+        int32_t      op_errno  = EINVAL;
+        char        *real_path = NULL;
+        struct stat  stbuf     = {0, };
+        bctx_t      *bctx      = NULL;
+
+        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
+        GF_VALIDATE_OR_GOTO ("bdb", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, loc, out);
+
+        MAKE_REAL_PATH (real_path, this, loc->path);
+
+        op_ret = mkdir (real_path, mode);
+        op_errno = errno;
+        if (op_ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to mkdir %s (%s)",
+                        real_path, strerror (op_errno));
+                goto out;
+        }
+
+        op_ret = chown (real_path, frame->root->uid, frame->root->gid);
+        op_errno = errno;
+        if (op_ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to chown on %s (%s)",
+                        real_path, strerror (op_errno));
+                goto err;
+        }
+
+        op_ret = lstat (real_path, &stbuf);
+        op_errno = errno;
+        if (op_ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to lstat on %s (%s)",
+                        real_path, strerror (op_errno));
+                goto err;
+        }
+
+        bctx = bctx_lookup (B_TABLE(this), (char *)loc->path);
+        /* op_ret is 0 from the lstat() above; without resetting it the
+         * directory would be removed and success reported anyway */
+        op_ret = -1;
+        op_errno = ENOMEM;
+        GF_VALIDATE_OR_GOTO (this->name, bctx, err);
+
+        stbuf.st_ino = bdb_inode_transform (stbuf.st_ino, bctx);
+
+        op_ret = 0;
+        op_errno = 0;
+        goto out;
+
+err:
+        /* roll back: remove the directory created above */
+        ret = rmdir (real_path);
+        if (ret != 0) {
+                gf_log (this->name,
+                        GF_LOG_ERROR,
+                        "failed to rmdir the directory created (%s)",
+                        strerror (errno));
+        }
+
+
+out:
+        if (bctx) {
+                /* NOTE: bctx_unref always returns success,
+                 * see description of bctx_unref for more details */
+                bctx_unref (bctx);
+        }
+
+        frame->root->rsp_refs = NULL;
+        STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf);
+
+        return 0;
+}/* bdb_mkdir */
+
+
+/* bdb_unlink - remove the file named by @loc.
+ *
+ * the entry is first deleted from the db of its bctx. if the db has no such
+ * key (DB_NOTFOUND), @loc is unlink()ed on the underlying filesystem instead.
+ *
+ * @frame: call frame.
+ * @this:  xlator_t of this instance of bdb xlator.
+ * @loc:   loc_t specifying the file to remove.
+ *
+ * unwinds with 0 on success, -1 and op_errno on failure.
+ */
+int32_t
+bdb_unlink (call_frame_t *frame,
+            xlator_t     *this,
+            loc_t        *loc)
+{
+        int32_t  op_ret    = -1;
+        int32_t  op_errno  = EINVAL;
+        bctx_t  *bctx      = NULL;
+        char    *real_path = NULL;
+
+        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
+        GF_VALIDATE_OR_GOTO ("bdb", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, loc, out);
+
+        bctx = bctx_parent (B_TABLE(this), loc->path);
+        op_errno = ENOENT;
+        GF_VALIDATE_OR_GOTO (this->name, bctx, out);
+
+        op_ret = bdb_db_del (bctx, NULL, loc->path);
+        if (op_ret == DB_NOTFOUND) {
+                /* not a db entry, try the underlying filesystem */
+                MAKE_REAL_PATH (real_path, this, loc->path);
+                op_ret = unlink (real_path);
+                op_errno = errno;
+                if (op_ret != 0) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to unlink on %s (%s)",
+                                real_path, strerror (op_errno));
+                        goto out;
+                }
+
+        } else if (op_ret == 0) {
+                op_errno = 0;
+        } else {
+                /* any other db error: do not leak the raw db error code
+                 * into op_ret, callers expect 0 or -1. op_errno stays
+                 * ENOENT as set above */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to delete %s from db: %s",
+                        loc->path, db_strerror (op_ret));
+                op_ret = -1;
+        }
+out:
+        if (bctx) {
+                /* NOTE: bctx_unref always returns success,
+                 * see description of bctx_unref for more details */
+                bctx_unref (bctx);
+        }
+
+        frame->root->rsp_refs = NULL;
+        STACK_UNWIND (frame, op_ret, op_errno);
+
+        return 0;
+}/* bdb_unlink */
+
+
+
+/* bdb_do_rmdir - remove the db of the directory named by @loc and then the
+ * directory itself.
+ *
+ * under bctx->lock the db handle of the directory is closed and the db is
+ * removed with DB_ENV->dbremove(). only when that worked is the directory
+ * rmdir()ed on the underlying filesystem.
+ *
+ * @this: xlator_t of this instance of bdb xlator.
+ * @loc:  loc_t specifying the directory to remove.
+ *
+ * returns 0 on success, -1 when the db could not be removed (or an argument
+ * was invalid), otherwise the return value of rmdir().
+ *
+ * NOTE(review): when bctx->dbp is NULL nothing is removed and -1 is
+ *               returned, as ret still holds its initial value.
+ */
+int32_t
+bdb_do_rmdir (xlator_t *this,
+              loc_t    *loc)
+{
+        char    *real_path = NULL;
+        int32_t  ret       = -1;
+        bctx_t  *bctx      = NULL;
+        DB_ENV  *dbenv     = NULL;
+
+        GF_VALIDATE_OR_GOTO ("bdb", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, loc, out);
+
+        dbenv = BDB_ENV(this);
+        GF_VALIDATE_OR_GOTO (this->name, dbenv, out);
+
+        MAKE_REAL_PATH (real_path, this, loc->path);
+
+        bctx = bctx_lookup (B_TABLE(this), loc->path);
+        GF_VALIDATE_OR_GOTO (this->name, bctx, out);
+
+        LOCK(&bctx->lock);
+        {
+                if (bctx->dbp == NULL) {
+                        goto unlock;
+                }
+
+                ret = bctx->dbp->close (bctx->dbp, 0);
+                /* a DB handle may not be used again after DB->close(),
+                 * whatever close() returned, so forget it right away */
+                bctx->dbp = NULL;
+                GF_VALIDATE_OR_GOTO (this->name, (ret == 0), unlock);
+
+                ret = dbenv->dbremove (dbenv, NULL, bctx->db_path, NULL, 0);
+                if (ret != 0) {
+                        gf_log (this->name,
+                                GF_LOG_ERROR,
+                                "failed to DB_ENV->dbremove() on path %s: %s",
+                                loc->path, db_strerror (ret));
+                }
+        }
+unlock:
+        UNLOCK(&bctx->lock);
+
+        if (ret) {
+                gf_log (this->name,
+                        GF_LOG_ERROR,
+                        "failed to remove db %s: %s", bctx->db_path, db_strerror (ret));
+                ret = -1;
+                goto out;
+        }
+        gf_log (this->name,
+                GF_LOG_DEBUG,
+                "removed db %s", bctx->db_path);
+        ret = rmdir (real_path);
+
+out:
+        if (bctx) {
+                /* NOTE: bctx_unref always returns success,
+                 * see description of bctx_unref for more details */
+                bctx_unref (bctx);
+        }
+
+        return ret;
+}
+
+/* bdb_rmdir - remove the directory named by @loc if it is empty.
+ *
+ * emptiness is decided by is_dir_empty(); the removal itself is done by
+ * bdb_do_rmdir().
+ *
+ * @frame: call frame.
+ * @this:  xlator_t of this instance of bdb xlator.
+ * @loc:   loc_t specifying the directory to remove.
+ *
+ * unwinds with the return value of bdb_do_rmdir(), or with -1/ENOTEMPTY when
+ * the directory still has entries. op_errno is ENOTEMPTY in every case.
+ */
+int32_t
+bdb_rmdir (call_frame_t *frame,
+           xlator_t     *this,
+           loc_t        *loc)
+{
+        int32_t op_errno = ENOTEMPTY;
+        int32_t op_ret   = -1;
+
+        if (is_dir_empty (this, loc)) {
+                op_ret = bdb_do_rmdir (this, loc);
+                if (op_ret != 0) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to bdb_do_rmdir on %s",
+                                loc->path);
+                }
+        } else {
+                gf_log (this->name,
+                        GF_LOG_DEBUG,
+                        "rmdir: directory %s not empty", loc->path);
+                op_errno = ENOTEMPTY;
+                op_ret   = -1;
+        }
+
+        frame->root->rsp_refs = NULL;
+        STACK_UNWIND (frame, op_ret, op_errno);
+
+        return 0;
+} /* bdb_rmdir */
+
+/* bdb_symlink - create the symlink @loc pointing to @linkname.
+ *
+ * symlinks are kept on the underlying filesystem. after symlink() succeeded
+ * the new link is lstat()ed and a bctx is fetched with bctx_parent() to
+ * compute st_ino; if one of these steps fails the new symlink is removed
+ * again. nothing is removed when symlink() itself fails (e.g. with EEXIST).
+ *
+ * @frame:    call frame.
+ * @this:     xlator_t of this instance of bdb xlator.
+ * @linkname: target of the symlink.
+ * @loc:      loc_t specifying the symlink to create.
+ *
+ * unwinds with 0 and the stat of the new symlink (st_ino transformed, st_mode
+ * set to private->symlink_mode) on success, -1 and op_errno on failure.
+ */
+int32_t
+bdb_symlink (call_frame_t *frame,
+             xlator_t     *this,
+             const char   *linkname,
+             loc_t        *loc)
+{
+        int32_t             op_ret    = -1;
+        int32_t             op_errno  = EINVAL;
+        int32_t             ret       = -1;
+        char               *real_path = NULL;
+        struct stat         stbuf     = {0,};
+        struct bdb_private *private   = NULL;
+        bctx_t             *bctx      = NULL;
+
+        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
+        GF_VALIDATE_OR_GOTO ("bdb", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, loc, out);
+        GF_VALIDATE_OR_GOTO (this->name, linkname, out);
+
+        private = this->private;
+        GF_VALIDATE_OR_GOTO (this->name, private, out);
+
+        MAKE_REAL_PATH (real_path, this, loc->path);
+        op_ret = symlink (linkname, real_path);
+        op_errno = errno;
+        if (op_ret != 0) {
+                /* nothing was created, so there is nothing to roll back:
+                 * whatever exists at real_path is not ours to unlink */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to symlink %s (%s)",
+                        real_path, strerror (op_errno));
+                op_ret = -1;
+                goto out;
+        }
+
+        op_ret = lstat (real_path, &stbuf);
+        op_errno = errno;
+        if (op_ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to lstat on %s (%s)",
+                        real_path, strerror (op_errno));
+                goto err;
+        }
+
+        bctx = bctx_parent (B_TABLE(this), loc->path);
+        op_errno = ENOENT;
+        GF_VALIDATE_OR_GOTO (this->name, bctx, err);
+
+        stbuf.st_ino = bdb_inode_transform (stbuf.st_ino, bctx);
+        stbuf.st_mode = private->symlink_mode;
+
+        op_ret = 0;
+        op_errno = 0;
+        goto out;
+
+err:
+        /* roll back the symlink created above; op_errno keeps the reason
+         * of the original failure */
+        ret = unlink (real_path);
+        if (ret != 0) {
+                gf_log (this->name,
+                        GF_LOG_ERROR,
+                        "failed to unlink the previously created symlink (%s)",
+                        strerror (errno));
+        }
+        op_ret = -1;
+out:
+        if (bctx) {
+                /* NOTE: bctx_unref always returns success,
+                 * see description of bctx_unref for more details */
+                bctx_unref (bctx);
+        }
+
+        frame->root->rsp_refs = NULL;
+        STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf);
+
+        return 0;
+} /* bdb_symlink */
+
+/* bdb_chmod - change the mode of the directory or symlink named by @loc.
+ *
+ * only entries that exist on the underlying filesystem can be chmod()ed; for
+ * anything else the initial lstat() fails and its errno is returned.
+ *
+ * @frame: call frame.
+ * @this:  xlator_t of this instance of bdb xlator.
+ * @loc:   loc_t specifying the file to operate upon.
+ * @mode:  new mode.
+ *
+ * unwinds with 0 and the stat taken after the change on success, -1 and
+ * op_errno on failure.
+ */
+int32_t
+bdb_chmod (call_frame_t *frame,
+           xlator_t     *this,
+           loc_t        *loc,
+           mode_t        mode)
+{
+        int32_t      op_ret    = -1;
+        int32_t      op_errno  = EINVAL;
+        char        *real_path = NULL;
+        struct stat  stbuf     = {0,};
+
+        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
+        GF_VALIDATE_OR_GOTO ("bdb", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, loc, out);
+
+        MAKE_REAL_PATH (real_path, this, loc->path);
+        op_ret = lstat (real_path, &stbuf);
+        op_errno = errno;
+        if (op_ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to lstat on %s (%s)",
+                        real_path, strerror (op_errno));
+                goto out;
+        }
+
+        /* directory or symlink */
+        op_ret = chmod (real_path, mode);
+        op_errno = errno;
+        if (op_ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to chmod on %s (%s)",
+                        real_path, strerror (op_errno));
+                goto out;
+        }
+
+        /* stat again so that the reply carries the new mode instead of
+         * the one seen before the change */
+        op_ret = lstat (real_path, &stbuf);
+        op_errno = errno;
+        if (op_ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to lstat on %s (%s)",
+                        real_path, strerror (op_errno));
+                goto out;
+        }
+
+        /* report the inode number bdb already handed out for this file,
+         * as bdb_stat() and bdb_utimens() do */
+        if (loc->inode && loc->inode->ino)
+                stbuf.st_ino = loc->inode->ino;
+
+out:
+        frame->root->rsp_refs = NULL;
+        STACK_UNWIND (frame, op_ret, op_errno, &stbuf);
+
+        return 0;
+}/* bdb_chmod */
+
+
+/* bdb_chown - change the owner of the directory or symlink named by @loc.
+ *
+ * only entries that exist on the underlying filesystem can be chown()ed; for
+ * anything else the initial lstat() fails and its errno is returned. lchown()
+ * is used, so a symlink itself is changed rather than its target.
+ *
+ * @frame: call frame.
+ * @this:  xlator_t of this instance of bdb xlator.
+ * @loc:   loc_t specifying the file to operate upon.
+ * @uid:   new owner.
+ * @gid:   new group.
+ *
+ * unwinds with 0 and the stat taken after the change on success, -1 and
+ * op_errno on failure.
+ */
+int32_t
+bdb_chown (call_frame_t *frame,
+           xlator_t     *this,
+           loc_t        *loc,
+           uid_t         uid,
+           gid_t         gid)
+{
+        int32_t      op_ret    = -1;
+        int32_t      op_errno  = EINVAL;
+        char        *real_path = NULL;
+        struct stat  stbuf     = {0,};
+
+        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
+        GF_VALIDATE_OR_GOTO ("bdb", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, loc, out);
+
+        MAKE_REAL_PATH (real_path, this, loc->path);
+        op_ret = lstat (real_path, &stbuf);
+        op_errno = errno;
+        if (op_ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to lstat on %s (%s)",
+                        real_path, strerror (op_errno));
+                goto out;
+        }
+
+        /* directory or symlink */
+        op_ret = lchown (real_path, uid, gid);
+        op_errno = errno;
+        if (op_ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to lchown on %s (%s)",
+                        real_path, strerror (op_errno));
+                goto out;
+        }
+
+        /* stat again so that the reply carries the new owner instead of
+         * the one seen before the change */
+        op_ret = lstat (real_path, &stbuf);
+        op_errno = errno;
+        if (op_ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to lstat on %s (%s)",
+                        real_path, strerror (op_errno));
+                goto out;
+        }
+
+        /* report the inode number bdb already handed out for this file,
+         * as bdb_stat() and bdb_utimens() do */
+        if (loc->inode && loc->inode->ino)
+                stbuf.st_ino = loc->inode->ino;
+out:
+        frame->root->rsp_refs = NULL;
+        STACK_UNWIND (frame, op_ret, op_errno, &stbuf);
+
+        return 0;
+}/* bdb_chown */
+
+
+/* bdb_truncate - truncate the db entry of the file named by @loc.
+ *
+ * the entry is rewritten through bdb_db_put() with a NULL buffer of size 0.
+ *
+ * @frame:  call frame.
+ * @this:   xlator_t of this instance of bdb xlator.
+ * @loc:    loc_t specifying the file to truncate.
+ * @offset: requested length.
+ *          NOTE(review): @offset is not passed on to bdb_db_put(), which is
+ *          called with the fixed arguments (NULL, 0, 1, 0) -- the meaning of
+ *          these is defined by bdb_db_put(); confirm against bdb-ll.c.
+ *
+ * unwinds with 0 on success and -1/EINVAL when bdb_db_put() fails; the stat
+ * buffer is the lstat() of the storage db file with st_ino replaced.
+ */
+int32_t
+bdb_truncate (call_frame_t *frame,
+              xlator_t     *this,
+              loc_t        *loc,
+              off_t         offset)
+{
+        int32_t      op_ret     = -1;
+        int32_t      op_errno   = EINVAL;
+        struct stat  stbuf      = {0,};
+        char        *db_path    = NULL;
+        bctx_t      *bctx       = NULL;
+        char        *key_string = NULL;
+
+        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
+        GF_VALIDATE_OR_GOTO ("bdb", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, loc, out);
+
+        bctx = bctx_parent (B_TABLE(this), loc->path);
+        op_errno = ENOENT;
+        GF_VALIDATE_OR_GOTO (this->name, bctx, out);
+
+        MAKE_KEY_FROM_PATH (key_string, loc->path);
+
+        /* now truncate */
+        MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bctx->directory);
+        op_ret = lstat (db_path, &stbuf);
+        op_errno = errno;
+        if (op_ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to lstat on %s (%s)",
+                        db_path, strerror (op_errno));
+                goto out;
+        }
+
+        if (loc->inode->ino) {
+                stbuf.st_ino = loc->inode->ino;
+        }else {
+                stbuf.st_ino = bdb_inode_transform (stbuf.st_ino, bctx);
+        }
+
+        op_ret = bdb_db_put (bctx, NULL, key_string, NULL, 0, 1, 0);
+        /* like bdb_writev(), treat every non-zero return as a failure so
+         * that a raw db error code never ends up in op_ret */
+        if (op_ret != 0) {
+                gf_log (this->name,
+                        GF_LOG_DEBUG,
+                        "failed to do bdb_db_put: %s",
+                        db_strerror (op_ret));
+                op_ret = -1;
+                op_errno = EINVAL; /* TODO: better errno */
+        }
+
+out:
+        if (bctx) {
+                /* NOTE: bctx_unref always returns success,
+                 * see description of bctx_unref for more details */
+                bctx_unref (bctx);
+        }
+
+        frame->root->rsp_refs = NULL;
+        STACK_UNWIND (frame, op_ret, op_errno, &stbuf);
+
+        return 0;
+}/* bdb_truncate */
+
+
+/* bdb_utimens - set access and modification time of the directory or symlink
+ * named by @loc.
+ *
+ * entries that do not exist on the underlying filesystem make the initial
+ * lstat() fail and are answered with EPERM. the times are applied with
+ * lutimes(), falling back to utimes() when lutimes() reports ENOSYS.
+ *
+ * @frame: call frame.
+ * @this:  xlator_t of this instance of bdb xlator.
+ * @loc:   loc_t specifying the file to operate upon.
+ * @ts:    ts[0] is the access time, ts[1] the modification time.
+ *
+ * unwinds with 0 and the stat taken after the change (st_ino replaced by
+ * loc->inode->ino) on success, -1 and op_errno on failure.
+ */
+int32_t
+bdb_utimens (call_frame_t    *frame,
+             xlator_t        *this,
+             loc_t           *loc,
+             struct timespec  ts[2])
+{
+        struct timeval  tv[2]     = {{0,},};
+        struct stat     stbuf     = {0,};
+        char           *real_path = NULL;
+        int32_t         op_errno  = EPERM;
+        int32_t         op_ret    = -1;
+        int             i         = 0;
+
+        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
+        GF_VALIDATE_OR_GOTO ("bdb", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, loc, out);
+
+        MAKE_REAL_PATH (real_path, this, loc->path);
+
+        op_ret = lstat (real_path, &stbuf);
+        op_errno = errno;
+        if (op_ret != 0) {
+                /* whatever lstat() said, the caller gets EPERM */
+                op_errno = EPERM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to lstat on %s (%s)",
+                        real_path, strerror (op_errno));
+                goto out;
+        }
+
+        /* directory or symlink: timespec (ns) -> timeval (us) */
+        for (i = 0; i < 2; i++) {
+                tv[i].tv_sec  = ts[i].tv_sec;
+                tv[i].tv_usec = ts[i].tv_nsec / 1000;
+        }
+
+        op_ret = lutimes (real_path, tv);
+        if ((op_ret == -1) && (errno == ENOSYS))
+                op_ret = utimes (real_path, tv);
+        op_errno = errno;
+        if (op_ret == -1) {
+                gf_log (this->name,
+                        GF_LOG_WARNING,
+                        "utimes on %s failed: %s",
+                        loc->path, strerror (op_errno));
+                goto out;
+        }
+
+        /* fetch the updated attributes for the reply */
+        op_ret = lstat (real_path, &stbuf);
+        op_errno = errno;
+        if (op_ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to lstat on %s (%s)",
+                        real_path, strerror (op_errno));
+                goto out;
+        }
+
+        stbuf.st_ino = loc->inode->ino;
+
+out:
+        frame->root->rsp_refs = NULL;
+        STACK_UNWIND (frame, op_ret, op_errno, &stbuf);
+
+        return 0;
+}/* bdb_utimens */
+
+/* bdb_statfs - report filesystem statistics for @loc.
+ *
+ * Runs statvfs() on the real path of @loc under the export directory and
+ * unwinds with (op_ret, op_errno, struct statvfs *).
+ */
+int32_t
+bdb_statfs (call_frame_t *frame,
+            xlator_t *this,
+            loc_t *loc)
+
+{
+        struct statvfs  vfs_buf   = {0, };
+        char           *real_path = NULL;
+        int32_t         op_errno  = EINVAL;
+        int32_t         op_ret    = -1;
+
+        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
+        GF_VALIDATE_OR_GOTO ("bdb", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, loc, out);
+
+        MAKE_REAL_PATH (real_path, this, loc->path);
+
+        op_ret   = statvfs (real_path, &vfs_buf);
+        op_errno = errno;
+
+out:
+        frame->root->rsp_refs = NULL;
+        STACK_UNWIND (frame, op_ret, op_errno, &vfs_buf);
+
+        return 0;
+}/* bdb_statfs */
+
+/* Counts xattr calls in this file that failed with ENOTSUP; the
+ * "Extended Attributes support not present" warning is emitted only when
+ * this counter is a multiple of GF_UNIVERSAL_ANSWER, to limit log noise. */
+static int gf_bdb_xattr_log;
+
+/* bdb_setxattr - set extended attributes.
+ *
+ * bdb allows setxattr operation only on directories.
+ * bdb reserves 'glusterfs.file.<attribute-name>' to operate on the content of the files
+ * under the specified directory. 'glusterfs.file.<attribute-name>' transforms to contents of
+ * file of name '<attribute-name>' under specified directory.
+ *
+ * @frame: call frame.
+ * @this: xlator_t of this instance of bdb xlator.
+ * @loc: loc_t specifying the file to operate upon.
+ * @dict: list of extended attributes to set on @loc.
+ * @flags: can be XATTR_REPLACE (replace an existing extended attribute only if it exists) or
+ *         XATTR_CREATE (create an extended attribute only if it doesn't already exist).
+ *
+ * Every other attribute is handed to lsetxattr() on the real path.
+ * Processing stops at the first failure (an lsetxattr() failure with ENOENT
+ * is skipped over) and the call unwinds with (op_ret, op_errno).
+ */
+int32_t
+bdb_setxattr (call_frame_t *frame,
+              xlator_t *this,
+              loc_t *loc,
+              dict_t *dict,
+              int flags)
+{
+        int32_t      op_ret    = -1;
+        int32_t      op_errno  = EINVAL;
+        data_pair_t *trav      = NULL;
+        bctx_t      *bctx      = NULL;
+        char        *real_path = NULL;
+        char        *key       = NULL;
+
+        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
+        GF_VALIDATE_OR_GOTO ("bdb", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, loc, out);
+        GF_VALIDATE_OR_GOTO (this->name, dict, out);
+
+        MAKE_REAL_PATH (real_path, this, loc->path);
+        if (!S_ISDIR (loc->inode->st_mode)) {
+                op_ret   = -1;
+                op_errno = EPERM;
+                goto out;
+        }
+
+        /* only touch @dict after it has been validated above */
+        trav = dict->members_list;
+        while (trav) {
+                if (ZR_FILE_CONTENT_REQUEST(trav->key) ) {
+                        bctx = bctx_lookup (B_TABLE(this), loc->path);
+                        /* a failed lookup must not unwind with a stale
+                         * success from a previous attribute */
+                        op_ret   = -1;
+                        op_errno = EINVAL;
+                        GF_VALIDATE_OR_GOTO (this->name, bctx, out);
+
+                        /* skip the 15-character 'glusterfs.file.' prefix */
+                        key = &(trav->key[15]);
+
+                        if (flags & XATTR_REPLACE) {
+                                /* replace only if previously exists, otherwise error out */
+                                op_ret = bdb_db_get (bctx, NULL, key,
+                                                     NULL, 0, 0);
+                                if (op_ret == -1) {
+                                        /* key doesn't exist in database */
+                                        gf_log (this->name,
+                                                GF_LOG_DEBUG,
+                                                "cannot XATTR_REPLACE, xattr %s doesn't exist "
+                                                "on path %s", key, loc->path);
+                                        op_errno = ENOENT;
+                                } else {
+                                        op_ret = bdb_db_put (bctx, NULL,
+                                                             key, trav->value->data,
+                                                             trav->value->len,
+                                                             op_ret, BDB_TRUNCATE_RECORD);
+                                        if (op_ret != 0) {
+                                                op_ret   = -1;
+                                                op_errno = EINVAL;
+                                        } else {
+                                                op_errno = 0;
+                                        }
+                                }
+                        } else {
+                                /* fresh create */
+                                op_ret = bdb_db_put (bctx, NULL, key,
+                                                     trav->value->data,
+                                                     trav->value->len,
+                                                     0, 0);
+                                if (op_ret != 0) {
+                                        op_ret   = -1;
+                                        op_errno = EINVAL;
+                                } else {
+                                        op_ret   = 0;
+                                        op_errno = 0;
+                                } /* if(op_ret!=0)...else */
+                        } /* if(flags&XATTR_REPLACE)...else */
+
+                        /* drop the reference on success AND on failure, so
+                         * that bailing out of the loop does not leak it.
+                         * NOTE: bctx_unref always returns success,
+                         * see description of bctx_unref for more details */
+                        bctx_unref (bctx);
+                        bctx = NULL;
+
+                        if (op_ret == -1)
+                                break;
+                } else {
+                        /* do plain setxattr */
+                        op_ret = lsetxattr (real_path,
+                                            trav->key,
+                                            trav->value->data,
+                                            trav->value->len,
+                                            flags);
+                        op_errno = errno;
+                        if ((op_ret == -1) && (op_errno != ENOENT)) {
+                                if (op_errno == ENOTSUP) {
+                                        gf_bdb_xattr_log++;
+                                        if (!(gf_bdb_xattr_log % GF_UNIVERSAL_ANSWER)) {
+                                                gf_log (this->name, GF_LOG_WARNING,
+                                                        "Extended Attributes support not present."
+                                                        "Please check");
+                                        }
+                                } else {
+                                        gf_log (this->name, GF_LOG_DEBUG,
+                                                "setxattr failed on %s (%s)",
+                                                loc->path, strerror (op_errno));
+                                }
+                                break;
+                        }
+                } /* if(ZR_FILE_CONTENT_REQUEST())...else */
+                trav = trav->next;
+        }/* while(trav) */
+out:
+        frame->root->rsp_refs = NULL;
+
+        STACK_UNWIND (frame, op_ret, op_errno);
+        return 0;
+}/* bdb_setxattr */
+
+
+/* bdb_getxattr - get extended attributes.
+ *
+ * bdb allows getxattr operation only on directories.
+ * bdb_getxattr retrieves the whole content of the file, when glusterfs.file.<attribute-name>
+ * is specified. For any other @name, every extended attribute listed by
+ * llistxattr() on the real path is read with lgetxattr() and returned.
+ *
+ * @frame: call frame.
+ * @this: xlator_t of this instance of bdb xlator.
+ * @loc: loc_t specifying the file to operate upon.
+ * @name: name of extended attributes to get for @loc.
+ *
+ * Unwinds with (op_ret, op_errno, dict). Invalid arguments and allocation
+ * failures are reported as errors (op_ret == -1).
+ *
+ * NOTE: see description of bdb_setxattr for details on how
+ * 'glusterfs.file.<attribute-name>' is handled by bdb.
+ */
+int32_t
+bdb_getxattr (call_frame_t *frame,
+              xlator_t *this,
+              loc_t *loc,
+              const char *name)
+{
+        int32_t   op_ret         = -1;
+        int32_t   op_errno       = EINVAL;
+        dict_t   *dict           = NULL;
+        bctx_t   *bctx           = NULL;
+        char     *buf            = NULL;
+        char     *key_string     = NULL;
+        int32_t   list_offset    = 0;
+        ssize_t   size           = 0;   /* signed: llistxattr() returns -1 on error */
+        size_t    remaining_size = 0;
+        char     *real_path      = NULL;
+        char      key[1024]      = {0,};
+        char     *value          = NULL;
+        char     *list           = NULL;
+
+        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
+        GF_VALIDATE_OR_GOTO ("bdb", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, loc, out);
+        GF_VALIDATE_OR_GOTO (this->name, name, out);
+
+        dict = get_new_dict ();
+        op_errno = ENOMEM;
+        GF_VALIDATE_OR_GOTO (this->name, dict, out);
+
+        if (!S_ISDIR (loc->inode->st_mode)) {
+                gf_log (this->name,
+                        GF_LOG_DEBUG,
+                        "operation not permitted on a non-directory file: %s", loc->path);
+                op_ret   = -1;
+                op_errno = ENODATA;
+                goto out;
+        }
+
+        if (name && ZR_FILE_CONTENT_REQUEST(name)) {
+                bctx = bctx_lookup (B_TABLE(this), loc->path);
+                op_ret   = -1;
+                op_errno = EINVAL;
+                GF_VALIDATE_OR_GOTO (this->name, bctx, out);
+
+                /* skip the 15-character 'glusterfs.file.' prefix */
+                key_string = (char *)&(name[15]);
+
+                op_ret = bdb_db_get (bctx, NULL, key_string, &buf, 0, 0);
+                if (op_ret == -1) {
+                        gf_log (this->name,
+                                GF_LOG_DEBUG,
+                                "failed to db get on directory: %s for key: %s",
+                                bctx->directory, name);
+                        op_ret   = -1;
+                        op_errno = ENODATA;
+                        goto out;
+                }
+
+                dict_set (dict, (char *)name, data_from_dynptr (buf, op_ret));
+        } else {
+                MAKE_REAL_PATH (real_path, this, loc->path);
+                /* first call only asks for the size of the name list */
+                size = llistxattr (real_path, NULL, 0);
+                op_errno = errno;
+                if (size <= 0) {
+                        /* There are no extended attributes, send an empty dictionary */
+                        if (size == -1 && op_errno != ENODATA) {
+                                if (op_errno == ENOTSUP) {
+                                        gf_bdb_xattr_log++;
+                                        if (!(gf_bdb_xattr_log % GF_UNIVERSAL_ANSWER))
+                                                gf_log (this->name,
+                                                        GF_LOG_WARNING,
+                                                        "Extended Attributes support not present."
+                                                        "Please check");
+                                } else {
+                                        gf_log (this->name,
+                                                GF_LOG_WARNING,
+                                                "llistxattr failed on %s (%s)",
+                                                loc->path, strerror (op_errno));
+                                }
+                        }
+                        op_ret   = -1;
+                        op_errno = ENODATA;
+                } else {
+                        list = alloca (size + 1);
+                        op_errno = ENOMEM;
+                        GF_VALIDATE_OR_GOTO (this->name, list, out);
+
+                        size = llistxattr (real_path, list, size);
+                        op_ret   = size;
+                        op_errno = errno;
+                        if (size == -1) {
+                                gf_log (this->name,
+                                        GF_LOG_ERROR,
+                                        "llistxattr failed on %s (%s)",
+                                        loc->path, strerror (errno));
+                                goto out;
+                        }
+                        /* @list is a sequence of NUL-terminated names */
+                        remaining_size = size;
+                        list_offset = 0;
+                        while (remaining_size > 0) {
+                                if (*(list + list_offset) == '\0')
+                                        break;
+                                strcpy (key, list + list_offset);
+                                /* size probe, then the real read */
+                                op_ret = lgetxattr (real_path, key, NULL, 0);
+                                if (op_ret == -1) {
+                                        op_errno = errno;
+                                        break;
+                                }
+                                value = CALLOC (op_ret + 1, sizeof(char));
+                                if (!value) {
+                                        op_ret   = -1;
+                                        op_errno = ENOMEM;
+                                }
+                                GF_VALIDATE_OR_GOTO (this->name, value, out);
+
+                                op_ret = lgetxattr (real_path, key, value, op_ret);
+                                if (op_ret == -1) {
+                                        op_errno = errno;
+                                        /* not handed to the dict yet, so it
+                                         * is still ours to release */
+                                        FREE (value);
+                                        break;
+                                }
+                                value [op_ret] = '\0';
+                                dict_set (dict, key, data_from_dynptr (value, op_ret));
+                                remaining_size -= strlen (key) + 1;
+                                list_offset += strlen (key) + 1;
+                        } /* while(remaining_size>0) */
+                } /* if(size <= 0)...else */
+        } /* if(name...)...else */
+
+out:
+        if(bctx) {
+                /* NOTE: bctx_unref always returns success,
+                 * see description of bctx_unref for more details */
+                bctx_unref (bctx);
+        }
+
+        if (dict)
+                dict_ref (dict);
+
+        STACK_UNWIND (frame, op_ret, op_errno, dict);
+
+        if (dict)
+                dict_unref (dict);
+
+        return 0;
+}/* bdb_getxattr */
+
+
+/* bdb_removexattr - remove an extended attribute.
+ *
+ * Only permitted on directories. For 'glusterfs.file.<attribute-name>'
+ * the record '<attribute-name>' is deleted from the db of directory @loc,
+ * using the same key that bdb_setxattr/bdb_getxattr use (the part of
+ * @name after the 15-character 'glusterfs.file.' prefix). Any other
+ * @name is passed to lremovexattr() on the real path.
+ *
+ * @frame: call frame.
+ * @this:  xlator_t of this instance of bdb xlator.
+ * @loc:   loc_t specifying the directory to operate upon.
+ * @name:  name of the extended attribute to remove.
+ *
+ * Unwinds with (op_ret, op_errno).
+ */
+int32_t
+bdb_removexattr (call_frame_t *frame,
+                 xlator_t *this,
+                 loc_t *loc,
+                 const char *name)
+{
+        int32_t     op_ret    = -1;
+        int32_t     op_errno  = EINVAL;
+        bctx_t     *bctx      = NULL;
+        char       *real_path = NULL;
+        const char *key       = NULL;
+
+        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
+        GF_VALIDATE_OR_GOTO ("bdb", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, loc, out);
+        GF_VALIDATE_OR_GOTO (this->name, name, out);
+
+        if (!S_ISDIR(loc->inode->st_mode)) {
+                gf_log (this->name,
+                        GF_LOG_WARNING,
+                        "operation not permitted on non-directory files");
+                op_ret   = -1;
+                op_errno = EPERM;
+                goto out;
+        }
+
+        if (ZR_FILE_CONTENT_REQUEST(name)) {
+                bctx = bctx_lookup (B_TABLE(this), loc->path);
+                op_errno = EINVAL;
+                GF_VALIDATE_OR_GOTO (this->name, bctx, out);
+
+                /* records are stored without the 'glusterfs.file.' prefix
+                 * (see bdb_setxattr), so delete under the same key */
+                key = &(name[15]);
+
+                op_ret = bdb_db_del (bctx, NULL, key);
+                if (op_ret == -1) {
+                        gf_log (this->name,
+                                GF_LOG_ERROR,
+                                "failed to delete %s from db of %s directory",
+                                name, loc->path);
+                        op_errno = EINVAL; /* TODO: errno */
+                        goto out;
+                }
+        } else {
+                MAKE_REAL_PATH(real_path, this, loc->path);
+                op_ret = lremovexattr (real_path, name);
+                op_errno = errno;
+                if (op_ret == -1) {
+                        if (op_errno == ENOTSUP) {
+                                gf_bdb_xattr_log++;
+                                if (!(gf_bdb_xattr_log % GF_UNIVERSAL_ANSWER))
+                                        gf_log (this->name, GF_LOG_WARNING,
+                                                "Extended Attributes support not present."
+                                                "Please check");
+                        } else {
+                                gf_log (this->name,
+                                        GF_LOG_WARNING,
+                                        "%s: %s",
+                                        loc->path, strerror (op_errno));
+                        }
+                } /* if(op_ret == -1) */
+        } /* if (ZR_FILE_CONTENT_REQUEST(name))...else */
+
+out:
+        if (bctx) {
+                /* NOTE: bctx_unref always returns success,
+                 * see description of bctx_unref for more details */
+                bctx_unref (bctx);
+        }
+
+        frame->root->rsp_refs = NULL;
+        STACK_UNWIND (frame, op_ret, op_errno);
+        return 0;
+}/* bdb_removexattr */
+
+
+/* bdb_fsyncdir - sync a directory fd.
+ *
+ * No sync work is performed here; the call only checks that @fd carries a
+ * bdb fd context. A valid @fd is reported as success, a missing context
+ * as EBADFD.
+ *
+ * @frame:    call frame.
+ * @this:     xlator_t of this instance of bdb xlator.
+ * @fd:       directory fd.
+ * @datasync: unused.
+ *
+ * Unwinds with (op_ret, op_errno).
+ */
+int32_t
+bdb_fsyncdir (call_frame_t *frame,
+              xlator_t *this,
+              fd_t *fd,
+              int datasync)
+{
+        int32_t        op_ret   = -1;
+        int32_t        op_errno = EINVAL;
+        struct bdb_fd *bfd      = NULL;
+
+        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
+        GF_VALIDATE_OR_GOTO ("bdb", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, fd, out);
+
+        frame->root->rsp_refs = NULL;
+
+        bfd = bdb_extract_bfd (fd, this);
+        op_errno = EBADFD;
+        GF_VALIDATE_OR_GOTO (this->name, bfd, out);
+
+        /* the fd is valid: do not report the EBADFD set up for the
+         * validation above */
+        op_ret   = 0;
+        op_errno = 0;
+
+out:
+        STACK_UNWIND (frame, op_ret, op_errno);
+
+        return 0;
+}/* bdb_fsyncdir */
+
+
+/* bdb_access - check accessibility of @loc.
+ *
+ * Delegates to access() on the real path of @loc with @mask and unwinds
+ * with (op_ret, op_errno).
+ */
+int32_t
+bdb_access (call_frame_t *frame,
+            xlator_t *this,
+            loc_t *loc,
+            int32_t mask)
+{
+        char    *real_path = NULL;
+        int32_t  op_errno  = EINVAL;
+        int32_t  op_ret    = -1;
+
+        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
+        GF_VALIDATE_OR_GOTO ("bdb", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, loc, out);
+
+        MAKE_REAL_PATH (real_path, this, loc->path);
+
+        /* TODO: implement for db entries */
+        op_ret   = access (real_path, mask);
+        op_errno = errno;
+
+out:
+        frame->root->rsp_refs = NULL;
+        STACK_UNWIND (frame, op_ret, op_errno);
+
+        return 0;
+}/* bdb_access */
+
+
+/* bdb_ftruncate - not implemented.
+ *
+ * Always unwinds with -1 and a zeroed struct stat; op_errno is EPERM
+ * unless an argument check fails.
+ */
+int32_t
+bdb_ftruncate (call_frame_t *frame,
+               xlator_t *this,
+               fd_t *fd,
+               off_t offset)
+{
+        struct stat  buf      = {0,};
+        int32_t      op_errno = EPERM;
+        int32_t      op_ret   = -1;
+
+        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
+        GF_VALIDATE_OR_GOTO ("bdb", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, fd, out);
+
+        /* TODO: implement */
+
+out:
+        frame->root->rsp_refs = NULL;
+        STACK_UNWIND (frame, op_ret, op_errno, &buf);
+
+        return 0;
+}
+
+/* bdb_fchown - not implemented.
+ *
+ * Always unwinds with -1 and a zeroed struct stat; op_errno is EPERM
+ * unless an argument check fails.
+ */
+int32_t
+bdb_fchown (call_frame_t *frame,
+            xlator_t *this,
+            fd_t *fd,
+            uid_t uid,
+            gid_t gid)
+{
+        struct stat  buf      = {0,};
+        int32_t      op_errno = EPERM;
+        int32_t      op_ret   = -1;
+
+        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
+        GF_VALIDATE_OR_GOTO ("bdb", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, fd, out);
+
+        /* TODO: implement */
+
+out:
+        STACK_UNWIND (frame, op_ret, op_errno, &buf);
+
+        return 0;
+}
+
+
+/* bdb_fchmod - not implemented.
+ *
+ * Always unwinds with -1 and a zeroed struct stat; op_errno is EPERM
+ * unless an argument check fails.
+ */
+int32_t
+bdb_fchmod (call_frame_t *frame,
+            xlator_t *this,
+            fd_t *fd,
+            mode_t mode)
+{
+        struct stat  buf      = {0,};
+        int32_t      op_errno = EPERM;
+        int32_t      op_ret   = -1;
+
+        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
+        GF_VALIDATE_OR_GOTO ("bdb", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, fd, out);
+
+        /* TODO: implement */
+
+out:
+        frame->root->rsp_refs = NULL;
+        STACK_UNWIND (frame, op_ret, op_errno, &buf);
+
+        return 0;
+}
+
+/* bdb_setdents - recreate a list of directory entries under directory @fd.
+ *
+ * Walks the list starting at entries->next (the head node itself is not
+ * processed). For each entry:
+ *  - directories are created with mkdir() (EEXIST is tolerated), then
+ *    chmod()/chown() are applied;
+ *  - regular files are created as zero-length records in the db of the
+ *    directory, unless @flags is GF_SET_DIR_ONLY;
+ *  - symlinks are not implemented; any other type is only logged.
+ * A failure on one entry is logged and the walk moves on to the next one.
+ *
+ * @frame:   call frame.
+ * @this:    xlator_t of this instance of bdb xlator.
+ * @fd:      directory fd under which the entries are created.
+ * @flags:   GF_SET_IF_NOT_PRESENT or GF_SET_DIR_ONLY.
+ * @entries: list head of the entries to create.
+ * @count:   not used by this function.
+ *
+ * Unwinds with (op_ret, op_errno).
+ *
+ * NOTE(review): op_ret is only ever assigned by the chmod/chown failure
+ * paths (-1) and by bdb_db_put(); when every entry is a directory the call
+ * appears to unwind with the initial -1 and op_errno EBADFD even though
+ * nothing failed - confirm whether that is intended.
+ */
+int32_t
+bdb_setdents (call_frame_t *frame,
+ xlator_t *this,
+ fd_t *fd,
+ int32_t flags,
+ dir_entry_t *entries,
+ int32_t count)
+{
+ int32_t op_ret = -1, op_errno = EINVAL;
+ char *entry_path = NULL;
+ int32_t real_path_len = 0;
+ int32_t entry_path_len = 0;
+ int32_t ret = 0;
+ struct bdb_dir *bfd = NULL;
+ dir_entry_t *trav = NULL;
+
+ GF_VALIDATE_OR_GOTO ("bdb", frame, out);
+ GF_VALIDATE_OR_GOTO ("bdb", this, out);
+ GF_VALIDATE_OR_GOTO (this->name, fd, out);
+ GF_VALIDATE_OR_GOTO (this->name, entries, out);
+
+ frame->root->rsp_refs = NULL;
+
+ bfd = bdb_extract_bfd (fd, this);
+ op_errno = EBADFD;
+ GF_VALIDATE_OR_GOTO (this->name, bfd, out);
+
+ /* entry_path = "<directory path>/"; the buffer is zero-filled by CALLOC,
+ * so the string stays NUL-terminated after the '/' is stored */
+ real_path_len = strlen (bfd->path);
+ entry_path_len = real_path_len + 256;
+ entry_path = CALLOC (1, entry_path_len);
+ GF_VALIDATE_OR_GOTO (this->name, entry_path, out);
+
+ strcpy (entry_path, bfd->path);
+ entry_path[real_path_len] = '/';
+
+ trav = entries->next;
+ while (trav) {
+ /* full path of this entry: "<directory path>/<entry name>" */
+ char pathname[ZR_PATH_MAX] = {0,};
+ strcpy (pathname, entry_path);
+ strcat (pathname, trav->name);
+
+ if (S_ISDIR(trav->buf.st_mode)) {
+ /* If the entry is directory, create it by calling 'mkdir'. If
+ * directory is not present, it will be created, if its present,
+ * no worries even if it fails.
+ */
+ ret = mkdir (pathname, trav->buf.st_mode);
+ if ((ret == -1) && (errno != EEXIST)) {
+ gf_log (this->name,
+ GF_LOG_ERROR,
+ "failed to created directory %s: %s",
+ pathname, strerror(errno));
+ goto loop;
+ }
+
+ gf_log (this->name,
+ GF_LOG_DEBUG,
+ "Creating directory %s with mode (0%o)",
+ pathname,
+ trav->buf.st_mode);
+ /* Change the mode
+ * NOTE: setdents tries its best to restore the state
+ * of storage. if chmod and chown fail, they can be
+ * ignored now */
+ ret = chmod (pathname, trav->buf.st_mode);
+ if (ret != 0) {
+ op_ret = -1;
+ op_errno = errno;
+ gf_log (this->name,
+ GF_LOG_ERROR,
+ "chmod failed on %s (%s)",
+ pathname, strerror (errno));
+ goto loop;
+ }
+ /* change the ownership */
+ ret = chown (pathname, trav->buf.st_uid, trav->buf.st_gid);
+ if (ret != 0) {
+ op_ret = -1;
+ op_errno = errno;
+ gf_log (this->name,
+ GF_LOG_ERROR,
+ "chown failed on %s (%s)",
+ pathname, strerror (errno));
+ goto loop;
+ }
+ } else if ((flags == GF_SET_IF_NOT_PRESENT) ||
+ (flags != GF_SET_DIR_ONLY)) {
+ /* Create a 0 byte file here */
+ if (S_ISREG (trav->buf.st_mode)) {
+ op_ret = bdb_db_put (bfd->ctx, NULL,
+ trav->name, NULL, 0, 0, 0);
+ if (op_ret != 0) {
+ /* create failed */
+ gf_log (this->name,
+ GF_LOG_ERROR,
+ "failed to create file %s",
+ pathname);
+ } /* if (op_ret != 0) */
+ } else if (S_ISLNK (trav->buf.st_mode)) {
+ /* TODO: implement */;
+ } else {
+ gf_log (this->name,
+ GF_LOG_ERROR,
+ "storage/bdb allows to create regular files only"
+ "file %s (mode = %d) cannot be created",
+ pathname, trav->buf.st_mode);
+ } /* if(S_ISREG())...else */
+ } /* if(S_ISDIR())...else if */
+ loop:
+ /* consider the next entry */
+ trav = trav->next;
+ } /* while(trav) */
+
+out:
+ frame->root->rsp_refs = NULL;
+ STACK_UNWIND (frame, op_ret, op_errno);
+
+ FREE (entry_path);
+ return 0;
+}
+
+/* bdb_fstat - stat an open file.
+ *
+ * The attributes are taken from lstat() on the storage db file of the
+ * directory the fd belongs to; inode number, size and block count are
+ * then replaced with the values of the file itself (size is whatever
+ * bdb_db_get() returns for the fd's key).
+ *
+ * Unwinds with (op_ret, op_errno, struct stat *).
+ */
+int32_t
+bdb_fstat (call_frame_t *frame,
+           xlator_t *this,
+           fd_t *fd)
+{
+        struct bdb_fd *bfd      = NULL;
+        bctx_t        *bctx     = NULL;
+        char          *db_path  = NULL;
+        struct stat    stbuf    = {0,};
+        int32_t        op_errno = EINVAL;
+        int32_t        op_ret   = -1;
+
+        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
+        GF_VALIDATE_OR_GOTO ("bdb", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, fd, out);
+
+        bfd = bdb_extract_bfd (fd, this);
+        op_errno = EBADFD;
+        GF_VALIDATE_OR_GOTO (this->name, bfd, out);
+
+        bctx = bfd->ctx;
+        MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bctx->directory);
+
+        op_ret   = lstat (db_path, &stbuf);
+        op_errno = errno;
+        if (op_ret == 0) {
+                /* overlay the per-file values on the db file's attributes */
+                stbuf.st_ino    = fd->inode->ino;
+                stbuf.st_size   = bdb_db_get (bctx, NULL, bfd->key, NULL, 0, 0);
+                stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize);
+        } else {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to lstat on %s (%s)",
+                        db_path, strerror (op_errno));
+        }
+
+out:
+        frame->root->rsp_refs = NULL;
+
+        STACK_UNWIND (frame, op_ret, op_errno, &stbuf);
+        return 0;
+}
+
+
+/* bdb_readdir - read entries of directory @fd.
+ *
+ * Entries come from two sources, in this order:
+ *  1. the real directory on disk, via readdir() (bdb private files are
+ *     skipped);
+ *  2. the keys of the directory's db, via a cursor. bfd->offset remembers
+ *     the last key returned so that the next call can resume after it.
+ * Collection stops as soon as the next entry would exceed @size bytes.
+ *
+ * @frame: call frame.
+ * @this:  xlator_t of this instance of bdb xlator.
+ * @fd:    directory fd.
+ * @size:  maximum number of bytes worth of entries to return.
+ * @off:   not used by this function.
+ *
+ * Unwinds with (count, op_errno, entries), where count is the number of
+ * entries placed on the list.
+ */
+int32_t
+bdb_readdir (call_frame_t *frame,
+             xlator_t *this,
+             fd_t *fd,
+             size_t size,
+             off_t off)
+{
+        struct bdb_dir *bfd        = NULL;
+        int32_t         op_ret     = -1;
+        int32_t         op_errno   = EINVAL;
+        size_t          filled     = 0;
+        gf_dirent_t    *this_entry = NULL;
+        gf_dirent_t     entries;
+        struct dirent  *entry      = NULL;
+        off_t           in_case    = 0;
+        int32_t         this_size  = 0;
+        DBC            *cursorp    = NULL;
+        int32_t         count      = 0;
+
+        /* must happen before any 'goto out': the list is freed there */
+        INIT_LIST_HEAD (&entries.list);
+
+        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
+        GF_VALIDATE_OR_GOTO ("bdb", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, fd, out);
+
+        bfd = bdb_extract_bfd (fd, this);
+        op_errno = EBADFD;
+        GF_VALIDATE_OR_GOTO (this->name, bfd, out);
+
+        op_errno = ENOMEM;
+
+        /* phase 1: entries of the real directory */
+        while (filled <= size) {
+                this_entry = NULL;
+                entry      = NULL;
+                in_case    = 0;
+                this_size  = 0;
+
+                /* remember the position so the entry can be re-read by the
+                 * next call when it cannot be returned by this one */
+                in_case = telldir (bfd->dir);
+                entry = readdir (bfd->dir);
+                if (!entry)
+                        break;
+
+                if (IS_BDB_PRIVATE_FILE(entry->d_name))
+                        continue;
+
+                this_size = dirent_size (entry);
+
+                if (this_size + filled > size) {
+                        seekdir (bfd->dir, in_case);
+                        break;
+                }
+
+                this_entry = gf_dirent_for_name (entry->d_name);
+                if (!this_entry) {
+                        /* out of memory (op_errno is ENOMEM here) */
+                        seekdir (bfd->dir, in_case);
+                        goto out;
+                }
+
+                count++;
+
+                this_entry->d_ino = entry->d_ino;
+
+                this_entry->d_off = -1;
+
+                this_entry->d_type = entry->d_type;
+                this_entry->d_len = entry->d_reclen;
+
+                list_add (&this_entry->list, &entries.list);
+
+                filled += this_size;
+        }
+        op_ret = filled;
+        op_errno = 0;
+        if (filled >= size) {
+                goto out;
+        }
+
+        /* phase 2: room is left, continue with the keys stored in the db */
+        op_ret = bdb_cursor_open (bfd->ctx, &cursorp);
+        op_errno = EBADFD;
+        GF_VALIDATE_OR_GOTO (this->name, (op_ret == 0), out);
+
+        /* TODO: fix d_off, don't use bfd->offset. wrong method */
+        if (strlen (bfd->offset)) {
+                /* position the cursor on the last key handed out */
+                DBT key = {0,}, value = {0,};
+                key.data = bfd->offset;
+                key.size = strlen (bfd->offset);
+                key.flags = DB_DBT_USERMEM;
+                value.dlen = 0;
+                value.doff = 0;
+                value.flags = DB_DBT_PARTIAL;
+
+                op_ret = bdb_cursor_get (cursorp, &key, &value, DB_SET);
+                op_errno = EBADFD;
+                if (op_ret != 0) {
+                        /* do not leak the cursor on this error path */
+                        bdb_cursor_close (bfd->ctx, cursorp);
+                }
+                GF_VALIDATE_OR_GOTO (this->name, (op_ret == 0), out);
+
+        } else {
+                /* first time or last time, do nothing */
+        }
+
+        while (filled <= size) {
+                DBT key = {0,}, value = {0,};
+                this_entry = NULL;
+
+                /* key.data is allocated by the db for every successful get
+                 * and has to be released with free() */
+                key.flags = DB_DBT_MALLOC;
+                value.dlen = 0;
+                value.doff = 0;
+                value.flags = DB_DBT_PARTIAL;
+                op_ret = bdb_cursor_get (cursorp, &key, &value, DB_NEXT);
+
+                if (op_ret == DB_NOTFOUND) {
+                        /* we reached end of the directory */
+                        op_ret = 0;
+                        op_errno = 0;
+                        break;
+                } else if (op_ret != 0) {
+                        gf_log (this->name,
+                                GF_LOG_DEBUG,
+                                "database error during readdir");
+                        op_ret = -1;
+                        op_errno = ENOENT;
+                        break;
+                } /* if (op_ret == DB_NOTFOUND)...else if...else */
+
+                if (key.data == NULL) {
+                        /* NOTE: currently ignore when we get key.data == NULL.
+                         * TODO: we should not get key.data = NULL */
+                        gf_log (this->name,
+                                GF_LOG_DEBUG,
+                                "null key read from db");
+                        continue;
+                }/* if(key.data)...else */
+
+                this_size = bdb_dirent_size (&key);
+                if (this_size + filled > size) {
+                        /* does not fit: it is neither counted nor recorded
+                         * in bfd->offset, so the next call returns it */
+                        free (key.data);
+                        break;
+                }
+                /* TODO - consider endianness here */
+                this_entry = gf_dirent_for_name ((const char *)key.data);
+                if (!this_entry) {
+                        /* out of memory: return what was collected so far */
+                        free (key.data);
+                        break;
+                }
+                count++;
+                /* FIXME: bug, if someone is going to use ->d_ino */
+                this_entry->d_ino = -1;
+                this_entry->d_off = 0;
+                this_entry->d_type = 0;
+                this_entry->d_len = key.size;
+
+                /* remember where to resume on the next call */
+                strncpy (bfd->offset, key.data, key.size);
+                bfd->offset [key.size] = '\0';
+                free (key.data);
+
+                list_add (&this_entry->list, &entries.list);
+
+                filled += this_size;
+        }/* while */
+        bdb_cursor_close (bfd->ctx, cursorp);
+        op_ret = filled;
+        op_errno = 0;
+out:
+        frame->root->rsp_refs = NULL;
+        gf_log (this->name,
+                GF_LOG_DEBUG,
+                "read %"GF_PRI_SIZET" bytes for %d entries", filled, count);
+        STACK_UNWIND (frame, count, op_errno, &entries);
+
+        gf_dirent_free (&entries);
+
+        return 0;
+}
+
+
+/* bdb_stats - report xlator statistics.
+ *
+ * Disk figures come from statvfs() on the export path; read/write usage is
+ * derived from the counters kept in the private structure and the time
+ * elapsed since init and since the previous call. The per-interval
+ * counters are reset on every successful call.
+ *
+ * Unwinds with (op_ret, op_errno, struct xlator_stats *).
+ */
+int32_t
+bdb_stats (call_frame_t *frame,
+           xlator_t *this,
+           int32_t flags)
+
+{
+        struct xlator_stats  xlstats    = {0, };
+        struct xlator_stats *stats      = NULL;
+        struct bdb_private  *private    = NULL;
+        struct statvfs       vfs;
+        struct timeval       now;
+        int64_t              elapsed_ms = 0;
+        int64_t              avg_read   = 0;
+        int64_t              avg_write  = 0;
+        int32_t              op_ret     = 0;
+        int32_t              op_errno   = 0;
+
+        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
+        GF_VALIDATE_OR_GOTO ("bdb", this, out);
+
+        stats   = &xlstats;
+        private = (struct bdb_private *)(this->private);
+
+        op_ret   = statvfs (private->export_path, &vfs);
+        op_errno = errno;
+        if (op_ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to statvfs on %s (%s)",
+                        private->export_path, strerror (op_errno));
+                goto out;
+        }
+
+        stats->nr_files        = private->stats.nr_files;
+        /* client info is maintained at FSd */
+        stats->nr_clients      = private->stats.nr_clients;
+        /* free blocks, total blocks and used blocks, in bytes */
+        stats->free_disk       = vfs.f_bfree * vfs.f_bsize;
+        stats->total_disk_size = vfs.f_blocks * vfs.f_bsize;
+        stats->disk_usage      = (vfs.f_blocks - vfs.f_bavail) * vfs.f_bsize;
+
+        /* Calculate read and write usage */
+        gettimeofday (&now, NULL);
+
+        /* averages over the whole lifetime of the xlator */
+        elapsed_ms = (now.tv_sec - private->init_time.tv_sec) * 1000 +
+                ((now.tv_usec - private->init_time.tv_usec) / 1000);
+
+        if (elapsed_ms) {
+                avg_read  = private->read_value / elapsed_ms;   /* KBps */
+                avg_write = private->write_value / elapsed_ms;  /* KBps */
+        }
+
+        /* peaks, measured over the interval since the previous call */
+        elapsed_ms = (now.tv_sec - private->prev_fetch_time.tv_sec) * 1000 +
+                ((now.tv_usec - private->prev_fetch_time.tv_usec) / 1000);
+
+        if (elapsed_ms) {
+                if ((private->interval_read / elapsed_ms) > private->max_read) {
+                        private->max_read = (private->interval_read / elapsed_ms);
+                }
+                if ((private->interval_write / elapsed_ms) > private->max_write) {
+                        private->max_write = private->interval_write / elapsed_ms;
+                }
+        }
+
+        stats->read_usage  = avg_read / private->max_read;
+        stats->write_usage = avg_write / private->max_write;
+
+        /* start a new measuring interval */
+        gettimeofday (&(private->prev_fetch_time), NULL);
+        private->interval_read  = 0;
+        private->interval_write = 0;
+
+out:
+        frame->root->rsp_refs = NULL;
+        STACK_UNWIND (frame, op_ret, op_errno, stats);
+        return 0;
+}
+
+
+/* bdb_inodelk - inode locks are not provided by storage/bdb.
+ *
+ * Logs that the posix-locks translator is missing and unwinds with
+ * -1/ENOSYS.
+ */
+int32_t
+bdb_inodelk (call_frame_t *frame, xlator_t *this,
+             loc_t *loc, int32_t cmd, struct flock *lock)
+{
+        gf_log (this->name, GF_LOG_CRITICAL,
+                "\"features/posix-locks\" translator is not loaded. You need to use it for proper functioning of GlusterFS");
+
+        frame->root->rsp_refs = NULL;
+        STACK_UNWIND (frame, -1, ENOSYS);
+
+        return 0;
+}
+
+
+/* bdb_finodelk - fd based inode locks are not provided by storage/bdb.
+ *
+ * Logs that the posix-locks translator is missing and unwinds with
+ * -1/ENOSYS.
+ */
+int32_t
+bdb_finodelk (call_frame_t *frame, xlator_t *this,
+              fd_t *fd, int32_t cmd, struct flock *lock)
+{
+        gf_log (this->name, GF_LOG_CRITICAL,
+                "\"features/posix-locks\" translator is not loaded. You need to use it for proper functioning of GlusterFS");
+
+        frame->root->rsp_refs = NULL;
+        STACK_UNWIND (frame, -1, ENOSYS);
+
+        return 0;
+}
+
+
+/* bdb_entrylk - entry locks are not provided by storage/bdb.
+ *
+ * Logs that the posix-locks translator is missing and unwinds with
+ * -1/ENOSYS.
+ */
+int32_t
+bdb_entrylk (call_frame_t *frame, xlator_t *this,
+             loc_t *loc, const char *basename, entrylk_cmd cmd,
+             entrylk_type type)
+{
+        gf_log (this->name, GF_LOG_CRITICAL,
+                "\"features/posix-locks\" translator is not loaded. You need to use it for proper functioning of GlusterFS");
+
+        frame->root->rsp_refs = NULL;
+        STACK_UNWIND (frame, -1, ENOSYS);
+
+        return 0;
+}
+
+
+/* bdb_fentrylk - fd based entry locks are not provided by storage/bdb.
+ *
+ * Logs that the posix-locks translator is missing and unwinds with
+ * -1/ENOSYS.
+ */
+int32_t
+bdb_fentrylk (call_frame_t *frame, xlator_t *this,
+              fd_t *fd, const char *basename, entrylk_cmd cmd,
+              entrylk_type type)
+{
+        gf_log (this->name, GF_LOG_CRITICAL,
+                "\"features/posix-locks\" translator is not loaded. You need to use it for proper functioning of GlusterFS");
+
+        frame->root->rsp_refs = NULL;
+        STACK_UNWIND (frame, -1, ENOSYS);
+
+        return 0;
+}
+
+
+/* bdb_checksum - compute name checksums for directory @loc.
+ *
+ * Two byte arrays are produced by XOR-ing names position by position:
+ *  - dir_checksum:  names returned by readdir() on the real directory
+ *                   (bdb private files excluded);
+ *  - file_checksum: keys stored in the directory's db.
+ *
+ * Unwinds with (op_ret, op_errno, file_checksum, dir_checksum).
+ */
+int32_t
+bdb_checksum (call_frame_t *frame,
+              xlator_t *this,
+              loc_t *loc,
+              int32_t flag)
+{
+        uint8_t        file_checksum[ZR_FILENAME_MAX] = {0,};
+        uint8_t        dir_checksum[ZR_FILENAME_MAX]  = {0,};
+        char          *real_path = NULL;
+        DIR           *dir       = NULL;
+        struct dirent *dent      = NULL;
+        bctx_t        *bctx      = NULL;
+        DBC           *cursorp   = NULL;
+        char          *key_bytes = NULL;
+        int32_t        op_ret    = -1;
+        int32_t        op_errno  = EINVAL;
+        int32_t        idx       = 0;
+        int32_t        len       = 0;
+
+        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
+        GF_VALIDATE_OR_GOTO ("bdb", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, loc, out);
+
+        MAKE_REAL_PATH (real_path, this, loc->path);
+
+        /* part 1: names found in the real directory */
+        dir = opendir (real_path);
+        op_errno = errno;
+        GF_VALIDATE_OR_GOTO (this->name, dir, out);
+
+        for (dent = readdir (dir); dent != NULL; dent = readdir (dir)) {
+                if (IS_BDB_PRIVATE_FILE(dent->d_name))
+                        continue;
+
+                len = strlen (dent->d_name);
+                for (idx = 0; idx < len; idx++)
+                        dir_checksum[idx] ^= dent->d_name[idx];
+        }
+        closedir (dir);
+
+        /* part 2: keys found in the directory's db */
+        bctx = bctx_lookup (B_TABLE(this), (char *)loc->path);
+        op_errno = EINVAL;
+        GF_VALIDATE_OR_GOTO (this->name, bctx, out);
+
+        op_ret = bdb_cursor_open (bctx, &cursorp);
+        op_errno = EINVAL;
+        GF_VALIDATE_OR_GOTO (this->name, (op_ret == 0), out);
+
+        for (;;) {
+                DBT key = {0,}, value = {0,};
+
+                key.flags  = DB_DBT_MALLOC;
+                value.doff = 0;
+                value.dlen = 0;
+                op_ret = bdb_cursor_get (cursorp, &key, &value, DB_NEXT);
+
+                if (op_ret == 0) {
+                        /* successfully read: fold the key in and go on */
+                        key_bytes = key.data;
+                        len = key.size;
+                        for (idx = 0; idx < len; idx++)
+                                file_checksum[idx] ^= key_bytes[idx];
+
+                        free (key.data);
+                        continue;
+                }
+
+                if (op_ret == DB_NOTFOUND) {
+                        gf_log (this->name,
+                                GF_LOG_DEBUG,
+                                "end of list of key/value pair in db for "
+                                "directory: %s", bctx->directory);
+                        op_ret = 0;
+                        op_errno = 0;
+                } else {
+                        gf_log (this->name,
+                                GF_LOG_ERROR,
+                                "failed to do cursor get for directory %s: %s",
+                                bctx->directory, db_strerror (op_ret));
+                        op_ret = -1;
+                        op_errno = ENOENT;
+                }
+                break;
+        }
+        bdb_cursor_close (bctx, cursorp);
+
+out:
+        if (bctx) {
+                /* NOTE: bctx_unref always returns success,
+                 * see description of bctx_unref for more details */
+                bctx_unref (bctx);
+        }
+
+        frame->root->rsp_refs = NULL;
+        STACK_UNWIND (frame, op_ret, op_errno, file_checksum, dir_checksum);
+
+        return 0;
+}
+
+/**
+ * notify - answer GF_EVENT_PARENT_UP with a GF_EVENT_CHILD_UP event sent
+ *          through default_notify(); every other event is ignored.
+ */
+int32_t
+notify (xlator_t *this,
+        int32_t event,
+        void *data,
+        ...)
+{
+        if (event == GF_EVENT_PARENT_UP) {
+                /* Tell the parent that bdb xlator is up */
+                assert ((this->private != NULL) &&
+                        (BDB_ENV(this) != NULL));
+                default_notify (this, GF_EVENT_CHILD_UP, data);
+        }
+
+        return 0;
+}
+
+
+
+/**
+ * init - initialise the bdb xlator.
+ *
+ * Validates the volume configuration (no subvolumes, a 'directory' option
+ * naming an existing directory), allocates the private structure, calls
+ * bdb_db_init() and looks up the bctx of the root directory ("/").
+ *
+ * @this: xlator_t of this instance of bdb xlator.
+ *
+ * Returns 0 on success and -1 on any failure.
+ */
+int32_t
+init (xlator_t *this)
+{
+        int32_t             ret       = -1;
+        struct stat         buf       = {0,};
+        struct bdb_private *_private  = NULL;
+        data_t             *directory = NULL;
+        bctx_t             *bctx      = NULL;
+
+        GF_VALIDATE_OR_GOTO ("bdb", this, out);
+
+        _private = CALLOC (1, sizeof (*_private));
+        GF_VALIDATE_OR_GOTO (this->name, _private, out);
+
+        if (this->children) {
+                gf_log (this->name,
+                        GF_LOG_ERROR,
+                        "FATAL: storage/bdb cannot have subvolumes");
+                FREE (_private);
+                goto out;
+        }
+
+        if (!this->parents) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "dangling volume. check volfile ");
+        }
+
+        directory = dict_get (this->options, "directory");
+        if (!directory) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "export directory not specified in volfile");
+                FREE (_private);
+                goto out;
+        }
+        umask (000); // umask `masking' is done at the client side
+
+        /* The export directory is never created here; the administrator
+         * has to provide it. Refuse to start if it is missing or is not
+         * a directory. */
+        ret = stat (directory->data, &buf);
+        if ((ret != 0) || !S_ISDIR (buf.st_mode)) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "specified directory '%s' doesn't exists, Exiting", directory->data);
+                FREE (_private);
+                /* stat() succeeds on a non-directory: make sure that
+                 * case is reported as a failure too */
+                ret = -1;
+                goto out;
+        }
+
+        _private->export_path = strdup (directory->data);
+        if (!_private->export_path) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to duplicate export path: out of memory");
+                FREE (_private);
+                ret = -1;
+                goto out;
+        }
+        _private->export_path_length = strlen (_private->export_path);
+
+        {
+                /* Stats related variables */
+                gettimeofday (&_private->init_time, NULL);
+                gettimeofday (&_private->prev_fetch_time, NULL);
+                /* start at 1: bdb_stats divides by these */
+                _private->max_read = 1;
+                _private->max_write = 1;
+        }
+
+        this->private = (void *)_private;
+
+        ret = bdb_db_init (this, this->options);
+        if (ret == -1) {
+                gf_log (this->name,
+                        GF_LOG_DEBUG,
+                        "failed to initialize database");
+                goto out;
+        }
+
+        bctx = bctx_lookup (_private->b_table, "/");
+        /* NOTE: we are not doing bctx_unref() for root bctx,
+         * let it remain in active list forever */
+        if (!bctx) {
+                gf_log (this->name,
+                        GF_LOG_ERROR,
+                        "failed to allocate memory for root (/) bctx: out of memory");
+                /* bdb_db_init() succeeded, so ret does not signal the
+                 * failure unless it is set here */
+                ret = -1;
+                goto out;
+        }
+
+        ret = 0;
+out:
+        return ret;
+}
+
+/* bctx_cleanup - detach every bctx on list @head and close its db.
+ *
+ * The db handle is taken out of the bctx and the bctx is unlinked from the
+ * list while holding the bctx lock; the handle is closed after the lock
+ * has been released.
+ */
+void
+bctx_cleanup (struct list_head *head)
+{
+        bctx_t *ctx  = NULL;
+        bctx_t *next = NULL;
+        DB     *db   = NULL;
+
+        list_for_each_entry_safe (ctx, next, head, list) {
+                LOCK (&ctx->lock);
+                db = ctx->dbp;
+                ctx->dbp = NULL;
+                list_del_init (&ctx->list);
+                UNLOCK (&ctx->lock);
+
+                if (db == NULL)
+                        continue;
+
+                db->close (db, 0);
+        }
+
+        return;
+}
+
+/* fini - tear down the bdb xlator.
+ *
+ * Closes the dbs of every bctx on the lru list and in the hash buckets,
+ * clears the 'active' flag and joins the checkpoint thread, closes the db
+ * environment and releases the bctx table and the private structure.
+ */
+void
+fini (xlator_t *this)
+{
+        struct bdb_private *private  = this->private;
+        int32_t             bucket   = 0;
+        int32_t             join_ret = 0;
+
+        if (B_TABLE(this)) {
+                /* close all the dbs from lru list */
+                bctx_cleanup (&(B_TABLE(this)->b_lru));
+                /* ... and from every hash bucket */
+                for (bucket = 0; bucket < B_TABLE(this)->hash_size; bucket++)
+                        bctx_cleanup (&(B_TABLE(this)->b_hash[bucket]));
+
+                if (!BDB_ENV(this)) {
+                        /* impossible to reach here */
+                } else {
+                        LOCK (&private->active_lock);
+                        private->active = 0;
+                        UNLOCK (&private->active_lock);
+
+                        join_ret = pthread_join (private->checkpoint_thread, NULL);
+                        if (join_ret != 0) {
+                                gf_log (this->name,
+                                        GF_LOG_CRITICAL,
+                                        "failed to join checkpoint thread");
+                        }
+
+                        /* TODO: pick each of the 'struct bctx' from private->b_hash
+                         * and close all the databases that are open */
+                        BDB_ENV(this)->close (BDB_ENV(this), 0);
+                }
+
+                FREE (B_TABLE(this));
+        }
+        FREE (private);
+        return;
+}
+
+/* Management operations implemented by storage/bdb: only 'stats'. */
+struct xlator_mops mops = {
+ .stats = bdb_stats,
+};
+
+/* File operation table of storage/bdb: maps each fop to its bdb_*
+ * implementation in this file. */
+struct xlator_fops fops = {
+ .lookup = bdb_lookup,
+ .stat = bdb_stat,
+ .opendir = bdb_opendir,
+ .readdir = bdb_readdir,
+ .readlink = bdb_readlink,
+ .mknod = bdb_mknod,
+ .mkdir = bdb_mkdir,
+ .unlink = bdb_unlink,
+ .rmdir = bdb_rmdir,
+ .symlink = bdb_symlink,
+ .rename = bdb_rename,
+ .link = bdb_link,
+ .chmod = bdb_chmod,
+ .chown = bdb_chown,
+ .truncate = bdb_truncate,
+ .utimens = bdb_utimens,
+ .create = bdb_create,
+ .open = bdb_open,
+ .readv = bdb_readv,
+ .writev = bdb_writev,
+ .statfs = bdb_statfs,
+ .flush = bdb_flush,
+ .fsync = bdb_fsync,
+ .setxattr = bdb_setxattr,
+ .getxattr = bdb_getxattr,
+ .removexattr = bdb_removexattr,
+ .fsyncdir = bdb_fsyncdir,
+ .access = bdb_access,
+ .ftruncate = bdb_ftruncate,
+ .fstat = bdb_fstat,
+ .lk = bdb_lk,
+ /* the four lock fops below are stubs that unwind with ENOSYS */
+ .inodelk = bdb_inodelk,
+ .finodelk = bdb_finodelk,
+ .entrylk = bdb_entrylk,
+ .fentrylk = bdb_fentrylk,
+ .fchown = bdb_fchown,
+ .fchmod = bdb_fchmod,
+ .setdents = bdb_setdents,
+ .getdents = bdb_getdents,
+ .checksum = bdb_checksum,
+};
+
+/* Callbacks of storage/bdb for the release of file and directory fds. */
+struct xlator_cbks cbks = {
+ .release = bdb_release,
+ .releasedir = bdb_releasedir
+};
+
+/* Volume option table. The whole table is compiled out (#if 0), so none
+ * of these declarations is in effect in this file. */
+#if 0
+struct volume_options options[] = {
+ { "directory", GF_OPTION_TYPE_PATH, 0, },
+ { "logdir", GF_OPTION_TYPE_PATH, 0, },
+ { "errfile", GF_OPTION_TYPE_PATH, 0, },
+ { "dir-mode", GF_OPTION_TYPE_ANY, 0, }, // base 8 number
+ { "file-mode", GF_OPTION_TYPE_ANY, 0, }, // base 8 number
+ { "page-size", GF_OPTION_TYPE_SIZET, -1, },
+ { "lru-limit", GF_OPTION_TYPE_INT, -1, },
+ { "lock-timeout", GF_OPTION_TYPE_TIME, 0, },
+ { "checkpoint-timeout", GF_OPTION_TYPE_TIME, 0, },
+ { "transaction-timeout", GF_OPTION_TYPE_TIME, 0, },
+ { "mode", GF_OPTION_TYPE_BOOL, 0, }, // Should be 'cache' ??
+ { "access-mode", GF_OPTION_TYPE_STR, 0, 0, 0, "btree"},
+ { NULL, 0, }
+};
+
+#endif /* #if 0 */
diff --git a/xlators/storage/bdb/src/bdb.h b/xlators/storage/bdb/src/bdb.h
new file mode 100644
index 00000000000..f2d962680dd
--- /dev/null
+++ b/xlators/storage/bdb/src/bdb.h
@@ -0,0 +1,439 @@
+/*
+ Copyright (c) 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _BDB_H
+#define _BDB_H
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <sys/types.h>
+#include <dirent.h>
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <db.h>
+
+#ifdef linux
+#ifdef __GLIBC__
+#include <sys/fsuid.h>
+#else
+#include <unistd.h>
+#endif
+#endif
+
+#ifdef HAVE_SYS_XATTR_H
+#include <sys/xattr.h>
+#endif
+
+#ifdef HAVE_SYS_EXTATTR_H
+#include <sys/extattr.h>
+#endif
+
+#include <pthread.h>
+#include "xlator.h"
+#include "inode.h"
+#include "compat.h"
+#include "compat-errno.h"
+
+/* file name (with leading '/') of the storage database; it is appended to
+ * a directory's real path by MAKE_REAL_PATH_TO_STORAGE_DB */
+#define GLFS_BDB_STORAGE                "/glusterfs_storage.db"
+
+/* readable names for boolean settings such as 'cache' and 'transaction' */
+#define ON                              1
+#define OFF                             0
+
+/* NOTE(review): presumably the defaults for bctx_table.lru_limit and
+ * bctx_table.hash_size - confirm against the table initialisation code */
+#define BDB_DEFAULT_LRU_LIMIT           100
+#define BDB_DEFAULT_HASH_SIZE           100
+
+/* NOTE(review): unit and exact use are not visible in this header */
+#define BDB_ENOSPC_THRESHOLD            25600
+
+/* default for 'option checkpoint-timeout', which is given in seconds */
+#define BDB_DEFAULT_CHECKPOINT_TIMEOUT  30
+
+/* BCTX_ENV(bctx)
+ * the DB_ENV of the table that owns @bctx (bctx->table->dbenv).
+ * The argument is parenthesised so that expressions such as 'ctx + 1'
+ * or '*pctx' expand correctly.
+ */
+#define BCTX_ENV(bctx) ((bctx)->table->dbenv)
+/* MAKE_REAL_PATH(var,this,path)
+ * make the real path on the underlying file-system
+ *
+ * @var: destination to hold the real path (assigned an alloca()ed buffer,
+ *       so it is only valid until the calling function returns)
+ * @this: pointer to xlator_t corresponding to bdb xlator
+ * @path: path, as seen from mount-point
+ *
+ * The result is 'export_path' followed by @path. @this and @path are
+ * evaluated exactly once; the macro's locals carry an 'mrp_' prefix so
+ * they cannot capture an identifier used in the arguments.
+ */
+#define MAKE_REAL_PATH(var, this, path) do { \
+                struct bdb_private *mrp_priv_ = (struct bdb_private *)(this)->private; \
+                const char *mrp_path_ = (path); \
+                int mrp_base_len_ = mrp_priv_->export_path_length; \
+                /* + 2 keeps the original head-room, which covers the NUL */ \
+                (var) = alloca (strlen (mrp_path_) + mrp_base_len_ + 2); \
+                strcpy ((var), mrp_priv_->export_path); \
+                strcpy (&(var)[mrp_base_len_], mrp_path_); \
+        } while (0)
+
+/* MAKE_REAL_PATH_TO_STORAGE_DB(var,this,path)
+ * make the real path to the storage-database file on file-system
+ *
+ * @var: destination to hold the real path (assigned an alloca()ed buffer,
+ *       so it is only valid until the calling function returns)
+ * @this: pointer to xlator_t corresponding to bdb xlator
+ * @path: path of the directory, as seen from mount-point
+ *
+ * The result is 'export_path' + @path + GLFS_BDB_STORAGE. The buffer is
+ * sized with sizeof (GLFS_BDB_STORAGE), which counts the terminating NUL;
+ * sizing it with strlen() left the buffer one byte short.
+ */
+#define MAKE_REAL_PATH_TO_STORAGE_DB(var, this, path) do { \
+                struct bdb_private *mrps_priv_ = (struct bdb_private *)(this)->private; \
+                const char *mrps_path_ = (path); \
+                size_t mrps_path_len_ = strlen (mrps_path_); \
+                int mrps_base_len_ = mrps_priv_->export_path_length; \
+                (var) = alloca (mrps_path_len_ + mrps_base_len_ + \
+                                sizeof (GLFS_BDB_STORAGE)); \
+                strcpy ((var), mrps_priv_->export_path); \
+                strcpy (&(var)[mrps_base_len_], mrps_path_); \
+                strcpy (&(var)[mrps_base_len_ + mrps_path_len_], \
+                        GLFS_BDB_STORAGE); \
+        } while (0)
+
+/* MAKE_KEY_FROM_PATH(key,path)
+ * make a 'key', which we use as key in the underlying database by using the path
+ *
+ * @key: destination to hold the key; it receives whatever basename()
+ *       returns for a private copy of @path
+ * @path: path to file as seen from mount-point
+ *
+ * The copy is made in alloca()ed memory that includes room for the
+ * terminating NUL (the buffer used to be one byte short), so @key must not
+ * be used after the calling function returns.
+ *
+ * NOTE: the ';' after 'while (0)' is kept on purpose; call sites are not
+ * visible from this header and may rely on it.
+ */
+#define MAKE_KEY_FROM_PATH(key, path) do { \
+                const char *mkfp_path_ = (path); \
+                char *mkfp_copy_ = alloca (strlen (mkfp_path_) + 1); \
+                strcpy (mkfp_copy_, mkfp_path_); \
+                (key) = basename (mkfp_copy_); \
+        }while (0);
+
+/* BDB_DO_LSTAT(path,stbuf,dirent)
+ * construct real-path to a dirent and do lstat on the real-path
+ *
+ * @path: path to the directory whose readdir is currently in progress
+ * @stbuf: a 'struct stat *'
+ * @dirent: a 'struct dirent *'
+ *
+ * The lstat() result is stored in a variable named 'ret' that the caller
+ * must have in scope. The path is built with snprintf() so that it can
+ * never overrun the GF_PATH_MAX buffer; when "<path>/<d_name>" does not
+ * fit, lstat() is not attempted, 'ret' is set to -1 and errno to
+ * ENAMETOOLONG.
+ *
+ * NOTE: the ';' after 'while(0)' is kept on purpose; call sites are not
+ * visible from this header and may rely on it.
+ */
+#define BDB_DO_LSTAT(path, stbuf, dirent) do { \
+                char bdl_real_path_[GF_PATH_MAX]; \
+                int bdl_len_ = snprintf (bdl_real_path_, \
+                                         sizeof (bdl_real_path_), "%s/%s", \
+                                         (path), (dirent)->d_name); \
+                if (bdl_len_ < 0 || \
+                    (size_t) bdl_len_ >= sizeof (bdl_real_path_)) { \
+                        errno = ENAMETOOLONG; \
+                        ret = -1; \
+                } else { \
+                        ret = lstat (bdl_real_path_, (stbuf)); \
+                } \
+        } while(0);
+
+/* IS_BDB_PRIVATE_FILE(name)
+ * non-zero when 'name' is one of the file names reserved for the bdb
+ * xlator's own use
+ *
+ * @name: basename of a file.
+ *
+ * Reserved are the exact names 'glusterfs_storage.db' and
+ * 'glusterfs_ns.db' (used by bdb xlator itself), and every name that
+ * begins with '__db.' or 'log.0000' (used by libdb).
+ */
+#define IS_BDB_PRIVATE_FILE(name) \
+        ((strcmp ((name), "glusterfs_storage.db") == 0) || \
+         (strcmp ((name), "glusterfs_ns.db") == 0) || \
+         (strncmp ((name), "__db.", 5) == 0) || \
+         (strncmp ((name), "log.0000", 8) == 0))
+
+/* check if 'name' is exactly the '.' or the '..' entry.
+ * The whole string is compared: matching only the first character(s)
+ * would also accept every name that merely starts with a dot, such as
+ * '.profile' or '..data'. */
+#define IS_DOT_DOTDOT(name) ((strcmp ((name), ".") == 0) || (strcmp ((name), "..") == 0))
+
+/* BDB_SET_BCTX(this,inode,bctx)
+ * put a stamp on inode: the pointer to 'struct bdb_ctx' is stored in the
+ * inode's ctx of all directories.
+ * this will happen either in lookup() or mkdir().
+ *
+ * @this: pointer xlator_t of bdb xlator.
+ * @inode: inode where 'struct bdb_ctx *' has to be stored.
+ * @bctx: a 'struct bdb_ctx *'
+ *
+ * The pointer is widened through 'long' to the uint64_t that
+ * inode_ctx_put() takes. Every argument is parenthesised so that the cast
+ * applies to the whole @bctx expression, not just to its first operand.
+ *
+ * NOTE: the ';' after 'while (0)' is kept on purpose; call sites are not
+ * visible from this header and may rely on it.
+ */
+#define BDB_SET_BCTX(this,inode,bctx) do{ \
+                inode_ctx_put ((inode), (this), (uint64_t)(long)(bctx)); \
+        }while (0);
+
+/* MAKE_BCTX_FROM_INODE(this,bctx,inode)
+ * extract bdb xlator's 'struct bdb_ctx *' from an inode's ctx.
+ * valid only if done for directory inodes; otherwise @bctx is left
+ * untouched, so callers should initialise it to NULL.
+ *
+ * @this: pointer xlator_t of bdb xlator.
+ * @bctx: a 'struct bdb_ctx *'
+ * @inode: inode from where 'struct bdb_ctx *' has to be extracted.
+ *
+ * @bctx is assigned only when inode_ctx_get() itself returns 0. The
+ * result of the call used to be discarded and an unrelated 'ret' variable
+ * of the caller was tested instead, so the outcome depended on whatever
+ * value 'ret' happened to hold.
+ *
+ * NOTE: the ';' after 'while (0)' is kept on purpose; call sites are not
+ * visible from this header and may rely on it.
+ */
+#define MAKE_BCTX_FROM_INODE(this,bctx,inode) do{ \
+                uint64_t mbfi_value_ = 0; \
+                if (inode_ctx_get ((inode), (this), &mbfi_value_) == 0) \
+                        (bctx) = (void *)(long)mbfi_value_; \
+        }while (0);
+
+/* BDB_SET_BFD(this,fd,bfd)
+ * store the pointer @bfd in the ctx of @fd on behalf of xlator @this.
+ *
+ * The pointer is widened through 'long' to the uint64_t that fd_ctx_set()
+ * takes. Every argument is parenthesised so that the cast applies to the
+ * whole @bfd expression.
+ *
+ * NOTE: the ';' after 'while (0)' is kept on purpose; call sites are not
+ * visible from this header and may rely on it.
+ */
+#define BDB_SET_BFD(this,fd,bfd) do{ \
+                fd_ctx_set ((fd), (this), (uint64_t)(long)(bfd)); \
+        }while (0);
+
+/* upper bound on the number of databases the bdb xlator will ever keep
+ * open */
+#define BDB_MAX_OPEN_DBS        100
+
+/* convert file size to block-count: ceil (size / blksize) - 1.
+ * Both arguments are parenthesised so that expressions (for example
+ * 'a + b' as block size) are divided as a whole.
+ * NOTE(review): the trailing '- 1' is kept from the original formula;
+ * with signed operands a size of 0 therefore evaluates to -1. */
+#define BDB_COUNT_BLOCKS(size,blksize) ((((size) + (blksize) - 1) / (blksize)) - 1)
+
+/* file permissions, again macros are more readable */
+#define RWXRWXRWX               0777
+#define DEFAULT_FILE_MODE       0644
+#define DEFAULT_DIR_MODE        0755
+
+/* non-zero when @mode contains nothing but rwx permission bits (i.e. no
+ * bit outside RWXRWXRWX is set). @mode is parenthesised so that an
+ * expression such as 'a | b' is masked as a whole, and the parentheses of
+ * IS_VALID_DIR_MODE are balanced (one closing parenthesis was missing). */
+#define IS_VALID_FILE_MODE(mode) (!((mode) & (~RWXRWXRWX)))
+#define IS_VALID_DIR_MODE(mode)  (!((mode) & (~RWXRWXRWX)))
+
+/* how many times a failed transactional operation is retried at most */
+#define BDB_MAX_RETRIES         10
+
+/* short aliases for the structures declared below */
+typedef struct bdb_private      bdb_private_t;
+typedef struct bdb_cache        bdb_cache_t;
+typedef struct bdb_ctx          bctx_t;
+typedef struct bctx_table       bctx_table_t;
+
+struct bctx_table { /* bookkeeping shared by every 'struct bdb_ctx':
+ * the hash/active/lru lists they are kept on and the
+ * database settings that apply to all of them */
+ uint64_t dbflags; /* flags to be used for opening each database */
+ uint64_t cache; /* cache: can be either ON or OFF */
+ gf_lock_t lock; /* used to lock the 'struct bctx_table *' */
+ gf_lock_t checkpoint_lock; /* lock for checkpointing */
+ struct list_head *b_hash; /* hash table of 'struct bdb_ctx' */
+ struct list_head active; /* list of active 'struct bdb_ctx' */
+ struct list_head b_lru; /* lru list of inactive 'struct bdb_ctx' */
+ struct list_head purge; /* NOTE(review): presumably 'struct bdb_ctx's
+ * waiting to be destroyed - confirm in bctx.c */
+ uint32_t lru_limit; /* presumably the maximum length of 'b_lru' - TODO confirm */
+ uint32_t lru_size; /* presumably the current length of 'b_lru' - TODO confirm */
+ uint32_t hash_size; /* presumably the number of 'b_hash' buckets - TODO confirm */
+ DBTYPE access_mode; /* access mode for accessing the databases,
+ * can be DB_HASH, DB_BTREE */
+ DB_ENV *dbenv; /* DB_ENV under which every db operation
+ * is carried out (see BCTX_ENV) */
+ int32_t transaction; /* presumably ON or OFF, like bdb_private.transaction
+ * - TODO confirm */
+ xlator_t *this; /* xlator this table belongs to */
+
+ uint64_t page_size; /* page-size of DB,
+ * DB->set_pagesize(), should be set before DB->open */
+};
+
+struct bdb_ctx { /* per-directory context: the directory's open database
+ * plus the bookkeeping that links it into a
+ * 'struct bctx_table' */
+ /* controller members */
+ struct list_head list; /* lru list of 'struct bdb_ctx's,
+ * a bdb_ctx can exist in one of b_hash or lru lists */
+ struct list_head b_hash; /* directory 'name' hashed list of 'struct bdb_ctx's */
+
+ struct bctx_table *table; /* table this context belongs to; BCTX_ENV reads
+ * table->dbenv through it */
+ int32_t ref; /* reference count */
+ gf_lock_t lock; /* used to lock this 'struct bdb_ctx' */
+
+ char *directory; /* directory path */
+ DB *dbp; /* pointer to open database, that resides inside this directory */
+ uint32_t cache; /* cache ON or OFF */
+
+ /* per directory cache, bdb xlator's internal cache */
+ struct list_head c_list; /* linked list of cached records ('struct bdb_cache') */
+ int32_t c_count; /* number of cached records */
+
+ int32_t key_hash; /* index to hash table list, to which this ctx belongs */
+ char *db_path; /* absolute path to db file */
+};
+
+struct bdb_fd { /* state kept for a file: where its record lives
+ * (parent directory context + key) and how it was opened.
+ * NOTE(review): presumably what BDB_SET_BFD stores for
+ * file fds - confirm in bdb.c */
+ struct bdb_ctx *ctx; /* pointer to bdb_ctx of the parent directory */
+ char *key; /* name of the file. NOTE: basename, not the complete path */
+ int32_t flags; /* open flags */
+};
+
+struct bdb_dir { /* state kept for an open directory */
+ struct bdb_ctx *ctx; /* pointer to bdb_ctx of this directory */
+ DIR *dir; /* open directory pointer, as returned by opendir() */
+ char offset[NAME_MAX]; /* FIXME: readdir offset, too crude. must go */
+ char *path; /* path to this directory */
+};
+
+/* cache: one cached record (a file's name and content), linked into the
+ * 'c_list' of a 'struct bdb_ctx' */
+struct bdb_cache {
+ struct list_head c_list; /* list of 'struct bdb_cache' under a 'struct bdb_ctx' */
+ char *key; /* name of the file this cache holds. NOTE: basename of file */
+ char *data; /* file content */
+ size_t size; /* size of the file content that this cache holds */
+};
+
+
+struct bdb_private { /* private data of the bdb xlator; the MAKE_REAL_PATH*
+ * macros reach it through this->private */
+ inode_table_t *itable; /* pointer to inode table that we use */
+ int32_t temp; /* NOTE(review): purpose not documented - confirm use */
+ char is_stateless; /* NOTE(review): purpose not documented - confirm use */
+ char *export_path; /* path to the export directory
+ * (option directory <export-path>) */
+ int32_t export_path_length; /* length of 'export_path' string; MAKE_REAL_PATH
+ * copies the relative path at this offset */
+
+ /* statistics */
+ struct xlator_stats stats; /* Statistics, provides activity of the server */
+
+ struct timeval prev_fetch_time;
+ struct timeval init_time;
+ int32_t max_read; /* derived from 'interval_read', see below */
+ int32_t max_write; /* derived from 'interval_write', see below */
+ int64_t interval_read; /* Used to calculate the max_read value */
+ int64_t interval_write; /* Used to calculate the max_write value */
+ int64_t read_value; /* Total read, from init */
+ int64_t write_value; /* Total write, from init */
+
+ /* bdb xlator specific private data */
+ uint64_t envflags; /* flags used for opening DB_ENV for this xlator */
+ uint64_t dbflags; /* flags to be used for opening each database */
+ uint64_t cache; /* cache: can be either ON or OFF */
+ uint32_t transaction; /* transaction: can be either ON or OFF */
+ uint32_t active; /* NOTE(review): presumably a counter guarded by
+ * 'active_lock' - confirm */
+ gf_lock_t active_lock;
+ struct bctx_table *b_table; /* table of 'struct bdb_ctx' used by this xlator */
+ DBTYPE access_mode; /* access mode for accessing the databases,
+ * can be DB_HASH, DB_BTREE
+ * (option access-mode <mode>) */
+ mode_t file_mode; /* mode for each and every file stored on bdb
+ * (option file-mode <mode>) */
+ mode_t dir_mode; /* mode for each and every directory stored on bdb
+ * (option dir-mode <mode>) */
+ mode_t symlink_mode; /* mode for each and every symlink stored on bdb */
+ pthread_t checkpoint_thread; /* pthread_t object used for creating checkpoint
+ * thread */
+ int32_t checkpoint_timeout; /* time duration between two consecutive checkpoint
+ * operations.
+ * (option checkpoint-timeout <time-in-seconds>) */
+ ino_t next_ino; /* inode number allocation counter */
+ gf_lock_t ino_lock; /* lock to protect 'next_ino' */
+ char *logdir; /* environment log directory
+ * (option logdir <directory>) */
+ char *errfile; /* errfile path, used by environment to
+ * print detailed error log.
+ * (option errfile <errfile-path>) */
+ FILE *errfp; /* DB_ENV->set_errfile() expects us to fopen
+ * the errfile before doing DB_ENV->set_errfile() */
+ uint32_t txn_timeout; /* passed to DB_ENV->set_timeout: how long a
+ * transactionally encapsulated DB->operation() may
+ * wait for locks to be released before it times out.
+ * (option transaction-timeout <time-in-milliseconds>)
+ */
+ uint32_t lock_timeout; /* NOTE(review): presumably 'option lock-timeout';
+ * unit not documented here - confirm */
+ uint32_t log_auto_remove; /* DB_AUTO_LOG_REMOVE flag for DB_ENV */
+ uint32_t log_region_max;
+};
+
+
+/* bdb_txn_begin - start a transaction in @dbenv.
+ *
+ * @dbenv:  environment in which the transaction is started
+ * @ptxnid: receives the handle of the new transaction
+ *
+ * The transaction has no parent (NULL) and is started with flags 0.
+ * Returns the value of DB_ENV->txn_begin() unchanged.
+ */
+static inline int32_t
+bdb_txn_begin (DB_ENV *dbenv, DB_TXN **ptxnid)
+{
+        int32_t status = dbenv->txn_begin (dbenv, NULL, ptxnid, 0);
+
+        return status;
+}
+
+/* bdb_txn_abort - abort the transaction @txnid.
+ *
+ * Returns the value of DB_TXN->abort() unchanged.
+ */
+static inline int32_t
+bdb_txn_abort (DB_TXN *txnid)
+{
+        int32_t status = txnid->abort (txnid);
+
+        return status;
+}
+
+/* bdb_txn_commit - commit the transaction @txnid with flags 0.
+ *
+ * Returns the value of DB_TXN->commit() unchanged.
+ */
+static inline int32_t
+bdb_txn_commit (DB_TXN *txnid)
+{
+        int32_t status = txnid->commit (txnid, 0);
+
+        return status;
+}
+
+/* bdb_extract_bfd - return the pointer kept for xlator @this in the ctx
+ * of @fd. NOTE(review): presumably the value stored by BDB_SET_BFD;
+ * the definition is not in this header - confirm.
+ *
+ * Declared without 'inline': this header carries no definition, and C99
+ * requires a function declared 'inline' with external linkage to be
+ * defined in the same translation unit. A plain declaration is valid in
+ * every file that includes this header and still matches the definition.
+ */
+void *
+bdb_extract_bfd (fd_t *fd, xlator_t *this);
+
+
+/* Record-level access to the database of a directory context. All of these
+ * take the context and a transaction handle; only the declarations are
+ * visible in this header. */
+void *bdb_db_stat (bctx_t *bctx, DB_TXN *txnid, uint32_t flags);
+
+int32_t bdb_db_get (struct bdb_ctx *bctx, DB_TXN *txnid,
+                    const char *key_string, char **buf,
+                    size_t size, off_t offset);
+
+/* NOTE(review): presumably a value for the 'flags' argument of
+ * bdb_db_put() - confirm against its definition */
+#define BDB_TRUNCATE_RECORD 0xcafebabe
+
+int32_t bdb_db_put (struct bdb_ctx *bctx, DB_TXN *txnid,
+                    const char *key_string, const char *buf,
+                    size_t size, off_t offset, int32_t flags);
+
+int32_t bdb_db_del (struct bdb_ctx *bctx, DB_TXN *txnid, const char *path);
+
+/* Takes the parent's inode number and a directory context and yields an
+ * inode number; the definition lives outside this header. */
+ino_t bdb_inode_transform (ino_t parent, struct bdb_ctx *bctx);
+
+
+/* Cursor helpers: open a cursor on a context's database, fetch key/value
+ * pairs through it, and close it again. */
+int32_t bdb_cursor_open (struct bdb_ctx *bctx, DBC **cursorp);
+
+int32_t bdb_cursor_get (DBC *cursorp, DBT *key, DBT *value, int32_t flags);
+
+int32_t bdb_cursor_close (struct bdb_ctx *ctx, DBC *cursorp);
+
+
+/* Size computations for directory entries: one variant takes a database
+ * key, the other a 'struct dirent'. */
+int32_t bdb_dirent_size (DBT *key);
+
+int32_t dirent_size (struct dirent *entry);
+
+/* Database set-up for xlator @this from its @options dictionary. */
+int bdb_db_init (xlator_t *this, dict_t *options);
+
+/* Takes a dictionary, one key/value pair and an opaque data pointer.
+ * NOTE(review): the signature suggests a per-entry dictionary callback -
+ * confirm at the call site. */
+void bdb_dbs_from_dict_close (dict_t *this, char *key,
+                              data_t *value, void *data);
+
+/* 'struct bdb_ctx' management. bctx_lookup and bctx_parent resolve a path
+ * within a table; bctx_ref/bctx_unref and bctx_rename operate on an
+ * existing context. Each returns a context pointer. */
+bctx_t *bctx_lookup (struct bctx_table *table, const char *path);
+
+bctx_t *bctx_parent (struct bctx_table *table, const char *path);
+
+bctx_t *bctx_unref (bctx_t *ctx);
+
+bctx_t *bctx_ref (bctx_t *ctx);
+
+bctx_t *bctx_rename (bctx_t *bctx, const char *db_newpath);
+
+/* Takes a table plus a temporary and a real database path; the definition
+ * lives outside this header. */
+int32_t bdb_db_rename (bctx_table_t *table,
+                       const char *tmp_db_newpath,
+                       const char *real_db_newpath);
+#endif /* _BDB_H */