summaryrefslogtreecommitdiffstats
path: root/xlators/storage/bd/src/bd.h
diff options
context:
space:
mode:
authorM. Mohan Kumar <mohan@in.ibm.com>2013-11-13 22:44:42 +0530
committerAnand Avati <avati@redhat.com>2013-11-13 11:38:42 -0800
commit48c40e1a42efe1b59126406084821947d139dd0e (patch)
tree74959ecda9b9bd56c85e0e32991c11c06b022296 /xlators/storage/bd/src/bd.h
parent15a8ecd9b3eedf80881bd3dba81f16b7d2cb7c97 (diff)
bd: posix/multi-brick support to BD xlator
Current BD xlator (block backend) has a few limitations such as * Creation of directories not supported * Supports only single brick * Does not use extended attributes (and client gfid) like posix xlator * Creation of special files (symbolic links, device nodes etc) not supported Basic limitation of not allowing directory creation is blocking oVirt/VDSM to consume BD xlator as part of Gluster domain since VDSM creates multi-level directories when GlusterFS is used as storage backend for storing VM images. To overcome these limitations a new BD xlator with following improvements is suggested. * New hybrid BD xlator that handles both regular files and block device files * The volume will have both POSIX and BD bricks. Regular files are created on POSIX bricks, block devices are created on the BD brick (VG) * BD xlator leverages exiting POSIX xlator for most POSIX calls and hence sits above the POSIX xlator * Block device file is differentiated from regular file by an extended attribute * The xattr 'user.glusterfs.bd' (BD_XATTR) plays a role in mapping a posix file to Logical Volume (LV). * When a client sends a request to set BD_XATTR on a posix file, a new LV is created and mapped to posix file. So every block device will have a representative file in POSIX brick with 'user.glusterfs.bd' (BD_XATTR) set. * Here after all operations on this file results in LV related operations. For example opening a file that has BD_XATTR set results in opening the LV block device, reading results in reading the corresponding LV block device. When BD xlator gets request to set BD_XATTR via setxattr call, it creates a LV and information about this LV is placed in the xattr of the posix file. xattr "user.glusterfs.bd" used to identify that posix file is mapped to BD. Usage: Server side: [root@host1 ~]# gluster volume create bdvol host1:/storage/vg1_info?vg1 host2:/storage/vg2_info?vg2 It creates a distributed gluster volume 'bdvol' with Volume Group vg1 using posix brick /storage/vg1_info in host1 and Volume Group vg2 using /storage/vg2_info in host2. [root@host1 ~]# gluster volume start bdvol Client side: [root@node ~]# mount -t glusterfs host1:/bdvol /media [root@node ~]# touch /media/posix It creates regular posix file 'posix' in either host1:/vg1 or host2:/vg2 brick [root@node ~]# mkdir /media/image [root@node ~]# touch /media/image/lv1 It also creates regular posix file 'lv1' in either host1:/vg1 or host2:/vg2 brick [root@node ~]# setfattr -n "user.glusterfs.bd" -v "lv" /media/image/lv1 [root@node ~]# Above setxattr results in creating a new LV in corresponding brick's VG and it sets 'user.glusterfs.bd' with value 'lv:<default-extent-size' [root@node ~]# truncate -s5G /media/image/lv1 It results in resizig LV 'lv1'to 5G New BD xlator code is placed in xlators/storage/bd directory. Also add volume-uuid to the VG so that same VG can't be used for other bricks/volumes. After deleting a gluster volume, one has to manually remove the associated tag using vgchange <vg-name> --deltag <trusted.glusterfs.volume-id:<volume-id>> Changes from previous version V5: * Removed support for delayed deleting of LVs Changes from previous version V4: * Consolidated the patches * Removed usage of BD_XATTR_SIZE and consolidated it in BD_XATTR. Changes from previous version V3: * Added support in FUSE to support full/linked clone * Added support to merge snapshots and provide information about origin * bd_map xlator removed * iatt structure used in inode_ctx. iatt is cached and updated during fsync/flush * aio support * Type and capabilities of volume are exported through getxattr Changes from version 2: * Used inode_context for caching BD size and to check if loc/fd is BD or not. * Added GlusterFS server offloaded copy and snapshot through setfattr FOP. As part of this libgfapi is modified. * BD xlator supports stripe * During unlinking if a LV file is already opened, its added to delete list and bd_del_thread tries to delete from this list when a last reference to that file is closed. Changes from previous version: * gfid is used as name of LV * ? is used to specify VG name for creating BD volume in volume create, add-brick. gluster volume create volname host:/path?vg * open-behind issue is fixed * A replicate brick can be added dynamically and LVs from source brick are replicated to destination brick * A distribute brick can be added dynamically and rebalance operation distributes existing LVs/files to the new brick * Thin provisioning support added. * bd_map xlator support retained * setfattr -n user.glusterfs.bd -v "lv" creates a regular LV and setfattr -n user.glusterfs.bd -v "thin" creates thin LV * Capability and backend information added to gluster volume info (and --xml) so that management tools can exploit BD xlator. * tracing support for bd xlator added TODO: * Add support to display snapshots for a given LV * Display posix filename for list-origin instead of gfid Change-Id: I00d32dfbab3b7c806e0841515c86c3aa519332f2 BUG: 1028672 Signed-off-by: M. Mohan Kumar <mohan@in.ibm.com> Reviewed-on: http://review.gluster.org/4809 Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Anand Avati <avati@redhat.com>
Diffstat (limited to 'xlators/storage/bd/src/bd.h')
-rw-r--r--xlators/storage/bd/src/bd.h140
1 files changed, 140 insertions, 0 deletions
diff --git a/xlators/storage/bd/src/bd.h b/xlators/storage/bd/src/bd.h
new file mode 100644
index 00000000000..4d8b8954524
--- /dev/null
+++ b/xlators/storage/bd/src/bd.h
@@ -0,0 +1,140 @@
+/*
+ BD translator - Exports Block devices on server side as regular
+ files to client
+
+ Copyright IBM, Corp. 2012
+
+ This file is part of GlusterFS.
+
+ Author:
+ M. Mohan Kumar <mohan@in.ibm.com>
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _BD_H
+#define _BD_H
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "xlator.h"
+#include "mem-types.h"
+
+#define BD_XLATOR "block device mapper xlator"
+#define BACKEND_VG "vg"
+#define GF_XATTR "user.glusterfs"
+#define BD_XATTR GF_XATTR ".bd"
+
+#define BD_LV "lv"
+#define BD_THIN "thin"
+
+#define LVM_RESIZE "/sbin/lvresize"
+#define LVM_CREATE "/sbin/lvcreate"
+
+#define VOL_TYPE "volume.type"
+#define VOL_CAPS "volume.caps"
+
+#define ALIGN_SIZE 4096
+
+#define BD_CAPS_BD 0x01
+#define BD_CAPS_THIN 0x02
+
+#define BD_VALIDATE_MEM_ALLOC(buff, op_errno, label) \
+ if (!buff) { \
+ op_errno = ENOMEM; \
+ gf_log (this->name, GF_LOG_ERROR, "out of memory"); \
+ goto label; \
+ }
+
+#define BD_VALIDATE_LOCAL_OR_GOTO(local, op_errno, label) \
+ if (!local) { \
+ op_errno = EINVAL; \
+ goto label; \
+ }
+
+#define BD_STACK_UNWIND(typ, frame, args ...) do { \
+ bd_local_t *__local = frame->local; \
+ xlator_t *__this = frame->this; \
+ \
+ frame->local = NULL; \
+ STACK_UNWIND_STRICT (typ, frame, args); \
+ if (__local) \
+ bd_local_free (__this, __local); \
+ } while (0)
+
+typedef char bd_gfid_t[GF_UUID_BUF_SIZE];
+
+enum gf_bd_mem_types_ {
+ gf_bd_private = gf_common_mt_end + 1,
+ gf_bd_attr,
+ gf_bd_fd,
+ gf_bd_mt_end
+};
+
+/**
+ * bd_fd - internal structure
+ */
+typedef struct bd_fd {
+ int fd;
+ int32_t flag;
+} bd_fd_t;
+
+typedef struct bd_priv {
+ lvm_t handle;
+ char *vg;
+ char *pool;
+ int caps;
+} bd_priv_t;
+
+
+typedef enum bd_type {
+ BD_TYPE_NONE,
+ BD_TYPE_LV,
+} bd_type_t;
+
+typedef struct {
+ struct iatt iatt;
+ char *type;
+} bd_attr_t;
+
+typedef struct {
+ dict_t *dict;
+ bd_attr_t *bdatt;
+ inode_t *inode;
+ loc_t loc;
+ fd_t *fd;
+ data_t *data; /* for setxattr */
+} bd_local_t;
+
+typedef struct {
+ char *lv;
+ struct list_head list;
+} bd_del_entry;
+
+/* Prototypes */
+int bd_inode_ctx_set (inode_t *inode, xlator_t *this, bd_attr_t *ctx);
+int bd_inode_ctx_get (inode_t *inode, xlator_t *this, bd_attr_t **ctx);
+int bd_scan_vg (xlator_t *this, bd_priv_t *priv);
+bd_local_t *bd_local_init (call_frame_t *frame, xlator_t *this);
+void bd_local_free (xlator_t *this, bd_local_t *local);
+int bd_fd_ctx_get (xlator_t *this, fd_t *fd, bd_fd_t **bdfd);
+char *page_aligned_alloc (size_t size, char **aligned_buf);
+int bd_validate_bd_xattr (xlator_t *this, char *bd, char **type,
+ uint64_t *lv_size, uuid_t uuid);
+uint64_t bd_get_default_extent (bd_priv_t *priv);
+uint64_t bd_adjust_size (bd_priv_t *priv, uint64_t size);
+int bd_create (uuid_t uuid, uint64_t size, char *type, bd_priv_t *priv);
+int bd_resize (bd_priv_t *priv, uuid_t uuid, off_t size);
+int bd_delete_lv (bd_priv_t *priv, const char *lv_name, int *op_errno);
+int bd_snapshot_create (bd_local_t *local, bd_priv_t *priv);
+int bd_clone (bd_local_t *local, bd_priv_t *priv);
+
+int bd_merge (bd_priv_t *priv, uuid_t gfid);
+int bd_get_origin (bd_priv_t *priv, loc_t *loc, fd_t *fd, dict_t *dict);
+#endif