From ef171ff2bfd114e46442441fbdeb692a416cc951 Mon Sep 17 00:00:00 2001 From: Jeff Darcy Date: Wed, 11 Dec 2013 16:26:25 -0500 Subject: Roll-up patch for NSR so far. Previous history: https://forge.gluster.org/~jdarcy/glusterfs-core/glusterfs-nsr Change-Id: I2b56328788753c6a74d9589815f2dd705ac9ce6a Signed-off-by: Jeff Darcy --- api/src/glfs-fops.c | 365 +- api/src/glfs-handleops.c | 193 +- api/src/glfs-handles.h | 37 +- api/src/glfs.h | 86 +- configure.ac | 6 + glusterfs.spec.in | 4 + libglusterfs/src/call-stub.c | 2 +- libglusterfs/src/call-stub.h | 6 + libglusterfs/src/glusterfs.h | 2 +- libglusterfs/src/list.h | 14 + libglusterfs/src/syncop.c | 233 +- libglusterfs/src/syncop.h | 32 + tests/basic/nsr.t | 47 + xlators/cluster/Makefile.am | 2 +- xlators/cluster/nsr-client/Makefile.am | 3 + xlators/cluster/nsr-client/src/Makefile.am | 33 + xlators/cluster/nsr-client/src/fop-template.c | 113 + xlators/cluster/nsr-client/src/gen-fops.py | 57 + xlators/cluster/nsr-client/src/nsrc.c | 194 + xlators/cluster/nsr-recon/Makefile.am | 3 + xlators/cluster/nsr-recon/src/Makefile.am | 22 + xlators/cluster/nsr-recon/src/recon_driver.c | 2624 ++++++++++++ xlators/cluster/nsr-recon/src/recon_driver.h | 308 ++ xlators/cluster/nsr-recon/src/recon_xlator.c | 837 ++++ xlators/cluster/nsr-recon/src/recon_xlator.h | 78 + xlators/cluster/nsr-server/Makefile.am | 3 + xlators/cluster/nsr-server/src/Makefile.am | 36 + xlators/cluster/nsr-server/src/all-templates.c | 299 ++ xlators/cluster/nsr-server/src/codegen.py | 174 + xlators/cluster/nsr-server/src/codegen.pyc | Bin 0 -> 4915 bytes xlators/cluster/nsr-server/src/etcd-api.c | 586 +++ xlators/cluster/nsr-server/src/etcd-api.h | 176 + xlators/cluster/nsr-server/src/gen-fops.py | 123 + xlators/cluster/nsr-server/src/leader.c | 420 ++ xlators/cluster/nsr-server/src/nsr-cg.c | 4444 ++++++++++++++++++++ xlators/cluster/nsr-server/src/nsr-internal.h | 81 + xlators/cluster/nsr-server/src/nsr.c | 682 +++ xlators/cluster/nsr-server/src/stub_etcd.c 
| 129 + xlators/cluster/nsr-server/src/yajl.c | 175 + xlators/cluster/nsr-server/src/yajl/yajl_common.h | 75 + xlators/cluster/nsr-server/src/yajl/yajl_gen.h | 157 + xlators/cluster/nsr-server/src/yajl/yajl_parse.h | 226 + xlators/cluster/nsr-server/src/yajl/yajl_tree.h | 177 + xlators/cluster/nsr-server/src/yajl/yajl_version.h | 23 + xlators/cluster/nsr-server/src/yajl_alloc.c | 49 + xlators/cluster/nsr-server/src/yajl_alloc.h | 34 + xlators/cluster/nsr-server/src/yajl_buf.c | 103 + xlators/cluster/nsr-server/src/yajl_buf.h | 57 + xlators/cluster/nsr-server/src/yajl_bytestack.h | 69 + xlators/cluster/nsr-server/src/yajl_encode.c | 220 + xlators/cluster/nsr-server/src/yajl_encode.h | 34 + xlators/cluster/nsr-server/src/yajl_gen.c | 350 ++ xlators/cluster/nsr-server/src/yajl_lex.c | 763 ++++ xlators/cluster/nsr-server/src/yajl_lex.h | 117 + xlators/cluster/nsr-server/src/yajl_parser.c | 492 +++ xlators/cluster/nsr-server/src/yajl_parser.h | 78 + xlators/cluster/nsr-server/src/yajl_tree.c | 501 +++ xlators/cluster/nsr-server/src/yajl_version.c | 7 + .../changelog/lib/src/gf-changelog-helpers.h | 1 + xlators/features/changelog/src/Makefile.am | 8 +- .../changelog/src/changelog-default-fops.c | 561 +++ .../features/changelog/src/changelog-encoders.c | 99 +- .../features/changelog/src/changelog-encoders.h | 10 +- xlators/features/changelog/src/changelog-fops.h | 157 + xlators/features/changelog/src/changelog-helpers.c | 208 +- xlators/features/changelog/src/changelog-helpers.h | 246 +- .../features/changelog/src/changelog-mem-types.h | 9 +- xlators/features/changelog/src/changelog-misc.h | 8 +- xlators/features/changelog/src/changelog-rt.c | 9 +- xlators/features/changelog/src/changelog-rt.h | 5 +- xlators/features/changelog/src/changelog.c | 428 +- .../src/policy/changelog-policy-default.c | 44 + .../src/policy/changelog-policy-replication.c | 1184 ++++++ .../changelog/src/policy/changelog-policy.h | 41 + xlators/mgmt/glusterd/src/Makefile.am | 5 +- 
xlators/mgmt/glusterd/src/glusterd-etcd.c | 86 + xlators/mgmt/glusterd/src/glusterd-etcd.h | 23 + xlators/mgmt/glusterd/src/glusterd-handler.c | 12 + xlators/mgmt/glusterd/src/glusterd-sm.c | 16 + xlators/mgmt/glusterd/src/glusterd-utils.c | 3 + xlators/mgmt/glusterd/src/glusterd-volgen.c | 338 +- xlators/mgmt/glusterd/src/glusterd-volgen.h | 4 + xlators/mgmt/glusterd/src/glusterd-volume-set.c | 13 + xlators/mgmt/glusterd/src/glusterd.c | 16 + xlators/mgmt/glusterd/src/glusterd.h | 20 + 85 files changed, 19036 insertions(+), 679 deletions(-) create mode 100644 tests/basic/nsr.t create mode 100644 xlators/cluster/nsr-client/Makefile.am create mode 100644 xlators/cluster/nsr-client/src/Makefile.am create mode 100644 xlators/cluster/nsr-client/src/fop-template.c create mode 100644 xlators/cluster/nsr-client/src/gen-fops.py create mode 100644 xlators/cluster/nsr-client/src/nsrc.c create mode 100644 xlators/cluster/nsr-recon/Makefile.am create mode 100644 xlators/cluster/nsr-recon/src/Makefile.am create mode 100644 xlators/cluster/nsr-recon/src/recon_driver.c create mode 100644 xlators/cluster/nsr-recon/src/recon_driver.h create mode 100644 xlators/cluster/nsr-recon/src/recon_xlator.c create mode 100644 xlators/cluster/nsr-recon/src/recon_xlator.h create mode 100644 xlators/cluster/nsr-server/Makefile.am create mode 100644 xlators/cluster/nsr-server/src/Makefile.am create mode 100644 xlators/cluster/nsr-server/src/all-templates.c create mode 100644 xlators/cluster/nsr-server/src/codegen.py create mode 100644 xlators/cluster/nsr-server/src/codegen.pyc create mode 100644 xlators/cluster/nsr-server/src/etcd-api.c create mode 100644 xlators/cluster/nsr-server/src/etcd-api.h create mode 100644 xlators/cluster/nsr-server/src/gen-fops.py create mode 100644 xlators/cluster/nsr-server/src/leader.c create mode 100644 xlators/cluster/nsr-server/src/nsr-cg.c create mode 100644 xlators/cluster/nsr-server/src/nsr-internal.h create mode 100644 xlators/cluster/nsr-server/src/nsr.c 
create mode 100644 xlators/cluster/nsr-server/src/stub_etcd.c create mode 100644 xlators/cluster/nsr-server/src/yajl.c create mode 100644 xlators/cluster/nsr-server/src/yajl/yajl_common.h create mode 100644 xlators/cluster/nsr-server/src/yajl/yajl_gen.h create mode 100644 xlators/cluster/nsr-server/src/yajl/yajl_parse.h create mode 100644 xlators/cluster/nsr-server/src/yajl/yajl_tree.h create mode 100644 xlators/cluster/nsr-server/src/yajl/yajl_version.h create mode 100644 xlators/cluster/nsr-server/src/yajl_alloc.c create mode 100644 xlators/cluster/nsr-server/src/yajl_alloc.h create mode 100644 xlators/cluster/nsr-server/src/yajl_buf.c create mode 100644 xlators/cluster/nsr-server/src/yajl_buf.h create mode 100644 xlators/cluster/nsr-server/src/yajl_bytestack.h create mode 100644 xlators/cluster/nsr-server/src/yajl_encode.c create mode 100644 xlators/cluster/nsr-server/src/yajl_encode.h create mode 100644 xlators/cluster/nsr-server/src/yajl_gen.c create mode 100644 xlators/cluster/nsr-server/src/yajl_lex.c create mode 100644 xlators/cluster/nsr-server/src/yajl_lex.h create mode 100644 xlators/cluster/nsr-server/src/yajl_parser.c create mode 100644 xlators/cluster/nsr-server/src/yajl_parser.h create mode 100644 xlators/cluster/nsr-server/src/yajl_tree.c create mode 100644 xlators/cluster/nsr-server/src/yajl_version.c create mode 100644 xlators/features/changelog/src/changelog-default-fops.c create mode 100644 xlators/features/changelog/src/changelog-fops.h create mode 100644 xlators/features/changelog/src/policy/changelog-policy-default.c create mode 100644 xlators/features/changelog/src/policy/changelog-policy-replication.c create mode 100644 xlators/features/changelog/src/policy/changelog-policy.h create mode 100644 xlators/mgmt/glusterd/src/glusterd-etcd.c create mode 100644 xlators/mgmt/glusterd/src/glusterd-etcd.h diff --git a/api/src/glfs-fops.c b/api/src/glfs-fops.c index f3ac335fb..8d905193a 100644 --- a/api/src/glfs-fops.c +++ b/api/src/glfs-fops.c @@ 
-145,7 +145,7 @@ out: int -glfs_close (struct glfs_fd *glfd) +glfs_close_with_xdata (struct glfs_fd *glfd, dict_t *dict) { xlator_t *subvol = NULL; int ret = -1; @@ -168,7 +168,7 @@ glfs_close (struct glfs_fd *glfd) goto out; } - ret = syncop_flush (subvol, fd); + ret = syncop_flush_with_xdata (subvol, fd, dict); out: fs = glfd->fs; glfs_fd_destroy (glfd); @@ -181,6 +181,11 @@ out: return ret; } +int +glfs_close (struct glfs_fd *glfd) +{ + return(glfs_close_with_xdata(glfd, NULL)); +} int glfs_lstat (struct glfs *fs, const char *path, struct stat *stat) @@ -249,7 +254,7 @@ out: int -glfs_fstat (struct glfs_fd *glfd, struct stat *stat) +glfs_fstat_with_xdata (struct glfs_fd *glfd, struct stat *stat, dict_t *dict) { int ret = -1; xlator_t *subvol = NULL; @@ -272,7 +277,7 @@ glfs_fstat (struct glfs_fd *glfd, struct stat *stat) goto out; } - ret = syncop_fstat (subvol, fd, &iatt); + ret = syncop_fstat_with_xdata (subvol, fd, &iatt, dict); if (ret == 0 && stat) glfs_iatt_to_stat (glfd->fs, &iatt, stat); @@ -285,17 +290,21 @@ out: return ret; } +int +glfs_fstat (struct glfs_fd *glfd, struct stat *stat) +{ + return(glfs_fstat_with_xdata(glfd, stat, NULL)); +} + struct glfs_fd * -glfs_creat (struct glfs *fs, const char *path, int flags, mode_t mode) +glfs_creat_with_xdata (struct glfs *fs, const char *path, int flags, mode_t mode, uuid_t gfid, dict_t *xattr_req) { int ret = -1; struct glfs_fd *glfd = NULL; xlator_t *subvol = NULL; loc_t loc = {0, }; struct iatt iatt = {0, }; - uuid_t gfid; - dict_t *xattr_req = NULL; int reval = 0; __glfs_entry_fs (fs); @@ -307,14 +316,6 @@ glfs_creat (struct glfs *fs, const char *path, int flags, mode_t mode) goto out; } - xattr_req = dict_new (); - if (!xattr_req) { - ret = -1; - errno = ENOMEM; - goto out; - } - - uuid_generate (gfid); ret = dict_set_static_bin (xattr_req, "gfid-req", gfid, 16); if (ret) { ret = -1; @@ -404,8 +405,6 @@ retry: out: loc_wipe (&loc); - if (xattr_req) - dict_unref (xattr_req); if (ret && glfd) { 
glfs_fd_destroy (glfd); @@ -421,9 +420,28 @@ out: return glfd; } +struct glfs_fd * +glfs_creat (struct glfs *fs, const char *path, int flags, mode_t mode) +{ + dict_t *xattr_req = NULL; + uuid_t gfid; + struct glfs_fd *fd = NULL; + + + xattr_req = dict_new (); + if (!xattr_req) { + errno = ENOMEM; + return NULL; + } + uuid_generate (gfid); + fd = glfs_creat_with_xdata (fs, path, flags, mode, gfid, xattr_req); + if (xattr_req) + dict_unref (xattr_req); + return (fd); +} off_t -glfs_lseek (struct glfs_fd *glfd, off_t offset, int whence) +glfs_lseek_with_xdata (struct glfs_fd *glfd, off_t offset, int whence, dict_t *dict) { struct stat sb = {0, }; int ret = -1; @@ -438,7 +456,7 @@ glfs_lseek (struct glfs_fd *glfd, off_t offset, int whence) glfd->offset += offset; break; case SEEK_END: - ret = glfs_fstat (glfd, &sb); + ret = glfs_fstat_with_xdata (glfd, &sb, dict); if (ret) { /* seek cannot fail :O */ break; @@ -450,12 +468,17 @@ glfs_lseek (struct glfs_fd *glfd, off_t offset, int whence) return glfd->offset; } +off_t +glfs_lseek (struct glfs_fd *glfd, off_t offset, int whence) +{ + return(glfs_lseek_with_xdata(glfd, offset, whence, NULL)); +} ////////////// ssize_t -glfs_preadv (struct glfs_fd *glfd, const struct iovec *iovec, int iovcnt, - off_t offset, int flags) +glfs_preadv_with_xdata (struct glfs_fd *glfd, const struct iovec *iovec, int iovcnt, + off_t offset, int flags, dict_t *dict) { xlator_t *subvol = NULL; ssize_t ret = -1; @@ -483,7 +506,7 @@ glfs_preadv (struct glfs_fd *glfd, const struct iovec *iovec, int iovcnt, size = iov_length (iovec, iovcnt); - ret = syncop_readv (subvol, fd, size, offset, 0, &iov, &cnt, &iobref); + ret = syncop_readv_with_xdata (subvol, fd, size, offset, 0, &iov, &cnt, &iobref, dict); if (ret <= 0) goto out; @@ -506,6 +529,12 @@ out: return ret; } +ssize_t +glfs_preadv (struct glfs_fd *glfd, const struct iovec *iovec, int iovcnt, + off_t offset, int flags) +{ + return(glfs_preadv_with_xdata(glfd, iovec, iovcnt, offset, flags, 
NULL)); +} ssize_t glfs_read (struct glfs_fd *glfd, void *buf, size_t count, int flags) @@ -521,6 +550,19 @@ glfs_read (struct glfs_fd *glfd, void *buf, size_t count, int flags) return ret; } +ssize_t +glfs_read_with_xdata (struct glfs_fd *glfd, void *buf, size_t count, int flags, dict_t *dict) +{ + struct iovec iov = {0, }; + ssize_t ret = 0; + + iov.iov_base = buf; + iov.iov_len = count; + + ret = glfs_preadv_with_xdata (glfd, &iov, 1, glfd->offset, flags, dict); + + return ret; +} ssize_t glfs_pread (struct glfs_fd *glfd, void *buf, size_t count, off_t offset, @@ -772,6 +814,12 @@ glfs_readv_async (struct glfs_fd *glfd, const struct iovec *iov, int count, ssize_t glfs_pwritev (struct glfs_fd *glfd, const struct iovec *iovec, int iovcnt, off_t offset, int flags) +{ + return(glfs_pwritev_with_xdata(glfd, iovec, iovcnt, offset, flags, NULL)); +} +ssize_t +glfs_pwritev_with_xdata (struct glfs_fd *glfd, const struct iovec *iovec, int iovcnt, + off_t offset, int flags, dict_t *dict) { xlator_t *subvol = NULL; int ret = -1; @@ -828,7 +876,7 @@ glfs_pwritev (struct glfs_fd *glfd, const struct iovec *iovec, int iovcnt, iov.iov_base = iobuf_ptr (iobuf); iov.iov_len = size; - ret = syncop_writev (subvol, fd, &iov, 1, offset, iobref, flags); + ret = syncop_writev_with_xdata (subvol, fd, &iov, 1, offset, iobref, flags, dict); iobuf_unref (iobuf); iobref_unref (iobref); @@ -862,6 +910,20 @@ glfs_write (struct glfs_fd *glfd, const void *buf, size_t count, int flags) return ret; } +ssize_t +glfs_write_with_xdata (struct glfs_fd *glfd, const void *buf, size_t count, int flags, dict_t *dict) +{ + struct iovec iov = {0, }; + ssize_t ret = 0; + + iov.iov_base = (void *) buf; + iov.iov_len = count; + + ret = glfs_pwritev_with_xdata (glfd, &iov, 1, glfd->offset, flags, dict); + + return ret; +} + ssize_t @@ -875,6 +937,16 @@ glfs_writev (struct glfs_fd *glfd, const struct iovec *iov, int count, return ret; } +ssize_t +glfs_writev_with_xdata (struct glfs_fd *glfd, const struct iovec 
*iov, int count, + int flags, dict_t *dict) +{ + ssize_t ret = 0; + + ret = glfs_pwritev_with_xdata (glfd, iov, count, glfd->offset, flags, dict); + + return ret; +} ssize_t glfs_pwrite (struct glfs_fd *glfd, const void *buf, size_t count, off_t offset, @@ -978,7 +1050,7 @@ glfs_writev_async (struct glfs_fd *glfd, const struct iovec *iov, int count, int -glfs_fsync (struct glfs_fd *glfd) +glfs_fsync_with_xdata (struct glfs_fd *glfd, dict_t *dict) { int ret = -1; xlator_t *subvol = NULL; @@ -1000,7 +1072,7 @@ glfs_fsync (struct glfs_fd *glfd) goto out; } - ret = syncop_fsync (subvol, fd, 0); + ret = syncop_fsync_with_xdata (subvol, fd, 0, dict); out: if (fd) fd_unref (fd); @@ -1010,6 +1082,11 @@ out: return ret; } +int +glfs_fsync (struct glfs_fd *glfd) +{ + return(glfs_fsync_with_xdata(glfd, NULL)); +} static int glfs_fsync_async_common (struct glfs_fd *glfd, glfs_io_cbk fn, void *data, @@ -1093,7 +1170,7 @@ glfs_fdatasync_async (struct glfs_fd *glfd, glfs_io_cbk fn, void *data) int -glfs_ftruncate (struct glfs_fd *glfd, off_t offset) +glfs_ftruncate_with_xdata (struct glfs_fd *glfd, off_t offset, dict_t *dict) { int ret = -1; xlator_t *subvol = NULL; @@ -1115,7 +1192,7 @@ glfs_ftruncate (struct glfs_fd *glfd, off_t offset) goto out; } - ret = syncop_ftruncate (subvol, fd, offset); + ret = syncop_ftruncate_with_xdata (subvol, fd, offset, dict); out: if (fd) fd_unref (fd); @@ -1125,6 +1202,11 @@ out: return ret; } +int +glfs_ftruncate (struct glfs_fd *glfd, off_t offset) +{ + return(glfs_ftruncate_with_xdata(glfd, offset, NULL)); +} int glfs_ftruncate_async (struct glfs_fd *glfd, off_t offset, @@ -1196,14 +1278,12 @@ out: int -glfs_symlink (struct glfs *fs, const char *data, const char *path) +glfs_symlink_with_xdata (struct glfs *fs, const char *data, const char *path, uuid_t gfid, dict_t *xattr_req) { int ret = -1; xlator_t *subvol = NULL; loc_t loc = {0, }; struct iatt iatt = {0, }; - uuid_t gfid; - dict_t *xattr_req = NULL; int reval = 0; __glfs_entry_fs (fs); 
@@ -1215,14 +1295,6 @@ glfs_symlink (struct glfs *fs, const char *data, const char *path) goto out; } - xattr_req = dict_new (); - if (!xattr_req) { - ret = -1; - errno = ENOMEM; - goto out; - } - - uuid_generate (gfid); ret = dict_set_static_bin (xattr_req, "gfid-req", gfid, 16); if (ret) { ret = -1; @@ -1267,14 +1339,30 @@ retry: out: loc_wipe (&loc); - if (xattr_req) - dict_unref (xattr_req); - glfs_subvol_done (fs, subvol); return ret; } +int +glfs_symlink (struct glfs *fs, const char *data, const char *path) +{ + uuid_t gfid; + dict_t *xattr_req = NULL; + int ret = -1; + + xattr_req = dict_new (); + if (!xattr_req) { + errno = ENOMEM; + return -1 ; + } + + uuid_generate (gfid); + ret = glfs_symlink_with_xdata(fs, data, path, gfid, xattr_req); + + dict_unref (xattr_req); + return ret; +} int glfs_readlink (struct glfs *fs, const char *path, char *buf, size_t bufsiz) @@ -1325,14 +1413,12 @@ out: int -glfs_mknod (struct glfs *fs, const char *path, mode_t mode, dev_t dev) +glfs_mknod_with_xdata (struct glfs *fs, const char *path, mode_t mode, dev_t dev, uuid_t gfid, dict_t *xattr_req) { int ret = -1; xlator_t *subvol = NULL; loc_t loc = {0, }; struct iatt iatt = {0, }; - uuid_t gfid; - dict_t *xattr_req = NULL; int reval = 0; __glfs_entry_fs (fs); @@ -1344,14 +1430,7 @@ glfs_mknod (struct glfs *fs, const char *path, mode_t mode, dev_t dev) goto out; } - xattr_req = dict_new (); - if (!xattr_req) { - ret = -1; - errno = ENOMEM; - goto out; - } - uuid_generate (gfid); ret = dict_set_static_bin (xattr_req, "gfid-req", gfid, 16); if (ret) { ret = -1; @@ -1396,24 +1475,38 @@ retry: out: loc_wipe (&loc); - if (xattr_req) - dict_unref (xattr_req); - glfs_subvol_done (fs, subvol); return ret; } +int +glfs_mknod (struct glfs *fs, const char *path, mode_t mode, dev_t dev) +{ + dict_t *xattr_req = NULL; + uuid_t gfid; + int ret; + + xattr_req = dict_new (); + if (!xattr_req) { + errno = ENOMEM; + return -1; + } + + uuid_generate (gfid); + ret = glfs_mknod_with_xdata(fs, 
path, mode, dev, gfid, xattr_req); + + dict_unref (xattr_req); + return (ret); +} int -glfs_mkdir (struct glfs *fs, const char *path, mode_t mode) +glfs_mkdir_with_xdata (struct glfs *fs, const char *path, mode_t mode, uuid_t gfid, dict_t *xattr_req) { int ret = -1; xlator_t *subvol = NULL; loc_t loc = {0, }; struct iatt iatt = {0, }; - uuid_t gfid; - dict_t *xattr_req = NULL; int reval = 0; __glfs_entry_fs (fs); @@ -1425,14 +1518,6 @@ glfs_mkdir (struct glfs *fs, const char *path, mode_t mode) goto out; } - xattr_req = dict_new (); - if (!xattr_req) { - ret = -1; - errno = ENOMEM; - goto out; - } - - uuid_generate (gfid); ret = dict_set_static_bin (xattr_req, "gfid-req", gfid, 16); if (ret) { ret = -1; @@ -1477,17 +1562,33 @@ retry: out: loc_wipe (&loc); - if (xattr_req) - dict_unref (xattr_req); glfs_subvol_done (fs, subvol); return ret; } +int +glfs_mkdir (struct glfs *fs, const char *path, mode_t mode) +{ + uuid_t gfid; + dict_t *xattr_req = NULL; + int ret; + + xattr_req = dict_new (); + if (!xattr_req) { + errno = ENOMEM; + return -1; + } + + uuid_generate (gfid); + ret = glfs_mkdir_with_xdata(fs, path, mode, gfid, xattr_req); + dict_unref (xattr_req); + return ret; +} int -glfs_unlink (struct glfs *fs, const char *path) +glfs_unlink_with_xdata (struct glfs *fs, const char *path, dict_t *dict) { int ret = -1; xlator_t *subvol = NULL; @@ -1517,7 +1618,7 @@ retry: goto out; } - ret = syncop_unlink (subvol, &loc); + ret = syncop_unlink_with_xdata (subvol, &loc, dict); ESTALE_RETRY (ret, errno, reval, &loc, retry); @@ -1531,9 +1632,14 @@ out: return ret; } +int +glfs_unlink (struct glfs *fs, const char *path) +{ + return(glfs_unlink_with_xdata(fs, path, NULL)); +} int -glfs_rmdir (struct glfs *fs, const char *path) +glfs_rmdir_with_xdata (struct glfs *fs, const char *path, dict_t *dict) { int ret = -1; xlator_t *subvol = NULL; @@ -1563,7 +1669,7 @@ retry: goto out; } - ret = syncop_rmdir (subvol, &loc, 0); + ret = syncop_rmdir_with_xdata (subvol, &loc, 0, dict); 
ESTALE_RETRY (ret, errno, reval, &loc, retry); @@ -1577,9 +1683,14 @@ out: return ret; } +int +glfs_rmdir (struct glfs *fs, const char *path) +{ + return (glfs_rmdir_with_xdata(fs, path, NULL)); +} int -glfs_rename (struct glfs *fs, const char *oldpath, const char *newpath) +glfs_rename_with_xdata (struct glfs *fs, const char *oldpath, const char *newpath, dict_t *dict) { int ret = -1; xlator_t *subvol = NULL; @@ -1626,7 +1737,7 @@ retrynew: /* TODO: check if new or old is a prefix of the other, and fail EINVAL */ - ret = syncop_rename (subvol, &oldloc, &newloc); + ret = syncop_rename_with_xdata (subvol, &oldloc, &newloc, dict); if (ret == -1 && errno == ESTALE) { if (reval < DEFAULT_REVAL_COUNT) { @@ -1652,7 +1763,13 @@ out: int -glfs_link (struct glfs *fs, const char *oldpath, const char *newpath) +glfs_rename (struct glfs *fs, const char *oldpath, const char *newpath) +{ + return(glfs_rename_with_xdata(fs, oldpath, newpath, NULL)); +} + +int +glfs_link_with_xdata (struct glfs *fs, const char *oldpath, const char *newpath, dict_t *dict) { int ret = -1; xlator_t *subvol = NULL; @@ -1703,7 +1820,7 @@ retrynew: } newloc.inode = inode_ref (oldloc.inode); - ret = syncop_link (subvol, &oldloc, &newloc); + ret = syncop_link_with_xdata (subvol, &oldloc, &newloc, dict); if (ret == -1 && errno == ESTALE) { loc_wipe (&oldloc); @@ -1723,6 +1840,11 @@ out: return ret; } +int +glfs_link (struct glfs *fs, const char *oldpath, const char *newpath) +{ + return(glfs_link_with_xdata(fs, oldpath, newpath, NULL)); +} struct glfs_fd * glfs_opendir (struct glfs *fs, const char *path) @@ -2158,8 +2280,8 @@ out: int -glfs_setattr (struct glfs *fs, const char *path, struct iatt *iatt, - int valid, int follow) +glfs_setattr_with_xdata (struct glfs *fs, const char *path, struct iatt *iatt, + int valid, int follow, dict_t *dict) { int ret = -1; xlator_t *subvol = NULL; @@ -2186,7 +2308,7 @@ retry: if (ret) goto out; - ret = syncop_setattr (subvol, &loc, iatt, valid, 0, 0); + ret = 
syncop_setattr_with_xdata (subvol, &loc, iatt, valid, 0, 0, dict); ESTALE_RETRY (ret, errno, reval, &loc, retry); out: @@ -2197,9 +2319,15 @@ out: return ret; } +int +glfs_setattr (struct glfs *fs, const char *path, struct iatt *iatt, + int valid, int follow) +{ + return(glfs_setattr_with_xdata(fs, path, iatt, valid, follow, NULL)); +} int -glfs_fsetattr (struct glfs_fd *glfd, struct iatt *iatt, int valid) +glfs_fsetattr_with_xdata (struct glfs_fd *glfd, struct iatt *iatt, int valid, dict_t *dict) { int ret = -1; xlator_t *subvol = NULL; @@ -2221,7 +2349,7 @@ glfs_fsetattr (struct glfs_fd *glfd, struct iatt *iatt, int valid) goto out; } - ret = syncop_fsetattr (subvol, fd, iatt, valid, 0, 0); + ret = syncop_fsetattr_with_xdata (subvol, fd, iatt, valid, 0, 0, dict); out: if (fd) fd_unref (fd); @@ -2231,6 +2359,11 @@ out: return ret; } +int +glfs_fsetattr (struct glfs_fd *glfd, struct iatt *iatt, int valid) +{ + return(glfs_fsetattr_with_xdata(glfd, iatt, valid, NULL)); +} int glfs_chmod (struct glfs *fs, const char *path, mode_t mode) @@ -2471,8 +2604,8 @@ glfs_lgetxattr (struct glfs *fs, const char *path, const char *name, ssize_t -glfs_fgetxattr (struct glfs_fd *glfd, const char *name, void *value, - size_t size) +glfs_fgetxattr_with_xdata (struct glfs_fd *glfd, const char *name, void *value, + size_t size, dict_t *dict) { int ret = -1; xlator_t *subvol = NULL; @@ -2495,7 +2628,7 @@ glfs_fgetxattr (struct glfs_fd *glfd, const char *name, void *value, goto out; } - ret = syncop_fgetxattr (subvol, fd, &xattr, name); + ret = syncop_fgetxattr_with_xdata (subvol, fd, &xattr, name, dict); if (ret) goto out; @@ -2509,6 +2642,12 @@ out: return ret; } +ssize_t +glfs_fgetxattr (struct glfs_fd *glfd, const char *name, void *value, + size_t size) +{ + return(glfs_fgetxattr_with_xdata(glfd, name, value, size, NULL)); +} int glfs_listxattr_process (void *value, size_t size, dict_t *xattr) @@ -2597,7 +2736,7 @@ glfs_llistxattr (struct glfs *fs, const char *path, void *value, 
size_t size) ssize_t -glfs_flistxattr (struct glfs_fd *glfd, void *value, size_t size) +glfs_flistxattr_with_xdata (struct glfs_fd *glfd, void *value, size_t size,dict_t *dict) { int ret = -1; xlator_t *subvol = NULL; @@ -2620,7 +2759,7 @@ glfs_flistxattr (struct glfs_fd *glfd, void *value, size_t size) goto out; } - ret = syncop_fgetxattr (subvol, fd, &xattr, NULL); + ret = syncop_fgetxattr_with_xdata (subvol, fd, &xattr, NULL, dict); if (ret) goto out; @@ -2635,6 +2774,12 @@ out: } +ssize_t +glfs_flistxattr (struct glfs_fd *glfd, void *value, size_t size) +{ + return(glfs_flistxattr_with_xdata(glfd, value, size, NULL)); +} + dict_t * dict_for_key_value (const char *name, const char *value, size_t size) { @@ -2657,7 +2802,7 @@ dict_for_key_value (const char *name, const char *value, size_t size) int glfs_setxattr_common (struct glfs *fs, const char *path, const char *name, - const void *value, size_t size, int flags, int follow) + const void *value, size_t size, int flags, int follow, dict_t *dict) { int ret = -1; xlator_t *subvol = NULL; @@ -2692,7 +2837,7 @@ retry: goto out; } - ret = syncop_setxattr (subvol, &loc, xattr, flags); + ret = syncop_setxattr_with_xdata (subvol, &loc, xattr, flags, dict); ESTALE_RETRY (ret, errno, reval, &loc, retry); @@ -2711,21 +2856,27 @@ int glfs_setxattr (struct glfs *fs, const char *path, const char *name, const void *value, size_t size, int flags) { - return glfs_setxattr_common (fs, path, name, value, size, flags, 1); + return glfs_setxattr_common (fs, path, name, value, size, flags, 1, NULL); } +int +glfs_setxattr_with_xdata (struct glfs *fs, const char *path, const char *name, + const void *value, size_t size, int flags, dict_t * dict) +{ + return glfs_setxattr_common (fs, path, name, value, size, flags, 1, dict); +} int glfs_lsetxattr (struct glfs *fs, const char *path, const char *name, const void *value, size_t size, int flags) { - return glfs_setxattr_common (fs, path, name, value, size, flags, 0); + return 
glfs_setxattr_common (fs, path, name, value, size, flags, 0, NULL); } int -glfs_fsetxattr (struct glfs_fd *glfd, const char *name, const void *value, - size_t size, int flags) +glfs_fsetxattr_with_xdata (struct glfs_fd *glfd, const char *name, const void *value, + size_t size, int flags, dict_t *dict) { int ret = -1; xlator_t *subvol = NULL; @@ -2755,7 +2906,7 @@ glfs_fsetxattr (struct glfs_fd *glfd, const char *name, const void *value, goto out; } - ret = syncop_fsetxattr (subvol, fd, xattr, flags); + ret = syncop_fsetxattr_with_xdata (subvol, fd, xattr, flags, dict); out: if (xattr) dict_unref (xattr); @@ -2768,10 +2919,16 @@ out: return ret; } +int +glfs_fsetxattr (struct glfs_fd *glfd, const char *name, const void *value, + size_t size, int flags) +{ + return(glfs_fsetxattr_with_xdata(glfd, name, value, size, flags, NULL)); +} int glfs_removexattr_common (struct glfs *fs, const char *path, const char *name, - int follow) + int follow, dict_t *dict) { int ret = -1; xlator_t *subvol = NULL; @@ -2798,7 +2955,7 @@ retry: if (ret) goto out; - ret = syncop_removexattr (subvol, &loc, name); + ret = syncop_removexattr_with_xdata (subvol, &loc, name, dict); ESTALE_RETRY (ret, errno, reval, &loc, retry); @@ -2814,19 +2971,25 @@ out: int glfs_removexattr (struct glfs *fs, const char *path, const char *name) { - return glfs_removexattr_common (fs, path, name, 1); + return glfs_removexattr_common (fs, path, name, 1, NULL); } int glfs_lremovexattr (struct glfs *fs, const char *path, const char *name) { - return glfs_removexattr_common (fs, path, name, 0); + return glfs_removexattr_common (fs, path, name, 0, NULL); +} + +int +glfs_removexattr_with_xdata (struct glfs *fs, const char *path, const char *name, dict_t *dict) +{ + return glfs_removexattr_common (fs, path, name, 1, dict); } int -glfs_fremovexattr (struct glfs_fd *glfd, const char *name) +glfs_fremovexattr_with_xdata (struct glfs_fd *glfd, const char *name, dict_t *dict) { int ret = -1; xlator_t *subvol = NULL; @@ 
-2848,7 +3011,7 @@ glfs_fremovexattr (struct glfs_fd *glfd, const char *name) goto out; } - ret = syncop_fremovexattr (subvol, fd, name); + ret = syncop_fremovexattr_with_xdata (subvol, fd, name, dict); out: if (fd) fd_unref (fd); @@ -2858,6 +3021,11 @@ out: return ret; } +int +glfs_fremovexattr (struct glfs_fd *glfd, const char *name) +{ + return(glfs_fremovexattr_with_xdata(glfd, name, NULL)); +} int glfs_fallocate (struct glfs_fd *glfd, int keep_size, off_t offset, size_t len) @@ -3100,7 +3268,6 @@ out: return retpath; } - char * glfs_getcwd (struct glfs *fs, char *buf, size_t n) { diff --git a/api/src/glfs-handleops.c b/api/src/glfs-handleops.c index 0f996d3a2..6a60557ff 100644 --- a/api/src/glfs-handleops.c +++ b/api/src/glfs-handleops.c @@ -270,7 +270,7 @@ out: } struct glfs_fd * -glfs_h_open (struct glfs *fs, struct glfs_object *object, int flags) +glfs_h_open_with_xdata (struct glfs *fs, struct glfs_object *object, int flags, dict_t * dict) { int ret = -1; struct glfs_fd *glfd = NULL; @@ -279,7 +279,7 @@ glfs_h_open (struct glfs *fs, struct glfs_object *object, int flags) loc_t loc = {0, }; /* validate in args */ - if ((fs == NULL) || (object == NULL)) { + if ((fs == NULL) || (object == NULL)) { /* dict is optional */ errno = EINVAL; return NULL; } @@ -330,7 +330,7 @@ glfs_h_open (struct glfs *fs, struct glfs_object *object, int flags) GLFS_LOC_FILL_INODE (inode, loc, out); /* fop/op */ - ret = syncop_open (subvol, &loc, flags, glfd->fd); + ret = syncop_open_with_xdata (subvol, &loc, flags, glfd->fd, dict); out: loc_wipe (&loc); @@ -352,9 +352,16 @@ out: return glfd; } +struct glfs_fd * +glfs_h_open (struct glfs *fs, struct glfs_object *object, int flags) +{ + return(glfs_h_open_with_xdata(fs, object, flags, NULL)); +} + struct glfs_object * -glfs_h_creat (struct glfs *fs, struct glfs_object *parent, const char *path, - int flags, mode_t mode, struct stat *stat) +glfs_h_creat_with_xdata (struct glfs *fs, struct glfs_object *parent, const char *path, + int flags, 
mode_t mode, struct stat *stat, + uuid_t gfid, dict_t * xattr_req) { int ret = -1; struct glfs_fd *glfd = NULL; @@ -362,12 +369,10 @@ glfs_h_creat (struct glfs *fs, struct glfs_object *parent, const char *path, inode_t *inode = NULL; loc_t loc = {0, }; struct iatt iatt = {0, }; - uuid_t gfid; - dict_t *xattr_req = NULL; struct glfs_object *object = NULL; /* validate in args */ - if ((fs == NULL) || (parent == NULL) || (path == NULL)) { + if ((fs == NULL) || (parent == NULL) || (path == NULL) || (xattr_req == NULL)) { errno = EINVAL; return NULL; } @@ -389,14 +394,6 @@ glfs_h_creat (struct glfs *fs, struct glfs_object *parent, const char *path, goto out; } - xattr_req = dict_new (); - if (!xattr_req) { - ret = -1; - errno = ENOMEM; - goto out; - } - - uuid_generate (gfid); ret = dict_set_static_bin (xattr_req, "gfid-req", gfid, 16); if (ret) { ret = -1; @@ -464,20 +461,34 @@ out: } struct glfs_object * -glfs_h_mkdir (struct glfs *fs, struct glfs_object *parent, const char *path, - mode_t mode, struct stat *stat) +glfs_h_creat (struct glfs *fs, struct glfs_object *parent, const char *path, + int flags, mode_t mode, struct stat *stat) +{ + uuid_t gfid; + dict_t *xattr_req = NULL; + + xattr_req = dict_new (); + if (!xattr_req) { + errno = ENOMEM; + return NULL; + } + uuid_generate (gfid); + return(glfs_h_creat_with_xdata(fs, parent, path, flags, mode, stat, gfid, xattr_req)); +} + +struct glfs_object * +glfs_h_mkdir_with_xdata (struct glfs *fs, struct glfs_object *parent, const char *path, + mode_t mode, struct stat *stat, uuid_t gfid, dict_t *xattr_req) { int ret = -1; xlator_t *subvol = NULL; inode_t *inode = NULL; loc_t loc = {0, }; struct iatt iatt = {0, }; - uuid_t gfid; - dict_t *xattr_req = NULL; struct glfs_object *object = NULL; /* validate in args */ - if ((fs == NULL) || (parent == NULL) || (path == NULL)) { + if ((fs == NULL) || (parent == NULL) || (path == NULL) || (xattr_req == NULL)) { errno = EINVAL; return NULL; } @@ -499,14 +510,6 @@ glfs_h_mkdir 
(struct glfs *fs, struct glfs_object *parent, const char *path, goto out; } - xattr_req = dict_new (); - if (!xattr_req) { - ret = -1; - errno = ENOMEM; - goto out; - } - - uuid_generate (gfid); ret = dict_set_static_bin (xattr_req, "gfid-req", gfid, 16); if (ret) { ret = -1; @@ -552,20 +555,36 @@ out: } struct glfs_object * -glfs_h_mknod (struct glfs *fs, struct glfs_object *parent, const char *path, - mode_t mode, dev_t dev, struct stat *stat) +glfs_h_mkdir (struct glfs *fs, struct glfs_object *parent, const char *path, + mode_t mode, struct stat *stat) +{ + uuid_t gfid; + dict_t *xattr_req = NULL; + + xattr_req = dict_new (); + if (!xattr_req) { + errno = ENOMEM; + return NULL; + } + + uuid_generate (gfid); + return(glfs_h_mkdir_with_xdata(fs, parent, path, mode, stat, gfid, xattr_req)); +} + +struct glfs_object * +glfs_h_mknod_with_xdata (struct glfs *fs, struct glfs_object *parent, const char *path, + mode_t mode, dev_t dev, struct stat *stat, + uuid_t gfid, dict_t * xattr_req) { int ret = -1; xlator_t *subvol = NULL; inode_t *inode = NULL; loc_t loc = {0, }; struct iatt iatt = {0, }; - uuid_t gfid; - dict_t *xattr_req = NULL; struct glfs_object *object = NULL; /* validate in args */ - if ((fs == NULL) || (parent == NULL) || (path == NULL)) { + if ((fs == NULL) || (parent == NULL) || (path == NULL) || (xattr_req == NULL)) { errno = EINVAL; return NULL; } @@ -587,14 +606,6 @@ glfs_h_mknod (struct glfs *fs, struct glfs_object *parent, const char *path, goto out; } - xattr_req = dict_new (); - if (!xattr_req) { - ret = -1; - errno = ENOMEM; - goto out; - } - - uuid_generate (gfid); ret = dict_set_static_bin (xattr_req, "gfid-req", gfid, 16); if (ret) { ret = -1; @@ -638,8 +649,26 @@ out: return object; } +struct glfs_object * +glfs_h_mknod (struct glfs *fs, struct glfs_object *parent, const char *path, + mode_t mode, dev_t dev, struct stat *stat) +{ + uuid_t gfid; + dict_t *xattr_req = NULL; + + xattr_req = dict_new (); + if (!xattr_req) { + errno = ENOMEM; + 
return NULL; + } + + uuid_generate (gfid); + + return(glfs_h_mknod_with_xdata(fs, parent, path, mode, dev, stat, gfid, xattr_req)); +} + int -glfs_h_unlink (struct glfs *fs, struct glfs_object *parent, const char *path) +glfs_h_unlink_with_xdata (struct glfs *fs, struct glfs_object *parent, const char *path, dict_t *dict) { int ret = -1; xlator_t *subvol = NULL; @@ -675,12 +704,12 @@ glfs_h_unlink (struct glfs *fs, struct glfs_object *parent, const char *path) } if (!IA_ISDIR(loc.inode->ia_type)) { - ret = syncop_unlink (subvol, &loc); + ret = syncop_unlink_with_xdata (subvol, &loc, dict); if (ret != 0) { goto out; } } else { - ret = syncop_rmdir (subvol, &loc, 0); + ret = syncop_rmdir_with_xdata (subvol, &loc, 0, dict); if (ret != 0) { goto out; } @@ -700,8 +729,14 @@ out: return ret; } +int +glfs_h_unlink (struct glfs *fs, struct glfs_object *parent, const char *path) +{ + return(glfs_h_unlink_with_xdata(fs, parent, path, NULL)); +} + struct glfs_fd * -glfs_h_opendir (struct glfs *fs, struct glfs_object *object) +glfs_h_opendir_with_xdata (struct glfs *fs, struct glfs_object *object, dict_t *dict) { int ret = -1; struct glfs_fd *glfd = NULL; @@ -754,7 +789,7 @@ glfs_h_opendir (struct glfs *fs, struct glfs_object *object) GLFS_LOC_FILL_INODE (inode, loc, out); /* fop/op */ - ret = syncop_opendir (subvol, &loc, glfd->fd); + ret = syncop_opendir_with_xdata (subvol, &loc, glfd->fd, dict); out: loc_wipe (&loc); @@ -775,6 +810,12 @@ out: return glfd; } +struct glfs_fd * +glfs_h_opendir (struct glfs *fs, struct glfs_object *object) +{ + return(glfs_h_opendir_with_xdata(fs, object, NULL)); +} + ssize_t glfs_h_extract_handle (struct glfs_object *object, unsigned char *handle, int len) @@ -951,21 +992,19 @@ out: } struct glfs_object * -glfs_h_symlink (struct glfs *fs, struct glfs_object *parent, const char *name, - const char *data, struct stat *stat) +glfs_h_symlink_with_xdata (struct glfs *fs, struct glfs_object *parent, const char *name, + const char *data, struct stat 
*stat, uuid_t gfid, dict_t * xattr_req) { int ret = -1; xlator_t *subvol = NULL; inode_t *inode = NULL; loc_t loc = {0, }; struct iatt iatt = {0, }; - uuid_t gfid; - dict_t *xattr_req = NULL; struct glfs_object *object = NULL; /* validate in args */ if ((fs == NULL) || (parent == NULL) || (name == NULL) || - (data == NULL)) { + (data == NULL) || (xattr_req == NULL)) { errno = EINVAL; return NULL; } @@ -987,14 +1026,6 @@ glfs_h_symlink (struct glfs *fs, struct glfs_object *parent, const char *name, goto out; } - xattr_req = dict_new (); - if (!xattr_req) { - ret = -1; - errno = ENOMEM; - goto out; - } - - uuid_generate (gfid); ret = dict_set_static_bin (xattr_req, "gfid-req", gfid, 16); if (ret) { ret = -1; @@ -1044,6 +1075,23 @@ out: return object; } +struct glfs_object * +glfs_h_symlink (struct glfs *fs, struct glfs_object *parent, const char *name, + const char *data, struct stat *stat) +{ + uuid_t gfid; + dict_t *xattr_req = NULL; + + xattr_req = dict_new (); + if (!xattr_req) { + errno = ENOMEM; + return NULL; + } + + uuid_generate (gfid); + return(glfs_h_symlink_with_xdata(fs, parent, name, data, stat, gfid, xattr_req)); +} + int glfs_h_readlink (struct glfs *fs, struct glfs_object *object, char *buf, size_t bufsiz) @@ -1101,8 +1149,8 @@ out: } int -glfs_h_link (struct glfs *fs, struct glfs_object *linksrc, - struct glfs_object *parent, const char *name) +glfs_h_link_with_xdata (struct glfs *fs, struct glfs_object *linksrc, + struct glfs_object *parent, const char *name, dict_t *dict) { int ret = -1; xlator_t *subvol = NULL; @@ -1165,7 +1213,7 @@ glfs_h_link (struct glfs *fs, struct glfs_object *linksrc, newloc.inode = inode_ref (inode); /* fop/op */ - ret = syncop_link (subvol, &oldloc, &newloc); + ret = syncop_link_with_xdata (subvol, &oldloc, &newloc, dict); if (ret == 0) /* TODO: No iatt to pass as there has been no lookup */ @@ -1186,8 +1234,14 @@ out: } int -glfs_h_rename (struct glfs *fs, struct glfs_object *olddir, const char *oldname, - struct 
glfs_object *newdir, const char *newname) +glfs_h_link (struct glfs *fs, struct glfs_object *linksrc, + struct glfs_object *parent, const char *name) +{ + return(glfs_h_link_with_xdata(fs, linksrc, parent, name, NULL)); +} +int +glfs_h_rename_with_xdata (struct glfs *fs, struct glfs_object *olddir, const char *oldname, + struct glfs_object *newdir, const char *newname, dict_t *dict) { int ret = -1; xlator_t *subvol = NULL; @@ -1255,7 +1309,7 @@ glfs_h_rename (struct glfs *fs, struct glfs_object *olddir, const char *oldname, /* TODO: check if new or old is a prefix of the other, and fail EINVAL */ - ret = syncop_rename (subvol, &oldloc, &newloc); + ret = syncop_rename_with_xdata (subvol, &oldloc, &newloc, dict); if (ret == 0) inode_rename (oldloc.parent->table, oldloc.parent, oldloc.name, @@ -1276,3 +1330,10 @@ out: return ret; } + +int +glfs_h_rename (struct glfs *fs, struct glfs_object *olddir, const char *oldname, + struct glfs_object *newdir, const char *newname) +{ + return(glfs_h_rename_with_xdata(fs, olddir, oldname, newdir, newname, NULL)); +} diff --git a/api/src/glfs-handles.h b/api/src/glfs-handles.h index bc26618c4..548268fd6 100644 --- a/api/src/glfs-handles.h +++ b/api/src/glfs-handles.h @@ -84,21 +84,42 @@ struct glfs_object *glfs_h_creat (struct glfs *fs, struct glfs_object *parent, const char *path, int flags, mode_t mode, struct stat *sb) __THROW; +struct glfs_object *glfs_h_creat_with_xdata (struct glfs *fs, struct glfs_object *parent, + const char *path, int flags, mode_t mode, + struct stat *sb, uuid_t gfid, dict_t * xattr_req); + struct glfs_object *glfs_h_mkdir (struct glfs *fs, struct glfs_object *parent, const char *path, mode_t flags, struct stat *sb) __THROW; +struct glfs_object *glfs_h_mkdir_with_xdata (struct glfs *fs, struct glfs_object *parent, + const char *path, mode_t flags, + struct stat *sb, uuid_t gfid, dict_t * xattr_req); + struct glfs_object *glfs_h_mknod (struct glfs *fs, struct glfs_object *parent, const char *path, mode_t 
mode, dev_t dev, struct stat *sb) __THROW; +struct glfs_object *glfs_h_mknod_with_xdata (struct glfs *fs, struct glfs_object *parent, + const char *path, mode_t mode, dev_t dev, + struct stat *sb, uuid_t gfid, dict_t * xattr_req); + struct glfs_object *glfs_h_symlink (struct glfs *fs, struct glfs_object *parent, const char *name, const char *data, struct stat *stat) __THROW; +struct glfs_object *glfs_h_symlink_with_xdata (struct glfs *fs, + struct glfs_object *parent, + const char *name, + const char *data, + struct stat *stat, + uuid_t gfid, + dict_t * xattr_req) __THROW; /* Operations on the actual objects */ int glfs_h_unlink (struct glfs *fs, struct glfs_object *parent, const char *path) __THROW; +int glfs_h_unlink_with_xdata (struct glfs *fs, struct glfs_object *parent, + const char *path, dict_t *dict) __THROW; int glfs_h_close (struct glfs_object *object) __THROW; @@ -122,10 +143,16 @@ int glfs_h_readlink (struct glfs *fs, struct glfs_object *object, char *buf, int glfs_h_link (struct glfs *fs, struct glfs_object *linktgt, struct glfs_object *parent, const char *name) __THROW; +int glfs_h_link_with_xdata (struct glfs *fs, struct glfs_object *linktgt, + struct glfs_object *parent, const char *name, + dict_t *dict) __THROW; int glfs_h_rename (struct glfs *fs, struct glfs_object *olddir, const char *oldname, struct glfs_object *newdir, const char *newname) __THROW; +int glfs_h_rename_with_xdata (struct glfs *fs, struct glfs_object *olddir, + const char *oldname, struct glfs_object *newdir, + const char *newname, dict_t *dict) __THROW; /* Operations enabling opaque invariant handle to object transitions */ ssize_t glfs_h_extract_handle (struct glfs_object *object, @@ -136,11 +163,17 @@ struct glfs_object *glfs_h_create_from_handle (struct glfs *fs, struct stat *stat) __THROW; /* Operations enabling object handles to fd transitions */ -struct glfs_fd *glfs_h_opendir (struct glfs *fs, - struct glfs_object *object) __THROW; +struct glfs_fd *glfs_h_opendir (struct 
glfs *fs, struct glfs_object *object) + __THROW; +struct glfs_fd *glfs_h_opendir_with_xdata (struct glfs *fs, + struct glfs_object *object, + dict_t *dict) __THROW; struct glfs_fd *glfs_h_open (struct glfs *fs, struct glfs_object *object, int flags) __THROW; +struct glfs_fd *glfs_h_open_with_xdata (struct glfs *fs, + struct glfs_object *object, int flags, + dict_t *dict) __THROW; __END_DECLS diff --git a/api/src/glfs.h b/api/src/glfs.h index 20fb18c9e..d79385792 100644 --- a/api/src/glfs.h +++ b/api/src/glfs.h @@ -354,8 +354,11 @@ glfs_fd_t *glfs_open (glfs_t *fs, const char *path, int flags) __THROW; glfs_fd_t *glfs_creat (glfs_t *fs, const char *path, int flags, mode_t mode) __THROW; +glfs_fd_t *glfs_creat_with_xdata (glfs_t *fs, const char *path, int flags, + mode_t mode, uuid_t gfid, dict_t *dict) __THROW; int glfs_close (glfs_fd_t *fd) __THROW; +int glfs_close_with_xdata (glfs_fd_t *fd, dict_t *dict) __THROW; glfs_t *glfs_from_glfd (glfs_fd_t *fd) __THROW; @@ -389,10 +392,13 @@ typedef void (*glfs_io_cbk) (glfs_fd_t *fd, ssize_t ret, void *data); // glfs_{read,write}[_async] -ssize_t glfs_read (glfs_fd_t *fd, void *buf, - size_t count, int flags) __THROW; -ssize_t glfs_write (glfs_fd_t *fd, const void *buf, - size_t count, int flags) __THROW; +ssize_t glfs_read (glfs_fd_t *fd, void *buf, size_t count, int flags) __THROW; +ssize_t glfs_read_with_xdata (struct glfs_fd *glfd, void *buf, size_t count, + int flags, dict_t *dict) __THROW; +ssize_t glfs_write (glfs_fd_t *fd, const void *buf, size_t count, int flags) + __THROW; +ssize_t glfs_write_with_xdata (glfs_fd_t *fd, const void *buf, size_t count, + int flags, dict_t *dict) __THROW; int glfs_read_async (glfs_fd_t *fd, void *buf, size_t count, int flags, glfs_io_cbk fn, void *data) __THROW; int glfs_write_async (glfs_fd_t *fd, const void *buf, size_t count, int flags, @@ -404,6 +410,8 @@ ssize_t glfs_readv (glfs_fd_t *fd, const struct iovec *iov, int iovcnt, int flags) __THROW; ssize_t glfs_writev (glfs_fd_t 
*fd, const struct iovec *iov, int iovcnt, int flags) __THROW; +ssize_t glfs_writev_with_xdata (glfs_fd_t *fd, const struct iovec *iov, + int iovcnt, int flags, dict_t *dict) __THROW; int glfs_readv_async (glfs_fd_t *fd, const struct iovec *iov, int count, int flags, glfs_io_cbk fn, void *data) __THROW; int glfs_writev_async (glfs_fd_t *fd, const struct iovec *iov, int count, @@ -424,29 +432,42 @@ int glfs_pwrite_async (glfs_fd_t *fd, const void *buf, int count, off_t offset, ssize_t glfs_preadv (glfs_fd_t *fd, const struct iovec *iov, int iovcnt, off_t offset, int flags) __THROW; +ssize_t glfs_preadv_with_xdata (glfs_fd_t *fd, const struct iovec *iov, + int iovcnt, off_t offset, int flags, + dict_t *dict) __THROW; ssize_t glfs_pwritev (glfs_fd_t *fd, const struct iovec *iov, int iovcnt, off_t offset, int flags) __THROW; -int glfs_preadv_async (glfs_fd_t *fd, const struct iovec *iov, - int count, off_t offset, int flags, - glfs_io_cbk fn, void *data) __THROW; -int glfs_pwritev_async (glfs_fd_t *fd, const struct iovec *iov, - int count, off_t offset, int flags, - glfs_io_cbk fn, void *data) __THROW; +ssize_t glfs_pwritev_with_xdata (glfs_fd_t *fd, const struct iovec *iov, + int iovcnt, off_t offset, int flags, + dict_t *dict) __THROW; +int glfs_preadv_async (glfs_fd_t *fd, const struct iovec *iov, int count, + off_t offset, int flags, glfs_io_cbk fn, void *data) + __THROW; +int glfs_pwritev_async (glfs_fd_t *fd, const struct iovec *iov, int count, + off_t offset, int flags, glfs_io_cbk fn, void *data) + __THROW; off_t glfs_lseek (glfs_fd_t *fd, off_t offset, int whence) __THROW; +off_t glfs_lseek_with_xdata (glfs_fd_t *fd, off_t offset, int whence, + dict_t *dict) __THROW; int glfs_truncate (glfs_t *fs, const char *path, off_t length) __THROW; int glfs_ftruncate (glfs_fd_t *fd, off_t length) __THROW; +int glfs_ftruncate_with_xdata (glfs_fd_t *fd, off_t length, dict_t *dict) + __THROW; int glfs_ftruncate_async (glfs_fd_t *fd, off_t length, glfs_io_cbk fn, void *data) 
__THROW; int glfs_lstat (glfs_t *fs, const char *path, struct stat *buf) __THROW; int glfs_stat (glfs_t *fs, const char *path, struct stat *buf) __THROW; int glfs_fstat (glfs_fd_t *fd, struct stat *buf) __THROW; +int glfs_fstat_with_xdata (glfs_fd_t *fd, struct stat *buf, dict_t *dict) + __THROW; int glfs_fsync (glfs_fd_t *fd) __THROW; +int glfs_fsync_with_xdata (glfs_fd_t *fd, dict_t *dict) __THROW; int glfs_fsync_async (glfs_fd_t *fd, glfs_io_cbk fn, void *data) __THROW; int glfs_fdatasync (glfs_fd_t *fd) __THROW; @@ -454,22 +475,35 @@ int glfs_fdatasync_async (glfs_fd_t *fd, glfs_io_cbk fn, void *data) __THROW; int glfs_access (glfs_t *fs, const char *path, int mode) __THROW; -int glfs_symlink (glfs_t *fs, const char *oldpath, const char *newpath) __THROW; +int glfs_symlink (glfs_t *fs, const char *oldpath, const char *newpath) + __THROW; +int glfs_symlink_with_xdata (glfs_t *fs, const char *oldpath, + const char *newpath, uuid_t gfid, dict_t *dict) + __THROW; int glfs_readlink (glfs_t *fs, const char *path, char *buf, size_t bufsiz) __THROW; int glfs_mknod (glfs_t *fs, const char *path, mode_t mode, dev_t dev) __THROW; +int glfs_mknod_with_xdata (glfs_t *fs, const char *path, mode_t mode, + dev_t dev, uuid_t gfid, dict_t *dict) __THROW; int glfs_mkdir (glfs_t *fs, const char *path, mode_t mode) __THROW; +int glfs_mkdir_with_xdata (glfs_t *fs, const char *path, mode_t mode, + uuid_t gfid, dict_t *dict) __THROW; int glfs_unlink (glfs_t *fs, const char *path) __THROW; int glfs_rmdir (glfs_t *fs, const char *path) __THROW; +int glfs_rmdir_with_xdata (glfs_t *fs, const char *path, dict_t *dict) __THROW; int glfs_rename (glfs_t *fs, const char *oldpath, const char *newpath) __THROW; +int glfs_rename_with_xdata (glfs_t *fs, const char *oldpath, + const char *newpath, dict_t *dict) __THROW; int glfs_link (glfs_t *fs, const char *oldpath, const char *newpath) __THROW; +int glfs_link_with_xdata (glfs_t *fs, const char *oldpath, const char *newpath, + dict_t *dict) 
__THROW; glfs_fd_t *glfs_opendir (glfs_t *fs, const char *path) __THROW; @@ -532,6 +566,9 @@ ssize_t glfs_lgetxattr (glfs_t *fs, const char *path, const char *name, ssize_t glfs_fgetxattr (glfs_fd_t *fd, const char *name, void *value, size_t size) __THROW; +ssize_t glfs_fgetxattr_with_xdata (glfs_fd_t *fd, const char *name, + void *value, size_t size, dict_t *dict) + __THROW; ssize_t glfs_listxattr (glfs_t *fs, const char *path, void *value, size_t size) __THROW; @@ -540,21 +577,34 @@ ssize_t glfs_llistxattr (glfs_t *fs, const char *path, void *value, size_t size) __THROW; ssize_t glfs_flistxattr (glfs_fd_t *fd, void *value, size_t size) __THROW; +ssize_t glfs_flistxattr_with_xdata (glfs_fd_t *fd, void *value, size_t size, + dict_t *dict) __THROW; int glfs_setxattr (glfs_t *fs, const char *path, const char *name, const void *value, size_t size, int flags) __THROW; +int glfs_setxattr_with_xdata (glfs_t *fs, const char *path, const char *name, + const void *value, size_t size, int flags, dict_t *dict); + int glfs_lsetxattr (glfs_t *fs, const char *path, const char *name, const void *value, size_t size, int flags) __THROW; int glfs_fsetxattr (glfs_fd_t *fd, const char *name, const void *value, size_t size, int flags) __THROW; +int glfs_fsetxattr_with_xdata (glfs_fd_t *fd, const char *name, + const void *value, size_t size, int flags, + dict_t *dict) __THROW; + int glfs_removexattr (glfs_t *fs, const char *path, const char *name) __THROW; +int glfs_removexattr_with_xdata (glfs_t *fs, const char *path, + const char *name, dict_t *dict) __THROW; int glfs_lremovexattr (glfs_t *fs, const char *path, const char *name) __THROW; int glfs_fremovexattr (glfs_fd_t *fd, const char *name) __THROW; +int glfs_fremovexattr_with_xdata (glfs_fd_t *fd, const char *name, + dict_t *dict) __THROW; int glfs_fallocate(glfs_fd_t *fd, int keep_size, off_t offset, size_t len) __THROW; @@ -578,6 +628,20 @@ int glfs_fchdir (glfs_fd_t *fd) __THROW; char *glfs_realpath (glfs_t *fs, const char 
*path, char *resolved_path) __THROW; +int +glfs_setattr_with_xdata (struct glfs *fs, const char *path, struct iatt *iatt, + int valid, int follow, dict_t *dict); +int +glfs_fsetattr_with_xdata (struct glfs_fd *glfd, struct iatt *iatt, int valid, dict_t *dict); +int +glfs_setattr (struct glfs *fs, const char *path, struct iatt *iatt, + int valid, int follow); +int +glfs_fsetattr (struct glfs_fd *glfd, struct iatt *iatt, int valid); + + + + /* * @cmd and @flock are as specified in man fcntl(2). */ diff --git a/configure.ac b/configure.ac index 7bfee047a..581b976a0 100644 --- a/configure.ac +++ b/configure.ac @@ -58,6 +58,12 @@ AC_CONFIG_FILES([Makefile xlators/cluster/Makefile xlators/cluster/afr/Makefile xlators/cluster/afr/src/Makefile + xlators/cluster/nsr-server/Makefile + xlators/cluster/nsr-server/src/Makefile + xlators/cluster/nsr-recon/Makefile + xlators/cluster/nsr-recon/src/Makefile + xlators/cluster/nsr-client/Makefile + xlators/cluster/nsr-client/src/Makefile xlators/cluster/stripe/Makefile xlators/cluster/stripe/src/Makefile xlators/cluster/dht/Makefile diff --git a/glusterfs.spec.in b/glusterfs.spec.in index f7c2fc5b8..e6dbd5f35 100644 --- a/glusterfs.spec.in +++ b/glusterfs.spec.in @@ -661,6 +661,8 @@ find ./tests ./run-tests.sh -type f | cpio -pd %{buildroot}%{_prefix}/share/glus %exclude %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/protocol/server* %exclude %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/mgmt* %exclude %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/nfs* +%exclude %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/cluster/nsr.so +%exclude %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/cluster/nsr_recon.so # sample xlators not generally used or usable %exclude %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/encryption/rot-13* %exclude %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/features/mac-compat* @@ -745,6 +747,8 @@ fi %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/protocol/server* 
%{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/mgmt* %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/nfs* +%{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/cluster/nsr.so +%{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/cluster/nsr_recon.so %ghost %attr(0644,-,-) %config(noreplace) %{_sharedstatedir}/glusterd/glusterd.info %ghost %attr(0600,-,-) %{_sharedstatedir}/glusterd/options # This is really ugly, but I have no idea how to mark these directories in an diff --git a/libglusterfs/src/call-stub.c b/libglusterfs/src/call-stub.c index 7e94ee3c0..86c2463ef 100644 --- a/libglusterfs/src/call-stub.c +++ b/libglusterfs/src/call-stub.c @@ -2297,7 +2297,7 @@ out: } -static void +void call_resume_wind (call_stub_t *stub) { GF_VALIDATE_OR_GOTO ("call-stub", stub, out); diff --git a/libglusterfs/src/call-stub.h b/libglusterfs/src/call-stub.h index 0f6c108ee..ccf92cf53 100644 --- a/libglusterfs/src/call-stub.h +++ b/libglusterfs/src/call-stub.h @@ -764,4 +764,10 @@ fop_zerofill_cbk_stub(call_frame_t *frame, void call_resume (call_stub_t *stub); void call_stub_destroy (call_stub_t *stub); void call_unwind_error (call_stub_t *stub, int op_ret, int op_errno); + +/* + * Sometimes we might want to call just this, perhaps repeatedly, without + * having (or being able) to destroy and recreate it. 
+ */ +void call_resume_wind (call_stub_t *stub); #endif diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h index dfe443016..33d2087fc 100644 --- a/libglusterfs/src/glusterfs.h +++ b/libglusterfs/src/glusterfs.h @@ -200,7 +200,7 @@ typedef enum { GF_FOP_WRITE, GF_FOP_STATFS, GF_FOP_FLUSH, - GF_FOP_FSYNC, /* 15 */ + GF_FOP_FSYNC, /* 16 */ GF_FOP_SETXATTR, GF_FOP_GETXATTR, GF_FOP_REMOVEXATTR, diff --git a/libglusterfs/src/list.h b/libglusterfs/src/list.h index 7f3712b51..6fcf17f35 100644 --- a/libglusterfs/src/list.h +++ b/libglusterfs/src/list.h @@ -187,4 +187,18 @@ list_append_init (struct list_head *list, struct list_head *head) &pos->member != (head); \ pos = n, n = list_entry(n->member.prev, typeof(*n), member)) +/* + * This list implementation has some advantages, but one disadvantage: you + * can't use NULL to check whether you're at the head or tail. Thus, the + * address of the head has to be an argument for these macros. + */ + +#define list_next(ptr,head,type,member) \ + (((ptr)->member.next == head) ? NULL \ + : list_entry((ptr)->member.next,type,member)) + +#define list_prev(ptr,head,type,member) \ + (((ptr)->member.prev == head) ? 
NULL \ + : list_entry((ptr)->member.prev,type,member)) + #endif /* _LLIST_H */ diff --git a/libglusterfs/src/syncop.c b/libglusterfs/src/syncop.c index 1f36e5776..a4a5596c3 100644 --- a/libglusterfs/src/syncop.c +++ b/libglusterfs/src/syncop.c @@ -1158,6 +1158,22 @@ syncop_opendir (xlator_t *subvol, } +int +syncop_opendir_with_xdata (xlator_t *subvol, + loc_t *loc, + fd_t *fd, + dict_t *dict) +{ + struct syncargs args = {0, }; + + SYNCOP (subvol, (&args), syncop_opendir_cbk, subvol->fops->opendir, + loc, fd, dict); + + errno = args.op_errno; + return args.op_ret; + +} + int syncop_fsyncdir_cbk (call_frame_t *frame, void* cookie, xlator_t *this, int op_ret, int op_errno, dict_t *xdata) @@ -1204,11 +1220,17 @@ syncop_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int syncop_removexattr (xlator_t *subvol, loc_t *loc, const char *name) +{ + return(syncop_removexattr_with_xdata(subvol, loc, name, NULL)); +} + +int +syncop_removexattr_with_xdata (xlator_t *subvol, loc_t *loc, const char *name, dict_t *dict) { struct syncargs args = {0, }; SYNCOP (subvol, (&args), syncop_removexattr_cbk, subvol->fops->removexattr, - loc, name, NULL); + loc, name, dict); errno = args.op_errno; return args.op_ret; @@ -1242,6 +1264,17 @@ syncop_fremovexattr (xlator_t *subvol, fd_t *fd, const char *name) return args.op_ret; } +int +syncop_fremovexattr_with_xdata (xlator_t *subvol, fd_t *fd, const char *name, dict_t *dict) +{ + struct syncargs args = {0, }; + + SYNCOP (subvol, (&args), syncop_fremovexattr_cbk, + subvol->fops->fremovexattr, fd, name, dict); + + errno = args.op_errno; + return args.op_ret; +} int syncop_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, dict_t *xdata) @@ -1258,14 +1291,19 @@ syncop_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, return 0; } - int syncop_setxattr (xlator_t *subvol, loc_t *loc, dict_t *dict, int32_t flags) +{ + return (syncop_setxattr_with_xdata(subvol, loc, dict, flags, 
NULL)); +} + +int +syncop_setxattr_with_xdata (xlator_t *subvol, loc_t *loc, dict_t *dict, int32_t flags, dict_t *extra) { struct syncargs args = {0, }; SYNCOP (subvol, (&args), syncop_setxattr_cbk, subvol->fops->setxattr, - loc, dict, flags, NULL); + loc, dict, flags, extra); errno = args.op_errno; return args.op_ret; @@ -1300,6 +1338,18 @@ syncop_fsetxattr (xlator_t *subvol, fd_t *fd, dict_t *dict, int32_t flags) return args.op_ret; } +int +syncop_fsetxattr_with_xdata (xlator_t *subvol, fd_t *fd, dict_t *dict, int32_t flags, dict_t *extra) +{ + struct syncargs args = {0, }; + + SYNCOP (subvol, (&args), syncop_fsetxattr_cbk, subvol->fops->fsetxattr, + fd, dict, flags, extra); + + errno = args.op_errno; + return args.op_ret; +} + int syncop_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, dict_t *dict, dict_t *xdata) @@ -1353,12 +1403,12 @@ syncop_getxattr (xlator_t *subvol, loc_t *loc, dict_t **dict, const char *key) } int -syncop_fgetxattr (xlator_t *subvol, fd_t *fd, dict_t **dict, const char *key) +syncop_fgetxattr_with_xdata (xlator_t *subvol, fd_t *fd, dict_t **dict, const char *key, dict_t *extra) { struct syncargs args = {0, }; SYNCOP (subvol, (&args), syncop_getxattr_cbk, subvol->fops->fgetxattr, - fd, key, NULL); + fd, key, extra); if (dict) *dict = args.xattr; @@ -1369,6 +1419,12 @@ syncop_fgetxattr (xlator_t *subvol, fd_t *fd, dict_t **dict, const char *key) return args.op_ret; } +int +syncop_fgetxattr (xlator_t *subvol, fd_t *fd, dict_t **dict, const char *key) +{ + return(syncop_fgetxattr_with_xdata(subvol, fd, dict, key, NULL)); +} + int syncop_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, @@ -1432,13 +1488,13 @@ syncop_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int -syncop_setattr (xlator_t *subvol, loc_t *loc, struct iatt *iatt, int valid, - struct iatt *preop, struct iatt *postop) +syncop_setattr_with_xdata (xlator_t *subvol, loc_t 
*loc, struct iatt *iatt, int valid, + struct iatt *preop, struct iatt *postop, dict_t *dict) { struct syncargs args = {0, }; SYNCOP (subvol, (&args), syncop_setattr_cbk, subvol->fops->setattr, - loc, iatt, valid, NULL); + loc, iatt, valid, dict); if (preop) *preop = args.iatt1; @@ -1449,15 +1505,21 @@ syncop_setattr (xlator_t *subvol, loc_t *loc, struct iatt *iatt, int valid, return args.op_ret; } +int +syncop_setattr (xlator_t *subvol, loc_t *loc, struct iatt *iatt, int valid, + struct iatt *preop, struct iatt *postop) +{ + return(syncop_setattr_with_xdata(subvol, loc, iatt, valid, preop, postop, NULL)); +} int -syncop_fsetattr (xlator_t *subvol, fd_t *fd, struct iatt *iatt, int valid, - struct iatt *preop, struct iatt *postop) +syncop_fsetattr_with_xdata (xlator_t *subvol, fd_t *fd, struct iatt *iatt, int valid, + struct iatt *preop, struct iatt *postop, dict_t *dict) { struct syncargs args = {0, }; SYNCOP (subvol, (&args), syncop_setattr_cbk, subvol->fops->fsetattr, - fd, iatt, valid, NULL); + fd, iatt, valid, dict); if (preop) *preop = args.iatt1; @@ -1468,6 +1530,12 @@ syncop_fsetattr (xlator_t *subvol, fd_t *fd, struct iatt *iatt, int valid, return args.op_ret; } +int +syncop_fsetattr (xlator_t *subvol, fd_t *fd, struct iatt *iatt, int valid, + struct iatt *preop, struct iatt *postop) +{ + return(syncop_fsetattr_with_xdata(subvol, fd, iatt, valid, preop, postop, NULL)); +} int32_t syncop_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, @@ -1498,6 +1566,19 @@ syncop_open (xlator_t *subvol, loc_t *loc, int32_t flags, fd_t *fd) } +int +syncop_open_with_xdata (xlator_t *subvol, loc_t *loc, int32_t flags, fd_t *fd, dict_t *dict) +{ + struct syncargs args = {0, }; + + SYNCOP (subvol, (&args), syncop_open_cbk, subvol->fops->open, + loc, flags, fd, dict); + + errno = args.op_errno; + return args.op_ret; + +} + int32_t syncop_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, @@ -1528,14 +1609,14 @@ syncop_readv_cbk (call_frame_t *frame, void 
*cookie, xlator_t *this, } int -syncop_readv (xlator_t *subvol, fd_t *fd, size_t size, off_t off, +syncop_readv_with_xdata (xlator_t *subvol, fd_t *fd, size_t size, off_t off, uint32_t flags, struct iovec **vector, int *count, - struct iobref **iobref) + struct iobref **iobref, dict_t *dict) { struct syncargs args = {0, }; SYNCOP (subvol, (&args), syncop_readv_cbk, subvol->fops->readv, - fd, size, off, flags, NULL); + fd, size, off, flags, dict); if (args.op_ret < 0) goto out; @@ -1560,6 +1641,14 @@ out: } +int +syncop_readv (xlator_t *subvol, fd_t *fd, size_t size, off_t off, + uint32_t flags, struct iovec **vector, int *count, + struct iobref **iobref) +{ + return(syncop_readv_with_xdata(subvol, fd, size, off, flags, vector, count, iobref, NULL)); +} + int syncop_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, struct iatt *prebuf, @@ -1578,20 +1667,28 @@ syncop_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } int -syncop_writev (xlator_t *subvol, fd_t *fd, const struct iovec *vector, +syncop_writev_with_xdata (xlator_t *subvol, fd_t *fd, const struct iovec *vector, int32_t count, off_t offset, struct iobref *iobref, - uint32_t flags) + uint32_t flags, dict_t *dict) { struct syncargs args = {0, }; SYNCOP (subvol, (&args), syncop_writev_cbk, subvol->fops->writev, fd, (struct iovec *) vector, count, offset, flags, iobref, - NULL); + dict); errno = args.op_errno; return args.op_ret; } +int +syncop_writev (xlator_t *subvol, fd_t *fd, const struct iovec *vector, + int32_t count, off_t offset, struct iobref *iobref, + uint32_t flags) +{ + return(syncop_writev_with_xdata(subvol, fd, vector, count, offset, iobref, flags, NULL)); +} + int syncop_write (xlator_t *subvol, fd_t *fd, const char *buf, int size, off_t offset, struct iobref *iobref, uint32_t flags) { @@ -1684,6 +1781,18 @@ syncop_unlink (xlator_t *subvol, loc_t *loc) return args.op_ret; } +int +syncop_unlink_with_xdata (xlator_t *subvol, loc_t *loc, dict_t 
*dict) +{ + struct syncargs args = {0, }; + + SYNCOP (subvol, (&args), syncop_unlink_cbk, subvol->fops->unlink, loc, + 0, dict); + + errno = args.op_errno; + return args.op_ret; +} + int syncop_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, struct iatt *preparent, @@ -1714,6 +1823,18 @@ syncop_rmdir (xlator_t *subvol, loc_t *loc, int flags) } +int +syncop_rmdir_with_xdata (xlator_t *subvol, loc_t *loc, int flags, dict_t *dict) +{ + struct syncargs args = {0, }; + + SYNCOP (subvol, (&args), syncop_rmdir_cbk, subvol->fops->rmdir, loc, + flags, dict); + + errno = args.op_errno; + return args.op_ret; +} + int syncop_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, @@ -1746,6 +1867,18 @@ syncop_link (xlator_t *subvol, loc_t *oldloc, loc_t *newloc) return args.op_ret; } +int +syncop_link_with_xdata (xlator_t *subvol, loc_t *oldloc, loc_t *newloc, dict_t *dict) +{ + struct syncargs args = {0, }; + + SYNCOP (subvol, (&args), syncop_link_cbk, subvol->fops->link, + oldloc, newloc, dict); + + errno = args.op_errno; + + return args.op_ret; +} int syncop_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, @@ -1780,6 +1913,19 @@ syncop_rename (xlator_t *subvol, loc_t *oldloc, loc_t *newloc) return args.op_ret; } +int +syncop_rename_with_xdata (xlator_t *subvol, loc_t *oldloc, loc_t *newloc, dict_t *dict) +{ + struct syncargs args = {0, }; + + SYNCOP (subvol, (&args), syncop_rename_cbk, subvol->fops->rename, + oldloc, newloc, dict); + + errno = args.op_errno; + + return args.op_ret; +} + int syncop_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, @@ -1810,6 +1956,18 @@ syncop_ftruncate (xlator_t *subvol, fd_t *fd, off_t offset) return args.op_ret; } +int +syncop_ftruncate_with_xdata (xlator_t *subvol, fd_t *fd, off_t offset, dict_t *dict) +{ + struct syncargs args = {0, }; + + SYNCOP (subvol, (&args), syncop_ftruncate_cbk, subvol->fops->ftruncate, + 
fd, offset, dict); + + errno = args.op_errno; + return args.op_ret; +} + int syncop_truncate (xlator_t *subvol, loc_t *loc, off_t offset) { @@ -1853,6 +2011,19 @@ syncop_fsync (xlator_t *subvol, fd_t *fd, int dataonly) } +int +syncop_fsync_with_xdata (xlator_t *subvol, fd_t *fd, int dataonly, dict_t *dict) +{ + struct syncargs args = {0, }; + + SYNCOP (subvol, (&args), syncop_fsync_cbk, subvol->fops->fsync, + fd, dataonly, dict); + + errno = args.op_errno; + return args.op_ret; + +} + int syncop_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, @@ -1884,6 +2055,19 @@ syncop_flush (xlator_t *subvol, fd_t *fd) } +int +syncop_flush_with_xdata (xlator_t *subvol, fd_t *fd, dict_t *dict) +{ + struct syncargs args = {0}; + + SYNCOP (subvol, (&args), syncop_flush_cbk, subvol->fops->flush, + fd, dict); + + errno = args.op_errno; + return args.op_ret; + +} + int syncop_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *stbuf, dict_t *xdata) @@ -1917,6 +2101,21 @@ syncop_fstat (xlator_t *subvol, fd_t *fd, struct iatt *stbuf) errno = args.op_errno; return args.op_ret; +} +int +syncop_fstat_with_xdata (xlator_t *subvol, fd_t *fd, struct iatt *stbuf, dict_t *dict) +{ + struct syncargs args = {0, }; + + SYNCOP (subvol, (&args), syncop_fstat_cbk, subvol->fops->fstat, + fd, dict); + + if (stbuf) + *stbuf = args.iatt1; + + errno = args.op_errno; + return args.op_ret; + } int diff --git a/libglusterfs/src/syncop.h b/libglusterfs/src/syncop.h index 68218bb17..87985588f 100644 --- a/libglusterfs/src/syncop.h +++ b/libglusterfs/src/syncop.h @@ -344,49 +344,79 @@ int syncop_readdir (xlator_t *subvol, fd_t *fd, size_t size, off_t off, gf_dirent_t *entries); int syncop_opendir (xlator_t *subvol, loc_t *loc, fd_t *fd); +int syncop_opendir_with_xdata (xlator_t *subvol, loc_t *loc, fd_t *fd, dict_t *dict); int syncop_setattr (xlator_t *subvol, loc_t *loc, struct iatt *iatt, int valid, /* out */ struct iatt *preop, struct 
iatt *postop); +int syncop_setattr_with_xdata (xlator_t *subvol, loc_t *loc, struct iatt *iatt, int valid, + /* out */ + struct iatt *preop, struct iatt *postop, dict_t *dict); + int syncop_fsetattr (xlator_t *subvol, fd_t *fd, struct iatt *iatt, int valid, /* out */ struct iatt *preop, struct iatt *postop); +int syncop_fsetattr_with_xdata (xlator_t *subvol, fd_t *fd, struct iatt *iatt, int valid, + /* out */ + struct iatt *preop, struct iatt *postop, dict_t *dict); + int syncop_statfs (xlator_t *subvol, loc_t *loc, struct statvfs *buf); int syncop_setxattr (xlator_t *subvol, loc_t *loc, dict_t *dict, int32_t flags); +int syncop_setxattr_with_xdata (xlator_t *subvol, loc_t *loc, dict_t *dict, int32_t flags, dict_t *extra); int syncop_fsetxattr (xlator_t *subvol, fd_t *fd, dict_t *dict, int32_t flags); +int syncop_fsetxattr_with_xdata (xlator_t *subvol, fd_t *fd, dict_t *dict, int32_t flags, dict_t *extra); int syncop_listxattr (xlator_t *subvol, loc_t *loc, dict_t **dict); int syncop_getxattr (xlator_t *xl, loc_t *loc, dict_t **dict, const char *key); int syncop_fgetxattr (xlator_t *xl, fd_t *fd, dict_t **dict, const char *key); +int syncop_fgetxattr_with_xdata (xlator_t *xl, fd_t *fd, dict_t **dict, const char *key, dict_t *extra); int syncop_removexattr (xlator_t *subvol, loc_t *loc, const char *name); +int syncop_removexattr_with_xdata (xlator_t *subvol, loc_t *loc, const char *name, dict_t *dict); int syncop_fremovexattr (xlator_t *subvol, fd_t *fd, const char *name); +int syncop_fremovexattr_with_xdata (xlator_t *subvol, fd_t *fd, const char *name, dict_t *dict); int syncop_create (xlator_t *subvol, loc_t *loc, int32_t flags, mode_t mode, fd_t *fd, dict_t *dict, struct iatt *iatt); int syncop_open (xlator_t *subvol, loc_t *loc, int32_t flags, fd_t *fd); +int syncop_open_with_xdata (xlator_t *subvol, loc_t *loc, int32_t flags, fd_t *fd, dict_t *dict); int syncop_close (fd_t *fd); +int syncop_close_with_xdata (fd_t *fd, dict_t *dict); int syncop_write (xlator_t 
*subvol, fd_t *fd, const char *buf, int size, off_t offset, struct iobref *iobref, uint32_t flags); int syncop_writev (xlator_t *subvol, fd_t *fd, const struct iovec *vector, int32_t count, off_t offset, struct iobref *iobref, uint32_t flags); +int syncop_writev_with_xdata (xlator_t *subvol, fd_t *fd, const struct iovec *vector, + int32_t count, off_t offset, struct iobref *iobref, + uint32_t flags, dict_t *dict); int syncop_readv (xlator_t *subvol, fd_t *fd, size_t size, off_t off, uint32_t flags, /* out */ struct iovec **vector, int *count, struct iobref **iobref); +int syncop_readv_with_xdata (xlator_t *subvol, fd_t *fd, size_t size, off_t off, + uint32_t flags, + /* out */ + struct iovec **vector, int *count, struct iobref **iobref, dict_t *dict); int syncop_ftruncate (xlator_t *subvol, fd_t *fd, off_t offset); +int syncop_ftruncate_with_xdata (xlator_t *subvol, fd_t *fd, off_t offset, dict_t *dict); int syncop_truncate (xlator_t *subvol, loc_t *loc, off_t offset); int syncop_unlink (xlator_t *subvol, loc_t *loc); +int syncop_unlink_with_xdata (xlator_t *subvol, loc_t *loc, dict_t *dict); + int syncop_rmdir (xlator_t *subvol, loc_t *loc, int flags); +int syncop_rmdir_with_xdata (xlator_t *subvol, loc_t *loc, int flags, dict_t *dict); int syncop_fsync (xlator_t *subvol, fd_t *fd, int dataonly); +int syncop_fsync_with_xdata (xlator_t *subvol, fd_t *fd, int dataonly, dict_t *dict); int syncop_flush (xlator_t *subvol, fd_t *fd); +int syncop_flush_with_xdata (xlator_t *subvol, fd_t *fd, dict_t *dict); int syncop_fstat (xlator_t *subvol, fd_t *fd, struct iatt *stbuf); +int syncop_fstat_with_xdata (xlator_t *subvol, fd_t *fd, struct iatt *stbuf, dict_t *dict); int syncop_stat (xlator_t *subvol, loc_t *loc, struct iatt *stbuf); int syncop_symlink (xlator_t *subvol, loc_t *loc, const char *newpath, @@ -397,6 +427,7 @@ int syncop_mknod (xlator_t *subvol, loc_t *loc, mode_t mode, dev_t rdev, int syncop_mkdir (xlator_t *subvol, loc_t *loc, mode_t mode, dict_t *dict, struct 
iatt *iatt); int syncop_link (xlator_t *subvol, loc_t *oldloc, loc_t *newloc); +int syncop_link_with_xdata (xlator_t *subvol, loc_t *oldloc, loc_t *newloc, dict_t *dict); int syncop_fsyncdir (xlator_t *subvol, fd_t *fd, int datasync); int syncop_access (xlator_t *subvol, loc_t *loc, int32_t mask); int syncop_fallocate(xlator_t *subvol, fd_t *fd, int32_t keep_size, off_t offset, @@ -406,6 +437,7 @@ int syncop_discard(xlator_t *subvol, fd_t *fd, off_t offset, size_t len); int syncop_zerofill(xlator_t *subvol, fd_t *fd, off_t offset, off_t len); int syncop_rename (xlator_t *subvol, loc_t *oldloc, loc_t *newloc); +int syncop_rename_with_xdata (xlator_t *subvol, loc_t *oldloc, loc_t *newloc, dict_t *dict); int syncop_lk (xlator_t *subvol, fd_t *fd, int cmd, struct gf_flock *flock); diff --git a/tests/basic/nsr.t b/tests/basic/nsr.t new file mode 100644 index 000000000..5d6faf78e --- /dev/null +++ b/tests/basic/nsr.t @@ -0,0 +1,47 @@ +#!/bin/bash + +# Test *very basic* NSR functionality - startup, mount, simplest possible file +# write. + +. $(dirname $0)/../include.rc +. $(dirname $0)/../volume.rc + +function get_rep_count { + v=$(getfattr --only-values -e text -n trusted.nsr.rep-count $1 2> /dev/null) + #echo $v > /dev/tty + echo $v +} + +function ping_file { + dd if=/dev/urandom of=$1 bs=4k count=1 2> /dev/null +} + +cleanup + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume info + +TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{1,2} + +EXPECT "$V0" volinfo_field $V0 'Volume Name' +EXPECT 'Created' volinfo_field $V0 'Status' +EXPECT '2' brick_count $V0 + +TEST $CLI volume set $V0 cluster.nsr on + +TEST $CLI volume start $V0 +EXPECT 'Started' volinfo_field $V0 'Status' + +## Mount FUSE with caching disabled (read-only) +TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0 + +# Give the bricks a chance to connect to each other. 
+EXPECT_WITHIN 10 "2" get_rep_count $M0 + +TEST ping_file $M0/probe +TEST cmp ${M0}/probe ${B0}/${V0}1/probe +TEST cmp ${M0}/probe ${B0}/${V0}2/probe + +cleanup +killall -9 etcd diff --git a/xlators/cluster/Makefile.am b/xlators/cluster/Makefile.am index 0990822a7..6e883e565 100644 --- a/xlators/cluster/Makefile.am +++ b/xlators/cluster/Makefile.am @@ -1,3 +1,3 @@ -SUBDIRS = stripe afr dht +SUBDIRS = stripe afr dht nsr-server nsr-recon nsr-client CLEANFILES = diff --git a/xlators/cluster/nsr-client/Makefile.am b/xlators/cluster/nsr-client/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/xlators/cluster/nsr-client/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/cluster/nsr-client/src/Makefile.am b/xlators/cluster/nsr-client/src/Makefile.am new file mode 100644 index 000000000..bacd1a906 --- /dev/null +++ b/xlators/cluster/nsr-client/src/Makefile.am @@ -0,0 +1,33 @@ +python_PYTHON = gen-fops.py + +xlator_LTLIBRARIES = nsrc.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster + +nsrc_la_LDFLAGS = -module -avoid-version +nsrc_la_SOURCES = nsrc.c + +nsrc_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = fop-template.c \ + $(top_srcdir)/xlators/lib/src/libxlator.h \ + $(top_srcdir)/glusterfsd/src/glusterfsd.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) \ + -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \ + -I$(top_srcdir)/rpc/rpc-lib/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +XLATOR_HEADER = $(top_srcdir)/libglusterfs/src/xlator.h + +CLEANFILES = nsrc-cg.c + +CODEGEN_DIR = ../../nsr-server/src/codegen.py + +nsrc-cg.c: gen-fops.py $(CODEGEN) $(XLATOR_HEADER) fop-template.c + $(PYTHON) ./gen-fops.py $(XLATOR_HEADER) fop-template.c > $@ + +nsrc.lo: nsrc-cg.c + +uninstall-local: + rm -f $(DESTDIR)$(xlatordir)/nsr.so diff --git a/xlators/cluster/nsr-client/src/fop-template.c b/xlators/cluster/nsr-client/src/fop-template.c new file mode 100644 index 
000000000..699b07d40 --- /dev/null +++ b/xlators/cluster/nsr-client/src/fop-template.c @@ -0,0 +1,113 @@ +// template-name fop +$TYPE$ +nsrc_$NAME$ (call_frame_t *frame, xlator_t *this, + $ARGS_LONG$) +{ + nsrc_local_t *local = NULL; + xlator_t *target_xl = ACTIVE_CHILD(this); + + local = mem_get(this->local_pool); + if (!local) { + goto err; + } + + local->stub = fop_$NAME$_stub (frame, nsrc_$NAME$_continue, + $ARGS_SHORT$); + if (!local->stub) { + goto err; + } + local->curr_xl = target_xl; + local->scars = 0; + + frame->local = local; + STACK_WIND_COOKIE (frame, nsrc_$NAME$_cbk, target_xl, + target_xl, target_xl->fops->$NAME$, + $ARGS_SHORT$); + return 0; + +err: + if (local) { + mem_put(local); + } + STACK_UNWIND_STRICT ($NAME$, frame, -1, ENOMEM, + $DEFAULTS$); + return 0; +} + +// template-name cbk +$TYPE$ +nsrc_$NAME$_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + $ARGS_LONG$) +{ + nsrc_local_t *local = frame->local; + xlator_t *last_xl = cookie; + xlator_t *next_xl; + nsrc_private_t *priv = this->private; + struct timespec spec; + + if (op_ret != (-1)) { + if (local->scars) { + gf_log (this->name, GF_LOG_INFO, + HILITE("retried %p OK"), frame->local); + } + priv->active = last_xl; + goto unwind; + } + if ((op_errno != EREMOTE) && (op_errno != ENOTCONN)) { + goto unwind; + } + + /* TBD: get leader ID from xdata? */ + next_xl = next_xlator(this,last_xl); + /* + * We can't just give up after we've tried all bricks, because it's + * quite likely that a new leader election just hasn't finished yet. + * We also shouldn't retry endlessly, and especially not at a high + * rate, but that's good enough while we work on other things. 
+ * + * TBD: implement slow/finite retry via a worker thread + */ + if (!next_xl || (local->scars >= SCAR_LIMIT)) { + gf_log (this->name, GF_LOG_DEBUG, + HILITE("ran out of retries for %p"), frame->local); + goto unwind; + } + + local->curr_xl = next_xl; + local->scars += 1; + spec.tv_sec = 1; + spec.tv_nsec = 0; + /* + * WARNING + * + * Just calling gf_timer_call_after like this leaves open the + * possibility that writes will get reordered, if a first write is + * rescheduled and then a second comes along to find an updated + * priv->active before the first actually executes. We might need to + * implement a stricter (and more complicated) queuing mechanism to + * ensure absolute consistency in this case. + */ + if (gf_timer_call_after(this->ctx,spec,nsrc_retry_cb,local)) { + return 0; + } + +unwind: + call_stub_destroy(local->stub); + STACK_UNWIND_STRICT ($NAME$, frame, op_ret, op_errno, + $ARGS_SHORT$); + return 0; +} + +// template-name cont-func +$TYPE$ +nsrc_$NAME$_continue (call_frame_t *frame, xlator_t *this, + $ARGS_LONG$) +{ + nsrc_local_t *local = frame->local; + + STACK_WIND_COOKIE (frame, nsrc_$NAME$_cbk, local->curr_xl, + local->curr_xl, local->curr_xl->fops->$NAME$, + $ARGS_SHORT$); + return 0; +} diff --git a/xlators/cluster/nsr-client/src/gen-fops.py b/xlators/cluster/nsr-client/src/gen-fops.py new file mode 100644 index 000000000..b07b3c5b1 --- /dev/null +++ b/xlators/cluster/nsr-client/src/gen-fops.py @@ -0,0 +1,57 @@ +#!/usr/bin/python + +# This script generates the boilerplate versions of most fops in the client, +# mostly so that we can use STACK_WIND instead of STACK_WIND_TAIL (see +# fop-template.c for the details). The problem we're solving is that we sit +# under DHT, which makes assumptions about getting callbacks only from its +# direct children. If we didn't define our own versions of these fops, the +# default versions would use STACK_WIND_TAIL and the callbacks would come from +# DHT's grandchildren. 
The code-generation approach allows us to handle this +# with a minimum of code, and also keep up with any changes to the fop table. + +import sys +sys.path.append("../../nsr-server/src") # Blech. +import codegen + +type_re = "([a-z_0-9]+)" +name_re = "\(\*fop_([a-z0-9]+)_t\)" +full_re = type_re + " *" + name_re +fop_cg = codegen.CodeGenerator() +fop_cg.skip = 2 +fop_cg.parse_decls(sys.argv[1],full_re) +fop_cg.load_templates(sys.argv[2]) + +# Use the multi-template feature to generate multiple callbacks from the same +# parsed declarations. +type_re = "([a-z_0-9]+)" +name_re = "\(\*fop_([a-z0-9]+)_cbk_t\)" +full_re = type_re + " *" + name_re +cbk_cg = codegen.CodeGenerator() +cbk_cg.skip = 5 +cbk_cg.parse_decls(sys.argv[1],full_re) +cbk_cg.load_templates(sys.argv[2]) + +# This is a nasty little trick to handle the case where a generated fop needs +# a set of default arguments for the corresponding callback. +# +# Yes, it's ironic that I'm copying and pasting the generator code. +fop_cg.make_defaults = cbk_cg.make_defaults + +# Sorry, getspec, you're not a real fop until someone writes a stub function +# for you. +del fop_cg.decls["getspec"] +del cbk_cg.decls["getspec"] + +# cbk is used by both fop and continue, so emit first +for f_name in cbk_cg.decls.keys(): + cbk_cg.emit(f_name,"cbk") + print("") + +# continue is used by fop, so emit next +for f_name in fop_cg.decls.keys(): + fop_cg.emit(f_name,"cont-func") + print("") + +for f_name in fop_cg.decls.keys(): + fop_cg.emit(f_name,"fop") + print("") diff --git a/xlators/cluster/nsr-client/src/nsrc.c b/xlators/cluster/nsr-client/src/nsrc.c new file mode 100644 index 000000000..6a80b1d86 --- /dev/null +++ b/xlators/cluster/nsr-client/src/nsrc.c @@ -0,0 +1,194 @@ +/* + Copyright (c) 2013 Red Hat, Inc. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "call-stub.h" +#include "defaults.h" +#include "timer.h" +#include "xlator.h" + +#define SCAR_LIMIT 20 +#define HILITE(x) (""x"") + +/* + * The fops are actually generated by gen-fops.py; the rest was mostly copied + * from defaults.c (commit cd253754 on 27 August 2013). + */ + +enum gf_dht_mem_types_ { + gf_mt_nsrc_private_t = gf_common_mt_end + 1, +}; + +typedef struct { + xlator_t *active; +} nsrc_private_t; + +typedef struct { + call_stub_t *stub; + xlator_t *curr_xl; + uint16_t scars; +} nsrc_local_t; + +char *NSRC_XATTR = "user.nsr.active"; + +static inline +xlator_t * +ACTIVE_CHILD (xlator_t *parent) +{ + nsrc_private_t *priv = parent->private; + + return priv ? priv->active : FIRST_CHILD(parent); +} + +xlator_t * +next_xlator (xlator_t *this, xlator_t *prev) +{ + xlator_list_t *trav; + + for (trav = this->children; trav; trav = trav->next) { + if (trav->xlator == prev) { + return trav->next ? 
trav->next->xlator + : this->children->xlator; + } + } + + return NULL; +} + +void +nsrc_retry_cb (void *cb_arg) +{ + nsrc_local_t *local = cb_arg; + + gf_log (__func__, GF_LOG_INFO, HILITE("retrying %p"), local); + call_resume_wind(local->stub); +} + +#include "nsrc-cg.c" + +int32_t +nsrc_forget (xlator_t *this, inode_t *inode) +{ + gf_log_callingfn (this->name, GF_LOG_WARNING, "xlator does not " + "implement forget_cbk"); + return 0; +} + + +int32_t +nsrc_releasedir (xlator_t *this, fd_t *fd) +{ + gf_log_callingfn (this->name, GF_LOG_WARNING, "xlator does not " + "implement releasedir_cbk"); + return 0; +} + +int32_t +nsrc_release (xlator_t *this, fd_t *fd) +{ + gf_log_callingfn (this->name, GF_LOG_WARNING, "xlator does not " + "implement release_cbk"); + return 0; +} + +struct xlator_fops fops = { + .lookup = nsrc_lookup, + .stat = nsrc_stat, + .fstat = nsrc_fstat, + .truncate = nsrc_truncate, + .ftruncate = nsrc_ftruncate, + .access = nsrc_access, + .readlink = nsrc_readlink, + .mknod = nsrc_mknod, + .mkdir = nsrc_mkdir, + .unlink = nsrc_unlink, + .rmdir = nsrc_rmdir, + .symlink = nsrc_symlink, + .rename = nsrc_rename, + .link = nsrc_link, + .create = nsrc_create, + .open = nsrc_open, + .readv = nsrc_readv, + .writev = nsrc_writev, + .flush = nsrc_flush, + .fsync = nsrc_fsync, + .opendir = nsrc_opendir, + .readdir = nsrc_readdir, + .readdirp = nsrc_readdirp, + .fsyncdir = nsrc_fsyncdir, + .statfs = nsrc_statfs, + .setxattr = nsrc_setxattr, + .getxattr = nsrc_getxattr, + .fsetxattr = nsrc_fsetxattr, + .fgetxattr = nsrc_fgetxattr, + .removexattr = nsrc_removexattr, + .fremovexattr = nsrc_fremovexattr, + .lk = nsrc_lk, + .inodelk = nsrc_inodelk, + .finodelk = nsrc_finodelk, + .entrylk = nsrc_entrylk, + .fentrylk = nsrc_fentrylk, + .rchecksum = nsrc_rchecksum, + .xattrop = nsrc_xattrop, + .fxattrop = nsrc_fxattrop, + .setattr = nsrc_setattr, + .fsetattr = nsrc_fsetattr, + .fallocate = nsrc_fallocate, + .discard = nsrc_discard, +}; + +struct xlator_cbks cbks = { +}; 
+ +int32_t +nsrc_init (xlator_t *this) +{ + nsrc_private_t *priv = NULL; + + this->local_pool = mem_pool_new (nsrc_local_t, 128); + if (!this->local_pool) { + gf_log (this->name, GF_LOG_ERROR, + "failed to create nsrc_local_t pool"); + goto err; + } + + priv = GF_CALLOC (1, sizeof (*priv), gf_mt_nsrc_private_t); + if (!priv) { + goto err; + } + + priv->active = FIRST_CHILD(this); + this->private = priv; + return 0; + +err: + if (priv) { + GF_FREE(priv); + } + return -1; +} + +void +nsrc_fini (xlator_t *this) +{ + GF_FREE(this->private); +} + +class_methods_t class_methods = { + .init = nsrc_init, + .fini = nsrc_fini, +}; + +struct volume_options options[] = { + { .key = {NULL} }, +}; diff --git a/xlators/cluster/nsr-recon/Makefile.am b/xlators/cluster/nsr-recon/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/xlators/cluster/nsr-recon/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/cluster/nsr-recon/src/Makefile.am b/xlators/cluster/nsr-recon/src/Makefile.am new file mode 100644 index 000000000..8fa344864 --- /dev/null +++ b/xlators/cluster/nsr-recon/src/Makefile.am @@ -0,0 +1,22 @@ +xlator_LTLIBRARIES = nsr_recon.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster + +nsr_recon_la_LDFLAGS = -module -avoid-version -lgfapi +nsr_recon_la_SOURCES = recon_driver.c recon_xlator.c + +nsr_recon_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = recon_driver.h recon_xlator.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) \ + -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \ + -I$(top_srcdir)/rpc/rpc-lib/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +XLATOR_HEADER = $(top_srcdir)/libglusterfs/src/xlator.h + +CLEANFILES = + +uninstall-local: + rm -f $(DESTDIR)$(xlatordir)/nsr.so diff --git a/xlators/cluster/nsr-recon/src/recon_driver.c b/xlators/cluster/nsr-recon/src/recon_driver.c new file mode 100644 index 000000000..1328d52dc --- /dev/null +++ 
b/xlators/cluster/nsr-recon/src/recon_driver.c @@ -0,0 +1,2624 @@ +/* + Copyright (c) 2013 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include +#include +#include +#include +#include + + +#include "call-stub.h" +#include "defaults.h" +#include "xlator.h" + + +#include "recon_driver.h" +#include "recon_xlator.h" +#include "api/src/glfs-internal.h" +#include "api/src/glfs-handles.h" + +/* TBD: move declarations here and nsr.c into a common place */ +#define NSR_TERM_XATTR "trusted.nsr.term" +#define RECON_TERM_XATTR "trusted.nsr.recon-term" +#define RECON_INDEX_XATTR "trusted.nsr.recon-index" + +/* + * Execution architecture for the NSR reconciliation driver. The driver runs + * as a seperate process in each node where the brick is. The main function of + * the driver is nsr_reconciliation_driver() (last function below) The driver + * just sits in a tight loop waiting for state changes. When a brick becomes a + * replica leader, it fences IO, contacts this process and waits for + * reconciliation to finish. + * + * The replica leader talks to other bricks in replica group which are alive + * and gets the last term info using which it decides which has the latest + * data. That brick is referred to as the "reconciliator"; leader sends a + * message to reconciliator to freeze its data (by reading any incomplete data + * from other nodes from that term if required) + * + * Once that is done leader sends a message to all nodes except the + * reconciliator to sync themselves with the reconciliator. This process is + * referred to as "resolution". 
+ * + * Hence the reconciliation processes need to talk to each other to get a given + * term info. This is implemented using the recon translator IOs which + * implements a bare bone RPC by exposing a file interface to which + * reads/writes are done to pass control messages. This is referred to as the + * "control plane". This implementation allows the control plane to be + * implemented as a bunch of threads for each of the nodes. + * + * The reconciliation process also needs to talk to the brick process on that + * node to actually write the data as part of reconciliation/resolution. This + * is referred to as the "data plane". Again there are a bunch of threads that + * do this work. + * + * The way the worker threads are organised is that main driver context has a + * pointer to contexts for each of these thread contexts. The thread context at + * index 0 always refers to talking with local recon process/brick. So the + * control worker at index 0 will get the local changelog info and data worker + * at index 0 will talk to local brick. + * + * All the ops from the control/data planes are implemented using the glfs + * APIs. + */ + +/* + * This function gets the size of all the extended attributes for a file. + * This is used so that caller knows how much to allocate for key-value storage. + * + * Input Arguments: + * fd - the file opened using glfs API. + * dict - passed so that NSR translator can get this from the required brick + * + * Output Arguments: + * b - pointer to the buffer where the attributes are filled up. 
+ * key_size - the size of all keys + * val_size - the size of all values + * num - number of key/values + */ +static int32_t +get_xattr_total_size( struct glfs_fd *fd, + char **b, + uint32_t *key_size, + uint32_t *val_size, + uint32_t* num, + dict_t *dict) +{ + int32_t s = -1, ret = -1; + char *c = NULL; + + *key_size = 0; + *val_size = 0; + *num = 0; + + // First get the size of the keys + s = glfs_flistxattr_with_xdata(fd, NULL,0, dict); + if (s == -1) + goto out; + *key_size = s; + + // TBD - use the regular calloc + (*b) = c = calloc(s+1,1); + + // get the keys themselves + if (glfs_flistxattr_with_xdata(fd, c, s+1, dict) == -1) + goto out; + do { + int32_t r; + uint32_t len = 0; + // for each key get the size of the value + r = glfs_fgetxattr_with_xdata(fd, c, NULL, 0, dict); + if (r == -1) + goto out; + (*val_size) += r; + len = strlen(c) + 1; + c += len; + s -= len; + (*num)++; + } while(s); + ret = 0; +out: + return ret; +} + +/* + * This function gets bunch of xattr values given set of keys. + * + * Input Arguments: + * fd - the file opened using glfs API. 
+ * keys - the bunch of keys + * size - size of values + * num - number of keys + * dict - passed so that NSR translator can get this from the required brick + * + * Output Arguments: + * buf - where the values are written one after the other (NULL seperated) + */ +static void +get_xattr(struct glfs_fd *fd, + char *keys, + char *buf, + uint32_t size, + uint32_t num, + dict_t *dict) +{ + while(num--) { + int32_t r; + uint32_t len = 0; + + // copy the key + strcpy(buf, keys); + len = strlen(keys); + len++; + buf += len; + + // get the value and copy the value after incrementing buf after the key + r = glfs_fgetxattr_with_xdata(fd, keys, buf, size, dict); + + // TBD - handle error + if (r == -1) + return; + + // increment the key to next value + keys += len; + + // increment buf to hold the next key + buf += strlen(buf) + 1; + } + return; +} + +/* + * Function deletes a bunch of key values in extended attributes of a file. + * Input Arguments: + * fd - the file opened using glfs API. + * dict - passed so that NSR translator can do this from the required brick + * keys - bunch of NULL seperated key names + * num - number of keys + */ +static void delete_xattr(struct glfs_fd *fd, + dict_t *dict_t, + char *keys, + uint32_t num) +{ + while(num--) { + // get the value and copy the value + // TBD - handle failure cases when calling glfs_fremovexattr_with_xdata() + glfs_fremovexattr_with_xdata(fd, keys, dict_t); + keys += strlen(keys) +1; + } + return; +} + +/* + * Given a bunch of key value pairs, fill them as xattrs for a file + * + * Input Arguments: + * fd - the file opened using glfs API. + * dict - passed so that NSR translator can do this from the required brick + * buf - buffer containing the keys-values pairs. The key value are NULL seperated. + * Each of the key-value is seperated by NULL in turn. + * num - Number of such key value pairs. 
+ */ +static void +fill_xattr(struct glfs_fd *fd, + dict_t *dict, + char *buf, + uint32_t num) +{ + char *k = buf, *val = NULL; + + while(num--) { + int32_t r; + + val = k + strlen(k) + 1; + + // TBD - handle failure cases when calling glfs_fsetxattr_with_xdata() + r = glfs_fsetxattr_with_xdata(fd, k, val, strlen(val), 0, dict); + if (r == -1) + return; + k = val + strlen(val) + 1; + } + return; +} + +/* + * This function gets a file that can be used for doing glfs_init later. + * The control file is used by control thread(function) to talk to peer reconciliation process. + * The data file is used by the data thread(function) to talk to the bricks. + * The control file is of name such as con:gfs1:-mnt-a1 where "gfs1" is name of host + * and the brick path is "/mnt/a1". + * The data file is of name such as data:gfs1:-mnt-a1. + * + * Input Arguments: + * vol - name of the volume. This is used to build the full path of the control and data file + * such as /var/lib/glusterd/vols/test/bricks/gfs2:-mnt-test1-nsr-recon.vol. + * In above example the volume name is test and brick on gfs2 is on path /mnt/test1 + * + * worker - The worker for a given node. This worker has 2 threads - one on the data plane + * and one on the control plane. The worker->name is already filled with hostname:brickname + * in the function nsr_reconciliation_driver(). Use that to build the volume file. + * So if worker->name has gfs1:/mnt/a1, control file is con:gfs1:-mnt-a1 + * and data file is data:gfs1:-mnt-a1. + * All these files are under the bricks directory. TBD - move this to a NSR recon directory later. 
+ */ +static void +nsr_recon_get_file(char *vol, nsr_replica_worker_t *worker) +{ + char *ptr; + char tr[256]; + + // Replace the "/" to - + strcpy(tr, worker->name); + ptr = strchr (tr, '/'); + while (ptr) { + *ptr = '-'; + ptr = strchr (tr, '/'); + } + + // Build the base directory such as "/var/lib/glusterd/vols/test/bricks/" + sprintf(worker->control_worker->vol_file, + "/%s/%s/%s/%s/", + GLUSTERD_DEFAULT_WORKDIR, + GLUSTERD_VOLUME_DIR_PREFIX, + vol, + GLUSTERD_BRICK_INFO_DIR); + + strcat(worker->control_worker->vol_file, "con:"); + strcat(worker->control_worker->vol_file, tr); + + sprintf(worker->data_worker->vol_file, + "/%s/%s/%s/%s/", + GLUSTERD_DEFAULT_WORKDIR, + GLUSTERD_VOLUME_DIR_PREFIX, + vol, + GLUSTERD_BRICK_INFO_DIR); + strcat(worker->data_worker->vol_file, "data:"); + strcat(worker->data_worker->vol_file, tr); +} + +/* + * This function does all the glfs initialisation + * so that reconciliation process can talk to other recon processes/bricks + * for the control/data messages. + * This will be done everytime a worker needs to be kicked off to talk + * across any plane. + * + * Input arguments: + * ctx - The per worker based context + * control - set to true if this worker is for the control plane + */ +static int +nsr_recon_start_work(nsr_per_node_worker_t *ctx, + gf_boolean_t control) +{ + glfs_t *fs = NULL; + xlator_t *this = ctx->driver_ctx->this; + int32_t ret = 0; + glfs_fd_t *aux_fd = NULL; // fd of auxilary log + char lf[256]; + + nsr_worker_log(this->name, GF_LOG_INFO, + "starting work with volfile %s\n", + ctx->vol_file); + + fs = glfs_new(ctx->id); + if (!fs) { + glusterfs_this_set(this); + nsr_worker_log(this->name, GF_LOG_ERROR, + "cannot create gfls context for thread %s\n",ctx->id); + return -1; + } + + // For some vague reason, glfs init APIs seem to be clobbering "this". hence resetting it. + glusterfs_this_set(this); + nsr_worker_log(this->name, GF_LOG_INFO, + "init done. 
setting volfile %s\n", + ctx->vol_file); + + ret = glfs_set_volfile(fs, ctx->vol_file); + if (ret != 0) { + glusterfs_this_set(this); + nsr_worker_log(this->name, GF_LOG_ERROR, + "cannot set volfile %s for thread %s\n",ctx->vol_file, ctx->id); + return -1; + } + + // TBD - convert this to right /usr/local/var/log based log files. + sprintf(lf,"/tmp/logs/%s-%s",(control == _gf_true)?"con":"data",ctx->id); + glfs_set_logging (fs, lf, 7); + glusterfs_this_set(this); + + ret = glfs_init (fs); + if (ret != 0) { + glusterfs_this_set(this); + nsr_worker_log(this->name, GF_LOG_ERROR, "cannot do init for thread %s with volfile %s\n",ctx->id, ctx->vol_file); + return -1; + } + glusterfs_this_set(this); + nsr_worker_log(this->name, GF_LOG_INFO, + "setting volfile %s done\n", + ctx->vol_file); + + // If it is control thread, open the "/" as the aux_fd. + // All IOs happening via the fd will do the RPCs across the reconciliation + // processes. For some vague reason, the root seems to be open'able like a file. + // TBD - try to clean this up. (implement a virtual file???) + if (control == _gf_true) { + nsr_worker_log(this->name, GF_LOG_INFO, + "doing open for / \n"); + aux_fd = glfs_open (fs, "/", O_RDWR); + // TBD - proper error handling. Stall reconciliation if such a thing happens? + if (aux_fd == NULL) { + nsr_worker_log(this->name, GF_LOG_ERROR, + "cannot open aux log file for thread %s\n",ctx->id); + } else { + nsr_worker_log(this->name, GF_LOG_ERROR, + "---opened aux log file for thread %s\n",ctx->id); + } + ctx->aux_fd = aux_fd; + } + glusterfs_this_set(this); + ctx->fs = fs; + return 0; +} + +/* + * + * This function does the cleanup after reconciliation is done + * or before we start a new reconciliation. 
+ * + * Input arguments: + * ctx - The per worker based context + * control - set to true if this worker is for the control plane + */ +static int +nsr_recon_end_work(nsr_per_node_worker_t *ctx, + gf_boolean_t control) +{ + int32_t ret = 0; + xlator_t *this = ctx->driver_ctx->this; + + nsr_worker_log(this->name, GF_LOG_INFO, + "doing fini for recon worker\n"); + + ret = glfs_fini(ctx->fs); + if (ret != 0) { + glusterfs_this_set(this); + nsr_worker_log(this->name, GF_LOG_ERROR, "cannot do fini for thread %s with volfile %s\n",ctx->id, ctx->vol_file); + return -1; + } + glusterfs_this_set(this); + ctx->fs = NULL; + if (control == _gf_true) { + glfs_close (ctx->aux_fd); + ctx->aux_fd = NULL; + } + return 0; +} + +//called in case all worker functions run as sepeerate threads +static void +init_worker(nsr_per_node_worker_t *ctx, gf_boolean_t control) +{ + pthread_mutex_init(&(ctx->mutex), NULL); + pthread_cond_init(&(ctx->cv), NULL); + INIT_LIST_HEAD(&(ctx->head.list)); +} + + +/* + * Control worker funct for getting changelog info on this node. + * calls directly functions to parse the changelog. + * + * Input arguments: + * ctx - The per worker based context + * control - set to true if this worker is for the control plane + */ +static void +control_worker_func_0(nsr_per_node_worker_t *ctx, + nsr_recon_work_t *work) +{ + unsigned int index = ctx->index; + nsr_replica_worker_t *rw = &(ctx->driver_ctx->workers[index]); + xlator_t *this = ctx->driver_ctx->this; + nsr_recon_private_t *priv = this->private; + nsr_recon_driver_ctx_t *dr = ctx->driver_ctx; + + ctx->is_control = _gf_true; + + switch (work->req_id){ + case NSR_WORK_ID_INI: + { + break; + } + case NSR_WORK_ID_FINI: + { + break; + } + case NSR_WORK_ID_GET_LAST_TERM_INFO: + { + nsr_recon_last_term_info_t lt; + nsr_reconciliator_info_t *recon_info = rw->recon_info; + // term is stuffed inside work->index. overloading. 
+ int32_t term = work->index; + + nsr_worker_log(this->name, GF_LOG_INFO, + "trying to get last term info for node %d with current term %d\n",index, term); + + // TBD - handle errors + // This is called by the leader after it gets the current term. + // Makes searching easier. + nsr_recon_libchangelog_get_last_term_info(this, priv->changelog_base_path, term, <); + recon_info->last_term = lt.last_term; + recon_info->commited_ops = lt.commited_ops; + recon_info->last_index = lt.last_index; + recon_info->first_index = lt.first_index; + + + nsr_worker_log(this->name, GF_LOG_INFO, + "out of get last term info with current term %d. got ops %d with first %d and last %d \n", + recon_info->last_term, recon_info->commited_ops, + recon_info->first_index, recon_info->last_index); + break; + } + case NSR_WORK_ID_GET_GIVEN_TERM_INFO: + { + nsr_recon_last_term_info_t lt; + nsr_reconciliator_info_t *recon_info = rw->recon_info; + // term is stuffed inside work->index. overloading. + int32_t term = work->index; + + nsr_worker_log(this->name, GF_LOG_INFO, + "trying to get term info for node %d for term %d\n",index, term); + + // TBD - handle errors + nsr_recon_libchangelog_get_this_term_info(this,priv->changelog_base_path, term, <); + + recon_info->last_term = lt.last_term; + recon_info->commited_ops = lt.commited_ops; + recon_info->last_index = lt.last_index; + recon_info->first_index = lt.first_index; + + nsr_worker_log(this->name, GF_LOG_INFO, + "out of get term info for term %d. got ops %d with first %d and last %d \n", + recon_info->last_term, recon_info->commited_ops, + recon_info->last_index, recon_info->first_index); + + break; + } + case NSR_WORK_ID_RECONCILIATOR_DO_WORK: + { + // For local resolution, the main driver thread does it. + // SO there is no way we can have this message for this node. 
+ GF_ASSERT(0); + + nsr_worker_log(this->name, GF_LOG_INFO, + "this message should not be sent \n"); + break; + } + case NSR_WORK_ID_RESOLUTION_DO_WORK: + { + GF_ASSERT(0); + + nsr_worker_log(this->name, GF_LOG_INFO, + "this message should not be sent \n"); + break; + } + case NSR_WORK_ID_END_RECONCILIATION: + { + nsr_worker_log(this->name, GF_LOG_INFO, + "sending reconciliation end message to node %d\n", index); + nsr_recon_return_back(priv, dr->txn_id); + break; + } + case NSR_WORK_ID_GET_RECONCILATION_WINDOW: + { + nsr_reconciliator_info_t *recon_info = rw->recon_info; + // first_index and last_index at 0 indicates empty log. + // For non empty log, the first_index always starts at 1. + uint32_t num = (dr->workers[index].recon_info->last_index - + dr->workers[index].recon_info->first_index + 1); + nsr_recon_record_details_t *rd; + uint32_t i=0; + + nsr_worker_log(this->name, GF_LOG_INFO, + "trying to get reconciliation window records for node %d for term %d with first %d last %d\n", + index, recon_info->last_term, recon_info->first_index, recon_info->last_index); + + GF_ASSERT(num <= MAX_RECONCILIATION_WINDOW_SIZE); + + // TBD - handle buffer allocation errors + rd = GF_CALLOC(num, + sizeof(nsr_recon_record_details_t), + gf_mt_recon_private_t); + + // TBD - handle errors + nsr_recon_libchangelog_get_records(this, priv->changelog_base_path, + recon_info->last_term, + recon_info->first_index, + recon_info->last_index, + rd); + // The above function writes into rd from 0 to (num -1) + // We need to take care of this whenever we deal with records + for (i=0; i < num; i++) { + ENDIAN_CONVERSION_RD(rd[i], _gf_true); //ntohl + memcpy(&(recon_info->records[i].rec), + &(rd[i]), + sizeof(nsr_recon_record_details_t)); + } + + GF_FREE(rd); + + nsr_worker_log(this->name, GF_LOG_INFO, + "got reconciliation window records for node %d for term %d \n", + index, recon_info->last_term); + break; + } + } + + return; +} + +// Control worker thread +static void* 
+control_worker_main_0(nsr_per_node_worker_t *ctx) +{ + + ctx->is_control = _gf_true; + nsr_worker_log(this->name, GF_LOG_INFO, + "starting control worker func 0\n"); + + init_worker(ctx, 1); + + while(1) + { + nsr_recon_work_t *work = NULL; + nsr_recon_driver_ctx_t *dr = ctx->driver_ctx; + + nsr_worker_log(this->name, GF_LOG_INFO, + "waiting for work\n"); + + pthread_mutex_lock(&ctx->mutex); + while (list_empty(&(ctx->head.list))) { + pthread_cond_wait(&ctx->cv, &ctx->mutex); + } + pthread_mutex_unlock(&ctx->mutex); + + + list_for_each_entry(work, &(ctx->head.list), list) { + nsr_worker_log(this->name, GF_LOG_INFO, + "got work with id %d\n", work->req_id); + work->in_use = _gf_false; + + // Call the main function. + control_worker_func_0(ctx, work); + + atomic_dec(&(dr->outstanding)); + break; + } + + nsr_worker_log(this->name, GF_LOG_INFO,"deleting work item\n"); + list_del_init (&work->list); + nsr_worker_log(this->name, GF_LOG_INFO,"finished deleting work item\n"); + } + + return NULL; +} + +/* + * Control worker funct for getting changelog info on some other node. + * calls glfs functions to seek/read/write on aux_fd. 
+ * + * Input arguments: + * ctx - The per worker based context + * control - set to true if this worker is for the control plane + */ +static void +control_worker_func(nsr_per_node_worker_t *ctx, + nsr_recon_work_t *work) +{ + unsigned int index = ctx->index; + nsr_replica_worker_t *rw = &(ctx->driver_ctx->workers[index]); + nsr_recon_driver_ctx_t *dr = ctx->driver_ctx; + + ctx->is_control = _gf_true; + + switch (work->req_id){ + case NSR_WORK_ID_INI: + { + nsr_worker_log(this->name, GF_LOG_INFO, + "calling nsr_recon_start_work\n"); + + // TBD - handle error in case nsr_recon_start_work gives error + nsr_recon_start_work(ctx, _gf_true); + + nsr_worker_log(this->name, GF_LOG_INFO, + "finished nsr_recon_start_work\n"); + break; + } + case NSR_WORK_ID_FINI: + { + nsr_worker_log(this->name, GF_LOG_INFO, + "calling nsr_recon_end_work\n"); + + // TBD - handle error in case nsr_recon_end_work gives error + nsr_recon_end_work(ctx, _gf_true); + + nsr_worker_log(this->name, GF_LOG_INFO, + "finished nsr_recon_end_work\n"); + break; + } + case NSR_WORK_ID_GET_LAST_TERM_INFO: + { + nsr_recon_last_term_info_t lt; + nsr_reconciliator_info_t *recon_info = rw->recon_info; + int32_t term = htonl(work->index); // overloading it + + nsr_worker_log(this->name, GF_LOG_INFO, + "trying to get last term info for node %d with current term %d\n",index, work->index); + + // first write the current term term number + // TBD - error handling for all the glfs APIs + glfs_lseek(ctx->aux_fd, nsr_recon_xlator_sector_4, SEEK_SET); + glfs_write(ctx->aux_fd, &term, sizeof(term), 0); + glfs_read(ctx->aux_fd, <, sizeof(lt), 0); + ENDIAN_CONVERSION_LT(lt, _gf_true); //ntohl + recon_info->last_term = lt.last_term; + recon_info->commited_ops = lt.commited_ops; + recon_info->last_index = lt.last_index; + recon_info->first_index = lt.first_index; + + nsr_worker_log(this->name, GF_LOG_INFO, + "out of get last term info with current term %d. 
got ops %d with first %d and last %d \n", + recon_info->last_term, recon_info->commited_ops, + recon_info->last_index, recon_info->first_index); + + break; + } + case NSR_WORK_ID_GET_GIVEN_TERM_INFO: + { + nsr_recon_last_term_info_t lt; + nsr_reconciliator_info_t *recon_info = rw->recon_info; + int32_t term = htonl(work->index); // overloading it + + nsr_worker_log(this->name, GF_LOG_INFO, + "trying to get term info for node %d for term %d\n",index, work->index); + + // first write the term number + // TBD - error handling for all the glfs APIs + glfs_lseek(ctx->aux_fd, nsr_recon_xlator_sector_3, SEEK_SET); + glfs_write(ctx->aux_fd, &term, sizeof(term), 0); + glfs_read(ctx->aux_fd, <, sizeof(lt), 0); + ENDIAN_CONVERSION_LT(lt, _gf_true); //ntohl + recon_info->last_term = lt.last_term; + recon_info->commited_ops = lt.commited_ops; + recon_info->last_index = lt.last_index; + recon_info->first_index = lt.first_index; + + nsr_worker_log(this->name, GF_LOG_INFO, + "out of get term info for term %d. got ops %d with first %d and last %d \n", + recon_info->last_term, recon_info->commited_ops, + recon_info->last_index, recon_info->first_index); + + break; + } + case NSR_WORK_ID_RECONCILIATOR_DO_WORK: + { + nsr_recon_role_t rr; + uint32_t i=0; + uint32_t num=0; + uint32_t idx = dr->reconciliator_index; + uint32_t term = dr->workers[idx].recon_info->last_term; + GF_ASSERT(idx == index); + + nsr_worker_log(this->name, GF_LOG_INFO, + "trying to make this index %d as reconciliator for term %d\n", index, term); + + // TBD - error handling for all the glfs APIs + glfs_lseek(ctx->aux_fd, + nsr_recon_xlator_sector_1, + SEEK_SET); + + // We have all the info for all other nodes. + // Fill all that info when sending data to that process. 
+ for (i=0; i < dr->replica_group_size; i++) { + if ( dr->workers[i].in_use && + (dr->workers[i].recon_info->last_term == term)) { + rr.info[num].last_term = + dr->workers[i].recon_info->last_term; + rr.info[num].commited_ops = + dr->workers[i].recon_info->commited_ops; + rr.info[num].last_index = + dr->workers[i].recon_info->last_index; + rr.info[num].first_index = + dr->workers[i].recon_info->first_index; + strcpy(rr.info[num].name, + dr->workers[i].name); + } + num++; + } + rr.num = num; + rr.role = reconciliator; + ENDIAN_CONVERSION_RR(rr, _gf_false); //htonl + glfs_write(ctx->aux_fd, &rr, sizeof(rr), 0); + + nsr_worker_log(this->name, GF_LOG_INFO, + "sent reconciliator info for term %d with node count as %d\n", term, num); + + break; + } + case NSR_WORK_ID_RESOLUTION_DO_WORK: + { + nsr_recon_role_t rr; + unsigned int i=0, j=0; + unsigned int rec = dr->reconciliator_index; + + nsr_worker_log(this->name, GF_LOG_INFO, + "trying to make this index %d as resolutor with reconciliator as %d\n",index, rec); + + // TBD - error handling for all the glfs APIs + glfs_lseek(ctx->aux_fd, + nsr_recon_xlator_sector_1, + SEEK_SET); + rr.num = 2; + + // Fill in info[0] as info for the node for which we are seeking resolution. + // Fill in info[1] as info of the reconciliator node. + // The function nsr_recon_driver_set_role() that will be called when + // this message reaches the node will look at index 1 for term information + // related to the reconciliator. + for (i=0; i < 2; i++) { + (i == 0) ? (j = index) : (j = rec); + rr.info[i].last_term = + dr->workers[j].recon_info->last_term; + rr.info[i].commited_ops = + dr->workers[j].recon_info->commited_ops; + rr.info[i].last_index = + dr->workers[j].recon_info->last_index; + rr.info[i].first_index = + dr->workers[j].recon_info->first_index; + // The name is used as the key to convert indices since + // the reconciliator index could be different across the nodes. 
+ strcpy(rr.info[i].name, + dr->workers[j].name); + if (i == 0) { + nsr_worker_log(this->name, GF_LOG_INFO, + "this node info term=%d, ops=%d, first=%d, last=%d\n", + rr.info[i].last_term, rr.info[i].commited_ops, + rr.info[i].first_index,rr.info[i].last_index); + } else { + nsr_worker_log(this->name, GF_LOG_INFO, + "reconciliator node info term=%d, ops=%d, first=%d, last=%d\n", + rr.info[i].last_term, rr.info[i].commited_ops, + rr.info[i].first_index,rr.info[i].last_index); + } + } + rr.role = resolutor; + ENDIAN_CONVERSION_RR(rr, _gf_false); //htonl + glfs_write(ctx->aux_fd, &rr, sizeof(rr), 0); + + nsr_worker_log(this->name, GF_LOG_INFO, + "sent message to this node %d resolutor with reconciliator as %d\n", index, rec); + + break; + } + case NSR_WORK_ID_END_RECONCILIATION: + { + char c[4]; + uint32_t old = htonl(dr->txn_id); + + nsr_worker_log(this->name, GF_LOG_INFO, + "sending reconciliation end message to node %d\n", index); + + memcpy(c, &old, sizeof(uint32_t)); + // TBD - error handling for all the glfs APIs + glfs_lseek(ctx->aux_fd, + nsr_recon_xlator_sector_0, + SEEK_SET); + glfs_write(ctx->aux_fd, c, sizeof(c), 0); + + nsr_worker_log(this->name, GF_LOG_INFO, + "finished sending reconciliation end message to node %d\n", index); + + break; + } + case NSR_WORK_ID_GET_RECONCILATION_WINDOW: + { + nsr_recon_log_info_t li; + nsr_reconciliator_info_t *recon_info = rw->recon_info; + uint32_t i = 0; + uint32_t num = (dr->workers[index].recon_info->last_index - + dr->workers[index].recon_info->first_index +1); + nsr_recon_record_details_t *rd; + + nsr_worker_log(this->name, GF_LOG_INFO, + "trying to get reconciliation window records for node %d for term %d with first %d last %d\n", + index, recon_info->last_term, recon_info->first_index, recon_info->last_index); + + GF_ASSERT(num <= MAX_RECONCILIATION_WINDOW_SIZE); + + // TBD - error handling for all the glfs APIs + glfs_lseek(ctx->aux_fd, nsr_recon_xlator_sector_2, SEEK_SET); + + // write to node what term & 
indices we are interested + li.term = recon_info->last_term; + li.first_index = recon_info->first_index; + li.last_index = recon_info->last_index; + ENDIAN_CONVERSION_LI(li, _gf_false); //htonl + glfs_write(ctx->aux_fd, &li, sizeof(li), 0); + + // then read + rd = GF_CALLOC(num, + sizeof(nsr_recon_record_details_t), + gf_mt_recon_private_t); + glfs_read(ctx->aux_fd, rd, num * sizeof(nsr_recon_record_details_t), 0); + for (i=0; i < num; i++) { + ENDIAN_CONVERSION_RD(rd[i], _gf_true); //ntohl + memcpy(&(recon_info->records[i].rec), + &(rd[i]), + sizeof(nsr_recon_record_details_t)); + nsr_worker_log(this->name, GF_LOG_INFO, + "get_reconcilaition_window:Got %d at index %d\n", + recon_info->records[i].rec.type, + i + recon_info->first_index); + } + free(rd); + + nsr_worker_log(this->name, GF_LOG_INFO, + "got reconciliation window records for node %d for term %d \n", + index, recon_info->last_term); + break; + } + } + + return; +} + +// Control worker thread +static void* +control_worker_main(nsr_per_node_worker_t *ctx) +{ + unsigned int index = ctx->index; + + ctx->is_control = _gf_true; + nsr_worker_log(this->name, GF_LOG_INFO, + "starting control worker func\n"); + + // if this is for local processing, call the changelog parsing calls directly + if (index == 0) { + control_worker_main_0(ctx); + return NULL; + } + + init_worker(ctx, 1); + + + while(1) + { + nsr_recon_work_t *work = NULL; + nsr_recon_driver_ctx_t *dr = ctx->driver_ctx; + + nsr_worker_log(this->name, GF_LOG_INFO, + "waiting for work\n"); + + pthread_mutex_lock(&ctx->mutex); + while (list_empty(&(ctx->head.list))) { + pthread_cond_wait(&ctx->cv, &ctx->mutex); + } + pthread_mutex_unlock(&ctx->mutex); + + + list_for_each_entry(work, &(ctx->head.list), list) { + nsr_worker_log(this->name, GF_LOG_INFO, + "got work with id %d\n", work->req_id); + work->in_use = _gf_false; + control_worker_func(ctx,work); + atomic_dec(&(dr->outstanding)); + break; + } + nsr_worker_log(this->name, GF_LOG_INFO,"deleting work 
item\n"); + list_del_init (&work->list); + nsr_worker_log(this->name, GF_LOG_INFO,"finished deleting work item\n"); + } + + return NULL; +} + +/* + * This function gets called if this process is chosen as the reconciliator + * for this replica group. It would have already got the records for the last term + * for the indices that are required (from the first HOLE to last index) from + * all other nodes that also witnessed that term. COmpare all the records and + * compute the work required. + * + * Input arguments + * ctx - driver context. All recon work is stored in workers[0].recon_info + */ +static void +compute_reconciliation_work(nsr_recon_driver_ctx_t *ctx) +{ + uint32_t i=0, j=0; + nsr_reconciliator_info_t *my_recon = ctx->workers[0].recon_info; + uint32_t num = (my_recon->last_index - my_recon->first_index + 1); + + for (i=0; i < num; i++) { + nsr_log_type_t orig, new; + unsigned int src = 0; + orig = new = my_recon->records[i].rec.type; + nsr_recon_work_type_t tw = NSR_RECON_WORK_NONE; + // index 0 means this node. Look at all other nodes. + for (j=1; j < ctx->replica_group_size; j++) { + if (ctx->workers[j].in_use) { + nsr_log_type_t pr = ctx->workers[j].recon_info->records[i].work.type; + if ((new != pr) && (pr > new)) { + src = j; + new = (new | pr); + } + } + } + // TBD - compare data if new and orig are all FILLs. (can detect changelog corruption) + // Right now we compare if both orig and new are psuedo holes since + // only that is of interest to us. 
+ if (orig != new) { + if ((orig == NSR_LOG_HOLE) && (new == NSR_LOG_PSEUDO_HOLE)) + tw = NSR_RECON_WORK_HOLE_TO_PSEUDO_HOLE; + else if ((orig == NSR_LOG_HOLE) && (new == NSR_LOG_FILL)) + tw = NSR_RECON_WORK_HOLE_TO_FILL; + else if ((orig == NSR_LOG_PSEUDO_HOLE) && (new == NSR_LOG_PSEUDO_HOLE)) + tw = NSR_RECON_WORK_COMPARE_PSEUDO_HOLE; + else if ((orig == NSR_LOG_PSEUDO_HOLE) && (new == NSR_LOG_FILL)) + tw = NSR_RECON_WORK_HOLE_TO_FILL; + } + if (tw != NSR_RECON_WORK_NONE) { + my_recon->records[i].work.type = tw; + my_recon->records[i].work.source = src; + // Overwrite the record + memcpy(&(my_recon->records[i].rec), + &(ctx->workers[src].recon_info->records[i].rec), + sizeof(nsr_recon_record_details_t)); + } + } + return; +} + +static void +nsr_recon_in_use(nsr_recon_driver_ctx_t *ctx, + uint32_t i, + gf_boolean_t in_use); + +/* + * Write the role and associated information to the node. + * This gets called from recon xlator indicating node is either + * leader, reconciliator or should do resolution. + * First we undo the last role to make sure we clean up. + * + * Input arguments + * ctx - driver context. + * rr - Role information. + * If leader, the thread now sends the list of all nodes that are part of + * the current replica group. Use that to find out the activate the + * required worker threads. + * If reconciliator, the leader node would have sent information about + * all nodes which saw last term as the reconciliator. + * If resolution to be done, then rr.info[0] will have this node's info + * which the leader would have got earlier. rr[1].info will have the + * info regarding the reconciliator. + * txn_id - All role changes(except when leader becomes reconciliator or resolutor) + * would be initiated as write to the recon xlator which would have got a frame from + * either the brick process(leader change) or other reconciliation process. 
+ * The write function would return immediately after storing the frame which + * needs to be returned back after the actual reconciliation is done. + * For that we store the frame against this id which acts as a key. + */ +gf_boolean_t +nsr_recon_driver_set_role(nsr_recon_driver_ctx_t *ctx, + nsr_recon_role_t *rr, + uint32_t txn_id) +{ + uint8_t i=0, j=0; + pthread_mutex_lock(&(ctx->mutex)); + ctx->state = rr->role; + // First make all the threads uninitialise + for (i = 0; i < ctx->replica_group_size; i++) { + nsr_recon_in_use(ctx, i, _gf_false); + } + if (rr->role == leader) { + + // First set info this node + nsr_recon_in_use(ctx, 0, _gf_true); + ctx->workers[0].recon_info = GF_CALLOC (1, + sizeof (nsr_reconciliator_info_t), + gf_mt_recon_private_t); + if (!ctx->workers[0].recon_info) { + return _gf_false; + } + ctx->current_term = rr->current_term; + + // Find rest of the nodes + for (i=1; i < ctx->replica_group_size; i++) { + for (j=0 ; j < rr->num; j++) { + // TBD - make this strcmp later when etcd servers set properly + //if (!strcmp(ctx->workers[i].name, rr->info[j].name)) { + if (strstr(ctx->workers[i].name, rr->info[j].name)) { + nsr_driver_log(this->name, GF_LOG_INFO, + "nsr_recon_driver_set_role: this as leader. found other server %s\n", + ctx->workers[i].name); + + nsr_recon_in_use(ctx, i, _gf_true); + // Allocate this here. 
This will get later filled when + // the leader tries to get last term information from all + // the nodes + ctx->workers[i].recon_info = GF_CALLOC (1, + sizeof (nsr_reconciliator_info_t), + gf_mt_recon_private_t); + if (!ctx->workers[i].recon_info) { + return _gf_false; + } + break; + } + } + } + ctx->reconciliator_index = -1; + } else if (rr->role == reconciliator) { + ctx->reconciliator_index = 0; + // Copy information about all the other members which had the same term + for (i=0; i < rr->num; i++) { + for (j=0; j < ctx->replica_group_size; j++) { + //if (!strcmp(rr->info[i].name, ctx->workers[j].name)) { + if (strstr(ctx->workers[j].name, rr->info[i].name)) { + nsr_driver_log(this->name, GF_LOG_INFO, + "nsr_recon_driver_set_role: this as reconciliator. found other server %s\n", + ctx->workers[j].name); + ctx->workers[j].recon_info = GF_CALLOC (1, + sizeof (nsr_reconciliator_info_t), + gf_mt_recon_private_t); + if (!ctx->workers[j].recon_info) { + return _gf_false; + } + ctx->workers[j].recon_info->last_term = + rr->info[i].last_term; + ctx->workers[j].recon_info->commited_ops = + rr->info[i].commited_ops; + ctx->workers[j].recon_info->last_index = + rr->info[i].last_index; + ctx->workers[j].recon_info->first_index = + rr->info[i].first_index; + nsr_recon_in_use(ctx, j, _gf_true); + break; + } + } + } + } else if (rr->role == resolutor) { + for (j=0; j < ctx->replica_group_size; j++) { + // info[1] has the information regarding the reconciliator + if (strstr(ctx->workers[j].name, rr->info[1].name)) { + //if (!strcmp(rr->info[1].name, ctx->workers[j].name)) { + nsr_driver_log(this->name, GF_LOG_INFO, + "nsr_recon_driver_set_role: this as resolutor. 
found other server %s as reconciliator\n", + ctx->workers[1].name); + ctx->workers[j].recon_info = GF_CALLOC (1, + sizeof (nsr_reconciliator_info_t), + gf_mt_recon_private_t); + if (!ctx->workers[j].recon_info) { + return _gf_false; + } + ctx->workers[j].recon_info->last_term = + rr->info[1].last_term; + ctx->workers[j].recon_info->commited_ops = + rr->info[1].commited_ops; + ctx->workers[j].recon_info->last_index = + rr->info[1].last_index; + ctx->workers[j].recon_info->first_index = + rr->info[1].first_index; + ctx->reconciliator_index = j; + nsr_recon_in_use(ctx, j, _gf_true); + GF_ASSERT(ctx->reconciliator_index != 0); + break; + } + } + ctx->workers[0].recon_info = GF_CALLOC (1, + sizeof (nsr_reconciliator_info_t), + gf_mt_recon_private_t); + if (!ctx->workers[0].recon_info) { + return _gf_false; + } + // info[0] has all info for this node + ctx->workers[0].recon_info->last_term = rr->info[0].last_term; + ctx->workers[0].recon_info->commited_ops = rr->info[0].commited_ops; + ctx->workers[0].recon_info->last_index = rr->info[0].last_index; + ctx->workers[0].recon_info->first_index = rr->info[0].first_index; + nsr_recon_in_use(ctx, 0, _gf_true); + } + + ctx->txn_id = txn_id; + // Signal the main driver thread + pthread_cond_signal(&(ctx->cv)); + pthread_mutex_unlock(&(ctx->mutex)); + return _gf_true; +} + + +/* + * This function gets called if this process is chosen to sync itself with + * the reconciliator. + * + * Input arguments + * ctx - driver context. 
+ * my_info - local changelog info that has all the local records for indices that require work + * his_info - reconciliator's info that has all the golden copies + * invalidate - if set to true, then do not consult local records + */ + +static void +compute_resolution_work(nsr_recon_driver_ctx_t *ctx, + nsr_reconciliator_info_t *my_info, + nsr_reconciliator_info_t *his_info, + gf_boolean_t invalidate) +{ + uint32_t i=0; + uint32_t num = (my_info->last_index - my_info->first_index + 1); + + for (i=0; i < num; i++) { + nsr_log_type_t orig, new; + nsr_recon_work_type_t tw = NSR_RECON_WORK_NONE; + orig = my_info->records[i].rec.type; + if (invalidate) + orig = NSR_LOG_HOLE; + new = his_info->records[i].rec.type; + // TBD - we can never have PSUEDO_HOLE in reconciliator's info + // We should have taken care of that during reconciliation. + // Put an assert to validate that. + if (new != orig) { + if ((orig != NSR_LOG_FILL) && (new == NSR_LOG_FILL)) + tw = NSR_RECON_WORK_HOLE_TO_FILL; + else if ((orig != NSR_LOG_HOLE) && (new == NSR_LOG_HOLE)) + tw = NSR_RECON_WORK_UNDO_FILL; + } + // copy the records anyway + my_info->records[i].work.type = tw; + my_info->records[i].work.source = ctx->reconciliator_index; + memcpy(&(my_info->records[i].rec), + &(his_info->records[i].rec), + sizeof(nsr_recon_record_details_t)); + } + return; +} + + +// Create an glfs object +static struct glfs_object * +create_obj(nsr_per_node_worker_t *ctx, char *gfid_str) +{ + struct glfs_object *obj = NULL; + uuid_t gfid; + + uuid_parse(gfid_str, gfid); + + obj = glfs_h_create_from_handle(ctx->fs, gfid, GFAPI_HANDLE_LENGTH, NULL); + if (obj == NULL) { + GF_ASSERT(obj != NULL); + nsr_worker_log(this->name, GF_LOG_ERROR, + "creating of handle failed\n"); + return NULL; + } + return obj; +} + +/* + * Function to apply the actual record onto the local brick. + * prior to this we should have read all the data from the + * brick that has the data. 
+ * + * Input parameters: + * ctx - per node worker context that has the fs for communicating to brick + * ri - Reconciliation record that needs fixup + * dict - So that NSR server translator on brick applis fixup only on this brick + * and the changelog translator consumes term and index. + */ + +static void +apply_record(nsr_per_node_worker_t *ctx, + nsr_reconciliation_record_t *ri, + dict_t * dict) +{ + struct glfs_fd *fd = NULL; + struct glfs_object *obj = NULL; + + + if (ri->rec.op == GF_FOP_WRITE) { + + nsr_worker_log(this->name, GF_LOG_INFO, + "DOing write for file %s @offset %d for len %d\n", + ri->rec.gfid, ri->rec.offset, ri->rec.len); + + // The file has got deleted on the source. Hence just ignore this. + // TBD - get a way to just stuff the log entry without writing the data so that + // changelogs remain identical. + if (ri->work.data == NULL) { + return; + } + + if ((obj = create_obj(ctx,ri->rec.gfid)) == NULL) return; + + fd = glfs_h_open_with_xdata(ctx->fs, obj, O_RDWR, dict); + if (fd == NULL) { + GF_ASSERT(0); + nsr_worker_log(this->name, GF_LOG_ERROR, + "open for file %s failed\n", + ri->rec.gfid); + return; + } + if (glfs_lseek_with_xdata(fd, ri->rec.offset, SEEK_SET, dict) != ri->rec.offset) { + GF_ASSERT(0); + nsr_worker_log(this->name, GF_LOG_ERROR, + "lseek for file %s failed at offset %d\n", + ri->rec.gfid, ri->rec.offset); + return; + } + if (glfs_write_with_xdata(fd, ri->work.data, ri->rec.len, 0, dict) != ri->rec.len) { + GF_ASSERT(0); + nsr_worker_log(this->name, GF_LOG_ERROR, + "write for file %s failed for bytes %d\n", + ri->rec.gfid, ri->rec.len); + return; + } + glfs_close_with_xdata(fd, dict); + + nsr_worker_log(this->name, GF_LOG_INFO, + "Finished DOing write for gfid %s @offset %d for len %d\n", + ri->rec.gfid, ri->rec.offset, ri->rec.len); + + } else if (ri->rec.op == GF_FOP_FTRUNCATE) { + + nsr_worker_log(this->name, GF_LOG_INFO, + "DOing truncate for file %s @offset %d \n", + ri->rec.gfid, ri->rec.offset); + + if ((obj = 
create_obj(ctx, ri->rec.gfid)) == NULL) return; + + fd = glfs_h_open_with_xdata(ctx->fs, obj, O_RDWR, dict); + if (fd == NULL) { + GF_ASSERT(0); + nsr_worker_log(this->name, GF_LOG_ERROR, + "open for file %s failed\n", + ri->rec.gfid); + return; + } + if (glfs_ftruncate_with_xdata(fd, ri->rec.offset, dict) == -1) { + GF_ASSERT(0); + nsr_worker_log(this->name, GF_LOG_ERROR + "trunctae for file %s failed @offset %d\n", + ri->rec.gfid,ri->rec.offset ); + return; + } + glfs_close_with_xdata(fd, dict); + + nsr_worker_log(this->name, GF_LOG_INFO, + "Finished DOing truncate for gfid %s @offset %d \n", + ri->rec.gfid, ri->rec.offset); + + } else if ((ri->rec.op == GF_FOP_FREMOVEXATTR) || + (ri->rec.op == GF_FOP_REMOVEXATTR) || + (ri->rec.op == GF_FOP_SETXATTR) || + (ri->rec.op == GF_FOP_FSETXATTR)) { + + uint32_t k_s = 0, v_s = 0; + char *t_b= NULL; + uint32_t num = 0; + + nsr_worker_log(this->name, GF_LOG_INFO, + "Doing set extended attr for file %s \n", + ri->rec.gfid); + + // The file has got deleted on the source. Hence just ignore this. + // TBD - get a way to just stuff the log entry without writing the data so that + // changelogs remain identical. 
+ if (ri->work.data == NULL) { + return; + } + + if ((obj = create_obj(ctx, ri->rec.gfid)) == NULL) return; + + if (obj->inode->ia_type == IA_IFDIR) + fd = glfs_h_opendir_with_xdata(ctx->fs, obj, dict); + else + fd = glfs_h_open_with_xdata(ctx->fs, obj, O_RDWR, dict); + if (fd == NULL) { + GF_ASSERT(0); + nsr_worker_log(this->name, GF_LOG_ERROR, + "open for file %s failed\n", + ri->rec.gfid); + return; + } + + if(get_xattr_total_size(fd, &t_b, &k_s, &v_s, &num, dict) == -1) { + if (t_b) free(t_b); + GF_ASSERT(0); + nsr_worker_log(this->name, GF_LOG_ERROR, + "list of xattr of %s failed\n", ri->rec.gfid); + return; + } + + delete_xattr(fd, dict, t_b, num); + + // Set one special dict flag to indicate the opcode so that + // the opcode gets set to this + if (dict_set_int32(dict,"recon-xattr-opcode",ri->rec.op)) { + GF_ASSERT(0); + nsr_worker_log(this->name, GF_LOG_ERROR, + "setting opcode to %d failed\n",ri->rec.op); + return; + } + + fill_xattr(fd, dict, ri->work.data, ri->work.num); + + glfs_close_with_xdata(fd, dict); + + nsr_worker_log(this->name, GF_LOG_INFO, + "Finsihed Doing set extended attr for %s \n", + ri->rec.gfid); + + } else if (ri->rec.op == GF_FOP_CREATE) { + + uuid_t gfid; + + nsr_worker_log(this->name, GF_LOG_INFO, + "Doing create for file %s \n", + ri->rec.gfid); + + // TBD - add mode and flags later + uuid_parse(ri->rec.gfid, gfid); + if ((obj = create_obj(ctx, ri->rec.pargfid)) == NULL) return; + + if (glfs_h_creat_with_xdata(ctx->fs, obj, ri->rec.entry, O_RDWR, 0777, NULL, gfid, dict) == NULL) { + GF_ASSERT(0); + nsr_worker_log(this->name, GF_LOG_ERROR, + "Failure for Doing create for file %s\n", + ri->rec.entry); + return; + } + + nsr_worker_log(this->name, GF_LOG_INFO, + "Finished Doing create for file %s \n", + ri->rec.entry); + + } else if (ri->rec.op == GF_FOP_MKNOD) { + + uuid_t gfid; + + nsr_worker_log(this->name, GF_LOG_INFO, + "Doing mknod for file %s \n", + ri->rec.entry); + + // TBD - add mode and flags later + uuid_parse(ri->rec.gfid, 
gfid); + if ((obj = create_obj(ctx, ri->rec.pargfid)) == NULL) return; + + if (glfs_h_mknod_with_xdata(ctx->fs, obj, ri->rec.entry, O_RDWR, 0777, NULL, gfid, dict) == NULL) { + GF_ASSERT(0); + nsr_worker_log(this->name, GF_LOG_ERROR, + "Failure for Doing mknod for file %s\n", + ri->rec.entry); + return; + } + + nsr_worker_log(this->name, GF_LOG_INFO, + "Finished Doing mknod for file %s \n", + ri->rec.entry); + + } else if (ri->rec.op == GF_FOP_MKDIR) { + + uuid_t gfid; + + nsr_worker_log(this->name, GF_LOG_INFO, + "Doing mkdir for dir %s \n", + ri->rec.gfid); + + // TBD - add mode and flags later + uuid_parse(ri->rec.gfid, gfid); + if ((obj = create_obj(ctx, ri->rec.pargfid)) == NULL) return; + + if (glfs_h_mkdir_with_xdata(ctx->fs, obj, ri->rec.entry, 0777, NULL, gfid, dict) != 0) { + GF_ASSERT(0); + nsr_worker_log(this->name, GF_LOG_ERROR, + "Failure for Doing mkdir for file %s\n", + ri->rec.entry); + return; + } + + nsr_worker_log(this->name, GF_LOG_INFO, + "Finished Doing mkdir for file %s \n", + ri->rec.entry); + + } else if ((ri->rec.op == GF_FOP_RMDIR) || (ri->rec.op == GF_FOP_UNLINK)) { + + nsr_worker_log(this->name, GF_LOG_INFO, + "Doing rmdir/ublink for dir %s \n", + ri->rec.entry); + + if ((obj = create_obj(ctx, ri->rec.pargfid)) == NULL) return; + if (glfs_h_unlink_with_xdata(ctx->fs, obj, ri->rec.entry, dict) != 0) { + GF_ASSERT(0); + nsr_worker_log(this->name, GF_LOG_ERROR, + "Failure for Doing rmdir/unlink for file %s\n", + ri->rec.entry); + return; + } + + nsr_worker_log(this->name, GF_LOG_INFO, + "Finished Doing rmdir/unlink for file %s \n", + ri->rec.entry); + + } else if (ri->rec.op == GF_FOP_SYMLINK) { + + uuid_t gfid; + + nsr_worker_log(this->name, GF_LOG_INFO, + "Doing symlink for file %s to file %s \n", + ri->rec.entry, ri->rec.link_path); + + if ((obj = create_obj(ctx, ri->rec.pargfid)) == NULL) return; + uuid_parse(ri->rec.gfid, gfid); + + if (glfs_h_symlink_with_xdata(ctx->fs, obj, ri->rec.entry, ri->rec.link_path, NULL, gfid, dict) == 
NULL) { + GF_ASSERT(0); + nsr_worker_log(this->name, GF_LOG_ERROR, + "Failed to Doing symlink for file %s to file %s \n", + ri->rec.entry, ri->rec.link_path); + return; + } + + nsr_worker_log(this->name, GF_LOG_INFO, + "Finished Doing symlink for file %s to file %s \n", + ri->rec.entry, ri->rec.link_path); + + } else if (ri->rec.op == GF_FOP_LINK) { + + struct glfs_object *to_obj = NULL; + + nsr_worker_log(this->name, GF_LOG_INFO, + "Doing hard link for file %s to file %s \n", + ri->rec.entry, ri->rec.gfid); + + if ((obj = create_obj(ctx, ri->rec.pargfid)) == NULL) return; + if ((to_obj = create_obj(ctx, ri->rec.gfid)) == NULL) return; + + if (glfs_h_link_with_xdata(ctx->fs, to_obj, obj, ri->rec.entry, dict) == -1) { + GF_ASSERT(0); + nsr_worker_log(this->name, GF_LOG_ERROR, + "Failed to Doing hard link for file %s to file %s \n", + ri->rec.entry, ri->rec.gfid); + return; + } + + nsr_worker_log(this->name, GF_LOG_INFO, + "Finsihed doing hard link for file %s to file %s \n", + ri->rec.entry, ri->rec.gfid); + + } else if (ri->rec.op == GF_FOP_RENAME) { + + struct glfs_object *to_obj = NULL; + + nsr_worker_log(this->name, GF_LOG_INFO, + "Doing rename for file %s to file %s \n", + ri->rec.entry, ri->rec.newloc); + + if ((obj = create_obj(ctx, ri->rec.pargfid)) == NULL) return; + if ((to_obj = create_obj(ctx, ri->rec.gfid)) == NULL) return; + + if (glfs_h_rename_with_xdata(ctx->fs, obj, ri->rec.entry, to_obj, ri->rec.newloc, dict) == -1) { + GF_ASSERT(0); + nsr_worker_log(this->name, GF_LOG_ERROR, + "Failed to Doing rename for file %s to file %s \n", + ri->rec.entry, ri->rec.newloc); + return; + } + + nsr_worker_log(this->name, GF_LOG_INFO, + "Finsihed doing renam for file %s to file %s \n", + ri->rec.entry, ri->rec.newloc); + + + } else if ((ri->rec.op == GF_FOP_SETATTR) || (ri->rec.op == GF_FOP_FSETATTR)) { + + struct iatt iatt = {0, }; + int valid = 0; + int ret = -1; + + // TBD - do the actual settings once we do that + // right now we just set the mode so that 
changelog gets filled + + nsr_worker_log(this->name, GF_LOG_INFO, + "Doing attr for file %s \n", + ri->rec.gfid); + + if ((obj = create_obj(ctx, ri->rec.gfid)) == NULL) return; + + if (obj->inode->ia_type == IA_IFDIR) + fd = glfs_h_opendir_with_xdata(ctx->fs, obj, dict); + else + fd = glfs_h_open_with_xdata(ctx->fs, obj, O_RDWR, dict); + if (fd == NULL) { + GF_ASSERT(0); + nsr_worker_log(this->name, GF_LOG_ERROR, + "open for file %s failed\n", + ri->rec.gfid); + return; + } + + iatt.ia_prot = ia_prot_from_st_mode(777); + valid = GF_SET_ATTR_MODE; + + + // Set one special dict flag to indicate the opcode so that + // the opcode gets set to this + if (dict_set_int32(dict,"recon-attr-opcode",ri->rec.op)) { + GF_ASSERT(0); + nsr_worker_log(this->name, GF_LOG_ERROR, + "setting opcode to %d failed\n",ri->rec.op); + return; + } + + ret = glfs_fsetattr_with_xdata(fd, &iatt, valid, dict); + if (ret == -1) { + GF_ASSERT(0); + nsr_worker_log(this->name, GF_LOG_INFO, + "failed Doing attr for file %s \n", + ri->rec.gfid); + return; + } + + glfs_close_with_xdata(fd, dict); + nsr_worker_log(this->name, GF_LOG_INFO, + "Doing attr for file %s \n", + ri->rec.gfid); + + } + + return; +} + +//return back opcodes that requires reading from source +static gf_boolean_t +recon_check_changelog(nsr_recon_record_details_t *rd) +{ + return((rd->op == GF_FOP_WRITE) || + (rd->op == GF_FOP_FSETATTR) || + (rd-> op == GF_FOP_SETATTR) || + (rd->op == GF_FOP_FREMOVEXATTR) || + (rd->op == GF_FOP_SETXATTR) || + (rd->op == GF_FOP_FSETXATTR) || + (rd->op == GF_FOP_SYMLINK)); + +} + +// TBD +static gf_boolean_t +recon_compute_undo(nsr_recon_record_details_t *rd) +{ + return(_gf_false); +} + + +/* + * Function that talks to the brick for data tranfer. 
+ * + * Input arguments: + * ctx - worker context + * work - pointer to work object + */ +static void +data_worker_func(nsr_per_node_worker_t *ctx, + nsr_recon_work_t *work) +{ + nsr_recon_driver_ctx_t *dr = ctx->driver_ctx; + nsr_reconciliation_record_t *ri = NULL; + nsr_recon_record_details_t *rd = NULL; + glfs_fd_t *fd = NULL; + int wip = 0; + + switch (work->req_id){ + case NSR_WORK_ID_INI: + { + nsr_worker_log(this->name, GF_LOG_INFO, + "started data ini \n"); + + nsr_recon_start_work(ctx, _gf_false); + + nsr_worker_log(this->name, GF_LOG_INFO, + "finished data ini \n"); + break; + } + case NSR_WORK_ID_FINI: + { + nsr_worker_log(this->name, GF_LOG_INFO, + "started data fini \n"); + + nsr_recon_end_work(ctx, _gf_false); + + nsr_worker_log(this->name, GF_LOG_INFO, + "finished data fini \n"); + break; + } + case NSR_WORK_ID_SINGLE_RECONCILIATION_READ: + { + dict_t * dict = NULL; + // first_index always starts with 1 but records starts at 0. + wip = work->index - (dr->workers[0].recon_info->first_index); + ri = &(dr->workers[0].recon_info->records[wip]); + rd = &(ri->rec); + + dict = dict_new (); + if (!dict) { + GF_ASSERT(0); + nsr_worker_log(this->name, GF_LOG_ERROR, + "failed allocating for dictionary\n"); + goto commit_out; + } + if (dict_set_int32(dict,RECON_TERM_XATTR,ri->work.term)) { + GF_ASSERT(0); + nsr_worker_log(this->name, GF_LOG_ERROR, + "error setting term in dict\n"); + goto commit_out; + } + if (dict_set_int32(dict,RECON_INDEX_XATTR,ri->work.index)) { + GF_ASSERT(0); + nsr_worker_log(this->name, GF_LOG_ERROR, + "error setting term in dict\n"); + goto commit_out; + } + + if (rd->op == GF_FOP_WRITE) { + + // record already copied. + // copy data to this node's info. 
+ struct glfs_fd *fd = NULL; + struct glfs_object *obj = NULL; + uuid_t gfid; + + uuid_parse(ri->rec.gfid, gfid); + + nsr_worker_log(this->name, GF_LOG_INFO, + "started recon read for file %s at offset %d at len %d\n", + ri->rec.gfid, rd->offset, rd->len); + + obj = glfs_h_create_from_handle(ctx->fs, gfid, GFAPI_HANDLE_LENGTH, NULL); + if (obj == NULL) { + GF_ASSERT(obj != NULL); + nsr_worker_log(this->name, GF_LOG_ERROR, + "creating of handle failed\n"); + goto read_out; + } + + // The file has probably got deleted. + fd = glfs_h_open_with_xdata(ctx->fs, obj, O_RDONLY, dict); + if (fd == NULL) { + GF_ASSERT(fd != NULL); + nsr_worker_log(this->name, GF_LOG_ERROR, + "opening of file failed\n"); + goto read_out; + } + + if (glfs_lseek_with_xdata(fd, rd->offset, SEEK_SET, dict) != rd->offset) { + GF_ASSERT(0); + nsr_worker_log(this->name, GF_LOG_ERROR, + "lseek of file failed to offset %d\n", rd->offset); + goto read_out; + } + + ri->work.data = GF_CALLOC(rd->len , sizeof(char), gf_mt_recon_private_t); + if (glfs_read_with_xdata(fd, ri->work.data, rd->len, 0, dict) != rd->len) { + GF_ASSERT(0); + nsr_worker_log(this->name, GF_LOG_ERROR, + "read of file failed to offset %d for bytes %d\n", rd->offset, rd->len); + goto read_out; + } + + glfs_close_with_xdata(fd, dict); + glfs_h_close(obj); + + } else if (rd->op == GF_FOP_FTRUNCATE) { + } else if (rd->op == GF_FOP_SYMLINK) { + } else if ((rd->op == GF_FOP_RMDIR) || (rd->op == GF_FOP_UNLINK) || + (rd->op == GF_FOP_MKNOD) || (rd->op == GF_FOP_CREATE) || + (rd->op == GF_FOP_LINK) || (rd->op == GF_FOP_MKDIR)) { + } else if (rd->op == GF_FOP_RENAME) { + } else if ((rd->op == GF_FOP_FREMOVEXATTR) || + (rd->op == GF_FOP_REMOVEXATTR) || + (rd->op == GF_FOP_SETXATTR) || + (rd->op == GF_FOP_FSETXATTR)) { + + struct glfs_fd *fd = NULL; + struct glfs_object *obj = NULL; + uuid_t gfid; + + uuid_parse(ri->rec.gfid, gfid); + + + // This is for all the set attribute/extended attributes commands. 
+ // Get all the attributes from the source and fill it in the buffer + // as a NULL seperated key and value which are in turn seperated by + // NULL. + uint32_t k_s = 0, v_s = 0; + char *t_b= NULL; + uint32_t num=0; + + + nsr_worker_log(this->name, GF_LOG_INFO, + "doing getattr for gfid %s \n", + ri->rec.gfid); + + obj = glfs_h_create_from_handle(ctx->fs, gfid, GFAPI_HANDLE_LENGTH, NULL); + if (obj == NULL) { + GF_ASSERT(fd != NULL); + nsr_worker_log(this->name, GF_LOG_ERROR, + "creating of handle failed\n"); + goto read_out; + } + + if (obj->inode->ia_type == IA_IFDIR) + fd = glfs_h_opendir_with_xdata(ctx->fs, obj, dict); + else + fd = glfs_h_open_with_xdata(ctx->fs, obj, O_RDONLY, dict); + + if (fd == NULL) { + GF_ASSERT(fd != NULL); + nsr_worker_log(this->name, GF_LOG_ERROR, + "opening of file failed\n"); + goto read_out; + } + + if(get_xattr_total_size(fd, &t_b, &k_s, &v_s, &num, dict) == -1) { + if (t_b) free(t_b); + GF_ASSERT(0); + nsr_worker_log(this->name, GF_LOG_ERROR, + "list of xattr of gfid %s failed\n", rd->gfid); + goto read_out; + } + ri->work.data = GF_CALLOC((k_s + v_s) , sizeof(char), gf_mt_recon_private_t); + get_xattr(fd, t_b, ri->work.data, v_s, num, dict); + ri->work.num = num; + nsr_worker_log(this->name, GF_LOG_INFO, + "finished getattr for gfid %s \n", + ri->rec.gfid); + free(t_b); + goto read_out; + + } else if ((rd->op == GF_FOP_FSETATTR) || + (rd->op == GF_FOP_SETATTR)) { + + //TBD - to get the actual attrbutes and fill + // mode, uid, gid, size, atime, mtime + } +read_out: + nsr_worker_log(this->name, GF_LOG_INFO, + "finished recon read for gfid %s at offset %d for %d bytes \n", + rd->gfid, rd->offset, rd->len); + break; + } + case NSR_WORK_ID_SINGLE_RECONCILIATION_COMMIT: + { + dict_t * dict = NULL; + // first_index always starts with 1 but records starts at 0. 
+ wip = work->index - (dr->workers[0].recon_info->first_index); + ri = &(dr->workers[0].recon_info->records[wip]); + rd = &(ri->rec); + + nsr_worker_log(this->name, GF_LOG_INFO, + "got recon commit for index %d that has gfid %s \n", + wip, rd->gfid); + dict = dict_new (); + if (!dict) { + GF_ASSERT(0); + nsr_worker_log(this->name, GF_LOG_ERROR, + "failed allocating for dictionary\n"); + goto commit_out; + } + if (dict_set_int32(dict,RECON_TERM_XATTR,ri->work.term)) { + GF_ASSERT(0); + nsr_worker_log(this->name, GF_LOG_ERROR, + "error setting term in dict\n"); + goto commit_out; + } + if (dict_set_int32(dict,RECON_INDEX_XATTR,ri->work.index)) { + GF_ASSERT(0); + nsr_worker_log(this->name, GF_LOG_ERROR, + "error setting term in dict\n"); + goto commit_out; + } + apply_record(ctx, ri, dict); +commit_out: + dict_unref (dict); + nsr_worker_log(this->name, GF_LOG_INFO, + "finished recon commit for gfid %s \n", + rd->gfid); + break; + } + case NSR_WORK_ID_SINGLE_RECONCILIATION_FLUSH: + { + dict_t * dict = NULL; + dict = dict_new (); + if (!dict) { + GF_ASSERT(0); + nsr_worker_log(this->name, GF_LOG_ERROR, + "failed allocating for dictionary\n"); + goto commit_out; + } + if (dict_set_int32(dict,RECON_TERM_XATTR,ri->work.term)) { + GF_ASSERT(0); + nsr_worker_log(this->name, GF_LOG_ERROR, + "error setting term in dict\n"); + goto commit_out; + } + if (dict_set_int32(dict,RECON_INDEX_XATTR,ri->work.index)) { + GF_ASSERT(0); + nsr_worker_log(this->name, GF_LOG_ERROR, + "error setting term in dict\n"); + goto commit_out; + } + + // Increment work index with the start index + wip = work->index - (dr->workers[0].recon_info->first_index); + ri = &(dr->workers[0].recon_info->records[wip]); + rd = &(ri->rec); + //fd = glfs_open(ctx->fs, rd->gfid, O_RDONLY); //TBD - using gfid + + glfs_fsync_with_xdata(fd, dict); + break; + } + } + return; +} + +// thread for doing data work +static void * +data_worker_main(nsr_per_node_worker_t *ctx) +{ + nsr_worker_log(this->name, GF_LOG_INFO, + 
"starting data worker func\n"); + init_worker(ctx, 0); + + while(1) { + nsr_recon_work_t *work = NULL; + nsr_recon_driver_ctx_t *dr = ctx->driver_ctx; + + nsr_worker_log(this->name, GF_LOG_INFO, + "waiting for work\n"); + + pthread_mutex_lock(&(ctx->mutex)); + while (list_empty(&(ctx->head.list))) { + pthread_cond_wait(&(ctx->cv), &(ctx->mutex)); + } + pthread_mutex_unlock(&(ctx->mutex)); + list_for_each_entry(work, &(ctx->head.list), list) { + nsr_worker_log(this->name, GF_LOG_INFO, + "got work with id %d\n",work->req_id); + work->in_use = _gf_false; + data_worker_func(ctx, work); + atomic_dec(&(dr->outstanding)); + break; + } + nsr_worker_log(this->name, GF_LOG_INFO,"deleting work item\n"); + list_del_init (&work->list); + nsr_worker_log(this->name, GF_LOG_INFO,"finished deleting work item\n"); + } + + return NULL; +} + + +//make recon work +static void +recon_make_work(nsr_recon_work_t **work, + nsr_recon_work_req_id_t req_id, + int32_t i) +{ + // TBD - change this to get from a static pool + // This cannot fail + (*work) = GF_CALLOC (1, sizeof (nsr_recon_work_t), gf_mt_recon_private_t); + (*work)->req_id = req_id; + (*work)->index = i; + (*work)->in_use = _gf_true; + INIT_LIST_HEAD(&((*work)->list)); + return; +} + +// Schedule a work object to a worker thread. 
+static void +recon_queue_to_worker(nsr_recon_driver_ctx_t *ctx, + nsr_recon_work_t *work, + unsigned int id, + nsr_recon_queue_type_t type) +{ + nsr_per_node_worker_t *worker; + if (type == NSR_RECON_QUEUE_TO_CONTROL) { + worker = ctx->workers[id].control_worker; + nsr_driver_log(this->name, GF_LOG_INFO, + "queueing work to control index %d\n",id); + } else { + worker= ctx->workers[id].data_worker; + nsr_driver_log(this->name, GF_LOG_INFO, + "queueing work to data index %d\n",id); + } + pthread_mutex_lock(&worker->mutex); + list_add_tail(&work->list, &worker->head.list); + pthread_cond_signal(&worker->cv); + pthread_mutex_unlock(&worker->mutex); + return; +} + +typedef void * (*F_t) (void *); + +// In case mode is set to NSR_USE_THREADS, create worker threads. +static gf_boolean_t +create_worker_threads(nsr_recon_private_t *priv, + nsr_recon_driver_ctx_t *ctx, + nsr_per_node_worker_t *w, + gf_boolean_t control_or_data, + F_t f, + uint32_t num) +{ + uint32_t i; + nsr_per_node_worker_t *worker = w; + + + for (i=0; i < num; i++) { + worker->id = GF_CALLOC(1, 10, gf_mt_recon_private_t); + if (!worker->id) { + nsr_driver_log (priv->this->name, GF_LOG_ERROR, "memory allocation error \n"); + return _gf_false; + } + sprintf(worker->id,"recon_%d", i); + worker->driver_ctx = ctx ; + + if (ctx->mode == NSR_USE_THREADS) { + if (pthread_create(&worker->thread_id, NULL, f, worker)) { + nsr_driver_log (ctx->this->name, GF_LOG_ERROR, "control work thread creation error \n"); + return _gf_false; + } + } + worker->index = i; + worker++; + } + return _gf_true; +} + +/* + * In case of thread, send the work item; else call the function directly. + * + * Input arguments: + * bm - bitmap containing indices of nodes we want to send work + * num - number of such indices + * ctx - driver context from where we derive per worker context + * id - request ID + * q - control or data + * misc - used to overload such as index. 
+ */ +static void +send_and_wait(int32_t bm, + uint32_t num, + nsr_recon_driver_ctx_t *ctx, + nsr_recon_work_req_id_t id, + nsr_recon_queue_type_t q, + int32_t misc) +{ + uint32_t i = 0; + nsr_recon_work_t *work; + + if (ctx->mode == NSR_SEQ) { + for (i=0; i < num; i++) { + if ((bm & (1 << i)) && ctx->workers[i].in_use) { + recon_make_work(&work, id, misc); + if (q == NSR_RECON_QUEUE_TO_CONTROL) { + if (i == 0) + control_worker_func_0(ctx->workers[0].control_worker, work); + else + control_worker_func(ctx->workers[i].control_worker, work); + } else { + data_worker_func(ctx->workers[i].data_worker, work); + } + } + } + nsr_driver_log(this->name, GF_LOG_INFO, "send_and_wait: all workers have returned\n"); + return; + } + + for (i=0; i < num; i++) { + if ((bm & (1 << i)) && ctx->workers[i].in_use) { + recon_make_work(&work, id, misc); + atomic_inc(&(ctx->outstanding)); + recon_queue_to_worker(ctx, work, i, q); + } + } + + nsr_driver_log(this->name, GF_LOG_INFO, "send_and_wait: waiting\n"); + while (ctx->outstanding) { + pthread_yield(); + } + nsr_driver_log(this->name, GF_LOG_INFO, "send_and_wait: all workers have returned\n"); + return; +} + +#if 0 +static void +send_and_do_not_wait(int32_t bm, + uint32_t num, + nsr_recon_driver_ctx_t *ctx, + nsr_recon_work_req_id_t id, + nsr_recon_queue_type_t q, + int32_t misc) +{ + uint32_t i = 0; + + for (i=0; i < num; i++) { + if ((bm & (1 << i)) && ctx->workers[i].in_use) { + nsr_recon_work_t *work; + recon_make_work(&work, id, misc); + recon_queue_to_worker(ctx, work, i, q); + } + } + + return; +} +#endif + +// send INI or FINI +static void +nsr_recon_in_use(nsr_recon_driver_ctx_t *ctx, + uint32_t i, + gf_boolean_t in_use) +{ + uint32_t bm = 1 << i; + gf_boolean_t send = _gf_false; + + if (in_use == _gf_false) { + if (ctx->workers[i].in_use == _gf_true) + send = _gf_true; + ctx->workers[i].in_use = _gf_false; + } else { + if (ctx->workers[i].in_use != _gf_true) { + ctx->workers[i].in_use = _gf_true; + send = _gf_true; + } + } 
+#if 1 + if (send == _gf_true) { + if (in_use == _gf_true) { + nsr_driver_log(this->name, GF_LOG_INFO, "sending INI to index %d\n",i); + } else { + nsr_driver_log(this->name, GF_LOG_INFO, "sending FINI to index %d\n",i); + } + send_and_wait(bm, ctx->replica_group_size, ctx, + (in_use == _gf_true) ? NSR_WORK_ID_INI : NSR_WORK_ID_FINI, + NSR_RECON_QUEUE_TO_CONTROL, -1); + send_and_wait(bm, ctx->replica_group_size, ctx, + (in_use == _gf_true) ? NSR_WORK_ID_INI : NSR_WORK_ID_FINI, + NSR_RECON_QUEUE_TO_DATA, -1); + } +#endif +} + +// main recon driver thread +void * +nsr_reconciliation_driver(void *arg) +{ + nsr_recon_private_t *priv = (nsr_recon_private_t *) arg; + uint32_t replica_group_size = priv->replica_group_size; + uint32_t i; + nsr_per_node_worker_t *control_s, *data_s; + nsr_recon_driver_ctx_t **driver_ctx, *ctx; + int32_t bm; + xlator_t *this = priv->this; + + driver_ctx = &priv->driver_thread_context; + (*driver_ctx) = GF_CALLOC (1, + sizeof (nsr_recon_driver_ctx_t), + gf_mt_recon_private_t); + if (!driver_ctx) { + gf_log (this->name, GF_LOG_ERROR, "memory allocation error \n"); + return NULL; + } + ctx = *driver_ctx; + ctx->this = priv->this; + ctx->replica_group_size = replica_group_size; + if ((pthread_mutex_init(&(ctx->mutex), NULL)) || + (pthread_cond_init(&(ctx->cv), NULL))){ + nsr_driver_log (this->name, GF_LOG_ERROR, "mutex init error \n"); + return NULL; + } + + ctx->workers = GF_CALLOC (replica_group_size, + sizeof(nsr_replica_worker_t), + gf_mt_recon_private_t); + if (!ctx->workers) { + nsr_driver_log (this->name, GF_LOG_ERROR, "memory allocation error \n"); + return NULL; + } + for (i=0; i < replica_group_size; i++) { + strcpy(ctx->workers[i].name, priv->replica_group_members[i]); + } + + control_s = GF_CALLOC (replica_group_size, + sizeof(nsr_per_node_worker_t), + gf_mt_recon_private_t); + if (!control_s) { + nsr_driver_log (this->name, GF_LOG_ERROR, "memory allocation error \n"); + return NULL; + } + + data_s = GF_CALLOC (replica_group_size, + 
sizeof(nsr_per_node_worker_t), + gf_mt_recon_private_t); + if (!data_s) { + nsr_driver_log (this->name, GF_LOG_ERROR, "memory allocation error \n"); + return NULL; + } + for (i=0; i < replica_group_size; i++) { + ctx->workers[i].control_worker = &control_s[i]; + ctx->workers[i].data_worker = &data_s[i]; + } + + nsr_driver_log (this->name, GF_LOG_INFO, "creating threads \n"); + // Create the worker threads + // For every brick including itself there will be 2 worker threads: + // one for data and one for control + if (!create_worker_threads(priv, ctx, control_s, _gf_true, + (F_t) control_worker_main, replica_group_size) || + !create_worker_threads(priv, ctx, data_s, _gf_false, + (F_t) data_worker_main, replica_group_size)) { + return NULL; + } + + for (i=0; i < replica_group_size; i++) { + nsr_recon_get_file(priv->volname, &(ctx->workers[i])); + } + + while (1) { + + nsr_driver_log (this->name, GF_LOG_INFO, "waiting for state change \n"); + pthread_mutex_lock(&(ctx->mutex)); + while ((*driver_ctx)->state == 0) { + pthread_cond_wait(&(ctx->cv), &(ctx->mutex)); + } + pthread_mutex_unlock(&(ctx->mutex)); + + nsr_driver_log (this->name, GF_LOG_INFO, " state changed to %d \n", ctx->state); +#if 0 + for (i=0; i < replica_group_size; i++) { + if (ctx->workers[i].in_use) { + nsr_recon_start_work(ctx->workers[i].control_worker, _gf_true); + nsr_recon_start_work(ctx->workers[i].data_worker, _gf_false); + } + } +#endif + + if (ctx->state == leader) { + + int32_t chosen = -1; + int32_t last_term = -1, last_ops = -1; + + nsr_driver_log (this->name, GF_LOG_INFO, "getting last term info from all members of this group\n"); + // Get last term info from all members for this group + send_and_wait(-1, + replica_group_size, + ctx, + NSR_WORK_ID_GET_LAST_TERM_INFO, + NSR_RECON_QUEUE_TO_CONTROL, ctx->current_term); + + + // compare all the info received and choose the reconciliator + // First choose all with latest term + for (i=0; i < replica_group_size; i++) { + if 
(ctx->workers[i].in_use) { + if (ctx->workers[i].recon_info->last_term > last_term) { + last_term = ctx->workers[i].recon_info->last_term; + } + } + } + // First choose all with latest term and highest ops + for (i=0; i < replica_group_size; i++) { + if ((ctx->workers[i].in_use) && (last_term == ctx->workers[i].recon_info->last_term)) { + if (ctx->workers[i].recon_info->commited_ops > last_ops) { + last_ops = ctx->workers[i].recon_info->commited_ops; + } + } + } + // choose the first among the lot + for (i=0; i < replica_group_size; i++) { + if ((ctx->workers[i].in_use) && + (last_term == ctx->workers[i].recon_info->last_term) && + (last_ops == ctx->workers[i].recon_info->commited_ops)) { + chosen = i; + break; + } + } + + nsr_driver_log (this->name, GF_LOG_INFO, "reconciliator chosen is %d\n", chosen); + ctx->reconciliator_index = chosen; + GF_ASSERT(chosen != -1); + if (chosen == -1) { + nsr_driver_log (this->name, GF_LOG_INFO, "no reconciliatior chosen\n"); + goto out; + } + + // send the message to reconciliator to do reconciliation with list of nodes that are part of this quorum + if (chosen != 0) { + nsr_driver_log (this->name, GF_LOG_INFO, "sending reconciliation work to %d\n", chosen); + bm = 1 << ctx->reconciliator_index; + send_and_wait(bm, + replica_group_size, + ctx, + NSR_WORK_ID_RECONCILIATOR_DO_WORK, + NSR_RECON_QUEUE_TO_CONTROL, -1); + nsr_driver_log (this->name, GF_LOG_INFO, "finished reconciliation work to %d\n", chosen); + } else { + nsr_driver_log (this->name, GF_LOG_INFO, "local node is reconciliator. before set jmp\n"); + ctx->env = calloc(1,sizeof(jmp_buf)); + /* + * REVIEW + * Use of setjmp/longjmp in an environment + * where we already use ucontext is dangerous + * and therefore forbidden. Refactoring will + * also help with some of the rampant 80-column + * violations and indented code crawling across + * the screen, which together make this entire + * file almost unreadable. 
+ */ + if (!setjmp(*(ctx->env))) { + ctx->state = reconciliator; + goto i_am_reconciliator; + } else { + nsr_driver_log (this->name, GF_LOG_INFO, "long jmp return to leader\n"); + free(ctx->env); + ctx->env = NULL; + ctx->state = leader; + } + } + + // send message to all other nodes to sync up with the reconciliator including itself if required + // requires optimisation - TBD + if (chosen != 0) { + nsr_driver_log (this->name, GF_LOG_INFO, "local node resolution needs to be done. before set jmp\n"); + ctx->env = calloc(1,sizeof(jmp_buf)); + if (!setjmp(*(ctx->env))) { + ctx->state = resolutor; + goto i_am_resolutor; + } else { + nsr_driver_log (this->name, GF_LOG_INFO, "long jmp return to leader\n"); + free(ctx->env); + ctx->env = NULL; + ctx->state = leader; + } + } + + nsr_driver_log (this->name, GF_LOG_INFO, "sending resolution work to all nodes except this node and reconciliator\n"); + bm = ~((1 << ctx->reconciliator_index) || 1); + send_and_wait(bm, + replica_group_size, + ctx, + NSR_WORK_ID_RESOLUTION_DO_WORK, + NSR_RECON_QUEUE_TO_CONTROL, -1); + + nsr_driver_log (this->name, GF_LOG_INFO, "finished reconciliation work as leader \n"); + + } +i_am_reconciliator: + if (ctx->state == reconciliator) { + gf_boolean_t do_recon = _gf_false; + uint32_t start_index = ctx->workers[0].recon_info->first_index; + uint32_t end_index = ctx->workers[0].recon_info->last_index; + uint32_t num = ((start_index == 0) && (end_index == 0)) ? 0 : (end_index - start_index + 1); + + nsr_driver_log (this->name, GF_LOG_INFO, "starting reconciliation work as reconciliator \n"); + + // nothing to be done? signal back to the recon translator that this phase done. 
+ bm = 1; + for (i=1; i < replica_group_size; i++) { + if (ctx->workers[i].in_use && + (ctx->workers[0].recon_info->last_term == ctx->workers[i].recon_info->last_term)) { + ctx->workers[i].recon_info->last_index = end_index; + ctx->workers[i].recon_info->first_index = start_index; + bm = (1 << i); + do_recon = _gf_true; + } + } + + if (!do_recon || !num) { + nsr_driver_log (this->name, GF_LOG_INFO, "nothing needs to be done as resolutor \n"); + if (ctx->env) { + nsr_driver_log (this->name, GF_LOG_INFO, "before longjmp \n"); + longjmp(*(ctx->env), 1); + } else { + goto out; + } + } + + nsr_driver_log (this->name, GF_LOG_INFO, + "getting reconciliation window for term %d from %dto %d \n", + ctx->workers[0].recon_info->last_term, + start_index, end_index); + // We have set the bm in the above for loop where + // we go thru all nodes including this node that + // have seen the last term. + send_and_wait(bm, + replica_group_size, + ctx, + NSR_WORK_ID_GET_RECONCILATION_WINDOW, + NSR_RECON_QUEUE_TO_CONTROL, -1); + nsr_driver_log (this->name, GF_LOG_INFO, + "finished getting reconciliation window for term %d from %dto %d \n", + ctx->workers[0].recon_info->last_term, + start_index, end_index); + + + // from the changelogs, calculate the entries + // that need action and the source for each of these entries + compute_reconciliation_work(ctx); + + // for each of the entries that need fixup, issue IO + for (i=start_index; i < (start_index + num); i++) { + nsr_reconciliator_info_t *my_recon_info = + ctx->workers[0].recon_info; + nsr_reconciliation_record_t *record = + &(my_recon_info->records[i - start_index]); + + record->work.term = ctx->workers[0].recon_info->last_term; + record->work.index = i; + + nsr_driver_log (this->name, GF_LOG_INFO, + "fixing index %d\n",i); + if ((record->work.type == NSR_RECON_WORK_HOLE_TO_PSEUDO_HOLE) || + (record->work.type == NSR_RECON_WORK_HOLE_TO_FILL)) { + // 1st case (RECON_WORK_HOLE_TO_PSEUDO_HOLE): + // If there are only pseudo_holes in 
others, it is best effort. + // Just pick from the first node that has it and proceed. + // 2nd case (RECON_WORK_HOLE_TO_FILL): + // this node has either a HOLE or PSUEDO_HOLE and some one else has a FILL(source). + // analyse the changelog to check if data needs to be read or if the log has all the data required + + if (recon_check_changelog(&record->rec)) { + bm = (1 << record->work.source); + nsr_driver_log (this->name, GF_LOG_INFO, + "reading data from source %d\n",record->work.source); + send_and_wait(bm, + replica_group_size, + ctx, + NSR_WORK_ID_SINGLE_RECONCILIATION_READ, + NSR_RECON_QUEUE_TO_DATA, + i); + nsr_driver_log (this->name, GF_LOG_INFO, + "got data from source %d\n",record->work.source); + } + + nsr_driver_log (this->name, GF_LOG_INFO, + "fixing local data as part of reconciliation\n"); + + bm = 1; + send_and_wait(bm, + replica_group_size, + ctx, + NSR_WORK_ID_SINGLE_RECONCILIATION_COMMIT, + NSR_RECON_QUEUE_TO_DATA, + i); + + nsr_driver_log (this->name, GF_LOG_INFO, + "finished fixing local data as part of reconciliation\n"); + + } else if (record->work.type == NSR_RECON_WORK_COMPARE_PSEUDO_HOLE) { + // this node has a pseudo_hole and some others have just that too. Just convert this to FILL. + // let others blindly pick it from here. 
+ nsr_driver_log (this->name, GF_LOG_INFO, + "fixing this record as a fill\n"); + bm = 1; + send_and_wait(bm, + replica_group_size, + ctx, + NSR_WORK_ID_SINGLE_RECONCILIATION_FLUSH, + NSR_RECON_QUEUE_TO_DATA, + i); + nsr_driver_log (this->name, GF_LOG_INFO, + "finished fixing this record as a fill\n"); + } + } + + nsr_driver_log (this->name, GF_LOG_INFO, "finished reconciliation work as reconciliator \n"); + + if (ctx->env) { + nsr_driver_log (this->name, GF_LOG_INFO, "before longjmp \n"); + longjmp(*(ctx->env), 1); + } + + // tbd - mark this term golden in the reconciliator + + } +i_am_resolutor: + if (ctx->state == resolutor) { + + // This node's last term is filled when it gets a message + // from the leader to act as a reconciliator. + uint32_t recon_index = ctx->reconciliator_index; + nsr_reconciliator_info_t *my_info = + ctx->workers[0].recon_info; + nsr_reconciliator_info_t *his_info = + ctx->workers[recon_index].recon_info; + uint32_t my_last_term = my_info->last_term; + uint32_t to_do_term = his_info->last_term; + uint32_t my_start_index = 1, my_end_index = 1; + uint32_t his_start_index = 1, his_end_index = 1; + uint32_t num = 0; + gf_boolean_t fl = _gf_true; + + nsr_driver_log (this->name, GF_LOG_INFO, + "starting resolutor work with reconciliator as %d from term %d to term %d \n", + recon_index, my_last_term, to_do_term); + + do { + + if (!fl) { + (his_info->last_term)++; + (my_info->last_term)++; + } else { + his_info->last_term = my_last_term; + } + + nsr_driver_log (this->name, GF_LOG_INFO, "resolving term %d \n", my_info->last_term); + + // Get reconciliator's term information for that term + nsr_driver_log (this->name, GF_LOG_INFO, + "getting info from reconciliator for term %d \n", my_info->last_term); + bm = (1 << recon_index); + send_and_wait(bm, + replica_group_size, + ctx, + NSR_WORK_ID_GET_GIVEN_TERM_INFO, + NSR_RECON_QUEUE_TO_CONTROL, his_info->last_term); + nsr_driver_log (this->name, GF_LOG_INFO, + "finished getting info from reconciliator 
for term %d \n", my_info->last_term); + + + // empty term + if (!his_info->commited_ops) { + nsr_driver_log (this->name, GF_LOG_INFO, + "reconciliator for term %d is empty. moving to next term. \n", my_info->last_term); + // TBD - mark the term golden + fl = _gf_false; + continue; + } + + // calculate the resolution window boundary. + // for the last term this node saw, we compare the resolution window of this and reconciliator. + // for the rest of the nodes, we just accept the reconciliator info. + if (fl) { + my_start_index = my_info->first_index; + my_end_index = my_info->last_index; + his_start_index = his_info->first_index; + his_end_index = his_info->last_index; + my_info->first_index = (my_start_index < his_start_index) ? my_start_index : his_start_index; + my_info->last_index = (my_end_index > his_end_index) ? my_end_index : his_end_index; + } else { + my_info->first_index = his_info->first_index; + my_info->last_index = his_info->last_index; + my_info->commited_ops = his_info->commited_ops; + } + if (my_info->first_index == 0) + my_info->first_index = 1; + num = (my_info->last_index - my_info->first_index) + 1; + + + // Get the logs from the reconciliator (and this node for this term) + if (fl) + bm = ((1 << recon_index) | 1); + else + bm = (1 << recon_index); + + nsr_driver_log (this->name, GF_LOG_INFO, + "getting reconciliation window for term %d from %d to %d \n", + my_info->last_term, + my_info->first_index, my_info->last_index); + send_and_wait(bm, + replica_group_size, + ctx, + NSR_WORK_ID_GET_RECONCILATION_WINDOW, + NSR_RECON_QUEUE_TO_CONTROL, -1); + nsr_driver_log (this->name, GF_LOG_INFO, + "finished getting reconciliation window for term %d from %d to %d \n", + my_info->last_term, + my_info->first_index, my_info->last_index); + + // from the changelogs, calculate the entries that need action + compute_resolution_work(ctx, my_info, his_info, !fl); + + + // for each of the entries that need fixup, issue IO + for (i=my_info->first_index; i < 
(my_info->first_index + num); i++) { + nsr_reconciliation_record_t *record = + &(my_info->records[i - my_info->first_index]); + + record->work.term = my_info->last_term; + record->work.index = i; + + nsr_driver_log (this->name, GF_LOG_INFO, "fixing index %d\n",i); + if ((record->work.type == NSR_RECON_WORK_HOLE_TO_FILL) || + (record->work.type == NSR_RECON_WORK_UNDO_FILL)) { + if (((record->work.type == NSR_RECON_WORK_HOLE_TO_FILL) && + recon_check_changelog(&record->rec)) || + ((record->work.type == NSR_RECON_WORK_UNDO_FILL) && + recon_compute_undo(&record->rec))) { + nsr_driver_log (this->name, GF_LOG_INFO, + "reading data from source %d\n",recon_index); + bm = (1 << recon_index); + send_and_wait(bm, + replica_group_size, + ctx, + NSR_WORK_ID_SINGLE_RECONCILIATION_READ, + NSR_RECON_QUEUE_TO_DATA, + i); + nsr_driver_log (this->name, GF_LOG_INFO, + "finished reading data from source %d\n",recon_index); + } + + nsr_driver_log (this->name, GF_LOG_INFO, + "fixing local data as part of resolutor\n"); + + bm = 1; + send_and_wait(bm, + replica_group_size, + ctx, + NSR_WORK_ID_SINGLE_RECONCILIATION_COMMIT, + NSR_RECON_QUEUE_TO_DATA, + i); + + nsr_driver_log (this->name, GF_LOG_INFO, + "finished fixing local data as part of resolutor\n"); + } + } + fl = _gf_false; + + // tbd - mark this term golden in the reconciliator + } while (my_last_term++ != to_do_term); + + nsr_driver_log (this->name, GF_LOG_INFO, + "finished resolutor work \n"); + + if (ctx->env) { + nsr_driver_log (this->name, GF_LOG_INFO, "before longjmp \n"); + longjmp(*(ctx->env), 1); + } + + } + + // free the asasociated recon_info contexts created as part of this role + +out: + nsr_driver_log (this->name, GF_LOG_INFO, + "sending end of reconciliation message \n"); + nsr_recon_return_back(priv, ctx->txn_id); +#if 0 + // send message that job is done by writing to local recon translator + bm = 1; + send_and_wait(bm, + replica_group_size, + ctx, + NSR_WORK_ID_END_RECONCILIATION, + NSR_RECON_QUEUE_TO_CONTROL, 
-1); +#endif + nsr_driver_log (this->name, GF_LOG_INFO, + "finished sending end of reconciliation message \n"); + ctx->state = 0; + } + + return NULL; +} diff --git a/xlators/cluster/nsr-recon/src/recon_driver.h b/xlators/cluster/nsr-recon/src/recon_driver.h new file mode 100644 index 000000000..67f4d6014 --- /dev/null +++ b/xlators/cluster/nsr-recon/src/recon_driver.h @@ -0,0 +1,308 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __RECON_DRIVER_H__ +#define __RECON_DRIVER_H__ + + +#include "api/src/glfs.h" +#include + +#define MAX_HOSTNAME_LEN 32 +#define MAXIMUM_REPLICA_STRENGTH 8 +#define MAX_RECONCILIATION_WINDOW_SIZE 10000 + +#define GLUSTERD_DEFAULT_WORKDIR "/var/lib/glusterd" +#define GLUSTERD_VOLUME_DIR_PREFIX "vols" +#define GLUSTERD_BRICK_INFO_DIR "bricks" + +/* + * Even with the names fixed, the non-NSR_DEBUG definitions of nsr_*_log don't + * work because many callers don't have "this" defined. + * + * TBD: use gf_log, fix "this" problem, eliminate extra fields and newlines. 
+ */ +#define NSR_DEBUG + +typedef enum nsr_recon_work_req_id_t { + NSR_WORK_ID_GET_NONE = 0, + NSR_WORK_ID_GET_LAST_TERM_INFO = NSR_WORK_ID_GET_NONE + 1, + NSR_WORK_ID_GET_GIVEN_TERM_INFO = NSR_WORK_ID_GET_LAST_TERM_INFO + 1, + NSR_WORK_ID_RECONCILIATOR_DO_WORK = NSR_WORK_ID_GET_GIVEN_TERM_INFO + 1, + NSR_WORK_ID_RESOLUTION_DO_WORK = NSR_WORK_ID_RECONCILIATOR_DO_WORK + 1, + NSR_WORK_ID_GET_RECONCILATION_WINDOW = NSR_WORK_ID_RESOLUTION_DO_WORK + 1, + NSR_WORK_ID_SINGLE_RECONCILIATION_READ = NSR_WORK_ID_GET_RECONCILATION_WINDOW + 1, + NSR_WORK_ID_SINGLE_RECONCILIATION_COMMIT = NSR_WORK_ID_SINGLE_RECONCILIATION_READ + 1, + NSR_WORK_ID_SINGLE_RECONCILIATION_FLUSH = NSR_WORK_ID_SINGLE_RECONCILIATION_COMMIT + 1, + NSR_WORK_ID_GET_RESOLUTION_WINDOW = NSR_WORK_ID_SINGLE_RECONCILIATION_FLUSH + 1, + NSR_WORK_ID_END_RECONCILIATION = NSR_WORK_ID_GET_RESOLUTION_WINDOW + 1, + NSR_WORK_ID_INI = NSR_WORK_ID_END_RECONCILIATION + 1, + NSR_WORK_ID_FINI = NSR_WORK_ID_INI + 1 +} nsr_recon_work_req_id_t; + +typedef enum nsr_recon_queue_type_t { + NSR_RECON_QUEUE_TO_CONTROL = 0, + NSR_RECON_QUEUE_TO_DATA =NSR_RECON_QUEUE_TO_CONTROL + 1, +} nsr_recon_queue_type_t; + +typedef enum nsr_log_type_t { + NSR_LOG_HOLE = 0b0, + NSR_LOG_PSEUDO_HOLE = 0b1, + NSR_LOG_FILL = 0b11 +} nsr_log_type_t; + +typedef enum nsr_mode_t { + NSR_SEQ = 0, + NSR_USE_THREADS = 1, + NSR_ASYNC = 2 +} nsr_mode_t; + +typedef enum nsr_recon_work_type_t { + NSR_RECON_WORK_NONE = 0, + NSR_RECON_WORK_HOLE_TO_NOOP = NSR_RECON_WORK_NONE + 1, + NSR_RECON_WORK_HOLE_TO_PSEUDO_HOLE = NSR_RECON_WORK_HOLE_TO_NOOP + 1, + NSR_RECON_WORK_COMPARE_PSEUDO_HOLE = NSR_RECON_WORK_HOLE_TO_PSEUDO_HOLE + 1, + NSR_RECON_WORK_HOLE_TO_FILL = NSR_RECON_WORK_COMPARE_PSEUDO_HOLE + 1, + NSR_RECON_WORK_UNDO_FILL = NSR_RECON_WORK_HOLE_TO_FILL + 1, +} nsr_recon_work_type_t; + +typedef enum nsr_recon_driver_state_t { + none = 0, + leader = 1, + reconciliator = 2, + resolutor = 3, +} nsr_recon_driver_state_t; + +// role structure +#pragma pack(push, 1) 
+typedef struct _nsr_recon_role_s { + uint32_t role; // leader, reconciliator, resolutor + uint32_t num; // required in case state is reconciliator + uint32_t current_term; // current term used in case of leader + // In case this is reconciliator, num is set to nodes that were part + // of previous term. + // In case this is resolutor, num is set to 2. + // info[0] - information for this node. + // info[1] - information of the reconciliator. + // In case this is leader, num is set to this term's membership list + // set info.name to all members including the leader + struct { + int32_t last_term; + int32_t commited_ops; + uint32_t last_index; + uint32_t first_index; + char name[MAX_HOSTNAME_LEN]; + } info[MAXIMUM_REPLICA_STRENGTH]; +} nsr_recon_role_t; +#pragma pack(pop) + +#define ENDIAN_CONVERSION_RR(rr, is_true) \ +{ \ + uint32_t i=0; \ + uint32_t (*f)(uint32_t) = ((is_true == _gf_true) ? ntohl : htonl); \ + if (is_true == _gf_true) rr.num = f(rr.num); \ + rr.current_term = f(rr.current_term); \ + for (i=0; i < rr.num; i++) { \ + rr.info[i].last_term = f(rr.info[i].last_term); \ + rr.info[i].commited_ops = f(rr.info[i].commited_ops); \ + rr.info[i].last_index = f(rr.info[i].last_index); \ + rr.info[i].first_index = f(rr.info[i].first_index); \ + } \ + if (is_true == _gf_false) rr.num = f(rr.num); \ +} + +// last term info structure +#pragma pack(push, 1) +typedef struct _nsr_recon_last_term_info_s { + int32_t last_term; + int32_t commited_ops; + uint32_t last_index; + uint32_t first_index; +} nsr_recon_last_term_info_t; +#pragma pack(pop) + +#define ENDIAN_CONVERSION_LT(lt, is_true) \ +{ \ + uint32_t (*f)(uint32_t) = ((is_true == _gf_true) ? 
ntohl : htonl); \ + lt.last_term = f(lt.last_term); \ + lt.commited_ops = f(lt.commited_ops); \ + lt.last_index = f(lt.last_index); \ + lt.first_index = f(lt.first_index); \ +} + +// log information +#pragma pack(push, 1) +typedef struct _nsr_recon_log_info_s { + uint32_t term; + uint32_t first_index; + uint32_t last_index; +} nsr_recon_log_info_t; +#pragma pack(pop) + +#define ENDIAN_CONVERSION_LI(li, is_true) \ +{ \ + uint32_t (*f)(uint32_t) = ((is_true == _gf_true) ? ntohl : htonl); \ + li.term = f(li.term); \ + li.first_index = f(li.first_index); \ + li.last_index = f(li.last_index); \ +} + +#pragma pack(push, 1) +typedef struct nsr_recon_record_details_s { + uint32_t type; + uint32_t op; + char gfid[36+1]; + char pargfid[36+1]; + char link_path[256]; // should it be PATH_MAX? + uint32_t offset; + uint32_t len; + char entry[128]; + char newloc[128]; // for rename. can you overload link_path for this? TBD +} nsr_recon_record_details_t; +#pragma pack(pop) + +#define ENDIAN_CONVERSION_RD(rd, is_true) \ +{ \ + uint32_t (*f)(uint32_t) = ((is_true == _gf_true) ? 
ntohl : htonl); \ + rd.type = f(rd.type); \ + rd.op = f(rd.op); \ + rd.offset = f(rd.offset); \ + rd.len = f(rd.len); \ +} + +typedef struct _nsr_recon_work_s { + gf_boolean_t in_use; + uint32_t index; + uint32_t req_id; + struct list_head list; +} nsr_recon_work_t; + +typedef struct _nsr_reconciliation_work_s { + uint32_t term; + uint32_t index; + uint32_t type; + uint32_t source; + void *data; + + uint32_t num; // used for xattr + +} nsr_reconciliation_work_t; + +typedef struct _nsr_reconciliation_record_s { + nsr_reconciliation_work_t work; // will store the computed work + nsr_recon_record_details_t rec; +} nsr_reconciliation_record_t; + +typedef struct _nsr_reconciliator_info { + uint32_t reconcilator_index; + int32_t last_term; + int32_t commited_ops; + uint32_t last_index; + uint32_t first_index; + nsr_reconciliation_record_t records[MAX_RECONCILIATION_WINDOW_SIZE]; +} nsr_reconciliator_info_t; + +typedef struct _nsr_per_node_worker_s { + char *id; // identifier + char vol_file[256]; //volfile that will be used by this thread + glfs_t *fs; + glfs_fd_t *aux_fd; + uint32_t index; // index into array of workers + pthread_t thread_id; // thread id + void * context; // thread context + struct _nsr_recon_driver_ctxt *driver_ctx; + char local; // local data worker + //struct list_head list; //list of work items + nsr_recon_work_t head; + pthread_mutex_t mutex; //mutex to gaurd the above list + pthread_cond_t cv; //condition variable for signaling the worker thread + gf_boolean_t is_control; +#ifdef NSR_DEBUG + uint32_t worker_log_fd; +#endif +} nsr_per_node_worker_t; + +typedef struct _nsr_replica_worker_s { + char name[256]; + nsr_per_node_worker_t *control_worker; + nsr_per_node_worker_t *data_worker; + gf_boolean_t in_use; + nsr_reconciliator_info_t *recon_info; // Bunch of infos kept for this reconciliation +} nsr_replica_worker_t; + +typedef struct _nsr_recon_driver_ctxt { + xlator_t *this; + uint32_t replica_group_size; // number of static members of replica 
group + nsr_replica_worker_t *workers; // worker info + int32_t reconciliator; + pthread_mutex_t mutex; //mutex to gaurd the state + pthread_cond_t cv; //condition variable for signaling the driver thread + uint32_t state; //driver state + volatile int32_t outstanding; + uint32_t reconciliator_index; + uint32_t txn_id; + uint32_t current_term; + jmp_buf *env; +#ifdef NSR_DEBUG + uint32_t driver_log_fd; +#endif + nsr_mode_t mode; // default set to seq +} nsr_recon_driver_ctx_t; + +void * +nsr_reconciliation_driver(void *); + +gf_boolean_t +nsr_recon_driver_set_role(nsr_recon_driver_ctx_t *ctx, nsr_recon_role_t *rr, uint32_t txn_id); + +#define atomic_inc(ptr) ((void) __sync_fetch_and_add(ptr, 1)) +#define atomic_dec(ptr) ((void) __sync_fetch_and_add(ptr, -1)) +#define atomic_fetch_and __sync_fetch_and_and +#define atomic_fetch_or __sync_fetch_and_or + +/* + * REVIEW + * Ideally, use gf_log like everyone else. Failing that, at least put the logs + * with all the others in /var/log instead of /tmp. + * NB two instances, for nsr_driver_log and nsr_worker_log + */ +#ifdef NSR_DEBUG +#define nsr_driver_log(dom, levl, fmt...) \ + { \ + char c[255]; \ + if (!ctx->driver_log_fd) { \ + mkdir("/tmp/nsr-logs/", 0777); \ + ctx->driver_log_fd = open("/tmp/nsr-logs/nsr-driver-log", O_RDWR|O_CREAT|O_TRUNC); \ + } \ + sprintf(c, fmt); \ + write(ctx->driver_log_fd, c, strlen(c)); \ + } +#else +#define nsr_driver_log(dom, levl, fmt...) gf_log(dom, levl, fmt) +#endif + +#ifdef NSR_DEBUG +#define nsr_worker_log(dom, levl, fmt...) \ + { \ + char c[255]; \ + if (!ctx->worker_log_fd) { \ + char str[255]; \ + sprintf(str,"/tmp/nsr-logs/%s-%d",ctx->is_control? "con" : "data",ctx->index); \ + mkdir("/tmp/nsr-logs/", 0777); \ + ctx->worker_log_fd = open(str, O_RDWR|O_CREAT|O_TRUNC); \ + } \ + sprintf(c, fmt); \ + write(ctx->worker_log_fd, c, strlen(c)); \ + } +#else +#define nsr_worker_log(dom, levl, fmt...) 
gf_log(dom, levl, fmt) +#endif + +#endif /* #ifndef __RECON_DRIVER_H__ */ diff --git a/xlators/cluster/nsr-recon/src/recon_xlator.c b/xlators/cluster/nsr-recon/src/recon_xlator.c new file mode 100644 index 000000000..62583d526 --- /dev/null +++ b/xlators/cluster/nsr-recon/src/recon_xlator.c @@ -0,0 +1,837 @@ +/* + Copyright (c) 2013 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include +#include +#include +#include + +#include "call-stub.h" +#include "defaults.h" +#include "xlator.h" + +#include "recon_driver.h" +#include "recon_xlator.h" + +typedef struct _nsr_recon_fd_s { + int32_t term; + nsr_recon_driver_state_t state; + uint32_t first_index; + uint32_t last_index; + call_frame_t *frame; +} nsr_recon_fd_t; + + +typedef struct _nsr_txn_id_s { + uint32_t txn_id; + call_frame_t *frame; + struct list_head list; +} nsr_txn_id_t; + +// Given fd, get back the NSR based fd context. 
+static int32_t this_fd_ctx_get(fd_t *fd, xlator_t *this, nsr_recon_fd_t **rfd) +{ + uint64_t tmp = 0; + int32_t ret = -1; + + if ((ret = fd_ctx_get(fd, this, &tmp)) != 0) { + return ret; + } else { + *rfd = (nsr_recon_fd_t *)tmp; + return 0; + } +} + +// Add the frame in q after associating with txn_id +static void put_frame(nsr_recon_private_t *priv, + call_frame_t *frame, + uint32_t txn_id) +{ + xlator_t *this = priv->this; + nsr_txn_id_t * tid = GF_CALLOC(1, sizeof(nsr_txn_id_t), gf_mt_recon_private_t); + tid->txn_id = txn_id; + tid->frame = frame; + INIT_LIST_HEAD(&(tid->list)); + list_add_tail(&(tid->list), &(priv->list)); + recon_main_log (this->name, GF_LOG_INFO, "adding framef or txn id %d into queue \n", txn_id); +} + +// get the frame from the queue given the txn id +static void get_frame(nsr_recon_private_t *priv, + call_frame_t **frame, + uint32_t txn_id) +{ + nsr_txn_id_t *tid = NULL; + xlator_t *this = priv->this; + + list_for_each_entry(tid, &(priv->list), list) { + if (tid->txn_id == txn_id) { + *frame = tid->frame; + recon_main_log (this->name, GF_LOG_INFO, "got frame for txn id %d into queue \n", txn_id); + return; + } + } + recon_main_log (this->name, GF_LOG_INFO, "got no frame for txn id %d into queue \n", txn_id); + GF_ASSERT(0); +} + +// Get the term info for the term number specified +void nsr_recon_libchangelog_get_this_term_info(xlator_t *this, char *bp, int32_t term, nsr_recon_last_term_info_t *lt) +{ + struct stat buf; + char path[PATH_MAX]; + + bzero(lt, sizeof(nsr_recon_last_term_info_t)); + lt->last_term = term; + sprintf(path,"%s/%s%d",bp,"TERM.",term); + if (!stat(path, &buf) && (buf.st_size > 128)) { + if (buf.st_size <= 128) { + lt->first_index = 0; + lt->last_index = 0; + lt->commited_ops = 0; + } + else { + lt->first_index = 1; + lt->last_index = ((buf.st_size - 128)/128) + 1 ; + lt->commited_ops = lt->last_index - lt->first_index + 1; + } + } + recon_main_log (this->name, GF_LOG_INFO, "for term=%d got first_index=%d 
last_index=%d commited_ops=%d\n", + term, lt->first_index, lt->last_index, lt->commited_ops); + return; +} + +// Given the term number, find the last term in the changelogs +void nsr_recon_libchangelog_get_last_term_info(xlator_t *this, char *bp, int32_t term, nsr_recon_last_term_info_t *lt) +{ + uint32_t t = term; + struct stat buf; + char path[PATH_MAX]; + bzero(lt, sizeof(nsr_recon_last_term_info_t)); + while(t) { + // journal file is of type TERM-1.jnl + sprintf(path,"%s/%s%d",bp,"TERM.",t); + if (!stat(path, &buf)) { + nsr_recon_libchangelog_get_this_term_info(this, bp, t, lt); + recon_main_log (this->name, GF_LOG_INFO, "got last term given current term %d as %d\n", term, t); + return; + } + t--; + } + recon_main_log (this->name, GF_LOG_INFO, "got no last term given current term %d \n", term); + + return; +} + +// Return back the frame stored against the txn_id +void nsr_recon_return_back(nsr_recon_private_t *priv, uint32_t txn_id) +{ + call_frame_t *old_frame = NULL; + xlator_t *this = priv->this; + int32_t op_ret = 0; + int32_t op_errno = 0; + + get_frame(priv, &old_frame, txn_id); + if (old_frame) { + recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_writev returns old frame \n"); + // first return the original write for which this ack was sent + STACK_UNWIND_STRICT (writev, old_frame, op_ret, op_errno, NULL, NULL, NULL); + } else { + recon_main_log (this->name, GF_LOG_ERROR, "EIII---nsr_recon_writev cnnot return old frame \n"); + } +} + +typedef enum records_type_t { + fop_gfid_pgfid_oldloc_newloc = 1, + fop_gfid_pgfid_entry = fop_gfid_pgfid_oldloc_newloc + 1, + fop_gfid = fop_gfid_pgfid_entry + 1 , + fop_gfid_offset = fop_gfid + 1, + fop_gfid_offset_len = fop_gfid_offset + 1, +} records_type_t; + +// Get the backend ./glusterfs/xx/xx/<...> path +static void +get_gfid_path(nsr_recon_private_t *priv, char *gfid, char *path) +{ + strcpy(path, priv->base_dir); + strcat(path, "/.glusterfs/"); + strncat(path,gfid,2); + strcat(path,"/"); + 
strncat(path,gfid+2,2); + strcat(path,"/"); + strcat(path,gfid); +} + + +// Get the link to which backend points to +static gf_boolean_t +get_link_using_gfid(nsr_recon_private_t *priv, char *gfid, char *path) +{ + char lp[PATH_MAX]; + xlator_t *this = priv->this; + get_gfid_path(priv,gfid, lp); + if (readlink(lp, path, 255) == -1) { + GF_ASSERT(0); + recon_main_log(priv->this, GF_LOG_ERROR, + "cannot get readlink for %s\n",lp); + return _gf_false; + } + return _gf_true; +} + +// Get the list of changelog records given a term , first and last index. +void nsr_recon_libchangelog_get_records(xlator_t *this, char *bp, int32_t term, uint32_t first, uint32_t last, void *buf) +{ + // do a mmap; seek into the first and read all records till last. + // TBD - right now all records are pseudo holes but mark them as fills. + // TBD - pseudo hole to be implemented when actual fsync gets done on data. + char read_buf[((last - first) + 1) * 128]; + char *rb = &(read_buf[0]); + char path[PATH_MAX]; + int fd; + uint32_t index = 0; + + recon_main_log (this->name, GF_LOG_INFO, + "libchangelog_get_records called for term %d index from %d to %d \n", + term, first, last ); + + sprintf(path,"%s/%s%d",bp,"TERM.",term); + fd = open(path, O_RDONLY); + if (fd != -1) { + char *start = NULL; + nsr_recon_record_details_t * rec = (nsr_recon_record_details_t *)buf; + if (first == 0) + lseek(fd, 128, SEEK_SET); + else + lseek(fd, first * 128, SEEK_SET); + read(fd, rb, (last - first + 1) * 128); + start = rb; + index = first; + do { + recon_main_log (this->name, GF_LOG_INFO, + "libchangelog_get_records start inspecting records at index %d \n", + index ); + if (!strncmp(start, "_PRE_", 5)) { + char op_str[4]; + uint32_t i=0, opcode = 0; + records_type_t type; + + start += 5; + // increment by the NULLs after the PRE + start += 4; + // now we have the opcode + i = 0; + while (*start != 0) { + op_str[i++] = (*start); + start++; + } + op_str[i] = '\0'; + opcode = strtoul(op_str, NULL, 10); + 
recon_main_log (this->name, GF_LOG_ERR, + "libchangelog_get_records: got opcode %d @index %d\n", opcode, index); + if ((opcode == GF_FOP_RENAME)) { + type = fop_gfid_pgfid_oldloc_newloc; + } else if ((opcode == GF_FOP_UNLINK) || + (opcode == GF_FOP_RMDIR) || + (opcode == GF_FOP_LINK) || + (opcode == GF_FOP_MKDIR) || + (opcode == GF_FOP_SYMLINK) || + (opcode == GF_FOP_MKNOD) || + (opcode == GF_FOP_CREATE)) { + type = fop_gfid_pgfid_entry; + } else if ((opcode == GF_FOP_FSETATTR) || + (opcode == GF_FOP_SETATTR) || + (opcode == GF_FOP_FREMOVEXATTR) || + (opcode == GF_FOP_REMOVEXATTR) || + (opcode == GF_FOP_SETXATTR) || + (opcode == GF_FOP_FSETXATTR)) { + type = fop_gfid; + } else if ((opcode == GF_FOP_TRUNCATE) || + (opcode == GF_FOP_FTRUNCATE)) { + type = fop_gfid_offset; + } else if (opcode == GF_FOP_WRITE) { + type = fop_gfid_offset_len; + } else { + recon_main_log (this->name, GF_LOG_ERR, + "libchangelog_get_records:got no proper opcode %d @index %d\n", + opcode, index); + //GF_ASSERT(0); + // make this as a hole. + // TBD - check this logic later. maybe we should raise alarm here because + // this means that changelog is corrupted. We are not handling changelog + // corruptions as of now. + rec->type = NSR_LOG_HOLE; + goto finish; + } + // TBD - handle psuedo holes once that logic is in. + rec->type = NSR_LOG_FILL; + recon_main_log (this->name, GF_LOG_ERR, + "libchangelog_get_records:got type %d at index %d \n", + rec->type, index); + rec->op = opcode; + + // Now get the gfid and parse it + // before that increment the pointer + start++; + for (i=0; i < 36; i++) { + rec->gfid[i] = (*start); + start++; + } + rec->gfid[i] = '\0'; + + if (opcode == GF_FOP_SYMLINK) { + // the symlink would have been removed. Hence ignore this. + // TBD - have an uniform error policy in case of such cases. + // Right now we are handling some on the source and some on the destination. 
+ if(get_link_using_gfid(this->private, rec->gfid, rec->link_path) == _gf_false) { + rec->type = NSR_LOG_HOLE; + goto finish; + } + } + + GF_ASSERT(*start == 0); + start ++; + + i = 0; + // If type is fop_gfid_offset+_len, get offset + if ((type == fop_gfid_offset) || (type == fop_gfid_offset_len)) { + char offset_str[128]; + while(*start != 0) { + offset_str[i++] = *start; + start ++; + } + offset_str[i] = '\0'; + // get over the 0 + start++; + rec->offset = strtoul(offset_str, NULL, 10); + recon_main_log (this->name, GF_LOG_ERR, + "libchangelog_get_records:got offset %d @index %d \n", rec->offset, index); + + } + i = 0; + if (type == fop_gfid_offset_len) { + char len_str[128]; + while(*start != 0) { + len_str[i++] = *start; + start ++; + } + len_str[i] = '\0'; + // get over the 0 + start++; + rec->len = strtoul(len_str, NULL, 10); + recon_main_log (this->name, GF_LOG_ERR, + "libchangelog_get_records:got length %d @index %d \n", rec->len, index); + } + i = 0; + if (type == fop_gfid_pgfid_entry) { + // first get the gfid and then the path + for (i=0; i < 36; i++) { + rec->pargfid[i] = (*start); + start++; + } + rec->pargfid[i] = '\0'; + GF_ASSERT(*start == '/'); + start ++; + + i = 0; + while(*start != 0) { + rec->entry[i++] = *start; + start ++; + } + rec->entry[i] = '\0'; + // get over the 0 + start++; + recon_main_log (this->name, GF_LOG_ERR, + "libchangelog_get_records:got entry %s @index %d \n", rec->entry, index); + + } + i = 0; + if (type == fop_gfid_pgfid_oldloc_newloc) { + + // first get the source and then the destination + // source stuff gets stored in pargfid/entry + for (i=0; i < 36; i++) { + rec->pargfid[i] = (*start); + start++; + } + rec->pargfid[i] = '\0'; + GF_ASSERT(*start == '/'); + start ++; + + i=0; + while(*start != 0) { + rec->entry[i++] = *start; + start ++; + } + rec->entry[i] = '\0'; + // get over the 0 + start++; + + // dst stuff gets stored in gfid/newloc + for (i=0; i < 36; i++) { + rec->gfid[i] = (*start); + start++; + } + 
rec->gfid[i] = '\0'; + GF_ASSERT(*start == '/'); + start ++; + i = 0; + while(*start != 0) { + rec->newloc[i++] = *start; + start ++; + } + rec->newloc[i] = '\0'; + // get over the 0 + start++; + + } + ENDIAN_CONVERSION_RD((*rec), _gf_false); //htonl + } +finish: + if (index == last) + break; + index++; + rb += 128; + start = rb; + rec++; + } while(1); + } + close(fd); + + recon_main_log (this->name, GF_LOG_INFO, + "libchangelog_get_records finsihed inspecting records for term %d \n", + term); + return; +} + +int32_t +nsr_recon_open (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, fd_t *fd, dict_t *xdata) +{ + int32_t op_ret = 0; + int32_t op_errno = 0; + nsr_recon_fd_t *rfd = NULL; + + recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_open called for path %s \n",loc->path ); + rfd = GF_CALLOC (1, sizeof (*rfd), gf_mt_recon_private_t); + if (!rfd) { + op_ret = -1; + op_errno = ENOMEM; + } + + op_ret = fd_ctx_set (fd, this, (uint64_t)(long)rfd); + if (op_ret) { + op_ret = -1; + op_errno = EINVAL; + } + recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_open returns with %d for path %s \n",op_ret,loc->path ); + STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, NULL); + return 0; +} + +int32_t +nsr_recon_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, + uint32_t flags, struct iobref *iobref, dict_t *xdata) +{ + nsr_recon_fd_t *rfd = NULL; + nsr_recon_private_t *priv = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + int32_t ret = 0; + + ret = this_fd_ctx_get (fd, this, &rfd); + if (ret < 0) { + return -1; + } + priv = (nsr_recon_private_t *)this->private; + + recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_writev called for offset %d \n",(unsigned int)offset ); + GF_ASSERT(count == 1); + switch (offset) { + // gets called to return back + case nsr_recon_xlator_sector_0: + { + char c[4]; + uint32_t txn_id; + + recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_writev clled 
to return back \n"); + memcpy((void *)c, (void *)vector[0].iov_base, 4); + txn_id = ntohl(atoi(c)); + nsr_recon_return_back(priv, txn_id); + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, + NULL, NULL, NULL); + break; + } + // client(brick, leader) writes the role of the node + case nsr_recon_xlator_sector_1 : + { + nsr_recon_role_t rr; + memcpy((void *)&rr, (void *)vector[0].iov_base, sizeof(rr)); + ENDIAN_CONVERSION_RR(rr, _gf_true); //ntohl + + recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_writev called to set role %d\n", rr.role); + if ((rr.role != leader) && + (rr.role != reconciliator) && + (rr.role != resolutor)) { + recon_main_log (this->name, GF_LOG_ERROR, + "EIII---nsr_recon_writev cannot set state \n"); + STACK_UNWIND_STRICT (writev, frame, -1, op_errno, + NULL, NULL, NULL); + } + + GF_ASSERT(rr.num <= MAXIMUM_REPLICA_STRENGTH); + + // Store the stack frame so that when the actual job gets finished + // we send the response back to the brick. + if (nsr_recon_driver_set_role(priv->driver_thread_context, + &rr, + priv->txn_id) == _gf_false) { + recon_main_log (this->name, GF_LOG_ERROR, + "nsr_recon_writev set_role - cannot seem to set role \n"); + STACK_UNWIND_STRICT (writev, frame, -1, op_errno, + NULL, NULL, NULL); + } else { + uint32_t old = priv->txn_id; + atomic_cmpxchg(&priv->txn_id, old,old+1); + put_frame(priv, frame, old); + recon_main_log (this->name, GF_LOG_INFO, + "nsr_recon_writev set_role - set role succesfully \n"); + } + break; + } + // client(reconciliator) writes how much it needs for the read + case nsr_recon_xlator_sector_2 : + { + nsr_recon_log_info_t li; + memcpy((void *)&li, (void *)vector[0].iov_base, sizeof(li)); + ENDIAN_CONVERSION_LI(li, _gf_true); //ntohl + + recon_main_log (this->name, GF_LOG_INFO, + "nsr_recon_writev - setting term info for reconcilation info. 
term=%d, first_index=%d,start_index=%d \n", + li.term, li.first_index, li.last_index); + rfd->term = li.term; + rfd->last_index = li.last_index; + rfd->first_index = li.first_index; + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, + NULL, NULL, NULL); + break; + } + // client(reconciliator) writes term for which it needs info + case nsr_recon_xlator_sector_3 : + { + int32_t term; + + memcpy((void *)&term, (void *)vector[0].iov_base, sizeof(term)); + term = ntohl(term); //ntohl + recon_main_log (this->name, GF_LOG_INFO, + "nsr_recon_writev - setting term info for term info. term=%d\n", + term); + rfd->term = term; + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, + NULL, NULL, NULL); + break; + } + // client(reconciliator) writes current term so that it gets last term info later + case nsr_recon_xlator_sector_4 : + { + int32_t term; + + memcpy((void *)&term, (void *)vector[0].iov_base, sizeof(term)); + term = ntohl(term); //ntohl + recon_main_log (this->name, GF_LOG_INFO, + "nsr_recon_writev - setting term info for last term info given current term=%d\n", + term); + rfd->term = term; + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, + NULL, NULL, NULL); + break; + } + } + + return 0; +} + +int +nsr_recon_readv (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata) +{ + nsr_recon_fd_t *rfd = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + // copied stuff from quick-read.c and posix.c + struct iobuf *iobuf = NULL; + struct iobref *iobref = NULL; + struct iovec iov = {0, }; + int32_t ret = -1; + nsr_recon_private_t *priv = NULL; + + iobuf = iobuf_get2 (this->ctx->iobuf_pool, op_ret); + if (!iobuf) { + op_errno = ENOMEM; + goto out; + } + + iobref = iobref_new (); + if (!iobref) { + op_errno = ENOMEM; + goto out; + } + + iobref_add (iobref, iobuf); + + ret = this_fd_ctx_get (fd, this, &rfd); + if (ret < 0) { + op_errno = -ret; + goto out; + } + priv = (nsr_recon_private_t *)this->private; + + 
recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_readv called for offset %d \n",(unsigned int)offset ); + switch (offset) { + // client(leader) reads from here to get info for this term on this node + // invole libchagelog to get the information + case nsr_recon_xlator_sector_3 : + { + nsr_recon_last_term_info_t lt; + GF_ASSERT(size == sizeof(lt)); + nsr_recon_libchangelog_get_this_term_info(this,priv->changelog_base_path, rfd->term, <); + recon_main_log (this->name, GF_LOG_INFO, + "nsr_recon_readv - getting term info for term=%d, ops=%d, first=%d, last=%d\n", + rfd->term, lt.commited_ops, lt.first_index, lt.last_index); + ENDIAN_CONVERSION_LT(lt, _gf_false); //htonl + memcpy(iobuf->ptr, <, size); + goto out; + } + // client(reconciliator) reads individual record information + case nsr_recon_xlator_sector_2 : + { + uint32_t num = (rfd->last_index - rfd->first_index + 1); + recon_main_log (this->name, GF_LOG_INFO, + "nsr_recon_readv - expected size %lu got size %lu\n", + (num * sizeof(nsr_recon_record_details_t)), size); + + GF_ASSERT(size == (num * sizeof(nsr_recon_record_details_t))); + recon_main_log (this->name, GF_LOG_INFO, + "nsr_recon_readv - getting records for term=%d from %d to %d\n", + rfd->term, rfd->first_index, rfd->last_index); + nsr_recon_libchangelog_get_records(this, priv->changelog_base_path, + rfd->term, rfd->first_index, rfd->last_index, iobuf->ptr); + goto out; + } + // read last term info + case nsr_recon_xlator_sector_4 : + { + nsr_recon_last_term_info_t lt; + GF_ASSERT(size == sizeof(lt)); + nsr_recon_libchangelog_get_last_term_info(this, priv->changelog_base_path, rfd->term, <); + recon_main_log (this->name, GF_LOG_INFO, + "nsr_recon_readv - getting last term info given current term=%d. 
last term = %d ops=%d, first=%d, last=%d\n", + rfd->term, lt.last_term, lt.commited_ops, lt.first_index, lt.last_index); + ENDIAN_CONVERSION_LT(lt, _gf_false); //htonl + memcpy(iobuf->ptr, <, size); + goto out; + } + } + +out: + if (op_errno == 0) { + iov.iov_base = iobuf->ptr; + ret = iov.iov_len = size; + } + + STACK_UNWIND_STRICT (readv, frame, ret, op_errno, &iov, 1, NULL, iobref , NULL); + + if (iobref) + iobref_unref (iobref); + if (iobuf) + iobuf_unref (iobuf); + return 0; +} + +int +nsr_recon_lookup (call_frame_t *frame, xlator_t *this, + loc_t *loc, dict_t *xdata) +{ + struct iatt buf = {0, }; + // dirty hack to set root as regular but seems to work. + buf.ia_type = IA_IFREG; + recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_lookup called \n"); + + STACK_UNWIND_STRICT (lookup, frame, 0, 0, this->itable->root, &buf, NULL, NULL); + return 0; +} + + +int32_t +nsr_recon_flush (call_frame_t *frame, xlator_t *this, + fd_t *fd, dict_t *xdata) +{ + STACK_UNWIND_STRICT (flush, frame, 0, 0, NULL); + return 0; +} + +int32_t +init (xlator_t *this) +{ + nsr_recon_private_t *priv = NULL; + char *local, *members; + unsigned int i=0; + + priv = GF_CALLOC (1, sizeof (*priv), gf_mt_recon_private_t); + if (!priv) { + recon_main_log (this->name, GF_LOG_ERROR, + "priv allocation error\n"); + return -1; + } + GF_OPTION_INIT ("replica-group-size", priv->replica_group_size, uint32, err); + GF_OPTION_INIT ("vol-name", priv->volname, str, err); + if (!priv->volname) { + recon_main_log (this->name, GF_LOG_ERROR, + "missing volname option (required)"); + return -1; + } + GF_OPTION_INIT ("changelog-dir", priv->changelog_base_path, str, err); + if (!priv->changelog_base_path) { + recon_main_log (this->name, GF_LOG_ERROR, + "missing changelog directory option (required)"); + return -1; + } + GF_OPTION_INIT ("base-dir", priv->base_dir, str, err); + if (!priv->base_dir) { + recon_main_log (this->name, GF_LOG_ERROR, + "missing brick base directory option (required)"); + return -1; + } 
+ GF_OPTION_INIT ("replica-group-members", members, str, err); + if (!members) { + recon_main_log (this->name, GF_LOG_ERROR, + "missing membership option (required)"); + return -1; + } + GF_OPTION_INIT ("local-member", local, str, err); + if (!local) { + recon_main_log (this->name, GF_LOG_ERROR, + "missing local member option (required)"); + return -1; + } + + priv->replica_group_members = GF_CALLOC (priv->replica_group_size, + sizeof(char *), + gf_mt_recon_private_t); + priv->replica_group_members[0] = GF_CALLOC (1, + strlen(local), + gf_mt_recon_private_t); + if (!priv->replica_group_members || !(priv->replica_group_members[0])) { + recon_main_log (this->name, GF_LOG_ERROR, + "str allocation error\n"); + return -1; + } + strcpy(priv->replica_group_members[0], local); + for (i=1; i < priv->replica_group_size; i++) { + char *member; + if (i == 1) + member = strtok(members, ","); + else + member = strtok(NULL, ","); + priv->replica_group_members[i] = GF_CALLOC (1, strlen(member) + 1, gf_mt_recon_private_t); + if (!priv->replica_group_members[i]) { + recon_main_log (this->name, GF_LOG_ERROR, + "str allocation error\n"); + return -1; + } + strcpy(priv->replica_group_members[i], member); + } + + + priv->this = this; + this->private = (void *)priv; + + recon_main_log (this->name, GF_LOG_INFO, "creating reconciliation driver \n"); + + if (pthread_create(&priv->thread_id, NULL, nsr_reconciliation_driver, priv)) { + recon_main_log (this->name, GF_LOG_ERROR, + "pthread creation error \n"); + return -1; + } + + INIT_LIST_HEAD(&(priv->list)); + + + return 0; + +err: + return -1; +} + + +void +fini (xlator_t *this) +{ + nsr_recon_private_t *priv = NULL; + void *ret = NULL; + + priv = (nsr_recon_private_t *)this->private; + + pthread_cancel(priv->thread_id); + pthread_join(priv->thread_id, &ret); +} + + +struct xlator_fops fops = { + .open = nsr_recon_open, + .readv = nsr_recon_readv, + .writev = nsr_recon_writev, + .lookup = nsr_recon_lookup, + .flush = nsr_recon_flush +}; + 
+struct xlator_cbks cbks = { +}; + +struct volume_options options[] = { + { .key = {"replica-group-size"}, + .type = GF_OPTION_TYPE_INT, + .min = 2, + .max = INT_MAX, + .default_value = "2", + .description = "Number of bricks in replica group. can be derived but putting it here for testing." + }, + { + .key = {"vol-name"}, + .type = GF_OPTION_TYPE_STR, + .description = "volume name" + }, + { + .key = {"local-member"}, + .type = GF_OPTION_TYPE_STR, + .description = "member(brick) for which this translator is responsible." + }, + { + .key = {"replica-group-members"}, + .type = GF_OPTION_TYPE_STR, + .description = "Comma seperated member names other than local." + }, + { + .key = {"changelog-dir"}, + .type = GF_OPTION_TYPE_STR, + .description = "Base directory where per term changelogs are maintained." + }, + { + .key = {"base-dir"}, + .type = GF_OPTION_TYPE_STR, + .description = "Base directory for this brick. This should go away once we fix gfid based lookups" + }, + { .key = {NULL} }, +}; diff --git a/xlators/cluster/nsr-recon/src/recon_xlator.h b/xlators/cluster/nsr-recon/src/recon_xlator.h new file mode 100644 index 000000000..c0f1e2145 --- /dev/null +++ b/xlators/cluster/nsr-recon/src/recon_xlator.h @@ -0,0 +1,78 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#ifndef __RECON_XLATOR_H__ +#define __RECON_XLATOR_H__ + +#include +#include + +enum gf_dht_mem_types_ { + gf_mt_recon_private_t = gf_common_mt_end + 1, +}; + +enum nsr_recon_xlator_sector_t { + nsr_recon_xlator_sector_0 = 0, // to report back the status of given transaction ids + nsr_recon_xlator_sector_1 = 512, // to write here information about leadership changes from the brick + nsr_recon_xlator_sector_2 = (512 * 2), // to write here individual roles and wait for that role to be done + nsr_recon_xlator_sector_3 = (512 *3), // read from here to get term info for given term + nsr_recon_xlator_sector_4 = (512 * 4), // read from here to get last term info +}; + + +typedef struct _nsr_recon_private_s { + xlator_t *this; //back pointer + unsigned int replica_group_size; // number of static members of replica group + char **replica_group_members; // replica group members (including itself in first slot) + pthread_t thread_id; // driver thread id + nsr_recon_driver_ctx_t *driver_thread_context; //driver thread context + unsigned int outstanding; // for communicating with driver thread + call_frame_t *frame; // old frame that is pending (just one as of now) + struct list_head list; + char *volname; + uint32_t txn_id; + char *changelog_base_path; + char *base_dir; +#ifdef NSR_DEBUG + uint32_t recon_main_log_fd; +#endif +} nsr_recon_private_t; + +#define atomic_cmpxchg __sync_val_compare_and_swap + +/* + * REVIEW + * Ideally, use gf_log like everyone else. Failing that, at least put the logs + * with all the others in /var/log instead of /tmp. + */ +#ifdef NSR_DEBUG +#define recon_main_log(dom, levl, fmt...) 
\ + { \ + nsr_recon_private_t *priv = this->private; \ + char c[255]; \ + if (!priv->recon_main_log_fd) { \ + mkdir("/tmp/nsr-logs/", 0777); \ + priv->recon_main_log_fd = open("/tmp/nsr-logs/recon-main-log", O_RDWR|O_CREAT|O_TRUNC); \ + } \ + sprintf(c, fmt); \ + write(priv->recon_main_log_fd, c, strlen(c)); \ + } +#else +#define recon_main_log(dom, levl, fmt...) gf_log(dom, levl, fmt) +#endif + + +void nsr_recon_libchangelog_get_this_term_info(xlator_t *this, char *bp, int32_t term, nsr_recon_last_term_info_t *lt); +void nsr_recon_libchangelog_get_last_term_info(xlator_t *this, char *bp, int32_t term, nsr_recon_last_term_info_t *lt); +void nsr_recon_return_back(nsr_recon_private_t *priv, uint32_t term_id); +void nsr_recon_libchangelog_get_records(xlator_t *this, char *bp, int32_t term, uint32_t first, uint32_t last, void *buf); + + +#endif /* #ifndef __RECON_XLATOR_H__ */ diff --git a/xlators/cluster/nsr-server/Makefile.am b/xlators/cluster/nsr-server/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/xlators/cluster/nsr-server/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/cluster/nsr-server/src/Makefile.am b/xlators/cluster/nsr-server/src/Makefile.am new file mode 100644 index 000000000..df0d68539 --- /dev/null +++ b/xlators/cluster/nsr-server/src/Makefile.am @@ -0,0 +1,36 @@ +python_PYTHON = codegen.py gen-fops.py + +xlator_LTLIBRARIES = nsr.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster + +nsr_la_LDFLAGS = -module -avoid-version -lgfapi -lcurl +nsr_la_SOURCES = nsr.c leader.c etcd-api.c \ + yajl.c yajl_alloc.c yajl_buf.c yajl_encode.c yajl_gen.c \ + yajl_lex.c yajl_parser.c yajl_tree.c yajl_version.c + +nsr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = nsr-internal.h etcd-api.h all-templates.c \ + yajl_alloc.h yajl_buf.h yajl_bytestack.h yajl_encode.h \ + yajl_lex.h yajl_parser.h yajl/yajl_common.h yajl/yajl_gen.h \ + yajl/yajl_parse.h 
yajl/yajl_tree.h yajl/yajl_version.h \ + $(top_srcdir)/xlators/lib/src/libxlator.h \ + $(top_srcdir)/glusterfsd/src/glusterfsd.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) \ + -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \ + -I$(top_srcdir)/rpc/rpc-lib/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +XLATOR_HEADER = $(top_srcdir)/libglusterfs/src/xlator.h + +CLEANFILES = nsr-cg.c + +nsr-cg.c: gen-fops.py codegen.py $(XLATOR_HEADER) all-templates.c + $(PYTHON) ./gen-fops.py $(XLATOR_HEADER) all-templates.c > $@ + +nsr.lo: nsr-cg.c + +uninstall-local: + rm -f $(DESTDIR)$(xlatordir)/nsr.so diff --git a/xlators/cluster/nsr-server/src/all-templates.c b/xlators/cluster/nsr-server/src/all-templates.c new file mode 100644 index 000000000..541653029 --- /dev/null +++ b/xlators/cluster/nsr-server/src/all-templates.c @@ -0,0 +1,299 @@ +/* + * You can put anything here - it doesn't even have to be a comment - and it + * will be ignored until we reach the first template-name comment. + */ + + +// template-name read-fop +$TYPE$ +nsr_$NAME$ (call_frame_t *frame, xlator_t *this, + $ARGS_LONG$) +{ + nsr_private_t *priv = this->private; + gf_boolean_t in_recon = _gf_false; + int32_t recon_term, recon_index; + + // allow reads during reconciliation + // TBD: allow "dirty" reads on non-leaders + if (xdata && + (dict_get_int32(xdata, RECON_TERM_XATTR, &recon_term) == 0) && + (dict_get_int32(xdata, RECON_INDEX_XATTR, &recon_index) == 0)) { + in_recon = _gf_true; + } + + if ((!priv->leader) && (in_recon == _gf_false)) { + goto err; + } + + STACK_WIND (frame, default_$NAME$_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->$NAME$, + $ARGS_SHORT$); + return 0; + +err: + STACK_UNWIND_STRICT ($NAME$, frame, -1, EREMOTE, + $DEFAULTS$); + return 0; +} + +// template-name read-dispatch +/* No "dispatch" function needed for $NAME$ */ + +// template-name read-fan-in +/* No "fan-in" function needed for $NAME$ */ + +// template-name read-continue +/* No "continue" function needed for $NAME$ */ + 
+// template-name read-complete +/* No "complete" function needed for $NAME$ */ + +// template-name write-fop +$TYPE$ +nsr_$NAME$ (call_frame_t *frame, xlator_t *this, + $ARGS_LONG$) +{ + nsr_local_t *local = NULL; + nsr_private_t *priv = this->private; + int op_errno = ENOMEM; + int from_leader; + int from_recon; + + local = mem_get0(this->local_pool); + if (!local) { + goto err; + } +#if defined(NSR_CG_NEED_FD) + local->fd = fd_ref(fd); +#else + local->fd = NULL; +#endif + INIT_LIST_HEAD(&local->qlinks); + frame->local = local; + + if (xdata) { + from_leader = !!dict_get(xdata,NSR_TERM_XATTR); + from_recon = !!dict_get(xdata,RECON_TERM_XATTR) + && !!dict_get(xdata,RECON_INDEX_XATTR); + } + else { + from_leader = from_recon = _gf_false; + } + + // follower/recon path + // just send it to local node + if (from_leader || from_recon) { + STACK_WIND (frame, nsr_$NAME$_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->$NAME$, + $ARGS_SHORT$); + return 0; + } + + if (!priv->leader || priv->fence_io) { + op_errno = EREMOTE; + goto err; + } + + if (!xdata) { + xdata = dict_new(); + if (!xdata) { + gf_log (this->name, GF_LOG_ERROR, + "failed to allocate xdata"); + goto err; + } + } + + if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set nsr-term"); + goto err; + } + + local->stub = fop_$NAME$_stub (frame,nsr_$NAME$_continue, + $ARGS_SHORT$); + if (!local->stub) { + goto err; + } + +#if defined(NSR_CG_QUEUE) + nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd->inode); + if (!ictx) { + op_errno = EIO; + goto err; + } + LOCK(&ictx->lock); + if (ictx->active) { + gf_log (this->name, GF_LOG_DEBUG, + "queuing request due to conflict"); + /* + * TBD: enqueue only for real conflict + * + * Currently we just act like all writes are in + * conflict with one another. What we should really do + * is check the active/pending queues and defer only if + * there's a conflict there. 
+ * + * It's important to check the pending queue because we + * might have an active request X which conflicts with + * a pending request Y, and this request Z might + * conflict with Y but not X. If we checked only the + * active queue then Z could jump ahead of Y, which + * would be incorrect. + */ + local->qstub = fop_$NAME$_stub (frame, + nsr_$NAME$_dispatch, + $ARGS_SHORT$); + if (!local->qstub) { + UNLOCK(&ictx->lock); + goto err; + } + list_add_tail(&local->qlinks,&ictx->pqueue); + ++(ictx->pending); + UNLOCK(&ictx->lock); + return 0; + } + else { + list_add_tail(&local->qlinks,&ictx->aqueue); + ++(ictx->active); + } + UNLOCK(&ictx->lock); +#endif + + return nsr_$NAME$_dispatch (frame, this, $ARGS_SHORT$); + +err: + if (local) { + if (local->stub) { + call_stub_destroy(local->stub); + } + if (local->qstub) { + call_stub_destroy(local->qstub); + } + if (local->fd) { + fd_unref(local->fd); + } + mem_put(local); + } + STACK_UNWIND_STRICT ($NAME$, frame, -1, op_errno, + $DEFAULTS$); + return 0; +} + +// template-name write-dispatch +$TYPE$ +nsr_$NAME$_dispatch (call_frame_t *frame, xlator_t *this, + $ARGS_LONG$) +{ + nsr_local_t *local = frame->local; + nsr_private_t *priv = this->private; + xlator_list_t *trav; + + /* + * TBD: unblock pending request(s) if we fail after this point but + * before we get to nsr_$NAME$_complete (where that code currently + * resides). 
+ */ + + local->call_count = priv->n_children - 1; + for (trav = this->children->next; trav; trav = trav->next) { + STACK_WIND (frame, nsr_$NAME$_fan_in, + trav->xlator, trav->xlator->fops->$NAME$, + $ARGS_SHORT$); + } + + // TBD: variable Issue count + return 0; +} + +// template-name write-fan-in +$TYPE$ +nsr_$NAME$_fan_in (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + $ARGS_LONG$) +{ + nsr_local_t *local = frame->local; + uint8_t call_count; + + gf_log (this->name, GF_LOG_TRACE, + "op_ret = %d, op_errno = %d\n", op_ret, op_errno); + + LOCK(&frame->lock); + call_count = --(local->call_count); + UNLOCK(&frame->lock); + + // TBD: variable Completion count + if (call_count == 0) { + call_resume(local->stub); + } + + return 0; +} + +// template-name write-continue +$TYPE$ +nsr_$NAME$_continue (call_frame_t *frame, xlator_t *this, + $ARGS_LONG$) +{ + STACK_WIND (frame, nsr_$NAME$_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->$NAME$, + $ARGS_SHORT$); + return 0; +} + +// template-name write-complete +$TYPE$ +nsr_$NAME$_complete (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + $ARGS_LONG$) +{ +#if defined(NSR_CG_NEED_FD) + nsr_local_t *local = frame->local; +#endif + +#if defined(NSR_CG_QUEUE) + nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd->inode); + nsr_local_t *next; + if (ictx) { + LOCK(&ictx->lock); + list_del(&local->qlinks); + if (ictx->pending) { + /* + * TBD: dequeue *all* non-conflicting reqs + * + * With the stub implementation there can only + * be one request active at a time (zero here) + * so it's not an issue. In a real + * implementation there might still be other + * active requests to check against, and + * multiple pending requests that could + * continue. 
+ */ + gf_log (this->name, GF_LOG_DEBUG, + "unblocking next request"); + --(ictx->pending); + next = list_entry (ictx->pqueue.next, + nsr_local_t, qlinks); + list_del(&next->qlinks); + list_add_tail(&next->qlinks,&ictx->aqueue); + call_resume(next->qstub); + } + else { + --(ictx->active); + } + UNLOCK(&ictx->lock); + } +#endif + +#if defined(NSR_CG_FSYNC) + nsr_mark_fd_dirty(this,local); +#endif + +#if defined(NSR_CG_NEED_FD) + fd_unref(local->fd); +#endif + + STACK_UNWIND_STRICT ($NAME$, frame, op_ret, op_errno, + $ARGS_SHORT$); + return 0; + +} diff --git a/xlators/cluster/nsr-server/src/codegen.py b/xlators/cluster/nsr-server/src/codegen.py new file mode 100644 index 000000000..709f5662f --- /dev/null +++ b/xlators/cluster/nsr-server/src/codegen.py @@ -0,0 +1,174 @@ +#!/usr/bin/python + +# This module lets us auto-generate boilerplate versions of fops and cbks, +# both for the client side and (eventually) on the server side as well. This +# allows us to implement common logic (e.g. leader fan-out and sequencing) +# once, without all the problems that come with copying and pasting the same +# code into dozens of functions (or failing to). +# +# I've tried to make this code pretty generic, since it's already likely to +# be used multiple ways within NSR. Really, we should use something like this +# to generate defaults.[ch] as well, to avoid the same sorts of mismatches +# that we've already seen and to which this approach makes NSR immune. That +# would require using something other than defaults.h as the input, but that +# format could be even simpler so that's a good thing too. + + +import re +import sys + +decl_re = re.compile("([a-z0-9_]+)$") +tmpl_re = re.compile("// template-name (.*)") + +class CodeGenerator: + + def __init__ (self): + self.decls = {} + self.skip = 0 + self.templates = {} + self.make_defaults = self._make_defaults + + # Redefine this to preprocess the name in a declaration, e.g. 
+ # fop_lookup_t => nsrc_lookup + def munge_name (self, orig): + return orig + + # By default, this will convert the argument string into a sequence of + # (type, name) tuples minus the first self.skip (default zero) arguments. + # You can redefine it to skip the conversion, do a different conversion, + # or rearrange the arguments however you like. + def munge_args (self, orig): + args = [] + for decl in orig.strip("(); ").split(","): + m = decl_re.search(decl) + if m: + args.append((m.group(1),decl[:m.start(1)].strip())) + else: + raise RuntimeError("can't split %s into type+name"%decl) + return args[self.skip:] + + def add_decl (self, fname, ftype, fargs): + self.decls[self.munge_name(fname)] = (ftype, self.munge_args(fargs)) + + def parse_decls (self, path, pattern): + regex = re.compile(pattern) + f = open(path,"r") + have_decl = False + while True: + line = f.readline() + if not line: + break + m = regex.search(line) + if m: + if have_decl: + self.add_decl(f_name,f_type,f_args) + f_name = m.group(2) + f_type = m.group(1) + f_args = line[m.end(0):-1].strip() + if f_args.rfind(")") >= 0: + self.add_decl(f_name,f_type,f_args) + else: + have_decl = True + elif have_decl: + if line.strip() == "": + self.add_decl(f_name,f_type,f_args) + have_decl = False + else: + f_args += " " + f_args += line[:-1].strip() + if have_decl: + self.add_decl(f_name,f_type,f_args) + + # Legacy function (yeah, already) to load a single template. If you're + # using multiple templates, you're better off loading them all from one + # file using load_templates (note plural) instead. + def load_template (self, name, path): + self.templates[name] = open(path,"r").readlines() + + # Load multiple templates. Each is introduced by a special comment of + # the form + # + # // template-name xyz + # + # One side effect is that the block before the first such comment will be + # ignored. 
This seems like it might be useful some day so I'll leave it + # in, but if people trip over it maybe it will change. + # + # It is recommended to define templates in expected execution order, to + # make the result more readable than the inverted order (e.g. callback + # then fop) common in the rest of our code. + def load_templates (self, path): + t_name = None + for line in open(path,"r").readlines(): + if not line: + break + m = tmpl_re.match(line) + if m: + if t_name: + self.templates[t_name] = t_contents + t_name = m.group(1).strip() + t_contents = [] + elif t_name: + t_contents.append(line) + if t_name: + self.templates[t_name] = t_contents + + # Emit the template, with the following expansions: + # + # $NAME$ => function name (as passed in) + # $TYPE$ => function return value + # $ARGS_SHORT$ => argument list, including types + # $ARGS_LONG$ => argument list, *not* including types + # $DEFAULTS$ => default callback args (see below) + # + # The $DEFAULTS$ substitution is for the case where a fop (which has one + # set of arguments) needs to signal an error via STACK_UNWIND (which + # requires a different set of arguments). In this case we look up the + # argument list for the opposite direction, using self.make_defaults which + # the user must explicitly set to the method for the opposite direction. + # If an argument is a pointer, we replace it with NULL; otherwise we + # replace it with zero. It's a hack, but it's the only thing we do that + # doesn't require specific knowledge of our environment and the specific + # call we're handling. If this doesn't suffice, we'll have to add + # something like $ARG0$ which can be passed in for specific cases. 
+ def emit (self, f_name, tmpl): + args = self.decls[f_name][1] + zipper = lambda x: x[0] + a_short = ", ".join(map(zipper,args)) + zipper = lambda x: x[1] + " " + x[0] + a_long = ", ".join(map(zipper,args)) + for line in self.templates[tmpl]: + line = line.replace("$NAME$",f_name) + line = line.replace("$TYPE$",self.decls[f_name][0]) + line = line.replace("$ARGS_SHORT$",a_short) + line = line.replace("$ARGS_LONG$",a_long) + line = line.replace("$DEFAULTS$",self.make_defaults(f_name)) + print(line.rstrip()) + + def _make_defaults (self, f_name): + result = [] + for arg in self.decls[f_name][1]: + if arg[1][-1] == "*": + result.append("NULL") + else: + result.append("0") + return ", ".join(result) + +if __name__ == "__main__": + type_re = "([a-z_0-9]+)" + name_re = "\(\*fop_([a-z0-9]+)_t\)" + full_re = type_re + " *" + name_re + cg = CodeGenerator() + cg.skip = 2 + cg.parse_decls(sys.argv[1],full_re) + """ + for k, v in cg.decls.iteritems(): + print("=== %s" % k) + print(" return type %s" % v[0]) + for arg in v[1]: + print(" arg %s (type %s)" % arg) + """ + cg.load_template("fop",sys.argv[2]) + cg.emit("lookup","fop") + cg.emit("rename","fop") + cg.emit("setxattr","fop") diff --git a/xlators/cluster/nsr-server/src/codegen.pyc b/xlators/cluster/nsr-server/src/codegen.pyc new file mode 100644 index 000000000..388b517df Binary files /dev/null and b/xlators/cluster/nsr-server/src/codegen.pyc differ diff --git a/xlators/cluster/nsr-server/src/etcd-api.c b/xlators/cluster/nsr-server/src/etcd-api.c new file mode 100644 index 000000000..a46a40745 --- /dev/null +++ b/xlators/cluster/nsr-server/src/etcd-api.c @@ -0,0 +1,586 @@ +/* + * Copyright (c) 2013, Red Hat + * All rights reserved. + + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. 
Redistributions in binary + * form must reproduce the above copyright notice, this list of conditions and + * the following disclaimer in the documentation and/or other materials + * provided with the distribution. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include + + +#include "etcd-api.h" + +#define DEFAULT_ETCD_PORT 4001 +#define SL_DELIM "\n\r\t ,;" + +/* + * This shuts up gcc, which complains about "null argument where non-null + * required" when we pass the result to strdup. + */ +#define MY_YAJL_GET_STRING(v) (YAJL_IS_STRING(v) ? 
(v)->u.string : "fubar") + +typedef struct { + etcd_server *servers; +} _etcd_session; + +typedef struct { + char *key; + char *value; + int *index_in; /* pointer so NULL can be special */ + int index_out; /* NULL would be meaningless */ +} etcd_watch_t; + +typedef size_t curl_callback_t (void *, size_t, size_t, void *); + +int g_inited = 0; + +#if defined(DEBUG) +void +print_curl_error (char *intro, CURLcode res) +{ + printf("%s: %s\n",intro,curl_easy_strerror(res)); +} +#else +#define print_curl_error(intro,res) +#endif + + +etcd_session +etcd_open (etcd_server *server_list) +{ + _etcd_session *this; + + if (!g_inited) { + curl_global_init(CURL_GLOBAL_ALL); + g_inited = 1; + } + + this = malloc(sizeof(*this)); + if (!this) { + return NULL; + } + + /* + * Some day we'll set up more persistent connections, and keep track + * (via redirects) of which server is leader so that we can always + * try it first. For now we just push that to the individual request + * functions, which do the most brain-dead thing that can work. + */ + + this->servers = server_list; + return this; +} + + +void +etcd_close (etcd_session this) +{ + free(this); +} + + +size_t +parse_get_response (void *ptr, size_t size, size_t nmemb, void *stream) +{ + yajl_val node; + yajl_val value; + static const char *path[] = { "value", NULL }; + + node = yajl_tree_parse(ptr,NULL,0); + if (node) { + value = yajl_tree_get(node,path,yajl_t_string); + if (value) { + /* + * YAJL probably copied it once, now we're going to + * copy it again. If anybody really cares for such + * small and infrequently used values, we'd have to do + * do something much more complicated (like using the + * stream interface) to avoid the copy. Right now it's + * just not worth it. 
+ */ + *((char **)stream) = strdup(MY_YAJL_GET_STRING(value)); + } + } + + return size*nmemb; +} + + +etcd_result +etcd_get_one (_etcd_session *this, char *key, etcd_server *srv, char *prefix, + char *post, curl_callback_t cb, char **stream) +{ + char *url; + CURL *curl; + CURLcode curl_res; + etcd_result res = ETCD_WTF; + void *err_label = &&done; + + if (asprintf(&url,"http://%s:%u/v1/%s%s", + srv->host,srv->port,prefix,key) < 0) { + goto *err_label; + } + err_label = &&free_url; + + curl = curl_easy_init(); + if (!curl) { + goto *err_label; + } + err_label = &&cleanup_curl; + + /* TBD: add error checking for these */ + curl_easy_setopt(curl,CURLOPT_URL,url); + curl_easy_setopt(curl,CURLOPT_FOLLOWLOCATION,1L); + curl_easy_setopt(curl,CURLOPT_WRITEFUNCTION,cb); + curl_easy_setopt(curl,CURLOPT_WRITEDATA,stream); + if (post) { + curl_easy_setopt(curl,CURLOPT_POST,1L); + curl_easy_setopt(curl,CURLOPT_POSTFIELDS,post); + } +#if defined(DEBUG) + curl_easy_setopt(curl,CURLOPT_VERBOSE,1L); +#endif + + curl_res = curl_easy_perform(curl); + if (curl_res != CURLE_OK) { + print_curl_error("perform",curl_res); + goto *err_label; + } + + res = ETCD_OK; + +cleanup_curl: + curl_easy_cleanup(curl); +free_url: + free(url); +done: + return res; +} + + +char * +etcd_get (etcd_session this_as_void, char *key) +{ + _etcd_session *this = this_as_void; + etcd_server *srv; + etcd_result res; + char *value = NULL; + + for (srv = this->servers; srv->host; ++srv) { + res = etcd_get_one(this,key,srv,"keys/",NULL, + parse_get_response,&value); + if ((res == ETCD_OK) && value) { + return value; + } + } + + return NULL; +} + + +size_t +parse_watch_response (void *ptr, size_t size, size_t nmemb, void *stream) +{ + yajl_val node; + yajl_val value; + etcd_watch_t *watch = stream; + static const char *i_path[] = { "index", NULL }; + static const char *k_path[] = { "key", NULL }; + static const char *v_path[] = { "value", NULL }; + + node = yajl_tree_parse(ptr,NULL,0); + if (node) { + value = 
yajl_tree_get(node,i_path,yajl_t_number); + if (value) { + watch->index_out = strtoul(YAJL_GET_NUMBER(value), + NULL,10); + } + value = yajl_tree_get(node,k_path,yajl_t_string); + if (value) { + watch->key = strdup(MY_YAJL_GET_STRING(value)); + } + value = yajl_tree_get(node,v_path,yajl_t_string); + if (value) { + watch->value = strdup(MY_YAJL_GET_STRING(value)); + } + else { + /* Must have been a DELETE. */ + watch->value = NULL; + } + } + + return size*nmemb; +} + + +etcd_result +etcd_watch (etcd_session this_as_void, char *pfx, + char **keyp, char **valuep, int *index_in, int *index_out) +{ + _etcd_session *this = this_as_void; + etcd_server *srv; + etcd_result res; + etcd_watch_t watch; + char *post; + + if (index_in) { + if (asprintf(&post,"index=%d",*index_in) < 0) { + return ETCD_WTF; + } + } + else { + post = NULL; + } + + memset(&watch.key,0,sizeof(watch)); + watch.index_in = index_in; + + for (srv = this->servers; srv->host; ++srv) { + res = etcd_get_one(this,pfx,srv,"watch/",post, + parse_watch_response,(char **)&watch); + if ((res == ETCD_OK) && watch.key) { + if (keyp) { + *keyp = watch.key; + } + if (valuep) { + *valuep = watch.value; + } + if (index_out) { + *index_out = watch.index_out; + } + break; + } + } + + if (post) { + free(post); + } + return res; +} + + +size_t +parse_set_response (void *ptr, size_t size, size_t nmemb, void *stream) +{ + yajl_val node; + yajl_val value; + etcd_result res = ETCD_PROTOCOL_ERROR; + /* + * Success responses contain prevValue and index. Failure responses + * contain errorCode and cause. Among all these, index seems to be the + * one we're most likely to need later, so look for that. 
+ */ + static const char *path[] = { "index", NULL }; + + node = yajl_tree_parse(ptr,NULL,0); + if (node) { + value = yajl_tree_get(node,path,yajl_t_number); + if (value) { + res = ETCD_OK; + } + } + + *((etcd_result *)stream) = res; + return size*nmemb; +} + + +/* NB: a null value means to use HTTP DELETE and ignore precond/ttl */ +etcd_result +etcd_put_one (_etcd_session *this, char *key, char *value, + char *precond, unsigned int ttl, etcd_server *srv) +{ + char *url; + char *contents = NULL; + CURL *curl; + etcd_result res = ETCD_WTF; + CURLcode curl_res; + void *err_label = &&done; + + if (asprintf(&url,"http://%s:%u/v1/keys/%s", + srv->host,srv->port,key) < 0) { + goto *err_label; + } + err_label = &&free_url; + + if (value) { + if (asprintf(&contents,"value=%s",value) < 0) { + goto *err_label; + } + err_label = &&free_contents; + + if (precond) { + char *c2; + if (asprintf(&c2,"%s;prevValue=%s",contents, + precond) < 0) { + goto *err_label; + } + free(contents); + contents = c2; + } + + if (ttl) { + char *c2; + if (asprintf(&c2,"%s;ttl=%u",contents,ttl) < 0) { + goto *err_label; + } + free(contents); + contents = c2; + } + } + + curl = curl_easy_init(); + if (!curl) { + goto *err_label; + } + err_label = &&cleanup_curl; + + /* TBD: add error checking for these */ + curl_easy_setopt(curl,CURLOPT_URL,url); + curl_easy_setopt(curl,CURLOPT_FOLLOWLOCATION,1L); + curl_easy_setopt(curl,CURLOPT_WRITEFUNCTION,parse_set_response); + curl_easy_setopt(curl,CURLOPT_WRITEDATA,&res); + if (value) { + /* + * CURLOPT_HTTPPOST would be easier, but it looks like etcd + * will barf on that. Sigh. + */ + curl_easy_setopt(curl,CURLOPT_POST,1L); + curl_easy_setopt(curl,CURLOPT_POSTFIELDS,contents); + } + else { + /* This must be a DELETE. 
*/ + curl_easy_setopt(curl,CURLOPT_CUSTOMREQUEST,"DELETE"); + } +#if defined(DEBUG) + curl_easy_setopt(curl,CURLOPT_VERBOSE,1L); +#endif + + curl_res = curl_easy_perform(curl); + if (curl_res != CURLE_OK) { + print_curl_error("perform",curl_res); + goto *err_label; + } + + /* + * If the request succeeded, or at least got to the server and failed + * there, parse_set_response should have set res appropriately. + */ + +cleanup_curl: + curl_easy_cleanup(curl); +free_contents: + free(contents); /* might already be NULL for delete, but that's OK */ +free_url: + free(url); +done: + return res; +} + + +etcd_result +etcd_set (etcd_session this_as_void, char *key, char *value, + char *precond, unsigned int ttl) +{ + _etcd_session *this = this_as_void; + etcd_server *srv; + etcd_result res; + + for (srv = this->servers; srv->host; ++srv) { + res = etcd_put_one(this,key,value,precond,ttl,srv); + /* + * Protocol errors are likely to be things like precondition + * failures, which won't be helped by retrying on another + * server. + */ + if ((res == ETCD_OK) || (res == ETCD_PROTOCOL_ERROR)) { + return res; + } + } + + return ETCD_WTF; +} + + +/* + * This uses the same path and status checks as SET, but with a different HTTP + * command instead of data. Precondition and TTL are obviously not used in + * this case, though a conditional delete would be a cool feature for etcd. I + * think you can get a timed delete by doing a conditional set to the current + * value with a TTL, but I haven't actually tried it. 
+ */ +etcd_result +etcd_delete (etcd_session this_as_void, char *key) +{ + _etcd_session *this = this_as_void; + etcd_server *srv; + etcd_result res; + + for (srv = this->servers; srv->host; ++srv) { + res = etcd_put_one(this,key,NULL,NULL,0,srv); + if (res == ETCD_OK) { + break; + } + } + + return res; +} + + +size_t +store_leader (void *ptr, size_t size, size_t nmemb, void *stream) +{ + *((char **)stream) = strdup(ptr); + return size * nmemb; +} + + +char * +etcd_leader (etcd_session this_as_void) +{ + _etcd_session *this = this_as_void; + etcd_server *srv; + etcd_result res; + char *value = NULL; + + for (srv = this->servers; srv->host; ++srv) { + res = etcd_get_one(this,"leader",srv,"",NULL, + store_leader,&value); + if ((res == ETCD_OK) && value) { + return value; + } + } + + return NULL; +} + + +void +free_sl (etcd_server *server_list) +{ + size_t num_servers; + + for (num_servers = 0; server_list[num_servers].host; ++num_servers) { + free(server_list[num_servers].host); + } + free(server_list); +} + + +int +_count_matching (char *text, char *cset, int result) +{ + char *t; + int res = 0; + + for (t = text; *t; ++t) { + if ((strchr(cset,*t) != NULL) != result) { + break; + } + ++res; + } + + return res; +} + +#define count_matching(t,cs) _count_matching(t,cs,1) +#define count_nonmatching(t,cs) _count_matching(t,cs,0) + + +etcd_session +etcd_open_str (char *server_names) +{ + char *snp; + int run_len; + int host_len; + size_t num_servers; + etcd_server *server_list; + etcd_session *session; + + /* + * Yeah, we iterate over the string twice so we can allocate an + * appropriately sized array instead of turning it into a linked list. + * Unfortunately this means we can't use strtok* which is destructive + * with no platform-independent way to reverse the destructive effects. 
+ */ + + num_servers = 0; + snp = server_names; + while (*snp) { + run_len = count_nonmatching(snp,SL_DELIM); + if (!run_len) { + snp += count_matching(snp,SL_DELIM); + continue; + } + ++num_servers; + snp += run_len; + } + + if (!num_servers) { + return NULL; + } + + server_list = calloc(num_servers+1,sizeof(*server_list)); + if (!server_list) { + return NULL; + } + num_servers = 0; + + snp = server_names; + while (*snp) { + run_len = count_nonmatching(snp,SL_DELIM); + if (!run_len) { + snp += count_matching(snp,SL_DELIM); + continue; + } + host_len = count_nonmatching(snp,":"); + if ((run_len - host_len) > 1) { + server_list[num_servers].host = strndup(snp,host_len); + server_list[num_servers].port = (unsigned short) + strtoul(snp+host_len+1,NULL,10); + } + else { + server_list[num_servers].host = strndup(snp,run_len); + server_list[num_servers].port = DEFAULT_ETCD_PORT; + } + ++num_servers; + snp += run_len; + } + + session = etcd_open(server_list); + if (!session) { + free_sl(server_list); + } + return session; +} + + +void +etcd_close_str (etcd_session this) +{ + free_sl(((_etcd_session *)this)->servers); + etcd_close(this); +} diff --git a/xlators/cluster/nsr-server/src/etcd-api.h b/xlators/cluster/nsr-server/src/etcd-api.h new file mode 100644 index 000000000..df8babd55 --- /dev/null +++ b/xlators/cluster/nsr-server/src/etcd-api.h @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2013, Red Hat + * All rights reserved. + + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. Redistributions in binary + * form must reproduce the above copyright notice, this list of conditions and + * the following disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Description of an etcd server. For now it just includes the name and + * port, but some day it might include other stuff like SSL certificate + * information. + */ + +typedef enum { + ETCD_OK = 0, + ETCD_PROTOCOL_ERROR, + /* TBD: add other error categories here */ + ETCD_WTF /* anything we can't easily categorize */ +} etcd_result; + +typedef struct { + char *host; + unsigned short port; +} etcd_server; + +typedef void *etcd_session; + +/* + * etcd_open + * + * Establish a session to an etcd cluster, with automatic reconnection and + * so on. + * + * server_list + * Array of etcd_server structures, with the last having host=NULL. The + * caller is responsible for ensuring that this remains valid as long as + * the session exists. + */ +etcd_session etcd_open (etcd_server *server_list); + + +/* + * etcd_open_str + * + * Same as etcd_open, except that the servers are specified as a list of + * host:port strings, separated by comma/semicolon or whitespace. + */ +etcd_session etcd_open_str (char *server_names); + + +/* + * etcd_close + * + * Terminate a session, closing connections and freeing memory (or any other + * resources) associated with it. 
+ */ +void etcd_close (etcd_session this); + + +/* + * etcd_close_str + * + * Same as etcd_close, but also free the server list as etcd_open_str would + * have allocated it. + */ +void etcd_close_str (etcd_session this_as_void); + + +/* + * etcd_get + * + * Fetch a key from one of the servers in a session. The return value is a + * newly allocated string, which must be freed by the caller. + * + * key + * The etcd key (path) to fetch. + */ +char * etcd_get (etcd_session this, char *key); + + +/* + * etcd_watch + * Watch the set of keys matching a prefix. + * + * pfx + * The etcd key prefix (like a path) to watch. + * + * keyp + * Space for a pointer to the key that was added/modified/deleted. + * + * valuep + * Space for a pointer to the value if a key was added/modified. A delete + * is signified by this being set to NULL. + * + * index_in + * Pointer to an index to be used for *issuing* the watch request, or + * NULL for a watch without an index. + * + * index_out + * Pointer to space for an index *returned* by etcd, or NULL to mean don't + * bother. + * + * In normal usage, index_in will be NULL and index_out will be set to receive + * the index for the first watch. Subsequently, index_in will be set to + * provide the previous index (plus one) and index_out will be set to receive + * the next. It's entirely legitimate to point both at the same variable. + */ + +etcd_result etcd_watch (etcd_session this, char *pfx, + char **keyp, char **valuep, + int *index_in, int *index_out); + + +/* + * etcd_set + * + * Write a key, with optional TTL and/or previous value (as a precondition). + * + * key + * The etcd key (path) to set. + * + * value + * New value as a null-terminated string. Unlike etcd_get, we can derive + * the length ourselves instead of needing it to be passed in separately. + * + * precond + * Required previous value as a null-terminated string, or NULL to mean + * an unconditional set.
+ * + * ttl + * Time in seconds after which the value will automatically expire and be + * deleted, or zero to mean no auto-expiration. + */ + +etcd_result etcd_set (etcd_session this, char *key, char *value, + char *precond, unsigned int ttl); + + +/* + * etcd_delete + * + * Delete a key from one of the servers in a session. + * + * key + * The etcd key (path) to delete. + */ + +etcd_result etcd_delete (etcd_session this, char *key); + + +/* + * etcd_leader + * + * Get the identify of the current leader. + */ + +char * etcd_leader (etcd_session this_as_void); diff --git a/xlators/cluster/nsr-server/src/gen-fops.py b/xlators/cluster/nsr-server/src/gen-fops.py new file mode 100644 index 000000000..d0f88d370 --- /dev/null +++ b/xlators/cluster/nsr-server/src/gen-fops.py @@ -0,0 +1,123 @@ +#!/usr/bin/python + +# This script generates the boilerplate versions of most fops and cbks in the +# server. This allows the details of leadership-status checking, sequencing +# between leader and followers (including fan-out), and basic error checking +# to be centralized one place, with per-operation code kept to a minimum. + +import sys +import codegen + +type_re = "([a-z_0-9]+)" +name_re = "\(\*fop_([a-z0-9]+)_t\)" +full_re = type_re + " *" + name_re +fop_cg = codegen.CodeGenerator() +fop_cg.skip = 2 +fop_cg.parse_decls(sys.argv[1],full_re) +fop_cg.load_templates(sys.argv[2]) + +# Use the multi-template feature to generate multiple callbacks from the same +# parsed declarations. +type_re = "([a-z_0-9]+)" +name_re = "\(\*fop_([a-z0-9]+)_cbk_t\)" +full_re = type_re + " *" + name_re +cbk_cg = codegen.CodeGenerator() +cbk_cg.skip = 5 +cbk_cg.parse_decls(sys.argv[1],full_re) +cbk_cg.load_templates(sys.argv[2]) + +# This is a nasty little trick to handle the case where a generated fop needs +# a set of default arguments for the corresponding callback. +fop_cg.make_defaults = cbk_cg.make_defaults + +# We need two types of templates. 
The first, for pure read operations, just +# needs to do a simple am-i-leader check (augmented to allow dirty reads). +# The second, for pure writes, needs to do fan-out to followers between those +# initial checks and local execution. There are other operations that don't +# fit neatly into either category - e.g. lock ops or fsync - so we'll just have +# to handle those manually. The table thus includes entries only for those we +# can categorize. The special cases, plus any new operations we've never even +# heard of, aren't in there. +# +# The "cplx" suffix means that we need to do special things to propagate an +# fd from the fop to the final callback. The way we do that is that we define +# a macro for the generated code to use. If this is a "complex" operation, +# the macro saves/releases the fd; otherwise it's a no-op. I know that's very +# icky and hard to follow. Sorry. This would all be a lot easier if the +# translator infrastructure used a request block instead of separate argument +# lists for every call (and then we wouldn't even need stubs), but that's not +# the way things work so we're stuck with legacy-preserving hacks like this. 
+ +fop_table = { + "access": "read", + "create": "write", + "discard": "write", +# "entrylk": "read", + "fallocate": "write", +# "fentrylk": "read", + "fgetxattr": "read", +# "finodelk": "read", +# "flush": "read", + "fremovexattr": "write", + "fsetattr": "write", + "fsetxattr": "write", + "fstat": "read", +# "fsync": "read", +# "fsyncdir": "read", + "ftruncate": "write", + "fxattrop": "write", + "getxattr": "read", +# "inodelk": "read", + "link": "write", +# "lk": "read", +# "lookup": "read", + "mkdir": "write", + "mknod": "write", + "open": "write", + "opendir": "read", + "rchecksum": "read", + "readdir": "read", + "readdirp": "read", + "readlink": "read", + "readv": "read", + "removexattr": "write", + "rename": "write", + "rmdir": "write", + "setattr": "write", + "setxattr": "write", + "stat": "read", + "statfs": "read", + "symlink": "write", + "truncate": "write", + "unlink": "write", + "writev": "write,fsync,queue", + "xattrop": "write", +} + +fops_done = [] +for x in sorted(fop_cg.decls.keys()): + if x in fop_table.keys(): + info = fop_table[x].split(",") + kind = info[0] + flags = info[1:] + if ("fsync" in flags) or ("queue" in flags): + flags.append("need_fd") + for fname in flags: + print "#define NSR_CG_%s" % fname.upper() + cbk_cg.emit(x,kind+"-complete") + fop_cg.emit(x,kind+"-continue") + cbk_cg.emit(x,kind+"-fan-in") + fop_cg.emit(x,kind+"-dispatch") + fop_cg.emit(x,kind+"-fop") + for fname in flags: + print "#undef NSR_CG_%s" % fname.upper() + fops_done.append(x) + else: + print("/* No code emitted for %s */"%x) + print("") + +# Just for fun, emit the fops table too. +print("struct xlator_fops fops = {") +for x in fops_done: + print(" .%s = nsr_%s,"%(x,x)) +print("};") diff --git a/xlators/cluster/nsr-server/src/leader.c b/xlators/cluster/nsr-server/src/leader.c new file mode 100644 index 000000000..bb0dbabe7 --- /dev/null +++ b/xlators/cluster/nsr-server/src/leader.c @@ -0,0 +1,420 @@ +/* + Copyright (c) 2013 Red Hat, Inc. 
+ This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include +//#include +#include + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "call-stub.h" +#include "defaults.h" +#include "xlator.h" +#include "api/src/glfs.h" +#include "api/src/glfs-internal.h" + +#include "etcd-api.h" +#include "nsr-internal.h" +#include "../../nsr-recon/src/recon_driver.h" +#include "../../nsr-recon/src/recon_xlator.h" + +/* Vote format: UUID,vote_status,fitness,term_number */ +#define VOTE_ELEMS 4 /* Whole match plus four actual pieces. */ +#define DEFAULT_FITNESS 42 +#define DEFAULT_KEY "nsr" +#define LEADER_TTL 5 /* TBD: make this tunable */ + +typedef enum { LS_SUCCESS, LS_FAILURE, LS_ERROR } leader_retval_t; +enum { NO_LEADER, TENTATIVE, CONFIRMED }; + +regex_t vote_re; + +long +nsr_get_fitness (xlator_t *this) +{ + /* TBD: calculate based on presence/absence from terms */ + return 42; +} + +long +nsr_get_term (xlator_t *this) +{ + nsr_private_t *priv = this->private; + char *text = NULL; + etcd_session etcd = priv->etcd; + + text = etcd_get(etcd, priv->term_uuid); + // first time and hence no key at all. + // this should ideally be done at vol creation time + // by glusterd. Move it there later + if(text == NULL) { + gf_log (this->name, GF_LOG_TRACE, "nsr_get_term returns 1"); + return 0; + } else { + gf_log (this->name, GF_LOG_TRACE, + "nsr_get_term returns %ld", strtol(text, NULL, 10)); + return (strtol(text, NULL, 10)); + } +} + + +// in etcd-api-master. 
+// send a patch to this package to expose this +extern size_t +parse_get_response (void *ptr, size_t size, size_t nmemb, void *stream); +typedef struct { + etcd_server *servers; +} _etcd_session; +typedef size_t curl_callback_t (void *, size_t, size_t, void *); +extern etcd_result etcd_get_one (_etcd_session *this, char *key, etcd_server *srv, char *prefix, + char *post, curl_callback_t cb, char **stream); + + + +void +nsr_leader_cb(glfs_fd_t *fd, ssize_t ret, void *data) +{ + xlator_t *this = (xlator_t *) data; + nsr_private_t *priv = this->private; + + gf_log (this->name, GF_LOG_INFO, + "nsr_leader_cb arrived with return value %d", (int)ret); + + // TBD - error handling; look at ret + atomic_fetch_and(&(priv->fence_io), 0); + + return; +} + +void +nsr_set_leader (xlator_t *this) +{ + long term = 0; + etcd_server *srv; + etcd_result res; + char *value = NULL; + nsr_private_t *priv = this->private; + _etcd_session *etcd = priv->etcd; + char *term_key = priv->term_uuid; + char *master_key = priv->vol_uuid; + char n_t[sizeof(long)+1]; + nsr_recon_role_t role; + char *text = NULL; + + gf_log (this->name, GF_LOG_INFO, "Just became leader"); + + text = etcd_get(etcd, priv->term_uuid); + if(text == NULL) { + term = 0; + } else { + term = strtol(text, NULL, 10); + } + sprintf(n_t,"%ld",term+1); + res = etcd_set(etcd, term_key,n_t,text,0); + if(res != ETCD_OK) { + gf_log (this->name, GF_LOG_ERROR, "failed to set term"); + return; + } + priv->leader = _gf_true; + + if (priv->nsr_recon_start == _gf_false) { + atomic_fetch_and(&(priv->fence_io), 0); + return; + } + + priv->current_term = term + 1; + + atomic_fetch_or(&(priv->fence_io), 1); + + role.num = 0; + role.role = leader; + // Get the rest of nodes for this term. + // TBD: fix this so that it uses per-brick keys instead of violating + // modularity and making bad assumptions about etcd behavior. 
+        for (srv = etcd->servers; srv->host; ++srv) {
+                res = etcd_get_one(etcd,master_key,srv,"keys/",NULL,
+                                   parse_get_response,&value);
+                gf_log (this->name, GF_LOG_INFO,
+                        "Probing for %s, got %d, value:%s",
+                        srv->host, res, value);
+                if ((res == ETCD_OK) && value) {
+                        gf_log (this->name, GF_LOG_INFO,
+                                "Found for %s", srv->host);
+                        strcpy(role.info[role.num].name, srv->host);
+                        (role.num)++;
+                }
+                value = NULL;
+        }
+        gf_log (this->name, GF_LOG_INFO,
+                "Discovered %d nodes that has key %s", role.num, master_key);
+
+        gf_log (this->name, GF_LOG_INFO,
+                "setting current term as %ld", term + 1);
+        role.current_term = term + 1;
+        ENDIAN_CONVERSION_RR(role, _gf_false);
+
+        // inform the reconciliator that this is leader
+        // in the callback (once reconciliation is done),
+        // we will unfence the IOs.
+        // TBD - error handling later.
+        glfs_lseek(priv->fd, nsr_recon_xlator_sector_1, SEEK_SET);
+        gf_log (this->name, GF_LOG_INFO,
+                "Writing to local node to set leader");
+        glfs_write_async(priv->fd, &role,
+                         sizeof(role),nsr_recon_xlator_sector_1,
+                         nsr_leader_cb, this);
+}
+
+
+/*
+ * Run the election loop on "key" until some brick holds a CONFIRMED vote.
+ *
+ * Each pass (one second apart, via the for-loop's sleep(1)) reads the current
+ * vote from etcd.  A vote is "<nominee>,<state>,<fitness>", split with
+ * vote_re.  If the vote is CONFIRMED the loop ends.  Otherwise, unless
+ * another brick already holds a vote at TENTATIVE or beyond, this brick
+ * writes "<brick_uuid>,<state+1>,<fitness>" with etcd_set(), passing the text
+ * it just read as the fourth argument (presumably the expected previous
+ * value -- confirm against etcd-api.h).
+ *
+ * Returns LS_SUCCESS when this brick is the confirmed leader (after calling
+ * nsr_set_leader), LS_FAILURE when another brick is (priv->leader is
+ * cleared), and LS_ERROR when the vote string could not be built.
+ */
+leader_retval_t
+nsr_get_leader (xlator_t *this, etcd_session etcd, char *key)
+{
+        char            *text           = NULL;
+        regmatch_t      matches[VOTE_ELEMS];
+        char            *nominee;
+        long            state;
+        long            fitness;
+        char            *vote           = NULL;
+        int             retval          = LS_ERROR;
+        nsr_private_t   *priv           = this->private;
+
+        for (;;sleep(1)) {
+
+                if (text) {
+                        free(text);
+                }
+
+                text = etcd_get(etcd,key);
+                if (text) {
+                        if (regexec(&vote_re,text,VOTE_ELEMS,matches,0) != 0) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "got malformed vote %s\n", text);
+                                continue;
+                        }
+                        /* We can be destructive here, so convert commas. */
+                        text[matches[1].rm_eo] = '\0';
+                        text[matches[2].rm_eo] = '\0';
+                        nominee = text + matches[1].rm_so;
+                        state = strtol(text+matches[2].rm_so,NULL,10);
+                        fitness = strtol(text+matches[3].rm_so,NULL,10);
+                }
+                else {
+                        nominee = NULL;
+                        state = NO_LEADER;
+                        fitness = 0;
+                }
+
+                if (state == CONFIRMED) {
+                        gf_log (this->name, GF_LOG_TRACE,
+                                "leader is %s\n",nominee);
+                        if (strcmp(nominee,priv->brick_uuid) == 0) {
+                                nsr_set_leader(this);
+                                retval = LS_SUCCESS;
+                        }
+                        else {
+                                priv->leader = _gf_false;
+                                retval = LS_FAILURE;
+                        }
+                        break;
+                }
+
+                /* TBD: override based on fitness */
+                if ((state >= TENTATIVE) && (strcmp(nominee,
+                                                    priv->brick_uuid) != 0)) {
+                        continue;
+                }
+
+                /*
+                 * Release the vote from the previous pass and forget the
+                 * pointer, so the cleanup after the loop can never free it
+                 * a second time.
+                 */
+                free(vote);
+                vote = NULL;
+
+                fitness = nsr_get_fitness(this);
+                if (asprintf(&vote,"%s,%ld,%ld",priv->brick_uuid,
+                             state+1,fitness) < 0) {
+                        /* asprintf leaves the pointer undefined on failure. */
+                        vote = NULL;
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to construct vote\n");
+                        break;
+                }
+
+                /* Undo the NUL-splitting so text is the full vote again. */
+                if (text) {
+                        text[matches[1].rm_eo] = ',';
+                        text[matches[2].rm_eo] = ',';
+                }
+                if (etcd_set(etcd,key,vote,text,LEADER_TTL) != ETCD_OK) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to cast vote\n");
+                        continue;
+                }
+
+        }
+
+        free(text);
+        free(vote);
+        return retval;
+}
+
+/*
+ * Rewrite the vote under "key" as "<brick_uuid>,CONFIRMED,<fitness>" with a
+ * TTL of LEADER_TTL.  The same string is passed to etcd_set() as both the
+ * third and the fourth argument.
+ *
+ * Returns LS_SUCCESS on success, LS_FAILURE when etcd_set() does not return
+ * ETCD_OK, and LS_ERROR when the vote string could not be built.
+ */
+leader_retval_t
+nsr_confirm (xlator_t *this, etcd_session etcd, char *key)
+{
+        char            *vote;
+        long            fitness;
+        nsr_private_t   *priv           = this->private;
+
+        fitness = nsr_get_fitness(this);
+        if (asprintf(&vote,"%s,%ld,%ld",priv->brick_uuid,(long)CONFIRMED,
+                     fitness) < 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to construct confirmation");
+                return LS_ERROR;
+        }
+
+        if (etcd_set(etcd,key,vote,vote,LEADER_TTL) != ETCD_OK) {
+                gf_log (this->name, GF_LOG_ERROR, "failed to confirm");
+                free(vote);
+                return LS_FAILURE;
+        }
+
+        free(vote);
+        return LS_SUCCESS;
+}
+
+/*
+ * Compile the shared vote regex (vote_re) exactly once.
+ *
+ * Calls are serialised by a function-local mutex; the first successful call
+ * compiles the pattern and later calls just report success, so the compiled
+ * pattern is never replaced after the first success.
+ *
+ * Returns _gf_true when vote_re is ready for use, _gf_false when regcomp()
+ * failed.
+ */
+gf_boolean_t
+nsr_init_re (xlator_t *this)
+{
+        static pthread_mutex_t  mutex           = PTHREAD_MUTEX_INITIALIZER;
+        static int              was_inited      = 0;
+        static char             *vote_re_str    = "([^,]+),([^,]+),([^,]+)";
+        gf_boolean_t            retval          = _gf_false;
+
+        pthread_mutex_lock(&mutex);
+        if (was_inited) {
+                /* Already compiled by an earlier caller. */
+                retval = _gf_true;
+        }
+        else if (regcomp(&vote_re,vote_re_str,REG_EXTENDED) == 0) {
+                was_inited = 1;
+                retval = _gf_true;
+        }
+        else {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set up vote regex\n");
+        }
+        pthread_mutex_unlock(&mutex);
+
+        return retval;
+}
+
+
+/*
+ * Set up the gfapi session used to talk to the reconciliation process:
+ * create priv->fs for priv->vol_uuid, point it at priv->vol_file, initialise
+ * it and open "/" as priv->fd.  Does nothing when priv->nsr_recon_start is
+ * false.  glusterfs_this_set(old) is re-issued after every glfs call and on
+ * exit (presumably because those calls can change the current xlator --
+ * confirm).
+ *
+ * Returns 0 on success or when recon is disabled, non-zero on failure.
+ * NOTE(review): priv->fs is not torn down on the failure paths.
+ */
+uint32_t
+nsr_leader_setup_recon (xlator_t *this)
+{
+        nsr_private_t   *priv = this->private;
+        xlator_t        *old = this;
+        uint32_t        ret = 0;
+
+        if (priv->nsr_recon_start == _gf_false)
+                return 0;
+
+        priv->fs = glfs_new(priv->vol_uuid);
+        if (!priv->fs) {
+                ret = 1;
+                gf_log (this->name, GF_LOG_ERROR, "failed to initialise glfs \n");
+                goto done;
+        }
+
+        glusterfs_this_set(old);
+        ret = glfs_set_volfile(priv->fs, priv->vol_file);
+        if (ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR, "failed to set volfile \n");
+                goto done;
+        }
+
+        glusterfs_this_set(old);
+        /*
+         * REVIEW
+         * Logs belong in /var/log not /tmp.
+         */
+        glfs_set_logging (priv->fs,"/tmp/glfs-log", 7);
+        if (glfs_init(priv->fs) < 0) {
+                gf_log (this->name, GF_LOG_ERROR, "failed to init volfile \n");
+                ret = 1;
+                goto done;
+        }
+
+        glusterfs_this_set(old);
+        priv->fd = glfs_open (priv->fs, "/", O_RDWR);
+        if (priv->fd == NULL) {
+                ret = 1;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to open fd to communicate with recon process \n");
+                goto done;
+        }
+
+
+done:
+        glusterfs_this_set(old);
+        return ret;
+}
+
+void *
+nsr_leader_thread (xlator_t *this)
+{
+        leader_retval_t retval;
+        nsr_private_t   *priv = this->private;
+
+        if (!nsr_init_re(this)) {
+                gf_log (this->name, GF_LOG_ERROR, "could not init regex");
+                return NULL;
+        }
+
+        if (nsr_leader_setup_recon(this)) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to do glfs initialisation inside leader thread");
+                return NULL;
+        }
+
+        priv->leader_inited = 1;
+
+        gf_log (this->name, GF_LOG_INFO,
+                "calling glfs_opens_str on servers %s", priv->etcd_servers);
+
+        priv->etcd = etcd_open_str(priv->etcd_servers);
+        if (!(priv->etcd)) {
+                gf_log (this->name,
GF_LOG_ERROR,
+                        "failed to open etcd session\n");
+                return NULL;
+        }
+
+        for (;;) {
+                if (nsr_get_leader(this,priv->etcd,priv->vol_uuid) == LS_ERROR) {
+                        break;
+                }
+                if (priv->leader) {
+                        do {
+                                sleep(1);
+                                retval = nsr_confirm(this,priv->etcd,priv->vol_uuid);
+                        } while (retval == LS_SUCCESS);
+                        if (retval == LS_ERROR) {
+                                break;
+                        }
+                }
+                else {
+                        sleep(1);
+                }
+        }
+
+        etcd_close_str(priv->etcd);
+        return NULL;
+}
+
diff --git a/xlators/cluster/nsr-server/src/nsr-cg.c b/xlators/cluster/nsr-server/src/nsr-cg.c
new file mode 100644
index 000000000..54f370b75
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/nsr-cg.c
@@ -0,0 +1,4444 @@
+/* No stub needed for access */
+
+/* No cbk needed for access */
+
+/*
+ * access: forwarded to the first child when this brick is the leader, or
+ * when xdata carries both RECON_TERM_XATTR and RECON_INDEX_XATTR as int32
+ * values (a read issued during reconciliation).  Any other caller gets
+ * -1/EREMOTE.
+ */
+int32_t
+nsr_access (call_frame_t *frame, xlator_t *this,
+            loc_t * loc, int32_t mask, dict_t * xdata)
+{
+        nsr_private_t   *priv           = this->private;
+        int32_t         term_tag        = 0;
+        int32_t         index_tag       = 0;
+        gf_boolean_t    recon_read      = _gf_false;
+
+        // reads are let through while reconciliation is running
+        // TBD: allow "dirty" reads on non-leaders
+        if (xdata
+            && (dict_get_int32(xdata, RECON_TERM_XATTR, &term_tag) == 0)
+            && (dict_get_int32(xdata, RECON_INDEX_XATTR, &index_tag) == 0)) {
+                recon_read = _gf_true;
+        }
+
+        if (!priv->leader && !recon_read) {
+                /* neither leader nor recon reader: refuse */
+                STACK_UNWIND_STRICT (access, frame, -1, EREMOTE,
+                                     NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, default_access_cbk,
+                    FIRST_CHILD(this), FIRST_CHILD(this)->fops->access,
+                    loc, mask, xdata);
+        return 0;
+}
+
+int32_t
+nsr_create_complete (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno,
+                     fd_t * fd, inode_t * inode, struct iatt * buf, struct iatt * preparent, struct iatt * postparent, dict_t * xdata)
+{
+#if NSR_CG_NEED_FD
+        nsr_local_t     *local = frame->local;
+#endif
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd);
+        if (ictx) {
+                /* TBD: LOCK */
+                if (ictx->pending) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "unblocking %u requests",
+                                ictx->pending);
+
/* TBD: actually dequeue */
+                        ictx->pending = 0;
+                }
+                /* TBD: UNLOCK */
+        }
+#endif
+
+#if NSR_CG_FSYNC
+        nsr_mark_fd_dirty(this,local);
+#endif
+
+#if NSR_CG_NEED_FD
+        fd_unref(local->fd);
+#endif
+
+        STACK_UNWIND_STRICT (create, frame, op_ret, op_errno,
+                             fd, inode, buf, preparent, postparent, xdata);
+        return 0;
+
+}
+/*
+ * Second half of a leader-side create: issue the create on the first child.
+ * Run through the stub saved by nsr_create once nsr_create_fan_in has seen
+ * every reply from the other children.
+ */
+int32_t
+nsr_create_continue (call_frame_t *frame, xlator_t *this,
+                     loc_t * loc, int32_t flags, mode_t mode, mode_t umask, fd_t * fd, dict_t * xdata)
+{
+        STACK_WIND (frame, nsr_create_complete,
+                    FIRST_CHILD(this), FIRST_CHILD(this)->fops->create,
+                    loc, flags, mode, umask, fd, xdata);
+        return 0;
+}
+
+/*
+ * Reply handler for the creates wound to the children after the first.
+ * The reply itself is only logged; local->call_count is decremented under
+ * frame->lock and the saved stub is resumed when it reaches zero.
+ */
+int32_t
+nsr_create_fan_in (call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno,
+                   fd_t * fd, inode_t * inode, struct iatt * buf, struct iatt * preparent, struct iatt * postparent, dict_t * xdata)
+{
+        nsr_local_t     *local = frame->local;
+        uint8_t         call_count;
+
+        gf_log (this->name, GF_LOG_TRACE,
+                "op_ret = %d, op_errno = %d\n", op_ret, op_errno);
+
+        LOCK(&frame->lock);
+        call_count = --(local->call_count);
+        UNLOCK(&frame->lock);
+
+        // TBD: variable Completion count
+        if (call_count == 0) {
+                call_resume(local->stub);
+        }
+
+        return 0;
+}
+
+/*
+ * create entry point.
+ *
+ * - A request whose xdata carries NSR_TERM_XATTR, or both RECON_TERM_XATTR
+ *   and RECON_INDEX_XATTR, is passed straight to the first child.
+ * - Anything else is refused with EREMOTE unless this brick is the leader
+ *   and I/O is not fenced.
+ * - On the leader, priv->current_term is stamped into xdata (allocating one
+ *   if the caller passed none), a stub for nsr_create_continue is saved in
+ *   local, and the request is wound to every child after the first.
+ *
+ * Failures unwind with -1 and op_errno (ENOMEM unless set otherwise).
+ * NOTE(review): if there is no child after the first, nothing here resumes
+ * the stub -- confirm n_children is always greater than one.
+ */
+int32_t
+nsr_create (call_frame_t *frame, xlator_t *this,
+            loc_t * loc, int32_t flags, mode_t mode, mode_t umask, fd_t * fd, dict_t * xdata)
+{
+        nsr_local_t     *local          = NULL;
+        nsr_private_t   *priv           = this->private;
+        xlator_list_t   *trav;
+        dict_t          *new_xdata      = NULL;
+        int             op_errno        = ENOMEM;
+        int             from_leader;
+        int             from_recon;
+
+        local = mem_get0(this->local_pool);
+        if (!local) {
+                goto err;
+        }
+#if NSR_CG_NEED_FD
+        local->fd = fd_ref(fd);
+#else
+        local->fd = NULL;
+#endif
+        frame->local = local;
+
+        if (xdata) {
+                from_leader = !!dict_get(xdata,NSR_TERM_XATTR);
+                from_recon = !!dict_get(xdata,RECON_TERM_XATTR)
+                          && !!dict_get(xdata,RECON_INDEX_XATTR);
+        }
+        else {
+                from_leader = from_recon = _gf_false;
+        }
+
+        // follower/recon path
+        // just send it to local node
+        if (from_leader || from_recon) {
+                STACK_WIND (frame, nsr_create_complete,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->create,
+                            loc, flags, mode, umask, fd, xdata);
+                return 0;
+        }
+
+        if (!priv->leader || priv->fence_io) {
+                op_errno = EREMOTE;
+                goto err;
+        }
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd);
+        if (!ictx) {
+                op_errno = EIO;
+                goto err;
+        }
+        /* TBD: LOCK */
+        if (ictx->active) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "queuing request due to conflict");
+                ++(ictx->pending);
+                /* TBD: actually enqueue */
+        }
+        else {
+                ++(ictx->active);
+        }
+        /* TBD: UNLOCK */
+#endif
+
+        if (!xdata) {
+                /* Remember the dict we allocate so it can be released. */
+                new_xdata = dict_new();
+                if (!new_xdata) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to allocate xdata");
+                        goto err;
+                }
+                xdata = new_xdata;
+        }
+
+        if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set nsr-term");
+                goto err;
+        }
+
+        local->stub = fop_create_stub (frame,nsr_create_continue,
+                                       loc, flags, mode, umask, fd, xdata);
+        if (!local->stub) {
+                goto err;
+        }
+
+        local->call_count = priv->n_children - 1;
+        for (trav = this->children->next; trav; trav = trav->next) {
+                STACK_WIND (frame, nsr_create_fan_in,
+                            trav->xlator, trav->xlator->fops->create,
+                            loc, flags, mode, umask, fd, xdata);
+        }
+
+        /*
+         * Drop the reference from dict_new().  The stub and the wound
+         * children are expected to hold their own references (libglusterfs
+         * convention -- confirm).  local and frame must not be touched
+         * here: the request may already have completed.
+         */
+        if (new_xdata) {
+                dict_unref(new_xdata);
+        }
+
+        // TBD: variable Issue count
+        return 0;
+
+err:
+        if (new_xdata) {
+                dict_unref(new_xdata);
+        }
+        if (local) {
+                if (local->stub) {
+                        call_stub_destroy(local->stub);
+                }
+                if (local->fd) {
+                        fd_unref(local->fd);
+                }
+                /* Detach first: frame teardown also releases frame->local. */
+                frame->local = NULL;
+                mem_put(local);
+        }
+        STACK_UNWIND_STRICT (create, frame, -1, op_errno,
+                             NULL, NULL, NULL, NULL, NULL, NULL);
+        return 0;
+}
+
+/*
+ * Completion callback for the discard issued on the first child; the result
+ * is passed up unchanged.  Under NSR_CG_QUEUE the pending count of the
+ * inode context is reset (real dequeueing is still TBD), under NSR_CG_FSYNC
+ * the fd is marked dirty, and under NSR_CG_NEED_FD the fd reference held in
+ * local is dropped.
+ */
+int32_t
+nsr_discard_complete (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno,
+                      struct iatt * preop_stbuf, struct iatt * postop_stbuf, dict_t * xdata)
+{
+#if NSR_CG_NEED_FD
+        nsr_local_t     *local = frame->local;
+#endif
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd);
+        if (ictx) {
+                /* TBD: LOCK */
+                if (ictx->pending) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "unblocking %u requests",
+                                ictx->pending);
+                        /* TBD: actually dequeue */
+                        ictx->pending = 0;
+                }
+                /* TBD: UNLOCK */
+        }
+#endif
+
+#if NSR_CG_FSYNC
+        nsr_mark_fd_dirty(this,local);
+#endif
+
+#if NSR_CG_NEED_FD
+        fd_unref(local->fd);
+#endif
+
+        STACK_UNWIND_STRICT (discard, frame, op_ret, op_errno,
+                             preop_stbuf, postop_stbuf, xdata);
+        return 0;
+
+}
+/*
+ * Second half of a leader-side discard: issue the discard on the first
+ * child.  Run through the stub saved by nsr_discard.
+ */
+int32_t
+nsr_discard_continue (call_frame_t *frame, xlator_t *this,
+                      fd_t * fd, off_t offset, size_t len, dict_t * xdata)
+{
+        STACK_WIND (frame, nsr_discard_complete,
+                    FIRST_CHILD(this), FIRST_CHILD(this)->fops->discard,
+                    fd, offset, len, xdata);
+        return 0;
+}
+
+/*
+ * Reply handler for the discards wound to the children after the first.
+ * The reply itself is only logged; local->call_count is decremented under
+ * frame->lock and the saved stub is resumed when it reaches zero.
+ */
+int32_t
+nsr_discard_fan_in (call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno,
+                    struct iatt * preop_stbuf, struct iatt * postop_stbuf, dict_t * xdata)
+{
+        nsr_local_t     *local = frame->local;
+        uint8_t         call_count;
+
+        gf_log (this->name, GF_LOG_TRACE,
+                "op_ret = %d, op_errno = %d\n", op_ret, op_errno);
+
+        LOCK(&frame->lock);
+        call_count = --(local->call_count);
+        UNLOCK(&frame->lock);
+
+        // TBD: variable Completion count
+        if (call_count == 0) {
+                call_resume(local->stub);
+        }
+
+        return 0;
+}
+
+/*
+ * discard entry point.
+ *
+ * - A request whose xdata carries NSR_TERM_XATTR, or both RECON_TERM_XATTR
+ *   and RECON_INDEX_XATTR, is passed straight to the first child.
+ * - Anything else is refused with EREMOTE unless this brick is the leader
+ *   and I/O is not fenced.
+ * - On the leader, priv->current_term is stamped into xdata (allocating one
+ *   if the caller passed none), a stub for nsr_discard_continue is saved in
+ *   local, and the request is wound to every child after the first.
+ *
+ * Failures unwind with -1 and op_errno (ENOMEM unless set otherwise).
+ * NOTE(review): if there is no child after the first, nothing here resumes
+ * the stub -- confirm n_children is always greater than one.
+ */
+int32_t
+nsr_discard (call_frame_t *frame, xlator_t *this,
+             fd_t * fd, off_t offset, size_t len, dict_t * xdata)
+{
+        nsr_local_t     *local          = NULL;
+        nsr_private_t   *priv           = this->private;
+        xlator_list_t   *trav;
+        dict_t          *new_xdata      = NULL;
+        int             op_errno        = ENOMEM;
+        int             from_leader;
+        int             from_recon;
+
+        local = mem_get0(this->local_pool);
+        if (!local) {
+                goto err;
+        }
+#if NSR_CG_NEED_FD
+        local->fd = fd_ref(fd);
+#else
+        local->fd = NULL;
+#endif
+        frame->local = local;
+
+        if (xdata) {
+                from_leader = !!dict_get(xdata,NSR_TERM_XATTR);
+                from_recon = !!dict_get(xdata,RECON_TERM_XATTR)
+                          && !!dict_get(xdata,RECON_INDEX_XATTR);
+        }
+        else {
+                from_leader = from_recon = _gf_false;
+        }
+
+        // follower/recon path
+        // just send it to local node
+        if (from_leader || from_recon) {
+                STACK_WIND (frame, nsr_discard_complete,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->discard,
+                            fd, offset, len, xdata);
+                return 0;
+        }
+
+        if (!priv->leader || priv->fence_io) {
+                op_errno = EREMOTE;
+                goto err;
+        }
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd);
+        if (!ictx) {
+                op_errno = EIO;
+                goto err;
+        }
+        /* TBD: LOCK */
+        if (ictx->active) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "queuing request due to conflict");
+                ++(ictx->pending);
+                /* TBD: actually enqueue */
+        }
+        else {
+                ++(ictx->active);
+        }
+        /* TBD: UNLOCK */
+#endif
+
+        if (!xdata) {
+                /* Remember the dict we allocate so it can be released. */
+                new_xdata = dict_new();
+                if (!new_xdata) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to allocate xdata");
+                        goto err;
+                }
+                xdata = new_xdata;
+        }
+
+        if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set nsr-term");
+                goto err;
+        }
+
+        local->stub = fop_discard_stub (frame,nsr_discard_continue,
+                                        fd, offset, len, xdata);
+        if (!local->stub) {
+                goto err;
+        }
+
+        local->call_count = priv->n_children - 1;
+        for (trav = this->children->next; trav; trav = trav->next) {
+                STACK_WIND (frame, nsr_discard_fan_in,
+                            trav->xlator, trav->xlator->fops->discard,
+                            fd, offset, len, xdata);
+        }
+
+        /*
+         * Drop the reference from dict_new().  The stub and the wound
+         * children are expected to hold their own references (libglusterfs
+         * convention -- confirm).  local and frame must not be touched
+         * here: the request may already have completed.
+         */
+        if (new_xdata) {
+                dict_unref(new_xdata);
+        }
+
+        // TBD: variable Issue count
+        return 0;
+
+err:
+        if (new_xdata) {
+                dict_unref(new_xdata);
+        }
+        if (local) {
+                if (local->stub) {
+                        call_stub_destroy(local->stub);
+                }
+                if (local->fd) {
+                        fd_unref(local->fd);
+                }
+                /* Detach first: frame teardown also releases frame->local. */
+                frame->local = NULL;
+                mem_put(local);
+        }
+        STACK_UNWIND_STRICT (discard, frame, -1, op_errno,
+                             NULL, NULL, NULL);
+        return 0;
+}
+
+/* No code emitted for entrylk */
+
+int32_t
+nsr_fallocate_complete (call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno,
+                        struct iatt * preop_stbuf, struct iatt * postop_stbuf, dict_t * xdata)
+{
+#if NSR_CG_NEED_FD
+        nsr_local_t     *local = frame->local;
+#endif
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd);
+        if (ictx) {
+                /* TBD: LOCK */
+                if (ictx->pending) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "unblocking %u requests",
+                                ictx->pending);
+                        /* TBD: actually dequeue */
+
ictx->pending = 0;
+                }
+                /* TBD: UNLOCK */
+        }
+#endif
+
+#if NSR_CG_FSYNC
+        nsr_mark_fd_dirty(this,local);
+#endif
+
+#if NSR_CG_NEED_FD
+        fd_unref(local->fd);
+#endif
+
+        STACK_UNWIND_STRICT (fallocate, frame, op_ret, op_errno,
+                             preop_stbuf, postop_stbuf, xdata);
+        return 0;
+
+}
+/*
+ * Second half of a leader-side fallocate: issue the fallocate on the first
+ * child.  Run through the stub saved by nsr_fallocate.
+ */
+int32_t
+nsr_fallocate_continue (call_frame_t *frame, xlator_t *this,
+                        fd_t * fd, int32_t keep_size, off_t offset, size_t len, dict_t * xdata)
+{
+        STACK_WIND (frame, nsr_fallocate_complete,
+                    FIRST_CHILD(this), FIRST_CHILD(this)->fops->fallocate,
+                    fd, keep_size, offset, len, xdata);
+        return 0;
+}
+
+/*
+ * Reply handler for the fallocates wound to the children after the first.
+ * The reply itself is only logged; local->call_count is decremented under
+ * frame->lock and the saved stub is resumed when it reaches zero.
+ */
+int32_t
+nsr_fallocate_fan_in (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno,
+                      struct iatt * preop_stbuf, struct iatt * postop_stbuf, dict_t * xdata)
+{
+        nsr_local_t     *local = frame->local;
+        uint8_t         call_count;
+
+        gf_log (this->name, GF_LOG_TRACE,
+                "op_ret = %d, op_errno = %d\n", op_ret, op_errno);
+
+        LOCK(&frame->lock);
+        call_count = --(local->call_count);
+        UNLOCK(&frame->lock);
+
+        // TBD: variable Completion count
+        if (call_count == 0) {
+                call_resume(local->stub);
+        }
+
+        return 0;
+}
+
+/*
+ * fallocate entry point.
+ *
+ * - A request whose xdata carries NSR_TERM_XATTR, or both RECON_TERM_XATTR
+ *   and RECON_INDEX_XATTR, is passed straight to the first child.
+ * - Anything else is refused with EREMOTE unless this brick is the leader
+ *   and I/O is not fenced.
+ * - On the leader, priv->current_term is stamped into xdata (allocating one
+ *   if the caller passed none), a stub for nsr_fallocate_continue is saved
+ *   in local, and the request is wound to every child after the first.
+ *
+ * Failures unwind with -1 and op_errno (ENOMEM unless set otherwise).
+ * NOTE(review): if there is no child after the first, nothing here resumes
+ * the stub -- confirm n_children is always greater than one.
+ */
+int32_t
+nsr_fallocate (call_frame_t *frame, xlator_t *this,
+               fd_t * fd, int32_t keep_size, off_t offset, size_t len, dict_t * xdata)
+{
+        nsr_local_t     *local          = NULL;
+        nsr_private_t   *priv           = this->private;
+        xlator_list_t   *trav;
+        dict_t          *new_xdata      = NULL;
+        int             op_errno        = ENOMEM;
+        int             from_leader;
+        int             from_recon;
+
+        local = mem_get0(this->local_pool);
+        if (!local) {
+                goto err;
+        }
+#if NSR_CG_NEED_FD
+        local->fd = fd_ref(fd);
+#else
+        local->fd = NULL;
+#endif
+        frame->local = local;
+
+        if (xdata) {
+                from_leader = !!dict_get(xdata,NSR_TERM_XATTR);
+                from_recon = !!dict_get(xdata,RECON_TERM_XATTR)
+                          && !!dict_get(xdata,RECON_INDEX_XATTR);
+        }
+        else {
+                from_leader = from_recon = _gf_false;
+        }
+
+        // follower/recon path
+        // just send it to local node
+        if (from_leader || from_recon) {
+                STACK_WIND (frame, nsr_fallocate_complete,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->fallocate,
+                            fd, keep_size, offset, len, xdata);
+                return 0;
+        }
+
+        if (!priv->leader || priv->fence_io) {
+                op_errno = EREMOTE;
+                goto err;
+        }
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd);
+        if (!ictx) {
+                op_errno = EIO;
+                goto err;
+        }
+        /* TBD: LOCK */
+        if (ictx->active) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "queuing request due to conflict");
+                ++(ictx->pending);
+                /* TBD: actually enqueue */
+        }
+        else {
+                ++(ictx->active);
+        }
+        /* TBD: UNLOCK */
+#endif
+
+        if (!xdata) {
+                /* Remember the dict we allocate so it can be released. */
+                new_xdata = dict_new();
+                if (!new_xdata) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to allocate xdata");
+                        goto err;
+                }
+                xdata = new_xdata;
+        }
+
+        if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set nsr-term");
+                goto err;
+        }
+
+        local->stub = fop_fallocate_stub (frame,nsr_fallocate_continue,
+                                          fd, keep_size, offset, len, xdata);
+        if (!local->stub) {
+                goto err;
+        }
+
+        local->call_count = priv->n_children - 1;
+        for (trav = this->children->next; trav; trav = trav->next) {
+                STACK_WIND (frame, nsr_fallocate_fan_in,
+                            trav->xlator, trav->xlator->fops->fallocate,
+                            fd, keep_size, offset, len, xdata);
+        }
+
+        /*
+         * Drop the reference from dict_new().  The stub and the wound
+         * children are expected to hold their own references (libglusterfs
+         * convention -- confirm).  local and frame must not be touched
+         * here: the request may already have completed.
+         */
+        if (new_xdata) {
+                dict_unref(new_xdata);
+        }
+
+        // TBD: variable Issue count
+        return 0;
+
+err:
+        if (new_xdata) {
+                dict_unref(new_xdata);
+        }
+        if (local) {
+                if (local->stub) {
+                        call_stub_destroy(local->stub);
+                }
+                if (local->fd) {
+                        fd_unref(local->fd);
+                }
+                /* Detach first: frame teardown also releases frame->local. */
+                frame->local = NULL;
+                mem_put(local);
+        }
+        STACK_UNWIND_STRICT (fallocate, frame, -1, op_errno,
+                             NULL, NULL, NULL);
+        return 0;
+}
+
+/* No code emitted for fentrylk */
+
+/* No stub needed for fgetxattr */
+
+/* No cbk needed for fgetxattr */
+
+int32_t
+nsr_fgetxattr (call_frame_t *frame, xlator_t *this,
+               fd_t * fd, const char * name, dict_t * xdata)
+{
+        nsr_private_t   *priv = this->private;
+        gf_boolean_t    in_recon = _gf_false;
+        int32_t         recon_term, recon_index;
+
+        // allow reads during reconciliation
+        // TBD: allow "dirty" reads on non-leaders
+        if (xdata &&
+            (dict_get_int32(xdata, RECON_TERM_XATTR, &recon_term) == 0) &&
+
(dict_get_int32(xdata, RECON_INDEX_XATTR, &recon_index) == 0)) { + in_recon = _gf_true; + } + + if ((!priv->leader) && (in_recon == _gf_false)) { + goto err; + } + + STACK_WIND (frame, default_fgetxattr_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->fgetxattr, + fd, name, xdata); + return 0; + +err: + STACK_UNWIND_STRICT (fgetxattr, frame, -1, EREMOTE, + NULL, NULL); + return 0; +} + +/* No code emitted for finodelk */ + +/* No code emitted for flush */ + +int32_t +nsr_fremovexattr_complete (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + dict_t * xdata) +{ +#if NSR_CG_NEED_FD + nsr_local_t *local = frame->local; +#endif + +#if NSR_CG_QUEUE + nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd); + if (ictx) { + /* TBD: LOCK */ + if (ictx->pending) { + gf_log (this->name, GF_LOG_DEBUG, + "unblocking %u requests", + ictx->pending); + /* TBD: actually dequeue */ + ictx->pending = 0; + } + /* TBD: UNLOCK */ + } +#endif + +#if NSR_CG_FSYNC + nsr_mark_fd_dirty(this,local); +#endif + +#if NSR_CG_NEED_FD + fd_unref(local->fd); +#endif + + STACK_UNWIND_STRICT (fremovexattr, frame, op_ret, op_errno, + xdata); + return 0; + +} +int32_t +nsr_fremovexattr_continue (call_frame_t *frame, xlator_t *this, + fd_t * fd, const char * name, dict_t * xdata) +{ + STACK_WIND (frame, nsr_fremovexattr_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->fremovexattr, + fd, name, xdata); + return 0; +} + +int32_t +nsr_fremovexattr_fan_in (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + dict_t * xdata) +{ + nsr_local_t *local = frame->local; + uint8_t call_count; + + gf_log (this->name, GF_LOG_TRACE, + "op_ret = %d, op_errno = %d\n", op_ret, op_errno); + + LOCK(&frame->lock); + call_count = --(local->call_count); + UNLOCK(&frame->lock); + + // TBD: variable Completion count + if (call_count == 0) { + call_resume(local->stub); + } + + return 0; +} + +int32_t +nsr_fremovexattr (call_frame_t *frame, 
xlator_t *this, + fd_t * fd, const char * name, dict_t * xdata) +{ + nsr_local_t *local = NULL; + nsr_private_t *priv = this->private; + xlator_list_t *trav; + int op_errno = ENOMEM; + int from_leader; + int from_recon; + + local = mem_get0(this->local_pool); + if (!local) { + goto err; + } +#if NSR_CG_NEED_FD + local->fd = fd_ref(fd) +#else + local->fd = NULL +#endif + frame->local = local; + + if (xdata) { + from_leader = !!dict_get(xdata,NSR_TERM_XATTR); + from_recon = !!dict_get(xdata,RECON_TERM_XATTR) + && !!dict_get(xdata,RECON_INDEX_XATTR); + } + else { + from_leader = from_recon = _gf_false; + } + + // follower/recon path + // just send it to local node + if (from_leader || from_recon) { + STACK_WIND (frame, nsr_fremovexattr_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->fremovexattr, + fd, name, xdata); + return 0; + } + + if (!priv->leader || priv->fence_io) { + op_errno = EREMOTE; + goto err; + } + +#if NSR_CG_QUEUE + nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd); + if (!ictx) { + op_errno = EIO; + goto err; + } + /* TBD: LOCK */ + if (ictx->active) { + gf_log (this->name, GF_LOG_DEBUG, + "queuing request due to conflict"); + ++(ictx->pending); + /* TBD: actually enqueue */ + } + else { + ++(ictx->active); + } + /* TBD: UNLOCK */ +#endif + + if (!xdata) { + xdata = dict_new(); + if (!xdata) { + gf_log (this->name, GF_LOG_ERROR, + "failed to allocate xdata"); + goto err; + } + } + + if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set nsr-term"); + goto err; + } + + local->stub = fop_fremovexattr_stub (frame,nsr_fremovexattr_continue, + fd, name, xdata); + if (!local->stub) { + goto err; + } + + local->call_count = priv->n_children - 1; + for (trav = this->children->next; trav; trav = trav->next) { + STACK_WIND (frame, nsr_fremovexattr_fan_in, + trav->xlator, trav->xlator->fops->fremovexattr, + fd, name, xdata); + } + + // TBD: variable Issue count + return 0; + +err: + if 
(local) { + if (local->stub) { + call_stub_destroy(local->stub); + } + if (local->fd) { + fd_unref(local->fd); + } + mem_put(local); + } + STACK_UNWIND_STRICT (fremovexattr, frame, -1, op_errno, + NULL); + return 0; +} + +int32_t +nsr_fsetattr_complete (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt * preop_stbuf, struct iatt * postop_stbuf, dict_t * xdata) +{ +#if NSR_CG_NEED_FD + nsr_local_t *local = frame->local; +#endif + +#if NSR_CG_QUEUE + nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd); + if (ictx) { + /* TBD: LOCK */ + if (ictx->pending) { + gf_log (this->name, GF_LOG_DEBUG, + "unblocking %u requests", + ictx->pending); + /* TBD: actually dequeue */ + ictx->pending = 0; + } + /* TBD: UNLOCK */ + } +#endif + +#if NSR_CG_FSYNC + nsr_mark_fd_dirty(this,local); +#endif + +#if NSR_CG_NEED_FD + fd_unref(local->fd); +#endif + + STACK_UNWIND_STRICT (fsetattr, frame, op_ret, op_errno, + preop_stbuf, postop_stbuf, xdata); + return 0; + +} +int32_t +nsr_fsetattr_continue (call_frame_t *frame, xlator_t *this, + fd_t * fd, struct iatt * stbuf, int32_t valid, dict_t * xdata) +{ + STACK_WIND (frame, nsr_fsetattr_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsetattr, + fd, stbuf, valid, xdata); + return 0; +} + +int32_t +nsr_fsetattr_fan_in (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt * preop_stbuf, struct iatt * postop_stbuf, dict_t * xdata) +{ + nsr_local_t *local = frame->local; + uint8_t call_count; + + gf_log (this->name, GF_LOG_TRACE, + "op_ret = %d, op_errno = %d\n", op_ret, op_errno); + + LOCK(&frame->lock); + call_count = --(local->call_count); + UNLOCK(&frame->lock); + + // TBD: variable Completion count + if (call_count == 0) { + call_resume(local->stub); + } + + return 0; +} + +int32_t +nsr_fsetattr (call_frame_t *frame, xlator_t *this, + fd_t * fd, struct iatt * stbuf, int32_t valid, dict_t * xdata) +{ + nsr_local_t *local = NULL; 
+ nsr_private_t *priv = this->private; + xlator_list_t *trav; + int op_errno = ENOMEM; + int from_leader; + int from_recon; + + local = mem_get0(this->local_pool); + if (!local) { + goto err; + } +#if NSR_CG_NEED_FD + local->fd = fd_ref(fd) +#else + local->fd = NULL +#endif + frame->local = local; + + if (xdata) { + from_leader = !!dict_get(xdata,NSR_TERM_XATTR); + from_recon = !!dict_get(xdata,RECON_TERM_XATTR) + && !!dict_get(xdata,RECON_INDEX_XATTR); + } + else { + from_leader = from_recon = _gf_false; + } + + // follower/recon path + // just send it to local node + if (from_leader || from_recon) { + STACK_WIND (frame, nsr_fsetattr_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsetattr, + fd, stbuf, valid, xdata); + return 0; + } + + if (!priv->leader || priv->fence_io) { + op_errno = EREMOTE; + goto err; + } + +#if NSR_CG_QUEUE + nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd); + if (!ictx) { + op_errno = EIO; + goto err; + } + /* TBD: LOCK */ + if (ictx->active) { + gf_log (this->name, GF_LOG_DEBUG, + "queuing request due to conflict"); + ++(ictx->pending); + /* TBD: actually enqueue */ + } + else { + ++(ictx->active); + } + /* TBD: UNLOCK */ +#endif + + if (!xdata) { + xdata = dict_new(); + if (!xdata) { + gf_log (this->name, GF_LOG_ERROR, + "failed to allocate xdata"); + goto err; + } + } + + if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set nsr-term"); + goto err; + } + + local->stub = fop_fsetattr_stub (frame,nsr_fsetattr_continue, + fd, stbuf, valid, xdata); + if (!local->stub) { + goto err; + } + + local->call_count = priv->n_children - 1; + for (trav = this->children->next; trav; trav = trav->next) { + STACK_WIND (frame, nsr_fsetattr_fan_in, + trav->xlator, trav->xlator->fops->fsetattr, + fd, stbuf, valid, xdata); + } + + // TBD: variable Issue count + return 0; + +err: + if (local) { + if (local->stub) { + call_stub_destroy(local->stub); + } + if (local->fd) { + 
fd_unref(local->fd); + } + mem_put(local); + } + STACK_UNWIND_STRICT (fsetattr, frame, -1, op_errno, + NULL, NULL, NULL); + return 0; +} + +int32_t +nsr_fsetxattr_complete (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + dict_t * xdata) +{ +#if NSR_CG_NEED_FD + nsr_local_t *local = frame->local; +#endif + +#if NSR_CG_QUEUE + nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd); + if (ictx) { + /* TBD: LOCK */ + if (ictx->pending) { + gf_log (this->name, GF_LOG_DEBUG, + "unblocking %u requests", + ictx->pending); + /* TBD: actually dequeue */ + ictx->pending = 0; + } + /* TBD: UNLOCK */ + } +#endif + +#if NSR_CG_FSYNC + nsr_mark_fd_dirty(this,local); +#endif + +#if NSR_CG_NEED_FD + fd_unref(local->fd); +#endif + + STACK_UNWIND_STRICT (fsetxattr, frame, op_ret, op_errno, + xdata); + return 0; + +} +int32_t +nsr_fsetxattr_continue (call_frame_t *frame, xlator_t *this, + fd_t * fd, dict_t * dict, int32_t flags, dict_t * xdata) +{ + STACK_WIND (frame, nsr_fsetxattr_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsetxattr, + fd, dict, flags, xdata); + return 0; +} + +int32_t +nsr_fsetxattr_fan_in (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + dict_t * xdata) +{ + nsr_local_t *local = frame->local; + uint8_t call_count; + + gf_log (this->name, GF_LOG_TRACE, + "op_ret = %d, op_errno = %d\n", op_ret, op_errno); + + LOCK(&frame->lock); + call_count = --(local->call_count); + UNLOCK(&frame->lock); + + // TBD: variable Completion count + if (call_count == 0) { + call_resume(local->stub); + } + + return 0; +} + +int32_t +nsr_fsetxattr (call_frame_t *frame, xlator_t *this, + fd_t * fd, dict_t * dict, int32_t flags, dict_t * xdata) +{ + nsr_local_t *local = NULL; + nsr_private_t *priv = this->private; + xlator_list_t *trav; + int op_errno = ENOMEM; + int from_leader; + int from_recon; + + local = mem_get0(this->local_pool); + if (!local) { + goto err; + } +#if NSR_CG_NEED_FD + 
local->fd = fd_ref(fd) +#else + local->fd = NULL +#endif + frame->local = local; + + if (xdata) { + from_leader = !!dict_get(xdata,NSR_TERM_XATTR); + from_recon = !!dict_get(xdata,RECON_TERM_XATTR) + && !!dict_get(xdata,RECON_INDEX_XATTR); + } + else { + from_leader = from_recon = _gf_false; + } + + // follower/recon path + // just send it to local node + if (from_leader || from_recon) { + STACK_WIND (frame, nsr_fsetxattr_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsetxattr, + fd, dict, flags, xdata); + return 0; + } + + if (!priv->leader || priv->fence_io) { + op_errno = EREMOTE; + goto err; + } + +#if NSR_CG_QUEUE + nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd); + if (!ictx) { + op_errno = EIO; + goto err; + } + /* TBD: LOCK */ + if (ictx->active) { + gf_log (this->name, GF_LOG_DEBUG, + "queuing request due to conflict"); + ++(ictx->pending); + /* TBD: actually enqueue */ + } + else { + ++(ictx->active); + } + /* TBD: UNLOCK */ +#endif + + if (!xdata) { + xdata = dict_new(); + if (!xdata) { + gf_log (this->name, GF_LOG_ERROR, + "failed to allocate xdata"); + goto err; + } + } + + if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set nsr-term"); + goto err; + } + + local->stub = fop_fsetxattr_stub (frame,nsr_fsetxattr_continue, + fd, dict, flags, xdata); + if (!local->stub) { + goto err; + } + + local->call_count = priv->n_children - 1; + for (trav = this->children->next; trav; trav = trav->next) { + STACK_WIND (frame, nsr_fsetxattr_fan_in, + trav->xlator, trav->xlator->fops->fsetxattr, + fd, dict, flags, xdata); + } + + // TBD: variable Issue count + return 0; + +err: + if (local) { + if (local->stub) { + call_stub_destroy(local->stub); + } + if (local->fd) { + fd_unref(local->fd); + } + mem_put(local); + } + STACK_UNWIND_STRICT (fsetxattr, frame, -1, op_errno, + NULL); + return 0; +} + +/* No stub needed for fstat */ + +/* No cbk needed for fstat */ + +int32_t +nsr_fstat 
(call_frame_t *frame, xlator_t *this, + fd_t * fd, dict_t * xdata) +{ + nsr_private_t *priv = this->private; + gf_boolean_t in_recon = _gf_false; + int32_t recon_term, recon_index; + + // allow reads during reconciliation + // TBD: allow "dirty" reads on non-leaders + if (xdata && + (dict_get_int32(xdata, RECON_TERM_XATTR, &recon_term) == 0) && + (dict_get_int32(xdata, RECON_INDEX_XATTR, &recon_index) == 0)) { + in_recon = _gf_true; + } + + if ((!priv->leader) && (in_recon == _gf_false)) { + goto err; + } + + STACK_WIND (frame, default_fstat_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->fstat, + fd, xdata); + return 0; + +err: + STACK_UNWIND_STRICT (fstat, frame, -1, EREMOTE, + NULL, NULL); + return 0; +} + +/* No code emitted for fsync */ + +/* No code emitted for fsyncdir */ + +int32_t +nsr_ftruncate_complete (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt * prebuf, struct iatt * postbuf, dict_t * xdata) +{ +#if NSR_CG_NEED_FD + nsr_local_t *local = frame->local; +#endif + +#if NSR_CG_QUEUE + nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd); + if (ictx) { + /* TBD: LOCK */ + if (ictx->pending) { + gf_log (this->name, GF_LOG_DEBUG, + "unblocking %u requests", + ictx->pending); + /* TBD: actually dequeue */ + ictx->pending = 0; + } + /* TBD: UNLOCK */ + } +#endif + +#if NSR_CG_FSYNC + nsr_mark_fd_dirty(this,local); +#endif + +#if NSR_CG_NEED_FD + fd_unref(local->fd); +#endif + + STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, + prebuf, postbuf, xdata); + return 0; + +} +int32_t +nsr_ftruncate_continue (call_frame_t *frame, xlator_t *this, + fd_t * fd, off_t offset, dict_t * xdata) +{ + STACK_WIND (frame, nsr_ftruncate_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->ftruncate, + fd, offset, xdata); + return 0; +} + +int32_t +nsr_ftruncate_fan_in (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt * prebuf, struct iatt * postbuf, 
dict_t * xdata) +{ + nsr_local_t *local = frame->local; + uint8_t call_count; + + gf_log (this->name, GF_LOG_TRACE, + "op_ret = %d, op_errno = %d\n", op_ret, op_errno); + + LOCK(&frame->lock); + call_count = --(local->call_count); + UNLOCK(&frame->lock); + + // TBD: variable Completion count + if (call_count == 0) { + call_resume(local->stub); + } + + return 0; +} + +int32_t +nsr_ftruncate (call_frame_t *frame, xlator_t *this, + fd_t * fd, off_t offset, dict_t * xdata) +{ + nsr_local_t *local = NULL; + nsr_private_t *priv = this->private; + xlator_list_t *trav; + int op_errno = ENOMEM; + int from_leader; + int from_recon; + + local = mem_get0(this->local_pool); + if (!local) { + goto err; + } +#if NSR_CG_NEED_FD + local->fd = fd_ref(fd) +#else + local->fd = NULL +#endif + frame->local = local; + + if (xdata) { + from_leader = !!dict_get(xdata,NSR_TERM_XATTR); + from_recon = !!dict_get(xdata,RECON_TERM_XATTR) + && !!dict_get(xdata,RECON_INDEX_XATTR); + } + else { + from_leader = from_recon = _gf_false; + } + + // follower/recon path + // just send it to local node + if (from_leader || from_recon) { + STACK_WIND (frame, nsr_ftruncate_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->ftruncate, + fd, offset, xdata); + return 0; + } + + if (!priv->leader || priv->fence_io) { + op_errno = EREMOTE; + goto err; + } + +#if NSR_CG_QUEUE + nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd); + if (!ictx) { + op_errno = EIO; + goto err; + } + /* TBD: LOCK */ + if (ictx->active) { + gf_log (this->name, GF_LOG_DEBUG, + "queuing request due to conflict"); + ++(ictx->pending); + /* TBD: actually enqueue */ + } + else { + ++(ictx->active); + } + /* TBD: UNLOCK */ +#endif + + if (!xdata) { + xdata = dict_new(); + if (!xdata) { + gf_log (this->name, GF_LOG_ERROR, + "failed to allocate xdata"); + goto err; + } + } + + if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set nsr-term"); + goto err; + } + + 
local->stub = fop_ftruncate_stub (frame,nsr_ftruncate_continue, + fd, offset, xdata); + if (!local->stub) { + goto err; + } + + local->call_count = priv->n_children - 1; + for (trav = this->children->next; trav; trav = trav->next) { + STACK_WIND (frame, nsr_ftruncate_fan_in, + trav->xlator, trav->xlator->fops->ftruncate, + fd, offset, xdata); + } + + // TBD: variable Issue count + return 0; + +err: + if (local) { + if (local->stub) { + call_stub_destroy(local->stub); + } + if (local->fd) { + fd_unref(local->fd); + } + mem_put(local); + } + STACK_UNWIND_STRICT (ftruncate, frame, -1, op_errno, + NULL, NULL, NULL); + return 0; +} + +int32_t +nsr_fxattrop_complete (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + dict_t * xattr, dict_t * xdata) +{ +#if NSR_CG_NEED_FD + nsr_local_t *local = frame->local; +#endif + +#if NSR_CG_QUEUE + nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd); + if (ictx) { + /* TBD: LOCK */ + if (ictx->pending) { + gf_log (this->name, GF_LOG_DEBUG, + "unblocking %u requests", + ictx->pending); + /* TBD: actually dequeue */ + ictx->pending = 0; + } + /* TBD: UNLOCK */ + } +#endif + +#if NSR_CG_FSYNC + nsr_mark_fd_dirty(this,local); +#endif + +#if NSR_CG_NEED_FD + fd_unref(local->fd); +#endif + + STACK_UNWIND_STRICT (fxattrop, frame, op_ret, op_errno, + xattr, xdata); + return 0; + +} +int32_t +nsr_fxattrop_continue (call_frame_t *frame, xlator_t *this, + fd_t * fd, gf_xattrop_flags_t optype, dict_t * xattr, dict_t * xdata) +{ + STACK_WIND (frame, nsr_fxattrop_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->fxattrop, + fd, optype, xattr, xdata); + return 0; +} + +int32_t +nsr_fxattrop_fan_in (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + dict_t * xattr, dict_t * xdata) +{ + nsr_local_t *local = frame->local; + uint8_t call_count; + + gf_log (this->name, GF_LOG_TRACE, + "op_ret = %d, op_errno = %d\n", op_ret, op_errno); + + LOCK(&frame->lock); + 
call_count = --(local->call_count); + UNLOCK(&frame->lock); + + // TBD: variable Completion count + if (call_count == 0) { + call_resume(local->stub); + } + + return 0; +} + +int32_t +nsr_fxattrop (call_frame_t *frame, xlator_t *this, + fd_t * fd, gf_xattrop_flags_t optype, dict_t * xattr, dict_t * xdata) +{ + nsr_local_t *local = NULL; + nsr_private_t *priv = this->private; + xlator_list_t *trav; + int op_errno = ENOMEM; + int from_leader; + int from_recon; + + local = mem_get0(this->local_pool); + if (!local) { + goto err; + } +#if NSR_CG_NEED_FD + local->fd = fd_ref(fd) +#else + local->fd = NULL +#endif + frame->local = local; + + if (xdata) { + from_leader = !!dict_get(xdata,NSR_TERM_XATTR); + from_recon = !!dict_get(xdata,RECON_TERM_XATTR) + && !!dict_get(xdata,RECON_INDEX_XATTR); + } + else { + from_leader = from_recon = _gf_false; + } + + // follower/recon path + // just send it to local node + if (from_leader || from_recon) { + STACK_WIND (frame, nsr_fxattrop_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->fxattrop, + fd, optype, xattr, xdata); + return 0; + } + + if (!priv->leader || priv->fence_io) { + op_errno = EREMOTE; + goto err; + } + +#if NSR_CG_QUEUE + nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd); + if (!ictx) { + op_errno = EIO; + goto err; + } + /* TBD: LOCK */ + if (ictx->active) { + gf_log (this->name, GF_LOG_DEBUG, + "queuing request due to conflict"); + ++(ictx->pending); + /* TBD: actually enqueue */ + } + else { + ++(ictx->active); + } + /* TBD: UNLOCK */ +#endif + + if (!xdata) { + xdata = dict_new(); + if (!xdata) { + gf_log (this->name, GF_LOG_ERROR, + "failed to allocate xdata"); + goto err; + } + } + + if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set nsr-term"); + goto err; + } + + local->stub = fop_fxattrop_stub (frame,nsr_fxattrop_continue, + fd, optype, xattr, xdata); + if (!local->stub) { + goto err; + } + + local->call_count = priv->n_children - 
1; + for (trav = this->children->next; trav; trav = trav->next) { + STACK_WIND (frame, nsr_fxattrop_fan_in, + trav->xlator, trav->xlator->fops->fxattrop, + fd, optype, xattr, xdata); + } + + // TBD: variable Issue count + return 0; + +err: + if (local) { + if (local->stub) { + call_stub_destroy(local->stub); + } + if (local->fd) { + fd_unref(local->fd); + } + mem_put(local); + } + STACK_UNWIND_STRICT (fxattrop, frame, -1, op_errno, + NULL, NULL); + return 0; +} + +/* No code emitted for getspec */ + +/* No stub needed for getxattr */ + +/* No cbk needed for getxattr */ + +int32_t +nsr_getxattr (call_frame_t *frame, xlator_t *this, + loc_t * loc, const char * name, dict_t * xdata) +{ + nsr_private_t *priv = this->private; + gf_boolean_t in_recon = _gf_false; + int32_t recon_term, recon_index; + + // allow reads during reconciliation + // TBD: allow "dirty" reads on non-leaders + if (xdata && + (dict_get_int32(xdata, RECON_TERM_XATTR, &recon_term) == 0) && + (dict_get_int32(xdata, RECON_INDEX_XATTR, &recon_index) == 0)) { + in_recon = _gf_true; + } + + if ((!priv->leader) && (in_recon == _gf_false)) { + goto err; + } + + STACK_WIND (frame, default_getxattr_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->getxattr, + loc, name, xdata); + return 0; + +err: + STACK_UNWIND_STRICT (getxattr, frame, -1, EREMOTE, + NULL, NULL); + return 0; +} + +/* No code emitted for inodelk */ + +int32_t +nsr_link_complete (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t * inode, struct iatt * buf, struct iatt * preparent, struct iatt * postparent, dict_t * xdata) +{ +#if NSR_CG_NEED_FD + nsr_local_t *local = frame->local; +#endif + +#if NSR_CG_QUEUE + nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd); + if (ictx) { + /* TBD: LOCK */ + if (ictx->pending) { + gf_log (this->name, GF_LOG_DEBUG, + "unblocking %u requests", + ictx->pending); + /* TBD: actually dequeue */ + ictx->pending = 0; + } + /* TBD: UNLOCK */ + } +#endif + +#if 
NSR_CG_FSYNC + nsr_mark_fd_dirty(this,local); +#endif + +#if NSR_CG_NEED_FD + fd_unref(local->fd); +#endif + + STACK_UNWIND_STRICT (link, frame, op_ret, op_errno, + inode, buf, preparent, postparent, xdata); + return 0; + +} +int32_t +nsr_link_continue (call_frame_t *frame, xlator_t *this, + loc_t * oldloc, loc_t * newloc, dict_t * xdata) +{ + STACK_WIND (frame, nsr_link_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->link, + oldloc, newloc, xdata); + return 0; +} + +int32_t +nsr_link_fan_in (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t * inode, struct iatt * buf, struct iatt * preparent, struct iatt * postparent, dict_t * xdata) +{ + nsr_local_t *local = frame->local; + uint8_t call_count; + + gf_log (this->name, GF_LOG_TRACE, + "op_ret = %d, op_errno = %d\n", op_ret, op_errno); + + LOCK(&frame->lock); + call_count = --(local->call_count); + UNLOCK(&frame->lock); + + // TBD: variable Completion count + if (call_count == 0) { + call_resume(local->stub); + } + + return 0; +} + +int32_t +nsr_link (call_frame_t *frame, xlator_t *this, + loc_t * oldloc, loc_t * newloc, dict_t * xdata) +{ + nsr_local_t *local = NULL; + nsr_private_t *priv = this->private; + xlator_list_t *trav; + int op_errno = ENOMEM; + int from_leader; + int from_recon; + + local = mem_get0(this->local_pool); + if (!local) { + goto err; + } +#if NSR_CG_NEED_FD + local->fd = fd_ref(fd) +#else + local->fd = NULL +#endif + frame->local = local; + + if (xdata) { + from_leader = !!dict_get(xdata,NSR_TERM_XATTR); + from_recon = !!dict_get(xdata,RECON_TERM_XATTR) + && !!dict_get(xdata,RECON_INDEX_XATTR); + } + else { + from_leader = from_recon = _gf_false; + } + + // follower/recon path + // just send it to local node + if (from_leader || from_recon) { + STACK_WIND (frame, nsr_link_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->link, + oldloc, newloc, xdata); + return 0; + } + + if (!priv->leader || priv->fence_io) { + op_errno = EREMOTE; + 
goto err; + } + +#if NSR_CG_QUEUE + nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd); + if (!ictx) { + op_errno = EIO; + goto err; + } + /* TBD: LOCK */ + if (ictx->active) { + gf_log (this->name, GF_LOG_DEBUG, + "queuing request due to conflict"); + ++(ictx->pending); + /* TBD: actually enqueue */ + } + else { + ++(ictx->active); + } + /* TBD: UNLOCK */ +#endif + + if (!xdata) { + xdata = dict_new(); + if (!xdata) { + gf_log (this->name, GF_LOG_ERROR, + "failed to allocate xdata"); + goto err; + } + } + + if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set nsr-term"); + goto err; + } + + local->stub = fop_link_stub (frame,nsr_link_continue, + oldloc, newloc, xdata); + if (!local->stub) { + goto err; + } + + local->call_count = priv->n_children - 1; + for (trav = this->children->next; trav; trav = trav->next) { + STACK_WIND (frame, nsr_link_fan_in, + trav->xlator, trav->xlator->fops->link, + oldloc, newloc, xdata); + } + + // TBD: variable Issue count + return 0; + +err: + if (local) { + if (local->stub) { + call_stub_destroy(local->stub); + } + if (local->fd) { + fd_unref(local->fd); + } + mem_put(local); + } + STACK_UNWIND_STRICT (link, frame, -1, op_errno, + NULL, NULL, NULL, NULL, NULL); + return 0; +} + +/* No code emitted for lk */ + +/* No code emitted for lookup */ + +int32_t +nsr_mkdir_complete (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t * inode, struct iatt * buf, struct iatt * preparent, struct iatt * postparent, dict_t * xdata) +{ +#if NSR_CG_NEED_FD + nsr_local_t *local = frame->local; +#endif + +#if NSR_CG_QUEUE + nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd); + if (ictx) { + /* TBD: LOCK */ + if (ictx->pending) { + gf_log (this->name, GF_LOG_DEBUG, + "unblocking %u requests", + ictx->pending); + /* TBD: actually dequeue */ + ictx->pending = 0; + } + /* TBD: UNLOCK */ + } +#endif + +#if NSR_CG_FSYNC + 
nsr_mark_fd_dirty(this,local); +#endif + +#if NSR_CG_NEED_FD + fd_unref(local->fd); +#endif + + STACK_UNWIND_STRICT (mkdir, frame, op_ret, op_errno, + inode, buf, preparent, postparent, xdata); + return 0; + +} +int32_t +nsr_mkdir_continue (call_frame_t *frame, xlator_t *this, + loc_t * loc, mode_t mode, mode_t umask, dict_t * xdata) +{ + STACK_WIND (frame, nsr_mkdir_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir, + loc, mode, umask, xdata); + return 0; +} + +int32_t +nsr_mkdir_fan_in (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t * inode, struct iatt * buf, struct iatt * preparent, struct iatt * postparent, dict_t * xdata) +{ + nsr_local_t *local = frame->local; + uint8_t call_count; + + gf_log (this->name, GF_LOG_TRACE, + "op_ret = %d, op_errno = %d\n", op_ret, op_errno); + + LOCK(&frame->lock); + call_count = --(local->call_count); + UNLOCK(&frame->lock); + + // TBD: variable Completion count + if (call_count == 0) { + call_resume(local->stub); + } + + return 0; +} + +int32_t +nsr_mkdir (call_frame_t *frame, xlator_t *this, + loc_t * loc, mode_t mode, mode_t umask, dict_t * xdata) +{ + nsr_local_t *local = NULL; + nsr_private_t *priv = this->private; + xlator_list_t *trav; + int op_errno = ENOMEM; + int from_leader; + int from_recon; + + local = mem_get0(this->local_pool); + if (!local) { + goto err; + } +#if NSR_CG_NEED_FD + local->fd = fd_ref(fd) +#else + local->fd = NULL +#endif + frame->local = local; + + if (xdata) { + from_leader = !!dict_get(xdata,NSR_TERM_XATTR); + from_recon = !!dict_get(xdata,RECON_TERM_XATTR) + && !!dict_get(xdata,RECON_INDEX_XATTR); + } + else { + from_leader = from_recon = _gf_false; + } + + // follower/recon path + // just send it to local node + if (from_leader || from_recon) { + STACK_WIND (frame, nsr_mkdir_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir, + loc, mode, umask, xdata); + return 0; + } + + if (!priv->leader || priv->fence_io) { + op_errno 
= EREMOTE; + goto err; + } + +#if NSR_CG_QUEUE + nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd); + if (!ictx) { + op_errno = EIO; + goto err; + } + /* TBD: LOCK */ + if (ictx->active) { + gf_log (this->name, GF_LOG_DEBUG, + "queuing request due to conflict"); + ++(ictx->pending); + /* TBD: actually enqueue */ + } + else { + ++(ictx->active); + } + /* TBD: UNLOCK */ +#endif + + if (!xdata) { + xdata = dict_new(); + if (!xdata) { + gf_log (this->name, GF_LOG_ERROR, + "failed to allocate xdata"); + goto err; + } + } + + if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set nsr-term"); + goto err; + } + + local->stub = fop_mkdir_stub (frame,nsr_mkdir_continue, + loc, mode, umask, xdata); + if (!local->stub) { + goto err; + } + + local->call_count = priv->n_children - 1; + for (trav = this->children->next; trav; trav = trav->next) { + STACK_WIND (frame, nsr_mkdir_fan_in, + trav->xlator, trav->xlator->fops->mkdir, + loc, mode, umask, xdata); + } + + // TBD: variable Issue count + return 0; + +err: + if (local) { + if (local->stub) { + call_stub_destroy(local->stub); + } + if (local->fd) { + fd_unref(local->fd); + } + mem_put(local); + } + STACK_UNWIND_STRICT (mkdir, frame, -1, op_errno, + NULL, NULL, NULL, NULL, NULL); + return 0; +} + +int32_t +nsr_mknod_complete (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t * inode, struct iatt * buf, struct iatt * preparent, struct iatt * postparent, dict_t * xdata) +{ +#if NSR_CG_NEED_FD + nsr_local_t *local = frame->local; +#endif + +#if NSR_CG_QUEUE + nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd); + if (ictx) { + /* TBD: LOCK */ + if (ictx->pending) { + gf_log (this->name, GF_LOG_DEBUG, + "unblocking %u requests", + ictx->pending); + /* TBD: actually dequeue */ + ictx->pending = 0; + } + /* TBD: UNLOCK */ + } +#endif + +#if NSR_CG_FSYNC + nsr_mark_fd_dirty(this,local); +#endif + +#if NSR_CG_NEED_FD 
+ fd_unref(local->fd); +#endif + + STACK_UNWIND_STRICT (mknod, frame, op_ret, op_errno, + inode, buf, preparent, postparent, xdata); + return 0; + +} +int32_t +nsr_mknod_continue (call_frame_t *frame, xlator_t *this, + loc_t * loc, mode_t mode, dev_t rdev, mode_t umask, dict_t * xdata) +{ + STACK_WIND (frame, nsr_mknod_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod, + loc, mode, rdev, umask, xdata); + return 0; +} + +int32_t +nsr_mknod_fan_in (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t * inode, struct iatt * buf, struct iatt * preparent, struct iatt * postparent, dict_t * xdata) +{ + nsr_local_t *local = frame->local; + uint8_t call_count; + + gf_log (this->name, GF_LOG_TRACE, + "op_ret = %d, op_errno = %d\n", op_ret, op_errno); + + LOCK(&frame->lock); + call_count = --(local->call_count); + UNLOCK(&frame->lock); + + // TBD: variable Completion count + if (call_count == 0) { + call_resume(local->stub); + } + + return 0; +} + +int32_t +nsr_mknod (call_frame_t *frame, xlator_t *this, + loc_t * loc, mode_t mode, dev_t rdev, mode_t umask, dict_t * xdata) +{ + nsr_local_t *local = NULL; + nsr_private_t *priv = this->private; + xlator_list_t *trav; + int op_errno = ENOMEM; + int from_leader; + int from_recon; + + local = mem_get0(this->local_pool); + if (!local) { + goto err; + } +#if NSR_CG_NEED_FD + local->fd = fd_ref(fd) +#else + local->fd = NULL +#endif + frame->local = local; + + if (xdata) { + from_leader = !!dict_get(xdata,NSR_TERM_XATTR); + from_recon = !!dict_get(xdata,RECON_TERM_XATTR) + && !!dict_get(xdata,RECON_INDEX_XATTR); + } + else { + from_leader = from_recon = _gf_false; + } + + // follower/recon path + // just send it to local node + if (from_leader || from_recon) { + STACK_WIND (frame, nsr_mknod_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod, + loc, mode, rdev, umask, xdata); + return 0; + } + + if (!priv->leader || priv->fence_io) { + op_errno = EREMOTE; + goto err; + 
} + +#if NSR_CG_QUEUE + nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd); + if (!ictx) { + op_errno = EIO; + goto err; + } + /* TBD: LOCK */ + if (ictx->active) { + gf_log (this->name, GF_LOG_DEBUG, + "queuing request due to conflict"); + ++(ictx->pending); + /* TBD: actually enqueue */ + } + else { + ++(ictx->active); + } + /* TBD: UNLOCK */ +#endif + + if (!xdata) { + xdata = dict_new(); + if (!xdata) { + gf_log (this->name, GF_LOG_ERROR, + "failed to allocate xdata"); + goto err; + } + } + + if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set nsr-term"); + goto err; + } + + local->stub = fop_mknod_stub (frame,nsr_mknod_continue, + loc, mode, rdev, umask, xdata); + if (!local->stub) { + goto err; + } + + local->call_count = priv->n_children - 1; + for (trav = this->children->next; trav; trav = trav->next) { + STACK_WIND (frame, nsr_mknod_fan_in, + trav->xlator, trav->xlator->fops->mknod, + loc, mode, rdev, umask, xdata); + } + + // TBD: variable Issue count + return 0; + +err: + if (local) { + if (local->stub) { + call_stub_destroy(local->stub); + } + if (local->fd) { + fd_unref(local->fd); + } + mem_put(local); + } + STACK_UNWIND_STRICT (mknod, frame, -1, op_errno, + NULL, NULL, NULL, NULL, NULL); + return 0; +} + +int32_t +nsr_open_complete (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + fd_t * fd, dict_t * xdata) +{ +#if NSR_CG_NEED_FD + nsr_local_t *local = frame->local; +#endif + +#if NSR_CG_QUEUE + nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd); + if (ictx) { + /* TBD: LOCK */ + if (ictx->pending) { + gf_log (this->name, GF_LOG_DEBUG, + "unblocking %u requests", + ictx->pending); + /* TBD: actually dequeue */ + ictx->pending = 0; + } + /* TBD: UNLOCK */ + } +#endif + +#if NSR_CG_FSYNC + nsr_mark_fd_dirty(this,local); +#endif + +#if NSR_CG_NEED_FD + fd_unref(local->fd); +#endif + + STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, + 
fd, xdata); + return 0; + +} +int32_t +nsr_open_continue (call_frame_t *frame, xlator_t *this, + loc_t * loc, int32_t flags, fd_t * fd, dict_t * xdata) +{ + STACK_WIND (frame, nsr_open_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->open, + loc, flags, fd, xdata); + return 0; +} + +int32_t +nsr_open_fan_in (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + fd_t * fd, dict_t * xdata) +{ + nsr_local_t *local = frame->local; + uint8_t call_count; + + gf_log (this->name, GF_LOG_TRACE, + "op_ret = %d, op_errno = %d\n", op_ret, op_errno); + + LOCK(&frame->lock); + call_count = --(local->call_count); + UNLOCK(&frame->lock); + + // TBD: variable Completion count + if (call_count == 0) { + call_resume(local->stub); + } + + return 0; +} + +int32_t +nsr_open (call_frame_t *frame, xlator_t *this, + loc_t * loc, int32_t flags, fd_t * fd, dict_t * xdata) +{ + nsr_local_t *local = NULL; + nsr_private_t *priv = this->private; + xlator_list_t *trav; + int op_errno = ENOMEM; + int from_leader; + int from_recon; + + local = mem_get0(this->local_pool); + if (!local) { + goto err; + } +#if NSR_CG_NEED_FD + local->fd = fd_ref(fd) +#else + local->fd = NULL +#endif + frame->local = local; + + if (xdata) { + from_leader = !!dict_get(xdata,NSR_TERM_XATTR); + from_recon = !!dict_get(xdata,RECON_TERM_XATTR) + && !!dict_get(xdata,RECON_INDEX_XATTR); + } + else { + from_leader = from_recon = _gf_false; + } + + // follower/recon path + // just send it to local node + if (from_leader || from_recon) { + STACK_WIND (frame, nsr_open_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->open, + loc, flags, fd, xdata); + return 0; + } + + if (!priv->leader || priv->fence_io) { + op_errno = EREMOTE; + goto err; + } + +#if NSR_CG_QUEUE + nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd); + if (!ictx) { + op_errno = EIO; + goto err; + } + /* TBD: LOCK */ + if (ictx->active) { + gf_log (this->name, GF_LOG_DEBUG, + "queuing request due to conflict"); + 
++(ictx->pending); + /* TBD: actually enqueue */ + } + else { + ++(ictx->active); + } + /* TBD: UNLOCK */ +#endif + + if (!xdata) { + xdata = dict_new(); + if (!xdata) { + gf_log (this->name, GF_LOG_ERROR, + "failed to allocate xdata"); + goto err; + } + } + + if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set nsr-term"); + goto err; + } + + local->stub = fop_open_stub (frame,nsr_open_continue, + loc, flags, fd, xdata); + if (!local->stub) { + goto err; + } + + local->call_count = priv->n_children - 1; + for (trav = this->children->next; trav; trav = trav->next) { + STACK_WIND (frame, nsr_open_fan_in, + trav->xlator, trav->xlator->fops->open, + loc, flags, fd, xdata); + } + + // TBD: variable Issue count + return 0; + +err: + if (local) { + if (local->stub) { + call_stub_destroy(local->stub); + } + if (local->fd) { + fd_unref(local->fd); + } + mem_put(local); + } + STACK_UNWIND_STRICT (open, frame, -1, op_errno, + NULL, NULL); + return 0; +} + +/* No stub needed for opendir */ + +/* No cbk needed for opendir */ + +int32_t +nsr_opendir (call_frame_t *frame, xlator_t *this, + loc_t * loc, fd_t * fd, dict_t * xdata) +{ + nsr_private_t *priv = this->private; + gf_boolean_t in_recon = _gf_false; + int32_t recon_term, recon_index; + + // allow reads during reconciliation + // TBD: allow "dirty" reads on non-leaders + if (xdata && + (dict_get_int32(xdata, RECON_TERM_XATTR, &recon_term) == 0) && + (dict_get_int32(xdata, RECON_INDEX_XATTR, &recon_index) == 0)) { + in_recon = _gf_true; + } + + if ((!priv->leader) && (in_recon == _gf_false)) { + goto err; + } + + STACK_WIND (frame, default_opendir_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->opendir, + loc, fd, xdata); + return 0; + +err: + STACK_UNWIND_STRICT (opendir, frame, -1, EREMOTE, + NULL, NULL); + return 0; +} + +/* No stub needed for rchecksum */ + +/* No cbk needed for rchecksum */ + +int32_t +nsr_rchecksum (call_frame_t *frame, xlator_t 
*this, + fd_t * fd, off_t offset, int32_t len, dict_t * xdata) +{ + nsr_private_t *priv = this->private; + gf_boolean_t in_recon = _gf_false; + int32_t recon_term, recon_index; + + // allow reads during reconciliation + // TBD: allow "dirty" reads on non-leaders + if (xdata && + (dict_get_int32(xdata, RECON_TERM_XATTR, &recon_term) == 0) && + (dict_get_int32(xdata, RECON_INDEX_XATTR, &recon_index) == 0)) { + in_recon = _gf_true; + } + + if ((!priv->leader) && (in_recon == _gf_false)) { + goto err; + } + + STACK_WIND (frame, default_rchecksum_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->rchecksum, + fd, offset, len, xdata); + return 0; + +err: + STACK_UNWIND_STRICT (rchecksum, frame, -1, EREMOTE, + 0, NULL, NULL); + return 0; +} + +/* No stub needed for readdir */ + +/* No cbk needed for readdir */ + +int32_t +nsr_readdir (call_frame_t *frame, xlator_t *this, + fd_t * fd, size_t size, off_t offset, dict_t * xdata) +{ + nsr_private_t *priv = this->private; + gf_boolean_t in_recon = _gf_false; + int32_t recon_term, recon_index; + + // allow reads during reconciliation + // TBD: allow "dirty" reads on non-leaders + if (xdata && + (dict_get_int32(xdata, RECON_TERM_XATTR, &recon_term) == 0) && + (dict_get_int32(xdata, RECON_INDEX_XATTR, &recon_index) == 0)) { + in_recon = _gf_true; + } + + if ((!priv->leader) && (in_recon == _gf_false)) { + goto err; + } + + STACK_WIND (frame, default_readdir_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdir, + fd, size, offset, xdata); + return 0; + +err: + STACK_UNWIND_STRICT (readdir, frame, -1, EREMOTE, + NULL, NULL); + return 0; +} + +/* No stub needed for readdirp */ + +/* No cbk needed for readdirp */ + +int32_t +nsr_readdirp (call_frame_t *frame, xlator_t *this, + fd_t * fd, size_t size, off_t offset, dict_t * xdata) +{ + nsr_private_t *priv = this->private; + gf_boolean_t in_recon = _gf_false; + int32_t recon_term, recon_index; + + // allow reads during reconciliation + // TBD: allow "dirty" reads on non-leaders + 
if (xdata && + (dict_get_int32(xdata, RECON_TERM_XATTR, &recon_term) == 0) && + (dict_get_int32(xdata, RECON_INDEX_XATTR, &recon_index) == 0)) { + in_recon = _gf_true; + } + + if ((!priv->leader) && (in_recon == _gf_false)) { + goto err; + } + + STACK_WIND (frame, default_readdirp_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdirp, + fd, size, offset, xdata); + return 0; + +err: + STACK_UNWIND_STRICT (readdirp, frame, -1, EREMOTE, + NULL, NULL); + return 0; +} + +/* No stub needed for readlink */ + +/* No cbk needed for readlink */ + +int32_t +nsr_readlink (call_frame_t *frame, xlator_t *this, + loc_t * loc, size_t size, dict_t * xdata) +{ + nsr_private_t *priv = this->private; + gf_boolean_t in_recon = _gf_false; + int32_t recon_term, recon_index; + + // allow reads during reconciliation + // TBD: allow "dirty" reads on non-leaders + if (xdata && + (dict_get_int32(xdata, RECON_TERM_XATTR, &recon_term) == 0) && + (dict_get_int32(xdata, RECON_INDEX_XATTR, &recon_index) == 0)) { + in_recon = _gf_true; + } + + if ((!priv->leader) && (in_recon == _gf_false)) { + goto err; + } + + STACK_WIND (frame, default_readlink_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->readlink, + loc, size, xdata); + return 0; + +err: + STACK_UNWIND_STRICT (readlink, frame, -1, EREMOTE, + NULL, NULL, NULL); + return 0; +} + +/* No stub needed for readv */ + +/* No cbk needed for readv */ + +int32_t +nsr_readv (call_frame_t *frame, xlator_t *this, + fd_t * fd, size_t size, off_t offset, uint32_t flags, dict_t * xdata) +{ + nsr_private_t *priv = this->private; + gf_boolean_t in_recon = _gf_false; + int32_t recon_term, recon_index; + + // allow reads during reconciliation + // TBD: allow "dirty" reads on non-leaders + if (xdata && + (dict_get_int32(xdata, RECON_TERM_XATTR, &recon_term) == 0) && + (dict_get_int32(xdata, RECON_INDEX_XATTR, &recon_index) == 0)) { + in_recon = _gf_true; + } + + if ((!priv->leader) && (in_recon == _gf_false)) { + goto err; + } + + STACK_WIND (frame, 
default_readv_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->readv, + fd, size, offset, flags, xdata); + return 0; + +err: + STACK_UNWIND_STRICT (readv, frame, -1, EREMOTE, + NULL, 0, NULL, NULL, NULL); + return 0; +} + +int32_t +nsr_removexattr_complete (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + dict_t * xdata) +{ +#if NSR_CG_NEED_FD + nsr_local_t *local = frame->local; +#endif + +#if NSR_CG_QUEUE + nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd); + if (ictx) { + /* TBD: LOCK */ + if (ictx->pending) { + gf_log (this->name, GF_LOG_DEBUG, + "unblocking %u requests", + ictx->pending); + /* TBD: actually dequeue */ + ictx->pending = 0; + } + /* TBD: UNLOCK */ + } +#endif + +#if NSR_CG_FSYNC + nsr_mark_fd_dirty(this,local); +#endif + +#if NSR_CG_NEED_FD + fd_unref(local->fd); +#endif + + STACK_UNWIND_STRICT (removexattr, frame, op_ret, op_errno, + xdata); + return 0; + +} +int32_t +nsr_removexattr_continue (call_frame_t *frame, xlator_t *this, + loc_t * loc, const char * name, dict_t * xdata) +{ + STACK_WIND (frame, nsr_removexattr_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->removexattr, + loc, name, xdata); + return 0; +} + +int32_t +nsr_removexattr_fan_in (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + dict_t * xdata) +{ + nsr_local_t *local = frame->local; + uint8_t call_count; + + gf_log (this->name, GF_LOG_TRACE, + "op_ret = %d, op_errno = %d\n", op_ret, op_errno); + + LOCK(&frame->lock); + call_count = --(local->call_count); + UNLOCK(&frame->lock); + + // TBD: variable Completion count + if (call_count == 0) { + call_resume(local->stub); + } + + return 0; +} + +int32_t +nsr_removexattr (call_frame_t *frame, xlator_t *this, + loc_t * loc, const char * name, dict_t * xdata) +{ + nsr_local_t *local = NULL; + nsr_private_t *priv = this->private; + xlator_list_t *trav; + int op_errno = ENOMEM; + int from_leader; + int from_recon; + + local = 
mem_get0(this->local_pool); + if (!local) { + goto err; + } +#if NSR_CG_NEED_FD + local->fd = fd_ref(fd) +#else + local->fd = NULL +#endif + frame->local = local; + + if (xdata) { + from_leader = !!dict_get(xdata,NSR_TERM_XATTR); + from_recon = !!dict_get(xdata,RECON_TERM_XATTR) + && !!dict_get(xdata,RECON_INDEX_XATTR); + } + else { + from_leader = from_recon = _gf_false; + } + + // follower/recon path + // just send it to local node + if (from_leader || from_recon) { + STACK_WIND (frame, nsr_removexattr_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->removexattr, + loc, name, xdata); + return 0; + } + + if (!priv->leader || priv->fence_io) { + op_errno = EREMOTE; + goto err; + } + +#if NSR_CG_QUEUE + nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd); + if (!ictx) { + op_errno = EIO; + goto err; + } + /* TBD: LOCK */ + if (ictx->active) { + gf_log (this->name, GF_LOG_DEBUG, + "queuing request due to conflict"); + ++(ictx->pending); + /* TBD: actually enqueue */ + } + else { + ++(ictx->active); + } + /* TBD: UNLOCK */ +#endif + + if (!xdata) { + xdata = dict_new(); + if (!xdata) { + gf_log (this->name, GF_LOG_ERROR, + "failed to allocate xdata"); + goto err; + } + } + + if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set nsr-term"); + goto err; + } + + local->stub = fop_removexattr_stub (frame,nsr_removexattr_continue, + loc, name, xdata); + if (!local->stub) { + goto err; + } + + local->call_count = priv->n_children - 1; + for (trav = this->children->next; trav; trav = trav->next) { + STACK_WIND (frame, nsr_removexattr_fan_in, + trav->xlator, trav->xlator->fops->removexattr, + loc, name, xdata); + } + + // TBD: variable Issue count + return 0; + +err: + if (local) { + if (local->stub) { + call_stub_destroy(local->stub); + } + if (local->fd) { + fd_unref(local->fd); + } + mem_put(local); + } + STACK_UNWIND_STRICT (removexattr, frame, -1, op_errno, + NULL); + return 0; +} + +int32_t 
+nsr_rename_complete (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt * buf, struct iatt * preoldparent, struct iatt * postoldparent, struct iatt * prenewparent, struct iatt * postnewparent, dict_t * xdata) +{ +#if NSR_CG_NEED_FD + nsr_local_t *local = frame->local; +#endif + +#if NSR_CG_QUEUE + nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd); + if (ictx) { + /* TBD: LOCK */ + if (ictx->pending) { + gf_log (this->name, GF_LOG_DEBUG, + "unblocking %u requests", + ictx->pending); + /* TBD: actually dequeue */ + ictx->pending = 0; + } + /* TBD: UNLOCK */ + } +#endif + +#if NSR_CG_FSYNC + nsr_mark_fd_dirty(this,local); +#endif + +#if NSR_CG_NEED_FD + fd_unref(local->fd); +#endif + + STACK_UNWIND_STRICT (rename, frame, op_ret, op_errno, + buf, preoldparent, postoldparent, prenewparent, postnewparent, xdata); + return 0; + +} +int32_t +nsr_rename_continue (call_frame_t *frame, xlator_t *this, + loc_t * oldloc, loc_t * newloc, dict_t * xdata) +{ + STACK_WIND (frame, nsr_rename_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->rename, + oldloc, newloc, xdata); + return 0; +} + +int32_t +nsr_rename_fan_in (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt * buf, struct iatt * preoldparent, struct iatt * postoldparent, struct iatt * prenewparent, struct iatt * postnewparent, dict_t * xdata) +{ + nsr_local_t *local = frame->local; + uint8_t call_count; + + gf_log (this->name, GF_LOG_TRACE, + "op_ret = %d, op_errno = %d\n", op_ret, op_errno); + + LOCK(&frame->lock); + call_count = --(local->call_count); + UNLOCK(&frame->lock); + + // TBD: variable Completion count + if (call_count == 0) { + call_resume(local->stub); + } + + return 0; +} + +int32_t +nsr_rename (call_frame_t *frame, xlator_t *this, + loc_t * oldloc, loc_t * newloc, dict_t * xdata) +{ + nsr_local_t *local = NULL; + nsr_private_t *priv = this->private; + xlator_list_t *trav; + int op_errno = 
ENOMEM; + int from_leader; + int from_recon; + + local = mem_get0(this->local_pool); + if (!local) { + goto err; + } +#if NSR_CG_NEED_FD + local->fd = fd_ref(fd) +#else + local->fd = NULL +#endif + frame->local = local; + + if (xdata) { + from_leader = !!dict_get(xdata,NSR_TERM_XATTR); + from_recon = !!dict_get(xdata,RECON_TERM_XATTR) + && !!dict_get(xdata,RECON_INDEX_XATTR); + } + else { + from_leader = from_recon = _gf_false; + } + + // follower/recon path + // just send it to local node + if (from_leader || from_recon) { + STACK_WIND (frame, nsr_rename_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->rename, + oldloc, newloc, xdata); + return 0; + } + + if (!priv->leader || priv->fence_io) { + op_errno = EREMOTE; + goto err; + } + +#if NSR_CG_QUEUE + nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd); + if (!ictx) { + op_errno = EIO; + goto err; + } + /* TBD: LOCK */ + if (ictx->active) { + gf_log (this->name, GF_LOG_DEBUG, + "queuing request due to conflict"); + ++(ictx->pending); + /* TBD: actually enqueue */ + } + else { + ++(ictx->active); + } + /* TBD: UNLOCK */ +#endif + + if (!xdata) { + xdata = dict_new(); + if (!xdata) { + gf_log (this->name, GF_LOG_ERROR, + "failed to allocate xdata"); + goto err; + } + } + + if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set nsr-term"); + goto err; + } + + local->stub = fop_rename_stub (frame,nsr_rename_continue, + oldloc, newloc, xdata); + if (!local->stub) { + goto err; + } + + local->call_count = priv->n_children - 1; + for (trav = this->children->next; trav; trav = trav->next) { + STACK_WIND (frame, nsr_rename_fan_in, + trav->xlator, trav->xlator->fops->rename, + oldloc, newloc, xdata); + } + + // TBD: variable Issue count + return 0; + +err: + if (local) { + if (local->stub) { + call_stub_destroy(local->stub); + } + if (local->fd) { + fd_unref(local->fd); + } + mem_put(local); + } + STACK_UNWIND_STRICT (rename, frame, -1, op_errno, + 
NULL, NULL, NULL, NULL, NULL, NULL); + return 0; +} + +int32_t +nsr_rmdir_complete (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt * preparent, struct iatt * postparent, dict_t * xdata) +{ +#if NSR_CG_NEED_FD + nsr_local_t *local = frame->local; +#endif + +#if NSR_CG_QUEUE + nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd); + if (ictx) { + /* TBD: LOCK */ + if (ictx->pending) { + gf_log (this->name, GF_LOG_DEBUG, + "unblocking %u requests", + ictx->pending); + /* TBD: actually dequeue */ + ictx->pending = 0; + } + /* TBD: UNLOCK */ + } +#endif + +#if NSR_CG_FSYNC + nsr_mark_fd_dirty(this,local); +#endif + +#if NSR_CG_NEED_FD + fd_unref(local->fd); +#endif + + STACK_UNWIND_STRICT (rmdir, frame, op_ret, op_errno, + preparent, postparent, xdata); + return 0; + +} +int32_t +nsr_rmdir_continue (call_frame_t *frame, xlator_t *this, + loc_t * loc, int xflags, dict_t * xdata) +{ + STACK_WIND (frame, nsr_rmdir_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->rmdir, + loc, xflags, xdata); + return 0; +} + +int32_t +nsr_rmdir_fan_in (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt * preparent, struct iatt * postparent, dict_t * xdata) +{ + nsr_local_t *local = frame->local; + uint8_t call_count; + + gf_log (this->name, GF_LOG_TRACE, + "op_ret = %d, op_errno = %d\n", op_ret, op_errno); + + LOCK(&frame->lock); + call_count = --(local->call_count); + UNLOCK(&frame->lock); + + // TBD: variable Completion count + if (call_count == 0) { + call_resume(local->stub); + } + + return 0; +} + +int32_t +nsr_rmdir (call_frame_t *frame, xlator_t *this, + loc_t * loc, int xflags, dict_t * xdata) +{ + nsr_local_t *local = NULL; + nsr_private_t *priv = this->private; + xlator_list_t *trav; + int op_errno = ENOMEM; + int from_leader; + int from_recon; + + local = mem_get0(this->local_pool); + if (!local) { + goto err; + } +#if NSR_CG_NEED_FD + local->fd = fd_ref(fd) +#else 
+ local->fd = NULL +#endif + frame->local = local; + + if (xdata) { + from_leader = !!dict_get(xdata,NSR_TERM_XATTR); + from_recon = !!dict_get(xdata,RECON_TERM_XATTR) + && !!dict_get(xdata,RECON_INDEX_XATTR); + } + else { + from_leader = from_recon = _gf_false; + } + + // follower/recon path + // just send it to local node + if (from_leader || from_recon) { + STACK_WIND (frame, nsr_rmdir_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->rmdir, + loc, xflags, xdata); + return 0; + } + + if (!priv->leader || priv->fence_io) { + op_errno = EREMOTE; + goto err; + } + +#if NSR_CG_QUEUE + nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd); + if (!ictx) { + op_errno = EIO; + goto err; + } + /* TBD: LOCK */ + if (ictx->active) { + gf_log (this->name, GF_LOG_DEBUG, + "queuing request due to conflict"); + ++(ictx->pending); + /* TBD: actually enqueue */ + } + else { + ++(ictx->active); + } + /* TBD: UNLOCK */ +#endif + + if (!xdata) { + xdata = dict_new(); + if (!xdata) { + gf_log (this->name, GF_LOG_ERROR, + "failed to allocate xdata"); + goto err; + } + } + + if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set nsr-term"); + goto err; + } + + local->stub = fop_rmdir_stub (frame,nsr_rmdir_continue, + loc, xflags, xdata); + if (!local->stub) { + goto err; + } + + local->call_count = priv->n_children - 1; + for (trav = this->children->next; trav; trav = trav->next) { + STACK_WIND (frame, nsr_rmdir_fan_in, + trav->xlator, trav->xlator->fops->rmdir, + loc, xflags, xdata); + } + + // TBD: variable Issue count + return 0; + +err: + if (local) { + if (local->stub) { + call_stub_destroy(local->stub); + } + if (local->fd) { + fd_unref(local->fd); + } + mem_put(local); + } + STACK_UNWIND_STRICT (rmdir, frame, -1, op_errno, + NULL, NULL, NULL); + return 0; +} + +int32_t +nsr_setattr_complete (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt * preop_stbuf, 
struct iatt * postop_stbuf, dict_t * xdata) +{ +#if NSR_CG_NEED_FD + nsr_local_t *local = frame->local; +#endif + +#if NSR_CG_QUEUE + nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd); + if (ictx) { + /* TBD: LOCK */ + if (ictx->pending) { + gf_log (this->name, GF_LOG_DEBUG, + "unblocking %u requests", + ictx->pending); + /* TBD: actually dequeue */ + ictx->pending = 0; + } + /* TBD: UNLOCK */ + } +#endif + +#if NSR_CG_FSYNC + nsr_mark_fd_dirty(this,local); +#endif + +#if NSR_CG_NEED_FD + fd_unref(local->fd); +#endif + + STACK_UNWIND_STRICT (setattr, frame, op_ret, op_errno, + preop_stbuf, postop_stbuf, xdata); + return 0; + +} +int32_t +nsr_setattr_continue (call_frame_t *frame, xlator_t *this, + loc_t * loc, struct iatt * stbuf, int32_t valid, dict_t * xdata) +{ + STACK_WIND (frame, nsr_setattr_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->setattr, + loc, stbuf, valid, xdata); + return 0; +} + +int32_t +nsr_setattr_fan_in (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt * preop_stbuf, struct iatt * postop_stbuf, dict_t * xdata) +{ + nsr_local_t *local = frame->local; + uint8_t call_count; + + gf_log (this->name, GF_LOG_TRACE, + "op_ret = %d, op_errno = %d\n", op_ret, op_errno); + + LOCK(&frame->lock); + call_count = --(local->call_count); + UNLOCK(&frame->lock); + + // TBD: variable Completion count + if (call_count == 0) { + call_resume(local->stub); + } + + return 0; +} + +int32_t +nsr_setattr (call_frame_t *frame, xlator_t *this, + loc_t * loc, struct iatt * stbuf, int32_t valid, dict_t * xdata) +{ + nsr_local_t *local = NULL; + nsr_private_t *priv = this->private; + xlator_list_t *trav; + int op_errno = ENOMEM; + int from_leader; + int from_recon; + + local = mem_get0(this->local_pool); + if (!local) { + goto err; + } +#if NSR_CG_NEED_FD + local->fd = fd_ref(fd) +#else + local->fd = NULL +#endif + frame->local = local; + + if (xdata) { + from_leader = !!dict_get(xdata,NSR_TERM_XATTR); + 
from_recon = !!dict_get(xdata,RECON_TERM_XATTR) + && !!dict_get(xdata,RECON_INDEX_XATTR); + } + else { + from_leader = from_recon = _gf_false; + } + + // follower/recon path + // just send it to local node + if (from_leader || from_recon) { + STACK_WIND (frame, nsr_setattr_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->setattr, + loc, stbuf, valid, xdata); + return 0; + } + + if (!priv->leader || priv->fence_io) { + op_errno = EREMOTE; + goto err; + } + +#if NSR_CG_QUEUE + nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd); + if (!ictx) { + op_errno = EIO; + goto err; + } + /* TBD: LOCK */ + if (ictx->active) { + gf_log (this->name, GF_LOG_DEBUG, + "queuing request due to conflict"); + ++(ictx->pending); + /* TBD: actually enqueue */ + } + else { + ++(ictx->active); + } + /* TBD: UNLOCK */ +#endif + + if (!xdata) { + xdata = dict_new(); + if (!xdata) { + gf_log (this->name, GF_LOG_ERROR, + "failed to allocate xdata"); + goto err; + } + } + + if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set nsr-term"); + goto err; + } + + local->stub = fop_setattr_stub (frame,nsr_setattr_continue, + loc, stbuf, valid, xdata); + if (!local->stub) { + goto err; + } + + local->call_count = priv->n_children - 1; + for (trav = this->children->next; trav; trav = trav->next) { + STACK_WIND (frame, nsr_setattr_fan_in, + trav->xlator, trav->xlator->fops->setattr, + loc, stbuf, valid, xdata); + } + + // TBD: variable Issue count + return 0; + +err: + if (local) { + if (local->stub) { + call_stub_destroy(local->stub); + } + if (local->fd) { + fd_unref(local->fd); + } + mem_put(local); + } + STACK_UNWIND_STRICT (setattr, frame, -1, op_errno, + NULL, NULL, NULL); + return 0; +} + +int32_t +nsr_setxattr_complete (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + dict_t * xdata) +{ +#if NSR_CG_NEED_FD + nsr_local_t *local = frame->local; +#endif + +#if NSR_CG_QUEUE + 
nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd); + if (ictx) { + /* TBD: LOCK */ + if (ictx->pending) { + gf_log (this->name, GF_LOG_DEBUG, + "unblocking %u requests", + ictx->pending); + /* TBD: actually dequeue */ + ictx->pending = 0; + } + /* TBD: UNLOCK */ + } +#endif + +#if NSR_CG_FSYNC + nsr_mark_fd_dirty(this,local); +#endif + +#if NSR_CG_NEED_FD + fd_unref(local->fd); +#endif + + STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, + xdata); + return 0; + +} +int32_t +nsr_setxattr_continue (call_frame_t *frame, xlator_t *this, + loc_t * loc, dict_t * dict, int32_t flags, dict_t * xdata) +{ + STACK_WIND (frame, nsr_setxattr_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->setxattr, + loc, dict, flags, xdata); + return 0; +} + +int32_t +nsr_setxattr_fan_in (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + dict_t * xdata) +{ + nsr_local_t *local = frame->local; + uint8_t call_count; + + gf_log (this->name, GF_LOG_TRACE, + "op_ret = %d, op_errno = %d\n", op_ret, op_errno); + + LOCK(&frame->lock); + call_count = --(local->call_count); + UNLOCK(&frame->lock); + + // TBD: variable Completion count + if (call_count == 0) { + call_resume(local->stub); + } + + return 0; +} + +int32_t +nsr_setxattr (call_frame_t *frame, xlator_t *this, + loc_t * loc, dict_t * dict, int32_t flags, dict_t * xdata) +{ + nsr_local_t *local = NULL; + nsr_private_t *priv = this->private; + xlator_list_t *trav; + int op_errno = ENOMEM; + int from_leader; + int from_recon; + + local = mem_get0(this->local_pool); + if (!local) { + goto err; + } +#if NSR_CG_NEED_FD + local->fd = fd_ref(fd) +#else + local->fd = NULL +#endif + frame->local = local; + + if (xdata) { + from_leader = !!dict_get(xdata,NSR_TERM_XATTR); + from_recon = !!dict_get(xdata,RECON_TERM_XATTR) + && !!dict_get(xdata,RECON_INDEX_XATTR); + } + else { + from_leader = from_recon = _gf_false; + } + + // follower/recon path + // just send it to local node + if (from_leader 
|| from_recon) { + STACK_WIND (frame, nsr_setxattr_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->setxattr, + loc, dict, flags, xdata); + return 0; + } + + if (!priv->leader || priv->fence_io) { + op_errno = EREMOTE; + goto err; + } + +#if NSR_CG_QUEUE + nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd); + if (!ictx) { + op_errno = EIO; + goto err; + } + /* TBD: LOCK */ + if (ictx->active) { + gf_log (this->name, GF_LOG_DEBUG, + "queuing request due to conflict"); + ++(ictx->pending); + /* TBD: actually enqueue */ + } + else { + ++(ictx->active); + } + /* TBD: UNLOCK */ +#endif + + if (!xdata) { + xdata = dict_new(); + if (!xdata) { + gf_log (this->name, GF_LOG_ERROR, + "failed to allocate xdata"); + goto err; + } + } + + if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set nsr-term"); + goto err; + } + + local->stub = fop_setxattr_stub (frame,nsr_setxattr_continue, + loc, dict, flags, xdata); + if (!local->stub) { + goto err; + } + + local->call_count = priv->n_children - 1; + for (trav = this->children->next; trav; trav = trav->next) { + STACK_WIND (frame, nsr_setxattr_fan_in, + trav->xlator, trav->xlator->fops->setxattr, + loc, dict, flags, xdata); + } + + // TBD: variable Issue count + return 0; + +err: + if (local) { + if (local->stub) { + call_stub_destroy(local->stub); + } + if (local->fd) { + fd_unref(local->fd); + } + mem_put(local); + } + STACK_UNWIND_STRICT (setxattr, frame, -1, op_errno, + NULL); + return 0; +} + +/* No stub needed for stat */ + +/* No cbk needed for stat */ + +int32_t +nsr_stat (call_frame_t *frame, xlator_t *this, + loc_t * loc, dict_t * xdata) +{ + nsr_private_t *priv = this->private; + gf_boolean_t in_recon = _gf_false; + int32_t recon_term, recon_index; + + // allow reads during reconciliation + // TBD: allow "dirty" reads on non-leaders + if (xdata && + (dict_get_int32(xdata, RECON_TERM_XATTR, &recon_term) == 0) && + (dict_get_int32(xdata, 
RECON_INDEX_XATTR, &recon_index) == 0)) { + in_recon = _gf_true; + } + + if ((!priv->leader) && (in_recon == _gf_false)) { + goto err; + } + + STACK_WIND (frame, default_stat_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->stat, + loc, xdata); + return 0; + +err: + STACK_UNWIND_STRICT (stat, frame, -1, EREMOTE, + NULL, NULL); + return 0; +} + +/* No stub needed for statfs */ + +/* No cbk needed for statfs */ + +int32_t +nsr_statfs (call_frame_t *frame, xlator_t *this, + loc_t * loc, dict_t * xdata) +{ + nsr_private_t *priv = this->private; + gf_boolean_t in_recon = _gf_false; + int32_t recon_term, recon_index; + + // allow reads during reconciliation + // TBD: allow "dirty" reads on non-leaders + if (xdata && + (dict_get_int32(xdata, RECON_TERM_XATTR, &recon_term) == 0) && + (dict_get_int32(xdata, RECON_INDEX_XATTR, &recon_index) == 0)) { + in_recon = _gf_true; + } + + if ((!priv->leader) && (in_recon == _gf_false)) { + goto err; + } + + STACK_WIND (frame, default_statfs_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->statfs, + loc, xdata); + return 0; + +err: + STACK_UNWIND_STRICT (statfs, frame, -1, EREMOTE, + NULL, NULL); + return 0; +} + +int32_t +nsr_symlink_complete (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t * inode, struct iatt * buf, struct iatt * preparent, struct iatt * postparent, dict_t * xdata) +{ +#if NSR_CG_NEED_FD + nsr_local_t *local = frame->local; +#endif + +#if NSR_CG_QUEUE + nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd); + if (ictx) { + /* TBD: LOCK */ + if (ictx->pending) { + gf_log (this->name, GF_LOG_DEBUG, + "unblocking %u requests", + ictx->pending); + /* TBD: actually dequeue */ + ictx->pending = 0; + } + /* TBD: UNLOCK */ + } +#endif + +#if NSR_CG_FSYNC + nsr_mark_fd_dirty(this,local); +#endif + +#if NSR_CG_NEED_FD + fd_unref(local->fd); +#endif + + STACK_UNWIND_STRICT (symlink, frame, op_ret, op_errno, + inode, buf, preparent, postparent, xdata); + return 0; + +} 
+/* Stub body: issue the local symlink once the fan-out has completed. */
+int32_t
+nsr_symlink_continue (call_frame_t *frame, xlator_t *this,
+                      const char * linkname, loc_t * loc, mode_t umask, dict_t * xdata)
+{
+        STACK_WIND (frame, nsr_symlink_complete,
+                    FIRST_CHILD(this), FIRST_CHILD(this)->fops->symlink,
+                    linkname, loc, umask, xdata);
+        return 0;
+}
+
+/*
+ * Callback for each fanned-out symlink.  Counts replies under the frame
+ * lock and resumes the stub when the last one arrives.  The replies'
+ * op_ret and op_errno are only logged.
+ */
+int32_t
+nsr_symlink_fan_in (call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno,
+                    inode_t * inode, struct iatt * buf, struct iatt * preparent, struct iatt * postparent, dict_t * xdata)
+{
+        nsr_local_t     *local = frame->local;
+        uint8_t         call_count;
+
+        gf_log (this->name, GF_LOG_TRACE,
+                "op_ret = %d, op_errno = %d\n", op_ret, op_errno);
+
+        LOCK(&frame->lock);
+        call_count = --(local->call_count);
+        UNLOCK(&frame->lock);
+
+        // TBD: variable Completion count
+        if (call_count == 0) {
+                call_resume(local->stub);
+        }
+
+        return 0;
+}
+
+/*
+ * symlink entry point.  Requests already tagged by a leader or by
+ * reconciliation go straight to the local child.  Otherwise we must be an
+ * unfenced leader: tag xdata with the current term, fan the call out to
+ * every child after the first, and let the last fan-in reply resume the
+ * stub that issues the local call.  Failures unwind with -1/op_errno
+ * (ENOMEM by default, EREMOTE when not an unfenced leader).
+ */
+int32_t
+nsr_symlink (call_frame_t *frame, xlator_t *this,
+             const char * linkname, loc_t * loc, mode_t umask, dict_t * xdata)
+{
+        nsr_local_t     *local          = NULL;
+        nsr_private_t   *priv           = this->private;
+        xlator_list_t   *trav;
+        dict_t          *own_xdata      = NULL; /* set only if we allocate xdata */
+        int             op_errno        = ENOMEM;
+        int             from_leader;
+        int             from_recon;
+
+        local = mem_get0(this->local_pool);
+        if (!local) {
+                goto err;
+        }
+#if defined(NSR_CG_NEED_FD)
+        local->fd = fd_ref(fd);
+#else
+        local->fd = NULL;
+#endif
+        frame->local = local;
+
+        if (xdata) {
+                from_leader = !!dict_get(xdata,NSR_TERM_XATTR);
+                from_recon = !!dict_get(xdata,RECON_TERM_XATTR)
+                          && !!dict_get(xdata,RECON_INDEX_XATTR);
+        }
+        else {
+                from_leader = from_recon = _gf_false;
+        }
+
+        // follower/recon path
+        // just send it to local node
+        if (from_leader || from_recon) {
+                STACK_WIND (frame, nsr_symlink_complete,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->symlink,
+                            linkname, loc, umask, xdata);
+                return 0;
+        }
+
+        if (!priv->leader || priv->fence_io) {
+                op_errno = EREMOTE;
+                goto err;
+        }
+
+#if defined(NSR_CG_QUEUE)
+        nsr_inode_ctx_t *ictx  = nsr_get_inode_ctx(this,fd->inode);
+        if (!ictx) {
+                op_errno = EIO;
+                goto err;
+        }
+        /* TBD: LOCK */
+        if (ictx->active) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "queuing request due to conflict");
+                ++(ictx->pending);
+                /* TBD: actually enqueue */
+        }
+        else {
+                ++(ictx->active);
+        }
+        /* TBD: UNLOCK */
+#endif
+
+        if (!xdata) {
+                xdata = own_xdata = dict_new();
+                if (!xdata) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to allocate xdata");
+                        goto err;
+                }
+        }
+
+        if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set nsr-term");
+                goto err;
+        }
+
+        local->stub = fop_symlink_stub (frame,nsr_symlink_continue,
+                                        linkname, loc, umask, xdata);
+        if (!local->stub) {
+                goto err;
+        }
+
+        /* NOTE(review): with a single child nothing ever resumes the stub. */
+        local->call_count = priv->n_children - 1;
+        for (trav = this->children->next; trav; trav = trav->next) {
+                STACK_WIND (frame, nsr_symlink_fan_in,
+                            trav->xlator, trav->xlator->fops->symlink,
+                            linkname, loc, umask, xdata);
+        }
+
+        // TBD: variable Issue count
+        /* Drop the dict_new() reference; the stub took its own. */
+        if (own_xdata) {
+                dict_unref(own_xdata);
+        }
+        return 0;
+
+err:
+        if (own_xdata) {
+                dict_unref(own_xdata);
+        }
+        if (local) {
+                if (local->stub) {
+                        call_stub_destroy(local->stub);
+                }
+                if (local->fd) {
+                        fd_unref(local->fd);
+                }
+                /* Detach first so frame teardown cannot release it again. */
+                frame->local = NULL;
+                mem_put(local);
+        }
+        STACK_UNWIND_STRICT (symlink, frame, -1, op_errno,
+                             NULL, NULL, NULL, NULL, NULL);
+        return 0;
+}
+
+/*
+ * Callback for the local (first child) truncate.  Performs the optional
+ * queue/fsync/fd bookkeeping selected by the NSR_CG_* macros, then unwinds
+ * the result to our caller.
+ */
+int32_t
+nsr_truncate_complete (call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno,
+                       struct iatt * prebuf, struct iatt * postbuf, dict_t * xdata)
+{
+#if defined(NSR_CG_NEED_FD)
+        nsr_local_t     *local = frame->local;
+#endif
+
+#if defined(NSR_CG_QUEUE)
+        /* nsr_get_inode_ctx takes the inode, not the fd. */
+        nsr_inode_ctx_t *ictx  = nsr_get_inode_ctx(this,local->fd->inode);
+        if (ictx) {
+                /* TBD: LOCK */
+                if (ictx->pending) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "unblocking %u requests",
+                                ictx->pending);
+                        /* TBD: actually dequeue */
+                        ictx->pending = 0;
+                }
+                /* TBD: UNLOCK */
+        }
+#endif
+
+#if defined(NSR_CG_FSYNC)
+        nsr_mark_fd_dirty(this,local);
+#endif
+
+#if defined(NSR_CG_NEED_FD)
+        fd_unref(local->fd);
+#endif
+
+        STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno,
+                             prebuf, postbuf, xdata);
+        return 0;
+
+}
+
+/* Stub body: issue the local truncate once the fan-out has completed. */
+int32_t
+nsr_truncate_continue (call_frame_t *frame, xlator_t *this,
+                       loc_t * loc, off_t offset, dict_t * xdata)
+{
+        STACK_WIND (frame, nsr_truncate_complete,
+                    FIRST_CHILD(this), FIRST_CHILD(this)->fops->truncate,
+                    loc, offset, xdata);
+        return 0;
+}
+
+/*
+ * Callback for each fanned-out truncate.  Counts replies under the frame
+ * lock and resumes the stub when the last one arrives.  The replies'
+ * op_ret and op_errno are only logged.
+ */
+int32_t
+nsr_truncate_fan_in (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno,
+                     struct iatt * prebuf, struct iatt * postbuf, dict_t * xdata)
+{
+        nsr_local_t     *local = frame->local;
+        uint8_t         call_count;
+
+        gf_log (this->name, GF_LOG_TRACE,
+                "op_ret = %d, op_errno = %d\n", op_ret, op_errno);
+
+        LOCK(&frame->lock);
+        call_count = --(local->call_count);
+        UNLOCK(&frame->lock);
+
+        // TBD: variable Completion count
+        if (call_count == 0) {
+                call_resume(local->stub);
+        }
+
+        return 0;
+}
+
+/*
+ * truncate entry point.  Same flow as the other generated write fops:
+ * tagged requests go to the local child only; an unfenced leader tags
+ * xdata with the current term, fans out to the other children and lets
+ * the last fan-in reply resume the local call.  Failures unwind with
+ * -1/op_errno.
+ */
+int32_t
+nsr_truncate (call_frame_t *frame, xlator_t *this,
+              loc_t * loc, off_t offset, dict_t * xdata)
+{
+        nsr_local_t     *local          = NULL;
+        nsr_private_t   *priv           = this->private;
+        xlator_list_t   *trav;
+        dict_t          *own_xdata      = NULL; /* set only if we allocate xdata */
+        int             op_errno        = ENOMEM;
+        int             from_leader;
+        int             from_recon;
+
+        local = mem_get0(this->local_pool);
+        if (!local) {
+                goto err;
+        }
+#if defined(NSR_CG_NEED_FD)
+        local->fd = fd_ref(fd);
+#else
+        local->fd = NULL;
+#endif
+        frame->local = local;
+
+        if (xdata) {
+                from_leader = !!dict_get(xdata,NSR_TERM_XATTR);
+                from_recon = !!dict_get(xdata,RECON_TERM_XATTR)
+                          && !!dict_get(xdata,RECON_INDEX_XATTR);
+        }
+        else {
+                from_leader = from_recon = _gf_false;
+        }
+
+        // follower/recon path
+        // just send it to local node
+        if (from_leader || from_recon) {
+                STACK_WIND (frame, nsr_truncate_complete,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->truncate,
+                            loc, offset, xdata);
+                return 0;
+        }
+
+        if (!priv->leader || priv->fence_io) {
+                op_errno = EREMOTE;
+                goto err;
+        }
+
+#if defined(NSR_CG_QUEUE)
+        nsr_inode_ctx_t *ictx  = nsr_get_inode_ctx(this,fd->inode);
+        if (!ictx) {
+                op_errno = EIO;
+                goto err;
+        }
+        /* TBD: LOCK */
+        if (ictx->active) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "queuing request due to conflict");
+                ++(ictx->pending);
+                /* TBD: actually enqueue */
+        }
+        else {
+                ++(ictx->active);
+        }
+        /* TBD: UNLOCK */
+#endif
+
+        if (!xdata) {
+                xdata = own_xdata = dict_new();
+                if (!xdata) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to allocate xdata");
+                        goto err;
+                }
+        }
+
+        if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set nsr-term");
+                goto err;
+        }
+
+        local->stub = fop_truncate_stub (frame,nsr_truncate_continue,
+                                         loc, offset, xdata);
+        if (!local->stub) {
+                goto err;
+        }
+
+        /* NOTE(review): with a single child nothing ever resumes the stub. */
+        local->call_count = priv->n_children - 1;
+        for (trav = this->children->next; trav; trav = trav->next) {
+                STACK_WIND (frame, nsr_truncate_fan_in,
+                            trav->xlator, trav->xlator->fops->truncate,
+                            loc, offset, xdata);
+        }
+
+        // TBD: variable Issue count
+        /* Drop the dict_new() reference; the stub took its own. */
+        if (own_xdata) {
+                dict_unref(own_xdata);
+        }
+        return 0;
+
+err:
+        if (own_xdata) {
+                dict_unref(own_xdata);
+        }
+        if (local) {
+                if (local->stub) {
+                        call_stub_destroy(local->stub);
+                }
+                if (local->fd) {
+                        fd_unref(local->fd);
+                }
+                /* Detach first so frame teardown cannot release it again. */
+                frame->local = NULL;
+                mem_put(local);
+        }
+        STACK_UNWIND_STRICT (truncate, frame, -1, op_errno,
+                             NULL, NULL, NULL);
+        return 0;
+}
+
+/*
+ * Callback for the local (first child) unlink.  Performs the optional
+ * queue/fsync/fd bookkeeping selected by the NSR_CG_* macros, then unwinds
+ * the result to our caller.
+ */
+int32_t
+nsr_unlink_complete (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno,
+                     struct iatt * preparent, struct iatt * postparent, dict_t * xdata)
+{
+#if defined(NSR_CG_NEED_FD)
+        nsr_local_t     *local = frame->local;
+#endif
+
+#if defined(NSR_CG_QUEUE)
+        /* nsr_get_inode_ctx takes the inode, not the fd. */
+        nsr_inode_ctx_t *ictx  = nsr_get_inode_ctx(this,local->fd->inode);
+        if (ictx) {
+                /* TBD: LOCK */
+                if (ictx->pending) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "unblocking %u requests",
+                                ictx->pending);
+                        /* TBD: actually dequeue */
+                        ictx->pending = 0;
+                }
+                /* TBD: UNLOCK */
+        }
+#endif
+
+#if defined(NSR_CG_FSYNC)
+        nsr_mark_fd_dirty(this,local);
+#endif
+
+#if defined(NSR_CG_NEED_FD)
+        fd_unref(local->fd);
+#endif
+
+        STACK_UNWIND_STRICT (unlink, frame, op_ret, op_errno,
+                             preparent, postparent, xdata);
+        return 0;
+
+}
+
+/* Stub body: issue the local unlink once the fan-out has completed. */
+int32_t
+nsr_unlink_continue (call_frame_t *frame, xlator_t *this,
+                     loc_t * loc, int xflags, dict_t * xdata)
+{
+        STACK_WIND (frame, nsr_unlink_complete,
+                    FIRST_CHILD(this), FIRST_CHILD(this)->fops->unlink,
+                    loc, xflags, xdata);
+        return 0;
+}
+
+/*
+ * Callback for each fanned-out unlink.  Counts replies under the frame
+ * lock and resumes the stub when the last one arrives.  The replies'
+ * op_ret and op_errno are only logged.
+ */
+int32_t
+nsr_unlink_fan_in (call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno,
+                   struct iatt * preparent, struct iatt * postparent, dict_t * xdata)
+{
+        nsr_local_t     *local = frame->local;
+        uint8_t         call_count;
+
+        gf_log (this->name, GF_LOG_TRACE,
+                "op_ret = %d, op_errno = %d\n", op_ret, op_errno);
+
+        LOCK(&frame->lock);
+        call_count = --(local->call_count);
+        UNLOCK(&frame->lock);
+
+        // TBD: variable Completion count
+        if (call_count == 0) {
+                call_resume(local->stub);
+        }
+
+        return 0;
+}
+
+/*
+ * unlink entry point.  Same flow as the other generated write fops:
+ * tagged requests go to the local child only; an unfenced leader tags
+ * xdata with the current term, fans out to the other children and lets
+ * the last fan-in reply resume the local call.  Failures unwind with
+ * -1/op_errno.
+ */
+int32_t
+nsr_unlink (call_frame_t *frame, xlator_t *this,
+            loc_t * loc, int xflags, dict_t * xdata)
+{
+        nsr_local_t     *local          = NULL;
+        nsr_private_t   *priv           = this->private;
+        xlator_list_t   *trav;
+        dict_t          *own_xdata      = NULL; /* set only if we allocate xdata */
+        int             op_errno        = ENOMEM;
+        int             from_leader;
+        int             from_recon;
+
+        local = mem_get0(this->local_pool);
+        if (!local) {
+                goto err;
+        }
+#if defined(NSR_CG_NEED_FD)
+        local->fd = fd_ref(fd);
+#else
+        local->fd = NULL;
+#endif
+        frame->local = local;
+
+        if (xdata) {
+                from_leader = !!dict_get(xdata,NSR_TERM_XATTR);
+                from_recon = !!dict_get(xdata,RECON_TERM_XATTR)
+                          && !!dict_get(xdata,RECON_INDEX_XATTR);
+        }
+        else {
+                from_leader = from_recon = _gf_false;
+        }
+
+        // follower/recon path
+        // just send it to local node
+        if (from_leader || from_recon) {
+                STACK_WIND (frame, nsr_unlink_complete,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->unlink,
+                            loc, xflags, xdata);
+                return 0;
+        }
+
+        if (!priv->leader || priv->fence_io) {
+                op_errno = EREMOTE;
+                goto err;
+        }
+
+#if defined(NSR_CG_QUEUE)
+        nsr_inode_ctx_t *ictx  = nsr_get_inode_ctx(this,fd->inode);
+        if (!ictx) {
+                op_errno = EIO;
+                goto err;
+        }
+        /* TBD: LOCK */
+        if (ictx->active) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "queuing request due to conflict");
+                ++(ictx->pending);
+                /* TBD: actually enqueue */
+        }
+        else {
+                ++(ictx->active);
+        }
+        /* TBD: UNLOCK */
+#endif
+
+        if (!xdata) {
+                xdata = own_xdata = dict_new();
+                if (!xdata) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to allocate xdata");
+                        goto err;
+                }
+        }
+
+        if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set nsr-term");
+                goto err;
+        }
+
+        local->stub = fop_unlink_stub (frame,nsr_unlink_continue,
+                                       loc, xflags, xdata);
+        if (!local->stub) {
+                goto err;
+        }
+
+        /* NOTE(review): with a single child nothing ever resumes the stub. */
+        local->call_count = priv->n_children - 1;
+        for (trav = this->children->next; trav; trav = trav->next) {
+                STACK_WIND (frame, nsr_unlink_fan_in,
+                            trav->xlator, trav->xlator->fops->unlink,
+                            loc, xflags, xdata);
+        }
+
+        // TBD: variable Issue count
+        /* Drop the dict_new() reference; the stub took its own. */
+        if (own_xdata) {
+                dict_unref(own_xdata);
+        }
+        return 0;
+
+err:
+        if (own_xdata) {
+                dict_unref(own_xdata);
+        }
+        if (local) {
+                if (local->stub) {
+                        call_stub_destroy(local->stub);
+                }
+                if (local->fd) {
+                        fd_unref(local->fd);
+                }
+                /* Detach first so frame teardown cannot release it again. */
+                frame->local = NULL;
+                mem_put(local);
+        }
+        STACK_UNWIND_STRICT (unlink, frame, -1, op_errno,
+                             NULL, NULL, NULL);
+        return 0;
+}
+
+/*
+ * Options for the writev family.  They expand to 1 so that both
+ * "#if NAME" and "#if defined(NAME)" tests evaluate correctly.
+ */
+#define NSR_CG_FSYNC 1
+#define NSR_CG_QUEUE 1
+#define NSR_CG_NEED_FD 1
+/*
+ * Callback for the local (first child) writev.  With the options above
+ * enabled it clears the inode's pending count, marks the fd dirty and
+ * drops the fd reference taken by nsr_writev, then unwinds to our caller.
+ */
+int32_t
+nsr_writev_complete (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno,
+                     struct iatt * prebuf, struct iatt * postbuf, dict_t * xdata)
+{
+#if defined(NSR_CG_NEED_FD)
+        nsr_local_t     *local = frame->local;
+#endif
+
+#if defined(NSR_CG_QUEUE)
+        /* nsr_get_inode_ctx takes the inode, not the fd. */
+        nsr_inode_ctx_t *ictx  = nsr_get_inode_ctx(this,local->fd->inode);
+        if (ictx) {
+                /* TBD: LOCK */
+                if (ictx->pending) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "unblocking %u requests",
+                                ictx->pending);
+                        /* TBD: actually dequeue */
+                        ictx->pending = 0;
+                }
+                /* TBD: UNLOCK */
+        }
+#endif
+
+#if defined(NSR_CG_FSYNC)
+        nsr_mark_fd_dirty(this,local);
+#endif
+
+#if defined(NSR_CG_NEED_FD)
+        fd_unref(local->fd);
+#endif
+
+        STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno,
+                             prebuf, postbuf, xdata);
+        return 0;
+
+}
+
+/* Stub body: issue the local writev once the fan-out has completed. */
+int32_t
+nsr_writev_continue (call_frame_t *frame, xlator_t *this,
+                     fd_t * fd, struct iovec * vector, int32_t count, off_t offset, uint32_t flags, struct iobref * iobref, dict_t * xdata)
+{
+        STACK_WIND (frame, nsr_writev_complete,
+                    FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev,
+                    fd, vector, count, offset, flags, iobref, xdata);
+        return 0;
+}
+
+int32_t +nsr_writev_fan_in (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt * prebuf, struct iatt * postbuf, dict_t * xdata) +{ + nsr_local_t *local = frame->local; + uint8_t call_count; + + gf_log (this->name, GF_LOG_TRACE, + "op_ret = %d, op_errno = %d\n", op_ret, op_errno); + + LOCK(&frame->lock); + call_count = --(local->call_count); + UNLOCK(&frame->lock); + + // TBD: variable Completion count + if (call_count == 0) { + call_resume(local->stub); + } + + return 0; +} + +int32_t +nsr_writev (call_frame_t *frame, xlator_t *this, + fd_t * fd, struct iovec * vector, int32_t count, off_t offset, uint32_t flags, struct iobref * iobref, dict_t * xdata) +{ + nsr_local_t *local = NULL; + nsr_private_t *priv = this->private; + xlator_list_t *trav; + int op_errno = ENOMEM; + int from_leader; + int from_recon; + + local = mem_get0(this->local_pool); + if (!local) { + goto err; + } +#if NSR_CG_NEED_FD + local->fd = fd_ref(fd) +#else + local->fd = NULL +#endif + frame->local = local; + + if (xdata) { + from_leader = !!dict_get(xdata,NSR_TERM_XATTR); + from_recon = !!dict_get(xdata,RECON_TERM_XATTR) + && !!dict_get(xdata,RECON_INDEX_XATTR); + } + else { + from_leader = from_recon = _gf_false; + } + + // follower/recon path + // just send it to local node + if (from_leader || from_recon) { + STACK_WIND (frame, nsr_writev_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev, + fd, vector, count, offset, flags, iobref, xdata); + return 0; + } + + if (!priv->leader || priv->fence_io) { + op_errno = EREMOTE; + goto err; + } + +#if NSR_CG_QUEUE + nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd); + if (!ictx) { + op_errno = EIO; + goto err; + } + /* TBD: LOCK */ + if (ictx->active) { + gf_log (this->name, GF_LOG_DEBUG, + "queuing request due to conflict"); + ++(ictx->pending); + /* TBD: actually enqueue */ + } + else { + ++(ictx->active); + } + /* TBD: UNLOCK */ +#endif + + if (!xdata) { + xdata = dict_new(); + if 
(!xdata) { + gf_log (this->name, GF_LOG_ERROR, + "failed to allocate xdata"); + goto err; + } + } + + if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set nsr-term"); + goto err; + } + + local->stub = fop_writev_stub (frame,nsr_writev_continue, + fd, vector, count, offset, flags, iobref, xdata); + if (!local->stub) { + goto err; + } + + local->call_count = priv->n_children - 1; + for (trav = this->children->next; trav; trav = trav->next) { + STACK_WIND (frame, nsr_writev_fan_in, + trav->xlator, trav->xlator->fops->writev, + fd, vector, count, offset, flags, iobref, xdata); + } + + // TBD: variable Issue count + return 0; + +err: + if (local) { + if (local->stub) { + call_stub_destroy(local->stub); + } + if (local->fd) { + fd_unref(local->fd); + } + mem_put(local); + } + STACK_UNWIND_STRICT (writev, frame, -1, op_errno, + NULL, NULL, NULL); + return 0; +} + +#undef NSR_CG_FSYNC +#undef NSR_CG_QUEUE +#undef NSR_CG_NEED_FD +int32_t +nsr_xattrop_complete (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + dict_t * xattr, dict_t * xdata) +{ +#if NSR_CG_NEED_FD + nsr_local_t *local = frame->local; +#endif + +#if NSR_CG_QUEUE + nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd); + if (ictx) { + /* TBD: LOCK */ + if (ictx->pending) { + gf_log (this->name, GF_LOG_DEBUG, + "unblocking %u requests", + ictx->pending); + /* TBD: actually dequeue */ + ictx->pending = 0; + } + /* TBD: UNLOCK */ + } +#endif + +#if NSR_CG_FSYNC + nsr_mark_fd_dirty(this,local); +#endif + +#if NSR_CG_NEED_FD + fd_unref(local->fd); +#endif + + STACK_UNWIND_STRICT (xattrop, frame, op_ret, op_errno, + xattr, xdata); + return 0; + +} +int32_t +nsr_xattrop_continue (call_frame_t *frame, xlator_t *this, + loc_t * loc, gf_xattrop_flags_t optype, dict_t * xattr, dict_t * xdata) +{ + STACK_WIND (frame, nsr_xattrop_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->xattrop, + loc, optype, 
xattr, xdata); + return 0; +} + +int32_t +nsr_xattrop_fan_in (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + dict_t * xattr, dict_t * xdata) +{ + nsr_local_t *local = frame->local; + uint8_t call_count; + + gf_log (this->name, GF_LOG_TRACE, + "op_ret = %d, op_errno = %d\n", op_ret, op_errno); + + LOCK(&frame->lock); + call_count = --(local->call_count); + UNLOCK(&frame->lock); + + // TBD: variable Completion count + if (call_count == 0) { + call_resume(local->stub); + } + + return 0; +} + +int32_t +nsr_xattrop (call_frame_t *frame, xlator_t *this, + loc_t * loc, gf_xattrop_flags_t optype, dict_t * xattr, dict_t * xdata) +{ + nsr_local_t *local = NULL; + nsr_private_t *priv = this->private; + xlator_list_t *trav; + int op_errno = ENOMEM; + int from_leader; + int from_recon; + + local = mem_get0(this->local_pool); + if (!local) { + goto err; + } +#if NSR_CG_NEED_FD + local->fd = fd_ref(fd) +#else + local->fd = NULL +#endif + frame->local = local; + + if (xdata) { + from_leader = !!dict_get(xdata,NSR_TERM_XATTR); + from_recon = !!dict_get(xdata,RECON_TERM_XATTR) + && !!dict_get(xdata,RECON_INDEX_XATTR); + } + else { + from_leader = from_recon = _gf_false; + } + + // follower/recon path + // just send it to local node + if (from_leader || from_recon) { + STACK_WIND (frame, nsr_xattrop_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->xattrop, + loc, optype, xattr, xdata); + return 0; + } + + if (!priv->leader || priv->fence_io) { + op_errno = EREMOTE; + goto err; + } + +#if NSR_CG_QUEUE + nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd); + if (!ictx) { + op_errno = EIO; + goto err; + } + /* TBD: LOCK */ + if (ictx->active) { + gf_log (this->name, GF_LOG_DEBUG, + "queuing request due to conflict"); + ++(ictx->pending); + /* TBD: actually enqueue */ + } + else { + ++(ictx->active); + } + /* TBD: UNLOCK */ +#endif + + if (!xdata) { + xdata = dict_new(); + if (!xdata) { + gf_log (this->name, GF_LOG_ERROR, + "failed to 
allocate xdata"); + goto err; + } + } + + if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set nsr-term"); + goto err; + } + + local->stub = fop_xattrop_stub (frame,nsr_xattrop_continue, + loc, optype, xattr, xdata); + if (!local->stub) { + goto err; + } + + local->call_count = priv->n_children - 1; + for (trav = this->children->next; trav; trav = trav->next) { + STACK_WIND (frame, nsr_xattrop_fan_in, + trav->xlator, trav->xlator->fops->xattrop, + loc, optype, xattr, xdata); + } + + // TBD: variable Issue count + return 0; + +err: + if (local) { + if (local->stub) { + call_stub_destroy(local->stub); + } + if (local->fd) { + fd_unref(local->fd); + } + mem_put(local); + } + STACK_UNWIND_STRICT (xattrop, frame, -1, op_errno, + NULL, NULL); + return 0; +} + +/* No code emitted for zerofill */ + +struct xlator_fops fops = { + .access = nsr_access, + .create = nsr_create, + .discard = nsr_discard, + .fallocate = nsr_fallocate, + .fgetxattr = nsr_fgetxattr, + .fremovexattr = nsr_fremovexattr, + .fsetattr = nsr_fsetattr, + .fsetxattr = nsr_fsetxattr, + .fstat = nsr_fstat, + .ftruncate = nsr_ftruncate, + .fxattrop = nsr_fxattrop, + .getxattr = nsr_getxattr, + .link = nsr_link, + .mkdir = nsr_mkdir, + .mknod = nsr_mknod, + .open = nsr_open, + .opendir = nsr_opendir, + .rchecksum = nsr_rchecksum, + .readdir = nsr_readdir, + .readdirp = nsr_readdirp, + .readlink = nsr_readlink, + .readv = nsr_readv, + .removexattr = nsr_removexattr, + .rename = nsr_rename, + .rmdir = nsr_rmdir, + .setattr = nsr_setattr, + .setxattr = nsr_setxattr, + .stat = nsr_stat, + .statfs = nsr_statfs, + .symlink = nsr_symlink, + .truncate = nsr_truncate, + .unlink = nsr_unlink, + .writev = nsr_writev, + .xattrop = nsr_xattrop, +}; diff --git a/xlators/cluster/nsr-server/src/nsr-internal.h b/xlators/cluster/nsr-server/src/nsr-internal.h new file mode 100644 index 000000000..282247a47 --- /dev/null +++ 
b/xlators/cluster/nsr-server/src/nsr-internal.h @@ -0,0 +1,81 @@ +/* + Copyright (c) 2013 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include +#include + +#define LEADER_XATTR "user.nsr.leader" +#define SECOND_CHILD(xl) (xl->children->next->xlator) + +enum { + gf_mt_nsr_private_t = gf_common_mt_end + 1, + gf_mt_nsr_fd_ctx_t, + gf_mt_nsr_inode_ctx_t, + gf_mt_nsr_dirty_t, + gf_mt_nsr_end +}; + + +typedef struct { + char *etcd_servers; + char *vol_uuid; + char *term_uuid; + char *brick_uuid; + gf_boolean_t leader; + uint8_t n_children; + char *vol_file; + glfs_t *fs; + etcd_session etcd; + volatile unsigned int fence_io; + glfs_fd_t *fd; + uint32_t current_term; +#ifdef NSR_DEBUG + uint32_t leader_log_fd; +#endif + volatile int leader_inited; + uint32_t kid_state; + gf_lock_t dirty_lock; + struct list_head dirty_fds; + gf_boolean_t nsr_recon_start; +} nsr_private_t; + +typedef struct { + call_stub_t *stub; + call_stub_t *qstub; + uint8_t call_count; + fd_t *fd; + struct list_head qlinks; +} nsr_local_t; + +/* + * This should match whatever changelog returns on the pre-op for us to pass + * when we're ready for our post-op. 
+ */
+typedef uint32_t log_id_t;
+
+/*
+ * One entry on an fd's dirty list.  nsr_mark_fd_dirty queues one of these
+ * per call; the id field is not filled in yet.
+ */
+typedef struct {
+        struct list_head links;
+        log_id_t id;
+} nsr_dirty_list_t;
+
+/*
+ * Per-fd context.  dirty_list holds nsr_dirty_list_t entries; fd_list links
+ * this context onto nsr_private_t.dirty_fds while fd holds an extra
+ * reference taken by nsr_mark_fd_dirty.
+ */
+typedef struct {
+        fd_t *fd;
+        struct list_head dirty_list;
+        struct list_head fd_list;
+} nsr_fd_ctx_t;
+
+typedef struct {
+        gf_lock_t lock;
+        uint32_t active;
+        struct list_head aqueue;
+        uint32_t pending;
+        struct list_head pqueue;
+} nsr_inode_ctx_t;
+
diff --git a/xlators/cluster/nsr-server/src/nsr.c b/xlators/cluster/nsr-server/src/nsr.c
new file mode 100644
index 000000000..3707b3003
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/nsr.c
@@ -0,0 +1,682 @@
+/*
+  Copyright (c) 2013 Red Hat, Inc.
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "call-stub.h"
+#include "defaults.h"
+#include "xlator.h"
+#include "api/src/glfs.h"
+#include "api/src/glfs-internal.h"
+#include "run.h"
+#include "common-utils.h"
+
+
+#include "etcd-api.h"
+#include "nsr-internal.h"
+#include "../../nsr-recon/src/recon_driver.h"
+#include "../../nsr-recon/src/recon_xlator.h"
+
+
+#define GLUSTERD_DEFAULT_WORKDIR "/var/lib/glusterd"
+#define GLUSTERD_VOLUME_DIR_PREFIX "vols"
+#define GLUSTERD_BRICK_INFO_DIR "bricks"
+
+#define NSR_FLUSH_INTERVAL 5
+
+/*
+ * Return this translator's context for an inode, allocating and attaching a
+ * zeroed one (with its lock and both queues initialized) if none is attached
+ * yet.  Returns NULL if the allocation or the attach fails.
+ *
+ * NOTE(review): this uses the double-underscore __inode_ctx_get/set
+ * variants, which presumably expect the caller to already hold the inode
+ * lock - confirm against the callers in nsr-cg.c.
+ */
+nsr_inode_ctx_t *
+nsr_get_inode_ctx (xlator_t *this, inode_t *inode)
+{
+        uint64_t                ctx_int = 0LL;
+        nsr_inode_ctx_t         *ctx_ptr;
+
+        if (__inode_ctx_get(inode,this,&ctx_int) == 0) {
+                ctx_ptr = (nsr_inode_ctx_t *)(long)ctx_int;
+        }
+        else {
+                ctx_ptr = GF_CALLOC (1, sizeof(*ctx_ptr),
+                                     gf_mt_nsr_inode_ctx_t);
+                if (ctx_ptr) {
+                        ctx_int = (uint64_t)(long)ctx_ptr;
+                        if (__inode_ctx_set(inode,this,&ctx_int) == 0) {
+                                LOCK_INIT(&ctx_ptr->lock);
+                                INIT_LIST_HEAD(&ctx_ptr->aqueue);
+                                INIT_LIST_HEAD(&ctx_ptr->pqueue);
+                        }
+                        else {
+                                GF_FREE(ctx_ptr);
+                                ctx_ptr = NULL;
+                        }
+                }
+
+        }
+
+        return ctx_ptr;
+}
+
+/*
+ * Return this translator's context for an fd, allocating and attaching a
+ * zeroed one (with both list heads initialized) if none is attached yet.
+ * Returns NULL if the allocation or the attach fails.
+ *
+ * The unlocked __fd_ctx_get/set variants are used; nsr_mark_fd_dirty calls
+ * this with fd->lock held.  Once attached, the context belongs to the fd and
+ * is released by nsr_release, so callers must not free the returned pointer.
+ */
+nsr_fd_ctx_t *
+nsr_get_fd_ctx (xlator_t *this, fd_t *fd)
+{
+        uint64_t                ctx_int = 0LL;
+        nsr_fd_ctx_t            *ctx_ptr;
+
+        if (__fd_ctx_get(fd,this,&ctx_int) == 0) {
+                ctx_ptr = (nsr_fd_ctx_t *)(long)ctx_int;
+        }
+        else {
+                ctx_ptr = GF_CALLOC (1, sizeof(*ctx_ptr), gf_mt_nsr_fd_ctx_t);
+                if (ctx_ptr) {
+                        if (__fd_ctx_set(fd,this,(uint64_t)ctx_ptr) == 0) {
+                                INIT_LIST_HEAD(&ctx_ptr->dirty_list);
+                                INIT_LIST_HEAD(&ctx_ptr->fd_list);
+                        }
+                        else {
+                                GF_FREE(ctx_ptr);
+                                ctx_ptr = NULL;
+                        }
+                }
+
+        }
+
+        return ctx_ptr;
+}
+
+/*
+ * Record one more un-flushed write against local->fd.
+ *
+ * A new nsr_dirty_list_t is appended to the fd context's dirty_list.  If the
+ * context is not already on priv->dirty_fds it is added there, and an extra
+ * reference is taken on the fd so that it stays alive until nsr_flush_thread
+ * has processed it.  Failures are logged and otherwise ignored.
+ *
+ * Lock order: fd->lock is taken first, priv->dirty_lock nested inside it.
+ */
+void
+nsr_mark_fd_dirty (xlator_t *this, nsr_local_t *local)
+{
+        fd_t                    *fd             = local->fd;
+        nsr_fd_ctx_t            *ctx_ptr;
+        nsr_dirty_list_t        *dirty;
+        nsr_private_t           *priv           = this->private;
+
+        /*
+         * TBD: don't do any of this for O_SYNC/O_DIRECT writes.
+         * Unfortunately, that optimization requires that we distinguish
+         * between writev and other "write" calls, saving the original flags
+         * and checking them in the callback.  Too much work for too little
+         * gain right now.
+         */
+
+        LOCK(&fd->lock);
+                ctx_ptr = nsr_get_fd_ctx(this,fd);
+                dirty = GF_CALLOC(1,sizeof(*dirty),gf_mt_nsr_dirty_t);
+                if (ctx_ptr && dirty) {
+                        gf_log (this->name, GF_LOG_TRACE,
+                                "marking fd %p as dirty (%p)", fd, dirty);
+                        /* TBD: fill dirty->id from what changelog gave us */
+                        list_add_tail(&dirty->links,&ctx_ptr->dirty_list);
+                        if (list_empty(&ctx_ptr->fd_list)) {
+                                /* Add a ref so _release doesn't get called. */
+                                ctx_ptr->fd = fd_ref(fd);
+                                LOCK(&priv->dirty_lock);
+                                        list_add_tail (&ctx_ptr->fd_list,
+                                                       &priv->dirty_fds);
+                                UNLOCK(&priv->dirty_lock);
+                        }
+                }
+                else {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "could not mark %p dirty", fd);
+                        /*
+                         * Do NOT free ctx_ptr here.  If nsr_get_fd_ctx gave
+                         * us one, it is installed in the fd's context (and
+                         * may already carry dirty entries or be linked on
+                         * priv->dirty_fds); freeing it would leave dangling
+                         * pointers behind and make nsr_release free it a
+                         * second time.  Only the unused list entry is ours.
+                         */
+                        if (dirty) {
+                                GF_FREE(dirty);
+                        }
+                }
+        UNLOCK(&fd->lock);
+}
+
+#define NSR_TERM_XATTR          "trusted.nsr.term"
+#define RECON_TERM_XATTR        "trusted.nsr.recon-term"
+#define RECON_INDEX_XATTR       "trusted.nsr.recon-index"
+#define NSR_REP_COUNT_XATTR     "trusted.nsr.rep-count"
+#include "nsr-cg.c"
+
+/*
+ * Count the children whose bit is currently set in priv->kid_state (bit i
+ * corresponds to child index i, as maintained by nsr_notify).
+ *
+ * NOTE(review): kid_state is a uint32_t, so this only works for up to 32
+ * children even though n_children is a uint8_t.
+ */
+uint8_t
+nsr_count_up_kids (nsr_private_t *priv)
+{
+        uint8_t         retval  = 0;
+        uint8_t         i;
+
+        for (i = 0; i < priv->n_children; ++i) {
+                if (priv->kid_state & (1 << i)) {
+                        ++retval;
+                }
+        }
+
+        return retval;
+}
+
+/*
+ * The fsync machinery looks a lot like that for any write call, but there are
+ * some important differences that are easy to miss.  First, we don't care
+ * about the xdata that shows whether the call came from a leader or
+ * reconciliation process.  If we're the leader we fan out; if we're not we
+ * don't.  Second, we don't wait for followers before we issue the local call.
+ * The code generation system could be updated to handle this, and still might
+ * if we need to implement other "almost identical" paths (e.g. for open), but
+ * a copy is more readable as long as it's just one.
+ */
+
+/*
+ * Common completion callback for every fsync wound by nsr_fsync.  Decrements
+ * the outstanding-call counter under the frame lock and unwinds to the caller
+ * once the last reply has arrived, using that last reply's results.
+ */
+int32_t
+nsr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+               struct iatt *postbuf, dict_t *xdata)
+{
+        nsr_local_t     *local  = frame->local;
+        gf_boolean_t    unwind;
+
+        LOCK(&frame->lock);
+                unwind = !--(local->call_count);
+        UNLOCK(&frame->lock);
+
+        if (unwind) {
+                STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf,
+                                     postbuf, xdata);
+        }
+        return 0;
+}
+
+/*
+ * Completion callback for the fsync issued to the local (first) child.
+ * Releases the dirty entries that nsr_fsync moved onto local->qlinks, then
+ * falls through to the common counting/unwind logic.
+ */
+int32_t
+nsr_fsync_local_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                     struct iatt *postbuf, dict_t *xdata)
+{
+        nsr_dirty_list_t        *dirty;
+        nsr_dirty_list_t        *dtmp;
+        nsr_local_t             *local  = frame->local;
+
+        list_for_each_entry_safe (dirty, dtmp, &local->qlinks, links) {
+                gf_log (this->name, GF_LOG_TRACE,
+                        "sending post-op on %p (%p)", local->fd, dirty);
+                GF_FREE(dirty);
+        }
+
+        return nsr_fsync_cbk (frame, cookie, this, op_ret, op_errno,
+                              prebuf, postbuf, xdata);
+}
+
+/*
+ * fsync entry point (patched into this->fops by nsr_init).
+ *
+ * The fd's dirty list is moved onto the request, the fsync is issued to the
+ * local (first) child, and - if we are the leader - to every other child as
+ * well.  The caller is unwound by nsr_fsync_cbk once all issued calls have
+ * completed.
+ */
+int32_t
+nsr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+           dict_t *xdata)
+{
+        nsr_private_t   *priv   = this->private;
+        nsr_local_t     *local;
+        uint64_t        ctx_int = 0LL;
+        nsr_fd_ctx_t    *ctx_ptr;
+        xlator_list_t   *trav;
+        gf_boolean_t    leader;
+
+        local = mem_get0(this->local_pool);
+        if (!local) {
+                STACK_UNWIND_STRICT(fsync,frame,-1,ENOMEM,NULL,NULL,xdata);
+                return 0;
+        }
+        INIT_LIST_HEAD(&local->qlinks);
+        frame->local = local;
+
+        /* Move the dirty list from the fd to the fsync request. */
+        LOCK(&fd->lock);
+                if (__fd_ctx_get(fd,this,&ctx_int) == 0) {
+                        ctx_ptr = (nsr_fd_ctx_t *)(long)ctx_int;
+                        list_splice_init (&ctx_ptr->dirty_list,
+                                          &local->qlinks);
+                }
+        UNLOCK(&fd->lock);
+
+        /*
+         * Sample the leader flag exactly once.  It can change underneath us
+         * (see nsr_reconfigure), and the number of calls we wind below must
+         * match call_count or the frame is either unwound too early or never
+         * unwound at all.  Using a local also means we never look at
+         * anything frame-related after the frame may have been unwound.
+         */
+        leader = priv->leader;
+
+        /* Issue the local call. */
+        local->call_count = leader ? priv->n_children : 1;
+        STACK_WIND (frame, nsr_fsync_local_cbk,
+                    FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync,
+                    fd, flags, xdata);
+
+        /*
+         * Issue remote calls if we're the leader.  Each one must go to its
+         * own child (trav->xlator); winding to FIRST_CHILD here would just
+         * repeat the local fsync and never reach the followers.
+         */
+        if (leader) {
+                for (trav = this->children->next; trav; trav = trav->next) {
+                        STACK_WIND (frame, nsr_fsync_cbk,
+                                    trav->xlator,
+                                    trav->xlator->fops->fsync,
+                                    fd, flags, xdata);
+                }
+        }
+
+        return 0;
+}
+
+/*
+ * getxattr entry point (patched into this->fops by nsr_init).
+ *
+ * Non-leaders fail every request with EREMOTE.  On the leader, a request for
+ * NSR_REP_COUNT_XATTR is answered directly with the number of children that
+ * are currently up; anything else is passed through to the first child.
+ */
+int32_t
+nsr_getxattr_special (call_frame_t *frame, xlator_t *this, loc_t *loc,
+                      const char *name, dict_t *xdata)
+{
+        dict_t          *result;
+        uint8_t         up;
+        nsr_private_t   *priv   = this->private;
+
+        if (!priv->leader) {
+                STACK_UNWIND_STRICT (getxattr, frame, -1, EREMOTE, NULL, NULL);
+                return 0;
+        }
+
+        if (!name || (strcmp(name,NSR_REP_COUNT_XATTR) != 0)) {
+                STACK_WIND_TAIL (frame,
+                                 FIRST_CHILD(this),
+                                 FIRST_CHILD(this)->fops->getxattr,
+                                 loc, name, xdata);
+                return 0;
+        }
+
+        result = dict_new();
+        if (!result) {
+                goto dn_failed;
+        }
+
+        up = nsr_count_up_kids(this->private);
+        if (dict_set_uint32(result,NSR_REP_COUNT_XATTR,up) != 0) {
+                goto dsu_failed;
+        }
+
+        STACK_UNWIND_STRICT (getxattr, frame, 0, 0, result, NULL);
+        /*
+         * Drop our reference instead of destroying the dict outright, so
+         * that anybody above us who took their own reference during the
+         * unwind is not left with a dangling pointer.
+         */
+        dict_unref(result);
+        return 0;
+
+dsu_failed:
+        dict_unref(result);
+dn_failed:
+        STACK_UNWIND_STRICT (getxattr, frame, -1, ENOMEM, NULL, NULL);
+        return 0;
+}
+
+/*
+ * Release every entry on an fd context's dirty list and leave the list
+ * empty.  nsr_flush_thread calls this with the fd's lock held.
+ */
+void
+nsr_flush_fd (xlator_t *this, nsr_fd_ctx_t *fd_ctx)
+{
+        nsr_dirty_list_t        *dirty;
+        nsr_dirty_list_t        *dtmp;
+
+        list_for_each_entry_safe (dirty, dtmp, &fd_ctx->dirty_list, links) {
+                gf_log (this->name, GF_LOG_TRACE,
+                        "sending post-op on %p (%p)", fd_ctx->fd, dirty);
+                GF_FREE(dirty);
+        }
+
+        INIT_LIST_HEAD(&fd_ctx->dirty_list);
+}
+
+/*
+ * Background thread body: every NSR_FLUSH_INTERVAL seconds, take over the
+ * global list of dirty fds, flush each one, and drop the reference that
+ * nsr_mark_fd_dirty took on it.  Never returns in practice.
+ */
+void *
+nsr_flush_thread (void *ctx)
+{
+        xlator_t                *this   = ctx;
+        nsr_private_t           *priv   = this->private;
+        struct list_head        dirty_fds;
+        nsr_fd_ctx_t            *fd_ctx;
+        nsr_fd_ctx_t            *fd_tmp;
+
+        for (;;) {
+                /*
+                 * We have to be very careful to avoid lock inversions here, so
+                 * we can't just hold priv->dirty_lock while we take and
+                 * release locks for each fd.  Instead, we only hold dirty_lock
+                 * at the beginning of each iteration, as we (effectively) make
+                 * a copy of the current list head and then clear the original.
+                 * This leads to four scenarios for adding the first entry to
+                 * an fd and potentially putting it on the global list.
+                 *
+                 * (1) While we're asleep.  No lock contention, it just gets
+                 *     added and will be processed on the next iteration.
+                 *
+                 * (2) After we've made a local copy, but before we've started
+                 *     processing that fd.  The new entry will be added to the
+                 *     fd (under its lock), and we'll process it on the current
+                 *     iteration.
+                 *
+                 * (3) While we're processing the fd.  They'll block on the fd
+                 *     lock, then see that the list is empty and put it on the
+                 *     global list.  We'll process it here on the next
+                 *     iteration.
+                 *
+                 * (4) While we're working, but after we've processed that fd.
+                 *     Same as (1) as far as that fd is concerned.
+                 */
+                INIT_LIST_HEAD(&dirty_fds);
+                LOCK(&priv->dirty_lock);
+                list_splice_init(&priv->dirty_fds,&dirty_fds);
+                UNLOCK(&priv->dirty_lock);
+
+                list_for_each_entry_safe (fd_ctx, fd_tmp, &dirty_fds, fd_list) {
+                        LOCK(&fd_ctx->fd->lock);
+                                nsr_flush_fd(this,fd_ctx);
+                                list_del_init(&fd_ctx->fd_list);
+                        UNLOCK(&fd_ctx->fd->lock);
+                        fd_unref(fd_ctx->fd);
+                }
+
+                sleep(NSR_FLUSH_INTERVAL);
+        }
+
+        return NULL;
+}
+
+/* Inode teardown hook: detach and free our per-inode context, if any. */
+int32_t
+nsr_forget (xlator_t *this, inode_t *inode)
+{
+        uint64_t        ctx     = 0LL;
+
+        if ((inode_ctx_del(inode,this,&ctx) == 0) && ctx) {
+                GF_FREE((void *)(long)ctx);
+        }
+
+        return 0;
+}
+
+/* fd teardown hook: detach and free our per-fd context, if any. */
+int32_t
+nsr_release (xlator_t *this, fd_t *fd)
+{
+        uint64_t        ctx     = 0LL;
+
+        if ((fd_ctx_del(fd,this,&ctx) == 0) && ctx) {
+                GF_FREE((void *)(long)ctx);
+        }
+
+        return 0;
+}
+
+struct xlator_cbks cbks = {
+        .forget  = nsr_forget,
+        .release = nsr_release,
+};
+
+/* Re-read the "leader" option into priv->leader.  Returns 0 or -1. */
+int
+nsr_reconfigure (xlator_t *this, dict_t *options)
+{
+        nsr_private_t   *priv   = this->private;
+
+        GF_OPTION_RECONF ("leader", priv->leader, options, bool, err);
+        return 0;
+
+err:
+        return -1;
+}
+
+/*
+ * Return the zero-based position of kid in this->children, or -1 if it is
+ * not one of our children.
+ */
+int
+nsr_get_child_index (xlator_t *this, xlator_t *kid)
+{
+        xlator_list_t   *trav;
+        int             retval = -1;
+
+        for (trav = this->children; trav; trav = trav->next) {
+                ++retval;
+                if (trav->xlator == kid) {
+                        return retval;
+                }
+        }
+
+        return -1;
+}
+
+/*
+ * Child notify handling is unreasonably FUBAR.  Sometimes we'll get a
+ * CHILD_DOWN for a protocol/client child before we ever got a CHILD_UP for it.
+ * Other times we won't.  Because it's effectively random (probably racy), we
+ * can't just maintain a count.  We actually have to keep track of the state
+ * for each child separately, to filter out the bogus CHILD_DOWN events, and
+ * then generate counts on demand.
+ */
+int
+nsr_notify (xlator_t *this, int event, void *data, ...)
+{
+        nsr_private_t   *priv   = this->private;
+        int             index;
+
+        switch (event) {
+        case GF_EVENT_CHILD_UP:
+                index = nsr_get_child_index(this,data);
+                if (index >= 0) {
+                        priv->kid_state |= (1 << index);
+                        gf_log (this->name, GF_LOG_INFO,
+                                "got CHILD_UP for %s, now %u kids",
+                                ((xlator_t *)data)->name,
+                                nsr_count_up_kids(priv));
+                }
+                break;
+        case GF_EVENT_CHILD_DOWN:
+                index = nsr_get_child_index(this,data);
+                if (index >= 0) {
+                        priv->kid_state &= ~(1 << index);
+                        gf_log (this->name, GF_LOG_INFO,
+                                "got CHILD_DOWN for %s, now %u kids",
+                                ((xlator_t *)data)->name,
+                                nsr_count_up_kids(priv));
+                }
+                break;
+        default:
+                ;
+        }
+
+        return default_notify(this,event,data);
+}
+
+
+extern void *nsr_leader_thread (void *);
+
+/*
+ * Translator init.
+ *
+ * Patches the hand-written fops into this->fops, validates that there is a
+ * local and at least one remote child, allocates the private state, reads
+ * the options, starts the reconciliation process (only if the brick's
+ * "con:" volfile exists), starts the leader thread and waits for it to
+ * finish initializing, and finally starts the flush thread.
+ *
+ * Returns 0 on success.  On failure returns -1 after releasing everything
+ * allocated here and clearing this->private; no thread started by this
+ * function is running at that point.
+ */
+int32_t
+nsr_init (xlator_t *this)
+{
+        xlator_list_t   *remote;
+        xlator_list_t   *local;
+        nsr_private_t   *priv = NULL;
+        xlator_list_t   *trav;
+        pthread_t       kid;
+        uuid_t          tmp_uuid;
+        char            *my_name = NULL, *recon_file = NULL,
+                        *recon_pid_file = NULL, *ptr = NULL;
+        char            *volname = NULL;
+        size_t          path_size;
+        int             len;
+        extern xlator_t global_xlator;
+        glusterfs_ctx_t *oldctx = global_xlator.ctx;
+        runner_t        runner = {0,};
+        int32_t         ret = -1;
+        struct stat     buf;
+
+        /*
+         * Any fop that gets special treatment has to be patched in here,
+         * because the compiled-in table is produced by the code generator and
+         * only contains generated functions.  Note that we have to go through
+         * this->fops because of some dynamic-linking strangeness; modifying
+         * the static table doesn't work.
+         */
+        this->fops->getxattr = nsr_getxattr_special;
+        this->fops->fsync = nsr_fsync;
+
+        local = this->children;
+        if (!local) {
+                gf_log (this->name, GF_LOG_ERROR, "no local subvolume");
+                goto err;
+        }
+
+        remote = local->next;
+        if (!remote) {
+                gf_log (this->name, GF_LOG_ERROR, "no remote subvolumes");
+                goto err;
+        }
+
+        this->local_pool = mem_pool_new (nsr_local_t, 128);
+        if (!this->local_pool) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to create nsr_local_t pool");
+                goto err;
+        }
+
+        priv = GF_CALLOC (1, sizeof(*priv), gf_mt_nsr_private_t);
+        if (!priv) {
+                gf_log (this->name, GF_LOG_ERROR, "could not allocate priv");
+                goto err;
+        }
+
+        // set this so that unless leader election is done, IO is fenced
+        priv->fence_io = 1;
+
+        for (trav = this->children; trav; trav = trav->next) {
+                ++(priv->n_children);
+        }
+
+        LOCK_INIT(&priv->dirty_lock);
+        INIT_LIST_HEAD(&priv->dirty_fds);
+
+        this->private = priv;
+
+        GF_OPTION_INIT ("etcd-servers", priv->etcd_servers, str, err);
+        if (!priv->etcd_servers) {
+                gf_log (this->name, GF_LOG_ERROR, "etcd servers not generated. ???");
+                goto err;
+        }
+        priv->vol_uuid = "temporary";
+        uuid_generate(tmp_uuid);
+        priv->brick_uuid = strdup(uuid_utoa(tmp_uuid));
+        if (!priv->brick_uuid) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "could not allocate brick uuid");
+                goto err;
+        }
+        priv->term_uuid = "nsr-term";
+        gf_log (this->name, GF_LOG_INFO,
+                "brick_uuid = %s\n", priv->brick_uuid);
+
+        GF_OPTION_INIT ("my-name", my_name, str, err);
+        if (!my_name) {
+                gf_log (this->name, GF_LOG_ERROR, "brick name not generated. ???");
+                goto err;
+        }
+        GF_OPTION_INIT ("vol-name", volname, str, err);
+        if (!volname) {
+                gf_log (this->name, GF_LOG_ERROR, "vol name not generated. ???");
+                goto err;
+        }
+
+        /* One size, generous enough for any of the three paths below. */
+        path_size = PATH_MAX + strlen(my_name) + strlen("-nsr-recon.vol") + 1;
+        recon_file = GF_CALLOC (1, path_size, gf_mt_nsr_private_t);
+        recon_pid_file = GF_CALLOC (1, path_size, gf_mt_nsr_private_t);
+        if ((!recon_file) || (!recon_pid_file)) {
+                gf_log (this->name, GF_LOG_ERROR, "could not allocate reconciliation file name");
+                goto err;
+        }
+
+        /* The brick name is used as a file name component: '/' -> '-'. */
+        for (ptr = strchr (my_name, '/'); ptr; ptr = strchr (ptr + 1, '/')) {
+                *ptr = '-';
+        }
+
+        len = snprintf (recon_file, path_size,
+                        "/%s/%s/%s/%s/%s-nsr-recon.vol",
+                        GLUSTERD_DEFAULT_WORKDIR,
+                        GLUSTERD_VOLUME_DIR_PREFIX,
+                        volname,
+                        GLUSTERD_BRICK_INFO_DIR,
+                        my_name);
+        if ((len < 0) || ((size_t)len >= path_size)) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "reconciliation volfile path too long");
+                goto err;
+        }
+
+        len = snprintf (recon_pid_file, path_size,
+                        "/%s/%s/%s/%s/%s-recon.pid",
+                        GLUSTERD_DEFAULT_WORKDIR,
+                        GLUSTERD_VOLUME_DIR_PREFIX,
+                        volname,
+                        "run",
+                        my_name);
+        if ((len < 0) || ((size_t)len >= path_size)) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "reconciliation pid file path too long");
+                goto err;
+        }
+
+        priv->vol_file = GF_CALLOC (1, path_size, gf_mt_nsr_private_t);
+        if (!priv->vol_file) {
+                gf_log (this->name, GF_LOG_ERROR, "could not allocate reconciliation file name");
+                goto err;
+        }
+        len = snprintf (priv->vol_file, path_size,
+                        "%s/%s/%s/%s/con:%s",
+                        GLUSTERD_DEFAULT_WORKDIR,
+                        GLUSTERD_VOLUME_DIR_PREFIX,
+                        volname,
+                        GLUSTERD_BRICK_INFO_DIR,
+                        my_name);
+        if ((len < 0) || ((size_t)len >= path_size)) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "volfile path too long");
+                goto err;
+        }
+
+        // Start the recon process. Then start the leader thread.
+        /*
+         * REVIEW
+         * Logs belong in /var/log not /tmp.
+         */
+        if (!stat(priv->vol_file, &buf)) {
+                runinit (&runner);
+                runner_add_args(&runner, "/usr/local/sbin/glusterfs",
+                                "-f", recon_file,
+                                "-p", recon_pid_file,
+                                "-l", "/tmp/reconciliation.log",
+                                NULL);
+                ret = runner_run (&runner);
+                if (ret != 0) {
+                        gf_log (this->name, GF_LOG_ERROR, "could not exec reconciliation process " );
+                        goto err;
+                }
+
+                // TBD - convert this to make sure recon process runs
+                sleep(2);
+                priv->nsr_recon_start = _gf_true;
+        }
+
+        /* Only needed to build the command line above. */
+        GF_FREE(recon_file);
+        recon_file = NULL;
+        GF_FREE(recon_pid_file);
+        recon_pid_file = NULL;
+
+        /*
+         * If the leader thread can't be started nobody will ever set
+         * leader_inited, so bail out instead of waiting forever.
+         * TBD: a reconciliation process started above is left running.
+         */
+        if (pthread_create(&kid,NULL,nsr_leader_thread,this) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "could not start leader thread");
+                goto err;
+        }
+        while (priv->leader_inited == 0) {
+                sleep(1);
+        }
+        /*
+         * Calling glfs_new changes old->ctx, even if THIS still points
+         * to global_xlator.  That causes problems later in the main
+         * thread, when gf_log_dump_graph tries to use the FILE after
+         * we've mucked with it and gets a segfault in __fprintf_chk.
+         * We can avoid all that by undoing the damage before we
+         * continue.
+         */
+        global_xlator.ctx = oldctx;
+
+        /*
+         * The flush thread is started last, after every step that can
+         * "goto err", so that the error path never frees priv while the
+         * thread is already using it.  Nothing can be marked dirty before
+         * init returns, so starting it late loses nothing.
+         */
+        if (pthread_create(&kid,NULL,nsr_flush_thread,this) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "could not start flush thread");
+                /* TBD: treat this as a fatal error? */
+        }
+
+        return 0;
+
+err:
+        if (recon_file) {
+                GF_FREE(recon_file);
+        }
+        if (recon_pid_file) {
+                GF_FREE(recon_pid_file);
+        }
+        if (priv) {
+                if (priv->vol_file) {
+                        GF_FREE(priv->vol_file);
+                }
+                /* brick_uuid came from plain strdup, so plain free. */
+                free(priv->brick_uuid);
+                LOCK_DESTROY(&priv->dirty_lock);
+                GF_FREE(priv);
+        }
+        /* Don't leave a pointer to freed memory behind. */
+        this->private = NULL;
+        if (this->local_pool) {
+                mem_pool_destroy(this->local_pool);
+                this->local_pool = NULL;
+        }
+        return -1;
+}
+
+
+/* Translator teardown; currently releases nothing. */
+void
+nsr_fini (xlator_t *this)
+{
+}
+
+class_methods_t class_methods = {
+        .init           = nsr_init,
+        .fini           = nsr_fini,
+        .reconfigure    = nsr_reconfigure,
+        .notify         = nsr_notify,
+};
+
+struct volume_options options[] = {
+        { .key = {"leader"},
+          .type = GF_OPTION_TYPE_BOOL,
+          .default_value = "false",
+          .description = "Start in the leader role. This is only for "
+                         "bootstrapping the code, and should go away when we "
+                         "have real leader election."
+        },
+        { .key ={"vol-name"},
+          .type = GF_OPTION_TYPE_STR,
+          .description = "volume name"
+        },
+        { .key = {"my-name"},
+          .type = GF_OPTION_TYPE_STR,
+          .description = "brick name in form of host:/path"
+        },
+        { .key = {"etcd-servers"},
+          .type = GF_OPTION_TYPE_STR,
+          .description = "list of comma seperated etc servers"
+        },
+        { .key = {NULL} },
+};
diff --git a/xlators/cluster/nsr-server/src/stub_etcd.c b/xlators/cluster/nsr-server/src/stub_etcd.c
new file mode 100644
index 000000000..83f5525a2
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/stub_etcd.c
@@ -0,0 +1,129 @@
+/*
+ * Stub version of etcd.  If the etcd executable is present, this will
+ * behave exactly like the regular etcd API.  Otherwise, it will stub out
+ * the API functions by using local files.
+ */
+
+#include "etcd-api.h"
+
+/* copied from glusterd-etcd.c */
+#define GLUSTERD_ETCD_DIR "/var/lib/glusterd/etcd"
+#define GLUSTERD_ETCD_CMD "/root/etcd/etcd"
+
+#define MAX_KEY_SIZE    256
+#define MAX_VALUE_SIZE  1023
+
+/* Sentinel session handle meaning "no real etcd, use local files". */
+etcd_session *bogus_etcd = (void *)0x7766554433221100;
+
+/*
+ * Build the local file name for a key: dst = base + "/" + key, with every
+ * '/' inside key replaced by '@' so the key becomes a single path component.
+ *
+ * dst must have room for MAX_KEY_SIZE bytes (both callers pass a
+ * char[MAX_KEY_SIZE]); longer results are silently truncated.  dst is always
+ * NUL-terminated.
+ */
+void
+concat_convert (char *dst, char *base, char *key)
+{
+        size_t  used = 0;
+
+        /* Copy the directory INTO dst (not the other way around). */
+        while (*base && (used < MAX_KEY_SIZE - 1)) {
+                dst[used++] = *(base++);
+        }
+        if (used < MAX_KEY_SIZE - 1) {
+                dst[used++] = '/';
+        }
+
+        while (*key && (used < MAX_KEY_SIZE - 1)) {
+                dst[used++] = (*key == '/') ? '@' : *key;
+                ++key;
+        }
+        dst[used] = '\0';
+}
+
+/*
+ * Open a session: the real thing if the etcd executable is installed,
+ * otherwise the bogus_etcd sentinel.
+ */
+etcd_session
+s_etcd_open_str (char *server_names)
+{
+        if (access(GLUSTERD_ETCD_CMD,X_OK) == 0) {
+                return etcd_open_str(server_names);
+        }
+
+        return bogus_etcd;
+}
+
+/* Close a session; the sentinel needs no cleanup. */
+void
+s_etcd_close_str (etcd_session this_as_void)
+{
+        if (this_as_void != bogus_etcd) {
+                etcd_close_str(this_as_void);
+        }
+}
+
+/*
+ * Fetch the value for a key.  With a real session this is etcd_get; with the
+ * sentinel, the value is read (up to MAX_VALUE_SIZE bytes) from the key's
+ * file under GLUSTERD_ETCD_DIR.
+ *
+ * In stub mode, returns a strdup'ed string, or NULL if the file cannot be
+ * opened or read or is empty.
+ */
+char *
+s_etcd_get (etcd_session this, char *key)
+{
+        char            path[MAX_KEY_SIZE];
+        int             fd = -1;
+        char            buf[MAX_VALUE_SIZE+1];
+        ssize_t         bytes;
+        char            *retval = NULL;
+
+        if (this != bogus_etcd) {
+                return etcd_get(this,key);
+        }
+
+        concat_convert(path,GLUSTERD_ETCD_DIR,key);
+
+        fd = open(path,O_RDONLY);
+        if (fd < 0) {
+                perror("open");
+                goto err;
+        }
+
+        bytes = read(fd,buf,MAX_VALUE_SIZE);
+        if (bytes <= 0) {
+                if (bytes < 0) {
+                        perror("read");
+                }
+                goto err;
+        }
+
+        buf[bytes] = '\0';
+        retval = strdup(buf);
+
+err:
+        if (fd >= 0) {
+                close(fd);
+        }
+        return retval;
+}
+
+/*
+ * Store a value for a key.  With a real session this is etcd_set; with the
+ * sentinel, the value (including its terminating NUL) is written to the
+ * key's file under GLUSTERD_ETCD_DIR, and precond and ttl are ignored.
+ *
+ * In stub mode, returns ETCD_OK on success and ETCD_WTF on any failure.
+ */
+etcd_result
+s_etcd_set (etcd_session this, char *key, char *value,
+            char *precond, unsigned int ttl)
+{
+        char            path[MAX_KEY_SIZE];
+        int             fd = -1;
+        ssize_t         bytes;
+        etcd_result     retval = ETCD_WTF;
+
+        if (this != bogus_etcd) {
+                return etcd_set(this,key,value,precond,ttl);
+        }
+
+        concat_convert(path,GLUSTERD_ETCD_DIR,key);
+
+        /*
+         * The mode argument only means something together with O_CREAT;
+         * without it a key that was never set before could not be stored.
+         * O_TRUNC makes the new value replace the old one wholesale.
+         */
+        fd = open(path,O_WRONLY|O_CREAT|O_TRUNC,0666);
+        if (fd < 0) {
+                perror("open");
+                goto err;
+        }
+
+        bytes = write(fd,value,strlen(value)+1);
+        if (bytes <= 0) {
+                if (bytes < 0) {
+                        perror("write");
+                }
+                goto err;
+        }
+
+        retval = ETCD_OK;
+
+
+err:
+        if (fd >= 0) {
+                close(fd);
+        }
+        return retval;
+}
+
+
diff --git a/xlators/cluster/nsr-server/src/yajl.c b/xlators/cluster/nsr-server/src/yajl.c
new file mode 100644
index 000000000..54e6474fc
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl.c
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright
notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "yajl/yajl_parse.h"
+#include "yajl_lex.h"
+#include "yajl_parser.h"
+#include "yajl_alloc.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <assert.h>
+
+/* Map a parser status code to a static, human-readable string. */
+const char *
+yajl_status_to_string(yajl_status stat)
+{
+    const char * statStr = "unknown";
+    switch (stat) {
+        case yajl_status_ok:
+            statStr = "ok, no error";
+            break;
+        case yajl_status_client_canceled:
+            statStr = "client canceled parse";
+            break;
+        case yajl_status_error:
+            statStr = "parse error";
+            break;
+    }
+    return statStr;
+}
+
+/*
+ * Allocate and initialize a parser handle.
+ *
+ * afs may be NULL, in which case the default allocation routines are used;
+ * if it is given, all three of malloc/realloc/free must be set.  Returns
+ * NULL on bad parameters or if the handle itself cannot be allocated.
+ */
+yajl_handle
+yajl_alloc(const yajl_callbacks * callbacks,
+           yajl_alloc_funcs * afs,
+           void * ctx)
+{
+    yajl_handle hand = NULL;
+    yajl_alloc_funcs afsBuffer;
+
+    /* first order of business is to set up memory allocation routines */
+    if (afs != NULL) {
+        if (afs->malloc == NULL || afs->realloc == NULL || afs->free == NULL)
+        {
+            return NULL;
+        }
+    } else {
+        yajl_set_default_alloc_funcs(&afsBuffer);
+        afs = &afsBuffer;
+    }
+
+    hand = (yajl_handle) YA_MALLOC(afs, sizeof(struct yajl_handle_t));
+    /* don't write through a failed allocation */
+    if (hand == NULL) {
+        return NULL;
+    }
+
+    /* copy in pointers to allocation routines */
+    memcpy((void *) &(hand->alloc), (void *) afs, sizeof(yajl_alloc_funcs));
+
+    hand->callbacks = callbacks;
+    hand->ctx = ctx;
+    hand->lexer = NULL;
+    hand->bytesConsumed = 0;
+    hand->decodeBuf = yajl_buf_alloc(&(hand->alloc));
+    hand->flags = 0;
+    yajl_bs_init(hand->stateStack, &(hand->alloc));
+    yajl_bs_push(hand->stateStack, yajl_state_start);
+
+    return hand;
+}
+
+/*
+ * Turn a boolean parser option on or off.  The single variadic argument is
+ * read as an int: non-zero sets the flag, zero clears it.  Returns 1 if opt
+ * was recognized, 0 otherwise.
+ */
+int
+yajl_config(yajl_handle h, yajl_option opt, ...)
+{
+    int rv = 1;
+    va_list ap;
+    va_start(ap, opt);
+
+    switch(opt) {
+        case yajl_allow_comments:
+        case yajl_dont_validate_strings:
+        case yajl_allow_trailing_garbage:
+        case yajl_allow_multiple_values:
+        case yajl_allow_partial_values:
+            if (va_arg(ap, int)) h->flags |= opt;
+            else h->flags &= ~opt;
+            break;
+        default:
+            rv = 0;
+    }
+    va_end(ap);
+
+    return rv;
+}
+
+/* Release a parser handle and everything it owns. */
+void
+yajl_free(yajl_handle handle)
+{
+    yajl_bs_free(handle->stateStack);
+    yajl_buf_free(handle->decodeBuf);
+    if (handle->lexer) {
+        yajl_lex_free(handle->lexer);
+        handle->lexer = NULL;
+    }
+    YA_FREE(&(handle->alloc), handle);
+}
+
+/* Feed a chunk of JSON text to the parser. */
+yajl_status
+yajl_parse(yajl_handle hand, const unsigned char * jsonText,
+           size_t jsonTextLen)
+{
+    yajl_status status;
+
+    /* lazy allocation of the lexer */
+    if (hand->lexer == NULL) {
+        hand->lexer = yajl_lex_alloc(&(hand->alloc),
+                                     hand->flags & yajl_allow_comments,
+                                     !(hand->flags & yajl_dont_validate_strings));
+    }
+
+    status = yajl_do_parse(hand, jsonText, jsonTextLen);
+    return status;
+}
+
+
+/* Tell the parser that no more input is coming. */
+yajl_status
+yajl_complete_parse(yajl_handle hand)
+{
+    /* The lexer is lazy allocated in the first call to parse.  if parse is
+     * never called, then no data was provided to parse at all.  This is a
+     * "premature EOF" error unless yajl_allow_partial_values is specified.
+     * allocating the lexer now is the simplest possible way to handle this
+     * case while preserving all the other semantics of the parser
+     * (multiple values, partial values, etc). */
+    if (hand->lexer == NULL) {
+        hand->lexer = yajl_lex_alloc(&(hand->alloc),
+                                     hand->flags & yajl_allow_comments,
+                                     !(hand->flags & yajl_dont_validate_strings));
+    }
+
+    return yajl_do_finish(hand);
+}
+
+/*
+ * Render the current parse error as a string; release the result with
+ * yajl_free_error.
+ */
+unsigned char *
+yajl_get_error(yajl_handle hand, int verbose,
+               const unsigned char * jsonText, size_t jsonTextLen)
+{
+    return yajl_render_error_string(hand, jsonText, jsonTextLen, verbose);
+}
+
+/* Number of bytes consumed so far; 0 for a NULL handle. */
+size_t
+yajl_get_bytes_consumed(yajl_handle hand)
+{
+    if (!hand) return 0;
+    else return hand->bytesConsumed;
+}
+
+
+/* Free an error string returned by yajl_get_error. */
+void
+yajl_free_error(yajl_handle hand, unsigned char * str)
+{
+    /* use memory allocation functions if set */
+    YA_FREE(&(hand->alloc), str);
+}
+
+/* XXX: add utility routines to parse from file */
diff --git a/xlators/cluster/nsr-server/src/yajl/yajl_common.h b/xlators/cluster/nsr-server/src/yajl/yajl_common.h
new file mode 100644
index 000000000..49ca3a5cb
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl/yajl_common.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef __YAJL_COMMON_H__
+#define __YAJL_COMMON_H__
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define YAJL_MAX_DEPTH 128
+
+/* msft dll export gunk.
To build a DLL on windows, you + * must define WIN32, YAJL_SHARED, and YAJL_BUILD. To use a shared + * DLL, you must define YAJL_SHARED and WIN32 */ +#if defined(WIN32) && defined(YAJL_SHARED) +# ifdef YAJL_BUILD +# define YAJL_API __declspec(dllexport) +# else +# define YAJL_API __declspec(dllimport) +# endif +#else +# if defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__) >= 303 +# define YAJL_API __attribute__ ((visibility("default"))) +# else +# define YAJL_API +# endif +#endif + +/** pointer to a malloc function, supporting client overriding memory + * allocation routines */ +typedef void * (*yajl_malloc_func)(void *ctx, size_t sz); + +/** pointer to a free function, supporting client overriding memory + * allocation routines */ +typedef void (*yajl_free_func)(void *ctx, void * ptr); + +/** pointer to a realloc function which can resize an allocation. */ +typedef void * (*yajl_realloc_func)(void *ctx, void * ptr, size_t sz); + +/** A structure which can be passed to yajl_*_alloc routines to allow the + * client to specify memory allocation functions to be used. 
*/ +typedef struct +{ + /** pointer to a function that can allocate uninitialized memory */ + yajl_malloc_func malloc; + /** pointer to a function that can resize memory allocations */ + yajl_realloc_func realloc; + /** pointer to a function that can free memory allocated using + * reallocFunction or mallocFunction */ + yajl_free_func free; + /** a context pointer that will be passed to above allocation routines */ + void * ctx; +} yajl_alloc_funcs; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/xlators/cluster/nsr-server/src/yajl/yajl_gen.h b/xlators/cluster/nsr-server/src/yajl/yajl_gen.h new file mode 100644 index 000000000..52fa99fc2 --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl/yajl_gen.h @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2007-2011, Lloyd Hilaiel + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/** + * \file yajl_gen.h + * Interface to YAJL's JSON generation facilities. 
+ */ + +#include + +#ifndef __YAJL_GEN_H__ +#define __YAJL_GEN_H__ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + /** generator status codes */ + typedef enum { + /** no error */ + yajl_gen_status_ok = 0, + /** at a point where a map key is generated, a function other than + * yajl_gen_string was called */ + yajl_gen_keys_must_be_strings, + /** YAJL's maximum generation depth was exceeded. see + * YAJL_MAX_DEPTH */ + yajl_max_depth_exceeded, + /** A generator function (yajl_gen_XXX) was called while in an error + * state */ + yajl_gen_in_error_state, + /** A complete JSON document has been generated */ + yajl_gen_generation_complete, + /** yajl_gen_double was passed an invalid floating point value + * (infinity or NaN). */ + yajl_gen_invalid_number, + /** A print callback was passed in, so there is no internal + * buffer to get from */ + yajl_gen_no_buf, + /** returned from yajl_gen_string() when the yajl_gen_validate_utf8 + * option is enabled and an invalid was passed by client code. + */ + yajl_gen_invalid_string + } yajl_gen_status; + + /** an opaque handle to a generator */ + typedef struct yajl_gen_t * yajl_gen; + + /** a callback used for "printing" the results. */ + typedef void (*yajl_print_t)(void * ctx, + const char * str, + size_t len); + + /** configuration parameters for the parser, these may be passed to + * yajl_gen_config() along with option specific argument(s). In general, + * all configuration parameters default to *off*. */ + typedef enum { + /** generate indented (beautiful) output */ + yajl_gen_beautify = 0x01, + /** + * Set an indent string which is used when yajl_gen_beautify + * is enabled. Maybe something like \\t or some number of + * spaces. The default is four spaces ' '. + */ + yajl_gen_indent_string = 0x02, + /** + * Set a function and context argument that should be used to + * output generated json. the function should conform to the + * yajl_print_t prototype while the context argument is a + * void * of your choosing. 
+ * + * example: + * yajl_gen_config(g, yajl_gen_print_callback, myFunc, myVoidPtr); + */ + yajl_gen_print_callback = 0x04, + /** + * Normally the generator does not validate that strings you + * pass to it via yajl_gen_string() are valid UTF8. Enabling + * this option will cause it to do so. + */ + yajl_gen_validate_utf8 = 0x08, + /** + * the forward solidus (slash or '/' in human) is not required to be + * escaped in json text. By default, YAJL will not escape it in the + * iterest of saving bytes. Setting this flag will cause YAJL to + * always escape '/' in generated JSON strings. + */ + yajl_gen_escape_solidus = 0x10 + } yajl_gen_option; + + /** allow the modification of generator options subsequent to handle + * allocation (via yajl_alloc) + * \returns zero in case of errors, non-zero otherwise + */ + YAJL_API int yajl_gen_config(yajl_gen g, yajl_gen_option opt, ...); + + /** allocate a generator handle + * \param allocFuncs an optional pointer to a structure which allows + * the client to overide the memory allocation + * used by yajl. May be NULL, in which case + * malloc/free/realloc will be used. + * + * \returns an allocated handle on success, NULL on failure (bad params) + */ + YAJL_API yajl_gen yajl_gen_alloc(const yajl_alloc_funcs * allocFuncs); + + /** free a generator handle */ + YAJL_API void yajl_gen_free(yajl_gen handle); + + YAJL_API yajl_gen_status yajl_gen_integer(yajl_gen hand, long long int number); + /** generate a floating point number. number may not be infinity or + * NaN, as these have no representation in JSON. 
In these cases the + * generator will return 'yajl_gen_invalid_number' */ + YAJL_API yajl_gen_status yajl_gen_double(yajl_gen hand, double number); + YAJL_API yajl_gen_status yajl_gen_number(yajl_gen hand, + const char * num, + size_t len); + YAJL_API yajl_gen_status yajl_gen_string(yajl_gen hand, + const unsigned char * str, + size_t len); + YAJL_API yajl_gen_status yajl_gen_null(yajl_gen hand); + YAJL_API yajl_gen_status yajl_gen_bool(yajl_gen hand, int boolean); + YAJL_API yajl_gen_status yajl_gen_map_open(yajl_gen hand); + YAJL_API yajl_gen_status yajl_gen_map_close(yajl_gen hand); + YAJL_API yajl_gen_status yajl_gen_array_open(yajl_gen hand); + YAJL_API yajl_gen_status yajl_gen_array_close(yajl_gen hand); + + /** access the null terminated generator buffer. If incrementally + * outputing JSON, one should call yajl_gen_clear to clear the + * buffer. This allows stream generation. */ + YAJL_API yajl_gen_status yajl_gen_get_buf(yajl_gen hand, + const unsigned char ** buf, + size_t * len); + + /** clear yajl's output buffer, but maintain all internal generation + * state. This function will not "reset" the generator state, and is + * intended to enable incremental JSON outputing. */ + YAJL_API void yajl_gen_clear(yajl_gen hand); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/xlators/cluster/nsr-server/src/yajl/yajl_parse.h b/xlators/cluster/nsr-server/src/yajl/yajl_parse.h new file mode 100644 index 000000000..55c831101 --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl/yajl_parse.h @@ -0,0 +1,226 @@ +/* + * Copyright (c) 2007-2011, Lloyd Hilaiel + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/** + * \file yajl_parse.h + * Interface to YAJL's JSON stream parsing facilities. + */ + +#include + +#ifndef __YAJL_PARSE_H__ +#define __YAJL_PARSE_H__ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + /** error codes returned from this interface */ + typedef enum { + /** no error was encountered */ + yajl_status_ok, + /** a client callback returned zero, stopping the parse */ + yajl_status_client_canceled, + /** An error occured during the parse. Call yajl_get_error for + * more information about the encountered error */ + yajl_status_error + } yajl_status; + + /** attain a human readable, english, string for an error */ + YAJL_API const char * yajl_status_to_string(yajl_status code); + + /** an opaque handle to a parser */ + typedef struct yajl_handle_t * yajl_handle; + + /** yajl is an event driven parser. this means as json elements are + * parsed, you are called back to do something with the data. The + * functions in this table indicate the various events for which + * you will be called back. Each callback accepts a "context" + * pointer, this is a void * that is passed into the yajl_parse + * function which the client code may use to pass around context. + * + * All callbacks return an integer. If non-zero, the parse will + * continue. If zero, the parse will be canceled and + * yajl_status_client_canceled will be returned from the parse. + * + * \attention { + * A note about the handling of numbers: + * + * yajl will only convert numbers that can be represented in a + * double or a 64 bit (long long) int. All other numbers will + * be passed to the client in string form using the yajl_number + * callback. 
Furthermore, if yajl_number is not NULL, it will + * always be used to return numbers, that is yajl_integer and + * yajl_double will be ignored. If yajl_number is NULL but one + * of yajl_integer or yajl_double are defined, parsing of a + * number larger than is representable in a double or 64 bit + * integer will result in a parse error. + * } + */ + typedef struct { + int (* yajl_null)(void * ctx); + int (* yajl_boolean)(void * ctx, int boolVal); + int (* yajl_integer)(void * ctx, long long integerVal); + int (* yajl_double)(void * ctx, double doubleVal); + /** A callback which passes the string representation of the number + * back to the client. Will be used for all numbers when present */ + int (* yajl_number)(void * ctx, const char * numberVal, + size_t numberLen); + + /** strings are returned as pointers into the JSON text when, + * possible, as a result, they are _not_ null padded */ + int (* yajl_string)(void * ctx, const unsigned char * stringVal, + size_t stringLen); + + int (* yajl_start_map)(void * ctx); + int (* yajl_map_key)(void * ctx, const unsigned char * key, + size_t stringLen); + int (* yajl_end_map)(void * ctx); + + int (* yajl_start_array)(void * ctx); + int (* yajl_end_array)(void * ctx); + } yajl_callbacks; + + /** allocate a parser handle + * \param callbacks a yajl callbacks structure specifying the + * functions to call when different JSON entities + * are encountered in the input text. May be NULL, + * which is only useful for validation. + * \param afs memory allocation functions, may be NULL for to use + * C runtime library routines (malloc and friends) + * \param ctx a context pointer that will be passed to callbacks. + */ + YAJL_API yajl_handle yajl_alloc(const yajl_callbacks * callbacks, + yajl_alloc_funcs * afs, + void * ctx); + + + /** configuration parameters for the parser, these may be passed to + * yajl_config() along with option specific argument(s). In general, + * all configuration parameters default to *off*. 
*/ + typedef enum { + /** Ignore javascript style comments present in + * JSON input. Non-standard, but rather fun + * arguments: toggled off with integer zero, on otherwise. + * + * example: + * yajl_config(h, yajl_allow_comments, 1); // turn comment support on + */ + yajl_allow_comments = 0x01, + /** + * Controls UTF8 validation of the strings in JSON input. While + * validation is active the parser emits a parse error for any string + * that is not valid UTF8, which makes parsing slightly more expensive + * (~7% depending on processor and compiler in use). + * NOTE(review): the flag name and the example below indicate that + * setting this flag turns validation OFF; the earlier wording + * ("When set the parser will verify ...") read the other way round. + * Confirm against the lexer. + * + * example: + * yajl_config(h, yajl_dont_validate_strings, 1); // disable utf8 checking + */ + yajl_dont_validate_strings = 0x02, + /** + * By default, upon calls to yajl_complete_parse(), yajl will + * ensure the entire input text was consumed and will raise an error + * otherwise. Enabling this flag will cause yajl to disable this + * check. This can be useful when parsing json out of a stream that contains more + * than a single JSON document. + */ + yajl_allow_trailing_garbage = 0x04, + /** + * Allow multiple values to be parsed by a single handle. The + * entire text must be valid JSON, and values can be separated + * by any kind of whitespace. This flag will change the + * behavior of the parser, and cause it to continue parsing after + * a value is parsed, rather than transitioning into a + * complete state. This option can be useful when parsing multiple + * values from an input stream. + */ + yajl_allow_multiple_values = 0x08, + /** + * When yajl_complete_parse() is called the parser will + * check that the top level value was completely consumed. I.E., + * if called whilst in the middle of parsing a value + * yajl will enter an error state (premature EOF). Setting this + * flag suppresses that check and the corresponding error. 
+ */ + yajl_allow_partial_values = 0x10 + } yajl_option; + + /** allow the modification of parser options subsequent to handle + * allocation (via yajl_alloc) + * \returns zero in case of errors, non-zero otherwise + */ + YAJL_API int yajl_config(yajl_handle h, yajl_option opt, ...); + + /** free a parser handle */ + YAJL_API void yajl_free(yajl_handle handle); + + /** Parse some json! + * \param hand - a handle to the json parser allocated with yajl_alloc + * \param jsonText - a pointer to the UTF8 json text to be parsed + * \param jsonTextLength - the length, in bytes, of input text + */ + YAJL_API yajl_status yajl_parse(yajl_handle hand, + const unsigned char * jsonText, + size_t jsonTextLength); + + /** Parse any remaining buffered json. + * Since yajl is a stream-based parser, without an explicit end of + * input, yajl sometimes can't decide if content at the end of the + * stream is valid or not. For example, if "1" has been fed in, + * yajl can't know whether another digit is next or some character + * that would terminate the integer token. + * + * \param hand - a handle to the json parser allocated with yajl_alloc + */ + YAJL_API yajl_status yajl_complete_parse(yajl_handle hand); + + /** get an error string describing the state of the + * parse. + * + * If verbose is non-zero, the message will include the JSON + * text where the error occured, along with an arrow pointing to + * the specific char. + * + * \returns A dynamically allocated string will be returned which should + * be freed with yajl_free_error + */ + YAJL_API unsigned char * yajl_get_error(yajl_handle hand, int verbose, + const unsigned char * jsonText, + size_t jsonTextLength); + + /** + * get the amount of data consumed from the last chunk passed to YAJL. + * + * In the case of a successful parse this can help you understand if + * the entire buffer was consumed (which will allow you to handle + * "junk at end of input"). 
+ * + * In the event an error is encountered during parsing, this function + * affords the client a way to get the offset into the most recent + * chunk where the error occured. 0 will be returned if no error + * was encountered. + */ + YAJL_API size_t yajl_get_bytes_consumed(yajl_handle hand); + + /** free an error returned from yajl_get_error */ + YAJL_API void yajl_free_error(yajl_handle hand, unsigned char * str); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/xlators/cluster/nsr-server/src/yajl/yajl_tree.h b/xlators/cluster/nsr-server/src/yajl/yajl_tree.h new file mode 100644 index 000000000..8b377f636 --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl/yajl_tree.h @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2010-2011 Florian Forster + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/** + * \file yajl_tree.h + * + * Parses JSON data and returns the data in tree form. 
+ * + * \author Florian Forster + * \date August 2010 + * + * This interface makes quick parsing and extraction of + * smallish JSON docs trivial: + * + * \include example/parse_config.c + */ + +#ifndef YAJL_TREE_H +#define YAJL_TREE_H 1 + +#include + +/** possible data types that a yajl_val_s can hold */ +typedef enum { + yajl_t_string = 1, + yajl_t_number = 2, + yajl_t_object = 3, + yajl_t_array = 4, + yajl_t_true = 5, + yajl_t_false = 6, + yajl_t_null = 7, + /** The any type isn't valid for yajl_val_s.type, but can be + * used as an argument to routines like yajl_tree_get(). + */ + yajl_t_any = 8 +} yajl_type; + +#define YAJL_NUMBER_INT_VALID 0x01 +#define YAJL_NUMBER_DOUBLE_VALID 0x02 + +/** A pointer to a node in the parse tree */ +typedef struct yajl_val_s * yajl_val; + +/** + * A JSON value representation capable of holding one of the seven + * types above. For "string", "number", "object", and "array" + * additional data is available in the union. The "YAJL_IS_*" + * and "YAJL_GET_*" macros below allow type checking and convenient + * value extraction. + */ +struct yajl_val_s +{ + /** Type of the value contained. Use the "YAJL_IS_*" macros to check for a + * specific type. */ + yajl_type type; + /** Type-specific data. You may use the "YAJL_GET_*" macros to access these + * members. */ + union + { + char * string; + struct { + long long i; /*< integer value, if representable. */ + double d; /*< double value, if representable. */ + char *r; /*< unparsed number in string form. */ + /** Signals whether the \em i and \em d members are + * valid. See \c YAJL_NUMBER_INT_VALID and + * \c YAJL_NUMBER_DOUBLE_VALID. */ + unsigned int flags; + } number; + struct { + const char **keys; /*< Array of keys */ + yajl_val *values; /*< Array of values. */ + size_t len; /*< Number of key-value-pairs. */ + } object; + struct { + yajl_val *values; /*< Array of elements. */ + size_t len; /*< Number of elements. */ + } array; + } u; +}; + +/** + * Parse a string. 
+ * + * Parses a null-terminated string containing JSON data and returns a pointer + * to the top-level value (root of the parse tree). + * + * \param input Pointer to a null-terminated utf8 string containing + * JSON data. + * \param error_buffer Pointer to a buffer in which an error message will + * be stored if \em yajl_tree_parse fails, or + * \c NULL. The buffer will be initialized before + * parsing, so its content will be destroyed even if + * \em yajl_tree_parse succeeds. + * \param error_buffer_size Size of the memory area pointed to by + * \em error_buffer. If \em error_buffer is + * \c NULL, this argument is ignored. + * + * \returns Pointer to the top-level value or \c NULL on error. The memory + * pointed to must be freed using \em yajl_tree_free. In case of an error, a + * null terminated message describing the error in more detail is stored in + * \em error_buffer if it is not \c NULL. + */ +YAJL_API yajl_val yajl_tree_parse (const char *input, + char *error_buffer, size_t error_buffer_size); + +/** + * Free a parse tree returned by "yajl_tree_parse". + * + * \param v Pointer to a JSON value returned by "yajl_tree_parse". Passing NULL + * is valid and results in a no-op. + */ +YAJL_API void yajl_tree_free (yajl_val v); + +/** + * Access a nested value inside a tree. + * + * \param parent the node under which you'd like to extract values. + * \param path A null terminated array of strings, each the name of an object key + * \param type the yajl_type of the object you seek, or yajl_t_any if any will do. + * + * \returns a pointer to the found value, or NULL if we came up empty. + * + * Future Ideas: it'd be nice to move path to a string and implement support for + * a teeny tiny micro language here, so you can extract array elements, do things + * like .first and .last, even .length. Inspiration from JSONPath and css selectors? + * No it wouldn't be fast, but that's not what this API is about. 
+ */ +YAJL_API yajl_val yajl_tree_get(yajl_val parent, const char ** path, yajl_type type); + +/* Various convenience macros to check the type of a `yajl_val` */ +#define YAJL_IS_STRING(v) (((v) != NULL) && ((v)->type == yajl_t_string)) +#define YAJL_IS_NUMBER(v) (((v) != NULL) && ((v)->type == yajl_t_number)) +/* YAJL_IS_INTEGER / YAJL_IS_DOUBLE test the YAJL_NUMBER_*_VALID bits, which are + * kept in the `number` member of the union (u.number.flags); both are false + * for NULL and for values that are not numbers. */ +#define YAJL_IS_INTEGER(v) (YAJL_IS_NUMBER(v) && ((v)->u.number.flags & YAJL_NUMBER_INT_VALID)) +#define YAJL_IS_DOUBLE(v) (YAJL_IS_NUMBER(v) && ((v)->u.number.flags & YAJL_NUMBER_DOUBLE_VALID)) +#define YAJL_IS_OBJECT(v) (((v) != NULL) && ((v)->type == yajl_t_object)) +#define YAJL_IS_ARRAY(v) (((v) != NULL) && ((v)->type == yajl_t_array )) +#define YAJL_IS_TRUE(v) (((v) != NULL) && ((v)->type == yajl_t_true )) +#define YAJL_IS_FALSE(v) (((v) != NULL) && ((v)->type == yajl_t_false )) +#define YAJL_IS_NULL(v) (((v) != NULL) && ((v)->type == yajl_t_null )) + +/** Given a yajl_val_string return a ptr to the bare string it contains, + * or NULL if the value is not a string. */ +#define YAJL_GET_STRING(v) (YAJL_IS_STRING(v) ? (v)->u.string : NULL) + +/** Get the string representation of a number. You should check type first, + * perhaps using YAJL_IS_NUMBER */ +#define YAJL_GET_NUMBER(v) ((v)->u.number.r) + +/** Get the double representation of a number. You should check type first, + * perhaps using YAJL_IS_DOUBLE */ +#define YAJL_GET_DOUBLE(v) ((v)->u.number.d) + +/** Get the 64bit (long long) integer representation of a number. You should + * check type first, perhaps using YAJL_IS_INTEGER */ +#define YAJL_GET_INTEGER(v) ((v)->u.number.i) + +/** Get a pointer to a yajl_val_object or NULL if the value is not an object. */ +#define YAJL_GET_OBJECT(v) (YAJL_IS_OBJECT(v) ? &(v)->u.object : NULL) + +/** Get a pointer to a yajl_val_array or NULL if the value is not an array. */ +#define YAJL_GET_ARRAY(v) (YAJL_IS_ARRAY(v) ? 
&(v)->u.array : NULL) + +#endif /* YAJL_TREE_H */ diff --git a/xlators/cluster/nsr-server/src/yajl/yajl_version.h b/xlators/cluster/nsr-server/src/yajl/yajl_version.h new file mode 100644 index 000000000..0fba9b8fc --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl/yajl_version.h @@ -0,0 +1,23 @@ +#ifndef YAJL_VERSION_H_ +#define YAJL_VERSION_H_ + +#include + +#define YAJL_MAJOR 2 +#define YAJL_MINOR 0 +#define YAJL_MICRO 1 + +#define YAJL_VERSION ((YAJL_MAJOR * 10000) + (YAJL_MINOR * 100) + YAJL_MICRO) + +#ifdef __cplusplus +extern "C" { +#endif + +extern int YAJL_API yajl_version(void); + +#ifdef __cplusplus +} +#endif + +#endif /* YAJL_VERSION_H_ */ + diff --git a/xlators/cluster/nsr-server/src/yajl_alloc.c b/xlators/cluster/nsr-server/src/yajl_alloc.c new file mode 100644 index 000000000..276315af7 --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl_alloc.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2007-2011, Lloyd Hilaiel + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +/** + * \file yajl_alloc.h + * default memory allocation routines for yajl which use malloc/realloc and + * free + */ + +#include "yajl_alloc.h" +#include + +static void * yajl_internal_malloc(void *ctx, size_t sz) +{ + return malloc(sz); +} + +static void * yajl_internal_realloc(void *ctx, void * previous, + size_t sz) +{ + return realloc(previous, sz); +} + +static void yajl_internal_free(void *ctx, void * ptr) +{ + free(ptr); +} + +void yajl_set_default_alloc_funcs(yajl_alloc_funcs * yaf) +{ + yaf->malloc = yajl_internal_malloc; + yaf->free = yajl_internal_free; + yaf->realloc = yajl_internal_realloc; + yaf->ctx = NULL; +} + diff --git a/xlators/cluster/nsr-server/src/yajl_alloc.h b/xlators/cluster/nsr-server/src/yajl_alloc.h new file mode 100644 index 000000000..a8a9e45e6 --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl_alloc.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2007-2011, Lloyd Hilaiel + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +/** + * \file yajl_alloc.h + * default memory allocation routines for yajl which use malloc/realloc and + * free + */ + +#ifndef __YAJL_ALLOC_H__ +#define __YAJL_ALLOC_H__ + +#include "yajl/yajl_common.h" + +#define YA_MALLOC(afs, sz) (afs)->malloc((afs)->ctx, (sz)) +#define YA_FREE(afs, ptr) (afs)->free((afs)->ctx, (ptr)) +#define YA_REALLOC(afs, ptr, sz) (afs)->realloc((afs)->ctx, (ptr), (sz)) + +void yajl_set_default_alloc_funcs(yajl_alloc_funcs * yaf); + +#endif diff --git a/xlators/cluster/nsr-server/src/yajl_buf.c b/xlators/cluster/nsr-server/src/yajl_buf.c new file mode 100644 index 000000000..0d249d364 --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl_buf.c @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2007-2011, Lloyd Hilaiel + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include "yajl_buf.h" + +#include +#include +#include + +#define YAJL_BUF_INIT_SIZE 2048 + +struct yajl_buf_t { + size_t len; + size_t used; + unsigned char * data; + yajl_alloc_funcs * alloc; +}; + +/* make room for `want` more bytes plus the terminating NUL: the storage is + * allocated with YAJL_BUF_INIT_SIZE bytes on first use and then doubled until + * the request fits. The allocator results are not checked here. */ +static +void yajl_buf_ensure_available(yajl_buf buf, size_t want) +{ + size_t need; + + assert(buf != NULL); + + /* first call */ + if (buf->data == NULL) { + buf->len = YAJL_BUF_INIT_SIZE; + buf->data = (unsigned char *) YA_MALLOC(buf->alloc, buf->len); + buf->data[0] = 0; + } + + need = buf->len; + + while (want >= (need - buf->used)) need <<= 1; + + if (need != buf->len) { + buf->data = (unsigned char *) YA_REALLOC(buf->alloc, buf->data, need); + buf->len = need; + } +} + +/* allocate an empty buffer that uses `alloc` for all of its memory; + * returns NULL when the allocation fails */ +yajl_buf yajl_buf_alloc(yajl_alloc_funcs * alloc) +{ + yajl_buf b = YA_MALLOC(alloc, sizeof(struct yajl_buf_t)); + /* do not hand a failed allocation to memset */ + if (b == NULL) return NULL; + memset((void *) b, 0, sizeof(struct yajl_buf_t)); + b->alloc = alloc; + return b; +} + +/* release the storage and the buffer object itself */ +void yajl_buf_free(yajl_buf buf) +{ + assert(buf != NULL); + if (buf->data) YA_FREE(buf->alloc, buf->data); + YA_FREE(buf->alloc, buf); +} + +/* append `len` bytes and keep the contents NUL terminated */ +void yajl_buf_append(yajl_buf buf, const void * data, size_t len) +{ + yajl_buf_ensure_available(buf, len); + if (len > 0) { + assert(data != NULL); + memcpy(buf->data + buf->used, data, len); + buf->used += len; + buf->data[buf->used] = 0; + } +} + +/* empty the buffer without releasing its storage */ +void yajl_buf_clear(yajl_buf buf) +{ + buf->used = 0; + if (buf->data) buf->data[buf->used] = 0; +} + +/* start of the contents; NULL until the first yajl_buf_append call */ +const unsigned char * yajl_buf_data(yajl_buf buf) +{ + return buf->data; +} + +/* number of bytes in use, not counting the terminating NUL */ +size_t yajl_buf_len(yajl_buf buf) +{ + return buf->used; +} + +/* shorten the contents to `len` bytes; `len` must not exceed the current + * length */ +void +yajl_buf_truncate(yajl_buf buf, size_t len) +{ + assert(len <= buf->used); + buf->used = len; + /* keep the contents NUL terminated, as append and clear do */ + if (buf->data) buf->data[buf->used] = 0; +} diff --git a/xlators/cluster/nsr-server/src/yajl_buf.h b/xlators/cluster/nsr-server/src/yajl_buf.h new file mode 100644 index 000000000..94929a519 --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl_buf.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2007-2011, Lloyd Hilaiel + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose 
with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef __YAJL_BUF_H__ +#define __YAJL_BUF_H__ + +#include "yajl/yajl_common.h" +#include "yajl_alloc.h" + +/* + * Implementation/performance notes. If this were moved to a header + * only implementation using #define's where possible we might be + * able to sqeeze a little performance out of the guy by killing function + * call overhead. YMMV. + */ + +/** + * yajl_buf is a buffer with exponential growth. the buffer ensures that + * you are always null padded. 
+ */ +typedef struct yajl_buf_t * yajl_buf; + +/* allocate a new buffer */ +yajl_buf yajl_buf_alloc(yajl_alloc_funcs * alloc); + +/* free the buffer */ +void yajl_buf_free(yajl_buf buf); + +/* append a number of bytes to the buffer */ +void yajl_buf_append(yajl_buf buf, const void * data, size_t len); + +/* empty the buffer */ +void yajl_buf_clear(yajl_buf buf); + +/* get a pointer to the beginning of the buffer */ +const unsigned char * yajl_buf_data(yajl_buf buf); + +/* get the length of the buffer */ +size_t yajl_buf_len(yajl_buf buf); + +/* truncate the buffer */ +void yajl_buf_truncate(yajl_buf buf, size_t len); + +#endif diff --git a/xlators/cluster/nsr-server/src/yajl_bytestack.h b/xlators/cluster/nsr-server/src/yajl_bytestack.h new file mode 100644 index 000000000..1fc50c470 --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl_bytestack.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2007-2011, Lloyd Hilaiel + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/* + * A header only implementation of a simple stack of bytes, used in YAJL + * to maintain parse state. 
+ */ + +#ifndef __YAJL_BYTESTACK_H__ +#define __YAJL_BYTESTACK_H__ + +#include "yajl/yajl_common.h" + +/* number of bytes the stack storage grows by when a push finds it full */ +#define YAJL_BS_INC 128 + +/* a growable stack of bytes together with the allocator used to grow it */ +typedef struct yajl_bytestack_t +{ + unsigned char * stack; /* storage, NULL until the first push */ + size_t size; /* bytes allocated */ + size_t used; /* bytes in use; the top of the stack is stack[used - 1] */ + yajl_alloc_funcs * yaf; +} yajl_bytestack; + +/* initialize a bytestack: empty, no storage, allocating through _yaf */ +#define yajl_bs_init(obs, _yaf) { \ + (obs).stack = NULL; \ + (obs).size = 0; \ + (obs).used = 0; \ + (obs).yaf = (_yaf); \ + } \ + + +/* release the storage of a bytestack (nothing to do if it never grew) */ +#define yajl_bs_free(obs) \ + if ((obs).stack) (obs).yaf->free((obs).yaf->ctx, (obs).stack); + +/* the byte on top of the stack; asserts that the stack is not empty */ +#define yajl_bs_current(obs) \ + (assert((obs).used > 0), (obs).stack[(obs).used - 1]) + +/* push a byte, first growing the storage by YAJL_BS_INC bytes when it is + * full; the result of the realloc call is not checked */ +#define yajl_bs_push(obs, byte) { \ + if (((obs).size - (obs).used) == 0) { \ + (obs).size += YAJL_BS_INC; \ + (obs).stack = (obs).yaf->realloc((obs).yaf->ctx,\ + (void *) (obs).stack, (obs).size);\ + } \ + (obs).stack[((obs).used)++] = (byte); \ +} + +/* removes the top item of the stack, returns nothing */ +#define yajl_bs_pop(obs) { ((obs).used)--; } + +/* overwrite the byte on top of the stack; emptiness is not checked */ +#define yajl_bs_set(obs, byte) \ + (obs).stack[((obs).used) - 1] = (byte); + + +#endif diff --git a/xlators/cluster/nsr-server/src/yajl_encode.c b/xlators/cluster/nsr-server/src/yajl_encode.c new file mode 100644 index 000000000..9dc9a3e81 --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl_encode.c @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2007-2011, Lloyd Hilaiel + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include "yajl_encode.h" + +#include +#include +#include +#include + +static void CharToHex(unsigned char c, char * hexBuf) +{ + const char * hexchar = "0123456789ABCDEF"; + hexBuf[0] = hexchar[c >> 4]; + hexBuf[1] = hexchar[c & 0x0F]; +} + +void +yajl_string_encode(const yajl_print_t print, + void * ctx, + const unsigned char * str, + size_t len, + int escape_solidus) +{ + size_t beg = 0; + size_t end = 0; + char hexBuf[7]; + hexBuf[0] = '\\'; hexBuf[1] = 'u'; hexBuf[2] = '0'; hexBuf[3] = '0'; + hexBuf[6] = 0; + + while (end < len) { + const char * escaped = NULL; + switch (str[end]) { + case '\r': escaped = "\\r"; break; + case '\n': escaped = "\\n"; break; + case '\\': escaped = "\\\\"; break; + /* it is not required to escape a solidus in JSON: + * read sec. 
2.5: http://www.ietf.org/rfc/rfc4627.txt + * specifically, this production from the grammar: + * unescaped = %x20-21 / %x23-5B / %x5D-10FFFF + */ + case '/': if (escape_solidus) escaped = "\\/"; break; + case '"': escaped = "\\\""; break; + case '\f': escaped = "\\f"; break; + case '\b': escaped = "\\b"; break; + case '\t': escaped = "\\t"; break; + default: + if ((unsigned char) str[end] < 32) { + CharToHex(str[end], hexBuf + 4); + escaped = hexBuf; + } + break; + } + if (escaped != NULL) { + print(ctx, (const char *) (str + beg), end - beg); + print(ctx, escaped, (unsigned int)strlen(escaped)); + beg = ++end; + } else { + ++end; + } + } + print(ctx, (const char *) (str + beg), end - beg); +} + +static void hexToDigit(unsigned int * val, const unsigned char * hex) +{ + unsigned int i; + for (i=0;i<4;i++) { + unsigned char c = hex[i]; + if (c >= 'A') c = (c & ~0x20) - 7; + c -= '0'; + assert(!(c & 0xF0)); + *val = (*val << 4) | c; + } +} + +static void Utf32toUtf8(unsigned int codepoint, char * utf8Buf) +{ + if (codepoint < 0x80) { + utf8Buf[0] = (char) codepoint; + utf8Buf[1] = 0; + } else if (codepoint < 0x0800) { + utf8Buf[0] = (char) ((codepoint >> 6) | 0xC0); + utf8Buf[1] = (char) ((codepoint & 0x3F) | 0x80); + utf8Buf[2] = 0; + } else if (codepoint < 0x10000) { + utf8Buf[0] = (char) ((codepoint >> 12) | 0xE0); + utf8Buf[1] = (char) (((codepoint >> 6) & 0x3F) | 0x80); + utf8Buf[2] = (char) ((codepoint & 0x3F) | 0x80); + utf8Buf[3] = 0; + } else if (codepoint < 0x200000) { + utf8Buf[0] =(char)((codepoint >> 18) | 0xF0); + utf8Buf[1] =(char)(((codepoint >> 12) & 0x3F) | 0x80); + utf8Buf[2] =(char)(((codepoint >> 6) & 0x3F) | 0x80); + utf8Buf[3] =(char)((codepoint & 0x3F) | 0x80); + utf8Buf[4] = 0; + } else { + utf8Buf[0] = '?'; + utf8Buf[1] = 0; + } +} + +void yajl_string_decode(yajl_buf buf, const unsigned char * str, + size_t len) +{ + size_t beg = 0; + size_t end = 0; + + while (end < len) { + if (str[end] == '\\') { + char utf8Buf[5]; + const char * 
unescaped = "?"; + yajl_buf_append(buf, str + beg, end - beg); + switch (str[++end]) { + case 'r': unescaped = "\r"; break; + case 'n': unescaped = "\n"; break; + case '\\': unescaped = "\\"; break; + case '/': unescaped = "/"; break; + case '"': unescaped = "\""; break; + case 'f': unescaped = "\f"; break; + case 'b': unescaped = "\b"; break; + case 't': unescaped = "\t"; break; + case 'u': { + unsigned int codepoint = 0; + hexToDigit(&codepoint, str + ++end); + end+=3; + /* check if this is a surrogate */ + if ((codepoint & 0xFC00) == 0xD800) { + end++; + /* look ahead for the second half of the pair, but never + * past the end of the input */ + if (end + 1 < len && str[end] == '\\' && str[end + 1] == 'u') { + unsigned int surrogate = 0; + hexToDigit(&surrogate, str + end + 2); + codepoint = + (((codepoint & 0x3F) << 10) | + ((((codepoint >> 6) & 0xF) + 1) << 16) | + (surrogate & 0x3FF)); + end += 5; + } else { + /* unpaired surrogate: emit '?' and step back onto the + * last hex digit, so that the ++end after the switch + * resumes at the byte following the escape instead of + * skipping it */ + unescaped = "?"; + end--; + break; + } + } + + Utf32toUtf8(codepoint, utf8Buf); + unescaped = utf8Buf; + + if (codepoint == 0) { + yajl_buf_append(buf, unescaped, 1); + beg = ++end; + continue; + } + + break; + } + default: + assert("this should never happen" == NULL); + } + yajl_buf_append(buf, unescaped, (unsigned int)strlen(unescaped)); + beg = ++end; + } else { + end++; + } + } + yajl_buf_append(buf, str + beg, end - beg); +} + +#define ADV_PTR s++; if (!(len--)) return 0; + +/* returns 1 when the len bytes at s consist of well-formed one to four byte + * UTF-8 sequences, judged only by the lead and continuation byte bit patterns, + * and 0 otherwise. Empty input is valid; a NULL s with a non-zero len is not. */ +int yajl_string_validate_utf8(const unsigned char * s, size_t len) +{ + if (!len) return 1; + if (!s) return 0; + + while (len--) { + /* single byte */ + if (*s <= 0x7f) { + /* noop */ + } + /* two byte */ + else if ((*s >> 5) == 0x6) { + ADV_PTR; + if (!((*s >> 6) == 0x2)) return 0; + } + /* three byte */ + else if ((*s >> 4) == 0x0e) { + ADV_PTR; + if (!((*s >> 6) == 0x2)) return 0; + ADV_PTR; + if (!((*s >> 6) == 0x2)) return 0; + } + /* four byte */ + else if ((*s >> 3) == 0x1e) { + ADV_PTR; + if (!((*s >> 6) == 0x2)) return 0; + ADV_PTR; + if (!((*s >> 6) == 0x2)) return 0; + ADV_PTR; + if (!((*s >> 6) == 0x2)) return 0; + } else { + return 0; + } + + s++; + } + + return 1; +} diff --git 
a/xlators/cluster/nsr-server/src/yajl_encode.h b/xlators/cluster/nsr-server/src/yajl_encode.h new file mode 100644 index 000000000..af1e8bbde --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl_encode.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2007-2011, Lloyd Hilaiel + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef __YAJL_ENCODE_H__ +#define __YAJL_ENCODE_H__ + +#include "yajl_buf.h" +#include "yajl/yajl_gen.h" + +void yajl_string_encode(const yajl_print_t printer, + void * ctx, + const unsigned char * str, + size_t length, + int escape_solidus); + +void yajl_string_decode(yajl_buf buf, const unsigned char * str, + size_t length); + +int yajl_string_validate_utf8(const unsigned char * s, size_t len); + +#endif diff --git a/xlators/cluster/nsr-server/src/yajl_gen.c b/xlators/cluster/nsr-server/src/yajl_gen.c new file mode 100644 index 000000000..73763a9e0 --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl_gen.c @@ -0,0 +1,350 @@ +/* + * Copyright (c) 2007-2011, Lloyd Hilaiel + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "yajl/yajl_gen.h"
+#include "yajl_buf.h"
+#include "yajl_encode.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <math.h>
+#include <stdarg.h>
+
+/* Generator state, kept once per nesting level in yajl_gen_t.state[].
+ * The transitions are driven by the APPENDED_ATOM macro and the separator
+ * that precedes the next token is chosen by INSERT_SEP. */
+typedef enum {
+    yajl_gen_start,        /* nothing emitted yet at the top level */
+    yajl_gen_map_start,    /* map opened, no key written yet */
+    yajl_gen_map_key,      /* a value was just written; next token is a key, preceded by ',' */
+    yajl_gen_map_val,      /* a key was just written; next token is its value, preceded by ':' */
+    yajl_gen_array_start,  /* array opened, no element written yet */
+    yajl_gen_in_array,     /* at least one element written; next is preceded by ',' */
+    yajl_gen_complete,     /* a complete top-level value has been emitted */
+    yajl_gen_error
+} yajl_gen_state;
+
+struct yajl_gen_t
+{
+    /* bitmask of the flag-style yajl_gen_option values set via yajl_gen_config */
+    unsigned int flags;
+    /* current nesting level; index into state[] */
+    unsigned int depth;
+    /* string printed once per nesting level when beautifying */
+    const char * indentString;
+    yajl_gen_state state[YAJL_MAX_DEPTH];
+    /* output callback; yajl_gen_alloc installs yajl_buf_append */
+    yajl_print_t print;
+    void * ctx; /* yajl_buf */
+    /* memory allocation routines */
+    yajl_alloc_funcs alloc;
+};
+
+int
+yajl_gen_config(yajl_gen g, yajl_gen_option opt, ...)
+{ + int rv = 1; + va_list ap; + va_start(ap, opt); + + switch(opt) { + case yajl_gen_beautify: + case yajl_gen_validate_utf8: + if (va_arg(ap, int)) g->flags |= opt; + else g->flags &= ~opt; + break; + case yajl_gen_indent_string: { + const char *indent = va_arg(ap, const char *); + g->indentString = indent; + for (; *indent; indent++) { + if (*indent != '\n' + && *indent != '\v' + && *indent != '\f' + && *indent != '\t' + && *indent != '\r' + && *indent != ' ') + { + g->indentString = NULL; + rv = 0; + } + } + break; + } + case yajl_gen_print_callback: + yajl_buf_free(g->ctx); + g->print = va_arg(ap, const yajl_print_t); + g->ctx = va_arg(ap, void *); + break; + default: + rv = 0; + } + + va_end(ap); + + return rv; +} + + + +yajl_gen +yajl_gen_alloc(const yajl_alloc_funcs * afs) +{ + yajl_gen g = NULL; + yajl_alloc_funcs afsBuffer; + + /* first order of business is to set up memory allocation routines */ + if (afs != NULL) { + if (afs->malloc == NULL || afs->realloc == NULL || afs->free == NULL) + { + return NULL; + } + } else { + yajl_set_default_alloc_funcs(&afsBuffer); + afs = &afsBuffer; + } + + g = (yajl_gen) YA_MALLOC(afs, sizeof(struct yajl_gen_t)); + if (!g) return NULL; + + memset((void *) g, 0, sizeof(struct yajl_gen_t)); + /* copy in pointers to allocation routines */ + memcpy((void *) &(g->alloc), (void *) afs, sizeof(yajl_alloc_funcs)); + + g->print = (yajl_print_t)&yajl_buf_append; + g->ctx = yajl_buf_alloc(&(g->alloc)); + g->indentString = " "; + + return g; +} + +void +yajl_gen_free(yajl_gen g) +{ + if (g->print == (yajl_print_t)&yajl_buf_append) yajl_buf_free((yajl_buf)g->ctx); + YA_FREE(&(g->alloc), g); +} + +#define INSERT_SEP \ + if (g->state[g->depth] == yajl_gen_map_key || \ + g->state[g->depth] == yajl_gen_in_array) { \ + g->print(g->ctx, ",", 1); \ + if ((g->flags & yajl_gen_beautify)) g->print(g->ctx, "\n", 1); \ + } else if (g->state[g->depth] == yajl_gen_map_val) { \ + g->print(g->ctx, ":", 1); \ + if ((g->flags & yajl_gen_beautify)) 
g->print(g->ctx, " ", 1); \ + } + +#define INSERT_WHITESPACE \ + if ((g->flags & yajl_gen_beautify)) { \ + if (g->state[g->depth] != yajl_gen_map_val) { \ + unsigned int _i; \ + for (_i=0;_idepth;_i++) \ + g->print(g->ctx, \ + g->indentString, \ + (unsigned int)strlen(g->indentString)); \ + } \ + } + +#define ENSURE_NOT_KEY \ + if (g->state[g->depth] == yajl_gen_map_key || \ + g->state[g->depth] == yajl_gen_map_start) { \ + return yajl_gen_keys_must_be_strings; \ + } \ + +/* check that we're not complete, or in error state. in a valid state + * to be generating */ +#define ENSURE_VALID_STATE \ + if (g->state[g->depth] == yajl_gen_error) { \ + return yajl_gen_in_error_state;\ + } else if (g->state[g->depth] == yajl_gen_complete) { \ + return yajl_gen_generation_complete; \ + } + +#define INCREMENT_DEPTH \ + if (++(g->depth) >= YAJL_MAX_DEPTH) return yajl_max_depth_exceeded; + +#define DECREMENT_DEPTH \ + if (--(g->depth) >= YAJL_MAX_DEPTH) return yajl_gen_error; + +#define APPENDED_ATOM \ + switch (g->state[g->depth]) { \ + case yajl_gen_start: \ + g->state[g->depth] = yajl_gen_complete; \ + break; \ + case yajl_gen_map_start: \ + case yajl_gen_map_key: \ + g->state[g->depth] = yajl_gen_map_val; \ + break; \ + case yajl_gen_array_start: \ + g->state[g->depth] = yajl_gen_in_array; \ + break; \ + case yajl_gen_map_val: \ + g->state[g->depth] = yajl_gen_map_key; \ + break; \ + default: \ + break; \ + } \ + +#define FINAL_NEWLINE \ + if ((g->flags & yajl_gen_beautify) && g->state[g->depth] == yajl_gen_complete) \ + g->print(g->ctx, "\n", 1); + +yajl_gen_status +yajl_gen_integer(yajl_gen g, long long int number) +{ + char i[32]; + ENSURE_VALID_STATE; ENSURE_NOT_KEY; INSERT_SEP; INSERT_WHITESPACE; + sprintf(i, "%lld", number); + g->print(g->ctx, i, (unsigned int)strlen(i)); + APPENDED_ATOM; + FINAL_NEWLINE; + return yajl_gen_status_ok; +} + +#ifdef WIN32 +#include +#define isnan _isnan +#define isinf !_finite +#endif + +yajl_gen_status +yajl_gen_double(yajl_gen g, double 
number) +{ + char i[32]; + ENSURE_VALID_STATE; ENSURE_NOT_KEY; + if (isnan(number) || isinf(number)) return yajl_gen_invalid_number; + INSERT_SEP; INSERT_WHITESPACE; + sprintf(i, "%.20g", number); + g->print(g->ctx, i, (unsigned int)strlen(i)); + APPENDED_ATOM; + FINAL_NEWLINE; + return yajl_gen_status_ok; +} + +yajl_gen_status +yajl_gen_number(yajl_gen g, const char * s, size_t l) +{ + ENSURE_VALID_STATE; ENSURE_NOT_KEY; INSERT_SEP; INSERT_WHITESPACE; + g->print(g->ctx, s, l); + APPENDED_ATOM; + FINAL_NEWLINE; + return yajl_gen_status_ok; +} + +yajl_gen_status +yajl_gen_string(yajl_gen g, const unsigned char * str, + size_t len) +{ + // if validation is enabled, check that the string is valid utf8 + // XXX: This checking could be done a little faster, in the same pass as + // the string encoding + if (g->flags & yajl_gen_validate_utf8) { + if (!yajl_string_validate_utf8(str, len)) { + return yajl_gen_invalid_string; + } + } + ENSURE_VALID_STATE; INSERT_SEP; INSERT_WHITESPACE; + g->print(g->ctx, "\"", 1); + yajl_string_encode(g->print, g->ctx, str, len, g->flags & yajl_gen_escape_solidus); + g->print(g->ctx, "\"", 1); + APPENDED_ATOM; + FINAL_NEWLINE; + return yajl_gen_status_ok; +} + +yajl_gen_status +yajl_gen_null(yajl_gen g) +{ + ENSURE_VALID_STATE; ENSURE_NOT_KEY; INSERT_SEP; INSERT_WHITESPACE; + g->print(g->ctx, "null", strlen("null")); + APPENDED_ATOM; + FINAL_NEWLINE; + return yajl_gen_status_ok; +} + +yajl_gen_status +yajl_gen_bool(yajl_gen g, int boolean) +{ + const char * val = boolean ? 
"true" : "false"; + + ENSURE_VALID_STATE; ENSURE_NOT_KEY; INSERT_SEP; INSERT_WHITESPACE; + g->print(g->ctx, val, (unsigned int)strlen(val)); + APPENDED_ATOM; + FINAL_NEWLINE; + return yajl_gen_status_ok; +} + +yajl_gen_status +yajl_gen_map_open(yajl_gen g) +{ + ENSURE_VALID_STATE; ENSURE_NOT_KEY; INSERT_SEP; INSERT_WHITESPACE; + INCREMENT_DEPTH; + + g->state[g->depth] = yajl_gen_map_start; + g->print(g->ctx, "{", 1); + if ((g->flags & yajl_gen_beautify)) g->print(g->ctx, "\n", 1); + FINAL_NEWLINE; + return yajl_gen_status_ok; +} + +yajl_gen_status +yajl_gen_map_close(yajl_gen g) +{ + ENSURE_VALID_STATE; + DECREMENT_DEPTH; + + if ((g->flags & yajl_gen_beautify)) g->print(g->ctx, "\n", 1); + APPENDED_ATOM; + INSERT_WHITESPACE; + g->print(g->ctx, "}", 1); + FINAL_NEWLINE; + return yajl_gen_status_ok; +} + +yajl_gen_status +yajl_gen_array_open(yajl_gen g) +{ + ENSURE_VALID_STATE; ENSURE_NOT_KEY; INSERT_SEP; INSERT_WHITESPACE; + INCREMENT_DEPTH; + g->state[g->depth] = yajl_gen_array_start; + g->print(g->ctx, "[", 1); + if ((g->flags & yajl_gen_beautify)) g->print(g->ctx, "\n", 1); + FINAL_NEWLINE; + return yajl_gen_status_ok; +} + +yajl_gen_status +yajl_gen_array_close(yajl_gen g) +{ + ENSURE_VALID_STATE; + DECREMENT_DEPTH; + if ((g->flags & yajl_gen_beautify)) g->print(g->ctx, "\n", 1); + APPENDED_ATOM; + INSERT_WHITESPACE; + g->print(g->ctx, "]", 1); + FINAL_NEWLINE; + return yajl_gen_status_ok; +} + +yajl_gen_status +yajl_gen_get_buf(yajl_gen g, const unsigned char ** buf, + size_t * len) +{ + if (g->print != (yajl_print_t)&yajl_buf_append) return yajl_gen_no_buf; + *buf = yajl_buf_data((yajl_buf)g->ctx); + *len = yajl_buf_len((yajl_buf)g->ctx); + return yajl_gen_status_ok; +} + +void +yajl_gen_clear(yajl_gen g) +{ + if (g->print == (yajl_print_t)&yajl_buf_append) yajl_buf_clear((yajl_buf)g->ctx); +} diff --git a/xlators/cluster/nsr-server/src/yajl_lex.c b/xlators/cluster/nsr-server/src/yajl_lex.c new file mode 100644 index 000000000..b098e6a99 --- /dev/null +++ 
b/xlators/cluster/nsr-server/src/yajl_lex.c @@ -0,0 +1,763 @@ +/* + * Copyright (c) 2007-2011, Lloyd Hilaiel + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include "yajl_lex.h" +#include "yajl_buf.h" + +#include +#include +#include +#include + +#ifdef YAJL_LEXER_DEBUG +static const char * +tokToStr(yajl_tok tok) +{ + switch (tok) { + case yajl_tok_bool: return "bool"; + case yajl_tok_colon: return "colon"; + case yajl_tok_comma: return "comma"; + case yajl_tok_eof: return "eof"; + case yajl_tok_error: return "error"; + case yajl_tok_left_brace: return "brace"; + case yajl_tok_left_bracket: return "bracket"; + case yajl_tok_null: return "null"; + case yajl_tok_integer: return "integer"; + case yajl_tok_double: return "double"; + case yajl_tok_right_brace: return "brace"; + case yajl_tok_right_bracket: return "bracket"; + case yajl_tok_string: return "string"; + case yajl_tok_string_with_escapes: return "string_with_escapes"; + } + return "unknown"; +} +#endif + +/* Impact of the stream parsing feature on the lexer: + * + * YAJL support stream parsing. That is, the ability to parse the first + * bits of a chunk of JSON before the last bits are available (still on + * the network or disk). This makes the lexer more complex. 
The
+ * responsibility of the lexer is to handle transparently the case where
+ * a chunk boundary falls in the middle of a token. This is
+ * accomplished via a buffer and a character reading abstraction.
+ *
+ * Overview of implementation
+ *
+ * When we lex to end of input string before end of token is hit, we
+ * copy all of the input text composing the token into our lexBuf.
+ *
+ * Every time we read a character, we do so through the readChar function.
+ * readChar's responsibility is to handle pulling all chars from the buffer
+ * before pulling chars from input text
+ */
+
+struct yajl_lexer_t {
+    /* the overall line and char offset into the data */
+    size_t lineOff;
+    size_t charOff;
+
+    /* error */
+    yajl_lex_error error;
+
+    /* an input buffer to handle the case where a token is spread over
+     * multiple chunks */
+    yajl_buf buf;
+
+    /* in the case where we have data in the lexBuf, bufOff holds
+     * the current offset into the lexBuf. */
+    size_t bufOff;
+
+    /* are we using the lex buf? */
+    unsigned int bufInUse;
+
+    /* shall we allow comments? */
+    unsigned int allowComments;
+
+    /* shall we validate utf8 inside strings? */
+    unsigned int validateUTF8;
+
+    yajl_alloc_funcs * alloc;
+};
+/* readChar: return the next byte, draining buffered bytes of lxr->buf (from
+ * bufOff) before reading txt[*off]; advances whichever offset it used.
+ * It does no bounds check on txt, so callers test *off first. */
+#define readChar(lxr, txt, off) \
+    (((lxr)->bufInUse && yajl_buf_len((lxr)->buf) && lxr->bufOff < yajl_buf_len((lxr)->buf)) ? \
+     (*((const unsigned char *) yajl_buf_data((lxr)->buf) + ((lxr)->bufOff)++)) : \
+     ((txt)[(*(off))++]))
+
+#define unreadChar(lxr, off) ((*(off) > 0) ?
(*(off))-- : ((lxr)->bufOff--))
+
+/* Allocate a lexer using the given allocation routines.
+ * allowComments and validateUTF8 are stored as the lexer's option flags.
+ * Returns NULL when the lexer structure cannot be allocated. */
+yajl_lexer
+yajl_lex_alloc(yajl_alloc_funcs * alloc,
+               unsigned int allowComments, unsigned int validateUTF8)
+{
+    yajl_lexer lxr = (yajl_lexer) YA_MALLOC(alloc, sizeof(struct yajl_lexer_t));
+    /* do not hand a failed allocation to memset() */
+    if (lxr == NULL) return NULL;
+    memset((void *) lxr, 0, sizeof(struct yajl_lexer_t));
+    lxr->buf = yajl_buf_alloc(alloc);
+    lxr->allowComments = allowComments;
+    lxr->validateUTF8 = validateUTF8;
+    lxr->alloc = alloc;
+    return lxr;
+}
+
+/* Release a lexer and its token buffer.  A NULL lexer (as returned by a
+ * failed yajl_lex_alloc) is ignored. */
+void
+yajl_lex_free(yajl_lexer lxr)
+{
+    if (lxr == NULL) return;
+    yajl_buf_free(lxr->buf);
+    YA_FREE(lxr->alloc, lxr);
+    return;
+}
+
+/* a lookup table which lets us quickly determine several things:
+ * VEC - valid escaped control char
+ * note. the solidus '/' may be escaped or not.
+ * IJC - invalid json char
+ * VHC - valid hex char
+ * NFP - needs further processing (from a string scanning perspective)
+ * NUC - needs utf8 checking when enabled (from a string scanning perspective)
+ */
+#define VEC 0x01
+#define IJC 0x02
+#define VHC 0x04
+#define NFP 0x08
+#define NUC 0x10
+
+static const char charLookupTable[256] =
+{
+/*00*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC ,
+/*08*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC ,
+/*10*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC ,
+/*18*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC ,
+
+/*20*/ 0 , 0 , NFP|VEC|IJC, 0 , 0 , 0 , 0 , 0 ,
+/*28*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , VEC ,
+/*30*/ VHC , VHC , VHC , VHC , VHC , VHC , VHC , VHC ,
+/*38*/ VHC , VHC , 0 , 0 , 0 , 0 , 0 , 0 ,
+
+/*40*/ 0 , VHC , VHC , VHC , VHC , VHC , VHC , 0 ,
+/*48*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+/*50*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+/*58*/ 0 , 0 , 0 , 0 , NFP|VEC|IJC, 0 , 0 , 0 ,
+
+/*60*/ 0 , VHC , VEC|VHC, VHC , VHC , VHC , VEC|VHC, 0 ,
+/*68*/ 0 , 0 , 0 , 0 , 0 , 0 , VEC , 0 ,
+/*70*/ 0 , 0 , VEC , 0 , VEC , 0 , 0 , 0 ,
+/*78*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC ,
NUC , NUC , NUC , NUC , NUC , + + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC +}; + +/** process a variable length utf8 encoded codepoint. + * + * returns: + * yajl_tok_string - if valid utf8 char was parsed and offset was + * advanced + * yajl_tok_eof - if end of input was hit before validation could + * complete + * yajl_tok_error - if invalid utf8 was encountered + * + * NOTE: on error the offset will point to the first char of the + * invalid utf8 */ +#define UTF8_CHECK_EOF if (*offset >= jsonTextLen) { return yajl_tok_eof; } + +static yajl_tok +yajl_lex_utf8_char(yajl_lexer lexer, const unsigned char * jsonText, + size_t jsonTextLen, size_t * offset, + unsigned char curChar) +{ + if (curChar <= 0x7f) { + /* single byte */ + return yajl_tok_string; + } else if ((curChar >> 5) == 0x6) { + /* two byte */ + UTF8_CHECK_EOF; + curChar = readChar(lexer, jsonText, offset); + if ((curChar >> 6) == 0x2) return yajl_tok_string; + } else if ((curChar >> 4) == 0x0e) { + /* three byte */ + UTF8_CHECK_EOF; + curChar = readChar(lexer, jsonText, offset); + if ((curChar >> 6) == 0x2) { + UTF8_CHECK_EOF; + curChar = readChar(lexer, jsonText, offset); + if ((curChar >> 6) == 0x2) return yajl_tok_string; + } + } else if ((curChar >> 3) == 0x1e) { + /* four byte */ + UTF8_CHECK_EOF; + curChar = readChar(lexer, jsonText, offset); + if ((curChar >> 6) == 0x2) { + UTF8_CHECK_EOF; + curChar = readChar(lexer, jsonText, offset); + if 
((curChar >> 6) == 0x2) { + UTF8_CHECK_EOF; + curChar = readChar(lexer, jsonText, offset); + if ((curChar >> 6) == 0x2) return yajl_tok_string; + } + } + } + + return yajl_tok_error; +} + +/* lex a string. input is the lexer, pointer to beginning of + * json text, and start of string (offset). + * a token is returned which has the following meanings: + * yajl_tok_string: lex of string was successful. offset points to + * terminating '"'. + * yajl_tok_eof: end of text was encountered before we could complete + * the lex. + * yajl_tok_error: embedded in the string were unallowable chars. offset + * points to the offending char + */ +#define STR_CHECK_EOF \ +if (*offset >= jsonTextLen) { \ + tok = yajl_tok_eof; \ + goto finish_string_lex; \ +} + +/** scan a string for interesting characters that might need further + * review. return the number of chars that are uninteresting and can + * be skipped. + * (lth) hi world, any thoughts on how to make this routine faster? */ +static size_t +yajl_string_scan(const unsigned char * buf, size_t len, int utf8check) +{ + unsigned char mask = IJC|NFP|(utf8check ? 
NUC : 0); + size_t skip = 0; + while (skip < len && !(charLookupTable[*buf] & mask)) + { + skip++; + buf++; + } + return skip; +} + +static yajl_tok +yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText, + size_t jsonTextLen, size_t * offset) +{ + yajl_tok tok = yajl_tok_error; + int hasEscapes = 0; + + for (;;) { + unsigned char curChar; + + /* now jump into a faster scanning routine to skip as much + * of the buffers as possible */ + { + const unsigned char * p; + size_t len; + + if ((lexer->bufInUse && yajl_buf_len(lexer->buf) && + lexer->bufOff < yajl_buf_len(lexer->buf))) + { + p = ((const unsigned char *) yajl_buf_data(lexer->buf) + + (lexer->bufOff)); + len = yajl_buf_len(lexer->buf) - lexer->bufOff; + lexer->bufOff += yajl_string_scan(p, len, lexer->validateUTF8); + } + else if (*offset < jsonTextLen) + { + p = jsonText + *offset; + len = jsonTextLen - *offset; + *offset += yajl_string_scan(p, len, lexer->validateUTF8); + } + } + + STR_CHECK_EOF; + + curChar = readChar(lexer, jsonText, offset); + + /* quote terminates */ + if (curChar == '"') { + tok = yajl_tok_string; + break; + } + /* backslash escapes a set of control chars, */ + else if (curChar == '\\') { + hasEscapes = 1; + STR_CHECK_EOF; + + /* special case \u */ + curChar = readChar(lexer, jsonText, offset); + if (curChar == 'u') { + unsigned int i = 0; + + for (i=0;i<4;i++) { + STR_CHECK_EOF; + curChar = readChar(lexer, jsonText, offset); + if (!(charLookupTable[curChar] & VHC)) { + /* back up to offending char */ + unreadChar(lexer, offset); + lexer->error = yajl_lex_string_invalid_hex_char; + goto finish_string_lex; + } + } + } else if (!(charLookupTable[curChar] & VEC)) { + /* back up to offending char */ + unreadChar(lexer, offset); + lexer->error = yajl_lex_string_invalid_escaped_char; + goto finish_string_lex; + } + } + /* when not validating UTF8 it's a simple table lookup to determine + * if the present character is invalid */ + else if(charLookupTable[curChar] & IJC) { + /* 
back up to offending char */ + unreadChar(lexer, offset); + lexer->error = yajl_lex_string_invalid_json_char; + goto finish_string_lex; + } + /* when in validate UTF8 mode we need to do some extra work */ + else if (lexer->validateUTF8) { + yajl_tok t = yajl_lex_utf8_char(lexer, jsonText, jsonTextLen, + offset, curChar); + + if (t == yajl_tok_eof) { + tok = yajl_tok_eof; + goto finish_string_lex; + } else if (t == yajl_tok_error) { + lexer->error = yajl_lex_string_invalid_utf8; + goto finish_string_lex; + } + } + /* accept it, and move on */ + } + finish_string_lex: + /* tell our buddy, the parser, wether he needs to process this string + * again */ + if (hasEscapes && tok == yajl_tok_string) { + tok = yajl_tok_string_with_escapes; + } + + return tok; +} + +#define RETURN_IF_EOF if (*offset >= jsonTextLen) return yajl_tok_eof; + +static yajl_tok +yajl_lex_number(yajl_lexer lexer, const unsigned char * jsonText, + size_t jsonTextLen, size_t * offset) +{ + /** XXX: numbers are the only entities in json that we must lex + * _beyond_ in order to know that they are complete. There + * is an ambiguous case for integers at EOF. 
*/ + + unsigned char c; + + yajl_tok tok = yajl_tok_integer; + + RETURN_IF_EOF; + c = readChar(lexer, jsonText, offset); + + /* optional leading minus */ + if (c == '-') { + RETURN_IF_EOF; + c = readChar(lexer, jsonText, offset); + } + + /* a single zero, or a series of integers */ + if (c == '0') { + RETURN_IF_EOF; + c = readChar(lexer, jsonText, offset); + } else if (c >= '1' && c <= '9') { + do { + RETURN_IF_EOF; + c = readChar(lexer, jsonText, offset); + } while (c >= '0' && c <= '9'); + } else { + unreadChar(lexer, offset); + lexer->error = yajl_lex_missing_integer_after_minus; + return yajl_tok_error; + } + + /* optional fraction (indicates this is floating point) */ + if (c == '.') { + int numRd = 0; + + RETURN_IF_EOF; + c = readChar(lexer, jsonText, offset); + + while (c >= '0' && c <= '9') { + numRd++; + RETURN_IF_EOF; + c = readChar(lexer, jsonText, offset); + } + + if (!numRd) { + unreadChar(lexer, offset); + lexer->error = yajl_lex_missing_integer_after_decimal; + return yajl_tok_error; + } + tok = yajl_tok_double; + } + + /* optional exponent (indicates this is floating point) */ + if (c == 'e' || c == 'E') { + RETURN_IF_EOF; + c = readChar(lexer, jsonText, offset); + + /* optional sign */ + if (c == '+' || c == '-') { + RETURN_IF_EOF; + c = readChar(lexer, jsonText, offset); + } + + if (c >= '0' && c <= '9') { + do { + RETURN_IF_EOF; + c = readChar(lexer, jsonText, offset); + } while (c >= '0' && c <= '9'); + } else { + unreadChar(lexer, offset); + lexer->error = yajl_lex_missing_integer_after_exponent; + return yajl_tok_error; + } + tok = yajl_tok_double; + } + + /* we always go "one too far" */ + unreadChar(lexer, offset); + + return tok; +} + +static yajl_tok +yajl_lex_comment(yajl_lexer lexer, const unsigned char * jsonText, + size_t jsonTextLen, size_t * offset) +{ + unsigned char c; + + yajl_tok tok = yajl_tok_comment; + + RETURN_IF_EOF; + c = readChar(lexer, jsonText, offset); + + /* either slash or star expected */ + if (c == '/') { + /* now 
we throw away until end of line */ + do { + RETURN_IF_EOF; + c = readChar(lexer, jsonText, offset); + } while (c != '\n'); + } else if (c == '*') { + /* now we throw away until end of comment */ + for (;;) { + RETURN_IF_EOF; + c = readChar(lexer, jsonText, offset); + if (c == '*') { + RETURN_IF_EOF; + c = readChar(lexer, jsonText, offset); + if (c == '/') { + break; + } else { + unreadChar(lexer, offset); + } + } + } + } else { + lexer->error = yajl_lex_invalid_char; + tok = yajl_tok_error; + } + + return tok; +} + +yajl_tok +yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText, + size_t jsonTextLen, size_t * offset, + const unsigned char ** outBuf, size_t * outLen) +{ + yajl_tok tok = yajl_tok_error; + unsigned char c; + size_t startOffset = *offset; + + *outBuf = NULL; + *outLen = 0; + + for (;;) { + assert(*offset <= jsonTextLen); + + if (*offset >= jsonTextLen) { + tok = yajl_tok_eof; + goto lexed; + } + + c = readChar(lexer, jsonText, offset); + + switch (c) { + case '{': + tok = yajl_tok_left_bracket; + goto lexed; + case '}': + tok = yajl_tok_right_bracket; + goto lexed; + case '[': + tok = yajl_tok_left_brace; + goto lexed; + case ']': + tok = yajl_tok_right_brace; + goto lexed; + case ',': + tok = yajl_tok_comma; + goto lexed; + case ':': + tok = yajl_tok_colon; + goto lexed; + case '\t': case '\n': case '\v': case '\f': case '\r': case ' ': + startOffset++; + break; + case 't': { + const char * want = "rue"; + do { + if (*offset >= jsonTextLen) { + tok = yajl_tok_eof; + goto lexed; + } + c = readChar(lexer, jsonText, offset); + if (c != *want) { + unreadChar(lexer, offset); + lexer->error = yajl_lex_invalid_string; + tok = yajl_tok_error; + goto lexed; + } + } while (*(++want)); + tok = yajl_tok_bool; + goto lexed; + } + case 'f': { + const char * want = "alse"; + do { + if (*offset >= jsonTextLen) { + tok = yajl_tok_eof; + goto lexed; + } + c = readChar(lexer, jsonText, offset); + if (c != *want) { + unreadChar(lexer, offset); + lexer->error = 
yajl_lex_invalid_string; + tok = yajl_tok_error; + goto lexed; + } + } while (*(++want)); + tok = yajl_tok_bool; + goto lexed; + } + case 'n': { + const char * want = "ull"; + do { + if (*offset >= jsonTextLen) { + tok = yajl_tok_eof; + goto lexed; + } + c = readChar(lexer, jsonText, offset); + if (c != *want) { + unreadChar(lexer, offset); + lexer->error = yajl_lex_invalid_string; + tok = yajl_tok_error; + goto lexed; + } + } while (*(++want)); + tok = yajl_tok_null; + goto lexed; + } + case '"': { + tok = yajl_lex_string(lexer, (const unsigned char *) jsonText, + jsonTextLen, offset); + goto lexed; + } + case '-': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': { + /* integer parsing wants to start from the beginning */ + unreadChar(lexer, offset); + tok = yajl_lex_number(lexer, (const unsigned char *) jsonText, + jsonTextLen, offset); + goto lexed; + } + case '/': + /* hey, look, a probable comment! If comments are disabled + * it's an error. */ + if (!lexer->allowComments) { + unreadChar(lexer, offset); + lexer->error = yajl_lex_unallowed_comment; + tok = yajl_tok_error; + goto lexed; + } + /* if comments are enabled, then we should try to lex + * the thing. possible outcomes are + * - successful lex (tok_comment, which means continue), + * - malformed comment opening (slash not followed by + * '*' or '/') (tok_error) + * - eof hit. (tok_eof) */ + tok = yajl_lex_comment(lexer, (const unsigned char *) jsonText, + jsonTextLen, offset); + if (tok == yajl_tok_comment) { + /* "error" is silly, but that's the initial + * state of tok. guilty until proven innocent. 
*/ + tok = yajl_tok_error; + yajl_buf_clear(lexer->buf); + lexer->bufInUse = 0; + startOffset = *offset; + break; + } + /* hit error or eof, bail */ + goto lexed; + default: + lexer->error = yajl_lex_invalid_char; + tok = yajl_tok_error; + goto lexed; + } + } + + + lexed: + /* need to append to buffer if the buffer is in use or + * if it's an EOF token */ + if (tok == yajl_tok_eof || lexer->bufInUse) { + if (!lexer->bufInUse) yajl_buf_clear(lexer->buf); + lexer->bufInUse = 1; + yajl_buf_append(lexer->buf, jsonText + startOffset, *offset - startOffset); + lexer->bufOff = 0; + + if (tok != yajl_tok_eof) { + *outBuf = yajl_buf_data(lexer->buf); + *outLen = yajl_buf_len(lexer->buf); + lexer->bufInUse = 0; + } + } else if (tok != yajl_tok_error) { + *outBuf = jsonText + startOffset; + *outLen = *offset - startOffset; + } + + /* special case for strings. skip the quotes. */ + if (tok == yajl_tok_string || tok == yajl_tok_string_with_escapes) + { + assert(*outLen >= 2); + (*outBuf)++; + *outLen -= 2; + } + + +#ifdef YAJL_LEXER_DEBUG + if (tok == yajl_tok_error) { + printf("lexical error: %s\n", + yajl_lex_error_to_string(yajl_lex_get_error(lexer))); + } else if (tok == yajl_tok_eof) { + printf("EOF hit\n"); + } else { + printf("lexed %s: '", tokToStr(tok)); + fwrite(*outBuf, 1, *outLen, stdout); + printf("'\n"); + } +#endif + + return tok; +} + +const char * +yajl_lex_error_to_string(yajl_lex_error error) +{ + switch (error) { + case yajl_lex_e_ok: + return "ok, no error"; + case yajl_lex_string_invalid_utf8: + return "invalid bytes in UTF8 string."; + case yajl_lex_string_invalid_escaped_char: + return "inside a string, '\\' occurs before a character " + "which it may not."; + case yajl_lex_string_invalid_json_char: + return "invalid character inside string."; + case yajl_lex_string_invalid_hex_char: + return "invalid (non-hex) character occurs after '\\u' inside " + "string."; + case yajl_lex_invalid_char: + return "invalid char in json text."; + case 
yajl_lex_invalid_string: + return "invalid string in json text."; + case yajl_lex_missing_integer_after_exponent: + return "malformed number, a digit is required after the exponent."; + case yajl_lex_missing_integer_after_decimal: + return "malformed number, a digit is required after the " + "decimal point."; + case yajl_lex_missing_integer_after_minus: + return "malformed number, a digit is required after the " + "minus sign."; + case yajl_lex_unallowed_comment: + return "probable comment found in input text, comments are " + "not enabled."; + } + return "unknown error code"; +} + + +/** allows access to more specific information about the lexical + * error when yajl_lex_lex returns yajl_tok_error. */ +yajl_lex_error +yajl_lex_get_error(yajl_lexer lexer) +{ + if (lexer == NULL) return (yajl_lex_error) -1; + return lexer->error; +} + +size_t yajl_lex_current_line(yajl_lexer lexer) +{ + return lexer->lineOff; +} + +size_t yajl_lex_current_char(yajl_lexer lexer) +{ + return lexer->charOff; +} + +yajl_tok yajl_lex_peek(yajl_lexer lexer, const unsigned char * jsonText, + size_t jsonTextLen, size_t offset) +{ + const unsigned char * outBuf; + size_t outLen; + size_t bufLen = yajl_buf_len(lexer->buf); + size_t bufOff = lexer->bufOff; + unsigned int bufInUse = lexer->bufInUse; + yajl_tok tok; + + tok = yajl_lex_lex(lexer, jsonText, jsonTextLen, &offset, + &outBuf, &outLen); + + lexer->bufOff = bufOff; + lexer->bufInUse = bufInUse; + yajl_buf_truncate(lexer->buf, bufLen); + + return tok; +} diff --git a/xlators/cluster/nsr-server/src/yajl_lex.h b/xlators/cluster/nsr-server/src/yajl_lex.h new file mode 100644 index 000000000..cbaae0c13 --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl_lex.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2007-2011, Lloyd Hilaiel + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all 
copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef __YAJL_LEX_H__
+#define __YAJL_LEX_H__
+
+#include "yajl/yajl_common.h"
+
+typedef enum {
+    yajl_tok_bool,
+    yajl_tok_colon,
+    yajl_tok_comma,
+    yajl_tok_eof,
+    yajl_tok_error,
+    yajl_tok_left_brace,    /* returned by yajl_lex_lex for '[' */
+    yajl_tok_left_bracket,  /* returned by yajl_lex_lex for '{' */
+    yajl_tok_null,
+    yajl_tok_right_brace,   /* returned by yajl_lex_lex for ']' */
+    yajl_tok_right_bracket, /* returned by yajl_lex_lex for '}' */
+
+    /* we differentiate between integers and doubles to allow the
+     * parser to interpret the number without re-scanning */
+    yajl_tok_integer,
+    yajl_tok_double,
+
+    /* we differentiate between strings which require further processing,
+     * and strings that do not */
+    yajl_tok_string,
+    yajl_tok_string_with_escapes,
+
+    /* comment tokens are not currently returned to the parser, ever */
+    yajl_tok_comment
+} yajl_tok;
+
+typedef struct yajl_lexer_t * yajl_lexer;
+
+yajl_lexer yajl_lex_alloc(yajl_alloc_funcs * alloc,
+                          unsigned int allowComments,
+                          unsigned int validateUTF8);
+
+void yajl_lex_free(yajl_lexer lexer);
+
+/**
+ * run/continue a lex. "offset" is an input/output parameter.
+ * It should be initialized to zero for a
+ * new chunk of target text, and upon subsequent calls with the same
+ * target text should be passed with the value of the previous invocation.
+ *
+ * the client may be interested in the value of offset when an error is
+ * returned from the lexer. This allows the client to render useful
+ * error messages.
+ *
+ * When you pass the next chunk of data, context should be reinitialized
+ * to zero.
+ * + * Finally, the output buffer is usually just a pointer into the jsonText, + * however in cases where the entity being lexed spans multiple chunks, + * the lexer will buffer the entity and the data returned will be + * a pointer into that buffer. + * + * This behavior is abstracted from client code except for the performance + * implications which require that the client choose a reasonable chunk + * size to get adequate performance. + */ +yajl_tok yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText, + size_t jsonTextLen, size_t * offset, + const unsigned char ** outBuf, size_t * outLen); + +/** have a peek at the next token, but don't move the lexer forward */ +yajl_tok yajl_lex_peek(yajl_lexer lexer, const unsigned char * jsonText, + size_t jsonTextLen, size_t offset); + + +typedef enum { + yajl_lex_e_ok = 0, + yajl_lex_string_invalid_utf8, + yajl_lex_string_invalid_escaped_char, + yajl_lex_string_invalid_json_char, + yajl_lex_string_invalid_hex_char, + yajl_lex_invalid_char, + yajl_lex_invalid_string, + yajl_lex_missing_integer_after_decimal, + yajl_lex_missing_integer_after_exponent, + yajl_lex_missing_integer_after_minus, + yajl_lex_unallowed_comment +} yajl_lex_error; + +const char * yajl_lex_error_to_string(yajl_lex_error error); + +/** allows access to more specific information about the lexical + * error when yajl_lex_lex returns yajl_tok_error. */ +yajl_lex_error yajl_lex_get_error(yajl_lexer lexer); + +/** get the current offset into the most recently lexed json string. 
*/ +size_t yajl_lex_current_offset(yajl_lexer lexer); + +/** get the number of lines lexed by this lexer instance */ +size_t yajl_lex_current_line(yajl_lexer lexer); + +/** get the number of chars lexed by this lexer instance since the last + * \n or \r */ +size_t yajl_lex_current_char(yajl_lexer lexer); + +#endif diff --git a/xlators/cluster/nsr-server/src/yajl_parser.c b/xlators/cluster/nsr-server/src/yajl_parser.c new file mode 100644 index 000000000..bf9ef24ef --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl_parser.c @@ -0,0 +1,492 @@ +/* + * Copyright (c) 2007-2011, Lloyd Hilaiel + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
/*
 * Locale-independent decimal integer parser with strtol-like error
 * reporting.
 *
 * number: pointer to the text; it does not need to be NUL terminated.
 * length: number of bytes of `number` to consume.
 *
 * Accepts an optional leading '-' and/or '+' followed by decimal digits.
 * Returns the parsed value.  If the value does not fit in a long long, or a
 * byte that is not a decimal digit is encountered, errno is set to ERANGE
 * and LLONG_MAX (LLONG_MIN for negative input) is returned.  errno is left
 * untouched on success, so callers that test errno must clear it first.
 * An empty input yields 0.
 */
long long
yajl_parse_integer(const unsigned char *number, unsigned int length)
{
    const unsigned char *pos = number;
    const unsigned char *end = number + length;
    int negative = 0;
    /* Accumulate as a non-positive value: the negative range is at least as
     * large as the positive one, so LLONG_MIN itself can be represented. */
    long long acc = 0;

    /* Bounds-check before every dereference so length == 0 is safe. */
    if (pos < end && *pos == '-') { pos++; negative = 1; }
    if (pos < end && *pos == '+') { pos++; }

    while (pos < end) {
        int digit;

        if (*pos < '0' || *pos > '9') {
            goto out_of_range;
        }
        digit = *pos++ - '0';

        /* acc * 10 would overflow */
        if (acc < LLONG_MIN / 10) {
            goto out_of_range;
        }
        acc *= 10;

        /* acc - digit would overflow */
        if (acc < LLONG_MIN + digit) {
            goto out_of_range;
        }
        acc -= digit;
    }

    if (negative) {
        return acc;
    }
    /* -acc is only representable when acc >= -LLONG_MAX */
    if (acc < -LLONG_MAX) {
        goto out_of_range;
    }
    return -acc;

out_of_range:
    errno = ERANGE;
    return negative ? LLONG_MIN : LLONG_MAX;
}
errorText); + } + strcat((char *) str, "\n"); + } + + /* now we append as many spaces as needed to make sure the error + * falls at char 41, if verbose was specified */ + if (verbose) { + size_t start, end, i; + size_t spacesNeeded; + + spacesNeeded = (offset < 30 ? 40 - offset : 10); + start = (offset >= 30 ? offset - 30 : 0); + end = (offset + 30 > jsonTextLen ? jsonTextLen : offset + 30); + + for (i=0;ialloc), (unsigned int)(strlen((char *) str) + + strlen((char *) text) + + strlen(arrow) + 1)); + if (newStr) { + newStr[0] = 0; + strcat((char *) newStr, (char *) str); + strcat((char *) newStr, text); + strcat((char *) newStr, arrow); + } + YA_FREE(&(hand->alloc), str); + str = (unsigned char *) newStr; + } + } + return str; +} + +/* check for client cancelation */ +#define _CC_CHK(x) \ + if (!(x)) { \ + yajl_bs_set(hand->stateStack, yajl_state_parse_error); \ + hand->parseError = \ + "client cancelled parse via callback return value"; \ + return yajl_status_client_canceled; \ + } + + +yajl_status +yajl_do_finish(yajl_handle hand) +{ + yajl_status stat; + stat = yajl_do_parse(hand,(const unsigned char *) " ",1); + + if (stat != yajl_status_ok) return stat; + + switch(yajl_bs_current(hand->stateStack)) + { + case yajl_state_parse_error: + case yajl_state_lexical_error: + return yajl_status_error; + case yajl_state_got_value: + case yajl_state_parse_complete: + return yajl_status_ok; + default: + if (!(hand->flags & yajl_allow_partial_values)) + { + yajl_bs_set(hand->stateStack, yajl_state_parse_error); + hand->parseError = "premature EOF"; + return yajl_status_error; + } + return yajl_status_ok; + } +} + +yajl_status +yajl_do_parse(yajl_handle hand, const unsigned char * jsonText, + size_t jsonTextLen) +{ + yajl_tok tok; + const unsigned char * buf; + size_t bufLen; + size_t * offset = &(hand->bytesConsumed); + + *offset = 0; + + around_again: + switch (yajl_bs_current(hand->stateStack)) { + case yajl_state_parse_complete: + if (hand->flags & 
yajl_allow_multiple_values) { + yajl_bs_set(hand->stateStack, yajl_state_got_value); + goto around_again; + } + if (!(hand->flags & yajl_allow_trailing_garbage)) { + if (*offset != jsonTextLen) { + tok = yajl_lex_lex(hand->lexer, jsonText, jsonTextLen, + offset, &buf, &bufLen); + if (tok != yajl_tok_eof) { + yajl_bs_set(hand->stateStack, yajl_state_parse_error); + hand->parseError = "trailing garbage"; + } + goto around_again; + } + } + return yajl_status_ok; + case yajl_state_lexical_error: + case yajl_state_parse_error: + return yajl_status_error; + case yajl_state_start: + case yajl_state_got_value: + case yajl_state_map_need_val: + case yajl_state_array_need_val: + case yajl_state_array_start: { + /* for arrays and maps, we advance the state for this + * depth, then push the state of the next depth. + * If an error occurs during the parsing of the nesting + * enitity, the state at this level will not matter. + * a state that needs pushing will be anything other + * than state_start */ + + yajl_state stateToPush = yajl_state_start; + + tok = yajl_lex_lex(hand->lexer, jsonText, jsonTextLen, + offset, &buf, &bufLen); + + switch (tok) { + case yajl_tok_eof: + return yajl_status_ok; + case yajl_tok_error: + yajl_bs_set(hand->stateStack, yajl_state_lexical_error); + goto around_again; + case yajl_tok_string: + if (hand->callbacks && hand->callbacks->yajl_string) { + _CC_CHK(hand->callbacks->yajl_string(hand->ctx, + buf, bufLen)); + } + break; + case yajl_tok_string_with_escapes: + if (hand->callbacks && hand->callbacks->yajl_string) { + yajl_buf_clear(hand->decodeBuf); + yajl_string_decode(hand->decodeBuf, buf, bufLen); + _CC_CHK(hand->callbacks->yajl_string( + hand->ctx, yajl_buf_data(hand->decodeBuf), + yajl_buf_len(hand->decodeBuf))); + } + break; + case yajl_tok_bool: + if (hand->callbacks && hand->callbacks->yajl_boolean) { + _CC_CHK(hand->callbacks->yajl_boolean(hand->ctx, + *buf == 't')); + } + break; + case yajl_tok_null: + if (hand->callbacks && 
hand->callbacks->yajl_null) { + _CC_CHK(hand->callbacks->yajl_null(hand->ctx)); + } + break; + case yajl_tok_left_bracket: + if (hand->callbacks && hand->callbacks->yajl_start_map) { + _CC_CHK(hand->callbacks->yajl_start_map(hand->ctx)); + } + stateToPush = yajl_state_map_start; + break; + case yajl_tok_left_brace: + if (hand->callbacks && hand->callbacks->yajl_start_array) { + _CC_CHK(hand->callbacks->yajl_start_array(hand->ctx)); + } + stateToPush = yajl_state_array_start; + break; + case yajl_tok_integer: + if (hand->callbacks) { + if (hand->callbacks->yajl_number) { + _CC_CHK(hand->callbacks->yajl_number( + hand->ctx,(const char *) buf, bufLen)); + } else if (hand->callbacks->yajl_integer) { + long long int i = 0; + i = yajl_parse_integer(buf, bufLen); + if ((i == LLONG_MIN || i == LLONG_MAX) && + errno == ERANGE) + { + yajl_bs_set(hand->stateStack, + yajl_state_parse_error); + hand->parseError = "integer overflow" ; + /* try to restore error offset */ + if (*offset >= bufLen) *offset -= bufLen; + else *offset = 0; + goto around_again; + } + _CC_CHK(hand->callbacks->yajl_integer(hand->ctx, + i)); + } + } + break; + case yajl_tok_double: + if (hand->callbacks) { + if (hand->callbacks->yajl_number) { + _CC_CHK(hand->callbacks->yajl_number( + hand->ctx, (const char *) buf, bufLen)); + } else if (hand->callbacks->yajl_double) { + double d = 0.0; + yajl_buf_clear(hand->decodeBuf); + yajl_buf_append(hand->decodeBuf, buf, bufLen); + buf = yajl_buf_data(hand->decodeBuf); + d = strtod((char *) buf, NULL); + if ((d == HUGE_VAL || d == -HUGE_VAL) && + errno == ERANGE) + { + yajl_bs_set(hand->stateStack, + yajl_state_parse_error); + hand->parseError = "numeric (floating point) " + "overflow"; + /* try to restore error offset */ + if (*offset >= bufLen) *offset -= bufLen; + else *offset = 0; + goto around_again; + } + _CC_CHK(hand->callbacks->yajl_double(hand->ctx, + d)); + } + } + break; + case yajl_tok_right_brace: { + if (yajl_bs_current(hand->stateStack) == + 
yajl_state_array_start) + { + if (hand->callbacks && + hand->callbacks->yajl_end_array) + { + _CC_CHK(hand->callbacks->yajl_end_array(hand->ctx)); + } + yajl_bs_pop(hand->stateStack); + goto around_again; + } + /* intentional fall-through */ + } + case yajl_tok_colon: + case yajl_tok_comma: + case yajl_tok_right_bracket: + yajl_bs_set(hand->stateStack, yajl_state_parse_error); + hand->parseError = + "unallowed token at this point in JSON text"; + goto around_again; + default: + yajl_bs_set(hand->stateStack, yajl_state_parse_error); + hand->parseError = "invalid token, internal error"; + goto around_again; + } + /* got a value. transition depends on the state we're in. */ + { + yajl_state s = yajl_bs_current(hand->stateStack); + if (s == yajl_state_start || s == yajl_state_got_value) { + yajl_bs_set(hand->stateStack, yajl_state_parse_complete); + } else if (s == yajl_state_map_need_val) { + yajl_bs_set(hand->stateStack, yajl_state_map_got_val); + } else { + yajl_bs_set(hand->stateStack, yajl_state_array_got_val); + } + } + if (stateToPush != yajl_state_start) { + yajl_bs_push(hand->stateStack, stateToPush); + } + + goto around_again; + } + case yajl_state_map_start: + case yajl_state_map_need_key: { + /* only difference between these two states is that in + * start '}' is valid, whereas in need_key, we've parsed + * a comma, and a string key _must_ follow */ + tok = yajl_lex_lex(hand->lexer, jsonText, jsonTextLen, + offset, &buf, &bufLen); + switch (tok) { + case yajl_tok_eof: + return yajl_status_ok; + case yajl_tok_error: + yajl_bs_set(hand->stateStack, yajl_state_lexical_error); + goto around_again; + case yajl_tok_string_with_escapes: + if (hand->callbacks && hand->callbacks->yajl_map_key) { + yajl_buf_clear(hand->decodeBuf); + yajl_string_decode(hand->decodeBuf, buf, bufLen); + buf = yajl_buf_data(hand->decodeBuf); + bufLen = yajl_buf_len(hand->decodeBuf); + } + /* intentional fall-through */ + case yajl_tok_string: + if (hand->callbacks && 
hand->callbacks->yajl_map_key) { + _CC_CHK(hand->callbacks->yajl_map_key(hand->ctx, buf, + bufLen)); + } + yajl_bs_set(hand->stateStack, yajl_state_map_sep); + goto around_again; + case yajl_tok_right_bracket: + if (yajl_bs_current(hand->stateStack) == + yajl_state_map_start) + { + if (hand->callbacks && hand->callbacks->yajl_end_map) { + _CC_CHK(hand->callbacks->yajl_end_map(hand->ctx)); + } + yajl_bs_pop(hand->stateStack); + goto around_again; + } + default: + yajl_bs_set(hand->stateStack, yajl_state_parse_error); + hand->parseError = + "invalid object key (must be a string)"; + goto around_again; + } + } + case yajl_state_map_sep: { + tok = yajl_lex_lex(hand->lexer, jsonText, jsonTextLen, + offset, &buf, &bufLen); + switch (tok) { + case yajl_tok_colon: + yajl_bs_set(hand->stateStack, yajl_state_map_need_val); + goto around_again; + case yajl_tok_eof: + return yajl_status_ok; + case yajl_tok_error: + yajl_bs_set(hand->stateStack, yajl_state_lexical_error); + goto around_again; + default: + yajl_bs_set(hand->stateStack, yajl_state_parse_error); + hand->parseError = "object key and value must " + "be separated by a colon (':')"; + goto around_again; + } + } + case yajl_state_map_got_val: { + tok = yajl_lex_lex(hand->lexer, jsonText, jsonTextLen, + offset, &buf, &bufLen); + switch (tok) { + case yajl_tok_right_bracket: + if (hand->callbacks && hand->callbacks->yajl_end_map) { + _CC_CHK(hand->callbacks->yajl_end_map(hand->ctx)); + } + yajl_bs_pop(hand->stateStack); + goto around_again; + case yajl_tok_comma: + yajl_bs_set(hand->stateStack, yajl_state_map_need_key); + goto around_again; + case yajl_tok_eof: + return yajl_status_ok; + case yajl_tok_error: + yajl_bs_set(hand->stateStack, yajl_state_lexical_error); + goto around_again; + default: + yajl_bs_set(hand->stateStack, yajl_state_parse_error); + hand->parseError = "after key and value, inside map, " + "I expect ',' or '}'"; + /* try to restore error offset */ + if (*offset >= bufLen) *offset -= bufLen; + else 
*offset = 0; + goto around_again; + } + } + case yajl_state_array_got_val: { + tok = yajl_lex_lex(hand->lexer, jsonText, jsonTextLen, + offset, &buf, &bufLen); + switch (tok) { + case yajl_tok_right_brace: + if (hand->callbacks && hand->callbacks->yajl_end_array) { + _CC_CHK(hand->callbacks->yajl_end_array(hand->ctx)); + } + yajl_bs_pop(hand->stateStack); + goto around_again; + case yajl_tok_comma: + yajl_bs_set(hand->stateStack, yajl_state_array_need_val); + goto around_again; + case yajl_tok_eof: + return yajl_status_ok; + case yajl_tok_error: + yajl_bs_set(hand->stateStack, yajl_state_lexical_error); + goto around_again; + default: + yajl_bs_set(hand->stateStack, yajl_state_parse_error); + hand->parseError = + "after array element, I expect ',' or ']'"; + goto around_again; + } + } + } + + abort(); + return yajl_status_error; +} + diff --git a/xlators/cluster/nsr-server/src/yajl_parser.h b/xlators/cluster/nsr-server/src/yajl_parser.h new file mode 100644 index 000000000..53409731a --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl_parser.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2007-2011, Lloyd Hilaiel + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
/* Parser state shared by yajl_do_parse, yajl_do_finish and
 * yajl_render_error_string. */
struct yajl_handle_t {
    /* client callbacks; the parser NULL-checks this pointer and each
     * individual callback before invoking it */
    const yajl_callbacks * callbacks;
    /* opaque client pointer handed to every callback as its first argument */
    void * ctx;
    /* lexer that tokenizes the client buffers */
    yajl_lexer lexer;
    /* message for the most recent parse error; the parser assigns string
     * literals to it */
    const char * parseError;
    /* the number of bytes consumed from the last client buffer;
     * in the case of an error this can be used as the error offset */
    size_t bytesConsumed;
    /* temporary storage for decoded strings */
    yajl_buf decodeBuf;
    /* a stack of states.  access with the yajl_bs_XXX routines
     * (yajl_bs_current / yajl_bs_set / yajl_bs_push / yajl_bs_pop) */
    yajl_bytestack stateStack;
    /* memory allocation routines */
    yajl_alloc_funcs alloc;
    /* bitfield of parser options (tested against the yajl_allow_* values) */
    unsigned int flags;
};
*/ +long long +yajl_parse_integer(const unsigned char *number, unsigned int length); + + +#endif diff --git a/xlators/cluster/nsr-server/src/yajl_tree.c b/xlators/cluster/nsr-server/src/yajl_tree.c new file mode 100644 index 000000000..1a69134e7 --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl_tree.c @@ -0,0 +1,501 @@ +/* + * Copyright (c) 2010-2011 Florian Forster + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include "yajl/yajl_tree.h" +#include "yajl/yajl_parse.h" + +#include "yajl_parser.h" + +#ifdef WIN32 +#define snprintf sprintf_s +#endif + +#define STATUS_CONTINUE 1 +#define STATUS_ABORT 0 + +struct stack_elem_s; +typedef struct stack_elem_s stack_elem_t; +struct stack_elem_s +{ + char * key; + yajl_val value; + stack_elem_t *next; +}; + +struct context_s +{ + stack_elem_t *stack; + yajl_val root; + char *errbuf; + size_t errbuf_size; +}; +typedef struct context_s context_t; + +#define RETURN_ERROR(ctx,retval,...) 
{ \ + if ((ctx)->errbuf != NULL) \ + snprintf ((ctx)->errbuf, (ctx)->errbuf_size, __VA_ARGS__); \ + return (retval); \ + } + +static yajl_val value_alloc (yajl_type type) +{ + yajl_val v; + + v = malloc (sizeof (*v)); + if (v == NULL) return (NULL); + memset (v, 0, sizeof (*v)); + v->type = type; + + return (v); +} + +static void yajl_object_free (yajl_val v) +{ + size_t i; + + if (!YAJL_IS_OBJECT(v)) return; + + for (i = 0; i < v->u.object.len; i++) + { + free((char *) v->u.object.keys[i]); + v->u.object.keys[i] = NULL; + yajl_tree_free (v->u.object.values[i]); + v->u.object.values[i] = NULL; + } + + free((void*) v->u.object.keys); + free(v->u.object.values); + free(v); +} + +static void yajl_array_free (yajl_val v) +{ + size_t i; + + if (!YAJL_IS_ARRAY(v)) return; + + for (i = 0; i < v->u.array.len; i++) + { + yajl_tree_free (v->u.array.values[i]); + v->u.array.values[i] = NULL; + } + + free(v->u.array.values); + free(v); +} + +/* + * Parsing nested objects and arrays is implemented using a stack. When a new + * object or array starts (a curly or a square opening bracket is read), an + * appropriate value is pushed on the stack. When the end of the object is + * reached (an appropriate closing bracket has been read), the value is popped + * off the stack and added to the enclosing object using "context_add_value". 
+ */ +static int context_push(context_t *ctx, yajl_val v) +{ + stack_elem_t *stack; + + stack = malloc (sizeof (*stack)); + if (stack == NULL) + RETURN_ERROR (ctx, ENOMEM, "Out of memory"); + memset (stack, 0, sizeof (*stack)); + + assert ((ctx->stack == NULL) + || YAJL_IS_OBJECT (v) + || YAJL_IS_ARRAY (v)); + + stack->value = v; + stack->next = ctx->stack; + ctx->stack = stack; + + return (0); +} + +static yajl_val context_pop(context_t *ctx) +{ + stack_elem_t *stack; + yajl_val v; + + if (ctx->stack == NULL) + RETURN_ERROR (ctx, NULL, "context_pop: " + "Bottom of stack reached prematurely"); + + stack = ctx->stack; + ctx->stack = stack->next; + + v = stack->value; + + free (stack); + + return (v); +} + +static int object_add_keyval(context_t *ctx, + yajl_val obj, char *key, yajl_val value) +{ + const char **tmpk; + yajl_val *tmpv; + + /* We're checking for NULL in "context_add_value" or its callers. */ + assert (ctx != NULL); + assert (obj != NULL); + assert (key != NULL); + assert (value != NULL); + + /* We're assuring that "obj" is an object in "context_add_value". */ + assert(YAJL_IS_OBJECT(obj)); + + tmpk = realloc((void *) obj->u.object.keys, sizeof(*(obj->u.object.keys)) * (obj->u.object.len + 1)); + if (tmpk == NULL) + RETURN_ERROR(ctx, ENOMEM, "Out of memory"); + obj->u.object.keys = tmpk; + + tmpv = realloc(obj->u.object.values, sizeof (*obj->u.object.values) * (obj->u.object.len + 1)); + if (tmpv == NULL) + RETURN_ERROR(ctx, ENOMEM, "Out of memory"); + obj->u.object.values = tmpv; + + obj->u.object.keys[obj->u.object.len] = key; + obj->u.object.values[obj->u.object.len] = value; + obj->u.object.len++; + + return (0); +} + +static int array_add_value (context_t *ctx, + yajl_val array, yajl_val value) +{ + yajl_val *tmp; + + /* We're checking for NULL pointers in "context_add_value" or its + * callers. */ + assert (ctx != NULL); + assert (array != NULL); + assert (value != NULL); + + /* "context_add_value" will only call us with array values. 
*/ + assert(YAJL_IS_ARRAY(array)); + + tmp = realloc(array->u.array.values, + sizeof(*(array->u.array.values)) * (array->u.array.len + 1)); + if (tmp == NULL) + RETURN_ERROR(ctx, ENOMEM, "Out of memory"); + array->u.array.values = tmp; + array->u.array.values[array->u.array.len] = value; + array->u.array.len++; + + return 0; +} + +/* + * Add a value to the value on top of the stack or the "root" member in the + * context if the end of the parsing process is reached. + */ +static int context_add_value (context_t *ctx, yajl_val v) +{ + /* We're checking for NULL values in all the calling functions. */ + assert (ctx != NULL); + assert (v != NULL); + + /* + * There are three valid states in which this function may be called: + * - There is no value on the stack => This is the only value. This is the + * last step done when parsing a document. We assign the value to the + * "root" member and return. + * - The value on the stack is an object. In this case store the key on the + * stack or, if the key has already been read, add key and value to the + * object. + * - The value on the stack is an array. In this case simply add the value + * and return. 
+ */ + if (ctx->stack == NULL) + { + assert (ctx->root == NULL); + ctx->root = v; + return (0); + } + else if (YAJL_IS_OBJECT (ctx->stack->value)) + { + if (ctx->stack->key == NULL) + { + if (!YAJL_IS_STRING (v)) + RETURN_ERROR (ctx, EINVAL, "context_add_value: " + "Object key is not a string (%#04x)", + v->type); + + ctx->stack->key = v->u.string; + v->u.string = NULL; + free(v); + return (0); + } + else /* if (ctx->key != NULL) */ + { + char * key; + + key = ctx->stack->key; + ctx->stack->key = NULL; + return (object_add_keyval (ctx, ctx->stack->value, key, v)); + } + } + else if (YAJL_IS_ARRAY (ctx->stack->value)) + { + return (array_add_value (ctx, ctx->stack->value, v)); + } + else + { + RETURN_ERROR (ctx, EINVAL, "context_add_value: Cannot add value to " + "a value of type %#04x (not a composite type)", + ctx->stack->value->type); + } +} + +static int handle_string (void *ctx, + const unsigned char *string, size_t string_length) +{ + yajl_val v; + + v = value_alloc (yajl_t_string); + if (v == NULL) + RETURN_ERROR ((context_t *) ctx, STATUS_ABORT, "Out of memory"); + + v->u.string = malloc (string_length + 1); + if (v->u.string == NULL) + { + free (v); + RETURN_ERROR ((context_t *) ctx, STATUS_ABORT, "Out of memory"); + } + memcpy(v->u.string, string, string_length); + v->u.string[string_length] = 0; + + return ((context_add_value (ctx, v) == 0) ? 
STATUS_CONTINUE : STATUS_ABORT); +} + +static int handle_number (void *ctx, const char *string, size_t string_length) +{ + yajl_val v; + char *endptr; + + v = value_alloc(yajl_t_number); + if (v == NULL) + RETURN_ERROR((context_t *) ctx, STATUS_ABORT, "Out of memory"); + + v->u.number.r = malloc(string_length + 1); + if (v->u.number.r == NULL) + { + free(v); + RETURN_ERROR((context_t *) ctx, STATUS_ABORT, "Out of memory"); + } + memcpy(v->u.number.r, string, string_length); + v->u.number.r[string_length] = 0; + + v->u.number.flags = 0; + + endptr = NULL; + errno = 0; + v->u.number.i = yajl_parse_integer((const unsigned char *) v->u.number.r, + strlen(v->u.number.r)); + if ((errno == 0) && (endptr != NULL) && (*endptr == 0)) + v->u.number.flags |= YAJL_NUMBER_INT_VALID; + + endptr = NULL; + errno = 0; + v->u.number.d = strtod(v->u.number.r, &endptr); + if ((errno == 0) && (endptr != NULL) && (*endptr == 0)) + v->u.number.flags |= YAJL_NUMBER_DOUBLE_VALID; + + return ((context_add_value(ctx, v) == 0) ? STATUS_CONTINUE : STATUS_ABORT); +} + +static int handle_start_map (void *ctx) +{ + yajl_val v; + + v = value_alloc(yajl_t_object); + if (v == NULL) + RETURN_ERROR ((context_t *) ctx, STATUS_ABORT, "Out of memory"); + + v->u.object.keys = NULL; + v->u.object.values = NULL; + v->u.object.len = 0; + + return ((context_push (ctx, v) == 0) ? STATUS_CONTINUE : STATUS_ABORT); +} + +static int handle_end_map (void *ctx) +{ + yajl_val v; + + v = context_pop (ctx); + if (v == NULL) + return (STATUS_ABORT); + + return ((context_add_value (ctx, v) == 0) ? STATUS_CONTINUE : STATUS_ABORT); +} + +static int handle_start_array (void *ctx) +{ + yajl_val v; + + v = value_alloc(yajl_t_array); + if (v == NULL) + RETURN_ERROR ((context_t *) ctx, STATUS_ABORT, "Out of memory"); + + v->u.array.values = NULL; + v->u.array.len = 0; + + return ((context_push (ctx, v) == 0) ? 
STATUS_CONTINUE : STATUS_ABORT); +} + +static int handle_end_array (void *ctx) +{ + yajl_val v; + + v = context_pop (ctx); + if (v == NULL) + return (STATUS_ABORT); + + return ((context_add_value (ctx, v) == 0) ? STATUS_CONTINUE : STATUS_ABORT); +} + +static int handle_boolean (void *ctx, int boolean_value) +{ + yajl_val v; + + v = value_alloc (boolean_value ? yajl_t_true : yajl_t_false); + if (v == NULL) + RETURN_ERROR ((context_t *) ctx, STATUS_ABORT, "Out of memory"); + + return ((context_add_value (ctx, v) == 0) ? STATUS_CONTINUE : STATUS_ABORT); +} + +static int handle_null (void *ctx) +{ + yajl_val v; + + v = value_alloc (yajl_t_null); + if (v == NULL) + RETURN_ERROR ((context_t *) ctx, STATUS_ABORT, "Out of memory"); + + return ((context_add_value (ctx, v) == 0) ? STATUS_CONTINUE : STATUS_ABORT); +} + +/* + * Public functions + */ +yajl_val yajl_tree_parse (const char *input, + char *error_buffer, size_t error_buffer_size) +{ + static const yajl_callbacks callbacks = + { + /* null = */ handle_null, + /* boolean = */ handle_boolean, + /* integer = */ NULL, + /* double = */ NULL, + /* number = */ handle_number, + /* string = */ handle_string, + /* start map = */ handle_start_map, + /* map key = */ handle_string, + /* end map = */ handle_end_map, + /* start array = */ handle_start_array, + /* end array = */ handle_end_array + }; + + yajl_handle handle; + yajl_status status; + context_t ctx = { NULL, NULL, NULL, 0 }; + + ctx.errbuf = error_buffer; + ctx.errbuf_size = error_buffer_size; + + if (error_buffer != NULL) + memset (error_buffer, 0, error_buffer_size); + + handle = yajl_alloc (&callbacks, NULL, &ctx); + yajl_config(handle, yajl_allow_comments, 1); + + status = yajl_parse(handle, + (unsigned char *) input, + strlen (input)); + status = yajl_complete_parse (handle); + if (status != yajl_status_ok) { + if (error_buffer != NULL && error_buffer_size > 0) { + snprintf( + error_buffer, error_buffer_size, "%s", + (char *) yajl_get_error(handle, 1, + (const 
unsigned char *) input, + strlen(input))); + } + yajl_free (handle); + return NULL; + } + + yajl_free (handle); + return (ctx.root); +} + +yajl_val yajl_tree_get(yajl_val n, const char ** path, yajl_type type) +{ + if (!path) return NULL; + while (n && *path) { + unsigned int i; + + if (n->type != yajl_t_object) return NULL; + for (i = 0; i < n->u.object.len; i++) { + if (!strcmp(*path, n->u.object.keys[i])) { + n = n->u.object.values[i]; + break; + } + } + if (i == n->u.object.len) return NULL; + path++; + } + if (n && type != yajl_t_any && type != n->type) n = NULL; + return n; +} + +void yajl_tree_free (yajl_val v) +{ + if (v == NULL) return; + + if (YAJL_IS_STRING(v)) + { + free(v->u.string); + free(v); + } + else if (YAJL_IS_NUMBER(v)) + { + free(v->u.number.r); + free(v); + } + else if (YAJL_GET_OBJECT(v)) + { + yajl_object_free(v); + } + else if (YAJL_GET_ARRAY(v)) + { + yajl_array_free(v); + } + else /* if (yajl_t_true or yajl_t_false or yajl_t_null) */ + { + free(v); + } +} diff --git a/xlators/cluster/nsr-server/src/yajl_version.c b/xlators/cluster/nsr-server/src/yajl_version.c new file mode 100644 index 000000000..0671da722 --- /dev/null +++ b/xlators/cluster/nsr-server/src/yajl_version.c @@ -0,0 +1,7 @@ +#include + +int yajl_version(void) +{ + return YAJL_VERSION; +} + diff --git a/xlators/features/changelog/lib/src/gf-changelog-helpers.h b/xlators/features/changelog/lib/src/gf-changelog-helpers.h index 3aa6ed7b8..f35220ccb 100644 --- a/xlators/features/changelog/lib/src/gf-changelog-helpers.h +++ b/xlators/features/changelog/lib/src/gf-changelog-helpers.h @@ -94,4 +94,5 @@ gf_ftruncate (int fd, off_t length); off_t gf_lseek (int fd, off_t offset, int whence); + #endif diff --git a/xlators/features/changelog/src/Makefile.am b/xlators/features/changelog/src/Makefile.am index e85031ad4..f8beba430 100644 --- a/xlators/features/changelog/src/Makefile.am +++ b/xlators/features/changelog/src/Makefile.am @@ -3,15 +3,17 @@ xlator_LTLIBRARIES = changelog.la 
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features noinst_HEADERS = changelog-helpers.h changelog-mem-types.h changelog-rt.h \ - changelog-misc.h changelog-encoders.h changelog-notifier.h + changelog-misc.h changelog-encoders.h changelog-notifier.h \ + changelog-fops.h policy/changelog-policy.h changelog_la_LDFLAGS = -module -avoidversion changelog_la_SOURCES = changelog.c changelog-rt.c changelog-helpers.c \ - changelog-encoders.c changelog-notifier.c + changelog-encoders.c changelog-notifier.c changelog-default-fops.c \ + policy/changelog-policy-default.c policy/changelog-policy-replication.c changelog_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src -fPIC -D_FILE_OFFSET_BITS=64 \ +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src -Ipolicy/ -fPIC -D_FILE_OFFSET_BITS=64 \ -D_GNU_SOURCE -D$(GF_HOST_OS) -shared -nostartfiles -DDATADIR=\"$(localstatedir)\" AM_CFLAGS = -Wall $(GF_CFLAGS) diff --git a/xlators/features/changelog/src/changelog-default-fops.c b/xlators/features/changelog/src/changelog-default-fops.c new file mode 100644 index 000000000..59749905e --- /dev/null +++ b/xlators/features/changelog/src/changelog-default-fops.c @@ -0,0 +1,561 @@ +/* + Copyright (c) 2013 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
/**
 * default rmdir: prepare the changelog record for an rmdir fop.
 *
 * Initialises a changelog local keyed on the gfid of @loc's inode with
 * room for two extra records, fills them with the fop number and the
 * entry built from @loc->pargfid and @loc->name, and stores the local in
 * frame->local.
 *
 * @xflags and @xdata are not used here.
 *
 * Returns 0 on success; on failure returns non-zero after handing
 * whatever local was set up to changelog_local_cleanup().
 */
int32_t
changelog_default_rmdir (call_frame_t *frame, xlator_t *this,
                         loc_t *loc, int xflags, dict_t *xdata)
{
        int                ret      = -1;
        size_t             xtra_len = 0;
        changelog_opt_t   *co       = NULL;
        changelog_local_t *local    = NULL;

        /* 2 == fop + entry */
        CHANGELOG_INIT_NOCHECK (this, local, NULL, loc->inode->gfid, 2);
        if (!local)
                goto out;

        co = changelog_get_usable_buffer (local);
        if (!co)
                goto out;

        /* first extra record: the fop number */
        CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);

        /* second extra record: the entry being removed */
        co++;
        CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name,
                              entry_fn, entry_free_fn, xtra_len, out);

        changelog_set_usable_record_and_length (local, xtra_len, 2);

        /* publish the local only once it is completely filled in */
        frame->local = local;
        ret = 0;

 out:
        if (ret)
                changelog_local_cleanup (this, local);
        return ret;
}
oldloc->inode->gfid, 3); + if (!local) + goto out; + + co = changelog_get_usable_buffer (local); + if (!co) + goto out; + + CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); + + co++; + CHANGELOG_FILL_ENTRY (co, oldloc->pargfid, oldloc->name, + entry_fn, entry_free_fn, xtra_len, out); + + co++; + CHANGELOG_FILL_ENTRY (co, newloc->pargfid, newloc->name, + entry_fn, entry_free_fn, xtra_len, out); + + changelog_set_usable_record_and_length (local, xtra_len, 3); + + frame->local = local; + ret = 0; + + out: + if (ret) + changelog_local_cleanup (this, local); + return ret; +} + +/* default link */ +int32_t +changelog_default_link (call_frame_t *frame, + xlator_t *this, loc_t *oldloc, + loc_t *newloc, dict_t *xdata) +{ + int ret = 1; + size_t xtra_len = 0; + changelog_opt_t *co = NULL; + changelog_local_t *local = NULL; + + CHANGELOG_INIT_NOCHECK (this, local, NULL, oldloc->gfid, 2); + if (!local) + goto out; + + co = changelog_get_usable_buffer (local); + if (!co) + goto out; + + CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); + + co++; + CHANGELOG_FILL_ENTRY (co, newloc->pargfid, newloc->name, + entry_fn, entry_free_fn, xtra_len, out); + + changelog_set_usable_record_and_length (local, xtra_len, 2); + + frame->local = local; + ret = 0; + + out: + if (ret) + changelog_local_cleanup (this, local); + return ret; +} + +/* default mknid */ +int32_t +changelog_default_mkdir (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata) +{ + int ret = -1; + uuid_t gfid = {0,}; + void *uuid_req = NULL; + size_t xtra_len = 0; + changelog_opt_t *co = NULL; + changelog_local_t *local = NULL; + + ret = dict_get_ptr (xdata, "gfid-req", &uuid_req); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, + "failed to get gfid from dict"); + goto out; + } + uuid_copy (gfid, uuid_req); + + ret = -1; + CHANGELOG_INIT_NOCHECK (this, local, NULL, gfid, 2); + if (!local) + goto out; + + co = changelog_get_usable_buffer (local); + 
if (!co) + goto out; + + CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); + + co++; + CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name, + entry_fn, entry_free_fn, xtra_len, out); + + changelog_set_usable_record_and_length (local, xtra_len, 2); + + frame->local = local; + ret = 0; + + out: + if (ret) + changelog_local_cleanup (this, local); + return ret; +} + +/* default symlink */ +int32_t +changelog_default_symlink (call_frame_t *frame, xlator_t *this, + const char *linkname, loc_t *loc, + mode_t umask, dict_t *xdata) +{ + int ret = -1; + size_t xtra_len = 0; + uuid_t gfid = {0,}; + void *uuid_req = NULL; + changelog_opt_t *co = NULL; + changelog_local_t *local = NULL; + + ret = dict_get_ptr (xdata, "gfid-req", &uuid_req); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, + "failed to get gfid from dict"); + goto out; + } + uuid_copy (gfid, uuid_req); + + ret = -1; + CHANGELOG_INIT_NOCHECK (this, local, NULL, gfid, 2); + if (!local) + goto out; + + co = changelog_get_usable_buffer (local); + if (!co) + goto out; + + CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); + + co++; + CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name, + entry_fn, entry_free_fn, xtra_len, out); + + changelog_set_usable_record_and_length (local, xtra_len, 2); + + frame->local = local; + ret = 0; + + out: + if (ret) + changelog_local_cleanup (this, local); + return ret; +} + +/* default mknod */ +int32_t +changelog_default_mknod (call_frame_t *frame, + xlator_t *this, loc_t *loc, + mode_t mode, dev_t dev, mode_t umask, dict_t *xdata) +{ + int ret = -1; + uuid_t gfid = {0,}; + void *uuid_req = NULL; + size_t xtra_len = 0; + changelog_opt_t *co = NULL; + changelog_local_t *local = NULL; + + ret = dict_get_ptr (xdata, "gfid-req", &uuid_req); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, + "failed to get gfid from dict"); + goto out; + } + uuid_copy (gfid, uuid_req); + + ret = -1; + CHANGELOG_INIT_NOCHECK (this, local, NULL, gfid, 2); + + co = 
changelog_get_usable_buffer (frame->local); + if (!co) + goto out; + + CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); + + co++; + CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name, + entry_fn, entry_free_fn, xtra_len, out); + + changelog_set_usable_record_and_length (local, xtra_len, 2); + + frame->local = local; + ret = 0; + + out: + if (ret) + changelog_local_cleanup (this, local); + return ret; +} + +/* default create */ +int32_t +changelog_default_create (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, mode_t mode, + mode_t umask, fd_t *fd, dict_t *xdata) +{ + int ret = -1; + uuid_t gfid = {0,}; + void *uuid_req = NULL; + changelog_opt_t *co = NULL; + size_t xtra_len = 0; + changelog_local_t *local = NULL; + + ret = dict_get_ptr (xdata, "gfid-req", &uuid_req); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, + "failed to get gfid from dict"); + goto out; + } + uuid_copy (gfid, uuid_req); + + /* init with two extra records */ + ret = -1; + CHANGELOG_INIT_NOCHECK (this, local, NULL, gfid, 2); + if (!local) + goto out; + + co = changelog_get_usable_buffer (local); + if (!co) + goto out; + + CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); + + co++; + CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name, + entry_fn, entry_free_fn, xtra_len, out); + + changelog_set_usable_record_and_length (local, xtra_len, 2); + + frame->local = local; + ret = 0; + + out: + if (ret) + changelog_local_cleanup (this, local); + return ret; +} + +/* default fsetattr */ +int32_t +changelog_default_fsetattr (call_frame_t *frame, + xlator_t *this, fd_t *fd, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + CHANGELOG_INIT (this, frame->local, + fd->inode, fd->inode->gfid, 0); + return 0; +} + +/* default setattr */ +int32_t +changelog_default_setattr (call_frame_t *frame, + xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + CHANGELOG_INIT (this, frame->local, + loc->inode, loc->inode->gfid, 0); + 
return 0; +} + +/* default fremovexattr */ +int32_t +changelog_default_fremovexattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *name, dict_t *xdata) +{ + CHANGELOG_INIT (this, frame->local, + fd->inode, fd->inode->gfid, 0); + return 0; +} + +/* default removexattr */ +int32_t +changelog_default_removexattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name, dict_t *xdata) +{ + CHANGELOG_INIT (this, frame->local, + loc->inode, loc->inode->gfid, 0); + return 0; +} + +/* default setxattr */ +int32_t +changelog_default_setxattr (call_frame_t *frame, + xlator_t *this, loc_t *loc, + dict_t *dict, int32_t flags, dict_t *xdata) +{ + CHANGELOG_INIT (this, frame->local, + loc->inode, loc->inode->gfid, 0); + return 0; +} + +/* default fsetxattr */ +int32_t +changelog_default_fsetxattr (call_frame_t *frame, + xlator_t *this, fd_t *fd, dict_t *dict, + int32_t flags, dict_t *xdata) +{ + CHANGELOG_INIT (this, frame->local, + fd->inode, fd->inode->gfid, 0); + return 0; +} + +/* default truncate */ +int32_t +changelog_default_truncate (call_frame_t *frame, + xlator_t *this, loc_t *loc, + off_t offset, dict_t *xdata) +{ + CHANGELOG_INIT (this, frame->local, + loc->inode, loc->inode->gfid, 0); + return 0; +} + +/* default ftruncate */ +int32_t +changelog_default_ftruncate (call_frame_t *frame, + xlator_t *this, fd_t *fd, + off_t offset, dict_t *xdata) +{ + CHANGELOG_INIT (this, frame->local, + fd->inode, fd->inode->gfid, 0); + return 0; +} + +/* default writev */ +int32_t +changelog_default_writev (call_frame_t *frame, + xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t offset, uint32_t flags, + struct iobref *iobref, dict_t *xdata) +{ + CHANGELOG_INIT (this, frame->local, + fd->inode, fd->inode->gfid, 0); + return 0; +} + +/** COPS */ + +int +changelog_default_cops_open (xlator_t *this, + changelog_priv_t *priv, + void *cpriv, char *name, gf_boolean_t last) +{ + changelog_log_data_t cld = {0,}; + changelog_rollover_data_t *crd 
= NULL; + struct timeval tv = {0,}; + + crd = &cld.cld_roll; + + cld.cld_type = CHANGELOG_TYPE_ROLLOVER; + + if (gettimeofday (&tv, NULL)) + return -1; + + crd->crd_prealloc_size = 0; /* no preallocation */ + crd->crd_finale = last; + crd->crd_use_suffix = _gf_true; + crd->crd_roll_key = (unsigned long) tv.tv_sec; + + (void) strcpy (crd->crd_changelog_name, name); + (void) strcpy (crd->crd_changelog_oname, name); + + /* inject a roll-over event */ + return changelog_inject_single_event (this, priv, NULL, &cld); +} + +int +changelog_default_cops_rollover (xlator_t *this, + changelog_priv_t *priv, void *cpriv, + char *name, gf_boolean_t last) +{ + return changelog_default_cops_open (this, priv, cpriv, name, last); +} + +int +changelog_default_cops_sync (xlator_t *this, + changelog_priv_t *priv, void *cpriv) +{ + changelog_log_data_t cld = {0,}; + + cld.cld_type = CHANGELOG_TYPE_FSYNC; + return changelog_inject_single_event (this, priv, NULL, &cld); +} + +/** + * write to the changelog: @changelog_update() implements inode version + * checking and all other stuffs... 
+ */ +int +changelog_default_cops_write (xlator_t *this, + changelog_priv_t *priv, void *cpriv, + changelog_local_t *local, changelog_log_type type) +{ + changelog_update (this, priv, local, type); + return 0; +} + +off_t +changelog_default_cops_get_offset (xlator_t *this, + changelog_priv_t *priv, void *cpriv, + changelog_local_t *local) +{ + return *(off_t *)cpriv; +} + +void +changelog_default_cops_set_offset (xlator_t *this, + changelog_priv_t *priv, void *cpriv, + changelog_local_t *local, off_t bytes) +{ + *(off_t *)cpriv += bytes; +} + +void +changelog_default_cops_reset_offset (xlator_t *this, changelog_priv_t *priv, + void *cpriv, changelog_local_t *local) +{ + *(off_t *)cpriv = 0; +} + +/** + * roll-over takes care of close and open + */ +int +changelog_default_cops_close (xlator_t *this, + changelog_priv_t *priv, void *cpriv) +{ + errno = ENOTSUP; + return -1; +} + +int +changelog_default_cops_read (xlator_t *this, + changelog_priv_t *priv, void *cpriv, char *buffer) +{ + errno = ENOTSUP; + return -1; +} + +/** + * no purging of changelogs + */ +int +changelog_default_cops_unlink (xlator_t *this, + changelog_priv_t *priv, void *cpriv, char *name) +{ + errno = ENOTSUP; + return -1; +} diff --git a/xlators/features/changelog/src/changelog-encoders.c b/xlators/features/changelog/src/changelog-encoders.c index 553eec85c..8d45ee1ff 100644 --- a/xlators/features/changelog/src/changelog-encoders.c +++ b/xlators/features/changelog/src/changelog-encoders.c @@ -72,7 +72,7 @@ entry_free_fn (void *data) */ static inline void -changelog_encode_write_xtra (changelog_log_data_t *cld, +changelog_encode_write_xtra (changelog_write_data_t *cwd, char *buffer, size_t *off, gf_boolean_t encode) { int i = 0; @@ -82,10 +82,11 @@ changelog_encode_write_xtra (changelog_log_data_t *cld, offset = *off; - co = (changelog_opt_t *) cld->cld_ptr; + co = (changelog_opt_t *) cwd->cwd_ptr; - for (; i < cld->cld_xtra_records; i++, co++) { - CHANGELOG_FILL_BUFFER (buffer, offset, "\0", 1); 
+ for (; i < cwd->cwd_xtra_records; i++, co++) { + if (i) + CHANGELOG_FILL_BUFFER (buffer, offset, "\0", 1); switch (co->co_type) { case CHANGELOG_OPT_REC_FOP: @@ -94,6 +95,12 @@ changelog_encode_write_xtra (changelog_log_data_t *cld, case CHANGELOG_OPT_REC_ENTRY: data = &co->co_entry; break; + case CHANGELOG_OPT_REC_ULL: + data = &co->co_number; + break; + case CHANGELOG_OPT_REC_UUID: + data = &co->co_uuid; + break; } if (co->co_convert) @@ -108,69 +115,59 @@ changelog_encode_write_xtra (changelog_log_data_t *cld, } int -changelog_encode_ascii (xlator_t *this, changelog_log_data_t *cld) +changelog_encode_ascii (xlator_t *this, + changelog_local_t *local, changelog_log_data_t *cld) { - size_t off = 0; - size_t gfid_len = 0; - char *gfid_str = NULL; - char *buffer = NULL; - changelog_priv_t *priv = NULL; + size_t off = 0; + size_t gfid_len = 0; + char *gfid_str = NULL; + char *buffer = NULL; + changelog_priv_t *priv = NULL; + changelog_write_data_t *cwd = NULL; priv = this->private; + cwd = &cld->cld_wdata; - gfid_str = uuid_utoa (cld->cld_gfid); + gfid_str = uuid_utoa (cwd->cwd_gfid); gfid_len = strlen (gfid_str); /* extra bytes for decorations */ - buffer = alloca (gfid_len + cld->cld_ptr_len + 10); - CHANGELOG_STORE_ASCII (priv, buffer, - off, gfid_str, gfid_len, cld); - - if (cld->cld_xtra_records) - changelog_encode_write_xtra (cld, buffer, &off, _gf_true); - - CHANGELOG_FILL_BUFFER (buffer, off, "\0", 1); + buffer = alloca (gfid_len + cwd->cwd_ptr_len + 100); + if (!priv->no_gfid_hdr) + CHANGELOG_STORE_ASCII (priv, buffer, + off, gfid_str, gfid_len, cld); + + if (cwd->cwd_xtra_records) { + changelog_encode_write_xtra (cwd, buffer, &off, _gf_true); + CHANGELOG_FILL_BUFFER (buffer, off, "\0", 1); + } - return changelog_write_change (priv, buffer, off); + return changelog_write_change (this, priv, + local, buffer, off); } int -changelog_encode_binary (xlator_t *this, changelog_log_data_t *cld) +changelog_encode_binary (xlator_t *this, + changelog_local_t *local, 
changelog_log_data_t *cld) { - size_t off = 0; - char *buffer = NULL; - changelog_priv_t *priv = NULL; + size_t off = 0; + char *buffer = NULL; + changelog_priv_t *priv = NULL; + changelog_write_data_t *cwd = NULL; priv = this->private; + cwd = &cld->cld_wdata; /* extra bytes for decorations */ - buffer = alloca (sizeof (uuid_t) + cld->cld_ptr_len + 10); - CHANGELOG_STORE_BINARY (priv, buffer, off, cld->cld_gfid, cld); - - if (cld->cld_xtra_records) - changelog_encode_write_xtra (cld, buffer, &off, _gf_false); + buffer = alloca (sizeof (uuid_t) + cwd->cwd_ptr_len + 100); + if (!priv->no_gfid_hdr) + CHANGELOG_STORE_BINARY (priv, buffer, off, cwd->cwd_gfid, cld); - CHANGELOG_FILL_BUFFER (buffer, off, "\0", 1); - - return changelog_write_change (priv, buffer, off); -} - -static struct changelog_encoder -cb_encoder[] = { - [CHANGELOG_ENCODE_BINARY] = - { - .encoder = CHANGELOG_ENCODE_BINARY, - .encode = changelog_encode_binary, - }, - [CHANGELOG_ENCODE_ASCII] = - { - .encoder = CHANGELOG_ENCODE_ASCII, - .encode = changelog_encode_ascii, - }, -}; + if (cwd->cwd_xtra_records) { + changelog_encode_write_xtra (cwd, buffer, &off, _gf_false); + CHANGELOG_FILL_BUFFER (buffer, off, "\0", 1); + } -void -changelog_encode_change( changelog_priv_t * priv) -{ - priv->ce = &cb_encoder[priv->encode_mode]; + return changelog_write_change (this, priv, + local, buffer, off); } diff --git a/xlators/features/changelog/src/changelog-encoders.h b/xlators/features/changelog/src/changelog-encoders.h index a3efbee05..2a96ba4dd 100644 --- a/xlators/features/changelog/src/changelog-encoders.h +++ b/xlators/features/changelog/src/changelog-encoders.h @@ -21,6 +21,7 @@ priv->maps[cld->cld_type], 1); \ CHANGELOG_FILL_BUFFER (buffer, \ off, gfid, gfid_len); \ + CHANGELOG_FILL_BUFFER (buffer, off, "\0", 1); \ } while (0) #define CHANGELOG_STORE_BINARY(priv, buf, off, gfid, cld) do { \ @@ -28,6 +29,7 @@ priv->maps[cld->cld_type], 1); \ CHANGELOG_FILL_BUFFER (buffer, \ off, gfid, sizeof (uuid_t)); \ + 
CHANGELOG_FILL_BUFFER (buffer, off, "\0", 1); \ } while (0) size_t @@ -37,10 +39,10 @@ fop_fn (void *data, char *buffer, gf_boolean_t encode); void entry_free_fn (void *data); int -changelog_encode_binary (xlator_t *, changelog_log_data_t *); +changelog_encode_binary (xlator_t *, + changelog_local_t *, changelog_log_data_t *); int -changelog_encode_ascii (xlator_t *, changelog_log_data_t *); -void -changelog_encode_change(changelog_priv_t *); +changelog_encode_ascii (xlator_t *, + changelog_local_t *, changelog_log_data_t *); #endif /* _CHANGELOG_ENCODERS_H */ diff --git a/xlators/features/changelog/src/changelog-fops.h b/xlators/features/changelog/src/changelog-fops.h new file mode 100644 index 000000000..597327be3 --- /dev/null +++ b/xlators/features/changelog/src/changelog-fops.h @@ -0,0 +1,157 @@ +/* + Copyright (c) 2013 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#ifndef _CHANGELOG_FOPS_H +#define _CHANGELOG_FOPS_H + +/* FOPS */ + +int32_t +changelog_default_rmdir (call_frame_t *frame, xlator_t *this, + loc_t *loc, int xflags, dict_t *xdata); +int32_t +changelog_default_unlink (call_frame_t *frame, xlator_t *this, + loc_t *loc, int xflags, dict_t *xdata); +int32_t +changelog_default_rename (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc, dict_t *xdata); +int32_t +changelog_default_link (call_frame_t *frame, + xlator_t *this, loc_t *oldloc, + loc_t *newloc, dict_t *xdata); +int32_t +changelog_default_mkdir (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata); +int32_t +changelog_default_symlink (call_frame_t *frame, xlator_t *this, + const char *linkname, loc_t *loc, + mode_t umask, dict_t *xdata); +int32_t +changelog_default_mknod (call_frame_t *frame, + xlator_t *this, loc_t *loc, + mode_t mode, dev_t dev, mode_t umask, dict_t *xdata); +int32_t +changelog_default_create (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, mode_t mode, + mode_t umask, fd_t *fd, dict_t *xdata); +int32_t +changelog_default_fsetattr (call_frame_t *frame, + xlator_t *this, fd_t *fd, + struct iatt *stbuf, int32_t valid, dict_t *xdata); +int32_t +changelog_default_setattr (call_frame_t *frame, + xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata); +int32_t +changelog_default_fremovexattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *name, dict_t *xdata); +int32_t +changelog_default_removexattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name, dict_t *xdata); +int32_t +changelog_default_setxattr (call_frame_t *frame, + xlator_t *this, loc_t *loc, + dict_t *dict, int32_t flags, dict_t *xdata); +int32_t +changelog_default_fsetxattr (call_frame_t *frame, + xlator_t *this, fd_t *fd, dict_t *dict, + int32_t flags, dict_t *xdata); +int32_t +changelog_default_truncate (call_frame_t *frame, + xlator_t *this, 
loc_t *loc, + off_t offset, dict_t *xdata); +int32_t +changelog_default_ftruncate (call_frame_t *frame, + xlator_t *this, fd_t *fd, + off_t offset, dict_t *xdata); +int32_t +changelog_default_writev (call_frame_t *frame, + xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t offset, uint32_t flags, + struct iobref *iobref, dict_t *xdata); + +/* COPS */ +int +changelog_default_cops_open (xlator_t *, changelog_priv_t *, + void *, char*, gf_boolean_t); + +int +changelog_default_cops_close (xlator_t *, changelog_priv_t *, void *); + +int +changelog_default_cops_sync (xlator_t *this, + changelog_priv_t *priv, void *); + +int +changelog_default_cops_rollover (xlator_t *, + changelog_priv_t *, void *, + char *, gf_boolean_t); +int +changelog_default_cops_write (xlator_t *, + changelog_priv_t *, void *, + changelog_local_t *, changelog_log_type); + +int +changelog_default_cops_read (xlator_t *, + changelog_priv_t *, void *, char *); + +int +changelog_default_cops_unlink (xlator_t *, + changelog_priv_t *, void *, char *); + +off_t +changelog_default_cops_get_offset (xlator_t *, + changelog_priv_t *, void *, + changelog_local_t *); + +void +changelog_default_cops_set_offset (xlator_t *, + changelog_priv_t *, void *, + changelog_local_t *, off_t ); + +void +changelog_default_cops_reset_offset (xlator_t *, changelog_priv_t *, + void *, changelog_local_t *); + + +GF_UNUSED static struct xlator_fops changelog_default_fops = { + .mknod = changelog_default_mknod, + .mkdir = changelog_default_mkdir, + .create = changelog_default_create, + .symlink = changelog_default_symlink, + .writev = changelog_default_writev, + .truncate = changelog_default_truncate, + .ftruncate = changelog_default_ftruncate, + .link = changelog_default_link, + .rename = changelog_default_rename, + .unlink = changelog_default_unlink, + .rmdir = changelog_default_rmdir, + .setattr = changelog_default_setattr, + .fsetattr = changelog_default_fsetattr, + .setxattr = changelog_default_setxattr, + 
.fsetxattr = changelog_default_fsetxattr, + .removexattr = changelog_default_removexattr, + .fremovexattr = changelog_default_fremovexattr, +}; + +GF_UNUSED static struct changelog_ops changelog_default_cops = { + .open = changelog_default_cops_open, + .sync = changelog_default_cops_sync, + .read = changelog_default_cops_read, + .close = changelog_default_cops_close, + .write = changelog_default_cops_write, + .unlink = changelog_default_cops_unlink, + .rollover = changelog_default_cops_rollover, + .get_offset = changelog_default_cops_get_offset, + .set_offset = changelog_default_cops_set_offset, + .reset_offset = changelog_default_cops_reset_offset, +}; + +#endif /* _CHANGELOG_FOPS_H */ diff --git a/xlators/features/changelog/src/changelog-helpers.c b/xlators/features/changelog/src/changelog-helpers.c index 7ab0091b5..ad4fe4013 100644 --- a/xlators/features/changelog/src/changelog-helpers.c +++ b/xlators/features/changelog/src/changelog-helpers.c @@ -21,7 +21,6 @@ #include "changelog-helpers.h" #include "changelog-mem-types.h" -#include "changelog-encoders.h" #include void @@ -53,48 +52,45 @@ changelog_thread_cleanup (xlator_t *this, pthread_t thr_id) inline void * changelog_get_usable_buffer (changelog_local_t *local) { - changelog_log_data_t *cld = NULL; + changelog_write_data_t *cwd = &local->cld.cld_wdata; - cld = &local->cld; - if (!cld->cld_iobuf) + if (!cwd->cwd_iobuf) return NULL; - return cld->cld_iobuf->ptr; + return cwd->cwd_ptr; } inline void changelog_set_usable_record_and_length (changelog_local_t *local, size_t len, int xr) { - changelog_log_data_t *cld = NULL; + changelog_write_data_t *cwd = &local->cld.cld_wdata; - cld = &local->cld; - - cld->cld_ptr_len = len; - cld->cld_xtra_records = xr; + cwd->cwd_ptr_len = len; + cwd->cwd_xtra_records = xr; } void changelog_local_cleanup (xlator_t *xl, changelog_local_t *local) { - int i = 0; - changelog_opt_t *co = NULL; - changelog_log_data_t *cld = NULL; + int i = 0; + changelog_opt_t *co = NULL; + 
changelog_write_data_t *cwd = NULL; if (!local) return; - cld = &local->cld; + cwd = &local->cld.cld_wdata; /* cleanup dynamic allocation for extra records */ - if (cld->cld_xtra_records) { - co = (changelog_opt_t *) cld->cld_ptr; - for (; i < cld->cld_xtra_records; i++, co++) + if (cwd->cwd_xtra_records) { + co = (changelog_opt_t *) cwd->cwd_ptr; + for (; i < cwd->cwd_xtra_records; i++, co++) if (co->co_free) co->co_free (co); } - CHANGELOG_IOBUF_UNREF (cld->cld_iobuf); + CHANGELOG_IOBUF_UNREF (cwd->cwd_iobuf); if (local->inode) inode_unref (local->inode); @@ -122,7 +118,8 @@ changelog_write (int fd, char *buffer, size_t len) static int changelog_rollover_changelog (xlator_t *this, - changelog_priv_t *priv, unsigned long ts) + changelog_priv_t *priv, + changelog_rollover_data_t *crd) { int ret = -1; int notify = 0; @@ -135,11 +132,22 @@ changelog_rollover_changelog (xlator_t *this, priv->changelog_fd = -1; } + /** + * no rolling-over of changelogs, policy implementer choose + * to do the heavy-lifting of having distinct changelog name. + * + * NOTE: This implies libgfchangelog would not be notified + (well, we could, but lets not do that now...) 
+ */ + if (!crd->crd_use_suffix) + return 0; + (void) snprintf (ofile, PATH_MAX, - "%s/"CHANGELOG_FILE_NAME, priv->changelog_dir); - (void) snprintf (nfile, PATH_MAX, - "%s/"CHANGELOG_FILE_NAME".%lu", - priv->changelog_dir, ts); + "%s/%s", priv->changelog_dir, + crd->crd_changelog_oname); + (void) snprintf (nfile, PATH_MAX, "%s/%s.%lu", + priv->changelog_dir, + crd->crd_changelog_name, crd->crd_roll_key); ret = rename (ofile, nfile); if (!ret) @@ -171,7 +179,8 @@ changelog_rollover_changelog (xlator_t *this, int changelog_open (xlator_t *this, - changelog_priv_t *priv) + changelog_priv_t *priv, + changelog_local_t *local, changelog_rollover_data_t *crd) { int fd = 0; int ret = -1; @@ -180,12 +189,12 @@ changelog_open (xlator_t *this, char changelog_path[PATH_MAX] = {0,}; (void) snprintf (changelog_path, PATH_MAX, - "%s/"CHANGELOG_FILE_NAME, - priv->changelog_dir); + "%s/%s", priv->changelog_dir, + crd->crd_changelog_name); flags |= (O_CREAT | O_RDWR); if (priv->fsync_interval == 0) - flags |= O_SYNC; + flags |= O_SYNC; fd = open (changelog_path, flags, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); @@ -198,12 +207,25 @@ changelog_open (xlator_t *this, } priv->changelog_fd = fd; + CHANGELOG_INVOKE_CFOP (this, priv, reset_offset, local); + + /* preallocate if required */ + if (crd->crd_prealloc_size > 0) { + ret = posix_fallocate (priv->changelog_fd, + 0, crd->crd_prealloc_size); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "failed to preallocate %llu bytes", + (unsigned long long) crd->crd_prealloc_size); + } + } (void) snprintf (buffer, 1024, CHANGELOG_HEADER, CHANGELOG_VERSION_MAJOR, CHANGELOG_VERSION_MINOR, - priv->ce->encoder); - ret = changelog_write_change (priv, buffer, strlen (buffer)); + priv->encode_mode); + ret = changelog_write_change (this, priv, + local, buffer, strlen (buffer)); if (ret) { close (priv->changelog_fd); priv->changelog_fd = -1; @@ -216,18 +238,19 @@ changelog_open (xlator_t *this, return ret; } -int +static int changelog_start_next_change 
(xlator_t *this, changelog_priv_t *priv, - unsigned long ts, gf_boolean_t finale) + changelog_local_t *local, + changelog_log_data_t *cld) { - int ret = -1; - - ret = changelog_rollover_changelog (this, priv, ts); + int ret = 0; + changelog_rollover_data_t *crd = &cld->cld_roll; - if (!ret && !finale) - ret = changelog_open (this, priv); + ret = changelog_rollover_changelog (this, priv, crd); + if (!ret && !crd->crd_finale) + ret = changelog_open (this, priv, local, crd); return ret; } @@ -241,37 +264,42 @@ changelog_entry_length () } int -changelog_fill_rollover_data (changelog_log_data_t *cld, gf_boolean_t is_last) +changelog_write_change (xlator_t *this, changelog_priv_t *priv, + changelog_local_t *local, char *buffer, size_t len) { - struct timeval tv = {0,}; + int ret = -1; + off_t offset = 0; + ssize_t size = 0; + size_t writen = 0; - cld->cld_type = CHANGELOG_TYPE_ROLLOVER; + offset = CHANGELOG_INVOKE_CFOP (this, priv, get_offset, local); - if (gettimeofday (&tv, NULL)) - return -1; + while (writen < len) { + size = pwrite (priv->changelog_fd, + buffer + writen, len - writen, offset + writen); + if (size <= 0) + break; - cld->cld_roll_time = (unsigned long) tv.tv_sec; - cld->cld_finale = is_last; - return 0; -} + writen += size; + } -int -changelog_write_change (changelog_priv_t *priv, char *buffer, size_t len) -{ - return changelog_write (priv->changelog_fd, buffer, len); + if (writen == len) { + ret = 0; + CHANGELOG_INVOKE_CFOP (this, priv, set_offset, local, writen); + } + + return ret; } inline int changelog_handle_change (xlator_t *this, - changelog_priv_t *priv, changelog_log_data_t *cld) + changelog_priv_t *priv, + changelog_local_t *local, changelog_log_data_t *cld) { int ret = 0; if (CHANGELOG_TYPE_IS_ROLLOVER (cld->cld_type)) { - changelog_encode_change(priv); - ret = changelog_start_next_change (this, priv, - cld->cld_roll_time, - cld->cld_finale); + ret = changelog_start_next_change (this, priv, local, cld); if (ret) gf_log (this->name, 
GF_LOG_ERROR, "Problem rolling over changelog(s)"); @@ -295,7 +323,7 @@ changelog_handle_change (xlator_t *this, goto out; } - ret = priv->ce->encode (this, cld); + ret = priv->ce->encode (this, local, cld); if (ret) { gf_log (this->name, GF_LOG_ERROR, "error writing changelog to disk"); @@ -305,6 +333,17 @@ changelog_handle_change (xlator_t *this, return ret; } +static inline void +changelog_local_init_defaults (changelog_local_t *local, + uuid_t gfid, struct iobuf *iobuf) +{ + changelog_write_data_t *cwd = &(local->cld.cld_wdata); + + uuid_copy (cwd->cwd_gfid, gfid); + cwd->cwd_iobuf = iobuf; + cwd->cwd_xtra_records = 0; /* set by the caller */ +} + changelog_local_t * changelog_local_init (xlator_t *this, inode_t *inode, uuid_t gfid, int xtra_records, @@ -314,7 +353,7 @@ changelog_local_init (xlator_t *this, inode_t *inode, struct iobuf *iobuf = NULL; /** - * We relax the presence of inode if @update_flag is true. + * Relax the presence of inode if @update_flag is true. * The caller (implmentation of the fop) needs to be careful to * not blindly use local->inode. 
*/ @@ -339,10 +378,7 @@ changelog_local_init (xlator_t *this, inode_t *inode, local->update_no_check = update_flag; - uuid_copy (local->cld.cld_gfid, gfid); - - local->cld.cld_iobuf = iobuf; - local->cld.cld_xtra_records = 0; /* set by the caller */ + (void) changelog_local_init_defaults (local, gfid, iobuf); if (inode) local->inode = inode_ref (inode); @@ -370,9 +406,11 @@ changelog_forget (xlator_t *this, inode_t *inode) int changelog_inject_single_event (xlator_t *this, changelog_priv_t *priv, + changelog_local_t *local, changelog_log_data_t *cld) { - return priv->cd.dispatchfn (this, priv, priv->cd.cd_data, cld, NULL); + return priv->cd.dispatchfn (this, priv, + priv->cd.cd_data, local, cld); } /** @@ -383,9 +421,9 @@ void * changelog_rollover (void *data) { int ret = 0; + char *cname = NULL; xlator_t *this = NULL; struct timeval tv = {0,}; - changelog_log_data_t cld = {0,}; changelog_time_slice_t *slice = NULL; changelog_priv_t *priv = data; @@ -400,16 +438,11 @@ changelog_rollover (void *data) if (ret) continue; - ret = changelog_fill_rollover_data (&cld, _gf_false); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "failed to fill rollover data"); - continue; - } - LOCK (&priv->lock); { - ret = changelog_inject_single_event (this, priv, &cld); + cname = CHANGELOG_FNAME_FROM_POLICY (priv->cp); + ret = CHANGELOG_INVOKE_CFOP (this, priv, rollover, + cname, _gf_false); if (!ret) SLICE_VERSION_UPDATE (slice); } @@ -425,11 +458,9 @@ changelog_fsync_thread (void *data) int ret = 0; xlator_t *this = NULL; struct timeval tv = {0,}; - changelog_log_data_t cld = {0,}; changelog_priv_t *priv = data; this = priv->cf.this; - cld.cld_type = CHANGELOG_TYPE_FSYNC; while (1) { tv.tv_sec = priv->fsync_interval; @@ -439,7 +470,7 @@ changelog_fsync_thread (void *data) if (ret) continue; - ret = changelog_inject_single_event (this, priv, &cld); + ret = CHANGELOG_INVOKE_CFOP (this, priv, sync); if (ret) gf_log (this->name, GF_LOG_ERROR, "failed to inject fsync event"); @@ -637,19 
+668,19 @@ changelog_inode_ctx_get (xlator_t *this, * signifies an update was recorded in the current time slice). */ inline void -changelog_update (xlator_t *this, changelog_priv_t *priv, - changelog_local_t *local, changelog_log_type type) +changelog_update (xlator_t *this, + changelog_priv_t *priv, + changelog_local_t *local, + changelog_log_type type) { - int ret = 0; - unsigned long *iver = NULL; - unsigned long version = 0; - inode_t *inode = NULL; - changelog_time_slice_t *slice = NULL; - changelog_inode_ctx_t *ctx = NULL; - changelog_log_data_t *cld_0 = NULL; - changelog_log_data_t *cld_1 = NULL; - changelog_local_t *next_local = NULL; - gf_boolean_t need_upd = _gf_true; + int ret = 0; + unsigned long *iver = NULL; + unsigned long version = 0; + inode_t *inode = NULL; + changelog_time_slice_t *slice = NULL; + changelog_inode_ctx_t *ctx = NULL; + changelog_log_data_t *cld_0 = NULL; + gf_boolean_t need_upd = _gf_true; slice = &priv->slice; @@ -673,13 +704,8 @@ changelog_update (xlator_t *this, changelog_priv_t *priv, cld_0 = &local->cld; cld_0->cld_type = type; - if ( (next_local = local->prev_entry) != NULL ) { - cld_1 = &next_local->cld; - cld_1->cld_type = type; - } - ret = priv->cd.dispatchfn (this, priv, - priv->cd.cd_data, cld_0, cld_1); + priv->cd.cd_data, local, cld_0); /** * update after the dispatcher has successfully done diff --git a/xlators/features/changelog/src/changelog-helpers.h b/xlators/features/changelog/src/changelog-helpers.h index ad79636b0..656fb7ffa 100644 --- a/xlators/features/changelog/src/changelog-helpers.h +++ b/xlators/features/changelog/src/changelog-helpers.h @@ -19,23 +19,15 @@ #include "changelog-misc.h" /** - * the changelog entry + * structures representing the changelog entries */ -typedef struct changelog_log_data { - /* rollover related */ - unsigned long cld_roll_time; - - /* reopen changelog? 
*/ - gf_boolean_t cld_finale; - - changelog_log_type cld_type; - +typedef struct changelog_write_data { /** * sincd gfid is _always_ a necessity, it's not a part * of the iobuf. by doing this we do not add any overhead * for data and metadata related fops. */ - uuid_t cld_gfid; + uuid_t cwd_gfid; /** * iobufs are used for optionals records: pargfid, path, @@ -43,25 +35,78 @@ typedef struct changelog_log_data { * to allocate (iobuf_get() in the fop) and get unref'ed * in the callback (CHANGELOG_STACK_UNWIND). */ - struct iobuf *cld_iobuf; - -#define cld_ptr cld_iobuf->ptr + struct iobuf *cwd_iobuf; /** * after allocation you can point this to the length of * usable data, but make sure it does not exceed the * the size of the requested iobuf. */ - size_t cld_iobuf_len; - -#define cld_ptr_len cld_iobuf_len + size_t cwd_iobuf_len; + #define cwd_ptr cwd_iobuf->ptr + #define cwd_ptr_len cwd_iobuf_len /** * number of optional records */ - int cld_xtra_records; + int cwd_xtra_records; +} changelog_write_data_t; + +typedef struct changelog_rollover_data { + /** + * need a changelog reopen? + */ + gf_boolean_t crd_finale; + + /** + * changelog file name to be opened after a rollover + */ + char crd_changelog_name[PATH_MAX]; + + /** + * changelog file name before rollover + */ + char crd_changelog_oname[PATH_MAX]; + + /** + * use @crd_roll_key as suffix during roll-over + */ + gf_boolean_t crd_use_suffix; + + /** + * suffix used when rolling a changelog + */ + unsigned long crd_roll_key; + + /** + * preallocation? if yes, how much? + */ + off_t crd_prealloc_size; +} changelog_rollover_data_t; + +/** + * the changelog entry: structure representing the type of entry + * and a union encapsulating the above declared structures. + */ +typedef struct changelog_log_data { + /** + * type of the log data entry + */ + changelog_log_type cld_type; + + /** + * union for the type of changelog operations. 
@fsync() does + * not have a corresponding entry in this union as it just + * performs an @fsync() on ->changelog_fd. + */ + union { + changelog_write_data_t cld_wdata; + changelog_rollover_data_t cld_roll; + }; } changelog_log_data_t; +typedef struct changelog_local changelog_local_t; + /** * holder for dispatch function and private data */ @@ -70,8 +115,9 @@ typedef struct changelog_priv changelog_priv_t; typedef struct changelog_dispatcher { void *cd_data; - int (*dispatchfn) (xlator_t *, changelog_priv_t *, void *, - changelog_log_data_t *, changelog_log_data_t *); + int (*dispatchfn) (xlator_t *, + changelog_priv_t *, void *, + changelog_local_t *, changelog_log_data_t *); } changelog_dispatcher_t; struct changelog_bootstrap { @@ -82,9 +128,84 @@ struct changelog_bootstrap { struct changelog_encoder { changelog_encoder_t encoder; - int (*encode) (xlator_t *, changelog_log_data_t *); + int (*encode) (xlator_t *, + changelog_local_t *, changelog_log_data_t *); }; +struct changelog_ops { + /* changelog open */ + int (*open) (xlator_t *, changelog_priv_t *, + void *, char *, gf_boolean_t); + + /* changelog close */ + int (*close) (xlator_t *, changelog_priv_t *, void *); + + /* changelog rollover */ + int (*rollover) (xlator_t *, + changelog_priv_t *, + void *, char *, gf_boolean_t); + + int (*sync) (xlator_t *, changelog_priv_t *, void *); + + /* changelog write */ + int (*write) (xlator_t *, + changelog_priv_t *, void *, + changelog_local_t *, changelog_log_type); + + /* changelog read */ + int (*read) (xlator_t *, + changelog_priv_t *, void *, char *); + + int (*unlink) (xlator_t *, + changelog_priv_t *, void *, char *); + + /* {get|set} offset */ + off_t (*get_offset) (xlator_t *this, + changelog_priv_t *, void *, changelog_local_t *); + + void (*set_offset) (xlator_t *this, + changelog_priv_t *, void *, + changelog_local_t *, off_t); + + void (*reset_offset) (xlator_t *this, changelog_priv_t *, + void *, changelog_local_t *); +}; + +/** + * This structure is
_filled_ by the policy init (@init_policy) routine. + * Default @fops and @cops are passed to the init routine, which can + * choose to override the file operation or changelog operation behaviour. + * Just by _replacing_ the function pointers, a policy can change its + * file and changelog operation behaviour. Kind of inheritance... + */ +struct changelog_logpolicy { + /* current changelog name */ + char changelog_name[PATH_MAX]; + + /* private data */ + void *cpriv; + + /* file ops for the policy */ + struct xlator_fops *fops; + + /* changelog operations for the policy */ + struct changelog_ops *cops; + + /* current active policy */ + changelog_log_policy_t policy; + + int (*init_policy) (xlator_t *, + changelog_priv_t *priv, + struct changelog_logpolicy *); + int (*fini_policy) (xlator_t *, struct changelog_logpolicy *); +}; + +#define CHANGELOG_FNAME_FROM_POLICY(c) c->changelog_name + +#define CHANGELOG_INVOKE_FOP(priv,fop,...) priv->cp->fops->fop (__VA_ARGS__) + +#define CHANGELOG_INVOKE_CFOP(this,priv,fop,...) \ + priv->cp->cops->fop (this, priv, priv->cp->cpriv, ##__VA_ARGS__) /* xlator private */ @@ -142,6 +263,11 @@ typedef struct changelog_notify { struct changelog_priv { gf_boolean_t active; + /** + * write the record header? + */ + gf_boolean_t no_gfid_hdr; + /* to generate unique socket file per brick */ char *changelog_brick; @@ -191,24 +317,43 @@ struct changelog_priv { /* encoder */ struct changelog_encoder *ce; + + /* logging policy */ + changelog_log_policy_t policy; + + /* policy logger */ + struct changelog_logpolicy *cp; + + /* current NSR term */ + uint32_t term; }; struct changelog_local { inode_t *inode; + + /** + * fops that do not need inode version checks + */ gf_boolean_t update_no_check; + /** + * the log data entry + */ changelog_log_data_t cld; /** - * ->prev_entry is used in cases when there needs to be - * additional changelog entry for the parent (eg.
rename) - * It's analogous to ->next in single linked list world, - * but we call it as ->prev_entry... ha ha ha + * number of bytes written: used for continuation */ - struct changelog_local *prev_entry; -}; + off_t nr_bytes; -typedef struct changelog_local changelog_local_t; + /** + * temporary scratch pads + */ + union { + void *ptr; + unsigned long val; + } lu; +}; /* inode version is stored in inode ctx */ typedef struct changelog_inode_ctx { @@ -224,6 +369,8 @@ typedef struct changelog_inode_ctx { */ typedef enum { CHANGELOG_OPT_REC_FOP, + CHANGELOG_OPT_REC_ULL, + CHANGELOG_OPT_REC_UUID, CHANGELOG_OPT_REC_ENTRY, } changelog_optional_rec_type_t; @@ -253,7 +400,9 @@ typedef struct { size_t co_len; union { - glusterfs_fop_t co_fop; + uuid_t co_uuid; + glusterfs_fop_t co_fop; + unsigned long long co_number; struct changelog_entry_fields co_entry; }; } changelog_opt_t; @@ -277,29 +426,26 @@ changelog_local_t * changelog_local_init (xlator_t *this, inode_t *inode, uuid_t gfid, int xtra_records, gf_boolean_t update_flag); int -changelog_start_next_change (xlator_t *this, - changelog_priv_t *priv, - unsigned long ts, gf_boolean_t finale); -int -changelog_open (xlator_t *this, changelog_priv_t *priv); -int -changelog_fill_rollover_data (changelog_log_data_t *cld, gf_boolean_t is_last); -int changelog_inject_single_event (xlator_t *this, changelog_priv_t *priv, + changelog_local_t *local, changelog_log_data_t *cld); inline size_t changelog_entry_length (); inline int changelog_write (int fd, char *buffer, size_t len); int -changelog_write_change (changelog_priv_t *priv, char *buffer, size_t len); +changelog_write_change (xlator_t *this, changelog_priv_t *priv, + changelog_local_t *local, char *buffer, size_t len); inline int changelog_handle_change (xlator_t *this, - changelog_priv_t *priv, changelog_log_data_t *cld); + changelog_priv_t *priv, + changelog_local_t *local, changelog_log_data_t *cld); inline void -changelog_update (xlator_t *this, changelog_priv_t *priv, 
- changelog_local_t *local, changelog_log_type type); +changelog_update (xlator_t *this, + changelog_priv_t *priv, + changelog_local_t *local, + changelog_log_type type); void * changelog_rollover (void *data); void * @@ -319,9 +465,6 @@ changelog_forget (xlator_t *this, inode_t *inode); } \ STACK_UNWIND_STRICT (fop, frame, params); \ changelog_local_cleanup (__xl, __local); \ - if (__local && __local->prev_entry) \ - changelog_local_cleanup (__xl, \ - __local->prev_entry); \ } while (0) #define CHANGELOG_IOBUF_REF(iobuf) do { \ @@ -346,12 +489,12 @@ changelog_forget (xlator_t *this, inode_t *inode); } \ } while (0) -#define CHANGLOG_FILL_FOP_NUMBER(co, fop, converter, xlen) do { \ - co->co_convert = converter; \ - co->co_free = NULL; \ - co->co_type = CHANGELOG_OPT_REC_FOP; \ - co->co_fop = fop; \ - xlen += sizeof (fop); \ +#define CHANGELOG_FILL_FOP_NUMBER(co, fop, converter, xlen) do { \ + co->co_convert = converter; \ + co->co_free = NULL; \ + co->co_type = CHANGELOG_OPT_REC_FOP; \ + co->co_fop = fop; \ + xlen += sizeof (fop); \ } while (0) #define CHANGELOG_FILL_ENTRY(co, pargfid, bname, \ @@ -392,4 +535,7 @@ changelog_forget (xlator_t *this, inode_t *inode); goto label; \ } while (0) +int +changelog_open (xlator_t *this, changelog_priv_t *priv, changelog_local_t *local, changelog_rollover_data_t *crd); + #endif /* _CHANGELOG_HELPERS_H */ diff --git a/xlators/features/changelog/src/changelog-mem-types.h b/xlators/features/changelog/src/changelog-mem-types.h index d72464eab..a65bbb4f2 100644 --- a/xlators/features/changelog/src/changelog-mem-types.h +++ b/xlators/features/changelog/src/changelog-mem-types.h @@ -19,10 +19,11 @@ enum gf_changelog_mem_types { gf_changelog_mt_batch_t = gf_common_mt_end + 3, gf_changelog_mt_rt_t = gf_common_mt_end + 4, gf_changelog_mt_inode_ctx_t = gf_common_mt_end + 5, - gf_changelog_mt_libgfchangelog_t = gf_common_mt_end + 6, - gf_changelog_mt_libgfchangelog_rl_t = gf_common_mt_end + 7, - gf_changelog_mt_libgfchangelog_dirent_t = 
gf_common_mt_end + 8, - gf_changelog_mt_changelog_buffer_t = gf_common_mt_end + 9, + gf_changelog_mt_fop_policy_t = gf_common_mt_end + 6, + gf_changelog_mt_libgfchangelog_t = gf_common_mt_end + 7, + gf_changelog_mt_libgfchangelog_rl_t = gf_common_mt_end + 8, + gf_changelog_mt_libgfchangelog_dirent_t = gf_common_mt_end + 9, + gf_changelog_mt_changelog_buffer_t = gf_common_mt_end + 10, gf_changelog_mt_end }; diff --git a/xlators/features/changelog/src/changelog-misc.h b/xlators/features/changelog/src/changelog-misc.h index 0712a3771..58bd3279d 100644 --- a/xlators/features/changelog/src/changelog-misc.h +++ b/xlators/features/changelog/src/changelog-misc.h @@ -65,7 +65,7 @@ } while (0) /** - * everything after 'CHANGELOG_TYPE_ENTRY' are internal types + * everything after @CHANGELOG_TYPE_ENTRY are internal types * (ie. none of the fops trigger this type of event), hence * CHANGELOG_MAX_TYPE = 3 */ @@ -91,6 +91,12 @@ typedef enum { CHANGELOG_ENCODE_MAX, } changelog_encoder_t; +/* logging policies */ +typedef enum { + CHANGELOG_LOG_POLICY_DEFAULT = 0, + CHANGELOG_LOG_POLICY_REPLICATE, +} changelog_log_policy_t; + #define CHANGELOG_VALID_ENCODING(enc) \ (enc > CHANGELOG_ENCODE_MIN && enc < CHANGELOG_ENCODE_MAX) diff --git a/xlators/features/changelog/src/changelog-rt.c b/xlators/features/changelog/src/changelog-rt.c index c147f68ca..4e801ae85 100644 --- a/xlators/features/changelog/src/changelog-rt.c +++ b/xlators/features/changelog/src/changelog-rt.c @@ -52,8 +52,9 @@ changelog_rt_fini (xlator_t *this, changelog_dispatcher_t *cd) } int -changelog_rt_enqueue (xlator_t *this, changelog_priv_t *priv, void *cbatch, - changelog_log_data_t *cld_0, changelog_log_data_t *cld_1) +changelog_rt_enqueue (xlator_t *this, + changelog_priv_t *priv, void *cbatch, + changelog_local_t *local, changelog_log_data_t *cld_0) { int ret = 0; changelog_rt_t *crt = NULL; @@ -62,9 +63,7 @@ changelog_rt_enqueue (xlator_t *this, changelog_priv_t *priv, void *cbatch, LOCK (&crt->lock); { - ret = 
changelog_handle_change (this, priv, cld_0); - if (!ret && cld_1) - ret = changelog_handle_change (this, priv, cld_1); + ret = changelog_handle_change (this, priv, local, cld_0); } UNLOCK (&crt->lock); diff --git a/xlators/features/changelog/src/changelog-rt.h b/xlators/features/changelog/src/changelog-rt.h index 1fc2bbc5b..09398041d 100644 --- a/xlators/features/changelog/src/changelog-rt.h +++ b/xlators/features/changelog/src/changelog-rt.h @@ -27,7 +27,8 @@ changelog_rt_init (xlator_t *this, changelog_dispatcher_t *cd); int changelog_rt_fini (xlator_t *this, changelog_dispatcher_t *cd); int -changelog_rt_enqueue (xlator_t *this, changelog_priv_t *priv, void *cbatch, - changelog_log_data_t *cld_0, changelog_log_data_t *cld_1); +changelog_rt_enqueue (xlator_t *this, + changelog_priv_t *priv, void *cbatch, + changelog_local_t *local, changelog_log_data_t *cld_0); #endif /* _CHANGELOG_RT_H */ diff --git a/xlators/features/changelog/src/changelog.c b/xlators/features/changelog/src/changelog.c index cea0e8c70..2e01161a9 100644 --- a/xlators/features/changelog/src/changelog.c +++ b/xlators/features/changelog/src/changelog.c @@ -17,15 +17,15 @@ #include "defaults.h" #include "logging.h" #include "iobuf.h" +#include #include "changelog-rt.h" - +#include "changelog-notifier.h" #include "changelog-encoders.h" #include "changelog-mem-types.h" -#include - -#include "changelog-notifier.h" +#include "changelog-fops.h" +#include "changelog-policy.h" static struct changelog_bootstrap cb_bootstrap[] = { @@ -36,11 +36,41 @@ cb_bootstrap[] = { }, }; -/* Entry operations - TYPE III */ +static struct changelog_encoder +cb_encoder[] = { + [CHANGELOG_ENCODE_BINARY] = + { + .encoder = CHANGELOG_ENCODE_BINARY, + .encode = changelog_encode_binary, + }, + [CHANGELOG_ENCODE_ASCII] = + { + .encoder = CHANGELOG_ENCODE_ASCII, + .encode = changelog_encode_ascii, + }, +}; -/** - * entry operations do not undergo inode version checking. 
- */ +static struct changelog_logpolicy +cb_policy[] = { + [CHANGELOG_LOG_POLICY_DEFAULT] = + { + .fops = NULL, + .cops = NULL, + .policy = CHANGELOG_LOG_POLICY_DEFAULT, + .init_policy = changelog_default_policy_init, + .fini_policy = changelog_default_policy_fini, + }, + [CHANGELOG_LOG_POLICY_REPLICATE] = + { + .fops = NULL, + .cops = NULL, + .policy = CHANGELOG_LOG_POLICY_REPLICATE, + .init_policy = changelog_replication_policy_init, + .fini_policy = changelog_replication_policy_fini, + }, +}; + +/* Entry operations - TYPE III */ /* {{{ */ @@ -59,7 +89,8 @@ changelog_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); - changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY); + CHANGELOG_INVOKE_CFOP (this, priv, + write, local, CHANGELOG_TYPE_ENTRY); unwind: CHANGELOG_STACK_UNWIND (rmdir, frame, op_ret, op_errno, @@ -71,27 +102,12 @@ int32_t changelog_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags, dict_t *xdata) { - size_t xtra_len = 0; - changelog_priv_t *priv = NULL; - changelog_opt_t *co = NULL; + changelog_priv_t *priv = NULL; priv = this->private; CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); - CHANGELOG_INIT_NOCHECK (this, frame->local, - NULL, loc->inode->gfid, 2); - - co = changelog_get_usable_buffer (frame->local); - if (!co) - goto wind; - - CHANGLOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); - - co++; - CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name, - entry_fn, entry_free_fn, xtra_len, wind); - - changelog_set_usable_record_and_length (frame->local, xtra_len, 2); + CHANGELOG_INVOKE_FOP (priv, rmdir, frame, this, loc, xflags, xdata); wind: STACK_WIND (frame, changelog_rmdir_cbk, @@ -115,7 +131,8 @@ changelog_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); - changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY); + CHANGELOG_INVOKE_CFOP (this, priv, + 
write, local, CHANGELOG_TYPE_ENTRY); unwind: CHANGELOG_STACK_UNWIND (unlink, frame, op_ret, op_errno, @@ -127,27 +144,13 @@ int32_t changelog_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags, dict_t *xdata) { - size_t xtra_len = 0; - changelog_priv_t *priv = NULL; - changelog_opt_t *co = NULL; + changelog_priv_t *priv = NULL; priv = this->private; CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); CHANGELOG_IF_INTERNAL_FOP_THEN_GOTO (xdata, wind); - CHANGELOG_INIT_NOCHECK (this, frame->local, NULL, loc->inode->gfid, 2); - - co = changelog_get_usable_buffer (frame->local); - if (!co) - goto wind; - - CHANGLOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); - - co++; - CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name, - entry_fn, entry_free_fn, xtra_len, wind); - - changelog_set_usable_record_and_length (frame->local, xtra_len, 2); + CHANGELOG_INVOKE_FOP (priv, unlink, frame, this, loc, xflags, xdata); wind: STACK_WIND (frame, changelog_unlink_cbk, @@ -174,7 +177,8 @@ changelog_rename_cbk (call_frame_t *frame, CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); - changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY); + CHANGELOG_INVOKE_CFOP (this, priv, + write, local, CHANGELOG_TYPE_ENTRY); unwind: CHANGELOG_STACK_UNWIND (rename, frame, op_ret, op_errno, @@ -188,32 +192,12 @@ int32_t changelog_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, dict_t *xdata) { - size_t xtra_len = 0; - changelog_priv_t *priv = NULL; - changelog_opt_t *co = NULL; + changelog_priv_t *priv = NULL; priv = this->private; CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); - /* 3 == fop + oldloc + newloc */ - CHANGELOG_INIT_NOCHECK (this, frame->local, - NULL, oldloc->inode->gfid, 3); - - co = changelog_get_usable_buffer (frame->local); - if (!co) - goto wind; - - CHANGLOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); - - co++; - CHANGELOG_FILL_ENTRY (co, oldloc->pargfid, oldloc->name, - entry_fn, 
entry_free_fn, xtra_len, wind); - - co++; - CHANGELOG_FILL_ENTRY (co, newloc->pargfid, newloc->name, - entry_fn, entry_free_fn, xtra_len, wind); - - changelog_set_usable_record_and_length (frame->local, xtra_len, 3); + CHANGELOG_INVOKE_FOP (priv, rename, frame, this, oldloc, newloc, xdata); wind: STACK_WIND (frame, changelog_rename_cbk, @@ -239,7 +223,8 @@ changelog_link_cbk (call_frame_t *frame, CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); - changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY); + CHANGELOG_INVOKE_CFOP (this, priv, + write, local, CHANGELOG_TYPE_ENTRY); unwind: CHANGELOG_STACK_UNWIND (link, frame, op_ret, op_errno, @@ -252,28 +237,14 @@ changelog_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, dict_t *xdata) { - size_t xtra_len = 0; - changelog_priv_t *priv = NULL; - changelog_opt_t *co = NULL; + changelog_priv_t *priv = NULL; priv = this->private; CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); CHANGELOG_IF_INTERNAL_FOP_THEN_GOTO (xdata, wind); - CHANGELOG_INIT_NOCHECK (this, frame->local, NULL, oldloc->gfid, 2); - - co = changelog_get_usable_buffer (frame->local); - if (!co) - goto wind; - - CHANGLOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); - - co++; - CHANGELOG_FILL_ENTRY (co, newloc->pargfid, newloc->name, - entry_fn, entry_free_fn, xtra_len, wind); - - changelog_set_usable_record_and_length (frame->local, xtra_len, 2); + CHANGELOG_INVOKE_FOP (priv, link, frame, this, oldloc, newloc, xdata); wind: STACK_WIND (frame, changelog_link_cbk, @@ -299,7 +270,8 @@ changelog_mkdir_cbk (call_frame_t *frame, CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); - changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY); + CHANGELOG_INVOKE_CFOP (this, priv, + write, local, CHANGELOG_TYPE_ENTRY); unwind: CHANGELOG_STACK_UNWIND (mkdir, frame, op_ret, op_errno, @@ -311,37 +283,13 @@ int32_t changelog_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, mode_t umask, 
dict_t *xdata) { - int ret = -1; - uuid_t gfid = {0,}; - void *uuid_req = NULL; - size_t xtra_len = 0; - changelog_priv_t *priv = NULL; - changelog_opt_t *co = NULL; + changelog_priv_t *priv = NULL; priv = this->private; CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); - ret = dict_get_ptr (xdata, "gfid-req", &uuid_req); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, - "failed to get gfid from dict"); - goto wind; - } - uuid_copy (gfid, uuid_req); - - CHANGELOG_INIT_NOCHECK (this, frame->local, NULL, gfid, 2); - - co = changelog_get_usable_buffer (frame->local); - if (!co) - goto wind; - - CHANGLOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); - - co++; - CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name, - entry_fn, entry_free_fn, xtra_len, wind); - - changelog_set_usable_record_and_length (frame->local, xtra_len, 2); + CHANGELOG_INVOKE_FOP (priv, mkdir, frame, this, + loc, mode, umask, xdata); wind: STACK_WIND (frame, changelog_mkdir_cbk, @@ -367,7 +315,8 @@ changelog_symlink_cbk (call_frame_t *frame, CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); - changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY); + CHANGELOG_INVOKE_CFOP (this, priv, + write, local, CHANGELOG_TYPE_ENTRY); unwind: CHANGELOG_STACK_UNWIND (symlink, frame, op_ret, op_errno, @@ -380,37 +329,13 @@ changelog_symlink (call_frame_t *frame, xlator_t *this, const char *linkname, loc_t *loc, mode_t umask, dict_t *xdata) { - int ret = -1; - size_t xtra_len = 0; - uuid_t gfid = {0,}; - void *uuid_req = NULL; - changelog_priv_t *priv = NULL; - changelog_opt_t *co = NULL; + changelog_priv_t *priv = NULL; priv = this->private; CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); - ret = dict_get_ptr (xdata, "gfid-req", &uuid_req); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, - "failed to get gfid from dict"); - goto wind; - } - uuid_copy (gfid, uuid_req); - - CHANGELOG_INIT_NOCHECK (this, frame->local, NULL, gfid, 2); - - co = changelog_get_usable_buffer 
(frame->local); - if (!co) - goto wind; - - CHANGLOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); - - co++; - CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name, - entry_fn, entry_free_fn, xtra_len, wind); - - changelog_set_usable_record_and_length (frame->local, xtra_len, 2); + CHANGELOG_INVOKE_FOP (priv, symlink, frame, this, + linkname, loc, umask, xdata); wind: STACK_WIND (frame, changelog_symlink_cbk, @@ -436,7 +361,8 @@ changelog_mknod_cbk (call_frame_t *frame, CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); - changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY); + CHANGELOG_INVOKE_CFOP (this, priv, + write, local, CHANGELOG_TYPE_ENTRY); unwind: CHANGELOG_STACK_UNWIND (mknod, frame, op_ret, op_errno, @@ -449,37 +375,13 @@ changelog_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, dev_t dev, mode_t umask, dict_t *xdata) { - int ret = -1; - uuid_t gfid = {0,}; - void *uuid_req = NULL; - size_t xtra_len = 0; - changelog_priv_t *priv = NULL; - changelog_opt_t *co = NULL; + changelog_priv_t *priv = NULL; priv = this->private; CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); - ret = dict_get_ptr (xdata, "gfid-req", &uuid_req); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, - "failed to get gfid from dict"); - goto wind; - } - uuid_copy (gfid, uuid_req); - - CHANGELOG_INIT_NOCHECK (this, frame->local, NULL, gfid, 2); - - co = changelog_get_usable_buffer (frame->local); - if (!co) - goto wind; - - CHANGLOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); - - co++; - CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name, - entry_fn, entry_free_fn, xtra_len, wind); - - changelog_set_usable_record_and_length (frame->local, xtra_len, 2); + CHANGELOG_INVOKE_FOP (priv, mknod, frame, this, + loc, mode, dev, umask, xdata); wind: STACK_WIND (frame, changelog_mknod_cbk, @@ -506,7 +408,8 @@ changelog_create_cbk (call_frame_t *frame, CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); - changelog_update (this, 
priv, local, CHANGELOG_TYPE_ENTRY); + CHANGELOG_INVOKE_CFOP (this, priv, + write, local, CHANGELOG_TYPE_ENTRY); unwind: CHANGELOG_STACK_UNWIND (create, frame, @@ -520,40 +423,13 @@ changelog_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { - int ret = -1; - uuid_t gfid = {0,}; - void *uuid_req = NULL; - changelog_opt_t *co = NULL; - changelog_priv_t *priv = NULL; - size_t xtra_len = 0; + changelog_priv_t *priv = NULL; priv = this->private; CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); - ret = dict_get_ptr (xdata, "gfid-req", &uuid_req); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, - "failed to get gfid from dict"); - goto wind; - } - uuid_copy (gfid, uuid_req); - - /* init with two extra records */ - CHANGELOG_INIT_NOCHECK (this, frame->local, NULL, gfid, 2); - if (!frame->local) - goto wind; - - co = changelog_get_usable_buffer (frame->local); - if (!co) - goto wind; - - CHANGLOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); - - co++; - CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name, - entry_fn, entry_free_fn, xtra_len, wind); - - changelog_set_usable_record_and_length (frame->local, xtra_len, 2); + CHANGELOG_INVOKE_FOP (priv, create, frame, this, loc, + flags, mode, umask, fd, xdata); wind: STACK_WIND (frame, changelog_create_cbk, @@ -585,7 +461,8 @@ changelog_fsetattr_cbk (call_frame_t *frame, CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); - changelog_update (this, priv, local, CHANGELOG_TYPE_METADATA); + CHANGELOG_INVOKE_CFOP (this, priv, + write, local, CHANGELOG_TYPE_METADATA); unwind: CHANGELOG_STACK_UNWIND (fsetattr, frame, op_ret, op_errno, @@ -606,8 +483,8 @@ changelog_fsetattr (call_frame_t *frame, priv = this->private; CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); - CHANGELOG_INIT (this, frame->local, - fd->inode, fd->inode->gfid, 0); + CHANGELOG_INVOKE_FOP (priv, fsetattr, + frame, this, fd, stbuf, valid, xdata); wind: STACK_WIND 
(frame, changelog_fsetattr_cbk, @@ -632,7 +509,8 @@ changelog_setattr_cbk (call_frame_t *frame, CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); - changelog_update (this, priv, local, CHANGELOG_TYPE_METADATA); + CHANGELOG_INVOKE_CFOP (this, priv, + write, local, CHANGELOG_TYPE_METADATA); unwind: CHANGELOG_STACK_UNWIND (setattr, frame, op_ret, op_errno, @@ -651,8 +529,8 @@ changelog_setattr (call_frame_t *frame, priv = this->private; CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); - CHANGELOG_INIT (this, frame->local, - loc->inode, loc->inode->gfid, 0); + CHANGELOG_INVOKE_FOP (priv, setattr, + frame, this, loc, stbuf, valid, xdata); wind: STACK_WIND (frame, changelog_setattr_cbk, @@ -676,7 +554,8 @@ changelog_fremovexattr_cbk (call_frame_t *frame, CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); - changelog_update (this, priv, local, CHANGELOG_TYPE_METADATA); + CHANGELOG_INVOKE_CFOP (this, priv, + write, local, CHANGELOG_TYPE_METADATA); unwind: CHANGELOG_STACK_UNWIND (fremovexattr, frame, op_ret, op_errno, xdata); @@ -693,8 +572,8 @@ changelog_fremovexattr (call_frame_t *frame, xlator_t *this, priv = this->private; CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); - CHANGELOG_INIT (this, frame->local, - fd->inode, fd->inode->gfid, 0); + CHANGELOG_INVOKE_FOP (priv, fremovexattr, + frame, this, fd, name, xdata); wind: STACK_WIND (frame, changelog_fremovexattr_cbk, @@ -716,7 +595,8 @@ changelog_removexattr_cbk (call_frame_t *frame, CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); - changelog_update (this, priv, local, CHANGELOG_TYPE_METADATA); + CHANGELOG_INVOKE_CFOP (this, priv, + write, local, CHANGELOG_TYPE_METADATA); unwind: CHANGELOG_STACK_UNWIND (removexattr, frame, op_ret, op_errno, xdata); @@ -733,8 +613,7 @@ changelog_removexattr (call_frame_t *frame, xlator_t *this, priv = this->private; CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); - CHANGELOG_INIT (this, frame->local, - loc->inode, loc->inode->gfid, 0); + 
CHANGELOG_INVOKE_FOP (priv, removexattr, frame, this, loc, name, xdata); wind: STACK_WIND (frame, changelog_removexattr_cbk, @@ -758,7 +637,8 @@ changelog_setxattr_cbk (call_frame_t *frame, CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); - changelog_update (this, priv, local, CHANGELOG_TYPE_METADATA); + CHANGELOG_INVOKE_CFOP (this, priv, + write, local, CHANGELOG_TYPE_METADATA); unwind: CHANGELOG_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata); @@ -776,8 +656,8 @@ changelog_setxattr (call_frame_t *frame, priv = this->private; CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); - CHANGELOG_INIT (this, frame->local, - loc->inode, loc->inode->gfid, 0); + CHANGELOG_INVOKE_FOP (priv, setxattr, + frame, this, loc, dict, flags, xdata); wind: STACK_WIND (frame, changelog_setxattr_cbk, @@ -799,7 +679,8 @@ changelog_fsetxattr_cbk (call_frame_t *frame, CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); - changelog_update (this, priv, local, CHANGELOG_TYPE_METADATA); + CHANGELOG_INVOKE_CFOP (this, priv, + write, local, CHANGELOG_TYPE_METADATA); unwind: CHANGELOG_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, xdata); @@ -817,8 +698,8 @@ changelog_fsetxattr (call_frame_t *frame, priv = this->private; CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); - CHANGELOG_INIT (this, frame->local, - fd->inode, fd->inode->gfid, 0); + CHANGELOG_INVOKE_FOP (priv, fsetxattr, + frame, this, fd, dict, flags, xdata); wind: STACK_WIND (frame, changelog_fsetxattr_cbk, @@ -850,7 +731,7 @@ changelog_truncate_cbk (call_frame_t *frame, CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); - changelog_update (this, priv, local, CHANGELOG_TYPE_DATA); + CHANGELOG_INVOKE_CFOP (this, priv, write, local, CHANGELOG_TYPE_DATA); unwind: CHANGELOG_STACK_UNWIND (truncate, frame, @@ -867,8 +748,7 @@ changelog_truncate (call_frame_t *frame, priv = this->private; CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); - CHANGELOG_INIT (this, frame->local, - loc->inode, 
loc->inode->gfid, 0); + CHANGELOG_INVOKE_FOP (priv, truncate, frame, this, loc, offset, xdata); wind: STACK_WIND (frame, changelog_truncate_cbk, @@ -891,7 +771,7 @@ changelog_ftruncate_cbk (call_frame_t *frame, CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); - changelog_update (this, priv, local, CHANGELOG_TYPE_DATA); + CHANGELOG_INVOKE_CFOP (this, priv, write, local, CHANGELOG_TYPE_DATA); unwind: CHANGELOG_STACK_UNWIND (ftruncate, frame, @@ -908,8 +788,7 @@ changelog_ftruncate (call_frame_t *frame, priv = this->private; CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); - CHANGELOG_INIT (this, frame->local, - fd->inode, fd->inode->gfid, 0); + CHANGELOG_INVOKE_FOP (priv, ftruncate, frame, this, fd, offset, xdata); wind: STACK_WIND (frame, changelog_ftruncate_cbk, @@ -934,7 +813,7 @@ changelog_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, CHANGELOG_COND_GOTO (priv, ((op_ret <= 0) || !local), unwind); - changelog_update (this, priv, local, CHANGELOG_TYPE_DATA); + CHANGELOG_INVOKE_CFOP (this, priv, write, local, CHANGELOG_TYPE_DATA); unwind: CHANGELOG_STACK_UNWIND (writev, frame, @@ -951,10 +830,11 @@ changelog_writev (call_frame_t *frame, changelog_priv_t *priv = NULL; priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); - CHANGELOG_INIT (this, frame->local, - fd->inode, fd->inode->gfid, 0); + CHANGELOG_INVOKE_FOP (priv, writev, frame, this, fd, + vector, count, offset, flags, iobref, xdata); wind: STACK_WIND (frame, changelog_writev_cbk, FIRST_CHILD (this), @@ -994,6 +874,15 @@ changelog_assign_encoding (changelog_priv_t *priv, char *enc) } } +static void +changelog_assign_policy (changelog_priv_t *priv, char *pol) +{ + if ( strncmp (pol, "default", 7) == 0 ) + priv->policy = CHANGELOG_LOG_POLICY_DEFAULT; + else if ( strncmp (pol, "replication", 11) == 0 ) + priv->policy = CHANGELOG_LOG_POLICY_REPLICATE; +} + /* cleanup any helper threads that are running */ static void changelog_cleanup_helper_threads (xlator_t 
*this, changelog_priv_t *priv) @@ -1016,15 +905,17 @@ changelog_spawn_helper_threads (xlator_t *this, changelog_priv_t *priv) int ret = 0; priv->cr.this = this; - ret = gf_thread_create (&priv->cr.rollover_th, - NULL, changelog_rollover, priv); - if (ret) - goto out; + if (priv->rollover_time) { + ret = pthread_create (&priv->cr.rollover_th, + NULL, changelog_rollover, priv); + if (ret) + goto out; + } if (priv->fsync_interval) { priv->cf.this = this; - ret = gf_thread_create (&priv->cf.fsync_th, - NULL, changelog_fsync_thread, priv); + ret = pthread_create (&priv->cf.fsync_th, + NULL, changelog_fsync_thread, priv); } if (ret) @@ -1088,8 +979,8 @@ changelog_spawn_notifier (xlator_t *this, changelog_priv_t *priv) priv->cn.this = this; priv->cn.rfd = pipe_fd[0]; - ret = gf_thread_create (&priv->cn.notify_th, - NULL, changelog_notifier, priv); + ret = pthread_create (&priv->cn.notify_th, + NULL, changelog_notifier, priv); out: return ret; @@ -1117,10 +1008,10 @@ mem_acct_init (xlator_t *this) static int changelog_init (xlator_t *this, changelog_priv_t *priv) { - int i = 0; - int ret = -1; - struct timeval tv = {0,}; - changelog_log_data_t cld = {0,}; + int i = 0; + int ret = -1; + char *cname = NULL; + struct timeval tv = {0,}; ret = gettimeofday (&tv, NULL); if (ret) { @@ -1148,21 +1039,18 @@ changelog_init (xlator_t *this, changelog_priv_t *priv) if (ret) goto out; - /** - * start with a fresh changelog file every time. this is done - * in case there was an encoding change. so... things are kept - * simple here. - */ - ret = changelog_fill_rollover_data (&cld, _gf_false); - if (ret) - goto out; + cname = CHANGELOG_FNAME_FROM_POLICY (priv->cp); LOCK (&priv->lock); { - ret = changelog_inject_single_event (this, priv, &cld); + ret = CHANGELOG_INVOKE_CFOP (this, priv, + open, cname, _gf_false); } UNLOCK (&priv->lock); + if (ret) + goto out; + /* ... 
and finally spawn the helpers threads */ ret = changelog_spawn_helper_threads (this, priv); @@ -1175,11 +1063,11 @@ reconfigure (xlator_t *this, dict_t *options) { int ret = 0; char *tmp = NULL; + char *cname = NULL; changelog_priv_t *priv = NULL; gf_boolean_t active_earlier = _gf_true; gf_boolean_t active_now = _gf_true; changelog_time_slice_t *slice = NULL; - changelog_log_data_t cld = {0,}; priv = this->private; if (!priv) @@ -1230,15 +1118,13 @@ reconfigure (xlator_t *this, dict_t *options) priv->fsync_interval, options, int32, out); if (active_now || active_earlier) { - ret = changelog_fill_rollover_data (&cld, !active_now); - if (ret) - goto out; - slice = &priv->slice; + cname = CHANGELOG_FNAME_FROM_POLICY (priv->cp); LOCK (&priv->lock); { - ret = changelog_inject_single_event (this, priv, &cld); + ret = CHANGELOG_INVOKE_CFOP (this, priv, rollover, + cname, !active_now); if (!ret && active_now) SLICE_VERSION_UPDATE (slice); } @@ -1345,20 +1231,43 @@ init (xlator_t *this) GF_OPTION_INIT ("encoding", tmp, str, out); changelog_assign_encoding (priv, tmp); - GF_OPTION_INIT ("rollover-time", priv->rollover_time, int32, out); + tmp = NULL; + + GF_OPTION_INIT ("policy", tmp, str, out); + changelog_assign_policy (priv, tmp); GF_OPTION_INIT ("fsync-interval", priv->fsync_interval, int32, out); - changelog_encode_change(priv); + GF_ASSERT (cb_encoder[priv->encode_mode].encoder == priv->encode_mode); + priv->ce = &cb_encoder[priv->encode_mode]; GF_ASSERT (cb_bootstrap[priv->op_mode].mode == priv->op_mode); priv->cb = &cb_bootstrap[priv->op_mode]; + GF_ASSERT (cb_policy[priv->policy].policy == priv->policy); + priv->cp = &cb_policy[priv->policy]; + /* ... now bootstrap the logger */ ret = priv->cb->ctor (this, &priv->cd); if (ret) goto out; + /* ... 
init logging policy */ + ret = priv->cp->init_policy (this, priv, priv->cp); + if (ret) + goto out; + + /* override the value if set */ + if (dict_get (this->options, "rollover-time")) { + ret = dict_get_int32 (this->options, + "rollover-time", &priv->rollover_time); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Cannot get value for \"rollover-time\""); + goto out; + } + } + priv->changelog_fd = -1; ret = changelog_init (this, priv); if (ret) @@ -1437,7 +1346,7 @@ struct xlator_cbks cbks = { struct volume_options options[] = { {.key = {"changelog"}, .type = GF_OPTION_TYPE_BOOL, - .default_value = "off", + .default_value = "on", .description = "enable/disable change-logging" }, {.key = {"changelog-brick"}, @@ -1462,8 +1371,7 @@ struct volume_options options[] = { .description = "encoding type for changelogs" }, {.key = {"rollover-time"}, - .default_value = "60", - .type = GF_OPTION_TYPE_TIME, + .type = GF_OPTION_TYPE_INT, .description = "time to switch to a new changelog file (in seconds)" }, {.key = {"fsync-interval"}, @@ -1472,6 +1380,12 @@ struct volume_options options[] = { .description = "do not open CHANGELOG file with O_SYNC mode." " instead perform fsync() at specified intervals" }, + {.key = {"policy"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "replication", + .value = {"default", "replication"}, + .description = "Logging policies" + }, {.key = {NULL} }, }; diff --git a/xlators/features/changelog/src/policy/changelog-policy-default.c b/xlators/features/changelog/src/policy/changelog-policy-default.c new file mode 100644 index 000000000..089bc10e4 --- /dev/null +++ b/xlators/features/changelog/src/policy/changelog-policy-default.c @@ -0,0 +1,44 @@ +/* + Copyright (c) 2013 Red Hat, Inc. + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "changelog-policy.h" +#include "changelog-fops.h" + +int +changelog_default_policy_init (xlator_t *this, + changelog_priv_t *priv, + struct changelog_logpolicy *cp) +{ + priv->rollover_time = 15; + + priv->no_gfid_hdr = _gf_false; + + cp->cpriv = GF_CALLOC (1, sizeof (off_t), + gf_changelog_mt_fop_policy_t); + if (!cp->cpriv) + return -1; + + (void) memset (cp->changelog_name, '\0', PATH_MAX); + (void) memcpy (cp->changelog_name, + CHANGELOG_FILE_NAME, strlen (CHANGELOG_FILE_NAME)); + + cp->fops = &changelog_default_fops; /* default logging policy */ + cp->cops = &changelog_default_cops; /* default changelog operations */ + + return 0; +} + +int +changelog_default_policy_fini (xlator_t *this, + struct changelog_logpolicy *cp) +{ + GF_FREE (cp->cpriv); + return 0; +} diff --git a/xlators/features/changelog/src/policy/changelog-policy-replication.c b/xlators/features/changelog/src/policy/changelog-policy-replication.c new file mode 100644 index 000000000..536339939 --- /dev/null +++ b/xlators/features/changelog/src/policy/changelog-policy-replication.c @@ -0,0 +1,1184 @@ +/* + Copyright (c) 2013 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#include "changelog-policy.h" +#include "changelog-encoders.h" +#include "changelog-fops.h" + +#define JOURNAL_NAME "TERM" + +#define JOURNAL_SECTOR_SIZE 128 + +#define PRE_OP_MARK 0x5F4552505FULL /* _PRE_ */ +#define POST_OP_MARK 0x5F54534F505FULL /* _POST_ */ + +/** + * assume an ever increasing index for now.. + */ +static unsigned long nsr_index = 1; + +static unsigned long +get_index(changelog_priv_t *priv) { + unsigned long idx = 0; + + LOCK (&priv->lock); + { + idx = nsr_index++; + } + UNLOCK (&priv->lock); + + return idx; +} + +static void +reset_index(changelog_priv_t *priv) { + nsr_index = 1; +} + + +#if 0 +static inline void +//changelog_replication_assign_term (changelog_priv_t *priv, + changelog_local_t *local) +{ + local->nr_bytes = 0; + local->lu.val = get_index (priv); +} +#endif + +size_t +number_fn (void *data, char *buffer, gf_boolean_t encode) +{ + char buf[1024] = {0,}; + size_t bufsz = 0; + unsigned long long nr = 0; + + nr = *(unsigned long long *) data; + + if (encode) { + (void) snprintf (buf, sizeof (buf), "%llu", nr); + CHANGELOG_FILL_BUFFER (buffer, bufsz, buf, strlen (buf)); + } else + CHANGELOG_FILL_BUFFER (buffer, bufsz, + &nr, sizeof (unsigned long long)); + + return bufsz; +} + +size_t +uuid_fn (void *data, char *buffer, gf_boolean_t encode) +{ + char buf[1024] = {0,}; + uuid_t uuid = {0,}; + size_t bufsz = 0; + + memcpy (uuid, (uuid_t *) data, sizeof (uuid_t)); + + if (encode) { + char *tmpbuf = uuid_utoa (uuid); + (void) snprintf (buf, sizeof (buf), "%s", tmpbuf); + CHANGELOG_FILL_BUFFER (buffer, bufsz, buf, strlen (buf)); + } else + CHANGELOG_FILL_BUFFER (buffer, bufsz, uuid, sizeof (uuid_t)); + + return bufsz; +} + +#define CHANGELOG_FILL_USIGNLL(co, number, converter, xlen) do { \ + co->co_convert = converter; \ + co->co_free = NULL; \ + co->co_type = CHANGELOG_OPT_REC_ULL; \ + co->co_number = (unsigned long long) number; \ + xlen += sizeof (unsigned long long); \ + if (!co->co_convert) \ + co->co_len = sizeof (unsigned 
long long); \ + } while (0) + +#define CHANGELOG_FILL_UUID(co, uuid, converter, xlen) do { \ + co->co_convert = converter; \ + co->co_free = NULL; \ + co->co_type = CHANGELOG_OPT_REC_UUID; \ + uuid_copy (co->co_uuid, uuid); \ + xlen += sizeof (uuid_t); \ + } while (0) + + +/* TBD: move declarations here and nsr.c into a common place */ +#define NSR_TERM_XATTR "trusted.nsr.term" +#define RECON_TERM_XATTR "trusted.nsr.recon-term" +#define RECON_INDEX_XATTR "trusted.nsr.recon-index" + +static gf_boolean_t +changelog_fix_term(xlator_t *this, + changelog_local_t *local, + dict_t *xdata) +{ + int32_t old_term, new_term; + changelog_priv_t *priv = this->private; + int ret = 0; + char nfile[PATH_MAX] = {0,}; + int32_t recon_term, recon_index; + changelog_rollover_data_t crd; + + // If coming via the regular IO path, we should get the dict "nsr-term" + // If coming via reconciliation, we should get the dicts "nsr-recon-term" + // that indicates the term and "nsr-recon-index" for the index + if (dict_get_int32(xdata,NSR_TERM_XATTR,&new_term) == 0) { + old_term = priv->term; + + if (old_term != new_term) { + GF_ASSERT(new_term > old_term); + LOCK (&priv->lock); + reset_index(priv); + priv->term = new_term; + (void) snprintf (nfile, PATH_MAX, "%s.%d", + JOURNAL_NAME, priv->term); + ret = CHANGELOG_INVOKE_CFOP(this, priv, rollover, + nfile, _gf_false); + UNLOCK (&priv->lock); + if (ret != 0) + return _gf_false; + } + local->nr_bytes = 0; + local->lu.val = get_index (priv); + } else if ((dict_get_int32(xdata, RECON_TERM_XATTR, &recon_term) == 0) && + (dict_get_int32(xdata, RECON_INDEX_XATTR, &recon_index) == 0)) { + + old_term = priv->term; + + if (old_term != recon_term) { + LOCK (&priv->lock); + priv->term = recon_term; + (void) snprintf (crd.crd_changelog_name, PATH_MAX, "%s.%d", JOURNAL_NAME, priv->term); + crd.crd_prealloc_size = 0; + if (changelog_open(this, priv, local, &crd) != 0) + return _gf_false; + UNLOCK (&priv->lock); + } + local->nr_bytes = 0; + local->lu.val = 
recon_index; + } else { + return _gf_false; + } + + return _gf_true; +} + + + +/** override FOPS */ + +int32_t +changelog_replication_rmdir (call_frame_t *frame, xlator_t *this, + loc_t *loc, int xflags, dict_t *xdata) +{ + int ret = -1; + size_t xtra_len = 0; + changelog_opt_t *co = NULL; + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + + /*
+           PRE_OP_MARK + FOP + GFID + Entry */
+        CHANGELOG_INIT_NOCHECK (this, local, NULL, loc->inode->gfid, 4);
+        if (!local)
+                goto out;
+
+        co = changelog_get_usable_buffer (local);
+        if (!co)
+                goto out;
+
+        if (changelog_fix_term(this, local, xdata) == _gf_false)
+                goto out;
+
+        CHANGELOG_FILL_USIGNLL (co, PRE_OP_MARK, NULL, xtra_len);
+        co++;
+
+        CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+        co++;
+
+        CHANGELOG_FILL_UUID (co, loc->inode->gfid, uuid_fn, xtra_len);
+        co++;
+
+        CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name,
+                              entry_fn, entry_free_fn, xtra_len, out);
+
+        changelog_set_usable_record_and_length (local, xtra_len, 4);
+
+        //changelog_replication_assign_term (priv, local);
+
+        frame->local = local;
+        ret = 0;
+
+ out:
+        if (ret)
+                changelog_local_cleanup (this, local);
+        return ret;
+}
+
+/**
+ * Journal an unlink for the replication policy.
+ *
+ * All arguments are forwarded unchanged to changelog_replication_rmdir(),
+ * which takes the fop number from frame->root->op; its result is returned
+ * as is.
+ */
+int32_t
+changelog_replication_unlink (call_frame_t *frame, xlator_t *this,
+                              loc_t *loc, int xflags, dict_t *xdata)
+{
+        int32_t ret = changelog_replication_rmdir (frame, this, loc,
+                                                   xflags, xdata);
+
+        return ret;
+}
+
+/**
+ * Prepare the replication-journal record for a rename.
+ *
+ * Record fields, in order: PRE_OP_MARK, fop number (frame->root->op),
+ * GFID of oldloc->inode, old entry (oldloc->pargfid + oldloc->name) and
+ * new entry (newloc->pargfid + newloc->name).
+ *
+ * Returns 0 with the populated local stored in frame->local. On any
+ * failure changelog_local_cleanup() is called on the local and -1 is
+ * returned.
+ */
+int32_t
+changelog_replication_rename (call_frame_t *frame, xlator_t *this,
+                              loc_t *oldloc, loc_t *newloc, dict_t *xdata)
+{
+        changelog_local_t *local   = NULL;
+        changelog_priv_t  *priv    = NULL;
+        changelog_opt_t   *rec     = NULL;
+        size_t             rec_len = 0;
+        gf_boolean_t       term_ok = _gf_false;
+        int                ret     = -1;
+
+        priv = this->private;
+
+        /* PRE_OP_MARK + FOP + GFID + OLDLOC + NEWLOC */
+        CHANGELOG_INIT_NOCHECK (this, local, NULL, oldloc->inode->gfid, 5);
+        if (local == NULL)
+                goto out;
+
+        rec = changelog_get_usable_buffer (local);
+        if (rec == NULL)
+                goto out;
+
+        term_ok = changelog_fix_term (this, local, xdata);
+        if (term_ok == _gf_false)
+                goto out;
+
+        /* field 1: pre-op marker, stored without a converter */
+        CHANGELOG_FILL_USIGNLL (rec, PRE_OP_MARK, NULL, rec_len);
+        rec++;
+
+        /* field 2: fop number */
+        CHANGELOG_FILL_FOP_NUMBER (rec, frame->root->op, fop_fn, rec_len);
+        rec++;
+
+        /* field 3: GFID of the object being renamed */
+        CHANGELOG_FILL_UUID (rec, oldloc->inode->gfid, uuid_fn, rec_len);
+        rec++;
+
+        /* field 4: old entry */
+        CHANGELOG_FILL_ENTRY (rec, oldloc->pargfid, oldloc->name,
+                              entry_fn, entry_free_fn, rec_len, out);
+        rec++;
+
+        /* field 5: new entry */
+        CHANGELOG_FILL_ENTRY (rec, newloc->pargfid, newloc->name,
+                              entry_fn, entry_free_fn, rec_len, out);
+
+        changelog_set_usable_record_and_length (local, rec_len, 5);
+
+        frame->local = local;
+        ret = 0;
+
+ out:
+        if (ret != 0)
+                changelog_local_cleanup (this, local);
+        return ret;
+}
+
+/**
+ * Prepare the replication-journal record for a hard link.
+ *
+ * Record fields, in order: PRE_OP_MARK, fop number (frame->root->op),
+ * oldloc->gfid and the new entry (newloc->pargfid + newloc->name).
+ *
+ * Returns 0 with the populated local stored in frame->local. On any
+ * failure changelog_local_cleanup() is called on the local and -1 is
+ * returned.
+ */
+int32_t
+changelog_replication_link (call_frame_t *frame,
+                            xlator_t *this, loc_t *oldloc,
+                            loc_t *newloc, dict_t *xdata)
+{
+        changelog_local_t *local   = NULL;
+        changelog_priv_t  *priv    = NULL;
+        changelog_opt_t   *rec     = NULL;
+        size_t             rec_len = 0;
+        gf_boolean_t       term_ok = _gf_false;
+        int                ret     = -1;
+
+        priv = this->private;
+
+        /* PRE_OP_MARK + FOP + GFID + Entry */
+        CHANGELOG_INIT_NOCHECK (this, local, NULL, oldloc->gfid, 4);
+        if (local == NULL)
+                goto out;
+
+        rec = changelog_get_usable_buffer (local);
+        if (rec == NULL)
+                goto out;
+
+        term_ok = changelog_fix_term (this, local, xdata);
+        if (term_ok == _gf_false)
+                goto out;
+
+        /* field 1: pre-op marker, stored without a converter */
+        CHANGELOG_FILL_USIGNLL (rec, PRE_OP_MARK, NULL, rec_len);
+        rec++;
+
+        /* field 2: fop number */
+        CHANGELOG_FILL_FOP_NUMBER (rec, frame->root->op, fop_fn, rec_len);
+        rec++;
+
+        /* field 3: GFID of the link source */
+        CHANGELOG_FILL_UUID (rec, oldloc->gfid, uuid_fn, rec_len);
+        rec++;
+
+        /* field 4: the newly created entry */
+        CHANGELOG_FILL_ENTRY (rec, newloc->pargfid, newloc->name,
+                              entry_fn, entry_free_fn, rec_len, out);
+
+        changelog_set_usable_record_and_length (local, rec_len, 4);
+
+        frame->local = local;
+        ret = 0;
+
+ out:
+        if (ret != 0)
+                changelog_local_cleanup (this, local);
+        return ret;
+}
+
+/**
+ * Prepare the replication-journal record for a mkdir.
+ *
+ * The GFID is read from the "gfid-req" pointer in xdata. Record fields,
+ * in order: PRE_OP_MARK, fop number (frame->root->op), that GFID and the
+ * entry (loc->pargfid + loc->name).
+ *
+ * Returns 0 with the populated local stored in frame->local. If
+ * "gfid-req" cannot be fetched, the non-zero result of dict_get_ptr() is
+ * returned; any later failure returns -1. changelog_local_cleanup() is
+ * called on every failure path.
+ */
+int32_t
+changelog_replication_mkdir (call_frame_t *frame,
+                             xlator_t *this, loc_t *loc,
+                             mode_t mode, mode_t umask, dict_t *xdata)
+{
+        changelog_local_t *local    = NULL;
+        changelog_priv_t  *priv     = NULL;
+        changelog_opt_t   *rec      = NULL;
+        size_t             rec_len  = 0;
+        void              *uuid_req = NULL;
+        uuid_t             gfid     = {0,};
+        gf_boolean_t       term_ok  = _gf_false;
+        int                ret      = -1;
+
+        priv = this->private;
+
+        ret = dict_get_ptr (xdata, "gfid-req", &uuid_req);
+        if (ret != 0) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "failed to get gfid from dict");
+                goto out;
+        }
+
+        /* from here on every failure is reported as -1 */
+        ret = -1;
+        uuid_copy (gfid, uuid_req);
+
+        /* PRE_OP_MARK + FOP + GFID + Entry */
+        CHANGELOG_INIT_NOCHECK (this, local, NULL, gfid, 4);
+        if (local == NULL)
+                goto out;
+
+        rec = changelog_get_usable_buffer (local);
+        if (rec == NULL)
+                goto out;
+
+        term_ok = changelog_fix_term (this, local, xdata);
+        if (term_ok == _gf_false)
+                goto out;
+
+        /* field 1: pre-op marker, stored without a converter */
+        CHANGELOG_FILL_USIGNLL (rec, PRE_OP_MARK, NULL, rec_len);
+        rec++;
+
+        /* field 2: fop number */
+        CHANGELOG_FILL_FOP_NUMBER (rec, frame->root->op, fop_fn, rec_len);
+        rec++;
+
+        /* field 3: requested GFID */
+        CHANGELOG_FILL_UUID (rec, gfid, uuid_fn, rec_len);
+        rec++;
+
+        /* field 4: parent GFID + basename */
+        CHANGELOG_FILL_ENTRY (rec, loc->pargfid, loc->name,
+                              entry_fn, entry_free_fn, rec_len, out);
+
+        changelog_set_usable_record_and_length (local, rec_len, 4);
+
+        frame->local = local;
+        ret = 0;
+
+ out:
+        if (ret != 0)
+                changelog_local_cleanup (this, local);
+        return ret;
+}
+
+/**
+ * Prepare the replication-journal record for a symlink.
+ *
+ * The GFID is read from the "gfid-req" pointer in xdata. Record fields,
+ * in order: PRE_OP_MARK, fop number (frame->root->op), that GFID and the
+ * entry (loc->pargfid + loc->name). linkname is not recorded.
+ *
+ * Returns 0 with the populated local stored in frame->local. If
+ * "gfid-req" cannot be fetched, the non-zero result of dict_get_ptr() is
+ * returned; any later failure returns -1. changelog_local_cleanup() is
+ * called on every failure path.
+ */
+int32_t
+changelog_replication_symlink (call_frame_t *frame, xlator_t *this,
+                               const char *linkname, loc_t *loc,
+                               mode_t umask, dict_t *xdata)
+{
+        changelog_local_t *local    = NULL;
+        changelog_priv_t  *priv     = NULL;
+        changelog_opt_t   *rec      = NULL;
+        size_t             rec_len  = 0;
+        void              *uuid_req = NULL;
+        uuid_t             gfid     = {0,};
+        gf_boolean_t       term_ok  = _gf_false;
+        int                ret      = -1;
+
+        priv = this->private;
+
+        ret = dict_get_ptr (xdata, "gfid-req", &uuid_req);
+        if (ret != 0) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "failed to get gfid from dict");
+                goto out;
+        }
+
+        /* from here on every failure is reported as -1 */
+        ret = -1;
+        uuid_copy (gfid, uuid_req);
+
+        /* PRE_OP_MARK + FOP + GFID + Entry */
+        CHANGELOG_INIT_NOCHECK (this, local, NULL, gfid, 4);
+        if (local == NULL)
+                goto out;
+
+        rec = changelog_get_usable_buffer (local);
+        if (rec == NULL)
+                goto out;
+
+        term_ok = changelog_fix_term (this, local, xdata);
+        if (term_ok == _gf_false)
+                goto out;
+
+        /* field 1: pre-op marker, stored without a converter */
+        CHANGELOG_FILL_USIGNLL (rec, PRE_OP_MARK, NULL, rec_len);
+        rec++;
+
+        /* field 2: fop number */
+        CHANGELOG_FILL_FOP_NUMBER (rec, frame->root->op, fop_fn, rec_len);
+        rec++;
+
+        /* field 3: requested GFID */
+        CHANGELOG_FILL_UUID (rec, gfid, uuid_fn, rec_len);
+        rec++;
+
+        /* field 4: parent GFID + basename */
+        CHANGELOG_FILL_ENTRY (rec, loc->pargfid, loc->name,
+                              entry_fn, entry_free_fn, rec_len, out);
+
+        changelog_set_usable_record_and_length (local, rec_len, 4);
+
+        frame->local = local;
+        ret = 0;
+
+ out:
+        if (ret != 0)
+                changelog_local_cleanup (this, local);
+        return ret;
+}
+
+/**
+ * Prepare the replication-journal record for a mknod.
+ *
+ * The GFID is read from the "gfid-req" pointer in xdata. Record fields,
+ * in order: PRE_OP_MARK, fop number (frame->root->op), that GFID and the
+ * entry (loc->pargfid + loc->name). mode, dev and umask are not recorded.
+ *
+ * Returns 0 with the populated local stored in frame->local. If
+ * "gfid-req" cannot be fetched, the non-zero result of dict_get_ptr() is
+ * returned; any later failure returns -1. changelog_local_cleanup() is
+ * called on every failure path.
+ */
+int32_t
+changelog_replication_mknod (call_frame_t *frame,
+                             xlator_t *this, loc_t *loc,
+                             mode_t mode, dev_t dev,
+                             mode_t umask, dict_t *xdata)
+{
+        int                ret      = -1;
+        uuid_t             gfid     = {0,};
+        void              *uuid_req = NULL;
+        size_t             xtra_len = 0;
+        changelog_opt_t   *co       = NULL;
+        changelog_priv_t  *priv     = NULL;
+        changelog_local_t *local    = NULL;
+
+        priv = this->private;
+
+        ret = dict_get_ptr (xdata, "gfid-req", &uuid_req);
+        if (ret) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "failed to get gfid from dict");
+                goto out;
+        }
+        uuid_copy (gfid, uuid_req);
+
+        /* from here on every failure is reported as -1 */
+        ret = -1;
+
+        /* PRE_OP_MARK + FOP + GFID + Entry */
+        CHANGELOG_INIT_NOCHECK (this, local, NULL, gfid, 4);
+        /* without a local there is no buffer to fill in */
+        if (!local)
+                goto out;
+
+        co = changelog_get_usable_buffer (local);
+        if (!co)
+                goto out;
+
+        if (changelog_fix_term(this, local, xdata) == _gf_false)
+                goto out;
+
+        /* field 1: pre-op marker, stored without a converter */
+        CHANGELOG_FILL_USIGNLL (co, PRE_OP_MARK, NULL, xtra_len);
+        co++;
+
+        /* field 2: fop number */
+        CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+        co++;
+
+        /* field 3: requested GFID */
+        CHANGELOG_FILL_UUID (co, gfid, uuid_fn, xtra_len);
+        co++;
+
+        /* field 4: parent GFID + basename */
+        CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name,
+                              entry_fn, entry_free_fn, xtra_len, out);
+
+        changelog_set_usable_record_and_length (local, xtra_len, 4);
+
+        frame->local = local;
+        ret = 0;
+
+ out:
+        if (ret)
+                changelog_local_cleanup (this, local);
+        return ret;
+}
+
+/**
+ * Prepare the replication-journal record for a create.
+ *
+ * The GFID is read from the "gfid-req" pointer in xdata. Record fields,
+ * in order: PRE_OP_MARK, fop number (frame->root->op), that GFID and the
+ * entry (loc->pargfid + loc->name). flags, mode, umask and fd are not
+ * recorded.
+ *
+ * Returns 0 with the populated local stored in frame->local. If
+ * "gfid-req" cannot be fetched, the non-zero result of dict_get_ptr() is
+ * returned; any later failure returns -1. changelog_local_cleanup() is
+ * called on every failure path.
+ */
+int32_t
+changelog_replication_create (call_frame_t *frame, xlator_t *this,
+                              loc_t *loc, int32_t flags, mode_t mode,
+                              mode_t umask, fd_t *fd, dict_t *xdata)
+{
+        int                ret      = -1;
+        uuid_t             gfid     = {0,};
+        void              *uuid_req = NULL;
+        size_t             xtra_len = 0;
+        changelog_opt_t   *co       = NULL;
+        changelog_priv_t  *priv     = NULL;
+        changelog_local_t *local    = NULL;
+
+        priv = this->private;
+
+        ret = dict_get_ptr (xdata, "gfid-req", &uuid_req);
+        if (ret) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "failed to get gfid from dict");
+                goto out;
+        }
+        uuid_copy (gfid, uuid_req);
+
+        /* from here on every failure is reported as -1 */
+        ret = -1;
+
+        /* PRE_OP_MARK + FOP + GFID + Entry */
+        CHANGELOG_INIT_NOCHECK (this, local, NULL, gfid, 4);
+        /* without a local there is no buffer to fill in */
+        if (!local)
+                goto out;
+
+        co = changelog_get_usable_buffer (local);
+        if (!co)
+                goto out;
+
+        if (changelog_fix_term(this, local, xdata) == _gf_false)
+                goto out;
+
+        /* field 1: pre-op marker, stored without a converter */
+        CHANGELOG_FILL_USIGNLL (co, PRE_OP_MARK, NULL, xtra_len);
+        co++;
+
+        /* field 2: fop number */
+        CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+        co++;
+
+        /* field 3: requested GFID */
+        CHANGELOG_FILL_UUID (co, gfid, uuid_fn, xtra_len);
+        co++;
+
+        /* field 4: parent GFID + basename */
+        CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name,
+                              entry_fn, entry_free_fn, xtra_len, out);
+
+        changelog_set_usable_record_and_length (local, xtra_len, 4);
+
+        frame->local = local;
+        ret = 0;
+
+ out:
+        if (ret)
+                changelog_local_cleanup (this, local);
+        return ret;
+}
+
+/**
+ * Prepare the replication-journal record for an fsetattr.
+ *
+ * Record fields, in order: PRE_OP_MARK, fop number (frame->root->op) and
+ * the GFID of fd->inode. stbuf and valid are not recorded.
+ *
+ * Returns 0 with the populated local stored in frame->local. On any
+ * failure changelog_local_cleanup() is called on the local and -1 is
+ * returned.
+ */
+int32_t
+changelog_replication_fsetattr (call_frame_t *frame,
+                                xlator_t *this, fd_t *fd,
+                                struct iatt *stbuf, int32_t valid,
+                                dict_t *xdata)
+{
+        changelog_local_t *local   = NULL;
+        changelog_priv_t  *priv    = NULL;
+        changelog_opt_t   *rec     = NULL;
+        size_t             rec_len = 0;
+        gf_boolean_t       term_ok = _gf_false;
+        int                ret     = -1;
+
+        priv = this->private;
+
+        /* PRE_OP_MARK + FOP + GFID */
+        CHANGELOG_INIT_NOCHECK (this, local, NULL, fd->inode->gfid, 3);
+        if (local == NULL)
+                goto out;
+
+        rec = changelog_get_usable_buffer (local);
+        if (rec == NULL)
+                goto out;
+
+        term_ok = changelog_fix_term (this, local, xdata);
+        if (term_ok == _gf_false)
+                goto out;
+
+        /* field 1: pre-op marker, stored without a converter */
+        CHANGELOG_FILL_USIGNLL (rec, PRE_OP_MARK, NULL, rec_len);
+        rec++;
+
+        /* field 2: fop number */
+        CHANGELOG_FILL_FOP_NUMBER (rec, frame->root->op, fop_fn, rec_len);
+        rec++;
+
+        /* field 3: GFID of the target inode */
+        CHANGELOG_FILL_UUID (rec, fd->inode->gfid, uuid_fn, rec_len);
+
+        changelog_set_usable_record_and_length (local, rec_len, 3);
+
+        frame->local = local;
+        ret = 0;
+
+ out:
+        if (ret != 0)
+                changelog_local_cleanup (this, local);
+        return ret;
+}
+
+/**
+ * Prepare the replication-journal record for a setattr.
+ *
+ * Record fields, in order: PRE_OP_MARK, fop number (frame->root->op) and
+ * the GFID of loc->inode. stbuf and valid are not recorded.
+ *
+ * Returns 0 with the populated local stored in frame->local. On any
+ * failure changelog_local_cleanup() is called on the local and -1 is
+ * returned.
+ */
+int32_t
+changelog_replication_setattr (call_frame_t *frame,
+                               xlator_t *this, loc_t *loc,
+                               struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+        changelog_local_t *local   = NULL;
+        changelog_priv_t  *priv    = NULL;
+        changelog_opt_t   *rec     = NULL;
+        size_t             rec_len = 0;
+        gf_boolean_t       term_ok = _gf_false;
+        int                ret     = -1;
+
+        priv = this->private;
+
+        /* PRE_OP_MARK + FOP + GFID */
+        CHANGELOG_INIT_NOCHECK (this, local, NULL, loc->inode->gfid, 3);
+        if (local == NULL)
+                goto out;
+
+        rec = changelog_get_usable_buffer (local);
+        if (rec == NULL)
+                goto out;
+
+        term_ok = changelog_fix_term (this, local, xdata);
+        if (term_ok == _gf_false)
+                goto out;
+
+        /* field 1: pre-op marker, stored without a converter */
+        CHANGELOG_FILL_USIGNLL (rec, PRE_OP_MARK, NULL, rec_len);
+        rec++;
+
+        /* field 2: fop number */
+        CHANGELOG_FILL_FOP_NUMBER (rec, frame->root->op, fop_fn, rec_len);
+        rec++;
+
+        /* field 3: GFID of the target inode */
+        CHANGELOG_FILL_UUID (rec, loc->inode->gfid, uuid_fn, rec_len);
+
+        changelog_set_usable_record_and_length (local, rec_len, 3);
+
+        frame->local = local;
+        ret = 0;
+
+ out:
+        if (ret != 0)
+                changelog_local_cleanup (this, local);
+        return ret;
+}
+
+/**
+ * Prepare the replication-journal record for an fremovexattr.
+ *
+ * Record fields, in order: PRE_OP_MARK, fop number and the GFID of
+ * fd->inode. The fop number is the int32 stored under
+ * "recon-xattr-opcode" in xdata when that key can be read, and
+ * frame->root->op otherwise. name is not recorded.
+ *
+ * Returns 0 with the populated local stored in frame->local. On any
+ * failure changelog_local_cleanup() is called on the local and -1 is
+ * returned.
+ */
+int32_t
+changelog_replication_fremovexattr (call_frame_t *frame, xlator_t *this,
+                                    fd_t *fd, const char *name, dict_t *xdata)
+{
+        changelog_local_t *local   = NULL;
+        changelog_priv_t  *priv    = NULL;
+        changelog_opt_t   *rec     = NULL;
+        size_t             rec_len = 0;
+        gf_boolean_t       term_ok = _gf_false;
+        int32_t            xattr_op;
+        int                ret     = -1;
+
+        priv = this->private;
+
+        /* PRE_OP_MARK + FOP + GFID */
+        CHANGELOG_INIT_NOCHECK (this, local, NULL, fd->inode->gfid, 3);
+        if (local == NULL)
+                goto out;
+
+        rec = changelog_get_usable_buffer (local);
+        if (rec == NULL)
+                goto out;
+
+        term_ok = changelog_fix_term (this, local, xdata);
+        if (term_ok == _gf_false)
+                goto out;
+
+        /* field 1: pre-op marker, stored without a converter */
+        CHANGELOG_FILL_USIGNLL (rec, PRE_OP_MARK, NULL, rec_len);
+        rec++;
+
+        /* field 2: fop number; "recon-xattr-opcode" overrides the frame's */
+        if (dict_get_int32 (xdata, "recon-xattr-opcode", &xattr_op) != 0) {
+                CHANGELOG_FILL_FOP_NUMBER (rec, frame->root->op, fop_fn,
+                                           rec_len);
+        } else {
+                CHANGELOG_FILL_FOP_NUMBER (rec, (glusterfs_fop_t)xattr_op,
+                                           fop_fn, rec_len);
+        }
+        rec++;
+
+        /* field 3: GFID of the target inode */
+        CHANGELOG_FILL_UUID (rec, fd->inode->gfid, uuid_fn, rec_len);
+
+        changelog_set_usable_record_and_length (local, rec_len, 3);
+
+        frame->local = local;
+        ret = 0;
+
+ out:
+        if (ret != 0)
+                changelog_local_cleanup (this, local);
+        return ret;
+}
+
+/**
+ * Prepare the replication-journal record for a removexattr.
+ *
+ * Record fields, in order: PRE_OP_MARK, fop number and the GFID of
+ * loc->inode. The fop number is the int32 stored under
+ * "recon-xattr-opcode" in xdata when that key can be read, and
+ * frame->root->op otherwise. name is not recorded.
+ *
+ * Returns 0 with the populated local stored in frame->local. On any
+ * failure changelog_local_cleanup() is called on the local and -1 is
+ * returned.
+ */
+int32_t
+changelog_replication_removexattr (call_frame_t *frame, xlator_t *this,
+                                   loc_t *loc, const char *name, dict_t *xdata)
+{
+        changelog_local_t *local   = NULL;
+        changelog_priv_t  *priv    = NULL;
+        changelog_opt_t   *rec     = NULL;
+        size_t             rec_len = 0;
+        gf_boolean_t       term_ok = _gf_false;
+        int32_t            xattr_op;
+        int                ret     = -1;
+
+        priv = this->private;
+
+        /* PRE_OP_MARK + FOP + GFID */
+        CHANGELOG_INIT_NOCHECK (this, local, NULL, loc->inode->gfid, 3);
+        if (local == NULL)
+                goto out;
+
+        rec = changelog_get_usable_buffer (local);
+        if (rec == NULL)
+                goto out;
+
+        term_ok = changelog_fix_term (this, local, xdata);
+        if (term_ok == _gf_false)
+                goto out;
+
+        /* field 1: pre-op marker, stored without a converter */
+        CHANGELOG_FILL_USIGNLL (rec, PRE_OP_MARK, NULL, rec_len);
+        rec++;
+
+        /* field 2: fop number; "recon-xattr-opcode" overrides the frame's */
+        if (dict_get_int32 (xdata, "recon-xattr-opcode", &xattr_op) != 0) {
+                CHANGELOG_FILL_FOP_NUMBER (rec, frame->root->op, fop_fn,
+                                           rec_len);
+        } else {
+                CHANGELOG_FILL_FOP_NUMBER (rec, (glusterfs_fop_t)xattr_op,
+                                           fop_fn, rec_len);
+        }
+        rec++;
+
+        /* field 3: GFID of the target inode */
+        CHANGELOG_FILL_UUID (rec, loc->inode->gfid, uuid_fn, rec_len);
+
+        changelog_set_usable_record_and_length (local, rec_len, 3);
+
+        frame->local = local;
+        ret = 0;
+
+ out:
+        if (ret != 0)
+                changelog_local_cleanup (this, local);
+        return ret;
+}
+
+/**
+ * Prepare the replication-journal record for a setxattr.
+ *
+ * Record fields, in order: PRE_OP_MARK, fop number and the GFID of
+ * loc->inode. The fop number is the int32 stored under
+ * "recon-xattr-opcode" in xdata when that key can be read, and
+ * frame->root->op otherwise. dict and flags are not recorded.
+ *
+ * Returns 0 with the populated local stored in frame->local. On any
+ * failure changelog_local_cleanup() is called on the local and -1 is
+ * returned.
+ */
+int32_t
+changelog_replication_setxattr (call_frame_t *frame,
+                                xlator_t *this, loc_t *loc,
+                                dict_t *dict, int32_t flags, dict_t *xdata)
+{
+        changelog_local_t *local   = NULL;
+        changelog_priv_t  *priv    = NULL;
+        changelog_opt_t   *rec     = NULL;
+        size_t             rec_len = 0;
+        gf_boolean_t       term_ok = _gf_false;
+        int32_t            xattr_op;
+        int                ret     = -1;
+
+        priv = this->private;
+
+        /* PRE_OP_MARK + FOP + GFID */
+        CHANGELOG_INIT_NOCHECK (this, local, NULL, loc->inode->gfid, 3);
+        if (local == NULL)
+                goto out;
+
+        rec = changelog_get_usable_buffer (local);
+        if (rec == NULL)
+                goto out;
+
+        term_ok = changelog_fix_term (this, local, xdata);
+        if (term_ok == _gf_false)
+                goto out;
+
+        /* field 1: pre-op marker, stored without a converter */
+        CHANGELOG_FILL_USIGNLL (rec, PRE_OP_MARK, NULL, rec_len);
+        rec++;
+
+        /* field 2: fop number; "recon-xattr-opcode" overrides the frame's */
+        if (dict_get_int32 (xdata, "recon-xattr-opcode", &xattr_op) != 0) {
+                CHANGELOG_FILL_FOP_NUMBER (rec, frame->root->op, fop_fn,
+                                           rec_len);
+        } else {
+                CHANGELOG_FILL_FOP_NUMBER (rec, (glusterfs_fop_t)xattr_op,
+                                           fop_fn, rec_len);
+        }
+        rec++;
+
+        /* field 3: GFID of the target inode */
+        CHANGELOG_FILL_UUID (rec, loc->inode->gfid, uuid_fn, rec_len);
+
+        changelog_set_usable_record_and_length (local, rec_len, 3);
+
+        frame->local = local;
+        ret = 0;
+
+ out:
+        if (ret != 0)
+                changelog_local_cleanup (this, local);
+        return ret;
+}
+
+/**
+ * Prepare the replication-journal record for an fsetxattr.
+ *
+ * Record fields, in order: PRE_OP_MARK, fop number and the GFID of
+ * fd->inode. The fop number is the int32 stored under
+ * "recon-xattr-opcode" in xdata when that key can be read, and
+ * frame->root->op otherwise. dict and flags are not recorded.
+ *
+ * Returns 0 with the populated local stored in frame->local. On any
+ * failure changelog_local_cleanup() is called on the local and -1 is
+ * returned.
+ */
+int32_t
+changelog_replication_fsetxattr (call_frame_t *frame,
+                                 xlator_t *this, fd_t *fd, dict_t *dict,
+                                 int32_t flags, dict_t *xdata)
+{
+        changelog_local_t *local   = NULL;
+        changelog_priv_t  *priv    = NULL;
+        changelog_opt_t   *rec     = NULL;
+        size_t             rec_len = 0;
+        gf_boolean_t       term_ok = _gf_false;
+        int32_t            xattr_op;
+        int                ret     = -1;
+
+        priv = this->private;
+
+        /* PRE_OP_MARK + FOP + GFID */
+        CHANGELOG_INIT_NOCHECK (this, local, NULL, fd->inode->gfid, 3);
+        if (local == NULL)
+                goto out;
+
+        rec = changelog_get_usable_buffer (local);
+        if (rec == NULL)
+                goto out;
+
+        term_ok = changelog_fix_term (this, local, xdata);
+        if (term_ok == _gf_false)
+                goto out;
+
+        /* field 1: pre-op marker, stored without a converter */
+        CHANGELOG_FILL_USIGNLL (rec, PRE_OP_MARK, NULL, rec_len);
+        rec++;
+
+        /* field 2: fop number; "recon-xattr-opcode" overrides the frame's */
+        if (dict_get_int32 (xdata, "recon-xattr-opcode", &xattr_op) != 0) {
+                CHANGELOG_FILL_FOP_NUMBER (rec, frame->root->op, fop_fn,
+                                           rec_len);
+        } else {
+                CHANGELOG_FILL_FOP_NUMBER (rec, (glusterfs_fop_t)xattr_op,
+                                           fop_fn, rec_len);
+        }
+        rec++;
+
+        /* field 3: GFID of the target inode */
+        CHANGELOG_FILL_UUID (rec, fd->inode->gfid, uuid_fn, rec_len);
+
+        changelog_set_usable_record_and_length (local, rec_len, 3);
+
+        frame->local = local;
+        ret = 0;
+
+ out:
+        if (ret != 0)
+                changelog_local_cleanup (this, local);
+        return ret;
+}
+
+/**
+ * Prepare the replication-journal record for a truncate.
+ *
+ * Record fields, in order: PRE_OP_MARK, fop number (frame->root->op),
+ * the GFID of loc->inode and the truncate offset.
+ *
+ * Returns 0 with the populated local stored in frame->local. On any
+ * failure changelog_local_cleanup() is called on the local and -1 is
+ * returned.
+ */
+int32_t
+changelog_replication_truncate (call_frame_t *frame,
+                                xlator_t *this, loc_t *loc,
+                                off_t offset, dict_t *xdata)
+{
+        changelog_local_t *local   = NULL;
+        changelog_priv_t  *priv    = NULL;
+        changelog_opt_t   *rec     = NULL;
+        size_t             rec_len = 0;
+        gf_boolean_t       term_ok = _gf_false;
+        int                ret     = -1;
+
+        priv = this->private;
+
+        /* PRE_OP_MARK + FOP + GFID + Offset */
+        CHANGELOG_INIT_NOCHECK (this, local, NULL, loc->inode->gfid, 4);
+        if (local == NULL)
+                goto out;
+
+        rec = changelog_get_usable_buffer (local);
+        if (rec == NULL)
+                goto out;
+
+        term_ok = changelog_fix_term (this, local, xdata);
+        if (term_ok == _gf_false)
+                goto out;
+
+        /* field 1: pre-op marker, stored without a converter */
+        CHANGELOG_FILL_USIGNLL (rec, PRE_OP_MARK, NULL, rec_len);
+        rec++;
+
+        /* field 2: fop number */
+        CHANGELOG_FILL_FOP_NUMBER (rec, frame->root->op, fop_fn, rec_len);
+        rec++;
+
+        /* field 3: GFID of the target inode */
+        CHANGELOG_FILL_UUID (rec, loc->inode->gfid, uuid_fn, rec_len);
+        rec++;
+
+        /* field 4: offset, converted by number_fn */
+        CHANGELOG_FILL_USIGNLL (rec, offset, number_fn, rec_len);
+
+        changelog_set_usable_record_and_length (local, rec_len, 4);
+
+        frame->local = local;
+        ret = 0;
+
+ out:
+        if (ret != 0)
+                changelog_local_cleanup (this, local);
+        return ret;
+}
+
+/**
+ * Prepare the replication-journal record for an ftruncate.
+ *
+ * Record fields, in order: PRE_OP_MARK, fop number (frame->root->op),
+ * the GFID of fd->inode and the truncate offset.
+ *
+ * Returns 0 with the populated local stored in frame->local. On any
+ * failure changelog_local_cleanup() is called on the local and -1 is
+ * returned.
+ */
+int32_t
+changelog_replication_ftruncate (call_frame_t *frame,
+                                 xlator_t *this, fd_t *fd,
+                                 off_t offset, dict_t *xdata)
+{
+        changelog_local_t *local   = NULL;
+        changelog_priv_t  *priv    = NULL;
+        changelog_opt_t   *rec     = NULL;
+        size_t             rec_len = 0;
+        gf_boolean_t       term_ok = _gf_false;
+        int                ret     = -1;
+
+        priv = this->private;
+
+        /* PRE_OP_MARK + FOP + GFID + Offset */
+        CHANGELOG_INIT_NOCHECK (this, local, NULL, fd->inode->gfid, 4);
+        if (local == NULL)
+                goto out;
+
+        rec = changelog_get_usable_buffer (local);
+        if (rec == NULL)
+                goto out;
+
+        term_ok = changelog_fix_term (this, local, xdata);
+        if (term_ok == _gf_false)
+                goto out;
+
+        /* field 1: pre-op marker, stored without a converter */
+        CHANGELOG_FILL_USIGNLL (rec, PRE_OP_MARK, NULL, rec_len);
+        rec++;
+
+        /* field 2: fop number */
+        CHANGELOG_FILL_FOP_NUMBER (rec, frame->root->op, fop_fn, rec_len);
+        rec++;
+
+        /* field 3: GFID of the target inode */
+        CHANGELOG_FILL_UUID (rec, fd->inode->gfid, uuid_fn, rec_len);
+        rec++;
+
+        /* field 4: offset, converted by number_fn */
+        CHANGELOG_FILL_USIGNLL (rec, offset, number_fn, rec_len);
+
+        changelog_set_usable_record_and_length (local, rec_len, 4);
+
+        frame->local = local;
+        ret = 0;
+
+ out:
+        if (ret != 0)
+                changelog_local_cleanup (this, local);
+        return ret;
+}
+
+/**
+ * Prepare the replication-journal record for a writev.
+ *
+ * Record fields, in order: PRE_OP_MARK, fop number (frame->root->op),
+ * the GFID of fd->inode, the write offset and the write length as
+ * computed by iov_length (vector, count). The payload itself, flags and
+ * iobref are not recorded.
+ *
+ * Returns 0 with the populated local stored in frame->local. On any
+ * failure changelog_local_cleanup() is called on the local and -1 is
+ * returned.
+ */
+int32_t
+changelog_replication_writev (call_frame_t *frame,
+                              xlator_t *this, fd_t *fd, struct iovec *vector,
+                              int32_t count, off_t offset, uint32_t flags,
+                              struct iobref *iobref, dict_t *xdata)
+{
+        changelog_local_t *local   = NULL;
+        changelog_priv_t  *priv    = NULL;
+        changelog_opt_t   *rec     = NULL;
+        size_t             rec_len = 0;
+        gf_boolean_t       term_ok = _gf_false;
+        int                ret     = -1;
+
+        priv = this->private;
+
+        /* PRE_OP_MARK + FOP + GFID + Offset + Length */
+        CHANGELOG_INIT_NOCHECK (this, local, NULL, fd->inode->gfid, 5);
+        if (local == NULL)
+                goto out;
+
+        rec = changelog_get_usable_buffer (local);
+        if (rec == NULL)
+                goto out;
+
+        term_ok = changelog_fix_term (this, local, xdata);
+        if (term_ok == _gf_false)
+                goto out;
+
+        /* field 1: pre-op marker, stored without a converter */
+        CHANGELOG_FILL_USIGNLL (rec, PRE_OP_MARK, NULL, rec_len);
+        rec++;
+
+        /* field 2: fop number */
+        CHANGELOG_FILL_FOP_NUMBER (rec, frame->root->op, fop_fn, rec_len);
+        rec++;
+
+        /* field 3: GFID of the target inode */
+        CHANGELOG_FILL_UUID (rec, fd->inode->gfid, uuid_fn, rec_len);
+        rec++;
+
+        /* field 4: offset, converted by number_fn */
+        CHANGELOG_FILL_USIGNLL (rec, offset, number_fn, rec_len);
+        rec++;
+
+        /* field 5: total length of the I/O vector */
+        CHANGELOG_FILL_USIGNLL (rec, iov_length (vector, count),
+                               number_fn, rec_len);
+
+        changelog_set_usable_record_and_length (local, rec_len, 5);
+
+        frame->local = local;
+        ret = 0;
+
+ out:
+        if (ret != 0)
+                changelog_local_cleanup (this, local);
+        return ret;
+}
+
+/* overridden COPS (changelog operations) */
+/*
+ * Replication-policy "open" operation: inject a CHANGELOG_TYPE_ROLLOVER
+ * event that carries the journal name @name.
+ *
+ * @last is passed on as the event's crd_finale flag; @cpriv is unused.
+ *
+ * Returns whatever changelog_inject_single_event() returns, or -1 when
+ * @name does not fit into the event's name buffer.
+ */
+int
+changelog_replication_cops_open (xlator_t *this,
+                                 changelog_priv_t *priv, void *cpriv,
+                                 char *name, gf_boolean_t last)
+{
+        changelog_local_t          local = {0,};
+        changelog_log_data_t       cld   = {0,};
+        changelog_rollover_data_t *crd   = &cld.cld_roll;
+        int                        len   = 0;
+
+        cld.cld_type = CHANGELOG_TYPE_ROLLOVER;
+
+        crd->crd_finale = last;
+        crd->crd_use_suffix = _gf_false;
+        /*
+         * NOTE(review): 1<<10 is 1024, yet this line used to be commented
+         * "preallocate 1 MB" -- confirm the unit of crd_prealloc_size.
+         */
+        crd->crd_prealloc_size = 1<<10;
+
+        /*
+         * Bounded copy instead of strcpy().  This assumes crd_changelog_name
+         * is a char array embedded in the struct, which the original
+         * strcpy() into a zero-initialised struct implies.
+         */
+        len = snprintf (crd->crd_changelog_name,
+                        sizeof (crd->crd_changelog_name), "%s", name);
+        if ((len < 0) || ((size_t) len >= sizeof (crd->crd_changelog_name))) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "journal name too long: %s", name);
+                return -1;
+        }
+
+        local.lu.val = 0;
+        local.nr_bytes = 0;
+
+        return changelog_inject_single_event (this, priv, &local, &cld);
+}
+
+/**
+ * no implicit rollover: a rollover request is served exactly like an open
+ * of the journal called @name
+ */
+int
+changelog_replication_cops_rollover (xlator_t *this,
+                                     changelog_priv_t *priv, void *cpriv,
+                                     char *name, gf_boolean_t last)
+{
+        int ret = -1;
+
+        ret = changelog_replication_cops_open (this, priv, cpriv, name, last);
+
+        return ret;
+}
+
+/*
+ * Journal offset described by @local: lu.val units of JOURNAL_SECTOR_SIZE
+ * plus nr_bytes.  Returns 0 when @local is NULL.
+ */
+off_t
+changelog_replication_cops_get_offset (xlator_t *this,
+                                       changelog_priv_t *priv, void *cpriv,
+                                       changelog_local_t *local)
+{
+        off_t offset = 0;
+
+        if (local)
+                offset = (local->lu.val * JOURNAL_SECTOR_SIZE) + local->nr_bytes;
+
+        return offset;
+}
+
+/*
+ * Account for @bytes more bytes in @local by adding them to nr_bytes.
+ * A NULL @local is ignored, matching the NULL handling of
+ * changelog_replication_cops_get_offset().
+ */
+void
+changelog_replication_cops_set_offset (xlator_t *this,
+                                       changelog_priv_t *priv, void *cpriv,
+                                       changelog_local_t *local, off_t bytes)
+{
+        if (!local)
+                return;
+
+        local->nr_bytes += bytes;
+}
+
+/* Offset reset is a deliberate no-op for the replication policy. */
+void
+changelog_replication_cops_reset_offset (xlator_t *this, changelog_priv_t *priv,
+                                         void *cpriv, changelog_local_t *local)
+{
+        /* nothing to do */
+}
+
+/*
+ * Set up the replication (NSR) logging policy in @cp.
+ *
+ * Allocates private copies of the default fop and changelog-op tables,
+ * overrides the entries the replication policy implements itself, and stores
+ * both tables in @cp.  Also resets the rollover, fsync, record-header and
+ * term settings in @priv and sets cp->changelog_name to JOURNAL_NAME.
+ *
+ * Returns 0 on success, -1 if either table cannot be allocated (in which
+ * case @priv and @cp are left untouched).
+ */
+int
+changelog_replication_policy_init (xlator_t *this,
+                                   changelog_priv_t *priv,
+                                   struct changelog_logpolicy *cp)
+{
+        struct xlator_fops   *fops = NULL;
+        struct changelog_ops *cops = NULL;
+
+        fops = GF_CALLOC (1, sizeof (*fops), gf_changelog_mt_fop_policy_t);
+        if (!fops)
+                goto err;
+
+        cops = GF_CALLOC (1, sizeof (*cops), gf_changelog_mt_fop_policy_t);
+        if (!cops)
+                goto free_fops;
+
+        /* start from the default tables ... */
+        memcpy (fops, &changelog_default_fops, sizeof (*fops));
+        memcpy (cops, &changelog_default_cops, sizeof (*cops));
+
+        /* ... then overload all fops */
+        fops->writev       = changelog_replication_writev;
+        fops->ftruncate    = changelog_replication_ftruncate;
+        fops->truncate     = changelog_replication_truncate;
+        fops->fsetxattr    = changelog_replication_fsetxattr;
+        fops->setxattr     = changelog_replication_setxattr;
+        fops->removexattr  = changelog_replication_removexattr;
+        fops->fremovexattr = changelog_replication_fremovexattr;
+        fops->setattr      = changelog_replication_setattr;
+        fops->fsetattr     = changelog_replication_fsetattr;
+        fops->create       = changelog_replication_create;
+        fops->mknod        = changelog_replication_mknod;
+        fops->symlink      = changelog_replication_symlink;
+        fops->mkdir        = changelog_replication_mkdir;
+        fops->link         = changelog_replication_link;
+        fops->rename       = changelog_replication_rename;
+        fops->unlink       = changelog_replication_unlink;
+        fops->rmdir        = changelog_replication_rmdir;
+
+        /* ... and the cops */
+        cops->open         = changelog_replication_cops_open;
+        cops->rollover     = changelog_replication_cops_rollover;
+        cops->get_offset   = changelog_replication_cops_get_offset;
+        cops->set_offset   = changelog_replication_cops_set_offset;
+        cops->reset_offset = changelog_replication_cops_reset_offset;
+
+        /* no roll-over, one big fat journal per term */
+        priv->rollover_time = 0;
+
+        /* fsync() is internally triggered by NSR */
+        priv->fsync_interval = 0;
+
+        /* no record header: extra data (via iobufs) are always persisted */
+        priv->no_gfid_hdr = _gf_true;
+
+        priv->term = 0;
+
+        /* zero-filled buffer holding JOURNAL_NAME */
+        (void) memset (cp->changelog_name, '\0', PATH_MAX);
+        memcpy(cp->changelog_name, JOURNAL_NAME, strlen(JOURNAL_NAME));
+
+        cp->fops = fops;
+        cp->cops = cops;
+
+        return 0;
+
+free_fops:
+        GF_FREE (fops);
+err:
+        return -1;
+}
+
+/*
+ * Release the fop and changelog-op tables that
+ * changelog_replication_policy_init() stored in @cp.  The pointers are
+ * cleared afterwards so that a stale @cp cannot be used or freed twice.
+ *
+ * Always returns 0.
+ */
+int
+changelog_replication_policy_fini (xlator_t *this,
+                                   struct changelog_logpolicy *cp)
+{
+        GF_FREE (cp->fops);
+        cp->fops = NULL;
+
+        GF_FREE (cp->cops);
+        cp->cops = NULL;
+
+        return 0;
+}
diff --git a/xlators/features/changelog/src/policy/changelog-policy.h b/xlators/features/changelog/src/policy/changelog-policy.h
new file mode 100644
index 000000000..73fdc1a98
--- /dev/null
+++ b/xlators/features/changelog/src/policy/changelog-policy.h
@@ -0,0 +1,41 @@
+/*
+   Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CHANGELOG_POLICY_H
+#define _CHANGELOG_POLICY_H
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "xlator.h"
+#include "defaults.h"
+#include "logging.h"
+
+#include "changelog-mem-types.h"
+#include "changelog-helpers.h"
+
+/*
+ * Changelog logging policies.  Each policy provides an init routine that
+ * prepares a struct changelog_logpolicy and a fini routine that tears it
+ * down again.
+ */
+
+/* Default policy (defined elsewhere; presumably mirrors the pair below). */
+int
+changelog_default_policy_init (xlator_t *this,
+                               changelog_priv_t *priv,
+                               struct changelog_logpolicy *cp);
+int
+changelog_default_policy_fini (xlator_t *this,
+                               struct changelog_logpolicy *cp);
+
+/*
+ * Replication policy.  init allocates the fop/cop tables stored in @cp and
+ * returns 0, or -1 on allocation failure; fini frees them and returns 0.
+ */
+int
+changelog_replication_policy_init (xlator_t *this,
+                                   changelog_priv_t *priv,
+                                   struct changelog_logpolicy *cp);
+int
+changelog_replication_policy_fini (xlator_t *this,
+                                   struct changelog_logpolicy *cp);
+
+#endif /* _CHANGELOG_POLICY_H */
diff --git a/xlators/mgmt/glusterd/src/Makefile.am b/xlators/mgmt/glusterd/src/Makefile.am
index a6f49ae01..cbb6353f8 100644
--- a/xlators/mgmt/glusterd/src/Makefile.am
+++ b/xlators/mgmt/glusterd/src/Makefile.am
@@ -11,7 +11,8 @@ glusterd_la_SOURCES = glusterd.c glusterd-handler.c glusterd-sm.c \
 	glusterd-volgen.c glusterd-rebalance.c glusterd-quota.c \
 	glusterd-geo-rep.c glusterd-replace-brick.c glusterd-log-ops.c \
 	glusterd-volume-ops.c glusterd-brick-ops.c glusterd-mountbroker.c \
-	glusterd-syncop.c glusterd-hooks.c glusterd-volume-set.c
+	glusterd-syncop.c glusterd-hooks.c glusterd-volume-set.c \
+	glusterd-etcd.c
 
 glusterd_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
 		     $(top_builddir)/rpc/xdr/src/libgfxdr.la \
@@ -21,7 +22,7 @@ glusterd_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
 noinst_HEADERS = glusterd.h glusterd-utils.h glusterd-op-sm.h \
 	glusterd-sm.h glusterd-store.h glusterd-mem-types.h \
 	glusterd-pmap.h glusterd-volgen.h glusterd-mountbroker.h \
-	glusterd-syncop.h glusterd-hooks.h
+	glusterd-syncop.h glusterd-hooks.h glusterd-etcd.h
 
 AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
 	-I$(rpclibdir) -I$(CONTRIBDIR)/rbtree \
diff --git a/xlators/mgmt/glusterd/src/glusterd-etcd.c b/xlators/mgmt/glusterd/src/glusterd-etcd.c
new file mode 100644
index 000000000..3382e20ae
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-etcd.c
@@ -0,0 +1,86 @@
+/*
+   Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <errno.h>
+#include <signal.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "glusterfs.h"
+#include "run.h"
+#include "glusterd-etcd.h"
+
+#define GLUSTERD_ETCD_DIR       "/var/lib/glusterd/etcd"
+#define GLUSTERD_ETCD_CMD       "/root/etcd/etcd"
+
+/*
+ * Launch an etcd process for this node.
+ *
+ * @this_host is passed to etcd as its "-n" argument.  The "-c" and "-s"
+ * arguments are built from the local host name (ports 4001 and 7001).  When
+ * @other_host is non-NULL, etcd is additionally given "-C <other_host>:7001";
+ * otherwise it is started standalone.  GLUSTERD_ETCD_DIR is created if it
+ * does not exist yet and handed to etcd via "-d".
+ *
+ * Returns the child PID recorded by the runner, or -1 on failure.
+ */
+pid_t
+start_etcd (char *this_host, char *other_host)
+{
+        char            local_host[256];
+        runner_t        etcd;
+        int             rc = 0;
+
+        /* leave room for the terminator gethostname() may omit */
+        rc = gethostname (local_host, sizeof (local_host) - 1);
+        if (rc != 0) {
+                gf_log (__func__, GF_LOG_ERROR, "gethostname failed?!?");
+                return -1;
+        }
+        local_host[sizeof (local_host) - 1] = '\0';
+
+        /* an already existing directory is fine */
+        rc = mkdir (GLUSTERD_ETCD_DIR, 0700);
+        if ((rc < 0) && (errno != EEXIST)) {
+                gf_log (__func__, GF_LOG_ERROR,
+                        "failed to create %s", GLUSTERD_ETCD_DIR);
+                return -1;
+        }
+
+        runinit (&etcd);
+        runner_add_args (&etcd, GLUSTERD_ETCD_CMD,
+                         "-n", this_host, "-d", GLUSTERD_ETCD_DIR,
+                         "-c", NULL);
+        runner_argprintf (&etcd, "%s:4001", local_host);
+        runner_add_arg (&etcd, "-s");
+        runner_argprintf (&etcd, "%s:7001", local_host);
+
+        if (!other_host) {
+                gf_log (__func__, GF_LOG_INFO, "starting etcd standalone");
+        } else {
+                runner_add_arg (&etcd, "-C");
+                runner_argprintf (&etcd, "%s:7001", other_host);
+                gf_log (__func__, GF_LOG_INFO, "starting etcd  via %s", other_host);
+        }
+
+        /*
+         * Why runner_start: runner_run would wait for etcd to exit, while
+         * runner_run_nowait would detach it so thoroughly that it outlives
+         * us and would leave chpid naming the transient middle process
+         * instead of the one we may have to kill later.  runner_start seems
+         * to do exactly what we need.
+         */
+        rc = runner_start (&etcd);
+        if (rc != 0) {
+                gf_log (__func__, GF_LOG_ERROR,
+                        "failed to start %s", GLUSTERD_ETCD_CMD);
+                return -1;
+        }
+
+        return etcd.chpid;
+}
+
+/*
+ * Kill the etcd process @pid with SIGKILL and reap it.
+ *
+ * Values of @pid that are not positive (for instance the -1 returned by a
+ * failed start_etcd()) are ignored.
+ */
+void
+stop_etcd (pid_t pid)
+{
+        if (pid <= 0)
+                return;
+
+        gf_log (__func__, GF_LOG_INFO, "killing etcd %d", (int) pid);
+        (void) kill (pid, SIGKILL);
+
+        /* retry when interrupted so the child is not left as a zombie */
+        while ((waitpid (pid, NULL, 0) < 0) && (errno == EINTR))
+                ;
+}
+
+/*
+ * Remove GLUSTERD_ETCD_DIR and everything below it by running "rm -rf" on
+ * it.  The result of the command is deliberately ignored.
+ */
+void
+nuke_etcd_dir (void)
+{
+        (void)runcmd("rm","-rf",GLUSTERD_ETCD_DIR,NULL);
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-etcd.h b/xlators/mgmt/glusterd/src/glusterd-etcd.h
new file mode 100644
index 000000000..9459f6bbd
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-etcd.h
@@ -0,0 +1,23 @@
+/*
+   Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef _GLUSTERD_ETCD_H_
+#define _GLUSTERD_ETCD_H_
+
+#include <sys/types.h>
+#include "glusterfs.h"
+
+/*
+ * Start etcd named @this_host; join the cluster via @other_host, or run
+ * standalone when @other_host is NULL.  Returns the etcd PID, -1 on failure.
+ */
+pid_t   start_etcd      (char *this_host, char *other_host);
+
+/* Kill (SIGKILL) and reap the etcd process @pid; ignored unless @pid > 0. */
+void    stop_etcd       (pid_t pid);
+
+/* Remove the etcd data directory and its contents. */
+void    nuke_etcd_dir   (void);
+
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c
index 3aafa122b..c7bf53b4e 100644
--- a/xlators/mgmt/glusterd/src/glusterd-handler.c
+++ b/xlators/mgmt/glusterd/src/glusterd-handler.c
@@ -49,6 +49,7 @@
 
 #include "globals.h"
 #include "glusterd-syncop.h"
+#include "glusterd-etcd.h"
 
 #ifdef HAVE_BD_XLATOR
 #include <lvm2app.h>
@@ -2380,7 +2381,18 @@ __glusterd_handle_probe_query (rpcsvc_request_t *req)
                         gf_log ("", GF_LOG_ERROR, "Failed to add peer %s",
                                 remote_hostname);
                         rsp.op_errno = GF_PROBE_ADD_FAILED;
+                        goto respond;
                 }
+                gf_log (THIS->name, GF_LOG_INFO,
+                        "joining, should point etcd at %s", remote_hostname);
+                /*
+                 * We should have started a standalone etcd before.  Now we
+                 * need a new one, with a new config.
+                 */
+                stop_etcd(conf->etcd_pid);
+                nuke_etcd_dir();
+                conf->etcd_pid = start_etcd (uuid_utoa(MY_UUID),
+                                             remote_hostname);
         }
 
 respond:
diff --git a/xlators/mgmt/glusterd/src/glusterd-sm.c b/xlators/mgmt/glusterd/src/glusterd-sm.c
index c671edf68..2490ba665 100644
--- a/xlators/mgmt/glusterd/src/glusterd-sm.c
+++ b/xlators/mgmt/glusterd/src/glusterd-sm.c
@@ -34,6 +34,7 @@
 #include "glusterd-op-sm.h"
 #include "glusterd-utils.h"
 #include "glusterd-store.h"
+#include "glusterd-etcd.h"
 
 static struct list_head gd_friend_sm_queue;
 
@@ -596,6 +597,9 @@ glusterd_ac_handle_friend_remove_req (glusterd_friend_sm_event_t *event,
                         "Peer detach cleanup was not successful");
                 ret = 0;
         }
+        gf_log (THIS->name, GF_LOG_INFO, "detached, stopping etcd");
+        stop_etcd(priv->etcd_pid);
+        nuke_etcd_dir();
 out:
         gf_log (THIS->name, GF_LOG_DEBUG, "Returning with %d", ret);
 
@@ -642,6 +646,11 @@ glusterd_ac_handle_friend_add_req (glusterd_friend_sm_event_t *event, void *ctx)
         int                             status = 0;
         int32_t                         op_ret = -1;
         int32_t                         op_errno = 0;
+        xlator_t *this = NULL;
+        glusterd_conf_t *priv = NULL;
+
+        this = THIS;
+        priv = this->private;
 
         GF_ASSERT (ctx);
         ev_ctx = ctx;
@@ -692,6 +701,13 @@ glusterd_ac_handle_friend_add_req (glusterd_friend_sm_event_t *event, void *ctx)
                                              peerinfo->hostname, ev_ctx->port,
                                              op_ret, op_errno);
 
+        // apply a deterministic function to decide via whom we should join the cluster
+        if (strcmp(peerinfo->hostname, ev_ctx->hostname) > 0) {
+                stop_etcd(priv->etcd_pid);
+                nuke_etcd_dir();
+                priv->etcd_pid = start_etcd (uuid_utoa(MY_UUID), peerinfo->hostname);
+        }
+
 out:
         gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
 
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
index 15c40f3e4..bcb2dc703 100644
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
@@ -1483,6 +1483,7 @@ glusterd_volume_stop_glusterfs (glusterd_volinfo_t  *volinfo,
         glusterd_conf_t         *priv = NULL;
         char                    pidfile[PATH_MAX] = {0,};
         int                     ret = 0;
+        glusterd_conf_t         *conf = NULL;
 
         GF_ASSERT (volinfo);
         GF_ASSERT (brickinfo);
@@ -1501,6 +1502,8 @@ glusterd_volume_stop_glusterfs (glusterd_volinfo_t  *volinfo,
                 if (ret == 0) {
                         glusterd_set_brick_status (brickinfo, GF_BRICK_STOPPED);
                         (void) glusterd_brick_unlink_socket_file (volinfo, brickinfo);
+                        GLUSTERD_GET_BRICK_RECON_PIDFILE (pidfile, volinfo, brickinfo, priv);
+                        ret = glusterd_service_stop ("recon", pidfile, SIGTERM, _gf_false);
                 }
         }
 
diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c
index dcff8c305..59bc7bcd5 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volgen.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c
@@ -101,7 +101,6 @@ xlator_instantiate_va (const char *type, const char *format, va_list arg)
         return NULL;
 }
 
-#ifdef __not_used_as_of_now_
 static xlator_t *
 xlator_instantiate (const char *type, const char *format, ...)
 {
@@ -114,7 +113,6 @@ xlator_instantiate (const char *type, const char *format, ...)
 
         return xl;
 }
-#endif
 
 static int
 volgen_xlator_link (xlator_t *pxl, xlator_t *cxl)
@@ -1400,6 +1398,303 @@ server_spec_extended_option_handler (volgen_graph_t *graph,
 
 static void get_vol_tstamp_file (char *filename, glusterd_volinfo_t *volinfo);
 
+/*
+ * Add an unlinked protocol/client xlator named "<volname>-client-<index>"
+ * to @graph and point it at @peer's brick (host name and path) over the
+ * "socket" transport.
+ *
+ * Returns the new xlator, or NULL if it cannot be created or configured.
+ */
+xlator_t *
+add_one_peer (volgen_graph_t *graph, glusterd_brickinfo_t *peer,
+              char *volname, uint16_t index)
+{
+        xlator_t        *client = NULL;
+
+        client = volgen_graph_add_nolink (graph, "protocol/client",
+                                          "%s-client-%u", volname, index);
+        if (!client)
+                return NULL;
+
+        /* TBD: figure out where to get the proper transport list */
+        /* TBD: deal with RDMA, SSL */
+        if (xlator_set_option (client, "transport-type", "socket") ||
+            xlator_set_option (client, "remote-host", peer->hostname) ||
+            xlator_set_option (client, "remote-subvolume", peer->path))
+                return NULL;
+
+        return client;
+}
+
+/*
+ * Number the replica groups of @volinfo: walking the brick list in order,
+ * every run of replica_count consecutive bricks gets the same group number,
+ * starting at 0.
+ */
+void
+assign_groups (glusterd_volinfo_t *volinfo)
+{
+        glusterd_brickinfo_t    *brick          = NULL;
+        uint16_t                group           = 0;
+        int                     members         = 0;
+
+        list_for_each_entry (brick, &volinfo->bricks, brick_list) {
+                brick->group = group;
+                members++;
+                if (members < volinfo->replica_count)
+                        continue;
+                /* this group is full: the next brick opens a new one */
+                members = 0;
+                group++;
+        }
+}
+
+/* snprintf() that treats truncation as an error: 0 on success, else -1. */
+static int
+nsr_snprintf (char *buf, size_t size, const char *fmt, ...)
+{
+        va_list         ap;
+        int             len = 0;
+
+        va_start (ap, fmt);
+        len = vsnprintf (buf, size, fmt, ap);
+        va_end (ap);
+
+        return ((len < 0) || ((size_t) len >= size)) ? -1 : 0;
+}
+
+/*
+ * Write the three reconciliation volfiles of one brick into the volume's
+ * GLUSTERD_BRICK_INFO_DIR directory:
+ *   "<host>:<brick>-nsr-recon.vol"  cluster/nsr_recon below protocol/server
+ *   "con:<host>:<brick>"            protocol/client for that recon server
+ *   "data:<host>:<brick>"           protocol/client for the brick itself
+ * where <brick> is the brick path with every '/' replaced by '-'.
+ *
+ * Returns 0 on success, -1 on failure.
+ *
+ * NOTE(review): as in the original code, the graph in @ng is reset with
+ * bzero() between volfiles rather than released -- check whether the
+ * xlators added to it need freeing.
+ */
+static int
+nsr_write_recon_volfiles (glusterd_brickinfo_t *brickinfo,
+                          glusterd_volinfo_t *volinfo, char *volname,
+                          char *changelog_basepath, char *local_name,
+                          char *remote_names, uint32_t replica_group_size)
+{
+        volgen_graph_t          ng = {0,};
+        xlator_t                *xl = NULL;
+        glusterd_conf_t         *priv = THIS->private;
+        char                    *flat = NULL;
+        char                    *ptr = NULL;
+        char                    *username = NULL;
+        char                    *password = NULL;
+        char                    *port = "27000";
+        char                    path[PATH_MAX] = {0,};
+        char                    filepath[PATH_MAX] = {0,};
+        char                    lp[PATH_MAX] = {0,};
+        char                    auth[PATH_MAX + 32] = {0,};
+        char                    transt[16] = {0,};
+        char                    count[32] = {0,};
+        int                     ret = -1;
+
+        /* the brick path with '/' flattened to '-' names the volfiles */
+        flat = gf_strdup (brickinfo->path);
+        if (!flat)
+                return -1;
+        for (ptr = strchr (flat, '/'); ptr; ptr = strchr (ptr, '/'))
+                *ptr = '-';
+
+        GLUSTERD_GET_VOLUME_DIR (path, volinfo, priv);
+        if (nsr_snprintf (lp, sizeof (lp), "%s/recon", brickinfo->path) ||
+            nsr_snprintf (auth, sizeof (auth), "auth.addr.%s.allow", lp) ||
+            nsr_snprintf (count, sizeof (count), "%u",
+                          (unsigned int) replica_group_size))
+                goto out;
+
+        /* 1. the recon server */
+        if (nsr_snprintf (filepath, sizeof (filepath),
+                          "%s/%s/%s:%s-nsr-recon.vol", path,
+                          GLUSTERD_BRICK_INFO_DIR, brickinfo->hostname, flat))
+                goto out;
+        gf_log ("glusterd", GF_LOG_INFO,
+               "writing nsr recon volfile in %s\n",
+               filepath);
+
+        xl = volgen_graph_add_as (&ng, "cluster/nsr_recon", lp);
+        if (!xl)
+                goto out;
+        if (xlator_set_option (xl, "replica-group-size", count) ||
+            xlator_set_option (xl, "local-member", local_name) ||
+            xlator_set_option (xl, "replica-group-members", remote_names) ||
+            xlator_set_option (xl, "vol-name", volname) ||
+            xlator_set_option (xl, "changelog-dir", changelog_basepath) ||
+            xlator_set_option (xl, "base-dir", brickinfo->path))
+                goto out;
+
+        xl = volgen_graph_add (&ng, "protocol/server", lp);
+        if (!xl)
+                goto out;
+        get_vol_transport_type (volinfo, transt);
+        if (xlator_set_option (xl, "transport-type", transt) ||
+            xlator_set_option (xl, "transport.socket.listen-port", port) ||
+            xlator_set_option (xl, auth, "*") ||
+            xlator_set_option (xl, "rpc-auth.auth-null", "off") ||
+            xlator_set_option (xl, "rpc-auth.auth-unix", "off") ||
+            xlator_set_option (xl, "rpc-auth.auth-glusterfs", "off"))
+                goto out;
+        if (volgen_write_volfile (&ng, filepath))
+                goto out;
+
+        /* 2. the client that talks to the recon server above */
+        bzero (&ng, sizeof (ng));
+        xl = volgen_graph_add_nolink (&ng, "protocol/client",
+                                      "%s-client-%u", lp, 0);
+        if (!xl)
+                goto out;
+        if (xlator_set_option (xl, "remote-host", brickinfo->hostname) ||
+            xlator_set_option (xl, "remote-subvolume", lp) ||
+            xlator_set_option (xl, "transport-type", transt) ||
+            xlator_set_option (xl, "remote-port", port))
+                goto out;
+        if (nsr_snprintf (filepath, sizeof (filepath), "%s/%s/con:%s:%s", path,
+                          GLUSTERD_BRICK_INFO_DIR, brickinfo->hostname, flat))
+                goto out;
+        if (volgen_write_volfile (&ng, filepath))
+                goto out;
+
+        /* 3. the client that talks to the brick, with the volume's login */
+        bzero (&ng, sizeof (ng));
+        xl = volgen_graph_add_nolink (&ng, "protocol/client",
+                                      "%s-client-%u", lp, 0);
+        if (!xl)
+                goto out;
+        username = glusterd_auth_get_username (volinfo);
+        password = glusterd_auth_get_password (volinfo);
+        if (xlator_set_option (xl, "remote-host", brickinfo->hostname) ||
+            xlator_set_option (xl, "remote-subvolume", brickinfo->path) ||
+            xlator_set_option (xl, "transport-type", transt) ||
+            xlator_set_option (xl, "username", username) ||
+            xlator_set_option (xl, "password", password))
+                goto out;
+        if (nsr_snprintf (filepath, sizeof (filepath), "%s/%s/data:%s:%s", path,
+                          GLUSTERD_BRICK_INFO_DIR, brickinfo->hostname, flat))
+                goto out;
+        if (volgen_write_volfile (&ng, filepath))
+                goto out;
+
+        ret = 0;
+out:
+        GF_FREE (flat);
+        return ret;
+}
+
+/*
+ * Put a cluster/nsr xlator on top of the brick graph @graph and give it one
+ * protocol/client child for every other brick in @brickinfo's replica group
+ * (groups must have been numbered with assign_groups() beforehand).
+ *
+ * The xlator gets the options "vol-name", "my-name" ("host:path" of this
+ * brick), "leader" ("yes" for the first brick of a group, "no" otherwise)
+ * and "etcd-servers" (comma-separated host names of the whole group).
+ * When the volume option "cluster.nsr.recon" is on, the reconciliation
+ * volfiles for this brick are written as well.
+ *
+ * Returns 0 on success, -1 on failure (including names that do not fit the
+ * NSR_MAX_PATH_SIZE based buffers).
+ */
+int
+add_nsr_stuff (volgen_graph_t *graph, char *volname,
+               glusterd_brickinfo_t *brickinfo, glusterd_volinfo_t *volinfo,
+               char *changelog_basepath)
+{
+        xlator_t                *me = NULL;
+        xlator_t                *kid = NULL;
+        glusterd_brickinfo_t    *peer = NULL;
+        uint16_t                index = 0;
+        uint32_t                replica_group_size = 1;
+        gf_boolean_t            enable_recon = _gf_false;
+        int                     forward = 0;
+        size_t                  nlen = 0;   /* bytes used in remote_names */
+        size_t                  hlen = 0;   /* bytes used in hosts */
+        char                    *leader_opt = NULL;
+        char                    local_name[NSR_MAX_PATH_SIZE];
+        char                    hosts[NSR_MAX_PATH_SIZE * NSR_MAX_REPLICA_GROUP_SIZE];
+        char                    remote_names[NSR_MAX_REPLICA_GROUP_SIZE * NSR_MAX_PATH_SIZE];
+
+        if (glusterd_volinfo_get_boolean (volinfo, "cluster.nsr.recon") > 0)
+                enable_recon = _gf_true;
+
+        /* Create the NSR xlator, but defer linkage for now. */
+        me = xlator_instantiate ("cluster/nsr", "%s-nsr", volname);
+        if (!me || volgen_xlator_link (me, first_of (graph)))
+                return -1;
+
+        remote_names[0] = '\0';
+        if (nsr_snprintf (local_name, sizeof (local_name), "%s:%s",
+                          brickinfo->hostname, brickinfo->path) ||
+            nsr_snprintf (hosts, sizeof (hosts), "%s", brickinfo->hostname))
+                return -1;
+        hlen = strlen (hosts);
+
+        /* A brick with no predecessor in its own group gets leader=yes. */
+        peer = list_prev (brickinfo, &volinfo->bricks,
+                          glusterd_brickinfo_t, brick_list);
+        leader_opt = (!peer || (peer->group != brickinfo->group)) ? "yes"
+                                                                  : "no";
+        if (xlator_set_option (me, "vol-name", volname) ||
+            xlator_set_option (me, "my-name", local_name) ||
+            xlator_set_option (me, "leader", leader_opt))
+                return -1;
+
+        /*
+         * Collect the rest of the replica group: walk the brick list
+         * backwards from this brick, then forwards, stopping in each
+         * direction at the list end or at the first brick of another group.
+         */
+        for (forward = 0; forward <= 1; forward++) {
+                peer = brickinfo;
+                for (;;) {
+                        if (forward)
+                                peer = list_next (peer, &volinfo->bricks,
+                                                  glusterd_brickinfo_t,
+                                                  brick_list);
+                        else
+                                peer = list_prev (peer, &volinfo->bricks,
+                                                  glusterd_brickinfo_t,
+                                                  brick_list);
+                        if (!peer || (peer->group != brickinfo->group))
+                                break;
+
+                        gf_log ("glusterd", GF_LOG_INFO,
+                                "%s:%s needs client for %s:%s",
+                                brickinfo->hostname, brickinfo->path,
+                                peer->hostname, peer->path);
+                        kid = add_one_peer (graph, peer, volname, index++);
+                        if (!kid || volgen_xlator_link (me, kid))
+                                return -1;
+
+                        /* remote_names grows by "host:path,", hosts by ",host" */
+                        if (nsr_snprintf (remote_names + nlen,
+                                          sizeof (remote_names) - nlen,
+                                          "%s:%s,", peer->hostname,
+                                          peer->path) ||
+                            nsr_snprintf (hosts + hlen, sizeof (hosts) - hlen,
+                                          ",%s", peer->hostname))
+                                return -1;
+                        nlen += strlen (remote_names + nlen);
+                        hlen += strlen (hosts + hlen);
+                        replica_group_size++;
+                }
+        }
+
+        /* to remove the final "," */
+        if (nlen)
+                remote_names[nlen - 1] = '\0';
+        if (xlator_set_option (me, "etcd-servers", hosts))
+                return -1;
+
+        /* Finish linkage to client file */
+        glusterfs_graph_set_first (&graph->graph, me);
+
+        if (enable_recon == _gf_false)
+                return 0;
+
+        /* Now fill in the various files required for reconciliation */
+        return nsr_write_recon_volfiles (brickinfo, volinfo, volname,
+                                         changelog_basepath, local_name,
+                                         remote_names, replica_group_size);
+}
+
 static int
 server_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
                       dict_t *set_dict, void *param)
@@ -1506,10 +1801,17 @@ server_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
         if (ret)
                 return -1;
 
+        if (glusterd_volinfo_get_boolean(volinfo,"cluster.nsr") > 0) {
+                ret = xlator_set_option (xl, "encoding", "ascii");
+                if (ret)
+                        return -1;
+        }
+
         ret = check_and_add_debug_xl (graph, set_dict, volname, "changelog");
         if (ret)
                 return -1;
 
+
         xl = volgen_graph_add (graph, "features/access-control", volname);
         if (!xl)
                 return -1;
@@ -1584,9 +1886,19 @@ server_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
                         return -1;
         }
 
-        xl = volgen_graph_add (graph, "features/index", volname);
-        if (!xl)
-                return -1;
+        /* TBD: conditionalize on NSR being enabled */
+        if (glusterd_volinfo_get_boolean(volinfo,"cluster.nsr") > 0) {
+                assign_groups(volinfo);
+                ret = add_nsr_stuff (graph, volname, brickinfo, volinfo, changelog_basepath);
+                if (ret) {
+                        return -1;
+                }
+        }
+        else {
+                xl = volgen_graph_add (graph, "features/index", volname);
+                if (!xl)
+                        return -1;
+        }
 
         snprintf (index_basepath, sizeof (index_basepath), "%s/%s",
                   path, ".glusterfs/indices");
@@ -2407,8 +2719,8 @@ volume_volgen_graph_build_clusters (volgen_graph_t *graph,
                                     glusterd_volinfo_t *volinfo,
                                     gf_boolean_t is_quotad)
 {
-        char                    *replicate_args[]   = {"cluster/replicate",
-                                                       "%s-replicate-%d"};
+        char                    *replicate_type     = "cluster/replicate";
+        char                    *replicate_fmt      = "%s-replicate-%d";
         char                    *stripe_args[]      = {"cluster/stripe",
                                                        "%s-stripe-%d"};
         int                     rclusters           = 0;
@@ -2422,12 +2734,16 @@ volume_volgen_graph_build_clusters (volgen_graph_t *graph,
         if (volinfo->dist_leaf_count == 1)
                 goto build_distribute;
 
+        if (glusterd_volinfo_get_boolean(volinfo,"cluster.nsr") > 0) {
+                replicate_type = "cluster/nsrc";
+        }
+
         /* All other cases, it will have one or the other cluster type */
         switch (volinfo->type) {
         case GF_CLUSTER_TYPE_REPLICATE:
                 clusters = volgen_graph_build_clusters (graph, volinfo,
-                                                        replicate_args[0],
-                                                        replicate_args[1],
+                                                        replicate_type,
+                                                        replicate_fmt,
                                                         volinfo->brick_count,
                                                         volinfo->replica_count);
                 if (clusters < 0)
@@ -2447,8 +2763,8 @@ volume_volgen_graph_build_clusters (volgen_graph_t *graph,
                 if (volinfo->replica_count == 0)
                         goto out;
                 clusters = volgen_graph_build_clusters (graph, volinfo,
-                                                        replicate_args[0],
-                                                        replicate_args[1],
+                                                        replicate_type,
+                                                        replicate_fmt,
                                                         volinfo->brick_count,
                                                         volinfo->replica_count);
                 if (clusters < 0)
diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.h b/xlators/mgmt/glusterd/src/glusterd-volgen.h
index 1683f9050..4411bc4de 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volgen.h
+++ b/xlators/mgmt/glusterd/src/glusterd-volgen.h
@@ -35,6 +35,10 @@
 #define AUTH_REJECT_OPT_KEY "auth.addr.*.reject"
 #define NFS_DISABLE_OPT_KEY "nfs.*.disable"
 
+/* TBD: take these values from a common configuration file. */
+#define NSR_MAX_REPLICA_GROUP_SIZE      8
+#define NSR_MAX_PATH_SIZE               (1024 + PATH_MAX)
+#define NSR_CONF_PATH                   "/var/lib/glusterd/nsr/"
 
 typedef enum {
         GF_CLIENT_TRUSTED,
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index 131f96ce6..f209d1ad9 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -860,6 +860,19 @@ struct volopt_map_entry glusterd_volopt_map[] = {
           .type        = NO_DOC,
           .op_version  = 2
         },
+        { .key         = "cluster.nsr",
+          .voltype     = "cluster/nsr",
+          .option      = "!nsr",
+          .op_version  = 3,
+          .description = "enable NSR instead of AFR for replication",
+          .flags       = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT
+        },
+        { .key         = "cluster.nsr.recon",
+          .voltype     = "cluster/nsr",
+          .op_version  = 3,
+          .description = "enable NSR reconciliation",
+          .flags       = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT
+        },
 
         /* Performance xlators enable/disbable options */
         { .key         = "performance.write-behind",
diff --git a/xlators/mgmt/glusterd/src/glusterd.c b/xlators/mgmt/glusterd/src/glusterd.c
index c2be2c9da..c3fccf8e1 100644
--- a/xlators/mgmt/glusterd/src/glusterd.c
+++ b/xlators/mgmt/glusterd/src/glusterd.c
@@ -1354,7 +1354,21 @@ init (xlator_t *this)
 
         if (list_empty (&conf->peers)) {
                 glusterd_launch_synctask (glusterd_spawn_daemons, NULL);
+                gf_log (this->name, GF_LOG_INFO,
+                        "no peers, should start FRESH etcd");
+                /*
+                 * We might not have any peers now, but if we did once before
+                 * then we don't want to start up with a config that still has
+                 * references to them.
+                 */
+                nuke_etcd_dir();
         }
+        else {
+                gf_log (this->name, GF_LOG_INFO,
+                        "have peers, should start etcd with old config");
+        }
+        conf->etcd_pid = start_etcd(uuid_utoa(MY_UUID),NULL);
+
         ret = glusterd_options_init (this);
         if (ret < 0)
                 goto out;
@@ -1400,6 +1414,8 @@ fini (xlator_t *this)
         conf = this->private;
 
         glusterd_stop_uds_listener (this);
+        stop_etcd(conf->etcd_pid);
+        nuke_etcd_dir();
 
         FREE (conf->pmap);
         if (conf->handle)
diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h
index e1e9e591f..d2c88609e 100644
--- a/xlators/mgmt/glusterd/src/glusterd.h
+++ b/xlators/mgmt/glusterd/src/glusterd.h
@@ -38,6 +38,7 @@
 #include "cli1-xdr.h"
 #include "syncop.h"
 #include "store.h"
+#include "glusterd-etcd.h"
 
 #define GLUSTERD_MAX_VOLUME_NAME        1000
 #define GLUSTERD_TR_LOG_SIZE            50
@@ -157,6 +158,7 @@ typedef struct {
         gf_boolean_t  restart_done;
         rpcsvc_t     *uds_rpc;  /* RPCSVC for the unix domain socket */
         uint32_t      base_port;
+        pid_t         etcd_pid;
 } glusterd_conf_t;
 
 
@@ -180,6 +182,15 @@ struct glusterd_brickinfo {
         int                decommissioned;
         char vg[PATH_MAX]; /* FIXME: Use max size for length of vg */
         int     caps; /* Capability */
+        /*
+         * The group is used to identify which bricks are part of the same
+         * replica set during brick-volfile generation, so that NSR volfiles
+         * can "cross-connect" the bricks to one another.  This same approach
+         * could be used to make client-volfile generation much simpler and
+         * more efficient too, though it would require some further adaptation
+         * to support more than one layer of hierarchy.
+         */
+        uint16_t                group;
 };
 
 typedef struct glusterd_brickinfo glusterd_brickinfo_t;
@@ -416,6 +427,15 @@ typedef ssize_t (*gd_serialize_t) (struct iovec outmsg, void *args);
                           volpath, brickinfo->hostname, exp_path);            \
         } while (0)
 
+#define GLUSTERD_GET_BRICK_RECON_PIDFILE(pidfile,volinfo,brickinfo, priv) do { /* formats a brick's "-recon.pid" path into pidfile */ \
+                char exp_path[PATH_MAX] = {0,}; /* output of GLUSTERD_REMOVE_SLASH_FROM_PATH */ \
+                char volpath[PATH_MAX]  = {0,}; /* output of GLUSTERD_GET_VOLUME_DIR */ \
+                GLUSTERD_GET_VOLUME_DIR (volpath, volinfo, priv); /* presumably the volume's working directory - confirm */ \
+                GLUSTERD_REMOVE_SLASH_FROM_PATH (brickinfo->path, exp_path); /* presumably brick path without '/' - confirm */ \
+                snprintf (pidfile, PATH_MAX, "%s/run/%s:-%s-recon.pid", /* write bounded by PATH_MAX; truncation is not checked */ \
+                          volpath, brickinfo->hostname, exp_path); /* NOTE(review): brickinfo is expanded unparenthesized */ \
+        } while (0)
+
 #define GLUSTERD_GET_NFS_PIDFILE(pidfile,nfspath) {                     \
                 snprintf (pidfile, PATH_MAX, "%s/run/nfs.pid",          \
                           nfspath);                                     \
-- 
cgit