summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- api/src/glfs-fops.c 365
-rw-r--r-- api/src/glfs-handleops.c 193
-rw-r--r-- api/src/glfs-handles.h 37
-rw-r--r-- api/src/glfs.h 86
-rw-r--r-- configure.ac 6
-rw-r--r-- glusterfs.spec.in 4
-rw-r--r-- libglusterfs/src/call-stub.c 2
-rw-r--r-- libglusterfs/src/call-stub.h 6
-rw-r--r-- libglusterfs/src/glusterfs.h 2
-rw-r--r-- libglusterfs/src/list.h 14
-rw-r--r-- libglusterfs/src/syncop.c 233
-rw-r--r-- libglusterfs/src/syncop.h 32
-rw-r--r-- tests/basic/nsr.t 47
-rw-r--r-- xlators/cluster/Makefile.am 2
-rw-r--r-- xlators/cluster/nsr-client/Makefile.am 3
-rw-r--r-- xlators/cluster/nsr-client/src/Makefile.am 33
-rw-r--r-- xlators/cluster/nsr-client/src/fop-template.c 113
-rw-r--r-- xlators/cluster/nsr-client/src/gen-fops.py 57
-rw-r--r-- xlators/cluster/nsr-client/src/nsrc.c 194
-rw-r--r-- xlators/cluster/nsr-recon/Makefile.am 3
-rw-r--r-- xlators/cluster/nsr-recon/src/Makefile.am 22
-rw-r--r-- xlators/cluster/nsr-recon/src/recon_driver.c 2624
-rw-r--r-- xlators/cluster/nsr-recon/src/recon_driver.h 308
-rw-r--r-- xlators/cluster/nsr-recon/src/recon_xlator.c 837
-rw-r--r-- xlators/cluster/nsr-recon/src/recon_xlator.h 78
-rw-r--r-- xlators/cluster/nsr-server/Makefile.am 3
-rw-r--r-- xlators/cluster/nsr-server/src/Makefile.am 36
-rw-r--r-- xlators/cluster/nsr-server/src/all-templates.c 299
-rw-r--r-- xlators/cluster/nsr-server/src/codegen.py 174
-rw-r--r-- xlators/cluster/nsr-server/src/codegen.pyc bin 0 -> 4915 bytes
-rw-r--r-- xlators/cluster/nsr-server/src/etcd-api.c 586
-rw-r--r-- xlators/cluster/nsr-server/src/etcd-api.h 176
-rw-r--r-- xlators/cluster/nsr-server/src/gen-fops.py 123
-rw-r--r-- xlators/cluster/nsr-server/src/leader.c 420
-rw-r--r-- xlators/cluster/nsr-server/src/nsr-cg.c 4444
-rw-r--r-- xlators/cluster/nsr-server/src/nsr-internal.h 81
-rw-r--r-- xlators/cluster/nsr-server/src/nsr.c 682
-rw-r--r-- xlators/cluster/nsr-server/src/stub_etcd.c 129
-rw-r--r-- xlators/cluster/nsr-server/src/yajl.c 175
-rw-r--r-- xlators/cluster/nsr-server/src/yajl/yajl_common.h 75
-rw-r--r-- xlators/cluster/nsr-server/src/yajl/yajl_gen.h 157
-rw-r--r-- xlators/cluster/nsr-server/src/yajl/yajl_parse.h 226
-rw-r--r-- xlators/cluster/nsr-server/src/yajl/yajl_tree.h 177
-rw-r--r-- xlators/cluster/nsr-server/src/yajl/yajl_version.h 23
-rw-r--r-- xlators/cluster/nsr-server/src/yajl_alloc.c 49
-rw-r--r-- xlators/cluster/nsr-server/src/yajl_alloc.h 34
-rw-r--r-- xlators/cluster/nsr-server/src/yajl_buf.c 103
-rw-r--r-- xlators/cluster/nsr-server/src/yajl_buf.h 57
-rw-r--r-- xlators/cluster/nsr-server/src/yajl_bytestack.h 69
-rw-r--r-- xlators/cluster/nsr-server/src/yajl_encode.c 220
-rw-r--r-- xlators/cluster/nsr-server/src/yajl_encode.h 34
-rw-r--r-- xlators/cluster/nsr-server/src/yajl_gen.c 350
-rw-r--r-- xlators/cluster/nsr-server/src/yajl_lex.c 763
-rw-r--r-- xlators/cluster/nsr-server/src/yajl_lex.h 117
-rw-r--r-- xlators/cluster/nsr-server/src/yajl_parser.c 492
-rw-r--r-- xlators/cluster/nsr-server/src/yajl_parser.h 78
-rw-r--r-- xlators/cluster/nsr-server/src/yajl_tree.c 501
-rw-r--r-- xlators/cluster/nsr-server/src/yajl_version.c 7
-rw-r--r-- xlators/features/changelog/lib/src/gf-changelog-helpers.h 1
-rw-r--r-- xlators/features/changelog/src/Makefile.am 8
-rw-r--r-- xlators/features/changelog/src/changelog-default-fops.c 561
-rw-r--r-- xlators/features/changelog/src/changelog-encoders.c 99
-rw-r--r-- xlators/features/changelog/src/changelog-encoders.h 10
-rw-r--r-- xlators/features/changelog/src/changelog-fops.h 157
-rw-r--r-- xlators/features/changelog/src/changelog-helpers.c 208
-rw-r--r-- xlators/features/changelog/src/changelog-helpers.h 246
-rw-r--r-- xlators/features/changelog/src/changelog-mem-types.h 9
-rw-r--r-- xlators/features/changelog/src/changelog-misc.h 8
-rw-r--r-- xlators/features/changelog/src/changelog-rt.c 9
-rw-r--r-- xlators/features/changelog/src/changelog-rt.h 5
-rw-r--r-- xlators/features/changelog/src/changelog.c 428
-rw-r--r-- xlators/features/changelog/src/policy/changelog-policy-default.c 44
-rw-r--r-- xlators/features/changelog/src/policy/changelog-policy-replication.c 1184
-rw-r--r-- xlators/features/changelog/src/policy/changelog-policy.h 41
-rw-r--r-- xlators/mgmt/glusterd/src/Makefile.am 5
-rw-r--r-- xlators/mgmt/glusterd/src/glusterd-etcd.c 86
-rw-r--r-- xlators/mgmt/glusterd/src/glusterd-etcd.h 23
-rw-r--r-- xlators/mgmt/glusterd/src/glusterd-handler.c 12
-rw-r--r-- xlators/mgmt/glusterd/src/glusterd-sm.c 16
-rw-r--r-- xlators/mgmt/glusterd/src/glusterd-utils.c 3
-rw-r--r-- xlators/mgmt/glusterd/src/glusterd-volgen.c 338
-rw-r--r-- xlators/mgmt/glusterd/src/glusterd-volgen.h 4
-rw-r--r-- xlators/mgmt/glusterd/src/glusterd-volume-set.c 13
-rw-r--r-- xlators/mgmt/glusterd/src/glusterd.c 16
-rw-r--r-- xlators/mgmt/glusterd/src/glusterd.h 20
85 files changed, 19036 insertions, 679 deletions
diff --git a/api/src/glfs-fops.c b/api/src/glfs-fops.c
index f3ac335fb..8d905193a 100644
--- a/api/src/glfs-fops.c
+++ b/api/src/glfs-fops.c
@@ -145,7 +145,7 @@ out:
int
-glfs_close (struct glfs_fd *glfd)
+glfs_close_with_xdata (struct glfs_fd *glfd, dict_t *dict)
{
xlator_t *subvol = NULL;
int ret = -1;
@@ -168,7 +168,7 @@ glfs_close (struct glfs_fd *glfd)
goto out;
}
- ret = syncop_flush (subvol, fd);
+ ret = syncop_flush_with_xdata (subvol, fd, dict);
out:
fs = glfd->fs;
glfs_fd_destroy (glfd);
@@ -181,6 +181,11 @@ out:
return ret;
}
+int
+glfs_close (struct glfs_fd *glfd)
+{
+ return(glfs_close_with_xdata(glfd, NULL));
+}
int
glfs_lstat (struct glfs *fs, const char *path, struct stat *stat)
@@ -249,7 +254,7 @@ out:
int
-glfs_fstat (struct glfs_fd *glfd, struct stat *stat)
+glfs_fstat_with_xdata (struct glfs_fd *glfd, struct stat *stat, dict_t *dict)
{
int ret = -1;
xlator_t *subvol = NULL;
@@ -272,7 +277,7 @@ glfs_fstat (struct glfs_fd *glfd, struct stat *stat)
goto out;
}
- ret = syncop_fstat (subvol, fd, &iatt);
+ ret = syncop_fstat_with_xdata (subvol, fd, &iatt, dict);
if (ret == 0 && stat)
glfs_iatt_to_stat (glfd->fs, &iatt, stat);
@@ -285,17 +290,21 @@ out:
return ret;
}
+int
+glfs_fstat (struct glfs_fd *glfd, struct stat *stat)
+{
+ return(glfs_fstat_with_xdata(glfd, stat, NULL));
+}
+
struct glfs_fd *
-glfs_creat (struct glfs *fs, const char *path, int flags, mode_t mode)
+glfs_creat_with_xdata (struct glfs *fs, const char *path, int flags, mode_t mode, uuid_t gfid, dict_t *xattr_req)
{
int ret = -1;
struct glfs_fd *glfd = NULL;
xlator_t *subvol = NULL;
loc_t loc = {0, };
struct iatt iatt = {0, };
- uuid_t gfid;
- dict_t *xattr_req = NULL;
int reval = 0;
__glfs_entry_fs (fs);
@@ -307,14 +316,6 @@ glfs_creat (struct glfs *fs, const char *path, int flags, mode_t mode)
goto out;
}
- xattr_req = dict_new ();
- if (!xattr_req) {
- ret = -1;
- errno = ENOMEM;
- goto out;
- }
-
- uuid_generate (gfid);
ret = dict_set_static_bin (xattr_req, "gfid-req", gfid, 16);
if (ret) {
ret = -1;
@@ -404,8 +405,6 @@ retry:
out:
loc_wipe (&loc);
- if (xattr_req)
- dict_unref (xattr_req);
if (ret && glfd) {
glfs_fd_destroy (glfd);
@@ -421,9 +420,28 @@ out:
return glfd;
}
+struct glfs_fd *
+glfs_creat (struct glfs *fs, const char *path, int flags, mode_t mode)
+{
+ dict_t *xattr_req = NULL;
+ uuid_t gfid;
+ struct glfs_fd *fd = NULL;
+
+
+ xattr_req = dict_new ();
+ if (!xattr_req) {
+ errno = ENOMEM;
+ return NULL;
+ }
+ uuid_generate (gfid);
+ fd = glfs_creat_with_xdata (fs, path, flags, mode, gfid, xattr_req);
+ if (xattr_req)
+ dict_unref (xattr_req);
+ return (fd);
+}
off_t
-glfs_lseek (struct glfs_fd *glfd, off_t offset, int whence)
+glfs_lseek_with_xdata (struct glfs_fd *glfd, off_t offset, int whence, dict_t *dict)
{
struct stat sb = {0, };
int ret = -1;
@@ -438,7 +456,7 @@ glfs_lseek (struct glfs_fd *glfd, off_t offset, int whence)
glfd->offset += offset;
break;
case SEEK_END:
- ret = glfs_fstat (glfd, &sb);
+ ret = glfs_fstat_with_xdata (glfd, &sb, dict);
if (ret) {
/* seek cannot fail :O */
break;
@@ -450,12 +468,17 @@ glfs_lseek (struct glfs_fd *glfd, off_t offset, int whence)
return glfd->offset;
}
+off_t
+glfs_lseek (struct glfs_fd *glfd, off_t offset, int whence)
+{
+ return(glfs_lseek_with_xdata(glfd, offset, whence, NULL));
+}
//////////////
ssize_t
-glfs_preadv (struct glfs_fd *glfd, const struct iovec *iovec, int iovcnt,
- off_t offset, int flags)
+glfs_preadv_with_xdata (struct glfs_fd *glfd, const struct iovec *iovec, int iovcnt,
+ off_t offset, int flags, dict_t *dict)
{
xlator_t *subvol = NULL;
ssize_t ret = -1;
@@ -483,7 +506,7 @@ glfs_preadv (struct glfs_fd *glfd, const struct iovec *iovec, int iovcnt,
size = iov_length (iovec, iovcnt);
- ret = syncop_readv (subvol, fd, size, offset, 0, &iov, &cnt, &iobref);
+ ret = syncop_readv_with_xdata (subvol, fd, size, offset, 0, &iov, &cnt, &iobref, dict);
if (ret <= 0)
goto out;
@@ -506,6 +529,12 @@ out:
return ret;
}
+ssize_t
+glfs_preadv (struct glfs_fd *glfd, const struct iovec *iovec, int iovcnt,
+ off_t offset, int flags)
+{
+ return(glfs_preadv_with_xdata(glfd, iovec, iovcnt, offset, flags, NULL));
+}
ssize_t
glfs_read (struct glfs_fd *glfd, void *buf, size_t count, int flags)
@@ -521,6 +550,19 @@ glfs_read (struct glfs_fd *glfd, void *buf, size_t count, int flags)
return ret;
}
+ssize_t
+glfs_read_with_xdata (struct glfs_fd *glfd, void *buf, size_t count, int flags, dict_t *dict)
+{
+ struct iovec iov = {0, };
+ ssize_t ret = 0;
+
+ iov.iov_base = buf;
+ iov.iov_len = count;
+
+ ret = glfs_preadv_with_xdata (glfd, &iov, 1, glfd->offset, flags, dict);
+
+ return ret;
+}
ssize_t
glfs_pread (struct glfs_fd *glfd, void *buf, size_t count, off_t offset,
@@ -773,6 +815,12 @@ ssize_t
glfs_pwritev (struct glfs_fd *glfd, const struct iovec *iovec, int iovcnt,
off_t offset, int flags)
{
+ return(glfs_pwritev_with_xdata(glfd, iovec, iovcnt, offset, flags, NULL));
+}
+ssize_t
+glfs_pwritev_with_xdata (struct glfs_fd *glfd, const struct iovec *iovec, int iovcnt,
+ off_t offset, int flags, dict_t *dict)
+{
xlator_t *subvol = NULL;
int ret = -1;
size_t size = -1;
@@ -828,7 +876,7 @@ glfs_pwritev (struct glfs_fd *glfd, const struct iovec *iovec, int iovcnt,
iov.iov_base = iobuf_ptr (iobuf);
iov.iov_len = size;
- ret = syncop_writev (subvol, fd, &iov, 1, offset, iobref, flags);
+ ret = syncop_writev_with_xdata (subvol, fd, &iov, 1, offset, iobref, flags, dict);
iobuf_unref (iobuf);
iobref_unref (iobref);
@@ -862,6 +910,20 @@ glfs_write (struct glfs_fd *glfd, const void *buf, size_t count, int flags)
return ret;
}
+ssize_t
+glfs_write_with_xdata (struct glfs_fd *glfd, const void *buf, size_t count, int flags, dict_t *dict)
+{
+ struct iovec iov = {0, };
+ ssize_t ret = 0;
+
+ iov.iov_base = (void *) buf;
+ iov.iov_len = count;
+
+ ret = glfs_pwritev_with_xdata (glfd, &iov, 1, glfd->offset, flags, dict);
+
+ return ret;
+}
+
ssize_t
@@ -875,6 +937,16 @@ glfs_writev (struct glfs_fd *glfd, const struct iovec *iov, int count,
return ret;
}
+ssize_t
+glfs_writev_with_xdata (struct glfs_fd *glfd, const struct iovec *iov, int count,
+ int flags, dict_t *dict)
+{
+ ssize_t ret = 0;
+
+ ret = glfs_pwritev_with_xdata (glfd, iov, count, glfd->offset, flags, dict);
+
+ return ret;
+}
ssize_t
glfs_pwrite (struct glfs_fd *glfd, const void *buf, size_t count, off_t offset,
@@ -978,7 +1050,7 @@ glfs_writev_async (struct glfs_fd *glfd, const struct iovec *iov, int count,
int
-glfs_fsync (struct glfs_fd *glfd)
+glfs_fsync_with_xdata (struct glfs_fd *glfd, dict_t *dict)
{
int ret = -1;
xlator_t *subvol = NULL;
@@ -1000,7 +1072,7 @@ glfs_fsync (struct glfs_fd *glfd)
goto out;
}
- ret = syncop_fsync (subvol, fd, 0);
+ ret = syncop_fsync_with_xdata (subvol, fd, 0, dict);
out:
if (fd)
fd_unref (fd);
@@ -1010,6 +1082,11 @@ out:
return ret;
}
+int
+glfs_fsync (struct glfs_fd *glfd)
+{
+ return(glfs_fsync_with_xdata(glfd, NULL));
+}
static int
glfs_fsync_async_common (struct glfs_fd *glfd, glfs_io_cbk fn, void *data,
@@ -1093,7 +1170,7 @@ glfs_fdatasync_async (struct glfs_fd *glfd, glfs_io_cbk fn, void *data)
int
-glfs_ftruncate (struct glfs_fd *glfd, off_t offset)
+glfs_ftruncate_with_xdata (struct glfs_fd *glfd, off_t offset, dict_t *dict)
{
int ret = -1;
xlator_t *subvol = NULL;
@@ -1115,7 +1192,7 @@ glfs_ftruncate (struct glfs_fd *glfd, off_t offset)
goto out;
}
- ret = syncop_ftruncate (subvol, fd, offset);
+ ret = syncop_ftruncate_with_xdata (subvol, fd, offset, dict);
out:
if (fd)
fd_unref (fd);
@@ -1125,6 +1202,11 @@ out:
return ret;
}
+int
+glfs_ftruncate (struct glfs_fd *glfd, off_t offset)
+{
+ return(glfs_ftruncate_with_xdata(glfd, offset, NULL));
+}
int
glfs_ftruncate_async (struct glfs_fd *glfd, off_t offset,
@@ -1196,14 +1278,12 @@ out:
int
-glfs_symlink (struct glfs *fs, const char *data, const char *path)
+glfs_symlink_with_xdata (struct glfs *fs, const char *data, const char *path, uuid_t gfid, dict_t *xattr_req)
{
int ret = -1;
xlator_t *subvol = NULL;
loc_t loc = {0, };
struct iatt iatt = {0, };
- uuid_t gfid;
- dict_t *xattr_req = NULL;
int reval = 0;
__glfs_entry_fs (fs);
@@ -1215,14 +1295,6 @@ glfs_symlink (struct glfs *fs, const char *data, const char *path)
goto out;
}
- xattr_req = dict_new ();
- if (!xattr_req) {
- ret = -1;
- errno = ENOMEM;
- goto out;
- }
-
- uuid_generate (gfid);
ret = dict_set_static_bin (xattr_req, "gfid-req", gfid, 16);
if (ret) {
ret = -1;
@@ -1267,14 +1339,30 @@ retry:
out:
loc_wipe (&loc);
- if (xattr_req)
- dict_unref (xattr_req);
-
glfs_subvol_done (fs, subvol);
return ret;
}
+int
+glfs_symlink (struct glfs *fs, const char *data, const char *path)
+{
+ uuid_t gfid;
+ dict_t *xattr_req = NULL;
+ int ret = -1;
+
+ xattr_req = dict_new ();
+ if (!xattr_req) {
+ errno = ENOMEM;
+ return -1 ;
+ }
+
+ uuid_generate (gfid);
+ ret = glfs_symlink_with_xdata(fs, data, path, gfid, xattr_req);
+
+ dict_unref (xattr_req);
+ return ret;
+}
int
glfs_readlink (struct glfs *fs, const char *path, char *buf, size_t bufsiz)
@@ -1325,14 +1413,12 @@ out:
int
-glfs_mknod (struct glfs *fs, const char *path, mode_t mode, dev_t dev)
+glfs_mknod_with_xdata (struct glfs *fs, const char *path, mode_t mode, dev_t dev, uuid_t gfid, dict_t *xattr_req)
{
int ret = -1;
xlator_t *subvol = NULL;
loc_t loc = {0, };
struct iatt iatt = {0, };
- uuid_t gfid;
- dict_t *xattr_req = NULL;
int reval = 0;
__glfs_entry_fs (fs);
@@ -1344,14 +1430,7 @@ glfs_mknod (struct glfs *fs, const char *path, mode_t mode, dev_t dev)
goto out;
}
- xattr_req = dict_new ();
- if (!xattr_req) {
- ret = -1;
- errno = ENOMEM;
- goto out;
- }
- uuid_generate (gfid);
ret = dict_set_static_bin (xattr_req, "gfid-req", gfid, 16);
if (ret) {
ret = -1;
@@ -1396,24 +1475,38 @@ retry:
out:
loc_wipe (&loc);
- if (xattr_req)
- dict_unref (xattr_req);
-
glfs_subvol_done (fs, subvol);
return ret;
}
+int
+glfs_mknod (struct glfs *fs, const char *path, mode_t mode, dev_t dev)
+{
+ dict_t *xattr_req = NULL;
+ uuid_t gfid;
+ int ret;
+
+ xattr_req = dict_new ();
+ if (!xattr_req) {
+ errno = ENOMEM;
+ return -1;
+ }
+
+ uuid_generate (gfid);
+ ret = glfs_mknod_with_xdata(fs, path, mode, dev, gfid, xattr_req);
+
+ dict_unref (xattr_req);
+ return (ret);
+}
int
-glfs_mkdir (struct glfs *fs, const char *path, mode_t mode)
+glfs_mkdir_with_xdata (struct glfs *fs, const char *path, mode_t mode, uuid_t gfid, dict_t *xattr_req)
{
int ret = -1;
xlator_t *subvol = NULL;
loc_t loc = {0, };
struct iatt iatt = {0, };
- uuid_t gfid;
- dict_t *xattr_req = NULL;
int reval = 0;
__glfs_entry_fs (fs);
@@ -1425,14 +1518,6 @@ glfs_mkdir (struct glfs *fs, const char *path, mode_t mode)
goto out;
}
- xattr_req = dict_new ();
- if (!xattr_req) {
- ret = -1;
- errno = ENOMEM;
- goto out;
- }
-
- uuid_generate (gfid);
ret = dict_set_static_bin (xattr_req, "gfid-req", gfid, 16);
if (ret) {
ret = -1;
@@ -1477,17 +1562,33 @@ retry:
out:
loc_wipe (&loc);
- if (xattr_req)
- dict_unref (xattr_req);
glfs_subvol_done (fs, subvol);
return ret;
}
+int
+glfs_mkdir (struct glfs *fs, const char *path, mode_t mode)
+{
+ uuid_t gfid;
+ dict_t *xattr_req = NULL;
+ int ret;
+
+ xattr_req = dict_new ();
+ if (!xattr_req) {
+ errno = ENOMEM;
+ return -1;
+ }
+
+ uuid_generate (gfid);
+ ret = glfs_mkdir_with_xdata(fs, path, mode, gfid, xattr_req);
+ dict_unref (xattr_req);
+ return ret;
+}
int
-glfs_unlink (struct glfs *fs, const char *path)
+glfs_unlink_with_xdata (struct glfs *fs, const char *path, dict_t *dict)
{
int ret = -1;
xlator_t *subvol = NULL;
@@ -1517,7 +1618,7 @@ retry:
goto out;
}
- ret = syncop_unlink (subvol, &loc);
+ ret = syncop_unlink_with_xdata (subvol, &loc, dict);
ESTALE_RETRY (ret, errno, reval, &loc, retry);
@@ -1531,9 +1632,14 @@ out:
return ret;
}
+int
+glfs_unlink (struct glfs *fs, const char *path)
+{
+ return(glfs_unlink_with_xdata(fs, path, NULL));
+}
int
-glfs_rmdir (struct glfs *fs, const char *path)
+glfs_rmdir_with_xdata (struct glfs *fs, const char *path, dict_t *dict)
{
int ret = -1;
xlator_t *subvol = NULL;
@@ -1563,7 +1669,7 @@ retry:
goto out;
}
- ret = syncop_rmdir (subvol, &loc, 0);
+ ret = syncop_rmdir_with_xdata (subvol, &loc, 0, dict);
ESTALE_RETRY (ret, errno, reval, &loc, retry);
@@ -1577,9 +1683,14 @@ out:
return ret;
}
+int
+glfs_rmdir (struct glfs *fs, const char *path)
+{
+ return (glfs_rmdir_with_xdata(fs, path, NULL));
+}
int
-glfs_rename (struct glfs *fs, const char *oldpath, const char *newpath)
+glfs_rename_with_xdata (struct glfs *fs, const char *oldpath, const char *newpath, dict_t *dict)
{
int ret = -1;
xlator_t *subvol = NULL;
@@ -1626,7 +1737,7 @@ retrynew:
/* TODO: check if new or old is a prefix of the other, and fail EINVAL */
- ret = syncop_rename (subvol, &oldloc, &newloc);
+ ret = syncop_rename_with_xdata (subvol, &oldloc, &newloc, dict);
if (ret == -1 && errno == ESTALE) {
if (reval < DEFAULT_REVAL_COUNT) {
@@ -1652,7 +1763,13 @@ out:
int
-glfs_link (struct glfs *fs, const char *oldpath, const char *newpath)
+glfs_rename (struct glfs *fs, const char *oldpath, const char *newpath)
+{
+ return(glfs_rename_with_xdata(fs, oldpath, newpath, NULL));
+}
+
+int
+glfs_link_with_xdata (struct glfs *fs, const char *oldpath, const char *newpath, dict_t *dict)
{
int ret = -1;
xlator_t *subvol = NULL;
@@ -1703,7 +1820,7 @@ retrynew:
}
newloc.inode = inode_ref (oldloc.inode);
- ret = syncop_link (subvol, &oldloc, &newloc);
+ ret = syncop_link_with_xdata (subvol, &oldloc, &newloc, dict);
if (ret == -1 && errno == ESTALE) {
loc_wipe (&oldloc);
@@ -1723,6 +1840,11 @@ out:
return ret;
}
+int
+glfs_link (struct glfs *fs, const char *oldpath, const char *newpath)
+{
+ return(glfs_link_with_xdata(fs, oldpath, newpath, NULL));
+}
struct glfs_fd *
glfs_opendir (struct glfs *fs, const char *path)
@@ -2158,8 +2280,8 @@ out:
int
-glfs_setattr (struct glfs *fs, const char *path, struct iatt *iatt,
- int valid, int follow)
+glfs_setattr_with_xdata (struct glfs *fs, const char *path, struct iatt *iatt,
+ int valid, int follow, dict_t *dict)
{
int ret = -1;
xlator_t *subvol = NULL;
@@ -2186,7 +2308,7 @@ retry:
if (ret)
goto out;
- ret = syncop_setattr (subvol, &loc, iatt, valid, 0, 0);
+ ret = syncop_setattr_with_xdata (subvol, &loc, iatt, valid, 0, 0, dict);
ESTALE_RETRY (ret, errno, reval, &loc, retry);
out:
@@ -2197,9 +2319,15 @@ out:
return ret;
}
+int
+glfs_setattr (struct glfs *fs, const char *path, struct iatt *iatt,
+ int valid, int follow)
+{
+ return(glfs_setattr_with_xdata(fs, path, iatt, valid, follow, NULL));
+}
int
-glfs_fsetattr (struct glfs_fd *glfd, struct iatt *iatt, int valid)
+glfs_fsetattr_with_xdata (struct glfs_fd *glfd, struct iatt *iatt, int valid, dict_t *dict)
{
int ret = -1;
xlator_t *subvol = NULL;
@@ -2221,7 +2349,7 @@ glfs_fsetattr (struct glfs_fd *glfd, struct iatt *iatt, int valid)
goto out;
}
- ret = syncop_fsetattr (subvol, fd, iatt, valid, 0, 0);
+ ret = syncop_fsetattr_with_xdata (subvol, fd, iatt, valid, 0, 0, dict);
out:
if (fd)
fd_unref (fd);
@@ -2231,6 +2359,11 @@ out:
return ret;
}
+int
+glfs_fsetattr (struct glfs_fd *glfd, struct iatt *iatt, int valid)
+{
+ return(glfs_fsetattr_with_xdata(glfd, iatt, valid, NULL));
+}
int
glfs_chmod (struct glfs *fs, const char *path, mode_t mode)
@@ -2471,8 +2604,8 @@ glfs_lgetxattr (struct glfs *fs, const char *path, const char *name,
ssize_t
-glfs_fgetxattr (struct glfs_fd *glfd, const char *name, void *value,
- size_t size)
+glfs_fgetxattr_with_xdata (struct glfs_fd *glfd, const char *name, void *value,
+ size_t size, dict_t *dict)
{
int ret = -1;
xlator_t *subvol = NULL;
@@ -2495,7 +2628,7 @@ glfs_fgetxattr (struct glfs_fd *glfd, const char *name, void *value,
goto out;
}
- ret = syncop_fgetxattr (subvol, fd, &xattr, name);
+ ret = syncop_fgetxattr_with_xdata (subvol, fd, &xattr, name, dict);
if (ret)
goto out;
@@ -2509,6 +2642,12 @@ out:
return ret;
}
+ssize_t
+glfs_fgetxattr (struct glfs_fd *glfd, const char *name, void *value,
+ size_t size)
+{
+ return(glfs_fgetxattr_with_xdata(glfd, name, value, size, NULL));
+}
int
glfs_listxattr_process (void *value, size_t size, dict_t *xattr)
@@ -2597,7 +2736,7 @@ glfs_llistxattr (struct glfs *fs, const char *path, void *value, size_t size)
ssize_t
-glfs_flistxattr (struct glfs_fd *glfd, void *value, size_t size)
+glfs_flistxattr_with_xdata (struct glfs_fd *glfd, void *value, size_t size,dict_t *dict)
{
int ret = -1;
xlator_t *subvol = NULL;
@@ -2620,7 +2759,7 @@ glfs_flistxattr (struct glfs_fd *glfd, void *value, size_t size)
goto out;
}
- ret = syncop_fgetxattr (subvol, fd, &xattr, NULL);
+ ret = syncop_fgetxattr_with_xdata (subvol, fd, &xattr, NULL, dict);
if (ret)
goto out;
@@ -2635,6 +2774,12 @@ out:
}
+ssize_t
+glfs_flistxattr (struct glfs_fd *glfd, void *value, size_t size)
+{
+ return(glfs_flistxattr_with_xdata(glfd, value, size, NULL));
+}
+
dict_t *
dict_for_key_value (const char *name, const char *value, size_t size)
{
@@ -2657,7 +2802,7 @@ dict_for_key_value (const char *name, const char *value, size_t size)
int
glfs_setxattr_common (struct glfs *fs, const char *path, const char *name,
- const void *value, size_t size, int flags, int follow)
+ const void *value, size_t size, int flags, int follow, dict_t *dict)
{
int ret = -1;
xlator_t *subvol = NULL;
@@ -2692,7 +2837,7 @@ retry:
goto out;
}
- ret = syncop_setxattr (subvol, &loc, xattr, flags);
+ ret = syncop_setxattr_with_xdata (subvol, &loc, xattr, flags, dict);
ESTALE_RETRY (ret, errno, reval, &loc, retry);
@@ -2711,21 +2856,27 @@ int
glfs_setxattr (struct glfs *fs, const char *path, const char *name,
const void *value, size_t size, int flags)
{
- return glfs_setxattr_common (fs, path, name, value, size, flags, 1);
+ return glfs_setxattr_common (fs, path, name, value, size, flags, 1, NULL);
}
+int
+glfs_setxattr_with_xdata (struct glfs *fs, const char *path, const char *name,
+ const void *value, size_t size, int flags, dict_t * dict)
+{
+ return glfs_setxattr_common (fs, path, name, value, size, flags, 1, dict);
+}
int
glfs_lsetxattr (struct glfs *fs, const char *path, const char *name,
const void *value, size_t size, int flags)
{
- return glfs_setxattr_common (fs, path, name, value, size, flags, 0);
+ return glfs_setxattr_common (fs, path, name, value, size, flags, 0, NULL);
}
int
-glfs_fsetxattr (struct glfs_fd *glfd, const char *name, const void *value,
- size_t size, int flags)
+glfs_fsetxattr_with_xdata (struct glfs_fd *glfd, const char *name, const void *value,
+ size_t size, int flags, dict_t *dict)
{
int ret = -1;
xlator_t *subvol = NULL;
@@ -2755,7 +2906,7 @@ glfs_fsetxattr (struct glfs_fd *glfd, const char *name, const void *value,
goto out;
}
- ret = syncop_fsetxattr (subvol, fd, xattr, flags);
+ ret = syncop_fsetxattr_with_xdata (subvol, fd, xattr, flags, dict);
out:
if (xattr)
dict_unref (xattr);
@@ -2768,10 +2919,16 @@ out:
return ret;
}
+int
+glfs_fsetxattr (struct glfs_fd *glfd, const char *name, const void *value,
+ size_t size, int flags)
+{
+ return(glfs_fsetxattr_with_xdata(glfd, name, value, size, flags, NULL));
+}
int
glfs_removexattr_common (struct glfs *fs, const char *path, const char *name,
- int follow)
+ int follow, dict_t *dict)
{
int ret = -1;
xlator_t *subvol = NULL;
@@ -2798,7 +2955,7 @@ retry:
if (ret)
goto out;
- ret = syncop_removexattr (subvol, &loc, name);
+ ret = syncop_removexattr_with_xdata (subvol, &loc, name, dict);
ESTALE_RETRY (ret, errno, reval, &loc, retry);
@@ -2814,19 +2971,25 @@ out:
int
glfs_removexattr (struct glfs *fs, const char *path, const char *name)
{
- return glfs_removexattr_common (fs, path, name, 1);
+ return glfs_removexattr_common (fs, path, name, 1, NULL);
}
int
glfs_lremovexattr (struct glfs *fs, const char *path, const char *name)
{
- return glfs_removexattr_common (fs, path, name, 0);
+ return glfs_removexattr_common (fs, path, name, 0, NULL);
+}
+
+int
+glfs_removexattr_with_xdata (struct glfs *fs, const char *path, const char *name, dict_t *dict)
+{
+ return glfs_removexattr_common (fs, path, name, 1, dict);
}
int
-glfs_fremovexattr (struct glfs_fd *glfd, const char *name)
+glfs_fremovexattr_with_xdata (struct glfs_fd *glfd, const char *name, dict_t *dict)
{
int ret = -1;
xlator_t *subvol = NULL;
@@ -2848,7 +3011,7 @@ glfs_fremovexattr (struct glfs_fd *glfd, const char *name)
goto out;
}
- ret = syncop_fremovexattr (subvol, fd, name);
+ ret = syncop_fremovexattr_with_xdata (subvol, fd, name, dict);
out:
if (fd)
fd_unref (fd);
@@ -2858,6 +3021,11 @@ out:
return ret;
}
+int
+glfs_fremovexattr (struct glfs_fd *glfd, const char *name)
+{
+ return(glfs_fremovexattr_with_xdata(glfd, name, NULL));
+}
int
glfs_fallocate (struct glfs_fd *glfd, int keep_size, off_t offset, size_t len)
@@ -3100,7 +3268,6 @@ out:
return retpath;
}
-
char *
glfs_getcwd (struct glfs *fs, char *buf, size_t n)
{
diff --git a/api/src/glfs-handleops.c b/api/src/glfs-handleops.c
index 0f996d3a2..6a60557ff 100644
--- a/api/src/glfs-handleops.c
+++ b/api/src/glfs-handleops.c
@@ -270,7 +270,7 @@ out:
}
struct glfs_fd *
-glfs_h_open (struct glfs *fs, struct glfs_object *object, int flags)
+glfs_h_open_with_xdata (struct glfs *fs, struct glfs_object *object, int flags, dict_t * dict)
{
int ret = -1;
struct glfs_fd *glfd = NULL;
@@ -279,7 +279,7 @@ glfs_h_open (struct glfs *fs, struct glfs_object *object, int flags)
loc_t loc = {0, };
/* validate in args */
- if ((fs == NULL) || (object == NULL)) {
+ if ((fs == NULL) || (object == NULL) || (dict == NULL)) {
errno = EINVAL;
return NULL;
}
@@ -330,7 +330,7 @@ glfs_h_open (struct glfs *fs, struct glfs_object *object, int flags)
GLFS_LOC_FILL_INODE (inode, loc, out);
/* fop/op */
- ret = syncop_open (subvol, &loc, flags, glfd->fd);
+ ret = syncop_open_with_xdata (subvol, &loc, flags, glfd->fd, dict);
out:
loc_wipe (&loc);
@@ -352,9 +352,16 @@ out:
return glfd;
}
+/*
+ * Open @object on @fs with @flags, without caller-supplied xdata.
+ *
+ * glfs_h_open_with_xdata() rejects a NULL dict with EINVAL, so forwarding
+ * NULL from here would make every glfs_h_open() call fail.  An empty dict
+ * is passed instead and released once the call returns.
+ *
+ * Returns whatever glfs_h_open_with_xdata() returns, or NULL with errno set
+ * to ENOMEM when the dict cannot be allocated.
+ */
+struct glfs_fd *
+glfs_h_open (struct glfs *fs, struct glfs_object *object, int flags)
+{
+        struct glfs_fd *glfd  = NULL;
+        dict_t         *xdata = NULL;
+
+        xdata = dict_new ();
+        if (!xdata) {
+                errno = ENOMEM;
+                return NULL;
+        }
+
+        /* NOTE(review): assumes an empty dict is handled like "no xdata"
+         * further down the stack -- confirm against
+         * syncop_open_with_xdata(). */
+        glfd = glfs_h_open_with_xdata (fs, object, flags, xdata);
+
+        /* The dict was created here, so this function drops its reference. */
+        dict_unref (xdata);
+
+        return glfd;
+}
+
struct glfs_object *
-glfs_h_creat (struct glfs *fs, struct glfs_object *parent, const char *path,
- int flags, mode_t mode, struct stat *stat)
+glfs_h_creat_with_xdata (struct glfs *fs, struct glfs_object *parent, const char *path,
+ int flags, mode_t mode, struct stat *stat,
+ uuid_t gfid, dict_t * xattr_req)
{
int ret = -1;
struct glfs_fd *glfd = NULL;
@@ -362,12 +369,10 @@ glfs_h_creat (struct glfs *fs, struct glfs_object *parent, const char *path,
inode_t *inode = NULL;
loc_t loc = {0, };
struct iatt iatt = {0, };
- uuid_t gfid;
- dict_t *xattr_req = NULL;
struct glfs_object *object = NULL;
/* validate in args */
- if ((fs == NULL) || (parent == NULL) || (path == NULL)) {
+ if ((fs == NULL) || (parent == NULL) || (path == NULL) || (xattr_req == NULL)) {
errno = EINVAL;
return NULL;
}
@@ -389,14 +394,6 @@ glfs_h_creat (struct glfs *fs, struct glfs_object *parent, const char *path,
goto out;
}
- xattr_req = dict_new ();
- if (!xattr_req) {
- ret = -1;
- errno = ENOMEM;
- goto out;
- }
-
- uuid_generate (gfid);
ret = dict_set_static_bin (xattr_req, "gfid-req", gfid, 16);
if (ret) {
ret = -1;
@@ -464,20 +461,34 @@ out:
}
struct glfs_object *
-glfs_h_mkdir (struct glfs *fs, struct glfs_object *parent, const char *path,
- mode_t mode, struct stat *stat)
+glfs_h_creat (struct glfs *fs, struct glfs_object *parent, const char *path,
+ int flags, mode_t mode, struct stat *stat)
+{
+ uuid_t gfid;
+ dict_t *xattr_req = NULL;
+
+ xattr_req = dict_new ();
+ if (!xattr_req) {
+ errno = ENOMEM;
+ return NULL;
+ }
+ uuid_generate (gfid);
+ return(glfs_h_creat_with_xdata(fs, parent, path, flags, mode, stat, gfid, xattr_req));
+}
+
+struct glfs_object *
+glfs_h_mkdir_with_xdata (struct glfs *fs, struct glfs_object *parent, const char *path,
+ mode_t mode, struct stat *stat, uuid_t gfid, dict_t *xattr_req)
{
int ret = -1;
xlator_t *subvol = NULL;
inode_t *inode = NULL;
loc_t loc = {0, };
struct iatt iatt = {0, };
- uuid_t gfid;
- dict_t *xattr_req = NULL;
struct glfs_object *object = NULL;
/* validate in args */
- if ((fs == NULL) || (parent == NULL) || (path == NULL)) {
+ if ((fs == NULL) || (parent == NULL) || (path == NULL) || (xattr_req == NULL)) {
errno = EINVAL;
return NULL;
}
@@ -499,14 +510,6 @@ glfs_h_mkdir (struct glfs *fs, struct glfs_object *parent, const char *path,
goto out;
}
- xattr_req = dict_new ();
- if (!xattr_req) {
- ret = -1;
- errno = ENOMEM;
- goto out;
- }
-
- uuid_generate (gfid);
ret = dict_set_static_bin (xattr_req, "gfid-req", gfid, 16);
if (ret) {
ret = -1;
@@ -552,20 +555,36 @@ out:
}
struct glfs_object *
-glfs_h_mknod (struct glfs *fs, struct glfs_object *parent, const char *path,
- mode_t mode, dev_t dev, struct stat *stat)
+glfs_h_mkdir (struct glfs *fs, struct glfs_object *parent, const char *path,
+ mode_t mode, struct stat *stat)
+{
+ uuid_t gfid;
+ dict_t *xattr_req = NULL;
+
+ xattr_req = dict_new ();
+ if (!xattr_req) {
+ errno = ENOMEM;
+ return NULL;
+ }
+
+ uuid_generate (gfid);
+ return(glfs_h_mkdir_with_xdata(fs, parent, path, mode, stat, gfid, xattr_req));
+}
+
+struct glfs_object *
+glfs_h_mknod_with_xdata (struct glfs *fs, struct glfs_object *parent, const char *path,
+ mode_t mode, dev_t dev, struct stat *stat,
+ uuid_t gfid, dict_t * xattr_req)
{
int ret = -1;
xlator_t *subvol = NULL;
inode_t *inode = NULL;
loc_t loc = {0, };
struct iatt iatt = {0, };
- uuid_t gfid;
- dict_t *xattr_req = NULL;
struct glfs_object *object = NULL;
/* validate in args */
- if ((fs == NULL) || (parent == NULL) || (path == NULL)) {
+ if ((fs == NULL) || (parent == NULL) || (path == NULL) || (xattr_req == NULL)) {
errno = EINVAL;
return NULL;
}
@@ -587,14 +606,6 @@ glfs_h_mknod (struct glfs *fs, struct glfs_object *parent, const char *path,
goto out;
}
- xattr_req = dict_new ();
- if (!xattr_req) {
- ret = -1;
- errno = ENOMEM;
- goto out;
- }
-
- uuid_generate (gfid);
ret = dict_set_static_bin (xattr_req, "gfid-req", gfid, 16);
if (ret) {
ret = -1;
@@ -638,8 +649,26 @@ out:
return object;
}
+struct glfs_object *
+glfs_h_mknod (struct glfs *fs, struct glfs_object *parent, const char *path,
+ mode_t mode, dev_t dev, struct stat *stat)
+{
+ uuid_t gfid;
+ dict_t *xattr_req = NULL;
+
+ xattr_req = dict_new ();
+ if (!xattr_req) {
+ errno = ENOMEM;
+ return NULL;
+ }
+
+ uuid_generate (gfid);
+
+ return(glfs_h_mknod_with_xdata(fs, parent, path, mode, dev, stat, gfid, xattr_req));
+}
+
int
-glfs_h_unlink (struct glfs *fs, struct glfs_object *parent, const char *path)
+glfs_h_unlink_with_xdata (struct glfs *fs, struct glfs_object *parent, const char *path, dict_t *dict)
{
int ret = -1;
xlator_t *subvol = NULL;
@@ -675,12 +704,12 @@ glfs_h_unlink (struct glfs *fs, struct glfs_object *parent, const char *path)
}
if (!IA_ISDIR(loc.inode->ia_type)) {
- ret = syncop_unlink (subvol, &loc);
+ ret = syncop_unlink_with_xdata (subvol, &loc, dict);
if (ret != 0) {
goto out;
}
} else {
- ret = syncop_rmdir (subvol, &loc, 0);
+ ret = syncop_rmdir_with_xdata (subvol, &loc, 0, dict);
if (ret != 0) {
goto out;
}
@@ -700,8 +729,14 @@ out:
return ret;
}
+int
+glfs_h_unlink (struct glfs *fs, struct glfs_object *parent, const char *path)
+{
+ return(glfs_h_unlink_with_xdata(fs, parent, path, NULL));
+}
+
struct glfs_fd *
-glfs_h_opendir (struct glfs *fs, struct glfs_object *object)
+glfs_h_opendir_with_xdata (struct glfs *fs, struct glfs_object *object, dict_t *dict)
{
int ret = -1;
struct glfs_fd *glfd = NULL;
@@ -754,7 +789,7 @@ glfs_h_opendir (struct glfs *fs, struct glfs_object *object)
GLFS_LOC_FILL_INODE (inode, loc, out);
/* fop/op */
- ret = syncop_opendir (subvol, &loc, glfd->fd);
+ ret = syncop_opendir_with_xdata (subvol, &loc, glfd->fd, dict);
out:
loc_wipe (&loc);
@@ -775,6 +810,12 @@ out:
return glfd;
}
+/*
+ * Open @object as a directory.  Thin wrapper that forwards to
+ * glfs_h_opendir_with_xdata() with no dict (NULL) and returns its result.
+ */
+struct glfs_fd *
+glfs_h_opendir (struct glfs *fs, struct glfs_object *object)
+{
+        struct glfs_fd *glfd = glfs_h_opendir_with_xdata (fs, object, NULL);
+
+        return glfd;
+}
+
ssize_t
glfs_h_extract_handle (struct glfs_object *object, unsigned char *handle,
int len)
@@ -951,21 +992,19 @@ out:
}
struct glfs_object *
-glfs_h_symlink (struct glfs *fs, struct glfs_object *parent, const char *name,
- const char *data, struct stat *stat)
+glfs_h_symlink_with_xdata (struct glfs *fs, struct glfs_object *parent, const char *name,
+ const char *data, struct stat *stat, uuid_t gfid, dict_t * xattr_req)
{
int ret = -1;
xlator_t *subvol = NULL;
inode_t *inode = NULL;
loc_t loc = {0, };
struct iatt iatt = {0, };
- uuid_t gfid;
- dict_t *xattr_req = NULL;
struct glfs_object *object = NULL;
/* validate in args */
if ((fs == NULL) || (parent == NULL) || (name == NULL) ||
- (data == NULL)) {
+ (data == NULL) || (xattr_req == NULL)) {
errno = EINVAL;
return NULL;
}
@@ -987,14 +1026,6 @@ glfs_h_symlink (struct glfs *fs, struct glfs_object *parent, const char *name,
goto out;
}
- xattr_req = dict_new ();
- if (!xattr_req) {
- ret = -1;
- errno = ENOMEM;
- goto out;
- }
-
- uuid_generate (gfid);
ret = dict_set_static_bin (xattr_req, "gfid-req", gfid, 16);
if (ret) {
ret = -1;
@@ -1044,6 +1075,23 @@ out:
return object;
}
+/*
+ * Handle-based symlink entry point: generates a gfid, allocates the request
+ * dict and delegates to glfs_h_symlink_with_xdata().
+ *
+ * Returns the callee's result.  Returns NULL with errno set to EINVAL when
+ * fs, parent, name or data is NULL, or ENOMEM when the dict cannot be
+ * allocated.
+ */
+struct glfs_object *
+glfs_h_symlink (struct glfs *fs, struct glfs_object *parent, const char *name,
+                const char *data, struct stat *stat)
+{
+        uuid_t  gfid;
+        dict_t *xattr_req = NULL;
+
+        /*
+         * Validate before allocating: glfs_h_symlink_with_xdata() returns
+         * NULL straight from its argument check, which would strand a dict
+         * created here.
+         */
+        if ((fs == NULL) || (parent == NULL) || (name == NULL) ||
+            (data == NULL)) {
+                errno = EINVAL;
+                return NULL;
+        }
+
+        xattr_req = dict_new ();
+        if (!xattr_req) {
+                errno = ENOMEM;
+                return NULL;
+        }
+
+        uuid_generate (gfid);
+
+        /*
+         * NOTE(review): xattr_req is presumed to be released on the callee's
+         * cleanup path -- confirm.
+         */
+        return glfs_h_symlink_with_xdata (fs, parent, name, data, stat,
+                                          gfid, xattr_req);
+}
+
int
glfs_h_readlink (struct glfs *fs, struct glfs_object *object, char *buf,
size_t bufsiz)
@@ -1101,8 +1149,8 @@ out:
}
int
-glfs_h_link (struct glfs *fs, struct glfs_object *linksrc,
- struct glfs_object *parent, const char *name)
+glfs_h_link_with_xdata (struct glfs *fs, struct glfs_object *linksrc,
+ struct glfs_object *parent, const char *name, dict_t *dict)
{
int ret = -1;
xlator_t *subvol = NULL;
@@ -1165,7 +1213,7 @@ glfs_h_link (struct glfs *fs, struct glfs_object *linksrc,
newloc.inode = inode_ref (inode);
/* fop/op */
- ret = syncop_link (subvol, &oldloc, &newloc);
+ ret = syncop_link_with_xdata (subvol, &oldloc, &newloc, dict);
if (ret == 0)
/* TODO: No iatt to pass as there has been no lookup */
@@ -1186,8 +1234,14 @@ out:
}
+/*
+ * Link @linksrc as @name under @parent.  Thin wrapper that forwards to
+ * glfs_h_link_with_xdata() with no dict (NULL).
+ */
int
-glfs_h_rename (struct glfs *fs, struct glfs_object *olddir, const char *oldname,
- struct glfs_object *newdir, const char *newname)
+glfs_h_link (struct glfs *fs, struct glfs_object *linksrc,
+             struct glfs_object *parent, const char *name)
+{
+        int ret = glfs_h_link_with_xdata (fs, linksrc, parent, name, NULL);
+
+        return ret;
+}
+
+int
+glfs_h_rename_with_xdata (struct glfs *fs, struct glfs_object *olddir, const char *oldname,
+ struct glfs_object *newdir, const char *newname, dict_t *dict)
{
int ret = -1;
xlator_t *subvol = NULL;
@@ -1255,7 +1309,7 @@ glfs_h_rename (struct glfs *fs, struct glfs_object *olddir, const char *oldname,
/* TODO: check if new or old is a prefix of the other, and fail EINVAL */
- ret = syncop_rename (subvol, &oldloc, &newloc);
+ ret = syncop_rename_with_xdata (subvol, &oldloc, &newloc, dict);
if (ret == 0)
inode_rename (oldloc.parent->table, oldloc.parent, oldloc.name,
@@ -1276,3 +1330,10 @@ out:
return ret;
}
+
+/*
+ * Rename @oldname in @olddir to @newname in @newdir.  Thin wrapper that
+ * forwards to glfs_h_rename_with_xdata() with no dict (NULL).
+ */
+int
+glfs_h_rename (struct glfs *fs, struct glfs_object *olddir, const char *oldname,
+               struct glfs_object *newdir, const char *newname)
+{
+        int ret = glfs_h_rename_with_xdata (fs, olddir, oldname, newdir,
+                                            newname, NULL);
+
+        return ret;
+}
diff --git a/api/src/glfs-handles.h b/api/src/glfs-handles.h
index bc26618c4..548268fd6 100644
--- a/api/src/glfs-handles.h
+++ b/api/src/glfs-handles.h
@@ -84,21 +84,42 @@ struct glfs_object *glfs_h_creat (struct glfs *fs, struct glfs_object *parent,
const char *path, int flags, mode_t mode,
struct stat *sb) __THROW;
+/*
+ * Variant of glfs_h_creat that additionally takes a gfid and an xattr_req
+ * dict from the caller.  Marked __THROW like the other prototypes here.
+ */
+struct glfs_object *glfs_h_creat_with_xdata (struct glfs *fs,
+                                             struct glfs_object *parent,
+                                             const char *path, int flags,
+                                             mode_t mode, struct stat *sb,
+                                             uuid_t gfid,
+                                             dict_t *xattr_req) __THROW;
+
struct glfs_object *glfs_h_mkdir (struct glfs *fs, struct glfs_object *parent,
const char *path, mode_t flags,
struct stat *sb) __THROW;
+/*
+ * Variant of glfs_h_mkdir that additionally takes a gfid and an xattr_req
+ * dict from the caller.  Marked __THROW like the other prototypes here.
+ */
+struct glfs_object *glfs_h_mkdir_with_xdata (struct glfs *fs,
+                                             struct glfs_object *parent,
+                                             const char *path, mode_t flags,
+                                             struct stat *sb, uuid_t gfid,
+                                             dict_t *xattr_req) __THROW;
+
struct glfs_object *glfs_h_mknod (struct glfs *fs, struct glfs_object *parent,
const char *path, mode_t mode, dev_t dev,
struct stat *sb) __THROW;
+/*
+ * Variant of glfs_h_mknod that takes the gfid and the xattr_req dict from
+ * the caller.  Marked __THROW like the other prototypes here.
+ */
+struct glfs_object *glfs_h_mknod_with_xdata (struct glfs *fs,
+                                             struct glfs_object *parent,
+                                             const char *path, mode_t mode,
+                                             dev_t dev, struct stat *sb,
+                                             uuid_t gfid,
+                                             dict_t *xattr_req) __THROW;
+
struct glfs_object *glfs_h_symlink (struct glfs *fs, struct glfs_object *parent,
const char *name, const char *data,
struct stat *stat) __THROW;
+struct glfs_object *glfs_h_symlink_with_xdata (struct glfs *fs,
+ struct glfs_object *parent,
+ const char *name,
+ const char *data,
+ struct stat *stat,
+ uuid_t gfid,
+ dict_t * xattr_req) __THROW;
/* Operations on the actual objects */
int glfs_h_unlink (struct glfs *fs, struct glfs_object *parent,
const char *path) __THROW;
+int glfs_h_unlink_with_xdata (struct glfs *fs, struct glfs_object *parent,
+ const char *path, dict_t *dict) __THROW;
int glfs_h_close (struct glfs_object *object) __THROW;
@@ -122,10 +143,16 @@ int glfs_h_readlink (struct glfs *fs, struct glfs_object *object, char *buf,
int glfs_h_link (struct glfs *fs, struct glfs_object *linktgt,
struct glfs_object *parent, const char *name) __THROW;
+int glfs_h_link_with_xdata (struct glfs *fs, struct glfs_object *linktgt,
+ struct glfs_object *parent, const char *name,
+ dict_t *dict) __THROW;
int glfs_h_rename (struct glfs *fs, struct glfs_object *olddir,
const char *oldname, struct glfs_object *newdir,
const char *newname) __THROW;
+int glfs_h_rename_with_xdata (struct glfs *fs, struct glfs_object *olddir,
+ const char *oldname, struct glfs_object *newdir,
+ const char *newname, dict_t *dict) __THROW;
/* Operations enabling opaque invariant handle to object transitions */
ssize_t glfs_h_extract_handle (struct glfs_object *object,
@@ -136,11 +163,17 @@ struct glfs_object *glfs_h_create_from_handle (struct glfs *fs,
struct stat *stat) __THROW;
/* Operations enabling object handles to fd transitions */
-struct glfs_fd *glfs_h_opendir (struct glfs *fs,
- struct glfs_object *object) __THROW;
+struct glfs_fd *glfs_h_opendir (struct glfs *fs, struct glfs_object *object)
+ __THROW;
+struct glfs_fd *glfs_h_opendir_with_xdata (struct glfs *fs,
+ struct glfs_object *object,
+ dict_t *dict) __THROW;
struct glfs_fd *glfs_h_open (struct glfs *fs, struct glfs_object *object,
int flags) __THROW;
+struct glfs_fd *glfs_h_open_with_xdata (struct glfs *fs,
+ struct glfs_object *object, int flags,
+ dict_t *dict) __THROW;
__END_DECLS
diff --git a/api/src/glfs.h b/api/src/glfs.h
index 20fb18c9e..d79385792 100644
--- a/api/src/glfs.h
+++ b/api/src/glfs.h
@@ -354,8 +354,11 @@ glfs_fd_t *glfs_open (glfs_t *fs, const char *path, int flags) __THROW;
glfs_fd_t *glfs_creat (glfs_t *fs, const char *path, int flags,
mode_t mode) __THROW;
+glfs_fd_t *glfs_creat_with_xdata (glfs_t *fs, const char *path, int flags,
+ mode_t mode, uuid_t gfid, dict_t *dict) __THROW;
int glfs_close (glfs_fd_t *fd) __THROW;
+int glfs_close_with_xdata (glfs_fd_t *fd, dict_t *dict) __THROW;
glfs_t *glfs_from_glfd (glfs_fd_t *fd) __THROW;
@@ -389,10 +392,13 @@ typedef void (*glfs_io_cbk) (glfs_fd_t *fd, ssize_t ret, void *data);
// glfs_{read,write}[_async]
-ssize_t glfs_read (glfs_fd_t *fd, void *buf,
- size_t count, int flags) __THROW;
-ssize_t glfs_write (glfs_fd_t *fd, const void *buf,
- size_t count, int flags) __THROW;
+ssize_t glfs_read (glfs_fd_t *fd, void *buf, size_t count, int flags) __THROW;
+ssize_t glfs_read_with_xdata (struct glfs_fd *glfd, void *buf, size_t count,
+ int flags, dict_t *dict) __THROW;
+ssize_t glfs_write (glfs_fd_t *fd, const void *buf, size_t count, int flags)
+ __THROW;
+ssize_t glfs_write_with_xdata (glfs_fd_t *fd, const void *buf, size_t count,
+ int flags, dict_t *dict) __THROW;
int glfs_read_async (glfs_fd_t *fd, void *buf, size_t count, int flags,
glfs_io_cbk fn, void *data) __THROW;
int glfs_write_async (glfs_fd_t *fd, const void *buf, size_t count, int flags,
@@ -404,6 +410,8 @@ ssize_t glfs_readv (glfs_fd_t *fd, const struct iovec *iov, int iovcnt,
int flags) __THROW;
ssize_t glfs_writev (glfs_fd_t *fd, const struct iovec *iov, int iovcnt,
int flags) __THROW;
+ssize_t glfs_writev_with_xdata (glfs_fd_t *fd, const struct iovec *iov,
+ int iovcnt, int flags, dict_t *dict) __THROW;
int glfs_readv_async (glfs_fd_t *fd, const struct iovec *iov, int count,
int flags, glfs_io_cbk fn, void *data) __THROW;
int glfs_writev_async (glfs_fd_t *fd, const struct iovec *iov, int count,
@@ -424,29 +432,42 @@ int glfs_pwrite_async (glfs_fd_t *fd, const void *buf, int count, off_t offset,
ssize_t glfs_preadv (glfs_fd_t *fd, const struct iovec *iov, int iovcnt,
off_t offset, int flags) __THROW;
+ssize_t glfs_preadv_with_xdata (glfs_fd_t *fd, const struct iovec *iov,
+ int iovcnt, off_t offset, int flags,
+ dict_t *dict) __THROW;
ssize_t glfs_pwritev (glfs_fd_t *fd, const struct iovec *iov, int iovcnt,
off_t offset, int flags) __THROW;
-int glfs_preadv_async (glfs_fd_t *fd, const struct iovec *iov,
- int count, off_t offset, int flags,
- glfs_io_cbk fn, void *data) __THROW;
-int glfs_pwritev_async (glfs_fd_t *fd, const struct iovec *iov,
- int count, off_t offset, int flags,
- glfs_io_cbk fn, void *data) __THROW;
+ssize_t glfs_pwritev_with_xdata (glfs_fd_t *fd, const struct iovec *iov,
+ int iovcnt, off_t offset, int flags,
+ dict_t *dict) __THROW;
+int glfs_preadv_async (glfs_fd_t *fd, const struct iovec *iov, int count,
+ off_t offset, int flags, glfs_io_cbk fn, void *data)
+ __THROW;
+int glfs_pwritev_async (glfs_fd_t *fd, const struct iovec *iov, int count,
+ off_t offset, int flags, glfs_io_cbk fn, void *data)
+ __THROW;
off_t glfs_lseek (glfs_fd_t *fd, off_t offset, int whence) __THROW;
+off_t glfs_lseek_with_xdata (glfs_fd_t *fd, off_t offset, int whence,
+ dict_t *dict) __THROW;
int glfs_truncate (glfs_t *fs, const char *path, off_t length) __THROW;
int glfs_ftruncate (glfs_fd_t *fd, off_t length) __THROW;
+int glfs_ftruncate_with_xdata (glfs_fd_t *fd, off_t length, dict_t *dict)
+ __THROW;
int glfs_ftruncate_async (glfs_fd_t *fd, off_t length, glfs_io_cbk fn,
void *data) __THROW;
int glfs_lstat (glfs_t *fs, const char *path, struct stat *buf) __THROW;
int glfs_stat (glfs_t *fs, const char *path, struct stat *buf) __THROW;
int glfs_fstat (glfs_fd_t *fd, struct stat *buf) __THROW;
+int glfs_fstat_with_xdata (glfs_fd_t *fd, struct stat *buf, dict_t *dict)
+ __THROW;
int glfs_fsync (glfs_fd_t *fd) __THROW;
+int glfs_fsync_with_xdata (glfs_fd_t *fd, dict_t *dict) __THROW;
int glfs_fsync_async (glfs_fd_t *fd, glfs_io_cbk fn, void *data) __THROW;
int glfs_fdatasync (glfs_fd_t *fd) __THROW;
@@ -454,22 +475,35 @@ int glfs_fdatasync_async (glfs_fd_t *fd, glfs_io_cbk fn, void *data) __THROW;
int glfs_access (glfs_t *fs, const char *path, int mode) __THROW;
-int glfs_symlink (glfs_t *fs, const char *oldpath, const char *newpath) __THROW;
+int glfs_symlink (glfs_t *fs, const char *oldpath, const char *newpath)
+ __THROW;
+int glfs_symlink_with_xdata (glfs_t *fs, const char *oldpath,
+ const char *newpath, uuid_t gfid, dict_t *dict)
+ __THROW;
int glfs_readlink (glfs_t *fs, const char *path,
char *buf, size_t bufsiz) __THROW;
int glfs_mknod (glfs_t *fs, const char *path, mode_t mode, dev_t dev) __THROW;
+int glfs_mknod_with_xdata (glfs_t *fs, const char *path, mode_t mode,
+ dev_t dev, uuid_t gfid, dict_t *dict) __THROW;
int glfs_mkdir (glfs_t *fs, const char *path, mode_t mode) __THROW;
+int glfs_mkdir_with_xdata (glfs_t *fs, const char *path, mode_t mode,
+ uuid_t gfid, dict_t *dict) __THROW;
int glfs_unlink (glfs_t *fs, const char *path) __THROW;
int glfs_rmdir (glfs_t *fs, const char *path) __THROW;
+int glfs_rmdir_with_xdata (glfs_t *fs, const char *path, dict_t *dict) __THROW;
int glfs_rename (glfs_t *fs, const char *oldpath, const char *newpath) __THROW;
+int glfs_rename_with_xdata (glfs_t *fs, const char *oldpath,
+ const char *newpath, dict_t *dict) __THROW;
int glfs_link (glfs_t *fs, const char *oldpath, const char *newpath) __THROW;
+int glfs_link_with_xdata (glfs_t *fs, const char *oldpath, const char *newpath,
+ dict_t *dict) __THROW;
glfs_fd_t *glfs_opendir (glfs_t *fs, const char *path) __THROW;
@@ -532,6 +566,9 @@ ssize_t glfs_lgetxattr (glfs_t *fs, const char *path, const char *name,
ssize_t glfs_fgetxattr (glfs_fd_t *fd, const char *name,
void *value, size_t size) __THROW;
+ssize_t glfs_fgetxattr_with_xdata (glfs_fd_t *fd, const char *name,
+ void *value, size_t size, dict_t *dict)
+ __THROW;
ssize_t glfs_listxattr (glfs_t *fs, const char *path,
void *value, size_t size) __THROW;
@@ -540,21 +577,34 @@ ssize_t glfs_llistxattr (glfs_t *fs, const char *path, void *value,
size_t size) __THROW;
ssize_t glfs_flistxattr (glfs_fd_t *fd, void *value, size_t size) __THROW;
+ssize_t glfs_flistxattr_with_xdata (glfs_fd_t *fd, void *value, size_t size,
+ dict_t *dict) __THROW;
int glfs_setxattr (glfs_t *fs, const char *path, const char *name,
const void *value, size_t size, int flags) __THROW;
+/* glfs_setxattr with an extra dict argument; __THROW matches its siblings. */
+int glfs_setxattr_with_xdata (glfs_t *fs, const char *path, const char *name,
+                              const void *value, size_t size, int flags,
+                              dict_t *dict) __THROW;
+
int glfs_lsetxattr (glfs_t *fs, const char *path, const char *name,
const void *value, size_t size, int flags) __THROW;
int glfs_fsetxattr (glfs_fd_t *fd, const char *name,
const void *value, size_t size, int flags) __THROW;
+int glfs_fsetxattr_with_xdata (glfs_fd_t *fd, const char *name,
+ const void *value, size_t size, int flags,
+ dict_t *dict) __THROW;
+
int glfs_removexattr (glfs_t *fs, const char *path, const char *name) __THROW;
+int glfs_removexattr_with_xdata (glfs_t *fs, const char *path,
+ const char *name, dict_t *dict) __THROW;
int glfs_lremovexattr (glfs_t *fs, const char *path, const char *name) __THROW;
int glfs_fremovexattr (glfs_fd_t *fd, const char *name) __THROW;
+int glfs_fremovexattr_with_xdata (glfs_fd_t *fd, const char *name,
+ dict_t *dict) __THROW;
int glfs_fallocate(glfs_fd_t *fd, int keep_size,
off_t offset, size_t len) __THROW;
@@ -578,6 +628,20 @@ int glfs_fchdir (glfs_fd_t *fd) __THROW;
char *glfs_realpath (glfs_t *fs, const char *path, char *resolved_path) __THROW;
+/*
+ * Attribute setters taking a struct iatt and a 'valid' selector; the
+ * _with_xdata variants additionally accept a dict.  Declared __THROW like
+ * the rest of this header.
+ * NOTE(review): struct iatt and dict_t are library-internal types -- confirm
+ * they are visible to consumers of this header.
+ */
+int glfs_setattr_with_xdata (struct glfs *fs, const char *path,
+                             struct iatt *iatt, int valid, int follow,
+                             dict_t *dict) __THROW;
+int glfs_fsetattr_with_xdata (struct glfs_fd *glfd, struct iatt *iatt,
+                              int valid, dict_t *dict) __THROW;
+int glfs_setattr (struct glfs *fs, const char *path, struct iatt *iatt,
+                  int valid, int follow) __THROW;
+int glfs_fsetattr (struct glfs_fd *glfd, struct iatt *iatt, int valid)
+        __THROW;
+
/*
* @cmd and @flock are as specified in man fcntl(2).
*/
diff --git a/configure.ac b/configure.ac
index 7bfee047a..581b976a0 100644
--- a/configure.ac
+++ b/configure.ac
@@ -58,6 +58,12 @@ AC_CONFIG_FILES([Makefile
xlators/cluster/Makefile
xlators/cluster/afr/Makefile
xlators/cluster/afr/src/Makefile
+ xlators/cluster/nsr-server/Makefile
+ xlators/cluster/nsr-server/src/Makefile
+ xlators/cluster/nsr-recon/Makefile
+ xlators/cluster/nsr-recon/src/Makefile
+ xlators/cluster/nsr-client/Makefile
+ xlators/cluster/nsr-client/src/Makefile
xlators/cluster/stripe/Makefile
xlators/cluster/stripe/src/Makefile
xlators/cluster/dht/Makefile
diff --git a/glusterfs.spec.in b/glusterfs.spec.in
index f7c2fc5b8..e6dbd5f35 100644
--- a/glusterfs.spec.in
+++ b/glusterfs.spec.in
@@ -661,6 +661,8 @@ find ./tests ./run-tests.sh -type f | cpio -pd %{buildroot}%{_prefix}/share/glus
%exclude %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/protocol/server*
%exclude %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/mgmt*
%exclude %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/nfs*
+%exclude %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/cluster/nsr.so
+%exclude %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/cluster/nsr_recon.so
# sample xlators not generally used or usable
%exclude %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/encryption/rot-13*
%exclude %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/features/mac-compat*
@@ -745,6 +747,8 @@ fi
%{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/protocol/server*
%{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/mgmt*
%{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/nfs*
+%{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/cluster/nsr.so
+%{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/cluster/nsr_recon.so
%ghost %attr(0644,-,-) %config(noreplace) %{_sharedstatedir}/glusterd/glusterd.info
%ghost %attr(0600,-,-) %{_sharedstatedir}/glusterd/options
# This is really ugly, but I have no idea how to mark these directories in an
diff --git a/libglusterfs/src/call-stub.c b/libglusterfs/src/call-stub.c
index 7e94ee3c0..86c2463ef 100644
--- a/libglusterfs/src/call-stub.c
+++ b/libglusterfs/src/call-stub.c
@@ -2297,7 +2297,7 @@ out:
}
-static void
+void
call_resume_wind (call_stub_t *stub)
{
GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
diff --git a/libglusterfs/src/call-stub.h b/libglusterfs/src/call-stub.h
index 0f6c108ee..ccf92cf53 100644
--- a/libglusterfs/src/call-stub.h
+++ b/libglusterfs/src/call-stub.h
@@ -764,4 +764,10 @@ fop_zerofill_cbk_stub(call_frame_t *frame,
void call_resume (call_stub_t *stub);
void call_stub_destroy (call_stub_t *stub);
void call_unwind_error (call_stub_t *stub, int op_ret, int op_errno);
+
+/*
+ * Sometimes we might want to call just this, perhaps repeatedly, without
+ * having (or being able) to destroy and recreate the stub.
+ */
+void call_resume_wind (call_stub_t *stub);
#endif
diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h
index dfe443016..33d2087fc 100644
--- a/libglusterfs/src/glusterfs.h
+++ b/libglusterfs/src/glusterfs.h
@@ -200,7 +200,7 @@ typedef enum {
GF_FOP_WRITE,
GF_FOP_STATFS,
GF_FOP_FLUSH,
- GF_FOP_FSYNC, /* 15 */
+ GF_FOP_FSYNC, /* 16 */
GF_FOP_SETXATTR,
GF_FOP_GETXATTR,
GF_FOP_REMOVEXATTR,
diff --git a/libglusterfs/src/list.h b/libglusterfs/src/list.h
index 7f3712b51..6fcf17f35 100644
--- a/libglusterfs/src/list.h
+++ b/libglusterfs/src/list.h
@@ -187,4 +187,18 @@ list_append_init (struct list_head *list, struct list_head *head)
&pos->member != (head); \
pos = n, n = list_entry(n->member.prev, typeof(*n), member))
+/*
+ * This list implementation has some advantages, but one disadvantage: you
+ * can't use NULL to check whether you're at the head or tail. Thus, the
+ * address of the head has to be an argument for these macros.
+ */
+
+/*
+ * Entry that follows @ptr on the list anchored at @head, or NULL when @ptr
+ * is the last entry.  @head is the address of the list head.  @ptr is
+ * expanded twice, so it must not have side effects.
+ */
+#define list_next(ptr, head, type, member)                              \
+        (((ptr)->member.next == (head)) ? NULL                          \
+         : list_entry((ptr)->member.next, type, member))
+
+/*
+ * Entry that precedes @ptr on the list anchored at @head, or NULL when @ptr
+ * is the first entry.  @head is the address of the list head.  @ptr is
+ * expanded twice, so it must not have side effects.
+ */
+#define list_prev(ptr, head, type, member)                              \
+        (((ptr)->member.prev == (head)) ? NULL                          \
+         : list_entry((ptr)->member.prev, type, member))
+
#endif /* _LLIST_H */
diff --git a/libglusterfs/src/syncop.c b/libglusterfs/src/syncop.c
index 1f36e5776..a4a5596c3 100644
--- a/libglusterfs/src/syncop.c
+++ b/libglusterfs/src/syncop.c
@@ -1159,6 +1159,22 @@ syncop_opendir (xlator_t *subvol,
}
+/*
+ * Run the opendir fop of @subvol through SYNCOP, passing @dict as the fop's
+ * last argument.  errno is set from the recorded op_errno and the recorded
+ * op_ret is returned.
+ */
int
+syncop_opendir_with_xdata (xlator_t *subvol, loc_t *loc, fd_t *fd,
+                           dict_t *dict)
+{
+        struct syncargs args = {0, };
+        int             ret  = -1;
+
+        SYNCOP (subvol, (&args), syncop_opendir_cbk, subvol->fops->opendir,
+                loc, fd, dict);
+
+        ret   = args.op_ret;
+        errno = args.op_errno;
+
+        return ret;
+}
+
+int
syncop_fsyncdir_cbk (call_frame_t *frame, void* cookie, xlator_t *this,
int op_ret, int op_errno, dict_t *xdata)
{
@@ -1205,10 +1221,16 @@ syncop_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int
syncop_removexattr (xlator_t *subvol, loc_t *loc, const char *name)
{
+ return(syncop_removexattr_with_xdata(subvol, loc, name, NULL));
+}
+
+int
+syncop_removexattr_with_xdata (xlator_t *subvol, loc_t *loc, const char *name, dict_t *dict)
+{
struct syncargs args = {0, };
SYNCOP (subvol, (&args), syncop_removexattr_cbk, subvol->fops->removexattr,
- loc, name, NULL);
+ loc, name, dict);
errno = args.op_errno;
return args.op_ret;
@@ -1243,6 +1265,17 @@ syncop_fremovexattr (xlator_t *subvol, fd_t *fd, const char *name)
}
+/*
+ * Run the fremovexattr fop of @subvol through SYNCOP for attribute @name on
+ * @fd, passing @dict as the fop's last argument.  errno is set from the
+ * recorded op_errno and the recorded op_ret is returned.
+ */
int
+syncop_fremovexattr_with_xdata (xlator_t *subvol, fd_t *fd, const char *name,
+                                dict_t *dict)
+{
+        struct syncargs args = {0, };
+        int             ret  = -1;
+
+        SYNCOP (subvol, (&args), syncop_fremovexattr_cbk,
+                subvol->fops->fremovexattr, fd, name, dict);
+
+        ret   = args.op_ret;
+        errno = args.op_errno;
+
+        return ret;
+}
+
+int
syncop_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, dict_t *xdata)
{
@@ -1258,14 +1291,19 @@ syncop_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
return 0;
}
-
int
syncop_setxattr (xlator_t *subvol, loc_t *loc, dict_t *dict, int32_t flags)
{
+ return (syncop_setxattr_with_xdata(subvol, loc, dict, flags, NULL));
+}
+
+int
+syncop_setxattr_with_xdata (xlator_t *subvol, loc_t *loc, dict_t *dict, int32_t flags, dict_t *extra)
+{
struct syncargs args = {0, };
SYNCOP (subvol, (&args), syncop_setxattr_cbk, subvol->fops->setxattr,
- loc, dict, flags, NULL);
+ loc, dict, flags, extra);
errno = args.op_errno;
return args.op_ret;
@@ -1301,6 +1339,18 @@ syncop_fsetxattr (xlator_t *subvol, fd_t *fd, dict_t *dict, int32_t flags)
}
+/*
+ * Run the fsetxattr fop of @subvol through SYNCOP with the attributes in
+ * @dict and @flags, passing @extra as the fop's last argument.  errno is set
+ * from the recorded op_errno and the recorded op_ret is returned.
+ */
int
+syncop_fsetxattr_with_xdata (xlator_t *subvol, fd_t *fd, dict_t *dict,
+                             int32_t flags, dict_t *extra)
+{
+        struct syncargs args = {0, };
+        int             ret  = -1;
+
+        SYNCOP (subvol, (&args), syncop_fsetxattr_cbk, subvol->fops->fsetxattr,
+                fd, dict, flags, extra);
+
+        ret   = args.op_ret;
+        errno = args.op_errno;
+
+        return ret;
+}
+
+int
syncop_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, dict_t *dict, dict_t *xdata)
{
@@ -1353,12 +1403,12 @@ syncop_getxattr (xlator_t *subvol, loc_t *loc, dict_t **dict, const char *key)
}
int
-syncop_fgetxattr (xlator_t *subvol, fd_t *fd, dict_t **dict, const char *key)
+syncop_fgetxattr_with_xdata (xlator_t *subvol, fd_t *fd, dict_t **dict, const char *key, dict_t *extra)
{
struct syncargs args = {0, };
SYNCOP (subvol, (&args), syncop_getxattr_cbk, subvol->fops->fgetxattr,
- fd, key, NULL);
+ fd, key, extra);
if (dict)
*dict = args.xattr;
@@ -1370,6 +1420,12 @@ syncop_fgetxattr (xlator_t *subvol, fd_t *fd, dict_t **dict, const char *key)
}
+/*
+ * fgetxattr without extra xdata: forwards to syncop_fgetxattr_with_xdata()
+ * with a NULL dict for the final argument.
+ */
int
+syncop_fgetxattr (xlator_t *subvol, fd_t *fd, dict_t **dict, const char *key)
+{
+        int ret = syncop_fgetxattr_with_xdata (subvol, fd, dict, key, NULL);
+
+        return ret;
+}
+
+int
syncop_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
struct statvfs *buf, dict_t *xdata)
@@ -1432,13 +1488,13 @@ syncop_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int
-syncop_setattr (xlator_t *subvol, loc_t *loc, struct iatt *iatt, int valid,
- struct iatt *preop, struct iatt *postop)
+syncop_setattr_with_xdata (xlator_t *subvol, loc_t *loc, struct iatt *iatt, int valid,
+ struct iatt *preop, struct iatt *postop, dict_t *dict)
{
struct syncargs args = {0, };
SYNCOP (subvol, (&args), syncop_setattr_cbk, subvol->fops->setattr,
- loc, iatt, valid, NULL);
+ loc, iatt, valid, dict);
if (preop)
*preop = args.iatt1;
@@ -1449,15 +1505,21 @@ syncop_setattr (xlator_t *subvol, loc_t *loc, struct iatt *iatt, int valid,
return args.op_ret;
}
+/*
+ * setattr without extra xdata: forwards to syncop_setattr_with_xdata() with
+ * a NULL dict for the final argument.
+ */
+int
+syncop_setattr (xlator_t *subvol, loc_t *loc, struct iatt *iatt, int valid,
+                struct iatt *preop, struct iatt *postop)
+{
+        int ret = syncop_setattr_with_xdata (subvol, loc, iatt, valid,
+                                             preop, postop, NULL);
+
+        return ret;
+}
int
-syncop_fsetattr (xlator_t *subvol, fd_t *fd, struct iatt *iatt, int valid,
- struct iatt *preop, struct iatt *postop)
+syncop_fsetattr_with_xdata (xlator_t *subvol, fd_t *fd, struct iatt *iatt, int valid,
+ struct iatt *preop, struct iatt *postop, dict_t *dict)
{
struct syncargs args = {0, };
SYNCOP (subvol, (&args), syncop_setattr_cbk, subvol->fops->fsetattr,
- fd, iatt, valid, NULL);
+ fd, iatt, valid, dict);
if (preop)
*preop = args.iatt1;
@@ -1468,6 +1530,12 @@ syncop_fsetattr (xlator_t *subvol, fd_t *fd, struct iatt *iatt, int valid,
return args.op_ret;
}
+/*
+ * fsetattr without extra xdata: forwards to syncop_fsetattr_with_xdata()
+ * with a NULL dict for the final argument.
+ */
+int
+syncop_fsetattr (xlator_t *subvol, fd_t *fd, struct iatt *iatt, int valid,
+                 struct iatt *preop, struct iatt *postop)
+{
+        int ret = syncop_fsetattr_with_xdata (subvol, fd, iatt, valid,
+                                              preop, postop, NULL);
+
+        return ret;
+}
int32_t
syncop_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
@@ -1498,6 +1566,19 @@ syncop_open (xlator_t *subvol, loc_t *loc, int32_t flags, fd_t *fd)
}
+/*
+ * Run the open fop of @subvol through SYNCOP for @loc with @flags and @fd,
+ * passing @dict as the fop's last argument.  errno is set from the recorded
+ * op_errno and the recorded op_ret is returned.
+ */
+int
+syncop_open_with_xdata (xlator_t *subvol, loc_t *loc, int32_t flags,
+                        fd_t *fd, dict_t *dict)
+{
+        struct syncargs args = {0, };
+        int             ret  = -1;
+
+        SYNCOP (subvol, (&args), syncop_open_cbk, subvol->fops->open,
+                loc, flags, fd, dict);
+
+        ret   = args.op_ret;
+        errno = args.op_errno;
+
+        return ret;
+}
+
int32_t
syncop_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
@@ -1528,14 +1609,14 @@ syncop_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
int
-syncop_readv (xlator_t *subvol, fd_t *fd, size_t size, off_t off,
+syncop_readv_with_xdata (xlator_t *subvol, fd_t *fd, size_t size, off_t off,
uint32_t flags, struct iovec **vector, int *count,
- struct iobref **iobref)
+ struct iobref **iobref, dict_t *dict)
{
struct syncargs args = {0, };
SYNCOP (subvol, (&args), syncop_readv_cbk, subvol->fops->readv,
- fd, size, off, flags, NULL);
+ fd, size, off, flags, dict);
if (args.op_ret < 0)
goto out;
@@ -1561,6 +1642,14 @@ out:
}
+/*
+ * readv without extra xdata: forwards to syncop_readv_with_xdata() with a
+ * NULL dict for the final argument.
+ */
int
+syncop_readv (xlator_t *subvol, fd_t *fd, size_t size, off_t off,
+              uint32_t flags, struct iovec **vector, int *count,
+              struct iobref **iobref)
+{
+        int ret = syncop_readv_with_xdata (subvol, fd, size, off, flags,
+                                           vector, count, iobref, NULL);
+
+        return ret;
+}
+
+int
syncop_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, struct iatt *prebuf,
struct iatt *postbuf, dict_t *xdata)
@@ -1578,20 +1667,28 @@ syncop_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
int
-syncop_writev (xlator_t *subvol, fd_t *fd, const struct iovec *vector,
+syncop_writev_with_xdata (xlator_t *subvol, fd_t *fd, const struct iovec *vector,
int32_t count, off_t offset, struct iobref *iobref,
- uint32_t flags)
+ uint32_t flags, dict_t *dict)
{
struct syncargs args = {0, };
SYNCOP (subvol, (&args), syncop_writev_cbk, subvol->fops->writev,
fd, (struct iovec *) vector, count, offset, flags, iobref,
- NULL);
+ dict);
errno = args.op_errno;
return args.op_ret;
}
+/*
+ * writev without extra xdata: forwards to syncop_writev_with_xdata() with a
+ * NULL dict for the final argument.
+ */
+int
+syncop_writev (xlator_t *subvol, fd_t *fd, const struct iovec *vector,
+               int32_t count, off_t offset, struct iobref *iobref,
+               uint32_t flags)
+{
+        int ret = syncop_writev_with_xdata (subvol, fd, vector, count, offset,
+                                            iobref, flags, NULL);
+
+        return ret;
+}
+
int syncop_write (xlator_t *subvol, fd_t *fd, const char *buf, int size,
off_t offset, struct iobref *iobref, uint32_t flags)
{
@@ -1685,6 +1782,18 @@ syncop_unlink (xlator_t *subvol, loc_t *loc)
}
+/*
+ * Run the unlink fop of @subvol through SYNCOP for @loc.  A literal 0 fills
+ * the fop's middle argument and @dict is passed last.  errno is set from the
+ * recorded op_errno and the recorded op_ret is returned.
+ */
int
+syncop_unlink_with_xdata (xlator_t *subvol, loc_t *loc, dict_t *dict)
+{
+        struct syncargs args = {0, };
+        int             ret  = -1;
+
+        SYNCOP (subvol, (&args), syncop_unlink_cbk, subvol->fops->unlink,
+                loc, 0, dict);
+
+        ret   = args.op_ret;
+        errno = args.op_errno;
+
+        return ret;
+}
+
+int
syncop_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
@@ -1715,6 +1824,18 @@ syncop_rmdir (xlator_t *subvol, loc_t *loc, int flags)
+/*
+ * Run the rmdir fop of @subvol through SYNCOP for @loc with @flags, passing
+ * @dict as the fop's last argument.  errno is set from the recorded op_errno
+ * and the recorded op_ret is returned.
+ */
int
+syncop_rmdir_with_xdata (xlator_t *subvol, loc_t *loc, int flags,
+                         dict_t *dict)
+{
+        struct syncargs args = {0, };
+        int             ret  = -1;
+
+        SYNCOP (subvol, (&args), syncop_rmdir_cbk, subvol->fops->rmdir,
+                loc, flags, dict);
+
+        ret   = args.op_ret;
+        errno = args.op_errno;
+
+        return ret;
+}
+
+int
syncop_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, inode_t *inode,
struct iatt *buf, struct iatt *preparent,
@@ -1746,6 +1867,18 @@ syncop_link (xlator_t *subvol, loc_t *oldloc, loc_t *newloc)
return args.op_ret;
}
+/*
+ * Run the link fop of @subvol through SYNCOP from @oldloc to @newloc,
+ * passing @dict as the fop's last argument.  errno is set from the recorded
+ * op_errno and the recorded op_ret is returned.
+ */
+int
+syncop_link_with_xdata (xlator_t *subvol, loc_t *oldloc, loc_t *newloc,
+                        dict_t *dict)
+{
+        struct syncargs args = {0, };
+        int             ret  = -1;
+
+        SYNCOP (subvol, (&args), syncop_link_cbk, subvol->fops->link,
+                oldloc, newloc, dict);
+
+        ret   = args.op_ret;
+        errno = args.op_errno;
+
+        return ret;
+}
int
syncop_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
@@ -1780,6 +1913,19 @@ syncop_rename (xlator_t *subvol, loc_t *oldloc, loc_t *newloc)
return args.op_ret;
}
+/*
+ * Run the rename fop of @subvol through SYNCOP from @oldloc to @newloc,
+ * passing @dict as the fop's last argument.  errno is set from the recorded
+ * op_errno and the recorded op_ret is returned.
+ */
+int
+syncop_rename_with_xdata (xlator_t *subvol, loc_t *oldloc, loc_t *newloc,
+                          dict_t *dict)
+{
+        struct syncargs args = {0, };
+        int             ret  = -1;
+
+        SYNCOP (subvol, (&args), syncop_rename_cbk, subvol->fops->rename,
+                oldloc, newloc, dict);
+
+        ret   = args.op_ret;
+        errno = args.op_errno;
+
+        return ret;
+}
+
int
syncop_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
@@ -1811,6 +1957,18 @@ syncop_ftruncate (xlator_t *subvol, fd_t *fd, off_t offset)
}
+/*
+ * Run the ftruncate fop of @subvol through SYNCOP on @fd with @offset,
+ * passing @dict as the fop's last argument.  errno is set from the recorded
+ * op_errno and the recorded op_ret is returned.
+ */
int
+syncop_ftruncate_with_xdata (xlator_t *subvol, fd_t *fd, off_t offset,
+                             dict_t *dict)
+{
+        struct syncargs args = {0, };
+        int             ret  = -1;
+
+        SYNCOP (subvol, (&args), syncop_ftruncate_cbk, subvol->fops->ftruncate,
+                fd, offset, dict);
+
+        ret   = args.op_ret;
+        errno = args.op_errno;
+
+        return ret;
+}
+
+int
syncop_truncate (xlator_t *subvol, loc_t *loc, off_t offset)
{
struct syncargs args = {0, };
@@ -1853,6 +2011,19 @@ syncop_fsync (xlator_t *subvol, fd_t *fd, int dataonly)
}
+/*
+ * Run the fsync fop of @subvol through SYNCOP on @fd with the @dataonly
+ * flag, passing @dict as the fop's last argument.  errno is set from the
+ * recorded op_errno and the recorded op_ret is returned.
+ */
+int
+syncop_fsync_with_xdata (xlator_t *subvol, fd_t *fd, int dataonly,
+                         dict_t *dict)
+{
+        struct syncargs args = {0, };
+        int             ret  = -1;
+
+        SYNCOP (subvol, (&args), syncop_fsync_cbk, subvol->fops->fsync,
+                fd, dataonly, dict);
+
+        ret   = args.op_ret;
+        errno = args.op_errno;
+
+        return ret;
+}
+
int
syncop_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
@@ -1885,6 +2056,19 @@ syncop_flush (xlator_t *subvol, fd_t *fd)
}
+/*
+ * Run the flush fop of @subvol through SYNCOP on @fd, passing @dict as the
+ * fop's last argument.  errno is set from the recorded op_errno and the
+ * recorded op_ret is returned.
+ */
int
+syncop_flush_with_xdata (xlator_t *subvol, fd_t *fd, dict_t *dict)
+{
+        struct syncargs args = {0};
+        int             ret  = -1;
+
+        SYNCOP (subvol, (&args), syncop_flush_cbk, subvol->fops->flush,
+                fd, dict);
+
+        ret   = args.op_ret;
+        errno = args.op_errno;
+
+        return ret;
+}
+
+int
syncop_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *stbuf, dict_t *xdata)
{
@@ -1918,6 +2102,21 @@ syncop_fstat (xlator_t *subvol, fd_t *fd, struct iatt *stbuf)
return args.op_ret;
}
+/*
+ * Run the fstat fop of @subvol through SYNCOP on @fd, passing @dict as the
+ * fop's last argument.  When @stbuf is non-NULL it receives the iatt1 member
+ * of the syncargs.  errno is set from the recorded op_errno and the recorded
+ * op_ret is returned.
+ */
+int
+syncop_fstat_with_xdata (xlator_t *subvol, fd_t *fd, struct iatt *stbuf,
+                         dict_t *dict)
+{
+        struct syncargs args = {0, };
+        int             ret  = -1;
+
+        SYNCOP (subvol, (&args), syncop_fstat_cbk, subvol->fops->fstat,
+                fd, dict);
+
+        if (stbuf != NULL)
+                *stbuf = args.iatt1;
+
+        ret   = args.op_ret;
+        errno = args.op_errno;
+
+        return ret;
+}
int
syncop_stat (xlator_t *subvol, loc_t *loc, struct iatt *stbuf)
diff --git a/libglusterfs/src/syncop.h b/libglusterfs/src/syncop.h
index 68218bb17..87985588f 100644
--- a/libglusterfs/src/syncop.h
+++ b/libglusterfs/src/syncop.h
@@ -344,49 +344,79 @@ int syncop_readdir (xlator_t *subvol, fd_t *fd, size_t size, off_t off,
gf_dirent_t *entries);
int syncop_opendir (xlator_t *subvol, loc_t *loc, fd_t *fd);
+int syncop_opendir_with_xdata (xlator_t *subvol, loc_t *loc, fd_t *fd, dict_t *dict);
int syncop_setattr (xlator_t *subvol, loc_t *loc, struct iatt *iatt, int valid,
/* out */
struct iatt *preop, struct iatt *postop);
+int syncop_setattr_with_xdata (xlator_t *subvol, loc_t *loc, struct iatt *iatt, int valid,
+ /* out */
+ struct iatt *preop, struct iatt *postop, dict_t *dict);
+
int syncop_fsetattr (xlator_t *subvol, fd_t *fd, struct iatt *iatt, int valid,
/* out */
struct iatt *preop, struct iatt *postop);
+int syncop_fsetattr_with_xdata (xlator_t *subvol, fd_t *fd, struct iatt *iatt, int valid,
+ /* out */
+ struct iatt *preop, struct iatt *postop, dict_t *dict);
+
int syncop_statfs (xlator_t *subvol, loc_t *loc, struct statvfs *buf);
int syncop_setxattr (xlator_t *subvol, loc_t *loc, dict_t *dict, int32_t flags);
+int syncop_setxattr_with_xdata (xlator_t *subvol, loc_t *loc, dict_t *dict, int32_t flags, dict_t *extra);
int syncop_fsetxattr (xlator_t *subvol, fd_t *fd, dict_t *dict, int32_t flags);
+int syncop_fsetxattr_with_xdata (xlator_t *subvol, fd_t *fd, dict_t *dict, int32_t flags, dict_t *extra);
int syncop_listxattr (xlator_t *subvol, loc_t *loc, dict_t **dict);
int syncop_getxattr (xlator_t *xl, loc_t *loc, dict_t **dict, const char *key);
int syncop_fgetxattr (xlator_t *xl, fd_t *fd, dict_t **dict, const char *key);
+int syncop_fgetxattr_with_xdata (xlator_t *xl, fd_t *fd, dict_t **dict, const char *key, dict_t *extra);
int syncop_removexattr (xlator_t *subvol, loc_t *loc, const char *name);
+int syncop_removexattr_with_xdata (xlator_t *subvol, loc_t *loc, const char *name, dict_t *dict);
int syncop_fremovexattr (xlator_t *subvol, fd_t *fd, const char *name);
+int syncop_fremovexattr_with_xdata (xlator_t *subvol, fd_t *fd, const char *name, dict_t *dict);
int syncop_create (xlator_t *subvol, loc_t *loc, int32_t flags, mode_t mode,
fd_t *fd, dict_t *dict, struct iatt *iatt);
int syncop_open (xlator_t *subvol, loc_t *loc, int32_t flags, fd_t *fd);
+int syncop_open_with_xdata (xlator_t *subvol, loc_t *loc, int32_t flags, fd_t *fd, dict_t *dict);
int syncop_close (fd_t *fd);
+int syncop_close_with_xdata (fd_t *fd, dict_t *dict);
int syncop_write (xlator_t *subvol, fd_t *fd, const char *buf, int size,
off_t offset, struct iobref *iobref, uint32_t flags);
int syncop_writev (xlator_t *subvol, fd_t *fd, const struct iovec *vector,
int32_t count, off_t offset, struct iobref *iobref,
uint32_t flags);
+int syncop_writev_with_xdata (xlator_t *subvol, fd_t *fd, const struct iovec *vector,
+ int32_t count, off_t offset, struct iobref *iobref,
+ uint32_t flags, dict_t *dict);
int syncop_readv (xlator_t *subvol, fd_t *fd, size_t size, off_t off,
uint32_t flags,
/* out */
struct iovec **vector, int *count, struct iobref **iobref);
+int syncop_readv_with_xdata (xlator_t *subvol, fd_t *fd, size_t size, off_t off,
+ uint32_t flags,
+ /* out */
+ struct iovec **vector, int *count, struct iobref **iobref, dict_t *dict);
int syncop_ftruncate (xlator_t *subvol, fd_t *fd, off_t offset);
+int syncop_ftruncate_with_xdata (xlator_t *subvol, fd_t *fd, off_t offset, dict_t *dict);
int syncop_truncate (xlator_t *subvol, loc_t *loc, off_t offset);
int syncop_unlink (xlator_t *subvol, loc_t *loc);
+int syncop_unlink_with_xdata (xlator_t *subvol, loc_t *loc, dict_t *dict);
+
int syncop_rmdir (xlator_t *subvol, loc_t *loc, int flags);
+int syncop_rmdir_with_xdata (xlator_t *subvol, loc_t *loc, int flags, dict_t *dict);
int syncop_fsync (xlator_t *subvol, fd_t *fd, int dataonly);
+int syncop_fsync_with_xdata (xlator_t *subvol, fd_t *fd, int dataonly, dict_t *dict);
int syncop_flush (xlator_t *subvol, fd_t *fd);
+int syncop_flush_with_xdata (xlator_t *subvol, fd_t *fd, dict_t *dict);
int syncop_fstat (xlator_t *subvol, fd_t *fd, struct iatt *stbuf);
+int syncop_fstat_with_xdata (xlator_t *subvol, fd_t *fd, struct iatt *stbuf, dict_t *dict);
int syncop_stat (xlator_t *subvol, loc_t *loc, struct iatt *stbuf);
int syncop_symlink (xlator_t *subvol, loc_t *loc, const char *newpath,
@@ -397,6 +427,7 @@ int syncop_mknod (xlator_t *subvol, loc_t *loc, mode_t mode, dev_t rdev,
int syncop_mkdir (xlator_t *subvol, loc_t *loc, mode_t mode, dict_t *dict,
struct iatt *iatt);
int syncop_link (xlator_t *subvol, loc_t *oldloc, loc_t *newloc);
+int syncop_link_with_xdata (xlator_t *subvol, loc_t *oldloc, loc_t *newloc, dict_t *dict);
int syncop_fsyncdir (xlator_t *subvol, fd_t *fd, int datasync);
int syncop_access (xlator_t *subvol, loc_t *loc, int32_t mask);
int syncop_fallocate(xlator_t *subvol, fd_t *fd, int32_t keep_size, off_t offset,
@@ -406,6 +437,7 @@ int syncop_discard(xlator_t *subvol, fd_t *fd, off_t offset, size_t len);
int syncop_zerofill(xlator_t *subvol, fd_t *fd, off_t offset, off_t len);
int syncop_rename (xlator_t *subvol, loc_t *oldloc, loc_t *newloc);
+int syncop_rename_with_xdata (xlator_t *subvol, loc_t *oldloc, loc_t *newloc, dict_t *dict);
int syncop_lk (xlator_t *subvol, fd_t *fd, int cmd, struct gf_flock *flock);
diff --git a/tests/basic/nsr.t b/tests/basic/nsr.t
new file mode 100644
index 000000000..5d6faf78e
--- /dev/null
+++ b/tests/basic/nsr.t
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+# Test *very basic* NSR functionality - startup, mount, simplest possible file
+# write.
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+
+function get_rep_count {
+ v=$(getfattr --only-values -e text -n trusted.nsr.rep-count $1 2> /dev/null)
+ #echo $v > /dev/tty
+ echo $v
+}
+
+function ping_file {
+ dd if=/dev/urandom of=$1 bs=4k count=1 2> /dev/null
+}
+
+# Start from a clean slate before bringing up glusterd.
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume info
+
+# Two-brick replicated volume on the local host.
+TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{1,2}
+
+EXPECT "$V0" volinfo_field $V0 'Volume Name'
+EXPECT 'Created' volinfo_field $V0 'Status'
+EXPECT '2' brick_count $V0
+
+# Switch the volume to NSR before it is started.
+TEST $CLI volume set $V0 cluster.nsr on
+
+TEST $CLI volume start $V0
+EXPECT 'Started' volinfo_field $V0 'Status'
+
+## Mount FUSE with entry and attribute caching disabled. The mount is
+## writable: the probe file below is created through it.
+TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0
+
+# Give the bricks a chance to connect to each other.
+EXPECT_WITHIN 10 "2" get_rep_count $M0
+
+# Write through the mount and check that both bricks hold identical data.
+TEST ping_file $M0/probe
+TEST cmp ${M0}/probe ${B0}/${V0}1/probe
+TEST cmp ${M0}/probe ${B0}/${V0}2/probe
+
+cleanup
+# NOTE(review): presumably etcd is started on behalf of the NSR bricks and is
+# not stopped by cleanup - confirm.
+killall -9 etcd
diff --git a/xlators/cluster/Makefile.am b/xlators/cluster/Makefile.am
index 0990822a7..6e883e565 100644
--- a/xlators/cluster/Makefile.am
+++ b/xlators/cluster/Makefile.am
@@ -1,3 +1,3 @@
-SUBDIRS = stripe afr dht
+SUBDIRS = stripe afr dht nsr-server nsr-recon nsr-client
CLEANFILES =
diff --git a/xlators/cluster/nsr-client/Makefile.am b/xlators/cluster/nsr-client/Makefile.am
new file mode 100644
index 000000000..d471a3f92
--- /dev/null
+++ b/xlators/cluster/nsr-client/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/cluster/nsr-client/src/Makefile.am b/xlators/cluster/nsr-client/src/Makefile.am
new file mode 100644
index 000000000..bacd1a906
--- /dev/null
+++ b/xlators/cluster/nsr-client/src/Makefile.am
@@ -0,0 +1,33 @@
+python_PYTHON = gen-fops.py
+
+xlator_LTLIBRARIES = nsrc.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
+
+nsrc_la_LDFLAGS = -module -avoid-version
+nsrc_la_SOURCES = nsrc.c
+
+nsrc_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = fop-template.c \
+ $(top_srcdir)/xlators/lib/src/libxlator.h \
+ $(top_srcdir)/glusterfsd/src/glusterfsd.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) \
+ -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \
+ -I$(top_srcdir)/rpc/rpc-lib/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+XLATOR_HEADER = $(top_srcdir)/libglusterfs/src/xlator.h
+
+CLEANFILES = nsrc-cg.c
+
+# Shared code generator imported by gen-fops.py. It is listed as a
+# prerequisite below so nsrc-cg.c is regenerated when the generator changes.
+# (The variable was previously defined as CODEGEN_DIR while the rule used
+# $(CODEGEN), so the dependency silently expanded to nothing.)
+CODEGEN = ../../nsr-server/src/codegen.py
+
+nsrc-cg.c: gen-fops.py $(CODEGEN) $(XLATOR_HEADER) fop-template.c
+	$(PYTHON) ./gen-fops.py $(XLATOR_HEADER) fop-template.c > $@
+
+nsrc.lo: nsrc-cg.c
+
+# Remove the module this directory actually installs (nsrc.la -> nsrc.so).
+uninstall-local:
+	rm -f $(DESTDIR)$(xlatordir)/nsrc.so
diff --git a/xlators/cluster/nsr-client/src/fop-template.c b/xlators/cluster/nsr-client/src/fop-template.c
new file mode 100644
index 000000000..699b07d40
--- /dev/null
+++ b/xlators/cluster/nsr-client/src/fop-template.c
@@ -0,0 +1,113 @@
+// template-name fop
+$TYPE$
+nsrc_$NAME$ (call_frame_t *frame, xlator_t *this,
+ $ARGS_LONG$)
+{
+ /*
+ * Entry point for one fop: remember the request in a call stub so it
+ * can be replayed, then wind it to the currently active child.
+ */
+ nsrc_local_t *local = NULL;
+ xlator_t *target_xl = ACTIVE_CHILD(this);
+
+ local = mem_get(this->local_pool);
+ if (!local) {
+ goto err;
+ }
+
+ /* The stub resumes in the continue function when a retry fires. */
+ local->stub = fop_$NAME$_stub (frame, nsrc_$NAME$_continue,
+ $ARGS_SHORT$);
+ if (!local->stub) {
+ goto err;
+ }
+ local->curr_xl = target_xl;
+ local->scars = 0;
+
+ frame->local = local;
+ /* The target child is passed as the cookie so the callback knows
+ * which child answered. */
+ STACK_WIND_COOKIE (frame, nsrc_$NAME$_cbk, target_xl,
+ target_xl, target_xl->fops->$NAME$,
+ $ARGS_SHORT$);
+ return 0;
+
+err:
+ /* local was never attached to the frame on this path, so release it
+ * here before failing the fop with ENOMEM. */
+ if (local) {
+ mem_put(local);
+ }
+ STACK_UNWIND_STRICT ($NAME$, frame, -1, ENOMEM,
+ $DEFAULTS$);
+ return 0;
+}
+
+// template-name cbk
+$TYPE$
+nsrc_$NAME$_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ $ARGS_LONG$)
+{
+ /*
+ * Callback for one fop. On success the answering child becomes the
+ * active child. On EREMOTE/ENOTCONN the request is rescheduled
+ * against the next child, up to SCAR_LIMIT retries. Any other error
+ * is passed straight back to the caller.
+ */
+ nsrc_local_t *local = frame->local;
+ xlator_t *last_xl = cookie;
+ xlator_t *next_xl;
+ nsrc_private_t *priv = this->private;
+ struct timespec spec;
+
+ if (op_ret != (-1)) {
+ if (local->scars) {
+ gf_log (this->name, GF_LOG_INFO,
+ HILITE("retried %p OK"), frame->local);
+ }
+ /* Remember which child worked so later fops go there first. */
+ priv->active = last_xl;
+ goto unwind;
+ }
+ /* Only these two errors trigger a retry on another child. */
+ if ((op_errno != EREMOTE) && (op_errno != ENOTCONN)) {
+ goto unwind;
+ }
+
+ /* TBD: get leader ID from xdata? */
+ next_xl = next_xlator(this,last_xl);
+ /*
+ * We can't just give up after we've tried all bricks, because it's
+ * quite likely that a new leader election just hasn't finished yet.
+ * We also shouldn't retry endlessly, and especially not at a high
+ * rate, but that's good enough while we work on other things.
+ *
+ * TBD: implement slow/finite retry via a worker thread
+ */
+ if (!next_xl || (local->scars >= SCAR_LIMIT)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ HILITE("ran out of retries for %p"), frame->local);
+ goto unwind;
+ }
+
+ local->curr_xl = next_xl;
+ local->scars += 1;
+ /* Retry after a fixed one-second delay. */
+ spec.tv_sec = 1;
+ spec.tv_nsec = 0;
+ /*
+ * WARNING
+ *
+ * Just calling gf_timer_call_after like this leaves open the
+ * possibility that writes will get reordered, if a first write is
+ * rescheduled and then a second comes along to find an updated
+ * priv->active before the first actually executes. We might need to
+ * implement a stricter (and more complicated) queuing mechanism to
+ * ensure absolute consistency in this case.
+ */
+ /* If the timer could not be scheduled, fall through and unwind with
+ * the error we received. */
+ if (gf_timer_call_after(this->ctx,spec,nsrc_retry_cb,local)) {
+ return 0;
+ }
+
+unwind:
+ /* The stub is no longer needed once the fop is being completed. */
+ call_stub_destroy(local->stub);
+ STACK_UNWIND_STRICT ($NAME$, frame, op_ret, op_errno,
+ $ARGS_SHORT$);
+ return 0;
+}
+
+// template-name cont-func
+$TYPE$
+nsrc_$NAME$_continue (call_frame_t *frame, xlator_t *this,
+ $ARGS_LONG$)
+{
+ /*
+ * Continuation registered with the call stub: re-issues the fop to
+ * local->curr_xl, which the callback set to the next child before
+ * scheduling the retry.
+ */
+ nsrc_local_t *local = frame->local;
+
+ STACK_WIND_COOKIE (frame, nsrc_$NAME$_cbk, local->curr_xl,
+ local->curr_xl, local->curr_xl->fops->$NAME$,
+ $ARGS_SHORT$);
+ return 0;
+}
diff --git a/xlators/cluster/nsr-client/src/gen-fops.py b/xlators/cluster/nsr-client/src/gen-fops.py
new file mode 100644
index 000000000..b07b3c5b1
--- /dev/null
+++ b/xlators/cluster/nsr-client/src/gen-fops.py
@@ -0,0 +1,57 @@
+#!/usr/bin/python
+
+# This script generates the boilerplate versions of most fops in the client,
+# mostly so that we can use STACK_WIND instead of STACK_WIND_TAIL (see
+# fop-template.c for the details). The problem we're solving is that we sit
+# under DHT, which makes assumptions about getting callbacks only from its
+# direct children. If we didn't define our own versions of these fops, the
+# default versions would use STACK_WIND_TAIL and the callbacks would come from
+# DHT's grandchildren. The code-generation approach allows us to handle this
+# with a minimum of code, and also keep up with any changes to the fop table.
+
+import sys
+sys.path.append("../../nsr-server/src") # Blech.
+import codegen
+
+type_re = "([a-z_0-9]+)"
+name_re = "\(\*fop_([a-z0-9]+)_t\)"
+full_re = type_re + " *" + name_re
+fop_cg = codegen.CodeGenerator()
+fop_cg.skip = 2
+fop_cg.parse_decls(sys.argv[1],full_re)
+fop_cg.load_templates(sys.argv[2])
+
+# Use the multi-template feature to generate multiple callbacks from the same
+# parsed declarations.
+type_re = "([a-z_0-9]+)"
+name_re = "\(\*fop_([a-z0-9]+)_cbk_t\)"
+full_re = type_re + " *" + name_re
+cbk_cg = codegen.CodeGenerator()
+cbk_cg.skip = 5
+cbk_cg.parse_decls(sys.argv[1],full_re)
+cbk_cg.load_templates(sys.argv[2])
+
+# This is a nasty little trick to handle the case where a generated fop needs
+# a set of default arguments for the corresponding callback.
+#
+# Yes, it's ironic that I'm copying and pasting the generator code.
+fop_cg.make_defaults = cbk_cg.make_defaults
+
+# Sorry, getspec, you're not a real fop until someone writes a stub function
+# for you.
+del fop_cg.decls["getspec"]
+del cbk_cg.decls["getspec"]
+
+# cbk is used by both fop and continue, so emit first
+for f_name in cbk_cg.decls.keys():
+ cbk_cg.emit(f_name,"cbk")
+ print("")
+
+# continue is used by fop, so emit next
+for f_name in fop_cg.decls.keys():
+ fop_cg.emit(f_name,"cont-func")
+ print("")
+
+for f_name in fop_cg.decls.keys():
+ fop_cg.emit(f_name,"fop")
+ print("")
diff --git a/xlators/cluster/nsr-client/src/nsrc.c b/xlators/cluster/nsr-client/src/nsrc.c
new file mode 100644
index 000000000..6a80b1d86
--- /dev/null
+++ b/xlators/cluster/nsr-client/src/nsrc.c
@@ -0,0 +1,194 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "call-stub.h"
+#include "defaults.h"
+#include "timer.h"
+#include "xlator.h"
+
+#define SCAR_LIMIT 20
+#define HILITE(x) (""x"")
+
+/*
+ * The fops are actually generated by gen-fops.py; the rest was mostly copied
+ * from defaults.c (commit cd253754 on 27 August 2013).
+ */
+
+/*
+ * Memory-accounting types private to the nsrc translator, numbered after the
+ * common types.  (The tag used to be the copy-pasted gf_dht_mem_types_.)
+ */
+enum gf_nsrc_mem_types_ {
+        gf_mt_nsrc_private_t = gf_common_mt_end + 1,
+};
+
+/* Per-translator state, stored in this->private by nsrc_init(). */
+typedef struct {
+ /* Child that new fops are wound to first (see ACTIVE_CHILD). */
+ xlator_t *active;
+} nsrc_private_t;
+
+/* Per-request state, allocated from this->local_pool. */
+typedef struct {
+ /* Saved copy of the request; resumed by nsrc_retry_cb(). */
+ call_stub_t *stub;
+ /* Child the request is currently directed at. */
+ xlator_t *curr_xl;
+ /* Retry counter; compared against SCAR_LIMIT by the generated
+ * callbacks. */
+ uint16_t scars;
+} nsrc_local_t;
+
+char *NSRC_XATTR = "user.nsr.active";
+
+/*
+ * Pick the child that fops should be sent to: the recorded active child when
+ * private state exists, otherwise the first child.
+ */
+static inline
+xlator_t *
+ACTIVE_CHILD (xlator_t *parent)
+{
+        nsrc_private_t  *state = parent->private;
+
+        if (!state) {
+                return FIRST_CHILD(parent);
+        }
+
+        return state->active;
+}
+
+/*
+ * Return the child that follows prev in this->children, wrapping around to
+ * the first child after the last one.  Returns NULL when prev is not one of
+ * the children.
+ */
+xlator_t *
+next_xlator (xlator_t *this, xlator_t *prev)
+{
+        xlator_list_t   *node = this->children;
+
+        /* Walk the list until we hit prev or run off the end. */
+        while (node && (node->xlator != prev)) {
+                node = node->next;
+        }
+
+        if (!node) {
+                return NULL;
+        }
+        if (node->next) {
+                return node->next->xlator;
+        }
+        return this->children->xlator;
+}
+
+/*
+ * Timer callback: cb_arg is the nsrc_local_t of a request that is due for
+ * another attempt.  Logs the retry and resumes the saved call stub.
+ */
+void
+nsrc_retry_cb (void *cb_arg)
+{
+        nsrc_local_t    *retry_local = cb_arg;
+        call_stub_t     *pending     = retry_local->stub;
+
+        gf_log (__func__, GF_LOG_INFO, HILITE("retrying %p"), retry_local);
+        call_resume_wind(pending);
+}
+
+#include "nsrc-cg.c"
+
+/*
+ * Placeholder forget handler: only logs a warning (with the calling
+ * function) and returns 0.  It is not referenced by the cbks table defined
+ * further down in this file, which is empty.
+ */
+int32_t
+nsrc_forget (xlator_t *this, inode_t *inode)
+{
+ gf_log_callingfn (this->name, GF_LOG_WARNING, "xlator does not "
+ "implement forget_cbk");
+ return 0;
+}
+
+
+/*
+ * Placeholder releasedir handler: only logs a warning (with the calling
+ * function) and returns 0.  It is not referenced by the cbks table defined
+ * further down in this file, which is empty.
+ */
+int32_t
+nsrc_releasedir (xlator_t *this, fd_t *fd)
+{
+ gf_log_callingfn (this->name, GF_LOG_WARNING, "xlator does not "
+ "implement releasedir_cbk");
+ return 0;
+}
+
+/*
+ * Placeholder release handler: only logs a warning (with the calling
+ * function) and returns 0.  It is not referenced by the cbks table defined
+ * further down in this file, which is empty.
+ */
+int32_t
+nsrc_release (xlator_t *this, fd_t *fd)
+{
+ gf_log_callingfn (this->name, GF_LOG_WARNING, "xlator does not "
+ "implement release_cbk");
+ return 0;
+}
+
+/*
+ * Fop table for the nsrc translator.  The nsrc_* functions referenced here
+ * are not written by hand: they come from nsrc-cg.c, which is included
+ * above and generated by gen-fops.py from fop-template.c.
+ */
+struct xlator_fops fops = {
+ .lookup = nsrc_lookup,
+ .stat = nsrc_stat,
+ .fstat = nsrc_fstat,
+ .truncate = nsrc_truncate,
+ .ftruncate = nsrc_ftruncate,
+ .access = nsrc_access,
+ .readlink = nsrc_readlink,
+ .mknod = nsrc_mknod,
+ .mkdir = nsrc_mkdir,
+ .unlink = nsrc_unlink,
+ .rmdir = nsrc_rmdir,
+ .symlink = nsrc_symlink,
+ .rename = nsrc_rename,
+ .link = nsrc_link,
+ .create = nsrc_create,
+ .open = nsrc_open,
+ .readv = nsrc_readv,
+ .writev = nsrc_writev,
+ .flush = nsrc_flush,
+ .fsync = nsrc_fsync,
+ .opendir = nsrc_opendir,
+ .readdir = nsrc_readdir,
+ .readdirp = nsrc_readdirp,
+ .fsyncdir = nsrc_fsyncdir,
+ .statfs = nsrc_statfs,
+ .setxattr = nsrc_setxattr,
+ .getxattr = nsrc_getxattr,
+ .fsetxattr = nsrc_fsetxattr,
+ .fgetxattr = nsrc_fgetxattr,
+ .removexattr = nsrc_removexattr,
+ .fremovexattr = nsrc_fremovexattr,
+ .lk = nsrc_lk,
+ .inodelk = nsrc_inodelk,
+ .finodelk = nsrc_finodelk,
+ .entrylk = nsrc_entrylk,
+ .fentrylk = nsrc_fentrylk,
+ .rchecksum = nsrc_rchecksum,
+ .xattrop = nsrc_xattrop,
+ .fxattrop = nsrc_fxattrop,
+ .setattr = nsrc_setattr,
+ .fsetattr = nsrc_fsetattr,
+ .fallocate = nsrc_fallocate,
+ .discard = nsrc_discard,
+};
+
+struct xlator_cbks cbks = {
+};
+
+/*
+ * Translator init: create the pool for per-request nsrc_local_t objects and
+ * the private state, with the first child as the initial active child.
+ *
+ * Returns 0 on success and -1 on failure.  On failure nothing allocated
+ * here is left behind.
+ */
+int32_t
+nsrc_init (xlator_t *this)
+{
+        nsrc_private_t  *priv = NULL;
+
+        /* FIRST_CHILD() below dereferences the child list. */
+        if (!this->children) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "nsrc needs at least one child");
+                goto err;
+        }
+
+        this->local_pool = mem_pool_new (nsrc_local_t, 128);
+        if (!this->local_pool) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to create nsrc_local_t pool");
+                goto err;
+        }
+
+        priv = GF_CALLOC (1, sizeof (*priv), gf_mt_nsrc_private_t);
+        if (!priv) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to allocate nsrc private data");
+                goto err;
+        }
+
+        priv->active = FIRST_CHILD(this);
+        this->private = priv;
+        return 0;
+
+err:
+        /* priv is always NULL here; only the pool may need releasing. */
+        if (this->local_pool) {
+                mem_pool_destroy (this->local_pool);
+                this->local_pool = NULL;
+        }
+        return -1;
+}
+
+/*
+ * Translator teardown: release everything nsrc_init() set up.  Pointers are
+ * cleared afterwards so a repeated call or later cleanup cannot touch freed
+ * memory.
+ */
+void
+nsrc_fini (xlator_t *this)
+{
+        GF_FREE(this->private);
+        this->private = NULL;
+
+        if (this->local_pool) {
+                mem_pool_destroy (this->local_pool);
+                this->local_pool = NULL;
+        }
+}
+
+class_methods_t class_methods = {
+ .init = nsrc_init,
+ .fini = nsrc_fini,
+};
+
+struct volume_options options[] = {
+ { .key = {NULL} },
+};
diff --git a/xlators/cluster/nsr-recon/Makefile.am b/xlators/cluster/nsr-recon/Makefile.am
new file mode 100644
index 000000000..d471a3f92
--- /dev/null
+++ b/xlators/cluster/nsr-recon/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/cluster/nsr-recon/src/Makefile.am b/xlators/cluster/nsr-recon/src/Makefile.am
new file mode 100644
index 000000000..8fa344864
--- /dev/null
+++ b/xlators/cluster/nsr-recon/src/Makefile.am
@@ -0,0 +1,22 @@
+xlator_LTLIBRARIES = nsr_recon.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
+
+nsr_recon_la_LDFLAGS = -module -avoid-version -lgfapi
+nsr_recon_la_SOURCES = recon_driver.c recon_xlator.c
+
+nsr_recon_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = recon_driver.h recon_xlator.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) \
+ -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \
+ -I$(top_srcdir)/rpc/rpc-lib/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+XLATOR_HEADER = $(top_srcdir)/libglusterfs/src/xlator.h
+
+CLEANFILES =
+
+# Remove the module this directory actually installs
+# (nsr_recon.la -> nsr_recon.so).
+uninstall-local:
+	rm -f $(DESTDIR)$(xlatordir)/nsr_recon.so
diff --git a/xlators/cluster/nsr-recon/src/recon_driver.c b/xlators/cluster/nsr-recon/src/recon_driver.c
new file mode 100644
index 000000000..1328d52dc
--- /dev/null
+++ b/xlators/cluster/nsr-recon/src/recon_driver.c
@@ -0,0 +1,2624 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <sys/types.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+#include <fnmatch.h>
+
+
+#include "call-stub.h"
+#include "defaults.h"
+#include "xlator.h"
+
+
+#include "recon_driver.h"
+#include "recon_xlator.h"
+#include "api/src/glfs-internal.h"
+#include "api/src/glfs-handles.h"
+
+/* TBD: move declarations here and nsr.c into a common place */
+#define NSR_TERM_XATTR "trusted.nsr.term"
+#define RECON_TERM_XATTR "trusted.nsr.recon-term"
+#define RECON_INDEX_XATTR "trusted.nsr.recon-index"
+
+/*
+ * Execution architecture for the NSR reconciliation driver. The driver runs
+ * as a seperate process in each node where the brick is. The main function of
+ * the driver is nsr_reconciliation_driver() (last function below) The driver
+ * just sits in a tight loop waiting for state changes. When a brick becomes a
+ * replica leader, it fences IO, contacts this process and waits for
+ * reconciliation to finish.
+ *
+ * The replica leader talks to other bricks in replica group which are alive
+ * and gets the last term info using which it decides which has the latest
+ * data. That brick is referred to as the "reconciliator"; leader sends a
+ * message to reconciliator to freeze its data (by reading any incomplete data
+ * from other nodes from that term if required)
+ *
+ * Once that is done leader sends a message to all nodes except the
+ * reconciliator to sync themselves with the reconciliator. This process is
+ * referred to as "resolution".
+ *
+ * Hence the reconciliation processes need to talk to each other to get a given
+ * term info. This is implemented using the recon translator IOs which
+ * implements a bare bone RPC by exposing a file interface to which
+ * reads/writes are done to pass control messages. This is referred to as the
+ * "control plane". This implementation allows the control plane to be
+ * implemented as a bunch of threads for each of the nodes.
+ *
+ * The reconciliation process also needs to talk to the brick process on that
+ * node to actually write the data as part of reconciliation/resolution. This
+ * is referred to as the "data plane". Again there are a bunch of threads that
+ * do this work.
+ *
+ * The way the worker threads are organised is that main driver context has a
+ * pointer to contexts for each of these thread contexts. The thread context at
+ * index 0 always refers to talking with local recon process/brick. So the
+ * control worker at index 0 will get the local changelog info and data worker
+ * at index 0 will talk to local brick.
+ *
+ * All the ops from the control/data planes are implemented using the glfs
+ * APIs.
+ */
+
+/*
+ * This function gets the size of all the extended attributes for a file.
+ * This is used so that caller knows how much to allocate for key-value storage.
+ *
+ * Input Arguments:
+ * fd - the file opened using glfs API.
+ * dict - passed so that NSR translator can get this from the required brick
+ *
+ * Output Arguments:
+ * b - set to a calloc()'ed buffer holding the NUL-separated key names.
+ *     It is left allocated even when a later step fails, so the caller
+ *     owns it whenever it is non-NULL (NULL if the allocation failed).
+ * key_size - the size of all keys
+ * val_size - the size of all values
+ * num - number of key/values
+ *
+ * Returns 0 on success and -1 on failure.  A file without any extended
+ * attributes is a success with all three counters left at 0.
+ */
+static int32_t
+get_xattr_total_size( struct glfs_fd *fd,
+                      char **b,
+                      uint32_t *key_size,
+                      uint32_t *val_size,
+                      uint32_t* num,
+                      dict_t *dict)
+{
+        int32_t  remaining = -1;
+        int32_t  ret = -1;
+        char    *key = NULL;
+
+        *key_size = 0;
+        *val_size = 0;
+        *num = 0;
+
+        // First get the size of the keys
+        remaining = glfs_flistxattr_with_xdata(fd, NULL, 0, dict);
+        if (remaining < 0)
+                goto out;
+        *key_size = remaining;
+
+        // TBD - use the regular calloc
+        (*b) = key = calloc(remaining + 1, 1);
+        if (!key)
+                goto out;
+
+        // get the keys themselves
+        if (glfs_flistxattr_with_xdata(fd, key, remaining + 1, dict) == -1)
+                goto out;
+
+        // Walk the NUL-separated key list.  Testing before the first
+        // iteration (rather than do/while) keeps an empty list from being
+        // treated as one empty key, and "> 0" stops the walk even if the
+        // remaining byte count is overshot.
+        while (remaining > 0) {
+                int32_t r;
+                int32_t len;
+
+                // for each key get the size of the value
+                r = glfs_fgetxattr_with_xdata(fd, key, NULL, 0, dict);
+                if (r == -1)
+                        goto out;
+                (*val_size) += r;
+                len = strlen(key) + 1;
+                key += len;
+                remaining -= len;
+                (*num)++;
+        }
+        ret = 0;
+out:
+        return ret;
+}
+
+/*
+ * Fetch the values for a set of xattr keys and pack them, together with the
+ * keys, into one buffer.
+ *
+ * Input Arguments:
+ * fd - the file opened using glfs API.
+ * keys - the NUL-separated key names
+ * size - size passed to every value lookup
+ * num - number of keys
+ * dict - passed so that NSR translator can get this from the required brick
+ *
+ * Output Arguments:
+ * buf - receives key, NUL, value, NUL for each key in turn.  The end of a
+ *       value is located with strlen(), so the caller is presumably expected
+ *       to hand in a zero-filled buffer - TODO confirm.
+ *
+ * Stops silently at the first lookup that fails.
+ */
+static void
+get_xattr(struct glfs_fd *fd,
+          char *keys,
+          char *buf,
+          uint32_t size,
+          uint32_t num,
+          dict_t *dict)
+{
+        uint32_t i;
+
+        for (i = 0; i < num; i++) {
+                // length of the current key including its terminator
+                uint32_t klen = strlen(keys) + 1;
+
+                // key first, then step past it to where the value goes
+                strcpy(buf, keys);
+                buf += klen;
+
+                // TBD - handle error
+                if (glfs_fgetxattr_with_xdata(fd, keys, buf, size, dict) == -1)
+                        return;
+
+                // move on to the next key name
+                keys += klen;
+
+                // skip the value just written so buf points at the next slot
+                buf += strlen(buf) + 1;
+        }
+}
+
+/*
+ * Function deletes a bunch of extended attributes of a file.
+ * Input Arguments:
+ * fd - the file opened using glfs API.
+ * dict - passed so that NSR translator can do this from the required brick
+ * keys - bunch of NUL separated key names
+ * num - number of keys
+ *
+ * Failures of individual removals are currently ignored.
+ */
+static void delete_xattr(struct glfs_fd *fd,
+                         dict_t *dict,
+                         char *keys,
+                         uint32_t num)
+{
+        // The parameter used to be called "dict_t", which hid the type of
+        // the same name inside this function.
+        while(num--) {
+                // remove the attribute named by the current key
+                // TBD - handle failure cases when calling glfs_fremovexattr_with_xdata()
+                glfs_fremovexattr_with_xdata(fd, keys, dict);
+                // step to the next NUL separated key name
+                keys += strlen(keys) +1;
+        }
+        return;
+}
+
+/*
+ * Apply a packed list of key/value pairs to a file as extended attributes.
+ *
+ * Input Arguments:
+ * fd - the file opened using glfs API.
+ * dict - passed so that NSR translator can do this from the required brick
+ * buf - packed pairs: key, NUL, value, NUL, repeated.  Values are measured
+ *       with strlen().
+ * num - number of key/value pairs in buf.
+ *
+ * Stops silently at the first attribute that cannot be set.
+ */
+static void
+fill_xattr(struct glfs_fd *fd,
+           dict_t *dict,
+           char *buf,
+           uint32_t num)
+{
+        char     *key = buf;
+        uint32_t  left;
+
+        for (left = num; left > 0; left--) {
+                // the value starts right after the key's terminator
+                char   *value = key + strlen(key) + 1;
+                size_t  vlen  = strlen(value);
+
+                // TBD - handle failure cases when calling glfs_fsetxattr_with_xdata()
+                if (glfs_fsetxattr_with_xdata(fd, key, value, vlen, 0, dict) == -1)
+                        return;
+
+                // the next key starts right after the value's terminator
+                key = value + vlen + 1;
+        }
+}
+
+/*
+ * This function gets a file that can be used for doing glfs_init later.
+ * The control file is used by control thread(function) to talk to peer reconciliation process.
+ * The data file is used by the data thread(function) to talk to the bricks.
+ * The control file is of name such as con:gfs1:-mnt-a1 where "gfs1" is name of host
+ * and the brick path is "/mnt/a1".
+ * The data file is of name such as data:gfs1:-mnt-a1.
+ *
+ * Input Arguments:
+ * vol - name of the volume. This is used to build the full path of the control and data file
+ * such as /var/lib/glusterd/vols/test/bricks/gfs2:-mnt-test1-nsr-recon.vol.
+ * In above example the volume name is test and brick on gfs2 is on path /mnt/test1
+ *
+ * worker - The worker for a given node. This worker has 2 threads - one on the data plane
+ * and one on the control plane. The worker->name is already filled with hostname:brickname
+ * in the function nsr_reconciliation_driver(). Use that to build the volume file.
+ * So if worker->name has gfs1:/mnt/a1, control file is con:gfs1:-mnt-a1
+ * and data file is data:gfs1:-mnt-a1.
+ * All these files are under the bricks directory. TBD - move this to a NSR recon directory later.
+ */
+static void
+nsr_recon_get_file(char *vol, nsr_replica_worker_t *worker)
+{
+        char *ptr;
+        char tr[256];
+
+        // Take a bounded copy of the worker name (a longer name is truncated
+        // instead of overrunning tr) and replace every "/" with "-".
+        snprintf(tr, sizeof(tr), "%s", worker->name);
+        for (ptr = tr; *ptr; ptr++) {
+                if (*ptr == '/')
+                        *ptr = '-';
+        }
+
+        // NOTE(review): the writes into vol_file below are not bounded; the
+        // size of vol_file is not visible from here - confirm it can hold
+        // the longest possible path.
+
+        // Build the base directory such as "/var/lib/glusterd/vols/test/bricks/"
+        sprintf(worker->control_worker->vol_file,
+                "/%s/%s/%s/%s/",
+                GLUSTERD_DEFAULT_WORKDIR,
+                GLUSTERD_VOLUME_DIR_PREFIX,
+                vol,
+                GLUSTERD_BRICK_INFO_DIR);
+
+        // Control plane volfile: <base>con:<host>:<brick path with dashes>
+        strcat(worker->control_worker->vol_file, "con:");
+        strcat(worker->control_worker->vol_file, tr);
+
+        sprintf(worker->data_worker->vol_file,
+                "/%s/%s/%s/%s/",
+                GLUSTERD_DEFAULT_WORKDIR,
+                GLUSTERD_VOLUME_DIR_PREFIX,
+                vol,
+                GLUSTERD_BRICK_INFO_DIR);
+        // Data plane volfile: <base>data:<host>:<brick path with dashes>
+        strcat(worker->data_worker->vol_file, "data:");
+        strcat(worker->data_worker->vol_file, tr);
+}
+
+/*
+ * This function does all the glfs initialisation
+ * so that reconciliation process can talk to other recon processes/bricks
+ * for the control/data messages.
+ * This will be done everytime a worker needs to be kicked off to talk
+ * across any plane.
+ *
+ * Input arguments:
+ * ctx - The per worker based context
+ * control - set to true if this worker is for the control plane
+ */
+static int
+nsr_recon_start_work(nsr_per_node_worker_t *ctx,
+                     gf_boolean_t control)
+{
+        glfs_t *fs = NULL;
+        xlator_t *this = ctx->driver_ctx->this;
+        int32_t ret = 0;
+        glfs_fd_t *aux_fd = NULL; // fd of auxilary log
+        char lf[256];
+
+        // Returns 0 once the glfs instance is initialised and stored in
+        // ctx->fs, -1 if any of the glfs setup steps fails.
+        // NOTE(review): the failure paths return without releasing fs -
+        // confirm whether it can safely be torn down at those points.
+
+        nsr_worker_log(this->name, GF_LOG_INFO,
+                       "starting work with volfile %s\n",
+                       ctx->vol_file);
+
+        fs = glfs_new(ctx->id);
+        if (!fs) {
+                glusterfs_this_set(this);
+                nsr_worker_log(this->name, GF_LOG_ERROR,
+                               "cannot create gfls context for thread %s\n",ctx->id);
+                return -1;
+        }
+
+        // For some vague reason, glfs init APIs seem to be clobbering "this". hence resetting it.
+        glusterfs_this_set(this);
+        nsr_worker_log(this->name, GF_LOG_INFO,
+                       "init done. setting volfile %s\n",
+                       ctx->vol_file);
+
+        ret = glfs_set_volfile(fs, ctx->vol_file);
+        if (ret != 0) {
+                glusterfs_this_set(this);
+                nsr_worker_log(this->name, GF_LOG_ERROR,
+                               "cannot set volfile %s for thread %s\n",ctx->vol_file, ctx->id);
+                return -1;
+        }
+
+        // TBD - convert this to right /usr/local/var/log based log files.
+        // snprintf keeps an over-long id from overrunning lf; the log file
+        // name is truncated instead.
+        snprintf(lf, sizeof(lf), "/tmp/logs/%s-%s",
+                 (control == _gf_true)?"con":"data",ctx->id);
+        glfs_set_logging (fs, lf, 7);
+        glusterfs_this_set(this);
+
+        ret = glfs_init (fs);
+        if (ret != 0) {
+                glusterfs_this_set(this);
+                nsr_worker_log(this->name, GF_LOG_ERROR, "cannot do init for thread %s with volfile %s\n",ctx->id, ctx->vol_file);
+                return -1;
+        }
+        glusterfs_this_set(this);
+        nsr_worker_log(this->name, GF_LOG_INFO,
+                       "setting volfile %s done\n",
+                       ctx->vol_file);
+
+        // If it is control thread, open the "/" as the aux_fd.
+        // All IOs happening via the fd will do the RPCs across the reconciliation
+        // processes. For some vague reason, the root seems to be open'able like a file.
+        // TBD - try to clean this up. (implement a virtual file???)
+        if (control == _gf_true) {
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "doing open for / \n");
+                aux_fd = glfs_open (fs, "/", O_RDWR);
+                // TBD - proper error handling. Stall reconciliation if such a thing happens?
+                if (aux_fd == NULL) {
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "cannot open aux log file for thread %s\n",ctx->id);
+                } else {
+                        // A successful open is informational, not an error.
+                        nsr_worker_log(this->name, GF_LOG_INFO,
+                                       "---opened aux log file for thread %s\n",ctx->id);
+                }
+                // May be NULL; the open failure above is not treated as fatal.
+                ctx->aux_fd = aux_fd;
+        }
+        glusterfs_this_set(this);
+        ctx->fs = fs;
+        return 0;
+}
+
+/*
+ *
+ * This function does the cleanup after reconciliation is done
+ * or before we start a new reconciliation.
+ *
+ * Input arguments:
+ * ctx - The per worker based context
+ * control - set to true if this worker is for the control plane
+ */
+static int
+nsr_recon_end_work(nsr_per_node_worker_t *ctx,
+                   gf_boolean_t control)
+{
+        int32_t ret = 0;
+        xlator_t *this = ctx->driver_ctx->this;
+
+        // Returns 0 on success, -1 if glfs_fini() fails (ctx->fs is then
+        // left untouched).
+
+        nsr_worker_log(this->name, GF_LOG_INFO,
+                       "doing fini for recon worker\n");
+
+        // The aux fd belongs to ctx->fs, so it has to be closed while that
+        // instance still exists, i.e. before glfs_fini().  It can be NULL
+        // when the open in nsr_recon_start_work() failed.
+        if ((control == _gf_true) && (ctx->aux_fd != NULL)) {
+                glfs_close (ctx->aux_fd);
+                glusterfs_this_set(this);
+        }
+        ctx->aux_fd = (control == _gf_true) ? NULL : ctx->aux_fd;
+
+        ret = glfs_fini(ctx->fs);
+        if (ret != 0) {
+                glusterfs_this_set(this);
+                nsr_worker_log(this->name, GF_LOG_ERROR, "cannot do fini for thread %s with volfile %s\n",ctx->id, ctx->vol_file);
+                return -1;
+        }
+        // glfs calls can change "this"; restore it before returning.
+        glusterfs_this_set(this);
+        ctx->fs = NULL;
+        return 0;
+}
+
+// Prepares the synchronisation state of one worker context (work list,
+// condition variable and mutex) for the case where every worker function
+// runs as a separate thread.  The control argument is not used.
+static void
+init_worker(nsr_per_node_worker_t *ctx, gf_boolean_t control)
+{
+        INIT_LIST_HEAD(&(ctx->head.list));
+        pthread_cond_init(&(ctx->cv), NULL);
+        pthread_mutex_init(&(ctx->mutex), NULL);
+}
+
+
+/*
+ * Control worker funct for getting changelog info on this node.
+ * calls directly functions to parse the changelog.
+ *
+ * Input arguments:
+ * ctx - The per worker based context
+ * work - The work item to process. work->req_id selects the operation;
+ *        for the term info requests work->index is overloaded to carry
+ *        the term number.
+ */
+static void
+control_worker_func_0(nsr_per_node_worker_t *ctx,
+                      nsr_recon_work_t *work)
+{
+        unsigned int index = ctx->index;
+        nsr_replica_worker_t *rw = &(ctx->driver_ctx->workers[index]);
+        xlator_t *this = ctx->driver_ctx->this;
+        nsr_recon_private_t *priv = this->private;
+        nsr_recon_driver_ctx_t *dr = ctx->driver_ctx;
+
+        ctx->is_control = _gf_true;
+
+        switch (work->req_id){
+        case NSR_WORK_ID_INI:
+        {
+                // Nothing to set up for the local node.
+                break;
+        }
+        case NSR_WORK_ID_FINI:
+        {
+                // Nothing to tear down for the local node.
+                break;
+        }
+        case NSR_WORK_ID_GET_LAST_TERM_INFO:
+        {
+                nsr_recon_last_term_info_t lt;
+                nsr_reconciliator_info_t *recon_info = rw->recon_info;
+                // term is stuffed inside work->index. overloading.
+                int32_t term = work->index;
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "trying to get last term info for node %d with current term %d\n",index, term);
+
+                // TBD - handle errors
+                // This is called by the leader after it gets the current term.
+                // Makes searching easier.
+                nsr_recon_libchangelog_get_last_term_info(this, priv->changelog_base_path, term, &lt);
+                recon_info->last_term = lt.last_term;
+                recon_info->commited_ops = lt.commited_ops;
+                recon_info->last_index = lt.last_index;
+                recon_info->first_index = lt.first_index;
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "out of get last term info with current term %d. got ops %d with first %d and last %d \n",
+                               recon_info->last_term, recon_info->commited_ops,
+                               recon_info->first_index, recon_info->last_index);
+                break;
+        }
+        case NSR_WORK_ID_GET_GIVEN_TERM_INFO:
+        {
+                nsr_recon_last_term_info_t lt;
+                nsr_reconciliator_info_t *recon_info = rw->recon_info;
+                // term is stuffed inside work->index. overloading.
+                int32_t term = work->index;
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "trying to get term info for node %d for term %d\n",index, term);
+
+                // TBD - handle errors
+                nsr_recon_libchangelog_get_this_term_info(this,priv->changelog_base_path, term, &lt);
+
+                recon_info->last_term = lt.last_term;
+                recon_info->commited_ops = lt.commited_ops;
+                recon_info->last_index = lt.last_index;
+                recon_info->first_index = lt.first_index;
+
+                // Arguments follow the order named in the format string
+                // (first index, then last index).
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "out of get term info for term %d. got ops %d with first %d and last %d \n",
+                               recon_info->last_term, recon_info->commited_ops,
+                               recon_info->first_index, recon_info->last_index);
+
+                break;
+        }
+        case NSR_WORK_ID_RECONCILIATOR_DO_WORK:
+        {
+                // For local resolution, the main driver thread does it.
+                // SO there is no way we can have this message for this node.
+                GF_ASSERT(0);
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "this message should not be sent \n");
+                break;
+        }
+        case NSR_WORK_ID_RESOLUTION_DO_WORK:
+        {
+                GF_ASSERT(0);
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "this message should not be sent \n");
+                break;
+        }
+        case NSR_WORK_ID_END_RECONCILIATION:
+        {
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "sending reconciliation end message to node %d\n", index);
+                nsr_recon_return_back(priv, dr->txn_id);
+                break;
+        }
+        case NSR_WORK_ID_GET_RECONCILATION_WINDOW:
+        {
+                nsr_reconciliator_info_t *recon_info = rw->recon_info;
+                // first_index and last_index at 0 indicates empty log.
+                // For non empty log, the first_index always starts at 1.
+                uint32_t num = (recon_info->last_index -
+                                recon_info->first_index + 1);
+                nsr_recon_record_details_t *rd = NULL;
+                uint32_t i=0;
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "trying to get reconciliation window records for node %d for term %d with first %d last %d\n",
+                               index, recon_info->last_term, recon_info->first_index, recon_info->last_index);
+
+                GF_ASSERT(num <= MAX_RECONCILIATION_WINDOW_SIZE);
+
+                rd = GF_CALLOC(num,
+                               sizeof(nsr_recon_record_details_t),
+                               gf_mt_recon_private_t);
+                if (rd == NULL) {
+                        // Without a staging buffer there is nothing to copy;
+                        // bail out instead of dereferencing NULL below.
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "cannot allocate %d reconciliation window records for node %d\n",
+                                       num, index);
+                        break;
+                }
+
+                // TBD - handle errors
+                nsr_recon_libchangelog_get_records(this, priv->changelog_base_path,
+                                                   recon_info->last_term,
+                                                   recon_info->first_index,
+                                                   recon_info->last_index,
+                                                   rd);
+                // The above function writes into rd from 0 to (num -1)
+                // We need to take care of this whenever we deal with records
+                for (i=0; i < num; i++) {
+                        ENDIAN_CONVERSION_RD(rd[i], _gf_true); //ntohl
+                        memcpy(&(recon_info->records[i].rec),
+                               &(rd[i]),
+                               sizeof(nsr_recon_record_details_t));
+                }
+
+                GF_FREE(rd);
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "got reconciliation window records for node %d for term %d \n",
+                               index, recon_info->last_term);
+                break;
+        }
+        default:
+                // Unknown request ids are ignored.
+                break;
+        }
+
+        return;
+}
+
+// Control worker thread for the local node.
+// Initialises the worker context, then loops forever: waits until the
+// work list is non-empty, hands the first queued item to
+// control_worker_func_0(), unlinks it and decrements the driver's
+// count of outstanding requests.
+// The work list is only touched while holding ctx->mutex, and the item is
+// unlinked before completion is published through dr->outstanding, so the
+// item is never still on the list once the driver sees it as finished.
+static void*
+control_worker_main_0(nsr_per_node_worker_t *ctx)
+{
+
+        ctx->is_control = _gf_true;
+        nsr_worker_log(this->name, GF_LOG_INFO,
+                       "starting control worker func 0\n");
+
+        init_worker(ctx, 1);
+
+        while(1)
+        {
+                nsr_recon_work_t *work = NULL;
+                nsr_recon_driver_ctx_t *dr = ctx->driver_ctx;
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "waiting for work\n");
+
+                pthread_mutex_lock(&ctx->mutex);
+                while (list_empty(&(ctx->head.list))) {
+                        pthread_cond_wait(&ctx->cv, &ctx->mutex);
+                }
+                // Pick the first queued item while the list is still locked.
+                // It stays on the list until it has been processed.
+                work = list_entry(ctx->head.list.next, nsr_recon_work_t, list);
+                pthread_mutex_unlock(&ctx->mutex);
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "got work with id %d\n", work->req_id);
+                work->in_use = _gf_false;
+
+                // Call the main function.
+                control_worker_func_0(ctx, work);
+
+                nsr_worker_log(this->name, GF_LOG_INFO,"deleting work item\n");
+                pthread_mutex_lock(&ctx->mutex);
+                list_del_init (&work->list);
+                pthread_mutex_unlock(&ctx->mutex);
+                nsr_worker_log(this->name, GF_LOG_INFO,"finished deleting work item\n");
+
+                // Publish completion only after the item is off the list.
+                atomic_dec(&(dr->outstanding));
+        }
+
+        return NULL;
+}
+
+/*
+ * Control worker funct for getting changelog info on some other node.
+ * calls glfs functions to seek/read/write on aux_fd.
+ *
+ * Input arguments:
+ * ctx - The per worker based context
+ * work - The work item to process. work->req_id selects the operation;
+ *        for the term info requests work->index is overloaded to carry
+ *        the term number.
+ */
+static void
+control_worker_func(nsr_per_node_worker_t *ctx,
+                    nsr_recon_work_t *work)
+{
+        unsigned int index = ctx->index;
+        nsr_replica_worker_t *rw = &(ctx->driver_ctx->workers[index]);
+        nsr_recon_driver_ctx_t *dr = ctx->driver_ctx;
+
+        ctx->is_control = _gf_true;
+
+        switch (work->req_id){
+        case NSR_WORK_ID_INI:
+        {
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "calling nsr_recon_start_work\n");
+
+                // TBD - handle error in case nsr_recon_start_work gives error
+                nsr_recon_start_work(ctx, _gf_true);
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "finished nsr_recon_start_work\n");
+                break;
+        }
+        case NSR_WORK_ID_FINI:
+        {
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "calling nsr_recon_end_work\n");
+
+                // TBD - handle error in case nsr_recon_end_work gives error
+                nsr_recon_end_work(ctx, _gf_true);
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "finished nsr_recon_end_work\n");
+                break;
+        }
+        case NSR_WORK_ID_GET_LAST_TERM_INFO:
+        {
+                nsr_recon_last_term_info_t lt;
+                nsr_reconciliator_info_t *recon_info = rw->recon_info;
+                int32_t term = htonl(work->index); // overloading it
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "trying to get last term info for node %d with current term %d\n",index, work->index);
+
+                // first write the current term term number
+                // TBD - error handling for all the glfs APIs
+                glfs_lseek(ctx->aux_fd, nsr_recon_xlator_sector_4, SEEK_SET);
+                glfs_write(ctx->aux_fd, &term, sizeof(term), 0);
+                glfs_read(ctx->aux_fd, &lt, sizeof(lt), 0);
+                ENDIAN_CONVERSION_LT(lt, _gf_true); //ntohl
+                recon_info->last_term = lt.last_term;
+                recon_info->commited_ops = lt.commited_ops;
+                recon_info->last_index = lt.last_index;
+                recon_info->first_index = lt.first_index;
+
+                // Arguments follow the order named in the format string
+                // (first index, then last index).
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "out of get last term info with current term %d. got ops %d with first %d and last %d \n",
+                               recon_info->last_term, recon_info->commited_ops,
+                               recon_info->first_index, recon_info->last_index);
+
+                break;
+        }
+        case NSR_WORK_ID_GET_GIVEN_TERM_INFO:
+        {
+                nsr_recon_last_term_info_t lt;
+                nsr_reconciliator_info_t *recon_info = rw->recon_info;
+                int32_t term = htonl(work->index); // overloading it
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "trying to get term info for node %d for term %d\n",index, work->index);
+
+                // first write the term number
+                // TBD - error handling for all the glfs APIs
+                glfs_lseek(ctx->aux_fd, nsr_recon_xlator_sector_3, SEEK_SET);
+                glfs_write(ctx->aux_fd, &term, sizeof(term), 0);
+                glfs_read(ctx->aux_fd, &lt, sizeof(lt), 0);
+                ENDIAN_CONVERSION_LT(lt, _gf_true); //ntohl
+                recon_info->last_term = lt.last_term;
+                recon_info->commited_ops = lt.commited_ops;
+                recon_info->last_index = lt.last_index;
+                recon_info->first_index = lt.first_index;
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "out of get term info for term %d. got ops %d with first %d and last %d \n",
+                               recon_info->last_term, recon_info->commited_ops,
+                               recon_info->first_index, recon_info->last_index);
+
+                break;
+        }
+        case NSR_WORK_ID_RECONCILIATOR_DO_WORK:
+        {
+                // Zeroed so that no uninitialised stack bytes go on the wire.
+                nsr_recon_role_t rr = {0, };
+                uint32_t i=0;
+                uint32_t num=0;
+                uint32_t idx = dr->reconciliator_index;
+                uint32_t term = dr->workers[idx].recon_info->last_term;
+                GF_ASSERT(idx == index);
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "trying to make this index %d as reconciliator for term %d\n", index, term);
+
+                // TBD - error handling for all the glfs APIs
+                glfs_lseek(ctx->aux_fd,
+                           nsr_recon_xlator_sector_1,
+                           SEEK_SET);
+
+                // We have all the info for all other nodes.
+                // Fill all that info when sending data to that process.
+                for (i=0; i < dr->replica_group_size; i++) {
+                        if ( dr->workers[i].in_use &&
+                             (dr->workers[i].recon_info->last_term == term)) {
+                                rr.info[num].last_term =
+                                        dr->workers[i].recon_info->last_term;
+                                rr.info[num].commited_ops =
+                                        dr->workers[i].recon_info->commited_ops;
+                                rr.info[num].last_index =
+                                        dr->workers[i].recon_info->last_index;
+                                rr.info[num].first_index =
+                                        dr->workers[i].recon_info->first_index;
+                                strcpy(rr.info[num].name,
+                                       dr->workers[i].name);
+                                // Only count the slots that were actually
+                                // filled; rr.num must match the number of
+                                // valid entries in rr.info.
+                                num++;
+                        }
+                }
+                rr.num = num;
+                rr.role = reconciliator;
+                ENDIAN_CONVERSION_RR(rr, _gf_false); //htonl
+                glfs_write(ctx->aux_fd, &rr, sizeof(rr), 0);
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "sent reconciliator info for term %d with node count as %d\n", term, num);
+
+                break;
+        }
+        case NSR_WORK_ID_RESOLUTION_DO_WORK:
+        {
+                // Zeroed so that no uninitialised stack bytes go on the wire.
+                nsr_recon_role_t rr = {0, };
+                unsigned int i=0, j=0;
+                unsigned int rec = dr->reconciliator_index;
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "trying to make this index %d as resolutor with reconciliator as %d\n",index, rec);
+
+                // TBD - error handling for all the glfs APIs
+                glfs_lseek(ctx->aux_fd,
+                           nsr_recon_xlator_sector_1,
+                           SEEK_SET);
+                rr.num = 2;
+
+                // Fill in info[0] as info for the node for which we are seeking resolution.
+                // Fill in info[1] as info of the reconciliator node.
+                // The function nsr_recon_driver_set_role() that will be called when
+                // this message reaches the node will look at index 1 for term information
+                // related to the reconciliator.
+                for (i=0; i < 2; i++) {
+                        j = (i == 0) ? index : rec;
+                        rr.info[i].last_term =
+                                dr->workers[j].recon_info->last_term;
+                        rr.info[i].commited_ops =
+                                dr->workers[j].recon_info->commited_ops;
+                        rr.info[i].last_index =
+                                dr->workers[j].recon_info->last_index;
+                        rr.info[i].first_index =
+                                dr->workers[j].recon_info->first_index;
+                        // The name is used as the key to convert indices since
+                        // the reconciliator index could be different across the nodes.
+                        strcpy(rr.info[i].name,
+                               dr->workers[j].name);
+                        if (i == 0) {
+                                nsr_worker_log(this->name, GF_LOG_INFO,
+                                               "this node info term=%d, ops=%d, first=%d, last=%d\n",
+                                               rr.info[i].last_term, rr.info[i].commited_ops,
+                                               rr.info[i].first_index,rr.info[i].last_index);
+                        } else {
+                                nsr_worker_log(this->name, GF_LOG_INFO,
+                                               "reconciliator node info term=%d, ops=%d, first=%d, last=%d\n",
+                                               rr.info[i].last_term, rr.info[i].commited_ops,
+                                               rr.info[i].first_index,rr.info[i].last_index);
+                        }
+                }
+                rr.role = resolutor;
+                ENDIAN_CONVERSION_RR(rr, _gf_false); //htonl
+                glfs_write(ctx->aux_fd, &rr, sizeof(rr), 0);
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "sent message to this node %d resolutor with reconciliator as %d\n", index, rec);
+
+                break;
+        }
+        case NSR_WORK_ID_END_RECONCILIATION:
+        {
+                char c[4];
+                uint32_t old = htonl(dr->txn_id);
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "sending reconciliation end message to node %d\n", index);
+
+                memcpy(c, &old, sizeof(uint32_t));
+                // TBD - error handling for all the glfs APIs
+                glfs_lseek(ctx->aux_fd,
+                           nsr_recon_xlator_sector_0,
+                           SEEK_SET);
+                glfs_write(ctx->aux_fd, c, sizeof(c), 0);
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "finished sending reconciliation end message to node %d\n", index);
+
+                break;
+        }
+        case NSR_WORK_ID_GET_RECONCILATION_WINDOW:
+        {
+                nsr_recon_log_info_t li;
+                nsr_reconciliator_info_t *recon_info = rw->recon_info;
+                uint32_t i = 0;
+                uint32_t num = (recon_info->last_index -
+                                recon_info->first_index +1);
+                nsr_recon_record_details_t *rd = NULL;
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "trying to get reconciliation window records for node %d for term %d with first %d last %d\n",
+                               index, recon_info->last_term, recon_info->first_index, recon_info->last_index);
+
+                GF_ASSERT(num <= MAX_RECONCILIATION_WINDOW_SIZE);
+
+                // Get the receive buffer first so that an allocation failure
+                // does not leave a request written with nobody reading the reply.
+                rd = GF_CALLOC(num,
+                               sizeof(nsr_recon_record_details_t),
+                               gf_mt_recon_private_t);
+                if (rd == NULL) {
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "cannot allocate %d reconciliation window records for node %d\n",
+                                       num, index);
+                        break;
+                }
+
+                // TBD - error handling for all the glfs APIs
+                glfs_lseek(ctx->aux_fd, nsr_recon_xlator_sector_2, SEEK_SET);
+
+                // write to node what term & indices we are interested
+                li.term = recon_info->last_term;
+                li.first_index = recon_info->first_index;
+                li.last_index = recon_info->last_index;
+                ENDIAN_CONVERSION_LI(li, _gf_false); //htonl
+                glfs_write(ctx->aux_fd, &li, sizeof(li), 0);
+
+                // then read
+                glfs_read(ctx->aux_fd, rd, num * sizeof(nsr_recon_record_details_t), 0);
+                for (i=0; i < num; i++) {
+                        ENDIAN_CONVERSION_RD(rd[i], _gf_true); //ntohl
+                        memcpy(&(recon_info->records[i].rec),
+                               &(rd[i]),
+                               sizeof(nsr_recon_record_details_t));
+                        nsr_worker_log(this->name, GF_LOG_INFO,
+                                       "get_reconcilaition_window:Got %d at index %d\n",
+                                       recon_info->records[i].rec.type,
+                                       i + recon_info->first_index);
+                }
+                // Memory from GF_CALLOC must be released with GF_FREE,
+                // not the libc free().
+                GF_FREE(rd);
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "got reconciliation window records for node %d for term %d \n",
+                               index, recon_info->last_term);
+                break;
+        }
+        default:
+                // Unknown request ids are ignored.
+                break;
+        }
+
+        return;
+}
+
+// Control worker thread.
+// For index 0 (local processing) this delegates to control_worker_main_0().
+// Otherwise it initialises the worker context and loops forever: waits
+// until the work list is non-empty, hands the first queued item to
+// control_worker_func(), unlinks it and decrements the driver's count of
+// outstanding requests.
+// The work list is only touched while holding ctx->mutex, and the item is
+// unlinked before completion is published through dr->outstanding, so the
+// item is never still on the list once the driver sees it as finished.
+static void*
+control_worker_main(nsr_per_node_worker_t *ctx)
+{
+        unsigned int index = ctx->index;
+
+        ctx->is_control = _gf_true;
+        nsr_worker_log(this->name, GF_LOG_INFO,
+                       "starting control worker func\n");
+
+        // if this is for local processing, call the changelog parsing calls directly
+        if (index == 0) {
+                control_worker_main_0(ctx);
+                return NULL;
+        }
+
+        init_worker(ctx, 1);
+
+
+        while(1)
+        {
+                nsr_recon_work_t *work = NULL;
+                nsr_recon_driver_ctx_t *dr = ctx->driver_ctx;
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "waiting for work\n");
+
+                pthread_mutex_lock(&ctx->mutex);
+                while (list_empty(&(ctx->head.list))) {
+                        pthread_cond_wait(&ctx->cv, &ctx->mutex);
+                }
+                // Pick the first queued item while the list is still locked.
+                // It stays on the list until it has been processed.
+                work = list_entry(ctx->head.list.next, nsr_recon_work_t, list);
+                pthread_mutex_unlock(&ctx->mutex);
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "got work with id %d\n", work->req_id);
+                work->in_use = _gf_false;
+                control_worker_func(ctx,work);
+
+                nsr_worker_log(this->name, GF_LOG_INFO,"deleting work item\n");
+                pthread_mutex_lock(&ctx->mutex);
+                list_del_init (&work->list);
+                pthread_mutex_unlock(&ctx->mutex);
+                nsr_worker_log(this->name, GF_LOG_INFO,"finished deleting work item\n");
+
+                // Publish completion only after the item is off the list.
+                atomic_dec(&(dr->outstanding));
+        }
+
+        return NULL;
+}
+
+/*
+ * This function gets called if this process is chosen as the reconciliator
+ * for this replica group. It would have already got the records for the last term
+ * for the indices that are required (from the first HOLE to last index) from
+ * all other nodes that also witnessed that term. Compare all the records and
+ * compute the work required.
+ *
+ * For every index the local record type is compared against the record
+ * type reported by each in-use peer. When a peer has a "higher" type,
+ * the work type and the source peer are stored in the local record and
+ * the peer's record details overwrite the local ones.
+ *
+ * Input arguments
+ * ctx - driver context. All recon work is stored in workers[0].recon_info
+ */
+static void
+compute_reconciliation_work(nsr_recon_driver_ctx_t *ctx)
+{
+        uint32_t i=0, j=0;
+        nsr_reconciliator_info_t *my_recon = ctx->workers[0].recon_info;
+        uint32_t num = (my_recon->last_index - my_recon->first_index + 1);
+
+        for (i=0; i < num; i++) {
+                nsr_log_type_t orig, new;
+                unsigned int src = 0;
+                nsr_recon_work_type_t tw = NSR_RECON_WORK_NONE;
+
+                orig = new = my_recon->records[i].rec.type;
+                // index 0 means this node. Look at all other nodes.
+                for (j=1; j < ctx->replica_group_size; j++) {
+                        if (ctx->workers[j].in_use) {
+                                // Compare against the peer's record type (rec.type).
+                                // The peer's work.type is a work type and is not
+                                // filled in when its records are fetched.
+                                nsr_log_type_t pr = ctx->workers[j].recon_info->records[i].rec.type;
+                                if ((new != pr) && (pr > new)) {
+                                        src = j;
+                                        new = (new | pr);
+                                }
+                        }
+                }
+                // TBD - compare data if new and orig are all FILLs. (can detect changelog corruption)
+                // Right now we compare if both orig and new are pseudo holes since
+                // only that is of interest to us.
+                // NOTE(review): the PSEUDO_HOLE/PSEUDO_HOLE branch below cannot be
+                // reached while it sits under (orig != new) - confirm the intent
+                // before enabling it.
+                if (orig != new) {
+                        if ((orig == NSR_LOG_HOLE) && (new == NSR_LOG_PSEUDO_HOLE))
+                                tw = NSR_RECON_WORK_HOLE_TO_PSEUDO_HOLE;
+                        else if ((orig == NSR_LOG_HOLE) && (new == NSR_LOG_FILL))
+                                tw = NSR_RECON_WORK_HOLE_TO_FILL;
+                        else if ((orig == NSR_LOG_PSEUDO_HOLE) && (new == NSR_LOG_PSEUDO_HOLE))
+                                tw = NSR_RECON_WORK_COMPARE_PSEUDO_HOLE;
+                        else if ((orig == NSR_LOG_PSEUDO_HOLE) && (new == NSR_LOG_FILL))
+                                tw = NSR_RECON_WORK_HOLE_TO_FILL;
+                }
+                if (tw != NSR_RECON_WORK_NONE) {
+                        my_recon->records[i].work.type = tw;
+                        my_recon->records[i].work.source = src;
+                        // Overwrite the record
+                        memcpy(&(my_recon->records[i].rec),
+                               &(ctx->workers[src].recon_info->records[i].rec),
+                               sizeof(nsr_recon_record_details_t));
+                }
+        }
+        return;
+}
+
+// Forward declaration; used by nsr_recon_driver_set_role() below.
+static void
+nsr_recon_in_use(nsr_recon_driver_ctx_t *ctx,
+                 uint32_t i,
+                 gf_boolean_t in_use);
+
+/*
+ * Write the role and associated information to the node.
+ * This gets called from recon xlator indicating node is either
+ * leader, reconciliator or should do resolution.
+ * First we undo the last role to make sure we clean up.
+ *
+ * Input arguments
+ * ctx - driver context.
+ * rr - Role information.
+ * If leader, the thread now sends the list of all nodes that are part of
+ * the current replica group. Use that to find out the activate the
+ * required worker threads.
+ * If reconciliator, the leader node would have sent information about
+ * all nodes which saw last term as the reconciliator.
+ * If resolution to be done, then rr.info[0] will have this node's info
+ * which the leader would have got earlier. rr[1].info will have the
+ * info regarding the reconciliator.
+ * txn_id - All role changes(except when leader becomes reconciliator or resolutor)
+ * would be initiated as write to the recon xlator which would have got a frame from
+ * either the brick process(leader change) or other reconciliation process.
+ * The write function would return immediately after storing the frame which
+ * needs to be returned back after the actual reconciliation is done.
+ * For that we store the frame against this id which acts as a key.
+ *
+ * Returns _gf_true once the role is recorded and the main driver thread has
+ * been signalled; _gf_false if allocating a recon_info fails. ctx->mutex is
+ * released on every return path.
+ */
+gf_boolean_t
+nsr_recon_driver_set_role(nsr_recon_driver_ctx_t *ctx,
+                          nsr_recon_role_t *rr,
+                          uint32_t txn_id)
+{
+        // 32-bit counters: a narrower type could wrap before reaching the
+        // loop bounds.
+        uint32_t i=0, j=0;
+        gf_boolean_t ret = _gf_false;
+
+        pthread_mutex_lock(&(ctx->mutex));
+        ctx->state = rr->role;
+        // First make all the threads uninitialise
+        for (i = 0; i < ctx->replica_group_size; i++) {
+                nsr_recon_in_use(ctx, i, _gf_false);
+        }
+        if (rr->role == leader) {
+
+                // First set info this node
+                nsr_recon_in_use(ctx, 0, _gf_true);
+                ctx->workers[0].recon_info = GF_CALLOC (1,
+                                                        sizeof (nsr_reconciliator_info_t),
+                                                        gf_mt_recon_private_t);
+                if (!ctx->workers[0].recon_info) {
+                        goto unlock;
+                }
+                ctx->current_term = rr->current_term;
+
+                // Find rest of the nodes
+                for (i=1; i < ctx->replica_group_size; i++) {
+                        for (j=0 ; j < rr->num; j++) {
+                                // TBD - make this strcmp later when etcd servers set properly
+                                //if (!strcmp(ctx->workers[i].name, rr->info[j].name)) {
+                                if (strstr(ctx->workers[i].name, rr->info[j].name)) {
+                                        nsr_driver_log(this->name, GF_LOG_INFO,
+                                                       "nsr_recon_driver_set_role: this as leader. found other server %s\n",
+                                                       ctx->workers[i].name);
+
+                                        nsr_recon_in_use(ctx, i, _gf_true);
+                                        // Allocate this here. This will get later filled when
+                                        // the leader tries to get last term information from all
+                                        // the nodes
+                                        ctx->workers[i].recon_info = GF_CALLOC (1,
+                                                                                sizeof (nsr_reconciliator_info_t),
+                                                                                gf_mt_recon_private_t);
+                                        if (!ctx->workers[i].recon_info) {
+                                                goto unlock;
+                                        }
+                                        break;
+                                }
+                        }
+                }
+                ctx->reconciliator_index = -1;
+        } else if (rr->role == reconciliator) {
+                ctx->reconciliator_index = 0;
+                // Copy information about all the other members which had the same term
+                for (i=0; i < rr->num; i++) {
+                        for (j=0; j < ctx->replica_group_size; j++) {
+                                //if (!strcmp(rr->info[i].name, ctx->workers[j].name)) {
+                                if (strstr(ctx->workers[j].name, rr->info[i].name)) {
+                                        nsr_driver_log(this->name, GF_LOG_INFO,
+                                                       "nsr_recon_driver_set_role: this as reconciliator. found other server %s\n",
+                                                       ctx->workers[j].name);
+                                        ctx->workers[j].recon_info = GF_CALLOC (1,
+                                                                                sizeof (nsr_reconciliator_info_t),
+                                                                                gf_mt_recon_private_t);
+                                        if (!ctx->workers[j].recon_info) {
+                                                goto unlock;
+                                        }
+                                        ctx->workers[j].recon_info->last_term =
+                                                rr->info[i].last_term;
+                                        ctx->workers[j].recon_info->commited_ops =
+                                                rr->info[i].commited_ops;
+                                        ctx->workers[j].recon_info->last_index =
+                                                rr->info[i].last_index;
+                                        ctx->workers[j].recon_info->first_index =
+                                                rr->info[i].first_index;
+                                        nsr_recon_in_use(ctx, j, _gf_true);
+                                        break;
+                                }
+                        }
+                }
+        } else if (rr->role == resolutor) {
+                for (j=0; j < ctx->replica_group_size; j++) {
+                        // info[1] has the information regarding the reconciliator
+                        if (strstr(ctx->workers[j].name, rr->info[1].name)) {
+                        //if (!strcmp(rr->info[1].name, ctx->workers[j].name)) {
+                                // Log the worker that actually matched (index j).
+                                nsr_driver_log(this->name, GF_LOG_INFO,
+                                               "nsr_recon_driver_set_role: this as resolutor. found other server %s as reconciliator\n",
+                                               ctx->workers[j].name);
+                                ctx->workers[j].recon_info = GF_CALLOC (1,
+                                                                        sizeof (nsr_reconciliator_info_t),
+                                                                        gf_mt_recon_private_t);
+                                if (!ctx->workers[j].recon_info) {
+                                        goto unlock;
+                                }
+                                ctx->workers[j].recon_info->last_term =
+                                        rr->info[1].last_term;
+                                ctx->workers[j].recon_info->commited_ops =
+                                        rr->info[1].commited_ops;
+                                ctx->workers[j].recon_info->last_index =
+                                        rr->info[1].last_index;
+                                ctx->workers[j].recon_info->first_index =
+                                        rr->info[1].first_index;
+                                ctx->reconciliator_index = j;
+                                nsr_recon_in_use(ctx, j, _gf_true);
+                                GF_ASSERT(ctx->reconciliator_index != 0);
+                                break;
+                        }
+                }
+                ctx->workers[0].recon_info = GF_CALLOC (1,
+                                                        sizeof (nsr_reconciliator_info_t),
+                                                        gf_mt_recon_private_t);
+                if (!ctx->workers[0].recon_info) {
+                        goto unlock;
+                }
+                // info[0] has all info for this node
+                ctx->workers[0].recon_info->last_term = rr->info[0].last_term;
+                ctx->workers[0].recon_info->commited_ops = rr->info[0].commited_ops;
+                ctx->workers[0].recon_info->last_index = rr->info[0].last_index;
+                ctx->workers[0].recon_info->first_index = rr->info[0].first_index;
+                nsr_recon_in_use(ctx, 0, _gf_true);
+        }
+
+        ctx->txn_id = txn_id;
+        // Signal the main driver thread
+        pthread_cond_signal(&(ctx->cv));
+        ret = _gf_true;
+
+unlock:
+        // Single exit point so the driver mutex is never left locked,
+        // including on the allocation failure paths above.
+        pthread_mutex_unlock(&(ctx->mutex));
+        return ret;
+}
+
+
+/*
+ * Work out what this node has to do to bring itself in sync with the
+ * reconciliator. Called when this process is chosen to sync itself
+ * with the reconciliator.
+ *
+ * For every index in the local window the local record type is compared
+ * with the reconciliator's; the resulting work type and the reconciliator
+ * index are stored in the local record, and the reconciliator's record
+ * details are copied over the local ones in all cases.
+ *
+ * Input arguments
+ * ctx - driver context.
+ * my_info - local changelog info that has all the local records for indices that require work
+ * his_info - reconciliator's info that has all the golden copies
+ * invalidate - if set to true, then do not consult local records
+ *              (every local record is treated as a HOLE)
+ */
+
+static void
+compute_resolution_work(nsr_recon_driver_ctx_t *ctx,
+                        nsr_reconciliator_info_t *my_info,
+                        nsr_reconciliator_info_t *his_info,
+                        gf_boolean_t invalidate)
+{
+        uint32_t count = (my_info->last_index - my_info->first_index + 1);
+        uint32_t idx = 0;
+
+        while (idx < count) {
+                nsr_log_type_t mine = my_info->records[idx].rec.type;
+                nsr_log_type_t theirs = his_info->records[idx].rec.type;
+                nsr_recon_work_type_t todo = NSR_RECON_WORK_NONE;
+
+                if (invalidate) {
+                        mine = NSR_LOG_HOLE;
+                }
+
+                // TBD - we can never have PSUEDO_HOLE in reconciliator's info
+                // We should have taken care of that during reconciliation.
+                // Put an assert to validate that.
+                if (theirs != mine) {
+                        // Since the two types differ here, checking the
+                        // reconciliator's type alone decides the work.
+                        if (theirs == NSR_LOG_FILL) {
+                                todo = NSR_RECON_WORK_HOLE_TO_FILL;
+                        } else if (theirs == NSR_LOG_HOLE) {
+                                todo = NSR_RECON_WORK_UNDO_FILL;
+                        }
+                }
+
+                // The golden record is taken over regardless of the work type.
+                my_info->records[idx].work.source = ctx->reconciliator_index;
+                my_info->records[idx].work.type = todo;
+                memcpy(&(my_info->records[idx].rec),
+                       &(his_info->records[idx].rec),
+                       sizeof(nsr_recon_record_details_t));
+
+                idx++;
+        }
+}
+
+
+// Create a glfs object from the textual gfid.
+//
+// Input arguments:
+// ctx - per node worker context; ctx->fs is the glfs instance used
+// gfid_str - gfid in string form, parsed with uuid_parse()
+//
+// Returns the object handle, or NULL if the gfid string cannot be parsed
+// or the handle cannot be created.
+static struct glfs_object *
+create_obj(nsr_per_node_worker_t *ctx, char *gfid_str)
+{
+        struct glfs_object *obj = NULL;
+        uuid_t gfid;
+
+        // uuid_parse() leaves gfid untouched on failure; do not build a
+        // handle from uninitialised bytes.
+        if (uuid_parse(gfid_str, gfid) != 0) {
+                nsr_worker_log(this->name, GF_LOG_ERROR,
+                               "parsing of gfid %s failed\n", gfid_str);
+                return NULL;
+        }
+
+        obj = glfs_h_create_from_handle(ctx->fs, gfid, GFAPI_HANDLE_LENGTH, NULL);
+        if (obj == NULL) {
+                GF_ASSERT(obj != NULL);
+                nsr_worker_log(this->name, GF_LOG_ERROR,
+                               "creating of handle failed\n");
+                return NULL;
+        }
+        return obj;
+}
+
+/*
+ * Function to apply the actual record onto the local brick.
+ * prior to this we should have read all the data from the
+ * brick that has the data.
+ *
+ * Input parameters:
+ * ctx - per node worker context that has the fs for communicating to brick
+ * ri - Reconciliation record that needs fixup
+ * dict - So that NSR server translator on brick applis fixup only on this brick
+ * and the changelog translator consumes term and index.
+ */
+
+static void
+apply_record(nsr_per_node_worker_t *ctx,
+ nsr_reconciliation_record_t *ri,
+ dict_t * dict)
+{
+ struct glfs_fd *fd = NULL;
+ struct glfs_object *obj = NULL;
+
+
+ if (ri->rec.op == GF_FOP_WRITE) {
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "DOing write for file %s @offset %d for len %d\n",
+ ri->rec.gfid, ri->rec.offset, ri->rec.len);
+
+ // The file has got deleted on the source. Hence just ignore this.
+ // TBD - get a way to just stuff the log entry without writing the data so that
+ // changelogs remain identical.
+ if (ri->work.data == NULL) {
+ return;
+ }
+
+ if ((obj = create_obj(ctx,ri->rec.gfid)) == NULL) return;
+
+ fd = glfs_h_open_with_xdata(ctx->fs, obj, O_RDWR, dict);
+ if (fd == NULL) {
+ GF_ASSERT(0);
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "open for file %s failed\n",
+ ri->rec.gfid);
+ return;
+ }
+ if (glfs_lseek_with_xdata(fd, ri->rec.offset, SEEK_SET, dict) != ri->rec.offset) {
+ GF_ASSERT(0);
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "lseek for file %s failed at offset %d\n",
+ ri->rec.gfid, ri->rec.offset);
+ return;
+ }
+ if (glfs_write_with_xdata(fd, ri->work.data, ri->rec.len, 0, dict) != ri->rec.len) {
+ GF_ASSERT(0);
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "write for file %s failed for bytes %d\n",
+ ri->rec.gfid, ri->rec.len);
+ return;
+ }
+ glfs_close_with_xdata(fd, dict);
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Finished DOing write for gfid %s @offset %d for len %d\n",
+ ri->rec.gfid, ri->rec.offset, ri->rec.len);
+
+ } else if (ri->rec.op == GF_FOP_FTRUNCATE) {
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "DOing truncate for file %s @offset %d \n",
+ ri->rec.gfid, ri->rec.offset);
+
+ if ((obj = create_obj(ctx, ri->rec.gfid)) == NULL) return;
+
+ fd = glfs_h_open_with_xdata(ctx->fs, obj, O_RDWR, dict);
+ if (fd == NULL) {
+ GF_ASSERT(0);
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "open for file %s failed\n",
+ ri->rec.gfid);
+ return;
+ }
+ if (glfs_ftruncate_with_xdata(fd, ri->rec.offset, dict) == -1) {
+ GF_ASSERT(0);
+ nsr_worker_log(this->name, GF_LOG_ERROR
+ "trunctae for file %s failed @offset %d\n",
+ ri->rec.gfid,ri->rec.offset );
+ return;
+ }
+ glfs_close_with_xdata(fd, dict);
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Finished DOing truncate for gfid %s @offset %d \n",
+ ri->rec.gfid, ri->rec.offset);
+
+ } else if ((ri->rec.op == GF_FOP_FREMOVEXATTR) ||
+ (ri->rec.op == GF_FOP_REMOVEXATTR) ||
+ (ri->rec.op == GF_FOP_SETXATTR) ||
+ (ri->rec.op == GF_FOP_FSETXATTR)) {
+
+ uint32_t k_s = 0, v_s = 0;
+ char *t_b= NULL;
+ uint32_t num = 0;
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Doing set extended attr for file %s \n",
+ ri->rec.gfid);
+
+ // The file has got deleted on the source. Hence just ignore this.
+ // TBD - get a way to just stuff the log entry without writing the data so that
+ // changelogs remain identical.
+ if (ri->work.data == NULL) {
+ return;
+ }
+
+ if ((obj = create_obj(ctx, ri->rec.gfid)) == NULL) return;
+
+ if (obj->inode->ia_type == IA_IFDIR)
+ fd = glfs_h_opendir_with_xdata(ctx->fs, obj, dict);
+ else
+ fd = glfs_h_open_with_xdata(ctx->fs, obj, O_RDWR, dict);
+ if (fd == NULL) {
+ GF_ASSERT(0);
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "open for file %s failed\n",
+ ri->rec.gfid);
+ return;
+ }
+
+ if(get_xattr_total_size(fd, &t_b, &k_s, &v_s, &num, dict) == -1) {
+ if (t_b) free(t_b);
+ GF_ASSERT(0);
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "list of xattr of %s failed\n", ri->rec.gfid);
+ return;
+ }
+
+ delete_xattr(fd, dict, t_b, num);
+
+ // Set one special dict flag to indicate the opcode so that
+ // the opcode gets set to this
+ if (dict_set_int32(dict,"recon-xattr-opcode",ri->rec.op)) {
+ GF_ASSERT(0);
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "setting opcode to %d failed\n",ri->rec.op);
+ return;
+ }
+
+ fill_xattr(fd, dict, ri->work.data, ri->work.num);
+
+ glfs_close_with_xdata(fd, dict);
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Finsihed Doing set extended attr for %s \n",
+ ri->rec.gfid);
+
+ } else if (ri->rec.op == GF_FOP_CREATE) {
+
+ uuid_t gfid;
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Doing create for file %s \n",
+ ri->rec.gfid);
+
+ // TBD - add mode and flags later
+ uuid_parse(ri->rec.gfid, gfid);
+ if ((obj = create_obj(ctx, ri->rec.pargfid)) == NULL) return;
+
+ if (glfs_h_creat_with_xdata(ctx->fs, obj, ri->rec.entry, O_RDWR, 0777, NULL, gfid, dict) == NULL) {
+ GF_ASSERT(0);
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "Failure for Doing create for file %s\n",
+ ri->rec.entry);
+ return;
+ }
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Finished Doing create for file %s \n",
+ ri->rec.entry);
+
+ } else if (ri->rec.op == GF_FOP_MKNOD) {
+
+ uuid_t gfid;
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Doing mknod for file %s \n",
+ ri->rec.entry);
+
+ // TBD - add mode and flags later
+ uuid_parse(ri->rec.gfid, gfid);
+ if ((obj = create_obj(ctx, ri->rec.pargfid)) == NULL) return;
+
+ if (glfs_h_mknod_with_xdata(ctx->fs, obj, ri->rec.entry, O_RDWR, 0777, NULL, gfid, dict) == NULL) {
+ GF_ASSERT(0);
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "Failure for Doing mknod for file %s\n",
+ ri->rec.entry);
+ return;
+ }
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Finished Doing mknod for file %s \n",
+ ri->rec.entry);
+
+ } else if (ri->rec.op == GF_FOP_MKDIR) {
+
+ uuid_t gfid;
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Doing mkdir for dir %s \n",
+ ri->rec.gfid);
+
+ // TBD - add mode and flags later
+ uuid_parse(ri->rec.gfid, gfid);
+ if ((obj = create_obj(ctx, ri->rec.pargfid)) == NULL) return;
+
+ if (glfs_h_mkdir_with_xdata(ctx->fs, obj, ri->rec.entry, 0777, NULL, gfid, dict) != 0) {
+ GF_ASSERT(0);
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "Failure for Doing mkdir for file %s\n",
+ ri->rec.entry);
+ return;
+ }
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Finished Doing mkdir for file %s \n",
+ ri->rec.entry);
+
+ } else if ((ri->rec.op == GF_FOP_RMDIR) || (ri->rec.op == GF_FOP_UNLINK)) {
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Doing rmdir/ublink for dir %s \n",
+ ri->rec.entry);
+
+ if ((obj = create_obj(ctx, ri->rec.pargfid)) == NULL) return;
+ if (glfs_h_unlink_with_xdata(ctx->fs, obj, ri->rec.entry, dict) != 0) {
+ GF_ASSERT(0);
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "Failure for Doing rmdir/unlink for file %s\n",
+ ri->rec.entry);
+ return;
+ }
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Finished Doing rmdir/unlink for file %s \n",
+ ri->rec.entry);
+
+ } else if (ri->rec.op == GF_FOP_SYMLINK) {
+
+ uuid_t gfid;
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Doing symlink for file %s to file %s \n",
+ ri->rec.entry, ri->rec.link_path);
+
+ if ((obj = create_obj(ctx, ri->rec.pargfid)) == NULL) return;
+ uuid_parse(ri->rec.gfid, gfid);
+
+ if (glfs_h_symlink_with_xdata(ctx->fs, obj, ri->rec.entry, ri->rec.link_path, NULL, gfid, dict) == NULL) {
+ GF_ASSERT(0);
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "Failed to Doing symlink for file %s to file %s \n",
+ ri->rec.entry, ri->rec.link_path);
+ return;
+ }
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Finished Doing symlink for file %s to file %s \n",
+ ri->rec.entry, ri->rec.link_path);
+
+ } else if (ri->rec.op == GF_FOP_LINK) {
+
+ struct glfs_object *to_obj = NULL;
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Doing hard link for file %s to file %s \n",
+ ri->rec.entry, ri->rec.gfid);
+
+ if ((obj = create_obj(ctx, ri->rec.pargfid)) == NULL) return;
+ if ((to_obj = create_obj(ctx, ri->rec.gfid)) == NULL) return;
+
+ if (glfs_h_link_with_xdata(ctx->fs, to_obj, obj, ri->rec.entry, dict) == -1) {
+ GF_ASSERT(0);
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "Failed to Doing hard link for file %s to file %s \n",
+ ri->rec.entry, ri->rec.gfid);
+ return;
+ }
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Finsihed doing hard link for file %s to file %s \n",
+ ri->rec.entry, ri->rec.gfid);
+
+ } else if (ri->rec.op == GF_FOP_RENAME) {
+
+ struct glfs_object *to_obj = NULL;
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Doing rename for file %s to file %s \n",
+ ri->rec.entry, ri->rec.newloc);
+
+ if ((obj = create_obj(ctx, ri->rec.pargfid)) == NULL) return;
+ if ((to_obj = create_obj(ctx, ri->rec.gfid)) == NULL) return;
+
+ if (glfs_h_rename_with_xdata(ctx->fs, obj, ri->rec.entry, to_obj, ri->rec.newloc, dict) == -1) {
+ GF_ASSERT(0);
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "Failed to Doing rename for file %s to file %s \n",
+ ri->rec.entry, ri->rec.newloc);
+ return;
+ }
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Finsihed doing renam for file %s to file %s \n",
+ ri->rec.entry, ri->rec.newloc);
+
+
+ } else if ((ri->rec.op == GF_FOP_SETATTR) || (ri->rec.op == GF_FOP_FSETATTR)) {
+
+ struct iatt iatt = {0, };
+ int valid = 0;
+ int ret = -1;
+
+ // TBD - do the actual settings once we do that
+ // right now we just set the mode so that changelog gets filled
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Doing attr for file %s \n",
+ ri->rec.gfid);
+
+ if ((obj = create_obj(ctx, ri->rec.gfid)) == NULL) return;
+
+ if (obj->inode->ia_type == IA_IFDIR)
+ fd = glfs_h_opendir_with_xdata(ctx->fs, obj, dict);
+ else
+ fd = glfs_h_open_with_xdata(ctx->fs, obj, O_RDWR, dict);
+ if (fd == NULL) {
+ GF_ASSERT(0);
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "open for file %s failed\n",
+ ri->rec.gfid);
+ return;
+ }
+
+ iatt.ia_prot = ia_prot_from_st_mode(777);
+ valid = GF_SET_ATTR_MODE;
+
+
+ // Set one special dict flag to indicate the opcode so that
+ // the opcode gets set to this
+ if (dict_set_int32(dict,"recon-attr-opcode",ri->rec.op)) {
+ GF_ASSERT(0);
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "setting opcode to %d failed\n",ri->rec.op);
+ return;
+ }
+
+ ret = glfs_fsetattr_with_xdata(fd, &iatt, valid, dict);
+ if (ret == -1) {
+ GF_ASSERT(0);
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "failed Doing attr for file %s \n",
+ ri->rec.gfid);
+ return;
+ }
+
+ glfs_close_with_xdata(fd, dict);
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Doing attr for file %s \n",
+ ri->rec.gfid);
+
+ }
+
+ return;
+}
+
+/*
+ * Tell whether a changelog record needs payload read from the source node
+ * before it can be applied locally.
+ *
+ * Returns _gf_true for WRITE, FSETATTR, SETATTR, FREMOVEXATTR, SETXATTR,
+ * FSETXATTR and SYMLINK records and _gf_false for every other opcode.
+ *
+ * NOTE(review): GF_FOP_REMOVEXATTR is not listed here although the read
+ * path in data_worker_func() handles it - confirm whether that is intended.
+ */
+static gf_boolean_t
+recon_check_changelog(nsr_recon_record_details_t *rd)
+{
+        switch (rd->op) {
+        case GF_FOP_WRITE:
+        case GF_FOP_FSETATTR:
+        case GF_FOP_SETATTR:
+        case GF_FOP_FREMOVEXATTR:
+        case GF_FOP_SETXATTR:
+        case GF_FOP_FSETXATTR:
+        case GF_FOP_SYMLINK:
+                return _gf_true;
+        default:
+                return _gf_false;
+        }
+}
+
+/*
+ * Placeholder (TBD): decide whether undoing a filled record needs data read
+ * from the source.  The record is not inspected yet; the answer is always
+ * _gf_false.
+ */
+static gf_boolean_t
+recon_compute_undo(nsr_recon_record_details_t *rd)
+{
+        (void) rd;      /* not consulted until the undo logic exists */
+
+        return _gf_false;
+}
+
+
+/*
+ * Function that talks to the brick for data transfer.
+ *
+ * Dispatches on work->req_id:
+ *   INI / FINI - start or end work on this worker's connection.
+ *   READ       - read the payload (file data or xattrs) for the record at
+ *                work->index into ri->work.data.
+ *   COMMIT     - replay the record at work->index via apply_record().
+ *   FLUSH      - fsync for the record at work->index (the open is still
+ *                TBD, so there is currently never an fd to sync).
+ * Any other request id is ignored.
+ *
+ * Every reconciliation case owns its xdata dictionary and releases it
+ * through its own cleanup label; no case jumps into another case's scope.
+ *
+ * Input arguments:
+ * ctx - worker context
+ * work - pointer to work object
+ */
+static void
+data_worker_func(nsr_per_node_worker_t *ctx,
+                 nsr_recon_work_t *work)
+{
+        nsr_recon_driver_ctx_t *dr = ctx->driver_ctx;
+        nsr_reconciliation_record_t *ri = NULL;
+        nsr_recon_record_details_t *rd = NULL;
+        glfs_fd_t *fd = NULL;
+        int wip = 0;
+
+        switch (work->req_id){
+        case NSR_WORK_ID_INI:
+        {
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "started data ini \n");
+
+                nsr_recon_start_work(ctx, _gf_false);
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "finished data ini \n");
+                break;
+        }
+        case NSR_WORK_ID_FINI:
+        {
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "started data fini \n");
+
+                nsr_recon_end_work(ctx, _gf_false);
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "finished data fini \n");
+                break;
+        }
+        case NSR_WORK_ID_SINGLE_RECONCILIATION_READ:
+        {
+                dict_t *dict = NULL;
+                // Declared at case scope so that read_out can release them
+                // on every path, including the error paths.
+                struct glfs_fd *src_fd = NULL;
+                struct glfs_object *src_obj = NULL;
+
+                // first_index always starts with 1 but records starts at 0.
+                wip = work->index - (dr->workers[0].recon_info->first_index);
+                ri = &(dr->workers[0].recon_info->records[wip]);
+                rd = &(ri->rec);
+
+                dict = dict_new ();
+                if (!dict) {
+                        GF_ASSERT(0);
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "failed allocating for dictionary\n");
+                        goto read_out;
+                }
+                if (dict_set_int32(dict,RECON_TERM_XATTR,ri->work.term)) {
+                        GF_ASSERT(0);
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "error setting term in dict\n");
+                        goto read_out;
+                }
+                if (dict_set_int32(dict,RECON_INDEX_XATTR,ri->work.index)) {
+                        GF_ASSERT(0);
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "error setting index in dict\n");
+                        goto read_out;
+                }
+
+                if (rd->op == GF_FOP_WRITE) {
+
+                        // record already copied.
+                        // copy data to this node's info.
+                        uuid_t gfid;
+
+                        uuid_parse(ri->rec.gfid, gfid);
+
+                        nsr_worker_log(this->name, GF_LOG_INFO,
+                                       "started recon read for file %s at offset %d at len %d\n",
+                                       ri->rec.gfid, rd->offset, rd->len);
+
+                        src_obj = glfs_h_create_from_handle(ctx->fs, gfid,
+                                                            GFAPI_HANDLE_LENGTH, NULL);
+                        if (src_obj == NULL) {
+                                GF_ASSERT(src_obj != NULL);
+                                nsr_worker_log(this->name, GF_LOG_ERROR,
+                                               "creating of handle failed\n");
+                                goto read_out;
+                        }
+
+                        // The file has probably got deleted.
+                        src_fd = glfs_h_open_with_xdata(ctx->fs, src_obj, O_RDONLY, dict);
+                        if (src_fd == NULL) {
+                                GF_ASSERT(src_fd != NULL);
+                                nsr_worker_log(this->name, GF_LOG_ERROR,
+                                               "opening of file failed\n");
+                                goto read_out;
+                        }
+
+                        if (glfs_lseek_with_xdata(src_fd, rd->offset, SEEK_SET, dict) != rd->offset) {
+                                GF_ASSERT(0);
+                                nsr_worker_log(this->name, GF_LOG_ERROR,
+                                               "lseek of file failed to offset %d\n", rd->offset);
+                                goto read_out;
+                        }
+
+                        ri->work.data = GF_CALLOC(rd->len , sizeof(char), gf_mt_recon_private_t);
+                        if (!ri->work.data) {
+                                GF_ASSERT(0);
+                                nsr_worker_log(this->name, GF_LOG_ERROR,
+                                               "failed allocating read buffer\n");
+                                goto read_out;
+                        }
+                        if (glfs_read_with_xdata(src_fd, ri->work.data, rd->len, 0, dict) != rd->len) {
+                                GF_ASSERT(0);
+                                nsr_worker_log(this->name, GF_LOG_ERROR,
+                                               "read of file failed to offset %d for bytes %d\n", rd->offset, rd->len);
+                                goto read_out;
+                        }
+
+                } else if ((rd->op == GF_FOP_FREMOVEXATTR) ||
+                           (rd->op == GF_FOP_REMOVEXATTR) ||
+                           (rd->op == GF_FOP_SETXATTR) ||
+                           (rd->op == GF_FOP_FSETXATTR)) {
+
+                        // This is for all the set attribute/extended attributes commands.
+                        // Get all the attributes from the source and fill it in the buffer
+                        // as a NULL separated key and value which are in turn separated by
+                        // NULL.
+                        uuid_t gfid;
+                        uint32_t k_s = 0, v_s = 0;
+                        char *t_b= NULL;
+                        uint32_t num=0;
+
+                        uuid_parse(ri->rec.gfid, gfid);
+
+                        nsr_worker_log(this->name, GF_LOG_INFO,
+                                       "doing getattr for gfid %s \n",
+                                       ri->rec.gfid);
+
+                        src_obj = glfs_h_create_from_handle(ctx->fs, gfid,
+                                                            GFAPI_HANDLE_LENGTH, NULL);
+                        if (src_obj == NULL) {
+                                GF_ASSERT(src_obj != NULL);
+                                nsr_worker_log(this->name, GF_LOG_ERROR,
+                                               "creating of handle failed\n");
+                                goto read_out;
+                        }
+
+                        if (src_obj->inode->ia_type == IA_IFDIR)
+                                src_fd = glfs_h_opendir_with_xdata(ctx->fs, src_obj, dict);
+                        else
+                                src_fd = glfs_h_open_with_xdata(ctx->fs, src_obj, O_RDONLY, dict);
+
+                        if (src_fd == NULL) {
+                                GF_ASSERT(src_fd != NULL);
+                                nsr_worker_log(this->name, GF_LOG_ERROR,
+                                               "opening of file failed\n");
+                                goto read_out;
+                        }
+
+                        if(get_xattr_total_size(src_fd, &t_b, &k_s, &v_s, &num, dict) == -1) {
+                                free(t_b);
+                                GF_ASSERT(0);
+                                nsr_worker_log(this->name, GF_LOG_ERROR,
+                                               "list of xattr of gfid %s failed\n", rd->gfid);
+                                goto read_out;
+                        }
+                        ri->work.data = GF_CALLOC((k_s + v_s) , sizeof(char), gf_mt_recon_private_t);
+                        if (!ri->work.data) {
+                                free(t_b);
+                                GF_ASSERT(0);
+                                nsr_worker_log(this->name, GF_LOG_ERROR,
+                                               "failed allocating xattr buffer\n");
+                                goto read_out;
+                        }
+                        get_xattr(src_fd, t_b, ri->work.data, v_s, num, dict);
+                        ri->work.num = num;
+                        nsr_worker_log(this->name, GF_LOG_INFO,
+                                       "finished getattr for gfid %s \n",
+                                       ri->rec.gfid);
+                        free(t_b);
+
+                } else if ((rd->op == GF_FOP_FSETATTR) ||
+                           (rd->op == GF_FOP_SETATTR)) {
+
+                        //TBD - to get the actual attributes and fill
+                        // mode, uid, gid, size, atime, mtime
+                }
+                // Every other opcode (FTRUNCATE, SYMLINK, RMDIR, UNLINK,
+                // MKNOD, CREATE, LINK, MKDIR, RENAME) reads nothing here.
+read_out:
+                // Closing follows apply_record(), which also uses
+                // glfs_close_with_xdata() for fds obtained through opendir.
+                if (src_fd)
+                        glfs_close_with_xdata(src_fd, dict);
+                if (src_obj)
+                        glfs_h_close(src_obj);
+                if (dict)
+                        dict_unref (dict);
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "finished recon read for gfid %s at offset %d for %d bytes \n",
+                               rd->gfid, rd->offset, rd->len);
+                break;
+        }
+        case NSR_WORK_ID_SINGLE_RECONCILIATION_COMMIT:
+        {
+                dict_t *dict = NULL;
+
+                // first_index always starts with 1 but records starts at 0.
+                wip = work->index - (dr->workers[0].recon_info->first_index);
+                ri = &(dr->workers[0].recon_info->records[wip]);
+                rd = &(ri->rec);
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "got recon commit for index %d that has gfid %s \n",
+                               wip, rd->gfid);
+                dict = dict_new ();
+                if (!dict) {
+                        GF_ASSERT(0);
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "failed allocating for dictionary\n");
+                        goto commit_out;
+                }
+                if (dict_set_int32(dict,RECON_TERM_XATTR,ri->work.term)) {
+                        GF_ASSERT(0);
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "error setting term in dict\n");
+                        goto commit_out;
+                }
+                if (dict_set_int32(dict,RECON_INDEX_XATTR,ri->work.index)) {
+                        GF_ASSERT(0);
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "error setting index in dict\n");
+                        goto commit_out;
+                }
+                apply_record(ctx, ri, dict);
+commit_out:
+                if (dict)
+                        dict_unref (dict);
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "finished recon commit for gfid %s \n",
+                               rd->gfid);
+                break;
+        }
+        case NSR_WORK_ID_SINGLE_RECONCILIATION_FLUSH:
+        {
+                dict_t *dict = NULL;
+
+                // Locate the record first: the dictionary setup below reads
+                // ri->work, so ri must be valid before it is used.
+                wip = work->index - (dr->workers[0].recon_info->first_index);
+                ri = &(dr->workers[0].recon_info->records[wip]);
+                rd = &(ri->rec);
+
+                dict = dict_new ();
+                if (!dict) {
+                        GF_ASSERT(0);
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "failed allocating for dictionary\n");
+                        goto flush_out;
+                }
+                if (dict_set_int32(dict,RECON_TERM_XATTR,ri->work.term)) {
+                        GF_ASSERT(0);
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "error setting term in dict\n");
+                        goto flush_out;
+                }
+                if (dict_set_int32(dict,RECON_INDEX_XATTR,ri->work.index)) {
+                        GF_ASSERT(0);
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "error setting index in dict\n");
+                        goto flush_out;
+                }
+
+                //fd = glfs_open(ctx->fs, rd->gfid, O_RDONLY); //TBD - using gfid
+
+                // Until the open above is implemented there is no fd to
+                // sync; do not hand a NULL fd to the fsync call.
+                if (fd == NULL) {
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "no open fd to flush for index %d\n", wip);
+                        goto flush_out;
+                }
+
+                glfs_fsync_with_xdata(fd, dict);
+flush_out:
+                if (dict)
+                        dict_unref (dict);
+                break;
+        }
+        default:
+                // Request ids that are not data work are ignored.
+                break;
+        }
+        return;
+}
+
+/*
+ * Thread body for a data worker.
+ *
+ * Initialises the worker, then loops forever: wait until the work queue is
+ * non-empty, detach the first item, run it through data_worker_func() and
+ * only then decrement the driver's outstanding counter.
+ *
+ * The item is unlinked while ctx->mutex is still held, because
+ * recon_queue_to_worker() appends to the same list under that mutex.  The
+ * counter is dropped last so the driver, which spins on it in
+ * send_and_wait(), cannot queue new work while this thread still touches
+ * the finished item.
+ *
+ * Never returns in practice; the trailing return only satisfies the
+ * thread-function signature.
+ */
+static void *
+data_worker_main(nsr_per_node_worker_t *ctx)
+{
+        nsr_worker_log(this->name, GF_LOG_INFO,
+                       "starting data worker func\n");
+        init_worker(ctx, 0);
+
+        while(1) {
+                nsr_recon_work_t *work = NULL;
+                nsr_recon_driver_ctx_t *dr = ctx->driver_ctx;
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "waiting for work\n");
+
+                pthread_mutex_lock(&(ctx->mutex));
+                while (list_empty(&(ctx->head.list))) {
+                        pthread_cond_wait(&(ctx->cv), &(ctx->mutex));
+                }
+                // Take ownership of the head item before releasing the lock.
+                work = list_entry(ctx->head.list.next, nsr_recon_work_t, list);
+                nsr_worker_log(this->name, GF_LOG_INFO,"deleting work item\n");
+                list_del_init (&work->list);
+                nsr_worker_log(this->name, GF_LOG_INFO,"finished deleting work item\n");
+                pthread_mutex_unlock(&(ctx->mutex));
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "got work with id %d\n",work->req_id);
+                work->in_use = _gf_false;
+                data_worker_func(ctx, work);
+
+                // The item was allocated by recon_make_work() and nothing
+                // refers to it once it has been processed.
+                GF_FREE(work);
+
+                atomic_dec(&(dr->outstanding));
+        }
+
+        return NULL;
+}
+
+
+/*
+ * Build a work item.
+ *
+ * Allocates a zeroed nsr_recon_work_t, stores it in *work and fills in the
+ * request id, the index and an empty list link; the item is marked in use.
+ *
+ * TBD - change this to get from a static pool.
+ * NOTE: the allocation result is not checked here, so callers rely on the
+ * allocation succeeding.
+ */
+static void
+recon_make_work(nsr_recon_work_t **work,
+                nsr_recon_work_req_id_t req_id,
+                int32_t i)
+{
+        nsr_recon_work_t *item = NULL;
+
+        item = GF_CALLOC (1, sizeof (nsr_recon_work_t), gf_mt_recon_private_t);
+        *work = item;
+
+        item->req_id = req_id;
+        item->index = i;
+        item->in_use = _gf_true;
+        INIT_LIST_HEAD(&(item->list));
+}
+
+/*
+ * Hand a work object to one worker thread.
+ *
+ * 'type' selects the control or the data worker of replica 'id'.  The item
+ * is appended to that worker's queue and the worker's condition variable is
+ * signalled, both while holding the worker's mutex.
+ */
+static void
+recon_queue_to_worker(nsr_recon_driver_ctx_t *ctx,
+                      nsr_recon_work_t *work,
+                      unsigned int id,
+                      nsr_recon_queue_type_t type)
+{
+        nsr_per_node_worker_t *target;
+
+        if (type != NSR_RECON_QUEUE_TO_CONTROL) {
+                target = ctx->workers[id].data_worker;
+                nsr_driver_log(this->name, GF_LOG_INFO,
+                               "queueing work to data index %d\n",id);
+        } else {
+                target = ctx->workers[id].control_worker;
+                nsr_driver_log(this->name, GF_LOG_INFO,
+                               "queueing work to control index %d\n",id);
+        }
+
+        pthread_mutex_lock(&target->mutex);
+        list_add_tail(&work->list, &target->head.list);
+        pthread_cond_signal(&target->cv);
+        pthread_mutex_unlock(&target->mutex);
+}
+
+typedef void * (*F_t) (void *);
+
+/*
+ * Initialise 'num' consecutive per-node workers starting at 'w' and, in
+ * case mode is set to NSR_USE_THREADS, create one thread per worker.
+ *
+ * priv            - translator private data (used for logging)
+ * ctx             - driver context stored in every worker
+ * w               - first element of the worker array
+ * control_or_data - _gf_true for control workers, _gf_false for data
+ *                   workers; only used to word the error message
+ * f               - thread entry point, called with the worker as argument
+ * num             - number of workers in the array
+ *
+ * Returns _gf_true on success, _gf_false on the first allocation or thread
+ * creation failure (workers set up before the failure are left as they are).
+ */
+static gf_boolean_t
+create_worker_threads(nsr_recon_private_t *priv,
+                      nsr_recon_driver_ctx_t *ctx,
+                      nsr_per_node_worker_t *w,
+                      gf_boolean_t control_or_data,
+                      F_t f,
+                      uint32_t num)
+{
+        enum { worker_id_len = 10 };    /* size of the "recon_<n>" buffer */
+        uint32_t i;
+        nsr_per_node_worker_t *worker = w;
+
+
+        for (i=0; i < num; i++) {
+                worker->id = GF_CALLOC(1, worker_id_len, gf_mt_recon_private_t);
+                if (!worker->id) {
+                        nsr_driver_log (priv->this->name, GF_LOG_ERROR, "memory allocation error \n");
+                        return _gf_false;
+                }
+                // Bounded formatting: never write past the id buffer.
+                snprintf(worker->id, worker_id_len, "recon_%u", i);
+
+                // Fill in every field the thread may read before it is
+                // started; the new thread can run immediately.
+                worker->driver_ctx = ctx ;
+                worker->index = i;
+
+                if (ctx->mode == NSR_USE_THREADS) {
+                        if (pthread_create(&worker->thread_id, NULL, f, worker)) {
+                                nsr_driver_log (ctx->this->name, GF_LOG_ERROR,
+                                                "%s work thread creation error \n",
+                                                control_or_data ? "control" : "data");
+                                return _gf_false;
+                        }
+                }
+                worker++;
+        }
+        return _gf_true;
+}
+
+/*
+ * Dispatch one request to a set of replicas and return once all of them
+ * have handled it.
+ *
+ * In NSR_SEQ mode the worker function is called directly on this thread;
+ * otherwise a work item is queued to each selected worker thread and this
+ * function spins (yielding) until ctx->outstanding drops to zero.
+ *
+ * Input arguments:
+ * bm - bitmap containing indices of nodes we want to send work
+ * num - number of such indices
+ * ctx - driver context from where we derive per worker context
+ * id - request ID
+ * q - control or data
+ * misc - used to overload such as index.
+ *
+ * Only replicas that are both selected in bm and marked in_use get work.
+ */
+static void
+send_and_wait(int32_t bm,
+              uint32_t num,
+              nsr_recon_driver_ctx_t *ctx,
+              nsr_recon_work_req_id_t id,
+              nsr_recon_queue_type_t q,
+              int32_t misc)
+{
+        uint32_t idx = 0;
+        nsr_recon_work_t *item;
+
+        if (ctx->mode == NSR_SEQ) {
+                // Sequential mode: run each request inline.
+                for (idx = 0; idx < num; idx++) {
+                        if (!(bm & (1 << idx)) || !ctx->workers[idx].in_use)
+                                continue;
+
+                        recon_make_work(&item, id, misc);
+                        if (q != NSR_RECON_QUEUE_TO_CONTROL) {
+                                data_worker_func(ctx->workers[idx].data_worker, item);
+                        } else if (idx == 0) {
+                                control_worker_func_0(ctx->workers[0].control_worker, item);
+                        } else {
+                                control_worker_func(ctx->workers[idx].control_worker, item);
+                        }
+                }
+        } else {
+                // Threaded mode: queue the requests, then wait for the
+                // workers to drain the outstanding counter.
+                for (idx = 0; idx < num; idx++) {
+                        if (!(bm & (1 << idx)) || !ctx->workers[idx].in_use)
+                                continue;
+
+                        recon_make_work(&item, id, misc);
+                        atomic_inc(&(ctx->outstanding));
+                        recon_queue_to_worker(ctx, item, idx, q);
+                }
+
+                nsr_driver_log(this->name, GF_LOG_INFO, "send_and_wait: waiting\n");
+                while (ctx->outstanding) {
+                        pthread_yield();
+                }
+        }
+
+        nsr_driver_log(this->name, GF_LOG_INFO, "send_and_wait: all workers have returned\n");
+        return;
+}
+
+#if 0
+/*
+ * Disabled variant of send_and_wait(): queue the request to every selected,
+ * in-use replica and return without waiting for completion.
+ */
+static void
+send_and_do_not_wait(int32_t bm,
+                     uint32_t num,
+                     nsr_recon_driver_ctx_t *ctx,
+                     nsr_recon_work_req_id_t id,
+                     nsr_recon_queue_type_t q,
+                     int32_t misc)
+{
+        uint32_t idx;
+
+        for (idx = 0; idx < num; idx++) {
+                nsr_recon_work_t *item;
+
+                if (!(bm & (1 << idx)) || !ctx->workers[idx].in_use)
+                        continue;
+
+                recon_make_work(&item, id, misc);
+                recon_queue_to_worker(ctx, item, idx, q);
+        }
+}
+#endif
+
+/*
+ * Record whether replica 'i' is in use and, when that flag actually
+ * changes, send INI (now in use) or FINI (no longer in use) to both the
+ * control worker and the data worker of that replica.
+ */
+static void
+nsr_recon_in_use(nsr_recon_driver_ctx_t *ctx,
+                 uint32_t i,
+                 gf_boolean_t in_use)
+{
+        uint32_t bm = 1 << i;
+        gf_boolean_t send;
+
+        // A message is only needed on a real transition of the flag.
+        if (in_use == _gf_false) {
+                send = (ctx->workers[i].in_use == _gf_true) ? _gf_true : _gf_false;
+                ctx->workers[i].in_use = _gf_false;
+        } else {
+                send = (ctx->workers[i].in_use != _gf_true) ? _gf_true : _gf_false;
+                ctx->workers[i].in_use = _gf_true;
+        }
+#if 1
+        if (send == _gf_true) {
+                nsr_recon_work_req_id_t req;
+
+                if (in_use == _gf_true) {
+                        req = NSR_WORK_ID_INI;
+                        nsr_driver_log(this->name, GF_LOG_INFO, "sending INI to index %d\n",i);
+                } else {
+                        req = NSR_WORK_ID_FINI;
+                        nsr_driver_log(this->name, GF_LOG_INFO, "sending FINI to index %d\n",i);
+                }
+                send_and_wait(bm, ctx->replica_group_size, ctx, req,
+                              NSR_RECON_QUEUE_TO_CONTROL, -1);
+                send_and_wait(bm, ctx->replica_group_size, ctx, req,
+                              NSR_RECON_QUEUE_TO_DATA, -1);
+        }
+#endif
+}
+
+/*
+ * Main recon driver thread.
+ *
+ * arg is the translator's nsr_recon_private_t.  The function allocates the
+ * driver context and the per-replica control/data workers, creates the
+ * worker threads, and then loops forever: it sleeps until ctx->state is
+ * non-zero and acts according to that state.
+ *
+ *   leader        - gather last-term info from every member, choose the
+ *                   reconciliator (latest term, then most committed ops,
+ *                   then lowest index), have it reconcile, resolve the
+ *                   local node against it if it is remote, then ask the
+ *                   remaining nodes to resolve.
+ *   reconciliator - fetch the reconciliation window from the nodes that saw
+ *                   the same last term and fix every local record.
+ *   resolutor     - walk term by term up to the reconciliator's last term,
+ *                   fetching and applying its records locally.
+ *
+ * When the leader itself has to act as reconciliator or resolutor it saves
+ * a jmp_buf in ctx->env, jumps into the matching section and comes back
+ * with longjmp().  Non-leaders (ctx->env == NULL) fall through to 'out',
+ * which reports completion and resets the state to 0.
+ *
+ * Returns NULL, and only on a setup failure; otherwise it never returns.
+ */
+void *
+nsr_reconciliation_driver(void *arg)
+{
+        nsr_recon_private_t *priv = (nsr_recon_private_t *) arg;
+        uint32_t replica_group_size = priv->replica_group_size;
+        uint32_t i;
+        nsr_per_node_worker_t *control_s, *data_s;
+        nsr_recon_driver_ctx_t **driver_ctx, *ctx;
+        int32_t bm;
+        xlator_t *this = priv->this;
+
+        driver_ctx = &priv->driver_thread_context;
+        (*driver_ctx) = GF_CALLOC (1,
+                                   sizeof (nsr_recon_driver_ctx_t),
+                                   gf_mt_recon_private_t);
+        // driver_ctx is the address of a field and is never NULL; the
+        // allocation result is what has to be checked.
+        if (!(*driver_ctx)) {
+                gf_log (this->name, GF_LOG_ERROR, "memory allocation error \n");
+                return NULL;
+        }
+        ctx = *driver_ctx;
+        ctx->this = priv->this;
+        ctx->replica_group_size = replica_group_size;
+        if ((pthread_mutex_init(&(ctx->mutex), NULL)) ||
+            (pthread_cond_init(&(ctx->cv), NULL))){
+                nsr_driver_log (this->name, GF_LOG_ERROR, "mutex init error \n");
+                return NULL;
+        }
+
+        ctx->workers = GF_CALLOC (replica_group_size,
+                                  sizeof(nsr_replica_worker_t),
+                                  gf_mt_recon_private_t);
+        if (!ctx->workers) {
+                nsr_driver_log (this->name, GF_LOG_ERROR, "memory allocation error \n");
+                return NULL;
+        }
+        for (i=0; i < replica_group_size; i++) {
+                // Bounded copy: a long member name is truncated instead of
+                // overrunning the name buffer.
+                snprintf(ctx->workers[i].name, sizeof(ctx->workers[i].name),
+                         "%s", priv->replica_group_members[i]);
+        }
+
+        control_s = GF_CALLOC (replica_group_size,
+                               sizeof(nsr_per_node_worker_t),
+                               gf_mt_recon_private_t);
+        if (!control_s) {
+                nsr_driver_log (this->name, GF_LOG_ERROR, "memory allocation error \n");
+                return NULL;
+        }
+
+        data_s = GF_CALLOC (replica_group_size,
+                            sizeof(nsr_per_node_worker_t),
+                            gf_mt_recon_private_t);
+        if (!data_s) {
+                nsr_driver_log (this->name, GF_LOG_ERROR, "memory allocation error \n");
+                return NULL;
+        }
+        for (i=0; i < replica_group_size; i++) {
+                ctx->workers[i].control_worker = &control_s[i];
+                ctx->workers[i].data_worker = &data_s[i];
+        }
+
+        nsr_driver_log (this->name, GF_LOG_INFO, "creating threads \n");
+        // Create the worker threads
+        // For every brick including itself there will be 2 worker threads:
+        // one for data and one for control
+        if (!create_worker_threads(priv, ctx, control_s, _gf_true,
+                                   (F_t) control_worker_main, replica_group_size) ||
+            !create_worker_threads(priv, ctx, data_s, _gf_false,
+                                   (F_t) data_worker_main, replica_group_size)) {
+                return NULL;
+        }
+
+        for (i=0; i < replica_group_size; i++) {
+                nsr_recon_get_file(priv->volname, &(ctx->workers[i]));
+        }
+
+        while (1) {
+
+                nsr_driver_log (this->name, GF_LOG_INFO, "waiting for state change \n");
+                pthread_mutex_lock(&(ctx->mutex));
+                while ((*driver_ctx)->state == 0) {
+                        pthread_cond_wait(&(ctx->cv), &(ctx->mutex));
+                }
+                pthread_mutex_unlock(&(ctx->mutex));
+
+                nsr_driver_log (this->name, GF_LOG_INFO, " state changed to %d \n", ctx->state);
+#if 0
+                for (i=0; i < replica_group_size; i++) {
+                        if (ctx->workers[i].in_use) {
+                                nsr_recon_start_work(ctx->workers[i].control_worker, _gf_true);
+                                nsr_recon_start_work(ctx->workers[i].data_worker, _gf_false);
+                        }
+                }
+#endif
+
+                if (ctx->state == leader) {
+
+                        int32_t chosen = -1;
+                        int32_t last_term = -1, last_ops = -1;
+
+                        nsr_driver_log (this->name, GF_LOG_INFO, "getting last term info from all members of this group\n");
+                        // Get last term info from all members for this group
+                        send_and_wait(-1,
+                                      replica_group_size,
+                                      ctx,
+                                      NSR_WORK_ID_GET_LAST_TERM_INFO,
+                                      NSR_RECON_QUEUE_TO_CONTROL, ctx->current_term);
+
+
+                        // compare all the info received and choose the reconciliator
+                        // First choose all with latest term
+                        for (i=0; i < replica_group_size; i++) {
+                                if (ctx->workers[i].in_use) {
+                                        if (ctx->workers[i].recon_info->last_term > last_term) {
+                                                last_term = ctx->workers[i].recon_info->last_term;
+                                        }
+                                }
+                        }
+                        // Then, among those with the latest term, find the highest ops
+                        for (i=0; i < replica_group_size; i++) {
+                                if ((ctx->workers[i].in_use) && (last_term == ctx->workers[i].recon_info->last_term)) {
+                                        if (ctx->workers[i].recon_info->commited_ops > last_ops) {
+                                                last_ops = ctx->workers[i].recon_info->commited_ops;
+                                        }
+                                }
+                        }
+                        // choose the first among the lot
+                        for (i=0; i < replica_group_size; i++) {
+                                if ((ctx->workers[i].in_use) &&
+                                    (last_term == ctx->workers[i].recon_info->last_term) &&
+                                    (last_ops == ctx->workers[i].recon_info->commited_ops)) {
+                                        chosen = i;
+                                        break;
+                                }
+                        }
+
+                        nsr_driver_log (this->name, GF_LOG_INFO, "reconciliator chosen is %d\n", chosen);
+                        ctx->reconciliator_index = chosen;
+                        GF_ASSERT(chosen != -1);
+                        if (chosen == -1) {
+                                nsr_driver_log (this->name, GF_LOG_INFO, "no reconciliatior chosen\n");
+                                goto out;
+                        }
+
+                        // send the message to reconciliator to do reconciliation with list of nodes that are part of this quorum
+                        if (chosen != 0) {
+                                nsr_driver_log (this->name, GF_LOG_INFO, "sending reconciliation work to %d\n", chosen);
+                                bm = 1 << ctx->reconciliator_index;
+                                send_and_wait(bm,
+                                              replica_group_size,
+                                              ctx,
+                                              NSR_WORK_ID_RECONCILIATOR_DO_WORK,
+                                              NSR_RECON_QUEUE_TO_CONTROL, -1);
+                                nsr_driver_log (this->name, GF_LOG_INFO, "finished reconciliation work to %d\n", chosen);
+                        } else {
+                                nsr_driver_log (this->name, GF_LOG_INFO, "local node is reconciliator. before set jmp\n");
+                                ctx->env = calloc(1,sizeof(jmp_buf));
+                                if (!ctx->env) {
+                                        nsr_driver_log (this->name, GF_LOG_ERROR, "memory allocation error \n");
+                                        goto out;
+                                }
+                                /*
+                                 * REVIEW
+                                 * Use of setjmp/longjmp in an environment
+                                 * where we already use ucontext is dangerous
+                                 * and therefore forbidden. Refactoring will
+                                 * also help with some of the rampant 80-column
+                                 * violations and indented code crawling across
+                                 * the screen, which together make this entire
+                                 * file almost unreadable.
+                                 */
+                                if (!setjmp(*(ctx->env))) {
+                                        ctx->state = reconciliator;
+                                        goto i_am_reconciliator;
+                                } else {
+                                        nsr_driver_log (this->name, GF_LOG_INFO, "long jmp return to leader\n");
+                                        free(ctx->env);
+                                        ctx->env = NULL;
+                                        ctx->state = leader;
+                                }
+                        }
+
+                        // send message to all other nodes to sync up with the reconciliator including itself if required
+                        // requires optimisation - TBD
+                        if (chosen != 0) {
+                                nsr_driver_log (this->name, GF_LOG_INFO, "local node resolution needs to be done. before set jmp\n");
+                                ctx->env = calloc(1,sizeof(jmp_buf));
+                                if (!ctx->env) {
+                                        nsr_driver_log (this->name, GF_LOG_ERROR, "memory allocation error \n");
+                                        goto out;
+                                }
+                                if (!setjmp(*(ctx->env))) {
+                                        ctx->state = resolutor;
+                                        goto i_am_resolutor;
+                                } else {
+                                        nsr_driver_log (this->name, GF_LOG_INFO, "long jmp return to leader\n");
+                                        free(ctx->env);
+                                        ctx->env = NULL;
+                                        ctx->state = leader;
+                                }
+                        }
+
+                        nsr_driver_log (this->name, GF_LOG_INFO, "sending resolution work to all nodes except this node and reconciliator\n");
+                        // Bitwise OR: clear both the reconciliator's bit and
+                        // bit 0 (this node) from the all-ones mask.
+                        bm = ~((1 << ctx->reconciliator_index) | 1);
+                        send_and_wait(bm,
+                                      replica_group_size,
+                                      ctx,
+                                      NSR_WORK_ID_RESOLUTION_DO_WORK,
+                                      NSR_RECON_QUEUE_TO_CONTROL, -1);
+
+                        nsr_driver_log (this->name, GF_LOG_INFO, "finished reconciliation work as leader \n");
+
+                }
+i_am_reconciliator:
+                if (ctx->state == reconciliator) {
+                        gf_boolean_t do_recon = _gf_false;
+                        uint32_t start_index = ctx->workers[0].recon_info->first_index;
+                        uint32_t end_index = ctx->workers[0].recon_info->last_index;
+                        uint32_t num = ((start_index == 0) && (end_index == 0)) ? 0 : (end_index - start_index + 1);
+
+                        nsr_driver_log (this->name, GF_LOG_INFO, "starting reconciliation work as reconciliator \n");
+
+                        // nothing to be done? signal back to the recon translator that this phase done.
+                        // bm starts with this node and collects every other
+                        // in-use node that has seen the same last term.
+                        bm = 1;
+                        for (i=1; i < replica_group_size; i++) {
+                                if (ctx->workers[i].in_use &&
+                                    (ctx->workers[0].recon_info->last_term == ctx->workers[i].recon_info->last_term)) {
+                                        ctx->workers[i].recon_info->last_index = end_index;
+                                        ctx->workers[i].recon_info->first_index = start_index;
+                                        bm |= (1 << i);
+                                        do_recon = _gf_true;
+                                }
+                        }
+
+                        if (!do_recon || !num) {
+                                nsr_driver_log (this->name, GF_LOG_INFO, "nothing needs to be done as resolutor \n");
+                                if (ctx->env) {
+                                        nsr_driver_log (this->name, GF_LOG_INFO, "before longjmp \n");
+                                        longjmp(*(ctx->env), 1);
+                                } else {
+                                        goto out;
+                                }
+                        }
+
+                        nsr_driver_log (this->name, GF_LOG_INFO,
+                                        "getting reconciliation window for term %d from %dto %d \n",
+                                        ctx->workers[0].recon_info->last_term,
+                                        start_index, end_index);
+                        // We have set the bm in the above for loop where
+                        // we go thru all nodes including this node that
+                        // have seen the last term.
+                        send_and_wait(bm,
+                                      replica_group_size,
+                                      ctx,
+                                      NSR_WORK_ID_GET_RECONCILATION_WINDOW,
+                                      NSR_RECON_QUEUE_TO_CONTROL, -1);
+                        nsr_driver_log (this->name, GF_LOG_INFO,
+                                        "finished getting reconciliation window for term %d from %dto %d \n",
+                                        ctx->workers[0].recon_info->last_term,
+                                        start_index, end_index);
+
+
+                        // from the changelogs, calculate the entries
+                        // that need action and the source for each of these entries
+                        compute_reconciliation_work(ctx);
+
+                        // for each of the entries that need fixup, issue IO
+                        for (i=start_index; i < (start_index + num); i++) {
+                                nsr_reconciliator_info_t *my_recon_info =
+                                        ctx->workers[0].recon_info;
+                                nsr_reconciliation_record_t *record =
+                                        &(my_recon_info->records[i - start_index]);
+
+                                record->work.term = ctx->workers[0].recon_info->last_term;
+                                record->work.index = i;
+
+                                nsr_driver_log (this->name, GF_LOG_INFO,
+                                                "fixing index %d\n",i);
+                                if ((record->work.type == NSR_RECON_WORK_HOLE_TO_PSEUDO_HOLE) ||
+                                    (record->work.type == NSR_RECON_WORK_HOLE_TO_FILL)) {
+                                        // 1st case (RECON_WORK_HOLE_TO_PSEUDO_HOLE):
+                                        // If there are only pseudo_holes in others, it is best effort.
+                                        // Just pick from the first node that has it and proceed.
+                                        // 2nd case (RECON_WORK_HOLE_TO_FILL):
+                                        // this node has either a HOLE or PSEUDO_HOLE and some one else has a FILL(source).
+                                        // analyse the changelog to check if data needs to be read or if the log has all the data required
+
+                                        if (recon_check_changelog(&record->rec)) {
+                                                bm = (1 << record->work.source);
+                                                nsr_driver_log (this->name, GF_LOG_INFO,
+                                                                "reading data from source %d\n",record->work.source);
+                                                send_and_wait(bm,
+                                                              replica_group_size,
+                                                              ctx,
+                                                              NSR_WORK_ID_SINGLE_RECONCILIATION_READ,
+                                                              NSR_RECON_QUEUE_TO_DATA,
+                                                              i);
+                                                nsr_driver_log (this->name, GF_LOG_INFO,
+                                                                "got data from source %d\n",record->work.source);
+                                        }
+
+                                        nsr_driver_log (this->name, GF_LOG_INFO,
+                                                        "fixing local data as part of reconciliation\n");
+
+                                        bm = 1;
+                                        send_and_wait(bm,
+                                                      replica_group_size,
+                                                      ctx,
+                                                      NSR_WORK_ID_SINGLE_RECONCILIATION_COMMIT,
+                                                      NSR_RECON_QUEUE_TO_DATA,
+                                                      i);
+
+                                        nsr_driver_log (this->name, GF_LOG_INFO,
+                                                        "finished fixing local data as part of reconciliation\n");
+
+                                } else if (record->work.type == NSR_RECON_WORK_COMPARE_PSEUDO_HOLE) {
+                                        // this node has a pseudo_hole and some others have just that too. Just convert this to FILL.
+                                        // let others blindly pick it from here.
+                                        nsr_driver_log (this->name, GF_LOG_INFO,
+                                                        "fixing this record as a fill\n");
+                                        bm = 1;
+                                        send_and_wait(bm,
+                                                      replica_group_size,
+                                                      ctx,
+                                                      NSR_WORK_ID_SINGLE_RECONCILIATION_FLUSH,
+                                                      NSR_RECON_QUEUE_TO_DATA,
+                                                      i);
+                                        nsr_driver_log (this->name, GF_LOG_INFO,
+                                                        "finished fixing this record as a fill\n");
+                                }
+                        }
+
+                        nsr_driver_log (this->name, GF_LOG_INFO, "finished reconciliation work as reconciliator \n");
+
+                        if (ctx->env) {
+                                nsr_driver_log (this->name, GF_LOG_INFO, "before longjmp \n");
+                                longjmp(*(ctx->env), 1);
+                        }
+
+                        // tbd - mark this term golden in the reconciliator
+
+                }
+i_am_resolutor:
+                if (ctx->state == resolutor) {
+
+                        // This node's last term is filled when it gets a message
+                        // from the leader to act as a reconciliator.
+                        uint32_t recon_index = ctx->reconciliator_index;
+                        nsr_reconciliator_info_t *my_info =
+                                ctx->workers[0].recon_info;
+                        nsr_reconciliator_info_t *his_info =
+                                ctx->workers[recon_index].recon_info;
+                        uint32_t my_last_term = my_info->last_term;
+                        uint32_t to_do_term = his_info->last_term;
+                        uint32_t my_start_index = 1, my_end_index = 1;
+                        uint32_t his_start_index = 1, his_end_index = 1;
+                        uint32_t num = 0;
+                        // fl is true only for the first pass, i.e. the last
+                        // term this node has seen itself.
+                        gf_boolean_t fl = _gf_true;
+
+                        nsr_driver_log (this->name, GF_LOG_INFO,
+                                        "starting resolutor work with reconciliator as %d from term %d to term %d \n",
+                                        recon_index, my_last_term, to_do_term);
+
+                        do {
+
+                                if (!fl) {
+                                        (his_info->last_term)++;
+                                        (my_info->last_term)++;
+                                } else {
+                                        his_info->last_term = my_last_term;
+                                }
+
+                                nsr_driver_log (this->name, GF_LOG_INFO, "resolving term %d \n", my_info->last_term);
+
+                                // Get reconciliator's term information for that term
+                                nsr_driver_log (this->name, GF_LOG_INFO,
+                                                "getting info from reconciliator for term %d \n", my_info->last_term);
+                                bm = (1 << recon_index);
+                                send_and_wait(bm,
+                                              replica_group_size,
+                                              ctx,
+                                              NSR_WORK_ID_GET_GIVEN_TERM_INFO,
+                                              NSR_RECON_QUEUE_TO_CONTROL, his_info->last_term);
+                                nsr_driver_log (this->name, GF_LOG_INFO,
+                                                "finished getting info from reconciliator for term %d \n", my_info->last_term);
+
+
+                                // empty term
+                                if (!his_info->commited_ops) {
+                                        nsr_driver_log (this->name, GF_LOG_INFO,
+                                                        "reconciliator for term %d is empty. moving to next term. \n", my_info->last_term);
+                                        // TBD - mark the term golden
+                                        fl = _gf_false;
+                                        continue;
+                                }
+
+                                // calculate the resolution window boundary.
+                                // for the last term this node saw, we compare the resolution window of this and reconciliator.
+                                // for the rest of the nodes, we just accept the reconciliator info.
+                                if (fl) {
+                                        my_start_index = my_info->first_index;
+                                        my_end_index = my_info->last_index;
+                                        his_start_index = his_info->first_index;
+                                        his_end_index = his_info->last_index;
+                                        my_info->first_index = (my_start_index < his_start_index) ? my_start_index : his_start_index;
+                                        my_info->last_index = (my_end_index > his_end_index) ? my_end_index : his_end_index;
+                                } else {
+                                        my_info->first_index = his_info->first_index;
+                                        my_info->last_index = his_info->last_index;
+                                        my_info->commited_ops = his_info->commited_ops;
+                                }
+                                if (my_info->first_index == 0)
+                                        my_info->first_index = 1;
+                                num = (my_info->last_index - my_info->first_index) + 1;
+
+
+                                // Get the logs from the reconciliator (and this node for this term)
+                                if (fl)
+                                        bm = ((1 << recon_index) | 1);
+                                else
+                                        bm = (1 << recon_index);
+
+                                nsr_driver_log (this->name, GF_LOG_INFO,
+                                                "getting reconciliation window for term %d from %d to %d \n",
+                                                my_info->last_term,
+                                                my_info->first_index, my_info->last_index);
+                                send_and_wait(bm,
+                                              replica_group_size,
+                                              ctx,
+                                              NSR_WORK_ID_GET_RECONCILATION_WINDOW,
+                                              NSR_RECON_QUEUE_TO_CONTROL, -1);
+                                nsr_driver_log (this->name, GF_LOG_INFO,
+                                                "finished getting reconciliation window for term %d from %d to %d \n",
+                                                my_info->last_term,
+                                                my_info->first_index, my_info->last_index);
+
+                                // from the changelogs, calculate the entries that need action
+                                compute_resolution_work(ctx, my_info, his_info, !fl);
+
+
+                                // for each of the entries that need fixup, issue IO
+                                for (i=my_info->first_index; i < (my_info->first_index + num); i++) {
+                                        nsr_reconciliation_record_t *record =
+                                                &(my_info->records[i - my_info->first_index]);
+
+                                        record->work.term = my_info->last_term;
+                                        record->work.index = i;
+
+                                        nsr_driver_log (this->name, GF_LOG_INFO, "fixing index %d\n",i);
+                                        if ((record->work.type == NSR_RECON_WORK_HOLE_TO_FILL) ||
+                                            (record->work.type == NSR_RECON_WORK_UNDO_FILL)) {
+                                                if (((record->work.type == NSR_RECON_WORK_HOLE_TO_FILL) &&
+                                                     recon_check_changelog(&record->rec)) ||
+                                                    ((record->work.type == NSR_RECON_WORK_UNDO_FILL) &&
+                                                     recon_compute_undo(&record->rec))) {
+                                                        nsr_driver_log (this->name, GF_LOG_INFO,
+                                                                        "reading data from source %d\n",recon_index);
+                                                        bm = (1 << recon_index);
+                                                        send_and_wait(bm,
+                                                                      replica_group_size,
+                                                                      ctx,
+                                                                      NSR_WORK_ID_SINGLE_RECONCILIATION_READ,
+                                                                      NSR_RECON_QUEUE_TO_DATA,
+                                                                      i);
+                                                        nsr_driver_log (this->name, GF_LOG_INFO,
+                                                                        "finished reading data from source %d\n",recon_index);
+                                                }
+
+                                                nsr_driver_log (this->name, GF_LOG_INFO,
+                                                                "fixing local data as part of resolutor\n");
+
+                                                bm = 1;
+                                                send_and_wait(bm,
+                                                              replica_group_size,
+                                                              ctx,
+                                                              NSR_WORK_ID_SINGLE_RECONCILIATION_COMMIT,
+                                                              NSR_RECON_QUEUE_TO_DATA,
+                                                              i);
+
+                                                nsr_driver_log (this->name, GF_LOG_INFO,
+                                                                "finished fixing local data as part of resolutor\n");
+                                        }
+                                }
+                                fl = _gf_false;
+
+                                // tbd - mark this term golden in the reconciliator
+                        } while (my_last_term++ != to_do_term);
+
+                        nsr_driver_log (this->name, GF_LOG_INFO,
+                                        "finished resolutor work \n");
+
+                        if (ctx->env) {
+                                nsr_driver_log (this->name, GF_LOG_INFO, "before longjmp \n");
+                                longjmp(*(ctx->env), 1);
+                        }
+
+                }
+
+                // free the associated recon_info contexts created as part of this role
+
+out:
+                nsr_driver_log (this->name, GF_LOG_INFO,
+                                "sending end of reconciliation message \n");
+                nsr_recon_return_back(priv, ctx->txn_id);
+#if 0
+                // send message that job is done by writing to local recon translator
+                bm = 1;
+                send_and_wait(bm,
+                              replica_group_size,
+                              ctx,
+                              NSR_WORK_ID_END_RECONCILIATION,
+                              NSR_RECON_QUEUE_TO_CONTROL, -1);
+#endif
+                nsr_driver_log (this->name, GF_LOG_INFO,
+                                "finished sending end of reconciliation message \n");
+                ctx->state = 0;
+        }
+
+        return NULL;
+}
diff --git a/xlators/cluster/nsr-recon/src/recon_driver.h b/xlators/cluster/nsr-recon/src/recon_driver.h
new file mode 100644
index 000000000..67f4d6014
--- /dev/null
+++ b/xlators/cluster/nsr-recon/src/recon_driver.h
@@ -0,0 +1,308 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __RECON_DRIVER_H__
+#define __RECON_DRIVER_H__
+
+
+#include "api/src/glfs.h"
+#include <setjmp.h>
+
+/* Size of the name[] buffer in each nsr_recon_role_t info slot. */
+#define MAX_HOSTNAME_LEN 32
+/* Number of info[] slots in nsr_recon_role_t. */
+#define MAXIMUM_REPLICA_STRENGTH 8
+/* Number of records[] slots in nsr_reconciliator_info_t. */
+#define MAX_RECONCILIATION_WINDOW_SIZE 10000
+
+/*
+ * glusterd path components.
+ * NOTE(review): presumably used to locate per-volume brick files; the users
+ * are not visible from this header -- confirm against recon_driver.c.
+ */
+#define GLUSTERD_DEFAULT_WORKDIR "/var/lib/glusterd"
+#define GLUSTERD_VOLUME_DIR_PREFIX "vols"
+#define GLUSTERD_BRICK_INFO_DIR "bricks"
+
+/*
+ * Even with the names fixed, the non-NSR_DEBUG definitions of nsr_*_log don't
+ * work because many callers don't have "this" defined.
+ *
+ * TBD: use gf_log, fix "this" problem, eliminate extra fields and newlines.
+ */
+#define NSR_DEBUG
+
+/*
+ * Work request identifiers (for example the fourth argument handed to
+ * send_and_wait()).  The values are consecutive, starting at 0; the numeric
+ * value of each enumerator is noted alongside it.
+ */
+typedef enum nsr_recon_work_req_id_t {
+        NSR_WORK_ID_GET_NONE = 0,                 /*  0 */
+        NSR_WORK_ID_GET_LAST_TERM_INFO,           /*  1 */
+        NSR_WORK_ID_GET_GIVEN_TERM_INFO,          /*  2 */
+        NSR_WORK_ID_RECONCILIATOR_DO_WORK,        /*  3 */
+        NSR_WORK_ID_RESOLUTION_DO_WORK,           /*  4 */
+        NSR_WORK_ID_GET_RECONCILATION_WINDOW,     /*  5 */
+        NSR_WORK_ID_SINGLE_RECONCILIATION_READ,   /*  6 */
+        NSR_WORK_ID_SINGLE_RECONCILIATION_COMMIT, /*  7 */
+        NSR_WORK_ID_SINGLE_RECONCILIATION_FLUSH,  /*  8 */
+        NSR_WORK_ID_GET_RESOLUTION_WINDOW,        /*  9 */
+        NSR_WORK_ID_END_RECONCILIATION,           /* 10 */
+        NSR_WORK_ID_INI,                          /* 11 */
+        NSR_WORK_ID_FINI                          /* 12 */
+} nsr_recon_work_req_id_t;
+
+/* Selects which of a replica's two workers (control or data) gets a request. */
+typedef enum nsr_recon_queue_type_t {
+        NSR_RECON_QUEUE_TO_CONTROL = 0,
+        NSR_RECON_QUEUE_TO_DATA          /* 1 */
+} nsr_recon_queue_type_t;
+
+/*
+ * Classification of one changelog record as reported to the reconciliator.
+ * The values are bit patterns (0, 1, 3).  They are spelled in hex because
+ * "0b" binary literals are a compiler extension before C23.
+ */
+typedef enum nsr_log_type_t {
+        NSR_LOG_HOLE        = 0x0,
+        NSR_LOG_PSEUDO_HOLE = 0x1,
+        NSR_LOG_FILL        = 0x3
+} nsr_log_type_t;
+
+/* Driver execution mode; stored in nsr_recon_driver_ctx_t.mode. */
+typedef enum nsr_mode_t {
+        NSR_SEQ = 0,
+        NSR_USE_THREADS,        /* 1 */
+        NSR_ASYNC               /* 2 */
+} nsr_mode_t;
+
+/*
+ * Kind of work computed for a single record (stored in
+ * nsr_reconciliation_work_t.type).  Values are consecutive from 0.
+ */
+typedef enum nsr_recon_work_type_t {
+        NSR_RECON_WORK_NONE = 0,
+        NSR_RECON_WORK_HOLE_TO_NOOP,            /* 1 */
+        NSR_RECON_WORK_HOLE_TO_PSEUDO_HOLE,     /* 2 */
+        NSR_RECON_WORK_COMPARE_PSEUDO_HOLE,     /* 3 */
+        NSR_RECON_WORK_HOLE_TO_FILL,            /* 4 */
+        NSR_RECON_WORK_UNDO_FILL,               /* 5 */
+} nsr_recon_work_type_t;
+
+/*
+ * Role the driver is currently playing; also the set of values accepted in
+ * nsr_recon_role_t.role.  Values are consecutive from 0.
+ */
+typedef enum nsr_recon_driver_state_t {
+        none = 0,
+        leader,                 /* 1 */
+        reconciliator,          /* 2 */
+        resolutor,              /* 3 */
+} nsr_recon_driver_state_t;
+
+// Role structure.
+// Byte-packed (pack(1)); nsr_recon_writev() copies it straight out of the
+// write payload and converts it with ENDIAN_CONVERSION_RR, so the field
+// order and sizes are part of the on-wire format.
+#pragma pack(push, 1)
+typedef struct _nsr_recon_role_s {
+ uint32_t role; // leader, reconciliator, resolutor
+ uint32_t num; // number of valid entries in info[]; see per-role notes below
+ uint32_t current_term; // current term used in case of leader
+ // In case this is reconciliator, num is set to nodes that were part
+ // of previous term.
+ // In case this is resolutor, num is set to 2.
+ // info[0] - information for this node.
+ // info[1] - information of the reconciliator.
+ // In case this is leader, num is set to this term's membership list
+ // set info.name to all members including the leader
+ struct {
+ int32_t last_term;
+ int32_t commited_ops;
+ uint32_t last_index;
+ uint32_t first_index;
+ char name[MAX_HOSTNAME_LEN]; // member name; not byte-swapped
+ } info[MAXIMUM_REPLICA_STRENGTH];
+} nsr_recon_role_t;
+#pragma pack(pop)
+
+/*
+ * Convert the 32-bit fields of an nsr_recon_role_t between network and host
+ * byte order, in place.
+ *
+ *   rr      - lvalue of type nsr_recon_role_t
+ *   is_true - _gf_true:  network -> host (ntohl)
+ *             _gf_false: host -> network (htonl)
+ *
+ * rr.num is converted first when decoding and last when encoding, so the
+ * loop always sees it in host order.  The loop bound is clamped to
+ * MAXIMUM_REPLICA_STRENGTH because num may come straight off the wire and
+ * must not be trusted to index info[].  rr.num itself is left unclamped so
+ * the caller can still detect and reject an out-of-range count.
+ *
+ * The expansion is a bare brace block (not do/while) to stay compatible with
+ * existing call sites.
+ * NOTE(review): rr.role is not converted here -- confirm that is intended.
+ */
+#define ENDIAN_CONVERSION_RR(rr, is_true) \
+{ \
+        uint32_t rr_idx_ = 0; \
+        uint32_t rr_cnt_ = 0; \
+        uint32_t (*rr_conv_)(uint32_t) = (((is_true) == _gf_true) ? ntohl : htonl); \
+        if ((is_true) == _gf_true) (rr).num = rr_conv_((rr).num); \
+        rr_cnt_ = ((rr).num < MAXIMUM_REPLICA_STRENGTH) ? \
+                  (rr).num : MAXIMUM_REPLICA_STRENGTH; \
+        (rr).current_term = rr_conv_((rr).current_term); \
+        for (rr_idx_ = 0; rr_idx_ < rr_cnt_; rr_idx_++) { \
+                (rr).info[rr_idx_].last_term = rr_conv_((rr).info[rr_idx_].last_term); \
+                (rr).info[rr_idx_].commited_ops = rr_conv_((rr).info[rr_idx_].commited_ops); \
+                (rr).info[rr_idx_].last_index = rr_conv_((rr).info[rr_idx_].last_index); \
+                (rr).info[rr_idx_].first_index = rr_conv_((rr).info[rr_idx_].first_index); \
+        } \
+        if ((is_true) == _gf_false) (rr).num = rr_conv_((rr).num); \
+}
+
+// Last term info structure.
+// Byte-packed; nsr_recon_readv() copies it into the reply buffer after
+// ENDIAN_CONVERSION_LT, so the layout is part of the on-wire format.
+#pragma pack(push, 1)
+typedef struct _nsr_recon_last_term_info_s {
+ int32_t last_term; // term number this info describes
+ int32_t commited_ops; // last_index - first_index + 1 when the term has records, else 0
+ uint32_t last_index; // index of the last record, 0 if none
+ uint32_t first_index; // index of the first record, 0 if none
+} nsr_recon_last_term_info_t;
+#pragma pack(pop)
+
+/*
+ * Byte-swap every field of an nsr_recon_last_term_info_t lvalue in place.
+ * is_true == _gf_true selects ntohl (decode); anything else selects htonl.
+ */
+#define ENDIAN_CONVERSION_LT(lt, is_true) \
+{ \
+        uint32_t (*lt_swap_)(uint32_t) = htonl; \
+        if ((is_true) == _gf_true) \
+                lt_swap_ = ntohl; \
+        (lt).last_term = lt_swap_((lt).last_term); \
+        (lt).commited_ops = lt_swap_((lt).commited_ops); \
+        (lt).last_index = lt_swap_((lt).last_index); \
+        (lt).first_index = lt_swap_((lt).first_index); \
+}
+
+// Log information: the window of records a reader wants.
+// Byte-packed; nsr_recon_writev() copies it out of the write payload and
+// stores term/first_index/last_index in the per-fd context.
+#pragma pack(push, 1)
+typedef struct _nsr_recon_log_info_s {
+ uint32_t term; // term whose changelog is to be read
+ uint32_t first_index; // first record index of the window
+ uint32_t last_index; // last record index of the window (inclusive)
+} nsr_recon_log_info_t;
+#pragma pack(pop)
+
+/*
+ * Byte-swap every field of an nsr_recon_log_info_t lvalue in place.
+ * is_true == _gf_true selects ntohl (decode); anything else selects htonl.
+ */
+#define ENDIAN_CONVERSION_LI(li, is_true) \
+{ \
+        uint32_t (*li_swap_)(uint32_t) = htonl; \
+        if ((is_true) == _gf_true) \
+                li_swap_ = ntohl; \
+        (li).term = li_swap_((li).term); \
+        (li).first_index = li_swap_((li).first_index); \
+        (li).last_index = li_swap_((li).last_index); \
+}
+
+// One parsed changelog record as handed to the reconciliator.
+// Byte-packed; nsr_recon_libchangelog_get_records() fills an array of these
+// directly in the read reply buffer.  Only the four uint32_t fields are
+// byte-swapped (ENDIAN_CONVERSION_RD); the strings are copied as-is.
+#pragma pack(push, 1)
+typedef struct nsr_recon_record_details_s {
+ uint32_t type; // nsr_log_type_t value (hole / pseudo hole / fill)
+ uint32_t op; // fop opcode parsed from the record
+ char gfid[36+1]; // 36-character gfid string plus terminator
+ char pargfid[36+1]; // parent gfid string for entry operations and rename source
+ char link_path[256]; // should it be PATH_MAX?
+ uint32_t offset; // parsed for truncate/ftruncate/write records
+ uint32_t len; // parsed for write records
+ char entry[128]; // entry name for entry operations and rename source
+ char newloc[128]; // for rename. can you overload link_path for this? TBD
+} nsr_recon_record_details_t;
+#pragma pack(pop)
+
+/*
+ * Byte-swap the integer fields (type, op, offset, len) of an
+ * nsr_recon_record_details_t lvalue in place; string fields are untouched.
+ * is_true == _gf_true selects ntohl (decode); anything else selects htonl.
+ */
+#define ENDIAN_CONVERSION_RD(rd, is_true) \
+{ \
+        uint32_t (*rd_swap_)(uint32_t) = htonl; \
+        if ((is_true) == _gf_true) \
+                rd_swap_ = ntohl; \
+        (rd).type = rd_swap_((rd).type); \
+        (rd).op = rd_swap_((rd).op); \
+        (rd).offset = rd_swap_((rd).offset); \
+        (rd).len = rd_swap_((rd).len); \
+}
+
+/*
+ * One queued work item for a worker; linked through `list` (a worker keeps
+ * one of these as its list head, see nsr_per_node_worker_t.head).
+ */
+typedef struct _nsr_recon_work_s {
+ gf_boolean_t in_use;
+ uint32_t index;
+ uint32_t req_id; /* presumably an nsr_recon_work_req_id_t value -- TODO confirm */
+ struct list_head list;
+} nsr_recon_work_t;
+
+/* Work computed for a single record during reconciliation. */
+typedef struct _nsr_reconciliation_work_s {
+ uint32_t term;
+ uint32_t index;
+ uint32_t type; /* compared against NSR_RECON_WORK_* values by the driver */
+ uint32_t source;
+ void *data;
+
+ uint32_t num; // used for xattr
+
+} nsr_reconciliation_work_t;
+
+/* A changelog record paired with the work computed for it. */
+typedef struct _nsr_reconciliation_record_s {
+ nsr_reconciliation_work_t work; // will store the computed work
+ nsr_recon_record_details_t rec; // the record details the work was computed from
+} nsr_reconciliation_record_t;
+
+/*
+ * Per-member reconciliation state: the member's term summary plus a fixed
+ * window of MAX_RECONCILIATION_WINDOW_SIZE records.  The embedded array makes
+ * this structure large, so it is referenced by pointer from
+ * nsr_replica_worker_t.
+ */
+typedef struct _nsr_reconciliator_info {
+ uint32_t reconcilator_index;
+ int32_t last_term;
+ int32_t commited_ops;
+ uint32_t last_index;
+ uint32_t first_index;
+ nsr_reconciliation_record_t records[MAX_RECONCILIATION_WINDOW_SIZE];
+} nsr_reconciliator_info_t;
+
+/*
+ * State of one worker thread.  Each replica has two of these (control and
+ * data, see nsr_replica_worker_t).  The nsr_worker_log macro reads
+ * is_control, index and worker_log_fd from this structure.
+ */
+typedef struct _nsr_per_node_worker_s {
+ char *id; // identifier
+ char vol_file[256]; //volfile that will be used by this thread
+ glfs_t *fs;
+ glfs_fd_t *aux_fd;
+ uint32_t index; // index into array of workers
+ pthread_t thread_id; // thread id
+ void * context; // thread context
+ struct _nsr_recon_driver_ctxt *driver_ctx;
+ char local; // local data worker
+ //struct list_head list; //list of work items
+ nsr_recon_work_t head;
+ pthread_mutex_t mutex; //mutex to guard the above list
+ pthread_cond_t cv; //condition variable for signaling the worker thread
+ gf_boolean_t is_control;
+#ifdef NSR_DEBUG
+ uint32_t worker_log_fd; // debug log descriptor; 0 means "not opened yet"
+#endif
+} nsr_per_node_worker_t;
+
+/* Per-replica bundle: its control and data workers plus reconciliation info. */
+typedef struct _nsr_replica_worker_s {
+ char name[256];
+ nsr_per_node_worker_t *control_worker;
+ nsr_per_node_worker_t *data_worker;
+ gf_boolean_t in_use;
+ nsr_reconciliator_info_t *recon_info; // Bunch of infos kept for this reconciliation
+} nsr_replica_worker_t;
+
+/*
+ * Context of the reconciliation driver thread.  The nsr_driver_log macro
+ * reads driver_log_fd from a variable named `ctx` of this type.
+ */
+typedef struct _nsr_recon_driver_ctxt {
+ xlator_t *this;
+ uint32_t replica_group_size; // number of static members of replica group
+ nsr_replica_worker_t *workers; // worker info
+ int32_t reconciliator;
+ pthread_mutex_t mutex; //mutex to guard the state
+ pthread_cond_t cv; //condition variable for signaling the driver thread
+ uint32_t state; //driver state; the driver resets it to 0 when a role completes
+ volatile int32_t outstanding;
+ uint32_t reconciliator_index;
+ uint32_t txn_id; // id passed to nsr_recon_return_back() when the role completes
+ uint32_t current_term;
+ jmp_buf *env; // when non-NULL the driver longjmp()s here after resolutor work
+#ifdef NSR_DEBUG
+ uint32_t driver_log_fd; // debug log descriptor; 0 means "not opened yet"
+#endif
+ nsr_mode_t mode; // default set to seq
+} nsr_recon_driver_ctx_t;
+
+/*
+ * Driver thread entry point; has the pthread start-routine signature.  The
+ * reconciliation translator starts it with its private structure as argument.
+ */
+void *
+nsr_reconciliation_driver(void *);
+
+/*
+ * Hand a new role to the driver under transaction id txn_id.
+ * Returns _gf_false when the role could not be set; callers treat that as
+ * failure of the request that carried the role.
+ */
+gf_boolean_t
+nsr_recon_driver_set_role(nsr_recon_driver_ctx_t *ctx, nsr_recon_role_t *rr, uint32_t txn_id);
+
+/* Thin wrappers over the GCC __sync builtins. */
+/* atomic_inc / atomic_dec add +1 / -1 to *ptr and discard the old value. */
+#define atomic_inc(ptr) ((void) __sync_fetch_and_add(ptr, 1))
+#define atomic_dec(ptr) ((void) __sync_fetch_and_add(ptr, -1))
+/* atomic_fetch_and / atomic_fetch_or return the value held before the operation. */
+#define atomic_fetch_and __sync_fetch_and_and
+#define atomic_fetch_or __sync_fetch_and_or
+
+/*
+ * REVIEW
+ * Ideally, use gf_log like everyone else. Failing that, at least put the logs
+ * with all the others in /var/log instead of /tmp.
+ * NB two instances, for nsr_driver_log and nsr_worker_log
+ */
+#ifdef NSR_DEBUG
+/*
+ * Debug logger for the driver thread.
+ *
+ * Requires a variable named `ctx` in the caller's scope with a
+ * driver_log_fd member.  On first use it creates /tmp/nsr-logs/ and opens
+ * the log file; a descriptor value of 0 means "not opened yet".
+ * dom and levl are ignored in this build; fmt... is a printf-style format
+ * plus arguments.  Messages longer than the local buffer are truncated.
+ *
+ * The expansion is a bare brace block (not do/while) to stay compatible with
+ * existing call sites, so do not use it as the unbraced body of an if/else.
+ */
+#define nsr_driver_log(dom, levl, fmt...) \
+        { \
+                char nsr_dlog_buf_[255]; \
+                if (!ctx->driver_log_fd) { \
+                        mkdir("/tmp/nsr-logs/", 0777); \
+                        /* O_CREAT requires an explicit mode argument */ \
+                        ctx->driver_log_fd = open("/tmp/nsr-logs/nsr-driver-log", \
+                                                  O_RDWR|O_CREAT|O_TRUNC, 0644); \
+                } \
+                snprintf(nsr_dlog_buf_, sizeof(nsr_dlog_buf_), fmt); \
+                write(ctx->driver_log_fd, nsr_dlog_buf_, strlen(nsr_dlog_buf_)); \
+        }
+#else
+#define nsr_driver_log(dom, levl, fmt...) gf_log(dom, levl, fmt)
+#endif
+
+#ifdef NSR_DEBUG
+/*
+ * Debug logger for worker threads.
+ *
+ * Requires a variable named `ctx` in the caller's scope with is_control,
+ * index and worker_log_fd members.  On first use it creates /tmp/nsr-logs/
+ * and opens "/tmp/nsr-logs/con-<index>" or "/tmp/nsr-logs/data-<index>";
+ * a descriptor value of 0 means "not opened yet".
+ * dom and levl are ignored in this build; fmt... is a printf-style format
+ * plus arguments.  Messages longer than the local buffer are truncated.
+ *
+ * The expansion is a bare brace block (not do/while) to stay compatible with
+ * existing call sites, so do not use it as the unbraced body of an if/else.
+ */
+#define nsr_worker_log(dom, levl, fmt...) \
+        { \
+                char nsr_wlog_buf_[255]; \
+                if (!ctx->worker_log_fd) { \
+                        char nsr_wlog_path_[255]; \
+                        snprintf(nsr_wlog_path_, sizeof(nsr_wlog_path_), \
+                                 "/tmp/nsr-logs/%s-%d", \
+                                 ctx->is_control? "con" : "data",ctx->index); \
+                        mkdir("/tmp/nsr-logs/", 0777); \
+                        /* O_CREAT requires an explicit mode argument */ \
+                        ctx->worker_log_fd = open(nsr_wlog_path_, \
+                                                  O_RDWR|O_CREAT|O_TRUNC, 0644); \
+                } \
+                snprintf(nsr_wlog_buf_, sizeof(nsr_wlog_buf_), fmt); \
+                write(ctx->worker_log_fd, nsr_wlog_buf_, strlen(nsr_wlog_buf_)); \
+        }
+#else
+#define nsr_worker_log(dom, levl, fmt...) gf_log(dom, levl, fmt)
+#endif
+
+#endif /* #ifndef __RECON_DRIVER_H__ */
diff --git a/xlators/cluster/nsr-recon/src/recon_xlator.c b/xlators/cluster/nsr-recon/src/recon_xlator.c
new file mode 100644
index 000000000..62583d526
--- /dev/null
+++ b/xlators/cluster/nsr-recon/src/recon_xlator.c
@@ -0,0 +1,837 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <sys/types.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "call-stub.h"
+#include "defaults.h"
+#include "xlator.h"
+
+#include "recon_driver.h"
+#include "recon_xlator.h"
+
+/*
+ * Per-fd context, allocated in nsr_recon_open() and stored with fd_ctx_set().
+ * Writes record what the client wants (term, index window); the following
+ * read on the same fd uses those values to build its reply.
+ */
+typedef struct _nsr_recon_fd_s {
+ int32_t term; /* term set by the last write to sector 2, 3 or 4 */
+ nsr_recon_driver_state_t state;
+ uint32_t first_index; /* window start, set by a write to sector 2 */
+ uint32_t last_index; /* window end (inclusive), set by a write to sector 2 */
+ call_frame_t *frame;
+} nsr_recon_fd_t;
+
+
+/*
+ * Queue entry that parks a call frame under a transaction id until the
+ * driver reports completion; linked into nsr_recon_private_t.list.
+ */
+typedef struct _nsr_txn_id_s {
+ uint32_t txn_id; /* key used by get_frame() */
+ call_frame_t *frame; /* frame to unwind when the transaction completes */
+ struct list_head list;
+} nsr_txn_id_t;
+
+// Look up the NSR per-fd context stored on fd for this translator.
+// Returns 0 and stores the context in *rfd on success; otherwise returns
+// fd_ctx_get()'s non-zero result and leaves *rfd untouched.
+static int32_t this_fd_ctx_get(fd_t *fd, xlator_t *this, nsr_recon_fd_t **rfd)
+{
+        uint64_t ctx_value = 0;
+        int32_t  rc        = fd_ctx_get(fd, this, &ctx_value);
+
+        if (rc != 0)
+                return rc;
+
+        *rfd = (nsr_recon_fd_t *)ctx_value;
+        return 0;
+}
+
+// Park `frame` in the pending queue under txn_id so that it can be unwound
+// later by nsr_recon_return_back().
+// If the queue entry cannot be allocated the frame is NOT queued; an error
+// is logged and the function returns (it has no way to report failure).
+// NOTE(review): the list is modified without a lock although it is read from
+// the driver thread as well -- confirm whether external serialisation exists.
+static void put_frame(nsr_recon_private_t *priv,
+                      call_frame_t *frame,
+                      uint32_t txn_id)
+{
+        xlator_t     *this = priv->this;
+        nsr_txn_id_t *tid  = NULL;
+
+        tid = GF_CALLOC(1, sizeof(*tid), gf_mt_recon_private_t);
+        if (!tid) {
+                recon_main_log (this->name, GF_LOG_ERROR, "cannot allocate queue entry for txn id %d \n", txn_id);
+                return;
+        }
+
+        tid->txn_id = txn_id;
+        tid->frame = frame;
+        INIT_LIST_HEAD(&(tid->list));
+        list_add_tail(&(tid->list), &(priv->list));
+        recon_main_log (this->name, GF_LOG_INFO, "adding framef or txn id %d into queue \n", txn_id);
+}
+
+// Take the frame parked under txn_id out of the pending queue.
+// On a match, *frame receives the parked frame and the queue entry is
+// unlinked and freed, so a transaction can be completed only once and the
+// entry is not leaked.  When nothing matches, *frame is left untouched and
+// GF_ASSERT(0) fires.
+static void get_frame(nsr_recon_private_t *priv,
+                      call_frame_t **frame,
+                      uint32_t txn_id)
+{
+        nsr_txn_id_t *tid  = NULL;
+        xlator_t     *this = priv->this;
+
+        list_for_each_entry(tid, &(priv->list), list) {
+                if (tid->txn_id != txn_id)
+                        continue;
+
+                *frame = tid->frame;
+                // Safe while iterating: we return right after unlinking.
+                list_del(&(tid->list));
+                GF_FREE(tid);
+                recon_main_log (this->name, GF_LOG_INFO, "got frame for txn id %d into queue \n", txn_id);
+                return;
+        }
+        recon_main_log (this->name, GF_LOG_INFO, "got no frame for txn id %d into queue \n", txn_id);
+        GF_ASSERT(0);
+}
+
+// Get the term info for the term number specified.
+//
+// bp   - changelog base directory; the term file is "<bp>/TERM.<term>"
+// term - term to describe
+// lt   - output; always zeroed first and lt->last_term set to `term`
+//
+// When the term file exists and is larger than 128 bytes, first_index is 1,
+// last_index is ((size - 128) / 128) + 1 and commited_ops is the number of
+// indices in that range.  Otherwise the index fields stay 0.
+// NOTE(review): with a 128-byte header and 128-byte records the "+ 1" makes
+// last_index one larger than the record count -- confirm against the
+// changelog format.
+void nsr_recon_libchangelog_get_this_term_info(xlator_t *this, char *bp, int32_t term, nsr_recon_last_term_info_t *lt)
+{
+        struct stat buf;
+        char path[PATH_MAX];
+
+        memset(lt, 0, sizeof(*lt));
+        lt->last_term = term;
+        // Bounded formatting: bp comes from configuration.
+        snprintf(path, sizeof(path), "%s/%s%d", bp, "TERM.", term);
+        if (!stat(path, &buf) && (buf.st_size > 128)) {
+                lt->first_index = 1;
+                lt->last_index = ((buf.st_size - 128)/128) + 1 ;
+                lt->commited_ops = lt->last_index - lt->first_index + 1;
+        }
+        recon_main_log (this->name, GF_LOG_INFO, "for term=%d got first_index=%d last_index=%d commited_ops=%d\n",
+                        term, lt->first_index, lt->last_index, lt->commited_ops);
+        return;
+}
+
+// Given the current term number, find the newest term at or below it that
+// has a changelog file ("<bp>/TERM.<n>") and describe it in *lt.
+//
+// *lt is zeroed first, so it stays all-zero when no term file is found or
+// when `term` is not positive.  The search counter is signed: a negative
+// term from the client must not turn into a multi-billion iteration scan.
+void nsr_recon_libchangelog_get_last_term_info(xlator_t *this, char *bp, int32_t term, nsr_recon_last_term_info_t *lt)
+{
+        int32_t t = 0;
+        struct stat buf;
+        char path[PATH_MAX];
+
+        memset(lt, 0, sizeof(*lt));
+        for (t = term; t > 0; t--) {
+                // journal file is of type TERM.<n>
+                snprintf(path, sizeof(path), "%s/%s%d", bp, "TERM.", t);
+                if (stat(path, &buf) != 0)
+                        continue;
+
+                nsr_recon_libchangelog_get_this_term_info(this, bp, t, lt);
+                recon_main_log (this->name, GF_LOG_INFO, "got last term given current term %d as %d\n", term, t);
+                return;
+        }
+        recon_main_log (this->name, GF_LOG_INFO, "got no last term given current term %d \n", term);
+
+        return;
+}
+
+// Complete the write that was parked under txn_id: fetch its frame from the
+// pending queue and unwind it as a successful writev.  Logs an error when no
+// frame is found.
+void nsr_recon_return_back(nsr_recon_private_t *priv, uint32_t txn_id)
+{
+        xlator_t     *this    = priv->this;
+        call_frame_t *pending = NULL;
+
+        get_frame(priv, &pending, txn_id);
+        if (!pending) {
+                recon_main_log (this->name, GF_LOG_ERROR, "EIII---nsr_recon_writev cnnot return old frame \n");
+                return;
+        }
+
+        recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_writev returns old frame \n");
+        // answer the original write for which this ack was sent
+        STACK_UNWIND_STRICT (writev, pending, 0, 0, NULL, NULL, NULL);
+}
+
+/*
+ * Field layout that follows the opcode in a changelog record; chosen from
+ * the opcode by the record parser.  Values are consecutive, starting at 1.
+ */
+typedef enum records_type_t {
+        fop_gfid_pgfid_oldloc_newloc = 1,
+        fop_gfid_pgfid_entry,           /* 2 */
+        fop_gfid,                       /* 3 */
+        fop_gfid_offset,                /* 4 */
+        fop_gfid_offset_len,            /* 5 */
+} records_type_t;
+
+// Build the backend path "<base_dir>/.glusterfs/<g0g1>/<g2g3>/<gfid>" where
+// g0..g3 are the first four characters of gfid.
+//
+// path must point to a buffer of at least PATH_MAX bytes (the only caller
+// passes one); the result is truncated rather than overflowing it.
+// gfid is expected to be a full gfid string (at least 4 characters).
+static void
+get_gfid_path(nsr_recon_private_t *priv, char *gfid, char *path)
+{
+        snprintf(path, PATH_MAX, "%s/.glusterfs/%.2s/%.2s/%s",
+                 priv->base_dir, gfid, gfid + 2, gfid);
+}
+
+
+// Get the link to which backend points to
+static gf_boolean_t
+get_link_using_gfid(nsr_recon_private_t *priv, char *gfid, char *path)
+{
+ char lp[PATH_MAX];
+ xlator_t *this = priv->this;
+ get_gfid_path(priv,gfid, lp);
+ if (readlink(lp, path, 255) == -1) {
+ GF_ASSERT(0);
+ recon_main_log(priv->this, GF_LOG_ERROR,
+ "cannot get readlink for %s\n",lp);
+ return _gf_false;
+ }
+ return _gf_true;
+}
+
+// Copy one NUL-terminated field starting at *cur into dst (capacity
+// dst_len), never reading at or beyond `end`.  On success *cur is advanced
+// past the terminator and 0 is returned; -1 means the field is unterminated
+// within the data read or does not fit in dst.
+static int
+recon_rec_copy_str(char **cur, char *end, char *dst, size_t dst_len)
+{
+        char   *p = *cur;
+        size_t  n = 0;
+
+        while ((p < end) && (*p != '\0')) {
+                if ((n + 1) >= dst_len)
+                        return -1;
+                dst[n++] = *p++;
+        }
+        if (p >= end)
+                return -1;
+        dst[n] = '\0';
+        *cur = p + 1;
+        return 0;
+}
+
+// Copy a fixed 36-character gfid at *cur into dst (37 bytes) and advance
+// *cur past it.  Returns -1 when fewer than 36 bytes remain before `end`.
+static int
+recon_rec_copy_gfid(char **cur, char *end, char *dst)
+{
+        if ((end - *cur) < 36)
+                return -1;
+        memcpy(dst, *cur, 36);
+        dst[36] = '\0';
+        *cur += 36;
+        return 0;
+}
+
+// Copy a "<36-char gfid>/<name>\0" location at *cur into gfid and name.
+static int
+recon_rec_copy_loc(char **cur, char *end, char *gfid, char *name, size_t name_len)
+{
+        if (recon_rec_copy_gfid(cur, end, gfid))
+                return -1;
+        if ((*cur >= end) || (**cur != '/'))
+                return -1;
+        (*cur)++;
+        return recon_rec_copy_str(cur, end, name, name_len);
+}
+
+// Map a fop opcode to the field layout of its record.  Returns 0 and sets
+// *type for known opcodes, -1 otherwise.
+static int
+recon_rec_classify(uint32_t opcode, records_type_t *type)
+{
+        switch (opcode) {
+        case GF_FOP_RENAME:
+                *type = fop_gfid_pgfid_oldloc_newloc;
+                return 0;
+        case GF_FOP_UNLINK:
+        case GF_FOP_RMDIR:
+        case GF_FOP_LINK:
+        case GF_FOP_MKDIR:
+        case GF_FOP_SYMLINK:
+        case GF_FOP_MKNOD:
+        case GF_FOP_CREATE:
+                *type = fop_gfid_pgfid_entry;
+                return 0;
+        case GF_FOP_FSETATTR:
+        case GF_FOP_SETATTR:
+        case GF_FOP_FREMOVEXATTR:
+        case GF_FOP_REMOVEXATTR:
+        case GF_FOP_SETXATTR:
+        case GF_FOP_FSETXATTR:
+                *type = fop_gfid;
+                return 0;
+        case GF_FOP_TRUNCATE:
+        case GF_FOP_FTRUNCATE:
+                *type = fop_gfid_offset;
+                return 0;
+        case GF_FOP_WRITE:
+                *type = fop_gfid_offset_len;
+                return 0;
+        default:
+                return -1;
+        }
+}
+
+// Parse the record that starts at `slot` into *rec.  `end` is one past the
+// last byte actually read from the changelog; nothing at or beyond it is
+// touched.  *rec is zeroed first (type NSR_LOG_HOLE), so a slot that does
+// not start with "_PRE_", has an unknown opcode, or is malformed is reported
+// as a hole instead of leaving stale bytes in the reply.  Successfully
+// parsed records are left in network byte order.
+static void
+recon_rec_parse(xlator_t *this, char *slot, char *end,
+                nsr_recon_record_details_t *rec, uint32_t index)
+{
+        char            op_str[16];
+        char            num_str[128];
+        char           *cur    = slot;
+        uint32_t        opcode = 0;
+        records_type_t  type   = fop_gfid;
+
+        memset(rec, 0, sizeof(*rec));
+        rec->type = NSR_LOG_HOLE;
+
+        if (((end - slot) < 9) || strncmp(slot, "_PRE_", 5))
+                return;
+
+        // skip "_PRE_" and the 4 NULs that follow it; the opcode comes next
+        cur = slot + 5 + 4;
+        if (recon_rec_copy_str(&cur, end, op_str, sizeof(op_str)))
+                goto malformed;
+        opcode = strtoul(op_str, NULL, 10);
+        recon_main_log (this->name, GF_LOG_INFO,
+                        "libchangelog_get_records: got opcode %d @index %d\n", opcode, index);
+
+        if (recon_rec_classify(opcode, &type)) {
+                // TBD - check this logic later. maybe we should raise alarm
+                // here because this means that changelog is corrupted. We are
+                // not handling changelog corruptions as of now.
+                recon_main_log (this->name, GF_LOG_ERROR,
+                                "libchangelog_get_records:got no proper opcode %d @index %d\n",
+                                opcode, index);
+                return;
+        }
+
+        // TBD - handle pseudo holes once that logic is in.
+        rec->type = NSR_LOG_FILL;
+        recon_main_log (this->name, GF_LOG_INFO,
+                        "libchangelog_get_records:got type %d at index %d \n",
+                        rec->type, index);
+        rec->op = opcode;
+
+        if (recon_rec_copy_gfid(&cur, end, rec->gfid))
+                goto malformed;
+
+        if (opcode == GF_FOP_SYMLINK) {
+                // the symlink would have been removed. Hence ignore this.
+                // TBD - have an uniform error policy in case of such cases.
+                if (get_link_using_gfid(this->private, rec->gfid, rec->link_path) == _gf_false) {
+                        rec->type = NSR_LOG_HOLE;
+                        return;
+                }
+        }
+
+        // a NUL separates the gfid from the remaining fields
+        if ((cur >= end) || (*cur != '\0'))
+                goto malformed;
+        cur++;
+
+        if ((type == fop_gfid_offset) || (type == fop_gfid_offset_len)) {
+                if (recon_rec_copy_str(&cur, end, num_str, sizeof(num_str)))
+                        goto malformed;
+                rec->offset = strtoul(num_str, NULL, 10);
+                recon_main_log (this->name, GF_LOG_INFO,
+                                "libchangelog_get_records:got offset %d @index %d \n", rec->offset, index);
+        }
+        if (type == fop_gfid_offset_len) {
+                if (recon_rec_copy_str(&cur, end, num_str, sizeof(num_str)))
+                        goto malformed;
+                rec->len = strtoul(num_str, NULL, 10);
+                recon_main_log (this->name, GF_LOG_INFO,
+                                "libchangelog_get_records:got length %d @index %d \n", rec->len, index);
+        }
+        if (type == fop_gfid_pgfid_entry) {
+                if (recon_rec_copy_loc(&cur, end, rec->pargfid,
+                                       rec->entry, sizeof(rec->entry)))
+                        goto malformed;
+                recon_main_log (this->name, GF_LOG_INFO,
+                                "libchangelog_get_records:got entry %s @index %d \n", rec->entry, index);
+        }
+        if (type == fop_gfid_pgfid_oldloc_newloc) {
+                // source goes to pargfid/entry, destination to gfid/newloc
+                if (recon_rec_copy_loc(&cur, end, rec->pargfid,
+                                       rec->entry, sizeof(rec->entry)))
+                        goto malformed;
+                if (recon_rec_copy_loc(&cur, end, rec->gfid,
+                                       rec->newloc, sizeof(rec->newloc)))
+                        goto malformed;
+        }
+
+        ENDIAN_CONVERSION_RD((*rec), _gf_false); //htonl
+        return;
+
+malformed:
+        recon_main_log (this->name, GF_LOG_ERROR,
+                        "libchangelog_get_records: malformed record @index %d\n", index);
+        memset(rec, 0, sizeof(*rec));
+        rec->type = NSR_LOG_HOLE;
+}
+
+// Get the list of changelog records given a term, first and last index.
+//
+// bp    - changelog base directory; the term file is "<bp>/TERM.<term>"
+// first - first index wanted; index 0 is read from the same offset as 1
+// last  - last index wanted (inclusive); nothing is done if last < first
+// buf   - output array with room for (last - first + 1)
+//         nsr_recon_record_details_t entries; every entry is written
+//
+// Slots are 128 bytes wide.  Field parsing is bounded by the number of bytes
+// actually read rather than by the slot width, so a record that runs past
+// its own slot is still parsed as before.
+// TBD - right now all records are pseudo holes but mark them as fills.
+// TBD - pseudo hole to be implemented when actual fsync gets done on data.
+void nsr_recon_libchangelog_get_records(xlator_t *this, char *bp, int32_t term, uint32_t first, uint32_t last, void *buf)
+{
+        nsr_recon_record_details_t *rec = buf;
+        char    *read_buf = NULL;
+        char     path[PATH_MAX];
+        size_t   count = 0;
+        size_t   want  = 0;
+        size_t   total = 0;
+        size_t   k     = 0;
+        ssize_t  nread = 0;
+        off_t    pos   = 0;
+        int      fd    = -1;
+
+        recon_main_log (this->name, GF_LOG_INFO,
+                        "libchangelog_get_records called for term %d index from %d to %d \n",
+                        term, first, last );
+
+        if (last < first) {
+                recon_main_log (this->name, GF_LOG_ERROR,
+                                "libchangelog_get_records: invalid window %d to %d\n",
+                                first, last);
+                goto out;
+        }
+        count = (size_t)(last - first) + 1;
+        want  = count * 128;
+
+        // Every output slot starts out as a hole so that failures below never
+        // expose whatever the reply buffer held before.
+        memset(buf, 0, count * sizeof(*rec));
+
+        // Heap, not stack: the window size is chosen by the client.
+        read_buf = GF_CALLOC(count, 128, gf_mt_recon_private_t);
+        if (!read_buf) {
+                recon_main_log (this->name, GF_LOG_ERROR,
+                                "libchangelog_get_records: cannot allocate read buffer\n");
+                goto out;
+        }
+
+        snprintf(path, sizeof(path), "%s/%s%d", bp, "TERM.", term);
+        fd = open(path, O_RDONLY);
+        if (fd == -1) {
+                recon_main_log (this->name, GF_LOG_ERROR,
+                                "libchangelog_get_records: cannot open %s\n", path);
+                goto out;
+        }
+
+        pos = (first == 0) ? (off_t)128 : ((off_t)first * 128);
+        if (lseek(fd, pos, SEEK_SET) == (off_t)-1) {
+                recon_main_log (this->name, GF_LOG_ERROR,
+                                "libchangelog_get_records: cannot seek in %s\n", path);
+                goto out;
+        }
+
+        while (total < want) {
+                nread = read(fd, read_buf + total, want - total);
+                if (nread <= 0)
+                        break;
+                total += (size_t)nread;
+        }
+
+        for (k = 0; k < count; k++) {
+                recon_main_log (this->name, GF_LOG_INFO,
+                                "libchangelog_get_records start inspecting records at index %d \n",
+                                (uint32_t)(first + k) );
+                recon_rec_parse(this, read_buf + (k * 128), read_buf + total,
+                                &rec[k], (uint32_t)(first + k));
+        }
+
+out:
+        if (fd != -1)
+                close(fd);
+        if (read_buf)
+                GF_FREE(read_buf);
+
+        recon_main_log (this->name, GF_LOG_INFO,
+                        "libchangelog_get_records finsihed inspecting records for term %d \n",
+                        term);
+        return;
+}
+
+/*
+ * open fop: allocate the per-fd context and attach it to fd.
+ *
+ * Always unwinds the frame.  op_ret is 0 on success; on failure it is -1
+ * with ENOMEM (context allocation failed) or EINVAL (fd_ctx_set failed).
+ * The context is freed again if it could not be attached.
+ */
+int32_t
+nsr_recon_open (call_frame_t *frame, xlator_t *this,
+                loc_t *loc, int32_t flags, fd_t *fd, dict_t *xdata)
+{
+        int32_t         op_ret   = 0;
+        int32_t         op_errno = 0;
+        nsr_recon_fd_t *rfd      = NULL;
+
+        recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_open called for path %s \n",loc->path );
+        rfd = GF_CALLOC (1, sizeof (*rfd), gf_mt_recon_private_t);
+        if (!rfd) {
+                op_ret = -1;
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        if (fd_ctx_set (fd, this, (uint64_t)(long)rfd)) {
+                GF_FREE (rfd);
+                op_ret = -1;
+                op_errno = EINVAL;
+        }
+
+out:
+        recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_open returns with %d for path %s \n",op_ret,loc->path );
+        STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, NULL);
+        return 0;
+}
+
+/*
+ * writev fop: the write offset selects a command ("sector") and the first
+ * iovec carries its payload.
+ *
+ *   sector 0 - payload holds a transaction id; the frame parked under that
+ *              id is unwound, then this write is answered.
+ *   sector 1 - payload is an nsr_recon_role_t; the role is handed to the
+ *              driver and THIS frame is parked until the driver finishes
+ *              (it is unwound later by nsr_recon_return_back()).
+ *   sector 2 - payload is an nsr_recon_log_info_t; term and index window are
+ *              stored in the fd context for the following read.
+ *   sector 3 - payload is a term; stored in the fd context.
+ *   sector 4 - payload is the current term; stored in the fd context.
+ *
+ * Every path either unwinds the frame exactly once or parks it (sector 1
+ * success).  Missing fd context, short payloads, invalid roles or member
+ * counts, and unknown offsets are answered with op_ret -1 / EINVAL.
+ */
+int32_t
+nsr_recon_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+                  struct iovec *vector, int32_t count, off_t offset,
+                  uint32_t flags, struct iobref *iobref, dict_t *xdata)
+{
+        nsr_recon_fd_t      *rfd      = NULL;
+        nsr_recon_private_t *priv     = NULL;
+        int32_t              op_ret   = 0;
+        int32_t              op_errno = 0;
+        size_t               len      = 0;
+
+        if ((this_fd_ctx_get (fd, this, &rfd) != 0) || !rfd) {
+                op_ret = -1;
+                op_errno = EINVAL;
+                goto unwind;
+        }
+        priv = (nsr_recon_private_t *)this->private;
+
+        recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_writev called for offset %d \n",(unsigned int)offset );
+        if ((count < 1) || !vector || !vector[0].iov_base) {
+                op_ret = -1;
+                op_errno = EINVAL;
+                goto unwind;
+        }
+        GF_ASSERT(count == 1);
+        len = vector[0].iov_len;
+
+        switch (offset) {
+        // gets called to return back
+        case nsr_recon_xlator_sector_0:
+        {
+                char c[5] = {0, }; // 4 payload bytes + terminator for atoi()
+                uint32_t txn_id;
+
+                if (len < 4) {
+                        op_ret = -1;
+                        op_errno = EINVAL;
+                        break;
+                }
+                recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_writev clled to return back \n");
+                memcpy((void *)c, (void *)vector[0].iov_base, 4);
+                txn_id = ntohl(atoi(c));
+                nsr_recon_return_back(priv, txn_id);
+                break;
+        }
+        // client(brick, leader) writes the role of the node
+        case nsr_recon_xlator_sector_1 :
+        {
+                nsr_recon_role_t rr;
+                uint32_t txn_id = 0;
+
+                if (len < sizeof(rr)) {
+                        op_ret = -1;
+                        op_errno = EINVAL;
+                        break;
+                }
+                memcpy((void *)&rr, (void *)vector[0].iov_base, sizeof(rr));
+                // Validate the member count before anything iterates over it.
+                if (ntohl(rr.num) > MAXIMUM_REPLICA_STRENGTH) {
+                        recon_main_log (this->name, GF_LOG_ERROR,
+                                        "EIII---nsr_recon_writev invalid member count \n");
+                        op_ret = -1;
+                        op_errno = EINVAL;
+                        break;
+                }
+                ENDIAN_CONVERSION_RR(rr, _gf_true); //ntohl
+
+                recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_writev called to set role %d\n", rr.role);
+                if ((rr.role != leader) &&
+                    (rr.role != reconciliator) &&
+                    (rr.role != resolutor)) {
+                        recon_main_log (this->name, GF_LOG_ERROR,
+                                        "EIII---nsr_recon_writev cannot set state \n");
+                        op_ret = -1;
+                        op_errno = EINVAL;
+                        break;
+                }
+
+                // Use one snapshot of the id for both the driver and the
+                // queue so the completion can find this frame.
+                // NOTE(review): the driver may complete before put_frame()
+                // runs; confirm whether that ordering can occur.
+                txn_id = priv->txn_id;
+                if (nsr_recon_driver_set_role(priv->driver_thread_context,
+                                              &rr,
+                                              txn_id) == _gf_false) {
+                        recon_main_log (this->name, GF_LOG_ERROR,
+                                        "nsr_recon_writev set_role - cannot seem to set role \n");
+                        op_ret = -1;
+                        op_errno = EINVAL;
+                        break;
+                }
+
+                // Store the stack frame so that when the actual job gets
+                // finished we send the response back to the brick.
+                atomic_cmpxchg(&priv->txn_id, txn_id, txn_id + 1);
+                put_frame(priv, frame, txn_id);
+                recon_main_log (this->name, GF_LOG_INFO,
+                                "nsr_recon_writev set_role - set role succesfully \n");
+                return 0;
+        }
+        // client(reconciliator) writes how much it needs for the read
+        case nsr_recon_xlator_sector_2 :
+        {
+                nsr_recon_log_info_t li;
+
+                if (len < sizeof(li)) {
+                        op_ret = -1;
+                        op_errno = EINVAL;
+                        break;
+                }
+                memcpy((void *)&li, (void *)vector[0].iov_base, sizeof(li));
+                ENDIAN_CONVERSION_LI(li, _gf_true); //ntohl
+
+                recon_main_log (this->name, GF_LOG_INFO,
+                                "nsr_recon_writev - setting term info for reconcilation info. term=%d, first_index=%d,start_index=%d \n",
+                                li.term, li.first_index, li.last_index);
+                rfd->term = li.term;
+                rfd->last_index = li.last_index;
+                rfd->first_index = li.first_index;
+                break;
+        }
+        // client(reconciliator) writes term for which it needs info
+        case nsr_recon_xlator_sector_3 :
+        {
+                int32_t term;
+
+                if (len < sizeof(term)) {
+                        op_ret = -1;
+                        op_errno = EINVAL;
+                        break;
+                }
+                memcpy((void *)&term, (void *)vector[0].iov_base, sizeof(term));
+                term = ntohl(term); //ntohl
+                recon_main_log (this->name, GF_LOG_INFO,
+                                "nsr_recon_writev - setting term info for term info. term=%d\n",
+                                term);
+                rfd->term = term;
+                break;
+        }
+        // client(reconciliator) writes current term so that it gets last term info later
+        case nsr_recon_xlator_sector_4 :
+        {
+                int32_t term;
+
+                if (len < sizeof(term)) {
+                        op_ret = -1;
+                        op_errno = EINVAL;
+                        break;
+                }
+                memcpy((void *)&term, (void *)vector[0].iov_base, sizeof(term));
+                term = ntohl(term); //ntohl
+                recon_main_log (this->name, GF_LOG_INFO,
+                                "nsr_recon_writev - setting term info for last term info given current term=%d\n",
+                                term);
+                rfd->term = term;
+                break;
+        }
+        default:
+                recon_main_log (this->name, GF_LOG_ERROR,
+                                "nsr_recon_writev - unknown offset %d \n",(unsigned int)offset );
+                op_ret = -1;
+                op_errno = EINVAL;
+                break;
+        }
+
+unwind:
+        STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno,
+                             NULL, NULL, NULL);
+        return 0;
+}
+
+/*
+ * readv fop: the read offset selects which reply to build from the state a
+ * previous write stored in the fd context.
+ *
+ *   sector 3 - nsr_recon_last_term_info_t for rfd->term
+ *   sector 2 - one nsr_recon_record_details_t per index in
+ *              [rfd->first_index, rfd->last_index]
+ *   sector 4 - nsr_recon_last_term_info_t of the newest term at or below
+ *              rfd->term
+ *
+ * `size` must equal the exact reply size for the sector.  On success op_ret
+ * is `size`; failures unwind with op_ret -1 and ENOMEM (buffer allocation)
+ * or EINVAL (missing fd context, size mismatch, bad window, unknown offset).
+ * The reply buffer is sized from `size` and zeroed before use.
+ */
+int
+nsr_recon_readv (call_frame_t *frame, xlator_t *this,
+                 fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata)
+{
+        nsr_recon_fd_t      *rfd      = NULL;
+        nsr_recon_private_t *priv     = NULL;
+        struct iobuf        *iobuf    = NULL;
+        struct iobref       *iobref   = NULL;
+        struct iovec         iov      = {0, };
+        int32_t              op_ret   = -1;
+        int32_t              op_errno = 0;
+
+        if ((this_fd_ctx_get (fd, this, &rfd) != 0) || !rfd) {
+                op_errno = EINVAL;
+                goto out;
+        }
+        priv = (nsr_recon_private_t *)this->private;
+
+        iobuf = iobuf_get2 (this->ctx->iobuf_pool, size);
+        if (!iobuf) {
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        iobref = iobref_new ();
+        if (!iobref) {
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        if (iobref_add (iobref, iobuf) != 0) {
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        // Never hand back bytes the sector handler did not write.
+        memset (iobuf->ptr, 0, size);
+
+        recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_readv called for offset %d \n",(unsigned int)offset );
+        switch (offset) {
+        // client(leader) reads from here to get info for this term on this node
+        // invoke libchangelog to get the information
+        case nsr_recon_xlator_sector_3 :
+        {
+                nsr_recon_last_term_info_t lt;
+
+                if (size != sizeof(lt)) {
+                        op_errno = EINVAL;
+                        break;
+                }
+                nsr_recon_libchangelog_get_this_term_info(this,priv->changelog_base_path, rfd->term, &lt);
+                recon_main_log (this->name, GF_LOG_INFO,
+                                "nsr_recon_readv - getting term info for term=%d, ops=%d, first=%d, last=%d\n",
+                                rfd->term, lt.commited_ops, lt.first_index, lt.last_index);
+                ENDIAN_CONVERSION_LT(lt, _gf_false); //htonl
+                memcpy(iobuf->ptr, &lt, size);
+                break;
+        }
+        // client(reconciliator) reads individual record information
+        case nsr_recon_xlator_sector_2 :
+        {
+                uint32_t num = (rfd->last_index - rfd->first_index + 1);
+
+                recon_main_log (this->name, GF_LOG_INFO,
+                                "nsr_recon_readv - expected size %lu got size %lu\n",
+                                (num * sizeof(nsr_recon_record_details_t)), size);
+
+                if ((rfd->last_index < rfd->first_index) ||
+                    (size != (num * sizeof(nsr_recon_record_details_t)))) {
+                        op_errno = EINVAL;
+                        break;
+                }
+                recon_main_log (this->name, GF_LOG_INFO,
+                                "nsr_recon_readv - getting records for term=%d from %d to %d\n",
+                                rfd->term, rfd->first_index, rfd->last_index);
+                nsr_recon_libchangelog_get_records(this, priv->changelog_base_path,
+                                                   rfd->term, rfd->first_index, rfd->last_index, iobuf->ptr);
+                break;
+        }
+        // read last term info
+        case nsr_recon_xlator_sector_4 :
+        {
+                nsr_recon_last_term_info_t lt;
+
+                if (size != sizeof(lt)) {
+                        op_errno = EINVAL;
+                        break;
+                }
+                nsr_recon_libchangelog_get_last_term_info(this, priv->changelog_base_path, rfd->term, &lt);
+                recon_main_log (this->name, GF_LOG_INFO,
+                                "nsr_recon_readv - getting last term info given current term=%d. last term = %d ops=%d, first=%d, last=%d\n",
+                                rfd->term, lt.last_term, lt.commited_ops, lt.first_index, lt.last_index);
+                ENDIAN_CONVERSION_LT(lt, _gf_false); //htonl
+                memcpy(iobuf->ptr, &lt, size);
+                break;
+        }
+        default:
+                recon_main_log (this->name, GF_LOG_ERROR,
+                                "nsr_recon_readv - unknown offset %d \n",(unsigned int)offset );
+                op_errno = EINVAL;
+                break;
+        }
+
+out:
+        if (op_errno == 0) {
+                iov.iov_base = iobuf->ptr;
+                op_ret = iov.iov_len = size;
+        }
+
+        STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, &iov, 1, NULL, iobref , NULL);
+
+        if (iobref)
+                iobref_unref (iobref);
+        if (iobuf)
+                iobuf_unref (iobuf);
+        return 0;
+}
+
+/*
+ * lookup fop: every lookup succeeds and resolves to the inode table's root
+ * inode, reported as a regular file.
+ */
+int
+nsr_recon_lookup (call_frame_t *frame, xlator_t *this,
+                  loc_t *loc, dict_t *xdata)
+{
+        // dirty hack to set root as regular but seems to work.
+        struct iatt root_attr = { .ia_type = IA_IFREG };
+
+        recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_lookup called \n");
+
+        STACK_UNWIND_STRICT (lookup, frame, 0, 0, this->itable->root, &root_attr, NULL, NULL);
+        return 0;
+}
+
+
+/* flush fop: nothing to flush; always answers success. */
+int32_t
+nsr_recon_flush (call_frame_t *frame, xlator_t *this,
+                 fd_t *fd, dict_t *xdata)
+{
+        const int32_t op_ret   = 0;
+        const int32_t op_errno = 0;
+
+        STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno, NULL);
+        return 0;
+}
+
+/*
+ * Translator init: read the options, build the member table and start the
+ * reconciliation driver thread.
+ *
+ * replica_group_members[0] is the local member; slots 1..size-1 come from
+ * the comma-separated "replica-group-members" option, which is tokenised on
+ * a private copy so the option string itself is left intact.
+ * Returns 0 on success.  On any failure everything allocated here is
+ * released, this->private is cleared and -1 is returned.
+ */
+int32_t
+init (xlator_t *this)
+{
+        nsr_recon_private_t *priv         = NULL;
+        char                *local        = NULL;
+        char                *members      = NULL;
+        char                *members_copy = NULL;
+        char                *member       = NULL;
+        char                *saveptr      = NULL;
+        unsigned int         i            = 0;
+
+        priv = GF_CALLOC (1, sizeof (*priv), gf_mt_recon_private_t);
+        if (!priv) {
+                recon_main_log (this->name, GF_LOG_ERROR,
+                                "priv allocation error\n");
+                return -1;
+        }
+        GF_OPTION_INIT ("replica-group-size", priv->replica_group_size, uint32, err);
+        GF_OPTION_INIT ("vol-name", priv->volname, str, err);
+        if (!priv->volname) {
+                recon_main_log (this->name, GF_LOG_ERROR,
+                                "missing volname option (required)");
+                goto err;
+        }
+        GF_OPTION_INIT ("changelog-dir", priv->changelog_base_path, str, err);
+        if (!priv->changelog_base_path) {
+                recon_main_log (this->name, GF_LOG_ERROR,
+                                "missing changelog directory option (required)");
+                goto err;
+        }
+        GF_OPTION_INIT ("base-dir", priv->base_dir, str, err);
+        if (!priv->base_dir) {
+                recon_main_log (this->name, GF_LOG_ERROR,
+                                "missing brick base directory option (required)");
+                goto err;
+        }
+        GF_OPTION_INIT ("replica-group-members", members, str, err);
+        if (!members) {
+                recon_main_log (this->name, GF_LOG_ERROR,
+                                "missing membership option (required)");
+                goto err;
+        }
+        GF_OPTION_INIT ("local-member", local, str, err);
+        if (!local) {
+                recon_main_log (this->name, GF_LOG_ERROR,
+                                "missing local member option (required)");
+                goto err;
+        }
+
+        priv->replica_group_members = GF_CALLOC (priv->replica_group_size,
+                                                 sizeof(char *),
+                                                 gf_mt_recon_private_t);
+        if (!priv->replica_group_members) {
+                recon_main_log (this->name, GF_LOG_ERROR,
+                                "str allocation error\n");
+                goto err;
+        }
+
+        // + 1 for the terminator copied by strcpy()
+        priv->replica_group_members[0] = GF_CALLOC (1,
+                                                    strlen(local) + 1,
+                                                    gf_mt_recon_private_t);
+        if (!priv->replica_group_members[0]) {
+                recon_main_log (this->name, GF_LOG_ERROR,
+                                "str allocation error\n");
+                goto err;
+        }
+        strcpy(priv->replica_group_members[0], local);
+
+        // strtok_r() writes into its input, so tokenise a private copy.
+        members_copy = GF_CALLOC (1, strlen(members) + 1, gf_mt_recon_private_t);
+        if (!members_copy) {
+                recon_main_log (this->name, GF_LOG_ERROR,
+                                "str allocation error\n");
+                goto err;
+        }
+        strcpy(members_copy, members);
+
+        for (i=1; i < priv->replica_group_size; i++) {
+                member = strtok_r((i == 1) ? members_copy : NULL, ",", &saveptr);
+                if (!member) {
+                        recon_main_log (this->name, GF_LOG_ERROR,
+                                        "replica-group-members lists fewer members than replica-group-size\n");
+                        goto err;
+                }
+                priv->replica_group_members[i] = GF_CALLOC (1, strlen(member) + 1, gf_mt_recon_private_t);
+                if (!priv->replica_group_members[i]) {
+                        recon_main_log (this->name, GF_LOG_ERROR,
+                                        "str allocation error\n");
+                        goto err;
+                }
+                strcpy(priv->replica_group_members[i], member);
+        }
+        GF_FREE (members_copy);
+        members_copy = NULL;
+
+        priv->this = this;
+        // The pending-frame list must be usable before the driver can run.
+        INIT_LIST_HEAD(&(priv->list));
+        this->private = (void *)priv;
+
+        recon_main_log (this->name, GF_LOG_INFO, "creating reconciliation driver \n");
+
+        if (pthread_create(&priv->thread_id, NULL, nsr_reconciliation_driver, priv)) {
+                recon_main_log (this->name, GF_LOG_ERROR,
+                                "pthread creation error \n");
+                goto err;
+        }
+
+        return 0;
+
+err:
+        if (members_copy)
+                GF_FREE (members_copy);
+        if (priv->replica_group_members) {
+                for (i = 0; i < priv->replica_group_size; i++) {
+                        if (priv->replica_group_members[i])
+                                GF_FREE (priv->replica_group_members[i]);
+                }
+                GF_FREE (priv->replica_group_members);
+        }
+        GF_FREE (priv);
+        this->private = NULL;
+        return -1;
+}
+
+
+/*
+ * Translator teardown: cancel the reconciliation driver thread and wait for
+ * it to exit.  Does nothing when there is no private data (for example when
+ * init did not complete).
+ * NOTE(review): the private structure is not released here -- confirm who
+ * owns it after the driver thread has exited.
+ */
+void
+fini (xlator_t *this)
+{
+        nsr_recon_private_t *priv = NULL;
+        void *ret = NULL;
+
+        priv = (nsr_recon_private_t *)this->private;
+        if (!priv)
+                return;
+
+        pthread_cancel(priv->thread_id);
+        pthread_join(priv->thread_id, &ret);
+}
+
+
+/*
+ * File operations implemented by this translator.  Reads and writes are
+ * interpreted as commands selected by their offset (see nsr_recon_writev
+ * and nsr_recon_readv).
+ */
+struct xlator_fops fops = {
+ .open = nsr_recon_open,
+ .readv = nsr_recon_readv,
+ .writev = nsr_recon_writev,
+ .lookup = nsr_recon_lookup,
+ .flush = nsr_recon_flush
+};
+
+/* No callbacks are registered. */
+struct xlator_cbks cbks = {
+};
+
+/*
+ * Options accepted by this translator; init() reads every one of them and
+ * fails when any of the string options is missing.
+ */
+struct volume_options options[] = {
+ { .key = {"replica-group-size"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 2,
+ .max = INT_MAX,
+ .default_value = "2",
+ .description = "Number of bricks in replica group. can be derived but putting it here for testing."
+ },
+ {
+ .key = {"vol-name"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "volume name"
+ },
+ {
+ /* stored by init() as replica_group_members[0] */
+ .key = {"local-member"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "member(brick) for which this translator is responsible."
+ },
+ {
+ /* split on ',' by init() into replica_group_members[1..] */
+ .key = {"replica-group-members"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "Comma seperated member names other than local."
+ },
+ {
+ /* directory holding the per-term "TERM.<n>" changelog files */
+ .key = {"changelog-dir"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "Base directory where per term changelogs are maintained."
+ },
+ {
+ /* prefix of the ".glusterfs/xx/yy/<gfid>" backend paths */
+ .key = {"base-dir"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "Base directory for this brick. This should go away once we fix gfid based lookups"
+ },
+ { .key = {NULL} },
+};
diff --git a/xlators/cluster/nsr-recon/src/recon_xlator.h b/xlators/cluster/nsr-recon/src/recon_xlator.h
new file mode 100644
index 000000000..c0f1e2145
--- /dev/null
+++ b/xlators/cluster/nsr-recon/src/recon_xlator.h
@@ -0,0 +1,78 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __RECON_XLATOR_H__
+#define __RECON_XLATOR_H__
+
+#include <semaphore.h>
+#include <pthread.h>
+
+/*
+ * Type id for the recon private structure, numbered immediately after
+ * gf_common_mt_end (presumably for memory accounting -- confirm at the
+ * allocation site).
+ * NOTE(review): the enum tag says "dht" although this is the NSR recon
+ * header -- looks like a copy/paste leftover; confirm before renaming.
+ */
+enum gf_dht_mem_types_ {
+ gf_mt_recon_private_t = gf_common_mt_end + 1,
+};
+
+/*
+ * Well-known positions used to talk to the recon translator.  Each value is
+ * a multiple of 512; the per-entry comments describe what is read or
+ * written at that position.
+ * NOTE(review): presumably these are byte offsets passed to readv/writev --
+ * confirm against the fop implementations.
+ */
+enum nsr_recon_xlator_sector_t {
+ nsr_recon_xlator_sector_0 = 0, // to report back the status of given transaction ids
+ nsr_recon_xlator_sector_1 = 512, // to write here information about leadership changes from the brick
+ nsr_recon_xlator_sector_2 = (512 * 2), // to write here individual roles and wait for that role to be done
+ nsr_recon_xlator_sector_3 = (512 *3), // read from here to get term info for given term
+ nsr_recon_xlator_sector_4 = (512 * 4), // read from here to get last term info
+};
+
+
+/*
+ * Per-translator private state for the reconciliation translator.  It is
+ * handed to the driver thread as its argument when that thread is created.
+ */
+typedef struct _nsr_recon_private_s {
+ xlator_t *this; //back pointer
+ unsigned int replica_group_size; // number of static members of replica group
+ char **replica_group_members; // replica group members (including itself in first slot)
+ pthread_t thread_id; // driver thread id
+ nsr_recon_driver_ctx_t *driver_thread_context; //driver thread context
+ unsigned int outstanding; // for communicating with driver thread
+ call_frame_t *frame; // old frame that is pending (just one as of now)
+ struct list_head list;
+ char *volname;
+ uint32_t txn_id;
+ char *changelog_base_path;
+ char *base_dir;
+#ifdef NSR_DEBUG
+ // descriptor of the debug log file; zero is treated as "not opened yet"
+ // by recon_main_log
+ uint32_t recon_main_log_fd;
+#endif
+} nsr_recon_private_t;
+
+#define atomic_cmpxchg __sync_val_compare_and_swap
+
+/*
+ * REVIEW
+ * Ideally, use gf_log like everyone else. Failing that, at least put the logs
+ * with all the others in /var/log instead of /tmp.
+ */
+#ifdef NSR_DEBUG
+#define recon_main_log(dom, levl, fmt...) \
+ { \
+ nsr_recon_private_t *priv = this->private; \
+ char c[255]; \
+ if (!priv->recon_main_log_fd) { \
+ mkdir("/tmp/nsr-logs/", 0777); \
+ priv->recon_main_log_fd = open("/tmp/nsr-logs/recon-main-log", O_RDWR|O_CREAT|O_TRUNC); \
+ } \
+ sprintf(c, fmt); \
+ write(priv->recon_main_log_fd, c, strlen(c)); \
+ }
+#else
+#define recon_main_log(dom, levl, fmt...) gf_log(dom, levl, fmt)
+#endif
+
+
+void nsr_recon_libchangelog_get_this_term_info(xlator_t *this, char *bp, int32_t term, nsr_recon_last_term_info_t *lt);
+void nsr_recon_libchangelog_get_last_term_info(xlator_t *this, char *bp, int32_t term, nsr_recon_last_term_info_t *lt);
+void nsr_recon_return_back(nsr_recon_private_t *priv, uint32_t term_id);
+void nsr_recon_libchangelog_get_records(xlator_t *this, char *bp, int32_t term, uint32_t first, uint32_t last, void *buf);
+
+
+#endif /* #ifndef __RECON_XLATOR_H__ */
diff --git a/xlators/cluster/nsr-server/Makefile.am b/xlators/cluster/nsr-server/Makefile.am
new file mode 100644
index 000000000..d471a3f92
--- /dev/null
+++ b/xlators/cluster/nsr-server/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/cluster/nsr-server/src/Makefile.am b/xlators/cluster/nsr-server/src/Makefile.am
new file mode 100644
index 000000000..df0d68539
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/Makefile.am
@@ -0,0 +1,36 @@
+# Build rules for the NSR server-side translator (nsr.la).
+
+# Code generators used to produce nsr-cg.c (see the rule below).
+python_PYTHON = codegen.py gen-fops.py
+
+xlator_LTLIBRARIES = nsr.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
+
+# NOTE(review): -lgfapi and -lcurl are libraries; automake convention is to
+# list them in nsr_la_LIBADD rather than LDFLAGS -- confirm before moving.
+nsr_la_LDFLAGS = -module -avoid-version -lgfapi -lcurl
+nsr_la_SOURCES = nsr.c leader.c etcd-api.c \
+ yajl.c yajl_alloc.c yajl_buf.c yajl_encode.c yajl_gen.c \
+ yajl_lex.c yajl_parser.c yajl_tree.c yajl_version.c
+
+nsr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+# all-templates.c is listed here, not in SOURCES: it is only an input to
+# the generator rule below.
+noinst_HEADERS = nsr-internal.h etcd-api.h all-templates.c \
+ yajl_alloc.h yajl_buf.h yajl_bytestack.h yajl_encode.h \
+ yajl_lex.h yajl_parser.h yajl/yajl_common.h yajl/yajl_gen.h \
+ yajl/yajl_parse.h yajl/yajl_tree.h yajl/yajl_version.h \
+ $(top_srcdir)/xlators/lib/src/libxlator.h \
+ $(top_srcdir)/glusterfsd/src/glusterfsd.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) \
+ -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \
+ -I$(top_srcdir)/rpc/rpc-lib/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+# Header whose fop declarations drive the code generator.
+XLATOR_HEADER = $(top_srcdir)/libglusterfs/src/xlator.h
+
+# nsr-cg.c is generated, so remove it on "make clean".
+CLEANFILES = nsr-cg.c
+
+# Regenerate nsr-cg.c whenever the generator, the header or the templates
+# change.
+nsr-cg.c: gen-fops.py codegen.py $(XLATOR_HEADER) all-templates.c
+ $(PYTHON) ./gen-fops.py $(XLATOR_HEADER) all-templates.c > $@
+
+# Make sure the generated file exists before nsr.c is compiled.
+nsr.lo: nsr-cg.c
+
+uninstall-local:
+ rm -f $(DESTDIR)$(xlatordir)/nsr.so
diff --git a/xlators/cluster/nsr-server/src/all-templates.c b/xlators/cluster/nsr-server/src/all-templates.c
new file mode 100644
index 000000000..541653029
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/all-templates.c
@@ -0,0 +1,299 @@
+/*
+ * You can put anything here - it doesn't even have to be a comment - and it
+ * will be ignored until we reach the first template-name comment.
+ */
+
+
+// template-name read-fop
+/*
+ * Read-type fop: served locally when this node is the leader, or when the
+ * request carries both reconciliation xattrs; otherwise it is refused with
+ * EREMOTE.
+ */
+$TYPE$
+nsr_$NAME$ (call_frame_t *frame, xlator_t *this,
+            $ARGS_LONG$)
+{
+        nsr_private_t   *priv           = this->private;
+        gf_boolean_t    recon_request   = _gf_false;
+        int32_t         term_val, index_val;
+
+        // reads are allowed during reconciliation
+        // TBD: allow "dirty" reads on non-leaders
+        if (xdata) {
+                if ((dict_get_int32(xdata, RECON_TERM_XATTR,
+                                    &term_val) == 0) &&
+                    (dict_get_int32(xdata, RECON_INDEX_XATTR,
+                                    &index_val) == 0)) {
+                        recon_request = _gf_true;
+                }
+        }
+
+        if (priv->leader || (recon_request != _gf_false)) {
+                STACK_WIND (frame, default_$NAME$_cbk,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->$NAME$,
+                            $ARGS_SHORT$);
+                return 0;
+        }
+
+        // neither leader nor reconciliation traffic: refuse
+        STACK_UNWIND_STRICT ($NAME$, frame, -1, EREMOTE,
+                             $DEFAULTS$);
+        return 0;
+}
+
+// template-name read-dispatch
+/* No "dispatch" function needed for $NAME$ */
+
+// template-name read-fan-in
+/* No "fan-in" function needed for $NAME$ */
+
+// template-name read-continue
+/* No "continue" function needed for $NAME$ */
+
+// template-name read-complete
+/* No "complete" function needed for $NAME$ */
+
+// template-name write-fop
+/*
+ * Write-type fop entry point.
+ *
+ * Requests that already carry the NSR term xattr (sent by a leader) or both
+ * reconciliation xattrs are simply passed to the first child.  Anything
+ * else is accepted only on the leader while I/O is not fenced: the current
+ * term is stamped into xdata, a "continue" stub is prepared, and the
+ * request is dispatched (or queued behind an active request on the same
+ * inode when NSR_CG_QUEUE is defined).
+ *
+ * On failure the fop is unwound with -1/op_errno (ENOMEM unless a more
+ * specific error was chosen).
+ *
+ * NOTE(review): when xdata is NULL a dict is created here and no matching
+ * unref is visible in this function -- looks like a leak; confirm against
+ * the stub/dict reference rules.
+ */
+$TYPE$
+nsr_$NAME$ (call_frame_t *frame, xlator_t *this,
+            $ARGS_LONG$)
+{
+        nsr_local_t     *local          = NULL;
+        nsr_private_t   *priv           = this->private;
+        int             op_errno        = ENOMEM;
+        int             from_leader;
+        int             from_recon;
+
+        local = mem_get0(this->local_pool);
+        if (!local) {
+                goto err;
+        }
+#if defined(NSR_CG_NEED_FD)
+        local->fd = fd_ref(fd);
+#else
+        local->fd = NULL;
+#endif
+        INIT_LIST_HEAD(&local->qlinks);
+        frame->local = local;
+
+        if (xdata) {
+                from_leader = !!dict_get(xdata,NSR_TERM_XATTR);
+                from_recon = !!dict_get(xdata,RECON_TERM_XATTR)
+                          && !!dict_get(xdata,RECON_INDEX_XATTR);
+        }
+        else {
+                from_leader = from_recon = _gf_false;
+        }
+
+        // follower/recon path
+        // just send it to local node
+        if (from_leader || from_recon) {
+                STACK_WIND (frame, nsr_$NAME$_complete,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->$NAME$,
+                            $ARGS_SHORT$);
+                return 0;
+        }
+
+        if (!priv->leader || priv->fence_io) {
+                op_errno = EREMOTE;
+                goto err;
+        }
+
+        if (!xdata) {
+                xdata = dict_new();
+                if (!xdata) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to allocate xdata");
+                        goto err;
+                }
+        }
+
+        if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set nsr-term");
+                goto err;
+        }
+
+        local->stub = fop_$NAME$_stub (frame,nsr_$NAME$_continue,
+                                       $ARGS_SHORT$);
+        if (!local->stub) {
+                goto err;
+        }
+
+#if defined(NSR_CG_QUEUE)
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd->inode);
+        if (!ictx) {
+                op_errno = EIO;
+                goto err;
+        }
+        LOCK(&ictx->lock);
+                if (ictx->active) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "queuing request due to conflict");
+                        /*
+                         * TBD: enqueue only for real conflict
+                         *
+                         * Currently we just act like all writes are in
+                         * conflict with one another.  What we should really do
+                         * is check the active/pending queues and defer only if
+                         * there's a conflict there.
+                         *
+                         * It's important to check the pending queue because we
+                         * might have an active request X which conflicts with
+                         * a pending request Y, and this request Z might
+                         * conflict with Y but not X.  If we checked only the
+                         * active queue then Z could jump ahead of Y, which
+                         * would be incorrect.
+                         */
+                        local->qstub = fop_$NAME$_stub (frame,
+                                                        nsr_$NAME$_dispatch,
+                                                        $ARGS_SHORT$);
+                        if (!local->qstub) {
+                                UNLOCK(&ictx->lock);
+                                goto err;
+                        }
+                        list_add_tail(&local->qlinks,&ictx->pqueue);
+                        ++(ictx->pending);
+                        UNLOCK(&ictx->lock);
+                        return 0;
+                }
+                else {
+                        list_add_tail(&local->qlinks,&ictx->aqueue);
+                        ++(ictx->active);
+                }
+        UNLOCK(&ictx->lock);
+#endif
+
+        return nsr_$NAME$_dispatch (frame, this, $ARGS_SHORT$);
+
+err:
+        if (local) {
+                /*
+                 * Detach local from the frame before releasing it.  The
+                 * frame still points at it otherwise, and frame teardown
+                 * would hand the same object back to the pool a second
+                 * time.
+                 */
+                frame->local = NULL;
+                if (local->stub) {
+                        call_stub_destroy(local->stub);
+                }
+                if (local->qstub) {
+                        call_stub_destroy(local->qstub);
+                }
+                if (local->fd) {
+                        fd_unref(local->fd);
+                }
+                mem_put(local);
+        }
+        STACK_UNWIND_STRICT ($NAME$, frame, -1, op_errno,
+                             $DEFAULTS$);
+        return 0;
+}
+
+// template-name write-dispatch
+/*
+ * Leader-side fan-out: send the request to every child after the first and
+ * record how many replies the fan-in callback has to collect.
+ */
+$TYPE$
+nsr_$NAME$_dispatch (call_frame_t *frame, xlator_t *this,
+                     $ARGS_LONG$)
+{
+        nsr_private_t   *priv   = this->private;
+        nsr_local_t     *local  = frame->local;
+        xlator_list_t   *trav   = NULL;
+
+        /*
+         * TBD: unblock pending request(s) if we fail after this point but
+         * before we get to nsr_$NAME$_complete (where that code currently
+         * resides).
+         */
+
+        // one reply expected from each child other than the first
+        local->call_count = priv->n_children - 1;
+
+        trav = this->children->next;
+        while (trav) {
+                STACK_WIND (frame, nsr_$NAME$_fan_in,
+                            trav->xlator, trav->xlator->fops->$NAME$,
+                            $ARGS_SHORT$);
+                trav = trav->next;
+        }
+
+        // TBD: variable Issue count
+        return 0;
+}
+
+// template-name write-fan-in
+/*
+ * Callback for the fan-out winds issued by the dispatch function.  Counts
+ * replies down under the frame lock and resumes the saved stub once the
+ * last one has arrived.
+ */
+$TYPE$
+nsr_$NAME$_fan_in (call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno,
+                   $ARGS_LONG$)
+{
+        nsr_local_t     *local          = frame->local;
+        uint8_t         remaining;
+
+        gf_log (this->name, GF_LOG_TRACE,
+                "op_ret = %d, op_errno = %d\n", op_ret, op_errno);
+
+        LOCK(&frame->lock);
+        {
+                local->call_count -= 1;
+                remaining = local->call_count;
+        }
+        UNLOCK(&frame->lock);
+
+        // TBD: variable Completion count
+        if (remaining != 0) {
+                return 0;
+        }
+
+        call_resume(local->stub);
+        return 0;
+}
+
+// template-name write-continue
+/*
+ * Resumed from the stub saved by the fop entry point (the fan-in callback
+ * calls call_resume on it): performs the operation on the first child,
+ * with completion handled by nsr_$NAME$_complete.
+ */
+$TYPE$
+nsr_$NAME$_continue (call_frame_t *frame, xlator_t *this,
+ $ARGS_LONG$)
+{
+ STACK_WIND (frame, nsr_$NAME$_complete,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->$NAME$,
+ $ARGS_SHORT$);
+ return 0;
+}
+
+// template-name write-complete
+/*
+ * Completion callback for the wind to the first child.  Depending on which
+ * NSR_CG_* symbols are defined for this fop it removes the request from its
+ * inode queue and resumes the next pending request, marks the fd dirty, and
+ * drops the fd reference taken by the fop entry point; finally it unwinds
+ * with the result it was given.
+ *
+ * NOTE(review): "local" is declared only under NSR_CG_NEED_FD but is also
+ * used in the NSR_CG_QUEUE and NSR_CG_FSYNC sections -- presumably the
+ * generator always defines NSR_CG_NEED_FD together with those; confirm.
+ */
+$TYPE$
+nsr_$NAME$_complete (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ $ARGS_LONG$)
+{
+#if defined(NSR_CG_NEED_FD)
+ nsr_local_t *local = frame->local;
+#endif
+
+#if defined(NSR_CG_QUEUE)
+ nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd->inode);
+ nsr_local_t *next;
+ if (ictx) {
+ LOCK(&ictx->lock);
+ /* this request is finished: take it off whichever queue holds it */
+ list_del(&local->qlinks);
+ if (ictx->pending) {
+ /*
+ * TBD: dequeue *all* non-conflicting reqs
+ *
+ * With the stub implementation there can only
+ * be one request active at a time (zero here)
+ * so it's not an issue. In a real
+ * implementation there might still be other
+ * active requests to check against, and
+ * multiple pending requests that could
+ * continue.
+ */
+ gf_log (this->name, GF_LOG_DEBUG,
+ "unblocking next request");
+ --(ictx->pending);
+ /* promote the oldest pending request to the active queue */
+ next = list_entry (ictx->pqueue.next,
+ nsr_local_t, qlinks);
+ list_del(&next->qlinks);
+ list_add_tail(&next->qlinks,&ictx->aqueue);
+ /* NOTE(review): resumed while ictx->lock is still held */
+ call_resume(next->qstub);
+ }
+ else {
+ /* nothing waiting: the active count drops instead */
+ --(ictx->active);
+ }
+ UNLOCK(&ictx->lock);
+ }
+#endif
+
+#if defined(NSR_CG_FSYNC)
+ nsr_mark_fd_dirty(this,local);
+#endif
+
+#if defined(NSR_CG_NEED_FD)
+ fd_unref(local->fd);
+#endif
+
+ STACK_UNWIND_STRICT ($NAME$, frame, op_ret, op_errno,
+ $ARGS_SHORT$);
+ return 0;
+
+}
diff --git a/xlators/cluster/nsr-server/src/codegen.py b/xlators/cluster/nsr-server/src/codegen.py
new file mode 100644
index 000000000..709f5662f
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/codegen.py
@@ -0,0 +1,174 @@
+#!/usr/bin/python
+
+# This module lets us auto-generate boilerplate versions of fops and cbks,
+# both for the client side and (eventually) on the server side as well. This
+# allows us to implement common logic (e.g. leader fan-out and sequencing)
+# once, without all the problems that come with copying and pasting the same
+# code into dozens of functions (or failing to).
+#
+# I've tried to make this code pretty generic, since it's already likely to
+# be used multiple ways within NSR. Really, we should use something like this
+# to generate defaults.[ch] as well, to avoid the same sorts of mismatches
+# that we've already seen and to which this approach makes NSR immune. That
+# would require using something other than defaults.h as the input, but that
+# format could be even simpler so that's a good thing too.
+
+
+import re
+import sys
+
+decl_re = re.compile("([a-z0-9_]+)$")
+tmpl_re = re.compile("// template-name (.*)")
+
+class CodeGenerator:
+
+ def __init__ (self):
+ self.decls = {}
+ self.skip = 0
+ self.templates = {}
+ self.make_defaults = self._make_defaults
+
+ # Redefine this to preprocess the name in a declaration, e.g.
+ # fop_lookup_t => nsrc_lookup
+ def munge_name (self, orig):
+ return orig
+
+ # By default, this will convert the argument string into a sequence of
+ # (type, name) tuples minus the first self.skip (default zero) arguments.
+ # You can redefine it to skip the conversion, do a different conversion,
+ # or rearrange the arguments however you like.
+ def munge_args (self, orig):
+ args = []
+ for decl in orig.strip("(); ").split(","):
+ m = decl_re.search(decl)
+ if m:
+ args.append((m.group(1),decl[:m.start(1)].strip()))
+ else:
+ raise RuntimeError("can't split %s into type+name"%decl)
+ return args[self.skip:]
+
+ def add_decl (self, fname, ftype, fargs):
+ self.decls[self.munge_name(fname)] = (ftype, self.munge_args(fargs))
+
+ def parse_decls (self, path, pattern):
+ regex = re.compile(pattern)
+ f = open(path,"r")
+ have_decl = False
+ while True:
+ line = f.readline()
+ if not line:
+ break
+ m = regex.search(line)
+ if m:
+ if have_decl:
+ self.add_decl(f_name,f_type,f_args)
+ f_name = m.group(2)
+ f_type = m.group(1)
+ f_args = line[m.end(0):-1].strip()
+ if f_args.rfind(")") >= 0:
+ self.add_decl(f_name,f_type,f_args)
+ else:
+ have_decl = True
+ elif have_decl:
+ if line.strip() == "":
+ self.add_decl(f_name,f_type,f_args)
+ have_decl = False
+ else:
+ f_args += " "
+ f_args += line[:-1].strip()
+ if have_decl:
+ self.add_decl(f_name,f_type,f_args)
+
+ # Legacy function (yeah, already) to load a single template. If you're
+ # using multiple templates, you're better off loading them all from one
+ # file using load_templates (note plural) instead.
+ def load_template (self, name, path):
+ self.templates[name] = open(path,"r").readlines()
+
+ # Load multiple templates. Each is introduced by a special comment of
+ # the form
+ #
+ # // template-name xyz
+ #
+ # One side effect is that the block before the first such comment will be
+ # ignored. This seems like it might be useful some day so I'll leave it
+ # in, but if people trip over it maybe it will change.
+ #
+ # It is recommended to define templates in expected execution order, to
+ # make the result more readable than the inverted order (e.g. callback
+ # then fop) common in the rest of our code.
+ def load_templates (self, path):
+ t_name = None
+ for line in open(path,"r").readlines():
+ if not line:
+ break
+ m = tmpl_re.match(line)
+ if m:
+ if t_name:
+ self.templates[t_name] = t_contents
+ t_name = m.group(1).strip()
+ t_contents = []
+ elif t_name:
+ t_contents.append(line)
+ if t_name:
+ self.templates[t_name] = t_contents
+
+ # Emit the template, with the following expansions:
+ #
+ # $NAME$ => function name (as passed in)
+ # $TYPE$ => function return value
+ # $ARGS_SHORT$ => argument list, including types
+ # $ARGS_LONG$ => argument list, *not* including types
+ # $DEFAULTS$ => default callback args (see below)
+ #
+ # The $DEFAULTS$ substitution is for the case where a fop (which has one
+ # set of arguments) needs to signal an error via STACK_UNWIND (which
+ # requires a different set of arguments). In this case we look up the
+ # argument list for the opposite direction, using self.make_defaults which
+ # the user must explicitly set to the method for the opposite direction.
+ # If an argument is a pointer, we replace it with NULL; otherwise we
+ # replace it with zero. It's a hack, but it's the only thing we do that
+ # doesn't require specific knowledge of our environment and the specific
+ # call we're handling. If this doesn't suffice, we'll have to add
+ # something like $ARG0$ which can be passed in for specific cases.
+ def emit (self, f_name, tmpl):
+ args = self.decls[f_name][1]
+ zipper = lambda x: x[0]
+ a_short = ", ".join(map(zipper,args))
+ zipper = lambda x: x[1] + " " + x[0]
+ a_long = ", ".join(map(zipper,args))
+ for line in self.templates[tmpl]:
+ line = line.replace("$NAME$",f_name)
+ line = line.replace("$TYPE$",self.decls[f_name][0])
+ line = line.replace("$ARGS_SHORT$",a_short)
+ line = line.replace("$ARGS_LONG$",a_long)
+ line = line.replace("$DEFAULTS$",self.make_defaults(f_name))
+ print(line.rstrip())
+
+ def _make_defaults (self, f_name):
+ result = []
+ for arg in self.decls[f_name][1]:
+ if arg[1][-1] == "*":
+ result.append("NULL")
+ else:
+ result.append("0")
+ return ", ".join(result)
+
+if __name__ == "__main__":
+ type_re = "([a-z_0-9]+)"
+ name_re = "\(\*fop_([a-z0-9]+)_t\)"
+ full_re = type_re + " *" + name_re
+ cg = CodeGenerator()
+ cg.skip = 2
+ cg.parse_decls(sys.argv[1],full_re)
+ """
+ for k, v in cg.decls.iteritems():
+ print("=== %s" % k)
+ print(" return type %s" % v[0])
+ for arg in v[1]:
+ print(" arg %s (type %s)" % arg)
+ """
+ cg.load_template("fop",sys.argv[2])
+ cg.emit("lookup","fop")
+ cg.emit("rename","fop")
+ cg.emit("setxattr","fop")
diff --git a/xlators/cluster/nsr-server/src/codegen.pyc b/xlators/cluster/nsr-server/src/codegen.pyc
new file mode 100644
index 000000000..388b517df
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/codegen.pyc
Binary files differ
diff --git a/xlators/cluster/nsr-server/src/etcd-api.c b/xlators/cluster/nsr-server/src/etcd-api.c
new file mode 100644
index 000000000..a46a40745
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/etcd-api.c
@@ -0,0 +1,586 @@
+/*
+ * Copyright (c) 2013, Red Hat
+ * All rights reserved.
+
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer. Redistributions in binary
+ * form must reproduce the above copyright notice, this list of conditions and
+ * the following disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <curl/curl.h>
+#include <yajl/yajl_tree.h>
+
+
+#include "etcd-api.h"
+
+#define DEFAULT_ETCD_PORT 4001
+#define SL_DELIM "\n\r\t ,;"
+
+/*
+ * This shuts up gcc, which complains about "null argument where non-null
+ * required" when we pass the result to strdup.
+ */
+#define MY_YAJL_GET_STRING(v) (YAJL_IS_STRING(v) ? (v)->u.string : "fubar")
+
+typedef struct {
+ etcd_server *servers;
+} _etcd_session;
+
+typedef struct {
+ char *key;
+ char *value;
+ int *index_in; /* pointer so NULL can be special */
+ int index_out; /* NULL would be meaningless */
+} etcd_watch_t;
+
+typedef size_t curl_callback_t (void *, size_t, size_t, void *);
+
+int g_inited = 0;
+
+#if defined(DEBUG)
+/*
+ * Debug builds: print "<intro>: <curl error text>" on stdout.
+ */
+void
+print_curl_error (char *intro, CURLcode res)
+{
+ printf("%s: %s\n",intro,curl_easy_strerror(res));
+}
+#else
+/* Non-debug builds: the call expands to nothing. */
+#define print_curl_error(intro,res)
+#endif
+
+
+/*
+ * Create a session object that refers to the caller's server list.  The
+ * list is not copied.  libcurl is initialised globally on the first call.
+ * Returns NULL if the session cannot be allocated.
+ */
+etcd_session
+etcd_open (etcd_server *server_list)
+{
+        _etcd_session *sess;
+
+        if (g_inited == 0) {
+                curl_global_init(CURL_GLOBAL_ALL);
+                g_inited = 1;
+        }
+
+        /*
+         * Some day we'll set up more persistent connections, and keep track
+         * (via redirects) of which server is leader so that we can always
+         * try it first.  For now we just push that to the individual request
+         * functions, which do the most brain-dead thing that can work.
+         */
+        sess = malloc(sizeof(*sess));
+        if (sess != NULL) {
+                sess->servers = server_list;
+        }
+
+        return sess;
+}
+
+
+/*
+ * Release a session created by etcd_open.  Only the session object itself
+ * is freed; the server list it points at is left untouched.
+ */
+void
+etcd_close (etcd_session this)
+{
+ free(this);
+}
+
+
+/*
+ * curl write callback for GET requests.
+ *
+ * ptr/size/nmemb describe the received bytes; stream is a (char **) that
+ * receives a heap copy of the JSON "value" string when one is present.
+ * Any string already stored there is released first, so the target must
+ * start out as NULL or a pointer obtained from malloc/strdup.
+ *
+ * Returns the number of bytes consumed, or 0 (which makes curl abort the
+ * transfer) if the data cannot be copied for parsing.
+ */
+size_t
+parse_get_response (void *ptr, size_t size, size_t nmemb, void *stream)
+{
+        static const char *path[] = { "value", NULL };
+        size_t          len     = size * nmemb;
+        char            **out   = stream;
+        char            *text;
+        yajl_val        node;
+        yajl_val        value;
+
+        /*
+         * The buffer handed over by curl is not NUL-terminated, but the
+         * tree parser expects a C string, so parse a terminated copy.
+         */
+        text = malloc(len + 1);
+        if (!text) {
+                return 0;
+        }
+        memcpy(text,ptr,len);
+        text[len] = '\0';
+
+        node = yajl_tree_parse(text,NULL,0);
+        free(text);
+
+        if (node) {
+                value = yajl_tree_get(node,path,yajl_t_string);
+                if (value) {
+                        /*
+                         * YAJL probably copied it once, now we're going to
+                         * copy it again.  If anybody really cares for such
+                         * small and infrequently used values, we'd have to
+                         * do something much more complicated (like using the
+                         * stream interface) to avoid the copy.  Right now
+                         * it's just not worth it.
+                         */
+                        free(*out);
+                        *out = strdup(MY_YAJL_GET_STRING(value));
+                }
+                /* the tree is ours to release once the value is copied */
+                yajl_tree_free(node);
+        }
+
+        return len;
+}
+
+
+/*
+ * Issue one HTTP request for "<prefix><key>" against a single server.
+ *
+ * When post is non-NULL it is sent as the POST body; otherwise curl's
+ * default request is used.  Received data is handed to cb together with
+ * stream.  Returns ETCD_OK if the transfer itself succeeded, ETCD_WTF
+ * otherwise.
+ */
+etcd_result
+etcd_get_one (_etcd_session *this, char *key, etcd_server *srv, char *prefix,
+              char *post, curl_callback_t cb, char **stream)
+{
+        etcd_result     status  = ETCD_WTF;
+        char            *request_url;
+        CURL            *handle;
+        CURLcode        rc;
+
+        if (asprintf(&request_url,"http://%s:%u/v1/%s%s",
+                     srv->host,srv->port,prefix,key) < 0) {
+                return status;
+        }
+
+        handle = curl_easy_init();
+        if (handle) {
+                /* TBD: add error checking for these */
+                curl_easy_setopt(handle,CURLOPT_URL,request_url);
+                curl_easy_setopt(handle,CURLOPT_FOLLOWLOCATION,1L);
+                curl_easy_setopt(handle,CURLOPT_WRITEFUNCTION,cb);
+                curl_easy_setopt(handle,CURLOPT_WRITEDATA,stream);
+                if (post) {
+                        curl_easy_setopt(handle,CURLOPT_POST,1L);
+                        curl_easy_setopt(handle,CURLOPT_POSTFIELDS,post);
+                }
+#if defined(DEBUG)
+                curl_easy_setopt(handle,CURLOPT_VERBOSE,1L);
+#endif
+
+                rc = curl_easy_perform(handle);
+                if (rc == CURLE_OK) {
+                        status = ETCD_OK;
+                }
+                else {
+                        print_curl_error("perform",rc);
+                }
+
+                curl_easy_cleanup(handle);
+        }
+
+        free(request_url);
+        return status;
+}
+
+
+/*
+ * Fetch the value stored under key, trying each server in turn.  Returns a
+ * heap string from the first server that yields a value, or NULL if none
+ * does.
+ */
+char *
+etcd_get (etcd_session this_as_void, char *key)
+{
+        _etcd_session   *this   = this_as_void;
+        etcd_server     *srv    = this->servers;
+        char            *value  = NULL;
+        etcd_result     res;
+
+        while (srv->host) {
+                res = etcd_get_one(this,key,srv,"keys/",NULL,
+                                   parse_get_response,&value);
+                if (value && (res == ETCD_OK)) {
+                        return value;
+                }
+                ++srv;
+        }
+
+        return NULL;
+}
+
+
+/*
+ * curl write callback for WATCH requests.
+ *
+ * stream is an etcd_watch_t.  From the JSON reply this stores "index" in
+ * index_out and heap copies of "key" and "value" in the matching fields;
+ * value is set to NULL when the reply has none.  Strings already stored in
+ * the structure are released before being replaced, so key and value must
+ * start out as NULL or heap pointers.
+ *
+ * Returns the number of bytes consumed, or 0 (which makes curl abort the
+ * transfer) if the data cannot be copied for parsing.
+ */
+size_t
+parse_watch_response (void *ptr, size_t size, size_t nmemb, void *stream)
+{
+        static const char *i_path[] = { "index", NULL };
+        static const char *k_path[] = { "key", NULL };
+        static const char *v_path[] = { "value", NULL };
+        etcd_watch_t    *watch  = stream;
+        size_t          len     = size * nmemb;
+        char            *text;
+        yajl_val        node;
+        yajl_val        value;
+
+        /* curl data is not NUL-terminated; parse a terminated copy */
+        text = malloc(len + 1);
+        if (!text) {
+                return 0;
+        }
+        memcpy(text,ptr,len);
+        text[len] = '\0';
+
+        node = yajl_tree_parse(text,NULL,0);
+        free(text);
+
+        if (node) {
+                value = yajl_tree_get(node,i_path,yajl_t_number);
+                if (value) {
+                        watch->index_out = strtoul(YAJL_GET_NUMBER(value),
+                                                   NULL,10);
+                }
+                value = yajl_tree_get(node,k_path,yajl_t_string);
+                if (value) {
+                        free(watch->key);
+                        watch->key = strdup(MY_YAJL_GET_STRING(value));
+                }
+                value = yajl_tree_get(node,v_path,yajl_t_string);
+                free(watch->value);
+                if (value) {
+                        watch->value = strdup(MY_YAJL_GET_STRING(value));
+                }
+                else {
+                        /* Must have been a DELETE. */
+                        watch->value = NULL;
+                }
+                yajl_tree_free(node);
+        }
+
+        return len;
+}
+
+
+/*
+ * Watch the key prefix pfx, trying each server in turn until one delivers
+ * a change.
+ *
+ * index_in, when non-NULL, is sent to the server as "index=<n>".  On
+ * success *keyp and *valuep receive heap strings the caller must free
+ * (*valuep may be NULL) and *index_out the reported index; any of the
+ * three out-pointers may be NULL if the caller is not interested.
+ *
+ * Returns the result of the last request attempted, or ETCD_WTF if the
+ * server list is empty or the request body cannot be built.
+ */
+etcd_result
+etcd_watch (etcd_session this_as_void, char *pfx,
+            char **keyp, char **valuep, int *index_in, int *index_out)
+{
+        _etcd_session   *this   = this_as_void;
+        etcd_server     *srv;
+        etcd_result     res     = ETCD_WTF;
+        etcd_watch_t    watch;
+        char            *post   = NULL;
+
+        if (index_in) {
+                if (asprintf(&post,"index=%d",*index_in) < 0) {
+                        return ETCD_WTF;
+                }
+        }
+
+        memset(&watch,0,sizeof(watch));
+        watch.index_in = index_in;
+
+        for (srv = this->servers; srv->host; ++srv) {
+                res = etcd_get_one(this,pfx,srv,"watch/",post,
+                                   parse_watch_response,(char **)&watch);
+                if ((res == ETCD_OK) && watch.key) {
+                        /* hand over ownership, or drop what isn't wanted */
+                        if (keyp) {
+                                *keyp = watch.key;
+                        }
+                        else {
+                                free(watch.key);
+                        }
+                        if (valuep) {
+                                *valuep = watch.value;
+                        }
+                        else {
+                                free(watch.value);
+                        }
+                        if (index_out) {
+                                *index_out = watch.index_out;
+                        }
+                        break;
+                }
+                /* attempt rejected: discard anything it left behind */
+                free(watch.key);
+                free(watch.value);
+                watch.key = NULL;
+                watch.value = NULL;
+        }
+
+        free(post);
+        return res;
+}
+
+
+/*
+ * curl write callback for SET/DELETE requests.
+ *
+ * stream is an (etcd_result *).  It is set to ETCD_OK when the JSON reply
+ * contains a numeric "index" and to ETCD_PROTOCOL_ERROR otherwise
+ * (including when the reply cannot be parsed).  Always reports the whole
+ * buffer as consumed.
+ */
+size_t
+parse_set_response (void *ptr, size_t size, size_t nmemb, void *stream)
+{
+        /*
+         * Success responses contain prevValue and index.  Failure responses
+         * contain errorCode and cause.  Among all these, index seems to be the
+         * one we're most likely to need later, so look for that.
+         */
+        static const char *path[] = { "index", NULL };
+        etcd_result     res     = ETCD_PROTOCOL_ERROR;
+        size_t          len     = size * nmemb;
+        char            *text;
+        yajl_val        node;
+
+        /* curl data is not NUL-terminated; parse a terminated copy */
+        text = malloc(len + 1);
+        if (text) {
+                memcpy(text,ptr,len);
+                text[len] = '\0';
+
+                node = yajl_tree_parse(text,NULL,0);
+                free(text);
+                if (node) {
+                        if (yajl_tree_get(node,path,yajl_t_number)) {
+                                res = ETCD_OK;
+                        }
+                        yajl_tree_free(node);
+                }
+        }
+
+        *((etcd_result *)stream) = res;
+        return len;
+}
+
+
+/*
+ * Store or delete one key on a single server.
+ *
+ * NB: a null value means to use HTTP DELETE and ignore precond/ttl
+ *
+ * With a value, the POST body is "value=<value>", followed by
+ * ";prevValue=<precond>" and ";ttl=<ttl>" when those are given.  The
+ * result starts out as ETCD_WTF and is whatever parse_set_response stores
+ * once a reply has been received.
+ */
+etcd_result
+etcd_put_one (_etcd_session *this, char *key, char *value,
+              char *precond, unsigned int ttl, etcd_server *srv)
+{
+        etcd_result     outcome = ETCD_WTF;
+        char            *target;
+        char            *body   = NULL;
+        char            *longer;
+        CURL            *handle;
+        CURLcode        rc;
+
+        if (asprintf(&target,"http://%s:%u/v1/keys/%s",
+                     srv->host,srv->port,key) < 0) {
+                goto done;
+        }
+
+        if (value) {
+                if (asprintf(&body,"value=%s",value) < 0) {
+                        goto free_url;
+                }
+
+                if (precond) {
+                        if (asprintf(&longer,"%s;prevValue=%s",body,
+                                     precond) < 0) {
+                                goto free_contents;
+                        }
+                        free(body);
+                        body = longer;
+                }
+
+                if (ttl) {
+                        if (asprintf(&longer,"%s;ttl=%u",body,ttl) < 0) {
+                                goto free_contents;
+                        }
+                        free(body);
+                        body = longer;
+                }
+        }
+
+        handle = curl_easy_init();
+        if (!handle) {
+                goto free_contents;
+        }
+
+        /* TBD: add error checking for these */
+        curl_easy_setopt(handle,CURLOPT_URL,target);
+        curl_easy_setopt(handle,CURLOPT_FOLLOWLOCATION,1L);
+        curl_easy_setopt(handle,CURLOPT_WRITEFUNCTION,parse_set_response);
+        curl_easy_setopt(handle,CURLOPT_WRITEDATA,&outcome);
+        if (value) {
+                /*
+                 * CURLOPT_HTTPPOST would be easier, but it looks like etcd
+                 * will barf on that.  Sigh.
+                 */
+                curl_easy_setopt(handle,CURLOPT_POST,1L);
+                curl_easy_setopt(handle,CURLOPT_POSTFIELDS,body);
+        }
+        else {
+                /* This must be a DELETE. */
+                curl_easy_setopt(handle,CURLOPT_CUSTOMREQUEST,"DELETE");
+        }
+#if defined(DEBUG)
+        curl_easy_setopt(handle,CURLOPT_VERBOSE,1L);
+#endif
+
+        /*
+         * If the request succeeded, or at least got to the server and failed
+         * there, parse_set_response should have set the outcome appropriately.
+         */
+        rc = curl_easy_perform(handle);
+        if (rc != CURLE_OK) {
+                print_curl_error("perform",rc);
+        }
+
+        curl_easy_cleanup(handle);
+free_contents:
+        free(body);     /* still NULL for a delete, which is fine */
+free_url:
+        free(target);
+done:
+        return outcome;
+}
+
+
+/*
+ * Set key to value (optionally conditional on precond, optionally with a
+ * ttl), trying each server until one answers.  Returns ETCD_WTF when no
+ * server produced a definite answer.
+ */
+etcd_result
+etcd_set (etcd_session this_as_void, char *key, char *value,
+          char *precond, unsigned int ttl)
+{
+        _etcd_session   *this   = this_as_void;
+        etcd_server     *srv    = this->servers;
+        etcd_result     res;
+
+        while (srv->host) {
+                res = etcd_put_one(this,key,value,precond,ttl,srv);
+                /*
+                 * Protocol errors are likely to be things like precondition
+                 * failures, which won't be helped by retrying on another
+                 * server, so only other failures move on to the next one.
+                 */
+                if ((res != ETCD_OK) && (res != ETCD_PROTOCOL_ERROR)) {
+                        ++srv;
+                        continue;
+                }
+                return res;
+        }
+
+        return ETCD_WTF;
+}
+
+
+/*
+ * Delete key, trying each server until one reports success.
+ *
+ * This uses the same path and status checks as SET, but with a different HTTP
+ * command instead of data.  Precondition and TTL are obviously not used in
+ * this case, though a conditional delete would be a cool feature for etcd.  I
+ * think you can get a timed delete by doing a conditional set to the current
+ * value with a TTL, but I haven't actually tried it.
+ *
+ * Returns ETCD_OK on success, otherwise the result from the last server
+ * tried (ETCD_WTF when the server list is empty).
+ */
+etcd_result
+etcd_delete (etcd_session this_as_void, char *key)
+{
+        _etcd_session   *this   = this_as_void;
+        etcd_server     *srv;
+        /* defined result even if the loop body never runs */
+        etcd_result     res     = ETCD_WTF;
+
+        for (srv = this->servers; srv->host; ++srv) {
+                res = etcd_put_one(this,key,NULL,NULL,0,srv);
+                if (res == ETCD_OK) {
+                        break;
+                }
+        }
+
+        return res;
+}
+
+
+/*
+ * curl write callback that keeps the raw reply text.
+ *
+ * stream is a (char **) that receives a NUL-terminated heap copy of the
+ * received bytes (NULL if the copy cannot be allocated).  A string already
+ * stored there is released first, so the target must start out as NULL or
+ * a heap pointer.
+ */
+size_t
+store_leader (void *ptr, size_t size, size_t nmemb, void *stream)
+{
+        char    **out   = stream;
+        size_t  len     = size * nmemb;
+
+        free(*out);
+        /* curl data is not NUL-terminated, so copy exactly len bytes */
+        *out = strndup(ptr,len);
+        return len;
+}
+
+
+/*
+ * Ask each server in turn for "leader" and return the first reply text
+ * obtained as a heap string, or NULL if no server answers.
+ */
+char *
+etcd_leader (etcd_session this_as_void)
+{
+        _etcd_session   *this   = this_as_void;
+        etcd_server     *srv    = this->servers;
+        char            *value  = NULL;
+        etcd_result     res;
+
+        while (srv->host) {
+                res = etcd_get_one(this,"leader",srv,"",NULL,
+                                   store_leader,&value);
+                if (value && (res == ETCD_OK)) {
+                        return value;
+                }
+                ++srv;
+        }
+
+        return NULL;
+}
+
+
+/*
+ * Free a server list: every host string up to the terminating entry (the
+ * one whose host is NULL), then the array itself.
+ */
+void
+free_sl (etcd_server *server_list)
+{
+        etcd_server *entry = server_list;
+
+        while (entry->host) {
+                free(entry->host);
+                ++entry;
+        }
+        free(server_list);
+}
+
+
+/*
+ * Length of the leading run of characters in text whose membership in cset
+ * equals result (1: characters in cset, 0: characters not in cset).
+ */
+int
+_count_matching (char *text, char *cset, int result)
+{
+        int n = 0;
+
+        while (text[n] && ((strchr(cset,text[n]) != NULL) == result)) {
+                ++n;
+        }
+
+        return n;
+}
+
+#define count_matching(t,cs) _count_matching(t,cs,1)
+#define count_nonmatching(t,cs) _count_matching(t,cs,0)
+
+
+/*
+ * Open a session from a string of server names.
+ *
+ * server_names holds "host" or "host:port" entries separated by any run of
+ * the SL_DELIM characters; entries without a port get DEFAULT_ETCD_PORT.
+ * The server list is allocated here and must be released together with
+ * the session via etcd_close_str.
+ *
+ * Returns NULL if the string is NULL or holds no entries, or if memory
+ * runs out.
+ */
+etcd_session
+etcd_open_str (char *server_names)
+{
+        char            *snp;
+        int             run_len;
+        int             host_len;
+        size_t          num_servers;
+        etcd_server     *server_list;
+        etcd_session    session;
+
+        if (!server_names) {
+                return NULL;
+        }
+
+        /*
+         * Yeah, we iterate over the string twice so we can allocate an
+         * appropriately sized array instead of turning it into a linked list.
+         * Unfortunately this means we can't use strtok* which is destructive
+         * with no platform-independent way to reverse the destructive effects.
+         */
+
+        num_servers = 0;
+        snp = server_names;
+        while (*snp) {
+                run_len = count_nonmatching(snp,SL_DELIM);
+                if (!run_len) {
+                        snp += count_matching(snp,SL_DELIM);
+                        continue;
+                }
+                ++num_servers;
+                snp += run_len;
+        }
+
+        if (!num_servers) {
+                return NULL;
+        }
+
+        /* one extra, zeroed entry terminates the list */
+        server_list = calloc(num_servers+1,sizeof(*server_list));
+        if (!server_list) {
+                return NULL;
+        }
+        num_servers = 0;
+
+        snp = server_names;
+        while (*snp) {
+                run_len = count_nonmatching(snp,SL_DELIM);
+                if (!run_len) {
+                        snp += count_matching(snp,SL_DELIM);
+                        continue;
+                }
+                /*
+                 * The colon search is not bounded by this entry, so clamp
+                 * it; a colon found beyond run_len belongs to a later one.
+                 */
+                host_len = count_nonmatching(snp,":");
+                if (host_len > run_len) {
+                        host_len = run_len;
+                }
+                /* host_len stops before any colon, so "host:" gives "host" */
+                server_list[num_servers].host = strndup(snp,host_len);
+                if (!server_list[num_servers].host) {
+                        /* entries filled so far end at this NULL host */
+                        free_sl(server_list);
+                        return NULL;
+                }
+                if ((run_len - host_len) > 1) {
+                        server_list[num_servers].port = (unsigned short)
+                                strtoul(snp+host_len+1,NULL,10);
+                }
+                else {
+                        server_list[num_servers].port = DEFAULT_ETCD_PORT;
+                }
+                ++num_servers;
+                snp += run_len;
+        }
+
+        session = etcd_open(server_list);
+        if (!session) {
+                free_sl(server_list);
+        }
+        return session;
+}
+
+
+/*
+ * Counterpart of etcd_open_str: frees the server list owned by the session
+ * and then the session itself.
+ */
+void
+etcd_close_str (etcd_session this)
+{
+        _etcd_session *sess = this;
+
+        free_sl(sess->servers);
+        etcd_close(this);
+}
diff --git a/xlators/cluster/nsr-server/src/etcd-api.h b/xlators/cluster/nsr-server/src/etcd-api.h
new file mode 100644
index 000000000..df8babd55
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/etcd-api.h
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2013, Red Hat
+ * All rights reserved.
+
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer. Redistributions in binary
+ * form must reproduce the above copyright notice, this list of conditions and
+ * the following disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Basic types: result codes, the description of an etcd server, and the
+ * opaque session handle. For now a server description just includes the
+ * name and port, but some day it might include other stuff like SSL
+ * certificate information.
+ */
+
+typedef enum {
+ ETCD_OK = 0,
+ ETCD_PROTOCOL_ERROR,
+ /* TBD: add other error categories here */
+ ETCD_WTF /* anything we can't easily categorize */
+} etcd_result;
+
+typedef struct {
+ char *host;
+ unsigned short port;
+} etcd_server;
+
+typedef void *etcd_session;
+
+/*
+ * etcd_open
+ *
+ * Establish a session to an etcd cluster, with automatic reconnection and
+ * so on.
+ *
+ * server_list
+ * Array of etcd_server structures, with the last having host=NULL. The
+ * caller is responsible for ensuring that this remains valid as long as
+ * the session exists.
+ */
+etcd_session etcd_open (etcd_server *server_list);
+
+
+/*
+ * etcd_open_str
+ *
+ * Same as etcd_open, except that the servers are specified as a list of
+ * host:port strings, separated by comma/semicolon or whitespace.
+ */
+etcd_session etcd_open_str (char *server_names);
+
+
+/*
+ * etcd_close
+ *
+ * Terminate a session, closing connections and freeing memory (or any other
+ * resources) associated with it.
+ */
+void etcd_close (etcd_session this);
+
+
+/*
+ * etcd_close_str
+ *
+ * Same as etcd_close, but also free the server list as etcd_open_str would
+ * have allocated it.
+ */
+void etcd_close_str (etcd_session this_as_void);
+
+
+/*
+ * etcd_get
+ *
+ * Fetch a key from one of the servers in a session. The return value is a
+ * newly allocated string, which must be freed by the caller.
+ *
+ * key
+ * The etcd key (path) to fetch.
+ */
+char * etcd_get (etcd_session this, char *key);
+
+
+/*
+ * etcd_watch
+ * Watch the set of keys matching a prefix.
+ *
+ * pfx
+ * The etcd key prefix (like a path) to watch.
+ *
+ * keyp
+ * Space for a pointer to the key that was added/modified/deleted.
+ *
+ * valuep
+ * Space for a pointer to the value if a key was added/modified. A delete
+ * is signified by this being set to NULL.
+ *
+ * index_in
+ * Pointer to an index to be used for *issuing* the watch request, or
+ * NULL for a watch without an index.
+ *
+ * index_out
+ * Pointer to space for an index *returned* by etcd, or NULL to mean don't
+ * bother.
+ *
+ * In normal usage, index_in will be NULL and index_out will be set to receive
+ * the index for the first watch. Subsequently, index_in will be set to
+ * provide the previous index (plus one) and index_out will be set to receive
+ * the next. It's entirely legitimate to point both at the same variable.
+ */
+
+etcd_result etcd_watch (etcd_session this, char *pfx,
+ char **keyp, char **valuep,
+ int *index_in, int *index_out);
+
+
+/*
+ * etcd_set
+ *
+ * Write a key, with optional TTL and/or previous value (as a precondition).
+ *
+ * key
+ * The etcd key (path) to set.
+ *
+ * value
+ * New value as a null-terminated string. Unlike etcd_get, we can derive
+ * the length ourselves instead of needing it to be passed in separately.
+ *
+ * precond
+ * Required previous value as a null-terminated string, or NULL to mean
+ * an unconditional set.
+ *
+ * ttl
+ * Time in seconds after which the value will automatically expire and be
+ * deleted, or zero to mean no auto-expiration.
+ */
+
+etcd_result etcd_set (etcd_session this, char *key, char *value,
+ char *precond, unsigned int ttl);
+
+
+/*
+ * etcd_delete
+ *
+ * Delete a key from one of the servers in a session.
+ *
+ * key
+ * The etcd key (path) to delete.
+ */
+
+etcd_result etcd_delete (etcd_session this, char *key);
+
+
+/*
+ * etcd_leader
+ *
+ * Get the identity of the current leader.
+ */
+
+char * etcd_leader (etcd_session this_as_void);
diff --git a/xlators/cluster/nsr-server/src/gen-fops.py b/xlators/cluster/nsr-server/src/gen-fops.py
new file mode 100644
index 000000000..d0f88d370
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/gen-fops.py
@@ -0,0 +1,123 @@
+#!/usr/bin/python
+
+# This script generates the boilerplate versions of most fops and cbks in the
+# server. This allows the details of leadership-status checking, sequencing
+# between leader and followers (including fan-out), and basic error checking
+# to be centralized in one place, with per-operation code kept to a minimum.
+
+import sys
+import codegen # project-local generator module (codegen.py in this directory)
+
+type_re = "([a-z_0-9]+)"
+name_re = "\(\*fop_([a-z0-9]+)_t\)"
+full_re = type_re + " *" + name_re
+fop_cg = codegen.CodeGenerator()
+fop_cg.skip = 2
+fop_cg.parse_decls(sys.argv[1],full_re) # argv[1]: file holding the fop_*_t declarations
+fop_cg.load_templates(sys.argv[2]) # argv[2]: template file
+
+# Use the multi-template feature to generate multiple callbacks from the same
+# parsed declarations.
+type_re = "([a-z_0-9]+)"
+name_re = "\(\*fop_([a-z0-9]+)_cbk_t\)"
+full_re = type_re + " *" + name_re
+cbk_cg = codegen.CodeGenerator()
+cbk_cg.skip = 5
+cbk_cg.parse_decls(sys.argv[1],full_re)
+cbk_cg.load_templates(sys.argv[2])
+
+# This is a nasty little trick to handle the case where a generated fop needs
+# a set of default arguments for the corresponding callback.
+fop_cg.make_defaults = cbk_cg.make_defaults
+
+# We need two types of templates. The first, for pure read operations, just
+# needs to do a simple am-i-leader check (augmented to allow dirty reads).
+# The second, for pure writes, needs to do fan-out to followers between those
+# initial checks and local execution. There are other operations that don't
+# fit neatly into either category - e.g. lock ops or fsync - so we'll just have
+# to handle those manually. The table thus includes entries only for those we
+# can categorize. The special cases, plus any new operations we've never even
+# heard of, aren't in there.
+#
+# Anything after the kind in a table value (e.g. "write,fsync,queue") is a
+# flag. For each flag the loop below brackets the emitted code with
+# "#define NSR_CG_<FLAG>" and "#undef NSR_CG_<FLAG>", and "fsync" or "queue"
+# also implies "need_fd". In the generated code NSR_CG_NEED_FD guards taking
+# and dropping a reference on the fd, which is how the fd gets from the fop
+# to the final callback. I know that's icky and hard to follow. Sorry. It
+# would be easier if the translator infrastructure used a request block
+# instead of per-call argument lists, but we're stuck with hacks like this.
+
+fop_table = {
+ "access": "read",
+ "create": "write",
+ "discard": "write",
+# "entrylk": "read",
+ "fallocate": "write",
+# "fentrylk": "read",
+ "fgetxattr": "read",
+# "finodelk": "read",
+# "flush": "read",
+ "fremovexattr": "write",
+ "fsetattr": "write",
+ "fsetxattr": "write",
+ "fstat": "read",
+# "fsync": "read",
+# "fsyncdir": "read",
+ "ftruncate": "write",
+ "fxattrop": "write",
+ "getxattr": "read",
+# "inodelk": "read",
+ "link": "write",
+# "lk": "read",
+# "lookup": "read",
+ "mkdir": "write",
+ "mknod": "write",
+ "open": "write",
+ "opendir": "read",
+ "rchecksum": "read",
+ "readdir": "read",
+ "readdirp": "read",
+ "readlink": "read",
+ "readv": "read",
+ "removexattr": "write",
+ "rename": "write",
+ "rmdir": "write",
+ "setattr": "write",
+ "setxattr": "write",
+ "stat": "read",
+ "statfs": "read",
+ "symlink": "write",
+ "truncate": "write",
+ "unlink": "write",
+ "writev": "write,fsync,queue",
+ "xattrop": "write",
+}
+
+fops_done = []
+for x in sorted(fop_cg.decls.keys()):
+ if x in fop_table.keys():
+ info = fop_table[x].split(",")
+ kind = info[0]
+ flags = info[1:]
+ if ("fsync" in flags) or ("queue" in flags): # either flag implies need_fd
+ flags.append("need_fd")
+ for fname in flags:
+ print "#define NSR_CG_%s" % fname.upper() # NOTE(review): defined with no value, yet the generated code tests these with "#if" - confirm
+ cbk_cg.emit(x,kind+"-complete")
+ fop_cg.emit(x,kind+"-continue")
+ cbk_cg.emit(x,kind+"-fan-in")
+ fop_cg.emit(x,kind+"-dispatch")
+ fop_cg.emit(x,kind+"-fop")
+ for fname in flags:
+ print "#undef NSR_CG_%s" % fname.upper()
+ fops_done.append(x)
+ else:
+ print("/* No code emitted for %s */"%x)
+ print("")
+
+# Just for fun, emit the fops table too.
+print("struct xlator_fops fops = {")
+for x in fops_done:
+ print(" .%s = nsr_%s,"%(x,x))
+print("};")
diff --git a/xlators/cluster/nsr-server/src/leader.c b/xlators/cluster/nsr-server/src/leader.c
new file mode 100644
index 000000000..bb0dbabe7
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/leader.c
@@ -0,0 +1,420 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <regex.h>
+//#include <stdlib.h>
+#include <string.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "call-stub.h"
+#include "defaults.h"
+#include "xlator.h"
+#include "api/src/glfs.h"
+#include "api/src/glfs-internal.h"
+
+#include "etcd-api.h"
+#include "nsr-internal.h"
+#include "../../nsr-recon/src/recon_driver.h"
+#include "../../nsr-recon/src/recon_xlator.h"
+
+/* Vote format: UUID,vote_status,fitness (the three fields vote_re captures) */
+#define VOTE_ELEMS 4 /* Whole match plus three actual pieces. */
+#define DEFAULT_FITNESS 42
+#define DEFAULT_KEY "nsr"
+#define LEADER_TTL 5 /* TBD: make this tunable; passed as the ttl of every vote written with etcd_set */
+
+typedef enum { LS_SUCCESS, LS_FAILURE, LS_ERROR } leader_retval_t; /* outcome of one election/confirmation step */
+enum { NO_LEADER, TENTATIVE, CONFIRMED }; /* vote_status values; a vote is advanced by writing state+1 */
+
+regex_t vote_re;
+
+/*
+ * Fitness value this brick advertises in its votes. Currently a fixed
+ * placeholder, the same for every brick.
+ */
+long
+nsr_get_fitness (xlator_t *this)
+{
+        /* TBD: calculate based on presence/absence from terms */
+        return DEFAULT_FITNESS;
+}
+
+/*
+ * Fetch the current term number from etcd (key priv->term_uuid).
+ *
+ * Returns the stored value parsed as a decimal long, or 0 when the key
+ * does not exist yet.
+ */
+long
+nsr_get_term (xlator_t *this)
+{
+        nsr_private_t   *priv = this->private;
+        char            *text = NULL;
+        long            term = 0;
+
+        text = etcd_get(priv->etcd, priv->term_uuid);
+        // A missing key means this is the first time. Creating it
+        // should ideally be done at vol creation time by glusterd.
+        // Move it there later.
+        if (text != NULL) {
+                term = strtol(text, NULL, 10);
+                /* etcd_get hands back a newly allocated string. */
+                free(text);
+        }
+
+        gf_log (this->name, GF_LOG_TRACE,
+                "nsr_get_term returns %ld", term);
+        return term;
+}
+
+
+// The declarations below are internals of the etcd-api code (etcd-api-master)
+// that its header does not export; TBD: send a patch to that package to expose them.
+extern size_t
+parse_get_response (void *ptr, size_t size, size_t nmemb, void *stream);
+typedef struct {
+ etcd_server *servers;
+} _etcd_session; /* NOTE(review): redeclared here; must stay in step with the definition used by etcd-api.c */
+typedef size_t curl_callback_t (void *, size_t, size_t, void *);
+extern etcd_result etcd_get_one (_etcd_session *this, char *key, etcd_server *srv, char *prefix,
+ char *post, curl_callback_t cb, char **stream);
+
+
+
+/*
+ * Completion callback for the asynchronous write issued by nsr_set_leader.
+ * 'data' is the xlator that was handed to glfs_write_async. Logs the result
+ * and clears the I/O fence.
+ */
+void
+nsr_leader_cb(glfs_fd_t *fd, ssize_t ret, void *data)
+{
+        xlator_t        *this = data;
+        nsr_private_t   *conf = this->private;
+        int             status = (int)ret;
+
+        gf_log (this->name, GF_LOG_INFO,
+                "nsr_leader_cb arrived with return value %d", status);
+
+        // TBD - error handling; look at ret
+        atomic_fetch_and(&(conf->fence_io), 0);
+}
+
+/*
+ * Called when this brick has been confirmed as leader.
+ *
+ * Bumps the term stored in etcd (conditionally on the value just read),
+ * marks the brick as leader and, when reconciliation is enabled, fences
+ * I/O, works out which etcd servers hold the volume key, and hands that
+ * role description to the reconciliator through priv->fd. The fence is
+ * lifted by nsr_leader_cb once that write completes.
+ */
+void
+nsr_set_leader (xlator_t *this)
+{
+        long                    term = 0;
+        etcd_server             *srv;
+        etcd_result             res;
+        char                    *value = NULL;
+        nsr_private_t           *priv = this->private;
+        _etcd_session           *etcd = priv->etcd;
+        char                    *term_key = priv->term_uuid;
+        char                    *master_key = priv->vol_uuid;
+        /* Room for any decimal long plus sign and terminator. */
+        char                    n_t[32];
+        nsr_recon_role_t        role;
+        char                    *text = NULL;
+
+        gf_log (this->name, GF_LOG_INFO, "Just became leader");
+
+        text = etcd_get(etcd, term_key);
+        if (text != NULL) {
+                term = strtol(text, NULL, 10);
+        }
+        snprintf(n_t, sizeof(n_t), "%ld", term+1);
+        /* The old text is the precondition, so a concurrent bump fails us. */
+        res = etcd_set(etcd, term_key,n_t,text,0);
+        /* etcd_get hands back a newly allocated string. */
+        free(text);
+        text = NULL;
+        if(res != ETCD_OK) {
+                gf_log (this->name, GF_LOG_ERROR, "failed to set term");
+                return;
+        }
+        priv->leader = _gf_true;
+
+        if (priv->nsr_recon_start == _gf_false) {
+                atomic_fetch_and(&(priv->fence_io), 0);
+                return;
+        }
+
+        priv->current_term = term + 1;
+
+        atomic_fetch_or(&(priv->fence_io), 1);
+
+        /* Don't write uninitialised stack bytes out with the role. */
+        memset(&role, 0, sizeof(role));
+        role.num = 0;
+        role.role = leader;
+        // Get the rest of nodes for this term.
+        // TBD: fix this so that it uses per-brick keys instead of violating
+        // modularity and making bad assumptions about etcd behavior.
+        for (srv = etcd->servers; srv->host; ++srv) {
+                res = etcd_get_one(etcd,master_key,srv,"keys/",NULL,
+                                   parse_get_response,&value);
+                gf_log (this->name, GF_LOG_INFO,
+                        "Probing for %s, got %d, value:%s",
+                        srv->host, res, value ? value : "(null)");
+                if ((res == ETCD_OK) && value) {
+                        gf_log (this->name, GF_LOG_INFO,
+                                "Found for %s", srv->host);
+                        /*
+                         * NOTE(review): neither the length of srv->host nor
+                         * role.num is checked against the sizes declared in
+                         * recon_driver.h - confirm the limits and bound this.
+                         */
+                        strcpy(role.info[role.num].name, srv->host);
+                        (role.num)++;
+                }
+                /*
+                 * Presumably heap-allocated like the string etcd_get
+                 * returns (same "keys/" + parse_get_response path), so
+                 * release it rather than just dropping the pointer.
+                 */
+                free(value);
+                value = NULL;
+        }
+        gf_log (this->name, GF_LOG_INFO,
+                "Discovered %d nodes that has key %s", role.num, master_key);
+
+        gf_log (this->name, GF_LOG_INFO,
+                "setting current term as %ld", term + 1);
+        role.current_term = term + 1;
+        ENDIAN_CONVERSION_RR(role, _gf_false);
+
+        // inform the reconciliator that this is leader
+        // in the callback (once reconciliation is done),
+        // we will unfence the IOs.
+        // TBD - error handling later.
+        glfs_lseek(priv->fd, nsr_recon_xlator_sector_1, SEEK_SET);
+        gf_log (this->name, GF_LOG_INFO,
+                "Writing to local node to set leader");
+        glfs_write_async(priv->fd, &role,
+                         sizeof(role),nsr_recon_xlator_sector_1,
+                         nsr_leader_cb, this);
+}
+
+
+/*
+ * Run the election loop on etcd key 'key' until some brick is CONFIRMED.
+ *
+ * Once per second: read the current vote; if it is CONFIRMED, record
+ * whether we are the leader and stop. Otherwise, unless another brick
+ * already holds a tentative vote, advance the vote in our own name
+ * (state+1), using the vote just read as the precondition.
+ *
+ * Returns LS_SUCCESS if this brick is the leader, LS_FAILURE if another
+ * brick is, LS_ERROR if a vote could not be constructed.
+ */
+leader_retval_t
+nsr_get_leader (xlator_t *this, etcd_session etcd, char *key)
+{
+        char            *text = NULL;
+        regmatch_t      matches[VOTE_ELEMS];
+        char            *nominee;
+        long            state;
+        long            fitness;
+        char            *vote = NULL;
+        int             retval = LS_ERROR;
+        nsr_private_t   *priv = this->private;
+
+        for (;;sleep(1)) {
+
+                free(text);
+
+                text = etcd_get(etcd,key);
+                if (text) {
+                        if (regexec(&vote_re,text,VOTE_ELEMS,matches,0) != 0) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "got malformed vote %s\n", text);
+                                continue;
+                        }
+                        /* We can be destructive here, so convert commas. */
+                        text[matches[1].rm_eo] = '\0';
+                        text[matches[2].rm_eo] = '\0';
+                        nominee = text + matches[1].rm_so;
+                        state = strtol(text+matches[2].rm_so,NULL,10);
+                        fitness = strtol(text+matches[3].rm_so,NULL,10);
+                }
+                else {
+                        nominee = NULL;
+                        state = NO_LEADER;
+                        fitness = 0;
+                }
+
+                if (state == CONFIRMED) {
+                        gf_log (this->name, GF_LOG_TRACE,
+                                "leader is %s\n",nominee);
+                        if (strcmp(nominee,priv->brick_uuid) == 0) {
+                                nsr_set_leader(this);
+                                retval = LS_SUCCESS;
+                        }
+                        else {
+                                priv->leader = _gf_false;
+                                retval = LS_FAILURE;
+                        }
+                        break;
+                }
+
+                /* TBD: override based on fitness */
+                if ((state >= TENTATIVE) && (strcmp(nominee,
+                                                    priv->brick_uuid) != 0)) {
+                        continue;
+                }
+
+                /*
+                 * Drop the previous vote and forget the pointer, so the
+                 * cleanup after the loop can never free it a second time.
+                 */
+                free(vote);
+                vote = NULL;
+
+                fitness = nsr_get_fitness(this);
+                if (asprintf(&vote,"%s,%ld,%ld",priv->brick_uuid,
+                             state+1,fitness) < 0) {
+                        /* asprintf leaves the pointer undefined on failure. */
+                        vote = NULL;
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to construct vote\n");
+                        break;
+                }
+
+                if (text) {
+                        /* Put the commas back: text is the precondition. */
+                        text[matches[1].rm_eo] = ',';
+                        text[matches[2].rm_eo] = ',';
+                }
+                if (etcd_set(etcd,key,vote,text,LEADER_TTL) != ETCD_OK) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to cast vote\n");
+                        continue;
+                }
+
+        }
+
+        free(text);
+        free(vote);
+        return retval;
+}
+
+/*
+ * Refresh this brick's CONFIRMED vote on etcd key 'key', restarting its
+ * TTL. The new value doubles as the precondition, so the write only
+ * succeeds while the stored vote is still ours.
+ *
+ * Returns LS_SUCCESS on refresh, LS_FAILURE if etcd rejected the write,
+ * LS_ERROR if the vote string could not be built.
+ */
+leader_retval_t
+nsr_confirm (xlator_t *this, etcd_session etcd, char *key)
+{
+        char            *vote;
+        long            fitness;
+        nsr_private_t   *priv = this->private;
+
+        fitness = nsr_get_fitness(this);
+        if (asprintf(&vote,"%s,%ld,%ld",priv->brick_uuid,(long)CONFIRMED,
+                     fitness) < 0) {
+                /* Log through gf_log like the rest of this file. */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to construct confirmation");
+                return LS_ERROR;
+        }
+
+        if (etcd_set(etcd,key,vote,vote,LEADER_TTL) != ETCD_OK) {
+                gf_log (this->name, GF_LOG_ERROR, "failed to confirm");
+                free(vote);
+                return LS_FAILURE;
+        }
+
+        free(vote);
+        return LS_SUCCESS;
+}
+
+/*
+ * Compile the vote-parsing regex (vote_re) exactly once per process.
+ *
+ * Safe to call from several threads and more than once: later calls see
+ * the already compiled regex and succeed without recompiling it.
+ *
+ * Returns _gf_true if vote_re is ready for use, _gf_false if regcomp
+ * failed.
+ */
+gf_boolean_t
+nsr_init_re (xlator_t *this)
+{
+        static pthread_mutex_t  mutex = PTHREAD_MUTEX_INITIALIZER;
+        static int              was_inited = 0;
+        static char             *vote_re_str = "([^,]+),([^,]+),([^,]+)";
+        gf_boolean_t            retval = _gf_false;
+
+        pthread_mutex_lock(&mutex);
+        if (was_inited) {
+                retval = _gf_true;
+        }
+        else if (regcomp(&vote_re,vote_re_str,REG_EXTENDED) == 0) {
+                /* Remember success so the regex is never compiled twice. */
+                was_inited = 1;
+                retval = _gf_true;
+        }
+        else {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set up vote regex\n");
+        }
+        pthread_mutex_unlock(&mutex);
+
+        return retval;
+}
+
+
+/*
+ * Set up the gfapi handle (priv->fs) and the descriptor (priv->fd) used to
+ * talk to the reconciliation process. Does nothing when reconciliation is
+ * disabled. The xlator context is put back after each glfs call.
+ *
+ * Returns 0 on success (or when disabled), non-zero on failure.
+ */
+uint32_t
+nsr_leader_setup_recon (xlator_t *this)
+{
+        nsr_private_t   *conf = this->private;
+        xlator_t        *saved = this;
+        uint32_t        status = 0;
+
+        if (conf->nsr_recon_start == _gf_false) {
+                return 0;
+        }
+
+        conf->fs = glfs_new(conf->vol_uuid);
+        if (conf->fs == NULL) {
+                status = 1;
+                gf_log (this->name, GF_LOG_ERROR, "failed to initialise glfs \n");
+                goto done;
+        }
+        glusterfs_this_set(saved);
+
+        status = glfs_set_volfile(conf->fs, conf->vol_file);
+        if (status != 0) {
+                gf_log (this->name, GF_LOG_ERROR, "failed to set volfile \n");
+                goto done;
+        }
+        glusterfs_this_set(saved);
+
+        /*
+         * REVIEW
+         * Logs belong in /var/log not /tmp.
+         */
+        glfs_set_logging (conf->fs,"/tmp/glfs-log", 7);
+        if (glfs_init(conf->fs) < 0) {
+                gf_log (this->name, GF_LOG_ERROR, "failed to init volfile \n");
+                status = 1;
+                goto done;
+        }
+        glusterfs_this_set(saved);
+
+        conf->fd = glfs_open (conf->fs, "/", O_RDWR);
+        if (conf->fd == NULL) {
+                status = 1;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to open fd to communicate with recon process \n");
+                goto done;
+        }
+
+done:
+        glusterfs_this_set(saved);
+        return status;
+}
+
+/*
+ * Body of the leader-election thread.
+ *
+ * Prepares the vote regex and the reconciliation handle, opens an etcd
+ * session to priv->etcd_servers, then loops: take part in the election
+ * and, while this brick is leader, re-confirm the vote once per second.
+ * The loop ends only on LS_ERROR. Always returns NULL.
+ */
+void *
+nsr_leader_thread (xlator_t *this)
+{
+        leader_retval_t retval;
+        nsr_private_t   *priv = this->private;
+
+        if (!nsr_init_re(this)) {
+                gf_log (this->name, GF_LOG_ERROR, "could not init regex");
+                return NULL;
+        }
+
+        if (nsr_leader_setup_recon(this)) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to do glfs initialisation inside leader thread");
+                return NULL;
+        }
+
+        priv->leader_inited = 1;
+
+        gf_log (this->name, GF_LOG_INFO,
+                "calling etcd_open_str on servers %s", priv->etcd_servers);
+
+        priv->etcd = etcd_open_str(priv->etcd_servers);
+        if (!(priv->etcd)) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to open etcd session\n");
+                return NULL;
+        }
+
+        for (;;) {
+                if (nsr_get_leader(this,priv->etcd,priv->vol_uuid) == LS_ERROR) {
+                        break;
+                }
+                if (priv->leader) {
+                        do {
+                                sleep(1);
+                                retval = nsr_confirm(this,priv->etcd,priv->vol_uuid);
+                        } while (retval == LS_SUCCESS);
+                        /*
+                         * NOTE(review): on LS_FAILURE priv->leader stays set
+                         * until the next election names a different brick -
+                         * confirm that window is acceptable.
+                         */
+                        if (retval == LS_ERROR) {
+                                break;
+                        }
+                }
+                else {
+                        sleep(1);
+                }
+        }
+
+        etcd_close_str(priv->etcd);
+        return NULL;
+}
+
diff --git a/xlators/cluster/nsr-server/src/nsr-cg.c b/xlators/cluster/nsr-server/src/nsr-cg.c
new file mode 100644
index 000000000..54f370b75
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/nsr-cg.c
@@ -0,0 +1,4444 @@
+/* No stub needed for access */
+
+/* No cbk needed for access */
+
+/*
+ * access fop (read path): pass the call to the first child when this brick
+ * is the leader or the request carries both reconciliation xattrs;
+ * otherwise fail it with EREMOTE.
+ */
+int32_t
+nsr_access (call_frame_t *frame, xlator_t *this,
+            loc_t * loc, int32_t mask, dict_t * xdata)
+{
+        nsr_private_t   *priv = this->private;
+        int32_t         term_val;
+        int32_t         index_val;
+        gf_boolean_t    recon_read;
+
+        // allow reads during reconciliation
+        // TBD: allow "dirty" reads on non-leaders
+        recon_read = (xdata != NULL)
+                && (dict_get_int32(xdata, RECON_TERM_XATTR, &term_val) == 0)
+                && (dict_get_int32(xdata, RECON_INDEX_XATTR, &index_val) == 0);
+
+        if (priv->leader || recon_read) {
+                STACK_WIND (frame, default_access_cbk,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->access,
+                            loc, mask, xdata);
+                return 0;
+        }
+
+        STACK_UNWIND_STRICT (access, frame, -1, EREMOTE,
+                             NULL);
+        return 0;
+}
+/*
+ * Final callback for create: passes the result back to the caller with
+ * STACK_UNWIND_STRICT. It is the callback named by nsr_create_continue and
+ * by the follower/recon path of nsr_create.
+ *
+ * The NSR_CG_* sections are compiled in per fop: NSR_CG_QUEUE resets the
+ * inode context's pending count, NSR_CG_FSYNC marks the fd dirty, and
+ * NSR_CG_NEED_FD drops the fd reference held in frame->local.
+ * NOTE(review): the QUEUE and FSYNC sections use 'local', which is only
+ * declared under NSR_CG_NEED_FD - they rely on gen-fops.py adding need_fd.
+ */
+int32_t
+nsr_create_complete (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ fd_t * fd, inode_t * inode, struct iatt * buf, struct iatt * preparent, struct iatt * postparent, dict_t * xdata)
+{
+#if NSR_CG_NEED_FD
+ nsr_local_t *local = frame->local;
+#endif
+
+#if NSR_CG_QUEUE
+ nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd);
+ if (ictx) {
+ /* TBD: LOCK */
+ if (ictx->pending) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "unblocking %u requests",
+ ictx->pending);
+ /* TBD: actually dequeue */
+ ictx->pending = 0;
+ }
+ /* TBD: UNLOCK */
+ }
+#endif
+
+#if NSR_CG_FSYNC
+ nsr_mark_fd_dirty(this,local);
+#endif
+
+#if NSR_CG_NEED_FD
+ fd_unref(local->fd);
+#endif
+
+ STACK_UNWIND_STRICT (create, frame, op_ret, op_errno,
+ fd, inode, buf, preparent, postparent, xdata);
+ return 0;
+
+}
+int32_t
+nsr_create_continue (call_frame_t *frame, xlator_t *this,
+ loc_t * loc, int32_t flags, mode_t mode, mode_t umask, fd_t * fd, dict_t * xdata)
+{ /*
+ * Continuation stored in the call stub by nsr_create and run through
+ * call_resume() from nsr_create_fan_in once the last outstanding reply
+ * has arrived: winds the create to the first child, with
+ * nsr_create_complete as its callback.
+ */
+ STACK_WIND (frame, nsr_create_complete,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->create,
+ loc, flags, mode, umask, fd, xdata);
+ return 0;
+}
+
+/*
+ * Callback for each create wound by nsr_create to a child other than the
+ * first. Counts the reply and, when none are outstanding any more, resumes
+ * the stub saved in frame->local. The reply's own status is only logged.
+ */
+int32_t
+nsr_create_fan_in (call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno,
+                   fd_t * fd, inode_t * inode, struct iatt * buf, struct iatt * preparent, struct iatt * postparent, dict_t * xdata)
+{
+        nsr_local_t     *state = frame->local;
+        uint8_t         outstanding;
+
+        gf_log (this->name, GF_LOG_TRACE,
+                "op_ret = %d, op_errno = %d\n", op_ret, op_errno);
+
+        /* Take this reply off the count under the frame lock. */
+        LOCK(&frame->lock);
+        outstanding = --(state->call_count);
+        UNLOCK(&frame->lock);
+
+        // TBD: variable Completion count
+        if (outstanding != 0) {
+                return 0;
+        }
+
+        call_resume(state->stub);
+        return 0;
+}
+
+/*
+ * create fop (write path).
+ *
+ * A request already tagged by a leader (NSR_TERM_XATTR) or by the
+ * reconciliator (both RECON_*_XATTR keys) is applied to the first child
+ * only. Otherwise this brick must be an unfenced leader (else EREMOTE):
+ * the request is tagged with the current term, wound to every child after
+ * the first, and run on the first child by the stub once
+ * nsr_create_fan_in has seen all of those replies.
+ *
+ * NOTE(review): an xdata allocated here with dict_new() is never
+ * unreferenced by this function - confirm who owns that reference.
+ */
+int32_t
+nsr_create (call_frame_t *frame, xlator_t *this,
+            loc_t * loc, int32_t flags, mode_t mode, mode_t umask, fd_t * fd, dict_t * xdata)
+{
+        nsr_local_t     *local = NULL;
+        nsr_private_t   *priv = this->private;
+        xlator_list_t   *trav;
+        int             op_errno = ENOMEM;
+        int             from_leader;
+        int             from_recon;
+
+        local = mem_get0(this->local_pool);
+        if (!local) {
+                goto err;
+        }
+#if NSR_CG_NEED_FD
+        local->fd = fd_ref(fd);
+#else
+        local->fd = NULL;
+#endif
+        frame->local = local;
+
+        if (xdata) {
+                from_leader = !!dict_get(xdata,NSR_TERM_XATTR);
+                from_recon = !!dict_get(xdata,RECON_TERM_XATTR)
+                          && !!dict_get(xdata,RECON_INDEX_XATTR);
+        }
+        else {
+                from_leader = from_recon = _gf_false;
+        }
+
+        // follower/recon path
+        // just send it to local node
+        if (from_leader || from_recon) {
+                STACK_WIND (frame, nsr_create_complete,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->create,
+                            loc, flags, mode, umask, fd, xdata);
+                return 0;
+        }
+
+        if (!priv->leader || priv->fence_io) {
+                op_errno = EREMOTE;
+                goto err;
+        }
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd);
+        if (!ictx) {
+                op_errno = EIO;
+                goto err;
+        }
+        /* TBD: LOCK */
+        if (ictx->active) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "queuing request due to conflict");
+                ++(ictx->pending);
+                /* TBD: actually enqueue */
+        }
+        else {
+                ++(ictx->active);
+        }
+        /* TBD: UNLOCK */
+#endif
+
+        if (!xdata) {
+                xdata = dict_new();
+                if (!xdata) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to allocate xdata");
+                        goto err;
+                }
+        }
+
+        if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set nsr-term");
+                goto err;
+        }
+
+        local->stub = fop_create_stub (frame,nsr_create_continue,
+                                       loc, flags, mode, umask, fd, xdata);
+        if (!local->stub) {
+                goto err;
+        }
+
+        local->call_count = priv->n_children - 1;
+        if (!this->children->next) {
+                /*
+                 * No followers: nothing would be wound below, so no fan-in
+                 * callback would ever resume the stub. Run it right away.
+                 */
+                call_resume(local->stub);
+                return 0;
+        }
+        for (trav = this->children->next; trav; trav = trav->next) {
+                STACK_WIND (frame, nsr_create_fan_in,
+                            trav->xlator, trav->xlator->fops->create,
+                            loc, flags, mode, umask, fd, xdata);
+        }
+
+        // TBD: variable Issue count
+        return 0;
+
+err:
+        if (local) {
+                if (local->stub) {
+                        call_stub_destroy(local->stub);
+                }
+                if (local->fd) {
+                        fd_unref(local->fd);
+                }
+                /*
+                 * Detach before releasing so that frame teardown cannot
+                 * release the same object again (NOTE(review): assumes the
+                 * frame frees a non-NULL local; harmless otherwise).
+                 */
+                frame->local = NULL;
+                mem_put(local);
+        }
+        STACK_UNWIND_STRICT (create, frame, -1, op_errno,
+                             NULL, NULL, NULL, NULL, NULL, NULL);
+        return 0;
+}
+/*
+ * Final callback for discard: passes the result back to the caller with
+ * STACK_UNWIND_STRICT. It is the callback named by nsr_discard_continue
+ * and by the follower/recon path of nsr_discard.
+ *
+ * The NSR_CG_* sections are compiled in per fop: NSR_CG_QUEUE resets the
+ * inode context's pending count, NSR_CG_FSYNC marks the fd dirty, and
+ * NSR_CG_NEED_FD drops the fd reference held in frame->local.
+ * NOTE(review): the QUEUE and FSYNC sections use 'local', which is only
+ * declared under NSR_CG_NEED_FD - they rely on gen-fops.py adding need_fd.
+ */
+int32_t
+nsr_discard_complete (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt * preop_stbuf, struct iatt * postop_stbuf, dict_t * xdata)
+{
+#if NSR_CG_NEED_FD
+ nsr_local_t *local = frame->local;
+#endif
+
+#if NSR_CG_QUEUE
+ nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd);
+ if (ictx) {
+ /* TBD: LOCK */
+ if (ictx->pending) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "unblocking %u requests",
+ ictx->pending);
+ /* TBD: actually dequeue */
+ ictx->pending = 0;
+ }
+ /* TBD: UNLOCK */
+ }
+#endif
+
+#if NSR_CG_FSYNC
+ nsr_mark_fd_dirty(this,local);
+#endif
+
+#if NSR_CG_NEED_FD
+ fd_unref(local->fd);
+#endif
+
+ STACK_UNWIND_STRICT (discard, frame, op_ret, op_errno,
+ preop_stbuf, postop_stbuf, xdata);
+ return 0;
+
+}
+int32_t
+nsr_discard_continue (call_frame_t *frame, xlator_t *this,
+ fd_t * fd, off_t offset, size_t len, dict_t * xdata)
+{ /*
+ * Continuation stored in the call stub by nsr_discard and run through
+ * call_resume() from nsr_discard_fan_in once the last outstanding reply
+ * has arrived: winds the discard to the first child, with
+ * nsr_discard_complete as its callback.
+ */
+ STACK_WIND (frame, nsr_discard_complete,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->discard,
+ fd, offset, len, xdata);
+ return 0;
+}
+
+/*
+ * Callback for each discard wound by nsr_discard to a child other than the
+ * first. Counts the reply and, when none are outstanding any more, resumes
+ * the stub saved in frame->local. The reply's own status is only logged.
+ */
+int32_t
+nsr_discard_fan_in (call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno,
+                    struct iatt * preop_stbuf, struct iatt * postop_stbuf, dict_t * xdata)
+{
+        nsr_local_t     *state = frame->local;
+        uint8_t         outstanding;
+
+        gf_log (this->name, GF_LOG_TRACE,
+                "op_ret = %d, op_errno = %d\n", op_ret, op_errno);
+
+        /* Take this reply off the count under the frame lock. */
+        LOCK(&frame->lock);
+        outstanding = --(state->call_count);
+        UNLOCK(&frame->lock);
+
+        // TBD: variable Completion count
+        if (outstanding != 0) {
+                return 0;
+        }
+
+        call_resume(state->stub);
+        return 0;
+}
+
+/*
+ * discard fop (write path).
+ *
+ * A request already tagged by a leader (NSR_TERM_XATTR) or by the
+ * reconciliator (both RECON_*_XATTR keys) is applied to the first child
+ * only. Otherwise this brick must be an unfenced leader (else EREMOTE):
+ * the request is tagged with the current term, wound to every child after
+ * the first, and run on the first child by the stub once
+ * nsr_discard_fan_in has seen all of those replies.
+ *
+ * NOTE(review): an xdata allocated here with dict_new() is never
+ * unreferenced by this function - confirm who owns that reference.
+ */
+int32_t
+nsr_discard (call_frame_t *frame, xlator_t *this,
+             fd_t * fd, off_t offset, size_t len, dict_t * xdata)
+{
+        nsr_local_t     *local = NULL;
+        nsr_private_t   *priv = this->private;
+        xlator_list_t   *trav;
+        int             op_errno = ENOMEM;
+        int             from_leader;
+        int             from_recon;
+
+        local = mem_get0(this->local_pool);
+        if (!local) {
+                goto err;
+        }
+#if NSR_CG_NEED_FD
+        local->fd = fd_ref(fd);
+#else
+        local->fd = NULL;
+#endif
+        frame->local = local;
+
+        if (xdata) {
+                from_leader = !!dict_get(xdata,NSR_TERM_XATTR);
+                from_recon = !!dict_get(xdata,RECON_TERM_XATTR)
+                          && !!dict_get(xdata,RECON_INDEX_XATTR);
+        }
+        else {
+                from_leader = from_recon = _gf_false;
+        }
+
+        // follower/recon path
+        // just send it to local node
+        if (from_leader || from_recon) {
+                STACK_WIND (frame, nsr_discard_complete,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->discard,
+                            fd, offset, len, xdata);
+                return 0;
+        }
+
+        if (!priv->leader || priv->fence_io) {
+                op_errno = EREMOTE;
+                goto err;
+        }
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd);
+        if (!ictx) {
+                op_errno = EIO;
+                goto err;
+        }
+        /* TBD: LOCK */
+        if (ictx->active) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "queuing request due to conflict");
+                ++(ictx->pending);
+                /* TBD: actually enqueue */
+        }
+        else {
+                ++(ictx->active);
+        }
+        /* TBD: UNLOCK */
+#endif
+
+        if (!xdata) {
+                xdata = dict_new();
+                if (!xdata) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to allocate xdata");
+                        goto err;
+                }
+        }
+
+        if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set nsr-term");
+                goto err;
+        }
+
+        local->stub = fop_discard_stub (frame,nsr_discard_continue,
+                                        fd, offset, len, xdata);
+        if (!local->stub) {
+                goto err;
+        }
+
+        local->call_count = priv->n_children - 1;
+        if (!this->children->next) {
+                /*
+                 * No followers: nothing would be wound below, so no fan-in
+                 * callback would ever resume the stub. Run it right away.
+                 */
+                call_resume(local->stub);
+                return 0;
+        }
+        for (trav = this->children->next; trav; trav = trav->next) {
+                STACK_WIND (frame, nsr_discard_fan_in,
+                            trav->xlator, trav->xlator->fops->discard,
+                            fd, offset, len, xdata);
+        }
+
+        // TBD: variable Issue count
+        return 0;
+
+err:
+        if (local) {
+                if (local->stub) {
+                        call_stub_destroy(local->stub);
+                }
+                if (local->fd) {
+                        fd_unref(local->fd);
+                }
+                /*
+                 * Detach before releasing so that frame teardown cannot
+                 * release the same object again (NOTE(review): assumes the
+                 * frame frees a non-NULL local; harmless otherwise).
+                 */
+                frame->local = NULL;
+                mem_put(local);
+        }
+        STACK_UNWIND_STRICT (discard, frame, -1, op_errno,
+                             NULL, NULL, NULL);
+        return 0;
+}
+
+/* No code emitted for entrylk */
+/*
+ * Final callback for fallocate: passes the result back to the caller with
+ * STACK_UNWIND_STRICT. It is the callback named by nsr_fallocate_continue
+ * and by the follower/recon path of nsr_fallocate.
+ *
+ * The NSR_CG_* sections are compiled in per fop: NSR_CG_QUEUE resets the
+ * inode context's pending count, NSR_CG_FSYNC marks the fd dirty, and
+ * NSR_CG_NEED_FD drops the fd reference held in frame->local.
+ * NOTE(review): the QUEUE and FSYNC sections use 'local', which is only
+ * declared under NSR_CG_NEED_FD - they rely on gen-fops.py adding need_fd.
+ */
+int32_t
+nsr_fallocate_complete (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt * preop_stbuf, struct iatt * postop_stbuf, dict_t * xdata)
+{
+#if NSR_CG_NEED_FD
+ nsr_local_t *local = frame->local;
+#endif
+
+#if NSR_CG_QUEUE
+ nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd);
+ if (ictx) {
+ /* TBD: LOCK */
+ if (ictx->pending) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "unblocking %u requests",
+ ictx->pending);
+ /* TBD: actually dequeue */
+ ictx->pending = 0;
+ }
+ /* TBD: UNLOCK */
+ }
+#endif
+
+#if NSR_CG_FSYNC
+ nsr_mark_fd_dirty(this,local);
+#endif
+
+#if NSR_CG_NEED_FD
+ fd_unref(local->fd);
+#endif
+
+ STACK_UNWIND_STRICT (fallocate, frame, op_ret, op_errno,
+ preop_stbuf, postop_stbuf, xdata);
+ return 0;
+
+}
+int32_t
+nsr_fallocate_continue (call_frame_t *frame, xlator_t *this,
+ fd_t * fd, int32_t keep_size, off_t offset, size_t len, dict_t * xdata)
+{ /*
+ * Continuation stored in the call stub by nsr_fallocate and run through
+ * call_resume() from nsr_fallocate_fan_in once the last outstanding reply
+ * has arrived: winds the fallocate to the first child, with
+ * nsr_fallocate_complete as its callback.
+ */
+ STACK_WIND (frame, nsr_fallocate_complete,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->fallocate,
+ fd, keep_size, offset, len, xdata);
+ return 0;
+}
+
+/*
+ * Callback for each fallocate wound by nsr_fallocate to a child other than
+ * the first. Counts the reply and, when none are outstanding any more,
+ * resumes the stub saved in frame->local. The reply's own status is only
+ * logged.
+ */
+int32_t
+nsr_fallocate_fan_in (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno,
+                      struct iatt * preop_stbuf, struct iatt * postop_stbuf, dict_t * xdata)
+{
+        nsr_local_t     *state = frame->local;
+        uint8_t         outstanding;
+
+        gf_log (this->name, GF_LOG_TRACE,
+                "op_ret = %d, op_errno = %d\n", op_ret, op_errno);
+
+        /* Take this reply off the count under the frame lock. */
+        LOCK(&frame->lock);
+        outstanding = --(state->call_count);
+        UNLOCK(&frame->lock);
+
+        // TBD: variable Completion count
+        if (outstanding != 0) {
+                return 0;
+        }
+
+        call_resume(state->stub);
+        return 0;
+}
+
+/*
+ * fallocate fop (write path).
+ *
+ * A request already tagged by a leader (NSR_TERM_XATTR) or by the
+ * reconciliator (both RECON_*_XATTR keys) is applied to the first child
+ * only. Otherwise this brick must be an unfenced leader (else EREMOTE):
+ * the request is tagged with the current term, wound to every child after
+ * the first, and run on the first child by the stub once
+ * nsr_fallocate_fan_in has seen all of those replies.
+ *
+ * NOTE(review): an xdata allocated here with dict_new() is never
+ * unreferenced by this function - confirm who owns that reference.
+ */
+int32_t
+nsr_fallocate (call_frame_t *frame, xlator_t *this,
+               fd_t * fd, int32_t keep_size, off_t offset, size_t len, dict_t * xdata)
+{
+        nsr_local_t     *local = NULL;
+        nsr_private_t   *priv = this->private;
+        xlator_list_t   *trav;
+        int             op_errno = ENOMEM;
+        int             from_leader;
+        int             from_recon;
+
+        local = mem_get0(this->local_pool);
+        if (!local) {
+                goto err;
+        }
+#if NSR_CG_NEED_FD
+        local->fd = fd_ref(fd);
+#else
+        local->fd = NULL;
+#endif
+        frame->local = local;
+
+        if (xdata) {
+                from_leader = !!dict_get(xdata,NSR_TERM_XATTR);
+                from_recon = !!dict_get(xdata,RECON_TERM_XATTR)
+                          && !!dict_get(xdata,RECON_INDEX_XATTR);
+        }
+        else {
+                from_leader = from_recon = _gf_false;
+        }
+
+        // follower/recon path
+        // just send it to local node
+        if (from_leader || from_recon) {
+                STACK_WIND (frame, nsr_fallocate_complete,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->fallocate,
+                            fd, keep_size, offset, len, xdata);
+                return 0;
+        }
+
+        if (!priv->leader || priv->fence_io) {
+                op_errno = EREMOTE;
+                goto err;
+        }
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd);
+        if (!ictx) {
+                op_errno = EIO;
+                goto err;
+        }
+        /* TBD: LOCK */
+        if (ictx->active) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "queuing request due to conflict");
+                ++(ictx->pending);
+                /* TBD: actually enqueue */
+        }
+        else {
+                ++(ictx->active);
+        }
+        /* TBD: UNLOCK */
+#endif
+
+        if (!xdata) {
+                xdata = dict_new();
+                if (!xdata) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to allocate xdata");
+                        goto err;
+                }
+        }
+
+        if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set nsr-term");
+                goto err;
+        }
+
+        local->stub = fop_fallocate_stub (frame,nsr_fallocate_continue,
+                                          fd, keep_size, offset, len, xdata);
+        if (!local->stub) {
+                goto err;
+        }
+
+        local->call_count = priv->n_children - 1;
+        if (!this->children->next) {
+                /*
+                 * No followers: nothing would be wound below, so no fan-in
+                 * callback would ever resume the stub. Run it right away.
+                 */
+                call_resume(local->stub);
+                return 0;
+        }
+        for (trav = this->children->next; trav; trav = trav->next) {
+                STACK_WIND (frame, nsr_fallocate_fan_in,
+                            trav->xlator, trav->xlator->fops->fallocate,
+                            fd, keep_size, offset, len, xdata);
+        }
+
+        // TBD: variable Issue count
+        return 0;
+
+err:
+        if (local) {
+                if (local->stub) {
+                        call_stub_destroy(local->stub);
+                }
+                if (local->fd) {
+                        fd_unref(local->fd);
+                }
+                /*
+                 * Detach before releasing so that frame teardown cannot
+                 * release the same object again (NOTE(review): assumes the
+                 * frame frees a non-NULL local; harmless otherwise).
+                 */
+                frame->local = NULL;
+                mem_put(local);
+        }
+        STACK_UNWIND_STRICT (fallocate, frame, -1, op_errno,
+                             NULL, NULL, NULL);
+        return 0;
+}
+
+/* No code emitted for fentrylk */
+
+/* No stub needed for fgetxattr */
+
+/* No cbk needed for fgetxattr */
+
+/*
+ * Entry point for the fgetxattr fop (read path).
+ *
+ * The request is wound to the first child when priv->leader is set, or when
+ * xdata yields both RECON_TERM_XATTR and RECON_INDEX_XATTR as int32 values.
+ * Any other request is unwound with op_ret -1 and EREMOTE.
+ * Always returns 0.
+ */
+int32_t
+nsr_fgetxattr (call_frame_t *frame, xlator_t *this,
+               fd_t * fd, const char * name, dict_t * xdata)
+{
+        nsr_private_t *priv = this->private;
+        int32_t recon_term, recon_index;
+        gf_boolean_t in_recon;
+
+        // allow reads during reconciliation
+        // TBD: allow "dirty" reads on non-leaders
+        in_recon = (xdata &&
+                    (dict_get_int32(xdata, RECON_TERM_XATTR, &recon_term) == 0) &&
+                    (dict_get_int32(xdata, RECON_INDEX_XATTR, &recon_index) == 0))
+                        ? _gf_true : _gf_false;
+
+        if (priv->leader || (in_recon != _gf_false)) {
+                STACK_WIND (frame, default_fgetxattr_cbk,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->fgetxattr,
+                            fd, name, xdata);
+                return 0;
+        }
+
+        STACK_UNWIND_STRICT (fgetxattr, frame, -1, EREMOTE,
+                             NULL, NULL);
+        return 0;
+}
+
+/* No code emitted for finodelk */
+
+/* No code emitted for flush */
+
+/*
+ * Last callback for fremovexattr.  Runs whichever bookkeeping is compiled
+ * in (reset of the inode's pending count under NSR_CG_QUEUE,
+ * nsr_mark_fd_dirty under NSR_CG_FSYNC, fd_unref of local->fd under
+ * NSR_CG_NEED_FD) and then unwinds the result it was given, unchanged.
+ * Always returns 0.
+ */
+int32_t
+nsr_fremovexattr_complete (call_frame_t *frame, void *cookie, xlator_t *this,
+                           int32_t op_ret, int32_t op_errno,
+                           dict_t * xdata)
+{
+#if NSR_CG_NEED_FD
+        nsr_local_t *local = frame->local;
+#endif
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd);
+
+        /* TBD: LOCK */
+        if (ictx && ictx->pending) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "unblocking %u requests",
+                        ictx->pending);
+                /* TBD: actually dequeue */
+                ictx->pending = 0;
+        }
+        /* TBD: UNLOCK */
+#endif
+
+#if NSR_CG_FSYNC
+        nsr_mark_fd_dirty(this,local);
+#endif
+
+#if NSR_CG_NEED_FD
+        fd_unref(local->fd);
+#endif
+
+        STACK_UNWIND_STRICT (fremovexattr, frame, op_ret, op_errno,
+                             xdata);
+        return 0;
+}
+/*
+ * Wind fremovexattr to the first child, with nsr_fremovexattr_complete as
+ * the callback.  nsr_fremovexattr passes this function to
+ * fop_fremovexattr_stub, so it presumably runs when that stub is resumed.
+ * Always returns 0.
+ */
+int32_t
+nsr_fremovexattr_continue (call_frame_t *frame, xlator_t *this,
+ fd_t * fd, const char * name, dict_t * xdata)
+{
+ STACK_WIND (frame, nsr_fremovexattr_complete,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->fremovexattr,
+ fd, name, xdata);
+ return 0;
+}
+
+/*
+ * Callback for each fremovexattr that nsr_fremovexattr winds to a child
+ * other than the first.  Logs the result, decrements local->call_count
+ * under frame->lock, and resumes local->stub once the count reaches zero.
+ * Always returns 0.
+ */
+int32_t
+nsr_fremovexattr_fan_in (call_frame_t *frame, void *cookie, xlator_t *this,
+                         int32_t op_ret, int32_t op_errno,
+                         dict_t * xdata)
+{
+        nsr_local_t *local = frame->local;
+        uint8_t remaining;
+
+        gf_log (this->name, GF_LOG_TRACE,
+                "op_ret = %d, op_errno = %d\n", op_ret, op_errno);
+
+        LOCK(&frame->lock);
+        remaining = --(local->call_count);
+        UNLOCK(&frame->lock);
+
+        // TBD: variable Completion count
+        if (remaining != 0) {
+                return 0;
+        }
+
+        call_resume(local->stub);
+        return 0;
+}
+
+/*
+ * Entry point for the fremovexattr fop.
+ *
+ * If xdata carries NSR_TERM_XATTR (request from the leader) or both
+ * RECON_TERM_XATTR and RECON_INDEX_XATTR (request from reconciliation), the
+ * fop is wound only to the first child with nsr_fremovexattr_complete as
+ * the callback.  Otherwise it is refused with EREMOTE unless priv->leader
+ * is set and priv->fence_io is clear.  An accepted request has
+ * priv->current_term stored in xdata under NSR_TERM_XATTR, gets a stub for
+ * nsr_fremovexattr_continue saved in local->stub, and is wound to every
+ * child after the first with nsr_fremovexattr_fan_in as the callback.
+ *
+ * Failures unwind with op_ret -1 and op_errno (ENOMEM unless set otherwise).
+ * Always returns 0.
+ */
+int32_t
+nsr_fremovexattr (call_frame_t *frame, xlator_t *this,
+                  fd_t * fd, const char * name, dict_t * xdata)
+{
+        nsr_local_t *local = NULL;
+        nsr_private_t *priv = this->private;
+        xlator_list_t *trav;
+        int op_errno = ENOMEM;
+        int from_leader;
+        int from_recon;
+
+        local = mem_get0(this->local_pool);
+        if (!local) {
+                goto err;
+        }
+#if NSR_CG_NEED_FD
+        local->fd = fd_ref(fd);
+#else
+        local->fd = NULL;
+#endif
+        frame->local = local;
+
+        if (xdata) {
+                from_leader = !!dict_get(xdata,NSR_TERM_XATTR);
+                from_recon = !!dict_get(xdata,RECON_TERM_XATTR)
+                          && !!dict_get(xdata,RECON_INDEX_XATTR);
+        }
+        else {
+                from_leader = from_recon = _gf_false;
+        }
+
+        // follower/recon path
+        // just send it to local node
+        if (from_leader || from_recon) {
+                STACK_WIND (frame, nsr_fremovexattr_complete,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->fremovexattr,
+                            fd, name, xdata);
+                return 0;
+        }
+
+        if (!priv->leader || priv->fence_io) {
+                op_errno = EREMOTE;
+                goto err;
+        }
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd);
+        if (!ictx) {
+                op_errno = EIO;
+                goto err;
+        }
+        /* TBD: LOCK */
+        if (ictx->active) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "queuing request due to conflict");
+                ++(ictx->pending);
+                /* TBD: actually enqueue */
+        }
+        else {
+                ++(ictx->active);
+        }
+        /* TBD: UNLOCK */
+#endif
+
+        /* NOTE(review): a dict created here is not unref'd anywhere in
+         * this function -- confirm ownership. */
+        if (!xdata) {
+                xdata = dict_new();
+                if (!xdata) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to allocate xdata");
+                        goto err;
+                }
+        }
+
+        if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set nsr-term");
+                goto err;
+        }
+
+        local->stub = fop_fremovexattr_stub (frame,nsr_fremovexattr_continue,
+                                             fd, name, xdata);
+        if (!local->stub) {
+                goto err;
+        }
+
+        /* NOTE(review): if priv->n_children is 1 the count starts at 0, no
+         * wind is issued below and nothing resumes the stub -- confirm that
+         * configuration cannot occur. */
+        local->call_count = priv->n_children - 1;
+        for (trav = this->children->next; trav; trav = trav->next) {
+                STACK_WIND (frame, nsr_fremovexattr_fan_in,
+                            trav->xlator, trav->xlator->fops->fremovexattr,
+                            fd, name, xdata);
+        }
+
+        // TBD: variable Issue count
+        return 0;
+
+err:
+        if (local) {
+                if (local->stub) {
+                        call_stub_destroy(local->stub);
+                }
+                if (local->fd) {
+                        fd_unref(local->fd);
+                }
+                /*
+                 * Detach local from the frame before releasing it so that
+                 * frame->local is not left pointing at released memory.
+                 * NOTE(review): frame teardown presumably releases
+                 * frame->local as well, which would be a second release.
+                 */
+                frame->local = NULL;
+                mem_put(local);
+        }
+        STACK_UNWIND_STRICT (fremovexattr, frame, -1, op_errno,
+                             NULL);
+        return 0;
+}
+
+/*
+ * Last callback for fsetattr.  Runs whichever bookkeeping is compiled in
+ * (reset of the inode's pending count under NSR_CG_QUEUE,
+ * nsr_mark_fd_dirty under NSR_CG_FSYNC, fd_unref of local->fd under
+ * NSR_CG_NEED_FD) and then unwinds the result it was given, unchanged.
+ * Always returns 0.
+ */
+int32_t
+nsr_fsetattr_complete (call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno,
+                       struct iatt * preop_stbuf, struct iatt * postop_stbuf, dict_t * xdata)
+{
+#if NSR_CG_NEED_FD
+        nsr_local_t *local = frame->local;
+#endif
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd);
+
+        /* TBD: LOCK */
+        if (ictx && ictx->pending) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "unblocking %u requests",
+                        ictx->pending);
+                /* TBD: actually dequeue */
+                ictx->pending = 0;
+        }
+        /* TBD: UNLOCK */
+#endif
+
+#if NSR_CG_FSYNC
+        nsr_mark_fd_dirty(this,local);
+#endif
+
+#if NSR_CG_NEED_FD
+        fd_unref(local->fd);
+#endif
+
+        STACK_UNWIND_STRICT (fsetattr, frame, op_ret, op_errno,
+                             preop_stbuf, postop_stbuf, xdata);
+        return 0;
+}
+/*
+ * Wind fsetattr to the first child, with nsr_fsetattr_complete as the
+ * callback.  nsr_fsetattr passes this function to fop_fsetattr_stub, so it
+ * presumably runs when that stub is resumed.  Always returns 0.
+ */
+int32_t
+nsr_fsetattr_continue (call_frame_t *frame, xlator_t *this,
+ fd_t * fd, struct iatt * stbuf, int32_t valid, dict_t * xdata)
+{
+ STACK_WIND (frame, nsr_fsetattr_complete,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsetattr,
+ fd, stbuf, valid, xdata);
+ return 0;
+}
+
+/*
+ * Callback for each fsetattr that nsr_fsetattr winds to a child other than
+ * the first.  Logs the result, decrements local->call_count under
+ * frame->lock, and resumes local->stub once the count reaches zero.
+ * Always returns 0.
+ */
+int32_t
+nsr_fsetattr_fan_in (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno,
+                     struct iatt * preop_stbuf, struct iatt * postop_stbuf, dict_t * xdata)
+{
+        nsr_local_t *local = frame->local;
+        uint8_t remaining;
+
+        gf_log (this->name, GF_LOG_TRACE,
+                "op_ret = %d, op_errno = %d\n", op_ret, op_errno);
+
+        LOCK(&frame->lock);
+        remaining = --(local->call_count);
+        UNLOCK(&frame->lock);
+
+        // TBD: variable Completion count
+        if (remaining != 0) {
+                return 0;
+        }
+
+        call_resume(local->stub);
+        return 0;
+}
+
+/*
+ * Entry point for the fsetattr fop.
+ *
+ * If xdata carries NSR_TERM_XATTR (request from the leader) or both
+ * RECON_TERM_XATTR and RECON_INDEX_XATTR (request from reconciliation), the
+ * fop is wound only to the first child with nsr_fsetattr_complete as the
+ * callback.  Otherwise it is refused with EREMOTE unless priv->leader is set
+ * and priv->fence_io is clear.  An accepted request has priv->current_term
+ * stored in xdata under NSR_TERM_XATTR, gets a stub for
+ * nsr_fsetattr_continue saved in local->stub, and is wound to every child
+ * after the first with nsr_fsetattr_fan_in as the callback.
+ *
+ * Failures unwind with op_ret -1 and op_errno (ENOMEM unless set otherwise).
+ * Always returns 0.
+ */
+int32_t
+nsr_fsetattr (call_frame_t *frame, xlator_t *this,
+              fd_t * fd, struct iatt * stbuf, int32_t valid, dict_t * xdata)
+{
+        nsr_local_t *local = NULL;
+        nsr_private_t *priv = this->private;
+        xlator_list_t *trav;
+        int op_errno = ENOMEM;
+        int from_leader;
+        int from_recon;
+
+        local = mem_get0(this->local_pool);
+        if (!local) {
+                goto err;
+        }
+#if NSR_CG_NEED_FD
+        local->fd = fd_ref(fd);
+#else
+        local->fd = NULL;
+#endif
+        frame->local = local;
+
+        if (xdata) {
+                from_leader = !!dict_get(xdata,NSR_TERM_XATTR);
+                from_recon = !!dict_get(xdata,RECON_TERM_XATTR)
+                          && !!dict_get(xdata,RECON_INDEX_XATTR);
+        }
+        else {
+                from_leader = from_recon = _gf_false;
+        }
+
+        // follower/recon path
+        // just send it to local node
+        if (from_leader || from_recon) {
+                STACK_WIND (frame, nsr_fsetattr_complete,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsetattr,
+                            fd, stbuf, valid, xdata);
+                return 0;
+        }
+
+        if (!priv->leader || priv->fence_io) {
+                op_errno = EREMOTE;
+                goto err;
+        }
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd);
+        if (!ictx) {
+                op_errno = EIO;
+                goto err;
+        }
+        /* TBD: LOCK */
+        if (ictx->active) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "queuing request due to conflict");
+                ++(ictx->pending);
+                /* TBD: actually enqueue */
+        }
+        else {
+                ++(ictx->active);
+        }
+        /* TBD: UNLOCK */
+#endif
+
+        /* NOTE(review): a dict created here is not unref'd anywhere in
+         * this function -- confirm ownership. */
+        if (!xdata) {
+                xdata = dict_new();
+                if (!xdata) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to allocate xdata");
+                        goto err;
+                }
+        }
+
+        if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set nsr-term");
+                goto err;
+        }
+
+        local->stub = fop_fsetattr_stub (frame,nsr_fsetattr_continue,
+                                         fd, stbuf, valid, xdata);
+        if (!local->stub) {
+                goto err;
+        }
+
+        /* NOTE(review): if priv->n_children is 1 the count starts at 0, no
+         * wind is issued below and nothing resumes the stub -- confirm that
+         * configuration cannot occur. */
+        local->call_count = priv->n_children - 1;
+        for (trav = this->children->next; trav; trav = trav->next) {
+                STACK_WIND (frame, nsr_fsetattr_fan_in,
+                            trav->xlator, trav->xlator->fops->fsetattr,
+                            fd, stbuf, valid, xdata);
+        }
+
+        // TBD: variable Issue count
+        return 0;
+
+err:
+        if (local) {
+                if (local->stub) {
+                        call_stub_destroy(local->stub);
+                }
+                if (local->fd) {
+                        fd_unref(local->fd);
+                }
+                /*
+                 * Detach local from the frame before releasing it so that
+                 * frame->local is not left pointing at released memory.
+                 * NOTE(review): frame teardown presumably releases
+                 * frame->local as well, which would be a second release.
+                 */
+                frame->local = NULL;
+                mem_put(local);
+        }
+        STACK_UNWIND_STRICT (fsetattr, frame, -1, op_errno,
+                             NULL, NULL, NULL);
+        return 0;
+}
+
+/*
+ * Last callback for fsetxattr.  Runs whichever bookkeeping is compiled in
+ * (reset of the inode's pending count under NSR_CG_QUEUE,
+ * nsr_mark_fd_dirty under NSR_CG_FSYNC, fd_unref of local->fd under
+ * NSR_CG_NEED_FD) and then unwinds the result it was given, unchanged.
+ * Always returns 0.
+ */
+int32_t
+nsr_fsetxattr_complete (call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno,
+                        dict_t * xdata)
+{
+#if NSR_CG_NEED_FD
+        nsr_local_t *local = frame->local;
+#endif
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd);
+
+        /* TBD: LOCK */
+        if (ictx && ictx->pending) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "unblocking %u requests",
+                        ictx->pending);
+                /* TBD: actually dequeue */
+                ictx->pending = 0;
+        }
+        /* TBD: UNLOCK */
+#endif
+
+#if NSR_CG_FSYNC
+        nsr_mark_fd_dirty(this,local);
+#endif
+
+#if NSR_CG_NEED_FD
+        fd_unref(local->fd);
+#endif
+
+        STACK_UNWIND_STRICT (fsetxattr, frame, op_ret, op_errno,
+                             xdata);
+        return 0;
+}
+/*
+ * Wind fsetxattr to the first child, with nsr_fsetxattr_complete as the
+ * callback.  nsr_fsetxattr passes this function to fop_fsetxattr_stub, so
+ * it presumably runs when that stub is resumed.  Always returns 0.
+ */
+int32_t
+nsr_fsetxattr_continue (call_frame_t *frame, xlator_t *this,
+ fd_t * fd, dict_t * dict, int32_t flags, dict_t * xdata)
+{
+ STACK_WIND (frame, nsr_fsetxattr_complete,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsetxattr,
+ fd, dict, flags, xdata);
+ return 0;
+}
+
+/*
+ * Callback for each fsetxattr that nsr_fsetxattr winds to a child other
+ * than the first.  Logs the result, decrements local->call_count under
+ * frame->lock, and resumes local->stub once the count reaches zero.
+ * Always returns 0.
+ */
+int32_t
+nsr_fsetxattr_fan_in (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno,
+                      dict_t * xdata)
+{
+        nsr_local_t *local = frame->local;
+        uint8_t remaining;
+
+        gf_log (this->name, GF_LOG_TRACE,
+                "op_ret = %d, op_errno = %d\n", op_ret, op_errno);
+
+        LOCK(&frame->lock);
+        remaining = --(local->call_count);
+        UNLOCK(&frame->lock);
+
+        // TBD: variable Completion count
+        if (remaining != 0) {
+                return 0;
+        }
+
+        call_resume(local->stub);
+        return 0;
+}
+
+/*
+ * Entry point for the fsetxattr fop.
+ *
+ * If xdata carries NSR_TERM_XATTR (request from the leader) or both
+ * RECON_TERM_XATTR and RECON_INDEX_XATTR (request from reconciliation), the
+ * fop is wound only to the first child with nsr_fsetxattr_complete as the
+ * callback.  Otherwise it is refused with EREMOTE unless priv->leader is set
+ * and priv->fence_io is clear.  An accepted request has priv->current_term
+ * stored in xdata under NSR_TERM_XATTR, gets a stub for
+ * nsr_fsetxattr_continue saved in local->stub, and is wound to every child
+ * after the first with nsr_fsetxattr_fan_in as the callback.
+ *
+ * Failures unwind with op_ret -1 and op_errno (ENOMEM unless set otherwise).
+ * Always returns 0.
+ */
+int32_t
+nsr_fsetxattr (call_frame_t *frame, xlator_t *this,
+               fd_t * fd, dict_t * dict, int32_t flags, dict_t * xdata)
+{
+        nsr_local_t *local = NULL;
+        nsr_private_t *priv = this->private;
+        xlator_list_t *trav;
+        int op_errno = ENOMEM;
+        int from_leader;
+        int from_recon;
+
+        local = mem_get0(this->local_pool);
+        if (!local) {
+                goto err;
+        }
+#if NSR_CG_NEED_FD
+        local->fd = fd_ref(fd);
+#else
+        local->fd = NULL;
+#endif
+        frame->local = local;
+
+        if (xdata) {
+                from_leader = !!dict_get(xdata,NSR_TERM_XATTR);
+                from_recon = !!dict_get(xdata,RECON_TERM_XATTR)
+                          && !!dict_get(xdata,RECON_INDEX_XATTR);
+        }
+        else {
+                from_leader = from_recon = _gf_false;
+        }
+
+        // follower/recon path
+        // just send it to local node
+        if (from_leader || from_recon) {
+                STACK_WIND (frame, nsr_fsetxattr_complete,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsetxattr,
+                            fd, dict, flags, xdata);
+                return 0;
+        }
+
+        if (!priv->leader || priv->fence_io) {
+                op_errno = EREMOTE;
+                goto err;
+        }
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd);
+        if (!ictx) {
+                op_errno = EIO;
+                goto err;
+        }
+        /* TBD: LOCK */
+        if (ictx->active) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "queuing request due to conflict");
+                ++(ictx->pending);
+                /* TBD: actually enqueue */
+        }
+        else {
+                ++(ictx->active);
+        }
+        /* TBD: UNLOCK */
+#endif
+
+        /* NOTE(review): a dict created here is not unref'd anywhere in
+         * this function -- confirm ownership. */
+        if (!xdata) {
+                xdata = dict_new();
+                if (!xdata) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to allocate xdata");
+                        goto err;
+                }
+        }
+
+        if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set nsr-term");
+                goto err;
+        }
+
+        local->stub = fop_fsetxattr_stub (frame,nsr_fsetxattr_continue,
+                                          fd, dict, flags, xdata);
+        if (!local->stub) {
+                goto err;
+        }
+
+        /* NOTE(review): if priv->n_children is 1 the count starts at 0, no
+         * wind is issued below and nothing resumes the stub -- confirm that
+         * configuration cannot occur. */
+        local->call_count = priv->n_children - 1;
+        for (trav = this->children->next; trav; trav = trav->next) {
+                STACK_WIND (frame, nsr_fsetxattr_fan_in,
+                            trav->xlator, trav->xlator->fops->fsetxattr,
+                            fd, dict, flags, xdata);
+        }
+
+        // TBD: variable Issue count
+        return 0;
+
+err:
+        if (local) {
+                if (local->stub) {
+                        call_stub_destroy(local->stub);
+                }
+                if (local->fd) {
+                        fd_unref(local->fd);
+                }
+                /*
+                 * Detach local from the frame before releasing it so that
+                 * frame->local is not left pointing at released memory.
+                 * NOTE(review): frame teardown presumably releases
+                 * frame->local as well, which would be a second release.
+                 */
+                frame->local = NULL;
+                mem_put(local);
+        }
+        STACK_UNWIND_STRICT (fsetxattr, frame, -1, op_errno,
+                             NULL);
+        return 0;
+}
+
+/* No stub needed for fstat */
+
+/* No cbk needed for fstat */
+
+/*
+ * Entry point for the fstat fop (read path).
+ *
+ * The request is wound to the first child when priv->leader is set, or when
+ * xdata yields both RECON_TERM_XATTR and RECON_INDEX_XATTR as int32 values.
+ * Any other request is unwound with op_ret -1 and EREMOTE.
+ * Always returns 0.
+ */
+int32_t
+nsr_fstat (call_frame_t *frame, xlator_t *this,
+           fd_t * fd, dict_t * xdata)
+{
+        nsr_private_t *priv = this->private;
+        int32_t recon_term, recon_index;
+        gf_boolean_t in_recon;
+
+        // allow reads during reconciliation
+        // TBD: allow "dirty" reads on non-leaders
+        in_recon = (xdata &&
+                    (dict_get_int32(xdata, RECON_TERM_XATTR, &recon_term) == 0) &&
+                    (dict_get_int32(xdata, RECON_INDEX_XATTR, &recon_index) == 0))
+                        ? _gf_true : _gf_false;
+
+        if (priv->leader || (in_recon != _gf_false)) {
+                STACK_WIND (frame, default_fstat_cbk,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->fstat,
+                            fd, xdata);
+                return 0;
+        }
+
+        STACK_UNWIND_STRICT (fstat, frame, -1, EREMOTE,
+                             NULL, NULL);
+        return 0;
+}
+
+/* No code emitted for fsync */
+
+/* No code emitted for fsyncdir */
+
+/*
+ * Last callback for ftruncate.  Runs whichever bookkeeping is compiled in
+ * (reset of the inode's pending count under NSR_CG_QUEUE,
+ * nsr_mark_fd_dirty under NSR_CG_FSYNC, fd_unref of local->fd under
+ * NSR_CG_NEED_FD) and then unwinds the result it was given, unchanged.
+ * Always returns 0.
+ */
+int32_t
+nsr_ftruncate_complete (call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno,
+                        struct iatt * prebuf, struct iatt * postbuf, dict_t * xdata)
+{
+#if NSR_CG_NEED_FD
+        nsr_local_t *local = frame->local;
+#endif
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd);
+
+        /* TBD: LOCK */
+        if (ictx && ictx->pending) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "unblocking %u requests",
+                        ictx->pending);
+                /* TBD: actually dequeue */
+                ictx->pending = 0;
+        }
+        /* TBD: UNLOCK */
+#endif
+
+#if NSR_CG_FSYNC
+        nsr_mark_fd_dirty(this,local);
+#endif
+
+#if NSR_CG_NEED_FD
+        fd_unref(local->fd);
+#endif
+
+        STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno,
+                             prebuf, postbuf, xdata);
+        return 0;
+}
+/*
+ * Wind ftruncate to the first child, with nsr_ftruncate_complete as the
+ * callback.  nsr_ftruncate passes this function to fop_ftruncate_stub, so
+ * it presumably runs when that stub is resumed.  Always returns 0.
+ */
+int32_t
+nsr_ftruncate_continue (call_frame_t *frame, xlator_t *this,
+ fd_t * fd, off_t offset, dict_t * xdata)
+{
+ STACK_WIND (frame, nsr_ftruncate_complete,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->ftruncate,
+ fd, offset, xdata);
+ return 0;
+}
+
+/*
+ * Callback for each ftruncate that nsr_ftruncate winds to a child other
+ * than the first.  Logs the result, decrements local->call_count under
+ * frame->lock, and resumes local->stub once the count reaches zero.
+ * Always returns 0.
+ */
+int32_t
+nsr_ftruncate_fan_in (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno,
+                      struct iatt * prebuf, struct iatt * postbuf, dict_t * xdata)
+{
+        nsr_local_t *local = frame->local;
+        uint8_t remaining;
+
+        gf_log (this->name, GF_LOG_TRACE,
+                "op_ret = %d, op_errno = %d\n", op_ret, op_errno);
+
+        LOCK(&frame->lock);
+        remaining = --(local->call_count);
+        UNLOCK(&frame->lock);
+
+        // TBD: variable Completion count
+        if (remaining != 0) {
+                return 0;
+        }
+
+        call_resume(local->stub);
+        return 0;
+}
+
+/*
+ * Entry point for the ftruncate fop.
+ *
+ * If xdata carries NSR_TERM_XATTR (request from the leader) or both
+ * RECON_TERM_XATTR and RECON_INDEX_XATTR (request from reconciliation), the
+ * fop is wound only to the first child with nsr_ftruncate_complete as the
+ * callback.  Otherwise it is refused with EREMOTE unless priv->leader is set
+ * and priv->fence_io is clear.  An accepted request has priv->current_term
+ * stored in xdata under NSR_TERM_XATTR, gets a stub for
+ * nsr_ftruncate_continue saved in local->stub, and is wound to every child
+ * after the first with nsr_ftruncate_fan_in as the callback.
+ *
+ * Failures unwind with op_ret -1 and op_errno (ENOMEM unless set otherwise).
+ * Always returns 0.
+ */
+int32_t
+nsr_ftruncate (call_frame_t *frame, xlator_t *this,
+               fd_t * fd, off_t offset, dict_t * xdata)
+{
+        nsr_local_t *local = NULL;
+        nsr_private_t *priv = this->private;
+        xlator_list_t *trav;
+        int op_errno = ENOMEM;
+        int from_leader;
+        int from_recon;
+
+        local = mem_get0(this->local_pool);
+        if (!local) {
+                goto err;
+        }
+#if NSR_CG_NEED_FD
+        local->fd = fd_ref(fd);
+#else
+        local->fd = NULL;
+#endif
+        frame->local = local;
+
+        if (xdata) {
+                from_leader = !!dict_get(xdata,NSR_TERM_XATTR);
+                from_recon = !!dict_get(xdata,RECON_TERM_XATTR)
+                          && !!dict_get(xdata,RECON_INDEX_XATTR);
+        }
+        else {
+                from_leader = from_recon = _gf_false;
+        }
+
+        // follower/recon path
+        // just send it to local node
+        if (from_leader || from_recon) {
+                STACK_WIND (frame, nsr_ftruncate_complete,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->ftruncate,
+                            fd, offset, xdata);
+                return 0;
+        }
+
+        if (!priv->leader || priv->fence_io) {
+                op_errno = EREMOTE;
+                goto err;
+        }
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd);
+        if (!ictx) {
+                op_errno = EIO;
+                goto err;
+        }
+        /* TBD: LOCK */
+        if (ictx->active) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "queuing request due to conflict");
+                ++(ictx->pending);
+                /* TBD: actually enqueue */
+        }
+        else {
+                ++(ictx->active);
+        }
+        /* TBD: UNLOCK */
+#endif
+
+        /* NOTE(review): a dict created here is not unref'd anywhere in
+         * this function -- confirm ownership. */
+        if (!xdata) {
+                xdata = dict_new();
+                if (!xdata) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to allocate xdata");
+                        goto err;
+                }
+        }
+
+        if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set nsr-term");
+                goto err;
+        }
+
+        local->stub = fop_ftruncate_stub (frame,nsr_ftruncate_continue,
+                                          fd, offset, xdata);
+        if (!local->stub) {
+                goto err;
+        }
+
+        /* NOTE(review): if priv->n_children is 1 the count starts at 0, no
+         * wind is issued below and nothing resumes the stub -- confirm that
+         * configuration cannot occur. */
+        local->call_count = priv->n_children - 1;
+        for (trav = this->children->next; trav; trav = trav->next) {
+                STACK_WIND (frame, nsr_ftruncate_fan_in,
+                            trav->xlator, trav->xlator->fops->ftruncate,
+                            fd, offset, xdata);
+        }
+
+        // TBD: variable Issue count
+        return 0;
+
+err:
+        if (local) {
+                if (local->stub) {
+                        call_stub_destroy(local->stub);
+                }
+                if (local->fd) {
+                        fd_unref(local->fd);
+                }
+                /*
+                 * Detach local from the frame before releasing it so that
+                 * frame->local is not left pointing at released memory.
+                 * NOTE(review): frame teardown presumably releases
+                 * frame->local as well, which would be a second release.
+                 */
+                frame->local = NULL;
+                mem_put(local);
+        }
+        STACK_UNWIND_STRICT (ftruncate, frame, -1, op_errno,
+                             NULL, NULL, NULL);
+        return 0;
+}
+
+/*
+ * Last callback for fxattrop.  Runs whichever bookkeeping is compiled in
+ * (reset of the inode's pending count under NSR_CG_QUEUE,
+ * nsr_mark_fd_dirty under NSR_CG_FSYNC, fd_unref of local->fd under
+ * NSR_CG_NEED_FD) and then unwinds the result it was given, unchanged.
+ * Always returns 0.
+ */
+int32_t
+nsr_fxattrop_complete (call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno,
+                       dict_t * xattr, dict_t * xdata)
+{
+#if NSR_CG_NEED_FD
+        nsr_local_t *local = frame->local;
+#endif
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd);
+
+        /* TBD: LOCK */
+        if (ictx && ictx->pending) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "unblocking %u requests",
+                        ictx->pending);
+                /* TBD: actually dequeue */
+                ictx->pending = 0;
+        }
+        /* TBD: UNLOCK */
+#endif
+
+#if NSR_CG_FSYNC
+        nsr_mark_fd_dirty(this,local);
+#endif
+
+#if NSR_CG_NEED_FD
+        fd_unref(local->fd);
+#endif
+
+        STACK_UNWIND_STRICT (fxattrop, frame, op_ret, op_errno,
+                             xattr, xdata);
+        return 0;
+}
+/*
+ * Wind fxattrop to the first child, with nsr_fxattrop_complete as the
+ * callback.  nsr_fxattrop passes this function to fop_fxattrop_stub, so it
+ * presumably runs when that stub is resumed.  Always returns 0.
+ */
+int32_t
+nsr_fxattrop_continue (call_frame_t *frame, xlator_t *this,
+ fd_t * fd, gf_xattrop_flags_t optype, dict_t * xattr, dict_t * xdata)
+{
+ STACK_WIND (frame, nsr_fxattrop_complete,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->fxattrop,
+ fd, optype, xattr, xdata);
+ return 0;
+}
+
+/*
+ * Callback for each fxattrop that nsr_fxattrop winds to a child other than
+ * the first.  Logs the result, decrements local->call_count under
+ * frame->lock, and resumes local->stub once the count reaches zero.
+ * Always returns 0.
+ */
+int32_t
+nsr_fxattrop_fan_in (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno,
+                     dict_t * xattr, dict_t * xdata)
+{
+        nsr_local_t *local = frame->local;
+        uint8_t remaining;
+
+        gf_log (this->name, GF_LOG_TRACE,
+                "op_ret = %d, op_errno = %d\n", op_ret, op_errno);
+
+        LOCK(&frame->lock);
+        remaining = --(local->call_count);
+        UNLOCK(&frame->lock);
+
+        // TBD: variable Completion count
+        if (remaining != 0) {
+                return 0;
+        }
+
+        call_resume(local->stub);
+        return 0;
+}
+
+/*
+ * Entry point for the fxattrop fop.
+ *
+ * If xdata carries NSR_TERM_XATTR (request from the leader) or both
+ * RECON_TERM_XATTR and RECON_INDEX_XATTR (request from reconciliation), the
+ * fop is wound only to the first child with nsr_fxattrop_complete as the
+ * callback.  Otherwise it is refused with EREMOTE unless priv->leader is set
+ * and priv->fence_io is clear.  An accepted request has priv->current_term
+ * stored in xdata under NSR_TERM_XATTR, gets a stub for
+ * nsr_fxattrop_continue saved in local->stub, and is wound to every child
+ * after the first with nsr_fxattrop_fan_in as the callback.
+ *
+ * Failures unwind with op_ret -1 and op_errno (ENOMEM unless set otherwise).
+ * Always returns 0.
+ */
+int32_t
+nsr_fxattrop (call_frame_t *frame, xlator_t *this,
+              fd_t * fd, gf_xattrop_flags_t optype, dict_t * xattr, dict_t * xdata)
+{
+        nsr_local_t *local = NULL;
+        nsr_private_t *priv = this->private;
+        xlator_list_t *trav;
+        int op_errno = ENOMEM;
+        int from_leader;
+        int from_recon;
+
+        local = mem_get0(this->local_pool);
+        if (!local) {
+                goto err;
+        }
+#if NSR_CG_NEED_FD
+        local->fd = fd_ref(fd);
+#else
+        local->fd = NULL;
+#endif
+        frame->local = local;
+
+        if (xdata) {
+                from_leader = !!dict_get(xdata,NSR_TERM_XATTR);
+                from_recon = !!dict_get(xdata,RECON_TERM_XATTR)
+                          && !!dict_get(xdata,RECON_INDEX_XATTR);
+        }
+        else {
+                from_leader = from_recon = _gf_false;
+        }
+
+        // follower/recon path
+        // just send it to local node
+        if (from_leader || from_recon) {
+                STACK_WIND (frame, nsr_fxattrop_complete,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->fxattrop,
+                            fd, optype, xattr, xdata);
+                return 0;
+        }
+
+        if (!priv->leader || priv->fence_io) {
+                op_errno = EREMOTE;
+                goto err;
+        }
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd);
+        if (!ictx) {
+                op_errno = EIO;
+                goto err;
+        }
+        /* TBD: LOCK */
+        if (ictx->active) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "queuing request due to conflict");
+                ++(ictx->pending);
+                /* TBD: actually enqueue */
+        }
+        else {
+                ++(ictx->active);
+        }
+        /* TBD: UNLOCK */
+#endif
+
+        /* NOTE(review): a dict created here is not unref'd anywhere in
+         * this function -- confirm ownership. */
+        if (!xdata) {
+                xdata = dict_new();
+                if (!xdata) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to allocate xdata");
+                        goto err;
+                }
+        }
+
+        if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set nsr-term");
+                goto err;
+        }
+
+        local->stub = fop_fxattrop_stub (frame,nsr_fxattrop_continue,
+                                         fd, optype, xattr, xdata);
+        if (!local->stub) {
+                goto err;
+        }
+
+        /* NOTE(review): if priv->n_children is 1 the count starts at 0, no
+         * wind is issued below and nothing resumes the stub -- confirm that
+         * configuration cannot occur. */
+        local->call_count = priv->n_children - 1;
+        for (trav = this->children->next; trav; trav = trav->next) {
+                STACK_WIND (frame, nsr_fxattrop_fan_in,
+                            trav->xlator, trav->xlator->fops->fxattrop,
+                            fd, optype, xattr, xdata);
+        }
+
+        // TBD: variable Issue count
+        return 0;
+
+err:
+        if (local) {
+                if (local->stub) {
+                        call_stub_destroy(local->stub);
+                }
+                if (local->fd) {
+                        fd_unref(local->fd);
+                }
+                /*
+                 * Detach local from the frame before releasing it so that
+                 * frame->local is not left pointing at released memory.
+                 * NOTE(review): frame teardown presumably releases
+                 * frame->local as well, which would be a second release.
+                 */
+                frame->local = NULL;
+                mem_put(local);
+        }
+        STACK_UNWIND_STRICT (fxattrop, frame, -1, op_errno,
+                             NULL, NULL);
+        return 0;
+}
+
+/* No code emitted for getspec */
+
+/* No stub needed for getxattr */
+
+/* No cbk needed for getxattr */
+
+/*
+ * Entry point for the getxattr fop (read path).
+ *
+ * The request is wound to the first child when priv->leader is set, or when
+ * xdata yields both RECON_TERM_XATTR and RECON_INDEX_XATTR as int32 values.
+ * Any other request is unwound with op_ret -1 and EREMOTE.
+ * Always returns 0.
+ */
+int32_t
+nsr_getxattr (call_frame_t *frame, xlator_t *this,
+              loc_t * loc, const char * name, dict_t * xdata)
+{
+        nsr_private_t *priv = this->private;
+        int32_t recon_term, recon_index;
+        gf_boolean_t in_recon;
+
+        // allow reads during reconciliation
+        // TBD: allow "dirty" reads on non-leaders
+        in_recon = (xdata &&
+                    (dict_get_int32(xdata, RECON_TERM_XATTR, &recon_term) == 0) &&
+                    (dict_get_int32(xdata, RECON_INDEX_XATTR, &recon_index) == 0))
+                        ? _gf_true : _gf_false;
+
+        if (priv->leader || (in_recon != _gf_false)) {
+                STACK_WIND (frame, default_getxattr_cbk,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->getxattr,
+                            loc, name, xdata);
+                return 0;
+        }
+
+        STACK_UNWIND_STRICT (getxattr, frame, -1, EREMOTE,
+                             NULL, NULL);
+        return 0;
+}
+
+/* No code emitted for inodelk */
+
+/*
+ * Last callback for link.  Runs whichever bookkeeping is compiled in (reset
+ * of the inode's pending count under NSR_CG_QUEUE, nsr_mark_fd_dirty under
+ * NSR_CG_FSYNC, fd_unref of local->fd under NSR_CG_NEED_FD) and then
+ * unwinds the result it was given, unchanged.  Always returns 0.
+ */
+int32_t
+nsr_link_complete (call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno,
+                   inode_t * inode, struct iatt * buf, struct iatt * preparent, struct iatt * postparent, dict_t * xdata)
+{
+#if NSR_CG_NEED_FD
+        nsr_local_t *local = frame->local;
+#endif
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd);
+
+        /* TBD: LOCK */
+        if (ictx && ictx->pending) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "unblocking %u requests",
+                        ictx->pending);
+                /* TBD: actually dequeue */
+                ictx->pending = 0;
+        }
+        /* TBD: UNLOCK */
+#endif
+
+#if NSR_CG_FSYNC
+        nsr_mark_fd_dirty(this,local);
+#endif
+
+#if NSR_CG_NEED_FD
+        fd_unref(local->fd);
+#endif
+
+        STACK_UNWIND_STRICT (link, frame, op_ret, op_errno,
+                             inode, buf, preparent, postparent, xdata);
+        return 0;
+}
+/*
+ * Wind link to the first child, with nsr_link_complete as the callback.
+ * nsr_link passes this function to fop_link_stub, so it presumably runs
+ * when that stub is resumed.  Always returns 0.
+ */
+int32_t
+nsr_link_continue (call_frame_t *frame, xlator_t *this,
+ loc_t * oldloc, loc_t * newloc, dict_t * xdata)
+{
+ STACK_WIND (frame, nsr_link_complete,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->link,
+ oldloc, newloc, xdata);
+ return 0;
+}
+
+/*
+ * Callback for each link that nsr_link winds to a child other than the
+ * first.  Logs the result, decrements local->call_count under frame->lock,
+ * and resumes local->stub once the count reaches zero.  Always returns 0.
+ */
+int32_t
+nsr_link_fan_in (call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno,
+                 inode_t * inode, struct iatt * buf, struct iatt * preparent, struct iatt * postparent, dict_t * xdata)
+{
+        nsr_local_t *local = frame->local;
+        uint8_t remaining;
+
+        gf_log (this->name, GF_LOG_TRACE,
+                "op_ret = %d, op_errno = %d\n", op_ret, op_errno);
+
+        LOCK(&frame->lock);
+        remaining = --(local->call_count);
+        UNLOCK(&frame->lock);
+
+        // TBD: variable Completion count
+        if (remaining != 0) {
+                return 0;
+        }
+
+        call_resume(local->stub);
+        return 0;
+}
+
+/*
+ * Entry point for the link fop.
+ *
+ * If xdata carries NSR_TERM_XATTR (request from the leader) or both
+ * RECON_TERM_XATTR and RECON_INDEX_XATTR (request from reconciliation), the
+ * fop is wound only to the first child with nsr_link_complete as the
+ * callback.  Otherwise it is refused with EREMOTE unless priv->leader is set
+ * and priv->fence_io is clear.  An accepted request has priv->current_term
+ * stored in xdata under NSR_TERM_XATTR, gets a stub for nsr_link_continue
+ * saved in local->stub, and is wound to every child after the first with
+ * nsr_link_fan_in as the callback.
+ *
+ * Failures unwind with op_ret -1 and op_errno (ENOMEM unless set otherwise).
+ * Always returns 0.
+ */
+int32_t
+nsr_link (call_frame_t *frame, xlator_t *this,
+          loc_t * oldloc, loc_t * newloc, dict_t * xdata)
+{
+        nsr_local_t *local = NULL;
+        nsr_private_t *priv = this->private;
+        xlator_list_t *trav;
+        int op_errno = ENOMEM;
+        int from_leader;
+        int from_recon;
+
+        local = mem_get0(this->local_pool);
+        if (!local) {
+                goto err;
+        }
+        /* NOTE(review): this fop has no fd argument, so the NSR_CG_NEED_FD
+         * and NSR_CG_QUEUE branches reference an undeclared name -- confirm
+         * they are never enabled here. */
+#if NSR_CG_NEED_FD
+        local->fd = fd_ref(fd);
+#else
+        local->fd = NULL;
+#endif
+        frame->local = local;
+
+        if (xdata) {
+                from_leader = !!dict_get(xdata,NSR_TERM_XATTR);
+                from_recon = !!dict_get(xdata,RECON_TERM_XATTR)
+                          && !!dict_get(xdata,RECON_INDEX_XATTR);
+        }
+        else {
+                from_leader = from_recon = _gf_false;
+        }
+
+        // follower/recon path
+        // just send it to local node
+        if (from_leader || from_recon) {
+                STACK_WIND (frame, nsr_link_complete,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->link,
+                            oldloc, newloc, xdata);
+                return 0;
+        }
+
+        if (!priv->leader || priv->fence_io) {
+                op_errno = EREMOTE;
+                goto err;
+        }
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd);
+        if (!ictx) {
+                op_errno = EIO;
+                goto err;
+        }
+        /* TBD: LOCK */
+        if (ictx->active) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "queuing request due to conflict");
+                ++(ictx->pending);
+                /* TBD: actually enqueue */
+        }
+        else {
+                ++(ictx->active);
+        }
+        /* TBD: UNLOCK */
+#endif
+
+        /* NOTE(review): a dict created here is not unref'd anywhere in
+         * this function -- confirm ownership. */
+        if (!xdata) {
+                xdata = dict_new();
+                if (!xdata) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to allocate xdata");
+                        goto err;
+                }
+        }
+
+        if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set nsr-term");
+                goto err;
+        }
+
+        local->stub = fop_link_stub (frame,nsr_link_continue,
+                                     oldloc, newloc, xdata);
+        if (!local->stub) {
+                goto err;
+        }
+
+        /* NOTE(review): if priv->n_children is 1 the count starts at 0, no
+         * wind is issued below and nothing resumes the stub -- confirm that
+         * configuration cannot occur. */
+        local->call_count = priv->n_children - 1;
+        for (trav = this->children->next; trav; trav = trav->next) {
+                STACK_WIND (frame, nsr_link_fan_in,
+                            trav->xlator, trav->xlator->fops->link,
+                            oldloc, newloc, xdata);
+        }
+
+        // TBD: variable Issue count
+        return 0;
+
+err:
+        if (local) {
+                if (local->stub) {
+                        call_stub_destroy(local->stub);
+                }
+                if (local->fd) {
+                        fd_unref(local->fd);
+                }
+                /*
+                 * Detach local from the frame before releasing it so that
+                 * frame->local is not left pointing at released memory.
+                 * NOTE(review): frame teardown presumably releases
+                 * frame->local as well, which would be a second release.
+                 */
+                frame->local = NULL;
+                mem_put(local);
+        }
+        STACK_UNWIND_STRICT (link, frame, -1, op_errno,
+                             NULL, NULL, NULL, NULL, NULL);
+        return 0;
+}
+
+/* No code emitted for lk */
+
+/* No code emitted for lookup */
+
+/*
+ * Last callback for mkdir.  Runs whichever bookkeeping is compiled in
+ * (reset of the inode's pending count under NSR_CG_QUEUE,
+ * nsr_mark_fd_dirty under NSR_CG_FSYNC, fd_unref of local->fd under
+ * NSR_CG_NEED_FD) and then unwinds the result it was given, unchanged.
+ * Always returns 0.
+ */
+int32_t
+nsr_mkdir_complete (call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno,
+                    inode_t * inode, struct iatt * buf, struct iatt * preparent, struct iatt * postparent, dict_t * xdata)
+{
+#if NSR_CG_NEED_FD
+        nsr_local_t *local = frame->local;
+#endif
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd);
+
+        /* TBD: LOCK */
+        if (ictx && ictx->pending) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "unblocking %u requests",
+                        ictx->pending);
+                /* TBD: actually dequeue */
+                ictx->pending = 0;
+        }
+        /* TBD: UNLOCK */
+#endif
+
+#if NSR_CG_FSYNC
+        nsr_mark_fd_dirty(this,local);
+#endif
+
+#if NSR_CG_NEED_FD
+        fd_unref(local->fd);
+#endif
+
+        STACK_UNWIND_STRICT (mkdir, frame, op_ret, op_errno,
+                             inode, buf, preparent, postparent, xdata);
+        return 0;
+}
+/*
+ * Wind mkdir to the first child, with nsr_mkdir_complete as the callback.
+ * nsr_mkdir passes this function to fop_mkdir_stub, so it presumably runs
+ * when that stub is resumed.  Always returns 0.
+ */
+int32_t
+nsr_mkdir_continue (call_frame_t *frame, xlator_t *this,
+ loc_t * loc, mode_t mode, mode_t umask, dict_t * xdata)
+{
+ STACK_WIND (frame, nsr_mkdir_complete,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir,
+ loc, mode, umask, xdata);
+ return 0;
+}
+
+/*
+ * Callback for each mkdir that nsr_mkdir winds to a child other than the
+ * first.  Logs the result, decrements local->call_count under frame->lock,
+ * and resumes local->stub once the count reaches zero.  Always returns 0.
+ */
+int32_t
+nsr_mkdir_fan_in (call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno,
+                  inode_t * inode, struct iatt * buf, struct iatt * preparent, struct iatt * postparent, dict_t * xdata)
+{
+        nsr_local_t *local = frame->local;
+        uint8_t remaining;
+
+        gf_log (this->name, GF_LOG_TRACE,
+                "op_ret = %d, op_errno = %d\n", op_ret, op_errno);
+
+        LOCK(&frame->lock);
+        remaining = --(local->call_count);
+        UNLOCK(&frame->lock);
+
+        // TBD: variable Completion count
+        if (remaining != 0) {
+                return 0;
+        }
+
+        call_resume(local->stub);
+        return 0;
+}
+
+/*
+ * Entry point for the mkdir fop.
+ *
+ * If xdata carries NSR_TERM_XATTR (request from the leader) or both
+ * RECON_TERM_XATTR and RECON_INDEX_XATTR (request from reconciliation), the
+ * fop is wound only to the first child with nsr_mkdir_complete as the
+ * callback.  Otherwise it is refused with EREMOTE unless priv->leader is set
+ * and priv->fence_io is clear.  An accepted request has priv->current_term
+ * stored in xdata under NSR_TERM_XATTR, gets a stub for nsr_mkdir_continue
+ * saved in local->stub, and is wound to every child after the first with
+ * nsr_mkdir_fan_in as the callback.
+ *
+ * Failures unwind with op_ret -1 and op_errno (ENOMEM unless set otherwise).
+ * Always returns 0.
+ */
+int32_t
+nsr_mkdir (call_frame_t *frame, xlator_t *this,
+           loc_t * loc, mode_t mode, mode_t umask, dict_t * xdata)
+{
+        nsr_local_t *local = NULL;
+        nsr_private_t *priv = this->private;
+        xlator_list_t *trav;
+        int op_errno = ENOMEM;
+        int from_leader;
+        int from_recon;
+
+        local = mem_get0(this->local_pool);
+        if (!local) {
+                goto err;
+        }
+        /* NOTE(review): this fop has no fd argument, so the NSR_CG_NEED_FD
+         * and NSR_CG_QUEUE branches reference an undeclared name -- confirm
+         * they are never enabled here. */
+#if NSR_CG_NEED_FD
+        local->fd = fd_ref(fd);
+#else
+        local->fd = NULL;
+#endif
+        frame->local = local;
+
+        if (xdata) {
+                from_leader = !!dict_get(xdata,NSR_TERM_XATTR);
+                from_recon = !!dict_get(xdata,RECON_TERM_XATTR)
+                          && !!dict_get(xdata,RECON_INDEX_XATTR);
+        }
+        else {
+                from_leader = from_recon = _gf_false;
+        }
+
+        // follower/recon path
+        // just send it to local node
+        if (from_leader || from_recon) {
+                STACK_WIND (frame, nsr_mkdir_complete,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir,
+                            loc, mode, umask, xdata);
+                return 0;
+        }
+
+        if (!priv->leader || priv->fence_io) {
+                op_errno = EREMOTE;
+                goto err;
+        }
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd);
+        if (!ictx) {
+                op_errno = EIO;
+                goto err;
+        }
+        /* TBD: LOCK */
+        if (ictx->active) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "queuing request due to conflict");
+                ++(ictx->pending);
+                /* TBD: actually enqueue */
+        }
+        else {
+                ++(ictx->active);
+        }
+        /* TBD: UNLOCK */
+#endif
+
+        /* NOTE(review): a dict created here is not unref'd anywhere in
+         * this function -- confirm ownership. */
+        if (!xdata) {
+                xdata = dict_new();
+                if (!xdata) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to allocate xdata");
+                        goto err;
+                }
+        }
+
+        if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set nsr-term");
+                goto err;
+        }
+
+        local->stub = fop_mkdir_stub (frame,nsr_mkdir_continue,
+                                      loc, mode, umask, xdata);
+        if (!local->stub) {
+                goto err;
+        }
+
+        /* NOTE(review): if priv->n_children is 1 the count starts at 0, no
+         * wind is issued below and nothing resumes the stub -- confirm that
+         * configuration cannot occur. */
+        local->call_count = priv->n_children - 1;
+        for (trav = this->children->next; trav; trav = trav->next) {
+                STACK_WIND (frame, nsr_mkdir_fan_in,
+                            trav->xlator, trav->xlator->fops->mkdir,
+                            loc, mode, umask, xdata);
+        }
+
+        // TBD: variable Issue count
+        return 0;
+
+err:
+        if (local) {
+                if (local->stub) {
+                        call_stub_destroy(local->stub);
+                }
+                if (local->fd) {
+                        fd_unref(local->fd);
+                }
+                /*
+                 * Detach local from the frame before releasing it so that
+                 * frame->local is not left pointing at released memory.
+                 * NOTE(review): frame teardown presumably releases
+                 * frame->local as well, which would be a second release.
+                 */
+                frame->local = NULL;
+                mem_put(local);
+        }
+        STACK_UNWIND_STRICT (mkdir, frame, -1, op_errno,
+                             NULL, NULL, NULL, NULL, NULL);
+        return 0;
+}
+
+/*
+ * Last callback for mknod.  Runs whichever bookkeeping is compiled in
+ * (reset of the inode's pending count under NSR_CG_QUEUE,
+ * nsr_mark_fd_dirty under NSR_CG_FSYNC, fd_unref of local->fd under
+ * NSR_CG_NEED_FD) and then unwinds the result it was given, unchanged.
+ * Always returns 0.
+ */
+int32_t
+nsr_mknod_complete (call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno,
+                    inode_t * inode, struct iatt * buf, struct iatt * preparent, struct iatt * postparent, dict_t * xdata)
+{
+#if NSR_CG_NEED_FD
+        nsr_local_t *local = frame->local;
+#endif
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd);
+
+        /* TBD: LOCK */
+        if (ictx && ictx->pending) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "unblocking %u requests",
+                        ictx->pending);
+                /* TBD: actually dequeue */
+                ictx->pending = 0;
+        }
+        /* TBD: UNLOCK */
+#endif
+
+#if NSR_CG_FSYNC
+        nsr_mark_fd_dirty(this,local);
+#endif
+
+#if NSR_CG_NEED_FD
+        fd_unref(local->fd);
+#endif
+
+        STACK_UNWIND_STRICT (mknod, frame, op_ret, op_errno,
+                             inode, buf, preparent, postparent, xdata);
+        return 0;
+}
+/*
+ * Wind mknod to the first child, with nsr_mknod_complete as the callback.
+ * nsr_mknod passes this function to fop_mknod_stub, so it presumably runs
+ * when that stub is resumed.  Always returns 0.
+ */
+int32_t
+nsr_mknod_continue (call_frame_t *frame, xlator_t *this,
+ loc_t * loc, mode_t mode, dev_t rdev, mode_t umask, dict_t * xdata)
+{
+ STACK_WIND (frame, nsr_mknod_complete,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod,
+ loc, mode, rdev, umask, xdata);
+ return 0;
+}
+
+/*
+ * Callback for each mknod that nsr_mknod winds to a child other than the
+ * first.  Logs the result, decrements local->call_count under frame->lock,
+ * and resumes local->stub once the count reaches zero.  Always returns 0.
+ */
+int32_t
+nsr_mknod_fan_in (call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno,
+                  inode_t * inode, struct iatt * buf, struct iatt * preparent, struct iatt * postparent, dict_t * xdata)
+{
+        nsr_local_t *local = frame->local;
+        uint8_t remaining;
+
+        gf_log (this->name, GF_LOG_TRACE,
+                "op_ret = %d, op_errno = %d\n", op_ret, op_errno);
+
+        LOCK(&frame->lock);
+        remaining = --(local->call_count);
+        UNLOCK(&frame->lock);
+
+        // TBD: variable Completion count
+        if (remaining != 0) {
+                return 0;
+        }
+
+        call_resume(local->stub);
+        return 0;
+}
+
+/*
+ * Entry point for the mknod fop.
+ *
+ * A request whose xdata already carries NSR_TERM_XATTR, or both
+ * RECON_TERM_XATTR and RECON_INDEX_XATTR, is wound straight to the first
+ * child.  Any other request is refused with EREMOTE unless priv->leader is
+ * set and priv->fence_io is clear; otherwise it is tagged with the current
+ * term, saved as a stub that will run nsr_mknod_continue, and wound to every
+ * child after the first with nsr_mknod_fan_in as the callback.
+ *
+ * Always returns 0; failures are reported by unwinding with op_ret -1 and
+ * op_errno set (ENOMEM unless a more specific code was chosen).
+ */
+int32_t
+nsr_mknod (call_frame_t *frame, xlator_t *this,
+           loc_t * loc, mode_t mode, dev_t rdev, mode_t umask, dict_t * xdata)
+{
+        nsr_local_t     *local = NULL;
+        nsr_private_t   *priv = this->private;
+        xlator_list_t   *trav;
+        dict_t          *new_xdata = NULL;
+        int             op_errno = ENOMEM;
+        int             from_leader;
+        int             from_recon;
+
+        local = mem_get0(this->local_pool);
+        if (!local) {
+                goto err;
+        }
+#if NSR_CG_NEED_FD
+        local->fd = fd_ref(fd);
+#else
+        local->fd = NULL;
+#endif
+        frame->local = local;
+
+        if (xdata) {
+                from_leader = !!dict_get(xdata,NSR_TERM_XATTR);
+                from_recon = !!dict_get(xdata,RECON_TERM_XATTR)
+                          && !!dict_get(xdata,RECON_INDEX_XATTR);
+        }
+        else {
+                from_leader = from_recon = _gf_false;
+        }
+
+        // follower/recon path
+        // just send it to local node
+        if (from_leader || from_recon) {
+                STACK_WIND (frame, nsr_mknod_complete,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod,
+                            loc, mode, rdev, umask, xdata);
+                return 0;
+        }
+
+        if (!priv->leader || priv->fence_io) {
+                op_errno = EREMOTE;
+                goto err;
+        }
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd);
+        if (!ictx) {
+                op_errno = EIO;
+                goto err;
+        }
+        /* TBD: LOCK */
+        if (ictx->active) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "queuing request due to conflict");
+                ++(ictx->pending);
+                /* TBD: actually enqueue */
+        }
+        else {
+                ++(ictx->active);
+        }
+        /* TBD: UNLOCK */
+#endif
+
+        if (!xdata) {
+                /* Track a dict created here so our reference is dropped. */
+                new_xdata = dict_new();
+                if (!new_xdata) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to allocate xdata");
+                        goto err;
+                }
+                xdata = new_xdata;
+        }
+
+        if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set nsr-term");
+                goto err;
+        }
+
+        local->stub = fop_mknod_stub (frame,nsr_mknod_continue,
+                                      loc, mode, rdev, umask, xdata);
+        if (!local->stub) {
+                goto err;
+        }
+
+        /*
+         * NOTE(review): if there is no child beyond the first, nothing is
+         * wound below and the stub is never resumed from here - confirm that
+         * single-child configurations are handled elsewhere.
+         */
+        local->call_count = priv->n_children - 1;
+        for (trav = this->children->next; trav; trav = trav->next) {
+                STACK_WIND (frame, nsr_mknod_fan_in,
+                            trav->xlator, trav->xlator->fops->mknod,
+                            loc, mode, rdev, umask, xdata);
+        }
+
+        /* The stub took its own reference; release the one from dict_new. */
+        if (new_xdata) {
+                dict_unref(new_xdata);
+        }
+
+        // TBD: variable Issue count
+        return 0;
+
+err:
+        if (new_xdata) {
+                dict_unref(new_xdata);
+        }
+        if (local) {
+                if (local->stub) {
+                        call_stub_destroy(local->stub);
+                }
+                if (local->fd) {
+                        fd_unref(local->fd);
+                }
+                /*
+                 * Detach before releasing: frame->local may already point at
+                 * this object, and leaving it set would let frame teardown
+                 * release it a second time.
+                 */
+                frame->local = NULL;
+                mem_put(local);
+        }
+        STACK_UNWIND_STRICT (mknod, frame, -1, op_errno,
+                             NULL, NULL, NULL, NULL, NULL);
+        return 0;
+}
+
+/*
+ * Callback that nsr_open and nsr_open_continue hand to STACK_WIND when
+ * sending open to the first child.  It performs whatever bookkeeping the
+ * compile-time NSR_CG_* switches enable and then unwinds the child's result
+ * unchanged.
+ */
+int32_t
+nsr_open_complete (call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno,
+                   fd_t *fd, dict_t *xdata)
+{
+#if NSR_CG_NEED_FD
+        nsr_local_t *local = frame->local;
+#endif
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd);
+
+        /* TBD: LOCK */
+        if (ictx && ictx->pending) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "unblocking %u requests",
+                        ictx->pending);
+                /* TBD: actually dequeue */
+                ictx->pending = 0;
+        }
+        /* TBD: UNLOCK */
+#endif
+
+#if NSR_CG_FSYNC
+        nsr_mark_fd_dirty(this,local);
+#endif
+
+#if NSR_CG_NEED_FD
+        /* Drop the fd reference held in the frame's local state. */
+        fd_unref(local->fd);
+#endif
+
+        STACK_UNWIND_STRICT (open, frame, op_ret, op_errno,
+                             fd, xdata);
+        return 0;
+}
+/*
+ * Stub-resume target for open: nsr_open registers this function with
+ * fop_open_stub, and nsr_open_fan_in resumes that stub once its call count
+ * reaches zero.  It only winds the fop to the first child, with
+ * nsr_open_complete as the callback.  Always returns 0.
+ */
+int32_t
+nsr_open_continue (call_frame_t *frame, xlator_t *this,
+ loc_t * loc, int32_t flags, fd_t * fd, dict_t * xdata)
+{
+ STACK_WIND (frame, nsr_open_complete,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->open,
+ loc, flags, fd, xdata);
+ return 0;
+}
+
+/*
+ * Callback nsr_open uses for the copies of open it winds to every child
+ * after the first.  Each reply is traced and decrements the call count kept
+ * in the frame's local state; the reply that brings it to zero resumes the
+ * saved stub.  The reply's own result is not propagated.
+ */
+int32_t
+nsr_open_fan_in (call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno,
+                 fd_t *fd, dict_t *xdata)
+{
+        nsr_local_t *state = frame->local;
+        uint8_t      remaining;
+
+        gf_log (this->name, GF_LOG_TRACE,
+                "op_ret = %d, op_errno = %d\n", op_ret, op_errno);
+
+        /* The counter is updated under the frame lock. */
+        LOCK(&frame->lock);
+        remaining = --(state->call_count);
+        UNLOCK(&frame->lock);
+
+        // TBD: variable Completion count
+        if (remaining != 0) {
+                return 0;
+        }
+
+        call_resume(state->stub);
+        return 0;
+}
+
+/*
+ * Entry point for the open fop.
+ *
+ * A request whose xdata already carries NSR_TERM_XATTR, or both
+ * RECON_TERM_XATTR and RECON_INDEX_XATTR, is wound straight to the first
+ * child.  Any other request is refused with EREMOTE unless priv->leader is
+ * set and priv->fence_io is clear; otherwise it is tagged with the current
+ * term, saved as a stub that will run nsr_open_continue, and wound to every
+ * child after the first with nsr_open_fan_in as the callback.
+ *
+ * Always returns 0; failures are reported by unwinding with op_ret -1 and
+ * op_errno set (ENOMEM unless a more specific code was chosen).
+ */
+int32_t
+nsr_open (call_frame_t *frame, xlator_t *this,
+          loc_t * loc, int32_t flags, fd_t * fd, dict_t * xdata)
+{
+        nsr_local_t     *local = NULL;
+        nsr_private_t   *priv = this->private;
+        xlator_list_t   *trav;
+        dict_t          *new_xdata = NULL;
+        int             op_errno = ENOMEM;
+        int             from_leader;
+        int             from_recon;
+
+        local = mem_get0(this->local_pool);
+        if (!local) {
+                goto err;
+        }
+#if NSR_CG_NEED_FD
+        local->fd = fd_ref(fd);
+#else
+        local->fd = NULL;
+#endif
+        frame->local = local;
+
+        if (xdata) {
+                from_leader = !!dict_get(xdata,NSR_TERM_XATTR);
+                from_recon = !!dict_get(xdata,RECON_TERM_XATTR)
+                          && !!dict_get(xdata,RECON_INDEX_XATTR);
+        }
+        else {
+                from_leader = from_recon = _gf_false;
+        }
+
+        // follower/recon path
+        // just send it to local node
+        if (from_leader || from_recon) {
+                STACK_WIND (frame, nsr_open_complete,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->open,
+                            loc, flags, fd, xdata);
+                return 0;
+        }
+
+        if (!priv->leader || priv->fence_io) {
+                op_errno = EREMOTE;
+                goto err;
+        }
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd);
+        if (!ictx) {
+                op_errno = EIO;
+                goto err;
+        }
+        /* TBD: LOCK */
+        if (ictx->active) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "queuing request due to conflict");
+                ++(ictx->pending);
+                /* TBD: actually enqueue */
+        }
+        else {
+                ++(ictx->active);
+        }
+        /* TBD: UNLOCK */
+#endif
+
+        if (!xdata) {
+                /* Track a dict created here so our reference is dropped. */
+                new_xdata = dict_new();
+                if (!new_xdata) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to allocate xdata");
+                        goto err;
+                }
+                xdata = new_xdata;
+        }
+
+        if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set nsr-term");
+                goto err;
+        }
+
+        local->stub = fop_open_stub (frame,nsr_open_continue,
+                                     loc, flags, fd, xdata);
+        if (!local->stub) {
+                goto err;
+        }
+
+        /*
+         * NOTE(review): if there is no child beyond the first, nothing is
+         * wound below and the stub is never resumed from here - confirm that
+         * single-child configurations are handled elsewhere.
+         */
+        local->call_count = priv->n_children - 1;
+        for (trav = this->children->next; trav; trav = trav->next) {
+                STACK_WIND (frame, nsr_open_fan_in,
+                            trav->xlator, trav->xlator->fops->open,
+                            loc, flags, fd, xdata);
+        }
+
+        /* The stub took its own reference; release the one from dict_new. */
+        if (new_xdata) {
+                dict_unref(new_xdata);
+        }
+
+        // TBD: variable Issue count
+        return 0;
+
+err:
+        if (new_xdata) {
+                dict_unref(new_xdata);
+        }
+        if (local) {
+                if (local->stub) {
+                        call_stub_destroy(local->stub);
+                }
+                if (local->fd) {
+                        fd_unref(local->fd);
+                }
+                /*
+                 * Detach before releasing: frame->local may already point at
+                 * this object, and leaving it set would let frame teardown
+                 * release it a second time.
+                 */
+                frame->local = NULL;
+                mem_put(local);
+        }
+        STACK_UNWIND_STRICT (open, frame, -1, op_errno,
+                             NULL, NULL);
+        return 0;
+}
+
+/* No stub needed for opendir */
+
+/* No cbk needed for opendir */
+
+/*
+ * Entry point for opendir.  The request is wound to the first child when
+ * priv->leader is set, or when xdata supplies both RECON_TERM_XATTR and
+ * RECON_INDEX_XATTR as int32 values; otherwise it is unwound with
+ * op_ret -1 / EREMOTE.  Always returns 0.
+ */
+int32_t
+nsr_opendir (call_frame_t *frame, xlator_t *this,
+             loc_t *loc, fd_t *fd, dict_t *xdata)
+{
+        nsr_private_t *priv = this->private;
+        int32_t        term_val;
+        int32_t        index_val;
+        gf_boolean_t   recon_request;
+
+        // allow reads during reconciliation
+        // TBD: allow "dirty" reads on non-leaders
+        recon_request = xdata
+                && (dict_get_int32(xdata, RECON_TERM_XATTR, &term_val) == 0)
+                && (dict_get_int32(xdata, RECON_INDEX_XATTR, &index_val) == 0);
+
+        if (!priv->leader && !recon_request) {
+                STACK_UNWIND_STRICT (opendir, frame, -1, EREMOTE,
+                                     NULL, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, default_opendir_cbk,
+                    FIRST_CHILD(this), FIRST_CHILD(this)->fops->opendir,
+                    loc, fd, xdata);
+        return 0;
+}
+
+/* No stub needed for rchecksum */
+
+/* No cbk needed for rchecksum */
+
+/*
+ * Entry point for rchecksum.  The request is wound to the first child when
+ * priv->leader is set, or when xdata supplies both RECON_TERM_XATTR and
+ * RECON_INDEX_XATTR as int32 values; otherwise it is unwound with
+ * op_ret -1 / EREMOTE.  Always returns 0.
+ */
+int32_t
+nsr_rchecksum (call_frame_t *frame, xlator_t *this,
+               fd_t *fd, off_t offset, int32_t len, dict_t *xdata)
+{
+        nsr_private_t *priv = this->private;
+        int32_t        term_val;
+        int32_t        index_val;
+        gf_boolean_t   recon_request;
+
+        // allow reads during reconciliation
+        // TBD: allow "dirty" reads on non-leaders
+        recon_request = xdata
+                && (dict_get_int32(xdata, RECON_TERM_XATTR, &term_val) == 0)
+                && (dict_get_int32(xdata, RECON_INDEX_XATTR, &index_val) == 0);
+
+        if (!priv->leader && !recon_request) {
+                STACK_UNWIND_STRICT (rchecksum, frame, -1, EREMOTE,
+                                     0, NULL, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, default_rchecksum_cbk,
+                    FIRST_CHILD(this), FIRST_CHILD(this)->fops->rchecksum,
+                    fd, offset, len, xdata);
+        return 0;
+}
+
+/* No stub needed for readdir */
+
+/* No cbk needed for readdir */
+
+/*
+ * Entry point for readdir.  The request is wound to the first child when
+ * priv->leader is set, or when xdata supplies both RECON_TERM_XATTR and
+ * RECON_INDEX_XATTR as int32 values; otherwise it is unwound with
+ * op_ret -1 / EREMOTE.  Always returns 0.
+ */
+int32_t
+nsr_readdir (call_frame_t *frame, xlator_t *this,
+             fd_t *fd, size_t size, off_t offset, dict_t *xdata)
+{
+        nsr_private_t *priv = this->private;
+        int32_t        term_val;
+        int32_t        index_val;
+        gf_boolean_t   recon_request;
+
+        // allow reads during reconciliation
+        // TBD: allow "dirty" reads on non-leaders
+        recon_request = xdata
+                && (dict_get_int32(xdata, RECON_TERM_XATTR, &term_val) == 0)
+                && (dict_get_int32(xdata, RECON_INDEX_XATTR, &index_val) == 0);
+
+        if (!priv->leader && !recon_request) {
+                STACK_UNWIND_STRICT (readdir, frame, -1, EREMOTE,
+                                     NULL, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, default_readdir_cbk,
+                    FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdir,
+                    fd, size, offset, xdata);
+        return 0;
+}
+
+/* No stub needed for readdirp */
+
+/* No cbk needed for readdirp */
+
+/*
+ * Entry point for readdirp.  The request is wound to the first child when
+ * priv->leader is set, or when xdata supplies both RECON_TERM_XATTR and
+ * RECON_INDEX_XATTR as int32 values; otherwise it is unwound with
+ * op_ret -1 / EREMOTE.  Always returns 0.
+ */
+int32_t
+nsr_readdirp (call_frame_t *frame, xlator_t *this,
+              fd_t *fd, size_t size, off_t offset, dict_t *xdata)
+{
+        nsr_private_t *priv = this->private;
+        int32_t        term_val;
+        int32_t        index_val;
+        gf_boolean_t   recon_request;
+
+        // allow reads during reconciliation
+        // TBD: allow "dirty" reads on non-leaders
+        recon_request = xdata
+                && (dict_get_int32(xdata, RECON_TERM_XATTR, &term_val) == 0)
+                && (dict_get_int32(xdata, RECON_INDEX_XATTR, &index_val) == 0);
+
+        if (!priv->leader && !recon_request) {
+                STACK_UNWIND_STRICT (readdirp, frame, -1, EREMOTE,
+                                     NULL, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, default_readdirp_cbk,
+                    FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdirp,
+                    fd, size, offset, xdata);
+        return 0;
+}
+
+/* No stub needed for readlink */
+
+/* No cbk needed for readlink */
+
+/*
+ * Entry point for readlink.  The request is wound to the first child when
+ * priv->leader is set, or when xdata supplies both RECON_TERM_XATTR and
+ * RECON_INDEX_XATTR as int32 values; otherwise it is unwound with
+ * op_ret -1 / EREMOTE.  Always returns 0.
+ */
+int32_t
+nsr_readlink (call_frame_t *frame, xlator_t *this,
+              loc_t *loc, size_t size, dict_t *xdata)
+{
+        nsr_private_t *priv = this->private;
+        int32_t        term_val;
+        int32_t        index_val;
+        gf_boolean_t   recon_request;
+
+        // allow reads during reconciliation
+        // TBD: allow "dirty" reads on non-leaders
+        recon_request = xdata
+                && (dict_get_int32(xdata, RECON_TERM_XATTR, &term_val) == 0)
+                && (dict_get_int32(xdata, RECON_INDEX_XATTR, &index_val) == 0);
+
+        if (!priv->leader && !recon_request) {
+                STACK_UNWIND_STRICT (readlink, frame, -1, EREMOTE,
+                                     NULL, NULL, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, default_readlink_cbk,
+                    FIRST_CHILD(this), FIRST_CHILD(this)->fops->readlink,
+                    loc, size, xdata);
+        return 0;
+}
+
+/* No stub needed for readv */
+
+/* No cbk needed for readv */
+
+/*
+ * Entry point for readv.  The request is wound to the first child when
+ * priv->leader is set, or when xdata supplies both RECON_TERM_XATTR and
+ * RECON_INDEX_XATTR as int32 values; otherwise it is unwound with
+ * op_ret -1 / EREMOTE.  Always returns 0.
+ */
+int32_t
+nsr_readv (call_frame_t *frame, xlator_t *this,
+           fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata)
+{
+        nsr_private_t *priv = this->private;
+        int32_t        term_val;
+        int32_t        index_val;
+        gf_boolean_t   recon_request;
+
+        // allow reads during reconciliation
+        // TBD: allow "dirty" reads on non-leaders
+        recon_request = xdata
+                && (dict_get_int32(xdata, RECON_TERM_XATTR, &term_val) == 0)
+                && (dict_get_int32(xdata, RECON_INDEX_XATTR, &index_val) == 0);
+
+        if (!priv->leader && !recon_request) {
+                STACK_UNWIND_STRICT (readv, frame, -1, EREMOTE,
+                                     NULL, 0, NULL, NULL, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, default_readv_cbk,
+                    FIRST_CHILD(this), FIRST_CHILD(this)->fops->readv,
+                    fd, size, offset, flags, xdata);
+        return 0;
+}
+
+/*
+ * Callback that nsr_removexattr and nsr_removexattr_continue hand to
+ * STACK_WIND when sending removexattr to the first child.  It performs
+ * whatever bookkeeping the compile-time NSR_CG_* switches enable and then
+ * unwinds the child's result unchanged.
+ */
+int32_t
+nsr_removexattr_complete (call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno,
+                          dict_t *xdata)
+{
+#if NSR_CG_NEED_FD
+        nsr_local_t *local = frame->local;
+#endif
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd);
+
+        /* TBD: LOCK */
+        if (ictx && ictx->pending) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "unblocking %u requests",
+                        ictx->pending);
+                /* TBD: actually dequeue */
+                ictx->pending = 0;
+        }
+        /* TBD: UNLOCK */
+#endif
+
+#if NSR_CG_FSYNC
+        nsr_mark_fd_dirty(this,local);
+#endif
+
+#if NSR_CG_NEED_FD
+        /* Drop the fd reference held in the frame's local state. */
+        fd_unref(local->fd);
+#endif
+
+        STACK_UNWIND_STRICT (removexattr, frame, op_ret, op_errno,
+                             xdata);
+        return 0;
+}
+/*
+ * Stub-resume target for removexattr: nsr_removexattr registers this
+ * function with fop_removexattr_stub, and nsr_removexattr_fan_in resumes that
+ * stub once its call count reaches zero.  It only winds the fop to the first
+ * child, with nsr_removexattr_complete as the callback.  Always returns 0.
+ */
+int32_t
+nsr_removexattr_continue (call_frame_t *frame, xlator_t *this,
+ loc_t * loc, const char * name, dict_t * xdata)
+{
+ STACK_WIND (frame, nsr_removexattr_complete,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->removexattr,
+ loc, name, xdata);
+ return 0;
+}
+
+/*
+ * Callback nsr_removexattr uses for the copies of removexattr it winds to
+ * every child after the first.  Each reply is traced and decrements the call
+ * count kept in the frame's local state; the reply that brings it to zero
+ * resumes the saved stub.  The reply's own result is not propagated.
+ */
+int32_t
+nsr_removexattr_fan_in (call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno,
+                        dict_t *xdata)
+{
+        nsr_local_t *state = frame->local;
+        uint8_t      remaining;
+
+        gf_log (this->name, GF_LOG_TRACE,
+                "op_ret = %d, op_errno = %d\n", op_ret, op_errno);
+
+        /* The counter is updated under the frame lock. */
+        LOCK(&frame->lock);
+        remaining = --(state->call_count);
+        UNLOCK(&frame->lock);
+
+        // TBD: variable Completion count
+        if (remaining != 0) {
+                return 0;
+        }
+
+        call_resume(state->stub);
+        return 0;
+}
+
+/*
+ * Entry point for the removexattr fop.
+ *
+ * A request whose xdata already carries NSR_TERM_XATTR, or both
+ * RECON_TERM_XATTR and RECON_INDEX_XATTR, is wound straight to the first
+ * child.  Any other request is refused with EREMOTE unless priv->leader is
+ * set and priv->fence_io is clear; otherwise it is tagged with the current
+ * term, saved as a stub that will run nsr_removexattr_continue, and wound to
+ * every child after the first with nsr_removexattr_fan_in as the callback.
+ *
+ * Always returns 0; failures are reported by unwinding with op_ret -1 and
+ * op_errno set (ENOMEM unless a more specific code was chosen).
+ */
+int32_t
+nsr_removexattr (call_frame_t *frame, xlator_t *this,
+                 loc_t * loc, const char * name, dict_t * xdata)
+{
+        nsr_local_t     *local = NULL;
+        nsr_private_t   *priv = this->private;
+        xlator_list_t   *trav;
+        dict_t          *new_xdata = NULL;
+        int             op_errno = ENOMEM;
+        int             from_leader;
+        int             from_recon;
+
+        local = mem_get0(this->local_pool);
+        if (!local) {
+                goto err;
+        }
+#if NSR_CG_NEED_FD
+        local->fd = fd_ref(fd);
+#else
+        local->fd = NULL;
+#endif
+        frame->local = local;
+
+        if (xdata) {
+                from_leader = !!dict_get(xdata,NSR_TERM_XATTR);
+                from_recon = !!dict_get(xdata,RECON_TERM_XATTR)
+                          && !!dict_get(xdata,RECON_INDEX_XATTR);
+        }
+        else {
+                from_leader = from_recon = _gf_false;
+        }
+
+        // follower/recon path
+        // just send it to local node
+        if (from_leader || from_recon) {
+                STACK_WIND (frame, nsr_removexattr_complete,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->removexattr,
+                            loc, name, xdata);
+                return 0;
+        }
+
+        if (!priv->leader || priv->fence_io) {
+                op_errno = EREMOTE;
+                goto err;
+        }
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd);
+        if (!ictx) {
+                op_errno = EIO;
+                goto err;
+        }
+        /* TBD: LOCK */
+        if (ictx->active) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "queuing request due to conflict");
+                ++(ictx->pending);
+                /* TBD: actually enqueue */
+        }
+        else {
+                ++(ictx->active);
+        }
+        /* TBD: UNLOCK */
+#endif
+
+        if (!xdata) {
+                /* Track a dict created here so our reference is dropped. */
+                new_xdata = dict_new();
+                if (!new_xdata) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to allocate xdata");
+                        goto err;
+                }
+                xdata = new_xdata;
+        }
+
+        if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set nsr-term");
+                goto err;
+        }
+
+        local->stub = fop_removexattr_stub (frame,nsr_removexattr_continue,
+                                            loc, name, xdata);
+        if (!local->stub) {
+                goto err;
+        }
+
+        /*
+         * NOTE(review): if there is no child beyond the first, nothing is
+         * wound below and the stub is never resumed from here - confirm that
+         * single-child configurations are handled elsewhere.
+         */
+        local->call_count = priv->n_children - 1;
+        for (trav = this->children->next; trav; trav = trav->next) {
+                STACK_WIND (frame, nsr_removexattr_fan_in,
+                            trav->xlator, trav->xlator->fops->removexattr,
+                            loc, name, xdata);
+        }
+
+        /* The stub took its own reference; release the one from dict_new. */
+        if (new_xdata) {
+                dict_unref(new_xdata);
+        }
+
+        // TBD: variable Issue count
+        return 0;
+
+err:
+        if (new_xdata) {
+                dict_unref(new_xdata);
+        }
+        if (local) {
+                if (local->stub) {
+                        call_stub_destroy(local->stub);
+                }
+                if (local->fd) {
+                        fd_unref(local->fd);
+                }
+                /*
+                 * Detach before releasing: frame->local may already point at
+                 * this object, and leaving it set would let frame teardown
+                 * release it a second time.
+                 */
+                frame->local = NULL;
+                mem_put(local);
+        }
+        STACK_UNWIND_STRICT (removexattr, frame, -1, op_errno,
+                             NULL);
+        return 0;
+}
+
+/*
+ * Callback that nsr_rename and nsr_rename_continue hand to STACK_WIND when
+ * sending rename to the first child.  It performs whatever bookkeeping the
+ * compile-time NSR_CG_* switches enable and then unwinds the child's result
+ * unchanged.
+ */
+int32_t
+nsr_rename_complete (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno,
+                     struct iatt *buf, struct iatt *preoldparent,
+                     struct iatt *postoldparent, struct iatt *prenewparent,
+                     struct iatt *postnewparent, dict_t *xdata)
+{
+#if NSR_CG_NEED_FD
+        nsr_local_t *local = frame->local;
+#endif
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd);
+
+        /* TBD: LOCK */
+        if (ictx && ictx->pending) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "unblocking %u requests",
+                        ictx->pending);
+                /* TBD: actually dequeue */
+                ictx->pending = 0;
+        }
+        /* TBD: UNLOCK */
+#endif
+
+#if NSR_CG_FSYNC
+        nsr_mark_fd_dirty(this,local);
+#endif
+
+#if NSR_CG_NEED_FD
+        /* Drop the fd reference held in the frame's local state. */
+        fd_unref(local->fd);
+#endif
+
+        STACK_UNWIND_STRICT (rename, frame, op_ret, op_errno,
+                             buf, preoldparent, postoldparent, prenewparent, postnewparent, xdata);
+        return 0;
+}
+/*
+ * Stub-resume target for rename: nsr_rename registers this function with
+ * fop_rename_stub, and nsr_rename_fan_in resumes that stub once its call
+ * count reaches zero.  It only winds the fop to the first child, with
+ * nsr_rename_complete as the callback.  Always returns 0.
+ */
+int32_t
+nsr_rename_continue (call_frame_t *frame, xlator_t *this,
+ loc_t * oldloc, loc_t * newloc, dict_t * xdata)
+{
+ STACK_WIND (frame, nsr_rename_complete,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->rename,
+ oldloc, newloc, xdata);
+ return 0;
+}
+
+/*
+ * Callback nsr_rename uses for the copies of rename it winds to every child
+ * after the first.  Each reply is traced and decrements the call count kept
+ * in the frame's local state; the reply that brings it to zero resumes the
+ * saved stub.  The reply's own result is not propagated.
+ */
+int32_t
+nsr_rename_fan_in (call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno,
+                   struct iatt *buf, struct iatt *preoldparent,
+                   struct iatt *postoldparent, struct iatt *prenewparent,
+                   struct iatt *postnewparent, dict_t *xdata)
+{
+        nsr_local_t *state = frame->local;
+        uint8_t      remaining;
+
+        gf_log (this->name, GF_LOG_TRACE,
+                "op_ret = %d, op_errno = %d\n", op_ret, op_errno);
+
+        /* The counter is updated under the frame lock. */
+        LOCK(&frame->lock);
+        remaining = --(state->call_count);
+        UNLOCK(&frame->lock);
+
+        // TBD: variable Completion count
+        if (remaining != 0) {
+                return 0;
+        }
+
+        call_resume(state->stub);
+        return 0;
+}
+
+/*
+ * Entry point for the rename fop.
+ *
+ * A request whose xdata already carries NSR_TERM_XATTR, or both
+ * RECON_TERM_XATTR and RECON_INDEX_XATTR, is wound straight to the first
+ * child.  Any other request is refused with EREMOTE unless priv->leader is
+ * set and priv->fence_io is clear; otherwise it is tagged with the current
+ * term, saved as a stub that will run nsr_rename_continue, and wound to every
+ * child after the first with nsr_rename_fan_in as the callback.
+ *
+ * Always returns 0; failures are reported by unwinding with op_ret -1 and
+ * op_errno set (ENOMEM unless a more specific code was chosen).
+ */
+int32_t
+nsr_rename (call_frame_t *frame, xlator_t *this,
+            loc_t * oldloc, loc_t * newloc, dict_t * xdata)
+{
+        nsr_local_t     *local = NULL;
+        nsr_private_t   *priv = this->private;
+        xlator_list_t   *trav;
+        dict_t          *new_xdata = NULL;
+        int             op_errno = ENOMEM;
+        int             from_leader;
+        int             from_recon;
+
+        local = mem_get0(this->local_pool);
+        if (!local) {
+                goto err;
+        }
+#if NSR_CG_NEED_FD
+        local->fd = fd_ref(fd);
+#else
+        local->fd = NULL;
+#endif
+        frame->local = local;
+
+        if (xdata) {
+                from_leader = !!dict_get(xdata,NSR_TERM_XATTR);
+                from_recon = !!dict_get(xdata,RECON_TERM_XATTR)
+                          && !!dict_get(xdata,RECON_INDEX_XATTR);
+        }
+        else {
+                from_leader = from_recon = _gf_false;
+        }
+
+        // follower/recon path
+        // just send it to local node
+        if (from_leader || from_recon) {
+                STACK_WIND (frame, nsr_rename_complete,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->rename,
+                            oldloc, newloc, xdata);
+                return 0;
+        }
+
+        if (!priv->leader || priv->fence_io) {
+                op_errno = EREMOTE;
+                goto err;
+        }
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd);
+        if (!ictx) {
+                op_errno = EIO;
+                goto err;
+        }
+        /* TBD: LOCK */
+        if (ictx->active) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "queuing request due to conflict");
+                ++(ictx->pending);
+                /* TBD: actually enqueue */
+        }
+        else {
+                ++(ictx->active);
+        }
+        /* TBD: UNLOCK */
+#endif
+
+        if (!xdata) {
+                /* Track a dict created here so our reference is dropped. */
+                new_xdata = dict_new();
+                if (!new_xdata) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to allocate xdata");
+                        goto err;
+                }
+                xdata = new_xdata;
+        }
+
+        if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set nsr-term");
+                goto err;
+        }
+
+        local->stub = fop_rename_stub (frame,nsr_rename_continue,
+                                       oldloc, newloc, xdata);
+        if (!local->stub) {
+                goto err;
+        }
+
+        /*
+         * NOTE(review): if there is no child beyond the first, nothing is
+         * wound below and the stub is never resumed from here - confirm that
+         * single-child configurations are handled elsewhere.
+         */
+        local->call_count = priv->n_children - 1;
+        for (trav = this->children->next; trav; trav = trav->next) {
+                STACK_WIND (frame, nsr_rename_fan_in,
+                            trav->xlator, trav->xlator->fops->rename,
+                            oldloc, newloc, xdata);
+        }
+
+        /* The stub took its own reference; release the one from dict_new. */
+        if (new_xdata) {
+                dict_unref(new_xdata);
+        }
+
+        // TBD: variable Issue count
+        return 0;
+
+err:
+        if (new_xdata) {
+                dict_unref(new_xdata);
+        }
+        if (local) {
+                if (local->stub) {
+                        call_stub_destroy(local->stub);
+                }
+                if (local->fd) {
+                        fd_unref(local->fd);
+                }
+                /*
+                 * Detach before releasing: frame->local may already point at
+                 * this object, and leaving it set would let frame teardown
+                 * release it a second time.
+                 */
+                frame->local = NULL;
+                mem_put(local);
+        }
+        STACK_UNWIND_STRICT (rename, frame, -1, op_errno,
+                             NULL, NULL, NULL, NULL, NULL, NULL);
+        return 0;
+}
+
+/*
+ * Callback that nsr_rmdir and nsr_rmdir_continue hand to STACK_WIND when
+ * sending rmdir to the first child.  It performs whatever bookkeeping the
+ * compile-time NSR_CG_* switches enable and then unwinds the child's result
+ * unchanged.
+ */
+int32_t
+nsr_rmdir_complete (call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno,
+                    struct iatt *preparent, struct iatt *postparent,
+                    dict_t *xdata)
+{
+#if NSR_CG_NEED_FD
+        nsr_local_t *local = frame->local;
+#endif
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd);
+
+        /* TBD: LOCK */
+        if (ictx && ictx->pending) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "unblocking %u requests",
+                        ictx->pending);
+                /* TBD: actually dequeue */
+                ictx->pending = 0;
+        }
+        /* TBD: UNLOCK */
+#endif
+
+#if NSR_CG_FSYNC
+        nsr_mark_fd_dirty(this,local);
+#endif
+
+#if NSR_CG_NEED_FD
+        /* Drop the fd reference held in the frame's local state. */
+        fd_unref(local->fd);
+#endif
+
+        STACK_UNWIND_STRICT (rmdir, frame, op_ret, op_errno,
+                             preparent, postparent, xdata);
+        return 0;
+}
+/*
+ * Stub-resume target for rmdir: nsr_rmdir registers this function with
+ * fop_rmdir_stub, and nsr_rmdir_fan_in resumes that stub once its call count
+ * reaches zero.  It only winds the fop to the first child, with
+ * nsr_rmdir_complete as the callback.  Always returns 0.
+ */
+int32_t
+nsr_rmdir_continue (call_frame_t *frame, xlator_t *this,
+ loc_t * loc, int xflags, dict_t * xdata)
+{
+ STACK_WIND (frame, nsr_rmdir_complete,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->rmdir,
+ loc, xflags, xdata);
+ return 0;
+}
+
+/*
+ * Callback nsr_rmdir uses for the copies of rmdir it winds to every child
+ * after the first.  Each reply is traced and decrements the call count kept
+ * in the frame's local state; the reply that brings it to zero resumes the
+ * saved stub.  The reply's own result is not propagated.
+ */
+int32_t
+nsr_rmdir_fan_in (call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno,
+                  struct iatt *preparent, struct iatt *postparent,
+                  dict_t *xdata)
+{
+        nsr_local_t *state = frame->local;
+        uint8_t      remaining;
+
+        gf_log (this->name, GF_LOG_TRACE,
+                "op_ret = %d, op_errno = %d\n", op_ret, op_errno);
+
+        /* The counter is updated under the frame lock. */
+        LOCK(&frame->lock);
+        remaining = --(state->call_count);
+        UNLOCK(&frame->lock);
+
+        // TBD: variable Completion count
+        if (remaining != 0) {
+                return 0;
+        }
+
+        call_resume(state->stub);
+        return 0;
+}
+
+/*
+ * Entry point for the rmdir fop.
+ *
+ * A request whose xdata already carries NSR_TERM_XATTR, or both
+ * RECON_TERM_XATTR and RECON_INDEX_XATTR, is wound straight to the first
+ * child.  Any other request is refused with EREMOTE unless priv->leader is
+ * set and priv->fence_io is clear; otherwise it is tagged with the current
+ * term, saved as a stub that will run nsr_rmdir_continue, and wound to every
+ * child after the first with nsr_rmdir_fan_in as the callback.
+ *
+ * Always returns 0; failures are reported by unwinding with op_ret -1 and
+ * op_errno set (ENOMEM unless a more specific code was chosen).
+ */
+int32_t
+nsr_rmdir (call_frame_t *frame, xlator_t *this,
+           loc_t * loc, int xflags, dict_t * xdata)
+{
+        nsr_local_t     *local = NULL;
+        nsr_private_t   *priv = this->private;
+        xlator_list_t   *trav;
+        dict_t          *new_xdata = NULL;
+        int             op_errno = ENOMEM;
+        int             from_leader;
+        int             from_recon;
+
+        local = mem_get0(this->local_pool);
+        if (!local) {
+                goto err;
+        }
+#if NSR_CG_NEED_FD
+        local->fd = fd_ref(fd);
+#else
+        local->fd = NULL;
+#endif
+        frame->local = local;
+
+        if (xdata) {
+                from_leader = !!dict_get(xdata,NSR_TERM_XATTR);
+                from_recon = !!dict_get(xdata,RECON_TERM_XATTR)
+                          && !!dict_get(xdata,RECON_INDEX_XATTR);
+        }
+        else {
+                from_leader = from_recon = _gf_false;
+        }
+
+        // follower/recon path
+        // just send it to local node
+        if (from_leader || from_recon) {
+                STACK_WIND (frame, nsr_rmdir_complete,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->rmdir,
+                            loc, xflags, xdata);
+                return 0;
+        }
+
+        if (!priv->leader || priv->fence_io) {
+                op_errno = EREMOTE;
+                goto err;
+        }
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd);
+        if (!ictx) {
+                op_errno = EIO;
+                goto err;
+        }
+        /* TBD: LOCK */
+        if (ictx->active) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "queuing request due to conflict");
+                ++(ictx->pending);
+                /* TBD: actually enqueue */
+        }
+        else {
+                ++(ictx->active);
+        }
+        /* TBD: UNLOCK */
+#endif
+
+        if (!xdata) {
+                /* Track a dict created here so our reference is dropped. */
+                new_xdata = dict_new();
+                if (!new_xdata) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to allocate xdata");
+                        goto err;
+                }
+                xdata = new_xdata;
+        }
+
+        if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set nsr-term");
+                goto err;
+        }
+
+        local->stub = fop_rmdir_stub (frame,nsr_rmdir_continue,
+                                      loc, xflags, xdata);
+        if (!local->stub) {
+                goto err;
+        }
+
+        /*
+         * NOTE(review): if there is no child beyond the first, nothing is
+         * wound below and the stub is never resumed from here - confirm that
+         * single-child configurations are handled elsewhere.
+         */
+        local->call_count = priv->n_children - 1;
+        for (trav = this->children->next; trav; trav = trav->next) {
+                STACK_WIND (frame, nsr_rmdir_fan_in,
+                            trav->xlator, trav->xlator->fops->rmdir,
+                            loc, xflags, xdata);
+        }
+
+        /* The stub took its own reference; release the one from dict_new. */
+        if (new_xdata) {
+                dict_unref(new_xdata);
+        }
+
+        // TBD: variable Issue count
+        return 0;
+
+err:
+        if (new_xdata) {
+                dict_unref(new_xdata);
+        }
+        if (local) {
+                if (local->stub) {
+                        call_stub_destroy(local->stub);
+                }
+                if (local->fd) {
+                        fd_unref(local->fd);
+                }
+                /*
+                 * Detach before releasing: frame->local may already point at
+                 * this object, and leaving it set would let frame teardown
+                 * release it a second time.
+                 */
+                frame->local = NULL;
+                mem_put(local);
+        }
+        STACK_UNWIND_STRICT (rmdir, frame, -1, op_errno,
+                             NULL, NULL, NULL);
+        return 0;
+}
+
+/*
+ * Callback that nsr_setattr and nsr_setattr_continue hand to STACK_WIND when
+ * sending setattr to the first child.  It performs whatever bookkeeping the
+ * compile-time NSR_CG_* switches enable and then unwinds the child's result
+ * unchanged.
+ */
+int32_t
+nsr_setattr_complete (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno,
+                      struct iatt *preop_stbuf, struct iatt *postop_stbuf,
+                      dict_t *xdata)
+{
+#if NSR_CG_NEED_FD
+        nsr_local_t *local = frame->local;
+#endif
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd);
+
+        /* TBD: LOCK */
+        if (ictx && ictx->pending) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "unblocking %u requests",
+                        ictx->pending);
+                /* TBD: actually dequeue */
+                ictx->pending = 0;
+        }
+        /* TBD: UNLOCK */
+#endif
+
+#if NSR_CG_FSYNC
+        nsr_mark_fd_dirty(this,local);
+#endif
+
+#if NSR_CG_NEED_FD
+        /* Drop the fd reference held in the frame's local state. */
+        fd_unref(local->fd);
+#endif
+
+        STACK_UNWIND_STRICT (setattr, frame, op_ret, op_errno,
+                             preop_stbuf, postop_stbuf, xdata);
+        return 0;
+}
+/*
+ * Stub-resume target for setattr: nsr_setattr registers this function with
+ * fop_setattr_stub, and nsr_setattr_fan_in resumes that stub once its call
+ * count reaches zero.  It only winds the fop to the first child, with
+ * nsr_setattr_complete as the callback.  Always returns 0.
+ */
+int32_t
+nsr_setattr_continue (call_frame_t *frame, xlator_t *this,
+ loc_t * loc, struct iatt * stbuf, int32_t valid, dict_t * xdata)
+{
+ STACK_WIND (frame, nsr_setattr_complete,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->setattr,
+ loc, stbuf, valid, xdata);
+ return 0;
+}
+
+/*
+ * Callback nsr_setattr uses for the copies of setattr it winds to every
+ * child after the first.  Each reply is traced and decrements the call count
+ * kept in the frame's local state; the reply that brings it to zero resumes
+ * the saved stub.  The reply's own result is not propagated.
+ */
+int32_t
+nsr_setattr_fan_in (call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno,
+                    struct iatt *preop_stbuf, struct iatt *postop_stbuf,
+                    dict_t *xdata)
+{
+        nsr_local_t *state = frame->local;
+        uint8_t      remaining;
+
+        gf_log (this->name, GF_LOG_TRACE,
+                "op_ret = %d, op_errno = %d\n", op_ret, op_errno);
+
+        /* The counter is updated under the frame lock. */
+        LOCK(&frame->lock);
+        remaining = --(state->call_count);
+        UNLOCK(&frame->lock);
+
+        // TBD: variable Completion count
+        if (remaining != 0) {
+                return 0;
+        }
+
+        call_resume(state->stub);
+        return 0;
+}
+
+/*
+ * Entry point for the setattr fop.
+ *
+ * A request whose xdata already carries NSR_TERM_XATTR, or both
+ * RECON_TERM_XATTR and RECON_INDEX_XATTR, is wound straight to the first
+ * child.  Any other request is refused with EREMOTE unless priv->leader is
+ * set and priv->fence_io is clear; otherwise it is tagged with the current
+ * term, saved as a stub that will run nsr_setattr_continue, and wound to
+ * every child after the first with nsr_setattr_fan_in as the callback.
+ *
+ * Always returns 0; failures are reported by unwinding with op_ret -1 and
+ * op_errno set (ENOMEM unless a more specific code was chosen).
+ */
+int32_t
+nsr_setattr (call_frame_t *frame, xlator_t *this,
+             loc_t * loc, struct iatt * stbuf, int32_t valid, dict_t * xdata)
+{
+        nsr_local_t     *local = NULL;
+        nsr_private_t   *priv = this->private;
+        xlator_list_t   *trav;
+        dict_t          *new_xdata = NULL;
+        int             op_errno = ENOMEM;
+        int             from_leader;
+        int             from_recon;
+
+        local = mem_get0(this->local_pool);
+        if (!local) {
+                goto err;
+        }
+#if NSR_CG_NEED_FD
+        local->fd = fd_ref(fd);
+#else
+        local->fd = NULL;
+#endif
+        frame->local = local;
+
+        if (xdata) {
+                from_leader = !!dict_get(xdata,NSR_TERM_XATTR);
+                from_recon = !!dict_get(xdata,RECON_TERM_XATTR)
+                          && !!dict_get(xdata,RECON_INDEX_XATTR);
+        }
+        else {
+                from_leader = from_recon = _gf_false;
+        }
+
+        // follower/recon path
+        // just send it to local node
+        if (from_leader || from_recon) {
+                STACK_WIND (frame, nsr_setattr_complete,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->setattr,
+                            loc, stbuf, valid, xdata);
+                return 0;
+        }
+
+        if (!priv->leader || priv->fence_io) {
+                op_errno = EREMOTE;
+                goto err;
+        }
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd);
+        if (!ictx) {
+                op_errno = EIO;
+                goto err;
+        }
+        /* TBD: LOCK */
+        if (ictx->active) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "queuing request due to conflict");
+                ++(ictx->pending);
+                /* TBD: actually enqueue */
+        }
+        else {
+                ++(ictx->active);
+        }
+        /* TBD: UNLOCK */
+#endif
+
+        if (!xdata) {
+                /* Track a dict created here so our reference is dropped. */
+                new_xdata = dict_new();
+                if (!new_xdata) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to allocate xdata");
+                        goto err;
+                }
+                xdata = new_xdata;
+        }
+
+        if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set nsr-term");
+                goto err;
+        }
+
+        local->stub = fop_setattr_stub (frame,nsr_setattr_continue,
+                                        loc, stbuf, valid, xdata);
+        if (!local->stub) {
+                goto err;
+        }
+
+        /*
+         * NOTE(review): if there is no child beyond the first, nothing is
+         * wound below and the stub is never resumed from here - confirm that
+         * single-child configurations are handled elsewhere.
+         */
+        local->call_count = priv->n_children - 1;
+        for (trav = this->children->next; trav; trav = trav->next) {
+                STACK_WIND (frame, nsr_setattr_fan_in,
+                            trav->xlator, trav->xlator->fops->setattr,
+                            loc, stbuf, valid, xdata);
+        }
+
+        /* The stub took its own reference; release the one from dict_new. */
+        if (new_xdata) {
+                dict_unref(new_xdata);
+        }
+
+        // TBD: variable Issue count
+        return 0;
+
+err:
+        if (new_xdata) {
+                dict_unref(new_xdata);
+        }
+        if (local) {
+                if (local->stub) {
+                        call_stub_destroy(local->stub);
+                }
+                if (local->fd) {
+                        fd_unref(local->fd);
+                }
+                /*
+                 * Detach before releasing: frame->local may already point at
+                 * this object, and leaving it set would let frame teardown
+                 * release it a second time.
+                 */
+                frame->local = NULL;
+                mem_put(local);
+        }
+        STACK_UNWIND_STRICT (setattr, frame, -1, op_errno,
+                             NULL, NULL, NULL);
+        return 0;
+}
+
+/*
+ * Callback that nsr_setxattr and nsr_setxattr_continue hand to STACK_WIND
+ * when sending setxattr to the first child.  It performs whatever
+ * bookkeeping the compile-time NSR_CG_* switches enable and then unwinds the
+ * child's result unchanged.
+ */
+int32_t
+nsr_setxattr_complete (call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno,
+                       dict_t *xdata)
+{
+#if NSR_CG_NEED_FD
+        nsr_local_t *local = frame->local;
+#endif
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd);
+
+        /* TBD: LOCK */
+        if (ictx && ictx->pending) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "unblocking %u requests",
+                        ictx->pending);
+                /* TBD: actually dequeue */
+                ictx->pending = 0;
+        }
+        /* TBD: UNLOCK */
+#endif
+
+#if NSR_CG_FSYNC
+        nsr_mark_fd_dirty(this,local);
+#endif
+
+#if NSR_CG_NEED_FD
+        /* Drop the fd reference held in the frame's local state. */
+        fd_unref(local->fd);
+#endif
+
+        STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno,
+                             xdata);
+        return 0;
+}
+/*
+ * Stub-resume target for setxattr: nsr_setxattr registers this function with
+ * fop_setxattr_stub, and nsr_setxattr_fan_in resumes that stub once its call
+ * count reaches zero.  It only winds the fop to the first child, with
+ * nsr_setxattr_complete as the callback.  Always returns 0.
+ */
+int32_t
+nsr_setxattr_continue (call_frame_t *frame, xlator_t *this,
+ loc_t * loc, dict_t * dict, int32_t flags, dict_t * xdata)
+{
+ STACK_WIND (frame, nsr_setxattr_complete,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->setxattr,
+ loc, dict, flags, xdata);
+ return 0;
+}
+
+/*
+ * Callback nsr_setxattr uses for the copies of setxattr it winds to every
+ * child after the first.  Each reply is traced and decrements the call count
+ * kept in the frame's local state; the reply that brings it to zero resumes
+ * the saved stub.  The reply's own result is not propagated.
+ */
+int32_t
+nsr_setxattr_fan_in (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno,
+                     dict_t *xdata)
+{
+        nsr_local_t *state = frame->local;
+        uint8_t      remaining;
+
+        gf_log (this->name, GF_LOG_TRACE,
+                "op_ret = %d, op_errno = %d\n", op_ret, op_errno);
+
+        /* The counter is updated under the frame lock. */
+        LOCK(&frame->lock);
+        remaining = --(state->call_count);
+        UNLOCK(&frame->lock);
+
+        // TBD: variable Completion count
+        if (remaining != 0) {
+                return 0;
+        }
+
+        call_resume(state->stub);
+        return 0;
+}
+
+/*
+ * Entry point for the setxattr fop.
+ *
+ * A request whose xdata already carries NSR_TERM_XATTR, or both
+ * RECON_TERM_XATTR and RECON_INDEX_XATTR, is wound straight to the first
+ * child.  Any other request is refused with EREMOTE unless priv->leader is
+ * set and priv->fence_io is clear; otherwise it is tagged with the current
+ * term, saved as a stub that will run nsr_setxattr_continue, and wound to
+ * every child after the first with nsr_setxattr_fan_in as the callback.
+ *
+ * Always returns 0; failures are reported by unwinding with op_ret -1 and
+ * op_errno set (ENOMEM unless a more specific code was chosen).
+ */
+int32_t
+nsr_setxattr (call_frame_t *frame, xlator_t *this,
+              loc_t * loc, dict_t * dict, int32_t flags, dict_t * xdata)
+{
+        nsr_local_t     *local = NULL;
+        nsr_private_t   *priv = this->private;
+        xlator_list_t   *trav;
+        dict_t          *new_xdata = NULL;
+        int             op_errno = ENOMEM;
+        int             from_leader;
+        int             from_recon;
+
+        local = mem_get0(this->local_pool);
+        if (!local) {
+                goto err;
+        }
+#if NSR_CG_NEED_FD
+        local->fd = fd_ref(fd);
+#else
+        local->fd = NULL;
+#endif
+        frame->local = local;
+
+        if (xdata) {
+                from_leader = !!dict_get(xdata,NSR_TERM_XATTR);
+                from_recon = !!dict_get(xdata,RECON_TERM_XATTR)
+                          && !!dict_get(xdata,RECON_INDEX_XATTR);
+        }
+        else {
+                from_leader = from_recon = _gf_false;
+        }
+
+        // follower/recon path
+        // just send it to local node
+        if (from_leader || from_recon) {
+                STACK_WIND (frame, nsr_setxattr_complete,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->setxattr,
+                            loc, dict, flags, xdata);
+                return 0;
+        }
+
+        if (!priv->leader || priv->fence_io) {
+                op_errno = EREMOTE;
+                goto err;
+        }
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd);
+        if (!ictx) {
+                op_errno = EIO;
+                goto err;
+        }
+        /* TBD: LOCK */
+        if (ictx->active) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "queuing request due to conflict");
+                ++(ictx->pending);
+                /* TBD: actually enqueue */
+        }
+        else {
+                ++(ictx->active);
+        }
+        /* TBD: UNLOCK */
+#endif
+
+        if (!xdata) {
+                /* Track a dict created here so our reference is dropped. */
+                new_xdata = dict_new();
+                if (!new_xdata) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to allocate xdata");
+                        goto err;
+                }
+                xdata = new_xdata;
+        }
+
+        if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set nsr-term");
+                goto err;
+        }
+
+        local->stub = fop_setxattr_stub (frame,nsr_setxattr_continue,
+                                         loc, dict, flags, xdata);
+        if (!local->stub) {
+                goto err;
+        }
+
+        /*
+         * NOTE(review): if there is no child beyond the first, nothing is
+         * wound below and the stub is never resumed from here - confirm that
+         * single-child configurations are handled elsewhere.
+         */
+        local->call_count = priv->n_children - 1;
+        for (trav = this->children->next; trav; trav = trav->next) {
+                STACK_WIND (frame, nsr_setxattr_fan_in,
+                            trav->xlator, trav->xlator->fops->setxattr,
+                            loc, dict, flags, xdata);
+        }
+
+        /* The stub took its own reference; release the one from dict_new. */
+        if (new_xdata) {
+                dict_unref(new_xdata);
+        }
+
+        // TBD: variable Issue count
+        return 0;
+
+err:
+        if (new_xdata) {
+                dict_unref(new_xdata);
+        }
+        if (local) {
+                if (local->stub) {
+                        call_stub_destroy(local->stub);
+                }
+                if (local->fd) {
+                        fd_unref(local->fd);
+                }
+                /*
+                 * Detach before releasing: frame->local may already point at
+                 * this object, and leaving it set would let frame teardown
+                 * release it a second time.
+                 */
+                frame->local = NULL;
+                mem_put(local);
+        }
+        STACK_UNWIND_STRICT (setxattr, frame, -1, op_errno,
+                             NULL);
+        return 0;
+}
+
+/* No stub needed for stat */
+
+/* No cbk needed for stat */
+
+/*
+ * Entry point for stat.  The request is wound to the first child when
+ * priv->leader is set, or when xdata supplies both RECON_TERM_XATTR and
+ * RECON_INDEX_XATTR as int32 values; otherwise it is unwound with
+ * op_ret -1 / EREMOTE.  Always returns 0.
+ */
+int32_t
+nsr_stat (call_frame_t *frame, xlator_t *this,
+          loc_t *loc, dict_t *xdata)
+{
+        nsr_private_t *priv = this->private;
+        int32_t        term_val;
+        int32_t        index_val;
+        gf_boolean_t   recon_request;
+
+        // allow reads during reconciliation
+        // TBD: allow "dirty" reads on non-leaders
+        recon_request = xdata
+                && (dict_get_int32(xdata, RECON_TERM_XATTR, &term_val) == 0)
+                && (dict_get_int32(xdata, RECON_INDEX_XATTR, &index_val) == 0);
+
+        if (!priv->leader && !recon_request) {
+                STACK_UNWIND_STRICT (stat, frame, -1, EREMOTE,
+                                     NULL, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, default_stat_cbk,
+                    FIRST_CHILD(this), FIRST_CHILD(this)->fops->stat,
+                    loc, xdata);
+        return 0;
+}
+
+/* No stub needed for statfs */
+
+/* No cbk needed for statfs */
+
+/*
+ * Entry point for statfs.  The request is wound to the first child when
+ * priv->leader is set, or when xdata supplies both RECON_TERM_XATTR and
+ * RECON_INDEX_XATTR as int32 values; otherwise it is unwound with
+ * op_ret -1 / EREMOTE.  Always returns 0.
+ */
+int32_t
+nsr_statfs (call_frame_t *frame, xlator_t *this,
+            loc_t *loc, dict_t *xdata)
+{
+        nsr_private_t *priv = this->private;
+        int32_t        term_val;
+        int32_t        index_val;
+        gf_boolean_t   recon_request;
+
+        // allow reads during reconciliation
+        // TBD: allow "dirty" reads on non-leaders
+        recon_request = xdata
+                && (dict_get_int32(xdata, RECON_TERM_XATTR, &term_val) == 0)
+                && (dict_get_int32(xdata, RECON_INDEX_XATTR, &index_val) == 0);
+
+        if (!priv->leader && !recon_request) {
+                STACK_UNWIND_STRICT (statfs, frame, -1, EREMOTE,
+                                     NULL, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, default_statfs_cbk,
+                    FIRST_CHILD(this), FIRST_CHILD(this)->fops->statfs,
+                    loc, xdata);
+        return 0;
+}
+
+/*
+ * Callback that nsr_symlink_continue hands to STACK_WIND when sending
+ * symlink to the first child.  It performs whatever bookkeeping the
+ * compile-time NSR_CG_* switches enable and then unwinds the child's result
+ * unchanged.
+ */
+int32_t
+nsr_symlink_complete (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno,
+                      inode_t *inode, struct iatt *buf, struct iatt *preparent,
+                      struct iatt *postparent, dict_t *xdata)
+{
+#if NSR_CG_NEED_FD
+        nsr_local_t *local = frame->local;
+#endif
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd);
+
+        /* TBD: LOCK */
+        if (ictx && ictx->pending) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "unblocking %u requests",
+                        ictx->pending);
+                /* TBD: actually dequeue */
+                ictx->pending = 0;
+        }
+        /* TBD: UNLOCK */
+#endif
+
+#if NSR_CG_FSYNC
+        nsr_mark_fd_dirty(this,local);
+#endif
+
+#if NSR_CG_NEED_FD
+        /* Drop the fd reference held in the frame's local state. */
+        fd_unref(local->fd);
+#endif
+
+        STACK_UNWIND_STRICT (symlink, frame, op_ret, op_errno,
+                             inode, buf, preparent, postparent, xdata);
+        return 0;
+}
+/*
+ * Resume function stored in the stub built by nsr_symlink.  It issues the
+ * symlink on the first child, with nsr_symlink_complete as the callback.
+ */
+int32_t
+nsr_symlink_continue (call_frame_t *frame, xlator_t *this,
+ const char * linkname, loc_t * loc, mode_t umask, dict_t * xdata)
+{
+ STACK_WIND (frame, nsr_symlink_complete,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->symlink,
+ linkname, loc, umask, xdata);
+ return 0;
+}
+
+/*
+ * Callback for each symlink sent to a child other than the first.  The
+ * reply itself is only logged; once the last outstanding reply has
+ * arrived, the stub saved in the frame's local is resumed.
+ */
+int32_t
+nsr_symlink_fan_in (call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno,
+                    inode_t * inode, struct iatt * buf, struct iatt * preparent, struct iatt * postparent, dict_t * xdata)
+{
+        nsr_local_t     *local = frame->local;
+        uint8_t         remaining;
+
+        gf_log (this->name, GF_LOG_TRACE,
+                "op_ret = %d, op_errno = %d\n", op_ret, op_errno);
+
+        /* The counter is shared by every reply arriving on this frame. */
+        LOCK(&frame->lock);
+        remaining = --(local->call_count);
+        UNLOCK(&frame->lock);
+
+        // TBD: variable Completion count
+        if (remaining != 0) {
+                return 0;
+        }
+
+        call_resume(local->stub);
+        return 0;
+}
+
+/*
+ * Entry point for symlink.
+ *
+ * A request whose xdata already carries the NSR term xattr, or both recon
+ * xattrs, is passed straight to the first child.  Any other request is
+ * refused with EREMOTE unless this brick is the leader and I/O is not
+ * fenced; otherwise it is stamped with the current term and sent to every
+ * child after the first, and a stub is saved so that nsr_symlink_fan_in
+ * can run nsr_symlink_continue once all of them have replied.
+ *
+ * Always returns 0; failures are reported by unwinding with -1/op_errno.
+ */
+int32_t
+nsr_symlink (call_frame_t *frame, xlator_t *this,
+             const char * linkname, loc_t * loc, mode_t umask, dict_t * xdata)
+{
+        nsr_local_t     *local = NULL;
+        nsr_private_t   *priv = this->private;
+        xlator_list_t   *trav;
+        int             op_errno = ENOMEM;
+        int             from_leader;
+        int             from_recon;
+
+        local = mem_get0(this->local_pool);
+        if (!local) {
+                goto err;
+        }
+        /* Both branches must be complete statements (note the ';'). */
+#if NSR_CG_NEED_FD
+        local->fd = fd_ref(fd);
+#else
+        local->fd = NULL;
+#endif
+        frame->local = local;
+
+        if (xdata) {
+                from_leader = !!dict_get(xdata,NSR_TERM_XATTR);
+                from_recon = !!dict_get(xdata,RECON_TERM_XATTR)
+                          && !!dict_get(xdata,RECON_INDEX_XATTR);
+        }
+        else {
+                from_leader = from_recon = _gf_false;
+        }
+
+        // follower/recon path
+        // just send it to local node
+        if (from_leader || from_recon) {
+                STACK_WIND (frame, nsr_symlink_complete,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->symlink,
+                            linkname, loc, umask, xdata);
+                return 0;
+        }
+
+        if (!priv->leader || priv->fence_io) {
+                op_errno = EREMOTE;
+                goto err;
+        }
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd);
+        if (!ictx) {
+                op_errno = EIO;
+                goto err;
+        }
+        /* TBD: LOCK */
+        if (ictx->active) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "queuing request due to conflict");
+                ++(ictx->pending);
+                /* TBD: actually enqueue */
+        }
+        else {
+                ++(ictx->active);
+        }
+        /* TBD: UNLOCK */
+#endif
+
+        /*
+         * NOTE(review): a dict allocated here is never released in this
+         * function -- confirm who owns it.
+         */
+        if (!xdata) {
+                xdata = dict_new();
+                if (!xdata) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to allocate xdata");
+                        goto err;
+                }
+        }
+
+        if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set nsr-term");
+                goto err;
+        }
+
+        local->stub = fop_symlink_stub (frame,nsr_symlink_continue,
+                                        linkname, loc, umask, xdata);
+        if (!local->stub) {
+                goto err;
+        }
+
+        /* One reply is expected from each child after the first. */
+        local->call_count = priv->n_children - 1;
+        for (trav = this->children->next; trav; trav = trav->next) {
+                STACK_WIND (frame, nsr_symlink_fan_in,
+                            trav->xlator, trav->xlator->fops->symlink,
+                            linkname, loc, umask, xdata);
+        }
+
+        // TBD: variable Issue count
+        return 0;
+
+err:
+        if (local) {
+                if (local->stub) {
+                        call_stub_destroy(local->stub);
+                }
+                if (local->fd) {
+                        fd_unref(local->fd);
+                }
+                mem_put(local);
+                /*
+                 * The frame must not keep pointing at the object we just
+                 * returned to the pool, or frame teardown could release
+                 * it a second time.
+                 */
+                frame->local = NULL;
+        }
+        STACK_UNWIND_STRICT (symlink, frame, -1, op_errno,
+                             NULL, NULL, NULL, NULL, NULL);
+        return 0;
+}
+
+/*
+ * Last callback in the truncate path.  It is the callback for the winds to
+ * the first child issued by nsr_truncate (follower/recon path) and by
+ * nsr_truncate_continue, and it hands the child's result back to the
+ * caller unchanged.
+ *
+ * The NSR_CG_NEED_FD, NSR_CG_QUEUE and NSR_CG_FSYNC sections are compiled
+ * only when the corresponding macro evaluates to non-zero at this point.
+ */
+int32_t
+nsr_truncate_complete (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt * prebuf, struct iatt * postbuf, dict_t * xdata)
+{
+#if NSR_CG_NEED_FD
+ nsr_local_t *local = frame->local;
+#endif
+
+#if NSR_CG_QUEUE
+ /*
+ * NOTE(review): nsr_get_inode_ctx is defined to take an inode_t *, but
+ * an fd_t * (local->fd) is passed here -- confirm before enabling.
+ */
+ nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd);
+ if (ictx) {
+ /* TBD: LOCK */
+ if (ictx->pending) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "unblocking %u requests",
+ ictx->pending);
+ /* TBD: actually dequeue */
+ ictx->pending = 0;
+ }
+ /* TBD: UNLOCK */
+ }
+#endif
+
+#if NSR_CG_FSYNC
+ nsr_mark_fd_dirty(this,local);
+#endif
+
+#if NSR_CG_NEED_FD
+ /* Drop the reference taken on the fd when the request was received. */
+ fd_unref(local->fd);
+#endif
+
+ STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+ return 0;
+
+}
+/*
+ * Resume function stored in the stub built by nsr_truncate.  It issues the
+ * truncate on the first child, with nsr_truncate_complete as the callback.
+ */
+int32_t
+nsr_truncate_continue (call_frame_t *frame, xlator_t *this,
+ loc_t * loc, off_t offset, dict_t * xdata)
+{
+ STACK_WIND (frame, nsr_truncate_complete,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->truncate,
+ loc, offset, xdata);
+ return 0;
+}
+
+/*
+ * Callback for each truncate sent to a child other than the first.  The
+ * reply itself is only logged; once the last outstanding reply has
+ * arrived, the stub saved in the frame's local is resumed.
+ */
+int32_t
+nsr_truncate_fan_in (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno,
+                     struct iatt * prebuf, struct iatt * postbuf, dict_t * xdata)
+{
+        nsr_local_t     *local = frame->local;
+        uint8_t         remaining;
+
+        gf_log (this->name, GF_LOG_TRACE,
+                "op_ret = %d, op_errno = %d\n", op_ret, op_errno);
+
+        /* The counter is shared by every reply arriving on this frame. */
+        LOCK(&frame->lock);
+        remaining = --(local->call_count);
+        UNLOCK(&frame->lock);
+
+        // TBD: variable Completion count
+        if (remaining != 0) {
+                return 0;
+        }
+
+        call_resume(local->stub);
+        return 0;
+}
+
+/*
+ * Entry point for truncate.
+ *
+ * A request whose xdata already carries the NSR term xattr, or both recon
+ * xattrs, is passed straight to the first child.  Any other request is
+ * refused with EREMOTE unless this brick is the leader and I/O is not
+ * fenced; otherwise it is stamped with the current term and sent to every
+ * child after the first, and a stub is saved so that nsr_truncate_fan_in
+ * can run nsr_truncate_continue once all of them have replied.
+ *
+ * Always returns 0; failures are reported by unwinding with -1/op_errno.
+ */
+int32_t
+nsr_truncate (call_frame_t *frame, xlator_t *this,
+              loc_t * loc, off_t offset, dict_t * xdata)
+{
+        nsr_local_t     *local = NULL;
+        nsr_private_t   *priv = this->private;
+        xlator_list_t   *trav;
+        int             op_errno = ENOMEM;
+        int             from_leader;
+        int             from_recon;
+
+        local = mem_get0(this->local_pool);
+        if (!local) {
+                goto err;
+        }
+        /* Both branches must be complete statements (note the ';'). */
+#if NSR_CG_NEED_FD
+        local->fd = fd_ref(fd);
+#else
+        local->fd = NULL;
+#endif
+        frame->local = local;
+
+        if (xdata) {
+                from_leader = !!dict_get(xdata,NSR_TERM_XATTR);
+                from_recon = !!dict_get(xdata,RECON_TERM_XATTR)
+                          && !!dict_get(xdata,RECON_INDEX_XATTR);
+        }
+        else {
+                from_leader = from_recon = _gf_false;
+        }
+
+        // follower/recon path
+        // just send it to local node
+        if (from_leader || from_recon) {
+                STACK_WIND (frame, nsr_truncate_complete,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->truncate,
+                            loc, offset, xdata);
+                return 0;
+        }
+
+        if (!priv->leader || priv->fence_io) {
+                op_errno = EREMOTE;
+                goto err;
+        }
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd);
+        if (!ictx) {
+                op_errno = EIO;
+                goto err;
+        }
+        /* TBD: LOCK */
+        if (ictx->active) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "queuing request due to conflict");
+                ++(ictx->pending);
+                /* TBD: actually enqueue */
+        }
+        else {
+                ++(ictx->active);
+        }
+        /* TBD: UNLOCK */
+#endif
+
+        /*
+         * NOTE(review): a dict allocated here is never released in this
+         * function -- confirm who owns it.
+         */
+        if (!xdata) {
+                xdata = dict_new();
+                if (!xdata) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to allocate xdata");
+                        goto err;
+                }
+        }
+
+        if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set nsr-term");
+                goto err;
+        }
+
+        local->stub = fop_truncate_stub (frame,nsr_truncate_continue,
+                                         loc, offset, xdata);
+        if (!local->stub) {
+                goto err;
+        }
+
+        /* One reply is expected from each child after the first. */
+        local->call_count = priv->n_children - 1;
+        for (trav = this->children->next; trav; trav = trav->next) {
+                STACK_WIND (frame, nsr_truncate_fan_in,
+                            trav->xlator, trav->xlator->fops->truncate,
+                            loc, offset, xdata);
+        }
+
+        // TBD: variable Issue count
+        return 0;
+
+err:
+        if (local) {
+                if (local->stub) {
+                        call_stub_destroy(local->stub);
+                }
+                if (local->fd) {
+                        fd_unref(local->fd);
+                }
+                mem_put(local);
+                /*
+                 * The frame must not keep pointing at the object we just
+                 * returned to the pool, or frame teardown could release
+                 * it a second time.
+                 */
+                frame->local = NULL;
+        }
+        STACK_UNWIND_STRICT (truncate, frame, -1, op_errno,
+                             NULL, NULL, NULL);
+        return 0;
+}
+
+/*
+ * Last callback in the unlink path.  It is the callback for the winds to
+ * the first child issued by nsr_unlink (follower/recon path) and by
+ * nsr_unlink_continue, and it hands the child's result back to the caller
+ * unchanged.
+ *
+ * The NSR_CG_NEED_FD, NSR_CG_QUEUE and NSR_CG_FSYNC sections are compiled
+ * only when the corresponding macro evaluates to non-zero at this point.
+ */
+int32_t
+nsr_unlink_complete (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt * preparent, struct iatt * postparent, dict_t * xdata)
+{
+#if NSR_CG_NEED_FD
+ nsr_local_t *local = frame->local;
+#endif
+
+#if NSR_CG_QUEUE
+ /*
+ * NOTE(review): nsr_get_inode_ctx is defined to take an inode_t *, but
+ * an fd_t * (local->fd) is passed here -- confirm before enabling.
+ */
+ nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd);
+ if (ictx) {
+ /* TBD: LOCK */
+ if (ictx->pending) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "unblocking %u requests",
+ ictx->pending);
+ /* TBD: actually dequeue */
+ ictx->pending = 0;
+ }
+ /* TBD: UNLOCK */
+ }
+#endif
+
+#if NSR_CG_FSYNC
+ nsr_mark_fd_dirty(this,local);
+#endif
+
+#if NSR_CG_NEED_FD
+ /* Drop the reference taken on the fd when the request was received. */
+ fd_unref(local->fd);
+#endif
+
+ STACK_UNWIND_STRICT (unlink, frame, op_ret, op_errno,
+ preparent, postparent, xdata);
+ return 0;
+
+}
+/*
+ * Resume function stored in the stub built by nsr_unlink.  It issues the
+ * unlink on the first child, with nsr_unlink_complete as the callback.
+ */
+int32_t
+nsr_unlink_continue (call_frame_t *frame, xlator_t *this,
+ loc_t * loc, int xflags, dict_t * xdata)
+{
+ STACK_WIND (frame, nsr_unlink_complete,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->unlink,
+ loc, xflags, xdata);
+ return 0;
+}
+
+/*
+ * Callback for each unlink sent to a child other than the first.  The
+ * reply itself is only logged; once the last outstanding reply has
+ * arrived, the stub saved in the frame's local is resumed.
+ */
+int32_t
+nsr_unlink_fan_in (call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno,
+                   struct iatt * preparent, struct iatt * postparent, dict_t * xdata)
+{
+        nsr_local_t     *local = frame->local;
+        uint8_t         remaining;
+
+        gf_log (this->name, GF_LOG_TRACE,
+                "op_ret = %d, op_errno = %d\n", op_ret, op_errno);
+
+        /* The counter is shared by every reply arriving on this frame. */
+        LOCK(&frame->lock);
+        remaining = --(local->call_count);
+        UNLOCK(&frame->lock);
+
+        // TBD: variable Completion count
+        if (remaining != 0) {
+                return 0;
+        }
+
+        call_resume(local->stub);
+        return 0;
+}
+
+/*
+ * Entry point for unlink.
+ *
+ * A request whose xdata already carries the NSR term xattr, or both recon
+ * xattrs, is passed straight to the first child.  Any other request is
+ * refused with EREMOTE unless this brick is the leader and I/O is not
+ * fenced; otherwise it is stamped with the current term and sent to every
+ * child after the first, and a stub is saved so that nsr_unlink_fan_in
+ * can run nsr_unlink_continue once all of them have replied.
+ *
+ * Always returns 0; failures are reported by unwinding with -1/op_errno.
+ */
+int32_t
+nsr_unlink (call_frame_t *frame, xlator_t *this,
+            loc_t * loc, int xflags, dict_t * xdata)
+{
+        nsr_local_t     *local = NULL;
+        nsr_private_t   *priv = this->private;
+        xlator_list_t   *trav;
+        int             op_errno = ENOMEM;
+        int             from_leader;
+        int             from_recon;
+
+        local = mem_get0(this->local_pool);
+        if (!local) {
+                goto err;
+        }
+        /* Both branches must be complete statements (note the ';'). */
+#if NSR_CG_NEED_FD
+        local->fd = fd_ref(fd);
+#else
+        local->fd = NULL;
+#endif
+        frame->local = local;
+
+        if (xdata) {
+                from_leader = !!dict_get(xdata,NSR_TERM_XATTR);
+                from_recon = !!dict_get(xdata,RECON_TERM_XATTR)
+                          && !!dict_get(xdata,RECON_INDEX_XATTR);
+        }
+        else {
+                from_leader = from_recon = _gf_false;
+        }
+
+        // follower/recon path
+        // just send it to local node
+        if (from_leader || from_recon) {
+                STACK_WIND (frame, nsr_unlink_complete,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->unlink,
+                            loc, xflags, xdata);
+                return 0;
+        }
+
+        if (!priv->leader || priv->fence_io) {
+                op_errno = EREMOTE;
+                goto err;
+        }
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd);
+        if (!ictx) {
+                op_errno = EIO;
+                goto err;
+        }
+        /* TBD: LOCK */
+        if (ictx->active) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "queuing request due to conflict");
+                ++(ictx->pending);
+                /* TBD: actually enqueue */
+        }
+        else {
+                ++(ictx->active);
+        }
+        /* TBD: UNLOCK */
+#endif
+
+        /*
+         * NOTE(review): a dict allocated here is never released in this
+         * function -- confirm who owns it.
+         */
+        if (!xdata) {
+                xdata = dict_new();
+                if (!xdata) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to allocate xdata");
+                        goto err;
+                }
+        }
+
+        if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set nsr-term");
+                goto err;
+        }
+
+        local->stub = fop_unlink_stub (frame,nsr_unlink_continue,
+                                       loc, xflags, xdata);
+        if (!local->stub) {
+                goto err;
+        }
+
+        /* One reply is expected from each child after the first. */
+        local->call_count = priv->n_children - 1;
+        for (trav = this->children->next; trav; trav = trav->next) {
+                STACK_WIND (frame, nsr_unlink_fan_in,
+                            trav->xlator, trav->xlator->fops->unlink,
+                            loc, xflags, xdata);
+        }
+
+        // TBD: variable Issue count
+        return 0;
+
+err:
+        if (local) {
+                if (local->stub) {
+                        call_stub_destroy(local->stub);
+                }
+                if (local->fd) {
+                        fd_unref(local->fd);
+                }
+                mem_put(local);
+                /*
+                 * The frame must not keep pointing at the object we just
+                 * returned to the pool, or frame teardown could release
+                 * it a second time.
+                 */
+                frame->local = NULL;
+        }
+        STACK_UNWIND_STRICT (unlink, frame, -1, op_errno,
+                             NULL, NULL, NULL);
+        return 0;
+}
+
+/*
+ * Enable the fsync-, queue- and fd-related sections of the writev
+ * functions that follow.  Those sections are guarded with "#if", which
+ * needs an expression to evaluate, so each macro is given the value 1
+ * instead of being defined empty.
+ */
+#define NSR_CG_FSYNC 1
+#define NSR_CG_QUEUE 1
+#define NSR_CG_NEED_FD 1
+/*
+ * Last callback in the writev path.  It is the callback for the winds to
+ * the first child issued by nsr_writev (follower/recon path) and by
+ * nsr_writev_continue.  Depending on which NSR_CG_* sections are enabled
+ * it clears the inode's pending count, records the fd as dirty and drops
+ * the fd reference held in the frame's local, then hands the child's
+ * result back to the caller unchanged.
+ */
+int32_t
+nsr_writev_complete (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno,
+                     struct iatt * prebuf, struct iatt * postbuf, dict_t * xdata)
+{
+#if NSR_CG_NEED_FD
+        nsr_local_t *local = frame->local;
+#endif
+
+#if NSR_CG_QUEUE
+        /*
+         * nsr_get_inode_ctx works on an inode, so pass the inode behind
+         * the fd rather than the fd itself.
+         * NOTE(review): it uses the __inode_ctx_* calls; confirm whether
+         * the inode lock must be held around this lookup.
+         */
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd->inode);
+        if (ictx) {
+                /* TBD: LOCK */
+                if (ictx->pending) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "unblocking %u requests",
+                                ictx->pending);
+                        /* TBD: actually dequeue */
+                        ictx->pending = 0;
+                }
+                /* TBD: UNLOCK */
+        }
+#endif
+
+#if NSR_CG_FSYNC
+        nsr_mark_fd_dirty(this,local);
+#endif
+
+#if NSR_CG_NEED_FD
+        /* Drop the reference taken on the fd when the request arrived. */
+        fd_unref(local->fd);
+#endif
+
+        STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno,
+                             prebuf, postbuf, xdata);
+        return 0;
+
+}
+/*
+ * Resume function stored in the stub built by nsr_writev.  It issues the
+ * writev on the first child, with nsr_writev_complete as the callback.
+ */
+int32_t
+nsr_writev_continue (call_frame_t *frame, xlator_t *this,
+ fd_t * fd, struct iovec * vector, int32_t count, off_t offset, uint32_t flags, struct iobref * iobref, dict_t * xdata)
+{
+ STACK_WIND (frame, nsr_writev_complete,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev,
+ fd, vector, count, offset, flags, iobref, xdata);
+ return 0;
+}
+
+/*
+ * Callback for each writev sent to a child other than the first.  The
+ * reply itself is only logged; once the last outstanding reply has
+ * arrived, the stub saved in the frame's local is resumed.
+ */
+int32_t
+nsr_writev_fan_in (call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno,
+                   struct iatt * prebuf, struct iatt * postbuf, dict_t * xdata)
+{
+        nsr_local_t     *local = frame->local;
+        uint8_t         remaining;
+
+        gf_log (this->name, GF_LOG_TRACE,
+                "op_ret = %d, op_errno = %d\n", op_ret, op_errno);
+
+        /* The counter is shared by every reply arriving on this frame. */
+        LOCK(&frame->lock);
+        remaining = --(local->call_count);
+        UNLOCK(&frame->lock);
+
+        // TBD: variable Completion count
+        if (remaining != 0) {
+                return 0;
+        }
+
+        call_resume(local->stub);
+        return 0;
+}
+
+/*
+ * Entry point for writev.
+ *
+ * A request whose xdata already carries the NSR term xattr, or both recon
+ * xattrs, is passed straight to the first child.  Any other request is
+ * refused with EREMOTE unless this brick is the leader and I/O is not
+ * fenced; otherwise it is stamped with the current term and sent to every
+ * child after the first, and a stub is saved so that nsr_writev_fan_in
+ * can run nsr_writev_continue once all of them have replied.
+ *
+ * Always returns 0; failures are reported by unwinding with -1/op_errno.
+ */
+int32_t
+nsr_writev (call_frame_t *frame, xlator_t *this,
+            fd_t * fd, struct iovec * vector, int32_t count, off_t offset, uint32_t flags, struct iobref * iobref, dict_t * xdata)
+{
+        nsr_local_t     *local = NULL;
+        nsr_private_t   *priv = this->private;
+        xlator_list_t   *trav;
+        int             op_errno = ENOMEM;
+        int             from_leader;
+        int             from_recon;
+
+        local = mem_get0(this->local_pool);
+        if (!local) {
+                goto err;
+        }
+        /* Both branches must be complete statements (note the ';'). */
+#if NSR_CG_NEED_FD
+        local->fd = fd_ref(fd);
+#else
+        local->fd = NULL;
+#endif
+        frame->local = local;
+
+        if (xdata) {
+                from_leader = !!dict_get(xdata,NSR_TERM_XATTR);
+                from_recon = !!dict_get(xdata,RECON_TERM_XATTR)
+                          && !!dict_get(xdata,RECON_INDEX_XATTR);
+        }
+        else {
+                from_leader = from_recon = _gf_false;
+        }
+
+        // follower/recon path
+        // just send it to local node
+        if (from_leader || from_recon) {
+                STACK_WIND (frame, nsr_writev_complete,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev,
+                            fd, vector, count, offset, flags, iobref, xdata);
+                return 0;
+        }
+
+        if (!priv->leader || priv->fence_io) {
+                op_errno = EREMOTE;
+                goto err;
+        }
+
+#if NSR_CG_QUEUE
+        /*
+         * nsr_get_inode_ctx works on an inode, so pass the inode behind
+         * the fd rather than the fd itself.
+         * NOTE(review): it uses the __inode_ctx_* calls; confirm whether
+         * the inode lock must be held around this lookup.
+         */
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd->inode);
+        if (!ictx) {
+                op_errno = EIO;
+                goto err;
+        }
+        /* TBD: LOCK */
+        if (ictx->active) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "queuing request due to conflict");
+                ++(ictx->pending);
+                /* TBD: actually enqueue */
+        }
+        else {
+                ++(ictx->active);
+        }
+        /* TBD: UNLOCK */
+#endif
+
+        /*
+         * NOTE(review): a dict allocated here is never released in this
+         * function -- confirm who owns it.
+         */
+        if (!xdata) {
+                xdata = dict_new();
+                if (!xdata) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to allocate xdata");
+                        goto err;
+                }
+        }
+
+        if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set nsr-term");
+                goto err;
+        }
+
+        local->stub = fop_writev_stub (frame,nsr_writev_continue,
+                                       fd, vector, count, offset, flags, iobref, xdata);
+        if (!local->stub) {
+                goto err;
+        }
+
+        /* One reply is expected from each child after the first. */
+        local->call_count = priv->n_children - 1;
+        for (trav = this->children->next; trav; trav = trav->next) {
+                STACK_WIND (frame, nsr_writev_fan_in,
+                            trav->xlator, trav->xlator->fops->writev,
+                            fd, vector, count, offset, flags, iobref, xdata);
+        }
+
+        // TBD: variable Issue count
+        return 0;
+
+err:
+        if (local) {
+                if (local->stub) {
+                        call_stub_destroy(local->stub);
+                }
+                if (local->fd) {
+                        fd_unref(local->fd);
+                }
+                mem_put(local);
+                /*
+                 * The frame must not keep pointing at the object we just
+                 * returned to the pool, or frame teardown could release
+                 * it a second time.
+                 */
+                frame->local = NULL;
+        }
+        STACK_UNWIND_STRICT (writev, frame, -1, op_errno,
+                             NULL, NULL, NULL);
+        return 0;
+}
+
+#undef NSR_CG_FSYNC
+#undef NSR_CG_QUEUE
+#undef NSR_CG_NEED_FD
+/*
+ * Last callback in the xattrop path.  It is the callback for the winds to
+ * the first child issued by nsr_xattrop (follower/recon path) and by
+ * nsr_xattrop_continue, and it hands the child's result back to the
+ * caller unchanged.
+ *
+ * The NSR_CG_* macros are #undef'd immediately before this function, so
+ * the three conditional sections below are compiled out here.
+ */
+int32_t
+nsr_xattrop_complete (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ dict_t * xattr, dict_t * xdata)
+{
+#if NSR_CG_NEED_FD
+ nsr_local_t *local = frame->local;
+#endif
+
+#if NSR_CG_QUEUE
+ /*
+ * NOTE(review): nsr_get_inode_ctx is defined to take an inode_t *, but
+ * an fd_t * (local->fd) is passed here -- confirm before enabling.
+ */
+ nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,local->fd);
+ if (ictx) {
+ /* TBD: LOCK */
+ if (ictx->pending) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "unblocking %u requests",
+ ictx->pending);
+ /* TBD: actually dequeue */
+ ictx->pending = 0;
+ }
+ /* TBD: UNLOCK */
+ }
+#endif
+
+#if NSR_CG_FSYNC
+ nsr_mark_fd_dirty(this,local);
+#endif
+
+#if NSR_CG_NEED_FD
+ /* Drop the reference taken on the fd when the request was received. */
+ fd_unref(local->fd);
+#endif
+
+ STACK_UNWIND_STRICT (xattrop, frame, op_ret, op_errno,
+ xattr, xdata);
+ return 0;
+
+}
+/*
+ * Resume function stored in the stub built by nsr_xattrop.  It issues the
+ * xattrop on the first child, with nsr_xattrop_complete as the callback.
+ */
+int32_t
+nsr_xattrop_continue (call_frame_t *frame, xlator_t *this,
+ loc_t * loc, gf_xattrop_flags_t optype, dict_t * xattr, dict_t * xdata)
+{
+ STACK_WIND (frame, nsr_xattrop_complete,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->xattrop,
+ loc, optype, xattr, xdata);
+ return 0;
+}
+
+/*
+ * Callback for each xattrop sent to a child other than the first.  The
+ * reply itself is only logged; once the last outstanding reply has
+ * arrived, the stub saved in the frame's local is resumed.
+ */
+int32_t
+nsr_xattrop_fan_in (call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno,
+                    dict_t * xattr, dict_t * xdata)
+{
+        nsr_local_t     *local = frame->local;
+        uint8_t         remaining;
+
+        gf_log (this->name, GF_LOG_TRACE,
+                "op_ret = %d, op_errno = %d\n", op_ret, op_errno);
+
+        /* The counter is shared by every reply arriving on this frame. */
+        LOCK(&frame->lock);
+        remaining = --(local->call_count);
+        UNLOCK(&frame->lock);
+
+        // TBD: variable Completion count
+        if (remaining != 0) {
+                return 0;
+        }
+
+        call_resume(local->stub);
+        return 0;
+}
+
+/*
+ * Entry point for xattrop.
+ *
+ * A request whose xdata already carries the NSR term xattr, or both recon
+ * xattrs, is passed straight to the first child.  Any other request is
+ * refused with EREMOTE unless this brick is the leader and I/O is not
+ * fenced; otherwise it is stamped with the current term and sent to every
+ * child after the first, and a stub is saved so that nsr_xattrop_fan_in
+ * can run nsr_xattrop_continue once all of them have replied.
+ *
+ * Always returns 0; failures are reported by unwinding with -1/op_errno.
+ */
+int32_t
+nsr_xattrop (call_frame_t *frame, xlator_t *this,
+             loc_t * loc, gf_xattrop_flags_t optype, dict_t * xattr, dict_t * xdata)
+{
+        nsr_local_t     *local = NULL;
+        nsr_private_t   *priv = this->private;
+        xlator_list_t   *trav;
+        int             op_errno = ENOMEM;
+        int             from_leader;
+        int             from_recon;
+
+        local = mem_get0(this->local_pool);
+        if (!local) {
+                goto err;
+        }
+        /* Both branches must be complete statements (note the ';'). */
+#if NSR_CG_NEED_FD
+        local->fd = fd_ref(fd);
+#else
+        local->fd = NULL;
+#endif
+        frame->local = local;
+
+        if (xdata) {
+                from_leader = !!dict_get(xdata,NSR_TERM_XATTR);
+                from_recon = !!dict_get(xdata,RECON_TERM_XATTR)
+                          && !!dict_get(xdata,RECON_INDEX_XATTR);
+        }
+        else {
+                from_leader = from_recon = _gf_false;
+        }
+
+        // follower/recon path
+        // just send it to local node
+        if (from_leader || from_recon) {
+                STACK_WIND (frame, nsr_xattrop_complete,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->xattrop,
+                            loc, optype, xattr, xdata);
+                return 0;
+        }
+
+        if (!priv->leader || priv->fence_io) {
+                op_errno = EREMOTE;
+                goto err;
+        }
+
+#if NSR_CG_QUEUE
+        nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd);
+        if (!ictx) {
+                op_errno = EIO;
+                goto err;
+        }
+        /* TBD: LOCK */
+        if (ictx->active) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "queuing request due to conflict");
+                ++(ictx->pending);
+                /* TBD: actually enqueue */
+        }
+        else {
+                ++(ictx->active);
+        }
+        /* TBD: UNLOCK */
+#endif
+
+        /*
+         * NOTE(review): a dict allocated here is never released in this
+         * function -- confirm who owns it.
+         */
+        if (!xdata) {
+                xdata = dict_new();
+                if (!xdata) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to allocate xdata");
+                        goto err;
+                }
+        }
+
+        if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set nsr-term");
+                goto err;
+        }
+
+        local->stub = fop_xattrop_stub (frame,nsr_xattrop_continue,
+                                        loc, optype, xattr, xdata);
+        if (!local->stub) {
+                goto err;
+        }
+
+        /* One reply is expected from each child after the first. */
+        local->call_count = priv->n_children - 1;
+        for (trav = this->children->next; trav; trav = trav->next) {
+                STACK_WIND (frame, nsr_xattrop_fan_in,
+                            trav->xlator, trav->xlator->fops->xattrop,
+                            loc, optype, xattr, xdata);
+        }
+
+        // TBD: variable Issue count
+        return 0;
+
+err:
+        if (local) {
+                if (local->stub) {
+                        call_stub_destroy(local->stub);
+                }
+                if (local->fd) {
+                        fd_unref(local->fd);
+                }
+                mem_put(local);
+                /*
+                 * The frame must not keep pointing at the object we just
+                 * returned to the pool, or frame teardown could release
+                 * it a second time.
+                 */
+                frame->local = NULL;
+        }
+        STACK_UNWIND_STRICT (xattrop, frame, -1, op_errno,
+                             NULL, NULL);
+        return 0;
+}
+
+/* No code emitted for zerofill */
+
+/*
+ * File-operation table for this translator: each listed fop is routed to
+ * the nsr_* handler of the same name.
+ * NOTE(review): there is no .fsync entry here although nsr.c defines
+ * nsr_fsync -- confirm whether it is registered elsewhere.
+ */
+struct xlator_fops fops = {
+ .access = nsr_access,
+ .create = nsr_create,
+ .discard = nsr_discard,
+ .fallocate = nsr_fallocate,
+ .fgetxattr = nsr_fgetxattr,
+ .fremovexattr = nsr_fremovexattr,
+ .fsetattr = nsr_fsetattr,
+ .fsetxattr = nsr_fsetxattr,
+ .fstat = nsr_fstat,
+ .ftruncate = nsr_ftruncate,
+ .fxattrop = nsr_fxattrop,
+ .getxattr = nsr_getxattr,
+ .link = nsr_link,
+ .mkdir = nsr_mkdir,
+ .mknod = nsr_mknod,
+ .open = nsr_open,
+ .opendir = nsr_opendir,
+ .rchecksum = nsr_rchecksum,
+ .readdir = nsr_readdir,
+ .readdirp = nsr_readdirp,
+ .readlink = nsr_readlink,
+ .readv = nsr_readv,
+ .removexattr = nsr_removexattr,
+ .rename = nsr_rename,
+ .rmdir = nsr_rmdir,
+ .setattr = nsr_setattr,
+ .setxattr = nsr_setxattr,
+ .stat = nsr_stat,
+ .statfs = nsr_statfs,
+ .symlink = nsr_symlink,
+ .truncate = nsr_truncate,
+ .unlink = nsr_unlink,
+ .writev = nsr_writev,
+ .xattrop = nsr_xattrop,
+};
diff --git a/xlators/cluster/nsr-server/src/nsr-internal.h b/xlators/cluster/nsr-server/src/nsr-internal.h
new file mode 100644
index 000000000..282247a47
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/nsr-internal.h
@@ -0,0 +1,81 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <sys/stat.h>
+#include <sys/types.h>
+
+/* Extended-attribute name "user.nsr.leader". */
+#define LEADER_XATTR "user.nsr.leader"
+/*
+ * Translator held by the second entry of xl's child list.  The argument
+ * is parenthesized so that any pointer-valued expression can be passed.
+ */
+#define SECOND_CHILD(xl) ((xl)->children->next->xlator)
+
+/*
+ * Memory-accounting type ids for this translator's allocations, numbered
+ * from just past gf_common_mt_end.  gf_mt_nsr_end marks the end of the
+ * range.
+ */
+enum {
+ gf_mt_nsr_private_t = gf_common_mt_end + 1,
+ gf_mt_nsr_fd_ctx_t,
+ gf_mt_nsr_inode_ctx_t,
+ gf_mt_nsr_dirty_t,
+ gf_mt_nsr_end
+};
+
+
+/*
+ * Per-translator private state (this->private).  Only the fields whose use
+ * is visible in nsr.c / nsr-cg.c are described below.
+ */
+typedef struct {
+ char *etcd_servers;
+ char *vol_uuid;
+ char *term_uuid;
+ char *brick_uuid;
+ /* True when this brick acts as leader; reloaded from the "leader" option. */
+ gf_boolean_t leader;
+ /* Number of children; used to size fan-out reply counts. */
+ uint8_t n_children;
+ char *vol_file;
+ glfs_t *fs;
+ etcd_session etcd;
+ /* While non-zero, modifying fops from clients fail with EREMOTE. */
+ volatile unsigned int fence_io;
+ glfs_fd_t *fd;
+ /* Term stamped into NSR_TERM_XATTR on requests sent to the other children. */
+ uint32_t current_term;
+#ifdef NSR_DEBUG
+ uint32_t leader_log_fd;
+#endif
+ volatile int leader_inited;
+ /* Bit i is set while child i is up (see CHILD_UP/CHILD_DOWN handling). */
+ uint32_t kid_state;
+ /* Protects dirty_fds. */
+ gf_lock_t dirty_lock;
+ /* fd contexts (linked via fd_list) waiting for the flush thread. */
+ struct list_head dirty_fds;
+ gf_boolean_t nsr_recon_start;
+} nsr_private_t;
+
+/* Per-request state stored in frame->local. */
+typedef struct {
+ /* Saved local operation, resumed once call_count drops to zero. */
+ call_stub_t *stub;
+ call_stub_t *qstub;
+ /* Replies still outstanding; updated under frame->lock. */
+ uint8_t call_count;
+ /* Referenced fd for fd-based fops, NULL otherwise. */
+ fd_t *fd;
+ /* In nsr_fsync: dirty entries taken over from the fd context. */
+ struct list_head qlinks;
+} nsr_local_t;
+
+/*
+ * This should match whatever changelog returns on the pre-op for us to pass
+ * when we're ready for our post-op.
+ */
+typedef uint32_t log_id_t;
+
+/* One pending (not yet flushed) operation recorded against an fd. */
+typedef struct {
+ /* Linkage into nsr_fd_ctx_t.dirty_list (or an fsync request's qlinks). */
+ struct list_head links;
+ log_id_t id;
+} nsr_dirty_list_t;
+
+/* Per-fd context kept by this translator. */
+typedef struct {
+ /* fd reference taken when the context is put on the global dirty list. */
+ fd_t *fd;
+ /* nsr_dirty_list_t entries recorded for this fd. */
+ struct list_head dirty_list;
+ /* Linkage into nsr_private_t.dirty_fds; empty when not queued. */
+ struct list_head fd_list;
+} nsr_fd_ctx_t;
+
+/*
+ * Per-inode context kept by this translator.  The lock and both list heads
+ * are initialized in nsr_get_inode_ctx; in the visible code only the
+ * active/pending counters are actually updated (queueing is still TBD).
+ */
+typedef struct {
+ gf_lock_t lock;
+ /* Incremented when a request is admitted while nothing was active. */
+ uint32_t active;
+ struct list_head aqueue;
+ /* Count of requests that arrived while another was active. */
+ uint32_t pending;
+ struct list_head pqueue;
+} nsr_inode_ctx_t;
+
diff --git a/xlators/cluster/nsr-server/src/nsr.c b/xlators/cluster/nsr-server/src/nsr.c
new file mode 100644
index 000000000..3707b3003
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/nsr.c
@@ -0,0 +1,682 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "call-stub.h"
+#include "defaults.h"
+#include "xlator.h"
+#include "api/src/glfs.h"
+#include "api/src/glfs-internal.h"
+#include "run.h"
+#include "common-utils.h"
+
+
+#include "etcd-api.h"
+#include "nsr-internal.h"
+#include "../../nsr-recon/src/recon_driver.h"
+#include "../../nsr-recon/src/recon_xlator.h"
+
+
+#define GLUSTERD_DEFAULT_WORKDIR "/var/lib/glusterd"
+#define GLUSTERD_VOLUME_DIR_PREFIX "vols"
+#define GLUSTERD_BRICK_INFO_DIR "bricks"
+
+#define NSR_FLUSH_INTERVAL 5
+
+/*
+ * Return this translator's context for an inode, creating and registering
+ * a zero-filled one if the inode has none yet.  Returns NULL when the
+ * allocation or the registration fails.
+ * NOTE(review): this uses the __inode_ctx_* calls, so the caller is
+ * presumably expected to hold the inode's lock -- confirm.
+ */
+nsr_inode_ctx_t *
+nsr_get_inode_ctx (xlator_t *this, inode_t *inode)
+{
+        uint64_t                raw = 0LL;
+        nsr_inode_ctx_t         *ictx;
+
+        /* Fast path: a context is already attached. */
+        if (__inode_ctx_get(inode,this,&raw) == 0) {
+                return (nsr_inode_ctx_t *)(long)raw;
+        }
+
+        ictx = GF_CALLOC (1, sizeof(*ictx),
+                          gf_mt_nsr_inode_ctx_t);
+        if (!ictx) {
+                return NULL;
+        }
+
+        raw = (uint64_t)(long)ictx;
+        if (__inode_ctx_set(inode,this,&raw) != 0) {
+                GF_FREE(ictx);
+                return NULL;
+        }
+
+        LOCK_INIT(&ictx->lock);
+        INIT_LIST_HEAD(&ictx->aqueue);
+        INIT_LIST_HEAD(&ictx->pqueue);
+        return ictx;
+}
+
+/*
+ * Return this translator's context for an fd, creating and registering a
+ * zero-filled one if the fd has none yet.  Returns NULL when the
+ * allocation or the registration fails.  Uses the __fd_ctx_* calls; the
+ * caller in this file (nsr_mark_fd_dirty) holds fd->lock around it.
+ */
+nsr_fd_ctx_t *
+nsr_get_fd_ctx (xlator_t *this, fd_t *fd)
+{
+        uint64_t        raw = 0LL;
+        nsr_fd_ctx_t    *fctx;
+
+        /* Fast path: a context is already attached. */
+        if (__fd_ctx_get(fd,this,&raw) == 0) {
+                return (nsr_fd_ctx_t *)(long)raw;
+        }
+
+        fctx = GF_CALLOC (1, sizeof(*fctx), gf_mt_nsr_fd_ctx_t);
+        if (!fctx) {
+                return NULL;
+        }
+
+        if (__fd_ctx_set(fd,this,(uint64_t)fctx) != 0) {
+                GF_FREE(fctx);
+                return NULL;
+        }
+
+        INIT_LIST_HEAD(&fctx->dirty_list);
+        INIT_LIST_HEAD(&fctx->fd_list);
+        return fctx;
+}
+
+/*
+ * Record one more un-flushed operation against local->fd.
+ *
+ * Under fd->lock, a new dirty entry is appended to the fd's context.  If
+ * the fd is not yet on the translator-wide dirty list it is added there
+ * (under priv->dirty_lock) and an extra fd reference is taken so the fd
+ * stays alive until the flush thread has dealt with it.  On allocation
+ * failure the problem is logged and nothing is recorded.
+ */
+void
+nsr_mark_fd_dirty (xlator_t *this, nsr_local_t *local)
+{
+        fd_t                    *fd = local->fd;
+        nsr_fd_ctx_t            *ctx_ptr;
+        nsr_dirty_list_t        *dirty;
+        nsr_private_t           *priv = this->private;
+
+        /*
+         * TBD: don't do any of this for O_SYNC/O_DIRECT writes.
+         * Unfortunately, that optimization requires that we distinguish
+         * between writev and other "write" calls, saving the original flags
+         * and checking them in the callback.  Too much work for too little
+         * gain right now.
+         */
+
+        LOCK(&fd->lock);
+        ctx_ptr = nsr_get_fd_ctx(this,fd);
+        dirty = GF_CALLOC(1,sizeof(*dirty),gf_mt_nsr_dirty_t);
+        if (ctx_ptr && dirty) {
+                gf_log (this->name, GF_LOG_TRACE,
+                        "marking fd %p as dirty (%p)", fd, dirty);
+                /* TBD: fill dirty->id from what changelog gave us */
+                list_add_tail(&dirty->links,&ctx_ptr->dirty_list);
+                if (list_empty(&ctx_ptr->fd_list)) {
+                        /* Add a ref so _release doesn't get called. */
+                        ctx_ptr->fd = fd_ref(fd);
+                        LOCK(&priv->dirty_lock);
+                        list_add_tail (&ctx_ptr->fd_list,
+                                       &priv->dirty_fds);
+                        UNLOCK(&priv->dirty_lock);
+                }
+        }
+        else {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "could not mark %p dirty", fd);
+                /*
+                 * ctx_ptr must NOT be freed here: nsr_get_fd_ctx has
+                 * registered it on the fd, so it may already hold earlier
+                 * dirty entries and will be freed by nsr_release.  Freeing
+                 * it now would leave a dangling pointer in the fd.
+                 */
+                if (dirty) {
+                        GF_FREE(dirty);
+                }
+        }
+        UNLOCK(&fd->lock);
+}
+
+#define NSR_TERM_XATTR "trusted.nsr.term"
+#define RECON_TERM_XATTR "trusted.nsr.recon-term"
+#define RECON_INDEX_XATTR "trusted.nsr.recon-index"
+#define NSR_REP_COUNT_XATTR "trusted.nsr.rep-count"
+#include "nsr-cg.c"
+
+/*
+ * Count how many children are currently marked up, i.e. how many of the
+ * first n_children bits of priv->kid_state are set.
+ */
+uint8_t
+nsr_count_up_kids (nsr_private_t *priv)
+{
+        /* Number of bits kid_state can hold. */
+        const uint8_t   state_bits = sizeof(priv->kid_state) * 8;
+        uint8_t         limit = priv->n_children;
+        uint8_t         retval = 0;
+        uint8_t         i;
+
+        /* Shifting by the mask's width or more would be undefined. */
+        if (limit > state_bits) {
+                limit = state_bits;
+        }
+
+        for (i = 0; i < limit; ++i) {
+                /* Unsigned constant: bit 31 must not overflow an int. */
+                if (priv->kid_state & ((uint32_t)1 << i)) {
+                        ++retval;
+                }
+        }
+
+        return retval;
+}
+
+/*
+ * The fsync machinery looks a lot like that for any write call, but there are
+ * some important differences that are easy to miss. First, we don't care
+ * about the xdata that shows whether the call came from a leader or
+ * reconciliation process. If we're the leader we fan out; if we're not we
+ * don't. Second, we don't wait for followers before we issue the local call.
+ * The code generation system could be updated to handle this, and still might
+ * if we need to implement other "almost identical" paths (e.g. for open), but
+ * a copy is more readable as long as it's just one.
+ */
+
+/*
+ * Reply handler shared by every fsync wind.  Each reply decrements the
+ * outstanding count; the reply that brings it to zero is the one whose
+ * result is unwound to the caller.
+ */
+int32_t
+nsr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+               struct iatt *postbuf, dict_t *xdata)
+{
+        nsr_local_t     *local = frame->local;
+        uint8_t         remaining;
+
+        LOCK(&frame->lock);
+        remaining = --(local->call_count);
+        UNLOCK(&frame->lock);
+
+        if (remaining != 0) {
+                return 0;
+        }
+
+        STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf,
+                             postbuf, xdata);
+        return 0;
+}
+
+/*
+ * Reply handler for the fsync sent to the first child.  It releases the
+ * dirty entries that nsr_fsync moved onto the request, then does the
+ * common reply accounting in nsr_fsync_cbk.
+ */
+int32_t
+nsr_fsync_local_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                     struct iatt *postbuf, dict_t *xdata)
+{
+        nsr_local_t             *local = frame->local;
+        nsr_dirty_list_t        *entry;
+        nsr_dirty_list_t        *next;
+        int32_t                 ret;
+
+        list_for_each_entry_safe (entry, next, &local->qlinks, links) {
+                gf_log (this->name, GF_LOG_TRACE,
+                        "sending post-op on %p (%p)", local->fd, entry);
+                GF_FREE(entry);
+        }
+
+        ret = nsr_fsync_cbk (frame, cookie, this, op_ret, op_errno,
+                             prebuf, postbuf, xdata);
+        return ret;
+}
+
+/*
+ * fsync entry point.
+ *
+ * The fd's accumulated dirty entries are moved onto the request, then the
+ * fsync is sent to the first child and -- when this brick is the leader --
+ * to every other child as well.  The caller is answered when the last of
+ * those replies arrives (see nsr_fsync_cbk).  Unlike the generated write
+ * paths, the first child is not held back until the others have replied.
+ *
+ * Always returns 0; an allocation failure is unwound with -1/ENOMEM.
+ */
+int32_t
+nsr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+           dict_t *xdata)
+{
+        nsr_private_t   *priv = this->private;
+        nsr_local_t     *local;
+        uint64_t        ctx_int = 0LL;
+        nsr_fd_ctx_t    *ctx_ptr;
+        xlator_list_t   *trav;
+        /*
+         * Read the leader flag once: the expected reply count and the
+         * decision to fan out must agree even if the option is
+         * reconfigured while we are winding.
+         */
+        gf_boolean_t    is_leader = priv->leader;
+
+        local = mem_get0(this->local_pool);
+        if (!local) {
+                STACK_UNWIND_STRICT(fsync,frame,-1,ENOMEM,NULL,NULL,xdata);
+                return 0;
+        }
+        INIT_LIST_HEAD(&local->qlinks);
+        frame->local = local;
+
+        /* Move the dirty list from the fd to the fsync request. */
+        LOCK(&fd->lock);
+        if (__fd_ctx_get(fd,this,&ctx_int) == 0) {
+                ctx_ptr = (nsr_fd_ctx_t *)(long)ctx_int;
+                list_splice_init (&ctx_ptr->dirty_list,
+                                  &local->qlinks);
+        }
+        UNLOCK(&fd->lock);
+
+        /* Issue the local call. */
+        local->call_count = is_leader ? priv->n_children : 1;
+        STACK_WIND (frame, nsr_fsync_local_cbk,
+                    FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync,
+                    fd, flags, xdata);
+
+        /* Issue remote calls if we're the leader. */
+        if (is_leader) {
+                for (trav = this->children->next; trav; trav = trav->next) {
+                        /* Each remaining child, not the first one again. */
+                        STACK_WIND (frame, nsr_fsync_cbk,
+                                    trav->xlator,
+                                    trav->xlator->fops->fsync,
+                                    fd, flags, xdata);
+                }
+        }
+
+        return 0;
+}
+
+/*
+ * getxattr handling for the synthetic replica-count xattr.
+ *
+ * Non-leaders fail the call with EREMOTE.  Any name other than
+ * NSR_REP_COUNT_XATTR (including no name) is passed to the first child.
+ * For NSR_REP_COUNT_XATTR the reply is a dict holding the number of
+ * children currently up; allocation failures are unwound with ENOMEM.
+ *
+ * Always returns 0.
+ */
+int32_t
+nsr_getxattr_special (call_frame_t *frame, xlator_t *this, loc_t *loc,
+                      const char *name, dict_t *xdata)
+{
+        dict_t          *result;
+        uint8_t         up;
+        nsr_private_t   *priv = this->private;
+
+        if (!priv->leader) {
+                STACK_UNWIND_STRICT (getxattr, frame, -1, EREMOTE, NULL, NULL);
+                return 0;
+        }
+
+        if (!name || (strcmp(name,NSR_REP_COUNT_XATTR) != 0)) {
+                STACK_WIND_TAIL (frame,
+                                 FIRST_CHILD(this),
+                                 FIRST_CHILD(this)->fops->getxattr,
+                                 loc, name, xdata);
+                return 0;
+        }
+
+        result = dict_new();
+        if (!result) {
+                goto dn_failed;
+        }
+
+        up = nsr_count_up_kids(priv);
+        if (dict_set_uint32(result,NSR_REP_COUNT_XATTR,up) != 0) {
+                goto dsu_failed;
+        }
+
+        STACK_UNWIND_STRICT (getxattr, frame, 0, 0, result, NULL);
+        /*
+         * Give up our reference instead of destroying the dict outright,
+         * so that anyone who took a reference during the unwind keeps a
+         * valid object.
+         */
+        dict_unref(result);
+        return 0;
+
+dsu_failed:
+        dict_unref(result);
+dn_failed:
+        STACK_UNWIND_STRICT (getxattr, frame, -1, ENOMEM, NULL, NULL);
+        return 0;
+}
+
+/*
+ * Release every dirty entry recorded on an fd context, logging each one,
+ * and leave the context with an empty dirty list.  The caller in this
+ * file (nsr_flush_thread) holds the fd's lock around the call.
+ */
+void
+nsr_flush_fd (xlator_t *this, nsr_fd_ctx_t *fd_ctx)
+{
+        nsr_dirty_list_t        *entry;
+        nsr_dirty_list_t        *next;
+
+        list_for_each_entry_safe (entry, next, &fd_ctx->dirty_list, links) {
+                gf_log (this->name, GF_LOG_TRACE,
+                        "sending post-op on %p (%p)", fd_ctx->fd, entry);
+                GF_FREE(entry);
+        }
+
+        /* Every entry is gone; reset the head rather than unlinking each. */
+        INIT_LIST_HEAD(&fd_ctx->dirty_list);
+}
+
+/*
+ * Thread body; ctx is the translator.  Forever: take over the global list
+ * of dirty fds, flush each fd under that fd's own lock, drop the
+ * reference that was keeping the fd alive, then sleep for
+ * NSR_FLUSH_INTERVAL.
+ */
+void *
+nsr_flush_thread (void *ctx)
+{
+        xlator_t                *this = ctx;
+        nsr_private_t           *priv = this->private;
+        struct list_head        batch;
+        nsr_fd_ctx_t            *cur;
+        nsr_fd_ctx_t            *next;
+
+        while (1) {
+                /*
+                 * Lock ordering matters here: priv->dirty_lock must not be
+                 * held while per-fd locks are taken and released.  So
+                 * dirty_lock is held only long enough to move the global
+                 * list onto a private head, leaving the global one empty.
+                 * A writer adding the first dirty entry to an fd can then
+                 * hit one of four cases:
+                 *
+                 * (1) We are asleep.  No contention; the fd is queued and
+                 *     handled on the next pass.
+                 *
+                 * (2) We have taken the batch but not reached that fd
+                 *     yet.  The entry is added under the fd lock and is
+                 *     handled on this pass.
+                 *
+                 * (3) We are in the middle of that fd.  The writer waits
+                 *     for the fd lock, then finds the fd off the list and
+                 *     queues it globally; handled on the next pass.
+                 *
+                 * (4) We are still working but already past that fd.
+                 *     Same as (1) from that fd's point of view.
+                 */
+                INIT_LIST_HEAD(&batch);
+                LOCK(&priv->dirty_lock);
+                list_splice_init(&priv->dirty_fds,&batch);
+                UNLOCK(&priv->dirty_lock);
+
+                list_for_each_entry_safe (cur, next, &batch, fd_list) {
+                        LOCK(&cur->fd->lock);
+                        nsr_flush_fd(this,cur);
+                        list_del_init(&cur->fd_list);
+                        UNLOCK(&cur->fd->lock);
+                        /* Matches the fd_ref taken when the fd was queued. */
+                        fd_unref(cur->fd);
+                }
+
+                sleep(NSR_FLUSH_INTERVAL);
+        }
+
+        return NULL;
+}
+
+/*
+ * Inode "forget" callback: detach whatever context this translator stored
+ * on the inode and free it.  Always returns 0.
+ */
+int32_t
+nsr_forget (xlator_t *this, inode_t *inode)
+{
+        uint64_t        value = 0LL;
+        int             rc;
+
+        rc = inode_ctx_del(inode,this,&value);
+        if ((rc != 0) || !value) {
+                /* Nothing was stored for us, or the stored value was zero. */
+                return 0;
+        }
+
+        GF_FREE((void *)(long)value);
+        return 0;
+}
+
+/*
+ * fd "release" callback: detach whatever context this translator stored on
+ * the fd and free it.  Always returns 0.
+ */
+int32_t
+nsr_release (xlator_t *this, fd_t *fd)
+{
+        uint64_t        value = 0LL;
+        int             rc;
+
+        rc = fd_ctx_del(fd,this,&value);
+        if ((rc != 0) || !value) {
+                /* Nothing was stored for us, or the stored value was zero. */
+                return 0;
+        }
+
+        GF_FREE((void *)(long)value);
+        return 0;
+}
+
+/*
+ * Lifecycle callbacks: nsr_forget and nsr_release free the per-inode and
+ * per-fd contexts owned by this translator.
+ */
+struct xlator_cbks cbks = {
+ .forget = nsr_forget,
+ .release = nsr_release,
+};
+
+/*
+ * Re-read options after a configuration change.  Only "leader" is handled;
+ * its new value is stored in priv->leader.
+ * Returns 0 on success and -1 when the err label is reached.
+ */
+int
+nsr_reconfigure (xlator_t *this, dict_t *options)
+{
+ nsr_private_t *priv = this->private;
+
+ /* The macro is handed the "err" label, presumably jumping there on a bad value. */
+ GF_OPTION_RECONF ("leader", priv->leader, options, bool, err);
+ return 0;
+
+err:
+ return -1;
+}
+
+/*
+ * Find the position of "kid" in this->children.
+ * Returns the zero-based index of the first matching entry, or -1 when
+ * kid is not one of our children.
+ */
+int
+nsr_get_child_index (xlator_t *this, xlator_t *kid)
+{
+        xlator_list_t   *node = this->children;
+        int              pos  = 0;
+
+        while (node) {
+                if (node->xlator == kid) {
+                        return pos;
+                }
+                node = node->next;
+                pos++;
+        }
+
+        return -1;
+}
+
+/*
+ * Child notifications are not reliable: sometimes a CHILD_DOWN arrives for
+ * a protocol/client child before any CHILD_UP was seen for it, sometimes
+ * not, and which one happens is effectively random (probably racy).  A
+ * simple up-counter would therefore drift.  Instead one state bit per child
+ * is kept in priv->kid_state, which turns a bogus CHILD_DOWN into a no-op,
+ * and counts are generated from those bits on demand.
+ *
+ * Every event, handled here or not, is then passed on to default_notify.
+ */
+int
+nsr_notify (xlator_t *this, int event, void *data, ...)
+{
+        nsr_private_t   *priv  = this->private;
+        xlator_t        *child = data;
+        int              slot  = -1;
+
+        /* Only the two child-state events carry a child xlator in data. */
+        if ((event == GF_EVENT_CHILD_UP) || (event == GF_EVENT_CHILD_DOWN)) {
+                slot = nsr_get_child_index(this,data);
+        }
+
+        if (slot >= 0) {
+                if (event == GF_EVENT_CHILD_UP) {
+                        priv->kid_state |= (1 << slot);
+                        gf_log (this->name, GF_LOG_INFO,
+                                "got CHILD_UP for %s, now %u kids",
+                                child->name,
+                                nsr_count_up_kids(priv));
+                } else {
+                        priv->kid_state &= ~(1 << slot);
+                        gf_log (this->name, GF_LOG_INFO,
+                                "got CHILD_DOWN for %s, now %u kids",
+                                child->name,
+                                nsr_count_up_kids(priv));
+                }
+        }
+
+        return default_notify(this,event,data);
+}
+
+
+extern void *nsr_leader_thread (void *);
+
+/*
+ * Translator init.  Validates the child layout (first child is the local
+ * subvolume, at least one more must follow), allocates the private state,
+ * reads the "etcd-servers", "my-name" and "vol-name" options and builds the
+ * reconciliation volfile/pidfile paths.  It then starts the flush thread,
+ * the reconciliation process (only when priv->vol_file exists) and the
+ * leader thread, and waits until the leader thread sets priv->leader_inited.
+ *
+ * Returns 0 on success, -1 on failure.  On failure the flush thread is
+ * stopped before priv is freed (it dereferences this->private), the path
+ * buffers are released and this->private is cleared.
+ * NOTE(review): priv->brick_uuid and this->local_pool are still not released
+ * on the failure path.
+ */
+int32_t
+nsr_init (xlator_t *this)
+{
+        xlator_list_t   *remote;
+        xlator_list_t   *local;
+        nsr_private_t   *priv = NULL;
+        xlator_list_t   *trav;
+        pthread_t        kid;
+        pthread_t        flush_kid;
+        int              flush_started = 0;
+        uuid_t           tmp_uuid;
+        char            *my_name = NULL, *recon_file = NULL, *recon_pid_file = NULL, *ptr = NULL;
+        char            *volname = NULL;
+        size_t           recon_len = 0;
+        size_t           pid_len = 0;
+        size_t           vol_len = 0;
+        int              n;
+        extern xlator_t global_xlator;
+        glusterfs_ctx_t *oldctx = global_xlator.ctx;
+        runner_t         runner = {0,};
+        int32_t          ret = -1;
+        struct stat      buf;
+
+        /*
+         * Any fop that gets special treatment has to be patched in here,
+         * because the compiled-in table is produced by the code generator and
+         * only contains generated functions.  Note that we have to go through
+         * this->fops because of some dynamic-linking strangeness; modifying
+         * the static table doesn't work.
+         */
+        this->fops->getxattr = nsr_getxattr_special;
+        this->fops->fsync = nsr_fsync;
+
+        local = this->children;
+        if (!local) {
+                gf_log (this->name, GF_LOG_ERROR, "no local subvolume");
+                goto err;
+        }
+
+        remote = local->next;
+        if (!remote) {
+                gf_log (this->name, GF_LOG_ERROR, "no remote subvolumes");
+                goto err;
+        }
+
+        this->local_pool = mem_pool_new (nsr_local_t, 128);
+        if (!this->local_pool) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to create nsr_local_t pool");
+                goto err;
+        }
+
+        priv = GF_CALLOC (1, sizeof(*priv), gf_mt_nsr_private_t);
+        if (!priv) {
+                gf_log (this->name, GF_LOG_ERROR, "could not allocate priv");
+                goto err;
+        }
+
+        // set this so that unless leader election is done, IO is fenced
+        priv->fence_io = 1;
+
+        for (trav = this->children; trav; trav = trav->next) {
+                ++(priv->n_children);
+        }
+
+        LOCK_INIT(&priv->dirty_lock);
+        INIT_LIST_HEAD(&priv->dirty_fds);
+
+        this->private = priv;
+
+        GF_OPTION_INIT ("etcd-servers", priv->etcd_servers, str, err);
+        if (!priv->etcd_servers) {
+                gf_log (this->name, GF_LOG_ERROR, "etcd servers not generated. ???");
+                goto err;
+        }
+        priv->vol_uuid = "temporary";
+        uuid_generate(tmp_uuid);
+        priv->brick_uuid = strdup(uuid_utoa(tmp_uuid));
+        priv->term_uuid = "nsr-term";
+        gf_log (this->name, GF_LOG_INFO,
+                "brick_uuid = %s\n", priv->brick_uuid);
+
+        GF_OPTION_INIT ("my-name", my_name, str, err);
+        if (!my_name) {
+                gf_log (this->name, GF_LOG_ERROR, "brick name not generated. ???");
+                goto err;
+        }
+        GF_OPTION_INIT ("vol-name", volname, str, err);
+        if (!volname) {
+                gf_log (this->name, GF_LOG_ERROR, "vol name not generated. ???");
+                goto err;
+        }
+
+        recon_len = PATH_MAX + strlen(my_name) + strlen("con") + 1;
+        pid_len = PATH_MAX + strlen(my_name) + strlen("recon") + 1;
+        recon_file = GF_CALLOC (1, recon_len, gf_mt_nsr_private_t);
+        recon_pid_file = GF_CALLOC (1, pid_len, gf_mt_nsr_private_t);
+        if ((!recon_file) || (!recon_pid_file)) {
+                gf_log (this->name, GF_LOG_ERROR, "could not allocate reconciliation file name");
+                goto err;
+        }
+
+        /*
+         * The brick name has the form host:/path; flatten it into a single
+         * file-name component.  This edits the option string in place, as
+         * before, but resumes the search after each hit instead of
+         * rescanning from the start.
+         */
+        for (ptr = strchr (my_name, '/'); ptr; ptr = strchr (ptr + 1, '/')) {
+                *ptr = '-';
+        }
+
+        /*
+         * Bounded formatting: a too-long volume or brick name now fails
+         * init instead of writing past the end of the buffer.
+         */
+        n = snprintf (recon_file, recon_len, "/%s/%s/%s/%s/%s-nsr-recon.vol",
+                      GLUSTERD_DEFAULT_WORKDIR,
+                      GLUSTERD_VOLUME_DIR_PREFIX,
+                      volname,
+                      GLUSTERD_BRICK_INFO_DIR,
+                      my_name);
+        if ((n < 0) || ((size_t)n >= recon_len)) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "reconciliation file name too long");
+                goto err;
+        }
+
+        n = snprintf (recon_pid_file, pid_len, "/%s/%s/%s/%s/%s-recon.pid",
+                      GLUSTERD_DEFAULT_WORKDIR,
+                      GLUSTERD_VOLUME_DIR_PREFIX,
+                      volname,
+                      "run",
+                      my_name);
+        if ((n < 0) || ((size_t)n >= pid_len)) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "reconciliation file name too long");
+                goto err;
+        }
+
+        vol_len = PATH_MAX + strlen(my_name) + strlen("con") + 1;
+        priv->vol_file = GF_CALLOC (1, vol_len, gf_mt_nsr_private_t);
+        if (!priv->vol_file) {
+                gf_log (this->name, GF_LOG_ERROR, "could not allocate reconciliation file name");
+                goto err;
+        }
+        n = snprintf (priv->vol_file, vol_len, "%s/%s/%s/%s/con:%s",
+                      GLUSTERD_DEFAULT_WORKDIR,
+                      GLUSTERD_VOLUME_DIR_PREFIX,
+                      volname,
+                      GLUSTERD_BRICK_INFO_DIR,
+                      my_name);
+        if ((n < 0) || ((size_t)n >= vol_len)) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "reconciliation file name too long");
+                goto err;
+        }
+
+        if (pthread_create(&flush_kid,NULL,nsr_flush_thread,this) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "could not start flush thread");
+                /* TBD: treat this as a fatal error? */
+        } else {
+                /* Remembered so the error path can stop it before freeing priv. */
+                flush_started = 1;
+        }
+
+        // Start the recon process.  Then start the leader thread.
+        /*
+         * REVIEW
+         * Logs belong in /var/log not /tmp.
+         */
+        if (!stat(priv->vol_file, &buf)) {
+                runinit (&runner);
+                runner_add_args(&runner, "/usr/local/sbin/glusterfs",
+                                "-f", recon_file,
+                                "-p", recon_pid_file,
+                                "-l", "/tmp/reconciliation.log",
+                                NULL);
+                ret = runner_run (&runner);
+                if (ret != 0) {
+                        gf_log (this->name, GF_LOG_ERROR, "could not exec reconciliation process " );
+                        goto err;
+                }
+
+                // TBD - convert this to make sure recon process runs
+                sleep(2);
+                priv->nsr_recon_start = _gf_true;
+        }
+
+        /*
+         * Without a leader thread nobody will ever set leader_inited, so a
+         * failed pthread_create must abort init rather than fall into the
+         * wait loop below.
+         */
+        if (pthread_create(&kid,NULL,nsr_leader_thread,this) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "could not start leader thread");
+                goto err;
+        }
+        while (priv->leader_inited == 0) {
+                sleep(1);
+        }
+        /*
+         * Calling glfs_new changes global_xlator.ctx, even if THIS still
+         * points to global_xlator.  That causes problems later in the main
+         * thread, when gf_log_dump_graph tries to use the FILE after
+         * we've mucked with it and gets a segfault in __fprintf_chk.
+         * We can avoid all that by undoing the damage before we
+         * continue.
+         */
+        global_xlator.ctx = oldctx;
+
+        /* The path buffers were only needed to launch the recon process. */
+        GF_FREE(recon_file);
+        GF_FREE(recon_pid_file);
+
+        return 0;
+
+err:
+        if (recon_file) {
+                GF_FREE(recon_file);
+        }
+        if (recon_pid_file) {
+                GF_FREE(recon_pid_file);
+        }
+        if (flush_started) {
+                /*
+                 * The flush thread reads this->private on every pass, so it
+                 * has to be gone before priv is freed.  With nothing queued
+                 * yet it is expected to be parked in sleep(), which is a
+                 * cancellation point.
+                 */
+                pthread_cancel(flush_kid);
+                pthread_join(flush_kid,NULL);
+        }
+        if (priv) {
+                if (priv->vol_file) {
+                        GF_FREE(priv->vol_file);
+                }
+                /* Do not leave a dangling pointer behind for later callers. */
+                this->private = NULL;
+                GF_FREE(priv);
+        }
+        return -1;
+}
+
+
+/*
+ * Translator teardown hook.  Currently empty: nothing that nsr_init set up
+ * (private state, threads, reconciliation process) is released here.
+ */
+void
+nsr_fini (xlator_t *this)
+{
+}
+
+/* Entry points of this translator: init, fini, reconfigure and notify. */
+class_methods_t class_methods = {
+ .init = nsr_init,
+ .fini = nsr_fini,
+ .reconfigure = nsr_reconfigure,
+ .notify = nsr_notify,
+};
+
+/*
+ * Options accepted by this translator.  "vol-name", "my-name" and
+ * "etcd-servers" are read in nsr_init, which fails if any of them is unset;
+ * "leader" is re-read in nsr_reconfigure.  The table ends with a NULL key.
+ */
+struct volume_options options[] = {
+        { .key = {"leader"},
+          .type = GF_OPTION_TYPE_BOOL,
+          .default_value = "false",
+          .description = "Start in the leader role.  This is only for "
+                         "bootstrapping the code, and should go away when we "
+                         "have real leader election."
+        },
+        { .key = {"vol-name"},
+          .type = GF_OPTION_TYPE_STR,
+          .description = "volume name"
+        },
+        { .key = {"my-name"},
+          .type = GF_OPTION_TYPE_STR,
+          .description = "brick name in form of host:/path"
+        },
+        { .key = {"etcd-servers"},
+          .type = GF_OPTION_TYPE_STR,
+          .description = "comma-separated list of etcd servers"
+        },
+        { .key = {NULL} },
+};
diff --git a/xlators/cluster/nsr-server/src/stub_etcd.c b/xlators/cluster/nsr-server/src/stub_etcd.c
new file mode 100644
index 000000000..83f5525a2
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/stub_etcd.c
@@ -0,0 +1,129 @@
+/*
+ * Stub version of etcd. If the etcd executable is present, this will
+ * behave exactly like the regular etcd API. Otherwise, it will stub out
+ * the API functions by using local files.
+ */
+
+#include "etcd-api.h"
+
+/* copied from glusterd-etcd.c */
+#define GLUSTERD_ETCD_DIR "/var/lib/glusterd/etcd"
+#define GLUSTERD_ETCD_CMD "/root/etcd/etcd"
+
+#define MAX_KEY_SIZE 256
+#define MAX_VALUE_SIZE 1023
+
+etcd_session *bogus_etcd = (void *)0x7766554433221100;
+
+/*
+ * Build "<base>/<key>" in dst, replacing every '/' inside key with '@' so
+ * that the key becomes a single file-name component.
+ *
+ * dst  - output buffer; the caller must provide at least
+ *        strlen(base) + 1 + strlen(key) + 1 bytes, nothing is checked here
+ * base - directory prefix, copied unchanged
+ * key  - key to append; it is not modified
+ */
+void
+concat_convert (char *dst, char *base, char *key)
+{
+        /* Copy the prefix INTO dst (never write through base or key). */
+        while (*base) {
+                *(dst++) = *(base++);
+        }
+        *(dst++) = '/';
+
+        while (*key) {
+                *(dst++) = (*key == '/') ? '@' : *key;
+                ++key;
+        }
+        *dst = '\0';
+}
+
+/*
+ * Open an etcd session.  When the etcd executable is not present (or not
+ * executable) the bogus_etcd sentinel is returned instead, which switches
+ * the other s_etcd_* calls to their local-file stubs.
+ */
+etcd_session
+s_etcd_open_str (char *server_names)
+{
+        if (access(GLUSTERD_ETCD_CMD,X_OK) != 0) {
+                return bogus_etcd;
+        }
+
+        return etcd_open_str(server_names);
+}
+
+/*
+ * Close a session obtained from s_etcd_open_str.  The bogus_etcd sentinel
+ * owns nothing, so only real sessions are passed on to etcd_close_str.
+ */
+void
+s_etcd_close_str (etcd_session this_as_void)
+{
+        if (this_as_void == bogus_etcd) {
+                return;
+        }
+
+        etcd_close_str(this_as_void);
+}
+
+/*
+ * Fetch the value stored under key.
+ *
+ * Real sessions are forwarded to etcd_get.  For the bogus_etcd sentinel the
+ * value is read from the file GLUSTERD_ETCD_DIR/<key with '/' -> '@'>; at
+ * most MAX_VALUE_SIZE bytes are read.
+ *
+ * Returns a strdup'd string that the caller must free, or NULL when the key
+ * is too long for the path buffer, the file cannot be opened or read, or
+ * the file is empty.
+ */
+char *
+s_etcd_get (etcd_session this, char *key)
+{
+        char            path[MAX_KEY_SIZE];
+        char            buf[MAX_VALUE_SIZE+1];
+        int             fd = -1;
+        ssize_t         bytes;
+        char            *retval = NULL;
+
+        if (this != bogus_etcd) {
+                return etcd_get(this,key);
+        }
+
+        /* directory + '/' + key + NUL must fit, or path[] would overflow */
+        if (strlen(GLUSTERD_ETCD_DIR) + 1 + strlen(key) + 1 > sizeof(path)) {
+                fprintf(stderr,"etcd key too long: %s\n",key);
+                goto err;
+        }
+        concat_convert(path,GLUSTERD_ETCD_DIR,key);
+
+        fd = open(path,O_RDONLY);
+        if (fd < 0) {
+                perror("open");
+                goto err;
+        }
+
+        bytes = read(fd,buf,MAX_VALUE_SIZE);
+        if (bytes <= 0) {
+                if (bytes < 0) {
+                        perror("read");
+                }
+                goto err;
+        }
+
+        /* buf has one spare byte beyond MAX_VALUE_SIZE for the terminator */
+        buf[bytes] = '\0';
+        retval = strdup(buf);
+
+err:
+        if (fd >= 0) {
+                close(fd);
+        }
+        return retval;
+}
+
+/*
+ * Store value under key.
+ *
+ * Real sessions are forwarded to etcd_set.  For the bogus_etcd sentinel the
+ * value, including its terminating NUL, is written to the file
+ * GLUSTERD_ETCD_DIR/<key with '/' -> '@'>, which is created if necessary.
+ * precond and ttl are ignored by the stub.
+ *
+ * Returns ETCD_OK when the whole value was written, ETCD_WTF otherwise
+ * (key too long for the path buffer, open failure, write error or short
+ * write).
+ */
+etcd_result
+s_etcd_set (etcd_session this, char *key, char *value,
+            char *precond, unsigned int ttl)
+{
+        char            path[MAX_KEY_SIZE];
+        int             fd = -1;
+        size_t          len;
+        ssize_t         bytes;
+        etcd_result     retval = ETCD_WTF;
+
+        if (this != bogus_etcd) {
+                return etcd_set(this,key,value,precond,ttl);
+        }
+
+        /* directory + '/' + key + NUL must fit, or path[] would overflow */
+        if (strlen(GLUSTERD_ETCD_DIR) + 1 + strlen(key) + 1 > sizeof(path)) {
+                fprintf(stderr,"etcd key too long: %s\n",key);
+                goto err;
+        }
+        concat_convert(path,GLUSTERD_ETCD_DIR,key);
+
+        /*
+         * The permission argument only takes effect together with O_CREAT;
+         * without it a key that was never written before could not be set.
+         */
+        fd = open(path,O_WRONLY|O_CREAT,0666);
+        if (fd < 0) {
+                perror("open");
+                goto err;
+        }
+
+        /* The NUL is stored too, so a shorter value hides any stale tail. */
+        len = strlen(value) + 1;
+        bytes = write(fd,value,len);
+        if (bytes < 0) {
+                perror("write");
+                goto err;
+        }
+        if ((size_t)bytes != len) {
+                /* short write: the stored value is incomplete */
+                goto err;
+        }
+
+        retval = ETCD_OK;
+
+err:
+        if (fd >= 0) {
+                close(fd);
+        }
+        return retval;
+}
+
+
diff --git a/xlators/cluster/nsr-server/src/yajl.c b/xlators/cluster/nsr-server/src/yajl.c
new file mode 100644
index 000000000..54e6474fc
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl.c
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "yajl/yajl_parse.h"
+#include "yajl_lex.h"
+#include "yajl_parser.h"
+#include "yajl_alloc.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <assert.h>
+
+/* Map a parser status code to a static English description; any value
+ * outside the known codes yields "unknown". */
+const char *
+yajl_status_to_string(yajl_status stat)
+{
+    switch (stat) {
+        case yajl_status_ok:
+            return "ok, no error";
+        case yajl_status_client_canceled:
+            return "client canceled parse";
+        case yajl_status_error:
+            return "parse error";
+    }
+
+    return "unknown";
+}
+
+/* Allocate and initialise a parser handle.
+ *
+ * callbacks - event callbacks, stored as given
+ * afs       - allocation routines; NULL selects the defaults, otherwise all
+ *             of malloc, realloc and free must be set
+ * ctx       - opaque pointer stored in the handle for the callbacks
+ *
+ * Returns the new handle, or NULL when afs is incomplete or the handle
+ * itself cannot be allocated. */
+yajl_handle
+yajl_alloc(const yajl_callbacks * callbacks,
+           yajl_alloc_funcs * afs,
+           void * ctx)
+{
+    yajl_handle hand = NULL;
+    yajl_alloc_funcs afsBuffer;
+
+    /* first order of business is to set up memory allocation routines */
+    if (afs != NULL) {
+        if (afs->malloc == NULL || afs->realloc == NULL || afs->free == NULL)
+        {
+            return NULL;
+        }
+    } else {
+        yajl_set_default_alloc_funcs(&afsBuffer);
+        afs = &afsBuffer;
+    }
+
+    hand = (yajl_handle) YA_MALLOC(afs, sizeof(struct yajl_handle_t));
+    if (hand == NULL) {
+        /* out of memory: report it instead of writing through NULL below */
+        return NULL;
+    }
+
+    /* copy in pointers to allocation routines; from here on the handle
+     * carries its own copy and afsBuffer may go out of scope */
+    memcpy((void *) &(hand->alloc), (void *) afs, sizeof(yajl_alloc_funcs));
+
+    hand->callbacks = callbacks;
+    hand->ctx = ctx;
+    hand->lexer = NULL;
+    hand->bytesConsumed = 0;
+    hand->decodeBuf = yajl_buf_alloc(&(hand->alloc));
+    hand->flags = 0;
+    yajl_bs_init(hand->stateStack, &(hand->alloc));
+    yajl_bs_push(hand->stateStack, yajl_state_start);
+
+    return hand;
+}
+
+/* Switch one parser option on or off.  For a known option the single
+ * variadic int argument is read: non-zero sets the flag, zero clears it.
+ * Returns 1 when the option was recognised and 0 otherwise, in which case
+ * no argument is consumed and the handle is left untouched. */
+int
+yajl_config(yajl_handle h, yajl_option opt, ...)
+{
+    int recognised = 0;
+    va_list args;
+
+    va_start(args, opt);
+
+    if (opt == yajl_allow_comments ||
+        opt == yajl_dont_validate_strings ||
+        opt == yajl_allow_trailing_garbage ||
+        opt == yajl_allow_multiple_values ||
+        opt == yajl_allow_partial_values)
+    {
+        int enable = va_arg(args, int);
+
+        if (enable) {
+            h->flags |= opt;
+        } else {
+            h->flags &= ~opt;
+        }
+        recognised = 1;
+    }
+
+    va_end(args);
+
+    return recognised;
+}
+
+/* Release a parser handle together with its lexer (if one was ever
+ * created), its decode buffer and its state stack.  The handle must not be
+ * used afterwards. */
+void
+yajl_free(yajl_handle handle)
+{
+    if (handle->lexer != NULL) {
+        yajl_lex_free(handle->lexer);
+        handle->lexer = NULL;
+    }
+    yajl_buf_free(handle->decodeBuf);
+    yajl_bs_free(handle->stateStack);
+
+    /* the handle goes last: its embedded alloc table is what frees it */
+    YA_FREE(&(handle->alloc), handle);
+}
+
+/* Feed jsonTextLen bytes of JSON text to the parser and return the status
+ * reported by yajl_do_parse. */
+yajl_status
+yajl_parse(yajl_handle hand, const unsigned char * jsonText,
+           size_t jsonTextLen)
+{
+    /* The lexer is created on first use, from whatever flags are set on
+     * the handle at that moment. */
+    if (!hand->lexer) {
+        hand->lexer = yajl_lex_alloc(&(hand->alloc),
+                                     hand->flags & yajl_allow_comments,
+                                     !(hand->flags & yajl_dont_validate_strings));
+    }
+
+    return yajl_do_parse(hand, jsonText, jsonTextLen);
+}
+
+
+/* Signal end of input and return the status reported by yajl_do_finish. */
+yajl_status
+yajl_complete_parse(yajl_handle hand)
+{
+    /* Normally the first yajl_parse() call creates the lexer.  If parse was
+     * never called, no input was supplied at all, which counts as a
+     * "premature EOF" error unless yajl_allow_partial_values is set.
+     * Creating the lexer here is the simplest way to cover that case
+     * without disturbing the parser's other semantics (multiple values,
+     * partial values, etc). */
+    if (!hand->lexer) {
+        hand->lexer = yajl_lex_alloc(&(hand->alloc),
+                                     hand->flags & yajl_allow_comments,
+                                     !(hand->flags & yajl_dont_validate_strings));
+    }
+
+    return yajl_do_finish(hand);
+}
+
+/* Produce an error description for the current parser state.  Thin wrapper
+ * around yajl_render_error_string; note that "verbose" moves from second
+ * position here to last position there. */
+unsigned char *
+yajl_get_error(yajl_handle hand, int verbose,
+ const unsigned char * jsonText, size_t jsonTextLen)
+{
+ return yajl_render_error_string(hand, jsonText, jsonTextLen, verbose);
+}
+
+/* Return the handle's bytesConsumed counter, or 0 for a NULL handle. */
+size_t
+yajl_get_bytes_consumed(yajl_handle hand)
+{
+    if (hand == NULL) {
+        return 0;
+    }
+
+    return hand->bytesConsumed;
+}
+
+
+/* Free an error string, using the allocation routines stored in the
+ * handle. */
+void
+yajl_free_error(yajl_handle hand, unsigned char * str)
+{
+ /* use memory allocation functions if set */
+ YA_FREE(&(hand->alloc), str);
+}
+
+/* XXX: add utility routines to parse from file */
diff --git a/xlators/cluster/nsr-server/src/yajl/yajl_common.h b/xlators/cluster/nsr-server/src/yajl/yajl_common.h
new file mode 100644
index 000000000..49ca3a5cb
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl/yajl_common.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef __YAJL_COMMON_H__
+#define __YAJL_COMMON_H__
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define YAJL_MAX_DEPTH 128
+
+/* msft dll export gunk. To build a DLL on windows, you
+ * must define WIN32, YAJL_SHARED, and YAJL_BUILD. To use a shared
+ * DLL, you must define YAJL_SHARED and WIN32 */
+#if defined(WIN32) && defined(YAJL_SHARED)
+# ifdef YAJL_BUILD
+# define YAJL_API __declspec(dllexport)
+# else
+# define YAJL_API __declspec(dllimport)
+# endif
+#else
+# if defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__) >= 303
+# define YAJL_API __attribute__ ((visibility("default")))
+# else
+# define YAJL_API
+# endif
+#endif
+
+/** pointer to a malloc function, supporting client overriding memory
+ * allocation routines */
+typedef void * (*yajl_malloc_func)(void *ctx, size_t sz);
+
+/** pointer to a free function, supporting client overriding memory
+ * allocation routines */
+typedef void (*yajl_free_func)(void *ctx, void * ptr);
+
+/** pointer to a realloc function which can resize an allocation. */
+typedef void * (*yajl_realloc_func)(void *ctx, void * ptr, size_t sz);
+
+/** A structure which can be passed to yajl_*_alloc routines to allow the
+ * client to specify memory allocation functions to be used. */
+typedef struct
+{
+ /** pointer to a function that can allocate uninitialized memory */
+ yajl_malloc_func malloc;
+ /** pointer to a function that can resize memory allocations */
+ yajl_realloc_func realloc;
+ /** pointer to a function that can free memory allocated using
+ * reallocFunction or mallocFunction */
+ yajl_free_func free;
+ /** a context pointer that will be passed to above allocation routines */
+ void * ctx;
+} yajl_alloc_funcs;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/xlators/cluster/nsr-server/src/yajl/yajl_gen.h b/xlators/cluster/nsr-server/src/yajl/yajl_gen.h
new file mode 100644
index 000000000..52fa99fc2
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl/yajl_gen.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/**
+ * \file yajl_gen.h
+ * Interface to YAJL's JSON generation facilities.
+ */
+
+#include <yajl/yajl_common.h>
+
+#ifndef __YAJL_GEN_H__
+#define __YAJL_GEN_H__
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+ /** generator status codes */
+ typedef enum {
+ /** no error */
+ yajl_gen_status_ok = 0,
+ /** at a point where a map key is generated, a function other than
+ * yajl_gen_string was called */
+ yajl_gen_keys_must_be_strings,
+ /** YAJL's maximum generation depth was exceeded. see
+ * YAJL_MAX_DEPTH */
+ yajl_max_depth_exceeded,
+ /** A generator function (yajl_gen_XXX) was called while in an error
+ * state */
+ yajl_gen_in_error_state,
+ /** A complete JSON document has been generated */
+ yajl_gen_generation_complete,
+ /** yajl_gen_double was passed an invalid floating point value
+ * (infinity or NaN). */
+ yajl_gen_invalid_number,
+ /** A print callback was passed in, so there is no internal
+ * buffer to get from */
+ yajl_gen_no_buf,
+ /** returned from yajl_gen_string() when the yajl_gen_validate_utf8
+ * option is enabled and an invalid UTF8 string was passed by client code.
+ */
+ yajl_gen_invalid_string
+ } yajl_gen_status;
+
+ /** an opaque handle to a generator */
+ typedef struct yajl_gen_t * yajl_gen;
+
+ /** a callback used for "printing" the results. */
+ typedef void (*yajl_print_t)(void * ctx,
+ const char * str,
+ size_t len);
+
+ /** configuration parameters for the parser, these may be passed to
+ * yajl_gen_config() along with option specific argument(s). In general,
+ * all configuration parameters default to *off*. */
+ typedef enum {
+ /** generate indented (beautiful) output */
+ yajl_gen_beautify = 0x01,
+ /**
+ * Set an indent string which is used when yajl_gen_beautify
+ * is enabled. Maybe something like \\t or some number of
+ * spaces. The default is four spaces ' '.
+ */
+ yajl_gen_indent_string = 0x02,
+ /**
+ * Set a function and context argument that should be used to
+ * output generated json. the function should conform to the
+ * yajl_print_t prototype while the context argument is a
+ * void * of your choosing.
+ *
+ * example:
+ * yajl_gen_config(g, yajl_gen_print_callback, myFunc, myVoidPtr);
+ */
+ yajl_gen_print_callback = 0x04,
+ /**
+ * Normally the generator does not validate that strings you
+ * pass to it via yajl_gen_string() are valid UTF8. Enabling
+ * this option will cause it to do so.
+ */
+ yajl_gen_validate_utf8 = 0x08,
+ /**
+ * the forward solidus (slash or '/' in human) is not required to be
+ * escaped in json text. By default, YAJL will not escape it in the
+ * interest of saving bytes. Setting this flag will cause YAJL to
+ * always escape '/' in generated JSON strings.
+ */
+ yajl_gen_escape_solidus = 0x10
+ } yajl_gen_option;
+
+ /** allow the modification of generator options subsequent to handle
+ * allocation (via yajl_gen_alloc)
+ * \returns zero in case of errors, non-zero otherwise
+ */
+ YAJL_API int yajl_gen_config(yajl_gen g, yajl_gen_option opt, ...);
+
+ /** allocate a generator handle
+ * \param allocFuncs an optional pointer to a structure which allows
+ * the client to override the memory allocation
+ * used by yajl. May be NULL, in which case
+ * malloc/free/realloc will be used.
+ *
+ * \returns an allocated handle on success, NULL on failure (bad params)
+ */
+ YAJL_API yajl_gen yajl_gen_alloc(const yajl_alloc_funcs * allocFuncs);
+
+ /** free a generator handle */
+ YAJL_API void yajl_gen_free(yajl_gen handle);
+
+ YAJL_API yajl_gen_status yajl_gen_integer(yajl_gen hand, long long int number);
+ /** generate a floating point number. number may not be infinity or
+ * NaN, as these have no representation in JSON. In these cases the
+ * generator will return 'yajl_gen_invalid_number' */
+ YAJL_API yajl_gen_status yajl_gen_double(yajl_gen hand, double number);
+ YAJL_API yajl_gen_status yajl_gen_number(yajl_gen hand,
+ const char * num,
+ size_t len);
+ YAJL_API yajl_gen_status yajl_gen_string(yajl_gen hand,
+ const unsigned char * str,
+ size_t len);
+ YAJL_API yajl_gen_status yajl_gen_null(yajl_gen hand);
+ YAJL_API yajl_gen_status yajl_gen_bool(yajl_gen hand, int boolean);
+ YAJL_API yajl_gen_status yajl_gen_map_open(yajl_gen hand);
+ YAJL_API yajl_gen_status yajl_gen_map_close(yajl_gen hand);
+ YAJL_API yajl_gen_status yajl_gen_array_open(yajl_gen hand);
+ YAJL_API yajl_gen_status yajl_gen_array_close(yajl_gen hand);
+
+ /** access the null terminated generator buffer. If incrementally
+ * outputting JSON, one should call yajl_gen_clear to clear the
+ * buffer. This allows stream generation. */
+ YAJL_API yajl_gen_status yajl_gen_get_buf(yajl_gen hand,
+ const unsigned char ** buf,
+ size_t * len);
+
+ /** clear yajl's output buffer, but maintain all internal generation
+ * state. This function will not "reset" the generator state, and is
+ * intended to enable incremental JSON outputting. */
+ YAJL_API void yajl_gen_clear(yajl_gen hand);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/xlators/cluster/nsr-server/src/yajl/yajl_parse.h b/xlators/cluster/nsr-server/src/yajl/yajl_parse.h
new file mode 100644
index 000000000..55c831101
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl/yajl_parse.h
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/**
+ * \file yajl_parse.h
+ * Interface to YAJL's JSON stream parsing facilities.
+ */
+
+#include <yajl/yajl_common.h>
+
+#ifndef __YAJL_PARSE_H__
+#define __YAJL_PARSE_H__
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+ /** error codes returned from this interface */
+ typedef enum {
+ /** no error was encountered */
+ yajl_status_ok,
+ /** a client callback returned zero, stopping the parse */
+ yajl_status_client_canceled,
+ /** An error occurred during the parse. Call yajl_get_error for
+ * more information about the encountered error */
+ yajl_status_error
+ } yajl_status;
+
+ /** attain a human readable, english, string for an error */
+ YAJL_API const char * yajl_status_to_string(yajl_status code);
+
+ /** an opaque handle to a parser */
+ typedef struct yajl_handle_t * yajl_handle;
+
+ /** yajl is an event driven parser. this means as json elements are
+ * parsed, you are called back to do something with the data. The
+ * functions in this table indicate the various events for which
+ * you will be called back. Each callback accepts a "context"
+ * pointer, this is a void * that is passed into the yajl_parse
+ * function which the client code may use to pass around context.
+ *
+ * All callbacks return an integer. If non-zero, the parse will
+ * continue. If zero, the parse will be canceled and
+ * yajl_status_client_canceled will be returned from the parse.
+ *
+ * \attention {
+ * A note about the handling of numbers:
+ *
+ * yajl will only convert numbers that can be represented in a
+ * double or a 64 bit (long long) int. All other numbers will
+ * be passed to the client in string form using the yajl_number
+ * callback. Furthermore, if yajl_number is not NULL, it will
+ * always be used to return numbers, that is yajl_integer and
+ * yajl_double will be ignored. If yajl_number is NULL but one
+ * of yajl_integer or yajl_double are defined, parsing of a
+ * number larger than is representable in a double or 64 bit
+ * integer will result in a parse error.
+ * }
+ */
+ typedef struct {
+ int (* yajl_null)(void * ctx);
+ int (* yajl_boolean)(void * ctx, int boolVal);
+ int (* yajl_integer)(void * ctx, long long integerVal);
+ int (* yajl_double)(void * ctx, double doubleVal);
+ /** A callback which passes the string representation of the number
+ * back to the client. Will be used for all numbers when present */
+ int (* yajl_number)(void * ctx, const char * numberVal,
+ size_t numberLen);
+
+ /** strings are returned as pointers into the JSON text when,
+ * possible, as a result, they are _not_ null padded */
+ int (* yajl_string)(void * ctx, const unsigned char * stringVal,
+ size_t stringLen);
+
+ int (* yajl_start_map)(void * ctx);
+ int (* yajl_map_key)(void * ctx, const unsigned char * key,
+ size_t stringLen);
+ int (* yajl_end_map)(void * ctx);
+
+ int (* yajl_start_array)(void * ctx);
+ int (* yajl_end_array)(void * ctx);
+ } yajl_callbacks;
+
+ /** allocate a parser handle
+ * \param callbacks a yajl callbacks structure specifying the
+ * functions to call when different JSON entities
+ * are encountered in the input text. May be NULL,
+ * which is only useful for validation.
+ * \param afs memory allocation functions, may be NULL to use
+ * C runtime library routines (malloc and friends)
+ * \param ctx a context pointer that will be passed to callbacks.
+ */
+ YAJL_API yajl_handle yajl_alloc(const yajl_callbacks * callbacks,
+ yajl_alloc_funcs * afs,
+ void * ctx);
+
+
+ /** configuration parameters for the parser, these may be passed to
+ * yajl_config() along with option specific argument(s). In general,
+ * all configuration parameters default to *off*. */
+ typedef enum {
+ /** Ignore javascript style comments present in
+ * JSON input. Non-standard, but rather fun
+ * arguments: toggled off with integer zero, on otherwise.
+ *
+ * example:
+ * yajl_config(h, yajl_allow_comments, 1); // turn comment support on
+ */
+ yajl_allow_comments = 0x01,
+ /**
+ * By default the parser verifies that all strings in JSON input are
+ * valid UTF8 and emits a parse error if this is not so. That check
+ * makes parsing slightly more expensive (~7% depending on processor
+ * and compiler in use); setting this flag turns the check off.
+ *
+ * example:
+ * yajl_config(h, yajl_dont_validate_strings, 1); // disable utf8 checking
+ */
+ yajl_dont_validate_strings = 0x02,
+ /**
+ * By default, upon calls to yajl_complete_parse(), yajl will
+ * ensure the entire input text was consumed and will raise an error
+ * otherwise. Enabling this flag will cause yajl to disable this
+ * check. This can be useful when parsing json out of a stream that contains more
+ * than a single JSON document.
+ */
+ yajl_allow_trailing_garbage = 0x04,
+ /**
+ * Allow multiple values to be parsed by a single handle. The
+ * entire text must be valid JSON, and values can be separated
+ * by any kind of whitespace. This flag will change the
+ * behavior of the parser, and cause it continue parsing after
+ * a value is parsed, rather than transitioning into a
+ * complete state. This option can be useful when parsing multiple
+ * values from an input stream.
+ */
+ yajl_allow_multiple_values = 0x08,
+ /**
+ * When yajl_complete_parse() is called the parser will
+ * check that the top level value was completely consumed. I.E.,
+ * if called whilst in the middle of parsing a value
+ * yajl will enter an error state (premature EOF). Setting this
+ * flag suppresses that check and the corresponding error.
+ */
+ yajl_allow_partial_values = 0x10
+ } yajl_option;
+
+ /** allow the modification of parser options subsequent to handle
+ * allocation (via yajl_alloc)
+ * \returns zero in case of errors, non-zero otherwise
+ */
+ YAJL_API int yajl_config(yajl_handle h, yajl_option opt, ...);
+
+ /** free a parser handle */
+ YAJL_API void yajl_free(yajl_handle handle);
+
+ /** Parse some json!
+ * \param hand - a handle to the json parser allocated with yajl_alloc
+ * \param jsonText - a pointer to the UTF8 json text to be parsed
+ * \param jsonTextLength - the length, in bytes, of input text
+ */
+ YAJL_API yajl_status yajl_parse(yajl_handle hand,
+ const unsigned char * jsonText,
+ size_t jsonTextLength);
+
+ /** Parse any remaining buffered json.
+ * Since yajl is a stream-based parser, without an explicit end of
+ * input, yajl sometimes can't decide if content at the end of the
+ * stream is valid or not. For example, if "1" has been fed in,
+ * yajl can't know whether another digit is next or some character
+ * that would terminate the integer token.
+ *
+ * \param hand - a handle to the json parser allocated with yajl_alloc
+ */
+ YAJL_API yajl_status yajl_complete_parse(yajl_handle hand);
+
+ /** get an error string describing the state of the
+ * parse.
+ *
+ * If verbose is non-zero, the message will include the JSON
+ * text where the error occurred, along with an arrow pointing to
+ * the specific char.
+ *
+ * \returns A dynamically allocated string will be returned which should
+ * be freed with yajl_free_error
+ */
+ YAJL_API unsigned char * yajl_get_error(yajl_handle hand, int verbose,
+ const unsigned char * jsonText,
+ size_t jsonTextLength);
+
+ /**
+ * get the amount of data consumed from the last chunk passed to YAJL.
+ *
+ * In the case of a successful parse this can help you understand if
+ * the entire buffer was consumed (which will allow you to handle
+ * "junk at end of input").
+ *
+ * In the event an error is encountered during parsing, this function
+ * affords the client a way to get the offset into the most recent
+ * chunk where the error occurred. 0 will be returned if no error
+ * was encountered.
+ */
+ YAJL_API size_t yajl_get_bytes_consumed(yajl_handle hand);
+
+ /** free an error returned from yajl_get_error */
+ YAJL_API void yajl_free_error(yajl_handle hand, unsigned char * str);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/xlators/cluster/nsr-server/src/yajl/yajl_tree.h b/xlators/cluster/nsr-server/src/yajl/yajl_tree.h
new file mode 100644
index 000000000..8b377f636
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl/yajl_tree.h
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2010-2011 Florian Forster <ff at octo.it>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/**
+ * \file yajl_tree.h
+ *
+ * Parses JSON data and returns the data in tree form.
+ *
+ * \author Florian Forster
+ * \date August 2010
+ *
+ * This interface makes quick parsing and extraction of
+ * smallish JSON docs trivial:
+ *
+ * \include example/parse_config.c
+ */
+
+#ifndef YAJL_TREE_H
+#define YAJL_TREE_H 1
+
+#include <yajl/yajl_common.h>
+
+/** possible data types that a yajl_val_s can hold */
+typedef enum {
+ yajl_t_string = 1,
+ yajl_t_number = 2,
+ yajl_t_object = 3,
+ yajl_t_array = 4,
+ yajl_t_true = 5,
+ yajl_t_false = 6,
+ yajl_t_null = 7,
+ /** The any type isn't valid for yajl_val_s.type, but can be
+ * used as an argument to routines like yajl_tree_get().
+ */
+ yajl_t_any = 8
+} yajl_type;
+
+#define YAJL_NUMBER_INT_VALID 0x01
+#define YAJL_NUMBER_DOUBLE_VALID 0x02
+
+/** A pointer to a node in the parse tree */
+typedef struct yajl_val_s * yajl_val;
+
+/**
+ * A JSON value representation capable of holding one of the seven
+ * types above. For "string", "number", "object", and "array"
+ * additional data is available in the union. The "YAJL_IS_*"
+ * and "YAJL_GET_*" macros below allow type checking and convenient
+ * value extraction.
+ */
+struct yajl_val_s
+{
+ /** Type of the value contained. Use the "YAJL_IS_*" macros to check for a
+ * specific type. */
+ yajl_type type;
+ /** Type-specific data. You may use the "YAJL_GET_*" macros to access these
+ * members. Only the member matching \em type is meaningful. */
+ union
+ {
+ char * string;
+ struct {
+ long long i; /**< integer value, if representable. */
+ double d; /**< double value, if representable. */
+ char *r; /**< unparsed number in string form. */
+ /** Signals whether the \em i and \em d members are
+ * valid. See \c YAJL_NUMBER_INT_VALID and
+ * \c YAJL_NUMBER_DOUBLE_VALID. */
+ unsigned int flags;
+ } number;
+ struct {
+ const char **keys; /**< Array of keys */
+ yajl_val *values; /**< Array of values. */
+ size_t len; /**< Number of key-value-pairs. */
+ } object;
+ struct {
+ yajl_val *values; /**< Array of elements. */
+ size_t len; /**< Number of elements. */
+ } array;
+ } u;
+};
+
+/**
+ * Parse a string.
+ *
+ * Parses a null-terminated string containing JSON data and returns a pointer
+ * to the top-level value (root of the parse tree).
+ *
+ * \param input Pointer to a null-terminated utf8 string containing
+ * JSON data.
+ * \param error_buffer Pointer to a buffer in which an error message will
+ * be stored if \em yajl_tree_parse fails, or
+ * \c NULL. The buffer will be initialized before
+ * parsing, so its content will be destroyed even if
+ * \em yajl_tree_parse succeeds.
+ * \param error_buffer_size Size of the memory area pointed to by
+ * \em error_buffer. If \em error_buffer is
+ * \c NULL, this argument is ignored.
+ *
+ * \returns Pointer to the top-level value or \c NULL on error. The memory
+ * pointed to must be freed using \em yajl_tree_free. In case of an error, a
+ * null terminated message describing the error in more detail is stored in
+ * \em error_buffer if it is not \c NULL.
+ */
+YAJL_API yajl_val yajl_tree_parse (const char *input,
+ char *error_buffer, size_t error_buffer_size);
+
+/**
+ * Free a parse tree returned by "yajl_tree_parse".
+ *
+ * \param v Pointer to a JSON value returned by "yajl_tree_parse". Passing NULL
+ * is valid and results in a no-op.
+ */
+YAJL_API void yajl_tree_free (yajl_val v);
+
+/**
+ * Access a nested value inside a tree.
+ *
+ * \param parent the node under which you'd like to extract values.
+ * \param path A null terminated array of strings, each the name of an object key
+ * \param type the yajl_type of the object you seek, or yajl_t_any if any will do.
+ *
+ * \returns a pointer to the found value, or NULL if we came up empty.
+ *
+ * Future Ideas: it'd be nice to move path to a string and implement support for
+ * a teeny tiny micro language here, so you can extract array elements, do things
+ * like .first and .last, even .length. Inspiration from JSONPath and css selectors?
+ * No it wouldn't be fast, but that's not what this API is about.
+ */
+YAJL_API yajl_val yajl_tree_get(yajl_val parent, const char ** path, yajl_type type);
+
+/* Various convenience macros to check the type of a `yajl_val`.
+ * Every macro accepts NULL and evaluates to 0 for it.  Note that the
+ * argument is expanded more than once, so it must be free of side effects. */
+#define YAJL_IS_STRING(v) (((v) != NULL) && ((v)->type == yajl_t_string))
+#define YAJL_IS_NUMBER(v) (((v) != NULL) && ((v)->type == yajl_t_number))
+/* The validity flags live in the `number` member of the union, not directly
+ * in the union itself. */
+#define YAJL_IS_INTEGER(v) (YAJL_IS_NUMBER(v) && ((v)->u.number.flags & YAJL_NUMBER_INT_VALID))
+#define YAJL_IS_DOUBLE(v) (YAJL_IS_NUMBER(v) && ((v)->u.number.flags & YAJL_NUMBER_DOUBLE_VALID))
+#define YAJL_IS_OBJECT(v) (((v) != NULL) && ((v)->type == yajl_t_object))
+#define YAJL_IS_ARRAY(v) (((v) != NULL) && ((v)->type == yajl_t_array ))
+#define YAJL_IS_TRUE(v) (((v) != NULL) && ((v)->type == yajl_t_true ))
+#define YAJL_IS_FALSE(v) (((v) != NULL) && ((v)->type == yajl_t_false ))
+#define YAJL_IS_NULL(v) (((v) != NULL) && ((v)->type == yajl_t_null ))
+
+/** Given a yajl_val_string return a ptr to the bare string it contains,
+ * or NULL if the value is not a string. */
+#define YAJL_GET_STRING(v) (YAJL_IS_STRING(v) ? (v)->u.string : NULL)
+
+/** Get the string representation of a number. You should check type first,
+ * perhaps using YAJL_IS_NUMBER */
+#define YAJL_GET_NUMBER(v) ((v)->u.number.r)
+
+/** Get the double representation of a number. You should check type first,
+ * perhaps using YAJL_IS_DOUBLE */
+#define YAJL_GET_DOUBLE(v) ((v)->u.number.d)
+
+/** Get the 64bit (long long) integer representation of a number. You should
+ * check type first, perhaps using YAJL_IS_INTEGER */
+#define YAJL_GET_INTEGER(v) ((v)->u.number.i)
+
+/** Get a pointer to a yajl_val_object or NULL if the value is not an object. */
+#define YAJL_GET_OBJECT(v) (YAJL_IS_OBJECT(v) ? &(v)->u.object : NULL)
+
+/** Get a pointer to a yajl_val_array or NULL if the value is not an array. */
+#define YAJL_GET_ARRAY(v) (YAJL_IS_ARRAY(v) ? &(v)->u.array : NULL)
+
+#endif /* YAJL_TREE_H */
diff --git a/xlators/cluster/nsr-server/src/yajl/yajl_version.h b/xlators/cluster/nsr-server/src/yajl/yajl_version.h
new file mode 100644
index 000000000..0fba9b8fc
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl/yajl_version.h
@@ -0,0 +1,23 @@
+#ifndef YAJL_VERSION_H_
+#define YAJL_VERSION_H_
+
+#include <yajl/yajl_common.h>
+
+#define YAJL_MAJOR 2
+#define YAJL_MINOR 0
+#define YAJL_MICRO 1
+
+#define YAJL_VERSION ((YAJL_MAJOR * 10000) + (YAJL_MINOR * 100) + YAJL_MICRO)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern int YAJL_API yajl_version(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* YAJL_VERSION_H_ */
+
diff --git a/xlators/cluster/nsr-server/src/yajl_alloc.c b/xlators/cluster/nsr-server/src/yajl_alloc.c
new file mode 100644
index 000000000..276315af7
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_alloc.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/**
+ * \file yajl_alloc.c
+ * default memory allocation routines for yajl which use malloc/realloc and
+ * free
+ */
+
+#include "yajl_alloc.h"
+#include <stdlib.h>
+
+/* Default allocation hook: forwards to malloc(); the context is unused. */
+static void * yajl_internal_malloc(void *ctx, size_t sz)
+{
+    void * block;
+
+    (void) ctx;
+    block = malloc(sz);
+    return block;
+}
+
+/* Default resize hook: forwards to realloc(); the context is unused. */
+static void * yajl_internal_realloc(void *ctx, void * previous,
+                                    size_t sz)
+{
+    void * resized;
+
+    (void) ctx;
+    resized = realloc(previous, sz);
+    return resized;
+}
+
+/* Default release hook: forwards to free(); the context is unused. */
+static void yajl_internal_free(void *ctx, void * ptr)
+{
+    (void) ctx;
+    free(ptr);
+}
+
+/* Fill *yaf with the malloc/realloc/free based hooks defined in this file
+ * and a NULL context. */
+void yajl_set_default_alloc_funcs(yajl_alloc_funcs * yaf)
+{
+    yaf->ctx = NULL;
+    yaf->malloc = yajl_internal_malloc;
+    yaf->realloc = yajl_internal_realloc;
+    yaf->free = yajl_internal_free;
+}
+
diff --git a/xlators/cluster/nsr-server/src/yajl_alloc.h b/xlators/cluster/nsr-server/src/yajl_alloc.h
new file mode 100644
index 000000000..a8a9e45e6
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_alloc.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/**
+ * \file yajl_alloc.h
+ * default memory allocation routines for yajl which use malloc/realloc and
+ * free
+ */
+
+#ifndef __YAJL_ALLOC_H__
+#define __YAJL_ALLOC_H__
+
+#include "yajl/yajl_common.h"
+
+#define YA_MALLOC(afs, sz) (afs)->malloc((afs)->ctx, (sz))
+#define YA_FREE(afs, ptr) (afs)->free((afs)->ctx, (ptr))
+#define YA_REALLOC(afs, ptr, sz) (afs)->realloc((afs)->ctx, (ptr), (sz))
+
+void yajl_set_default_alloc_funcs(yajl_alloc_funcs * yaf);
+
+#endif
diff --git a/xlators/cluster/nsr-server/src/yajl_buf.c b/xlators/cluster/nsr-server/src/yajl_buf.c
new file mode 100644
index 000000000..0d249d364
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_buf.c
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "yajl_buf.h"
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define YAJL_BUF_INIT_SIZE 2048
+
+/* Growable byte buffer backing the yajl_buf handle. */
+struct yajl_buf_t {
+ /* number of bytes currently allocated at data */
+ size_t len;
+ /* number of payload bytes stored; yajl_buf_append writes a 0 at data[used] */
+ size_t used;
+ /* storage; stays NULL until yajl_buf_ensure_available runs for the first time */
+ unsigned char * data;
+ /* allocation routines used for data and for the struct itself */
+ yajl_alloc_funcs * alloc;
+};
+
+/*
+ * Make sure `want` more bytes plus a trailing 0 fit behind the bytes already
+ * stored in buf, allocating the initial YAJL_BUF_INIT_SIZE block on first use
+ * and doubling the capacity as often as needed afterwards.
+ *
+ * The function cannot report failure, so an allocation failure or a capacity
+ * that would overflow size_t terminates the process via abort() instead of
+ * continuing with a NULL or undersized buffer.
+ */
+static
+void yajl_buf_ensure_available(yajl_buf buf, size_t want)
+{
+    size_t need;
+
+    assert(buf != NULL);
+
+    /* first call */
+    if (buf->data == NULL) {
+        buf->len = YAJL_BUF_INIT_SIZE;
+        buf->data = (unsigned char *) YA_MALLOC(buf->alloc, buf->len);
+        if (buf->data == NULL) abort();
+        buf->data[0] = 0;
+    }
+
+    need = buf->len;
+
+    while (want >= (need - buf->used)) {
+        /* doubling once more would wrap around to a tiny (or zero) size */
+        if (need > ((size_t) -1) / 2) abort();
+        need <<= 1;
+    }
+
+    if (need != buf->len) {
+        /* keep the old pointer until the resize is known to have worked */
+        unsigned char * grown =
+            (unsigned char *) YA_REALLOC(buf->alloc, buf->data, need);
+        if (grown == NULL) abort();
+        buf->data = grown;
+        buf->len = need;
+    }
+}
+
+/*
+ * Create an empty buffer that allocates through `alloc`.  No storage for
+ * payload bytes is reserved yet.  Returns NULL when the allocator fails.
+ */
+yajl_buf yajl_buf_alloc(yajl_alloc_funcs * alloc)
+{
+    yajl_buf b = YA_MALLOC(alloc, sizeof(struct yajl_buf_t));
+    /* do not memset through a failed allocation */
+    if (b == NULL) return NULL;
+    memset((void *) b, 0, sizeof(struct yajl_buf_t));
+    b->alloc = alloc;
+    return b;
+}
+
+/* Release the payload storage (if any was ever allocated) and then the
+ * buffer object itself.  buf must not be NULL. */
+void yajl_buf_free(yajl_buf buf)
+{
+    yajl_alloc_funcs * funcs;
+
+    assert(buf != NULL);
+    funcs = buf->alloc;
+    if (buf->data != NULL) {
+        YA_FREE(funcs, buf->data);
+    }
+    YA_FREE(funcs, buf);
+}
+
+/* Copy len bytes from data to the end of buf and re-terminate it with 0.
+ * Capacity is ensured even when len is 0, so the first call always
+ * allocates the initial storage. */
+void yajl_buf_append(yajl_buf buf, const void * data, size_t len)
+{
+    yajl_buf_ensure_available(buf, len);
+    if (len == 0) {
+        return;
+    }
+
+    assert(data != NULL);
+    memcpy(&buf->data[buf->used], data, len);
+    buf->used += len;
+    buf->data[buf->used] = 0;
+}
+
+/* Forget the stored bytes but keep the allocated storage for reuse. */
+void yajl_buf_clear(yajl_buf buf)
+{
+    buf->used = 0;
+    if (buf->data != NULL) {
+        buf->data[0] = 0;
+    }
+}
+
+/* Return the start of the stored bytes.  This is NULL for a buffer that has
+ * never been appended to, because storage is allocated on first append. */
+const unsigned char * yajl_buf_data(yajl_buf buf)
+{
+ return buf->data;
+}
+
+/* Return the number of stored bytes (the `used` count, not the capacity). */
+size_t yajl_buf_len(yajl_buf buf)
+{
+ return buf->used;
+}
+
+/* Shrink the stored length to len, which must not exceed the current length.
+ * Only the counter changes: no 0 byte is written at the new end. */
+void
+yajl_buf_truncate(yajl_buf buf, size_t len)
+{
+ assert(len <= buf->used);
+ buf->used = len;
+}
diff --git a/xlators/cluster/nsr-server/src/yajl_buf.h b/xlators/cluster/nsr-server/src/yajl_buf.h
new file mode 100644
index 000000000..94929a519
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_buf.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef __YAJL_BUF_H__
+#define __YAJL_BUF_H__
+
+#include "yajl/yajl_common.h"
+#include "yajl_alloc.h"
+
+/*
+ * Implementation/performance notes. If this were moved to a header
+ * only implementation using #define's where possible we might be
+ * able to squeeze a little performance out of the guy by killing function
+ * call overhead. YMMV.
+ */
+
+/**
+ * yajl_buf is a buffer with exponential growth. the buffer ensures that
+ * you are always null padded.
+ */
+typedef struct yajl_buf_t * yajl_buf;
+
+/* allocate a new buffer */
+yajl_buf yajl_buf_alloc(yajl_alloc_funcs * alloc);
+
+/* free the buffer */
+void yajl_buf_free(yajl_buf buf);
+
+/* append a number of bytes to the buffer */
+void yajl_buf_append(yajl_buf buf, const void * data, size_t len);
+
+/* empty the buffer */
+void yajl_buf_clear(yajl_buf buf);
+
+/* get a pointer to the beginning of the buffer */
+const unsigned char * yajl_buf_data(yajl_buf buf);
+
+/* get the length of the buffer */
+size_t yajl_buf_len(yajl_buf buf);
+
+/* truncate the buffer */
+void yajl_buf_truncate(yajl_buf buf, size_t len);
+
+#endif
diff --git a/xlators/cluster/nsr-server/src/yajl_bytestack.h b/xlators/cluster/nsr-server/src/yajl_bytestack.h
new file mode 100644
index 000000000..1fc50c470
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_bytestack.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * A header only implementation of a simple stack of bytes, used in YAJL
+ * to maintain parse state.
+ */
+
+#ifndef __YAJL_BYTESTACK_H__
+#define __YAJL_BYTESTACK_H__
+
+#include "yajl/yajl_common.h"
+
+#define YAJL_BS_INC 128
+
+/* Stack of single bytes; manipulated only through the yajl_bs_* macros. */
+typedef struct yajl_bytestack_t
+{
+ /* storage, NULL until the first push */
+ unsigned char * stack;
+ /* number of bytes allocated at stack */
+ size_t size;
+ /* number of bytes currently pushed */
+ size_t used;
+ /* allocation routines used to grow and free stack */
+ yajl_alloc_funcs * yaf;
+} yajl_bytestack;
+
+/* initialize a bytestack */
+#define yajl_bs_init(obs, _yaf) { \
+ (obs).stack = NULL; \
+ (obs).size = 0; \
+ (obs).used = 0; \
+ (obs).yaf = (_yaf); \
+ } \
+
+
+/* free the storage of a bytestack */
+#define yajl_bs_free(obs) \
+ if ((obs).stack) (obs).yaf->free((obs).yaf->ctx, (obs).stack);
+
+#define yajl_bs_current(obs) \
+ (assert((obs).used > 0), (obs).stack[(obs).used - 1])
+
+/* push one byte; when the stack is full it first grows by YAJL_BS_INC bytes.
+ * NOTE(review): the result of the realloc hook is stored without a NULL
+ * check, so an allocation failure is followed by a write through NULL. */
+#define yajl_bs_push(obs, byte) { \
+ if (((obs).size - (obs).used) == 0) { \
+ (obs).size += YAJL_BS_INC; \
+ (obs).stack = (obs).yaf->realloc((obs).yaf->ctx,\
+ (void *) (obs).stack, (obs).size);\
+ } \
+ (obs).stack[((obs).used)++] = (byte); \
+}
+
+/* removes the top item of the stack, returns nothing */
+#define yajl_bs_pop(obs) { ((obs).used)--; }
+
+#define yajl_bs_set(obs, byte) \
+ (obs).stack[((obs).used) - 1] = (byte);
+
+
+#endif
diff --git a/xlators/cluster/nsr-server/src/yajl_encode.c b/xlators/cluster/nsr-server/src/yajl_encode.c
new file mode 100644
index 000000000..9dc9a3e81
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_encode.c
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "yajl_encode.h"
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+/* Write the two upper-case hex digits of c to hexBuf[0] and hexBuf[1].
+ * No terminator is written. */
+static void CharToHex(unsigned char c, char * hexBuf)
+{
+    static const char digits[] = "0123456789ABCDEF";
+    const unsigned int high = c >> 4;
+    const unsigned int low = c & 0x0F;
+
+    hexBuf[0] = digits[high];
+    hexBuf[1] = digits[low];
+}
+
+/*
+ * Emit str[0..len) through `print` with JSON string escaping applied.
+ *
+ * Bytes that need no escaping are passed on in runs; each byte that does is
+ * replaced by its escape sequence.  Control bytes below 32 without a short
+ * form become \u00XX.  '/' is escaped only when escape_solidus is non-zero.
+ * The surrounding quotes are not written here.
+ */
+void
+yajl_string_encode(const yajl_print_t print,
+ void * ctx,
+ const unsigned char * str,
+ size_t len,
+ int escape_solidus)
+{
+ /* [beg, end) is the run of bytes seen but not yet printed */
+ size_t beg = 0;
+ size_t end = 0;
+ /* template "\u00__"; CharToHex fills in the last two digits */
+ char hexBuf[7];
+ hexBuf[0] = '\\'; hexBuf[1] = 'u'; hexBuf[2] = '0'; hexBuf[3] = '0';
+ hexBuf[6] = 0;
+
+ while (end < len) {
+ const char * escaped = NULL;
+ switch (str[end]) {
+ case '\r': escaped = "\\r"; break;
+ case '\n': escaped = "\\n"; break;
+ case '\\': escaped = "\\\\"; break;
+ /* it is not required to escape a solidus in JSON:
+ * read sec. 2.5: http://www.ietf.org/rfc/rfc4627.txt
+ * specifically, this production from the grammar:
+ * unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
+ */
+ case '/': if (escape_solidus) escaped = "\\/"; break;
+ case '"': escaped = "\\\""; break;
+ case '\f': escaped = "\\f"; break;
+ case '\b': escaped = "\\b"; break;
+ case '\t': escaped = "\\t"; break;
+ default:
+ if ((unsigned char) str[end] < 32) {
+ CharToHex(str[end], hexBuf + 4);
+ escaped = hexBuf;
+ }
+ break;
+ }
+ if (escaped != NULL) {
+ /* flush the pending run, then the escape, and restart after it */
+ print(ctx, (const char *) (str + beg), end - beg);
+ print(ctx, escaped, (unsigned int)strlen(escaped));
+ beg = ++end;
+ } else {
+ ++end;
+ }
+ }
+ /* flush whatever follows the last escape (possibly zero bytes) */
+ print(ctx, (const char *) (str + beg), end - beg);
+}
+
+/*
+ * Fold the four hex digits hex[0..3] into *val, most significant first.
+ * *val is shifted, not reset, so the caller must start it at 0.
+ * The digits are assumed to be valid hex; this is only checked by assert.
+ */
+static void hexToDigit(unsigned int * val, const unsigned char * hex)
+{
+ unsigned int i;
+ for (i=0;i<4;i++) {
+ unsigned char c = hex[i];
+ /* letters: clear the lower-case bit, then close the gap between
+ * '9' and 'A' so that 'A'..'F' land directly after '9' */
+ if (c >= 'A') c = (c & ~0x20) - 7;
+ c -= '0';
+ assert(!(c & 0xF0));
+ *val = (*val << 4) | c;
+ }
+}
+
+/*
+ * Encode codepoint as a 0-terminated UTF-8 sequence in utf8Buf, which must
+ * have room for five bytes.  Values of 0x200000 and above are written as "?".
+ */
+static void Utf32toUtf8(unsigned int codepoint, char * utf8Buf)
+{
+    char * out = utf8Buf;
+
+    if (codepoint >= 0x200000) {
+        /* does not fit into four bytes */
+        *out++ = '?';
+    } else if (codepoint >= 0x10000) {
+        *out++ = (char) ((codepoint >> 18) | 0xF0);
+        *out++ = (char) (((codepoint >> 12) & 0x3F) | 0x80);
+        *out++ = (char) (((codepoint >> 6) & 0x3F) | 0x80);
+        *out++ = (char) ((codepoint & 0x3F) | 0x80);
+    } else if (codepoint >= 0x0800) {
+        *out++ = (char) ((codepoint >> 12) | 0xE0);
+        *out++ = (char) (((codepoint >> 6) & 0x3F) | 0x80);
+        *out++ = (char) ((codepoint & 0x3F) | 0x80);
+    } else if (codepoint >= 0x80) {
+        *out++ = (char) ((codepoint >> 6) | 0xC0);
+        *out++ = (char) ((codepoint & 0x3F) | 0x80);
+    } else {
+        *out++ = (char) codepoint;
+    }
+    *out = 0;
+}
+
+/*
+ * Append str[0..len) to buf with JSON backslash escapes resolved.
+ *
+ * \uXXXX escapes are converted to UTF-8; a high surrogate followed by a
+ * second \uXXXX is combined into one code point, and a high surrogate
+ * without such a partner is replaced by "?".
+ *
+ * NOTE(review): the escapes are not validated here (an unknown escape only
+ * trips an assert, and the bytes after "\u" are read without a length
+ * check), so the input presumably has to be checked by the caller first.
+ */
+void yajl_string_decode(yajl_buf buf, const unsigned char * str,
+ size_t len)
+{
+ /* [beg, end) is the run of plain bytes not yet copied to buf */
+ size_t beg = 0;
+ size_t end = 0;
+
+ while (end < len) {
+ if (str[end] == '\\') {
+ char utf8Buf[5];
+ const char * unescaped = "?";
+ /* flush the plain bytes in front of the backslash */
+ yajl_buf_append(buf, str + beg, end - beg);
+ switch (str[++end]) {
+ case 'r': unescaped = "\r"; break;
+ case 'n': unescaped = "\n"; break;
+ case '\\': unescaped = "\\"; break;
+ case '/': unescaped = "/"; break;
+ case '"': unescaped = "\""; break;
+ case 'f': unescaped = "\f"; break;
+ case 'b': unescaped = "\b"; break;
+ case 't': unescaped = "\t"; break;
+ case 'u': {
+ unsigned int codepoint = 0;
+ hexToDigit(&codepoint, str + ++end);
+ /* leave end on the last of the four hex digits */
+ end+=3;
+ /* check if this is a surrogate */
+ if ((codepoint & 0xFC00) == 0xD800) {
+ end++;
+ if (str[end] == '\\' && str[end + 1] == 'u') {
+ unsigned int surrogate = 0;
+ hexToDigit(&surrogate, str + end + 2);
+ codepoint =
+ (((codepoint & 0x3F) << 10) |
+ ((((codepoint >> 6) & 0xF) + 1) << 16) |
+ (surrogate & 0x3FF));
+ end += 5;
+ } else {
+ unescaped = "?";
+ break;
+ }
+ }
+
+ Utf32toUtf8(codepoint, utf8Buf);
+ unescaped = utf8Buf;
+
+ /* \u0000 encodes to an empty C string, so strlen() below would
+ * drop it; append the single 0 byte explicitly instead */
+ if (codepoint == 0) {
+ yajl_buf_append(buf, unescaped, 1);
+ beg = ++end;
+ continue;
+ }
+
+ break;
+ }
+ default:
+ assert("this should never happen" == NULL);
+ }
+ yajl_buf_append(buf, unescaped, (unsigned int)strlen(unescaped));
+ beg = ++end;
+ } else {
+ end++;
+ }
+ }
+ /* copy the plain bytes after the last escape (possibly none) */
+ yajl_buf_append(buf, str + beg, end - beg);
+}
+
+/* Step to the next byte of a multi-byte sequence; makes the enclosing
+ * function return 0 when the input ends in the middle of the sequence. */
+#define ADV_PTR s++; if (!(len--)) return 0;
+
+/*
+ * Check that s[0..len) is structurally valid UTF-8.
+ *
+ * Returns 1 for an empty input, 0 for a NULL pointer with a non-zero length,
+ * and otherwise 1 only if every lead byte announces a 1- to 4-byte sequence
+ * and is followed by the right number of 10xxxxxx continuation bytes.
+ * Only these bit patterns are examined; the decoded values are not
+ * range-checked.
+ */
+int yajl_string_validate_utf8(const unsigned char * s, size_t len)
+{
+ if (!len) return 1;
+ if (!s) return 0;
+
+ while (len--) {
+ /* single byte */
+ if (*s <= 0x7f) {
+ /* noop */
+ }
+ /* two byte */
+ else if ((*s >> 5) == 0x6) {
+ ADV_PTR;
+ if (!((*s >> 6) == 0x2)) return 0;
+ }
+ /* three byte */
+ else if ((*s >> 4) == 0x0e) {
+ ADV_PTR;
+ if (!((*s >> 6) == 0x2)) return 0;
+ ADV_PTR;
+ if (!((*s >> 6) == 0x2)) return 0;
+ }
+ /* four byte */
+ else if ((*s >> 3) == 0x1e) {
+ ADV_PTR;
+ if (!((*s >> 6) == 0x2)) return 0;
+ ADV_PTR;
+ if (!((*s >> 6) == 0x2)) return 0;
+ ADV_PTR;
+ if (!((*s >> 6) == 0x2)) return 0;
+ } else {
+ /* stray continuation byte or a lead byte of an unsupported form */
+ return 0;
+ }
+
+ s++;
+ }
+
+ return 1;
+}
diff --git a/xlators/cluster/nsr-server/src/yajl_encode.h b/xlators/cluster/nsr-server/src/yajl_encode.h
new file mode 100644
index 000000000..af1e8bbde
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_encode.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef __YAJL_ENCODE_H__
+#define __YAJL_ENCODE_H__
+
+#include "yajl_buf.h"
+#include "yajl/yajl_gen.h"
+
+void yajl_string_encode(const yajl_print_t printer,
+ void * ctx,
+ const unsigned char * str,
+ size_t length,
+ int escape_solidus);
+
+void yajl_string_decode(yajl_buf buf, const unsigned char * str,
+ size_t length);
+
+int yajl_string_validate_utf8(const unsigned char * s, size_t len);
+
+#endif
diff --git a/xlators/cluster/nsr-server/src/yajl_gen.c b/xlators/cluster/nsr-server/src/yajl_gen.c
new file mode 100644
index 000000000..73763a9e0
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_gen.c
@@ -0,0 +1,350 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "yajl/yajl_gen.h"
+#include "yajl_buf.h"
+#include "yajl_encode.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <math.h>
+#include <stdarg.h>
+
+/* Generator state kept per nesting level; it decides which separator is
+ * written in front of the next item (see INSERT_SEP and APPENDED_ATOM). */
+typedef enum {
+ /* nothing written at this level yet; the first atom moves to complete */
+ yajl_gen_start,
+ /* map opened, no key written yet */
+ yajl_gen_map_start,
+ /* a map value was written; the next item is preceded by "," */
+ yajl_gen_map_key,
+ /* a map key was written; the next item is preceded by ":" */
+ yajl_gen_map_val,
+ /* array opened, no element written yet */
+ yajl_gen_array_start,
+ /* an array element was written; the next item is preceded by "," */
+ yajl_gen_in_array,
+ /* the top-level value is finished; further calls are rejected */
+ yajl_gen_complete,
+ /* error state; further calls are rejected */
+ yajl_gen_error
+} yajl_gen_state;
+
+/* State of one JSON generator instance. */
+struct yajl_gen_t
+{
+ /* option bits set through yajl_gen_config */
+ unsigned int flags;
+ /* current nesting level; index into state[] */
+ unsigned int depth;
+ /* text written once per nesting level when beautifying */
+ const char * indentString;
+ /* one state per nesting level */
+ yajl_gen_state state[YAJL_MAX_DEPTH];
+ /* output routine; yajl_buf_append unless a print callback was configured */
+ yajl_print_t print;
+ /* first argument handed to print */
+ void * ctx; /* yajl_buf */
+ /* memory allocation routines */
+ yajl_alloc_funcs alloc;
+};
+
+/*
+ * Set one generator option.  The variadic argument depends on opt:
+ *
+ *  - yajl_gen_beautify, yajl_gen_validate_utf8, yajl_gen_escape_solidus:
+ *    an int; non-zero switches the option on, zero switches it off.
+ *  - yajl_gen_indent_string: a const char * made up of whitespace only.
+ *    A NULL or non-whitespace string is rejected and the indent string
+ *    already in effect is kept.
+ *  - yajl_gen_print_callback: a yajl_print_t followed by the void * context
+ *    to hand to it.  The internal buffer is released when it is replaced.
+ *
+ * Returns 1 on success and 0 for an unknown option or a rejected value.
+ */
+int
+yajl_gen_config(yajl_gen g, yajl_gen_option opt, ...)
+{
+    int rv = 1;
+    va_list ap;
+    va_start(ap, opt);
+
+    switch(opt) {
+        case yajl_gen_beautify:
+        case yajl_gen_validate_utf8:
+        case yajl_gen_escape_solidus:
+            if (va_arg(ap, int)) g->flags |= opt;
+            else g->flags &= ~opt;
+            break;
+        case yajl_gen_indent_string: {
+            const char *indent = va_arg(ap, const char *);
+            const char *p;
+            if (indent == NULL) {
+                rv = 0;
+                break;
+            }
+            for (p = indent; *p; p++) {
+                if (*p != '\n'
+                    && *p != '\v'
+                    && *p != '\f'
+                    && *p != '\t'
+                    && *p != '\r'
+                    && *p != ' ')
+                {
+                    rv = 0;
+                    break;
+                }
+            }
+            /* install the string only once it is known to be valid, so a
+             * rejected value never leaves a NULL for strlen() to trip on */
+            if (rv) g->indentString = indent;
+            break;
+        }
+        case yajl_gen_print_callback:
+            /* ctx is our own yajl_buf only while the default printer is
+             * installed; a caller-supplied context must not be freed */
+            if (g->print == (yajl_print_t)&yajl_buf_append) {
+                yajl_buf_free((yajl_buf)g->ctx);
+            }
+            g->print = va_arg(ap, const yajl_print_t);
+            g->ctx = va_arg(ap, void *);
+            break;
+        default:
+            rv = 0;
+    }
+
+    va_end(ap);
+
+    return rv;
+}
+
+
+
+/*
+ * Create a generator that writes into an internal buffer.
+ *
+ * afs may be NULL to use the default malloc based routines; otherwise all
+ * three of its function pointers must be set.  Returns NULL when afs is
+ * incomplete or when memory for the generator or its buffer cannot be
+ * obtained.
+ */
+yajl_gen
+yajl_gen_alloc(const yajl_alloc_funcs * afs)
+{
+    yajl_gen g = NULL;
+    yajl_alloc_funcs afsBuffer;
+
+    /* first order of business is to set up memory allocation routines */
+    if (afs != NULL) {
+        if (afs->malloc == NULL || afs->realloc == NULL || afs->free == NULL)
+        {
+            return NULL;
+        }
+    } else {
+        yajl_set_default_alloc_funcs(&afsBuffer);
+        afs = &afsBuffer;
+    }
+
+    g = (yajl_gen) YA_MALLOC(afs, sizeof(struct yajl_gen_t));
+    if (!g) return NULL;
+
+    memset((void *) g, 0, sizeof(struct yajl_gen_t));
+    /* copy in pointers to allocation routines */
+    memcpy((void *) &(g->alloc), (void *) afs, sizeof(yajl_alloc_funcs));
+
+    g->print = (yajl_print_t)&yajl_buf_append;
+    g->ctx = yajl_buf_alloc(&(g->alloc));
+    if (g->ctx == NULL) {
+        /* no output buffer: hand back nothing rather than a generator
+         * that would fail on its first write */
+        YA_FREE(afs, g);
+        return NULL;
+    }
+    g->indentString = " ";
+
+    return g;
+}
+
+/* Destroy a generator.  The output buffer is released only while the
+ * default printer is installed, i.e. while ctx is the generator's own
+ * yajl_buf. */
+void
+yajl_gen_free(yajl_gen g)
+{
+    const int owns_buffer = (g->print == (yajl_print_t)&yajl_buf_append);
+
+    if (owns_buffer) {
+        yajl_buf_free((yajl_buf)g->ctx);
+    }
+    YA_FREE(&(g->alloc), g);
+}
+
+#define INSERT_SEP \
+ if (g->state[g->depth] == yajl_gen_map_key || \
+ g->state[g->depth] == yajl_gen_in_array) { \
+ g->print(g->ctx, ",", 1); \
+ if ((g->flags & yajl_gen_beautify)) g->print(g->ctx, "\n", 1); \
+ } else if (g->state[g->depth] == yajl_gen_map_val) { \
+ g->print(g->ctx, ":", 1); \
+ if ((g->flags & yajl_gen_beautify)) g->print(g->ctx, " ", 1); \
+ }
+
+#define INSERT_WHITESPACE \
+ if ((g->flags & yajl_gen_beautify)) { \
+ if (g->state[g->depth] != yajl_gen_map_val) { \
+ unsigned int _i; \
+ for (_i=0;_i<g->depth;_i++) \
+ g->print(g->ctx, \
+ g->indentString, \
+ (unsigned int)strlen(g->indentString)); \
+ } \
+ }
+
+#define ENSURE_NOT_KEY \
+ if (g->state[g->depth] == yajl_gen_map_key || \
+ g->state[g->depth] == yajl_gen_map_start) { \
+ return yajl_gen_keys_must_be_strings; \
+ } \
+
+/* check that we're not complete, or in error state. in a valid state
+ * to be generating */
+#define ENSURE_VALID_STATE \
+ if (g->state[g->depth] == yajl_gen_error) { \
+ return yajl_gen_in_error_state;\
+ } else if (g->state[g->depth] == yajl_gen_complete) { \
+ return yajl_gen_generation_complete; \
+ }
+
+/* Enter / leave one nesting level.  Both macros test the bound before they
+ * touch g->depth, so a rejected call leaves depth unchanged and still a
+ * valid index into g->state[].  Both return a yajl_gen_status value from the
+ * enclosing function on failure. */
+#define INCREMENT_DEPTH \
+    if (g->depth + 1 >= YAJL_MAX_DEPTH) return yajl_max_depth_exceeded; \
+    ++(g->depth);
+
+#define DECREMENT_DEPTH \
+    if (g->depth == 0) return yajl_gen_generation_complete; \
+    --(g->depth);
+
+#define APPENDED_ATOM \
+ switch (g->state[g->depth]) { \
+ case yajl_gen_start: \
+ g->state[g->depth] = yajl_gen_complete; \
+ break; \
+ case yajl_gen_map_start: \
+ case yajl_gen_map_key: \
+ g->state[g->depth] = yajl_gen_map_val; \
+ break; \
+ case yajl_gen_array_start: \
+ g->state[g->depth] = yajl_gen_in_array; \
+ break; \
+ case yajl_gen_map_val: \
+ g->state[g->depth] = yajl_gen_map_key; \
+ break; \
+ default: \
+ break; \
+ } \
+
+#define FINAL_NEWLINE \
+ if ((g->flags & yajl_gen_beautify) && g->state[g->depth] == yajl_gen_complete) \
+ g->print(g->ctx, "\n", 1);
+
+/* Emit number as a JSON integer.  Rejected (through the ENSURE_* macros,
+ * which return from this function) when the generator is complete or in
+ * error, or when a map key is expected. */
+yajl_gen_status
+yajl_gen_integer(yajl_gen g, long long int number)
+{
+ char i[32];
+ ENSURE_VALID_STATE; ENSURE_NOT_KEY; INSERT_SEP; INSERT_WHITESPACE;
+ sprintf(i, "%lld", number);
+ g->print(g->ctx, i, (unsigned int)strlen(i));
+ APPENDED_ATOM;
+ FINAL_NEWLINE;
+ return yajl_gen_status_ok;
+}
+
+#ifdef WIN32
+#include <float.h>
+#define isnan _isnan
+#define isinf !_finite
+#endif
+
+/* Emit number formatted with "%.20g".  NaN and infinities are refused with
+ * yajl_gen_invalid_number before anything is written. */
+yajl_gen_status
+yajl_gen_double(yajl_gen g, double number)
+{
+ char i[32];
+ ENSURE_VALID_STATE; ENSURE_NOT_KEY;
+ if (isnan(number) || isinf(number)) return yajl_gen_invalid_number;
+ INSERT_SEP; INSERT_WHITESPACE;
+ sprintf(i, "%.20g", number);
+ g->print(g->ctx, i, (unsigned int)strlen(i));
+ APPENDED_ATOM;
+ FINAL_NEWLINE;
+ return yajl_gen_status_ok;
+}
+
+/* Emit the l bytes at s verbatim as a number; the text is not checked. */
+yajl_gen_status
+yajl_gen_number(yajl_gen g, const char * s, size_t l)
+{
+ ENSURE_VALID_STATE; ENSURE_NOT_KEY; INSERT_SEP; INSERT_WHITESPACE;
+ g->print(g->ctx, s, l);
+ APPENDED_ATOM;
+ FINAL_NEWLINE;
+ return yajl_gen_status_ok;
+}
+
+/* Emit str[0..len) as a quoted, escaped JSON string.  Unlike the other
+ * emitters this one is allowed where a map key is expected.  With
+ * yajl_gen_validate_utf8 set, invalid UTF-8 is refused with
+ * yajl_gen_invalid_string; that check runs before the state check. */
+yajl_gen_status
+yajl_gen_string(yajl_gen g, const unsigned char * str,
+ size_t len)
+{
+ // if validation is enabled, check that the string is valid utf8
+ // XXX: This checking could be done a little faster, in the same pass as
+ // the string encoding
+ if (g->flags & yajl_gen_validate_utf8) {
+ if (!yajl_string_validate_utf8(str, len)) {
+ return yajl_gen_invalid_string;
+ }
+ }
+ ENSURE_VALID_STATE; INSERT_SEP; INSERT_WHITESPACE;
+ g->print(g->ctx, "\"", 1);
+ yajl_string_encode(g->print, g->ctx, str, len, g->flags & yajl_gen_escape_solidus);
+ g->print(g->ctx, "\"", 1);
+ APPENDED_ATOM;
+ FINAL_NEWLINE;
+ return yajl_gen_status_ok;
+}
+
+/* Emit the literal null. */
+yajl_gen_status
+yajl_gen_null(yajl_gen g)
+{
+ ENSURE_VALID_STATE; ENSURE_NOT_KEY; INSERT_SEP; INSERT_WHITESPACE;
+ g->print(g->ctx, "null", strlen("null"));
+ APPENDED_ATOM;
+ FINAL_NEWLINE;
+ return yajl_gen_status_ok;
+}
+
+/* Emit the literal true for a non-zero argument and false for zero. */
+yajl_gen_status
+yajl_gen_bool(yajl_gen g, int boolean)
+{
+ const char * val = boolean ? "true" : "false";
+
+ ENSURE_VALID_STATE; ENSURE_NOT_KEY; INSERT_SEP; INSERT_WHITESPACE;
+ g->print(g->ctx, val, (unsigned int)strlen(val));
+ APPENDED_ATOM;
+ FINAL_NEWLINE;
+ return yajl_gen_status_ok;
+}
+
+/* Open a map ("{") one nesting level deeper.  The separator and indentation
+ * are written before INCREMENT_DEPTH checks the depth limit, so a
+ * depth-exceeded return can follow output that was already emitted. */
+yajl_gen_status
+yajl_gen_map_open(yajl_gen g)
+{
+ ENSURE_VALID_STATE; ENSURE_NOT_KEY; INSERT_SEP; INSERT_WHITESPACE;
+ INCREMENT_DEPTH;
+
+ g->state[g->depth] = yajl_gen_map_start;
+ g->print(g->ctx, "{", 1);
+ if ((g->flags & yajl_gen_beautify)) g->print(g->ctx, "\n", 1);
+ FINAL_NEWLINE;
+ return yajl_gen_status_ok;
+}
+
+/* Close the current nesting level with "}".  After the depth is reduced,
+ * APPENDED_ATOM advances the enclosing level's state as for any other
+ * finished value.  The level being closed is not checked to be a map. */
+yajl_gen_status
+yajl_gen_map_close(yajl_gen g)
+{
+ ENSURE_VALID_STATE;
+ DECREMENT_DEPTH;
+
+ if ((g->flags & yajl_gen_beautify)) g->print(g->ctx, "\n", 1);
+ APPENDED_ATOM;
+ INSERT_WHITESPACE;
+ g->print(g->ctx, "}", 1);
+ FINAL_NEWLINE;
+ return yajl_gen_status_ok;
+}
+
+/* Open an array ("[") one nesting level deeper.  As in yajl_gen_map_open,
+ * the separator is written before the depth limit is checked. */
+yajl_gen_status
+yajl_gen_array_open(yajl_gen g)
+{
+ ENSURE_VALID_STATE; ENSURE_NOT_KEY; INSERT_SEP; INSERT_WHITESPACE;
+ INCREMENT_DEPTH;
+ g->state[g->depth] = yajl_gen_array_start;
+ g->print(g->ctx, "[", 1);
+ if ((g->flags & yajl_gen_beautify)) g->print(g->ctx, "\n", 1);
+ FINAL_NEWLINE;
+ return yajl_gen_status_ok;
+}
+
+/* Close the current nesting level with "]" and advance the enclosing
+ * level's state.  The level being closed is not checked to be an array. */
+yajl_gen_status
+yajl_gen_array_close(yajl_gen g)
+{
+ ENSURE_VALID_STATE;
+ DECREMENT_DEPTH;
+ if ((g->flags & yajl_gen_beautify)) g->print(g->ctx, "\n", 1);
+ APPENDED_ATOM;
+ INSERT_WHITESPACE;
+ g->print(g->ctx, "]", 1);
+ FINAL_NEWLINE;
+ return yajl_gen_status_ok;
+}
+
+/* Hand out the internal output buffer: *buf points at the bytes generated so
+ * far and *len receives their count.  Returns yajl_gen_no_buf (leaving both
+ * outputs untouched) once a print callback has replaced the internal
+ * buffer. */
+yajl_gen_status
+yajl_gen_get_buf(yajl_gen g, const unsigned char ** buf,
+ size_t * len)
+{
+ if (g->print != (yajl_print_t)&yajl_buf_append) return yajl_gen_no_buf;
+ *buf = yajl_buf_data((yajl_buf)g->ctx);
+ *len = yajl_buf_len((yajl_buf)g->ctx);
+ return yajl_gen_status_ok;
+}
+
+/* Empty the internal output buffer; does nothing when a print callback is
+ * installed.  The generator's nesting state is left as it is. */
+void
+yajl_gen_clear(yajl_gen g)
+{
+ if (g->print == (yajl_print_t)&yajl_buf_append) yajl_buf_clear((yajl_buf)g->ctx);
+}
diff --git a/xlators/cluster/nsr-server/src/yajl_lex.c b/xlators/cluster/nsr-server/src/yajl_lex.c
new file mode 100644
index 000000000..b098e6a99
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_lex.c
@@ -0,0 +1,763 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "yajl_lex.h"
+#include "yajl_buf.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+
+#ifdef YAJL_LEXER_DEBUG
+/* Debug-only helper: short printable name for a token type.
+ * Left and right variants of braces/brackets share one name, and any token
+ * without an explicit case (e.g. yajl_tok_comment) yields "unknown". */
+static const char *
+tokToStr(yajl_tok tok)
+{
+    const char * name = "unknown";
+
+    switch (tok) {
+        case yajl_tok_bool:                name = "bool";                break;
+        case yajl_tok_colon:               name = "colon";               break;
+        case yajl_tok_comma:               name = "comma";               break;
+        case yajl_tok_eof:                 name = "eof";                 break;
+        case yajl_tok_error:               name = "error";               break;
+        case yajl_tok_left_brace:          name = "brace";               break;
+        case yajl_tok_left_bracket:        name = "bracket";             break;
+        case yajl_tok_null:                name = "null";                break;
+        case yajl_tok_integer:             name = "integer";             break;
+        case yajl_tok_double:              name = "double";              break;
+        case yajl_tok_right_brace:         name = "brace";               break;
+        case yajl_tok_right_bracket:       name = "bracket";             break;
+        case yajl_tok_string:              name = "string";              break;
+        case yajl_tok_string_with_escapes: name = "string_with_escapes"; break;
+    }
+    return name;
+}
+#endif
+
+/* Impact of the stream parsing feature on the lexer:
+ *
+ * YAJL supports stream parsing. That is, the ability to parse the first
+ * bits of a chunk of JSON before the last bits are available (still on
+ * the network or disk). This makes the lexer more complex. The
+ * responsibility of the lexer is to handle transparently the case where
+ * a chunk boundary falls in the middle of a token. This is
+ * accomplished via a buffer and a character reading abstraction.
+ *
+ * Overview of implementation
+ *
+ * When we lex to end of input string before end of token is hit, we
+ * copy all of the input text composing the token into our lexBuf.
+ *
+ * Every time we read a character, we do so through the readChar function.
+ * readChar's responsibility is to handle pulling all chars from the buffer
+ * before pulling chars from input text
+ */
+
+/* Lexer state.  yajl_lex_alloc zero-fills the whole structure and then
+ * sets buf, allowComments, validateUTF8 and alloc. */
+struct yajl_lexer_t {
+ /* the overall line and char offset into the data; returned by
+ * yajl_lex_current_line / yajl_lex_current_char */
+ size_t lineOff;
+ size_t charOff;
+
+ /* detail code for the most recent lexical error (see yajl_lex_get_error) */
+ yajl_lex_error error;
+
+ /* an input buffer to handle the case where a token is spread over
+ * multiple chunks */
+ yajl_buf buf;
+
+ /* in the case where we have data in the lexBuf, bufOff holds
+ * the current offset into the lexBuf. */
+ size_t bufOff;
+
+ /* are we using the lex buf? (non-zero while a partial token is buffered) */
+ unsigned int bufInUse;
+
+ /* shall we allow comments? */
+ unsigned int allowComments;
+
+ /* shall we validate utf8 inside strings? */
+ unsigned int validateUTF8;
+
+ /* allocation callbacks supplied to yajl_lex_alloc; used again to free
+ * the lexer in yajl_lex_free */
+ yajl_alloc_funcs * alloc;
+};
+
+/* readChar: yield the next input byte and step past it.
+ * While the lexer buffer is in use and still holds unread bytes, the byte
+ * comes from the buffer and (lxr)->bufOff is advanced; otherwise it comes
+ * from txt and *off is advanced.  The macro does no bounds checking on txt:
+ * callers test *off against the text length before invoking it.
+ * Every use of the lxr argument is parenthesized so that any pointer
+ * expression can be passed safely. */
+#define readChar(lxr, txt, off) \
+ (((lxr)->bufInUse && yajl_buf_len((lxr)->buf) && (lxr)->bufOff < yajl_buf_len((lxr)->buf)) ? \
+ (*((const unsigned char *) yajl_buf_data((lxr)->buf) + ((lxr)->bufOff)++)) : \
+ ((txt)[(*(off))++]))
+
+/* unreadChar: step back one byte.  Rewinds *off when it is non-zero,
+ * otherwise rewinds the lexer buffer offset (lxr)->bufOff. */
+#define unreadChar(lxr, off) ((*(off) > 0) ? (*(off))-- : ((lxr)->bufOff--))
+
+/* Allocate and initialise a lexer.
+ *
+ * alloc          allocation callbacks; stored in the lexer and used again
+ *                by yajl_lex_free
+ * allowComments  non-zero to accept comments in the input
+ * validateUTF8   non-zero to validate UTF-8 inside strings
+ *
+ * Returns the new lexer, or NULL when the lexer structure itself could not
+ * be allocated.  All other fields start out zeroed. */
+yajl_lexer
+yajl_lex_alloc(yajl_alloc_funcs * alloc,
+               unsigned int allowComments, unsigned int validateUTF8)
+{
+    yajl_lexer lxr = (yajl_lexer) YA_MALLOC(alloc, sizeof(struct yajl_lexer_t));
+
+    /* the allocator may fail; never memset through a NULL pointer */
+    if (lxr == NULL) return NULL;
+
+    memset((void *) lxr, 0, sizeof(struct yajl_lexer_t));
+    lxr->buf = yajl_buf_alloc(alloc);
+    lxr->allowComments = allowComments;
+    lxr->validateUTF8 = validateUTF8;
+    lxr->alloc = alloc;
+    return lxr;
+}
+
+/* Release a lexer: first its token buffer, then the lexer structure,
+ * using the allocation callbacks stored in the lexer. */
+void
+yajl_lex_free(yajl_lexer lxr)
+{
+    yajl_alloc_funcs * funcs = lxr->alloc;
+
+    yajl_buf_free(lxr->buf);
+    YA_FREE(funcs, lxr);
+}
+
+/* a lookup table which lets us quickly determine the following
+ * per-byte properties (bit flags, may be OR-ed together):
+ * VEC - valid escaped control char
+ * note. the solidus '/' may be escaped or not.
+ * IJC - invalid json char
+ * VHC - valid hex char
+ * NFP - needs further processing (from a string scanning perspective)
+ * NUC - needs utf8 checking when enabled (from a string scanning perspective)
+ */
+#define VEC 0x01
+#define IJC 0x02
+#define VHC 0x04
+#define NFP 0x08
+#define NUC 0x10
+
+/* indexed by byte value; the leading comment on each row gives the hex
+ * value of the row's first entry */
+static const char charLookupTable[256] =
+{
+/*00*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC ,
+/*08*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC ,
+/*10*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC ,
+/*18*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC ,
+
+/*20*/ 0 , 0 , NFP|VEC|IJC, 0 , 0 , 0 , 0 , 0 ,
+/*28*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , VEC ,
+/*30*/ VHC , VHC , VHC , VHC , VHC , VHC , VHC , VHC ,
+/*38*/ VHC , VHC , 0 , 0 , 0 , 0 , 0 , 0 ,
+
+/*40*/ 0 , VHC , VHC , VHC , VHC , VHC , VHC , 0 ,
+/*48*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+/*50*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+/*58*/ 0 , 0 , 0 , 0 , NFP|VEC|IJC, 0 , 0 , 0 ,
+
+/*60*/ 0 , VHC , VEC|VHC, VHC , VHC , VHC , VEC|VHC, 0 ,
+/*68*/ 0 , 0 , 0 , 0 , 0 , 0 , VEC , 0 ,
+/*70*/ 0 , 0 , VEC , 0 , VEC , 0 , 0 , 0 ,
+/*78*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+
+/* 0x80 - 0xff: every byte with the high bit set is flagged NUC */
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC
+};
+
+/** check the shape of one variable length utf8 encoded codepoint whose
+ * lead byte (curChar) has already been consumed by the caller.
+ *
+ * The lead byte decides how many continuation bytes (10xxxxxx) must
+ * follow: none for a single byte char, otherwise one, two or three.
+ *
+ * returns:
+ * yajl_tok_string - the lead byte and all continuation bytes were
+ * well formed; the continuation bytes were consumed
+ * yajl_tok_eof - the input ended before the sequence was complete
+ * yajl_tok_error - the lead byte is not a valid utf8 lead byte, or
+ * a byte that should have been a continuation
+ * byte was not; that byte has been consumed
+ *
+ * Only the bit patterns are checked here. */
+#define UTF8_CHECK_EOF if (*offset >= jsonTextLen) { return yajl_tok_eof; }
+
+static yajl_tok
+yajl_lex_utf8_char(yajl_lexer lexer, const unsigned char * jsonText,
+                   size_t jsonTextLen, size_t * offset,
+                   unsigned char curChar)
+{
+    unsigned int trailing;
+
+    if (curChar <= 0x7f) {
+        /* single byte, nothing further to read */
+        return yajl_tok_string;
+    }
+
+    if ((curChar >> 5) == 0x6) {
+        trailing = 1;           /* 110xxxxx: two byte sequence */
+    } else if ((curChar >> 4) == 0x0e) {
+        trailing = 2;           /* 1110xxxx: three byte sequence */
+    } else if ((curChar >> 3) == 0x1e) {
+        trailing = 3;           /* 11110xxx: four byte sequence */
+    } else {
+        return yajl_tok_error;
+    }
+
+    while (trailing-- > 0) {
+        UTF8_CHECK_EOF;
+        curChar = readChar(lexer, jsonText, offset);
+        if ((curChar >> 6) != 0x2) {
+            return yajl_tok_error;
+        }
+    }
+
+    return yajl_tok_string;
+}
+
+/* lex a string. input is the lexer, pointer to beginning of
+ * json text, and start of string (offset).
+ * a token is returned which has the following meanings:
+ * yajl_tok_string: lex of string was successful. offset points to
+ * terminating '"'.
+ * yajl_tok_eof: end of text was encountered before we could complete
+ * the lex.
+ * yajl_tok_error: embedded in the string were unallowable chars. offset
+ * points to the offending char
+ */
+/* STR_CHECK_EOF: when the current chunk is exhausted, set tok to
+ * yajl_tok_eof and jump to the finish_string_lex label.  It relies on the
+ * locals offset, jsonTextLen and tok and on that label being present in
+ * the function that uses it. */
+#define STR_CHECK_EOF \
+if (*offset >= jsonTextLen) { \
+ tok = yajl_tok_eof; \
+ goto finish_string_lex; \
+}
+
+/** count the leading bytes of buf (at most len) that a string lexer can
+ * skip without looking at them more closely.
+ * Scanning stops at the first byte flagged IJC or NFP in
+ * charLookupTable, and also at bytes flagged NUC when utf8check is
+ * non-zero.  The return value is the number of bytes before that
+ * point, or len if no such byte exists. */
+static size_t
+yajl_string_scan(const unsigned char * buf, size_t len, int utf8check)
+{
+    const unsigned char stopMask = IJC|NFP|(utf8check ? NUC : 0);
+    size_t n;
+
+    for (n = 0; n < len; n++) {
+        if (charLookupTable[buf[n]] & stopMask) {
+            break;
+        }
+    }
+    return n;
+}
+
+/* Lex the remainder of a string token.  yajl_lex_lex calls this right
+ * after it has consumed the opening '"'.
+ * Returns yajl_tok_string, or yajl_tok_string_with_escapes if at least one
+ * backslash escape was seen, once the closing quote has been read;
+ * yajl_tok_eof if the chunk ran out first; yajl_tok_error (with
+ * lexer->error set) on an invalid character, escape, hex digit or utf8
+ * sequence. */
+static yajl_tok
+yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
+ size_t jsonTextLen, size_t * offset)
+{
+ /* stays yajl_tok_error unless proven otherwise below */
+ yajl_tok tok = yajl_tok_error;
+ int hasEscapes = 0;
+
+ for (;;) {
+ unsigned char curChar;
+
+ /* now jump into a faster scanning routine to skip as much
+ * of the buffers as possible */
+ {
+ const unsigned char * p;
+ size_t len;
+
+ /* same "buffer first, then input text" rule that readChar applies */
+ if ((lexer->bufInUse && yajl_buf_len(lexer->buf) &&
+ lexer->bufOff < yajl_buf_len(lexer->buf)))
+ {
+ p = ((const unsigned char *) yajl_buf_data(lexer->buf) +
+ (lexer->bufOff));
+ len = yajl_buf_len(lexer->buf) - lexer->bufOff;
+ lexer->bufOff += yajl_string_scan(p, len, lexer->validateUTF8);
+ }
+ else if (*offset < jsonTextLen)
+ {
+ p = jsonText + *offset;
+ len = jsonTextLen - *offset;
+ *offset += yajl_string_scan(p, len, lexer->validateUTF8);
+ }
+ }
+
+ STR_CHECK_EOF;
+
+ curChar = readChar(lexer, jsonText, offset);
+
+ /* quote terminates */
+ if (curChar == '"') {
+ tok = yajl_tok_string;
+ break;
+ }
+ /* backslash escapes a set of control chars, */
+ else if (curChar == '\\') {
+ hasEscapes = 1;
+ STR_CHECK_EOF;
+
+ /* special case \u */
+ curChar = readChar(lexer, jsonText, offset);
+ if (curChar == 'u') {
+ unsigned int i = 0;
+
+ /* \u must be followed by exactly four hex digits */
+ for (i=0;i<4;i++) {
+ STR_CHECK_EOF;
+ curChar = readChar(lexer, jsonText, offset);
+ if (!(charLookupTable[curChar] & VHC)) {
+ /* back up to offending char */
+ unreadChar(lexer, offset);
+ lexer->error = yajl_lex_string_invalid_hex_char;
+ goto finish_string_lex;
+ }
+ }
+ } else if (!(charLookupTable[curChar] & VEC)) {
+ /* back up to offending char */
+ unreadChar(lexer, offset);
+ lexer->error = yajl_lex_string_invalid_escaped_char;
+ goto finish_string_lex;
+ }
+ }
+ /* when not validating UTF8 it's a simple table lookup to determine
+ * if the present character is invalid */
+ else if(charLookupTable[curChar] & IJC) {
+ /* back up to offending char */
+ unreadChar(lexer, offset);
+ lexer->error = yajl_lex_string_invalid_json_char;
+ goto finish_string_lex;
+ }
+ /* when in validate UTF8 mode we need to do some extra work */
+ else if (lexer->validateUTF8) {
+ yajl_tok t = yajl_lex_utf8_char(lexer, jsonText, jsonTextLen,
+ offset, curChar);
+
+ if (t == yajl_tok_eof) {
+ tok = yajl_tok_eof;
+ goto finish_string_lex;
+ } else if (t == yajl_tok_error) {
+ /* tok is still yajl_tok_error at this point */
+ lexer->error = yajl_lex_string_invalid_utf8;
+ goto finish_string_lex;
+ }
+ }
+ /* accept it, and move on */
+ }
+ finish_string_lex:
+ /* tell our buddy, the parser, whether he needs to process this string
+ * again */
+ if (hasEscapes && tok == yajl_tok_string) {
+ tok = yajl_tok_string_with_escapes;
+ }
+
+ return tok;
+}
+
+/* RETURN_IF_EOF: return yajl_tok_eof from the enclosing function when the
+ * current chunk is exhausted.  Relies on the locals offset and
+ * jsonTextLen. */
+#define RETURN_IF_EOF if (*offset >= jsonTextLen) return yajl_tok_eof;
+
+/* Lex a number starting at its first character (yajl_lex_lex un-reads the
+ * leading '-' or digit before calling).
+ * Returns yajl_tok_integer, or yajl_tok_double when a fraction or exponent
+ * is present; yajl_tok_eof when the chunk ends before the number is known
+ * to be complete; yajl_tok_error with lexer->error set when a required
+ * digit is missing.  On success the one character read past the number is
+ * pushed back. */
+static yajl_tok
+yajl_lex_number(yajl_lexer lexer, const unsigned char * jsonText,
+ size_t jsonTextLen, size_t * offset)
+{
+ /** XXX: numbers are the only entities in json that we must lex
+ * _beyond_ in order to know that they are complete. There
+ * is an ambiguous case for integers at EOF. */
+
+ unsigned char c;
+
+ yajl_tok tok = yajl_tok_integer;
+
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
+
+ /* optional leading minus */
+ if (c == '-') {
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
+ }
+
+ /* a single zero, or a series of integers */
+ if (c == '0') {
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
+ } else if (c >= '1' && c <= '9') {
+ do {
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
+ } while (c >= '0' && c <= '9');
+ } else {
+ /* neither '0' nor 1-9: report the offending char */
+ unreadChar(lexer, offset);
+ lexer->error = yajl_lex_missing_integer_after_minus;
+ return yajl_tok_error;
+ }
+
+ /* optional fraction (indicates this is floating point) */
+ if (c == '.') {
+ /* number of digits seen after the decimal point */
+ int numRd = 0;
+
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
+
+ while (c >= '0' && c <= '9') {
+ numRd++;
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
+ }
+
+ if (!numRd) {
+ unreadChar(lexer, offset);
+ lexer->error = yajl_lex_missing_integer_after_decimal;
+ return yajl_tok_error;
+ }
+ tok = yajl_tok_double;
+ }
+
+ /* optional exponent (indicates this is floating point) */
+ if (c == 'e' || c == 'E') {
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
+
+ /* optional sign */
+ if (c == '+' || c == '-') {
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
+ }
+
+ if (c >= '0' && c <= '9') {
+ do {
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
+ } while (c >= '0' && c <= '9');
+ } else {
+ unreadChar(lexer, offset);
+ lexer->error = yajl_lex_missing_integer_after_exponent;
+ return yajl_tok_error;
+ }
+ tok = yajl_tok_double;
+ }
+
+ /* we always go "one too far" */
+ unreadChar(lexer, offset);
+
+ return tok;
+}
+
+/* Lex a comment; yajl_lex_lex calls this after consuming the leading '/'.
+ * A second '/' starts a comment that runs up to and including the next
+ * '\n'; a '*' starts a comment that runs up to the closing star-slash
+ * pair.
+ * Returns yajl_tok_comment once the comment has been consumed,
+ * yajl_tok_eof if the chunk ends first, or yajl_tok_error (lexer->error =
+ * yajl_lex_invalid_char) if the '/' is followed by anything else. */
+static yajl_tok
+yajl_lex_comment(yajl_lexer lexer, const unsigned char * jsonText,
+ size_t jsonTextLen, size_t * offset)
+{
+ unsigned char c;
+
+ yajl_tok tok = yajl_tok_comment;
+
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
+
+ /* either slash or star expected */
+ if (c == '/') {
+ /* now we throw away until end of line */
+ do {
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
+ } while (c != '\n');
+ } else if (c == '*') {
+ /* now we throw away until end of comment */
+ for (;;) {
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
+ if (c == '*') {
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
+ if (c == '/') {
+ break;
+ } else {
+ /* not the end yet: re-examine this char on the next pass, it
+ * may itself be a '*' */
+ unreadChar(lexer, offset);
+ }
+ }
+ }
+ } else {
+ lexer->error = yajl_lex_invalid_char;
+ tok = yajl_tok_error;
+ }
+
+ return tok;
+}
+
+/* Lex the next token from jsonText starting at *offset.
+ * *offset is advanced past whatever was consumed.  *outBuf / *outLen are
+ * first reset to NULL / 0 and, for a complete non-error token, set to
+ * the token text: a pointer into jsonText, or into the lexer's own buffer
+ * when buffered data is involved.  For string tokens the surrounding
+ * quotes are excluded.
+ * Returns the token type; yajl_tok_eof means the chunk ended and the
+ * consumed text was saved in the lexer buffer; yajl_tok_error means
+ * lexer->error holds the reason. */
+yajl_tok
+yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText,
+ size_t jsonTextLen, size_t * offset,
+ const unsigned char ** outBuf, size_t * outLen)
+{
+ yajl_tok tok = yajl_tok_error;
+ unsigned char c;
+ /* first byte of jsonText that belongs to the token being lexed */
+ size_t startOffset = *offset;
+
+ *outBuf = NULL;
+ *outLen = 0;
+
+ for (;;) {
+ assert(*offset <= jsonTextLen);
+
+ if (*offset >= jsonTextLen) {
+ tok = yajl_tok_eof;
+ goto lexed;
+ }
+
+ c = readChar(lexer, jsonText, offset);
+
+ switch (c) {
+ /* naming quirk: '{' and '}' produce the *_bracket tokens while
+ * '[' and ']' produce the *_brace tokens */
+ case '{':
+ tok = yajl_tok_left_bracket;
+ goto lexed;
+ case '}':
+ tok = yajl_tok_right_bracket;
+ goto lexed;
+ case '[':
+ tok = yajl_tok_left_brace;
+ goto lexed;
+ case ']':
+ tok = yajl_tok_right_brace;
+ goto lexed;
+ case ',':
+ tok = yajl_tok_comma;
+ goto lexed;
+ case ':':
+ tok = yajl_tok_colon;
+ goto lexed;
+ /* whitespace is skipped and never becomes part of a token */
+ case '\t': case '\n': case '\v': case '\f': case '\r': case ' ':
+ startOffset++;
+ break;
+ /* the literals true / false / null: the first letter has been read,
+ * the rest must match "want" exactly */
+ case 't': {
+ const char * want = "rue";
+ do {
+ if (*offset >= jsonTextLen) {
+ tok = yajl_tok_eof;
+ goto lexed;
+ }
+ c = readChar(lexer, jsonText, offset);
+ if (c != *want) {
+ unreadChar(lexer, offset);
+ lexer->error = yajl_lex_invalid_string;
+ tok = yajl_tok_error;
+ goto lexed;
+ }
+ } while (*(++want));
+ tok = yajl_tok_bool;
+ goto lexed;
+ }
+ case 'f': {
+ const char * want = "alse";
+ do {
+ if (*offset >= jsonTextLen) {
+ tok = yajl_tok_eof;
+ goto lexed;
+ }
+ c = readChar(lexer, jsonText, offset);
+ if (c != *want) {
+ unreadChar(lexer, offset);
+ lexer->error = yajl_lex_invalid_string;
+ tok = yajl_tok_error;
+ goto lexed;
+ }
+ } while (*(++want));
+ tok = yajl_tok_bool;
+ goto lexed;
+ }
+ case 'n': {
+ const char * want = "ull";
+ do {
+ if (*offset >= jsonTextLen) {
+ tok = yajl_tok_eof;
+ goto lexed;
+ }
+ c = readChar(lexer, jsonText, offset);
+ if (c != *want) {
+ unreadChar(lexer, offset);
+ lexer->error = yajl_lex_invalid_string;
+ tok = yajl_tok_error;
+ goto lexed;
+ }
+ } while (*(++want));
+ tok = yajl_tok_null;
+ goto lexed;
+ }
+ case '"': {
+ tok = yajl_lex_string(lexer, (const unsigned char *) jsonText,
+ jsonTextLen, offset);
+ goto lexed;
+ }
+ case '-':
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9': {
+ /* integer parsing wants to start from the beginning */
+ unreadChar(lexer, offset);
+ tok = yajl_lex_number(lexer, (const unsigned char *) jsonText,
+ jsonTextLen, offset);
+ goto lexed;
+ }
+ case '/':
+ /* hey, look, a probable comment! If comments are disabled
+ * it's an error. */
+ if (!lexer->allowComments) {
+ unreadChar(lexer, offset);
+ lexer->error = yajl_lex_unallowed_comment;
+ tok = yajl_tok_error;
+ goto lexed;
+ }
+ /* if comments are enabled, then we should try to lex
+ * the thing. possible outcomes are
+ * - successful lex (tok_comment, which means continue),
+ * - malformed comment opening (slash not followed by
+ * '*' or '/') (tok_error)
+ * - eof hit. (tok_eof) */
+ tok = yajl_lex_comment(lexer, (const unsigned char *) jsonText,
+ jsonTextLen, offset);
+ if (tok == yajl_tok_comment) {
+ /* "error" is silly, but that's the initial
+ * state of tok. guilty until proven innocent. */
+ tok = yajl_tok_error;
+ /* the comment is dropped entirely: forget any buffered part of
+ * it and start the next token right after it */
+ yajl_buf_clear(lexer->buf);
+ lexer->bufInUse = 0;
+ startOffset = *offset;
+ break;
+ }
+ /* hit error or eof, bail */
+ goto lexed;
+ default:
+ lexer->error = yajl_lex_invalid_char;
+ tok = yajl_tok_error;
+ goto lexed;
+ }
+ }
+
+
+ lexed:
+ /* need to append to buffer if the buffer is in use or
+ * if it's an EOF token */
+ if (tok == yajl_tok_eof || lexer->bufInUse) {
+ if (!lexer->bufInUse) yajl_buf_clear(lexer->buf);
+ lexer->bufInUse = 1;
+ yajl_buf_append(lexer->buf, jsonText + startOffset, *offset - startOffset);
+ lexer->bufOff = 0;
+
+ /* token finished with buffered data involved: hand out the buffer
+ * contents and stop using the buffer */
+ if (tok != yajl_tok_eof) {
+ *outBuf = yajl_buf_data(lexer->buf);
+ *outLen = yajl_buf_len(lexer->buf);
+ lexer->bufInUse = 0;
+ }
+ } else if (tok != yajl_tok_error) {
+ /* common case: the token lies entirely inside this chunk */
+ *outBuf = jsonText + startOffset;
+ *outLen = *offset - startOffset;
+ }
+
+ /* special case for strings. skip the quotes. */
+ if (tok == yajl_tok_string || tok == yajl_tok_string_with_escapes)
+ {
+ assert(*outLen >= 2);
+ (*outBuf)++;
+ *outLen -= 2;
+ }
+
+
+#ifdef YAJL_LEXER_DEBUG
+ if (tok == yajl_tok_error) {
+ printf("lexical error: %s\n",
+ yajl_lex_error_to_string(yajl_lex_get_error(lexer)));
+ } else if (tok == yajl_tok_eof) {
+ printf("EOF hit\n");
+ } else {
+ printf("lexed %s: '", tokToStr(tok));
+ fwrite(*outBuf, 1, *outLen, stdout);
+ printf("'\n");
+ }
+#endif
+
+ return tok;
+}
+
+/* Map a lexer error code to a static, human readable description.
+ * Codes without an explicit case yield "unknown error code". */
+const char *
+yajl_lex_error_to_string(yajl_lex_error error)
+{
+    const char * text = "unknown error code";
+
+    switch (error) {
+        case yajl_lex_e_ok:
+            text = "ok, no error";
+            break;
+        case yajl_lex_string_invalid_utf8:
+            text = "invalid bytes in UTF8 string.";
+            break;
+        case yajl_lex_string_invalid_escaped_char:
+            text = "inside a string, '\\' occurs before a character which it may not.";
+            break;
+        case yajl_lex_string_invalid_json_char:
+            text = "invalid character inside string.";
+            break;
+        case yajl_lex_string_invalid_hex_char:
+            text = "invalid (non-hex) character occurs after '\\u' inside string.";
+            break;
+        case yajl_lex_invalid_char:
+            text = "invalid char in json text.";
+            break;
+        case yajl_lex_invalid_string:
+            text = "invalid string in json text.";
+            break;
+        case yajl_lex_missing_integer_after_exponent:
+            text = "malformed number, a digit is required after the exponent.";
+            break;
+        case yajl_lex_missing_integer_after_decimal:
+            text = "malformed number, a digit is required after the decimal point.";
+            break;
+        case yajl_lex_missing_integer_after_minus:
+            text = "malformed number, a digit is required after the minus sign.";
+            break;
+        case yajl_lex_unallowed_comment:
+            text = "probable comment found in input text, comments are not enabled.";
+            break;
+    }
+    return text;
+}
+
+
+/** report the detail code of the lexer's most recent lexical error, for
+ * use when yajl_lex_lex has returned yajl_tok_error.
+ * A NULL lexer yields (yajl_lex_error) -1. */
+yajl_lex_error
+yajl_lex_get_error(yajl_lexer lexer)
+{
+    if (lexer != NULL) {
+        return lexer->error;
+    }
+    return (yajl_lex_error) -1;
+}
+
+/* accessor for the lexer's stored line offset (lineOff) */
+size_t
+yajl_lex_current_line(yajl_lexer lexer)
+{
+    const size_t line = lexer->lineOff;
+    return line;
+}
+
+/* accessor for the lexer's stored char offset (charOff) */
+size_t
+yajl_lex_current_char(yajl_lexer lexer)
+{
+    const size_t column = lexer->charOff;
+    return column;
+}
+
+/* Report the type of the next token without advancing.
+ * offset is taken by value, so the caller's position is untouched.  The
+ * lexer's buffer length, buffer offset and in-use flag are snapshotted
+ * before the lex and put back afterwards. */
+yajl_tok yajl_lex_peek(yajl_lexer lexer, const unsigned char * jsonText,
+                       size_t jsonTextLen, size_t offset)
+{
+    const size_t savedLen = yajl_buf_len(lexer->buf);
+    const size_t savedOff = lexer->bufOff;
+    const unsigned int savedInUse = lexer->bufInUse;
+    const unsigned char * ignoredBuf;
+    size_t ignoredLen;
+    yajl_tok peeked;
+
+    peeked = yajl_lex_lex(lexer, jsonText, jsonTextLen, &offset,
+                          &ignoredBuf, &ignoredLen);
+
+    /* roll the buffer-related lexer state back to the snapshot */
+    lexer->bufOff = savedOff;
+    lexer->bufInUse = savedInUse;
+    yajl_buf_truncate(lexer->buf, savedLen);
+
+    return peeked;
+}
diff --git a/xlators/cluster/nsr-server/src/yajl_lex.h b/xlators/cluster/nsr-server/src/yajl_lex.h
new file mode 100644
index 000000000..cbaae0c13
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_lex.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef __YAJL_LEX_H__
+#define __YAJL_LEX_H__
+
+#include "yajl/yajl_common.h"
+
+/* Token types produced by the lexer.
+ * Naming quirk, as implemented in yajl_lex_lex: the *_bracket tokens are
+ * produced for '{' and '}', the *_brace tokens for '[' and ']'. */
+typedef enum {
+ yajl_tok_bool,
+ yajl_tok_colon,
+ yajl_tok_comma,
+ /* end of the current chunk reached before a token was complete */
+ yajl_tok_eof,
+ /* lexical error; details via yajl_lex_get_error */
+ yajl_tok_error,
+ yajl_tok_left_brace,
+ yajl_tok_left_bracket,
+ yajl_tok_null,
+ yajl_tok_right_brace,
+ yajl_tok_right_bracket,
+
+ /* we differentiate between integers and doubles to allow the
+ * parser to interpret the number without re-scanning */
+ yajl_tok_integer,
+ yajl_tok_double,
+
+ /* we differentiate between strings which require further processing,
+ * and strings that do not */
+ yajl_tok_string,
+ yajl_tok_string_with_escapes,
+
+ /* comment tokens are not currently returned to the parser, ever */
+ yajl_tok_comment
+} yajl_tok;
+
+typedef struct yajl_lexer_t * yajl_lexer;
+
+yajl_lexer yajl_lex_alloc(yajl_alloc_funcs * alloc,
+ unsigned int allowComments,
+ unsigned int validateUTF8);
+
+void yajl_lex_free(yajl_lexer lexer);
+
+/**
+ * run/continue a lex. "offset" is an input/output parameter.
+ * It should be initialized to zero for a
+ * new chunk of target text, and upon subsequent calls with the same
+ * target text should be passed with the value of the previous invocation.
+ *
+ * the client may be interested in the value of offset when an error is
+ * returned from the lexer. This allows the client to render useful
+ * error messages.
+ *
+ * When you pass the next chunk of data, context should be reinitialized
+ * to zero.
+ *
+ * Finally, the output buffer is usually just a pointer into the jsonText,
+ * however in cases where the entity being lexed spans multiple chunks,
+ * the lexer will buffer the entity and the data returned will be
+ * a pointer into that buffer.
+ *
+ * This behavior is abstracted from client code except for the performance
+ * implications which require that the client choose a reasonable chunk
+ * size to get adequate performance.
+ */
+yajl_tok yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText,
+ size_t jsonTextLen, size_t * offset,
+ const unsigned char ** outBuf, size_t * outLen);
+
+/** have a peek at the next token, but don't move the lexer forward */
+yajl_tok yajl_lex_peek(yajl_lexer lexer, const unsigned char * jsonText,
+ size_t jsonTextLen, size_t offset);
+
+
+/* Detail codes for lexical errors; yajl_lex_error_to_string gives the
+ * matching human readable text. */
+typedef enum {
+ /* no error */
+ yajl_lex_e_ok = 0,
+ /* invalid bytes in a UTF8 string */
+ yajl_lex_string_invalid_utf8,
+ /* backslash followed by a character that may not be escaped */
+ yajl_lex_string_invalid_escaped_char,
+ /* character that is not allowed inside a string */
+ yajl_lex_string_invalid_json_char,
+ /* non-hex character after \u inside a string */
+ yajl_lex_string_invalid_hex_char,
+ /* character that cannot start any token */
+ yajl_lex_invalid_char,
+ /* malformed true / false / null literal */
+ yajl_lex_invalid_string,
+ /* no digit after the decimal point */
+ yajl_lex_missing_integer_after_decimal,
+ /* no digit after the exponent */
+ yajl_lex_missing_integer_after_exponent,
+ /* no digit after the minus sign */
+ yajl_lex_missing_integer_after_minus,
+ /* comment found while comments are not enabled */
+ yajl_lex_unallowed_comment
+} yajl_lex_error;
+
+const char * yajl_lex_error_to_string(yajl_lex_error error);
+
+/** allows access to more specific information about the lexical
+ * error when yajl_lex_lex returns yajl_tok_error. */
+yajl_lex_error yajl_lex_get_error(yajl_lexer lexer);
+
+/** get the current offset into the most recently lexed json string. */
+size_t yajl_lex_current_offset(yajl_lexer lexer);
+
+/** get the number of lines lexed by this lexer instance */
+size_t yajl_lex_current_line(yajl_lexer lexer);
+
+/** get the number of chars lexed by this lexer instance since the last
+ * \n or \r */
+size_t yajl_lex_current_char(yajl_lexer lexer);
+
+#endif
diff --git a/xlators/cluster/nsr-server/src/yajl_parser.c b/xlators/cluster/nsr-server/src/yajl_parser.c
new file mode 100644
index 000000000..bf9ef24ef
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_parser.c
@@ -0,0 +1,492 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "yajl/yajl_parse.h"
+#include "yajl_lex.h"
+#include "yajl_parser.h"
+#include "yajl_encode.h"
+#include "yajl_bytestack.h"
+
+#include <stdlib.h>
+#include <limits.h>
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <assert.h>
+#include <math.h>
+
+/* Largest accumulator value that can still be multiplied by 10 without
+ * overflowing a long long.  (Adding LLONG_MAX % 10 here, as was done
+ * before, lets values just above LLONG_MAX / 10 through and makes the
+ * multiplication overflow, which is undefined behaviour.) */
+#define MAX_VALUE_TO_MULTIPLY (LLONG_MAX / 10)
+
+/* Convert `length` bytes of decimal text at `number` to a long long.
+ * An optional leading '-' and/or '+' is accepted; the remaining bytes are
+ * treated as decimal digits without further validation.
+ *
+ * Overflow handling has the same semantics as strtol: errno is set to
+ * ERANGE and LLONG_MAX (or LLONG_MIN when a '-' was seen) is returned.
+ * errno is not modified otherwise, so callers that inspect it should
+ * clear it first.  An empty input yields 0. */
+long long
+yajl_parse_integer(const unsigned char *number, unsigned int length)
+{
+    long long ret = 0;
+    long sign = 1;
+    const unsigned char *pos = number;
+    const unsigned char *end = number + length;
+
+    /* never look at *pos unless it lies inside the input */
+    if (pos < end && *pos == '-') { pos++; sign = -1; }
+    if (pos < end && *pos == '+') { pos++; }
+
+    while (pos < end) {
+        /* would ret * 10 overflow? */
+        if (ret > MAX_VALUE_TO_MULTIPLY) {
+            errno = ERANGE;
+            return sign == 1 ? LLONG_MAX : LLONG_MIN;
+        }
+        ret *= 10;
+        /* would adding the next digit overflow? */
+        if (LLONG_MAX - ret < (*pos - '0')) {
+            errno = ERANGE;
+            return sign == 1 ? LLONG_MAX : LLONG_MIN;
+        }
+        ret += (*pos++ - '0');
+    }
+
+    return sign * ret;
+}
+
+/* Build a heap-allocated, NUL-terminated description of the handle's
+ * current error state.
+ *
+ * The first line is "<type> error[: <text>]\n", where type is "parse",
+ * "lexical" or "unknown" depending on the top of hand->stateStack.  When
+ * verbose is non-zero, an excerpt of jsonText around hand->bytesConsumed
+ * and an arrow line are appended.
+ *
+ * The string is allocated with the handle's allocator (YA_MALLOC on
+ * hand->alloc).  Returns NULL if an allocation fails. */
+unsigned char *
+yajl_render_error_string(yajl_handle hand, const unsigned char * jsonText,
+ size_t jsonTextLen, int verbose)
+{
+ size_t offset = hand->bytesConsumed;
+ unsigned char * str;
+ const char * errorType = NULL;
+ const char * errorText = NULL;
+ /* excerpt line: at most 70 bytes of padding plus context, then '\n'
+ * and the terminating NUL */
+ char text[72];
+ const char * arrow = " (right here) ------^\n";
+
+ if (yajl_bs_current(hand->stateStack) == yajl_state_parse_error) {
+ errorType = "parse";
+ errorText = hand->parseError;
+ } else if (yajl_bs_current(hand->stateStack) == yajl_state_lexical_error) {
+ errorType = "lexical";
+ errorText = yajl_lex_error_to_string(yajl_lex_get_error(hand->lexer));
+ } else {
+ errorType = "unknown";
+ }
+
+ {
+ size_t memneeded = 0;
+ memneeded += strlen(errorType);
+ memneeded += strlen(" error");
+ if (errorText != NULL) {
+ memneeded += strlen(": ");
+ memneeded += strlen(errorText);
+ }
+ /* + 2: one byte for the trailing "\n", one for the NUL */
+ str = (unsigned char *) YA_MALLOC(&(hand->alloc), memneeded + 2);
+ if (!str) return NULL;
+ str[0] = 0;
+ strcat((char *) str, errorType);
+ strcat((char *) str, " error");
+ if (errorText != NULL) {
+ strcat((char *) str, ": ");
+ strcat((char *) str, errorText);
+ }
+ strcat((char *) str, "\n");
+ }
+
+ /* now we append as many spaces as needed to make sure the error
+ * falls at char 41, if verbose was specified */
+ if (verbose) {
+ size_t start, end, i;
+ size_t spacesNeeded;
+
+ /* show up to 30 bytes on either side of the error offset */
+ spacesNeeded = (offset < 30 ? 40 - offset : 10);
+ start = (offset >= 30 ? offset - 30 : 0);
+ end = (offset + 30 > jsonTextLen ? jsonTextLen : offset + 30);
+
+ for (i=0;i<spacesNeeded;i++) text[i] = ' ';
+
+ /* copy the context, flattening line breaks to spaces so the excerpt
+ * stays on one line */
+ for (;start < end;start++, i++) {
+ if (jsonText[start] != '\n' && jsonText[start] != '\r')
+ {
+ text[i] = jsonText[start];
+ }
+ else
+ {
+ text[i] = ' ';
+ }
+ }
+ assert(i <= 71);
+ text[i++] = '\n';
+ text[i] = 0;
+ {
+ char * newStr = (char *)
+ YA_MALLOC(&(hand->alloc), (unsigned int)(strlen((char *) str) +
+ strlen((char *) text) +
+ strlen(arrow) + 1));
+ if (newStr) {
+ newStr[0] = 0;
+ strcat((char *) newStr, (char *) str);
+ strcat((char *) newStr, text);
+ strcat((char *) newStr, arrow);
+ }
+ /* the short message is released either way; if the larger
+ * allocation failed the function ends up returning NULL */
+ YA_FREE(&(hand->alloc), str);
+ str = (unsigned char *) newStr;
+ }
+ }
+ return str;
+}
+
+/* check for client cancellation: x is the return value of a client
+ * callback.  When it is zero, the handle is put into the parse-error state
+ * with an explanatory message and the enclosing function returns
+ * yajl_status_client_canceled.  Relies on a local named hand. */
+#define _CC_CHK(x) \
+ if (!(x)) { \
+ yajl_bs_set(hand->stateStack, yajl_state_parse_error); \
+ hand->parseError = \
+ "client cancelled parse via callback return value"; \
+ return yajl_status_client_canceled; \
+ }
+
+
+/* Finish a parse.  A single space is fed through yajl_do_parse first;
+ * any non-ok status from that call is returned unchanged.
+ * Afterwards the state on top of the stack decides the outcome:
+ * an error state yields yajl_status_error; got_value / parse_complete
+ * yield yajl_status_ok; any other state is a premature EOF, which is
+ * an error unless yajl_allow_partial_values is set on the handle. */
+yajl_status
+yajl_do_finish(yajl_handle hand)
+{
+    yajl_status stat = yajl_do_parse(hand,(const unsigned char *) " ",1);
+
+    if (stat != yajl_status_ok) {
+        return stat;
+    }
+
+    switch(yajl_bs_current(hand->stateStack))
+    {
+        case yajl_state_parse_error:
+        case yajl_state_lexical_error:
+            return yajl_status_error;
+        case yajl_state_got_value:
+        case yajl_state_parse_complete:
+            return yajl_status_ok;
+        default:
+            break;
+    }
+
+    /* input ended in the middle of a value */
+    if (hand->flags & yajl_allow_partial_values) {
+        return yajl_status_ok;
+    }
+    yajl_bs_set(hand->stateStack, yajl_state_parse_error);
+    hand->parseError = "premature EOF";
+    return yajl_status_error;
+}
+
+yajl_status
+yajl_do_parse(yajl_handle hand, const unsigned char * jsonText,
+ size_t jsonTextLen)
+{
+ yajl_tok tok;
+ const unsigned char * buf;
+ size_t bufLen;
+ size_t * offset = &(hand->bytesConsumed);
+
+ *offset = 0;
+
+ around_again:
+ switch (yajl_bs_current(hand->stateStack)) {
+ case yajl_state_parse_complete:
+ if (hand->flags & yajl_allow_multiple_values) {
+ yajl_bs_set(hand->stateStack, yajl_state_got_value);
+ goto around_again;
+ }
+ if (!(hand->flags & yajl_allow_trailing_garbage)) {
+ if (*offset != jsonTextLen) {
+ tok = yajl_lex_lex(hand->lexer, jsonText, jsonTextLen,
+ offset, &buf, &bufLen);
+ if (tok != yajl_tok_eof) {
+ yajl_bs_set(hand->stateStack, yajl_state_parse_error);
+ hand->parseError = "trailing garbage";
+ }
+ goto around_again;
+ }
+ }
+ return yajl_status_ok;
+ case yajl_state_lexical_error:
+ case yajl_state_parse_error:
+ return yajl_status_error;
+ case yajl_state_start:
+ case yajl_state_got_value:
+ case yajl_state_map_need_val:
+ case yajl_state_array_need_val:
+ case yajl_state_array_start: {
+ /* for arrays and maps, we advance the state for this
+ * depth, then push the state of the next depth.
+ * If an error occurs during the parsing of the nesting
+ * enitity, the state at this level will not matter.
+ * a state that needs pushing will be anything other
+ * than state_start */
+
+ yajl_state stateToPush = yajl_state_start;
+
+ tok = yajl_lex_lex(hand->lexer, jsonText, jsonTextLen,
+ offset, &buf, &bufLen);
+
+ switch (tok) {
+ case yajl_tok_eof:
+ return yajl_status_ok;
+ case yajl_tok_error:
+ yajl_bs_set(hand->stateStack, yajl_state_lexical_error);
+ goto around_again;
+ case yajl_tok_string:
+ if (hand->callbacks && hand->callbacks->yajl_string) {
+ _CC_CHK(hand->callbacks->yajl_string(hand->ctx,
+ buf, bufLen));
+ }
+ break;
+ case yajl_tok_string_with_escapes:
+ if (hand->callbacks && hand->callbacks->yajl_string) {
+ yajl_buf_clear(hand->decodeBuf);
+ yajl_string_decode(hand->decodeBuf, buf, bufLen);
+ _CC_CHK(hand->callbacks->yajl_string(
+ hand->ctx, yajl_buf_data(hand->decodeBuf),
+ yajl_buf_len(hand->decodeBuf)));
+ }
+ break;
+ case yajl_tok_bool:
+ if (hand->callbacks && hand->callbacks->yajl_boolean) {
+ _CC_CHK(hand->callbacks->yajl_boolean(hand->ctx,
+ *buf == 't'));
+ }
+ break;
+ case yajl_tok_null:
+ if (hand->callbacks && hand->callbacks->yajl_null) {
+ _CC_CHK(hand->callbacks->yajl_null(hand->ctx));
+ }
+ break;
+ case yajl_tok_left_bracket:
+ if (hand->callbacks && hand->callbacks->yajl_start_map) {
+ _CC_CHK(hand->callbacks->yajl_start_map(hand->ctx));
+ }
+ stateToPush = yajl_state_map_start;
+ break;
+ case yajl_tok_left_brace:
+ if (hand->callbacks && hand->callbacks->yajl_start_array) {
+ _CC_CHK(hand->callbacks->yajl_start_array(hand->ctx));
+ }
+ stateToPush = yajl_state_array_start;
+ break;
+ case yajl_tok_integer:
+ if (hand->callbacks) {
+ if (hand->callbacks->yajl_number) {
+ _CC_CHK(hand->callbacks->yajl_number(
+ hand->ctx,(const char *) buf, bufLen));
+ } else if (hand->callbacks->yajl_integer) {
+ long long int i = 0;
+ i = yajl_parse_integer(buf, bufLen);
+ if ((i == LLONG_MIN || i == LLONG_MAX) &&
+ errno == ERANGE)
+ {
+ yajl_bs_set(hand->stateStack,
+ yajl_state_parse_error);
+ hand->parseError = "integer overflow" ;
+ /* try to restore error offset */
+ if (*offset >= bufLen) *offset -= bufLen;
+ else *offset = 0;
+ goto around_again;
+ }
+ _CC_CHK(hand->callbacks->yajl_integer(hand->ctx,
+ i));
+ }
+ }
+ break;
+ case yajl_tok_double:
+ if (hand->callbacks) {
+ if (hand->callbacks->yajl_number) {
+ _CC_CHK(hand->callbacks->yajl_number(
+ hand->ctx, (const char *) buf, bufLen));
+ } else if (hand->callbacks->yajl_double) {
+ double d = 0.0;
+ yajl_buf_clear(hand->decodeBuf);
+ yajl_buf_append(hand->decodeBuf, buf, bufLen);
+ buf = yajl_buf_data(hand->decodeBuf);
+ d = strtod((char *) buf, NULL);
+ if ((d == HUGE_VAL || d == -HUGE_VAL) &&
+ errno == ERANGE)
+ {
+ yajl_bs_set(hand->stateStack,
+ yajl_state_parse_error);
+ hand->parseError = "numeric (floating point) "
+ "overflow";
+ /* try to restore error offset */
+ if (*offset >= bufLen) *offset -= bufLen;
+ else *offset = 0;
+ goto around_again;
+ }
+ _CC_CHK(hand->callbacks->yajl_double(hand->ctx,
+ d));
+ }
+ }
+ break;
+ case yajl_tok_right_brace: {
+ if (yajl_bs_current(hand->stateStack) ==
+ yajl_state_array_start)
+ {
+ if (hand->callbacks &&
+ hand->callbacks->yajl_end_array)
+ {
+ _CC_CHK(hand->callbacks->yajl_end_array(hand->ctx));
+ }
+ yajl_bs_pop(hand->stateStack);
+ goto around_again;
+ }
+ /* intentional fall-through */
+ }
+ case yajl_tok_colon:
+ case yajl_tok_comma:
+ case yajl_tok_right_bracket:
+ yajl_bs_set(hand->stateStack, yajl_state_parse_error);
+ hand->parseError =
+ "unallowed token at this point in JSON text";
+ goto around_again;
+ default:
+ yajl_bs_set(hand->stateStack, yajl_state_parse_error);
+ hand->parseError = "invalid token, internal error";
+ goto around_again;
+ }
+ /* got a value. transition depends on the state we're in. */
+ {
+ yajl_state s = yajl_bs_current(hand->stateStack);
+ if (s == yajl_state_start || s == yajl_state_got_value) {
+ yajl_bs_set(hand->stateStack, yajl_state_parse_complete);
+ } else if (s == yajl_state_map_need_val) {
+ yajl_bs_set(hand->stateStack, yajl_state_map_got_val);
+ } else {
+ yajl_bs_set(hand->stateStack, yajl_state_array_got_val);
+ }
+ }
+ if (stateToPush != yajl_state_start) {
+ yajl_bs_push(hand->stateStack, stateToPush);